Example #1
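# Loads the preprocessed data and a saved state dict, rebuilds the model under DataParallel, and prints full validation results on the dev and test splits.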
def main():
    log.debug("Loading data from '%s'." % args.data)
    data = torch.load(args.data + "/data.pt")
    vocabs = data["vocabs"]

    dev_data = get_dataset(data, args, "dev")
    test_data = get_dataset(data, args, "test")

    state_dict = torch.load(args.export_path)
    args.word_emb_size = state_dict["module.word_lut"].size(1)
    args.mention_len = dev_data.get_mention_len()
    args.context_len = dev_data.get_context_len()

    log.debug("Building model...")
    model = Model(args, vocabs)
    model = DataParallel(model)
    model.load_state_dict(state_dict)
    model.to(DEVICE)

    runner = Runner(model, None, None, vocabs, None, None, dev_data, test_data,
                    args)

    log.info("INFERENCE ON DEV DATA:")
    runner.instance_printer.show(dev_data)
    runner.print_full_validation(dev_data, "DEV")

    log.info("\n\nINFERENCE ON TEST DATA:")
    runner.instance_printer.show(test_data)
    runner.print_full_validation(test_data, "TEST")
Example #2
def main():
    log.debug("Loading data from '%s'." % args.data)
    data = torch.load(args.data + "/data.pt")
    vocabs = data["vocabs"]

    dev_data = get_dataset(data, args, "dev")
    test_data = get_dataset(data, args, "test")

    state_dict = torch.load("models/" + args.export_path + ".pt")
    args.type_dims = state_dict["type_lut.weight"].size(1)

    log.debug("Building model...")
    model = figet.Models.Model(args, vocabs)
    model.load_state_dict(state_dict)

    if len(args.gpus) >= 1:
        model.cuda()

    type2vec = model.type_lut.weight.data

    coach = figet.Coach(model, None, vocabs, None, dev_data, test_data,
                        type2vec, None, args)

    log.info("INFERENCE ON DEV DATA:")
    coach.instance_printer.show(dev_data)
    coach.print_results(dev_data, "DEV")

    log.info("\n\nINFERENCE ON TEST DATA:")
    coach.instance_printer.show(test_data)

    coach.print_results(test_data, "TEST")
Example #3
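# Builds data generators: a single test generator when get_dataset returns one dataset, otherwise one generator per dataset with only the first in training mode.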
def get_generators(is_test=False):
    """Get the dataset class according to the neptune parameters."""
    datasets = get_dataset(is_test=is_test)

    # Fetch the processor
    processor = bootstrap.get_processor()

    # If len is 1, then return the test set
    if len(datasets) == 1:
        gen = data.DataGenerator(is_training=False,
                                 is_test=True,
                                 dataset=datasets[0],
                                 batch_size=20,  # For some reason, 18000/batch_size must be an integer
                                 shuffle=not is_test,
                                 processor=processor)
        return gen
    else:
        data_generators = []
        is_training = True
        for dataset in datasets:
            data_generators.append(data.DataGenerator(is_training=is_training,
                                                      is_test=False,
                                                      dataset=dataset,
                                                      batch_size=params.batch_size,
                                                      shuffle=is_training,
                                                      processor=processor))
            is_training = False

        return tuple(data_generators)
Example #4
def main(params, date, epoch, gpu):
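    # Evaluate a trained PonderEnhancer checkpoint on the test set, collecting STOI and ponder-loss values keyed by each file's dB label.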
    net = PonderEnhancer(params['model'])
    ckpt = get_model_ckpt(params, date, epoch)
    net.load_state_dict(torch.load(ckpt))

    net.cuda()
    net.eval()

    # run test
    test_dset = get_dataset(params['test_data_config'])
    test_dataloader = get_dataloader(params, test_dset, train=False)
    loss_fn = get_loss_fn(params['loss'])
    fs = params['test_data_config']['fs']

    per_db_results = {}
    for (clean, noise, mix, file_db) in tqdm.tqdm(test_dataloader):
        clean, mix = clean.cuda(), mix.cuda()
        db = file_db[0][:-4]

        # Forward pass (inference)
        pred, ponder = net(mix, verbose=False)  # change debug -> verbose
        _, loss_ponder = loss_fn(clean, pred, ponder)

        if db not in per_db_results:
            per_db_results[db] = {'enhance': [], 'ponder': []}

        # get the perceptual metrics
        np_clean = clean.detach().cpu().numpy().reshape(-1)
        np_pred = pred.detach().cpu().numpy().reshape(-1)
        
        per_db_results[db]['enhance'].append(pystoi.stoi(np_clean, np_pred, fs))
        per_db_results[db]['ponder'].append(loss_ponder.item())

    # save it all
    save_dict(per_db_results, 'stoi', epoch, date, params)
Example #5
def get_samelist():
    print('Loading features dataframe to generate "same_list" ... ')
    # make same user list, diff user list
    combine = get_dataset(d_type='30days')
    df_test = combine.loc[TRAIN_SHAPE:]
    same_list = list(set(combine['card']) & set(df_test['card']))
    del combine, df_test
    gc.collect()
    return same_list
Example #6
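# Samples (input, output) template realizations from a trained model, keeps only novel, fully realized pairs, and writes them to a JSON file.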
def main(argv):
    torch.manual_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)

    hlog.flags()

    dataset = get_dataset()
    model = pick_model(dataset)

    model.prepare(dataset)
    if isinstance(model, nn.Module):
        path = os.path.join(FLAGS.model_dir, FLAGS.model)
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint)

    realized = set()
    examples = pick_examples(dataset)
    while len(realized) < FLAGS.n_sample:
        try:
            templ, names = next(examples)
        except StopIteration:
            break
        datum = make_batch([(templ, templ) for _ in range(10)],
                           dataset.vocab,
                           staged=True)
        (inps, outs), scores = model.sample(datum.inp_data, datum.out_data)

        keep = []
        for inp, out, score in zip(inps, outs, scores):
            inp_realized, inp_used = dataset.realize(inp, names)
            out_realized, out_used = dataset.realize(out, names)
            if ((not FLAGS.output_only)
                    and len(inp_used) == 0) or len(out_used) == 0:
                continue
            if len(inp_used | out_used) != len(names):
                continue
            if not ((FLAGS.output_only or dataset.novel(inp=inp_realized))
                    and dataset.novel(out=out_realized)):
                continue
            if (inp_realized, out_realized) in realized:
                continue
            keep.append(((inp_realized, out_realized), score))
        for (inp_realized, out_realized), score in keep:
            with hlog.task(str(len(realized))):
                hlog.value("inp", " ".join(dataset.vocab.decode(templ[0])))
                hlog.value("out", " ".join(dataset.vocab.decode(templ[1])))
                hlog.value("var", names)
                hlog.value("score", score)
                with hlog.task("realized"):
                    hlog.value("inp", " ".join(inp_realized))
                    hlog.value("out", " ".join(out_realized))
            realized.add((inp_realized, out_realized))

    data = [{"inp": inp, "out": out} for inp, out in realized]
    with open(FLAGS.write, "w") as fh:
        json.dump(data, fh, indent=2)
Example #7
def main(argv):
    torch.manual_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    hlog.flags()
    if not os.path.exists(FLAGS.model_dir):
        os.mkdir(FLAGS.model_dir)
    dataset = get_dataset()
    model = ContextModel(dataset.vocab).to(_flags.device())
    def callback(i_epoch):
        pass
    train(dataset, model, dataset.sample_ctx_train, callback, staged=False)
Example #8
def main(argv):
    dataset = get_dataset()
    vocab = dataset.vocab
    for inp, out in dataset.get_train():
        print(" ".join(vocab.decode(inp)))
        print(" ".join(vocab.decode(out)))
        print()

    print("\n\n\n\n")

    for inp, out in dataset.get_val():
        print(" ".join(vocab.decode(inp)))
        print(" ".join(vocab.decode(out)))
        print()
Example #9
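# Retrains each given network (e.g. averaged and individual models) on the appropriate dataloader and returns the retrained networks with their accuracies.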
def retrain_models(args, old_networks, train_loader, test_loader, config, tensorboard_obj=None, initial_acc=None, nicks=None):
    accuracies = []
    retrained_networks = []
    # nicks = []

    # assert len(old_networks) >= 4

    for i in range(len(old_networks)):
        nick = nicks[i]
        # if i == len(old_networks) - 1:
        #     nick = 'naive_averaging'
        # elif i == len(old_networks) - 2:
        #     nick = 'geometric'
        # else:
        #     nick = 'model_' + str(i)
        # nicks.append(nick)
        print("Retraining model : ", nick)

        if initial_acc is not None:
            start_acc = initial_acc[i]
        else:
            start_acc = -1
        if args.dataset.lower()[0:7] == 'cifar10':

            if args.reinit_trainloaders:
                print('reiniting trainloader')
                retrain_loader, _ = cifar_train.get_dataset(config, no_randomness=args.no_random_trainloaders)
            else:
                retrain_loader = train_loader

            output_root_dir = "{}/{}_models_ensembled/".format(args.baseroot, (args.dataset).lower())
            output_root_dir = os.path.join(output_root_dir, args.exp_name, nick)
            os.makedirs(output_root_dir, exist_ok=True)

            retrained_network, acc = cifar_train.get_retrained_model(
                args, retrain_loader, test_loader, old_networks[i], config,
                output_root_dir, tensorboard_obj=tensorboard_obj, nick=nick,
                start_acc=start_acc)
            
        elif args.dataset.lower() == 'mnist':

            if args.reinit_trainloaders:
                print('reiniting trainloader')
                retrain_loader, _ = get_dataloader(args, no_randomness=args.no_random_trainloaders)
            else:
                retrain_loader = train_loader
                
            retrained_network, acc = get_retrained_model(
                args, retrain_loader, test_loader, old_network=old_networks[i],
                tensorboard_obj=tensorboard_obj, nick=nick, start_acc=start_acc,
                retrain_seed=args.retrain_seed)
        retrained_networks.append(retrained_network)
        accuracies.append(acc)
    return retrained_networks, accuracies
Example #10
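# Computes activations across models on a unit-batch training loader (MNIST or CIFAR-10), optionally for a single layer or a personal data partition.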
def get_model_activations(args,
                          models,
                          config=None,
                          layer_name=None,
                          selective=False,
                          personal_dataset=None):
    import compute_activations
    from data import get_dataloader

    if args.activation_histograms and args.act_num_samples > 0:
        if args.dataset == 'mnist':
            unit_batch_train_loader, _ = get_dataloader(args, unit_batch=True)

        elif args.dataset.lower()[0:7] == 'cifar10':
            if config is None:
                config = args.config  # just use the config in arg
            unit_batch_train_loader, _ = cifar_train.get_dataset(
                config, unit_batch_train=True)

        if args.activation_mode is None:
            activations = compute_activations.compute_activations_across_models(
                args, models, unit_batch_train_loader, args.act_num_samples)
        else:
            if selective and args.update_acts:
                activations = compute_activations.compute_selective_activation(
                    args, models, layer_name, unit_batch_train_loader,
                    args.act_num_samples)
            else:
                if personal_dataset is not None:
                    # personal training set is passed which consists of (inp, tgts)
                    print('using the one from partition')
                    loader = partition.to_dataloader_from_tens(
                        personal_dataset[0], personal_dataset[1], 1)
                else:
                    loader = unit_batch_train_loader

                activations = compute_activations.compute_activations_across_models_v1(
                    args,
                    models,
                    loader,
                    args.act_num_samples,
                    mode=args.activation_mode)

    else:
        activations = None

    return activations
Example #11
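# Trains a copy-based GeneratorModel on optionally augmented data, evaluating on train/val (and test at the end) and checkpointing every n_checkpoint epochs.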
def main(argv):
    torch.manual_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)

    hlog.flags()

    if FLAGS.augment is not None:
        with open(FLAGS.augment) as fh:
            aug_data = json.load(fh)
    else:
        aug_data = []

    dataset = get_dataset(aug_data=aug_data, invert=FLAGS.invert)
    model = GeneratorModel(dataset.vocab, copy=True,
                           self_attention=False).to(_flags.device())

    fine_tune = [True]

    def sample():
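        # NOTE: both branches currently draw samples with the same aug_ratio.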
        if fine_tune[0]:
            return dataset.sample_train(aug_ratio=FLAGS.aug_ratio)
        else:
            return dataset.sample_train(aug_ratio=FLAGS.aug_ratio)

    def callback(i_epoch):
        if not fine_tune[0] and i_epoch >= 20:
            hlog.log("FINE_TUNE")
            fine_tune[0] = True
        model.eval()
        final = i_epoch == FLAGS.n_epochs - 1
        with hlog.task("eval_train", timer=False):
            train_data = [dataset.sample_train() for _ in range(1000)]
            evaluate(model, train_data, dataset)
        with hlog.task("eval_val", timer=False):
            val_data = dataset.get_val()
            val_acc = evaluate(model, val_data, dataset, vis=final, beam=final)
        if FLAGS.TEST and (final or FLAGS.test_curve):
            with hlog.task("eval_test", timer=False):
                test_data = dataset.get_test()
                evaluate(model, test_data, dataset, beam=final)
        if (i_epoch + 1) % FLAGS.n_checkpoint == 0:
            torch.save(
                model.state_dict(),
                os.path.join(FLAGS.model_dir, "model.%05d.chk" % i_epoch))
        return val_acc

    train(dataset, model, sample, callback, staged=False)
Example #12
def main(argv):
    torch.manual_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)

    hlog.flags()

    if FLAGS.augment is not None:
        with open(FLAGS.augment) as fh:
            aug_data = json.load(fh)
    else:
        aug_data = []

    dataset = get_dataset(aug_data=aug_data)
    if FLAGS.use_mkn:
        mkn_main(dataset)
    else:
        rnn_main(dataset)
Example #13
def main(argv):
    hlog.flags()

    assert FLAGS.augment is not None
    with open(FLAGS.augment) as fh:
        aug_data = json.load(fh)

    dataset = get_dataset()
    train_data = dataset.get_train()

    utts = []
    if FLAGS.concat:
        for _, utt in train_data:
            s = " ".join(dataset.vocab.decode(utt))
            utts.append(s)
    for pair in aug_data:
        s = " ".join(pair["out"])
        utts.append(s)

    with open(FLAGS.write, "w") as fh:
        for utt in utts:
            print(utt, file=fh)
Example #14
def main():
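    # Splits the RGB training set 90/10 into train/val and converts both splits to TFRecords.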
    LAYERS = 3
    pkl_fname = "data/preprocess/stage1_train_set_rgb.pkl"
    images, masks = get_dataset(pkl_fname)
    logging.info("read train set: %s, %s", images.shape, masks.shape)
    logging.info("image:[%s, %s], mask:[%s, %s]", np.max(images), np.min(images), np.max(masks), np.min(masks))

    # pred_size, offset = unet_size(256, LAYERS)
    # logging.info("pred_size: %d, offset: %d", pred_size, offset)
    # images = padding_array(images, offset, default_val=0.0)
    # masks = padding_array(masks, offset, default_val=False)


    # args.data_dir = args.data_dir.strip()
    # if len(args.data_dir) >= 0:
    #     fnames = [os.path.join(args.data_dir, x) for x in fnames]

    train_ratio = 0.9
    n_train = int(len(images)*train_ratio)
    logging.info("train_ratio: %s, n_train: %s, n_val: %s", train_ratio, n_train, len(images)-n_train)
    convert_dataset(images[:n_train], masks[:n_train], "data/tfrecords/256x256/train", 4)
    convert_dataset(images[n_train:], masks[n_train:], "data/tfrecords/256x256/val", 2)
Example #15
def main():
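    # Runs a trained LeafClassifier over preprocessed documents and, per input file, writes the kept and dropped node texts to separate .txt files.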
    ap = argparse.ArgumentParser()
    ap.add_argument('-w',
                    '--working_dir',
                    default='train',
                    help='Where to save checkpoints and logs')
    ap.add_argument(
        '-i',
        '--predict_input_dir',
        help='Directory of files produced by the preprocessing script')
    ap.add_argument(
        '-o',
        '--predict_output_dir',
        help='Directory of files produced by the preprocessing script')
    ap.add_argument(
        '-r',
        '--predict_raw_dir',
        help='Directory of files produced by the preprocessing script')
    ap.add_argument('-p',
                    '--checkpoint_number',
                    default=9,
                    help='checkpoint path of model')
    ap.add_argument('-b',
                    '--balance',
                    type=float,
                    default=0,
                    help='The dropout percentage')

    args = ap.parse_args()
    #  args = ap.parse_args(args=["../googletrends_japanese_data_5000","--working_dir","../googletrends_japanese_train_5000", "--test_dir", "../local_data_japan"])

    params_path = os.path.join(args.working_dir, 'params.csv')
    params = {}
    with open(params_path, 'r') as f:
        for i in csv.reader(f):
            params[i[0]] = i[1]

    DATA_DIR = params["DATA_DIR"]
    # Split the document text into nodes
    file_path = args.predict_raw_dir
    doc_data, _, _ = parse_doc(util.get_filenames(file_path))

    info_file = os.path.join(DATA_DIR, 'info.pkl')
    with open(info_file, 'rb') as fp:
        info = pickle.load(fp)
        info['num_train_examples'] = len(doc_data)
        train_steps = math.ceil(info['num_train_examples'] /
                                int(params["batch_size"]))

    predict_set_file = os.path.join(args.predict_input_dir, 'train.tfrecords')
    predict_dataset = get_dataset(predict_set_file,
                                  1,
                                  repeat=False,
                                  shuffle=False)

    kwargs = {
        'input_size': info['num_words'] + info['num_tags'],
        'hidden_size': int(params['hidden_units']),
        'num_layers': int(params['num_layers']),
        'dropout': float(params['dropout']),
        'dense_size': int(params['dense_size'])
    }
    clf = LeafClassifier(**kwargs)

    checkpoint_path = args.working_dir + "/ckpt/model.{:03d}.h5".format(
        int(args.checkpoint_number))
    clf.model = load_model(checkpoint_path)

    _, y_pred = clf.eval_recall(predict_dataset,
                                train_steps,
                                desc="",
                                balance=args.balance)

    filenames = util.get_filenames(file_path)
    raw_filenames = [i.split("/")[-1] for i in filenames]

    # Save the prediction results
    pred_index = [bool(i) for i in y_pred]
    delete_index = [not bool(i) for i in y_pred]
    counter = 0
    for i in range(len(raw_filenames)):
        numpy_data = np.array(doc_data[raw_filenames[i]])
        contents = numpy_data[pred_index[counter:counter + len(numpy_data)]]
        delete_contents = numpy_data[delete_index[counter:counter +
                                                  len(numpy_data)]]
        counter += len(numpy_data)
        content_text = ""
        delete_text = ""
        for content in contents:
            content_text += str(content) + "\n"
        for delete_content in delete_contents:
            delete_text += str(delete_content) + "\n"
        dir_path = args.predict_output_dir
        os.makedirs(dir_path, exist_ok=True)
        with open(
                os.path.join(
                    dir_path,
                    raw_filenames[i].replace("/", "_").replace(".", "_") +
                    ".txt"), 'w') as file:
            file.write(content_text)
        with open(
                os.path.join(
                    dir_path,
                    raw_filenames[i].replace("/", "_").replace(".", "_") +
                    "delete.txt"), 'w') as file:
            file.write(delete_text)
Example #16
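# Returns normalized train/test DataLoaders for MNIST or CIFAR-10, with optional unit batch sizes and an option to disable shuffling.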
def get_dataloader(args, unit_batch = False, no_randomness=False):
    if unit_batch:
        bsz = (1, 1)
    else:
        bsz = (args.batch_size_train, args.batch_size_test)

    if no_randomness:
        enable_shuffle = False
    else:
        enable_shuffle = True

    if args.dataset.lower() == 'mnist':

        train_loader = torch.utils.data.DataLoader(
          torchvision.datasets.MNIST('./files/', train=True, download=args.to_download,
                                     transform=torchvision.transforms.Compose([
                                       torchvision.transforms.ToTensor(),
                                       torchvision.transforms.Normalize(
                                           # only 1 channel
                                           (0.1307,), (0.3081,))
                                     ])),
          batch_size=bsz[0], shuffle=enable_shuffle
        )


        test_loader = torch.utils.data.DataLoader(
          torchvision.datasets.MNIST('./files/', train=False, download=args.to_download,
                                     transform=torchvision.transforms.Compose([
                                       torchvision.transforms.ToTensor(),
                                       torchvision.transforms.Normalize(
                                           (0.1307,), (0.3081,))
                                     ])),
          batch_size=bsz[1], shuffle=enable_shuffle
        )

        return train_loader, test_loader

    elif args.dataset.lower() == 'cifar10':
        if args.cifar_style_data:
            train_loader, test_loader = cifar_train.get_dataset(args.config)
        else:

            train_loader = torch.utils.data.DataLoader(
                torchvision.datasets.CIFAR10('./data/', train=True, download=args.to_download,
                                           transform=torchvision.transforms.Compose([
                                               torchvision.transforms.ToTensor(),
                                               torchvision.transforms.Normalize(
                                                   # Note this normalization is not same as in MNIST
                                                   # (mean_ch1, mean_ch2, mean_ch3), (std1, std2, std3)
                                                   (0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
                                           ])),
                batch_size=bsz[0], shuffle=enable_shuffle
            )

            test_loader = torch.utils.data.DataLoader(
                torchvision.datasets.CIFAR10('./data/', train=False, download=args.to_download,
                                           transform=torchvision.transforms.Compose([
                                               torchvision.transforms.ToTensor(),
                                               torchvision.transforms.Normalize(
                                                   # (mean_ch1, mean_ch2, mean_ch3), (std1, std2, std3)
                                                   (0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
                                           ])),
                batch_size=bsz[1], shuffle=enable_shuffle
            )

        return train_loader, test_loader
Example #17
        # checkpoint_type = 'final'  # which checkpoint to use for ensembling (either of 'best' or 'final')

        if args.dataset == 'mnist':
            train_loader, test_loader = get_dataloader(args)
            retrain_loader, _ = get_dataloader(
                args, no_randomness=args.no_random_trainloaders)
        elif args.dataset.lower()[0:7] == 'cifar10':
            args.cifar_init_lr = config['optimizer_learning_rate']
            if args.second_model_name is not None:
                assert second_config is not None
                assert args.cifar_init_lr == second_config[
                    'optimizer_learning_rate']
                # also the below things should be fine as it is just dataloader loading!
            print('loading {} dataloaders'.format(args.dataset.lower()))
            train_loader, test_loader = cifar_train.get_dataset(config)
            retrain_loader, _ = cifar_train.get_dataset(
                config, no_randomness=args.no_random_trainloaders)

        models = []
        accuracies = []

        for idx in range(args.num_models):
            print("loading model with idx {} and checkpoint_type is {}".format(
                idx, args.ckpt_type))

            if args.dataset.lower()[0:7] == 'cifar10' and (
                    args.model_name.lower()[0:5] == 'vgg11'
                    or args.model_name.lower()[0:6] == 'resnet'):
                if idx == 0:
                    config_used = config
Example #18
def finetune(epochs, train_steps):
    warmup_steps = int(train_steps * epochs * 0.1)

    train_dataset = get_dataset()
    model = load_model(train_steps, warmup_steps)
    train(model, train_dataset, epochs, train_steps)
Example #19
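# Generates tweets with a fine-tuned GPT-2 checkpoint, conditioning each sample on a randomly chosen tweet from a distractor CSV and a sampled personality.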
def main():
    parser = ArgumentParser()
    parser.add_argument("--num_tweets", type=int, default=5, help="Number of tweets to generate")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=40, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=10, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=4, help="Filter top-k tokens before sampling (<=0: no filtering)")
    args = parser.parse_args()

    model = 'gpt2'

    p = Path('./runs')
    possibilities = [x for x in p.iterdir() if x.is_dir()]
    print('Please Select a Checkpoint: ')
    for i, check in enumerate(possibilities):
        print(f'{i}) {str(check)}')
    selection = int(input('> '))
    model_checkpoint = possibilities[selection]
    

    distractor_path = "Elizabeth Warren.csv"
    distractor = []

    with open(distractor_path) as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            distractor.append(row)

    def clean(data):
        return [tweet[1:] for tweet in data if tweet[2].count(' ') > 3][1:]
    
    distractor = clean(distractor)
    num_distractor = len(distractor)




    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))
	
	
    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)


    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = GPT2Tokenizer, GPT2LMHeadModel
    tokenizer = tokenizer_class.from_pretrained(model_checkpoint)
    model = model_class.from_pretrained(model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a twitter user")
    dataset = get_dataset(tokenizer)
    personalities = [dialog["name"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    name = tokenizer.decode(personality)
    logger.info("Selected personality: %s", name)

    for i in range(args.num_tweets):
        base_tweet = distractor[random.randint(0, num_distractor-1)]
        context = tokenizer.encode(base_tweet[2])
        with torch.no_grad():
            out_ids = sample_sequence(personality, context, tokenizer, model, args)
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(f'\n\nTweet {i})')
        print(f"Original {base_tweet[0]} Tweet:")
        print(base_tweet[1])
        print('\nContext:')
        print(base_tweet[2])
        print(f"\nGenerated {name} Tweet: ")
        print(out_text)
Example #20
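# Torchvision detection training loop: distributed/grouped samplers, SGD with a multi-step LR schedule, optional resume, per-epoch checkpointing and evaluation.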
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print("Loading data")

    dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=5,
        sampler=test_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    print("Creating model")
    model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes,
                                                              pretrained=args.pretrained)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master({
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args,
                'epoch': epoch},
                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
Example #21
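# Loads (or rebuilds and caches) the padded training set, restores trained U-Net weights, and visualizes predictions against ground truth with a Dice score.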
def main():
    np.random.seed(12345)
    LAYERS = 3
    padding_pkl_fname = "data/preprocess/stage1_train_set_padding.pkl"

    dump = False
    if dump:
        pkl_fname = "data/preprocess/stage1_train_set.pkl"
        images, masks = get_dataset(pkl_fname)
        logging.info("read train set: %s, %s", images.shape, masks.shape)
        logging.info("image:[%s, %s], mask:[%s, %s]", np.max(images),
                     np.min(images), np.max(masks), np.min(masks))

        pred_size, offset = unet_size(256, LAYERS)
        logging.info("pred_size: %d, offset: %d", pred_size, offset)
        images = padding_array(images, offset, default_val=0.0)
        masks = padding_array(masks, offset, default_val=False)
        logging.info("shape after padded: %s, %s", images.shape, masks.shape)
        with open(padding_pkl_fname, "wb") as f:
            pickle.dump((images, masks), f, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        with open(padding_pkl_fname, "rb") as f:
            images, masks = pickle.load(f)

    # test_data(images, masks, 1679)
    data_provider = image_util.SimpleDataProvider(images, masks)
    logging.info("data_provider.channels: %s, data_provider.n_class: %s",
                 data_provider.channels, data_provider.n_class)

    # test_data_provider(data_provider)
    net = unet.Unet(
        channels=data_provider.channels,
        n_class=data_provider.n_class,
        cost='cross_entropy',
        layers=LAYERS,
        features_root=64,
        cost_kwargs=dict(regularizer=0.001),
    )

    net.load_weight("log/20180414/model.cpkt")
    for i in range(2):
        x_test, y_test = data_provider(1)
        prediction = net.infer(x_test)
        logging.info("%s, %s", x_test.shape, y_test.shape)
        mask = prediction[0, ..., 1] > 0.3
        img_cropped = crop_to_shape(x_test[0, ..., 0], mask.shape)
        gt = y_test[0, ..., 1]
        gt_cropped = crop_to_shape(gt, mask.shape)

        fig, ax = plt.subplots(1, 3, sharex=True, sharey=True, figsize=(12, 5))
        ax[0].imshow(img_cropped, aspect="auto")

        ax[1].imshow(gt_cropped, aspect="auto")
        ax[2].imshow(mask, aspect="auto")
        ax[0].set_title("Input")
        ax[1].set_title("Ground truth")
        ax[2].set_title("Prediction")
        fig.tight_layout()
        logging.info("%s, %s", mask.shape, gt_cropped.shape)
        logging.info("dice: %f", calc_dice(mask, gt_cropped))
        plt.show()
Example #22
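# Fits PCA (LDA/t-SNE alternatives are commented out) on the flattened training data, then times the transform on a subset of the test data.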
import torch
import time
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE

from train import get_dataset


NUM = 2000
pca_cls = PCA(n_components=3)
# pca_cls = LDA(n_components=3)
# pca_cls = TSNE(n_components=3)

train_loader, test_loader = get_dataset(1, 1, 6000)

train_x = torch.flatten(train_loader.dataset.data, 1).numpy()
train_y = train_loader.dataset.targets.numpy()

test_x = torch.flatten(test_loader.dataset.data, 1).numpy()[:NUM]
test_y = test_loader.dataset.targets.numpy()[:NUM]

start = time.time()
pca_cls.fit(train_x, train_y)
print(time.time() - start)
start = time.time()
new_X = pca_cls.transform(test_x)
# new_X = pca_cls.fit_transform(test_x)
new_y = test_y
print(time.time() - start)
Example #23
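# Ray Tune trainable: trains several GAT/AGNN initializations per trial, checkpoints the one with the best validation accuracy each epoch, and reports averaged metrics.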
def train_ray(opt, checkpoint_dir=None, data_dir="../data"):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data = get_dataset(opt)
    g = data[0]
    if opt['gpu'] < 0:
        cuda = False
    else:
        cuda = True
        g = g.int().to(opt['gpu'])

    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    num_feats = features.shape[1]
    n_classes = data.num_classes
    n_edges = data.graph.number_of_edges()
    print("""----Data statistics------'
    #Edges %d
    #Classes %d
    #Train samples %d
    #Val samples %d
    #Test samples %d""" %
          (n_edges, n_classes, train_mask.int().sum().item(),
           val_mask.int().sum().item(), test_mask.int().sum().item()))

    # add self loop
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)
    n_edges = g.number_of_edges()
    # create model
    heads = ([opt['num_heads']] * opt['num_layers']) + [opt['num_out_heads']]

    models = []
    optimizers = []
    datas = [g for i in range(opt['num_init'])]

    for split in range(opt['num_init']):
        if opt['model'] == 'GAT':
            model = GAT(g, opt['num_layers'], num_feats, opt['num_hidden'],
                        n_classes, heads, F.elu, opt['in_drop'],
                        opt['attn_drop'], opt['negative_slope'],
                        opt['residual'], opt)
        elif opt['model'] == 'AGNN':
            model = AGNN(g, opt['num_layers'], num_feats, opt['num_hidden'],
                         n_classes, opt['in_drop'], opt)

        train_this = train
        model = model.to(device)
        models.append(model)

        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)

        # model = model.to(device)
        parameters = [p for p in model.parameters() if p.requires_grad]

        optimizer = get_optimizer(opt['optimizer'],
                                  parameters,
                                  lr=opt['lr'],
                                  weight_decay=opt['weight_decay'])
        optimizers.append(optimizer)

        # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint
        # should be restored.
        if checkpoint_dir:
            checkpoint = os.path.join(checkpoint_dir, "checkpoint")
            model_state, optimizer_state = torch.load(checkpoint)
            model.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    for epoch in range(1, opt['epochs']):
        loss = np.mean([
            train_this(model, optimizer, features, train_mask,
                       labels)[0].item()
            for model, optimizer in zip(models, optimizers)
        ])
        train_accs, val_accs, tmp_test_accs = average_test(models, datas)
        with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            best = np.argmax(val_accs)
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save(
                (models[best].state_dict(), optimizers[best].state_dict()),
                path)
        tune.report(loss=loss,
                    accuracy=np.mean(val_accs),
                    test_acc=np.mean(tmp_test_accs),
                    train_acc=np.mean(train_accs))
Example #24
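# Ray Tune trainable for a single GAT/AGNN model: tracks the best validation epoch, checkpoints every epoch, and reports the corresponding test accuracy.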
def train_ray_int(opt, checkpoint_dir=None, data_dir="../data"):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data = get_dataset(opt)
    g = data[0]
    if opt['gpu'] < 0:
        cuda = False
    else:
        cuda = True
        g = g.int().to(opt['gpu'])

    # if opt["num_splits"] > 0:
    #   dataset.data = set_train_val_test_split(
    #     23 * np.random.randint(0, opt["num_splits"]),  # random prime 23 to make the splits 'more' random. Could remove
    #     dataset.data,
    #     num_development=5000 if opt["dataset"] == "CoauthorCS" else 1500)

    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    num_feats = features.shape[1]
    n_classes = data.num_classes
    n_edges = data.graph.number_of_edges()
    print("""----Data statistics------'
  #Edges %d
  #Classes %d
  #Train samples %d
  #Val samples %d
  #Test samples %d""" %
          (n_edges, n_classes, train_mask.int().sum().item(),
           val_mask.int().sum().item(), test_mask.int().sum().item()))

    # add self loop
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)
    n_edges = g.number_of_edges()
    # create model
    heads = ([opt['num_heads']] * opt['num_layers']) + [opt['num_out_heads']]
    if opt['model'] == 'GAT':
        model = GAT(g, opt['num_layers'], num_feats, opt['num_hidden'],
                    n_classes, heads, F.elu, opt['in_drop'], opt['attn_drop'],
                    opt['negative_slope'], opt['residual'], opt)
    elif opt['model'] == 'AGNN':
        model = AGNN(g, opt['num_layers'], num_feats, opt['num_hidden'],
                     n_classes, opt['in_drop'], opt)

    model = model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = get_optimizer(opt["optimizer"],
                              parameters,
                              lr=opt["lr"],
                              weight_decay=opt["weight_decay"])

    if checkpoint_dir:
        checkpoint = os.path.join(checkpoint_dir, "checkpoint")
        model_state, optimizer_state = torch.load(checkpoint)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
    train_this = train
    this_test = test_OGB if opt['dataset'] == 'ogbn-arxiv' else test
    best_time = best_epoch = train_acc = val_acc = test_acc = 0
    for epoch in range(1, opt["epoch"]):
        # loss = train(model, optimizer, data)
        loss = train_this(model, optimizer, features, train_mask,
                          labels)[0].item()
        if opt["no_early"]:
            tmp_train_acc, tmp_val_acc, tmp_test_acc = this_test(model, g)
            best_time = opt['time']
        else:
            tmp_train_acc, tmp_val_acc, tmp_test_acc = this_test(model, g)
        if tmp_val_acc > val_acc:
            best_epoch = epoch
            train_acc = tmp_train_acc
            val_acc = tmp_val_acc
            test_acc = tmp_test_acc
        with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(loss=loss,
                    accuracy=val_acc,
                    test_acc=test_acc,
                    train_acc=train_acc,
                    best_time=best_time,
                    best_epoch=best_epoch)
Example #25
            ensemble_root_dir = "{}/{}_models/".format(args.baseroot, (args.dataset).lower())
            ensemble_dir = ensemble_root_dir + args.load_models

        utils.mkdir(ensemble_dir)
        # checkpoint_type = 'final'  # which checkpoint to use for ensembling (either of 'best' or 'final')

        if args.dataset=='mnist':
            train_loader, test_loader = get_dataloader(args)
        elif args.dataset.lower() == 'cifar10':
            args.cifar_init_lr = config['optimizer_learning_rate']
            if args.second_model_name is not None:
                assert second_config is not None
                assert args.cifar_init_lr == second_config['optimizer_learning_rate']
                # also the below things should be fine as it is just dataloader loading!
            print('loading {} dataloaders'.format(args.dataset.lower()))
            train_loader, test_loader = cifar_train.get_dataset(config)

        models = []
        accuracies = []
        local_accuracies = []
        for idx in range(args.num_models):
            print("loading model with idx {} and checkpoint_type is {}".format(idx, args.ckpt_type))

            if args.dataset.lower()[0:7] == 'cifar10' and (args.model_name.lower()[0:5] == 'vgg11' or args.model_name.lower()[0:6] == 'resnet'):
                if idx == 0:
                    config_used = config
                elif idx == 1:
                    config_used = second_config

                model, accuracy = cifar_train.get_pretrained_model(
                        config_used, os.path.join(ensemble_dir, 'model_{}/{}.checkpoint'.format(idx, args.ckpt_type)),
Example #26
def run():
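    # Interactive loop: samples a twitter personality and generates a new tweet from a fine-tuned GPT-2 model for each prompt the user enters.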
    parser = ArgumentParser()
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset.bin',
                        help="Path or url of the dataset cache")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature",
                        type=int,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        raise ValueError(
            "Interacting with Model requires passing a finetuned model_checkpoint"
        )

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a twitter user")
    dataset = get_dataset(tokenizer)
    personalities = [
        dialog["name"] for dataset in dataset.values() for dialog in dataset
    ]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(personality))

    previous_tweet = []
    while True:
        raw_text = input("Please enter a previous tweet: ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input("Please enter a previous tweet: ")
        previous_tweet = tokenizer.encode(raw_text)
        with torch.no_grad():
            out_ids = sample_sequence(personality, previous_tweet, tokenizer,
                                      model, args)
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print('New Tweet:')
        print(out_text)
Example #27
def main(argv):
    for i in range(10):
        with open(AUG % i) as fh:
            aug_data = json.load(fh)
        dataset = get_dataset(aug_data=aug_data)