Example 1
def __main__():
    train = data.Dataset('train_data.txt')
    dev = data.Dataset('dev_data.txt')

    accs = []
    for i in range(5):
        print("Seed", i)
        classifier, acc = models.train_classifier(train, dev, seed=i)
        accs.append(acc)
        reporter.report_results(classifier, dev)
        print()
Example 2
def train(config_path):
    """ Trains a model

    Args:
        config_path: string, path to a config.json file
    """

    # Load configuration
    if not os.path.exists(config_path):
        print('Error: No configuration file present at specified path.')
        return

    config = util.load_config(config_path)
    print('Loaded configuration from: %s' % config_path)

    # Create session directory
    if 'session_dir' not in config['training'] or os.path.exists(
            config['training']['session_dir']):
        create_new_session(config)

    model = sfun.SFUN(config)
    dataset = data.Dataset(config).load_dataset()

    train_set_generator = dataset.get_random_batch_generator('train')
    val_set_generator = dataset.get_random_batch_generator('val')

    model.fit_model(train_set_generator, config['training']['num_steps_train'],
                    val_set_generator, config['training']['num_steps_val'],
                    config['training']['num_epochs'])
Example 3
def bn_update(tf_config, logger):
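    # With BN_MOMENTUM forced to 0, the UPDATE_OPS run below overwrite the
    # batch-norm moving statistics with those computed from a single batch that
    # spans the whole training set; the refreshed weights are then re-saved
    # as a new 'bn-model' checkpoint.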
    dataset = data.Dataset(cfg.DATASET, cfg.RNG_SEED)
    cfg.MODEL.BN_MOMENTUM = 0.
    assert cfg.MODEL.BN_MOMENTUM == 0., 'BN_MOMENTUM should be 0. for update step'
    imgs, _ = dataset.preprocessing(training=True,
                                    augment=False,
                                    batch_size=dataset.train_num,
                                    num_epochs=1)

    net, _ = model.unet(imgs,
                        bn_training=True,
                        dropout_training=False,
                        dataset=cfg.DATASET)
    with tf.variable_scope('cls'):
        _ = tf.layers.conv2d(net, 1, 1, activation=tf.nn.relu)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    saver = tf.train.Saver(max_to_keep=1000)
    with tf.Session(config=tf_config) as sess:
        weights_path = tf.train.latest_checkpoint(cfg.OUTPUT_DIR)
        logger.info('Restoring weights from {}'.format(weights_path))
        saver.restore(sess, weights_path)

        sess.run(update_ops)

        weights_path = saver.save(sess,
                                  os.path.join(cfg.OUTPUT_DIR, 'bn-model'),
                                  global_step=int(weights_path.split('-')[-1]))
        logger.info('Updating weights to {}'.format(weights_path))
    tf.reset_default_graph()
Example 4
    def __init__(self,
                 train=True,
                 common_params=None,
                 solver_params=None,
                 net_params=None,
                 dataset_params=None):

        self.device_id = int(common_params['gpus'])  # e.g. 0
        self.image_size = int(common_params['image_size'])  # e.g. 256
        self.batch_size = int(common_params['batch_size'])
        self.num_gpus = 1

        self.learning_rate = float(solver_params['learning_rate'])
        self.moment = float(solver_params['moment'])
        self.max_steps = int(solver_params['max_iterators'])
        self.train_dir = str(solver_params['train_dir'])
        self.lr_decay = float(solver_params['lr_decay'])
        self.decay_steps = int(solver_params['decay_steps'])

        self.train = train
        self.cnn = cnn.Model(train=train,
                             common_params=common_params,
                             net_params=net_params)
        self.dataset = data.Dataset(common_params=common_params,
                                    dataset_params=dataset_params)
Example 5
def initialize_dataset():
    d = data.Dataset("Test", data.Conditions(), sequence="MEGAMAN")
    d.create_peptide("MEGA", start_residue=1)
    d.create_peptide("MEGAMA", start_residue=1)
    d.create_peptide("GAMAN", start_residue=3)
    d.create_peptide("AMAN", start_residue=4)
    return d
Example 6
def evaluate_policy_docs():
    opt = make_options()
    dataset = data.Dataset()
    feeder = data.Feeder(dataset)
    model, _ = models.load_or_create_models(opt, False)
    translator = Translator(model, opt.beam_size, opt.min_length,
                            opt.max_length)
    docs = data.load_policy_documents()
    for doc in docs:
        data.parse_paragraphs(doc)
    lines = []
    for doc in docs:
        paras = [p for p in doc.paragraphs if 50 <= len(p) <= 400]
        if not paras:
            continue
        lines.append('=================================')
        lines.append(doc.title)
        if len(paras) > 16:
            paras = random.sample(paras, 16)
        paras = sorted(paras, key=lambda x: -len(x))
        pids = [feeder.sent_to_ids(p) for p in paras]
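        # align2d pads the per-paragraph id lists into a rectangular batch;
        # the true lengths are recovered by counting non-NULL ids below.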
        pids = data.align2d(pids)
        src = nu.tensor(pids)
        lengths = (src != data.NULL_ID).sum(-1)
        tgt = translator.translate(src.transpose(0, 1), lengths,
                                   opt.best_k_questions)
        questions = [[feeder.ids_to_sent(t) for t in qs] for qs in tgt]
        for p, qs in zip(paras, questions):
            lines.append('--------------------------------')
            lines.append(p)
            for k, q in enumerate(qs):
                lines.append('predict {}: {}'.format(k, q))
    utils.write_all_lines(opt.output_file, lines)
Example 7
def evaluate():
    opt = make_options()
    dataset = data.Dataset()
    model, _ = models.load_or_create_models(opt, False)
    evaluate_accuracy(model, dataset, opt.batch_size, opt.beam_size,
                      opt.min_length, opt.max_length, opt.best_k_questions,
                      None, opt.output_file)
Example 8
    def _init(self):
        args = self._args
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device)
        self._device = f'cuda:{args.device}' if torch.cuda.is_available() else "cpu"
        self._aug = utils.Augmentation(float(args.aug_params[0]),
                                       float(args.aug_params[1]),
                                       float(args.aug_params[2]),
                                       float(args.aug_params[3]))
        self._dataset = data.Dataset(root=args.root,
                                     name=args.name,
                                     num_parts=args.init_parts,
                                     final_parts=args.final_parts,
                                     augumentation=self._aug)
        self._loader = DataLoader(
            dataset=self._dataset)  # [self._dataset.data]
        print(f"Data: {self._dataset.data}")
        hidden_layers = [int(l) for l in args.layers]
        layers = [self._dataset.data.x1.shape[1]] + hidden_layers
        self._model = models.BGRL(layer_config=layers,
                                  pred_hid=args.pred_hid,
                                  dropout=args.dropout,
                                  epochs=args.epochs).to(self._device)
        print(self._model)
        self._optimizer = torch.optim.Adam(params=self._model.parameters(),
                                           lr=args.lr,
                                           weight_decay=1e-5)
Example 9
def main(opt):
    dataset = data.Dataset(dataset=opt.dataset, pool_size=opt.pool_size, sample_size=opt.sample_size)
    dataset.show_inf()
    feature_size, att_size = dataset.feature_size, dataset.att_size
    discriminator = model.Discriminator(feature_size, att_size).cuda()
    generator = model.Generator(feature_size, att_size).cuda()

    for epoch in range(opt.epochs):
        # d_loss = train.train_disciminator(discriminator, generator, dataset, opt.lr, opt.batch_size, epoch)
        # g_loss = train.train_generator(discriminator, generator, dataset, opt.lr, opt.batch_size, epoch)
        d_loss, g_loss = train.train_together(discriminator, generator, dataset, opt.lr, opt.batch_size, epoch)
        D_zsl_acc = test.compute_acc(discriminator, dataset, opt1='zsl', opt2='test_unseen')
        D_seen_acc = test.compute_acc(discriminator, dataset, opt1='gzsl', opt2='test_seen')
        D_unseen_acc = test.compute_acc(discriminator, dataset, opt1='gzsl', opt2='test_unseen')
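        # harmonic mean of seen and unseen accuracy (the usual GZSL H metric)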
        D_harmonic_mean = (2 * D_seen_acc * D_unseen_acc) / (D_seen_acc + D_unseen_acc)
        print("Epoch {}/{}...".format(epoch + 1, opt.epochs))
        print("D_Loss: {:.4f}".format(d_loss),
              "zsl_acc: {:.4f}".format(D_zsl_acc),
              "seen_acc: {:.4f}".format(D_seen_acc),
              "unseen_acc: {:.4f}".format(D_unseen_acc),
              "harmonic_mean: {:.4f}".format(D_harmonic_mean)
              )
        G_zsl_acc = test.compute_acc(generator, dataset, opt1='zsl', opt2='test_unseen')
        G_seen_acc = test.compute_acc(generator, dataset, opt1='gzsl', opt2='test_seen')
        G_unseen_acc = test.compute_acc(generator, dataset, opt1='gzsl', opt2='test_unseen')
        G_harmonic_mean = (2 * G_seen_acc * G_unseen_acc) / (G_seen_acc + G_unseen_acc)
        print("G_Loss: {:.4f}".format(g_loss),
              "zsl_acc: {:.4f}".format(G_zsl_acc),
              "seen_acc: {:.4f}".format(G_seen_acc),
              "unseen_acc: {:.4f}".format(G_unseen_acc),
              "harmonic_mean: {:.4f}".format(G_harmonic_mean)
              )
Example 10
def parsePlinkBfile(prefix, noPheno=False, params=defaultParams, options=None):
    fam = prefix + '.fam'
    bim = prefix + '.bim'
    bed = prefix + '.bed'
    ## individual data
    idata = parseFamFile(fam)  ## options not implemented
    ni = len(idata)
    ### SNP data
    sdata = parseBimFile(bim)
    if options and options.chr:
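        # keep only the SNPs inside the requested chromosome / position window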
        mySnpIdx = _get_snpIdx(sdata['map'], options.chr, options.posleft,
                               options.posright, options.other_map)
        ns = len(mySnpIdx)
    else:
        ns = len(sdata['snps'])
        mySnpIdx = range(ns)
    dataset = data.Dataset(prefix, nsnp=ns, nindiv=ni)
    ## fill in SNP data
    for s in [sdata['snps'][i] for i in mySnpIdx]:
        dataset.addSnp(s.name)
        dataset.snp[s.name].initAlleles(s.alleles[0], s.alleles[1])
    ## fill in indiv data
    for ind in idata:
        dataset.addIndividual(pop=ind[0],
                              ID=ind[1],
                              fatherID=ind[2],
                              motherID=ind[3],
                              sex=ind[4],
                              phenotype=ind[5])
    fillBedData(bed, dataset.Data, mySnpIdx)
    return {'dataset': dataset, 'map': sdata['map']}
Example 11
    def test(self):
        test_data = data.Dataset(self.config['test_data_file'], shuffle=False)
        test_graph = tf.Graph()

        with tf.Session(graph=test_graph) as sess:
            model = models.RumourDetectModel(
                embed_dim=self.config['embed_dim'],
                vocab_size=self.config['vocab_size'],
                sent_hidden_dims=self.config['sent_hidden_dims'],
                branch_hidden_dims=self.config['branch_hidden_dims'],
                sdqc_attn_dim=self.config['sdqc_attn_dim'],
                veracity_attn_dim=self.config['veracity_attn_dim'],
                sdqc_hidden_dim=self.config['sdqc_hidden_dim'],
                veracity_hidden_dim=self.config['veracity_hidden_dim'],
                embed_pret_file=self.config['embed_pret_file'],
                dicts_file=self.config['dicts_file'],
                keep_prob=1.0,
                reuse=None)

            model(is_train=False)
            sess.run(tf.global_variables_initializer())
            if self.config['embed_pret_file']:
                model.embedder.init_pretrained_emb(sess)

            saver = tf.train.Saver(max_to_keep=self.config['max_ckpts'])
            ckpt_dir = os.path.dirname(self.config['ckpt'])
            ckpt = tf.train.latest_checkpoint(ckpt_dir)
            saver.restore(sess, ckpt)

            utils.print_log('Testing ...')
            batch_num = int(
                math.floor(len(test_data.records) / self.config['batch_size']))

            sdqc_corr, sdqc_total, veracity_corr, veracity_total = 0, 0, 0, 0
            for _ in range(batch_num):
                X, X_pret, Y_sdqc, Y_veracity, sent_length, branch_length = \
                    test_data.get_next(self.config['batch_size'])
                c1, t1, c2, t2 = sess.run(
                    [
                        model.sdqc_correct_count, model.sdqc_total_count,
                        model.veracity_correct_count,
                        model.veracity_total_count
                    ],
                    feed_dict={
                        model.word_ids: X,
                        model.word_ids_pret: X_pret,
                        model.sdqc_labels: Y_sdqc,
                        model.veracity_labels: Y_veracity,
                        model.sent_length: sent_length,
                        model.branch_length: branch_length,
                    })
                sdqc_corr += c1
                sdqc_total += t1
                veracity_corr += c2
                veracity_total += t2

            utils.print_log(
                'SDQC Task Acc = {}, Veracity Task Acc = {}'.format(
                    float(sdqc_corr) / sdqc_total,
                    float(veracity_corr) / veracity_total))
Example 12
def create_testing():
    """create testing dataset"""
    dataset = []
    """
    # for oxford flower 17
    labels = [0]
    img_dir = "jpg"
    with open('labels.txt','rb') as f:
        for line in f:
            labels.append(int(line.strip())-1)
    for img in os.listdir(img_dir):
        if img.find('.jpg') ==-1: #.txt file
            continue 
        index = int(img.replace('image_','').replace('.jpg',''))
        dataset.append(data.ImageClassData(
            cv2.imread(osp.join(img_dir, img), cv2.IMREAD_COLOR), labels[index], img))

        sys.stdout.write("\r{:7d}".format(len(dataset)))
        sys.stdout.flush()

    """
    # for cifar10 
    print('Loading cifar 10 testing data...')
    batch_dic = unpickle('cifar-10-batches-py/test_batch')
    for index in xrange(len(batch_dic['data'])):
        dataset.append(data.ImageClassData(batch_dic['data'][index], batch_dic['labels'][index]))
        sys.stdout.write("\r{:7d}".format(len(dataset)))
        sys.stdout.flush()
    
    return data.Dataset(dataset)
Example 13
def exhaustive_search(model, full_dataset, country_code):

	global fig
	plt.figure(fig)

	data.set_full_dataset(full_dataset)

	own_model = data.Dataset(country_code).to_model()
	own_model.set_pca(4)
	own_model.set_epochs(300)
	own_model.train()

	plt.subplot(3, 2, 1)
	plt.title(country_code + ' - all data')
	_search(model, country_code, plot_revenue=True, with_index=True)
	plt.subplot(3, 2, 2)
	plt.title(country_code + ' - all data')
	_search(model, country_code, with_index=True)

	plt.subplot(3, 2, 3)
	own_model.plot_history(False)

	plt.subplot(3, 2, 5)
	plt.title(country_code + ' - own data')
	_search(own_model, country_code, plot_revenue=True)
	plt.subplot(3, 2, 6)
	plt.title(country_code + ' - own data')
	_search(own_model, country_code)

	fig += 1
Example 14
def process(code, div):
    import openface
    import openface.helper
    import dlib
    from openface.alignment import NaiveDlib  # Depends on dlib.
    code = int(code)
    div = int(div)
    dlibModelDir = os.path.join(fileDir, "./openface/models/dlib")
    dlibFaceMean = os.path.join(dlibModelDir, "mean.csv")
    dlibFacePredictor = os.path.join(dlibModelDir,
                                     "shape_predictor_68_face_landmarks.dat")
    align = NaiveDlib(dlibFaceMean, dlibFacePredictor)
    dataset = data.Dataset()
    last = time.time()
    count = 0
    for model, key, img in dataset.get_images(BUCKET_NAME):
        if hash(key) % div == code:
            bb = align.getLargestFaceBoundingBox(img)
            aligned = align.alignImg("affine", 224, img, bb)
            # print time.time() - last
            last = time.time()
            count += 1
            if aligned is not None:
                # print model,key,img.shape,bb,aligned.shape
                cv2.imwrite(
                    "output/face_{}".format(
                        key.replace('/', '_').replace('models', '')), aligned)
                # cv2.imshow("test",aligned)
                # cv2.waitKey(0)
                # cv2.destroyAllWindows()
                # break
        if count % 20 == 0 and code == 0:
            local(
                'aws s3 mv output/ s3://aub3data/output/ --recursive --storage-class "REDUCED_REDUNDANCY"  --region "us-east-1"'
            )
Example 15
    def _init(self):
        args = self._args
        self._device = torch.device(
            utils.get_device_id(torch.cuda.is_available()))
        self._aug = utils.Augmentations(method=args.aug)

        self._dataset = data.Dataset(root=args.root,
                                     name=args.name,
                                     num_parts=args.init_parts,
                                     final_parts=args.final_parts,
                                     augumentation=self._aug)
        self._loader = DataLoader(
            dataset=self._dataset)  # [self._dataset.data]
        print(f"Data Augmentation method {args.aug}")
        print(f"Data: {self._dataset.data}")
        hidden_layers = [int(l) for l in args.layers]
        layers = [self._dataset.data.x.shape[1]] + hidden_layers
        self._norm_config = utils.get_norm_configs(args.norms)
        self._model = models.SelfGNN(layer_config=layers,
                                     dropout=args.dropout,
                                     gnn_type=args.model,
                                     heads=args.heads,
                                     **self._norm_config).to(self._device)

        print(self._model)
        self._optimizer = torch.optim.Adam(params=self._model.parameters(),
                                           lr=args.lr)
Example 16
def main(opt):
    dataset = data.Dataset(dataset=opt.dataset, pool_size=opt.pool_size, sample_size=opt.sample_size)
    dataset.show_inf()
    feature_size, att_size = dataset.feature_size, dataset.att_size
    discriminator = model.Discriminator(feature_size, att_size, opt.t1).cuda()
    generator = model.Generator(feature_size, att_size, opt.t2).cuda()
    train2.train(discriminator, generator, dataset, d_lr=opt.d_lr, g_lr=opt.g_lr,\
                 batch_size=opt.batch_size, alpha=opt.alpha, epochs=opt.epochs)
Example 17
def build_translator():
    opt = make_options()
    dataset = data.Dataset()
    feeder = data.Feeder(dataset)
    model, _ = models.load_or_create_models(opt, False)
    translator = Translator(model, opt.beam_size, opt.min_length,
                            opt.max_length)
    return translator, feeder
Example 18
def calculateDataLoaderTrain(args_dict):
    # Augmentation
    train_transformation = transforms.Compose([
        transforms.Resize(
            256),  # rescale the image keeping the original aspect ratio
        transforms.CenterCrop(256),  # we get only the center of that rescaled
        transforms.RandomCrop(
            224),  # random crop within the center crop (data augmentation)
        transforms.ColorJitter(brightness=(0.9, 1.1)),
        transforms.RandomRotation((-10, 10)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomAffine(0,
                                translate=(0.1, 0.1),
                                shear=10,
                                scale=(0.85, 1.15),
                                fillcolor=0),
        # TransformShow(), # visualize transformed pic
        transforms.ToTensor(),
    ])

    # Dataloaders for training and validation
    # preprocess the given txt files: Train
    datasets_train, _, labels, labels_non, labels_cov = data.preprocessSplit(
        args_dict.train_txt)

    # create Datasets
    train_non_covid = data.Dataset(datasets_train[0],
                                   labels_non,
                                   args_dict.train_folder,
                                   transform=train_transformation)
    train_covid = data.Dataset(datasets_train[1],
                               labels_cov,
                               args_dict.train_folder,
                               transform=train_transformation)

    covid_size = max(int(args_dict.batch * args_dict.covid_percent), 1)

    # create data loaders: each training batch takes covid_size samples (at
    # least one) from the COVID loader and the rest from the non-COVID loader
    dl_non_covid = DataLoader(train_non_covid,
                              batch_size=(args_dict.batch - covid_size),
                              shuffle=True)  # num_workers= 2
    dl_covid = DataLoader(train_covid, batch_size=covid_size,
                          shuffle=True)  # num_workers= 2

    return dl_non_covid, dl_covid
Example 19
def main():
    parser = argparse.ArgumentParser(description='Yelp Rating Interpretation')

    parser.add_argument('--n-estimators', type=int, default=100)
    parser.add_argument('--criterion',
                        type=str,
                        default='gini',
                        choices=['gini', 'entropy'])
    parser.add_argument('--max-depth', type=int, default=20)
    parser.add_argument('--seed', type=int, default=23)
    parser.add_argument('--top-n-features', type=int)

    parser.add_argument('--train-datafile', type=str, default='data/train.csv')
    parser.add_argument('--test-datafile', type=str, default='data/test.csv')
    parser.add_argument('--model-path', type=str, default='models/model.pkl')
    parser.add_argument('--fig-path',
                        type=str,
                        default='figure/importance.png')

    args = parser.parse_args()

    model = models.RatingInterpreter(n_estimators=args.n_estimators,
                                     criterion=args.criterion,
                                     max_depth=args.max_depth,
                                     seed=args.seed,
                                     top_n_features=args.top_n_features)

    # if os.path.exists(args.model_path):
    #   model.load(args.model_path)
    # else:
    train_dataset = data.Dataset(args.train_datafile)
    test_dataset = data.Dataset(args.test_datafile)

    # acc, rmse = model.train(train_dataset, test_dataset)
    acc = model.train(train_dataset, test_dataset)
    model.save(args.model_path)

    importances, std = model.get_importance()
    # visualize.display(importances, std, acc, rmse, args.fig_path,
    #                   top_n_features=args.top_n_features)
    visualize.display(importances,
                      std,
                      acc,
                      args.fig_path,
                      top_n_features=args.top_n_features)
Example 20
def build_train_model(opt, dataset=None):
    dataset = dataset or data.Dataset(opt)
    model = build_model(opt, dataset)
    feeder = data.TrainFeeder(dataset, opt.batch_size, opt.char_limit)
    optimizer = torch.optim.Adam(
        [p for p in model.parameters() if p.requires_grad],
        lr=opt.learning_rate)
    feeder.prepare('train')
    return model, optimizer, feeder
Example 21
def build_train_models(opt):
    dataset = data.Dataset()
    generator = build_model(opt, dataset.vocab_size)
    discriminator = build_discriminator(opt)
    feeder = data.TrainFeeder(dataset)
    g_optimizer = torch.optim.Adam(generator.parameters(), lr=opt.learning_rate)
    d_optimizer = torch.optim.Adam(discriminator.parameters(), lr=opt.learning_rate)
    feeder.prepare('train', opt.batch_size)
    return generator, discriminator, g_optimizer, d_optimizer, feeder
Example 22
def main():
    """ Run training and export summaries to data_dir/logs for a single test
    setup and a single set of parameters. Summaries include a) TensorBoard
    summaries, b) the latest train/test accuracies and raw edit distances
    (status.txt), c) the latest test predictions along with test ground-truth
    labels (test_label_seqs.pkl, test_prediction_seqs.pkl), d) visualizations
    as training progresses (test_visualizations_######.png)."""

    args = define_and_process_args()
    print('\n', 'ARGUMENTS', '\n\n', args, '\n')

    log_dir = get_log_dir(args)
    print('\n', 'LOG DIRECTORY', '\n\n', log_dir, '\n')

    standardized_data_path = os.path.join(args.data_dir, args.data_filename)
    if not os.path.exists(standardized_data_path):
        message = '%s does not exist.' % standardized_data_path
        raise ValueError(message)

    dataset = data.Dataset(standardized_data_path)
    train_raw_seqs, test_raw_seqs = dataset.get_splits(args.test_users)
    train_triplets = [data.prepare_raw_seq(seq) for seq in train_raw_seqs]
    test_triplets = [data.prepare_raw_seq(seq) for seq in test_raw_seqs]

    train_input_seqs, train_reset_seqs, train_label_seqs = zip(*train_triplets)
    test_input_seqs, test_reset_seqs, test_label_seqs = zip(*test_triplets)

    # Resolve the model class named '<model_type>Model' from the models module.
    Model = getattr(models, args.model_type + 'Model')
    input_size = dataset.input_size
    target_size = dataset.num_classes

    # This is just to satisfy a low-CPU requirement on our cluster
    # when using GPUs.
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        config = tf.ConfigProto(intra_op_parallelism_threads=2,
                                inter_op_parallelism_threads=2)
    else:
        config = None

    with tf.Session(config=config) as sess:
        model = Model(input_size, target_size, args.num_layers,
                      args.hidden_layer_size, args.init_scale,
                      args.dropout_keep_prob)
        optimizer = optimizers.Optimizer(model.loss, args.num_train_sweeps,
                                         args.initial_learning_rate,
                                         args.num_initial_sweeps,
                                         args.num_sweeps_per_decay,
                                         args.decay_factor,
                                         args.max_global_grad_norm)
        train(sess, model, optimizer, log_dir, args.batch_size,
              args.num_sweeps_per_summary, args.num_sweeps_per_save,
              train_input_seqs, train_reset_seqs, train_label_seqs,
              test_input_seqs, test_reset_seqs, test_label_seqs)
Example 23
    def get_detector_datasets(self):
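        # get_hot_flip_data() yields token inputs plus detector and
        # char-selector labels for train/val; only the detector labels are kept here.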
        train_token_input, train_predections_detector, _, \
        val_token_input, val_predections_detector, _ = self.get_hot_flip_data()

        detector_dataset = data.Dataset(train_seq=train_token_input,
                                        train_lbl=train_predections_detector,
                                        val_seq=val_token_input,
                                        val_lbl=val_predections_detector,
                                        test_seq=None,
                                        test_lbl=None)

        return detector_dataset
Example 24
def parseSyncFile(fileName, populations, popNames=None):
    print "Reading", fileName

    ## open reader
    reader = sync.SyncReader(fileName)
    nsnp = reader.countSnps()
    print "Found", nsnp, "SNPs in file"

    ## get population names if not provided
    if popNames is None:
        popNames = ["pop" + str(i) for i in xrange(1, len(populations) + 1)]
    elif len(populations) != len(popNames):
        popNames = [popNames[x - 1] for x in populations]
    ## map object
    myMap = data.Map()
    ## create empty dataset
    dataset = data.Dataset(fileName, nsnp=nsnp, nindiv=len(popNames))
    for pop in popNames:
        dataset.addIndividual(pop, pop=pop)
    sync_bases = ['A', 'T', 'C', 'G']
    frqs = []
    # for each record
    for record in reader:
        totalCounts = np.zeros(shape=(len(populations), len(sync_bases)),
                               dtype='int16')
        i = 0
        for pop in record.subpopulations(populations):
            popCounts = []
            for base in sync_bases:
                popCounts.append(pop.countForAllele(base))
            totalCounts[i, ] = popCounts
            i += 1
        # two major alleles indexes
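        # argpartition with kth=1 places the two smallest totals first, so the
        # last two positions index the two most frequent (major) alleles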
        indexes = np.argpartition(sum(totalCounts), 1)[len(sync_bases) - 2:]
        ## get biallelic
        biallelic = totalCounts[:, indexes]
        ## get the alleles
        alleles = [sync_bases[x] for x in indexes]
        alleleFreqs = biallelic.astype(float) / np.sum(biallelic, 1)[:, None]
        frqs.append(alleleFreqs[:, 0])
        # get the name for the SNP
        name = record.chr + "_" + str(record.pos)
        ## add the marker to the map and to the dataset
        myMap.addMarker(M=name, C=record.chr, posG=record.pos, posP=record.pos)
        snp = dataset.addSnp(name)
        snp.initAlleles(alleles[0], alleles[1])
    ## return all including freqs
    return {
        'dataset': dataset,
        'map': myMap,
        'freqs': np.transpose(np.vstack(frqs)),
        'pops': popNames
    }
Example 25
    def get_char_selector_datasets(self):
        train_token_input, _, train_predections_char_selector, \
        val_token_input, _, val_predections_char_selector = self.get_hot_flip_data()

        char_selector_dataset = data.Dataset(
            train_seq=train_token_input,
            train_lbl=train_predections_char_selector,
            val_seq=val_token_input,
            val_lbl=val_predections_char_selector,
            test_seq=None,
            test_lbl=None)

        return char_selector_dataset
Example 26
def prepare_dataloaders(word2idx, ints, en1_pos, en2_pos, predicates):
    # ========= Preparing DataLoader =========#
    train_loader = torch.utils.data.DataLoader(
        data.Dataset(
            word2idx=word2idx,
            insts=ints,
            en1_pos=en1_pos,
            en2_pos=en2_pos,
            predicates=predicates),
        batch_size=128,
        collate_fn=collate_fn,
        shuffle=True)

    valid_loader = torch.utils.data.DataLoader(
        data.Dataset(
            word2idx=word2idx,
            insts=ints,
            en1_pos=en1_pos,
            en2_pos=en2_pos,
            predicates=predicates),
        batch_size=128,
        collate_fn=collate_fn)
    return train_loader, valid_loader
Example 27
def create_training():
    """create training dataset"""
    dataset = []
    """
    # for oxford flower 17
    labels = [0]

    img_dir = "jpg"
    with open('labels.txt','rb') as f:
        for line in f:
            labels.append(int(line.strip())-1)
    for img in os.listdir(img_dir):
        if img.find('.jpg') ==-1: #.txt file
            continue 
        index = int(img.replace('image_','').replace('.jpg',''))


        dataset.append(data.ImageClassData(cv2.imread(osp.join(img_dir, img), cv2.IMREAD_COLOR), labels[index], img))

        sys.stdout.write("\r{:7d}".format(len(dataset)))
        sys.stdout.flush()

    """
    # for cifar10 
    print('Loading cifar 10 training data...')
    for batch_index in xrange(1, 6):
        batch_dic = unpickle('cifar-10-batches-py/data_batch_{}'.format(batch_index))
        for index in xrange(len(batch_dic['data'])):
            dataset.append(data.ImageClassData(batch_dic['data'][index], batch_dic['labels'][index]))
            sys.stdout.write("\r{:7d}".format(len(dataset)))
            sys.stdout.flush()
    
    val_ratio = 1./5 # split for validation
    split_index = int(len(dataset) *val_ratio)
    random.shuffle(dataset)

    return data.Dataset(dataset[split_index:]), data.Dataset(dataset[:split_index])
Example 28
def initialize_system_and_dataset():
    sequence = "MEGAMAN"
    sys = system.System()       
    mol = sys.add_macromolecule(sequence, "test_molecule")
    d = data.Dataset("Test", data.Conditions(), sequence=sequence)
    d.create_peptide("MEGA", start_residue=1)
    d.create_peptide("MEGAMA", start_residue=1)
    d.create_peptide("GAMAN", start_residue=3)
    d.create_peptide("AMAN", start_residue=4)

    for pep in d.get_peptides():
        pep.add_timepoints([10, 100, 1000])

    mol.get_state(0).add_dataset(d)
    return mol.get_state(0)
Example 29
def main(args):
    dataset = data.Dataset(args.data_dir)
    criterion = nn.CrossEntropyLoss()

    net = model.Net(data.VOCAB_SIZE)

    if os.path.isfile(args.model_filename):
        checkpoint = (torch.load(args.model_filename) if
                      (torch.cuda.is_available()) else torch.load(
                          args.model_filename, map_location=lambda storage, loc: storage))
        net.load_state_dict(checkpoint['state_dict'])

    if (torch.cuda.is_available()):
        net.cuda()

    print('\nRunning test')
    epoch_end = False
    total_loss = []

    test_step = 0
    num_correct = 0
    test_size = len(dataset.dataset['test'])
    while not epoch_end:
        test_step += 1
        minibatch, epoch_end = dataset.get_next_minibatch('test', 1)
        batch_tensor = Variable(minibatch[0])
        labels_tensor = Variable(minibatch[1])

        if (torch.cuda.is_available()):
            batch_tensor = batch_tensor.cuda()
            labels_tensor = labels_tensor.cuda()

        output = net.forward(batch_tensor)

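        # torch.max(..., 1)[1] takes the argmax along dim 1, converting the
        # (one-hot encoded) labels and the network outputs into class indices.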
        label_index = torch.max(labels_tensor, 1)[1]
        output_index = torch.max(output, 1)[1]
        num_correct += (label_index == output_index).sum().type(
            torch.LongTensor)

        loss = criterion(output, label_index)
        total_loss.append(loss.data)

        sys.stdout.write('Test step %i/%i\r' % (test_step, test_size))
        sys.stdout.flush()

    total_loss = float(sum(total_loss)[0]) / float(len(total_loss))
    print('\nTest loss: %f' % total_loss)
    print('\nAccuracy: %f%%' % ((float(num_correct) / float(test_size)) * 100))
Example 30
def load_dir(basedir, num_target, window_size):
    """Load dataset from every csv file in `basedir`

    Args:
        - `basedir`:        path to dataset directory
        - `num_target`:     number of class in the dataset
        - `window_size`:    sliding window size

    Returns:
        A `Dataset` containing `data` and `target`

    """
    filenames = get_filenames(basedir)
    features, target = data.get(filenames, num_target, window_size)

    return data.Dataset(features, target)