Example #1
def load_data(args):

    train_val_filelist = glob.glob(args.data_train)
    n_train = int(args.train_val_split * len(train_val_filelist))

    # Event-weight branch name; an empty string means no per-event weighting.
    wgtvar = args.weight_names
    if wgtvar == '':
        wgtvar = None

    d = DataFormat(train_groups, train_vars, label_var, wgtvar, obs_vars,
                   extra_label_vars=extra_label_vars,
                   filename=train_val_filelist[0])

    logging.info('Using the following variables:\n' +
                 '\n'.join([v_group + '\n\t' + str(train_vars[v_group]) for v_group in train_groups ]))
    logging.info('Using weight\n' + str(wgtvar))

    orig_metadata = os.path.join(os.path.dirname(train_val_filelist[0]), 'metadata.json')
    output_metadata = os.path.join(os.path.dirname(args.model_prefix), 'preprocessing.json')

    if args.predict:
        test_filelist = glob.glob(args.data_test)
        test = DataLoader(test_filelist, d, batch_size=args.batch_size, predict_mode=True, shuffle=False, args=args)
        return test
    else:
        train = DataLoader(train_val_filelist[:n_train], d, batch_size=args.batch_size, args=args)
        val = DataLoader(train_val_filelist[n_train:], d, batch_size=args.batch_size, args=args)
        if not os.path.exists(output_metadata):
            # Record each input group's shape with the batch dimension set to 1,
            # then write the preprocessing metadata next to the model prefix.
            train_shapes = {}
            for k, v in train.provide_data:
                train_shapes[k] = (1,) + v[1:]
            dump_input_metadata(orig_metadata, groups=train_groups, shapes=train_shapes,
                                var_names=train_vars, output=output_metadata)
        return (train, val)
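
A minimal usage sketch for the loader above, assuming the surrounding module defines DataFormat, DataLoader, and the variable-group globals it references; the attribute names mirror the fields load_data reads, and the file patterns are placeholders:

import argparse

# Hypothetical arguments; every field corresponds to an attribute accessed
# inside load_data.
args = argparse.Namespace(
    data_train='path/to/train/*.h5',   # glob pattern for training/validation files
    data_test='path/to/test/*.h5',     # glob pattern for test files (predict mode)
    train_val_split=0.8,               # fraction of files assigned to training
    weight_names='',                   # empty string -> no per-event weights
    model_prefix='output/model',       # preprocessing.json is written next to this
    batch_size=1024,
    predict=False,                     # False -> return (train, val) loaders
)

train_loader, val_loader = load_data(args)
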
Example #2
def build_dataloader(datasets_dir,
                     ds,
                     split,
                     subset,
                     batch_size,
                     min_seq=16,
                     max_seq=16,
                     shuffle=True,
                     num_workers=4,
                     verbose=False,
                     print_dropped=False):
    """Builds a Dataloader."""
    if ds == 'hmdb51':
        Dataset = HMDB51Dataset
    elif ds == 'ucf101':
        Dataset = UCF101Dataset
    else:
        raise ValueError(f'invalid ds={ds}')
    ds_dir = join(datasets_dir, ds)
    if verbose:
        print(f'Building dataloader for {ds}')
    dataset = Dataset(ds_dir,
                      split=split,
                      subset=subset,
                      min_seq=min_seq,
                      max_seq=max_seq,
                      verbose=verbose,
                      print_dropped=print_dropped)
    dl = DataLoader(dataset,
                    batch_size=batch_size,
                    shuffle=shuffle,
                    num_workers=num_workers)
    return dl
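
A hedged usage sketch; the split/subset values and the (clips, labels) unpacking are assumptions about how HMDB51Dataset yields its items:

# Hypothetical call: 'datasets/' is expected to contain an 'hmdb51' subfolder
# laid out the way HMDB51Dataset expects.
train_dl = build_dataloader('datasets', 'hmdb51', split=1, subset='train',
                            batch_size=8, min_seq=16, max_seq=16,
                            shuffle=True, num_workers=4, verbose=True)

for clips, labels in train_dl:    # unpacking assumes (frames, label) samples
    pass                          # training or evaluation step goes here
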
Example #3
def test(parser,
         vocab,
         num_buckets_test,
         test_batch_size,
         test_file,
         output_file,
         debug=False):
    data_loader = DataLoader(test_file, num_buckets_test, vocab)
    record = data_loader.idx_sequence
    results = [None] * len(record)
    idx = 0
    for words, tags, arcs, rels in data_loader.get_batches(
            batch_size=test_batch_size, shuffle=False):
        outputs = parser.run(words, tags, is_train=False)
        for output in outputs:
            sent_idx = record[idx]
            results[sent_idx] = output
            idx += 1

    # Concatenate per-sentence predictions back into flat lists of heads and relations.
    arcs = reduce(lambda x, y: x + y, [list(result[0]) for result in results])
    rels = reduce(lambda x, y: x + y, [list(result[1]) for result in results])
    idx = 0
    with open(test_file) as f:
        if debug:
            f = f.readlines()[:1000]
        with open(output_file, 'w') as fo:
            for line in f:
                info = line.strip().split()
                if info:
                    assert len(info) == 10, 'Illegal line: %s' % line
                    info[6] = str(arcs[idx])
                    info[7] = vocab.id2rel(rels[idx])
                    fo.write('\t'.join(info) + '\n')
                    idx += 1
                else:
                    fo.write('\n')

    # Score the predictions with the official CoNLL eval script and keep only
    # the summary lines containing LAS and UAS.
    os.system('perl run/eval.pl -q -b -g %s -s %s -o tmp' %
              (test_file, output_file))
    os.system('tail -n 3 tmp > score_tmp')
    LAS, UAS = [
        float(line.strip().split()[-2])
        for line in open('score_tmp').readlines()[:2]
    ]
    print('LAS %.2f, UAS %.2f' % (LAS, UAS))
    os.system('rm tmp score_tmp')
    return LAS, UAS
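
A short usage sketch, assuming a trained parser and vocab (e.g. from the training fragment in Example #4) and a CoNLL-format test file; run/eval.pl must exist because test() shells out to it, and the paths and sizes below are placeholders:

# Hypothetical invocation after training.
LAS, UAS = test(parser, vocab,
                num_buckets_test=10,
                test_batch_size=100,
                test_file='data/test.conllx',
                output_file='results/test.predicted.conllx')
print('Final score: LAS %.2f UAS %.2f' % (LAS, UAS))
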
Example #4
    config = Config(args.config_file, extra_args)

    vocab = Vocab(config.train_file,
                  None if config.debug else config.pretrained_embeddings_file,
                  config.min_occur_count)
    if not config.debug:
        pickle.dump(vocab, open(config.save_vocab_path, 'wb'))
    # Run on the GPU when 'cuda' shows up on PATH, otherwise fall back to the CPU.
    with mx.Context(mx.gpu(0) if 'cuda' in os.environ['PATH'] else mx.cpu()):
        parser = BiaffineParser(vocab, config.word_dims, config.tag_dims,
                                config.dropout_emb, config.lstm_layers,
                                config.lstm_hiddens, config.dropout_lstm_input,
                                config.dropout_lstm_hidden,
                                config.mlp_arc_size, config.mlp_rel_size,
                                config.dropout_mlp, config.debug)
        parser.initialize(force_reinit=True)
        data_loader = DataLoader(config.train_file, config.num_buckets_train,
                                 vocab)
        # trainer = dy.AdamTrainer(pc, config.learning_rate, config.beta_1, config.beta_2, config.epsilon)
        trainer = gluon.Trainer(parser.collect_params(), 'adam',
                                {'learning_rate': config.learning_rate})

        global_step = 0
        epoch = 0
        best_UAS = 0.
        history = lambda x, y: open(
            os.path.join(config.save_dir, 'valid_history'), 'a').write(
                '%.2f %.2f\n' % (x, y))
        while global_step < config.train_iters:
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                  ' Start training epoch #%d' % (epoch, ))
            epoch += 1
            for words, tags, arcs, rels in data_loader.get_batches(