Code Example #1
import argparse
import logging
import os

from torch import cuda

import data     # project-local dataset module
import options  # project-local fairseq-style option helpers

os.environ["CUDA_VISIBLE_DEVICES"] = "5"

# python3 generate.py --data data-bin/iwslt14.tokenized.de-en/WMT/preprocessed/  --src_lang en --trg_lang de --batch-size 64 --gpuid 0

#python3 generate.py --data data-bin/UN/million6way/en-fr/bpe/preprocessed/  --src_lang en --trg_lang fr  --batch-size 80 --gpuid 0 --model-dir data-bin/UN/million6way/en-fr/checkpoints/lr34clip1/best_gmodel.pt

logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.DEBUG)

parser = argparse.ArgumentParser(
    description="Driver program for JHU Adversarial-NMT.")

# Load args
options.add_general_args(parser)
options.add_dataset_args(parser)
options.add_checkpoint_args(parser)
options.add_distributed_training_args(parser)
options.add_generation_args(parser)
options.add_generator_model_args(parser)


def main(args):

    use_cuda = (len(args.gpuid) >= 1)
    if args.gpuid:
        cuda.set_device(args.gpuid[0])
        print(args.replace_unk)  #None
        # Load dataset
        if args.replace_unk is None:
            dataset = data.load_dataset(
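
The driver above composes its command-line interface from fairseq-style add_*_args helpers. For reference, here is a minimal, self-contained sketch of that composition pattern; the helpers and flags below are illustrative stand-ins, not the project's real options module.

import argparse


def get_parser(description):
    # stand-in for options.get_parser
    return argparse.ArgumentParser(description=description)


def add_dataset_args(parser):
    # stand-in for options.add_dataset_args
    group = parser.add_argument_group('Dataset')
    group.add_argument('--data', required=True, help='path to preprocessed data')
    group.add_argument('--src_lang', default='en')
    group.add_argument('--trg_lang', default='de')


def add_generation_args(parser):
    # stand-in for options.add_generation_args
    group = parser.add_argument_group('Generation')
    group.add_argument('--batch-size', type=int, default=64)
    group.add_argument('--gpuid', type=int, nargs='+', default=[0])


if __name__ == '__main__':
    parser = get_parser('Driver program for JHU Adversarial-NMT.')
    add_dataset_args(parser)
    add_generation_args(parser)
    args = parser.parse_args()
    print(args)
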
Code Example #2
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    
    args = parser.parse_args()
    print(args)
    
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    
    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    
    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]
    test_corpus = [(line.sent, line.type, line.p1, line.p2) for line in test_raw_corpus]
    
    
    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch
    
    # preprocessing
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    # label-to-index map defined by the ddi2013 module
    target_map = ddi2013.target_map
    train_features, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    val_features, val_targets = utils.build_corpus(val_corpus, feature_map, target_map, caseless)
    test_features, test_targets = utils.build_corpus(test_corpus, feature_map, target_map, caseless)
    
    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None
    train_loader = utils.construct_bucket_dataloader(train_features, train_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=True)
    val_loader = utils.construct_bucket_dataloader(val_features, val_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=False)
    test_loader = utils.construct_bucket_dataloader(test_features, test_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=False)
    print('Preprocessing done! Vocab size: {}'.format(len(feature_map)))
    
    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = utils.build_model(args, vocab_size, tagset_size)
    
    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)
    
    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
    #    optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('no checkpoint file found: {}, train from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices) # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)
    
    # trainer
    trainer = SeqTrainer(args, model, criterion)
    
    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
            dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))
    
    track_list = []
    best_f1 = float('-inf')
    patience_count = 0
    start_time = time.time()
    
    
    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)
    
        # update lr
        trainer.lr_step()
           
        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1
    
            test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
    
            track_list.append({'epoch': epoch, 'loss': epoch_loss, 
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss, 
                'test_prec': test_prec, 'test_rec': test_rec, 'test_f1': test_f1})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))
    
            try:
                utils.save_checkpoint({
                            'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'optimizer': trainer.optimizer.state_dict(),
                            'f_map': feature_map,
                            't_map': target_map,
                        }, {'track_list': track_list,
                            'args': vars(args)
                            }, args.checkpoint + '_lstm')
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({'epoch': epoch,'loss': epoch_loss, 'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}'.format(epoch, epoch_loss, dev_f1, dev_loss))
    
        print('epoch {} of {}: {:.2f} s elapsed since start'.format(epoch, args.num_epoch, time.time() - start_time))
        if patience_count >= args.patience:
            break
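
utils.get_class_weights and utils.build_loss are project helpers that are not shown in this listing. Below is a minimal sketch of the inverse-frequency weighting they appear to implement, in plain PyTorch; the exact formula used by the project is an assumption.

from collections import Counter

import torch
import torch.nn as nn


def inverse_frequency_weights(targets):
    # weight class c by total / (num_classes * count_c): rare classes get larger weights
    counts = Counter(targets)
    total = sum(counts.values())
    num_classes = len(counts)
    weights = torch.zeros(num_classes)
    for cls, count in counts.items():
        weights[cls] = total / (num_classes * count)
    return weights


# hypothetical integer-encoded relation labels (0 = 'null')
train_targets = [0, 0, 0, 0, 1, 2, 0, 1]
class_weights = inverse_frequency_weights(train_targets)
criterion = nn.CrossEntropyLoss(weight=class_weights)
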
Code Example #3
def main():
    parser = options.get_parser('Generator')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_generation_args(parser)
    
    args = parser.parse_args()
    print(args)
    
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    
    
    caseless = args.caseless
    batch_size = args.batch_size
    
    
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
    else:
        print('No checkpoint file found: {}'.format(args.load_checkpoint))
        raise FileNotFoundError(args.load_checkpoint)
        
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir, ddi=True)
    test_corpus = [(line.sent, line.type, line.p1, line.p2) for line in test_raw_corpus]
    
    # preprocessing
    feature_map = checkpoint_file['f_map']
    target_map = checkpoint_file['t_map']
    test_features, test_targets = utils.build_corpus(test_corpus, feature_map, target_map, caseless)
    
    # test set dataloader
    test_loader = utils.construct_bucket_dataloader(test_features, test_targets, feature_map['PAD'], batch_size, args.position_bound, is_train=False)
    
    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = utils.build_model(args, vocab_size, tagset_size)
    # loss
    criterion = utils.build_loss(args)
    
    # load states
    model.load_state_dict(checkpoint_file['state_dict'])
    
    # trainer
    trainer = SeqTrainer(args, model, criterion)
    
    if args.cuda:
        model.cuda()
    
    y_true, y_pred, att_weights = predict(trainer, test_loader, target_map, cuda=args.cuda)
    assert len(y_pred) == len(test_corpus), 'length of prediction is inconsistent with that of data set'
    # prediction
    print('Predicting...')
    # write result: sent_id|e1|e2|ddi|type
    with open(args.predict_file, 'w') as f:
        for tup, pred in zip(test_raw_corpus, y_pred):
            ddi = 0 if pred == 'null' else 1
            f.write('|'.join([tup.sent_id, tup.e1, tup.e2, str(ddi), pred]))
            f.write('\n')

    # error analysis
    print('Analyzing...')
    with open(args.error_file, 'w') as f:
        f.write(' | '.join(['sent_id', 'e1', 'e2', 'target', 'pred']))
        f.write('\n')
        for tup, target, pred, att_weight in zip(test_raw_corpus, y_true, y_pred, att_weights):
            if target != pred:
                size = len(tup.sent)
                f.write('{}\n'.format(' '.join(tup.sent)))
                if args.model != 'InterAttentionLSTM':
                    att_weight = [att_weight]
                for i in range(len(att_weight)):
                    f.write('{}\n'.format(' '.join(map(lambda x: str(round(x, 4)), att_weight[i][:size]))))
                f.write('{}\n\n'.format(' | '.join([tup.sent_id, tup.e1, tup.e2, target, pred])))
            
    # attention
    print('Writing attention scores...')
    with open(args.att_file, 'w') as f:
        f.write(' | '.join(['target', 'sent', 'att_weight']))
        f.write('\n')
        for tup, target, pred, att_weight in zip(test_raw_corpus, y_true, y_pred, att_weights):
            if target == pred and target != 'null':
                size = len(tup.sent)
                f.write('{}\n'.format(target))
                f.write('{}\n'.format(' '.join(tup.sent)))
                if args.model != 'InterAttentionLSTM':
                    att_weight = [att_weight]
                for i in range(len(att_weight)):
                    f.write('{}\n'.format(' '.join(map(lambda x: str(round(x, 4)), att_weight[i][:size]))))
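
Both generator examples write one pipe-delimited record per candidate pair (sent_id|e1|e2|ddi|type). Here is a short sketch of reading such a file back for downstream scoring; the file name and the tallying step are illustrative, not part of the project.

import csv
from collections import Counter


def read_predictions(path):
    # each row: sent_id | e1 | e2 | ddi flag (0/1) | predicted relation type
    with open(path) as f:
        reader = csv.reader(f, delimiter='|')
        return [{'sent_id': r[0], 'e1': r[1], 'e2': r[2],
                 'ddi': int(r[3]), 'type': r[4]} for r in reader]


if __name__ == '__main__':
    preds = read_predictions('predict.txt')  # hypothetical --predict_file value
    print(Counter(p['type'] for p in preds))
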
Code Example #4
def main():
    parser = options.get_parser('Generator')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    options.add_generation_args(parser)

    args = parser.parse_args()

    model_path = args.load_checkpoint + '.model'
    args_path = args.load_checkpoint + '.json'
    with open(args_path, 'r') as f:
        _args = json.load(f)['args']
    for k, v in _args.items():
        setattr(args, k, v)

    args.cuda = not args.disable_cuda and torch.cuda.is_available()

    print(args)

    if args.cuda:
        torch.backends.cudnn.benchmark = True

    # increase recursion depth
    sys.setrecursionlimit(10000)

    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(
        args.processed_dir, ddi=False)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2)
                    for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2)
                  for line in val_raw_corpus]

    caseless = args.caseless
    batch_size = args.batch_size

    # build vocab
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents,
                                    min_count=args.min_count,
                                    caseless=caseless)
    target_map = ddi2013.target_map

    # get class weights
    _, train_targets = utils.build_corpus(train_corpus, feature_map,
                                          target_map, caseless)
    class_weights = torch.Tensor(
        utils.get_class_weights(train_targets)) if args.class_weight else None

    # load datasets
    _, _, test_loader = utils.load_datasets(args.processed_dir,
                                            args.train_size,
                                            args,
                                            feature_map,
                                            dataloader=True)

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = RelationTreeModel(vocab_size, tagset_size, args)

    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)

    # load states
    assert os.path.isfile(model_path), "Checkpoint not found!"
    print('Loading checkpoint file from {}...'.format(model_path))
    checkpoint_file = torch.load(model_path)
    model.load_state_dict(checkpoint_file['state_dict'])

    # trainer
    trainer = TreeTrainer(args, model, criterion)

    # predict
    y_true, y_pred, treelists, f1_by_len = predict(trainer,
                                                   test_loader,
                                                   target_map,
                                                   cuda=args.cuda)

    # assign words to roots
    for tup, treelist in zip(test_raw_corpus, treelists):
        for t in treelist:
            t.idx = tup.sent[t.idx] if t.idx < len(tup.sent) else None

    # prediction
    print('Predicting...')
    # write result: sent_id|e1|e2|ddi|type
    with open(args.predict_file, 'w') as f:
        for tup, pred in zip(test_raw_corpus, y_pred):
            ddi = 0 if pred == 'null' else 1
            f.write('|'.join([tup.sent_id, tup.e1, tup.e2, str(ddi), pred]))
            f.write('\n')

    def print_info(f, tup, target, pred, root):
        f.write('{}\n'.format(' '.join(tup.sent)))
        f.write('{}\n'.format(' | '.join(
            [tup.sent_id, tup.e1, tup.e2, target, pred])))
        f.write('{}\n\n'.format(root))

    # error analysis
    print('Analyzing...')
    with open(args.error_file, 'w') as f:
        f.write(' | '.join(['sent_id', 'e1', 'e2', 'target', 'pred']))
        f.write('\n')
        for tup, target, pred, treelist in zip(test_raw_corpus, y_true, y_pred,
                                               treelists):
            if target != pred:
                print_info(f, tup, target, pred, treelist[-1])

    # correctly predicted examples
    print('Writing correctly predicted examples...')
    with open(args.correct_file, 'w') as f:
        f.write(' | '.join(['target', 'sent', 'att_weight']))
        f.write('\n')
        for tup, target, pred, treelist in zip(test_raw_corpus, y_true, y_pred,
                                               treelists):
            if target == pred and target != 'null':
                print_info(f, tup, target, pred, treelist[-1])
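
This example restores its hyper-parameters from a <checkpoint>.json file and copies them onto the freshly parsed args. Below is a minimal sketch of that round trip with a plain argparse.Namespace; the {'args': {...}} layout matches what the example reads, and the saving side (presumably handled by the project's utils.save_checkpoint) plus all concrete values are illustrative.

import argparse
import json

# saving side: persist the training arguments next to the model weights
args = argparse.Namespace(batch_size=16, caseless=True, model='TreeLSTM')
with open('model.json', 'w') as f:
    json.dump({'args': vars(args)}, f)

# loading side, mirroring the generator above
new_args = argparse.Namespace(disable_cuda=False)
with open('model.json') as f:
    saved = json.load(f)['args']
for k, v in saved.items():
    setattr(new_args, k, v)  # a plain loop reads better than a side-effect list comprehension
print(new_args)
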
Code Example #5
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)
    
    args = parser.parse_args()
    print(args)
    
    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    torch.manual_seed(5)
    
    if args.cuda:
        torch.backends.cudnn.benchmark = True
    
    # increase recursion depth
    sys.setrecursionlimit(10000)
    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    
    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir, ddi=False)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]    
    
    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch
    
    # build vocab
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    target_map = ddi2013.target_map
    
    # get class weights
    _, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None
        
    train_loader, val_loader, test_loader = utils.load_datasets(args.processed_dir, args.train_size, args, feature_map, dataloader=True)            
    
    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = RelationTreeModel(vocab_size, tagset_size, args)
    
    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)
    
    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
    #    optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('no checkpoint file found: {}, train from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices) # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)
    
    # trainer
    trainer = TreeTrainer(args, model, criterion)
    
    best_f1 = float('-inf')
    
    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        best_f1 = dev_f1
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
            dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))
        
    track_list = []
    
    patience_count = 0
    start_time = time.time()
    q = mp.Queue()  # only used by the multiprocessing variant commented out below
    
    # set start methods
    try:
        mp.set_start_method('spawn')
    except RuntimeError:
        pass

    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)
#        processes = []
#        for rank in range(args.num_processes):
#            p = mp.Process(target=train, args=(train_loader, trainer, epoch, q))
#            p.start()
#            processes.append(p)
#        for p in processes:
#            p.join()
#        
#        epoch_loss = q.get()

                
        # update lr
        trainer.lr_step(epoch_loss)
        
        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1
    
            track_list.append({'epoch': epoch, 'loss': epoch_loss, 
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss, 
                'test_prec': test_prec, 'test_rec': test_rec, 'test_f1': test_f1})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))
    
            try:
                utils.save_checkpoint({
                            'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'optimizer': trainer.optimizer.state_dict(),
                            'f_map': feature_map,
                            't_map': target_map,
                        }, {'track_list': track_list,
                            'args': vars(args)
                            }, args.checkpoint)
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({'epoch': epoch,'loss': epoch_loss, 'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}'.format(epoch, epoch_loss, dev_f1, dev_loss, test_f1))
    
        print('epoch {} of {}: {:.2f} s elapsed since start'.format(epoch, args.num_epoch, time.time() - start_time))
        if patience_count >= args.patience:
            break
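
Both trainer examples wrap their epoch loop in the same patience-based early stopping on dev F1. Stripped of the project-specific helpers, the control flow reduces to the sketch below; train_one_epoch and evaluate_dev are hypothetical stand-ins.

import random


def train_one_epoch(epoch):
    # hypothetical stand-in for train(train_loader, trainer, epoch)
    return random.random()


def evaluate_dev():
    # hypothetical stand-in for evaluate(trainer, val_loader, target_map, ...)
    return random.random()


num_epoch, patience = 30, 5
best_f1, patience_count = float('-inf'), 0

for epoch in range(num_epoch):
    epoch_loss = train_one_epoch(epoch)
    dev_f1 = evaluate_dev()
    if dev_f1 >= best_f1:
        # new best model: reset patience (the real scripts also save a checkpoint here)
        best_f1, patience_count = dev_f1, 0
    else:
        patience_count += 1
    if patience_count >= patience:
        # stop once dev F1 has not improved for `patience` consecutive epochs
        break
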
Code Example #6
def main():
    # build parser
    parser = options.get_parser('Preprocessor')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    args = parser.parse_args()
    print(args)

    # make dirs
    base_dir = os.path.dirname(os.path.realpath(__file__))
    lib_dir = os.path.join(base_dir, 'lib')
    processed_dir = args.processed_dir
    train_dir = os.path.join(processed_dir, 'train')
    val_dir = os.path.join(processed_dir, 'val')
    test_dir = os.path.join(processed_dir, 'test')
    utils.make_dirs(
        [args.processed_dir, lib_dir, train_dir, val_dir, test_dir])

    # preprocess
    train_corpus = ddi2013.preprocess_ddi(os.path.join(args.raw_dir, 'train'),
                                          position=True)
    test_corpus = ddi2013.preprocess_ddi(os.path.join(args.raw_dir, 'test'),
                                         position=True)

    # get train targets
    input_targets = utils.map_iterable([item.type for item in train_corpus],
                                       ddi2013.target_map)

    # train/val split
    train_corpus, _, val_corpus, _ = utils.stratified_shuffle_split(
        train_corpus, input_targets, train_size=args.train_size)

    # write to files
    if not os.path.isdir(args.processed_dir):
        os.mkdir(args.processed_dir)
    ddi2013.write_to_file(train_corpus,
                          os.path.join(args.processed_dir, 'train.ddi'))
    ddi2013.write_to_file(val_corpus,
                          os.path.join(args.processed_dir, 'val.ddi'))
    ddi2013.write_to_file(test_corpus,
                          os.path.join(args.processed_dir, 'test.ddi'))

    # download necessary tools
    download_tagger(lib_dir)
    download_parser(lib_dir)

    # parse
    # TODO: sometimes compile failed
    # Setting CLASSPATH in its own os.system() call does not carry over to later
    # calls, so pass the classpath directly to javac instead.
    os.system(
        'javac -cp "lib:lib/stanford-parser/stanford-parser.jar:'
        'lib/stanford-parser/stanford-parser-3.5.1-models.jar" lib/*.java')

    print('=' * 80)
    print('Preprocessing dataset')
    print('=' * 80)

    # java classpath for calling Stanford parser
    classpath = ':'.join([
        lib_dir,
        os.path.join(lib_dir, 'stanford-parser/stanford-parser.jar'),
        os.path.join(lib_dir,
                     'stanford-parser/stanford-parser-3.5.1-models.jar')
    ])

    # split into separate files
    split(os.path.join(processed_dir, 'train.ddi'), train_dir)
    split(os.path.join(processed_dir, 'val.ddi'), val_dir)
    split(os.path.join(processed_dir, 'test.ddi'), test_dir)

    # parse sentences
    for d in [train_dir, val_dir, test_dir]:
        parse(d, cp=classpath, dep=args.dep, const=args.const)

    # get vocabulary
    build_vocab(glob.glob(os.path.join(processed_dir, '*/*.toks')),
                os.path.join(processed_dir, 'vocab.txt'))
    build_vocab(glob.glob(os.path.join(processed_dir, '*/*.toks')),
                os.path.join(processed_dir, 'vocab-cased.txt'),
                lowercase=False)
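
The TODO in the parse step above notes that the javac compile sometimes fails. Below is a sketch of the same compile step using subprocess.run with check=True, so a failed compile raises instead of passing unnoticed; the paths mirror the ones built in this example, but the snippet is only illustrative.

import glob
import os
import subprocess

lib_dir = 'lib'
classpath = ':'.join([
    lib_dir,
    os.path.join(lib_dir, 'stanford-parser/stanford-parser.jar'),
    os.path.join(lib_dir, 'stanford-parser/stanford-parser-3.5.1-models.jar'),
])
java_sources = glob.glob(os.path.join(lib_dir, '*.java'))
# check=True raises CalledProcessError if javac exits with a non-zero status
subprocess.run(['javac', '-cp', classpath] + java_sources, check=True)
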