import glob
import logging
import os

# NOTE: DataFormat, DataLoader, dump_input_metadata and the variable lists
# (train_groups, train_vars, label_var, obs_vars, extra_label_vars) are
# module-level names defined elsewhere in the original file.


def load_data(args):
    train_val_filelist = glob.glob(args.data_train)
    n_train = int(args.train_val_split * len(train_val_filelist))
    wgtvar = args.weight_names
    if wgtvar == '':
        wgtvar = None
    d = DataFormat(train_groups, train_vars, label_var, wgtvar, obs_vars,
                   extra_label_vars=extra_label_vars,
                   filename=train_val_filelist[0])
    logging.info('Using the following variables:\n' +
                 '\n'.join([v_group + '\n\t' + str(train_vars[v_group])
                            for v_group in train_groups]))
    logging.info('Using weight\n' + str(wgtvar))
    orig_metadata = os.path.join(os.path.dirname(train_val_filelist[0]),
                                 'metadata.json')
    output_metadata = os.path.join(os.path.dirname(args.model_prefix),
                                   'preprocessing.json')
    if args.predict:
        test_filelist = glob.glob(args.data_test)
        test = DataLoader(test_filelist, d, batch_size=args.batch_size,
                          predict_mode=True, shuffle=False, args=args)
        return test
    else:
        train = DataLoader(train_val_filelist[:n_train], d,
                           batch_size=args.batch_size, args=args)
        val = DataLoader(train_val_filelist[n_train:], d,
                         batch_size=args.batch_size, args=args)
        # Write the preprocessing metadata once, with a batch dimension of 1.
        if not os.path.exists(output_metadata):
            train_shapes = {}
            for k, v in train.provide_data:
                train_shapes[k] = (1,) + v[1:]
            dump_input_metadata(orig_metadata, groups=train_groups,
                                shapes=train_shapes, var_names=train_vars,
                                output=output_metadata)
        return train, val
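
# Illustrative call (assumption, not from the original source): load_data
# consumes an argparse-style namespace. The attribute names below mirror the
# ones read inside load_data; the values and paths are placeholders.
from argparse import Namespace

_example_args = Namespace(
    data_train='train/*.h5',         # glob pattern of training files
    data_test='test/*.h5',           # glob pattern used when predict=True
    train_val_split=0.8,             # fraction of files kept for training
    weight_names='',                 # '' means "no event weighting"
    model_prefix='models/run1/net',  # preprocessing.json is written next to it
    batch_size=1024,
    predict=False,
)
# train_loader, val_loader = load_data(_example_args)  # needs the globals above
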
from os.path import join

from torch.utils.data import DataLoader  # assumption: PyTorch's DataLoader

# HMDB51Dataset and UCF101Dataset are project-local dataset classes.


def build_dataloader(datasets_dir, ds, split, subset, batch_size,
                     min_seq=16, max_seq=16, shuffle=True, num_workers=4,
                     verbose=False, print_dropped=False):
    """Builds a DataLoader for the requested video dataset."""
    if ds == 'hmdb51':
        Dataset = HMDB51Dataset
    elif ds == 'ucf101':
        Dataset = UCF101Dataset
    else:
        raise ValueError(f'invalid ds={ds}')
    ds_dir = join(datasets_dir, ds)
    if verbose:
        print(f'Building dataloader for {ds}')
    ds = Dataset(ds_dir, split=split, subset=subset, min_seq=min_seq,
                 max_seq=max_seq, verbose=verbose, print_dropped=print_dropped)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=shuffle,
                    num_workers=num_workers)
    return dl
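
# Usage sketch (assumptions: PyTorch import above; a 'datasets/hmdb51' tree
# laid out as HMDB51Dataset expects; split/subset values are placeholders).
if __name__ == '__main__':
    loader = build_dataloader('datasets', ds='hmdb51', split=1,
                              subset='train', batch_size=8, verbose=True)
    first = next(iter(loader))  # batch structure depends on Dataset.__getitem__
    print(type(first))
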
import os
from functools import reduce


def test(parser, vocab, num_buckets_test, test_batch_size, test_file,
         output_file, debug=False):
    data_loader = DataLoader(test_file, num_buckets_test, vocab)
    # idx_sequence maps batch order back to the original sentence order.
    record = data_loader.idx_sequence
    results = [None] * len(record)
    idx = 0
    for words, tags, arcs, rels in data_loader.get_batches(
            batch_size=test_batch_size, shuffle=False):
        outputs = parser.run(words, tags, is_train=False)
        for output in outputs:
            sent_idx = record[idx]
            results[sent_idx] = output
            idx += 1
    # Flatten per-sentence (arc, rel) predictions into token-level lists.
    arcs = reduce(lambda x, y: x + y, [list(result[0]) for result in results])
    rels = reduce(lambda x, y: x + y, [list(result[1]) for result in results])
    idx = 0
    with open(test_file) as f:
        if debug:
            f = f.readlines()[:1000]
        with open(output_file, 'w') as fo:
            for line in f:
                info = line.strip().split()
                if info:
                    assert len(info) == 10, 'Illegal line: %s' % line
                    info[6] = str(arcs[idx])           # predicted head
                    info[7] = vocab.id2rel(rels[idx])  # predicted relation
                    fo.write('\t'.join(info) + '\n')
                    idx += 1
                else:
                    fo.write('\n')
    # Score with the eval script, then parse LAS/UAS out of its last lines.
    os.system('perl run/eval.pl -q -b -g %s -s %s -o tmp' % (test_file, output_file))
    os.system('tail -n 3 tmp > score_tmp')
    with open('score_tmp') as f:
        LAS, UAS = [float(line.strip().split()[-2])
                    for line in f.readlines()[:2]]
    print('LAS %.2f, UAS %.2f' % (LAS, UAS))
    os.system('rm tmp score_tmp')
    return LAS, UAS
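
# Illustrative call (placeholders, not from the source: a trained
# BiaffineParser, a fitted Vocab, a 10-column CoNLL test file, and arbitrary
# bucket/batch sizes).
# LAS, UAS = test(parser, vocab, num_buckets_test=10, test_batch_size=5000,
#                 test_file='data/test.conllx', output_file='out/test.predict')
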
import os
import pickle
import time

import mxnet as mx
from mxnet import gluon

# Config, Vocab, BiaffineParser and DataLoader are project-local classes;
# args and extra_args come from the surrounding argument parsing.
config = Config(args.config_file, extra_args)
vocab = Vocab(config.train_file,
              None if config.debug else config.pretrained_embeddings_file,
              config.min_occur_count)
if not config.debug:
    with open(config.save_vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
# Crude GPU detection: assume CUDA is available iff 'cuda' appears in PATH.
with mx.Context(mx.gpu(0) if 'cuda' in os.environ['PATH'] else mx.cpu()):
    parser = BiaffineParser(vocab, config.word_dims, config.tag_dims,
                            config.dropout_emb, config.lstm_layers,
                            config.lstm_hiddens, config.dropout_lstm_input,
                            config.dropout_lstm_hidden, config.mlp_arc_size,
                            config.mlp_rel_size, config.dropout_mlp,
                            config.debug)
    parser.initialize(force_reinit=True)
    data_loader = DataLoader(config.train_file, config.num_buckets_train, vocab)
    # trainer = dy.AdamTrainer(pc, config.learning_rate, config.beta_1, config.beta_2, config.epsilon)
    trainer = gluon.Trainer(parser.collect_params(), 'adam',
                            {'learning_rate': config.learning_rate})
    global_step = 0
    epoch = 0
    best_UAS = 0.

    def history(x, y):
        # Append validation scores to a running history file.
        with open(os.path.join(config.save_dir, 'valid_history'), 'a') as f:
            f.write('%.2f %.2f\n' % (x, y))

    while global_step < config.train_iters:
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
              ' Start training epoch #%d' % (epoch,))
        epoch += 1
        for words, tags, arcs, rels in data_loader.get_batches(