def main(opts):
    # set a seed for reproducible network
    random.seed(42)

    labels_map = opts.labels_map
    if not labels_map:
        # get a copy of the labels if not provided
        labels_map = dict(LABELS_MAP)

    if not opts.include_negative_samples:
        # pop out the "NONE" label if no negative
        # samples are provided
        labels_map.pop('NONE')
        labels_map = {k: v - 1 for k, v in labels_map.items()}

    # load the dataset
    dataset = pipeline.load_abstracts_relations(opts.subtask)

    # get list of all dependency tags used in the dataset
    dependencies_map = pipeline.get_dependencies_map(dataset)

    # get list of all pos tags used in the dataset
    pos_map = pipeline.get_part_of_speech_map(dataset)

    # split it by sentence, potentially include negative samples
    sentences_dataset = pipeline.split_dataset_into_sentences(
        *dataset, include_negative_samples=opts.include_negative_samples)

    # split sentences between train and test according to the
    # official dataset split
    train_sentences, validation_sentences = \
        pipeline.split_train_test_sentences(opts.subtask, sentences_dataset)

    test_dataset = pipeline.load_abstracts_relations(
        opts.subtask, load_test=True)
    test_sentences = pipeline.split_dataset_into_sentences(
        *test_dataset, include_negative_samples=opts.include_negative_samples)

    if opts.evaluate_output:
        evaluate_dataset = pipeline.load_abstracts_relations(
            opts.subtask, load_test=True)
        evaluate_sentences_dataset = pipeline.split_dataset_into_sentences(
            *evaluate_dataset,
            include_negative_samples=opts.include_negative_samples)
    else:
        # so that static code analyzers don't freak out!
        evaluate_sentences_dataset = None

    # get distribution info for entities in training set
    ent_distr = pipeline.get_distribution_ent_length(train_sentences)

    # get the mxnet context (aka cpu or gpu) as provided
    # by the user; if none is provided, use cpu0
    context = mxnet_utils.get_context_from_string(opts.mxnet_context)

    # path to embeddings file in word2vec text format
    # as specified by the user
    embeddings_path = os.path.expanduser(
        EMBEDDINGS_PATHS[opts.embeddings_type])

    # download embeddings from google drive
    embeddings_path = download_embeddings(
        embeddings_path, opts.embeddings_type)

    # execute mxnet operations in the specified context
    with context:
        # load embeddings and vocabulary
        vocabulary, embeddings = \
            mxnet_utils.word2vec_mxnet_embedding_initializer(
                embeddings_path, max_embeddings=opts.max_embeddings)
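
        # `vocabulary` presumably maps each token to a row of the
        # `embeddings` matrix returned above; since both are created inside
        # `with context:`, mxnet allocates them on the device the user
        # selected (cpu or gpu)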

        # get training data; has to be executed after vocabulary and
        # embeddings (which need to be placed on the GPU if specified,
        # hence the context) are loaded
        train_data = prepare_data_for_net(
            vocabulary, train_sentences, labels_map,
            dependencies_map=dependencies_map, pos_map=pos_map,
            include_entities_nodes=opts.include_entities_nodes,
            include_entities_children=opts.include_entities_children,
            entity_length_distribution=ent_distr,
            case_sensitive=opts.case_sensitive)

        # doing the same thing, but with test data
        test_data = prepare_data_for_net(
            vocabulary, test_sentences, labels_map,
            dependencies_map=dependencies_map, pos_map=pos_map,
            include_entities_children=opts.include_entities_children,
            include_entities_nodes=opts.include_entities_nodes,
            entity_length_distribution=ent_distr,
            case_sensitive=opts.case_sensitive)

        # doing the same thing, but with validation data
        validation_data = prepare_data_for_net(
            vocabulary, validation_sentences, labels_map,
            dependencies_map=dependencies_map, pos_map=pos_map,
            include_entities_children=opts.include_entities_children,
            include_entities_nodes=opts.include_entities_nodes,
            entity_length_distribution=ent_distr,
            case_sensitive=opts.case_sensitive)

        # get stats about average size of parse trees
        parse_tree_lengths = [
            len(t) for _, _, t, *_ in itertools.chain(train_data, test_data)
        ]
        print('[info] parse tree length: {:.2f} +/- {:.2f}'.format(
            np.mean(parse_tree_lengths), np.std(parse_tree_lengths)))

        max_tree_height = max(max(t[4]) for t in train_data) + 1
        max_tree_height = (
            max_tree_height if 'height' in opts.extra_features else 0)
        dependencies_num = (
            len(dependencies_map) if 'dep' in opts.extra_features else 0)
        pos_num = len(pos_map) if 'pos' in opts.extra_features else 0
        include_ent_len = 'ent-len' in opts.extra_features

        net = Net(embeddings, len(labels_map),
                  dropout=opts.dropout,
                  trainable_embeddings=opts.trainable_embeddings,
                  dependencies_num=dependencies_num,
                  part_of_speech_num=pos_num,
                  include_ent_len=include_ent_len,
                  max_tree_height=max_tree_height)
        net.initialize()

        # loss and trainer initialized here
        softmax_cross_entropy_labels = mx.gluon.loss.SoftmaxCrossEntropyLoss()
        trainer = mx.gluon.Trainer(net.collect_params(), 'adam',
                                   {'learning_rate': opts.learning_rate})

        # object to calculate F1 metric for the dataset
        f1_score_class = mxnet_utils.F1Score(num_classes=len(labels_map))
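
        # note: each sample carries its own parse tree (and hence its own
        # adjacency matrix), so the loop below batches manually: it records
        # one forward pass per sample and, every `opts.batch_size` samples,
        # concatenates the outputs into a single batch for one
        # backward/step update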
        for epoch in range(1, opts.epochs + 1):
            # random.shuffle(train_data)
            cumulative_loss = total_steps = 0
            probs, labels = [], []

            for sample in tqdm.tqdm(train_data,
                                    desc='Epoch {}'.format(epoch)):
                with mx.autograd.record():
                    (
                        tokens,          # the tokens in sentence
                        deps,            # dependency tags
                        pos,             # part of speech tags
                        ent_lens,        # length of the input entities
                        dist_from_tree,  # distance from root of subtree
                        tree,            # the subtree
                        entities,        # indication for entity location
                        label,           # the label for this sample
                    ) = sample

                    tokens = mx.nd.array(tokens)
                    entities = mx.nd.array(entities)
                    idx = mx.nd.array([tree.idx])
                    adj = mx.nd.array(tree.to_array())
                    deps = mx.nd.array(deps)
                    pos = mx.nd.array(pos)
                    ent_lens = mx.nd.array(ent_lens)
                    dist_from_tree = mx.nd.array(dist_from_tree)

                    out = net(tokens, deps, pos, ent_lens, dist_from_tree,
                              adj, entities, idx, True)

                probs.append(out)
                labels.append([label])

                if len(probs) == opts.batch_size:
                    total_steps += opts.batch_size

                    with mx.autograd.record():
                        probs = mx.nd.concat(*probs, dim=0)
                        labels = mx.nd.array(labels)
                        loss = softmax_cross_entropy_labels(probs, labels)

                        if opts.include_negative_samples:
                            # scale the loss up 10x for samples the net
                            # predicts as "NONE" (class 0)
                            factor = \
                                (mx.nd.argmax(probs, axis=1) == 0) * 9 + 1
                            loss = mx.nd.multiply(loss, factor)

                    loss.backward()
                    trainer.step(opts.batch_size)

                    cumulative_loss += mx.nd.sum(loss).asscalar()
                    pred_labels = mx.nd.argmax(probs, axis=1)
                    f1_score_class.update(preds=pred_labels, labels=labels)

                    probs, labels = [], []

            # get precision, recall, and F1 score for the two
            # subtasks on the training set for this epoch
            prec, recall, f1 = map(
                lambda arr: mx.nd.mean(arr).asscalar() * 100,
                f1_score_class.get())

            # also calculate average loss
            avg_loss = cumulative_loss / total_steps

            # print everything
            msg = ('Epoch {e} // training data // avg_loss={l:.4f}\n'
                   'Classification: P={p:.2f} R={r:.2f} F1={f:.2f}').format(
                       e=epoch, l=avg_loss, p=prec, r=recall, f=f1)
            print(msg)

            if opts.validate_every > 0 and epoch % opts.validate_every == 0:
                if opts.error_analysis_path:
                    p = '{}{}.{}.txt'.format(
                        os.path.splitext(opts.error_analysis_path)[0],
                        'val', epoch)
                else:
                    p = None

                evaluate_on_test_data(
                    net, validation_sentences, validation_data, labels_map,
                    output_for_error_analysis=p)

            if opts.test_every > 0 and epoch % opts.test_every == 0:
                if opts.error_analysis_path:
                    p = '{}{}.{}.txt'.format(
                        os.path.splitext(opts.error_analysis_path)[0],
                        'test', epoch)
                else:
                    p = None

                evaluate_on_test_data(
                    net, test_sentences, test_data, labels_map,
                    output_for_error_analysis=p)

        if opts.evaluate_output:
            evaluate_data = prepare_data_for_net(
                vocabulary, evaluate_sentences_dataset, labels_map,
                dependencies_map=dependencies_map, pos_map=pos_map,
                include_entities_children=opts.include_entities_children,
                include_entities_nodes=opts.include_entities_nodes,
                entity_length_distribution=ent_distr,
                case_sensitive=opts.case_sensitive)

            evaluate_on_test_data(
                net, evaluate_sentences_dataset, evaluate_data, labels_map,
                evaluate_output=opts.evaluate_output)
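

# Illustrative sketch, not called anywhere in this script: it reproduces the
# prediction-dependent loss weighting from the batch update in main() on
# made-up numbers. Assumes `mxnet` is imported as `mx`, as the code above
# does; the function name and values are placeholders.
def _demo_none_loss_scaling():
    # two fake score rows: the first is predicted as class 0 ("NONE"),
    # the second as class 1
    scores = mx.nd.array([[2.0, 0.1, 0.3],
                          [0.2, 1.5, 0.1]])
    labels = mx.nd.array([1, 1])
    loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()(scores, labels)

    # factor is 10 where the predicted class is 0 and 1 elsewhere, so
    # predictions that default to the negative class are penalized harder
    factor = (mx.nd.argmax(scores, axis=1) == 0) * 9 + 1
    print(mx.nd.multiply(loss, factor))  # first sample's loss scaled 10x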


def load_hierarchy_graph(path):
    # build a directed graph from a CSV of (parent, child) pairs
    # (the name of this helper is a placeholder)
    graph = nx.DiGraph()
    with open(path) as f:
        rd = csv.reader(f)
        for k1, k2 in rd:
            graph.add_edge(k1, k2)
    return graph


if __name__ == '__main__':
    subtask = '1.1'
    oth_path = ('/home/ls/blue-hd/datasets/saffron-hierarchies-acl/'
                'saffron-ACL-cleaned.csv')

    nlp = spacy.load('en')
    customize_tokenizer(nlp)

    dataset = pipeline.load_abstracts_relations(subtask)

    # get list of all dependency tags used in the dataset
    dependencies_map = pipeline.get_dependencies_map(dataset)

    # get list of all pos tags used in the dataset
    pos_map = pipeline.get_part_of_speech_map(dataset)

    # split it by sentence, potentially include negative samples
    sentences_dataset = pipeline.split_dataset_into_sentences(*dataset)

    # split sentences between train and test according to the
    # official dataset split
    train_sentences, _ = pipeline.split_train_test_sentences(
        subtask, sentences_dataset)
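
    # usage sketch for the helper above; `hierarchy` is a placeholder
    # variable name
    hierarchy = load_hierarchy_graph(oth_path)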