def annotate_data():
    """Run the annotation pass over every dataset and save the result.

    Builds the SST trees from the raw data and the dependency trees from
    the parsed data, hands each index-aligned pair of trees to
    ``compare_and_annotate``, and finally stores the dependency trees via
    ``pickling.save`` under ``glovar.PKL_DIR`` as
    ``annotated_dep_trees.pkl``.

    Returns:
        The object produced by ``get_dep_trees``, after every pair of
        trees has been passed through ``compare_and_annotate``.
    """
    def _report(title, tree, label_attr):
        # One line per node: id, the requested label attribute, node text.
        print(title)
        for node in tree.node_list:
            print('%s\t%s\t%s' % (
                node.id, getattr(node, label_attr), node.text_at_node))

    raw = get_raw_data()
    parsed = get_parsed_data(raw)
    # The tree builders below also take care of combining text at nodes.
    sst_trees = get_sst_trees(raw)
    dep_trees = get_dep_trees(parsed)

    for dataset, dep_set in dep_trees.items():
        sst_set = sst_trees[dataset]
        for idx in range(len(dep_set)):
            sst_tree, dep_tree = sst_set[idx], dep_set[idx]
            compare_and_annotate(sst_tree, dep_tree)
            if idx % 100:
                continue
            # Every 100th pair is printed as an integrity spot-check.
            _report('ORIGINAL', sst_tree, 'tag')
            _report('DEP', dep_tree, 'annotation')

    # Persist the annotated trees before handing them back.
    pickling.save(dep_trees, glovar.PKL_DIR, 'annotated_dep_trees.pkl')
    return dep_trees
"""For pre-processing the data."""
import os

from ext import vocab_emb, pickling
from data import sst, nli
import glovar


def _ensure_dir(path):
    """Create ``path`` with ``os.makedirs`` unless it already exists."""
    if os.path.exists(path):
        return
    os.makedirs(path)


# Make sure both directories named in glovar exist up front.
_ensure_dir(glovar.PKL_DIR)
_ensure_dir(glovar.CKPT_DIR)

# Build the vocabulary dictionary.
print('Creating vocab dict...')
# Only the NLI text feeds the vocabulary here.  To include the SST text as
# well, fetch sst.get_text() and join it in: ' '.join([sst_text, nli_text]).
nli_text = nli.get_text()
all_text = ' '.join([nli_text])
vocab_dict, _ = vocab_emb.create_vocab_dict(all_text)
pickling.save(vocab_dict, glovar.PKL_DIR, 'vocab_dict.pkl')
print('Success.')

# Build the GloVe embedding matrix for that vocabulary.
print('Creating GloVe embeddings...')
embedding_mat = vocab_emb.create_embeddings(
    vocab_dict, 300, glovar.GLOVE_DIR)
pickling.save(embedding_mat, glovar.PKL_DIR, 'glove_embeddings.pkl')
print('Success.')
def save(self):
    """Store this object via ``pickling.save``.

    The target directory is ``glovar.PKL_DIR`` and the file name is
    ``history_<name>.pkl``, where ``<name>`` is ``self.name``.
    """
    target_dir = glovar.PKL_DIR
    file_name = 'history_%s.pkl' % self.name
    pickling.save(self, target_dir, file_name)