def tokenize(self, file_ids):

    def tree2list(tree):
        if isinstance(tree, nltk.Tree):
            if tree.label() in word_tags:
                return tree.leaves()[0]
            else:
                root = []
                for child in tree:
                    c = tree2list(child)
                    if c != []:
                        root.append(c)
                if len(root) > 1:
                    return root
                elif len(root) == 1:
                    return root[0]
        return []

    sens_idx = []
    sens = []
    trees = []
    for id in file_ids:
        sentences = ptb.parsed_sents(id)
        for sen_tree in sentences:
            words = self.filter_words(sen_tree)
            words = ['<s>'] + words + ['</s>']
            # if len(words) > 50:
            #     continue
            sens.append(words)
            idx = []
            for word in words:
                idx.append(self.dictionary[word])
            sens_idx.append(torch.LongTensor(idx))
            trees.append(tree2list(sen_tree))
    return sens_idx, sens, trees
def addTrees(sec, trees):
    secNum = ("" if sec >= 10 else "0") + str(sec)
    files = os.listdir("/u/scr/corpora/ldc/1999/LDC99T42/parsed/mrg/wsj/" + secNum)
    for name in files:
        for tree in ptb.parsed_sents("WSJ/" + secNum + "/" + name):
            trees.append(tree)
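# Hypothetical driver for addTrees above (the names and section range are
# assumptions, not part of the original): collect the trees for the standard
# WSJ training sections 02-21.
all_trees = []
for sec in range(2, 22):
    addTrees(sec, all_trees)
print(len(all_trees), "trees collected")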
def trees(self, file_ids):

    def tree2list(tree):
        if isinstance(tree, nltk.Tree):
            if tree.label() in word_tags:
                w = tree.leaves()[0].lower()
                w = re.sub(r'[0-9]+', 'N', w)
                return w
            else:
                root = []
                for child in tree:
                    c = tree2list(child)
                    if c != []:
                        root.append(c)
                if len(root) > 1:
                    return root
                elif len(root) == 1:
                    return root[0]
        return []

    trees = []
    nltk_trees = []
    sens = []
    for id in file_ids:
        sentences = ptb.parsed_sents(id)
        for sen_tree in sentences:
            words = self.filter_words(sen_tree)
            words = words + ['<eos>']
            sens.append(words)
            nltk.treetransforms.chomsky_normal_form(sen_tree)
            trees.append(tree2list(sen_tree))
            nltk_trees.append(sen_tree)
    return sens, trees, nltk_trees
def add_words(self, file_ids):
    for file_id_i in file_ids:
        sentences = ptb.parsed_sents(file_id_i)
        for sen_tree in sentences:
            words = self.filter_words(sen_tree)
            for word in words:
                self.dictionary.add_word(word)
def add_words(self, file_ids):
    # Add words to the dictionary
    for id in file_ids:
        sentences = ptb.parsed_sents(id)
        for sen_tree in sentences:
            words = self.filter_words(sen_tree)
            words = ['<eos>'] + words + ['<eos>']
            for word in words:
                self.dictionary.add_word(word)
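# A minimal sketch of the Dictionary object that the add_words/tokenize
# methods above assume (the original class is not shown, so this is an
# assumption): add_word() registers a word type, __getitem__ returns its id.
class Dictionary(object):

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __getitem__(self, word):
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)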
def save(fileids, filename):
    with open(filename, 'w') as out:
        for fileid in tqdm(list(fileids)):
            for tree in ptb.parsed_sents(fileid):
                tokens, parse = [], []
                if _filter_none:
                    tree = drop_none(tree)
                flatten(tree, tokens, parse)
                sanity_checks(tokens, parse)
                data = {
                    'tokens': ' '.join(tokens),
                    'parse': ' '.join(parse)
                }
                out.write(json.dumps(data) + '\n')
def tokenize(self, file_ids, wsj10):

    def tree2list(tree):
        if isinstance(tree, nltk.Tree):
            if tree.label() in word_tags:
                return tree.leaves()[0]
            else:
                root = []
                for child in tree:
                    c = tree2list(child)
                    if c != []:
                        root.append(c)
                if len(root) > 1:
                    return root
                elif len(root) == 1:
                    return root[0]
        return []

    def tree2tree_wo_punc(old_tree):
        tree = ParentedTree.convert(old_tree)
        for sub in reversed(list(tree.subtrees())):
            if sub.height() == 2 and sub.label() not in word_tags:
                # found a non-word subtree (e.g. punctuation)
                parent = sub.parent()
                while parent and len(parent) == 1:
                    sub = parent
                    parent = sub.parent()
                print(sub, "will be deleted")
                del tree[sub.treeposition()]
        return tree

    sens = []
    trees = []
    nltk_trees = []
    for id in tqdm(file_ids):
        sentences = ptb.parsed_sents(id)
        for sen_tree in sentences:
            words = self.filter_words(sen_tree)
            if len(words) > 10 and wsj10:
                continue
            sens.append(words)
            trees.append(tree2list(sen_tree))
            nltk_trees.append(sen_tree)
    return sens, trees, nltk_trees


# corpus = Corpus('./data/WSJ', 'WSJ23')
def addTrees(sec, trees):
    secNum = ("" if sec >= 10 else "0") + str(sec)
    files = os.listdir("/u/scr/corpora/ldc/1999/LDC99T42/parsed/mrg/wsj/" + secNum)
    for name in files:
        for tree in ptb.parsed_sents("WSJ/" + secNum + "/" + name):
            leaves = " ".join([
                ("(" if x == "-LRB-" else
                 (")" if x == "-RRB-" else
                  x.replace("\\/", "/").replace("\\*", "*")))
                for x in tree.leaves()
                if "*-" not in x and not x.startswith("*")
                and x not in ["0", "*U*", "*?*"]
            ])
            if leaves not in deps:
                # only applies to one sentence in the training partition
                print(leaves)
                continue
            trees.append((tree, deps[leaves]))
def node2span(node, offsets):
    section_id, article_id, sentence_id, head_token_id, tree_height = parse_node_id(node)
    ptree = ParentedTree.convert(
        ptb.parsed_sents(f"wsj/{section_id}/wsj_{article_id}.mrg")[sentence_id])
    # Index each leaf node with its offset into the document
    offset = offsets[article_id][sentence_id]
    ptree = index_tree(ptree, offset)
    leaf_position = ptree.leaf_treeposition(head_token_id)
    span_position = leaf_position[:-(tree_height + 1)]
    span_tokens = ptree[span_position].leaves()
    return section_id, article_id, sentence_id, span_tokens, ptree
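# A minimal sketch of the parse_node_id helper assumed by node2span. Node ids
# look like "wsj_xxxx:a:b:c" (article, sentence, head token, tree height), per
# the comment in the GC2012 reader below; deriving the section from the first
# two digits of the article number is an assumption.
def parse_node_id(node_id):
    article, sentence, token, height = node_id.split(':')
    article_id = article.replace('wsj_', '')
    section_id = article_id[:2]
    return section_id, article_id, int(sentence), int(token), int(height)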
def tokenize(self, file_ids):

    def tree2list(tree):
        if isinstance(tree, nltk.Tree):
            if tree.label() in word_tags:
                w = tree.leaves()[0].lower()
                w = re.sub(r'[0-9]+', 'N', w)
                return w
            else:
                root = []
                for child in tree:
                    c = tree2list(child)
                    if c != []:
                        root.append(c)
                if len(root) > 1:
                    return root
                elif len(root) == 1:
                    return root[0]
        return []

    sens_idx = []
    sens = []
    trees = []
    nltk_trees = []
    for id in file_ids:
        sentences = ptb.parsed_sents(id)
        for sen_tree in sentences:
            words = self.filter_words(sen_tree)
            words = ['<eos>'] + words + ['<eos>']
            sens.append(words)
            if self.wvec:
                word2idx = tools.pkl_loader(
                    os.path.join('data/wordvec', self.wvec, 'words2idx'))
                idx = tools.indexesFromSentence(words, word2idx)
                sens_idx.append(idx)
            else:
                idx = []
                for word in words:
                    idx.append(self.dictionary[word])
                sens_idx.append(torch.LongTensor(idx))
            trees.append(tree2list(sen_tree))
            nltk_trees.append(sen_tree)
    return sens_idx, sens, trees, nltk_trees
def tokenize(self, file_ids):

    def tree2list(tree):
        if isinstance(tree, nltk.Tree):
            if tree.label() in word_tags:
                w = tree.leaves()[0].lower()
                w = re.sub(r'[0-9]+', 'N', w)
                return w
            else:
                root = []
                for child in tree:
                    c = tree2list(child)
                    if c != []:
                        root.append(c)
                if len(root) > 1:
                    return root
                elif len(root) == 1:
                    return root[0]
        return []

    sens_idx = []
    sens = []
    trees = []
    nltk_trees = []
    N = 0
    for id in file_ids:
        sentences = ptb.parsed_sents(id)
        for sen_tree in sentences:
            words = self.filter_words_tag(sen_tree)
            words = words + ['<eos>']
            sens.append(words)
            idx = []
            for word in words:
                idx.append(self.dictionary[word])
            sens_idx.append(idx)
            trees.append(tree2list(sen_tree))
            nltk_trees.append(sen_tree)
            N += len(words)
    return sens_idx, sens, trees, nltk_trees
def get_nltk_sents(sent_ids):
    raw = {}
    tagged = {}
    trees = {}
    data = {}
    for i in tqdm(sent_ids, desc="Collecting sentences and trees"):
        file_num, sent_num = i.split('_')
        subdir = file_num[:2]
        sent_num = int(sent_num)
        path = f'WSJ/{subdir}/WSJ_{file_num}.MRG'
        raw[i] = ptb.sents(path)[sent_num]
        tagged[i] = ptb.tagged_sents(path)[sent_num]
        trees[i] = ptb.parsed_sents(path)[sent_num]
    data['raw'] = raw
    data['tagged'] = tagged
    data['trees'] = trees
    return data
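# Example call, assuming sentence ids of the form "<file>_<sentence>" (the id
# format is inferred from the split above), e.g. sentence 3 of WSJ_0001.MRG:
sample = get_nltk_sents(['0001_3'])
print(sample['raw']['0001_3'])
print(sample['trees']['0001_3'])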
def save_file(file_ids, out_file):
    sens = []
    trees = []
    tags = []
    f_out = open(out_file, 'w')
    for f in file_ids:
        sentences = ptb.parsed_sents(f)
        for sen_tree in sentences:
            orig = sen_tree.pformat(margin=sys.maxsize).strip()
            c = 0
            while not all([tag in word_tags for _, tag in sen_tree.pos()]):
                del_tags(sen_tree, word_tags)
                c += 1
                if c > 10:
                    assert False
            out = sen_tree.pformat(margin=sys.maxsize).strip()
            while re.search(r'\(([A-Z0-9]{1,})((-|=)[A-Z0-9]*)*\s{1,}\)', out) is not None:
                out = re.sub(r'\(([A-Z0-9]{1,})((-|=)[A-Z0-9]*)*\s{1,}\)', '', out)
            out = out.replace(' )', ')')
            out = re.sub(r'\s{2,}', ' ', out)
            f_out.write(out + '\n')
    f_out.close()
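# A plausible sketch of the del_tags helper used by save_file: delete every
# POS-over-leaf subtree whose tag is not in word_tags. The original helper is
# not shown, so treat this as an assumption; the caller loops until the tree
# contains only word tags.
def del_tags(tree, word_tags):
    for sub in reversed(list(tree.subtrees())):
        for n, child in reversed(list(enumerate(sub))):
            if isinstance(child, nltk.Tree) and child.height() == 2 \
                    and child.label() not in word_tags:
                del sub[n]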
def get_raw_data():
    raw_data = {}
    fileids = ptb.fileids()
    obj_sofar = 0
    for fileid in fileids:
        corpus, section, _ = fileid.split('/')
        if corpus.lower() != 'wsj':
            continue
        section = int(section)
        if 2 <= section <= 21:
            split = 'train'
        elif section == 22:
            split = 'valid'
        elif section == 23:
            split = 'test'
        else:
            split = None
        sent_sofar = 0
        for y in ptb.parsed_sents(fileid):
            words, part_of_speech = zip(*y.pos())
            constituency_parse = tree_to_tuple(y)
            obj = collections.OrderedDict()
            obj['example_id'] = 'ptb{}'.format(obj_sofar)
            obj['file_id'] = fileid
            obj['sent_id'] = sent_sofar
            obj['words'] = words
            obj['part_of_speech'] = part_of_speech
            obj['constituency_parse'] = constituency_parse
            sent_sofar += 1
            obj_sofar += 1
            raw_data.setdefault('all', []).append(obj)
            if split is not None:
                raw_data.setdefault(split, []).append(obj)
    return raw_data
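# A minimal sketch of the tree_to_tuple helper assumed by get_raw_data:
# convert an nltk.Tree into nested (label, child, ...) tuples, keeping leaves
# as strings. The original helper is not shown.
def tree_to_tuple(tree):
    if isinstance(tree, str):
        return tree
    return (tree.label(),) + tuple(tree_to_tuple(child) for child in tree)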
def tokenize(self, file_ids):
    """Tokenizes a mrg file."""

    def tree2list(tree):
        if isinstance(tree, nltk.Tree):
            if tree.label() in WORD_TAGS:
                w = tree.leaves()[0].lower()
                w = re.sub(r'[0-9]+', 'N', w)
                return w
            else:
                root = []
                for child in tree:
                    c = tree2list(child)
                    if c:
                        root.append(c)
                if len(root) > 1:
                    return root
                elif len(root) == 1:
                    return root[0]
        return []

    sens_idx = []
    sens = []
    trees = []
    nltk_trees = []
    for file_id_i in file_ids:
        sentences = ptb.parsed_sents(file_id_i)
        for sen_tree in sentences:
            words = self.filter_words(sen_tree)
            sens.append(words)
            idx = []
            for word in words:
                idx.append(self.dictionary[word])
            sens_idx.append(idx)
            trees.append(tree2list(sen_tree))
            nltk_trees.append(sen_tree)
    return sens_idx, sens, trees, nltk_trees
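# A minimal sketch of the filter_words helper that the tokenize variants
# above assume (an assumption; the original method is not shown): keep only
# leaves whose POS tag is in WORD_TAGS, lowercase them, and map digit runs to
# 'N', mirroring the normalization done in tree2list.
def filter_words(self, tree):
    words = []
    for w, tag in tree.pos():
        if tag in WORD_TAGS:
            w = w.lower()
            w = re.sub(r'[0-9]+', 'N', w)
            words.append(w)
    return words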
"""process chunk""" path = '/Users/pengwu5501/Downloads/wsj-2' files = os.listdir(path) l = [] grammar = {} i = 0 dict_word = {} dict_unit_rule = {} unit = 0 ter = 0 for file in files: sub_path = os.path.join(path, file) mrg = os.listdir(sub_path) for item in mrg: name = os.path.join(sub_path, item) tree = ptb.parsed_sents(name) for tre in tree: list = [] p = traversal(tre, list) for item in p: it = item.split('->') late = it[1].split() if len(late) > 2: for item_ in late: if item_.find('\'') == 0: break it[0] += ' ' it[0] += '->' it[0] += ' ' it[0] += late[0] it[0] += ' '
import os

from nltk import CFG, Nonterminal
from nltk.corpus import ptb


def tree2prod(trees):
    prods = []
    for t in trees:
        prods += t.productions()
    return prods


path = '/Users/pengwu5501/nltk_data/corpora/ptb/wsj'
files = os.listdir(path)
productions = []
cnt = 0
for file in files:
    sub_path = os.path.join(path, file)
    sub_file = os.listdir(sub_path)
    for item in sub_file:
        name = os.path.join(sub_path, item)
        tbank = ptb.parsed_sents(name)
        productions += tree2prod(tbank)
    productions = list(set(productions))  # deduplicate (a bare set() call discards its result)
    cnt += 1
    print(len(productions))
gramm = CFG(Nonterminal('S'), productions)
print(gramm)
num_files_in_dir = len(
    os.listdir('/Users/morischick/nltk_data/corpora/ptb/WSJ/' + dir_num))
# print(dir_num, num_files_in_dir)
print("Beginning WSJ/", dir_num, "...")
# loop through each file
for j in range(0, num_files_in_dir):
    file_num = str(j)
    if j < 10:
        file_num = "0" + str(j)
    try:
        file_name = 'WSJ/' + dir_num + '/WSJ_' + dir_num + file_num + '.MRG'
        num_sentences = len(ptb.parsed_sents(file_name))
        genre = genre_dict[file_name]
        # print(file_name, i, j, num_sentences, genre)
    except:
        print("This file does not exist and a genre cannot be found for it")
    if genreCount[genre] < NUM_EXAMPLES:
        try:
            # loop through each sentence
            for x in range(0, num_sentences):
                if genreCount[genre] < NUM_EXAMPLES:
from nltk.corpus import ptb
from nltk.tree import Tree

t = ptb.parsed_sents(
    '/Users/pengwu5501/nltk_data/corpora/ptb/wsj/00/wsj_0001.mrg')


def getCFG(tree):
    line = ''
    if isinstance(tree, Tree):
        line += tree.label()
        line += ' '
        line += '->'
        for subtree in tree:
            if isinstance(subtree, Tree):
                line += ' '
                line += subtree.label()
            else:
                line += ' '
                line += '"'
                line += subtree
                line += '"'
    return line


lis = []


def traversal(tree):
    lis.append(getCFG(tree))
    for subtree in tree:
        if isinstance(subtree, Tree):
            traversal(subtree)
ap = argparse.ArgumentParser()
ap.add_argument("--ptbfiles", help="PennTreebank files")
ap.add_argument("--trees", help="Output trees")
ap.add_argument("--words", help="Output words, sentence per line")
args = ap.parse_args()

tree_file = open(args.trees, 'w')
word_file = open(args.words, 'w')

# TODO: can not -> cannot
for filename in sorted(
        glob.glob('/home/marecek/nltk_data/corpora/ptb/' + args.ptbfiles)):
    # print("Processing " + filename)
    trees = ptb.parsed_sents(filename)
    for i in range(len(trees)):
        # remove traces and other empty nodes
        for sub in trees[i].subtrees():
            for n, child in enumerate(sub):
                if isinstance(child, str):
                    continue
                if (all(leaf.startswith("*") for leaf in child.leaves())
                        or child.label() == '-NONE-'):
                    del sub[n]
        # extract list of POS tags and remove POS tags from the trees
        sent_tags = list()
        for sub in trees[i].subtrees():
            sub.set_label("X")
            for n, child in enumerate(sub):
                if isinstance(child, str):
def main(test=False):
    """
    Makes a big dumb PTB CFG, a ShiftReduceParser, and a ViterbiParser, and
    serializes them all to disk for future use.

    The ViterbiParser runs in cubic time and gives the most likely parse.
    The ShiftReduceParser runs in linear time and gives a single parse.

    https://stackoverflow.com/questions/7056996/how-do-i-get-a-set-of-grammar-rules-from-penn-treebank-using-python-nltk
    https://groups.google.com/forum/#!topic/nltk-users/_LXtbIekLvc
    https://www.nltk.org/_modules/nltk/grammar.html
    """
    vocabulary = chainer.datasets.get_ptb_words_vocabulary()
    freq_thresh = 0  # ARBITRARY
    word_freqs = FreqDist(ptb.words())

    if not os.path.isfile('parsers/grammar.pkl'):
        productions = []
        add_dict = {}
        # use the entire treebank's parsed sentences to generate the CFG
        for i, tree in enumerate(ptb.parsed_sents()):
            # is it a good idea to combine this with my preprocessing?
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            # preprocess all productions by removing all tags
            these_productions = tree.productions()
            for production in these_productions:
                # remove all tags from the LHS (only keep primary tag)
                production._lhs = preprocess_nt(production._lhs)

                rhs = []
                for item in production._rhs:
                    # remove all tags from the Nonterminals on the RHS
                    if type(item) == nltk.grammar.Nonterminal:
                        rhs.append(preprocess_nt(item))
                    # replace numbers with N
                    elif is_number(item):
                        rhs.append('N')
                    # items not in dictionary replaced with <unk>
                    # dictionary requires lower
                    elif not is_key(vocabulary, item.lower()):
                        rhs.append('<unk>')
                    # replace infrequent words with <unk>
                    elif word_freqs[item] < freq_thresh:
                        rhs.append('<unk>')
                    # lowercase all entries in the grammar
                    else:
                        rhs.append(item.lower())
                production._rhs = tuple(rhs)

                if not is_key(add_dict, repr(production)):
                    add_dict[repr(production)] = True
                    productions.append(production)

        print('** {} productions found! **'.format(len(productions)))
        grammar = induce_pcfg(Nonterminal('S'), productions)

        with open('parsers/grammar.pkl', 'wb') as f:
            f.write(pickle.dumps(grammar))

    if not os.path.isfile('parsers/viterbi_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        viterbi_parser = ViterbiParser(grammar, trace=0)  # cubic time
        with open('parsers/viterbi_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(viterbi_parser))

    if not os.path.isfile('parsers/shift_reduce_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        shift_reduce_parser = ShiftReduceParser(grammar, trace=0)  # linear time
        with open('parsers/shift_reduce_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(shift_reduce_parser))

    with open('data/ptb.train.txt', 'r') as f:
        data = f.readlines()

    if test:
        # load the serialized parsers so they are bound even when the
        # pickle files already existed and the branches above were skipped
        with open('parsers/viterbi_parser.pkl', 'rb') as f:
            viterbi_parser = pickle.load(f)
        with open('parsers/shift_reduce_parser.pkl', 'rb') as f:
            shift_reduce_parser = pickle.load(f)
        for sample in [1, 23, 20330, 20332, 443]:
            t1 = time.time()
            viterbi_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('viterbi = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))

            t1 = time.time()
            shift_reduce_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('shift reduce = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))
import pickle

import nltk
from nltk import Nonterminal as NT
from nltk import induce_pcfg
from nltk.corpus import ptb
from nltk.grammar import PCFG

productions = []
for i, tree in enumerate(ptb.parsed_sents()):
    tree.collapse_unary(collapsePOS=False)
    tree.chomsky_normal_form(horzMarkov=2)
    productions += tree.productions()

S = NT('S')
grammar = induce_pcfg(S, productions)
with open('ptb_grammar.pcfg', 'wb') as w:
    pickle.dump(grammar, w)
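# Example of loading the pickled grammar back and inspecting a few rules for
# the start symbol; productions(lhs=...) is standard nltk grammar API.
with open('ptb_grammar.pcfg', 'rb') as f:
    loaded = pickle.load(f)
print(loaded.start())
for prod in loaded.productions(lhs=S)[:5]:
    print(prod)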
#!/usr/bin/env python3
from nltk.corpus import ptb
import re
"""
Slice the Penn Treebank to extract sentences of at most N words and output
the gold-standard trees.

The treebank's mrg directory must be in ~/nltk_data/ptb. This outputs trees;
one can use `make item` to convert them into tagwords or words.
"""

sent_length = 20  # the maximum number of words in a sentence
parsed_sents = ptb.parsed_sents()
with open('../test.txt', 'w') as fi:
    for index, sent in enumerate(ptb.tagged_sents()):
        count = 0
        for word, tag in sent:
            if 'NONE' not in tag:
                count += 1
        if count <= sent_length:
            tree = parsed_sents[index]
            for pos in tree.treepositions('leaves'):
                tree[pos] = tree[pos].lower()
            # tree.collapse_unary(collapsePOS=True)
            tree = str(tree).replace('\n', '')
            tree = re.sub(r'\s+', ' ', tree)
            print(tree, file=fi)
def get_bnp_from_ptb(ptb_dir):
    # train_fileids = [ptb_dir+"%02d"%x for x in range(2, 22)]
    # valid_fileids = [ptb_dir+"%02d"%x for x in range(22, 23)]
    # test_fileids = [ptb_dir+"%02d"%x for x in range(23, 24)]
    train_fileids = [path.join(ptb_dir, "%02d" % x) for x in range(2, 22)]
    valid_fileids = [path.join(ptb_dir, "%02d" % x) for x in range(22, 23)]
    test_fileids = [path.join(ptb_dir, "%02d" % x) for x in range(23, 24)]
    train = []
    valid = []
    test = []
    basenp_count_train = 0
    basenp_count_valid = 0
    basenp_count_test = 0
    token_count_train = 0
    token_count_valid = 0
    token_count_test = 0
    np_lens_train = []
    np_lens_valid = []
    np_lens_test = []
    all_lens_train = []
    print_every = 1000
    for split_fileids, split_label in zip(
            [train_fileids, valid_fileids, test_fileids],
            ["train", "valid", "test"]):
        sent_ctr = 0
        for wsj_section_folderpath in split_fileids:
            all_mrg_files_in_split = glob.glob(wsj_section_folderpath + "/*.mrg")
            for mrg_file in all_mrg_files_in_split:
                parsed_sents = ptb.parsed_sents(mrg_file)
                parsed_sents_ctr = 0
                for parsed_tree in parsed_sents:
                    if sent_ctr % print_every == 0:
                        print("Currently processing %d in %s" % (sent_ctr, split_label))
                    sent_ctr += 1
                    base_np_delineated_tokens = traverse_tree(parsed_tree, True)
                    parsed_sents_ctr += 1
                    if split_label == "train":
                        basenp_count_split, token_count_split, np_lens_split, all_lens_split = \
                            count_basenps(base_np_delineated_tokens)
                        basenp_count_train += basenp_count_split
                        token_count_train += token_count_split
                        np_lens_train.extend(np_lens_split)
                        all_lens_train.extend(all_lens_split)
                        train.append(base_np_delineated_tokens)
                    elif split_label == "valid":
                        basenp_count_split, token_count_split, np_lens_split, _ = \
                            count_basenps(base_np_delineated_tokens)
                        basenp_count_valid += basenp_count_split
                        token_count_valid += token_count_split
                        np_lens_valid.extend(np_lens_split)
                        valid.append(base_np_delineated_tokens)
                    elif split_label == "test":
                        basenp_count_split, token_count_split, np_lens_split, _ = \
                            count_basenps(base_np_delineated_tokens)
                        basenp_count_test += basenp_count_split
                        token_count_test += token_count_split
                        np_lens_test.extend(np_lens_split)
                        test.append(base_np_delineated_tokens)
    print("Train")
    print("Total bag size: %d, Average size of item in bag %f" %
          (len(all_lens_train), np.mean(all_lens_train)))
    print("Base NP count: %d, Average Base NP length: %f, Token count: %d, Sentence count: %d" %
          (basenp_count_train, np.mean(np_lens_train), token_count_train, len(train)))
    print("Valid")
    print("Base NP count: %d, Average Base NP length: %f, Token count: %d, Sentence count: %d" %
          (basenp_count_valid, np.mean(np_lens_valid), token_count_valid, len(valid)))
    print("Test")
    print("Base NP count: %d, Average Base NP length: %f, Token count: %d, Sentence count: %d" %
          (basenp_count_test, np.mean(np_lens_test), token_count_test, len(test)))

    # startswith("NP"): Number of base NPs in the training set: 228399
    # Train
    # Base NP count: 228399, Average Base NP length: 2.212983, Token count: 949938, Sentence count: 39832
    # Valid
    # Base NP count: 9536, Average Base NP length: 2.273700, Token count: 40104, Sentence count: 1700
    # Test
    # Base NP count: 13457, Average Base NP length: 2.192465, Token count: 56674, Sentence count: 2416
    # Size of vocab (i.e., tokens that appear in training, not including the additional <unk>): 10000

    raw_train = remove_base_np_syms(train)
    raw_valid = remove_base_np_syms(valid)
    raw_test = remove_base_np_syms(test)
    return raw_train, raw_valid, raw_test, train, valid, test
def _read_document(file_path: str):
    print(f"Reading GC2012 instances from dataset file at: {file_path}")
    xml_tree = ET.parse(file_path)
    root = xml_tree.getroot()

    # Read in all relevant documents to get token offsets
    sentence_offsets = dict()  # sentence_offsets[doc][sentence] = starting_token_idx

    # Remove special parse tokens (e.g., "*RNR*-1") from text and reindex tokens.
    # It appears that all special characters have a parent of '-NONE-'.
    token_map = dict()  # token_map[doc][original_token_idx] = remapped_token_idx
    texts = dict()
    filtered_texts = dict()
    for annotations in root:  # getchildren() is deprecated; iterate the element directly
        trigger_node = annotations.get('for_node')  # wsj_xxxx:a:b:c
        section_id, article_id, _, _, _ = parse_node_id(trigger_node)
        if article_id in sentence_offsets.keys():
            # we've already processed this document
            continue
        else:
            sentence_offsets[article_id] = dict()
            token_map[article_id] = dict()
        parse_trees = ptb.parsed_sents(f"wsj/{section_id}/wsj_{article_id}.mrg")
        total_valid_tokens_seen = 0
        total_tokens_seen = 0
        text = []
        filtered_text = []
        for sent_id, parse_tree in enumerate(parse_trees):
            ptree = ParentedTree.convert(parse_tree)
            tokens = ptree.leaves()
            valid_token_indices = [
                i if valid_token(ptree, i) else None
                for i, x in enumerate(tokens)
            ]
            valid_tokens = [
                x for i, x in enumerate(tokens) if valid_token(ptree, i)
            ]
            sentence_offsets[article_id][sent_id] = total_tokens_seen
            for i, x in enumerate(valid_token_indices):
                if x is None:
                    # special token that should be removed (e.g. *RNR*-1)
                    token_map[article_id][total_tokens_seen] = None
                else:
                    token_map[article_id][total_tokens_seen] = total_valid_tokens_seen
                    total_valid_tokens_seen += 1
                total_tokens_seen += 1
            text.append(tokens)
            filtered_text.append(valid_tokens)
        texts[article_id] = text
        filtered_texts[article_id] = filtered_text

    # See `http://lair.cse.msu.edu/projects/implicit_annotations.html` for details.
    packets = []
    for annotations in root:
        trigger_node = annotations.get('for_node')  # wsj_xxxx:a:b:c
        trigger_section_id, trigger_article_id, trigger_sentence_id, trigger_span_tokens, _ = node2span(
            trigger_node, sentence_offsets)
        # Readjust token indices since we removed special tokens (this
        # probably doesn't happen in the data, but just to be safe)
        trigger_text, original_trigger_span = indices2range(trigger_span_tokens)
        trigger_span = (
            token_map[trigger_article_id][original_trigger_span[0]],
            token_map[trigger_article_id][original_trigger_span[1]])
        packet = {
            "document_id": f"wsj_{trigger_article_id}",
            # filtered document (does not include special parse tokens)
            "document": filtered_texts[trigger_article_id],
            "trigger": {
                "node_id": trigger_node,
                "span": trigger_span,  # offset into filtered document
                "text": trigger_text
            },
            "arguments": defaultdict(list)
        }
        printed_trigger = False
        for annotation in annotations:
            argument_node = annotation.attrib.get('node')
            argument_section_id, argument_article_id, argument_sentence_id, argument_span_tokens, _ = node2span(
                argument_node, sentence_offsets)
            if trigger_section_id != argument_section_id:
                raise ValueError(
                    f"Trigger and argument should be in same section: got "
                    f"trigger_section_id={trigger_section_id}, "
                    f"argument_section_id={argument_section_id}")
            if trigger_article_id != argument_article_id:
                raise ValueError(
                    f"Trigger and argument should be in same article: got "
                    f"trigger_article_id={trigger_article_id}, "
                    f"argument_article_id={argument_article_id}")
            argn = annotation.attrib.get('value')

            # get `attribute`
            children = list(annotation)
            assert len(children) == 1
            assert children[0].tag == 'attributes'
            attribute_children = list(children[0])
            assert len(attribute_children) <= 1
            if len(attribute_children) == 1:
                attribute = attribute_children[0].text
            else:
                attribute = ""

            # Readjust token indices since we removed special tokens
            argument_text, original_argument_span = indices2range(argument_span_tokens)
            argument_span = (
                token_map[argument_article_id][original_argument_span[0]],
                token_map[argument_article_id][original_argument_span[1]])
            if attribute == "Split":
                if not printed_trigger:
                    print("Trigger", trigger_node, trigger_sentence_id,
                          trigger_text, trigger_span)
                    printed_trigger = True
                print(argn, attribute, argument_node, argument_sentence_id,
                      argument_text, argument_span)
            packet["arguments"][argn].append({
                "node_id": argument_node,
                "span": argument_span,  # offset into filtered document
                "attribute": attribute,
                "text": argument_text
            })
        if printed_trigger:
            print(packet)
            print()
        packets.append(packet)
    return packets
def _create_data(self):
    # hard coding of the number of samples for train and valid
    # n_train = 42069
    # n_valid = 7139
    # n_total = 49208
    if self.split == 'train':
        self._create_vocab()
    else:
        self._load_vocab()
    # tokenizer = TweetTokenizer(preserve_case=False)

    # we build the dataset by looping through these inds of parsed_sents()
    if self.split == 'train':
        n_begin = 0
        n_end = 42069
    else:
        n_begin = 42069
        n_end = 49208

    data = defaultdict(dict)

    # collect all treebank sentences and nonterminals for multi-processing
    t1 = time.time()
    all_sentences = ptb.sents()
    all_sentences = all_sentences[n_begin:n_end]
    all_parses = ptb.parsed_sents()
    all_parses = all_parses[n_begin:n_end]
    t2 = time.time()
    print('read all sentences in {} sec'.format(t2 - t1))

    # preprocess all sentences in parallel
    pool = Pool()  # required for multicore
    try:
        t1 = time.time()
        preprocessed_sentences = pool.map_async(self._preprocess,
                                                all_sentences).get(9999999)
        pool.close()
        t2 = time.time()
        print('preprocessed all sentences in {} min'.format((t2 - t1) / 60.0))
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        sys.exit(1)

    # get all phrase tags in parallel
    pool = Pool()
    try:
        t1 = time.time()
        phrase_tags = pool.map_async(self._get_phrase_tags,
                                     all_parses).get(9999999)
        pool.close()
        t2 = time.time()
        print('phrase tags for all sentences collected in {} min'.format(
            (t2 - t1) / 60.0))
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        sys.exit(1)

    # now, finish things up by adding start/end tags
    t1 = time.time()
    tag_count = np.zeros(len(PHRASE_TAGS))
    for i, words in enumerate(preprocessed_sentences):
        inputs = ['<sos>'] + words
        inputs = inputs[:self.max_sequence_length]

        target = words[:self.max_sequence_length - 1]
        target = target + ['<eos>']

        assert len(inputs) == len(target), "%i, %i" % (len(inputs), len(target))
        length = len(inputs)

        inputs.extend(['<pad>'] * (self.max_sequence_length - length))
        target.extend(['<pad>'] * (self.max_sequence_length - length))

        inputs = [self.w2i.get(w, self.w2i['<unk>']) for w in inputs]
        target = [self.w2i.get(w, self.w2i['<unk>']) for w in target]

        tag_count += phrase_tags[i]

        data[i]['input'] = inputs
        data[i]['target'] = target
        data[i]['length'] = length
        data[i]['tags'] = phrase_tags[i]
    t2 = time.time()
    print('sentences loaded into dict in {} sec'.format(t2 - t1))
    for i, tag in enumerate(PHRASE_TAGS):
        print('+ tag {}, n={}'.format(tag, tag_count[i]))

    with io.open(os.path.join(self.data_dir, self.data_file), 'wb') as data_file:
        data = json.dumps(data, ensure_ascii=False)
        data_file.write(data.encode('utf8', 'replace'))

    self._load_data(vocab=False)