import os

from nltk.corpus.reader import BracketParseCorpusReader

# ConsecutivePosTagger is defined elsewhere in this project.


def get_tagger():
    dirname = os.path.dirname(__file__)
    corpus_root = os.path.join(dirname, 'training_data')
    testcaselists = BracketParseCorpusReader(corpus_root, [
        'click.txt', 'enter_text.txt', 'browser.txt',
        'load_url.txt', 'keyboard_actions.txt'
    ])
    tagger = ConsecutivePosTagger(testcaselists.tagged_sents())
    return tagger
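# A minimal usage sketch, assuming ConsecutivePosTagger implements NLTK's
# TaggerI interface (so it exposes tag()); the sample tokens are illustrative.
if __name__ == '__main__':
    tagger = get_tagger()
    print(tagger.tag(['click', 'the', 'submit', 'button']))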
import os

from nltk.corpus.reader import BracketParseCorpusReader

# DefinitenessDocument is provided by the surrounding project.


def read_wsj(article_count):
    wsj_root = '/Users/chbrown/Dropbox/ut/nlp/data/penn-treebank3/parsed/mrg/wsj'
    articles = []
    for section in range(25):
        for article_path in os.listdir('%s/%02d' % (wsj_root, section)):
            reader = BracketParseCorpusReader(
                wsj_root, '%02d/%s' % (section, article_path))
            sentences = []
            for tagged_sent in reader.tagged_sents():
                # Drop bracket and trace tokens; lowercase the rest.
                token_postag_pairs = [
                    (token.lower(), pos_tag)
                    for token, pos_tag in tagged_sent
                    if pos_tag not in ('-LRB-', '-RRB-', '-NONE-')
                ]
                sentence = DefinitenessDocument.from_token_postag_pairs(
                    token_postag_pairs)
                sentences.append(sentence)
            articles.append(sentences)
            if len(articles) >= article_count:
                return articles
    return articles
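# A hedged usage sketch: read the first five WSJ articles, assuming the
# treebank path above exists and DefinitenessDocument is importable.
if __name__ == '__main__':
    articles = read_wsj(5)
    n_sentences = sum(len(sentences) for sentences in articles)
    print('%d articles, %d sentences' % (len(articles), n_sentences))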
import logging
import os
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict

from nltk.corpus.reader import BracketParseCorpusReader, NombankCorpusReader
from nltk.corpus.reader.nombank import (NombankChainTreePointer,
                                        NombankTreePointer)
from nltk.data import FileSystemPathPointer

# DataLoader, DEDocument, and utils are provided by the surrounding project.


class NomBank(DataLoader):
    """Load NomBank data and implicit argument annotations."""

    def __init__(self, params, corpus, with_doc=False):
        super().__init__(params, corpus, with_doc)
        self.wsj_treebank = BracketParseCorpusReader(
            root=params.wsj_path,
            fileids=params.wsj_file_pattern,
            tagset='wsj',
            encoding='ascii')

        logging.info('Found {} treebank files.'.format(
            len(self.wsj_treebank.fileids())))

        self.nombank = NombankCorpusReader(
            root=FileSystemPathPointer(params.nombank_path),
            nomfile=params.nomfile,
            framefiles=params.frame_file_pattern,
            nounsfile=params.nombank_nouns_file,
            parse_fileid_xform=lambda s: s[4:],
            parse_corpus=self.wsj_treebank)

        logging.info("Loading G&C annotations.")
        self.gc_annos = self.load_gc_annotations()
        num_gc_preds = sum(
            len(preds) for (d, preds) in self.gc_annos.items())
        logging.info(f"Loaded {num_gc_preds} predicates.")

        logging.info("Loading NomBank annotations.")
        self.nombank_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            docid = nb_instance.fileid.split('/')[-1]
            self.nombank_annos[docid].append(nb_instance)

        self.stats = {
            'target_pred_count': Counter(),
            'predicates_with_implicit': Counter(),
            'implicit_slots': Counter(),
        }

        self.stat_dir = params.stat_dir

    class NomElement:
        def __init__(self, article_id, sent_num, tree_pointer):
            self.article_id = article_id
            self.sent_num = int(sent_num)
            self.pointer = tree_pointer

        @staticmethod
        def from_text(pointer_text):
            parts = pointer_text.split(':')
            if len(parts) != 4:
                raise ValueError("Invalid pointer text.")

            read_id = parts[0]
            full_id = read_id.split('_')[1][:2] + '/' + read_id + '.mrg'

            return NomBank.NomElement(
                full_id, int(parts[1]),
                NombankTreePointer(int(parts[2]), int(parts[3])))

        def __str__(self):
            return 'Node-%s-%s:%s' % (
                self.article_id, self.sent_num, self.pointer.__repr__())

        def __hash__(self):
            return hash(
                (self.article_id, self.sent_num, self.pointer.__repr__()))

        def __eq__(self, other):
            return other and other.__str__() == self.__str__()

        __repr__ = __str__

    def load_gc_annotations(self):
        tree = ET.parse(self.params.implicit_path)
        root = tree.getroot()

        gc_annotations = defaultdict(dict)

        def merge_split_pointers(pointers):
            all_pointers = []
            split_pointers = []

            for pointer, is_split in pointers:
                if is_split:
                    split_pointers.append(pointer)
                else:
                    all_pointers.append(pointer)

            if len(split_pointers) > 0:
                # Sort in place; the original called sorted() and discarded
                # the result.
                split_pointers.sort(key=lambda t: t.wordnum)
                all_pointers.append(NombankChainTreePointer(split_pointers))

            return all_pointers

        total_implicit_count = 0
        total_preds = 0

        for annotations in root:
            pred_node_pos = annotations.attrib['for_node']
            predicate = NomBank.NomElement.from_text(pred_node_pos)

            article_id = predicate.article_id
            total_preds += 1

            explicit_roles = set()
            arg_annos = defaultdict(list)

            for annotation in annotations:
                arg_type = annotation.attrib['value']
                arg_node_pos = annotation.attrib['node']

                (arg_article_id, arg_sent_id,
                 arg_terminal_id, arg_height) = arg_node_pos.split(':')

                is_split = False
                is_explicit = False

                for attribute in annotation[0]:
                    if attribute.text == 'Split':
                        is_split = True
                    elif attribute.text == 'Explicit':
                        is_explicit = True

                if pred_node_pos == arg_node_pos:
                    # Incorporated nodes are explicit.
                    is_explicit = True

                if is_explicit:
                    explicit_roles.add(arg_type)
                else:
                    p = NombankTreePointer(int(arg_terminal_id),
                                           int(arg_height))
                    # Arguments are grouped by their sentences.
                    arg_annos[(arg_sent_id, arg_type)].append((p, is_split))

            all_args = defaultdict(list)
            implicit_role_here = set()
            for (arg_sent_id, arg_type), l_pointers in arg_annos.items():
                if int(arg_sent_id) > predicate.sent_num:
                    # Ignore annotations that follow the predicate's sentence.
                    continue

                if arg_type not in explicit_roles:
                    for p in merge_split_pointers(l_pointers):
                        arg_element = NomBank.NomElement(
                            article_id, arg_sent_id, p)

                        if not predicate.pointer == arg_element.pointer:
                            # Skip incorporated arguments.
                            all_args[arg_type].append(arg_element)
                            implicit_role_here.add(arg_type)

            gc_annotations[article_id.split('/')[-1]][predicate] = all_args

            total_implicit_count += len(implicit_role_here)

        logging.info(f"Loaded {total_preds} predicates, "
                     f"{total_implicit_count} implicit arguments.")

        return gc_annotations

    def add_predicate(self, doc, parsed_sents, predicate_node):
        pred_node_repr = "%s:%d:%s" % (
            doc.docid, predicate_node.sent_num, predicate_node.pointer)
        p_tree = parsed_sents[predicate_node.sent_num]
        p_word_idx = utils.make_words_from_pointer(
            p_tree, predicate_node.pointer)
        predicate_span = utils.get_nltk_span(
            doc.token_spans, predicate_node.sent_num, p_word_idx)

        if len(predicate_span) == 0:
            logging.warning("Zero length predicate found.")
            return

        p = doc.add_predicate(None, predicate_span, frame_type='NOMBANK')

        if p:
            p.add_meta('node', pred_node_repr)

        return p

    def add_nombank_arg(self, doc, parsed_sents, wsj_spans, arg_type,
                        predicate, arg_node, implicit=False):
        arg_type = arg_type.lower()

        a_tree = parsed_sents[arg_node.sent_num]
        a_word_idx = utils.make_words_from_pointer(a_tree, arg_node.pointer)
        arg_node_repr = "%s:%d:%s" % (
            doc.docid, arg_node.sent_num, arg_node.pointer)
        argument_span = utils.get_nltk_span(
            wsj_spans, arg_node.sent_num, a_word_idx)

        if len(argument_span) == 0:
            # Some arguments are empty nodes; they are ignored.
            return

        em = doc.add_entity_mention(None, argument_span)

        if em:
            if implicit:
                arg_type = 'i_' + arg_type

            arg_mention = doc.add_argument_mention(
                predicate, em.aid, arg_type)
            arg_mention.add_meta('node', arg_node_repr)

            if implicit:
                arg_mention.add_meta('implicit', True)
                arg_mention.add_meta('sent_num', arg_node.sent_num)
                arg_mention.add_meta('text', em.text)

            return arg_mention

    def get_predicate_text(self, p):
        p_text = p.text.lower()

        if p_text in ('losses', 'loss', 'tax-loss'):
            p_text = 'loss'
        else:
            p_text = p_text.rstrip('s')

        if p_text == 'savings-and-loan':
            p_text = 'loan'

        if '-' in p_text:
            p_text = p_text.split('-')[1]

        return p_text

    def add_all_annotations(self, doc, parsed_sents):
        logging.info("Adding NomBank annotation for " + doc.docid)

        nb_instances = self.nombank_annos[doc.docid]
        for nb_instance in nb_instances:
            predicate_node = NomBank.NomElement(
                doc.docid, nb_instance.sentnum, nb_instance.predicate)

            p = self.add_predicate(doc, parsed_sents, predicate_node)

            for argloc, argid in nb_instance.arguments:
                arg_node = NomBank.NomElement(
                    doc.docid, nb_instance.sentnum, argloc)
                arg = self.add_nombank_arg(
                    doc, parsed_sents, doc.token_spans, argid, p, arg_node)

                if arg_node.pointer == predicate_node.pointer:
                    arg.add_meta('incorporated', True)

        if not self.params.explicit_only and doc.docid in self.gc_annos:
            for predicate_node, gc_args in self.gc_annos[doc.docid].items():
                added_args = defaultdict(list)

                p = self.add_predicate(doc, parsed_sents, predicate_node)
                p_text = utils.normalize_pred_text(p.text)

                p.add_meta('from_gc', True)
                self.stats['target_pred_count'][p_text] += 1

                for arg_type, arg_nodes in gc_args.items():
                    for arg_node in arg_nodes:
                        arg = self.add_nombank_arg(
                            doc, parsed_sents, doc.token_spans, arg_type,
                            p, arg_node, True)
                        added_args[arg_type].append(arg)

                        # The following should be redundant by now.
                        if arg_node.pointer == predicate_node.pointer:
                            arg.add_meta('incorporated', True)

                        if arg_node.sent_num > predicate_node.sent_num:
                            arg.add_meta('succeeding', True)

                if len(added_args) > 0:
                    self.stats['predicates_with_implicit'][p_text] += 1
                    self.stats['implicit_slots'][p_text] += len(added_args)

    def set_wsj_text(self, doc, fileid):
        text = ''
        w_start = 0

        spans = []
        for tagged_sent in self.wsj_treebank.tagged_sents(fileid):
            word_spans = []

            for word, tag in tagged_sent:
                if not tag == '-NONE-':
                    text += word + ' '
                    word_spans.append((w_start, w_start + len(word)))
                    w_start += len(word) + 1
                else:
                    # Trace words are skipped, but keep a placeholder span.
                    word_spans.append(None)

            text += '\n'
            w_start += 1

            spans.append(word_spans)

        doc.set_text(text)

        return spans

    def load_nombank(self):
        all_annos = defaultdict(list)
        for nb_instance in self.nombank.instances():
            all_annos[nb_instance.fileid].append(nb_instance)
        return all_annos

    def get_doc(self):
        for docid, instances in self.nombank_annos.items():
            if self.params.gc_only and docid not in self.gc_annos:
                continue

            doc = DEDocument(self.corpus)
            doc.set_id(docid)

            fileid = docid.split('_')[-1][:2] + '/' + docid

            parsed_sents = self.wsj_treebank.parsed_sents(fileids=fileid)
            doc.set_parsed_sents(parsed_sents)

            token_spans = self.set_wsj_text(doc, fileid)
            doc.set_token_spans(token_spans)

            self.add_all_annotations(doc, parsed_sents)

            yield doc

    def print_stats(self):
        logging.info("Corpus statistics from NomBank")
        keys = self.stats.keys()

        headline = 'predicate\t' + '\t'.join(keys)

        sums = Counter()

        if not os.path.exists(self.stat_dir):
            os.makedirs(self.stat_dir)

        preds = sorted(self.stats['predicates_with_implicit'].keys())

        with open(os.path.join(self.stat_dir, 'counts.txt'), 'w') as out:
            print(headline)
            out.write(f'{headline}\n')

            for pred in preds:
                line = f"{pred}:"
                for key in keys:
                    line += f"\t{self.stats[key][pred]}"
                    sums[key] += self.stats[key][pred]

                print(line)
                out.write(f'{line}\n')

            sum_line = 'Total\t' + '\t'.join([str(sums[k]) for k in keys])
            print(sum_line)
            out.write(f'{sum_line}\n')
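# A hedged usage sketch. The paths below are placeholders, and the parameter
# names mirror the attributes the class reads; `corpus` comes from the
# surrounding project.
from types import SimpleNamespace

params = SimpleNamespace(
    wsj_path='/path/to/treebank/parsed/mrg/wsj',
    wsj_file_pattern=r'\d\d/wsj_.*\.mrg',
    nombank_path='/path/to/nombank',
    nomfile='nombank.1.0',
    frame_file_pattern=r'frames/.*\.xml',
    nombank_nouns_file='nombank.1.0.words',
    implicit_path='/path/to/gc_annotations.xml',
    stat_dir='stats',
    explicit_only=False,
    gc_only=False,
)
loader = NomBank(params, corpus)
for doc in loader.get_doc():
    print(doc.docid)
loader.print_stats()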
    while sentence[i] != "" or len(sentence) <= 3:
        tags.append(get_next_tag(pos_dist, tags[i]))
        sentence.append(get_next_word(t2w_dist, tags[i + 1]))
        i += 1
    return (sentence, tags)


# In[ ]:


# Import and parse the corpus
corpus_root = './corpus_clean/'
corpus = BracketParseCorpusReader(corpus_root, ".*")
tagged_sentences = corpus.tagged_sents()

ngram_input = []
pos_input = []

legal_tags = ["EOS", "$", "#", "GW", "CC", "CD", "DT", "EX", "FW", "IN",
              "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNS", "NNP", "NNPS",
              "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "TO",
              "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP",
              "WP$", "WRB", "\"", "\'", ",", ".", "AFX"]

single_letter_words = ["a", "i", ",", ".", "!", "?", "\'", "\"", ":", ";",
                       "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
                       "=", "&", "#", "/", ">", "$", "<", "+", "%"]

# tags_removed = ["-NONE-", "SYM", "CODE", "ADD", "HYPH", "-LSB-", "-RSB-",
#                 ":", "NFP", "XX", "-LRB-", "-RRB-"]

# Remove -NONE- and SYM tags from the training data and create a list of
# tokens and a list of tags.
for sentence in tagged_sentences:
    for token in sentence:
        word = token[0].lower()
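        # A hedged sketch of how this loop plausibly continues: keep tokens
        # whose tag is in legal_tags, skipping one-letter strings that are not
        # real single-letter words. The original notebook's exact filtering is
        # not shown in this excerpt.
        tag = token[1]
        if tag in legal_tags and (len(word) > 1
                                  or word in single_letter_words):
            ngram_input.append(word)
            pos_input.append(tag)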
from nltk.corpus.reader import BracketParseCorpusReader

# Node and SplitNode are provided by the surrounding project.


class PTBReader(object):
    def __init__(self, corpus_root, file_pattern):
        self.ptb = BracketParseCorpusReader(corpus_root, file_pattern)
        self.all_sents = []
        self.all_tagged_sents = []
        self.all_parsed_sents = []
        self.ptb_file_id = ''

    def read_ptb_file(self, node):
        if node.file_id != self.ptb_file_id:
            path = '{0}/{1}.mrg'.format(node.directory, node.file_id)
            self.all_sents = self.ptb.sents(fileids=path)
            self.all_tagged_sents = self.ptb.tagged_sents(fileids=path)
            self.all_parsed_sents = self.ptb.parsed_sents(fileids=path)
            self.ptb_file_id = node.file_id

    def get_subtree_pos(self, node):
        parsed_sent = self.all_parsed_sents[node.sent_id]
        token_pos = parsed_sent.leaf_treeposition(node.token_id)
        subtree_pos = token_pos[:-(node.phrase_level + 1)]
        return subtree_pos

    def is_child_node(self, parent, child):
        if not (isinstance(parent, Node) and isinstance(child, Node)):
            return False
        if not (parent.file_id == child.file_id
                and parent.sent_id == child.sent_id):
            return False

        self.read_ptb_file(parent)

        parent_subtree_pos = self.get_subtree_pos(parent)
        child_subtree_pos = self.get_subtree_pos(child)

        return child_subtree_pos[:len(parent_subtree_pos)] == parent_subtree_pos

    def parse_node(self, node):
        if node.__class__ == SplitNode:
            # Parse each node in the split node.
            for n in node.node_list:
                self.parse_node(n)

            # Combine the ptb_surface of each node.
            node.ptb_idx_list = [
                idx for n in node.node_list for idx in n.ptb_idx_list
            ]
            node.ptb_surface = ' '.join(
                [n.ptb_surface for n in node.node_list])
        else:
            self.read_ptb_file(node)
            node.subtree_pos = self.get_subtree_pos(node)

            parsed_sent = self.all_parsed_sents[node.sent_id]
            node.ptb_idx_list = []
            for idx in range(len(parsed_sent.leaves())):
                if parsed_sent.leaf_treeposition(idx)[:len(node.subtree_pos)] \
                        == node.subtree_pos:
                    node.ptb_idx_list.append(idx)

            # Compare against a list: in Python 3 a list never equals a range,
            # so the original comparison was always False.
            assert node.ptb_idx_list == list(
                range(node.ptb_idx_list[0], node.ptb_idx_list[-1] + 1)), \
                'Error in matching indices for subtree leaves: {0}'.format(node)

            tagged_sent = self.all_tagged_sents[node.sent_id]
            node.ptb_surface = ' '.join([
                word[0] for word in [tagged_sent[i] for i in node.ptb_idx_list]
            ])
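# A hedged usage sketch: the Node constructor and its attribute values below
# (directory, file_id, sent_id, token_id, phrase_level) are hypothetical,
# chosen to mirror the attributes PTBReader reads; the paths are illustrative.
if __name__ == '__main__':
    reader = PTBReader('/path/to/treebank/parsed/mrg/wsj',
                       r'\d\d/wsj_.*\.mrg')
    node = Node(directory='00', file_id='wsj_0001',
                sent_id=0, token_id=2, phrase_level=1)
    reader.parse_node(node)
    print(node.ptb_surface)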