def item_based_to_class_based(lst):
    d = {}
    for item, class_cluster in enumerate(lst):
        update_list_dict(d, class_cluster, item)
    for class_cluster in d:
        d[class_cluster] = set(d[class_cluster])
    return d

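# Every snippet in this collection relies on a few small utilities from the
# project's helpers module that are not shown here. The sketches below are
# hypothetical reconstructions inferred from the call sites; the real
# implementations may differ.

def update_list_dict(d, key, value):
    # Append value to the list stored under key, creating the list on first use.
    if key not in d:
        d[key] = []
    d[key].append(value)


def update_str_dict(d, key, value):
    # Concatenate value onto the string stored under key (assumed space-separated).
    if key not in d:
        d[key] = value
    else:
        d[key] += ' ' + value


def all_in_list(items, container):
    # True only if every element of items also appears in container.
    return all(item in container for item in items)
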
def convert_to_conll(tokens, conll_path, isgold, events_only=False, data_set='all', save=True):
    s = ''
    fname = 'ECB+/ecbplus_all'  # document.fname
    empty_line = fname + ' -\n'
    coref_ids = {}
    continuing_mids = []
    for i, token in enumerate(tokens):
        if token.not_mention():
            s += empty_line
        else:
            s += fname + ' '
            put = False
            new_continuing_mids = []
            # putting starters and single-tokeners
            for mid in token.coref_ids:
                if mid not in continuing_mids:
                    if token.is_last_token_for_mid(mid):  # single-token mention
                        s += '(' + str(mid) + ')'
                    else:  # start new mention
                        s += '(' + str(mid)
                        new_continuing_mids.append(mid)
                    put = True
                    helpers.update_list_dict(coref_ids, mid, token)
            # putting enders
            cont_mid_list = list(continuing_mids)
            for mid in cont_mid_list:
                if token.is_last_token_for_mid(mid) or i == len(tokens) - 1:
                    s += str(mid) + ')'
                    continuing_mids.remove(mid)
                    put = True
            if not put:
                s += '-'
            s += '\n'
            continuing_mids += new_continuing_mids
    print '( = %d, ) = %d' % (s.count('('), s.count(')'))
    # assert s.count('(') == s.count(')')
    total_mentions = 0
    non_singleton_chains = 0
    singletons = 0
    for cid in coref_ids:
        total_mentions += len(coref_ids[cid])
        non_singleton_chains += 1 if len(coref_ids[cid]) > 1 else 0
        singletons += 1 if len(coref_ids[cid]) == 1 else 0
    print 'TOTAL MENTIONS:', total_mentions
    print 'TOTAL CHAINS:', non_singleton_chains
    print 'TOTAL SINGLETONS:', singletons
    if save:
        with open(conll_path + conll_file_name(isgold, events_only, data_set), 'w') as f:
            f.write('#begin document (ECB+/ecbplus_all); part 000\n')
            f.write(s)
            f.write('\n#end document\n')

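# For orientation, the coreference column written above follows the bracketed
# CoNLL convention: '(id' opens a multi-token mention, 'id)' closes it, '(id)'
# marks a single-token mention, and '-' marks a token outside any mention.
# A few illustrative (not actual) output lines might look like:
#
#   #begin document (ECB+/ecbplus_all); part 000
#   ECB+/ecbplus_all -
#   ECB+/ecbplus_all (3
#   ECB+/ecbplus_all 3)
#   ECB+/ecbplus_all (7)
#   ECB+/ecbplus_all -
#   #end document
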
def get_clusters(self, events_only=True, topics=helpers.ALL_TOPICS):
    clusters = {}
    for mention in self.itermentions():
        if mention.topic() in topics:
            if not events_only or mention.is_event():
                helpers.update_list_dict(clusters, mention.coref_chain_id, mention)
    return clusters

def __init__(self, mention_coref_clusters, all_tokens, events_only=False,
             data_set='all', with_topics=False, topics=helpers.ALL_TOPICS):
    """
    :type mention_coref_clusters: MentionCoreferenceClusters
    :type all_tokens: list
    """
    self.mentions_dict = mention_coref_clusters.get_mentions_by_class(topics=topics)
    self.mentions_by_doc_dict = mention_coref_clusters.get_mentions_by_doc(topics=topics)
    self.gold_clusters = mention_coref_clusters.get_clusters(topics=topics)
    self.tokens = all_tokens
    # hash a token table with filename and token id for quick access
    self.token_hash_table = {(t.fname.replace('.txt', ''), int(t.tid),): t for t in self.tokens}
    self.tokens_by_doc = {}
    for t in self.tokens:
        helpers.update_list_dict(self.tokens_by_doc, t.fname.replace('.txt', ''), t)
    self.events_only = events_only
    self.data_set = data_set
    self.predictor_name = ''
    self.document_pairs = None
    self.positive_mention_pairs = None
    self.negative_mention_pairs = None
    self.with_topics = with_topics
    self.set_name()
    # only resetting token coreference values, NOT mentions
    for token in self.tokens:
        token.reset_coreference()
    for mention in self.itermentions():
        for token in mention.tokens:
            token.reset_coreference()
    self.new_coref_id = 1

def sort_tokens_into_docs(self, word_lists=False):
    d = {}
    for token in self.tokens:
        if not word_lists:
            helpers.update_list_dict(d, token.fname.replace('.txt', ''), token)
        else:
            helpers.update_str_dict(d, token.fname.replace('.txt', ''), token.word.decode('utf-8'))
    return d

def get_mentions_by_class(self, topics=helpers.ALL_TOPICS):
    d = {}
    for mention in self.itermentions():
        if mention.topic() in topics:
            tag = mention.get_class()
            if tag.startswith('NEG'):
                tag = 'ACTION'
            helpers.update_list_dict(d, tag.split('_')[0], mention)
    return d

def get_annotated_sentences(fname):
    f_to_sent_dict = {}
    with open(fname, 'rb') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        csv_reader.next()
        for row in csv_reader:
            fname = row[0] + '_' + row[1] + '.xml'
            sentence_num = int(row[2])
            helpers.update_list_dict(f_to_sent_dict, fname, sentence_num)
    return f_to_sent_dict

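# The CSV read above is assumed to have a header row (skipped by the first
# .next() call) followed by rows whose first two fields combine into an XML
# file name and whose third field is an annotated sentence number. Illustrative
# rows, not actual data:
#
#   Topic,File,Sentence Number
#   1,1ecb,0
#   1,1ecbplus,2
#
# which would map '1_1ecb.xml' -> [0] and '1_1ecbplus.xml' -> [2].
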
def get_mentions_by_doc(self, fname=None, topics=helpers.ALL_TOPICS):
    mentions_by_doc = {}
    mentions = []
    for mention in self.itermentions():
        if mention.topic() in topics:
            if fname and mention.fname == fname:
                mentions.append(mention)
            else:
                helpers.update_list_dict(mentions_by_doc, mention.fname, mention)
    if fname:
        return mentions
    return mentions_by_doc

def test_coref_extraction(coredocs):
    singletons = 0
    corefs_dict = {}
    clust = coredocs.get_clusters()
    for mention in clust.itermentions():
        helpers.update_list_dict(corefs_dict, mention.coref_chain_id, mention)
        if mention.is_singleton:
            singletons += 1
    total_count = 0
    single_item_lists = 0
    for iid in corefs_dict:
        if len(corefs_dict[iid]) == 1:
            single_item_lists += 1
        total_count += len(corefs_dict[iid])

def count_ecb(count_dict, path, xml_file, annotated_sentences):
    relevant_toks = []
    root = ET.parse(path + xml_file, parser=ET.XMLParser()).getroot()
    for tok in root.findall('token'):
        sentnum = int(tok.get('sentence'))
        if sentnum in annotated_sentences:
            relevant_toks.append(tok.get('t_id'))
    # get mentions
    for mention in root.find('Markables'):
        if mention.attrib.has_key('TAG_DESCRIPTOR'):
            continue
        mention_tok_ids = [child.get('t_id') for child in mention]
        if helpers.all_in_list(mention_tok_ids, relevant_toks):
            helpers.update_list_dict(count_dict, mention.tag, mention)

def delta_filter(self, clusters, delta, build_test_comparison=False):
    m_topics = self.get_mentions_by_topic(events_only=self.events_only, split_into_topics=False)
    all_mentions = []
    delta_clusters = collections.defaultdict(list)
    clust_key_idxs = collections.defaultdict(int)
    # iterate
    for mlst in m_topics.itervalues():
        mentions = sorted(mlst, key=Mention.get_comparator_function())
        mentions = [m for m in mentions if m.is_event() or not self.events_only]
        for i, mention in enumerate(mentions):
            # delta stuff
            clusters_to_check = clust_key_idxs[clusters[i]]
            for j in range(0, clusters_to_check + 1):
                clust_key = (clusters[i], j,)  # define key for this mention
                if clust_key in delta_clusters:
                    # compare tfidf for lemma delta
                    my_tfidf = get_doc_tfidf_rep(mention.fname)
                    broken = False
                    for m in delta_clusters[clust_key]:
                        check_tfidf = get_doc_tfidf_rep(m.fname)
                        sim = cos_sim(mention.fname, m.fname, my_tfidf, check_tfidf)
                        if sim < delta:
                            broken = True
                            break
                    if not broken:
                        # then it is similar enough, add it!
                        helpers.update_list_dict(delta_clusters, clust_key, mention)
                        break
                # then this was too different, so add a new cluster!
                # this must only be reached if it wasn't added to a cluster
                # otherwise it would have broken out of the loop
                # also it cannot be in clusters if we are at end since
                # otherwise we would not be at the end
                if j == clusters_to_check:
                    delta_clusters[clust_key].append(mention)
                    clust_key_idxs[clusters[i]] += 1
        all_mentions += mentions
    test_comp = None
    if build_test_comparison:
        test_comp = [m.coref_chain_id for m in all_mentions]
    # this updates the coref_ids on all mentions
    self.set_tokens_to_clusters({i: cluster for i, cluster in enumerate(delta_clusters.values())})
    return test_comp, [m.coref_chain_id for m in all_mentions]

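# The delta filtering above (and in predict further below) assumes two helpers
# that are not shown: get_doc_tfidf_rep(fname), which returns some tf-idf
# representation of a document, and cos_sim(fname_a, fname_b, rep_a, rep_b),
# which scores their similarity. A minimal sketch, assuming the representations
# are plain {term: weight} dicts (the real implementations may cache and differ):
import math

def cos_sim_sketch(fname_a, fname_b, rep_a, rep_b):
    # Identical documents are trivially maximally similar.
    if fname_a == fname_b:
        return 1.0
    # Standard cosine similarity over sparse term-weight dicts.
    dot = sum(w * rep_b.get(term, 0.0) for term, w in rep_a.items())
    norm_a = math.sqrt(sum(w * w for w in rep_a.values()))
    norm_b = math.sqrt(sum(w * w for w in rep_b.values()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)
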
def set_is_last_token(self, mid, boolean):
    helpers.update_list_dict(self.is_last_token_in_mention, mid, boolean)

def __init__(self, xml_file, path, topic, annotated_sentences, pmention_clusters,
             get_all=False, events_only=False):
    """
    :type pmention_clusters: MentionCoreferenceClusters
    """
    # organizational attributes
    self.fname = xml_file
    self.category = helpers.get_category(xml_file)
    self.topic = topic
    self.docnum = int(xml_file.split('_')[1].strip(self.category + '.xml'))
    self.sentences = {}
    self.mentions = {}
    # parsing constants initialization
    tokens_dict = {}
    relevant_tokens = {}
    # token parsing
    root = ET.parse(path + xml_file, parser=ET.XMLParser()).getroot()
    for tok in root.findall('token'):
        sentnum = int(tok.get('sentence'))
        if get_all or sentnum in annotated_sentences:
            tid = tok.get('t_id')
            token = EcbToken(tok.text, tid, sentnum)
            helpers.update_list_dict(tokens_dict, sentnum, token)
            relevant_tokens[tid] = token
    self.sentences = {sentnum: EcbSentence(token_list, sentnum, xml_file)
                      for sentnum, token_list in tokens_dict.iteritems()}
    # get mentions
    for item in root.find('Markables'):
        tag = item.tag
        mid = item.get('m_id')
        if is_not_action(tag) and events_only:
            continue
        if item.attrib.has_key('TAG_DESCRIPTOR'):
            pmention_clusters.add_instance(
                RealInstance(tag, mid, item.get('TAG_DESCRIPTOR'), item.get('instance_id'), xml_file))
        else:
            orig_ids = [int(child.get('t_id')) for child in item]
            mention_tok_ids = map(str, range(orig_ids[0], orig_ids[-1] + 1))
            if helpers.all_in_list(mention_tok_ids, relevant_tokens.keys()):
                # old_len = len(corefed_token_ids)
                # corefed_token_ids = corefed_token_ids.union(set(orig_ids))
                # if len(corefed_token_ids) - old_len != len(orig_ids):
                #     print 'OVERLAP - continuing'
                #     continue
                tokens = [relevant_tokens[tid] for tid in mention_tok_ids]
                self.mentions[mid] = Mention(self.fname, mid, tag, tokens,
                                             is_continuous=orig_ids == mention_tok_ids)
    # get coreference
    mids_mapped = set()
    for coreference in root.find('Relations'):
        iid = coreference.get('note')
        if iid is None:  # intra-doc-coref
            iid = helpers.get_intra_doc_iid(coreference.find('target').get('m_id'), xml_file)
        for child in coreference:
            mid = child.get('m_id')
            if child.tag == 'source' and mid in self.mentions.keys():
                try:
                    pmention_clusters.add_mention(iid, self.mentions[mid])
                    mids_mapped.add(mid)
                except KeyError:
                    pass
    for mid, mention in self.mentions.iteritems():
        if mid not in mids_mapped:
            pmention_clusters.add_singleton_mention(mention)

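# For orientation, the constructor above expects ECB+ style XML roughly shaped
# like the illustrative (not verbatim) fragment below: <token> elements carrying
# t_id and sentence attributes, a <Markables> block whose children carry m_id
# and either token anchors or a TAG_DESCRIPTOR/instance_id pair, and a
# <Relations> block linking source mentions to a target instance via its m_id,
# with cross-document chains identified by the note attribute:
#
#   <Document>
#     <token t_id="1" sentence="0">Lindsay</token>
#     <token t_id="2" sentence="0">checks</token>
#     <Markables>
#       <ACTION_OCCURRENCE m_id="15"><token_anchor t_id="2"/></ACTION_OCCURRENCE>
#       <ACTION_OCCURRENCE m_id="40" TAG_DESCRIPTOR="rehab_check_in" instance_id="ACT001"/>
#     </Markables>
#     <Relations>
#       <CROSS_DOC_COREF note="ACT001">
#         <source m_id="15"/>
#         <target m_id="40"/>
#       </CROSS_DOC_COREF>
#     </Relations>
#   </Document>
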
def predict(self, build_test_comparison=True, delta=0.0):
    super(BaselineLemmaPredictor, self).predict()
    m_topics = self.get_mentions_by_topic(events_only=self.events_only, split_into_topics=False)
    all_mentions = []
    clusters = collections.defaultdict(list)
    lemma_keys_idxs = collections.defaultdict(int)
    for mlst in m_topics.itervalues():
        mentions = sorted(mlst, key=Mention.get_comparator_function())
        mentions = [m for m in mentions if m.is_event() or not self.events_only]
        for i, mention in enumerate(mentions):
            best_lemmas = []
            # set best_lemmas - for now it is by tuples, may be too restrictive..
            for token in mention.tokens:
                if not helpers.is_stop_word(token.word):
                    best_lemmas.append(token.lemma)
            if len(best_lemmas) == 0:
                best_lemmas = [t.lemma for t in mention.tokens]
            # HEAD LEMMA ONLY!!!!
            best_lemmas = [mention.tokens[0].lemma]
            lemmas = tuple(best_lemmas + [mention.topic()])
            if not self.with_topics:
                # remove topic if we aint doin topics
                lemmas = lemmas[:-1]
            # delta stuff
            clusters_to_check = lemma_keys_idxs[lemmas]
            for i in range(0, clusters_to_check + 1):
                lemmas_key = (lemmas, i,)
                if lemmas_key in clusters:
                    # compare tfidf for lemma delta
                    my_tfidf = get_doc_tfidf_rep(mention.fname)
                    broken = False
                    for m in clusters[lemmas_key]:
                        check_tfidf = get_doc_tfidf_rep(m.fname)
                        sim = cos_sim(mention.fname, m.fname, my_tfidf, check_tfidf)
                        if sim < delta:
                            broken = True
                            break
                    if not broken:
                        # then it is similar enough, add it!
                        helpers.update_list_dict(clusters, lemmas_key, mention)
                        break
                # then this was too different, so add a new cluster!
                # this must only be reached if it wasn't added to a cluster
                # otherwise it would have broken out of the loop
                # also it cannot be in clusters if we are at end since
                # otherwise we would not be at the end
                if i == clusters_to_check:
                    clusters[lemmas_key].append(mention)
                    lemma_keys_idxs[lemmas] += 1
        all_mentions += mentions
    test_comp = None
    if build_test_comparison:
        test_comp = [m.coref_chain_id for m in all_mentions]
    # this updates the coref_ids on all mentions
    self.set_tokens_to_clusters({i: cluster for i, cluster in enumerate(clusters.values())})
    return test_comp, [m.coref_chain_id for m in all_mentions]