def getHead(text):
    """duplicates the head generation in java

    Walks the tokens of *text* left to right, stopping at the first
    break word (unless it is the very first token) or at a token ending
    in a comma, and returns the last surviving token as the head noun.
    Conjunctions are delegated to utils.conjHead().

    text -- a noun-phrase string
    Returns the head token (str), or "" when no head survives the
    trimming (previously this path wrote a warning and then crashed
    with IndexError on new_text.split()[-1]).
    """
    text = text.strip()
    #check if conjunction
    if utils.isConj(text):
        return utils.conjHead(text)

    tokens = text.split()
    new_text = ""
    first = True
    for word in tokens:
        # A break word ends the head span -- but never on the first token,
        # so phrases that *start* with a break word still get a head.
        if utils.break_word(word) and not first:
            break
        # A trailing comma ends the span; keep the word minus the comma.
        if word.endswith(","):
            new_text += word[:-1]
            break
        #capture possessives?
        #if (word.endswith("'s"):
        #    new_text = ""
        #    continue
        new_text += word + " "
        first = False

    new_text = new_text.strip()
    if new_text == "":
        sys.stderr.write("Empty text: \"{0}\" : \"{1}\"".format(
            text, new_text))
        # Bug fix: the original fell through to new_text.split()[-1],
        # which raises IndexError on an empty string. Return "" instead
        # so callers can filter it out.
        return ""
    return new_text.split()[-1]
def add_stats(text, anaphor, doc, nouns, head2text):
    """Accumulate statistics for the noun phrase *text* in *nouns*.

    Computes the head of the phrase, filters out heads that should not
    be tracked (percents, numbers, conjunctions, reflexives), then
    updates the per-text Nominal counters: grammatical role of the
    mention and distance/role/form statistics of its closest antecedent.

    NOTE(review): this definition is shadowed by the six-argument
    add_stats() defined later in this file, so only the later one is
    reachable at runtime.

    text      -- cleaned NP text (str)
    anaphor   -- annotation exposing getStart()/getEnd()/getText()
    doc       -- document exposing nps, closest_antecedent(),
                 sentence_distance(), getName()
    nouns     -- dict: NP text -> Nominal stats object (mutated)
    head2text -- dict: head noun -> list of NP texts (mutated;
                 presumably a defaultdict(list) -- missing heads are
                 indexed directly, TODO confirm at the call site)
    """
    head = getHead(text)
    # Heads that are never tracked.
    if head.endswith("%"):
        return  #skip percents
    if head[-1].isdigit():
        return  #skip numbers
    if utils.isConj(head):
        return  #just skip these guys too
    if head == "himself":
        return
        #NOTE for some reason, the filter doesn't
        #catch this, must be happening after head
        #noun is created.
    if head == "themselves":
        return

    # NP annotation for this mention; carries the "GRAMMAR" role.
    anaphor_np = doc.nps.getAnnotBySpan(anaphor.getStart(),
                                        anaphor.getEnd())

    #update the head2text dict
    if text not in head2text[head]:
        head2text[head].append(text)
    #make sure the head nouns are reasonable
    #print "{0} => {1}".format(text, head)

    #then look for thangs
    if text not in list(nouns.keys()):
        nouns[text] = Nominal(text)
        nouns[text].updateDocs(doc.getName())
    else:
        nouns[text].updateCount()
        nouns[text].updateDocs(doc.getName())

    # Grammatical role of the mention itself.
    if anaphor_np["GRAMMAR"] == "SUBJECT":
        nouns[text].subj += 1
    elif anaphor_np["GRAMMAR"] == "OBJECT":
        nouns[text].dobj += 1

    # Closest-antecedent statistics; None means this mention starts a chain.
    antecedent = doc.closest_antecedent(anaphor)
    if antecedent is not None:
        #record stats
        sd = doc.sentence_distance(antecedent, anaphor)
        nouns[text].sentence_distance(sd)
        nouns[text].most_recent_antecedents.append(
            antecedent.getText().lower())
        antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(),
                                               antecedent.getEnd())
        # Grammatical role of the antecedent.
        if antecedent_np["GRAMMAR"] == "SUBJECT":
            nouns[text].subj_ante += 1
        elif antecedent_np["GRAMMAR"] == "OBJECT":
            nouns[text].dobj_ante += 1
        # Exact (case-insensitive) string match between the two mentions.
        if antecedent.getText().lower() == anaphor.getText().lower():
            nouns[text].string_matches += 1
        # Surface form of the antecedent: proper / nominal / pronoun.
        if specificity_utils.isProper(antecedent_np):
            nouns[text].prp_ante += 1
        elif specificity_utils.isNominal(antecedent_np):
            nouns[text].nom_ante += 1
        elif specificity_utils.isPronoun(antecedent_np):
            nouns[text].pro_ante += 1
    else:
        #this guy starts the chain
        nouns[text].starts_chain += 1
def add_stats(text, head, anaphor, doc, nouns, head2text):
    """Accumulate mention and gold-chain statistics for the NP *text*.

    Extended version of add_stats that takes a pre-computed *head* and,
    in addition to the mention/antecedent counters, records gold-chain
    statistics: chain membership, chain size, sentence coverage, whether
    the chain contains only common nouns, and whether it is the largest
    chain in the document.

    text      -- cleaned NP text (str)
    head      -- head noun of *text* (str)
    anaphor   -- annotation exposing getStart()/getEnd()/getText()
    doc       -- document exposing nps, gold_chains, sentences,
                 closest_antecedent(), sentence_distance(), getName()
    nouns     -- dict: NP text -> VirtualPronoun stats object (mutated)
    head2text -- dict: head noun -> list of NP texts (mutated;
                 presumably a defaultdict(list) -- TODO confirm)
    """
    #catches a problem with the following report
    if head == 'the':
        head = text.split()[-1]
    # Heads that are never tracked.
    if head.endswith("%"):
        return  #skip percents
    if head[-1].isdigit():
        return  #skip numbers
    if utils.isConj(head):
        return  #just skip these guys too
    if head == "himself":
        return
        #NOTE for some reason, the filter doesn't
        #catch this, must be happening after head
        #noun is created.
    if head == "themselves":
        return
    if head == "head":
        return
    if head == "where":
        return
    if head == "there":
        return
    if head == "here":
        return

    # NP annotation for this mention; carries the "GRAMMAR" role.
    anaphor_np = doc.nps.getAnnotBySpan(anaphor.getStart(),
                                        anaphor.getEnd())

    #update the head2text dict
    if text not in head2text[head]:
        head2text[head].append(text)
    #make sure the head nouns are reasonable
    #print "{0} => {1}".format(text, head)

    #then look for thangs
    if text not in list(nouns.keys()):
        nouns[text] = VirtualPronoun(text)
        nouns[text].updateDocs(doc.getName())
    else:
        nouns[text].updateCount()
        nouns[text].updateDocs(doc.getName())

    # Grammatical role of the mention itself.
    if anaphor_np["GRAMMAR"] == "SUBJECT":
        nouns[text].subj += 1
    elif anaphor_np["GRAMMAR"] == "OBJECT":
        nouns[text].dobj += 1

    #begin modifier code
    # NOTE(review): indefinite1/indefinite2 are only referenced by the
    # disabled code below; only the bare-definite check is live.
    definite = "the {0}".format(head)
    indefinite1 = "a {0}".format(head)
    indefinite2 = "an {0}".format(head)
    #pos = reconcile.getPOS(doc.getName())
    #head_index = specificity_utils.getHeadIndex(anaphor_np, head)
    #np_pos = pos.getSubset(anaphor.getStart(), anaphor.getEnd())
    #np_words = text.split()
    if text.startswith(definite):
        nouns[text].bare_definite += 1
    #elif text.startswith(indefinite1) or text.startswith(indefinite2):
        #nouns[text].indefinite += 1
    #else:
        ##NOTE: just checking to see if there is some kind of modification now
        #if len(np_pos) == len(np_words):
            ##sys.stderr.write("Mismatch tag and word length: {0} => {1}\n".format(np_pos.getList(), np_words))
            #for i in range(0, head_index):
                #if np_pos[i]["TAG"] == "DT":
                    #continue
                #elif np_pos[i]["TAG"] == "JJ":
                    ##print "Adjective: {0}".format(np_words[i])
                    #nouns[text].adjective_modifiers.append(np_words[i])
                #elif np_pos[i]["TAG"].startswith("N"):
                    ##print "Noun: {0} {1}".format(np_words[i], np_pos[i]["TAG"])
                    #if np_pos[i]["TAG"].startswith("NNP"):
                        #nouns[text].proper_modifiers.append(np_words[i])
                    #else:
                        #nouns[text].common_modifiers.append(np_words[i])
                #else:
                    ##print "?: {0}".format(np_words[i])
                    #nouns[text].other_modifiers.append(np_words[i])
    #if text.startswith("the "):
        #get parts of speech for the np:
    #else:
        ##not definite, but still modified
        #if len(np_pos) == len(np_words):
            ##sys.stderr.write("Mismatch tag and word length: {0} => {1}\n".format(np_pos.getList(), np_words))
            #continue
        #for i in range(0, head_index):
            #if np_pos[i]["TAG"] == "DT":
                #continue
            #elif np_pos[i]["TAG"] == "JJ":
                ##print "Adjective: {0}".format(np_words[i])
                #nouns[text].adjective_modifiers.append(np_words[i])
            #elif np_pos[i]["TAG"].startswith("N"):
                ##print "Noun: {0} {1}".format(np_words[i], np_pos[i]["TAG"])
                #if np_pos[i]["TAG"].startswith("NNP"):
                    #nouns[text].proper_modifiers.append(np_words[i])
                #else:
                    #nouns[text].common_modifiers.append(np_words[i])
            #else:
                ##print "?: {0}".format(np_words[i])
                #nouns[text].other_modifiers.append(np_words[i])
    #capture post modifiers
    #if text.find(head + " of ") > -1:
        #of_start = text.find(head + " of ")
        #of_object = text[len(head) + of_start + 3:]
        #nouns[text].of_attachments.append(of_object.strip())
    #if text.find(head + " on ") > -1:
        #of_start = text.find(head + " on ")
        #of_object = text[len(head) + of_start + 3:]
        #nouns[text].on_attachments.append(of_object.strip())
    #if text.find(head + " that ") > -1:
        #that_start = text.find(head + " that ")
        #that_clause = text[len(head) + that_start+5:]
        #nouns[text].that_attachments.append(that_clause.strip())
    #if text.find(head + " with ") > -1:
        #that_start = text.find(head + " with ")
        #that_clause = text[len(head) + that_start+5:]
        #nouns[text].with_attachments.append(that_clause.strip())
    #if text.find(head + " by ") > -1:
        #by_start = text.find(head + " by ")
        #by_object = text[len(head) + by_start+3:]
        #nouns[text].by_attachments.append(by_object.strip())
    #if text.find(head + " which ") > -1:
        #which_start = text.find(head + " which ")
        #which_clause = text[len(head) + which_start+6:]
        #nouns[text].which_attachments.append(which_clause.strip())
    #if len(np_pos) >= head_index+2 and len(np_words) >= head_index+2:
        #if np_pos[head_index+1]["TAG"] == "VBD":
            #nouns[text].verbed.append(np_words[head_index+1])
        #if np_pos[head_index+1]["TAG"] == "VBG":
            #nouns[text].verbing.append(np_words[head_index+1])
    #end modifier code

    #find which chain the anaphor is from and add the chain statistics
    anaphor_chain = None
    for chain in list(doc.gold_chains.keys()):
        for mention in doc.gold_chains[chain]:
            if anaphor == mention:
                anaphor_chain = chain
                break

    # NOTE(review): chain_name is built (and possibly recorded) even when
    # anaphor_chain is still None, yielding e.g. "doc:None".
    chain_name = "{0}:{1}".format(doc.getName(), anaphor_chain)
    if chain_name not in nouns[text].chains:
        nouns[text].chains.append(chain_name)

    if anaphor_chain is not None:
        chain_length = len(doc.gold_chains[anaphor_chain])
        nouns[text].chain_size[doc.getName()] = chain_length

        #coverage
        #chain_start = doc.gold_chains[chain][0].getStart()
        #chain_end = doc.gold_chains[chain][-1].getEnd()
        #chain_size = chain_end - chain_start
        #chain_coverage = float(chain_size) / len(doc.text)

        # number of sentences touched / number of sentences
        covered_sentences = 0
        for sent in doc.sentences:
            for mention in doc.gold_chains[anaphor_chain]:
                if sent.contains(mention):
                    covered_sentences += 1
                    break  # count each sentence at most once
        chain_coverage = float(covered_sentences) / len(doc.sentences)
        nouns[text].chain_coverage[doc.getName()] = chain_coverage

        # for/else: largest_chain is bumped only when NO other chain is
        # strictly longer (the else runs iff the loop never breaks).
        for chain in list(doc.gold_chains.keys()):
            if chain == anaphor_chain:
                continue
            if len(doc.gold_chains[chain]) > chain_length:
                break
        else:
            nouns[text].largest_chain += 1

        # Does the chain consist of common nouns only (ignoring the
        # anaphor itself)? Also collect the other mentions' heads.
        common_only = True
        for mention in doc.gold_chains[anaphor_chain]:
            if mention == anaphor:
                continue
            mention_head = getHead(utils.textClean(mention.getText()))
            if mention_head not in nouns[text].all_entities:
                nouns[text].all_entities.append(mention_head)
            #does this chain contain proper names?
            mention_np = doc.nps.getAnnotBySpan(mention.getStart(),
                                                mention.getEnd())
            if specificity_utils.isProper(mention_np):
                common_only = False
        if chain_name not in list(nouns[text].nom_chain_only.keys()):
            nouns[text].nom_chain_only[chain_name] = common_only
    else:
        sys.stderr.write("Anaphor chain not found?\n")

    # Closest-antecedent statistics; None means this mention starts a chain.
    antecedent = doc.closest_antecedent(anaphor)
    if antecedent is not None:
        #record stats
        sd = doc.sentence_distance(antecedent, anaphor)
        nouns[text].sentence_distance(sd)
        nouns[text].most_recent_antecedents.append(
            antecedent.getText().lower())
        antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(),
                                               antecedent.getEnd())
        # Grammatical role of the antecedent.
        if antecedent_np["GRAMMAR"] == "SUBJECT":
            nouns[text].subj_ante += 1
        elif antecedent_np["GRAMMAR"] == "OBJECT":
            nouns[text].dobj_ante += 1
        # Exact (case-insensitive) string match between the two mentions.
        if antecedent.getText().lower() == anaphor.getText().lower():
            nouns[text].string_matches += 1
        # Surface form of the antecedent: proper / nominal / pronoun.
        if specificity_utils.isProper(antecedent_np):
            nouns[text].prp_ante += 1
        elif specificity_utils.isNominal(antecedent_np):
            nouns[text].nom_ante += 1
        elif specificity_utils.isPronoun(antecedent_np):
            nouns[text].pro_ante += 1
    else:
        #this guy starts the chain
        nouns[text].starts_chain += 1
sys.stderr.flush() sys.stderr.write("\r") prog = ProgressBar(len(files)) j = 0 for f in files: prog.update_time(j) sys.stderr.write("\r%s" % (str(prog))) sys.stderr.flush() j += 1 nps = reconcile.getNPs(f) pos = reconcile.getPOS(f) for np in nps: if specificity_utils.isNominal(np): np_text = utils.textClean(np.getText()).lower() if utils.isConj(np_text): continue np_head = specificity_utils.getHead(np_text).lower() head_index = specificity_utils.getHeadIndex(np, np_head) np_pos = pos.getSubset(np.getStart(), np.getEnd()) np_words = np_text.split() #print "{0:35} -> {1:15}".format(np_text, np_head) if np_head not in list(head2nouns.keys()): head2nouns[np_head] = Noun(np_head) head2nouns[np_head].docs.append(f) else: head2nouns[np_head].count += 1 head2nouns[np_head].docs.append(f) if np["GRAMMAR"] == "SUBJECT":