help="Use all heuristics.", action="store_true", dest="all", default=False) (options, args) = parser.parse_args() if len(sys.argv) < 2: parser.print_help() sys.exit(1) if options.verbose: VERBOSE = True if options.filelist is not None: fList = open(options.filelist, 'r') total_start_time = time.time() for f in fList: if f.startswith("#"): continue f = f.strip() start_time = time.time() print("Processing document: %s" % f) d = Document(f) num_pairs = process_doc(d, options) end_time = time.time() print("process time: %0.3f seconds :: %d pairs added" % ((end_time - start_time, num_pairs))) total_end_time = time.time() print("Total process time: %0.3f seconds" % ((total_end_time - total_start_time)))
head2text = defaultdict(list)
#sys.stdout.flush()
#sys.stdout.write("\r")
#prog = ProgressBar(len(files))
i = 0
for f in files:
    if f.startswith("#"):
        continue
    #prog.update_time(i)
    #sys.stdout.write("\r%s" % (str(prog)))
    #sys.stdout.flush()
    i += 1
    f = f.strip()
    doc = Document(f)
    gold_nps = reconcile.getNPs(f)
    gold_chains = reconcile.getGoldChains(f)
    doc.addGoldChains(gold_chains)
    for np in gold_nps:
        text = utils.textClean(np.getText().lower()).strip()
        if TRUE_PRONOUNS:
            if text in TRUE:
                add_stats(text, np, doc, nouns, head2text)
        else:
            if specificity_utils.isNominal(np):
                #head = getHead(text)
                #if head.endswith("%"): continue   #skip percents
                #if head[-1].isdigit(): continue   #skip numbers
                #if utils.isConj(head): continue   #just skip these guys too
it = {}
third_person_plural = {}

sys.stdout.flush()
sys.stdout.write("\r")
prog = ProgressBar(len(files))
i = 0
for f in files:
    if f.startswith("#"):
        continue
    f = f.strip()
    prog.update_time(i)
    sys.stdout.write("\r%s" % (str(prog)))
    sys.stdout.flush()
    i += 1
    doc = Document(f)
    #NOTE: still assuming that gold mentions are being supplied via
    #Reconcile.
    gold_nps = reconcile.getNPs(f)
    gold_chains = reconcile.getGoldChains(f)
    doc.addGoldChains(gold_chains)
    for np in gold_nps:
        text = utils.textClean(np.getText().lower()).strip()
        if text in data.THIRD_PERSON:
            #then it is he, him, she
            add_stats(third_person, doc, np, text)
        elif (text in data.IT) and (text != "i"):
            #then we have 'it' or 'its'
            add_stats(it, doc, np, text)
        elif text in data.THIRD_PERSON_PLURAL:
#the master dictionary of text to nominals
text2nominal = {}
#something easy to pickle
noun2antecedents = defaultdict(dict)

fileList = open(options.filelist, 'r')
#lines that start with # are ignored
files = [x for x in fileList.readlines() if not x.startswith("#")]
fileList.close()

for f in files:
    f = f.strip()
    print("Working on document: %s" % f)
    #load in the document statistics
    d = Document(f)
    #the total number of sentences in this text file.
    #?double check with nltk?
    total_sentences_doc = len(reconcile.getSentences(f))
    #process a document, get all the nominal stats that are requested.
    gold_chains = reconcile.getGoldChains(f, True)
    d.addGoldChains(gold_chains)
    for gc in list(gold_chains.keys()):
        base_antecedent = True
        previous_semantic_tag = ""
        prev_sent = -1
        prev_tile = -1
        prev_type = ""
files.extend([x for x in fileList.readlines() if not x.startswith("#")])

sys.stdout.flush()
sys.stdout.write("\r")
prog = ProgressBar(len(files))
i = 0
nominals = {}
for f in files:
    if f.startswith("#"):
        continue
    f = f.strip()
    prog.update_time(i)
    sys.stdout.write("\r%s" % (str(prog)))
    sys.stdout.flush()
    i += 1
    doc = Document(f)
    #NOTE: still assuming that gold mentions are being supplied via
    #Reconcile.
    gold_nps = reconcile.getNPs(f)
    gold_chains = reconcile.getGoldChains(f)
    doc.addGoldChains(gold_chains)
    for np in gold_nps:
        text = utils.textClean(np.getText().lower()).strip()
        if text in data.ALL_PRONOUNS:
            continue
        #if specificity_utils.isProper(np):
        #    continue
        anaphor_np = gold_nps.getAnnotBySpan(np.getStart(), np.getEnd())
        if anaphor_np["PROPER_NAME"] != "true" and anaphor_np["PROPER_NOUN"] != "true":
def gold_annotations(f):
    """Process the file with gold annotations."""
    global virtual_pronouns, total_counts, virtual_pronoun_heads, \
        nominal_base_antecedent, distance_from_antecedent

    doc = Document(f)
    gold_chains = reconcile.getGoldChains(f)

    #adding in Sundance nes.
    nes = reconcile.getNEs(f, True)
    add_reconcile_semantic_class(gold_chains, nes)

    #adding in Reconcile pos too.
    pos = reconcile.getPOS(f, True)

    #getting the doc's nps
    reconcile_nps = reconcile.getNPs_annots(f)

    #getting sundance nps
    sundance_nps = reconcile.getSundanceNPs(f)
    add_sundance_nps(gold_chains, sundance_nps)

    original_text_heads = {}            # just getting the heads
    original_text = defaultdict(list)   # for getting total doc counts later.
    nominal2chains = defaultdict(list)  # the chains in which a given nominal appears.

    for chain in list(gold_chains.keys()):
        base_antecedent = True
        prev_annot = None
        antecedents = 0
        for mention in gold_chains[chain]:
            #if this is the first antecedent in a chain, do not list it as anaphoric.
            if base_antecedent:
                if mention.getATTR("is_nominal") and not \
                        mention.getATTR("GOLD_SINGLETON"):
                    text = mention.getText()
                    text_lower = mention.getATTR("TEXT_CLEAN").lower()
                    docs_appeared[text_lower].append(f)
                    nominal_base_antecedent[text_lower] = \
                        nominal_base_antecedent.get(text_lower, 0) + 1
                    original_text[text_lower].append(text)

                    #take note that this chain contained this nominal
                    nominal2chains[text_lower].append(chain)

                    #take note of the gold semantic class
                    gold_semantic_class[text_lower].append(
                        mention.getATTR("GOLD_SEMANTIC"))

                    #reconcile's semantic class
                    reconcile_semantic_class[text_lower].append(
                        mention.getATTR("NE_CLASS"))

                    #sundance's semantic class
                    sun_semantic_class[text_lower].append(
                        mention.getATTR("SUN_SEMANTIC"))

                    number_gold_antecedents[text_lower].append(antecedents)

                    #get verb stats
                    if mention.getATTR("ROLE") == "SUBJ":
                        verb = reconcile.getSubjVerb(mention, pos)
                        if verb is not None:
                            subj_verbs[text_lower].append(verb.lower())
                    elif mention.getATTR("ROLE") == "DOBJ":
                        verb = reconcile.getObjVerb(mention, pos)
                        if verb is not None:
                            obj_verbs[text_lower].append(verb.lower())

                base_antecedent = False
                prev_annot = mention
                antecedents += 1
                continue

            if mention.getATTR("is_nominal"):
                text = mention.getText()
                text_lower = mention.getATTR("TEXT_CLEAN").lower()
                head_text = mention.getATTR("HEAD_TEXT")
                original_text[text_lower].append(text)
                virtual_pronouns[text_lower] = \
                    virtual_pronouns.get(text_lower, 0) + 1
                virtual_pronoun_heads[head_text.lower()] = \
                    virtual_pronoun_heads.get(head_text.lower(), 0) + 1

                #the semantic class Reconcile puts this in.
                reconcile_semantic_class[text_lower].append(
                    mention.getATTR("NE_CLASS"))

                #register this doc as containing this np.
                docs_appeared[text_lower].append(f)

                #take note that this chain contained this nominal
                nominal2chains[text_lower].append(chain)

                #take note of the gold semantic class
                gold_semantic_class[text_lower].append(
                    mention.getATTR("GOLD_SEMANTIC"))

                #the number of possible correct antecedents for this anaphor
                number_gold_antecedents[text_lower].append(antecedents)

                #sundance's semantic class
                sun_semantic_class[text_lower].append(
                    mention.getATTR("SUN_SEMANTIC"))

                #subject/object verb statistics
                if mention.getATTR("ROLE") == "SUBJ":
                    verb = reconcile.getSubjVerb(mention, pos)
                    if verb is not None:
                        subj_verbs[text_lower].append(verb.lower())
                elif mention.getATTR("ROLE") == "DOBJ":
                    verb = reconcile.getObjVerb(mention, pos)
                    if verb is not None:
                        obj_verbs[text_lower].append(verb.lower())

                #get the sentence distance between these two mentions.
                mention_sent = reconcile.getAnnotSentence(f, mention)
                prev_sent = reconcile.getAnnotSentence(f, prev_annot)
                if mention_sent > -1 and prev_sent > -1:
                    distance_from_antecedent[text_lower].append(mention_sent - prev_sent)

                #get the TextTiling segment distance for the two mentions
                mention_seg = doc.getAnnotTile(mention)
                prev_seg = doc.getAnnotTile(prev_annot)
                if mention_seg > -1 and prev_seg > -1:
                    focus_distance[text_lower].append(mention_seg - prev_seg)

                #getting the distribution of closest antecedent types for a
                #given nominal
                if prev_annot.getATTR("is_nominal"):
                    nominals2type[text_lower]["nominal"] = \
                        nominals2type[text_lower].get("nominal", 0) + 1
                elif prev_annot.getATTR("is_pronoun"):
                    nominals2type[text_lower]["pronoun"] = \
                        nominals2type[text_lower].get("pronoun", 0) + 1
                else:
                    nominals2type[text_lower]["proper"] = \
                        nominals2type[text_lower].get("proper", 0) + 1

            prev_annot = mention
            antecedents += 1

    #for key in nominal2chains.keys():
    #    print "%d : %s (doc: %s)" % (len(list(set(nominal2chains[key]))),
    #                                 key, doc)

    #update the total counts.
    for key in list(original_text.keys()):
        for text in list(set(original_text[key])):
            total_counts[key] = total_counts.get(key, 0) + doc.getWordCounts(text)

    #the head counts
    for key in list(virtual_pronoun_heads.keys()):
        total_counts_heads[key] = total_counts_heads.get(key, 0) + \
            doc.getWordCounts(key)
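
#A usage sketch, not part of the original file: after every document has been
#run through gold_annotations(), the module-level tallies it fills can be
#summarized.  Only the dictionary names come from the code above; the helper
#name report_nominal_stats and the report format are assumptions.
def report_nominal_stats():
    for text in sorted(virtual_pronouns, key=virtual_pronouns.get, reverse=True):
        dists = distance_from_antecedent.get(text, [])
        avg_dist = sum(dists) / float(len(dists)) if dists else 0.0
        print("%-30s anaphoric: %4d  chain-initial: %4d  docs: %3d  "
              "avg sentence distance: %.2f" % (text,
                                               virtual_pronouns.get(text, 0),
                                               nominal_base_antecedent.get(text, 0),
                                               len(set(docs_appeared.get(text, []))),
                                               avg_dist))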
"proper": {} } sys.stdout.flush() sys.stdout.write("\r") prog = ProgressBar(len(files)) i = 0 for f in files: if f.startswith("#"): continue f = f.strip() prog.update_time(i) sys.stdout.write("\r%s" % (str(prog))) sys.stdout.flush() i += 1 doc = Document(f) #NOTE: still assuming that gold mentions are being supplied via #Reconcile. gold_nps = reconcile.getNPs(f) gold_chains = reconcile.getGoldChains(f) doc.addGoldChains(gold_chains) for np in gold_nps: text = utils.textClean(np.getText().lower()).strip() if text in data.THIRD_PERSON: #then it is he, him, she add_stats(noun_classes["third_person"], doc, np, text) elif (text in data.IT) and (text != "i"): #then we have 'it' or 'its' add_stats(noun_classes["it"], doc, np, text)