def getACECommonNouns(f):
    """Collect the non-singleton nominal NPs of one document.

    Each NP returned by ``reconcile.getNPs(f)`` is matched, by its start and
    end offsets, to an annotation from ``reconcile.parseGoldAnnots(f)``. The
    NP is kept when that gold annotation has a falsy ``"GOLD_SINGLETON"``
    value and a truthy ``"is_nominal"`` value.

    Args:
        f: document handle passed unchanged to the ``reconcile`` loaders
            (presumably a document directory path -- confirm with callers).

    Returns:
        An ``AnnotationSet`` named ``"common_nouns"`` containing the kept NPs.
    """
    doc_nps = reconcile.getNPs(f)
    gold_annots = reconcile.parseGoldAnnots(f)
    selected = AnnotationSet("common_nouns")

    for candidate in doc_nps:
        gold = gold_annots.getAnnotBySpan(candidate.getStart(),
                                          candidate.getEnd())
        # Skip singletons and anything the gold annotation does not mark
        # as nominal.
        if gold["GOLD_SINGLETON"] or not gold["is_nominal"]:
            continue
        selected.add(candidate)

    return selected
def processACE(f, head2qp):
    """Update ``head2qp`` with statistics for the NPs of one ACE document.

    For every NP from ``reconcile.getNPs(f)`` the matching gold annotation is
    looked up by span. Depending on the module-level ``PRONOUNS`` flag, only
    pronouns (per ``qp_utils.isPronoun``) or only NPs whose gold annotation is
    marked ``"is_nominal"`` are processed; everything else is skipped.

    Args:
        f: document handle passed unchanged to the ``reconcile`` loaders
            (presumably a document directory path -- confirm with callers).
        head2qp: mapping from head string to ``QuasiPronoun``; mutated in
            place.

    Returns:
        None. All results are recorded on the ``QuasiPronoun`` entries.
    """
    ace_annots = reconcile.parseGoldAnnots(f)
    nps = reconcile.getNPs(f)
    stanford_deps = reconcile.getStanfordDep(f)
    gold_chains = reconcile.getGoldChains(f)

    for np in nps:
        ace_np = ace_annots.getAnnotBySpan(np.getStart(), np.getEnd())

        if PRONOUNS:
            if not qp_utils.isPronoun(np):
                continue
            head = ace_np["HEAD"].lower()
            text = np.getText()
        else:
            if not ace_np["is_nominal"]:
                continue
            head = utils.textClean(ace_np["HEAD"].strip().lower())
            text = utils.textClean(np.getText())

        # bookkeeping
        # Membership is tested on the mapping itself rather than on a freshly
        # built list of its keys, which cost a linear scan per NP.
        if head not in head2qp:
            head2qp[head] = QuasiPronoun(head)
        else:
            # NOTE(review): a newly created entry is not passed through
            # updateDocs/updateCount here, whereas process() does call both
            # for new entries -- confirm this difference is intended.
            head2qp[head].updateDocs(f)
            head2qp[head].updateCount()

        if ace_np["GOLD_SINGLETON"]:
            head2qp[head].singleton += 1
            # A singleton whose text starts with an indefinite article.
            if text.startswith(("a ", "an ")):
                head2qp[head].faux_ba += 1
        elif USE_GOLD:
            # does it start the chain?
            process_gold(f, np, head, text, head2qp, gold_chains)

        # NOTE(review): assumed to run for every processed NP, as in
        # process() -- confirm against the original indentation.
        process_syntax(f, np, head, text, head2qp, stanford_deps)
heads.extend(list(map(string.strip, headFile.readlines()))) files = [] with open(sys.argv[2], 'r') as inFile: files.extend([x for x in inFile.readlines() if not x.startswith("#")]) prog = ProgressBar(len(files)) heads2nps = defaultdict(list) i = 0 for f in files: prog.update_time(i) i += 1 sys.stderr.write("\r%s" % (str(prog))) sys.stderr.flush() f = f.strip() nps = reconcile.getNPs(f) for np in nps: if specificity_utils.isNominal(np) or \ specificity_utils.isPronoun(np) or \ np["DATE"] != "NONE": head = specificity_utils.getHead( utils.textClean(np.getText()).lower()) if head.find(" and ") > -1: continue if head in heads: heads2nps[head].append(utils.textClean(np.getText())) sys.stderr.write("\r \r\n") for head in list(heads2nps.keys()): counts = {} definite = False
elif d == 3: return "CCFFFF" elif d == 4: return "66FFFF" else: return "FFFF00" return "FFFF00" if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: %s <file-dir>" % (sys.argv[0])) sys.exit(1) #NOTE: assumes that the nps file will only have gold_nps in it. gold_nps = reconcile.getNPs(sys.argv[1]) labels = {} buf = {} rawText = "" with open(sys.argv[1] + "/raw.txt", 'r') as txtFile: rawText = ''.join(txtFile.readlines()) for np in gold_nps: #print np.ppprint() key = "{0}:{1}".format(np.getStart(), np.getEnd()) #may need to get head for a lot of NPs. text = utils.textClean(np.getText().lower()) if text in data.ALL_PRONOUNS: #if text in ("it","its","they","them","their"): # labels[key] = 0
print("Processing documents...") for f in files: d += 1 #progress bar updates sys.stdout.write("\r") prog.update_time(d) sys.stdout.write("\r%s" % (str(prog))) sys.stdout.flush() #print "Processing document %d/%d" % (d, len(files)) gold_chains = reconcile.getGoldChains(sys.argv[1] + "/" + f) total_coref_chains += len(list(gold_chains.keys())) with open(sys.argv[1] + "/" + f + "/raw.txt", 'r') as raw_txt_file: raw_txt = ''.join(raw_txt_file.readlines()) nps = reconcile.getNPs(sys.argv[1] + "/" + f) nes = reconcile.getNEs(sys.argv[1] + "/" + f) sun_nes = reconcile.getSundanceNEs(sys.argv[1] + "/" + f) features = feature_utils.getFeatures(sys.argv[1] + "/" + f, fdir) total_feature_pairs += len(list(features.keys())) doc_stats = {} doc_stats = process_raw_txt(raw_txt, sent_tokenizer) doc_stats["common2common"] = 0 doc_stats["common2proper"] = 0 doc_stats["common2pronoun"] = 0 doc_stats["proper2common"] = 0 doc_stats["proper2proper"] = 0 doc_stats["proper2pronoun"] = 0 doc_stats["pronoun2common"] = 0 doc_stats["pronoun2proper"] = 0
def process(f, head2qp, annotated_file):
    """Update ``head2qp`` with statistics for the NPs of one document.

    When ``annotated_file`` is truthy the NPs come from
    ``reconcile.getNPs(f)``; otherwise they come from
    ``reconcile.getStanfordNPs(f)``. Depending on the module-level
    ``PRONOUNS`` flag, only pronouns or only nominals are processed; every
    other NP is skipped. Each processed NP is recorded on its head's
    ``QuasiPronoun`` entry and then handed to ``process_syntax``. In the
    annotated case it is also handed to ``process_gold`` when ``USE_GOLD``
    is set.

    Args:
        f: document handle passed unchanged to the ``reconcile`` loaders
            (presumably a document directory path -- confirm with callers).
        head2qp: mapping from head string to ``QuasiPronoun``; mutated in
            place.
        annotated_file: selects the NP source as described above. The
            matching literal ``True``/``False`` is forwarded to
            ``updateDocs`` and ``updateCount``.

    Returns:
        None. All results are recorded on the ``QuasiPronoun`` entries.
    """
    stanford_deps = reconcile.getStanfordDep(f)
    pos = reconcile.getPOS(f)

    if annotated_file:
        # Gold chains depend only on ``f``, so they are loaded at most once
        # per call (and only if actually needed) instead of once per NP.
        # NOTE(review): assumes process_gold does not need a fresh copy for
        # each NP; processACE already shares one object across all NPs.
        gold_chains = None

        for np in reconcile.getNPs(f):
            if PRONOUNS:
                if not qp_utils.isPronoun(np):
                    continue
                head = np.getText().lower()
                text = np.getText()
            else:
                if not qp_utils.isNominal(np, pos):
                    continue
                text = utils.textClean(np.getText())
                np_tags = pos.getSubset(np.getStart(), np.getEnd())
                head = utils.textClean(
                    qp_utils.getHead2(text.lower(), np_tags))

            # bookkeeping
            # The two branches call updateCount/updateDocs in different
            # orders; that order is kept as-is in case the methods are
            # order-sensitive.
            if head not in head2qp:
                head2qp[head] = QuasiPronoun(head)
                head2qp[head].updateCount(True)
                head2qp[head].updateDocs(f, True)
            else:
                head2qp[head].updateDocs(f, True)
                head2qp[head].updateCount(True)

            if USE_GOLD:
                if gold_chains is None:
                    gold_chains = reconcile.getGoldChains(f)
                process_gold(f, np, head, text, head2qp, gold_chains)

            process_syntax(f, np, head, text, head2qp, stanford_deps)
    else:
        for np in reconcile.getStanfordNPs(f):
            if PRONOUNS:
                if not np["is_pronoun"]:
                    continue
                head = np.getText().lower()
                text = np.getText()
            else:
                # skip some problems with the parser or numbers
                np_head = np["HEAD"]
                if (np_head.startswith("$") or np_head.endswith("%")
                        or np_head == "."):
                    continue
                if not np["is_nominal"]:
                    continue
                text = utils.textClean(np.getText())
                head = np_head.lower()

            # bookkeeping
            if head not in head2qp:
                head2qp[head] = QuasiPronoun(head)
            head2qp[head].updateDocs(f, False)
            head2qp[head].updateCount(False)

            process_syntax(f, np, head, text, head2qp, stanford_deps)