for head in heads: head2nouns[head] = Noun(head) #cycle over all the files #sys.stdout.flush() #sys.stdout.write("\r") #prog = ProgressBar(len(files)) i = 0 for f in files: #prog.update_time(i) #sys.stdout.write("\r%s" % (str(prog))) #sys.stdout.flush() i += 1 #read in the nps nps = reconcile.getNPs(f) sentences = reconcile.getSentences(f) #see which nps correspond to these heads for np in nps: np_text = utils.textClean(np.getText()) np_head = specificity_utils.getHead(np_text) if np_head in heads: #print "{0:35} => {1}".format(np_text, np_head) head2nouns[np_head].addDoc(f) head2nouns[np_head].addText(np_text) head2nouns[np_head].count += 1 head2nouns[np_head].addDefinite(np) if np["GRAMMAR"] == "SUBJECT": head2nouns[np_head].subj += 1
# Apply option-driven mode switches, then load all annotations for the
# target directory and strip out date NPs (and optionally hard pronouns).
# NOTE(review): script-level code — relies on module-level options,
# reconcile, and utils being in scope.
if options.stats:
    # stats mode suppresses evaluation and verbose output
    options.evaluate = False
    options.verbose = False
    print()
if options.all:
    options.heuristics = 8
if options.vverbose:
    # very-verbose implies verbose
    options.verbose = True

# load annotated NPs, pronouns, sentences, and PERSON named entities
nps = reconcile.getNPs_annots(options.directory)
pronouns = reconcile.getPronouns(options.directory, "ALL")
possessive_pronouns = reconcile.getPronouns(options.directory, "POSSESSIVE")
sents = reconcile.getSentences(options.directory)
nes = reconcile.getNEsByClass(options.directory, "PERSON")
all_pairs = []
counts = {}

#remove dates
nps = utils.remove_dates(nps)
pronouns = utils.remove_dates(pronouns)
possessive_pronouns = utils.remove_dates(possessive_pronouns)

if options.remove_its:
    # BUGFIX: the original filtered `pronouns` twice with the same call;
    # the redundant duplicate statement has been removed.
    nps = utils.remove_hard_pronouns(nps)
    pronouns = utils.remove_hard_pronouns(pronouns)
    possessive_pronouns = utils.remove_hard_pronouns(possessive_pronouns)
def process_gold(f, np, head, text, head2qp, gold_chains):
    """Accumulate gold-chain statistics for one NP into head2qp[head].

    Records: whether the NP has an antecedent and the sentence distance to
    the closest one, whether a bare-definite/plain NP starts its chain,
    whether it looks like an indefinite, and the fraction of the document's
    sentences that its gold chain touches.

    f           -- document identifier, passed to reconcile.getSentences
    np          -- the mention (annotation object) being processed
    head        -- head noun keying into head2qp
    text        -- cleaned surface text of the NP
    head2qp     -- dict of per-head stats objects (mutated in place)
    gold_chains -- dict chain-id -> list of mentions, in document order
    """
    # find the gold chain containing this np
    np_chain = None
    for chain in gold_chains:
        if any(np == mention for mention in gold_chains[chain]):
            np_chain = gold_chains[chain]
            # BUGFIX: the original only broke the inner loop and kept
            # scanning the remaining chains; break out once found.
            break
    if np_chain is None:
        # BUGFIX: the original crashed with a TypeError (iterating None)
        # when the NP belonged to no gold chain; treat it as a no-op.
        return

    # closest antecedent = the chain mention immediately preceding np
    prev = None
    for other in np_chain:
        if np == other:
            break
        prev = other

    sentences = reconcile.getSentences(f)
    if prev is not None:
        head2qp[head].has_antecedent += 1
        anaphor_sent = 0
        antecedent_sent = 0
        # find the sentence index of the anaphor and its antecedent
        for i, sent in enumerate(sentences):
            if sent.contains(np):
                anaphor_sent = i
            if sent.contains(prev):
                antecedent_sent = i
        sentence_distance = abs(anaphor_sent - antecedent_sent)
        #how many antecedents within three sentences of antecedent
        head2qp[head].sent_distances.append(sentence_distance)

    #if it is a bare definite, how many are base antecedents?
    bare_definite = text in ("the " + head, "that " + head, "this " + head,
                             "those " + head, "these " + head)
    if bare_definite:
        if np_chain[0] == np:
            head2qp[head].bdef_starts_chain += 1
    else:
        if np_chain[0] == np:
            head2qp[head].starts_chain += 1

    #is this is indef ba? how well does this track with the total % of
    #indefs?
    if text.startswith(("a ", "an ")):
        head2qp[head].faux_ba += 1

    #TODO how many string matches?
    #TODO how many proper names?
    #TODO diversity of antecedents

    #chain coverage in document [% of document sentences chain touches]
    if sentences:
        covered = sum(1 for sent in sentences
                      if any(sent.contains(m) for m in np_chain))
        chain_coverage = float(covered) / len(sentences)
    else:
        # BUGFIX: the original raised ZeroDivisionError on an empty document
        chain_coverage = 0.0
    head2qp[head].chain_coverage[f] = chain_coverage
# Created By : Nathan Gilbert # import sys import pydot from pyconcile import reconcile if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: %s <response-file>" % (sys.argv[0])) sys.exit(1) dataDir = sys.argv[1][:sys.argv[1].find("/")] responseFile = sys.argv[1][sys.argv[1].find("/"):] clusterer="SingleLink" sentences = reconcile.getSentences(dataDir) gold_chains = reconcile.getGoldChains(dataDir) #get reconcile's edges response_chains = reconcile.getResponseChains(dataDir, responseFile+"/"+clusterer) response_pairs = reconcile.getResponsePairs(dataDir, responseFile, 0.5) response_pairs = reconcile.labelCorrectPairs(gold_chains, response_pairs) #pydot graph graph = pydot.Dot("reconcile_clusters", graph_type='digraph') #add in all the NP #NOTE: as long as we are working with gold mentions, the response and gold #will match. otherwise, will need to switch over to gold nps to see proper
# Per-document pass: read the file list, then for each document collect its
# gold chains and walk every chain mention, tracking chain-local state
# (base antecedent flag, previous semantic tag / sentence / tile / type).
# NOTE(review): this chunk is truncated mid-statement at the end — the inner
# mention loop continues beyond this span.
noun2antecedents = defaultdict(dict)
fileList = open(options.filelist, 'r')
#lines that start with # are ignored
files = [x for x in fileList.readlines() if not x.startswith("#")]
fileList.close()
for f in files:
    f = f.strip()
    print("Working on document: %s" % f)
    #load in the document statistics
    d = Document(f)
    #the total number of sentences in this text file.
    #?double check with nltk?
    total_sentences_doc = len(reconcile.getSentences(f))
    #process a document, get all the nominal stats that are requested.
    gold_chains = reconcile.getGoldChains(f, True)
    d.addGoldChains(gold_chains)
    for gc in list(gold_chains.keys()):
        # reset chain-local tracking state before walking this chain
        base_antecedent = True
        previous_semantic_tag = ""
        prev_sent = -1
        prev_tile = -1
        prev_type = ""
        for mention in gold_chains[gc]:
            if HEADS_ONLY:
                head_clean = ' '.join(map(string.strip, \