def __init__(self, clusterNum, dataDir, lemmaDir, outputDir, printStats=False):
    """Parse all ECB+ XML files of one cluster and emit its gold-truth file.

    For every '*plus.xml' document under dataDir/<clusterNum>/ this reads the
    <token> list, the <Markables> (mentions) and the <Relations>
    (CROSS_DOC_COREF chains), builds Document/Mention objects, merges in the
    lemma/POS annotations from the StanfordNLP-parsed files under lemmaDir,
    and writes '<clusterNum>.txt' to outputDir (one line per (ref, mention)).

    Args:
        clusterNum: numeric id of the ECB+ cluster to parse.
        dataDir:    root directory holding one sub-directory per cluster.
        lemmaDir:   root directory of the lemma/POS files (one per doc).
        outputDir:  directory the gold-truth file(s) are written to.
        printStats: if True, also write mentions-per-reference statistics
                    to outputDir/clusterMentions/<clusterNum>.txt.
    """
    self.clusterNum = clusterNum
    # ref_id -> [Mention, ...]; spans all docs in the cluster
    self.references = defaultdict(list)
    # doc_id -> Document
    self.docs = defaultdict(Document)
    # m_id -> [ref_id, ...]; kept only to later spot 'singleton' Mentions
    # (ones not encompassed by any Ref)
    self.mentions = defaultdict(list)
    self.headDoc = str(clusterNum) + "_1ecbplus.xml"

    # data directories
    self.dataDir = dataDir + str(clusterNum) + '/'
    self.lemmaDir = lemmaDir + str(clusterNum) + '/ecbplus/'
    self.outputDir = outputDir

    self.numPairs = 0
    self.numMentions = 0
    self.mentionsList = []  # "doc_id;m_id" strings; TODO: only used for debugging

    # iterates through each ECB+ file in the cluster's directory
    for f in glob(self.dataDir + '*plus.xml'):
        doc_id = f[f.rfind("/") + 1:]
        doc = Document(doc_id)
        tokenIDs = defaultdict(str)  # t_id -> surface token text

        # reads the whole file onto one line so the regexes can span lines
        with open(f, "r") as myfile:
            fileContents = myfile.read().replace('\n', ' ')

        # reads <token> elements
        it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
        for match in it:
            t_id = int(match.group(1))
            tokenIDs[t_id] = match.group(3)

        # reads <Markables>: every markable element becomes a Mention
        regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
        markables = fileContents[fileContents.find("<Markables>") + 11:fileContents.find("</Markables>")]
        it = tuple(re.finditer(regex, markables))
        for match in it:
            # ACTION_* element names mark event (predicate) mentions
            isPred = "ACTION" in match.group(1)
            m_id = int(match.group(2))

            # gathers the anchored token ids and builds the mention's text
            regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
            it2 = tuple(re.finditer(regex2, match.group(3)))
            curTokenIDs = []
            tokens = []
            for match2 in it2:
                tokenID = int(match2.group(1))
                curTokenIDs.append(tokenID)
                tokens.append(str(tokenIDs[tokenID]))
            text = " ".join(tokens).rstrip()

            # constructs the Mention and adds it to the Doc now;
            # ref info is attached below when <Relations> is read
            mention = Mention(m_id, text, curTokenIDs, isPred, doc_id)
            doc.addMention(mention)

        # reads <Relations>: CROSS_DOC_COREF chains link mentions to refs
        relations = fileContents[fileContents.find("<Relations>"):fileContents.find("</Relations>")]
        regex = r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)?</.*?>"
        it = tuple(re.finditer(regex, relations))
        for match in it:
            ref_id = match.group(1)
            regex2 = r"<source m_id=\"(\d+)\".*?/>"
            it2 = tuple(re.finditer(regex2, match.group(2)))
            for match2 in it2:
                m_id = int(match2.group(1))
                doc.mentions[m_id].addReference(ref_id)
                # a mention must never appear twice in the same ref chain
                if doc.mentions[m_id] in self.references[ref_id]:
                    print("** we already have the mention added to the ref!")
                    exit(1)
                self.references[ref_id].append(doc.mentions[m_id])
            # adds the current ref_id to the Doc
            doc.addRef(ref_id)

        self.docs[doc_id] = doc  # stores the Doc object locally

        # merges in the lemmas from Sergio's StanfordNLP-parsed files
        # (NOTE(review): original left this handle unclosed; now scoped)
        with open(self.lemmaDir + doc_id, 'r') as f_lemma:
            fileContents = f_lemma.read().replace('\n', ' ')
        lemmaContent = fileContents[fileContents.find("<lemmas>") + 8:fileContents.find("</lemmas>")]
        regex = r"<span m\_id=\"(.+?)/(.+?)\".+?pos=\"(.+?)\">(.+?)</span>"
        it = tuple(re.finditer(regex, lemmaContent))
        for match in it:
            filename = match.group(1)
            cur_m_id = int(match.group(2))
            posTags = match.group(3).split()
            lemma = match.group(4)
            # mutates the stored Mention in place (shared object reference)
            curDoc = self.docs[filename]
            curDoc.mentions[cur_m_id].addLemma(lemma)
            curDoc.mentions[cur_m_id].addPOSTags(posTags)

    # counts every ordered mention pair (within- and cross-doc) in the cluster
    for d1 in self.docs.keys():
        doc1 = self.docs[d1]
        for m1 in doc1.mentionsList:
            for d2 in self.docs.keys():
                for m2 in self.docs[d2].mentionsList:
                    if m1 != m2:
                        self.numPairs += 1

    # writes the golden-truth file (1 per cluster; cat'ed together later)
    with open(self.outputDir + str(self.clusterNum) + ".txt", 'w') as fout:
        for r in self.references.keys():
            for m in self.references[r]:
                self.numMentions += 1
                self.mentionsList.append(str(m.doc_id) + ";" + str(m.m_id))  # TODO: only used for debugging
                fout.write(str(self.clusterNum) + ";" + str(r) + ";" + str(m.doc_id) + ";" + str(m.m_id) + ";" + str(m.text.lower()) + ";" + str(m.lemma.lower()) + "\n")
    print(str(self.clusterNum) + " -> " + str(self.numMentions) + " mentions")

    if printStats:
        # collects stats: how many Mentions exist per Reference?
        # e.g. 1 mention per Ref happens 13 times, 2 per Ref happens 4 times...
        with open(self.outputDir + 'clusterMentions/' + str(self.clusterNum) + ".txt", 'w') as fout:
            self.numMentionCounts = defaultdict(int)
            for r in self.references.keys():
                self.numMentionCounts[len(self.references[r])] += 1
            sorted_x = sorted(self.numMentionCounts.items(), key=operator.itemgetter(0))
            fout.write("# Mentions per Ref, # times this occurred\n")
            fout.write("------------------------------\n")
            for i in sorted_x:
                fout.write(str(i[0]) + "," + str(i[1]) + "\n")
            fout.write("\n")
            fout.write("-------------------------- REFS NOT CONTAINED IN HEAD DOC --------------------------------\n")
            for r in self.references.keys():
                fout.write("\t " + r + " (" + str(len(self.references[r])) + " mentions):\n")
                for m in self.references[r]:
                    fout.write("\t\t" + m.text + " (doc " + str(m.doc_id) + "; m_id: " + str(m.m_id) + "; lemma: " + m.lemma + ")\n")