def __init__(self, clusterNum, dataDir, lemmaDir, outputDir, printStats=False):
    #print "* Parsing cluster " + str(clusterNum)
    self.clusterNum = clusterNum
    self.references = defaultdict(list) # ref_id -> [m_id1, m_id2, ...] which spans all docs in the cluster
    self.docs = defaultdict(Document)   # doc_id -> Document

    # NOTE: the point of this variable is just to print stats and see how many
    # Mentions are 'singletons' (not encompassed by any Ref)
    self.mentions = defaultdict(list)   # m_id -> [ref_id1, ref_id2, ...] which spans all docs in the cluster

    self.headDoc = str(clusterNum) + "_1ecbplus.xml"
    makeGoldTruth = False

    # data directories
    self.dataDir = dataDir + str(clusterNum) + '/'
    self.lemmaDir = lemmaDir + str(clusterNum) + '/ecbplus/'
    self.outputDir = outputDir # + 'clusterMentions/'

    self.numPairs = 0
    self.numMentions = 0

    # makes the gold truth files
    self.mentionsList = []

    # iterates through each file in the given dir
    for f in glob(self.dataDir + '*plus.xml'):
        #print "file: " + str(f)
        doc_id = f[f.rfind("/") + 1:]
        doc = Document(doc_id)
        tokenIDs = defaultdict(str)

        # gets the contents of the file
        with open(f, "r") as myfile:
            fileContents = myfile.read().replace('\n', ' ')

        # reads <tokens>
        it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
        for match in it:
            t_id = int(match.group(1))
            token = match.group(3)
            tokenIDs[t_id] = token

        # reads <Markables>
        regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
        markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
        it = tuple(re.finditer(regex, markables))
        for match in it:
            isPred = False
            if "ACTION" in match.group(1):
                isPred = True
            m_id = int(match.group(2))

            # gets the token IDs
            regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
            it2 = tuple(re.finditer(regex2, match.group(3)))
            curTokenIDs = []
            text = ""
            for match2 in it2:
                tokenID = int(match2.group(1))
                curTokenIDs.append(tokenID)
                text = text + str(tokenIDs[tokenID]) + " "
            text = text.rstrip()

            # constructs the Mention
            mention = Mention(m_id, text, curTokenIDs, isPred, doc_id)

            # adds to the Doc and stores it (we will update the Doc w/ ref info below)
            doc.addMention(mention)

        # reads <Relations>
        relations = fileContents[fileContents.find("<Relations>"):fileContents.find("</Relations>")]
        regex = r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)?</.*?>"
        it = tuple(re.finditer(regex, relations))
        for match in it:
            ref_id = match.group(1)
            regex2 = r"<source m_id=\"(\d+)\".*?/>"
            it2 = tuple(re.finditer(regex2, match.group(2)))
            for match2 in it2:
                m_id = int(match2.group(1))
                doc.mentions[m_id].addReference(ref_id)
                if doc.mentions[m_id] in self.references[ref_id]:
                    print "** we already have the mention added to the ref!"
                    exit(1)
                else:
                    self.references[ref_id].append(doc.mentions[m_id])

            # adds the current ref_id to the Doc
            doc.addRef(ref_id)

        self.docs[doc_id] = doc # stores the Doc object locally

        # now let's read the lemmas, provided by Sergio's StanfordNLP-parsed files
        f_lemma = open(self.lemmaDir + doc_id, 'r')
        fileContents = f_lemma.read().replace('\n', ' ')
        lemmaContent = fileContents[fileContents.find("<lemmas>")+8:fileContents.find("</lemmas>")]
        regex = r"<span m\_id=\"(.+?)/(.+?)\".+?pos=\"(.+?)\">(.+?)</span>" #(.*?)?</.*?>"
        it = tuple(re.finditer(regex, lemmaContent))
        for match in it:
            filename = match.group(1)
            cur_m_id = match.group(2)
            pos = match.group(3)
            lemma = match.group(4)
            posTags = pos.split()

            # adds the lemma to the Mention; hey, pass by reference works!
            curDoc = self.docs[filename]
            curDoc.mentions[int(cur_m_id)].addLemma(lemma)
            curDoc.mentions[int(cur_m_id)].addPOSTags(posTags)

    # counts how many possible mention-pair combinations we have per cluster
    for d1 in self.docs.keys():
        doc1 = self.docs[d1]
        #print "doc" + str(d1) + " has " + str(len(doc1.mentionsList))
        for m1 in doc1.mentionsList:
            for d2 in self.docs.keys():
                doc2 = self.docs[d2]
                for m2 in doc2.mentionsList:
                    if m1 != m2:
                        self.numPairs = self.numPairs + 1
    #print "numpairs; " + str(self.numPairs)

    # WRITES THE GOLD TRUTH FILE (1 per cluster; I later need to cat them together)
    fout = open(self.outputDir + str(self.clusterNum) + ".txt", 'w')
    for r in self.references.keys():
        #if r not in self.docs[self.headDoc].refs:
        for m in self.references[r]:
            self.numMentions = self.numMentions + 1
            self.mentionsList.append(str(m.doc_id) + ";" + str(m.m_id)) # TODO: only used for debugging
            fout.write(str(self.clusterNum) + ";" + str(r) + ";" + str(m.doc_id) + ";" + str(m.m_id) + ";" + str(m.text.lower()) + ";" + str(m.lemma.lower()) + "\n")
    fout.close()
    print str(self.clusterNum) + " -> " + str(self.numMentions) + " mentions"

    if printStats:
        # constructs the stats file
        fout = open(self.outputDir + 'clusterMentions/' + str(self.clusterNum) + ".txt", 'w')

        # collects stats: how many Mentions are there per Reference?
        # e.g., 1 mention per Ref happens 13 times
        #       2 mentions per Ref happens 4 times
        #       3 mentions per Ref happens 5 times
        self.numMentionCounts = defaultdict(int)
        for r in self.references.keys():
            count = len(self.references[r])
            self.numMentionCounts[count] = self.numMentionCounts[count] + 1
        sorted_x = sorted(self.numMentionCounts.items(), key=operator.itemgetter(0))
        fout.write("# Mentions per Ref, # times this occurred\n")
        fout.write("------------------------------\n")
        for i in sorted_x:
            fout.write(str(i[0]) + "," + str(i[1]) + "\n")
        fout.write("\n")
        fout.write("-------------------------- REFS NOT CONTAINED IN HEAD DOC --------------------------------\n")
        for r in self.references.keys():
            #if r not in self.docs[self.headDoc].refs:
            fout.write("\t " + r + " (" + str(len(self.references[r])) + " mentions):\n")
            for m in self.references[r]:
                fout.write("\t\t" + m.text + " (doc " + str(m.doc_id) + "; m_id: " + str(m.m_id) + "; lemma: " + m.lemma + ")\n")
        fout.close()
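# NOTE: each line of the per-cluster gold-truth file written by __init__ has the format
#   clusterNum;ref_id;doc_id;m_id;text;lemma
# and the per-cluster files are later concatenated into a single gold file.
# Minimal sketch of reading such a file back (illustrative only; the variable
# names and the example path are assumptions, not part of this class):
#
#   with open(outputDir + "1.txt") as gold:
#       for line in gold:
#           clusterNum, ref_id, doc_id, m_id, text, lemma = line.rstrip("\n").split(";")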
def createSemanticSpaceSimVectors(self, outPickle, outFile, N, W, sliceNum, totalSlices):
    print "* creating semantic space vectors"
    fullWindowSize = W*2 + 1
    outPickleFile = self.outputDir + outPickle
    outputFile = self.outputDir + outFile
    mentionTypes = [] # stores the tokens found within Mentions (non-stopwords and > 1 in length)

    # gets the N most popular words (non-stopwords and > 1 in length)
    print "* gathering most popular " + str(N) + " words"
    sys.stdout.flush()
    wordCounts = defaultdict(int)
    docs = []
    for clusterNum in self.validClusters:
        # iterates through each file in the given dir/cluster
        for f in glob(self.dataDir + str(clusterNum) + '/*plus.xml'):
            doc_id = f[f.rfind("/") + 1:]
            doc = Document(doc_id)
            tokenIDs = defaultdict(str)

            # gets the contents of the file
            with open(f, "r") as myfile:
                fileContents = myfile.read().replace('\n', ' ')

            # reads <tokens>
            it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
            for match in it:
                t_id = int(match.group(1))
                sent_num = int(match.group(2))
                token = match.group(3).lower()
                tokenIDs[t_id] = token
                if sent_num > 0 and token not in self.stopwords and len(token) > 1:
                    wordCounts[token] = wordCounts[token] + 1
                    if token not in mentionTypes:
                        mentionTypes.append(token)

            # reads <Markables>
            regex = r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>"
            markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
            it = tuple(re.finditer(regex, markables))
            for match in it:
                isPred = False
                if "ACTION" in match.group(1):
                    isPred = True
                m_id = int(match.group(2))

                # gets the token IDs
                regex2 = r"<token_anchor t_id=\"(\d+)\".*?/>"
                it2 = tuple(re.finditer(regex2, match.group(3)))
                curTokenIDs = []
                text = ""
                for match2 in it2:
                    tokenID = int(match2.group(1))
                    curTokenIDs.append(tokenID)
                    text = text + str(tokenIDs[tokenID]) + " "
                text = text.rstrip()

                # constructs the Mention
                mention = Mention(m_id, text, curTokenIDs, isPred, doc_id)

                # adds the Mention to the Doc
                doc.addMention(mention)
            docs.append(doc)

    print "* there were " + str(len(docs)) + " unique docs"
    sys.stdout.flush()

    # puts the top N words into 'commonTypes'
    sorted_wordCounts = sorted(wordCounts.items(), key=operator.itemgetter(1), reverse=True)
    commonTypes = [x[0] for x in sorted_wordCounts][0:N]
    print "# unique mention tokens: " + str(len(mentionTypes))

    # goes through all docs again, this time doing the sliding window
    # in order to calculate PMI1 and PMI2, where
    #   PMI1 = freq(p,c) / (freq(p) * freq(c))
    #   PMI2 = log(prob(p,c) / (prob(p) * prob(c)))
    mentionCounts = defaultdict(int)
    commonWordsCounts = defaultdict(int)
    mentionAndCommonCounts = defaultdict(int)
    print "* calculating PMI counts for all Mentions across all clusters of docs"
    for clusterNum in self.validClusters:
        # iterates through each file in the given dir/cluster
        for f in glob(self.dataDir + str(clusterNum) + '/*plus.xml'):
            #docTokens = []
            mentionLocations = defaultdict(list)
            commonWordsLocations = defaultdict(list)

            # gets the contents of the file
            with open(f, "r") as myfile:
                fileContents = myfile.read().replace('\n', ' ')

            # reads <tokens>
            it = tuple(re.finditer(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\".*?>(.*?)</(.*?)>", fileContents))
            i = 1
            for match in it:
                sent_num = int(match.group(2))
                token = match.group(3).lower()
                if sent_num > 0 and token not in self.stopwords and len(token) > 1:
                    #docTokens.append(token)
                    if token in commonTypes:
                        commonWordsLocations[token].append(i)
                        commonWordsCounts[token] = commonWordsCounts[token] + 1
                    if token in mentionTypes:
                        mentionLocations[token].append(i)
                        mentionCounts[token] = mentionCounts[token] + 1
                i = i + 1
            #print "Mentions: " + str(mentionLocations)
            #print "commons: " + str(commonWordsLocations)

            # looks at every Mention to see if a common word appeared within W tokens on either side of it
            for m in mentionLocations.keys():
                for l in mentionLocations[m]:
                    lower = l - W
                    upper = l + W
                    for c in commonWordsLocations.keys():
                        if c != m:
                            for l2 in commonWordsLocations[c]:
                                if l2 >= lower and l2 <= upper:
                                    mentionAndCommonCounts[(m,c)] = mentionAndCommonCounts[(m,c)] + 1

    # removes singletons
    # sorted_m = sorted(mentionCounts.items(), key=operator.itemgetter(1), reverse=True)
    # single = 0
    # for m in sorted_m:
    #     print str(m[0]) + " -> " + str(m[1])
    #     if m[1] < 3:
    #         single = single + 1
    #         mentionCounts.pop(m[0], "None")
    # print "singletons: " + str(single) + " out of " + str(len(sorted_m))

    # constructs the pmi vector for each Mention token
    print "* creating vector for " + str(len(mentionCounts.keys())) + " Mention types"
    sys.stdout.flush()
    vectors = {}
    sliceSize = 1 + int(math.floor(float(len(mentionCounts.keys()))) / float(totalSlices))
    print "slice size: " + str(sliceSize)
    lower = (sliceNum-1) * sliceSize
    if sliceNum == totalSlices:
        upper = len(mentionCounts.keys()) - 1
    else:
        upper = sliceNum*sliceSize - 1
    print "SLICENUM: " + str(sliceNum) + ": " + str(lower) + "," + str(upper)
    sys.stdout.flush()
    i = 0
    for m in mentionCounts.keys():
        if i >= lower and i <= upper:
            print "mention " + str(i) + " across " + str(len(commonWordsCounts.keys())) + " commonWordsCounts"
            sys.stdout.flush()
            vec = []
            for c in commonWordsCounts.keys():
                if (m,c) in mentionAndCommonCounts.keys():
                    vec.append(float(mentionAndCommonCounts[(m,c)]) / (float(commonWordsCounts[c]) * float(mentionCounts[m])))
                else:
                    vec.append(0)
            vectors[m] = vec
        i = i + 1

    # SAVES VECTORS TO A PICKLE (SERIALIZED) FILE
    print "finished all Mention types' vectors; now writing them to disk"
    fileObj = open(outPickleFile, "wb")
    pickle.dump(vectors, fileObj)
    fileObj.close()

    print "finished! now writing the stats file (the cosine sim. between every pair of Mention types)"

    # WRITES STATS FILE
    ftrain = open(outputFile, 'w')

    # calculates the cosine sim. between every possible pair of vectors
    for v1 in vectors.keys():
        simScores = {}
        vec1 = vectors[v1]
        denom1 = 0
        for i in range(len(vec1)):
            denom1 = denom1 + math.pow(vec1[i], 2)
        denom1 = math.sqrt(denom1)
        for v2 in vectors.keys():
            if v1 != v2:
                vec2 = vectors[v2]
                num = 0
                for i in range(len(vec1)):
                    num = num + (float(vec1[i]) * float(vec2[i]))
                denom2 = 0
                for i in range(len(vec2)):
                    denom2 = denom2 + math.pow(vec2[i], 2)
                denom2 = math.sqrt(denom2)
                #print denom1
                #print denom2
                cosine = float(num + 0.0000001) / (0.0000001 + float(denom1) * float(denom2))
                simScores[v2] = cosine
        sorted_scores = sorted(simScores.items(), key=operator.itemgetter(1), reverse=True)
        ftrain.write("* " + str(v1) + ":\n")
        for x in sorted_scores:
            ftrain.write("\t" + str(x[0]) + " (" + str(x[1]) + ")\n")
    ftrain.close()
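# Worked example of the PMI1-style weighting used in createSemanticSpaceSimVectors
# (illustrative numbers only): if a Mention type p co-occurs with a common word c
# 6 times within the +/- W token window, and freq(p) = 20, freq(c) = 30, then the
# vector entry for (p, c) is 6 / (20 * 30) = 0.01.
#
# Hedged sketch of re-loading the pickled vectors and recomputing the same smoothed
# cosine similarity with numpy (numpy and the example keys are assumptions; the
# method above uses plain loops instead):
#
#   import pickle
#   import numpy as np
#   vectors = pickle.load(open(outPickleFile, "rb"))
#   p, c = "police", "arrest"  # hypothetical Mention types
#   v1, v2 = np.array(vectors[p]), np.array(vectors[c])
#   cosine = (np.dot(v1, v2) + 0.0000001) / (0.0000001 + np.linalg.norm(v1) * np.linalg.norm(v2))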