def mergeIntoTermCorpus(self):
    corpus_writer = matutils.MmWriter('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName))
    corpus_writer.write_headers(-1, -1, -1)

    num_nnz = 0
    poslast = 0
    offsets = []
    write_index = 0
    totalLen = 0

    corporaDict = MyDict()
    for i in range(self.numberOfProcesses):
        corpus = corpora.MmCorpus('%s/%s/%s_%i.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName, i))
        #corporaList.append(corpus)
        # corporaDict value: (current termId, current index in corpus, corpus, corpus length)
        if len(corpus) > 0:
            termId = [id for (id, sim) in corpus[0] if sim == 1.0][0]
            corporaDict[i] = (termId, 0, corpus, len(corpus))
            totalLen += len(corpus)

    while 1:
        isDone = False
        for corpusId in corporaDict.keys():
            # Read all values for the current corpus from the MyDict.
            termId, index, corpus, len_corpus = corporaDict[corpusId]
            if termId == write_index:
                # We are writing to the merged corpus at index 'write_index'.
                # Write it if it coincides with the column id of the current corpus.
                # Determine offsets for the index file, allowing O(1) access time of documents.
                posnow = corpus_writer.fout.tell()
                if posnow == poslast:
                    offsets[-1] = -1
                offsets.append(posnow)
                poslast = posnow

                # Write the current document
                max_id, veclen = corpus_writer.write_vector(write_index, corpus[index])
                num_nnz += veclen

                # Update values
                write_index += 1  # Update the write index of the merged corpus
                index += 1        # Update the index of the current corpus
                if index == len_corpus:
                    # Reached the end of the current corpus. Set values to -1
                    # so no more documents will be grabbed from this corpus.
                    corporaDict[corpusId] = (-1, -1, corpus, len_corpus)
                else:
                    # Grab the next column id :: TODO -- CAN THIS BE DONE MORE EFFICIENTLY?
                    termId = [id for (id, sim) in corpus[index] if sim == 1.0][0]
                    # Update the MyDict with the new values of the current corpus
                    corporaDict[corpusId] = (termId, index, corpus, len_corpus)
            if write_index == totalLen:
                # All corpora have been fully read; exit the while loop.
                isDone = True
        if isDone:
            break

    corpus_writer.fake_headers(totalLen, totalLen, num_nnz)
    corpus_writer.close()

    # Write the index to file
    index_fname = corpus_writer.fname + '.index'
    utils.pickle(offsets, index_fname)
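# A minimal sketch (illustrative path, not from the project) of reading the merged
# corpus back: gensim's MmCorpus picks up the pickled offsets in '<fname>.index'
# automatically, which is what enables the O(1) document access mentioned above.
#
#   from gensim import corpora
#
#   merged = corpora.MmCorpus('data/save/termCorpus.mm')  # hypothetical location
#   print len(merged)   # total number of documents written by mergeIntoTermCorpus
#   print merged[42]    # random access via the offsets index, no linear scan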
for row in reader1:
    year = row[0]
    authors = ','.join([normaliseName(name) for name in row[1].split(',') if len(normaliseName(name))])
    title = row[2]
    writer.writerow([year, authors, title, '', '', '', ''])
g.close()
print

soFarSoGood = set()

# "Paulo R. F. Cunha": "Paulo Cunha"
# "Neil Maiden": "Neil A. M. Maiden"
# Strip middle initials, exact match on all other name parts
uselessData = MyDict()
# for each name in the DBLP data
for key in reverseLookup.keys():
    # record a version of the name without initials
    s = " ".join([p.lower() for p in key.split() if len(p) > 1 and p.find('.') == -1])
    uselessData[key] = s
# then for each of the unknowns
for name in sorted(unknowns):
    longParts = [p.lower() for p in name.split() if len(p) > 1 and p.find('.') == -1]
    # if the name contains at least two parts of sufficient length
    if len(longParts) > 1:
        # check against each of the DBLP names
        for key in reverseLookup.keys():
            # retrieve the version without initials
            s = uselessData[key]
            # check that the name starts and ends with the same parts
            if s.startswith(longParts[0]) and s.endswith(" %s" % longParts[-1]):
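# MyDict is a project helper that is not shown in these excerpts. The sketch below
# is an assumption about the interface the surrounding code relies on (plain dict
# access, an append() that accumulates values per key, and pickle-backed
# save()/load-from-path persistence); it is not the project's actual implementation.
import cPickle

class MyDict(dict):
    def __init__(self, path=None):
        dict.__init__(self)
        if path is not None:
            # load a previously saved dictionary, e.g. MyDict('lookup.dict')
            with open(path, 'rb') as f:
                self.update(cPickle.load(f))

    def append(self, key, value):
        # accumulate values per key, e.g. reverseLookup.append(columnIdx, idx)
        self.setdefault(key, []).append(value)

    def save(self, path):
        with open(path, 'wb') as f:
            cPickle.dump(dict(self), f, cPickle.HIGHEST_PROTOCOL)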
def run(self):
    ## Decode parameters
    levThr = self.parameters["levenshteinSimRatio"]   # threshold for Levenshtein similarity
    minLen = self.parameters["minLength"]             # threshold for min length of a term
    simThr = self.parameters["cosineSimRatio"]        # threshold for cosine similarity between documents
    k = self.parameters["rankReductionRatio"]         # dimensionality reduction ratio (0 < k <= 1)

    # dataIndices = self.nameEmailData.keys()

    """I will iteratively build and populate the document-term matrix."""
    runningColIdx = 0
    runningRowIdx = 0

    reverseLookup = MyDict()
    directLookup = MyDict()
    documents = MyDict()           # Dictionary of documents (key = column index)
    terms = MyDict()               # Dictionary of terms (key = row index)
    allTerms = set()               # Set of all terms
    termToColumnIndex = MyDict()
    termToRowIndex = MyDict()

    start = clock()
    emailToColumnIndex = MyDict()
    for idx in self.nameEmailData.keys():
        addEmailPrefix = True  # default value (always add email prefix to document string, unless ..)
        (name, email) = self.nameEmailData[idx]  # the input data contains (name, email) tuples

        # Easy merge: different names used by the same email address.
        # Exceptions: mailing list email addresses.
        if email not in self.emailExceptions:
            # If the email address is personal, then:
            # - If I have seen it before, retrieve the column (document) to which it corresponds.
            # - If I see it for the first time, assign it a new column (document).
            try:
                columnIdx = emailToColumnIndex[email]
            except:
                columnIdx = runningColIdx
                emailToColumnIndex[email] = columnIdx
                runningColIdx += 1
        else:
            # If the email address is not personal, directly assign it a new column (document).
            columnIdx = runningColIdx
            runningColIdx += 1
            addEmailPrefix = False  # remember not to add the email prefix to the document string

        # Create lookup tables to move from the input data to the doc-term matrix and back.
        directLookup[idx] = columnIdx         # from index in the input data to index of document (column) in doc-term matrix
        reverseLookup.append(columnIdx, idx)  # from document index (in doc-term matrix) to index in the input data

        # Add name parts to the document string if long enough and not consisting of digits, after normalization.
        docString = [namePart for namePart in normalize(name) if len(namePart) >= minLen and not namePart.isdigit()]
        if addEmailPrefix:
            # Add the whole email address prefix to the document string, if long enough and after normalization.
            prefix = email.split('@')[0].split('+')[0]
            docString += [prefixPart for prefixPart in normalize(prefix) if len(prefixPart) >= minLen and not prefixPart.isdigit()]
            # Split the email address prefix on dots, and add each part if long enough and after normalization.
            prefixParts = prefix.split('.')
            for prefixPart in prefixParts:
                docString += [p for p in normalize(prefixPart) if len(p) >= minLen and not p.isdigit()]

        # The document string is a set of terms. Figure out to which rows in the doc-term matrix the terms correspond.
        for term in docString:
            try:
                # If I've seen the term before, retrieve its row index.
                rowIdx = termToRowIndex[term]
            except:
                # If this is the first time I see the term, assign it a new row.
                rowIdx = runningRowIdx
                termToRowIndex[term] = rowIdx
                runningRowIdx += 1
            # Keep track of which term is at which row index.
            terms[rowIdx] = term

        # Update the document at the current column index with the new document string.
        try:
            documents[columnIdx].update(docString)
        except:
            documents[columnIdx] = set()  # if here then it's the first occurrence of this email address (document)
            documents[columnIdx].update(docString)

        # Keep an updated set of terms (from all documents).
        allTerms.update(docString)

        # Bookkeeping: keep track of the documents in which a term appears.
        for term in set(docString):
            try:
                termToColumnIndex[term].add(columnIdx)
            except:
                termToColumnIndex[term] = set([columnIdx])
    end = clock()

    numDocuments = len(documents.keys())
    numTerms = len(allTerms)
    if self.progressLogging:
        print "Initial pass: %6.3f seconds (%d terms, %d documents)" % ((end - start), numTerms, numDocuments)

    # Make sure that dataDir/saveDataDir exists.
    if not os.path.exists('%s/%s' % (self.dataDir, self.dataSaveDir)):
        os.makedirs('%s/%s' % (self.dataDir, self.dataSaveDir))
    # Make sure that dataDir/saveDataDir/dataProcessingDir exists.
    if not os.path.exists('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir)):
        os.makedirs('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir))
    if self.runProfiler:
        # Make sure that dataDir/saveDataDir/profilerOutputDir exists.
        if not os.path.exists('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir)):
            os.makedirs('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))

    # Save the documents, directLookup and reverseLookup MyDicts for later reference.
    documents.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.documentsFileName))
    directLookup.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.directLookupFileName))
    reverseLookup.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.reverseLookupFileName))

    # Build the dictionary and corpus on disk from the documents.
    texts = [documents[key] for key in documents.keys()]
    dictionary = corpora.Dictionary(texts)  # Dictionary: {'nalley': 5649, 'francesco': 4634, 'caisse': 3381, 'gabrielle': 1097, ...} :: (term: id)
    dictionary.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.dictionaryFileName))  # Save dictionary
    corpus = [dictionary.doc2bow(text) for text in texts]  # Corpus: [[(0, 1), (1, 1), (2, 1)], [(3, 1), (4, 1)], [(5, 1), (6, 1)], ...] :: (id, count)
    corpora.MmCorpus.serialize('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.originalCorpusFileName), corpus)

    # Precompute Levenshtein distances between all terms on all CPU cores.
    if self.progressLogging:
        print 'Starting computation of levenshtein...'
    start = clock()
    if self.runProfiler:
        cProfile.runctx('self.precomputeLevenshteinDistanceBetweenAllTerms(numTerms, dictionary, levThr)', globals(), locals(),
                        '%s/%s/%s/computeTermCorpusChunks' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
    else:
        self.precomputeLevenshteinDistanceBetweenAllTerms(numTerms, dictionary, levThr)
    end = clock()
    if self.progressLogging:
        print 'Done computing levenshtein: %6.3f seconds' % (end - start)

    # Merge the separate MmCorpus files.
    start = clock()
    if self.runProfiler:
        cProfile.runctx('self.mergeIntoTermCorpus()', globals(), locals(),
                        '%s/%s/%s/mergeIntoTermCorpus' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
    else:
        self.mergeIntoTermCorpus()
    end = clock()
    if self.progressLogging:
        print 'Done merging corpora to termCorpus: %6.3f seconds' % (end - start)

    # Create levCorpus from corpus.mm and termCorpus.mm.
    start = clock()
    if self.runProfiler:
        cProfile.runctx('self.createLevenshteinCorpus()', globals(), locals(),
                        '%s/%s/%s/createLevenshteinCorpus' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
    else:
        self.createLevenshteinCorpus()
    end = clock()
    if self.progressLogging:
        print 'Done creating levCorpus from corpus and termCorpus: %6.3f seconds' % (end - start)

    corpus_disk = corpora.MmCorpus('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.levenshteinCorpusFileName))

    """Having the document-term matrix (including Levenshtein distance) in the corpus, we apply the TFIDF model."""
    start = clock()
    tfidf = models.TfidfModel(corpus_disk)
    tfidf.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.tfidfModelFileName))
    corpus_tfidf = tfidf[corpus_disk]
    end = clock()
    if self.progressLogging:
        print "Inverse document frequency: %6.3f seconds" % (end - start)

    """Next up is applying the LSI model on top of the TFIDF model."""
    start = clock()
    number_topics = len(corpus_disk) * k
    try:
        # Occasionally computing the SVD fails.
        # http://projects.scipy.org/numpy/ticket/990
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=number_topics)
    except:
        print 'Failed to compute the LSI model. (SVD did not converge?)'
        return
    lsi.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.lsiModelFileName))
    corpus_lsi = lsi[corpus_tfidf]
    end = clock()
    if self.progressLogging:
        print "LSI model: %6.3f seconds" % (end - start)

    """Finally, we run the cosine similarity on the matrix."""
    start = clock()
    index = similarities.docsim.Similarity('%s/%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir, self.similarityIndexFileName),
                                           corpus_lsi, number_topics)
    index.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.similarityIndexFileName))
    end = clock()
    if self.progressLogging:
        print "Similarities index: %6.3f seconds" % (end - start)

    """Output results."""
    start = clock()
    if self.runProfiler:
        cProfile.runctx('self.writeResults(index, reverseLookup, documents, simThr)', globals(), locals(),
                        '%s/%s/%s/writeResults' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
    else:
        self.writeResults(index, reverseLookup, documents, simThr)
    end = clock()
    if self.progressLogging:
        print "Writing results: %6.3f seconds" % (end - start)

    return
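# run() chains gensim's TF-IDF -> LSI -> cosine-similarity stages. The toy snippet
# below (made-up documents, an in-memory MatrixSimilarity instead of the on-disk
# Similarity shard used above) shows the shape of that pipeline; names and values
# are illustrative only.
#
#   from gensim import corpora, models, similarities
#
#   texts = [['john', 'doe', 'jdoe'], ['jon', 'doe'], ['jane', 'roe']]
#   dictionary = corpora.Dictionary(texts)
#   bow = [dictionary.doc2bow(text) for text in texts]
#
#   tfidf = models.TfidfModel(bow)
#   lsi = models.LsiModel(tfidf[bow], id2word=dictionary, num_topics=2)
#
#   index = similarities.MatrixSimilarity(lsi[tfidf[bow]], num_features=2)
#   for docno, sims in enumerate(index):
#       print docno, sims  # cosine similarities of document docno against all documents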
        try:
            idx = int(row[0])
            name = row[1]
            email = unspam(row[2])
            nameEmailData[idx] = (name, email)
        except:
            print row
    f.close()
    print 'Using the aliases2.csv data set...'

elif dataset == 'gnome':
    import email.utils
    import email.header

    emailAddressToUniqueNames = MyDict(os.path.join(dataPath, 'emailAddressToUniqueNamesBlacklisted.dict'))
    i = 0
    emailAddresses = [emailAddress for emailAddress in emailAddressToUniqueNames.keys() if len(emailAddress) > 1]
    for emailAddress in emailAddresses:
        #print emailAddressToUniqueNames[emailAddress]
        decodedNames = [decodeHeader(uniqueName) for uniqueName in emailAddressToUniqueNames[emailAddress]]
        decodedNames = [decodedName for decodedName in decodedNames if len(decodedName) > 1]
        for name in decodedNames:
            nameEmailData[i] = (name, emailAddress)
            i += 1
    print 'Using the GNOME Mailing List data set...'

elif dataset == 'icsm':
    itIdx = 8
    f = open(os.path.join(dataPath, 'icsmData', 'training_%d.csv' % itIdx), 'rb')
    reader = UnicodeReader(f)
    idx = 0
    for row in reader:
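# decodeHeader() is not shown in these excerpts; the 'gnome' branch above suggests it
# turns RFC 2047-encoded mail header values into unicode names. One plausible
# implementation, offered as an assumption rather than the project's actual code:
import email.header

def decodeHeader(value):
    parts = email.header.decode_header(value)
    return u' '.join(text.decode(charset) if charset else text.decode('latin-1')
                     for text, charset in parts)

# decodeHeader('=?utf-8?q?Andr=C3=A9_Klapper?=') -> u'Andr\xe9 Klapper'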
try:
    aid = reverseLookup[name]
except:
    unknowns.add(name)

# Start name matching between conference PC and DBLP aliases
g = open(os.path.abspath("../../../data/temp/map_%s.csv" % conference), "wb")
writer = UnicodeWriter(g)

soFarSoGood = set()

# "Paulo R. F. Cunha": "Paulo Cunha"
# "Neil Maiden": "Neil A. M. Maiden"
# Strip middle initials, exact match on all other name parts
uselessData = MyDict()
# for each name in the DBLP data
for key in reverseLookup.keys():
    # record a version of the name without initials
    s = " ".join([p.lower() for p in key.split() if len(p) > 1 and p.find('.') == -1])
    uselessData[key] = s
# then for each of the unknowns
for name in sorted(unknowns):
    longParts = [p.lower() for p in name.split() if len(p) > 1 and p.find('.') == -1]
    # if the name contains at least two parts of sufficient length
    if len(longParts) > 1:
        # check against each of the DBLP names
        for key in reverseLookup.keys():
            # retrieve the version without initials
            s = uselessData[key]
            # check that the name starts and ends with the same parts
            if s.startswith(longParts[0]) and s.endswith(" %s" % longParts[-1]):
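# A tiny worked example of the matching rule above, with hypothetical names: the DBLP
# name keeps only lower-cased parts longer than one character and without dots, then
# must start with the unknown's first long part and end with its last long part.
def strip_initials(name):
    return " ".join([p.lower() for p in name.split() if len(p) > 1 and p.find('.') == -1])

s = strip_initials("Neil A. M. Maiden")             # -> "neil maiden"
longParts = strip_initials("Neil Maiden").split()   # -> ["neil", "maiden"]
print s.startswith(longParts[0]) and s.endswith(" %s" % longParts[-1])  # True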