Example #1
	def mergeIntoTermCorpus(self):
		corpus_writer = matutils.MmWriter('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName))
		corpus_writer.write_headers(-1, -1, -1)
		
		num_nnz = 0
		poslast = 0
		offsets = []
		
		write_index = 0
		totalLen = 0
		corporaDict = MyDict()
		for i in range(self.numberOfProcesses):
			corpus = corpora.MmCorpus('%s/%s/%s_%i.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName, i))
			#corporaList.append(corpus)
			# (current termId, current index in corpus, corpus)
			if len(corpus) > 0:
				termId = [id for (id, sim) in corpus[0] if sim == 1.0][0]
				corporaDict[i] = (termId, 0, corpus, len(corpus))
				totalLen += len(corpus)
			
		while 1:
			isDone = False
			for corpusId in corporaDict.keys():
				termId, index, corpus, len_corpus = corporaDict[corpusId] # Read all values for current corpus from MyDict.
				if termId == write_index: # We are writing to the merged corpus at index 'write_index'. Write it if it coincides with the column id of the current corpus.
				
					# Determine offsets for the index file, allowing O(1) access time of documents.
					posnow = corpus_writer.fout.tell()
					if posnow == poslast:
						offsets[-1] = -1
					offsets.append(posnow)
					poslast = posnow
				
					# Write current document
					max_id, veclen = corpus_writer.write_vector(write_index, corpus[index])
					num_nnz += veclen
					
					# Update values
					write_index += 1 #Update the write index of the merged corpus
					index += 1 #Update the index of the current corpus
					if index == len_corpus: #Reached the end of the current corpus. Set values to -1 so no more document will be grabbed from this corpus.
						corporaDict[corpusId] = (-1, -1, corpus, len_corpus) #Set index to -1. Corpus has been fully read.
					else:
						termId = [id for (id, sim) in corpus[index] if sim == 1.0][0] #Grab the next column id :: TODO -- CAN THIS BE DONE MORE EFFICIENTLY?
						corporaDict[corpusId] = (termId, index, corpus, len_corpus) #Update the MyDict with the new values of the current corpus
					
					if write_index == totalLen: # If all corpora have been fully read, exit the while loop.
						isDone = True
						
			if isDone:
				break
		corpus_writer.fake_headers(totalLen, totalLen, num_nnz)
		corpus_writer.close()
		
		# Write index to file
		index_fname = corpus_writer.fname + '.index'
		utils.pickle(offsets, index_fname)
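The pickled offsets mirror the .index file that gensim's MmCorpus.serialize writes, so the merged term corpus can be opened like any other serialized Matrix Market corpus. A minimal usage sketch, assuming gensim is available and the merged termCorpus.mm plus its .index file live at a hypothetical path:

from gensim import corpora

# Hypothetical path; the class above builds it from dataDir/dataSaveDir/termCorpusFileName.
term_corpus = corpora.MmCorpus('data/save/termCorpus.mm')

print(len(term_corpus))   # number of rows (documents) recorded in the headers
print(term_corpus[0])     # O(1) random access, enabled by the pickled .index offsets
for doc in term_corpus:   # streamed iteration also works without the index
    pass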
Example #2
    for row in reader1:
        year = row[0]
        authors = ','.join([normaliseName(name) for name in row[1].split(',') if len(normaliseName(name))])
        title = row[2]
        writer.writerow([year, authors, title, '', '', '', ''])
    g.close()

print
soFarSoGood = set()

# "Paulo R. F. Cunha": "Paulo Cunha"
# "Neil Maiden": "Neil A. M. Maiden"
# Strip middle initials, exact match on all other name parts
uselessData = MyDict()
# for each name in the DBLP data
for key in reverseLookup.keys():
    # record a version of the name without initials
    s = " ".join([p.lower() for p in key.split() if len(p) > 1 and p.find('.') == -1])
    uselessData[key] = s

# then for each of the unknowns
for name in sorted(unknowns):
    longParts = [p.lower() for p in name.split() if len(p) > 1 and p.find('.') == -1]
    # if the name contains at least two parts of sufficient length
    if len(longParts) > 1:
        # check against each of the DBLP names
        for key in reverseLookup.keys():
            # retrieve the version without initials
            s = uselessData[key]
            # check that the name starts and ends with the same parts
            if s.startswith(longParts[0]) and s.endswith(" %s" % longParts[-1]):
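The comments above spell out the heuristic: drop initials and one-character tokens, then require the remaining first and last name parts to match. A self-contained sketch of that idea, with made-up names standing in for the DBLP keys (reverseLookup) and the unknown PC members:

def strip_initials(name):
    # keep only parts longer than one character that contain no dot
    return " ".join(p.lower() for p in name.split() if len(p) > 1 and '.' not in p)

dblp_names = ["Paulo R. F. Cunha", "Neil A. M. Maiden"]   # hypothetical DBLP data
unknowns = ["Paulo Cunha", "Neil Maiden", "J. Smith"]     # hypothetical PC names

stripped = dict((key, strip_initials(key)) for key in dblp_names)

for name in sorted(unknowns):
    long_parts = [p.lower() for p in name.split() if len(p) > 1 and '.' not in p]
    if len(long_parts) > 1:                               # need at least a first and a last name part
        for key, s in stripped.items():
            if s.startswith(long_parts[0]) and s.endswith(" %s" % long_parts[-1]):
                print("%s -> %s" % (name, key))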
Example #3
	def run(self):
		## Decode parameters
		levThr  = self.parameters["levenshteinSimRatio"] # threshold for Levenshtein similarity
		minLen  = self.parameters["minLength"] # threshold for min length of a term
		simThr	= self.parameters["cosineSimRatio"] # threshold for cosine similarity between documents
		k		= self.parameters["rankReductionRatio"] # dimensionality reduction ratio (0 < k <= 1)
		
#		dataIndices = self.nameEmailData.keys()
		
		"""I will iteratively build and populate the document-term matrix."""
		runningColIdx = 0
		runningRowIdx = 0
		
		## Lookup tables between input-data indices and document (column) indices in the doc-term matrix.
		reverseLookup = MyDict()
		directLookup = MyDict()
		
		documents = MyDict() # Dictionary of documents (key = column index)
		terms = MyDict() # Dictionary of terms (key = row index)
		allTerms = set() # Set of all terms
		
		termToColumnIndex = MyDict()
		termToRowIndex = MyDict()
		
		start = clock()
		emailToColumnIndex = MyDict()
		for idx in self.nameEmailData.keys():
			addEmailPrefix = True # default value (always add email prefix to document string, unless ..)
			
			(name, email) = self.nameEmailData[idx] # the input data contains (name,email) tuples
			#Easy merge: different names used by the same email address.
			#Exceptions: mailing list email addresses.
			if email not in self.emailExceptions:
				#If email address is personal, then:
				#- If I have seen it before, retrieve the column (document) to which it corresponds.
				#- If I see it for the first time, assign it a new column (document).
				try:
					columnIdx = emailToColumnIndex[email]
				except KeyError:
					columnIdx = runningColIdx
					emailToColumnIndex[email] = columnIdx
					runningColIdx += 1
			else:
				#If email address is not personal, directly assign it a new column (document).
				columnIdx = runningColIdx
				runningColIdx += 1
				addEmailPrefix = False # remember not to add email prefix to document string
				
			#Create lookup tables to move from input data to doc-term matrix and back.
			directLookup[idx] = columnIdx # from index in the input data to index of document (column) in doc-term matrix
			reverseLookup.append(columnIdx, idx) # from document index (in doc-term matrix) to index in the input data
			
			#Add name parts to document string if long enough and not consisting of digits, after normalization.
			docString = [namePart for namePart in normalize(name) if len(namePart) >= minLen and not namePart.isdigit()]
			
			if addEmailPrefix:
				#Add whole email address prefix to document string, if long enough and after normalization.
				prefix = email.split('@')[0].split('+')[0]
				docString += [prefixPart for prefixPart in normalize(prefix) if len(prefixPart) >= minLen and not prefixPart.isdigit()]
				#Split email address prefix on dot, and add each part if long enough and after normalization.
				prefixParts = prefix.split('.')
				for prefixPart in prefixParts:
					docString += [p for p in normalize(prefixPart) if len(p) >= minLen and not p.isdigit()]
			
			#The document string is a set of terms. Figure out to which rows in the doc-term matrix the terms correspond.
			for term in docString:
				try:
					#If I've seen the term before, retrieve row index.
					rowIdx = termToRowIndex[term]
				except KeyError:
					#If first time I see the term, assign new row.
					rowIdx = runningRowIdx
					termToRowIndex[term] = rowIdx
					runningRowIdx += 1
				#Keep track of which term is at which row index.
				terms[rowIdx] = term
			
			#Update the document at current column index with new document string.
			try:
				documents[columnIdx].update(docString)
			except KeyError:
				documents[columnIdx] = set() # if here then it's the first occurrence of this email address (document)
				documents[columnIdx].update(docString)
				
			#Keep an updated set of terms (from all documents).
			allTerms.update(docString)
			
			#Bookkeeping: keep track of the documents in which a term appears.
			for term in set(docString):
				try:
					termToColumnIndex[term].add(columnIdx)
				except KeyError:
					termToColumnIndex[term] = set([columnIdx])
		end = clock()
				
		numDocuments = len(documents.keys())
		numTerms = len(allTerms)
		if self.progressLogging:
			print "Initial pass: %6.3f seconds (%d terms, %d documents)" % ((end - start), numTerms, numDocuments)
		
		#Make sure that dataDir/saveDataDir exists.
		if not os.path.exists('%s/%s' % (self.dataDir, self.dataSaveDir)):
			os.makedirs('%s/%s' % (self.dataDir, self.dataSaveDir))
		#Make sure that dataDir/saveDataDir/dataProcessingDir exists.
		if not os.path.exists('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir)):
			os.makedirs('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir))
			
		if self.runProfiler:
			#Make sure that dataDir/saveDataDir/profilerOutputDir exists.
			if not os.path.exists('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir)):
				os.makedirs('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
				
		#Save the documents MyDict and directLookup MyDict for later reference
		documents.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.documentsFileName))
		directLookup.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.directLookupFileName))
		reverseLookup.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.reverseLookupFileName))
		
		#Build the dictionary and corpus on disk from the documents.
		texts = [documents[key] for key in documents.keys()]
		dictionary = corpora.Dictionary(texts) # Dictionary: {'nalley': 5649, 'francesco': 4634, 'caisse': 3381, 'gabrielle': 1097, ...} :: (term: id)
		dictionary.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.dictionaryFileName)) # Save dictionary
		corpus = [dictionary.doc2bow(text) for text in texts] # Corpus: [[(0, 1), (1, 1), (2, 1)], [(3, 1), (4, 1)], [(5, 1), (6, 1)], ...] :: (id, count)
		corpora.MmCorpus.serialize('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.originalCorpusFileName), corpus)
		
		# Precompute levenshtein distances between all terms on all cpu cores
		if self.progressLogging:
			print 'Starting computation of levenshtein...'
		start = clock()
		if self.runProfiler:
			cProfile.runctx('self.precomputeLevenshteinDistanceBetweenAllTerms(numTerms, dictionary, levThr)', globals(), locals(), '%s/%s/%s/computeTermCorpusChunks' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
		else:
			self.precomputeLevenshteinDistanceBetweenAllTerms(numTerms, dictionary, levThr)
		end = clock()
		if self.progressLogging:
			print 'Done computing levenshtein: %6.3f seconds' % (end - start)
		
		# Merge separate MmCorpus files
		start = clock()
		if self.runProfiler:
			cProfile.runctx('self.mergeIntoTermCorpus()', globals(), locals(), '%s/%s/%s/mergeIntoTermCorpus' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
		else:
			self.mergeIntoTermCorpus()
		end = clock()
		if self.progressLogging:
			print 'Done merging corpora to termCorpus: %6.3f seconds' % (end - start)
		
		# Create levCorpus from corpus.mm and termCorpus.mm.
		start = clock()
		if self.runProfiler:
			cProfile.runctx('self.createLevenshteinCorpus()', globals(), locals(), '%s/%s/%s/createLevenshteinCorpus' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
		else:
			self.createLevenshteinCorpus()
		end = clock()
		if self.progressLogging:
			print 'Done creating levCorpus from corpus and termCorpus: %6.3f seconds' % (end - start)
		
		corpus_disk = corpora.MmCorpus('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.levenshteinCorpusFileName))
		
		""" Having the document-term matrix (including Levenshtein distance) in the corpus, we apply the TFIDF model. """
		start = clock()
		tfidf = models.TfidfModel(corpus_disk)
		tfidf.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.tfidfModelFileName))
		corpus_tfidf = tfidf[corpus_disk]
		end = clock()
		if self.progressLogging:
			print "Inverse document frequency: %6.3f seconds" % (end - start)
		
		""" Next up is applying the LSI model on top of the TFIDF model. """
		start = clock()
		number_topics = int(len(corpus_disk) * k) # num_topics must be an integer; k is a ratio in (0, 1]
		try:
			#Occasionally computing the SVD fails.
			#http://projects.scipy.org/numpy/ticket/990
			lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=number_topics)
		except:
			print 'Failed to compute the LSI model. (SVD did not converge?)'
			return
		lsi.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.lsiModelFileName))
		corpus_lsi = lsi[corpus_tfidf]
		end = clock()
		if self.progressLogging:
			print "LSI model: %6.3f seconds" % (end - start)
		
		""" Finally, we run the cosine similarity on the matrix. """
		start = clock()
		index = similarities.docsim.Similarity('%s/%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir, self.similarityIndexFileName), corpus_lsi, number_topics)
		index.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.similarityIndexFileName))
		end = clock()
		if self.progressLogging:
			print "Similarities index: %6.3f seconds" % (end - start)
		
		"""Output results."""
		start = clock()
		if self.runProfiler:
			cProfile.runctx('self.writeResults(index, reverseLookup, documents, simThr)', globals(), locals(), '%s/%s/%s/writeResults' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
		else:
			self.writeResults(index, reverseLookup, documents, simThr)
		end = clock()
		if self.progressLogging:
			print "Writing results: %6.3f seconds" % (end - start)
				
		return
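The run() method chains the usual gensim pipeline: Dictionary, bag-of-words corpus, TF-IDF weighting, LSI projection, and a cosine similarity index. A toy in-memory sketch of the same chain, with hypothetical name/email term lists and a fixed num_topics in place of the len(corpus) * k rank reduction used above:

from gensim import corpora, models, similarities

# Hypothetical documents: term sets built from names and email prefixes.
texts = [["john", "doe", "jdoe"],
         ["jon", "doe", "jdoe"],
         ["alice", "smith", "asmith"]]

dictionary = corpora.Dictionary(texts)                 # term -> integer id
corpus = [dictionary.doc2bow(text) for text in texts]  # list of (id, count) vectors

tfidf = models.TfidfModel(corpus)                      # reweight counts by inverse document frequency
corpus_tfidf = tfidf[corpus]

num_topics = 2                                         # arbitrary here; the class derives it from len(corpus) * k
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
corpus_lsi = lsi[corpus_tfidf]

index = similarities.MatrixSimilarity(corpus_lsi, num_features=num_topics)
for sims in index:                                     # cosine similarity of each document against all others
    print(list(enumerate(sims)))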
Example #4
			try:
				idx = int(row[0])
				name = row[1]
				email = unspam(row[2])
				nameEmailData[idx] = (name, email)
			except:
				print row
		f.close()
		print 'Using the aliases2.csv data set...'
	elif dataset == 'gnome':
		import email.utils
		import email.header
	
		emailAddressToUniqueNames = MyDict(os.path.join(dataPath,'emailAddressToUniqueNamesBlacklisted.dict'))
		i = 0
		emailAddresses = [emailAddress for emailAddress in emailAddressToUniqueNames.keys() if len(emailAddress) > 1]
		for emailAddress in emailAddresses:
			#print emailAddressToUniqueNames[emailAddress]
			decodedNames = [decodeHeader(uniqueName) for uniqueName in emailAddressToUniqueNames[emailAddress] ]
			decodedNames = [decodedName for decodedName in decodedNames if len(decodedName) > 1]
			
			for name in decodedNames:
				nameEmailData[i] = (name, emailAddress)
				i += 1
		print 'Using the GNOME Mailing List data set...'
	elif dataset == 'icsm':
		itIdx = 8
		f = open(os.path.join(dataPath, 'icsmData', 'training_%d.csv' % itIdx), 'rb')
		reader = UnicodeReader(f)
		idx = 0
		for row in reader:
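The loop above (truncated in this example) fills nameEmailData with (name, email) tuples keyed by an integer index, skipping malformed rows. A minimal sketch of that pattern using the standard csv module and a hypothetical aliases.csv laid out as id,name,email; UnicodeReader and unspam are project helpers and are left out here:

import csv

nameEmailData = {}
f = open('aliases.csv', 'rb')          # hypothetical input file: id,name,email per row
reader = csv.reader(f)
for row in reader:
    try:
        idx = int(row[0])
        nameEmailData[idx] = (row[1], row[2])
    except (ValueError, IndexError):   # skip malformed rows, as the original does with a bare except
        print(row)
f.close()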
Example #5
        aid = reverseLookup[name]
    except:
        unknowns.add(name)

# Start name matching between conference PC and DBLP aliases
g = open(os.path.abspath("../../../data/temp/map_%s.csv" % conference), "wb")
writer = UnicodeWriter(g)

soFarSoGood = set()

# "Paulo R. F. Cunha": "Paulo Cunha"
# "Neil Maiden": "Neil A. M. Maiden"
# Strip middle initials, exact match on all other name parts
uselessData = MyDict()
# for each name in the DBLP data
for key in reverseLookup.keys():
    # record a version of the name without initials
    s = " ".join([p.lower() for p in key.split() if len(p) > 1 and p.find('.') == -1])
    uselessData[key] = s

# then for each of the unknowns
for name in sorted(unknowns):
    longParts = [p.lower() for p in name.split() if len(p) > 1 and p.find('.') == -1]
    # if the name contains at least two parts of sufficient length
    if len(longParts) > 1:
        # check against each of the DBLP names
        for key in reverseLookup.keys():
            # retrieve the version without initials
            s = uselessData[key]
            # check that the name starts and ends with the same parts
            if s.startswith(longParts[0]) and s.endswith(" %s" % longParts[-1]):
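Before the fuzzy matching, the snippet looks every program-committee name up in reverseLookup and collects the misses in unknowns, then opens a map file for the resolved pairs. A compact sketch of that lookup step with plain dicts, hypothetical data, and the standard csv module in place of UnicodeWriter:

import csv

reverseLookup = {"Neil A. M. Maiden": 17, "Paulo R. F. Cunha": 42}  # hypothetical DBLP name -> author id
pc_names = ["Neil A. M. Maiden", "Jane Doe"]                        # hypothetical PC members

unknowns = set()
g = open('map_example.csv', 'wb')                                   # hypothetical output file
writer = csv.writer(g)
for name in pc_names:
    try:
        aid = reverseLookup[name]                                   # exact hit: record the mapping
        writer.writerow([name, str(aid)])
    except KeyError:                                                # miss: defer to the fuzzy matching above
        unknowns.add(name)
g.close()
print(sorted(unknowns))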