def mergeIntoTermCorpus(self):
    corpus_writer = matutils.MmWriter('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName))
    corpus_writer.write_headers(-1, -1, -1)

    num_nnz = 0
    poslast = 0
    offsets = []
    write_index = 0
    totalLen = 0

    # corporaDict[i] holds (current termId, current index in corpus, corpus, corpus length).
    corporaDict = MyDict()
    for i in range(self.numberOfProcesses):
        corpus = corpora.MmCorpus('%s/%s/%s_%i.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName, i))
        if len(corpus) > 0:
            termId = [id for (id, sim) in corpus[0] if sim == 1.0][0]
            corporaDict[i] = (termId, 0, corpus, len(corpus))
            totalLen += len(corpus)

    while 1:
        for corpusId in corporaDict.keys():
            # Read all values for the current corpus from the MyDict.
            termId, index, corpus, len_corpus = corporaDict[corpusId]
            if termId == write_index:
                # We are writing to the merged corpus at index 'write_index';
                # write the document if it coincides with the column id of the current corpus.
                # Determine offsets for the index file, allowing O(1) access time to documents.
                posnow = corpus_writer.fout.tell()
                if posnow == poslast:
                    offsets[-1] = -1
                offsets.append(posnow)
                poslast = posnow
                # Write the current document.
                max_id, veclen = corpus_writer.write_vector(write_index, corpus[index])
                num_nnz += veclen
                # Update the write index of the merged corpus and the index of the current corpus.
                write_index += 1
                index += 1
                if index == len_corpus:
                    # Reached the end of the current corpus. Set values to -1
                    # so no more documents will be grabbed from this corpus.
                    corporaDict[corpusId] = (-1, -1, corpus, len_corpus)
                else:
                    # Grab the next column id. TODO: can this be done more efficiently?
                    termId = [id for (id, sim) in corpus[index] if sim == 1.0][0]
                    corporaDict[corpusId] = (termId, index, corpus, len_corpus)
        # If all corpora have been fully read, exit the while loop.
        if write_index == totalLen:
            break

    corpus_writer.fake_headers(totalLen, totalLen, num_nnz)
    corpus_writer.close()

    # Write the offsets index to file.
    index_fname = corpus_writer.fname + '.index'
    utils.pickle(offsets, index_fname)
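# The function above is a k-way merge over per-process corpora that are each
# already sorted by term id. A minimal sketch of the same idea using
# heapq.merge; `merge_sorted_streams` and `partial_corpora` are illustrative
# names, not part of this codebase.
import heapq

def merge_sorted_streams(partial_corpora):
    def stream(corpus):
        for vector in corpus:
            termId = [id for (id, sim) in vector if sim == 1.0][0]
            yield (termId, vector)
    # heapq.merge interleaves the already-sorted streams into one
    # globally sorted stream of (termId, vector) pairs.
    return heapq.merge(*[stream(c) for c in partial_corpora])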
def computeOracle(nameEmailData, aliasToIdName):
    oracle = MyDict()
    indices = range(len(nameEmailData.keys()))

    # Initially all pairs are non-matches.
    for idx1, idx2 in itertools.combinations(indices, 2):
        oracle[(idx1, idx2)] = 0

    # Reflexive matches are always true:
    # any (alias, email) pair should be matched to itself.
    for idx in indices:
        oracle[(idx, idx)] = 1

    # Look up the identity name for each alias and match
    # (alias, email) pairs that share the same identity name.
    for idx1, idx2 in itertools.combinations(indices, 2):
        (name1, email1) = nameEmailData[idx1]
        (name2, email2) = nameEmailData[idx2]
        if aliasToIdName[name1] == aliasToIdName[name2]:
            oracle[(idx1, idx2)] = 1

    return oracle
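# A minimal usage sketch of computeOracle with toy data (illustrative values,
# not from the repository): two aliases of the same identity are a match,
# pairs across identities are not. `_oracleDemo` is a hypothetical helper.
def _oracleDemo():
    nameEmailData = {0: ('Jane Doe', 'jane@example.org'),
                     1: ('J. Doe', 'jdoe@example.org'),
                     2: ('John Smith', 'jsmith@example.org')}
    aliasToIdName = {'Jane Doe': 'Jane Doe', 'J. Doe': 'Jane Doe',
                     'John Smith': 'John Smith'}
    oracle = computeOracle(nameEmailData, aliasToIdName)
    assert oracle[(0, 1)] == 1  # same identity name -> match
    assert oracle[(0, 2)] == 0  # different identities -> non-match
    assert oracle[(1, 1)] == 1  # reflexive pairs are always matches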
def __init__(self, nameListsPath):
    '''Data path'''
    self.dataPath = os.path.abspath(nameListsPath)

    '''gender.c, already lowercase'''
    self.genderDict = MyDict(os.path.join(self.dataPath, 'gender.dict'))

    '''Order of countries (columns) in the nam_dict.txt file shipped together with gender.c'''
    self.countriesOrder = {
        'UK': 0, 'Ireland': 1, 'USA': 2, 'Italy': 3, 'Malta': 4,
        'Portugal': 5, 'Spain': 6, 'France': 7, 'Belgium': 8,
        'Luxembourg': 9, 'The Netherlands': 10, 'East Frisia': 11,
        'Germany': 12, 'Austria': 13, 'Switzerland': 14, 'Iceland': 15,
        'Denmark': 16, 'Norway': 17, 'Sweden': 18, 'Finland': 19,
        'Estonia': 20, 'Latvia': 21, 'Lithuania': 22, 'Poland': 23,
        'Czech Republic': 24, 'Slovakia': 25, 'Hungary': 26, 'Romania': 27,
        'Bulgaria': 28, 'Bosnia and Herzegovina': 29, 'Croatia': 30,
        'Kosovo': 31, 'Macedonia (FYROM)': 32, 'Montenegro': 33,
        'Serbia': 34, 'Slovenia': 35, 'Albania': 36, 'Greece': 37,
        'Russia': 38, 'Belarus': 39, 'Moldova': 40, 'Ukraine': 41,
        'Armenia': 42, 'Azerbaijan': 43, 'Georgia': 44, 'Kazakhstan': 45,
        'Turkey': 46, 'Arabia/Persia': 47, 'Israel': 48, 'China': 49,
        'India/Sri Lanka': 50, 'Japan': 51, 'Korea': 52, 'Vietnam': 53,
        'other countries': 54,
    }
    self.countriesOrderRev = {}
    for country, idx in self.countriesOrder.items():
        self.countriesOrderRev[idx] = country

    self.threshold = 0.5
    self.nameLists = {}

    '''Name lists per country'''
    listOfCountries = [
        'Afghanistan', 'Albania', 'Australia', 'Belgium', 'Brazil',
        'Canada', 'Czech', 'Finland', 'Greece', 'Hungary', 'India',
        'Iran', 'Ireland', 'Israel', 'Italy', 'Latvia', 'Norway',
        'Poland', 'Romania', 'Russia', 'Slovenia', 'Somalia', 'Spain',
        'Sweden', 'Turkey', 'UK', 'Ukraine', 'USA'
    ]
    for country in listOfCountries:
        self.nameLists[country] = {}
        self.nameLists[country]['male'], self.nameLists[country]['female'] = \
            loadData(country, self.dataPath, hasHeader=False)

    '''Exceptions (approximations): use the Frisia lists for The Netherlands'''
    #malesFrance, femalesFrance = loadData('Wallonia', self.dataPath, False)
    #self.nameLists['France'] = {}
    #self.nameLists['France']['male'] = malesFrance
    #self.nameLists['France']['female'] = femalesFrance
    malesNL, femalesNL = loadData('Frisia', self.dataPath, False)
    self.nameLists['The Netherlands'] = {}
    self.nameLists['The Netherlands']['male'] = malesNL
    self.nameLists['The Netherlands']['female'] = femalesNL

    '''Black list of first names'''
    self.blackList = [
        'The', 'the', 'nil', 'Nil', 'NULL', 'null', 'stack', 'cache',
        'queue', 'core', 'linux', 'Net', 'stillo', 'alfa', 'beta',
        'testing', 'me'
    ]

    '''Gender-specific words'''
    self.maleWords = [
        'Mr.', 'mr.', 'Mr', 'mr', 'Sir', 'sir', 'Captain', 'captain',
        'wizard', 'warrior', 'hillbilly', 'beer', 'Mister', 'Lord',
        'Duke', 'Baron', 'coolguy'
    ]
    self.femaleWords = ['girl', 'grrl', 'grrrl', 'miss', 'Miss', 'Mrs.']

    '''Suffixes'''
    self.suffixes = {}
    self.suffixes['Russia'] = {}
    self.suffixes['Russia']['male'] = {}
    self.suffixes['Russia']['male']['include'] = [
        'ov', 'ev', 'sky', 'skiy', 'iy', 'uy', 'oy', 'skij', 'ij', 'uj',
        'oj', 'off'
    ]
    '''in/yn excluded due to B-Rain and Earwin'''
    self.suffixes['Russia']['male']['exclude'] = [
        'Liubov', 'Ljubov', 'Lyubov', 'boy', 'Boy', 'toy', 'Toy', 'dev', 'Dev'
    ]
    self.suffixes['Russia']['female'] = {}
    self.suffixes['Russia']['female']['include'] = [
        'ova', 'eva', 'skaya', 'aya', 'eya', 'oya', 'iaya'
    ]
    self.suffixes['Russia']['female']['exclude'] = []

    self.suffixes['Belarus'] = self.suffixes['Russia']
    self.suffixes['Ukraine'] = self.suffixes['Russia']
    self.suffixes['Turkmenistan'] = self.suffixes['Russia']
    self.suffixes['Kyrgyzstan'] = self.suffixes['Russia']
    self.suffixes['Tajikistan'] = self.suffixes['Russia']
    self.suffixes['Kazakhstan'] = self.suffixes['Russia']
    self.suffixes['Uzbekistan'] = self.suffixes['Russia']
    self.suffixes['Azerbaijan'] = self.suffixes['Russia']
    self.suffixes['Bulgaria'] = self.suffixes['Russia']

    self.suffixes['Macedonia (FYROM)'] = {}
    self.suffixes['Macedonia (FYROM)']['male'] = {}
    self.suffixes['Macedonia (FYROM)']['male']['include'] = ['ov', 'ev', 'ski', 'evsk']
    self.suffixes['Macedonia (FYROM)']['male']['exclude'] = [
        'Iakov', 'Jakov', 'Yakov', 'dev', 'Dev', 'Lev', 'boy', 'Boy', 'toy', 'Toy'
    ]
    self.suffixes['Macedonia (FYROM)']['female'] = {}
    self.suffixes['Macedonia (FYROM)']['female']['include'] = ['ova', 'eva', 'ska', 'evska']
    self.suffixes['Macedonia (FYROM)']['female']['exclude'] = []

    self.suffixes['Poland'] = {}
    self.suffixes['Poland']['male'] = {}
    self.suffixes['Poland']['male']['include'] = ['ski', 'sky', 'cki', 'cky']
    self.suffixes['Poland']['male']['exclude'] = []
    self.suffixes['Poland']['female'] = {}
    '''-ska is not included because of Polska = Poland, which might be confusing'''
    self.suffixes['Poland']['female']['include'] = ['cka']
    self.suffixes['Poland']['female']['exclude'] = []

    self.suffixes['Czech Republic'] = {}
    self.suffixes['Czech Republic']['male'] = {}
    '''ASCII forms only; the accented variants (-ský, -ný) were dropped'''
    self.suffixes['Czech Republic']['male']['include'] = ['ov', 'sky', 'ny']
    self.suffixes['Czech Republic']['male']['exclude'] = []
    self.suffixes['Czech Republic']['female'] = {}
    self.suffixes['Czech Republic']['female']['include'] = ['ova', 'ska', 'na']
    self.suffixes['Czech Republic']['female']['exclude'] = []

    '''Male Latvian personal and family names typically end in -s (-š).
    Some may be derived from Russian names, with an -s ending: e.g.,
    Vladislavs KAZANOVS. Only Russian forms are included, since we cannot
    distinguish between the regular Latvian -s and the English plural -s.'''
    self.suffixes['Latvia'] = {}
    self.suffixes['Latvia']['male'] = {}
    self.suffixes['Latvia']['male']['include'] = [u'š', 'ovs', 'ins']
    self.suffixes['Latvia']['male']['exclude'] = []
    self.suffixes['Latvia']['female'] = {}
    self.suffixes['Latvia']['female']['include'] = ['ina']
    self.suffixes['Latvia']['female']['exclude'] = []

    self.suffixes['Lithuania'] = {}
    self.suffixes['Lithuania']['male'] = {}
    self.suffixes['Lithuania']['male']['include'] = [
        'aitis', 'utis', 'ytis', 'enas', 'unas', 'inis', 'ynis', 'onis',
        'ius', 'elis'
    ]
    self.suffixes['Lithuania']['male']['exclude'] = []
    self.suffixes['Lithuania']['female'] = {}
    self.suffixes['Lithuania']['female']['include'] = [
        'iene', 'aite', 'yte', 'ute', 'te'
    ]
    self.suffixes['Lithuania']['female']['exclude'] = []

    '''All inverse-order countries should also be checked for direct order'''
    self.invOrder = [
        'Russia', 'Belarus', 'Ukraine', 'Turkmenistan', 'Kyrgyzstan',
        'Tajikistan', 'Kazakhstan', 'Uzbekistan', 'Azerbaijan', 'Hungary',
        'China', 'Bosnia', 'Serbia', 'Croatia', 'Sri Lanka', 'Vietnam',
        'North Korea', 'South Korea'
    ]

    '''Diminutives list'''
    fd = open(os.path.join(self.dataPath, 'diminutives.csv'), 'rb')
    reader = UnicodeReader(fd)
    self.diminutives = {}
    for row in reader:
        mainName = row[0].strip().lower()
        for diminutive in row[1:]:
            try:
                self.diminutives[diminutive].add(mainName)
            except KeyError:
                self.diminutives[diminutive] = set([mainName])

    '''Distribution of StackOverflow users over different countries'''
    fd = open(os.path.join(self.dataPath, 'countryStats.csv'), 'rb')
    reader = UnicodeReader(fd)
    self.countryStats = {}
    total = 0.0
    for row in reader:
        country = row[0]
        numUsers = float(row[1])
        total += numUsers
        self.countryStats[country] = numUsers
    # Normalize the per-country user counts into fractions of the total.
    for country in self.countryStats.keys():
        self.countryStats[country] = self.countryStats[country] / total
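# A sketch of how the suffix tables above might be consulted; this is a
# hypothetical helper, not a method defined in this class. A surname is
# classified as male or female only if it ends in an 'include' suffix and
# does not end in any of the corresponding 'exclude' strings (e.g. 'Lyubov'
# ends in -ov but is a female given name).
def classifyBySuffix(self, name, country):
    rules = self.suffixes.get(country)
    if rules is None:
        return None
    for gender in ('male', 'female'):
        include = rules[gender]['include']
        exclude = rules[gender]['exclude']
        if any(name.endswith(s) for s in include) and \
                not any(name.endswith(e) for e in exclude):
            return gender
    return None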
import os

from unidecode import unidecode
from nameMap import nameMap

# MyDict and UnicodeReader are helper classes defined elsewhere in this repository.

dataPath = os.path.abspath("../../../data")

# This is the list of DBLP author names (>1.1M people)
# 335078;M. G. J. van den Brand, Mark G. J. van den Brand, Mark van den Brand
f = open(os.path.join(dataPath, "dblp-author-aliases-stripped.csv"), "rb")
reader1 = UnicodeReader(f)

# Read the list into a map:
# reverseLookup['M. G. J. van den Brand']
#   = reverseLookup['Mark G. J. van den Brand']
#   = reverseLookup['Mark van den Brand']
#   = 335078
reverseLookup = MyDict()

# Choose a unique spelling for each author id (the last listed alias):
# directLookup[335078] = 'Mark van den Brand'
directLookup = MyDict()

for row in reader1:
    aid = int(row[0])
    aliases = [name.strip() for name in row[1].split(',')]
    for name in aliases:
        reverseLookup[name] = aid
    directLookup[aid] = aliases[-1]

# Normalizes a name using the different aliases
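# The normalization function announced by the comment above is not shown in
# this excerpt. A minimal sketch of what it could look like, built on the two
# lookup tables and assuming MyDict supports plain dict lookups
# (`normalizeName` is an illustrative name):
def normalizeName(name):
    # Map any known alias to the canonical spelling chosen for its author id;
    # fall back to the input when the name is not in DBLP.
    name = name.strip()
    if name in reverseLookup:
        return directLookup[reverseLookup[name]]
    return name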
def runTesting(self, dataPath='../data/icsmData', resultsPath='../data/icsmResults',
               nrIterations=10, nrProcesses=1):
    # For each iteration, load the file with the results for each parameter
    # combination (../data/icsmResults/resultsTraining_0.csv), grab the
    # parameter combination with the highest f-measure, run the LSA algorithm
    # on all testing sets for this iteration with the selected combination
    # (for 0 <= i <= 9: ../data/icsmData/test_0_i.csv), and write the results
    # of the testing to file (../data/icsmResults/resultsTesting_0.csv).

    # Make sure the resultsPath exists. If not, create it.
    if not os.path.exists(resultsPath):
        os.makedirs(resultsPath)
    # Similarly for the resultsPath/testing directory, as training and testing are separated.
    if not os.path.exists(resultsPath + '/testing'):
        os.makedirs(resultsPath + '/testing')

    # Load the oracle for the data.
    aliasToIdName = MyDict(os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))

    # Loop over all iterations.
    for itIdx in range(nrIterations):
        resultList = []
        # Load the training results for this iteration.
        f = open(os.path.join(resultsPath, 'training', 'resultsTraining_%d.csv' % itIdx), 'rb')
        reader = csv.reader(f, delimiter=';')
        header = reader.next()  # skip header
        for row in reader:
            # levThr;minLen;k;cosThr;tp;fp;fn;tn;precision;recall;f
            (levThr, minLen, k, cosThr, tp, fp, fn, tn,
             precision, recall, fMeasure) = row[:11]
            resultList.append((levThr, minLen, k, cosThr, tp, fp, fn, tn,
                               precision, recall, fMeasure))
        f.close()

        # Order results by f-measure (descending) and grab the first record,
        # containing the best parameters.
        resultList = sorted(resultList, key=lambda rec: -float(rec[10]))
        bestLevThr = float(resultList[0][0])
        bestMinLen = int(resultList[0][1])
        bestK = float(resultList[0][2])
        bestCosThr = float(resultList[0][3])

        g = open(os.path.join(resultsPath, 'testing', 'resultsTesting_%d.csv' % itIdx), 'wb')
        writer = csv.writer(g, delimiter=';')
        writer.writerow(['levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp', 'fn',
                         'tn', 'precision', 'recall', 'f'])

        parameters = {}
        parameters["levenshteinSimRatio"] = bestLevThr
        parameters["minLength"] = bestMinLen
        parameters["rankReductionRatio"] = bestK
        parameters["cosineSimRatio"] = bestCosThr

        # Run the LSA algorithm on all testing subsets for this iteration.
        for i in range(10):
            # Read the data from the testing subset.
            nameEmailData = MyDict()
            f = open(os.path.join(dataPath, 'test_%d_%d.csv' % (itIdx, i)), 'rb')
            reader = UnicodeReader(f)
            idx = 0
            for row in reader:
                try:
                    alias = row[0]
                    email = unspam(row[1])
                    nameEmailData[idx] = (alias, email)
                except:
                    print row
                idx += 1
            f.close()

            lsaAlgo = LSAAlgo(nameEmailData, parameters,
                              dataDir=resultsPath,
                              dataSaveDir='testing',
                              resultsFileName='results_%d_%d_preoracle.csv' % (itIdx, i),
                              resultsHumanReadable=False,
                              numberOfProcesses=nrProcesses,
                              runProfiler=False,
                              profilerOutputDir='profilerOutput',
                              gensimLogging=False,
                              progressLogging=False)
            lsaAlgo.run()

            # Compute the oracle to verify results.
            oracle = computeOracle(nameEmailData, aliasToIdName)

            # Now check the results using the oracle.
            [tp, fp, fn, tn, precision, recall, fmeasure] = self.computeResults(
                os.path.join(resultsPath, 'testing'),
                'results_%d_%d_preoracle.csv' % (itIdx, i),
                'results_%d_%d.csv' % (itIdx, i),
                oracle, nameEmailData)
            writer.writerow([bestLevThr, bestMinLen, bestK, bestCosThr,
                             tp, fp, fn, tn, precision, recall, fmeasure])
            print 'Done computing results on iteration %d, subset %d' % (itIdx, i)
        g.close()
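# Note on runTesting above: only the best record is needed, so the sort could
# be replaced by a single pass, e.g.
#   best = max(resultList, key=lambda rec: float(rec[10]))
# (illustrative alternative; the sorted-list version is what the code uses).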
def computeResults(self, dataPath, filename, outfilename, oracle, nameEmailData):
    from gensim import models, similarities, matutils, corpora, utils

    if os.path.exists(os.path.join(dataPath, filename)):
        indices = range(len(nameEmailData.keys()))
        documents = MyDict(os.path.join(dataPath, 'documents.dict'))
        # directLookup: input data index -> document index
        directLookup = MyDict(os.path.join(dataPath, 'directLookup.dict'))
        index = similarities.docsim.Similarity.load(os.path.join(dataPath, 'index'))

        # Initial values
        tp = 0.0
        fp = 0.0
        fn = 0.0
        tn = 0.0
        matchedTuples = set()

        # Write all matched index pairs to a set.
        f = open(os.path.join(dataPath, filename), 'rb')
        reader = UnicodeReader(f)
        g = open(os.path.join(dataPath, outfilename), 'wb')
        writer = csv.writer(g, delimiter=';')
        header = reader.next()
        header.append('kind')
        writer.writerow(header)
        for row in reader:
            # Put the smallest index first.
            if int(row[0]) <= int(row[1]):
                idx1 = int(row[0])
                idx2 = int(row[1])
            else:
                idx2 = int(row[0])
                idx1 = int(row[1])
            matchedTuples.add((idx1, idx2))
            if oracle[(idx1, idx2)] == 1:
                row.append('tp')
            else:
                row.append('fp')
            writer.writerow(row)
        f.close()

        # Iterate over all combinations to compute tp, fp, fn, tn.
        for idx1, idx2 in itertools.combinations(indices, 2):
            docId1 = directLookup[idx1]
            docId2 = directLookup[idx2]
            if (idx1, idx2) in matchedTuples:
                # The tuple was matched by the algorithm: tp or fp.
                if oracle[(idx1, idx2)] == 1:
                    tp += 1  # correctly matched
                else:
                    fp += 1  # incorrectly matched
            else:
                # The tuple was not matched by the algorithm.
                if oracle[(idx1, idx2)] == 1:
                    # It should have been matched.
                    fn += 1
                    # Add the fn to the results file.
                    writer.writerow([str(idx1), str(idx2),
                                     nameEmailData[idx1], nameEmailData[idx2],
                                     documents[docId1], documents[docId2],
                                     index.similarity_by_id(docId1)[docId2],
                                     'fn'])
                else:
                    tn += 1  # correctly not matched
        g.close()
        os.remove(os.path.join(dataPath, filename))

        # Add the reflexive matches to the tp's.
        tp += len(indices)

        try:
            precision = tp / (tp + fp)
        except ZeroDivisionError:
            precision = 0.0
        try:
            recall = tp / (tp + fn)
        except ZeroDivisionError:
            recall = 0.0
        try:
            fmeasure = 2 * precision * recall / (precision + recall)
        except ZeroDivisionError:
            fmeasure = 0.0

        return [tp, fp, fn, tn, precision, recall, fmeasure]
    else:
        return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
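# Worked example of the metrics above (illustrative numbers only): with
# tp=8, fp=2 and fn=4, precision = 8/10 = 0.8, recall = 8/12 ~ 0.667, and
# F = 2*0.8*0.667 / (0.8+0.667) ~ 0.727. F is the harmonic mean, so it is
# pulled toward the weaker of precision and recall.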
def runTraining(self, dataPath='../data/icsmData', resultsPath='../data/icsmResults',
                levThrRange=[0.7, 0.8, 0.9], minLenRange=[2, 3, 4],
                kRange=[0.9, 0.95, 1.0], cosThrRange=[0.7, 0.75, 0.8, 0.85],
                nrIterations=10, nrProcesses=1):
    # Load each iteration of the training set (../data/icsmData/training_0.csv),
    # run it on all parameter combinations, compare to the oracle and augment
    # the output file from the LSA algorithm, then write the results for each
    # parameter combination, including precision, recall and f-measure
    # (../data/icsmResults/resultsTraining_0.csv).

    # Make sure the resultsPath exists. If not, create it.
    if not os.path.exists(resultsPath):
        os.makedirs(resultsPath)
    # Similarly for the resultsPath/training directory, as training and testing are separated.
    if not os.path.exists(resultsPath + '/training'):
        os.makedirs(resultsPath + '/training')

    # Loop over all iterations.
    for itIdx in range(nrIterations):
        # Load the data.
        nameEmailData = MyDict()
        f = open(os.path.join(dataPath, 'training_%d.csv' % itIdx), 'rb')
        reader = UnicodeReader(f)
        idx = 0
        for row in reader:
            try:
                alias = row[0]
                email = unspam(row[1])
                nameEmailData[idx] = (alias, email)
            except:
                print row
            idx += 1
        f.close()

        # Load the oracle for the data.
        aliasToIdName = MyDict(os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))
        oracle = computeOracle(nameEmailData, aliasToIdName)

        # With the default ranges this is 3 * 3 * 3 * 4 = 108 runs per iteration.
        nrRuns = len(minLenRange) * len(kRange) * len(levThrRange) * len(cosThrRange)

        g = open(os.path.join(resultsPath, 'training', 'resultsTraining_%d.csv' % itIdx), 'wb')
        writer = csv.writer(g, delimiter=';')
        writer.writerow(['levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp', 'fn',
                         'tn', 'precision', 'recall', 'f'])

        run = 0
        for levThr in levThrRange:
            for minLen in minLenRange:
                for k in kRange:
                    for cosThr in cosThrRange:
                        # Load the parameters.
                        parameters = {}
                        parameters["levenshteinSimRatio"] = levThr
                        parameters["minLength"] = minLen
                        parameters["cosineSimRatio"] = cosThr
                        parameters["rankReductionRatio"] = k

                        # Run the LSA algorithm on these parameters.
                        lsaAlgo = LSAAlgo(nameEmailData, parameters,
                                          dataDir=resultsPath,
                                          dataSaveDir='training',
                                          resultsFileName='results_%d_%.2f_%d_%.2f_%.2f_preoracle.csv' % (itIdx, levThr, minLen, k, cosThr),
                                          resultsHumanReadable=False,
                                          numberOfProcesses=nrProcesses,
                                          runProfiler=False,
                                          profilerOutputDir='profilerOutput',
                                          gensimLogging=False,
                                          progressLogging=False)
                        lsaAlgo.run()

                        # Now check the results using the oracle.
                        [tp, fp, fn, tn, precision, recall, fmeasure] = self.computeResults(
                            os.path.join(resultsPath, 'training'),
                            'results_%d_%.2f_%d_%.2f_%.2f_preoracle.csv' % (itIdx, levThr, minLen, k, cosThr),
                            'results_%d_%.2f_%d_%.2f_%.2f.csv' % (itIdx, levThr, minLen, k, cosThr),
                            oracle, nameEmailData)
                        writer.writerow([levThr, minLen, k, cosThr, tp, fp,
                                         fn, tn, precision, recall, fmeasure])
                        run += 1
                        print 'Run %d out of %d in iteration %d...' % (run, nrRuns, itIdx)
        g.close()
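# The four nested loops above enumerate the full parameter grid; an
# equivalent formulation with itertools.product (sketch, same semantics
# and iteration order):
#
# for levThr, minLen, k, cosThr in itertools.product(
#         levThrRange, minLenRange, kRange, cosThrRange):
#     ...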
def run(self):
    ## Decode parameters
    levThr = self.parameters["levenshteinSimRatio"]  # threshold for Levenshtein similarity
    simThr = self.parameters["cosineSimRatio"]       # threshold for cosine similarity between documents
    minLen = self.parameters["minLength"]            # threshold for minimum length of a term
    k = self.parameters["rankReductionRatio"]        # dimensionality reduction ratio (0 < k <= 1)

    """Iteratively build and populate the document-term matrix."""
    runningColIdx = 0
    runningRowIdx = 0

    reverseLookup = MyDict()
    directLookup = MyDict()
    documents = MyDict()   # dictionary of documents (key = column index)
    terms = MyDict()       # dictionary of terms (key = row index)
    allTerms = set()       # set of all terms
    termToColumnIndex = MyDict()
    termToRowIndex = MyDict()

    start = clock()
    emailToColumnIndex = MyDict()
    for idx in self.nameEmailData.keys():
        addEmailPrefix = True  # default value (always add the email prefix to the document string, unless ...)
        (name, email) = self.nameEmailData[idx]  # the input data contains (name, email) tuples

        # Easy merge: different names used by the same email address.
        # Exception: mailing list email addresses.
        if email not in self.emailExceptions:
            # The email address is personal:
            # - If I have seen it before, retrieve the column (document) to which it corresponds.
            # - If I see it for the first time, assign it a new column (document).
            try:
                columnIdx = emailToColumnIndex[email]
            except KeyError:
                columnIdx = runningColIdx
                emailToColumnIndex[email] = columnIdx
                runningColIdx += 1
        else:
            # The email address is not personal; directly assign it a new column (document).
            columnIdx = runningColIdx
            runningColIdx += 1
            addEmailPrefix = False  # remember not to add the email prefix to the document string

        # Create lookup tables to move from input data to doc-term matrix and back.
        directLookup[idx] = columnIdx         # from input data index to document (column) index
        reverseLookup.append(columnIdx, idx)  # from document (column) index to input data index

        # Add name parts to the document string if long enough and not
        # consisting of digits, after normalization.
        docString = [namePart for namePart in normalize(name)
                     if len(namePart) >= minLen and not namePart.isdigit()]
        if addEmailPrefix:
            # Add the whole email address prefix to the document string,
            # if long enough and after normalization.
            prefix = email.split('@')[0].split('+')[0]
            docString += [prefixPart for prefixPart in normalize(prefix)
                          if len(prefixPart) >= minLen and not prefixPart.isdigit()]
            # Split the email address prefix on dots, and add each part
            # if long enough and after normalization.
            prefixParts = prefix.split('.')
            for prefixPart in prefixParts:
                docString += [p for p in normalize(prefixPart)
                              if len(p) >= minLen and not p.isdigit()]

        # The document string is a set of terms. Figure out to which rows
        # in the doc-term matrix the terms correspond.
        for term in docString:
            try:
                # If I've seen the term before, retrieve its row index.
                rowIdx = termToRowIndex[term]
            except KeyError:
                # First time I see the term: assign a new row.
                rowIdx = runningRowIdx
                termToRowIndex[term] = rowIdx
                runningRowIdx += 1
            # Keep track of which term is at which row index.
            terms[rowIdx] = term

        # Update the document at the current column index with the new document string.
        try:
            documents[columnIdx].update(docString)
        except KeyError:
            # First occurrence of this email address (document).
            documents[columnIdx] = set()
            documents[columnIdx].update(docString)

        # Keep an updated set of terms (from all documents).
        allTerms.update(docString)
        # Bookkeeping: keep track of the documents in which a term appears.
        for term in set(docString):
            try:
                termToColumnIndex[term].add(columnIdx)
            except KeyError:
                termToColumnIndex[term] = set([columnIdx])
    end = clock()

    numDocuments = len(documents.keys())
    numTerms = len(allTerms)
    if self.progressLogging:
        print "Initial pass: %6.3f seconds (%d terms, %d documents)" % ((end - start), numTerms, numDocuments)

    # Make sure that dataDir/dataSaveDir exists.
    if not os.path.exists('%s/%s' % (self.dataDir, self.dataSaveDir)):
        os.makedirs('%s/%s' % (self.dataDir, self.dataSaveDir))
    # Make sure that dataDir/dataSaveDir/dataProcessingDir exists.
    if not os.path.exists('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir)):
        os.makedirs('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir))
    if self.runProfiler:
        # Make sure that dataDir/dataSaveDir/profilerOutputDir exists.
        if not os.path.exists('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir)):
            os.makedirs('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))

    # Save the documents and lookup MyDicts for later reference.
    documents.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.documentsFileName))
    directLookup.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.directLookupFileName))
    reverseLookup.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.reverseLookupFileName))

    # Build the dictionary and corpus on disk from the documents.
    texts = [documents[key] for key in documents.keys()]
    # Dictionary: {'nalley': 5649, 'francesco': 4634, ...} :: (term: id)
    dictionary = corpora.Dictionary(texts)
    dictionary.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.dictionaryFileName))
    # Corpus: [[(0, 1), (1, 1), (2, 1)], [(3, 1), (4, 1)], ...] :: (id, count)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.originalCorpusFileName), corpus)

    # Precompute Levenshtein distances between all terms on all CPU cores.
    if self.progressLogging:
        print 'Starting computation of Levenshtein...'
    start = clock()
    if self.runProfiler:
        cProfile.runctx('self.precomputeLevenshteinDistanceBetweenAllTerms(numTerms, dictionary, levThr)',
                        globals(), locals(),
                        '%s/%s/%s/computeTermCorpusChunks' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
    else:
        self.precomputeLevenshteinDistanceBetweenAllTerms(numTerms, dictionary, levThr)
    end = clock()
    if self.progressLogging:
        print 'Done computing Levenshtein: %6.3f seconds' % (end - start)

    # Merge the separate MmCorpus files.
    start = clock()
    if self.runProfiler:
        cProfile.runctx('self.mergeIntoTermCorpus()', globals(), locals(),
                        '%s/%s/%s/mergeIntoTermCorpus' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
    else:
        self.mergeIntoTermCorpus()
    end = clock()
    if self.progressLogging:
        print 'Done merging corpora to termCorpus: %6.3f seconds' % (end - start)

    # Create levCorpus from corpus.mm and termCorpus.mm.
    start = clock()
    if self.runProfiler:
        cProfile.runctx('self.createLevenshteinCorpus()', globals(), locals(),
                        '%s/%s/%s/createLevenshteinCorpus' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
    else:
        self.createLevenshteinCorpus()
    end = clock()
    if self.progressLogging:
        print 'Done creating levCorpus from corpus and termCorpus: %6.3f seconds' % (end - start)

    corpus_disk = corpora.MmCorpus('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.levenshteinCorpusFileName))

    """Having the document-term matrix (including Levenshtein distance) in the corpus, apply the TFIDF model."""
    start = clock()
    tfidf = models.TfidfModel(corpus_disk)
    tfidf.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.tfidfModelFileName))
    corpus_tfidf = tfidf[corpus_disk]
    end = clock()
    if self.progressLogging:
        print "Inverse document frequency: %6.3f seconds" % (end - start)

    """Next, apply the LSI model on top of the TFIDF model."""
    start = clock()
    number_topics = len(corpus_disk) * k
    try:
        # Occasionally computing the SVD fails:
        # http://projects.scipy.org/numpy/ticket/990
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=number_topics)
    except:
        print 'Failed to compute the LSI model. (SVD did not converge?)'
        return
    lsi.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.lsiModelFileName))
    corpus_lsi = lsi[corpus_tfidf]
    end = clock()
    if self.progressLogging:
        print "LSI model: %6.3f seconds" % (end - start)

    """Finally, run the cosine similarity on the matrix."""
    start = clock()
    index = similarities.docsim.Similarity('%s/%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir, self.similarityIndexFileName),
                                           corpus_lsi, number_topics)
    index.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.similarityIndexFileName))
    end = clock()
    if self.progressLogging:
        print "Similarities index: %6.3f seconds" % (end - start)

    """Output results."""
    start = clock()
    if self.runProfiler:
        cProfile.runctx('self.writeResults(index, reverseLookup, documents, simThr)', globals(), locals(),
                        '%s/%s/%s/writeResults' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
    else:
        self.writeResults(index, reverseLookup, documents, simThr)
    end = clock()
    if self.progressLogging:
        print "Writing results: %6.3f seconds" % (end - start)
    return
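# A minimal usage sketch of the pipeline above, with illustrative parameter
# values; the keyword arguments not shown keep whatever defaults LSAAlgo
# defines, and the input format matches what runTraining/runTesting construct:
#
# nameEmailData = MyDict()
# nameEmailData[0] = ('Jane Doe', 'jane.doe@example.org')
# nameEmailData[1] = ('J. Doe', 'jane.doe@example.org')
# parameters = {'levenshteinSimRatio': 0.8, 'minLength': 3,
#               'rankReductionRatio': 0.95, 'cosineSimRatio': 0.75}
# LSAAlgo(nameEmailData, parameters, dataDir='/tmp/lsaDemo',
#         dataSaveDir='demo', resultsFileName='results_demo.csv').run()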
# for row in reader:
#     writer.writerow([str(idx)] + row)
#     idx += 1
# g.close()
# exit()

# Choose dataset
#dataset = 'aliases2'
#dataset = 'gnome'
dataset = 'icsm'

# Choose dataset size
datasetsize = 'full'
#datasetsize = 'sample'

nameEmailData = MyDict()
if dataset == 'aliases2':
    f = open(os.path.join(dataPath, "aliases2.csv"), "rb")
    # f = open(os.path.join(dataPath, "testData2.csv"), "rb")
    reader = UnicodeReader(f)
    header = reader.next()
    for row in reader:
        try:
            idx = int(row[0])
            name = row[1]
            email = unspam(row[2])
            nameEmailData[idx] = (name, email)
        except:
            print row
    f.close()
year = row[0]
authors = ','.join([normaliseName(name) for name in row[1].split(',')
                    if len(normaliseName(name))])
title = row[2]
writer.writerow([year, authors, title, '', '', '', ''])
g.close()
print

soFarSoGood = set()

# "Paulo R. F. Cunha": "Paulo Cunha"
# "Neil Maiden": "Neil A. M. Maiden"
# Strip middle initials; exact match on all other name parts.
uselessData = MyDict()
# For each name in the DBLP data,
for key in reverseLookup.keys():
    # record a version of the name without initials.
    s = " ".join([p.lower() for p in key.split()
                  if len(p) > 1 and p.find('.') == -1])
    uselessData[key] = s
# Then, for each of the unknowns,
for name in sorted(unknowns):
    longParts = [p.lower() for p in name.split()
                 if len(p) > 1 and p.find('.') == -1]
    # if the name contains at least two parts of sufficient length,
    if len(longParts) > 1:
        # check against each of the DBLP names
#conference = "icse"
conference = sys.argv[1]

# This is the list of DBLP author names (>1.1M people)
# 335078;M. G. J. van den Brand, Mark G. J. van den Brand, Mark van den Brand
f = open(os.path.abspath("../../../data/dblp-author-aliases-stripped.csv"), "rb")
reader1 = UnicodeReader(f)

# Read the list into a map:
# reverseLookup['M. G. J. van den Brand']
#   = reverseLookup['Mark G. J. van den Brand']
#   = reverseLookup['Mark van den Brand']
#   = 335078
reverseLookup = MyDict()
for row in reader1:
    aid = int(row[0])
    aliases = [name.strip() for name in row[1].split(',')]
    for name in aliases:
        reverseLookup[name] = aid

# Read the names of conference PC members.
# There are two cases:
# 1. Somebody appears in the DBLP data, with the same spelling (not interesting).
# 2. Somebody does not appear in the DBLP data, or his/her name has a different spelling.
g = open(os.path.abspath("../../../data/pc/%s.csv" % conference), "rb")
reader2 = UnicodeReader(g)

# Record all conference PC members whose names DO NOT match DBLP.
unknowns = set()