Example #1
	def mergeIntoTermCorpus(self):
		corpus_writer = matutils.MmWriter('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName))
		corpus_writer.write_headers(-1, -1, -1)
		
		num_nnz = 0
		poslast = 0
		offsets = []
		
		write_index = 0
		totalLen = 0
		corporaDict = MyDict()
		for i in range(self.numberOfProcesses):
			corpus = corpora.MmCorpus('%s/%s/%s_%i.mm' % (self.dataDir, self.dataSaveDir, self.termCorpusFileName, i))
			#corporaList.append(corpus)
			# (current termId, current index in corpus, corpus)
			if len(corpus) > 0:
				termId = [id for (id, sim) in corpus[0] if sim == 1.0][0]
				corporaDict[i] = (termId, 0, corpus, len(corpus))
				totalLen += len(corpus)
			
		while 1:
			isDone = False
			for corpusId in corporaDict.keys():
				termId, index, corpus, len_corpus = corporaDict[corpusId] # Read all values for current corpus from MyDict.
				if termId == write_index: # We are writing to the merged corpus at index 'write_index'. Write it if it coincides with the column id of the current corpus.
				
					# Determine offsets for the index file, allowing O(1) access time of documents.
					posnow = corpus_writer.fout.tell()
					if posnow == poslast:
						offsets[-1] = -1
					offsets.append(posnow)
					poslast = posnow
				
					# Write current document
					max_id, veclen = corpus_writer.write_vector(write_index, corpus[index])
					num_nnz += veclen
					
					# Update values
					write_index += 1 #Update the write index of the merged corpus
					index += 1 #Update the index of the current corpus
					if index == len_corpus: #Reached the end of the current corpus. Set values to -1 so no more document will be grabbed from this corpus.
						corporaDict[corpusId] = (-1, -1, corpus, len_corpus) #Set index to -1. Corpus has been fully read.
					else:
						termId = [id for (id, sim) in corpus[index] if sim == 1.0][0] #Grab the next column id :: TODO -- CAN THIS BE DONE MORE EFFICIENTLY?
						corporaDict[corpusId] = (termId, index, corpus, len_corpus) #Update the MyDict with the new values of the current corpus
					
					if write_index == totalLen: # If all corpora have been fully read, exit the while loop.
						isDone = True
						
			if isDone:
				break
		corpus_writer.fake_headers(totalLen, totalLen, num_nnz)
		corpus_writer.close()
		
		# Write index to file
		index_fname = corpus_writer.fname + '.index'
		utils.pickle(offsets, index_fname)
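
# Usage sketch (not part of the original class): reading the merged matrix back.
# Assumes a gensim version whose MmCorpus/IndexedCorpus picks up the pickled
# '<fname>.mm.index' offsets written above; the path below is hypothetical.
from gensim import corpora

merged = corpora.MmCorpus('data/save/termCorpus.mm')
print len(merged)  # number of documents written by mergeIntoTermCorpus
print merged[0]    # O(1) random access, courtesy of the saved offsets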
Example #2
def computeOracle(nameEmailData, aliasToIdName):
    oracle = MyDict()
    indices = range(len(nameEmailData.keys()))

    # Initially all false
    for idx1, idx2 in itertools.combinations(indices, 2):
        oracle[(idx1, idx2)] = 0
    # Reflexive matches are always true:
    # any (alias,email) pair should be matched to itself
    for idx in indices:
        oracle[(idx, idx)] = 1
    # Look up the identity name for an alias
    # Match (alias,email) pairs that share the same identity name
    for idx1, idx2 in itertools.combinations(indices, 2):
        (name1, email1) = nameEmailData[idx1]
        (name2, email2) = nameEmailData[idx2]

        if aliasToIdName[name1] == aliasToIdName[name2]:
            oracle[(idx1, idx2)] = 1

    return oracle
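
# Usage sketch with hypothetical data (not from the original module):
# oracle[(i, j)] is 1 when records i and j resolve to the same identity name,
# 0 otherwise; reflexive pairs (i, i) are always 1.
nameEmailData = MyDict()
nameEmailData[0] = ('Mark van den Brand', 'mark@example.org')
nameEmailData[1] = ('M. G. J. van den Brand', 'mvdbrand@example.org')
aliasToIdName = MyDict()
aliasToIdName['Mark van den Brand'] = 'Mark van den Brand'
aliasToIdName['M. G. J. van den Brand'] = 'Mark van den Brand'

oracle = computeOracle(nameEmailData, aliasToIdName)
print oracle[(0, 1)]  # 1 -- both aliases map to the same identity name
print oracle[(1, 1)]  # 1 -- reflexive matches are always true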
Example #3
    def __init__(self, nameListsPath):
        '''Data path'''
        self.dataPath = os.path.abspath(nameListsPath)
        '''gender.c, already lowercase'''
        self.genderDict = MyDict(os.path.join(self.dataPath, 'gender.dict'))
        '''Order of countries (columns) in the nam_dict.txt file shipped together with gender.c'''
        self.countriesOrder = {
            'UK': 0,
            'Ireland': 1,
            'USA': 2,
            'Italy': 3,
            'Malta': 4,
            'Portugal': 5,
            'Spain': 6,
            'France': 7,
            'Belgium': 8,
            'Luxembourg': 9,
            'The Netherlands': 10,
            'East Frisia': 11,
            'Germany': 12,
            'Austria': 13,
            'Switzerland': 14,
            'Iceland': 15,
            'Denmark': 16,
            'Norway': 17,
            'Sweden': 18,
            'Finland': 19,
            'Estonia': 20,
            'Latvia': 21,
            'Lithuania': 22,
            'Poland': 23,
            'Czech Republic': 24,
            'Slovakia': 25,
            'Hungary': 26,
            'Romania': 27,
            'Bulgaria': 28,
            'Bosnia and Herzegovina': 29,
            'Croatia': 30,
            'Kosovo': 31,
            'Macedonia (FYROM)': 32,
            'Montenegro': 33,
            'Serbia': 34,
            'Slovenia': 35,
            'Albania': 36,
            'Greece': 37,
            'Russia': 38,
            'Belarus': 39,
            'Moldova': 40,
            'Ukraine': 41,
            'Armenia': 42,
            'Azerbaijan': 43,
            'Georgia': 44,
            'Kazakhstan': 45,
            'Turkey': 46,
            'Arabia/Persia': 47,
            'Israel': 48,
            'China': 49,
            'India/Sri Lanka': 50,
            'Japan': 51,
            'Korea': 52,
            'Vietnam': 53,
            'other countries': 54,
        }
        self.countriesOrderRev = {}
        for country, idx in self.countriesOrder.items():
            self.countriesOrderRev[idx] = country

        self.threshold = 0.5

        self.nameLists = {}
        '''Name lists per country'''
        listOfCountries = [
            'Afghanistan', 'Albania', 'Australia', 'Belgium', 'Brazil',
            'Canada', 'Czech', 'Finland', 'Greece', 'Hungary', 'India', 'Iran',
            'Ireland', 'Israel', 'Italy', 'Latvia', 'Norway', 'Poland',
            'Romania', 'Russia', 'Slovenia', 'Somalia', 'Spain', 'Sweden',
            'Turkey', 'UK', 'Ukraine', 'USA'
        ]
        for country in listOfCountries:
            self.nameLists[country] = {}
            (self.nameLists[country]['male'],
             self.nameLists[country]['female']) = loadData(country, self.dataPath, hasHeader=False)
        '''Exceptions (approximations)'''
        #malesFrance, femalesFrance = loadData('Wallonia', self.dataPath, False)
        #self.nameLists['France'] = {}
        #self.nameLists['France']['male'] 	= malesFrance
        #self.nameLists['France']['female'] 	= femalesFrance

        malesNL, femalesNL = loadData('Frisia', self.dataPath, False)
        self.nameLists['The Netherlands'] = {}
        self.nameLists['The Netherlands']['male'] = malesNL
        self.nameLists['The Netherlands']['female'] = femalesNL
        '''Black list of first names'''
        self.blackList = [
            'The', 'the', 'nil', 'Nil', 'NULL', 'null', 'stack', 'cache',
            'queue', 'core', 'linux', 'Net', 'stillo', 'alfa', 'beta',
            'testing', 'me'
        ]
        '''Gender-specific words'''
        self.maleWords = [
            'Mr.', 'mr.', 'Mr', 'mr', 'Sir', 'sir', 'Captain', 'captain',
            'wizard', 'warrior', 'hillbilly', 'beer', 'Mister', 'Lord', 'Duke',
            'Baron', 'coolguy'
        ]
        self.femaleWords = ['girl', 'grrl', 'grrrl', 'miss', 'Miss', 'Mrs.']
        '''Suffixes'''
        self.suffixes = {}

        self.suffixes['Russia'] = {}
        self.suffixes['Russia']['male'] = {}
        self.suffixes['Russia']['male']['include'] = [
            'ov', 'ev', 'sky', 'skiy', 'iy', 'uy', 'oy', 'skij', 'ij', 'uj',
            'oj', 'off'
        ]
        '''in/yn excluded due to B-Rain and Earwin'''
        self.suffixes['Russia']['male']['exclude'] = [
            'Liubov', 'Ljubov', 'Lyubov', 'boy', 'Boy', 'toy', 'Toy', 'dev',
            'Dev'
        ]
        '''['Iakov','Jakov','Yakov','dev','Dev','Lev','boy','Boy','toy','Toy']'''
        self.suffixes['Russia']['female'] = {}
        self.suffixes['Russia']['female']['include'] = [
            'ova', 'eva', 'skaya', 'aya', 'eya', 'oya', 'iaya'
        ]
        self.suffixes['Russia']['female']['exclude'] = {}

        self.suffixes['Belarus'] = self.suffixes['Russia']
        self.suffixes['Ukraine'] = self.suffixes['Russia']
        self.suffixes['Turkmenistan'] = self.suffixes['Russia']
        self.suffixes['Kyrgyzstan'] = self.suffixes['Russia']
        self.suffixes['Tajikistan'] = self.suffixes['Russia']
        self.suffixes['Kazakhstan'] = self.suffixes['Russia']
        self.suffixes['Uzbekistan'] = self.suffixes['Russia']
        self.suffixes['Azerbaijan'] = self.suffixes['Russia']
        self.suffixes['Bulgaria'] = self.suffixes['Russia']

        self.suffixes['Macedonia (FYROM)'] = {}
        self.suffixes['Macedonia (FYROM)']['male'] = {}
        self.suffixes['Macedonia (FYROM)']['male']['include'] = [
            'ov', 'ev', 'ski', 'evsk'
        ]
        self.suffixes['Macedonia (FYROM)']['male']['exclude'] = [
            'Iakov', 'Jakov', 'Yakov', 'dev', 'Dev', 'Lev', 'boy', 'Boy',
            'toy', 'Toy'
        ]
        self.suffixes['Macedonia (FYROM)']['female'] = {}
        self.suffixes['Macedonia (FYROM)']['female']['include'] = [
            'ova', 'eva', 'ska', 'evska'
        ]
        self.suffixes['Macedonia (FYROM)']['female']['exclude'] = {}

        self.suffixes['Poland'] = {}
        self.suffixes['Poland']['male'] = {}
        self.suffixes['Poland']['male']['include'] = [
            'ski', 'sky', 'cki', 'cky'
        ]
        self.suffixes['Poland']['male']['exclude'] = {}
        self.suffixes['Poland']['female'] = {}
        self.suffixes['Poland']['female']['include'] = ['cka']
        '''-ska is not included because of Polska = Poland which might be confusing'''
        self.suffixes['Poland']['female']['exclude'] = {}

        self.suffixes['Czech Republic'] = {}
        self.suffixes['Czech Republic']['male'] = {}
        self.suffixes['Czech Republic']['male']['include'] = [
            'ov', 'sky', 'ny'
        ]
        '''ASCII forms only; the accented variants would be ský, ný'''
        self.suffixes['Czech Republic']['male']['exclude'] = {}
        self.suffixes['Czech Republic']['female'] = {}
        self.suffixes['Czech Republic']['female']['include'] = [
            'ova', 'ska', 'na'
        ]
        '''ASCII forms only; the accented variants would be ová, ská, ná'''
        self.suffixes['Czech Republic']['female']['exclude'] = {}
        '''Male Latvian personal and family names typically end in -s (-š). Some may be derived
        from Russian names, with an -s ending, e.g. Vladislavs KAZANOVS.
        Only Russian forms are included since we cannot distinguish the regular Latvian -s from the English plural -s.'''

        self.suffixes['Latvia'] = {}
        self.suffixes['Latvia']['male'] = {}
        self.suffixes['Latvia']['male']['include'] = [u'š', 'ovs', 'ins']
        self.suffixes['Latvia']['male']['exclude'] = {}
        self.suffixes['Latvia']['female'] = {}
        self.suffixes['Latvia']['female']['include'] = ['ina']
        self.suffixes['Latvia']['female']['exclude'] = {}

        self.suffixes['Lithuania'] = {}
        self.suffixes['Lithuania']['male'] = {}
        self.suffixes['Lithuania']['male']['include'] = [
            'aitis', 'utis', 'ytis', 'enas', 'unas', 'inis', 'ynis', 'onis',
            'ius', 'elis'
        ]
        self.suffixes['Lithuania']['male']['exclude'] = {}
        self.suffixes['Lithuania']['female'] = {}
        self.suffixes['Lithuania']['female']['include'] = [
            'iene', 'aite', 'yte', 'ute', 'te'
        ]
        self.suffixes['Lithuania']['female']['exclude'] = {}
        '''All inverse order countries should also be checked for direct order'''
        self.invOrder = [
            'Russia', 'Belarus', 'Ukraine', 'Turkmenistan', 'Kyrgyzstan',
            'Tajikistan', 'Kazakhstan', 'Uzbekistan', 'Azerbaijan', 'Hungary',
            'China', 'Bosnia', 'Serbia', 'Croatia', 'Sri Lanka', 'Vietnam',
            'North Korea', 'South Korea'
        ]
        '''Diminutives list'''
        fd = open(os.path.join(self.dataPath, 'diminutives.csv'), 'rb')
        reader = UnicodeReader(fd)
        self.diminutives = {}
        for row in reader:
            mainName = row[0].strip().lower()
            for diminutive in row[1:]:
                self.diminutives.setdefault(diminutive, set()).add(mainName)
        '''Distribution of StackOverflow users per different countries'''
        fd = open(os.path.join(self.dataPath, 'countryStats.csv'), 'rb')
        reader = UnicodeReader(fd)
        self.countryStats = {}
        total = 0.0
        for row in reader:
            country = row[0]
            numUsers = float(row[1])
            total += numUsers
            self.countryStats[country] = numUsers
        for country in self.countryStats.keys():
            self.countryStats[country] = self.countryStats[country] / total
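
    # A minimal sketch (not part of the original class) of how the suffix tables
    # above could be consulted; the method name and the exact matching rules are
    # assumptions, not the original implementation.
    def suffixGender(self, lastName, country):
        if country not in self.suffixes:
            return None
        for gender in ['female', 'male']:
            rules = self.suffixes[country][gender]
            # Skip this gender if the surname ends in an excluded form
            if any(lastName.endswith(excl) for excl in rules['exclude']):
                continue
            if any(lastName.endswith(suffix) for suffix in rules['include']):
                return gender
        return None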
Example #4
from unidecode import unidecode
from nameMap import nameMap

dataPath = os.path.abspath("../../../data")

# This is the list of DBLP author names (>1.1M people)
# 335078;M. G. J. van den Brand, Mark G. J. van den Brand, Mark van den Brand
f = open(os.path.join(dataPath, "dblp-author-aliases-stripped.csv"), "rb")
reader1 = UnicodeReader(f)

# Read the list into a map
# reverseLookup['M. G. J. van den Brand']
#     = reverseLookup['Mark G. J. van den Brand']
#     = reverseLookup['Mark van den Brand']
#     = 335078
reverseLookup = MyDict()
# Choose a unique spelling for each name
# directLookup['M. G. J. van den Brand']
#     = directLookup['Mark G. J. van den Brand']
#     = directLookup['Mark van den Brand']
#     = 'Mark van den Brand'
directLookup = MyDict()
for row in reader1:
    aid = int(row[0])
    aliases = [name.strip() for name in row[1].split(',')]
    for name in aliases:
        reverseLookup[name] = aid
    directLookup[aid] = aliases[-1]


# Normalizes a name using the different aliases
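
# The helper announced above is not included in this excerpt; a minimal sketch of
# what it might look like, assuming any known alias is mapped to its canonical
# DBLP spelling and unknown names are returned unchanged.
def normaliseName(name):
    try:
        return directLookup[reverseLookup[name.strip()]]
    except:
        return name.strip()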
    def runTesting(self,
                   dataPath='../data/icsmData',
                   resultsPath='../data/icsmResults',
                   nrIterations=10,
                   nrProcesses=1):
        #For each iteration, load the file with the results for each parameter combination. (../data/icsmResults/resultsTraining_0.csv)
        #Grab the parameter combination with the highest f-measure.
        #Run the LSA algorithm on all testing sets for this iteration with the selected parameter combination. (for 0 <= i <= 9 :: ../data/icsmData/test_0_i.csv)
        #Write the results of the testing to file. (../data/icsmResults/resultsTesting_0.csv)

        #Make sure the resultsPath exists. If not, create it.
        if not os.path.exists(resultsPath):
            os.makedirs(resultsPath)
        #Similar for the resultsPath/testing directory, as training and testing is separated.
        if not os.path.exists(resultsPath + '/testing'):
            os.makedirs(resultsPath + '/testing')

        #Load the oracle for the data
        aliasToIdName = MyDict(os.path.join(dataPath,
                                            'aliasToIdNameUTF8.dict'))

        #Loop all iterations
        for itIdx in range(nrIterations):

            resultList = []

            #Load the data
            f = open(
                os.path.join(resultsPath, 'training',
                             'resultsTraining_%d.csv' % itIdx), 'rb')
            reader = csv.reader(f, delimiter=';')
            for row in reader:
                header = row  #skip header
                break
            for row in reader:
                #levThr;minLen;k;cosThr;tp;fp;fn;tn;precision;recall;f
                levThr = row[0]
                minLen = row[1]
                k = row[2]
                cosThr = row[3]
                tp = row[4]
                fp = row[5]
                fn = row[6]
                tn = row[7]
                precision = row[8]
                recall = row[9]
                fMeasure = row[10]

                resultList.append((levThr, minLen, k, cosThr, tp, fp, fn, tn,
                                   precision, recall, fMeasure))
            f.close()

            #Order results by f-measure
            resultList = sorted(resultList,
                                key=lambda tuple: -float(tuple[10]))

            #Grab the first record, containing the best parameters.
            bestLevThr = float(resultList[0][0])
            bestMinLen = int(resultList[0][1])
            bestK = float(resultList[0][2])
            bestCosThr = float(resultList[0][3])

            g = open(
                os.path.join(resultsPath, 'testing',
                             'resultsTesting_%d.csv' % itIdx), 'wb')
            writer = csv.writer(g, delimiter=';')
            writer.writerow([
                'levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp', 'fn', 'tn',
                'precision', 'recall', 'f'
            ])

            parameters = {}
            parameters["levenshteinSimRatio"] = bestLevThr
            parameters["minLength"] = bestMinLen
            parameters["rankReductionRatio"] = bestK
            parameters["cosineSimRatio"] = bestCosThr

            #Run the LSA algorithm on all testing subsets for this iteration
            for i in range(10):

                #Read the data from the testing subset
                nameEmailData = MyDict()
                f = open(os.path.join(dataPath, 'test_%d_%d.csv' % (itIdx, i)),
                         'rb')
                reader = UnicodeReader(f)
                idx = 0
                for row in reader:
                    try:
                        alias = row[0]
                        email = unspam(row[1])
                        nameEmailData[idx] = (alias, email)
                    except:
                        print row
                    idx += 1
                f.close()

                lsaAlgo = LSAAlgo(
                    nameEmailData,
                    parameters,
                    dataDir=resultsPath,
                    dataSaveDir='testing',
                    resultsFileName='results_%d_%d_preoracle.csv' % (itIdx, i),
                    resultsHumanReadable=False,
                    numberOfProcesses=nrProcesses,
                    runProfiler=False,
                    profilerOutputDir='profilerOutput',
                    gensimLogging=False,
                    progressLogging=False)
                lsaAlgo.run()

                #Compute the oracle to verify results
                oracle = computeOracle(nameEmailData, aliasToIdName)

                #Now check the results using the oracle
                [tp, fp, fn, tn, precision, recall,
                 fmeasure] = self.computeResults(
                     os.path.join(resultsPath, 'testing'),
                     'results_%d_%d_preoracle.csv' % (itIdx, i),
                     'results_%d_%d.csv' % (itIdx, i), oracle, nameEmailData)

                writer.writerow([
                    bestLevThr, bestMinLen, bestK, bestCosThr, tp, fp, fn, tn,
                    precision, recall, fmeasure
                ])

                print 'Done computing results on iteration %d, subset %d' % (
                    itIdx, i)

            g.close()
    def computeResults(self, dataPath, filename, outfilename, oracle,
                       nameEmailData):

        from gensim import models, similarities, matutils, corpora, utils

        if os.path.exists(os.path.join(dataPath, filename)):

            indices = range(len(nameEmailData.keys()))

            documents = MyDict(os.path.join(dataPath, 'documents.dict'))
            # input data index -> document index
            directLookup = MyDict(os.path.join(dataPath, 'directLookup.dict'))
            index = similarities.docsim.Similarity.load(
                os.path.join(dataPath, 'index'))

            # Initial values
            tp = 0.0
            fp = 0.0
            fn = 0.0
            tn = 0.0

            matchedTuples = set()

            # Write all matched indexes to a set
            f = open(os.path.join(dataPath, filename), 'rb')
            reader = UnicodeReader(f)

            g = open(os.path.join(dataPath, outfilename), 'wb')
            writer = csv.writer(g, delimiter=';')
            for row in reader:
                header = row  #Ignore header
                header.append('kind')
                writer.writerow(header)
                break
            for row in reader:

                # Put the smallest value first.
                if int(row[0]) <= int(row[1]):
                    idx1 = int(row[0])
                    idx2 = int(row[1])
                else:
                    idx2 = int(row[0])
                    idx1 = int(row[1])

                matchedTuples.add((idx1, idx2))
                if oracle[(idx1, idx2)] == 1:
                    row.append('tp')
                else:
                    row.append('fp')
                writer.writerow(row)
            f.close()

            #Iterate all combinations to compute all tp, fp, fn, tn
            for idx1, idx2 in itertools.combinations(indices, 2):

                docId1 = directLookup[idx1]
                docId2 = directLookup[idx2]

                if (idx1, idx2) in matchedTuples:  # The tuple was matched by the algorithm
                    # tp or fp
                    if oracle[(idx1, idx2)] == 1:  #Correctly matched
                        tp += 1
                    else:  #Incorrectly matched
                        fp += 1
                else:  #The tuple was not matched by the algorithm
                    if oracle[(idx1, idx2)] == 1:  #It should have been matched
                        fn += 1
                        # Add the fn to the results file
                        writer.writerow([
                            str(idx1),
                            str(idx2), nameEmailData[idx1],
                            nameEmailData[idx2], documents[docId1],
                            documents[docId2],
                            index.similarity_by_id(docId1)[docId2], 'fn'
                        ])
                    else:  #Did not match correctly
                        tn += 1
            g.close()
            os.remove(os.path.join(dataPath, filename))

            #Add reflexive results to the tp's :-)
            for idx in indices:
                tp += 1

            try:
                precision = tp / (tp + fp)
            except:
                precision = 0.0

            try:
                recall = tp / (tp + fn)
            except:
                recall = 0.0

            try:
                fmeasure = 2 * precision * recall / (precision + recall)
            except:
                fmeasure = 0.0

            return [tp, fp, fn, tn, precision, recall, fmeasure]
        else:
            return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    def runTraining(self,
                    dataPath='../data/icsmData',
                    resultsPath='../data/icsmResults',
                    levThrRange=[0.7, 0.8, 0.9],
                    minLenRange=[2, 3, 4],
                    kRange=[0.9, 0.95, 1.0],
                    cosThrRange=[0.7, 0.75, 0.8, 0.85],
                    nrIterations=10,
                    nrProcesses=1):
        #Load iteration of the training set. (../data/icsmData/training_0.csv)
        #Run this iteration on all parameter combinations.
        #Compare to oracle and augment the output file from the LSA algorithm.
        #Write the results from each parameter combination, including precision, recall and f-measure. (../data/icsmResults/resultsTraining_0.csv)

        #Make sure the resultsPath exists. If not, create it.
        if not os.path.exists(resultsPath):
            os.makedirs(resultsPath)
        #Similar for the resultsPath/training directory, as training and testing is separated.
        if not os.path.exists(resultsPath + '/training'):
            os.makedirs(resultsPath + '/training')

        #Loop all iterations
        for itIdx in range(nrIterations):

            #Load the data
            nameEmailData = MyDict()
            f = open(os.path.join(dataPath, 'training_%d.csv' % itIdx), 'rb')
            reader = UnicodeReader(f)
            idx = 0
            for row in reader:
                try:
                    alias = row[0]
                    email = unspam(row[1])
                    nameEmailData[idx] = (alias, email)
                except:
                    print row
                idx += 1
            f.close()

            #Load the oracle for the data
            aliasToIdName = MyDict(
                os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))
            oracle = computeOracle(nameEmailData, aliasToIdName)

            nrRuns = len(minLenRange) * len(kRange) * len(levThrRange) * len(
                cosThrRange)

            g = open(
                os.path.join(resultsPath, 'training',
                             'resultsTraining_%d.csv' % itIdx), 'wb')
            writer = csv.writer(g, delimiter=';')
            writer.writerow([
                'levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp', 'fn', 'tn',
                'precision', 'recall', 'f'
            ])

            run = 0
            for levThr in levThrRange:
                for minLen in minLenRange:
                    for k in kRange:
                        for cosThr in cosThrRange:
                            #Load the parameters
                            parameters = {}
                            parameters["levenshteinSimRatio"] = levThr
                            parameters["minLength"] = minLen
                            parameters["cosineSimRatio"] = cosThr
                            parameters["rankReductionRatio"] = k

                            #Run the LSA algorithm on these parameters
                            lsaAlgo = LSAAlgo(
                                nameEmailData,
                                parameters,
                                dataDir=resultsPath,
                                dataSaveDir='training',
                                resultsFileName=
                                'results_%d_%.2f_%d_%.2f_%.2f_preoracle.csv' %
                                (itIdx, levThr, minLen, k, cosThr),
                                resultsHumanReadable=False,
                                numberOfProcesses=nrProcesses,
                                runProfiler=False,
                                profilerOutputDir='profilerOutput',
                                gensimLogging=False,
                                progressLogging=False)
                            lsaAlgo.run()

                            #Now check the results using the oracle
                            [tp, fp, fn, tn, precision, recall,
                             fmeasure] = self.computeResults(
                                 os.path.join(resultsPath, 'training'),
                                 'results_%d_%.2f_%d_%.2f_%.2f_preoracle.csv' %
                                 (itIdx, levThr, minLen, k, cosThr),
                                 'results_%d_%.2f_%d_%.2f_%.2f.csv' %
                                 (itIdx, levThr, minLen, k, cosThr), oracle,
                                 nameEmailData)

                            writer.writerow([
                                levThr, minLen, k, cosThr, tp, fp, fn, tn,
                                precision, recall, fmeasure
                            ])

                            run += 1
                            print 'Run %d out of %d in iteration %d...' % (
                                run, nrRuns, itIdx)

            g.close()
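
# Hypothetical driver (not in the original excerpt): train to select the best
# parameter combination per iteration, then evaluate it on the held-out test
# subsets. The class name is an assumption; runTraining and runTesting are the
# methods defined above.
if __name__ == '__main__':
    experiment = LSAExperiment()  # hypothetical name for the enclosing class
    experiment.runTraining(nrIterations=10, nrProcesses=4)
    experiment.runTesting(nrIterations=10, nrProcesses=4)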

Example #9
	def run(self):
		## Decode parameters
		levThr  = self.parameters["levenshteinSimRatio"] # threshold for Levenshtein similarity
		minLen  = self.parameters["minLength"] # threshold for min length of a term
		simThr	= self.parameters["cosineSimRatio"] # threshold for cosine similarity between documents
		k		= self.parameters["rankReductionRatio"] # dimensionality reduction ratio (0 < k <= 1)
		
#		dataIndices = self.nameEmailData.keys()
		
		"""I will iteratively build and populate the document-term matrix."""
		runningColIdx = 0
		runningRowIdx = 0
		
		## Lookup tables mapping between indices in the input data and document (column) indices in the doc-term matrix
		reverseLookup = MyDict()
		directLookup = MyDict()
		
		documents = MyDict() # Dictionary of documents (key = column index)
		terms = MyDict() # Dictionary of terms (key = row index)
		allTerms = set() # Set of all terms
		
		termToColumnIndex = MyDict()
		termToRowIndex = MyDict()
		
		start = clock()
		emailToColumnIndex = MyDict()
		for idx in self.nameEmailData.keys():
			addEmailPrefix = True # default: add the email prefix to the document string, unless the address is a known exception (e.g. a mailing list)
			
			(name, email) = self.nameEmailData[idx] # the input data contains (name,email) tuples
			#Easy merge: different names used by the same email address.
			#Exceptions: mailing list email addresses.
			if email not in self.emailExceptions:
				#If email address is personal, then:
				#- If I have seen it before, retrieve the column (document) to which it corresponds.
				#- If I see it for the first time, assign it a new column (document).
				try:
					columnIdx = emailToColumnIndex[email]
				except:
					columnIdx = runningColIdx
					emailToColumnIndex[email] = columnIdx
					runningColIdx += 1
			else:
				#If email address is not personal, directly assign it a new column (document).
				columnIdx = runningColIdx
				runningColIdx += 1
				addEmailPrefix = False # remember not to add email prefix to document string
				
			#Create lookup tables to move from input data to doc-term matrix and back.
			directLookup[idx] = columnIdx # from index in the input data to index of document (column) in doc-term matrix
			reverseLookup.append(columnIdx, idx) # from document index (in doc-term matrix) to index in the input data
			
			#Add name parts to document string if long enough and not consisting of digits, after normalization.
			docString = [namePart for namePart in normalize(name) if len(namePart) >= minLen and not namePart.isdigit()]
			
			if addEmailPrefix:
				#Add the whole email address prefix to the document string, if long enough, after normalization.
				prefix = email.split('@')[0].split('+')[0]
				docString += [prefixPart for prefixPart in normalize(prefix) if len(prefixPart) >= minLen and not prefixPart.isdigit()]
				#Split email address prefix on dot, and add each part if long enough and after normalization.
				prefixParts = prefix.split('.')
				for prefixPart in prefixParts:
					docString += [p for p in normalize(prefixPart) if len(p) >= minLen and not p.isdigit()]
			
			#The document string is a set of terms. Figure out to which rows in the doc-term matrix the terms correspond.
			for term in docString:
				try:
					#If I've seen the term before, retrieve row index.
					rowIdx = termToRowIndex[term]
				except:
					#If first time I see the term, assign new row.
					rowIdx = runningRowIdx
					termToRowIndex[term] = rowIdx
					runningRowIdx += 1
				#Keep track of which term is at which row index.
				terms[rowIdx] = term
			
			#Update the document at current column index with new document string.
			try:
				documents[columnIdx].update(docString)
			except:
				documents[columnIdx] = set() # if here then it's the first occurrence of this email address (document)
				documents[columnIdx].update(docString)
				
			#Keep an updated set of terms (from all documents).
			allTerms.update(docString)
			
			#Bookkeeping: keep track of the documents in which a term appears.
			for term in set(docString):
				try:
					termToColumnIndex[term].add(columnIdx)
				except:
					termToColumnIndex[term] = set([columnIdx])
		end = clock()
				
		numDocuments = len(documents.keys())
		numTerms = len(allTerms)
		if self.progressLogging:
			print "Initial pass: %6.3f seconds (%d terms, %d documents)" % ((end - start), numTerms, numDocuments)
		
		#Make sure that dataDir/saveDataDir exists.
		if not os.path.exists('%s/%s' % (self.dataDir, self.dataSaveDir)):
			os.makedirs('%s/%s' % (self.dataDir, self.dataSaveDir))
		#Make sure that dataDir/saveDataDir/dataProcessingDir exists.
		if not os.path.exists('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir)):
			os.makedirs('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir))
			
		if self.runProfiler:
			#Make sure that dataDir/saveDataDir/profilerOutputDir exists.
			if not os.path.exists('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir)):
				os.makedirs('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
				
		#Save the documents MyDict and directLookup MyDict for later reference
		documents.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.documentsFileName))
		directLookup.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.directLookupFileName))
		reverseLookup.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.reverseLookupFileName))
		
		#Build the dictionary and corpus on disk from the documents.
		texts = [documents[key] for key in documents.keys()]
		dictionary = corpora.Dictionary(texts) # Dictionary: {'nalley': 5649, 'francesco': 4634, 'caisse': 3381, 'gabrielle': 1097, ...} :: (term: id)
		dictionary.save('%s/%s/%s.dict' % (self.dataDir, self.dataSaveDir, self.dictionaryFileName)) # Save dictionary
		corpus = [dictionary.doc2bow(text) for text in texts] # Corpus: [[(0, 1), (1, 1), (2, 1)], [(3, 1), (4, 1)], [(5, 1), (6, 1)], ...] :: (id, count)
		corpora.MmCorpus.serialize('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.originalCorpusFileName), corpus)
		
		# Precompute levenshtein distances between all terms on all cpu cores
		if self.progressLogging:
			print 'Starting computation of levenshtein...'
		start = clock()
		if self.runProfiler:
			cProfile.runctx('self.precomputeLevenshteinDistanceBetweenAllTerms(numTerms, dictionary, levThr)', globals(), locals(), '%s/%s/%s/computeTermCorpusChunks' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
		else:
			self.precomputeLevenshteinDistanceBetweenAllTerms(numTerms, dictionary, levThr)
		end = clock()
		if self.progressLogging:
			print 'Done computing levenshtein: %6.3f seconds' % (end - start)
		
		# Merge separate MmCorpus files
		start = clock()
		if self.runProfiler:
			cProfile.runctx('self.mergeIntoTermCorpus()', globals(), locals(), '%s/%s/%s/mergeIntoTermCorpus' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
		else:
			self.mergeIntoTermCorpus()
		end = clock()
		if self.progressLogging:
			print 'Done merging corpora to termCorpus: %6.3f seconds' % (end - start)
		
		# Create levCorpus from corpus.mm and termCorpus.mm.
		start = clock()
		if self.runProfiler:
			cProfile.runctx('self.createLevenshteinCorpus()', globals(), locals(), '%s/%s/%s/createLevenshteinCorpus' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
		else:
			self.createLevenshteinCorpus()
		end = clock()
		if self.progressLogging:
			print 'Done creating levCorpus from corpus and termCorpus: %6.3f seconds' % (end - start)
		
		corpus_disk = corpora.MmCorpus('%s/%s/%s.mm' % (self.dataDir, self.dataSaveDir, self.levenshteinCorpusFileName))
		
		""" Having the document-term matrix (including Levenshtein distance) in the corpus, we apply the TFIDF model. """
		start = clock()
		tfidf = models.TfidfModel(corpus_disk)
		tfidf.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.tfidfModelFileName))
		corpus_tfidf = tfidf[corpus_disk]
		end = clock()
		if self.progressLogging:
			print "Inverse document frequency: %6.3f seconds" % (end - start)
		
		""" Next up is applying the LSI model on top of the TFIDF model. """
		start = clock()
		number_topics = int(len(corpus_disk)*k) # k is a ratio (0 < k <= 1); the topic count itself must be an integer
		try:
			#Occasionally computing the SVD fails.
			#http://projects.scipy.org/numpy/ticket/990
			lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=number_topics)
		except:
			print 'Failed to compute the LSI model. (SVD did not converge?)'
			return
		lsi.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.lsiModelFileName))
		corpus_lsi = lsi[corpus_tfidf]
		end = clock()
		if self.progressLogging:
			print "LSI model: %6.3f seconds" % (end - start)
		
		""" Finally, we run the cosine similarity on the matrix. """
		start = clock()
		index = similarities.docsim.Similarity('%s/%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.dataProcessingDir, self.similarityIndexFileName), corpus_lsi, number_topics)
		index.save('%s/%s/%s' % (self.dataDir, self.dataSaveDir, self.similarityIndexFileName))
		end = clock()
		if self.progressLogging:
			print "Similarities index: %6.3f seconds" % (end - start)
		
		"""Output results."""
		start = clock()
		if self.runProfiler:
			cProfile.runctx('self.writeResults(index, reverseLookup, documents, simThr)', globals(), locals(), '%s/%s/%s/writeResults' % (self.dataDir, self.dataSaveDir, self.profilerOutputDir))
		else:
			self.writeResults(index, reverseLookup, documents, simThr)
		end = clock()
		if self.progressLogging:
			print "Writing results: %6.3f seconds" % (end - start)
				
		return
Example #10

	# Choose dataset
	#dataset = 'aliases2'
	#dataset = 'gnome'
	dataset = 'icsm'
	
	# Choose dataset size
	datasetsize = 'full'
	#datasetsize = 'sample'
	
	nameEmailData = MyDict()
	if dataset == 'aliases2':
		f = open(os.path.join(dataPath, "aliases2.csv"), "rb")
	#	f = open(os.path.join(dataPath, "testData2.csv"), "rb")
		reader = UnicodeReader(f)
		header = reader.next()
			
		for row in reader:
			try:
				idx = int(row[0])
				name = row[1]
				email = unspam(row[2])
				nameEmailData[idx] = (name, email)
			except:
				print row
		f.close()
        year = row[0]
        authors = ','.join([
            normaliseName(name) for name in row[1].split(',')
            if len(normaliseName(name))
        ])
        title = row[2]
        writer.writerow([year, authors, title, '', '', '', ''])
    g.close()

print
soFarSoGood = set()

# "Paulo R. F. Cunha": "Paulo Cunha"
# "Neil Maiden": "Neil A. M. Maiden"
# Strip middle initials, exact match on all other name parts
uselessData = MyDict()
# for each name in the DBLP data
for key in reverseLookup.keys():
    # record a version of the name without initials
    s = " ".join(
        [p.lower() for p in key.split() if len(p) > 1 and p.find('.') == -1])
    uselessData[key] = s

# then for each of the unknowns
for name in sorted(unknowns):
    longParts = [
        p.lower() for p in name.split() if len(p) > 1 and p.find('.') == -1
    ]
    # if the name contains at least two parts of sufficient length
    if len(longParts) > 1:
        # check against each of the DBLP names
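        # Hypothetical continuation (the original excerpt stops here): compare the
        # initial-stripped form of the unknown name against every DBLP name and
        # remember plausible matches; the use of soFarSoGood is an assumption.
        candidate = ' '.join(longParts)
        for key in uselessData.keys():
            if uselessData[key] == candidate:
                soFarSoGood.add((name, key))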
Example #12
#conference = "icse"
conference = sys.argv[1]


# This is the list of DBLP author names (>1.1M people)
# 335078;M. G. J. van den Brand, Mark G. J. van den Brand, Mark van den Brand
f = open(os.path.abspath("../../../data/dblp-author-aliases-stripped.csv"), "rb")
reader1 = UnicodeReader(f)

# Read the list into a map
# reverseLookup['M. G. J. van den Brand'] 
# 	= reverseLookup['Mark G. J. van den Brand'] 
# 	= reverseLookup['Mark van den Brand']
# 	= 335078
reverseLookup = MyDict()
for row in reader1:
    aid = int(row[0])
    aliases = [name.strip() for name in row[1].split(',')]
    for name in aliases:
        reverseLookup[name] = aid

# Read names of conference PC members
# There are two cases:
# 1. Somebody appears in the DBLP data, with the same spelling (not interesting)
# 2. Somebody does not appear in the DBLP data, or his/her name has a different spelling
g = open(os.path.abspath("../../../data/pc/%s.csv" % conference), "rb")
reader2 = UnicodeReader(g)

# Record all conference PC members whose names DO NOT match DBLP
unknowns = set()
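
# Hypothetical continuation (the original excerpt stops here): record every PC
# member whose name does not appear verbatim in the DBLP alias map. The CSV
# layout (member name in the first column) is an assumption.
for row in reader2:
    name = row[0].strip()
    try:
        reverseLookup[name]
    except:
        unknowns.add(name)
g.close()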