def runTesting(self,
               dataPath='../data/icsmData',
               resultsPath='../data/icsmResults',
               nrIterations=10,
               nrProcesses=1,
               nrSubsets=10):
        """Run LSA on the testing subsets with the best training parameters.

        For every iteration ``it`` in ``range(nrIterations)`` this method:

        1. reads ``<resultsPath>/training/resultsTraining_<it>.csv`` and
           selects the row with the highest f-measure (the first such row
           when several rows tie);
        2. runs ``LSAAlgo`` with that row's parameters on each testing subset
           ``<dataPath>/test_<it>_<i>.csv`` for ``i`` in ``range(nrSubsets)``;
        3. scores each run through ``self.computeResults`` using an oracle
           built by ``computeOracle``;
        4. writes one row per subset to
           ``<resultsPath>/testing/resultsTesting_<it>.csv``.

        Args:
            dataPath: directory holding the testing subsets and
                ``aliasToIdNameUTF8.dict``.
            resultsPath: directory holding the ``training`` results; the
                ``testing`` sub-directory is created below it when missing.
            nrIterations: number of iterations to process.
            nrProcesses: forwarded to ``LSAAlgo`` as ``numberOfProcesses``.
            nrSubsets: number of testing subsets per iteration. Defaults to
                10, the value that was previously hard-coded.

        Raises:
            IndexError: if a training results file contains no data rows, or
                a data row has fewer than 11 columns.
        """
        #Make sure resultsPath/testing exists. os.makedirs also creates
        #resultsPath itself when that is missing.
        testingDir = os.path.join(resultsPath, 'testing')
        if not os.path.exists(testingDir):
            os.makedirs(testingDir)

        #Load the oracle dictionary once; it is shared by all iterations.
        aliasToIdName = MyDict(
            os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))

        for itIdx in range(nrIterations):

            #Load the training results for this iteration.
            #Columns: levThr;minLen;k;cosThr;tp;fp;fn;tn;precision;recall;f
            trainingResultsFile = os.path.join(
                resultsPath, 'training', 'resultsTraining_%d.csv' % itIdx)
            #Binary mode is what the Python 2 csv module expects.
            with open(trainingResultsFile, 'rb') as f:
                reader = csv.reader(f, delimiter=';')
                next(reader, None)  #skip header
                #Blank lines come back as empty rows; ignore them.
                resultList = [row for row in reader if row]

            if not resultList:
                raise IndexError('No training results found in %s' %
                                 trainingResultsFile)

            #Order results by descending f-measure (column 10). The sort is
            #stable, so the first row wins among equal f-measures.
            resultList.sort(key=lambda result: -float(result[10]))

            #Grab the first record, containing the best parameters.
            best = resultList[0]
            bestLevThr = float(best[0])
            bestMinLen = int(best[1])
            bestK = float(best[2])
            bestCosThr = float(best[3])

            parameters = {
                "levenshteinSimRatio": bestLevThr,
                "minLength": bestMinLen,
                "rankReductionRatio": bestK,
                "cosineSimRatio": bestCosThr,
            }

            testingResultsFile = os.path.join(
                testingDir, 'resultsTesting_%d.csv' % itIdx)
            with open(testingResultsFile, 'wb') as g:
                writer = csv.writer(g, delimiter=';')
                writer.writerow([
                    'levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp', 'fn', 'tn',
                    'precision', 'recall', 'f'
                ])

                #Run the LSA algorithm on all testing subsets for this iteration
                for i in range(nrSubsets):

                    #Read the data from the testing subset
                    nameEmailData = MyDict()
                    subsetFile = os.path.join(
                        dataPath, 'test_%d_%d.csv' % (itIdx, i))
                    with open(subsetFile, 'rb') as f:
                        #idx counts every row, including rejected ones.
                        for idx, row in enumerate(UnicodeReader(f)):
                            try:
                                alias = row[0]
                                email = unspam(row[1])
                                nameEmailData[idx] = (alias, email)
                            except Exception:
                                #Best effort: report the offending row and
                                #carry on with the remaining ones.
                                print(row)

                    preOracleName = 'results_%d_%d_preoracle.csv' % (itIdx, i)
                    lsaAlgo = LSAAlgo(
                        nameEmailData,
                        parameters,
                        dataDir=resultsPath,
                        dataSaveDir='testing',
                        resultsFileName=preOracleName,
                        resultsHumanReadable=False,
                        numberOfProcesses=nrProcesses,
                        runProfiler=False,
                        profilerOutputDir='profilerOutput',
                        gensimLogging=False,
                        progressLogging=False)
                    lsaAlgo.run()

                    #Compute the oracle to verify results
                    oracle = computeOracle(nameEmailData, aliasToIdName)

                    #Now check the results using the oracle
                    [tp, fp, fn, tn, precision, recall,
                     fmeasure] = self.computeResults(
                         testingDir, preOracleName,
                         'results_%d_%d.csv' % (itIdx, i), oracle,
                         nameEmailData)

                    writer.writerow([
                        bestLevThr, bestMinLen, bestK, bestCosThr, tp, fp, fn,
                        tn, precision, recall, fmeasure
                    ])

                    print('Done computing results on iteration %d, subset %d'
                          % (itIdx, i))
Ejemplo n.º 2
0
	def runTesting(self, dataPath='../data/icsmData', resultsPath='../data/icsmResults', nrIterations=10, nrProcesses=1):
		"""Run LSA on the "worse" testing set with the best training parameters.

		For every iteration ``it`` in ``range(nrIterations)`` this method:

		1. reads ``<resultsPath>/trainingWorse/resultsTrainingWorse_<it>.csv``
		   and selects the row with the highest f-measure (the first such row
		   when several rows tie);
		2. runs ``LSAAlgo`` with that row's parameters on
		   ``<dataPath>/testWorse_<it>.csv``;
		3. scores the run through ``self.computeResults`` using an oracle
		   built by ``computeOracle``;
		4. writes the resulting row to
		   ``<resultsPath>/testingWorse/resultsTestingWorse_<it>.csv``.

		Args:
		    dataPath: directory holding the testing sets and
		        ``aliasToIdNameUTF8.dict``.
		    resultsPath: directory holding the ``trainingWorse`` results; the
		        ``testingWorse`` sub-directory is created below it when missing.
		    nrIterations: number of iterations to process.
		    nrProcesses: forwarded to ``LSAAlgo`` as ``numberOfProcesses``.

		Raises:
		    IndexError: if a training results file contains no data rows, or
		        a data row has fewer than 11 columns.
		"""
		#Make sure resultsPath/testingWorse exists. os.makedirs also creates
		#resultsPath itself when that is missing.
		testingDir = os.path.join(resultsPath, 'testingWorse')
		if not os.path.exists(testingDir):
			os.makedirs(testingDir)

		#Load the oracle dictionary once; it is shared by all iterations.
		aliasToIdName = MyDict(os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))

		for itIdx in range(nrIterations):

			#Load the training results for this iteration.
			#Columns: levThr;minLen;k;cosThr;tp;fp;fn;tn;precision;recall;f
			trainingResultsFile = os.path.join(resultsPath, 'trainingWorse', 'resultsTrainingWorse_%d.csv' % itIdx)
			#Binary mode is what the Python 2 csv module expects.
			with open(trainingResultsFile, 'rb') as f:
				reader = csv.reader(f, delimiter=';')
				next(reader, None) #skip header
				#Blank lines come back as empty rows; ignore them.
				resultList = [row for row in reader if row]

			if not resultList:
				raise IndexError('No training results found in %s' % trainingResultsFile)

			#Order results by descending f-measure (column 10). The sort is
			#stable, so the first row wins among equal f-measures.
			resultList.sort(key=lambda result: -float(result[10]))

			#Grab the first record, containing the best parameters.
			best = resultList[0]
			bestLevThr = float(best[0])
			bestMinLen = int(best[1])
			bestK = float(best[2])
			bestCosThr = float(best[3])

			parameters = {
				"levenshteinSimRatio": bestLevThr,
				"minLength": bestMinLen,
				"rankReductionRatio": bestK,
				"cosineSimRatio": bestCosThr,
			}

			testingResultsFile = os.path.join(testingDir, 'resultsTestingWorse_%d.csv' % itIdx)
			with open(testingResultsFile, 'wb') as g:
				writer = csv.writer(g, delimiter=';')
				writer.writerow(['levThr','minLen','k','cosThr','tp','fp','fn','tn','precision','recall','f'])

				#Read the data from the testing set
				nameEmailData = MyDict()
				with open(os.path.join(dataPath, 'testWorse_%d.csv' % (itIdx)), 'rb') as f:
					#idx counts every row, including rejected ones.
					for idx, row in enumerate(UnicodeReader(f)):
						try:
							alias = row[0]
							email = unspam(row[1])
							nameEmailData[idx] = (alias, email)
						except Exception:
							#Best effort: report the offending row and
							#carry on with the remaining ones.
							print(row)

				preOracleName = 'resultsWorse_%d_preoracle.csv' % (itIdx)
				lsaAlgo = LSAAlgo(
					nameEmailData,
					parameters,
					dataDir=resultsPath,
					dataSaveDir='testingWorse',
					resultsFileName=preOracleName,
					resultsHumanReadable=False,
					numberOfProcesses=nrProcesses,
					runProfiler=False,
					profilerOutputDir='profilerOutput',
					gensimLogging=False,
					progressLogging=False)
				lsaAlgo.run()

				#Compute the oracle to verify results
				oracle = computeOracle(nameEmailData, aliasToIdName)

				#Now check the results using the oracle
				[tp, fp, fn, tn, precision, recall, fmeasure] = self.computeResults(testingDir, preOracleName, 'resultsWorse_%d.csv' % (itIdx), oracle, nameEmailData)

				writer.writerow([bestLevThr,bestMinLen,bestK,bestCosThr,tp,fp,fn,tn,precision,recall,fmeasure])

				print('Done computing results on iteration %d' % (itIdx))
    def runTraining(self,
                    dataPath='../data/icsmData',
                    resultsPath='../data/icsmResults',
                    levThrRange=[0.7, 0.8, 0.9],
                    minLenRange=[2, 3, 4],
                    kRange=[0.9, 0.95, 1.0],
                    cosThrRange=[0.7, 0.75, 0.8, 0.85],
                    nrIterations=10,
                    nrProcesses=1):
        """Grid-search the LSA parameters on every training iteration.

        For every iteration ``it`` in ``range(nrIterations)`` this method:

        1. loads ``<dataPath>/training_<it>.csv`` into a ``MyDict`` mapping
           row index to ``(alias, email)``;
        2. runs ``LSAAlgo`` once per combination of ``levThrRange`` x
           ``minLenRange`` x ``kRange`` x ``cosThrRange`` (rightmost range
           varying fastest);
        3. scores each run through ``self.computeResults`` using an oracle
           built by ``computeOracle``;
        4. writes one row per combination to
           ``<resultsPath>/training/resultsTraining_<it>.csv``.

        Args:
            dataPath: directory holding the training sets and
                ``aliasToIdNameUTF8.dict``.
            resultsPath: output directory; the ``training`` sub-directory is
                created below it when missing.
            levThrRange: values tried for ``levenshteinSimRatio``.
            minLenRange: values tried for ``minLength``.
            kRange: values tried for ``rankReductionRatio``.
            cosThrRange: values tried for ``cosineSimRatio``.
            nrIterations: number of training iterations to process.
            nrProcesses: forwarded to ``LSAAlgo`` as ``numberOfProcesses``.

        The list defaults are only read (``len`` and iteration), never
        mutated, so sharing them between calls is harmless.
        """
        #Function-scope import keeps this method independent of the module's
        #import block.
        import itertools

        #Make sure resultsPath/training exists. os.makedirs also creates
        #resultsPath itself when that is missing.
        trainingDir = os.path.join(resultsPath, 'training')
        if not os.path.exists(trainingDir):
            os.makedirs(trainingDir)

        #Load the oracle dictionary once; it does not depend on the iteration.
        aliasToIdName = MyDict(
            os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))

        #Total number of parameter combinations, used for progress output.
        nrRuns = len(minLenRange) * len(kRange) * len(levThrRange) * len(
            cosThrRange)

        for itIdx in range(nrIterations):

            #Load the data
            nameEmailData = MyDict()
            #Binary mode is what the Python 2 csv module expects.
            with open(os.path.join(dataPath, 'training_%d.csv' % itIdx),
                      'rb') as f:
                #idx counts every row, including rejected ones.
                for idx, row in enumerate(UnicodeReader(f)):
                    try:
                        alias = row[0]
                        email = unspam(row[1])
                        nameEmailData[idx] = (alias, email)
                    except Exception:
                        #Best effort: report the offending row and carry on
                        #with the remaining ones.
                        print(row)

            oracle = computeOracle(nameEmailData, aliasToIdName)

            with open(
                    os.path.join(trainingDir,
                                 'resultsTraining_%d.csv' % itIdx),
                    'wb') as g:
                writer = csv.writer(g, delimiter=';')
                writer.writerow([
                    'levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp', 'fn', 'tn',
                    'precision', 'recall', 'f'
                ])

                #itertools.product iterates in the same order as the nested
                #loops levThr > minLen > k > cosThr.
                combinations = itertools.product(levThrRange, minLenRange,
                                                 kRange, cosThrRange)
                for run, (levThr, minLen, k,
                          cosThr) in enumerate(combinations, 1):
                    #Load the parameters
                    parameters = {
                        "levenshteinSimRatio": levThr,
                        "minLength": minLen,
                        "cosineSimRatio": cosThr,
                        "rankReductionRatio": k,
                    }

                    baseName = 'results_%d_%.2f_%d_%.2f_%.2f' % (
                        itIdx, levThr, minLen, k, cosThr)
                    preOracleName = baseName + '_preoracle.csv'

                    #Run the LSA algorithm on these parameters
                    lsaAlgo = LSAAlgo(
                        nameEmailData,
                        parameters,
                        dataDir=resultsPath,
                        dataSaveDir='training',
                        resultsFileName=preOracleName,
                        resultsHumanReadable=False,
                        numberOfProcesses=nrProcesses,
                        runProfiler=False,
                        profilerOutputDir='profilerOutput',
                        gensimLogging=False,
                        progressLogging=False)
                    lsaAlgo.run()

                    #Now check the results using the oracle
                    [tp, fp, fn, tn, precision, recall,
                     fmeasure] = self.computeResults(
                         trainingDir, preOracleName, baseName + '.csv',
                         oracle, nameEmailData)

                    writer.writerow([
                        levThr, minLen, k, cosThr, tp, fp, fn, tn,
                        precision, recall, fmeasure
                    ])

                    print('Run %d out of %d in iteration %d...' % (
                        run, nrRuns, itIdx))
Ejemplo n.º 4
0
	def runTraining(self, dataPath='../data/icsmData', resultsPath='../data/icsmResults', levThrRange=[0.7, 0.8, 0.9], minLenRange=[2, 3, 4], kRange=[0.9, 0.95, 1.0], cosThrRange=[0.7, 0.75, 0.8, 0.85], nrIterations=10, nrProcesses=1):
		"""Grid-search the LSA parameters on every "worse" training iteration.

		For every iteration ``it`` in ``range(nrIterations)`` this method:

		1. loads ``<dataPath>/trainingWorse_<it>.csv`` into a ``MyDict``
		   mapping row index to ``(alias, email)``;
		2. runs ``LSAAlgo`` once per combination of ``levThrRange`` x
		   ``minLenRange`` x ``kRange`` x ``cosThrRange`` (rightmost range
		   varying fastest);
		3. scores each run through ``self.computeResults`` using an oracle
		   built by ``computeOracle``;
		4. writes one row per combination to
		   ``<resultsPath>/trainingWorse/resultsTrainingWorse_<it>.csv``.

		Args:
		    dataPath: directory holding the training sets and
		        ``aliasToIdNameUTF8.dict``.
		    resultsPath: output directory; the ``trainingWorse`` sub-directory
		        is created below it when missing.
		    levThrRange: values tried for ``levenshteinSimRatio``.
		    minLenRange: values tried for ``minLength``.
		    kRange: values tried for ``rankReductionRatio``.
		    cosThrRange: values tried for ``cosineSimRatio``.
		    nrIterations: number of training iterations to process.
		    nrProcesses: forwarded to ``LSAAlgo`` as ``numberOfProcesses``.

		The list defaults are only read (``len`` and iteration), never
		mutated, so sharing them between calls is harmless.
		"""
		#Function-scope import keeps this method independent of the module's
		#import block.
		import itertools

		#Make sure resultsPath/trainingWorse exists. os.makedirs also creates
		#resultsPath itself when that is missing.
		trainingDir = os.path.join(resultsPath, 'trainingWorse')
		if not os.path.exists(trainingDir):
			os.makedirs(trainingDir)

		#Load the oracle dictionary once; it does not depend on the iteration.
		aliasToIdName = MyDict(os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))

		#Total number of parameter combinations, used for progress output.
		nrRuns = len(minLenRange) * len(kRange) * len(levThrRange) * len(cosThrRange)

		for itIdx in range(nrIterations):

			#Load the data
			nameEmailData = MyDict()
			#Binary mode is what the Python 2 csv module expects.
			with open(os.path.join(dataPath, 'trainingWorse_%d.csv' % itIdx), 'rb') as f:
				#idx counts every row, including rejected ones.
				for idx, row in enumerate(UnicodeReader(f)):
					try:
						alias = row[0]
						email = unspam(row[1])
						nameEmailData[idx] = (alias, email)
					except Exception:
						#Best effort: report the offending row and carry on
						#with the remaining ones.
						print(row)

			oracle = computeOracle(nameEmailData, aliasToIdName)

			with open(os.path.join(trainingDir, 'resultsTrainingWorse_%d.csv' % itIdx), 'wb') as g:
				writer = csv.writer(g, delimiter=';')
				writer.writerow(['levThr','minLen','k','cosThr','tp','fp','fn','tn','precision','recall','f'])

				#itertools.product iterates in the same order as the nested
				#loops levThr > minLen > k > cosThr.
				combinations = itertools.product(levThrRange, minLenRange, kRange, cosThrRange)
				for run, (levThr, minLen, k, cosThr) in enumerate(combinations, 1):
					#Load the parameters
					parameters = {
						"levenshteinSimRatio": levThr,
						"minLength": minLen,
						"cosineSimRatio": cosThr,
						"rankReductionRatio": k,
					}

					baseName = 'resultsWorse_%d_%.2f_%d_%.2f_%.2f' % (itIdx, levThr, minLen, k, cosThr)
					preOracleName = baseName + '_preoracle.csv'

					#Run the LSA algorithm on these parameters
					lsaAlgo = LSAAlgo(
						nameEmailData,
						parameters,
						dataDir=resultsPath,
						dataSaveDir='trainingWorse',
						resultsFileName=preOracleName,
						resultsHumanReadable=False,
						numberOfProcesses=nrProcesses,
						runProfiler=False,
						profilerOutputDir='profilerOutput',
						gensimLogging=False,
						progressLogging=False)
					lsaAlgo.run()

					#Now check the results using the oracle
					[tp, fp, fn, tn, precision, recall, fmeasure] = self.computeResults(trainingDir, preOracleName, baseName + '.csv', oracle, nameEmailData)

					writer.writerow([levThr,minLen,k,cosThr,tp,fp,fn,tn,precision,recall,fmeasure])

					print('Run %d out of %d in iteration %d...' % (run, nrRuns, itIdx))