def runTesting(self,
               dataPath='../data/icsmData',
               resultsPath='../data/icsmResults',
               nrIterations=10,
               nrProcesses=1):
    """Run the LSA algorithm on the test subsets using the best training parameters.

    For every iteration ``it`` in ``range(nrIterations)``:

    1. Read ``<resultsPath>/training/resultsTraining_<it>.csv`` (';'-separated,
       first row is a header) and select the row with the highest value in
       column 10 (the ``f`` column). Ties keep the earliest row, because the
       sort is stable.
    2. Run ``LSAAlgo`` with the selected parameters on each of the ten
       subsets ``<dataPath>/test_<it>_<i>.csv`` (0 <= i <= 9).
    3. Score every run with ``self.computeResults`` against the oracle and
       write one row per subset to
       ``<resultsPath>/testing/resultsTesting_<it>.csv``.

    Args:
        dataPath: directory holding the test subsets and
            ``aliasToIdNameUTF8.dict``.
        resultsPath: directory holding the ``training`` results; the
            ``testing`` sub-directory is created here if missing.
        nrIterations: number of iterations to process.
        nrProcesses: forwarded to ``LSAAlgo`` as ``numberOfProcesses``.

    Raises:
        IOError: if one of the input/output files cannot be opened.
        IndexError: if a training results file has no data rows, or a data
            row has fewer than 11 columns.
    """
    # NOTE(review): SOURCE shows another definition with this same name; if
    # both live in the same class the later one replaces the earlier one --
    # confirm which is intended.

    # Make sure the output directories exist before anything is written.
    if not os.path.exists(resultsPath):
        os.makedirs(resultsPath)
    testingDir = resultsPath + '/testing'
    if not os.path.exists(testingDir):
        os.makedirs(testingDir)

    # The alias -> identity mapping is shared by all iterations.
    aliasToIdName = MyDict(os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))

    for itIdx in range(nrIterations):
        # Columns: levThr;minLen;k;cosThr;tp;fp;fn;tn;precision;recall;f
        trainingPath = os.path.join(resultsPath, 'training',
                                    'resultsTraining_%d.csv' % itIdx)
        with open(trainingPath, 'rb') as trainingFile:
            reader = csv.reader(trainingFile, delimiter=';')
            next(reader, None)  # skip the header row
            resultList = [tuple(row[col] for col in range(11))
                          for row in reader]

        # Highest f-measure first; values are still strings at this point.
        resultList = sorted(resultList, key=lambda record: -float(record[10]))

        # The first record now holds the best parameter combination.
        best = resultList[0]
        bestLevThr = float(best[0])
        bestMinLen = int(best[1])
        bestK = float(best[2])
        bestCosThr = float(best[3])

        parameters = {
            "levenshteinSimRatio": bestLevThr,
            "minLength": bestMinLen,
            "rankReductionRatio": bestK,
            "cosineSimRatio": bestCosThr,
        }

        testingPath = os.path.join(resultsPath, 'testing',
                                   'resultsTesting_%d.csv' % itIdx)
        # 'with' guarantees the results file is closed even if a run fails.
        with open(testingPath, 'wb') as resultsFile:
            writer = csv.writer(resultsFile, delimiter=';')
            writer.writerow([
                'levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp', 'fn', 'tn',
                'precision', 'recall', 'f'
            ])

            # Run the LSA algorithm on all testing subsets of this iteration.
            for i in range(10):
                # Read the (alias, email) pairs of this subset, keyed by row index.
                nameEmailData = MyDict()
                subsetPath = os.path.join(dataPath,
                                          'test_%d_%d.csv' % (itIdx, i))
                with open(subsetPath, 'rb') as subsetFile:
                    for idx, row in enumerate(UnicodeReader(subsetFile)):
                        try:
                            nameEmailData[idx] = (row[0], unspam(row[1]))
                        except Exception:
                            # Best effort: report the offending row and go on.
                            print(row)

                preOracleName = 'results_%d_%d_preoracle.csv' % (itIdx, i)
                lsaAlgo = LSAAlgo(
                    nameEmailData,
                    parameters,
                    dataDir=resultsPath,
                    dataSaveDir='testing',
                    resultsFileName=preOracleName,
                    resultsHumanReadable=False,
                    numberOfProcesses=nrProcesses,
                    runProfiler=False,
                    profilerOutputDir='profilerOutput',
                    gensimLogging=False,
                    progressLogging=False)
                lsaAlgo.run()

                # Compute the oracle to verify the results.
                oracle = computeOracle(nameEmailData, aliasToIdName)

                # Check the algorithm output against the oracle.
                [tp, fp, fn, tn, precision, recall, fmeasure] = self.computeResults(
                    os.path.join(resultsPath, 'testing'),
                    preOracleName,
                    'results_%d_%d.csv' % (itIdx, i),
                    oracle,
                    nameEmailData)

                writer.writerow([
                    bestLevThr, bestMinLen, bestK, bestCosThr, tp, fp, fn, tn,
                    precision, recall, fmeasure
                ])
                print('Done computing results on iteration %d, subset %d' % (
                    itIdx, i))
def runTesting(self,
               dataPath='../data/icsmData',
               resultsPath='../data/icsmResults',
               nrIterations=10,
               nrProcesses=1):
    """Run the LSA algorithm on the "worse" test sets using the best training parameters.

    For every iteration ``it`` in ``range(nrIterations)``:

    1. Read ``<resultsPath>/trainingWorse/resultsTrainingWorse_<it>.csv``
       (';'-separated, first row is a header) and select the row with the
       highest value in column 10 (the ``f`` column). Ties keep the earliest
       row, because the sort is stable.
    2. Run ``LSAAlgo`` with the selected parameters on the single test set
       ``<dataPath>/testWorse_<it>.csv``.
    3. Score the run with ``self.computeResults`` against the oracle and
       write the resulting row to
       ``<resultsPath>/testingWorse/resultsTestingWorse_<it>.csv``.

    Args:
        dataPath: directory holding the test sets and
            ``aliasToIdNameUTF8.dict``.
        resultsPath: directory holding the ``trainingWorse`` results; the
            ``testingWorse`` sub-directory is created here if missing.
        nrIterations: number of iterations to process.
        nrProcesses: forwarded to ``LSAAlgo`` as ``numberOfProcesses``.

    Raises:
        IOError: if one of the input/output files cannot be opened.
        IndexError: if a training results file has no data rows, or a data
            row has fewer than 11 columns.
    """
    # NOTE(review): SOURCE shows another definition with this same name; if
    # both live in the same class the later one replaces the earlier one --
    # confirm which is intended.

    # Make sure the output directories exist before anything is written.
    if not os.path.exists(resultsPath):
        os.makedirs(resultsPath)
    testingDir = resultsPath + '/testingWorse'
    if not os.path.exists(testingDir):
        os.makedirs(testingDir)

    # The alias -> identity mapping is shared by all iterations.
    aliasToIdName = MyDict(os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))

    for itIdx in range(nrIterations):
        # Columns: levThr;minLen;k;cosThr;tp;fp;fn;tn;precision;recall;f
        trainingPath = os.path.join(resultsPath, 'trainingWorse',
                                    'resultsTrainingWorse_%d.csv' % itIdx)
        with open(trainingPath, 'rb') as trainingFile:
            reader = csv.reader(trainingFile, delimiter=';')
            next(reader, None)  # skip the header row
            resultList = [tuple(row[col] for col in range(11))
                          for row in reader]

        # Highest f-measure first; values are still strings at this point.
        resultList = sorted(resultList, key=lambda record: -float(record[10]))

        # The first record now holds the best parameter combination.
        best = resultList[0]
        bestLevThr = float(best[0])
        bestMinLen = int(best[1])
        bestK = float(best[2])
        bestCosThr = float(best[3])

        parameters = {
            "levenshteinSimRatio": bestLevThr,
            "minLength": bestMinLen,
            "rankReductionRatio": bestK,
            "cosineSimRatio": bestCosThr,
        }

        testingPath = os.path.join(resultsPath, 'testingWorse',
                                   'resultsTestingWorse_%d.csv' % itIdx)
        # 'with' guarantees the results file is closed even if the run fails.
        with open(testingPath, 'wb') as resultsFile:
            writer = csv.writer(resultsFile, delimiter=';')
            writer.writerow(['levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp',
                             'fn', 'tn', 'precision', 'recall', 'f'])

            # Read the (alias, email) pairs of the test set, keyed by row index.
            nameEmailData = MyDict()
            testPath = os.path.join(dataPath, 'testWorse_%d.csv' % (itIdx))
            with open(testPath, 'rb') as testFile:
                for idx, row in enumerate(UnicodeReader(testFile)):
                    try:
                        nameEmailData[idx] = (row[0], unspam(row[1]))
                    except Exception:
                        # Best effort: report the offending row and go on.
                        print(row)

            preOracleName = 'resultsWorse_%d_preoracle.csv' % (itIdx)
            lsaAlgo = LSAAlgo(nameEmailData,
                              parameters,
                              dataDir=resultsPath,
                              dataSaveDir='testingWorse',
                              resultsFileName=preOracleName,
                              resultsHumanReadable=False,
                              numberOfProcesses=nrProcesses,
                              runProfiler=False,
                              profilerOutputDir='profilerOutput',
                              gensimLogging=False,
                              progressLogging=False)
            lsaAlgo.run()

            # Compute the oracle to verify the results.
            oracle = computeOracle(nameEmailData, aliasToIdName)

            # Check the algorithm output against the oracle.
            [tp, fp, fn, tn, precision, recall, fmeasure] = self.computeResults(
                os.path.join(resultsPath, 'testingWorse'),
                preOracleName,
                'resultsWorse_%d.csv' % (itIdx),
                oracle,
                nameEmailData)

            writer.writerow([bestLevThr, bestMinLen, bestK, bestCosThr,
                             tp, fp, fn, tn, precision, recall, fmeasure])
            print('Done computing results on iteration %d' % (itIdx))
def runTraining(self,
                dataPath='../data/icsmData',
                resultsPath='../data/icsmResults',
                levThrRange=[0.7, 0.8, 0.9],
                minLenRange=[2, 3, 4],
                kRange=[0.9, 0.95, 1.0],
                cosThrRange=[0.7, 0.75, 0.8, 0.85],
                nrIterations=10,
                nrProcesses=1):
    """Grid-search the LSA parameters on every training set.

    For every iteration ``it`` in ``range(nrIterations)``:

    1. Load the (alias, email) pairs from ``<dataPath>/training_<it>.csv``.
    2. Run ``LSAAlgo`` once for every combination of ``levThrRange`` x
       ``minLenRange`` x ``kRange`` x ``cosThrRange`` (108 runs with the
       defaults).
    3. Score each run with ``self.computeResults`` against the oracle and
       write one row per combination (parameters, tp, fp, fn, tn, precision,
       recall, f) to ``<resultsPath>/training/resultsTraining_<it>.csv``.

    Args:
        dataPath: directory holding the training sets and
            ``aliasToIdNameUTF8.dict``.
        resultsPath: output directory; it and its ``training`` sub-directory
            are created if missing.
        levThrRange: values tried for ``levenshteinSimRatio``.
        minLenRange: values tried for ``minLength``.
        kRange: values tried for ``rankReductionRatio``.
        cosThrRange: values tried for ``cosineSimRatio``.
        nrIterations: number of training sets to process.
        nrProcesses: forwarded to ``LSAAlgo`` as ``numberOfProcesses``.

    Raises:
        IOError: if one of the input/output files cannot be opened.
    """
    # NOTE(review): SOURCE shows another definition with this same name; if
    # both live in the same class the later one replaces the earlier one --
    # confirm which is intended.
    # The list defaults are only read, never mutated, so sharing them between
    # calls is harmless here.
    import itertools

    # Make sure the output directories exist before anything is written.
    if not os.path.exists(resultsPath):
        os.makedirs(resultsPath)
    trainingDir = resultsPath + '/training'
    if not os.path.exists(trainingDir):
        os.makedirs(trainingDir)

    for itIdx in range(nrIterations):
        # Read the (alias, email) pairs of this training set, keyed by row index.
        nameEmailData = MyDict()
        trainingPath = os.path.join(dataPath, 'training_%d.csv' % itIdx)
        with open(trainingPath, 'rb') as trainingFile:
            for idx, row in enumerate(UnicodeReader(trainingFile)):
                try:
                    nameEmailData[idx] = (row[0], unspam(row[1]))
                except Exception:
                    # Best effort: report the offending row and go on.
                    print(row)

        # Load the oracle for the data.
        aliasToIdName = MyDict(
            os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))
        oracle = computeOracle(nameEmailData, aliasToIdName)

        nrRuns = (len(minLenRange) * len(kRange) * len(levThrRange) *
                  len(cosThrRange))

        resultsCsvPath = os.path.join(resultsPath, 'training',
                                      'resultsTraining_%d.csv' % itIdx)
        # 'with' guarantees the results file is closed even if a run fails.
        with open(resultsCsvPath, 'wb') as resultsFile:
            writer = csv.writer(resultsFile, delimiter=';')
            writer.writerow([
                'levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp', 'fn', 'tn',
                'precision', 'recall', 'f'
            ])

            # product() iterates with the last range varying fastest, i.e. in
            # the same order as four nested loops levThr > minLen > k > cosThr.
            combinations = itertools.product(levThrRange, minLenRange,
                                             kRange, cosThrRange)
            for run, (levThr, minLen, k, cosThr) in enumerate(combinations, 1):
                parameters = {
                    "levenshteinSimRatio": levThr,
                    "minLength": minLen,
                    "cosineSimRatio": cosThr,
                    "rankReductionRatio": k,
                }

                # Run the LSA algorithm on these parameters.
                runTag = (itIdx, levThr, minLen, k, cosThr)
                preOracleName = ('results_%d_%.2f_%d_%.2f_%.2f_preoracle.csv'
                                 % runTag)
                lsaAlgo = LSAAlgo(
                    nameEmailData,
                    parameters,
                    dataDir=resultsPath,
                    dataSaveDir='training',
                    resultsFileName=preOracleName,
                    resultsHumanReadable=False,
                    numberOfProcesses=nrProcesses,
                    runProfiler=False,
                    profilerOutputDir='profilerOutput',
                    gensimLogging=False,
                    progressLogging=False)
                lsaAlgo.run()

                # Check the algorithm output against the oracle.
                [tp, fp, fn, tn, precision, recall, fmeasure] = self.computeResults(
                    os.path.join(resultsPath, 'training'),
                    preOracleName,
                    'results_%d_%.2f_%d_%.2f_%.2f.csv' % runTag,
                    oracle,
                    nameEmailData)

                writer.writerow([
                    levThr, minLen, k, cosThr, tp, fp, fn, tn, precision,
                    recall, fmeasure
                ])
                print('Run %d out of %d in iteration %d...' % (
                    run, nrRuns, itIdx))
def runTraining(self,
                dataPath='../data/icsmData',
                resultsPath='../data/icsmResults',
                levThrRange=[0.7, 0.8, 0.9],
                minLenRange=[2, 3, 4],
                kRange=[0.9, 0.95, 1.0],
                cosThrRange=[0.7, 0.75, 0.8, 0.85],
                nrIterations=10,
                nrProcesses=1):
    """Grid-search the LSA parameters on every "worse" training set.

    For every iteration ``it`` in ``range(nrIterations)``:

    1. Load the (alias, email) pairs from
       ``<dataPath>/trainingWorse_<it>.csv``.
    2. Run ``LSAAlgo`` once for every combination of ``levThrRange`` x
       ``minLenRange`` x ``kRange`` x ``cosThrRange`` (108 runs with the
       defaults).
    3. Score each run with ``self.computeResults`` against the oracle and
       write one row per combination (parameters, tp, fp, fn, tn, precision,
       recall, f) to
       ``<resultsPath>/trainingWorse/resultsTrainingWorse_<it>.csv``.

    Args:
        dataPath: directory holding the training sets and
            ``aliasToIdNameUTF8.dict``.
        resultsPath: output directory; it and its ``trainingWorse``
            sub-directory are created if missing.
        levThrRange: values tried for ``levenshteinSimRatio``.
        minLenRange: values tried for ``minLength``.
        kRange: values tried for ``rankReductionRatio``.
        cosThrRange: values tried for ``cosineSimRatio``.
        nrIterations: number of training sets to process.
        nrProcesses: forwarded to ``LSAAlgo`` as ``numberOfProcesses``.

    Raises:
        IOError: if one of the input/output files cannot be opened.
    """
    # NOTE(review): SOURCE shows another definition with this same name; if
    # both live in the same class the later one replaces the earlier one --
    # confirm which is intended.
    # The list defaults are only read, never mutated, so sharing them between
    # calls is harmless here.
    import itertools

    # Make sure the output directories exist before anything is written.
    if not os.path.exists(resultsPath):
        os.makedirs(resultsPath)
    trainingDir = resultsPath + '/trainingWorse'
    if not os.path.exists(trainingDir):
        os.makedirs(trainingDir)

    for itIdx in range(nrIterations):
        # Read the (alias, email) pairs of this training set, keyed by row index.
        nameEmailData = MyDict()
        trainingPath = os.path.join(dataPath, 'trainingWorse_%d.csv' % itIdx)
        with open(trainingPath, 'rb') as trainingFile:
            for idx, row in enumerate(UnicodeReader(trainingFile)):
                try:
                    nameEmailData[idx] = (row[0], unspam(row[1]))
                except Exception:
                    # Best effort: report the offending row and go on.
                    print(row)

        # Load the oracle for the data.
        aliasToIdName = MyDict(os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))
        oracle = computeOracle(nameEmailData, aliasToIdName)

        nrRuns = (len(minLenRange) * len(kRange) * len(levThrRange) *
                  len(cosThrRange))

        resultsCsvPath = os.path.join(resultsPath, 'trainingWorse',
                                      'resultsTrainingWorse_%d.csv' % itIdx)
        # 'with' guarantees the results file is closed even if a run fails.
        with open(resultsCsvPath, 'wb') as resultsFile:
            writer = csv.writer(resultsFile, delimiter=';')
            writer.writerow(['levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp',
                             'fn', 'tn', 'precision', 'recall', 'f'])

            # product() iterates with the last range varying fastest, i.e. in
            # the same order as four nested loops levThr > minLen > k > cosThr.
            combinations = itertools.product(levThrRange, minLenRange,
                                             kRange, cosThrRange)
            for run, (levThr, minLen, k, cosThr) in enumerate(combinations, 1):
                parameters = {
                    "levenshteinSimRatio": levThr,
                    "minLength": minLen,
                    "cosineSimRatio": cosThr,
                    "rankReductionRatio": k,
                }

                # Run the LSA algorithm on these parameters.
                runTag = (itIdx, levThr, minLen, k, cosThr)
                preOracleName = (
                    'resultsWorse_%d_%.2f_%d_%.2f_%.2f_preoracle.csv' % runTag)
                lsaAlgo = LSAAlgo(nameEmailData,
                                  parameters,
                                  dataDir=resultsPath,
                                  dataSaveDir='trainingWorse',
                                  resultsFileName=preOracleName,
                                  resultsHumanReadable=False,
                                  numberOfProcesses=nrProcesses,
                                  runProfiler=False,
                                  profilerOutputDir='profilerOutput',
                                  gensimLogging=False,
                                  progressLogging=False)
                lsaAlgo.run()

                # Check the algorithm output against the oracle.
                [tp, fp, fn, tn, precision, recall, fmeasure] = self.computeResults(
                    os.path.join(resultsPath, 'trainingWorse'),
                    preOracleName,
                    'resultsWorse_%d_%.2f_%d_%.2f_%.2f.csv' % runTag,
                    oracle,
                    nameEmailData)

                writer.writerow([levThr, minLen, k, cosThr, tp, fp, fn, tn,
                                 precision, recall, fmeasure])
                print('Run %d out of %d in iteration %d...' % (
                    run, nrRuns, itIdx))