Python LSAAlgo Examples

Programming Language: Python

Namespace/Package Name: lsaAlgorithm

Class/Type: LSAAlgo

Examples at hotexamples.com: 4

Python LSAAlgo - 4 examples found. These are the top-rated real-world Python examples of lsaAlgorithm.LSAAlgo, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

LSAAlgo(2)

run(2)

Example #1

Show file

File: runAverageCaseICSMTests.py Project: tue-mdse/aliasMerger

    def runTesting(self,
                   dataPath='../data/icsmData',
                   resultsPath='../data/icsmResults',
                   nrIterations=10,
                   nrProcesses=1):
        """Evaluate the best training parameters on the testing subsets.

        For every iteration ``itIdx`` in ``range(nrIterations)``:

        1. Read ``<resultsPath>/training/resultsTraining_<itIdx>.csv`` and
           pick the row with the highest f-measure (column 10). Ties go to
           the earliest row, because the sort is stable.
        2. Run ``LSAAlgo`` with that row's parameters on each of the ten
           testing subsets ``<dataPath>/test_<itIdx>_<i>.csv``, 0 <= i <= 9.
        3. Score each run with ``self.computeResults`` against the oracle
           and write one row per subset to
           ``<resultsPath>/testing/resultsTesting_<itIdx>.csv``.

        Args:
            dataPath: Directory with the testing subsets and the
                ``aliasToIdNameUTF8.dict`` file.
            resultsPath: Directory with the ``training`` results; output goes
                to its ``testing`` subdirectory, created when missing.
            nrIterations: Number of iterations to process.
            nrProcesses: Forwarded to ``LSAAlgo`` as ``numberOfProcesses``.

        Raises:
            IndexError: If a training results file holds no data rows, or a
                data row has fewer than 11 columns.
        """
        testingDir = os.path.join(resultsPath, 'testing')

        #Make sure the resultsPath exists. If not, create it.
        if not os.path.exists(resultsPath):
            os.makedirs(resultsPath)
        #Similar for the testing directory, as training and testing output
        #is kept separate.
        if not os.path.exists(testingDir):
            os.makedirs(testingDir)

        #Load the oracle for the data
        aliasToIdName = MyDict(os.path.join(dataPath,
                                            'aliasToIdNameUTF8.dict'))

        #Loop all iterations
        for itIdx in range(nrIterations):

            #Load the training results. 'with' closes the file even when
            #parsing fails.
            trainingFile = os.path.join(resultsPath, 'training',
                                        'resultsTraining_%d.csv' % itIdx)
            with open(trainingFile, 'rb') as f:
                reader = csv.reader(f, delimiter=';')
                next(reader, None)  #skip header
                #levThr;minLen;k;cosThr;tp;fp;fn;tn;precision;recall;f
                resultList = list(reader)

            #Order results by descending f-measure and grab the first
            #record, containing the best parameters.
            bestRow = sorted(resultList,
                             key=lambda record: -float(record[10]))[0]
            bestLevThr = float(bestRow[0])
            bestMinLen = int(bestRow[1])
            bestK = float(bestRow[2])
            bestCosThr = float(bestRow[3])

            parameters = {
                "levenshteinSimRatio": bestLevThr,
                "minLength": bestMinLen,
                "rankReductionRatio": bestK,
                "cosineSimRatio": bestCosThr,
            }

            testingFile = os.path.join(testingDir,
                                       'resultsTesting_%d.csv' % itIdx)
            #The output file is closed (and flushed) even if one of the
            #runs below raises.
            with open(testingFile, 'wb') as g:
                writer = csv.writer(g, delimiter=';')
                writer.writerow([
                    'levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp', 'fn', 'tn',
                    'precision', 'recall', 'f'
                ])

                #Run the LSA algorithm on all testing subsets for this
                #iteration
                for i in range(10):
                    preOracleName = 'results_%d_%d_preoracle.csv' % (itIdx, i)

                    #Read the data from the testing subset
                    nameEmailData = MyDict()
                    subsetFile = os.path.join(dataPath,
                                              'test_%d_%d.csv' % (itIdx, i))
                    with open(subsetFile, 'rb') as f:
                        #A row that fails to parse is printed and skipped;
                        #its index is still consumed.
                        for idx, row in enumerate(UnicodeReader(f)):
                            try:
                                nameEmailData[idx] = (row[0], unspam(row[1]))
                            except Exception:
                                print(row)

                    lsaAlgo = LSAAlgo(
                        nameEmailData,
                        parameters,
                        dataDir=resultsPath,
                        dataSaveDir='testing',
                        resultsFileName=preOracleName,
                        resultsHumanReadable=False,
                        numberOfProcesses=nrProcesses,
                        runProfiler=False,
                        profilerOutputDir='profilerOutput',
                        gensimLogging=False,
                        progressLogging=False)
                    lsaAlgo.run()

                    #Compute the oracle to verify results
                    oracle = computeOracle(nameEmailData, aliasToIdName)

                    #Now check the results using the oracle
                    [tp, fp, fn, tn, precision, recall,
                     fmeasure] = self.computeResults(
                         testingDir, preOracleName,
                         'results_%d_%d.csv' % (itIdx, i), oracle,
                         nameEmailData)

                    writer.writerow([
                        bestLevThr, bestMinLen, bestK, bestCosThr, tp, fp, fn,
                        tn, precision, recall, fmeasure
                    ])

                    print('Done computing results on iteration %d, subset %d'
                          % (itIdx, i))

Example #2

Show file

File: runWorstCaseICSMTests.py Project: tue-mdse/aliasMerger

	def runTesting(self, dataPath='../data/icsmData', resultsPath='../data/icsmResults', nrIterations=10, nrProcesses=1):
		"""Evaluate the best worst-case training parameters on the worst-case testing set.

		For every iteration ``itIdx`` in ``range(nrIterations)``:

		1. Read ``<resultsPath>/trainingWorse/resultsTrainingWorse_<itIdx>.csv`` and
		   pick the row with the highest f-measure (column 10). Ties go to the
		   earliest row, because the sort is stable.
		2. Run ``LSAAlgo`` with that row's parameters on ``<dataPath>/testWorse_<itIdx>.csv``.
		3. Score the run with ``self.computeResults`` against the oracle and write the
		   result row to ``<resultsPath>/testingWorse/resultsTestingWorse_<itIdx>.csv``.

		Args:
			dataPath: Directory with the testing sets and the ``aliasToIdNameUTF8.dict`` file.
			resultsPath: Directory with the ``trainingWorse`` results; output goes to its
				``testingWorse`` subdirectory, created when missing.
			nrIterations: Number of iterations to process.
			nrProcesses: Forwarded to ``LSAAlgo`` as ``numberOfProcesses``.

		Raises:
			IndexError: If a training results file holds no data rows, or a data row
				has fewer than 11 columns.
		"""
		testingDir = os.path.join(resultsPath, 'testingWorse')

		#Make sure the resultsPath exists. If not, create it.
		if not os.path.exists(resultsPath):
			os.makedirs(resultsPath)
		#Similar for the testingWorse directory, as training and testing output is kept separate.
		if not os.path.exists(testingDir):
			os.makedirs(testingDir)

		#Load the oracle for the data
		aliasToIdName = MyDict(os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))

		#Loop all iterations
		for itIdx in range(nrIterations):

			#Load the training results. 'with' closes the file even when parsing fails.
			with open(os.path.join(resultsPath, 'trainingWorse', 'resultsTrainingWorse_%d.csv' % itIdx), 'rb') as f:
				reader = csv.reader(f, delimiter=';')
				next(reader, None) #skip header
				#levThr;minLen;k;cosThr;tp;fp;fn;tn;precision;recall;f
				resultList = list(reader)

			#Order results by descending f-measure and grab the first record, containing the best parameters.
			bestRow = sorted(resultList, key=lambda record: -float(record[10]))[0]
			bestLevThr = float(bestRow[0])
			bestMinLen = int(bestRow[1])
			bestK = float(bestRow[2])
			bestCosThr = float(bestRow[3])

			parameters = {
				"levenshteinSimRatio": bestLevThr,
				"minLength": bestMinLen,
				"rankReductionRatio": bestK,
				"cosineSimRatio": bestCosThr,
			}

			preOracleName = 'resultsWorse_%d_preoracle.csv' % (itIdx)

			#The output file is closed (and flushed) even if the run below raises.
			with open(os.path.join(testingDir, 'resultsTestingWorse_%d.csv' % itIdx), 'wb') as g:
				writer = csv.writer(g, delimiter=';')
				writer.writerow(['levThr','minLen','k','cosThr','tp','fp','fn','tn','precision','recall','f'])

				#Read the data from the testing set
				nameEmailData = MyDict()
				with open(os.path.join(dataPath, 'testWorse_%d.csv' % (itIdx)), 'rb') as f:
					#A row that fails to parse is printed and skipped; its index is still consumed.
					for idx, row in enumerate(UnicodeReader(f)):
						try:
							nameEmailData[idx] = (row[0], unspam(row[1]))
						except Exception:
							print(row)

				lsaAlgo = LSAAlgo(nameEmailData, parameters, dataDir=resultsPath, dataSaveDir='testingWorse', resultsFileName=preOracleName, resultsHumanReadable=False, numberOfProcesses=nrProcesses, runProfiler=False, profilerOutputDir='profilerOutput', gensimLogging=False, progressLogging=False)
				lsaAlgo.run()

				#Compute the oracle to verify results
				oracle = computeOracle(nameEmailData, aliasToIdName)

				#Now check the results using the oracle
				[tp, fp, fn, tn, precision, recall, fmeasure] = self.computeResults(testingDir, preOracleName, 'resultsWorse_%d.csv' % (itIdx), oracle, nameEmailData)

				writer.writerow([bestLevThr,bestMinLen,bestK,bestCosThr,tp,fp,fn,tn,precision,recall,fmeasure])

				print('Done computing results on iteration %d' % (itIdx))

Example #3

Show file

File: runAverageCaseICSMTests.py Project: tue-mdse/aliasMerger

    def runTraining(self,
                    dataPath='../data/icsmData',
                    resultsPath='../data/icsmResults',
                    levThrRange=[0.7, 0.8, 0.9],
                    minLenRange=[2, 3, 4],
                    kRange=[0.9, 0.95, 1.0],
                    cosThrRange=[0.7, 0.75, 0.8, 0.85],
                    nrIterations=10,
                    nrProcesses=1):
        """Grid-search the LSA parameters on every training set.

        For every iteration ``itIdx`` in ``range(nrIterations)``:

        1. Load ``<dataPath>/training_<itIdx>.csv`` and compute its oracle.
        2. Run ``LSAAlgo`` once for each combination of ``levThrRange`` x
           ``minLenRange`` x ``kRange`` x ``cosThrRange`` (the last range
           varies fastest).
        3. Score each run with ``self.computeResults`` and write one row per
           combination to
           ``<resultsPath>/training/resultsTraining_<itIdx>.csv``.

        Args:
            dataPath: Directory with the training sets and the
                ``aliasToIdNameUTF8.dict`` file.
            resultsPath: Output directory; results go to its ``training``
                subdirectory, created when missing.
            levThrRange: Values tried for ``levenshteinSimRatio``.
            minLenRange: Values tried for ``minLength``.
            kRange: Values tried for ``rankReductionRatio``.
            cosThrRange: Values tried for ``cosineSimRatio``.
            nrIterations: Number of training sets to process.
            nrProcesses: Forwarded to ``LSAAlgo`` as ``numberOfProcesses``.

        The range arguments are only read, never modified, so the shared
        default lists are safe here.
        """
        #Local import keeps this method self-contained.
        import itertools

        trainingDir = os.path.join(resultsPath, 'training')

        #Make sure the resultsPath exists. If not, create it.
        if not os.path.exists(resultsPath):
            os.makedirs(resultsPath)
        #Similar for the training directory, as training and testing output
        #is kept separate.
        if not os.path.exists(trainingDir):
            os.makedirs(trainingDir)

        #The alias mapping and the number of runs do not depend on the
        #iteration, so they are set up once.
        aliasToIdName = MyDict(
            os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))
        nrRuns = len(minLenRange) * len(kRange) * len(levThrRange) * len(
            cosThrRange)

        #Loop all iterations
        for itIdx in range(nrIterations):

            #Load the data. 'with' closes the file even when parsing fails.
            nameEmailData = MyDict()
            trainingFile = os.path.join(dataPath, 'training_%d.csv' % itIdx)
            with open(trainingFile, 'rb') as f:
                #A row that fails to parse is printed and skipped; its
                #index is still consumed.
                for idx, row in enumerate(UnicodeReader(f)):
                    try:
                        nameEmailData[idx] = (row[0], unspam(row[1]))
                    except Exception:
                        print(row)

            #Compute the oracle for the data
            oracle = computeOracle(nameEmailData, aliasToIdName)

            resultsFile = os.path.join(trainingDir,
                                       'resultsTraining_%d.csv' % itIdx)
            #The output file is closed (and flushed) even if one of the
            #runs below raises.
            with open(resultsFile, 'wb') as g:
                writer = csv.writer(g, delimiter=';')
                writer.writerow([
                    'levThr', 'minLen', 'k', 'cosThr', 'tp', 'fp', 'fn', 'tn',
                    'precision', 'recall', 'f'
                ])

                run = 0
                #Same visiting order as four nested loops over the ranges.
                for levThr, minLen, k, cosThr in itertools.product(
                        levThrRange, minLenRange, kRange, cosThrRange):
                    #Load the parameters
                    parameters = {
                        "levenshteinSimRatio": levThr,
                        "minLength": minLen,
                        "cosineSimRatio": cosThr,
                        "rankReductionRatio": k,
                    }

                    baseName = 'results_%d_%.2f_%d_%.2f_%.2f' % (
                        itIdx, levThr, minLen, k, cosThr)
                    preOracleName = baseName + '_preoracle.csv'

                    #Run the LSA algorithm on these parameters
                    lsaAlgo = LSAAlgo(
                        nameEmailData,
                        parameters,
                        dataDir=resultsPath,
                        dataSaveDir='training',
                        resultsFileName=preOracleName,
                        resultsHumanReadable=False,
                        numberOfProcesses=nrProcesses,
                        runProfiler=False,
                        profilerOutputDir='profilerOutput',
                        gensimLogging=False,
                        progressLogging=False)
                    lsaAlgo.run()

                    #Now check the results using the oracle
                    [tp, fp, fn, tn, precision, recall,
                     fmeasure] = self.computeResults(
                         trainingDir, preOracleName, baseName + '.csv',
                         oracle, nameEmailData)

                    writer.writerow([
                        levThr, minLen, k, cosThr, tp, fp, fn, tn,
                        precision, recall, fmeasure
                    ])

                    run += 1
                    print('Run %d out of %d in iteration %d...' % (
                        run, nrRuns, itIdx))

Example #4

Show file

File: runWorstCaseICSMTests.py Project: tue-mdse/aliasMerger

	def runTraining(self, dataPath='../data/icsmData', resultsPath='../data/icsmResults', levThrRange=[0.7, 0.8, 0.9], minLenRange=[2, 3, 4], kRange=[0.9, 0.95, 1.0], cosThrRange=[0.7, 0.75, 0.8, 0.85], nrIterations=10, nrProcesses=1):
		"""Grid-search the LSA parameters on every worst-case training set.

		For every iteration ``itIdx`` in ``range(nrIterations)``:

		1. Load ``<dataPath>/trainingWorse_<itIdx>.csv`` and compute its oracle.
		2. Run ``LSAAlgo`` once for each combination of ``levThrRange`` x ``minLenRange`` x
		   ``kRange`` x ``cosThrRange`` (the last range varies fastest).
		3. Score each run with ``self.computeResults`` and write one row per combination to
		   ``<resultsPath>/trainingWorse/resultsTrainingWorse_<itIdx>.csv``.

		Args:
			dataPath: Directory with the training sets and the ``aliasToIdNameUTF8.dict`` file.
			resultsPath: Output directory; results go to its ``trainingWorse`` subdirectory,
				created when missing.
			levThrRange: Values tried for ``levenshteinSimRatio``.
			minLenRange: Values tried for ``minLength``.
			kRange: Values tried for ``rankReductionRatio``.
			cosThrRange: Values tried for ``cosineSimRatio``.
			nrIterations: Number of training sets to process.
			nrProcesses: Forwarded to ``LSAAlgo`` as ``numberOfProcesses``.

		The range arguments are only read, never modified, so the shared default lists
		are safe here.
		"""
		#Local import keeps this method self-contained.
		import itertools

		trainingDir = os.path.join(resultsPath, 'trainingWorse')

		#Make sure the resultsPath exists. If not, create it.
		if not os.path.exists(resultsPath):
			os.makedirs(resultsPath)
		#Similar for the trainingWorse directory, as training and testing output is kept separate.
		if not os.path.exists(trainingDir):
			os.makedirs(trainingDir)

		#The alias mapping and the number of runs do not depend on the iteration, so they are set up once.
		aliasToIdName = MyDict(os.path.join(dataPath, 'aliasToIdNameUTF8.dict'))
		nrRuns = len(minLenRange) * len(kRange) * len(levThrRange) * len(cosThrRange)

		#Loop all iterations
		for itIdx in range(nrIterations):

			#Load the data. 'with' closes the file even when parsing fails.
			nameEmailData = MyDict()
			with open(os.path.join(dataPath, 'trainingWorse_%d.csv' % itIdx), 'rb') as f:
				#A row that fails to parse is printed and skipped; its index is still consumed.
				for idx, row in enumerate(UnicodeReader(f)):
					try:
						nameEmailData[idx] = (row[0], unspam(row[1]))
					except Exception:
						print(row)

			#Compute the oracle for the data
			oracle = computeOracle(nameEmailData, aliasToIdName)

			#The output file is closed (and flushed) even if one of the runs below raises.
			with open(os.path.join(trainingDir, 'resultsTrainingWorse_%d.csv' % itIdx), 'wb') as g:
				writer = csv.writer(g, delimiter=';')
				writer.writerow(['levThr','minLen','k','cosThr','tp','fp','fn','tn','precision','recall','f'])

				run = 0
				#Same visiting order as four nested loops over the ranges.
				for levThr, minLen, k, cosThr in itertools.product(levThrRange, minLenRange, kRange, cosThrRange):
					#Load the parameters
					parameters = {
						"levenshteinSimRatio": levThr,
						"minLength": minLen,
						"cosineSimRatio": cosThr,
						"rankReductionRatio": k,
					}

					baseName = 'resultsWorse_%d_%.2f_%d_%.2f_%.2f' % (itIdx, levThr, minLen, k, cosThr)
					preOracleName = baseName + '_preoracle.csv'

					#Run the LSA algorithm on these parameters
					lsaAlgo = LSAAlgo(nameEmailData, parameters, dataDir=resultsPath, dataSaveDir='trainingWorse', resultsFileName=preOracleName, resultsHumanReadable=False, numberOfProcesses=nrProcesses, runProfiler=False, profilerOutputDir='profilerOutput', gensimLogging=False, progressLogging=False)
					lsaAlgo.run()

					#Now check the results using the oracle
					[tp, fp, fn, tn, precision, recall, fmeasure] = self.computeResults(trainingDir, preOracleName, baseName + '.csv', oracle, nameEmailData)

					writer.writerow([levThr,minLen,k,cosThr,tp,fp,fn,tn,precision,recall,fmeasure])

					run += 1
					print('Run %d out of %d in iteration %d...' % (run, nrRuns, itIdx))