Example #1
0
	def measureFrequencies(self):
		"""Rebuild this instance's word-frequency table from its raw message.

		Replaces any previously computed table, tokenising the message on
		single spaces and counting every resulting token.
		"""
		# Fresh table: discard counts from any earlier call.
		self.tableOfFrequencies = frecuencyTable()
		# Tally each space-separated token of the message.
		for token in self.triple['message'].split(" "):
			self.tableOfFrequencies.add(token)
Example #2
0
def trainSupervisedSVM(pathOfDataFile,percentageOfSamples,pmiLowerBound):

	#table frquencuency of all the words in the messages
	frecuencies=frecuencyTable()
	
	#Read the file and convert triples into objects
	
	#read file with messages
	#Data/terraReducedTest.csv
	listOfTriples=readCSV(pathOfDataFile)
	
	listOfData=[]
	#convert the triples to objects
	for triple in listOfTriples:
		listOfData.append(Instance(triple))
		
	#stores the vocabulry  of the docs
	setOfWords=Set()
	#stores the set of labels found in the instnaces
	setOfLabels=Set()
	for instance in listOfData:
		print "cleaning: "+str(instance.triple['id'])

		#stores the labels


		#clean message
		instance.cleanMessage()
		instance.measureFrequencies()

		if(instance.triple['label']!=None and instance.triple['label']!=''):
			setOfLabels.add(instance.triple['label'])

		print setOfLabels
		#gathers the frequencies in each message
		#add each word to the setOfWords(vocabulary)
		currentVocabulary=instance.getFrecuencyTable().getKeys()
		for v in currentVocabulary:
			setOfWords.add(v)

		for word in instance.triple['message'].split(" "):
			frecuencies.add(word)

	listOfWordsByValue=frecuencies.sort_by_value()
	print "words by frequencie---"
	for wordd in listOfWordsByValue:
		print wordd
	print "--------------------------"

	print "looking for PMI"
	#get the instances which are annotated
	listOfAnnotatedData=[]
	listOfUnnanotatedData=[]
	for instance in listOfData:
		if instance.triple['label']!="":
			listOfAnnotatedData.append(instance)
		else:
			listOfUnnanotatedData.append(instance)
	currentNumberOfSeedsPerLabel={}
	for key in setOfLabels:
		currentNumberOfSeedsPerLabel[key]=0

	#there should be an equal number of seeds for each label
	percentage=percentageOfSamples
	numberOfSeeds=len(listOfData)*percentage
	currentNumberOfSeeds=0

	numberOfSeedsPerLabel=math.floor(numberOfSeeds/(1.0*len(setOfLabels)))
	numberOfSeeds=numberOfSeedsPerLabel*len(setOfLabels)
	listForAuxiliaryTraining=[]
	listOfTrainingData_=[]
	SetOfSeeds2=Set()
	for instance in listOfData:
		if ( (not instance.triple['label']=='') and (not instance.triple['label']==None) ):
			#if the instance is between the first 1000 then it is  a seed otherwise it is test
			if(currentNumberOfSeedsPerLabel[instance.triple['label']]<numberOfSeedsPerLabel and  not instance.triple['message'] in SetOfSeeds2):
				currentNumberOfSeedsPerLabel[instance.triple['label']]=currentNumberOfSeedsPerLabel[instance.triple['label']]+1
				SetOfSeeds2.add(instance.triple['message'])
				listOfTrainingData_.append(instance)
			else:
				listForAuxiliaryTraining.append(instance)
		


	listOfPMI=getSetOfWordsPerLabel(setOfLabels,setOfWords,listOfTrainingData_,"PMI")
	#the words whose PMI are over a threshold
	setOfSelectedWords=Set()




	#of dimensions
	numberOfDimensions=1000000000000000000000000000000000000000000
	for Keyqueue in listOfPMI.keys():
		queue=listOfPMI[Keyqueue]
		currentCount=0
		while not queue.empty() and currentCount<numberOfDimensions:
			pmi=queue.get()[1]
			
			if(pmi['pmi']>pmiLowerBound): #not taking into account the pmi
				#print pmi['word']+"--"+str(pmi['pmi'])+"--"+pmi['label']
				currentCount=currentCount+1
				setOfSelectedWords.add(pmi['word'])

	totalNumberOfDiffWords=int(math.ceil(len(listOfWordsByValue)*0.4))
	listOfWordsByValue.reverse()
	counter=0
	for wordd in listOfWordsByValue:
		if(counter==totalNumberOfDiffWords):
			break
		print listOfWordsByValue
		counter=counter+1
		setOfSelectedWords.add(wordd[1])



	
	#of dimensions
	numberOfDimensions=1000000000000000000000000000000000000000000
	for Keyqueue in listOfPMI.keys():
		queue=listOfPMI[Keyqueue]
		currentCount=0
		while not queue.empty() and currentCount<numberOfDimensions:
			pmi=queue.get()[1]
			
			if(pmi['pmi']>pmiLowerBound): #not taking into account the pmi
				#print pmi['word']+"--"+str(pmi['pmi'])+"--"+pmi['label']
				currentCount=currentCount+1
				setOfSelectedWords.add(pmi['word'])


	#train a set of Classifiers for words
	print "training classifiers"
	#setOfClassifiers=trainPredictors(listOfData,setOfSelectedWords,setOfWords)
	

	#once the classifiers are trained get the

	#creates a file for fpgrowth
	contentFileForFPGrowth=""

	#creates the vector for each instance
	print "creating vectors for each message"
	instanceVectors=[]
	for instance in listOfData:
		#for word in setOfWords: #when generating vectors with all the words in the vocabulary
		for word in setOfSelectedWords: #when generating vectors with just the words above the MPI threshold
			#using linear classs
			#if(instance.getFrecuencyTable().get(word)*1.0>100.0):
			#	instance.vector.append(instance.getFrecuencyTable().get(word)*1.0)
			#else:
			#	vocabulary_temp=deepcopy(setOfWords)
			#	if(word in setOfWords):
			#		vocabulary_temp.remove(word)
			#	vectorRepresentation=instance.getVectorRepresentation(vocabulary_temp)
			#	label=setOfClassifiers[word].predict(vectorRepresentation)
			#	if(label[0]>0.0):
			#		print "calculated label: "+str(label)
			#	instance.vector.append(label[0])
			#/using linearclass
			instance.vector.append(instance.getFrecuencyTable().get(word)*1.0) #if prediction does not matter
			if(instance.getFrecuencyTable().get(word)>0):
				contentFileForFPGrowth=contentFileForFPGrowth+" "+word
		contentFileForFPGrowth=contentFileForFPGrowth+"\n"		
		instanceVectors.append(instance.vector)

		FPgrowthFile=open('fpgrowthdata','w')
		FPgrowthFile.write(contentFileForFPGrowth)


	

			
		
		
	

	#SVD
	matrix =np.matrix(instanceVectors)
	print "calculating tf-idf"
	matrix=	tfidfTransform(instanceVectors)
	print "calculatin svd"
	matrixLSA=matrix
	#matrixLSA=svdDimensionalityReduction(matrix,1)

	#print matrixLSA

	print "calculating the graph files for Junto"
	


	#creates a junt graph
	#createJuntoGraph('input_graph',instaceVectors,matrixLSA)
	



	#trains a classifier for a label on all the data
	#trainSVMPredictoForLabels(listOfData,setOfLabels,matrixLSA)
	
	currentNumberOfSeedsPerLabel={}
	for key in setOfLabels:
		currentNumberOfSeedsPerLabel[key]=0



	#this defines the number of seeds(annotated data for the algorithm)
	
	
	currentNumberOfSeedsPerLabel={}
	for key in setOfLabels:
		currentNumberOfSeedsPerLabel[key]=0

	#there should be an equal number of seeds for each label
	percentage=percentageOfSamples
	numberOfSeeds=len(instanceVectors)*percentage
	currentNumberOfSeeds=0

	numberOfSeedsPerLabel=math.floor(numberOfSeeds/(1.0*len(setOfLabels)))
	numberOfSeeds=numberOfSeedsPerLabel*len(setOfLabels)

	#creates the gold_labels for Junto( the instnaces whose label is known)
	#seed files refer to those instances which label is already given
	seedFileContent=""
	seedFile=open("seeds",'w')

	#training set of instances
	trainingListOfdata=[]
	#training set of vectors
	trainingMatrix=[]

	#testData
	testListOfdata=[]
	testMatrix=[]


	#gold file refers to the goldstandard towards the perfomrance is measureed
	goldFileContent=""
	goldFile=open("gold_labels",'w')
	counter_=0
	SetOfSeeds=Set()

	for instance in listOfData:
		if ( (not instance.triple['label']=='') and (not instance.triple['label']==None) ):
			#if the instance is between the first 1000 then it is  a seed otherwise it is test
			if(currentNumberOfSeedsPerLabel[instance.triple['label']]<numberOfSeedsPerLabel and  not instance.triple['message'] in SetOfSeeds):
				seedFileContent=seedFileContent+str(instance.triple['id'])+"\t"+instance.triple['label']+"\t"+"1.0\n"
				currentNumberOfSeedsPerLabel[instance.triple['label']]=currentNumberOfSeedsPerLabel[instance.triple['label']]+1
				trainingListOfdata.append(instance)
				trainingMatrix.append(matrixLSA[counter_])
				SetOfSeeds.add(instance.triple['message'])
			else:
				goldFileContent=goldFileContent+str(instance.triple['id'])+"\t"+instance.triple['label']+"\t"+"1.0\n"
				testListOfdata.append(instance)
				testMatrix.append(matrixLSA[counter_])
		counter_=counter_+1

	seedFile.write(seedFileContent)
	goldFile.write(goldFileContent)


	#train an svm classifier for the given samples
	print "len of training data:"+str(len(trainingListOfdata))
	#pair of positivePredictions, numberOfPredictions
	numberOfSamplesPerLabel={}
	dictOfPresicion={}


	for label in setOfLabels:
		numberOfSamplesPerLabel[label]=0
		dictOfPresicion[label]=[0,0]

	listOfClassifiers=trainSVMPredictoForLabels(trainingListOfdata,setOfLabels,trainingMatrix)
	countOfRightClassifications=0
	countOfPredictions=0
	notClassified=0
	for i in range(0, len(testListOfdata)):
		numberOfSamplesPerLabel[testListOfdata[i].triple['label']]=numberOfSamplesPerLabel[testListOfdata[i].triple['label']]+1
		for label in setOfLabels:
			
			
			prediction=listOfClassifiers[label].predict(testMatrix[i])[0]
			print "predicttion of:: "+label+":"+str(prediction)+"__real:"+testListOfdata[i].triple['label']
			countOfPredictions=countOfPredictions+1
			if(prediction==1.0):
				dictOfPresicion[label][1]=dictOfPresicion[label][1]+1
				print "predicted:: "+label+"__real:"+testListOfdata[i].triple['label']
				if(label==testListOfdata[i].triple['label']):
					dictOfPresicion[label][0]=dictOfPresicion[label][0]+1
					countOfRightClassifications=countOfRightClassifications+1
					
			else:
				
				print "predicted:: "+label+"__real:"+testListOfdata[i].triple['label']
				if(label!=testListOfdata[i].triple['label']):
					countOfRightClassifications=countOfRightClassifications+1

	print "len of testdata:"+str(len(testListOfdata))
	print "right class:"+str(countOfRightClassifications)
	print "number of predctions:"+str(countOfPredictions)
	print "accuracy: "+str(countOfRightClassifications/(countOfPredictions*1.0))

	print "-----------------------"
	for label in setOfLabels:
		if(numberOfSamplesPerLabel[label]>0):
			print "***"+label+"***"
			presition=0
			recall=0
			if(dictOfPresicion[label][1]>0):
				presition=dictOfPresicion[label][0]/(dictOfPresicion[label][1]*1.0)
				print "presition:"+str(presition)
			else:
				print "presition: none instance was classified done"
			recall=dictOfPresicion[label][0]/(numberOfSamplesPerLabel[label]*1.0)
			print "recall:"+str(recall)
			if(presition+recall>0.00000000000000000000000000000):
				print "fscore: "+str((2.0*presition*recall)/(presition+recall))
			print "---"