Example #1
# Assumed imports for this example; `categories`, `getList`,
# `make_training_set`, and `naive_bayes` are defined elsewhere in the project.
from os import system


def confusion_matrix(k):
    """Run k-fold cross-validation over 500 documents per category and
    print the confusion matrix of the naive Bayes classifier."""
    t = int(500 / k)  # number of documents in a fold
    print('number of documents in a fold=', t)
    dataset = {}
    true_predicted = {}
    cmatrix = {}
    for category in categories:
        # List this category's documents via the shell and read the listing back.
        system("ls ./webpages/" + category + ">.tmp")
        a = open(".tmp")
        files = a.read()
        a.close()
        files = files.split('\n')
        files.pop()  # drop the empty entry after the trailing newline
        dataset[category] = files
        true_predicted[category] = 0
        cmatrix[category] = {}
        for c in categories:
            cmatrix[category][c] = 0
    for i in range(0, k):
        print('i=', i)
        train_set = {}
        test_set = {}
        database = {}
        for category in categories:
            # Fold i is held out for testing; the remaining folds form the training set.
            train_set[category] = list(dataset[category][0:i * t] +
                                       dataset[category][(i + 1) * t:500])
            test_set[category] = list(dataset[category][i * t:(i + 1) * t])
            database[category] = {}
        #print('train-set\n', train_set)
        #print('test-set\n', test_set)
        for category in categories:
            for file in train_set[category]:
                # Accumulate word frequencies of the training documents per category.
                freq = getList("./webpages/" + category + "/" + file)
                for word in freq:
                    if word in database[category]:
                        database[category][word] += freq[word]
                    else:
                        database[category][word] = freq[word]

        make_training_set(database)
        print('database created')
        for category in categories:
            for file in test_set[category]:
                freq = getList("./webpages/" + category + "/" + file)
                p_cat = naive_bayes(freq)
                if p_cat == category:
                    true_predicted[category] += 1
                cmatrix[category][p_cat] += 1
            print('k=', k, 'i=', i, 'category=', category, 'true_predicted\n',
                  true_predicted)
    print(true_predicted)

    for actual in categories:
        for predicted in categories:
            print('cmatrix[', actual, '][', predicted, ']=',
                  cmatrix[actual][predicted])
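The example above assumes a project-level `categories` list, a `getList` word-frequency helper, and the classifier's own `make_training_set`/`naive_bayes` functions. A minimal sketch of what the first two could look like, with hypothetical category names, is:

# Hypothetical stand-ins for the helpers assumed by the example above;
# the real project defines its own category list and tokenizer.
import re

categories = ['sports', 'politics', 'technology']  # assumed names

def getList(path):
    # Return a {word: count} frequency dictionary for one document
    # (assumed behaviour of the project's getList helper).
    with open(path, encoding='utf-8', errors='ignore') as f:
        words = re.findall(r'[a-z]+', f.read().lower())
    freq = {}
    for word in words:
        freq[word] = freq.get(word, 0) + 1
    return freq

# With those in place, a 5-fold run would simply be:
# confusion_matrix(5)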
Example #3
# Assumed import for this example; `categories`, `getList`, and `naive_bayes`
# are defined elsewhere in the project.
from os import system


def k_fold_accuracy(k):
    """Run k-fold cross-validation over 1000 documents per category and
    return the per-category accuracy in percent."""
    t = int(1000 / k)  # number of documents in a fold
    print('number of documents in a fold=', t)
    dataset = {}
    true_predicted = {}
    for category in categories:
        # List this category's documents via the shell and read the listing back.
        system("ls ./dataset/" + category + ">.tmp")
        a = open(".tmp")
        files = a.read()
        a.close()
        files = files.split('\n')
        files.pop()  # drop the empty entry after the trailing newline
        dataset[category] = files
        true_predicted[category] = 0
    for i in range(0, k):
        print('i=', i)
        train_set = {}
        test_set = {}
        database = {}
        for category in categories:
            # Fold i is held out for testing; the remaining folds form the training set.
            train_set[category] = list(dataset[category][0:i * t] +
                                       dataset[category][(i + 1) * t:])
            test_set[category] = list(dataset[category][i * t:(i + 1) * t])
            database[category] = {}
        #print('train-set\n', train_set)
        #print('test-set\n', test_set)
        for category in categories:
            for file in train_set[category]:
                # Accumulate word frequencies of the training documents per category.
                freq = getList("./dataset/" + category + "/" + file)
                for word in freq:
                    if word in database[category]:
                        database[category][word] += freq[word]
                    else:
                        database[category][word] = freq[word]

        print('database created')
        for category in categories:
            for file in test_set[category]:
                freq = getList("./dataset/" + category + "/" + file)
                p_cat = naive_bayes(freq, database)
                if p_cat == category:
                    true_predicted[category] += 1
            print('k=', k, 'i=', i, 'category=', category, 'true_predicted\n',
                  true_predicted)
    output = {}
    for category in categories:
        # Correct predictions across all folds as a percentage of the 1000 documents.
        output[category] = true_predicted[category] * 100 / 1000.0
    return output
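A hedged usage sketch for the function above: run 10-fold cross-validation and average the per-category percentages it returns (the choice of k = 10 is an assumption, not taken from the original project).

per_category = k_fold_accuracy(10)  # e.g. {'sports': 91.2, ...}
overall = sum(per_category.values()) / len(per_category)
print('overall accuracy: %.2f%%' % overall)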
def makedb(train_set):
    """Build the word-frequency database {category: {word: count}} from the
    documents of the given training set."""
    database = {}
    for category in categories:
        database[category] = {}
        for document in train_set[category]:
            freq = getList('./dataset/' + category + '/' + document)
            for word in freq:
                if word not in database[category]:
                    database[category][word] = freq[word]
                else:
                    database[category][word] += freq[word]
    return database
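makedb is the training step reused by accuracy_measure_n below. A hedged sketch of training on a hand-picked split and saving the model with pickle (the 100-document split is an assumption; the file name matches the one in the original code):

import os
import pickle

# Assumes the ./dataset/<category>/ layout and `categories` from the code above.
train_set = {c: sorted(os.listdir('./dataset/' + c))[:100] for c in categories}
database = makedb(train_set)
with open('database.db', 'wb') as f:
    pickle.dump(database, f)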
# Additional imports used by this example; seed(0) fixes the random number
# generator so the train/test split is reproducible.
from random import seed, randint
import pickle  # only needed if the disabled model dump below is re-enabled


def accuracy_measure_n(n):
    '''n -> number of training documents drawn per category'''
    seed(0)
    documents = {}
    accuracy = {}
    for category in categories:
        # List this category's documents via the shell and read the listing back.
        system("ls ./dataset/" + category + ">.tmp")
        a = open(".tmp")
        files = a.read()
        a.close()
        files = files.split('\n')
        files.pop()  # drop the empty entry after the trailing newline
        documents[category] = files
        accuracy[category] = 0

    for i in range(0, 3):  # repeat the experiment three times
        print("n=", n, "i=", i)
        train_set = {}
        test_set = {}
        for category in categories:
            train_set[category] = []
            test_set[category] = []
            # Sample n distinct training documents per category at random.
            for j in range(0, n):
                t = documents[category][randint(0, len(documents[category]) - 1)]
                while t in train_set[category]:
                    t = documents[category][randint(0, len(documents[category]) - 1)]
                train_set[category].append(t)
            # Everything that was not sampled becomes the test set.
            for d in documents[category]:
                if d not in train_set[category]:
                    test_set[category].append(d)
        print("Training and test sets created")
        '''#divide the train set into 5 parts and train a committee of classifiers
        tset0 = {}
        tset1 = {}
        tset2 = {}
        tset3 = {}
        tset4 = {}
        for category in train_set:
            tset0[category] = train_set[category][0:int(n / 5)]
            tset1[category] = train_set[category][int(n / 5):int(2 * n / 5)]
            tset2[category] = train_set[category][int(2 * n / 5):int(3 * n / 5)]
            tset3[category] = train_set[category][int(3 * n / 5):int(4 * n / 5)]
            tset4[category] = train_set[category][int(4 * n / 5):]
        #train the model
        database0 = makedb(tset0)
        database1 = makedb(tset1)
        database2 = makedb(tset2)
        database3 = makedb(tset3)
        database4 = makedb(tset4)

        print("training completed n,i", n, i)
        #test model
        vp = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1}  # voting power of each classifier
        '''
        database = makedb(train_set)
        # Debugging leftovers: printing the model, pickling it, and calling exit()
        # here would stop the function before any accuracy is measured, so they
        # are disabled.
        #print(database)
        #pickle.dump(database, open('database.db', 'wb'))
        #exit()
        for category in categories:
            p = 0  # documents of this category classified correctly
            j = 0  # documents of this category tested so far
            for document in test_set[category]:
                j += 1
                freq = getList('./dataset/' + category + '/' + document)
                '''pc = {}
                cc = {}
                cc[0] = naive_bayes(freq, database0)
                cc[1] = naive_bayes(freq, database1)
                cc[2] = naive_bayes(freq, database2)
                cc[3] = naive_bayes(freq, database3)
                cc[4] = naive_bayes(freq, database4)
                for l in range(0, 5):
                    #print('classifier', l, 'with power ', vp[l], 'predicts', cc[l])
                    if cc[l] in pc:
                        pc[cc[l]] += vp[l]
                    else:
                        pc[cc[l]] = vp[l]

                #collect majority vote
                p_cat = max(pc, key=pc.get)
                #print('predicted category', p_cat)
                if p_cat == category:
                    p += 1

                #change voting power
                for l in range(0, 5):
                    if cc[l] == category:
                        if vp[l] < 2:
                            vp[l] *= 1.01
                    else:
                        if vp[l] > 0.5:
                            vp[l] *= 0.99

                #print('n', n, 'i', i, 'j', j, 'accuracy', (p / float(j)) * 100, 'category', category)
                #print("---------------------------------------------------")
                '''
                p_cat = naive_bayes(freq, database)
                if p_cat == category:
                    p += 1
                if p_cat == 'Unable to decide':
                    print("unable to decide happens")
            f = open("status.txt", "a")
            f.write('n=' + str(n) + ' round ' + str(i) + '\n' + str(p) +
                    ' documents classified successfully out of ' + str(j) +
                    ' documents in category ' + category + '\n')
            f.close()

            accuracy[category] += p * 100 / len(test_set[category])
    f = open("status.txt", "a")
    for category in categories:
        accuracy[category] = accuracy[category] / 3  # average over the three runs
        f.write("accuracy[" + category + "]=" + str(accuracy[category]) + '\n')
    f.close()
    print(accuracy)
    return accuracy
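A hedged sketch of how accuracy_measure_n could be used to trace a learning curve; the training-set sizes below are illustrative, not taken from the original project.

for n in (50, 100, 200, 400):
    acc = accuracy_measure_n(n)  # per-category accuracy in percent
    print('n =', n, '->', acc)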