Beispiel #1
0
def main():
	"""Run the clustering experiments (kmeans/smeans binary features) on the stars review data."""
	# Load the CSV, drop the header row, and shuffle so later splits are random.
	csv_path = "stars_data.csv"
	reviews = a.read_data(csv_path)
	reviews.pop(0)
	random.shuffle(reviews)
	vocab = a.frequency_word(reviews)

	# Partition by star rating: column 6 holds the label (1 = negative, 5 = positive).
	negatives = [row for row in reviews if int(row[6]) == 1]
	positives = [row for row in reviews if int(row[6]) == 5]

	# Fill a fixed-size 2000x2500 count matrix per class.
	pos_matrix = cluster.create_matrix(np.zeros((2000, 2500)), positives, vocab)
	neg_matrix = cluster.create_matrix(np.zeros((2000, 2500)), negatives, vocab)

	# Cluster-derived binary features (50 clusters) vs. the plain word features.
	kmeans_feature = cluster.kmeans_bin(reviews, pos_matrix, neg_matrix, vocab, 50)
	smeans_feature = cluster.smeans_bin(reviews, pos_matrix, neg_matrix, vocab, 50)
	origin_feature = a.create_binary_feature(reviews, vocab, 6)

	# Baseline: 100 randomly sampled vocabulary words, and a combined feature set.
	sample_origin_feature = a.create_binary_feature(reviews, random.sample(vocab, 100), 6)
	combine_feature = combine(kmeans_feature, sample_origin_feature)

	print("Test1")
	test1(pos_matrix, neg_matrix)
	print("Test2")
	test2(kmeans_feature, smeans_feature)
	print("Test3")
	test3(origin_feature, kmeans_feature)
	print("Test4")
	test4(sample_origin_feature, kmeans_feature, combine_feature)
Beispiel #2
0
def main(args):
	"""Train a naive Bayes classifier and report zero-one loss on a test set.

	args: [_, trainfile, testfile, classlabel, printWord] — the last two are ints;
	printWord == 1 additionally prints the top vocabulary words.
	"""
	# Command-line arguments.
	train_path = args[1]
	test_path = args[2]
	label_col = int(args[3])
	show_words = int(args[4])

	# Load train and test data.
	train_rows = a.read_data(train_path)
	test_rows = a.read_data(test_path)

	# Top-frequency vocabulary, built from the training data only.
	vocab = a.frequency(train_rows)

	# Optionally dump the vocabulary.
	if show_words == 1:
		a.printTopwords(vocab)

	# Featurize both splits with the same vocabulary.
	train_rows = a.create_binary_feature(train_rows, vocab, label_col)
	test_rows = a.create_binary_feature(test_rows, vocab, label_col)

	# Fit the classifier, then score the test split.
	prob_table, p_yes, p_no = a.train_nbc(train_rows)
	predictions = a.test_nbc(prob_table, test_rows, p_yes, p_no)

	# Gold labels sit in the last column of each featurized test row.
	gold_labels = [row[-1] for row in test_rows]

	# NOTE(review): "zero_onr_loss" looks like a typo for "zero_one_loss",
	# but it must match the name defined in module `a` — confirm there.
	loss = a.zero_onr_loss(predictions, gold_labels)

	print("ZERO-ONE-LOSS {0}".format(loss))
Beispiel #3
0
def main():
	"""Mine association rules from the stars review data with Apriori and print the top 30."""
	# Data preprocessing: load CSV, drop the header row, shuffle.
	filename = "stars_data.csv"
	data = a.read_data(filename)
	data.pop(0)
	random.shuffle(data)
	words = a.frequency_word(data)
	features = a.create_binary_feature(data, words, 6)
	# Add the class labels as pseudo-items so rules can mention sentiment.
	words.append("isPositive")
	words.append("isNegative")
	minsupport = 0.03
	minconf = 3.81

	# L[k] holds the frequent itemsets of size k+1; print their total count.
	L, support_count = apriori.frequentItemsetGeneration(features, words, minsupport)
	print(len(L[0]) + len(L[1]) + len(L[2]))

	rules = ruleG(L, support_count, minconf)
	print(len(rules))

	# Sort rules by confidence, strongest first.
	rules = sorted(rules.items(), key=operator.itemgetter(1), reverse=True)
	# BUGFIX: the original indexed rules[0..29] unconditionally, raising
	# IndexError when fewer than 30 rules were generated; slicing is safe.
	rules = rules[:30]

	for rule in rules:
		print(rule)
Beispiel #4
0
def main():
	"""Query support statistics for a few hand-picked itemsets on the stars data."""
	# Same preprocessing as the other experiments: load, drop header, shuffle.
	filename = "stars_data.csv"
	rows = a.read_data(filename)
	rows.pop(0)
	random.shuffle(rows)
	words = a.frequency_word(rows)
	features = a.create_binary_feature(rows, words, 6)
	# Class-label pseudo-items appended to the vocabulary.
	words.append("isPositive")
	words.append("isNegative")
	# NOTE(review): these thresholds are assigned but not used below.
	minsupport = 0.03
	minconf = 0.25

	# Build the transaction database; map(set, ...) yields a list of sets here (Python 2).
	D = construct(features, words)
	D = map(set, D)

	# Itemsets whose support we want to inspect.
	queries = [
		frozenset(['friendly']),
		frozenset(['isPositive']),
		frozenset(['staff']),
		frozenset(['favorite']),
	]

	q2(D, queries)
	'''