def main():
    """Run the four clustering-feature experiments on the stars review data.

    Loads and shuffles the CSV (header dropped), builds per-polarity
    word-occurrence matrices, derives kmeans/smeans/raw binary features,
    and prints the result of each numbered test.
    """
    # Load and shuffle the dataset; pop(0) discards the CSV header row.
    data = a.read_data("stars_data.csv")
    data.pop(0)
    random.shuffle(data)
    frequency = a.frequency_word(data)

    # Column 6 holds the star rating: 1 == negative review, 5 == positive.
    negatives = [row for row in data if int(row[6]) == 1]
    positives = [row for row in data if int(row[6]) == 5]

    # Word-occurrence matrices, one per polarity (2000 docs x 2500 words).
    matrix_pos = cluster.create_matrix(np.zeros((2000, 2500)), positives, frequency)
    matrix_neg = cluster.create_matrix(np.zeros((2000, 2500)), negatives, frequency)

    # Cluster-derived binary features (50 clusters) vs. the raw vocabulary
    # features, plus a 100-word random-sample baseline and its combination
    # with the kmeans features.
    kmeans_feature = cluster.kmeans_bin(data, matrix_pos, matrix_neg, frequency, 50)
    smeans_feature = cluster.smeans_bin(data, matrix_pos, matrix_neg, frequency, 50)
    origin_feature = a.create_binary_feature(data, frequency, 6)
    sample_origin_feature = a.create_binary_feature(data, random.sample(frequency, 100), 6)
    combine_feature = combine(kmeans_feature, sample_origin_feature)

    print("Test1")
    test1(matrix_pos, matrix_neg)
    print("Test2")
    test2(kmeans_feature, smeans_feature)
    print("Test3")
    test3(origin_feature, kmeans_feature)
    print("Test4")
    test4(sample_origin_feature, kmeans_feature, combine_feature)
def main(args):
    """Train a naive Bayes classifier and print its zero-one loss.

    Expected argv layout: args[1] = training CSV path, args[2] = test CSV
    path, args[3] = class-label column index, args[4] = 1 to print the
    top vocabulary words.
    """
    # Unpack command-line arguments.
    train_path = args[1]
    test_path = args[2]
    label_column = int(args[3])
    show_words = int(args[4])

    # Load both splits.
    train = a.read_data(train_path)
    test = a.read_data(test_path)

    # Vocabulary is taken from the training split only; optionally echo it.
    vocab = a.frequency(train)
    if show_words == 1:
        a.printTopwords(vocab)

    # Convert both splits to binary word-presence features.
    train = a.create_binary_feature(train, vocab, label_column)
    test = a.create_binary_feature(test, vocab, label_column)

    # Fit the NBC probability tables on train, then score the test split.
    prob_table, p_yes, p_no = a.train_nbc(train)
    predictions = a.test_nbc(prob_table, test, p_yes, p_no)

    # Last element of each featurized test row is its true class label.
    truth = [row[-1] for row in test]
    diff = a.zero_onr_loss(predictions, truth)
    print("ZERO-ONE-LOSS {0}".format(diff))
def main():
    """Mine association rules from the stars review data with Apriori.

    Builds binary word features (plus isPositive/isNegative class items),
    generates frequent itemsets at the given support threshold, derives
    rules at the given confidence threshold, and prints the itemset count,
    the rule count, and up to the top-30 rules by metric.
    """
    # Data preprocessing: load CSV, drop the header row, shuffle.
    filename = "stars_data.csv"
    data = a.read_data(filename)
    data.pop(0)
    random.shuffle(data)
    words = a.frequency_word(data)
    features = a.create_binary_feature(data, words, 6)
    # Class-label pseudo-items so rules can target review sentiment.
    words.append("isPositive")
    words.append("isNegative")

    minsupport = 0.03
    minconf = 3.81
    L, support_count = apriori.frequentItemsetGeneration(features, words, minsupport)
    # Total count of frequent 1-, 2-, and 3-itemsets.
    print(len(L[0]) + len(L[1]) + len(L[2]))

    rules = ruleG(L, support_count, minconf)
    print(len(rules))
    # Sort rules by their metric (descending) and keep at most 30.
    # Fix: slicing with [:30] avoids the IndexError the original raised
    # when fewer than 30 rules survived the confidence threshold.
    top_rules = sorted(rules.items(), key=operator.itemgetter(1), reverse=True)[:30]
    for rule in top_rules:
        print(rule)
def main(): #data preprocessing filename = "stars_data.csv" data = a.read_data(filename) data.pop(0) random.shuffle(data) words = a.frequency_word(data) features = a.create_binary_feature(data,words,6) words.append("isPositive") words.append("isNegative") minsupport = 0.03 minconf = 0.25 D = construct(features,words) D = map(set, D) t = [] t.append(frozenset(['friendly'])) t.append(frozenset(['isPositive'])) t.append(frozenset(['staff'])) t.append(frozenset(['favorite'])) q2(D,t) '''