def learnfunction(path, pathTweet, numberUsedAll, numberUnlabeled, algorithm, removeWords):
    """Run the selected classifier several times and return the averaged scores.

    Labelled items are loaded with ``functions.readTestComment`` and the
    unlabeled tweets with ``functions.readManyStrings``.  A random sample of
    ``numberUsedAll`` labelled items is passed through ``functions.useFilter``
    and handed to ``sklearnClassify.learn`` once per round.

    Args:
        path: Forwarded to ``functions.readTestComment`` and to
            ``sklearnClassify.learn``.
        pathTweet: Forwarded to ``functions.readManyStrings``.
        numberUsedAll: Number of labelled items to sample; 80% of it
            (truncated to an int) is passed to ``learn`` as the training size.
        numberUnlabeled: Only the first ``numberUnlabeled`` tweets (raw and
            filtered) are passed to ``learn``.
        algorithm: Classifier selector: 1 -> "svm", 2 -> 'MultinomialNB',
            3 -> 'BernoulliNB', 4 -> 'decisionTree'.
        removeWords: Forwarded to ``functions.useFilter``.

    Returns:
        A tuple ``(ac, sp, nsp)`` of strings: the mean over all rounds of the
        three values returned by ``sklearnClassify.learn`` (presumably
        accuracy, "support" and "not support" scores -- confirm against
        ``sklearnClassify``).

    Raises:
        ValueError: If ``algorithm`` is not one of 1, 2, 3 or 4.  (Also raised
            by ``random.sample`` when ``numberUsedAll`` exceeds the number of
            labelled items.)
    """
    classifier_names = {
        1: "svm",
        2: 'MultinomialNB',
        3: 'BernoulliNB',
        4: 'decisionTree',
    }
    # Fail early with a clear message; previously an unknown selector only
    # surfaced as an unbound-variable error inside the first round.
    if algorithm not in classifier_names:
        raise ValueError(
            "unsupported algorithm %r (expected 1, 2, 3 or 4)" % (algorithm,))
    classifier = classifier_names[algorithm]

    numberForTraining = int(0.8 * numberUsedAll)

    input_list, input_score = functions.readTestComment(path, numberUsedAll)
    tweets = functions.readManyStrings(pathTweet)

    # Draw the labelled sample; texts and scores are indexed with the same
    # positions so they stay aligned.
    randomSelect = random.sample(xrange(len(input_score)), numberUsedAll)
    input_list = [input_list[i] for i in randomSelect]
    input_score = [input_score[i] for i in randomSelect]

    filtered, _freq_words = functions.useFilter(input_list, True, removeWords)
    f_tweets = functions.useFilter(tweets, False, removeWords)
    f_tweets = f_tweets[0:numberUnlabeled]

    accuracy = []
    support = []
    not_support = []

    const_repeat = 5
    # range(1, const_repeat) yields 1..const_repeat-1, i.e. four rounds.
    for i in range(1, const_repeat):
        print(i)
        # Only the final round asks the classifier to write its output.
        writeOut = (i == const_repeat - 1)
        accur, suppor, not_suppor = sklearnClassify.learn(
            filtered, input_score, numberForTraining, classifier,
            f_tweets, writeOut, tweets[0:numberUnlabeled], path)
        accuracy.append(accur)
        support.append(suppor)
        not_support.append(not_suppor)

    print("")
    ac = str(sum(accuracy) / len(accuracy))
    sp = str(sum(support) / len(support))
    nsp = str(sum(not_support) / len(not_support))
    print(ac + " " + sp + " " + nsp)
    return ac, sp, nsp
def learnfunction(path, pathTweet, numberUsedAll, numberUnlabeled, algorithm):
    """Run the selected classifier four times and return the averaged scores.

    Labelled items are loaded with ``functions.readTestComment`` and the
    unlabeled tweets with ``functions.readManyStrings``.  A random sample of
    ``numberUsedAll`` labelled items is passed through ``functions.useFilter``
    and handed to the chosen ``sklearnClassify`` routine once per round.

    Args:
        path: Forwarded to ``functions.readTestComment``.
        pathTweet: Forwarded to ``functions.readManyStrings``.
        numberUsedAll: Number of labelled items to sample; 80% of it
            (truncated to an int) is passed on as the training size.
        numberUnlabeled: Only the first ``numberUnlabeled`` filtered tweets
            are passed to the classifier.
        algorithm: Classifier selector: 1 -> ``sklearnClassify.svm``,
            2 -> ``sklearnClassify.bayes`` with 'MultinomialNB',
            3 -> ``sklearnClassify.bayes`` with 'BernoulliNB'.

    Returns:
        A tuple ``(ac, sp, nsp)`` of strings: the mean over all rounds of the
        three values returned by the classifier routine (presumably accuracy,
        "support" and "not support" scores -- confirm against
        ``sklearnClassify``).

    Raises:
        ValueError: If ``algorithm`` is not one of 1, 2 or 3.  (Also raised by
            ``random.sample`` when ``numberUsedAll`` exceeds the number of
            labelled items.)
    """
    # Fail early with a clear message; previously an unknown selector only
    # surfaced as an unbound-variable error inside the first round.
    if algorithm not in (1, 2, 3):
        raise ValueError(
            "unsupported algorithm %r (expected 1, 2 or 3)" % (algorithm,))

    numberForTraining = int(0.8 * numberUsedAll)

    input_list, input_score = functions.readTestComment(path, numberUsedAll)
    tweets = functions.readManyStrings(pathTweet)

    # Draw the labelled sample; texts and scores are indexed with the same
    # positions so they stay aligned.
    randomSelect = random.sample(xrange(len(input_score)), numberUsedAll)
    input_list = [input_list[i] for i in randomSelect]
    input_score = [input_score[i] for i in randomSelect]

    filtered, _freq_words = functions.useFilter(input_list, True)
    f_tweets = functions.useFilter(tweets, False)
    f_tweets = f_tweets[0:numberUnlabeled]

    raw = functions.formRawDict(filtered, input_score)
    df = pd.DataFrame(raw)
    wordList = list(df.itertuples(index=False, name=None))
    wordList = functions.filterZeroScore(wordList)

    accuracy = []
    support = []
    not_support = []

    for i in range(1, 5):
        print(i)
        # NOTE(review): wordList is never passed to the classifiers below.
        # The shuffle is kept because it advances the shared `random` state,
        # which later code may depend on -- confirm before removing.
        random.shuffle(wordList)
        wordList = wordList[0:numberUsedAll]

        if algorithm == 3:
            accur, suppor, not_suppor = sklearnClassify.bayes(
                filtered, input_score, numberForTraining, 'BernoulliNB', f_tweets)
        elif algorithm == 2:
            accur, suppor, not_suppor = sklearnClassify.bayes(
                filtered, input_score, numberForTraining, 'MultinomialNB', f_tweets)
        else:  # algorithm == 1, guaranteed by the validation above
            accur, suppor, not_suppor = sklearnClassify.svm(
                filtered, input_score, numberForTraining, f_tweets)

        accuracy.append(accur)
        support.append(suppor)
        not_support.append(not_suppor)

    print("")
    ac = str(sum(accuracy) / len(accuracy))
    sp = str(sum(support) / len(support))
    nsp = str(sum(not_support) / len(not_support))
    print(ac + " " + sp + " " + nsp)
    return ac, sp, nsp
def click_ok_manual(self, event=None):
    """Switch the window into manual tweet-labelling mode.

    Reads the file path from ``self.pass_input``.  When that path is not an
    existing file, the label shows an error and 0 is returned.  Otherwise the
    start buttons and input widgets are disabled, the three ``ctl_*`` controls
    and ``ctl_tx`` are enabled, the tweets are loaded with
    ``functions.readManyStrings`` and ``self.next_twi()`` is called.

    Args:
        event: Unused; accepted so the method can serve as an event callback.

    Returns:
        0 when the file does not exist, otherwise None.
    """
    global start_bt_ms, Mflag

    tweet_file = self.pass_input.get()
    if not os.path.isfile(tweet_file):
        self.label.config(text="File " + tweet_file + " doesn't exist!")
        return 0

    print("file: " + tweet_file)
    self.label.config(text="Label tweets manually")
    start_bt_ms = "Label tweets manually"

    # Lock both start buttons.
    for widget in (self.stb, self.stb2):
        widget.config(state='disabled')

    # Enable the labelling controls.
    for widget in (self.ctl_1, self.ctl_2, self.ctl_3):
        widget.config(state='active')
    self.ctl_tx.config(state='normal')

    # Freeze the input fields and the f1l* widgets.
    for widget in (self.user_input, self.user_input2, self.pass_input,
                   self.f1l1, self.f1l1L,
                   self.f1l2, self.f1l2L,
                   self.f1l3, self.f1l3L):
        widget.config(state='disabled')

    self.stb3.config(text="Save&Quit", default='active', width=7)

    self.tweets = functions.readManyStrings(tweet_file)
    # next_twi() runs before the class lists are reset, as in the original flow.
    self.next_twi()
    self.class1 = []
    self.class2 = []
    Mflag = True
if __name__ == "__main__":
    # Interactive entry point: pick a tweet file, report its size, then hand
    # the tweets to the selected labelling mode.
    functions.startingInfo()
    path = "./output"

    # An empty answer falls back to the "China" keyword.
    word_to_grab = raw_input(
        "Indicate file you want to build your category 1: "
        + "type the keyword you gave in previous (grabbing) step.\n"
        + "Type \"Q\" if you wish to use your own file: ") or "China"

    if word_to_grab == "Q":
        # An empty answer falls back to the default stream file.
        file_name = raw_input(
            "You hope to give your own file, please indicate location of it: "
        ) or (path + "/stream_" + "China" + ".txt")
    else:
        file_name = path + "/stream_" + word_to_grab + ".txt"

    tweets = functions.readManyStrings(file_name)
    print("The file you choose is in: \"" + file_name + "\", it contains "
          + str(len(tweets)) + " tweets.")
    print("")
    print("Now select the tweets you want in training set for category 1.")

    # An empty answer selects mode 2 (manual selection).
    mode = raw_input(
        "The two available classification mode you can select:\n"
        + "1. select by key words \n"
        + "2. select one by one manually \n"
        + "select the mode you want: ") or "2"
    print("")

    if mode == "1":
        mode1(tweets)
    elif mode == "2":
        mode2(tweets)
    else:
        # Previously an unrecognised answer was ignored without any feedback.
        print("Unknown mode \"" + mode + "\": expected 1 or 2.")
# Only the library in sklearnClassify is included currently.
import datetime
import random

import pandas as pd

import functions
import sklearnClassify

# Sample sizes: the labelled sample is the training and testing sizes combined.
numberForTraining = 4000
numberForTesting = 400
numberUsedAll = numberForTraining + numberForTesting

# Input locations.
path = './output/'
pathTweet = './output/unknownHead.csv'

input_list, input_score = functions.readTestComment(path, numberUsedAll)

# Keep only the first 2000 unlabeled tweets.
tweets = functions.readManyStrings(pathTweet)[0:2000]

# Draw numberUsedAll random positions and apply them to both lists so that
# texts and scores stay aligned.
randomSelect = random.sample(xrange(len(input_score)), numberUsedAll)
input_list = [input_list[i] for i in randomSelect]
input_score = [input_score[i] for i in randomSelect]

# Scores are compared as the strings "1" and "-1".
print("Size of positive training set:")
print(input_score.count("1"))
print("Size of negative training set:")
print(input_score.count("-1"))

print("in filtering process...")
filtered, freq_words = functions.useFilter(input_list, True)
f_tweets = functions.useFilter(tweets, False)