Exemple #1
0
def learnfunction(path, pathTweet, numberUsedAll, numberUnlabeled, algorithm, removeWords):
    """Run the selected classifier several times and average its scores.

    Labeled samples are read with ``functions.readTestComment``; a random
    subset of ``numberUsedAll`` of them is filtered with
    ``functions.useFilter`` and handed to ``sklearnClassify.learn`` together
    with the first ``numberUnlabeled`` tweets read from ``pathTweet``.

    Args:
        path: Passed to ``functions.readTestComment`` and forwarded to
            ``sklearnClassify.learn``.
        pathTweet: Passed to ``functions.readManyStrings`` to load the tweets.
        numberUsedAll: Number of labeled samples to draw at random; 80% of
            this value is given to the classifier as ``numberForTraining``.
        numberUnlabeled: How many tweets (taken from the start of the list)
            are passed to the classifier.
        algorithm: Classifier selector: 1 = "svm", 2 = "MultinomialNB",
            3 = "BernoulliNB", 4 = "decisionTree".
        removeWords: Forwarded unchanged to ``functions.useFilter``.

    Returns:
        A tuple ``(ac, sp, nsp)`` of strings holding the averages, over all
        runs, of the three values returned by ``sklearnClassify.learn``.

    Raises:
        ValueError: If ``algorithm`` is not a supported selector, or (from
            ``random.sample``) if fewer than ``numberUsedAll`` labeled
            samples were read.
    """
    classifier_names = {
        1: "svm",
        2: 'MultinomialNB',
        3: 'BernoulliNB',
        4: 'decisionTree',
    }
    # Validate before any file is read: previously an unknown selector only
    # surfaced later as a NameError on the never-assigned result variables.
    if algorithm not in classifier_names:
        raise ValueError(
            "unsupported algorithm %r; expected one of %s"
            % (algorithm, sorted(classifier_names)))
    classifier = classifier_names[algorithm]

    numberForTraining = int(0.8 * numberUsedAll)
    input_list, input_score = functions.readTestComment(path, numberUsedAll)

    tweets = functions.readManyStrings(pathTweet)

    # Pick the same random indices from both lists so texts and scores stay
    # aligned.
    randomSelect = random.sample(xrange(len(input_score)), numberUsedAll)
    input_list = [input_list[i] for i in randomSelect]
    input_score = [input_score[i] for i in randomSelect]

    # The second value returned in this mode is not used here.
    filtered, _freq_words = functions.useFilter(input_list, True, removeWords)
    f_tweets = functions.useFilter(tweets, False, removeWords)
    f_tweets = f_tweets[0:numberUnlabeled]

    accuracy = []
    support = []
    not_support = []
    const_repeat = 5
    last_run = const_repeat - 1
    # range(1, const_repeat) yields const_repeat - 1 runs.
    for i in range(1, const_repeat):
        # Single-argument print(...) prints the same text under Python 2.
        print(i)

        # Only the final run passes writeOut=True to the classifier.
        writeOut = (i == last_run)
        accur, suppor, not_suppor = sklearnClassify.learn(
            filtered, input_score, numberForTraining, classifier,
            f_tweets, writeOut, tweets[0:numberUnlabeled], path)

        accuracy.append(accur)
        support.append(suppor)
        not_support.append(not_suppor)
        print("")

    ac = str(sum(accuracy) / len(accuracy))
    sp = str(sum(support) / len(support))
    nsp = str(sum(not_support) / len(not_support))
    print(ac + " " + sp + " " + nsp)
    return ac, sp, nsp
Exemple #2
0
def learnfunction(path, pathTweet, numberUsedAll, numberUnlabeled, algorithm):
    """Run the selected classifier several times and average its scores.

    Labeled samples are read with ``functions.readTestComment``; a random
    subset of ``numberUsedAll`` of them is filtered with
    ``functions.useFilter`` and handed to ``sklearnClassify.svm`` or
    ``sklearnClassify.bayes`` together with the first ``numberUnlabeled``
    filtered tweets read from ``pathTweet``.

    Args:
        path: Passed to ``functions.readTestComment``.
        pathTweet: Passed to ``functions.readManyStrings`` to load the tweets.
        numberUsedAll: Number of labeled samples to draw at random; 80% of
            this value is given to the classifier as ``numberForTraining``.
        numberUnlabeled: How many filtered tweets (taken from the start of
            the list) are passed to the classifier.
        algorithm: Classifier selector: 1 = ``sklearnClassify.svm``,
            2 = ``sklearnClassify.bayes`` with 'MultinomialNB',
            3 = ``sklearnClassify.bayes`` with 'BernoulliNB'.

    Returns:
        A tuple ``(ac, sp, nsp)`` of strings holding the averages, over all
        runs, of the three values returned by the classifier call.

    Raises:
        ValueError: If ``algorithm`` is not a supported selector, or (from
            ``random.sample``) if fewer than ``numberUsedAll`` labeled
            samples were read.
    """
    bayes_variants = {2: 'MultinomialNB', 3: 'BernoulliNB'}
    # Validate before any file is read: previously an unknown selector only
    # surfaced later as a NameError on the never-assigned result variables.
    if algorithm != 1 and algorithm not in bayes_variants:
        raise ValueError(
            "unsupported algorithm %r; expected 1, 2 or 3" % (algorithm,))

    numberForTraining = int(0.8 * numberUsedAll)
    input_list, input_score = functions.readTestComment(path, numberUsedAll)

    tweets = functions.readManyStrings(pathTweet)

    # Pick the same random indices from both lists so texts and scores stay
    # aligned.
    randomSelect = random.sample(xrange(len(input_score)), numberUsedAll)
    input_list = [input_list[i] for i in randomSelect]
    input_score = [input_score[i] for i in randomSelect]

    # The second value returned in this mode is not used here.
    filtered, _freq_words = functions.useFilter(input_list, True)
    f_tweets = functions.useFilter(tweets, False)
    f_tweets = f_tweets[0:numberUnlabeled]

    raw = functions.formRawDict(filtered, input_score)
    df = pd.DataFrame(raw)
    wordList = list(df.itertuples(index=False, name=None))
    wordList = functions.filterZeroScore(wordList)

    accuracy = []
    support = []
    not_support = []
    for i in range(1, 5):
        # Single-argument print(...) prints the same text under Python 2.
        print(i)
        # NOTE(review): wordList is never passed to a classifier below. The
        # shuffle is kept only because it advances the shared `random`
        # state; confirm whether it can be dropped.
        random.shuffle(wordList)
        wordList = wordList[0:numberUsedAll]

        if algorithm == 1:
            scores = sklearnClassify.svm(
                filtered, input_score, numberForTraining, f_tweets)
        else:
            scores = sklearnClassify.bayes(
                filtered, input_score, numberForTraining,
                bayes_variants[algorithm], f_tweets)
        accur, suppor, not_suppor = scores

        accuracy.append(accur)
        support.append(suppor)
        not_support.append(not_suppor)
        print("")

    ac = str(sum(accuracy) / len(accuracy))
    sp = str(sum(support) / len(support))
    nsp = str(sum(not_support) / len(not_support))
    print(ac + " " + sp + " " + nsp)
    return ac, sp, nsp
Exemple #3
0
    def click_ok_manual(self, event=None):
        """Switch the window into manual tweet-labelling mode.

        If the path currently held by ``self.pass_input`` is not an existing
        file, the status label shows an error and the method returns 0.
        Otherwise the start buttons and input widgets are disabled, the
        labelling controls are enabled, the tweets are loaded from that file
        with ``functions.readManyStrings`` and ``self.next_twi()`` is called.

        ``event`` is accepted but not used. The module-level globals
        ``start_bt_ms`` and ``Mflag`` are updated on the success path.
        """
        global start_bt_ms, Mflag

        if not os.path.isfile(self.pass_input.get()):
            missing_msg = "File " + self.pass_input.get() + " doesn't exist!"
            self.label.config(text=missing_msg)
            return 0

        print("file: " + self.pass_input.get())

        mode_caption = "Label tweets manually"
        self.label.config(text=mode_caption)
        start_bt_ms = mode_caption

        # Widgets are looked up by name one at a time so that each lookup and
        # config call happens in the listed order.
        for attr in ('stb', 'stb2'):
            getattr(self, attr).config(state='disabled')

        for attr in ('ctl_1', 'ctl_2', 'ctl_3'):
            getattr(self, attr).config(state='active')
        self.ctl_tx.config(state='normal')

        locked_widgets = (
            'user_input', 'user_input2', 'pass_input',
            'f1l1', 'f1l1L', 'f1l2', 'f1l2L', 'f1l3', 'f1l3L',
        )
        for attr in locked_widgets:
            getattr(self, attr).config(state='disabled')

        self.stb3.config(text="Save&Quit", default='active', width=7)

        self.tweets = functions.readManyStrings(self.pass_input.get())
        self.next_twi()
        # The two result lists are (re)created only after next_twi() has run.
        self.class1 = []
        self.class2 = []
        Mflag = True
Exemple #4
0
        

if __name__ == "__main__":
    # Interactive entry point: ask which tweet file to use, load it with
    # functions.readManyStrings, then hand the tweets to mode1 or mode2
    # depending on the mode the user selects.
    functions.startingInfo()
    path = "./output"
    # Keyword used when the user just presses Enter at a prompt.
    default_keyword = "China"

    word_to_grab = raw_input(
        "Indicate file you want to build your category 1: "
        + "type the keyword you gave in previous (grabbing) step.\n"
        + "Type \"Q\" if you wish to use your own file: "
    ) or default_keyword

    if word_to_grab == "Q":
        # An empty answer falls back to the default keyword's stream file.
        file_name = raw_input(
            "You hope to give your own file, please indicate location of it: "
        ) or (path + "/stream_" + default_keyword + ".txt")
    else:
        file_name = path + "/stream_" + word_to_grab + ".txt"

    tweets = functions.readManyStrings(file_name)
    # Single-argument print(...) prints the same text under Python 2.
    print("The file you choose is in: \"" + file_name + "\", it contains "
          + str(len(tweets)) + " tweets.")
    print("")
    print("Now select the tweets you want in training set for category 1.")
    mode = raw_input(
        "The two available classification mode you can select:\n"
        + "1. select by key words \n"
        + "2. select one by one manually \n"
        + "select the mode you want: "
    ) or "2"
    print("")

    if mode == "1":
        mode1(tweets)
    elif mode == "2":
        mode2(tweets)
    else:
        # Previously any other answer ended the script without a word.
        print("Unknown mode \"" + mode + "\": expected \"1\" or \"2\".")
Exemple #5
0
# Only the library in sklearnClassify is included currently
import functions
import sklearnClassify
import pandas as pd
import random
import datetime

# Sample sizes: the labeled set requested below is the sum of both parts.
numberForTraining = 4000
numberForTesting = 400
numberUsedAll = numberForTraining + numberForTesting

# Read the labeled texts and their scores; readTestComment returns them as a
# pair of sequences that are indexed in parallel below.
path = './output/'
input_list, input_score = functions.readTestComment(path, numberUsedAll)

# Read the tweets from the CSV file and keep only the first 2000 of them.
pathTweet = './output/unknownHead.csv'
tweets = functions.readManyStrings(pathTweet)
tweets = tweets[0:2000]

# Draw numberUsedAll distinct random indices and apply them to both lists so
# each text stays paired with its score. random.sample raises ValueError if
# fewer than numberUsedAll entries were read.
randomSelect = random.sample(xrange(len(input_score)), numberUsedAll)
input_list = [input_list[i] for i in randomSelect]
input_score = [input_score[i] for i in randomSelect]

# The counts below match scores against the strings "1" and "-1".
print "Size of positive training set:"
print input_score.count("1")
print "Size of negative training set:"
print input_score.count("-1")

# With True, useFilter's result is unpacked into two values; with False it is
# bound to a single name.
print "in filtering process..."
filtered, freq_words = functions.useFilter(input_list, True)
f_tweets = functions.useFilter(tweets, False)