Ejemplo n.º 1
0
def main(argv):
    """Run the classifier selected on the command line.

    ``argv[1]`` and ``argv[2]`` are the training and test CSV paths and
    ``argv[3]`` names the algorithm (case-insensitive):

    * ``NB`` runs ``NaiveBayes.calculate_nb``.
    * ``NN`` combined with a neighbour count (for example ``3NN``) runs
      ``KNearestNeighbor.calculate_knn`` with that count as ``k``.

    Every algorithm other than ``NB`` ends the process through ``exit()``,
    whether or not a classifier was run.
    """
    training_set = argv[1]
    test_set = argv[2]
    algorithm = argv[3]

    training_set = read_csv(training_set)
    test_set = read_csv(test_set)

    algorithm = algorithm.upper()

    if algorithm == 'NB':
        nb = NaiveBayes()
        nb.calculate_nb(training_set, test_set)
    else:
        # Look for one run of digits.  The previous pattern ('\d*') matched
        # the empty string first, so a name without leading digits crashed in
        # int('') and ``k`` could be left undefined.
        k_match = re.search(r'\d+', algorithm)
        if k_match is not None:
            k = int(k_match.group())
            # Remove exactly the matched digits; str.strip() would remove any
            # of those characters from both ends instead.
            name = algorithm[:k_match.start()] + algorithm[k_match.end():]
            if name == 'NN':
                nn = KNearestNeighbor()
                nn.calculate_knn(training_set, test_set, k)

        exit()
Ejemplo n.º 2
0
    def train(self, in_file, out_file):
        """Return a trained NaiveBayes model, using ``out_file`` as a cache.

        If ``out_file`` exists it is unpickled and returned; ``in_file`` is
        not read.  Otherwise a new ``NaiveBayes`` is trained from ``in_file``
        (read as UTF-8, one sample per line), pickled to ``out_file`` and
        returned.

        Each line loses its last two characters and is then split on ``'@'``;
        ``nb.train`` receives field 1 followed by field 0.
        """
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # out_file at files this program wrote itself.
        if os.path.exists(out_file):
            with open(out_file, 'rb') as f:
                return pickle.load(f)

        # No learned data yet: read the training file.  The context manager
        # closes the handle even when reading or decoding fails.
        nb = NaiveBayes()
        with codecs.open(in_file, 'r', 'utf-8') as fin:
            lines = fin.readlines()

        for line in lines:
            # Presumably strips a trailing "\r\n" -- TODO confirm against the
            # data files (a bare "\n" ending would lose one real character).
            words = line[:-2]
            train_words = words.split('@')
            nb.train(train_words[1], train_words[0])

        # Save learned data so the next call can skip training.
        with open(out_file, 'wb') as f:
            pickle.dump(nb, f)
        return nb
Ejemplo n.º 3
0
    def make_data_percentage_graph(input_tweets_file, input_labels_file):
        """
            Plot accuracy against the share of the data used for testing.
            The test share runs from 20 to 85 percent in steps of 5 percent,
            while alpha is fixed at 1.0. The figure is saved under Figures/.
        """
        plt.title(
            "Accuracy of Naive Bayes with varied test data size and fixed alpha"
        )
        plt.xlabel("Percentage of data used for testing")
        plt.ylabel("Accuracy Achieved")

        # 20.0, 25.0, ..., 85.0
        percentages = [float(percent) for percent in range(20, 90, 5)]
        scores = []

        for percent in percentages:
            classifier = NaiveBayes(input_tweets_file, input_labels_file, 1.0,
                                    (percent / 100.0))
            classifier.update_model()
            scores.append(classifier.evaluate_classifier_accuracy())

        plt.plot(percentages, scores)
        # The figure name embeds characters 8-9 of the tweets file path
        suffix = input_tweets_file[8:10]
        plt.savefig("Figures/accuracy_varied_test_ratio_" + suffix + ".jpeg")
        # Close the figure so the next graph starts from a clean state
        plt.close()
Ejemplo n.º 4
0
    def make_accuracy_graph(input_tweets_file, input_labels_file):
        """
            Plot accuracy against the alpha value.
            Alpha takes every integer value from 1 to 50; the test set size is
            left at the NaiveBayes default. The figure is saved under Figures/.
        """
        plt.title(
            "Accuracy of Naive Bayes with varied alpha and fixed test set size"
        )
        plt.xlabel("Alpha Value")
        plt.ylabel("Accuracy Achieved")

        alphas = range(1, 51)
        scores = []
        for alpha in alphas:
            classifier = NaiveBayes(input_tweets_file, input_labels_file, alpha)
            classifier.update_model()
            scores.append(classifier.evaluate_classifier_accuracy())

        plt.plot(alphas, scores)
        # The figure name embeds characters 8-9 of the tweets file path
        suffix = input_tweets_file[8:10]
        plt.savefig("Figures/accuracy_varied_alpha_" + suffix + ".jpeg")
        # Close the figure so the next graph starts from a clean state
        plt.close()
Ejemplo n.º 5
0
    def bindListenerVal(self, event):
        """Validate the bins field as the user types and update the widgets.

        The text the field will hold after this key event is stored in
        ``self.word``. A positive integer enables building (provided the
        directory was already validated); an empty field or any other text
        disables the Build button.
        """
        if event.keycode == 8:
            # keycode 8: drop the last character of the current text
            self.word = str(self.bins_textField.get())[:-1]
        else:
            try:
                self.word = str(self.bins_textField.get() + str(event.char))
            except:
                # Deliberately broad: fall back to the current field content
                self.word = str(self.bins_textField.get())

        is_positive_number = self.word.isdigit() and int(self.word) > 0
        if not (is_positive_number or len(self.word) == 0):
            # Invalid text: recolour the alert label and block building
            self.inputBindAlret.configure(foreground="#ffffffffffff")
            self.bindValOk = False
            self.Build_button.configure(state='disable')
            return

        self.inputBindAlret.configure(foreground="#3e5d93")
        if not self.word.isdigit():
            # Empty field: nothing wrong to show, but nothing to build either
            self.bindValOk = False
            self.Build_button.configure(state='disable')
            return

        self.bindValOk = True
        if self.directoryValOk == True:
            self.Build_button.configure(state='normal')
            self.NB = NaiveBayes()
Ejemplo n.º 6
0
    def folderBrowseAction(self):
        """Let the user pick a directory and validate it.

        The chosen path replaces the content of the directory field. The
        directory is accepted only if it exists and contains train.csv,
        test.csv and Structure.txt; otherwise an error dialog is shown. When
        the bins value is also valid the Build button is enabled.
        """
        # A hidden root window hosts the directory dialog
        dialog_root = tk.Tk()
        dialog_root.withdraw()
        chosen = askdirectory()
        if len(str(self.directory_textField.get())) != 0:
            self.directory_textField.delete(0, 'end')
        self.directory_textField.insert(0, str(chosen))
        dialog_root.destroy()

        folder = self.directory_textField.get()
        if not os.path.isdir(folder):
            self.directoryValOk = False
            if len(folder) != 0:
                messagebox.showerror('oops!',
                                     'Please insert a valid Directory path!')
                self.directory_textField.delete(0, 'end')
            return

        required = ("/train.csv", "/test.csv", "/Structure.txt")
        if not all(os.path.exists(folder + name) for name in required):
            self.directoryValOk = False
            if len(folder) != 0:
                messagebox.showerror(
                    'oops!',
                    '~~ MISSING FILES ~~\n\nMake sure that the files:\ntrain.csv,\ntest.csv\nStructure.txt \nare exists in this path!'
                )
            return

        self.directoryValOk = True
        if self.bindValOk == True:
            self.Build_button.configure(state='normal')
            self.NB = NaiveBayes()
Ejemplo n.º 7
0
    def valuesCheckButtonAbillity(self, event):
        """Validate the directory field and enable the Build button if possible.

        The directory is accepted only if it exists and contains train.csv,
        test.csv and Structure.txt; otherwise ``self.directoryValOk`` is
        cleared and, for a non-empty field, an error dialog is shown.
        ``event`` is not used.
        """
        folder = self.directory_textField.get()
        if not os.path.isdir(folder):
            self.directoryValOk = False
            if len(folder) != 0:
                messagebox.showerror('oops!',
                                     'Please insert a valid Directory path!')
                self.directory_textField.delete(0, 'end')
            return

        required = ("/train.csv", "/test.csv", "/Structure.txt")
        if not all(os.path.exists(folder + name) for name in required):
            self.directoryValOk = False
            if len(folder) != 0:
                messagebox.showerror(
                    'oops!',
                    '~~ MISSING FILES ~~\n\nMake sure that the files:\ntrain.csv,\ntest.csv\nStructure.txt \nare exists in this path!'
                )
            return

        self.directoryValOk = True
        if self.bindValOk == True:
            self.Build_button.configure(state='normal')
            self.NB = NaiveBayes()
Ejemplo n.º 8
0
def use_nb2(datas_train, datas_valid):
    """Train and validate a NaiveBayes repeatedly over successive partitions.

    Each round trains on ``datas_train``, predicts ``datas_valid`` and counts
    the correct predictions. The next round trains on the following slice of
    the module-level ``datas`` (slice length ``plen``), relabelled with the
    predictions just made, and validates on the slice after that. Runs
    ``partition`` rounds and returns the summary string that is also printed.
    """
    nb = NaiveBayes()
    predicts_all = []
    correct_all = 0

    for cur_part in range(1, partition + 1):
        nb.train(datas_train)
        predicts = [nb.predict(item['content'])[0] for item in datas_valid]
        predicts_all.extend(predicts)

        correct = sum(1 for guess, item in zip(predicts, datas_valid)
                      if guess == item['category'])
        correct_all += correct
        print("Correct: ", correct, "out of ", len(datas_valid))
        print(datas_valid[-1]['date'])

        # Slice boundaries for the next round, clamped to the data length
        start = min(len(datas), plen * cur_part)
        middle = min(len(datas), plen * (cur_part + 1))
        end = min(len(datas), plen * (cur_part + 2))
        datas_train = datas[start:middle]
        datas_valid = datas[middle:end]

        # The new training slice takes this round's predictions as labels
        for i in range(len(datas_train)):
            datas_train[i]['category'] = predicts[i]

    res = "Correct: %d out of %d"%(correct_all, len(predicts_all))
    print(res)
    return res 
Ejemplo n.º 9
0
 def __init__(self, *args, **kwargs):
     """Set up the app state, start the monitor and build the classifier.

     All positional and keyword arguments are forwarded unchanged to the
     parent class constructor.
     """
     super(SmartMatch, self).__init__(*args, **kwargs)
     # Starts out empty; filled elsewhere (not visible in this block)
     self.datapaths = {}
     # NOTE(review): _monitor is spawned before naive_bayes and
     # flow_container exist -- confirm it does not touch them right away.
     self.monitor_thread = hub.spawn(self._monitor)
     self.naive_bayes = NaiveBayes()
     self.naive_bayes.init_classifier()
     # Report the accuracy score of the freshly initialised classifier
     self.logger.info('instantiated Naive Bayes classifier, its accuracy score is: %0.2f',
                      self.naive_bayes.get_accuracy_score())
     # Starts out empty; filled elsewhere (not visible in this block)
     self.flow_container = {}
Ejemplo n.º 10
0
def run_nb():
    """Import, train and evaluate the Naive Bayes model on the US tweet data.

    Progress messages and the final accuracy are printed to stdout.
    """
    print("Importing Naive Bayes...")
    # Imported here so the progress messages surround the import itself
    from NaiveBayes import NaiveBayes
    print("Successfully Imported Naive Bayes.")
    print("Running...")
    classifier = NaiveBayes(file_paths.us_tweets_path,
                            file_paths.us_labels_path)
    classifier.update_model()
    accuracy = classifier.evaluate_classifier_accuracy()
    print("Execution Complete. Accuracy:" + str(accuracy) + " %")
Ejemplo n.º 11
0
    def train(self, dataSet, CHigher):
        """Train both sub-models on their split of ``dataSet``, then modify.

        ``assignSensitivity`` runs first; the data set is then split by
        ``self.Sx`` and ``self.Sy``, each part is trained into its own model
        through ``NaiveBayes.train``, and finally ``self.modify`` is applied
        with ``CHigher``.
        """
        self.assignSensitivity(dataSet)

        # Both splits are produced before either model is trained
        x_part = self.splitDataFrame(dataSet, self.Sx)
        y_part = self.splitDataFrame(dataSet, self.Sy)

        NaiveBayes.train(self, x_part, self.modelX)
        NaiveBayes.train(self, y_part, self.modelY)

        self.modify(dataSet, CHigher)
Ejemplo n.º 12
0
	def __init__(self,paramlist):
		"""Create the corpora lists, both classifiers and the log configuration.

		``paramlist`` is accepted but not used in this constructor.
		"""
		#list of filenames in validation corpus
		self.validationcorpus = []
		#list of filenames in trainingcorpus
		self.trainingcorpus = []
		# presumably fills the two lists above -- TODO confirm in createvalandtrain
		self.createvalandtrain()
		#svm class object:
		self.svm = SVM_classifier()
		#naive bayes class object:
		self.nb = NaiveBayes()
		# Debug-level logging goes to test.log
		logging.basicConfig(filename="test.log", level=logging.DEBUG)
Ejemplo n.º 13
0
 def naive_bayes(self):
     """Return the mean accuracy of NaiveBayes over ``self.k`` folds.

     Each fold trains on its training part and is scored on its test part.
     The plot flag passed to ``nb.test`` turns on at fold index 9 and stays
     on for any later fold.
     """
     nb = NaiveBayes()
     train_data, test_data = self.kfold_split(self.k)

     total_score = 0
     plot = False
     for fold in range(self.k):
         classifier = nb.train(train_data[fold])
         # Once switched on, plotting is never switched off again
         plot = plot or fold == 9
         total_score = total_score + nb.test(classifier, test_data[fold],
                                             plot)
     return total_score / self.k
Ejemplo n.º 14
0
def main():
    """Fit NaiveBayes on the SMS spam collection and label the input file.

    One line is printed per input message: the upper-cased prediction
    followed by the message.
    """
    data, target = DataSetFileReader.read_dataset_file(
        'data/SMSSpamCollection')

    classifier = NaiveBayes()
    classifier.fit(data, target)

    messages = DataSetFileReader.read_input_data_file('data/inputdata')
    predictions = NaiveBayes.predict(classifier, messages)

    for label, message in zip(predictions, messages):
        print('{0} -> {1}'.format(label.upper(), message))
Ejemplo n.º 15
0
def main():
    """Compare decision tree, KNN and naive Bayes on train.txt / test.txt.

    The learned tree is written to output_tree.txt (without its final
    character). Every test entry is classified by all three models (KNN with
    k=5) and output.txt receives one row of predictions per entry plus a final
    row of accuracies, each rounded up to two decimals.
    """
    attributes_train, data_train = read_from_file("train.txt")

    # Decision tree
    dtl = DecisionTree()
    tree = dtl.build(data_train, attributes_train)
    with open("output_tree.txt", "w") as tree_file:
        tree_string = dtl.write_tree_to_file(tree, attributes_train, 0)
        tree_file.write(tree_string[:-1])
    # KNN
    knn = KNearestNeighbors(attributes_train, data_train)
    # Naive Bayes
    naive_bayes = NaiveBayes(attributes_train, data_train)
    attribute_text, data_test = read_from_file("test.txt")

    knn_result, naive_bayes_result, dtl_result = [], [], []
    real_classify = []
    for line in data_test:
        # The last column is the true class, the rest is the entry
        real_classify.append(line[-1])
        entry = line[:-1]
        knn_result.append(knn.predict(entry, 5))
        naive_bayes_result.append(naive_bayes.predict(entry))
        dtl_result.append(dtl.predict(tree, entry, attribute_text))

    def _accuracy(predictions):
        # Share of correct predictions, rounded up to two decimals
        hits = sum(1 for guess, real in zip(predictions, real_classify)
                   if guess == real)
        ratio = hits / len(real_classify)
        return float(math.ceil(ratio * 100)) / float(100)

    acc_knn = _accuracy(knn_result)
    acc_nb = _accuracy(naive_bayes_result)
    acc_dtl = _accuracy(dtl_result)

    with open('output.txt', 'w') as output:
        output.write("Num\tDT\tKNN\tnaiveBase\n")
        rows = zip(dtl_result, knn_result, naive_bayes_result)
        for number, (a, b, c) in enumerate(rows, 1):
            output.write(str(number) + "\t" + a + "\t" + b + "\t" + c + "\n")
        output.write("\t" + str(acc_dtl) + "\t" + str(acc_knn) + "\t" +
                     str(acc_nb) + "\n")
Ejemplo n.º 16
0
def buildSplits(numFolds, args):
    """Build the train/test splits from the pos/ and neg/ example folders.

    With one entry in ``args``, returns ``numFolds`` cross-validation splits
    of that directory: a file goes to the test part of a fold when the third
    character of its name equals the fold number. With two entries, returns a
    single split that trains on the first directory and tests on the second.
    Any other number of entries yields an empty list.
    """
    def _makeExample(path, klass):
        # One labelled example holding the words read from `path`
        example = NaiveBayes.Example()
        example.words = readFile(path)
        example.klass = klass
        return example

    splits = []
    trainDir = args[0]
    if len(args) == 1:
        print('[INFO]\tPerforming %d-fold cross-validation on data set:\t%s' % (
            numFolds, trainDir))
        posTrainFileNames = os.listdir('%s/pos/' % trainDir)
        negTrainFileNames = os.listdir('%s/neg/' % trainDir)
        labelled = (('pos', '%s/pos/%s', posTrainFileNames),
                    ('neg', '%s/neg/%s', negTrainFileNames))
        for fold in range(0, numFolds):
            split = NaiveBayes.TrainSplit()
            for klass, template, fileNames in labelled:
                for fileName in fileNames:
                    example = _makeExample(template % (trainDir, fileName),
                                           klass)
                    if fileName[2] == str(fold):
                        split.test.append(example)
                    else:
                        split.train.append(example)
            splits.append(split)
    elif len(args) == 2:
        split = NaiveBayes.TrainSplit()
        testDir = args[1]
        print('[INFO]\tTraining on data set:\t%s testing on data set:\t%s' % (
            trainDir, testDir))
        posTrainFileNames = os.listdir('%s/pos/' % trainDir)
        negTrainFileNames = os.listdir('%s/neg/' % trainDir)
        for klass, template, fileNames in (
                ('pos', '%s/pos/%s', posTrainFileNames),
                ('neg', '%s/neg/%s', negTrainFileNames)):
            for fileName in fileNames:
                split.train.append(
                    _makeExample(template % (trainDir, fileName), klass))

        posTestFileNames = os.listdir('%s/pos/' % testDir)
        negTestFileNames = os.listdir('%s/neg/' % testDir)
        for klass, template, fileNames in (
                ('pos', '%s/pos/%s', posTestFileNames),
                ('neg', '%s/neg/%s', negTestFileNames)):
            for fileName in fileNames:
                split.test.append(
                    _makeExample(template % (testDir, fileName), klass))
        splits.append(split)
    return splits
Ejemplo n.º 17
0
def compare(filename):  # filename is expected to be Tp1_data.csv
    """Evaluate KNN, logistic regression and naive Bayes and compare them.

    The data is split into a stratified train part and a 33% test part; each
    classifier is tuned with 5 stratified folds, its score and best parameter
    are printed, and the pairwise McNemar statistics are printed at the end.
    """
    showPlots = False
    folds = 5
    Xs, Ys = get_data(filename)
    X_r, X_t, Y_r, Y_t = train_test_split(Xs, Ys, test_size=0.33, stratify=Ys)
    Kf = StratifiedKFold(Y_r, n_folds=folds)

    KnnErr, bestN, KnnPred = Knn(Kf, X_r, Y_r, X_t, Y_t, showPlots)
    print("KnnErr, best_N:", KnnErr, bestN)

    LogScore, bestC, LogPred = Logistic(Kf, X_r, Y_r, X_t, Y_t, showPlots)
    print("LogisticScore, best_C:", LogScore, bestC)

    NBScore, bestBandwidth, NBPred = NaiveBayes(Kf, X_r, Y_r, X_t, Y_t,
                                                showPlots)
    print("NBScore, best_Bandwidth:", NBScore, bestBandwidth)

    # All three statistics are computed before anything is printed
    mcnemar_results = [
        ("MCNemarKnn_Log", MCNemar(KnnPred, LogPred, Y_t)),
        ("MCNemarNB_Log", MCNemar(NBPred, LogPred, Y_t)),
        ("MCNemarNB_Knn", MCNemar(KnnPred, NBPred, Y_t)),
    ]

    print()
    print("McNemar:")
    for label, value in mcnemar_results:
        print(label, value)
Ejemplo n.º 18
0
def buildTestCorpus(ch_aux):
  """Turn 'doc1\n###\ndoc2\n###...' into a split holding test examples.

     Every '###'-separated document becomes one example labelled 'UNK'
     whose words are the whitespace-separated tokens of the document.
  """
  testSplit = NaiveBayes.TrainSplit()
  for rawDocument in ch_aux.split('###'):
    example = NaiveBayes.Example()
    example.klass = 'UNK' # the label is unknown at testing time
    # strip() removes the newlines left around the separator
    example.words = list(rawDocument.strip().split())
    testSplit.test.append(example)
  return testSplit
Ejemplo n.º 19
0
 def __init__(self, ntrees, nbayes, pruneSplit=0.0):
     """Create an ensemble of ``ntrees`` decision trees and ``nbayes``
     naive Bayes classifiers; ``pruneSplit`` is passed to every tree.
     """
     super().__init__()
     trees = [DecisionTree(pruneSplit=pruneSplit) for _ in range(ntrees)]
     bayes_models = [NaiveBayes() for _ in range(nbayes)]
     # Trees come first, followed by the naive Bayes classifiers
     self.classifiers = trees + bayes_models
    def processData(self, modelName, gender, pClass, siblings, embarked):
        """Load and clean train.csv, then build the requested model.

        The passenger id column is dropped, ``Sex`` and ``Embarked`` are
        mapped to numeric codes and missing ``Embarked`` values are filled
        with the column mean. The cleaned frame is handed to the model class
        selected by ``modelName``; any unrecognised name selects
        ``SupportVector``.

        ``gender``, ``pClass``, ``siblings`` and ``embarked`` are accepted
        but not used here.
        """
        df = pd.read_csv('train.csv', sep=',')

        # The passenger id carries no information for the models
        df = df.drop('PassengerId', axis=1)

        # Assign the converted columns back instead of calling
        # replace/fillna with inplace=True on df[col]: under pandas
        # Copy-on-Write the in-place form changes a temporary object and
        # leaves df untouched.
        df["Sex"] = df["Sex"].replace({"male": 0, "female": 1})
        df["Embarked"] = df["Embarked"].replace({"S": 0, "C": 1, "Q": 2})
        df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mean())

        if modelName == 'Decision Tree':
            return DecisionTree(df)
        if modelName == 'Naive Bayes':
            return NaiveBayes(df)
        if modelName == 'Neural Network':
            return NeuralNetwork(df)
        if modelName == 'Random Forest':
            return RandomForest(df)
        return SupportVector(df)
Ejemplo n.º 21
0
    def train(self):
        """
        Train one entity classifier for each of the first six label sets.

        Every classifier is cross-validated on ``self.ori_data`` with its own
        labels and appended to ``self.classifiers``; ``self.nb`` ends up
        holding the last one.
        """
        for set_index in range(6):
            ori_labels = self.label_sets[set_index]

            print("[Training Entity Classifier with VLSP 2018]")
            # one NaiveBayes object per label set, built from its distinct labels
            classifier = NaiveBayes(np.unique(ori_labels))
            self.nb = classifier
            print("---------------- Training In Progress --------------------")

            # training is done through cross-validation
            classifier.cross_validation(self.ori_data, ori_labels)
            self.classifiers.append(classifier)
            print('----------------- Training Completed ---------------------')
def inspect(fname, dbname):
    """Train NaiveBayes on the data set ``dbname`` and inspect one sample.

    The sample is fetched with ``get_data(fname, dbname)`` as a
    whitespace-separated string of numbers, converted to floats and passed to
    ``classifier.InspectData``, whose result is returned.
    """
    classifier = NaiveBayes()
    ds_builder = DataSet(classifier)

    ds, labels, attributes = ds_builder.ReadDataSet(dbname)
    trained_ds = classifier.TrainingDataSet(ds, labels, attributes)

    sample = [float(token) for token in get_data(fname, dbname).split()]
    return classifier.InspectData(ds, trained_ds, sample)
Ejemplo n.º 23
0
    def get_classifier_object(self):
        """Build, train and run the classifier named by ``classifier_name``.

        Supported names are 'RForest', 'XGB', 'NaiveBayes' and 'AdaBoost'.
        For a supported name the wrapper is stored in ``self.clf`` and its
        predictions in ``self.y_pred``. For any other name ``self.clf`` is
        left as it was. Returns ``self.clf.get_classifier()``.
        """
        # 'LogReg', 'DeciTree' and 'svm' used to be handled here as well;
        # those branches are currently disabled.
        name = self.classifier_name
        if name == 'RForest':
            wrapper_class = RandomForest
        elif name == 'XGB':
            wrapper_class = XGBoost
        elif name == 'NaiveBayes':
            wrapper_class = NaiveBayes
        elif name == 'AdaBoost':
            wrapper_class = AdaBoost
        else:
            wrapper_class = None

        if wrapper_class is not None:
            self.clf = wrapper_class(self.x_train, self.y_train, self.x_test,
                                     self.y_test)
            self.clf.train()
            self.y_pred = self.clf.predict()

        return self.clf.get_classifier()
Ejemplo n.º 24
0
    def load_from_model(self, model_name):
        """
        Load the dictionaries of the classification classes from the given
        model file, then start the GUI, which waits for the button press and
        classifies the entered sentence.
        :param model_name: model file from which the data is loaded.
        """

        with open(model_name, "r") as model_file:
            model_data = json.load(model_file)

            # Feature extraction method stored in the model
            if model_data["namepriz"] == "BagOfWords":
                self.priz_metoda = BagOfWords()
            elif model_data["namepriz"] == "TfIdf":
                self.priz_metoda = TfIdf()
            elif model_data["namepriz"] == "NGram":
                self.priz_metoda = NGram()

            extractor = self.priz_metoda
            extractor.words = model_data["words"]
            extractor.klas_tridy = model_data["klas_tridy"]
            extractor.prior = model_data["prior"]

            # Classifier stored in the model
            if model_data["nameklas"] == "NaiveBayes":
                self.klasifikator = NaiveBayes(self.priz_metoda)
            elif model_data["nameklas"] == "NN":
                self.klasifikator = NN(self.priz_metoda)

        self.top.title("Classify")
        self.top.geometry('400x300')
        commit_button = Button(self.top, height=1, width=10, text="Commit",
                               command=lambda: self.retrieve_input())
        self.text1.pack()
        commit_button.pack()
        self.label.pack()
        self.top.mainloop()
Ejemplo n.º 25
0
def initiate_nb():
    """Train a NaiveBayes classifier on the rows of FakeNews.sqlite.

    Column 2 of ``FakeNewsTbl`` is used as the input and column 1 as the
    label. The rows are split 75/25 (stratified, fixed random state) and the
    classifier is trained on the larger part.

    Returns:
        The trained ``NaiveBayes`` instance.
    """
    conn = sqlite3.connect("FakeNews.sqlite")
    try:
        c = conn.cursor()
        c.execute("Select * from FakeNewsTbl")
        results = c.fetchall()
    finally:
        # Release the connection even when the query fails
        conn.close()

    x_train = [row[2] for row in results]
    y_train = [row[1] for row in results]

    # The held-out test part is produced by the split but not used here
    train_data, test_data, train_labels, test_labels = train_test_split(
        x_train, y_train, shuffle=True, test_size=0.25,
        random_state=42, stratify=y_train)
    classes = np.unique(train_labels)

    # Training phase
    nb = NaiveBayes(classes)
    nb.train(train_data, train_labels)
    return nb
def leave_one_out(x, y, is_continuous):
    """
    Evaluate the model with leave-one-out validation.
    :return: accuracy of the model under leave-one-out validation
    """
    sample_count = len(x)
    hits = 0
    for held_out in range(sample_count):
        # Train on everything except the held-out sample
        model = NaiveBayes(np.delete(x, held_out, 0),
                           np.delete(y, held_out, 0), is_continuous)
        model.train()
        if model.inference(x[held_out]) == y[held_out]:
            hits += 1
    return hits / sample_count
Ejemplo n.º 27
0
def get_model(model):
    """
    Rebuild a NaiveBayes classifier from the JSON model file ``model``.

    JSON object keys are strings, so the keys of ``cates_info`` and of the
    per-category dictionaries are converted back to integers.
    """
    with open(model, encoding='utf-8') as json_file:
        data = json.load(json_file)

    classes = np.asarray(data["classes"])
    cates_info = {int(key): value
                  for key, value in data["cates_info"].items()}
    for cate_index in range(len(classes)):
        cates_info[cate_index] = {
            int(key): value
            for key, value in cates_info[cate_index].items()
        }

    nb = NaiveBayes(classes)
    nb.cates_info = cates_info
    return nb
def main():
    """Interactively train a text NaiveBayes and classify typed sentences.

    Training sentences and their classes are read from stdin, converted by a
    ``Clean`` object and used to train the classifier; then the test
    sentences are read and one prediction is printed for each.
    """
    n = int(input('enter no of training data sentences :'))
    X, Y = [], []
    for i in range(n):
        sentence = input('enter ' + str(i)+'th sentence:')
        X.append(sentence.strip().split(' '))
        label = input('enter the class of sentence :')
        Y.append(label.strip())

    clean_obj = Clean()
    clean_obj.feature_extract(X, n)
    clean_obj.print_features()
    X = clean_obj.transform_X(X, n)
    Y = clean_obj.transform_Y(Y)
    print('X is :',X)
    print('Y is :',Y)

    clf = NaiveBayes()
    clf.train_text(X, Y)
    clf.printdictionaries()

    m = int(input('enter number of testing entries:'))
    Xtest = [
        input('enter ' + str(i+1)+'th sentence:').strip().split(' ')
        for i in range(m)
    ]
    Xtest_trans = clean_obj.transform_X(Xtest, m)
    print(Xtest_trans)
    for i in range(m):
        print(clf.predict_text(Xtest_trans[i]))
Ejemplo n.º 29
0
def main(argv):

	trainfile = ''
	testfile  = ''
	mode      = ''

	# validate input
	if len(sys.argv) == 4:
		trainfile = sys.argv[1]
		testfile  = sys.argv[2]
		mode      = sys.argv[3]
	else:
		print("incorrect input supplied")
		sys.exit()


	# ingest data
	trainset = readFile(trainfile)
	testset  = readFile(testfile)

	
	
	# train on random subsets of the data
	sizes = [25, 50, 100]
	accs = []
	for size in sizes:
		tmpAcc = []
		for j in range(4):
			tmpSet = random.sample(trainset.instances, size)

			bayes = NaiveBayes(trainset, testset)
			bayes.train(tmpSet)

			preds = bayes.classify(testset.instances)

			corCount = 0
			for i in range(len(preds)):
				#print preds[i][0], testset.instances[i][-1], preds[i][1]
				if preds[i][0] == testset.instances[i][-1]:
					corCount += 1

			print size, j, corCount
			tmpAcc.append(corCount)
		meanAcc = float(sum(tmpAcc)) / len(tmpAcc)
		accs.append([size, meanAcc])
    def init_process(self):
        """Train NaiveBayes on ./DataSet/train and report test accuracy.

        The smoothing parameter is read from stdin. Every e-mail parsed from
        ./DataSet/test is classified, and the number of correct predictions
        is passed once to ``computeAccuracy`` together with the test set size.
        """
        # NOTE(review): on Python 3 input() returns a string -- confirm that
        # NaiveBayes converts the smoothing value to a number.
        smooth = input('Enter smoothing parameter')

        # Parse training and test data into sets.
        trainEmails = self.parseEmails("./DataSet/train")
        testEmails = self.parseEmails("./DataSet/test")

        # Train the classifier, then count the correct predictions.
        nb = NaiveBayes(smooth)
        nb.train(trainEmails)
        correctPred = sum(
            1 for e in testEmails if e.getLabel() == nb.predict(e))

        # Report once, after all e-mails are classified.  The original line
        # was tab-indented, which is a TabError on Python 3 and placed the
        # call inside the per-e-mail `if` on Python 2.
        self.computeAccuracy(correctPred, len(testEmails))
 def latihData(self):
     """Train the selected model on the imported data and show the time.

     'Naive Bayes' trains ``NaiveBayes`` and 'Gaussian NB' trains
     ``GNaiveBayes`` on ``self.ImportedFile.getDataLatih()``. The elapsed
     training time is shown in ``LabelWaktuLatih``. Any failure while
     preparing or training is reported in an error dialog, as is an unknown
     algorithm selection.
     """
     algorithm = self.Algorithm.get()
     if algorithm == "Naive Bayes":
         try:
             print("NB")
             self.ImportedFile.setDataLatih()
             print(self.ImportedFile.getDataLatih())
             self.NaiveBayes = NaiveBayes(self.ImportedFile.getDataLatih())
             self._latih_dan_tampilkan_waktu(self.NaiveBayes)
         except Exception:
             msg.showerror("Terjadi Kesalahan",
                           "Pastikan dataset sudah diinput!")
     elif algorithm == "Gaussian NB":
         print(0)
         try:
             print("GNB")
             self.ImportedFile.setDataLatih()
             print(self.ImportedFile.getDataLatih())
             self.GNB = GNaiveBayes(self.ImportedFile.getDataLatih())
             self._latih_dan_tampilkan_waktu(self.GNB)
         except Exception:
             msg.showerror(
                 "Terjadi Kesalahan",
                 "Pastikan dataset sudah diinput, dan bersifat numerik atau kontinyu!"
             )
     else:
         msg.showerror(
             "Terjadi Kesalahan",
             "Harap masukkan data atau pilih algoritma terlebih dahulu")

 def _latih_dan_tampilkan_waktu(self, model):
     """Run ``model.latih()`` and display the elapsed time in the labels."""
     # time.clock() was removed in Python 3.8; use perf_counter() when it
     # exists and fall back to clock() on interpreters that lack it.
     timer = getattr(time, "perf_counter", None) or time.clock
     latihstart = timer()
     model.latih()
     waktu = round(timer() - latihstart, 2)
     self.LabelWaktuLatih.configure(text="Waktu Latih: " +
                                    str(waktu) + " detik")
     self.LabelWaktuLatih.lift(self.Frame2)
     self.LabelModelLatih.lift(self.Frame2)
Ejemplo n.º 32
0
def use_nb(datas_train, datas_valid):
    """Train NaiveBayes on ``datas_train`` and score it on ``datas_valid``.

    A progress line is printed every 1000 documents. The first element of
    each prediction is compared with the document's 'category'. Returns the
    summary string, which is also printed.
    """
    nb = NaiveBayes()
    nb.train(datas_train)

    total = len(datas_valid)
    predicts = []
    for index in range(total):
        if (index % 1000) == 0:
            print("Getting prediction of", index, "documents out of", len(datas_valid))
        predicts.append(nb.predict(datas_valid[index]['content']))

    # Only the first element of every prediction is the predicted category
    labels = [prediction[0] for prediction in predicts]
    correct = sum(1 for label, item in zip(labels, datas_valid)
                  if label == item['category'])

    res = "Correct: %d out of %d"%(correct, len(datas_valid))
    print(res)
    return res 
Ejemplo n.º 33
0
    def init(self):
        """
            Initialise pygame, the drawing grid, the fonts and the classifier.

            If ./trained.pickle exists, the classifier is asked to load its
            stored training values; otherwise it is trained on 60000 MNIST
            samples and the elapsed time is printed.
        """
        pygame.init()
        pygame.display.init()
        pygame.display.set_caption('Naive Bayes for digit recognition')
        screen_width = 330
        screen_height = 280
        self.screen = pygame.display.set_mode([screen_width, screen_height])
        # 28x28 matrix; every entry holds the rgb value of the pixel and its
        # actual position on screen
        self.pixel = [[[(10, 10, 10), [i * 10, j * 10]] for i in range(28)]
                      for j in range(28)]
        self.predicted = '?'
        pygame.font.init()
        self.res = pygame.font.Font(None, 36)   # font for the predicted value
        self.text = pygame.font.Font(None, 18)  # font for the normal text
        self.prev_state = pygame.mouse.get_pressed()

        self.classifier = NaiveBayes()
        PATH = './trained.pickle'

        if os.path.isfile(PATH):
            # Already trained: the third argument makes train() load the
            # stored values (presumably from PATH -- TODO confirm)
            self.classifier.train([], [], True)
        else:
            training = True

            print('Reading MNIST')
            # first read the training data
            training_set, training_labels = self.classifier.read_MNIST(
                60000, training)
            print('DONE!\n')

            print('Training')
            # Measure wall-clock time directly.  Subtracting the minute and
            # second fields of two localtime() values independently gave wrong
            # results whenever the seconds wrapped, and ignored hours.
            start_time = time.time()
            self.classifier.train(training_set, training_labels, False)
            minutes, seconds = divmod(int(time.time() - start_time), 60)
            print('DONE IN ' + str(minutes) + 'min ' + str(seconds) +
                  'sec!\n')
        self.initialized = True
Ejemplo n.º 34
0
from Tokenizer import Tokenizer
import re
import os
import json
from collections import OrderedDict


def _load_json(*path_parts):
    """Parse and return the JSON document at os.path.join(*path_parts).

    The file is opened inside a ``with`` block so the handle is closed as
    soon as parsing finishes or fails, instead of being left to the garbage
    collector.
    """
    with open(os.path.join(*path_parts), 'rb') as handle:
        return json.load(handle)


# Command-line arguments.
dir1 = sys.argv[1] # dir for top level category classifier. prior, condprobs & config
dir2 = sys.argv[2] # dir for subcat classifier. Contains subdirs for every top level category. Subdirs have prior & condprobs
infile = open(sys.argv[3],'r') # input file, tab separated; deliberately left open, it is not read in this part of the script
opformat = sys.argv[4] # json or tsv
assert opformat == 'json' or opformat == 'tsv'

# Top-level classifier.
prior = _load_json(dir1, 'prior.json')
condprobs = _load_json(dir1, 'probs.json')
NB = NaiveBayes(prior, condprobs)
t = Tokenizer()

# One sub-category classifier per top-level category. The category name is
# mapped to its directory name by replacing spaces and '&' with '_'.
subcat_classifiers = {}
for k in prior.keys():
    subdir = re.sub('[ &]','_', k)
    p = _load_json(dir2, subdir, 'prior.json')
    c = _load_json(dir2, subdir, 'probs.json')
    subcat_classifiers[k] = NaiveBayes(p,c)

def unicodify(text):
    """Return *text* encoded as UTF-8, dropping characters that cannot be encoded.

    Despite its name, this function encodes (text -> byte string); it does
    not decode.
    """
    encoded = text.encode('utf-8', 'ignore')
    return encoded

def print_line(d):
    if opformat == 'tsv':
        print "\t".join(d.values()).encode('utf-8','ignore')
    if opformat == 'json':
Ejemplo n.º 35
0
def main(argv):

	trainfile = ''
	testfile  = ''
	mode      = ''

	# validate input
	if len(sys.argv) == 4:
		trainfile = sys.argv[1]
		testfile  = sys.argv[2]
		mode      = sys.argv[3]
	else:
		print("incorrect input supplied")
		sys.exit()


	# ingest data
	trainset = readFile(trainfile)
	testset  = readFile(testfile)

	# y1 = 0
	# y2 = 0
	# for instance in trainset.instances:
	# 	if instance[-1] == trainset.labels[0]:
	# 		y1 +=1
	# 	else:
	# 		y2 +=1
	
	if mode == "n":
		print trainset.attributeValues
		print trainset.labels[0], y1
		print trainset.labels[1], y2

		bayes = NaiveBayes(trainset, testset)
		bayes.train(trainset.instances)
		#print bayes.yCounts
		#print bayes.xGivenYCounts[trainset.labels[0]]['bl_of_lymph_c'].values()
		#print bayes.xGivenYCounts[trainset.labels[1]]['bl_of_lymph_c'].values()

		preds = bayes.classify(testset.instances)

		corCount = 0
		for i in range(len(preds)):
			print preds[i][0], testset.instances[i][-1], preds[i][1]
			if preds[i][0] == testset.instances[i][-1]:
				corCount += 1

		print corCount
	
	if mode == "t":
		tan = TAN(trainset, trainset)
		edges = tan.initializeGraph()
		prim = tan.growPrim(edges)
		
		tan.setParentList(prim[1])
		for attrib in trainset.attributes:
			if tan.parentList[attrib]:
				print attrib, tan.parentList[attrib][0], 'class'
			else:
				print attrib, 'class'
		preds = tan.classify(testset.instances)
		print ''
		corCount = 0
		for i in range(len(preds)):
			print preds[i][0], testset.instances[i][-1], preds[i][1]
			if preds[i][0] == testset.instances[i][-1]:
				corCount += 1

		print ''
		print corCount
Ejemplo n.º 36
0
    are written to stdout.
    
    @author David Greisler <*****@*****.**>
    @author Paul Kitt <*****@*****.**>
    
"""

from DirectoryCrawler import DirectoryCrawler 
from TrainingClass import TrainingClass
from NaiveBayes import NaiveBayes
from BagOfWords import BagOfWords
import os

# Placeholder value: it must be replaced with the path to the data directory
# before the script can run.
root_path = "<HIER PFAD ZUM data VERZEICHNIS EINFUEGEN!>"
crawler = DirectoryCrawler(root_path)
naive_bayes = NaiveBayes()

# Document classes and the accumulators filled while loading them.
class_names = [ "politik", "sport", "wirtschaft" ]
training_classes = []
number_of_documents = 0
classes = []
vocabulary = BagOfWords("")

print("Root directory for test/training data: " + crawler.root_path)
print("Class names: " + ', '.join(map(str, class_names)))

# Load the training documents of every class and keep a running total.
for document_class in class_names:
    documents = crawler.read_training_documents(document_class)
    training_class = TrainingClass(document_class, documents)
    training_classes.append(training_class)
    number_of_documents = number_of_documents + len(training_class.training_documents)
    
Ejemplo n.º 37
0
#datapoints=[]
# Extract a feature vector for every item of data2 and label each one
# 'non_confused'. `data2`, `classes`, `datapoints` and `getFeatures` are
# defined earlier in the script, outside this excerpt.
for data in data2:
  datapoint=getFeatures(data)
  print data
  classes.append('non_confused')
  datapoints.append(datapoint)

#print datapoints  
#print classes
#print 
# Report how many raw items and how many feature vectors there are.
print len(data2)
print len(datapoints)



nb = NaiveBayes()


# Remember the freshly computed data under second names. These are aliases
# of the same list objects, not copies.
tdatapoints = datapoints
tclasses = classes;

if ('test' in sys.argv[1]):
  print sys.argv[1]
  try:
     datapoints=pickle.load( open( "save.p", "rb" ) )
     classes=pickle.load( open( "save.p1", "rb" ) )
  except:
     datapoints = tdatapoints
     classes = tclasses
     print "reverting to stateless mode"   
else:
Ejemplo n.º 38
0
# from __future__ import division
from NaiveBayes import NaiveBayes
import Preprocessing

__author__ = 'undeed'

# Input spreadsheets with the training and the test data.
inputDataTrain = 'Data Train.xlsx'
inputDataTest = 'Data Test.xlsx'

# File names for the preprocessed data set, the classification model and the
# classification result.
preprocessedData = "dataset_preprocessing.xlsx"
model = "model_classification.xlsx"
outputResult = "RESULT CLASS.xlsx"


# Preprocessing and learning are currently disabled; only the testing step
# below runs.
# print "preprocess file"
# Preprocessing.preprocessFile(inputDataTrain, preprocessedData)
#
nb = NaiveBayes(model)
#
# print "start learning"
# nb.learning(inputDataTrain, preprocessedData)
# print "stop learning"

# Run the classifier on the test spreadsheet, passing the result file name.
print "start testing"
nb.testing(inputDataTest, outputResult)



Ejemplo n.º 39
0
class NaiveBayesUI:
    """
        Small pygame front end that lets the user draw a digit on a 28x28
        grid and asks the Naive Bayes classifier to predict it.

        Typical use: create the object, call init() once, then main().
    """

    # Class-level default so that draw() is a harmless no-op (instead of
    # raising AttributeError) when it is called before init() has finished.
    initialized = False

    @staticmethod
    def _format_elapsed(seconds):
        """
            Format a duration given in seconds as '<m>min <s>sec'.
            Fractions of a second are dropped; negative values count as 0.
        """
        minutes, secs = divmod(max(0, int(seconds)), 60)
        return str(minutes) + 'min ' + str(secs) + 'sec'

    def init(self):
        """
            Open the window, set up the drawing grid and fonts, and prepare
            the classifier: training is skipped when './trained.pickle'
            exists, otherwise the classifier is trained on the MNIST data.
        """
        pygame.init()
        pygame.display.init()
        pygame.display.set_caption('Naive Bayes for digit recognition')
        screen_width = 330
        screen_height = 280
        self.screen = pygame.display.set_mode([screen_width, screen_height])

        # 28x28 matrix; every entry is [rgb colour, [x, y] position on screen].
        self.pixel = [[[(10, 10, 10), [i * 10, j * 10]] for i in range(28)] for j in range(28)]
        self.predicted = '?'

        pygame.font.init()
        self.res = pygame.font.Font(None, 36)       # font for the predicted value
        self.text = pygame.font.Font(None, 18)      # font for the normal text
        self.prev_state = pygame.mouse.get_pressed()

        self.classifier = NaiveBayes()
        PATH = './trained.pickle'

        if os.path.isfile(PATH):
            # Already trained: the original author's note says this call makes
            # the classifier load its stored training values.
            self.classifier.train([], [], True)
        else:
            training = True

            print('Reading MNIST')
            training_set, training_labels = self.classifier.read_MNIST(60000, training)
            print('DONE!\n')

            print('Training')
            # Measure real elapsed time. Subtracting the minute and second
            # fields of two localtime() values independently (as was done
            # before) loses the borrow: a 20 second run from hh:mm:50 to
            # hh:mm+1:10 was reported as "1min 20sec".
            start = time.time()
            self.classifier.train(training_set, training_labels, False)
            elapsed = time.time() - start
            print('DONE IN ' + self._format_elapsed(elapsed) + '!\n')
        self.initialized = True

    def draw(self):
        """
            The draw method that gets called in the "Mainloop".
            Does nothing until init() has completed.
        """
        if not self.initialized:
            return

        self.screen.fill((0, 0, 0))
        # Draw every pixel of the matrix as a 10x10 square.
        for row in self.pixel:
            for pixel in row:
                pygame.draw.rect(self.screen, pixel[0], (pixel[1][0], pixel[1][1], 10, 10))
        pygame.draw.rect(self.screen, (255, 0, 0), (280, 0, 50, 50))        # predict "button"
        pygame.draw.rect(self.screen, (255, 0, 0), (280, 230, 50, 50))      # clear screen "button"
        pygame.draw.rect(self.screen, (255, 255, 255), (280, 0, 2, 280))    # border between picture and "buttons"

        # Prediction result and button captions.
        text = self.res.render('= ' + str(self.predicted), 1, (255, 255, 255))
        self.screen.blit(text, (285, 140))
        text = self.text.render('Clear', 1, (0, 0, 0))
        self.screen.blit(text, (285, 240))
        text = self.text.render('Screen', 1, (0, 0, 0))
        self.screen.blit(text, (285, 260))
        text = self.text.render('Predict', 1, (0, 0, 0))
        self.screen.blit(text, (285, 20))
        pygame.display.flip()

    def addtuples(self, x, y):
        """
            Helper method that adds two RGB tuples component-wise and clamps
            every component to the range 0..255.
        """
        return tuple(min(255, max(0, a + b)) for a, b in zip(x, y))

    def update(self):
        """
            The update function is called every time in the "Mainloop".
            While the left mouse button is held inside the grid it paints;
            a fresh click on one of the buttons clears the grid or runs the
            prediction.
        """
        self.mouse_state = pygame.mouse.get_pressed()
        if self.mouse_state[0] == 1:
            pos = pygame.mouse.get_pos()
            if pos[0] >= 0 and pos[1] >= 0 and pos[0] < 280 and pos[1] < 280:
                # Floor division keeps the grid indices integers regardless
                # of the interpreter's "/" semantics.
                x = pos[0] // 10 % 28
                y = pos[1] // 10 % 28
                self.pixel[y][x][0] = (255, 255, 255)
                # Slightly brighten the four direct neighbours that exist.
                for ny, nx in ((y - 1, x), (y + 1, x), (y, x - 1), (y, x + 1)):
                    if 0 <= ny <= 27 and 0 <= nx <= 27:
                        self.pixel[ny][nx][0] = self.addtuples(self.pixel[ny][nx][0], (5, 5, 5))

            # Buttons only react to the transition "not pressed" -> "pressed".
            elif pos[0] >= 285 and pos[0] <= 330 and self.prev_state[0] != self.mouse_state[0]:
                if pos[1] >= 230 and pos[1] <= 280:
                    # Clear button: reset the picture and the prediction.
                    for i in range(28):
                        for j in range(28):
                            self.pixel[i][j][0] = (10, 10, 10)
                    self.predicted = '?'
                elif pos[1] >= 0 and pos[1] <= 50:
                    # Predict button: flatten the red channel row by row.
                    image = []
                    for i in range(28):
                        for j in range(28):
                            image.append(self.pixel[i][j][0][0])
                    self.predicted = self.classifier.predict(image)

        self.prev_state = self.mouse_state

    def main(self):
        """
            "Mainloop": runs update() and draw() until the window is closed.
        """
        while 1:
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    return
            self.update()
            self.draw()
Ejemplo n.º 40
0
  def output(self, partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Odd part ids (1, 3, 5, 7) are development runs: the classifier is
    cross-validated on the training directory and the mean fold accuracy is
    returned as 'accuracy: <partId> <value>'.
    Even part ids (2, 4, 6, 8) are testing runs: the classifier is trained on
    the full training split, applied to the corpus built from ch_aux, and the
    guesses are returned one per line, preceded by the part id.
    Any other part id prints a message and returns None.
    """
    trainDir = '../data/imdb1/'

    # Model switch exercised by each part; None means the plain model.
    flagForPart = {
      1: None, 2: None,                                  # all words
      3: 'FILTER_STOP_WORDS', 4: 'FILTER_STOP_WORDS',    # without stopwords
      5: 'BOOLEAN_NB', 6: 'BOOLEAN_NB',                  # binarized
      7: 'BEST_MODEL', 8: 'BEST_MODEL',                  # best model
    }

    classifier = NaiveBayes()
    if partId not in flagForPart:
      # %s (not %d) so that reporting a non-numeric id cannot itself fail.
      print('Unknown partId: %s' % (partId,))
      return None
    flag = flagForPart[partId]

    if partId % 2 == 1:  # development
      accuracy = 0.0
      folds = 0
      for split in classifier.crossValidationSplits(trainDir):
        nb = NaiveBayes()
        if flag is not None:
          setattr(nb, flag, True)
        nb.train(split)
        guesses = nb.test(split)
        numCorrect = 0.0  # float so the division below is never truncated
        for i, guess in enumerate(guesses):
          if guess == split.test[i].klass:
            numCorrect += 1
        accuracy += numCorrect/len(guesses)
        folds += 1
      # Average over the folds actually seen instead of a hard-coded 10.
      if folds:
        accuracy = accuracy / folds
      return 'accuracy: %d %f' % (partId, accuracy)

    # testing
    if flag is not None:
      setattr(classifier, flag, True)
    trainSplit = classifier.trainSplit(trainDir)
    classifier.train(trainSplit)
    testSplit = buildTestCorpus(ch_aux)
    guesses = classifier.test(testSplit)
    guesses.insert(0, '%d' % partId) # put in the part id.
    return '\n'.join(guesses)
Ejemplo n.º 41
0
	def __init__(self, trainset, testset):
		"""Initialise the NaiveBayes base part and train immediately.

		Both data sets are forwarded to NaiveBayes.__init__; afterwards the
		model is trained on ``trainset.instances``, so the object is trained
		as soon as it is constructed.
		"""
		NaiveBayes.__init__(self, trainset, testset)
		self.train(trainset.instances)
Ejemplo n.º 42
0


importer = Importer()

print('Loading stop words')
importer.add_stop_words('data/stopwords/german/')

# Training sets, one entry per class.
print('Loading training data')
training_data = [
    importer.extract_training_data('data/politik/',    label='politik'),
    importer.extract_training_data('data/wirtschaft/', label='wirtschaft'),
    importer.extract_training_data('data/sport/',      label='sport'),
]

nb = NaiveBayes()
print('Training')
nb.train(training_data)

# Test sets, one entry per class.
print('Loading test data')
test_data = [
    importer.extract_test_data('data/politik/',    label='politik'),
    importer.extract_test_data('data/sport/',      label='sport'),
    importer.extract_test_data('data/wirtschaft/', label='wirtschaft'),
]

print('Testing')
accuracy = nb.test(test_data)
print('accuracy: ' + str(accuracy))

Ejemplo n.º 43
0
		tn = [(a,b) for a,b in accuracyResults if a == 0 and b ==0]

		precision = float(len(tp))/(len(tp) + len(fp))
		recall 	  = float(len(tp))/(len(tp) + len(fn))

		return 1./((a*(1/precision))+((1-a)*1/recall))

# Manual smoke test, executed only when this module is run as a script.
if __name__ == "__main__":
	from DataSet import DataSet
	from NaiveBayes import NaiveBayes
	from IBk import IBk

	# NOTE(review): hard-coded, machine-specific Windows path.
	fileIn = "C:\\Users\\a5rjqzz\\Desktop\\Python\\pyClassifiers\\data\\IBk\\sample_set_life.gla"

	ds = DataSet(fileIn)
	nb = NaiveBayes()
	es = Estimator()
	ib = IBk()

	# Request a train/test split and a cross-validation set 30 times. The
	# training and evaluation calls below are commented out, so the results
	# are currently unused.
	for i in xrange(30):#
		train, test = ds.getTrainTestSet()
		crossValida = ds.getCrossValidationSet(2)

		#nb.train(ds)
		#results = nb.test(test)

		#print es.accuracy(results)

		#ib.train(train)
		#results = ib.test(test)
		#print es.accuracy(results)
Ejemplo n.º 44
0
def main():
    """Train one Naive Bayes model per target, write the results and evaluate.

    Steps:
      1. Load the training features and targets; if the files are missing
         (IOError) a hint is printed and the program exits.
      2. Split the data into a training and a test part.
      3. Train a separate NaiveBayes model for each of the 15 targets and
         collect the predicted probabilities for the test part.
      4. Write the probabilities and the rounded 0/1 predictions as CSV rows
         (id, nine zero-filled columns s1..w4, then k1..k15).
      5. Print the per-tweet and per-prediction accuracy.
    """
    num_targets = 15
    header = ['id', 's1', 's2', 's3', 's4', 's5', 'w1', 'w2',
              'w3', 'w4', 'k1', 'k2', 'k3', 'k4', 'k5', 'k6',
              'k7', 'k8', 'k9', 'k10', 'k11', 'k12', 'k13',
              'k14', 'k15']

    try:
        print('load training features...')
        features, IDs = load_features(features_file_train, 1)
        print('load training targets...')
        all_targets = load_targets(targets_file_train)
    except IOError:
        print("The corresponding files have not been created yet.")
        print("Please run preprocessing with the same parameters and try again.")
        # NOTE(review): exits with status 0 although nothing was computed.
        raise SystemExit(0)

    print('split data...')
    features_train, all_targets_train, IDs_train, features_test, targets_test, IDs_test\
            = splitdata(features, all_targets, IDs)

    # run Naive Bayes for each target separately
    # (not 1 vs all because different targets are independent)
    all_targets_train = all_targets_train.T
    all_probabilities = []
    for i in range(num_targets):
        print('TARGET %d:' % (i))
        print('train...')
        nb = NaiveBayes()
        nb.train(features_train, all_targets_train[i])

        # PREDICTION:
        print('predict...')
        all_probabilities.append(nb.predict(features_test))

    print('write predictions to file...')
    all_predictions = []
    # The with-block closes both files even if writing a row fails.
    with open(predictions_filename, 'w') as predictions_file, \
            open(probabilities_filename, 'w') as prob_file:
        write_csv_row(predictions_file, header)
        write_csv_row(prob_file, header)

        # Transpose: one row per test item, one column per target.
        all_prob = (np.array(all_probabilities).T).tolist()
        for i in range(len(all_prob)):
            prob = all_prob[i]
            ID = IDs_test[i]
            # id followed by zeros for the nine s/w columns.
            zeros = [int(ID)] + [0] * 9
            write_csv_row(prob_file, zeros + prob)
            # make predictions from probabilities (either 0 or 1):
            pred = [round(p) for p in prob]
            write_csv_row(predictions_file, zeros + pred)
            all_predictions.append(pred)

    print('')
    print('EVALUATE PREDICTIONS')
    number_tweets = len(targets_test)
    if number_tweets == 0:
        # Nothing to evaluate; avoids dividing by zero below.
        print('no test tweets to evaluate.')
        print('finished.')
        return

    row_errors = 0
    total_errors = 0
    predictions_total = num_targets * number_tweets
    for i in range(number_tweets):
        targets_rounded = [round(p) for p in targets_test[i]]
        predictions = all_predictions[i]
        row_wrong = False
        for j in range(len(targets_rounded)):
            if targets_rounded[j] != predictions[j]:
                row_wrong = True
                total_errors += 1
        if row_wrong:
            row_errors += 1
    row_accuracy = (float(number_tweets - row_errors)/number_tweets) * 100
    total_accuracy = (float(predictions_total - total_errors))\
            / predictions_total * 100
    # The message used to be split with a backslash inside the string
    # literal, which embedded the next line's indentation in the output.
    print('%d/%d tweets contain an error in the predictions --> accuracy = %d percent'
          % (row_errors, number_tweets, row_accuracy))
    print('%d/%d predictions in total wrong --> accuracy %d percent.'
          % (total_errors, predictions_total, total_accuracy))

    print('finished.')
Ejemplo n.º 45
0
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    Part ids 1 and 3 are development runs: the classifier is cross-validated
    on the training directory and the mean fold accuracy is returned as
    "accuracy: <partId> <value>".
    Part ids 2 and 4 are testing runs: the classifier is trained on the full
    training split, applied to the corpus built from ch_aux, and the guesses
    are returned one per line, preceded by the part id.
    Any other part id prints a message and returns None.
    """
    trainDir = "../data/imdb1/"

    # Model switch exercised by each part; None means the plain model.
    flag_for_part = {
        1: None, 2: None,                                # all words
        3: "FILTER_STOP_WORDS", 4: "FILTER_STOP_WORDS",  # without stopwords
    }

    classifier = NaiveBayes()
    if partId not in flag_for_part:
        # %s (not %d) so that reporting a non-numeric id cannot itself fail.
        print("Unknown partId: %s" % (partId,))
        return None
    flag = flag_for_part[partId]

    if partId % 2 == 1:  # development
        accuracy = 0.0
        folds = 0
        for split in classifier.crossValidationSplits(trainDir):
            nb = NaiveBayes()
            if flag is not None:
                setattr(nb, flag, True)
            nb.train(split)
            guesses = nb.test(split)
            numCorrect = 0.0  # float so the division below is never truncated
            for i, guess in enumerate(guesses):
                if guess == split.test[i].klass:
                    numCorrect += 1
            accuracy += numCorrect / len(guesses)
            folds += 1
        # Average over the folds actually seen instead of a hard-coded 10.
        if folds:
            accuracy = accuracy / folds
        return "accuracy: %d %f" % (partId, accuracy)

    # testing
    if flag is not None:
        setattr(classifier, flag, True)
    trainSplit = classifier.trainSplit(trainDir)
    classifier.train(trainSplit)
    testSplit = buildTestCorpus(ch_aux)
    guesses = classifier.test(testSplit)
    guesses.insert(0, "%d" % partId)  # put in the part id.
    return "\n".join(guesses)