Ejemplo n.º 1
0
def test_BernouliNB2():
    """Fit BernoulliNB on eight uniquely-labelled 2-D points and print predictions.

    Builds a training matrix ``X`` whose eight rows carry the labels 1..8,
    fits ``BernoulliNB(alpha=1)`` on it and prints the predicted label for
    each row of a second matrix ``X2``. Nothing is returned; all results go
    to stdout.
    """
    X = np.array([
        [0, 1],
        [1, 1],
        [1, 0],
        [-1, 1],
        [1000, 1000],
        [1000, 10001],
        [998, 800],
        [990, 1100],
    ])
    print('X ' + str(X))
    Y = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    print('Y ' + str(Y))

    clf = BernoulliNB(alpha=1)
    clf.fit(X, Y)

    X2 = np.array([
        [1002, 1010],
        [1010, 910],
        [1003, 980],
        [1008, 1030],
        [-1, -1],
        [-3, -10],
        [40, 1],
        [1, -100],
    ])
    for i, sample in enumerate(X2):
        # Hand the estimator a (1, n_features) array: newer scikit-learn
        # releases reject a bare 1-D sample.
        pred_ret = clf.predict(sample.reshape(1, -1))
        # Show the sample that was actually classified (a row of X2); the
        # previous version printed the unrelated training row X[i].
        print('X[' + str(i) + '] = ' + str(sample) + ' pred_ret ' + str(pred_ret))
Ejemplo n.º 2
0
def train_model(data, target):
    """
    Train and evaluate a Bernoulli Naive Bayes classifier on a hold-out split.

    The data is split 60/40 into a training and a validation part, the
    classifier is fitted on the training part, its validation predictions are
    handed to ``evaluate_model`` and the fitted classifier is pickled to
    ``nb_classifier.pickle`` in the current working directory.

    Args:
        data: Feature matrix accepted by ``train_test_split``.
        target: Labels aligned with ``data``.

    Returns:
        The fitted ``BernoulliNB`` classifier.
    """
    # TO TRY: stratification for dividing preclassified tweets into homogenous
    # subgroups before sampling, to improve the representativeness of the split.
    (train_tweets, validation_tweets,
     train_sentiment, validation_sentiment) = cross_validation.train_test_split(
        data, target, test_size=0.4)

    # Fit the Naive Bayes classifier on the training tweets and their sentiment.
    classifier = BernoulliNB().fit(train_tweets, train_sentiment)

    # Score the predictions made for the held-out tweets.
    predicted = classifier.predict(validation_tweets)
    evaluate_model(validation_sentiment, predicted)

    # Persist the classifier; the context manager closes the file even when
    # pickling fails part-way through.
    with open('nb_classifier.pickle', 'wb') as pickle_file:
        pickle.dump(classifier, pickle_file)

    return classifier
Ejemplo n.º 3
0
def train(cutoffs):
    """Incrementally fit a BernoulliNB model on every training file and pickle it.

    The list of input paths comes from ``get_io_addr`` when ``__TRAIN_DATA``
    has three entries and from ``get_io_addr_random_sample`` otherwise. Each
    file is loaded with ``Sparse_Matrix_IO.load_sparse_csr``; the last column
    of the loaded matrix is used as the label and all preceding columns as
    features. The model is updated with ``partial_fit`` per file and finally
    pickled to ``__ROOT_MODEL``.

    Args:
        cutoffs: Sized collection forwarded to ``discard_vars``; when it is
            empty no features are discarded.
    """
    print("\n========== Start Training ==========")
    if len(__TRAIN_DATA) == 3:
        list_io_addr = get_io_addr(__TRAIN_DATA[0], __TRAIN_DATA[1], __TRAIN_DATA[2])
    else:
        list_io_addr = get_io_addr_random_sample(__TRAIN_DATA[0], __TRAIN_DATA[1])
    clf = BernoulliNB(fit_prior=True)

    for path_in in list_io_addr:
        print("\nGenerating training set from {}".format(path_in))
        with open(path_in, "r") as file_in:
            X = Sparse_Matrix_IO.load_sparse_csr(file_in)

        if len(cutoffs) > 0:
            print("Discarding selected features......")
            X = discard_vars(X, cutoffs)

        # Use the column count from ``shape``: ``len(X[0])`` raises for scipy
        # sparse matrices and yields 1 for ``numpy.matrix`` rows.
        n_cols = X.shape[1]
        X_train = X[:, 0:n_cols - 1]
        y_train = X[:, n_cols - 1]
        print("Done")

        print("Fitting Model......")
        # The full class list is passed on every call, as partial_fit needs it
        # on the first one.
        clf.partial_fit(X_train, y_train, classes=[0, 1])
        print("Done")

    # Pickle streams are binary data, so the file is opened in binary mode.
    with open(__ROOT_MODEL, "wb") as file_out:
        pickle.dump(clf, file_out)
Ejemplo n.º 4
0
def test_discretenb_predict_proba():
    """Check predict / predict_proba outputs of the discrete NB estimators."""

    # The 100s below make the Bernoulli and multinomial models disagree.
    X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]]
    X_multinomial = [[0, 1], [1, 3], [4, 0]]
    cases = [(BernoulliNB, X_bernoulli), (MultinomialNB, X_multinomial)]

    # First make sure the two estimators really differ on the last sample.
    labels = [0, 0, 1]
    bernoulli_model = BernoulliNB().fit(X_bernoulli, labels)
    multinomial_model = MultinomialNB().fit(X_bernoulli, labels)
    last_bernoulli = bernoulli_model.predict(X_bernoulli)[-1]
    last_multinomial = multinomial_model.predict(X_bernoulli)[-1]
    assert_not_equal(last_bernoulli, last_multinomial)

    # Binary problem (1-d output); label 2 is the regression check for the
    # binary case, 02e673.
    labels = [0, 0, 2]
    for estimator, samples in cases:
        model = estimator().fit(samples, labels)
        assert_equal(model.predict(samples[-1]), 2)
        assert_equal(model.predict_proba(samples[0]).shape, (1, 2))
        row_totals = model.predict_proba(samples[:2]).sum(axis=1)
        assert_array_almost_equal(row_totals, np.array([1., 1.]), 6)

    # Multiclass problem (2-d output whose rows must sum to one).
    labels = [0, 1, 2]
    for estimator, samples in cases:
        model = estimator().fit(samples, labels)
        assert_equal(model.predict_proba(samples[0]).shape, (1, 3))
        assert_equal(model.predict_proba(samples[:2]).shape, (2, 3))
        for probe in (samples[1], samples[-1]):
            assert_almost_equal(np.sum(model.predict_proba(probe)), 1)
        for log_prior in (model.class_log_prior_, model.intercept_):
            assert_almost_equal(np.sum(np.exp(log_prior)), 1)
Ejemplo n.º 5
0
def main(output_file=time.strftime('%h%d-%Hh%Mm')+'.csv', in_pkl=None):
    """Fit a Bernoulli Naive Bayes model and write a ranked list of item ids.

    Args:
        output_file: Path of the submission file to write. The default is a
            timestamp-based name; note that it is computed once, when the
            function is defined, not on every call.
        in_pkl: Path of a joblib file holding the 5-tuple
            ``(trainFeatures, trainTargets, trainItemIds, testFeatures,
            testItemIds)``.

    Returns:
        The string ``"input .plk required"`` when ``in_pkl`` is missing,
        otherwise ``None`` after the submission file has been written.
    """
    logging.info("Loading features...")
    if not in_pkl:
        return "input .plk required"
    (trainFeatures, trainTargets, trainItemIds,
     testFeatures, testItemIds) = joblib.load(in_pkl)
    logging.info("Loaded features, fitting model...")
    # Bernoulli Naive Bayes
    clf = BernoulliNB(alpha=1.0, binarize=None, fit_prior=True)
    clf.fit(trainFeatures, trainTargets)
    logging.info("Predicting...")
    # Rank by the log-probability of the second class rather than by the
    # hard class prediction.
    predicted_scores = clf.predict_log_proba(testFeatures).T[1]

    logging.info("Write results...")
    logging.info("Writing submission to %s" % output_file)
    # The context manager guarantees the file is closed even if writing fails.
    with open(output_file, "w") as f:
        f.write("id\n")
        # Highest score first; only the item id is written per output spec.
        for _score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
            f.write("%d\n" % (item_id))
    logging.info("Done.")
Ejemplo n.º 6
0
def tryBinomialNaiveBayes(goFast):
  best_score = 0

  from sklearn.datasets import dump_svmlight_file, load_svmlight_file
  if goFast:
    training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
  else:
    training_data, training_labels = load_svmlight_file("dt1.trn.svm")
    validation_data, validation_labels = load_svmlight_file("dt1.vld.svm")
    testing_data, testing_labels = load_svmlight_file("dt1.tst.svm")

  from sklearn.naive_bayes import BernoulliNB

  for alpha_value in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    for binarize_value in [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
      for fit_prior_value in [True, False]:
        binary_operator = BernoulliNB(alpha_value,binarize_value,fit_prior_value)
        binary_operator.fit(training_data,training_labels)
        current_score = binary_operator.score(validation_data,validation_labels)

        print "Current test: " + str(alpha_value), str(binarize_value), fit_prior_value
        print "Current score: " + str(current_score)

        if current_score > best_score:
          best_score = current_score
          print "***NEW MAXIMUM SCORE: " + str(best_score)
          print "***NEW MAXIMUM PARAMETERS: " + str(alpha_value), str(binarize_value), fit_prior_value

  print "Best score was " + str(best_score)
Ejemplo n.º 7
0
def compareClassifiers():
    """Score four classifiers on the observations from ``createObservations``.

    Returns:
        A list of ``(training_accuracy, mean_10_fold_cv_accuracy)`` tuples in
        the order: decision tree, Bernoulli naive Bayes, linear SVM, logistic
        regression.
    """
    observations, classes = createObservations()
    observations = np.array(observations)
    classes = np.array(classes)

    # Each factory is called right before its estimator is fitted, so the
    # estimators are created and trained one after another in this order.
    factories = (
        lambda: tree.DecisionTreeClassifier(),
        lambda: BernoulliNB(binarize=None),
        lambda: LinearSVC(),
        lambda: LogisticRegression(),
    )

    summary = []
    for build in factories:
        estimator = build()
        estimator.fit(observations, classes)
        train_accuracy = estimator.score(observations, classes)
        fold_scores = cross_validation.cross_val_score(
            estimator, observations, classes, scoring='accuracy', cv=10)
        summary.append((train_accuracy, np.mean(fold_scores)))
    return summary
Ejemplo n.º 8
0
def main():
    """Train a Bernoulli naive Bayes review classifier and write predictions.

    Reads ``train1.csv``, trains on the first two thirds of the reviews,
    reports accuracy and a classification report on the remaining third, then
    predicts the reviews in ``test2.csv`` and writes ``submission1.csv``.
    """
    # Load the labelled reviews, skipping rows whose rating is the literal
    # string 'rating' (presumably repeated header lines -- TODO confirm).
    frame = pd.read_csv('train1.csv')
    frame = frame[frame.rating != 'rating']
    corpus = list(frame.review)
    target = list(frame.rating)

    # First two thirds for training, the rest for testing.
    splitPoint = len(corpus) * 2 // 3
    trainingCorpus, testCorpus = corpus[:splitPoint], corpus[splitPoint:]
    trainingTarget = np.array(target[:splitPoint])
    testTarget = np.array(target[splitPoint:])

    # Train the algorithm
    train_X, vocabList = createVectorizer(trainingCorpus, 'None', True)
    model = BernoulliNB().fit(train_X, trainingTarget)

    # Test the algorithm
    test_X = createVectorizer(testCorpus, vocabList, True)
    test_predict = model.predict(test_X)
    print(np.mean(test_predict == testTarget))
    print(metrics.classification_report(testTarget, test_predict, target_names=['0', '1']))

    # Make predictions for the unlabelled reviews
    predict_df = pd.read_csv('test2.csv')
    predictCorpus = list(predict_df.review)
    member = list(predict_df.ID)
    predict_X = createVectorizer(predictCorpus, vocabList, True)
    predictions = model.predict(predict_X)

    # Reuse the frame as the submission table: rename its columns, then
    # store each prediction on the row(s) carrying the matching ID.
    predict_df.columns = ['ID', 'Predicted']
    for position, member_id in enumerate(member):
        predict_df.loc[predict_df['ID'] == member_id, 'Predicted'] = predictions[position]
    predict_df.to_csv('submission1.csv', sep=',', index=False)
Ejemplo n.º 9
0
def synergy_naive_bayes(data,target):
    """Evaluate BernoulliNB on champion picks extended with pairwise synergies.

    Every input row is encoded as: one flag per entry that equals -1, one flag
    per entry that equals 1, and then, for each pair ``j <= k`` of the first
    ``num_champ`` champions, the product of the two "-1" flags followed by the
    product of the two "1" flags. The encoded rows are shuffled, the first
    4/5 are used for training and the rest for the printed classification
    report.

    Args:
        data: Sequence of rows with entries in {-1, 0, 1}; rows are expected
            to have ``num_champ`` (124) entries. The sequence is not modified.
        target: Labels aligned with ``data``.
    """
    num_champ = 124

    # Build the encoded rows in a new list so the caller's ``data`` stays
    # untouched and can be passed to other functions afterwards.
    encoded = []
    for row in data:
        features = [1 if value == -1 else 0 for value in row]
        features.extend(1 if value == 1 else 0 for value in row)
        for j in range(num_champ):
            for k in range(j, num_champ):
                features.append(features[j] * features[k])
                features.append(features[j + num_champ] * features[k + num_champ])
        encoded.append(features)

    X = array(encoded)
    y = array(target)

    # Shuffle samples and labels with one shared permutation. The previous
    # code shuffled a zipped copy and then discarded it, so the split below
    # was never actually randomised.
    order = list(range(len(X)))
    shuffle(order)
    X = X[order]
    y = y[order]

    # Floor division keeps the split index an integer on Python 2 and 3.
    split = len(X) * 4 // 5
    gnb = BernoulliNB()
    y_pred = gnb.fit(X[:split], y[:split]).predict(X[split:])

    print (metrics.classification_report(y[split:], y_pred))
def combined_experiment(train_x,train_y,test_x,test_y,train_f_x,train_f_y,test_f_x,test_f_y, bias):
    """Combine two naive Bayes and two SVM classifiers by a biased vote.

    MultinomialNB and BernoulliNB are trained on ``train_x``; a linear and an
    RBF SVC are trained on ``train_f_x``. With ``bias == 'content'`` the
    MultinomialNB label wins when BernoulliNB or the linear SVC agrees with
    it, otherwise the BernoulliNB label is used. With ``bias == "syntax"`` the
    MultinomialNB label wins when either SVC agrees with it, otherwise the
    linear SVC label is used. Any other ``bias`` prints a hint and leaves the
    label list empty.

    Returns:
        The fraction of combined labels equal to ``test_y``.
    """
    nb_multinomial = MultinomialNB()
    nb_multinomial.fit(train_x, train_y)
    nb_bernoulli = BernoulliNB()
    nb_bernoulli.fit(train_x, train_y)
    svc_linear = svm.SVC(kernel='linear', cache_size=512)
    svc_linear.fit(train_f_x, train_f_y)
    svc_rbf = svm.SVC(kernel='rbf', cache_size=512)
    svc_rbf.fit(train_f_x, train_f_y)

    p1 = nb_multinomial.predict(test_x)
    p2 = nb_bernoulli.predict(test_x)
    p3 = svc_linear.predict(test_f_x)
    p4 = svc_rbf.predict(test_f_x)

    # Final labels produced by the vote; stays empty for an unknown bias.
    labels = []
    if bias == 'content':
        labels = [
            p1[i] if (p1[i] == p2[i] or p1[i] == p3[i]) else p2[i]
            for i in range(len(p1))
        ]
    elif bias == "syntax":
        labels = [
            p1[i] if (p1[i] == p3[i] or p1[i] == p4[i]) else p3[i]
            for i in range(len(p1))
        ]
    else:
        print('Please enter a valid bias ("syntax" or "content")!')

    p_combined = np.array(labels)
    n_correct = np.sum(p_combined == test_y)
    return n_correct / np.float_(len(test_y))
class NaiveBayesClassifierBernoulli:
    """
    Wrapper around scikit-learn's BernoulliNB for classifying sentences.

    On construction the training matrix is loaded from an svmlight file, the
    dictionary is unpickled, and a BernoulliNB model is fitted. Sentences are
    converted to feature rows by ``Parser.MatrixParserForLearning``.
    """
    def __init__(self, matrixFileName = matrixFilePath, dicFileName = dictFilePath):
        """Load the training data and dictionary, then fit the model.

        matrixFileName -- svmlight file holding the training matrix and labels.
        dicFileName    -- pickle file holding the dictionary handed to the
                          matrix parser.
        """
        self.X,self.Y = load_svmlight_file(matrixFileName)
        # Close the dictionary file deterministically instead of leaking the
        # handle. NOTE: pickle.load must only be used on trusted files.
        with open(dicFileName, "rb") as dictionary_file:
            self.dictionary = pickle.load(dictionary_file)
        self.bernoulliNB = BernoulliNB()
        self.bernoulliNB.fit(self.X, self.Y)
        self.matrixParser = Parser.MatrixParserForLearning()

    def classifyOneSentence(self, string):
        """Return the model's prediction for ``string``.

        Returns None when the parser yields no row for the sentence.
        """
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        # Identity test: ``row != None`` would compare element-wise when the
        # row is an array or sparse matrix and has no single truth value.
        if row is None:
            return None
        return self.bernoulliNB.predict(row)

    def classifyOneSentenceWithProbability(self,string):
        """Return P(second class) - P(first class) for ``string``.

        Returns None when the parser yields no row for the sentence.
        """
        row = self.matrixParser.getRowForClassify(string, self.dictionary)
        if row is None:
            return None
        a = self.bernoulliNB.predict_proba(row)
        return a[0][1] - a[0][0]
Ejemplo n.º 12
0
def naive_bayes(data, target):
    """Evaluate BernoulliNB on a binary encoding of the input rows.

    Every row is encoded as one flag per entry that equals -1 followed by one
    flag per entry that equals 1. The encoded rows are shuffled, the first
    4/5 are used for training and the rest for the printed classification
    report.

    Args:
        data: Sequence of rows with entries in {-1, 0, 1}. The sequence is
            not modified.
        target: Labels aligned with ``data``.
    """
    # Build the encoded rows in a new list so the caller's ``data`` stays
    # untouched and can be passed to other functions afterwards.
    encoded = []
    for row in data:
        flags = [1 if value == -1 else 0 for value in row]
        flags.extend(1 if value == 1 else 0 for value in row)
        encoded.append(flags)

    X = array(encoded)
    y = array(target)

    # Shuffle samples and labels with one shared permutation. The previous
    # code shuffled a zipped copy and then discarded it, so the split below
    # was never actually randomised.
    order = list(range(len(X)))
    shuffle(order)
    X = X[order]
    y = y[order]

    # Floor division keeps the split index an integer on Python 2 and 3.
    split = len(X) * 4 // 5
    gnb = BernoulliNB()
    y_pred = gnb.fit(X[:split], y[:split]).predict(X[split:])

    print (metrics.classification_report(y[split:], y_pred))
Ejemplo n.º 13
0
def bernoulli_classify():
    clf = BernoulliNB()
    traindata = []
    traintarget = []
    for f in glob.glob("../../../res/articles/training_data/*-articles.json"):
        target = f.replace("-articles.json", "")
        target = re.sub(r".*\/+", "", target)
        output = readWholeFileBernoulli(f, target)
        traindata.extend(output[0])
        traintarget.extend(output[1])

    testdata = []
    testtarget = []
    for f in glob.glob("../../../res/articles/test_data/*-articles.json"):
        target = f.replace("-articles.json", "")
        target = re.sub(r".*\/+", "", target)
        output = readWholeFileBernoulli(f, target)
        testdata.extend(output[0])
        testtarget.extend(output[1])

    clf.fit(traindata, traintarget)
    ncorrect = 0
    total = len(testdata)
    for i in range(len(testdata)):
        predict = clf.predict(testdata[i])
        correct = testtarget[i]
        if correct == predict[0]:
            ncorrect += 1

        print ("Correct: {0} - Predicted: {1}".format(correct, predict[0]))

    print "Correct ", ncorrect, " Total ", total, " Correctness ", ncorrect * 1.0 / total
Ejemplo n.º 14
0
def NB_train_classifier(train_x, train_y):
    """ Fit a default BernoulliNB on (train_x, train_y) and return the model
    """
    model = BernoulliNB()
    model.fit(train_x, train_y)

    return model
def MungeData(train, test):
    """Drop fixed columns and replace object columns by naive Bayes scores.

    A hard-coded list of columns is dropped from both frames in place. Then,
    for every column from the third onward whose dtype is ``object``, the
    column is expanded with ``Binarize``, a BernoulliNB model is fitted on the
    expanded training columns against ``train.target``, and the column is
    overwritten in both frames with the model's second-class probability; the
    expanded columns are dropped again. Finally the feature columns are cast
    to float and missing values are filled with -1.

    Returns:
        The ``(train, test)`` pair of processed frames.
    """
    todrop = ['v22', 'v112', 'v125', 'v74', 'v1', 'v110', 'v47']
    print(todrop)

    for frame in (train, test):
        frame.drop(todrop, axis=1, inplace=True)

    for col in train.columns[2:]:
        if train[col].dtype != 'object':
            continue
        print(col)
        train, binfeatures = Binarize(col, train)
        test, _ = Binarize(col, test, binfeatures)
        # Names of the expanded columns belonging to ``col``.
        expanded = col + '_' + binfeatures
        nb = BernoulliNB()
        nb.fit(train[expanded].values, train.target.values)
        train[col] = nb.predict_proba(train[expanded].values)[:, 1]
        test[col] = nb.predict_proba(test[expanded].values)[:, 1]
        train.drop(expanded, inplace=True, axis=1)
        test.drop(expanded, inplace=True, axis=1)

    features = train.columns[2:]
    for frame in (train, test):
        frame[features] = frame[features].astype(float)
    for frame in (train, test):
        frame.fillna(-1, inplace=True)
    return train, test
Ejemplo n.º 16
0
def bnb_fit(train_data, train_lbl_data):
    """Fit a default BernoulliNB on the given data and return the model."""
    from sklearn.naive_bayes import BernoulliNB
    print("Starts bnb")

    model = BernoulliNB()
    model.fit(train_data, train_lbl_data)
    return model
Ejemplo n.º 17
0
def naive_bayes(df,column):
    """Train BernoulliNB on labelled reviews and classify the unlabelled ones.

    Rows of ``df`` whose ``class`` is 'positive' or 'negative' are vectorised
    (1- to 3-grams, English stop words removed, TF-IDF weighted), split 60/40,
    and used to fit and evaluate a BernoulliNB classifier. The classification
    report, accuracy and the most informative features are printed. Rows of
    ``df`` whose ``class`` is the empty string are then classified and the
    number of predicted negatives and positives is printed.

    Args:
        df: DataFrame with a ``class`` column and the text column ``column``.
        column: Name of the column holding the review text.
    """
    reviews_pn = df[df['class'].isin(['positive','negative'])]
    comments = list(reviews_pn[column].values)
    classes = list(reviews_pn['class'].values)

    # Build the term-frequency matrix, then weight it with TF-IDF. The fitted
    # transformer is kept so the unlabelled reviews get the same weighting.
    stop = stopwords.words('english')
    count_vectorizer = CountVectorizer(stop_words = stop, ngram_range=(1,3))
    comments1 = count_vectorizer.fit_transform(comments)
    tfidf_transformer = TfidfTransformer(use_idf=True)
    tfidf_comments = tfidf_transformer.fit_transform(comments1)

    # Split validation: 60% training, 40% test.
    data_train,data_test,target_train,target_test = cross_validation.train_test_split(tfidf_comments,classes,test_size=0.4,random_state=43)
    classifier = BernoulliNB().fit(data_train,target_train)
    predicted = classifier.predict(data_test)

    print(classification_report(target_test,predicted))
    print("The accuracy score is {:.2%}".format(accuracy_score(target_test,predicted)))

    most_informative_feature_for_binary_classification(count_vectorizer,classifier,n=20)

    # Predict on the unlabelled reviews. They are taken from the ``df``
    # argument (the previous code read a module-level ``reviews_df`` and
    # ignored the frame it was given).
    reviews_nc = df[df['class'] == '']
    comments_nc = list(reviews_nc[column].values)
    comments_nc1 = count_vectorizer.transform(comments_nc)
    # Reuse the IDF weights learnt above instead of re-fitting on new data.
    tfidf_comments_nc = tfidf_transformer.transform(comments_nc1)
    new_predicted = classifier.predict(tfidf_comments_nc)

    print("negative = %s" %sum(new_predicted == 'negative'))
    print("positive = %s" %sum(new_predicted == 'positive'))
 def doclassify(self, type='normal'):
     if type == 'normal':
         clf = BernoulliNB()
         clf.fit(self.train_x, self.train_y)
         BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
         score = clf.score(self.train_x, self.train_y)
         print 'score = ', score
Ejemplo n.º 19
0
def BernoulliNB_1(train_predictors,test_predictors,train_target,test_target):
    """Fit BernoulliNB, print its test accuracy and return it.

    Returns:
        ``(accuracy, predicted)`` where ``predicted`` holds the labels
        predicted for ``test_predictors``.
    """
    model = BernoulliNB()
    model.fit(train_predictors, train_target)
    predicted = model.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    print("Accuracy for Bernoulli Naive Bayes: "+str(accuracy))
    return accuracy, predicted
def BNB(data_train, data_train_vectors, data_test_vectors, **kwargs):
    """Classify the test vectors with BernoulliNB(alpha=.01).

    The model is fitted on ``data_train_vectors`` with ``data_train.target``
    as labels; extra keyword arguments are accepted but not used.

    Returns:
        The labels predicted for ``data_test_vectors``.
    """
    model = BernoulliNB(alpha=.01)
    model.fit(data_train_vectors, data_train.target)
    return model.predict(data_test_vectors)
Ejemplo n.º 21
0
def test_BernouliNB4():
    """Fit BernoulliNB on ten binary 2-D samples and print the prediction for [1, 1].

    Nothing is returned; the training data, labels and the prediction are
    printed to stdout.
    """
    X = np.array([
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [0, 0],
        [0, 0],
        [1, 0],
    ])
    print('X ' + str(X))
    Y = np.array([1, 1, 0, 1, 0, 0, 0, 1, 1, 0])
    print('Y ' + str(Y))

    clf = BernoulliNB(alpha=1)
    clf.fit(X, Y)

    X2 = np.array([
        [1, 1],
    ])
    for i, sample in enumerate(X2):
        # Hand the estimator a (1, n_features) array: newer scikit-learn
        # releases reject a bare 1-D sample.
        pred_ret = clf.predict(sample.reshape(1, -1))
        print('X[' + str(i) + '] = ' + str(sample) + ' pred_ret ' + str(pred_ret))
Ejemplo n.º 22
0
    def render_content(self):
        """Train BernoulliNB on the selected text column and render the result.

        Returns a plain message when no text source is selected. Otherwise the
        text column is vectorised with word counts, a BernoulliNB classifier
        is fitted against the code column, and an HTML snippet with the
        training accuracy (two decimals) and the confusion matrix is returned.
        """
        if self.text_source is None:
            return "No text source selected."

        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.naive_bayes import BernoulliNB
        from sklearn import metrics

        self.dm("creating vectorizer")
        stop_words = self.get_user_list(self.stop_list)
        counter = CountVectorizer(stop_words=stop_words, max_features=self.vocab_size)
        documents = self.get_column_data(self.text_source)

        self.dm("using vectorizer")
        features = counter.fit_transform(documents)
        labels = self.get_column_data(self.code_source)

        self.dm("creating classifier")
        model = BernoulliNB()
        model.fit(features, labels)
        accuracy = model.score(features, labels)

        self.dm("predicting")
        confusion = metrics.confusion_matrix(labels, model.predict(features))

        self.dm("displaying result")
        return "accuracy is " + str(round(accuracy, 2)) + '<pre>' + str(confusion) + '</pre>'
Ejemplo n.º 23
0
    def generatePredictingModel(data):
        """
            Build the model used to predict the category of a new video from
            its title and description.

            Two BernoulliNB models are trained on word counts of the title and
            of the description (80% of ``data``); a decision tree is then
            trained on their two predictions to produce the final category.

            data -- DataFrame with 'title', 'description' and
                    'video_category_id' columns.

            Returns (tree, title model, description model, title vectorizer,
            description vectorizer).
            Raises VideoAnalysisException when any step of the build fails.
        """
        try:
            # Timer used to report how long the build takes.
            start = time.time()

            # Split into train-test data set
            X = data[[x for x in data.columns if x in ('title', 'description')]]
            # The trailing comma matters: ('video_category_id') is just a
            # string, so ``in`` would be a substring test and would also pick
            # up columns named e.g. 'id' or 'video'.
            Y = data[[x for x in data.columns if x in ('video_category_id',)]]
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.80, random_state = 10)

            # Build the 2 text corpora
            corpus_title = X_train['title'].values.tolist()
            corpus_description = X_train['description'].values.tolist()

            # Initialise the 2 vectorizers and learn their vocabularies.
            count_vectorizer_title = CountVectorizer()
            count_vectorizer_description = CountVectorizer()
            count_vectorizer_title.fit(corpus_title)
            count_vectorizer_description.fit(corpus_description)

            # Build the sparse matrices
            X_train_count_title = count_vectorizer_title.transform(X_train['title'])
            X_train_count_description = count_vectorizer_description.transform(X_train['description'])
            X_test_count_title = count_vectorizer_title.transform(X_test['title'])
            X_test_count_description = count_vectorizer_description.transform(X_test['description'])

            # Set and train the models (for title and description features)
            model_count_title = BernoulliNB()
            model_count_description = BernoulliNB()
            model_count_title.fit(X_train_count_title, Y_train['video_category_id'])
            model_count_description.fit(X_train_count_description, Y_train['video_category_id'])

            # Stack the two predictions and train the final model on them.
            new_df_train = pd.DataFrame()
            new_df_train['title_prediction'] = model_count_title.predict(X_train_count_title)
            new_df_train['description_prediction'] = model_count_description.predict(X_train_count_description)
            # The test-set frame is built the same way but is not used further
            # in this function.
            new_df_test = pd.DataFrame()
            new_df_test['title_prediction'] = model_count_title.predict(X_test_count_title)
            new_df_test['description_prediction'] = model_count_description.predict(X_test_count_description)
            tree = DecisionTreeClassifier()
            tree.fit(new_df_train, Y_train)

            end = time.time()
            execution_time = end - start

            print("Time to build this incredibly amazing model, only : {} seconds!!!!!!".format(execution_time))
            time.sleep(3)

            return tree, model_count_title, model_count_description,count_vectorizer_title,count_vectorizer_description

        except Exception:
            # Catch ordinary errors only, so KeyboardInterrupt / SystemExit
            # still propagate instead of being reported as a model failure.
            raise VideoAnalysisException(" Error while creation of predictive model ")
def learn_model(data, target):
    """Fit BernoulliNB on an 80/20 split and evaluate it on the held-out part.

    The held-out labels and the predictions are passed to ``evaluate_model``;
    nothing is returned.
    """
    # Fixed seed so the 80% / 20% split is reproducible.
    split = cross_validation.train_test_split(
        data, target, test_size=0.2, random_state=43
    )
    data_train, data_test, target_train, target_test = split

    fitted = BernoulliNB().fit(data_train, target_train)
    held_out_predictions = fitted.predict(data_test)
    evaluate_model(target_test, held_out_predictions)
Ejemplo n.º 25
0
def score(train_X, train_y):
    """Return the log loss of BernoulliNB on a 1% hold-out of the data.

    The data is split with a fixed seed; the model is fitted on the larger
    part and its class probabilities on the hold-out are scored with
    ``log_loss``.
    """
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        train_X, train_y, test_size=0.01, random_state=10)

    model = BernoulliNB(alpha=0.7, binarize=False, fit_prior=True)
    model.fit(fit_X, fit_y)
    holdout_proba = model.predict_proba(holdout_X)
    return log_loss(holdout_y, holdout_proba)
Ejemplo n.º 26
0
	def testBoGNB(self):
		'''
		Test on sentiment analysis task using Naive Bayes classifier
		with Bag-of-Word feature vectors.

		Builds a vocabulary from the training and test sentences, encodes
		every sentence as a binary word-presence vector, fits BernoulliNB on
		the training vectors and prints training/test accuracy together with
		the positive/negative label counts of both sets.
		'''
		def tokenize(sentence):
			# Whitespace tokens longer than two characters, lower-cased.
			return [word.lower() for word in sentence.split() if len(word) > 2]

		def fill_features(sentences, matrix):
			# Set matrix[i, j] to 1.0 for every vocabulary word j in sentence i.
			for i, sent in enumerate(sentences):
				# A real list is needed here: a lazy map object is not a
				# valid fancy index.
				indices = [word2index[word] for word in tokenize(sent)]
				matrix[i, indices] = 1.0

		# Preprocessing of original txt data set
		wordlist = []
		for sent in self.senti_train_txt:
			wordlist.extend(tokenize(sent))
		for sent in self.senti_test_txt:
			wordlist.extend(tokenize(sent))
		word_dict = set(wordlist)
		word2index = dict(zip(word_dict, range(len(word_dict))))
		# Build BoG feature
		train_size = len(self.senti_train_txt)
		test_size = len(self.senti_test_txt)
		pprint('Training set size: %d' % train_size)
		pprint('Test set size: %d' % test_size)
		# Builtin float replaces the np.float alias, which newer NumPy
		# releases no longer provide.
		train_feat = np.zeros((train_size, len(word_dict)), dtype=float)
		test_feat = np.zeros((test_size, len(word_dict)), dtype=float)
		# Using binary feature
		start_time = time.time()
		fill_features(self.senti_train_txt, train_feat)
		fill_features(self.senti_test_txt, test_feat)
		end_time = time.time()
		pprint('Finished building training and test feature matrix, time used: %f seconds.' % (end_time-start_time))
		pprint('Classification using Bernoulli Naive Bayes classifier: ')
		clf = BernoulliNB()
		clf.fit(train_feat, self.senti_train_label)
		train_pred_label = clf.predict(train_feat)
		train_acc = np.sum(train_pred_label == self.senti_train_label) / float(train_size)
		pprint('Training accuracy = %f' % train_acc)
		pred_label = clf.predict(test_feat)
		acc = np.sum(pred_label == self.senti_test_label) / float(test_size)
		pprint('Accuracy: %f' % acc)
		train_pos_count = np.sum(self.senti_train_label == 1)
		train_neg_count = np.sum(self.senti_train_label == 0)
		test_pos_count = np.sum(self.senti_test_label == 1)
		test_neg_count = np.sum(self.senti_test_label == 0)
		pprint('Positive count in training set: %d' % train_pos_count)
		pprint('Negative count in training set: %d' % train_neg_count)
		pprint('Ratio: pos/neg = %f' % (float(train_pos_count) / train_neg_count))
		pprint('Positive count in test set: %d' % test_pos_count)
		pprint('Negative count in test set: %d' % test_neg_count)
		pprint('Ratio: pos/neg = %f' % (float(test_pos_count) / test_neg_count))
def learnBModel(ip,label,tst,tst_label):
    """Fit BernoulliNB on TF-IDF features of ``ip.data`` and evaluate on ``tst.data``.

    The test labels and the predictions are passed to ``evaluate_model``;
    nothing is returned.
    """
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    train_matrix = vectorizer.fit_transform(ip.data)
    test_matrix = vectorizer.transform(tst.data)

    # Second weighting pass without IDF, applied to each matrix separately.
    train_features = TfidfTransformer(use_idf=False).fit_transform(train_matrix)
    test_features = TfidfTransformer(use_idf=False).fit_transform(test_matrix)

    model = BernoulliNB().fit(train_features, label)
    evaluate_model(tst_label, model.predict(test_features))
Ejemplo n.º 28
0
def naive_bayesB_classifier(X_train, categories, X_test, test_categories):
    """Fit BernoulliNB (alpha=0.1), print its test metrics and export them.

    Prints the classification report and the accuracy for ``X_test`` and then
    passes the true and predicted labels to ``to_latex``.
    """
    from sklearn.naive_bayes import BernoulliNB
    model = BernoulliNB(alpha=0.1).fit(X_train, categories)
    predicted = model.predict(X_test)

    print("\n Here is the classification report for Naive Bayes classifier:")
    print(metrics.classification_report(test_categories, predicted))
    print("Accuracy score:")
    print(metrics.accuracy_score(test_categories, predicted))
    to_latex(test_categories, predicted)
Ejemplo n.º 29
0
def BernoulliNB_pred(X_train, X_test, y_train):
    """Return BernoulliNB second-class probabilities for test and train data.

    Returns:
        ``(test_probabilities, train_probabilities)``, each being column 1 of
        ``predict_proba`` for the respective data set.
    """
    model = BernoulliNB()
    model.fit(X_train, y_train)

    train_proba = model.predict_proba(X_train)
    test_proba = model.predict_proba(X_test)

    return test_proba[:, 1], train_proba[:, 1]
Ejemplo n.º 30
0
 def nb_classifier(self, secret):
     """Fit BernoulliNB on the selected features of ``secret``.

     Returns ``(classifier, feature_selector, evaluation)`` where the
     evaluation compares the predictions on the training data itself with
     the true labels via ``self.evaluate``.
     """
     model = BernoulliNB()
     raw_features = self.raw_attr_vector(secret)
     labels = self.get_labels(secret)
     selector = self.feature_sel(secret)
     selected = selector.transform(raw_features)
     model.fit(selected, labels)
     predicted = model.predict(selected)
     return model, selector, self.evaluate(predicted, labels)
Ejemplo n.º 31
0
    # TF-IDF processing: fit the vectorizer on the training documents and
    # reuse the fitted vocabulary for the test documents.
    vectorizer = TfidfVectorizer(input='content',
                                 stop_words='english',
                                 max_df=0.5,
                                 sublinear_tf=True)
    x_train = vectorizer.fit_transform(data_train.data)
    x_test = vectorizer.transform(data_test.data)
    print('训练集样本个数:%d,特征个数:%d' % x_train.shape)
    print('停止词:\n', end=' ')

    #pprint(vectorizer.get_stop_words())
    feature_names = np.asarray(vectorizer.get_feature_names())

    # Compare the classifiers: run make_test on each and collect the results.
    clfs = (MultinomialNB(), BernoulliNB())
    result = []
    for clf in clfs:
        r = make_test(clf)
        result.append(r)
        print('\n')

    # Transpose the collected results into one sequence per measurement;
    # presumably make_test returns (train time, test time, error, name) --
    # TODO confirm against make_test.
    result = np.array(result)
    time_train, time_test, err, names = result.T
    time_train = time_train.astype(np.float)
    time_test = time_test.astype(np.float)
    err = err.astype(np.float)
    x = np.arange(len(time_train))
    # Plot setup: select the 'simHei' font and a white 10x7 figure.
    mpl.rcParams['font.sans-serif'] = ['simHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 7), facecolor='w')
Ejemplo n.º 32
0
## Datasets with a validation set: the first 1,120,000 rows of full_df are
## used for training and rows 1,120,000-1,599,999 for validation.
## (full_df, full_data, X_train1, X_test and test_data_pos_neg are defined
## outside this section.)
X_train2 = full_df[:1120000, :]
X_valid = full_df[1120000:1600000, :]

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Fit the encoder on all training sentiments, then encode each label subset
# with the same mapping.
y_train1 = le.fit_transform(full_data['Sentiment'])
y_train2 = le.transform(full_data['Sentiment'][:1120000])
y_valid = le.transform(full_data['Sentiment'][1120000:])
y_test = le.transform(test_data_pos_neg['Sentiment'])

###### Try Bernoulli Naive Bayes model without word stemming ######
from sklearn.naive_bayes import BernoulliNB
## Convert the word frequency matrix into a binary matrix: work on a copy and
## set every positive entry to 1.
X_train1_bin = X_train1.copy()
X_train1_bin[X_train1_bin > 0] = 1

clf_ber_bayes = BernoulliNB()
clf_ber_bayes.fit(X_train1_bin, y_train1)

# Training accuracy. The value is a bare expression, so it is only visible
# when this is run interactively.
# NOTE(review): arguments are (predictions, labels) here but (labels,
# predictions) for the test set below -- confirm this is intentional.
train_preds = clf_ber_bayes.predict(X_train1_bin)
accuracy_score(train_preds, y_train1)
# Convert the test matrix to binary the same way as the training matrix.
X_test_bin = X_test.copy()
X_test_bin[X_test_bin > 0] = 1

test_preds = clf_ber_bayes.predict(X_test_bin)
accuracy_score(y_test, test_preds)  ##84.12 % accuracy_score
Ejemplo n.º 33
0
def modelTraining(X_train, X_test, y_train, y_test, f):
    """Fit a fixed panel of classifiers and report each one's test accuracy.

    Every classifier is built, fitted on ``(X_train, y_train)`` and scored on
    ``(X_test, y_test)`` independently: a failure in one of them is logged,
    noted in the report, and does not stop the remaining ones.

    Args:
        X_train: Training features, passed straight to each ``fit``.
        X_test: Test features, passed straight to each ``predict``.
        y_train: Training labels.
        y_test: Test labels used for ``metrics.accuracy_score``.
        f: Report sink; only its ``writelines`` method is used.

    Returns:
        dict mapping a human-readable classifier name to its accuracy in
        percent. Classifiers that raised are absent from the dict.
    """
    # Each entry: (key in the returned dict, name used in the accuracy line,
    # name used in the failure line, zero-argument factory).  The factories
    # are lambdas so that construction happens inside the ``try`` below and a
    # constructor failure is reported like any other failure.
    candidates = (
        ("Linear Support Vector Classifier",
         "Linear Support Vector Classifier",
         "LSVC",
         lambda: LinearSVC()),
        ("KNN Classifier",
         "KNN Classifier",
         "KNN",
         lambda: KNeighborsClassifier()),
        ("Decision Tree Classifier - GINI",
         "Decision Tree Classifier - GINI",
         "DTC GINI",
         lambda: DecisionTreeClassifier(criterion="gini", random_state=0)),
        ("Decision Tree Classifier - ENTROPY",
         "Decision Tree Classifier - ENTROPY",
         "DTC ENTROPY",
         lambda: DecisionTreeClassifier(criterion="entropy", random_state=0)),
        ("Multinomial Naive Bayes",
         "Multinomial NB",
         "Multinomial NB",
         lambda: MultinomialNB()),
        ("Bernoulli Naive Bayes",
         "Bernoulli NB",
         "Bernoulli NB",
         lambda: BernoulliNB()),
        ("Gaussian Naive Bayes",
         "GaussianNB",
         "GaussianNB",
         lambda: GaussianNB()),
        ("AdaBoost Classifier",
         "AdaBoost Classifier",
         "AdaBoost Classifier",
         lambda: AdaBoostClassifier(n_estimators=200, learning_rate=1)),
        ("Random Forest Classifier",
         "Random Forest Classifier",
         "Random Forest Classifier",
         lambda: RandomForestClassifier(n_estimators=100)),
    )

    models = {}
    for key, report_name, failure_name, build in candidates:
        try:
            y_pred = build().fit(X_train, y_train).predict(X_test)
            model_accr = metrics.accuracy_score(y_test, y_pred) * 100
            models[key] = model_accr
            f.writelines("\n            Accuracy of " + report_name +
                         " is " + str(model_accr))
        except Exception:
            # Best-effort by design: keep going with the other classifiers.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate; exc_info keeps the traceback in the log.
            logging.info(failure_name + " is throwing exception",
                         exc_info=True)
            f.writelines("\n            " + failure_name +
                         " is throwing exception")

    return models
Ejemplo n.º 34
0
def classification_voting(X,y, nome):
    """Cross-validate a soft-voting ensemble of a random forest and BernoulliNB.

    ``nome`` is appended to the "Voting Model " label that is handed to
    ``classification_model_cv`` together with the data and the ensemble.
    """
    forest = RandomForestClassifier(
        n_estimators=30,
        max_depth=None,
        min_samples_split=2,
        random_state=0,
    )
    naive_bayes = BernoulliNB()
    members = [('rf', forest), ('bnb', naive_bayes)]
    ensemble = VotingClassifier(estimators=members, voting='soft')
    label = "Voting Model " + nome
    classification_model_cv(X, y, ensemble, label)
Ejemplo n.º 35
0
# Shared 5-fold stratified splitter for every model stacked below.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Drop the large intermediates, then force a collection to release memory.
del train
del test
del user
del tf_csr
del tfidf_csr
gc.collect()
# -------------------------------------------------------------------------------------------------

separator = '-' * 100
print(separator)
print(f'Gender prediction with {TARGET_FEAT}\n')

# Base learners, keyed by the short name used while iterating.
models = {
    'lr': LogisticRegression(random_state=seed, C=5, solver='sag'),
    'svm': LinearSVC(random_state=seed, C=0.5),
    'pac': PassiveAggressiveClassifier(random_state=seed, C=0.05),
    'ridge': RidgeClassifier(random_state=seed, alpha=5),
    'sgd': SGDClassifier(random_state=seed, penalty='l1', loss='log', alpha=1e-6),
    'bnb': BernoulliNB(alpha=0.1),
    'mnb': MultinomialNB(alpha=0.1),
}

# Target label for this run.
y_train = label_gender

# Empty frames that will hold the stacked features.
train_feat_gender = pd.DataFrame()
test_feat_gender = pd.DataFrame()

for name, model in models.items():
    # Time each model's k-fold stacking pass.
    timer.start()
    stack_train, stack_test = kfold_stack_binary(
        kfold, model, x_train, y_train, x_test)
    timer.stop()
Ejemplo n.º 36
0
###############
# Persist the already-trained ``classifier``.  Each pickle is written inside a
# ``with`` block so the handle is closed even if ``pickle.dump`` raises.
with open("originalnaivebayes5k.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

# Multinomial naive Bayes wrapped for NLTK-style feature sets.
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:",
      (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

with open("MNB_classifier5k.pickle", "wb") as save_classifier:
    pickle.dump(MNB_classifier, save_classifier)

# Bernoulli naive Bayes.
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:",
      (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)

with open("BernoulliNB_classifier5k.pickle", "wb") as save_classifier:
    pickle.dump(BernoulliNB_classifier, save_classifier)

# Logistic regression.
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:",
      (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) *
      100)

# NOTE(review): the excerpt ends right after this open(); the matching
# dump/close is not visible here, so the handle is left exactly as it was.
save_classifier = open("LogisticRegression_classifier5k.pickle", "wb")
Ejemplo n.º 37
0
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier

# NOTE: the class column in the data file must be labeled 'target'.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
target = tpot_data['target'].values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, target, random_state=42)

# Score on the training set was:0.8702380952380953
# The pipeline is assembled from three named stages: two stacking stages
# feeding a final gradient-boosted classifier.
nb_stage = StackingEstimator(
    estimator=BernoulliNB(alpha=100.0, fit_prior=True))
tree_stage = StackingEstimator(
    estimator=DecisionTreeClassifier(criterion="gini",
                                     max_depth=7,
                                     min_samples_leaf=8,
                                     min_samples_split=20))
booster = XGBClassifier(learning_rate=0.1,
                        max_depth=5,
                        min_child_weight=4,
                        n_estimators=100,
                        nthread=1,
                        subsample=0.7000000000000001)
exported_pipeline = make_pipeline(nb_stage, tree_stage, booster)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Ejemplo n.º 38
0
def classify(X, y, clf_type='nbc'):
    """
    Evaluate one text classifier with nested cross-validation.

    A ``CountVectorizer`` -> classifier pipeline is tuned by
    ``RandomizedSearchCV`` (``N_CV`` iterations, ``INNER`` inner folds,
    macro-F1 scoring) inside every fold of an outer shuffled
    ``StratifiedKFold`` with ``FOLDS`` splits. The search space comes from
    ``SETTINGS_NB`` / ``SETTINGS_SVC`` / ``SETTINGS_LR``.

    Args:
        X: Documents. Must support integer-array indexing (it is sliced with
            the fold index arrays).
        y: Labels, indexable the same way as ``X``.
        clf_type: ``'nbc'`` (BernoulliNB), ``'svc'`` (LinearSVC) or
            ``'lrc'`` (LogisticRegression).

    Returns:
        dict with three entries:
        ``'test'``  -- per-fold lists ``loss``, ``accuracy``, ``confusion``
        plus ``errors`` (up to five misclassified documents per fold);
        ``'train'`` -- per-fold lists ``loss``, ``accuracy``, ``confusion``;
        ``'cv'``    -- best value of every searched hyperparameter. It is
        overwritten on each fold, so it reflects the last fold only.

    Raises:
        ValueError: if ``clf_type`` is not one of the three supported codes.
    """

    if clf_type == 'nbc':
        clf = BernoulliNB()
        params = SETTINGS_NB
    elif clf_type == 'svc':
        clf = LinearSVC()
        params = SETTINGS_SVC
    elif clf_type == 'lrc':
        clf = LogisticRegression()
        params = SETTINGS_LR
    else:
        # Literal braces have to be doubled in a str.format template; the
        # single-brace form made format() look up a field named
        # "nbc, svc, lrc" and raise KeyError instead of this message.
        raise ValueError(
            'invalid clf {}: {{nbc, svc, lrc}}'.format(clf_type))

    # pipeline runs preprocessing and model during every CV loop
    pipe = Pipeline([
        ('pre', CountVectorizer()),
        ('clf', clf),
    ])

    model = RandomizedSearchCV(
        pipe, params, n_jobs=-1, n_iter=N_CV, cv=INNER, scoring='f1_macro'
    )

    results = {
        'test':  {'loss': [], 'accuracy': [], 'confusion': [], 'errors': []},
        'train': {'loss': [], 'accuracy': [], 'confusion': []},
        'cv': {}
    }

    kf = StratifiedKFold(n_splits=FOLDS, shuffle=True)

    for i, (train_idx, test_idx) in enumerate(kf.split(X, y)):
        print("[{}] {}/{}".format(clf_type, i+1, FOLDS))

        # split training and test sets
        X_train = X[train_idx]
        X_test = X[test_idx]
        y_train = y[train_idx]
        y_test = y[test_idx]

        # fit model
        model.fit(X_train, y_train)

        # save the best parameters from the inner-fold cross validation
        best_params = model.best_estimator_.get_params()
        for p in sorted(params.keys()):
            results['cv'][p] = best_params[p]

        # make predictions on train and test set
        y_test_pred = model.predict(X_test)
        y_train_pred = model.predict(X_train)

        # record some misclassified sentences (random sample of at most 5)
        idx_errors = np.where(y_test_pred != y_test)[0]
        np.random.shuffle(idx_errors)
        errors = X_test[idx_errors[:5]]
        results['test']['errors'].extend(errors)

        # store results
        # NOTE(review): log_loss is fed hard label predictions here, not
        # class probabilities -- confirm that this is the intended "loss".
        results['test']['loss'].append(log_loss(y_test, y_test_pred))
        results['test']['accuracy'].append(accuracy_score(y_test, y_test_pred))
        results['test']['confusion'].append(confusion_matrix(y_test, y_test_pred))
        results['train']['loss'].append(log_loss(y_train, y_train_pred))
        results['train']['accuracy'].append(accuracy_score(y_train, y_train_pred))
        results['train']['confusion'].append(confusion_matrix(y_train, y_train_pred))

    return results
Ejemplo n.º 39
0
from scorer_semeval18 import main as eval

# Load the pre-tokenized tweets; the handle is closed as soon as the pickle
# has been read.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# it on data produced by this project.
with open(TOK_TWEETS_PATH, 'rb') as tweets_file:
    tokenized_tweets = pickle.load(tweets_file)
print('loaded tweets')

data_matrix = construct_data_matrix(tokenized_tweets)
print('constructed data matrix')
print('Dim:', data_matrix.shape)
print('Density:', np.count_nonzero(data_matrix) / np.size(data_matrix))

# One label per line.
with open(CLEAN_LABELS_PATH) as labels_file:
    labels = np.asarray(labels_file.read().splitlines())
data_train, data_test, labels_train, labels_test = split_data(
    data_matrix, labels)
print('split data')

# ``eval`` below is the scorer imported from scorer_semeval18, not the
# builtin.
bern = BernoulliNB()
bern.fit(data_train, labels_train)
print("\nbern", bern.score(data_test, labels_test))
eval(labels_test, bern.predict(data_test))

# MultinomialNB is trained on the training matrix shifted by |min(train)|.
# The shift is computed once and applied to the test matrix as well, so that
# fitting, scoring and the official evaluation all see the same feature
# space.  (Previously the score used a shift derived from the test data and
# the evaluation used no shift at all.)
multi = MultinomialNB()
train_shift = abs(np.min(data_train))
multi.fit(data_train + train_shift, labels_train)
data_test_shifted = data_test + train_shift
print("\nmulti", multi.score(data_test_shifted, labels_test))
eval(labels_test, multi.predict(data_test_shifted))

tree = DecisionTreeClassifier(max_depth=10)
tree.fit(data_train, labels_train)
print("\ntree", tree.score(data_test, labels_test))
eval(labels_test, tree.predict(data_test))

clf = RandomForestClassifier(max_depth=3)
Ejemplo n.º 40
0
# [5] Results as percentages

from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings

# Fit every candidate with all warnings silenced for the duration.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    results = []
    # (estimator, display name) pairs to compare.  LogisticRegression(C=12)
    # and RandomForestClassifier(max_depth=2, random_state=0) were tried by
    # the original author and are currently disabled.
    candidates = [
        (BernoulliNB(alpha=0.4), 'Native Bayes'),
        (LinearSVC(C=9), 'SVC'),
        (DecisionTreeClassifier(max_depth=26), 'DecisionTreeClassifier'),
        (KNeighborsClassifier(n_neighbors=13), 'KNN'),
    ]
    for clf, name in candidates:
        clf.fit(X_train, Y_train)

        # Accuracy on the data the model was just fitted on.
        predictions = clf.predict(X_train)
        training_accuracy = accuracy_score(predictions, Y_train)
Ejemplo n.º 41
0
            votes.append(v)
        return str(mode(votes)[0])

    def confidence(self, features):
        """Return the share of member classifiers that back the winning vote.

        Every classifier in ``self._classifiers`` predicts on ``features``;
        the result is the count of the most common prediction divided by the
        number of votes cast.
        """
        votes = [member.predict(features) for member in self._classifiers]
        # Index 1 of mode()'s result is read as the winning vote's count
        # (presumably scipy.stats.mode -- confirm against the file's imports).
        winning_count = int(mode(votes)[1])
        return winning_count / len(votes)
    #def test_accuracy(self, x2,x3,x4,x5,x6, x7):
    #    average = mean([x2,x3,x4,x5,x6, x7])
    #    return average

# Bernoulli naive Bayes on the TF-IDF features.
BNB = BernoulliNB()
BNB.fit(tfidf_train, y_train)
pred = BNB.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
# x2 holds the same accuracy under the name used for the ensemble inputs;
# it is reused rather than recomputed from identical arguments.
x2 = score
print("BernoulliNB Naive Bayes Accuracy:   %0.3f" % score)
#cm = metrics.confusion_matrix(y_test, pred, labels=[0,1])
#plot_confusion_matrix(cm, classes=[0, 1])

# ``with`` closes the pickle file even if dumping the model raises.
with open("Pickled/BernoulliNB.pickle", "wb") as save_classifier:
    pickle.dump(BNB, save_classifier)

# Logistic regression on the same features.
LR = LogisticRegression()
LR.fit(tfidf_train, y_train)
pred = LR.predict(tfidf_test)
Ejemplo n.º 42
0
def main():
    """Train the text-classification pipeline and report test-set results.

    Categories are read from the command line
    (e.g. ``python3 test.py Comedy Drama Documentary Horror``). The documents
    returned by ``read_files`` are enriched with high-information words and
    the wpm / dpm / word-distribution / doc2vec features, split 80/20, fed
    through a weighted ``FeatureUnion`` pipeline, and evaluated: accuracy and
    a classification report are printed and a confusion-matrix heatmap is
    saved as ``<ClassifierName>_confusion_matrix_test.png``.
    """
    show_plots = False  # set to True to show plots, False to not show plots

    # read categories from arguments
    categories = sys.argv[1:]

    X, y, files_used = read_files(categories)

    try:
        high_info_words = high_information_words(X, y)
        # Keep only the high-information words of every bag.
        X_high_info = [
            [word for word in bag if word in high_info_words] for bag in X
        ]
    except ZeroDivisionError:
        print("Not enough information too get high information words, please try again with more files.", file=sys.stderr)
        X_high_info = X

    X_wpm = wpm(files_used, categories, show_plots)
    X_dpm = dpm(files_used, categories, show_plots)
    X_wd = word_distribution(files_used, categories)

    doc2vec_model = Doc2Vec.load("d2v_150.model")
    #doc2vec_model = Doc2Vec.load("d2v_400.model")

    # The vectors are looked up rather than inferred because the same
    # documents were used (as tagged documents) to train the doc2vec model.
    X_d2v = [doc2vec_model.docvecs[str(i)] for i in range(len(X))]
    #X_d2v = [doc2vec_model.infer_vector(to_list(str(i))) for i in X]

    # One tuple per document: (text, high-info text, wpm, dpm, wd, d2v).
    X = [
        (str(text), str(text_high), wpm_value, dpm_value, wd_value, d2v_value)
        for text, text_high, wpm_value, dpm_value, wd_value, d2v_value
        in zip(X, X_high_info, X_wpm, X_dpm, X_wd, X_d2v)
    ]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

    # NOTE(review): decision_function_shape=None may be rejected by recent
    # scikit-learn releases -- confirm against the version this runs on.
    clfs = [
        SVC(C=10, cache_size=500, class_weight=None, coef0=0.0, #parameters found using grid_search.py
        decision_function_shape=None, degree=3, gamma=0.0001, kernel='linear',
        max_iter=100000, probability=False, random_state=None, shrinking=True,
        tol=0.001, verbose=False),
        MultinomialNB(alpha=1.0),
        BernoulliNB(),
    ]

    pipeline = Pipeline([
        # Extract the features
        ('features', FeaturesExtractor()),

        # Use FeatureUnion to combine the individual feature pipelines
        ('union', FeatureUnion(
            transformer_list=[
                #Pipeline bag-of-words model
                ('text', Pipeline([
                    ('selector', ItemSelector(key='text')),
                    ('tfidf', TfidfVectorizer(sublinear_tf=True, binary=True, norm='l2', ngram_range=(1,3))),
                    #('chi-square', SelectKBest(chi2, 300)),
                ])),

                #Pipeline for high info words bag-of-words model
                ('text_high', Pipeline([
                    ('selector', ItemSelector(key='text_high')),
                    ('tfidf', TfidfVectorizer(sublinear_tf=True, norm='l2')),
                ])),

                #Pipeline for wpm feature
                ('wpm', Pipeline([
                    ('selector', ItemSelector(key='wpm')),
                    ('scaler', MinMaxScaler()),
                ])),

                #Pipeline for dpm feature
                ('dpm', Pipeline([
                    ('selector', ItemSelector(key='dpm')),
                    ('scaler', MinMaxScaler()),
                ])),

                #Pipeline for wd feature
                ('wd', Pipeline([
                    ('selector', ItemSelector(key='wd')),
                    ('scaler', MinMaxScaler()),
                ])),

                #Pipeline for d2v feature
                ('d2v', Pipeline([
                    ('selector', ItemSelector(key='d2v')),
                    ('scaler', MinMaxScaler()),
                ])),

                #Pipeline for POS tag features
                # ('pos', Pipeline([
                #     ('selector', ItemSelector(key='pos')),
                #     ('words', TfidfVectorizer(sublinear_tf=True, binary=True, norm='l2', ngram_range=(1,3)))
                # ])),

            ],

            # weight components in FeatureUnion
            transformer_weights={
                'text': 0.2,
                'text_high' : 1,
                'wpm': 0,
                'dpm': 0.2,
                'wd': 0,
                'd2v': 0,
                #'pos': 0,
            },
        )),

        # Use a classifier on the combined features
        ('classifier', clfs[0]),
    ])

    train(pipeline, X_train, y_train, categories, show_plots)

    final_pred = pipeline.predict(X_test)
    print("\nScores on test set:\n")
    print(metrics.accuracy_score(y_test, final_pred))
    print(metrics.classification_report(y_test, final_pred, digits=3))

    confusion_m = metrics.confusion_matrix(y_test, final_pred, labels=categories)
    # Class name of the fitted classifier, used for the title and file name.
    classifier_name = str(pipeline.named_steps['classifier']).split("(")[0]
    plt.figure(figsize = (16, 9), dpi=150)
    sn.set(font_scale=1.4) #label size
    hm = sn.heatmap(confusion_m, annot=True, fmt='g', annot_kws={"size": 16}) #font size
    hm.set(xticklabels = categories, yticklabels = categories)
    plt.title(classifier_name + ' Confusion Matrix')
    if show_plots:
        plt.show()
    # The figure size is already fixed by plt.figure above; savefig has no
    # ``figsize`` parameter and current Matplotlib raises TypeError for it.
    hm.figure.savefig(classifier_name + '_confusion_matrix_test' + '.png', dpi=150)
    plt.close()
Ejemplo n.º 43
0
#Naive Bayes Model
from sklearn.model_selection import train_test_split
# Hold out 20 % of the labelled questions for validation.
quora_train, cv = train_test_split(quora_train, test_size=0.2)
x_train = quora_train.drop(['target'], axis=1)
y_train = quora_train['target']
x_cv = cv.drop(['target'], axis=1)
y_cv = cv['target']

from sklearn.feature_extraction.text import TfidfVectorizer
# The vocabulary is learned on the training questions only and then applied
# to the validation and test questions.
tf_idf_vect = TfidfVectorizer()
reviews_tfidf = tf_idf_vect.fit_transform(x_train['question_text'].values)
reviews_tfidf1 = tf_idf_vect.transform(x_cv['question_text'].values)
reviews_tfidf2 = tf_idf_vect.transform(quora_test['question_text'].values)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
nb = BernoulliNB()
param_grid = {
    'alpha': [1000, 100, 10, 1, 0.1, 0.01, 0.001]
}  #params we need to try on classifier
gsv = GridSearchCV(nb, param_grid, cv=2, verbose=1, n_jobs=-1, scoring='f1')
gsv.fit(reviews_tfidf, y_train)
# NOTE(review): the search result is not read; alpha is hard-coded below.
# Confirm whether gsv.best_params_ should be used instead.
nb = BernoulliNB(alpha=0.1)
nb.fit(reviews_tfidf, y_train)
train_pred = nb.predict(reviews_tfidf)
cv_pred = nb.predict(reviews_tfidf1)

test_pred = nb.predict(reviews_tfidf2)
# scikit-learn metrics take (y_true, y_pred/y_score).  The order matters for
# roc_auc_score, which is not symmetric in its two arguments.
print("Train Set Accuracy: {}".format(accuracy_score(y_train, train_pred)))
print("Train Set ROC: {}".format(roc_auc_score(y_train, train_pred)))
print("Train Set F1 Score: {}\n".format(f1_score(y_train, train_pred)))
print("Validation Set Accuracy: {}".format(accuracy_score(y_cv, cv_pred)))
# NOTE: this excerpt uses Python 2 print statements.
print Matr.shape
# Drop the first row of the feature matrix before training.
Matr=Matr[1:]
print len(Yval)

# Class priors built from the constants a, b and the spamc / legitc counts
# (both counts are defined outside this excerpt).
a=1000
b=100000
prior1=(a+spamc-1)*1.0/(a+b+spamc+legitc-2)
# NOTE(review): prior2 uses `a` where `b` might be expected, so prior1 and
# prior2 do not sum to 1 in general -- confirm the intended formula.
prior2=(a+legitc-1)*1.0/(a+b+spamc+legitc-2)
#   y=beta.pdf(x, a, b)
from sklearn.metrics import precision_recall_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import train_test_split
# 80/20 split with a fixed seed.
a_train, a_test, b_train, b_test = train_test_split(Matr, Yval, test_size=0.2, random_state=42)
# Two naive Bayes variants, each with explicitly supplied class priors.
clf = MultinomialNB(class_prior=[1,2])
clf2= BernoulliNB(class_prior=[prior1,prior2])
clf.fit(a_train, b_train)
clf2.fit(a_train, b_train)
# Ax: MultinomialNB predictions (only referenced by the commented-out score
# below); Bx: BernoulliNB predictions.
Ax=clf.predict(a_test)
Bx=clf2.predict(a_test)
from sklearn.metrics import f1_score

#print f1_score(b_test, Ax, average='macro')
print f1_score(b_test, Bx, average='macro')

import matplotlib.pyplot as plt

# NOTE(review): the curve is computed from the hard predictions Bx rather
# than from probability scores -- confirm that this is intended.
precision, recall, _ = precision_recall_curve(b_test, Bx)

# Step plot of the curve with the area underneath shaded.
plt.step(recall, precision, color='b', alpha=0.2,where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
Ejemplo n.º 45
0
def _precision_recall(predicted, actual):
    """Return ``(precision, recall)`` of *predicted* scored against *actual*.

    The counts come from the project's TruePositive / FalsePositive /
    FalseNegative helpers. A zero denominator raises ZeroDivisionError.
    """
    tp = TruePositive(predicted, actual)
    fp = FalsePositive(predicted, actual)
    fn = FalseNegative(predicted, actual)
    return tp / (tp + fp), tp / (tp + fn)


def main():
    """Compare six classifiers on every chat found under ``./chats_process``.

    For each sub-directory, ``train.csv`` is loaded and split 50/50. The
    hand-written naive Bayes, an SVM, KNN, GaussianNB, BernoulliNB and a
    random forest are trained on one half and scored on the other. Accuracies
    are summed per algorithm in ``results`` (in ``LABELS`` order).

    After the loop, the precision/recall pairs of the LAST processed chat are
    appended to ``remove_one5.txt``, one ``"precision , recall"`` line per
    algorithm in the order NB, SVM, GaussianNB, BernoulliNB, random forest,
    KNN. Nothing is written when no chat directory was found.

    Compared with the earlier version, the unused ``predict_proba`` calls and
    true-negative counts are no longer computed.
    """
    # Order of the per-algorithm accuracy totals kept in ``results``.
    LABELS = [
        'simple_nb', 'svm', 'KNN', 'gausian_nb', 'bernoulli', 'random_forest'
    ]
    plt.title("Accuracy of different algorithm on different user chat")
    plt.xlabel("Algorithms used")
    plt.ylabel("Accuracy")
    path = './chats_process'

    count = 0
    results = [0] * len(LABELS)
    precision_recall = {}
    for filename in os.listdir(path):
        count += 1

        train_csv = path + '/' + filename + '/train.csv'
        splitRatio = .5
        dataset = loadCsv(train_csv)
        raw_train, raw_test = splitDataset(dataset, splitRatio)

        trainingSet = convert_float(raw_train)
        testSet = convert_float(raw_test)

        # Hand-written naive Bayes.
        summaries = summarizeByClass(trainingSet)
        predictions = getPredictions(summaries, testSet)
        results[0] += getAccuracy1(testSet, predictions)

        # Feature matrices and label vectors for the scikit-learn models,
        # derived from the raw (unconverted) splits.
        train_set = convert_float(raw_train)
        labels_train = get_labels(raw_train)
        test_set = convert_float(raw_test)
        labels_test = get_labels(raw_test)

        # The hand-written model is scored against the test rows themselves,
        # not against the extracted label vector.
        precision_recall['nb'] = _precision_recall(predictions, testSet)

        # (slot in ``results``, report tag, estimator)
        estimators = (
            (1, 'svm', svm.SVC(probability=True)),
            (2, 'knn', KNeighborsClassifier(n_neighbors=3)),
            (3, 'gnb', GaussianNB()),
            (4, 'bnb', BernoulliNB()),
            (5, 'rf', RandomForestClassifier(n_estimators=10)),
        )
        for slot, tag, clf in estimators:
            clf.fit(train_set, labels_train)
            predicted = clf.predict(test_set)
            results[slot] += getAccuracy(predicted, labels_test)
            precision_recall[tag] = _precision_recall(predicted, labels_test)

    if count == 0:
        # No chat directories: there is nothing to report, and the summary
        # values would otherwise be undefined.
        return

    # ``with`` guarantees the report is flushed and closed.
    with open('remove_one5.txt', 'a') as report:
        for tag in ('nb', 'svm', 'gnb', 'bnb', 'rf', 'knn'):
            precision, recall = precision_recall[tag]
            report.write(str(precision) + " , " + str(recall) + '\n')
# Persist the mean K-fold validation score.
# NOTE(review): alternative_Kfold_mean is only assigned further down in this
# excerpt; presumably an earlier assignment exists above it -- confirm.
with open('Kfold_acc.pickle', 'wb') as f:
    pickle.dump(alternative_Kfold_mean, f)

#support vector machine
from sklearn.svm import LinearSVC
SVM = LinearSVC(random_state=123)
SVM.fit(X_train,y_train)

#decision tree
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier(random_state=123)
DT.fit(X_train,y_train)

#naive bayes
from sklearn.naive_bayes import BernoulliNB
NB = BernoulliNB()
NB.fit(X_train,y_train)

# Test-set predictions of every fitted model.  Log_Reg is defined outside
# this excerpt.
y_pred_log = Log_Reg.predict(X_test)
y_pred_svm = SVM.predict(X_test)
y_pred_DT = DT.predict(X_test)
y_pred_NB = NB.predict(X_test)

###    validation score    ##
## 10-fold cross validation ##
from sklearn.model_selection import cross_val_score
# NOTE(review): this scores Log_reg_fitted while the predictions above use
# Log_Reg; both names come from outside this excerpt -- confirm they refer to
# the intended estimators.
cross_val = (cross_val_score(Log_reg_fitted, X_train, y_train, cv=10))
alternative_Kfold_mean = np.mean(cross_val)
print('Average validation score Log Reg: ',alternative_Kfold_mean,'\n', 'Validation score per fold: ','\n',cross_val)

### rest of the classifiers' K-fold validation scores ###
Ejemplo n.º 47
0
    def train(self, with_trees, with_print):
        """Fit every classifier, log its test metrics and pickle the model.

        For each classifier the method prints a banner, fits on the training
        split, scores the model through ``Kappa`` on the test split, appends
        the resulting record (plus ``duration`` and ``time_stamp``) to
        ``self.output_log`` and pickles the fitted model into the
        ``classifiers/words_as_features`` directory below the current working
        directory (Windows path separators are hard-coded).

        The SVC variants are fitted and scored on ``self.X_prep_train`` /
        ``self.X_prep_test``; every other model uses ``self.X_train`` /
        ``self.X_test``.  Finally the log is annotated with the split sizes
        from ``self.sizes_df``, reordered, combined with any existing
        ``desc_weighted_confs.csv`` and written back to that file.

        Args:
            with_trees: When truthy, every tree of the random forest is
                exported as a ``.dot`` file and rendered to ``.png`` under
                ``<cwd>/trees``.
            with_print: Not used by this method.
        """
        rule = '------------------------------------------------------------------------'
        model_dir = "\\classifiers\\words_as_features\\"

        def save_classifier(classifier, file_name):
            """Pickle ``classifier`` into the model directory as ``file_name``."""
            # The ``with`` block closes the file; no explicit close() needed.
            with open(getcwd() + model_dir + file_name, "wb") as classifier_f:
                pickle.dump(classifier, classifier_f)

        def log_result(classifier, X_test, started, kernel=None):
            """Score ``classifier`` and append the record to the output log."""
            output = Kappa(classifier, X_test=X_test,
                           y_test=self.y_test).output
            if kernel is not None:
                output['Kernel'] = kernel
            # Duration covers everything since ``started``, scoring included.
            output['duration'] = round(time.time() - started, 3)
            output['time_stamp'] = datetime.datetime.now().strftime(
                "%Y_%m_%d_%H:%M:%S")
            # NOTE: DataFrame.append only exists in pandas < 2.0.
            self.output_log = self.output_log.append(output)

        def fit_log_save(classifier, started, file_name, prepped=False,
                         kernel=None):
            """Fit, score, log and pickle one classifier.

            ``prepped`` selects the ``X_prep_*`` matrices used by the SVCs.
            """
            if prepped:
                X_train, X_test = self.X_prep_train, self.X_prep_test
            else:
                X_train, X_test = self.X_train, self.X_test
            classifier.fit(X=X_train, y=self.y_train)
            log_result(classifier, X_test, started, kernel)
            save_classifier(classifier, file_name)

        # Logistic Regression
        print(rule + '\n', 'Logistic Regression:')
        start_clf_time = time.time()
        LogisticRegression_classifier = LogisticRegression(fit_intercept=True)
        fit_log_save(LogisticRegression_classifier, start_clf_time,
                     "LogisticRegression.pickle")

        print(rule + '\n', 'Naive Bayes:')
        start_clf_time = time.time()
        Naivebayes_classifier = GaussianNB()
        fit_log_save(Naivebayes_classifier, start_clf_time,
                     "Naivebayes_classifier.pickle")

        print(rule + '\n', 'Multinomial Naive Bayes:')
        start_clf_time = time.time()
        MNB_classifier = MultinomialNB()
        fit_log_save(MNB_classifier, start_clf_time, "MNB_classifier.pickle")

        print(rule + '\n', 'Bernoulli Naive Bayes:')
        start_clf_time = time.time()
        BernoulliNB_classifier = BernoulliNB()
        fit_log_save(BernoulliNB_classifier, start_clf_time,
                     "BernoulliNB_classifier.pickle")

        # ---------------------------- SVM kernels ---------------------------
        print(rule + '\n', 'C-Support Vector Machine:')
        print('======================\n', 'Linear Kernel')
        start_clf_time = time.time()
        SVC_lin_classifier = SVC(kernel='linear')
        fit_log_save(SVC_lin_classifier, start_clf_time, "SVC_lin.pickle",
                     prepped=True, kernel='linear')

        print('======================\n', 'Polynomial Kernel')
        start_clf_time = time.time()
        SVC_poly_classifier = SVC(kernel='poly', C=1, gamma=1)
        fit_log_save(SVC_poly_classifier, start_clf_time, "SVC_poly.pickle",
                     prepped=True, kernel='poly')

        # Also default kernel
        print('======================\n', 'Radial Basis Function Kernel')
        start_clf_time = time.time()
        SVC_classifier = SVC(kernel='rbf', gamma=0.1, C=1.38)
        fit_log_save(SVC_classifier, start_clf_time, "SVC_rbf.pickle",
                     prepped=True, kernel='rbf')

        print('======================\n', 'Sigmoid Kernel')
        start_clf_time = time.time()
        SVC_sig_classifier = SVC(kernel='sigmoid', gamma=10)
        fit_log_save(SVC_sig_classifier, start_clf_time, "SVC_sigmoid.pickle",
                     prepped=True, kernel='sigmoid')
        # --------------------------------------------------------------------

        print(rule + '\n', 'Stochastic Gradient Descent:')
        start_clf_time = time.time()
        SGD_classifier = SGDClassifier()
        fit_log_save(SGD_classifier, start_clf_time, "SGD_classifier.pickle")

        print(rule + '\n', 'Multi-layer Perceptron:')
        start_clf_time = time.time()
        MLP_Classifier = MLPClassifier(alpha=1)
        # Pickles the MLP itself (the SGD model used to be dumped here).
        fit_log_save(MLP_Classifier, start_clf_time, "MLP_Classifier.pickle")

        # Apart from training the forest classifier, both .dot and .png files
        # can be created with a visual representation of the trees.
        print(rule + '\n', 'Random Forest:')
        start_clf_time = time.time()
        RandomForest_Classifier = RandomForestClassifier(n_jobs=-1,
                                                         n_estimators=25,
                                                         warm_start=True,
                                                         max_features=7)
        RandomForest_Classifier.fit(X=self.X_train, y=self.y_train)

        if with_trees:
            # Export trees; this time is part of the logged duration.
            for i_tree, tree_in_forest in enumerate(
                    RandomForest_Classifier.estimators_):
                tree_dot_str = getcwd() + '/trees/tree_' + str(i_tree) + '.dot'
                with open(tree_dot_str, 'w') as tree_dot_file:
                    tree.export_graphviz(tree_in_forest,
                                         out_file=tree_dot_file)

                (graph, ) = pydot.graph_from_dot_file(tree_dot_str)
                graph.write_png(tree_dot_str.replace('.dot', '.png'))

        log_result(RandomForest_Classifier, self.X_test, start_clf_time)
        save_classifier(RandomForest_Classifier,
                        "RandomForest_Classifier.pickle")

        print(rule + '\n', 'Adaptive Boosting:')
        start_clf_time = time.time()
        AdaBoost_Classifier = AdaBoostClassifier()
        fit_log_save(AdaBoost_Classifier, start_clf_time,
                     "AdaBoost_Classifier.pickle")

        print(rule + '\n', 'Voted Classifier:')
        start_clf_time = time.time()
        # NOTE(review): the SVC members were fitted on X_prep_train while the
        # ensemble is scored on X_test - confirm VoteClassifier handles this.
        voted_classifier = VoteClassifier(
            Naivebayes_classifier,
            # SVR_classifier,
            MLP_Classifier,
            RandomForest_Classifier,
            # QDA_Classifier,
            AdaBoost_Classifier,
            SVC_lin_classifier,
            # SVC_poly_classifier,
            SVC_sig_classifier,
            SVC_classifier,
            SGD_classifier,
            MNB_classifier,
            BernoulliNB_classifier,
            LogisticRegression_classifier)

        # The ensemble is pickled before it is scored.
        save_classifier(voted_classifier, "voted_classifier.pickle")
        log_result(voted_classifier, self.X_test, start_clf_time)

        print(rule)

        self.output_log['Train_News'] = self.sizes_df.loc['Training']['News']
        self.output_log['Train_Spam'] = self.sizes_df.loc['Training'][
            'Not-News']
        self.output_log['Test_News'] = self.sizes_df.loc['Testing']['News']
        self.output_log['Test_Spam'] = self.sizes_df.loc['Testing']['Not-News']
        self.output_log['feature_cnt'] = None

        self.output_log['type'] = 'descriptive_features'

        # Reorder output log
        self.output_log = self.output_log[[
            # ID
            'time_stamp',
            'Name',
            'Kernel',
            'feature_cnt',
            'type',
            # Sizes
            'Train_News',
            'Train_Spam',
            'Test_News',
            'Test_Spam',
            'True_News',
            'True_Spam',
            'False_News',
            'False_Spam',

            # Measures
            'Accuracy',
            'Kappa',
            'rauc',
            'duration',
            'News_TPR',
            'News_FPR',
            'News_Prec',
            'News_Recall',
            'News_F1',
            'Spam_TPR',
            'Spam_FPR',
            'Spam_Prec',
            'Spam_Recall',
            'Spam_F1',
        ]]

        # Saving results to file: new rows first, then any previous content.
        results_csv = getcwd() + model_dir + "desc_weighted_confs.csv"
        if os.path.isfile(results_csv):
            previous = pd.DataFrame()
            retry = 5
            while retry > 0:
                try:
                    # Documented replacement for the removed
                    # DataFrame.from_csv (same index/date defaults).
                    previous = pd.read_csv(results_csv,
                                           sep=";",
                                           index_col=0,
                                           parse_dates=True)
                except Exception:
                    # Best effort: wait and retry (the file may be in use).
                    retry -= 1
                    time.sleep(60)
                    print('Error reading file.', retry,
                          'attempts remainig ...')
                    continue
                break

            df = self.output_log.append(previous, ignore_index=True)
        else:
            df = self.output_log

        retry = 5
        while retry > 0:
            try:
                df.to_csv(results_csv, sep=";")
                print('saved to', results_csv)
            except Exception:
                # Best effort: wait and retry; gives up silently after 5 tries.
                retry -= 1
                time.sleep(60)
                print('Error writing to file.', retry, 'attempts remainig ...')
                continue
            break
Ejemplo n.º 48
0
# Bag-of-words features: each vector has the length of the entire vocabulary
# and an integer count for the number of times each word appeared in the
# document.
# token_pattern (defined outside this excerpt) switches between a custom
# pattern (lower-case words of 4+ letters) and a two-or-more word-character
# pattern.
myPattern = r'[a-z]{4,}' if token_pattern else r'(?u)\b\w\w+\b'

vectorizer = CountVectorizer(stop_words=stop_words,
                             max_df=max_df,
                             min_df=min_df,
                             token_pattern=myPattern)
# Learn the vocabulary on the training texts and turn them into count vectors.
counts = vectorizer.fit_transform(X_train)

# Create classifier and fit for multinomial model.
clfMulti = MultinomialNB()
clfMulti.fit(counts, Y_train)

# Create classifier and fit for bernoulli model
# NOTE(review): with binarize=1 only counts greater than 1 are mapped to 1, so
# a word occurring exactly once is treated as absent - confirm this threshold
# is intended (the library default is 0.0).
clfBernoulli = BernoulliNB(binarize=1)
clfBernoulli.fit(counts, Y_train)

# Test texts and labels come from df_test (defined outside this excerpt).
X_test = df_test.text
Y_test = df_test.label

# Transforms each document into a vector (with length of vocabulary of train documents) with an
# integer count for the number of times each word appeared in the document
example_count = vectorizer.transform(X_test)

# Predict labels on the test data set
predictionsMulti = clfMulti.predict(example_count)
predictionsBernoulli = clfBernoulli.predict(example_count)


def getPercentageCorrect(predictions):
Ejemplo n.º 49
0
svm_cv.fit(train_X, train_y)
print(svm_cv.best_params__)
print(svm_cv.cv_results__)"""
# RBF gamma is fixed here; the grid-search lookup is left disabled.
gamma_best = 1.0  # svm_cv.best_params__["gamma"]

# Number of repeated runs for the final experiments (e.g., to get standard error).
numruns = 8

# Neural network candidate (svm can take too long to converge); it is built
# here but its entry in final_algs is commented out.
nn = MLPClassifier(
    hidden_layer_sizes=(16, 8),
    alpha=0.0,
    max_iter=10,
    random_state=None,
)

# Algorithms evaluated in the final experiments, keyed by display name.
final_algs = {
    "Logistic Regression": LogisticRegression(
        penalty="l1",
        solver="saga",
        random_state=None,
        class_weight="balanced",
        max_iter=90,
        C=C_best,
    ),
    "SVM": SVC(
        kernel="rbf",
        random_state=None,
        class_weight="balanced",
        gamma=gamma_best,
        max_iter=1000,
    ),
    "Naive Bayes": BernoulliNB(alpha=1.0, fit_prior=True),
    # "Neural Network": nn
}

print("Starting final experiments")

# Per-algorithm accumulators: a zeroed 2x2 confusion matrix and an empty list
# of macro f1 scores.
conf_mats = {name: pd.DataFrame([[0, 0], [0, 0]]) for name in final_algs}
f1 = {name: [] for name in final_algs}

# compute macro average of f1 score (i.e., f1 score for every run) so that we may calculate a confidence interval
for i in range(numruns):
Ejemplo n.º 50
0
#####TRYING MIX OF ALL MODELS
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Inputs and labels taken from train_data (defined outside this excerpt).
X = train_data.tweet
y = train_data.label

# 20 random train/test splits, each holding out 20% of the samples.
cv = ShuffleSplit(n_splits=20, test_size=0.2)

# Candidate models, all with default hyper-parameters.
# MultinomialNB, ShuffleSplit and SMOTE are not imported in this excerpt;
# they must be imported earlier in the file.
models = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(),
    SGDClassifier(),
    LinearSVC(),
    RandomForestClassifier(),
    MLPClassifier()
]

# Over-sampler instance; how it is applied is not visible in this excerpt.
sm = SMOTE()
# Init a dictionary for storing results of each run for each model
results = {
    model.__class__.__name__: {
        'accuracy': [], 
        'f1_score': [],
        'confusion_matrix': []
# First experiment of this excerpt: labels come from the 100-sample file.
# train_data_fn for this run is set outside the excerpt - presumably the
# matching 100-sample feature file; confirm.
train_label_fn = 'train-labels-100.txt'
test_data_fn = 'test-features.txt'
test_label_fn = 'test-labels.txt'

# Load both splits, fit a multinomial naive Bayes model and report the
# training-set size together with the test accuracy in percent.
(train_data, train_label)  = read_data(train_data_fn, train_label_fn)
(test_data, test_label)  = read_data(test_data_fn, test_label_fn)
clf = MultinomialNB()
clf.fit(train_data, train_label)
y_pred = clf.predict(test_data)
print('Training size = %d, accuracy = %.2f%%' % \
      (train_data.shape[0],accuracy_score(test_label, y_pred)*100))


# Second experiment: same procedure with the 50-sample training files.
train_data_fn = 'train-features-50.txt'
train_label_fn = 'train-labels-50.txt'
test_data_fn = 'test-features.txt'
test_label_fn = 'test-labels.txt'

(train_data, train_label)  = read_data(train_data_fn, train_label_fn)
(test_data, test_label)  = read_data(test_data_fn, test_label_fn)
clf = MultinomialNB()
clf.fit(train_data, train_label)
y_pred = clf.predict(test_data)
print('Training size = %d, accuracy = %.2f%%' % \
      (train_data.shape[0],accuracy_score(test_label, y_pred)*100))

# Third experiment: Bernoulli naive Bayes on the data loaded just above,
# with features binarized at a threshold of 0.5.
clf = BernoulliNB(binarize = .5)
clf.fit(train_data, train_label)
y_pred = clf.predict(test_data)
print('Training size = %d, accuracy = %.2f%%' % \
      (train_data.shape[0],accuracy_score(test_label, y_pred)*100))
Ejemplo n.º 52
0
def classification_naive_bayes(X, Y, nome):
    """Run ``classification_model_cv`` with a default Bernoulli naive Bayes.

    ``nome`` is appended to the "Naive Bayes " label passed to the helper.
    """
    classification_model_cv(X, Y, BernoulliNB(), "Naive Bayes " + nome)
Ejemplo n.º 53
0
# Build the vocabulary from the training texts, ignoring terms that appear in
# fewer than 3 documents.
vecCount = CountVectorizer(min_df=3)
vecCount.fit(X_train["text"])
# Number of distinct words
print("word size: ", len(vecCount.vocabulary_))
# Show the first 5 words
print("word content: ", dict(list(vecCount.vocabulary_.items())[0:5]))
# Vectorize the training and evaluation data
X_train_vec = vecCount.transform(X_train["text"])
X_test_vec = vecCount.transform(X_test["text"])
# Show the first 5 rows of the vectorized data
print("先頭5件のベクトル化データを表示")
print(pd.DataFrame(X_train_vec.toarray()[0:5], columns=vecCount.get_feature_names()))

# - Model creation -
# Bernoulli model
model = BernoulliNB()
model.fit(X_train_vec, Y_train["class"])

# - Evaluation -
# NOTE(review): fit uses Y_train["class"] while score receives the whole
# Y_train / Y_test objects - confirm they are single-column label frames.
print("Train accuracy = %.3f" % model.score(X_train_vec, Y_train))
print("Test accuracy = %.3f" % model.score(X_test_vec, Y_test))

# - Prediction -
# Build the text data to predict on
data = np.array([
    "I am happy.",
    "Are you happy? 00",
    "Free service! Please contact me immediately. But it is 300 US dollars next month."
])
df_data = pd.DataFrame(data, columns=["text"])
# Vectorize the prediction text data
Ejemplo n.º 54
0
  print("TRAIN:", train_index, "TEST:", test_index)
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  clf.fit(X_train, y_train)
  score = f05_scorer(clf, X_test, y_test)
  if score > best_score:
    best_clf = clf
    best_score = score

# Persist the multinomial model from the loop above.
# NOTE(review): this dumps `clf`, not `best_clf`, and the file is opened in
# text mode ('w') - confirm both are intended.
fout = open('kbest-multinomialNB.pickle','w')
pickle.dump(clf,fout)
fout.close()

#######################
# Same 10-fold procedure for a Bernoulli naive Bayes model (Python 2 syntax).
print "Bernoulli NB"
clf = BernoulliNB(binarize = 0.0, alpha = 0.25, fit_prior = False)

# Shuffled 10-fold split over 72000 samples (the sample count is hard-coded).
kf = KFold(72000, n_folds=10, shuffle=True)
best_score = 0
best_clf = 0
for train_index, test_index in kf:
  print("TRAIN:", train_index, "TEST:", test_index)
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  clf.fit(X_train, y_train)
  score = f05_scorer(clf, X_test, y_test)
  if score > best_score:
    # NOTE(review): best_clf is bound to the same object that is refitted on
    # every fold, so it does not keep a snapshot of the best fold's model -
    # confirm whether a copy was intended.
    best_clf = clf
    best_score = score

# Output file for the Bernoulli model (the dump itself is outside this excerpt).
fout = open('kbest-bernoulliNB.pickle','w')
Ejemplo n.º 55
0
 def __init__(self, info, verbose=True, debug_mode=False, run_on_gpu=False):
     """Select and configure the underlying model from the dataset info.

     ``info`` is read for 'label_num', 'target_num', 'task' and 'metric';
     depending on the branch taken it is also read for 'is_sparse',
     'has_categorical' and 'format'.  ``verbose`` is forwarded to most
     estimators.  ``debug_mode >= 2`` installs a random predictor instead
     of a real model.  ``run_on_gpu`` is accepted but not used here.

     Sets ``self.name``, ``self.model`` and ``self.predict_method``.
     """
     self.label_num = info['label_num']
     self.target_num = info['target_num']
     self.task = info['task']
     self.metric = info['metric']
     # To calibrate proba
     self.postprocessor = MultiLabelEnsemble(LogisticRegression(),
                                             balance=False)

     # Debug shortcut: no real model is built.
     if debug_mode >= 2:
         self.name = "RandomPredictor"
         self.model = RandomPredictor(self.target_num)
         self.predict_method = self.model.predict_proba
         return

     def bagged_classifier(base):
         # Single-estimator bagging wrapper shared by the naive Bayes
         # branches (unfortunately, no warm start).
         return BaggingClassifier(base_estimator=base,
                                  n_estimators=1,
                                  verbose=verbose,
                                  random_state=1)

     # Regression: handled first and returned early.
     if info['task'] == 'regression':
         if info['is_sparse'] == True:
             self.name = "BaggingRidgeRegressor"
             # Lukasz uses BernoulliNB() instead of Ridge()
             self.model = BaggingRegressor(base_estimator=Ridge(),
                                           n_estimators=1,
                                           verbose=verbose,
                                           random_state=1)
         else:
             # GradientBoostingRegressor is avoided because it does not
             # accept non c-contiguous arrays.
             self.name = "RandomForestRegressor"
             self.model = RandomForestRegressor(n_estimators=1,
                                                random_state=1,
                                                warm_start=True)
         self.predict_method = self.model.predict
         return

     # Classification: choose the estimator by data characteristics.
     if info['has_categorical']:
         # Categorical variables are not converted; use a forest.
         # warm_start is left out for backward compatibility.
         self.name = "RandomForestClassifier"
         self.model = RandomForestClassifier(n_estimators=1,
                                             verbose=verbose,
                                             random_state=1)
     elif info['format'] == 'sparse_binary':
         self.name = "BaggingBernoulliNBClassifier"
         self.model = bagged_classifier(BernoulliNB())
     elif info['format'] == 'sparse':
         self.name = "BaggingMutinomialNBClassifier"
         self.model = bagged_classifier(MultinomialNB())
     elif info['label_num'] > 100:
         self.name = "BaggingGaussianNBClassifier"
         self.model = bagged_classifier(GaussianNB())
     else:
         # warm_start is disabled here (original note: it no longer works).
         self.name = "GradientBoostingClassifier"
         self.model = GradientBoostingClassifier(n_estimators=1,
                                                 verbose=verbose,
                                                 random_state=1,
                                                 min_samples_split=10,
                                                 warm_start=False)

     # Multilabel tasks wrap the chosen classifier in an ensemble.
     if info['task'] == 'multilabel.classification':
         self.model = MultiLabelEnsemble(self.model)
     self.predict_method = self.model.predict_proba
Ejemplo n.º 56
0
def training_step(data, vectorizer):
    """Fit a Bernoulli naive Bayes model on the vectorized 'Lyrics' column.

    ``vectorizer.fit_transform`` is applied to ``data['Lyrics']`` and the
    result is used as features, with ``data['Year']`` as the target.
    Returns the value of ``BernoulliNB().fit(...)``.
    """
    lyrics = data['Lyrics']
    years = data['Year']
    features = vectorizer.fit_transform(lyrics)

    model = BernoulliNB()
    return model.fit(features, years)
    SGD_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 4))),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression()),
    ])
elif algo == "Perceptron" or algo == "perceptron":
    SGD_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 4))),
        ('tfidf', TfidfTransformer()),
        ('clf', Perceptron()),
    ])
elif algo == "BernoulliNB" or algo == "bernoulliNB":
    SGD_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 4))),
        ('tfidf', TfidfTransformer()),
        ('clf', BernoulliNB()),
    ])
elif algo == "SGDClassifier" or algo == "sgdClassifier":
    SGD_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 4))),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])

# Fit model to training set
SGD_clf.fit(X_train, y_train)

# Predict on test set
SVM_pred = SGD_clf.predict(X_test)

Ejemplo n.º 58
0
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
import random
from sklearn.ensemble import VotingClassifier

# This seeds only Python's ``random`` module. scikit-learn estimators draw
# from NumPy's random state, so the decision tree gets an explicit
# random_state below to make the result reproducible.
random.seed(2002)

iris = datasets.load_iris()
X = iris.data
Y = iris.target

tree = DecisionTreeClassifier(random_state=2002)
GNB = GaussianNB()
# NOTE(review): BernoulliNB binarizes features at 0.0 by default; on
# continuous measurements this may carry little information - confirm it is
# a deliberate ensemble member.
BNB = BernoulliNB()

# Voting ensemble in which the tree's vote counts double.
vote = VotingClassifier(estimators=[('tree', tree), ('Gnb', GNB),
                                    ('Bnb', BNB)],
                        weights=[2, 1, 1])
vote.fit(X, Y)
pred = vote.predict(X)

# Accuracy on the same data the ensemble was fitted on.
print(accuracy_score(Y, pred))
Ejemplo n.º 59
0

# Training texts and their topics (topic_analysis is defined outside this
# excerpt).
tweet_data = train_data['tweet_text']
topic_data = topic_analysis(train_data)
# Case-sensitive tokens of two or more characters drawn from letters, digits
# and the symbols # @ % _ $.
count = CountVectorizer(token_pattern=r'[a-zA-Z0-9#@%_$]+[a-zA-Z0-9#@%_$]+',
                        lowercase=False)
# Vocabulary is learned on the training tweets and reused for the test tweets.
bag_of_words = count.fit_transform(tweet_data)
bag_of_words_2 = count.transform(test_data['tweet_text'])

X = bag_of_words.toarray()
Y = np.array(topic_data)

x_train = X
x_test = bag_of_words_2.toarray()
y_train = Y

from sklearn.naive_bayes import BernoulliNB
#from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, classification_report

clf = BernoulliNB()
model = clf.fit(x_train, y_train)

predictions = model.predict(x_test)
instance = test_data['instance_number']
# Pair every instance number with its prediction by position. Indexing
# ``instance[i]`` would be a label lookup on a pandas Series and fail (or pick
# the wrong row) whenever the index is not exactly 0..n-1.
dic = OrderedDict(zip(instance, predictions))

# Emit "<instance_number> <predicted_topic>" per line, in input order.
for k, v in dic.items():
    print(str(k) + ' ' + str(v))
Ejemplo n.º 60
0
# Boosting / forest models (continues the list started above this excerpt)
classifiers.extend([
    GradientBoostingClassifier(random_state=random_state),
    RandomForestClassifier(random_state=random_state),
])

# Gaussian process
classifiers.extend([
    GaussianProcessClassifier(random_state=random_state),
])

# Generalized linear models
classifiers.extend([
    LogisticRegressionCV(random_state=random_state),
    PassiveAggressiveClassifier(random_state=random_state),
    RidgeClassifierCV(),
    SGDClassifier(random_state=random_state),
    Perceptron(random_state=random_state),
    MLPClassifier(random_state=random_state),
])

# Naive Bayes
classifiers.extend([
    BernoulliNB(),
    GaussianNB(),
])

# Nearest neighbors
classifiers.extend([
    KNeighborsClassifier(),
])

# Discriminant analysis
classifiers.extend([
    LinearDiscriminantAnalysis(),
])

# Support vector machines
classifiers.extend([
    SVC(random_state=random_state, probability=True),
    NuSVC(random_state=random_state, probability=True),
    LinearSVC(random_state=random_state),
])

# Trees
classifiers.extend([
    DecisionTreeClassifier(random_state=random_state),
])