Example No. 1
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


def naive_bayes(x_value, y_value):
    X = x_value
    y = y_value

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

    # build document-term matrices from the training vocabulary
    vect = CountVectorizer()
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    X_test_dtm = vect.transform(X_test)

    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)

    print('Accuracy: ')
    print(metrics.accuracy_score(y_test, y_pred_class))

    print('Null Accuracy: ')
    print(y_test.value_counts().head(1) / len(y_test))

    print('Confusion Matrix: ')
    print(metrics.confusion_matrix(y_test, y_pred_class))
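A hypothetical invocation of the function above, assuming `x_value` is a pandas Series of raw texts and `y_value` the matching labels (the `value_counts()` call implies pandas inputs); the toy data is purely illustrative:

import pandas as pd

texts = pd.Series(["free prize now", "lunch at noon?", "win cash today", "see you at noon"])
labels = pd.Series([1, 0, 1, 0])
naive_bayes(texts, labels)  # prints accuracy, null accuracy, and the confusion matrix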
Example No. 2
from os import chdir
import pickle

import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


def main():
    """Loads data, trains the model, and tests it.

    Inputs:
        file: binary file containing a sparse numpy array with text features
        file: binary file containing a pandas dataframe with training labels

    Outputs:
        print: classification report of classifier performance
    """

    # Load training labels and text features
    chdir("../pickles")
    with open("word_counts.pkl", "rb") as f:
        X = pickle.load(f)
    with open("training_labels.pkl", "rb") as f:
        y = pickle.load(f)
        y = np.ravel(y["sponsored"])

    # Create train and test splits
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # Create and train model
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(classification_report(y_test, y_pred))
Example No. 3
def run_analyzer(data_file):
    start_time = time.time()
    with open(data_file, 'rb') as f:
        data = pickle.load(f)
    labels = data['labels']
    features = data['features']

    # split into training and test data
    training_features, test_features, training_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, random_state=0)

    clf = MultinomialNB().fit(training_features, training_labels)
    print("number of training samples: %d" % len(training_labels))
    print("number of test samples: %d" % len(test_labels))
    print("number of features: %d" % training_features.shape[1])
    print("score on the training data: %.2f" % clf.score(training_features, training_labels))

    predictions = np.array([float(p) for p in clf.predict(test_features)])
    test_labels = np.array([float(label) for label in test_labels])
    success_rate = np.mean(predictions == test_labels)

    print("results fitting on test data:")
    print("success rate: %s" % success_rate)
    print("Runtime: %.2f seconds" % (time.time() - start_time))

##SCRIPT
#run_analyzer(DATA_FILE_2)
#cross_val(DATA_FILE)
#cross_val(DATA_FILE_2)
#search_parameters(DATA_FILE_2)
Example No. 4
    def train(self):
        '''
        ## -- How to predict -- ##
            query = "blah blah"
            q = list2Vec(hashit(query))
            clf2 = joblib.load('nb')
            print(clf2.predict(q))  # <--- returns type id
        '''

        limit = self.comment_limit
        sqls = ["SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=1 ORDER BY time DESC LIMIT " + str(limit),
            "SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=2 ORDER BY time DESC LIMIT " + str(limit),
            "SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=3 ORDER BY time DESC LIMIT " + str(limit)]

        print "training model"
        comments = self.sql2list(sqls)
        x, y = self.featureMatrix(comments)
        X = list2Vec(x)
        Y = list2Vec(y)

        q = "Let's talk about food."
        q_vec = list2Vec(hashit(q))

        ## Predicting
        print("Classifying")
        clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
        clf.fit(X, Y)
        joblib.dump(clf, self.path, compress=9)
Example No. 5
def crossValidate(X_dataset, y):
    # cross-validate the model
    num_folds = 5
    kfold = StratifiedKFold(n_splits=num_folds, shuffle=True)

    avg_accuracy = 0
    avg_precision = 0
    avg_recall = 0
    print("----------- cross_validation k=5")
    for train, test in kfold.split(X_dataset, y):
        Xtrain, Xtest, ytrain, ytest = X_dataset[train], X_dataset[test], y[train], y[test]

        # clf = LinearSVC()
        clf = MultinomialNB(alpha=0.1)
        # clf = LDA()
        clf.fit(Xtrain.toarray(), ytrain)
        ypred = clf.predict(Xtest.toarray())
        accuracy = metrics.accuracy_score(ytest, ypred)
        avg_accuracy += accuracy
        precision = metrics.precision_score(ytest, ypred)
        avg_precision += precision
        recall = metrics.recall_score(ytest, ypred)
        avg_recall += recall

    print("Average accuracy :", avg_accuracy / num_folds)
    print("Average precision :", avg_precision / num_folds)
    print("Average recall :", avg_recall / num_folds)
def bag_of_words_probabilities(train_reviews, test_reviews):
    """ Implements a baseline bag-of-words classifier.  Returns a dictionary mapping tuples (review_id, class) to the probability that that review belongs to that class. """
    train_corpus = []
    test_corpus = []
    Y_train = []
    for review_id in train_reviews:
        review = train_reviews[review_id]
        train_corpus.append(review["text"])
        Y_train.append(review["rating"])

    vectorizer = CountVectorizer(stop_words = 'english')
    X_train = vectorizer.fit_transform(train_corpus)

    for review_id in test_reviews:
        review = test_reviews[review_id]
        test_corpus.append(review["text"])

    # clf = LinearSVC(class_weight = 'auto').fit(X_train, Y_train)
    # clf = LogisticRegression().fit(X_train, Y_train)
    clf = MultinomialNB().fit(X_train, Y_train)

    X_test = vectorizer.transform(test_corpus)
    Y_probability = clf.predict_proba(X_test)

    probability_dict = {}
    review_id_list = list(test_reviews.keys())
    for i in range(len(review_id_list)):
        probability_dict[review_id_list[i]] = Y_probability[i][1]

    return probability_dict
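A sketch of how this function might be called, matching the review-dict structure the loops above expect; the ids, texts, and printed probabilities are illustrative only:

train = {1: {"text": "great food", "rating": 1},
         2: {"text": "awful service", "rating": 0},
         3: {"text": "great service", "rating": 1},
         4: {"text": "awful food", "rating": 0}}
test = {5: {"text": "great place"}, 6: {"text": "awful place"}}
print(bag_of_words_probabilities(train, test))  # e.g. {5: 0.9..., 6: 0.1...}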
Example No. 7
def MultinomialNBClassify_Proba(enrollment_id, trainData, trainLabel, testData):
    nbClf = MultinomialNB() # default alpha=1.0, Laplace smoothing
    # setting alpha < 1 is called Lidstone smoothing
    nbClf.fit(trainData, ravel(trainLabel))
    testLabel = nbClf.predict_proba(testData)[:,1]
    saveResult(enrollment_id, testLabel, 'Proba_sklearn_MultinomialNB_alpha=0.1_Result.csv')
    return testLabel
def main():
    # extract reviews from tsv files
    labeled_training_data = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)  # 25,000 reviews
    test_data = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)  # 25,000 reviews

    print("Creating BOW....")
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
    trained_data_features = vectorizer.fit_transform(review_list)
    trained_data_features = trained_data_features.toarray()  # convert to numpy array for faster processing

    print("Supervised Learning - Naive Bayes")
    nb_model = MultinomialNB(alpha=0.01)
    nb_model = nb_model.fit(trained_data_features, labeled_training_data["sentiment"])  # using BOW as features and the given labels as response variables

    print("---------------------------------")
    print(" ")
    print("Predicting on test data: ")

    # BOW for the test set
    test_data_features = vectorizer.transform(test_review_list)
    test_data_features = test_data_features.toarray()

    # use the trained model to make predictions
    predictions = nb_model.predict(test_data_features)

    # prepare output submission file
    prediction_output = pd.DataFrame(data={"id": test_data["id"], "sentiment": predictions})  # create pandas dataframe
    prediction_output.to_csv("BOW_NB.csv", index=False, quoting=3)  # write to csv file
    joblib.dump(vectorizer, 'bow_model.pkl')
    joblib.dump(nb_model, 'nb_bow_model.pkl')
Example No. 9
def classify_reviews():
    import featurizer
    import gen_training_data
    import numpy as np
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier

    data = gen_training_data.gen_data()
    stemmed_data = featurizer.stem(data)
    tfidf = featurizer.tfidf(data)
    clf = MultinomialNB().fit(tfidf['train_tfidf'], data['training_labels'])
    predicted = clf.predict(tfidf['test_tfidf'])
    num_wrong = 0
    for expected, guessed in zip(data['testing_labels'], predicted):
        if expected - guessed != 0:
            num_wrong += 1

    print("num_wrong: %d" % num_wrong)

    sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)
    _ = sgd_clf.fit(tfidf['train_tfidf'], data['training_labels'])
    sgd_pred = sgd_clf.predict(tfidf['test_tfidf'])
    print(np.mean(sgd_pred == data['testing_labels']))

    stem_tfidf = featurizer.tfidf(stemmed_data)
    _ = sgd_clf.fit(stem_tfidf['train_tfidf'], data['training_labels'])
    sgd_stem_prd = sgd_clf.predict(stem_tfidf['test_tfidf'])
    print(np.mean(sgd_stem_prd == data['testing_labels']))
Example No. 10
def main(clf):
    #print 'getting train'
    train = pd.read_csv('dat/trainMN.tsv',sep = '\t')
    #print 'getting test'
    test = pd.read_csv('dat/devMN.tsv', sep = '\t')

    global all_words
    all_words = word_to_set(train['Phrase'], trim=20, is_raw=True)

    #print 'creating x dict vectors from train'
    train_x = train['Phrase']
    #print 'extracting...'
    train_x = use_feature_dicts(train_x)
    # print train_x

    #print 'creating train y'
    train_y = [int(y) for y in train['Sentiment']]
    if clf == 'NB':
        classifier = MultinomialNB().fit(train_x, train_y)
    elif clf == 'RF':
        classifier = RandomForestClassifier().fit(train_x, train_y)
    elif clf == 'LG':
        classifier = linear_model.LinearRegression()
        classifier = classifier.fit(train_x, train_y)
    elif clf == 'SGD':
        classifier = SGDClassifier().fit(train_x, train_y)
    #print 'testing'
    test_x = use_feature_dicts(test['Phrase'])
    
    for i in classifier.predict(test_x):
        print(i)
    title = clf + '.pickle'
    with open(title, 'wb') as f:
        pickle.dump(classifier, f)
Example No. 11
def naive_bayes():
    nb = MultinomialNB()
    nb.fit(X_train, train_data.danger)
    nb_pred = nb.predict(X_test)
    nb_score = nb.score(X_test, y_test)
    precision, recall, _, _ = precision_recall_fscore_support(y_test, nb_pred)
    return precision, recall, str(nb_score)
Example No. 12
class TrainNaiveBayes:

    def __init__(self, all_features, neu_labels):
        """
        Trains a classifier using Naive Bayes
        """
        self._num_features = len(next(iter(all_features.values())))

        self._X = numpy.zeros((1, self._num_features))          # Feature matrix
        self._Y = numpy.array([0])                        # Label vector
        for user_id in neu_labels.keys():
            self._X = numpy.append(self._X, [all_features[user_id]], axis=0)
            self._Y = numpy.append(self._Y, [neu_labels[user_id]])
        self._X = numpy.delete(self._X, 0, 0)           # Delete the first row (contains all 0s)
        self._Y = numpy.delete(self._Y, 0)

        print "Using MultinomialNB"
        self._model = MultinomialNB()
        print cross_validation.cross_val_score(self._model, self._X, self._Y, cv=10, scoring='f1')

        self._model.fit(self._X, self._Y)

    def predict(self, features):
        A = numpy.zeros((1, self._num_features))
        for user_id in features.keys():
            A = numpy.append(A, [features[user_id]], axis=0)
        A = numpy.delete(A, 0, 0)
        return self._model.predict(A)
Example No. 13
def train(good_sources, bad_sources,method,naive_bayes=None,keywords=list()):
    #train the algorithm
    good_samples = find_keywords(' '.join([entry[method] for entry in good_sources]))
    bad_samples = find_keywords(' '.join([entry[method] for entry in bad_sources]))


    #if we have an existing knowledge base to append this new information to, do so
    if naive_bayes:
        new_kws = set(good_samples+bad_samples)
        print('Using old keywords as well')
        print("# old keywords = {}\n # new keywords = {}".format(len(keywords),len(new_kws)))
        new_kws = set(good_samples+bad_samples).difference(keywords)
        print("# fresh keywords = {}\n".format(len(new_kws)))

        #make some call to naive_bayes.partial_fit in here
        X = np.concatenate((naive_bayes.feature_count_, np.zeros((naive_bayes.feature_count_.shape[0],len(new_kws)))),1)
        all_kw = keywords + list(new_kws)

    else:
        print('Only using keywords from this content set')
        all_kw = list(set(good_samples+bad_samples))
        X = np.zeros((2,len(all_kw)))

    for j,kw in enumerate(all_kw):
        X[0,j] += good_samples.count(kw)
        X[1,j] += bad_samples.count(kw)

    y = ['good','bad']

    naive_bayes = MultinomialNB()
    naive_bayes.fit(X,y)

    return naive_bayes, all_kw
def string_selection():
    # get data
    vectorizer = CountVectorizer(decode_error='ignore')
    ch2 = SelectKBest(chi2, k=100)

    # get data
    train_data, permission_list = db_tool.get_new_train_data()
    x_train, x_test, y_train, y_test = train_test_split(train_data['string-data'],
                                                        train_data['target'], test_size=0.2,
                                                        random_state=1)

    # feature extraction
    x_train = vectorizer.fit_transform(x_train)
    feature_names = vectorizer.get_feature_names()

    x_train = ch2.fit_transform(x_train, y_train)
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    print(ch2.scores_)
    print(ch2.get_support(indices=True))
    print(feature_names)
    x_test = vectorizer.transform(x_test)
    x_test = ch2.transform(x_test)

    # build the model
    model = MultinomialNB().fit(x_train, y_train)

    # validate the model
    predicted = model.predict(x_test)
    print(metrics.accuracy_score(y_test, predicted))
Example No. 15
    def run_naivebayes_evaluation(self, inputdata, outputdata, k):
        """ Fit Naive Bayes classification on the train set with cross validation.
        Run Naive Bayes classification on the test set. Return results.
        """

        # Cross validation
        cv = KFold(n_splits=k)
        f1_scores = []
        for traincv, testcv in cv.split(inputdata):
            clf_cv = MultinomialNB()
            clf_cv.fit(inputdata[traincv], outputdata[traincv])

            y_pred_cv = clf_cv.predict(inputdata[testcv])

            f1 = metrics.f1_score(outputdata[testcv], y_pred_cv, pos_label=0)
            f1_scores.append(f1)

        #TODO: NEEDED? self.classifier = clf_cv
        print("score average: %s" % np.mean(f1_scores))

        average_score = np.mean(f1_scores)
        tuples = (average_score, f1_scores)

        return (tuples, 'N.A.', 'N.A.')
Example No. 16
def predict(cur, plyr_id, game_plyrs): 
  #creates training set (called 'X') for plyr
  all_plyrs = all_player_ids(cur) #np.array - all NFL players (and coaches)
  games = games_played_in(cur, plyr_id) #np.array - the games_ids the player played in
  n_cols = all_plyrs.shape[0] #int 
  m_rows = games.shape[0] #int
  w = weights(games)
  zeros = np.zeros((m_rows, n_cols)) #2darr - used to initialize DF
  X = pd.DataFrame(zeros, index=games, columns=all_plyrs) #dataframe
  populate_training_set(cur, X, games, plyr_id)
  #print "X: ", X.values
  
  ###run coaches_model and then im here### 
  #creates vector of known output values
  Y = training_output_vector(cur, games, plyr_id) #good
  #print "(len) Y: ", len(Y), Y
  test_zeros = np.zeros((1, n_cols)) #2darr - used to initialize DF
  test_X = pd.DataFrame(test_zeros, columns=all_plyrs) #dataframe
  update_training_matrix(cur, game_plyrs, 0, test_X)

  #run Multinomial NB Classifier
  nb_clf = MultinomialNB()
  
  if len(X.values) == 0:
    return 0
  nb_clf.fit(X, Y, sample_weight=w)
  nb_predictions = nb_clf.predict(test_X)
  #print "test_X: ", test_X.values
  nb_norm_prob = normalize_probs(nb_clf.predict_proba(test_X)[0])
  avgs = [3,8,12.5,17,21,25]
  #print "probs: ", nb_norm_prob
  #print avgs
  ev = expected_val(nb_norm_prob, avgs) #can also calc dot product
  return round(ev,1)
Example No. 17
class Sentiment:
    def __init__(self):
        self.stop_words = stopwords.words() + list(string.punctuation)
        self.tfid = TfidfVectorizer()
        self.clf = MultinomialNB()

        # score: 0.7225
        # self.clf = SVC()

    # create pipelines
    # clean the input
    def fit(self, X, Y):
        self.X = X
        self.Y = Y
        # give the subset of dataset to be trained
        l = 0
        h = 4000
        words = [word_tokenize(x.decode("utf-8").lower()) for x in X[l:h]]
        processed_words = [" ".join(w for w in s if w not in self.stop_words) for s in words]
        X_train = self.tfid.fit_transform(processed_words)
        Y_train = Y[l:h]
        self.clf.fit(X_train, Y_train)
        print "Classes: ", self.clf.classes_
        print "Score: ", self.clf.score(X_train, Y_train)

    def predict(self, X_inp):
        word_list = " ".join(w for w in word_tokenize(X_inp.decode("utf-8").lower()) if w not in self.stop_words)
        X_test = self.tfid.transform([word_list])
        return self.clf.predict(X_test)
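A minimal usage sketch for the class above, assuming the NLTK stopword and punkt tokenizer data are downloaded; the byte-string inputs match the `.decode("utf-8")` calls in `fit` and `predict`, and the toy reviews are illustrative:

X = [b"loved it", b"hated it", b"wonderful film", b"terrible film"]
Y = [1, 0, 1, 0]
s = Sentiment()
s.fit(X, Y)
print(s.predict(b"wonderful"))  # e.g. array([1])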
Example No. 18
def run_learning_curves_experiment(dataset):
    logger.info("Now starting experiment with learning curves...")
    scores = []
    sklearn_scores = []
    train_sizes = []

    clf = MultinomialBayesEstimator()
    sklearn_clf = MultinomialNB()
    # Constructing confidence intervals using the empirical bootstrap
    intervals = []
    for test_size in range(1, len(dataset)):
        f_scores = []
        f_scores_sklearn = []
        for train_set, test_set in split_train_test_p_out(dataset, test_size):
            train_set, test_set = split_train_test(dataset, test_size)
            X_train, y_train, X_test, y_test = make_test_train(train_set, test_set)
            clf.fit(X_train, y_train)
            f_scores.append(f1_score(y_test, clf.predict(X_test)))
            sklearn_clf.fit(X_train, y_train.ravel())
            f_scores_sklearn.append(f1_score(y_test, sklearn_clf.predict(X_test)))
        intervals.append(calculate_confidence_interval(f_scores))
        scores.append(np.mean(f_scores))
        sklearn_scores.append(np.mean(f_scores_sklearn))
        train_sizes.append(len(dataset) - test_size)

    plot_learning_curves(train_sizes, sklearn_scores, scores, intervals)
Example No. 19
def test_sklearn_nb(balanced):
    movie_words = process_plots_mp(balanced)

    training_movies = [movie_words[i] for i in range(len(movie_words)) if i % 3 != 0]
    test_movies = [movie_words[i] for i in range(len(movie_words)) if i % 3 == 0]

    vec = DictVectorizer()
    training_features = vec.fit_transform([movie.wordcounts for movie in training_movies]).toarray()
    training_labels = np.array([movie.year for movie in training_movies])
    #LOGGER.debug("Original size of feature vectors: %d (issparse: %s)" % (
        #csr_matrix(training_features[-1]).toarray().size, str(issparse(training_features))
    #))

    mnb_classifier = MultinomialNB()
    mnb_classifier.fit(training_features, training_labels)

    test_features = vec.transform([movie.wordcounts for movie in test_movies])
    test_labels = np.array([movie.year for movie in test_movies])

    results = mnb_classifier.predict(test_features)

    correct = sum([1 for i, result in enumerate(results) if result == test_labels[i]])
    LOGGER.info("skleanrn's MultinomialNB classifier predicted %d/%d correctly (%0.3f%% accuracy)" % (
        correct, len(test_labels), correct / len(test_labels) * 100
    ))
Example No. 20
    def train(self, data):
        launches = [x['application'] for x in data]

        # each predictor gets its own estimator; refitting a single
        # MultinomialNB instance would overwrite the first model
        instances = [{'lu1': launches[i - 1]} for i in range(1, len(launches))]
        X = self.vectorizer.fit_transform(instances).toarray()
        y = launches[1:]
        self.lu1_predictor = MultinomialNB().fit(X, y)

        instances = [{'lu2': launches[i - 2]} for i in range(2, len(launches))]
        X = self.vectorizer.fit_transform(instances).toarray()
        y = launches[2:]
        self.lu2_predictor = MultinomialNB().fit(X, y)

        # tune mu
        max_hr = 0
        best_mu = 0
        for mu in [x / 10.0 for x in range(11)]:
            self.mu = mu
            predictions = [self.predict({'lu1': launches[i - 1], 'lu2': launches[i - 2]})
                           for i in range(2, len(launches))]
            hr, mrr = self.test(launches[2:], predictions)
            if hr > max_hr:
                max_hr = hr
                best_mu = mu
        self.mu = best_mu
Example No. 21
def run_k_fold_cross_validation_experiment(dataset):
    logger.info("Starting %d-fold cross-validation...", len(dataset))

    clf_sklearn = MultinomialNB()
    clf = MultinomialBayesEstimator()

    sklearn_scores = create_scores_collector()
    scores = create_scores_collector()

    for train_set, test_set in split_train_test_k_fold(dataset):
        X_train, y_train, X_test, y_test = make_test_train(train_set, test_set)

        # Sklearn
        clf_sklearn.fit(X_train, y_train.ravel())
        predictions = clf_sklearn.predict(X_test)
        sklearn_scores.append_scores(y_test, predictions)

        # Our bayes without ngrams
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        scores.append_scores(y_test, predictions)

    logger.info("%d-fold cross validation finished", len(dataset))
    log_scores(sklearn_scores, "Sklearn")
    log_scores(scores, "MBE")
Example No. 22
def MultinomialNBClassify(trainData, trainLabel, testData):
    nbClf = MultinomialNB(alpha=0.1) # default alpha=1.0 is Laplace smoothing
    # setting alpha < 1 is called Lidstone smoothing
    nbClf.fit(trainData, ravel(trainLabel))
    testLabel = nbClf.predict(testData)
    saveResult(testLabel, 'sklearn_MultinomialNB_alpha=0.1_Result.csv')
    return testLabel
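For reference, the quantity that `alpha` smooths in scikit-learn's MultinomialNB is the per-class feature estimate (N_yi + alpha) / (N_y + alpha * n_features): alpha=1.0 is Laplace smoothing, 0 < alpha < 1 is Lidstone. A minimal sketch on a toy count matrix (data is illustrative):

import numpy as np
from sklearn.naive_bayes import MultinomialNB

X = np.array([[2, 1, 0], [3, 0, 0], [0, 1, 2], [0, 2, 3]])  # toy document-term counts
y = np.array([0, 0, 1, 1])

for alpha in (1.0, 0.1):  # Laplace vs. Lidstone smoothing
    clf = MultinomialNB(alpha=alpha).fit(X, y)
    # feature_log_prob_ holds log((N_yi + alpha) / (N_y + alpha * n_features))
    print(alpha, np.exp(clf.feature_log_prob_))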
Example No. 23
def main():
    print('Reading in data file...')
    data = pd.read_csv(path + 'Sentiment Analysis Dataset.csv',
                       usecols=['Sentiment', 'SentimentText'], error_bad_lines=False)

    print('Pre-processing tweet text...')
    corpus = data['SentimentText']
    vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode',
                                 stop_words='english', tokenizer=tokenize)
    X = vectorizer.fit_transform(corpus.values)
    y = data['Sentiment'].values

    print('Training sentiment classification model...')
    classifier = MultinomialNB()
    classifier.fit(X, y)

    print('Training word2vec model...')
    corpus = corpus.map(lambda x: tokenize(x))
    word2vec = Word2Vec(corpus.tolist(), size=100, window=4, min_count=10, workers=4)
    word2vec.init_sims(replace=True)

    print('Fitting PCA transform...')
    word_vectors = [word2vec[word] for word in word2vec.vocab]
    pca = PCA(n_components=2)
    pca.fit(word_vectors)

    print('Saving artifacts to disk...')
    joblib.dump(vectorizer, path + 'vectorizer.pkl')
    joblib.dump(classifier, path + 'classifier.pkl')
    joblib.dump(pca, path + 'pca.pkl')
    word2vec.save(path + 'word2vec.pkl')

    print('Process complete.')
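A sketch of how the saved artifacts might be reloaded for inference, assuming the same `path` and that the custom `tokenize` function is importable where the vectorizer is unpickled (the pickled TfidfVectorizer references it):

import joblib

vectorizer = joblib.load(path + 'vectorizer.pkl')
classifier = joblib.load(path + 'classifier.pkl')

sample = vectorizer.transform(['what a great day'])
print(classifier.predict(sample))  # e.g. array([1]) for positive sentiment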
Example No. 24
def naive_classify_unknown(X_train, y_train, vectorizer):
    client = pymongo.MongoClient("localhost", 27017)
    db = client.tweets
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    test_users = db.tweets.distinct('user.screen_name')
    classify_users(clf, vectorizer, test_users, load_users(db, test_users))
Example No. 25
class NaiveBayes:
    def __init__(self):
        self.clf = MultinomialNB()
        self.pattern = '(?u)\\b[A-Za-z]{3,}'
        self.tfidf = TfidfVectorizer(sublinear_tf=False, use_idf=True, smooth_idf=True, stop_words='english', token_pattern=self.pattern, ngram_range=(2, 2))

    def train(self, fileName):
        print("Naive Bayes classifier is being trained")
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_train = self.tfidf.fit_transform(table.message)
        Y_train = [int(item) for item in table.cat]
        self.clf.fit(X_train, Y_train)
        print("Naive Bayes classifier has been trained")

    def classify(self, cFileName, rFileName):
        table = pandas.read_table(cFileName, names=["message"])
        X_test = self.tfidf.transform(table.message)
        print("Data have been classified")
        with open(rFileName, 'w') as f:
            for item in self.clf.predict(X_test).astype(str):
                f.write(item + '\n')

    def validate(self, fileName):
        table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
        X_validate = self.tfidf.transform(table.message)
        Y_validated = self.clf.predict(X_validate).astype(str)
        totalNum = len(table.cat)
        errorCount = 0
        for i in range(0, totalNum):
            if int(table.cat[i]) != int(Y_validated[i]):
                errorCount += 1
        print("Data have been validated! Accuracy={}".format((totalNum - errorCount) / float(totalNum)))
Example No. 26
def plain_word_counts(corpus_path):
    folds = KFold(n_splits=10, shuffle=True)

    results = []

    for i, (train_idx, test_idx) in enumerate(folds.split(range(article_count))):
        logging.info("Running fold %d" % i)
        vect = CountVectorizer(max_features=1000, decode_error='ignore', strip_accents='unicode')
        x_train = vect.fit_transform(ArticleSequence(corpus_path, indices=train_idx))

        label_enc = LabelEncoder()
        y_train = label_enc.fit_transform(GroupSequence(corpus_path, indices=train_idx))

        x_test = vect.transform(ArticleSequence(corpus_path, indices=test_idx))
        y_test = label_enc.transform(GroupSequence(corpus_path, indices=test_idx))

        model = MultinomialNB()
        model.fit(x_train, y_train)
        pred = model.predict(x_test)

        score = accuracy_score(y_test, pred)
        logging.info("Completed fold %d with score %.04f" % (i, score))
        results.append(score)

    return results
Example No. 27
def bcluster(corpus_path, cluster_fn):
    folds = KFold(n_splits=10, shuffle=True)

    results = []

    for i, (train_idx, test_idx) in enumerate(folds.split(range(article_count))):
        logging.info("Running fold %d" % i)
        vect = BrownClusterVectorizer(cluster_fn)
        x_train = vect.fit_transform(ArticleSequence(corpus_path, indices=train_idx))

        label_enc = LabelEncoder()
        y_train = label_enc.fit_transform(GroupSequence(corpus_path, indices=train_idx))

        x_test = vect.transform(ArticleSequence(corpus_path, indices=test_idx))
        y_test = label_enc.transform(GroupSequence(corpus_path, indices=test_idx))

        model = MultinomialNB()
        model.fit(x_train, y_train)
        pred = model.predict(x_test)

        score = accuracy_score(y_test, pred)
        logging.info("Completed fold %d with score %.04f" % (i, score))
        results.append(score)

    return results
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0)

    print('Feature selection...')
    print('fs method: ' + fs_method, 'fs num: ' + str(fs_num))
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]

    print('Building VSM model...')
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print('Accuracy: ', acc)

    return acc
Example No. 29
def find_best_vectorizor(vectorizer, grid):
  dg = DataGatherer()
  y_test = dg.validate_target
  y_train = dg.labeled_target

  nb = MultinomialNB()
  header_printed = False
  best_params = None
  best_score = -1
  for param in ParameterGrid(grid):
    if not header_printed:
      print(str(",".join(param.keys())) + ",Score")
    header_printed = True
    vectorizer.set_params(**param)
    X_train = vectorizer.fit_transform(dg.labeled_data)    
    X_test = vectorizer.transform(dg.validate_data)
    nb.fit(X_train, y_train)
    score = nb.score(X_test, y_test)
    if score > best_score:
      best_score = score
      best_params = param
    print(str(",".join(map(str, param.values()))) + "," + str(score))
  print("")
  print("Best params: " + str(best_params))
  print("Best score: " + str(best_score))
Example No. 30
def train_chunk(X, Y, Xe, Ye):
	#clf = KNeighborsClassifier(n_neighbors=5).fit(X, Y)
	#clf = GaussianNB().fit(X, Y)
	clf = MultinomialNB().fit(X, Y)
	Yd = clf.predict(Xe)

	return stats(Ye, Yd)
print("DCDISTANCE + RF")
t.classify(dataset=dataset,
           platform=platform,
           language=language,
           clf=RandomForestClassifier(random_state=42),
           parameters={'clf__n_estimators': tree_estimators},
           feature_set=Model.DCDISTANCE_CODE,
           kfold=5)
print("------------------------------------------------------")

# BOW
print("BOW + MultinomialNB")
t.classify(dataset=dataset,
           platform=platform,
           language=language,
           clf=MultinomialNB(),
           parameters={},
           feature_set=Model.BOW_CODE,
           kfold=5)
print("------------------------------------------------------")

print("BOW + KNN")
t.classify(dataset=dataset,
           platform=platform,
           language=language,
           clf=KNeighborsClassifier(),
           parameters={'clf__n_neighbors': [1, 3, 5, 7]},
           feature_set=Model.BOW_CODE,
           kfold=5)
print("------------------------------------------------------")
def k_fold_cross_validation(x, y, splits, repeats):

    seed = 7

    # classifiers for the ensemble
    clf1 = LogisticRegression(random_state=seed, C=625, penalty='l1')
    clf2 = MultinomialNB(alpha=1130)
    clf3 = GaussianNB()
    clf4 = KNeighborsClassifier(n_neighbors=450)
    clf5 = ExtraTreesClassifier(random_state=seed,
                                criterion='gini',
                                n_estimators=1000,
                                max_features=5)
    clf6 = QuadraticDiscriminantAnalysis()
    eclf = VotingClassifier(estimators=[('LR', clf1), ('NBM', clf2),
                                        ('NBG', clf3), ('KNN', clf4),
                                        ('ET', clf5), ('ADQ', clf6)],
                            voting='hard')

    # Algorithms being compared
    models = []

    models.append(
        ('RL', LogisticRegression(random_state=seed, C=625, penalty='l1')))
    models.append(('ADL', LinearDiscriminantAnalysis()))
    models.append(('ADQ', QuadraticDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier(n_neighbors=450)))
    models.append(('NBG', GaussianNB()))
    models.append(('NBM', MultinomialNB(alpha=1130)))
    models.append(('SVML', SVC(random_state=seed, kernel='linear', C=0.1)))
    models.append(
        ('SVMR', SVC(random_state=seed, kernel='rbf', C=1, gamma=0.0001)))
    models.append(('RF',
                   RandomForestClassifier(random_state=seed,
                                          criterion='entropy',
                                          n_estimators=1000,
                                          max_features=5)))
    models.append(('ET',
                   ExtraTreesClassifier(random_state=seed,
                                        criterion='gini',
                                        n_estimators=1000,
                                        max_features=5)))
    models.append(('ENS', eclf))

    # loop that evaluates each algorithm
    score = 'accuracy'
    results1 = []
    names1 = []
    mean1 = []
    std1 = []

    for name, model in models:
        kfold = model_selection.RepeatedStratifiedKFold(n_splits=splits,
                                                        n_repeats=repeats,
                                                        random_state=seed)
        cv_results = model_selection.cross_val_score(model,
                                                     x,
                                                     y,
                                                     cv=kfold,
                                                     scoring=score)
        results1.append(cv_results)
        names1.append(name)
        mean1.append(cv_results.mean() * 100)
        std1.append(cv_results.std() * 100)
        msg = "%s: %f (%f)" % (name, cv_results.mean() * 100,
                               cv_results.std() * 100)
        print(msg)

    list_results_acc = list(zip(names1, results1))
    print(list_results_acc)
    df_results_acc = pd.DataFrame(list_results_acc)
    if part_ign == 3:
        df_results_acc.to_csv('df_results_acc_3.csv', sep=';')
    if part_ign == 10:
        df_results_acc.to_csv('df_results_acc_10.csv', sep=';')
    if part_ign == 19:
        df_results_acc.to_csv('df_results_acc_19.csv', sep=';')

    if score == 'accuracy':
        list_acc = list(zip(names1, mean1, std1))
        df_acc = pd.DataFrame(list_acc)
        if part_ign == 3:
            df_acc.to_csv('df_acc_3.csv', sep=';')
        if part_ign == 10:
            df_acc.to_csv('df_acc_10.csv', sep=';')
        if part_ign == 19:
            df_acc.to_csv('df_acc_19.csv', sep=';')

    # classifiers for the ensemble
    clf1 = LogisticRegression(random_state=seed, C=625, penalty='l1')
    clf2 = MultinomialNB(alpha=15)
    clf3 = GaussianNB()
    clf4 = KNeighborsClassifier(n_neighbors=10)
    clf5 = ExtraTreesClassifier(random_state=seed,
                                criterion='entropy',
                                n_estimators=1000,
                                max_features=17)
    clf6 = QuadraticDiscriminantAnalysis()
    eclf = VotingClassifier(estimators=[('LR', clf1), ('NBM', clf2),
                                        ('NBG', clf3), ('KNN', clf4),
                                        ('ET', clf5), ('ADQ', clf6)],
                            voting='hard')

    models = []

    models.append(
        ('RL', LogisticRegression(random_state=seed, C=625, penalty='l1')))
    models.append(('ADL', LinearDiscriminantAnalysis()))
    models.append(('ADQ', QuadraticDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier(n_neighbors=10)))
    models.append(('NBG', GaussianNB()))
    models.append(('NBM', MultinomialNB(alpha=15)))
    models.append(('SVML', SVC(random_state=seed, kernel='linear', C=10)))
    models.append(
        ('SVMR', SVC(random_state=seed, kernel='rbf', C=10, gamma=0.001)))
    models.append(('RF',
                   RandomForestClassifier(random_state=seed,
                                          criterion='gini',
                                          n_estimators=1000,
                                          max_features=17)))
    models.append(('ET',
                   ExtraTreesClassifier(random_state=seed,
                                        criterion='entropy',
                                        n_estimators=1000,
                                        max_features=17)))
    models.append(('ENS', eclf))

    # loop that evaluates each algorithm
    score = 'f1_macro'
    results2 = []
    names2 = []
    mean2 = []
    std2 = []

    for name, model in models:
        kfold = model_selection.RepeatedStratifiedKFold(n_splits=splits,
                                                        n_repeats=repeats,
                                                        random_state=seed)
        cv_results = model_selection.cross_val_score(model,
                                                     x,
                                                     y,
                                                     cv=kfold,
                                                     scoring=score)
        results2.append(cv_results)
        names2.append(name)
        mean2.append(cv_results.mean() * 100)
        std2.append(cv_results.std() * 100)
        msg = "%s: %f (%f)" % (name, cv_results.mean() * 100,
                               cv_results.std() * 100)
        print(msg)

    list_results_f1 = list(zip(names2, results2))
    print(list_results_f1)
    df_results_f1 = pd.DataFrame(list_results_f1)
    if part_ign == 3:
        df_results_f1.to_csv('df_results_f1_3.csv', sep=';')
    if part_ign == 10:
        df_results_f1.to_csv('df_results_f1_10.csv', sep=';')
    if part_ign == 19:
        df_results_f1.to_csv('df_results_f1_19.csv', sep=';')

    if score == 'f1_macro':
        list_f1 = list(zip(names2, mean2, std2))
        df_f1 = pd.DataFrame(list_f1)
        if part_ign == 3:
            df_f1.to_csv('df_f1_3.csv', sep=';')
        if part_ign == 10:
            df_f1.to_csv('df_f1_10.csv', sep=';')
        if part_ign == 19:
            df_f1.to_csv('df_f1_19.csv', sep=';')

    # plotting the chart
    fig = plt.figure(figsize=(15, 5))
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    plt.subplot(211)
    plt.boxplot(results1)
    ax1.set_xticklabels(names1, fontsize=14)
    plt.ylabel('Acurácia', fontsize=18)
    plt.xlabel('(a)', fontsize=18)
    plt.yticks(rotation='horizontal', fontsize=14)
    plt.axhline(y=0.4656, xmin=0, xmax=1, color='g')
    plt.axhline(y=0.5024, xmin=0, xmax=1, color='b')
    plt.subplot(212)
    plt.xlabel('(b)\nClassificadores', fontsize=18)
    plt.boxplot(results2)
    plt.ylabel('F1-score', fontsize=18)
    ax2.set_xticklabels(names2, fontsize=14)
    plt.yticks(rotation='horizontal', fontsize=14)
    ax2.annotate(
        'RL = Regressao Logistica\nADL = Analise Discr. Linear\n\
ADQ = Analise Discr. Quadratica\nKNN = K-Nearest Neighbors\n\
NBG = Naive Bayes Gaussiano\nNBM = Naive Bayes Multinomial\n\
SVML = SVM Linear\nSVMR = SVM kernel rbf\nRF = Random Forest\n\
ET = Extra Trees',

        # The point that we'll place the text in relation to
        xy=(1.01, 0.5),
        # Interpret the x as axes coords, and the y as figure coords
        xycoords=('axes fraction', 'figure fraction'),

        # The distance from the point that the text will be at
        xytext=(0, 0),
        # Interpret `xytext` as an offset in points...
        textcoords='offset points',

        # Any other text parameters we'd like
        size=12,
        ha='left',
        va='center')
    plt.subplot(212)
    plt.show()
Example No. 33
from sklearn.datasets import fetch_20newsgroups  # import the 20 newsgroups data fetcher from sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer  # import the text feature vectorization module from sklearn.feature_extraction.text
from sklearn.naive_bayes import MultinomialNB  # import the naive Bayes model from sklearn.naive_bayes
from sklearn.metrics import classification_report

# 1. Data acquisition
news = fetch_20newsgroups(subset='all')
print(len(news.data))  # print the number of documents: 18846

# 2. Preprocessing: train/test split and text feature vectorization
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25,
    random_state=33)  # randomly sample 25% of the data as the test set
print(X_train[0])  # inspect a training sample
print(y_train[0:100])  # inspect the labels

# text feature vectorization
vec = CountVectorizer()
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

# 3. Train the naive Bayes model
mnb = MultinomialNB()  # initialize naive Bayes with the default configuration
mnb.fit(X_train, y_train)  # estimate the model parameters from the training data
y_predict = mnb.predict(X_test)  # predict on the test set

# 4. Report the results
print('The Accuracy of Naive Bayes Classifier is:', mnb.score(X_test, y_test))
print(classification_report(y_test, y_predict, target_names=news.target_names))
Example No. 34
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = [
    "K Nearest Neighbors", "Decision Tree", "Random Forest",
    "Logistic Regression", "SGD Classifier", "Naive Bayes", "SVM Linear"
]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    print("{} Accuracy: {}".format(name, accuracy))

# ### Building the VotingClassifier for Ensemble Modelling

# In[20]:
Example No. 35
    The binomial model is useful if your feature vectors are binary (i.e. zeros and ones).
    One application would be text classification with ‘bag of words’ model where the 1s & 0s are
    “word occurs in the document” and “word does not occur in the document” respectively.

Refs:
http://cpmarkchang.logdown.com/posts/193470-natural-language-processing-naive-bayes-classifier
https://www.analyticsvidhya.com/blog/2017/09/naive-bayes-explained/
"""
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
gnb = GaussianNB()
mnb = MultinomialNB()
y_pred_gnb = gnb.fit(x_train, y_train).predict(x_test)
cnf_matrix_gnb = confusion_matrix(y_test, y_pred_gnb)
print(cnf_matrix_gnb)

y_pred_mnb = mnb.fit(x_train, y_train).predict(x_test)
cnf_matrix_mnb = confusion_matrix(y_test, y_pred_mnb)
print(cnf_matrix_mnb)
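Tying back to the binomial model described in the docstring above, here is a minimal sketch of BernoulliNB on binary bag-of-words features (toy corpus; the output comment is the likely prediction, not a guarantee):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB

docs = ["the cat sat", "the dog barked", "the cat meowed"]  # toy corpus
labels = [0, 1, 0]

# binary=True yields 1/0 "word occurs / does not occur" features,
# matching the binomial model described above
vec = CountVectorizer(binary=True)
X = vec.fit_transform(docs)

bnb = BernoulliNB().fit(X, labels)
print(bnb.predict(vec.transform(["the cat purred"])))  # likely [0]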
classifier_NB = GaussianNB()
classifier_NB.fit(X_train1, Y_train1)
pred_NB_train = classifier_NB.predict(X_train1)
np.mean(pred_NB_train == Y_train1)
pred_NB_test = classifier_NB.predict(X_test1)
np.mean(pred_NB_test == Y_test1)
# Train Accuracy NB = 85.74
# Test Accuracy NB = 68.91


classifier_MNB = MultinomialNB()
classifier_MNB.fit(X_train1, Y_train1)
pred_MNB_train = classifier_MNB.predict(X_train1)
np.mean(pred_MNB_train == Y_train1)
pred_MNB_test = classifier_MNB.predict(X_test1)
np.mean(pred_MNB_test == Y_test1)
# Train Accuracy MNB = 84.52
# Test Accuracy MNB = 83.99


classifier_DT = DecisionTreeClassifier(criterion="entropy", random_state=0)
classifier_DT.fit(X_train1, Y_train1)
pred_DT_train = classifier_DT.predict(X_train1)
np.mean(pred_DT_train == Y_train1)
pred_DT_test = classifier_DT.predict(X_test1)
training_set = featuresets[:1900]

testing_set = featuresets[1900:]

#classifier = nltk.NaiveBayesClassifier.train(training_set)

classifier_f = open("naive_bayes.picke", "rb")

classifier = pickle.load(classifier_f)

classifier_f.close()

print("Accuracy :", (nltk.classify.accuracy(classifier, testing_set)))

### Multinomial Naive Bayes
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier:", (nltk.classify.accuracy(MNB_classifier, testing_set)))

##### Gaussian Naive Bayes
##Gaussian_NB_classifier = SklearnClassifier(GaussianNB())
##Gaussian_NB_classifier.train(training_set)
##print("GNB_classfier:", (nltk.classify.accuracy(Gaussian_NB_classifier, testing_set)))

### Bernoulli Naive Bayes
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BNB_classfier:",
      (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)))

#LogisticRegression, SGDClassifier
def trainNaiveBayes(trainFeatures, trainLabels):
    clf = make_pipeline(DictVectorizer(sparse=False), MultinomialNB())
    scores = cross_val_score(clf, trainFeatures, trainLabels, cv=5)
    clf.fit(trainFeatures, trainLabels)
    return clf, scores.mean()
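A hypothetical call with token-count dictionaries; at least five samples per class are used so the internal 5-fold stratified cross-validation can split (all data is toy):

spam = [{'free': 2, 'win': 1}, {'win': 2}, {'free': 1}, {'free': 3, 'win': 1}, {'win': 1}]
ham = [{'meeting': 1}, {'agenda': 2}, {'meeting': 2, 'agenda': 1}, {'notes': 1}, {'agenda': 1}]
clf, mean_score = trainNaiveBayes(spam + ham, ['spam'] * 5 + ['ham'] * 5)
print(mean_score, clf.predict([{'win': 3, 'free': 1}]))  # e.g. ['spam']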
else:
    dt_clf = joblib.load('DTmodel.pkl') 
#test dt classifier
preds = dt_clf.predict(X_test)
cm = confusion_matrix(Y_test, preds)
print(cm)
print('\n')
print(classification_report(Y_test, preds))
#plot_roc_curve(dt_clf,X_test,Y_test)
plt.figure()
plot_confusion_matrix(cm, classes=['negative', 'positive'], normalize=True, title='Normalized confusion matrix - Decision Tree')
plt.show()

#train naive bayes classifier
nb_flag = 0 #if 1, train model from scratch and dump - if 0, load dumped model
nb = MultinomialNB()
if nb_flag:
    nb_clf = nb.fit(X_train, Y_train)
    joblib.dump(nb_clf, 'NBmodel.pkl')
else:
    nb_clf = joblib.load('NBmodel.pkl')
#test nb classifier
preds = nb_clf.predict(X_test)
cm = confusion_matrix(Y_test, preds)
print(cm)
print('\n')
print(classification_report(Y_test, preds))
#plot_roc_curve(nb_clf,X_test,Y_test)
plt.figure()
plot_confusion_matrix(cm, classes=['negative', 'positive'], normalize=True, title='Normalized confusion matrix - Naive Bayes')
plt.show()
Example No. 40
plt.tight_layout(pad=0)
plt.show()

from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(emails["filtered_text"],
                                                    emails["spam"],
                                                    test_size=0.2,
                                                    random_state=10)

# Bag of words with naive bayes
count_vectorizer = CountVectorizer()
count_vectorizer.fit(train_X)
X_train_df = count_vectorizer.transform(train_X)
X_test_df = count_vectorizer.transform(test_X)

classifier = MultinomialNB(alpha=1.8)
classifier.fit(X_train_df, train_y)
pred = classifier.predict(X_test_df)
print(accuracy_score(test_y, pred))

# TF-IDF with naive bayes

tf = TfidfVectorizer()
tf.fit(train_X)
tfidf_train_X = tf.transform(train_X)
tfidf_test_X = tf.transform(test_X)

classifier = MultinomialNB(alpha=0.04)
classifier.fit(tfidf_train_X, train_y)
pred = classifier.predict(tfidf_test_X)
print(accuracy_score(test_y, pred))
Example No. 41
##'''10. Split the dataset into training data and testing data with train_test_split function
##Note: parameters test_size=0.33, random_state=42'''

#X_train, X_test, y_train, y_test = train_test_split(df['message'],df['label'],random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=42)
#print(df.shape)
#print(X_train.shape)
#print(X_test.shape)

#11. Initialise multimimial_naive_bayes classifier
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()

#12.Fit the training data with labels in Naive Bayes classifier 'clf'
"""
cv=CountVectorizer(stop_words='english')
training_data=cv.fit_transform(X_train)
testing_data=cv.transform(X_test)
clf.fit(training_data,y_train)
"""
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
#
#from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
#
#print(accuracy_score(y_test,predictions))
#print(precision_score(y_test,predictions))
#loading the input csv data into the pandas dataframe
public_griv_df = pd.read_csv("pg_complete_set_1.csv",
                         engine='python',error_bad_lines=False)

#print(public_griv_df.columns)

y = public_griv_df.org_name
X = public_griv_df.subject_content

#splitting the data into training and testing purposes
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

#pipeline for Naive Bayes algorithm
txt_clf_NB = Pipeline([('vect', CountVectorizer()),
                       ('clf', MultinomialNB())])

#pipeline for SVM classifier algorithm
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                      ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, random_state=42))
 ])


#fitting and measuring accuracy for SVM model
text_clf_svm.fit(X_train, y_train)

#joblib.dump(text_clf_svm, 'model.pkl')
#text_clf_svm = joblib.load('model.pkl')
#print("Model dumped!")
Example No. 43
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.base import TransformerMixin

from joblib import dump

df = pd.read_csv('FA-KES-Dataset.csv', encoding='latin1')
df.drop_duplicates(keep=False, inplace=True)
df['text'] = df['article_title'] + ' ' + df['article_content']

X = df["text"].values
y = df["labels"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=345)

nb = make_pipeline(CountVectorizer(binary=True), MultinomialNB())

nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print(classification_report(y_test, y_pred))
nb.fit(X, y)

dump(nb, "clf.joblib")
Example No. 44
    tmp_score = line[0].strip('\"')

    if int(tmp_score) < 2:
        #negative
        train_score.append(0)
        train_text.append(line[5])

    elif int(tmp_score) > 2:
        #positive
        train_score.append(1)
        train_text.append(line[5])
    else:
        continue

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
text_clf.fit(train_text, train_score)

# Have to adjust for company name
base = len('Tesla')

filelist = os.listdir()
for item in filelist:
    if item[-4:] == '.txt':
        year = item[base + 1:base + 5]
        month = dict[item[base + 6:base + 9]]
        day = item[base + 10:-4]
        date = year + '-' + month + '-' + day
        output_dict[date] = []

        pred_file = open(item, "r", encoding="utf-8")
Example No. 45
# split up the data
df_train, df_test, Ytrain, Ytest = train_test_split(df['data'],
                                                    Y,
                                                    test_size=0.33)

# try multiple ways of calculating features
tfidf = TfidfVectorizer(decode_error='ignore')
Xtrain = tfidf.fit_transform(df_train)
Xtest = tfidf.transform(df_test)

# count_vectorizer = CountVectorizer(decode_error='ignore')
# Xtrain = count_vectorizer.fit_transform(df_train)
# Xtest = count_vectorizer.transform(df_test)

# create the model, train it, print scores
model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))

# exit()


# visualize the data
def visualize(label):
    words = ''
    for msg in df[df['labels'] == label]['data']:
        msg = msg.lower()
        words += msg + ' '
    wordcloud = WordCloud(width=600, height=400).generate(words)
    plt.imshow(wordcloud)
Example No. 46
from sklearn.model_selection import train_test_split as tts
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Code starts here

X_train, X_val, y_train, y_val = tts(X, y, test_size=0.3, random_state=42)

log_reg = LogisticRegression(random_state=0)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_val)
log_accuracy = accuracy_score(y_val, y_pred)
print(log_accuracy)

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_val)
nb_accuracy = accuracy_score(y_val, y_pred)
print(nb_accuracy)

lsvm = LinearSVC(random_state=0)
lsvm.fit(X_train, y_train)
y_pred = lsvm.predict(X_val)
lsvm_accuracy = accuracy_score(y_val, y_pred)
print(lsvm_accuracy)



# --------------
# path_test : Location of test data
Example No. 47
pid_test = random.sample(list(pid), 10)

df_train = df[df['product_id'].isin(pid_train)]
df_test = df[df['product_id'].isin(pid_test)]
#print(df_train)

# Setting up Bag of Words Model
count_vect = CountVectorizer()
desc_train = df_train['desc']
X_train_counts = count_vect.fit_transform(list(desc_train))
print(X_train_counts.shape)
print(count_vect.vocabulary_.get('images'))

# Fitting tdidf vectorization
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# applying Multinominal Classifier on feature vectors obtained
clf = MultinomialNB().fit(X_train_tfidf, list(df_train['Category_ID']))

# testing the model; we only need transform(), since the global term weights were already learned from the training corpus
desc_test = df_test['desc']
pid_test = df_test['product_id']
X_test_counts = count_vect.transform(list(desc_test))
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

predicted = clf.predict(X_test_tfidf)

for i, category in zip(pid_test, predicted):
    print("{} => {}".format(i, category))
def classifier_analysis(X, label, methodType):
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import ShuffleSplit
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    #rng = None
    rng = np.random.RandomState(1)

    if methodType == 0:
        # random forest
        from sklearn.ensemble import RandomForestClassifier
        classifier = RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2,
                                            min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto',
                                            max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
                                            bootstrap=True, oob_score=False, n_jobs=n_jobs, random_state=rng, verbose=0,
                                            warm_start=False, class_weight=None)

        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [5, 10, 20],
            'classifier__max_depth': [None, 10, 5, 3],
            'classifier__max_features': ['auto', 10, 5]
        }
    elif methodType == 1:
        # adaboost
        from sklearn.ensemble import AdaBoostClassifier
        classifier = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=rng)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [5, 10, 20],
            'classifier__learning_rate': [0.8, 0.9, 1.0]
        }
    elif methodType == 2:
        # GBC
        from sklearn.ensemble import GradientBoostingClassifier
        classifier = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0,
                                                criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
                                                min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
                                                min_impurity_split=None, init=None, random_state=rng, max_features=None,
                                                verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__n_estimators': [50, 100, 150],
            'classifier__max_depth': [None, 10, 5, 3],
            'classifier__learning_rate': [0.8, 0.9, 1.0]
        }
    elif methodType == 3:
        # logistic regression
        from sklearn.linear_model import LogisticRegression
        classifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                                        intercept_scaling=1, class_weight=None, random_state=rng, solver='saga',
                                        max_iter=100, multi_class='multinomial', verbose=0, warm_start=False, n_jobs=n_jobs)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 4:
        # SVM
        from sklearn.svm import SVC
        classifier = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False,
                         tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1,
                         decision_function_shape='ovr', random_state=rng)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 5:
        # MLP
        from sklearn.neural_network import MLPClassifier
        classifier = MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001,
                                   batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5,
                                   max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False,
                                   warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False,
                                   validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__hidden_layer_sizes': [(100, ), (50, ), (20, )],
            'classifier__learning_rate_init': [0.0001, 0.001, 0.01]
        }
    elif methodType == 6:
        # linear SVM
        from sklearn.svm import LinearSVC
        classifier = LinearSVC(penalty='l2', loss='squared_hinge', dual=False, tol=0.0001, C=1.0, multi_class='ovr',
                               fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=rng,
                               max_iter=1000)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.9, 1.0, 1.1]
        }
    elif methodType == 7:
        # Bernoulli Naive Bayes
        from sklearn.naive_bayes import BernoulliNB
        classifier = BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None)
        param_grid = {
            'filter__threshold': [0.95, 0.97, 0.99],
            'classifier__alpha': [0.90, 0.95, 1.0],
            'classifier__fit_prior': [True, False]
        }
    elif methodType == 8:
        # multinomial Naive Bayes
        from sklearn.naive_bayes import MultinomialNB
        classifier = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
        param_grid = {
            'classifier__alpha': [0.90, 0.95, 1.0],
            'classifier__fit_prior': [True, False]
        }
    else:
        return

    if methodType == 8:
        pipe = Pipeline([
            ('classifier', classifier)
        ])
    else:
        pipe = Pipeline([
            ('scale', StandardScaler()),
            ('filter', FilterSimu()),
            ('classifier', classifier)
        ])


    grid = GridSearchCV(pipe, cv=ShuffleSplit(n_splits=4, test_size=0.25, random_state=rng), n_jobs=1, param_grid=param_grid)
    grid.fit(X, label)
    best_estimator = grid.best_estimator_

    #mean_scores = np.array(grid.cv_results_['mean_test_score'])
    #mean_tscores = np.array(grid.cv_results_['mean_train_score'])
    #print(mean_scores)
    #print(mean_tscores)

    print(grid.best_params_)
    score = grid.best_score_
    #print(grid.cv_results_['params'])

    return best_estimator, grid.predict(X), score
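
# Usage sketch with synthetic data. methodType=8 (multinomial naive Bayes) is
# chosen because its pipeline needs neither FilterSimu nor n_jobs, which are
# assumed to be defined elsewhere in this project:
X_demo = np.random.RandomState(0).randint(0, 5, size=(60, 12))
y_demo = np.random.RandomState(1).randint(0, 2, size=60)
best_nb, train_pred, cv_score = classifier_analysis(X_demo, y_demo, methodType=8)
print(cv_score)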
Ejemplo n.º 49
0
def NB_create_model():
    # Collect the post titles
    text_list = []

    for page_num in range(0, 50):
        # number of pages can be adjusted
        url = 'http://guba.eastmoney.com/list,gssz,f_' + \
              str(page_num) + '.html'
        stockPageRequest = requests.get(url, headers=headers)
        htmlTitleContent = stockPageRequest.text

        resp = Selector(text=htmlTitleContent)
        nodes = resp.xpath(
            '//div[contains(@class,"articleh normal_post") or contains(@class,"articleh normal_post odd")]'
        )

        # itemstemp = re.findall(pattern, content)
        for index, item in enumerate(nodes):

            view = item.xpath('./span[@class="l1 a1"]/text()').extract_first()
            comment_count = item.xpath(
                './span[@class="l2 a2"]/text()').extract_first()
            title = item.xpath(
                './span[@class="l3 a3"]/a/text()').extract_first()
            author = item.xpath(
                './span[@class="l4 a4"]/a/text()').extract_first()
            create_time = item.xpath(
                './span[@class="l5 a5"]/text()').extract_first()
            # parse month and day from the post date
            date_pattern = re.search(r'(\d+)-(\d+)', create_time)

            month = sub_zero(date_pattern.group(1))

            day = sub_zero(date_pattern.group(2))

            seg_list = list(jieba.cut(title, cut_all=False))
            seg_str = " ".join(seg_list)
            text_list.append(seg_str)

    text_list = np.array(text_list)  # array of segmented titles

    # Label each text by keyword matching
    class_vec = [' '] * len(text_list)  # list of the same length

    for i in range(0, len(text_list)):
        for pos in positiveWord:
            if pos in text_list[i]:
                class_vec[i] = '积极'  # positive
        for neg in negativeWord:
            if neg in text_list[i]:
                class_vec[i] = '消极'  # negative
        for neu in neutralWord:
            if neu in text_list[i]:
                class_vec[i] = '中立'  # neutral
        if class_vec[i] == ' ':
            class_vec[i] = '无立场'  # no stance

        print(class_vec[i])
    # Convert the words in the texts into a term-frequency matrix; element
    # a[i][j] is the frequency of term j in document i
    vectorizer = CountVectorizer()
    # This transformer computes the tf-idf weight of every term
    transformer = TfidfTransformer()
    # The inner fit_transform builds the term-frequency matrix; the outer one computes tf-idf
    tfidf = transformer.fit_transform(vectorizer.fit_transform(text_list))

    # Build the classifier
    clf = MultinomialNB()
    clf.fit(tfidf, class_vec)

    # Persist the fitted model and the vectorizers
    joblib.dump(clf, 'Clf_v1.pkl')
    joblib.dump(vectorizer, 'Vect_v1')
    joblib.dump(transformer, 'Tf-Idf_v1')
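
# Loading sketch (filenames taken from the dump calls above; the input title is
# a hypothetical placeholder, segmented with jieba like the training titles):
clf_loaded = joblib.load('Clf_v1.pkl')
vect_loaded = joblib.load('Vect_v1')
tfidf_loaded = joblib.load('Tf-Idf_v1')
new_title = " ".join(jieba.cut("公司业绩大涨", cut_all=False))
print(clf_loaded.predict(tfidf_loaded.transform(vect_loaded.transform([new_title]))))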
Ejemplo n.º 50
0
Archivo: ngrams.py Proyecto: rsotoc/nlp
#featuresets_bigrams = [
#    document_features_ngrams(nltk.FreqDist(d), bigrams_frq)
#    for d in movies_reviews["bigrams"]]
#featuresets_trigrams = [
#    document_features_ngrams(nltk.FreqDist(d), trigrams_frq)
#    for d in movies_reviews["trigrams"]]
elapsed_time = time.time() - start_time

#for i in range(100):
#    print(sum(x > 0 for x in featuresets_bigrams[i]))

bigrams_train, bigrams_test, biy_train, biy_test = train_test_split(
    featuresets_bigrams, Sentiments, test_size=0.1)

# Training a multinomial naive Bayes classifier
clfM = MultinomialNB()
clfM.fit(bigrams_train, biy_train)
print(elapsed_time)

# Testing the classifier
predictions_train = clfM.predict(bigrams_train)
fails_train = sum(biy_train != predictions_train)
print(
    "Misclassified points in the training set: {} of {} ({}%)\n"
    .format(fails_train, len(bigrams_train),
            100 * fails_train / len(bigrams_train)))
predictions_test = clfM.predict(bigrams_test)
fails_test = sum(biy_test != predictions_test)
print("Misclassified points in the test set: {} of {} ({}%)\n".
      format(fails_test, len(bigrams_test),
             100 * fails_test / len(bigrams_test)))
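
# The manual error counts above are the complement of sklearn's accuracy
# metric; a one-line cross-check with the same variables:
from sklearn.metrics import accuracy_score
print("Test accuracy: {:.3f}".format(accuracy_score(biy_test, predictions_test)))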
Ejemplo n.º 51
0
spams = []
for s in range(len(class1['TEXT'])):
    spams.append('Спам')  # "Spam"
class1['CLASS'] = spams

hams = []
for s in range(len(class2['TEXT'])):
    hams.append('Не спам')  # "Not spam"
class2['CLASS'] = hams

class3 = pd.concat([class2, class1])  # DataFrame.append was removed in pandas 2.0

count_vector = CountVectorizer()
result = count_vector.fit_transform(class3['TEXT'].values)

BinClass = MultinomialNB()
objects = class3['CLASS'].values

BinClass.fit(result, objects)

print('Введіть дані:')  # "Enter the data:"
input_string = [input()]
count_input = count_vector.transform(input_string)
answers = BinClass.predict(count_input)

print("Введенні дані являються:", str(answers))

#Experiments:

#Words from the first class
exp1 = ['Юридичний']  # "Legal"
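
# Sketch of running the experiment (assumed continuation; exp1 is defined but
# never used in the original snippet):
print(BinClass.predict(count_vector.transform(exp1)))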
Ejemplo n.º 52
0

if __name__ == '__main__':
    np.random.seed(1337)
    unigrams = utils.top_n_words(FREQ_DIST_FILE, UNIGRAM_SIZE)
    if USE_BIGRAMS:
        bigrams = utils.top_n_bigrams(BI_FREQ_DIST_FILE, BIGRAM_SIZE)
    tweets = process_tweets(TRAIN_PROCESSED_FILE, test_file=False)
    if TRAIN:
        train_tweets, val_tweets = utils.split_data(tweets)
    else:
        random.shuffle(tweets)
        train_tweets = tweets
    del tweets
    print('Extracting features & training batches')
    clf = MultinomialNB()
    batch_size = len(train_tweets)
    i = 1
    n_train_batches = int(np.ceil(len(train_tweets) / float(batch_size)))
    for training_set_X, training_set_y in extract_features(train_tweets, test_file=False, feat_type=FEAT_TYPE, batch_size=batch_size):
        utils.write_status(i, n_train_batches)
        i += 1
        if FEAT_TYPE == 'frequency':
            tfidf = apply_tf_idf(training_set_X)
            training_set_X = tfidf.transform(training_set_X)
        clf.partial_fit(training_set_X, training_set_y, classes=[0, 1, 2, 3, 4])
    print('\n')
    print('Testing')
    if TRAIN:
        correct, total = 0, len(val_tweets)
        i = 1
Ejemplo n.º 53
0
dados.append([1, 0, 1])
dados.append([1, 0, 1])
dados.append([1, 0, 0])
dados.append([1, 1, 0])
dados.append([1, 1, 1])
dados.append([1, 1, 0])
dados.append([0, 1, 0])
dados.append([0, 1, 1])
dados.append([1, 1, 1])
dados.append([1, 1, 0])
dados.append([0, 1, 0])
dados.append([0, 1, 1])

marcacoes = ([1]*10) + ([0]*6)

modelo = MultinomialNB()
modelo.fit(dados, marcacoes)

_1cervejeiro  = [1, 1, 1]
_2cervejeiro  = [1, 0, 0]
_1leiteiro    = [0, 1, 1]
_2leiteiro    = [0, 1, 0]

dados_teste = [_1cervejeiro, _2cervejeiro, _1leiteiro, _2leiteiro]
marcacoes_teste = [1, 1, 0, 0]

resultado = modelo.predict(dados_teste)
diferencas = resultado - marcacoes_teste

acertos = [d for d in diferencas if d == 0]
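
# Hit-rate sketch (assumed continuation of the comparison above):
taxa_de_acerto = 100.0 * len(acertos) / len(marcacoes_teste)
print(taxa_de_acerto)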
Ejemplo n.º 54
0
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import pickle

#Importing the cleaned file containing the text and label
news = pd.read_csv('news.csv')
X = news['text']
y = news['label']

#Splitting the data into train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Creating a pipeline that first creates bag of words(after applying stopwords) & then applies Multinomial Naive Bayes model
pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                     ('nbmodel', MultinomialNB())])

#Training our data
pipeline.fit(X_train, y_train)

#Predicting the label for the test data
pred = pipeline.predict(X_test)

#Checking the performance of our model
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

#Serialising the file
with open('model.pickle', 'wb') as handle:
    pickle.dump(pipeline, handle, protocol=pickle.HIGHEST_PROTOCOL)
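
# Loading sketch: the pickled pipeline bundles the vectoriser and the model, so
# a single load is enough to classify raw text (the example string is hypothetical):
with open('model.pickle', 'rb') as handle:
    loaded_pipeline = pickle.load(handle)
print(loaded_pipeline.predict(['Breaking: markets rally after surprise rate cut']))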
Ejemplo n.º 55
0
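# compute_sample_weight('balanced') weights each sample inversely to its class
# frequency, so the minority toxicity class is not swamped during fitting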
samples_weight = compute_sample_weight('balanced', train_df['toxicity'])
print(train_df.head(5))

count_vect = CountVectorizer(stop_words='english')
tfidf_transformer = TfidfTransformer()

train_df['comment_text'] = train_df['comment_text'].astype('U')
test_df['comment_text'] = test_df['comment_text'].astype('U')

X_train_counts = count_vect.fit_transform(train_df['comment_text'].values)
print(X_train_counts.shape)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

clf = MultinomialNB().fit(X_train_tfidf,
                          train_df['toxicity'],
                          sample_weight=samples_weight)

X_new_counts = count_vect.transform(test_df['comment_text'])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)
print(Counter(train_df['toxicity']))
# sklearn metrics expect (y_true, y_pred)
print(accuracy_score(test_df['toxicity'], predicted))
print(f1_score(test_df['toxicity'], predicted))
print(predicted.shape)

print(classification_report(test_df['toxicity'], predicted))

# For a threshold of 0.5:
# (1799564, 318216)
Ejemplo n.º 56
0
train_tc = count_vectorizer.fit_transform(training_data.data)
print('\nDimensions of training data:', train_tc.shape)

# Create the tf-idf transformer
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)

# Define test data
input_data = [
    'You need to be careful with cars when you are driving on slippery roads',
    'A lot of devices can be operated wirelessly',
    'Players need to be careful when they are close to goal posts',
    'Political debates help us understand the perspectives of both sides'
]

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB().fit(train_tfidf, training_data.target)

# Transform input data using count vectorizer
input_tc = count_vectorizer.transform(input_data)

# Transform vectorized data using tfidf transformer
input_tfidf = tfidf.transform(input_tc)

# Predict the output categories
predictions = classifier.predict(input_tfidf)

# Print the outputs
for sent, category in zip(input_data, predictions):
    print('\nInput:', sent, '\nPredicted category:',
          category_map[training_data.target_names[category]])
Ejemplo n.º 57
0
# count_f = CountVectorizer(max_features=1000)
# x_train_bow_f = count_f.fit_transform(X_train)
# # count_test_f = CountVectorizer(max_features=1000)
# x_test_bow_f = count_f.transform(X_test)

# all words considered
count = CountVectorizer(lowercase=False, token_pattern='[A-Za-z0-9#@_$%]{2,}')
x_train_bow = count.fit_transform(X_train)
# count_test = CountVectorizer()
test_bow = count.transform(X_test)

# # model takes the most frequent 1000 words
# clf = MultinomialNB()
# train_model_f = clf.fit(x_train_bow_f, y_train)
# predict_and_test(train_model_f, x_test_bow_f)

# model takes all words considered
clf = MultinomialNB(alpha=1)
model = clf.fit(x_train_bow, y_train)
predicted_y = model.predict(test_bow)

f = open("output.txt", 'a')
for i in range(0, len(test_id)):
    f.write(str(test_id[i]))
    f.write(' ')
    f.write(predicted_y[i])
    f.write('\n')
f.close()

# predict_and_test(train_model, x_test_bow)
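
# predict_and_test is called above but never defined in this snippet; a
# plausible sketch of such a helper (an assumption, relying on a global y_test):
def predict_and_test(model, X_bow):
    from sklearn.metrics import accuracy_score
    predicted = model.predict(X_bow)
    print("Accuracy: {:.3f}".format(accuracy_score(y_test, predicted)))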
Ejemplo n.º 58
0
print(len(content_train))

# To extract useful features from noise reduction data, we extract bag of words model features from the text
vectorizer = CountVectorizer(
    analyzer='word',  # tokenize by words
    ngram_range=(1, 4),  # use word n-grams of sizes 1 through 4
    max_features=20000)  # keep the most common 20000 n-grams
vectorizer.fit(content_train)


def get_features(content):
    return vectorizer.transform(content)


# import classifier and train data
classifier = MultinomialNB()
classifier.fit(vectorizer.transform(content_train), tag_train)
classifier.score(vectorizer.transform(content_test), tag_test)
""" cross verification part """


# StratifiedKFold is a more reliable cross-validation scheme here because it
# keeps the class proportions of every fold close to those of the full sample
def stratified_k_fold(content,
                      tag,
                      classifier_class,
                      shuffle=True,
                      n_splits=5,
                      **kwargs):
    sk_fold = StratifiedKFold(n_splits=n_splits, shuffle=shuffle)
    tag_prediction = tag[:]
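    # Sketch of the usual continuation (an assumption: `tag` is a numpy array
    # and a fresh classifier is fitted per fold):
    for train_index, test_index in sk_fold.split(content, tag):
        clf = classifier_class(**kwargs)
        clf.fit(content[train_index], tag[train_index])
        tag_prediction[test_index] = clf.predict(content[test_index])
    return tag_prediction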
Ejemplo n.º 59
0
def multinomial_nb_cl(params):
    cl = MultinomialNB(**params)
    return cl


def makeClassifierBayes(tfidf, result, alpha=1.0):
    clf = MultinomialNB(alpha=alpha).fit(tfidf, result)
    return clf