Example No. 1
class Sentiment:
    def __init__(self):
        self.stop_words = stopwords.words() + list(string.punctuation)
        self.tfid = TfidfVectorizer()
        self.clf = MultinomialNB()
        # score: 0.7225
        # self.clf = SVC()

    # create pipelines
    # clean the input
    def fit(self, X, Y):
        self.X = X
        self.Y = Y
        n = 4000
        print "Fitting ", n, " samples..."
        words = [word_tokenize(unicodedata.normalize('NFKD', x.decode("utf-8")).encode('ascii', 'ignore')) for x in X[:n]]
        processed_words = [" ".join(w for w in s if w not in self.stop_words) for s in words]
        X_train = self.tfid.fit_transform(processed_words)
        Y_train = Y[:n]
        print "Classifier created"
        self.clf.fit(X_train, Y_train)

    def predict(self, X_inp):
        X_inp = unicodedata.normalize('NFKD', X_inp.decode("utf-8")).encode('ascii','ignore')
        word_list = " ".join(w    for w in word_tokenize(X_inp.lower())    if w not in self.stop_words)
        X_test = self.tfid.transform([word_list])
        return self.clf.predict(X_test)
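The "# create pipelines" comment above suggests the same clean / TF-IDF / MultinomialNB flow can be wrapped in a scikit-learn Pipeline. The following is only a minimal sketch with made-up texts and labels, not code from this example.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# hypothetical pipeline equivalent of the Sentiment class above
sentiment_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("clf", MultinomialNB()),
])

texts = ["great movie, loved it", "terrible plot, waste of time"]   # made-up sample data
labels = ["pos", "neg"]
sentiment_pipeline.fit(texts, labels)
print(sentiment_pipeline.predict(["what a great movie"]))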
Example No. 2
def main():
    """loads data, trains model, tests model

    Inputs:
        file: binary file containing sparse numpy array with text features
        file: binary file containing pandas dataframe with training labels

    Outputs:
        print: classification report of classifier performance

    """

    # Load training labels and text features
    chdir("../pickles")
    with open("word_counts.pkl", "rb") as f:
        X = pickle.load(f)
    with open("training_labels.pkl", "rb") as f:
        y = pickle.load(f)
        y = np.ravel(y["sponsored"])

    # Create train and test splits
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # Create and train model
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(classification_report(y_test, y_pred))
Example No. 3
class NBTest(unittest.TestCase):

    def setUp(self):
        self.mnb = NaiveBayes(multinomial=True)
        self.skmnb = MultinomialNB()
        self.bnb = NaiveBayes(bernoulli=True)
        self.skbnb = BernoulliNB()
        self.cnb = NaiveBayes(multinomial=True, cnb=True)
        self.wcnb = NaiveBayes(multinomial=True, wcnb=True)

    def test_count_vectorized(self):
        self.mnb.fit(X_count, train_targets)
        self.skmnb.fit(X_count, train_targets)
        self.assertEqual(self.mnb.score(X_count_test,test_targets),self.skmnb.score(X_count_test,test_targets))

    def test_tfidf_vectorized(self):
        self.mnb.fit(X_tfidf, train_targets)
        self.skmnb.fit(X_tfidf, train_targets)
        self.assertEqual(self.mnb.score(X_tfidf_test, test_targets), self.skmnb.score(X_tfidf_test, test_targets))

    def test_cnb(self):
        self.cnb.fit(X_count, train_targets)
        self.mnb.fit(X_count, train_targets)
        cnb_score = self.cnb.score(X_count_test, test_targets)
        mnb_score = self.mnb.score(X_count_test, test_targets)
        print "CNB: {},   MNB: {}".format(cnb_score, mnb_score)
        assert (cnb_score - mnb_score) > -0.1  

    def test_wcnb(self):
        self.wcnb.fit(X_count, train_targets)
        self.mnb.fit(X_count, train_targets)
        wcnb_score = self.wcnb.score(X_count_test, test_targets)
        mnb_score = self.mnb.score(X_count_test, test_targets)
        print "WCNB: {},   MNB: {}".format(wcnb_score, mnb_score)
        assert (wcnb_score - mnb_score) > -0.5  
Example No. 4
def classifier():
    nb = MultinomialNB(alpha=0)
    nb.fit(DOC_TRAIN, CLASS_TRAIN)
    db = DB()
    query = 'select cate_id, tf, url, content from site_content_3'

    cursor = db.cursor()
    logger.info(query)
    cursor.execute(query)
    rows = cursor.fetchall()
    for row in rows:
        currentCateId = row['cate_id']
        print 'rowID => ', row['cate_id'];
        url = row['url']
        tf = row['tf']
        content = row['content']
        termFrequencyDict = {}
        # continue

        try:
            termFrequencyDict = json.loads(tf)
        except:
            print 'error => ', url
            continue

        testItem = np.array([])
        for word in termFrequencyDict:
            tf = termFrequencyDict[word]
            if word in WORDS:
                testItem = np.append(testItem, [tf])
            else:
                testItem = np.append(testItem, [0])

        print "CURRENT CATE ", currentCateId
        print "NEW ", nb.predict(testItem.reshape(1, -1))
Example No. 5
def crossValidate(X_dataset,y):
#cross validate model
    num_folds = 5
    kfold = cross_validation.StratifiedKFold(y, n_folds=num_folds, shuffle=True)

   # kfold=KFold(X.shape[0],n_folds=10, shuffle=True)
    avg_accuracy=0
    avg_precision=0
    avg_recall=0
    print "----------- cross_validation k=5"
    for train,test in kfold:
        Xtrain,Xtest,ytrain,ytest=X_dataset[train],X_dataset[test],y[train],y[test]
        
#        clf=LinearSVC()
        clf=MultinomialNB(alpha=0.1)
#        clf=LDA()
        clf.fit(Xtrain.toarray(),ytrain)
        ypred=clf.predict(Xtest.toarray())
        accuracy=metrics.accuracy_score(ytest,ypred)              
#        print "accuracy = ", accuracy
        avg_accuracy+=accuracy
        precision = metrics.precision_score(ytest,ypred)
#        print("precision:   %0.3f" % precision)
        avg_precision+=precision
        recall = metrics.recall_score(ytest,ypred)
#        print("recall:   %0.3f" % recall)
        avg_recall+=recall
        
    print "Average accuracy : " , (avg_accuracy/num_folds)
    print "Average precision : " , (avg_precision/num_folds)
    print "Average recall : " , (avg_recall/num_folds)        
Example No. 6
def naive_bayes():
    nb = MultinomialNB()
    nb.fit(X_train, train_data.danger)
    nb_pred = nb.predict(X_test)
    nb_score = nb.score(X_test, y_test)
    precision, recall, _, _ = precision_recall_fscore_support(y_test, nb_pred)
    return precision, recall, str(nb_score)
Example No. 7
def MultinomialNBClassify_Proba(enrollment_id, trainData, trainLabel, testData):
    nbClf = MultinomialNB() # default alpha=1.0, Laplace smoothing
    # setting alpha < 1 is called Lidstone smoothing
    nbClf.fit(trainData, ravel(trainLabel))
    testLabel = nbClf.predict_proba(testData)[:,1]
    saveResult(enrollment_id, testLabel, 'Proba_sklearn_MultinomialNB_alpha=0.1_Result.csv')
    return testLabel
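For reference, a minimal sketch (synthetic counts, not this project's data) of how the alpha parameter mentioned in the comments is passed: alpha=1.0 is Laplace smoothing, alpha < 1 is Lidstone smoothing.

import numpy as np
from sklearn.naive_bayes import MultinomialNB

X = np.array([[2, 1, 0], [0, 1, 3], [1, 0, 2]])   # toy count matrix
y = np.array([0, 1, 1])

laplace = MultinomialNB(alpha=1.0).fit(X, y)    # Laplace smoothing
lidstone = MultinomialNB(alpha=0.1).fit(X, y)   # Lidstone smoothing
print(laplace.predict_proba([[1, 1, 1]]))
print(lidstone.predict_proba([[1, 1, 1]]))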
Example No. 8
class TrainNaiveBayes:

    def __init__(self, all_features, neu_labels):
        """
        Trains a classifier using Naive Bayes
        """
        self._num_features = len(all_features.values()[0])

        self._X = numpy.zeros((1, self._num_features))          # Feature matrix
        self._Y = numpy.array([0])                        # Label vector
        for user_id in neu_labels.keys():
            self._X = numpy.append(self._X, [all_features[user_id]], axis=0)
            self._Y = numpy.append(self._Y, [neu_labels[user_id]])
        self._X = numpy.delete(self._X, 0, 0)           # Delete the first row (contains all 0s)
        self._Y = numpy.delete(self._Y, 0)

        print "Using MultinomialNB"
        self._model = MultinomialNB()
        print cross_validation.cross_val_score(self._model, self._X, self._Y, cv=10, scoring='f1')

        self._model.fit(self._X, self._Y)

    def predict(self, features):
        A = numpy.zeros((1, self._num_features))
        for user_id in features.keys():
            A = numpy.append(A, [features[user_id]], axis=0)
        A = numpy.delete(A, 0, 0)
        return self._model.predict(A)
Example No. 9
def train(good_sources, bad_sources,method,naive_bayes=None,keywords=list()):
    #train the algorithm
    good_samples = find_keywords(' '.join([entry[method] for entry in good_sources]))
    bad_samples = find_keywords(' '.join([entry[method] for entry in bad_sources]))


    #if we have an existing knowledge base to append this new information to, do so
    if naive_bayes:
        new_kws = set(good_samples+bad_samples)
        print('Using old keywords as well')
        print("# old keywords = {}\n # new keywords = {}".format(len(keywords),len(new_kws)))
        new_kws = set(good_samples+bad_samples).difference(keywords)
        print("# fresh keywords = {}\n".format(len(new_kws)))

        #make some call to naive_bayes.partial_fit in here
        X = np.concatenate((naive_bayes.feature_count_, np.zeros((naive_bayes.feature_count_.shape[0],len(new_kws)))),1)
        all_kw = keywords + list(new_kws)

    else:
        print('Only using keywords from this content set')
        all_kw = list(set(good_samples+bad_samples))
        X = np.zeros((2,len(all_kw)))

    for j,kw in enumerate(all_kw):
        X[0,j] += good_samples.count(kw)
        X[1,j] += bad_samples.count(kw)

    y = ['good','bad']

    naive_bayes = MultinomialNB()
    naive_bayes.fit(X,y)

    return naive_bayes, all_kw
Example No. 10
class Sentiment:
    def __init__(self):
        self.stop_words = stopwords.words() + list(string.punctuation)
        self.tfid = TfidfVectorizer()
        self.clf = MultinomialNB()

        # score: 0.7225
        # self.clf = SVC()

    # create pipelines
    # clean the input
    def fit(self, X, Y):
        self.X = X
        self.Y = Y
        # give the subset of dataset to be trained
        l = 0
        h = 4000
        words = [word_tokenize(x.decode("utf-8").lower()) for x in X[l:h]]
        processed_words = [" ".join(w for w in s if w not in self.stop_words) for s in words]
        X_train = self.tfid.fit_transform(processed_words)
        Y_train = Y[l:h]
        self.clf.fit(X_train, Y_train)
        print "Classes: ", self.clf.classes_
        print "Score: ", self.clf.score(X_train, Y_train)

    def predict(self, X_inp):
        word_list = " ".join(w for w in word_tokenize(X_inp.decode("utf-8").lower()) if w not in self.stop_words)
        X_test = self.tfid.transform([word_list])
        return self.clf.predict(X_test)
Example No. 11
	def run_naivebayes_evaluation(self, inputdata, outputdata, k):
		""" Fit Naive Bayes Classification on train set with cross validation. 
		Run Naive Bayes Classification on test set. Return results
		"""

		###print "** Fitting Naive Bayes classifier.."

		# Cross validation
		cv = cross_validation.KFold(inputdata.shape[0], n_folds=k, indices=True)
		cv_naivebayes = []
		f1_scores = []
		for traincv, testcv in cv:

			clf_cv = MultinomialNB()
			clf_cv.fit(inputdata[traincv], outputdata[traincv])

			y_pred_cv = clf_cv.predict(inputdata[testcv])

			f1 = metrics.f1_score(outputdata[testcv], y_pred_cv, pos_label=0)
			f1_scores.append(f1)

		
		#TODO: NEEDED? self.classifier = clf_cv
		print "score average: %s" + str(np.mean(f1_scores))

		average_score =np.mean(f1_scores)
		tuples = (average_score, f1_scores)

		return (tuples, 'N.A.', 'N.A.')
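The manual KFold loop above can also be expressed with cross_val_score; a minimal sketch on synthetic data (not the original inputdata/outputdata):

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

rng = np.random.RandomState(0)
X = rng.randint(5, size=(100, 20))
y = rng.randint(2, size=100)

f1_scores = cross_val_score(MultinomialNB(), X, y, cv=5, scoring="f1")
print("score average: " + str(f1_scores.mean()))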
Example No. 12
def test_sklearn_nb(balanced):
    movie_words = process_plots_mp(balanced)

    training_movies = [movie_words[i] for i in range(len(movie_words)) if i % 3 != 0]
    test_movies = [movie_words[i] for i in range(len(movie_words)) if i % 3 == 0]

    vec = DictVectorizer()
    training_features = vec.fit_transform([movie.wordcounts for movie in training_movies]).toarray()
    training_labels = np.array([movie.year for movie in training_movies])
    #LOGGER.debug("Original size of feature vectors: %d (issparse: %s)" % (
        #csr_matrix(training_features[-1]).toarray().size, str(issparse(training_features))
    #))

    mnb_classifier = MultinomialNB()
    mnb_classifier.fit(training_features, training_labels)

    test_features = vec.transform([movie.wordcounts for movie in test_movies])
    test_labels = np.array([movie.year for movie in test_movies])

    results = mnb_classifier.predict(test_features)

    correct = sum([1 for i, result in enumerate(results) if result == test_labels[i]])
    LOGGER.info("skleanrn's MultinomialNB classifier predicted %d/%d correctly (%0.3f%% accuracy)" % (
        correct, len(test_labels), correct / len(test_labels) * 100
    ))
Example No. 13
def predict(cur, plyr_id, game_plyrs): 
  #creates training set (called 'X') for plyr
  all_plyrs = all_player_ids(cur) #np.array - all NFL players (and coaches)
  games = games_played_in(cur, plyr_id) #np.array - the games_ids the player played in
  n_cols = all_plyrs.shape[0] #int 
  m_rows = games.shape[0] #int
  w = weights(games)
  zeros = np.zeros((m_rows, n_cols)) #2darr - used to initialize DF
  X = pd.DataFrame(zeros, index=games, columns=all_plyrs) #dataframe
  populate_training_set(cur, X, games, plyr_id)
  #print "X: ", X.values
  
  ###run coaches_model and then im here### 
  #creates vector of known output values
  Y = training_output_vector(cur, games, plyr_id) #good
  #print "(len) Y: ", len(Y), Y
  test_zeros = np.zeros((1, n_cols)) #2darr - used to initialize the test DF
  test_X = pd.DataFrame(test_zeros, columns=all_plyrs) #dataframe
  update_training_matrix(cur, game_plyrs, 0, test_X)
  
  #run Multinomial NB Classifier
  nb_clf = MultinomialNB()
  
  if len(X.values) == 0:
    return 0
  nb_clf.fit(X, Y, sample_weight=w)
  nb_predictions = nb_clf.predict(test_X)
  #print "test_X: ", test_X.values
  nb_norm_prob = normalize_probs(nb_clf.predict_proba(test_X)[0])
  avgs = [3,8,12.5,17,21,25]
  #print "probs: ", nb_norm_prob
  #print avgs
  ev = expected_val(nb_norm_prob, avgs) #can also calc dot product
  return round(ev,1)
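As the trailing comment notes, the expected-value step is just a dot product; a tiny standalone sketch with made-up probabilities and the bucket averages used above:

import numpy as np

probs = np.array([0.1, 0.2, 0.3, 0.2, 0.1, 0.1])   # made-up normalized class probabilities
avgs = np.array([3, 8, 12.5, 17, 21, 25])          # bucket averages from the function above
print(round(float(np.dot(probs, avgs)), 1))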
Example No. 14
def run_k_fold_cross_validation_experiment(dataset):
    logger.info("Starting %d-fold cross-validation...", len(dataset))

    clf_sklearn = MultinomialNB()
    clf = MultinomialBayesEstimator()

    sklearn_scores = create_scores_collector()
    scores = create_scores_collector()

    for train_set, test_set in split_train_test_k_fold(dataset):
        X_train, y_train, X_test, y_test = make_test_train(train_set, test_set)

        # Sklearn
        clf_sklearn.fit(X_train, y_train.ravel())
        predictions = clf_sklearn.predict(X_test)
        sklearn_scores.append_scores(y_test, predictions)

        # Our bayes without ngrams
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        scores.append_scores(y_test, predictions)

    logger.info("%d-fold cross validation finished", len(dataset))
    log_scores(sklearn_scores, "Sklearn")
    log_scores(scores, "MBE")
Example No. 15
def run_learning_curves_experiment(dataset):
    logger.info("Now starting experiment with learning curves...")
    scores = []
    sklearn_scores = []
    train_sizes = []

    clf = MultinomialBayesEstimator()
    sklearn_clf = MultinomialNB()
    # Constructing confidence intervals using empiric bootstrap
    intervals = []
    for test_size in xrange(1, len(dataset)):
        f_scores = []
        f_scores_sklearn = []
        for train_set, test_set in split_train_test_p_out(dataset, test_size):
            train_set, test_set = split_train_test(dataset, test_size)
            X_train, y_train, X_test, y_test = make_test_train(train_set, test_set)
            clf.fit(X_train, y_train)
            f_scores.append(f1_score(y_test, clf.predict(X_test)))
            sklearn_clf.fit(X_train, y_train.ravel())
            f_scores_sklearn.append(f1_score(y_test, sklearn_clf.predict(X_test)))
        intervals.append(calculate_confidence_interval(f_scores))
        scores.append(np.mean(f_scores))
        sklearn_scores.append(np.mean(f_scores_sklearn))
        train_sizes.append(len(dataset) - test_size)

    plot_learning_curves(train_sizes, sklearn_scores, scores, intervals)
Example No. 16
def MultinomialNBClassify(trainData, trainLabel, testData):
    nbClf = MultinomialNB(alpha=0.1) # default alpha=1.0, Laplace smoothing
    # setting alpha < 1 is called Lidstone smoothing
    nbClf.fit(trainData, ravel(trainLabel))
    testLabel = nbClf.predict(testData)
    saveResult(testLabel, 'sklearn_MultinomialNB_alpha=0.1_Result.csv')
    return testLabel
Example No. 17
    def train(self, data):
        launches = map(lambda x: x['application'], data)

        # first-order predictor: previous launch -> next launch
        nb1 = MultinomialNB()
        instances = map(lambda i: {'lu1': launches[i-1]}, xrange(1, len(launches)))
        X = self.vectorizer.fit_transform(instances).toarray()
        y = launches[1:]
        self.lu1_predictor = nb1.fit(X, y)

        # second-order predictor: needs its own estimator, otherwise the
        # second fit would overwrite the first model
        nb2 = MultinomialNB()
        instances = map(lambda i: {'lu2': launches[i-2]}, xrange(2, len(launches)))
        X = self.vectorizer.fit_transform(instances).toarray()
        y = launches[2:]
        self.lu2_predictor = nb2.fit(X, y)

        # tune mu
        max_hr = 0
        best_mu = 0
        for mu in map(lambda x: x/10.0, xrange(11)):
            self.mu = mu
            predictions = map(lambda i: self.predict({'lu1': launches[i-1], 'lu2': launches[i-2]}), \
                xrange(2, len(launches)))
            hr, mrr = self.test(launches[2:], predictions)
            if hr > max_hr:
                max_hr = hr
                best_mu = mu
        self.mu = best_mu
Example No. 18
class NaiveBayes:
	def __init__(self):
		self.clf = MultinomialNB()
		self.pattern ='(?u)\\b[A-Za-z]{3,}'
		self.tfidf = TfidfVectorizer(sublinear_tf=False, use_idf=True, smooth_idf=True, stop_words='english', token_pattern=self.pattern, ngram_range=(2,2))

	def train(self,fileName):
		print "Naive Bayes classifier is being trained"
		table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
		X_train = self.tfidf.fit_transform(table.message)
		Y_train = []
		for item in table.cat:
			Y_train.append(int(item)) 
		self.clf.fit(X_train, Y_train)
		print "Naive Bayes classifier has been trained"

	def classify(self,cFileName, rFileName):
		table = pandas.read_table(cFileName, names=["message"])
		X_test = self.tfidf.transform(table.message)
		print "Data have been classified"
		with open(rFileName,'w') as f:
			for item in self.clf.predict(X_test).astype(str):
				f.write(item+'\n')

	def validate(self,fileName):
		table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
		X_validate = self.tfidf.transform(table.message)
		Y_validated = self.clf.predict(X_validate).astype(str)
		totalNum = len(table.cat)
		errorCount = 0
		for i in range(0,totalNum):
			if int(table.cat[i])!=int(Y_validated[i]):
				errorCount += 1
		print "Data have been validated! Precision={}".format((totalNum-errorCount)/float(totalNum))
Example No. 19
def main():
    print('Reading in data file...')
    data = pd.read_csv(path + 'Sentiment Analysis Dataset.csv',
                       usecols=['Sentiment', 'SentimentText'], error_bad_lines=False)

    print('Pre-processing tweet text...')
    corpus = data['SentimentText']
    vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode',
                                 stop_words='english', tokenizer=tokenize)
    X = vectorizer.fit_transform(corpus.values)
    y = data['Sentiment'].values

    print('Training sentiment classification model...')
    classifier = MultinomialNB()
    classifier.fit(X, y)

    print('Training word2vec model...')
    corpus = corpus.map(lambda x: tokenize(x))
    word2vec = Word2Vec(corpus.tolist(), size=100, window=4, min_count=10, workers=4)
    word2vec.init_sims(replace=True)

    print('Fitting PCA transform...')
    word_vectors = [word2vec[word] for word in word2vec.vocab]
    pca = PCA(n_components=2)
    pca.fit(word_vectors)

    print('Saving artifacts to disk...')
    joblib.dump(vectorizer, path + 'vectorizer.pkl')
    joblib.dump(classifier, path + 'classifier.pkl')
    joblib.dump(pca, path + 'pca.pkl')
    word2vec.save(path + 'word2vec.pkl')

    print('Process complete.')
Example No. 20
def bcluster(corpus_path, cluster_fn):
    folds = KFold(article_count, n_folds=10, shuffle=True)

    results = []

    for i, (train_idx, test_idx) in enumerate(folds):
        logging.info("Running fold %d" % i)
        vect = BrownClusterVectorizer(cluster_fn)
        x_train = vect.fit_transform(ArticleSequence(corpus_path, indices=train_idx))

        bin = LabelEncoder()
        y_train = bin.fit_transform(GroupSequence(corpus_path, indices=train_idx))

        x_test = vect.transform(ArticleSequence(corpus_path, indices=test_idx))
        y_test = bin.transform(GroupSequence(corpus_path, indices=test_idx))

        model = MultinomialNB()
        model.fit(x_train, y_train)
        pred = model.predict(x_test)

        score = accuracy_score(y_test, pred)
        logging.info("Completed fold %d with score %.04f" % (i, score))
        results.append(score)

    return results
Example No. 21
def naive_classify_unknown(X_train, y_train, vectorizer):
    client = pymongo.MongoClient("localhost", 27017)
    db = client.tweets
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    test_users = db.tweets.distinct('user.screen_name')
    classify_users(clf, vectorizer, test_users, load_users(db, test_users))
Example No. 22
def find_best_vectorizor(vectorizer, grid):
  dg = DataGatherer()
  y_test = dg.validate_target
  y_train = dg.labeled_target

  nb = MultinomialNB()
  header_printed = False
  best_params = None
  best_score = -1
  for param in IterGrid(grid):
    if not header_printed:
      print(str(",".join(param.keys())) + ",Score")
    header_printed = True
    vectorizer.set_params(**param)
    X_train = vectorizer.fit_transform(dg.labeled_data)    
    X_test = vectorizer.transform(dg.validate_data)
    nb.fit(X_train, y_train)
    score = nb.score(X_test, y_test)
    if score > best_score:
      best_score = score
      best_params = param
    print(str(",".join(map(str, param.values()))) + "," + str(score))
  print("")
  print("Best params: " + str(best_params))
  print("Best score: " + str(best_score))
Example No. 23
    def train(self):
        '''
        ## -- How to predict -- ##
            query = "blah blah"
            q = list2vec(hashit(q)) 
            clf2 = joblib.load('nb')
            print(clf2.predict(q)) # <--- returns type id
        '''

        limit = self.comment_limit
        sqls = ["SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=1 ORDER BY time DESC LIMIT " + str(limit),
            "SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=2 ORDER BY time DESC LIMIT " + str(limit),
            "SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=3 ORDER BY time DESC LIMIT " + str(limit)]

        print "training model"
        comments = self.sql2list(sqls)
        x, y = self.featureMatrix(comments)
        X = list2Vec(x)
        Y = list2Vec(y)

        q = "Let's talk about food."
        q_vec = list2Vec(hashit(q))

        ## Predicting
        print "Classifying"
        clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
        clf.fit(X, Y)
        joblib.dump(clf, self.path, compress=9)
Example No. 24
def train_classifiers(X_data, y_data):
    ############ Linear SVM: 0.908 #############
    clf_LSVM = svm.SVC(kernel = 'linear')
    clf_LSVM.fit(X_data, y_data)
    
    ############ MultinomialNB: 0.875 #############
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)
    
    ############ Random Forest: 0.910 #############
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)
    
    ############ Extra Tree: 0.915 ##################
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_ETC.fit(X_data, y_data)
    
    ############ AdaBoost: 0.88 ##################
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)
    
    ############ rbf SVM: 0.895 #############
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)
    
    ############ GradientBoosting: 0.88 #############
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)
    
    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC    
Example No. 25
def naive_bayes(x_value, y_value):
    X = x_value
    y = y_value

    #train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)

    vect = CountVectorizer()
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)

    X_test_dtm = vect.transform(X_test)

    from sklearn.naive_bayes import MultinomialNB
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    
    print 'Accuracy: '
    print metrics.accuracy_score(y_test, y_pred_class)
    
    print 'Null Accuracy: '
    print y_test.value_counts().head(1) / len(y_test)
    
    print 'Confusion Matrix: '
    print metrics.confusion_matrix(y_test, y_pred_class)
def nb(x_train,x_test,y_train,doc_app_id,id_name_dict):
	clf = MultinomialNB(alpha=0.01)
	clf.fit(x_train,y_train)
	pred = clf.predict(x_test)
	for i in range(len(pred)):
		app_id = doc_app_id[i]
		print id_name_dict[app_id]+" "+str(pred[i])
Example No. 27
def trainNB(xTrain, yTrain):

    classifier = MultinomialNB()

    classifier.fit(xTrain, yTrain)

    return classifier
Example No. 28
def multinomialNB(devMatrix, trainMatrix, devtarget, traintarget):
	f = open('MNNB2.log', 'a')
	f.write("Making model!!!!!")
	print 'Making model!'
	clf = MultinomialNB(alpha=1, fit_prior=False)
	clf.fit(trainMatrix, traintarget)
	f.write("\n")
	value = ('Model: multinomial bayes with parameters ',clf.get_params(False))
	print (str(value))
	f.write(str(value))
	f.write("\n")
	f.write("MSE for train: %.2f" % np.mean((clf.predict(trainMatrix) - traintarget) ** 2))
	score = clf.score(trainMatrix, traintarget)
	f.write("\n")
	value = 'Score for train %.2f' % score
	f.write(str(value))
	f.write("\n")
	f.write("MSE for dev: %.2f" % np.mean((clf.predict(devMatrix) - devtarget) ** 2))
	score = clf.score(devMatrix, devtarget)
	value = 'Score for dev %.2f' % score
	print(str(value))
	f.write("\n")
	s = str(value)
	f.write(s)
	f.write("\n")
	f.write('model done')
	f.write("\n")
	f.write("\n")
	f.close()
	return score
Example No. 29
def do_lda(x, y, folds):
    indexes = list(range(len(x)))
    shuffle(indexes)
    x = list(x[i] for i in indexes)
    y = list(y[i] for i in indexes)
    fold_size = len(x) / folds
    corrects = []
    for fold in range(folds):
        test_x = []
        train_x = []
        test_y = []
        train_y = []
        for i in range(len(x)):
            fold_index = i / fold_size
            if fold == fold_index:
                test_x.append(x[i])
                test_y.append(y[i])
            else:
                train_x.append(x[i])
                train_y.append(y[i])
        print 'Partitioned data into fold'
        test_x, train_x = remove_redundant_dimensions(test_x, train_x)
        print 'Removed redundant dimensions'
        nb = MultinomialNB()
        nb.fit(train_x, train_y)
        print 'Fit NB'
        predictions = nb.predict(test_x)
#        lda = LDA()
#        lda.fit(train_x, train_y)
#        print 'Fit lda'
#        predictions = lda.predict(test_x)
        correct = sum(1 for i in range(len(predictions)) if predictions[i] == test_y[i])
        print 'Did fold, correct:', correct
        corrects.append(correct)
    return corrects
Example No. 30
def plain_word_counts(corpus_path):
    folds = KFold(article_count, n_folds=10, shuffle=True)

    results = []

    for i, (train_idx, test_idx) in enumerate(folds):
        logging.info("Running fold %d" % i)
        vect = CountVectorizer(max_features=1000, decode_error='ignore', strip_accents='unicode')
        x_train = vect.fit_transform(ArticleSequence(corpus_path, indices=train_idx))

        bin = LabelEncoder()
        y_train = bin.fit_transform(GroupSequence(corpus_path, indices=train_idx))

        x_test = vect.transform(ArticleSequence(corpus_path, indices=test_idx))
        y_test = bin.transform(GroupSequence(corpus_path, indices=test_idx))

        model = MultinomialNB()
        model.fit(x_train, y_train)
        pred = model.predict(x_test)

        score = accuracy_score(y_test, pred)
        logging.info("Completed fold %d with score %.04f" % (i, score))
        results.append(score)

    return results
def word_classification():
    X, y = get_features_and_labels()
    model = MultinomialNB()
    model.fit(X, y)
    score = cross_val_score(model, X, y, cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=0))
    return score
Example No. 32
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

voice_data = pd.read_csv('voice.csv')
voice_data = voice_data[['meanfun', 'IQR', 'Q25', 'label']]
x = voice_data.iloc[:, :-1]
y = voice_data.iloc[:, -1]
y = LabelEncoder().fit_transform(y)
imp = SimpleImputer(missing_values=0, strategy='mean')
x = imp.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

predictionrate = []

mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_predict = mnb.predict(x_test)
print('MultinomialNB accuracy:', mnb.score(x_test, y_test))
print(classification_report(y_test, y_predict))
predictionrate.append(mnb.score(x_test, y_test))

gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_predict = gnb.predict(x_test)
print('GaussianNB accuracy:', gnb.score(x_test, y_test))
print(classification_report(y_test, y_predict))
predictionrate.append(gnb.score(x_test, y_test))

scaler1 = StandardScaler()
scaler1.fit(x_train)
x_train = scaler1.transform(x_train)
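MultinomialNB expects non-negative, count-like features, so for continuous columns a rescaling step into [0, 1] is often applied first (as the later Min-Max example does); a minimal sketch with synthetic data, not this voice dataset:

import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB

rng = np.random.RandomState(0)
X = rng.normal(size=(50, 3))     # continuous, possibly negative features
y = rng.randint(2, size=50)

X_scaled = MinMaxScaler().fit_transform(X)   # rescaled into [0, 1]
print(MultinomialNB().fit(X_scaled, y).score(X_scaled, y))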
Example No. 33
word = vectorizer.get_feature_names()
for n in word[:10]:
    print(n)
print("单词数量:", len(word))

#extract the tf-idf matrix: element w[i][j] is the tf-idf weight of word j in class-i text
X = coo_matrix(tfidf, dtype=np.float32).toarray()  #sparse matrix
print(X.shape)
print(X[:10])

X_train = X[:len(train_labels)]
X_test = X[len(train_labels):]
y_train = train_labels
y_test = test_labels
print(len(X_train), len(X_test), len(y_train), len(y_test))

#-----------------------------------------------------------------------------
#classification model
clf = MultinomialNB()
#clf = svm.LinearSVC()
#clf = LogisticRegression(solver='liblinear')
#clf = RandomForestClassifier(n_estimators=10)
#clf = neighbors.KNeighborsClassifier(n_neighbors=7)
#clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
print('Model accuracy: {}'.format(clf.score(X_test, y_test)))
pre = clf.predict(X_test)
print("分类")
print(len(pre), len(y_test))
print(classification_report(y_test, pre, digits=4))
Example No. 34
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = .5)


# tree classifier algorithm
clf = tree.DecisionTreeClassifier() # calling the decision tree classifier

# Naive Bayes classifier algorithm
from sklearn.naive_bayes import MultinomialNB # import the multinomial Naive Bayes classifier
nb_clf = MultinomialNB()


# --- Trying one hot encoder ------
enc = OneHotEncoder(categorical_features=[0, 2, 3, 4, 5]) # one-hot encoder specifying the categorical attributes
enc.fit(x) # fit the encoder to the data
clf.fit(enc.transform(x_train), y_train) # train the decision tree on the encoded data
nb_clf.fit(enc.transform(x_train), y_train) # Naive Bayes - multinomial model

# prediction
predictions = clf.predict(enc.transform(x_test))
prediction_NB = nb_clf.predict(enc.transform(x_test))


# Accuracy
from sklearn.metrics import accuracy_score # import accuracy score functionality
print 'Accuracy tree encoded data prediction', accuracy_score(y_test, predictions)
print 'Accuracy Multinomial NB data prediction', accuracy_score(y_test, prediction_NB)

# Learning Curve plot
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import numpy as np
Example No. 35
############################################################
with open("E:/AB104/AlgorithmTest/Jieba_Booking.json", 'r') as a:
    data = json.load(a)

data = DataFrame(data)
classifier = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(data['comments'].values,
                                                    data['mark'].values,
                                                    test_size=0)

targets = y_train
# print len(targets) #241221
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(X_train)
# print len(X_train) #241221
classifier.fit(counts, targets)

############################################################
##                 Saving the detection results           ##
############################################################
commList_Jieba_marked = []
for i in commList_Jieba:
    commList_Jieba_marked_dict = {}
    examples = [i["comments"]]
    # print i["comments"]
    example_counts = count_vectorizer.transform(examples)
    predictions = classifier.predict(example_counts)
    commList_Jieba_marked_dict["mark"] = predictions.tolist()
    # print predictions
    commList_Jieba_marked_dict["comments"] = [i["comments"]]
    commList_Jieba_marked_dict["hotel"] = [i["hotel"]]
Example No. 36
    15: "world"
}

trainNews = pd.read_csv("./data/train.csv")
testNews = pd.read_csv("./data/test.csv")
xTrain = trainNews['text']
yTrain = trainNews['label']

tfidf = vect.fit(xTrain.values.astype('U'))
xTrainvect = vect.fit_transform(xTrain)
yTrainvect = yTrain
xTestvect = vect.transform(testNews['text'])
yTestvect = testNews['label']

model = MultinomialNB(alpha=0.01, fit_prior=True)
model.fit(xTrainvect, yTrainvect)

ypred = model.predict(xTestvect)
score = accuracy_score(yTestvect, ypred)
print("Accuracy: ", score)
pickle.dump(
    model,
    open(
        "/Nepali-NLP/Nepali-News-Classification/models/news_classifier_model.pickle",
        'wb'))
pickle.dump(
    tfidf,
    open(
        "/Nepali-NLP/Nepali-News-Classification/models/news_vectorizer.pickle",
        "wb"))
####TEST#####
from texts import text_counter, text_training
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

intercepted_text = "I love my China."

text_counts = text_counter.transform([intercepted_text])

text_classifier = MultinomialNB()

text_labels = [0] * 1000 + [1] * 1000

text_classifier.fit(text_training, text_labels)

final_pos = text_classifier.predict_proba(text_counts)[0][1]

final_neg = text_classifier.predict_proba(text_counts)[0][0]

if final_pos > final_neg:
    print("The text is positive.")
else:
    print("The text is negative.")
Example No. 38
#build a vocabulary from the training set
vectorizer.fit(train_M)

# a look inside
#for x, y in zip(vectorizer.get_feature_names(), vectorizer.idf_):
#	print(x, y , sep=' : ')

#transform the training set into a bag of words
bow = vectorizer.transform(train_M)

#instantiate the algorithm
algo = MultinomialNB()  #naive Bayes classifier

#train it
algo.fit(bow, train_L)
print('I am trained to predict')

#prediction time
tot = len(test_M)
err = 0
for lbl, msg in zip(test_L, test_M):
    #transform the message to be processed
    msg_bow = vectorizer.transform([msg])
    #predict
    prediction = algo.predict(msg_bow)
    #show up
    print(prediction[0], lbl, sep=' : ')
    if prediction[0] != lbl:
        err += 1
print('Failure Rate :', err, '/', tot)
Example No. 39
    # Joining the stemmed words
    dialog = ' '.join(words)

    # Creating a corpus
    corpus.append(dialog)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=10000, ngram_range=(1, 2))
X = cv.fit_transform(corpus).toarray()
y = df['genre'].values

# Creating a pickle file for the CountVectorizer
pickle.dump(cv, open('cv-transform.pkl', 'wb'))

# Model Building

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB(alpha=0.1)
nb_classifier.fit(X_train, y_train)

# Creating a pickle file for the Multinomial Naive Bayes model
filename = 'movie-genre-mnb-model.pkl'
pickle.dump(nb_classifier, open(filename, 'wb'))
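A hedged sketch of loading the two pickle files created above back for inference; the query text is made up:

import pickle

cv = pickle.load(open('cv-transform.pkl', 'rb'))
clf = pickle.load(open('movie-genre-mnb-model.pkl', 'rb'))
query = ["a detective hunts a serial killer across the city"]
print(clf.predict(cv.transform(query).toarray()))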
        count = 0
        while True:
            count += 1
            try:
                ele = next(a)
                TrainingData.append(list(ele))
                TrainingResult.append(dataLabel)
            #  print(type(TrainingData))
            #  print(type(TrainingResult))
            except StopIteration:
                print('Training ' + dataLabel + ' :' + str(count))
                break


readTrainingData('C:/Research_PatternRecognition/Data/AllC16Data.csv',
                 '16 Cell')
readTrainingData('C:/Research_PatternRecognition/Data/AllC8Data.csv', '8 Cell')
readTrainingData('C:/Research_PatternRecognition/Data/AllC4Data.csv', '4 Cell')
model = GaussianNB()
modelMultiNorm = MultinomialNB()

model.fit(np.array(TrainingData).astype(np.float), np.array(TrainingResult))
modelMultiNorm.fit(
    np.array(TrainingData).astype(np.float), np.array(TrainingResult))

print(model.predict(np.array(TestSample).astype(np.float)))
#print(model.predict_proba(np.array(TestSample).astype(np.float)))
print(TestActualResult)

print(modelMultiNorm.predict(np.array(TestSample).astype(np.float)))
print(TestActualResult)
Example No. 41
X_vec = cv.fit_transform(X_train_clean).toarray()  # training data
print(X_vec)

X_test_vect = cv.transform(X_test_clean).toarray()

print("Data vektörize tamam \n --Sonuçlar gösterilecek.")

print("Makine öğrenme algoritması çalıştırılıyor.")

from sklearn.naive_bayes import MultinomialNB
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

mn = MultinomialNB()

mn.fit(X_vec, Y_train)  # X_vec = texts / Y_train = list of 0s and 1s
print(X_vec.shape)
print(X_test_vect.shape)

Y_test_pred = mn.predict(X_test_vect)

print(Y_test_pred)

print("Naive bayes accuracy score : ",
      accuracy_score(Y_test_pred, Y_test) * 100)

print(classification_report(Y_test, Y_test_pred))

cnf_matrix = confusion_matrix(Y_test, Y_test_pred)

labels = [0, 1]
Example No. 42
    training_label_set[index + 1] = np.vstack(
        (training_label_set[index + 1], negative_set))
    new_lb_train = np.delete(new_lb_train, ran_doc_index, axis=0)
    training_label_set[index + 1] = training_label_set[index + 1][:, max_index]

    #for vector dataset
    training_data_set[index + 1] = np.vstack(
        (training_data_set[index + 1],
         getRowsFromMatrix(ran_doc_index, new_vec_lb_train)))
    new_vec_lb_train = np.delete(new_vec_lb_train, ran_doc_index, axis=0)

# create binary classifiers
binary_classifiers = []
for index in range(10):
    nb = MultinomialNB(alpha=0.01)
    nb.fit(sparse.csr_matrix(training_data_set[index + 1]),
           training_label_set[index + 1])
    binary_classifiers.append(nb)
test_binary_label = []
for row in vectorised_test_documents:
    generated_label = []
    for classifier in binary_classifiers:
        generated_label.append((classifier.predict(row))[0])
    test_binary_label.append(generated_label)

test_binary_label = np.array(test_binary_label)
#remove all other classes
all_class_index = [item for item in range(0, test_labels.shape[1])]
col_to_delete = [x for x in all_class_index if x not in index_max_class]
test_labels = np.delete(test_labels, col_to_delete, axis=1)

# print("test_binary_label",test_binary_label[:3,:])
Example No. 43
    def classify(self, document):
        train_test_vectors = self.vectorize(document)
        clf = MultinomialNB()
        clf.fit(train_test_vectors[0], self.train_labels)
        return clf.predict(train_test_vectors[1])
]]  #select features
scaler = StandardScaler()
scaler.fit(data)

#prepare training and test data
msk = np.random.rand(len(df)) < 0.7
train = data[msk]
test = data[~msk]

xtrain = np.array(train.iloc[:, 0:num_features], dtype=np.float32)
ytrain = np.array(train.iloc[:, num_features:(num_features + num_classes + 1)],
                  dtype=np.float32)
xtest = np.array(test.iloc[:, 0:num_features], dtype=np.float32)
ytest = np.array(test.iloc[:, num_features:(num_features + num_classes + 1)],
                 dtype=np.float32)

#Model
clf = MultinomialNB()
clf.fit(xtrain, ytrain)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# In[16]:

#Validation
prediction = clf.predict(xtest)
correct = 0
for i in range(len(prediction)):
    if prediction[i] == ytest[i]:
        correct += 1
print(float(correct) / len(prediction))
Example No. 45
xtrain, xtest, ytrain, ytest = train_test_split(df["text"],
                                                df["label_n"],
                                                test_size=0.20)

# Feature extraction text --CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
xtrain_count = cv.fit_transform(xtrain.values)
xtrain_count.toarray()[:3]

# Create a model
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(xtrain_count, ytrain)

emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
emails_count = cv.transform(emails)
model.predict(emails_count)

# Check accuracy
xtest_count = cv.transform(xtest)
model.score(xtest_count, ytest)

# Sklearn pipeline
from sklearn.pipeline import Pipeline
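The snippet is cut off at the Pipeline import; a minimal sketch (not the original continuation) of how the same CountVectorizer + MultinomialNB steps are typically combined into one object:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("nb", MultinomialNB()),
])
# clf.fit(xtrain, ytrain); clf.score(xtest, ytest)  # same steps, one object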
Example No. 46
        x.append(tweets_data[i]['text'])
        y.append(sent['sentiment'][i])
#print(x[0].split(" "))
#print(y[0])

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

vectorizer = CountVectorizer(stop_words='english')
train_features = vectorizer.fit_transform(x)

actual = y[:-500]

nb = MultinomialNB()
nb.fit(train_features, [int(r) for r in y])

test_features = vectorizer.transform(x[:-500])

test_try = vectorizer.transform([
    "Can we all stop treating anxiety like it's a choice and something cool to have thank you"
])
test_try2 = vectorizer.transform(["I feel like drinking alchohol"])
predict2 = nb.predict(test_try)
predict3 = nb.predict(test_try2)

#print(predict2)
predictions = nb.predict(test_features)

print()
Example No. 47
x_train, x_test, y_train, y_test = train_test_split(news.data,
                                                    news.target,
                                                    test_size=0.25,
                                                    random_state=33)

from sklearn.feature_extraction.text import CountVectorizer

counnt_vec = CountVectorizer()
x_count_train = counnt_vec.fit_transform(x_train)
x_count_test = counnt_vec.transform(x_test)

from sklearn.naive_bayes import MultinomialNB

mnb_count = MultinomialNB()
mnb_count.fit(x_count_train, y_train)
print("the accuracy of classifying 20newsgroups using Naive Bayes:",
      mnb_count.score(x_count_test, y_test))
y_count_predict = mnb_count.predict(x_count_test)
from sklearn.metrics import classification_report

print(
    classification_report(y_test,
                          y_count_predict,
                          target_names=news.target_names))

#tfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
x_tfidf_train = tfidf_vec.fit_transform(x_train)
Example No. 48
import numpy as np
import pandas as pd

data = pd.read_csv('riskcsv.csv', index_col=0)
# sample = data.sample(frac=1)
# sample.reset_index(drop=False)
# rebuild the index
# sample.reset_index(drop=True)
# save the sampled data to the 'application_train_sample.csv' file
# sample.to_csv('risk_sample.csv')

data = pd.read_csv('riskcsv.csv', index_col=0)
my_matrix = np.loadtxt(open("riskcsv.csv", "rb"), delimiter=",", skiprows=1)
# print(my_matrix)
# print(str(my_matrix))
#
x1 = my_matrix[1:15, 0:21]
print(x1)
y = my_matrix[1:15, -1]
print(y)

x2 = my_matrix[0:1, 0:21]
print(x2)

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x1, y)
print(clf.predict(my_matrix[0:1, 0:21]))
rf_dtm.shape# (402, 1337) , 1 record error titles
type(rf_dtm)# scipy.sparse.csr.csr_matrix
print(rf_dtm)# prints the locations and values of the non-zero entries (sparse storage saves memory)


rf_Ddtm = rf_dtm.toarray()# convert the sparse matrix to a dense array (D = dense)
rf_Ddtm.shape# (402, 1337)
type(rf_Ddtm)#numpy.ndarray
print(rf_Ddtm)
df_dtm = pd.DataFrame(rf_Ddtm,columns = dims)
type(df_dtm)#pandas.core.frame.DataFrame
df_dtm.shape# (402, 1337)
print(df_dtm)

mnb = MultinomialNB()# kNN was not working here; it may suit tall data (more rows than columns), while this matrix is wide
y_cl = mnb.fit(rf_Ddtm,cls)

#######################################################


test_rf = pd.read_csv('Test RR.csv',names=['RepID','RepCols'],skiprows=[0],index_col = 0)
test_rf.head(10)    
test_rf_X = rf['RepCols'] 
test_rfdtm = vect.transform(test_rf_X).toarray()
test_predict_y_cl = mnb.predict(test_rfdtm)
test_predict_y_cl.shape
test_predict_y_cl.__len__()

test_rf['pred_cl'] = test_predict_y_cl# 41 wrong predictions out of 402

#Visualizations of excel download 
def embedded_model(column, k):
    X = encoded_dataset[column]
    y = encoded_dataset['Label']

    #Splitting the dataset into X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=13)

    #MNB Model
    selected_markers = []
    mnb_accuracy = []
    for i in range(k):
        i = i + 1

        #Check point i
        #print ('Iteration with number of markers :  {}' .format(i), '\n')

        #create a combination of marker
        markers = list(combinations(column, i))

        #create a dictionary for marker and the model accuracy
        model_list = {}

        #for each combination, generate the Classifier, obtain the model accuracy
        for marker in markers:
            selected = list(marker)
            #marker_model['Marker'] = marker
            trainX = X_train[selected]
            testX = X_test[selected]

            #build the svm model using training data
            model = MultinomialNB()
            model.fit(trainX, y_train)

            #testing the model
            predictions = model.predict(testX)

            #model evaluation
            scores = cross_val_score(model, trainX, y_train, cv=5)
            #marker_model['SVC Accuracy'] = scores.mean()
            #marker_model['SVC std'] = scores.std()*2
            marker_accuracy = scores.mean()

            #store the marker evaluation score
            model_list[marker] = marker_accuracy

            #check point
            #print (model_list)

            #select the most accurate model
            optimum = max(list(model_list.values()))

            #for each combination class get the optimum combination based on max accuracy
            mark = list(model_list.keys())[list(
                model_list.values()).index(optimum)]
        selected_markers.append(mark)
        optimum = round(optimum, 2)
        mnb_accuracy.append(optimum)

    #final output
    df1 = pd.DataFrame(list(zip(selected_markers, mnb_accuracy)),
                       columns=['Markers', 'Accuracy'])

    #write the sheet as excel sheet
    df1.to_excel('Extremophile_classifier.xlsx')
Example No. 51
# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))

#---------------------------------------------------------------------------------------------------------------#
#Training and testing the "fake news" model with CountVectorizer

# Import the necessary modules
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
print(cm)

#---------------------------------------------------------------------------------------------------------------#
#Training and testing the "fake news" model with TfidfVectorizer
Example No. 52
import numpy as np
X = np.random.randint(5, size=(6, 100))
print(X)
y = np.array([1, 2, 3, 4, 5, 6])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, y)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

print(clf.predict(X[2:3]))
Example No. 53
transformer = TfidfTransformer()
tfidf_train = transformer.fit_transform(countMatrix_train)
tfidf_test = transformer.transform(countMatrix_test)

tfidf_train2 = transformer.fit_transform(countMatrix_train2)
tfidf_test2 = transformer.transform(countMatrix_test2)

print tfidf_train.shape
print tfidf_test.shape
#X_train, X_test, y_train, y_test = inst[train], inst[test], classs[train], classs[test]

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(tfidf_train, y_train)

clf_mNB = MultinomialNB()
clf_mNB.fit(tfidf_train, y_train)

clf_knn = KNeighborsClassifier()
clf_knn.fit(tfidf_train, y_train)

clf_ada = RandomForestClassifier(n_estimators=25)
clf_ada.fit(tfidf_train, y_train)

print clf_svm.score(tfidf_test, y_test)
print clf_mNB.score(tfidf_test, y_test)
print clf_knn.score(tfidf_test, y_test)
print clf_ada.score(tfidf_test, y_test)

predicted_svm = clf_svm.predict(tfidf_test)
#print np.mean(predicted_svm == y_train)
Example No. 54
test_ss_x = ss.transform(test_x)

# create the KNN classifier
knn = KNeighborsClassifier()
knn.fit(train_ss_x, train_y)
predict_y = knn.predict(test_ss_x)
print("KNN accuracy: %.4lf" % accuracy_score(predict_y, test_y))

# create the SVM classifier
svm = SVC()
svm.fit(train_ss_x, train_y)
predict_y = svm.predict(test_ss_x)
print('SVM accuracy: %0.4lf' % accuracy_score(predict_y, test_y))

# apply Min-Max normalization
mm = preprocessing.MinMaxScaler()
train_mm_x = mm.fit_transform(train_x)
test_mm_x = mm.transform(test_x)

# create the Naive Bayes classifier
mnb = MultinomialNB()
mnb.fit(train_mm_x, train_y)
predict_y = mnb.predict(test_mm_x)
print("Multinomial Naive Bayes accuracy: %.4lf" % accuracy_score(predict_y, test_y))

# create the CART decision tree classifier
dtc = DecisionTreeClassifier()
dtc.fit(train_mm_x, train_y)
predict_y = dtc.predict(test_mm_x)
print("CART decision tree accuracy: %.4lf" % accuracy_score(predict_y, test_y))
def func2():
    user = {}
    for line in fileinput.input("../../data/select/select_a"):
        mac = line.strip().split(" ")[0]
        user[mac] = True
    fileinput.close()
    cnt_0, cnt_1 = 0, 0
    docMap_1, docMap_2, docMap_3, docMap_4, classMap = {}, {}, {}, {}, {}
    for line in fileinput.input(
            "../../data/feature/trace_all_statistic_filter_feature_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if user.has_key(mac):
            if sex == 0:
                cnt_0 += 1
            if sex == 1:
                cnt_1 += 1
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_1[mac] = _list
            classMap[mac] = sex
    fileinput.close()
    print cnt_0, cnt_1
    for line in fileinput.input(
            "../../data/feature/trace_online_statistic_filter_feature_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if user.has_key(mac):
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_2[mac] = _list
    fileinput.close()
    for line in fileinput.input(
            "../../data/feature/trace_http_statistic_filter_feature_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if user.has_key(mac):
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_3[mac] = _list
    fileinput.close()
    for line in fileinput.input("../../data/feature/keywords_normalize_sex"):
        part = line.strip().split(" ")
        mac, sex, feat = part[0], int(part[1]), part[2:]
        if user.has_key(mac):
            _list = []
            for f in feat:
                _list.append(float(f))
            docMap_4[mac] = _list
    fileinput.close()
    docList_1, docList_2, docList_3, docList_4, classList = [], [], [], [], []
    # print len(user.keys()), len(docMap_1.keys()), len(docMap_2.keys()), len(docMap_3.keys()), len(docMap_4.keys())
    for k, v in user.iteritems():
        if k in docMap_1 and k in docMap_2 and k in docMap_3 and k in docMap_4 and k in classMap:
            docList_1.append(docMap_1[k])
            docList_2.append(docMap_2[k])
            docList_3.append(docMap_3[k])
            docList_4.append(docMap_4[k])
            classList.append(classMap[k])
    docList_1, docList_2, docList_3, docList_4, classList = np.array(
        docList_1), np.array(docList_2), np.array(docList_3), np.array(
            docList_4), np.array(classList)
    min_max_scaler = preprocessing.MinMaxScaler()
    docList_1, docList_2, docList_3 = min_max_scaler.fit_transform(
        docList_1), min_max_scaler.fit_transform(
            docList_2), min_max_scaler.fit_transform(docList_3)
    cnt, errorCount = 0, 0
    loo = LeaveOneOut(len(classList))
    trainingdoc, trainingclass = [], []
    # file = open("../../data/prediction/result","w")
    for train, test in loo:
        cnt += 1
        print cnt
        trainingdoc_1, trainingdoc_2, trainingdoc_3, trainingdoc_4, trainingclass, testingdoc_1, testingdoc_2, testingdoc_3, testingdoc_4, testingclass\
         = docList_1[train], docList_2[train], docList_3[train], docList_4[train], classList[train], docList_1[test], docList_2[test], docList_3[test], docList_4[test], classList[test]
        clf_1 = pipeline.Pipeline([
            ('feature_selection',
             linear_model.LogisticRegression(penalty='l2',
                                             dual=False,
                                             tol=0.0001,
                                             C=1.0,
                                             fit_intercept=True,
                                             intercept_scaling=1,
                                             class_weight='auto',
                                             random_state=None)),
            ('classification',
             svm.SVC(kernel='linear', class_weight='auto', probability=True))
        ])
        clf_2 = pipeline.Pipeline([
            ('feature_selection',
             linear_model.LogisticRegression(penalty='l2',
                                             dual=False,
                                             tol=0.0001,
                                             C=1.0,
                                             fit_intercept=True,
                                             intercept_scaling=1,
                                             class_weight='auto',
                                             random_state=None)),
            ('classification',
             svm.SVC(kernel='linear', class_weight='auto', probability=True))
        ])
        clf_3 = pipeline.Pipeline([
            ('feature_selection',
             linear_model.LogisticRegression(penalty='l2',
                                             dual=False,
                                             tol=0.0001,
                                             C=1.0,
                                             fit_intercept=True,
                                             intercept_scaling=1,
                                             class_weight='auto',
                                             random_state=None)),
            ('classification',
             svm.SVC(kernel='linear', class_weight='auto', probability=True))
        ])
        gnb = MultinomialNB()
        clf_1.fit(trainingdoc_1, trainingclass)
        clf_2.fit(trainingdoc_2, trainingclass)
        clf_3.fit(trainingdoc_3, trainingclass)
        gnb.fit(trainingdoc_4, trainingclass)
        docList_final = []
        for one in train:
            res_1 = clf_1.predict_proba(docList_1[one])[0]
            res_2 = clf_2.predict_proba(docList_2[one])[0]
            res_3 = clf_3.predict_proba(docList_3[one])[0]
            res_4 = gnb.predict_proba(docList_4[one])[0]
            _list = [
                res_1[0], res_1[1], res_2[0], res_2[1], res_3[0], res_3[1],
                res_4[0], res_4[1]
            ]
            docList_final.append(_list)
        res_1 = clf_1.predict_proba(testingdoc_1)[0]
        res_2 = clf_2.predict_proba(testingdoc_2)[0]
        res_3 = clf_3.predict_proba(testingdoc_3)[0]
        res_4 = gnb.predict_proba(testingdoc_4)[0]
        testing_final = [
            res_1[0], res_1[1], res_2[0], res_2[1], res_3[0], res_3[1],
            res_4[0], res_4[1]
        ]
        print testing_final
# Making the Confusion Matrix
cm = confusion_matrix(y_validate, y_val_lgt_pred)
class_label = ['1', '5']
df_cm = pd.DataFrame(cm, index=class_label,columns=class_label)
sns.heatmap(df_cm, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Star')
plt.ylabel('Actual Star')
plt.show()


# 3. Naive Bayes Classifier

class_nbc_val = MultinomialNB()

lgt_nbc_model = class_nbc_val.fit(tfidf_train, y_train)

y_val_nbc_pred = lgt_nbc_model.predict(tfidf_validate)

precision, recall, fscore, train_support = score(y_validate, y_val_nbc_pred, pos_label='5', average='binary')
print('Precision: {} / Recall: {} / F1-Score: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore,3), round(acs(y_validate, y_val_nbc_pred), 3)))

# Making the Confusion Matrix
cm = confusion_matrix(y_validate, y_val_nbc_pred)
class_label = ['1', '5']
df_cm = pd.DataFrame(cm, index=class_label,columns=class_label)
sns.heatmap(df_cm, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Star')
plt.ylabel('Actual Star')
Example No. 57
X_df = df[['home', 'busca', 'logado']]
Y_df = df['comprou']

Xdummies_df = pd.get_dummies(X_df)
Ydummies_df = Y_df

X = Xdummies_df.values
Y = Ydummies_df.values

porcentagem_de_treino = 0.9
tamanho_de_treino = porcentagem_de_treino * len(Y)
tamanho_de_teste = len(Y) - tamanho_de_treino

treino_dados = X[:int(tamanho_de_treino)]
treino_marcacoes = Y[:int(tamanho_de_treino)]

teste_dados = X[-int(tamanho_de_teste):]
teste_marcacoes = Y[-int(tamanho_de_teste):]

from sklearn.naive_bayes import MultinomialNB
modelo = MultinomialNB()
modelo.fit(treino_dados, treino_marcacoes)

resultado = modelo.predict(teste_dados)
diferencas = resultado - teste_marcacoes

acertos = [d for d in diferencas if d == 0]
total_de_acertos = len(acertos)
total_de_elementos = len(teste_dados)
taxa_de_acerto = 100.0 * total_de_acertos / total_de_elementos
print taxa_de_acerto, total_de_elementos
Example No. 58
class TextClassifier(object):
    """A text classifier model:
        - Vectorize the raw text into features.
        - Fit a naive bayes model to the resulting features.

    The work done by this class could also be done with a sklearn.pipeline
    object.  Since the author cannot guarantee that Pipelines have been
    introduced, he opted to write his own class implementing the model.

    This class is an example of coding to an interface, it implements the
    standard sklearn fit, predict, score interface.
    """
    def __init__(self):
        self._vectorizer = TfidfVectorizer()
        self._classifier = MultinomialNB()

    def fit(self, X, y):
        """Fit a text classifier model.

        Parameters
        ----------
        X: A numpy array or list of text fragments, to be used as predictors.
        y: A numpy array or python list of labels, to be used as responses.

        Returns
        -------
        self: The fit model object.
        """
        X = self._vectorizer.fit_transform(X)
        self._classifier.fit(X, y)
        return self

    def predict_proba(self, X):
        """Make probability predictions on new data.
        
        Parameters
        ----------
        X: A numpy array or list of text fragments, to be used as predictors.

        Returns
        -------
        probs: A (n_obs, n_classes) numpy array of predicted class probabilities. 
        """
        X = self._vectorizer.transform(X)
        return self._classifier.predict_proba(X)

    def predict(self, X):
        """Make class predictions on new data.

        Parameters
        ----------
        X: A numpy array or list of text fragments, to be used as predictors.

        Returns
        -------
        preds: A (n_obs,) numpy array containing the predicted class for each
        observation (i.e. the class with the maximal predicted class probability).
        """
        X = self._vectorizer.transform(X)
        return self._classifier.predict(X)

    def score(self, X, y):
        """Return a classification accuracy score on new data.

        Parameters
        ----------
        X: A numpy array or list of text fragments.
        y: A numpy array or python list of true class labels.
        """
        X = self._vectorizer.transform(X)
        return self._classifier.score(X, y)
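Because TextClassifier exposes the standard fit/predict/score interface, it can be used like any other estimator; a hedged usage sketch with made-up documents (the imports are the ones the class assumes but the snippet does not show):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

docs = ["free money now", "meeting at noon", "win a free prize", "lunch tomorrow?"]
labels = ["spam", "ham", "spam", "ham"]

model = TextClassifier().fit(docs, labels)
print(model.predict(["free lunch prize"]))
print(model.score(docs, labels))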
Example No. 59
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle
import time

categories = ['alt.atheism', 'sci.space', 'comp.graphics',
              'rec.motorcycles', 'sci.electronics']
news = fetch_20newsgroups(remove=("headers", "footers", "quotes"),
                          categories=categories)

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(news.data)
clf = MultinomialNB(alpha=0.01)
clf.fit(vectors, news.target)
pickle.dump({"vectorizer": vectorizer, "model": clf}, open("nb_model", "wb"))

# pred = clf.predict(vectorizer.transform([news.data[-1]]))
# print news.target_names[pred[0]]
Example No. 60
bow_transformer = CountVectorizer(analyzer=text_process).fit(X_train)
# transforming into Bag-of-Words and hence textual data to numeric..
text_bow_train = bow_transformer.transform(X_train)
# transforming into Bag-of-Words and hence textual data to numeric..
text_bow_test = bow_transformer.transform(X_test)

# # # # Naive Bayes # # #

# In[111]:

from sklearn.naive_bayes import MultinomialNB
# instantiating the model with Multinomial Naive Bayes..
model = MultinomialNB()
# training the model...
model = model.fit(text_bow_train, y_train)

# In[73]:

model.score(text_bow_train, y_train)

# In[74]:

# Importing necessary libraries
from sklearn.metrics import classification_report

# getting the predictions of the Validation Set...
predictions = model.predict(text_bow_test)
# getting the Precision, Recall, F1-Score
print(classification_report(y_test, predictions))