Esempio n. 1
0
def _mnb(t, min_freq, save=False):
    """Return ('mnb', classifier), either freshly trained or loaded from disk.

    When save is True, fit a MultinomialNB on the module-level `records` /
    `labels` and persist it via save_classifier; otherwise load the
    previously saved classifier for (t, 'mnb', min_freq).
    """
    if save:
        # NOTE(review): `records` and `labels` come from enclosing module
        # scope — not visible in this chunk.
        classifier = mnb().fit(records, labels)
        save_classifier(classifier, t, 'mnb', min_freq)
    else:
        classifier = load_classifier(t, 'mnb', min_freq)
    return ('mnb', classifier)
Esempio n. 2
0
def CrossValidate(X, Y, IDX, cl, **kwargs):
    '''
    Input
        X - is array of (N,K) features
        Y - is array of (N,1) labels
        IDX - is the (N,1) array of keys
        clf is a string for the classifier method: 'svm','mnb','knn',etc.
        kwargs is for ShiffleSplit and should be {'n_iter':5, 'test_size':0.80,'random_state':0} for example
    Returns:
        Predictions= dict(Key: log likelihood of label[0])
    '''
    print 'running cross-val'
    from sklearn import cross_validation as cv
    from sklearn import svm
    from sklearn.naive_bayes import MultinomialNB as mnb
    from sklearn import neighbors as knn
    #predcition=cv.cross_val_predict(clf,X,Y,**kwargs)   #this _predict function only exists in an updated version of sklearn.
    Res = {}
    print kwargs
    splits = cv.ShuffleSplit(
        X.shape[0], **kwargs)  # n_iter=5, test_size=0.80,random_state=0)
    print 'running ', len(splits), ' splits in cross-validation'
    for trainidx, testidx in splits:
        if len(set(Y[trainidx])) == 1:
            continue
        trainL = Y[trainidx]
        trainT = X[trainidx]
        testL = Y[testidx]
        testT = X[testidx]
        testIDX = IDX[testidx]
        if cl == 'knn':
            cl = neighbors.KNeighborsClassifier()
        if cl == 'svm':
            cl = svm.SVC(C=1, kernel='linear', probability=True)
        if cl == 'mnb':
            cl = mnb()
        cl.fit(trainT, trainL)
        print 'accuracy of nth fold is ', cl.score(testT, testL)
        preds = cl.predict_proba(testT)
        if 0 in preds:
            for i, p in enumerate(preds):
                if p[0] == 0:
                    preds[i][0] = .01
                    preds[i][1] = .99
                if p[1] == 0:
                    preds[i][0] = .99
                    preds[i][1] = .01
        female = [np.log(p[0] / p[1]) for p in preds]
        res = dict(zip(testIDX, female))
        for k, v in res.iteritems():
            if k in Res:
                Res[k].append(v)
            else:
                Res[k] = [v]

    return Res
Esempio n. 3
0
def predictByMNB(features, classes, test):
    """Train a multinomial naive Bayes classifier and predict labels for test.

    Why MNB requires non-negative features: MNB models P(x_i|Y_j) with a
    multinomial distribution over the i-th feature given class Y_j, and every
    multinomial event count must be >= 0 — hence the check below.

    Raises:
        ValueError: if any feature value is negative.
    """
    if (features.min() < 0):
        # Typo fixed in the message: "Feautres" -> "Features".
        raise ValueError("Features must be larger than or equal to 0 if using multinomial naive bayes!")

    clf = mnb()
    clf.fit(features, classes)
    return clf.predict(test)
Esempio n. 4
0
def multinomialNaiveBayesTrain(trainQuestions, tag, X, y, mnbd):
    """Fit a MultinomialNB for one tag and store it in mnbd[tag].

    Rewrites y in place: y[i] is 1 when the i-th question carries `tag`,
    0 otherwise, then trains on (X, y).
    """
    clf = mnb()
    for i, qid in enumerate(trainQuestions):
        y[i] = 1 if tag in trainQuestions[qid].tags else 0

    clf.fit(X, y)
    mnbd[tag] = clf
Esempio n. 5
0
def multinomialNaiveBayesTrain(trainQuestions, tag, X, y, mnbd):
    """Train a per-tag MultinomialNB and record it under mnbd[tag].

    The label vector y is overwritten in place with binary membership of
    `tag` in each question's tag set before fitting.
    """
    clf = mnb()
    idx = 0
    for qid in trainQuestions:
        # int(bool) gives the same 1/0 labels as the original branches.
        y[idx] = int(tag in trainQuestions[qid].tags)
        idx += 1

    clf.fit(X, y)
    mnbd[tag] = clf
def RFETopWords(X, Y, n=20, clf=''):
    """Run recursive feature elimination with the requested base classifier.

    Parameters
    ----------
    X, Y : feature matrix and labels, passed straight to RFE.fit.
    n    : number of top-ranked feature indices to return.
    clf  : one of 'knn', 'svm', 'mnb'.

    Returns
    -------
    (fitted RFE selector, indices of the top-n features).

    Raises
    ------
    ValueError
        If clf is not a recognized classifier name. (The original fell
        through and hit an UnboundLocalError at the RFE() call whenever clf
        was '' — the default — or any unknown string.)
    """
    if clf == 'knn':
        cl = neighbors.KNeighborsClassifier()
    elif clf == 'svm':
        cl = svm.LinearSVC()
    elif clf == 'mnb':
        cl = mnb()
    else:
        raise ValueError("Unknown classifier type: %r" % (clf,))
    selector = RFE(cl, n, step=.05)
    selector = selector.fit(X, Y)
    # argsort on the boolean support mask puts selected (True) features last;
    # keep the final n positions.
    # NOTE(review): ordering among selected features is arbitrary here;
    # selector.ranking_ may be the intended basis — confirm with callers.
    tops = np.argsort(selector.support_)[-n:]
    #words=[vectorizer.get_feature_names()[i] for i in tops]
    return selector, tops
Esempio n. 7
0
def Classify(trainT, trainL, clf='knn'):
    '''Code to train and test classifiers.  type can be 'knn' 'nb' or 'svm'
    returns the fit matrix #a dictionary of {twitterID: likelihood ratio}'''
    from sklearn import svm
    from sklearn.naive_bayes import MultinomialNB as mnb
    from sklearn import neighbors
    print 'Running Classifier ' + clf
    if clf == 'knn':
        cl = neighbors.KNeighborsClassifier()
        cl.fit(trainT, trainL)
    if clf == 'svm':
        cl = svm.SVC(C=100, gamma=.1, probability=True)
        cl.fit(trainT, trainL)
    if clf == 'mnb':
        cl = mnb()
        cl.fit(trainT, trainL)
    return cl
Esempio n. 8
0
def naive_bayes_mnb(x_train, y_train, x_test, y_test):
    """Fit a MultinomialNB on the training split.

    Returns the pair (expected, predicted): the untouched y_test followed by
    the model's predictions for x_test.
    """
    model = mnb().fit(x_train, y_train)
    predicted = model.predict(x_test)
    return y_test, predicted
Esempio n. 9
0
                                # Trailing kwargs of an ensemble constructor
                                # whose opening line is above this chunk
                                # (not visible here).
                                oob_score=False,
                                random_state=13,
                                verbose=0,
                                warm_start=False)
model0.fit(train_x, train_y)
predicted = model0.predict(test_x)
model0.score(test_x, test_y)  # NOTE(review): score not stored or printed

# Just a little bit better...

# ## Bayes (model 1)

# In[34]:

from sklearn.naive_bayes import MultinomialNB as mnb
# Multinomial naive Bayes baseline trained on the same split as model0.
model1 = mnb()
model1.fit(train_x, train_y)

# In[35]:

predicted = model1.predict(test_x)
model1.score(test_x, test_y)  # NOTE(review): score not stored or printed

# ## SVM (model 2)

# In[36]:

from sklearn import svm
from sklearn.model_selection import GridSearchCV as gs

# In[37]:
Esempio n. 10
0
fig.autofmt_xdate()

from sklearn.metrics import *
# Majority-class dummy baseline: predict 1 for every row and score it
# against the actual rumorType labels.
y_dum = np.ones(len(data['rumorType'].values))
score1 = accuracy_score(y_dum, data['rumorType'])

from sklearn.feature_extraction.text import TfidfVectorizer
# Tokenize Chinese text with jieba; the token_pattern keeps single-char
# tokens that the default pattern would drop.
zhTokenizer = jieba.cut
v = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b',
                    tokenizer = zhTokenizer,
                    lowercase = False,
                    stop_words = ['是','的'],
                    max_features = 250)

y = data['rumorType']
X_txt = data.drop(['rumorType','crawlTime','mainSummary'],axis=1)

from sklearn.model_selection import train_test_split,cross_val_score,cross_validate
# Stratified 80/20 split so class proportions match in train and test.
X_tr,X_te,y_tr,y_te = train_test_split(X_txt,y,test_size=0.2,stratify=y)

#Convert X_train
# Fit TF-IDF on training titles only, then reuse it for the test split.
X_tr_v = v.fit_transform(X_tr['title'])

from sklearn.naive_bayes import MultinomialNB as mnb
model_bl = mnb()
model_bl.fit(X_tr_v,y_tr.values)

X_te_v = v.transform(X_te['title'])
y_pred = model_bl.predict(X_te_v)

# Accuracy of the NB baseline; compare against the dummy score1 above.
score2 = accuracy_score(y_pred,y_te)
Esempio n. 11
0
	# Tail of a train/evaluate helper whose `def` line is above this chunk:
	# fits the already-constructed model and reports error metrics.
	model.fit( X , y )

	# Last column of each test row is the label; the rest are features.
	X_test = [ row[ :-1 ] for row in test_data ]
	y_real = [ row[ -1 ] for row in test_data ]
	y_pred = model.predict( X_test )
	print report( y_real , y_pred )
	# Map 'spam'/other to 1/0 so the regression-style error metrics apply.
	tp = lambda x : 1 if x == 'spam' else 0
	real = [ tp( v ) for v in y_real ]
	pred = [ tp( v ) for v in y_pred ]
	print mean_absolute_error( real , pred )
	print mean_squared_error( real , pred )

if __name__ == '__main__' :
	if len( sys.argv ) > 2 :
		# argv: training CSV path followed by test CSV path.
		train_fpath , test_fpath = sys.argv[ 1: ]
		train_data = import_csv( train_fpath )
		test_data = import_csv( test_fpath )
		# Run the three classifiers on the same train/test pair.
		''' DECISION TREE '''
		cf = dtc( criterion = 'gini' , max_depth = 50 )
		classify( cf , train_data , test_data , 'decision_tree' )

		''' NEAREST NEIGHBORS '''
		cf = knc( n_neighbors = 1 , metric = 'hamming' )
		classify( cf , train_data , test_data , 'knearest_neighbors' )

		''' NAIVE BAYES '''
		cf = mnb( alpha = 100.0 )
		classify( cf , train_data , test_data , 'naive_bayes' )
	else :
		print "Usage python %s [train_csv_file] [test_csv_file]" % sys.argv[ 0 ]
# predict classes
dt_predictions_tuned = pd.Series(dt_estimator_tuned.predict(post2000_exp))

# cross predicted vs actual
# Reindex actuals to line up with the prediction Series before crosstab.
post2000_res.index = dt_predictions_tuned.index
dt_crosstab_tuned = pd.crosstab(
    post2000_res, dt_predictions_tuned, rownames=["Actual"], colnames=["Predicted"], margins=True
)
print dt_crosstab_tuned


# BUILD NAIVE BAYES MODEL (UNSCALED DATA)------------------------------------------------

# run model
nb = mnb()

# conduct recursive feature search
# 10-fold CV RFE, eliminating one feature per step, scored by ROC AUC.
nb_rfe_cv = rfe(estimator=nb, step=1, cv=10, scoring="roc_auc", verbose=1)
nb_rfe_cv.fit(pre2000_exp, pre2000_res)

# identify and plot optimal number of features (d = 50). ROC_AUC=0.6391
print nb_rfe_cv.n_features_
print nb_rfe_cv.grid_scores_.max()

# CV score as a function of the number of selected features.
plt.figure()
plt.xlabel("NB: Number of Features selected")
plt.ylabel("NB: Cross Validation Score (ROC_AUC)")
plt.plot(range(1, len(nb_rfe_cv.grid_scores_) + 1), nb_rfe_cv.grid_scores_)
plt.show()
Esempio n. 13
0
# Apply two session filters to the extracted dataset and rebuild the
# per-customer views for each filtered variant.
ftr2_ext_dat = filter_dataset_2(ext_dat[0],ext_dat[1],ext_dat[2],ext_dat[3],ext_dat[4])
sorted_sessions_ftr2 = construct_customer_view(ftr2_ext_dat)

ftr3_ext_dat = filter_dataset_3(ext_dat[0],ext_dat[1],ext_dat[2],ext_dat[3],ext_dat[4])
sorted_sessions_lng = construct_customer_view(ftr3_ext_dat)

# Classifier Classes
# A pool of estimators to compare; clf8-clf11 wrap LinearSVC in grid /
# randomized C searches, bclf1-bclf3 are candidate base estimators for the
# (commented-out) AdaBoost ensemble.
clf1 = svm.LinearSVC(C=0.05,penalty='l2')
clf2 = lm.ElasticNetCV(l1_ratio=0.3,n_jobs=1)
clf3 = lm.LogisticRegression(penalty='l1')
clf4 = lm.SGDClassifier(loss='hinge',n_jobs=1,n_iter=100,penalty='elasticnet')
clf5 = svm.SVC(C=4.0,kernel='rbf',degree=3,probability=True)
bclf1 = dtree(max_depth=10)
bclf2 = svm.SVC(C=4.0,kernel='rbf',degree=3,probability=True)
bclf3 = mnb(alpha=1.0,fit_prior=True,class_prior=None)
#clf6 = ensmbl.AdaBoostClassifier(base_estimator=bclf1,n_estimators=100,learning_rate=1.0)
clf7 = ensmbl.RandomForestClassifier(n_estimators=10,criterion='gini')
clf8 = gs.GridSearchCV(svm.LinearSVC(penalty='l2'),{'C':[0.005,0.01,0.05,0.1,0.2,0.3]},cv=3)
clf9 = gs.GridSearchCV(svm.LinearSVC(penalty='l2'),{'C':[0.005,0.01,0.05,0.1,0.2,0.3]},cv=3)
clf10 = gs.RandomizedSearchCV(svm.LinearSVC(penalty='l2'),{'C':[0.0001,0.001,0.01,0.1,0.25]},cv=3)
clf11 = gs.RandomizedSearchCV(svm.LinearSVC(penalty='l2'),{'C':[1.0,2.0,5.0,10.0,20.0]},cv=3)
#clf9 = gs.GridSearchCV(svm.SVC(kernel='poly',degree='3'),{'C':[0.005,0.01,0.05,0.1,0.2,0.3]},cv=3)
#clf9 = gs.GridSearchCV(svm.SVC(),{'C':[0.3,0.5,1.0,2.0,3.0]},cv=3)
#clf9 = gs.GridSearchCV(lm.SGDClassifier(penalty='elasticnet',loss='log',n_iter=1000,n_jobs=-1,shuffle=True),{'l1_ratio':[0.1,0.5,0.7,0.9]},cv=3)
# Four identical, independent grid searches (one per later use site).
clf_ls = [gs.GridSearchCV(svm.LinearSVC(penalty='l2'),{'C':[0.005,0.01,0.05,0.1,0.2,0.3]},cv=3) for it in range(0,4)]

# Feature Selection Classes
fs1 = fs.SelectKBest(chi2,k=100)
fs2 = fs.RFECV(clf1,step=1000,cv=5)
fs3 = fs.RFE(clf1)
Esempio n. 14
0
def hybridTrial(metadata):
    '''
    This code takes two above feature sets and tests whether they change their collective and individual predictability
    Raw * Raw Topics = ? * .72 = .71 (No change in nb score)
    Subtopics * raw topics = .65 * .72 = .69
    '''
    # Load the raw topic-score feature file; column 0 is the case key.
    print 'import raw topic scores'
    filename = 'Twitter/Data/Raw_Topic_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]]
                    for line in data])  #exclude cases where sex is unknown
    labels = np.array([metadata[line[0]][0]
                       for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])
    print 'CV for RAW TOPICS'
    CVargs = {'n_iter': 3, 'test_size': .9, 'random_state': 0}
    cl = mnb()
    #Preds=Classifiers.CrossValidate(vec,labels,IDX,cl,**CVargs)

    # Load one subtopic-score file per metadata category.
    print 'importing subtopic scores'
    path = 'Twitter/Data/'
    preds = {}
    Data = []
    for cat in set([line[2] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        print 'RUNNINING ', cat, ' SUBTOPIC SCORES'
        f = 'Twitter_' + cat + '_Topic_Scores.csv'
        data = ImportCSVFeatureData(path + f, -1)
        Data.append(data)
        #for line in data:
        #    for idx in IDX:
        #        if line[0]==idx:
        #            rvec.append(line)
        #            break
        #vec=np.array([[float(l) for l in line[1:]] for line in data])   #exclude cases where sex is unknown
        #labels=np.array([metadata[line[0]][0] for line in data])# if 'age' not in line])
        #IDX=np.array([line[0] for line in data])
    # Build one row per key in IDX, concatenating each category's scores;
    # missing keys get a zero-vector of that category's width.
    print 'resorting cases to align with labels'
    rvec = [[] for i in IDX]
    #rlabels=[]

    for data in Data:
        for i, idx in enumerate(IDX):
            if idx in [line[0] for line in data]:
                for line in data:
                    if idx == line[0]:
                        rvec[i] += line[1:]
                        # NOTE(review): `continue` keeps scanning the rest of
                        # `data`; if keys repeat, scores are appended more
                        # than once — `break` looks like the intent.
                        continue
                    #rvec.append(line[1:])
            else:
                rvec[i] += [0 for i in data[0][1:]]

    #used to align RAW
    #for idx in IDX:
    #    if line[0]==idx:
    #        rvec.append(line[1:])
    #        break
    #rlabels.append(meta-data[str(int(idx))][0])

    # Combined features: raw topic scores + all subtopic scores.
    rvec = np.append(vec, np.array(rvec), axis=1)

    print 'crossvalidate testing COMBINATION'
    CVargs = {'n_iter': 3, 'test_size': .9, 'random_state': 0}
    cl = mnb()
    # NOTE(review): passes `vec` (raw only) although `rvec` (combination)
    # was just built — likely meant `rvec`; confirm before changing.
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)

    CVargs = {'n_iter': 3, 'test_size': .9, 'random_state': 0}
    cl = mnb()
    # The mnb() above is immediately discarded in favor of AdaBoost.
    cl = ensemble.AdaBoostClassifier(n_estimators=10)
    Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)

    return
Esempio n. 15
0
'''
# NOTE(review): the stray triple-quote above appears to close a block that
# began before this chunk; the code below executes normally.
# Vectorize the text with TfidfVectorizer
zhTokenizer = jieba.cut
v = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",
                    tokenizer=zhTokenizer,
                    lowercase=False,
                    stop_words=['是', '的'],
                    max_features=250)
y = data['rumorType']
# Join each rumor's summary and title into one text field
x_txt = data[['mainSummary', 'title']].apply(lambda x: ' '.join(x), axis=1)
# Split into training and test sets (stratified on the label)
x_tr, x_te, y_tr, y_te = train_test_split(x_txt, y, test_size=0.2, stratify=y)
# Build and train the model
x_tr_v = v.fit_transform(x_tr)
model_bl = mnb()
model_bl.fit(x_tr_v, y_tr.values)
x_te_v = v.transform(x_te)
y_pred = model_bl.predict(x_te_v)
# accuracy_score: the fraction of correctly classified samples
print("使用Multinomial Naive Bayes模型预测的准确率:", accuracy_score(y_te, y_pred))
print('使用Multinomial Naive Bayes模型预测的精确率为:',
      precision_score(y_te, y_pred, average="micro"))
print('使用Multinomial Naive Bayes模型预测的召回率为:',
      recall_score(y_te, y_pred, average="micro"))
print('使用Multinomial Naive Bayes模型预测的F1值为:',
      f1_score(y_te, y_pred, average="micro"))
print('使用Multinomial Naive Bayes模型预测的Cohen’s Kappa系数为:',
      cohen_kappa_score(y_te, y_pred))
print('使用Multinomial Naive Bayes模型预测的分类报告为:', '\n',
      classification_report(y_te, y_pred))
Esempio n. 16
0
def main():
    """Train a MultinomialNB with fixed class priors on per-system-call
    count features and write predictions for the test set to CSV."""
    # Hand-tuned prior probability for each of the 15 classes.
    priors = [
        .0369, .0162, .012, .0103, .0133, .0126, .0172, .0133, .5214, .0068,
        .1756, .0104, .1218, .0191, .013
    ]

    ###############################################
    #######Per-Tree, Per-System Call Counts########
    ###############################################
    # (An earlier variant built the count CSVs from raw trees via
    # extract_tree/perSysCallCount and also tried BernoulliNB; the
    # pre-computed files are read directly here.)

    # Load pre-computed train/test feature tables as DataFrames.
    train_df = pd.read_csv("perSysCountsTrain.csv")
    test_df = pd.read_csv("perSysCountsTest.csv")
    labels = train_df.Class.values
    test_ids = test_df.Id.values
    # Test rows begin at this offset once train and test are stacked.
    n_train = train_df.shape[0]

    # Stack the two tables so both share one column layout, then strip the
    # non-feature metadata columns; free intermediates eagerly to cap RSS.
    combined = pd.concat((train_df, test_df), axis=0)
    del train_df, test_df
    combined = combined.drop(['Id'], axis=1)
    combined = combined.drop(['Class'], axis=1)
    features = combined.values
    del combined
    X_train = features[:n_train]
    X_test = features[n_train:]
    del features

    clf = mnb(class_prior=priors)
    clf.fit(X_train, labels)
    del X_train, labels
    predictions = clf.predict(X_test)
    util.write_predictions(predictions, test_ids, "PerSysCallCountsBNB.csv")
#predict classes
dt_predictions_tuned = pd.Series(dt_estimator_tuned.predict(post2000_exp))

#cross predicted vs actual
# Reindex actuals to line up with the prediction Series before crosstab.
post2000_res.index = dt_predictions_tuned.index
dt_crosstab_tuned = pd.crosstab(post2000_res, dt_predictions_tuned, rownames=['Actual'], 
                          colnames=['Predicted'], margins=True)
print dt_crosstab_tuned



#BUILD NAIVE BAYES MODEL (UNSCALED DATA)------------------------------------------------

#run model
nb = mnb()

#conduct recursive feature search
# 10-fold CV RFE, eliminating one feature per step, scored by ROC AUC.
nb_rfe_cv = rfe(estimator=nb, step=1, cv=10, scoring='roc_auc', verbose = 1)
nb_rfe_cv.fit(pre2000_exp, pre2000_res)

#identify and plot optimal number of features (d = 50). ROC_AUC=0.6391
print nb_rfe_cv.n_features_
print nb_rfe_cv.grid_scores_.max()

# CV score as a function of the number of selected features.
plt.figure()
plt.xlabel("NB: Number of Features selected")
plt.ylabel("NB: Cross Validation Score (ROC_AUC)")
plt.plot(range(1, len(nb_rfe_cv.grid_scores_) + 1), nb_rfe_cv.grid_scores_)
plt.show()
Esempio n. 18
0
import sys
import sklearn
from classifier_utils import *
from sklearn.naive_bayes import MultinomialNB as mnb

if __name__ == '__main__' :
	if len( sys.argv ) > 2 :
		# argv: CSV path followed by the MNB smoothing parameter alpha.
		infilepath , alp = sys.argv[ 1: ]
		data = import_csv( infilepath )
		cf = mnb( alpha = float( alp ) )
		stats = cross_validation( data , cf )
		print "PARAMS: alpha=%s" % alp
		print_stats( stats )
	else :
		# NOTE(review): usage mentions [neighbors] [distance] but this script
		# takes [csv_file] [alpha] — looks copied from a kNN variant; confirm
		# before changing the user-facing string.
		print "Usage python %s [csv_file] [neighbors] [distance]" % sys.argv[ 0 ]