Example #1
def featureSelection() :
	
	#load Dataset
	X_0, y, biomarkerNames = loadDataset()
	#use K-Fold
	kf = KFold(n_splits=10)
	kf.get_n_splits(X_0)
	
	for i in (250,500,1000):
		print("Number of Features "+str(i))
		fold=0
		for train_index, test_index in kf.split(X_0):
			print("Fold "+str(fold))
			fold=fold+1
			#declare selector with i features using the ANOVA F-score
			selector=SelectKBest(f_classif, k=i)
			#Normalize Data
			scaler = StandardScaler()
			X_train, X_test = X_0[train_index], X_0[test_index]
			y_train, y_test = y[train_index], y[test_index]
			X_train = scaler.fit_transform(X_train)
			X_test=scaler.transform(X_test)
			#Calculate Scores
			X_train = selector.fit_transform(X_train, y_train)
			#Get positions of Best Scores
			selected=selector.get_support(indices=True)
			X_test=selector.transform(X_test)
			##Print ANOVA F-Values
			#print("ANOVA F-value")
			#print(selector.scores_[selected])
			##Print P-values
			#print("p values")
			#print(selector.pvalues_[selected])
			##Print Resulting Features
			#print("features names")
			#print(biomarkerNames[selected])
			#print("features index")
			##Print Features Index
			#print(selected)
			#Declare Classifier
			clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0,tol=1e-3)
			#Train Classifier
			clf.fit(X_train, y_train)
			#Print Accuracy
			accuracy_train=clf.score(X_train,y_train)
			accuracy_test=clf.score(X_test,y_test)
			print("Accuracy Train " + str(accuracy_train))
			print("Accuracy Test " + str(accuracy_test))
			## create folder
			#folderName ="./results/"
			#if not os.path.exists(folderName) : os.makedirs(folderName)
			##Print reduce Dataset
			#pd.DataFrame(X_new).to_csv(folderName+"data_"+str(0)+".csv", header=None, index =None)
			#pd.DataFrame(biomarkerNames[selected]).to_csv(folderName+"features_"+str(0)+".csv", header=None, index =None)
			#pd.DataFrame(y).to_csv(folderName+"labels.csv", header=None, index =None)
		
	return 
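The per-fold scaling, selection, and training steps above can also be expressed with a scikit-learn Pipeline, so that nothing is fit outside the current fold. A minimal sketch, assuming the same loadDataset() helper as above:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import cross_val_score

# Same data as above; loadDataset() is assumed to return (X, y, feature names).
X_0, y, biomarkerNames = loadDataset()

for k in (250, 500, 1000):
    # Scaler and selector are re-fit on the training part of every fold.
    pipe = Pipeline([
        ("scale", StandardScaler()),
        ("select", SelectKBest(f_classif, k=k)),
        ("clf", PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)),
    ])
    # Note: with an integer cv and a classifier, cross_val_score uses stratified folds.
    scores = cross_val_score(pipe, X_0, y, cv=10)
    print(k, scores.mean())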
Example #2
def PassiveAggressive_clf(training_set_np, validation_set_np, testing_set_np,
                          training_label, validation_label, testing_label):
    clf = PassiveAggressiveClassifier(max_iter=50)
    clf.fit(training_set_np, training_label)

    print("Passive Aggressive Classifier")
    print("Training Set Accuracy  : " +
          str(100 * clf.score(training_set_np, training_label)))
    print("Validation Set Accuracy: " +
          str(100 * clf.score(validation_set_np, validation_label)))
    print("Testing Set Accuracy   : " +
          str(100 * clf.score(testing_set_np, testing_label)))
    print("\n")
Example #3
def model_PassiveAggressive(train_x, train_y, test_x, test_y, n_est=100):
    model = PassiveAggressiveClassifier()
    model.fit(train_x, train_y)
    sc = model.score(test_x, test_y)
    prediction = model.predict(test_x)
    mae = mean_absolute_error(test_y, prediction)
    return (sc, mae, prediction, model)
Example #4
def linear_models(x_train, y_train):
    from sklearn.linear_model import LogisticRegression
    classifier1 = LogisticRegression(C=1.2, random_state=0, max_iter=1500)
    classifier1.fit(x_train, y_train)

    from sklearn.linear_model import PassiveAggressiveClassifier
    classifier2 = PassiveAggressiveClassifier()
    classifier2.fit(x_train, y_train)

    from sklearn.linear_model import RidgeClassifierCV
    classifier3 = RidgeClassifierCV()
    classifier3.fit(x_train, y_train)

    from sklearn.linear_model import SGDClassifier
    classifier4 = SGDClassifier()
    classifier4.fit(x_train, y_train)

    from sklearn.linear_model import Perceptron
    classifier5 = Perceptron()
    classifier5.fit(x_train, y_train)

    print('LogisticRegression training accuracy: ',
          classifier1.score(x_train, y_train))
    print('PassiveAggressiveClassifier training accuracy: ',
          classifier2.score(x_train, y_train))
    print('RidgeClassifierCV training accuracy: ',
          classifier3.score(x_train, y_train))
    print('SGDClassifier training accuracy: ',
          classifier4.score(x_train, y_train))
    print('Perceptron training accuracy: ',
          classifier5.score(x_train, y_train))

    return classifier1, classifier2, classifier3, classifier4, classifier5
Example #5
def training():
    X, y = get_data()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=7)

    tfidf_Xtrain, tfidf_Xtest = Vectorize(X_train, X_test)

    Pac = PassiveAggressiveClassifier(C=0.5, random_state=5)

    Pac.fit(tfidf_Xtrain, y_train)

    Pac_acc = Pac.score(tfidf_Xtest, y_test)

    print(Pac_acc)

    y_pred = Pac.predict(tfidf_Xtest)

    Pac_accuracy = accuracy_score(y_test, y_pred)

    print(Pac_accuracy)

    conf_matrix = confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])

    print(conf_matrix)

    clf_report = classification_report(y_test, y_pred)

    print(clf_report)

    makePickleFile(Pac)
Example #6
def passiveAggresive(train, test, Y_train, Y_test, column):
    '''
    Fits a Passive Aggressive Perceptron Classifier
    '''
    clf = PassiveAggressiveClassifier(C = .1, max_iter = 1000, class_weight = 'balanced', tol = 1e-3)
    clf.fit(train, Y_train[column])
    clf.predict(test)
    return clf.score(test, Y_test[column])
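A hedged usage sketch for the helper above, assuming Y_train and Y_test are pandas DataFrames with one column per target (the column names below are hypothetical):

# Hypothetical usage: score the classifier separately for each label column.
scores = {col: passiveAggresive(train, test, Y_train, Y_test, col)
          for col in Y_train.columns}
for col, acc in scores.items():
    print(col, acc)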
Example #7
def test_classifier_accuracy():
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            clf = PassiveAggressiveClassifier(C=1.0, n_iter=30,
                                              fit_intercept=fit_intercept,
                                              random_state=0)
            clf.fit(data, y)
            score = clf.score(data, y)
            assert_greater(score, 0.79)
def test_classifier_partial_fit():
    classes = np.unique(y)
    for data in (X, X_csr):
        clf = PassiveAggressiveClassifier(C=1.0,
                                          fit_intercept=True,
                                          random_state=0)
        for t in xrange(30):
            clf.partial_fit(data, y, classes)
        score = clf.score(data, y)
        assert_greater(score, 0.79)
def test_classifier_partial_fit():
    classes = np.unique(y)
    for data in (X, X_csr):
        clf = PassiveAggressiveClassifier(C=1.0,
                                          fit_intercept=True,
                                          random_state=0)
        for t in range(30):
            clf.partial_fit(data, y, classes)
        score = clf.score(data, y)
        assert_greater(score, 0.79)
def get_baseline_pa(dataset, train_label_list, test_label_list, verbose=True):
    (X_train, Y_train), (X_test, Y_test) = dataset
    classifier = PassiveAggressiveClassifier(n_jobs=-1, fit_intercept=True)
    classifier.fit(X_train, train_label_list)
    accuracy = classifier.score(X_test, test_label_list)

    if verbose:
        print('Got baseline of %f with Passive Aggressive classifier' %
              accuracy)

    return accuracy
Example #12
def get_baseline_pa(dataset_info, verbose=True):
    (X_train,
     Y_train), (X_test,
                Y_test) = dataset_info.ds.get_dataset(to_categorical=True,
                                                      num_labels=num_labels)
    classifier = PassiveAggressiveClassifier(n_jobs=-1, fit_intercept=True)
    classifier.fit(X_train, dataset_info.ds.get_Y_train(X_train))
    accuracy = classifier.score(X_test, dataset_info.ds.get_Y_test(X_train))

    if verbose:
        print('Got baseline of %f with Passive Aggressive classifier' %
              accuracy)

    return accuracy
def featureSelection():
    #load Dataset
    X, y, biomarkerNames = loadDataset()
    #Normalize Data
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    #Declare Classifier
    clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)
    #Train Classifier
    clf.fit(X, y)
    #Print Accuracy
    print(clf.score(X, y))

    return
Example #14
def test_classifier_accuracy():
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            for average in (False, True):
                clf = PassiveAggressiveClassifier(
                    C=1.0, max_iter=30, fit_intercept=fit_intercept,
                    random_state=1, average=average, tol=None)
                clf.fit(data, y)
                score = clf.score(data, y)
                assert score > 0.79
                if average:
                    assert hasattr(clf, 'average_coef_')
                    assert hasattr(clf, 'average_intercept_')
                    assert hasattr(clf, 'standard_intercept_')
                    assert hasattr(clf, 'standard_coef_')
Example #15
def test_classifier_partial_fit():
    classes = np.unique(y)
    for data in (X, X_csr):
        for average in (False, True):
            clf = PassiveAggressiveClassifier(random_state=0,
                average=average, max_iter=5)
            for t in range(30):
                clf.partial_fit(data, y, classes)
            score = clf.score(data, y)
            assert score > 0.79
            if average:
                assert hasattr(clf, 'average_coef_')
                assert hasattr(clf, 'average_intercept_')
                assert hasattr(clf, 'standard_intercept_')
                assert hasattr(clf, 'standard_coef_')
def pac(x, y, x_t, y_t, y_pred):
    score = 0
    t = 0
    for i in range(48):
        classifier = PassiveAggressiveClassifier(max_iter=len(x[i]))
        try:
            classifier.fit(np.array(x[i]), np.array(y[i]))
            y_pred[i] = classifier.predict(x_t[i])
            score += classifier.score(x_t[i], y_t[i])
            t += 1
        except Exception:
            print('error in ' + str(i))
            y_pred[i] = np.zeros(17)
            continue
    return score / t
def test_classifier_accuracy():
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            for average in (False, True):
                clf = PassiveAggressiveClassifier(
                    C=1.0, max_iter=30, fit_intercept=fit_intercept,
                    random_state=1, average=average, tol=None)
                clf.fit(data, y)
                score = clf.score(data, y)
                assert_greater(score, 0.79)
                if average:
                    assert hasattr(clf, 'average_coef_')
                    assert hasattr(clf, 'average_intercept_')
                    assert hasattr(clf, 'standard_intercept_')
                    assert hasattr(clf, 'standard_coef_')
def test_classifier_partial_fit():
    classes = np.unique(y)
    for data in (X, X_csr):
        for average in (False, True):
            clf = PassiveAggressiveClassifier(
                C=1.0, fit_intercept=True, random_state=0,
                average=average, max_iter=5)
            for t in range(30):
                clf.partial_fit(data, y, classes)
            score = clf.score(data, y)
            assert_greater(score, 0.79)
            if average:
                assert hasattr(clf, 'average_coef_')
                assert hasattr(clf, 'average_intercept_')
                assert hasattr(clf, 'standard_intercept_')
                assert hasattr(clf, 'standard_coef_')
Example #19
def test_classifier_accuracy():
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            for average in (False, True):
                clf = PassiveAggressiveClassifier(C=1.0, n_iter=30,
                                                  fit_intercept=fit_intercept,
                                                  random_state=0,
                                                  average=average)
                clf.fit(data, y)
                score = clf.score(data, y)
                assert_greater(score, 0.79)
                if average:
                    assert_true(hasattr(clf, 'average_coef_'))
                    assert_true(hasattr(clf, 'average_intercept_'))
                    assert_true(hasattr(clf, 'standard_intercept_'))
                    assert_true(hasattr(clf, 'standard_coef_'))
def test_classifier_partial_fit():
    classes = np.unique(y)
    for data in (X, X_csr):
        for average in (False, True):
            clf = PassiveAggressiveClassifier(C=1.0,
                                              fit_intercept=True,
                                              random_state=0,
                                              average=average)
            for t in range(30):
                clf.partial_fit(data, y)
            score = clf.score(data, y)
            assert_greater(score, 0.79)
            if average:
                assert_true(hasattr(clf, 'average_coef_'))
                assert_true(hasattr(clf, 'average_intercept_'))
                assert_true(hasattr(clf, 'standard_intercept_'))
                assert_true(hasattr(clf, 'standard_coef_'))
Example #21
def featureSelection():

    #load Dataset
    X_0, y, biomarkerNames = loadDataset()

    for i in (2, 4, 8, 16):
        #declare selector with i features using the ANOVA F-score
        selector = SelectKBest(f_classif, k=i)
        #Normalize Data
        scaler = StandardScaler()
        X = scaler.fit_transform(X_0)
        #Calculate Scores
        X_new = selector.fit_transform(X, y)
        #Get positions of Best Scores
        selected = selector.get_support(indices=True)
        ##Print ANOVA F-Values
        #print("ANOVA F-value")
        #print(selector.scores_[selected])
        ##Print P-values
        #print("p values")
        #print(selector.pvalues_[selected])
        ##Print Resulting Features
        #print("features names")
        #print(biomarkerNames[selected])
        #print("features index")
        ##Print Features Index
        #print(selected)
        print(i)
        #Declare Classifier
        clf = PassiveAggressiveClassifier(max_iter=1000,
                                          random_state=0,
                                          tol=1e-3)
        #Train Classifier
        clf.fit(X_new, y)
        #Print Accuracy
        print(clf.score(X_new, y))

        ## create folder
        #folderName ="./results/"
        #if not os.path.exists(folderName) : os.makedirs(folderName)
        ##Print reduce Dataset
        #pd.DataFrame(X_new).to_csv(folderName+"data_"+str(0)+".csv", header=None, index =None)
        #pd.DataFrame(biomarkerNames[selected]).to_csv(folderName+"features_"+str(0)+".csv", header=None, index =None)
        #pd.DataFrame(y).to_csv(folderName+"labels.csv", header=None, index =None)
    return
Example #22
print(df.shape)
df.head()

labels = df.label
labels.head()

X = df['text']
y = df['label']
cv = CountVectorizer()
X = cv.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.24,
                                                    random_state=4)

pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(X_train, y_train)
pac.score(X_test, y_test)

y_pred = pac.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])

pickle.dump(pac, open('model.pkl', 'wb'))

model = pickle.load(open('model.pkl', 'rb'))
print(classification_report(y_test, y_pred))
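Note that model.pkl alone is not enough to classify new text: the fitted CountVectorizer has to be persisted and reloaded as well. A minimal sketch, with illustrative file names:

import pickle

# Persist both the fitted vectorizer and the classifier (illustrative names).
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(cv, f)
with open('model.pkl', 'wb') as f:
    pickle.dump(pac, f)

# Later, at prediction time, transform new text with the same vectorizer.
with open('vectorizer.pkl', 'rb') as f:
    cv_loaded = pickle.load(f)
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

sample = ["Some headline to classify"]  # hypothetical input
print(model.predict(cv_loaded.transform(sample)))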
y_pred=ac.predict(tfidf_test)



from sklearn.metrics import accuracy_score, confusion_matrix
Confusion = confusion_matrix(y_test, y_pred)
print(Confusion)

Accuracy = accuracy_score(y_test, y_pred)
print("Accuracy :", (Accuracy*100).round(3))


ACC = []
for i in range(20, 50):
    acc = PassiveAggressiveClassifier(max_iter=i).fit(tfidf_train, y_train)
    ACC.append((acc.score(tfidf_test, y_test)*100).round(2))
    
print(max(ACC))





############################################################################ 


from sklearn.linear_model import LogisticRegression

log=LogisticRegression().fit(tfidf_train,y_train)
L_pred=log.predict(tfidf_test)
Example #24
clf7.fit(X_train, y_train)

vc = VotingClassifier(estimators=[
    ('mlp', clf), ('dt', clf3), ('et', clf6), ('bag', clf5), ('grad', clf7)
], voting='soft', weights=[0.3, 0.1, 0.2, 0.1, 0.3])
vc.fit(X_train, y_train)

predicted = clf.predict(X_test)
predicted2 = clf2.predict(X_test)
predicted3 = clf3.predict(X_test)
predicted_vc = vc.predict(X_test)

score1 = clf.score(X_test, y_test)
score2 = clf2.score(X_test, y_test)
score3 = clf3.score(X_test, y_test)
score4 = clf4.score(X_test, y_test)
score5 = clf5.score(X_test, y_test)
score6 = clf6.score(X_test, y_test)
score7 = clf7.score(X_test, y_test)
score_vc = vc.score(X_test, y_test)

sia = SIA()
pol_scores = [0]*len(y_test)

for i in range(0,len(y_test)):
    pol_score = sia.polarity_scores(X_test1.values[i])['compound']
    pol_scores[i] = int(round(2*pol_score + 2))


print('Diff')
print('(MLP SVC DT) (POL, VC)')
Example #25
    percScoresTrain = []
    percScoresDev = []
    for i in range(10):
        perceptron.fit(trainX, trainY)
        percScoresDev.append(perceptron.score(devX, devY))
        percScoresTrain.append(perceptron.score(trainX, trainY))

    print "Perceptron Train:", np.mean(percScoresTrain)
    print "Perceptron Dev:", np.mean(percScoresDev)
    
    passAggScoresTrain = []
    passAggScoresDev = []
    for i in range(10):
        passAgg.fit(trainX, trainY) 
        passAggScoresDev.append( passAgg.score(devX, devY))
        passAggScoresTrain.append( passAgg.score(trainX, trainY))


    print "Passive Aggressive Train:", np.mean(passAggScoresTrain)
    print "Passive Aggressive Dev:", np.mean(passAggScoresDev)

    
    passAggScoresSmallTrain = []
    passAggScoresSmallDev = []
    for i in range(10):
        passAgg.fit(trainXSmall, trainYSmall)
        passAggScoresSmallDev.append( passAgg.score(devX, devY))
        passAggScoresSmallTrain.append( passAgg.score(trainXSmall,trainYSmall))

    print "Passive Aggressive (Small Dataset)) Train:", np.mean(passAggScoresSmallTrain)
Example #26
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier
import joblib
import numpy as np
import pickle

#clf = svm.SVC()
clf1 = PassiveAggressiveClassifier()
clf2 = SGDClassifier()
scaler = StandardScaler()

X = np.loadtxt('features.txt')
y = [0] * 4192 + [1] * 3317
#X = SelectKBest(chi2, k=10).fit_transform(X, y)
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)
#clf.fit(X_train, y_train)
clf1.fit(X, y)
clf2.fit(X, y)
print(clf1.score(X_test, y_test))
print(clf2.score(X_test, y_test))
joblib.dump(clf1, 'passive_aggressive.pkl')
joblib.dump(clf2, 'sgd.pkl')
#joblib.dump(scaler, 'scaler.pkl')
print('BOW Results: ' + per(svm_bow_train) + ' training accuracy, ' +
      per(svm_bow_test) + ' testing accuracy')
print('Bigram Results: ' + per(svm_bigram_train) + ' training accuracy, ' +
      per(svm_bigram_test) + ' testing accuracy')
# Now lets try using passive aggressive classifier:
from sklearn.linear_model import PassiveAggressiveClassifier, PassiveAggressiveRegressor
pac = PassiveAggressiveClassifier()
pac2 = PassiveAggressiveClassifier()
par = PassiveAggressiveRegressor()
par2 = PassiveAggressiveRegressor()
# Now fit
pac.fit(train_bow, train_ratings)
par.fit(train_bow, train_ratings)
pac2.fit(train_bigram, train_ratings)
par2.fit(train_bigram, train_ratings)
# Record and display results
pac_bow_train = pac.score(train_bow, train_ratings)
pac_bow_test = pac.score(test_bow, test_ratings)
pac_bigram_train = pac2.score(train_bigram, train_ratings)
pac_bigram_test = pac2.score(test_bigram, test_ratings)
par_bow_train = par.score(train_bow, train_ratings)
par_bow_test = par.score(test_bow, test_ratings)
par_bigram_train = par2.score(train_bigram, train_ratings)
par_bigram_test = par2.score(test_bigram, test_ratings)
# pac = par = pac2 = par2 = 1
del pac, par, pac2, par2
# Results
print('Passive Aggressive Classifier')
print('BOW Results: ' + per(pac_bow_train) + ' training accuracy, ' +
      per(pac_bow_test) + ' testing accuracy')
print('Bigram Results: ' + per(pac_bigram_train) + ' training accuracy, ' +
      per(pac_bigram_test) + ' testing accuracy')
Example #28
    height_list = []

    logging.info("GhCore on '%s' database", db_name)
    logging.info("#samples = %d; #features = %d" % (X.shape[0], X.shape[1]))

    logging.info("Creting train/test split...")
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    for trainval_index, test_index in skf.split(X, y):
        X_trainval, X_test = X[trainval_index], X[test_index]
        y_trainval, y_test = y[trainval_index], y[test_index]

        # accuracy baseline
        if True:
            model = PassiveAggressiveClassifier()
            model.fit(X_trainval, y_trainval)
            base_accuracy_test = model.score(X_test, y_test)
        else:
            model, base_accuracy_test = keras_cnn_model(
                X_trainval, y_trainval, X_test, y_test)
        print("Baseline accuracy: %.4f" % (base_accuracy_test))
        base_accuracy_list.append(base_accuracy_test)

        # try with GhCore
        skf2 = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
        list_of_splits = [split for split in skf2.split(X_trainval, y_trainval)]
        train_index, val_index = list_of_splits[0]
        X_train, X_val = X_trainval[train_index], X_trainval[val_index]
        y_train, y_val = y_trainval[train_index], y_trainval[val_index]

        root, _, model, accuracy_train, X_arch_core, y_arch_core, outliers, pruned_nodes = Ghcore(X_train, y_train, X_val, y_val, \
                       max_height=20, min_epochs=10, \
def runLearner(printStages = True, useSelector = False, discreteHelpfulness = True, useRST = True, useFew = False):
    learner = PassiveAggressiveClassifier() if discreteHelpfulness else PassiveAggressiveRegressor()
    #bestwords = getBestWords(instances,num=1000)
    tfidvec = TfidfVectorizer(sublinear_tf=True,stop_words='english', ngram_range=(1,3), decode_error='replace')
    selector = SelectKBest(chi2, k=50000) if useSelector else None
    encoder = LabelEncoder() if discreteHelpfulness else None
    if discreteHelpfulness:
        classlabels = encoder.fit_transform(labels)
    newData = False

    count = 0
    if useRST:
      print 'Getting RST data'
      nums, texts, ilabels = getPickledRSTSciKitDataLists(True) if newData else getRSTSciKitDataLists(True)

      random = RandomFeatureExtractor()
      lengthBaseline = LenFeatureExtractor()
      fullRST = FullPickledRSTFeatureExtractor(nums)  if newData else FullTextRSTFeatureExtractor(nums)
      limitedRST = LimitedPickledRSTFeatureExtractor(nums)  if newData else LimitedTextRSTFeatureExtractor(nums)
      vectorizer =  FeatureUnion([('extra',limitedRST),('tfid',tfidvec)])

      print 'Fitting random features baseline'
      random.fit(texts)
      print 'Fitting text length baseline'
      lengthBaseline.fit(texts)
      print 'Fitting full RST features'
      fullRST.fit(texts)
      print 'Fitting limited RST features'
      limitedRST.fit(texts)
      print 'Fitting limited RST with tfidvec features'
      vectorizer.fit(texts)
      print 'Fitting tfidvec features'
      tfidvec.fit(texts)

      split = int(0.8*len(ilabels))
      trainData = (texts[:split],ilabels[:split])
      testData = (texts[split:],ilabels[split:])      

      X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],random,encoder,selector)
      print 'random features baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      dummy = DummyClassifier()
      X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
      dummy.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],random,encoder,selector)
      print 'Dummy label distribution baseline trained on %d instances has accuracy %f'%(len(trainData[0]),dummy.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],lengthBaseline,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],lengthBaseline,encoder,selector)
      print 'text length baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],fullRST,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],fullRST,encoder,selector)
      print 'Full RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],limitedRST,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],limitedRST,encoder,selector)
      print 'Limited RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],vectorizer,encoder,selector)
      learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],vectorizer,encoder,selector)
      print 'Limited RST with ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))

      X,y = getAsSciKit(trainData[0],trainData[1],tfidvec,encoder,selector)
      learner = learner.fit(X,y)
      X,y = getAsSciKit(testData[0],testData[1],tfidvec,encoder,selector)
      print 'ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))


    else:
      vectorizer = tfidvec
      testData = None
      vocabGotten = False
      instances = ([],[])
      numVocab = 50000
      numTest = 50000
      numTrain = 100000
      maxTrainStages = 20
      for text,label in getSciKitData(stateProgress = False, discreteLabels=discreteHelpfulness):
          if label!='few' or useFew:
            instances[0].append(text)
            instances[1].append(label)
            if not vocabGotten and len(instances[0]) == numVocab:
                if printStages:
                    print 'Fitting vocabulary with %d instances'%numVocab
                vectorizer.fit(instances[0],None)
                if selector is not None:
                    X,y = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,None)
                    selector.fit(X,y)
                vocabGotten = True
                instances = ([],[])
            elif vocabGotten and testData is None and len(instances[0]) == numTest:
                if printStages:
                    print 'Getting test data with %d instances'%numTest
                testData = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector)
                instances = ([],[])
            elif vocabGotten and testData is not None and len(instances[0]) == numTrain:
                X,y = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector)
                if discreteHelpfulness:
                    learner = learner.partial_fit(X,y, classes = classlabels)
                else:
                    learner = learner.partial_fit(X,y)
                instances = ([],[])
                count = count + 1
                if printStages:
                    print 'Baseline trained on %d instances has accuracy %f'%(count*numTrain,learner.score(testData[0],testData[1]))
            elif count == maxTrainStages:
                break
      print 'Final learner trained on %d instances has accuracy %f'%(maxTrainStages*numTrain,learner.score(testData[0],testData[1]))
Example #30
#y is a categorical variable so will encode it
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

#now splitting the data into train and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
#training the model
from sklearn.linear_model import PassiveAggressiveClassifier
model = PassiveAggressiveClassifier()
model.fit(x_train, y_train)
#predicting the values
y_pred = model.predict(x_test)
#score of the model
model.score(x_test, y_test)
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)
print(f"Classification Report : \n\n{classification_report(y_test, y_pred)}")
'''Classification Report : 

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.96      0.88      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115
'''
Example #31
    # Create the model
    pac = PassiveAggressiveClassifier(C=0.05,
                                      loss='squared_hinge',
                                      max_iter=2000,
                                      random_state=1000)

    # Train with the start-up samples
    nb_initial_samples = int(X_train.shape[0] / 1.5)
    pac.fit(X_train[0:nb_initial_samples], Y_train[0:nb_initial_samples])

    # Continue with the incremental samples
    validation_accuracies = []

    for (x, y) in zip(X_train[nb_initial_samples:],
                      Y_train[nb_initial_samples:]):
        pac.partial_fit(x.reshape(1, -1),
                        y.ravel(),
                        classes=np.unique(iris['target']))
        validation_accuracies.append(pac.score(X_test, Y_test))

    # Show the validation plot
    fig, ax = plt.subplots(figsize=(18, 8))

    ax.plot(validation_accuracies)
    ax.set_xlabel('Online sample')
    ax.set_ylabel('Validation accuracy')
    ax.grid()

    plt.show()
Example #32
def main():
    # Vectorizer with 2^19 buckets.
    chunkSize = 300000
    n_buckets = 2**19

    vectorizer = HashingVectorizer(decode_error='ignore',
                                   n_features=n_buckets,
                                   non_negative=True)
    classifier = PassiveAggressiveClassifier()

    #JSONGenerator = readChunk("data/dataSampleFile",chunkSize)
    #JSONGenerator = readChunk("data/RC_2007-10",chunkSize)
    #JSONGenerator = readChunk("data/RC_2008-01",chunkSize)
    JSONGenerator = readChunk("data/RC_2008-12", chunkSize)
    #JSONGenerator = readChunk("data/RC_2009-12",chunkSize)
    #JSONGenerator = readChunk("data/RC_2012-01",chunkSize)

    JSONArrayTestSet = next(JSONGenerator)
    X_test_text = []
    Y_test = []
    for JSONString in JSONArrayTestSet:
        JSONObject = json.loads(JSONString)
        # Don't care about deleted content.
        if JSONObject["body"] == "[deleted]":
            continue

        X_test_text.append(JSONObject["body"])
        Y_test.append(rangifyScore(int(JSONObject["score"])))

    X_test = vectorizer.transform(X_test_text)
    log("Start till MainLoop timer: " + str(time.time() - startTick))
    generatorTimeTick = time.time()
    # For loop for generators. Smart!
    for i, JSONArray in enumerate(JSONGenerator):
        log("readChunkTimer: " + str(time.time() - generatorTimeTick))

        X_train_text = []
        Y_train = []
        extractFeatureTimeTick = time.time()
        for JSONString in JSONArray:

            JSONObject = json.loads(JSONString)
            # Don't care about deleted content.
            if JSONObject["body"] == "[deleted]":
                continue

            X_train_text.append(JSONObject["body"])
            Y_train.append(rangifyScore(int(JSONObject["score"])))

        log("Feature Extract timer: " +
            str(time.time() - extractFeatureTimeTick))
        tick = time.time()
        X_train = vectorizer.transform(X_train_text)
        log("Vectorize timer:" + str(time.time() - tick))

        tick = time.time()
        classifier.partial_fit(X_train,
                               Y_train,
                               classes=[i for i in range(41)])
        log("Partial fit timer:" + str(time.time() - tick))

        generatorTimeTick = time.time()

    log("Total Time: " + str(time.time() - startTick))
    print(classifier.score(X_test, Y_test))
plt.legend(loc="lower right")

plt.show()

#Passive Aggressive Classifier Algorithm

from sklearn.linear_model import PassiveAggressiveClassifier
PC = PassiveAggressiveClassifier()
PC = PC.fit(X_train, y_train)
PC

#accuracy of Passive Aggressive Classifier Algorithm

y_pred1 = PC.predict(X_test)
print('Accuracy score= {:.2f}'.format(PC.score(X_test, y_test)))

#ROC curve of Passive Aggressive Classifier Algorithm

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

fpr, tpr, thresholds = roc_curve(y_test, y_pred1)

roc_auc = auc(fpr, tpr)

plt.figure()

plt.plot(fpr,
         tpr,
         color='darkorange',
Example #34
from sklearn.linear_model import PassiveAggressiveClassifier
P_estimator = PassiveAggressiveClassifier(C=1.0,
                                          fit_intercept=True,
                                          shuffle=True,
                                          verbose=0,
                                          loss='hinge',
                                          n_jobs=1,
                                          random_state=None,
                                          warm_start=False,
                                          class_weight=None,
                                          n_iter=5)

P_estimator.fit(X_train, y_train)
P_estimator.predict(X_test)
print("Accuracy:{}".format(P_estimator.score(X_test, y_test)))

# # Comparing Different Classifiers

# In[136]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression

heldout = [0.95, 0.90, 0.75, 0.50, 0.01]
Example #35
print("Shape of Train X {}\n".format(trainX.shape))
print("Sample of the vocab:\n {}".format(
    np.random.choice(countVec.get_feature_names(), 20)))

#%% PICK A MODEL AND EXPERIMENT
lr = LogisticRegression()
passAgg = PassiveAggressiveClassifier()
perceptron = Perceptron()

lr.fit(trainX, trainY)
print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")

passAgg.fit(trainX, trainY)
print("Passive Aggressive Train:", passAgg.score(trainX, trainY))
print("Passive Aggressive Dev:", passAgg.score(devX, devY))
print("--")

perceptron.fit(trainX, trainY)
print("Perceptron Train:", perceptron.score(trainX, trainY))
print("Perceptron Dev:", perceptron.score(devX, devY))
print("--")

#%% ANALYSIS AND DEBUGGING
lr = LogisticRegression()
lr.fit(trainX, trainY)
print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")
# The following uses classification with a Passive Aggressive Classifier
from sklearn.linear_model import PassiveAggressiveClassifier
classifier = PassiveAggressiveClassifier(random_state=None)
#classifier.fit(data,t) # training on the iris dataset

#print(classifier.predict(data[0]))
#print(t[0])

#from sklearn import cross_validation
#train, test, t_train, t_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0)
from sklearn.model_selection import train_test_split
train, test, t_train, t_test = train_test_split(X, y, test_size=0.4, random_state=0)
classifier.fit(train,t_train) # train
#print(classifier.get_params())
print(classifier.score(test,t_test)) # test
print()

#confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(t_test, classifier.predict(test)))
print()

# compute the f1-score
from sklearn.metrics import classification_report
print(classification_report(t_test, classifier.predict(test), target_names=['setosa', 'versicolor', 'virginica']))
print()

#-----------------------------------------------------

#from sklearn.cross_validation import cross_val_score