def train_and_predict_m7 (train, test, labels) :
    """Fit a Passive-Aggressive text classifier and predict labels for ``test``.

    Both inputs are first passed through ``stemmer_clean`` (snowball stemmer),
    then vectorised with a TF-IDF model whose vocabulary is learned from the
    training text only.  When the externally defined ``gridSearch`` flag is
    truthy the classifier is tuned via ``perform_grid_search``; otherwise the
    preset classifier is fitted directly.

    Returns the output of ``predict`` on the vectorised test data.
    """
    train_text, test_text = stemmer_clean(train, test, stemmerEnableM7,
                                          stemmer_type='snowball')

    # Word n-grams (1 to 5), sub-linear TF, smoothed IDF, project stop words.
    vectorizer = TfidfVectorizer(min_df=5,
                                 max_features=None,
                                 strip_accents='unicode',
                                 analyzer='word',
                                 token_pattern=r'\w{1,}',
                                 ngram_range=(1, 5),
                                 smooth_idf=1,
                                 sublinear_tf=1,
                                 stop_words=ML_STOP_WORDS)
    X = vectorizer.fit_transform(train_text)
    X_test = vectorizer.transform(test_text)

    print ("Fitting Passive-Aggressive Classifer...")
    classifier = PassiveAggressiveClassifier(random_state=randomState,
                                             loss='squared_hinge',
                                             n_iter=100,
                                             C=0.01)

    if not gridSearch:
        classifier.fit(X, labels)
        return classifier.predict(X_test)

    # Candidate hyper-parameters explored by the grid search.
    # (The original comment says the search optimises quadratic weighted
    # kappa -- that happens inside perform_grid_search, not visible here.)
    search_space = {'C': [0.003, 0.01, 0.03, 0.1],
                    'loss': ['hinge', 'squared_hinge'],
                    'n_iter': [5, 10, 30, 100, 300]}
    tuned_model = perform_grid_search(classifier, search_space, X, labels)
    return tuned_model.predict(X_test)
def test_classifier_refit():
    """A fitted classifier can be refitted on other features and label types."""
    clf = PassiveAggressiveClassifier(max_iter=5)
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))

    # Second fit: one feature column fewer, and string labels instead of ints.
    fewer_features = X[:, :-1]
    string_labels = iris.target_names[y]
    clf.fit(fewer_features, string_labels)
    assert_array_equal(clf.classes_, iris.target_names)
def PassiveAggressiveClassifier_1(train_predictors,test_predictors,train_target,test_target):
    """Fit a default PassiveAggressiveClassifier and report its test accuracy.

    The model is trained on ``train_predictors``/``train_target`` and
    evaluated on ``test_predictors``/``test_target`` with ``accuracy_score``.

    Returns
    -------
    (accuracy, predicted)
        The accuracy value and the predictions for ``test_predictors``.
    """
    clf = PassiveAggressiveClassifier()
    clf.fit(train_predictors, train_target)
    predicted = clf.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    # Single-argument call form: prints the same text on Python 2 and 3
    # (the bare print statement is a SyntaxError on Python 3).
    print("Accuracy for Linear Model PassiveAggressiveClassifier: " + str(accuracy))
    return accuracy, predicted
def test_classifier_accuracy():
    """Training accuracy exceeds 0.79 for both inputs, with and without intercept."""
    # Same iteration order as nested loops: data outermost, intercept innermost.
    cases = [(data, fit_intercept)
             for data in (X, X_csr)
             for fit_intercept in (True, False)]
    for data, fit_intercept in cases:
        clf = PassiveAggressiveClassifier(C=1.0, n_iter=30,
                                          fit_intercept=fit_intercept,
                                          random_state=0)
        clf.fit(data, y)
        assert_greater(clf.score(data, y), 0.79)
def train_online_model(xtr, ytr, model=None):
    """Train a Passive-Aggressive model, or update an existing one incrementally.

    Parameters
    ----------
    xtr, ytr
        Training features and targets, passed straight to the estimator.
    model : estimator or None
        When ``None`` a fresh ``PassiveAggressiveClassifier`` is created and
        fitted with ``fit``.  Otherwise the given model is updated in place
        with ``partial_fit``.

    Returns
    -------
    The fitted (or updated) model.  The elapsed time is printed.
    """
    t0 = time.time()
    if model is None:
        model = PassiveAggressiveClassifier()
        model.fit(xtr, ytr)
    else:
        model.partial_fit(xtr, ytr)
    # Call form with one argument behaves the same on Python 2 and 3;
    # the original print statement does not compile on Python 3.
    print("Training took %.2f seconds" % (time.time() - t0))
    return model
def test_classifier_partial_fit():
    """Thirty ``partial_fit`` passes reach a training accuracy above 0.79."""
    classes = np.unique(y)
    n_passes = 30
    for data in (X, X_csr):
        clf = PassiveAggressiveClassifier(C=1.0,
                                          fit_intercept=True,
                                          random_state=0)
        for _ in range(n_passes):
            clf.partial_fit(data, y, classes)
        assert_greater(clf.score(data, y), 0.79)
def test_passive_aggressive_2():
    """TPOT's passive-aggressive operator called with C == 0.0 matches sklearn.

    TPOT is invoked with a C argument of 0.0 while the reference sklearn
    classifier uses C=0.0001 -- presumably TPOT clamps C to that minimum
    (not visible here; confirm against the operator).
    """
    tpot_obj = TPOT()
    tpot_output = tpot_obj._passive_aggressive(training_testing_data, 0.0, 0)
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    reference = PassiveAggressiveClassifier(C=0.0001, loss='hinge',
                                            fit_intercept=True,
                                            random_state=42)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def mainworker(limit1,limit2):
    """Cross-validate a Passive-Aggressive classifier on ``pdata.txt``.

    Every line of ``pdata.txt`` is parsed as comma-separated integers whose
    last value is the class label.  Feature columns ``limit1`` .. ``limit2``
    (inclusive) are removed before training.  For k = 2, 3, 4 and 5 a k-fold
    cross-validation is run and the mean of ``getAccuracy`` over the folds is
    printed.

    Parameters
    ----------
    limit1 : int
        Index of the first feature column to drop.
    limit2 : int
        Index of the last feature column to drop.
    """
    rows = []
    classlist = []
    # Context manager guarantees the file is closed even if parsing fails.
    with open("pdata.txt") as f:
        for line in f:
            values = [int(v) for v in line.strip("\n").split(",")]
            clas = values.pop()  # the last column is the label
            rows.append(values[:limit1] + values[limit2 + 1:])
            classlist.append(clas)

    X = np.array(rows)
    y = np.array(classlist)

    for k in [2, 3, 4, 5]:
        # Fold over the rows actually loaded instead of a hard-coded 11054.
        kf = cross_validation.KFold(len(X), n_folds=k)
        averager = []
        for train_index, test_index in kf:
            clf = PassiveAggressiveClassifier()
            clf.fit(X[train_index], y[train_index])
            predicted = clf.predict(X[test_index])
            averager.append(getAccuracy(predicted, y[test_index]))
        answer = np.mean(averager)
        # Single-argument print calls give the same output on Python 2 and 3.
        print("The mean for %s fold is:" % k)
        print(answer)
def TrainSVM(data, labels, usealgo=1):
    """Fit one of three linear classifiers on ``data``/``labels``.

    Parameters
    ----------
    data, labels
        Training features and targets, passed straight to ``fit``.
    usealgo : int, optional
        Which estimator to train:

        * ``0`` -- ``PassiveAggressiveClassifier`` with balanced class weights
        * ``1`` -- linear-kernel ``SVC`` with probability estimates (default;
          this was the previously hard-wired choice)
        * ``2`` -- ``LinearSVC``

    Returns
    -------
    The fitted classifier.

    Raises
    ------
    ValueError
        If ``usealgo`` is not 0, 1 or 2.
    """
    if usealgo == 0:
        from sklearn.linear_model import PassiveAggressiveClassifier
        clf = PassiveAggressiveClassifier(class_weight='balanced', n_jobs=-1,
                                          n_iter=15, fit_intercept=True)
    elif usealgo == 1:
        # The seed is drawn from NumPy's global RNG, so runs are only
        # reproducible if that RNG was seeded by the caller.
        clf = SVC(probability=True, decision_function_shape='ovr',
                  random_state=np.random.randint(1000), kernel="linear")
    elif usealgo == 2:
        from sklearn.svm import LinearSVC
        clf = LinearSVC()
    else:
        raise ValueError("unknown usealgo value: %r" % (usealgo,))
    clf.fit(data, labels)
    return clf
class DeployedClassifierFactory:
    def __init__(self, term_doc_matrix, term_doc_matrix_factory, category, nlp=None):
        '''This is a class that enables one to train and save a classification model.

        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
        term_doc_matrix_factory : TermDocMatrixFactory
            Must not yet carry a parser (``_nlp``) or a ``category_text_iter``.
        category : str
            Category name
        nlp : spacy parser
            Accepted but not stored by this constructor.
        '''
        self._term_doc_matrix = term_doc_matrix
        self._term_doc_matrix_factory = term_doc_matrix_factory
        assert term_doc_matrix_factory._nlp is None
        assert term_doc_matrix_factory.category_text_iter is None
        self._category = category
        self._clf = None
        self._proba = None

    def passive_aggressive_train(self):
        '''Trains passive aggressive classifier

        Fits the classifier on the term-document matrix and stores, in
        ``self._proba``, a function mapping a signed distance from the
        hyperplane to a pseudo-probability: positive distances map into
        (0.5, 1] and negative distances into [0, 0.5], each via the empirical
        CDF of the training distances on that side.

        Returns
        -------
        self
        '''
        # NOTE(review): `n_iter` was renamed `max_iter` in later scikit-learn
        # releases -- confirm against the pinned version.
        self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
        self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
        y_dist = self._clf.decision_function(self._term_doc_matrix._X)
        pos_ecdf = ECDF(y_dist[y_dist >= 0])
        neg_ecdf = ECDF(y_dist[y_dist <= 0])

        def proba_function(distance_from_hyperplane):
            if distance_from_hyperplane > 0:
                return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
            elif distance_from_hyperplane < 0:
                # Use the negative-side ECDF here: pos_ecdf is built only from
                # non-negative distances and is 0 for every negative input.
                return neg_ecdf(distance_from_hyperplane) / 2.
            return 0.5

        self._proba = proba_function
        return self

    def build(self):
        '''Builds Deployed Classifier

        Raises
        ------
        NeedToTrainExceptionBeforeDeployingException
            If no classifier has been trained yet.
        '''
        if self._clf is None:
            raise NeedToTrainExceptionBeforeDeployingException()
        return DeployedClassifier(self._category,
                                  self._term_doc_matrix._category_idx_store,
                                  self._term_doc_matrix._term_idx_store,
                                  self._term_doc_matrix_factory)
def test_classifier_correctness(loss):
    """sklearn's coefficients agree with ``MyPassiveAggressive`` to 2 decimals.

    The comparison is made for both the ``X`` and ``X_csr`` inputs on a
    binary relabelling of ``y``.
    """
    # Binary problem: label 1 is kept, every other label becomes -1.
    y_bin = y.copy()
    y_bin[y != 1] = -1

    reference = MyPassiveAggressive(C=1.0, loss=loss,
                                    fit_intercept=True, n_iter=2)
    reference.fit(X, y_bin)

    for data in (X, X_csr):
        candidate = PassiveAggressiveClassifier(C=1.0, loss=loss,
                                                fit_intercept=True,
                                                max_iter=2, shuffle=False,
                                                tol=None)
        candidate.fit(data, y_bin)
        assert_array_almost_equal(reference.w, candidate.coef_.ravel(),
                                  decimal=2)
def test_classifier_accuracy():
    """Training accuracy exceeds 0.79 across inputs, intercept and averaging.

    With ``average=True`` the fitted model must also expose the averaged and
    standard coefficient/intercept attributes.
    """
    averaged_attrs = ('average_coef_', 'average_intercept_',
                      'standard_intercept_', 'standard_coef_')
    for data in (X, X_csr):
        for fit_intercept in (True, False):
            for average in (False, True):
                clf = PassiveAggressiveClassifier(C=1.0, max_iter=30,
                                                  fit_intercept=fit_intercept,
                                                  random_state=1,
                                                  average=average, tol=None)
                clf.fit(data, y)
                assert_greater(clf.score(data, y), 0.79)
                if not average:
                    continue
                for attr in averaged_attrs:
                    assert hasattr(clf, attr)
def test_classifier_partial_fit():
    """Thirty ``partial_fit`` passes reach accuracy above 0.79, averaged or not.

    With ``average=True`` the model must also expose the averaged and
    standard coefficient/intercept attributes.
    """
    classes = np.unique(y)
    averaged_attrs = ('average_coef_', 'average_intercept_',
                      'standard_intercept_', 'standard_coef_')
    n_passes = 30
    for data in (X, X_csr):
        for average in (False, True):
            clf = PassiveAggressiveClassifier(C=1.0, fit_intercept=True,
                                              random_state=0,
                                              average=average, max_iter=5)
            for _ in range(n_passes):
                clf.partial_fit(data, y, classes)
            assert_greater(clf.score(data, y), 0.79)
            if not average:
                continue
            for attr in averaged_attrs:
                assert hasattr(clf, attr)
class PassiveAgressiveClassifier(Classifier):
    """Classifier wrapper that fits its underlying model lazily.

    The wrapped ``OCC`` estimator is trained on the full matrix supplied by
    the matrix database the first time ``classify`` is called.
    """

    def __init__(self, matrixdatabase):
        """Store the matrix database and create the (unfitted) estimator."""
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._occ = OCC(C=0.0083, n_iter=27, loss="hinge")

    def learn(self, ingredients, cuisine):
        """Do nothing; training happens in bulk on the first ``classify``."""
        return

    def classify(self, ingredients):
        """Return the predicted class for one recipe's ``ingredients``.

        On the first call the estimator is fitted on the training matrix
        obtained from the matrix database.
        """
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._occ = self._occ.fit(matrix, classes)
            # Call form works on Python 2 and 3; the bare print statement
            # is a SyntaxError on Python 3.
            print("Fitting complete...")
            self._has_fit = True
        output = self._occ.predict(self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
class PassiveAggressiveModel(BaseModel):
    """``BaseModel`` backed by a squared-hinge PassiveAggressiveClassifier."""

    def __init__(self, cached_features):
        """Initialise the base model and create the underlying classifier."""
        BaseModel.__init__(self, cached_features)
        classifier = PassiveAggressiveClassifier(C=1.0,
                                                 loss='squared_hinge',
                                                 random_state=1)
        self.model = classifier

    def _predict_internal(self, X_test):
        """Return the wrapped classifier's predictions for ``X_test``."""
        predictions = self.model.predict(X_test)
        return predictions
def test_classifier_correctness():
    """sklearn's coefficients match ``MyPassiveAggressive`` for both losses."""
    # Binary problem: label 1 is kept, every other label becomes -1.
    y_bin = y.copy()
    y_bin[y != 1] = -1

    for loss in ("hinge", "squared_hinge"):
        shared_params = dict(C=1.0, loss=loss, fit_intercept=True, n_iter=2)

        reference = MyPassiveAggressive(**shared_params)
        reference.fit(X, y_bin)

        candidate = PassiveAggressiveClassifier(**shared_params)
        candidate.fit(X, y_bin)

        assert_array_almost_equal(reference.w, candidate.coef_.ravel())
def test_equal_class_weight():
    """On balanced data, equal or "balanced" class weights leave coef_ unchanged."""
    X2 = [[1, 0], [1, 0], [0, 1], [0, 1]]
    y2 = [0, 0, 1, 1]

    def fitted(class_weight):
        model = PassiveAggressiveClassifier(C=0.1, n_iter=1000,
                                            class_weight=class_weight)
        model.fit(X2, y2)
        return model

    unweighted = fitted(None)
    # The data are already balanced, so "balanced" should change nothing.
    balanced = fitted("balanced")
    explicit = fitted({0: 0.5, 1: 0.5})

    # Only approximately equal, because of the learning-rate schedule.
    assert_almost_equal(unweighted.coef_, explicit.coef_, decimal=2)
    assert_almost_equal(unweighted.coef_, balanced.coef_, decimal=2)
def main():
    """Train on ``trainingdata.txt``, classify documents from stdin, print labels.

    Training file, as parsed here: the first line holds the number of
    documents ``n``; each of the next ``n`` lines starts with a one-character
    integer class label followed by the document text.

    Standard input: the first line holds the number of test documents,
    followed by that many documents, one per line.  One predicted label is
    printed per test document.
    """
    # raw_input exists only on Python 2; on Python 3 input() is the equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # The context manager closes the file; 'rU' is dropped because that mode
    # is not accepted by recent Python versions and every line is stripped.
    with open('trainingdata.txt', 'r') as training_data:
        n = int(training_data.readline().strip())
        lines = [training_data.readline().strip() for _ in range(n)]

    train_data = np.array([line[1:].strip() for line in lines])
    class_data = np.array([int(line[0]) for line in lines])

    # Vectorize the bag of words; the vocabulary comes from training text only.
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5,
                                 sublinear_tf=True)
    vectorizer.fit(train_data)
    X_train = vectorizer.transform(train_data)

    # Read the test documents: count first, then one document per line.
    n_test = int(read_line().strip())
    X_test = np.array([read_line().strip() for _ in range(n_test)])
    X_test = vectorizer.transform(X_test)

    clf = PassiveAggressiveClassifier(n_iter=9)
    clf.fit(X_train, class_data)
    pred = clf.predict(X_test)
    for label in pred:
        print(label)
def test_main(self):
    """Smoke test: features from ``FeatsFromDoc`` can be fed to a classifier.

    A term-document matrix is built with 'GPE' entities censored, a
    classifier is trained on its L1-normalised TF-IDF features, and a new
    document is converted to features and scored.  There are no assertions;
    the test passes if nothing raises.
    """
    categories, documents = get_docs_categories()

    def clean_function(text):
        # Lines that start with '[' are blanked out.
        return '' if text.startswith('[') else text

    entity_types = {'GPE'}
    term_doc_mat = TermDocMatrixFactory(
        category_text_iter=zip(categories, documents),
        clean_function=clean_function,
        nlp=_testing_nlp,
        feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
    ).build()

    clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)

    feature_source = FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
    fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
                       clean_function=clean_function,
                       feats_from_spacy_doc=feature_source).set_nlp(_testing_nlp)

    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)

    X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
    pred = clf.predict(tfidf.transform(X_to_predict))
    # NOTE(review): decision_function receives the raw features while
    # predict receives the TF-IDF-transformed ones -- confirm this is intended.
    dec = clf.decision_function(X_to_predict)
def test_class_weights():
    """A tiny weight on class 1 flips the prediction for a borderline point."""
    X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                   [1.0, 1.0], [1.0, 0.0]])
    y2 = [1, 1, 1, -1, -1]
    probe = [[0.2, -1.0]]

    unweighted = PassiveAggressiveClassifier(C=0.1, max_iter=100,
                                             class_weight=None,
                                             random_state=100)
    unweighted.fit(X2, y2)
    assert_array_equal(unweighted.predict(probe), np.array([1]))

    # Give class 1 a very small weight.
    down_weighted = PassiveAggressiveClassifier(C=0.1, max_iter=100,
                                                class_weight={1: 0.001},
                                                random_state=100)
    down_weighted.fit(X2, y2)

    # The hyperplane should now rotate clockwise, so the prediction for the
    # probe point shifts to the other class.
    assert_array_equal(down_weighted.predict(probe), np.array([-1]))
def passive_aggressive_train(self):
    '''Trains passive aggressive classifier

    Fits the classifier on the term-document matrix and stores, in
    ``self._proba``, a function mapping a signed distance from the
    hyperplane to a pseudo-probability: positive distances map into
    (0.5, 1] and negative distances into [0, 0.5], each via the empirical
    CDF of the training distances on that side.

    Returns
    -------
    self
    '''
    # NOTE(review): `n_iter` was renamed `max_iter` in later scikit-learn
    # releases -- confirm against the pinned version.
    self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
    self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
    y_dist = self._clf.decision_function(self._term_doc_matrix._X)
    pos_ecdf = ECDF(y_dist[y_dist >= 0])
    neg_ecdf = ECDF(y_dist[y_dist <= 0])

    def proba_function(distance_from_hyperplane):
        if distance_from_hyperplane > 0:
            return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
        elif distance_from_hyperplane < 0:
            # Use the negative-side ECDF here: pos_ecdf is built only from
            # non-negative distances and is 0 for every negative input.
            return neg_ecdf(distance_from_hyperplane) / 2.
        return 0.5

    self._proba = proba_function
    return self
abstracts = [BeautifulSoup(x).get_text() for x in data['abstract']] tfidf = TfidfVectorizer() X = tfidf.fit_transform(abstracts) y = data['type'].to_numpy() support_vec = svm.SVC(kernel='rbf', C=1000, gamma=0.001) rf = RandomForestClassifier(criterion='gini', max_features='sqrt', n_estimators=700) sgd = SGDClassifier(alpha=0.0001, fit_intercept=True, loss='modified_huber', penalty='l2') pac = PassiveAggressiveClassifier(C=1.0, early_stopping=True, fit_intercept=True, max_iter=2000) support_vec.fit(X, y) rf.fit(X, y) sgd.fit(X, y) pac.fit(X, y) # p_data = pd.read_csv('potentially_fake.tsv', sep='\t') p_data = pd.read_csv('potentially_fake-8000.tsv', sep='\t') p_abstracts = [BeautifulSoup(x).get_text() for x in p_data['abstract']] fake_indexes = [] for index in range(len(p_abstracts)): tfidf_pred = TfidfVectorizer(vocabulary=tfidf.vocabulary_) p_x = tfidf_pred.fit_transform([p_abstracts[index]])
]) testingD = pd.read_csv("test.tsv", header=None, sep="\t", names=[ 'ID', 'Label', 'Statement', 'Subject', 'Speaker', 'Job', 'State', 'Party', 'Barely True', 'False', 'Half True', 'Mostly True', 'Pants On Fire', 'Context' ]) df = pd.concat([trainingD, testingD]) df = df.dropna() passive_aggressive = PassiveAggressiveClassifier(max_iter=500) tfidf = TfidfVectorizer(stop_words='english', max_df=0.9) x_train, x_test, y_train, y_test = train_test_split(df["Party"], df['Label'], test_size=0.2) tfidf_train = tfidf.fit_transform(x_train) tfidf_test = tfidf.transform(x_test) passive_aggressive.fit(tfidf_train, y_train) y_pred = passive_aggressive.predict(tfidf_test) score = accuracy_score(y_test, y_pred) print(f' passive aggressive Accuracy: {round(score * 100, 2)}%')
n= int(f.readline()) ln=[] ls=[] for _ in range(n): # change this to n in stead of 3 x=f.readline() xs=x[2:] xn=x[0] ls.append(xs) ln.append(xn) #stem the words bag_of_words=vectorizer.fit(ls) bag_of_words=vectorizer.transform(ls) cmax=0 for cc in range(1,100): #sw=stopwords.words() #stopwords are not supported, requires download clf = PassiveAggressiveClassifier(n_iter=9,C=cc/10) # svm=LinearSVC(C=cc/10.0) clf.fit(bag_of_words,ln) #Now get input (test) data lt=[] filename=open("testdata.txt") line = filename.readline() ntests=int(line) for _ in range(ntests): lt.append(filename.readline()) bag_of_test_words=vectorizer.transform(lt) result=clf.predict(bag_of_test_words) actuals=[] filename=open("testresults.txt")
#print X_train_tfidf.shape ntest = input() testdoc = [] for t in range(0, ntest): doc = raw_input() testdoc.append(doc) X_new_counts = count_vect.transform(testdoc) X_new_tfidf = tfidf_transformer.transform(X_new_counts) """" #Naive bayes from sklearn.naive_bayes import MultinomialNB clf = MultinomialNB().fit(X_train_tfidf, trainlabel) predicted = clf.predict(X_new_tfidf) #test random forest from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=10) clf = clf.fit(X_train_tfidf, trainlabel) predicted = clf.predict(X_new_tfidf) """ from sklearn.linear_model import PassiveAggressiveClassifier clf = PassiveAggressiveClassifier(n_iter=50) clf = clf.fit(X_train_tfidf, trainlabel) predicted = clf.predict(X_new_tfidf) for t in range(0, ntest): print predicted[t]
# Naive Bayes : 0.88.... # GridSearch ## RidgeClassifier #ridge = RidgeClassifier(tol=1e-3, solver="lsqr") #alphas = np.logspace(-6, -1, 100) #clf = GridSearchCV(estimator=ridge, param_grid=dict(alpha=alphas), n_jobs = 3) #clf.fit(X_train, y_train) # feature_selection # selection from model from sklearn.feature_selection import SelectFromModel clf = PassiveAggressiveClassifier(C=0.099, n_iter=200, loss='hinge',random_state = 42) sfm = SelectFromModel(clf, threshold = 0.001) sfm.fit(X_train, y_train) X_train_select = sfm.transform(X_train) X_test_select = sfm.transform(X_test) # test with new clf clf1 = PassiveAggressiveClassifier(C=0.5, n_iter=200, loss='hinge',random_state = 42) benchmark(clf1, X_train_select, y_train, X_test_select, y_test) # GridSearch for C # Set the parameters by cross-validation
def plot_sgd_comparison():
    """Plot test error versus training proportion for several online solvers.

    Each classifier is trained on the hand-written digits dataset for a
    range of held-out fractions.  For every fraction the test error is
    averaged over 20 random splits, and one curve per solver is drawn.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import SGDClassifier, Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.linear_model import LogisticRegression

    heldout = [0.95, 0.90, 0.75, 0.50, 0.01]
    rounds = 20
    digits = datasets.load_digits()
    X, y = digits.data, digits.target

    classifiers = [
        ("SGD", SGDClassifier(max_iter=100)),
        ("ASGD", SGDClassifier(average=True, max_iter=100)),
        ("Perceptron", Perceptron(tol=1e-3)),
        ("Passive-Aggressive I",
         PassiveAggressiveClassifier(loss='hinge', C=1.0, tol=1e-4)),
        ("Passive-Aggressive II",
         PassiveAggressiveClassifier(loss='squared_hinge', C=1.0, tol=1e-4)),
        ("SAG",
         LogisticRegression(solver='sag', tol=1e-1, C=1.e4 / X.shape[0])),
    ]

    # x axis: proportion of the data used for training.
    train_fractions = 1. - np.array(heldout)

    for name, clf in classifiers:
        print("training %s" % name)
        # One RNG per classifier, so each solver sees the same split sequence.
        rng = np.random.RandomState(42)
        mean_errors = []
        for test_fraction in heldout:
            round_errors = []
            for _ in range(rounds):
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_fraction, random_state=rng)
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)
                round_errors.append(1 - np.mean(y_pred == y_test))
            mean_errors.append(np.mean(round_errors))
        plt.plot(train_fractions, mean_errors, label=name)

    plt.legend(loc="upper right")
    plt.xlabel("Proportion train")
    plt.ylabel("Test Error Rate")
    plt.show()
pred, target_names=categories)) if print_cm: print("confusion matrix:") print(metrics.confusion_matrix(y_test, pred)) print() clf_descr = str(clf).split('(')[0] return clf_descr, score, train_time, test_time results = [] for clf, name in ((RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"), (Perceptron(n_iter=50), "Perceptron"), (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"), (KNeighborsClassifier(n_neighbors=10), "kNN")): print('=' * 80) print(name) results.append(benchmark(clf)) for penalty in ["l2", "l1"]: print('=' * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model results.append( benchmark(LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3))) # Train SGD model results.append(
clf.fit(X_train, y_train) y_pred = clf.predict(X_test).reshape(-1, 1) accuracy_score(y_test, y_pred) # Perceptron from sklearn.linear_model import Perceptron clf = Perceptron(tol=1e-3, random_state=0) clf.fit(X_train, y_train) y_pred = clf.predict(X_test).reshape(-1, 1) accuracy_score(y_test, y_pred) # PassiveAggressiveClassifier from sklearn.linear_model import PassiveAggressiveClassifier clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3) clf.fit(X_train, y_train) y_pred = clf.predict(X_test).reshape(-1, 1) accuracy_score(y_test, y_pred) '''Fuzzy Logic Classifier''' '''Cross Validation''' '''# Undersampling per cross_val from collections import Counter from sklearn.datasets import make_classification from imblearn.under_sampling import RandomUnderSampler # summarize class distribution print(Counter(y)) # define dataset X, y = make_classification(n_samples=30000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.333333,0.333333,0.333334],n_classes= 3, flip_y=0, random_state=100) undersample = NearMiss(version=1, n_neighbors_ver3=3)
max_features = 1000
ngram_range = (1, 5)  # defaults to (1, 1) when not specified

# The vectorizer must be bound to the same name that is used below
# (it was previously assigned to `countvec` but used as `countVec`).
countVec = CountVectorizer(min_df=min_df,
                           ngram_range=ngram_range,
                           max_features=max_features)

# Learn the vocabulary from the training set only.
countVec.fit(trainText)

# Turn each list of texts into a matrix of bag-of-words vectors.
trainX = countVec.transform(trainText)
devX = countVec.transform(devText)
testX = countVec.transform(testText)

print("Shape of Train X {}\n".format(trainX.shape))
# NOTE(review): get_feature_names() is replaced by get_feature_names_out()
# in recent scikit-learn releases -- confirm against the installed version.
print("Sample of the vocab:\n {}".format(np.random.choice(countVec.get_feature_names(), 20)))

#%% PICK A MODEL AND EXPERIMENT
lr = LogisticRegression(C=0.1)
passAgg = PassiveAggressiveClassifier(C=0.1)
perceptron = Perceptron()

lr.fit(trainX, trainY)
print("Logistic Regression Train:", lr.score(trainX, trainY))
print("Logistic Regression Dev:", lr.score(devX, devY))
print("--")

passAgg.fit(trainX, trainY)
print("Passive Aggressive Train:", passAgg.score(trainX, trainY))
print("Passive Aggressive Dev:", passAgg.score(devX, devY))
print("--")

perceptron.fit(trainX, trainY)
print("Perceptron Train:", perceptron.score(trainX, trainY))
print("Perceptron Dev:", perceptron.score(devX, devY))
target_names=categories)) if opts.print_cm: print("confusion matrix:") print(metrics.confusion_matrix(y_test, pred)) print() clf_descr = str(clf).split('(')[0] return clf_descr, score, train_time, test_time results = [] for clf, name in ( (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"), (Perceptron(n_iter=50), "Perceptron"), (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"), (KNeighborsClassifier(n_neighbors=10), "kNN"), (RandomForestClassifier(n_estimators=100), "Random forest")): print('=' * 80) print(name) results.append(benchmark(clf)) for penalty in ["l2", "l1"]: print('=' * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model results.append(benchmark(LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3))) # Train SGD model results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
for stat in content: ar=stat.split(" ") labels.append(ar[0]) ar=ar[1:] s=' '.join(ar) # print(s) texts.append(s) vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english') X_train = vectorizer.fit_transform(texts) Y_train = np.array(labels) predict_data=[] n=int(input().strip()) for i in range(n): predict_data.append(input().strip()) X_test = vectorizer.transform(predict_data) clf = PassiveAggressiveClassifier(n_iter=50) clf.fit(X_train, Y_train) results=clf.predict(X_test) for i in results: print(i)
print "Accuracy", scores.mean() print "\nUsing nearest centroid" nc = NearestCentroid() scores = cross_val_score(nc, feature_normal, labels, cv=10, n_jobs=4) print scores print "Accuracy", scores.mean() print "\nnusvc" nusvc = NuSVC() scores = cross_val_score(nusvc, feature_normal, labels, cv=10, n_jobs=4) print scores print "Accuracy", scores.mean() print "\nUsing Passive aggressive Classifier" pac = PassiveAggressiveClassifier() scores = cross_val_score(pac, feature_normal, labels, cv=10, n_jobs=4) print scores print "Accuracy", scores.mean() print "\nUsing the perceptron" per = Perceptron(fit_intercept=False, n_iter=10, shuffle=False) scores = cross_val_score(per, feature_normal, labels, cv=10, n_jobs=4) print scores print "Accuracy", scores.mean() # This hangs my computer for some reason #print "\n Using quadratic discriminant analysis" #qda = QuadraticDiscriminantAnalysis(store_covariances=True) #scores = cross_val_score(qda, feature_normal, labels, cv=10, n_jobs = 2) #print scores
# print (y_train) # k_t, test_y, a_t = read_pan(pan_test) print('Extracting asset') train_x, f_names, chi, transformer = feature_extraction2(givenlabel, k) # print ('Writing database ...') # writeTrainToTxt(train_x,'feature.txt') # # writeTrainToTxt(y_train,'y.txt') # print ('written') train_X, test_X, train_y, test_y = train_test_split(train_x, y_train, train_size=0.80) print('Mission successful') # print(len(X_train)) # test_x,_,_,_=feature_extraction2(givenlabel,k_t) # train_x=train_x.toarray().astype(np.float) # print (train_x.dtype) # train_x,f_names,chi,transformer=feature_extraction(givenlabel,k) # print(simple_classify(RandomForestClassifier(),test_X,test_y,train_X,train_y)) print(simple_classify(RidgeClassifier(), test_X, test_y, train_X, train_y)) print(simple_classify(Perceptron(), test_X, test_y, train_X, train_y)) print( simple_classify(PassiveAggressiveClassifier(), test_X, test_y, train_X, train_y)) print( simple_classify(RandomForestClassifier(), test_X, test_y, train_X, train_y)) print(simple_classify(KNeighborsClassifier(), test_X, test_y, train_X, train_y)) print(simple_classify(MultinomialNB(), test_X, test_y, train_X, k))
if each_word not in stop_words ] text_lmtzr = [lmtzr.lemmatize(each_word) for each_word in text_clean] training_text.append(' '.join(text_lmtzr)) X_vector = vectorizer.fit_transform(training_text) print " Actual number of tfidf features: %d" % X_vector.get_shape()[1] # raw_input() svd = TruncatedSVD(100) lsa = make_pipeline(svd, Normalizer(copy=False)) X_train_lsa = lsa.fit_transform(X_vector) passive_tfidf = PassiveAggressiveClassifier(n_iter=50) passive_tfidf.fit(X_vector, training_class) passive_lsa = PassiveAggressiveClassifier(n_iter=50) passive_lsa.fit(X_train_lsa, training_class) file_Name = "global_intent_tfidf.p" fileObject = open(file_Name, 'wb') pickle.dump(passive_tfidf, fileObject) fileObject.close() file_Name_lsa = "global_intent_lsa.p" fileObject_lsa = open(file_Name_lsa, 'wb') pickle.dump(passive_lsa, fileObject_lsa) fileObject_lsa.close()
# parser.add_argument('--n_estimators', type=int, default=100) # parser.add_argument('--min_samples_leaf', type=int, default=3) # args holds all passed-in arguments args = parser.parse_args() # Read in csv training file training_dir = args.data_dir train_data = pd.read_csv(os.path.join(training_dir, "train.csv"), header=None, names=None) # Labels are in the first column train_y = train_data.iloc[:, 0] train_x = train_data.iloc[:, 1:] ## --- Your code here --- ## ## TODO: Define a model model = PassiveAggressiveClassifier(max_iter=args.iter, random_state=0, tol=1e-3, validation_fraction=args.valid_frac) ## TODO: Train the model model.fit(train_x, train_y) ## --- End of your code --- ## # Save the trained model joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
from sklearn.metrics import accuracy_score, confusion_matrix #Read the data df = pd.read_csv("news.csv") #Get shape and head shape = df.shape print(shape) head1 = df.head() print(head1) labels = df.label print(labels) #Split the dataset x_train, x_test, y_train, y_test = train_test_split(df['text'], labels, test_size=0.2, random_state=7) #Initialize a TfidfVectorizer tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) #Fit and transform train set, transform test set tfidf_train = tfidf_vectorizer.fit_transform(x_train) tfidf_test = tfidf_vectorizer.transform(x_test) #Initialize a PassiveAggressiveClassifier pac = PassiveAggressiveClassifier(max_iter=100) pac.fit(tfidf_train, y_train) #Predict on the test set and calculate accuracy y_pred = pac.predict(tfidf_test) score = accuracy_score(y_test, y_pred) print(f'Accuracy: {round(score*100,2)}%') print(confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL']))
stack[:, i], 6) df_stack.to_csv(path + 'feature/advertiser_id_tfidf_sgd_error_single_classfiy.csv', index=None, encoding='utf8') print('sgd特征已保存\n') ########################### pac(PassiveAggressiveClassifier) ################################ print('PAC stacking') stack_train = np.zeros((train_feature.shape[0], number)) stack_test = np.zeros((test_feature.shape[0], number)) score_va = 0 for i, (tr, va) in enumerate(kfold.split(train_feature, score)): print('stack:%d/%d' % ((i + 1), n_folds)) pac = PassiveAggressiveClassifier(random_state=RANDOM_SEED) pac.fit(train_feature[tr], score[tr]) score_va = pac._predict_proba_lr(train_feature[va]) score_te = pac._predict_proba_lr(test_feature) print(score_va) print('得分' + str(mean_squared_error(score[va], pac.predict(train_feature[va])))) stack_train[va] += score_va stack_test += score_te stack_test /= n_folds stack = np.vstack([stack_train, stack_test]) df_stack = pd.DataFrame() for i in range(stack.shape[1]): df_stack['advertiser_id_tfidf_pac_classfiy_{}'.format(i)] = np.around( stack[:, i], 6) df_stack.to_csv(path +
def get_best_model(x_tweet_data,y_tweet_data, x_news_data, y_news_data):
    """Evaluate SVC, SGD and Passive-Aggressive models and return the best one.

    Both text collections are run through ``prepare_data`` and vectorised
    with a TF-IDF vectorizer obtained from the tweet data.  Labels are mapped
    through ``class_mapping`` using the first element of each label entry.
    Each candidate model is handed to ``best_model_selection``; the fitted
    model with the highest reported F1 is selected (on ties the earlier
    candidate wins).

    Returns
    -------
    (tf_idf_vectorizer, best_fitted_model)
    """
    x_tweet_data = prepare_data(x_tweet_data)
    x_news_data = prepare_data(x_news_data)

    tf_idf_vectorizer = get_tf_idf_vectorizer(x_tweet_data)
    x_tweet_data = tf_idf_vectorizer.transform(x_tweet_data)
    x_news_data = tf_idf_vectorizer.transform(x_news_data)

    y_tweet_data = [class_mapping.get(elem[0]) for elem in y_tweet_data]
    y_news_data = [class_mapping.get(elem[0]) for elem in y_news_data]

    f1_coll = []
    model_coll = []

    def evaluate(model_name, curr_model):
        # Run model selection for one candidate, record its F1 and fitted
        # model exactly once, and print its metrics.
        fitted_model, p, r, f, acc = best_model_selection(
            x_tweet_data, y_tweet_data, x_news_data, y_news_data,
            model_name=model_name, curr_model=curr_model,
            lower_lim=50, upper_lim=5000, step=50)
        f1_coll.append(f)
        model_coll.append(fitted_model)
        print('#################### %s : ' % model_name)
        print('precision : ', p)
        print('recall : ', r)
        print('max_f1: ', f)
        print('acc: ', acc)

    evaluate('SVC', SVC(C=1, kernel='linear', degree=4, gamma='auto',
                        coef0=0.0, shrinking=True, probability=False,
                        tol=0.001, cache_size=200, class_weight='balanced',
                        verbose=False, max_iter=-1,
                        decision_function_shape='ovr', random_state=None))

    evaluate('SGD', SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001,
                                  l1_ratio=0.15, fit_intercept=True,
                                  max_iter=None, tol=0.001, shuffle=True,
                                  verbose=0, epsilon=0.1, n_jobs=1,
                                  random_state=None, learning_rate='optimal',
                                  eta0=0.0, power_t=0.5, class_weight=None,
                                  warm_start=False, average=False,
                                  n_iter=None))

    evaluate('PA', PassiveAggressiveClassifier(C=1.0, fit_intercept=True,
                                               max_iter=None, tol=0.001,
                                               shuffle=True, verbose=0,
                                               loss='hinge', n_jobs=1,
                                               random_state=None,
                                               warm_start=False,
                                               class_weight='balanced',
                                               average=True, n_iter=None))

    best_model_ind = np.argmax(f1_coll)
    return tf_idf_vectorizer, model_coll[best_model_ind]
score = metrics.accuracy_score(y_test, pred) print("accuracy: %0.3f" % score) cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL']) plot_confusion_matrix(cm, classes=['FAKE', 'REAL']) clf = MultinomialNB() clf.fit(count_train, y_train) pred = clf.predict(count_test) score = metrics.accuracy_score(y_test, pred) print("accuracy: %0.3f" % score) cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL']) plot_confusion_matrix(cm, classes=['FAKE', 'REAL']) # Testing from sklearn.linear_model import PassiveAggressiveClassifier linear_clf = PassiveAggressiveClassifier(n_iter=50) linear_clf.fit(tfidf_train, y_train) pred = linear_clf.predict(tfidf_test) score = metrics.accuracy_score(y_test, pred) print("accuracy: %0.3f" % score) cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL']) plot_confusion_matrix(cm, classes=['FAKE', 'REAL']) clf = MultinomialNB(alpha=0.1) last_score = 0 for alpha in np.arange(0, 1, .1): nb_classifier = MultinomialNB(alpha=alpha) nb_classifier.fit(tfidf_train, y_train) pred = nb_classifier.predict(tfidf_test) score = metrics.accuracy_score(y_test, pred) if score > last_score:
def __init__(self, cached_features):
    """Run the BaseModel initialiser and attach the classifier as ``self.model``.

    The classifier is a PassiveAggressiveClassifier using squared hinge
    loss, C=1.0 and a fixed random_state of 1.
    """
    BaseModel.__init__(self, cached_features)
    classifier = PassiveAggressiveClassifier(
        loss='squared_hinge',
        C=1.0,
        random_state=1,
    )
    self.model = classifier
def tweetread():
    """Train a traffic / non-traffic tweet classifier, report on it, persist it.

    Steps, in order:
      1. Read tweets from ``collection_aa`` (label 0) and nested items from
         ``collection_mapped`` (label 1), stripping @mentions.
      2. Fit a TF-IDF vectorizer and a PassiveAggressiveClassifier on them.
      3. Print metrics computed on the same data the model was trained on.
      4. Pickle the classifier and the vectorizer to the working directory.
      5. Classify every item of a second ``collection_mapped`` query and
         print the texts predicted as class 0.
    """
    data = []
    catagory = []
    results_traffic = collection_aa.find({"manualtype":{"$ne":"/^non*/"}})
    for item in results_traffic:
        text = unicodedata.normalize('NFKD', item["text"]).encode('ascii','ignore').decode('utf-8')
        text = re.sub(r"@([A-Za-z]+[A-Za-z]+[A-Za-z0-9-_\.]+)", "", text)
        print(text)
        data.append(str(text))
        catagory.append(0)

    results_nontraffic = collection_mapped.find({"_id":{"$regex":"2014/04/18/09*"}})
    # Cap the class-0 sample at 5000 documents.
    data = data[:5000]
    catagory = catagory[:5000]
    print(len(data), " TRAFFIC SIZE ")

    for res in results_nontraffic:
        for i in res["item"]:
            # Stop adding class-1 documents once 10000 samples are collected.
            if len(data) < 10000:
                text = unicodedata.normalize('NFKD', i["text"]).encode('ascii','ignore').decode('utf-8')
                text = re.sub(r"@([A-Za-z]+[A-Za-z0-9-_\.]+)", "", text)
                print(text)
                data.append(text)
                catagory.append(1)
    print(len(data), "SAMPLE SIZE ")

    vectorizer = TfidfVectorizer(
        analyzer='word',  # features made of words
        token_pattern=r'[a-z]{3,}',
        use_idf=True,
        strip_accents='unicode',
        sublinear_tf=True,
        max_df=0.95,
        min_df=0.05,
        stop_words='english')
    X_train = vectorizer.fit_transform(data)
    # The evaluation matrix is built from the very same documents as X_train.
    X_test = vectorizer.transform(data)
    feature_names = vectorizer.get_feature_names()
    print(feature_names)
    print(X_test)
    print(data[0])
    print(data[1])

    nb_classifier = PassiveAggressiveClassifier(n_iter=50).fit(X_train, catagory)
    y_nb_predicted = nb_classifier.predict(X_test)
    print("Dimensionality: %d" % nb_classifier.coef_.shape[0])
    show_most_informative_features(vectorizer, nb_classifier, n=50)

    # traffic_label / traffic_scores / nontraffic_label / nontraffic_scores
    # are not assigned in this function; they are read from an outer scope.
    print("traffic :" + str(traffic_label))
    print("traffic score #:" + str(traffic_scores))
    print("non :" + str(nontraffic_label))
    print("non score #:" + str(nontraffic_scores))

    # NOTE(review): the banner says Naive Bayes but the fitted model is a
    # PassiveAggressiveClassifier; the text is kept as-is.
    print("MODEL: Multinomial Naive Bayes\n")
    print('The precision for this classifier is ' + str(metrics.precision_score(catagory, y_nb_predicted)))
    print('The recall for this classifier is ' + str(metrics.recall_score(catagory, y_nb_predicted)))
    print('The f1 for this classifier is ' + str(metrics.f1_score(catagory, y_nb_predicted)))
    print('The accuracy for this classifier is ' + str(metrics.accuracy_score(catagory, y_nb_predicted)))
    print('\nHere is the classification report:')
    print(classification_report(catagory, y_nb_predicted))
    print(metrics.confusion_matrix(catagory, y_nb_predicted, labels=[0,1]))

    results_nontraffic = collection_mapped.find({"_id":{"$regex":"2014/04/*"}})
    data = data[:1000]
    catagory = catagory[:1000]
    print(len(data), " TRAFFIC SIZE ")

    # Context managers guarantee both pickle files are flushed and closed.
    with open('classifier.pickle', 'wb') as f:
        pickle.dump(nb_classifier, f)
    with open('vector.pickle', 'wb') as v:
        pickle.dump(vectorizer, v)

    for res in results_nontraffic:
        for item in res["item"]:
            text = unicodedata.normalize('NFKD', item["text"]).encode('ascii','ignore').decode('utf-8')
            X_test = vectorizer.transform([text])
            y_nb_predicted = nb_classifier.predict(X_test)
            if y_nb_predicted == 0:
                print("", text,"\\\\")
target_names=target_names)) if opts.print_cm: print("confusion matrix:") print(metrics.confusion_matrix(y_test, pred)) print() clf_descr = str(clf).split('(')[0] return clf_descr, score, train_time, test_time results = [] for clf, name in ((RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"), (Perceptron(max_iter=50, tol=1e-3), "Perceptron"), (PassiveAggressiveClassifier(max_iter=50, tol=1e-3), "Passive-Aggressive"), (KNeighborsClassifier(n_neighbors=10), "kNN"), (RandomForestClassifier(n_estimators=100), "Random forest")): print('=' * 80) print(name) results.append(benchmark(clf)) for penalty in ["l2", "l1"]: print('=' * 80) print("%s penalty" % penalty.upper()) # Train Liblinear model results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3))) # Train SGD model results.append(
class DefaultConfig(object):
    """Parameter configuration: data paths, candidate models and flags."""

    def __init__(self):
        pass

    # Number of times (count)
    k = 5

    # Project path: absolute path of this file with its last two
    # '/'-separated components removed
    project_path = '/'.join(os.path.abspath(__file__).split('/')[:-2])

    # Path of the stop-word file
    stopwords_path = f'{project_path}/data/stopwords/stopwords.txt'
    # Path of app_desc.dat
    app_desc_path = f'{project_path}/data/original/app_desc.dat'
    # Path of apptype_id_name.txt
    apptype_id_name_path = f'{project_path}/data/original/apptype_id_name.txt'
    # Path of apptype_train.dat
    apptype_train_path = f'{project_path}/data/original/apptype_train.dat'

    # Save path of apptype_train_term_doc.h5
    apptype_train_term_doc_path = f'{project_path}/data/cache/apptype_train_term_doc.h5'
    # Path of app_desc_term_doc.h5
    app_desc_term_doc_path = f'{project_path}/data/cache/app_desc_term_doc.h5'
    # app_desc_apptype: preliminary prediction for app_desc
    app_desc_apptype_path = f'{project_path}/data/cache/app_desc_apptype.h5'
    # Path of apptype_train_classification.h5
    apptype_train_classification_path = f'{project_path}/data/cache/apptype_train_classification.h5'
    # Path of app_desc_classification.h5
    app_desc_classification_path = f'{project_path}/data/cache/app_desc_classification.h5'
    # apptype_train_word_index.h5
    apptype_train_word_index_path = f'{project_path}/data/cache/apptype_train_word_index.h5'
    # app_desc_word_index.h5
    app_desc_word_index_path = f'{project_path}/data/cache/app_desc_word_index.h5'

    # Single models
    AdaBoostClassifier_model = AdaBoostClassifier()
    BaggingClassifier_model = BaggingClassifier()
    ExtraTreesClassifier_model = ExtraTreesClassifier()
    GradientBoostingClassifier_model = GradientBoostingClassifier()
    RandomForestClassifier_model = RandomForestClassifier()
    GaussianProcessClassifier_model = GaussianProcessClassifier()
    PassiveAggressiveClassifier_model = PassiveAggressiveClassifier()
    RidgeClassifier_model = RidgeClassifier(
        alpha=0.8,
        tol=0.1,
        solver="sag",
        normalize=True,
        max_iter=1000,
        random_state=2019,
    )
    SGDClassifier_model = SGDClassifier()
    KNeighborsClassifier_model = KNeighborsClassifier()
    GaussianNB_model = GaussianNB()
    MLPClassifier_model = MLPClassifier()
    DecisionTreeClassifier_model = DecisionTreeClassifier()
    ExtraTreeClassifier_model = ExtraTreeClassifier()
    SVC_model = SVC()
    LinearSVC_model = LinearSVC()
    # XGBClassifier_model = XGBClassifier()
    # LGBMClassifier_model = LGBMClassifier()
    LinearClassifierMixin_model = LinearClassifierMixin()
    RidgeClassifierCV_model = RidgeClassifierCV()
    SparseCoefMixin_model = SparseCoefMixin()

    # The selected model
    select_model = RidgeClassifier_model
    # select_model = 'lgb'
    # select_model = 'fast_text'

    # replace: whether to perform replacement
    not_replace = False
if len(line)>2: targets.append(int(line[0])) docs.append(' '.join([i for i in line[1:] if not is_stopword(i)])) count_vect = CountVectorizer(input='content',ngram_range=(1,2)) X_train_counts = count_vect.fit_transform(docs) tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts) X_train_tf = tf_transformer.transform(X_train_counts) #svd = TruncatedSVD(n_components=55, random_state=7) #X_train = svd.fit_transform(X_train_tf) #clf = KNeighborsClassifier(n_neighbors=8).fit(X_train, targets) #clf = BernoulliNB(alpha=.01) #clf = LinearSVC() clf=PassiveAggressiveClassifier(n_iter=9) clf.fit(X_train_tf, targets) def classify(content): global count_vect global tf_transformer global svd global clf X_new_counts = count_vect.transform(content) X_new_tfidf = tf_transformer.transform(X_new_counts) #X_new = svd.transform(X_new_tfidf) return clf.predict(X_new_tfidf) tc = int(raw_input()) inp = [] for tcc in range(tc):
classifiers = { 'keras_mlp': KerasClassifier( build_fn=create_mlp, nb_epoch=150, batch_size=64 ), 'svc_linear': LinearSVC(), 'lr_lbfgs': LogisticRegression( C=2.02739770e+04, # particle swarm optimised tol=6.65926091e-04, solver='lbfgs' ), 'lr_lbfgs_default': LogisticRegression(solver='lbfgs'), 'pa': PassiveAggressiveClassifier( C=0.01, fit_intercept=True, loss='hinge' ), 'pa_default': PassiveAggressiveClassifier(), 'gnb': GaussianNB(), 'lda': LinearDiscriminantAnalysis(), 'rf': RandomForestClassifier( n_estimators=200, criterion='gini', max_depth=4, min_samples_leaf=3, min_samples_split=3 ), 'xgb': XGBClassifier( n_estimators=200, max_depth=6,
# Hold out 10% of the labelled texts for evaluation.
X_train1, X_test1, y_train, y_test = train_test_split(
    train['text'], train['sentiment'], test_size=0.1)

# Learn the vocabulary from the training split only and merely apply it to
# the held-out split.  Fitting on the test split would leak test data into
# the features and restrict the vocabulary to the small held-out sample.
X_train = count_vect.fit_transform(X_train1)
X_test = count_vect.transform(X_test1)

# Individual estimators, each fitted on the training matrix.
clf = MLPClassifier(alpha=1, random_state=65)
clf.fit(X_train, y_train)

clf2 = SVC(probability = True, gamma=2, C=1)
clf2.fit(X_train, y_train)

clf3 = DecisionTreeClassifier(random_state = 0)
clf3.fit(X_train, y_train)

clf4 = PassiveAggressiveClassifier()
clf4.fit(X_train, y_train)

clf5 = BaggingClassifier(random_state=54)
clf5.fit(X_train, y_train)

clf6 = ExtraTreesClassifier(random_state=0)
clf6.fit(X_train, y_train)

clf7 = GradientBoostingClassifier(random_state=32)
clf7.fit(X_train, y_train)

# Weighted soft-voting ensemble; clf2 and clf4 are fitted above but are
# not members of the ensemble.
vc = VotingClassifier(estimators=[
    ('mlp', clf), ('dt', clf3), ('et', clf6), ('bag', clf5), ('grad', clf7)
], voting='soft', weights=[0.3, 0.1, 0.2, 0.1, 0.3])
vc.fit(X_train, y_train)
def runLearner(printStages = True, useSelector = False, discreteHelpfulness = True, useRST = True, useFew = False):
    # Train and score a Passive-Aggressive learner (Python 2 code).
    #
    # printStages         -- print progress lines in the streaming branch.
    # useSelector         -- build a chi2 SelectKBest(k=50000) selector.
    # discreteHelpfulness -- classifier + LabelEncoder when true, regressor otherwise.
    # useRST              -- compare several feature extractors on an 80/20 split;
    #                        otherwise train incrementally from getSciKitData.
    # useFew              -- keep instances labelled 'few' in the streaming branch.
    #
    # Nothing is returned; results are printed.
    # NOTE(review): nesting was reconstructed from whitespace-collapsed source -- confirm.
    learner = PassiveAggressiveClassifier() if discreteHelpfulness else PassiveAggressiveRegressor()
    #bestwords = getBestWords(instances,num=1000)
    tfidvec = TfidfVectorizer(sublinear_tf=True,stop_words='english', ngram_range=(1,3), decode_error='replace')
    selector = SelectKBest(chi2, k=50000) if useSelector else None
    encoder = LabelEncoder() if discreteHelpfulness else None
    # `labels` is not defined in this function; it comes from an outer scope.
    if discreteHelpfulness: classlabels = encoder.fit_transform(labels)
    newData = False
    count = 0
    if useRST:
        print 'Getting RST data'
        # newData is hard-coded to False above, so the non-pickled variants are used.
        nums, texts, ilabels = getPickledRSTSciKitDataLists(True) if newData else getRSTSciKitDataLists(True)
        random = RandomFeatureExtractor()
        lengthBaseline = LenFeatureExtractor()
        fullRST = FullPickledRSTFeatureExtractor(nums) if newData else FullTextRSTFeatureExtractor(nums)
        limitedRST = LimitedPickledRSTFeatureExtractor(nums) if newData else LimitedTextRSTFeatureExtractor(nums)
        vectorizer = FeatureUnion([('extra',limitedRST),('tfid',tfidvec)])
        # Every extractor is fitted on the full text list, before the split below.
        print 'Fitting random features baseline'
        random.fit(texts)
        print 'Fitting text length baseline'
        lengthBaseline.fit(texts)
        print 'Fitting full RST features'
        fullRST.fit(texts)
        print 'Fitting limited RST features'
        limitedRST.fit(texts)
        print 'Fitting limited RST with tfidvec features'
        vectorizer.fit(texts)
        print 'Fitting tfidvec features'
        tfidvec.fit(texts)
        # First 80% of the instances train, the remainder tests.
        split = int(0.8*len(ilabels))
        trainData = (texts[:split],ilabels[:split])
        testData = (texts[split:],ilabels[split:])
        X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
        learner.fit(X,y)
        # NOTE(review): this baseline is scored on trainData, whereas every
        # later baseline is scored on testData -- possibly unintended; confirm.
        X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
        print 'random features baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
        dummy = DummyClassifier()
        X,y = getAsSciKit(trainData[0],trainData[1],random,encoder,selector)
        dummy.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],random,encoder,selector)
        print 'Dummy label distribution baseline trained on %d instances has accuracy %f'%(len(trainData[0]),dummy.score(X,y))
        X,y = getAsSciKit(trainData[0],trainData[1],lengthBaseline,encoder,selector)
        learner.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],lengthBaseline,encoder,selector)
        print 'text length baseline trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
        X,y = getAsSciKit(trainData[0],trainData[1],fullRST,encoder,selector)
        learner.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],fullRST,encoder,selector)
        print 'Full RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
        X,y = getAsSciKit(trainData[0],trainData[1],limitedRST,encoder,selector)
        learner.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],limitedRST,encoder,selector)
        print 'Limited RST learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
        X,y = getAsSciKit(trainData[0],trainData[1],vectorizer,encoder,selector)
        learner.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],vectorizer,encoder,selector)
        print 'Limited RST with ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
        X,y = getAsSciKit(trainData[0],trainData[1],tfidvec,encoder,selector)
        learner = learner.fit(X,y)
        X,y = getAsSciKit(testData[0],testData[1],tfidvec,encoder,selector)
        print 'ngram learner trained on %d instances has accuracy %f'%(len(trainData[0]),learner.score(X,y))
    else:
        # Streaming branch: the first numVocab instances fit the vocabulary,
        # the next numTest become the test set, and every following batch of
        # numTrain instances is one partial_fit stage.
        vectorizer = tfidvec
        testData = None
        vocabGotten = False
        instances = ([],[])
        numVocab = 50000
        numTest = 50000
        numTrain = 100000
        maxTrainStages = 20
        for text,label in getSciKitData(stateProgress = False, discreteLabels=discreteHelpfulness):
            if label!='few' or useFew:
                instances[0].append(text)
                instances[1].append(label)
                if not vocabGotten and len(instances[0]) == numVocab:
                    if printStages: print 'Fitting vocabulary with %d instances'%numVocab
                    vectorizer.fit(instances[0],None)
                    if selector is not None:
                        # The selector is fitted on unselected features, hence None here.
                        X,y = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,None)
                        selector.fit(X,y)
                    vocabGotten = True
                    instances = ([],[])
                elif vocabGotten and testData is None and len(instances[0]) == numTest:
                    if printStages: print 'Getting test data with %d instances'%numTest
                    testData = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector)
                    instances = ([],[])
                elif vocabGotten and testData is not None and len(instances[0]) == numTrain:
                    X,y = getSciKitInstance(instances[0],instances[1],vectorizer,encoder,selector)
                    if discreteHelpfulness:
                        learner = learner.partial_fit(X,y, classes = classlabels)
                    else:
                        learner = learner.partial_fit(X,y)
                    instances = ([],[])
                    count = count + 1
                    if printStages: print 'Baseline trained on %d instances has accuracy %f'%(count*numTrain,learner.score(testData[0],testData[1]))
                elif count == maxTrainStages:
                    break
        # NOTE(review): the count printed is maxTrainStages*numTrain even if the
        # data ran out before that many stages completed -- confirm.
        print 'Final learner trained on %d instances has accuracy %f'%(maxTrainStages*numTrain,learner.score(testData[0],testData[1]))
# Load the labelled tweets; the file's own header row is skipped and the
# column names are supplied explicitly.
tweets = pd.read_csv(
    'train_data_flag.csv',
    header=None,
    skiprows=1,
    names=["name","id","description","friends","followers","location","tweet","flag"],
)

# Replace the numeric flag by a readable class name.
tweets.loc[tweets['flag'] == 1, ['flag']] = 'DEPRESSED'
tweets.loc[tweets['flag'] == 0, ['flag']] = 'NORMAL'

features = tweets['tweet']
labels = tweets['flag']

# 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42)

# TF-IDF features: the vocabulary is learned on the training texts only.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

# Fit the Passive-Aggressive classifier and measure held-out accuracy.
pa_classifier = PassiveAggressiveClassifier(max_iter=50)
pa_classifier.fit(tfidf_train, y_train)
y_pred = pa_classifier.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

# Save the fitted vectorizer and model.
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(pa_classifier, "pa_classifier.pkl")
### Training
batch_size = 1000
n_pairs = 516000
train_set_size = 510000
n_batches = int(train_set_size / batch_size)
classes = [0, 1]

# Linear models trained incrementally with partial_fit.
classifiers = [
    ("clfP", Perceptron(penalty = None)),
    ("clfPL1", Perceptron(penalty = 'l1')),
    ("clfSGDCL1", SGDClassifier(learning_rate = 'constant', penalty = 'l1', loss = 'hinge', eta0 = 1)),
    ("clfSGDO", SGDClassifier(learning_rate = 'optimal', penalty = 'l2', loss = 'hinge')), # SGD()
    ("clfSGDOL1", SGDClassifier(learning_rate = 'optimal', penalty = 'l1', loss = 'hinge')),
    ("clfSGDOLOG", SGDClassifier(learning_rate = 'optimal', penalty = 'l2', loss = 'log', eta0 = 1)),
    ("clfPAH", PassiveAggressiveClassifier(loss = 'hinge')),
    ("clfPASH", PassiveAggressiveClassifier(loss = 'squared_hinge'))
    # Removed:
    #("clfPL2", Perceptron(penalty = 'l2')),
    #("clfSGDC", SGDClassifier(learning_rate = 'constant', penalty = 'l2', loss = 'hinge', eta0 = 1)),
    #("clfSGDCLOG", SGDClassifier(learning_rate = 'constant', penalty = 'l2', loss = 'log', eta0 = 1)),
]

log('n_pairs: ' + str(n_pairs) + ' train_set_size: ' + str(train_set_size) + ' n_batches: ' + str(n_batches) + ' batch_size: ' + str(batch_size))

for j in range(n_batches):
    start_batch = time.time()
    # Raw strings keep the backslashes literal without relying on invalid
    # escape sequences; `with` closes each batch file as soon as it is read.
    with open(r'.\Batches\X.' + str(j), 'rb') as x_file:
        X = pickle.load(x_file)
    with open(r'.\Batches\y.' + str(j), 'rb') as y_file:
        y = pickle.load(y_file)
    # Every classifier sees the same batch; `classes` lists all labels so
    # that the first partial_fit call knows them.
    for name, clf in classifiers:
        clf.partial_fit(X, y, classes)
def main():
    """Train several classifiers on the training set and score them on the test set.

    The test features are first multiplied by per-feature weights read from
    ``./data/Affy2Weights.csv``.  Both sets are standardised per sample and
    then per feature, a 2-component PCA scatter plot is saved to
    ``<name>.png``, and each classifier in the list is fitted ten times.
    Per-repetition reports are written to ``<name>_<k>.txt`` and
    ``<name>_<k>_.txt``; the collected predictions and test accuracies are
    passed to ``saveMatrix``.
    """
    X_training, X_test, y_training, y_test, featureNames, classValue, name = loadTrainingAndTest(
    )

    print("Pre-processing data...")
    dfFeaturesTest = read_csv("./data/Affy2Weights.csv", header=None)
    values = dfFeaturesTest.values[:, 0]
    print(values)
    # Scale every test column by its weight (one column at a time).
    for i in range(X_test.shape[1]):
        X_test[:, i] = X_test[:, i] * values[i]

    # Normalize the data by sample; train and test use separate scalers,
    # each fitted on its own transposed matrix.
    scaler_sample = StandardScaler()
    scaler_sample2 = StandardScaler()
    X_training = scaler_sample.fit_transform(X_training.T).T
    X_test = scaler_sample2.fit_transform(X_test.T).T

    # Also normalize by feature; this scaler is fitted on the training set
    # and reused for the test set.
    scaler = StandardScaler()
    X_training = scaler.fit_transform(X_training)
    X_test = scaler.transform(X_test)

    # Plot a PCA projection of both sets.
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    pca.fit(X_training)
    X_pca_train = pca.transform(X_training)
    X_pca_test = pca.transform(X_test)

    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Boolean mask of the training samples belonging to classValue.
    class3 = [(y == classValue) for y in y_training]
    ax.plot(X_pca_train[:, 0], X_pca_train[:, 1], 'b.', label="tcga data")
    ax.plot(X_pca_train[class3, 0],
            X_pca_train[class3, 1],
            color='orange',
            marker='.',
            linestyle='None',
            label=("tcga data, class " + str(classValue)))
    ax.plot(X_pca_test[:, 0], X_pca_test[:, 1], 'r.', label=name + " data")
    ax.legend(loc='best')
    ax.set_title("TCGA vs " + name + " data")
    ax.set_xlabel("PCA 0")
    ax.set_ylabel("PCA 1")
    plt.savefig(name + ".png")

    # results: one row per (classifier, test sample), one column per repetition.
    # results2: one row per classifier, one column per repetition.
    results = np.zeros(shape=(X_test.shape[0] * 9, 10))
    results2 = np.zeros(shape=(9, 10))
    j = 0
    for k in range(0, 10):
        classifierList = [
            #[RandomForestClassifier(), "RandomForestClassifier()"],
            [LogisticRegression(), "LogisticRegression"],  # coef_
            [PassiveAggressiveClassifier(), "PassiveAggressiveClassifier"],  # coef_
            [SGDClassifier(), "SGDClassifier"],  # coef_
            # coef_, but only if the kernel is linear...the default is 'rbf',
            # which is NOT linear
            [SVC(kernel='linear'), "SVC(linear)"],
            [RidgeClassifier(), "RidgeClassifier"],  # coef_
            [BaggingClassifier(n_estimators=300),
             "BaggingClassifier(n_estimators=300)"],
            [GradientBoostingClassifier(n_estimators=300),
             "GradientBoostingClassifier(n_estimators=300)"],
            [RandomForestClassifier(n_estimators=300),
             "RandomForestClassifier(n_estimators=300)"],
        ]

        # Context managers close both report files even if a classifier raises.
        with open(name + "_" + str(k) + "_.txt", 'w') as f2, \
                open(name + "_" + str(k) + ".txt", 'w') as f:
            for l, (originalClassifier, classifierName) in enumerate(classifierList):
                f.write("\nClassifier " + classifierName + "\n")
                f2.write("\nClassifier " + classifierName + "\n")
                print("\nClassifier " + classifierName)

                # Work on a copy so the list entry itself stays unfitted.
                classifier = copy.deepcopy(originalClassifier)
                classifier.fit(X_training, y_training)
                y_train_pred = classifier.predict(X_training)
                y_test_pred = classifier.predict(X_test)
                scoreTraining = accuracy_score(y_train_pred, y_training)
                scoreTest = accuracy_score(y_test_pred, y_test)

                f.write("Training accuracy: %.4f; Test accuracy: %.4f \n" %
                        (scoreTraining, scoreTest))
                f2.write("Training accuracy: %.4f; Test accuracy: %.4f \n" %
                         (scoreTraining, scoreTest))
                print("Training accuracy: %.4f; Test accuracy: %.4f" %
                      (scoreTraining, scoreTest))

                # Only f receives the per-sample predictions.
                f.write("Complete classification on test: \n")
                for i in range(0, y_test.shape[0]):
                    f.write("%d \n" % (y_test_pred[i]))
                    results[j, k] = y_test_pred[i]
                    j = j + 1
                results2[l, k] = scoreTest
        # Restart the row counter for the next repetition's column.
        j = 0

    saveMatrix(name + "_results.csv", results)
    saveMatrix(name + "results2.csv", results2)
    return
# "there there there"] #y = [1,1,1,1] trans = vectorizer.fit_transform(x) #print vectorizer.transform(["I am in a tree tree"]).toarray() #print vectorizer.get_feature_names() #print trans.toarray() #print sorted(vectorizer.vocabulary_) print len(vectorizer.vocabulary_) K = 1 from sklearn import neighbors #clf = neighbors.KNeighborsClassifier(K,weights = 'distance', leaf_size= 30) from sklearn.linear_model import PassiveAggressiveClassifier clf = PassiveAggressiveClassifier(n_iter=50) clf.fit(trans, y) #f = open("testDatatextClassification.txt",'r') f = open("input01.txt",'r') f2 = open("output01.txt","r") d = f.readlines() d = d[1:] ans = map(int,f2.readlines()) t0= time.clock() summing = 0; for j,i in enumerate(d): sol = int(clf.predict(vectorizer.transform([i]).toarray())[0]) #print sol, ans[j] if (sol==ans[j]):
default='squared_hinge', help='loss function for model') parser.add_argument('-v', '--validate', action='store_true', help='do validation') parser.add_argument( '--avg', type=int, help= 'average this # of models together, trained on different data order') start = datetime.now() args = parser.parse_args() print args model = PassiveAggressiveClassifier(C=args.C, loss=args.loss, warm_start=True) print 'Using model:' print model if args.validate: if args.avg is None: validate(model, args.infile, args.passes, args.bits) else: avg_validate(args.avg, model, args.infile, args.passes, args.bits) else: if args.avg is None: run_all(model, args.infile, args.passes, args.bits, args.submit_id) else: avg_run_all(args.avg, model, args.infile, args.passes, args.bits, args.submit_id) finish = datetime.now()
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Hold out 20% of the samples as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state = 0,
)

# TF-IDF vectorizer: English stop words removed, terms occurring in more
# than 70% of the documents ignored.
TDfVector = TfidfVectorizer(stop_words = 'english', max_df = 0.7)

# Learn the vocabulary on the training texts, then apply it to the test texts.
TDfVector_train = TDfVector.fit_transform(X_train)
TDfVector_test = TDfVector.transform(X_test)

# Passive-Aggressive classifier fitted on the training matrix.
PAClassifier = PassiveAggressiveClassifier(max_iter=50)
PAClassifier.fit(TDfVector_train,y_train)

# Predictions for the held-out texts.
y_pred = PAClassifier.predict(TDfVector_test)

# Accuracy expressed as a percentage.
Score = accuracy_score(y_test,y_pred)
final = Score*100
print(f'Accuracy Score : {round(final)}%')

# Confusion matrix of true versus predicted labels.
cm = confusion_matrix(y_test,y_pred)
def model_selection(X, y, X_pred, donation_columns, cat_col, no_donation_columns, skewed_target_value):
    """Fit a fixed list of classifiers, write report text to `pdf`, return the results.

    X and y are split 80/20 (random_state=0). Each model is fitted on the
    training part, scored on the test part, and used to predict X_pred.
    The function also writes the data-split summary, the donation and
    categorical column lists and a glossary to the module-level `pdf` object.

    Returns a 9-tuple of dicts keyed by model label: weighted F1 score,
    predictions for X_pred, predicted probabilities for X_pred, feature
    values, ROC fpr, ROC tpr, ROC auc, y_test and y_pred.

    NOTE(review): nesting was reconstructed from whitespace-collapsed
    source; the placement of some trailing `pdf.ln` calls should be confirmed.
    """
    models = [
        { 'label': 'LogisticRegression', 'model': LogisticRegression() },
        { 'label': 'RidgeClassifier', 'model': RidgeClassifier() }, # No predict_proba
        { 'label': 'MultinomialNB', 'model': MultinomialNB() },
        { 'label': 'ComplementNB', 'model': ComplementNB() },
        { 'label': 'BernoulliNB', 'model': BernoulliNB() },
        { 'label': 'DecisionTreeClassifier', 'model': DecisionTreeClassifier() },
        { 'label': 'SGDClassifier', 'model': SGDClassifier(loss='log') },
        { 'label': 'PassiveAggressiveClassifier', 'model': PassiveAggressiveClassifier() }, # No predict_proba
        { 'label': 'LinearSVC', 'model': LinearSVC() }, # No predict_proba
        { 'label': 'RandomForestClassifier', 'model': RandomForestClassifier() }
    ]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    # The split sizes are reported only when donation columns exist and the
    # target is not flagged as skewed.
    if (not no_donation_columns) & (not skewed_target_value):
        pdf.ln(1)
        pdf.multi_cell( h=5.0, w=0, txt=" a. 80% ({}) of data used for training the model" .format(convert_number_format(X_train.shape[0])))
        pdf.ln(0.5)
        pdf.multi_cell( h=5.0, w=0, txt=" b. 20% ({}) of data used for testing the model". format(convert_number_format(X_test.shape[0])))
        pdf.ln(0.5)
    # Letters 'a'..'z' used as list bullets.
    test_list = [chr(x) for x in range(ord('a'), ord('z') + 1)]
    if no_donation_columns:
        pdf.multi_cell( h=5.0, w=0, txt= " 2. Donation Columns: The uploaded donor file is missing donation " "information (amount) to show donation column(s).")
        pdf.ln(0.5)
    elif skewed_target_value:
        pdf.multi_cell( h=5.0, w=0, txt= " 2. Donation Columns: The uploaded donor file has an imbalanced dataset to" " show donation column(s). More than")
        pdf.ln(0.25)
        pdf.multi_cell( h=5.0, w=0, txt= " 98% of your sample belongs to one class (0 or 1 Target Value) that make up a " "large proportion of the data.")
        pdf.ln(0.5)
    else:
        pdf.multi_cell(h=5.0, w=0, txt=" 2. Donation Columns:")
        pdf.ln(0.5)
        # NOTE(review): unlike cat_col below, donation_columns is not capped
        # at len(test_list); more than 26 entries would raise IndexError.
        for i in range(len(donation_columns)):
            pdf.multi_cell(h=5.0, w=0, txt=" {}. {}".format( test_list[i], donation_columns[i]))
            pdf.ln(0.3)
        pdf.ln(0.5)
    # Only as many categorical columns as there are bullet letters are listed;
    # a random sample is taken when there are more.
    if len(cat_col) > len(test_list):
        cat_col = random.sample(cat_col, len(test_list))
    if len(cat_col) != 0:
        pdf.multi_cell(h=5.0, w=0, txt=" 3. Categorical Columns:")
        pdf.ln(0.5)
        for i in range(len(cat_col)):
            pdf.multi_cell(h=5.0, w=0, txt=" {}. {}".format( test_list[i], cat_col[i]))
            pdf.ln(0.3)
    else:
        pdf.multi_cell( h=5.0, w=0, txt= " 3. Categorical Columns: No categorical columns identified on the uploaded " "donor file.")
    pdf.ln(0.5)
    print_steps_taken()
    # Glossary section.
    pdf.set_font(font_style, 'BU', size=10)
    pdf.multi_cell(h=7.5, w=0, txt="C. Important Terms Used in Predictive Modeling")
    pdf.set_font(font_style, size=10)
    pdf.ln(1)
    pdf.multi_cell( h=5.0, w=0, txt=" 1. F1-score: It is a harmonic mean of precision and recall.")
    pdf.ln(0.5)
    pdf.multi_cell( h=5.0, w=0, txt= " 2. Precision: It is a fraction of correctly classified instances among all " "predicted instances.")
    pdf.ln(0.5)
    pdf.multi_cell( h=5.0, w=0, txt= " 3. Recall: It is a fraction of correctly classified instances among all " "actual/valid instances.")
    pdf.ln(0.5)
    pdf.multi_cell( h=5.0, w=0, txt=" 4. Support: Number of samples used for the experiment.")
    pdf.ln(0.5)
    pdf.multi_cell( h=5.0, w=0, txt= " 5. Confusion Matrix Plot: It is a plot of the true count (x-axis) versus " "predicted count (y-axis) for both the classes")
    pdf.ln(0.25)
    pdf.multi_cell( h=5.0, w=0, txt= " (donor and non-donor). The top left box represents the count of true " "negatives, the top right box represents the")
    pdf.ln(0.25)
    pdf.multi_cell( h=5.0, w=0, txt= " count of false negatives, bottom left box represents the count of false " "positives and bottom right box represents")
    pdf.ln(0.25)
    pdf.multi_cell(h=5.0, w=0, txt=" the count of true positives.")
    pdf.ln(0.5)
    pdf.multi_cell( h=5.0, w=0, txt= " 6. Feature Importance Plot: Y-axis: feature present in input file and " "X-axis: relative % of feature importance.")
    pdf.ln(0.5)
    pdf.multi_cell( h=5.0, w=0, txt= " 7. Correlation Plot: Correlation explains how one or more variables are " "related to each other.")
    pdf.ln(0.5)
    pdf.multi_cell( h=5.0, w=0, txt= " 8. Probability Score: It is a probabilty (likelihood) of an individual to " "donate.")
    pdf.ln(0.5)
    pdf.multi_cell( h=5.0, w=0, txt= " 9. Threshold Value: It is the threshold (cut-off) value used on a probability " "score to seperate a donor from a")
    pdf.ln(0.25)
    pdf.multi_cell(h=5.0, w=0, txt=" non-donor.")
    pdf.ln(0.5)
    pdf.multi_cell( h=5.0, w=0, txt= " 10. Predicted Classification (0 and 1): Classification value 1 indicates an " "individual likely to donate and classification")
    pdf.ln(0.25)
    pdf.multi_cell( h=5.0, w=0, txt= " value 0 indicates an individual less likely to donate. They follow the " "threshold (cut-off) value logic.")
    pdf.ln(0.5)
    pdf.ln(3)
    plt.figure(figsize=(15, 10))
    # Result containers, all keyed by the model label.
    model_f1_score = {}
    classification_full_pred = {}
    classification_full_pred_prob = {}
    feature_importance_dict = {}
    roc_fpr = {}
    roc_tpr = {}
    roc_auc = {}
    y_test_dict = {}
    y_pred_dict = {}
    for ind, m in enumerate(models):
        start_time = time.time()
        model = m['model']
        # These three estimators are marked "No predict_proba" above, so they
        # are wrapped in CalibratedClassifierCV before predict_proba is called.
        if m['label'] in [ 'PassiveAggressiveClassifier', 'LinearSVC', 'RidgeClassifier' ]:
            model = CalibratedClassifierCV(model)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        classification_full_pred[m['label']] = model.predict(X_pred)
        classification_full_pred_prob[m['label']] = model.predict_proba(X_pred)
        print("Classifier: {} and time(seconds): {}".format( m['label'], round(time.time() - start_time, 3)))
        print()
        model_f1_score[m['label']] = round( f1_score(y_test, y_pred, average='weighted'), 2)
        y_test_dict[m['label']] = y_test
        y_pred_dict[m['label']] = y_pred
        # Feature values: the last entry is dropped in every branch.
        if m['label'] in ['DecisionTreeClassifier', 'RandomForestClassifier']:
            feature_value = model.feature_importances_[:-1]
        elif m['label'] in [ 'PassiveAggressiveClassifier', 'LinearSVC', 'RidgeClassifier' ]:
            # The raw estimator is refitted here so that coef_ can be read.
            # NOTE(review): `model` now refers to the uncalibrated estimator,
            # which is also what calculate_fpr_tpr receives below -- confirm
            # this is intended.
            model = m['model']
            model.fit(X_train, y_train)
            feature_value = model.coef_[0][:-1]
        elif m['label'] in ['GaussianNB']:
            # No entry in `models` carries this label.
            continue
        else:
            feature_value = model.coef_[0][:-1]
        feature_importance_dict[m['label']] = feature_value
        fpr, tpr, auc = calculate_fpr_tpr(model, y_test, y_pred, X_test)
        roc_fpr[m['label']] = fpr
        roc_tpr[m['label']] = tpr
        roc_auc[m['label']] = auc
    return model_f1_score, classification_full_pred, classification_full_pred_prob, feature_importance_dict, roc_fpr, \
        roc_tpr, roc_auc, y_test_dict, y_pred_dict
f = list(all_features - {'<BIAS>'})[0] flt_res = get_res(x, feature_filter=lambda name, _: name != f) flt_features = get_all(flt_res.targets[0].feature_weights) assert flt_features == (all_features - {f}) return True return False @pytest.mark.parametrize(['clf'], [ [LogisticRegression(random_state=42)], [LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs')], [LogisticRegression(random_state=42, fit_intercept=False)], [LogisticRegressionCV(random_state=42)], [SGDClassifier(**SGD_KWARGS)], [SGDClassifier(loss='log', **SGD_KWARGS)], [PassiveAggressiveClassifier(random_state=42)], [Perceptron(random_state=42)], [RidgeClassifier(random_state=42)], [RidgeClassifierCV()], [LinearSVC(random_state=42)], [OneVsRestClassifier(LogisticRegression(random_state=42))], ]) def test_explain_linear(newsgroups_train, clf): assert_multiclass_linear_classifier_explained(newsgroups_train, clf, explain_prediction) if isinstance(clf, OneVsRestClassifier): assert_multiclass_linear_classifier_explained( newsgroups_train, clf, explain_prediction_sklearn) @pytest.mark.parametrize(['clf'], [
def get_base_model():
    """Return the base estimators as a dict keyed by model name.

    The dict holds one entry, 'passive_aggressive_classifier', mapped to a
    PassiveAggressiveClassifier with max_iter=1000 and tol=1e-3.
    """
    base_models = {}
    base_models['passive_aggressive_classifier'] = PassiveAggressiveClassifier(
        max_iter=1000,
        tol=1e-3,
    )
    return base_models
def create_models(headlines):
    """Clean the headlines, fit the selected classifiers and plot their scores.

    The frame is modified in place: rows labelled ``-1`` are relabelled ``0``
    and the ``headline`` column is replaced by its lower-cased, tokenised,
    stop-word-filtered, lemmatised text (tokens re-joined with spaces).

    98% of the rows are used for training. The remaining 2% is split in half
    into a validation part (used for scoring here) and a test part (held out,
    unused in this function).

    Args:
        headlines: DataFrame with a string column ``headline`` and a column
            ``label``. Rows labelled 0 / 1 are reported as negative /
            positive.

    Returns:
        dict mapping each evaluated classifier's display name to the value
        ``train_test_and_evaluate`` returned for it.
    """
    # Merge the -1 class into 0 before anything reads the labels.
    headlines.loc[headlines['label'] == -1, 'label'] = 0

    print("Headlines", headlines['headline'].shape)
    print("Labels", headlines['label'].shape)
    neg = (headlines['label'] == 0).sum()
    pos = (headlines['label'] == 1).sum()
    print("Neg", neg)
    print("Pos", pos)
    print("Class difference: ", abs(pos - neg))
    print(headlines.head())

    # --- Text pre-processing -------------------------------------------
    @lru_cache(maxsize=50000)
    def tokenization(text):
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        # The cached list is never mutated; later stages build new lists.
        return re.split(r'\W+', text)

    headlines['headline'] = headlines['headline'].apply(
        lambda text: tokenization(text.lower()))
    print(headlines.head())

    # A set gives O(1) membership tests; the nltk stop-word list would be
    # scanned linearly for every token.
    stopword = set(nltk.corpus.stopwords.words('english'))

    def remove_stopwords(tokens):
        return [word for word in tokens if word not in stopword]

    headlines['headline'] = headlines['headline'].apply(remove_stopwords)
    print(headlines.head())

    # Spell correction (ekphrasis SpellCorrector) is deliberately left out:
    # per the original author's note it needs too much RAM and time, so the
    # corrector is no longer imported or instantiated here.

    lm = nltk.WordNetLemmatizer()

    def lemmatizer(tokens):
        return [lm.lemmatize(word) for word in tokens]

    print("Lemmatizer")
    headlines['headline'] = headlines['headline'].apply(lemmatizer)
    print(headlines.head(10))

    headlines['headline'] = headlines['headline'].str.join(" ")
    print(headlines.head())

    # --- Train / validation / test split -------------------------------
    # Read the columns only now. Series captured before the relabelling and
    # cleaning above are not guaranteed to reflect those changes (column
    # assignment replaces the underlying data), which would silently train
    # on the raw text.
    headline = headlines['headline']
    label = headlines['label']
    headline_train, headline_test, label_train, label_test = train_test_split(
        headline, label, test_size=.02)
    # Second half of the hold-out is reserved as a test set; unused here.
    x_validation, _x_test, y_validation, _y_test = train_test_split(
        headline_test, label_test, test_size=.5)
    print(headline_train.shape)
    print(headline_test.shape)

    # --- Candidate classifiers -----------------------------------------
    # Notes carried over from earlier experiments recorded in this function:
    #   * SMOTE balancing was tried and did not improve accuracy.
    #   * Choosing K via k-fold cross-validation gave worse accuracy than
    #     the plain split.
    #   * Grid searches (LogisticRegression; Ridge alpha 1e-4 .. 1) were
    #     tried and found very time-consuming.
    candidates = {
        "Logistic Regression": LogisticRegression(
            C=1.0, class_weight="balanced", solver="liblinear",
            multi_class="ovr", verbose=100, random_state=42),
        "Complement Naive Bayes": ComplementNB(),
        "Linear SVC": LinearSVC(C=0.1, verbose=100, random_state=42),
        "Passive Aggressive": PassiveAggressiveClassifier(),
        "Multinomial Bayes": MultinomialNB(alpha=10),
        "Naive Bayes": BernoulliNB(),
        "Ridge Classifier": RidgeClassifier(solver='lsqr', random_state=42),
        "Random Forest": RandomForestClassifier(
            max_depth=30, n_estimators=4000, verbose=100, n_jobs=2),
        "SVC": SVC(gamma=0.5, C=100, kernel="linear", verbose=100),
    }
    # Only the classifiers named here are actually fitted and scored.
    selected = ["Ridge Classifier"]

    results = {}
    for name in selected:
        algo = candidates[name]
        # A fresh vectorizer per pipeline keeps fitted vocabularies separate.
        ug_pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(ngram_range=(1, 3))),
            ('classifier', algo),
        ])
        print("Classifier : ", algo)
        results[name] = train_test_and_evaluate(
            ug_pipeline, headline_train, label_train,
            x_validation, y_validation)

    # --- Report --------------------------------------------------------
    dframe = pd.DataFrame.from_dict(results, orient="index").reset_index()
    dframe.columns = ["classifier", "prediction"]
    # sort_values returns a new frame; keep it so bars are drawn best-first.
    dframe = dframe.sort_values(by=["prediction"], ascending=False)
    print(results)

    sns.barplot(x='classifier', y='prediction', data=dframe)
    plt.title("TFidf Vectorizer, n-gram=3")
    fig = plt.gcf()
    fig.set_size_inches(20, 10)
    plt.show()

    return results
# Confusion matrix for the predictions (`pred`) produced earlier in the script.
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

# Multinomial naive Bayes on the count features.
clf = MultinomialNB()
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

# Testing
from sklearn.linear_model import PassiveAggressiveClassifier

# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21.
# `max_iter=50` with `tol=None` keeps the original meaning: exactly 50
# passes over the training data, with no early stopping.
linear_clf = PassiveAggressiveClassifier(max_iter=50, tol=None)
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

clf = MultinomialNB(alpha=0.1)
last_score = 0
# Sweep the smoothing strength over 0.0, 0.1, ..., 0.9.
# NOTE(review): the grid starts at alpha=0 (no smoothing) — confirm that is
# intended.
for alpha in np.arange(0, 1, .1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
test_ratio = .2 val_ratio = .15 batch_size = 256 validate_every = 50 print_every = batch_size * 2 loss = ['hinge', 'log'] alpha = [.000001, .00001, .0001, .001] l1_ratio = [0., .1, .2, .3, .4, .5, .6, .7, .8, .9] partial_fit_classifiers = { 'SGD-SVM': SGDClassifier(random_state=random_state, loss='hinge'), 'SGD-Log': SGDClassifier(random_state=random_state, loss='log'), 'Perceptron': Perceptron(random_state=random_state), 'NB Multinomial': MultinomialNB(alpha=0.01), 'Passive-Aggressive': PassiveAggressiveClassifier(random_state=random_state) } def get_batchnames(split_val=True): """ shuffle train and test set then split train set further to train set and validation set :return: pickle filenames of train/validation/test batches """ np.random.seed(random_state) test_batch_names = glob.glob(os.path.join('dataset/batches/test_batches', '*.pickle')) test_batch_names = np.random.choice(test_batch_names, len(test_batch_names), replace=False) train_batch_names = glob.glob(os.path.join('dataset/batches/train_batches', '*.pickle')) train_batch_names = np.random.choice(train_batch_names, len(train_batch_names), replace=False) if split_val: