class Pac_RF():
    """Two-stage classifier: passive-aggressive scores feed a random forest.

    The passive-aggressive classifier is fitted on the input features; its
    decision-function scores are then used as the feature matrix on which
    the random forest is trained and queried.
    """

    def __init__(self):
        self.pac = PassiveAggressiveClassifier(tol=0.001)
        self.forest = RandomForestClassifier()
        # Filled from the forest after fitting (see find_class_order).
        self.classes_ = []

    def set_params(self, params):
        """Route dotted hyper-parameters to the matching sub-model.

        Parameters
        ----------
        params : dict
            Maps ``'<model>.<name>'`` keys to values. Keys prefixed with
            ``'pac'`` go to the passive-aggressive classifier; any other
            prefix is applied to the random forest.
        """
        for param in params:
            model, param_ = param.split('.')
            if model == 'pac':
                self.pac.set_params(**{param_: params[param]})
            else:
                self.forest.set_params(**{param_: params[param]})

    def _decision_matrix(self, X):
        """Return the passive-aggressive scores for ``X`` as a 2-D matrix."""
        des_matrix = self.pac.decision_function(X)
        # Binary problems produce a 1-D score vector, but the forest expects
        # a (n_samples, n_features) matrix, so promote it to one column.
        if des_matrix.ndim == 1:
            des_matrix = des_matrix.reshape(-1, 1)
        return des_matrix

    def fit(self, X, y):
        """Fit the passive-aggressive model, then the forest on its scores."""
        self.pac.fit(X, y)
        self.forest.fit(self._decision_matrix(X), y)
        self.find_class_order()

    def predict_proba(self, X):
        """Return the forest's class probabilities for the scores of ``X``."""
        return self.forest.predict_proba(self._decision_matrix(X))

    def find_class_order(self):
        """Copy the forest's class ordering onto ``self.classes_``."""
        self.classes_ = self.forest.classes_
def test_main(self):
    """Smoke-test training a passive-aggressive model and scoring one document.

    The document to score is tf-idf transformed once and the same matrix is
    used for both ``predict`` and ``decision_function``, so both calls see
    the feature space the classifier was trained on.
    """
    categories, documents = get_docs_categories()
    clean_function = lambda text: '' if text.startswith('[') else text
    entity_types = {'GPE'}
    term_doc_mat = TermDocMatrixFactory(
        category_text_iter=zip(categories, documents),
        clean_function=clean_function,
        nlp=_testing_nlp,
        feats_from_spacy_doc=FeatsFromSpacyDoc(
            entity_types_to_censor=entity_types)).build()
    clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
    fdc = FeatsFromDoc(
        term_doc_mat._term_idx_store,
        clean_function=clean_function,
        feats_from_spacy_doc=FeatsFromSpacyDoc(
            entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)
    # Apply the fitted tf-idf weighting before scoring; the classifier never
    # saw un-normalised term counts during training.
    X_to_predict = tfidf.transform(
        fdc.feats_from_doc('Did sometimes march UNKNOWNWORD'))
    pred = clf.predict(X_to_predict)
    dec = clf.decision_function(X_to_predict)
class DeployedClassifierFactory:
    def __init__(self, term_doc_matrix, term_doc_matrix_factory, category, nlp=None):
        '''This is a class that enables one to train and save a classification model.

        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
        term_doc_matrix_factory : TermDocMatrixFactory
            Must have neither an nlp object nor a category/text iterator
            attached (both are asserted to be None).
        category : str
            Category name
        nlp : spacy.en.English
            Accepted for interface compatibility; not used by this class.
        '''
        self._term_doc_matrix = term_doc_matrix
        self._term_doc_matrix_factory = term_doc_matrix_factory
        assert term_doc_matrix_factory._nlp is None
        assert term_doc_matrix_factory.category_text_iter is None
        self._category = category
        self._clf = None
        self._proba = None

    @staticmethod
    def _make_proba_function(pos_ecdf, neg_ecdf):
        '''Build a callable mapping a hyperplane distance to a score in [0, 1].

        Parameters
        ----------
        pos_ecdf : callable
            Empirical CDF of the non-negative training distances.
        neg_ecdf : callable
            Empirical CDF of the non-positive training distances.

        Returns
        -------
        callable
            Positive distances map to [0.5, 1], negative distances to
            [0, 0.5], and a distance of exactly zero to 0.5.
        '''
        def proba_function(distance_from_hyperplane):
            if distance_from_hyperplane > 0:
                return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
            elif distance_from_hyperplane < 0:
                # Use the negative-side ECDF here: the positive-side ECDF is
                # 0 for every negative distance, which would collapse all
                # negative predictions onto a score of 0.
                return neg_ecdf(distance_from_hyperplane) / 2.
            return 0.5

        return proba_function

    def passive_aggressive_train(self):
        '''Trains passive aggressive classifier

        Fits the classifier on the term-document matrix and derives a
        distance-to-score function from the empirical distribution of the
        training decision values.

        Returns
        -------
        DeployedClassifierFactory
            self, to allow chaining.
        '''
        self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
        self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
        y_dist = self._clf.decision_function(self._term_doc_matrix._X)
        pos_ecdf = ECDF(y_dist[y_dist >= 0])
        neg_ecdf = ECDF(y_dist[y_dist <= 0])
        self._proba = self._make_proba_function(pos_ecdf, neg_ecdf)
        return self

    def build(self):
        '''Builds Deployed Classifier

        Raises
        ------
        NeedToTrainExceptionBeforeDeployingException
            If no classifier has been trained yet.
        '''
        if self._clf is None:
            raise NeedToTrainExceptionBeforeDeployingException()
        return DeployedClassifier(self._category,
                                  self._term_doc_matrix._category_idx_store,
                                  self._term_doc_matrix._term_idx_store,
                                  self._term_doc_matrix_factory)
class PAC(BaseClassifier):
    """Passive-aggressive classifier layered on ``BaseClassifier``.

    Every constructor argument is forwarded unchanged to ``BaseClassifier``;
    this subclass only supplies the estimator and the train/predict steps.
    """

    def __init__(self, TRAINVALTEST_DENSE_X, TRAINVALTEST_DENSE_X_NAMES,
                 TRAINVAL_SPARSE_X, TRAINVAL_SPARSE_X_NAMES,
                 TEST_SPARSE_X, TEST_SPARSE_X_NAMES,
                 UF_VW, ADF, TRAINVAL, UF_CSV, TRAINVAL_MERGE,
                 TEST_MERGE, TEST, name='pac', USE_TINY=False, RANDOMSTATE=2018):
        super(PAC, self).__init__(
            TRAINVALTEST_DENSE_X, TRAINVALTEST_DENSE_X_NAMES,
            TRAINVAL_SPARSE_X, TRAINVAL_SPARSE_X_NAMES,
            TEST_SPARSE_X, TEST_SPARSE_X_NAMES,
            UF_VW, ADF, TRAINVAL, UF_CSV, TRAINVAL_MERGE,
            TEST_MERGE, TEST, name, USE_TINY, RANDOMSTATE)
        # NOTE(review): `n_iter` is the older scikit-learn spelling of
        # `max_iter` -- confirm against the pinned scikit-learn version.
        self.clf = PassiveAggressiveClassifier(n_iter=50, tol=1e-3)

    def trainWithEva(self, trainval_x):
        """Fit on 90% of the data and report ROC AUC on the held-out 10%.

        Parameters
        ----------
        trainval_x : feature matrix aligned row-wise with
            ``self.trainval['label']``.

        Returns
        -------
        The ROC AUC of the decision-function scores on the validation split.
        """
        train_x, valid_x, train_y, valid_y = train_test_split(
            trainval_x, self.trainval['label'],
            test_size=0.1, random_state=self.randomstate)
        self.clf.fit(train_x, train_y)
        pred = self.clf.decision_function(valid_x)
        score = metrics.roc_auc_score(valid_y, pred)
        # The reported metric is ROC AUC, not accuracy.
        print("%s on valid set AUC: %0.5f" % (self.name, score))
        return score

    def predict(self, test_x=None, model_path=None):
        """Score the test set and return it with an added ``score`` column.

        Parameters
        ----------
        test_x : optional feature matrix; when omitted it is taken from the
            second value returned by ``self.feature_engineering()``.
        model_path : optional path handed to ``self.load_model`` before
            scoring.

        Returns
        -------
        The frame read from ``self.ds.TEST`` with a ``score`` column holding
        the decision-function values rounded to six decimals.
        """
        if model_path is not None:
            self.load_model(model_path)
        if test_x is None:
            _, test_x = self.feature_engineering()
        pre = pd.read_csv(self.ds.TEST)
        pre['score'] = self.clf.decision_function(test_x)
        pre['score'] = pre['score'].apply(lambda x: float('%.6f' % x))
        return pre
class DeployedClassifierFactory:
    def __init__(self, term_doc_matrix, term_doc_matrix_factory, category, nlp=None):
        '''This is a class that enables one to train and save a classification model.

        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
        term_doc_matrix_factory : TermDocMatrixFactory
            Must have neither an nlp object nor a category/text iterator
            attached (both are asserted to be None).
        category : str
            Category name
        nlp : spacy parser
            Accepted for interface compatibility; not used by this class.
        '''
        self._term_doc_matrix = term_doc_matrix
        self._term_doc_matrix_factory = term_doc_matrix_factory
        assert term_doc_matrix_factory._nlp is None
        assert term_doc_matrix_factory.category_text_iter is None
        self._category = category
        self._clf = None
        self._proba = None

    @staticmethod
    def _make_proba_function(pos_ecdf, neg_ecdf):
        '''Build a callable mapping a hyperplane distance to a score in [0, 1].

        Parameters
        ----------
        pos_ecdf : callable
            Empirical CDF of the non-negative training distances.
        neg_ecdf : callable
            Empirical CDF of the non-positive training distances.

        Returns
        -------
        callable
            Positive distances map to [0.5, 1], negative distances to
            [0, 0.5], and a distance of exactly zero to 0.5.
        '''
        def proba_function(distance_from_hyperplane):
            if distance_from_hyperplane > 0:
                return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
            elif distance_from_hyperplane < 0:
                # Use the negative-side ECDF here: the positive-side ECDF is
                # 0 for every negative distance, which would collapse all
                # negative predictions onto a score of 0.
                return neg_ecdf(distance_from_hyperplane) / 2.
            return 0.5

        return proba_function

    def passive_aggressive_train(self):
        '''Trains passive aggressive classifier

        Fits the classifier on the term-document matrix and derives a
        distance-to-score function from the empirical distribution of the
        training decision values.

        Returns
        -------
        DeployedClassifierFactory
            self, to allow chaining.
        '''
        self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
        self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
        y_dist = self._clf.decision_function(self._term_doc_matrix._X)
        pos_ecdf = ECDF(y_dist[y_dist >= 0])
        neg_ecdf = ECDF(y_dist[y_dist <= 0])
        self._proba = self._make_proba_function(pos_ecdf, neg_ecdf)
        return self

    def build(self):
        '''Builds Deployed Classifier

        Raises
        ------
        NeedToTrainExceptionBeforeDeployingException
            If no classifier has been trained yet.
        '''
        if self._clf is None:
            raise NeedToTrainExceptionBeforeDeployingException()
        return DeployedClassifier(self._category,
                                  self._term_doc_matrix._category_idx_store,
                                  self._term_doc_matrix._term_idx_store,
                                  self._term_doc_matrix_factory)
class PassiveAggressiveClassifierImpl:
    """Adapter that forwards every call to a wrapped ``Op`` estimator."""

    def __init__(self, **hyperparams):
        # Keep the raw keyword arguments and build the wrapped estimator
        # from them.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is only passed on when given."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        model = self._wrapped_model
        return model.predict(X)

    def decision_function(self, X):
        """Return the wrapped model's decision-function values for ``X``."""
        model = self._wrapped_model
        return model.decision_function(X)
def test_main(self):
    """Smoke-test training a passive-aggressive model and scoring one document.

    The document to score is tf-idf transformed once and the same matrix is
    used for both ``predict`` and ``decision_function``, so both calls see
    the feature space the classifier was trained on.
    """
    categories, documents = get_docs_categories()
    clean_function = lambda text: '' if text.startswith('[') else text
    entity_types = {'GPE'}
    term_doc_mat = (
        TermDocMatrixFactory(
            category_text_iter=zip(categories, documents),
            clean_function=clean_function,
            nlp=_testing_nlp,
            feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
        ).build()
    )
    clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
    fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
                       clean_function=clean_function,
                       feats_from_spacy_doc=FeatsFromSpacyDoc(
                           entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)
    # Apply the fitted tf-idf weighting before scoring; the classifier never
    # saw un-normalised term counts during training.
    X_to_predict = tfidf.transform(
        fdc.feats_from_doc('Did sometimes march UNKNOWNWORD'))
    pred = clf.predict(X_to_predict)
    dec = clf.decision_function(X_to_predict)
def predict(filename,
            training_csv='C:\\Users\\Niladri Shekhar Dutt\\Desktop\\IET-FE\\FakeNews\\fakenewsFE\\fake_or_real_news.csv'):
    """Train a fake-news classifier and score the articles in ``filename``.

    The model is retrained from ``training_csv`` on every call.

    Parameters
    ----------
    filename : path of a latin1-encoded CSV with a ``text`` column to score.
    training_csv : path of the labelled CSV (``text`` and ``label`` columns)
        used for training. Defaults to the original hard-coded location.

    Returns
    -------
    The rescaled score of the first article multiplied by 100. The score is
    ``(margin + 1) / 2`` -- an affine rescaling of the signed decision
    value, not a calibrated probability.
    """
    df = pd.read_csv(training_csv)
    y = df.label

    # Only the training half of the split is used.
    X_train, _, y_train, _ = train_test_split(
        df['text'], y, test_size=0.5, random_state=53)

    count_vectorizer = CountVectorizer(stop_words='english')
    count_train = count_vectorizer.fit_transform(X_train)

    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)

    # Diagnostic output kept from the original notebook export.
    print(tfidf_vectorizer.get_feature_names()[-10:])
    print(count_vectorizer.get_feature_names()[:10])
    count_df = pd.DataFrame(count_train.A,
                            columns=count_vectorizer.get_feature_names())
    tfidf_df = pd.DataFrame(tfidf_train.A,
                            columns=tfidf_vectorizer.get_feature_names())
    print(count_df.equals(tfidf_df))

    # A single fit is enough: refitting on identical data only repeats work.
    linear_clf = PassiveAggressiveClassifier(n_iter=50)
    linear_clf.fit(tfidf_train, y_train)

    a = pd.read_csv(filename, encoding='latin1')
    tfidf_test = tfidf_vectorizer.transform(a['text'])

    probs = linear_clf.decision_function(tfidf_test)
    probs = (probs + 1.0) / 2.0
    print(probs)

    # The printed flag reflects the last article only (True when there are
    # no articles), matching the original loop.
    flag = True
    if len(probs):
        flag = bool(probs[-1] > 0.25)
    print(flag)
    return (probs[0] * 100)
pickle.dump(clf_pac, open(model_file, 'wb')) # Saved the tfidf to transform input tfidf_file = 'tfidf.sav' pickle.dump(tfidf_ngram, open(tfidf_file, 'wb')) # Fun thing sen_test = "In 2013, Clinton told Goldman Sachs bigwigs: \ 'I would like to see people like Donald Trump run for office.\ They're honest, and can't be bought" sentiment_score = sen_feature(sen_test) X_sen = tfidf_ngram.transform([sen_test]) X_sen = sp.sparse.hstack((X_sen, np.array([sentiment_score])), format='csr') label_sen = clf_pac.predict(X_sen) proba_truth_sen = clf_pac.decision_function(X_sen)[0] proba_doubt_sen = (1 - abs(proba_truth_sen)) * abs(proba_truth_sen) / ( -proba_truth_sen) print(" PAC_model :") print(" This new is : " + label_sen[0]) print(" The sentiment score :" + str(sentiment_score)) print(" The truth score :" + str(proba_truth_sen)) print(" The doublt score :" + str(proba_doubt_sen)) X_sen2 = tokenizer.texts_to_sequences([sen_test]) X_sen2 = sequence.pad_sequences(X_sen2, maxlen=50) label_sen2 = model.predict(X_sen2) if label_sen2[0] < 0.5: label_sen2 = 'FAKE' else:
def run(self, nFold=3, iter=10, verbose=1): """ CV: -1 => total model (no cv) CV: nFold => mean metric over cv """ self.__database.createGOIDView(self.__goidtable, double=["AUROC", "AUPR", "Fmax"], drop=True) self.__database.createProteinView(self.__proteintable, \ double=["ProteinID", "Label", "Score"], drop=True) # Get labels test = 0 pp = permutation(self.__numproteins) resultid = 0 for goid in self.__goid: print "____________ GOID= %d ____________" % goid # Get label for GOID goidindex = where(self.__goid==goid) goidindex = int(goidindex[0]) print goidindex annotations = self.selectAnnotatedProteinsMousefunc(goidindex) print "0s=", len([x for x in annotations if x == 0]) print "1s=", len([x for x in annotations if x == 1]) print "-1s=", len([x for x in annotations if x == -1]) annotation = [] for value in annotations: annotation.append(value) annotation = asarray(annotation).astype(float64) annotation = annotation.ravel() model = PassiveAggressiveClassifier(loss='hinge', n_iter=iter, verbose=verbose) model.fit(self.__network, annotation) scores = model.decision_function(self.__network) scores = self.convertScore(scores) per = Performance(annotations, scores) roc = per.AUROCGillis() print "AUROC= ", roc pr = per.AUPRGillis() print "AUPR= ", pr fmax = per.Fmax() print "Fmax= ", fmax self.__database.insertProteinView(self.__proteintable, resultid, goid[0], -1, \ self.__proteins, annotations, scores) self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], -1, [roc, pr, fmax]) resultid += 1 del per labelIx = range(self.__numproteins) offset = 0 fold = 0 meanroc = [] meanpr = [] meanfmax = [] while fold < nFold: print "____________ Fold= %d ____________" % fold lastelem = min(self.__numproteins, offset+floor(self.__numproteins/nFold)) ix = [] for index in pp[offset+1:lastelem]: ix.append(labelIx[index]) offset = lastelem labeltmp = [] for value in annotations: labeltmp.append(float(value)) labeltmp = asarray(labeltmp).astype(float64) labeltmp = 
labeltmp.ravel() print labeltmp.shape for index in ix: labeltmp[index] = 0 print "0s=", len([x for x in labeltmp if x == 0]) print "1s=", len([x for x in labeltmp if x == 1]) print "-1s=", len([x for x in labeltmp if x == -1]) model = PassiveAggressiveClassifier(loss='hinge', \ n_iter=iter, verbose=verbose) model.fit(self.__network, labeltmp) scores = model.decision_function(self.__network) scores = self.convertScore(scores) score = [] annotation = [] proteins = [] for index in ix: score.append(float(scores[index])) annotation.append(annotations[index]) proteins.append(self.__proteins[index]) per = Performance(annotation, score) roc = per.AUROCGillis() print "AUROC= ", roc meanroc.append(roc) pr = per.AUPRGillis() print "AUPR= ", pr meanpr.append(pr) fmax = per.Fmax() print "Fmax= ", fmax meanfmax.append(fmax) self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], fold,\ [roc, pr, fmax]) self.__database.insertProteinView(self.__proteintable, resultid, goid[0],\ fold, proteins, annotation, score) del proteins del annotation del score del per fold += 1 resultid += 1 roc_mean = reduce(lambda x, y: x + y / float(len(meanroc)), meanroc, 0) print "Mean AUROC= ", roc_mean #print sum(meanroc)/float(len(meanroc)) pr_mean = reduce(lambda x, y: x + y / float(len(meanpr)), meanpr, 0) print "Mean AUPR= ", pr_mean #print sum(meanpr)/float(len(meanpr)) fmax_mean = reduce(lambda x, y: x + y / float(len(meanfmax)), meanfmax, 0) print "Mean Fmax= ", fmax_mean self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], nFold, \ [roc_mean, pr_mean, fmax_mean]) resultid += 1 test += 1
def hoax_detection():
    """Train three hoax/valid text classifiers, plot results, return predictions.

    Reads ``MasterBeritaAfterCleanCombined.csv``, splits it 67/33 into train
    and test sets, builds tf-idf features and evaluates MultinomialNB,
    PassiveAggressiveClassifier and SVC. For every model a confusion matrix
    is plotted for the test split and then for the training split. A ROC
    curve is plotted for the passive-aggressive model.

    Returns
    -------
    dict
        Keys ``'multinomial'``, ``'passive'`` and ``'svm'``. Each value holds
        that model's predictions on the *training* split, because the
        training-set pass runs last and overwrites the test-set predictions.
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import auc
    from sklearn.metrics import roc_curve

    def plot_confusion_matrix(cm, classes, normalize=False,
                              title='Confusion matrix', cmap=plt.cm.Blues):
        """
        See full source and example:
        http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        """
        # Normalise before drawing so the image and the cell labels agree.
        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')

        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, cm[i, j],
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

    def most_informative_feature_for_binary_classification(
            vectorizer, classifier, n=100):
        """
        See: https://stackoverflow.com/a/26980472

        Identify most important features if given a vectorizer and binary
        classifier. Set n to the number of weighted features you would like
        to show. (Note: current implementation merely prints and does not
        return top classes.)
        """
        class_labels = classifier.classes_
        feature_names = vectorizer.get_feature_names()
        ranked = sorted(zip(classifier.coef_[0], feature_names))
        topn_class1 = ranked[:n]
        topn_class2 = ranked[-n:]

        for coef, feat in topn_class1:
            print(class_labels[0], coef, feat)

        print()

        for coef, feat in reversed(topn_class2):
            print(class_labels[1], coef, feat)

    # Reading data as pandas dataframes, indexed by the "no" column.
    frame = pd.read_csv('MasterBeritaAfterCleanCombined.csv',
                        error_bad_lines=False, encoding='latin1')
    frame2 = pd.read_csv('new_TestData.csv',
                         error_bad_lines=False, encoding='latin1')
    frame = frame.set_index("no")
    frame2 = frame2.set_index("no")

    y = frame.tagging
    print(frame['berita'].shape)
    print(y.shape)
    # The second file is only inspected; the evaluation below uses a split
    # of the first file.
    print(len(frame2['berita']))

    X_train, X_test, y_train, y_test = train_test_split(
        frame['berita'], y, test_size=0.33, random_state=53)

    factory = StopWordRemoverFactory()
    stopwords = factory.get_stop_words()

    # tf-idf features: fit on the training texts, reuse for the test texts.
    tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                       stop_words=frozenset(stopwords),
                                       max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)
    tfidf_test = tfidf_vectorizer.transform(X_test)
    print(tfidf_test)
    print('separator')
    print(tfidf_vectorizer.get_feature_names()[-20:])

    labels = ['Hoax', 'Valid']

    # --- MultinomialNB, evaluated on the test split ---
    start = timeit.default_timer()
    clf = MultinomialNB()
    clf.fit(tfidf_train, y_train)
    pred = clf.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    multinomialpred = pred
    print("#Result:#Multinomial#", pred)
    print("accuracy:   %0.3f" % score)
    cm = confusion_matrix(y_test, pred, labels=labels)
    stop = timeit.default_timer()
    print('Time Multinomial: ', stop - start)
    plot_confusion_matrix(
        cm, classes=labels,
        title='MultinomialNB Confusion Matrix (Predict: Test)')

    # --- MultinomialNB, evaluated on the training split ---
    clf = MultinomialNB()
    clf.fit(tfidf_train, y_train)
    pred = clf.predict(tfidf_train)
    multinomialpred = pred
    cm = confusion_matrix(y_train, pred, labels=labels)
    plot_confusion_matrix(
        cm, classes=labels,
        title='MultinomialNB Confusion Matrix (Predict: Training)')

    # --- PassiveAggressiveClassifier, evaluated on the test split ---
    start = timeit.default_timer()
    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    passiveaggressivepred = pred
    print("#Result:#PassiveAggressiveClassifier#", pred)
    print("accuracy:   %0.3f" % score)
    cm = confusion_matrix(y_test, pred, labels=labels)
    stop = timeit.default_timer()
    print('Time PassiveAggressiveClassifier: ', stop - start)
    plot_confusion_matrix(
        cm, classes=labels,
        title='PassiveAggressiveClassifier Confusion Matrix (Predict: Test)')

    # --- PassiveAggressiveClassifier, evaluated on the training split ---
    # NOTE: `start` is not reset here, so the printed time is cumulative
    # since the test-split run began.
    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_train)
    passiveaggressivepred = pred
    cm = confusion_matrix(y_train, pred, labels=labels)
    stop = timeit.default_timer()
    print('Time PassiveAggressiveClassifier: ', stop - start)
    plot_confusion_matrix(
        cm, classes=labels,
        title='PassiveAggressiveClassifier Confusion Matrix (Predict: Training)')

    # --- SVC, evaluated on the test split ---
    start = timeit.default_timer()
    linear_clf_svm = svm.SVC()
    linear_clf_svm.fit(tfidf_train, y_train)
    pred = linear_clf_svm.predict(tfidf_test)
    score = accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    print("#Result:#SVM#", pred)
    svmpred = pred
    cm = confusion_matrix(y_test, pred, labels=labels)
    stop = timeit.default_timer()
    print('Time SVM: ', stop - start)
    plot_confusion_matrix(cm, classes=labels,
                          title='SVM Confusion Matrix (Predict: Test)')

    # --- SVC, evaluated on the training split (cumulative time, as above) ---
    linear_clf_svm = svm.SVC()
    linear_clf_svm.fit(tfidf_train, y_train)
    pred = linear_clf_svm.predict(tfidf_train)
    svmpred = pred
    cm = confusion_matrix(y_train, pred, labels=labels)
    stop = timeit.default_timer()
    print('Time SVM: ', stop - start)
    plot_confusion_matrix(cm, classes=labels,
                          title='SVM Confusion Matrix (Predict: Training)')

    print('y_test')
    print(y_test)

    # ROC curve for the passive-aggressive model. In the binary case the
    # decision function scores classes_[1], so that label must be the
    # positive one; the labels in this data set are 'Hoax'/'Valid'.
    fpr, tpr, thresholds = roc_curve(y_test,
                                     linear_clf.decision_function(tfidf_test),
                                     pos_label=linear_clf.classes_[1])
    # find threshold closest to zero:
    close_zero = np.argmin(np.abs(thresholds))
    plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
             label='threshold zero(default)', fillstyle='none', c='k', mew=2)
    plt.plot([0, 1], linestyle='-', lw=2, color='r', label='random', alpha=0.8)
    plt.legend(loc=4)
    plt.plot(fpr, tpr, label='ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (recall)')
    plt.title('roc_curve')
    plt.show()
    print('AUC score is: ', auc(fpr, tpr))

    # Print the alphabetically first token with its MultinomialNB weight.
    feature_names = tfidf_vectorizer.get_feature_names()
    tokens_with_weights = sorted(zip(feature_names, clf.coef_[0]))
    if tokens_with_weights:
        print(tokens_with_weights[0])

    result = dict()
    result['multinomial'] = multinomialpred
    result['passive'] = passiveaggressivepred
    result['svm'] = svmpred
    return result
matData.append(W_key) #print "N_features", ex.shape W = np.asarray( csc_matrix((matData, (RowIndex, ColIndex)), shape=ex.shape).todense()) #print W #raw_input("W (output)") #W=W_old #dump line #set the weights of PA to the predicted values PassiveAggressive.coef_ = W pred = PassiveAggressive.predict(ex) score = PassiveAggressive.decision_function(ex) bintargets.append(g_it.target[i]) if pred != g_it.target[i]: errors += 1 print "Error", errors, " on example", i, "pred", score, "target", g_it.target[ i] if g_it.target[i] == 1: fn += 1 else: fp += 1 else: if g_it.target[i] == 1: tp += 1 else:
#print "new_features", list_for_deep[i] # for key,rowDict in list_for_deep[i].iteritems(): # #print "key", key, "target", target # #print "weight", features[i,key] # exampleESN+=np.array(np.multiply(rowDict,features[i,key])).reshape(nHidden,) # #print "exampleESN", exampleESN # #print list_for_deep[i].keys() if i != 0: #W_old contains the model at the preceeding step # Here we want so use the deep network to predict the W values of the features # present in ex #set the weights of PA to the predicted values pred = PassiveAggressive.predict(exampleESN) score = PassiveAggressive.decision_function(exampleESN) if pred != g_it.target[i]: errors += 1 print "Error", errors, " on example", i, "pred", score, "target", g_it.target[ i] if g_it.target[i] == 1: fn += 1 else: fp += 1 else: if g_it.target[i] == 1: tp += 1 else: tn += 1 #print "Correct prediction example",i, "pred", score, "target",g_it.target[i]
class PassiveAggressive(
    IterativeComponentWithSampleWeight,
    AutoSklearnClassificationAlgorithm,
):
    """Iteratively fitted wrapper around scikit-learn's PassiveAggressiveClassifier."""

    def __init__(self, C, fit_intercept, tol, loss, average, random_state=None):
        """Store the hyper-parameters; the estimator itself is built lazily
        on the first ``iterative_fit`` call."""
        self.C = C
        self.fit_intercept = fit_intercept
        self.average = average
        self.tol = tol
        self.loss = loss
        self.random_state = random_state
        self.estimator = None
        # Upper bound on the total number of iterations across all calls.
        self.max_iter = self.get_max_iter()
        # Number of iterations spent so far; None until the first fit.
        self.n_iter_ = None

    @staticmethod
    def get_max_iter():
        """Return the iteration budget (1024)."""
        return 1024

    def get_current_iter(self):
        """Return the number of iterations spent so far."""
        return self.n_iter_

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        """Fit, or continue fitting, the estimator for ``n_iter`` more iterations.

        The first call (or any call with ``refit=True``) builds a fresh
        estimator and fits it; later calls raise the estimator's
        ``max_iter`` and continue training through ``_partial_fit``.
        Multilabel targets are handled by a one-vs-rest wrapper that is
        fitted in a single step.

        Returns
        -------
        self
        """
        from sklearn.linear_model import PassiveAggressiveClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            # Discard any previous state so the block below starts afresh.
            self.estimator = None
            self.n_iter_ = None

        if self.estimator is None:
            self.fully_fit_ = False

            # Hyper-parameters may arrive as strings; normalise them once.
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)
            self.C = float(self.C)

            call_fit = True
            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))
        else:
            call_fit = False

        # Fallback for multilabel classification
        if len(y.shape) > 1 and y.shape[1] > 1:
            import sklearn.multiclass
            # The wrapper is fitted once with the full iteration budget.
            self.estimator.max_iter = self.get_max_iter()
            self.estimator = sklearn.multiclass.OneVsRestClassifier(
                self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        else:
            if call_fit:
                self.estimator.fit(X, y)
                self.n_iter_ = n_iter
            else:
                # Extend the budget by n_iter, capped at the global maximum.
                self.estimator.max_iter += n_iter
                self.estimator.max_iter = min(self.estimator.max_iter,
                                              self.max_iter)
                self.estimator._validate_params()
                # "pa1" is used for the hinge loss, "pa2" for any other loss.
                lr = "pa1" if self.estimator.loss == "hinge" else "pa2"
                self.estimator._partial_fit(
                    X, y,
                    alpha=1.0,
                    C=self.estimator.C,
                    loss="hinge",
                    learning_rate=lr,
                    max_iter=n_iter,
                    classes=None,
                    sample_weight=sample_weight,
                    coef_init=None,
                    intercept_init=None
                )
                self.n_iter_ += self.estimator.n_iter_

            # Done when the budget is exhausted, or when fewer iterations
            # were spent than allowed.
            if (
                self.estimator.max_iter >= self.max_iter
                or self.estimator.max_iter > self.n_iter_
            ):
                self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        """Return True once fitting has finished; False before any fit."""
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        """Return the estimator's predictions; raises NotImplementedError
        when nothing has been fitted yet."""
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the softmax of the decision-function values; raises
        NotImplementedError when nothing has been fitted yet."""
        if self.estimator is None:
            raise NotImplementedError()

        df = self.estimator.decision_function(X)
        return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component."""
        return {'shortname': 'PassiveAggressive Classifier',
                'name': 'Passive Aggressive Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'handles_multioutput': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build the configuration space over loss, fit_intercept, tol, C and average."""
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1.0, log=True)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        loss = CategoricalHyperparameter(
            "loss", ["hinge", "squared_hinge"], default_value="hinge"
        )

        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, default_value=1e-4,
                                         log=True)
        # Note: Average could also be an Integer if > 1
        average = CategoricalHyperparameter('average', ['False', 'True'],
                                            default_value='False')

        cs = ConfigurationSpace()
        cs.add_hyperparameters([loss, fit_intercept, tol, C, average])
        return cs
def run(self, nFold=3, loss='hinge', iter=10, verbose=1): log.debug("PA: run") (numx, numy) = self._network.shape pp = permutation(numx) model = PassiveAggressiveClassifier(loss=loss, n_iter=iter, verbose=verbose) model.fit(self._network, self._annotation.ravel()) scores = model.decision_function(self._network) self._scores = self._convertScore(scores) fold = 0 offset = 0 meanroc = [] labelIx = range(numx) while fold < nFold: log.debug("NV: ___ fold= %d ___" % fold) lastelem = int(min(numx, offset+floor(numx/nFold))) ix = [] for index in pp[offset+1:lastelem]: ix.append(index) print lastelem offset = lastelem labeltmp = [] for value in self._annotation: labeltmp.append(float(value)) for index in ix: labeltmp[index] = 0 model = PassiveAggressiveClassifier(loss=loss, n_iter=iter, verbose=verbose) model.fit(self._network, labeltmp) scores = model.decision_function(self._network) scores = self._convertScore(scores) score = [] label = [] protein = [] for index in ix: score.append(float(scores[index])) label.append(int(self._annotation[index])) protein.append(int(self._proteinid[index])) self._foldlabels.append(int(self._annotation[index])) self._foldscores.append(float(scores[index])) self._foldproteins.append(int(self._proteinid[index])) auroc = self.AUROC(label, score) log.debug("AUROC= %.4f" % auroc) meanroc.append(auroc) fold += 1 self._auroc = reduce(lambda x, y: x + y / float(len(meanroc)), meanroc, 0) auroc = self.AUROC(self._foldlabels, self._foldscores) self._TPR_FPR(self._foldlabels, self._foldscores)
# Ensemble members
# The SVM is trained on log(x + 1) features, so the test features must go
# through the same transform before scoring.
svc_clf8 = LinearSVC(C=0.8)
svc_clf8.fit(np.log(x_train+1), y_train)
x_test_log = np.log(x_test+1)
decision_svc=svc_clf8.decision_function(x_test_log)
prediction_svc8=svc_clf8.predict(x_test_log)
svc_score8 = accuracy_score(y_test, prediction_svc8)

Ridge_clf = RidgeClassifier(alpha=1)
Ridge_clf.fit(x_train, y_train)
decision_ridge=Ridge_clf.decision_function(x_test)
prediction_ridge=Ridge_clf.predict(x_test)
Ridge_clf_score = accuracy_score(y_test, prediction_ridge)

PAC_clf = PassiveAggressiveClassifier(C=0.1)
PAC_clf.fit(x_train, y_train)
decision_pac=PAC_clf.decision_function(x_test)
prediction_PAC=PAC_clf.predict(x_test)
PAC_clf_score = accuracy_score(y_test, prediction_PAC)

# NOTE(review): RandomizedLogisticRegression is documented as a
# feature-selection transformer -- confirm that `.predict` exists in the
# pinned scikit-learn version.
from sklearn.linear_model import RandomizedLogisticRegression
RandomizedLogisticRegression_clf = RandomizedLogisticRegression(C=5,n_jobs=-1)
RandomizedLogisticRegression_clf.fit(x_train, y_train)
prediction_RandomizedLogisticRegression=RandomizedLogisticRegression_clf.predict(x_test)
RandomizedLogisticRegression_clf_score = accuracy_score(y_test, prediction_RandomizedLogisticRegression)

####################################################################
# Print the scores of the different models
print('Score modele %s est de %s' % ('RF',score_rf))
print('Score modele %s est de %s' % ('Ext',score_ext))
print('Score modele %s est de %s' % ('Sig',sig_score))
fileObject_lsa = open(file_Name_lsa, 'wb') pickle.dump(passive_lsa, fileObject_lsa) fileObject_lsa.close() file_vectorizer_open = open("global_intent_tfidf_vectorizer.p", 'wb') pickle.dump(vectorizer, file_vectorizer_open) file_vectorizer_open.close() file_lsa_vectorizer_open = open("global_intent_lsa_vectorizer.p", 'wb') pickle.dump(lsa, file_lsa_vectorizer_open) file_lsa_vectorizer_open.close() while (1): out_put = [] test_text = raw_input('Enter: ') test_text_clean = [ each_word for each_word in test_text.split() if each_word not in stop_words ] test_text_lmtzr = [ lmtzr.lemmatize(each_word) for each_word in test_text_clean ] out_put.append(' '.join(test_text_lmtzr)) out_put_vector = vectorizer.transform(out_put) out_put_class = passive_tfidf.predict(out_put_vector) print 'tf-idf: ', out_put_class print 'tf-idf: ', passive_tfidf.decision_function(out_put_vector) out_put_vector_lsa = lsa.transform(out_put_vector) print 'lsa: ', passive_lsa.predict(out_put_vector_lsa) print 'lsa: ', passive_lsa.decision_function(out_put_vector_lsa)
def predict(filename,
            training_csv='C:\\Users\\Niladri Shekhar Dutt\\Desktop\\IET-FE\\FakeNews\\fakenewsFE\\fake_or_real_news.csv'):
    """Train a fake-news classifier and score the articles in ``filename``.

    The model is retrained from ``training_csv`` on every call.

    Parameters
    ----------
    filename : path of a latin1-encoded CSV with a ``text`` column to score.
    training_csv : path of the labelled CSV (``text`` and ``label`` columns)
        used for training. Defaults to the original hard-coded location.

    Returns
    -------
    The rescaled score of the first article multiplied by 100. The score is
    ``(margin + 1) / 2`` -- an affine rescaling of the signed decision
    value, not a calibrated probability.
    """
    df = pd.read_csv(training_csv)
    y = df.label

    # Only the training half of the split is used.
    X_train, _, y_train, _ = train_test_split(
        df['text'], y, test_size=0.5, random_state=53)

    count_vectorizer = CountVectorizer(stop_words='english')
    count_train = count_vectorizer.fit_transform(X_train)

    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)

    # Diagnostic output kept from the original notebook export.
    print(tfidf_vectorizer.get_feature_names()[-10:])
    print(count_vectorizer.get_feature_names()[:10])
    count_df = pd.DataFrame(count_train.A,
                            columns=count_vectorizer.get_feature_names())
    tfidf_df = pd.DataFrame(tfidf_train.A,
                            columns=tfidf_vectorizer.get_feature_names())
    print(count_df.equals(tfidf_df))

    # A single fit is enough: refitting on identical data only repeats work.
    linear_clf = PassiveAggressiveClassifier(n_iter=50)
    linear_clf.fit(tfidf_train, y_train)

    a = pd.read_csv(filename, encoding='latin1')
    tfidf_test = tfidf_vectorizer.transform(a['text'])

    probs = linear_clf.decision_function(tfidf_test)
    probs = (probs + 1.0) / 2.0
    print(probs)

    # The printed flag reflects the last article only (True when there are
    # no articles), matching the original loop.
    flag = True
    if len(probs):
        flag = bool(probs[-1] > 0.25)
    print(flag)
    return (probs[0] * 100)
X_vector = vectorizer.fit_transform(training_text) clf = PassiveAggressiveClassifier(n_iter=50) clf.fit(X_vector,training_class) file_Name = "classif_test.p" fileObject = open(file_Name,'wb') pickle.dump(clf, fileObject) fileObject.close() while(1): out_put = [] out_put.append(raw_input('Enter: ').lower()) out_put_vector = vectorizer.transform(out_put) out_put_class = clf.predict(out_put_vector) print out_put_class print clf.decision_function(out_put_vector) # print clf.predict_proba(out_put_vector)