Esempio n. 1
0
class Pac_RF():
    """Two-stage classifier: Passive-Aggressive scores fed into a random forest.

    A ``PassiveAggressiveClassifier`` is fitted on the raw features; its
    ``decision_function`` output is then used as the feature matrix of a
    ``RandomForestClassifier`` whose class probabilities are exposed through
    ``predict_proba``.
    """

    def __init__(self):
        self.pac = PassiveAggressiveClassifier(tol=0.001)
        self.forest = RandomForestClassifier()
        # Class labels in the column order of predict_proba; set by fit().
        self.classes_ = []

    def set_params(self, params):
        """Route ``'<model>.<param>'`` settings to the wrapped estimators.

        Parameters
        ----------
        params : dict
            Keys look like ``'pac.C'`` or ``'forest.n_estimators'``. Any
            prefix other than ``'pac'`` is sent to the random forest.

        Returns
        -------
        self
        """
        for name in params:
            # Split on the first dot only so the parameter part may itself
            # contain dots.
            model, param_ = name.split('.', 1)
            target = self.pac if model == 'pac' else self.forest
            target.set_params(**{param_: params[name]})
        return self

    def _decision_features(self, X):
        """Return the Passive-Aggressive decision scores as a 2-D matrix.

        For binary problems ``decision_function`` yields a 1-D array, which
        the random forest rejects as a feature matrix, so it is reshaped to
        a single column.
        """
        des_matrix = self.pac.decision_function(X)
        if des_matrix.ndim == 1:
            des_matrix = des_matrix.reshape(-1, 1)
        return des_matrix

    def fit(self, X, y):
        """Fit both stages on ``(X, y)`` and record the class order.

        Returns
        -------
        self
        """
        self.pac.fit(X, y)
        self.forest.fit(self._decision_features(X), y)
        self.find_class_order()
        return self

    def predict_proba(self, X):
        """Return the forest's class probabilities for ``X``."""
        return self.forest.predict_proba(self._decision_features(X))

    def find_class_order(self):
        """Copy the fitted forest's ``classes_`` onto this object."""
        self.classes_ = self.forest.classes_
 def test_main(self):
     """Smoke test: train on a term-document matrix and score one new document.

     Builds a term-document matrix with GPE entities censored, fits a
     Passive-Aggressive classifier on L1-normalised TF-IDF features, then
     runs ``predict`` and ``decision_function`` on a single document.
     """
     categories, documents = get_docs_categories()
     clean_function = lambda text: '' if text.startswith('[') else text
     entity_types = {'GPE'}

     matrix_factory = TermDocMatrixFactory(
         category_text_iter=zip(categories, documents),
         clean_function=clean_function,
         nlp=_testing_nlp,
         feats_from_spacy_doc=FeatsFromSpacyDoc(
             entity_types_to_censor=entity_types))
     term_doc_mat = matrix_factory.build()

     clf = PassiveAggressiveClassifier(n_iter=5,
                                       C=0.5,
                                       n_jobs=-1,
                                       random_state=0)

     doc_featurizer = FeatsFromDoc(
         term_doc_mat._term_idx_store,
         clean_function=clean_function,
         feats_from_spacy_doc=FeatsFromSpacyDoc(
             entity_types_to_censor=entity_types))
     fdc = doc_featurizer.set_nlp(_testing_nlp)

     tfidf = TfidfTransformer(norm='l1')
     train_features = tfidf.fit_transform(term_doc_mat._X)
     clf.fit(train_features, term_doc_mat._y)

     X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
     # The label is predicted on TF-IDF features, the distance on raw counts.
     predicted_label = clf.predict(tfidf.transform(X_to_predict))
     distance = clf.decision_function(X_to_predict)
Esempio n. 3
0
class DeployedClassifierFactory:
    def __init__(self,
                 term_doc_matrix,
                 term_doc_matrix_factory,
                 category,
                 nlp=None):
        '''This is a class that enables one to train and save a classification model.

        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
        term_doc_matrix_factory : TermDocMatrixFactory
            Must have neither an nlp object nor a category/text iterator set.
        category : str
            Category name
        nlp : spacy.en.English
            Accepted for interface compatibility; not used by this class.
        '''
        self._term_doc_matrix = term_doc_matrix
        self._term_doc_matrix_factory = term_doc_matrix_factory
        assert term_doc_matrix_factory._nlp is None
        assert term_doc_matrix_factory.category_text_iter is None
        self._category = category
        self._clf = None
        self._proba = None

    @staticmethod
    def _build_proba_function(pos_ecdf, neg_ecdf):
        '''Build a function mapping a hyperplane distance to a pseudo-probability.

        Positive distances are mapped through ``pos_ecdf`` into (0.5, 1],
        negative distances through ``neg_ecdf`` into [0, 0.5), and a distance
        of exactly zero maps to 0.5.
        '''

        def proba_function(distance_from_hyperplane):
            if distance_from_hyperplane > 0:
                return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
            elif distance_from_hyperplane < 0:
                # The negative side must use the ECDF of the negative
                # distances; the positive ECDF is 0 for every negative input.
                return neg_ecdf(distance_from_hyperplane) / 2.
            return 0.5

        return proba_function

    def passive_aggressive_train(self):
        '''Trains passive aggressive classifier

        Fits the classifier on the term-document matrix and stores a
        distance-to-probability function built from the empirical CDFs of
        the training distances on each side of the hyperplane.

        Returns
        -------
        self
        '''
        self._clf = PassiveAggressiveClassifier(n_iter=50,
                                                C=0.2,
                                                n_jobs=-1,
                                                random_state=0)
        self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
        y_dist = self._clf.decision_function(self._term_doc_matrix._X)
        pos_ecdf = ECDF(y_dist[y_dist >= 0])
        neg_ecdf = ECDF(y_dist[y_dist <= 0])
        self._proba = self._build_proba_function(pos_ecdf, neg_ecdf)
        return self

    def build(self):
        '''Builds Deployed Classifier

        Raises
        ------
        NeedToTrainExceptionBeforeDeployingException
            If no classifier has been trained yet.
        '''
        if self._clf is None:
            raise NeedToTrainExceptionBeforeDeployingException()
        return DeployedClassifier(self._category,
                                  self._term_doc_matrix._category_idx_store,
                                  self._term_doc_matrix._term_idx_store,
                                  self._term_doc_matrix_factory)
Esempio n. 4
0
class PAC(BaseClassifier):
    """Passive-Aggressive classifier plugged into the ``BaseClassifier`` pipeline.

    All constructor arguments are forwarded unchanged to ``BaseClassifier``;
    this class only supplies the estimator and the train/predict steps.
    """

    def __init__(self, TRAINVALTEST_DENSE_X, TRAINVALTEST_DENSE_X_NAMES,
                 TRAINVAL_SPARSE_X, TRAINVAL_SPARSE_X_NAMES,
                 TEST_SPARSE_X, TEST_SPARSE_X_NAMES,
                 UF_VW, ADF, TRAINVAL, UF_CSV, TRAINVAL_MERGE,
                 TEST_MERGE, TEST, name='pac', USE_TINY=False, RANDOMSTATE=2018):
        super(PAC, self).__init__(
            TRAINVALTEST_DENSE_X, TRAINVALTEST_DENSE_X_NAMES,
            TRAINVAL_SPARSE_X, TRAINVAL_SPARSE_X_NAMES,
            TEST_SPARSE_X, TEST_SPARSE_X_NAMES,
            UF_VW, ADF, TRAINVAL, UF_CSV, TRAINVAL_MERGE,
            TEST_MERGE, TEST, name, USE_TINY, RANDOMSTATE)
        self.clf = PassiveAggressiveClassifier(n_iter=50, tol=1e-3)

    def trainWithEva(self, trainval_x):
        """Fit on 90% of the data and report ROC AUC on the remaining 10%.

        Parameters
        ----------
        trainval_x : feature matrix aligned with ``self.trainval['label']``.

        Returns
        -------
        The ROC AUC of the decision scores on the validation split.
        """
        train_x, valid_x, train_y, valid_y = train_test_split(
            trainval_x, self.trainval['label'],
            test_size=0.1, random_state=self.randomstate)
        self.clf.fit(train_x, train_y)
        pred = self.clf.decision_function(valid_x)
        score = metrics.roc_auc_score(valid_y, pred)
        # The metric is ROC AUC, so the message must not call it accuracy.
        print("%s on valid set ROC AUC:   %0.5f" % (self.name, score))
        return score

    def predict(self, test_x=None, model_path=None):
        """Score the test set and return it with a ``score`` column.

        Parameters
        ----------
        test_x : optional feature matrix; when omitted it is taken from the
            second value returned by ``self.feature_engineering()``.
        model_path : optional path passed to ``self.load_model`` first.

        Returns
        -------
        The frame read from ``self.ds.TEST`` with decision scores, rounded
        to six decimals, in the ``score`` column.
        """
        if model_path is not None:
            self.load_model(model_path)
        if test_x is None:
            _, test_x = self.feature_engineering()
        pre = pd.read_csv(self.ds.TEST)
        pre['score'] = self.clf.decision_function(test_x)
        pre['score'] = pre['score'].apply(lambda x: float('%.6f' % x))
        return pre
class DeployedClassifierFactory:
	def __init__(self, term_doc_matrix, term_doc_matrix_factory, category, nlp=None):
		'''This is a class that enables one to train and save a classification model.

		Parameters
		----------
		term_doc_matrix : TermDocMatrix
		term_doc_matrix_factory : TermDocMatrixFactory
			Must have neither an nlp object nor a category/text iterator set.
		category : str
			Category name
		nlp : spacy parser
			Accepted for interface compatibility; not used by this class.
		'''
		self._term_doc_matrix = term_doc_matrix
		self._term_doc_matrix_factory = term_doc_matrix_factory
		assert term_doc_matrix_factory._nlp is None
		assert term_doc_matrix_factory.category_text_iter is None
		self._category = category
		self._clf = None
		self._proba = None

	@staticmethod
	def _build_proba_function(pos_ecdf, neg_ecdf):
		'''Build a function mapping a hyperplane distance to a pseudo-probability.

		Positive distances are mapped through ``pos_ecdf`` into (0.5, 1],
		negative distances through ``neg_ecdf`` into [0, 0.5), and a distance
		of exactly zero maps to 0.5.
		'''

		def proba_function(distance_from_hyperplane):
			if distance_from_hyperplane > 0:
				return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
			elif distance_from_hyperplane < 0:
				# The negative side must use the ECDF of the negative
				# distances; the positive ECDF is 0 for every negative input.
				return neg_ecdf(distance_from_hyperplane) / 2.
			return 0.5

		return proba_function

	def passive_aggressive_train(self):
		'''Trains passive aggressive classifier

		Fits the classifier on the term-document matrix and stores a
		distance-to-probability function built from the empirical CDFs of
		the training distances on each side of the hyperplane.

		Returns
		-------
		self
		'''
		self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
		self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
		y_dist = self._clf.decision_function(self._term_doc_matrix._X)
		pos_ecdf = ECDF(y_dist[y_dist >= 0])
		neg_ecdf = ECDF(y_dist[y_dist <= 0])
		self._proba = self._build_proba_function(pos_ecdf, neg_ecdf)
		return self

	def build(self):
		'''Builds Deployed Classifier

		Raises
		------
		NeedToTrainExceptionBeforeDeployingException
			If no classifier has been trained yet.
		'''
		if self._clf is None:
			raise NeedToTrainExceptionBeforeDeployingException()
		return DeployedClassifier(self._category,
		                          self._term_doc_matrix._category_idx_store,
		                          self._term_doc_matrix._term_idx_store,
		                          self._term_doc_matrix_factory)
Esempio n. 6
0
class PassiveAggressiveClassifierImpl:
    """Thin wrapper that delegates every call to an ``Op`` instance."""

    def __init__(self, **hyperparams):
        # Keep the raw hyperparameters next to the model built from them.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, passing ``y`` only when it is given; return self."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        model = self._wrapped_model
        return model.predict(X)

    def decision_function(self, X):
        """Return the wrapped model's decision scores for ``X``."""
        model = self._wrapped_model
        return model.decision_function(X)
	def test_main(self):
		"""Smoke test: train on a term-document matrix and score one new document.

		Builds a term-document matrix with GPE entities censored, fits a
		Passive-Aggressive classifier on L1-normalised TF-IDF features, then
		runs ``predict`` and ``decision_function`` on a single document.
		"""
		categories, documents = get_docs_categories()
		clean_function = lambda text: '' if text.startswith('[') else text
		entity_types = {'GPE'}

		corpus_factory = TermDocMatrixFactory(
			category_text_iter=zip(categories, documents),
			clean_function=clean_function,
			nlp=_testing_nlp,
			feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
		)
		term_doc_mat = corpus_factory.build()

		clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)

		censoring_feats = FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
		fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
		                   clean_function=clean_function,
		                   feats_from_spacy_doc=censoring_feats).set_nlp(_testing_nlp)

		tfidf = TfidfTransformer(norm='l1')
		weighted_counts = tfidf.fit_transform(term_doc_mat._X)
		clf.fit(weighted_counts, term_doc_mat._y)

		X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
		# The label is predicted on TF-IDF features, the distance on raw counts.
		label = clf.predict(tfidf.transform(X_to_predict))
		margin = clf.decision_function(X_to_predict)
Esempio n. 8
0
def predict(
        filename,
        train_csv='C:\\Users\\Niladri Shekhar Dutt\\Desktop\\IET-FE\\FakeNews\\fakenewsFE\\fake_or_real_news.csv'
):
    """Train a fake-news classifier and score the articles in ``filename``.

    A Passive-Aggressive classifier is trained on TF-IDF features of half of
    the labelled corpus, then applied to the ``text`` column of ``filename``.

    Parameters
    ----------
    filename : path of a latin-1 encoded CSV with a ``text`` column.
    train_csv : path of the labelled corpus (columns ``text`` and ``label``).
        Defaults to the location that used to be hard-coded.

    Returns
    -------
    The first article's rescaled decision score, ``(score + 1) / 2 * 100``.

    Notes
    -----
    Compared with the notebook export this replaces, the classifier is
    fitted once instead of twice. The dense count/TF-IDF DataFrames and the
    ``count_df.equals(tfidf_df)`` debug print are gone, because building
    them materialised the full document-term matrices in memory.
    """
    df = pd.read_csv(train_csv)
    y = df.label

    # Only the training half of the split is used below.
    X_train, _, y_train, _ = train_test_split(df['text'],
                                              y,
                                              test_size=0.5,
                                              random_state=53)

    # The count vectorizer is kept only for the vocabulary diagnostic print.
    count_vectorizer = CountVectorizer(stop_words='english')
    count_vectorizer.fit(X_train)

    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)

    print(tfidf_vectorizer.get_feature_names()[-10:])
    print(count_vectorizer.get_feature_names()[:10])

    linear_clf = PassiveAggressiveClassifier(n_iter=50)
    linear_clf.fit(tfidf_train, y_train)

    a = pd.read_csv(filename, encoding='latin1')
    tfidf_test = tfidf_vectorizer.transform(a['text'])

    # Shift the signed margin so that the decision boundary sits at 0.5.
    probs = (linear_clf.decision_function(tfidf_test) + 1.0) / 2.0
    print(probs)

    # NOTE(review): as in the original, the flag reflects only the LAST
    # score; confirm whether any()/all() over the scores was intended.
    flag = True
    for score in probs:
        flag = bool(score > 0.25)
    print(flag)

    return probs[0] * 100
Esempio n. 9
0
# Persist the trained classifier; the context manager closes the file handle.
with open(model_file, 'wb') as model_handle:
    pickle.dump(clf_pac, model_handle)

# Saved the tfidf to transform input
tfidf_file = 'tfidf.sav'
with open(tfidf_file, 'wb') as tfidf_handle:
    pickle.dump(tfidf_ngram, tfidf_handle)

# Fun thing
sen_test = "In 2013, Clinton told Goldman Sachs bigwigs: \
        'I would like to see people like Donald Trump run for office.\
        They're honest, and can't be bought"

# Features are the TF-IDF n-grams plus the sentiment score as an extra column.
sentiment_score = sen_feature(sen_test)
X_sen = tfidf_ngram.transform([sen_test])
X_sen = sp.sparse.hstack((X_sen, np.array([sentiment_score])), format='csr')
label_sen = clf_pac.predict(X_sen)
proba_truth_sen = clf_pac.decision_function(X_sen)[0]
# NOTE(review): divides by the decision value, so a score of exactly 0
# yields nan (numpy) or a ZeroDivisionError (plain float).
proba_doubt_sen = (1 - abs(proba_truth_sen)) * abs(proba_truth_sen) / (
    -proba_truth_sen)
print(" PAC_model :")
print(" This new is : " + label_sen[0])
print(" The sentiment score         :" + str(sentiment_score))
print(" The truth score             :" + str(proba_truth_sen))
print(" The doublt score            :" + str(proba_doubt_sen))

# Second model: tokenise and pad the sentence to length 50.
X_sen2 = tokenizer.texts_to_sequences([sen_test])
X_sen2 = sequence.pad_sequences(X_sen2, maxlen=50)

label_sen2 = model.predict(X_sen2)
if label_sen2[0] < 0.5:
    label_sen2 = 'FAKE'
else:
	def run(self, nFold=3, iter=10, verbose=1):
		"""Train and evaluate a Passive-Aggressive model for every GO id.

		For each GO id a model is first fitted and scored on all proteins,
		then ``nFold`` hold-out rounds are run: the labels of one fold are
		set to 0, the model is refitted, and the held-out proteins are
		scored. Results go to the GOID and protein database views, where
		the fold column means:

			-1     => total model (no cv)
			0..n-1 => one cross-validation fold
			nFold  => mean metric over cv

		Parameters
		----------
		nFold : number of cross-validation folds.
		iter : passed to the classifier as ``n_iter``.
		verbose : passed to the classifier as ``verbose``.

		Notes
		-----
		Folds are the consecutive slices ``pp[offset:offset + fold_size]`` of
		one random permutation. The earlier ``pp[offset + 1:...]`` slice
		skipped one protein per fold. Slice bounds are now integers.
		"""

		def print_label_counts(labels):
			# Same "0s= N" / "1s= N" / "-1s= N" lines as before.
			for tag, wanted in (("0s=", 0), ("1s=", 1), ("-1s=", -1)):
				print("%s %d" % (tag, len([x for x in labels if x == wanted])))

		def report(per):
			# Compute and print the three metrics of a Performance object.
			roc = per.AUROCGillis()
			print("AUROC=  %s" % (roc,))
			pr = per.AUPRGillis()
			print("AUPR=  %s" % (pr,))
			fmax = per.Fmax()
			print("Fmax=  %s" % (fmax,))
			return roc, pr, fmax

		def fold_mean(values):
			# Mean accumulated term by term; 0 for an empty list.
			count = float(len(values))
			return sum(value / count for value in values)

		self.__database.createGOIDView(self.__goidtable, double=["AUROC", "AUPR", "Fmax"], drop=True)
		self.__database.createProteinView(self.__proteintable,
				double=["ProteinID", "Label", "Score"], drop=True)

		pp = permutation(self.__numproteins)
		fold_size = self.__numproteins // nFold if nFold > 0 else 0
		resultid = 0
		for goid in self.__goid:
			print("____________ GOID= %d ____________" % goid)
			# Get label for GOID
			goidindex = int(where(self.__goid == goid)[0])
			print(goidindex)
			annotations = self.selectAnnotatedProteinsMousefunc(goidindex)
			print_label_counts(annotations)

			# Total model: fit and score on every protein.
			annotation = asarray(list(annotations)).astype(float64).ravel()
			model = PassiveAggressiveClassifier(loss='hinge', n_iter=iter, verbose=verbose)
			model.fit(self.__network, annotation)
			scores = self.convertScore(model.decision_function(self.__network))

			roc, pr, fmax = report(Performance(annotations, scores))
			self.__database.insertProteinView(self.__proteintable, resultid, goid[0], -1,
					self.__proteins, annotations, scores)
			self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], -1, [roc, pr, fmax])
			resultid += 1

			offset = 0
			meanroc = []
			meanpr = []
			meanfmax = []
			for fold in range(nFold):
				print("____________ Fold= %d ____________" % fold)
				lastelem = min(self.__numproteins, offset + fold_size)
				ix = [int(index) for index in pp[offset:lastelem]]
				offset = lastelem

				# Hide the held-out labels by setting them to 0.
				labeltmp = asarray([float(value) for value in annotations]).astype(float64).ravel()
				print(labeltmp.shape)
				for index in ix:
					labeltmp[index] = 0
				print_label_counts(labeltmp)

				model = PassiveAggressiveClassifier(loss='hinge',
						n_iter=iter, verbose=verbose)
				model.fit(self.__network, labeltmp)
				scores = self.convertScore(model.decision_function(self.__network))

				# Evaluate on the held-out proteins only.
				score = [float(scores[index]) for index in ix]
				annotation = [annotations[index] for index in ix]
				proteins = [self.__proteins[index] for index in ix]

				roc, pr, fmax = report(Performance(annotation, score))
				meanroc.append(roc)
				meanpr.append(pr)
				meanfmax.append(fmax)

				self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], fold,
						[roc, pr, fmax])
				self.__database.insertProteinView(self.__proteintable, resultid, goid[0],
						fold, proteins, annotation, score)
				resultid += 1

			roc_mean = fold_mean(meanroc)
			print("Mean AUROC=  %s" % (roc_mean,))
			pr_mean = fold_mean(meanpr)
			print("Mean AUPR=  %s" % (pr_mean,))
			fmax_mean = fold_mean(meanfmax)
			print("Mean Fmax=  %s" % (fmax_mean,))

			self.__database.insertGOIDView(self.__goidtable, resultid, goid[0], nFold,
					[roc_mean, pr_mean, fmax_mean])
			resultid += 1
def hoax_detection():
    """Train and compare three hoax classifiers on TF-IDF news features.

    Reads the labelled corpus, splits it 67/33, and evaluates MultinomialNB,
    PassiveAggressiveClassifier and SVC on both the held-out split and the
    training split, plotting a confusion matrix for each run. A ROC curve is
    then drawn for the Passive-Aggressive model.

    Returns
    -------
    dict
        Keys ``'multinomial'``, ``'passive'`` and ``'svm'``. Each value is
        that model's predictions on the TRAINING split, as before.
        NOTE(review): confirm whether test-split predictions were intended.

    Notes
    -----
    Changes from the previous version:
    - The ROC curve uses the classifier's positive class. It used to pass
      ``pos_label='neg'``, which is not one of the 'Hoax'/'Valid' labels.
    - The legend is drawn after the ROC curve, so the curve appears in it.
    - The timer is restarted before each training-split run.
    - No-op statements and the unused dense TF-IDF DataFrame are removed.
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import auc, roc_curve

    class_labels = ['Hoax', 'Valid']

    # Reading data as pandas dataframes
    frame = pd.read_csv('MasterBeritaAfterCleanCombined.csv',
                        error_bad_lines=False,
                        encoding='latin1')
    frame2 = pd.read_csv('new_TestData.csv',
                         error_bad_lines=False,
                         encoding='latin1')

    # Setting the DataFrame index (row labels) using the "no" column
    frame = frame.set_index("no")
    frame2 = frame2.set_index("no")

    y = frame.tagging
    print(frame['berita'].shape)
    print(y.shape)
    # frame2 is only inspected here; evaluation uses the split of `frame`.
    print(len(frame2['berita']))

    X_train, X_test, y_train, y_test = train_test_split(frame['berita'],
                                                        y,
                                                        test_size=0.33,
                                                        random_state=53)

    stopwords = StopWordRemoverFactory().get_stop_words()
    tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                       stop_words=frozenset(stopwords),
                                       max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)
    tfidf_test = tfidf_vectorizer.transform(X_test)

    print(tfidf_test)
    print('separator')

    feature_names = tfidf_vectorizer.get_feature_names()
    print(feature_names[-20:])

    def plot_confusion_matrix(cm,
                              classes,
                              normalize=False,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """
        See full source and example:
        http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

        This function prints and plots the confusion matrix.
        Normalization can be applied by setting `normalize=True`.
        """
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')

        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j,
                     i,
                     cm[i, j],
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

    def evaluate(model, features, truth):
        """Fit on the training split; return (predictions, accuracy, confusion matrix)."""
        model.fit(tfidf_train, y_train)
        predictions = model.predict(features)
        accuracy = accuracy_score(truth, predictions)
        matrix = confusion_matrix(truth, predictions, labels=class_labels)
        return predictions, accuracy, matrix

    # --- MultinomialNB ---------------------------------------------------
    start = timeit.default_timer()
    clf = MultinomialNB()
    pred, score, cm = evaluate(clf, tfidf_test, y_test)
    stop = timeit.default_timer()
    print("#Result:#Multinomial#", pred)
    print("accuracy:   %0.3f" % score)
    print('Time Multinomial: ', stop - start)
    plot_confusion_matrix(
        cm,
        classes=class_labels,
        title='MultinomialNB Confusion Matrix (Predict: Test)')

    clf = MultinomialNB()
    multinomialpred, score, cm = evaluate(clf, tfidf_train, y_train)
    plot_confusion_matrix(
        cm,
        classes=class_labels,
        title='MultinomialNB Confusion Matrix (Predict: Training)')

    # --- PassiveAggressiveClassifier ---------------------------------------
    start = timeit.default_timer()
    linear_clf = PassiveAggressiveClassifier()
    pred, score, cm = evaluate(linear_clf, tfidf_test, y_test)
    stop = timeit.default_timer()
    print("#Result:#PassiveAggressiveClassifier#", pred)
    print("accuracy:   %0.3f" % score)
    print('Time PassiveAggressiveClassifier: ', stop - start)
    plot_confusion_matrix(
        cm,
        classes=class_labels,
        title='PassiveAggressiveClassifier Confusion Matrix (Predict: Test)')

    start = timeit.default_timer()
    linear_clf = PassiveAggressiveClassifier()
    passiveaggressivepred, score, cm = evaluate(linear_clf, tfidf_train,
                                                y_train)
    stop = timeit.default_timer()
    print('Time PassiveAggressiveClassifier: ', stop - start)
    plot_confusion_matrix(
        cm,
        classes=class_labels,
        title='PassiveAggressiveClassifier Confusion Matrix (Predict: Training)'
    )

    # --- SVM -----------------------------------------------------------------
    start = timeit.default_timer()
    linear_clf_svm = svm.SVC()
    pred, score, cm = evaluate(linear_clf_svm, tfidf_test, y_test)
    stop = timeit.default_timer()
    print("accuracy:   %0.3f" % score)
    print("#Result:#SVM#", pred)
    print('Time SVM: ', stop - start)
    plot_confusion_matrix(cm,
                          classes=class_labels,
                          title='SVM Confusion Matrix (Predict: Test)')

    start = timeit.default_timer()
    linear_clf_svm = svm.SVC()
    svmpred, score, cm = evaluate(linear_clf_svm, tfidf_train, y_train)
    stop = timeit.default_timer()
    print('Time SVM: ', stop - start)
    plot_confusion_matrix(cm,
                          classes=class_labels,
                          title='SVM Confusion Matrix (Predict: Training)')

    print('y_test')
    print(y_test)

    # --- ROC curve of the Passive-Aggressive model ---------------------------
    # Positive decision scores point at classes_[1], so that class is the
    # positive label for the curve.
    fpr, tpr, thresholds = roc_curve(y_test,
                                     linear_clf.decision_function(tfidf_test),
                                     pos_label=linear_clf.classes_[1])
    # find threshold closest to zero:
    close_zero = np.argmin(np.abs(thresholds))
    plt.plot(fpr[close_zero],
             tpr[close_zero],
             'o',
             markersize=10,
             label='threshold zero(default)',
             fillstyle='none',
             c='k',
             mew=2)
    plt.plot([0, 1], linestyle='-', lw=2, color='r', label='random', alpha=0.8)
    plt.plot(fpr, tpr, label='ROC Curve')
    plt.legend(loc=4)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (recall)')
    plt.title('roc_curve')
    plt.show()
    print('AUC score is: ', auc(fpr, tpr))

    # Print the alphabetically first token with its MultinomialNB weight.
    tokens_with_weights = sorted(zip(feature_names, clf.coef_[0]))
    if tokens_with_weights:
        print(tokens_with_weights[0])

    return {
        'multinomial': multinomialpred,
        'passive': passiveaggressivepred,
        'svm': svmpred,
    }
Esempio n. 12
0
                matData.append(W_key)

#print "N_features", ex.shape
            W = np.asarray(
                csc_matrix((matData, (RowIndex, ColIndex)),
                           shape=ex.shape).todense())
            #print W
            #raw_input("W (output)")

            #W=W_old #dump line

            #set the weights of PA to the predicted values
            PassiveAggressive.coef_ = W
            pred = PassiveAggressive.predict(ex)

            score = PassiveAggressive.decision_function(ex)

            bintargets.append(g_it.target[i])
            if pred != g_it.target[i]:
                errors += 1
                print "Error", errors, " on example", i, "pred", score, "target", g_it.target[
                    i]
                if g_it.target[i] == 1:
                    fn += 1
                else:
                    fp += 1

            else:
                if g_it.target[i] == 1:
                    tp += 1
                else:
Esempio n. 13
0
        #print "new_features", list_for_deep[i]
        #      for key,rowDict in list_for_deep[i].iteritems():
        #          #print "key", key, "target", target
        #          #print "weight", features[i,key]
        #          exampleESN+=np.array(np.multiply(rowDict,features[i,key])).reshape(nHidden,)
        #          #print "exampleESN", exampleESN
        #        #print list_for_deep[i].keys()

        if i != 0:
            #W_old contains the model at the preceeding step
            # Here we want so use the deep network to predict the W values of the features
            # present in ex

            #set the weights of PA to the predicted values
            pred = PassiveAggressive.predict(exampleESN)
            score = PassiveAggressive.decision_function(exampleESN)
            if pred != g_it.target[i]:
                errors += 1
                print "Error", errors, " on example", i, "pred", score, "target", g_it.target[
                    i]
                if g_it.target[i] == 1:
                    fn += 1
                else:
                    fp += 1

            else:
                if g_it.target[i] == 1:
                    tp += 1
                else:
                    tn += 1
                #print "Correct prediction example",i, "pred", score, "target",g_it.target[i]
Esempio n. 14
0
class PassiveAggressive(
    IterativeComponentWithSampleWeight,
    AutoSklearnClassificationAlgorithm,
):
    """Iteratively fitted wrapper around scikit-learn's PassiveAggressiveClassifier.

    The underlying estimator is created lazily on the first call to
    ``iterative_fit`` and then trained further, a few epochs at a time, on
    subsequent calls until ``configuration_fully_fitted`` reports ``True``.
    """

    def __init__(self, C, fit_intercept, tol, loss, average, random_state=None):
        # Hyperparameters are stored exactly as given; the ones that need it
        # are coerced to their concrete types when the estimator is built.
        self.C = C
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.loss = loss
        self.average = average
        self.random_state = random_state

        # Fitting state.
        self.estimator = None
        self.max_iter = self.get_max_iter()
        self.n_iter_ = None

    @staticmethod
    def get_max_iter():
        """Return the upper bound on the total number of training epochs."""
        return 1024

    def get_current_iter(self):
        """Return the iteration counter tracked in ``n_iter_``."""
        return self.n_iter_

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        """Train the estimator for up to ``n_iter`` additional epochs.

        Parameters
        ----------
        X, y
            Training data and targets. A two-dimensional ``y`` with more than
            one column is handled by a one-vs-rest fallback.
        n_iter
            Epoch budget for this call.
        refit
            When true, discard the current estimator and iteration counter and
            start again from scratch.
        sample_weight
            Forwarded only to the incremental ``_partial_fit`` call; the
            initial ``fit`` does not receive it.

        Returns
        -------
        self
        """
        from sklearn.linear_model import PassiveAggressiveClassifier

        # Convergence can only be detected when the solver spends fewer epochs
        # than max_iter. With max_iter == 1 it always spends exactly one, so
        # nothing can be concluded; at least two iterations are needed for
        # early stopping to be observable. This method does not clamp n_iter
        # itself.

        if refit:
            self.estimator = None
            self.n_iter_ = None

        first_fit = self.estimator is None
        if first_fit:
            self.fully_fit_ = False

            # Coerce the stored hyperparameters to their concrete types.
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)
            self.C = float(self.C)

            self.estimator = PassiveAggressiveClassifier(
                C=self.C,
                fit_intercept=self.fit_intercept,
                max_iter=n_iter,
                tol=self.tol,
                loss=self.loss,
                shuffle=True,
                random_state=self.random_state,
                warm_start=True,
                average=self.average,
            )
            self.classes_ = np.unique(y.astype(int))

        is_multilabel = len(y.shape) > 1 and y.shape[1] > 1
        if is_multilabel:
            # Multilabel fallback: wrap the estimator in one-vs-rest, give it
            # the full epoch budget and train it in a single shot.
            from sklearn.multiclass import OneVsRestClassifier

            self.estimator.max_iter = self.get_max_iter()
            self.estimator = OneVsRestClassifier(self.estimator, n_jobs=1)
            self.estimator.fit(X, y)
            self.fully_fit_ = True
        elif first_fit:
            self.estimator.fit(X, y)
            self.n_iter_ = n_iter
        else:
            # Raise the epoch budget, capped at the global maximum, and
            # continue training from the current weights.
            self.estimator.max_iter = min(
                self.estimator.max_iter + n_iter, self.max_iter
            )
            self.estimator._validate_params()
            learning_rate = "pa1" if self.estimator.loss == "hinge" else "pa2"
            self.estimator._partial_fit(
                X, y,
                alpha=1.0,
                C=self.estimator.C,
                loss="hinge",
                learning_rate=learning_rate,
                max_iter=n_iter,
                classes=None,
                sample_weight=sample_weight,
                coef_init=None,
                intercept_init=None
            )
            self.n_iter_ += self.estimator.n_iter_

            # Finished once the budget hits the cap, or once the budget
            # exceeds the number of epochs recorded so far.
            if (
                self.estimator.max_iter >= self.max_iter
                or self.estimator.max_iter > self.n_iter_
            ):
                self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        """Return whether training has finished; ``False`` before any fit."""
        if self.estimator is None:
            return False
        return getattr(self, 'fully_fit_', False)

    def predict(self, X):
        """Return the estimator's predictions for ``X``.

        Raises ``NotImplementedError`` when no estimator has been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the softmax of the estimator's decision function for ``X``.

        Raises ``NotImplementedError`` when no estimator has been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()
        return softmax(self.estimator.decision_function(X))

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static description of this component's capabilities."""
        return dict(
            shortname='PassiveAggressive Classifier',
            name='Passive Aggressive Classifier',
            handles_regression=False,
            handles_classification=True,
            handles_multiclass=True,
            handles_multilabel=True,
            handles_multioutput=False,
            is_deterministic=True,
            input=(DENSE, SPARSE, UNSIGNED_DATA),
            output=(PREDICTIONS,),
        )

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Build and return the configuration space for this component."""
        loss = CategoricalHyperparameter(
            "loss", ["hinge", "squared_hinge"], default_value="hinge"
        )
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter(
            "tol", 1e-5, 1e-1, default_value=1e-4, log=True
        )
        C = UniformFloatHyperparameter("C", 1e-5, 10, 1.0, log=True)
        # Note: average could also be an integer if > 1.
        average = CategoricalHyperparameter(
            'average', ['False', 'True'], default_value='False'
        )

        space = ConfigurationSpace()
        space.add_hyperparameters([loss, fit_intercept, tol, C, average])
        return space
Esempio n. 15
0
  def run(self, nFold=3, loss='hinge', iter=10, verbose=1):
    """Fit a passive-aggressive model on the network and cross-validate it.

    A model is first fitted on every row of ``self._network`` and its
    converted decision scores are stored in ``self._scores``. The rows are
    then shuffled and split into ``nFold`` consecutive folds; for each fold
    the labels of the held-out rows are set to 0, a fresh model is fitted on
    all rows, and the held-out rows are scored.

    Parameters:
      nFold:   number of cross-validation folds.
      loss:    loss name passed to PassiveAggressiveClassifier.
      iter:    epoch count passed to PassiveAggressiveClassifier.
      verbose: verbosity passed to PassiveAggressiveClassifier.

    Side effects: sets ``self._scores`` and ``self._auroc`` (mean of the
    per-fold AUROC values, 0 when there are no folds), appends the held-out
    labels, scores and protein ids to ``self._foldlabels``,
    ``self._foldscores`` and ``self._foldproteins``, and finally calls
    ``self._TPR_FPR`` on the pooled labels and scores.
    """
    log.debug("PA: run")
    numx, _ = self._network.shape

    # Random ordering of the rows; consecutive slices of it form the folds.
    pp = permutation(numx)

    # NOTE(review): `n_iter` is the legacy epoch-count keyword of
    # PassiveAggressiveClassifier; newer scikit-learn releases use `max_iter`.
    # Confirm against the scikit-learn version this project targets.
    model = PassiveAggressiveClassifier(loss=loss, n_iter=iter, verbose=verbose)
    model.fit(self._network, self._annotation.ravel())
    scores = model.decision_function(self._network)
    self._scores = self._convertScore(scores)

    # Rows per fold; any remainder rows are never held out.
    fold_size = numx // nFold if nFold > 0 else 0

    offset = 0
    meanroc = []
    for fold in range(nFold):
      log.debug("NV: ___ fold= %d ___" % fold)
      lastelem = min(numx, offset + fold_size)

      # Held-out rows for this fold. The slice starts at `offset` itself so
      # that the first shuffled row of every fold is included as well.
      ix = list(pp[offset:lastelem])

      print(lastelem)

      offset = lastelem

      # Copy the labels and blank out the held-out rows before refitting.
      labeltmp = [float(value) for value in self._annotation]
      for index in ix:
        labeltmp[index] = 0

      model = PassiveAggressiveClassifier(loss=loss, n_iter=iter, verbose=verbose)
      model.fit(self._network, labeltmp)
      scores = self._convertScore(model.decision_function(self._network))

      # Collect true labels and scores of the held-out rows.
      score = [float(scores[index]) for index in ix]
      label = [int(self._annotation[index]) for index in ix]

      self._foldlabels.extend(label)
      self._foldscores.extend(score)
      self._foldproteins.extend(int(self._proteinid[index]) for index in ix)

      auroc = self.AUROC(label, score)
      log.debug("AUROC= %.4f" % auroc)

      meanroc.append(auroc)

    # Mean of the per-fold AUROC values.
    self._auroc = sum(meanroc) / float(len(meanroc)) if meanroc else 0

    # The pooled AUROC result is not used here; the call is retained in case
    # AUROC has side effects -- TODO confirm and drop if it has none.
    self.AUROC(self._foldlabels, self._foldscores)

    self._TPR_FPR(self._foldlabels, self._foldscores)
Esempio n. 16
0
# Ensemble models
svc_clf8 = LinearSVC(C=0.8)
svc_clf8.fit(np.log(x_train+1), y_train)
# The SVM is trained on log(x + 1) features, so the test set has to go through
# the same transform before it is scored.
x_test_log = np.log(x_test+1)
decision_svc=svc_clf8.decision_function(x_test_log)
prediction_svc8=svc_clf8.predict(x_test_log)
svc_score8 = accuracy_score(y_test, prediction_svc8)

Ridge_clf = RidgeClassifier(alpha=1)
Ridge_clf.fit(x_train, y_train)
decision_ridge=Ridge_clf.decision_function(x_test)
prediction_ridge=Ridge_clf.predict(x_test)
Ridge_clf_score = accuracy_score(y_test, prediction_ridge)

PAC_clf = PassiveAggressiveClassifier(C=0.1)
PAC_clf.fit(x_train, y_train)
decision_pac=PAC_clf.decision_function(x_test)
prediction_PAC=PAC_clf.predict(x_test)
PAC_clf_score = accuracy_score(y_test, prediction_PAC)

from sklearn.linear_model import RandomizedLogisticRegression
RandomizedLogisticRegression_clf = RandomizedLogisticRegression(C=5,n_jobs=-1)
RandomizedLogisticRegression_clf.fit(x_train, y_train)

# NOTE(review): RandomizedLogisticRegression looks like a feature-selection
# estimator rather than a classifier; confirm that `predict` exists in the
# scikit-learn version in use before relying on this score.
prediction_RandomizedLogisticRegression=RandomizedLogisticRegression_clf.predict(x_test)
RandomizedLogisticRegression_clf_score = accuracy_score(y_test, prediction_RandomizedLogisticRegression)

####################################################################
# Display the scores of the different models
print('Score modele %s est de %s' % ('RF',score_rf))
print('Score modele %s est de %s' % ('Ext',score_ext))
print('Score modele %s est de %s' % ('Sig',sig_score))
Esempio n. 17
0
fileObject_lsa = open(file_Name_lsa, 'wb')
pickle.dump(passive_lsa, fileObject_lsa)
fileObject_lsa.close()

file_vectorizer_open = open("global_intent_tfidf_vectorizer.p", 'wb')
pickle.dump(vectorizer, file_vectorizer_open)
file_vectorizer_open.close()

file_lsa_vectorizer_open = open("global_intent_lsa_vectorizer.p", 'wb')
pickle.dump(lsa, file_lsa_vectorizer_open)
file_lsa_vectorizer_open.close()

while (1):
    out_put = []
    test_text = raw_input('Enter: ')
    test_text_clean = [
        each_word for each_word in test_text.split()
        if each_word not in stop_words
    ]
    test_text_lmtzr = [
        lmtzr.lemmatize(each_word) for each_word in test_text_clean
    ]
    out_put.append(' '.join(test_text_lmtzr))
    out_put_vector = vectorizer.transform(out_put)
    out_put_class = passive_tfidf.predict(out_put_vector)
    print 'tf-idf: ', out_put_class
    print 'tf-idf: ', passive_tfidf.decision_function(out_put_vector)
    out_put_vector_lsa = lsa.transform(out_put_vector)
    print 'lsa: ', passive_lsa.predict(out_put_vector_lsa)
    print 'lsa: ', passive_lsa.decision_function(out_put_vector_lsa)
Esempio n. 18
0
def predict(filename, training_csv=None):
    """Train a passive-aggressive text classifier and score the rows of a CSV.

    The classifier is fitted on a 50% split (``random_state=53``) of the
    ``text`` column of the training CSV, using its ``label`` column as the
    target, after TF-IDF vectorization. The ``text`` column of ``filename``
    is then scored with the classifier's decision function.

    Parameters
    ----------
    filename
        Path of the CSV to score; read with ``latin1`` encoding and expected
        to have a ``text`` column.
    training_csv
        Path of the labelled training CSV. Defaults to the original
        hard-coded location when omitted.

    Returns
    -------
    The first row's decision score, rescaled with ``(score + 1) / 2`` and
    multiplied by 100.

    The function also prints a sample of the learned feature names, the
    rescaled scores and a threshold flag. The dense count-versus-tf-idf
    DataFrame comparison is intentionally not performed: it materialises two
    full document-term matrices only to print a single boolean.
    """
    if training_csv is None:
        training_csv = 'C:\\Users\\Niladri Shekhar Dutt\\Desktop\\IET-FE\\FakeNews\\fakenewsFE\\fake_or_real_news.csv'

    df = pd.read_csv(training_csv)
    y = df.label

    # Only the training half of the split is used; the held-out half is
    # discarded because the rows to score come from `filename`.
    X_train, _, y_train, _ = train_test_split(
        df['text'], y, test_size=0.5, random_state=53)

    # The count vectorizer is fitted only for the feature-name diagnostic
    # printed below; the classifier itself is trained on the tf-idf features.
    count_vectorizer = CountVectorizer(stop_words='english')
    count_vectorizer.fit(X_train)

    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(X_train)

    print(tfidf_vectorizer.get_feature_names()[-10:])
    print(count_vectorizer.get_feature_names()[:10])

    # NOTE(review): `n_iter` is the legacy epoch-count keyword; newer
    # scikit-learn releases use `max_iter`. Confirm the targeted version.
    linear_clf = PassiveAggressiveClassifier(n_iter=50)
    linear_clf.fit(tfidf_train, y_train)

    a = pd.read_csv(filename, encoding='latin1')
    X_test = a['text']

    tfidf_test = tfidf_vectorizer.transform(X_test)
    probs = linear_clf.decision_function(tfidf_test)

    # Shift and halve the raw decision scores. This is a linear rescaling of
    # the margin, not a calibrated probability.
    probs = (probs + 1.0) / 2.0
    print(probs)

    # NOTE(review): each iteration overwrites the flag, so only the last
    # score decides its final value -- confirm whether any()/all() was meant.
    flag = True
    for value in probs:
        flag = bool(value > 0.25)

    print(flag)
    return (probs[0]*100)
X_vector = vectorizer.fit_transform(training_text)

clf = PassiveAggressiveClassifier(n_iter=50)
clf.fit(X_vector,training_class)


file_Name = "classif_test.p"
fileObject = open(file_Name,'wb') 
pickle.dump(clf, fileObject)
fileObject.close()




while(1):
	out_put = []
	out_put.append(raw_input('Enter: ').lower())
	out_put_vector = vectorizer.transform(out_put)
	out_put_class = clf.predict(out_put_vector)
	print out_put_class
	print clf.decision_function(out_put_vector)
	# print clf.predict_proba(out_put_vector)