Example no. 1
class FakeNewDetector:
    def __init__(self):
        self.tf_idf_vectorizer = TfidfVectorizer(stop_words='english',
                                                 max_df=0.7)
        self.pac_model = None

    def prepare_model(self):
        df = pd.read_csv('data_set/news.csv', encoding='latin-1')
        df.head(10)
        x_train, x_test, y_train, y_test = train_test_split(df.text,
                                                            df.label,
                                                            test_size=0.2,
                                                            random_state=7)
        tfidf_train = self.tf_idf_vectorizer.fit_transform(x_train)
        tfidf_test = self.tf_idf_vectorizer.transform(x_test)
        self.pac_model = PassiveAggressiveClassifier(max_iter=50)
        result = self.pac_model.fit(tfidf_train, y_train)
        print(result)
        y_pred = self.pac_model.predict(tfidf_test)
        score = accuracy_score(y_test, y_pred)
        print(f'Accuracy: {round(score * 100, 2)}%')
        print(confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL']))

    def predict_outcome(self, input_message):
        test_array = np.array([input_message])
        tfidf_test = self.tf_idf_vectorizer.transform(test_array)
        return self.pac_model.predict(tfidf_test)

    def predict_outcome_list(self, input_message_list):
        test_array = np.array(input_message_list)
        tfidf_test = self.tf_idf_vectorizer.transform(test_array)
        return self.pac_model.predict(tfidf_test)
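A minimal driver for the class above might look like the following sketch; it assumes the same data_set/news.csv file, the pandas/numpy/scikit-learn imports the snippet relies on, and placeholder headlines.

# Hypothetical usage of FakeNewDetector (assumes the snippet's imports are in scope)
detector = FakeNewDetector()
detector.prepare_model()   # trains on data_set/news.csv and prints the accuracy
print(detector.predict_outcome('Some headline to check'))
print(detector.predict_outcome_list(['first article text', 'second article text']))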
Example no. 2
def passiveAggresive(train, test, Y_train, Y_test, column):
    '''
    Fits a Passive Aggressive Perceptron Classifier
    '''
    clf = PassiveAggressiveClassifier(C=0.1, max_iter=1000, class_weight='balanced', tol=1e-3)
    clf.fit(train, Y_train[column])
    clf.predict(test)
    return clf.score(test, Y_test[column])
Example no. 3
def _passiveaggressiveclassifier(*,
                                 train,
                                 test,
                                 x_predict=None,
                                 metrics,
                                 C=1.0,
                                 fit_intercept=True,
                                 max_iter=1000,
                                 tol=0.001,
                                 early_stopping=False,
                                 validation_fraction=0.1,
                                 n_iter_no_change=5,
                                 shuffle=True,
                                 verbose=0,
                                 loss='hinge',
                                 n_jobs=None,
                                 random_state=None,
                                 warm_start=False,
                                 class_weight=None,
                                 average=False):
    """For more info visit:
        https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html#sklearn.linear_model.PassiveAggressiveClassifier
    """

    model = PassiveAggressiveClassifier(
        C=C,
        fit_intercept=fit_intercept,
        max_iter=max_iter,
        tol=tol,
        early_stopping=early_stopping,
        validation_fraction=validation_fraction,
        n_iter_no_change=n_iter_no_change,
        shuffle=shuffle,
        verbose=verbose,
        loss=loss,
        n_jobs=n_jobs,
        random_state=random_state,
        warm_start=warm_start,
        class_weight=class_weight,
        average=average)
    model.fit(train[0], train[1])
    model_name = 'PassiveAggressiveClassifier'
    y_hat = model.predict(test[0])

    if metrics == 'f1_score':
        accuracy = f1_score(test[1], y_hat)
    elif metrics == 'jaccard_score':
        accuracy = jaccard_score(test[1], y_hat)
    elif metrics == 'accuracy_score':
        accuracy = accuracy_score(test[1], y_hat)
    else:
        raise ValueError(f'Unsupported metrics value: {metrics}')

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
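A hedged sketch of how the keyword-only helper above could be called; the (features, labels) tuple convention for train/test and the metrics string follow from the function body, the synthetic dataset is only an illustration, and the snippet's own scikit-learn imports are assumed to be in scope.

# Sketch only: exercise _passiveaggressiveclassifier on synthetic data
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

name, acc, _ = _passiveaggressiveclassifier(train=(X_tr, y_tr),
                                            test=(X_te, y_te),
                                            metrics='accuracy_score',
                                            random_state=0)
print(name, acc)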
Example no. 4
def train(tfidf_train, y_train, tfidf_test):
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)

    y_pred = pac.predict(tfidf_test)

    return y_pred
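For context, tfidf_train/tfidf_test above are the usual TF-IDF matrices; a small sketch of how they might be produced before calling train() (the file name and column names are assumptions):

# Assumed preprocessing for the train() helper above
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

df = pd.read_csv('news.csv')   # hypothetical file with 'text' and 'label' columns
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['label'],
                                                    test_size=0.2, random_state=7)
vec = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = vec.fit_transform(x_train)
tfidf_test = vec.transform(x_test)

y_pred = train(tfidf_train, y_train, tfidf_test)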
Example no. 5
def constructPickles(filename):
    dataDF = pd.read_csv(filename)

    labels = dataDF.label

    # DataFlair - Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(dataDF['text'],
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=7)

    # DataFlair - Initialize a TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

    # DataFlair - Fit and transform train set, transform test set
    tfidf_train = tfidf_vectorizer.fit_transform(x_train)
    tfidf_test = tfidf_vectorizer.transform(x_test)

    # DataFlair - Initialize a PassiveAggressiveClassifier
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)

    # DataFlair - Save the fitted model and vectorizer, then reload them before scoring
    joblib.dump(pac, "testPickle")
    joblib.dump(tfidf_vectorizer, "testPickleVector")

    pac = joblib.load("testPickle")
    tfidf_vectorizer = joblib.load("testPickleVector")

    tfidf_test = tfidf_vectorizer.transform(x_test)

    # DataFlair - Predict on the test set and calculate accuracy
    y_pred = pac.predict(tfidf_test)
    score = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {round(score * 100, 2)}%')
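Once constructPickles has run, the saved artifacts can be reloaded elsewhere to score new text; a minimal sketch, assuming the same testPickle/testPickleVector file names and a placeholder headline:

# Reload the pickled model and vectorizer produced by constructPickles
import joblib

pac = joblib.load("testPickle")
tfidf_vectorizer = joblib.load("testPickleVector")
print(pac.predict(tfidf_vectorizer.transform(["Some headline to classify"]))[0])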
Example no. 6
def Predict():
    # incoming user input
    user_input = request.form['text']
    # read the dataset file
    df = pd.read_csv('train.csv')
    # map the numeric labels to readable text labels (kept in a separate column
    # so the 'Body' text used for training is not overwritten)
    conversion_dict = {0: 'HQ', 1: 'LQ_EDIT', 2: 'LQ_CLOSE'}
    df['label'] = df['Y'].replace(conversion_dict)
    # print(df.label.value_counts())

    # Train test split
    # split the data into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(df['Body'],
                                                        df['Y'],
                                                        test_size=0.25,
                                                        random_state=7,
                                                        shuffle=True)
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.75)

    # convert the text into numeric TF-IDF features
    vec_train = tfidf_vectorizer.fit_transform(x_train.values.astype('U'))
    vec_test = tfidf_vectorizer.transform(x_test.values.astype('U'))

    # Train Model
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(vec_train, y_train)
    model = MultinomialNB()
    model.fit(vec_train, y_train)

    # Predict
    user_input_transform = tfidf_vectorizer.transform([user_input])
    y_predict = pac.predict(user_input_transform)

    return render_template("Predict.html", text=user_input, predict=y_predict)
Example no. 7
def model_PassiveAggressive(train_x, train_y, test_x, test_y, n_est=100):
    model = PassiveAggressiveClassifier()
    model.fit(train_x, train_y)
    sc = model.score(test_x, test_y)
    prediction = model.predict(test_x)
    mae = mean_absolute_error(test_y, prediction)
    return (sc, mae, prediction, model)
Example no. 8
def PassiveAggressiveClassifier_1(train_predictors,test_predictors,train_target,test_target):
    clf = PassiveAggressiveClassifier()
    clf.fit(train_predictors,train_target)
    predicted = clf.predict(test_predictors)
    accuracy = accuracy_score(test_target, predicted)
    print("Accuracy for Linear Model PassiveAggressiveClassifier: " + str(accuracy))
    return accuracy, predicted
Example no. 9
def passiveAgressive(train_x, train_y, test_x):
    from sklearn.linear_model import PassiveAggressiveClassifier
    # apply the Passive Aggressive classifier:
    model = PassiveAggressiveClassifier()
    model.fit(train_x, train_y)
    y_prediction = model.predict(test_x)
    return y_prediction
Example no. 10
def get_delay():
    result = request.form
    query_title = result['title']
    query_text = result['maintext']
    # print(query_text)
    query = get_all_query(query_title, query_text)
    # query = remove_punctuation_stopwords_lemma(query_text)
    # print(query)
    # user_input = {'query':query}
    toSearch = query_title
    query_text = [query_text]
    query_title = [query_title]
    tfidf_test_input = tfidf_vectorizer.transform(query)

    linear_clf = PassiveAggressiveClassifier()

    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_test_input)
    print(pred)

    try:
        from googlesearch import search
    except ImportError:
        print("No module named 'google' found")

    # to search
    links = []
    for j in search(toSearch, tld="co.in", num=10, stop=10, pause=2):
        links.append(j)

    return f'<html><body><h1>{pred[0]}</h1> <a href={links[0]}> Article 1 </a><br> <a href={links[1]}> Article 2 </a>  <form action="/"> <button type="submit">back </button> </form></body></html>'
Example no. 11
def train_and_predict_m7(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM7, stemmer_type = 'snowball')

    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 5, max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 5), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    print("Fitting Passive-Aggressive Classifier...")
    clf = PassiveAggressiveClassifier(random_state = randomState, loss = 'squared_hinge', n_iter = 100, C = 0.01)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    # Note: minkowski with p > 2 does not work for sparse matrices
    param_grid = {'C' : [0.003, 0.01, 0.03, 0.1], 'loss': ['hinge', 'squared_hinge'], 'n_iter': [5, 10, 30, 100, 300]}
    #param_grid = {'C' : [0.003, 0.01, 0.03, 0.1, 0.3, 1], 'loss': ['hinge'], 'n_iter': [5, 10, 30, 100, 300, 1000]}
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Example no. 12
 def test_main(self):
     categories, documents = get_docs_categories()
     clean_function = lambda text: '' if text.startswith('[') else text
     entity_types = set(['GPE'])
     term_doc_mat = (TermDocMatrixFactory(
         category_text_iter=zip(categories, documents),
         clean_function=clean_function,
         nlp=_testing_nlp,
         feats_from_spacy_doc=FeatsFromSpacyDoc(
             entity_types_to_censor=entity_types)).build())
     clf = PassiveAggressiveClassifier(n_iter=5,
                                       C=0.5,
                                       n_jobs=-1,
                                       random_state=0)
     fdc = FeatsFromDoc(
         term_doc_mat._term_idx_store,
         clean_function=clean_function,
         feats_from_spacy_doc=FeatsFromSpacyDoc(
             entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
     tfidf = TfidfTransformer(norm='l1')
     X = tfidf.fit_transform(term_doc_mat._X)
     clf.fit(X, term_doc_mat._y)
     X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
     pred = clf.predict(tfidf.transform(X_to_predict))
     dec = clf.decision_function(X_to_predict)
Example no. 13
class my_model:
    def __init__(self):
        # components shared by fit() and predict()
        self.preprocessor = CountVectorizer(stop_words='english')
        self.tfidf = TfidfTransformer(norm='l2',
                                      use_idf=False,
                                      smooth_idf=False,
                                      sublinear_tf=True)
        self.clf = PassiveAggressiveClassifier(C=0.1,
                                               fit_intercept=True,
                                               n_iter_no_change=10,
                                               validation_fraction=0.8)

    def fit(self, X, y):
        # do not exceed 29 mins
        X_df = get_train_test_df(X)
        XX = self.preprocessor.fit_transform(X_df)
        X_final = self.tfidf.fit_transform(XX)
        self.clf.fit(X_final, y)
        return

    def predict(self, X):
        # remember to apply the same preprocessing in fit() on test data before making predictions
        X_df = get_train_test_df(X)
        XX = self.preprocessor.transform(X_df)
        X_final = self.tfidf.transform(XX)
        predictionsOfModel = self.clf.predict(X_final)
        return predictionsOfModel
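The CountVectorizer + TfidfTransformer pair used by my_model can equivalently be wrapped in a scikit-learn Pipeline, which keeps the fitted preprocessing and the classifier together; a sketch with the same hyperparameters (train_texts/train_labels/test_texts are placeholders):

# Pipeline equivalent of the preprocessing + classifier in my_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('counts', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(norm='l2', use_idf=False,
                               smooth_idf=False, sublinear_tf=True)),
    ('pac', PassiveAggressiveClassifier(C=0.1, fit_intercept=True,
                                        n_iter_no_change=10,
                                        validation_fraction=0.8)),
])
# text_clf.fit(train_texts, train_labels); text_clf.predict(test_texts)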
Example no. 14
def get_delay():
    result = request.form
    query_title = result['title']
    query_text = result['maintext']
    # print(query_text)
    query = get_all_query(query_title, query_text)
    # query = remove_punctuation_stopwords_lemma(query_text)
    # print(query)
    # user_input = {'query':query}
    toSearch = query_title
    query_text = [query_text]
    query_title = [query_title]
    tfidf_test_input = tfidf_vectorizer.transform(query)

    linear_clf = PassiveAggressiveClassifier()

    linear_clf.fit(tfidf_train, y_train)
    pred = linear_clf.predict(tfidf_test_input)
    print(pred)

    try:
        from googlesearch import search
    except ImportError:
        print("No module named 'google' found")

    # to search
    links = []
    for j in search(toSearch, tld="co.in", num=10, stop=10, pause=2):
        links.append(j)


# <style>body{text-align: center;font-family: Arial, Helvetica, sans-serif;}</style>
# return f'<html><style>body{text-align: center;font-family: Arial, Helvetica, sans-serif;}</style><body><h1>It is a {pred[0]} news. </h1><h2>You may refer to the following articles for more details.</h2> <a href={links[0]}> Article 1 </a><br> <a href={links[1]}> Article 2 </a>  <form action="/"> <button type="submit">back </button> </form></body></html>'
    return render_template('result.html', links=links, pred=pred[0])
Example no. 15
def training():
    X, y = get_data()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=7)

    tfidf_Xtrain, tfidf_Xtest = Vectorize(X_train, X_test)

    Pac = PassiveAggressiveClassifier(C=0.5, random_state=5)

    Pac.fit(tfidf_Xtrain, y_train)

    Pac_acc = Pac.score(tfidf_Xtest, y_test)

    print(Pac_acc)

    y_pred = Pac.predict(tfidf_Xtest)

    Pac_accuracy = accuracy_score(y_test, y_pred)

    print(Pac_accuracy)

    conf_matrix = confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])

    print(conf_matrix)

    clf_report = classification_report(y_test, y_pred)

    print(clf_report)

    makePickleFile(Pac)
Example no. 16
class PassiveAggressiveModel(BaseModel):
    
    def __init__(self, cached_features):
        BaseModel.__init__(self, cached_features)
        self.model = PassiveAggressiveClassifier(loss='squared_hinge', C=1.0, random_state=1)

    def _predict_internal(self, X_test):
        return self.model.predict(X_test)
Example no. 17
def paclassifier(train_X, train_Y, test_X):
    print("Training model.....")
    pac = PassiveAggressiveClassifier(random_state=0)  # Initialize the Passive Aggressive Classifier model
    pac.fit(train_X, train_Y)  # Fit the model on the training data
    y_pred = pac.predict(test_X)  # Predict labels for the test data
    return y_pred
Example no. 18
def passive_aggressive(sample_data, test_percentage):
    """ Implement a Passive Aggressive classifier and perform accuracy tests """
    # Bag of words for first x words
    X_train, X_test, y_train, y_test = data_models.split_test_train_data(sample_data, test_percentage)
    linear_clf = PassiveAggressiveClassifier()
    linear_clf.fit(X_train, y_train)
    pred = linear_clf.predict(X_test)
    return (y_test == pred).sum() * 100 / len(y_test)
Example no. 19
 def train_passve_aggresive_classifier(self, tfidf_train, b_train,
                                       tfidf_test, b_test):
     pclass = PassiveAggressiveClassifier(max_iter=60)
     pclass.fit(tfidf_train, b_train)
     b_pred = pclass.predict(tfidf_test)
     factcheckscore = accuracy_score(b_test, b_pred)
     print(f"Accuracy Is {round(factcheckscore*100,2)}%")
     return self.save_model(pclass)
Example no. 20
def paClassify(X, Y, Xt, Yt, class_weight):

    title = "Passive Aggressive Classifier"

    classifier = PassiveAggressiveClassifier(n_iter=10, class_weight=class_weight)
    classifier.fit(X, Y)
    YPredict = classifier.predict(Xt)

    printAccuracy(YPredict, Yt, title)
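The n_iter argument used here (and in several other snippets in this collection) only exists in older scikit-learn releases; it has since been removed in favour of max_iter and tol. A roughly equivalent call on a current release, reusing the X/Y/Xt/class_weight names from the function above, might look like:

# Modern scikit-learn: control the epochs with max_iter/tol instead of n_iter
classifier = PassiveAggressiveClassifier(max_iter=1000, tol=1e-3,
                                         class_weight=class_weight)
classifier.fit(X, Y)
YPredict = classifier.predict(Xt)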
Example no. 21
def classify():
    # Initialize a PassiveAggressiveClassifier
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)

    # Predict on the test set and calculate accuracy
    y_pred_data = pac.predict(tfidf_test)
    score = accuracy_score(y_test, y_pred_data)
    print(f'Accuracy: {round(score * 100, 2)}%')
    return y_pred_data
Example no. 22
class BinaryClassifier(object):
    def __init__(self, classifier_type, scale_features=True):
        self.scale_features = scale_features
        self.classifier_type = classifier_type
        self.clear()
        self.train_std = 0
        self.random_gen = np.random.RandomState(136543785)

    def clear(self, remember_train_std_if_supported=False):
        self.positive_instances = []
        self.negative_instances = []
        # self.classifier = svm.SVC(kernel='linear')
        if self.classifier_type == CLASSIFIER_TYPE.LR:
            #print (CLASSIFIER_TYPE.LR)
            self.classifier = LogisticRegression(C=1.0)
        elif self.classifier_type == CLASSIFIER_TYPE.PA:
            #print (CLASSIFIER_TYPE.PA)
            self.classifier = PassiveAggressiveClassifier(loss='hinge', C=1.0)
        elif self.classifier_type == CLASSIFIER_TYPE.SVM:
            self.classifier = svm.SVC(kernel='linear')
        if not remember_train_std_if_supported:
            self.train_std = 0

    def add_positive_instances(self, positive_instances):
        self.positive_instances.extend(positive_instances)

    def add_negative_instances(self, negative_instances):
        self.negative_instances.extend(negative_instances)

    def train(self):
        X = self.positive_instances + self.negative_instances
        y = np.asarray([1] * len(self.positive_instances) +
                       [0] * len(self.negative_instances))

        # shuffling the train instances in case classifier is sensitive to this order
        Xy = list(zip(X, y))
        self.random_gen.shuffle(Xy)
        X[:], y[:] = zip(*Xy)

        X = mat_concat(X)

        if self.scale_features:
            if self.train_std == 0:
                self.train_std = (pointwise_mult(X, X).mean() -
                                  X.mean()**2)**0.5
            X = X / self.train_std
        # X = X/self.train_std
        self.classifier.fit(X, y)

    def predict(self, instances):
        # scaled_instances = [inst/self.train_std for inst in instances]
        instances = mat_concat(instances)
        if self.scale_features and self.train_std > 0:
            instances = instances / self.train_std
        return self.classifier.predict(instances)
Example no. 23
def reanalyze(article_text):
    df = pd.read_csv(train_file)

    # Change the labels
    df.loc[(df['label'] == 1), ['label']] = 'FAKE'
    df.loc[(df['label'] == 0), ['label']] = 'REAL'

    labels = df.label
    x_train, x_test, y_train, y_test = train_test_split(df['text'],
                                                        labels,
                                                        test_size=0.27,
                                                        random_state=7,
                                                        shuffle=True)

    # Initialize a TfidfVectorizer, vectorize the text
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

    # Fit and transform train set, transform test set
    tfidf_train = tfidf_vectorizer.fit_transform(x_train.values.astype('U'))
    tfidf_test = tfidf_vectorizer.transform(x_test.values.astype('U'))

    # Initialize a PassiveAggressiveClassifier and fit training sets
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)

    # Predict on the test set and calculate accuracy
    y_pred = pac.predict(tfidf_test)
    score = accuracy_score(y_test, y_pred)

    vec_new = tfidf_vectorizer.transform([article_text])
    y_pred_new = pac.predict(vec_new)
    rounded_score = round(score * 100, 2)

    # confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])
    print(f'Accuracy: {rounded_score}')

    result = 'reliable content' if y_pred_new[
        0] == 'REAL' else 'unreliable content'
    message = f'Brrrrr, calculating... There is a good chance that this is **{result}**.'
    save_pickle(pac, tfidf_vectorizer)

    return message
Example no. 24
def train_model():

    #just using dummy data from a text
    article = extract("/home/david/2019-ca400-taland2/src/dataset/test.txt")
    dftrain = pd.read_csv(
        '/home/david/2019-ca400-taland2/src/dataset/train.csv')
    #drops rows that have null values
    dftrain = dftrain.dropna()
    #Set column names to variables
    df_x = dftrain['text']
    df_y = dftrain['label']

    #split training data
    x_train, x_test, y_train, y_test = train_test_split(df_x,
                                                        df_y,
                                                        test_size=0.33,
                                                        random_state=53)

    # cv = CountVectorizer(stop_words = 'english', max_features = 1000)
    # x_traincv = cv.fit_transform(x_train)
    # article_testcv = cv.transform(article)

    tfv = TfidfVectorizer(stop_words='english', max_df=0.7, max_features=1000)
    x_traintf = tfv.fit_transform(x_train)
    article_testtf = tfv.transform(article)
    tfv_test = tfv.transform(x_test)

    #tfv_df = pd.DataFrame(x_traintf.A, columns = tfv.get_feature_names())
    #print(tfv_df.head())

    #accuracy = 0.873

    # mnb_clf = MultinomialNB()
    # mnb_clf.fit(x_traintf, y_train)
    # pred = mnb_clf.predict(tfv_test)
    #
    #accuracy = 0.925

    pac = PassiveAggressiveClassifier(n_iter_no_change=5,
                                      max_iter=10,
                                      early_stopping=True)
    pac.fit(x_traintf, y_train)
    pred = pac.predict(article_testtf)
    test_pred = pac.predict(tfv_test)
    accuracy = metrics.accuracy_score(y_test, test_pred)

    #pred = .predict(tfv_test)
    #pred = mnb_clf.predict(article_testtf)
    #
    # if pred == [0]:
    #     print("This news article is reliable")
    # else:
    #     print("This news article is deemed unreliable")

    print("PassiveAggressiveClassifier accuracy:   %0.3f" % accuracy)
Example no. 25
def test_passive_aggressive_2():
    """Ensure that the TPOT PassiveAggressiveClassifier outputs the same as the sklearn classifier when C == 0.0"""

    tpot_obj = TPOT()
    result = tpot_obj._passive_aggressive(training_testing_data, 0.0, 0)
    result = result[result['group'] == 'testing']

    pagg = PassiveAggressiveClassifier(C=0.0001, loss='hinge', fit_intercept=True, random_state=42)
    pagg.fit(training_features, training_classes)

    assert np.array_equal(result['guess'].values, pagg.predict(testing_features))
Example no. 26
def test_class_weights():
    # Test class weights.
    X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                   [1.0, 1.0], [1.0, 0.0]])
    y2 = [1, 1, 1, -1, -1]

    clf = PassiveAggressiveClassifier(C=0.1, max_iter=100, class_weight=None,
                                      random_state=100)
    clf.fit(X2, y2)
    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1]))

    # we give a small weights to class 1
    clf = PassiveAggressiveClassifier(C=0.1, max_iter=100,
                                      class_weight={1: 0.001},
                                      random_state=100)
    clf.fit(X2, y2)

    # now the hyperplane should rotate clock-wise and
    # the prediction on this point should shift
    assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1]))
Example no. 27
def dive2(X_train, X_test, y_train, y_test, df, data, print_report=False, print_cm=False,\
          print_top10=False, feature_names=False, target_names=False):
    # clf = KNeighborsClassifier(n_neighbors=10)
    # clf = RandomForestClassifier(n_estimators=100)
    clf = PassiveAggressiveClassifier(n_iter=50, random_state=1)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    score = metrics.accuracy_score(y_test, pred)
    recall = metrics.recall_score(y_test, pred)
    pre_score = metrics.precision_score(y_test, pred)
    print(df.columns)
    # df['Integ_Issue_Probability'] = clf.predict_proba(data)[:,1]
    df['Integ_Issue_Prediction'] = clf.predict(data)

    free_text_columns = ['All_Comments', 'Activity_Memo', 'Comment_Summary']
    # df_copy = df.copy()
    # for column in free_text_columns:
    #     df[column]= df_copy[column].apply(clean_string)

    return df, clf, score, pred, recall, pre_score
Example no. 28
def mainworker(limit1, limit2):
    N = 10
    l = []
    w1 = []  # +1 class
    w2 = []  # -1 class
    temp = []
    classlist = []
    f = open("pdata.txt")
    for line in f:
        x = (line.strip("\n")).split(",")
        temp = []
        for i in range(len(x)):
            x[i] = int(x[i])
            temp.append(x[i])
        clas = temp.pop()
        temp = temp[:limit1] + temp[limit2 + 1:]
        l.append(temp)
        classlist.append(clas)
        """if(temp[-1]==-1):
                w2.append(temp)
        else:
                w1.append(temp)"""
    f.close()

    X = np.array(l)
    y = np.array(classlist)

    karray = [2, 3, 4, 5]
    for k in karray:
        kf = cross_validation.KFold(11054, n_folds=k)
        averager = []
        for train_index, test_index in kf:
            # print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # print(X_train, len(X_test), len(y_train), len(y_test))
            train_data = []
            test_data = []
            train_label = []
            test_label = []
            X1 = X_train  # train_data
            Y1 = y_train  # train_label
            clf = PassiveAggressiveClassifier()
            # clf = svm.SVC(kernel='linear')
            clf.fit(X1, Y1)
            Z = X_test  # test_data
            predicted = clf.predict(Z)
            accuracy = getAccuracy(predicted, y_test)  # test_label
            averager.append(accuracy)
        answer = np.mean(averager)
        print("The mean for", k, "fold is:")
        print(answer)
Example no. 31
class FakeNews:
    def __init__(self, db):
        self.db = db
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self.classifier = None
        self.tfidf_vectorizer = None

    def showData(self):
        print(self.db.shape)
        print(self.db.head())

    def splitData(self, testsize):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.db.text, self.db.label, test_size=testsize, random_state=7)

    def solve(self):

        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                                max_df=0.7)
        tfidf_train = self.tfidf_vectorizer.fit_transform(self.x_train)
        tfidf_test = self.tfidf_vectorizer.transform(self.x_test)
        self.classifier = PassiveAggressiveClassifier(max_iter=50)
        self.classifier.fit(tfidf_train, self.y_train)
        y_pred = self.classifier.predict(tfidf_test)
        score = accuracy_score(self.y_test, y_pred)
        print('Accuracy of the solved model: {} %'.format(round(
            100 * score, 2)))
        cm = confusion_matrix(self.y_test, y_pred, labels=['FAKE', 'REAL'])
        print(cm)

    def predict(self, news):
        return self.classifier.predict(self.tfidf_vectorizer.transform([news]))

    @classmethod
    def loadData(cls, data):
        return cls(db=pd.read_csv(data))
Example no. 32
def main():
    """Train and pickle the model/tokenizer."""

    # Load the data
    df = pd.read_csv("./data/data.csv")

    # Get the labels
    labels = df.label

    # Split into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(
        df["text"], labels, test_size=0.3, random_state=7
    )

    # Initialize a TfidfVectorizer with stop words and max doc freq of 0.7
    tfidf_vec = TfidfVectorizer(stop_words="english", max_df=0.7)

    # Fit and transform train set
    tfidf_train = tfidf_vec.fit_transform(x_train)

    # Transform test set
    tfidf_test = tfidf_vec.transform(x_test)

    # Initialize a PassiveAggressiveClassifier and fit tfidf_train and y_train
    pac = PassiveAggressiveClassifier(max_iter=50)
    pac.fit(tfidf_train, y_train)

    # Predict on the test set and calc accuracy
    y_pred = pac.predict(tfidf_test)
    score = accuracy_score(y_test, y_pred)

    # Report metrics
    print(f"Done training model.\n\nAccuracy: {round(score*100, 2)}%")
    print(
        f"\nClassification Report:\n\n{classification_report(y_test, y_pred)}")

    # Pickle the classifier
    pickle.dump(pac, open("./fake-news-app/pac.pickle", "wb"))

    # Pickle the TfidfVectorizer
    pickle.dump(tfidf_vec, open(
        "./fake-news-app/tfidf-vectorizer.pickle", "wb"))

    # Save testing results
    test_df = pd.DataFrame({
        "label": y_test,
        "prediction": y_pred
    })

    test_df.to_csv("./data/model-test-results.csv", index=False)
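A companion sketch for the trainer above, showing how the pickled classifier and vectorizer might be loaded back (the paths mirror the pickle.dump calls; the sample text is a placeholder):

# Reload the artifacts written by main() and classify a new article
import pickle

with open("./fake-news-app/pac.pickle", "rb") as f:
    pac = pickle.load(f)
with open("./fake-news-app/tfidf-vectorizer.pickle", "rb") as f:
    tfidf_vec = pickle.load(f)

sample = ["Some article text to classify"]
print(pac.predict(tfidf_vec.transform(sample))[0])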
Example no. 33
def pac(x, y, x_t, y_t, y_pred):
    score = 0
    t = 0
    for i in range(48):
        classifier = PassiveAggressiveClassifier(max_iter=len(x[i]))
        try:
            classifier.fit(np.array(x[i]), np.array(y[i]))
            y_pred[i] = classifier.predict(x_t[i])
            score += classifier.score(x_t[i], y_t[i])
            t += 1
        except Exception as exc:
            print('error in ' + str(i) + ': ' + str(exc))
            y_pred[i] = np.zeros(17)
            continue
    return score / t
Example no. 34
    def Passive_Aggressive(X_train, Y_train, X_test, Y_test):
        ######################  Passive Aggressive ###########################--Code from ASTD
        classifier = PassiveAggressiveClassifier(n_iter=100)
        classifier.fit(X_train, Y_train)
        # Predicting the Test set results
        y_pred = classifier.predict(X_test)
        # Making the Confusion Matrix
        from sklearn.metrics import confusion_matrix
        cm = confusion_matrix(Y_test, y_pred)
        # Sum of the diagonal of the confusion matrix = correctly classified
        # samples, regardless of the number of classes
        total_correct_predictions = np.trace(cm)
        total_predictions_made = np.sum(cm)
        accuracy = total_correct_predictions / total_predictions_made * 100

        return accuracy
Example no. 35
class PassiveAggressiveClassifierImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
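The wrapper above only forwards fit, predict and decision_function; PassiveAggressiveClassifier is an online learner, so it also supports incremental updates via partial_fit. A small sketch on synthetic mini-batches:

# Incremental training sketch with partial_fit on synthetic data
import numpy as np
from sklearn.linear_model import PassiveAggressiveClassifier

rng = np.random.RandomState(0)
clf = PassiveAggressiveClassifier()
classes = np.array([0, 1])

for i in range(5):                                   # five mini-batches streamed in
    X_batch = rng.randn(20, 4)
    y_batch = (X_batch[:, 0] > 0).astype(int)
    if i == 0:
        clf.partial_fit(X_batch, y_batch, classes=classes)  # classes needed on the first call
    else:
        clf.partial_fit(X_batch, y_batch)

print(clf.predict(rng.randn(3, 4)))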
Example no. 36
class PassiveAgressiveClassifier(Classifier):
    def __init__(self, matrixdatabase):
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._occ = OCC(C=0.0083, n_iter=27, loss="hinge")

    def learn(self, ingredients, cuisine):
        return

    def classify(self, ingredients):
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._occ = self._occ.fit(matrix, classes)
            print("Fitting complete...")
            self._has_fit = True
        output = self._occ.predict(self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
Example no. 37
class PassiveAgressiveClassifier(Classifier):
    def __init__(self, matrixdatabase):
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._occ = OCC(C=0.0083, n_iter=27, loss='hinge')

    def learn(self, ingredients, cuisine):
        return

    def classify(self, ingredients):
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._occ = self._occ.fit(matrix, classes)
            print('Fitting complete...')
            self._has_fit = True
        output = self._occ.predict(
            self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
Example no. 38
def main():
    #stemmer = SnowballStemmer('english')
    #stemmer = EnglishStemmer()

    training_data = open('trainingdata.txt', 'r')
    n = int(training_data.readline().strip())    
    
    train_data = []
    class_data = []

    for i in range(n):
        line = training_data.readline().strip()
        train_data.append(line[1:].strip())
        class_data.append(int(line[0]))
        
    train_data = np.array(train_data)
    class_data = np.array(class_data)


    # 2) Vectorize bag of words
    vectorizer = TfidfVectorizer(stop_words="english", max_df=0.5, sublinear_tf=True )
    vectorizer.fit(train_data)
    X_train = vectorizer.transform(train_data)
        
  
    
    # Read test data from input
    X_test = np.array([input().strip() for i in range(int(input().strip()))])

    X_test = vectorizer.transform(X_test)

    clf = PassiveAggressiveClassifier(n_iter=9) 
    
    clf.fit(X_train, class_data)
    
    pred = clf.predict(X_test)
    for i in pred:
        print(i)
Example no. 39
	def test_main(self):
		categories, documents = get_docs_categories()
		clean_function = lambda text: '' if text.startswith('[') else text
		entity_types = set(['GPE'])
		term_doc_mat = (
			TermDocMatrixFactory(
				category_text_iter=zip(categories, documents),
				clean_function=clean_function,
				nlp=_testing_nlp,
				feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
			).build()
		)
		clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
		fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
		                   clean_function=clean_function,
		                   feats_from_spacy_doc=FeatsFromSpacyDoc(
			                   entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
		tfidf = TfidfTransformer(norm='l1')
		X = tfidf.fit_transform(term_doc_mat._X)
		clf.fit(X, term_doc_mat._y)
		X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
		pred = clf.predict(tfidf.transform(X_to_predict))
		dec = clf.decision_function(X_to_predict)
Example no. 40

clf = MultinomialNB() 
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])


# Testing
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier(n_iter=50)
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])



clf = MultinomialNB(alpha=0.1)
last_score = 0
for alpha in np.arange(0,1,.1):
    nb_classifier = MultinomialNB(alpha=alpha)
    nb_classifier.fit(tfidf_train, y_train)
    pred = nb_classifier.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred)
    if score > last_score:
        last_score = score
Example no. 41
#https://www.hackerrank.com/challenges/document-classification/submissions/code/10577787
# Enter your code here. Read input from STDIN. Print output to STDOUT
documents=[]
target=[]
cnt=0
from sklearn.linear_model import PassiveAggressiveClassifier
with open("trainingdata.txt") as infile:
    for line in infile:
        if cnt==0:
            cnt=1
            continue
        category=int(line[0:2])
        doc=line[2:]
        target.append(category)
        documents.append(doc)
from sklearn.feature_extraction.text import TfidfVectorizer
transformer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, analyzer='word',stop_words='english')
X = transformer.fit_transform(documents)
from sklearn.naive_bayes import MultinomialNB
clf = PassiveAggressiveClassifier(n_iter=50)
clf.fit(X, target)

n = int(input())
for i in range(0, n):
    X = transformer.transform([input()])
    print(clf.predict(X)[0])

Example no. 42
#print X_train_tfidf.shape

ntest = int(input())
testdoc = []
for t in range(0, ntest):
    doc = input()
    testdoc.append(doc)

X_new_counts = count_vect.transform(testdoc)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
""""
#Naive bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, trainlabel)
predicted = clf.predict(X_new_tfidf)

#test random forest

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X_train_tfidf, trainlabel)
predicted = clf.predict(X_new_tfidf)
"""
from sklearn.linear_model import PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier(n_iter=50)
clf = clf.fit(X_train_tfidf, trainlabel)
predicted = clf.predict(X_new_tfidf)

for t in range(0, ntest):
    print(predicted[t])
Example no. 43
def tweetread():   
    data = []
    catagory = []
    
    results_traffic = collection_aa.find({"manualtype":{"$ne":"/^non*/"}}) 
    for i,item in enumerate(results_traffic):
        text = unicodedata.normalize('NFKD', item["text"]).encode('ascii','ignore').decode('utf-8')
        text = re.sub(r"@([A-Za-z]+[A-Za-z]+[A-Za-z0-9-_\.]+)", "", text)
        print(text)
        data.append(str(text))
        catagory.append(0)
    results_nontraffic = collection_mapped.find({"_id":{"$regex":"2014/04/18/09*"}}) 
    nontraffic = []
    data = data[:5000]
    catagory = catagory[:5000]
    #docs = [{f["text"]:"TRAFFIC"} for f in results_traffic]
    print(len(data), " TRAFFIC SIZE ")
    
    for res in results_nontraffic:
        #print(len(res["item"]))
        for i in res["item"]:
            if len(data) < 10000:
                text = unicodedata.normalize('NFKD', i["text"]).encode('ascii','ignore').decode('utf-8')
                #if not check_in(['delays', 'crash', 'cleared'] , text):
                text = re.sub(r"@([A-Za-z]+[A-Za-z0-9-_\.]+)", "", text)
                print(text)
                data.append(text)
                catagory.append(1)
                #else:
                #    print(text)
    print(len(data), "SAMPLE SIZE ")
    vectorizer =  TfidfVectorizer(
        analyzer='word',  # features made of words
        token_pattern=r'[a-z]{3,}',
        use_idf=True,
        strip_accents='unicode',
        #ngram_range=(2,3),
        sublinear_tf=True, max_df=0.95, min_df=0.05,stop_words='english')
    #vectorizer =  DictVectorizer();
   
    X_train = vectorizer.fit_transform(data)
    X_test = vectorizer.transform(data)
    feature_names = vectorizer.get_feature_names()#np.vectorize(vectorizer.get_feature_names())
    print(feature_names);
    print(X_test)
    print(data[0])
    print(data[1])
    
    
    #BernoulliNB(alpha=.01)
    #nb_classifier = BernoulliNB(alpha=.01).fit(X_train, catagory)
    #nb_classifier = RidgeClassifier(tol=1e-2, solver="lsqr").fit(X_train, catagory)
    #nb_classifier = Perceptron(n_iter=50).fit(X_train, catagory)
    nb_classifier = PassiveAggressiveClassifier(n_iter=50).fit(X_train, catagory)
    #nb_classifier = MultinomialNB(alpha=.01).fit(X_train, catagory)
    y_nb_predicted = nb_classifier.predict(X_test)
    
    print("Dimensionality: %d" % nb_classifier.coef_.shape[1])
    show_most_informative_features(vectorizer, nb_classifier, n=50)
    print("traffic     :"  + str(traffic_label))
    print("traffic score    #:"  + str(traffic_scores))
    print("non  :"  + str(nontraffic_label))        
    print("non score #:"  + str(nontraffic_scores))
    
  
    
    print("MODEL: Passive Aggressive Classifier\n")
    
    print('The precision for this classifier is ' + str(metrics.precision_score(catagory, y_nb_predicted)));
    print('The recall for this classifier is ' + str(metrics.recall_score(catagory, y_nb_predicted)));
    print('The f1 for this classifier is ' + str(metrics.f1_score(catagory, y_nb_predicted)));
    print('The accuracy for this classifier is ' + str(metrics.accuracy_score(catagory, y_nb_predicted)));
    
    print('\nHere is the classification report:');
    print(classification_report(catagory, y_nb_predicted));
    print(metrics.confusion_matrix(catagory, y_nb_predicted, labels=[0,1]))
    
    results_nontraffic = collection_mapped.find({"_id":{"$regex":"2014/04/*"}}) 
    nontraffic = []
    data = data[:1000]
    catagory = catagory[:1000]
    #docs = [{f["text"]:"TRAFFIC"} for f in results_traffic]
    print(len(data), " TRAFFIC SIZE ")
    
    f = open('classifier.pickle', 'wb')
    v = open('vector.pickle', 'wb')
    pickle.dump(nb_classifier, f)
    pickle.dump(vectorizer, v)
    f.close()
    
    for res in results_nontraffic:
        for item in res["item"]:
            text = unicodedata.normalize('NFKD', item["text"]).encode('ascii','ignore').decode('utf-8')
            X_test = vectorizer.transform([text])
            y_nb_predicted = nb_classifier.predict(X_test)
            #score = metrics.f1_score(X_test, y_nb_predicted)
            if y_nb_predicted == 0:
                #if check_in(['delays', 'crash', 'cleared'] , text):
                #print("PREDICTED", text)
                print("", text,"\\\\")
Example no. 44
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)

result1 = tpot_data.copy()

# Perform classification with a passive aggressive classifier
pagr1 = PassiveAggressiveClassifier(C=0.81, loss="squared_hinge", fit_intercept=True, random_state=42)
pagr1.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)

result1['pagr1-classification'] = pagr1.predict(result1.drop('class', axis=1).values)
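The export above imports train_test_split from sklearn.cross_validation, which no longer exists in current scikit-learn; a hedged sketch of the same split and fit using sklearn.model_selection (the data path, separator and 'class' column are the snippet's own placeholders):

# Same TPOT-style split/fit, updated for sklearn.model_selection
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier

tpot_data = pd.read_csv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(
    tpot_data.index, stratify=tpot_data['class'].values,
    train_size=0.75, test_size=0.25)

pagr1 = PassiveAggressiveClassifier(C=0.81, loss="squared_hinge",
                                    fit_intercept=True, random_state=42)
pagr1.fit(tpot_data.loc[training_indices].drop('class', axis=1).values,
          tpot_data.loc[training_indices, 'class'].values)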
#clf = neighbors.KNeighborsClassifier(K,weights = 'distance', leaf_size= 30)

from sklearn.linear_model import PassiveAggressiveClassifier
clf = PassiveAggressiveClassifier(n_iter=50)
clf.fit(trans, y)

#f = open("testDatatextClassification.txt",'r')
f = open("input01.txt",'r')
f2 = open("output01.txt","r")
d = f.readlines()
d = d[1:]
ans = list(map(int, f2.readlines()))
t0 = time.perf_counter()

summing = 0;
for j,i in enumerate(d):
    sol = int(clf.predict(vectorizer.transform([i]).toarray())[0])
    #print sol, ans[j]
    if (sol==ans[j]):
        summing = summing + 1
    #clf.predict(vectorizer.transform([i]).toarray())[0]
t = time.perf_counter() - t0
print(t)
print(100 * float(summing - (len(ans) - summing)) / len(ans))
print(len(ans))





#stem the words
bag_of_words=vectorizer.fit(ls)
bag_of_words=vectorizer.transform(ls)
cmax=0
for cc in range(1,100):
    #sw=stopwords.words() #stopwords are not supported, requires download
    clf = PassiveAggressiveClassifier(n_iter=9,C=cc/10)
#    svm=LinearSVC(C=cc/10.0)
    clf.fit(bag_of_words,ln)
    
    #Now get input (test) data
    lt=[]
    filename=open("testdata.txt")
    line = filename.readline()
    ntests=int(line)
    for _ in range(ntests):
        lt.append(filename.readline())
    
    bag_of_test_words=vectorizer.transform(lt)
    result=clf.predict(bag_of_test_words)
    actuals=[]
    filename=open("testresults.txt")
    z=0
    for x in range(len(result)):
        zz = int(filename.readline())
        if zz==int(result[x]):
            z=z+1
    acc=(float(z)-(len(result)-float(z)))/len(result)
    if cmax<acc: cmax=acc
    print(cc)
    print(cmax * 100)