Example #1
from sklearn.naive_bayes import MultinomialNB

# X_train / y_train here presumably come from an earlier split of count-vectorized review text (not shown)
rate_model_detection = MultinomialNB()

rate_model_detection.fit(X_train, y_train)

predictions = rate_model_detection.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('cv', CountVectorizer()), ('tfidf', TfidfTransformer()),
                     ('naive', MultinomialNB())])

X = yelp_class['text']

y = yelp_class['stars']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=101)

pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
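A short follow-up, not part of the original snippet, evaluating the pipeline on the held-out raw-text split with the same metrics used above for the bare model:

print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))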
Example #2
        feature_vals.append(feature_names[idx])

    # build a dict mapping each feature name to its score
    # results = zip(feature_vals,score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]

    return results
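The script below relies on sort_coo and remove_html helpers that are not shown in this excerpt. A minimal sketch of what a sort_coo helper conventionally does (sort the COO matrix entries by tf-idf score, descending); treat it as an assumption rather than the project's actual code:

def sort_coo(coo_matrix):
    # pair each column index with its tf-idf score and sort by score, highest first
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)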


import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = des.tolist()

cv = CountVectorizer(max_df=0.85, max_features=10000)
word_count_vector = cv.fit_transform(docs)
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)
feature_names = cv.get_feature_names()

docs = " ".join(docs)
docs = remove_html(docs)
docs = docs.replace('br', '')

tf_idf_vector = tfidf_transformer.transform(cv.transform([docs]))
sorted_items = sort_coo(tf_idf_vector.tocoo())
keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

plt.bar(*zip(*keywords.items()))
plt.xticks(rotation=60)
plt.savefig("des_keywords.jpg")
plt.show()
Example #3
traind.head(10)

traind.shape

dbpedia_df = traind

X = dbpedia_df['sentence']
Y = dbpedia_df['condition']

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

count_vectorizer = CountVectorizer(min_df=0, max_df=80, ngram_range=(2, 2))

feature_vector = count_vectorizer.fit_transform(X)

feature_vector.shape

tfidf_transformer = TfidfTransformer()

feature_vector = tfidf_transformer.fit_transform(feature_vector)

feature_vector.shape

X_dense = feature_vector.toarray()  # plain ndarray so torch.from_numpy accepts the splits below
X_dense.shape
x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size=0.2)

import torch
import numpy as np

Xtrain_ = torch.from_numpy(x_train).float()
Xtest_ = torch.from_numpy(x_test).float()
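A hedged continuation: the labels also need to be tensors before training a PyTorch model. Assuming Y holds string categories (the excerpt does not show its encoding), one common approach is to encode them first:

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
ytrain_ = torch.from_numpy(label_encoder.fit_transform(y_train)).long()
ytest_ = torch.from_numpy(label_encoder.transform(y_test)).long()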
Example #4
import pandas as pd
import scattertext as st
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfTransformer


movie_dataframe['parse'] = movie_dataframe['content'].apply(st.whitespace_nlp_with_sentences)

#corpus matrix
corpus = (st.CorpusFromParsedDocuments(movie_dataframe,
                                       category_col='review',
                                       parsed_col='parse')
              .build()
              .get_stoplisted_unigram_corpus()) 

corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['author'])

# Compute a truncated SVD embedding of the tf-idf term-document matrix
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
u, s, vt = svds(embeddings, k=1000, maxiter=20000, which='LM')
projection = pd.DataFrame({'term': corpus.get_metadata(), 'x': u.T[0], 'y': u.T[1]}).set_index('term')


#Plot 2
category = 'positive'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)).astype(int)
html = st.produce_pca_explorer(corpus,
                               category=category,
                               category_name='positive',
                               not_category_name='negative',
                               metadata=movie_dataframe['author'],
                               width_in_pixels=1000,
                               show_axes=False,
                               use_non_text_features=True,
Example #5
if __name__ == '__main__':
    args = parseArguments()

    if args["training_datafile"] is not None:
        print ("-------Training-------")
        original_training_df = getDataFrame(args["training_datafile"])
        clean_training_df = cleanDataFrame(original_training_df.copy())

        print ("splitting data into training and validation set")
        training_set, validation_set = train_test_split(clean_training_df, test_size=0.3)
        print (training_set.shape)
        print (validation_set.shape)

        pipeline = Pipeline([
                ('count_vectorizer',CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('classifier',BernoulliNB(binarize=0.0)) ])
 
        model = trainModel(training_set,pipeline)

        print ("----Validation and Classification Report----")
        predicted = predictModel(validation_set, model)
        target_classifications = sorted(set(validation_set["classification"]))  # sorted to match classification_report's label order
        print(classification_report(validation_set["classification"], predicted, target_names=target_classifications))

        print ("----Saving Model----")
        if args["modelpath"] is not None:
            saveModel(model, args["modelpath"])
        else:
            saveModel(model)
    if args["modelpath"] is not None and args["test_datafile"] is not None:
Example #6
from scipy import sparse
from sklearn.feature_extraction.text import TfidfTransformer


# X_dtype is supplied by a pytest parametrize decorator that is not shown in this excerpt
def test_tfidf_transformer_type(X_dtype):
    X = sparse.rand(10, 20000, dtype=X_dtype, random_state=42)
    X_trans = TfidfTransformer().fit_transform(X)
    assert X_trans.dtype == X.dtype
Example #7
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 17 17:17:16 2018

@author: Junbin Gao  All Copyright
"""

from sklearn.feature_extraction.text import TfidfTransformer


# Initiate the transformer
transformer = TfidfTransformer(smooth_idf=False)
# Check what it is
transformer  

# Term counts for three different terms across a corpus of 6 documents
counts = [[3, 0, 1],
           [2, 0, 0],
           [3, 0, 0],
           [4, 0, 0],
           [3, 2, 0],
           [3, 0, 2]]

# Transform the corpus
tfidf = transformer.fit_transform(counts)

# This is the transformed feature matrix for 6 documents
# This matrix can be pipelined into a machine learning algorithm
# Each row is normalized to have unit Euclidean norm:
X = tfidf.toarray()  
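A small check, not in the original, confirming the comments above: each row of X has unit L2 norm, and the learned idf weights are exposed on the transformer:

import numpy as np

print(np.linalg.norm(X, axis=1))  # every entry should be 1.0
print(transformer.idf_)           # idf weight learned for each of the three terms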
Example #8
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_transformer = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

categorical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')
            ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# https://www.oreilly.com/library/view/applied-text-analysis/9781491963036/ch04.html
text_transformer = Pipeline(
    steps=[('bow', CountVectorizer(
        analyzer=process_text)), ('tfidf', TfidfTransformer())])


def get_column_transformer_preprocessor(numeric_features, categorical_features,
                                        text_features):
    transformers = []

    if len(numeric_features) > 0:
        transformers.append(('num', numeric_transformer, numeric_features))

    if len(categorical_features) > 0:
        transformers.append(
            ('cat', categorical_transformer, categorical_features))

    for x in text_features:
        transformers.append(('txt_' + str(x), text_transformer, x))
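    # The excerpt ends here; presumably the collected transformers are wrapped in a
    # ColumnTransformer and returned. A sketch of that final step (an assumption,
    # not the original code):
    from sklearn.compose import ColumnTransformer
    return ColumnTransformer(transformers=transformers)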
Example #9
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import LabelSpreading, SelfTrainingClassifier

data = fetch_20newsgroups(subset="train", categories=None)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

# Parameters
sdg_params = dict(alpha=1e-5, penalty="l2", loss="log")
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Supervised Pipeline
pipeline = Pipeline([
    ("vect", CountVectorizer(**vectorizer_params)),
    ("tfidf", TfidfTransformer()),
    ("clf", SGDClassifier(**sdg_params)),
])
# SelfTraining Pipeline
st_pipeline = Pipeline([
    ("vect", CountVectorizer(**vectorizer_params)),
    ("tfidf", TfidfTransformer()),
    ("clf", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
])
# LabelSpreading Pipeline
ls_pipeline = Pipeline([
    ("vect", CountVectorizer(**vectorizer_params)),
    ("tfidf", TfidfTransformer()),
    # LabelSpreading does not support sparse matrices, so densify first
    ("todense", FunctionTransformer(lambda x: x.todense())),
    ("clf", LabelSpreading()),
Example #10
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
        # ('city', 'state', 'german'),
    )

    ### choose a data source ###
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=3,
                                             n_vocab=2000,
                                             by_paragraph=True)
    # sentences, word2idx = get_wikipedia_data(n_files=5, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from \
                analogies to try or increase vocab size" % w)
                notfound = True
    if notfound:
        exit()

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A.T).T

    # tsne requires a dense array
    A = A.toarray()

    # map back to word in plot
    idx2word = {v: k for k, v in word2idx.items()}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"),
                         xy=(Z[i, 0], Z[i, 1]))
        except Exception:
            print("bad string:", idx2word[i])
    plt.draw()

    ### multiple ways to create vectors for each word ###
    # 1) simply set it to the TF-IDF matrix
    # We = A

    # 2) create a higher-D word embedding
    tsne = TSNE(n_components=3)
    We = tsne.fit_transform(A)

    # 3) use a classic dimensionality reduction technique
    # svd = KernelPCA(n_components=20, kernel='rbf')
    # We = svd.fit_transform(A)

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx, idx2word)

    plt.show()  # pause script until plot is closed
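find_analogies is defined elsewhere in the project; a minimal sketch of the usual implementation (analogy vector arithmetic plus a nearest-neighbour lookup), offered as an assumption rather than the project's actual code:

def find_analogies(w1, w2, w3, We, word2idx, idx2word):
    # king - man + woman ~ queen: form the analogy vector and list the closest words
    v = We[word2idx[w1]] - We[word2idx[w2]] + We[word2idx[w3]]
    distances = np.linalg.norm(We - v, axis=1)
    closest = distances.argsort()[:4]  # the top hits usually include the input words
    print("%s - %s + %s ~ %s" % (w1, w2, w3, [idx2word[i] for i in closest]))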
Example #11

import os

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# load the two training categories: hotel ("宾馆") and travel ("旅游") reviews
processed_textdata1, class1 = loadtrainset(
    "C:/Users/Administrator/Desktop/dataset/train/hotel", "宾馆")
processed_textdata2, class2 = loadtrainset(
    "C:/Users/Administrator/Desktop/dataset/train/travel", "旅游")
train_data = processed_textdata1 + processed_textdata2
classtags_list = class1 + class2

count_vector = CountVectorizer()
vector_matrix = count_vector.fit_transform(train_data)

# TF only (use_idf=False): normalized term-frequency features
train_tfidf = TfidfTransformer(use_idf=False).fit_transform(vector_matrix)

clf = MultinomialNB().fit(train_tfidf, classtags_list)

testset = []

path = "C:/Users/Administrator/Desktop/dataset/tt"
allfiles = os.listdir(path)
hotel = 0
travel = 0
for thisfile in allfiles:
    path_name = path + "/" + thisfile
    new_count_vector = count_vector.transform([preprocess(path_name)])
    new_tfidf = TfidfTransformer(use_idf=False).fit_transform(new_count_vector)

    predict_result = clf.predict(new_tfidf)
Example #12
        train_id_list = total_id_list[:train_size]
        test_id_list = total_id_list[train_size:]

        train_title_list = total_title_list[:train_size]
        test_title_list = total_title_list[train_size:]

        train_date_list = total_date_list[:train_size]
        test_date_list = total_date_list[train_size:]

        train_code_list = total_code_list[:train_size]
        test_code_list = total_code_list[train_size:]

        count_vec = CountVectorizer(min_df=1)
        tf_train = count_vec.fit_transform(train_text_list)

        tfidf_transformer = TfidfTransformer().fit(tf_train)
        tfidf_train = tfidf_transformer.transform(tf_train)
        tf_test = count_vec.transform(test_text_list)
        tfidf_test = tfidf_transformer.transform(tf_test)

        del AllData
        del total_text_list
        del total_label_list
        del total_id_list
        del total_title_list
        del total_date_list
        del total_code_list

        del train_text_list
        del test_text_list
        del tf_train
Example #13
def dat_prep(nbd_train,nbd_test,k,vect_type,Type_train,Type_test,Chr_train,Chr_test,Label_train,Label_test,scaled_feats_train,scaled_feats_test,dummy_train,dummy_test):
    #Derives the Count Vectorizer or TFIDF scores for a given neighborhood sequence
    """
    Arguments:
        nbd_train = Column containing the neighborhood sequence from the training data
        nbd_test = Column containing the neighborhood sequence from the test data

        k=size of kmer
        vect_type= 'CV' for Count Vectorizer or else TFIDF Vectorizer 
        Type_train=Numerically encoded substitution Type ("A>T" encoded as 1 or "G>C" encoded as 2 and so on) from training data
        Type_test=Numerically encoded substitution Type ("A>T" encoded as 1 or "G>C" encoded as 2 and so on) from test data
        Chr_train= Chromosome number from training data
        Chr_test=Chromosome number from test data
        Label_train=Binary label (training data), where 1=Passenger and 2=Driver
        Label_test=Binary label (test data), where 1=Passenger and 2=Driver
        scaled_feats_train=Scaled genomic features (conservation, amino acid etc.) for training data
        scaled_feats_test=Scaled genomic features (conservation, amino acid etc.) for test data
        dummy_train= One-hot encoding based feature matrix for training data
        dummy_test=One-hot encoding based feature matrix for test data


    Returns:
        df_comb_train= The complete dataframe (using training data) of TFIDF or CountVect scores plus other features such as chromosome number and substitution type
        df_comb_test= The complete dataframe (using test data) of TFIDF or CountVect scores plus other features such as chromosome number and substitution type
        count_vector_train= Just the TFIDF or Count vect features (training data) also known as the Document-Term matrix
        count_vector_test= Just the TFIDF or Count vect features (test data) also known as the Document-Term matrix
        cols= feature names
        vect= The vocabulary derived from the training data
        sc= The scaling variable derived from the training data


    """
    if(vect_type=="CV"):
        vect=Pipeline([('cv1',CountVectorizer(lowercase=False))])
    else:
        vect = Pipeline([('cv1',CountVectorizer(lowercase=False)), ('tfidf_transformer',TfidfTransformer(smooth_idf=True,use_idf=True))])
        

    count_vector_train=vect.fit_transform(preprocess(nbd_train,k))
    count_vector_test=vect.transform(preprocess(nbd_test,k))
    
    df_train=pd.DataFrame(count_vector_train.todense(),columns=vect['cv1'].get_feature_names())
    df_test=pd.DataFrame(count_vector_test.todense(),columns=vect['cv1'].get_feature_names())

    sc=None  # ensure sc is defined even when no min-max scaling is applied
    if(vect_type=="tf"):
        sc=MinMaxScaler()
        #fit_transform() learns the min-max scaling parameters from the training data and scales it
        df_train=pd.DataFrame(sc.fit_transform(df_train),columns=df_train.columns)
        #transform() reuses the scaling learned on the training data to scale the test data
        df_test=pd.DataFrame(sc.transform(df_test),columns=df_test.columns)
        
    df_train['Type']=Type_train;df_test['Type']=Type_test
    df_train['Label']=Label_train;df_test['Label']=Label_test
    df_train['Chr']=Chr_train;df_test['Chr']=Chr_test
    df_comb_train=pd.concat([df_train, scaled_feats_train,dummy_train], axis=1)
    df_comb_test=pd.concat([df_test, scaled_feats_test,dummy_test], axis=1)

    df_comb_train = df_comb_train.loc[:,~df_comb_train.columns.duplicated()]
    df_comb_test = df_comb_test.loc[:,~df_comb_test.columns.duplicated()]
    cols=vect['cv1'].get_feature_names()


    return df_comb_train,df_comb_test,count_vector_train,count_vector_test,cols,vect,sc
Example #14
#the calls below report on a document-term matrix of size (10240, 12196)
def get_countVectorizer_stats():

    #vocab size
    train_count.shape

    #check vocabulary using below command
    print(countV.vocabulary_)

    #get feature names
    print(countV.get_feature_names()[:25])


#create tf-idf frequency features
#tf-idf
tfidfV = TfidfTransformer()
train_tfidf = tfidfV.fit_transform(train_count)


def get_tfidf_stats():
    train_tfidf.shape
    #preview the first 10 rows of the dense tf-idf matrix
    print(train_tfidf.A[:10])


#bag of words - with n-grams
#countV_ngram = CountVectorizer(ngram_range=(1,3),stop_words='english')
#tfidf_ngram  = TfidfTransformer(use_idf=True,smooth_idf=True)

tfidf_ngram = TfidfVectorizer(stop_words='english',
                              ngram_range=(1, 4),
Example #15
    8: 1,
    9: 1,
    10: 1,
    11: 1,
    12: 1,
    13: 1,
    14: 1,
    15: 1,
    16: 1,
    17: 1,
    18: 1,
    19: 1
}

text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf',
                      SGDClassifier(loss='hinge',
                                    penalty='l2',
                                    alpha=1e-3,
                                    random_state=42,
                                    max_iter=5,
                                    tol=None,
                                    class_weight=class_weight))])
text_clf.fit(X_train, y_train)
predicted_SVM = text_clf.predict(X_test)
print("SVM part, metrics on test set:")
print(metrics.classification_report(y_test, predicted_SVM))

from sklearn.model_selection import GridSearchCV
parameters = {

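# The original parameter grid is cut off above. An illustrative sketch (names and values
# are assumptions, not the original's) of how this pipeline would typically be tuned:
param_grid = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-3, 1e-4)}
gs_clf = GridSearchCV(text_clf, param_grid, cv=5, n_jobs=-1)
gs_clf.fit(X_train, y_train)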
Example #16
# In[76]:

sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))
print('sparsity: {}'.format((sparsity)))


# In[77]:

from sklearn.feature_extraction.text import TfidfTransformer


# In[78]:

tfidf_transformer=TfidfTransformer().fit(messages_bow)


# In[79]:

tfidf4=tfidf_transformer.transform(bow4)


# In[80]:

print(tfidf4)


# In[81]:

tfidf_transformer.idf_[bow_transformer.vocabulary_['university']]
Example #17
        return [self.wnl.lemmatize(token, TAG_MAP[tag[0]])
                for token, tag in pos_tag(tokenized)]
        
categories = ['alt.atheism','soc.religion.christian','comp.graphics','sci.med']

print('defining dataset')
trainingData = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
countVectorizer = CountVectorizer(tokenizer=LemmaTokenizer(),
                                  lowercase=True,
                                  strip_accents='unicode')
print('transforming data to tfidf')
xTrainCounts = countVectorizer.fit_transform(trainingData.data)
print('done')
# print(countVectorizer.vocabulary_.get(u'software'))

tfidfTransformer = TfidfTransformer()
xTrainTfidf = tfidfTransformer.fit_transform(xTrainCounts)

model = MultinomialNB().fit(xTrainTfidf, trainingData.target)
preds = model.predict(xTrainTfidf)
print(confusion_matrix(trainingData.target, preds))
print(accuracy_score(trainingData.target, preds))
print(classification_report(trainingData.target, preds))

new = ['This has nothing to do with church or religion', 
       'Software engineering is getting hotter and hotter nowadays']

xNewCounts = countVectorizer.transform(new)
xNewTfidf = tfidfTransformer.transform(xNewCounts)

predicted = model.predict(xNewTfidf)
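A short follow-up, not in the original, mapping the predicted class ids back to newsgroup names:

for doc, category_id in zip(new, predicted):
    print('%r => %s' % (doc, trainingData.target_names[category_id]))

Example #18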
                                 "./data/mid_cut_jieba.txt")

x_train, x_test, y_train, y_test = train_test_split(x_text,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=2017)
y = y.ravel()
y_train = y_train.ravel()
y_test = y_test.ravel()

print("Train/Test split: {:d}/{:d}".format(len(y_train), len(y_test)))
""" Naive Bayes classifier """
bayes_clf = Pipeline([
    ('vect', CountVectorizer()
     ),  # Convert a collection of text documents to a matrix of token counts
    ('tfidf', TfidfTransformer()
     ),  # Transform a count matrix to a normalized tf or tf-idf representation
    ('clf', MultinomialNB())  # Naive Bayes classifier for multinomial models
])
bayes_clf.fit(x_train, y_train)
""" Predict the test dataset using Naive Bayes"""
predicted = bayes_clf.predict(x_test)
print('Naive Bayes correct prediction: {:4.4f}'.format(
    np.mean(predicted == y_test)))
print(metrics.classification_report(y_test, predicted,
                                    target_names=categories))
""" Support Vector Machine (SVM) classifier"""
svm_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf',
Example #19
        if fn[19] == 'D':
            test_demo_label = np.append(test_demo_label, fn[19])
            testDemo.append(f.read())
        if fn[19] == 'R':
            test_repub_label = np.append(test_repub_label, fn[19])
            testRepub.append(f.read())
        f.close()

    #In this part of the code we build a pipeline, though we only use the CountVectorizer
    #in this case. This portion of the code focuses on the Democratic party.
    #First the code fits and transforms the Democratic data set we appended before,
    #then converts it to a format we can analyze and finds bigrams within the dataset that
    #contain keywords we're looking for, such as energy, education, and healthcare.
    #We also conveniently provide a count to print out if necessary.
    text_pipeline = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer(use_idf=False)),
                              ('clf', MultinomialNB(alpha=0))])

    vecti = CountVectorizer(ngram_range=(2, 2))
    freqTrain = vecti.fit_transform(trainDemo).toarray()
    nameDict = vecti.get_feature_names()
    freqArray = np.sum(freqTrain, axis=0)
    countDict = dict(zip(nameDict, freqArray))
    energyDict = {}
    educationDict = {}
    healthDict = {}
    print("Intriguing Bigrams Democrats: ")
    for x in countDict:
        if "energy" in x:
            print(x)
            if x not in energyDict:
Example #20
items = pd.read_csv('ml-100k/u.item',
                    sep='|',
                    names=i_cols,
                    encoding='latin-1')

n_items = items.shape[0]
print('Number of items:', n_items)

X0 = items.values
X_train_counts = X0[:, -19:]

# tfidf
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=True, norm='l2')
tfidf = transformer.fit_transform(X_train_counts.tolist()).toarray()


def get_items_rated_by_user(rate_matrix, user_id):
    """
    in each line of rate_matrix we have info: user_id, item_id, rating (score), time_stamp
    we care about the first three values
    return (item_ids, scores) rated by user user_id
    """
    y = rate_matrix[:, 0]  # all users
    # item indices rated by user_id
    # we need to add 1 to user_id since ids in the rate_matrix start from 1
    # while indices in Python start from 0
    ids = np.where(y == user_id + 1)[0]
    item_ids = rate_matrix[ids, 1] - 1  # index starts from 0
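    # The function is cut off here in the excerpt; per its docstring it presumably also
    # collects the corresponding ratings and returns both arrays (a sketch):
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)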
Example #21
def tfidf_transformer(bow_matrix):

    transformer = TfidfTransformer(norm='l2', smooth_idf=True, use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix
Example #22
    lemma_instance = WordNetLemmatizer()
    lemmas = [lemma_instance.lemmatize(word, "v") for word in no_stopwords]

    stem_instance = PorterStemmer()
    stems = [stem_instance.stem(word) for word in lemmas]

    return stems


# Creating the pipeline
print('\nCreating the pipeline ...')
pipeline = Pipeline([
    ('bow', CountVectorizer(
        analyzer=text_processing)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier',
     MultinomialNB()),  # train on TF-IDF vectors with Naive Bayes classifier
])

# Train Test Split
X = messages['message']  # features
y = messages['label']  # target
msg_train, msg_test, label_train, label_test = train_test_split(X,
                                                                y,
                                                                test_size=0.3)

# Training Pipeline
print('\nTraining the Pipeline ...')
pipeline.fit(msg_train, label_train)
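A hedged follow-up, not shown in the original excerpt, evaluating the fitted pipeline on the held-out messages:

from sklearn.metrics import classification_report

test_predictions = pipeline.predict(msg_test)
print(classification_report(label_test, test_predictions))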
Example #23
def run(train_data, valid_data, test_data, truth_data):
    train_data_df = pd.DataFrame.from_dict(train_data)
    truth_data_df = pd.DataFrame.from_dict(truth_data)
    train = pd.merge(train_data_df, truth_data_df, on="id")
    data = train.values

    textFeatures = ["postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                    "targetDescription", "truthClass"]

    vals = data.tolist()
    final_vals = []
    # print(vals[0])
    for i in range(len(vals)):
        if vals[i][1] != []:
            print(vals[i][2])
            final_vals.append([vals[i][2], vals[i][4], vals[i][5], vals[i][6], vals[i][7], vals[i][8], vals[i][9]])

    vals_df = pd.DataFrame(final_vals, columns=["postText", "targetCaptions", "targetParagraphs", "targetTitle", "targetKeywords",
                    "targetDescription", "truthClass"])
    textColumns = vals_df.values.tolist()

    df = []
    y = []
    print('---------')
    print(len(final_vals))

    VALIDATION_SPLIT = 0.1
    nb_validation_samples = int(VALIDATION_SPLIT * len(final_vals))
    valid_data = final_vals[:nb_validation_samples]
    test_data = final_vals[int(0.8 * len(final_vals)):int(0.9 * len(final_vals))]
    final_vals = final_vals[0:int(len(final_vals)*0.8)]

    for i in final_vals:
        if(i[6]=="clickbait"):
            y.append(1)
        else:
            y.append(0)
    # print(textColumns[0])

    for i in range(len(final_vals)):
        text = []
        for j in range(0,6):
            k = final_vals[i][j]
            # print(k, j)
            if (j == 2 or j == 3):
                text.append(k)
            else:
                text+=k
        words = ""
        for string in text:
            string = clean_str(string)
            words +=" ".join(string.split())
        df+=[words]

    vectorizer = CountVectorizer(input='content', lowercase=False, analyzer='word', stop_words='english')
    X = vectorizer.fit_transform(df)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X)
    print(X_train_tfidf.shape)

    clf = linear_model.LinearRegression()
    clf.fit(X_train_tfidf, y)

    ### VALIDATION DATA ###

    print("Validation")
    # valid_data_df = pd.DataFrame(valid_data)
    # valid_data_df = pd.DataFrame.from_dict(valid_data)
    # valid = pd.merge(valid_data_df, truth_data_df, on="id")
    # vdata = valid.append(train).values
    # vdata = final_vals.append(valid_data_df.values).tolist()
    vdata = final_vals + valid_data

    y_valid = []
    for i in vdata:
        if (i[6] == "clickbait"):
            y_valid.append(1)
        if (i[6] == "no-clickbait"):
            y_valid.append(0)

    y_valid = pd.DataFrame(y_valid)
    print("Y_valid length", len(y_valid))
    # vdata = valid[textFeatures].append(train[textFeatures]).values.tolist()

    df_valid = []
    for i in range(len(vdata)):
        text = []
        for j in range(0, 5):
            k = vdata[i][j]
            if (j == 2 or j == 3):
                text.append(k)
            else:
                text += k
        words = ""
        for string in text:
            string = clean_str(string)
            words += " ".join(string.split())
        df_valid += [words]

    # a_train, a_val, b_train, b_val = train_test_split(df_valid, y_valid, test_size = 0.11, random_state = 42)
    predicted = []
    for v in df_valid:
        valid_X = vectorizer.transform([v])
        X_valid_tfidf = tfidf_transformer.transform(valid_X)
        predicted.append(clf.predict(X_valid_tfidf).round())

    scores = accuracy_score(y_valid, predicted)
    print("Validation Data Accuracy ", scores)

    ### TEST DATA ###

    # predicted = []
    # for t in df_test:
    #     test_X = vectorizer.transform([t])
    #     X_test_tfidf = tfidf_transformer.transform(test_X)
    #     predicted.append(model.predict(X_test_tfidf).round())
    #
    # scores = accuracy_score(y_test, predicted)

    tdata = test_data

    y_test =[]
    df_test =[]

    for i in tdata:
        if(i[6]=="clickbait"):
            y_test.append(1)
        if(i[6]=="no-clickbait"):
            y_test.append(0)

    # textColumns_test = test[textFeatures]
    # textColumns_test = textColumns_test.values.tolist()

    for i in range(len(tdata)):
        text = []
        for j in range(0,5):
            k = tdata[i][j]
            if (j == 2 or j == 3):
                text.append(k)
            else:
                text+=k
        words = ""
        for string in text:
            string = clean_str(string)
            words +=" ".join(string.split())
        df_test+=[words]

    # test_X = vectorizer.fit_transform(df_test)
    # X_test_tfidf = tfidf_transformer.fit_transform(test_X)
    # predicted = model.predict(X_test_tfidf)
    # print(clf.score(X_test_tfidf, y_test))
    predicted = []
    for t in df_test:
        test_X = vectorizer.transform([t])
        X_test_tfidf = tfidf_transformer.transform(test_X)
        predicted.append(clf.predict(X_test_tfidf).round())

    scores = accuracy_score(y_test, predicted)
    print("Test Data Accuracy ", scores)
Example #24
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline


# `tokenize` is a project-specific tokenizer defined elsewhere
def build_model():
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf',
                          MultiOutputClassifier(RandomForestClassifier()))])
    return pipeline
Example #25
File: p2f.py Project: kvwwang/UCLA
        eight_test.target[k] = 0
    else:
        eight_test.target[k] = 1

eight_train.target_names = ['c','r']
eight_test.target_names = ['c','r']        
            
#%%
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import text

stop_words = text.ENGLISH_STOP_WORDS
vectorizer = CountVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(eight_train.data)
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X)

#%%

from sklearn.decomposition import TruncatedSVD

svd_model = TruncatedSVD(n_components=50, random_state=42)
X_svd = svd_model.fit_transform(X_tfidf)


#%%
from sklearn.svm import LinearSVC

LSVM = LinearSVC(loss='hinge')
X_LSVM = LSVM.fit(X_svd,eight_train.target)
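A hedged continuation, not in the original excerpt: applying the same count, tf-idf, and SVD transforms to the test split and scoring the linear SVM:

from sklearn.metrics import accuracy_score

X_test_counts = vectorizer.transform(eight_test.data)
X_test_tfidf = tfidf.transform(X_test_counts)
X_test_svd = svd_model.transform(X_test_tfidf)
predicted = LSVM.predict(X_test_svd)
print(accuracy_score(eight_test.target, predicted))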
Example #26
    # Instantiating a PorterStemmer object
    porter = PorterStemmer()
    token_words = word_tokenize(nopunc)
    stem_message = []
    for word in token_words:
        stem_message.append(porter.stem(word))
        stem_message.append(" ")
    return ''.join(stem_message)


sms['clean_message'] = sms['message'].apply(text_process)

X = sms.clean_message
y = sms.label_num

pipe = Pipeline([('bow', CountVectorizer()), ('tfid', TfidfTransformer()),
                 ('model', LogisticRegression(solver='liblinear'))])

pipe.fit(X, y)


@app.route('/')
def home():
    return render_template('home.html')


@app.route('/Predict', methods=['POST'])
def Predict():

    if request.method == 'POST':
        message = request.form['message']
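        # The handler is truncated in the excerpt; a hedged sketch of a typical ending,
        # assuming a 'result.html' template (the template name is an assumption):
        my_prediction = pipe.predict([message])
        return render_template('result.html', prediction=my_prediction[0])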
Example #27
def calTFidf(text):
    vectorizer = CountVectorizer(lowercase=True)
    wordcount = vectorizer.fit_transform(text)
    tf_idf_transformer = TfidfTransformer()
    tfidf_matrix = tf_idf_transformer.fit_transform(wordcount)
    return vectorizer, tfidf_matrix
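For context, a minimal usage sketch of the function above (the documents are placeholders):

vectorizer, tfidf_matrix = calTFidf(["first example document", "second example document text"])
print(tfidf_matrix.shape)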
Example #28
    def transformer(self, matrixList):
        trans = TfidfTransformer()
        counts = array(matrixList)
        tfidf = trans.fit_transform(counts)
        # print type(tfidf)
        return tfidf
Example #29
def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary_["pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # check that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert_equal(counts_test[0, vocabulary["salad"]], 1)
        assert_equal(counts_test[0, vocabulary["tomato"]], 1)
        assert_equal(counts_test[0, vocabulary["water"]], 1)

        # stop word from the fixed list
        assert_false("the" in vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction
        # artifacts)
        assert_false("copyright" in vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, vocabulary["coke"]], 0)
        assert_equal(counts_test[0, vocabulary["burger"]], 0)
        assert_equal(counts_test[0, vocabulary["beer"]], 0)
        assert_equal(counts_test[0, vocabulary["pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert_equal(len(t1.idf_), len(v1.vocabulary_))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_)))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert_false(hasattr(t2, "idf_"))

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    assert_raises(ValueError, t3.transform, counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5], [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3], [1, 3]]
    assert_raises(ValueError, t3.transform, X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert_false(tv.fixed_vocabulary_)
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    assert_equal(v3.build_preprocessor(), strip_accents_ascii)

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    assert_raises(ValueError, v3.build_preprocessor)

    # error with bad analyzer type
    v3.set_params(analyzer='_invalid_analyzer_type_')
    assert_raises(ValueError, v3.build_analyzer)
Example #30
                                                    df.target,
                                                    test_size=0.2,
                                                    random_state=42)
#---------------------------------------- Training the models ---------------------------------------------
#Instead of performing each task separately, a pipeline chains them (training, prediction, transformation, ...)
from sklearn.neighbors import KNeighborsClassifier


from sklearn.base import TransformerMixin


#Helper transformer to adapt KNN to the pipeline (converts the sparse features to dense)
class DenseTransformer(TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self


# Build a pipeline chaining the three steps (the word counter, the tf-idf transformer and the model to use)
pipe = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                 ('model',
                  OneVsRestClassifier(KNeighborsClassifier(n_neighbors=3)))])
# Train the model
model = pipe.fit(X_train, y_train)
prediction = model.predict(X_test)
filename = 'finalized_model.sav'
#save the model
pickle.dump(model, open(filename, 'wb'))
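A short hedged follow-up, not part of the original, showing how the saved model would typically be loaded back and scored on the held-out split:

loaded_model = pickle.load(open(filename, 'rb'))
print(loaded_model.score(X_test, y_test))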