Example no. 1
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from scipy.sparse import hstack

df=pd.read_csv('salary-train.csv')
df_test=pd.read_csv('salary-test-mini.csv')
target=df['SalaryNormalized']
df['FullDescription'] = df['FullDescription'].str.lower()
df['FullDescription'] = df['FullDescription'].replace('[^a-zA-Z0-9]', ' ', regex = True)
df_test['FullDescription'] = df_test['FullDescription'].str.lower()
df_test['FullDescription'] = df_test['FullDescription'].replace('[^a-zA-Z0-9]', ' ', regex = True)
df=df.drop(['SalaryNormalized'], axis=1)
df_test=df_test.drop(['SalaryNormalized'], axis=1)



vectorizer = TfidfVectorizer(min_df=5)
X = vectorizer.fit_transform(df['FullDescription'])
X_test=vectorizer.transform(df_test['FullDescription'])



enc = DictVectorizer()
# fill missing contract types with a placeholder category before one-hot encoding
df['ContractTime'] = df['ContractTime'].fillna('sad23')
df_test['ContractTime'] = df_test['ContractTime'].fillna('sad23')
X_train_categ = enc.fit_transform(df[['LocationNormalized', 'ContractTime']].to_dict('records'))
X_test_categ=   enc.transform(df_test[['LocationNormalized', 'ContractTime']].to_dict('records'))
matr=hstack([X,X_train_categ])
matr1=hstack([X_test,X_test_categ])
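# Hypothetical completion (the snippet imports Ridge but never uses it): train a
# ridge regressor on the combined sparse features and predict salaries for the
# mini test set. alpha=1 is an assumed setting, not taken from the snippet.
model = Ridge(alpha=1)
model.fit(matr, target)
print(model.predict(matr1))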
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

captions = []
with open("cap.txt", encoding="utf8") as caption_file:
    for caption in caption_file:
        captions.append(caption.split(' ', 1)[1])

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(captions)

true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()  # get_feature_names() in older scikit-learn
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

print("\n")
print("Prediction")

Y = vectorizer.transform(["FLOOD WATERS CHURNING JUST UNDER THE DUCK RIVER BRIDGE ON HIGHWAY 50 NEAR I-40 EXIT 148."])
prediction = model.predict(Y)
print("FLOOD WATERS CHURNING JUST UNDER THE DUCK RIVER BRIDGE ON HIGHWAY 50 NEAR I-40 EXIT 148.")
print(prediction)
Example no. 3
def create_word_tf_idf_for_ip(df, whole_df):
    """ word level tf-idf """
    tfidf_vector = TfidfVectorizer(analyzer='word', max_features=5000)
    tfidf_vector.fit(whole_df['text'])
    x_test_tfidf = tfidf_vector.transform(df['text'])
    return x_test_tfidf
import pickle

words_file = "../text_learning/your_word_data.pkl"  # assumed path; not defined in the original snippet
authors_file = "../text_learning/your_email_authors.pkl"
word_data = pickle.load(open(words_file, "rb"))
authors = pickle.load(open(authors_file, "rb"))

### test_size is the percentage of events assigned to the test set (the
### remainder go into training)
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
#from sklearn import cross_validation
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
from sklearn import tree
from sklearn.metrics import accuracy_score

clf = tree.DecisionTreeClassifier()
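# Hypothetical completion (the snippet stops after defining the classifier):
# fit the deliberately overfit tree and score it on the held-out set.
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("accuracy:", accuracy_score(labels_test, pred))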
def tfidf_vec(corpus):
    tfidf = TfidfVectorizer()
    train_vec = tfidf.fit_transform(corpus)
    # for test data
    # tfidf.transform(['ya Allah meri sister Affia ki madad farma', 'khud chahta a is umar main shadi'])
    return train_vec, tfidf
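# Example usage with a hypothetical corpus: the returned vectorizer applies the
# vocabulary learned from the training corpus to unseen text.
train_vec, tfidf = tfidf_vec(['first training document', 'second training document'])
test_vec = tfidf.transform(['an unseen test document'])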
Example no. 6
# train
call_txt_tr = []
for i, row in trainLabels.iterrows():
    call_txt_tr.append(' '.join(
        get_call_list('{}stdcall_grepper/'.format(feats_path) + row['Id'])))

# test
call_txt_te = []
for i, row in sampleSubmission.iterrows():
    call_txt_te.append(' '.join(
        get_call_list('{}stdcall_grepper/'.format(feats_path) + row['Id'])))

logging.info('-> vectorizing...')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vect = TfidfVectorizer(max_features=10000)
vect.fit(call_txt_tr + call_txt_te)
X_call_tr = vect.transform(call_txt_tr)
X_call_te = vect.transform(call_txt_te)

logging.info('-> apply NMF...')
from sklearn.decomposition import TruncatedSVD, NMF
from scipy import sparse

nmf = NMF(n_components=10)  # the 'sparseness' option was removed from scikit-learn's NMF
nmf.fit(sparse.vstack([X_call_tr, X_call_te]))
X_calls_nmf_tr = nmf.transform(X_call_tr)
X_calls_nmf_te = nmf.transform(X_call_te)

# funcs

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer

clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])
clf_2 = Pipeline([
    ('vect', HashingVectorizer(alternate_sign=False)),  # non_negative=True in older scikit-learn
    ('clf', MultinomialNB()),
])
clf_3 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])


clfs = [clf_1, clf_2, clf_3]
"""
AMHPC
# if defined (_OPENMP)
    pragma omp parallel for
# endif
"""
for clf in clfs:
    evaluate_cross_validation(clf, X_train, y_train, 5)

Example no. 8
        if word in stopwords:
            continue
        else:
            newSent.append(word)

    return newSent


'''
word2vec by tfidf
'''
corpus = []
for i in range(len(mydata)):
    corpus.append(' '.join(sent2word(mydata['data'][i])))

vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b')
tfidf = vectorizer.fit_transform(corpus)
print('number of documents: %d' % tfidf.shape[0])
print('vocabulary size: %d' % tfidf.shape[1])

X = np.array(tfidf.todense())  # convert the sparse matrix to a dense array
y = np.array(mydata['label'])
y_crowd = np.array(mydata['crowd3'])
print(X.shape)
'''
shuffle the order of the data
'''
rng = np.random.RandomState(0)  # create a random number generator
indices = np.arange(len(mydata))  # index array covering the whole dataset
rng.shuffle(indices)  # shuffle the indices in place
X = X[indices]
Example no. 9
    explained_var.append(pca.explained_variance_ratio_.sum())
    if pca.explained_variance_ratio_.sum() > 0.6:
        break

pca = PCA(n_components=260)
pca.fit(X_train)
pca_test = pca.transform(X_test)
pca_train = pca.transform(X_train)
model = KMeans(n_clusters=5, max_iter=100)
clustered = model.fit(pca_train)
labels_pred = model.predict(pca_test)
metrics.fowlkes_mallows_score(y_test, labels_pred)

#Create  TF-IDF with no dimensionality reduction
data_Tfidf = pd.Series([' '.join(doc) for doc in data])
vectorizer = TfidfVectorizer()
data_Tfidf = vectorizer.fit_transform(data_Tfidf).toarray()

vectorizer = TfidfVectorizer()
data_Tfidf = vectorizer.fit_transform(data).toarray()

#Split train test
stratSplit = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
for train_index, test_index in stratSplit.split(data_Tfidf, label):
    X_train, X_test = data_Tfidf[train_index], data_Tfidf[test_index]
    y_train, y_test = label[train_index], label[test_index]

#Kmeans on TF-IDF with no dimensionality reduction
model = KMeans(n_clusters=5, max_iter=100)
clustered = model.fit(X_train)
labels_pred = model.predict(X_test)
def hierarchical_clustering(datasetDir, preprocessing, distance):

    all_data = datasets.load_files(datasetDir,
                                   description=None,
                                   load_content=True,
                                   encoding='utf-8',
                                   shuffle=False)

    prefix = ClusteringDir2 + "\\" + preprocessing + "_" + distance
    """
    Apply Tf-idf vectorizer with stop words
    """
    count_vectorizer = TfidfVectorizer(stop_words='english')
    """
    Learn vocabulary and tf-idf, return term-document matrix.
    """
    X = count_vectorizer.fit_transform(raw_documents=all_data.data).toarray()
    """
    Apply Dimensionality reduction using truncated SVD (aka LSA).
    """
    svd = TruncatedSVD(n_components=200)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    data = lsa.fit_transform(X)

    labels = plot_clusters(data, cluster.AgglomerativeClustering,
                           preprocessing, distance, (), {
                               'n_clusters': n_clusters,
                               'linkage': 'complete',
                               'affinity': distance
                           })

    clustering_file = prefix + '_clustering_result.data'
    clustering_file_handle = open(clustering_file, 'w', encoding='utf-8')

    for index in range(0, 10):
        sub_corpus = []
        corpus_files = []
        # cluster_label = []
        for key, label in enumerate(labels):
            if index == label:
                sub_corpus.append(all_data.data[key])
                corpus_files.append(all_data.filenames[key])

        top_n_words, _ = get_top_n_words_n_que(sub_corpus, 10)
        top_n_words = np.array(top_n_words)

        clustering_file_handle.write("cluster %d, label %s\n" %
                                     (index, list(top_n_words[:, 0])))
        for file in corpus_files:
            clustering_file_handle.write("\t %s\n" % file)
            # print(all_data.filenames[key])

    clustering_file_handle.close()

    # Plot silhouette score
    sil = []
    for n_cluster in range(4, 30):
        model = cluster.AgglomerativeClustering(n_clusters=n_cluster,
                                                affinity=distance,
                                                linkage='complete').fit(X)
        labels = model.labels_
        model.__dict__
        sil.append(silhouette_score(X, labels, metric='euclidean'))
        # model = KMeans(random_state=42, n_clusters=n_cluster)
        # Svisualizer = SilhouetteVisualizer(model)
        # Svisualizer.fit(X)    # Fit the data to the visualizer
        # Svisualizer.poof()    # Draw/show/poof the data
        # plt.
    plt.plot(list(range(4, 30)), sil)
    plt.grid(True)
    plt.savefig(prefix + "_sihouette_score.png")
    plt.close()
    """
    Plot the hierarchical clustering as a dendrogram
    """

    plt.figure(figsize=(10, 7))
    plt.title("Hierarchical Clustering Dendograms")
    dend = shc.dendrogram(shc.linkage(data, method='complete'))
    plt.savefig(prefix + "_hierarchical clustering_Dendograms.png")
    plt.close()

    top_n_words, que_n_words = get_top_n_words_n_que(all_data.data,
                                                     top_word_count)
    """
    Output n top frequent words into file
    """

    top_n_words_file = prefix + "top_n_words.data"
    out_filepath_handle = open(top_n_words_file, "w")
    word_names = []
    word_freqs = []
    reverse_freqs = []
    for word in top_n_words:
        word_names.append(word[0])
        word_freqs.append(word[1])
        reverse_freqs.append(word[1])
        out_filepath_handle.write(str(word) + '\n')
    out_filepath_handle.close()
    """
    visualize the n top frequent words 
    """
    index = np.arange(0, top_word_count)
    reverse_freqs.reverse()
    word_names.reverse()
    plt.barh(index, reverse_freqs)
    plt.yticks(index, word_names)
    plt.title(str(top_word_count) + " top frequent words")
    plt.ylabel("words")
    plt.xlabel("frequency")
    plt.savefig(prefix + "_" + str(top_word_count) + " top frequent words.png")
    plt.close()

    word_freqs = np.array(reverse_freqs)
    cooccurrence_matrix = np.outer(word_freqs, word_freqs)

    ax = sns.heatmap(cooccurrence_matrix, linewidth=0.1)
    plt.yticks(index, word_names, rotation='horizontal')
    plt.xticks(index, word_names, rotation='vertical')
    plt.title("Words Co-occurrence")
    plt.savefig(prefix + "_Co-occurrence.png")
    plt.close()
Example no. 11
df_test = pd.read_csv(os.path.join(DATA_PATH, 'q_test.csv'))
df_val = df_test.head(100)
df_test = df_test.drop(df_test.head(100).index)  # Not to use the validation data used in 5.1 for model selection
dict_latent_traits = pickle.load(open(os.path.join(DATA_PATH, 'known_latent_traits.p'), "rb"))

# define latent traits calibrator (known latent traits)
latent_traits_calibrator = KnownParametersCalibrator(dict_latent_traits)

for min_df in np.arange(0.00, 0.11, 0.02):
    for max_df in np.arange(0.90, 1.01, 0.02):

        file = open("outputs/5_1_model_selection_LR_mindf_%.2f_maxdf_%.2f.txt" % (min_df, max_df), 'w')
        file.write("MIN_DF = %.2f - MAX DF = %.2f" % (min_df, max_df))

        # pipeline difficulty
        vec_b = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        pipe_b = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_b, concatenate_correct=True, concatenate_wrong=True),
                LinguisticFeaturesComponent(),
                ReadabilityFeaturesComponent(),
            ]),
            RegressionModule([
                SklearnRegressionComponent(LinearRegression(), latent_trait_range=B_RANGE)
            ])
        )
        # pipeline discrimination
        vec_a = TfidfVectorizer(stop_words='english', preprocessor=preproc, min_df=min_df, max_df=max_df)
        pipe_a = FeatureEngAndRegressionPipeline(
            FeatureEngineeringModule([
                IRFeaturesComponent(vec_a, concatenate_correct=True, concatenate_wrong=True),
Example no. 12
vectorizer = CountVectorizer(max_features=2000,
                             min_df=3,
                             max_df=0.6,
                             stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()

# BoW to TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X = transformer.fit_transform(X).toarray()

# TF-IDF Vectorizer
# replaces the separate CountVectorizer + TfidfTransformer steps above
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=2000,
                             min_df=3,
                             max_df=0.6,
                             stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()

# creating training and test set
from sklearn.model_selection import train_test_split
text_train, text_test, sent_train, sent_test = train_test_split(X,
                                                                y,
                                                                test_size=0.2,
                                                                random_state=0)

# train data with logistic regression
# from sklearn.linear_model import LogisticRegression
# classifier = LogisticRegression()
# classifier.fit(text_train, sent_train)
seed = 1024
np.random.seed(seed)
from config import path

ft = ['question1_distinct_unigram_question2_distinct_unigram']
train = pd.read_csv(path+"train_cooccurrence_distinct.csv")[ft]
test = pd.read_csv(path+"test_cooccurrence_distinct.csv")[ft]

len_train = train.shape[0]

max_features = None
ngram_range = (1,1)
min_df = 3
print('Generate tfidf')
feats= ['question1_distinct_unigram_question2_distinct_unigram']
vect_orig = TfidfVectorizer(max_features=max_features,ngram_range=ngram_range, min_df=min_df)

corpus = []
for f in feats:
    train[f] = train[f].astype(str)
    test[f] = test[f].astype(str)
    corpus+=train[f].values.tolist()

vect_orig.fit(corpus)

for f in feats:
    train_tfidf = vect_orig.transform(train[f].values.tolist())
    test_tfidf = vect_orig.transform(test[f].values.tolist())
def tf_idf(message):
    tfidfVectorizer = TfidfVectorizer()
    term_matrix = tfidfVectorizer.fit_transform(message)
    pd.set_option('display.max_columns', None)
    features = pd.DataFrame(term_matrix.toarray(), columns=tfidfVectorizer.get_feature_names_out())  # get_feature_names() in older scikit-learn
    return features, tfidfVectorizer
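# Example usage with hypothetical messages: inspect the per-term tf-idf weights.
features, vec = tf_idf(["free prize waiting for you", "meeting rescheduled to noon"])
print(features.head())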
Example no. 15
def TFIDF(X_train, X_test, MAX_NB_WORDS=75000):
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    print("tf-idf with", str(np.array(X_train).shape[1]), "features")
    return (X_train, X_test)
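# Example usage with hypothetical document lists: the vectorizer is fit on the
# training texts only, and the same vocabulary is applied to the test texts.
X_train, X_test = TFIDF(["some training text", "another training document"],
                        ["an unseen test document"])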
Example no. 16
train_words_list=train_words_list1+train_words_list2+train_words_list3+train_words_list4
train_labels=train_labels1+train_labels2+train_labels3+train_labels4

# test data
test_words_list1, test_labels1 = loadfile('data/test/女性', '女性')
test_words_list2, test_labels2 = loadfile('data/test/体育', '体育')
test_words_list3, test_labels3 = loadfile('data/test/文学', '文学')
test_words_list4, test_labels4 = loadfile('data/test/校园', '校园')


test_words_list=test_words_list1+test_words_list2+test_words_list3+test_words_list4
test_labels=test_labels1+test_labels2+test_labels3+test_labels4

stop_words = open('stopword.txt', 'r', encoding='utf-8').read()
# strip the BOM (\ufeff) that may precede the first entry
stop_words = stop_words.encode('utf-8').decode('utf-8-sig')
# split on the newline separator
stop_words = stop_words.split('\n')
# compute word weights
tf = TfidfVectorizer(stop_words=stop_words, max_df=0.5)

train_features = tf.fit_transform(train_words_list)
test_features = tf.transform(test_words_list)

# multinomial naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=0.001).fit(train_features, train_labels)
predicted_labels = clf.predict(test_features)
# compute accuracy
print('Accuracy:', metrics.accuracy_score(test_labels, predicted_labels))
Example no. 17
def main():

    print("Reading Training Data")
    training = read_training_data("../data/train_with_test.csv")
    nb = MultinomialNB()

    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                       token_pattern=r'\b\w+\b',  # ur'' literals are invalid in Python 3
                                       stop_words=None,
                                       min_df=3)
    tfidf_features = tfidf_vectorizer.fit_transform(training["data"])

    lr = LogisticRegression(C=.1,
                            class_weight=None,
                            dual=False,
                            fit_intercept=True,
                            intercept_scaling=1,
                            penalty='l2',
                            tol=0.0001)
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                        token_pattern=r'\b\w+\b',
                                        stop_words=None,
                                        min_df=3,
                                        binary=True)
    text_features = bigram_vectorizer.fit_transform(training["data"])

    feature_functions = [
        get_words_upper, get_personal_refs, get_word_count, get_common_insults,
        get_common_swear_words, get_letters_upper, get_exaggeration
    ]
    features = extract_features(training["data"], feature_functions)

    lr.fit(text_features, training["labels"])
    lr_preds = lr.predict_proba(text_features)

    nb.fit(tfidf_features, training["labels"])
    nb_preds = nb.predict_proba(tfidf_features)

    rf = RandomForestClassifier(n_estimators=500)
    rf.fit(features, training["labels"])
    rf_preds = rf.predict_proba(features)

    gb_features = numpy.empty((len(lr_preds), 3))
    for i in range(len(lr_preds)):
        gb_features[i][0] = (lr_preds[i][1])
        gb_features[i][1] = (rf_preds[i][1])

        gb_features[i][2] = (nb_preds[i][1])

    gb = GradientBoostingClassifier(n_estimators=200)
    gb.fit(gb_features, training["labels"])

    print("Reading Test Data")
    test = read_final_test_data("../data/impermium_verification_set.csv")
    text_features_test = bigram_vectorizer.transform(test["data"])

    tfidf_features_test = tfidf_vectorizer.transform(test["data"])

    features = extract_features(test["data"], feature_functions)
    lr_preds = lr.predict_proba(text_features_test)
    rf_preds = rf.predict_proba(features)

    nb_preds = nb.predict_proba(tfidf_features_test)

    gb_features = numpy.empty((len(lr_preds), 3))

    lr_pred = []
    rf_pred = []

    gb_pred = []
    nb_pred = []

    for i in range(len(lr_preds)):
        gb_features[i][0] = (lr_preds[i][1])
        gb_features[i][1] = (rf_preds[i][1])
        gb_features[i][2] = (nb_preds[i][1])

        lr_pred.append(lr_preds[i][1])
        rf_pred.append(rf_preds[i][1])

        nb_pred.append(nb_preds[i][1])

    predictions = gb.predict_proba(gb_features)

    output_file = "submission.csv"
    print("Writing submission to %s" % output_file)
    f = open(output_file, "w")
    f.write("id,insult\n")

    for i in range(len(test["data"])):
        f.write("%d,%f\n" % (test["ids"][i], predictions[i][1]))
        gb_pred.append(predictions[i][1])
    f.close()
Example no. 18
print(
    '\n-------------Naive Bayes Classification with BOW Vectorization-------------'
)
accuracyDict["BOW-NB"] = NaiveBayesClassification(trainX, trainY, testX, testY,
                                                  le)

print(
    '\n-------------K Nearest Neighbor Classification with BOW Vectorization-------------'
)
accuracyDict["BOW-knn"] = KnnClassification(trainX, trainY, testX, testY, le)
# endregion

#   - #### Tf-idf vectorization

# region
tfIdfVectorizer = TfidfVectorizer(max_features=1000)

trainX = tfIdfVectorizer.fit_transform(trainDataSet['CONTENT'])
testX = tfIdfVectorizer.transform(testDataSet['CONTENT'])

print('-------------SVM Classification with TfIdf Vectorization-------------')
accuracyDict["TfIdf-SVM"] = SvmClassification(trainX, trainY, testX, testY, le)

print(
    '\n-------------Random Forests Classification with TfIdf Vectorization-------------'
)
accuracyDict["TfIdf-RandomForests"] = RandomForestClassification(
    trainX, trainY, testX, testY, le)

print(
    '\n-------------Naive Bayes Classification with TfIdf Vectorization-------------'
Example no. 19
    return text

filt_X = raw_X.apply(filtering)
X = filt_X.apply(wordToken)

# Plotting histogram for the length of the comments
Len_X = X.apply(len)
plt.hist(Len_X, bins=50)
plt.show()
# print(Len_X)

# Splitting data into two sets: Train and Test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

# Transforming text to feature vectors using TfidfVectorizer
TV = TfidfVectorizer(max_features=5000)
X_train = TV.fit_transform(X_train)
X_test = TV.transform(X_test)

column_names = Y_train.columns

# Training model using training data and calculating the accuracy
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(C=12.0)

for x in column_names:
    target = Y_train[x]
    LR.fit(X_train, target)
    Y_pred = LR.predict(X_test)
    Accuracy = accuracy_score(Y_test[x], Y_pred)
    print("Accuracy for ", x, ":", Accuracy)
test_data          = pd.DataFrame()
test_data['Mail']  = X_test
test_data['Class'] = y_test

#test_data          = test_data.sample(frac = 1)
test_data.reset_index(inplace = True, drop = True)

############################################################################## Encoding y_train and y_test ############################################################

encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test  = encoder.transform(y_test)  # reuse the label mapping learned on y_train

# creating the tf_idf vectorizer
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=None)

tfidf_vect.fit(data['Mail'])
X_train_tfidf =  tfidf_vect.transform(X_train)
X_test_tfidf =  tfidf_vect.transform(X_test)


# creating a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(data['Mail'])


# transform the training and validation data using count vectorizer object
X_train_count =  count_vect.transform(X_train)
X_test_count =  count_vect.transform(X_test)
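# Hypothetical next step (not in the original fragment): compare a multinomial
# Naive Bayes classifier on the tf-idf and count feature sets built above.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

for name, X_tr, X_te in [("tfidf", X_train_tfidf, X_test_tfidf),
                         ("count", X_train_count, X_test_count)]:
    clf = MultinomialNB().fit(X_tr, y_train)
    print(name, "accuracy:", accuracy_score(y_test, clf.predict(X_te)))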
Example no. 21
 def train(self):
     vectorDimension = 200
     scriptDir = os.path.dirname(__file__)
     corpusPath = os.path.join(scriptDir, 'data', 'domain', '*.txt')
     listOfCorpusFiles = sorted(glob.glob(corpusPath))
     print("\tProcessing following Corpus files:",
           *listOfCorpusFiles,
           sep='\n\t')
     corpus = []
     faq = []
     for fileName in listOfCorpusFiles:
         corpusFile = codecs.open(fileName, 'r', encoding='utf-8')
         corpus.append(corpusFile.read())
     faqPath = os.path.join(scriptDir, 'data', 'faq', '*.txt')
     listOfFaqFiles = sorted(glob.glob(faqPath))
     print("\n\tProcessing following FAQ files:",
           *listOfFaqFiles,
           sep='\n\t')
     for fileName in listOfFaqFiles:
         faqFile = codecs.open(fileName, 'r', encoding='utf-8').read()
         i = 1
         for line in faqFile.split('\n'):
             if (line.count('?') > 1):
                 print(
                     "\tSEVERE:Found multiple questions in %s at line %d." %
                     (os.path.basename(fileName), i))
                 print("\tSEVERE:Aborting the process..!!!")
                 sys.exit("\tAborting...")
             line = line.replace('$', 'USD')
             line = line.replace('"', '\'')
             que, ans = line.split('?')
             corpus.append(que + ' ?')
             faq.append(line)
             i += 1
     print('\n\tTotal no of questions for training: %s' % len(corpus))
     stopListFile = os.path.join(scriptDir, 'data', 'dictionary',
                                 'stopwords_en.txt')
     arrayWords = []
     stopWords = []
     f = codecs.open(stopListFile, 'r', 'utf-8')
     lines = f.read().split("\n")
     for line in lines:
         if line != "":
             arrayWords.append(line.split(','))
     for a_word in arrayWords:
         for s_word in a_word:
             if (re.sub(' ', '', s_word)) != "":
                 stopWords.append(s_word)
     extraStopWords = set(stopWords)
     stops = set(stopwords.words('english')) | extraStopWords
     # Note: the original passed `corpus` positionally here, which misuses the
     # vectorizer's `input` parameter; the corpus is supplied to fit_transform below.
     tfidfVec = TfidfVectorizer(decode_error='ignore',
                                stop_words=stops,
                                ngram_range=(1, 5),
                                tokenizer=m.stemTokenize_2)
     trainsetIdfVectorizer = tfidfVec.fit_transform(corpus).toarray()
     vLength = len(trainsetIdfVectorizer[1])
     nDimension = vectorDimension
     if vLength <= vectorDimension:
         nDimension = vLength - 1
     svd = TruncatedSVD(n_components=nDimension,
                        algorithm='randomized',
                        n_iter=15,
                        random_state=42)
     trainLSA = svd.fit_transform(trainsetIdfVectorizer)
     picklePath = os.path.join(scriptDir, 'model')
     fileName = os.path.join(picklePath, 'corpus.m')
     fileObject = open(fileName, 'wb')
     pickle.dump(corpus, fileObject)
     fileObject.close()
     fileName = os.path.join(picklePath, 'faq.m')
     fileObject = open(fileName, 'wb')
     pickle.dump(faq, fileObject)
     fileObject.close()
     fileName = os.path.join(picklePath, 'tfidfVec.m')
     fileObject = open(fileName, 'wb')
     pickle.dump(tfidfVec, fileObject)
     fileObject.close()
     fileName = os.path.join(picklePath, 'svd.m')
     fileObject = open(fileName, 'wb')
     pickle.dump(svd, fileObject)
     fileObject.close()
     fileName = os.path.join(picklePath, 'trainLSA.m')
     fileObject = open(fileName, 'wb')
     pickle.dump(trainLSA, fileObject)
     fileObject.close()
    msg_with_removed_num = ''.join([char for char in msg_with_removed_punc if char not in '1234567890'])
    #convert from uppercase to lowercase
    msg_aftr_converted_to_Lowercase = ''.join([char.lower() for char in msg_with_removed_num])
    #lemmatization
    lem_word_tokens = nltk.word_tokenize(msg_aftr_converted_to_Lowercase)
    # join with spaces so the text can still be tokenized in the following steps
    lemmatized_message = ' '.join([wordnet_lemmatizer.lemmatize(word) for word in lem_word_tokens])
    #stemming
    stemming_word_tokens = nltk.word_tokenize(lemmatized_message)
    stemmed_message = ' '.join([snowball_stemmer.stem(word) for word in stemming_word_tokens])
    #stop words
    stopwords_tokens = nltk.word_tokenize(stemmed_message)
    msg_with_removed_stopwords = ' '.join([word for word in stopwords_tokens if word not in stopwords.words('english')])
    return msg_with_removed_stopwords

from sklearn.feature_extraction.text import TfidfVectorizer    #for feature extraction
vectorizer_new = TfidfVectorizer(analyzer = cleaning_data)
spam_tfidfvectorizer_new  = vectorizer_new.fit_transform(spam_data["v2"])

X = spam_tfidfvectorizer_new
y = spam_data['v1']

#test_train_split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)


from sklearn.naive_bayes import GaussianNB

NB_classifier = GaussianNB()
NB_classifier_model = NB_classifier.fit(X_train.toarray(),y_train)
Example no. 23
    print(np.array(labelList))
        
    
def main():
    process_resume_list()


save_model = 'finalized_model.sav'
save_vector = 'finalized_vectorizer.sav'

if __name__ == '__main__':
    main()

    label=np.array(labelList)
    
    vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',max_features=250)
    resumes_train,resumes_test,y_train,y_test=train_test_split(resume_list,label,test_size=0.33,random_state=1)
    X_train = vectorizer.fit_transform(resumes_train)
    X_test = vectorizer.transform(resumes_test)  # reuse the vocabulary fitted on the training résumés
    
    X_train_array = X_train.toarray()
    X_test_array  = X_test.toarray()
    y_test1=y_test.reshape(-1,1)
    
    
    print(vectorizer.get_feature_names_out())  # get_feature_names() in older scikit-learn
    pickle.dump(vectorizer, open(save_vector, 'wb'))
    
    #Implementing Bernoulli Naive Bayes
    naive_bayes = BernoulliNB(alpha=1.0)
    naive_bayes.fit(X_train_array, y_train)
    test_labels = []
    for curr_class in classes:
        dirname = os.path.join(data_dir, curr_class)
        for fname in os.listdir(dirname):
            with open(os.path.join(dirname, fname), 'r') as f:
                content = f.read()
                if fname.startswith('cv9'):
                    test_data.append(content)
                    test_labels.append(curr_class)
                else:
                    train_data.append(content)
                    train_labels.append(curr_class)

    # Create feature vectors
    vectorizer = TfidfVectorizer(min_df=5,
                                 max_df = 0.8,
                                 sublinear_tf=True,
                                 use_idf=True)
    train_vectors = vectorizer.fit_transform(train_data)
    test_vectors = vectorizer.transform(test_data)

    clf = MultinomialNB()
    t0 = time.time()
    clf.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction = clf.predict(test_vectors)


    t2 = time.time()
    time_train = t1-t0
    time_predict = t2-t1
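# Hypothetical evaluation step (the original fragment stops after timing):
# report the timings and the classification quality on the held-out reviews.
from sklearn.metrics import classification_report
print("training time: %fs, prediction time: %fs" % (time_train, time_predict))
print(classification_report(test_labels, prediction))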
Example no. 25
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from joblib import dump

data = fetch_20newsgroups()

categories = ['comp.windows.x', 'misc.forsale', 'rec.motorcycles']
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(train.data, train.target)
labels = model.predict(test.data)
test.target[0:10]
n = len(test.data)
acc = [ 1 for i in range(n) if test.target[i] == labels[i] ]
print(f'Acc : {sum(acc)*100/n} %')
dump(model, 'chatgroup.model')
Acc = sum(acc)*100/n
dump(Acc, 'acc.model')
# Features which are passwords
features = data.values[:, 1].astype('str')

# Labels which are strength of password
labels = data.values[:, -1].astype('int')

# Splitting the dataset into the training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size=0.25,
                                                    random_state=0)

# Sequentially apply a list of transforms and a final estimator
classifier_model = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char')),
    ('logisticRegression',
     LogisticRegression(multi_class='multinomial', solver='sag')),
])

# Fit the Model
classifier_model.fit(X_train, y_train)

y_pred = classifier_model.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)
print(classification_report(y_test, y_pred, digits=4))
print("Confusion Matrix: \n", cm)
accuracy = (cm[0][0] + cm[1][1] +
            cm[2][2]) / (cm[0][0] + cm[0][1] + cm[0][2] + cm[1][0] + cm[1][1] +
                         cm[1][2] + cm[2][0] + cm[2][1] + cm[2][2])
from sklearn.naive_bayes import MultinomialNB

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print("MLCOMP_DATASETS_HOME not set; please follow the above instructions")
    sys.exit(0)

# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', 'train')
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')  # charset='latin1' in very old scikit-learn
X_train = vectorizer.fit_transform(
    (open(f).read() for f in news_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target

print("Loading 20 newsgroups test set... ")
news_test = load_mlcomp('20news-18828', 'test')
t0 = time()
print("done in %fs" % (time() - t0))

print("Predicting the labels of the test set...")
print("%d documents" % len(news_test.filenames))
print("%d categories" % len(news_test.target_names))
Example no. 28
import numpy as np
from time import time
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from dabl.search import GridSuccessiveHalving

data_train = fetch_20newsgroups(subset="train")
data_test = fetch_20newsgroups(subset="test")

pipe = Pipeline([('vect', CountVectorizer()), ('clf', LogisticRegression())])
param_grid = {
    'vect': [TfidfVectorizer(), CountVectorizer()],
    'clf__C': np.logspace(-3, 3, 7),
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]}
print("Parameter grid:")
print(param_grid)

sh = GridSuccessiveHalving(pipe, param_grid, cv=5)
print("Start successive halving")
tick = time()
sh.fit(data_train.data, data_train.target)
print("Training Time Successive Halving", time() - tick)
print("Test Score Successive Halving: ",
      sh.score(data_test.data, data_test.target))
print("Parameters Successive Halving: ", sh.best_params_)

gs = GridSearchCV(pipe, param_grid, cv=5)
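# Hypothetical continuation mirroring the successive-halving block above: fit the
# full grid search for comparison and report its timing and test score.
tick = time()
gs.fit(data_train.data, data_train.target)
print("Training Time Grid Search", time() - tick)
print("Test Score Grid Search: ", gs.score(data_test.data, data_test.target))
print("Parameters Grid Search: ", gs.best_params_)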
                                                random_state=2020, 
                                                test_size=0.1, shuffle=True)

def multiclass_logloss(actual, predicted, eps=1e-15):
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota
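# Tiny worked example with hypothetical values: two samples, three classes,
# where sample 0 is class 1 and sample 1 is class 2.
# -0.5 * (log(0.8) + log(0.3)) ~= 0.714
example_actual = np.array([1, 2])
example_pred = np.array([[0.1, 0.8, 0.1],
                         [0.3, 0.4, 0.3]])
print(multiclass_logloss(example_actual, example_pred))  # ~0.714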

tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=4000)
tfidf_vectorizer.fit(list(x_train) + list(x_val))
xtrain = tfidf_vectorizer.transform(x_train)
xval = tfidf_vectorizer.transform(x_val)

xtest = tfidf_vectorizer.transform(test.clean_script.values)

xtrain.shape, xval.shape

model = LogisticRegression()
model.fit(xtrain, ytrain)

pred_prob = model.predict_proba(xval)
print(multiclass_logloss(yval, pred_prob))

pred_test = model.predict_proba(xtest)
Example no. 30
def main():
    # loading  Data

    category = ['Accepted', 'Rejected']
    doc_to_data = skd.load_files('Dataset/',
                                 description=None,
                                 categories=category,
                                 load_content=True,
                                 encoding='ISO-8859-1',
                                 random_state=24)

    # print(doc_to_data.data)
    # print(doc_to_data.target)
    X_train, X_test, y_train, y_test = train_test_split(doc_to_data.data,
                                                        doc_to_data.target,
                                                        test_size=0.05,
                                                        random_state=24)
    # Splitting Data
    zippedList = list(zip(X_train, y_train))
    df = pd.DataFrame(zippedList, columns=['Isnad', 'Class'])

    custom_stop_words = [
        'Tell', 'Tell us', 'Narrated', 'Messenger', 'Prophet', 'Aisha',
        'Division', 'Allah', 'God', 'Lord', 'Allaah', 'He', 'She', 'A', 'They',
        '(h)', 'We', 'It'
    ]

    vector = TfidfVectorizer(encoding='ISO-8859-1',
                             lowercase=False,
                             preprocessor=preprocess,
                             tokenizer=tokenization,
                             min_df=2,
                             max_df=0.5,
                             stop_words=custom_stop_words,
                             sublinear_tf=True,
                             use_idf=True,
                             smooth_idf=True)

    #counts = vector.fit(X_train)
    # print(vector.get_feature_names())

    #transformer = TfidfTransformer(sublinear_tf=True, use_idf=True, smooth_idf=True)

    features = vector.fit_transform(X_train).toarray()
    labels = df.Class

    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        #SVC(C=1.0, kernel='linear', degree=3, gamma='auto'),
        naive_bayes.BernoulliNB(),
        SGDClassifier(loss='hinge',
                      penalty='l2',
                      alpha=1e-3,
                      random_state=42,
                      max_iter=5,
                      tol=None),
        LogisticRegression(penalty='l2',
                           solver='liblinear',
                           C=1,
                           class_weight='balanced',
                           random_state=24,
                           tol=0.000001),
        KNeighborsClassifier(n_neighbors=3)
    ]
    CV = 5
    cv_df = pd.DataFrame(index=range(CV * len(models)))
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model,
                                     features,
                                     labels,
                                     scoring='accuracy',
                                     cv=CV)
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((model_name, fold_idx, accuracy))
    cv_df = pd.DataFrame(entries,
                         columns=['model_name', 'fold_idx', 'accuracy'])

    print(cv_df.groupby('model_name').accuracy.mean())
    sns.boxplot(x='model_name', y='accuracy', data=cv_df)
    sns.stripplot(x='model_name',
                  y='accuracy',
                  data=cv_df,
                  size=8,
                  jitter=True,
                  edgecolor="gray",
                  linewidth=2)
    plot.show()