Exemple #1
0
def getSVDChart(sampleSize=28000):
    """Chart a truncated-SVD projection of a random training subsample.

    The training texts are converted to token-count vectors with
    ``CountVectorizer``, reduced to 5 components with ``TruncatedSVD`` and
    passed, together with their labels, to ``showChart``.

    Args:
        sampleSize: Maximum number of training rows drawn at random.
            Defaults to 28000, the value that used to be hard-coded. It is
            capped at the size of the data set because ``random.sample``
            raises ``ValueError`` when asked for more items than exist.
    """
    data_set = DataSet()
    # The third returned value is not used by this chart.
    data, label, _ = data_set.get_train_data_set()

    # Never request more rows than are available; with a smaller data set
    # this simply yields every row in random order.
    sample_count = min(sampleSize, len(data))
    indexs = random.sample(range(len(data)), sample_count)
    # NOTE(review): indexing with a list of positions presumes NumPy-style
    # arrays for data and label -- confirm against DataSet.
    data = data[indexs]
    label = label[indexs]

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(data)

    truncatedSVD = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
    truncatedSVD = truncatedSVD.fit(X_train_counts)
    X_r = truncatedSVD.transform(X_train_counts)

    # NOTE(review): the title says "PCA" although the projection above is a
    # TruncatedSVD; the string is kept unchanged on purpose.
    showChart(X_r, label, "PCA metric graph", len(X_r), 5000)
Exemple #2
0
def getChiChart(numComp=5, sampleSize=10000):
    """Chart the top chi-squared features of a random training subsample.

    The training texts are converted to token-count vectors with
    ``CountVectorizer``; ``SelectKBest(chi2, k=numComp)`` is fitted against
    the labels and the selected columns are passed to ``showChart``. The
    shape and values of the fitted ``scores_`` array are printed first.

    Args:
        numComp: Number of features kept by ``SelectKBest`` (its ``k``).
        sampleSize: Maximum number of training rows drawn at random.
            Defaults to 10000, the value that used to be hard-coded. It is
            capped at the size of the data set because ``random.sample``
            raises ``ValueError`` when asked for more items than exist.
    """
    data_set = DataSet()
    # The third returned value is not used by this chart.
    data, label, _ = data_set.get_train_data_set()

    # Never request more rows than are available; with a smaller data set
    # this simply yields every row in random order.
    sample_count = min(sampleSize, len(data))
    indexs = random.sample(range(len(data)), sample_count)
    # NOTE(review): indexing with a list of positions presumes NumPy-style
    # arrays for data and label -- confirm against DataSet.
    data = data[indexs]
    label = label[indexs]

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(data)

    chi2_model = SelectKBest(chi2, k=numComp)
    chi2_model = chi2_model.fit(X_train_counts, label)
    X_new = chi2_model.transform(X_train_counts)

    # Diagnostic output: one chi-squared score per vectorizer feature.
    print(chi2_model.scores_.shape)
    print(chi2_model.scores_)
    showChart(x=X_new,
              y=label,
              title="Chi squared",
              range_=X_new.shape[0],
              limit=3000)
Exemple #3
0
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from data import DataSet
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from conf_lib import plot_confusion_matrix
from sklearn.decomposition import TruncatedSVD
import random

# Load the training set; the third returned value is bound to class_names.
data_set = DataSet()
data, label, class_names = data_set.get_train_data_set()

# Work on a random subsample of at most 50000 rows. The size is capped at
# len(data) because random.sample raises ValueError when asked for more
# items than the population holds.
indexs = random.sample(range(len(data)), min(50000, len(data)))
# NOTE(review): indexing with a list of positions presumes NumPy-style
# arrays for data and label -- confirm against DataSet.
data = data[indexs]
label = label[indexs]

# Hold out 33% of the subsample for testing; the split is reproducible
# thanks to the fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    label,
                                                    test_size=0.33,
                                                    random_state=42)

# Pipeline stages: token counts -> 10-component truncated SVD -> Gaussian
# naive Bayes classifier.
est = [('count_vect', CountVectorizer()),
       ('tr', TruncatedSVD(n_components=10, n_iter=100, random_state=42)),
       ('clf_NB', GaussianNB())]

pipeline_NB = Pipeline(est)

# Fit every stage on the training portion only.
pipeline_NB = pipeline_NB.fit(X_train, y_train)