import numpy as np
import joblib  # sklearn.externals.joblib is deprecated; use the standalone package

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline

import utils


def load_predict(directories):
    """Load the persisted classifier and predict a label for every line
    of the documents found under the given directories."""
    clf = joblib.load('svm_trained.joblib')
    documents = utils.load_dirs_custom(directories)
    documents = utils.n_gram_documents_range(documents, 5, 6)
    documents = np.array(documents)
    predicted_lines = []
    for doc in documents:
        # Predict one document at a time; doc.data holds the document's lines.
        lines = clf.predict(doc.data)
        predicted_lines += list(lines)
    return predicted_lines
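# Hypothetical usage, once 'svm_trained.joblib' has been produced by the
# training run below (the directory list is illustrative):
#   predictions = load_predict(['./NON_PERSONAL_DATA'])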

documents = utils.load_dirs_custom([
    './SENSITIVE_DATA/html-tagged', './PERSONAL_DATA/html-tagged',
    './NON_PERSONAL_DATA'
])

documents = utils.n_gram_documents_range(documents, 5, 6)
documents = np.array(documents)
doc_train, doc_test = utils.document_test_train_split(documents, 0.01)

print("Doc train: ", len(doc_train))
print("Doc test: ", len(doc_test))
# Flatten each document split into per-line samples and labels
X_train, y_train = utils.convert_docs_to_lines(doc_train)
X_test, y_test = utils.convert_docs_to_lines(doc_test)

text_clf = Pipeline([
    # Bag-of-words counts over unigrams and bigrams
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    # Reweight raw counts by TF-IDF
    ('tfidf', TfidfTransformer()),
    # Linear SVM trained with stochastic gradient descent
    ('clf', SGDClassifier(loss='hinge',
                          penalty=None,  # no regularization ('none' in older sklearn)
                          learning_rate='optimal',
                          alpha=1e-4,  # feeds the 'optimal' learning-rate schedule
                          epsilon=0.1,  # unused with hinge loss
                          max_iter=1000,
                          tol=None,  # disable early stopping; always run max_iter epochs
                          shuffle=True))])

print("Training Model")

# --- Example 3 ---

from sklearn.decomposition import TruncatedSVD
from scipy.stats import randint as sp_randint

# Models to try
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#  from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score

from sklearn.pipeline import Pipeline

from scipy import sparse  # needed to persist the sparse count matrix below
import numpy as np
import utils

documents = utils.load_dirs_custom([
    '../SENSITIVE_DATA/html-tagged', '../PERSONAL_DATA/html-tagged',
    '../NON_PERSONAL_DATA'
])

# Flatten every document into per-line samples and labels
X_info, y_info = utils.convert_docs_to_lines(documents)

count_vect = CountVectorizer()
doc_count = count_vect.fit_transform(X_info)  # scipy.sparse document-term matrix
print(doc_count)
print('==============')
print(y_info)

# np.save cannot serialize a scipy.sparse matrix directly, so persist the
# counts with scipy's dedicated sparse serializer instead.
sparse.save_npz("count_x_line.npz", doc_count)
np.save("count_y_line", y_info)