import joblib
import numpy as np

import utils


def load_predict(directory):
    # Load the trained SVM pipeline and label every line of the documents
    # found under `directory` (a list of directories, as in the examples below).
    clf = joblib.load('svm_trained.joblib')
    documents = utils.load_dirs_custom(directory)
    documents = utils.n_gram_documents_range(documents, 5, 6)
    documents = np.array(documents)

    predicted_lines = []
    for doc in documents:
        predicted_lines += list(clf.predict(doc.data))
    return predicted_lines
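# --- Hedged usage sketch (not in the original snippet) ----------------------
# Example of calling load_predict; the directory list below is illustrative only.
if __name__ == '__main__':
    labels = load_predict(['./NON_PERSONAL_DATA'])
    print('Predicted labels for', len(labels), 'lines')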
# Example 2
from keras.regularizers import l1
from keras.callbacks import EarlyStopping
import utils

from keras import backend as K

import numpy as np
import pickle

from keras.utils import np_utils

#utils.personal_categories_dict (.inv)

documents = utils.load_dirs_custom([
    '../../TAGGED_DATA_NEW_NEW/SENSITIVE_DATA/html-tagged',
    '../../TAGGED_DATA_NEW_NEW/PERSONAL_DATA/html-tagged',
    '../../TAGGED_DATA_NEW_NEW/NON_PERSONAL_DATA'
],
                                   individual=True)

x = []
y = []
for document in documents:
    lines = document.lines
    categories = []
    for line in lines:
        for category in line.categories:
            if category not in categories:
                categories.append(category)
    x += ['\n'.join(document.data)]
    y += [categories]
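# --- Hedged sketch (not part of the original example) -----------------------
# The Keras example is cut off after building x and y.  Before training, the
# per-document category lists in y would have to become a numeric multi-hot
# matrix; MultiLabelBinarizer is one way to do that and is an assumption here,
# not the original author's choice.
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_multi_hot = mlb.fit_transform(y)  # shape: (n_documents, n_categories)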
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score
from sklearn.model_selection import GridSearchCV
import utils
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.externals import joblib

documents = utils.load_dirs_custom([
    './SENSITIVE_DATA/html-tagged', './PERSONAL_DATA/html-tagged',
    './NON_PERSONAL_DATA'
])

documents = utils.n_gram_documents_range(documents, 5, 6)
documents = np.array(documents)
doc_train, doc_test = utils.document_test_train_split(documents, 0.01)

print("Doc train: ", len(doc_train))
print("Doc test: ", len(doc_test))
X_train, y_train = utils.convert_docs_to_lines(doc_train)
X_test, y_test = utils.convert_docs_to_lines(doc_test)

text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-4, random_state=42))])
# NOTE: the original snippet is cut off after `loss='hinge'`; the remaining
# SGDClassifier arguments above are illustrative defaults, not the original ones.
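# --- Hedged sketch (not part of the original example) -----------------------
# Training and evaluation would plausibly continue along these lines; the
# metric calls and the dump filename are assumptions (the filename matches the
# one loaded by load_predict above), not the truncated original code.
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))
print(confusion_matrix(y_test, predicted))
joblib.dump(text_clf, 'svm_trained.joblib')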
# Example 4
import numpy as np
import matplotlib.pyplot as plt

import utils


def normalize(arr, minimum, maximum):
    # The original `def` line was cut off; this signature is reconstructed
    # from the body, which min-max scales `arr` into the [0, 1] range.
    new_arr = []
    for value in arr:
        new_arr.append((value - minimum) / (maximum - minimum))
    return new_arr


def show_overfit_plot():
    # Plots training vs. validation loss from a module-level Keras `history`
    # object (assigned when the model is fit).
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()


documents = utils.load_dirs_custom([
    '../../anondata_lines/sensitive', '../../anondata_lines/personal',
    '../../anondata_lines/nonpersonal'
],
                                   individual=True)

documents = utils.n_gram_documents_range(documents, 8, 8)

doc_train, doc_test = utils.document_test_train_split(documents, 0.05)

print("Doc train: ", len(doc_train))
print("Doc test: ", len(doc_test))
x_train, y_train = utils.convert_docs_to_lines(doc_train)
x_test, y_test = utils.convert_docs_to_lines(doc_test)

# Collapse the three-way labels into a binary target: classes 1 and 2 map to
# 1, everything else to 0.
y_train = np.where((y_train == 2) | (y_train == 1), 1, 0)
y_test = np.where((y_test == 2) | (y_test == 1), 1, 0)
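# --- Hedged sketch (not part of the original example) -----------------------
# The snippet is truncated here.  Given the show_overfit_plot helper above, a
# plausible continuation is to vectorise the lines and fit a small Keras
# model; the TF-IDF step, the layer sizes and the epoch count below are all
# assumptions, not the original author's code.
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense

vectorizer = TfidfVectorizer(max_features=5000)
x_train_vec = vectorizer.fit_transform(x_train).toarray()
x_test_vec = vectorizer.transform(x_test).toarray()

model = Sequential([
    Dense(64, activation='relu', input_shape=(x_train_vec.shape[1],)),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(x_train_vec, y_train,
                    validation_data=(x_test_vec, y_test),
                    epochs=10, batch_size=32)
show_overfit_plot()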