def load_predict(directory):
    """Load the pre-trained SVM pipeline and predict a label per line.

    Loads the pipeline persisted as ``svm_trained.joblib``, reads the
    documents under *directory*, applies the 5-6 n-gram transform from
    ``utils``, and runs the classifier on each document's lines.

    :param directory: directory (or directories) passed straight to
        ``utils.load_dirs_custom``.
    :return: flat list of predicted labels, one per classified line,
        in document order.
    """
    clf = joblib.load('svm_trained.joblib')

    documents = utils.load_dirs_custom(directory)
    documents = utils.n_gram_documents_range(documents, 5, 6)
    documents = np.array(documents)

    # NOTE(review): elsewhere in this project convert_docs_to_lines
    # returns an (X, y) pair — confirm it is iterable of doc objects here.
    doc_test = utils.convert_docs_to_lines(documents)

    predicted_lines = []
    for doc in doc_test:
        # predict() returns an array of labels; flatten into one list.
        predicted_lines.extend(clf.predict(doc.data))
    return predicted_lines
import numpy as np

# sklearn.externals.joblib was deprecated in 0.21 and removed in 0.23;
# import the standalone joblib package instead.
import joblib
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

import utils

# Load the tagged corpora and expand every document with the project's
# 5-6 n-gram transform before splitting.
documents = utils.load_dirs_custom([
    './SENSITIVE_DATA/html-tagged',
    './PERSONAL_DATA/html-tagged',
    './NON_PERSONAL_DATA',
])
documents = utils.n_gram_documents_range(documents, 5, 6)
documents = np.array(documents)

# Hold out 1% of the documents for evaluation.
doc_train, doc_test = utils.document_test_train_split(documents, 0.01)
print("Doc train: ", len(doc_train))
print("Doc test: ", len(doc_test))

# Flatten documents into per-line samples and labels.
X_train, y_train = utils.convert_docs_to_lines(doc_train)
X_test, y_test = utils.convert_docs_to_lines(doc_test)

# Bag-of-words (uni+bi-grams) -> TF-IDF -> linear SVM trained via SGD.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='none',
        learning_rate='optimal',
        alpha=1e-4,
        epsilon=0.1,
        max_iter=1000,
        tol=None,
        shuffle=True,
    )),
])
print("Training Model")
import numpy as np
from scipy import sparse
from scipy.stats import randint as sp_randint

# Models to try
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, fbeta_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

import utils

# Load all tagged corpora and flatten them into per-line samples/labels.
documents = utils.load_dirs_custom([
    '../SENSITIVE_DATA/html-tagged',
    '../PERSONAL_DATA/html-tagged',
    '../NON_PERSONAL_DATA',
])
X_info, y_info = utils.convert_docs_to_lines(documents)

count_vect = CountVectorizer()
doc_count = count_vect.fit_transform(X_info)
print(doc_count)
print('==============')
print(y_info)

# fit_transform returns a scipy sparse matrix; np.save would wrap it in a
# 0-d object array that needs allow_pickle=True to reload and loses the
# sparse format. Persist it as a proper .npz sparse archive instead.
sparse.save_npz("count_x_line", doc_count)
np.save("count_y_line", y_info)