import re

import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

import utils

# Load the punkt tokenizer used for splitting documents into sentences
tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')

data_set = utils.load_dirs_custom([
    './SENSITIVE_DATA/html-tagged',
    './PERSONAL_DATA/html-tagged',
    './NON_PERSONAL_DATA'
])

doc_train, doc_test = utils.document_test_train_split(
    data_set, 0.20
)

X_train, y_train = utils.convert_docs_to_lines(doc_train)
X_test, y_test = utils.convert_docs_to_lines(doc_test)


def document_to_word_list(document, remove_stopwords=True):
    # Keep letters only, lowercase, and split into tokens
    document_text = re.sub("[^a-zA-Z]", " ", document)
    words = document_text.lower().split()
    if remove_stopwords:
        # Drop common German stopwords
        stops = set(stopwords.words("german"))
        words = [w for w in words if w not in stops]
    return words
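# --- Usage sketch (not part of the original script) ---
# A minimal illustration, assuming the Punkt tokenizer and document_to_word_list
# defined above; the sample string is made up purely for demonstration.
sample_text = "Das ist ein Beispielsatz. Hier steht ein zweiter Satz."
sentences = tokenizer.tokenize(sample_text)                  # sentence-split with Punkt
word_lists = [document_to_word_list(s) for s in sentences]   # clean and de-stopword each sentence
print(word_lists)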
import numpy as np
from sklearn.metrics import fbeta_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import metrics
from sklearn.externals import joblib

import utils

documents = utils.load_dirs_custom([
    './SENSITIVE_DATA/html-tagged',
    './PERSONAL_DATA/html-tagged',
    './NON_PERSONAL_DATA'
])

# Build n-grams (range 5-6) over the loaded documents
documents = utils.n_gram_documents_range(documents, 5, 6)
documents = np.array(documents)
doc_train, doc_test = utils.document_test_train_split(documents, 0.01)
print("Doc train: ", len(doc_train))
print("Doc test: ", len(doc_test))

X_train, y_train = utils.convert_docs_to_lines(doc_train)
X_test, y_test = utils.convert_docs_to_lines(doc_test)

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='none',
                          learning_rate='optimal', alpha=1e-4,
                          epsilon=0.1, max_iter=1000)),
])
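# --- Training/evaluation sketch (an assumption, not the original script's exact flow) ---
# Fits the pipeline above on the line-level training data, reports a classification
# report and a weighted F2 score on the held-out lines, and persists the model;
# the 'text_clf.pkl' filename is hypothetical.
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("Weighted F2:", fbeta_score(y_test, y_pred, beta=2, average='weighted'))
joblib.dump(text_clf, 'text_clf.pkl')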
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import KFold

import utils

kf = KFold(n_splits=5, shuffle=True)

print("---Loading Data---")
documents = utils.load_dirs_custom([
    './SENSITIVE_DATA/html-tagged',
    './PERSONAL_DATA/html-tagged',
    './NON_PERSONAL_DATA'
])

print("---Creating N_grams---")
documents = utils.n_gram_documents_range(documents, 2, 2)
# Split off a held-out "vault" set (0.10 fraction); the rest feeds the k-fold loop
doc_data, doc_vault = utils.document_test_train_split(documents, 0.10)
doc_data = np.array(doc_data)

# Build one (X_train, X_test, y_train, y_test) tuple per fold
argument_sets = []
for train_index, test_index in kf.split(doc_data):
    print("TRAIN:", train_index, "TEST:", test_index)
    doc_train = doc_data[train_index]
    doc_test = doc_data[test_index]
    X_train, y_train = utils.convert_docs_to_lines(doc_train)
    X_test, y_test = utils.convert_docs_to_lines(doc_test)
    argument_sets += [(X_train, X_test, y_train, y_test)]

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])
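# --- Cross-validation loop sketch (an assumption about how argument_sets is consumed) ---
# Pairs the bag-of-words/tf-idf steps above with a hinge-loss SGDClassifier
# (mirroring the companion training script) and reports per-fold accuracy;
# the classifier choice and its parameters are illustrative assumptions.
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

fold_scores = []
for X_tr, X_te, y_tr, y_te in argument_sets:
    fold_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3)),
    ])
    fold_clf.fit(X_tr, y_tr)
    fold_scores.append(accuracy_score(y_te, fold_clf.predict(X_te)))
print("Per-fold accuracy:", fold_scores)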