Example #1
import re

import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import utils

# Load the punkt tokenizer used for splitting documents into sentences
tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')

data_set = utils.load_dirs_custom([
    './SENSITIVE_DATA/html-tagged',
    './PERSONAL_DATA/html-tagged',
    './NON_PERSONAL_DATA'
])

doc_train, doc_test = utils.document_test_train_split(
    data_set, 0.20
)

X_train, y_train = utils.convert_docs_to_lines(doc_train)
X_test, y_test = utils.convert_docs_to_lines(doc_test)

def document_to_word_list(document, remove_stopwords=True):
    # Keep ASCII letters only, then lowercase and split into tokens
    document_text = re.sub("[^a-zA-Z]", " ", document)

    words = document_text.lower().split()

    # Optionally drop German stop words
    if remove_stopwords:
        stops = set(stopwords.words("german"))
        words = [w for w in words if w not in stops]

    return words
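A minimal usage sketch, assuming the helper is combined with the punkt sentence tokenizer loaded above; the sample string and variable names are hypothetical:

# Hypothetical input text; any German document string would do
raw_document = "Dies ist ein Beispiel. Es enthält zwei Sätze."

# Split into sentences, then into cleaned word lists
sentences = tokenizer.tokenize(raw_document.strip())
word_lists = [document_to_word_list(s) for s in sentences]
print(word_lists)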
Example #2
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import fbeta_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import metrics
import joblib

import utils

documents = utils.load_dirs_custom([
    './SENSITIVE_DATA/html-tagged', './PERSONAL_DATA/html-tagged',
    './NON_PERSONAL_DATA'
])

documents = utils.n_gram_documents_range(documents, 5, 6)
documents = np.array(documents)
doc_train, doc_test = utils.document_test_train_split(documents, 0.01)

print("Doc train: ", len(doc_train))
print("Doc test: ", len(doc_test))
X_train, y_train = utils.convert_docs_to_lines(doc_train)
X_test, y_test = utils.convert_docs_to_lines(doc_test)

text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf',
                      SGDClassifier(loss='hinge',
                                    penalty='none',  # use penalty=None on scikit-learn >= 1.2
                                    learning_rate='optimal',
                                    alpha=1e-4,
                                    epsilon=0.1,
                                    max_iter=1000))])
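Example #2 is cut off after the pipeline definition. A hedged continuation, not the original code, showing how the already-imported metrics, fbeta_score, and joblib would plausibly be used to train, score, and persist the model (the output filename is an assumption):

text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)

print(metrics.classification_report(y_test, y_pred))
print("F2 score:", fbeta_score(y_test, y_pred, beta=2, average='weighted'))

# Persist the fitted pipeline for later reuse
joblib.dump(text_clf, 'text_clf.joblib')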
Example #3
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
import utils

from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True)
print("---Loading Data---")
documents = utils.load_dirs_custom([
    './SENSITIVE_DATA/html-tagged', './PERSONAL_DATA/html-tagged',
    './NON_PERSONAL_DATA'
])

print("---Creating N_grams---")
documents = utils.n_gram_documents_range(documents, 2, 2)

# Hold out 10% of the documents as an untouched validation set
doc_data, doc_vault = utils.document_test_train_split(documents, 0.10)

doc_data = np.array(doc_data)

argument_sets = []
for train_index, test_index in kf.split(doc_data):
    print("TRAIN:", train_index, "TEST:", test_index)
    doc_train = doc_data[train_index]
    doc_test = doc_data[test_index]

    X_train, y_train = utils.convert_docs_to_lines(doc_train)
    X_test, y_test = utils.convert_docs_to_lines(doc_test)
    argument_sets.append((X_train, X_test, y_train, y_test))

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer())])
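Example #3 is truncated before the pipeline's final estimator and the training loop. A sketch under the assumption that an SGDClassifier step like Example #2's completes the pipeline and that the per-fold argument_sets are consumed in turn:

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

# Assumed final estimator; the original snippet cuts off before this step
text_clf.steps.append(('clf', SGDClassifier(loss='hinge', max_iter=1000)))

fold_scores = []
for X_train, X_test, y_train, y_test in argument_sets:
    text_clf.fit(X_train, y_train)
    y_pred = text_clf.predict(X_test)
    fold_scores.append(f1_score(y_test, y_pred, average='weighted'))

print("Mean F1 across folds:", np.mean(fold_scores))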