Example #1
# Collect the raw texts for each test host (test_texts is assumed to map host -> texts)
test_data_list = []
for host in test_texts:
    test_data_list.append(test_texts[host])

# Flatten the per-host lists of texts into a single list
test_data_flat = []
for sublist in test_data_list:
    for item in sublist:
        test_data_flat.append(item)

# Preprocessing texts
tokenizer = TweetTokenizer()
punctuation = string.punctuation + "’“”.»«…°"
stpwords_fr = stopwords.words("french")
stpwords_en = stopwords.words("english")
cleaned_train_data_texts = clean_host_texts(
    data=train_data_flat,
    tok=tokenizer,
    stpwds=stpwords_fr + stpwords_en,
    punct=punctuation,
)
cleaned_test_data = clean_host_texts(
    data=test_data_flat,
    tok=tokenizer,
    stpwds=stpwords_fr + stpwords_en,
    punct=punctuation,
)

# Logistic Regression Model
clf_lgr = Pipeline([
    (
        "vect",
        TfidfVectorizer(
            decode_error="ignore",
            sublinear_tf=True,
        ),
    ),
    ("clf", LogisticRegression()),
])
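# Minimal usage sketch: fitting the pipeline and predicting on the test texts,
# assuming y_train holds the training labels (as in Example #2).
clf_lgr.fit(cleaned_train_data_texts, y_train)
predictions = clf_lgr.predict(cleaned_test_data)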
Example #2
train_hosts, y_train = get_train_data(train_file)
texts_path = "../text/text"
texts = import_texts(texts_path)

# Read the list of test hosts
with open(data + "test.csv", "r") as f:
    test_hosts = f.read().splitlines()

train_data = generate_data(train_hosts, texts)

# Preprocessing texts
tokenizer = TweetTokenizer()
punctuation = string.punctuation + "’“”.»«…°"
stpwords_fr = stopwords.words("french")
stpwords_en = stopwords.words("english")
cleaned_train_data = clean_host_texts(data=train_data,
                                      tok=tokenizer,
                                      stpwds=stpwords_fr + stpwords_en,
                                      punct=punctuation)

# Encode the string labels as integer ids (offset by 2)
dict_y = {label: i + 2 for i, label in enumerate(set(y_train))}
y = [dict_y[x] for x in y_train]
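# For illustration only (hypothetical labels): if set(y_train) were
# {"business", "education", "health"}, dict_y would map each label to one of
# the ids 2, 3, 4 (the iteration order of a set is not guaranteed).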

# Pipeline: TF-IDF + Logistic Regression
clas = Pipeline([
    ("vect", TfidfVectorizer(decode_error="ignore", sublinear_tf=True)),
    ("clf", LogisticRegression()),
])

# The list of hyper-parameters we want to optimize. For each one we define the bounds,
# the corresponding scikit-learn parameter name, and how values are sampled from that
# dimension (e.g. 'log-uniform' for the regularization strength C)
space = [
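    # Illustrative sketch only: these dimensions, bounds, and names are assumptions
    # (not the original search space); skopt.space.Real and Categorical are assumed
    # to be imported.
    Real(1e-6, 1e+1, prior="log-uniform", name="clf__C"),
    Categorical([True, False], name="vect__sublinear_tf"),
]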