for host in test_texts:
    test_data_list.append(test_texts[host])

# Flatten the per-host lists of documents into a single list
test_data_flat = []
for sublist in test_data_list:
    for item in sublist:
        test_data_flat.append(item)

# Preprocessing texts
tokenizer = TweetTokenizer()
punctuation = string.punctuation + "’“”.»«…°"
stpwords_fr = stopwords.words("french")
stpwords_en = stopwords.words("english")
cleaned_train_data_texts = clean_host_texts(
    data=train_data_flat,
    tok=tokenizer,
    stpwds=stpwords_fr + stpwords_en,
    punct=punctuation,
)
cleaned_test_data = clean_host_texts(
    data=test_data_flat,
    tok=tokenizer,
    stpwds=stpwords_fr + stpwords_en,
    punct=punctuation,
)

# Logistic Regression Model
clf_lgr = Pipeline([
    (
        "vect",
        TfidfVectorizer(
            decode_error="ignore",
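# For reference, a minimal sketch of what a cleaning helper such as clean_host_texts
# might do; the project's real implementation is defined elsewhere, so the stemming
# step and the exact token filtering below are assumptions rather than the actual code.
from nltk.stem.snowball import FrenchStemmer

def clean_host_texts_sketch(data, tok, stpwds, punct):
    """Tokenize, lowercase, drop stopwords and punctuation, then stem each document."""
    stemmer = FrenchStemmer()
    cleaned = []
    for doc in data:
        tokens = tok.tokenize(doc.lower())
        tokens = [t.strip(punct) for t in tokens]  # trim leading/trailing punctuation
        tokens = [t for t in tokens if t and t not in stpwds]
        cleaned.append(" ".join(stemmer.stem(t) for t in tokens))
    return cleaned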
train_hosts, y_train = get_train_data(train_file)
texts_path = "../text/text"
texts = import_texts(texts_path)

with open(data + "test.csv", "r") as f:
    test_hosts = f.read().splitlines()

train_data = generate_data(train_hosts, texts)

# Preprocessing texts
tokenizer = TweetTokenizer()
punctuation = string.punctuation + "’“”.»«…°"
stpwords_fr = stopwords.words("french")
stpwords_en = stopwords.words("english")
cleaned_train_data = clean_host_texts(
    data=train_data, tok=tokenizer, stpwds=stpwords_fr + stpwords_en, punct=punctuation
)

# Encode the class labels as integers
dict_y = {label: i + 2 for i, label in enumerate(set(y_train))}
y = [dict_y[x] for x in y_train]

# Pipeline: TF-IDF + Logistic Regression
clas = Pipeline([
    ("vect", TfidfVectorizer(decode_error="ignore", sublinear_tf=True)),
    ("clf", LogisticRegression()),
])

# The list of hyper-parameters we want to optimize. For each one we define the bounds,
# the corresponding scikit-learn parameter name, as well as how to sample values
# from that dimension ('log-uniform' for the learning rate)
space = [
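# Purely as an illustration of the pattern described above (scikit-optimize named
# dimensions consumed by use_named_args and gp_minimize), a search space for this
# pipeline could look like the sketch below. The actual space definition is truncated
# here, so the parameters, bounds, and objective are assumptions, not the project's.
import numpy as np
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Integer, Real
from skopt.utils import use_named_args

space_example = [
    Real(1e-3, 1e3, prior="log-uniform", name="clf__C"),  # inverse regularization strength
    Integer(1, 5, name="vect__min_df"),                   # minimum document frequency
]

@use_named_args(space_example)
def objective_example(**params):
    # Set the sampled hyper-parameters on the pipeline and score it by cross-validation;
    # gp_minimize minimizes, so return the negative mean accuracy.
    clas.set_params(**params)
    return -np.mean(cross_val_score(clas, cleaned_train_data, y, cv=5, scoring="accuracy"))

# result_example = gp_minimize(objective_example, space_example, n_calls=30, random_state=0)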