# Timing run for one fold of the tweet-classification pipeline:
# split -> preprocess -> create stopwords (TBRS) -> remove stopwords -> weight.
# Each stage prints its label followed by its elapsed wall-clock time.
# NOTE(review): KFold, Preprocessing, TermBasedRandomSampling and Weighting
# are project classes — their exact contracts are assumed from the call sites.

data_target = data['Label']
kfold = KFold(data_tweet, data_target, 10)  # 10-fold split of tweets/labels
data_train, data_test = kfold.get_data_sequence()
i = 0  # this timing run only exercises the first fold
print("kfold")
print(time.time() - start)
start = time.time()

# Clean and tokenize the training tweets of fold i.
prepro = Preprocessing()
# Third return value is unused in this timing script (was named `asd`).
cleaned_data, terms, _ = prepro.preprocessing(data_train[i]["tweet"])
print("preprocessing")
print(time.time() - start)
start = time.time()

# Build the stopword list via Term-Based Random Sampling.
tbrs = TermBasedRandomSampling(X=10, Y=10, L=40)
stopwords = tbrs.create_stopwords(cleaned_data, terms)
# BUG FIX: label was swapped with the one below — this stage CREATES stopwords.
print("create stopword")
print(time.time() - start)
start = time.time()

# Strip the generated stopwords from the cleaned corpus.
prepro2 = Preprocessing()
new_cleaned_data, new_terms, removed_words = prepro2.remove_stopword(
    cleaned_data, stopwords)
# BUG FIX: label was swapped — this stage REMOVES stopwords.
print("remove stopword")
print(time.time() - start)
start = time.time()

weight = Weighting(new_cleaned_data, new_terms)
# Pad the per-combination result columns with 9 blank rows.
# NOTE(review): the magic 9 and the meaning of x/y/l are set elsewhere in the
# file (TBRS hyper-parameter sweep, presumably) — confirm against the caller.
for i in range(9):
    x_array.append(" ")
    y_array.append(" ")
    l_array.append(" ")

accuracy_total_accumulation = 0

# Evaluate the current (x, y, l) hyper-parameter combination across all folds.
for i in range(len(data_train)):
    kfold_per_combination.append(i + 1)  # 1-based fold index for reporting
    y_test = []
    y_pred = []

    # Clean and tokenize this fold's training tweets.
    prepro = Preprocessing()
    cleaned_data, terms = prepro.preprocessing(data_train[i]["tweet"])

    # Build the stopword list with the current TBRS hyper-parameters,
    # then strip those stopwords from the cleaned corpus.
    tbrs = TermBasedRandomSampling(X=x, Y=y, L=l)
    stopwords = tbrs.create_stopwords(cleaned_data, terms)
    prepro2 = Preprocessing()
    new_cleaned_data, new_terms = prepro2.remove_stopword(cleaned_data, stopwords)

    # TF-IDF weighting over the stopword-free corpus.
    weight = Weighting(new_cleaned_data, new_terms)
    tfidf = weight.get_tf_idf_weighting()
    idf = weight.get_idf()

    # Train multinomial Naive Bayes on this fold's training data.
    nb = NBMultinomial()
    nb.fit(new_cleaned_data, new_terms, data_train[i]["target"], stopwords, idf, tfidf)

    correct_ans = 0
    # Classify every test tweet of this fold.
    # NOTE(review): loop body continues beyond this chunk (accuracy tallying,
    # y_test/y_pred collection) — not visible here.
    for j in range(len(data_test[i]["tweet"])):
        prediction = nb.predict(data_test[i]["tweet"][j], data_test[i]["target"][j])