def test_with_labeled_state(data_filename, replacer, classifier, bow, tfidf):
    """Run a previously persisted classifier against a labelled CSV file.

    Args:
        data_filename: CSV file handed to ``pd.read_csv``. Its ``state``
            column supplies the labels and its ``content`` column the texts.
        replacer: Forwarded unchanged to ``preprocess``.
        classifier: Location of the persisted classifier, read with
            ``joblib.load``.
        bow: Location of the persisted bag-of-words transformer, read with
            ``joblib.load``.
        tfidf: Location of the persisted TF-IDF transformer, read with
            ``joblib.load``.

    Returns:
        None. Labels and predictions are handed to
        ``Worker.calculate_result``; whatever it returns is discarded here.
    """
    frame = pd.read_csv(data_filename)

    # Restore the persisted artefacts: vectorizers first, then the model.
    bow_model = joblib.load(bow)
    tfidf_model = joblib.load(tfidf)
    model = joblib.load(classifier)

    expected_states = frame['state'].tolist()
    raw_texts = frame['content']

    # Preprocess the texts and convert them to feature vectors.
    helper = Worker(bow=bow_model, tfidf=tfidf_model)
    tokens = preprocess(raw_texts, replacer)
    features = helper.content_to_vectors(tokens)

    # Predict and compare against the known labels.
    predicted_states = model.predict(features)
    helper.calculate_result(expected_states, predicted_states)
def random_forest_generator(data_filename, replacer, train_ratio=0.8):
    """Train a random forest text classifier and persist it with its vectorizers.

    The CSV is read with ``pd.read_csv``; its ``content`` column is
    preprocessed, turned into bag-of-words counts and then TF-IDF weights,
    split through ``Worker.sliceData``, and used to fit a
    ``RandomForestClassifier``. The held-out part is scored through
    ``Worker.calculate_result``.

    Args:
        data_filename: CSV file with a ``state`` column (labels) and a
            ``content`` column (texts).
        replacer: Forwarded unchanged to ``preprocess``.
        train_ratio: Forwarded to ``Worker.sliceData``; presumably the
            fraction of rows used for training -- confirm against ``Worker``.

    Returns:
        The fitted ``RandomForestClassifier``.

    Side effects:
        Writes ``./var/model/forest_0``, ``./var/model/bow_0`` and
        ``./var/model/tfidf_0`` (relative to the current working directory),
        creating ``./var/model`` when it does not exist yet.
        ``logistic_regression_generator`` writes to the same ``bow_0`` and
        ``tfidf_0`` paths.
    """
    # Function-scope import keeps this block self-contained.
    import os

    # Load the labelled data.
    data = pd.read_csv(data_filename)
    data_labels = data['state'].tolist()
    data_texts = data['content']
    bow_transformer = CountVectorizer()

    # Vectorize: bag-of-words counts, then TF-IDF weighting.
    word_list = preprocess(data_texts, replacer)
    vectorized_data = bow_transformer.fit_transform(word_list)
    tfidf_transformer = TfidfTransformer().fit(vectorized_data)
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    # The sparse TF-IDF matrix is densified before being handed to sliceData.
    vectorized_data_list = tfidf_transformer.transform(vectorized_data).toarray()

    all_data = worker.sliceData(vectorized_data_list, data_labels, train_ratio)
    train_data = all_data['train_data']
    train_labels = all_data['train_labels']
    test_data = all_data['test_data']
    test_labels = all_data['test_labels']

    # Build the model and score it on the held-out slice.
    forest = RandomForestClassifier(n_estimators=30, n_jobs=-1)
    forest = forest.fit(train_data, train_labels)
    pred = forest.predict(test_data)
    worker.calculate_result(test_labels, pred)

    # Make sure the output directory exists; otherwise joblib.dump raises
    # only after training has finished and the fitted model is lost.
    os.makedirs('./var/model', exist_ok=True)

    # Save the model and the vectorizers needed to reuse it.
    joblib.dump(forest, './var/model/forest_0', compress=3)
    joblib.dump(bow_transformer, './var/model/bow_0', compress=3)
    joblib.dump(tfidf_transformer, './var/model/tfidf_0', compress=3)
    return forest
def logistic_regression_generator(data_filename, replacer, train_ratio=0.8):
    """Train a logistic regression text classifier and persist it with its vectorizers.

    The CSV is read with ``pd.read_csv``; its ``content`` column is
    preprocessed, turned into bag-of-words counts and then TF-IDF weights,
    split through ``Worker.sliceData``, and used to fit a
    ``LogisticRegression`` model. The held-out part is scored through
    ``Worker.calculate_result``.

    Args:
        data_filename: CSV file with a ``state`` column (labels) and a
            ``content`` column (texts).
        replacer: Forwarded unchanged to ``preprocess``.
        train_ratio: Forwarded to ``Worker.sliceData``; presumably the
            fraction of rows used for training -- confirm against ``Worker``.

    Returns:
        The fitted ``LogisticRegression`` model.

    Side effects:
        Prints ``generator prediction:`` before scoring. Writes
        ``./var/model/lr_0``, ``./var/model/bow_0`` and
        ``./var/model/tfidf_0`` (relative to the current working directory),
        creating ``./var/model`` when it does not exist yet.
        ``random_forest_generator`` writes to the same ``bow_0`` and
        ``tfidf_0`` paths.
    """
    # Function-scope import keeps this block self-contained.
    import os

    # Load the labelled data.
    data = pd.read_csv(data_filename)
    data_labels = data['state'].tolist()
    data_texts = data['content']
    bow_transformer = CountVectorizer()

    # Vectorize: bag-of-words counts, then TF-IDF weighting.
    word_list = preprocess(data_texts, replacer)
    vectorized_data = bow_transformer.fit_transform(word_list)
    tfidf_transformer = TfidfTransformer().fit(vectorized_data)
    worker = Worker(bow=bow_transformer, tfidf=tfidf_transformer)
    # The sparse TF-IDF matrix is densified before being handed to sliceData.
    vectorized_data_list = tfidf_transformer.transform(vectorized_data).toarray()

    all_data = worker.sliceData(vectorized_data_list, data_labels, train_ratio)
    train_data = all_data['train_data']
    train_labels = all_data['train_labels']
    test_data = all_data['test_data']
    test_labels = all_data['test_labels']

    # Build the model and score it on the held-out slice.
    lr = LogisticRegression(C=30, penalty='l2', random_state=0)
    lr.fit(train_data, train_labels)
    pred = lr.predict(test_data)
    print("generator prediction:")
    worker.calculate_result(test_labels, pred)

    # Make sure the output directory exists; otherwise joblib.dump raises
    # only after training has finished and the fitted model is lost.
    os.makedirs('./var/model', exist_ok=True)

    # Save the model and the vectorizers needed to reuse it.
    joblib.dump(lr, './var/model/lr_0', compress=3)
    joblib.dump(bow_transformer, './var/model/bow_0', compress=3)
    joblib.dump(tfidf_transformer, './var/model/tfidf_0', compress=3)
    return lr