def test_purified_file(self):
    data_df = get_sms_dataset(raw=False)
    assert data_df is not None
    # confirm purified: no purification rule should match anything
    for rule_name, matched in TextPurifier(
            texts=data_df['message']).show_iter():
        print(matched)
        assert matched == []
def kf_test_with_datas(begin, end, test_num, name_scoring="neg_mean_squared_error"):
    data_df = get_sms_dataset(noStopwords=True, overwrite=True)
    X, y = data_df['message'], data_df['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=0.2)
    cv = CountVectorizer()
    X_train = cv.fit_transform(X_train)
    kf_test_and_draw(X_train, y_train, begin, end, test_num, name_scoring)
def auto_test_for_other_model(model, param_grid, name_scoring):
    data_df = get_sms_dataset(noStopwords=True, overwrite=True)
    X, y = data_df['message'], data_df['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=0.2)
    cv = CountVectorizer()
    X_train = cv.fit_transform(X_train)
    grid = gsc(model, param_grid, scoring=name_scoring)
    grid.fit(X_train, y_train)
    print(grid.best_params_, grid.best_score_)
    return grid.best_params_, grid.best_score_
def kf_test_with_datas_and_draw_alot(begin, end, test_num):
    data_df = get_sms_dataset(noStopwords=True, overwrite=True)
    X, y = data_df['message'], data_df['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=0.2)
    cv = CountVectorizer()
    X_train = cv.fit_transform(X_train)
    # draw one curve per scoring metric
    for scoring in ('accuracy', 'precision', 'f1', 'recall'):
        kf_test_and_draw(X_train, y_train, begin, end, test_num, scoring)
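# Usage sketch (illustrative, not part of the original module), assuming
# begin/end/test_num carry the same meaning as in test_tool.auto_test
# elsewhere in this repo (an alpha sweep from 0.1 to 1.5 over 200 candidates)
# and that `gsc` above aliases sklearn.model_selection.GridSearchCV:
if __name__ == '__main__':
    from sklearn.naive_bayes import MultinomialNB

    kf_test_with_datas(0.1, 1.5, 200, name_scoring='accuracy')
    kf_test_with_datas_and_draw_alot(0.1, 1.5, 200)
    auto_test_for_other_model(
        model=MultinomialNB(),
        param_grid={'alpha': [0.1, 0.5, 1.0, 1.5]},
        name_scoring='f1')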
def __init__(self, sequence_length, transform, cate_type, file_path='data/spam.csv'):
    # get data and transform
    data_df = get_sms_dataset(
        SMS_DATASET=file_path, type=cate_type, noStopwords=False)
    texts, targets = data_df.message.to_list(), data_df.target.to_list()
    self.texts, self.targets, self.word_dict, self.index_dict = transform(
        texts, targets)
    self.sequence_length = sequence_length
    self.words = " ".join(texts).split()
    self.words_indexes = [self.word_dict[w] for w in self.words]
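# A hedged sketch (assumption, not the repository's code): given the
# self.words_indexes and self.sequence_length set up above, a PyTorch-style
# dataset would typically expose sliding next-word windows roughly like this:
#
#     def __len__(self):
#         return len(self.words_indexes) - self.sequence_length
#
#     def __getitem__(self, index):
#         window = self.words_indexes[index:index + self.sequence_length]
#         target = self.words_indexes[index + 1:index + self.sequence_length + 1]
#         return torch.tensor(window), torch.tensor(target)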
def generate_model_no_length(scoring="accuracy"):
    data_df = get_sms_dataset(noStopwords=True, overwrite=True)
    X, y = data_df['message'], data_df['target']
    # bag-of-words features only; note the vectorizer is fit on the full
    # corpus before the train/test split
    cv = CountVectorizer()
    X = cv.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=0.2)
    # search for the best hyper-parameters, then train and persist the model
    bp = test_tool.auto_test(X_train, y_train, 0.1, 1.5, 200, scoring)
    nb = MultiNB_Wrapper.train_once(X_train, y_train, bp)
    MultiNB_Wrapper.save_model(nb, cv, model_name='old_model', cv_name='old_cv')
    pred = nb.predict(X_test)
    print(metrics.confusion_matrix(y_test, pred))
    return nb
def generate_model(scoring="precision"):
    data_df = get_sms_dataset(noStopwords=True, overwrite=True)
    X, y = data_df['message'], data_df['target']
    cv = CountVectorizer()
    X = cv.fit_transform(X)
    # append the raw message length as one extra sparse feature column
    lens = data_df['length']
    new_len = csr_matrix(lens)
    X = hstack((X, new_len.reshape(-1, 1)))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=0.2)
    bp = test_tool.auto_test(X_train, y_train, 0.1, 1.5, 200, scoring)
    nb = MultiNB_Wrapper.train_once(X_train, y_train, bp)
    MultiNB_Wrapper.save_model(nb, cv)
    pred = nb.predict(X_test)
    print(metrics.confusion_matrix(y_test, pred))
    return nb
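# Usage sketch (illustrative, not part of the original module): both trainers
# persist their artifacts through MultiNB_Wrapper and print a confusion
# matrix on the held-out 20% split.
if __name__ == '__main__':
    generate_model_no_length(scoring='accuracy')
    generate_model(scoring='precision')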
""" @Author: peviroy @Date: 2020-09-03 20:54 @Last Modified by: peviroy @Last Modified time: 2020-09-06 21:07 """ import os os.chdir(os.path.split(os.path.realpath(__file__))[0]) import sys sys.path.append(os.path.abspath("..")) from dataset import get_sms_dataset from utils.preprocessing import TextPurifier raw_texts = get_sms_dataset(raw=True)['v2'].tolist() tp = TextPurifier(raw_texts) class TestTextPurifierInstance: def test_str(self): assert tp.__str__().startswith('Name:') def test_iter(self): for rule_name, matched_strings in tp.show_iter(): assert matched_strings != [] def test_purify(self): purifed_texts = tp.purify() tp.set_texts(purifed_texts) for rule_name, matched_strings in tp.show_iter():
def __init__(self, transform=SMSTransform(), file_path='data/spam.csv'):
    data_df = get_sms_dataset(SMS_DATASET=file_path)
    texts, targets = data_df.message.to_list(), data_df.target.to_list()
    self.texts, self.targets, self.word_dict, self.index_dict = transform(
        texts, targets)
def test_raw_file(self):
    assert get_sms_dataset(raw=True) is not None
def test_null_raw_file(self):
    try:
        get_sms_dataset(raw=True, SMS_DATASET='null')
    except Exception as e:
        assert isinstance(e, FileNotFoundError)
    else:
        # without this, the test would silently pass if no exception is raised
        raise AssertionError('expected FileNotFoundError for a missing file')
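# An equivalent, more idiomatic phrasing (a suggestion, not the original code)
# uses pytest's raises helper, which fails the test automatically when no
# exception is raised; it needs `import pytest` at the top of the file:
#
#     def test_null_raw_file(self):
#         with pytest.raises(FileNotFoundError):
#             get_sms_dataset(raw=True, SMS_DATASET='null')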