def test_feature_extraction():
    """Smoke test: preprocessing + feature extraction must yield a DataFrame.

    Any unexpected exception fails the test with the original error attached
    so the report stays diagnosable (the old message 'Unexpected error..'
    discarded the cause).
    """
    try:
        tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
        # NOTE(review): return value is ignored here; presumably preprocess
        # mutates the frame in place — verify against its implementation.
        pre.preprocess(tagged_df)
        features = fe.extract_features(tagged_df, FEATURE_LIST)
    except Exception as err:
        pytest.fail('Unexpected error: {}'.format(err))
    # Assert outside the try so an assertion failure is reported as such,
    # not converted into a generic pytest.fail.
    assert isinstance(features, pd.DataFrame)
def test_classicifation():  # TODO(review): name typo ("classification") — kept so external references/selection by name still work
    """End-to-end smoke test: train + predict, then check the model pickle exists.

    Unexpected exceptions fail the test with the original error attached;
    the final assertion lives outside the try so an AssertionError is not
    swallowed and re-reported as a generic failure.
    """
    try:
        tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
        # NOTE(review): return value ignored — assumes in-place preprocessing.
        pre.preprocess(tagged_df)
        X = fe.extract_features(tagged_df, FEATURE_LIST)
        # Binary label: cb_level == 3 marks the positive (bullying) class.
        y = (tagged_df['cb_level'] == 3).astype(int)
        X = X.drop(columns=['id'])  # 'id' is an identifier, not a feature
        rf_obj = rf.RandomForest()
        rf_obj.train(X, y)
        rf_obj.predict(X)
    except Exception as err:
        pytest.fail('Unexpected error: {}'.format(err))
    # Training is expected to persist the fitted model to this path.
    assert os.path.isfile(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
def test_train_model():
    """Training on the tagged dataset must leave a fitted model on the wrapper."""
    classifier = rf.RandomForest()
    data = pre.preprocess(utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv'))
    features = ['post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis']
    # 'id' is an identifier column, not a predictive feature.
    matrix = fe.extract_features(data, features).drop(columns=['id'])
    # Positive class: posts tagged with cb_level == 3.
    labels = (data['cb_level'] == 3).astype(int)
    classifier.train(matrix, labels)
    assert classifier.model is not None
def test_correct_preprocess():
    """Preprocessing a mixed Hebrew/English/punctuation post must yield the
    exact cleaned text: English words, symbols and emoticons removed, Hebrew
    and exclamation marks kept."""
    sample = pd.DataFrame(
        [[1, 'מילה, עוד מילה. :) english @#$%^&* word סימן קריאה! שני סימני קריאה!! > אנגלית מחוברenglish', 0]],
        columns=['id', 'text', 'cb_level'],
    )
    cleaned = pre.preprocess(sample)
    result = cleaned['text'].tolist()[0]
    expected = 'מילה עוד מילה סימן קריאה! שני סימני קריאה!! אנגלית מחובר '
    assert result == expected
def test_explain_class():
    """Explaining a single post's classification must write the force-plot image."""
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    sample = pre.preprocess(pd.DataFrame({'id': [1], 'text': ['אני אוהבת אותך']}))
    features = ['post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis']
    # Drop the non-feature 'id' column before explanation.
    matrix = fe.extract_features(sample, features).drop(columns=['id'])
    explanation.explain_class(model, matrix)
    assert os.path.isfile(ROOT + '/source/CyberBullying/outputs/force_plot_post.png')
def test_explain_dataset():
    """Model-level explanation must produce all three expected plot artifacts.

    The three copy-pasted isfile asserts are folded into a loop so a failure
    names the missing artifact instead of pointing at an anonymous assert.
    """
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    tagged_df = pre.preprocess(tagged_df)
    feature_list = ['post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis']
    X = fe.extract_features(tagged_df, feature_list)
    X = X.drop(columns=['id'])  # 'id' is not a feature
    explanation.explain_model(model, X)
    # explain_model is expected to write each of these plots to outputs/.
    for artifact in ('dependence_plot.png', 'summary_plot_bar.png', 'summary_plot.png'):
        path = ROOT + '/source/CyberBullying/outputs/' + artifact
        assert os.path.isfile(path), 'missing explanation artifact: {}'.format(artifact)
def test_explainability():
    """Smoke test: both explanation entry points run without raising.

    Attaches the original exception to the failure message instead of the
    uninformative 'Unexpected error..'.
    """
    try:
        model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
        tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
        tagged_df = pre.preprocess(tagged_df)
        X = fe.extract_features(tagged_df, FEATURE_LIST)
        X = X.drop(columns=['id'])  # 'id' is not a feature
        explanation.explain_model(model, X)
        explanation.explain_class(model, X)
    except Exception as err:
        pytest.fail('Unexpected error: {}'.format(err))
def test_correct_classification():
    """Predicting on a benign Hebrew post must yield a numpy float probability.

    Fix: `if my_prob is None: assert False` replaced with a direct assertion —
    the old form is both unidiomatic and silently stripped under `python -O`.
    """
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    rf_obj = rf.RandomForest()
    rf_obj.model = model
    post = 'אני אוהבת אותך'
    tagged_df = pd.DataFrame({'id': [1], 'text': [post]})
    tagged_df = pre.preprocess(tagged_df)
    feature_list = ['post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis']
    X = fe.extract_features(tagged_df, feature_list)
    X = X.drop(columns=['id'])  # 'id' is not a feature
    y_prob_rf = rf_obj.predict(X)
    my_prob = y_prob_rf[0]
    assert my_prob is not None
    assert isinstance(my_prob, numpy.float64)
def test_performance():
    """Performance report must contain f-score, recall and precision keys.

    Fix: `key in d.keys()` replaced with the idiomatic `key in d` (and the
    three repeated lookups folded into one loop).
    """
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    rf_obj = rf.RandomForest()
    rf_obj.model = model
    df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    df = pre.preprocess(df)
    X = fe.extract_features(df, FEATURE_LIST)
    X = X.drop(columns=['id'])  # 'id' is not a feature
    y = (df['cb_level'] == 3).astype(int)
    y_prob_rf = rf_obj.predict(X)
    # Threshold the predicted probabilities at 0.5 to get hard labels.
    pred = np.where(y_prob_rf > 0.5, 1, 0)
    performance = per.get_performances(y, pred)
    for metric in ('f-score', 'recall', 'precision'):
        assert metric in performance, 'missing metric: {}'.format(metric)
# Module-level training pipeline: load tagged data, preprocess, extract
# features, then split for training. Runs on import — the names bound here
# (logger, X_train, X_test, y_train, y_test, ...) are presumably consumed
# further down the file; verify before moving this into a function.
""" Creating the logger """
logger = Logger.get_logger_instance()
""" get tagged df """
# NOTE(review): unlike the tests above, no path is passed here — assumes
# read_to_df has a default pointing at the Vigo dataset; confirm.
tagged_df = utils.read_to_df()  # Vigo data
# tagged_df = utils.create_csv_from_keepers_files()  # Keepers data
""" Run a pre-processing function on the tagged data """
tagged_df = pre.preprocess(tagged_df)
""" Run a extract features function on the clean and tagged data """
feature_list = ['post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis']
# Route feature-extraction artifacts into this run's log folder.
fe.folder_name = logger.folder_name
X = fe.extract_features(tagged_df, feature_list)
logger.write_features(feature_list)
# Binary label: cb_level == 3 marks the positive (bullying) class.
y = (tagged_df['cb_level'] == 3).astype(int)
X = X.drop(columns=['id'])  # 'id' is an identifier, not a feature
""" Split data to train and test """
# NOTE(review): no random_state — the split is non-deterministic across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
def test_empty_dataset():
    """preprocess must raise ValueError on an empty frame with wrong columns.

    Fix: the DataFrame is now built *outside* the raises-block, so a
    ValueError raised during construction could not falsely satisfy the test.
    """
    df = pd.DataFrame([], columns=['not', 'the', 'right', 'columns'])
    with pytest.raises(ValueError):
        pre.preprocess(df)
def test_incorrect_column_dataset():
    """preprocess must raise ValueError when the columns are wrong.

    Fix: the DataFrame is now built *outside* the raises-block, so a
    ValueError raised during construction could not falsely satisfy the test.
    """
    df = pd.DataFrame([[0, 0, 0, 0]], columns=['not', 'the', 'right', 'columns'])
    with pytest.raises(ValueError):
        pre.preprocess(df)
def test_preprocessing():
    """Smoke test: preprocessing the tagged dataset must return a DataFrame.

    Unexpected exceptions fail with the original error attached; the type
    assertion sits outside the try so an AssertionError is reported as such.
    """
    try:
        tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
        result = pre.preprocess(tagged_df)
    except Exception as err:
        pytest.fail('Unexpected error: {}'.format(err))
    assert isinstance(result, pd.DataFrame)