def test_load_data(self):
    """Test that each loading helper returns a pandas DataFrame.

    Covers load_data, both frames returned by get_datasets, and the
    frame produced by concatenate_datasets.
    """
    data_dir = os.path.join(os.path.pardir, 'src', 'data')
    self.assertIsInstance(
        load_data(os.path.join(data_dir, 'tweets.csv')),
        pd.core.frame.DataFrame)
    # Call get_datasets once and unpack — the original called it twice
    # with identical arguments, re-reading both CSV files needlessly.
    df2, df3 = get_datasets(
        os.path.join(data_dir, 'labeled_data.csv'),
        os.path.join(data_dir,
                     'hatespeech_text_label_vote_RESTRICTED_100K.csv'))
    self.assertIsInstance(df2, pd.core.frame.DataFrame)
    self.assertIsInstance(df3, pd.core.frame.DataFrame)
    # self.df2 / self.df3 are prepared by setUp elsewhere in this class.
    self.assertIsInstance(
        concatenate_datasets(
            os.path.join(data_dir, 'tweets.csv'), self.df2, self.df3),
        pd.core.frame.DataFrame)
def load_labeled_dataset(): """ Concatenate the data sets from csv-files (labeled_data.csv, hatespeech_text_label_vote_RESTRICTED_100K.csv, tweets.csv) together and return it as a pandas dataframe. Returns ------- df_concatenated: Pandas dataframe The dataframe containing all data from the mentioned csv-files. """ # if tweets not already loaded from TwitterAPI if not os.path.isfile(os.path.join('data', 'tweets.csv')): # load dataset from https://github.com/zeerakw/hatespeech, loads tweets via tweet id df = get_tweets_by_id(config, os.path.join('data', 'NAACL_SRW_2016.csv')) # load datasets from # https://github.com/t-davidson/hate-speech-and-offensive-language/tree/master/data (df2) # and https://github.com/jaeyk/intersectional-bias-in-ml (df3) df2, df3 = get_datasets( os.path.join('data', 'labeled_data.csv'), os.path.join('data', 'hatespeech_text_label_vote_RESTRICTED_100K.csv')) df_concatenated = concatenate_datasets(os.path.join('data', 'tweets.csv'), df2, df3) return df_concatenated
def setUp(self):
    """Prepare the dataframes shared by the tests in this case.

    Loads the tweet data, the two auxiliary labeled datasets, and
    their concatenation from ../src/data.
    """
    data_dir = os.path.join(os.path.pardir, 'src', 'data')
    tweets_csv = os.path.join(data_dir, 'tweets.csv')
    self.df = load_data(tweets_csv)
    self.df2, self.df3 = get_datasets(
        os.path.join(data_dir, 'labeled_data.csv'),
        os.path.join(data_dir,
                     'hatespeech_text_label_vote_RESTRICTED_100K.csv'))
    self.df_concatenated = concatenate_datasets(
        tweets_csv, self.df2, self.df3)
def setUp(self):
    """Prepare the concatenated dataset and a train/test split.

    Loads and merges the source dataframes from ../src/data, then
    splits on the 'text' / 'hate_speech' columns with a 0.25 test size.
    """
    data_dir = os.path.join(os.path.pardir, 'src', 'data')
    tweets_csv = os.path.join(data_dir, 'tweets.csv')
    self.df = load_data(tweets_csv)
    self.df2, self.df3 = get_datasets(
        os.path.join(data_dir, 'labeled_data.csv'),
        os.path.join(data_dir,
                     'hatespeech_text_label_vote_RESTRICTED_100K.csv'))
    self.df_concatenated = concatenate_datasets(
        tweets_csv, self.df2, self.df3)
    split = split_data(self.df_concatenated, 'text', 'hate_speech', 0.25)
    (self.training_data, self.testing_data,
     self.training_y, self.testing_y) = split
def setUp(self):
    """Prepare the dataframes plus a small vectorizer fixture.

    Besides the datasets from ../src/data, builds a four-sentence
    corpus and the expected token-count matrix for it.
    """
    data_dir = os.path.join(os.path.pardir, 'src', 'data')
    tweets_csv = os.path.join(data_dir, 'tweets.csv')
    self.df = load_data(tweets_csv)
    self.df2, self.df3 = get_datasets(
        os.path.join(data_dir, 'labeled_data.csv'),
        os.path.join(data_dir,
                     'hatespeech_text_label_vote_RESTRICTED_100K.csv'))
    self.df_concatenated = concatenate_datasets(
        tweets_csv, self.df2, self.df3)
    # Tiny fixed corpus with a known vocabulary.
    documents = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    self.test_set = pd.DataFrame(documents, columns=["text"])
    # Expected count-vectorizer output for the corpus above.
    self.test_result_count = [
        [0, 1, 1, 1, 0, 0, 1, 0, 1],
        [0, 2, 0, 1, 0, 1, 1, 0, 1],
        [1, 0, 0, 1, 1, 0, 1, 1, 1],
        [0, 1, 1, 1, 0, 0, 1, 0, 1],
    ]