Beispiel #1
0
    def prepare_dataset_2_slo(input_file_path: str, lang: str, type: str):
        """
        Reads the data from dataset_2. Converts it into format Text,Label and writes the contents to file.

        Rows whose 'Class' is 0 or 1 are written with Label 0. Rows whose
        'Class' is 2 or 3 take the value of their 'Type' column as Label, with
        missing values replaced by 12.

        NOTE(review): after dropping one column, the two remaining columns are
        renamed to Text/Label, so the input is assumed to have exactly three
        columns (one text column plus 'Class' and 'Type') -- confirm against
        the source file.

        Args:
            input_file_path: Path handed to utilities.read_file_contents_pd.
            lang: Path component of the output directory.
            type: Path component of the output directory.
        """

        data = utilities.read_file_contents_pd(input_file_path)

        # Work on explicit copies: both selections are modified below, and
        # mutating the result of data.loc[...] directly triggers pandas'
        # SettingWithCopyWarning.
        non_hate = data.loc[data['Class'].isin([0, 1])].copy()
        del non_hate['Type']
        non_hate.columns = ['Text', 'Label']
        non_hate['Label'] = 0

        hate = data.loc[data['Class'].isin([2, 3])].copy()
        del hate['Class']
        hate.columns = ['Text', 'Label']
        # Rows without a 'Type' value get the label 12.
        hate['Label'] = hate['Label'].fillna(12)

        non_hate['Label'] = non_hate['Label'].astype(int)
        hate['Label'] = hate['Label'].astype(int)

        comments = []
        comments.extend(non_hate.values)
        comments.extend(hate.values)

        # Header row goes first in the written file.
        comments.insert(0, ['Text', 'Label'])
        comments = utilities.strip_and_replace_new_lines(comments)
        output_path = os.path.join(rootpath.detect(), 'data',
                                   'structured_data', lang, type, 'dataset_2',
                                   'data.csv')
        utilities.write_to_file(comments, output_path)
        logger.info('Slo Dataset_2 has been prepared')
Beispiel #2
0
    def prepare_dataset_6_eng(input_file_path: str, lang: str, type: str):
        """
        Reads the data from dataset_6. Converts it into format Text,Label(0-Appearance,1-Intelligence,2-Political,3-Racial,4-Sexual) and writes the contents to file.

        Every file in the dataset's 'tweets_dataset' directory is read; the
        part of the file name before the first '.' selects the label. Only
        rows whose second field equals "yes" (case-insensitive) are kept, and
        their first field becomes the Text.

        Args:
            input_file_path: Not used by this function; the source directory
                is derived from the project root, lang and type.
            lang: Path component of the source and output directories.
            type: Path component of the source and output directories.

        Raises:
            KeyError: If a kept row comes from a file whose name is not a key
                of the label map.
        """

        # Keys are compared against the file name up to the first '.', so the
        # spelling 'Sextual Data' presumably mirrors the file on disk and must
        # not be "corrected" here.
        class_text_number_map = {
            'Appearance Data': 0,
            'Intelligence Data': 1,
            'Political Data': 2,
            'Racial Data': 3,
            'Sextual Data': 4
        }

        directory = os.path.join(rootpath.detect(), 'data', 'source_data',
                                 lang, type, 'dataset_6', 'tweets_dataset')

        comments = []
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # row order of the output file reproducible between runs and machines.
        for filename in sorted(os.listdir(directory)):
            _, data = utilities.read_file_contents(
                os.path.join(directory, filename), ',', 'unicode_escape')
            class_text = filename.split(".")[0]
            comments.extend(
                [row[0], class_text_number_map[class_text]]
                for row in data if row[1].lower() == "yes")

        # Header row goes first in the written file.
        comments.insert(0, ['Text', 'Label'])
        comments = utilities.strip_and_replace_new_lines(comments)
        output_path = os.path.join(rootpath.detect(), 'data',
                                   'structured_data', lang, type, 'dataset_6',
                                   'data.csv')
        utilities.write_to_file(comments, output_path)
        logger.info('Eng Dataset_6 has been prepared')
Beispiel #3
0
 def prepare_dataset_1_eng(input_file_path: str, lang: str, type: str):
     """
     Builds the structured Text,Label(1-Hate,0-Non hate) file for dataset_1.

     The input is read with ':' as the delimiter. The first field of each row
     ends up in the Label position and the second field in the Text position;
     the header labels are swapped the same way.
     """
     header, rows = utilities.read_file_contents(input_file_path, ':')
     # Swap the two header labels so they line up with the reordered rows.
     header = [header[1], header[0]]

     records = [header]
     for row in rows:
         # Keep only what follows the first space of the text field, i.e. the
         # leading space-separated token is dropped.
         records.append([row[1].partition(' ')[2], row[0]])

     records = utilities.strip_and_replace_new_lines(records)
     target_dir = os.path.join(rootpath.detect(), 'data', 'structured_data',
                               lang, type, 'dataset_1')
     utilities.write_to_file(records, os.path.join(target_dir, 'data.csv'))
     logger.info("Eng Dataset_1 has been prepared")
Beispiel #4
0
    def prepare_dataset_1_slo(input_file_path: str, lang: str, type: str):
        """
        Copies dataset_1 into the structured Text,Label file.

        The rows returned by utilities.read_file_contents_pd are written
        unchanged (apart from the clean-up done by
        utilities.strip_and_replace_new_lines), preceded by a Text,Label
        header row.
        """
        frame = utilities.read_file_contents_pd(input_file_path)

        # Header first, followed by one entry per row of the frame.
        rows = [['Text', 'Label'], *frame.values]
        rows = utilities.strip_and_replace_new_lines(rows)

        target_dir = os.path.join(rootpath.detect(), 'data',
                                  'structured_data', lang, type, 'dataset_1')
        utilities.write_to_file(rows, os.path.join(target_dir, 'data.csv'))
        logger.info('Slo Dataset_1 has been prepared')
Beispiel #5
0
 def prepare_dataset_3_eng(input_file_path: str, lang: str, type: str):
     """
     Builds the structured Text,Label(1-Hate,0-Non hate) file for dataset_3.

     Each input row is passed to utilities.transform_dataset_2_3 together
     with its `text` and `hate_speech_idx` values. The per-row results are
     flattened into one list of output rows.
     """
     frame = utilities.read_file_contents_pd(input_file_path)

     def _transform(row):
         return utilities.transform_dataset_2_3(row.text, row.hate_speech_idx)

     frame['text'] = frame.apply(_transform, axis=1)

     # Each cell of 'text' now holds several items (presumably [text, label]
     # pairs -- confirm in utilities); append them one after another.
     rows = [['Text', 'Label']]
     for per_row_items in frame['text'].tolist():
         rows.extend(per_row_items)

     rows = utilities.strip_and_replace_new_lines(rows)
     target_dir = os.path.join(rootpath.detect(), 'data', 'structured_data',
                               lang, type, 'dataset_3')
     utilities.write_to_file(rows, os.path.join(target_dir, 'data.csv'))
     logger.info("Eng Dataset_3 has been prepared")
Beispiel #6
0
    def prepare_dataset_4_eng(input_file_path: str, lang: str, type: str):
        """
        Reads the data from dataset_4. Converts it into format Text,Label(1-Hate,0-Non hate) and writes the contents to file.

        The annotations are read from 'annotations_metadata.csv'; for every
        annotation row the text is loaded from 'all_files/<file id>.txt'. A
        row is labelled 1 when its label field equals 'hate', otherwise 0.

        Args:
            input_file_path: Not used by this function; all source paths are
                derived from the project root, lang and type.
            lang: Path component of the source and output directories.
            type: Path component of the source and output directories.

        Raises:
            ValueError: If an annotation row does not have exactly five fields.
        """

        # Resolve the project root and the dataset directories once instead
        # of once per annotation row.
        root = rootpath.detect()
        source_dir = os.path.join(root, 'data', 'source_data', lang, type,
                                  'dataset_4')
        texts_dir = os.path.join(source_dir, 'all_files')

        _, data = utilities.read_file_contents(
            os.path.join(source_dir, 'annotations_metadata.csv'), ',')

        # Header row goes first in the written file.
        comments = [['Text', 'Label']]
        for row in data:
            # Only the file id and the label are needed; the other three
            # fields are unpacked to keep the five-field check.
            file_id, _user, _forum, _cont, label = row
            text = utilities.read_file_contents_txt(
                os.path.join(texts_dir, file_id + '.txt'))
            comments.append([text, 1 if label == 'hate' else 0])

        comments = utilities.strip_and_replace_new_lines(comments)
        output_path = os.path.join(root, 'data', 'structured_data', lang,
                                   type, 'dataset_4', 'data.csv')
        utilities.write_to_file(comments, output_path)
        logger.info("Eng Dataset_4 has been prepared")
Beispiel #7
0
    def prepare_dataset_5_eng(input_file_path: str, lang: str, type: str):
        """
        Reads the data from dataset_5. Converts it into format Text,Label(0-Archaic,1-Class,2-Disability,3-Ethnicity,4-Gender,5-Nationality,6-Religion,7-Sexual Orientation) and writes the contents to file.

        Every file in the dataset's 'downloaded_tweets_dataset' directory is
        read; the part of the file name before the first '_' selects the
        label, and the second field of each row becomes the Text.

        Args:
            input_file_path: Not used by this function; the source directory
                is derived from the project root, lang and type.
            lang: Path component of the source and output directories.
            type: Path component of the source and output directories.

        Raises:
            KeyError: If a row comes from a file whose name prefix is not a
                key of the label map.
        """

        # Keys are the file name prefixes (text before the first '_').
        class_text_number_map = {
            'archaic': 0,
            'class': 1,
            'disability': 2,
            'ethn': 3,
            'gender': 4,
            'nation': 5,
            'rel': 6,
            'sexorient': 7
        }

        directory = os.path.join(rootpath.detect(), 'data', 'source_data',
                                 lang, type, 'dataset_5',
                                 'downloaded_tweets_dataset')

        comments = []
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # row order of the output file reproducible between runs and machines.
        for filename in sorted(os.listdir(directory)):
            _, data = utilities.read_file_contents(
                os.path.join(directory, filename), ',')
            class_text = filename.split("_")[0]
            comments.extend(
                [row[1], class_text_number_map[class_text]] for row in data)

        # Header row goes first in the written file.
        comments.insert(0, ['Text', 'Label'])
        comments = utilities.strip_and_replace_new_lines(comments)
        output_path = os.path.join(rootpath.detect(), 'data',
                                   'structured_data', lang, type, 'dataset_5',
                                   'data.csv')
        utilities.write_to_file(comments, output_path)
        logger.info('Eng Dataset_5 has been prepared')