Example #1
import os

import parlai.core.build_data as build_data


def build(opt):
    dpath = os.path.join(opt['datapath'], 'wizard_of_wikipedia')
    version = '1.0'
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        build_data.mark_done(dpath, version)
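Each of these build snippets iterates over a module-level RESOURCES list defined near the top of the task's build.py. In ParlAI those entries are parlai.core.build_data.DownloadableFile objects; a minimal sketch of such a list, with a placeholder URL and checksum standing in for the real values:

from parlai.core.build_data import DownloadableFile

RESOURCES = [
    DownloadableFile(
        'http://example.com/wizard_of_wikipedia.tgz',  # placeholder URL
        'wizard_of_wikipedia.tgz',  # local file name to save as
        'replace-with-the-real-sha256-checksum',  # placeholder checksum
        zipped=True,  # unpack the archive after downloading
    )
]

download_file(dpath) then fetches the URL into dpath, verifies the checksum, and extracts the archive when zipped=True.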
Example #2
def build(opt):
    dpath = os.path.join(opt['datapath'], 'empatheticdialogues')
    version = '1.0'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
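A teacher typically triggers build(opt) through a small _path helper before opening any data files; a minimal sketch of that common ParlAI pattern (the filename argument here is hypothetical):

def _path(opt, filename):
    # Download and extract on first use; a no-op once the .built marker exists.
    build(opt)
    return os.path.join(opt['datapath'], 'empatheticdialogues', filename)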
Example #3
def build(datapath):
    version = 'v1.0'
    dpath = os.path.join(datapath, 'dialogue_safety')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
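Note the signature difference: this variant takes the datapath string directly rather than the full opt dict, so callers pass opt['datapath']. A quick sketch, assuming the default ParlaiParser options:

from parlai.core.params import ParlaiParser

opt = ParlaiParser().parse_args([])
build(opt['datapath'])  # populates <datapath>/dialogue_safety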
Example #4
def build(opt):
    dpath = os.path.join(opt['datapath'], 'blended_skill_talk')
    version = 'v1.4'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Format it for use with ParlAIDialogTeacher
        _create_parlai_format(dpath)

        # Mark the data as built
        build_data.mark_done(dpath, version_string=version)
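_create_parlai_format (defined alongside build in the task module) rewrites the downloaded files into the plain-text format that ParlAIDialogTeacher consumes: one turn per line, tab-separated field:value pairs, with episode_done:True closing each episode. A simplified sketch of writing that format, with made-up utterances and a hypothetical helper name (the real function also handles the blended_skill_talk-specific fields and escaping):

def _write_parlai_dialog(out_path, episodes):
    # episodes: list of episodes, each a list of (text, label) turns
    with open(out_path, 'w') as f:
        for episode in episodes:
            for i, (text, label) in enumerate(episode):
                end = '\tepisode_done:True' if i == len(episode) - 1 else ''
                f.write('text:' + text + '\tlabels:' + label + end + '\n')

_write_parlai_dialog('train.txt', [[('hi!', 'hello, how are you?')]])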
Example #5
    def load(self, path):
        """
        Load from a given path.
        """
        opt = self.opt
        mode = opt.get('image_mode', 'raw')
        is_zip = False
        if mode is None or mode == 'no_image_model':
            # don't need to load images
            return None
        elif '.zip' in path:
            # assume format path/to/file.zip/image_name.jpg
            is_zip = True
            sep = path.index('.zip') + 4
            zipname = path[:sep]
            file_name = path[sep + 1 :]
            path = ZipFile(zipname, 'r').open(file_name)
            task = opt['task']
            prepath = os.path.join(opt['datapath'], task)
            # drop the '.zip' suffix by slicing; str.strip('.zip') removes
            # any of the characters '.', 'z', 'i', 'p' from both ends and
            # would mangle names such as 'pics.zip'
            imagefn = ''.join(zipname[: -len('.zip')].split('/')[-2:]) + path.name
        if mode == 'raw':
            # raw just returns RGB values
            return Image.open(path).convert('RGB')
        elif mode == 'ascii':
            # convert images to ascii ¯\_(ツ)_/¯
            return self._img_to_ascii(path)
        else:
            # otherwise, look for a preprocessed version under the 'mode' directory
            if not is_zip:
                prepath, imagefn = os.path.split(path)
            dpath = os.path.join(prepath, mode)
            if not os.path.exists(dpath):
                build_data.make_dir(dpath)
            imagefn = imagefn.split('.')[0]
            new_path = os.path.join(prepath, mode, imagefn)
            if not os.path.isfile(new_path):
                return self.extract(Image.open(path).convert('RGB'), new_path)
            else:
                return self.torch.load(new_path)
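Given the path convention handled in the '.zip' branch, a call might look like the following (task name and paths are made up; ImageLoader is the class this load method belongs to, from parlai.core.image_featurizers). With image_mode 'raw' the loader returns a PIL image directly:

opt = {'image_mode': 'raw', 'task': 'my_task', 'datapath': '/tmp/data'}
loader = ImageLoader(opt)
img = loader.load('/tmp/data/my_task/images.zip/photo_001.jpg')
print(img.size)  # a PIL Image in RGB mode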
Example #6
    def build(self, opt):
        self._get_data()

        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                'Please install pandas by running `pip install pandas`')

        version = 'v1.0'
        read_path = self.data_path
        if not build_data.built(self.data_path, version):
            print('[building data from: ' + read_path + ']')
            build_data.make_dir(self.data_path)
            # Read in data
            train = pd.read_csv(os.path.join(read_path, 'train.csv'))
            test = pd.read_csv(os.path.join(read_path, 'test.csv'))
            test_labels = pd.read_csv(
                os.path.join(read_path, 'test_labels.csv'))

            # Labels for test data; value of -1 indicates it was not used for scoring
            test_labels = test_labels[(test_labels.toxic != -1)
                                      & (test_labels.severe_toxic != -1)
                                      & (test_labels.obscene != -1)
                                      & (test_labels.threat != -1)
                                      & (test_labels.insult != -1)
                                      & (test_labels.identity_hate != -1)]

            # Merge test data with labels
            test = pd.merge(test_labels, test, on='id')

            # Split 10% of train data to be valid
            test['data_type'] = 'test'
            train['data_type'] = 'train'
            valid_set = train.sample(frac=0.1, random_state=42)
            valid_set['data_type'] = 'valid'
            train.update(valid_set)

            # Combine test and train into one data frame
            total_data = pd.concat([test, train],
                                   ignore_index=True,
                                   sort=False)
            # Rename comment_text to text for the act dict
            total_data.rename(columns={'comment_text': 'text'}, inplace=True)

            # Use the different categories to get binary classification
            total_data['sensitive'] = (total_data['severe_toxic'] +
                                       total_data['toxic'] +
                                       total_data['obscene'] +
                                       total_data['threat'] +
                                       total_data['insult'] +
                                       total_data['identity_hate'])

            total_data.loc[total_data['sensitive'] < 1, 'is_sensitive'] = 0
            total_data.loc[total_data['sensitive'] >= 1, 'is_sensitive'] = 1

            # Drop unnecessary column
            total_data = total_data.drop(columns=['id'])

            self.data_to_json(total_data, 'wiki-toxic-comments-default.json')

            # Partition 80/10/10 according to arXiv:1811.12900 [cs.CL]
            # 90 train 10 test in paper
            # split based on command line flag
            original_train = total_data[
                (total_data['data_type'] == 'valid')
                | (total_data['data_type'] == 'train')].copy()
            l_td = len(original_train)

            # Assign through a single .iloc call with the column position;
            # chained indexing like .iloc[...]['data_type'] = ... writes to
            # a copy and can silently leave the frame unchanged.
            dt_col = original_train.columns.get_loc('data_type')
            original_train.iloc[:int(0.8 * l_td), dt_col] = 'train'
            original_train.iloc[int(0.8 * l_td):int(0.9 * l_td), dt_col] = 'test'
            original_train.iloc[int(0.9 * l_td):, dt_col] = 'valid'

            self.data_to_json(original_train,
                              'wiki-toxic-comments-partition.json')

            # Merge the 3 files to get a list of dicts as follows:
            # [
            #     {
            #         'toxic': 0 or 1 ,
            #         'severe_toxic': 0 or 1,
            #         'obscene': 0 or 1,
            #         'threat': 0 or 1,
            #         'insult': 0 or 1,
            #         'identity_hate': 0 or 1,
            #         'text': <comments>,
            #         'data_type': test/valid/train,
            #         'sensitive': 0.0,
            #         'is_sensitive': 0/1
            #     },
            #   ...
            # ]

            build_data.mark_done(self.data_path, version)
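Assuming data_to_json dumps the frame as the list of record dicts sketched in the comment above (the method is defined elsewhere on this class, so the exact on-disk layout is an assumption), the partitioned file could then be consumed like this:

import json
import os

data_path = '/tmp/data/wiki_toxic_comments'  # hypothetical; use self.data_path
with open(os.path.join(data_path, 'wiki-toxic-comments-partition.json')) as f:
    records = json.load(f)
train = [r for r in records if r['data_type'] == 'train']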