Example #1
    def write_cleaned_labels(self, labels, options):
        def _create_dir(dir_name):
            if not os.path.isdir(dir_name):
                os.makedirs(dir_name)

        if len(labels) == 0:
            self.logger.info(
                'No labels to write. Maybe the filtering parameters are too strict. Aborting.'
            )
            return
        # write full file
        folder_path = find_folder('4_labels_cleaned')
        option_flags = '_'.join(options)
        f_path = os.path.join(folder_path,
                              'cleaned_labels_{}.csv'.format(option_flags))
        labels[[
            'id', 'text', 'question_id', 'question_tag', 'label_id',
            'label_tag'
        ]].to_csv(f_path, index=False)
        self.logger.info('Successfully wrote {:,} labels to file {}'.format(
            len(labels), f_path))
        # write 1 file per question
        folder_path_by_question = os.path.join(folder_path, 'by_question')
        labels.rename(columns={'label_tag': 'label'}, inplace=True)
        self.logger.info('Writing one file per question...')
        for question_tag, g in labels.groupby('question_tag'):
            f_path = os.path.join(
                folder_path_by_question, question_tag,
                'cleaned_labels_{}_{}.csv'.format(option_flags, question_tag))
            _create_dir(os.path.dirname(f_path))
            g[['id', 'text', 'label']].to_csv(f_path, index=False)
            self.logger.info(
                '... successfully wrote {:,} labels for question {} to file {}'
                .format(len(g), question_tag, f_path))
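
The find_folder helper used throughout these examples is not shown. A minimal sketch, assuming it simply resolves a named pipeline stage folder underneath the project's data directory, could look like this:

import os

def find_folder(folder_name, root='data'):
    # Hypothetical helper: resolve a pipeline stage folder (e.g. '4_labels_cleaned')
    # relative to the project's data root and fail early if it is missing.
    path = os.path.join(root, folder_name)
    if not os.path.isdir(path):
        raise FileNotFoundError('Folder {} not found under {}'.format(folder_name, root))
    return path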
Example #2
 def cleaned_annotation_data(self):
     self.make_title('Cleaned annotation data')
     f_name = os.path.join(find_folder('4_labels_cleaned'),
                           'cleaned_labels.csv')
     num_annotations = 0
     if os.path.isfile(f_name):
         num_annotations = sum(1 for line in open(f_name))
     self.add_key_value('Number of cleaned annotations', num_annotations)
     self.text += '\n'
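
The make_title and add_key_value methods called in Examples #2 through #6 (and #8) belong to the surrounding report class and are not shown. A minimal sketch, assuming they append formatted lines to self.text, might be:

def make_title(self, title):
    # Hypothetical report helper: underline the title and append it to the report text.
    self.text += '{}\n{}\n'.format(title, '-' * len(title))

def add_key_value(self, key, value, with_percent=None):
    # Hypothetical report helper: append a "key: value" line, optionally with a percentage.
    if with_percent is None:
        self.text += '{}: {:,}\n'.format(key, value)
    else:
        self.text += '{}: {:,} ({:.1f}%)\n'.format(key, value, with_percent)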
Example #3
 def annotation_data(self):
     self.make_title('Annotation data')
     self.text += 'Number of annotation results:\n'
     for mode in ['public', 'local', 'mturk', 'other']:
         f_names = glob.glob(
             os.path.join(find_folder('3_labelled'), mode, '*.csv'))
         num_annotations = 0
         for f_name in f_names:
             num_annotations += sum(1 for line in open(f_name))
         self.add_key_value('- {}'.format(mode), num_annotations)
     self.text += '\n'
Example #4
 def sampled_data(self):
     self.make_title('Sampled data')
     f_names = glob.glob(
         os.path.join(find_folder('2_sampled'), 'sampled_*_*_*.csv'))
     num_sample_files = len(f_names)
     num_tweets_sampled = 0
     for f_name in f_names:
         num_tweets_sampled += sum(1 for line in open(f_name))
     self.add_key_value(
         'Number of tweets sampled ({:,} file(s))'.format(num_sample_files),
         num_tweets_sampled)
     self.text += '\n'
Example #5
 def parsed_data(self):
     num_lines = 0
     for dtype in ['original', 'anonymized', 'encrypted']:
         path = os.path.join(find_folder('1_parsed'),
                             'parsed_{}.csv'.format(dtype))
         if os.path.isfile(path):
             num_lines = sum(1 for line in open(path))
             break
     self.make_title('Parsed data')
     if num_lines > 0:
         self.add_key_value('Num tweets in parsed data', num_lines - 1)
     else:
         self.text += 'No parsed data present.\n'
     self.text += '\n'
Example #6
 def raw_data(self):
     raw_data_folder = find_folder('0_raw')
     num_historic = len(
         glob.glob(os.path.join(raw_data_folder, 'historic', '*.json*')))
     num_streaming = len(
         glob.glob(
             os.path.join(raw_data_folder, 'streaming', '**', '*.json*')))
     total = num_historic + num_streaming
     self.make_title('Raw data')
     if total > 0:
         self.add_key_value('Number of files in raw data', total)
         self.add_key_value('- historic', num_historic)
         self.add_key_value('- streaming', num_streaming)
     else:
         self.text += 'No raw data present.\n'
     self.text += '\n'
Example #7
    def translate(self):
        parser = ArgParseDefault(description='Translate prepared text')
        logger = logging.getLogger(__name__)
        parser.add_argument('-s', '--source', dest='source', choices=["EN", "DE", "FR", "ES", "PT", "IT", "NL", "PL", "RU"], required=True, help='Source language')
        parser.add_argument('-t', '--target', dest='target', choices=["EN", "DE", "FR", "ES", "PT", "IT", "NL", "PL", "RU"], required=True, help='Target language')
        parser.add_argument('--auth-key', dest='auth_key', type=str, required=True, help='DeepL auth key')
        args = parser.parse_args(sys.argv[2:])
        logger.info('Translating from source language {} to target language {}...'.format(args.source, args.target))
        # load data
        folder = os.path.join(find_folder('other'), 'translations')
        if not os.path.isdir(folder):
            os.makedirs(folder)
        f_paths = glob.glob(os.path.join(folder, 'prepare_*.csv')) 
        if len(f_paths) == 0:
            raise FileNotFoundError('No prepare_*.csv file(s) found in folder {}'.format(folder))
        elif len(f_paths) > 1:
            raise ValueError('Found {} prepare_*.csv files in folder {}.'.format(len(f_paths), folder))

        logger.info('Loading prepared data...')
        df = pd.read_csv(f_paths[0], dtype={'id': str})
        df_len = df.text.apply(len)
        costs = df_len.sum()/500 * 0.01
        logger.info('About to translate {:,} characters with an estimated cost of EUR {:.2f}.'.format(df_len.sum(), costs))
        yes_no = input('Continue to translate? (yes/no)\n')
        if not (yes_no == 'y' or yes_no == 'yes'):
            logger.info('Aborting...')
            return

        # params
        base_url = 'https://api.deepl.com/v2/translate'
        other_params = {'auth_key': args.auth_key, 'target_lang': args.target, 'source_lang': args.source}
        chunk_size = 20
        def chunks(total_len, n):
            for i in range(0, total_len, n):
                yield i, i + n - 1
        df['translation'] = ''
        for start, stop in tqdm(chunks(len(df), chunk_size), total=len(range(0, len(df), chunk_size))):
            texts = df.loc[start:stop, 'text'].tolist()
            res = requests.get(base_url, params={'text': texts, **other_params})
            if not res.ok:
                raise Exception('Unable to retrieve data from DeepL. Error status code {}... Aborting'.format(res.status_code))
            res = res.json()
            df_tr = pd.DataFrame(res['translations'])
            df.loc[start:stop, 'translation'] = df_tr['text'].values
        f_path = os.path.join(folder, 'translation_{}.csv'.format(get_df_hash(df)[:5]))
        print('Writing {:,} records to file {}...'.format(len(df), f_path))
        df.to_csv(f_path, index=False)
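
The get_df_hash helper used above (and in Example #9) to build unique output file names is not part of the snippet. One plausible sketch, assuming it hashes the DataFrame's contents, is:

import hashlib
from pandas.util import hash_pandas_object

def get_df_hash(df):
    # Hypothetical helper: derive a stable hex digest from the DataFrame's row hashes.
    row_hashes = hash_pandas_object(df, index=True).values
    return hashlib.sha256(row_hashes.tobytes()).hexdigest()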
Example #8
 def annotation_cleaned(self):
     self.header('Cleaned annotations')
     f_path = os.path.join(find_folder('4_labels_cleaned'),
                           'cleaned_labels.csv')
     try:
         self.logger.info('Reading cleaned annotation data...')
         df = pd.read_csv(f_path)
     except FileNotFoundError:
         self.text += 'No cleaned annotations present.'
         return
     self.add_key_value('- Num annotation results', len(df))
     self.add_key_value('- Num tweets annotated', len(df.id.unique()))
     self.text += '\n\n'
     for question_tag, q_group in df.groupby('question_tag'):
         self.make_title('Question {}'.format(question_tag))
         total = q_group.count()['id']
         for label_tag, q_a_group in q_group.groupby('label_tag'):
             label_tag_count = q_a_group.count()['id']
             self.add_key_value('- {}'.format(label_tag),
                                label_tag_count,
                                with_percent=100 * label_tag_count / total)
         self.text += '\n\n'
Example #9
 def prepare(self):
     parser = ArgParseDefault(description='Prepare text to be translated')
     logger = logging.getLogger(__name__)
     parser.add_argument('--geo-only', dest='geo_only', action='store_true', help='Only use geo-tagged data')
     parser.add_argument('-d', '--dtype', dest='dtype', choices=['original', 'anonymized', 'encrypted'], default='anonymized', help='Data source type')
     parser.add_argument('-l', '--limit', dest='limit', type=int, default=-1, help='If set, only extract random subsample')
     args = parser.parse_args(sys.argv[2:])
     # load data
     logger.info('Loading data...')
     df = get_parsed_data(dtype=args.dtype, usecols=['id', 'text', 'is_duplicate', 'has_place', 'has_coordinates', 'is_retweet'])
     # filter
     if args.geo_only:
         df = df[(df.has_place | df.has_coordinates) & (~df.is_duplicate) & (~df.is_retweet)]
     else:
         df = df[(~df.is_duplicate) & (~df.is_retweet)]
     if args.limit > 0:
         df = df.sample(args.limit)
     # write data
     folder = os.path.join(find_folder('other'), 'translations')
     if not os.path.isdir(folder):
         os.makedirs(folder)
     f_path = os.path.join(folder, 'prepare_{}.csv'.format(get_df_hash(df)[:5]))
     logger.info('Writing {:,} records to file {}...'.format(len(df), f_path))
     df[['id', 'text']].to_csv(f_path, index=False)
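
get_parsed_data is assumed rather than shown. A minimal sketch, given that Example #5 stores parsed data as parsed_<dtype>.csv under data/1_parsed and that Example #11 passes a num_files limit, might read the matching CSV files and concatenate them:

import glob
import os
import pandas as pd

def get_parsed_data(dtype='anonymized', usecols=None, num_files=None):
    # Hypothetical loader: collect parsed_<dtype>*.csv files, optionally limit how many
    # are read, and return them as a single DataFrame restricted to the requested columns.
    f_paths = sorted(glob.glob(os.path.join(find_folder('1_parsed'), 'parsed_{}*.csv'.format(dtype))))
    if num_files is not None:
        f_paths = f_paths[:num_files]
    frames = [pd.read_csv(f, usecols=usecols, dtype={'id': str}) for f in f_paths]
    return pd.concat(frames, ignore_index=True)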
Example #10
 def write_sample(self,
                  sample,
                  mode,
                  columns=['id', 'text'],
                  size='',
                  min_date=None,
                  max_date=None,
                  flags=''):
     if len(sample) == 0:
         logger.warning('No sample files written. Aborting.')
         return
     timestamp = time.strftime('%Y-%m-%d_%H-%M-%S')
     min_date_str = ''
     if min_date is not None:
         min_date_str = '_min_date_{}'.format(min_date)
     max_date_str = ''
     if max_date is not None:
         max_date_str = '_max_date_{}'.format(max_date)
     f_name = 'sampled_{mode}_{len_sample}_{size}_{seed}{min_date}{max_date}_created_{timestamp}{flags}.csv'.format(
         mode=mode,
         len_sample=len(sample),
         size=size,
         seed=self.seed,
         timestamp=timestamp,
         min_date=min_date_str,
         max_date=max_date_str,
         flags=flags)
     full_path = os.path.join(find_folder('2_sampled'), f_name)
     logger.info('Writing file {} ...'.format(full_path))
     if 'all' in columns:
         sample.to_csv(full_path, encoding='utf8')
     else:
         sample[columns].to_csv(full_path,
                                encoding='utf8',
                                index=False,
                                header=False)
Example #11
def main(args):
    """
    This script creates new files in preprocess/data/other/pretrain containing tweets to be used for pretraining language models.
    It excludes training data and duplicates.
    """
    # load data
    logger.info('Reading data...')
    usecols = [
        'id', 'text', 'lang', 'token_count', 'is_retweet', 'contains_keywords'
    ]
    df = get_parsed_data(usecols=usecols, num_files=args.num_files)
    logger.info(f'...loaded a total of {len(df):,} tweets')

    # Filter retweets
    if 'retweets' in args.filters:
        logger.info(f'Filter retweets...')
        num_before = len(df)
        df = df[~df.is_retweet]
        num_after = len(df)
        logger.info(
            f'... {num_after:,} remaining (removed {num_before-num_after:,})')

    # Filtering by keyword
    if 'contains_keywords' in args.filters:
        logger.info(f'Filter contains_keywords...')
        num_before = len(df)
        df = df[df.contains_keywords]
        num_after = len(df)
        logger.info(
            f'... {num_after:,} remaining (removed {num_before-num_after:,})')

    # filter lang
    if args.lang is not None:
        logger.info(f'Filter lang {args.lang}...')
        num_before = len(df)
        df = df[df.lang == args.lang]
        num_after = len(df)
        logger.info(
            f'... {num_after:,} remaining (removed {num_before-num_after:,})')

    # filter min tokens
    if args.min_tokens > 0:
        logger.info(f'Filter has >={args.min_tokens} tokens...')
        num_before = len(df)
        df = df[df.token_count >= args.min_tokens]
        num_after = len(df)
        logger.info(
            f'... {num_after:,} remaining (removed {num_before-num_after:,})')

    # generate text column to filter for duplicates
    logger.info('Remove duplicates...')
    num_before = len(df)
    df.loc[:, 'text_cleared'] = df.text.apply(generate_text_cleared)
    df = df.drop_duplicates(subset=['text_cleared'])
    num_after = len(df)
    logger.info(
        f'... {num_after:,} remaining (removed {num_before-num_after:,})')

    # shuffle
    logger.info('Shuffle...')
    df = df.sample(frac=1)

    # write output file
    num_lines = len(df)
    logger.info(f'Collected total of {num_lines:,} examples')
    num_train = max(int(0.8 * num_lines), num_lines - int(2e5))
    ts = datetime.datetime.now().strftime('%Y_%m_%d-%H-%M_%s')
    for (_s, _e), _type in zip([(None, num_train), (num_train, None)],
                               ['train', 'dev']):
        _df = df[_s:_e]
        logger.info(f'Writing {len(_df):,} examples for {_type} data...')
        output_folder = os.path.join(find_folder('other'), 'pretrain',
                                     f'run_{ts}', _type)
        if not os.path.isdir(output_folder):
            os.makedirs(output_folder)
        if args.no_parallel:
            num_cpus = 1
        else:
            num_cpus = max(multiprocessing.cpu_count() - 1, 1)
        parallel = joblib.Parallel(n_jobs=num_cpus)
        write_output_file_delayed = joblib.delayed(write_output_file)
        res = parallel((write_output_file_delayed(
            _df.iloc[i:(i + args.max_examples_per_file)],
            os.path.join(output_folder, f'pretrain_{_type}_{j:03}.txt'))
                        for j, i in enumerate(
                            trange(0, len(_df), args.max_examples_per_file))))
        logger.info(
            f'Successfully wrote {len(res):,} file(s) to folder {output_folder}'
        )
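
write_output_file and generate_text_cleared are referenced above but not defined in this snippet. A minimal sketch, assuming one tweet text per line in the pretraining files and a simple whitespace normalisation for duplicate detection, could be:

def generate_text_cleared(text):
    # Hypothetical normaliser used only for duplicate detection: lowercase and collapse whitespace.
    return ' '.join(text.lower().split())

def write_output_file(df_chunk, f_path):
    # Hypothetical writer: one tweet text per line, newlines flattened so each example stays on one line.
    with open(f_path, 'w', encoding='utf8') as f:
        for text in df_chunk['text']:
            f.write(text.replace('\n', ' ').strip() + '\n')
    return f_path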
Example #12
def train_dev_test_split(question='sentiment',
                         dev_size=0.1,
                         test_size=0.2,
                         seed=42,
                         name='',
                         balanced_labels=False,
                         all_questions=False,
                         label_tags=[],
                         labelled_as=None,
                         has_label=''):
    """Splits cleaned labelled data into training, dev and test set"""
    def _filter_for_label_balance(df):
        """Performs undersampling for overrepresanted label classes"""
        counts = Counter(df['label'])
        min_count = min(counts.values())
        _df = pd.DataFrame()
        for l in counts.keys():
            _df = pd.concat([_df, df[df['label'] == l].sample(min_count)])
        return _df

    questions = [question]
    np.random.seed(seed)
    if name == '':
        f_path = os.path.join(find_folder('4_labels_cleaned'),
                              'cleaned_labels*.csv')
        annotation_files = glob.glob(f_path)
        if len(annotation_files) == 0:
            raise FileNotFoundError(
                f'No cleaned label files could be found with the pattern {f_path}'
            )
        elif len(annotation_files) > 1:
            raise ValueError(
                f'Found {len(annotation_files)} different files for cleaned labels. Provide "name" argument to specify which.'
            )
        name = os.path.basename(annotation_files[0]).split('.csv')[0]
    if all_questions:
        df = get_cleaned_labelled_data(name=name)
        questions = df['question_tag'].unique()
    for question in questions:
        df = get_cleaned_labelled_data(question=question,
                                       name=name,
                                       has_label=has_label)
        if len(df) == 0:
            logger.warning(
                'No labelled data could be found for question {} under these parameters.'
                .format(question))
            continue
        if balanced_labels:
            df = _filter_for_label_balance(df)
        flags = '{}{}'.format('_' + name if name != '' else '',
                              '_balanced' if balanced_labels else '')
        if len(label_tags) > 0:
            df = df[df['label'].isin(label_tags)]
            flags += '_labels_{}'.format('_'.join(label_tags))
        if len(has_label) > 0:
            has_label_flag = 'has_label_{}'.format(
                has_label.replace('|', '_or_').replace(',', '_and_'))
            flags += '_' + has_label_flag
            folder_path = os.path.join(find_folder('4_labels_cleaned'),
                                       'other', has_label_flag, question)
        else:
            folder_path = os.path.join(find_folder('4_labels_cleaned'),
                                       'splits', question)
        train, dev, test = np.split(df.sample(frac=1, random_state=seed), [
            int((1 - dev_size - test_size) * len(df)),
            int((1 - test_size) * len(df))
        ])
        if not os.path.isdir(folder_path):
            os.makedirs(folder_path)
        for dtype, data in [['train', train], ['dev', dev], ['test', test]]:
            f_name = f'{dtype}_{question}_split_{len(train)}_{len(dev)}_{len(test)}_seed_{seed}{flags}.csv'
            f_path = os.path.join(folder_path, f_name)
            data.to_csv(f_path, index=None, encoding='utf8')
            logger.info(
                f'Successfully wrote data of {len(data):,} examples to file {f_path}.'
            )
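
A hypothetical invocation of train_dev_test_split, assuming a single cleaned_labels_*.csv file exists under data/4_labels_cleaned, would produce a 70/10/20 split per question; argument values here are illustrative only:

train_dev_test_split(question='sentiment',
                     dev_size=0.1,
                     test_size=0.2,
                     seed=42,
                     balanced_labels=True)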
Example #13
 def generate_batch(self,
                    num_tweets=None,
                    batch_id=None,
                    tail=True,
                    ignore_previous=False):
     """Generates a new batch which takes as input a large sample file provided in `data/2_sampled` and generates a new batch
     not including previously annotated tweets.
     """
     if num_tweets is None:
         raise ValueError('num_tweets was not provided. Cannot create an empty batch.')
     # vars
     sample_folder = find_folder('2_sampled')
     # Ids from sample file
     df_samples = get_sampled_data()
     if len(df_samples) == 0:
         raise Exception(
             'Sample file is empty. Generate a sample file first.')
     tweet_ids_sampled = set(df_samples['tweet_id'])
     # Ids from previously labelled data
     try:
         df_labels = get_labelled_data()
     except FileNotFoundError:
         tweet_ids_labelled = set()
     else:
         tweet_ids_labelled = set(df_labels['tweet_id'])
     # Ids from previous batches
     df_batched = get_batched_sample_data()
     if len(df_batched) > 0:
         tweet_ids_batched = set(df_batched['tweet_id'])
     else:
         tweet_ids_batched = set()
     # Ids from previous batches which were not available
     df_unavailable = get_uploaded_batched_data(availability='unavailable')
     if len(df_unavailable) > 0:
         tweet_ids_unavailable = set(df_unavailable['tweet_id'])
     else:
         tweet_ids_unavailable = set()
     # remove tweets which are unavailable, have been previously labelled
     still_available = tweet_ids_sampled - tweet_ids_unavailable - tweet_ids_labelled
     if not ignore_previous:
         still_available -= tweet_ids_batched
     logger.info(
         'Unique tweets in base sample(s): {:,} (labelled: {:,}, unavailable: {:,}, in previous batches: {:,})'
         .format(len(tweet_ids_sampled), len(tweet_ids_labelled),
                 len(tweet_ids_unavailable), len(tweet_ids_batched)))
     logger.info('Tweets left to sample from: {:,}'.format(
         len(still_available)))
     logger.info('Percentage labelled: {:.2f}%'.format(
         100 * float(len(tweet_ids_labelled) / len(tweet_ids_sampled))))
     # return conditions
     if len(still_available) <= 0:
         logger.warning('All available tweets have been labelled.')
         return
     if num_tweets > len(still_available):
         logger.warning(
             'Requested to create batch of {:,}, but only {:,} are still available.'
             .format(num_tweets, len(still_available)))
         return
     if tail:
         batch = df_samples.loc[df_samples['tweet_id'].isin(
             still_available)][-num_tweets:]
     else:
         batch = df_samples.loc[df_samples['tweet_id'].isin(
             still_available)][:num_tweets]
     assert len(batch) == num_tweets
     # write new batch file
     if batch_id is None:
         try:
             batch_id = 1 + max([
                 int(s.split('_')[-1]) for s in os.listdir(sample_folder)
                 if s.startswith('batch_')
                 and os.path.isdir(os.path.join(sample_folder, s))
             ])
         except ValueError:
             batch_id = 1
     batch_name = 'batch_{}'.format(batch_id)
     logger.info('Generating batch {} of size {:,} tweets...'.format(
         batch_name, num_tweets))
     output_folder = os.path.join(sample_folder, batch_name)
     if not os.path.isdir(output_folder):
         os.mkdir(output_folder)
     else:
         raise Exception(
             'Found pre-existing folder "{}". Please remove this folder first or pick a different batch ID.'
             .format(output_folder))
     f_path = os.path.join(
         output_folder,
         '{}_{}.csv'.format(batch_name,
                            datetime.now().strftime('%Y-%m-%d')))
     batch.to_csv(f_path, header=None, index=False, encoding='utf8')
     logger.info(
         'Successfully wrote file containing new batch "{}"'.format(f_path))
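
get_sampled_data is assumed, not shown. Given that Example #10 writes sample files without a header, a minimal sketch that concatenates all sampled_*.csv files from data/2_sampled might look like this:

import glob
import os
import pandas as pd

def get_sampled_data():
    # Hypothetical loader: read every sampled_*.csv file (written header-less with id and
    # text columns) and return them as one DataFrame with a tweet_id column.
    f_names = glob.glob(os.path.join(find_folder('2_sampled'), 'sampled_*.csv'))
    frames = [pd.read_csv(f, names=['tweet_id', 'text'], dtype={'tweet_id': str})
              for f in f_names]
    if len(frames) == 0:
        return pd.DataFrame(columns=['tweet_id', 'text'])
    return pd.concat(frames, ignore_index=True)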