def write_cleaned_labels(self, labels, options):
    def _create_dir(dir_name):
        if not os.path.isdir(dir_name):
            os.makedirs(dir_name)

    if len(labels) == 0:
        self.logger.info(
            'No labels to write. Maybe the filtering parameters are too strict. Aborting.'
        )
        return
    # write full file
    folder_path = find_folder('4_labels_cleaned')
    option_flags = '_'.join(options)
    f_path = os.path.join(folder_path,
                          'cleaned_labels_{}.csv'.format(option_flags))
    labels[[
        'id', 'text', 'question_id', 'question_tag', 'label_id', 'label_tag'
    ]].to_csv(f_path, index=False)
    self.logger.info('Successfully wrote {:,} labels to file {}'.format(
        len(labels), f_path))
    # write 1 file per question
    folder_path_by_question = os.path.join(folder_path, 'by_question')
    labels.rename(columns={'label_tag': 'label'}, inplace=True)
    self.logger.info('Writing one file per question...')
    for question_tag, g in labels.groupby('question_tag'):
        f_path = os.path.join(
            folder_path_by_question, question_tag,
            'cleaned_labels_{}_{}.csv'.format(option_flags, question_tag))
        _create_dir(os.path.dirname(f_path))
        g[['id', 'text', 'label']].to_csv(f_path, index=False)
        self.logger.info(
            '... successfully wrote {:,} labels for question {} to file {}'
            .format(len(g), question_tag, f_path))
def cleaned_annotation_data(self):
    self.make_title('Cleaned annotation data')
    f_name = os.path.join(find_folder('4_labels_cleaned'),
                          'cleaned_labels.csv')
    num_annotations = 0
    if os.path.isfile(f_name):
        num_annotations = sum(1 for line in open(f_name))
    self.add_key_value('Number of cleaned annotations', num_annotations)
    self.text += '\n'
def annotation_data(self):
    self.make_title('Annotation data')
    self.text += 'Number of annotation results:\n'
    for mode in ['public', 'local', 'mturk', 'other']:
        f_names = glob.glob(
            os.path.join(find_folder('3_labelled'), mode, '*.csv'))
        num_annotations = 0
        for f_name in f_names:
            num_annotations += sum(1 for line in open(f_name))
        self.add_key_value('- {}'.format(mode), num_annotations)
    self.text += '\n'
def sampled_data(self):
    self.make_title('Sampled data')
    f_names = glob.glob(
        os.path.join(find_folder('2_sampled'),
                     'sampled_{}_{}_*.csv'.format('*', '*')))
    num_sample_files = len(f_names)
    num_tweets_sampled = 0
    for f_name in f_names:
        num_tweets_sampled += sum(1 for line in open(f_name))
    self.add_key_value(
        'Number of tweets sampled ({:,} file(s))'.format(num_sample_files),
        num_tweets_sampled)
    self.text += '\n'
def parsed_data(self):
    num_lines = 0
    for dtype in ['original', 'anonymized', 'encrypted']:
        path = os.path.join(find_folder('1_parsed'),
                            'parsed_{}.csv'.format(dtype))
        if os.path.isfile(path):
            num_lines = sum(1 for line in open(path))
            break
    self.make_title('Parsed data')
    if num_lines > 0:
        # subtract 1 to exclude the CSV header line
        self.add_key_value('Num tweets in parsed data', num_lines - 1)
    else:
        self.text += 'No parsed data present.\n'
    self.text += '\n'
def raw_data(self):
    raw_data_folder = find_folder('0_raw')
    num_historic = len(
        glob.glob(os.path.join(raw_data_folder, 'historic', '*.json*')))
    # note: without recursive=True, the '**' pattern only matches a single directory level
    num_streaming = len(
        glob.glob(
            os.path.join(raw_data_folder, 'streaming', '**', '*.json*')))
    total = num_historic + num_streaming
    self.make_title('Raw data')
    if total > 0:
        self.add_key_value('Number of files in raw data', total)
        self.add_key_value('- historic', num_historic)
        self.add_key_value('- streaming', num_streaming)
    else:
        self.text += 'No raw data present.\n'
    self.text += '\n'
def translate(self):
    parser = ArgParseDefault(description='Translate prepared text')
    logger = logging.getLogger(__name__)
    parser.add_argument('-s', '--source', dest='source',
                        choices=['EN', 'DE', 'FR', 'ES', 'PT', 'IT', 'NL', 'PL', 'RU'],
                        required=True, help='Source language')
    parser.add_argument('-t', '--target', dest='target',
                        choices=['EN', 'DE', 'FR', 'ES', 'PT', 'IT', 'NL', 'PL', 'RU'],
                        required=True, help='Target language')
    parser.add_argument('--auth-key', dest='auth_key', type=str, required=True,
                        help='DeepL auth key')
    args = parser.parse_args(sys.argv[2:])
    logger.info('Translating from source language {} to target language {}...'.format(
        args.source, args.target))
    # load data
    folder = os.path.join(find_folder('other'), 'translations')
    if not os.path.isdir(folder):
        os.makedirs(folder)
    f_paths = glob.glob(os.path.join(folder, 'prepare_*.csv'))
    if len(f_paths) == 0:
        raise FileNotFoundError('No prepare_*.csv file(s) found in folder {}'.format(folder))
    elif len(f_paths) > 1:
        raise ValueError('Found {} prepare_*.csv files in folder {}.'.format(len(f_paths), folder))
    logger.info('Loading prepared data...')
    df = pd.read_csv(f_paths[0], dtype={'id': str})
    # estimate translation cost before hitting the API
    df_len = df.text.apply(len)
    costs = df_len.sum() / 500 * 0.01
    logger.info('About to translate {:,} characters with an estimated cost of EUR {:.2f}.'.format(
        df_len.sum(), costs))
    yes_no = input('Continue to translate? (yes/no)\n')
    if yes_no not in ('y', 'yes'):
        logger.info('Aborting...')
        return
    # params
    base_url = 'https://api.deepl.com/v2/translate'
    other_params = {'auth_key': args.auth_key, 'target_lang': args.target, 'source_lang': args.source}
    chunk_size = 20

    def chunks(total_len, n):
        # yield inclusive (start, stop) index pairs for .loc slicing
        for i in range(0, total_len, n):
            yield i, i + n - 1

    df['translation'] = ''
    for start, stop in tqdm(chunks(len(df), chunk_size),
                            total=len(range(0, len(df), chunk_size))):
        texts = df.loc[start:stop, 'text'].tolist()
        res = requests.get(base_url, params={'text': texts, **other_params})
        if not res.ok:
            raise Exception('Unable to retrieve data from DeepL. Error status code {}... Aborting'.format(
                res.status_code))
        res = res.json()
        df_tr = pd.DataFrame(res['translations'])
        df.loc[start:stop, 'translation'] = df_tr['text'].values
    f_path = os.path.join(folder, 'translation_{}.csv'.format(get_df_hash(df)[:5]))
    logger.info('Writing {:,} records to file {}...'.format(len(df), f_path))
    df.to_csv(f_path, index=False)
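# Illustrative sketch (toy data, not part of the pipeline): the `chunks` helper above
# yields *inclusive* (start, stop) pairs because `DataFrame.loc` slices by label and
# includes both endpoints; with the default RangeIndex every row is translated exactly once.
import pandas as pd

df_demo = pd.DataFrame({'text': ['a', 'b', 'c', 'd', 'e']})
for start in range(0, len(df_demo), 2):
    stop = start + 2 - 1                      # same arithmetic as chunks(len(df_demo), 2)
    print(start, stop, df_demo.loc[start:stop, 'text'].tolist())
# 0 1 ['a', 'b']
# 2 3 ['c', 'd']
# 4 5 ['e']   <- the stop label may not exist; .loc simply stops at the last row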
def annotation_cleaned(self):
    self.header('Cleaned annotations')
    f_path = os.path.join(find_folder('4_labels_cleaned'), 'cleaned_labels.csv')
    try:
        self.logger.info('Reading cleaned annotation data...')
        df = pd.read_csv(f_path)
    except FileNotFoundError:
        self.text += 'No cleaned annotations present.'
        return
    self.add_key_value('- Num annotation results', len(df))
    self.add_key_value('- Num tweets annotated', len(df.id.unique()))
    self.text += '\n\n'
    for question_tag, q_group in df.groupby('question_tag'):
        self.make_title('Question {}'.format(question_tag))
        total = q_group.count()['id']
        for label_tag, q_a_group in q_group.groupby('label_tag'):
            label_tag_count = q_a_group.count()['id']
            self.add_key_value('- {}'.format(label_tag), label_tag_count,
                               with_percent=100 * label_tag_count / total)
        self.text += '\n\n'
def prepare(self):
    parser = ArgParseDefault(description='Prepare text to be translated')
    logger = logging.getLogger(__name__)
    parser.add_argument('--geo-only', dest='geo_only', action='store_true',
                        help='Only use geo-tagged data')
    parser.add_argument('-d', '--dtype', dest='dtype',
                        choices=['original', 'anonymized', 'encrypted'],
                        default='anonymized', help='Data source type')
    parser.add_argument('-l', '--limit', dest='limit', type=int, default=-1,
                        help='If set, only extract random subsample')
    args = parser.parse_args(sys.argv[2:])
    # load data
    logger.info('Loading data...')
    df = get_parsed_data(dtype=args.dtype,
                         usecols=['id', 'text', 'is_duplicate', 'has_place',
                                  'has_coordinates', 'is_retweet'])
    # filter
    if args.geo_only:
        df = df[(df.has_place | df.has_coordinates) & (~df.is_duplicate) & (~df.is_retweet)]
    else:
        df = df[(~df.is_duplicate) & (~df.is_retweet)]
    if args.limit > 0:
        df = df.sample(args.limit)
    # write data
    folder = os.path.join(find_folder('other'), 'translations')
    if not os.path.isdir(folder):
        os.makedirs(folder)
    f_path = os.path.join(folder, 'prepare_{}.csv'.format(get_df_hash(df)[:5]))
    logger.info('Writing {:,} records to file {}...'.format(len(df), f_path))
    df[['id', 'text']].to_csv(f_path, index=False)
def write_sample(self, sample, mode, columns=['id', 'text'], size='',
                 min_date=None, max_date=None, flags=''):
    if len(sample) == 0:
        logger.warning('No sample files written. Aborting.')
        return
    timestamp = time.strftime('%Y-%m-%d_%H-%M-%S')
    min_date_str = ''
    if min_date is not None:
        min_date_str = '_min_date_{}'.format(min_date)
    max_date_str = ''
    if max_date is not None:
        max_date_str = '_max_date_{}'.format(max_date)
    f_name = 'sampled_{mode}_{len_sample}_{size}_{seed}{min_date}{max_date}_created_{timestamp}{flags}.csv'.format(
        mode=mode, len_sample=len(sample), size=size, seed=self.seed,
        timestamp=timestamp, min_date=min_date_str, max_date=max_date_str,
        flags=flags)
    full_path = os.path.join(find_folder('2_sampled'), f_name)
    logger.info('Writing file {} ...'.format(full_path))
    if 'all' in columns:
        sample.to_csv(full_path, encoding='utf8')
    else:
        sample[columns].to_csv(full_path, encoding='utf8', index=False, header=False)
def main(args):
    """
    This script creates new files in preprocess/data/other/pretrain with tweets
    which should be used for pretraining language models. It excludes training
    data and duplicates.
    """
    # load data
    logger.info('Reading data...')
    usecols = ['id', 'text', 'lang', 'token_count', 'is_retweet', 'contains_keywords']
    df = get_parsed_data(usecols=usecols, num_files=args.num_files)
    logger.info(f'...loaded a total of {len(df):,} tweets')
    # filter retweets
    if 'retweets' in args.filters:
        logger.info('Filter retweets...')
        num_before = len(df)
        df = df[~df.is_retweet]
        num_after = len(df)
        logger.info(f'... {num_after:,} remaining (removed {num_before - num_after:,})')
    # filter by keyword
    if 'contains_keywords' in args.filters:
        logger.info('Filter contains_keywords...')
        num_before = len(df)
        df = df[df.contains_keywords]
        num_after = len(df)
        logger.info(f'... {num_after:,} remaining (removed {num_before - num_after:,})')
    # filter lang
    if args.lang is not None:
        logger.info(f'Filter lang {args.lang}...')
        num_before = len(df)
        df = df[df.lang == args.lang]
        num_after = len(df)
        logger.info(f'... {num_after:,} remaining (removed {num_before - num_after:,})')
    # filter min tokens
    if args.min_tokens > 0:
        logger.info(f'Filter has >={args.min_tokens} tokens...')
        num_before = len(df)
        df = df[df.token_count >= args.min_tokens]
        num_after = len(df)
        logger.info(f'... {num_after:,} remaining (removed {num_before - num_after:,})')
    # generate cleared text column to filter for duplicates
    logger.info('Remove duplicates...')
    num_before = len(df)
    df.loc[:, 'text_cleared'] = df.text.apply(generate_text_cleared)
    df = df.drop_duplicates(subset=['text_cleared'])
    num_after = len(df)
    logger.info(f'... {num_after:,} remaining (removed {num_before - num_after:,})')
    # shuffle
    logger.info('Shuffle...')
    df = df.sample(frac=1)
    # write output files
    num_lines = len(df)
    logger.info(f'Collected total of {num_lines:,} examples')
    # use 80% for training, but never leave more than 200k examples for dev
    num_train = max(int(0.8 * num_lines), num_lines - int(2e5))
    ts = datetime.datetime.now().strftime('%Y_%m_%d-%H-%M_%s')
    for (_s, _e), _type in zip([(None, num_train), (num_train, None)], ['train', 'dev']):
        _df = df[_s:_e]
        logger.info(f'Writing {len(_df):,} examples for {_type} data...')
        output_folder = os.path.join(find_folder('other'), 'pretrain', f'run_{ts}', _type)
        if not os.path.isdir(output_folder):
            os.makedirs(output_folder)
        if args.no_parallel:
            num_cpus = 1
        else:
            num_cpus = max(multiprocessing.cpu_count() - 1, 1)
        parallel = joblib.Parallel(n_jobs=num_cpus)
        write_output_file_delayed = joblib.delayed(write_output_file)
        res = parallel((write_output_file_delayed(
            _df.iloc[i:(i + args.max_examples_per_file)],
            os.path.join(output_folder, f'pretrain_{_type}_{j:03}.txt'))
            for j, i in enumerate(trange(0, len(_df), args.max_examples_per_file))))
        logger.info(f'Successfully wrote {len(res):,} file(s) to folder {output_folder}')
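# Illustrative sketch (hypothetical numbers): `num_train` above takes 80% of the examples
# for training but caps the dev split at 200,000 examples, whichever leaves more for training.
num_lines = 2_000_000
num_train = max(int(0.8 * num_lines), num_lines - int(2e5))
print(num_train, num_lines - num_train)   # -> 1800000 200000 (dev capped at 200k)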
def train_dev_test_split(question='sentiment', dev_size=0.1, test_size=0.2,
                         seed=42, name='', balanced_labels=False,
                         all_questions=False, label_tags=[], labelled_as=None,
                         has_label=''):
    """Splits cleaned labelled data into training, dev and test set"""
    def _filter_for_label_balance(df):
        """Performs undersampling for overrepresented label classes"""
        counts = Counter(df['label'])
        min_count = min(counts.values())
        _df = pd.DataFrame()
        for l in counts.keys():
            _df = pd.concat([_df, df[df['label'] == l].sample(min_count)])
        return _df

    questions = [question]
    np.random.seed(seed)
    if name == '':
        f_path = os.path.join(find_folder('4_labels_cleaned'), 'cleaned_labels*.csv')
        annotation_files = glob.glob(f_path)
        if len(annotation_files) == 0:
            raise FileNotFoundError(
                f'No cleaned label files could be found with the pattern {f_path}')
        elif len(annotation_files) > 1:
            raise ValueError(
                f'Found {len(annotation_files)} different files for cleaned labels. Provide "name" argument to specify which.')
        name = os.path.basename(annotation_files[0]).split('.csv')[0]
    if all_questions:
        df = get_cleaned_labelled_data(name=name)
        questions = df['question_tag'].unique()
    for question in questions:
        df = get_cleaned_labelled_data(question=question, name=name, has_label=has_label)
        if len(df) == 0:
            logger.warning(
                'No labelled data could be found for question {} under these parameters.'.format(question))
            continue
        if balanced_labels:
            df = _filter_for_label_balance(df)
        flags = '{}{}'.format('_' + name if name != '' else '',
                              '_balanced' if balanced_labels else '')
        if len(label_tags) > 0:
            df = df[df['label'].isin(label_tags)]
            flags += '_labels_{}'.format('_'.join(label_tags))
        if len(has_label) > 0:
            has_label_flag = 'has_label_{}'.format(
                has_label.replace('|', '_or_').replace(',', '_and_'))
            flags += '_' + has_label_flag
            folder_path = os.path.join(find_folder('4_labels_cleaned'), 'other',
                                       has_label_flag, question)
        else:
            folder_path = os.path.join(find_folder('4_labels_cleaned'), 'splits', question)
        # shuffle, then cut at the (1 - dev_size - test_size) and (1 - test_size) marks
        train, dev, test = np.split(df.sample(frac=1, random_state=seed), [
            int((1 - dev_size - test_size) * len(df)),
            int((1 - test_size) * len(df))
        ])
        if not os.path.isdir(folder_path):
            os.makedirs(folder_path)
        for dtype, data in [['train', train], ['dev', dev], ['test', test]]:
            f_name = f'{dtype}_{question}_split_{len(train)}_{len(dev)}_{len(test)}_seed_{seed}{flags}.csv'
            f_path = os.path.join(folder_path, f_name)
            data.to_csv(f_path, index=None, encoding='utf8')
            logger.info(f'Successfully wrote data of {len(data):,} examples to file {f_path}.')
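# Illustrative sketch: with the default dev_size=0.1 and test_size=0.2, np.split cuts the
# shuffled frame at int(0.7 * n) and int(0.8 * n). For a hypothetical 1,000 labelled
# examples this gives 700 train, 100 dev and 200 test rows.
import numpy as np
import pandas as pd

df_demo = pd.DataFrame({'label': range(1000)})
train, dev, test = np.split(df_demo.sample(frac=1, random_state=42),
                            [int(0.7 * len(df_demo)), int(0.8 * len(df_demo))])
print(len(train), len(dev), len(test))   # -> 700 100 200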
def generate_batch(self, num_tweets=None, batch_id=None, tail=True, ignore_previous=False):
    """Generates a new batch which takes as input a large sample file provided in
    `data/2_sampled` and generates a new batch not including previously annotated tweets.
    """
    if num_tweets is None:
        raise ValueError('No number of tweets given. Cannot create an empty batch.')
    # vars
    sample_folder = find_folder('2_sampled')
    # Ids from sample file
    df_samples = get_sampled_data()
    if len(df_samples) == 0:
        raise Exception('Sample file is empty. Generate a sample file first.')
    tweet_ids_sampled = set(df_samples['tweet_id'])
    # Ids from previously labelled data
    try:
        df_labels = get_labelled_data()
    except FileNotFoundError:
        tweet_ids_labelled = set()
    else:
        tweet_ids_labelled = set(df_labels['tweet_id'])
    # Ids from previous batches
    df_batched = get_batched_sample_data()
    if len(df_batched) > 0:
        tweet_ids_batched = set(df_batched['tweet_id'])
    else:
        tweet_ids_batched = set()
    # Ids from previous batches which were not available
    df_unavailable = get_uploaded_batched_data(availability='unavailable')
    if len(df_unavailable) > 0:
        tweet_ids_unavailable = set(df_unavailable['tweet_id'])
    else:
        tweet_ids_unavailable = set()
    # remove tweets which are unavailable or have been previously labelled
    still_available = tweet_ids_sampled - tweet_ids_unavailable - tweet_ids_labelled
    if not ignore_previous:
        still_available -= tweet_ids_batched
    logger.info(
        'Unique tweets in base sample(s): {:,} (labelled: {:,}, unavailable: {:,}, in previous batches: {:,})'
        .format(len(tweet_ids_sampled), len(tweet_ids_labelled),
                len(tweet_ids_unavailable), len(tweet_ids_batched)))
    logger.info('Tweets left to sample from: {:,}'.format(len(still_available)))
    logger.info('Percentage labelled: {:.2f}%'.format(
        100 * len(tweet_ids_labelled) / len(tweet_ids_sampled)))
    # return conditions
    if len(still_available) <= 0:
        logger.warning('All available tweets have been labelled.')
        return
    if num_tweets > len(still_available):
        logger.warning(
            'Requested to create batch of {:,}, but only {:,} are still available.'
            .format(num_tweets, len(still_available)))
        return
    if tail:
        batch = df_samples.loc[df_samples['tweet_id'].isin(still_available)][-num_tweets:]
    else:
        batch = df_samples.loc[df_samples['tweet_id'].isin(still_available)][:num_tweets]
    assert len(batch) == num_tweets
    # write new batch file
    if batch_id is None:
        try:
            batch_id = 1 + max([
                int(s.split('_')[-1]) for s in os.listdir(sample_folder)
                if s.startswith('batch_')
                and os.path.isdir(os.path.join(sample_folder, s))
            ])
        except ValueError:
            # no previous batch folders exist
            batch_id = 1
    batch_name = 'batch_{}'.format(batch_id)
    logger.info('Generating batch {} of size {:,} tweets...'.format(batch_name, num_tweets))
    output_folder = os.path.join(sample_folder, batch_name)
    if not os.path.isdir(output_folder):
        os.mkdir(output_folder)
    else:
        raise Exception(
            'Found pre-existing folder "{}". Please remove this folder first or pick a different batch ID.'
            .format(output_folder))
    f_path = os.path.join(
        output_folder,
        '{}_{}.csv'.format(batch_name, datetime.now().strftime('%Y-%m-%d')))
    batch.to_csv(f_path, header=None, index=False, encoding='utf8')
    logger.info('Successfully wrote file containing new batch "{}"'.format(f_path))
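# Illustrative sketch (hypothetical folder names): the next batch id is derived from
# existing `batch_<n>` folders; max() over an empty sequence raises ValueError, which is
# what resets the id to 1 when no previous batches exist.
existing = ['batch_1', 'batch_2', 'batch_7']
try:
    next_batch_id = 1 + max(int(s.split('_')[-1]) for s in existing if s.startswith('batch_'))
except ValueError:
    next_batch_id = 1
print(next_batch_id)   # -> 8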