def test_should_pass_refresh_false_when_updating_description_and_not_terminal(
        self, tqdm_mock: MagicMock, isatty_mock: MagicMock):
    isatty_mock.return_value = False
    set_description_mock = tqdm_mock.return_value.set_description
    tqdm(*ARGS, **KW_ARGS).set_description(DESCRIPTION)
    set_description_mock.assert_called_with(DESCRIPTION, refresh=False)

def test_should_not_set_min_interval_if_terminal(
        self, tqdm_mock: MagicMock, isatty_mock: MagicMock):
    isatty_mock.return_value = True
    tqdm(*ARGS, **KW_ARGS)
    tqdm_mock.assert_called_with(*ARGS, **KW_ARGS)

def test_should_use_default_non_atty_min_interval_if_not_terminal(
        self, tqdm_mock: MagicMock, isatty_mock: MagicMock):
    isatty_mock.return_value = False
    tqdm(*ARGS, **KW_ARGS)
    tqdm_mock.assert_called_with(
        *ARGS, mininterval=DEFAULT_NON_ATTY_MIN_INTERVAL, **KW_ARGS)

def test_should_use_min_interval_from_env(
        self, tqdm_mock: MagicMock, env_get_mock: MagicMock):
    env_get_mock.return_value = OTHER_MIN_INTERVAL
    tqdm(*ARGS, **KW_ARGS)
    tqdm_mock.assert_called_with(
        *ARGS, mininterval=OTHER_MIN_INTERVAL, **KW_ARGS)
    env_get_mock.assert_called_with(MIN_INTERVAL_KEY)

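# A minimal sketch (for illustration only, not the module under test) of the
# wrapper behaviour the tests above describe: use a larger mininterval when
# stdout is not a terminal, and let the MIN_INTERVAL_KEY environment variable
# override it. The real implementation would additionally need to wrap the
# returned tqdm instance so that set_description passes refresh=False when
# not a terminal; that part is omitted here.
def _example_tqdm_wrapper(*args, **kwargs):
    import os
    import sys
    import tqdm as tqdm_module
    min_interval = os.environ.get(MIN_INTERVAL_KEY)
    if min_interval is not None:
        # explicit override from the environment
        kwargs['mininterval'] = min_interval
    elif not sys.stdout.isatty():
        # e.g. output piped to a log file: refresh less often
        kwargs['mininterval'] = DEFAULT_NON_ATTY_MIN_INTERVAL
    return tqdm_module.tqdm(*args, **kwargs)
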
def transform(self, X):
    nlp = get_nlp()
    valid_strings = [
        (i, text) for i, text in enumerate(X)
        if isinstance(text, str)
    ]
    valid_text_list = [text for _, text in valid_strings]
    if self.use_pipe:
        source_list = nlp.pipe(
            valid_text_list, n_threads=2, batch_size=10
        )
    else:
        source_list = [nlp(text) for text in valid_text_list]
    if self.use_progress:
        source_list = tqdm(source_list, total=len(valid_strings))
    # non-string entries keep their position and remain None in the result
    result = [None] * len(X)
    for (i, _), doc in zip(valid_strings, source_list):
        result[i] = transform_doc(doc)
    return result

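# Illustration of transform()'s None handling (the class name
# SpacyTransformer is hypothetical; only the transform() method above is
# from this module): the output aligns index-for-index with the input.
#
#     transformer = SpacyTransformer(use_pipe=False, use_progress=False)
#     transformer.transform(['a sentence', None, 'another sentence'])
#     # -> [<transformed doc>, None, <transformed doc>]
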
def write_tables_to_csv(csv_path, tables, pickle=False):
    makedirs(csv_path, exist_ok=True)
    pbar = tqdm(tables.keys(), leave=False)
    for name in pbar:
        pbar.set_description(rjust_and_shorten_text(name, width=40))
        write_csv(csv_path + "/" + name + ".csv", tables[name].matrix())
        if pickle:
            write_pickle(
                csv_path + "/" + name + ".pickle", tables[name].matrix())
    pbar.set_description("Done")

def process_files_in_zip(zip_filename, process_file, ext=None):
    with ZipFile(zip_filename) as zip_archive:
        pbar = tqdm(
            filter_filenames_by_ext(zip_archive.namelist(), ext),
            leave=False)
        for filename in pbar:
            pbar.set_description(rjust_and_shorten_text(filename, width=40))
            with zip_archive.open(filename, 'r') as zip_file:
                try:
                    # process_file(filename, zip_file.read())
                    process_file(filename, zip_file)
                except xml.etree.ElementTree.ParseError as err:
                    pbar.write("Parse error in file {}/{}: {}".format(
                        zip_filename, filename, err))
        pbar.set_description("Done")

def download_objects(client, obj_list, download_path, downloaded_files=None):
    makedirs(download_path, exist_ok=True)
    pbar = tqdm(list(obj_list), leave=False)
    for obj in pbar:
        remote_file = obj.key
        pbar.set_description("%40s" % shorten(remote_file, width=40))
        local_file = download_path + '/' + remote_file
        if not isfile(local_file):
            remote_file_timestamp = obj.last_modified
            LOGGER.debug(
                'downloading file %s (timestamp: %s)',
                remote_file, remote_file_timestamp)
            local_access_time = datetime.now().timestamp()
            local_modified_time = remote_file_timestamp.timestamp()
            client.download_file(obj.bucket_name, remote_file, local_file)
            os.utime(local_file, (local_access_time, local_modified_time))
            if downloaded_files is not None:
                downloaded_files.append(remote_file)

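# Hedged usage sketch: 'my-bucket' and the prefix are placeholders. With
# boto3, bucket.objects.filter(...) yields ObjectSummary items exposing the
# .key, .last_modified and .bucket_name attributes used above, and
# client.download_file(bucket, key, filename) matches the call made here.
def _example_download_objects_usage():
    import boto3
    s3 = boto3.resource('s3')
    client = boto3.client('s3')
    obj_list = s3.Bucket('my-bucket').objects.filter(Prefix='raw-data/')
    download_objects(client, obj_list, '/tmp/raw-data')
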
def convert_mbox_file(filename, stream, writer, fieldnames):
    file_size = stream_size(stream)
    prev_pos = 0
    pos_unit = 1024 * 1024
    with tqdm(total=file_size // pos_unit
              if file_size is not None else None) as pbar:
        pbar.set_description(filename)
        for lines in split_messages_skip_content(stream):
            cur_pos = stream_position(stream)
            if cur_pos is not None:
                pos_change = (cur_pos - prev_pos) // pos_unit
                if pos_change > 0:
                    pbar.update(pos_change)
                    prev_pos = cur_pos
            else:
                pbar.update()
            header_dict = dict(parse_header_properties(
                lines, required_keys=fieldnames))
            writer.writerow(header_dict)

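# stream_size() and stream_position() are assumed helpers not shown in this
# section; a minimal sketch for seekable file objects, returning None where
# the stream does not support the operation (matching the None checks above):
def _example_stream_size(stream):
    import os
    try:
        return os.fstat(stream.fileno()).st_size
    except (AttributeError, OSError):
        return None

def _example_stream_position(stream):
    try:
        return stream.tell()
    except (AttributeError, OSError):
        return None
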
def process_files_in_directory(root_dir, process_file, ext=None):
    filenames = listfiles(root_dir)
    if ext is not None:
        filenames = (
            filter_filenames_by_ext(filenames, ext) +
            filter_filenames_by_ext(filenames, '.zip')
        )
    sorted_filenames = sort_relative_filenames_by_file_modified_time(
        root_dir, set(filenames))
    pbar = tqdm(sorted_filenames, leave=False)
    for filename in pbar:
        pbar.set_description(rjust_and_shorten_text(filename, width=40))
        full_filename = os.path.join(root_dir, filename)
        if get_filename_ext(filename) == '.zip' and ext != '.zip':
            process_files_in_zip(full_filename, process_file, ext)
        else:
            with open(full_filename, 'rb') as f:
                try:
                    process_file(filename, f)
                except Exception as e:
                    raise Exception(
                        'failed to process ' + full_filename) from e

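# Usage sketch ('data/xml' and count_bytes are placeholders): process_file
# receives the relative filename plus a binary file-like object, either a
# regular file opened here or a zip entry via process_files_in_zip() above.
def _example_process_files_usage():
    def count_bytes(filename, stream):
        print(filename, len(stream.read()))
    process_files_in_directory('data/xml', count_bytes, ext='.xml')
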
def _convert_data(
        process_fn: callable, db, field_mapping_by_table_name,
        early_career_researcher_person_ids, export_emails=False):
    table_names = {
        'person', 'manuscript_email_meta', 'manuscript', 'manuscript_version'
    } | person_custom_extractors_by_table_name.keys()
    if export_emails:
        table_names.add('emails')
    for copy_paths in xml_copy_paths.values():
        for table_name in copy_paths.keys():
            table_names.add(table_name)
    tables = {
        table_name: TableOutput(name=table_name)
        for table_name in table_names
    }
    process_fn(tables=tables)
    db.session.close_all()

    table_names = ['person', 'manuscript', 'manuscript_version']
    table_names = (
        table_names +
        [t for t in sorted(tables.keys()) if t not in table_names])
    table_names = [t for t in table_names if t in field_mapping_by_table_name]
    # print("table_names:", table_names)

    frame_by_table_name = {
        table_name: tables[table_name].to_frame()
        for table_name in table_names
    }
    frame_by_table_name['person'] = apply_early_career_researcher_flag(
        frame_by_table_name['person'],
        early_career_researcher_person_ids)
    frame_by_table_name['manuscript_stage'] = (
        filter_duplicate_stage_use_highest_trigger_by_df(
            frame_by_table_name['manuscript_stage']))

    # ignore entries with an invalid person id
    # (perhaps address that differently in the future)
    filter_invalid_person_ids(frame_by_table_name)

    # We currently only support update-or-insert for tables with a single
    # primary key. Updating relationships is more complicated: we would, for
    # example, need to remove relationships that no longer exist. Removing
    # and re-inserting the parents of relationships (which tend to have a
    # single primary key) would be costly, so the speed gain of supporting
    # just those for now is still worth it.
    table_names_supporting_update_or_insert_set = {
        t for t in table_names
        if len(db[t].primary_key) == 1
    }
    table_names_supporting_update_or_insert = [
        t for t in table_names
        if t in table_names_supporting_update_or_insert_set
    ]
    table_names_not_supporting_update_or_insert = [
        t for t in table_names
        if t not in table_names_supporting_update_or_insert_set
    ]

    LOGGER.debug(
        'removing records: %s', table_names_not_supporting_update_or_insert)
    pbar = tqdm(
        list(reversed(table_names_not_supporting_update_or_insert)),
        leave=False)
    for table_name in pbar:
        pbar.set_description(
            rjust_and_shorten_text('remove {}'.format(table_name), width=40))
        remove_records(
            db, table_name, frame_by_table_name[table_name],
            tables[table_name].key)

    LOGGER.debug(
        'updating/creating records: %s',
        table_names_supporting_update_or_insert)
    pbar = tqdm(table_names_supporting_update_or_insert, leave=False)
    for table_name in pbar:
        df = frame_by_table_name[table_name]
        pbar.set_description(
            rjust_and_shorten_text(
                'update/insert {}({})'.format(table_name, len(df)), width=40))
        if len(df) > 0:
            db[table_name].update_or_create_list(
                replace_null_with_none(df).to_dict(orient='records'))

    LOGGER.debug(
        'inserting records: %s', table_names_not_supporting_update_or_insert)
    pbar = tqdm(table_names_not_supporting_update_or_insert, leave=False)
    for table_name in pbar:
        df = frame_by_table_name[table_name]
        pbar.set_description(
            rjust_and_shorten_text(
                'insert {}({})'.format(table_name, len(df)), width=40))
        if len(df) > 0:
            insert_records(db, table_name, df)

def tqdm_parallel_map_unordered(executor, fn, iterable, **kwargs):
    futures_list = [executor.submit(fn, item) for item in iterable]
    yield from (
        f.result()
        for f in tqdm(
            concurrent.futures.as_completed(futures_list),
            total=len(futures_list),
            **kwargs))
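
# Usage sketch: results are yielded in completion order, not input order,
# because concurrent.futures.as_completed() yields futures as they finish;
# extra keyword arguments (such as desc) are passed straight to tqdm.
def _example_parallel_map_usage():
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=4) as executor:
        for result in tqdm_parallel_map_unordered(
                executor, lambda x: x * x, range(100), desc='squares'):
            print(result)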