Example #1
    def test_should_pass_refresh_false_when_updating_description_and_not_terminal(
            self, tqdm_mock: MagicMock, isatty_mock: MagicMock):

        isatty_mock.return_value = False
        set_description_mock = tqdm_mock.return_value.set_description
        tqdm(*ARGS, **KW_ARGS).set_description(DESCRIPTION)
        set_description_mock.assert_called_with(DESCRIPTION, refresh=False)
Example #2
    def test_should_not_set_min_interval_if_terminal(self,
                                                     tqdm_mock: MagicMock,
                                                     isatty_mock: MagicMock):

        isatty_mock.return_value = True
        tqdm(*ARGS, **KW_ARGS)
        tqdm_mock.assert_called_with(*ARGS, **KW_ARGS)
Example #3
    def test_should_use_default_non_atty_min_interval_if_not_terminal(
            self, tqdm_mock: MagicMock, isatty_mock: MagicMock):

        isatty_mock.return_value = False
        tqdm(*ARGS, **KW_ARGS)
        tqdm_mock.assert_called_with(*ARGS,
                                     mininterval=DEFAULT_NON_ATTY_MIN_INTERVAL,
                                     **KW_ARGS)
Example #4
    def test_should_use_min_interval_from_env(self, tqdm_mock: MagicMock,
                                              env_get_mock: MagicMock):

        env_get_mock.return_value = OTHER_MIN_INTERVAL
        tqdm(*ARGS, **KW_ARGS)
        tqdm_mock.assert_called_with(*ARGS,
                                     mininterval=OTHER_MIN_INTERVAL,
                                     **KW_ARGS)
        env_get_mock.assert_called_with(MIN_INTERVAL_KEY)
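Examples #1 to #4 above exercise a project-specific tqdm wrapper that is not shown on this page. The following is a minimal sketch of a wrapper satisfying the asserted behaviour; the constant values, the environment key and the use of sys.stderr for the tty check are all assumptions:

import functools
import os
import sys

from tqdm import tqdm as _tqdm

# assumed values; the real constants are defined elsewhere in the project
DEFAULT_NON_ATTY_MIN_INTERVAL = 10.0
MIN_INTERVAL_KEY = 'TQDM_NON_ATTY_MIN_INTERVAL'


def tqdm(*args, **kwargs):
    is_atty = sys.stderr.isatty()  # assumption: tty check on stderr
    if not is_atty and 'mininterval' not in kwargs:
        # throttle redraws when not writing to a terminal;
        # a real implementation might convert the value to float
        min_interval = os.environ.get(MIN_INTERVAL_KEY)
        kwargs['mininterval'] = (
            min_interval if min_interval is not None
            else DEFAULT_NON_ATTY_MIN_INTERVAL)
    pbar = _tqdm(*args, **kwargs)
    if not is_atty:
        # avoid a redraw on every description change
        pbar.set_description = functools.partial(
            pbar.set_description, refresh=False)
    return pbar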
Example #5
    def transform(self, X):
        nlp = get_nlp()
        valid_strings = [
            (i, text)
            for i, text in enumerate(X)
            if isinstance(text, str)
        ]
        valid_text_list = [text for _, text in valid_strings]

        if self.use_pipe:
            source_list = nlp.pipe(
                valid_text_list,
                n_threads=2,
                batch_size=10
            )
        else:
            source_list = [nlp(text) for text in valid_text_list]

        if self.use_progress:
            source_list = tqdm(source_list, total=len(valid_strings))

        # map results back to their original positions;
        # entries that were not strings remain None
        result = [None] * len(X)
        for (index, _), doc in zip(valid_strings, source_list):
            result[index] = transform_doc(doc)

        return result
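A hypothetical use of the transformer above (the class name SpacyTransformer and the input texts are made up; use_pipe and use_progress are the flags referenced in transform):

transformer = SpacyTransformer(use_pipe=False, use_progress=True)
docs = transformer.transform(['First sentence.', None, 'Second sentence.'])
# docs[1] is None: non-string entries keep their position but are skipped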
Example #6
def write_tables_to_csv(csv_path, tables, pickle=False):
    makedirs(csv_path, exist_ok=True)
    pbar = tqdm(tables.keys(), leave=False)
    for name in pbar:
        pbar.set_description(rjust_and_shorten_text(name, width=40))
        matrix = tables[name].matrix()
        write_csv(csv_path + "/" + name + ".csv", matrix)
        if pickle:
            write_pickle(csv_path + "/" + name + ".pickle", matrix)
    pbar.set_description("Done")
Example #7
def process_files_in_zip(zip_filename, process_file, ext=None):
    with ZipFile(zip_filename) as zip_archive:
        pbar = tqdm(filter_filenames_by_ext(zip_archive.namelist(), ext),
                    leave=False)
        for filename in pbar:
            pbar.set_description(rjust_and_shorten_text(filename, width=40))
            with zip_archive.open(filename, 'r') as zip_file:
                try:
                    process_file(filename, zip_file)
                except xml.etree.ElementTree.ParseError as err:
                    pbar.write("Parse error in file {}/{}: {}".format(
                        zip_filename, filename, err))
        pbar.set_description("Done")
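process_file receives the entry name and an open file-like object, not its bytes. A hypothetical handler and call, assuming an archive.zip containing XML files:

import xml.etree.ElementTree


def print_root_tag(filename, zip_file):
    # hypothetical handler: parse each entry and report its root element
    root = xml.etree.ElementTree.parse(zip_file).getroot()
    print(filename, root.tag)


process_files_in_zip('archive.zip', print_root_tag, ext='.xml')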
Example #8
def download_objects(client, obj_list, download_path, downloaded_files=None):
    makedirs(download_path, exist_ok=True)

    pbar = tqdm(list(obj_list), leave=False)
    for obj in pbar:
        remote_file = obj.key
        pbar.set_description("%40s" % shorten(remote_file, width=40))
        local_file = download_path + '/' + remote_file
        if not isfile(local_file):
            remote_file_timestamp = obj.last_modified
            LOGGER.debug('downloading file %s (timestamp: %s)', remote_file,
                         remote_file_timestamp)
            local_access_time = datetime.now().timestamp()
            local_modified_time = remote_file_timestamp.timestamp()

            client.download_file(obj.bucket_name, remote_file, local_file)
            # set the local file's modification time to the remote timestamp
            os.utime(local_file, (local_access_time, local_modified_time))

            if downloaded_files is not None:
                downloaded_files.append(remote_file)
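obj_list is expected to yield objects exposing key, last_modified and bucket_name, which matches boto3 S3 ObjectSummary objects. A hypothetical invocation (bucket name and prefix are made up):

import boto3

s3 = boto3.resource('s3')
client = boto3.client('s3')
obj_list = s3.Bucket('my-bucket').objects.filter(Prefix='data/')

downloaded = []
download_objects(client, obj_list, 'downloads', downloaded_files=downloaded)
print('downloaded {} new file(s)'.format(len(downloaded)))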
Example #9
def convert_mbox_file(filename, stream, writer, fieldnames):
    file_size = stream_size(stream)
    prev_pos = 0
    pos_unit = 1024 * 1024  # report progress in MiB steps
    total = file_size // pos_unit if file_size is not None else None
    with tqdm(total=total) as pbar:
        pbar.set_description(filename)
        for lines in split_messages_skip_content(stream):
            cur_pos = stream_position(stream)
            if cur_pos is not None:
                pos_change = (cur_pos - prev_pos) // pos_unit
                if pos_change > 0:
                    pbar.update(pos_change)
                    # advance by whole units only, so that sub-unit
                    # progress accumulates instead of being dropped
                    prev_pos += pos_change * pos_unit
            else:
                # no position information available: count messages instead
                pbar.update()

            header_dict = dict(parse_header_properties(
                lines, required_keys=fieldnames))
            writer.writerow(header_dict)
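stream_size and stream_position are project helpers not shown here. Plausible implementations for seekable streams (a stream without tell support returns None, which triggers the per-message fallback above):

import os


def stream_size(stream):
    # determine the total size without consuming the stream,
    # or None if the stream is not seekable
    try:
        current = stream.tell()
        stream.seek(0, os.SEEK_END)
        size = stream.tell()
        stream.seek(current)
        return size
    except (AttributeError, OSError):
        return None


def stream_position(stream):
    # current read position, or None if unsupported
    try:
        return stream.tell()
    except (AttributeError, OSError):
        return None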
Example #10
def process_files_in_directory(root_dir, process_file, ext=None):
    filenames = listfiles(root_dir)
    if ext is not None:
        filenames = (
            filter_filenames_by_ext(filenames, ext) +
            filter_filenames_by_ext(filenames, '.zip')
        )
    sorted_filenames = sort_relative_filenames_by_file_modified_time(
        root_dir, set(filenames))
    pbar = tqdm(sorted_filenames, leave=False)
    for filename in pbar:
        pbar.set_description(rjust_and_shorten_text(filename, width=40))
        full_filename = os.path.join(root_dir, filename)
        if get_filename_ext(filename) == '.zip' and ext != '.zip':
            process_files_in_zip(full_filename, process_file, ext)
        else:
            with open(full_filename, 'rb') as f:
                try:
                    process_file(filename, f)
                except Exception as e:
                    raise Exception('failed to process ' +
                                    full_filename) from e
Example #11
def _convert_data(process_fn: callable,
                  db,
                  field_mapping_by_table_name,
                  early_career_researcher_person_ids,
                  export_emails=False):

    table_names = {
        'person', 'manuscript_email_meta', 'manuscript', 'manuscript_version'
    } | person_custom_extractors_by_table_name.keys()
    if export_emails:
        table_names.add('emails')
    for copy_paths in xml_copy_paths.values():
        for table_name in copy_paths.keys():
            table_names.add(table_name)
    tables = {
        table_name: TableOutput(name=table_name)
        for table_name in table_names
    }

    process_fn(tables=tables)

    db.session.close_all()

    table_names = ['person', 'manuscript', 'manuscript_version']
    table_names = (table_names +
                   [t for t in sorted(tables.keys()) if t not in table_names])
    table_names = [t for t in table_names if t in field_mapping_by_table_name]

    frame_by_table_name = {
        table_name: tables[table_name].to_frame()
        for table_name in table_names
    }

    frame_by_table_name['person'] = apply_early_career_researcher_flag(
        frame_by_table_name['person'], early_career_researcher_person_ids)

    frame_by_table_name['manuscript_stage'] = (
        filter_duplicate_stage_use_highest_trigger_by_df(
            frame_by_table_name['manuscript_stage']))

    # ignore entries with invalid person id (perhaps address that differently in the future)
    filter_invalid_person_ids(frame_by_table_name)

    # we currently only support update-or-insert for tables with a single primary key.
    # updating relationships is more complicated:
    # we would, for example, need to remove relationships that no longer exist.
    # removing and re-inserting parents of relationships would be costly,
    # and those parents tend to have a single primary key,
    # so the speed gain of supporting just those for now is still worth it.
    table_names_supporting_update_or_insert_set = {
        t for t in table_names if len(db[t].primary_key) == 1
    }

    table_names_supporting_update_or_insert = [
        t for t in table_names
        if t in table_names_supporting_update_or_insert_set
    ]

    table_names_not_supporting_update_or_insert = [
        t for t in table_names
        if t not in table_names_supporting_update_or_insert_set
    ]

    LOGGER.debug('removing records: %s',
                 table_names_not_supporting_update_or_insert)
    pbar = tqdm(list(reversed(table_names_not_supporting_update_or_insert)),
                leave=False)
    for table_name in pbar:
        pbar.set_description(
            rjust_and_shorten_text('remove {}'.format(table_name), width=40))
        remove_records(db, table_name, frame_by_table_name[table_name],
                       tables[table_name].key)

    LOGGER.debug('updating/creating records: %s',
                 table_names_supporting_update_or_insert)
    pbar = tqdm(table_names_supporting_update_or_insert, leave=False)
    for table_name in pbar:
        df = frame_by_table_name[table_name]
        pbar.set_description(rjust_and_shorten_text(
            'update/insert {}({})'.format(table_name, len(df)), width=40))
        if len(df) > 0:
            db[table_name].update_or_create_list(
                replace_null_with_none(df).to_dict(orient='records'))

    LOGGER.debug('inserting records: %s',
                 table_names_not_supporting_update_or_insert)
    pbar = tqdm(table_names_not_supporting_update_or_insert, leave=False)
    for table_name in pbar:
        df = frame_by_table_name[table_name]
        pbar.set_description(rjust_and_shorten_text(
            'insert {}({})'.format(table_name, len(df)), width=40))
        if len(df) > 0:
            insert_records(db, table_name, df)
Example #12
def tqdm_parallel_map_unordered(executor, fn, iterable, **kwargs):
    futures_list = [executor.submit(fn, item) for item in iterable]
    yield from (f.result()
                for f in tqdm(concurrent.futures.as_completed(futures_list),
                              total=len(futures_list),
                              **kwargs))
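A hypothetical usage; process_item and the input range stand in for real work, and extra keyword arguments such as desc are passed through to tqdm:

import concurrent.futures


def process_item(item):
    # hypothetical work function
    return item * item


with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    results = list(tqdm_parallel_map_unordered(
        executor, process_item, range(100), desc='squaring', leave=False))
# results arrive in completion order, not input order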