コード例 #1
0
 def read_frame(self):
     """Read the whole table into a DataFrame.

     The table is looked up by ``self.table.__tablename__`` on the bind of
     ``self.session``. When the table has exactly one primary key column,
     that column becomes the frame index; otherwise no index column is
     requested. The frame is passed through ``replace_null_with_none``
     before being returned.
     """
     key_columns = self.primary_key
     if len(key_columns) == 1:
         index_column = key_columns[0].name
     else:
         # composite (or missing) keys cannot be expressed as a single index
         index_column = None
     frame = pd.read_sql_table(
         self.table.__tablename__,
         self.session.get_bind(),
         index_col=index_column)
     return replace_null_with_none(frame)
コード例 #2
0
 def test_should_return_none_for_nat(self):
     """A NaT cell must come back as None."""
     frame = pd.DataFrame([[pd.NaT]])
     converted = replace_null_with_none(frame)
     assert converted[0][0] is None
コード例 #3
0
 def test_should_keep_valid_value(self):
     """A non-null cell must pass through unchanged."""
     frame = pd.DataFrame([['valid']])
     converted = replace_null_with_none(frame)
     assert converted[0][0] == 'valid'
コード例 #4
0
def _convert_data(process_fn: callable,
                  db,
                  field_mapping_by_table_name,
                  early_career_researcher_person_ids,
                  export_emails=False):
    """Collect table output via ``process_fn`` and write it to ``db``.

    One ``TableOutput`` is created per known table name, ``process_fn`` is
    called with them, and the resulting frames are written to the database.
    Tables with a single primary key column are written with
    ``update_or_create_list``; all other tables have their records removed
    first and are then re-inserted.

    Args:
        process_fn: called once as ``process_fn(tables=tables)`` where
            ``tables`` maps table name to ``TableOutput``.
        db: database wrapper; indexed by table name and also exposes
            ``db.session``.
        field_mapping_by_table_name: container of table names; only tables
            contained in it are written.
        early_career_researcher_person_ids: forwarded to
            ``apply_early_career_researcher_flag`` for the ``person`` frame.
        export_emails: when true, an ``emails`` output table is created
            as well.

    Raises:
        KeyError: if ``'person'`` or ``'manuscript_stage'`` is not among the
            tables that remain after filtering by
            ``field_mapping_by_table_name``.
    """
    output_table_names = {
        'person', 'manuscript_email_meta', 'manuscript', 'manuscript_version'
    } | person_custom_extractors_by_table_name.keys()
    if export_emails:
        output_table_names.add('emails')
    for copy_paths in xml_copy_paths.values():
        output_table_names.update(copy_paths.keys())
    tables = {
        table_name: TableOutput(name=table_name)
        for table_name in output_table_names
    }

    process_fn(tables=tables)

    # NOTE(review): if db.session is a SQLAlchemy session, close_all() is
    # deprecated in newer releases in favour of close_all_sessions() -
    # confirm against the pinned version before changing.
    db.session.close_all()

    # Fixed leading order for the core tables, then the remaining tables
    # sorted by name (presumably so parents are written before dependants -
    # TODO confirm).
    leading_table_names = ['person', 'manuscript', 'manuscript_version']
    ordered_table_names = leading_table_names + [
        t for t in sorted(tables) if t not in leading_table_names
    ]
    table_names = [
        t for t in ordered_table_names if t in field_mapping_by_table_name
    ]

    frame_by_table_name = {
        table_name: tables[table_name].to_frame()
        for table_name in table_names
    }

    frame_by_table_name['person'] = apply_early_career_researcher_flag(
        frame_by_table_name['person'], early_career_researcher_person_ids)

    frame_by_table_name['manuscript_stage'] = (
        filter_duplicate_stage_use_highest_trigger_by_df(
            frame_by_table_name['manuscript_stage']))

    # Ignore entries with an invalid person id
    # (perhaps address that differently in the future).
    filter_invalid_person_ids(frame_by_table_name)

    # Update-or-insert is currently only supported for tables with a single
    # primary key. Updating relationship tables is more complicated: we would
    # need to remove relationships that no longer exist, for example.
    # Removing the parents of relationships (which tend to have a single
    # primary key) seems to be costly, so the speed gain from supporting
    # update-or-insert for just those tables is still worth it.
    single_key_table_names = {
        t for t in table_names if len(db[t].primary_key) == 1
    }

    table_names_supporting_update_or_insert = [
        t for t in table_names if t in single_key_table_names
    ]

    table_names_not_supporting_update_or_insert = [
        t for t in table_names if t not in single_key_table_names
    ]

    LOGGER.debug('removing records: %s',
                 table_names_not_supporting_update_or_insert)
    # Removal runs in reverse of the insertion order used further down.
    pbar = tqdm(list(reversed(table_names_not_supporting_update_or_insert)),
                leave=False)
    for table_name in pbar:
        pbar.set_description(
            rjust_and_shorten_text('remove {}'.format(table_name), width=40))
        remove_records(db, table_name, frame_by_table_name[table_name],
                       tables[table_name].key)

    LOGGER.debug('updating/creating records: %s',
                 table_names_supporting_update_or_insert)
    pbar = tqdm(table_names_supporting_update_or_insert, leave=False)
    for table_name in pbar:
        df = frame_by_table_name[table_name]
        pbar.set_description(
            rjust_and_shorten_text(
                'update/insert {}({})'.format(table_name, len(df)),
                width=40))
        if len(df) > 0:
            db[table_name].update_or_create_list(
                replace_null_with_none(df).to_dict(orient='records'))

    LOGGER.debug('inserting records: %s',
                 table_names_not_supporting_update_or_insert)
    pbar = tqdm(table_names_not_supporting_update_or_insert, leave=False)
    for table_name in pbar:
        df = frame_by_table_name[table_name]
        pbar.set_description(
            rjust_and_shorten_text('insert {}({})'.format(table_name, len(df)),
                                   width=40))
        if len(df) > 0:
            insert_records(db, table_name, df)