def read_frame(self):
    primary_key = self.primary_key
    # Use the primary key as the DataFrame index only when it is a single
    # column; composite keys cannot be passed as index_col here.
    df = pd.read_sql_table(
        self.table.__tablename__,
        self.session.get_bind(),
        index_col=(primary_key[0].name if len(primary_key) == 1 else None)
    )
    return replace_null_with_none(df)
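# For context, a self-contained example of what pd.read_sql_table with
# index_col returns, using an in-memory SQLite engine (the table and
# column names here are illustrative, not taken from the project):
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('sqlite://')
pd.DataFrame({'person_id': ['p1'], 'name': ['Ann']}).to_sql(
    'person', engine, index=False)
df = pd.read_sql_table('person', engine, index_col='person_id')
# df is indexed by person_id, with the remaining columns as data.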
def test_should_return_none_for_nat(self):
    assert replace_null_with_none(
        pd.DataFrame([[pd.NaT]])
    )[0][0] is None
def test_should_keep_valid_value(self):
    assert replace_null_with_none(
        pd.DataFrame([['valid']])
    )[0][0] == 'valid'
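# The two tests above pin down the contract of replace_null_with_none:
# pandas null markers such as NaT (or NaN) become None, while ordinary
# values pass through unchanged. A minimal sketch of such a helper,
# assuming plain pandas semantics (an illustration, not necessarily the
# project's actual implementation):
import pandas as pd


def replace_null_with_none(df: pd.DataFrame) -> pd.DataFrame:
    # Cast to object dtype so cells can hold None, then use the
    # notnull() mask to replace every null marker (NaN, NaT, pd.NA).
    return df.astype(object).where(pd.notnull(df), None)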
def _convert_data(
        process_fn: callable, db, field_mapping_by_table_name,
        early_career_researcher_person_ids, export_emails=False):
    table_names = {
        'person', 'manuscript_email_meta', 'manuscript', 'manuscript_version'
    } | person_custom_extractors_by_table_name.keys()
    if export_emails:
        table_names.add('emails')
    for copy_paths in xml_copy_paths.values():
        for table_name in copy_paths.keys():
            table_names.add(table_name)
    tables = {
        table_name: TableOutput(name=table_name)
        for table_name in table_names
    }
    process_fn(tables=tables)
    db.session.close_all()

    # Order the core tables first, then the remaining tables alphabetically,
    # keeping only tables that have a field mapping.
    table_names = ['person', 'manuscript', 'manuscript_version']
    table_names = (
        table_names
        + [t for t in sorted(tables.keys()) if t not in table_names]
    )
    table_names = [t for t in table_names if t in field_mapping_by_table_name]
    frame_by_table_name = {
        table_name: tables[table_name].to_frame()
        for table_name in table_names
    }
    frame_by_table_name['person'] = apply_early_career_researcher_flag(
        frame_by_table_name['person'],
        early_career_researcher_person_ids
    )
    frame_by_table_name['manuscript_stage'] = (
        filter_duplicate_stage_use_highest_trigger_by_df(
            frame_by_table_name['manuscript_stage']
        )
    )
    # Ignore entries with an invalid person id
    # (this may be addressed differently in the future).
    filter_invalid_person_ids(frame_by_table_name)

    # We currently only support updates or inserts for tables with a single
    # primary key. Updating relationships is more complicated: we would, for
    # example, need to remove relationships that no longer exist. Removing the
    # parents of relationships is costly (parents tend to have a single
    # primary key), so the speed gain of only supporting single-key tables
    # for now is still worth it.
    table_names_supporting_update_or_insert_set = {
        t for t in table_names if len(db[t].primary_key) == 1
    }
    table_names_supporting_update_or_insert = [
        t for t in table_names
        if t in table_names_supporting_update_or_insert_set
    ]
    table_names_not_supporting_update_or_insert = [
        t for t in table_names
        if t not in table_names_supporting_update_or_insert_set
    ]

    LOGGER.debug(
        'removing records: %s', table_names_not_supporting_update_or_insert
    )
    pbar = tqdm(
        list(reversed(table_names_not_supporting_update_or_insert)),
        leave=False
    )
    for table_name in pbar:
        pbar.set_description(
            rjust_and_shorten_text('remove {}'.format(table_name), width=40)
        )
        remove_records(
            db, table_name, frame_by_table_name[table_name],
            tables[table_name].key
        )

    LOGGER.debug(
        'updating/creating records: %s',
        table_names_supporting_update_or_insert
    )
    pbar = tqdm(table_names_supporting_update_or_insert, leave=False)
    for table_name in pbar:
        df = frame_by_table_name[table_name]
        pbar.set_description(
            rjust_and_shorten_text(
                'update/insert {}({})'.format(table_name, len(df)), width=40
            )
        )
        if len(df) > 0:
            db[table_name].update_or_create_list(
                replace_null_with_none(df).to_dict(orient='records')
            )

    LOGGER.debug(
        'inserting records: %s', table_names_not_supporting_update_or_insert
    )
    pbar = tqdm(table_names_not_supporting_update_or_insert, leave=False)
    for table_name in pbar:
        df = frame_by_table_name[table_name]
        pbar.set_description(
            rjust_and_shorten_text(
                'insert {}({})'.format(table_name, len(df)), width=40
            )
        )
        if len(df) > 0:
            insert_records(db, table_name, df)
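# The update/insert branch above hands update_or_create_list a list of
# plain dicts. A quick illustration of that conversion (column names are
# made up; the inline expression stands in for replace_null_with_none):
import pandas as pd

df = pd.DataFrame({'person_id': ['p1', 'p2'], 'name': ['Ann', pd.NA]})
records = df.astype(object).where(pd.notnull(df), None).to_dict(
    orient='records')
# [{'person_id': 'p1', 'name': 'Ann'}, {'person_id': 'p2', 'name': None}]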