def migrate_organization_alias(connections): ''' Extract: Pull aliases along with the corresponding taxonomies Transform: Synthesize the value field using the :func:nppes_data_generators.names.organizations.synthetic_org_name_generator Load: Push synthesized aliases to scrubbed DB :param connections: Two-connection tuple to know where to pull the data from and here to push it back to :return: Just(connections) if everything went well else Nothing() ''' extract_stmt = """ SELECT COALESCE(GROUP_CONCAT(DISTINCT t.value), ''), a.* FROM spd_small.organization_alias a JOIN organization_taxonomy t ON a.organization_id = t.organization_id GROUP BY a.organization_id ORDER BY a.value; """ load_stmt = 'INSERT INTO organization_alias VALUES ({});'.format( ('%s,' * 6)[:-1]) def transform(_, aliases): name_synthesizer = synthetic_org_name_generator() def scrub(record): taxonomies, organization_alias_id, period_start, period_end, value, *rest = record return (organization_alias_id, period_start, period_end, name_synthesizer( value, *taxonomies.split(',') if taxonomies else []), *rest) return Just(map(scrub, aliases)) return SQLJob(extract_stmt, transform, load_stmt).run(connections)
def migrate_vhdir_insurance_plan(): ''' Extract: Pull all addresses from original DB Transform: Synthesize address line1 for each address Load: Push synthesized addresses to scrubbed DB :param connections: Two-connection tuple to know where to pull the data from and here to push it back to :return: Just(connections) if everything went well else Nothing() ''' extract_stmt = 'SELECT * FROM vhdir_insurance_plan;' load_stmt = 'INSERT INTO vhdir_insurance_plan VALUES ({});'.format( ('%s,' * 10)[:-1]) def transform(_, plans): name_synthesizer = synthetic_insurance_plan_name_generator() def scrub_plan(idx_and_record): idx, record = idx_and_record ip_id, md_id, status, name, *rest = record return (ip_id, md_id, status, name_synthesizer(idx, name), *rest) return Just(map(scrub_plan, enumerate(plans))) return SQLJob(extract_stmt, transform, load_stmt)
def migrate_identifier(entity_id, system, transformer): ''' Migrate the identifier table for the given entity and system. This function is meant to be called by the entity for which we want to synthesize identifiers. :param entity_id: Entity to migrate identifiers for (ex. vhdir_organization, vhdir_practitioner...) :param system: Identifier system use for these identifiers (ex. http://hl7.org/fhir/sid/us-npi) :param transformer: Not all identifiers are synthesized the same way depending on it's context. The entity calling this method will tell which one to use. :return: Just(connections) if everything went well else Nothing() ''' extract_stmt = """ SELECT {0}, identifier_id, identifier_status, identifier_status_value_code, `use`, system, type_cc_id, period_start, period_end, value FROM identifier WHERE {0} IS NOT NULL AND system='{1}'; """.format(entity_id, system) load_stmt = """ INSERT INTO identifier ({}, identifier_id, identifier_status, identifier_status_value_code, `use`, system, type_cc_id, period_start, period_end, value) VALUES ({}); """.format(entity_id, ('%s,'*10)[:-1]) return SQLJob(extract_stmt, transformer(entity_id), load_stmt)
def migrate_vhdir_network(): ''' Extract: Pull all networks from original DB Transform: Synthesize both name and alias for each network using :func:nppes_data_generators.names.networks.synthetic_network_name_generator Load: Push synthesized networks to scrubbed DB :param connections: Two-connection tuple to know where to pull the data from and here to push it back to :return: Just(connections) if everything went well else Nothing() ''' extract_stmt = """SELECT n.network_id, n.meta_data_id, n.active, n.period_start, n.period_end, n.part_of_resource_reference_id, n.practitioner_role_id, n.organization_affiliation_id, n.coverage_id, n.plan_id, n.alias, n.name, a.state FROM vhdir_network n JOIN (SELECT network_id, COALESCE(GROUP_CONCAT(state)) AS state FROM address GROUP BY network_id) a ON n.network_id=a.network_id ORDER BY a.state;""" load_stmt = """INSERT INTO vhdir_network (network_id, meta_data_id, active, period_start, period_end, part_of_resource_reference_id, practitioner_role_id, organization_affiliation_id, coverage_id, plan_id, name, alias) VALUES ({})""".format(('%s,'*12)[:-1]) def transform(_, networks): name_synthesizer = synthetic_network_name_generator by_state = (_[1] + (_[0],) for group in groupby(networks, lambda n: n[-1]) for _ in enumerate(group[1])) with_name = (_[:-4] + (name_synthesizer(_[-3], _[-2], _[-1]), name_synthesizer(_[-4], _[-2], _[-1])) for _ in by_state) return Just(with_name) return SQLJob(extract_stmt, transform, load_stmt)
def run(connections): from_, to = connections return SQLPipeline(SQLJob( '', lambda cursor, params: Just(None), 'INSERT INTO {2}.{0} SELECT * FROM {1}.{0};'.format( table, from_.database, to.database)), setup=setup, teardown=teardown).run(connections)
def migrate_vhdir_organization(): ''' Extract: Pull all organizations from original DB, along with their taxonomies Transform: Synthesize the name field using the :func:nppes_data_generators.names.organizations.synthetic_org_name_generator, then attach the name of the parent organization if any Load: Push synthesized organizations to scrubbed DB :param connections: Two-connection tuple to know where to pull the data from and here to push it back to :return: Just(connections) if everything went well else Nothing() ''' extract_stmt = """ SELECT o.organization_id, o.name, COALESCE(GROUP_CONCAT(DISTINCT t.value), ''), active, partOf_organization_id FROM vhdir_organization o LEFT JOIN organization_taxonomy t ON o.organization_id = t.organization_id GROUP BY o.organization_id ORDER BY name; """ load_stmt = """INSERT INTO vhdir_organization (organization_id, name, active, partOf_organization_id, partOf_organization_name) VALUES (%s, %s, %s, %s, %s);""" def transform(_, orgs): name_synthesizer = synthetic_org_name_generator() def scrub_org(org): organization_id, name, taxonomies, active, partOf_organization_id = org if organization_id % 10000 == 0: print(organization_id) return organization_id, name_synthesizer(name, *taxonomies.split(',') if taxonomies else []), active, \ partOf_organization_id @toolz.curry def attach_partOf_name(d, org): partOf_name = (d.get(org[-1])[1], ) if d.get(org[-1]) else (None, ) return org + partOf_name # Keep dict of generated names to attach parent org. names afterward without_partOf_name = {_[0]: _ for _ in map(scrub_org, orgs)} with_partOf_name = map(attach_partOf_name(without_partOf_name), without_partOf_name.values()) return Just(with_partOf_name) return SQLJob(extract_stmt, transform, load_stmt)
def migrate_telecom(connections): ''' Extract: Pull all telecom records from original DB Transform: Synthesize the value field using :func:nppes_data_generators.phone_numbers.phone_number.synthetic_number Load: Push synthesized records to scrubbed DB :param connections: Two-connection tuple to know where to pull the data from and here to push it back to :return: Just(connections) if everything went well else Nothing() ''' extract_stmt = 'SELECT * FROM telecom;' load_stmt = 'INSERT INTO telecom VALUES ({});'.format(('%s,' * 16)[:-1]) def transform(_, telecoms): def scrub_number(telecom): telecom_id, system, value, *rest = telecom return (telecom_id, system, synthetic_number(value), *rest) res = Just(list(map(scrub_number, telecoms))) return res return SQLJob(extract_stmt, transform, load_stmt).run(connections)
def migrate_address(connections): ''' Extract: Pull all addresses from original DB Transform: Synthesize address line1 for each address Load: Push synthesized addresses to scrubbed DB :param connections: Two-connection tuple to know where to pull the data from and here to push it back to :return: Just(connections) if everything went well else Nothing() ''' extract_stmt = 'SELECT * FROM address;' load_stmt = 'INSERT INTO address VALUES ({});'.format(('%s,' * 18)[:-1]) def transform(_, addresses): address_line_generator = synthetic_address_line_generator() def scrub_line1(record): address_id, use, type, text, line1, *rest = record return (address_id, use, type, text, address_line_generator(line1), *rest) return Just(map(scrub_line1, addresses)) return SQLJob(extract_stmt, transform, load_stmt).run(connections)
def migrate_name(connections): ''' Extract: Pull all names from original DB Transform: Synthesize first and last names for each record Load: Push synthesized names to scrubbed DB :param connections: Two-connection tuple to know where to pull the data from and here to push it back to :return: Just(connections) if everything went well else Nothing() ''' extract_stmt = 'SELECT * FROM name;' load_stmt = 'INSERT INTO name VALUES ({});'.format(('%s,' * 10)[:-1]) def transform(_, names): first_name_generator = synthetic_first_name_generator() last_name_generator = synthetic_last_name_generator() def scrub_name(record): name_id, use, text, family, given, *rest = record return (name_id, use, text, last_name_generator(family), first_name_generator(given), *rest) return Just(map(scrub_name, names)) return SQLJob(extract_stmt, transform, load_stmt).run(connections)