def _join(self, key, values):
    """Build a single joined MDR record for ``key`` from its pivoted rows.

    Every ``mdrfoi`` row is merged, in order, into one flat record. Rows from
    each source file listed in ``self.join_map`` are then appended to the
    list stored under that source's target key.

    Returns the joined dict. Returns ``None`` when there is no ``mdrfoi``
    row, or when ``int()`` raises ``TypeError`` for the record's
    ``mdr_report_key`` (for instance because the key is missing).
    """
    pivoted = parallel.pivot_values(values)
    reports = pivoted.get('mdrfoi', [])
    if not reports:
        logging.info('MDR REPORT %s: Missing mdrfoi record, Skipping join', key)
        return None

    record = {'device': [], 'mdr_text': [], 'patient': []}
    for report in reports:
        record.update(report)

    # Only the TypeError case is treated as "skip this report"; any other
    # conversion error propagates to the caller.
    try:
        int(record.get('mdr_report_key', None))
    except TypeError:
        logging.info('%s', '*' * 2400)
        return None

    for source_file, target_key in self.join_map.items():
        for nested_row in pivoted.get(source_file, []):
            # The join key is redundant once the row is nested in the report.
            nested_row.pop('mdr_report_key', 0)
            record[target_key].append(nested_row)

    return record
def reduce(self, key, values, output):
    """Write the preferred value for ``key`` to ``output``.

    With exactly one input value, that value is written if it is truthy.
    With several, the values are pivoted by source and the first truthy one
    of ``change``, ``add``, ``init`` is written; when neither ``change`` nor
    ``add`` is truthy, whatever ``init`` resolved to is written. Nothing is
    written when ``values`` is empty.
    """

    def _first_or_self(candidate):
        # Lists collapse to their first element (None when empty); anything
        # else passes through untouched.
        if not isinstance(candidate, list):
            return candidate
        return candidate[0] if len(candidate) > 0 else None

    # A record can show up in the changes, in the additions and in the
    # existing data carried over from last week's run. That is not ideal,
    # but it is reality, so pick one in the order: changed, added, existing.
    count = len(values)
    if count == 1:
        only = _first_or_self(values[0][1])
        if only:
            output.put(key, only)
        return
    if count < 1:
        return

    by_source = parallel.pivot_values(values)
    change = _first_or_self(by_source.get('change', []))
    add = _first_or_self(by_source.get('add', []))
    init = _first_or_self(by_source.get('init', []))
    output.put(key, change or add or init)
def _join(self, key, values):
    """Build a joined MDR record for ``key``, including problem codes.

    Every ``mdrfoi`` row is merged, in order, into one flat record. Rows from
    each source file in ``self.join_map`` are appended under the mapped
    target key; for the ``mdr_text`` target a row is skipped when a row with
    the same ``mdr_text_key`` is already present. Device problem codes are
    stored under ``product_problems`` and patient problem codes are attached
    to the patient with the matching ``patient_sequence_number``.

    Returns the joined dict. Returns ``None`` when there is no ``mdrfoi``
    row, or when ``int()`` raises ``TypeError`` for the record's
    ``mdr_report_key``.
    """
    pivoted = parallel.pivot_values(values)
    reports = pivoted.get('mdrfoi', [])
    if not reports:
        return None

    record = {'device': [], 'mdr_text': [], 'patient': []}
    for report in reports:
        record.update(report)

    try:
        int(record.get('mdr_report_key', None))
    except TypeError:
        logging.info('%s', '*' * 2400)
        return None

    for source_file, target_key in self.join_map.items():
        for nested_row in pivoted.get(source_file, []):
            # The join key is redundant once the row is nested in the report.
            nested_row.pop('mdr_report_key', 0)
            if target_key == 'mdr_text':
                same_text = [
                    txt for txt in record[target_key]
                    if txt['mdr_text_key'] == nested_row['mdr_text_key']
                ]
                if same_text:
                    continue
            record[target_key].append(nested_row)

    # Tuck the device problem codes onto the final record.
    if pivoted.get('foidevproblem', []):
        record['product_problems'] = [
            entry['product_problem'] for entry in pivoted['foidevproblem']
        ]

    # https://github.com/FDA/openfda/issues/179
    # Patient problem codes sometimes arrive without the patient they belong
    # to. A placeholder patient holding only the sequence number is created
    # in that case so the codes have somewhere to live.
    for problem in pivoted.get('patientproblemcode', []):
        known = [
            patient for patient in record['patient']
            if patient['patient_sequence_number'] == problem['patient_sequence_number']
        ]
        if not known:
            record['patient'].append(
                {'patient_sequence_number': problem['patient_sequence_number']}
            )

    for patient in record['patient']:
        for problem in pivoted.get('patientproblemcode', []):
            if patient['patient_sequence_number'] != problem['patient_sequence_number']:
                continue
            collected = patient.get('patient_problems')
            code = problem['patient_problem']
            # Build a new list on every addition instead of extending the
            # existing one in place.
            patient['patient_problems'] = (
                [code] if collected is None else collected + [code]
            )

    return record
def _join(self, key, values):
    """Join the pivoted rows for ``key`` into one MDR report dict.

    The ``mdrfoi`` rows are folded, in order, into a single record. The rows
    of every source file named in ``self.join_map`` are then appended to the
    list held under that source's target key.

    Returns the joined dict, or ``None`` if no ``mdrfoi`` row exists or if
    ``int()`` raises ``TypeError`` on the record's ``mdr_report_key``.
    """
    by_source = parallel.pivot_values(values)

    main_reports = by_source.get('mdrfoi', [])
    if not main_reports:
        logging.info('MDR REPORT %s: Missing mdrfoi record, Skipping join', key)
        return None

    joined = {
        'device': [],
        'mdr_text': [],
        'patient': [],
    }
    for main_report in main_reports:
        joined.update(main_report)

    # A report whose key makes int() raise TypeError (e.g. a missing key) is
    # logged and dropped.
    try:
        int(joined.get('mdr_report_key', None))
    except TypeError:
        logging.info('%s', '*' * 2400)
        return None

    for source_file, target_key in self.join_map.items():
        for child in by_source.get(source_file, []):
            # Nested rows do not need to carry the join key.
            child.pop('mdr_report_key', 0)
            joined[target_key].append(child)

    return joined
def _join(self, values):
    """Merge the streamed establishment type into each listing row.

    Returns a list with one dict per ``listing_estabtypes`` row, each
    combined with the first ``estabtypes`` row. On a key collision the
    ``estabtypes`` value wins.
    """
    intermediate = parallel.pivot_values(values)

    # There should be only one establishment type streamed. When none is
    # present (presumably a listing row pointing at an establishment type
    # that no longer exists), fall back to an empty description instead of
    # failing with IndexError.
    est_types = intermediate.get('estabtypes') or [{'description': ''}]
    est_type = est_types[0]

    result = []
    for data in intermediate.get('listing_estabtypes', []):
        # Copy-then-update rather than `items() + items()`: dict views
        # cannot be concatenated on Python 3.
        final = dict(data)
        final.update(est_type)
        result.append(final)

    return result
def _join(self, values):
    """Merge the streamed establishment type into each listing row.

    Returns a list with one dict per ``listing_estabtypes`` row, each
    combined with the first ``estabtypes`` row. On a key collision the
    ``estabtypes`` value wins.
    """
    intermediate = parallel.pivot_values(values)

    # There should be only one establishment type streamed. When none is
    # present (presumably a listing row pointing at an establishment type
    # that no longer exists), fall back to an empty description instead of
    # failing with IndexError.
    est_types = intermediate.get("estabtypes") or [{"description": ""}]
    est_type = est_types[0]

    result = []
    for data in intermediate.get("listing_estabtypes", []):
        # Copy-then-update rather than `items() + items()`: dict views
        # cannot be concatenated on Python 3.
        final = dict(data)
        final.update(est_type)
        result.append(final)

    return result
def _join(self, values):
    """Attach product and proprietary-name rows to each registration listing.

    Returns one dict per ``remapped_registration_listing`` row: a copy of the
    row with ``products`` set to the ``listing_pcd`` rows and
    ``proprietary_name`` set to the ``listing_proprietary_name`` rows (an
    empty list when a source is absent).
    """
    pivoted = parallel.pivot_values(values)

    joined = []
    for listing in pivoted.get("remapped_registration_listing", []):
        record = dict(listing)
        record.update(
            {
                "products": pivoted.get("listing_pcd", []),
                "proprietary_name": pivoted.get("listing_proprietary_name", []),
            }
        )
        joined.append(record)

    return joined
def _join(self, values):
    """Attach product and proprietary-name rows to each registration listing.

    Returns one dict per ``remapped_registration_listing`` row: a copy of the
    row with ``products`` set to the ``listing_pcd`` rows and
    ``proprietary_name`` set to the ``listing_proprietary_name`` rows (an
    empty list when a source is absent).
    """
    by_source = parallel.pivot_values(values)
    listings = by_source.get('remapped_registration_listing', [])

    joined = []
    for listing in listings:
        enriched = dict(listing)
        # Looked up per row so each row gets what the pivot holds right now.
        enriched['products'] = by_source.get('listing_pcd', [])
        enriched['proprietary_name'] = by_source.get(
            'listing_proprietary_name', [])
        joined.append(enriched)
    return joined
def _join(self, values):
    """Merge the streamed establishment type into each listing row.

    Returns a list with one dict per ``listing_estabtypes`` row, each
    combined with the first ``estabtypes`` row (an empty ``description``
    when the ``estabtypes`` source is absent). On a key collision the
    ``estabtypes`` value wins.
    """
    pivoted = parallel.pivot_values(values)

    # Only one establishment type is expected per group. The type with ID 4
    # is no longer in estabtypes.txt, yet at least two listing_estabtypes
    # records still point at it. This is likely a data issue on the FDA side,
    # handled here by substituting an empty description.
    placeholder = [{'description': ''}]
    est_type = pivoted.get('estabtypes', placeholder)[0]

    merged_rows = []
    for listing in pivoted.get('listing_estabtypes', []):
        merged = dict(listing)
        merged.update(est_type)
        merged_rows.append(merged)
    return merged_rows
def _join(self, values):
    """Join facility registrations with their US agent and owner operator.

    Returns one dict per ``registration`` row whose ``address_type_id`` is
    ``'F'``: a copy of the row with ``us_agent`` and ``owner_operator``
    attached. Either is an empty dict when its source has no rows.
    """
    address_keys = [
        'address_line_1',
        'address_line_2',
        'city',
        'state_id',
        'zip_code',
        'postal_code',
        'iso_country_code'
    ]

    intermediate = parallel.pivot_values(values)
    registrations = intermediate.get('registration', [])

    # The US Agent Address is in the registration dataset, we need to pluck
    # it out and merge it with each us agent record. If several 'U' rows are
    # present, the last one wins.
    us_agent_address = {}
    for row in registrations:
        if row.get('address_type_id', None) == 'U':
            us_agent_address = {k: v for k, v in row.items() if k in address_keys}

    # There are 0 or 1 US Agents assigned to a facility.
    us_agent = {}
    agent_data = intermediate.get('us_agent', [])
    if agent_data:
        # Copy-then-update rather than `items() + items()`: dict views
        # cannot be concatenated on Python 3. Address fields override any
        # same-named agent fields.
        us_agent = dict(agent_data[0])
        us_agent.update(us_agent_address)

    # There is 0 or 1 owner operators.
    owner_operator = {}
    owner_operator_data = intermediate.get('intermediate_owner_operator', [])
    if owner_operator_data:
        owner_operator = owner_operator_data[0]

    result = []
    for row in registrations:
        # We only want `Facility` records, i.e. skip all the us agent addresses.
        if row.get('address_type_id', None) == 'F':
            final = dict(row)
            final['us_agent'] = us_agent
            final['owner_operator'] = owner_operator
            result.append(final)

    return result
def _join(self, values):
    """Join facility registrations with their US agent and owner operator.

    Returns one dict per ``registration`` row whose ``address_type_id`` is
    ``'F'``: a copy of the row with ``us_agent`` and ``owner_operator``
    attached. Either is an empty dict when its source has no rows.
    """
    address_fields = (
        'address_line_1',
        'address_line_2',
        'city',
        'state_id',
        'zip_code',
        'postal_code',
        'iso_country_code',
    )

    pivoted = parallel.pivot_values(values)
    registrations = pivoted.get('registration', [])

    # The US agent's address lives in the registration dataset (rows of type
    # 'U'); the last such row supplies the address fields.
    agent_rows = [
        row for row in registrations
        if row.get('address_type_id', None) == 'U'
    ]
    agent_address = {}
    if agent_rows:
        agent_address = {
            field: value
            for field, value in agent_rows[-1].items()
            if field in address_fields
        }

    # A facility has zero or one US agent; address fields are laid over it.
    us_agent = {}
    agents = pivoted.get('us_agent', [])
    if agents:
        us_agent = dict(agents[0])
        us_agent.update(agent_address)

    # Likewise zero or one owner operator.
    owner_operators = pivoted.get('intermediate_owner_operator', [])
    owner_operator = owner_operators[0] if owner_operators else {}

    facilities = []
    for row in registrations:
        # Keep facility rows only; US agent address rows are skipped.
        if row.get('address_type_id', None) != 'F':
            continue
        facility = dict(row)
        facility['us_agent'] = us_agent
        facility['owner_operator'] = owner_operator
        facilities.append(facility)

    return facilities
def _join(self, values):
    """Join facility registrations with their US agent and owner operator.

    Returns one dict per ``registration`` row whose ``address_type_id`` is
    ``"F"``: a copy of the row with ``us_agent`` and ``owner_operator``
    attached. Either is an empty dict when its source has no rows.
    """
    address_keys = [
        "address_line_1",
        "address_line_2",
        "city",
        "state_id",
        "zip_code",
        "postal_code",
        "iso_country_code",
    ]

    intermediate = parallel.pivot_values(values)
    registrations = intermediate.get("registration", [])

    # The US Agent Address is in the registration dataset, we need to pluck
    # it out and merge it with each us agent record. If several "U" rows are
    # present, the last one wins.
    us_agent_address = {}
    for row in registrations:
        if row.get("address_type_id", None) == "U":
            us_agent_address = {k: v for k, v in row.items() if k in address_keys}

    # There are 0 or 1 US Agents assigned to a facility.
    us_agent = {}
    agent_data = intermediate.get("us_agent", [])
    if agent_data:
        # Copy-then-update rather than `items() + items()`: dict views
        # cannot be concatenated on Python 3. Address fields override any
        # same-named agent fields.
        us_agent = dict(agent_data[0])
        us_agent.update(us_agent_address)

    # There is 0 or 1 owner operators.
    owner_operator = {}
    owner_operator_data = intermediate.get("intermediate_owner_operator", [])
    if owner_operator_data:
        owner_operator = owner_operator_data[0]

    result = []
    for row in registrations:
        # We only want `Facility` records, i.e. skip all the us agent addresses.
        if row.get("address_type_id", None) == "F":
            final = dict(row)
            final["us_agent"] = us_agent
            final["owner_operator"] = owner_operator
            result.append(final)

    return result
def _join(self, values):
    """Produce the final registration-listing records for one group.

    Returns one dict per ``intermediate_registration_listing`` row: a copy of
    the row with ``establishment_type`` (descriptions of the establishment
    listing rows whose join key matches), ``registration`` (the first
    ``intermediate_registration`` row, or an empty dict) and
    ``proprietary_name`` (the row's distinct, truthy proprietary names in
    first-seen order).
    """
    pivoted = parallel.pivot_values(values)

    joined = []
    for listing in pivoted.get('intermediate_registration_listing', []):
        record = dict(listing)
        record['establishment_type'] = []
        record['registration'] = []
        unique_names = record['proprietary_name'] = []

        # Collapse repeated proprietary names, keeping first-seen order.
        for entry in listing.get('proprietary_name', []):
            candidate = entry.get('proprietary_name', None)
            if not candidate or candidate in unique_names:
                continue
            unique_names.append(candidate)

        est_fields = ['registration_listing_id']
        reg_fields = ['reg_key']
        listing_est_key = construct_join_key(listing, est_fields)
        # NOTE(review): this key is never read below; the call is kept so the
        # helper still runs exactly as before. Confirm whether it is needed.
        unused_reg_key = construct_join_key(record, reg_fields)

        # Keep only the description of each matching establishment type.
        for est_row in pivoted.get('intermediate_establishment_listing', []):
            candidate_key = construct_join_key(est_row, est_fields)
            if listing_est_key == candidate_key:
                record['establishment_type'].append(est_row['description'])

        # At most one registered facility is expected; take the first.
        facilities = pivoted.get('intermediate_registration', [])
        record['registration'] = facilities[0] if facilities else {}

        joined.append(record)

    return joined
def _join(self, values):
    """Join owner operators with their contact address and correspondent.

    Returns one dict per ``owner_operator`` row: a copy of the row with
    ``official_correspondent`` (the last ``official_correspondent`` row
    sharing its ``reg_key``, else an empty dict) and ``contact_address``
    (the first ``contact_addresses`` row, else an empty dict). Returns an
    empty list when the group has no ``owner_operator`` rows.
    """
    intermediate = parallel.pivot_values(values)

    # Should only be one address, but the intermediate interface is a list
    # so we will just grab the first item from the list.
    contact_address_data = intermediate.get('contact_addresses', None)
    correspondents = intermediate.get('official_correspondent', [])

    result = []
    # Use .get() so a group without owner operator rows yields no output
    # instead of raising KeyError.
    for row in intermediate.get('owner_operator', []):
        final = dict(row)
        final['official_correspondent'] = {}
        final['contact_address'] = (
            contact_address_data[0] if contact_address_data else {}
        )

        left_key = final['reg_key']
        for data in correspondents:
            if data['reg_key'] == left_key:
                final['official_correspondent'] = data

        result.append(final)

    return result
def _join(self, values):
    """Join owner operators with their contact address and correspondent.

    Returns one dict per ``owner_operator`` row: a copy of the row with
    ``official_correspondent`` (the last ``official_correspondent`` row
    sharing its ``reg_key``, else an empty dict) and ``contact_address``
    (the first ``contact_addresses`` row, else an empty dict). Returns an
    empty list when the group has no ``owner_operator`` rows.
    """
    intermediate = parallel.pivot_values(values)

    # Should only be one address, but the intermediate interface is a list
    # so we will just grab the first item from the list.
    contact_address_data = intermediate.get("contact_addresses", None)
    correspondents = intermediate.get("official_correspondent", [])

    result = []
    # Use .get() so a group without owner operator rows yields no output
    # instead of raising KeyError.
    for row in intermediate.get("owner_operator", []):
        final = dict(row)
        final["official_correspondent"] = {}
        final["contact_address"] = (
            contact_address_data[0] if contact_address_data else {}
        )

        left_key = final["reg_key"]
        for data in correspondents:
            if data["reg_key"] == left_key:
                final["official_correspondent"] = data

        result.append(final)

    return result
def _join(self, values):
    """Produce the final registration-listing records for one group.

    Returns one dict per ``intermediate_registration_listing`` row: a copy of
    the row with ``establishment_type`` (descriptions of the establishment
    listing rows whose join key matches), ``registration`` (the first
    ``intermediate_registration`` row, or an empty dict) and
    ``proprietary_name`` (the row's distinct, truthy proprietary names in
    first-seen order).
    """
    by_source = parallel.pivot_values(values)

    output_rows = []
    for listing in by_source.get("intermediate_registration_listing", []):
        enriched = dict(listing)
        enriched["establishment_type"] = []
        enriched["registration"] = []
        distinct_names = enriched["proprietary_name"] = []

        # Drop repeated proprietary names while keeping first-seen order.
        for name_row in listing.get("proprietary_name", []):
            label = name_row.get("proprietary_name", None)
            if label and label not in distinct_names:
                distinct_names.append(label)

        est_fields = ["registration_listing_id"]
        reg_fields = ["reg_key"]
        listing_est_key = construct_join_key(listing, est_fields)
        # NOTE(review): the result of this call is never used; it is kept so
        # the helper is invoked exactly as before. Confirm it can be removed.
        unused_reg_key = construct_join_key(enriched, reg_fields)

        # Only the description of each matching establishment type is kept.
        for est_row in by_source.get("intermediate_establishment_listing", []):
            est_row_key = construct_join_key(est_row, est_fields)
            if listing_est_key == est_row_key:
                enriched["establishment_type"].append(est_row["description"])

        # A single registered facility is expected; use the first if present.
        registered = by_source.get("intermediate_registration", [])
        enriched["registration"] = registered[0] if registered else {}

        output_rows.append(enriched)

    return output_rows