def test_extract_upc_more_than_one_labeler(self):
    """UPCs are extracted for a recall spanning multiple labeler codes.

    Note: 300436122 is not included (invalid length) even though the
    record indicates it's a UPC.
    """
    expected_upcs = [
        '300676122304', '032251004292', '400910131502', '300670675103',
        '300670668150', '300670675301', '300670675400', '300670675004',
        '300670675127', '300676255736', '300670674106', '300670674304',
        '681131739276', '681131739283', '050428068595', '050428046784',
        '050428065747', '050428579350', '050428046791', '050428065822',
        '050428160282',
    ]

    parsed = xmltodict.parse(open_data_file('upc-more-than-one-labeler.xml'))
    extracted_upcs = extract.extract_upc_from_recall(parsed['recall-number'])

    # Order is not guaranteed by the extractor, so compare sorted lists.
    self.assertListEqual(sorted(expected_upcs), sorted(extracted_upcs))
def test_extract_upc_with_spaces(self):
    """A UPC written with embedded spaces is still extracted."""
    raw_xml = open_data_file('upc-with-spaces.xml')
    parsed = xmltodict.parse(raw_xml)

    extracted_upcs = extract.extract_upc_from_recall(parsed['recall-number'])

    self.assertListEqual(['361958010115'], extracted_upcs,
                         'upc-with-spaces.xml')
def test_extract_upc_with_spaces_and_dashes(self):
    """A UPC containing both spaces and dashes is still extracted."""
    raw_xml = open_data_file('upc-with-spaces-and-dashes.xml')
    parsed = xmltodict.parse(raw_xml)

    extracted_upcs = extract.extract_upc_from_recall(parsed['recall-number'])

    self.assertListEqual(['306037039397'], extracted_upcs,
                         'upc-with-spaces-and-dashes.xml')
def map(self, key, value, output):
    """Annotate one JSON recall record and emit it if it is complete.

    Parses `value` as JSON, adds extracted UPC/NDC codes for drug
    records, stamps a content-hash `@id` and `@version`, then adds the
    record to `output` keyed by the original `key`. Records missing any
    required key, or with no report date, are logged and dropped.

    Fixes vs. previous version: `is not None` instead of `!= None`,
    `logging.warning` (``logging.warn`` is deprecated), and corrected
    the "Document" typo in the log message.
    """
    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = ['code-info', 'report-date', 'product-description']

    val = json.loads(value)
    if val['product-type'] == 'Drugs':
        val['upc'] = extract.extract_upc_from_recall(val)
        val['ndc'] = extract.extract_ndc_from_recall(val)

    # There is no good ID for the report, so we need to make one from a
    # stable serialization of the record itself.
    doc_id = self._hash(json.dumps(val, sort_keys=True))
    val['@id'] = doc_id
    val['@version'] = 1

    # Only write out vals that have required keys and a meaningful date
    if set(logic_keys).issubset(val) and val['report-date'] is not None:
        output.add(key, val)
    else:
        logging.warning('Document is missing required fields. %s',
                        json.dumps(val, indent=2, sort_keys=True))
def run(self):
    """Merge current and historic recall JSON files into one output file.

    Reads every newline-delimited JSON record from both input
    directories, annotates drug records with UPC/NDC codes, and writes
    records that have all required keys and a report date to
    `<output>/all_res.json`.

    Fixes vs. previous version: `os.makedirs` replaces the quoted
    `os.system('mkdir -p …')` shell-out, file handles are managed with
    `with` so they are closed even on error, and `is not None` replaces
    `!= None`.
    """
    output_dir = self.output().path
    os.makedirs(output_dir, exist_ok=True)

    current_event_files = glob.glob(self.input()[0].path + '/*.json')
    historic_event_files = glob.glob(self.input()[1].path + '/*.json')
    all_files = current_event_files + historic_event_files

    output_file = output_dir + '/all_res.json'

    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = ['code-info', 'report-date', 'product-description']

    with open(output_file, 'w') as out:
        for filename in all_files:
            with open(filename, 'r') as json_file:
                for row in json_file:
                    record = json.loads(row)
                    if record['product-type'] == 'Drugs':
                        record['upc'] = extract.extract_upc_from_recall(record)
                        record['ndc'] = extract.extract_ndc_from_recall(record)

                    # Only write out records that have required keys and a
                    # meaningful date
                    if (set(logic_keys).issubset(record)
                            and record['report-date'] is not None):
                        out.write(json.dumps(record) + '\n')
def map(self, key, value, output):
    """Clean, annotate, and emit one recall record.

    Applies `cleaner` to every key/value pair (renaming keys via
    RENAME_MAP and reformatting DATE_KEYS values from MM/DD/YYYY to
    YYYYMMDD), adds extracted UPC/NDC codes for drug records, stamps a
    content-hash `@id` and `@version`, and adds the record to `output`
    keyed by that hash. Records missing required keys or a report date
    are logged and dropped.

    Fixes vs. previous version: `logging.warning` (``logging.warn`` is
    deprecated) and corrected the "Document" typo in the log message.
    """
    def cleaner(k, v):
        # Normalize key names and date formats; a falsy date becomes None
        # so the record is filtered out below.
        if k in RENAME_MAP:
            k = RENAME_MAP[k]
        if k in DATE_KEYS:
            if not v:
                return None
            v = arrow.get(v, 'MM/DD/YYYY').format('YYYYMMDD')
        return (k, v)

    val = common.transform_dict(value, cleaner)

    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = ['code-info', 'report-date', 'product-description']

    if val.get('product-type') == 'Drugs':
        val['upc'] = extract.extract_upc_from_recall(val)
        val['ndc'] = extract.extract_ndc_from_recall(val)

    # There is not a decent ID for the report, so we need to make one
    doc_id = self._hash(json.dumps(val, sort_keys=True))
    val['@id'] = doc_id
    val['@version'] = 1

    # Only write out vals that have required keys and a meaningful date
    if set(logic_keys).issubset(val) and val['report-date'] is not None:
        output.add(doc_id, val)
    else:
        logging.warning('Document is missing required fields. %s',
                        json.dumps(val, indent=2, sort_keys=True))
def test_extract_upc_with_dashes(self):
    """A dashed UPC is extracted; the best-by date "041913 12265 1"
    must not be picked up as a UPC."""
    raw_xml = open_data_file('upc-with-dashes.xml')
    parsed = xmltodict.parse(raw_xml)

    extracted_upcs = extract.extract_upc_from_recall(parsed['recall-number'])

    self.assertListEqual(['698997806158'], extracted_upcs,
                         'upc-with-dashes.xml')
def run(self):
    """Merge current and historic recall JSON files into one output file.

    Reads every newline-delimited JSON record from both input
    directories, annotates drug records with UPC/NDC codes, and writes
    records that have all required keys and a report date to
    `<output>/all_res.json`.

    Fixes vs. previous version: `os.makedirs` replaces the quoted
    `os.system('mkdir -p …')` shell-out, file handles are managed with
    `with` so they are closed even on error, and `is not None` replaces
    `!= None`.
    """
    output_dir = self.output().path
    os.makedirs(output_dir, exist_ok=True)

    current_event_files = glob.glob(self.input()[0].path + '/*.json')
    historic_event_files = glob.glob(self.input()[1].path + '/*.json')
    all_files = current_event_files + historic_event_files

    output_file = output_dir + '/all_res.json'

    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = ['code-info', 'report-date', 'product-description']

    with open(output_file, 'w') as out:
        for filename in all_files:
            with open(filename, 'r') as json_file:
                for row in json_file:
                    record = json.loads(row)
                    if record['product-type'] == 'Drugs':
                        record['upc'] = extract.extract_upc_from_recall(record)
                        record['ndc'] = extract.extract_ndc_from_recall(record)

                    # Only write out records that have required keys and a
                    # meaningful date
                    if (set(logic_keys).issubset(record)
                            and record['report-date'] is not None):
                        out.write(json.dumps(record) + '\n')
def map(self, key, value, output):
    """Annotate each recall record in a batch and emit the complete ones.

    Iterates over `value['recall-number']`, adds extracted UPC/NDC
    codes for drug records, preserves the original recall number in a
    `recall-number` field (it is not unique, so it cannot serve as the
    ID), stamps a content-hash `@id` and `@version`, and adds each
    record to `output` keyed by that hash. Records missing required
    keys or a report date are logged and dropped.

    Fixes vs. previous version: `is not None` instead of `!= None`,
    `logging.warning` (``logging.warn`` is deprecated), and corrected
    the "Document" typo in the log message.
    """
    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = ['code-info', 'report-date', 'product-description']

    for val in value['recall-number']:
        if val['product-type'] == 'Drugs':
            val['upc'] = extract.extract_upc_from_recall(val)
            val['ndc'] = extract.extract_ndc_from_recall(val)

        # Copy the recall-number attribute value to an actual field.
        # The recall-number is not a reliable id, since it repeats.
        val['recall-number'] = val['@id']

        # There is no good ID for the report, so we need to make one from a
        # stable serialization of the record itself.
        doc_id = self._hash(json.dumps(val, sort_keys=True))
        val['@id'] = doc_id
        val['@version'] = 1

        # Only write out vals that have required keys and a meaningful date
        if set(logic_keys).issubset(val) and val['report-date'] is not None:
            output.add(doc_id, val)
        else:
            logging.warning('Document is missing required fields. %s',
                            json.dumps(val, indent=2, sort_keys=True))