Code example #1 (votes: 0)
File: extract_test.py — Project: zheismysavior/openfda
  def test_extract_upc_more_than_one_labeler(self):
    """A recall spanning multiple labelers yields every valid UPC.

    Note 300436122 is not included (invalid length) even though the record
    indicates it's a UPC.
    """
    expected_upcs = [
      '300676122304',
      '032251004292',
      '400910131502',
      '300670675103',
      '300670668150',
      '300670675301',
      '300670675400',
      '300670675004',
      '300670675127',
      '300676255736',
      '300670674106',
      '300670674304',
      '681131739276',
      '681131739283',
      '050428068595',
      '050428046784',
      '050428065747',
      '050428579350',
      '050428046791',
      '050428065822',
      '050428160282'
    ]

    # Parse the fixture and run extraction in one pass; order of the
    # extracted UPCs is not part of the contract, so compare sorted.
    recall = xmltodict.parse(open_data_file('upc-more-than-one-labeler.xml'))
    actual_upcs = extract.extract_upc_from_recall(recall['recall-number'])
    self.assertListEqual(sorted(expected_upcs), sorted(actual_upcs))
Code example #2 (votes: 0)
File: extract_test.py — Project: zheismysavior/openfda
 def test_extract_upc_with_spaces(self):
   """A UPC written with embedded spaces is still extracted."""
   recall = xmltodict.parse(open_data_file('upc-with-spaces.xml'))
   actual_upcs = extract.extract_upc_from_recall(recall['recall-number'])
   # Third argument is the failure message identifying the fixture.
   self.assertListEqual(['361958010115'], actual_upcs,
                        'upc-with-spaces.xml')
Code example #3 (votes: 0)
File: extract_test.py — Project: zheismysavior/openfda
 def test_extract_upc_with_spaces_and_dashes(self):
   """A UPC written with both spaces and dashes is still extracted."""
   recall = xmltodict.parse(open_data_file('upc-with-spaces-and-dashes.xml'))
   actual_upcs = extract.extract_upc_from_recall(recall['recall-number'])
   # Third argument is the failure message identifying the fixture.
   self.assertListEqual(['306037039397'], actual_upcs,
                        'upc-with-spaces-and-dashes.xml')
Code example #4 (votes: 0)
File: pipeline.py — Project: tjyacoub/openfda
  def map(self, key, value, output):
    """Annotate one JSON recall record and emit it if it is complete.

    Args:
      key: mapper key, passed through unchanged to `output.add`.
      value: JSON-encoded recall record (string).
      output: collector with an `add(key, val)` method.

    Drug records are enriched with extracted UPC and NDC lists; every
    record gets a content-hash `@id` and a `@version`. Records missing
    required fields (or with no report date) are logged and dropped.
    """
    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = [
      'code-info',
      'report-date',
      'product-description'
    ]

    val = json.loads(value)

    if val['product-type'] == 'Drugs':
      val['upc'] = extract.extract_upc_from_recall(val)
      val['ndc'] = extract.extract_ndc_from_recall(val)

    # There is no good ID for the report, so we need to make one:
    # hash the canonical (sorted-keys) serialization of the record.
    doc_id = self._hash(json.dumps(val, sort_keys=True))
    val['@id'] = doc_id
    val['@version'] = 1

    # Only write out vals that have required keys and a meaningful date.
    # `is not None` (not `!= None`) is the correct identity test.
    if set(logic_keys).issubset(val) and val['report-date'] is not None:
      output.add(key, val)
    else:
      # logging.warn is a deprecated alias; use logging.warning.
      logging.warning('Document is missing required fields. %s',
                      json.dumps(val, indent=2, sort_keys=True))
Code example #5 (votes: 0)
File: pipeline.py — Project: gforz/openfda
  def run(self):
    """Merge current and historic recall event files into one output file.

    Reads every *.json file from the two input directories, annotates
    drug records with extracted UPC/NDC lists, and writes the records
    that have all required keys and a report date to all_res.json
    (one JSON document per line).
    """
    output_dir = self.output().path
    # os.makedirs instead of shelling out to `mkdir -p`: no shell
    # involvement, so paths with special characters are safe.
    if not os.path.exists(output_dir):
      os.makedirs(output_dir)
    current_event_files = glob.glob(self.input()[0].path + '/*.json')
    historic_event_files = glob.glob(self.input()[1].path + '/*.json')
    all_files = current_event_files + historic_event_files
    output_file = self.output().path + '/all_res.json'
    # These keys must exist in the JSON records for the annotation logic to work
    logic_keys = [
      'code-info',
      'report-date',
      'product-description'
    ]

    # Context managers guarantee the handles are closed (the original
    # leaked both the output file and every input file).
    with open(output_file, 'w') as out:
      for filename in all_files:
        with open(filename, 'r') as json_file:
          for row in json_file:
            record = json.loads(row)
            if record['product-type'] == 'Drugs':
              record['upc'] = extract.extract_upc_from_recall(record)
              record['ndc'] = extract.extract_ndc_from_recall(record)
            # Only write out records that have required keys and a meaningful date
            if set(logic_keys).issubset(record) and record['report-date'] is not None:
              out.write(json.dumps(record) + '\n')
Code example #6 (votes: 0)
File: pipeline.py — Project: LiuFang816/SALSTM_py_data
    def map(self, key, value, output):
        """Clean and annotate one recall record, emitting it if complete.

        Args:
          key: mapper key (unused; the emitted key is the content hash).
          value: dict-like recall record.
          output: collector with an `add(key, val)` method.
        """
        def cleaner(k, v):
            # Normalize key names, then reformat any date values from
            # MM/DD/YYYY to YYYYMMDD; empty dates drop the pair entirely.
            if k in RENAME_MAP:
                k = RENAME_MAP[k]

            if k in DATE_KEYS:
                if not v:
                    return None
                v = arrow.get(v, 'MM/DD/YYYY').format('YYYYMMDD')

            return (k, v)

        val = common.transform_dict(value, cleaner)

        # These keys must exist in the JSON records for the annotation logic to work
        logic_keys = ['code-info', 'report-date', 'product-description']

        if val.get('product-type') == 'Drugs':
            val['upc'] = extract.extract_upc_from_recall(val)
            val['ndc'] = extract.extract_ndc_from_recall(val)

        # There is not a decent ID for the report, so we need to make one:
        # hash the canonical (sorted-keys) serialization of the record.
        doc_id = self._hash(json.dumps(val, sort_keys=True))
        val['@id'] = doc_id
        val['@version'] = 1

        # Only write out vals that have required keys and a meaningful date
        if set(logic_keys).issubset(val) and val['report-date'] is not None:
            output.add(doc_id, val)
        else:
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning('Document is missing required fields. %s',
                            json.dumps(val, indent=2, sort_keys=True))
Code example #7 (votes: 0)
File: extract_test.py — Project: zheismysavior/openfda
 def test_extract_upc_with_dashes(self):
   """A dashed UPC is extracted; a best-by date is not mistaken for one.

   Ensure the best by date "041913 12265 1" is not included.
   """
   recall = xmltodict.parse(open_data_file('upc-with-dashes.xml'))
   actual_upcs = extract.extract_upc_from_recall(recall['recall-number'])
   # Third argument is the failure message identifying the fixture.
   self.assertListEqual(['698997806158'], actual_upcs,
                        'upc-with-dashes.xml')
Code example #8 (votes: 0)
    def run(self):
        """Merge current and historic recall event files into one output file.

        Reads every *.json file from the two input directories, annotates
        drug records with extracted UPC/NDC lists, and writes the records
        that have all required keys and a report date to all_res.json
        (one JSON document per line).
        """
        output_dir = self.output().path
        # os.makedirs instead of shelling out to `mkdir -p`: no shell
        # involvement, so paths with special characters are safe.
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        current_event_files = glob.glob(self.input()[0].path + '/*.json')
        historic_event_files = glob.glob(self.input()[1].path + '/*.json')
        all_files = current_event_files + historic_event_files
        output_file = self.output().path + '/all_res.json'
        # These keys must exist in the JSON records for the annotation logic to work
        logic_keys = ['code-info', 'report-date', 'product-description']

        # Context managers guarantee the handles are closed (the original
        # leaked both the output file and every input file).
        with open(output_file, 'w') as out:
            for filename in all_files:
                with open(filename, 'r') as json_file:
                    for row in json_file:
                        record = json.loads(row)
                        if record['product-type'] == 'Drugs':
                            record['upc'] = extract.extract_upc_from_recall(record)
                            record['ndc'] = extract.extract_ndc_from_recall(record)
                        # Only write out records that have required keys and a meaningful date
                        if set(logic_keys).issubset(record) and record['report-date'] is not None:
                            out.write(json.dumps(record) + '\n')
Code example #9 (votes: 0)
File: pipeline.py — Project: serayamaouche/openfda
    def map(self, key, value, output):
        """Annotate each recall record under `value['recall-number']` and emit it.

        Args:
          key: mapper key (unused; the emitted key is the content hash).
          value: parsed document whose 'recall-number' entry is a list of
                 recall records.
          output: collector with an `add(key, val)` method.
        """
        # These keys must exist in the JSON records for the annotation logic to work
        logic_keys = ['code-info', 'report-date', 'product-description']

        for val in value['recall-number']:
            if val['product-type'] == 'Drugs':
                val['upc'] = extract.extract_upc_from_recall(val)
                val['ndc'] = extract.extract_ndc_from_recall(val)

            # Copy the recall-number attribute value to an actual field.
            # The recall-number is not a reliable id, since it repeats.
            val['recall-number'] = val['@id']

            # There is no good ID for the report, so we need to make one:
            # hash the canonical (sorted-keys) serialization of the record.
            doc_id = self._hash(json.dumps(val, sort_keys=True))
            val['@id'] = doc_id
            val['@version'] = 1

            # Only write out vals that have required keys and a meaningful date.
            # `is not None` (not `!= None`) is the correct identity test.
            if set(logic_keys).issubset(val) and val['report-date'] is not None:
                output.add(doc_id, val)
            else:
                # logging.warn is a deprecated alias; use logging.warning.
                logging.warning('Document is missing required fields. %s',
                                json.dumps(val, indent=2, sort_keys=True))