Esempio n. 1
0
def harmonize_unii(out_file, product_file, class_index_dir):
    """Write one JSON line per SPL id that matches an extracted UNII record.

    The tab-delimited ``product_file`` is read to collect the substance names
    listed for each SPL id. Every ``*.xml`` file found under
    ``class_index_dir`` is handed to ``parallel_runner.parallel_extract``
    together with ``harmonization_extract_worker``, and each SPL id whose
    substances match an extracted record is written to ``out_file`` as
    ``{"spl_id": ..., "unii_indexing": ...}``.

    Both files are handled through ``with`` blocks so their handles are
    closed (and the output flushed) even if an exception is raised.

    Args:
        out_file: Path of the JSON-lines file to write.
        product_file: Path of the tab-delimited product file; the
            ``PRODUCTID`` and ``SUBSTANCENAME`` columns are read.
        class_index_dir: Directory searched recursively for ``*.xml`` files.

    Raises:
        IndexError: If a ``PRODUCTID`` value contains no underscore.
    """
    # SPL id -> substance names. The SPL id is the second '_'-separated
    # piece of PRODUCTID; SUBSTANCENAME holds ';'-separated names.
    ndc_dict = {}
    with open(product_file, 'rb') as product_handle:
        for row in csv.DictReader(product_handle, delimiter='\t'):
            this_spl_id = row['PRODUCTID'].split('_')[1]
            substances = [
                s.lstrip() for s in row['SUBSTANCENAME'].split(';')
            ]
            if this_spl_id in ndc_dict:
                # The product file repeats SPL ids: merge the new names
                # into the ones already seen, dropping duplicates.
                substances = list(set(substances + ndc_dict[this_spl_id]))
            ndc_dict[this_spl_id] = substances

    # Grab all of the xml files.
    xmls = []
    for root, _, filenames in os.walk(class_index_dir):
        xmls.extend(
            os.path.join(root, filename)
            for filename in fnmatch.filter(filenames, '*.xml')
        )

    # Call the async worker over every xml file.
    rows = parallel_runner.parallel_extract(xmls, harmonization_extract_worker)

    # SPL id -> every extracted record matching one of its substances.
    # The first element of each extracted row is used as the record and
    # must expose a 'name' key.
    unii_pivot = {}
    for spl_id, substances in ndc_dict.iteritems():
        for substance in substances:
            for extracted in rows:
                unii_record = extracted[0]
                # ``substance`` is a string, so this is a substring test.
                # NOTE(review): confirm an exact-name match is not intended.
                if unii_record['name'] in substance:
                    unii_pivot.setdefault(spl_id, []).append(unii_record)

    # The output is opened only once every record is ready, so a failure
    # above does not leave a truncated output file behind.
    with open(out_file, 'w') as out:
        for spl_id, matches in unii_pivot.iteritems():
            # Only the first matching record is emitted for each SPL id.
            output_dict = {'spl_id': spl_id, 'unii_indexing': matches[0]}
            out.write(json.dumps(output_dict) + '\n')
Esempio n. 2
0
def harmonize_unii(out_file, product_file, class_index_dir):
  """Write one JSON line per SPL id that matches an extracted UNII record.

  The tab-delimited ``product_file`` is read to collect the substance names
  listed for each SPL id. Every ``*.xml`` file found under
  ``class_index_dir`` is handed to ``parallel_runner.parallel_extract``
  together with ``harmonization_extract_worker``, and each SPL id whose
  substances match an extracted record is written to ``out_file`` as
  ``{"spl_id": ..., "unii_indexing": ...}``.

  Both files are handled through ``with`` blocks so their handles are closed
  (and the output flushed) even if an exception is raised.

  Args:
    out_file: Path of the JSON-lines file to write.
    product_file: Path of the tab-delimited product file; the ``PRODUCTID``
      and ``SUBSTANCENAME`` columns are read.
    class_index_dir: Directory searched recursively for ``*.xml`` files.

  Raises:
    IndexError: If a ``PRODUCTID`` value contains no underscore.
  """
  # SPL id -> substance names. The SPL id is the second '_'-separated piece
  # of PRODUCTID; SUBSTANCENAME holds ';'-separated names.
  ndc_dict = {}
  with open(product_file, 'rb') as product_handle:
    for row in csv.DictReader(product_handle, delimiter='\t'):
      this_spl_id = row['PRODUCTID'].split('_')[1]
      substances = [s.lstrip() for s in row['SUBSTANCENAME'].split(';')]
      if this_spl_id in ndc_dict:
        # The product file repeats SPL ids: merge the new names into the
        # ones already seen, dropping duplicates.
        substances = list(set(substances + ndc_dict[this_spl_id]))
      ndc_dict[this_spl_id] = substances

  # Grab all of the xml files.
  xmls = []
  for root, _, filenames in os.walk(class_index_dir):
    xmls.extend(
        os.path.join(root, filename)
        for filename in fnmatch.filter(filenames, '*.xml'))

  # Call the async worker over every xml file.
  rows = parallel_runner.parallel_extract(xmls, harmonization_extract_worker)

  # SPL id -> every extracted record matching one of its substances. The
  # first element of each extracted row is used as the record and must
  # expose a 'name' key.
  unii_pivot = {}
  for spl_id, substances in ndc_dict.iteritems():
    for substance in substances:
      for extracted in rows:
        unii_record = extracted[0]
        # ``substance`` is a string, so this is a substring test.
        # NOTE(review): confirm an exact-name match is not intended.
        if unii_record['name'] in substance:
          unii_pivot.setdefault(spl_id, []).append(unii_record)

  # The output is opened only once every record is ready, so a failure above
  # does not leave a truncated output file behind.
  with open(out_file, 'w') as out:
    for spl_id, matches in unii_pivot.iteritems():
      # Only the first matching record is emitted for each SPL id.
      output_dict = {'spl_id': spl_id, 'unii_indexing': matches[0]}
      out.write(json.dumps(output_dict) + '\n')
Esempio n. 3
0
def harmonize_spl(spl_dir, json_out):
  """Extract every SPL xml file under ``spl_dir`` into a JSON-lines file.

  Each ``*.xml`` file found while walking ``spl_dir`` is handed to
  ``parallel_runner.parallel_extract`` together with
  ``harmonization_extract_worker``; every item it returns is serialized
  with ``json.dumps`` and written to ``json_out`` on its own line.

  The output is written inside a ``with`` block so the handle is closed
  (and the data flushed) even if serialization raises.

  Args:
    spl_dir: Directory searched recursively for ``*.xml`` files.
    json_out: Path of the JSON-lines file to write.
  """
  # Collect the path of every xml file below spl_dir.
  xmls = []
  for root, _, filenames in os.walk(spl_dir):
    xmls.extend(
        os.path.join(root, filename)
        for filename in fnmatch.filter(filenames, '*.xml'))

  rows = parallel_runner.parallel_extract(xmls, harmonization_extract_worker)

  # The output is opened only after extraction has been started, so a failure
  # while collecting or dispatching the files does not leave an empty output
  # file behind.
  with open(json_out, 'w') as json_output_file:
    for output in rows:
      json_output_file.write(json.dumps(output) + '\n')
Esempio n. 4
0
def harmonize_unii(out_file, product_file, class_index_dir):
  """Write one JSON line per SPL id that matches an extracted UNII record.

  The tab-delimited ``product_file`` is read to collect the substance names
  listed for each SPL id. Every ``*.xml`` file found under
  ``class_index_dir`` is handed to ``parallel_runner.parallel_extract``
  together with ``harmonization_extract_worker``, and each SPL id whose
  substances match an extracted record is written to ``out_file`` as
  ``{"spl_id": ..., "unii_indexing": ...}``.

  Both files are handled through ``with`` blocks so their handles are closed
  (and the output flushed) even if an exception is raised.

  Args:
    out_file: Path of the JSON-lines file to write.
    product_file: Path of the tab-delimited product file; the ``PRODUCTID``
      and ``SUBSTANCENAME`` columns are read.
    class_index_dir: Directory searched recursively for ``*.xml`` files.

  Raises:
    IndexError: If a ``PRODUCTID`` value contains no underscore.
  """
  # SPL id -> substance names. The SPL id is the second '_'-separated piece
  # of PRODUCTID; SUBSTANCENAME holds ';'-separated names.
  ndc_dict = {}
  with open(product_file, 'rb') as product_handle:
    for row in csv.DictReader(product_handle, delimiter='\t'):
      this_spl_id = row['PRODUCTID'].split('_')[1]
      substances = [s.lstrip() for s in row['SUBSTANCENAME'].split(';')]
      if this_spl_id in ndc_dict:
        # The product file repeats SPL ids: merge the new names into the
        # ones already seen, dropping duplicates.
        substances = list(set(substances + ndc_dict[this_spl_id]))
      ndc_dict[this_spl_id] = substances

  xmls = []
  for root, _, filenames in os.walk(class_index_dir):
    xmls.extend(
        os.path.join(root, filename)
        for filename in fnmatch.filter(filenames, '*.xml'))

  rows = parallel_runner.parallel_extract(xmls, harmonization_extract_worker)

  # Handles the many-to-many relationship of ingredients to products:
  # SPL id -> every extracted record matching one of its substances. The
  # first element of each extracted row is used as the record and must
  # expose a 'name' key.
  unii_pivot = {}
  for spl_id, substances in ndc_dict.iteritems():
    for substance in substances:
      for extracted in rows:
        unii_record = extracted[0]
        # ``substance`` is a string, so this is a substring test.
        # NOTE(review): confirm an exact-name match is not intended.
        if unii_record['name'] in substance:
          unii_pivot.setdefault(spl_id, []).append(unii_record)

  # The output is opened only once every record is ready, so a failure above
  # does not leave a truncated output file behind.
  with open(out_file, 'w') as out:
    for spl_id, matches in unii_pivot.iteritems():
      # Only the first matching record is emitted for each SPL id.
      output_dict = {'spl_id': spl_id, 'unii_indexing': matches[0]}
      out.write(json.dumps(output_dict) + '\n')