def harmonize_unii(out_file, product_file, class_index_dir):
    """Write one JSON line per SPL id, pairing it with a matched UNII record.

    Steps:
      1. Read ``product_file`` (tab-delimited, with ``PRODUCTID`` and
         ``SUBSTANCENAME`` columns) and build a mapping of SPL id to the
         de-duplicated substance names listed for it.
      2. Collect every ``*.xml`` file under ``class_index_dir`` and run
         ``harmonization_extract_worker`` over them through
         ``parallel_runner.parallel_extract``.
      3. For every SPL id, gather the extracted records whose ``name`` occurs
         in one of its substance names, and write
         ``{"spl_id": ..., "unii_indexing": ...}`` for the first such record.

    Args:
      out_file: Path of the JSON-lines file to create (overwritten).
      product_file: Path of the tab-delimited product file.
      class_index_dir: Directory searched recursively for ``*.xml`` files.

    Raises:
      KeyError: If a product row lacks ``PRODUCTID`` or ``SUBSTANCENAME``.
      IndexError: If a ``PRODUCTID`` value contains no underscore.
    """
    # SPL id -> list of unique substance names. The product file repeats
    # ids, so names from later rows are merged into the existing entry.
    ndc_dict = {}
    # 'rb' is the mode the Python 2 csv module expects; the handle is now
    # closed as soon as the file has been consumed.
    with open(product_file, 'rb') as product_handle:
        meta_file = csv.DictReader(product_handle, delimiter='\t')
        for row in meta_file:
            # The SPL id is the second underscore-separated PRODUCTID field.
            this_spl_id = row['PRODUCTID'].split('_')[1]
            substances = [
                s.lstrip() for s in row['SUBSTANCENAME'].split(';')
            ]
            if this_spl_id in ndc_dict:
                # De-duplicate through a set; resulting order is arbitrary.
                merged = set(substances + ndc_dict[this_spl_id])
                ndc_dict[this_spl_id] = list(merged)
            else:
                ndc_dict[this_spl_id] = substances

    # Grab all of the xml files.
    xmls = []
    for root, _, filenames in os.walk(class_index_dir):
        xmls.extend(
            os.path.join(root, filename)
            for filename in fnmatch.filter(filenames, '*.xml')
        )

    # NOTE(review): each element of rows is indexed with [0] and that item is
    # read as a mapping with a 'name' key -- confirm against the worker.
    rows = parallel_runner.parallel_extract(xmls, harmonization_extract_worker)

    # Handles the many-to-many relationship of ingredients to products.
    unii_pivot = {}
    for key, value in ndc_dict.iteritems():
        for substance in value:
            for unii_extract in rows:
                unii_record = unii_extract[0]
                # `in` on two strings is a substring test, so a record also
                # matches when its name is only part of the substance name.
                if unii_record['name'] in substance:
                    unii_pivot.setdefault(key, []).append(unii_record)

    # The output is opened only once the results are ready, so a failure
    # above no longer leaves a truncated file behind, and the handle is
    # always flushed and closed.
    with open(out_file, 'w') as out:
        for key, value in unii_pivot.iteritems():
            # Only the first matching record is emitted for each SPL id.
            output_dict = {'spl_id': key, 'unii_indexing': value[0]}
            out.write(json.dumps(output_dict) + '\n')
def harmonize_unii(out_file, product_file, class_index_dir):
    """Write one JSON line per SPL id, pairing it with a matched UNII record.

    Steps:
      1. Read ``product_file`` (tab-delimited, with ``PRODUCTID`` and
         ``SUBSTANCENAME`` columns) and build a mapping of SPL id to the
         de-duplicated substance names listed for it.
      2. Collect every ``*.xml`` file under ``class_index_dir`` and run
         ``harmonization_extract_worker`` over them through
         ``parallel_runner.parallel_extract``.
      3. For every SPL id, gather the extracted records whose ``name`` occurs
         in one of its substance names, and write
         ``{"spl_id": ..., "unii_indexing": ...}`` for the first such record.

    Args:
      out_file: Path of the JSON-lines file to create (overwritten).
      product_file: Path of the tab-delimited product file.
      class_index_dir: Directory searched recursively for ``*.xml`` files.

    Raises:
      KeyError: If a product row lacks ``PRODUCTID`` or ``SUBSTANCENAME``.
      IndexError: If a ``PRODUCTID`` value contains no underscore.
    """
    # SPL id -> list of unique substance names. The product file repeats
    # ids, so names from later rows are merged into the existing entry.
    ndc_dict = {}
    # 'rb' is the mode the Python 2 csv module expects; the handle is now
    # closed as soon as the file has been consumed.
    with open(product_file, 'rb') as product_handle:
        meta_file = csv.DictReader(product_handle, delimiter='\t')
        for row in meta_file:
            # The SPL id is the second underscore-separated PRODUCTID field.
            this_spl_id = row['PRODUCTID'].split('_')[1]
            substances = [
                s.lstrip() for s in row['SUBSTANCENAME'].split(';')
            ]
            if this_spl_id in ndc_dict:
                # De-duplicate through a set; resulting order is arbitrary.
                merged = set(substances + ndc_dict[this_spl_id])
                ndc_dict[this_spl_id] = list(merged)
            else:
                ndc_dict[this_spl_id] = substances

    # Grab all of the xml files.
    xmls = []
    for root, _, filenames in os.walk(class_index_dir):
        xmls.extend(
            os.path.join(root, filename)
            for filename in fnmatch.filter(filenames, '*.xml')
        )

    # NOTE(review): each element of rows is indexed with [0] and that item is
    # read as a mapping with a 'name' key -- confirm against the worker.
    rows = parallel_runner.parallel_extract(xmls, harmonization_extract_worker)

    # Handles the many-to-many relationship of ingredients to products.
    unii_pivot = {}
    for key, value in ndc_dict.iteritems():
        for substance in value:
            for unii_extract in rows:
                unii_record = unii_extract[0]
                # `in` on two strings is a substring test, so a record also
                # matches when its name is only part of the substance name.
                if unii_record['name'] in substance:
                    unii_pivot.setdefault(key, []).append(unii_record)

    # The output is opened only once the results are ready, so a failure
    # above no longer leaves a truncated file behind, and the handle is
    # always flushed and closed.
    with open(out_file, 'w') as out:
        for key, value in unii_pivot.iteritems():
            # Only the first matching record is emitted for each SPL id.
            output_dict = {'spl_id': key, 'unii_indexing': value[0]}
            out.write(json.dumps(output_dict) + '\n')
def harmonize_spl(spl_dir, json_out):
    """Extract every SPL XML file under a directory into a JSON-lines file.

    Every ``*.xml`` file found recursively under ``spl_dir`` is passed to
    ``harmonization_extract_worker`` through
    ``parallel_runner.parallel_extract``; each result is serialised with
    ``json.dumps`` and written to ``json_out`` on its own line.

    Args:
      spl_dir: Directory searched recursively for ``*.xml`` files.
      json_out: Path of the JSON-lines file to create (overwritten).
    """
    # Grab all of the xml files.
    xmls = []
    for root, _, filenames in os.walk(spl_dir):
        xmls.extend(
            os.path.join(root, filename)
            for filename in fnmatch.filter(filenames, '*.xml')
        )

    rows = parallel_runner.parallel_extract(xmls, harmonization_extract_worker)

    # The output is opened only once extraction has returned, and through a
    # context manager, so the handle is always flushed and closed.
    with open(json_out, 'w') as json_output_file:
        for output in rows:
            json_output_file.write(json.dumps(output) + '\n')
def harmonize_unii(out_file, product_file, class_index_dir):
    """Write one JSON line per SPL id, pairing it with a matched UNII record.

    Steps:
      1. Read ``product_file`` (tab-delimited, with ``PRODUCTID`` and
         ``SUBSTANCENAME`` columns) and build a mapping of SPL id to the
         de-duplicated substance names listed for it.
      2. Collect every ``*.xml`` file under ``class_index_dir`` and run
         ``harmonization_extract_worker`` over them through
         ``parallel_runner.parallel_extract``.
      3. For every SPL id, gather the extracted records whose ``name`` occurs
         in one of its substance names, and write
         ``{"spl_id": ..., "unii_indexing": ...}`` for the first such record.

    Args:
      out_file: Path of the JSON-lines file to create (overwritten).
      product_file: Path of the tab-delimited product file.
      class_index_dir: Directory searched recursively for ``*.xml`` files.

    Raises:
      KeyError: If a product row lacks ``PRODUCTID`` or ``SUBSTANCENAME``.
      IndexError: If a ``PRODUCTID`` value contains no underscore.
    """
    # SPL id -> list of unique substance names. The product file repeats
    # ids, so names from later rows are merged into the existing entry.
    ndc_dict = {}
    # 'rb' is the mode the Python 2 csv module expects; the handle is now
    # closed as soon as the file has been consumed.
    with open(product_file, 'rb') as product_handle:
        meta_file = csv.DictReader(product_handle, delimiter='\t')
        for row in meta_file:
            # The SPL id is the second underscore-separated PRODUCTID field.
            this_spl_id = row['PRODUCTID'].split('_')[1]
            substances = [
                s.lstrip() for s in row['SUBSTANCENAME'].split(';')
            ]
            if this_spl_id in ndc_dict:
                # De-duplicate through a set; resulting order is arbitrary.
                merged = set(substances + ndc_dict[this_spl_id])
                ndc_dict[this_spl_id] = list(merged)
            else:
                ndc_dict[this_spl_id] = substances

    # Grab all of the xml files.
    xmls = []
    for root, _, filenames in os.walk(class_index_dir):
        xmls.extend(
            os.path.join(root, filename)
            for filename in fnmatch.filter(filenames, '*.xml')
        )

    # NOTE(review): each element of rows is indexed with [0] and that item is
    # read as a mapping with a 'name' key -- confirm against the worker.
    rows = parallel_runner.parallel_extract(xmls, harmonization_extract_worker)

    # Handles the many-to-many relationship of ingredients to products.
    unii_pivot = {}
    for key, value in ndc_dict.iteritems():
        for substance in value:
            for unii_extract in rows:
                unii_record = unii_extract[0]
                # `in` on two strings is a substring test, so a record also
                # matches when its name is only part of the substance name.
                if unii_record['name'] in substance:
                    unii_pivot.setdefault(key, []).append(unii_record)

    # The output is opened only once the results are ready, so a failure
    # above no longer leaves a truncated file behind, and the handle is
    # always flushed and closed.
    with open(out_file, 'w') as out:
        for key, value in unii_pivot.iteritems():
            # Only the first matching record is emitted for each SPL id.
            output_dict = {'spl_id': key, 'unii_indexing': value[0]}
            out.write(json.dumps(output_dict) + '\n')