def filter_samples_by_attributes(root_folder_name, input_file, output_file, filter_specs, atts_and_variations,
                                 log_frequency=100000):
    """
    Utility to filter NCBI biosamples by attribute names and/or attribute values
    :param root_folder_name: name of the project's root folder
    :param input_file: path to the input file with samples in BioSample's XML format
    :param output_file: path to the output file with the filtered samples
    :param filter_specs: specification of the attribute names and/or values used to filter the samples
    :param atts_and_variations: attribute names and their known variations
    :param log_frequency: number of processed samples between progress messages
    :return: It saves the filtered samples to the output_file
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            execute = False
    if execute:
        relevant_atts_and_variations = filter_atts_and_variations(filter_specs, atts_and_variations)
        filter_samples(input_file, output_file, True, True, relevant_atts_and_variations, log_frequency)
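# Usage sketch (hypothetical): the structure of filter_specs and atts_and_variations is defined by
# filter_utils elsewhere in this repo; the shapes and paths below are assumptions that only illustrate
# the call pattern, not the repo's actual schema.
if __name__ == '__main__':
    example_filter_specs = [{'att_name': 'disease', 'att_values': []}]  # assumed shape
    example_atts_and_variations = {}  # placeholder; normally computed from the source samples
    filter_samples_by_attributes(
        root_folder_name='metadata-provider-annotator',
        input_file='workspace/samples/source/biosample_set.xml.gz',       # assumed path
        output_file='workspace/samples/filtered/biosample_filtered.xml',  # assumed path
        filter_specs=example_filter_specs,
        atts_and_variations=example_atts_and_variations)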
def export_samples_to_csv(root_folder_name, input_file, output_file, filter_specs, atts_and_variations,
                          log_frequency=1000):
    """
    Generates a simplified version of the samples in CSV format and saves them to a file
    :param root_folder_name: name of the project's root folder
    :param input_file: path to the input file with samples in BioSample's XML format
    :param output_file: path to the output CSV file
    :param filter_specs: specification of the attributes to be exported
    :param atts_and_variations: attribute names and their known variations
    :param log_frequency: number of processed samples between progress messages
    :return: It saves the exported samples to the output_file
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            execute = False
    if execute:
        # Attribute names and variations of the attributes to be exported. We need this to aggregate the
        # different attribute variations so that their values are shown under the same column header
        relevant_atts_and_variations = filter_utils.filter_atts_and_variations(filter_specs, atts_and_variations)
        # Read and export samples
        exported_samples = []
        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))
        print('Input file: ' + input_file)
        print('Output file: ' + output_file)
        print('Attributes to be exported: ' + str(filter_specs))
        print('Processing NCBI samples...')
        # Read biosamples from XML file
        content = utils.read_xml_or_gz_file(input_file)
        processed_samples_count = 0
        for event, node in content:
            if event == 'START_ELEMENT' and node.tagName == 'BioSample':
                content.expandNode(node)
                node_xml = node.toxml()
                processed_samples_count = processed_samples_count + 1
                if processed_samples_count % log_frequency == 0:
                    print('Processed samples: ' + str(processed_samples_count))
                exported_samples.append(sample_to_json(node_xml, relevant_atts_and_variations))
        utils.save_json_to_csv(exported_samples, output_file)
        print('Finished processing NCBI samples')
        print('- Total samples processed: ' + str(processed_samples_count))
        print('- Total samples exported: ' + str(len(exported_samples)))
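# sample_to_json is defined elsewhere in the repo. The sketch below only illustrates the aggregation idea
# described in the comment above: parse one BioSample XML fragment and map every spelling variation of an
# attribute (e.g. 'sex', 'Sex', 'gender') onto a single canonical column name so the CSV gets one column per
# concept. The assumed structure of relevant_atts_and_variations (canonical name -> set of variations) is an
# illustration, not the repo's actual schema.
def sample_to_json_sketch(node_xml, relevant_atts_and_variations):
    import xml.etree.ElementTree as ET
    sample = {}
    attributes_node = ET.fromstring(node_xml).find('Attributes')
    if attributes_node is None:
        return sample
    for att in attributes_node:
        att_name = att.get('display_name') or att.get('attribute_name')
        if att_name is None:
            continue
        for canonical_name, variations in relevant_atts_and_variations.items():
            if att_name.lower() in variations:
                sample[canonical_name] = att.text
    return sample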
def main():
    constants.BASE_FOLDER = utils.get_base_folder(constants.ROOT_FOLDER_NAME)
    execute = True
    if os.path.exists(OUTPUT_FILE):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            execute = False
    if execute:
        filter_utils.filter_samples(INPUT_FILE, OUTPUT_FILE, True, False)
def export_samples_to_json(root_folder_name, input_file, output_file, log_frequency=1000):
    """
    Generates a direct translation of the samples from BioSample's XML to JSON and saves them to a file
    :param root_folder_name: name of the project's root folder
    :param input_file: path to the input file with samples in BioSample's XML format
    :param output_file: path to the output JSON file
    :param log_frequency: number of processed samples between progress messages
    :return: It saves the samples to the output_file
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            execute = False
    if execute:
        # Array of sample dictionaries
        samples_dct = []
        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))
        print('Input file: ' + input_file)
        print('Output file: ' + output_file)
        print('Processing NCBI samples...')
        # Read biosamples from XML file
        content = utils.read_xml_or_gz_file(input_file)
        processed_samples_count = 0
        for event, node in content:
            if event == 'START_ELEMENT' and node.tagName == 'BioSample':
                content.expandNode(node)
                node_xml = node.toxml()
                sample_dct = xmltodict.parse(node_xml)
                samples_dct.append(sample_dct)
                processed_samples_count = processed_samples_count + 1
                if processed_samples_count % log_frequency == 0:
                    print('Processed samples: ' + str(processed_samples_count))
        with open(output_file, 'w') as f:
            json.dump(samples_dct, f)
        print('Finished processing NCBI samples')
        print('- Total samples processed: ' + str(processed_samples_count))
        print('- Total samples exported: ' + str(len(samples_dct)))
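# utils.read_xml_or_gz_file is defined elsewhere in the repo. The (event, node) pairs and the expandNode()
# call used above match Python's xml.dom.pulldom streaming API, so a helper with that behavior could look
# roughly like this (a sketch under that assumption, not the repo's actual implementation):
import gzip
from xml.dom import pulldom

def read_xml_or_gz_file_sketch(file_path):
    # Return a pulldom event stream over the XML file, transparently handling gzip compression
    if file_path.endswith('.gz'):
        return pulldom.parse(gzip.open(file_path, 'rb'))
    return pulldom.parse(file_path)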
#!/usr/bin/python3
import scripts.util.utils as utils
import os

ROOT_FOLDER_NAME = 'metadata-provider-annotator'
BASE_FOLDER = utils.get_base_folder(ROOT_FOLDER_NAME)

# Resources
RESOURCES_FOLDER = BASE_FOLDER + '/' + 'resources'

# Workspace
WORKSPACE_FOLDER = BASE_FOLDER + '/' + 'workspace'
SAMPLES_FOLDER = 'samples'
PROJECTS_FOLDER = 'projects'
SOURCE_SAMPLES_FOLDER = 'source'
FILTERED_SAMPLES_FOLDER = 'filtered'
SAMPLES_ANALYSIS_FOLDER = 'analysis'
EXPORT_FOLDER = 'exported'
ANNOTATED_SAMPLES_FOLDER = 'annotated'

# Results folder
RESULTS_FOLDER = BASE_FOLDER + '/' + 'results'

# BioPortal Annotator
BIOPORTAL_APIKEY = os.environ['NCATS_TRANSLATOR_BIOPORTAL_API_KEY']  # You need to define it in your local environment

# Data download
NCBI_DOWNLOAD_URL = 'https://ftp.ncbi.nih.gov/biosample/biosample_set.xml.gz'
NCBI_SAMPLES_FOLDER_DEST = WORKSPACE_FOLDER + '/' + SAMPLES_FOLDER + '/' + SOURCE_SAMPLES_FOLDER
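# Illustration only: how the folder constants above might be composed into concrete paths by downstream
# scripts (the *_EXAMPLE names are hypothetical and not part of the repo's constants):
FILTERED_SAMPLES_PATH_EXAMPLE = WORKSPACE_FOLDER + '/' + SAMPLES_FOLDER + '/' + FILTERED_SAMPLES_FOLDER
NCBI_SOURCE_FILE_EXAMPLE = NCBI_SAMPLES_FOLDER_DEST + '/' + os.path.basename(NCBI_DOWNLOAD_URL)  # .../biosample_set.xml.gz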
def transform_and_export_samples_to_json(root_folder_name, input_file, output_file, insert_bioproject_info,
                                         projects_file, log_frequency=1000):
    """
    Parses an XML file with multiple NCBI biosamples and exports them to JSON. Optionally, inserts additional
    BioProject info.
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)
    execute = True
    if os.path.exists(output_file):
        if not utils.confirm('The destination file already exists. Do you want to overwrite it [y/n]? '):
            execute = False
    if execute:
        biosamples = []
        if not os.path.exists(os.path.dirname(output_file)):
            os.makedirs(os.path.dirname(output_file))
        print('Input file: ' + input_file)
        print('Output file: ' + output_file)
        if insert_bioproject_info:
            print('Bioprojects input file: ' + projects_file)
        print('Processing NCBI samples...')
        processed_samples_count = 0
        # Read biosamples from XML file
        tree = ET.parse(input_file)
        root = tree.getroot()
        num_biosamples = len(list(root))
        projects = {}
        if insert_bioproject_info:
            # Read bioprojects from JSON file
            with open(projects_file) as f:
                projects = json.load(f)
        print('Extracting all samples from file (no. samples: ' + str(num_biosamples) + ')')
        for child in root:
            biosample = NcbiBiosample()
            description_node = child.find('Description')
            attributes_node = child.find('Attributes')
            # sample identifiers
            sample_ids = child.find('Ids')
            for sample_id in sample_ids:
                value = sample_id.text
                if sample_id.get('db') == 'BioSample':
                    biosample.biosampleAccession = value
            # sample name
            for sample_id in sample_ids:
                if sample_id.get('db_label') == 'Sample name':
                    value = sample_id.text
                    biosample.sampleName = value
            # sample title
            if description_node is not None and description_node.find('Title') is not None:
                value = description_node.find('Title').text
                biosample.sampleTitle = value
            # bioproject accession
            links = child.find('Links')
            if links is not None:
                for link in links:
                    if link.get('target') == 'bioproject':
                        prj_accession = link.get('label')
                        if prj_accession in projects:
                            biosample.bioprojectAccession = prj_accession
                            biosample.bioproject = copy.deepcopy(projects.get(prj_accession))
            # organism
            if description_node is not None:
                organism_node = description_node.find('Organism')
                if organism_node is not None and organism_node.get('taxonomy_name') is not None:
                    value = organism_node.get('taxonomy_name')
                    biosample.organism = value
            # attributes
            biosample_attributes = []
            for att in attributes_node:
                biosample_attribute = NcbiBiosampleAttribute()
                if att.get('display_name') is not None:
                    att_name = att.get('display_name')
                else:
                    att_name = att.get('attribute_name')
                biosample_attribute.attributeName = att_name
                biosample_attribute.attributeValue = att.text
                biosample_attributes.append(biosample_attribute)
            biosample.attributes = biosample_attributes
            biosamples.append(biosample)
            processed_samples_count = processed_samples_count + 1
            if processed_samples_count % log_frequency == 0:
                print('Processed samples: ' + str(processed_samples_count))
        with open(output_file, 'w') as f:
            json.dump(biosamples, f, default=obj_dict)
        print('Finished processing NCBI samples')
        print('- Total samples processed: ' + str(processed_samples_count))
        print('- Total samples exported: ' + str(len(biosamples)))
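# NcbiBiosample, NcbiBiosampleAttribute and obj_dict are defined elsewhere in the repo. A minimal sketch of
# what json.dump(..., default=obj_dict) relies on, with field names taken from how they are used above
# (everything else is an assumption):
class NcbiBiosampleAttribute:
    def __init__(self):
        self.attributeName = None
        self.attributeValue = None

class NcbiBiosample:
    def __init__(self):
        self.biosampleAccession = None
        self.sampleName = None
        self.sampleTitle = None
        self.bioprojectAccession = None
        self.bioproject = None
        self.organism = None
        self.attributes = []

def obj_dict(obj):
    # json.dump calls this for objects it cannot serialize natively and serializes the returned dict instead
    return obj.__dict__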