コード例 #1
0
def filter_samples_by_attributes(root_folder_name,
                                 input_file,
                                 output_file,
                                 filter_specs,
                                 atts_and_variations,
                                 log_frequency=100000):
    """
    Filter NCBI biosamples by attribute names and/or attribute values.

    If output_file already exists, the user is asked for confirmation and nothing is done when the answer
    is negative.

    :param root_folder_name: name passed to utils.get_base_folder to set constants.BASE_FOLDER
    :param input_file: input file, forwarded to filter_samples
    :param output_file: destination file, forwarded to filter_samples
    :param filter_specs: filter specification, forwarded to filter_atts_and_variations
    :param atts_and_variations: attribute names and variations, forwarded to filter_atts_and_variations
    :param log_frequency: forwarded to filter_samples (presumably the progress logging interval)
    :return: None
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)

    overwrite_prompt = 'The destination file already exist. Do you want to overwrite it [y/n]? '
    # Only ask for confirmation when there is something to overwrite
    if os.path.exists(output_file) and not utils.confirm(overwrite_prompt):
        return

    selected_atts = filter_atts_and_variations(filter_specs,
                                               atts_and_variations)
    # NOTE(review): the meaning of the two positional True flags is defined by filter_samples
    filter_samples(input_file, output_file, True, True, selected_atts,
                   log_frequency)
コード例 #2
0
def export_samples_to_csv(root_folder_name,
                          input_file,
                          output_file,
                          filter_specs,
                          atts_and_variations,
                          log_frequency=1000):
    """
    Generates a simplified version of the samples in CSV and saves them to a file

    If output_file already exists, the user is asked for confirmation and nothing is done when the answer
    is negative.

    :param root_folder_name: name passed to utils.get_base_folder to set constants.BASE_FOLDER
    :param input_file: path of the file with the samples in BioSample's XML format
    :param output_file: path of the CSV file to be generated
    :param filter_specs: specification of the attributes to be exported
    :param atts_and_variations: attribute names and their variations
    :param log_frequency: a progress message is printed every log_frequency processed samples
    :return: None. It saves the samples to the output_file
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)

    if os.path.exists(output_file) and not utils.confirm(
            'The destination file already exist. Do you want to overwrite it [y/n]? '
    ):
        return

    # attribute names and variations of the attributes to be exported. We need to do this to be able to aggregate
    # different attribute variations so that the attribute values will be shown under the same column header
    relevant_atts_and_variations = filter_utils.filter_atts_and_variations(
        filter_specs, atts_and_variations)

    # os.path.dirname returns '' for a bare file name and os.makedirs('') raises, so only create the
    # destination folder when the path actually has one
    output_folder = os.path.dirname(output_file)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    print('Input file: ' + input_file)
    print('Output file: ' + output_file)
    print('Attributes to be exported: ' + str(filter_specs))
    print('Processing NCBI samples...')

    # Read biosamples from XML file
    content = utils.read_xml_or_gz_file(input_file)

    exported_samples = []
    processed_samples_count = 0

    for event, node in content:
        if event == 'START_ELEMENT' and node.tagName == 'BioSample':
            # Load the whole BioSample subtree so that it can be serialized
            content.expandNode(node)
            node_xml = node.toxml()
            processed_samples_count += 1

            if processed_samples_count % log_frequency == 0:
                print('Processed samples: ' + str(processed_samples_count))

            exported_samples.append(
                sample_to_json(node_xml, relevant_atts_and_variations))

    utils.save_json_to_csv(exported_samples, output_file)

    print('Finished processing NCBI samples')
    print('- Total samples processed: ' + str(processed_samples_count))
    print('- Total samples exported: ' + str(len(exported_samples)))
コード例 #3
0
def main():
    """
    Entry point: filters the samples in INPUT_FILE into OUTPUT_FILE.

    If OUTPUT_FILE already exists, the user is asked for confirmation and nothing is done when the answer
    is negative.
    """
    constants.BASE_FOLDER = utils.get_base_folder(constants.ROOT_FOLDER_NAME)

    overwrite_prompt = 'The destination file already exist. Do you want to overwrite it [y/n]? '
    # Only ask for confirmation when there is something to overwrite
    if os.path.exists(OUTPUT_FILE) and not utils.confirm(overwrite_prompt):
        return

    # NOTE(review): the meaning of the positional True/False flags is defined by filter_utils.filter_samples
    filter_utils.filter_samples(INPUT_FILE, OUTPUT_FILE, True, False)
コード例 #4
0
def export_samples_to_json(root_folder_name,
                           input_file,
                           output_file,
                           log_frequency=1000):
    """
    Generates a direct translation of the samples from the BioSample's XML to JSON and saves them to a file

    If output_file already exists, the user is asked for confirmation and nothing is done when the answer
    is negative.

    :param root_folder_name: name passed to utils.get_base_folder to set constants.BASE_FOLDER
    :param input_file: path of the file with the samples in BioSample's XML format
    :param output_file: path of the JSON file to be generated
    :param log_frequency: a progress message is printed every log_frequency processed samples
    :return: None. It saves the samples to the output_file
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)

    if os.path.exists(output_file) and not utils.confirm(
            'The destination file already exist. Do you want to overwrite it [y/n]? '
    ):
        return

    # os.path.dirname returns '' for a bare file name and os.makedirs('') raises, so only create the
    # destination folder when the path actually has one
    output_folder = os.path.dirname(output_file)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    print('Input file: ' + input_file)
    print('Output file: ' + output_file)
    print('Processing NCBI samples...')

    # Read biosamples from XML file
    content = utils.read_xml_or_gz_file(input_file)

    # Array of sample dictionaries
    samples_dct = []
    processed_samples_count = 0

    for event, node in content:
        if event == 'START_ELEMENT' and node.tagName == 'BioSample':
            # Load the whole BioSample subtree so that it can be serialized
            content.expandNode(node)
            samples_dct.append(xmltodict.parse(node.toxml()))

            processed_samples_count += 1
            if processed_samples_count % log_frequency == 0:
                print('Processed samples: ' + str(processed_samples_count))

    with open(output_file, 'w') as f:
        json.dump(samples_dct, f)

    print('Finished processing NCBI samples')
    print('- Total samples processed: ' + str(processed_samples_count))
    print('- Total samples exported: ' + str(len(samples_dct)))
コード例 #5
0
#!/usr/bin/python3

import scripts.util.utils as utils
import os

# Name of the project's root folder; the base folder is resolved from it
ROOT_FOLDER_NAME = 'metadata-provider-annotator'
BASE_FOLDER = utils.get_base_folder(ROOT_FOLDER_NAME)

# Resources
RESOURCES_FOLDER = '/'.join((BASE_FOLDER, 'resources'))

# Workspace (the sub-folder constants below are plain folder names, not full paths)
WORKSPACE_FOLDER = '/'.join((BASE_FOLDER, 'workspace'))
SAMPLES_FOLDER = 'samples'
PROJECTS_FOLDER = 'projects'
SOURCE_SAMPLES_FOLDER = 'source'
FILTERED_SAMPLES_FOLDER = 'filtered'
SAMPLES_ANALYSIS_FOLDER = 'analysis'
EXPORT_FOLDER = 'exported'
ANNOTATED_SAMPLES_FOLDER = 'annotated'

# Results folder
RESULTS_FOLDER = '/'.join((BASE_FOLDER, 'results'))

# BioPortal Annotator
# The API key must be defined in the local environment; a KeyError is raised at import time otherwise
BIOPORTAL_APIKEY = os.environ['NCATS_TRANSLATOR_BIOPORTAL_API_KEY']

# Data download
NCBI_DOWNLOAD_URL = 'https://ftp.ncbi.nih.gov/biosample/biosample_set.xml.gz'
NCBI_SAMPLES_FOLDER_DEST = '/'.join(
    (WORKSPACE_FOLDER, SAMPLES_FOLDER, SOURCE_SAMPLES_FOLDER))
コード例 #6
0
def transform_and_export_samples_to_json(root_folder_name,
                                         input_file,
                                         output_file,
                                         insert_bioproject_info,
                                         projects_file,
                                         log_frequency=1000):
    """
    Parses an XML file with multiple NCBI biosamples and exports them to JSON. Optionally, inserts additional
    BioProject info.

    Every child of the XML root is converted to an NcbiBiosample (accession, sample name, title, organism,
    attributes and, optionally, BioProject data) and the resulting list is dumped to output_file.
    If output_file already exists, the user is asked for confirmation and nothing is done when the answer
    is negative.

    :param root_folder_name: name passed to utils.get_base_folder to set constants.BASE_FOLDER
    :param input_file: path of the XML file with the biosamples (parsed with ET.parse)
    :param output_file: path of the JSON file to be generated
    :param insert_bioproject_info: if true, BioProject info read from projects_file is attached to the samples
        whose bioproject link matches a known project accession
    :param projects_file: path of a JSON file mapping project accessions to project info; only read when
        insert_bioproject_info is true
    :param log_frequency: currently unused; kept for interface compatibility
    :return: None. It saves the samples to the output_file
    """
    constants.BASE_FOLDER = utils.get_base_folder(root_folder_name)

    if os.path.exists(output_file) and not utils.confirm(
            'The destination file already exist. Do you want to overwrite it [y/n]? '
    ):
        return

    # os.path.dirname returns '' for a bare file name and os.makedirs('') raises, so only create the
    # destination folder when the path actually has one
    output_folder = os.path.dirname(output_file)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    print('Input file: ' + input_file)
    print('Output file: ' + output_file)
    if insert_bioproject_info:
        print('Bioprojects input file: ' + projects_file)
    print('Processing NCBI samples...')

    # Read biosamples from XML file
    root = ET.parse(input_file).getroot()
    num_biosamples = len(root)

    # Stays empty when no BioProject info is requested, so that the accession lookup below simply
    # finds nothing instead of failing on an undefined name
    projects = {}
    if insert_bioproject_info:
        # Read bioprojects from JSON file
        with open(projects_file) as f:
            projects = json.load(f)

    print('Extracting all samples from file (no. samples: ' +
          str(num_biosamples) + ')')

    biosamples = []
    for child in root:
        biosample = NcbiBiosample()

        description_node = child.find('Description')

        # sample identifiers and sample name
        # (nodes are compared against None explicitly: an element without children is falsy)
        sample_ids = child.find('Ids')
        if sample_ids is not None:
            for sample_id in sample_ids:
                if sample_id.get('db') == 'BioSample':
                    biosample.biosampleAccession = sample_id.text
                if sample_id.get('db_label') == 'Sample name':
                    biosample.sampleName = sample_id.text

        if description_node is not None:
            # sample title
            title_node = description_node.find('Title')
            if title_node is not None:
                biosample.sampleTitle = title_node.text

            # organism
            organism_node = description_node.find('Organism')
            if organism_node is not None:
                taxonomy_name = organism_node.get('taxonomy_name')
                if taxonomy_name is not None:
                    biosample.organism = taxonomy_name

        # bioproject accession
        links = child.find('Links')
        if links is not None:
            for link in links:
                if link.get('target') == 'bioproject':
                    prj_accession = link.get('label')
                    if prj_accession in projects:
                        biosample.bioprojectAccession = prj_accession
                        # Copy so that samples of the same project do not share one mutable object
                        biosample.bioproject = copy.deepcopy(
                            projects[prj_accession])

        # attributes (a sample without an Attributes node gets an empty list)
        biosample_attributes = []
        attributes_node = child.find('Attributes')
        if attributes_node is not None:
            for att in attributes_node:
                biosample_attribute = NcbiBiosampleAttribute()

                # Prefer the display name; fall back to the raw attribute name
                display_name = att.get('display_name')
                if display_name is not None:
                    att_name = display_name
                else:
                    att_name = att.get('attribute_name')

                biosample_attribute.attributeName = att_name
                biosample_attribute.attributeValue = att.text
                biosample_attributes.append(biosample_attribute)

        biosample.attributes = biosample_attributes
        biosamples.append(biosample)

    # Every child of the root yields exactly one exported sample
    processed_samples_count = len(biosamples)

    with open(output_file, 'w') as f:
        json.dump(biosamples, f, default=obj_dict)

    print('Finished processing NCBI samples')
    print('- Total samples processed: ' + str(processed_samples_count))
    print('- Total samples exported: ' + str(len(biosamples)))