Example #1
    def _process_sub_type(self, sub_type, query_tracking_list):
        """Process a sub type and queue its generated query files."""
        self.logger.info("Loading GOAnnot Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_file_to_download()
        filepath = os.path.join('tmp/', filepath)
        self.logger.info("goannot path: %s", filepath)
        file = open(filepath, "r")

        self.logger.info("Finished Loading GOAnnot Data: %s",
                         sub_type.get_data_provider())

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(
                file,
                ETLHelper.go_annot_prefix_lookup(sub_type.get_data_provider()),
                batch_size)

        query_template_list = [
            [
                self.main_query_template, commit_size,
                "go_annot_" + sub_type.get_data_provider() + ".csv"
            ],
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)

        for item in query_and_file_list:
            query_tracking_list.append(item)
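
The [template, commit_size, csv_filename] entries above pair a Cypher query template with the CSV file the generators will populate. A minimal sketch of the substitution step that is assumed to happen downstream (process_query_params and the transactor classes are not shown on this page, so the helper below is purely illustrative):

    def fill_query_template(template, commit_size, csv_filename):
        # The Cypher templates elsewhere on this page carry two %s placeholders:
        # "USING PERIODIC COMMIT %s ... FROM 'file:///%s'".
        return template % (commit_size, csv_filename)

    # e.g. fill_query_template(query, 10000, "go_annot_EXAMPLE.csv")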
Example #2
    def get_generators(self, expression_atlas_gene_pages, data_provider,
                       batch_size):
        """Get Generators."""
        return_set = Neo4jHelper.run_single_parameter_query(
            ExpressionAtlasETL.get_genes_with_expression_atlas_links_query,
            list(expression_atlas_gene_pages.keys()))

        counter = 0
        cross_reference_list = []
        for record in return_set:
            counter += 1
            cross_reference = ETLHelper.get_xref_dict(
                record["g.primaryKey"].split(":")[1], "ExpressionAtlas_gene",
                "gene/expression-atlas", "gene/expressionAtlas",
                record["g.modLocalId"],
                expression_atlas_gene_pages[record["g.primaryKey"].lower()],
                data_provider + ":" + record["g.modLocalId"] +
                "gene/expression-atlas")
            cross_reference["genePrimaryKey"] = record["g.primaryKey"]
            cross_reference_list.append(cross_reference)
            if counter > batch_size:
                yield [cross_reference_list]
                counter = 0
                cross_reference_list = []

        if counter > 0:
            yield [cross_reference_list]
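
The generator above accumulates cross references and flushes them once the batch size is exceeded, ending with a final partial batch. A self-contained sketch of that chunked-yield pattern (the names below are illustrative, not from this repository):

    def batched(records, batch_size):
        """Yield successive lists of records, each at most batch_size long."""
        batch = []
        for record in records:
            batch.append(record)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

    # list(batched(range(5), 2)) -> [[0, 1], [2, 3], [4]]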
Example #3
    def crossref_process(self, record, global_id, cross_reference_list):
        """Get xref."""
        valid_pages = [
            'allele', 'allele/references', 'transgene', 'construct',
            'transgene/references', 'construct/references'
        ]
        if 'crossReferences' not in record:
            return
        for crossRef in record['crossReferences']:
            crossRefId = crossRef.get('id')
            local_crossref_id = crossRefId.split(":")[1]
            prefix = crossRef.get('id').split(":")[0]
            pages = crossRef.get('pages')

            # the 'pages' collection is sometimes empty
            if pages is not None and len(pages) > 0:
                for page in pages:
                    if page in valid_pages:
                        mod_global_cross_ref_id = self.etlh.rdh2.return_url_from_key_value(
                            prefix, local_crossref_id, page)
                        xref = ETLHelper.get_xref_dict(
                            local_crossref_id, prefix, page, page, crossRefId,
                            mod_global_cross_ref_id, crossRefId + page)
                        xref['dataId'] = global_id
                        cross_reference_list.append(xref)
Example #4
    def data_providers_process(self, data):
        """Get data providers.

        Creates 4 attributes.
        data_provider: provider name/symbol
        data_providers: list of providers
        data_provider_pages: pages
        data_provider_cross_ref_set: list of xref dicts
        """
        data_provider_object = data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        self.data_provider = data_provider_cross_ref.get('id')
        self.data_provider_pages = data_provider_cross_ref.get('pages')

        self.data_providers = []
        self.data_provider_cross_ref_set = []

        if self.data_provider_pages is None:
            return
        for data_provider_page in self.data_provider_pages:
            cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                self.data_provider, self.data_provider, alt_page=data_provider_page)
            self.data_provider_cross_ref_set.append(
                ETLHelper.get_xref_dict(
                    self.data_provider,
                    self.data_provider,
                    data_provider_page,
                    data_provider_page,
                    self.data_provider,
                    cross_ref_complete_url,
                    self.data_provider + data_provider_page))

            self.data_providers.append(self.data_provider)
            self.logger.info("data provider: %s", self.data_provider)
Example #5
    def _load_and_process_data(self):
        """Load and process the data for each sub type."""

        for sub_type in self.data_type_config.get_sub_type_objects():

            species_encoded = urllib.parse.quote_plus(
                    ETLHelper.species_lookup_by_data_provider(sub_type.get_data_provider()))

            commit_size = self.data_type_config.get_neo4j_commit_size()
            #batch_size = self.data_type_config.get_generator_batch_size()
            batch_size = 100000

            generators = self.get_generators(sub_type, batch_size,
                                             species_encoded)

            query_template_list = [
                [
                    self.geo_xref_query_template, commit_size,
                    "geo_xref_data_" + sub_type.get_data_provider() + ".csv"
                ],
            ]

            query_and_file_list = self.process_query_params(
                query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)
Example #6
    def ortho_xrefs(self, o_xrefs, ident, xrefs):
        """Generate xrefs for orthos."""
        if o_xrefs is None:
            return

        if not isinstance(o_xrefs, list):
            self.logger.critical("o_xrefs is not a list but a '%s'", type(o_xrefs))
            # o_xrefs may be a single "prefix:id" string.
            if ":" in o_xrefs:
                local_id = o_xrefs.split(":")[1].strip()
                prefix = o_xrefs.split(":")[0].strip()
                complete_url = self.etlh.get_complete_url_ont(local_id, o_xrefs)
                generated_xref = ETLHelper.get_xref_dict(
                    local_id,
                    prefix,
                    "ontology_provided_cross_reference",
                    "ontology_provided_cross_reference",
                    o_xrefs,
                    complete_url,
                    o_xrefs)
                generated_xref["oid"] = ident
                xrefs.append(generated_xref)
            return

        for xref_id_dict in o_xrefs:
            xref_id = xref_id_dict["val"]
            if ":" in xref_id:
                local_id = xref_id.split(":")[1].strip()
                prefix = xref_id.split(":")[0].strip()
                complete_url = self.etlh.get_complete_url_ont(local_id, xref_id)
                generated_xref = ETLHelper.get_xref_dict(
                    local_id,
                    prefix,
                    "ontology_provided_cross_reference",
                    "ontology_provided_cross_reference",
                    xref_id,
                    complete_url,
                    xref_id + "ontology_provided_cross_reference")
                generated_xref["oid"] = ident
                xrefs.append(generated_xref)
Example #7
    def get_generators(self, sub_type, batch_size, species_encoded):
        """Get Generators."""

        entrez_ids = []

        geo_data_file_contents = Path(sub_type.get_filepath()).read_text()
        geo_data = json.loads(
            json.dumps(xmltodict.parse(geo_data_file_contents)))
        for efetch_value in dict(geo_data.items()).values():
            # IdList is a value returned from the efetch XML spec;
            # within IdList there is another map with "Id" as the key
            # and the entrez local ids as the list value.
            for sub_map_key, sub_map_value in efetch_value.items():
                if sub_map_key == 'IdList':
                    for id_list in dict(sub_map_value.items()).values():
                        for entrez_id in id_list:
                            self.logger.debug("here is the entrez id: %s",
                                              entrez_id)
                            entrez_ids.append("NCBI_Gene:" + entrez_id)

        geo_data_list = []
        return_set = Neo4jHelper.run_single_parameter_query(
            self.gene_crossref_query_template, entrez_ids)

        for record in return_set:
            gene_primary_key = record["g.primaryKey"]
            mod_local_id = record["g.modLocalId"]
            global_cross_ref_id = record["cr.globalCrossRefId"]
            geo_xref = ETLHelper.get_xref_dict(global_cross_ref_id.split(":")[1],
                                               "NCBI_Gene",
                                               "gene/other_expression",
                                               "gene/other_expression",
                                               "GEO",
                                               "https://www.ncbi.nlm.nih.gov/sites/entrez?" \
                                                       + "Db=geoprofiles"\
                                                       + "&DbFrom=gene"\
                                                       + "&Cmd=Link"\
                                                       + "&LinkName=gene_geoprofiles"\
                                                       + "&LinkReadableName=GEO%20Profiles"\
                                                       + "&IdsFromResult="\
                                                       + global_cross_ref_id.split(":")[1],
                                               global_cross_ref_id+"gene/other_expression")

            geo_xref["genePrimaryKey"] = gene_primary_key
            geo_xref["modLocalId"] = mod_local_id

            geo_data_list.append(geo_xref)

        yield [geo_data_list]
Example #8
    def save_descriptions_report_files(self, data_provider, json_desc_writer, context_info, gd_data_manager):
        """Save Descriptions Report Files."""

        release_version = ".".join(context_info.env["ALLIANCE_RELEASE"].split(".")[0:2])
        json_desc_writer.overall_properties.species = data_provider
        json_desc_writer.overall_properties.release_version = release_version
        json_desc_writer.overall_properties.date = self.cur_date
        file_name = self.cur_date + "_" + data_provider
        file_path = os.path.join("tmp", file_name)
        json_desc_writer.write_json(file_path=file_path + ".json",
                                    pretty=True,
                                    include_single_gene_stats=True,
                                    data_manager=gd_data_manager)
        json_desc_writer.write_plain_text(file_path=file_path + ".txt")
        readme = "This file contains the following fields: gene ID, gene name, and gene description. The gene " \
                 "descriptions are generated by an algorithm developed by the Alliance that uses highly structured " \
                 "gene data such as associations to various ontology terms (e.g., Gene Ontology terms) and the " \
                 "Alliance strict orthology set. The original set of ontology terms that a gene is annotated to may " \
                 "have been trimmed to an ancestor term in the ontology, in order to balance readability with the " \
                 "amount of information in the description. The complete set of annotations to any gene in this file " \
                 "may be found in the relevant data tables on the Alliance gene page."
        species = ETLHelper.species_lookup_by_data_provider(data_provider)
        taxon_id = ETLHelper.get_taxon_from_mod(data_provider)
        header = create_header(file_type='Gene Descriptions', database_version=context_info.env["ALLIANCE_RELEASE"],
                               data_format='txt', readme=readme, species=species, taxon_ids='# TaxonIDs:NCBITaxon:' +
                                                                                            taxon_id)
        header = "\n".join([line.strip() for line in header.splitlines() if len(line.strip()) != 0])
        self.add_header_to_file(file_path=file_path + ".txt", header=header)
        json_desc_writer.write_tsv(file_path=file_path + ".tsv")
        header = create_header(file_type='Gene Descriptions', database_version=context_info.env["ALLIANCE_RELEASE"],
                               data_format='tsv', readme=readme, species=species, taxon_ids='# TaxonIDs:NCBITaxon:' +
                                                                                            taxon_id)
        header = "\n".join([line.strip() for line in header.splitlines() if len(line.strip()) != 0])
        self.add_header_to_file(file_path=file_path + ".tsv", header=header)
        if context_info.env["GENERATE_REPORTS"]:
            self.upload_files_to_fms(file_path, context_info, data_provider, self.logger)
Example #9
    def process_pages(self, dp, xrefs, pages):
        """Process pages to get xrefs."""
        annotation_type = dp.get('type')
        xref = dp.get('crossReference')
        cross_ref_id = xref.get('id')
        if ":" in cross_ref_id:
            local_crossref_id = cross_ref_id.split(":")[1]
            prefix = cross_ref_id.split(":")[0]
        else:
            local_crossref_id = ""
            prefix = cross_ref_id

        if annotation_type is None:
            annotation_type = 'curated'

        for page in pages:
            if self.data_provider in ('RGD', 'HUMAN') and prefix == 'DOID':
                display_name = 'RGD'
            elif self.data_provider in ('RGD', 'HUMAN') and prefix == 'OMIM':
                display_name = 'OMIM'
            else:
                display_name = cross_ref_id.split(":")[0]
                if display_name == 'DOID':
                    display_name = self.data_provider

            mod_global_cross_ref_url = self.etlh.rdh2.return_url_from_key_value(
                prefix, local_crossref_id, page)
            passing_xref = ETLHelper.get_xref_dict(
                local_crossref_id, prefix, page, page,
                display_name, mod_global_cross_ref_url,
                cross_ref_id + page + annotation_type)
            passing_xref['dataId'] = self.disease_unique_key

            if 'loaded' in annotation_type:
                passing_xref['loadedDB'] = 'true'
                passing_xref['curatedDB'] = 'false'
            else:
                passing_xref['curatedDB'] = 'true'
                passing_xref['loadedDB'] = 'false'

            xrefs.append(passing_xref)
Example #10
    def get_generators(self, sub_type, batch_size, species_encoded):
        """Get Generators."""
        entrez_ids = []

        geo_data_file_contents = Path(sub_type.get_filepath()).read_text()
        geo_data = json.loads(
            json.dumps(xmltodict.parse(geo_data_file_contents)))
        for efetch_value in dict(geo_data.items()).values():
            # IdList is a value returned from the efetch XML spec;
            # within IdList there is another map with "Id" as the key
            # and the entrez local ids as the list value.
            for sub_map_key, sub_map_value in efetch_value.items():
                if sub_map_key == 'IdList':
                    for id_list in dict(sub_map_value.items()).values():
                        for entrez_id in id_list:
                            self.logger.debug("here is the entrez id: %s",
                                              entrez_id)
                            entrez_ids.append("NCBI_Gene:" + entrez_id)

        geo_data_list = []
        return_set = Neo4jHelper.run_single_parameter_query(
            self.gene_crossref_query_template, entrez_ids)

        for record in return_set:
            gene_primary_key = record["g.primaryKey"]
            mod_local_id = record["g.modLocalId"]
            global_cross_ref_id = record["cr.globalCrossRefId"]
            url = self.etlh.rdh2.return_url_from_key_value(
                'GEO',
                global_cross_ref_id.split(":")[1], 'entrezgene')
            geo_xref = ETLHelper.get_xref_dict(
                global_cross_ref_id.split(":")[1], "NCBI_Gene",
                "gene/other_expression", "gene/other_expression", "GEO", url,
                global_cross_ref_id + "gene/other_expression")

            geo_xref["genePrimaryKey"] = gene_primary_key
            geo_xref["modLocalId"] = mod_local_id

            geo_data_list.append(geo_xref)

        yield [geo_data_list]
Example #11
    def xref_process(self, construct_record, cross_reference_list):
        """Process the xrefs."""
        global_id = construct_record['primaryId']
        if 'crossReferences' not in construct_record:
            return
        for cross_ref in construct_record.get('crossReferences'):
            cross_ref_id = cross_ref.get('id')
            local_crossref_id = cross_ref_id.split(":")[1]
            prefix = cross_ref.get('id').split(":")[0]
            pages = cross_ref.get('pages')

            # the 'pages' collection is sometimes empty
            if pages is None or len(pages) == 0:
                continue
            for page in pages:
                if page == 'construct':
                    mod_global_cross_ref_id = self.etlh.rdh2.return_url_from_key_value(
                        prefix, local_crossref_id, page)
                    xref = ETLHelper.get_xref_dict(local_crossref_id, prefix,
                                                   page, page, cross_ref_id,
                                                   mod_global_cross_ref_id,
                                                   cross_ref_id + page)
                    xref['dataId'] = global_id
                    cross_reference_list.append(xref)
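
A hypothetical construct record illustrating the fields this method reads (the identifiers are made up for illustration):

    construct_record = {
        "primaryId": "MOD:construct-0001",
        "crossReferences": [
            {"id": "MOD:construct-0001", "pages": ["construct"]}
        ]
    }
    cross_reference_list = []
    # self.xref_process(construct_record, cross_reference_list) would append one
    # xref dict for the 'construct' page with dataId set to the record's primaryId.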
Example #12
    def xref_process(self, basic_genetic_entity, cross_references,
                     urls):  # noqa
        """Process xrefs."""
        primary_id = basic_genetic_entity.get('primaryId')
        global_id = basic_genetic_entity.get('primaryId')
        local_id = global_id.split(":")[1]
        taxon_id = basic_genetic_entity.get("taxonId")
        if 'crossReferences' not in basic_genetic_entity:
            return
        for cross_ref in basic_genetic_entity.get('crossReferences'):
            if ':' not in cross_ref.get('id'):
                continue
            cross_ref_id = cross_ref.get('id')
            local_cross_ref_id = cross_ref_id.split(":")[1]
            prefix = cross_ref.get('id').split(":")[0]
            pages = cross_ref.get('pages')
            global_xref_id = cross_ref.get('id')
            display_name = global_xref_id

            # the 'pages' collection is sometimes empty
            if pages is not None and len(pages) > 0:
                for page in pages:
                    display_name = ""

                    cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                        prefix, local_cross_ref_id, page)

                    if page == 'gene/expression_images':
                        cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                            prefix, local_cross_ref_id, page)
                    elif page == 'gene':
                        urls['mod_cross_reference_complete_url'] = \
                            self.etlh.rdh2.return_url_from_key_value(
                                prefix, local_cross_ref_id, page)

                    urls['genetic_entity_external_url'] = \
                        self.etlh.rdh2.return_url_from_key_value(
                            prefix, local_cross_ref_id, page)

                    if page == 'gene/references':
                        urls['gene_literature_url'] = \
                            self.etlh.rdh2.return_url_from_key_value(
                                prefix, local_cross_ref_id, page)

                    if page == 'gene/spell':
                        display_name = 'Serial Patterns of Expression Levels Locator (SPELL)'

                    # TODO: fix generic_cross_reference in SGD, RGD

                    if page == 'generic_cross_reference':
                        cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                            local_cross_ref_id, prefix, primary_id)

                    # TODO: fix gene/disease xrefs for SGD once
                    # resourceDescriptor change in develop
                    # makes its way to the release branch.

                    if page == 'gene/disease' and taxon_id == 'NCBITaxon:559292':
                        cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                            'SGD', local_id, page)

                    xref_map = ETLHelper.get_xref_dict(local_cross_ref_id,
                                                       prefix, page, page,
                                                       display_name,
                                                       cross_ref_complete_url,
                                                       global_xref_id + page)
                    xref_map['dataId'] = primary_id
                    cross_references.append(xref_map)
            else:
                if prefix == 'PANTHER':
                    cross_ref_primary_id = cross_ref.get(
                        'id') + '_' + primary_id
                    cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                        local_cross_ref_id, prefix, primary_id)
                    page = "gene/panther"
                elif prefix == 'RGD':
                    cross_ref_primary_id = cross_ref.get('id')
                    cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                        'RGD', local_cross_ref_id)
                    page = "generic_cross_reference"
                else:
                    cross_ref_primary_id = cross_ref.get('id')
                    cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                        local_cross_ref_id, prefix, primary_id)
                    page = "generic_cross_reference"
                xref_map = ETLHelper.get_xref_dict(local_cross_ref_id, prefix,
                                                   page, page, display_name,
                                                   cross_ref_complete_url,
                                                   cross_ref_primary_id + page)
                xref_map['dataId'] = primary_id
                cross_references.append(xref_map)
Example #13
class HTPMetaDatasetSampleETL(ETL):
    """Load HTP metadata dataset sample data into Neo4j."""

    htp_dataset_sample_query_template = """
    
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        
        MATCH (o:OBITerm {primaryKey:row.sampleType})
        MATCH (s:Species {primaryKey: row.taxonId})
        MATCH (a:MMOTerm {primaryKey: row.assayType})
    
        MERGE (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
          ON CREATE SET ds.dateAssigned = row.dateAssigned,
              ds.abundance = row.abundance,
              ds.sex = row.sex,
              ds.notes = row.notes,
              ds.dateAssigned = row.dateAssigned,
              //ds.biosampleText = row.biosampleText,
              ds.sequencingFormat = row.sequencingFormat,
              ds.title = row.sampleTitle,
              ds.sampleAge = row.sampleAge
              
        MERGE (ds)-[dssp:FROM_SPECIES]-(s)
        //MERGE (ds)-[dsat:ASSAY_TYPE]-(a)
        //MERGE (ds)-[dsst:SAMPLE_TYPE]-(o)
        
          
    """

    htp_dataset_sample_agm_query_template = """
        USING PERIODIC COMMIT %s
           LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
           
        MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
        MATCH (agm:AffectedGenomicModel {primaryKey:row.biosampleId})

        MERGE (agm)-[agmds:ASSOCIATION]-(ds)
    
    """

    htp_dataset_sample_agmtext_query_template = """
    
        USING PERIODIC COMMIT %s
           LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
    
        MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
        MERGE (agm:AffectedGenomicModel {primaryKey:row.biosampleText})

        MERGE (agm)-[agmds:ASSOCIATION]-(ds)
    
    """

    htp_bio_entity_expression_query_template = """

       USING PERIODIC COMMIT %s
           LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

       MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})
       
       MERGE (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            ON CREATE SET e.whereExpressedStatement = row.whereExpressedStatement
       
       MERGE (dss)-[dsdss:STRUCTURE_SAMPLED]-(e)
            
    """

    htp_stages_query_template = """

       USING PERIODIC COMMIT %s
           LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

       MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})
       MATCH (st:Stage {primaryKey:row.stageName})

       MERGE (dss)-[eotcctq:SAMPLED_DURING]-(st)

    """

    htp_dataset_join_query_template = """

       USING PERIODIC COMMIT %s
           LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

       MATCH (ds:HTPDataset {primaryKey:row.datasetId})
       MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})
       
       MERGE (ds)-[dsdss:ASSOCIATION]-(dss)
    
    """

    htp_secondaryIds_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        MATCH (dss:HTPDatasetSample {primaryKey: row.datasetSampleId})

        MERGE (sec:SecondaryId:Identifier {primaryKey:row.secondaryId})
                ON CREATE SET sec.name = row.secondaryId

        MERGE (dss)<-[aka:ALSO_KNOWN_AS]-(sec)


    """

    ao_substructures_query_template = """
     USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasst)
                    AND NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst) 
    
    """

    ao_qualifiers_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.anatomicalStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasst)
                    AND NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst) 
    
    
    """

    ao_ss_qualifiers_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasst)
                    AND NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst) 


    """

    ao_terms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.anatomicalStructureTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst) 
    """

    cc_term_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.cellularComponentTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst) 
    """

    eas_substructure_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})       
            MERGE (e)-[eotasst:ANATOMICAL_SUB_SUBSTRUCTURE]->(otasst) """

    eas_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otastq:Ontology {primaryKey:row.anatomicalStructureQualifierTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otastq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MERGE (e)-[eotastq:ANATOMICAL_STRUCTURE_QUALIFIER]-(otastq) """

    eass_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otasstq:Ontology {primaryKey:row.anatomicalSubStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasstq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotasstq:ANATOMICAL_SUB_STRUCTURE_QUALIFIER]-(otasstq) """

    ccq_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (otcctq:Ontology {primaryKey:row.cellularComponentQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otcctq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotcctq:CELLULAR_COMPONENT_QUALIFIER]-(otcctq) """

    stage_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
            MERGE (s:Stage {primaryKey:row.stageName})
                ON CREATE SET s.name = row.stageName
            MERGE (ei)-[eotcctq:DURING]-(s) """

    uberon_ao_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid})  
            MATCH (o:Ontology:UBERONTerm {primaryKey:row.aoUberonId})     
            MERGE (ebe)-[ebeo:ANATOMICAL_RIBBON_TERM]-(o) """

    uberon_stage_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})  
            MATCH (o:Ontology:UBERONTerm {primaryKey:row.uberonStageId})

            MERGE (ei)-[eio:STAGE_RIBBON_TERM]-(o) """

    uberon_ao_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid}) 
            MATCH (u:Ontology:UBERONTerm:Ontology {primaryKey:'UBERON:AnatomyOtherLocation'}) 
            MERGE (ebe)-[ebeu:ANATOMICAL_RIBBON_TERM]-(u) """

    uberon_stage_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
            MATCH (u:Ontology:UBERONTerm:Ontology {primaryKey:'UBERON:PostEmbryonicPreAdult'})

            MERGE (ei)-[eiu:STAGE_RIBBON_TERM]-(u) """

    htp_dataset_sample_assemblies_query_template = """
            USING PERIODIC COMMIT %s
            LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

                MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
                MATCH (u:Assembly {primaryKey:row.assembly})

                MERGE (ds)-[dsu:ASSEMBLY]-(u) """

    htpdatasetsample_xrefs_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:HTPDatasetSample {primaryKey:row.datasetId}) """ + ETLHelper.get_cypher_xref_text(
    )

    def __init__(self, config):
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        """Load and process the data for each sub type."""
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            p = multiprocessing.Process(target=self._process_sub_type,
                                        args=(sub_type, ))
            p.start()
            thread_pool.append(p)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):
        """Process a single sub type."""
        logger.info("Loading HTP metadata sample data: %s",
                    sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata sample data: %s",
                    sub_type.get_data_provider())

        if data is None:
            logger.warning("No Data found for %s, skipping",
                           sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # Each entry needs to be in the format (template, param1, param2); any extras are ignored.
        query_list = [
            [
                HTPMetaDatasetSampleETL.htp_dataset_sample_query_template,
                commit_size, "htp_metadataset_sample_samples_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.
                htp_bio_entity_expression_query_template, commit_size,
                "htp_metadataset_sample_bioentities_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_secondaryIds_query_template,
                commit_size, "htp_metadataset_sample_secondaryIds_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_dataset_join_query_template,
                commit_size, "htp_metadataset_sample_datasets_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_stages_query_template, commit_size,
                "htp_metadataset_sample_stages_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_terms_query_template, commit_size,
                "htp_metadataset_sample_aoterms_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_substructures_query_template,
                commit_size, "htp_metadataset_sample_ao_substructures_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_qualifiers_query_template,
                commit_size, "htp_metadataset_sample_ao_qualifiers_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ao_ss_qualifiers_query_template,
                commit_size, "htp_metadataset_sample_ao_ss_qualifiers_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.cc_term_query_template, commit_size,
                "htp_metadataset_sample_ccterms" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.ccq_expression_query_template,
                commit_size, "htp_metadataset_sample_ccqterms_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.uberon_ao_query_template, commit_size,
                "htp_metadataset_sample_uberon_ao_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.uberon_ao_other_query_template,
                commit_size, "htp_metadataset_sample_uberon_ao_other_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.htp_dataset_sample_agm_query_template,
                commit_size, "htp_metadataset_sample_agms_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.
                htp_dataset_sample_agmtext_query_template, commit_size,
                "htp_metadataset_sample_agmstext_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetSampleETL.
                htp_dataset_sample_assemblies_query_template, commit_size,
                "htp_metadataset_sample_assemblies_" +
                sub_type.get_data_provider() + ".csv"
            ]
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, htp_datasetsample_data, batch_size):
        """Get Generators."""
        htp_datasetsamples = []
        secondaryIds = []
        datasetIds = []
        assemblies = []
        uberon_ao_data = []
        ao_qualifiers = []
        bio_entities = []
        ao_ss_qualifiers = []
        ao_substructures = []
        ao_terms = []
        uberon_ao_other_data = []
        stages = []
        ccq_components = []
        cc_components = []
        biosamples = []
        biosamplesTexts = []
        counter = 0

        data_provider_object = htp_datasetsample_data['metaData'][
            'dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')

        for datasample_record in htp_datasetsample_data['data']:

            counter = counter + 1

            biosampleId = ''
            biosampleText = ''
            sampleId = ''
            sampleTitle = ''

            if 'sampleId' in datasample_record:
                sampleIdObj = datasample_record.get('sampleId')
                sampleId = sampleIdObj.get('primaryId')

                if 'secondaryIds' in sampleIdObj:
                    for secId in sampleIdObj.get('secondaryIds'):
                        secid = {
                            "datasetSampleId": sampleId,
                            "secondaryId": secId
                        }
                        secondaryIds.append(secid)

            if 'sampleTitle' in datasample_record:
                sampleTitle = datasample_record.get('sampleTitle')

            datasetSampleId = sampleId + sampleTitle

            if 'datasetIds' in datasample_record:
                datasetIdSet = datasample_record.get('datasetIds')
                for datasetID in datasetIdSet:
                    datasetsample = {
                        "datasetSampleId": datasetSampleId,
                        "datasetId": datasetID
                    }
                    datasetIds.append(datasetsample)

                    if self.test_object.using_test_data() is True:
                        is_it_test_entry = self.test_object.check_for_test_id_entry(
                            datasetID)
                        if is_it_test_entry is False:
                            counter = counter - 1
                            continue

            if 'genomicInformation' in datasample_record:
                genomicInformation = datasample_record.get(
                    'genomicInformation')
                if 'biosampleId' in genomicInformation:
                    biosampleId = genomicInformation.get('biosampleId')
                if 'bioSampleText' in genomicInformation:
                    biosampleText = genomicInformation.get('bioSampleText')

                if biosampleId is not None and biosampleId != '':
                    biosample = {
                        "biosampleId": biosampleId,
                        "datasetSampleId": datasetSampleId
                    }
                    biosamples.append(biosample)

                if biosampleText is not None and biosampleText != '' and biosampleId == '':
                    biosample_text_entry = {
                        "biosampleText": biosampleText,
                        "datasetSampleId": datasetSampleId
                    }
                    biosamplesTexts.append(biosample_text_entry)

            if 'assemblyVersions' in datasample_record:
                for assembly in datasample_record.get('assemblyVersions'):

                    datasetsample = {
                        "datasetSampleId": datasetSampleId,
                        "assembly": assembly
                    }
                    assemblies.append(datasetsample)

            age = ''
            if 'sampleAge' in datasample_record:
                sampleAge = datasample_record.get('sampleAge')
                stageId = ""
                if 'age' in sampleAge:
                    age = sampleAge.get('age')
                    stageId = stageId + age
                if 'stage' in sampleAge:
                    stage = sampleAge.get('stage')
                    stageId = stageId + stage.get('stageName')

                    stage = {
                        "stageId": stageId,
                        "stageTermId": stage.get('stageTermId'),
                        "stageName": stage.get('stageName'),
                        "stageUberonSlimTerm":
                        stage.get('stageUberonSlimTerm'),
                        "sampleAge": age,
                        "datasetSampleId": datasetSampleId
                    }
                    stages.append(stage)
                else:
                    stage = {"stageId": stageId, "sampleAge": age}
                    stages.append(stage)

            if 'sampleLocations' in datasample_record:
                sampleLocations = datasample_record.get('sampleLocations')

                for location in sampleLocations:

                    cellular_component_qualifier_term_id = location.get(
                        'cellularComponentQualifierTermId')
                    cellular_component_term_id = location.get(
                        'cellularComponentTermId')
                    anatomical_structure_term_id = location.get(
                        'anatomicalStructureTermId')
                    anatomical_structure_qualifier_term_id = location.get(
                        'anatomicalStructureQualifierTermId')
                    anatomical_sub_structure_term_id = location.get(
                        'anatomicalSubStructureTermId')
                    anatomical_sub_structure_qualifier_term_id = location.get(
                        'anatomicalSubStructureQualifierTermId')
                    where_expressed_statement = location.get(
                        'whereExpressedStatement')

                    expression_unique_key = datasetSampleId
                    expression_entity_unique_key = ''
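                    # Concatenate the location's term ids into a composite key;
                    # it becomes the ExpressionBioEntity primaryKey (ebe_uuid)
                    # referenced by the Cypher templates above.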

                    if anatomical_structure_term_id is not None:
                        expression_unique_key += anatomical_structure_term_id
                        expression_entity_unique_key = anatomical_structure_term_id

                        if anatomical_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_structure_qualifier_term_id
                            expression_entity_unique_key += anatomical_structure_qualifier_term_id

                    if cellular_component_term_id is not None:
                        expression_unique_key += cellular_component_term_id
                        expression_entity_unique_key += cellular_component_term_id

                        if cellular_component_qualifier_term_id is not None:
                            expression_unique_key += cellular_component_qualifier_term_id
                            expression_entity_unique_key += cellular_component_qualifier_term_id

                    if anatomical_sub_structure_term_id is not None:
                        expression_unique_key += anatomical_sub_structure_term_id

                        if anatomical_sub_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_sub_structure_qualifier_term_id
                            expression_entity_unique_key += anatomical_sub_structure_qualifier_term_id

                    if where_expressed_statement is None:
                        where_expressed_statement = ""
                    expression_entity_unique_key += where_expressed_statement

                    if location.get('anatomicalStructureUberonSlimTermIds'
                                    ) is not None:

                        for uberon_structure_term_object in location.get(
                                'anatomicalStructureUberonSlimTermIds'):
                            structure_uberon_term_id = uberon_structure_term_object.get(
                                'uberonTerm')

                            if structure_uberon_term_id is not None and structure_uberon_term_id != 'Other':
                                structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": structure_uberon_term_id
                                }
                                uberon_ao_data.append(structure_uberon_term)

                            elif structure_uberon_term_id is not None and structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    if location.get('anatomicalSubStructureUberonSlimTermIds'
                                    ) is not None:

                        for uberon_sub_structure_term_object in location.get(
                                'anatomicalSubStructureUberonSlimTermIds'):
                            sub_structure_uberon_term_id = uberon_sub_structure_term_object.get(
                                'uberonTerm')

                            if sub_structure_uberon_term_id is not None and sub_structure_uberon_term_id != 'Other':
                                sub_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": sub_structure_uberon_term_id
                                }
                                uberon_ao_data.append(
                                    sub_structure_uberon_term)

                            elif sub_structure_uberon_term_id is not None and sub_structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    if cellular_component_term_id is not None:
                        cc_term = {
                            "ebe_uuid": expression_entity_unique_key,
                            "cellularComponentTermId":
                            cellular_component_term_id
                        }
                        cc_components.append(cc_term)

                    if cellular_component_qualifier_term_id is not None:
                        ccq_term = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "cellularComponentQualifierTermId":
                            cellular_component_qualifier_term_id
                        }
                        ccq_components.append(ccq_term)

                    if anatomical_structure_term_id is not None:
                        ao_term = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalStructureTermId":
                            anatomical_structure_term_id
                        }
                        ao_terms.append(ao_term)

                    if anatomical_structure_qualifier_term_id is not None:
                        ao_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalStructureQualifierTermId":
                            anatomical_structure_qualifier_term_id
                        }

                        ao_qualifiers.append(ao_qualifier)

                    if anatomical_sub_structure_term_id is not None:
                        ao_substructure = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureTermId":
                            anatomical_sub_structure_term_id
                        }

                        ao_substructures.append(ao_substructure)

                    if anatomical_sub_structure_qualifier_term_id is not None:
                        ao_ss_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureQualifierTermId":
                            anatomical_sub_structure_qualifier_term_id
                        }

                        ao_ss_qualifiers.append(ao_ss_qualifier)

                    if where_expressed_statement is None:
                        where_expressed_statement = ""

                    bio_entity = {
                        "ebe_uuid": expression_entity_unique_key,
                        "whereExpressedStatement": where_expressed_statement,
                        "datasetSampleId": datasetSampleId
                    }
                    bio_entities.append(bio_entity)

            htp_dataset_sample = {
                "datasetSampleId": datasetSampleId,
                "abundance": datasample_record.get('abundance'),
                "sampleType": datasample_record.get('sampleType'),
                "taxonId": datasample_record.get('taxonId'),
                "sex": datasample_record.get('sex'),
                "assayType": datasample_record.get('assayType'),
                "notes": datasample_record.get('notes'),
                "dateAssigned": datasample_record.get('dateAssigned'),
                "sequencingFormat": datasample_record.get('sequencingFormat'),
                "sampleTitle": sampleTitle,
                "sampleAge": age
            }

            htp_datasetsamples.append(htp_dataset_sample)

            #
            # if self.test_object.using_test_data() is True:
            #     is_it_test_entry = self.test_object.check_for_test_id_entry(datasetID)
            #     if is_it_test_entry is True:
            #         self.logger.info(htp_dataset_sample)

            if counter == batch_size:
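                # The order of these lists must match the order of the query
                # templates in query_list above.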
                yield [
                    htp_datasetsamples,
                    bio_entities,
                    secondaryIds,
                    datasetIds,
                    stages,
                    ao_terms,
                    ao_substructures,
                    ao_qualifiers,
                    ao_ss_qualifiers,
                    cc_components,
                    ccq_components,
                    uberon_ao_data,
                    uberon_ao_other_data,
                    biosamples,
                    biosamplesTexts,
                    assemblies,
                ]
                counter = 0
                htp_datasetsamples = []
                secondaryIds = []
                datasetIds = []
                uberon_ao_data = []
                ao_qualifiers = []
                bio_entities = []
                ao_ss_qualifiers = []
                ao_substructures = []
                ao_terms = []
                uberon_ao_other_data = []
                stages = []
                ccq_components = []
                cc_components = []
                biosamples = []
                biosamplesTexts = []
                assemblies = []

        if counter > 0:
            yield [
                htp_datasetsamples, bio_entities, secondaryIds, datasetIds,
                stages, ao_terms, ao_substructures, ao_qualifiers,
                ao_ss_qualifiers, cc_components, ccq_components,
                uberon_ao_data, uberon_ao_other_data, biosamples,
                biosamplesTexts, assemblies
            ]
Example #14
    def get_generators(self, construct_data, data_provider, batch_size):
        """Create Generators"""

        data_providers = []
        release = ""
        constructs = []
        construct_synonyms = []
        construct_secondary_ids = []
        cross_reference_list = []
        component_details = []
        component_no_gene_details = []
        non_bgi_components = []

        counter = 0
        date_produced = construct_data['metaData']['dateProduced']

        data_provider_object = construct_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        self.logger.info("DataProvider: " + data_provider)
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []

        load_key = date_produced + data_provider + "_construct"

        # TODO: get SGD to fix their files.

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, self.xref_url_map, data_provider,
                    data_provider_page)

                data_provider_cross_ref_set.append(
                    ETLHelper.get_xref_dict(
                        data_provider, data_provider, data_provider_page,
                        data_provider_page, data_provider,
                        cross_ref_complete_url,
                        data_provider + data_provider_page))

                data_providers.append(data_provider)
                self.logger.info("data provider: %s", data_provider)

        if 'release' in construct_data['metaData']:
            release = construct_data['metaData']['release']

        for construct_record in construct_data['data']:

            counter = counter + 1
            global_id = construct_record['primaryId']
            local_id = global_id.split(":")[1]
            mod_global_cross_ref_id = ""

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            name_text = TextProcessingHelper.cleanhtml(
                construct_record.get('name'))

            construct_dataset = {
                "symbol": construct_record.get('name'),
                "primaryId": construct_record.get('primaryId'),
                "globalId": global_id,
                "localId": local_id,
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "release": release,
                "modGlobalCrossRefId": mod_global_cross_ref_id,
                "uuid": str(uuid.uuid4()),
                "dataProvider": data_provider,
                "nameText": name_text,
                "name": construct_record.get('name')
            }
            constructs.append(construct_dataset)

            if 'crossReferences' in construct_record:

                for cross_ref in construct_record.get('crossReferences'):
                    cross_ref_id = cross_ref.get('id')
                    local_crossref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref.get('id').split(":")[0]
                    pages = cross_ref.get('pages')

                    # some pages collections have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            if page == 'construct':
                                mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                    local_crossref_id, self.xref_url_map,
                                    prefix, page)
                                xref = ETLHelper.get_xref_dict(
                                    local_crossref_id, prefix, page, page,
                                    cross_ref_id, mod_global_cross_ref_id,
                                    cross_ref_id + page)
                                xref['dataId'] = global_id
                                cross_reference_list.append(xref)

            if 'constructComponents' in construct_record:
                for component in construct_record.get('constructComponents'):
                    component_relation = component.get(
                        'componentRelation').upper()
                    component_symbol = component.get('componentSymbol')
                    component_id = component.get('componentID')

                    if component_id is not None:
                        component_detail = {
                            "componentRelation": component_relation.upper(),
                            "componentSymbol": component_symbol,
                            "componentID": component_id,
                            "constructID": construct_record.get('primaryId')
                        }
                        component_details.append(component_detail)
                    else:
                        component_detail = {
                            "componentRelation": component_relation.upper(),
                            "componentSymbol": component_symbol,
                            "constructID": construct_record.get('primaryId')
                        }
                        non_bgi_component = {
                            "componentSymbol": component_symbol
                        }
                        non_bgi_components.append(non_bgi_component)
                        component_no_gene_details.append(component_detail)

            if 'synonyms' in construct_record:
                for syn in construct_record.get('synonyms'):
                    construct_synonym = {
                        "data_id": construct_record.get('primaryId'),
                        "synonym": syn.strip()
                    }
                    construct_synonyms.append(construct_synonym)

            if 'secondaryIds' in construct_record:
                for secondary_id in construct_record.get('secondaryIds'):
                    construct_secondary_id = {
                        "data_id": construct_record.get('primaryId'),
                        "secondary_id": secondary_id
                    }
                    construct_secondary_ids.append(construct_secondary_id)

            if counter == batch_size:
                yield [
                    constructs, construct_secondary_ids, construct_synonyms,
                    cross_reference_list, non_bgi_components,
                    component_details, component_no_gene_details
                ]
                constructs = []
                construct_secondary_ids = []
                construct_synonyms = []
                cross_reference_list = []
                non_bgi_components = []
                component_details = []
                component_no_gene_details = []
                counter = 0

        if counter > 0:
            yield [
                constructs, construct_secondary_ids, construct_synonyms,
                cross_reference_list, non_bgi_components, component_details,
                component_no_gene_details
            ]
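The loop above uses the batching idiom shared by all of these get_generators methods: rows accumulate per record, a full set of lists is yielded once counter reaches batch_size, the lists are reset, and any remainder is flushed after the loop. A stripped-down, self-contained sketch of the same idiom (illustrative only):

    def batched(records, batch_size):
        """Yield lists of at most batch_size records, mirroring the ETL generators."""
        batch = []
        for record in records:
            batch.append(record)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:  # final partial batch, like the trailing 'if counter > 0' block
            yield batch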
Exemple #15
0
class BGIETL(ETL):
    """BGI ETL."""

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later

    so_terms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Gene {primaryKey:row.primaryKey})
            MATCH (s:SOTerm {primaryKey:row.soTermId})
            MERGE (o)-[:ANNOTATED_TO]->(s)"""

    chromosomes_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MERGE (chrm:Chromosome {primaryKey: row.primaryKey}) """

    genomic_locations_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Gene {primaryKey:row.primaryId})
            MATCH (chrm:Chromosome {primaryKey:row.chromosome})

            MERGE (o)-[ochrm:LOCATED_ON]->(chrm)
            MERGE (a:Assembly {primaryKey:row.assembly})
              ON CREATE SET a.dataProvider = row.dataProvider

            MERGE (gchrm:GenomicLocation {primaryKey:row.uuid})
            ON CREATE SET gchrm.start = apoc.number.parseInt(row.start),
                gchrm.end = apoc.number.parseInt(row.end),
                gchrm.assembly = row.assembly,
                gchrm.strand = row.strand,
                gchrm.chromosome = row.chromosome

            MERGE (o)-[of:ASSOCIATION]-(gchrm)
            MERGE (gchrm)-[ofc:ASSOCIATION]-(chrm)
            MERGE (gchrm)-[ao:ASSOCIATION]->(a)

        """

    genomic_locations_bins_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Gene {primaryKey:row.genePrimaryId})
            MATCH (chrm:Chromosome {primaryKey:row.chromosome})

            MERGE (bin:GenomicLocationBin {primaryKey:row.binPrimaryKey})
            ON CREATE SET bin.number = toInt(row.number),
               bin.assembly = row.assembly

            MERGE (o)-[:LOCATED_IN]->(bin)
            MERGE (bin)-[:LOCATED_ON]->(chrm) """

    gene_secondary_ids_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (g:Gene {primaryKey:row.primary_id})

            MERGE (second:SecondaryId:Identifier {primaryKey:row.secondary_id})
                ON CREATE SET second.name = row.secondary_id
            MERGE (g)-[aka1:ALSO_KNOWN_AS]->(second) """

    gene_synonyms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (g:Gene {primaryKey:row.primary_id})

            MERGE(syn:Synonym:Identifier {primaryKey:row.synonym})
                    SET syn.name = row.synonym
            MERGE (g)-[aka2:ALSO_KNOWN_AS]->(syn) """

    gene_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (l:Load {primaryKey:row.loadKey})

            //Create the Gene node and set properties. primaryKey is required.
            MERGE (o:Gene {primaryKey:row.primaryId})
                ON CREATE SET o.symbol = row.symbol,
                              o.taxonId = row.taxonId,
                              o.name = row.name,
                              o.description = row.description,
                              o.geneSynopsisUrl = row.geneSynopsisUrl,
                              o.geneSynopsis = row.geneSynopsis,
                              o.geneLiteratureUrl = row.geneLiteratureUrl,
                              o.geneticEntityExternalUrl = row.geneticEntityExternalUrl,
                              o.dateProduced = row.dateProduced,
                              o.modGlobalCrossRefId = row.modGlobalCrossRefId,
                              o.modCrossRefCompleteUrl = row.modCrossRefCompleteUrl,
                              o.modLocalId = row.localId,
                              o.modGlobalId = row.modGlobalId,
                              o.uuid = row.uuid,
                              o.dataProvider = row.dataProvider,
                              o.symbolWithSpecies = row.symbolWithSpecies
    """

    basic_gene_load_relations_query_template = """
    USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        MATCH (l:Load {primaryKey:row.loadKey})
        MATCH (g:Gene {primaryKey:row.primaryId})
        MERGE (g)-[:LOADED_FROM]->(l)

    """

    basic_gene_species_relations_query_template = """
    USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        MATCH (spec:Species {primaryKey: row.taxonId})
        MATCH (g:Gene {primaryKey: row.primaryId})

        MERGE (g)-[:FROM_SPECIES]->(spec)

    """

    xrefs_query_template = """

        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Gene {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_tuned_text(
    )

    xrefs_relationships_query_template = """

        USING PERIODIC COMMIT %s
            LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Gene {primaryKey:row.dataId})
            MATCH (c:CrossReference {primaryKey:row.primaryKey})

            MERGE (o)-[oc:CROSS_REFERENCE]-(c)

    """ + ETLHelper.merge_crossref_relationships()

    gene_metadata_query_template = """

        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        //Create the load node(s)
        CREATE (l:Load:Entity {primaryKey:row.loadKey})
            SET l.dateProduced = row.dateProduced,
                l.loadName = "BGI",
                l.release = row.release,
                l.dataProviders = row.dataProviders,
                l.dataProvider = row.dataProvider
        """

    def __init__(self, config):
        """Initialise object."""
        self.metadata_is_loaded = {
        }  # Dictionary for optimizing metadata loading.
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):

        thread_pool = []

        query_tracking_list = multiprocessing.Manager().list()
        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type,
                                                    query_tracking_list))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

        queries = []
        for item in query_tracking_list:
            queries.append(item)

        Neo4jTransactor.execute_query_batch(queries)

    def _process_sub_type(self, sub_type, query_tracking_list):

        self.logger.info("Loading BGI Data: %s", sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        if filepath is None:
            self.logger.error("Can't find input file for %s", sub_type)
            sys.exit()

        data = JSONFile().get_data(filepath)

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # gene_metadata, gene_dataset, secondary_ids, genomic_locations, cross_references, synonyms
        # This needs to be in this format (template, param1, param2); others will be ignored.
        query_template_list = [
            [
                self.gene_metadata_query_template, commit_size,
                "gene_metadata_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.gene_query_template, commit_size,
                "gene_data_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.basic_gene_load_relations_query_template, commit_size,
                "gene_data_load_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.basic_gene_species_relations_query_template, commit_size,
                "gene_data_species_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.so_terms_query_template, commit_size,
                "gene_so_terms_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.chromosomes_query_template, commit_size,
                "gene_chromosomes_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.gene_secondary_ids_query_template, commit_size,
                "gene_secondary_ids_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.genomic_locations_query_template, commit_size,
                "gene_genomic_locations_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.xrefs_query_template, commit_size,
                "gene_cross_references_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.xrefs_relationships_query_template, commit_size,
                "gene_cross_references_relationships_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                self.gene_synonyms_query_template, 600000,
                "gene_synonyms_" + sub_type.get_data_provider() + ".csv"
            ]
        ]

        # Obtain the generator
        generators = self.get_generators(data, sub_type.get_data_provider(),
                                         batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)

        for item in query_and_file_list:
            query_tracking_list.append(item)

        self.error_messages("BGI-{}: ".format(sub_type.get_data_provider()))
        self.logger.info("Finished Loading BGI Data: %s",
                         sub_type.get_data_provider())

    def secondary_process(self, secondarys, data_record):
        """Get secondary ids.

        secondarys: list of dataset items.
        data_record: record to process.
        """
        if data_record.get('secondaryIds') is None:
            return
        for sid in data_record.get('secondaryIds'):
            secondary_id_dataset = {
                "primary_id": data_record.get('primaryId'),
                "secondary_id": sid
            }
            secondarys.append(secondary_id_dataset)

    def synonyms_process(self, synonyms, data_record):
        """Get synonyms."""
        if data_record.get('synonyms') is None:
            return
        for syn in data_record.get('synonyms'):
            syn_dataset = {
                "primary_id": data_record.get('primaryId'),
                "synonym": syn.strip()
            }
            synonyms.append(syn_dataset)

    def xref_process(self, basic_genetic_entity, cross_references,
                     urls):  # noqa
        """Process xrefs."""
        primary_id = basic_genetic_entity.get('primaryId')
        global_id = basic_genetic_entity.get('primaryId')
        local_id = global_id.split(":")[1]
        taxon_id = basic_genetic_entity.get("taxonId")
        if 'crossReferences' not in basic_genetic_entity:
            return
        for cross_ref in basic_genetic_entity.get('crossReferences'):
            if ':' not in cross_ref.get('id'):
                continue
            cross_ref_id = cross_ref.get('id')
            local_cross_ref_id = cross_ref_id.split(":")[1]
            prefix = cross_ref.get('id').split(":")[0]
            pages = cross_ref.get('pages')
            global_xref_id = cross_ref.get('id')
            display_name = global_xref_id

            # some pages collections have 0 elements
            if pages is not None and len(pages) > 0:
                for page in pages:
                    display_name = ""

                    cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                        prefix, local_cross_ref_id, page)

                    if page == 'gene/expression_images':
                        cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                            prefix, local_cross_ref_id, page)
                    elif page == 'gene':
                        urls[
                            'mod_cross_reference_complete_url'] = self.etlh.rdh2.return_url_from_key_value(
                                prefix, local_cross_ref_id, page)

                    urls[
                        'genetic_entity_external_url'] = self.etlh.rdh2.return_url_from_key_value(
                            prefix, local_cross_ref_id, page)

                    if page == 'gene/references':
                        urls[
                            'gene_literature_url'] = self.etlh.rdh2.return_url_from_key_value(
                                prefix, local_cross_ref_id, page)

                    if page == 'gene/spell':
                        display_name = 'Serial Patterns of Expression Levels Locator (SPELL)'

                    # TODO: fix generic_cross_reference in SGD, RGD

                    if page == 'generic_cross_reference':
                        cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                            local_cross_ref_id, prefix, primary_id)

                    # TODO: fix gene/disease xrefs for SGD once
                    # resourceDescriptor change in develop
                    # makes its way to the release branch.

                    if page == 'gene/disease' and taxon_id == 'NCBITaxon:559292':
                        cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                            'SGD', local_id, page)

                    xref_map = ETLHelper.get_xref_dict(local_cross_ref_id,
                                                       prefix, page, page,
                                                       display_name,
                                                       cross_ref_complete_url,
                                                       global_xref_id + page)
                    xref_map['dataId'] = primary_id
                    cross_references.append(xref_map)
            else:
                if prefix == 'PANTHER':
                    cross_ref_primary_id = cross_ref.get(
                        'id') + '_' + primary_id
                    cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                        local_cross_ref_id, prefix, primary_id)
                    page = "gene/panther"
                elif prefix == 'RGD':
                    cross_ref_primary_id = cross_ref.get('id')
                    cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                        'RGD', local_cross_ref_id)
                    page = "generic_cross_reference"
                else:
                    cross_ref_primary_id = cross_ref.get('id')
                    cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                        local_cross_ref_id, prefix, primary_id)
                    page = "generic_cross_reference"
                xref_map = ETLHelper.get_xref_dict(local_cross_ref_id, prefix,
                                                   page, page, display_name,
                                                   cross_ref_complete_url,
                                                   cross_ref_primary_id + page)
                xref_map['dataId'] = primary_id
                cross_references.append(xref_map)

    def locations_process(self, basic_genetic_entity, chromosomes,
                          genomic_locations):
        """Get chromosome and genomic location info."""
        primary_id = basic_genetic_entity.get('primaryId')

        if 'genomeLocations' not in basic_genetic_entity:
            return
        for genome_location in basic_genetic_entity.get('genomeLocations'):
            chromosome = genome_location.get('chromosome')
            if chromosome is not None:
                if chromosome.startswith("chr"):
                    chromosome = chromosome[3:]

                if chromosome not in chromosomes:
                    chromosomes[chromosome] = {"primaryKey": chromosome}

                if 'startPosition' in genome_location:
                    start = genome_location['startPosition']
                else:
                    start = None

                if 'endPosition' in genome_location:
                    end = genome_location['endPosition']
                else:
                    end = None

            assembly = genome_location.get('assembly')

            if 'strand' in genome_location:
                strand = genome_location['strand']
            else:
                strand = None

            genomic_locations.append({
                "primaryId": primary_id,
                "chromosome": chromosome,
                "start": start,
                "end": end,
                "strand": strand,
                "assembly": assembly,
                "uuid": str(uuid.uuid4()),
                "dataProvider": self.data_provider
            })
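
        # Illustrative note (hypothetical data, not taken from a source file): a single
        # genomeLocations entry handled above might look like
        #     {"chromosome": "chr2", "assembly": "GRCm39",
        #      "startPosition": 105668900, "endPosition": 105676450, "strand": "+"}
        # The "chr" prefix is stripped from the chromosome name, and a missing
        # startPosition/endPosition/strand simply ends up as None in the genomic-location row.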

    def get_generators(self, gene_data, data_provider, batch_size):
        """Create Generators."""
        date_produced = gene_data['metaData']['dateProduced']
        synonyms = []
        secondary_ids = []
        cross_references = []
        genomic_locations = []
        gene_dataset = []
        gene_metadata = []
        gene_to_so_terms = []
        chromosomes = {}
        release = None
        counter = 0

        self.data_providers_process(gene_data)
        load_key = date_produced + data_provider + "_BGI"

        # If we're not tracking the metadata, create the entry in our tracker.
        if load_key not in self.metadata_is_loaded:
            self.metadata_is_loaded[load_key] = False

        if 'release' in gene_data['metaData']:
            release = gene_data['metaData']['release']

        if self.metadata_is_loaded[load_key] is False:
            gene_metadata = []
            metadata_dict = {
                'loadKey': load_key,
                'loadName': 'BGI',
                'release': release,
                'dataProviders': None,
                'dataProvider': data_provider
            }
            gene_metadata.append(metadata_dict)

        for gene_record in gene_data['data']:
            counter = counter + 1
            urls = {
                'gene_literature_url': "",
                'genetic_entity_external_url': "",
                'mod_cross_reference_complete_url': ""
            }
            basic_genetic_entity = gene_record['basicGeneticEntity']
            primary_id = basic_genetic_entity.get('primaryId')
            global_id = basic_genetic_entity.get('primaryId')
            local_id = global_id.split(":")[1]
            taxon_id = basic_genetic_entity.get("taxonId")
            short_species_abbreviation = self.etlh.get_short_species_abbreviation(
                taxon_id)

            if basic_genetic_entity.get('taxonId') in [
                    "NCBITaxon:9606", "NCBITaxon:10090"
            ]:
                local_id = basic_genetic_entity.get('primaryId')

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    primary_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            self.xref_process(basic_genetic_entity, cross_references, urls)
            # TODO Metadata can be safely removed from this dictionary. Needs to be tested.

            gene_to_so_terms.append({
                "primaryKey": primary_id,
                "soTermId": gene_record['soTermId']
            })

            gene = {
                "symbol":
                gene_record.get('symbol'),
                # globallyUniqueSymbolWithSpecies requested by search group
                "symbolWithSpecies":
                gene_record.get('symbol') + " (" + short_species_abbreviation +
                ")",
                "name":
                gene_record.get('name'),
                "geneticEntityExternalUrl":
                urls['genetic_entity_external_url'],
                "description":
                gene_record.get('description'),
                "geneSynopsis":
                gene_record.get('geneSynopsis'),
                "geneSynopsisUrl":
                gene_record.get('geneSynopsisUrl'),
                "taxonId":
                basic_genetic_entity.get('taxonId'),
                "geneLiteratureUrl":
                urls['gene_literature_url'],
                "name_key":
                gene_record.get('symbol'),
                "primaryId":
                primary_id,
                "category":
                "gene",
                "href":
                None,
                "uuid":
                str(uuid.uuid4()),
                "modCrossRefCompleteUrl":
                urls['mod_cross_reference_complete_url'],
                "localId":
                local_id,
                "modGlobalCrossRefId":
                global_id,
                "modGlobalId":
                global_id,
                "loadKey":
                load_key,
                "dataProvider":
                data_provider,
                "dateProduced":
                date_produced
            }

            gene_dataset.append(gene)
            self.locations_process(basic_genetic_entity, chromosomes,
                                   genomic_locations)
            self.synonyms_process(synonyms, basic_genetic_entity)
            self.secondary_process(secondary_ids, basic_genetic_entity)

            # We should have the metadata ready to go after the first loop of the generator.
            self.metadata_is_loaded[load_key] = True

            # Establishes the number of genes to yield (return) at a time.
            if counter == batch_size:  # only sending unique chromosomes, hence empty list here.
                counter = 0
                yield [
                    gene_metadata,
                    gene_dataset, gene_dataset, gene_dataset, gene_to_so_terms,
                    [], secondary_ids, genomic_locations, cross_references,
                    cross_references, synonyms
                ]
                gene_metadata = []
                gene_dataset = []
                synonyms = []
                secondary_ids = []
                genomic_locations = []
                cross_references = []
                gene_to_so_terms = []
                # xref_relations = []

        if counter > 0:
            yield [
                gene_metadata, gene_dataset, gene_dataset, gene_dataset,
                gene_to_so_terms,
                chromosomes.values(), secondary_ids, genomic_locations,
                cross_references, cross_references, synonyms
            ]

    def get_generators(self, sqtr_data, data_provider, batch_size):
        """Get Generators."""

        data_providers = []
        sqtrs = []
        sqtr_synonyms = []
        sqtr_secondary_ids = []
        mod_global_cross_ref_url = ""
        tgs = []

        counter = 0
        date_produced = sqtr_data['metaData']['dateProduced']

        data_provider_object = sqtr_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []

        load_key = date_produced + data_provider + "_SqTR"

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(data_provider,
                                                                         self.xref_url_map,
                                                                         data_provider,
                                                                         data_provider_page)

                data_provider_cross_ref_set.append(ETLHelper.get_xref_dict( \
                        data_provider,
                        data_provider,
                        data_provider_page,
                        data_provider_page,
                        data_provider,
                        cross_ref_complete_url,
                        data_provider + data_provider_page))

                data_providers.append(data_provider)
                self.logger.info("data provider: %s", data_provider)

        for sqtr_record in sqtr_data['data']:
            counter = counter + 1
            global_id = sqtr_record['primaryId']
            local_id = global_id.split(":")[1]

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            if sqtr_record.get('secondaryIds') is not None:
                for sid in sqtr_record.get('secondaryIds'):
                    sqtr_secondary_id_dataset = {
                        "primaryId": sqtr_record.get('primaryId'),
                        "secondaryId": sid
                    }
                    sqtr_secondary_ids.append(sqtr_secondary_id_dataset)

            if sqtr_record.get('synonyms') is not None:
                for syn in sqtr_record.get('synonyms'):
                    syn_dataset = {
                        "primaryId": sqtr_record.get('primaryId'),
                        "synonym": syn
                    }
                    sqtr_synonyms.append(syn_dataset)

            if sqtr_record.get('targetGeneIds') is not None:
                for target_gene_id in sqtr_record.get('targetGeneIds'):
                    tg_dataset = {
                        "primaryId": sqtr_record.get('primaryId'),
                        "geneId": target_gene_id
                    }
                    tgs.append(tg_dataset)

            if 'crossReferences' in sqtr_record:

                for cross_ref in sqtr_record['crossReferences']:
                    cross_ref_id = cross_ref.get('id')
                    local_crossref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref.get('id').split(":")[0]
                    pages = cross_ref.get('pages')

                    # some pages collections have 0 elements
                    if pages is None or len(pages) == 0:
                        continue
                    if 'sequence_targeting_reagent' in pages:
                        page = 'sequence_targeting_reagent'
                        mod_global_cross_ref_url = ETLHelper.get_page_complete_url(
                                local_crossref_id,
                                self.xref_url_map,
                                prefix,
                                page)

            sqtr_dataset = {
                "primaryId": sqtr_record.get('primaryId'),
                "name": sqtr_record.get('name'),
                "globalId": global_id,
                "localId": local_id,
                "soTerm": sqtr_record.get('soTermId'),
                "taxonId": sqtr_record.get('taxonId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "modGlobalCrossRefUrl": mod_global_cross_ref_url,
                "dataProvider": data_provider
            }
            sqtrs.append(sqtr_dataset)

            if counter == batch_size:
                yield [sqtrs, sqtr_secondary_ids, sqtr_synonyms, tgs]
                sqtrs = []
                sqtr_secondary_ids = []
                sqtr_synonyms = []
                tgs = []
                counter = 0

        if counter > 0:
            yield [sqtrs, sqtr_secondary_ids, sqtr_synonyms, tgs]
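For context, a hypothetical sqtr_record (identifiers invented for illustration) that would exercise each branch of the loop above looks roughly like:

    sqtr_record = {
        "primaryId": "ZFIN:ZDB-TALEN-000000-1",
        "name": "example-talen",
        "taxonId": "NCBITaxon:7955",
        "secondaryIds": ["ZFIN:ZDB-ALT-000000-2"],
        "synonyms": ["example talen 1"],
        "targetGeneIds": ["ZFIN:ZDB-GENE-000000-3"],
        "crossReferences": [
            {"id": "ZFIN:ZDB-TALEN-000000-1",
             "pages": ["sequence_targeting_reagent"]}
        ]
    }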
Exemple #17
0
class ConstructETL(ETL):
    """Construct ETL"""

    logger = logging.getLogger(__name__)
    xref_url_map = ResourceDescriptorHelper().get_data()

    # Query templates which take params and will be processed later

    construct_query_template = """
          USING PERIODIC COMMIT %s
          LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

              //Create the Construct node and set properties. primaryKey is required.
              MERGE (o:Construct {primaryKey:row.primaryId})
                  ON CREATE SET o.name = row.name,
                   o.dateProduced = row.dateProduced,
                   o.release = row.release,
                   o.localId = row.localId,
                   o.globalId = row.globalId,
                   o.uuid = row.uuid,
                   o.nameText = row.nameText,
                   o.modCrossRefCompleteUrl = row.modGlobalCrossRefId,
                   o.dataProviders = row.dataProviders,
                   o.dataProvider = row.dataProvider,
                   o.symbol = row.symbol

            """

    construct_secondary_ids_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (f:Construct {primaryKey:row.data_id})

            MERGE (second:SecondaryId {primaryKey:row.secondary_id})
                SET second.name = row.secondary_id
            MERGE (f)-[aka1:ALSO_KNOWN_AS]->(second) """

    construct_synonyms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (a:Construct {primaryKey:row.data_id})

            MERGE(syn:Synonym {primaryKey:row.synonym})
                SET syn.name = row.synonym
            MERGE (a)-[aka2:ALSO_KNOWN_AS]->(syn) """

    construct_xrefs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Construct {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_text(
    )

    construct_gene_component_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Construct {primaryKey:row.constructID}), (g:Gene {primaryKey:row.componentID})
            CALL apoc.create.relationship(g, row.componentRelation, {}, o) yield rel
            REMOVE rel.noOp"""

    construct_no_gene_component_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Construct {primaryKey:row.constructID}), (g:NonBGIConstructComponent {primaryKey:row.componentSymbol})
            CALL apoc.create.relationship(g, row.componentRelation, {}, o) yield rel
            REMOVE rel.noOp"""

    non_bgi_component_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
        
            MERGE (o:NonBGIConstructComponent {primaryKey:row.componentSymbol})"""
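
    # Illustrative note (not in the original source): the two component templates above rely on
    # apoc.create.relationship so that the relationship type can come from the data itself.
    # For example, a row with componentRelation = "EXPRESSES" is expected to produce
    # (:Gene)-[:EXPRESSES]->(:Construct), which a plain MERGE cannot do with a dynamic
    # relationship type; REMOVE rel.noOp simply consumes the procedure's yielded value.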

    def __init__(self, config):
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, ))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):

        self.logger.info("Loading Construct Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Construct Data: %s",
                         sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, param2); others will be ignored.
        query_template_list = [
            [
                ConstructETL.construct_query_template, commit_size,
                "Construct_data_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_secondary_ids_query_template,
                commit_size, "Construct_secondary_ids_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_synonyms_query_template, commit_size,
                "Construct_synonyms_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_xrefs_query_template, commit_size,
                "Construct_xrefs_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.non_bgi_component_query_template, commit_size,
                "Construct_non_bgi_component_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                ConstructETL.construct_gene_component_query_template,
                commit_size, "Construct_components_gene" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_no_gene_component_query_template,
                commit_size, "Construct_components_no_gene" +
                sub_type.get_data_provider() + ".csv"
            ]
        ]

        # Obtain the generator
        generators = self.get_generators(data, sub_type.get_data_provider(),
                                         batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, construct_data, data_provider, batch_size):
        """Create Generators"""

        data_providers = []
        release = ""
        constructs = []
        construct_synonyms = []
        construct_secondary_ids = []
        cross_reference_list = []
        component_details = []
        component_no_gene_details = []
        non_bgi_components = []

        counter = 0
        date_produced = construct_data['metaData']['dateProduced']

        data_provider_object = construct_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        self.logger.info("DataProvider: " + data_provider)
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []

        load_key = date_produced + data_provider + "_construct"

        # TODO: get SGD to fix their files.

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, self.xref_url_map, data_provider,
                    data_provider_page)

                data_provider_cross_ref_set.append(
                    ETLHelper.get_xref_dict(
                        data_provider, data_provider, data_provider_page,
                        data_provider_page, data_provider,
                        cross_ref_complete_url,
                        data_provider + data_provider_page))

                data_providers.append(data_provider)
                self.logger.info("data provider: %s", data_provider)

        if 'release' in construct_data['metaData']:
            release = construct_data['metaData']['release']

        for construct_record in construct_data['data']:

            counter = counter + 1
            global_id = construct_record['primaryId']
            local_id = global_id.split(":")[1]
            mod_global_cross_ref_id = ""

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            name_text = TextProcessingHelper.cleanhtml(
                construct_record.get('name'))

            construct_dataset = {
                "symbol": construct_record.get('name'),
                "primaryId": construct_record.get('primaryId'),
                "globalId": global_id,
                "localId": local_id,
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "release": release,
                "modGlobalCrossRefId": mod_global_cross_ref_id,
                "uuid": str(uuid.uuid4()),
                "dataProvider": data_provider,
                "nameText": name_text,
                "name": construct_record.get('name')
            }
            constructs.append(construct_dataset)

            if 'crossReferences' in construct_record:

                for cross_ref in construct_record.get('crossReferences'):
                    cross_ref_id = cross_ref.get('id')
                    local_crossref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref.get('id').split(":")[0]
                    pages = cross_ref.get('pages')

                    # some pages collections have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            if page == 'construct':
                                mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                    local_crossref_id, self.xref_url_map,
                                    prefix, page)
                                xref = ETLHelper.get_xref_dict(
                                    local_crossref_id, prefix, page, page,
                                    cross_ref_id, mod_global_cross_ref_id,
                                    cross_ref_id + page)
                                xref['dataId'] = global_id
                                cross_reference_list.append(xref)

            if 'constructComponents' in construct_record:
                for component in construct_record.get('constructComponents'):
                    component_relation = component.get(
                        'componentRelation').upper()
                    component_symbol = component.get('componentSymbol')
                    component_id = component.get('componentID')

                    if component_id is not None:
                        component_detail = {
                            "componentRelation": component_relation.upper(),
                            "componentSymbol": component_symbol,
                            "componentID": component_id,
                            "constructID": construct_record.get('primaryId')
                        }
                        component_details.append(component_detail)
                    else:
                        component_detail = {
                            "componentRelation": component_relation.upper(),
                            "componentSymbol": component_symbol,
                            "constructID": construct_record.get('primaryId')
                        }
                        non_bgi_component = {
                            "componentSymbol": component_symbol
                        }
                        non_bgi_components.append(non_bgi_component)
                        component_no_gene_details.append(component_detail)

            if 'synonyms' in construct_record:
                for syn in construct_record.get('synonyms'):
                    construct_synonym = {
                        "data_id": construct_record.get('primaryId'),
                        "synonym": syn.strip()
                    }
                    construct_synonyms.append(construct_synonym)

            if 'secondaryIds' in construct_record:
                for secondary_id in construct_record.get('secondaryIds'):
                    construct_secondary_id = {
                        "data_id": construct_record.get('primaryId'),
                        "secondary_id": secondary_id
                    }
                    construct_secondary_ids.append(construct_secondary_id)

            if counter == batch_size:
                yield [
                    constructs, construct_secondary_ids, construct_synonyms,
                    cross_reference_list, non_bgi_components,
                    component_details, component_no_gene_details
                ]
                constructs = []
                construct_secondary_ids = []
                construct_synonyms = []
                cross_reference_list = []
                non_bgi_components = []
                component_details = []
                component_no_gene_details = []
                counter = 0

        if counter > 0:
            yield [
                constructs, construct_secondary_ids, construct_synonyms,
                cross_reference_list, non_bgi_components, component_details,
                component_no_gene_details
            ]
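Taken together, ConstructETL._process_sub_type above shows the pipeline shared by these loaders: the generator turns submission JSON into row dictionaries, CSVTransactor writes one CSV per registered query template, and Neo4jTransactor replays the filled-in templates against those CSVs. A compact sketch of that flow, using stand-in variable names:

    etl = ConstructETL(config)
    generators = etl.get_generators(data, data_provider, batch_size)        # JSON records -> row dicts
    query_and_file_list = etl.process_query_params(query_template_list)     # templates -> query/CSV pairs
    CSVTransactor.save_file_static(generators, query_and_file_list)         # row dicts -> CSV files
    Neo4jTransactor.execute_query_batch(query_and_file_list)                # LOAD CSV ... into Neo4j
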
class ExpressionAtlasETL(ETL):
    """Expression Atlas ETL."""

    logger = logging.getLogger(__name__)

    # Queries which do not take params and can be used as is

    get_all_gene_primary_to_ensmbl_ids_query = """
        MATCH (g:Gene)-[:CROSS_REFERENCE]-(c:CrossReference)
        WHERE c.prefix = 'ENSEMBL'
        RETURN g.primaryKey, c.localId"""

    get_mod_gene_symbol_to_primary_ids_query = """
        MATCH (g:Gene)
        WHERE g.dataProvider = {parameter}
        RETURN g.primaryKey, g.symbol"""

    get_genes_with_expression_atlas_links_query = """
        MATCH (g:Gene)
        WHERE LOWER(g.primaryKey) IN {parameter}
        RETURN g.primaryKey, g.modLocalId"""

    # Query templates which take params and will be processed later

    add_expression_atlas_crossreferences_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        MATCH (o:Gene)
        WHERE o.primaryKey = row.genePrimaryKey
        """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []
        ensg_to_gene_primary_id_map = self._get_primary_gene_ids_to_ensembl_ids(
        )

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(
                target=self._process_sub_type,
                args=(sub_type, ensg_to_gene_primary_id_map))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    @staticmethod
    def _get_primary_gene_ids_to_ensembl_ids():
        return_set = Neo4jHelper.run_single_query(
            ExpressionAtlasETL.get_all_gene_primary_to_ensmbl_ids_query)
        return {
            record["c.localId"].lower(): record["g.primaryKey"]
            for record in return_set
        }

    @staticmethod
    def _get_mod_gene_symbol_to_primary_ids(data_provider):
        return_set = Neo4jHelper.run_single_parameter_query(
            ExpressionAtlasETL.get_mod_gene_symbol_to_primary_ids_query,
            data_provider)
        return {
            record["g.symbol"].lower(): record["g.primaryKey"]
            for record in return_set
        }

    # Returns only pages for genes that we have in the Alliance
    def _get_expression_atlas_gene_pages(self, sub_type, data_provider,
                                         ensg_to_gene_primary_id_map):
        filepath = sub_type.get_filepath()
        gene_symbol_to_primary_id_map = self._get_mod_gene_symbol_to_primary_ids(
            data_provider)

        expression_atlas_gene_pages = {}
        with open(filepath) as file_handle:
            doc = xmltodict.parse(file_handle.read())["urlset"]
            for value in doc.values():
                if isinstance(value, (list, )):
                    for element in value:
                        url = element['loc']
                        expression_atlas_gene = url.split("/")[-1]
                        expression_atlas_gene = expression_atlas_gene.lower()
                        if expression_atlas_gene in ensg_to_gene_primary_id_map:
                            expression_atlas_gene_pages[
                                ensg_to_gene_primary_id_map[
                                    expression_atlas_gene].lower()] = url
                        elif expression_atlas_gene in gene_symbol_to_primary_id_map:
                            expression_atlas_gene_pages[
                                gene_symbol_to_primary_id_map[
                                    expression_atlas_gene].lower()] = url
                        else:
                            alliance_gene = data_provider + ":" + expression_atlas_gene
                            expression_atlas_gene_pages[
                                alliance_gene.lower()] = url

        return expression_atlas_gene_pages
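
    # Illustrative note (assumed input shape): the sitemap parsed above is expected to look
    # roughly like
    #     <urlset><url><loc>https://www.ebi.ac.uk/gxa/genes/ENSG00000141510</loc></url>...</urlset>
    # so the last path segment of each <loc> is either an Ensembl gene id or a MOD gene symbol,
    # which is why both lookup maps are consulted before falling back to "<data_provider>:<gene>".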

    def _process_sub_type(self, sub_type, ensg_to_gene_primary_id_map):

        data_provider = sub_type.get_data_provider()
        expression_atlas_gene_pages = self._get_expression_atlas_gene_pages(
            sub_type, data_provider, ensg_to_gene_primary_id_map)

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(expression_atlas_gene_pages,
                                         data_provider, batch_size)

        query_template_list = [
            [
                self.add_expression_atlas_crossreferences_query_template,
                commit_size, "expression_atlas_" + data_provider + "_data.csv"
            ],
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("ExpAtlas-{}: ".format(
            sub_type.get_data_provider()))

    def get_generators(self, expression_atlas_gene_pages, data_provider,
                       batch_size):
        """Get Generators."""
        return_set = Neo4jHelper.run_single_parameter_query(
            ExpressionAtlasETL.get_genes_with_expression_atlas_links_query,
            list(expression_atlas_gene_pages.keys()))

        counter = 0
        cross_reference_list = []
        for record in return_set:
            counter += 1
            cross_reference = ETLHelper.get_xref_dict(
                record["g.primaryKey"].split(":")[1], "ExpressionAtlas_gene",
                "gene/expression-atlas", "gene/expressionAtlas",
                record["g.modLocalId"],
                expression_atlas_gene_pages[record["g.primaryKey"].lower()],
                data_provider + ":" + record["g.modLocalId"] +
                "gene/expression-atlas")
            cross_reference["genePrimaryKey"] = record["g.primaryKey"]
            cross_reference_list.append(cross_reference)
            if counter > batch_size:
                yield [cross_reference_list]
                counter = 0
                cross_reference_list = []

        if counter > 0:
            yield [cross_reference_list]

    def get_generators(self, disease_data, batch_size, data_provider):
        """Create generators."""

        counter = 0
        disease_association_type = None
        gene_list_to_yield = []
        allele_list_to_yield = []
        agm_list_to_yield = []
        evidence_code_list_to_yield = []
        withs = []
        pge_list_to_yield = []
        xrefs = []
        data_provider_object = disease_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')

        for disease_record in disease_data['data']:

            publication_mod_id = ""
            pub_med_id = ""
            pub_mod_url = None
            pub_med_url = None
            pge_key = ''

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    disease_record.get('objectId'))
                if is_it_test_entry is False:
                    continue

            disease_unique_key = disease_record.get('objectId') + disease_record.get('DOid') + \
                                 disease_record['objectRelation'].get("associationType").upper()

            counter = counter + 1
            disease_object_type = disease_record['objectRelation'].get(
                "objectType")

            primary_id = disease_record.get('objectId')
            do_id = disease_record.get('DOid')

            if 'evidence' in disease_record:
                pecj_primary_key = str(uuid.uuid4())
                evidence = disease_record.get('evidence')
                if 'publication' in evidence:
                    publication = evidence.get('publication')
                    if publication.get('publicationId').startswith('PMID:'):
                        pub_med_id = publication.get('publicationId')
                        local_pub_med_id = pub_med_id.split(":")[1]
                        pub_med_url = ETLHelper.get_complete_pub_url(
                            local_pub_med_id, pub_med_id)
                        if 'crossReference' in evidence:
                            pub_xref = evidence.get('crossReference')
                            publication_mod_id = pub_xref.get('id')
                            local_pub_mod_id = publication_mod_id.split(":")[1]
                            pub_mod_url = ETLHelper.get_complete_pub_url(
                                local_pub_mod_id, publication_mod_id)
                    else:
                        publication_mod_id = publication.get('publicationId')
                        local_pub_mod_id = publication_mod_id.split(":")[1]
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            local_pub_mod_id, publication_mod_id)

                if 'evidenceCodes' in disease_record['evidence']:
                    for ecode in disease_record['evidence'].get(
                            'evidenceCodes'):
                        ecode_map = {
                            "pecjPrimaryKey": pecj_primary_key,
                            "ecode": ecode
                        }
                        evidence_code_list_to_yield.append(ecode_map)

            negation = ''
            if 'objectRelation' in disease_record:
                disease_association_type = disease_record[
                    'objectRelation'].get("associationType").upper()
                if 'negation' in disease_record:
                    # this capitalization is purposeful
                    if disease_association_type == 'IS_IMPLICATED_IN':
                        disease_association_type = 'IS_NOT_IMPLICATED_IN'
                    if disease_association_type == 'IS_MODEL_OF':
                        disease_association_type = 'IS_NOT_MODEL_OF'
                    if disease_association_type == 'IS_MARKER_FOR':
                        disease_association_type = 'IS_NOT_MARKER_FOR'
                    negation = 'NOT'
                    disease_unique_key = disease_unique_key + negation

                additional_genetic_components = []

                if 'additionalGeneticComponents' in disease_record[
                        'objectRelation']:
                    for component in disease_record['objectRelation'][
                            'additionalGeneticComponents']:
                        component_symbol = component.get('componentSymbol')
                        component_id = component.get('componentId')
                        component_url = component.get(
                            'componentUrl') + component_id
                        additional_genetic_components.append({
                            "id": component_id,
                            "componentUrl": component_url,
                            "componentSymbol": component_symbol
                        })

            if 'with' in disease_record:
                with_record = disease_record.get('with')
                for rec in with_record:
                    disease_unique_key = disease_unique_key + rec
                for rec in with_record:
                    with_map = {
                        "diseaseUniqueKey": disease_unique_key,
                        "withD": rec
                    }
                    withs.append(with_map)

            if 'primaryGeneticEntityIDs' in disease_record:

                pge_ids = disease_record.get('primaryGeneticEntityIDs')

                for pge in pge_ids:
                    pge_key = pge_key + pge
                    pge_map = {
                        "pecjPrimaryKey": pecj_primary_key,
                        "pgeId": pge
                    }
                    pge_list_to_yield.append(pge_map)

            if 'dataProvider' in disease_record:
                for dp in disease_record['dataProvider']:
                    annotation_type = dp.get('type')
                    xref = dp.get('crossReference')
                    cross_ref_id = xref.get('id')
                    pages = xref.get('pages')

                    if ":" in cross_ref_id:
                        local_crossref_id = cross_ref_id.split(":")[1]
                        prefix = cross_ref_id.split(":")[0]
                    else:
                        local_crossref_id = ""
                        prefix = cross_ref_id

                    if annotation_type is None:
                        annotation_type = 'curated'

                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            if (data_provider == 'RGD' or data_provider
                                    == 'HUMAN') and prefix == 'DOID':
                                display_name = 'RGD'
                            elif (data_provider == 'RGD' or data_provider
                                  == 'HUMAN') and prefix == 'OMIM':
                                display_name = 'OMIM'
                            else:
                                display_name = cross_ref_id.split(":")[0]
                                if display_name == 'DOID':
                                    display_name = data_provider

                            mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                local_crossref_id, self.xref_url_map, prefix,
                                page)
                            passing_xref = ETLHelper.get_xref_dict(
                                local_crossref_id, prefix, page, page,
                                display_name, mod_global_cross_ref_id,
                                cross_ref_id + page + annotation_type)
                            passing_xref['dataId'] = disease_unique_key

                            if 'loaded' in annotation_type:
                                passing_xref['loadedDB'] = 'true'
                                passing_xref['curatedDB'] = 'false'
                            else:
                                passing_xref['curatedDB'] = 'true'
                                passing_xref['loadedDB'] = 'false'

                            xrefs.append(passing_xref)

            disease_record = {
                "diseaseUniqueKey": disease_unique_key,
                "doId": do_id,
                "primaryId": primary_id,
                "pecjPrimaryKey": pecj_primary_key,
                "relationshipType": disease_association_type.upper(),
                "dataProvider": data_provider,
                "dateAssigned": disease_record.get("dateAssigned"),
                "pubPrimaryKey": publication_mod_id + pub_med_id,
                "pubModId": publication_mod_id,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModUrl": pub_mod_url,
                "negation": negation
            }

            if disease_object_type == 'gene':
                gene_list_to_yield.append(disease_record)
            elif disease_object_type == 'allele':
                allele_list_to_yield.append(disease_record)
            else:
                agm_list_to_yield.append(disease_record)

            if counter == batch_size:
                yield [
                    allele_list_to_yield, gene_list_to_yield,
                    agm_list_to_yield, pge_list_to_yield, pge_list_to_yield,
                    pge_list_to_yield, withs, evidence_code_list_to_yield,
                    xrefs
                ]
                agm_list_to_yield = []
                allele_list_to_yield = []
                gene_list_to_yield = []
                evidence_code_list_to_yield = []
                pge_list_to_yield = []
                xrefs = []
                withs = []
                counter = 0

        if counter > 0:
            yield [
                allele_list_to_yield, gene_list_to_yield, agm_list_to_yield,
                pge_list_to_yield, pge_list_to_yield, pge_list_to_yield, withs,
                evidence_code_list_to_yield, xrefs
            ]
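
The batching pattern above recurs in every loader in this document: per-target lists are filled record by record and, once the counter reaches the batch size, they are yielded together as a single list (in the same order as the query templates) and then cleared. Below is a minimal, self-contained sketch of that pattern; the record fields and the two accumulator names are hypothetical and not part of the Alliance loader API.

def batched_generator(records, batch_size):
    """Accumulate row dicts and yield them in fixed-size batches.

    Mirrors the generators above: one list per downstream CSV/query
    template, yielded together, then reset along with the counter.
    """
    primary_rows = []   # hypothetical: rows for the first query template
    xref_rows = []      # hypothetical: rows for the second query template
    counter = 0

    for record in records:
        counter += 1
        primary_rows.append({"primaryId": record["id"]})
        if "xref" in record:
            xref_rows.append({"dataId": record["id"], "xref": record["xref"]})

        if counter == batch_size:
            yield [primary_rows, xref_rows]
            primary_rows, xref_rows = [], []
            counter = 0

    if counter > 0:
        yield [primary_rows, xref_rows]
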
Exemple #20
    def get_generators(self, variant_data, batch_size):  # noqa
        """Get Generators."""

        data_providers = []
        release = ""
        variants = []
        variant_genomic_locations = []
        variant_so_terms = []
        cross_references = []
        counter = 0
        date_produced = variant_data['metaData']['dateProduced']

        self.data_providers_process(variant_data)
        load_key = date_produced + self.data_provider + "_VARIATION"

        if 'release' in variant_data['metaData']:
            release = variant_data['metaData']['release']

        assemblies = {}
        for allele_record in variant_data['data']:
            chromosome = allele_record["chromosome"]
            if chromosome.startswith("chr"):
                chromosome_str = chromosome[3:]
            else:
                chromosome_str = chromosome

            assembly = allele_record["assembly"]

            if assembly not in assemblies:
                self.logger.info(assembly)
                context_info = ContextInfo()
                data_manager = DataFileManager(
                    context_info.config_file_location)
                assemblies[assembly] = AssemblySequenceHelper(
                    assembly, data_manager)

            so_term_id = allele_record.get('type')
            genomic_reference_sequence = allele_record.get(
                'genomicReferenceSequence')
            genomic_variant_sequence = allele_record.get(
                'genomicVariantSequence')

            if genomic_reference_sequence == 'N/A':
                genomic_reference_sequence = ""
            if genomic_variant_sequence == 'N/A':
                genomic_variant_sequence = ""

            padding_left = ""
            padding_right = ""
            if allele_record.get('start') != "" and allele_record.get(
                    'end') != "":

                # not insertion
                if so_term_id != "SO:0000667" and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                    genomic_reference_sequence = assemblies[
                        assembly].get_sequence(chromosome_str,
                                               allele_record.get('start'),
                                               allele_record.get('end'))

                if allele_record.get('start') < allele_record.get('end'):
                    start = allele_record.get('start')
                    end = allele_record.get('end')
                else:
                    start = allele_record.get('end')
                    end = allele_record.get('start')

                padding_width = 500
                if so_term_id != "SO:0000667":  # not insertion
                    start = start - 1
                    end = end + 1

                left_padding_start = start - padding_width
                if left_padding_start < 1:
                    left_padding_start = 1

                padding_left = assemblies[assembly].get_sequence(
                    chromosome_str, left_padding_start, start)
                right_padding_end = end + padding_width
                padding_right = assemblies[assembly].get_sequence(
                    chromosome_str, end, right_padding_end)
            counter = counter + 1
            global_id = allele_record.get('alleleId')
            mod_global_cross_ref_id = ""
            cross_references = []

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            cross_ref_primary_id = allele_record.get(
                'sequenceOfReferenceAccessionNumber')
            local_cross_ref_id = cross_ref_primary_id.split(":")[1]
            prefix = cross_ref_primary_id.split(":")[0]

            cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                local_cross_ref_id, prefix, global_id)
            xref_map = ETLHelper.get_xref_dict(
                local_cross_ref_id, prefix, "variant_sequence_of_reference",
                "sequence_of_reference_accession_number", global_id,
                cross_ref_complete_url,
                cross_ref_primary_id + "variant_sequence_of_reference")

            xref_map['dataId'] = global_id
            if cross_ref_primary_id is not None:
                cross_references.append(xref_map)

            if genomic_reference_sequence is not None:
                if len(genomic_reference_sequence) > 1000 and (
                        allele_record.get('type') == 'SO:1000002'
                        or allele_record.get('type') == 'SO:1000008'):
                    self.logger.debug("%s genomicReferenceSequence",
                                      allele_record.get('alleleId'))

            if genomic_variant_sequence is not None:
                if len(genomic_variant_sequence) > 1000 and (
                        allele_record.get('type')
                        in ['SO:1000002', 'SO:1000008']):
                    self.logger.debug("%s genomicVariantSequence",
                                      allele_record.get('alleleId'))

            hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
                allele_record.get('sequenceOfReferenceAccessionNumber'),
                allele_record.get('type'), allele_record.get('start'),
                allele_record.get('end'),
                genomic_reference_sequence, genomic_variant_sequence,
                allele_record.get('assembly'), chromosome_str)

            if (genomic_reference_sequence is not None and len(genomic_reference_sequence) > 30000) \
                    or (genomic_variant_sequence is not None and len(genomic_variant_sequence) > 30000):
                self.logger.debug(
                    "%s potentially has too long a sequence",
                    allele_record.get('alleleId'))

            # TODO: fix typo in MGI Submission for this variant so
            # that it doesn't list a 40K bp point mutation.
            if allele_record.get('alleleId') != 'MGI:6113870':

                variant_dataset = {
                    "hgvs_nomenclature": hgvs_nomenclature,
                    "genomicReferenceSequence": genomic_reference_sequence,
                    "genomicVariantSequence": genomic_variant_sequence,
                    "paddingLeft": padding_left,
                    "paddingRight": padding_right,
                    "alleleId": allele_record.get('alleleId'),
                    "dataProviders": data_providers,
                    "dateProduced": date_produced,
                    "loadKey": load_key,
                    "release": release,
                    "modGlobalCrossRefId": mod_global_cross_ref_id,
                    "dataProvider": self.data_provider,
                    "variantHGVSSynonym": hgvs_synonym
                }

                variant_genomic_location_dataset = {
                    "variantId": hgvs_nomenclature,
                    "assembly": allele_record.get('assembly'),
                    "chromosome": chromosome_str,
                    "start": allele_record.get('start'),
                    "end": allele_record.get('end'),
                    "uuid": str(uuid.uuid4()),
                    "dataProvider": self.data_provider
                }

                variant_so_term = {
                    "variantId": hgvs_nomenclature,
                    "soTermId": allele_record.get('type')
                }

                variant_so_terms.append(variant_so_term)
                variant_genomic_locations.append(
                    variant_genomic_location_dataset)
                variants.append(variant_dataset)

            if counter == batch_size:
                yield [
                    variants, variant_genomic_locations, variant_so_terms,
                    cross_references
                ]
                variants = []
                variant_genomic_locations = []
                variant_so_terms = []
                cross_references = []
                counter = 0

        if counter > 0:
            yield [
                variants, variant_genomic_locations, variant_so_terms,
                cross_references
            ]
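
The flanking-sequence logic above widens a non-insertion variant by one base on each side and then asks the assembly helper for up to 500 bp of padding, clamping the left edge at position 1. A small sketch of just the window arithmetic, assuming 1-based coordinates as in the code above (the real sequence fetch goes through AssemblySequenceHelper.get_sequence and is not reproduced here):

def padding_window(start, end, padding_width=500):
    """Return (left_start, left_end, right_start, right_end) for the
    flanking-sequence lookups, swapping reversed coordinates and clamping
    the left edge so it never drops below position 1."""
    if start > end:
        start, end = end, start
    left_start = max(start - padding_width, 1)
    return left_start, start, end, end + padding_width

# A variant starting at position 120 has its left padding window clamped
# at position 1 rather than extending the full 500 bases back.
print(padding_window(120, 150))   # (1, 120, 150, 650)
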
Exemple #21
class DiseaseETL(ETL):
    """Disease ETL."""

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later

    execute_annotation_xrefs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:DiseaseEntityJoin:Association {primaryKey:row.dataId})
        """ + ETLHelper.get_cypher_xref_text_annotation_level()

    execute_agms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            // GET PRIMARY DATA OBJECTS

            MATCH (d:DOTerm:Ontology {primaryKey:row.doId})
            MATCH (agm:AffectedGenomicModel {primaryKey:row.primaryId})

            CALL apoc.create.relationship(d, row.relationshipType, {}, agm) yield rel
            SET rel.uuid = row.diseaseUniqueKey
            REMOVE rel.noOp

            //This is an intentional MERGE, please leave as is

            MERGE (dfa:Association:DiseaseEntityJoin {primaryKey:row.diseaseUniqueKey})
                ON CREATE SET dfa.dataProvider = row.dataProvider,
                              dfa.sortOrder = 1,
                              dfa.joinType = row.relationshipType,
                              dfa.negation = row.negation

            MERGE (agm)-[fdaf:ASSOCIATION]->(dfa)
            MERGE (dfa)-[dadf:ASSOCIATION]->(d)

            // PUBLICATIONS FOR FEATURE

            MERGE (pubf:Publication {primaryKey:row.pubPrimaryKey})
                ON CREATE SET pubf.pubModId = row.pubModId,
                 pubf.pubMedId = row.pubMedId,
                 pubf.pubModUrl = row.pubModUrl,
                 pubf.pubMedUrl = row.pubMedUrl

            MERGE (pubEJ:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey})
                ON CREATE SET pubEJ.joinType = 'pub_evidence_code_join',
                                pubEJ.dateAssigned = row.dateAssigned

            MERGE (dfa)-[dapug:EVIDENCE {uuid:row.pecjPrimaryKey}]->(pubEJ)

            MERGE (pubf)-[pubfpubEJ:ASSOCIATION {uuid:row.pecjPrimaryKey}]->(pubEJ)
            """

    execute_allele_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            // GET PRIMARY DATA OBJECTS

            MATCH (d:DOTerm:Ontology {primaryKey:row.doId})
            MATCH (allele:Allele:Feature {primaryKey:row.primaryId})

            CALL apoc.create.relationship(d, row.relationshipType, {}, allele) yield rel
                        SET rel.uuid = row.diseaseUniqueKey
            REMOVE rel.noOp

            //This is an intentional MERGE, please leave as is

            MERGE (dfa:Association:DiseaseEntityJoin {primaryKey:row.diseaseUniqueKey})
                ON CREATE SET dfa.dataProvider = row.dataProvider,
                              dfa.sortOrder = 1,
                              dfa.joinType = row.relationshipType,
                              dfa.negation = row.negation

            MERGE (allele)-[fdaf:ASSOCIATION]->(dfa)
            MERGE (dfa)-[dadf:ASSOCIATION]->(d)

            // PUBLICATIONS FOR FEATURE

            MERGE (pubf:Publication {primaryKey:row.pubPrimaryKey})
                ON CREATE SET pubf.pubModId = row.pubModId,
                 pubf.pubMedId = row.pubMedId,
                 pubf.pubModUrl = row.pubModUrl,
                 pubf.pubMedUrl = row.pubMedUrl

            MERGE (pubEJ:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey})
                ON CREATE SET pubEJ.joinType = 'pub_evidence_code_join',
                                pubEJ.dateAssigned = row.dateAssigned

            MERGE (dfa)-[dapug:EVIDENCE {uuid:row.pecjPrimaryKey}]->(pubEJ)

            MERGE (pubf)-[pubfpubEJ:ASSOCIATION {uuid:row.pecjPrimaryKey}]->(pubEJ)"""

    execute_gene_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (d:DOTerm:Ontology {primaryKey:row.doId})
            MATCH (gene:Gene {primaryKey:row.primaryId})

            CALL apoc.create.relationship(d, row.relationshipType, {}, gene) yield rel
                        SET rel.uuid = row.diseaseUniqueKey
            REMOVE rel.noOp

            MERGE (dga:Association:DiseaseEntityJoin {primaryKey:row.diseaseUniqueKey})
                SET dga.dataProvider = row.dataProvider,
                    dga.sortOrder = 1,
                    dga.joinType = row.relationshipType,
                    dga.negation = row.negation


            MERGE (gene)-[fdag:ASSOCIATION]->(dga)
            MERGE (dga)-[dadg:ASSOCIATION]->(d)

            // PUBLICATIONS FOR GENE

            MERGE (pubg:Publication {primaryKey:row.pubPrimaryKey})
                ON CREATE SET pubg.pubModId = row.pubModId,
                    pubg.pubMedId = row.pubMedId,
                    pubg.pubModUrl = row.pubModUrl,
                    pubg.pubMedUrl = row.pubMedUrl

            MERGE (pubEJ:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey})
            ON CREATE SET pubEJ.joinType = 'pub_evidence_code_join',
                                pubEJ.dateAssigned = row.dateAssigned

            MERGE (dga)-[dapug:EVIDENCE {uuid:row.pecjPrimaryKey}]->(pubEJ)
            MERGE (pubg)-[pubgpubEJ:ASSOCIATION {uuid:row.pecjPrimaryKey}]->(pubEJ)"""

    execute_ecode_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Ontology:ECOTerm {primaryKey:row.ecode})
            MATCH (pubjk:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey})
            MERGE (pubjk)-[daecode1g:ASSOCIATION {uuid:row.pecjPrimaryKey}]->(o)"""

    execute_withs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (dga:Association:DiseaseEntityJoin {primaryKey:row.diseaseUniqueKey})

            MATCH (diseaseWith:Gene {primaryKey:row.withD})
            MERGE (dga)-[dgaw:FROM_ORTHOLOGOUS_GENE]-(diseaseWith) """

    execute_pges_gene_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (n:Gene {primaryKey:row.pgeId})
            MATCH (d:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey})

            MERGE (d)-[dgaw:PRIMARY_GENETIC_ENTITY]-(n)"""

    execute_pges_allele_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (n:Allele {primaryKey:row.pgeId})
            MATCH (d:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey})

            MERGE (d)-[dgaw:PRIMARY_GENETIC_ENTITY]-(n)"""

    execute_pges_agm_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (n:AffectedGenomicModel {primaryKey:row.pgeId})
            MATCH (d:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey})

            MERGE (d)-[dgaw:PRIMARY_GENETIC_ENTITY]-(n)"""

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config
        self.disease_unique_key = None
        self.disease_association_type = None

    def _load_and_process_data(self):
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type, args=(sub_type,))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

        self.delete_empty_nodes()

    def delete_empty_nodes(self):
        """Delete Empty Nodes."""
        self.logger.debug("delete empty nodes")

        delete_empty_do_nodes_query = """
                MATCH (dd:DOTerm)
                WHERE keys(dd)[0] = 'primaryKey'
                      AND size(keys(dd)) = 1
                DETACH DELETE (dd)"""

        Neo4jHelper.run_single_query(delete_empty_do_nodes_query)

    def _process_sub_type(self, sub_type):

        self.logger.info("Loading Disease Data: %s", sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Disease Data: %s", sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, param2); other entries will be ignored
        query_template_list = [
            [self.execute_allele_query_template, commit_size,
             "disease_allele_data_" + sub_type.get_data_provider() + ".csv"],
            [self.execute_gene_query_template, commit_size,
             "disease_gene_data_" + sub_type.get_data_provider() + ".csv"],
            [self.execute_agms_query_template, commit_size,
             "disease_agms_data_" + sub_type.get_data_provider() + ".csv"],
            [self.execute_pges_gene_query_template, commit_size,
             "disease_pges_gene_data_" + sub_type.get_data_provider() + ".csv"],
            [self.execute_pges_allele_query_template, commit_size,
             "disease_pges_allele_data_" + sub_type.get_data_provider() + ".csv"],
            [self.execute_pges_agm_query_template, commit_size,
             "disease_pges_agms_data_" + sub_type.get_data_provider() + ".csv"],
            [self.execute_withs_query_template, commit_size,
             "disease_withs_data_" + sub_type.get_data_provider() + ".csv"],
            [self.execute_ecode_query_template, commit_size,
             "disease_evidence_code_data_" + sub_type.get_data_provider() + ".csv"],
            [self.execute_annotation_xrefs_query_template, commit_size,
             "disease_annotation_xrefs_data_" + sub_type.get_data_provider() + ".csv"]
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size, sub_type.get_data_provider())

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Disease-{}: ".format(sub_type.get_data_provider()))
        self.logger.info("Finished Loading Disease Data: %s", sub_type.get_data_provider())

    def process_pages(self, dp, xrefs, pages):
        """Process pages to get xrefs."""
        annotation_type = dp.get('type')
        xref = dp.get('crossReference')
        cross_ref_id = xref.get('id')
        if ":" in cross_ref_id:
            local_crossref_id = cross_ref_id.split(":")[1]
            prefix = cross_ref_id.split(":")[0]
        else:
            local_crossref_id = ""
            prefix = cross_ref_id

        if annotation_type is None:
            annotation_type = 'curated'

        for page in pages:
            if (self.data_provider == 'RGD' or self.data_provider == 'HUMAN') and prefix == 'DOID':
                display_name = 'RGD'
            elif (self.data_provider == 'RGD' or self.data_provider == 'HUMAN') and prefix == 'OMIM':
                display_name = 'OMIM'
            else:
                display_name = cross_ref_id.split(":")[0]
                if display_name == 'DOID':
                    display_name = self.data_provider

            mod_global_cross_ref_url = self.etlh.rdh2.return_url_from_key_value(
                prefix, local_crossref_id, page)
            passing_xref = ETLHelper.get_xref_dict(
                local_crossref_id, prefix, page, page,
                display_name, mod_global_cross_ref_url,
                cross_ref_id + page + annotation_type)
            passing_xref['dataId'] = self.disease_unique_key

            if 'loaded' in annotation_type:
                passing_xref['loadedDB'] = 'true'
                passing_xref['curatedDB'] = 'false'
            else:
                passing_xref['curatedDB'] = 'true'
                passing_xref['loadedDB'] = 'false'

            xrefs.append(passing_xref)

    def xrefs_process(self, disease_record, xrefs):
        """Process the xrefs."""
        if 'dataProvider' not in disease_record:
            return

        for dp in disease_record['dataProvider']:
            xref = dp.get('crossReference')
            pages = xref.get('pages')

            if pages is None or len(pages) == 0:
                continue
            self.process_pages(dp, xrefs, pages)

    def evidence_process(self, disease_record, pubs, evidence_code_list_to_yield):
        """Process evidence."""
        pecj_primary_key = str(uuid.uuid4())
        if 'evidence' not in disease_record:
            self.logger.critical("No evidence but creating new pecj_primary_key anyway")
            return pecj_primary_key
        evidence = disease_record.get('evidence')
        if 'publication' in evidence:
            publication = evidence.get('publication')
            if publication.get('publicationId').startswith('PMID:'):
                pubs['pub_med_id'] = publication.get('publicationId')
                pubs['pub_med_url'] = self.etlh.return_url_from_identifier(pubs['pub_med_id'])
                if 'crossReference' in evidence:
                    pub_xref = evidence.get('crossReference')
                    pubs['publication_mod_id'] = pub_xref.get('id')
                    pubs['pub_mod_url'] = self.etlh.return_url_from_identifier(pubs['publication_mod_id'])
            else:
                pubs['publication_mod_id'] = publication.get('publicationId')
                pubs['pub_mod_url'] = self.etlh.return_url_from_identifier(pubs['publication_mod_id'])

        if 'evidenceCodes' in disease_record['evidence']:
            for ecode in disease_record['evidence'].get('evidenceCodes'):
                ecode_map = {"pecjPrimaryKey": pecj_primary_key,
                             "ecode": ecode}
                evidence_code_list_to_yield.append(ecode_map)
        return pecj_primary_key

    def objectrelation_process(self, disease_record):
        """Object Relation processing."""
        negation = ''
        if 'objectRelation' not in disease_record:
            self.logger.critical("objectRelation not in record so disease_annotation_type is the last one seen")
            return negation, None

        if 'negation' in disease_record:
            # this capitalization is purposeful
            if self.disease_association_type == 'IS_IMPLICATED_IN':
                self.disease_association_type = 'IS_NOT_IMPLICATED_IN'
            elif self.disease_association_type == 'IS_MODEL_OF':
                self.disease_association_type = 'IS_NOT_MODEL_OF'
            elif self.disease_association_type == 'IS_MARKER_FOR':
                self.disease_association_type = 'IS_NOT_MARKER_FOR'
            negation = 'NOT'
            self.disease_unique_key = self.disease_unique_key + negation

        return negation
    # Not used anywhere so commented out for now?
    #     additional_genetic_components = []

    #     if 'additionalGeneticComponents' in disease_record['objectRelation']:
    #         for component in disease_record['objectRelation']['additionalGeneticComponents']:
    #             component_symbol = component.get('componentSymbol')
    #             component_id = component.get('componentId')
    #             component_url = component.get('componentUrl') + component_id
    #             additional_genetic_components.append(
    #                 {"id": component_id,
    #                  "componentUrl": component_url,
    #                  "componentSymbol": component_symbol}
    #             )

    def withs_process(self, disease_record, withs):
        """Process withs."""
        if 'with' not in disease_record:
            return
        with_record = disease_record.get('with')
        for rec in with_record:
            self.disease_unique_key = self.disease_unique_key + rec
        for rec in with_record:
            with_map = {
                "diseaseUniqueKey": self.disease_unique_key,
                "withD": rec
                }
            withs.append(with_map)

    def primgenent_process(self, disease_record, pge_list_to_yield, pecj_primary_key):
        """Primary Genetic Entity ID process."""
        if 'primaryGeneticEntityIDs' not in disease_record:
            return

        pge_ids = disease_record.get('primaryGeneticEntityIDs')
        for pge in pge_ids:
            # ? pge_key = pge_key + pge
            pge_map = {"pecjPrimaryKey": pecj_primary_key,
                       "pgeId": pge}
            pge_list_to_yield.append(pge_map)

    def get_generators(self, disease_data, batch_size, data_provider):
        """Create generators."""
        counter = 0
        gene_list_to_yield = []
        allele_list_to_yield = []
        agm_list_to_yield = []
        evidence_code_list_to_yield = []
        withs = []
        pge_list_to_yield = []
        xrefs = []

        self.data_providers_process(disease_data)

        for disease_record in disease_data['data']:

            pubs = {'pub_med_url': None,
                    'pub_med_id': "",
                    'pub_mod_url': None,
                    'publication_mod_id': ""
                    }
            # pge_key = ''

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(disease_record.get('objectId'))
                if is_it_test_entry is False:
                    continue

            self.disease_unique_key = disease_record.get('objectId') + disease_record.get('DOid') + \
                disease_record['objectRelation'].get("associationType").upper()
            self.disease_association_type = disease_record['objectRelation'].get("associationType").upper()
            counter = counter + 1
            disease_object_type = disease_record['objectRelation'].get("objectType")

            primary_id = disease_record.get('objectId')
            do_id = disease_record.get('DOid')

            pecj_primary_key = self.evidence_process(disease_record, pubs, evidence_code_list_to_yield)

            negation = self.objectrelation_process(disease_record)

            self.withs_process(disease_record, withs)
            self.primgenent_process(disease_record, pge_list_to_yield, pecj_primary_key)

            self.xrefs_process(disease_record, xrefs)

            disease_record = {
                "diseaseUniqueKey": self.disease_unique_key,
                "doId": do_id,
                "primaryId": primary_id,
                "pecjPrimaryKey": pecj_primary_key,
                "relationshipType": self.disease_association_type,
                "dataProvider": data_provider,
                "dateAssigned": disease_record.get("dateAssigned"),
                "pubPrimaryKey": pubs['publication_mod_id'] + pubs['pub_med_id'],
                "pubModId": pubs['publication_mod_id'],
                "pubMedId": pubs['pub_med_id'],
                "pubMedUrl": pubs['pub_med_url'],
                "pubModUrl": pubs['pub_mod_url'],
                "negation": negation}

            if disease_object_type == 'gene':
                gene_list_to_yield.append(disease_record)
            elif disease_object_type == 'allele':
                allele_list_to_yield.append(disease_record)
            else:
                agm_list_to_yield.append(disease_record)

            if counter == batch_size:
                yield [allele_list_to_yield,
                       gene_list_to_yield,
                       agm_list_to_yield,
                       pge_list_to_yield,
                       pge_list_to_yield,
                       pge_list_to_yield,
                       withs,
                       evidence_code_list_to_yield,
                       xrefs]
                agm_list_to_yield = []
                allele_list_to_yield = []
                gene_list_to_yield = []
                evidence_code_list_to_yield = []
                pge_list_to_yield = []
                xrefs = []
                withs = []
                counter = 0

        if counter > 0:
            yield [allele_list_to_yield,
                   gene_list_to_yield,
                   agm_list_to_yield,
                   pge_list_to_yield,
                   pge_list_to_yield,
                   pge_list_to_yield,
                   withs,
                   evidence_code_list_to_yield,
                   xrefs]
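
The refactor above threads self.disease_unique_key through several helper methods, which makes the key's composition easy to miss. The sketch below re-assembles it in one place, following the same steps: objectId + DOid + upper-cased association type, then 'NOT' when the annotation is negated, then every 'with' identifier appended in order. The example identifiers are made up.

def build_disease_unique_key(record):
    """Compose the disease annotation key the way get_generators above does."""
    key = (record["objectId"]
           + record["DOid"]
           + record["objectRelation"]["associationType"].upper())
    if "negation" in record:
        key += "NOT"
    for with_id in record.get("with", []):
        key += with_id
    return key

example_record = {
    "objectId": "MGI:0000001",          # hypothetical identifiers
    "DOid": "DOID:14330",
    "objectRelation": {"associationType": "is_implicated_in"},
    "with": ["HGNC:1100"],
}
print(build_disease_unique_key(example_record))
# MGI:0000001DOID:14330IS_IMPLICATED_INHGNC:1100
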
Exemple #22
    def get_generators(self, phenotype_data, batch_size):
        """Get Generators"""

        list_to_yield = []
        pge_list_to_yield = []
        date_produced = phenotype_data['metaData']['dateProduced']
        data_providers = []
        data_provider_object = phenotype_data['metaData']['dataProvider']
        counter = 0
        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []
        pge_key = ''

        load_key = date_produced + data_provider + "_phenotype"

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, ETL.xref_url_map, data_provider,
                    data_provider_page)

                data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(\
                        data_provider,
                        data_provider,
                        data_provider_page,
                        data_provider_page,
                        data_provider,
                        cross_ref_complete_url,
                        data_provider + data_provider_page))

                data_providers.append(data_provider)
                self.logger.debug("data provider: %s", data_provider)

        for pheno in phenotype_data['data']:
            pecj_primary_key = str(uuid.uuid4())
            counter = counter + 1
            pub_med_id = None
            pub_mod_id = None
            pub_med_url = None
            pub_mod_url = None
            primary_id = pheno.get('objectId')
            phenotype_statement = pheno.get('phenotypeStatement')

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    primary_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            evidence = pheno.get('evidence')

            if 'publicationId' in evidence:
                if evidence.get('publicationId').startswith('PMID:'):
                    pub_med_id = evidence['publicationId']
                    local_pub_med_id = pub_med_id.split(":")[1]
                    pub_med_prefix = pub_med_id.split(":")[0]
                    pub_med_url = ETLHelper.get_no_page_complete_url(
                        local_pub_med_id, self.xref_url_map, pub_med_prefix,
                        primary_id)
                    if pub_med_id is None:
                        pub_med_id = ""

                    if 'crossReference' in evidence:
                        pub_xref = evidence.get('crossReference')
                        pub_mod_id = pub_xref.get('id')
                        pub_mod_local_id = pub_mod_id.split(":")[1]
                        if pub_mod_id is not None:
                            pub_mod_url = ETLHelper.get_complete_pub_url(
                                pub_mod_local_id, pub_mod_id)

                else:
                    pub_mod_id = evidence.get('publicationId')
                    if pub_mod_id is not None:
                        pub_mod_local_id = pub_mod_id.split(":")[1]
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            pub_mod_local_id, pub_mod_id)

                if pub_mod_id is None:
                    pub_mod_id = ""

            if pub_med_id is None:
                pub_med_id = ""

            if pub_mod_id is None:
                pub_mod_id = ""

            date_assigned = pheno.get('dateAssigned')

            if pub_mod_id == "" and pub_med_id == "":
                self.logger.info("%s is missing pubMed and pubMod id",
                                 primary_id)

            if 'primaryGeneticEntityIDs' in pheno:
                pge_ids = pheno.get('primaryGeneticEntityIDs')
                for pge in pge_ids:
                    pge_key = pge_key + pge
                    pge_map = {
                        "pecjPrimaryKey": pecj_primary_key,
                        "pgeId": pge
                    }
                    pge_list_to_yield.append(pge_map)

            phenotype = {
                "primaryId": primary_id,
                "phenotypeUniqueKey": primary_id + phenotype_statement.strip(),
                "phenotypeStatement": phenotype_statement.strip(),
                "dateAssigned": date_assigned,
                "loadKey": load_key,
                "type": "gene",
                "dataProviders": data_providers,
                "dataProvider": data_provider,
                "dateProduced": date_produced,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModId": pub_mod_id,
                "pubModUrl": pub_mod_url,
                "pubPrimaryKey": pub_med_id + pub_mod_id,
                "pecjPrimaryKey": pecj_primary_key
            }

            list_to_yield.append(phenotype)

            if counter == batch_size:
                yield [
                    list_to_yield, list_to_yield, list_to_yield,
                    pge_list_to_yield, pge_list_to_yield
                ]
                list_to_yield = []
                pge_list_to_yield = []
                counter = 0

        if counter > 0:
            yield [
                list_to_yield, list_to_yield, list_to_yield, pge_list_to_yield,
                pge_list_to_yield
            ]
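
The evidence handling above distinguishes PMID-prefixed publication ids (treated as PubMed ids, with the MOD id taken from the optional crossReference) from everything else (treated directly as the MOD id). A reduced sketch of just that classification, without the URL lookups; the sample identifiers are hypothetical:

def split_publication_ids(evidence):
    """Split a phenotype evidence block into (pub_med_id, pub_mod_id),
    following the same branching as the loop above."""
    pub_med_id, pub_mod_id = "", ""
    pid = evidence.get("publicationId", "")
    if pid.startswith("PMID:"):
        pub_med_id = pid
        if "crossReference" in evidence:
            pub_mod_id = evidence["crossReference"].get("id", "")
    else:
        pub_mod_id = pid
    return pub_med_id, pub_mod_id

print(split_publication_ids({"publicationId": "PMID:12345",
                             "crossReference": {"id": "ZFIN:ZDB-PUB-000000-1"}}))
# ('PMID:12345', 'ZFIN:ZDB-PUB-000000-1')
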
    def get_generators(self, agm_data, data_provider, batch_size):
        """Get Generators"""

        data_providers = []
        agms = []
        agm_synonyms = []
        agm_secondary_ids = []
        mod_global_cross_ref_url = ""
        components = []
        backgrounds = []
        sqtrs = []

        counter = 0
        date_produced = agm_data['metaData']['dateProduced']

        data_provider_object = agm_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []

        load_key = date_produced + data_provider + "_agm"

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, self.xref_url_map, data_provider,
                    data_provider_page)

                data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(\
                        data_provider,
                        data_provider,
                        data_provider_page,
                        data_provider_page,
                        data_provider,
                        cross_ref_complete_url,
                        data_provider + data_provider_page))

                data_providers.append(data_provider)
                self.logger.info("data provider: %s", data_provider)

        for agm_record in agm_data['data']:
            counter = counter + 1
            global_id = agm_record['primaryID']
            local_id = global_id.split(":")[1]

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            if agm_record.get('secondaryIds') is not None:
                for sid in agm_record.get('secondaryIds'):
                    agm_secondary_id_dataset = {
                        "primaryId": agm_record.get('primaryID'),
                        "secondaryId": sid
                    }
                    agm_secondary_ids.append(agm_secondary_id_dataset)

            if agm_record.get('synonyms') is not None:
                for syn in agm_record.get('synonyms'):
                    syn_dataset = {
                        "primaryId": agm_record.get('primaryID'),
                        "synonym": syn
                    }
                    agm_synonyms.append(syn_dataset)

            if 'crossReference' in agm_record:
                cross_ref = agm_record.get('crossReference')
                cross_ref_id = cross_ref.get('id')
                local_crossref_id = cross_ref_id.split(":")[1]
                prefix = cross_ref.get('id').split(":")[0]
                pages = cross_ref.get('pages')

                # some pages collection have 0 elements
                if pages is not None and len(pages) > 0:
                    for page in pages:
                        if page in ['Fish', 'genotype', 'strain']:
                            mod_global_cross_ref_url = ETLHelper.get_page_complete_url(
                                local_crossref_id, self.xref_url_map, prefix,
                                page)

            short_species_abbreviation = ETLHelper.get_short_species_abbreviation(
                agm_record.get('taxonId'))
            name_text = TextProcessingHelper.cleanhtml(agm_record.get('name'))

            # TODO: make subtype required in submission file.

            subtype = agm_record.get('subtype')
            if subtype is None and data_provider == 'WB':
                subtype = 'strain'
            if subtype is None:
                subtype = 'affected_genomic_model'

            # TODO: name_text
            agm_dataset = {
                "primaryId": agm_record.get('primaryID'),
                "name": agm_record.get('name'),
                "globalId": global_id,
                "localId": local_id,
                "taxonId": agm_record.get('taxonId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "subtype": subtype,
                "modGlobalCrossRefUrl": mod_global_cross_ref_url,
                "dataProvider": data_provider,
                "nameText": name_text,
                "nameWithSpecies": agm_record.get('name') + " (" + short_species_abbreviation + ")",
                "nameTextWithSpecies": name_text + " (" + short_species_abbreviation + ")",
            }
            agms.append(agm_dataset)

            if agm_record.get('affectedGenomicModelComponents') is not None:

                for component in agm_record.get(
                        'affectedGenomicModelComponents'):
                    component_dataset = {
                        "primaryId": agm_record.get('primaryID'),
                        "componentId": component.get('alleleID'),
                        "zygosityId": component.get('zygosity')
                    }
                    components.append(component_dataset)

            if agm_record.get('sequenceTargetingReagentIDs') is not None:
                for sqtr in agm_record.get('sequenceTargetingReagentIDs'):
                    sqtr_dataset = {
                        "primaryId": agm_record.get('primaryID'),
                        "sqtrId": sqtr
                    }
                    sqtrs.append(sqtr_dataset)

            if agm_record.get('parentalPopulationIDs') is not None:
                for background in agm_record.get('parentalPopulationIDs'):
                    background_dataset = {
                        "primaryId": agm_record.get('primaryID'),
                        "backgroundId": background
                    }
                    backgrounds.append(background_dataset)

            if counter == batch_size:
                yield [
                    agms, agm_secondary_ids, agm_synonyms, components, sqtrs,
                    backgrounds
                ]
                agms = []
                agm_secondary_ids = []
                agm_synonyms = []
                components = []
                sqtrs = []
                backgrounds = []
                counter = 0

        if counter > 0:
            yield [
                agms, agm_secondary_ids, agm_synonyms, components, sqtrs,
                backgrounds
            ]
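
Two small rules in the AGM generator above are worth calling out: a missing subtype defaults to 'strain' for WB and to 'affected_genomic_model' otherwise, and the display names get the short species abbreviation appended in parentheses. A sketch of just those two rules, using a made-up record and abbreviation:

def agm_display_fields(agm_record, data_provider, short_species_abbreviation):
    """Apply the subtype defaults and species-suffixed names used above."""
    subtype = agm_record.get('subtype')
    if subtype is None and data_provider == 'WB':
        subtype = 'strain'
    if subtype is None:
        subtype = 'affected_genomic_model'
    name = agm_record.get('name')
    return {
        "subtype": subtype,
        "nameWithSpecies": name + " (" + short_species_abbreviation + ")",
    }

print(agm_display_fields({"name": "example-model"}, "WB", "Cel"))
# {'subtype': 'strain', 'nameWithSpecies': 'example-model (Cel)'}
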
Exemple #24
class HTPMetaDatasetETL(ETL):
    """HTP Meta Dataset ETL."""

    logger = logging.getLogger(__name__)

    htp_dataset_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
        
        CREATE (ds:HTPDataset {primaryKey:row.datasetId})
          SET ds.dateAssigned = row.dateAssigned,
              ds.summary = row.summary,
              ds.numChannels = row.numChannels,
              ds.subSeries = row.subSeries
         """

    htp_dataset_pub_query_template = """
        
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
        
        MATCH (ds:HTPDataset {primaryKey: row.datasetId})
        
        MERGE (p:Publication {primaryKey: row.pubPrimaryKey})
            ON CREATE SET p.pubModId = row.pubModId,
                          p.pubMedId = row.pubMedId,
                          p.pubModUrl = row.pubModUrl,
                          p.pubMedUrl = row.pubMedUrl
                          
        MERGE (p)-[:ASSOCIATION]-(ds)
    
    """

    htp_category_tags_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
        
        MATCH (ds:HTPDataset {primaryKey:row.datasetId})
        
        MERGE (ct:CategoryTag {primaryKey:row.tag})
        
        MERGE (ds)-[:CATEGORY_TAG]-(ct)    
            
    """

    htp_secondaryIds_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        MATCH (ds:HTPDataset {primaryKey: row.datasetId})
        
        MERGE (s:SecondaryId:Identifier {primaryKey:row.secondaryId})
                ON CREATE SET s.name = row.secondaryId
                
        MERGE (ds)-[aka:ALSO_KNOWN_AS]-(s)
   

    """

    htpdataset_xrefs_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:HTPDataset {primaryKey:row.datasetId}) """ + ETLHelper.get_cypher_xref_text(
    )

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            p = multiprocessing.Process(target=self._process_sub_type,
                                        args=(sub_type, ))
            p.start()
            thread_pool.append(p)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):

        logger.info("Loading HTP metadata Data: %s" %
                    sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata Data: %s" %
                    sub_type.get_data_provider())

        if data is None:
            logger.warn("No Data found for %s skipping" %
                        sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, param2); other entries will be ignored
        query_list = [
            [
                HTPMetaDatasetETL.htp_dataset_query_template, commit_size,
                "htp_metadataset_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_category_tags_query_template,
                commit_size,
                "htp_metadataset_tags_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_dataset_pub_query_template, commit_size,
                "htp_metadataset_publications_" +
                sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htpdataset_xrefs_template, commit_size,
                "htp_metadataset_xrefs_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_secondaryIds_query_template, commit_size,
                "htp_metadataset_secondaryIds_" +
                sub_type.get_data_provider() + ".csv"
            ],
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, htp_dataset_data, batch_size):
        """Get Generators."""
        dataset_tags = []
        data_providers = []
        htp_datasets = []
        publications = []
        secondaryIds = []
        cross_reference_list = []
        counter = 0
        date_produced = htp_dataset_data['metaData']['dateProduced']

        data_provider_object = htp_dataset_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []

        for dataset_record in htp_dataset_data['data']:

            counter = counter + 1

            dataset = dataset_record.get('datasetId')
            datasetId = dataset.get('primaryId')

            # spoke to RGD and they wish to remove these datasets as they overlap with SGD.

            if (datasetId == 'GEO:GSE18157'
                    or datasetId == 'GEO:GSE33497') and data_provider == 'RGD':
                continue
            if 'secondaryIds' in dataset:
                for secId in dataset.get('secondaryIds'):
                    secid = {"datasetId": datasetId, "secondaryId": secId}
                    secondaryIds.append(secid)

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    datasetId)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            if 'crossReference' in dataset:
                crossRefO = dataset.get('crossReference')
                if crossRefO is not None:
                    crossRefId = crossRefO.get('id')
                    local_crossref_id = crossRefId.split(":")[1]
                    prefix = crossRefId.split(":")[0]
                    pages = crossRefO.get('pages')

                    # some pages collection have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            mod_global_cross_ref_url = self.etlh.rdh2.return_url_from_key_value(
                                prefix, local_crossref_id, page)
                            xref = ETLHelper.get_xref_dict(
                                local_crossref_id, prefix, page, page,
                                crossRefId, mod_global_cross_ref_url,
                                crossRefId + page)
                            xref['dataId'] = datasetId
                            cross_reference_list.append(xref)

            category_tags = dataset_record.get('categoryTags')

            if category_tags is not None:
                for tag in category_tags:
                    dataset_category_tag = {"datasetId": datasetId, "tag": tag}
                    dataset_tags.append(dataset_category_tag)

            publicationNew = dataset_record.get('publications')
            if publicationNew is not None:
                for pub in publicationNew:
                    pid = pub.get('publicationId')
                    publication_mod_id = ""
                    pub_med_id = ""
                    pub_mod_url = ""
                    pub_med_url = ""
                    if pid is not None and pid.startswith('PMID:'):
                        pub_med_id = pid
                        local_pub_med_id = pub_med_id.split(":")[1]
                        pub_med_url = self.etlh.get_no_page_complete_url(
                            local_pub_med_id, 'PMID', pub_med_id)
                        if 'crossReference' in pub:
                            pub_xref = pub.get('crossReference')
                            publication_mod_id = pub_xref.get('id')
                            pub_mod_url = self.etlh.rdh2.return_url_from_identifier(
                                publication_mod_id)
                    elif pid is not None and not pid.startswith('PMID:'):
                        publication_mod_id = pub.get('publicationId')
                        pub_mod_url = self.etlh.rdh2.return_url_from_identifier(
                            publication_mod_id)

                    publication = {
                        "datasetId": datasetId,
                        "pubPrimaryKey": publication_mod_id + pub_med_id,
                        "pubModId": publication_mod_id,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModUrl": pub_mod_url
                    }
                    publications.append(publication)

            htp_dataset = {
                "datasetId": datasetId,
                "dateAssigned": dataset_record.get('dateAssigned'),
                "title": dataset_record.get('title'),
                "summary": dataset_record.get('summary'),
                "numChannels": dataset_record.get('numChannels'),
                "subSeries": dataset_record.get('subSeries')
            }
            htp_datasets.append(htp_dataset)

            if counter == batch_size:
                yield [
                    htp_datasets, dataset_tags, publications,
                    cross_reference_list, secondaryIds
                ]
                counter = 0
                htp_datasets = []
                dataset_tags = []
                publications = []
                cross_reference_list = []
                secondaryIds = []

        if counter > 0:
            yield [
                htp_datasets, dataset_tags, publications, cross_reference_list,
                secondaryIds
            ]
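
The generators above follow the loader's standard batching convention: rows are accumulated into lists, the lists are yielded together once the counter reaches batch_size, and any remainder is yielded after the loop. A minimal, self-contained sketch of that pattern (the names below are illustrative and not part of the loader):

def batch_records(records, batch_size):
    """Sketch of the batching pattern used by the get_generators methods."""
    batch = []
    counter = 0
    for record in records:
        counter += 1
        batch.append(record)
        if counter == batch_size:
            yield [batch]
            batch = []
            counter = 0
    if counter > 0:
        yield [batch]

# Yields [[0, 1]], [[2, 3]] and finally the partial batch [[4]].
for chunk in batch_records(range(5), 2):
    print(chunk)
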
class GeoXrefETL(ETL):
    """GEO XREF ETL"""

    logger = logging.getLogger(__name__)

    geo_xref_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
        
        MATCH (o:Gene) WHERE o.primaryKey = row.genePrimaryKey
        """ + ETLHelper.get_cypher_xref_text()

    gene_crossref_query_template = """
                   MATCH (g:Gene)-[crr:CROSS_REFERENCE]-(cr:CrossReference)
                   WHERE cr.globalCrossRefId IN {parameter}
                   RETURN g.primaryKey, g.modLocalId, cr.name, cr.globalCrossRefId"""

    def __init__(self, config):
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):

        for sub_type in self.data_type_config.get_sub_type_objects():

            species_encoded = urllib.parse.quote_plus(\
                    ETLHelper.species_lookup_by_data_provider(sub_type.get_data_provider()))

            commit_size = self.data_type_config.get_neo4j_commit_size()
            # Use a large fixed batch size here instead of the configured
            # generator batch size.
            # batch_size = self.data_type_config.get_generator_batch_size()
            batch_size = 100000

            generators = self.get_generators(sub_type, batch_size,
                                             species_encoded)

            query_template_list = [
                [
                    self.geo_xref_query_template, commit_size,
                    "geo_xref_data_" + sub_type.get_data_provider() + ".csv"
                ],
            ]

            query_and_file_list = self.process_query_params(
                query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, sub_type, batch_size, species_encoded):
        """Get Generators"""

        entrez_ids = []

        geo_data_file_contents = Path(sub_type.get_filepath()).read_text()
        geo_data = json.loads(
            json.dumps(xmltodict.parse(geo_data_file_contents)))
        for efetch_value in dict(geo_data.items()).values():
            # IdList is a value returned by the efetch XML spec;
            # within IdList there is another map with "Id"
            # as the key and the entrez local ids as the list value.
            for sub_map_key, sub_map_value in efetch_value.items():
                if sub_map_key == 'IdList':
                    for id_list in dict(sub_map_value.items()).values():
                        for entrez_id in id_list:
                            self.logger.debug("here is the entrez id: %s",
                                              entrez_id)
                            entrez_ids.append("NCBI_Gene:" + entrez_id)

        geo_data_list = []
        return_set = Neo4jHelper.run_single_parameter_query(
            self.gene_crossref_query_template, entrez_ids)

        for record in return_set:
            gene_primary_key = record["g.primaryKey"]
            mod_local_id = record["g.modLocalId"]
            global_cross_ref_id = record["cr.globalCrossRefId"]
            geo_xref = ETLHelper.get_xref_dict(global_cross_ref_id.split(":")[1],
                                               "NCBI_Gene",
                                               "gene/other_expression",
                                               "gene/other_expression",
                                               "GEO",
                                               "https://www.ncbi.nlm.nih.gov/sites/entrez?" \
                                                       + "Db=geoprofiles"\
                                                       + "&DbFrom=gene"\
                                                       + "&Cmd=Link"\
                                                       + "&LinkName=gene_geoprofiles"\
                                                       + "&LinkReadableName=GEO%20Profiles"\
                                                       + "&IdsFromResult="\
                                                       + global_cross_ref_id.split(":")[1],
                                               global_cross_ref_id+"gene/other_expression")

            geo_xref["genePrimaryKey"] = gene_primary_key
            geo_xref["modLocalId"] = mod_local_id

            geo_data_list.append(geo_xref)

        yield [geo_data_list]
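
For reference, a minimal sketch of the IdList traversal above, assuming an NCBI eSearch/eFetch-style XML document parsed with xmltodict; the sample XML and ids are purely illustrative:

import xmltodict

SAMPLE_XML = """
<eSearchResult>
    <Count>2</Count>
    <IdList>
        <Id>24313</Id>
        <Id>24314</Id>
    </IdList>
</eSearchResult>
"""

geo_data = xmltodict.parse(SAMPLE_XML)
entrez_ids = []
for efetch_value in geo_data.values():
    for sub_map_key, sub_map_value in efetch_value.items():
        if sub_map_key == 'IdList':
            for id_list in sub_map_value.values():
                for entrez_id in id_list:
                    entrez_ids.append("NCBI_Gene:" + entrez_id)

print(entrez_ids)  # ['NCBI_Gene:24313', 'NCBI_Gene:24314']
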
Exemple #26
0
    def get_generators(self, expression_file, batch_size):
        """Get Generators"""

        self.logger.debug("made it to the expression generator")

        counter = 0

        cross_references = []
        bio_entities = []
        bio_join_entities = []
        bio_entity_gene_aos = []
        pubs = []
        ao_expressions = []
        cc_expressions = []
        ao_qualifiers = []
        ao_substructures = []
        ao_ss_qualifiers = []
        cc_qualifiers = []
        ao_cc_expressions = []
        stage_list = []
        stage_uberon_data = []
        uberon_ao_data = []
        uberon_ao_other_data = []
        uberon_stage_other_data = []

        self.logger.debug("streaming json data from %s ...", expression_file)
        with codecs.open(expression_file, 'r', 'utf-8') as file_handle:
            for xpat in ijson.items(file_handle, 'data.item'):
                counter = counter + 1

                pub_med_url = None
                pub_mod_url = None
                pub_med_id = ""
                publication_mod_id = ""
                stage_term_id = ""
                stage_name = ""
                stage_uberon_term_id = ""
                gene_id = xpat.get('geneId')

                if self.test_object.using_test_data() is True:
                    is_it_test_entry = self.test_object.check_for_test_id_entry(
                        gene_id)
                    if is_it_test_entry is False:
                        counter = counter - 1
                        continue

                evidence = xpat.get('evidence')

                if 'publicationId' in evidence:
                    if evidence.get('publicationId').startswith('PMID:'):
                        pub_med_id = evidence.get('publicationId')
                        local_pub_med_id = pub_med_id.split(":")[1]
                        pub_med_prefix = pub_med_id.split(":")[0]
                        pub_med_url = ETLHelper.get_no_page_complete_url(
                            local_pub_med_id, self.xref_url_map,
                            pub_med_prefix, gene_id)
                        if pub_med_id is None:
                            pub_med_id = ""

                        if 'crossReference' in evidence:
                            pub_xref = evidence.get('crossReference')
                            publication_mod_id = pub_xref.get('id')

                            if publication_mod_id is not None:
                                pub_mod_url = ETLHelper.get_expression_pub_annotation_xref( \
                                        publication_mod_id)

                    else:
                        publication_mod_id = evidence['publicationId']
                        if publication_mod_id is not None:
                            pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(\
                                              publication_mod_id)

                    if publication_mod_id is None:
                        publication_mod_id = ""

                assay = xpat.get('assay')

                if 'whereExpressed' in xpat:

                    where_expressed = xpat.get('whereExpressed')
                    cellular_component_qualifier_term_id = \
                           where_expressed.get('cellularComponentQualifierTermId')
                    cellular_component_term_id = where_expressed.get(
                        'cellularComponentTermId')
                    anatomical_structure_term_id = where_expressed.get(
                        'anatomicalStructureTermId')
                    anatomical_structure_qualifier_term_id = where_expressed.get(
                        'anatomicalStructureQualifierTermId')
                    anatomical_sub_structure_term_id = \
                            where_expressed.get('anatomicalSubStructureTermId')
                    anatomical_sub_structure_qualifier_term_id = where_expressed.get(
                        'anatomicalSubStructureQualifierTermId')
                    where_expressed_statement = where_expressed.get(
                        'whereExpressedStatement')

                    when_expressed_stage = xpat.get('whenExpressed')

                    if 'stageTermId' in when_expressed_stage:
                        stage_term_id = when_expressed_stage.get('stageTermId')
                    if 'stageName' in when_expressed_stage:
                        stage_name = when_expressed_stage.get('stageName')

                    # TODO: making unique BioEntityGeneExpressionJoin nodes
                    # and ExpressionBioEntity nodes is tedious.
                    # TODO: Let's get the DQMs to fix this.
                    expression_unique_key = gene_id + assay + stage_name
                    expression_entity_unique_key = ""

                    if anatomical_structure_term_id is not None:
                        expression_unique_key += anatomical_structure_term_id
                        expression_entity_unique_key = anatomical_structure_term_id

                        if anatomical_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_structure_qualifier_term_id
                            expression_entity_unique_key += anatomical_structure_qualifier_term_id

                    if cellular_component_term_id is not None:
                        expression_unique_key += cellular_component_term_id
                        expression_entity_unique_key += cellular_component_term_id

                        if cellular_component_qualifier_term_id is not None:
                            expression_unique_key += cellular_component_qualifier_term_id
                            expression_entity_unique_key += cellular_component_qualifier_term_id

                    if anatomical_sub_structure_term_id is not None:
                        expression_unique_key += anatomical_sub_structure_term_id

                        if anatomical_sub_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_sub_structure_qualifier_term_id
                            expression_entity_unique_key \
                                    += anatomical_sub_structure_qualifier_term_id

                    expression_entity_unique_key += where_expressed_statement
                    expression_unique_key += where_expressed_statement

                    if where_expressed.get(
                            'anatomicalStructureUberonSlimTermIds'
                    ) is not None:
                        for uberon_structure_term_object in \
                                where_expressed.get('anatomicalStructureUberonSlimTermIds'):
                            structure_uberon_term_id = \
                                    uberon_structure_term_object.get('uberonTerm')
                            if structure_uberon_term_id is not None \
                                    and structure_uberon_term_id != 'Other':
                                structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": structure_uberon_term_id
                                }
                                uberon_ao_data.append(structure_uberon_term)
                            elif structure_uberon_term_id is not None \
                                    and structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    if where_expressed.get(
                            'anatomicalSubStructureUberonSlimTermIds'
                    ) is not None:
                        for uberon_sub_structure_term_object in \
                                where_expressed.get('anatomicalSubStructureUberonSlimTermIds'):
                            sub_structure_uberon_term_id = \
                                    uberon_sub_structure_term_object.get('uberonTerm')
                            if sub_structure_uberon_term_id is not None \
                                    and sub_structure_uberon_term_id != 'Other':
                                sub_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": sub_structure_uberon_term_id
                                }
                                uberon_ao_data.append(
                                    sub_structure_uberon_term)
                            elif sub_structure_uberon_term_id is not None \
                                    and sub_structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    if cellular_component_term_id is None:
                        cellular_component_term_id = ""

                    if when_expressed_stage.get(
                            'stageUberonSlimTerm') is not None:
                        stage_uberon_term_object = when_expressed_stage.get(
                            'stageUberonSlimTerm')
                        stage_uberon_term_id = stage_uberon_term_object.get(
                            "uberonTerm")
                        if stage_uberon_term_id is not None \
                            and stage_uberon_term_id != "post embryonic, pre-adult":
                            stage_uberon = {
                                "uberonStageId": stage_uberon_term_id,
                                "ei_uuid": expression_unique_key
                            }
                            stage_uberon_data.append(stage_uberon)
                        if stage_uberon_term_id == "post embryonic, pre-adult":
                            stage_uberon_other = {
                                "ei_uuid": expression_unique_key
                            }
                            uberon_stage_other_data.append(stage_uberon_other)

                    if stage_term_id is None or stage_name == 'N/A':
                        stage_term_id = ""
                        stage_name = ""
                        stage_uberon_term_id = ""

                    if stage_name is not None:
                        stage = {
                            "stageTermId": stage_term_id,
                            "stageName": stage_name,
                            "ei_uuid": expression_unique_key
                        }
                        stage_list.append(stage)
                    else:
                        stage_uberon_term_id = ""

                    if 'crossReference' in xpat:
                        cross_ref = xpat.get('crossReference')
                        cross_ref_id = cross_ref.get('id')
                        local_cross_ref_id = cross_ref_id.split(":")[1]
                        prefix = cross_ref.get('id').split(":")[0]
                        pages = cross_ref.get('pages')

                        # some pages collections have 0 elements
                        if pages is not None and len(pages) > 0:
                            for page in pages:
                                if page == 'gene/expression/annotation/detail':
                                    mod_global_cross_ref_id = ETLHelper.get_page_complete_url(\
                                            local_cross_ref_id,
                                            self.xref_url_map,
                                            prefix, page)

                                    xref = ETLHelper.get_xref_dict(
                                        local_cross_ref_id, prefix, page, page,
                                        cross_ref_id, mod_global_cross_ref_id,
                                        cross_ref_id + page)
                                    xref['ei_uuid'] = expression_unique_key
                                    cross_references.append(xref)

                    bio_entity = {
                        "ebe_uuid": expression_entity_unique_key,
                        "whereExpressedStatement": where_expressed_statement
                    }
                    bio_entities.append(bio_entity)

                    bio_join_entity = {
                        "ei_uuid": expression_unique_key,
                        "assay": assay
                    }
                    bio_join_entities.append(bio_join_entity)

                    bio_entity_gene_ao = {
                        "geneId": gene_id,
                        "ebe_uuid": expression_entity_unique_key,
                        "anatomicalStructureTermId":
                        anatomical_structure_term_id,
                        "ei_uuid": expression_unique_key
                    }
                    bio_entity_gene_aos.append(bio_entity_gene_ao)

                    pub = {
                        "ei_uuid": expression_unique_key,
                        "pubPrimaryKey": pub_med_id + publication_mod_id,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModId": publication_mod_id,
                        "pubModUrl": pub_mod_url
                    }
                    pubs.append(pub)

                    ao_expression = {
                        "geneId": gene_id,
                        "whenExpressedStage": when_expressed_stage,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModId": publication_mod_id,
                        "pubModUrl": pub_mod_url,
                        "pubPrimaryKey": pub_med_id + publication_mod_id,
                        "uuid": str(uuid.uuid4()),
                        "assay": assay,
                        "anatomicalStructureTermId":
                        anatomical_structure_term_id,
                        "whereExpressedStatement": where_expressed_statement,
                        "ei_uuid": expression_unique_key,
                        "ebe_uuid": expression_entity_unique_key
                    }
                    ao_expressions.append(ao_expression)

                    if cellular_component_qualifier_term_id is not None:

                        cc_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "cellularComponentQualifierTermId":
                            cellular_component_qualifier_term_id
                        }
                        cc_qualifiers.append(cc_qualifier)

                    if anatomical_structure_term_id is None:
                        anatomical_structure_term_id = ""

                        cc_expression = {
                            "geneId": gene_id,
                            "whenExpressedStage": when_expressed_stage,
                            "pubMedId": pub_med_id,
                            "pubMedUrl": pub_med_url,
                            "pubModId": publication_mod_id,
                            "pubModUrl": pub_mod_url,
                            "pubPrimaryKey": pub_med_id + publication_mod_id,
                            "assay": assay,
                            "whereExpressedStatement":
                            where_expressed_statement,
                            "cellularComponentTermId":
                            cellular_component_term_id,
                            "ei_uuid": expression_unique_key,
                            "ebe_uuid": expression_entity_unique_key
                        }
                        cc_expressions.append(cc_expression)

                    if anatomical_structure_qualifier_term_id is not None:
                        ao_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalStructureQualifierTermId":
                            anatomical_structure_qualifier_term_id
                        }

                        ao_qualifiers.append(ao_qualifier)

                    if anatomical_sub_structure_term_id is not None:
                        ao_substructure = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureTermId":
                            anatomical_sub_structure_term_id
                        }

                        ao_substructures.append(ao_substructure)

                    if anatomical_sub_structure_qualifier_term_id is not None:
                        ao_ss_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureQualifierTermId":
                            anatomical_sub_structure_qualifier_term_id
                        }

                        ao_ss_qualifiers.append(ao_ss_qualifier)

                    if where_expressed_statement is None:
                        where_expressed_statement = ""

                    if anatomical_structure_term_id is not None \
                            and anatomical_structure_term_id != "" \
                            and cellular_component_term_id is not None \
                            and cellular_component_term_id != "":

                        ao_cc_expression = {
                            "geneId": gene_id,
                            "whenExpressedStage": when_expressed_stage,
                            "pubMedId": pub_med_id,
                            "pubMedUrl": pub_med_url,
                            "pubModId": publication_mod_id,
                            "pubModUrl": pub_mod_url,
                            "pubPrimaryKey": pub_med_id + publication_mod_id,
                            "uuid": str(uuid.uuid4()),
                            "stageTermId": stage_term_id,
                            "stageName": stage_name,
                            "stageUberonTermId": stage_uberon_term_id,
                            "assay": assay,
                            "cellularComponentTermId":
                            cellular_component_term_id,
                            "anatomicalStructureTermId":
                            anatomical_structure_term_id,
                            "whereExpressedStatement":
                            where_expressed_statement,
                            "ei_uuid": expression_unique_key,
                            "ebe_uuid": expression_entity_unique_key
                        }

                        ao_cc_expressions.append(ao_cc_expression)

                if counter == batch_size:
                    yield [
                        bio_entities, bio_entity_gene_aos, bio_join_entities,
                        ao_expressions, cc_expressions, ao_cc_expressions,
                        ao_qualifiers, ao_substructures, ao_ss_qualifiers,
                        cc_qualifiers, stage_list, stage_uberon_data,
                        uberon_ao_data, uberon_ao_other_data,
                        uberon_stage_other_data, cross_references, pubs
                    ]
                    bio_entities = []
                    bio_join_entities = []
                    ao_expressions = []
                    cc_expressions = []
                    ao_qualifiers = []
                    ao_substructures = []
                    ao_ss_qualifiers = []
                    cc_qualifiers = []
                    ao_cc_expressions = []
                    stage_list = []
                    uberon_stage_other_data = []
                    stage_uberon_data = []
                    uberon_ao_other_data = []
                    uberon_ao_data = []
                    cross_references = []
                    bio_entity_gene_aos = []
                    pubs = []
                    counter = 0

            if counter > 0:
                yield [
                    bio_entities, bio_entity_gene_aos, bio_join_entities,
                    ao_expressions, cc_expressions, ao_cc_expressions,
                    ao_qualifiers, ao_substructures, ao_ss_qualifiers,
                    cc_qualifiers, stage_list, stage_uberon_data,
                    uberon_ao_data, uberon_ao_other_data,
                    uberon_stage_other_data, cross_references, pubs
                ]
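
A minimal sketch of how the expression_unique_key and expression_entity_unique_key above are assembled, concatenating the gene, assay, stage and whichever ontology term ids are present (substructure terms are handled the same way and are omitted here; all values are illustrative):

def build_expression_keys(gene_id, assay, stage_name, ao_term=None,
                          ao_qualifier=None, cc_term=None, cc_qualifier=None,
                          statement=""):
    """Sketch of the unique-key concatenation used for expression joins."""
    unique_key = gene_id + assay + stage_name
    entity_key = ""
    if ao_term is not None:
        unique_key += ao_term
        entity_key = ao_term
        if ao_qualifier is not None:
            unique_key += ao_qualifier
            entity_key += ao_qualifier
    if cc_term is not None:
        unique_key += cc_term
        entity_key += cc_term
        if cc_qualifier is not None:
            unique_key += cc_qualifier
            entity_key += cc_qualifier
    unique_key += statement
    entity_key += statement
    return unique_key, entity_key

# Hypothetical annotation: returns the ei_uuid and ebe_uuid style keys.
print(build_expression_keys("ZFIN:ZDB-GENE-000000-1", "MMO:0000655",
                            "Larval:Protruding-mouth",
                            ao_term="ZFA:0000107", statement="brain"))
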
Exemple #27
0
    def get_generators(self, htp_dataset_data, batch_size):
        """Get Generators."""
        dataset_tags = []
        data_providers = []
        htp_datasets = []
        publications = []
        secondaryIds = []
        cross_reference_list = []
        counter = 0
        date_produced = htp_dataset_data['metaData']['dateProduced']

        data_provider_object = htp_dataset_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []

        for dataset_record in htp_dataset_data['data']:

            counter = counter + 1

            dataset = dataset_record.get('datasetId')
            datasetId = dataset.get('primaryId')

            # spoke to RGD and they wish to remove these datasets as they overlap with SGD.

            if (datasetId == 'GEO:GSE18157'
                    or datasetId == 'GEO:GSE33497') and data_provider == 'RGD':
                continue
            if 'secondaryIds' in dataset:
                for secId in dataset.get('secondaryIds'):
                    secid = {"datasetId": datasetId, "secondaryId": secId}
                    secondaryIds.append(secid)

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    datasetId)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            if 'crossReference' in dataset:
                crossRefO = dataset.get('crossReference')
                if crossRefO is not None:
                    crossRefId = crossRefO.get('id')
                    local_crossref_id = crossRefId.split(":")[1]
                    prefix = crossRefId.split(":")[0]
                    pages = crossRefO.get('pages')

                    # some pages collections have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            mod_global_cross_ref_url = self.etlh.rdh2.return_url_from_key_value(
                                prefix, local_crossref_id, page)
                            xref = ETLHelper.get_xref_dict(
                                local_crossref_id, prefix, page, page,
                                crossRefId, mod_global_cross_ref_url,
                                crossRefId + page)
                            xref['dataId'] = datasetId
                            cross_reference_list.append(xref)

            category_tags = dataset_record.get('categoryTags')

            if category_tags is not None:
                for tag in category_tags:
                    dataset_category_tag = {"datasetId": datasetId, "tag": tag}
                    dataset_tags.append(dataset_category_tag)

            publicationNew = dataset_record.get('publications')
            if publicationNew is not None:
                for pub in publicationNew:
                    pid = pub.get('publicationId')
                    publication_mod_id = ""
                    pub_med_id = ""
                    pub_mod_url = ""
                    pub_med_url = ""
                    if pid is not None and pid.startswith('PMID:'):
                        pub_med_id = pid
                        local_pub_med_id = pub_med_id.split(":")[1]
                        pub_med_url = self.etlh.get_no_page_complete_url(
                            local_pub_med_id, 'PMID', pub_med_id)
                        if 'crossReference' in pub:
                            pub_xref = pub.get('crossReference')
                            publication_mod_id = pub_xref.get('id')
                            pub_mod_url = self.etlh.rdh2.return_url_from_identifier(
                                publication_mod_id)
                    elif pid is not None and not pid.startswith('PMID:'):
                        publication_mod_id = pub.get('publicationId')
                        pub_mod_url = self.etlh.rdh2.return_url_from_identifier(
                            publication_mod_id)

                    publication = {
                        "datasetId": datasetId,
                        "pubPrimaryKey": publication_mod_id + pub_med_id,
                        "pubModId": publication_mod_id,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModUrl": pub_mod_url
                    }
                    publications.append(publication)

            htp_dataset = {
                "datasetId": datasetId,
                "dateAssigned": dataset_record.get('dateAssigned'),
                "title": dataset_record.get('title'),
                "summary": dataset_record.get('summary'),
                "numChannels": dataset_record.get('numChannels'),
                "subSeries": dataset_record.get('subSeries')
            }
            htp_datasets.append(htp_dataset)

            if counter == batch_size:
                yield [
                    htp_datasets, dataset_tags, publications,
                    cross_reference_list, secondaryIds
                ]
                counter = 0
                htp_datasets = []
                dataset_tags = []
                publications = []
                cross_reference_list = []
                secondaryIds = []

        if counter > 0:
            yield [
                htp_datasets, dataset_tags, publications, cross_reference_list,
                secondaryIds
            ]
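
The publication handling above separates PMID identifiers, which get a PubMed URL and optionally a MOD cross reference, from plain MOD publication ids. A hedged sketch of that branching, with hypothetical lookup callables standing in for the ETLHelper and resource-descriptor helpers:

def classify_publication(dataset_id, pid, pubmed_url_lookup, mod_url_lookup):
    """Sketch of the PMID versus MOD-publication-id branching above."""
    publication_mod_id = ""
    pub_med_id = ""
    pub_mod_url = ""
    pub_med_url = ""
    if pid is not None and pid.startswith('PMID:'):
        pub_med_id = pid
        pub_med_url = pubmed_url_lookup(pid.split(":")[1])
    elif pid is not None:
        publication_mod_id = pid
        pub_mod_url = mod_url_lookup(pid)
    return {
        "datasetId": dataset_id,
        "pubPrimaryKey": publication_mod_id + pub_med_id,
        "pubModId": publication_mod_id,
        "pubMedId": pub_med_id,
        "pubMedUrl": pub_med_url,
        "pubModUrl": pub_mod_url
    }

# Hypothetical lookups, for illustration only.
print(classify_publication(
    "GEO:GSE00000", "PMID:12345",
    pubmed_url_lookup=lambda local_id: "https://www.ncbi.nlm.nih.gov/pubmed/" + local_id,
    mod_url_lookup=lambda mod_id: "https://example.org/publication/" + mod_id))
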
Exemple #28
0
class DOETL(ETL):
    """DO ETL."""

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later

    do_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        //Create the DOTerm node and set properties. primaryKey is required.
        MERGE (doterm:DOTerm:Ontology {primaryKey:row.oid})
            SET doterm.name = row.name,
             doterm.nameKey = row.name_key,
             doterm.definition = row.definition,
             doterm.defLinks = apoc.convert.fromJsonList(row.defLinksProcessed),
             doterm.isObsolete = row.is_obsolete,
             doterm.subset = row.subset,
             doterm.doDisplayId = row.oid,
             doterm.doUrl = row.oUrl,
             doterm.doPrefix = "DOID",
             doterm.doId = row.oid,
             doterm.rgdLink = row.rgd_link,
             doterm.ratOnlyRgdLink = row.rat_only_rgd_link,
             doterm.humanOnlyRgdLink = row.human_only_rgd_link,
             doterm.mgiLink = row.mgi_link,
             doterm.zfinLink = row.zfin_link,
             doterm.flybaseLink = row.flybase_link,
             doterm.wormbaseLink = row.wormbase_link,
             doterm.sgdLink = row.sgd_link

            MERGE (doterm)-[ggcg:IS_A_PART_OF_CLOSURE]->(doterm)"""

    doterm_synonyms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (d:DOTerm {primaryKey:row.primary_id})

            MERGE (syn:Synonym:Identifier {primaryKey:row.synonym})
                SET syn.name = row.synonym
            MERGE (d)-[aka2:ALSO_KNOWN_AS]->(syn) """

    doterm_isas_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (d1:DOTerm:Ontology {primaryKey:row.primary_id})
            MATCH (d2:DOTerm:Ontology {primaryKey:row.primary_id2})
            MERGE (d1)-[aka:IS_A]->(d2) """

    xrefs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:DOTerm {primaryKey:row.oid}) """ + ETLHelper.get_cypher_xref_text(
    )

    doterm_alt_ids_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (d:DOTerm {primaryKey:row.primary_id})

            MERGE (sec:SecondaryId:Identifier {primaryKey:row.secondary_id})

            MERGE (d)-[aka2:ALSO_KNOWN_AS]->(sec) """

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        filepath = self.data_type_config.get_single_filepath()

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(filepath, batch_size)

        query_template_list = [
            [self.do_query_template, commit_size, "do_term_data.csv"],
            [self.doterm_isas_query_template, commit_size, "do_isas_data.csv"],
            [
                self.doterm_synonyms_query_template, commit_size,
                "do_synonyms_data.csv"
            ], [self.xrefs_query_template, commit_size, "do_xrefs_data.csv"],
            [
                self.doterm_alt_ids_query_template, commit_size,
                "do_alt_ids_data.csv"
            ]
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("DO-?: ")

    def get_generators(self, filepath,
                       batch_size):  # noqa TODO:Needs splitting up really
        """Get Generators."""
        ont = OntologyFactory().create(filepath)
        parsed_line = ont.graph.copy().node

        do_term_list = []
        do_isas_list = []
        do_synonyms_list = []
        do_alt_ids_list = []
        xrefs = []
        counter = 0

        # Convert parsed obo term into a schema-friendly AGR dictionary.
        for key, line in parsed_line.items():
            counter = counter + 1
            node = ont.graph.node[key]
            if len(node) == 0:
                continue

            # Switching id to curie form and saving URI in "uri"
            # - might wildly break things later on???
            node["uri"] = node["id"]
            node["id"] = key

            syns = []

            def_links_unprocessed = []
            def_links_processed = []
            subset = []
            definition = ""
            is_obsolete = "false"
            ident = key

            if "meta" in node:
                if "synonyms" in node["meta"]:
                    syns = [s["val"] for s in node["meta"]["synonyms"]]
                    for synonym in syns:
                        do_synonym = {"primary_id": key, "synonym": synonym}
                        do_synonyms_list.append(do_synonym)

                if "basicPropertyValues" in node["meta"]:
                    alt_ids = [
                        s["val"] for s in node["meta"]["basicPropertyValues"]
                    ]
                    for alt_id in alt_ids:
                        if "DOID:" in alt_id:
                            secondary_id = {
                                "primary_id": key,
                                "secondary_id": alt_id
                            }
                            do_alt_ids_list.append(secondary_id)

                if "xrefs" in node["meta"]:
                    o_xrefs = node["meta"].get('xrefs')
                    self.ortho_xrefs(o_xrefs, ident, xrefs)

                if node["meta"].get('is_obsolete'):
                    is_obsolete = "true"
                elif node["meta"].get('deprecated'):
                    is_obsolete = "true"
                if "definition" in node["meta"]:
                    definition = node["meta"]["definition"]["val"]
                    def_links_unprocessed = node["meta"]["definition"]["xrefs"]
                if "subsets" in node["meta"]:
                    new_subset = node['meta'].get('subsets')
                    if isinstance(new_subset, (list, tuple)):
                        subset = new_subset
                    else:
                        if new_subset is not None:
                            subset.append(new_subset)
                if len(subset) > 1:
                    converted_subsets = []
                    for subset_str in subset:
                        if "#" in subset_str:
                            subset_str = subset_str.split("#")[-1]
                        converted_subsets.append(subset_str)
                    subset = converted_subsets

            all_parents = ont.parents(key)
            all_parents.append(key)

            # Improves performance when traversing relations
            all_parents_subont = ont.subontology(all_parents)
            isas_without_names = all_parents_subont.parents(
                key, relations=['subClassOf'])

            for item in isas_without_names:
                dictionary = {"primary_id": key, "primary_id2": item}

                do_isas_list.append(dictionary)

            def_links_processed = []
            def_links = ""
            if definition is None:
                definition = ""
            else:
                # Remove new lines that cause this to split across two lines in the file
                # definition = definition.replace('\n', ' ')

                # Remove any extra double space that might have been introduced in the last replace
                # definition = definition.replace('  ', ' ')

                if definition is not None and "\"" in definition:
                    split_definition = re.split(r'(?<!\\)"', definition)
                    if len(split_definition) > 1:
                        if len(split_definition
                               ) > 2 and "[" in split_definition[2].strip():
                            def_links = split_definition[2].strip()
                            def_links = def_links.rstrip("]").replace("[", "")
                            def_links_unprocessed.append(def_links)

            for def_link in def_links_unprocessed:
                def_link = def_link.replace("url:www", "http://www")
                def_link = def_link.replace("url:", "")
                def_link = def_link.replace("URL:", "")
                def_link = def_link.replace("\\:", ":")
                def_link = def_link.replace('\\', '')

                if "," in def_link:
                    def_link = def_link.split(",")
                    for link in def_link:
                        if link.strip().startswith("http"):
                            def_links_processed.append(link)
                else:
                    if def_link.strip().startswith("http"):
                        def_links_processed.append(def_link)

            # TODO: make this a generic section based on the resourceDescriptor.yaml file.
            # need to have MODs add disease pages to their yaml stanzas

            # NU: alt_ids = node.get('alt_id')
            # if alt_ids:
            #     if not isinstance(alt_ids, (list, tuple)):
            #         alt_ids = [alt_ids]
            # else:
            #     alt_ids = []

            # TODO: Need to add urls to resource Descriptors for SGD and MGI.
            # NOTE: MGI had one, but it has 'MGI:' at the end of the url, which is not required here.
            dict_to_append = {
                'oid':
                node['id'],
                'name':
                node.get('label'),
                'name_key':
                node.get('label'),
                'definition':
                definition,
                'defLinksProcessed':
                def_links_processed,
                'is_obsolete':
                is_obsolete,
                'subset':
                subset,
                'oUrl':
                self.etlh.rdh2.return_url_from_key_value('DOID', node['id']),
                'rgd_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'RGD', node['id'], 'disease/all'),
                'rat_only_rgd_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'RGD', node['id'], 'disease/rat'),
                'human_only_rgd_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'RGD', node['id'], 'disease/human'),
                'mgi_link':
                'http://www.informatics.jax.org/disease/' + node['id'],
                'zfin_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'ZFIN', node['id'], 'disease'),
                'flybase_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'FB', node['id'], 'disease'),
                'wormbase_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'WB', node['id'], 'disease'),
                'sgd_link':
                'https://yeastgenome.org/disease/' + node['id']
            }

            do_term_list.append(dict_to_append)

            if counter == batch_size:
                yield [
                    do_term_list, do_isas_list, do_synonyms_list, xrefs,
                    do_alt_ids_list
                ]
                do_term_list = []
                do_isas_list = []
                do_synonyms_list = []
                do_alt_ids_list = []
                xrefs = []
                counter = 0

        if counter > 0:
            yield [
                do_term_list, do_isas_list, do_synonyms_list, xrefs,
                do_alt_ids_list
            ]
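
A standalone sketch of the definition cross-reference cleanup above: the url:/URL: prefixes are stripped, escaped colons are unescaped, and only links that start with http survive (the sample strings are illustrative):

def clean_def_links(def_links_unprocessed):
    """Sketch of the def-link normalisation in DOETL.get_generators."""
    def_links_processed = []
    for def_link in def_links_unprocessed:
        def_link = def_link.replace("url:www", "http://www")
        def_link = def_link.replace("url:", "")
        def_link = def_link.replace("URL:", "")
        def_link = def_link.replace("\\:", ":")
        def_link = def_link.replace('\\', '')
        if "," in def_link:
            for link in def_link.split(","):
                if link.strip().startswith("http"):
                    def_links_processed.append(link)
        else:
            if def_link.strip().startswith("http"):
                def_links_processed.append(def_link)
    return def_links_processed

# Keeps the two http links and drops the ftp one.
print(clean_def_links(["url:www.example.org/doid",
                       "url:http\\://example.org/a, ftp://example.org/skip"]))
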
Exemple #29
0
class TestClass():
    """Test Class."""

    etlh = ETLHelper()

    def test_get_species_name_from_various_keys(self):
        """Test getting valid species names from DB."""
        lookups = {'RGD': 'Rattus norvegicus',
                   'NCBITaxon:10116': 'Rattus norvegicus',
                   '10116': 'Rattus norvegicus',
                   'Cel': 'Caenorhabditis elegans',
                   'worm': 'Caenorhabditis elegans',
                   'Dme': 'Drosophila melanogaster',
                   'bad': None}  # Bad lookup returns None

        for key in lookups.keys():
            name = self.etlh.species_name_lookup(key)
            assert name == lookups[key]

    def test_get_species_order(self):
        """Test getting order."""
        lookups = {'RGD': 20,
                   'NCBITaxon:10116': 20,
                   '10116': 20,
                   'Cel': 60,
                   'worm': 60,
                   'Dme': 50,
                   'bad': None}  # Bad lookup returns None

        for key in lookups.keys():
            name = self.etlh.get_species_order(key)
            assert name == lookups[key]

    def test_data_provider_lookup(self):
        """Test provider lookup."""
        lookups = {'RGD': 'RGD',
                   'NCBITaxon:10116': 'RGD',
                   'worm': 'WB',
                   'Dme': 'FB',
                   'Saccharomyces cerevisiae': 'SGD',
                   'Homo sapiens': 'RGD',  # Weird one
                   'bad': None}  # Bad lookup returns None

        for key in lookups.keys():
            name = self.etlh.data_provider_lookup(key)
            assert name == lookups[key]

    def test_url_lookup_key_value(self):
        """Test url lookups."""
        # reset critical error
        self.etlh.rdh2.missing_keys = {}
        self.etlh.rdh2.missing_pages = {}
        self.etlh.rdh2.bad_pages = {}

        lookups = [{'key': 'RGD', 'value': '123456', 'page': None, 'result': 'https://rgd.mcw.edu/rgdweb/elasticResults.html?term=RGD:123456'},
                   {'key': 'RGD', 'value': '234567', 'page': 'allele', 'result': 'https://rgd.mcw.edu/rgdweb/report/gene/main.html?id=RGD:234567'},
                   {'key': 'FB', 'value': 'something', 'page': None, 'result': 'https://flybase.org/reports/something.html'},
                   {'key': 'FB', 'value': 'FBsomething', 'page': 'badpage', 'result': None},
                   {'key': 'BADKEY', 'value': 'something', 'page': None, 'result': None}]

        for item in lookups:
            url = self.etlh.rdh2.return_url_from_key_value(item['key'], item['value'], alt_page=item['page'])
            assert url == item['result']

        for item_name in self.etlh.rdh2.missing_keys.keys():
            assert 1 == self.etlh.rdh2.missing_keys[item_name]
        assert 'BADKEY-None' in self.etlh.rdh2.missing_keys.keys()
        assert 'BADKEY' in self.etlh.rdh2.missing_keys.keys()

        for item_name in self.etlh.rdh2.missing_pages.keys():
            assert 1 == self.etlh.rdh2.missing_pages[item_name]
            assert item_name == 'FB-badpage'

    def test_url_lookup(self):
        """Get url tests for ETLHelper."""
        self.etlh.rdh2.missing_keys = {}
        self.etlh.rdh2.missing_pages = {}
        self.etlh.rdh2.bad_pages = {}
        self.etlh.rdh2.bad_regex = {}

        lookups = [{'local_id': 'C5604', 'global_id': 'NCI:C5604',
                    'result': 'https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=C5604'},
                   {'local_id': 'badregexdoesnotmatch', 'global_id': 'MESH:badregexdoesnotmatch',
                    'result': 'https://www.ncbi.nlm.nih.gov/mesh/badregexdoesnotmatch'},
                   {'local_id': 'Cdiff', 'global_id': 'OMIM:1111', 'result': 'https://www.omim.org/entry/1111'}]

        for item in lookups:
            url = self.etlh.get_complete_url_ont(item['local_id'], item['global_id'])
            assert url == item['result']

        if self.etlh.rdh2.missing_keys.keys():
            assert 1 == "Should be no missing keys"
        if self.etlh.rdh2.missing_pages.keys():
            assert 1 == "Should be no missing pages"
        for item_name in self.etlh.rdh2.bad_pages.keys():
            # Because local_id and global_id do not match, we will get a bad_pages entry.
            assert item_name == "OMIM-None"
        # MESH fails the regex, so make sure we got an error message;
        # we still get a url but the error is logged.
        for item_name in self.etlh.rdh2.bad_regex.keys():
            assert 1 == self.etlh.rdh2.bad_regex[item_name]
            assert item_name == 'MESH'
Exemple #30
0
class VariationETL(ETL):
    """Variation ETL."""

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later

    variation_query_template = """
            USING PERIODIC COMMIT %s
            LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

                MATCH (a:Allele {primaryKey:row.alleleId})
                MATCH (g:Gene)-[:IS_ALLELE_OF]-(a)

                //Create the variant node and set properties. primaryKey is required.
                MERGE (o:Variant {primaryKey:row.hgvs_nomenclature})
                    ON CREATE SET
                     o.name = row.variantHGVSSynonym,
                     o.hgvsNomenclature = row.hgvs_nomenclature,
                     o.genomicReferenceSequence = row.genomicReferenceSequence,
                     o.paddingLeft = row.paddingLeft,
                     o.paddingRight = row.paddingRight,
                     o.genomicVariantSequence = row.genomicVariantSequence,
                     o.dateProduced = row.dateProduced,
                     o.release = row.release,
                     o.dataProviders = row.dataProviders,
                     o.dataProvider = row.dataProvider

                MERGE (s:Synonym:Identifier {primaryKey:row.hgvs_nomenclature})
                    SET s.name = row.hgvs_nomenclature
                MERGE (o)-[aka2:ALSO_KNOWN_AS]->(s)

                MERGE (o)-[:VARIATION]->(a)
                MERGE (g)-[:COMPUTED_GENE]->(o) """

    so_terms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Variant {primaryKey:row.variantId})
            MATCH (s:SOTerm {primaryKey:row.soTermId})
            CREATE (o)-[:VARIATION_TYPE]->(s)"""

    genomic_locations_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Variant {primaryKey:row.variantId})
            MATCH (chrm:Chromosome {primaryKey:row.chromosome})
            MERGE (a:Assembly {primaryKey:row.assembly})
             ON CREATE SET a.dataProvider = row.dataProvider

            CREATE (o)-[gchrm:LOCATED_ON]->(chrm)

            CREATE (gchrmn:GenomicLocation {primaryKey:row.uuid})
              SET gchrmn.start = apoc.number.parseInt(row.start),
                gchrmn.end = apoc.number.parseInt(row.end),
                gchrmn.assembly = row.assembly,
                gchrmn.strand = row.strand,
                gchrmn.chromosome = row.chromosome

            CREATE (o)-[of:ASSOCIATION]->(gchrmn)
            CREATE (gchrmn)-[ofc:ASSOCIATION]->(chrm)
            CREATE (gchrmn)-[ao:ASSOCIATION]->(a)
    """

    xrefs_query_template = """

        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            MATCH (o:Variant {primaryKey:row.dataId})
    """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, ))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):

        self.logger.info("Loading Variation Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Variation Data: %s",
                         sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored

        query_template_list = [
            [
                self.variation_query_template, commit_size,
                "variation_data_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.genomic_locations_query_template, commit_size,
                "variant_genomiclocations_" + sub_type.get_data_provider() +
                ".csv"
            ],
            [
                self.so_terms_query_template, commit_size,
                "variant_so_terms_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.xrefs_query_template, commit_size,
                "variant_xrefs_" + sub_type.get_data_provider() + ".csv"
            ]
        ]

        generators = self.get_generators(data, batch_size)
        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Var-{}: ".format(sub_type.get_data_provider()))

    def get_hgvs_nomenclature(self, refseq_id, variant_type, start_position,
                              end_position, reference_sequence,
                              variant_sequence, assembly, chromosome):
        """Get HGVS nomenclature."""
        if start_position is None:
            start_position_str = ""
        else:
            start_position_str = str(start_position)

        if end_position is None:
            end_position_str = ""
        else:
            end_position_str = str(end_position)

        if variant_sequence is None:
            variant_sequence_str = ""
        else:
            variant_sequence_str = variant_sequence

        if reference_sequence is None:
            reference_sequence_str = ""
        else:
            reference_sequence_str = reference_sequence

        hgvs_nomenclature = refseq_id.split(
            ":")[1] + ':g.' + start_position_str
        hgvs_synonym = '(' + assembly + ')' + chromosome + ':' + start_position_str

        if variant_type in ['SO:1000002',
                            'SO:1000008']:  # point mutation/substitution
            hgvs_nomenclature += reference_sequence_str + ">" + variant_sequence_str
            hgvs_synonym += reference_sequence_str + ">" + variant_sequence_str
        elif variant_type == "SO:0000667":  # insertion
            hgvs_nomenclature += '_' + end_position_str + 'ins' + variant_sequence_str
            hgvs_synonym += '_' + end_position_str + 'ins' + variant_sequence_str
        elif variant_type == "SO:0000159":  # deletion
            hgvs_nomenclature += '_' + end_position_str + 'del'
            hgvs_synonym += '_' + end_position_str + 'del'
        elif variant_type == "SO:0002007":  # MNV
            hgvs_nomenclature += '_' + end_position_str + 'delins' + variant_sequence_str
            hgvs_synonym += '_' + end_position_str + 'delins' + variant_sequence_str
        elif variant_type == "SO:1000032":  # DELIN
            hgvs_nomenclature += '_' + end_position_str + 'delins' + variant_sequence_str
            hgvs_synonym += '_' + end_position_str + 'delins' + variant_sequence_str
        else:
            hgvs_nomenclature = ''
            hgvs_synonym = ''
        return hgvs_nomenclature, hgvs_synonym
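
    # Worked example (illustrative values only, not taken from the loader):
    # a point mutation (SO:1000002) with refseq_id "RefSeq:NC_005120.4",
    # start 123, end 123, reference sequence "A", variant sequence "T",
    # assembly "Rnor_6.0" and chromosome "14" would produce:
    #
    #   hgvs_nomenclature == "NC_005120.4:g.123A>T"
    #   hgvs_synonym      == "(Rnor_6.0)14:123A>T"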

    def get_generators(self, variant_data, batch_size):  # noqa
        """Get Generators."""

        data_providers = []
        release = ""
        variants = []
        variant_genomic_locations = []
        variant_so_terms = []
        cross_references = []
        counter = 0
        date_produced = variant_data['metaData']['dateProduced']

        self.data_providers_process(variant_data)
        load_key = date_produced + self.data_provider + "_VARIATION"

        if 'release' in variant_data['metaData']:
            release = variant_data['metaData']['release']

        assemblies = {}
        for allele_record in variant_data['data']:
            chromosome = allele_record["chromosome"]
            if chromosome.startswith("chr"):
                chromosome_str = chromosome[3:]
            else:
                chromosome_str = chromosome

            assembly = allele_record["assembly"]

            if assembly not in assemblies:
                self.logger.info(assembly)
                context_info = ContextInfo()
                data_manager = DataFileManager(
                    context_info.config_file_location)
                assemblies[assembly] = AssemblySequenceHelper(
                    assembly, data_manager)

            so_term_id = allele_record.get('type')
            genomic_reference_sequence = allele_record.get(
                'genomicReferenceSequence')
            genomic_variant_sequence = allele_record.get(
                'genomicVariantSequence')

            if genomic_reference_sequence == 'N/A':
                genomic_reference_sequence = ""
            if genomic_variant_sequence == 'N/A':
                genomic_variant_sequence = ""

            padding_left = ""
            padding_right = ""
            if allele_record.get('start') != "" and allele_record.get(
                    'end') != "":

                # not insertion
                if so_term_id != "SO:0000667" and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                    genomic_reference_sequence = assemblies[
                        assembly].get_sequence(chromosome_str,
                                               allele_record.get('start'),
                                               allele_record.get('end'))

                if allele_record.get('start') < allele_record.get('end'):
                    start = allele_record.get('start')
                    end = allele_record.get('end')
                else:
                    start = allele_record.get('end')
                    end = allele_record.get('start')

                padding_width = 500
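                # Fetch up to padding_width bp of genomic sequence flanking
                # the variant on each side; the left bound is clamped at
                # position 1.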
                if so_term_id != "SO:0000667":  # not insertion
                    start = start - 1
                    end = end + 1

                left_padding_start = start - padding_width
                if left_padding_start < 1:
                    left_padding_start = 1

                padding_left = assemblies[assembly].get_sequence(
                    chromosome_str, left_padding_start, start)
                right_padding_end = end + padding_width
                padding_right = assemblies[assembly].get_sequence(
                    chromosome_str, end, right_padding_end)
            counter = counter + 1
            global_id = allele_record.get('alleleId')
            mod_global_cross_ref_id = ""
            cross_references = []

            if self.test_object.using_test_data() is True:
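                # In test mode, skip alleles that are not in the configured
                # set of test IDs (and undo the counter increment above).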
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            cross_ref_primary_id = allele_record.get(
                'sequenceOfReferenceAccessionNumber')
            if cross_ref_primary_id is not None:
                local_cross_ref_id = cross_ref_primary_id.split(":")[1]
                prefix = cross_ref_primary_id.split(":")[0]

                cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                    local_cross_ref_id, prefix, global_id)
                xref_map = ETLHelper.get_xref_dict(
                    local_cross_ref_id, prefix,
                    "variant_sequence_of_reference",
                    "sequence_of_reference_accession_number", global_id,
                    cross_ref_complete_url,
                    cross_ref_primary_id + "variant_sequence_of_reference")

                xref_map['dataId'] = global_id
                cross_references.append(xref_map)

            if genomic_reference_sequence is not None:
                if len(genomic_reference_sequence) > 1000 and (
                        allele_record.get('type')
                        in ['SO:1000002', 'SO:1000008']):
                    self.logger.debug(
                        "%s has a genomicReferenceSequence longer than 1000 bp",
                        allele_record.get('alleleId'))

            if genomic_variant_sequence is not None:
                if len(genomic_variant_sequence) > 1000 and (
                        allele_record.get('type')
                        in ['SO:1000002', 'SO:1000008']):
                    self.logger.debug(
                        "%s has a genomicVariantSequence longer than 1000 bp",
                        allele_record.get('alleleId'))

            hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
                allele_record.get('sequenceOfReferenceAccessionNumber'),
                allele_record.get('type'), allele_record.get('start'),
                allele_record.get('end'),
                genomic_reference_sequence, genomic_variant_sequence,
                allele_record.get('assembly'), chromosome_str)

            if (genomic_reference_sequence is not None and len(genomic_reference_sequence) > 30000) \
                    or (genomic_variant_sequence is not None and len(genomic_variant_sequence) > 30000):
                self.logger.debug(
                    "%s potentially has too long a sequence",
                    allele_record.get('alleleId'))

            # TODO: fix typo in MGI Submission for this variant so
            # that it doesn't list a 40K bp point mutation.
            if allele_record.get('alleleId') != 'MGI:6113870':
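                # Assemble the per-variant records for the three output row
                # sets (variants, genomic locations, and SO-term links).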

                variant_dataset = {
                    "hgvs_nomenclature": hgvs_nomenclature,
                    "genomicReferenceSequence": genomic_reference_sequence,
                    "genomicVariantSequence": genomic_variant_sequence,
                    "paddingLeft": padding_left,
                    "paddingRight": padding_right,
                    "alleleId": allele_record.get('alleleId'),
                    "dataProviders": data_providers,
                    "dateProduced": date_produced,
                    "loadKey": load_key,
                    "release": release,
                    "modGlobalCrossRefId": mod_global_cross_ref_id,
                    "dataProvider": self.data_provider,
                    "variantHGVSSynonym": hgvs_synonym
                }

                variant_genomic_location_dataset = {
                    "variantId": hgvs_nomenclature,
                    "assembly": allele_record.get('assembly'),
                    "chromosome": chromosome_str,
                    "start": allele_record.get('start'),
                    "end": allele_record.get('end'),
                    "uuid": str(uuid.uuid4()),
                    "dataProvider": self.data_provider
                }

                variant_so_term = {
                    "variantId": hgvs_nomenclature,
                    "soTermId": allele_record.get('type')
                }

                variant_so_terms.append(variant_so_term)
                variant_genomic_locations.append(
                    variant_genomic_location_dataset)
                variants.append(variant_dataset)

            if counter == batch_size:
                yield [
                    variants, variant_genomic_locations, variant_so_terms,
                    cross_references
                ]
                variants = []
                variant_genomic_locations = []
                variant_so_terms = []
                cross_references = []

        if counter > 0:
            yield [
                variants, variant_genomic_locations, variant_so_terms,
                cross_references
            ]
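
    # Minimal consumption sketch (hypothetical; the names below are
    # illustrative and not part of the original source). Each yielded batch
    # is a list of four row lists:
    #
    #   for variants, locations, so_terms, xrefs in etl.get_generators(data, 10000):
    #       print(len(variants), len(locations), len(so_terms), len(xrefs))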