コード例 #1
0
    def get_generators(self, disease_data, batch_size, data_provider):
        """Yield batches of disease-annotation rows parsed from ``disease_data``.

        Every record in ``disease_data['data']`` is flattened into one
        annotation dict plus supporting rows (evidence codes, primary genetic
        entities, "with" entries and cross references). Rows are accumulated
        and emitted each time ``batch_size`` records have been processed; a
        final partial batch is emitted at the end.

        Args:
            disease_data: Parsed submission. The provider id is read from
                ``['metaData']['dataProvider']['crossReference']['id']`` and
                the records from ``['data']``.
            batch_size: Number of processed records per yielded batch.
            data_provider: Kept for interface compatibility only; the value
                is always replaced by the provider id from the metadata.

        Yields:
            A list of nine lists, in this order: allele annotations, gene
            annotations, all other annotations (``agm_list_to_yield``), the
            primary-genetic-entity rows three times (the same list object),
            "with" rows, evidence-code rows and cross-reference dicts.
        """
        # Association types are swapped for their negated form when the
        # record carries a top-level 'negation' key; other types are kept.
        negated_association_types = {
            'IS_IMPLICATED_IN': 'IS_NOT_IMPLICATED_IN',
            'IS_MODEL_OF': 'IS_NOT_MODEL_OF',
            'IS_MARKER_FOR': 'IS_NOT_MARKER_FOR',
        }

        counter = 0
        gene_list_to_yield = []
        allele_list_to_yield = []
        agm_list_to_yield = []
        evidence_code_list_to_yield = []
        withs = []
        pge_list_to_yield = []
        xrefs = []

        # The provider id from the file metadata overrides the argument.
        data_provider_object = disease_data['metaData']['dataProvider']
        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')

        for disease_record in disease_data['data']:
            primary_id = disease_record.get('objectId')

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    primary_id)
                if is_it_test_entry is False:
                    continue

            do_id = disease_record.get('DOid')
            object_relation = disease_record['objectRelation']
            disease_object_type = object_relation.get("objectType")
            disease_association_type = object_relation.get(
                "associationType").upper()
            disease_unique_key = primary_id + do_id + disease_association_type

            counter += 1

            # One join key per record, created unconditionally: records
            # without an 'evidence' section must neither fail on an unbound
            # name nor silently reuse the key of the previous record.
            pecj_primary_key = str(uuid.uuid4())

            publication_mod_id = ""
            pub_med_id = ""
            pub_mod_url = None
            pub_med_url = None

            if 'evidence' in disease_record:
                evidence = disease_record.get('evidence')
                if 'publication' in evidence:
                    publication_id = evidence.get('publication').get(
                        'publicationId')
                    if publication_id.startswith('PMID:'):
                        pub_med_id = publication_id
                        pub_med_url = ETLHelper.get_complete_pub_url(
                            pub_med_id.split(":")[1], pub_med_id)
                        # A MOD id is only looked up next to a PubMed id.
                        if 'crossReference' in evidence:
                            pub_xref = evidence.get('crossReference')
                            publication_mod_id = pub_xref.get('id')
                            pub_mod_url = ETLHelper.get_complete_pub_url(
                                publication_mod_id.split(":")[1],
                                publication_mod_id)
                    else:
                        publication_mod_id = publication_id
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            publication_mod_id.split(":")[1],
                            publication_mod_id)

                if 'evidenceCodes' in evidence:
                    evidence_code_list_to_yield.extend(
                        {"pecjPrimaryKey": pecj_primary_key, "ecode": ecode}
                        for ecode in evidence.get('evidenceCodes'))

            negation = ''
            if 'negation' in disease_record:
                # this capitalization is purposeful
                disease_association_type = negated_association_types.get(
                    disease_association_type, disease_association_type)
                negation = 'NOT'
                disease_unique_key = disease_unique_key + negation

            if 'with' in disease_record:
                with_record = disease_record.get('with')
                # The key must contain every "with" id before any row that
                # references it is built.
                disease_unique_key = disease_unique_key + "".join(with_record)
                withs.extend(
                    {"diseaseUniqueKey": disease_unique_key, "withD": rec}
                    for rec in with_record)

            if 'primaryGeneticEntityIDs' in disease_record:
                pge_list_to_yield.extend(
                    {"pecjPrimaryKey": pecj_primary_key, "pgeId": pge}
                    for pge in disease_record.get('primaryGeneticEntityIDs'))

            if 'dataProvider' in disease_record:
                for dp in disease_record['dataProvider']:
                    annotation_type = dp.get('type')
                    if annotation_type is None:
                        annotation_type = 'curated'

                    xref = dp.get('crossReference')
                    cross_ref_id = xref.get('id')
                    pages = xref.get('pages')
                    if not pages:
                        continue

                    if ":" in cross_ref_id:
                        id_parts = cross_ref_id.split(":")
                        prefix = id_parts[0]
                        local_crossref_id = id_parts[1]
                    else:
                        prefix = cross_ref_id
                        local_crossref_id = ""

                    # The display name does not depend on the page: it is
                    # the id prefix, except that DOID ids are shown under
                    # the provider name ('RGD' for both RGD and HUMAN).
                    if prefix == 'DOID':
                        if data_provider in ('RGD', 'HUMAN'):
                            display_name = 'RGD'
                        else:
                            display_name = data_provider
                    else:
                        display_name = prefix

                    for page in pages:
                        mod_global_cross_ref_id = \
                            ETLHelper.get_page_complete_url(
                                local_crossref_id, self.xref_url_map, prefix,
                                page)
                        passing_xref = ETLHelper.get_xref_dict(
                            local_crossref_id, prefix, page, page,
                            display_name, mod_global_cross_ref_id,
                            cross_ref_id + page + annotation_type)
                        passing_xref['dataId'] = disease_unique_key

                        if 'loaded' in annotation_type:
                            passing_xref['loadedDB'] = 'true'
                            passing_xref['curatedDB'] = 'false'
                        else:
                            passing_xref['curatedDB'] = 'true'
                            passing_xref['loadedDB'] = 'false'

                        xrefs.append(passing_xref)

            disease_annotation = {
                "diseaseUniqueKey": disease_unique_key,
                "doId": do_id,
                "primaryId": primary_id,
                "pecjPrimaryKey": pecj_primary_key,
                "relationshipType": disease_association_type,
                "dataProvider": data_provider,
                "dateAssigned": disease_record.get("dateAssigned"),
                "pubPrimaryKey": publication_mod_id + pub_med_id,
                "pubModId": publication_mod_id,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModUrl": pub_mod_url,
                "negation": negation
            }

            if disease_object_type == 'gene':
                gene_list_to_yield.append(disease_annotation)
            elif disease_object_type == 'allele':
                allele_list_to_yield.append(disease_annotation)
            else:
                agm_list_to_yield.append(disease_annotation)

            if counter == batch_size:
                yield [
                    allele_list_to_yield, gene_list_to_yield,
                    agm_list_to_yield, pge_list_to_yield, pge_list_to_yield,
                    pge_list_to_yield, withs, evidence_code_list_to_yield,
                    xrefs
                ]
                agm_list_to_yield = []
                allele_list_to_yield = []
                gene_list_to_yield = []
                evidence_code_list_to_yield = []
                pge_list_to_yield = []
                xrefs = []
                withs = []
                counter = 0

        if counter > 0:
            yield [
                allele_list_to_yield, gene_list_to_yield, agm_list_to_yield,
                pge_list_to_yield, pge_list_to_yield, pge_list_to_yield, withs,
                evidence_code_list_to_yield, xrefs
            ]
コード例 #2
0
    def get_generators(self, phenotype_data, batch_size):
        """Yield batches of phenotype-annotation rows parsed from ``phenotype_data``.

        Every record in ``phenotype_data['data']`` becomes one phenotype dict
        plus one row per primary genetic entity id. Rows are accumulated and
        emitted each time ``batch_size`` records have been processed; a final
        partial batch is emitted at the end.

        Args:
            phenotype_data: Parsed submission. ``['metaData']`` supplies
                ``dateProduced`` and the ``dataProvider`` cross reference
                (``id`` and optional ``pages``); ``['data']`` supplies the
                records.
            batch_size: Number of processed records per yielded batch.

        Yields:
            A list of five lists: the phenotype rows three times (the same
            list object) followed by the primary-genetic-entity rows twice
            (the same list object).
        """
        date_produced = phenotype_data['metaData']['dateProduced']
        data_provider_object = phenotype_data['metaData']['dataProvider']
        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        load_key = date_produced + data_provider + "_phenotype"

        list_to_yield = []
        pge_list_to_yield = []
        data_providers = []
        # NOTE(review): this list is filled below but never read or yielded
        # in this method; it is kept because building it calls external
        # helpers whose side effects are not visible here.
        data_provider_cross_ref_set = []
        counter = 0

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, ETL.xref_url_map, data_provider,
                    data_provider_page)

                data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                    data_provider,
                    data_provider,
                    data_provider_page,
                    data_provider_page,
                    data_provider,
                    cross_ref_complete_url,
                    data_provider + data_provider_page))

                # One entry per page, so the provider id may repeat.
                data_providers.append(data_provider)
                self.logger.debug("data provider: %s", data_provider)

        for pheno in phenotype_data['data']:
            primary_id = pheno.get('objectId')

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    primary_id)
                if is_it_test_entry is False:
                    continue

            # Only records that are actually processed count towards a batch.
            counter += 1
            pecj_primary_key = str(uuid.uuid4())
            phenotype_statement = pheno.get('phenotypeStatement')

            # Publication ids default to "" so they can be concatenated into
            # pubPrimaryKey even when the record has no publication.
            pub_med_id = ""
            pub_mod_id = ""
            pub_med_url = None
            pub_mod_url = None

            # A record without an 'evidence' section is treated like one
            # without a publication instead of failing on a None lookup.
            evidence = pheno.get('evidence') or {}
            publication_id = evidence.get('publicationId')

            if publication_id is not None:
                if publication_id.startswith('PMID:'):
                    pub_med_id = publication_id
                    pub_med_parts = pub_med_id.split(":")
                    pub_med_url = ETLHelper.get_no_page_complete_url(
                        pub_med_parts[1], self.xref_url_map, pub_med_parts[0],
                        primary_id)

                    # A MOD id is only looked up next to a PubMed id.
                    if 'crossReference' in evidence:
                        pub_xref = evidence.get('crossReference')
                        xref_id = pub_xref.get('id')
                        # Check for a missing id before splitting it.
                        if xref_id is not None:
                            pub_mod_id = xref_id
                            pub_mod_url = ETLHelper.get_complete_pub_url(
                                pub_mod_id.split(":")[1], pub_mod_id)
                else:
                    pub_mod_id = publication_id
                    pub_mod_url = ETLHelper.get_complete_pub_url(
                        pub_mod_id.split(":")[1], pub_mod_id)

            date_assigned = pheno.get('dateAssigned')

            # Missing ids are represented by "", so test for emptiness; a
            # comparison against None could never be true at this point.
            if not pub_mod_id and not pub_med_id:
                self.logger.info("%s is missing pubMed and pubMod id",
                                 primary_id)

            if 'primaryGeneticEntityIDs' in pheno:
                pge_list_to_yield.extend(
                    {"pecjPrimaryKey": pecj_primary_key, "pgeId": pge}
                    for pge in pheno.get('primaryGeneticEntityIDs'))

            stripped_statement = phenotype_statement.strip()
            phenotype = {
                "primaryId": primary_id,
                "phenotypeUniqueKey": primary_id + stripped_statement,
                "phenotypeStatement": stripped_statement,
                "dateAssigned": date_assigned,
                "loadKey": load_key,
                "type": "gene",
                "dataProviders": data_providers,
                "dataProvider": data_provider,
                "dateProduced": date_produced,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModId": pub_mod_id,
                "pubModUrl": pub_mod_url,
                "pubPrimaryKey": pub_med_id + pub_mod_id,
                "pecjPrimaryKey": pecj_primary_key
            }

            list_to_yield.append(phenotype)

            if counter == batch_size:
                yield [
                    list_to_yield, list_to_yield, list_to_yield,
                    pge_list_to_yield, pge_list_to_yield
                ]
                list_to_yield = []
                pge_list_to_yield = []
                counter = 0

        if counter > 0:
            yield [
                list_to_yield, list_to_yield, list_to_yield, pge_list_to_yield,
                pge_list_to_yield
            ]