コード例 #1
0
    def get_generators(self, construct_data, data_provider, batch_size):
        """Yield batches of rows parsed from a construct submission.

        Args:
            construct_data: parsed submission holding a 'metaData' mapping
                ('dateProduced', 'dataProvider' and optionally 'release')
                and a 'data' list of construct records.
            data_provider: ignored; the provider id is always re-read from
                the metaData cross reference.  Kept so the signature stays
                unchanged for callers.
            batch_size: number of accepted records per yielded batch.

        Yields:
            A list of seven lists, in this order: constructs, secondary
            ids, synonyms, cross references, non-BGI components, components
            that carry a componentID, and components that do not.  A final
            partial batch is yielded when records remain.
        """
        constructs = []
        construct_synonyms = []
        construct_secondary_ids = []
        cross_reference_list = []
        component_details = []
        component_no_gene_details = []
        non_bgi_components = []
        data_providers = []
        data_provider_cross_ref_set = []
        counter = 0

        meta_data = construct_data['metaData']
        date_produced = meta_data['dateProduced']
        provider_xref = meta_data['dataProvider'].get('crossReference')
        data_provider = provider_xref.get('id')
        self.logger.info("DataProvider: %s", data_provider)

        load_key = date_produced + data_provider + "_construct"
        release = meta_data.get('release', "")

        # TODO: get SGD to fix their files.

        # NOTE(review): data_provider_cross_ref_set is never consumed in this
        # method; the helper calls are kept because their side effects (if
        # any) cannot be seen from here.
        for data_provider_page in provider_xref.get('pages') or ():
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, self.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(
                ETLHelper.get_xref_dict(
                    data_provider, data_provider, data_provider_page,
                    data_provider_page, data_provider,
                    cross_ref_complete_url,
                    data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.info("data provider: %s", data_provider)

        for construct_record in construct_data['data']:
            global_id = construct_record['primaryId']
            local_id = global_id.split(":")[1]

            # In test mode only the whitelisted ids are loaded; skipped
            # records do not count towards the batch size.
            if self.test_object.using_test_data() is True:
                if self.test_object.check_for_test_id_entry(
                        global_id) is False:
                    continue
            counter += 1

            # Resolve the cross references BEFORE building the construct
            # row, otherwise "modGlobalCrossRefId" is always stored empty.
            mod_global_cross_ref_id = ""
            for cross_ref in construct_record.get('crossReferences') or ():
                cross_ref_id = cross_ref.get('id')
                id_parts = cross_ref_id.split(":")
                local_crossref_id = id_parts[1]
                prefix = id_parts[0]

                # some pages collections are missing or have 0 elements
                for page in cross_ref.get('pages') or ():
                    if page != 'construct':
                        continue
                    mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                        local_crossref_id, self.xref_url_map, prefix, page)
                    xref = ETLHelper.get_xref_dict(
                        local_crossref_id, prefix, page, page,
                        cross_ref_id, mod_global_cross_ref_id,
                        cross_ref_id + page)
                    xref['dataId'] = global_id
                    cross_reference_list.append(xref)

            name = construct_record.get('name')
            name_text = TextProcessingHelper.cleanhtml(name)

            constructs.append({
                "symbol": name,
                "primaryId": global_id,
                "globalId": global_id,
                "localId": local_id,
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "release": release,
                "modGlobalCrossRefId": mod_global_cross_ref_id,
                "uuid": str(uuid.uuid4()),
                "dataProvider": data_provider,
                "nameText": name_text,
                "name": name
            })

            for component in construct_record.get('constructComponents') or ():
                component_relation = component.get('componentRelation').upper()
                component_symbol = component.get('componentSymbol')
                component_id = component.get('componentID')

                if component_id is not None:
                    component_details.append({
                        "componentRelation": component_relation,
                        "componentSymbol": component_symbol,
                        "componentID": component_id,
                        "constructID": global_id
                    })
                else:
                    # Components without an id are reported both as a bare
                    # symbol and as a construct-to-symbol relation.
                    non_bgi_components.append({
                        "componentSymbol": component_symbol
                    })
                    component_no_gene_details.append({
                        "componentRelation": component_relation,
                        "componentSymbol": component_symbol,
                        "constructID": global_id
                    })

            construct_synonyms.extend(
                {"data_id": global_id, "synonym": syn.strip()}
                for syn in construct_record.get('synonyms') or ())

            construct_secondary_ids.extend(
                {"data_id": global_id, "secondary_id": secondary_id}
                for secondary_id in construct_record.get('secondaryIds') or ())

            if counter == batch_size:
                yield [
                    constructs, construct_secondary_ids, construct_synonyms,
                    cross_reference_list, non_bgi_components,
                    component_details, component_no_gene_details
                ]
                constructs = []
                construct_secondary_ids = []
                construct_synonyms = []
                cross_reference_list = []
                non_bgi_components = []
                component_details = []
                component_no_gene_details = []
                counter = 0

        if counter > 0:
            yield [
                constructs, construct_secondary_ids, construct_synonyms,
                cross_reference_list, non_bgi_components, component_details,
                component_no_gene_details
            ]
コード例 #2
0
    def get_generators(self, disease_data, batch_size, data_provider):
        """Yield batches of rows parsed from a disease annotation submission.

        Args:
            disease_data: parsed submission holding a 'metaData' mapping
                (with 'dataProvider') and a 'data' list of disease records.
            batch_size: number of accepted records per yielded batch.
            data_provider: ignored; the provider id is always re-read from
                the metaData cross reference.  Kept so the signature stays
                unchanged for callers.

        Yields:
            A list of nine lists, in this order: allele associations, gene
            associations, AGM associations, the primary-genetic-entity rows
            three times (the same list object in each slot; presumably one
            per consuming query -- confirm against the caller), 'with' rows,
            evidence-code rows and cross references.  A final partial batch
            is yielded when records remain.
        """
        # Association types that have a dedicated negated counterpart.
        negated_association_types = {
            'IS_IMPLICATED_IN': 'IS_NOT_IMPLICATED_IN',
            'IS_MODEL_OF': 'IS_NOT_MODEL_OF',
            'IS_MARKER_FOR': 'IS_NOT_MARKER_FOR',
        }
        rgd_like_providers = ('RGD', 'HUMAN')

        counter = 0
        gene_list_to_yield = []
        allele_list_to_yield = []
        agm_list_to_yield = []
        evidence_code_list_to_yield = []
        withs = []
        pge_list_to_yield = []
        xrefs = []

        data_provider_object = disease_data['metaData']['dataProvider']
        data_provider = data_provider_object.get('crossReference').get('id')

        for disease_record in disease_data['data']:
            primary_id = disease_record.get('objectId')

            # In test mode only the whitelisted ids are loaded.
            if self.test_object.using_test_data() is True:
                if self.test_object.check_for_test_id_entry(
                        primary_id) is False:
                    continue

            do_id = disease_record.get('DOid')
            object_relation = disease_record['objectRelation']
            disease_association_type = object_relation.get(
                "associationType").upper()
            disease_object_type = object_relation.get("objectType")
            disease_unique_key = primary_id + do_id + disease_association_type
            counter += 1

            # One join key per record, generated unconditionally so that a
            # record without 'evidence' neither crashes nor silently reuses
            # the key of the previous record.
            pecj_primary_key = str(uuid.uuid4())

            publication_mod_id = ""
            pub_med_id = ""
            pub_mod_url = None
            pub_med_url = None

            if 'evidence' in disease_record:
                evidence = disease_record.get('evidence')
                if 'publication' in evidence:
                    publication_id = evidence.get('publication').get(
                        'publicationId')
                    if publication_id.startswith('PMID:'):
                        pub_med_id = publication_id
                        pub_med_url = ETLHelper.get_complete_pub_url(
                            pub_med_id.split(":")[1], pub_med_id)
                        # A MOD publication id is only read from the
                        # evidence cross reference for PubMed publications.
                        if 'crossReference' in evidence:
                            publication_mod_id = evidence.get(
                                'crossReference').get('id')
                            pub_mod_url = ETLHelper.get_complete_pub_url(
                                publication_mod_id.split(":")[1],
                                publication_mod_id)
                    else:
                        publication_mod_id = publication_id
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            publication_mod_id.split(":")[1],
                            publication_mod_id)

                evidence_code_list_to_yield.extend(
                    {"pecjPrimaryKey": pecj_primary_key, "ecode": ecode}
                    for ecode in evidence.get('evidenceCodes') or ())

            negation = ''
            if 'negation' in disease_record:
                # this capitalization is purposeful
                disease_association_type = negated_association_types.get(
                    disease_association_type, disease_association_type)
                negation = 'NOT'
                disease_unique_key = disease_unique_key + negation

            if 'with' in disease_record:
                with_record = disease_record.get('with')
                # The key must include every 'with' entry before any row
                # referencing it is emitted.
                for rec in with_record:
                    disease_unique_key = disease_unique_key + rec
                withs.extend(
                    {"diseaseUniqueKey": disease_unique_key, "withD": rec}
                    for rec in with_record)

            pge_list_to_yield.extend(
                {"pecjPrimaryKey": pecj_primary_key, "pgeId": pge}
                for pge in disease_record.get('primaryGeneticEntityIDs') or ())

            for dp in disease_record.get('dataProvider') or ():
                annotation_type = dp.get('type')
                if annotation_type is None:
                    annotation_type = 'curated'

                xref = dp.get('crossReference')
                cross_ref_id = xref.get('id')
                pages = xref.get('pages')

                if ":" in cross_ref_id:
                    local_crossref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref_id.split(":")[0]
                else:
                    local_crossref_id = ""
                    prefix = cross_ref_id

                # The display name is the id prefix, except that DOID
                # references are shown under the submitting provider
                # ('RGD' for both RGD and HUMAN submissions).
                display_name = prefix
                if prefix == 'DOID':
                    if data_provider in rgd_like_providers:
                        display_name = 'RGD'
                    else:
                        display_name = data_provider

                is_loaded = 'loaded' in annotation_type

                for page in pages or ():
                    mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                        local_crossref_id, self.xref_url_map, prefix, page)
                    passing_xref = ETLHelper.get_xref_dict(
                        local_crossref_id, prefix, page, page,
                        display_name, mod_global_cross_ref_id,
                        cross_ref_id + page + annotation_type)
                    passing_xref['dataId'] = disease_unique_key
                    passing_xref['loadedDB'] = 'true' if is_loaded else 'false'
                    passing_xref['curatedDB'] = 'false' if is_loaded else 'true'
                    xrefs.append(passing_xref)

            association_row = {
                "diseaseUniqueKey": disease_unique_key,
                "doId": do_id,
                "primaryId": primary_id,
                "pecjPrimaryKey": pecj_primary_key,
                "relationshipType": disease_association_type,
                "dataProvider": data_provider,
                "dateAssigned": disease_record.get("dateAssigned"),
                "pubPrimaryKey": publication_mod_id + pub_med_id,
                "pubModId": publication_mod_id,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModUrl": pub_mod_url,
                "negation": negation
            }

            if disease_object_type == 'gene':
                gene_list_to_yield.append(association_row)
            elif disease_object_type == 'allele':
                allele_list_to_yield.append(association_row)
            else:
                agm_list_to_yield.append(association_row)

            if counter == batch_size:
                yield [
                    allele_list_to_yield, gene_list_to_yield,
                    agm_list_to_yield, pge_list_to_yield, pge_list_to_yield,
                    pge_list_to_yield, withs, evidence_code_list_to_yield,
                    xrefs
                ]
                agm_list_to_yield = []
                allele_list_to_yield = []
                gene_list_to_yield = []
                evidence_code_list_to_yield = []
                pge_list_to_yield = []
                xrefs = []
                withs = []
                counter = 0

        if counter > 0:
            yield [
                allele_list_to_yield, gene_list_to_yield, agm_list_to_yield,
                pge_list_to_yield, pge_list_to_yield, pge_list_to_yield, withs,
                evidence_code_list_to_yield, xrefs
            ]
コード例 #3
0
    def get_generators(self, sqtr_data, data_provider, batch_size):
        """Yield batches of rows parsed from a sequence targeting reagent file.

        Args:
            sqtr_data: parsed submission holding a 'metaData' mapping
                ('dateProduced' and 'dataProvider') and a 'data' list of
                reagent records.
            data_provider: ignored; the provider id is always re-read from
                the metaData cross reference.  Kept so the signature stays
                unchanged for callers.
            batch_size: number of accepted records per yielded batch.

        Yields:
            A list of four lists, in this order: reagents, secondary ids,
            synonyms and target genes.  A final partial batch is yielded
            when records remain.
        """
        data_providers = []
        data_provider_cross_ref_set = []
        sqtrs = []
        sqtr_synonyms = []
        sqtr_secondary_ids = []
        tgs = []
        counter = 0

        meta_data = sqtr_data['metaData']
        date_produced = meta_data['dateProduced']
        provider_xref = meta_data['dataProvider'].get('crossReference')
        data_provider = provider_xref.get('id')

        load_key = date_produced + data_provider + "_SqTR"

        # NOTE(review): data_provider_cross_ref_set is never consumed in this
        # method; the helper calls are kept because their side effects (if
        # any) cannot be seen from here.
        for data_provider_page in provider_xref.get('pages') or ():
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, self.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(
                ETLHelper.get_xref_dict(
                    data_provider, data_provider, data_provider_page,
                    data_provider_page, data_provider,
                    cross_ref_complete_url,
                    data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.info("data provider: %s", data_provider)

        for sqtr_record in sqtr_data['data']:
            global_id = sqtr_record['primaryId']
            local_id = global_id.split(":")[1]

            # In test mode only the whitelisted ids are loaded; skipped
            # records do not count towards the batch size.
            if self.test_object.using_test_data() is True:
                if self.test_object.check_for_test_id_entry(
                        global_id) is False:
                    continue
            counter += 1

            sqtr_secondary_ids.extend(
                {"primaryId": global_id, "secondaryId": sid}
                for sid in sqtr_record.get('secondaryIds') or ())

            sqtr_synonyms.extend(
                {"primaryId": global_id, "synonym": syn}
                for syn in sqtr_record.get('synonyms') or ())

            tgs.extend(
                {"primaryId": global_id, "geneId": target_gene_id}
                for target_gene_id in sqtr_record.get('targetGeneIds') or ())

            # Reset per record so a reagent without a matching cross
            # reference does not inherit the URL of an earlier reagent.
            mod_global_cross_ref_url = ""

            # Iterate the same key that is checked for presence; indexing a
            # differently named key raised KeyError for every record that
            # carried 'crossReferences'.
            for cross_ref in sqtr_record.get('crossReferences') or ():
                cross_ref_id = cross_ref.get('id')
                id_parts = cross_ref_id.split(":")
                local_crossref_id = id_parts[1]
                prefix = id_parts[0]
                pages = cross_ref.get('pages')

                # some pages collections are missing or have 0 elements
                if not pages:
                    continue
                if 'sequence_targeting_reagent' in pages:
                    mod_global_cross_ref_url = ETLHelper.get_page_complete_url(
                        local_crossref_id, self.xref_url_map, prefix,
                        'sequence_targeting_reagent')

            sqtrs.append({
                "primaryId": global_id,
                "name": sqtr_record.get('name'),
                "globalId": global_id,
                "localId": local_id,
                "soTerm": sqtr_record.get('soTermId'),
                "taxonId": sqtr_record.get('taxonId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "modGlobalCrossRefUrl": mod_global_cross_ref_url,
                "dataProvider": data_provider
            })

            if counter == batch_size:
                yield [sqtrs, sqtr_secondary_ids, sqtr_synonyms, tgs]
                sqtrs = []
                sqtr_secondary_ids = []
                sqtr_synonyms = []
                tgs = []
                counter = 0

        if counter > 0:
            yield [sqtrs, sqtr_secondary_ids, sqtr_synonyms, tgs]
コード例 #4
0
    def get_generators(self, agm_data, data_provider, batch_size):
        """Yield batches of rows parsed from an affected genomic model file.

        Args:
            agm_data: parsed submission holding a 'metaData' mapping
                ('dateProduced' and 'dataProvider') and a 'data' list of
                AGM records.
            data_provider: ignored; the provider id is always re-read from
                the metaData cross reference.  Kept so the signature stays
                unchanged for callers.
            batch_size: number of accepted records per yielded batch.

        Yields:
            A list of six lists, in this order: AGMs, secondary ids,
            synonyms, components, sequence targeting reagents and
            backgrounds.  A final partial batch is yielded when records
            remain.
        """
        data_providers = []
        data_provider_cross_ref_set = []
        agms = []
        agm_synonyms = []
        agm_secondary_ids = []
        components = []
        backgrounds = []
        sqtrs = []
        counter = 0

        meta_data = agm_data['metaData']
        date_produced = meta_data['dateProduced']
        provider_xref = meta_data['dataProvider'].get('crossReference')
        data_provider = provider_xref.get('id')

        load_key = date_produced + data_provider + "_agm"

        # NOTE(review): data_provider_cross_ref_set is never consumed in this
        # method; the helper calls are kept because their side effects (if
        # any) cannot be seen from here.
        for data_provider_page in provider_xref.get('pages') or ():
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, self.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(
                ETLHelper.get_xref_dict(
                    data_provider, data_provider, data_provider_page,
                    data_provider_page, data_provider,
                    cross_ref_complete_url,
                    data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.info("data provider: %s", data_provider)

        for agm_record in agm_data['data']:
            global_id = agm_record['primaryID']
            local_id = global_id.split(":")[1]

            # In test mode only the whitelisted ids are loaded; skipped
            # records do not count towards the batch size.
            if self.test_object.using_test_data() is True:
                if self.test_object.check_for_test_id_entry(
                        global_id) is False:
                    continue
            counter += 1

            agm_secondary_ids.extend(
                {"primaryId": global_id, "secondaryId": sid}
                for sid in agm_record.get('secondaryIds') or ())

            agm_synonyms.extend(
                {"primaryId": global_id, "synonym": syn}
                for syn in agm_record.get('synonyms') or ())

            # Reset per record so a model without a matching cross reference
            # does not inherit the URL of an earlier model.
            mod_global_cross_ref_url = ""
            cross_ref = agm_record.get('crossReference')
            if cross_ref is not None:
                cross_ref_id = cross_ref.get('id')
                id_parts = cross_ref_id.split(":")
                local_crossref_id = id_parts[1]
                prefix = id_parts[0]

                # some pages collections are missing or have 0 elements
                for page in cross_ref.get('pages') or ():
                    if page in ('Fish', 'genotype', 'strain'):
                        mod_global_cross_ref_url = ETLHelper.get_page_complete_url(
                            local_crossref_id, self.xref_url_map, prefix,
                            page)

            short_species_abbreviation = ETLHelper.get_short_species_abbreviation(
                agm_record.get('taxonId'))
            name = agm_record.get('name')
            name_text = TextProcessingHelper.cleanhtml(name)
            species_suffix = " (" + short_species_abbreviation + ")"

            # TODO: make subtype required in submission file.
            subtype = agm_record.get('subtype')
            if subtype is None:
                if data_provider == 'WB':
                    subtype = 'strain'
                else:
                    subtype = 'affected_genomic_model'

            agms.append({
                "primaryId": global_id,
                "name": name,
                "globalId": global_id,
                "localId": local_id,
                "taxonId": agm_record.get('taxonId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "subtype": subtype,
                "modGlobalCrossRefUrl": mod_global_cross_ref_url,
                "dataProvider": data_provider,
                "nameText": name_text,
                "nameWithSpecies": name + species_suffix,
                "nameTextWithSpecies": name_text + species_suffix,
            })

            components.extend(
                {
                    "primaryId": global_id,
                    "componentId": component.get('alleleID'),
                    "zygosityId": component.get('zygosity')
                }
                for component in
                agm_record.get('affectedGenomicModelComponents') or ())

            sqtrs.extend(
                {"primaryId": global_id, "sqtrId": sqtr}
                for sqtr in agm_record.get('sequenceTargetingReagentIDs') or ())

            backgrounds.extend(
                {"primaryId": global_id, "backgroundId": background}
                for background in agm_record.get('parentalPopulationIDs') or ())

            if counter == batch_size:
                yield [
                    agms, agm_secondary_ids, agm_synonyms, components, sqtrs,
                    backgrounds
                ]
                agms = []
                agm_secondary_ids = []
                agm_synonyms = []
                components = []
                # sqtrs must be reset with the other lists, otherwise its
                # rows are emitted again in every following batch.
                sqtrs = []
                backgrounds = []
                counter = 0

        if counter > 0:
            yield [
                agms, agm_secondary_ids, agm_synonyms, components, sqtrs,
                backgrounds
            ]
コード例 #5
0
    def get_generators(self, phenotype_data, batch_size):
        """Get Generators"""

        list_to_yield = []
        pge_list_to_yield = []
        date_produced = phenotype_data['metaData']['dateProduced']
        data_providers = []
        data_provider_object = phenotype_data['metaData']['dataProvider']
        counter = 0
        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []
        pge_key = ''

        load_key = date_produced + data_provider + "_phenotype"

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, ETL.xref_url_map, data_provider,
                    data_provider_page)

                data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(\
                        data_provider,
                        data_provider,
                        data_provider_page,
                        data_provider_page,
                        data_provider,
                        cross_ref_complete_url,
                        data_provider + data_provider_page))

                data_providers.append(data_provider)
                self.logger.debug("data provider: %s", data_provider)

        for pheno in phenotype_data['data']:
            pecj_primary_key = str(uuid.uuid4())
            counter = counter + 1
            pub_med_id = None
            pub_mod_id = None
            pub_med_url = None
            pub_mod_url = None
            primary_id = pheno.get('objectId')
            phenotype_statement = pheno.get('phenotypeStatement')

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    primary_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            evidence = pheno.get('evidence')

            if 'publicationId' in evidence:
                if evidence.get('publicationId').startswith('PMID:'):
                    pub_med_id = evidence['publicationId']
                    local_pub_med_id = pub_med_id.split(":")[1]
                    pub_med_prefix = pub_med_id.split(":")[0]
                    pub_med_url = ETLHelper.get_no_page_complete_url(
                        local_pub_med_id, self.xref_url_map, pub_med_prefix,
                        primary_id)
                    if pub_med_id is None:
                        pub_med_id = ""

                    if 'crossReference' in evidence:
                        pub_xref = evidence.get('crossReference')
                        pub_mod_id = pub_xref.get('id')
                        pub_mod_local_id = pub_mod_id.split(":")[1]
                        if pub_mod_id is not None:
                            pub_mod_url = ETLHelper.get_complete_pub_url(
                                pub_mod_local_id, pub_mod_id)

                else:
                    pub_mod_id = evidence.get('publicationId')
                    if pub_mod_id is not None:
                        pub_mod_local_id = pub_mod_id.split(":")[1]
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            pub_mod_local_id, pub_mod_id)

                if pub_mod_id is None:
                    pub_mod_id = ""

            if pub_med_id is None:
                pub_med_id = ""

            if pub_mod_id is None:
                pub_mod_id = ""

            date_assigned = pheno.get('dateAssigned')

            if pub_mod_id is None and pub_med_id is None:
                self.logger.info("%s is missing pubMed and pubMod id",
                                 primary_id)

            if 'primaryGeneticEntityIDs' in pheno:
                pge_ids = pheno.get('primaryGeneticEntityIDs')
                for pge in pge_ids:
                    pge_key = pge_key + pge
                    pge_map = {
                        "pecjPrimaryKey": pecj_primary_key,
                        "pgeId": pge
                    }
                    pge_list_to_yield.append(pge_map)

            phenotype = {
                "primaryId": primary_id,
                "phenotypeUniqueKey": primary_id + phenotype_statement.strip(),
                "phenotypeStatement": phenotype_statement.strip(),
                "dateAssigned": date_assigned,
                "loadKey": load_key,
                "type": "gene",
                "dataProviders": data_providers,
                "dataProvider": data_provider,
                "dateProduced": date_produced,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModId": pub_mod_id,
                "pubModUrl": pub_mod_url,
                "pubPrimaryKey": pub_med_id + pub_mod_id,
                "pecjPrimaryKey": pecj_primary_key
            }

            list_to_yield.append(phenotype)

            if counter == batch_size:
                yield [
                    list_to_yield, list_to_yield, list_to_yield,
                    pge_list_to_yield, pge_list_to_yield
                ]
                list_to_yield = []
                pge_list_to_yield = []
                counter = 0

        if counter > 0:
            yield [
                list_to_yield, list_to_yield, list_to_yield, pge_list_to_yield,
                pge_list_to_yield
            ]
コード例 #6
0
    def get_generators(self, expression_file, batch_size):
        """Stream expression annotations from a JSON file and yield row batches.

        The file is read incrementally with ``ijson`` (every element found
        under the top-level ``data`` array). Each record that carries a
        ``whereExpressed`` section is flattened into several row dictionaries
        which are collected in parallel lists. Two string keys tie the rows of
        one record together:

        * ``ei_uuid`` -- gene id + assay + stage name + the where-expressed
          term ids and statement.
        * ``ebe_uuid`` -- the where-expressed term ids and statement only.

        Records rejected by the test-data filter are not counted towards the
        batch size; records without ``whereExpressed`` are counted but add no
        rows.

        Args:
            expression_file: path of the UTF-8 encoded JSON file to stream.
            batch_size: number of counted records after which a batch is
                yielded and the row lists are started afresh.

        Yields:
            A list of 17 row lists, always in this order: bio_entities,
            bio_entity_gene_aos, bio_join_entities, ao_expressions,
            cc_expressions, ao_cc_expressions, ao_qualifiers,
            ao_substructures, ao_ss_qualifiers, cc_qualifiers, stage_list,
            stage_uberon_data, uberon_ao_data, uberon_ao_other_data,
            uberon_stage_other_data, cross_references, pubs.
        """

        self.logger.debug("made it to the expression generator")

        counter = 0

        cross_references = []
        bio_entities = []
        bio_join_entities = []
        bio_entity_gene_aos = []
        pubs = []
        ao_expressions = []
        cc_expressions = []
        ao_qualifiers = []
        ao_substructures = []
        ao_ss_qualifiers = []
        cc_qualifiers = []
        ao_cc_expressions = []
        stage_list = []
        stage_uberon_data = []
        uberon_ao_data = []
        uberon_ao_other_data = []
        uberon_stage_other_data = []

        self.logger.debug("streaming json data from %s ...", expression_file)
        with codecs.open(expression_file, 'r', 'utf-8') as file_handle:
            for xpat in ijson.items(file_handle, 'data.item'):
                counter = counter + 1

                # Per-record defaults; overwritten below when the record
                # provides the corresponding value.
                pub_med_url = None
                pub_mod_url = None
                pub_med_id = ""
                publication_mod_id = ""
                stage_term_id = ""
                stage_name = ""
                stage_uberon_term_id = ""
                gene_id = xpat.get('geneId')

                if self.test_object.using_test_data() is True:
                    is_it_test_entry = self.test_object.check_for_test_id_entry(
                        gene_id)
                    if is_it_test_entry is False:
                        # Filtered records must not count towards the batch.
                        counter = counter - 1
                        continue

                evidence = xpat.get('evidence')

                # Resolve the publication: either a PubMed id (optionally
                # accompanied by a MOD cross reference) or a MOD id only.
                if 'publicationId' in evidence:
                    if evidence.get('publicationId').startswith('PMID:'):
                        pub_med_id = evidence.get('publicationId')
                        local_pub_med_id = pub_med_id.split(":")[1]
                        pub_med_prefix = pub_med_id.split(":")[0]
                        pub_med_url = ETLHelper.get_no_page_complete_url(
                            local_pub_med_id, self.xref_url_map,
                            pub_med_prefix, gene_id)

                        if 'crossReference' in evidence:
                            pub_xref = evidence.get('crossReference')
                            publication_mod_id = pub_xref.get('id')

                            if publication_mod_id is not None:
                                pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(
                                    publication_mod_id)

                    else:
                        publication_mod_id = evidence['publicationId']
                        if publication_mod_id is not None:
                            pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(
                                publication_mod_id)

                    # The ids are concatenated into "pubPrimaryKey" below, so
                    # a missing MOD id has to become an empty string.
                    if publication_mod_id is None:
                        publication_mod_id = ""

                assay = xpat.get('assay')

                if 'whereExpressed' in xpat:

                    where_expressed = xpat.get('whereExpressed')
                    cellular_component_qualifier_term_id = where_expressed.get(
                        'cellularComponentQualifierTermId')
                    cellular_component_term_id = where_expressed.get(
                        'cellularComponentTermId')
                    anatomical_structure_term_id = where_expressed.get(
                        'anatomicalStructureTermId')
                    anatomical_structure_qualifier_term_id = where_expressed.get(
                        'anatomicalStructureQualifierTermId')
                    anatomical_sub_structure_term_id = where_expressed.get(
                        'anatomicalSubStructureTermId')
                    anatomical_sub_structure_qualifier_term_id = where_expressed.get(
                        'anatomicalSubStructureQualifierTermId')
                    where_expressed_statement = where_expressed.get(
                        'whereExpressedStatement')

                    # The statement is concatenated into both keys below, so
                    # a missing value must be normalised here, before use,
                    # rather than after the keys have been built.
                    if where_expressed_statement is None:
                        where_expressed_statement = ""

                    when_expressed_stage = xpat.get('whenExpressed')

                    if 'stageTermId' in when_expressed_stage:
                        stage_term_id = when_expressed_stage.get('stageTermId')
                    if 'stageName' in when_expressed_stage:
                        stage_name = when_expressed_stage.get('stageName')

                    # TODO: making unique BioEntityGeneExpressionJoin nodes
                    # and ExpressionBioEntity nodes is tedious.
                    # TODO: Lets get the DQMs to fix this.
                    expression_unique_key = gene_id + assay + stage_name
                    expression_entity_unique_key = ""

                    if anatomical_structure_term_id is not None:
                        expression_unique_key += anatomical_structure_term_id
                        expression_entity_unique_key = anatomical_structure_term_id

                        if anatomical_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_structure_qualifier_term_id
                            expression_entity_unique_key += anatomical_structure_qualifier_term_id

                    if cellular_component_term_id is not None:
                        expression_unique_key += cellular_component_term_id
                        expression_entity_unique_key += cellular_component_term_id

                        if cellular_component_qualifier_term_id is not None:
                            expression_unique_key += cellular_component_qualifier_term_id
                            expression_entity_unique_key += cellular_component_qualifier_term_id

                    if anatomical_sub_structure_term_id is not None:
                        # NOTE(review): the sub-structure term id goes into
                        # the join key only, while its qualifier goes into
                        # both keys -- confirm this asymmetry is intended.
                        expression_unique_key += anatomical_sub_structure_term_id

                        if anatomical_sub_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_sub_structure_qualifier_term_id
                            expression_entity_unique_key \
                                    += anatomical_sub_structure_qualifier_term_id

                    expression_entity_unique_key += where_expressed_statement
                    expression_unique_key += where_expressed_statement

                    # Uberon slim terms of the structure: real terms and the
                    # literal 'Other' are collected in separate lists.
                    structure_slim_terms = where_expressed.get(
                        'anatomicalStructureUberonSlimTermIds')
                    if structure_slim_terms is not None:
                        for uberon_structure_term_object in structure_slim_terms:
                            structure_uberon_term_id = \
                                    uberon_structure_term_object.get('uberonTerm')
                            if structure_uberon_term_id is not None \
                                    and structure_uberon_term_id != 'Other':
                                structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": structure_uberon_term_id
                                }
                                uberon_ao_data.append(structure_uberon_term)
                            elif structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    # Same treatment for the sub-structure slim terms; they
                    # share the two lists used for the structure terms.
                    sub_structure_slim_terms = where_expressed.get(
                        'anatomicalSubStructureUberonSlimTermIds')
                    if sub_structure_slim_terms is not None:
                        for uberon_sub_structure_term_object in sub_structure_slim_terms:
                            sub_structure_uberon_term_id = \
                                    uberon_sub_structure_term_object.get('uberonTerm')
                            if sub_structure_uberon_term_id is not None \
                                    and sub_structure_uberon_term_id != 'Other':
                                sub_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": sub_structure_uberon_term_id
                                }
                                uberon_ao_data.append(
                                    sub_structure_uberon_term)
                            elif sub_structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    # Done only after the keys were built, so a missing
                    # cellular component contributes nothing to the keys.
                    if cellular_component_term_id is None:
                        cellular_component_term_id = ""

                    if when_expressed_stage.get(
                            'stageUberonSlimTerm') is not None:
                        stage_uberon_term_object = when_expressed_stage.get(
                            'stageUberonSlimTerm')
                        stage_uberon_term_id = stage_uberon_term_object.get(
                            "uberonTerm")
                        if stage_uberon_term_id is not None \
                            and stage_uberon_term_id != "post embryonic, pre-adult":
                            stage_uberon = {
                                "uberonStageId": stage_uberon_term_id,
                                "ei_uuid": expression_unique_key
                            }
                            stage_uberon_data.append(stage_uberon)
                        if stage_uberon_term_id == "post embryonic, pre-adult":
                            stage_uberon_other = {
                                "ei_uuid": expression_unique_key
                            }
                            uberon_stage_other_data.append(stage_uberon_other)

                    if stage_term_id is None or stage_name == 'N/A':
                        stage_term_id = ""
                        stage_name = ""
                        stage_uberon_term_id = ""

                    # NOTE: stage_name was concatenated into the key above, so
                    # it is already a string when this check is reached.
                    if stage_name is not None:
                        stage = {
                            "stageTermId": stage_term_id,
                            "stageName": stage_name,
                            "ei_uuid": expression_unique_key
                        }
                        stage_list.append(stage)
                    else:
                        stage_uberon_term_id = ""

                    if 'crossReference' in xpat:
                        cross_ref = xpat.get('crossReference')
                        cross_ref_id = cross_ref.get('id')
                        local_cross_ref_id = cross_ref_id.split(":")[1]
                        prefix = cross_ref.get('id').split(":")[0]
                        pages = cross_ref.get('pages')

                        # some pages collection have 0 elements
                        if pages is not None and len(pages) > 0:
                            for page in pages:
                                # Only this one page type produces a row.
                                if page == 'gene/expression/annotation/detail':
                                    mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                        local_cross_ref_id,
                                        self.xref_url_map,
                                        prefix, page)

                                    xref = ETLHelper.get_xref_dict(
                                        local_cross_ref_id, prefix, page, page,
                                        cross_ref_id, mod_global_cross_ref_id,
                                        cross_ref_id + page)
                                    xref['ei_uuid'] = expression_unique_key
                                    cross_references.append(xref)

                    bio_entity = {
                        "ebe_uuid": expression_entity_unique_key,
                        "whereExpressedStatement": where_expressed_statement
                    }
                    bio_entities.append(bio_entity)

                    bio_join_entity = {
                        "ei_uuid": expression_unique_key,
                        "assay": assay
                    }
                    bio_join_entities.append(bio_join_entity)

                    bio_entity_gene_ao = {
                        "geneId": gene_id,
                        "ebe_uuid": expression_entity_unique_key,
                        "anatomicalStructureTermId":
                        anatomical_structure_term_id,
                        "ei_uuid": expression_unique_key
                    }
                    bio_entity_gene_aos.append(bio_entity_gene_ao)

                    pub = {
                        "ei_uuid": expression_unique_key,
                        "pubPrimaryKey": pub_med_id + publication_mod_id,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModId": publication_mod_id,
                        "pubModUrl": pub_mod_url
                    }
                    pubs.append(pub)

                    ao_expression = {
                        "geneId": gene_id,
                        "whenExpressedStage": when_expressed_stage,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModId": publication_mod_id,
                        "pubModUrl": pub_mod_url,
                        "pubPrimaryKey": pub_med_id + publication_mod_id,
                        "uuid": str(uuid.uuid4()),
                        "assay": assay,
                        "anatomicalStructureTermId":
                        anatomical_structure_term_id,
                        "whereExpressedStatement": where_expressed_statement,
                        "ei_uuid": expression_unique_key,
                        "ebe_uuid": expression_entity_unique_key
                    }
                    ao_expressions.append(ao_expression)

                    if cellular_component_qualifier_term_id is not None:

                        cc_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "cellularComponentQualifierTermId":
                            cellular_component_qualifier_term_id
                        }
                        cc_qualifiers.append(cc_qualifier)

                    # A cellular-component row is emitted only for records
                    # that have no anatomical structure term.
                    if anatomical_structure_term_id is None:
                        anatomical_structure_term_id = ""

                        cc_expression = {
                            "geneId": gene_id,
                            "whenExpressedStage": when_expressed_stage,
                            "pubMedId": pub_med_id,
                            "pubMedUrl": pub_med_url,
                            "pubModId": publication_mod_id,
                            "pubModUrl": pub_mod_url,
                            "pubPrimaryKey": pub_med_id + publication_mod_id,
                            "assay": assay,
                            "whereExpressedStatement":
                            where_expressed_statement,
                            "cellularComponentTermId":
                            cellular_component_term_id,
                            "ei_uuid": expression_unique_key,
                            "ebe_uuid": expression_entity_unique_key
                        }
                        cc_expressions.append(cc_expression)

                    if anatomical_structure_qualifier_term_id is not None:
                        ao_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalStructureQualifierTermId":
                            anatomical_structure_qualifier_term_id
                        }

                        ao_qualifiers.append(ao_qualifier)

                    if anatomical_sub_structure_term_id is not None:
                        ao_substructure = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureTermId":
                            anatomical_sub_structure_term_id
                        }

                        ao_substructures.append(ao_substructure)

                    if anatomical_sub_structure_qualifier_term_id is not None:
                        ao_ss_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureQualifierTermId":
                            anatomical_sub_structure_qualifier_term_id
                        }

                        ao_ss_qualifiers.append(ao_ss_qualifier)

                    # Both ids were normalised to "" above when missing, so
                    # this selects records carrying both kinds of term.
                    if anatomical_structure_term_id is not None \
                            and anatomical_structure_term_id != "" \
                            and cellular_component_term_id is not None \
                            and cellular_component_term_id != "":

                        ao_cc_expression = {
                            "geneId": gene_id,
                            "whenExpressedStage": when_expressed_stage,
                            "pubMedId": pub_med_id,
                            "pubMedUrl": pub_med_url,
                            "pubModId": publication_mod_id,
                            "pubModUrl": pub_mod_url,
                            "pubPrimaryKey": pub_med_id + publication_mod_id,
                            "uuid": str(uuid.uuid4()),
                            "stageTermId": stage_term_id,
                            "stageName": stage_name,
                            "stageUberonTermId": stage_uberon_term_id,
                            "assay": assay,
                            "cellularComponentTermId":
                            cellular_component_term_id,
                            "anatomicalStructureTermId":
                            anatomical_structure_term_id,
                            "whereExpressedStatement":
                            where_expressed_statement,
                            "ei_uuid": expression_unique_key,
                            "ebe_uuid": expression_entity_unique_key
                        }

                        ao_cc_expressions.append(ao_cc_expression)

                if counter == batch_size:
                    yield [
                        bio_entities, bio_entity_gene_aos, bio_join_entities,
                        ao_expressions, cc_expressions, ao_cc_expressions,
                        ao_qualifiers, ao_substructures, ao_ss_qualifiers,
                        cc_qualifiers, stage_list, stage_uberon_data,
                        uberon_ao_data, uberon_ao_other_data,
                        uberon_stage_other_data, cross_references, pubs
                    ]
                    # Rebind (rather than clear) so the lists just handed to
                    # the consumer are not mutated afterwards.
                    bio_entities = []
                    bio_join_entities = []
                    ao_expressions = []
                    cc_expressions = []
                    ao_qualifiers = []
                    ao_substructures = []
                    ao_ss_qualifiers = []
                    cc_qualifiers = []
                    ao_cc_expressions = []
                    stage_list = []
                    uberon_stage_other_data = []
                    stage_uberon_data = []
                    uberon_ao_other_data = []
                    uberon_ao_data = []
                    cross_references = []
                    bio_entity_gene_aos = []
                    pubs = []
                    counter = 0

            # Flush the final, partially filled batch.
            if counter > 0:
                yield [
                    bio_entities, bio_entity_gene_aos, bio_join_entities,
                    ao_expressions, cc_expressions, ao_cc_expressions,
                    ao_qualifiers, ao_substructures, ao_ss_qualifiers,
                    cc_qualifiers, stage_list, stage_uberon_data,
                    uberon_ao_data, uberon_ao_other_data,
                    uberon_stage_other_data, cross_references, pubs
                ]
コード例 #7
0
    def get_generators(self, allele_data, batch_size):

        data_providers = []
        release = ""
        alleles_no_constrcut_no_gene = []
        alleles_construct_gene = []
        alleles_no_construct = []
        alleles_no_gene = []
        allele_synonyms = []
        allele_secondary_ids = []
        cross_reference_list = []

        counter = 0
        date_produced = allele_data['metaData']['dateProduced']

        data_provider_object = allele_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []

        loadKey = date_produced + data_provider + "_ALLELE"

        # TODO: get SGD to fix their files.

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(data_provider, self.xref_url_map, data_provider,
                                                                      data_provider_page)

                data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(data_provider, data_provider, data_provider_page,
                                                                       data_provider_page, data_provider,
                                                                       cross_ref_complete_url,
                                                                       data_provider + data_provider_page))

                data_providers.append(data_provider)
                logger.info("data provider: " + data_provider)

        if 'release' in allele_data['metaData']:
            release = allele_data['metaData']['release']

        for allele_record in allele_data['data']:
            counter = counter + 1
            global_id = allele_record['primaryId']
            # fixing parsing error on this end while MGI fixes on their end.
            if global_id == 'MGI:3826848':
                description = allele_record.get('description')[:-2]
            else:
                description = allele_record.get('description')

            local_id = global_id.split(":")[1]
            mod_global_cross_ref_id = ""

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            gene_id = ''
            construct_id = ''
            association_type = ''

            short_species_abbreviation = ETLHelper.get_short_species_abbreviation(allele_record.get('taxonId'))
            symbol_text = TextProcessingHelper.cleanhtml(allele_record.get('symbol'))

            if allele_record.get('alleleObjectRelations') is not None:
                for relation in allele_record.get('alleleObjectRelations'):
                    association_type = relation.get('objectRelation').get('associationType')
                    if relation.get('objectRelation').get('gene') is not None:
                        gene_id = relation.get('objectRelation').get('gene')
                    if relation.get('objectRelation').get('construct') is not None:
                        construct_id = relation.get('objectRelation').get('construct')

                    if gene_id != '' and construct_id != '':
                        allele_construct_gene_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "geneId": gene_id,
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "constructId": construct_id,
                            "associationType": association_type
                        }
                        alleles_construct_gene.append(allele_construct_gene_dataset)

                    elif construct_id != '' and gene_id == '':
                        allele_construct_no_gene_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "constructId": construct_id,
                            "associationType": association_type
                        }

                        alleles_no_gene.append(allele_construct_no_gene_dataset)

                    elif gene_id != '' and construct_id == '':
                        allele_gene_no_construct_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "geneId": gene_id,
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "associationType": association_type
                        }

                        alleles_no_construct.append(allele_gene_no_construct_dataset)

                    elif gene_id == '' and construct_id == '':
                        allele_no_gene_no_construct_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "associationType": association_type
                        }

                        alleles_no_constrcut_no_gene.append(allele_no_gene_no_construct_dataset)

            else:
                allele_no_gene_no_construct_dataset = {
                    "symbol": allele_record.get('symbol'),
                    "primaryId": allele_record.get('primaryId'),
                    "globalId": global_id,
                    "localId": local_id,
                    "taxonId": allele_record.get('taxonId'),
                    "dataProviders": data_providers,
                    "dateProduced": date_produced,
                    "loadKey": loadKey,
                    "release": release,
                    "modGlobalCrossRefId": mod_global_cross_ref_id,
                    "uuid": str(uuid.uuid4()),
                    "dataProvider": data_provider,
                    "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                    "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                    "symbolText": symbol_text,
                    "alleleDescription": description,
                    "associationType": association_type
                }
                alleles_no_constrcut_no_gene.append(allele_no_gene_no_construct_dataset)

            if 'crossReferences' in allele_record:

                for crossRef in allele_record['crossReferences']:
                    crossRefId = crossRef.get('id')
                    local_crossref_id = crossRefId.split(":")[1]
                    prefix = crossRef.get('id').split(":")[0]
                    pages = crossRef.get('pages')

                    # some pages collection have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            if page == 'allele' or page == 'allele/references' or page == 'transgene' or page == 'construct' \
                                    or page == 'transgene/references' or page == 'construct/references':
                                mod_global_cross_ref_id = ETLHelper.get_page_complete_url(local_crossref_id,
                                                                                      self.xref_url_map, prefix, page)
                                xref = ETLHelper.get_xref_dict(local_crossref_id, prefix, page, page, crossRefId,
                                                               mod_global_cross_ref_id, crossRefId + page)
                                xref['dataId'] = global_id
                                cross_reference_list.append(xref)

            if 'synonyms' in allele_record:
                for syn in allele_record.get('synonyms'):
                    allele_synonym = {
                        "data_id": allele_record.get('primaryId'),
                        "synonym": syn.strip()
                    }
                    allele_synonyms.append(allele_synonym)

            if 'secondaryIds' in allele_record:
                for secondary_id in allele_record.get('secondaryIds'):
                    allele_secondary_id = {
                        "data_id": allele_record.get('primaryId'),
                        "secondary_id": secondary_id
                    }
                    allele_secondary_ids.append(allele_secondary_id)

            if counter == batch_size:
                yield [alleles_no_construct, alleles_construct_gene, alleles_no_gene, alleles_no_constrcut_no_gene,
                       allele_secondary_ids, allele_synonyms, cross_reference_list]
                alleles_no_construct = []
                alleles_construct_gene = []
                alleles_no_gene = []
                alleles_no_constrcut_no_gene = []

                allele_secondary_ids = []
                allele_synonyms = []
                cross_reference_list = []
                counter = 0

        if counter > 0:
            yield [alleles_no_construct, alleles_construct_gene, alleles_no_gene, alleles_no_constrcut_no_gene,
                   allele_secondary_ids, allele_synonyms, cross_reference_list]
コード例 #8
0
    def get_generators(self, variant_data, batch_size):
        """Yield batches of variant records prepared for loading.

        Reads ``dateProduced``, ``dataProvider`` and (optionally) ``release``
        from ``variant_data['metaData']``, then walks ``variant_data['data']``
        and builds, per record, a variant dict, a genomic-location dict, an
        SO-term dict and a sequence-of-reference cross reference.

        :param variant_data: parsed submission with ``metaData`` and ``data``
            keys.
        :param batch_size: number of accepted records per yielded batch.
        :return: generator of ``[variants, variant_genomic_locations,
            variant_so_terms, cross_references]`` lists; one list per full
            batch plus one for any trailing partial batch.
        """

        insertion_so_term = "SO:0000667"
        large_sequence_so_terms = ('SO:1000002', 'SO:1000008')
        padding_width = 500

        data_providers = []
        release = ""
        variants = []
        variant_genomic_locations = []
        variant_so_terms = []
        cross_references = []
        counter = 0

        meta_data = variant_data['metaData']
        date_produced = meta_data['dateProduced']
        data_provider_object = meta_data['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        # NOTE(review): this set is populated but not read again in this
        # method; kept so the helper calls below still run as before.
        data_provider_cross_ref_set = []

        load_key = date_produced + data_provider + "_VARIATION"

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, self.xref_url_map, data_provider,
                    data_provider_page)

                data_provider_cross_ref_set.append(
                    ETLHelper.get_xref_dict(
                        data_provider, data_provider, data_provider_page,
                        data_provider_page, data_provider,
                        cross_ref_complete_url,
                        data_provider + data_provider_page))

                data_providers.append(data_provider)
                self.logger.debug("data provider: %s", data_provider)

        if 'release' in meta_data:
            release = meta_data['release']

        # One sequence helper per assembly, created on first use.
        assemblies = {}
        for allele_record in variant_data['data']:
            chromosome = allele_record["chromosome"]
            if chromosome.startswith("chr"):
                chromosome_str = chromosome[3:]
            else:
                chromosome_str = chromosome

            assembly = allele_record["assembly"]

            if assembly not in assemblies:
                self.logger.info(assembly)
                context_info = ContextInfo()
                data_manager = DataFileManager(
                    context_info.config_file_location)
                assemblies[assembly] = AssemblySequenceHelper(
                    assembly, data_manager)
            sequence_helper = assemblies[assembly]

            allele_id = allele_record.get('alleleId')
            so_term_id = allele_record.get('type')
            record_start = allele_record.get('start')
            record_end = allele_record.get('end')
            genomic_reference_sequence = allele_record.get(
                'genomicReferenceSequence')
            genomic_variant_sequence = allele_record.get(
                'genomicVariantSequence')

            # Submissions use 'N/A' as a placeholder for "no sequence".
            if genomic_reference_sequence == 'N/A':
                genomic_reference_sequence = ""
            if genomic_variant_sequence == 'N/A':
                genomic_variant_sequence = ""

            padding_left = ""
            padding_right = ""
            if record_start != "" and record_end != "":

                # For anything other than an insertion, take the reference
                # sequence from the assembly instead of the submission.
                if so_term_id != insertion_so_term \
                        and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                    genomic_reference_sequence = sequence_helper.get_sequence(
                        chromosome_str, record_start, record_end)

                # Normalise so that start <= end before computing padding.
                if record_start < record_end:
                    start = record_start
                    end = record_end
                else:
                    start = record_end
                    end = record_start

                if so_term_id != insertion_so_term:  # not insertion
                    start = start - 1
                    end = end + 1

                # Clamp the left flank at the first base of the chromosome.
                left_padding_start = max(start - padding_width, 1)

                padding_left = sequence_helper.get_sequence(
                    chromosome_str, left_padding_start, start)
                right_padding_end = end + padding_width
                padding_right = sequence_helper.get_sequence(
                    chromosome_str, end, right_padding_end)

            global_id = allele_id
            mod_global_cross_ref_id = ""

            # When running on test data, records outside the test set are
            # skipped and do not count toward the batch.
            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    continue

            counter = counter + 1

            cross_ref_primary_id = allele_record.get(
                'sequenceOfReferenceAccessionNumber')
            # Guard before splitting: a record without an accession number
            # contributes no cross reference instead of raising.
            if cross_ref_primary_id is not None:
                id_parts = cross_ref_primary_id.split(":")
                prefix = id_parts[0]
                local_cross_ref_id = id_parts[1]

                cross_ref_complete_url = ETLHelper.get_no_page_complete_url(
                    local_cross_ref_id, ETL.xref_url_map, prefix, global_id)
                xref_map = ETLHelper.get_xref_dict(
                    local_cross_ref_id, prefix,
                    "variant_sequence_of_reference",
                    "sequence_of_reference_accession_number", global_id,
                    cross_ref_complete_url,
                    cross_ref_primary_id + "variant_sequence_of_reference")

                xref_map['dataId'] = global_id
                # Accumulated across the whole batch; cleared only on yield.
                cross_references.append(xref_map)

            if genomic_reference_sequence is not None:
                if len(genomic_reference_sequence) > 1000 \
                        and so_term_id in large_sequence_so_terms:
                    self.logger.debug("%s genomicReferenceSequence",
                                      allele_id)

            if genomic_variant_sequence is not None:
                if len(genomic_variant_sequence) > 1000 \
                        and so_term_id in large_sequence_so_terms:
                    self.logger.debug("%s genomicVariantSequence",
                                      allele_id)

            hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
                cross_ref_primary_id,
                so_term_id, record_start,
                record_end,
                genomic_reference_sequence, genomic_variant_sequence,
                assembly, chromosome_str)

            reference_too_long = (genomic_reference_sequence is not None
                                  and len(genomic_reference_sequence) > 30000)
            variant_too_long = (genomic_variant_sequence is not None
                                and len(genomic_variant_sequence) > 30000)
            if reference_too_long or variant_too_long:
                self.logger.debug(
                    "%s has too long of a sequence potentionally",
                    allele_id)

            # TODO: fix typo in MGI Submission for this variant so
            # that it doesn't list a 40K bp point mutation.
            if allele_id != 'MGI:6113870':

                variant_dataset = {
                    "hgvs_nomenclature": hgvs_nomenclature,
                    "genomicReferenceSequence": genomic_reference_sequence,
                    "genomicVariantSequence": genomic_variant_sequence,
                    "paddingLeft": padding_left,
                    "paddingRight": padding_right,
                    "alleleId": allele_id,
                    "dataProviders": data_providers,
                    "dateProduced": date_produced,
                    "loadKey": load_key,
                    "release": release,
                    "modGlobalCrossRefId": mod_global_cross_ref_id,
                    "dataProvider": data_provider,
                    "variantHGVSSynonym": hgvs_synonym
                }

                variant_genomic_location_dataset = {
                    "variantId": hgvs_nomenclature,
                    "assembly": assembly,
                    "chromosome": chromosome_str,
                    "start": record_start,
                    "end": record_end,
                    "uuid": str(uuid.uuid4()),
                    "dataProvider": data_provider
                }

                variant_so_term = {
                    "variantId": hgvs_nomenclature,
                    "soTermId": so_term_id
                }

                variant_so_terms.append(variant_so_term)
                variant_genomic_locations.append(
                    variant_genomic_location_dataset)
                variants.append(variant_dataset)

            if counter == batch_size:
                yield [
                    variants, variant_genomic_locations, variant_so_terms,
                    cross_references
                ]
                variants = []
                variant_genomic_locations = []
                variant_so_terms = []
                cross_references = []
                # Restart the count so every batch is batch_size records.
                counter = 0

        # Flush whatever is left over after the last full batch.
        if counter > 0:
            yield [
                variants, variant_genomic_locations, variant_so_terms,
                cross_references
            ]