コード例 #1
0
    def get_generators(self, expression_file, batch_size):
        """Stream expression records from a JSON file and yield them in batches.

        The file is opened as UTF-8 and parsed incrementally with ``ijson``
        (one item per element under ``data``).  Every record that carries a
        ``whereExpressed`` section is flattened into several row dictionaries
        which are collected in parallel lists.

        Args:
            expression_file: path of the expression JSON file to stream.
            batch_size: number of accepted records after which a batch is
                yielded.  Records without ``whereExpressed`` still count
                towards the batch; records rejected by the test-data filter
                do not.

        Yields:
            A list of 17 lists, always in this order: bio_entities,
            bio_entity_gene_aos, bio_join_entities, ao_expressions,
            cc_expressions, ao_cc_expressions, ao_qualifiers,
            ao_substructures, ao_ss_qualifiers, cc_qualifiers, stage_list,
            stage_uberon_data, uberon_ao_data, uberon_ao_other_data,
            uberon_stage_other_data, cross_references, pubs.  A final,
            possibly shorter, batch is yielded when records remain.
        """

        self.logger.debug("made it to the expression generator")

        counter = 0

        cross_references = []
        bio_entities = []
        bio_join_entities = []
        bio_entity_gene_aos = []
        pubs = []
        ao_expressions = []
        cc_expressions = []
        ao_qualifiers = []
        ao_substructures = []
        ao_ss_qualifiers = []
        cc_qualifiers = []
        ao_cc_expressions = []
        stage_list = []
        stage_uberon_data = []
        uberon_ao_data = []
        uberon_ao_other_data = []
        uberon_stage_other_data = []

        self.logger.debug("streaming json data from %s ...", expression_file)
        with codecs.open(expression_file, 'r', 'utf-8') as file_handle:
            for xpat in ijson.items(file_handle, 'data.item'):
                counter = counter + 1

                pub_med_url = None
                pub_mod_url = None
                pub_med_id = ""
                publication_mod_id = ""
                stage_term_id = ""
                stage_name = ""
                stage_uberon_term_id = ""
                gene_id = xpat.get('geneId')

                if self.test_object.using_test_data() is True:
                    is_it_test_entry = self.test_object.check_for_test_id_entry(
                        gene_id)
                    if is_it_test_entry is False:
                        # Filtered records must not count towards the batch.
                        counter = counter - 1
                        continue

                evidence = xpat.get('evidence')

                if 'publicationId' in evidence:
                    if evidence.get('publicationId').startswith('PMID:'):
                        # PubMed publication; the MOD id, if any, comes from
                        # the evidence cross reference.
                        pub_med_id = evidence.get('publicationId')
                        local_pub_med_id = pub_med_id.split(":")[1]
                        pub_med_prefix = pub_med_id.split(":")[0]
                        pub_med_url = ETLHelper.get_no_page_complete_url(
                            local_pub_med_id, self.xref_url_map,
                            pub_med_prefix, gene_id)

                        if 'crossReference' in evidence:
                            pub_xref = evidence.get('crossReference')
                            publication_mod_id = pub_xref.get('id')

                            if publication_mod_id is not None:
                                pub_mod_url = \
                                    ETLHelper.get_expression_pub_annotation_xref(
                                        publication_mod_id)

                    else:
                        # Any non-PMID publication id is used as the MOD id.
                        # It cannot be None here: .startswith() was just
                        # called on it.
                        publication_mod_id = evidence['publicationId']
                        pub_mod_url = \
                            ETLHelper.get_expression_pub_annotation_xref(
                                publication_mod_id)

                    # The cross reference may carry no id; the id is
                    # concatenated into pubPrimaryKey below.
                    if publication_mod_id is None:
                        publication_mod_id = ""

                assay = xpat.get('assay')

                if 'whereExpressed' in xpat:

                    where_expressed = xpat.get('whereExpressed')
                    cellular_component_qualifier_term_id = where_expressed.get(
                        'cellularComponentQualifierTermId')
                    cellular_component_term_id = where_expressed.get(
                        'cellularComponentTermId')
                    anatomical_structure_term_id = where_expressed.get(
                        'anatomicalStructureTermId')
                    anatomical_structure_qualifier_term_id = where_expressed.get(
                        'anatomicalStructureQualifierTermId')
                    anatomical_sub_structure_term_id = where_expressed.get(
                        'anatomicalSubStructureTermId')
                    anatomical_sub_structure_qualifier_term_id = where_expressed.get(
                        'anatomicalSubStructureQualifierTermId')
                    where_expressed_statement = where_expressed.get(
                        'whereExpressedStatement')

                    # Normalise a missing statement before it is used: it is
                    # concatenated into both unique keys below, which would
                    # raise TypeError on None.
                    if where_expressed_statement is None:
                        where_expressed_statement = ""

                    when_expressed_stage = xpat.get('whenExpressed')

                    if 'stageTermId' in when_expressed_stage:
                        stage_term_id = when_expressed_stage.get('stageTermId')
                    if 'stageName' in when_expressed_stage:
                        stage_name = when_expressed_stage.get('stageName')

                    # TODO: making unique BioEntityGeneExpressionJoin nodes
                    # and ExpressionBioEntity nodes is tedious.
                    # TODO: Lets get the DQMs to fix this.
                    # Both keys are plain concatenations; the order of the
                    # parts below defines the key and must not change.
                    expression_unique_key = gene_id + assay + stage_name
                    expression_entity_unique_key = ""

                    if anatomical_structure_term_id is not None:
                        expression_unique_key += anatomical_structure_term_id
                        expression_entity_unique_key = anatomical_structure_term_id

                        if anatomical_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_structure_qualifier_term_id
                            expression_entity_unique_key += anatomical_structure_qualifier_term_id

                    if cellular_component_term_id is not None:
                        expression_unique_key += cellular_component_term_id
                        expression_entity_unique_key += cellular_component_term_id

                        if cellular_component_qualifier_term_id is not None:
                            expression_unique_key += cellular_component_qualifier_term_id
                            expression_entity_unique_key += cellular_component_qualifier_term_id

                    if anatomical_sub_structure_term_id is not None:
                        # NOTE(review): the sub-structure term id goes into the
                        # join key only, while its qualifier goes into both
                        # keys - confirm this asymmetry is intended.
                        expression_unique_key += anatomical_sub_structure_term_id

                        if anatomical_sub_structure_qualifier_term_id is not None:
                            expression_unique_key += anatomical_sub_structure_qualifier_term_id
                            expression_entity_unique_key \
                                    += anatomical_sub_structure_qualifier_term_id

                    expression_entity_unique_key += where_expressed_statement
                    expression_unique_key += where_expressed_statement

                    # Uberon slim terms for the structure: the literal
                    # 'Other' is collected separately and carries no term id.
                    if where_expressed.get(
                            'anatomicalStructureUberonSlimTermIds'
                    ) is not None:
                        for uberon_structure_term_object in \
                                where_expressed.get('anatomicalStructureUberonSlimTermIds'):
                            structure_uberon_term_id = \
                                    uberon_structure_term_object.get('uberonTerm')
                            if structure_uberon_term_id is not None \
                                    and structure_uberon_term_id != 'Other':
                                structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": structure_uberon_term_id
                                }
                                uberon_ao_data.append(structure_uberon_term)
                            elif structure_uberon_term_id is not None \
                                    and structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    # Same treatment for the sub-structure slim terms; they
                    # are appended to the same two lists as the structure's.
                    if where_expressed.get(
                            'anatomicalSubStructureUberonSlimTermIds'
                    ) is not None:
                        for uberon_sub_structure_term_object in \
                                where_expressed.get('anatomicalSubStructureUberonSlimTermIds'):
                            sub_structure_uberon_term_id = \
                                    uberon_sub_structure_term_object.get('uberonTerm')
                            if sub_structure_uberon_term_id is not None \
                                    and sub_structure_uberon_term_id != 'Other':
                                sub_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": sub_structure_uberon_term_id
                                }
                                uberon_ao_data.append(
                                    sub_structure_uberon_term)
                            elif sub_structure_uberon_term_id is not None \
                                    and sub_structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    # Normalised only after the keys are built, so a missing
                    # term id contributes nothing to either key.
                    if cellular_component_term_id is None:
                        cellular_component_term_id = ""

                    if when_expressed_stage.get(
                            'stageUberonSlimTerm') is not None:
                        stage_uberon_term_object = when_expressed_stage.get(
                            'stageUberonSlimTerm')
                        stage_uberon_term_id = stage_uberon_term_object.get(
                            "uberonTerm")
                        # "post embryonic, pre-adult" is collected in its own
                        # list without a stage id.
                        if stage_uberon_term_id is not None \
                            and stage_uberon_term_id != "post embryonic, pre-adult":
                            stage_uberon = {
                                "uberonStageId": stage_uberon_term_id,
                                "ei_uuid": expression_unique_key
                            }
                            stage_uberon_data.append(stage_uberon)
                        if stage_uberon_term_id == "post embryonic, pre-adult":
                            stage_uberon_other = {
                                "ei_uuid": expression_unique_key
                            }
                            uberon_stage_other_data.append(stage_uberon_other)

                    if stage_term_id is None or stage_name == 'N/A':
                        stage_term_id = ""
                        stage_name = ""
                        stage_uberon_term_id = ""

                    if stage_name is not None:
                        stage = {
                            "stageTermId": stage_term_id,
                            "stageName": stage_name,
                            "ei_uuid": expression_unique_key
                        }
                        stage_list.append(stage)
                    else:
                        stage_uberon_term_id = ""

                    if 'crossReference' in xpat:
                        cross_ref = xpat.get('crossReference')
                        cross_ref_id = cross_ref.get('id')
                        local_cross_ref_id = cross_ref_id.split(":")[1]
                        prefix = cross_ref.get('id').split(":")[0]
                        pages = cross_ref.get('pages')

                        # some pages collection have 0 elements
                        if pages is not None and len(pages) > 0:
                            for page in pages:
                                # Only this one page type is turned into an
                                # xref row.
                                if page == 'gene/expression/annotation/detail':
                                    mod_global_cross_ref_id = \
                                        ETLHelper.get_page_complete_url(
                                            local_cross_ref_id,
                                            self.xref_url_map,
                                            prefix, page)

                                    xref = ETLHelper.get_xref_dict(
                                        local_cross_ref_id, prefix, page, page,
                                        cross_ref_id, mod_global_cross_ref_id,
                                        cross_ref_id + page)
                                    xref['ei_uuid'] = expression_unique_key
                                    cross_references.append(xref)

                    bio_entity = {
                        "ebe_uuid": expression_entity_unique_key,
                        "whereExpressedStatement": where_expressed_statement
                    }
                    bio_entities.append(bio_entity)

                    bio_join_entity = {
                        "ei_uuid": expression_unique_key,
                        "assay": assay
                    }
                    bio_join_entities.append(bio_join_entity)

                    bio_entity_gene_ao = {
                        "geneId": gene_id,
                        "ebe_uuid": expression_entity_unique_key,
                        "anatomicalStructureTermId":
                        anatomical_structure_term_id,
                        "ei_uuid": expression_unique_key
                    }
                    bio_entity_gene_aos.append(bio_entity_gene_ao)

                    pub = {
                        "ei_uuid": expression_unique_key,
                        "pubPrimaryKey": pub_med_id + publication_mod_id,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModId": publication_mod_id,
                        "pubModUrl": pub_mod_url
                    }
                    pubs.append(pub)

                    # Built while anatomical_structure_term_id may still be
                    # None; it is only normalised to "" further down.
                    ao_expression = {
                        "geneId": gene_id,
                        "whenExpressedStage": when_expressed_stage,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModId": publication_mod_id,
                        "pubModUrl": pub_mod_url,
                        "pubPrimaryKey": pub_med_id + publication_mod_id,
                        "uuid": str(uuid.uuid4()),
                        "assay": assay,
                        "anatomicalStructureTermId":
                        anatomical_structure_term_id,
                        "whereExpressedStatement": where_expressed_statement,
                        "ei_uuid": expression_unique_key,
                        "ebe_uuid": expression_entity_unique_key
                    }
                    ao_expressions.append(ao_expression)

                    if cellular_component_qualifier_term_id is not None:

                        cc_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "cellularComponentQualifierTermId":
                            cellular_component_qualifier_term_id
                        }
                        cc_qualifiers.append(cc_qualifier)

                    # A cellular-component-only row is recorded solely when
                    # the record has no anatomical structure term.
                    if anatomical_structure_term_id is None:
                        anatomical_structure_term_id = ""

                        cc_expression = {
                            "geneId": gene_id,
                            "whenExpressedStage": when_expressed_stage,
                            "pubMedId": pub_med_id,
                            "pubMedUrl": pub_med_url,
                            "pubModId": publication_mod_id,
                            "pubModUrl": pub_mod_url,
                            "pubPrimaryKey": pub_med_id + publication_mod_id,
                            "assay": assay,
                            "whereExpressedStatement":
                            where_expressed_statement,
                            "cellularComponentTermId":
                            cellular_component_term_id,
                            "ei_uuid": expression_unique_key,
                            "ebe_uuid": expression_entity_unique_key
                        }
                        cc_expressions.append(cc_expression)

                    if anatomical_structure_qualifier_term_id is not None:
                        ao_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalStructureQualifierTermId":
                            anatomical_structure_qualifier_term_id
                        }

                        ao_qualifiers.append(ao_qualifier)

                    if anatomical_sub_structure_term_id is not None:
                        ao_substructure = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureTermId":
                            anatomical_sub_structure_term_id
                        }

                        ao_substructures.append(ao_substructure)

                    if anatomical_sub_structure_qualifier_term_id is not None:
                        ao_ss_qualifier = {
                            "ebe_uuid":
                            expression_entity_unique_key,
                            "anatomicalSubStructureQualifierTermId":
                            anatomical_sub_structure_qualifier_term_id
                        }

                        ao_ss_qualifiers.append(ao_ss_qualifier)

                    # Combined row only when both an anatomical structure and
                    # a cellular component are present (both are "" when
                    # absent at this point).
                    if anatomical_structure_term_id is not None \
                            and anatomical_structure_term_id != "" \
                            and cellular_component_term_id is not None \
                            and cellular_component_term_id != "":

                        ao_cc_expression = {
                            "geneId": gene_id,
                            "whenExpressedStage": when_expressed_stage,
                            "pubMedId": pub_med_id,
                            "pubMedUrl": pub_med_url,
                            "pubModId": publication_mod_id,
                            "pubModUrl": pub_mod_url,
                            "pubPrimaryKey": pub_med_id + publication_mod_id,
                            "uuid": str(uuid.uuid4()),
                            "stageTermId": stage_term_id,
                            "stageName": stage_name,
                            "stageUberonTermId": stage_uberon_term_id,
                            "assay": assay,
                            "cellularComponentTermId":
                            cellular_component_term_id,
                            "anatomicalStructureTermId":
                            anatomical_structure_term_id,
                            "whereExpressedStatement":
                            where_expressed_statement,
                            "ei_uuid": expression_unique_key,
                            "ebe_uuid": expression_entity_unique_key
                        }

                        ao_cc_expressions.append(ao_cc_expression)

                if counter == batch_size:
                    yield [
                        bio_entities, bio_entity_gene_aos, bio_join_entities,
                        ao_expressions, cc_expressions, ao_cc_expressions,
                        ao_qualifiers, ao_substructures, ao_ss_qualifiers,
                        cc_qualifiers, stage_list, stage_uberon_data,
                        uberon_ao_data, uberon_ao_other_data,
                        uberon_stage_other_data, cross_references, pubs
                    ]
                    # Start fresh lists: the yielded ones now belong to the
                    # consumer.
                    bio_entities = []
                    bio_join_entities = []
                    ao_expressions = []
                    cc_expressions = []
                    ao_qualifiers = []
                    ao_substructures = []
                    ao_ss_qualifiers = []
                    cc_qualifiers = []
                    ao_cc_expressions = []
                    stage_list = []
                    uberon_stage_other_data = []
                    stage_uberon_data = []
                    uberon_ao_other_data = []
                    uberon_ao_data = []
                    cross_references = []
                    bio_entity_gene_aos = []
                    pubs = []
                    counter = 0

            # Flush whatever is left after the last full batch.
            if counter > 0:
                yield [
                    bio_entities, bio_entity_gene_aos, bio_join_entities,
                    ao_expressions, cc_expressions, ao_cc_expressions,
                    ao_qualifiers, ao_substructures, ao_ss_qualifiers,
                    cc_qualifiers, stage_list, stage_uberon_data,
                    uberon_ao_data, uberon_ao_other_data,
                    uberon_stage_other_data, cross_references, pubs
                ]
コード例 #2
0
    def get_generators(self, phenotype_data, batch_size):
        """Flatten phenotype annotations into rows and yield them in batches.

        Args:
            phenotype_data: parsed phenotype document.  ``metaData`` must
                provide ``dateProduced`` and ``dataProvider.crossReference``
                (with ``id`` and optional ``pages``); ``data`` is the list of
                phenotype records.
            batch_size: number of accepted records per yielded batch.  Records
                rejected by the test-data filter are not counted.

        Yields:
            A five-element list ``[phenotypes, phenotypes, phenotypes,
            pge_rows, pge_rows]``: the same phenotype list object three times
            followed by the same primary-genetic-entity list object twice.
            A final, possibly shorter, batch is yielded when records remain.
        """

        list_to_yield = []
        pge_list_to_yield = []
        date_produced = phenotype_data['metaData']['dateProduced']
        data_providers = []
        data_provider_object = phenotype_data['metaData']['dataProvider']
        counter = 0
        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        # Not read again in this method; kept so the ETLHelper calls that
        # build it still run exactly as before.
        data_provider_cross_ref_set = []

        load_key = date_produced + data_provider + "_phenotype"

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, ETL.xref_url_map, data_provider,
                    data_provider_page)

                data_provider_cross_ref_set.append(
                    ETLHelper.get_xref_dict(
                        data_provider,
                        data_provider,
                        data_provider_page,
                        data_provider_page,
                        data_provider,
                        cross_ref_complete_url,
                        data_provider + data_provider_page))

                # One entry per page, so the provider id repeats when the
                # provider lists several pages.
                data_providers.append(data_provider)
                self.logger.debug("data provider: %s", data_provider)

        for pheno in phenotype_data['data']:
            primary_id = pheno.get('objectId')

            # Records outside the test set are dropped before they are
            # counted towards the batch.
            if self.test_object.using_test_data() is True:
                if self.test_object.check_for_test_id_entry(primary_id) is False:
                    continue

            counter += 1
            pecj_primary_key = str(uuid.uuid4())
            phenotype_statement = pheno.get('phenotypeStatement')

            pub_med_id = ""
            pub_mod_id = ""
            pub_med_url = None
            pub_mod_url = None

            evidence = pheno.get('evidence')

            if 'publicationId' in evidence:
                publication_id = evidence.get('publicationId')
                if publication_id.startswith('PMID:'):
                    # PubMed publication; the MOD id, if any, comes from the
                    # evidence cross reference.
                    pub_med_id = publication_id
                    local_pub_med_id = pub_med_id.split(":")[1]
                    pub_med_prefix = pub_med_id.split(":")[0]
                    pub_med_url = ETLHelper.get_no_page_complete_url(
                        local_pub_med_id, self.xref_url_map, pub_med_prefix,
                        primary_id)

                    if 'crossReference' in evidence:
                        pub_mod_id = evidence.get('crossReference').get('id')
                        # Split only once the id is known to exist; a cross
                        # reference without an id used to raise here.
                        if pub_mod_id is not None:
                            pub_mod_local_id = pub_mod_id.split(":")[1]
                            pub_mod_url = ETLHelper.get_complete_pub_url(
                                pub_mod_local_id, pub_mod_id)
                else:
                    # Any non-PMID publication id is used as the MOD id.
                    pub_mod_id = publication_id
                    pub_mod_local_id = pub_mod_id.split(":")[1]
                    pub_mod_url = ETLHelper.get_complete_pub_url(
                        pub_mod_local_id, pub_mod_id)

            # Both ids are concatenated into pubPrimaryKey below.
            if pub_mod_id is None:
                pub_mod_id = ""

            date_assigned = pheno.get('dateAssigned')

            # Ids are empty strings (never None) at this point, so test for
            # emptiness; the previous None test could never be true.
            if not pub_mod_id and not pub_med_id:
                self.logger.info("%s is missing pubMed and pubMod id",
                                 primary_id)

            if 'primaryGeneticEntityIDs' in pheno:
                for pge in pheno.get('primaryGeneticEntityIDs'):
                    pge_list_to_yield.append({
                        "pecjPrimaryKey": pecj_primary_key,
                        "pgeId": pge
                    })

            stripped_statement = phenotype_statement.strip()
            phenotype = {
                "primaryId": primary_id,
                "phenotypeUniqueKey": primary_id + stripped_statement,
                "phenotypeStatement": stripped_statement,
                "dateAssigned": date_assigned,
                "loadKey": load_key,
                "type": "gene",
                "dataProviders": data_providers,
                "dataProvider": data_provider,
                "dateProduced": date_produced,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModId": pub_mod_id,
                "pubModUrl": pub_mod_url,
                "pubPrimaryKey": pub_med_id + pub_mod_id,
                "pecjPrimaryKey": pecj_primary_key
            }

            list_to_yield.append(phenotype)

            if counter == batch_size:
                yield [
                    list_to_yield, list_to_yield, list_to_yield,
                    pge_list_to_yield, pge_list_to_yield
                ]
                # Start fresh lists: the yielded ones now belong to the
                # consumer.
                list_to_yield = []
                pge_list_to_yield = []
                counter = 0

        # Flush whatever is left after the last full batch.
        if counter > 0:
            yield [
                list_to_yield, list_to_yield, list_to_yield, pge_list_to_yield,
                pge_list_to_yield
            ]
コード例 #3
0
    def get_generators(self, variant_data, batch_size):
        """Flatten variant records into rows and yield them in batches.

        For every record the reference sequence and up to ``padding_width``
        bases of flanking sequence are fetched from an
        ``AssemblySequenceHelper`` (one helper per assembly, created on first
        use), an HGVS name is obtained from ``self.get_hgvs_nomenclature``
        and the record is split into variant, genomic-location, SO-term and
        cross-reference rows.

        Args:
            variant_data: parsed variant document.  ``metaData`` must provide
                ``dateProduced`` and ``dataProvider.crossReference`` (with
                ``id`` and optional ``pages``) and may provide ``release``;
                ``data`` is the list of variant records.
            batch_size: number of accepted records per yielded batch.  Records
                rejected by the test-data filter are not counted.

        Yields:
            ``[variants, variant_genomic_locations, variant_so_terms,
            cross_references]`` for each batch.  The cross references of
            every record in the batch are included, and the record counter
            restarts after each batch so that all batches respect
            ``batch_size``.  A final, possibly shorter, batch is yielded when
            records remain.
        """

        data_providers = []
        release = ""
        variants = []
        variant_genomic_locations = []
        variant_so_terms = []
        cross_references = []
        counter = 0
        # Number of flanking bases requested on each side of the variant.
        padding_width = 500
        date_produced = variant_data['metaData']['dateProduced']

        data_provider_object = variant_data['metaData']['dataProvider']

        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        # Not read again in this method; kept so the ETLHelper calls that
        # build it still run exactly as before.
        data_provider_cross_ref_set = []

        load_key = date_produced + data_provider + "_VARIATION"

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, self.xref_url_map, data_provider,
                    data_provider_page)

                data_provider_cross_ref_set.append(
                    ETLHelper.get_xref_dict(
                        data_provider, data_provider, data_provider_page,
                        data_provider_page, data_provider,
                        cross_ref_complete_url,
                        data_provider + data_provider_page))

                # One entry per page, so the provider id repeats when the
                # provider lists several pages.
                data_providers.append(data_provider)
                self.logger.debug("data provider: %s", data_provider)

        if 'release' in variant_data['metaData']:
            release = variant_data['metaData']['release']

        # Sequence helpers keyed by assembly name, created on first use.
        assemblies = {}
        for allele_record in variant_data['data']:
            chromosome = allele_record["chromosome"]
            # Drop a leading "chr" from the chromosome name.
            if chromosome.startswith("chr"):
                chromosome_str = chromosome[3:]
            else:
                chromosome_str = chromosome

            assembly = allele_record["assembly"]

            if assembly not in assemblies:
                self.logger.info(assembly)
                context_info = ContextInfo()
                data_manager = DataFileManager(
                    context_info.config_file_location)
                assemblies[assembly] = AssemblySequenceHelper(
                    assembly, data_manager)

            so_term_id = allele_record.get('type')
            genomic_reference_sequence = allele_record.get(
                'genomicReferenceSequence')
            genomic_variant_sequence = allele_record.get(
                'genomicVariantSequence')

            if genomic_reference_sequence == 'N/A':
                genomic_reference_sequence = ""
            if genomic_variant_sequence == 'N/A':
                genomic_variant_sequence = ""

            padding_left = ""
            padding_right = ""
            if allele_record.get('start') != "" and allele_record.get(
                    'end') != "":

                # not insertion
                if so_term_id != "SO:0000667" \
                        and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                    # The submitted reference sequence is replaced by the
                    # one read from the assembly.
                    genomic_reference_sequence = assemblies[
                        assembly].get_sequence(chromosome_str,
                                               allele_record.get('start'),
                                               allele_record.get('end'))

                # Order the coordinates so that start <= end.
                if allele_record.get('start') < allele_record.get('end'):
                    start = allele_record.get('start')
                    end = allele_record.get('end')
                else:
                    start = allele_record.get('end')
                    end = allele_record.get('start')

                if so_term_id != "SO:0000667":  # not insertion
                    start = start - 1
                    end = end + 1

                # Clamp the left flank at position 1.
                left_padding_start = start - padding_width
                if left_padding_start < 1:
                    left_padding_start = 1

                padding_left = assemblies[assembly].get_sequence(
                    chromosome_str, left_padding_start, start)
                right_padding_end = end + padding_width
                padding_right = assemblies[assembly].get_sequence(
                    chromosome_str, end, right_padding_end)
            counter = counter + 1
            global_id = allele_record.get('alleleId')
            mod_global_cross_ref_id = ""

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    # Filtered records must not count towards the batch.
                    counter = counter - 1
                    continue

            cross_ref_primary_id = allele_record.get(
                'sequenceOfReferenceAccessionNumber')
            # Build the xref only when an accession is present; splitting
            # before this check used to raise on a missing accession.
            if cross_ref_primary_id is not None:
                local_cross_ref_id = cross_ref_primary_id.split(":")[1]
                prefix = cross_ref_primary_id.split(":")[0]

                cross_ref_complete_url = ETLHelper.get_no_page_complete_url(
                    local_cross_ref_id, ETL.xref_url_map, prefix, global_id)
                xref_map = ETLHelper.get_xref_dict(
                    local_cross_ref_id, prefix,
                    "variant_sequence_of_reference",
                    "sequence_of_reference_accession_number", global_id,
                    cross_ref_complete_url,
                    cross_ref_primary_id + "variant_sequence_of_reference")

                xref_map['dataId'] = global_id
                # Accumulated per batch (and reset only after a yield) so no
                # record's cross reference is dropped.
                cross_references.append(xref_map)

            # Debug-only reporting of sequences over 1000 characters for
            # these two SO types.
            if genomic_reference_sequence is not None:
                if len(genomic_reference_sequence) > 1000 and (
                        so_term_id in ['SO:1000002', 'SO:1000008']):
                    self.logger.debug("%s genomicReferenceSequence",
                                      allele_record.get('alleleId'))

            if genomic_variant_sequence is not None:
                if len(genomic_variant_sequence) > 1000 and (
                        so_term_id in ['SO:1000002', 'SO:1000008']):
                    self.logger.debug("%s genomicVariantSequence",
                                      allele_record.get('alleleId'))

            hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
                allele_record.get('sequenceOfReferenceAccessionNumber'),
                allele_record.get('type'), allele_record.get('start'),
                allele_record.get('end'),
                genomic_reference_sequence, genomic_variant_sequence,
                allele_record.get('assembly'), chromosome_str)

            if (genomic_reference_sequence is not None
                    and len(genomic_reference_sequence) > 30000) \
                    or (genomic_variant_sequence is not None
                        and len(genomic_variant_sequence) > 30000):
                self.logger.debug(
                    "%s has too long of a sequence potentionally",
                    allele_record.get('alleleId'))

            # TODO: fix typo in MGI Submission for this variant so
            # that it doesn't list a 40K bp point mutation.
            if allele_record.get('alleleId') != 'MGI:6113870':

                variant_dataset = {
                    "hgvs_nomenclature": hgvs_nomenclature,
                    "genomicReferenceSequence": genomic_reference_sequence,
                    "genomicVariantSequence": genomic_variant_sequence,
                    "paddingLeft": padding_left,
                    "paddingRight": padding_right,
                    "alleleId": allele_record.get('alleleId'),
                    "dataProviders": data_providers,
                    "dateProduced": date_produced,
                    "loadKey": load_key,
                    "release": release,
                    "modGlobalCrossRefId": mod_global_cross_ref_id,
                    "dataProvider": data_provider,
                    "variantHGVSSynonym": hgvs_synonym
                }

                variant_genomic_location_dataset = {
                    "variantId": hgvs_nomenclature,
                    "assembly": allele_record.get('assembly'),
                    "chromosome": chromosome_str,
                    "start": allele_record.get('start'),
                    "end": allele_record.get('end'),
                    "uuid": str(uuid.uuid4()),
                    "dataProvider": data_provider
                }

                variant_so_term = {
                    "variantId": hgvs_nomenclature,
                    "soTermId": allele_record.get('type')
                }

                variant_so_terms.append(variant_so_term)
                variant_genomic_locations.append(
                    variant_genomic_location_dataset)
                variants.append(variant_dataset)

            if counter == batch_size:
                yield [
                    variants, variant_genomic_locations, variant_so_terms,
                    cross_references
                ]
                # Start fresh lists: the yielded ones now belong to the
                # consumer.
                variants = []
                variant_genomic_locations = []
                variant_so_terms = []
                cross_references = []
                # Restart the count; without this only the first batch
                # would ever be cut at batch_size.
                counter = 0

        # Flush whatever is left after the last full batch.
        if counter > 0:
            yield [
                variants, variant_genomic_locations, variant_so_terms,
                cross_references
            ]