Example #1
def test_sequences(test_data_dir, tmp_path, gbk, flavor):
    """Parse a genbank, write it to disk, then parse it again and compare."""
    gbk = test_data_dir / gbk
    with open(gbk, "r") as fh:
        collections = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_genbank(fh)))

    tmp_gbk = tmp_path / "tmp.gbk"
    with open(tmp_gbk, "w") as fh:
        collection_to_genbank(collections, fh, flavor)

    with open(tmp_gbk, "r") as fh:
        new_collection = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_genbank(fh)))

    assert len(collections[0].genes) == len(new_collection[0].genes)

    for gene_a, gene_b in zip(collections[0], new_collection[0]):
        assert gene_a._location == gene_b._location
        tx_a = gene_a.transcripts[0]
        tx_b = gene_b.transcripts[0]
        assert tx_a._location == tx_b._location
        if tx_a.is_coding:
            assert tx_a.cds._location == tx_b.cds._location
            assert tx_a.get_protein_sequence() == tx_b.get_protein_sequence()
        assert tx_a.get_transcript_sequence() == tx_b.get_transcript_sequence()
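test_sequences (and several later tests such as test_tbl_export_from_gff3 and test_tbl_export_from_genbank_prokaryotic) receives arguments like gbk and flavor from a pytest.mark.parametrize decorator that this excerpt omits. A minimal, hypothetical sketch of such a decorator is shown below; the file pairings and the GenbankFlavor.EUKARYOTIC value are assumptions for illustration, not the project's actual fixtures.

import pytest

# Hypothetical parametrization -- the real test suite defines its own cases.
@pytest.mark.parametrize(
    "gbk,flavor",
    [
        ("INSC1006_chrI.gbff", GenbankFlavor.EUKARYOTIC),  # EUKARYOTIC flavor assumed
        ("INSC1003.gbk", GenbankFlavor.PROKARYOTIC),
    ],
)
def test_sequences(test_data_dir, tmp_path, gbk, flavor):
    ...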
Example #2
def test_missing_translation(test_data_dir, tmp_path):
    """Export with and without forcing translation recalculation and check the translation qualifiers."""
    gbk = test_data_dir / "INSC1003_wrong_missing_translation.gbk"
    with open(gbk, "r") as fh:
        collections = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_genbank(fh, gbk_type=GenBankParserType.SORTED)))

    tmp_gbk = tmp_path / "tmp.gbk"
    with open(tmp_gbk, "w") as fh:
        collection_to_genbank(collections, fh, update_translations=False)

    with open(tmp_gbk, "r") as fh:
        annot = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_genbank(fh, gbk_type=GenBankParserType.SORTED)))[0]
        genes = annot.genes
        assert "translation" not in genes[0].transcripts[0].qualifiers
        assert "translation" in genes[1].transcripts[0].qualifiers
        assert genes[1].transcripts[0].qualifiers["translation"] == {
            "MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGK"
            "QIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEE"
            "NDIPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPDTAQRVADWLGKNYLQNQEGF"
            "VHICRLDTAGARVLEN"
        }

    # now export to file and force the translations to be recalculated
    tmp_gbk = tmp_path / "tmp.gbk"
    with open(tmp_gbk, "w") as fh:
        collection_to_genbank(collections, fh, update_translations=True)

    with open(tmp_gbk, "r") as fh:
        annot = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_genbank(fh, gbk_type=GenBankParserType.SORTED)))[0]
        genes = annot.genes
        assert genes[0].transcripts[0].qualifiers["translation"] == {
            "MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERIFAELLTGLAAA"
            "QPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEARGHNVTVIDPVEKLLAVGHYLE"
            "STVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDAR"
            "LLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMK"
            "GMVGMAARVFAAMSRARISVVLITQSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRT"
            "LRGISAKFFAALARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSWL"
            "KNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAVADQYADFLREGFHV"
            "VTPNKKANTSSMDYYHLLRHAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELMKFSGILSGSLSYIFGKLDEGMSFSEATT"
            "LAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIEIEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVL"
            "RYVGNIDEDGACRVKIAEVDGNDPLFKVKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGV*"
        }
        assert genes[1].transcripts[0].qualifiers["translation"] == {
            "MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGK"
            "QIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEE"
            "NDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGFIHACYSRQPELAAKLMKDVIAEPYRE"
            "RLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPDTAQRVADWLGKNYLQNQEGFVHICRLDTAGARVLEN*"
        }
Example #3
    def test_parse_peg10(self, test_data_dir):
        """PEG10 is a human gene with a -1 frameshift"""
        gff3 = test_data_dir / "PEG10_offset_gff3_fasta.gff3"
        gff3_rec = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_gff3_embedded_fasta(gff3)))[0]
        tx = gff3_rec.genes[0].transcripts[0]
        assert not tx.has_in_frame_stop
Example #4
    def test_broken_frameshift(self, test_data_dir):
        """If I merge the transcript, the frames list no longer matches the location and an exception is raised."""
        gbk = test_data_dir / "insO_frameshift.gbk"
        with open(gbk, "r") as fh:
            gbk_rec = list(
                ParsedAnnotationRecord.parsed_annotation_records_to_model(
                    parse_genbank(fh)))[0]

        cds = gbk_rec.genes[0].get_primary_transcript().cds
        cds._location = cds._location.merge_overlapping()
        with pytest.raises(MismatchedFrameException):
            _ = cds.translate()
Example #5
    def test_parse_inso(self, test_data_dir):
        """This proves we handle frame and phase"""
        gbk = test_data_dir / "insO_frameshift.gbk"
        gff3 = test_data_dir / "insO_frameshift.gff3"

        with open(gbk, "r") as fh:
            gbk_rec = list(
                ParsedAnnotationRecord.parsed_annotation_records_to_model(
                    parse_genbank(fh)))[0]

        gff3_rec = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_gff3_embedded_fasta(gff3)))[0]

        expected_protein = (
            "MKKRNFSAEFKRESAQLVVDQKYTVADAAKAMDVGLSTMTRWVKQLRDERQGKTPKASPITPEQIEIRKLRKKLQRIEMENEILKKNRP"
            "EKPDGRRAVLRSQVLELHGISHGSAGARSIATMATRRGYQMGRWLAGRLMKELGLVSCQQPTHRYKRGGHEHVAIPNYLERQFAVTEPNQV"
            "WCGDVTYIWTGKRWAYLAVVLDLFARKPVGWAMSFSPDSRLTMKALEMAWETRGKPVGVMFQSDQGSHYTSRQFRQLLWRYRIRQSMSRR"
            "GNCWDNSPMERFFRSLKNEWVPATGYVSFSDAAHAITDYIVGYYSALRPHEYNGGLPPNESENRYWKNSNAEASFS*"
        )
        assert (str(gbk_rec.genes[0].get_primary_protein()) == str(
            gff3_rec.genes[0].get_primary_protein()) == expected_protein)
Example #6
def test_tbl_export_from_gff3(test_data_dir, tmp_path, gff3, expected_tbl):
    """Export a GFF3-derived collection to TBL format and compare against the expected file."""
    gff3 = test_data_dir / gff3
    recs = list(
        ParsedAnnotationRecord.parsed_annotation_records_to_model(
            parse_gff3_embedded_fasta(gff3)))
    tmp = tmp_path / "tmp.tbl"
    with open(tmp, "w") as fh:
        collection_to_tbl(recs,
                          fh,
                          locus_tag_prefix="test",
                          submitter_lab_name="inscripta",
                          random_seed=123)
    with open(tmp) as fh1, open(test_data_dir / expected_tbl) as fh2:
        assert fh1.read() == fh2.read()
Example #7
    def test_genbank_to_gff(self, test_data_dir, tmp_path, gbk, gff3, add_sequences):
        """
        INSC1006_chrI.gff3 and INSC1003.gff3 were created from INSC1006_chrI.gbff and INSC1003.gbk
        respectively, so we can compare to the source file.
        """
        gbk = test_data_dir / gbk
        with open(gbk, "r") as fh:
            parsed = list(ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(fh)))

        tmp_gff = tmp_path / "tmp.gff"
        with open(tmp_gff, "w") as fh:
            collection_to_gff3(parsed, fh, add_sequences=add_sequences)

        with open(tmp_gff) as fh1, open(test_data_dir / gff3) as fh2:
            for line1, line2 in zip(fh1, fh2):
                assert line1 == line2
Example #8
def test_collection_to_fasta_from_genbank(test_data_dir, tmp_path):
    """This FASTA export matches exactly because there are no FASTA comments."""
    gbk = test_data_dir / "INSC1006_chrI.gbff"
    with open(gbk, "r") as fh:
        parsed = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_genbank(fh)))

    tmp_fasta = tmp_path / "tmp.fasta"
    with open(tmp_fasta, "w") as fh:
        collection_to_fasta(parsed, fh)

    with open(tmp_fasta, "r") as fh1, open(test_data_dir / "INSC1006_chrI.fa",
                                           "r") as fh2:
        assert fh1.read() == fh2.read()
Example #9
def test_tbl_export_from_genbank_prokaryotic(test_data_dir, tmp_path, genbank,
                                             expected_tbl):
    """Export a prokaryotic GenBank-derived collection to TBL format and compare against the expected file."""
    genbank = test_data_dir / genbank
    recs = list(
        ParsedAnnotationRecord.parsed_annotation_records_to_model(
            parse_genbank(genbank)))
    tmp = tmp_path / "tmp.tbl"
    with open(tmp, "w") as fh:
        collection_to_tbl(
            recs,
            fh,
            locus_tag_prefix="test",
            submitter_lab_name="inscripta",
            genbank_flavor=GenbankFlavor.PROKARYOTIC,
            random_seed=123,
        )
    with open(tmp) as fh1, open(test_data_dir / expected_tbl) as fh2:
        assert fh1.read() == fh2.read()
Example #10
def _produce_empty_records(
    seqrecords_dict: Dict[str, SeqRecord], seen_seqs: Set[str]
) -> Iterable[ParsedAnnotationRecord]:
    """
    Convenience function shared by :func:`parse_gff3_embedded_fasta` and :func:`parse_gff3_fasta` that appends
    empty ``ParsedAnnotationRecord`` objects to the end. This ensures that every sequence in the FASTA is still
    represented in the final object set, even if it has zero annotations.

    Args:
        seqrecords_dict: Dictionary mapping sequence names to SeqRecord objects.
        seen_seqs: Set of sequences that were found when parsing the GFF3.

    Yields:
        Iterable of ``ParsedAnnotationRecord`` objects with empty annotations.
    """
    for sequence_name in seqrecords_dict.keys() - seen_seqs:
        seqrecord = seqrecords_dict[sequence_name]
        annot = AnnotationCollectionModel.Schema().load(dict(sequence_name=seqrecord.id, start=0, end=len(seqrecord)))
        yield ParsedAnnotationRecord(annotation=annot, seqrecord=seqrecord)
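A rough, self-contained sketch of what this helper does, using made-up SeqRecord data and assuming the loaded AnnotationCollectionModel exposes start and end as attributes: two sequences exist in the FASTA, only one was seen while parsing the GFF3, so the other comes back as an empty ParsedAnnotationRecord.

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Hypothetical inputs for illustration only.
seqrecords_dict = {
    "chrI": SeqRecord(Seq("ATGCATGCAT"), id="chrI"),
    "chrII": SeqRecord(Seq("GGCCGGCCGG"), id="chrII"),
}
seen_seqs = {"chrI"}  # chrI had annotations in the GFF3; chrII did not

for rec in _produce_empty_records(seqrecords_dict, seen_seqs):
    # only chrII is yielded, with an empty annotation spanning the whole sequence
    print(rec.seqrecord.id, rec.annotation.start, rec.annotation.end)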
Example #11
def test_collection_to_fasta_from_genbank_fasta_header(test_data_dir,
                                                       tmp_path):
    """INSC1003.fa has FASTA comments, and so the sequence will match but the comments will be lost."""
    gbk = test_data_dir / "INSC1003.gbk"
    with open(gbk, "r") as fh:
        parsed = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_genbank(fh)))

    tmp_fasta = tmp_path / "tmp.fasta"
    with open(tmp_fasta, "w") as fh:
        collection_to_fasta(parsed, fh)

    with open(tmp_fasta, "r") as fh1, open(test_data_dir / "INSC1003.fa",
                                           "r") as fh2:
        f1 = fh1.readlines()
        f2 = fh2.readlines()
        assert f1[1:] == f2[1:]
        assert f1[0] != f2[0]
        assert f1[0].split()[0] == f2[0].split()[0]
Example #12
def parse_standard_gff3(
    gff: Path,
    gffutil_parse_args: Optional[GffutilsParseArgs] = GffutilsParseArgs(),
    parse_func: Optional[Callable[[FeatureDB, List[str]], Iterable[AnnotationCollectionModel]]] = default_parse_func,
    gffutil_transform_func: Optional[Callable[[Feature], Feature]] = None,
    db_fn: Optional[str] = ":memory:",
) -> Iterable[ParsedAnnotationRecord]:
    """Parses a GFF3 file using gffutils.

    The parameters ``parse_func`` and ``gffutil_parse_args`` are implemented separately for each data source. A
    default implementation exists in this module.

    Args:
        gff: Path to a GFF3 file. Must be local or HTTPS.
        gffutil_parse_args: Parsing arguments to pass to gffutils.
        parse_func: Function that converts the gffutils representation to the BioCantor representation.
        gffutil_transform_func: Function that transforms feature keys. Can be necessary in cases where IDs are not
            unique.
        db_fn: Location to write the gffutils database. Defaults to ``:memory:``, which means the database will be
            built transiently. This value can be set to a file location if memory is a concern, or if you want to
            retain the gffutils database; it will not be cleaned up.

    Yields:
        Iterable of ``ParsedAnnotationRecord`` objects.
    """
    db = gffutils.create_db(str(gff), db_fn, transform=gffutil_transform_func, **gffutil_parse_args.__dict__)
    if sum(db.count_features_of_type(i) for i in db.featuretypes()) == 0:
        raise EmptyGFF3Exception("Parsing this GFF3 led to zero features. Is it empty or corrupted?")
    logger.info(f"Parsed {gff}")
    for i in db.featuretypes():
        logger.info(f"Found feature type {i} with {db.count_features_of_type(i)} features")
    # get the sequences
    chrom_query = db.execute("SELECT DISTINCT seqid FROM features")
    chroms = [x["seqid"] for x in chrom_query]
    logger.info(f"Found {len(chroms)} sequences")
    for annot in parse_func(db, chroms):
        yield ParsedAnnotationRecord(annot)
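A minimal usage sketch, assuming a local GFF3 at the hypothetical path my_annotations.gff3 and leaving the default parse_func, gffutil_parse_args and db_fn in place; each yielded ParsedAnnotationRecord carries no sequence, since a plain GFF3 has none.

from pathlib import Path

gff = Path("my_annotations.gff3")  # hypothetical input file

for parsed in parse_standard_gff3(gff):
    annot = parsed.annotation
    # genes / feature_collections may be None when a sequence has no such records
    print(annot.sequence_name, len(annot.genes or []), len(annot.feature_collections or []))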
Example #13
def group_gene_records_by_locus_tag(
    record_iter: Iterator[SeqRecord],
    parse_func: Callable[[GeneFeature], Dict[str, Any]],
    feature_parse_func: Callable[[FeatureIntervalGenBankCollection],
                                 Dict[str, Any]],
    genbank_parser_type: GenBankParserType = GenBankParserType.LOCUS_TAG,
) -> Iterator[ParsedAnnotationRecord]:
    """Model 2: ``locus_tag`` defined GenBank.

    All features that fit within the hierarchical structure, possess a ``locus_tag``, and have a feature type that is
    valid for a known transcribed interval type will be included in the gene parsing.

    All other feature types will become generic features (FeatureIntervals), unless we are in hybrid mode.

    In hybrid mode, the ``locus_tag`` grouping is applied first, and all remaining features are then sent to the
    sorted parser.

    Args:
        record_iter: Iterator of SeqRecord objects.
        parse_func: Optional parse function implementation.
        feature_parse_func: Optional feature interval parse function implementation.
        genbank_parser_type: Optional parser type. Changing this to GenBankParserType.HYBRID
            will enable hybrid parsing mode.

    Yields:
        :class:`ParsedAnnotationRecord`.
    """
    if genbank_parser_type not in [
            GenBankParserType.LOCUS_TAG, GenBankParserType.HYBRID
    ]:
        raise GenBankParserError("Must use either locus_tag or hybrid")

    tot_genes = 0
    tot_features = 0
    for seqrecord in record_iter:
        gene_filtered_features = []
        remaining_features = []
        source = None
        for f in seqrecord.features:
            if f.type in GENBANK_GENE_FEATURES and KnownQualifiers.LOCUS_TAG.value in f.qualifiers:
                gene_filtered_features.append(f)
            elif f.type == MetadataFeatures.SOURCE.value:
                source = f
            else:
                remaining_features.append(f)

        sorted_gene_filtered_features = sorted(
            gene_filtered_features,
            key=lambda f: f.qualifiers[KnownQualifiers.LOCUS_TAG.value])

        genes = []
        for locus_tag, gene_features in itertools.groupby(
                sorted_gene_filtered_features,
                key=lambda f: f.qualifiers[KnownQualifiers.LOCUS_TAG.value][0]):
            # sort the features for this locus tag to bubble the "gene" feature to the top, if it exists
            gene_features = sorted(
                gene_features, key=lambda f: f.type != GeneFeatures.GENE.value)

            # do we have more than one gene with this locus_tag?
            if len(gene_features) > 1 and gene_features[1].type == GeneFeatures.GENE.value:
                raise GenBankLocusTagError(
                    f"Grouping by locus tag found multiple gene features with the same locus tag:"
                    f"\n{gene_features[0]}\n{gene_features[1]}")

            gene_feature = gene_features[0]
            if gene_feature.type == GeneFeatures.GENE.value:
                gene = _construct_gene_from_feature(gene_feature, seqrecord,
                                                    GeneFeature)
            else:
                gene = _construct_gene_from_feature(
                    gene_feature, seqrecord,
                    GeneFeature.from_transcript_or_cds_feature)
            # gene is None if it was not parseable
            if not gene:
                continue

            for feature in gene_features[1:]:
                if feature.type in TranscriptFeature.types:
                    gene.add_child(feature)
                elif feature.type in IntervalFeature.types:
                    if len(gene.children) == 0:
                        gene.add_child(feature)
                    else:
                        gene.children[-1].add_child(feature)

            if gene.has_children:
                gene.finalize()
                gene = parse_func(gene)
                genes.append(gene)

        if source is not None:
            source_qualifiers = source.qualifiers
        else:
            source_qualifiers = None

        if genbank_parser_type == GenBankParserType.LOCUS_TAG:
            feature_collections = _extract_generic_features(
                seqrecord, remaining_features, feature_parse_func)
        else:
            # hybrid parsing mode
            tmp_seqrecord = deepcopy(seqrecord)
            tmp_seqrecord.features = remaining_features
            tmp_annotation = next(
                group_gene_records_from_sorted_genbank(
                    (tmp_seqrecord, ), parse_func, feature_parse_func))
            if tmp_annotation.annotation.feature_collections:
                feature_collections = [
                    FeatureIntervalCollectionModel.Schema().dump(x)
                    for x in tmp_annotation.annotation.feature_collections
                ]
            else:
                feature_collections = None
            if tmp_annotation.annotation.genes:
                genes.extend([
                    GeneIntervalModel.Schema().dump(x)
                    for x in tmp_annotation.annotation.genes
                ])

        tot_features += len(feature_collections) if feature_collections else 0
        tot_genes += len(genes) if genes else 0

        annotation = AnnotationCollectionModel.Schema().load(
            dict(
                genes=genes,
                feature_collections=feature_collections,
                name=seqrecord.id,
                sequence_name=seqrecord.id,
                start=0,
                end=len(seqrecord),
                qualifiers=source_qualifiers,
            ))
        yield ParsedAnnotationRecord(annotation=annotation,
                                     seqrecord=seqrecord)

    if tot_genes + tot_features == 0:
        raise EmptyGenBankError(
            "GenBank parsing produced zero genes and zero features.")
Example #14
def group_gene_records_from_sorted_genbank(
    record_iter: Iterator[SeqRecord],
    parse_func: Callable[[GeneFeature], Dict[str, Any]],
    feature_parse_func: Callable[[FeatureIntervalGenBankCollection],
                                 Dict[str, Any]],
) -> Iterator[ParsedAnnotationRecord]:
    """Model 1: position sorted GenBank.

    This function looks for canonical gene records:
        gene -> Optional(mRNA) -> CDS records
    It also looks for canonical non-coding records:
        gene -> {misc_RNA, tRNA, rRNA, etc.}

    It will also infer non-canonical record types, including non-coding transcripts and coding genes
    from isolated CDS/non-coding features (those without a gene feature before them in the sort order).

    Any features that do not fit the above bins are interpreted as generic features.

    Some GenBank files are improperly ordered, and will have things like the CDS feature first, or the mRNA feature
    first. To try to capture this, the full set of records is sorted first by position, then in the order:

    gene
    mRNA
    CDS
    exon
    anything else

    Args:
        record_iter: Iterator of SeqRecord objects.
        parse_func: Optional parse function implementation.
        feature_parse_func: Optional feature interval parse function implementation.

    Yields:
        :class:`ParsedAnnotationRecord`.
    """
    tot_genes = 0
    tot_features = 0
    for seqrecord in record_iter:
        gene = None
        source = None
        genes = []
        # capture non-gene intervals downstream
        feature_features = []

        # sort features to try to capture weirdly ordered genbank files
        sorted_features = sorted(
            seqrecord.features,
            key=lambda x: (
                x.location.nofuzzy_start,
                x.type != GeneFeatures.GENE.value,
                x.type != TranscriptFeatures.CODING_TRANSCRIPT.value,
                x.type != GeneIntervalFeatures.CDS.value,
                x.type != GeneIntervalFeatures.EXON.value,
            ),
        )
        for feature in sorted_features:
            # try to capture the Source field, if it exists
            if feature.type == MetadataFeatures.SOURCE.value:
                source = feature
            # base case for start; iterate until we find a gene
            elif gene is None:
                if feature.type in GeneFeature.types:
                    gene = _construct_gene_from_feature(
                        feature, seqrecord, GeneFeature)
                    # gene is None if it was not parseable
                    if not gene:
                        continue
                # base case for starting with an isolated ncRNA or CDS feature; immediately add it
                # and reset the gene to None
                elif feature.type in TranscriptFeature.types or feature.type in IntervalFeature.types:
                    gene = _construct_gene_from_feature(
                        feature, seqrecord,
                        GeneFeature.from_transcript_or_cds_feature)
                    # gene is None if it was not parseable
                    if gene:
                        gene.finalize()
                        gene = parse_func(gene)
                        genes.append(gene)
                        gene = None
                # this must be a generic feature
                else:
                    feature_features.append(feature)
            # next gene; re-set the gene object and report out the collection
            elif feature.type in GeneFeature.types:
                if gene.has_children:
                    gene.finalize()
                    gene = parse_func(gene)
                    genes.append(gene)
                gene = _construct_gene_from_feature(feature, seqrecord,
                                                    GeneFeature)
                if not gene:
                    continue
            elif feature.type in TranscriptFeature.types:
                # if the current gene is non-empty, and the feature is not an mRNA, then this is an isolated ncRNA;
                # finish this gene and start a new one
                if feature.type != TranscriptFeatures.CODING_TRANSCRIPT.value and gene.has_children:
                    gene.finalize()
                    gene = parse_func(gene)
                    genes.append(gene)
                    gene = _construct_gene_from_feature(
                        feature, seqrecord,
                        GeneFeature.from_transcript_or_cds_feature)
                    # gene is None if it was not parseable
                    if not gene:
                        continue
                else:
                    gene.add_child(feature)
            elif feature.type in IntervalFeature.types:
                if not gene.has_children:
                    gene.add_child(feature)
                else:
                    gene.children[-1].add_child(feature)
            else:
                feature_features.append(feature)

        # gene could be None if this record has no annotations
        if gene is not None and gene.has_children:
            gene.finalize()
            gene = parse_func(gene)
            genes.append(gene)

        if source is not None:
            source_qualifiers = source.qualifiers
        else:
            source_qualifiers = None

        feature_collections = _extract_generic_features(
            seqrecord, feature_features, feature_parse_func)

        tot_features += len(feature_collections) if feature_collections else 0
        tot_genes += len(genes) if genes else 0

        annotation = AnnotationCollectionModel.Schema().load(
            dict(
                genes=genes,
                feature_collections=feature_collections,
                sequence_name=seqrecord.id,
                start=0,
                end=len(seqrecord),
                qualifiers=source_qualifiers,
            ))
        yield ParsedAnnotationRecord(annotation=annotation,
                                     seqrecord=seqrecord)

    if tot_genes + tot_features == 0:
        raise EmptyGenBankError(
            "GenBank parsing produced zero genes and zero features.")