Esempio n. 1
0
def get_locations(CDSs, start, end, strand):
    """Return the CDS and mRNA locations of a gene, in that order.

    The CDS location has exact boundaries. The mRNA location covers the
    same segments, but its outermost boundaries are marked as fuzzy
    (``BeforePosition`` on the first start, ``AfterPosition`` on the
    last end).

    CDSs   -- sequence of (start, end) pairs, 1-based inclusive
    start  -- 1-based start, used only when there is at most one CDS
    end    -- end coordinate, used only when there is at most one CDS
    strand -- strand value passed through to every location

    Returns a ``(cdsloc, mrnaloc)`` tuple.
    """
    #gff is 1-based, gb also, but sf is 0-based
    if len(CDSs) > 1:
        last = len(CDSs) - 1
        parts, mrnaparts = [], []
        for cdsi, (s, e) in enumerate(CDSs):
            parts.append(FeatureLocation(s - 1, e, strand=strand))
            # Only the outer edges of the mRNA are open-ended; with more
            # than one segment the first and last index never coincide.
            mrna_start = BeforePosition(s - 1) if cdsi == 0 else s - 1
            mrna_end = AfterPosition(e) if cdsi == last else e
            mrnaparts.append(
                FeatureLocation(mrna_start, mrna_end, strand=strand))
        cdsloc = CompoundLocation(parts)
        # Build the mRNA from its own fuzzy parts, not from the exact CDS
        # parts (the original passed ``parts`` here by mistake).
        mrnaloc = CompoundLocation(mrnaparts)
    else:
        cdsloc = FeatureLocation(start - 1, end, strand=strand)
        mrnaloc = FeatureLocation(BeforePosition(start - 1),
                                  AfterPosition(end),
                                  strand=strand)
    return cdsloc, mrnaloc
Esempio n. 2
0
 def _set_after(self, location):
     """
     Mark the downstream end of a location as open-ended.

     On the forward strand (strand >= 0) the end of the last part is
     wrapped in "AfterPosition"; otherwise the start of the first part
     is wrapped in "BeforePosition". A multi-part location is modified
     in place and returned; a single-part location is replaced by a new
     FeatureLocation.
     """
     is_forward = location.strand >= 0
     is_compound = len(location.parts) > 1

     if is_forward:
         if not is_compound:
             return FeatureLocation(location.start,
                                    AfterPosition(location.end),
                                    strand=location.strand)
         tail = location.parts[-1]
         location.parts[-1] = FeatureLocation(tail.start,
                                              AfterPosition(tail.end),
                                              strand=tail.strand)
         return location

     # reverse strand
     if not is_compound:
         return FeatureLocation(BeforePosition(location.start),
                                location.end,
                                strand=location.strand)
     head = location.parts[0]
     location.parts[0] = FeatureLocation(BeforePosition(head.start),
                                         head.end,
                                         strand=head.strand)
     return location
Esempio n. 3
0
 def test_before(self):
     """Features: write/read simple before locations."""
     # (start, end, strand, expected INSDC location string)
     cases = [
         (BeforePosition(5), 10, +1, "<6..10"),
         (BeforePosition(15), BeforePosition(20), +1, "<16..<20"),
         (25, BeforePosition(30), +1, "26..<30"),
         (BeforePosition(35), 40, -1, "complement(<36..40)"),
         (BeforePosition(45), BeforePosition(50), -1,
          "complement(<46..<50)"),
         (55, BeforePosition(60), -1, "complement(56..<60)"),
     ]
     for begin, finish, strand, expected in cases:
         feature = SeqFeature(FeatureLocation(begin, finish),
                              strand=strand, type="CDS")
         self.assertEqual(_insdc_feature_location_string(feature),
                          expected)
         self.record.features.append(feature)
     self.write_read_check()
Esempio n. 4
0
 def setUp(self):
     """Create ``self.record``, a 26-letter record with four features."""
     source_feature = SeqFeature(
         FeatureLocation(0, 26),
         type="source",
         qualifiers={"mol_type": ["fake protein"]},
     )
     exact_feature = SeqFeature(FeatureLocation(0, ExactPosition(10)))

     within_start = WithinPosition(12, left=12, right=15)
     within_before_feature = SeqFeature(
         FeatureLocation(within_start, BeforePosition(22)))

     after_start = AfterPosition(16)
     one_of_end = OneOfPosition(26, [ExactPosition(25), AfterPosition(26)])
     after_one_of_feature = SeqFeature(
         FeatureLocation(after_start, one_of_end))

     all_features = [
         source_feature,
         exact_feature,
         within_before_feature,
         after_one_of_feature,
     ]
     sequence = Seq("ABCDEFGHIJKLMNOPQRSTUVWZYX", generic_protein)
     self.record = SeqRecord(
         sequence,
         id="TestID",
         name="TestName",
         description="TestDescr",
         dbxrefs=["TestXRef"],
         annotations={"k": "v"},
         letter_annotations={"fake": "X" * 26},
         features=all_features,
     )
Esempio n. 5
0
        def _get_feature(line):
            """Turn one whitespace-separated result line into a CRISPR feature.

            Columns 3 and 5 of the line hold the start and end. They are
            matched against the entries of ``self.seq_info`` with a
            tolerance of ``INTERVAL`` on both sides, so that a CRISPR
            detected at the edge of a contig does not cause an error
            (changed 2017.8.15). Exactly one entry must match.
            """
            fields = line.strip().split()
            start, end = int(fields[3]), int(fields[5])

            matches = [
                entry for entry in self.seq_info
                if entry.start - self.__class__.INTERVAL <= start
                and end <= entry.end + self.__class__.INTERVAL
            ]
            assert len(matches) == 1
            contig = matches[0]
            seq_id = contig.id

            # Re-express the coordinates relative to the matched entry.
            rel_start = start - contig.start
            rel_end = end - contig.start + 1

            # Coordinates reaching past either edge become fuzzy positions.
            if rel_start <= 0:
                left = BeforePosition(0)
            else:
                left = rel_start
            if rel_end >= contig.length:
                right = AfterPosition(contig.length)
            else:
                right = rel_end

            return ExtendedFeature(
                location=FeatureLocation(left, right, strand=1),
                type="CRISPR",
                seq_id=seq_id)
Esempio n. 6
0
    def test_warnings_on_data_loss(self):
        """Emit warnings when dropping data on write."""
        handle = BytesIO()

        # Case 1: more features than the 255 mentioned in the warning text.
        record = SeqRecord(Seq("ACGT"))
        for _ in range(260):
            record.features.append(
                SeqFeature(FeatureLocation(1, 2), type="misc_feature"))
        with self.assertWarnsRegex(BiopythonWarning, "Too many features"):
            SeqIO.write([record], handle, "xdna")

        # Case 2: a single feature whose start is fuzzy.
        fuzzy_feature = SeqFeature(FeatureLocation(BeforePosition(2), 3),
                                   type="misc_feature")
        record.features = [fuzzy_feature]
        fuzzy_pattern = r"Dropping \d+ features with fuzzy locations"
        with self.assertWarnsRegex(BiopythonWarning, fuzzy_pattern):
            SeqIO.write([record], handle, "xdna")

        # Case 3: a single feature carrying an over-long qualifier value.
        long_note_feature = SeqFeature(FeatureLocation(2, 3),
                                       type="misc_feature",
                                       qualifiers={"note": ["x" * 260]})
        record.features = [long_note_feature]
        truncation_pattern = "Some annotations were truncated to 255 characters"
        with self.assertWarnsRegex(BiopythonWarning, truncation_pattern):
            SeqIO.write([record], handle, "xdna")

        handle.close()
Esempio n. 7
0
def construct_location(raw_start: str, raw_end: str, raw_strand: str,
                       attributes: Dict[str, str], strict: bool = False) -> FeatureLocation:
    """ Builds a FeatureLocation from the raw coordinate and strand fields
        of a GFF line.

        When the attributes mark the feature as partial and provide a
        "start_range" and/or "end_range", the "partial", "start_range" and
        "end_range" keys are removed from the attributes and the matching
        boundary is made open-ended.

        Raises GFFParseError if a coordinate is not an integer or if either
        resulting position is negative.
    """

    try:
        # the start is shifted down by one: FeatureLocation is 0-indexed
        start = ExactPosition(int(raw_start) - 1)
        end = ExactPosition(int(raw_end))
    except ValueError as err:
        raise GFFParseError("Invalid location values: %s" % str(err))

    if start < 0 or end < 0:
        raise GFFParseError("Invalid location values: %s, %s" % (raw_start, raw_end))

    strand = interpret_strand(raw_strand, strict=strict)

    # without ambiguity markers in the attributes, the exact positions stand
    if attributes.get("partial") != "true" or not ("start_range" in attributes or "end_range" in attributes):
        return FeatureLocation(start, end, strand)

    attributes.pop("partial")
    fallback_range = "%s,%s" % (start, end)
    start_range = attributes.pop("start_range", fallback_range)
    end_range = attributes.pop("end_range", fallback_range)
    if start_range.startswith("."):
        start = BeforePosition(int(start))
    if end_range.endswith("."):
        end = AfterPosition(int(end))

    return FeatureLocation(start, end, strand)
Esempio n. 8
0
 def get_feature_location(self):
     """Return this object's span as a FeatureLocation.

     ``self.left`` is shifted down by one because Biopython coordinates
     are 0-based. A boundary flagged as partial becomes a
     Before/AfterPosition; otherwise it is an ExactPosition.
     """
     if self.left_partial:
         start = BeforePosition(self.left - 1)
     else:
         start = ExactPosition(self.left - 1)

     if self.right_partial:
         end = AfterPosition(self.right)
     else:
         end = ExactPosition(self.right)

     return FeatureLocation(start, end, strand=self.strand)
Esempio n. 9
0
 def parse_position(string: str):
     """ Converts a position from a string into a Position subclass.

     Recognised forms:
         "<N"                 -> BeforePosition(N)
         ">N"                 -> AfterPosition(N)
         "UnknownPosition()"  -> UnknownPosition()
         "N"                  -> ExactPosition(N)

     Raises ValueError when the numeric part is not a valid integer.
     This includes the empty string, which previously escaped as an
     IndexError from indexing the first character.
     """
     # startswith() is safe on an empty string, unlike string[0]; the
     # integer is parsed before the position class is looked up so bad
     # input always surfaces as ValueError.
     if string.startswith('<'):
         value = int(string[1:])
         return BeforePosition(value)
     if string.startswith('>'):
         value = int(string[1:])
         return AfterPosition(value)
     if string == "UnknownPosition()":
         return UnknownPosition()
     value = int(string)
     return ExactPosition(value)
 def getLocation(self, left, right, strand, partial_flag="00"):
     """Build a FeatureLocation from 1-based left/right coordinates.

     strand: "+", "1" or 1 give strand 1; any other value gives -1.
     partial_flag: two characters, the first for the left end and the
     second for the right end; "1" marks that end as missing
     {00: both ends existing, 10: left-end missing,
      01: right-end missing, 11: both-ends missing}.
     """
     if strand == "+" or strand == "1" or strand == 1:
         strand = 1
     else:
         strand = -1

     left_missing = partial_flag[0] == "1"
     if left_missing:
         leftPosition = BeforePosition(int(left) - 1)
     else:
         leftPosition = ExactPosition(int(left) - 1)

     right_missing = partial_flag[1] == "1"
     if right_missing:
         rightPosition = AfterPosition(int(right))
     else:
         rightPosition = ExactPosition(int(right))

     return FeatureLocation(leftPosition, rightPosition, strand=strand)
Esempio n. 11
0
    def test_before_position(self):
        """A fuzzy start and an exact end keep their types and values after convert()."""
        original = FeatureLocation(BeforePosition(1), ExactPosition(6), strand=-1)
        converted = self.convert(original)

        converted_start = converted.start
        assert isinstance(converted_start, BeforePosition)
        assert converted_start == 1

        converted_end = converted.end
        assert isinstance(converted_end, ExactPosition)
        assert converted_end == 6
Esempio n. 12
0
 def test_fuzzy_join(self):
     """Features: write/read fuzzy join locations."""
     # Forward strand, open-ended at both outer boundaries.
     first = SeqFeature(FeatureLocation(BeforePosition(10), 20), strand=+1)
     second = SeqFeature(FeatureLocation(25, AfterPosition(40)), strand=+1)
     joined = self.make_join_feature([first, second])
     self.record.features.append(joined)
     self.assertEqual(_insdc_feature_location_string(joined),
                      "join(<11..20,26..>40)")

     # Forward strand, one-of start and within end.
     one_of_start = OneOfPosition([ExactPosition(107), ExactPosition(110)])
     first = SeqFeature(FeatureLocation(one_of_start, 120), strand=+1)
     second = SeqFeature(FeatureLocation(125, 140), strand=+1)
     third = SeqFeature(FeatureLocation(145, WithinPosition(150, 10)),
                        strand=+1)
     joined = self.make_join_feature([first, second, third], "CDS")
     self.assertEqual(_insdc_feature_location_string(joined),
                      "join(one-of(108,111)..120,126..140,146..(150.160))")
     self.record.features.append(joined)

     # Reverse strand, before start and within end.
     first = SeqFeature(FeatureLocation(BeforePosition(210), 220), strand=-1)
     second = SeqFeature(FeatureLocation(225, WithinPosition(240, 4)),
                         strand=-1)
     joined = self.make_join_feature([first, second], "gene")
     self.assertEqual(_insdc_feature_location_string(joined),
                      "complement(join(<211..220,226..(240.244)))")
     self.record.features.append(joined)

     # Reverse strand, after start, one-of end and within end.
     first = SeqFeature(FeatureLocation(AfterPosition(310), 320), strand=-1)
     one_of_end = OneOfPosition([ExactPosition(340), ExactPosition(337)])
     second = SeqFeature(FeatureLocation(325, one_of_end), strand=-1)
     third = SeqFeature(FeatureLocation(345, WithinPosition(350, 5)),
                        strand=-1)
     joined = self.make_join_feature([first, second, third], "CDS")
     self.assertEqual(
         _insdc_feature_location_string(joined),
         "complement(join(>311..320,326..one-of(340,337),346..(350.355)))")
     self.record.features.append(joined)

     self.write_read_check()
Esempio n. 13
0
def create_1_part_seqfeature(start=0,
                             stop=0,
                             strand=1,
                             type="",
                             fuzzy="neither",
                             qualifiers=None):
    """Build a BioPython SeqFeature with a single-part location.

    Start = int
    Stop = int
    Strand = int (-1, 1)
    Type = 'CDS', 'Source', 'tRNA', etc.
    Fuzzy = 'start', 'stop', 'both', or 'neither'; any other value is
        treated like 'neither'.
    Qualifiers = dictionary of feature descriptions."""
    # Pick the position class for each boundary from the fuzzy setting.
    if fuzzy == "start" or fuzzy == "both":
        start_class = BeforePosition
    else:
        start_class = ExactPosition
    if fuzzy == "stop" or fuzzy == "both":
        stop_class = AfterPosition
    else:
        stop_class = ExactPosition

    location = FeatureLocation(start_class(start),
                               stop_class(stop),
                               strand=strand)
    return SeqFeature(location, type=type, qualifiers=qualifiers)
Esempio n. 14
0
    def test_eq_identical(self):
        """Test two identical locations are equal."""
        pairs = [
            # same coordinates, forward strand
            (FeatureLocation(23, 42, 1), FeatureLocation(23, 42, 1)),
            # same coordinates, reverse strand
            (FeatureLocation(23, 42, -1), FeatureLocation(23, 42, -1)),
            # fuzzy boundaries versus plain integers
            (FeatureLocation(BeforePosition(23), AfterPosition(42), 1),
             FeatureLocation(23, 42, 1)),
            # two extra positional arguments, identical on both sides
            (FeatureLocation(23, 42, 1, "foo", "bar"),
             FeatureLocation(23, 42, 1, "foo", "bar")),
        ]
        for left, right in pairs:
            self.assertEqual(left, right)
Esempio n. 15
0
 def test_multiple(self):
     """Check run_both_dirs on sequences holding more than one expected location."""
     filler = "N" * 60

     # start, stop, start, stop
     locations = [
         FeatureLocation(ExactPosition(0), ExactPosition(66), strand=1),
         FeatureLocation(ExactPosition(66), ExactPosition(132), strand=1)
     ]
     two_complete = "ATG" + filler + "TAGGTG" + filler + "TGA"
     self.run_both_dirs(locations, two_complete)

     # start, stop, start
     # (second entry of the same list replaced by an open-ended location)
     locations[1] = FeatureLocation(ExactPosition(66),
                                    AfterPosition(69),
                                    strand=1)
     second_truncated = "ATG" + filler + "TAGGTG"
     self.run_both_dirs(locations, second_truncated)

     # stop, start
     locations = [
         FeatureLocation(BeforePosition(0), ExactPosition(3), strand=1),
         FeatureLocation(ExactPosition(3), AfterPosition(9), strand=1)
     ]
     self.run_both_dirs(locations, "TAGGTGNNN")
Esempio n. 16
0
def getgenefromgbk(gbkfile, location):  # change to work with locations
    """parses a genesequence from a gbk file using the gene location
    parameters
    ----------
    gbkfile
        string, path to gbk file + file
    location
        string of four comma-separated fields
        "scaffold,start,end,strand", example: "3,<120,>480,1".
        A "<" in start marks a fuzzy start and a ">" in end marks a
        fuzzy end; strand is an integer.
    returns
    ----------
    ret = DNA sequence of housekeepinggene from featurelocation
          coordinates
    abs_loc = validation, contains the location of HG on specific
              scaffold. [scaffold, start, end]
    raises
    ----------
    ValueError
        if location is malformed, or if no record in gbkfile has the
        requested scaffold number
    """
    scaff_number, start, end, strand = location.split(",")
    scaff_number = int(scaff_number)

    # Making the FeatureLocation
    if "<" in start:
        f_start = BeforePosition(int(start.strip("<")))
    else:
        f_start = ExactPosition(int(start))
    if ">" in end:
        f_end = AfterPosition(int(end.strip(">")))
    else:
        f_end = ExactPosition(int(end))
    f = FeatureLocation(f_start, f_end, int(strand))

    # The scaffold number is taken from the last three characters of the
    # record name, before any ".". As in the original, every record is
    # scanned and the last match wins.
    DNA = None
    for record in SeqIO.parse(gbkfile, "genbank"):
        record_no = record.name.split(".")[0]
        scaff_check = int(record_no[-3:])  # = scaffold number
        if scaff_check == scaff_number:
            DNA = record.seq
    if DNA is None:
        # Previously this fell through to an UnboundLocalError on DNA.
        raise ValueError(
            "scaffold %d not found in %s" % (scaff_number, gbkfile))
    ret = f.extract(DNA)  # The DNA sequence of the housekeepinggene

    # VALIDATION
    abs_start = int(start.replace(">", "").replace("<", ""))
    abs_end = int(end.replace(">", "").replace("<", ""))
    abs_loc = [scaff_number, abs_start, abs_end]
    return (ret, abs_loc)
Esempio n. 17
0
def create_2_part_seqfeature(start1=0,
                             stop1=0,
                             strand1=1,
                             start2=0,
                             stop2=0,
                             strand2=1,
                             type="",
                             fuzzy="neither",
                             qualifiers=None):
    """Build a BioPython SeqFeature whose location is a two-part join.

    Start1, Start2 = int
    Stop1, Stop2 = int
    Strand1, Strand2 = int (-1, 1)
    Type = 'CDS', 'Source', 'tRNA', etc.
    Fuzzy = 'start', or 'neither'
    Qualifiers = dictionary of feature descriptions."""
    # Only the start of the first part can be made fuzzy for now; every
    # other coordinate is always exact.
    if fuzzy == "start":
        first_start = BeforePosition(start1)
    else:
        first_start = ExactPosition(start1)

    first_part = FeatureLocation(
        first_start, ExactPosition(stop1), strand=strand1)
    second_part = FeatureLocation(
        ExactPosition(start2), ExactPosition(stop2), strand=strand2)

    joined = CompoundLocation([first_part, second_part], "join")
    return SeqFeature(joined,
                      type=type,
                      location_operator="join",
                      qualifiers=qualifiers)
Esempio n. 18
0
 def test_fuzzy(self):
     """Test fuzzy representations."""
     # the individual positions
     exact = ExactPosition(5)
     within_low = WithinPosition(10, left=10, right=13)
     within_high = WithinPosition(13, left=10, right=13)
     between_high = BetweenPosition(24, left=20, right=24)
     before = BeforePosition(15)
     after = AfterPosition(40)

     # integer value and string form of each position
     self.assertEqual(int(within_low), 10)
     self.assertEqual(str(within_low), "(10.13)")
     self.assertEqual(int(within_high), 13)
     self.assertEqual(str(within_high), "(10.13)")
     self.assertEqual(int(between_high), 24)
     self.assertEqual(str(between_high), "(20^24)")
     self.assertEqual(str(before), "<15")
     self.assertEqual(str(after), ">40")

     # the same positions combined into locations
     exact_to_within = FeatureLocation(exact, within_high)
     before_to_between = FeatureLocation(before, between_high)
     within_to_after = FeatureLocation(within_low, after)

     self.assertEqual(str(exact_to_within), "[5:(10.13)]")
     self.assertEqual(str(exact_to_within.start), "5")
     self.assertEqual(str(exact_to_within.end), "(10.13)")
     self.assertEqual(str(before_to_between), "[<15:(20^24)]")
     self.assertEqual(str(before_to_between.start), "<15")
     self.assertEqual(str(before_to_between.end), "(20^24)")
     self.assertEqual(str(within_to_after), "[(10.13):>40]")
     self.assertEqual(str(within_to_after.start), "(10.13)")
     self.assertEqual(str(within_to_after.end), ">40")

     # --- test non-fuzzy representations
     self.assertEqual(exact_to_within.nofuzzy_start, 5)
     self.assertEqual(exact_to_within.nofuzzy_end, 13)
     self.assertEqual(before_to_between.nofuzzy_start, 15)
     self.assertEqual(before_to_between.nofuzzy_end, 24)
     self.assertEqual(within_to_after.nofuzzy_start, 10)
     self.assertEqual(within_to_after.nofuzzy_end, 40)
Esempio n. 19
0
    def test_start_before_end(self):
        """An end below the start raises ValueError; unknown or equal ends do not."""
        message = "must be greater than or equal to start location"

        # (start, end, strand) triples that must be rejected
        rejected = [
            (42, 23, 1),
            (42, 0, 1),
            (BeforePosition(42), AfterPosition(23), -1),
            (42, AfterPosition(0), 1),
        ]
        for begin, finish, strand in rejected:
            with self.assertRaises(ValueError) as caught:
                FeatureLocation(begin, finish, strand)
            self.assertIn(message, str(caught.exception))

        # Features with UnknownPositions should pass check
        FeatureLocation(42, UnknownPosition())
        FeatureLocation(UnknownPosition(), 42)

        # Same start and end should pass check
        FeatureLocation(42, 42)
Esempio n. 20
0
def _read_ft(record, line):
    """Parse one FT line and update ``record.features`` accordingly.

    If columns 5-13 of the line hold a feature name, a new FeatureTable
    entry is created from the location found on the line and appended to
    ``record.features``. Otherwise the line is treated as a continuation
    of the most recently appended feature: it either sets the feature id
    or sets/extends one of the feature's qualifiers.

    Two layouts are handled, labelled "old-style" and "new-style" below;
    they are told apart by which columns are blank.

    The start coordinate is shifted down by one (Python zero-based
    counting); the end coordinate is used as given.
    """
    name = line[5:13].rstrip()
    if name:
        # A feature name is present: this line starts a new feature.
        if line[13:21] == "        ":  # new-style FT line
            location = line[21:80].rstrip()
            # An optional "<isoform>:" prefix names the sequence the
            # location refers to; it is passed on as ``ref`` below.
            try:
                isoform_id, location = location.split(":")
            except ValueError:
                isoform_id = None
            # "from..to", or a single position when there is no "..".
            try:
                from_res, to_res = location.split("..")
            except ValueError:
                from_res = location
                to_res = ""
            qualifiers = {}
        else:  # old-style FT line
            from_res = line[14:20].lstrip()
            to_res = line[21:27].lstrip()
            isoform_id = None
            description = line[34:75].rstrip()
            qualifiers = {"description": description}
        # Convert the start text into a position object:
        # "?" -> unknown, "?N" -> uncertain, "<N" -> before, "N" -> exact.
        if from_res == "?":
            from_res = UnknownPosition()
        elif from_res.startswith("?"):
            position = int(from_res[1:]) - 1  # Python zero-based counting
            from_res = UncertainPosition(position)
        elif from_res.startswith("<"):
            position = int(from_res[1:]) - 1  # Python zero-based counting
            from_res = BeforePosition(position)
        else:
            position = int(from_res) - 1  # Python zero-based counting
            from_res = ExactPosition(position)
        # Convert the end text likewise; an empty end means a single
        # position, so the end is one past the start.
        if to_res == "":
            # NOTE(review): from_res may be an UnknownPosition here (a lone
            # "?" location); confirm that adding 1 to it is supported.
            position = from_res + 1
            to_res = ExactPosition(position)
        elif to_res == "?":
            to_res = UnknownPosition()
        elif to_res.startswith("?"):
            position = int(to_res[1:])
            to_res = UncertainPosition(position)
        elif to_res.startswith(">"):
            position = int(to_res[1:])
            to_res = AfterPosition(position)
        else:
            position = int(to_res)
            to_res = ExactPosition(position)
        location = FeatureLocation(from_res, to_res, ref=isoform_id)
        feature = FeatureTable(
            location=location, type=name, id=None, qualifiers=qualifiers
        )
        record.features.append(feature)
        return
    # this line is a continuation of the previous feature
    feature = record.features[-1]
    if line[5:34] == "                             ":  # old-style FT line
        description = line[34:75].rstrip()
        if description.startswith("/FTId="):
            # store the FTId as the feature ID
            feature.id = description[6:].rstrip(".")
            return
        # this line is a continuation of the description of the previous feature
        old_description = feature.qualifiers["description"]
        # A trailing "-" on the previous text is joined without a space.
        if old_description.endswith("-"):
            description = "%s%s" % (old_description, description)
        else:
            description = "%s %s" % (old_description, description)

        if feature.type in ("VARSPLIC", "VAR_SEQ"):  # special case
            # Remove unwanted spaces in sequences.
            # During line carryover, the sequences in VARSPLIC/VAR_SEQ can get
            # mangled with unwanted spaces like:
            # 'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
            # We want to check for this case and correct it as it happens.
            try:
                first_seq, second_seq = description.split(" -> ")
            except ValueError:
                # not exactly one " -> " separator: leave the text as is
                pass
            else:
                extra_info = ""
                # we might have more information at the end of the
                # second sequence, which should be in parenthesis
                extra_info_pos = second_seq.find(" (")
                if extra_info_pos != -1:
                    extra_info = second_seq[extra_info_pos:]
                    second_seq = second_seq[:extra_info_pos]
                # now clean spaces out of the first and second string
                first_seq = first_seq.replace(" ", "")
                second_seq = second_seq.replace(" ", "")
                # reassemble the description
                description = first_seq + " -> " + second_seq + extra_info
        feature.qualifiers["description"] = description
    else:  # new-style FT line
        value = line[21:].rstrip()
        if value.startswith("/id="):
            # NOTE(review): qualifier_type is assigned but never read in
            # this function.
            qualifier_type = "id"
            value = value[4:]
            assert value.startswith('"')
            assert value.endswith('"')
            feature.id = value[1:-1]
            return
        elif value.startswith("/evidence="):
            value = value[10:]
            assert value.startswith('"')
            if value.endswith('"'):
                value = value[1:-1]
            else:  # continues on the next line
                value = value[1:]
            assert "evidence" not in feature.qualifiers
            feature.qualifiers["evidence"] = value
            return
        elif value.startswith("/note="):
            value = value[6:]
            assert value.startswith('"')
            if value.endswith('"'):
                value = value[1:-1]
            else:  # continues on the next line
                value = value[1:]
            assert "note" not in feature.qualifiers
            feature.qualifiers["note"] = value
            return
        # this line is a continuation of the description of the previous feature
        # The text is appended to the most recently added qualifier, which
        # relies on the qualifiers dict preserving insertion order.
        keys = list(feature.qualifiers.keys())
        key = keys[-1]
        description = value.rstrip('"')
        old_description = feature.qualifiers[key]
        # "evidence" values, and text ending in "-", are joined without a
        # space.
        if key == "evidence" or old_description.endswith("-"):
            description = "%s%s" % (old_description, description)
        else:
            description = "%s %s" % (old_description, description)
        if feature.type == "VAR_SEQ":  # see VARSPLIC above
            try:
                first_seq, second_seq = description.split(" -> ")
            except ValueError:
                pass
            else:
                extra_info = ""
                # we might have more information at the end of the
                # second sequence, which should be in parenthesis
                extra_info_pos = second_seq.find(" (")
                if extra_info_pos != -1:
                    extra_info = second_seq[extra_info_pos:]
                    second_seq = second_seq[:extra_info_pos]
                # now clean spaces out of the first and second string
                first_seq = first_seq.replace(" ", "")
                second_seq = second_seq.replace(" ", "")
                # reassemble the description
                description = first_seq + " -> " + second_seq + extra_info
        feature.qualifiers[key] = description
Esempio n. 21
0
def _add_rfam_inference(feature, qualifiers):
    """Set ``qualifiers['inference']`` from the RFAM entries of a feature.

    Every entry of ``feature['db_xrefs']`` is split on ``':'``; when the first
    field is ``'RFAM'`` the second field is used as the Rfam id. If several
    RFAM cross references are present, the last one wins.
    """
    for dbxref in feature['db_xrefs']:
        fields = dbxref.split(':')
        if fields[0] == 'RFAM':
            qualifiers['inference'] = f'profile:Rfam:{fields[1]}'


def write_insdc(genome, features, genbank_output_path, embl_output_path):
    """Write the annotated genome as GenBank and EMBL flat files.

    One sequence record is built per entry of ``genome['contigs']``. Each
    record gets a ``source`` feature spanning the whole contig, followed by
    the features whose ``feat['contig']`` equals the contig id. Features with
    a truthy ``locus`` additionally get a ``gene`` feature at the same
    location, placed directly before the feature itself.

    Args:
        genome: dict providing ``'contigs'``, ``'taxon'`` and ``'strain'``.
        features: iterable of feature dicts (``'type'``, ``'contig'``,
            ``'start'``, ``'stop'``, ``'strand'`` plus type-specific keys).
        genbank_output_path: path object (must offer ``.open``) of the
            GenBank output file.
        embl_output_path: path object (must offer ``.open``) of the EMBL
            output file.
    """
    log.debug('prepare: genbank=%s, embl=%s', genbank_output_path,
              embl_output_path)

    contig_list = []
    for contig in genome['contigs']:
        contig_features = [
            feat for feat in features if feat['contig'] == contig['id']
        ]

        def count(*feature_types):
            """Count this contig's features having one of ``feature_types``."""
            return sum(1 for feat in contig_features
                       if feat['type'] in feature_types)

        comment = (
            f"Annotated with Bakta (v{bakta.__version__}): https://github.com/oschwengers/bakta\n",
            f"Database (v{cfg.db_info['major']}.{cfg.db_info['minor']}): https://doi.org/10.5281/zenodo.4247252\n",
            '\n',
            "##Genome Annotation Summary:##\n",
            f"{'Annotation Date':<30} :: {datetime.now().strftime('%m/%d/%Y, %H:%M:%S')}\n",
            f"{'Annotation Pipeline':<30} :: Bakta\n",
            f"{'Annotation Software version':<30} ::  v{bakta.__version__}\n",
            f"{'Annotation Database version':<30} ::  v{cfg.db_info['major']}.{cfg.db_info['minor']}\n",
            f"{'CDSs':<30} :: {count(bc.FEATURE_CDS, bc.FEATURE_SORF):5,}\n",
            f"{'tRNAs':<30} :: {count(bc.FEATURE_T_RNA):5,}\n",
            f"{'tmRNAs':<30} :: {count(bc.FEATURE_TM_RNA):5,}\n",
            # this row counts rRNA features; it was previously mislabelled 'tRNAs'
            f"{'rRNAs':<30} :: {count(bc.FEATURE_R_RNA):5,}\n",
            f"{'ncRNAs':<30} :: {count(bc.FEATURE_NC_RNA):5,}\n",
            f"{'regulatory ncRNAs':<30} :: {count(bc.FEATURE_NC_RNA_REGION):5,}\n",
            f"{'CRISPR Arrays':<30} :: {count(bc.FEATURE_CRISPR):5,}",
            f"{'oriCs/oriVs':<30} :: {count(bc.FEATURE_ORIC, bc.FEATURE_ORIV):5,}",
            f"{'oriTs':<30} :: {count(bc.FEATURE_ORIT):5,}",
            f"{'gaps':<30} :: {count(bc.FEATURE_GAP):5,}",
        )
        contig_annotations = {
            'molecule_type': 'DNA',
            'source': genome['taxon'],
            'date': date.today().strftime('%d-%b-%Y').upper(),
            'topology': contig['topology'],
            'data_file_division':
            'HGT' if contig['type'] == bc.REPLICON_CONTIG else 'BCT',
            'comment': comment
            # TODO: taxonomy
        }
        source_qualifiers = {
            'mol_type': 'genomic DNA'
            # 'molecule_type': 'DNA' #  might be necessary in BioPython > 1.78 along with removal of Seq(..., generic_dna)
        }

        description = ''
        if genome['taxon']:
            contig_annotations['organism'] = genome['taxon']
            source_qualifiers['organism'] = genome['taxon']
            description = genome['taxon']
        if genome['strain']:
            source_qualifiers['strain'] = genome['strain']

        if contig['type'] == bc.REPLICON_PLASMID:
            # use one name for qualifier and description, so a missing, empty
            # or None name is reported as 'unnamed' in both places
            plasmid_name = contig.get('name') or 'unnamed'
            source_qualifiers['plasmid'] = plasmid_name
            description = f"{description} plasmid {plasmid_name}"
            if contig['complete']:
                description += ', complete sequence'
            else:
                description += ', whole genome shotgun sequence'
        elif contig['type'] == bc.REPLICON_CHROMOSOME:
            source_qualifiers['chromosome'] = (contig.get('name')
                                               or contig['id'])
            if contig['complete']:
                description = f'{description} chromosome, complete genome'
            else:
                description = f"{description} chromosome {contig['id']}, whole genome shotgun sequence"
        else:
            description += f" {contig['id']}, whole genome shotgun sequence"

        # discard potential leading whitespace (present when no taxon is set)
        if description.startswith(' '):
            description = description[1:]

        contig_rec = SeqIO.SeqRecord(id=contig['id'],
                                     name=contig['id'],
                                     description=description,
                                     annotations=contig_annotations,
                                     seq=Seq(contig['sequence']))

        source = SeqFeature(FeatureLocation(0, contig['length'], strand=+1),
                            type='source',
                            qualifiers=source_qualifiers)
        seq_feature_list = [source]

        for feature in contig_features:
            feature_type = feature['type']
            insdc_feature_type = None
            qualifiers = {}
            if 'db_xrefs' in feature:
                qualifiers['db_xref'] = feature['db_xrefs']
            if 'product' in feature:
                qualifiers['product'] = feature['product']
            if 'locus' in feature:
                qualifiers['locus_tag'] = feature['locus']

            if feature_type == bc.FEATURE_GAP:
                insdc_feature_type = bc.INSDC_FEATURE_GAP
                qualifiers['estimated_length'] = feature['length']
            elif feature_type in (bc.FEATURE_ORIC, bc.FEATURE_ORIV):
                # TODO: Add fuzzy positions for oriC/oriV
                insdc_feature_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION
                qualifiers['inference'] = 'similar to DNA sequence'
            elif feature_type == bc.FEATURE_ORIT:
                # TODO: Add fuzzy positions for oriT
                insdc_feature_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER
                qualifiers['inference'] = 'similar to DNA sequence'
            elif feature_type in (bc.FEATURE_CDS, bc.FEATURE_SORF):
                qualifiers['translation'] = feature['sequence']
                qualifiers['codon_start'] = 1
                qualifiers['transl_table'] = cfg.translation_table
                insdc_feature_type = bc.INSDC_FEATURE_CDS
                inference = [
                    'ab initio prediction:Prodigal:2.6' if feature_type ==
                    bc.FEATURE_CDS else 'ab initio prediction:Bakta'
                ]
                if 'ups' in feature and 'ncbi_nrp_id' in feature['ups']:
                    qualifiers['protein_id'] = feature['ups']['ncbi_nrp_id']
                if 'ips' in feature and 'uniref100_id' in feature['ips']:
                    ips_subject_id = feature['ips']['uniref100_id']
                    inference.append(
                        f'similar to AA sequence:UniProtKB:{ips_subject_id}')
                if 'psc' in feature and 'uniref90_id' in feature['psc']:
                    psc_subject_id = feature['psc']['uniref90_id']
                    inference.append(
                        f'similar to AA sequence:UniProtKB:{psc_subject_id}')
                qualifiers['inference'] = inference
            elif feature_type == bc.FEATURE_T_RNA:
                # TODO: Position anticodon
                if 'amino_acid' in feature and 'anti_codon' in feature:
                    if 'anti_codon_pos' in feature:
                        anti_codon_pos = feature['anti_codon_pos']
                        qualifiers[
                            'anticodon'] = f"(pos:{anti_codon_pos[0]}..{anti_codon_pos[1]},aa:{feature['amino_acid']},seq:{feature['anti_codon']})"
                    else:
                        qualifiers[
                            'note'] = f"tRNA-{feature['amino_acid']} ({feature['anti_codon']})"
                qualifiers['inference'] = 'profile:tRNAscan:2.0'
                insdc_feature_type = bc.INSDC_FEATURE_T_RNA
                if 'pseudo' in feature:
                    qualifiers['pseudo'] = None
            elif feature_type == bc.FEATURE_TM_RNA:
                qualifiers['inference'] = 'profile:aragorn:1.2'
                insdc_feature_type = bc.INSDC_FEATURE_TM_RNA
            elif feature_type == bc.FEATURE_R_RNA:
                _add_rfam_inference(feature, qualifiers)
                insdc_feature_type = bc.INSDC_FEATURE_R_RNA
            elif feature_type == bc.FEATURE_NC_RNA:
                # TODO: ncRNA_class
                _add_rfam_inference(feature, qualifiers)
                qualifiers[bc.INSDC_FEATURE_NC_RNA_CLASS] = select_ncrna_class(
                    feature)
                insdc_feature_type = bc.INSDC_FEATURE_NC_RNA
            elif feature_type == bc.FEATURE_NC_RNA_REGION:
                _add_rfam_inference(feature, qualifiers)
                qualifiers[bc.INSDC_FEATURE_REGULATORY_CLASS] = (
                    select_regulatory_class(feature))
                insdc_feature_type = bc.INSDC_FEATURE_REGULATORY
                # the product is reported as a note for this feature type
                qualifiers['note'] = feature['product']
                qualifiers.pop('product', None)
            elif feature_type == bc.FEATURE_CRISPR:
                qualifiers[bc.INSDC_FEATURE_REPEAT_FAMILY] = 'CRISPR'
                qualifiers[bc.INSDC_FEATURE_REPEAT_TYPE] = 'direct'
                qualifiers[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feature[
                    'repeat_consensus']
                qualifiers['inference'] = 'COORDINATES:alignment:pilercr:1.02'
                insdc_feature_type = bc.INSDC_FEATURE_REPEAT_REGION
                # the product is reported as a note for this feature type
                qualifiers['note'] = feature['product']
                qualifiers.pop('product', None)

            strand = None
            if feature['strand'] == bc.STRAND_FORWARD:
                strand = 1
            elif feature['strand'] == bc.STRAND_REVERSE:
                strand = -1
            elif feature['strand'] == bc.STRAND_UNKNOWN:
                strand = 0

            # 'start' is converted to a 0-based, end-exclusive coordinate
            start = feature['start'] - 1
            stop = feature['stop']
            if 'edge' in feature:
                # feature spans the contig end: [start, length) joined with [0, stop)
                # NOTE(review): the parts are listed in the same order for both
                # strands - confirm this is intended for reverse-strand features
                fl_1 = FeatureLocation(start, contig['length'], strand=strand)
                fl_2 = FeatureLocation(0, stop, strand=strand)
                feature_location = CompoundLocation([fl_1, fl_2])
            else:
                if 'truncated' in feature:
                    truncated = feature['truncated']
                    is_forward = feature['strand'] == bc.STRAND_FORWARD
                    if truncated == bc.FEATURE_END_5_PRIME:
                        # the 5' end is the start on the forward strand and
                        # the stop otherwise
                        if is_forward:
                            start = BeforePosition(start)
                        else:
                            stop = AfterPosition(stop)
                    elif truncated == bc.FEATURE_END_3_PRIME:
                        if is_forward:
                            stop = AfterPosition(stop)
                        else:
                            start = BeforePosition(start)
                    else:
                        # any other value marks both ends as fuzzy
                        start = BeforePosition(start)
                        stop = AfterPosition(stop)
                feature_location = FeatureLocation(start, stop, strand=strand)

            if feature.get('locus', None):
                gene_qualifier = {'locus_tag': feature['locus']}
                if feature.get('gene', None):
                    qualifiers['gene'] = feature['gene']
                    gene_qualifier['gene'] = feature['gene']
                seq_feature_list.append(
                    SeqFeature(feature_location,
                               type='gene',
                               qualifiers=gene_qualifier))
            seq_feature_list.append(
                SeqFeature(feature_location,
                           type=insdc_feature_type,
                           qualifiers=qualifiers))
        contig_rec.features = seq_feature_list
        contig_list.append(contig_rec)

    with genbank_output_path.open('wt', encoding='utf-8') as fh:
        log.info('write GenBank: path=%s', genbank_output_path)
        SeqIO.write(contig_list, fh, format='genbank')

    with embl_output_path.open('wt', encoding='utf-8') as fh:
        log.info('write EMBL: path=%s', embl_output_path)
        SeqIO.write(contig_list, fh, format='embl')
Esempio n. 22
0
class SeqFeatureTests(unittest.TestCase):
    """Tests for SeqFeatureEM2, run against one dummy protein record.

    The record carries eight features (ids 'd1' to 'd8'); 'd5', 'd6' and 'd8'
    have a fuzzy (Before/After) boundary.
    """

    sprot: SeqRecord = SeqRecord(
        SeqEM2.protein('MYNAMEISFREDHEREIAMWHEREARETHEYALLTHISISEXCELLENT'),
        id='X',
        name='DummyProt')
    sprot.features = [
        # MYNAMEISFRED
        SeqFeatureEM2(parent=sprot, location=FeatureLocation(0, 11),
                      type='domain', id='d1'),
        # FREDHEREIAM
        SeqFeatureEM2(parent=sprot, location=FeatureLocation(8, 18),
                      type='domain', id='d2'),
        # WHEREARETHEY
        SeqFeatureEM2(parent=sprot, location=FeatureLocation(19, 30),
                      type='domain', id='d3'),
        # ISFREDHEREIAMWHERE
        SeqFeatureEM2(parent=sprot, location=FeatureLocation(6, 23),
                      type='domain', id='d4'),
        # THISIS
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(34, AfterPosition(39)),
                      id='d5'),
        # MYNAME
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(BeforePosition(2), 5),
                      type='domain', id='d6'),
        # WHERE
        SeqFeatureEM2(parent=sprot, location=FeatureLocation(19, 23),
                      type='domain', id='d7'),
        # YALLTHI
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(BeforePosition(30), 37),
                      type='domain', id='d8'),
    ]

    @classmethod
    def test_parent(cls):
        """Feature ids are in order and the second feature points at the record."""
        record = cls.sprot
        expected_ids = ['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8']
        assert [feat.id for feat in record.features] == expected_ids
        assert record.features[1].parent.id == record.id
        assert record.features[1].parent.name == record.name
        assert record.features[1].parent.seq._data == record.seq._data

    @classmethod
    def test_lies_within(cls):
        """lies_within on the exact feature 'd2' (8..18)."""
        d2 = cls.sprot.features[1]
        assert d2.lies_within(5, 25)
        assert not d2.lies_within(10, 25)
        assert not d2.lies_within(19, 25)

    @classmethod
    def test_lies_within_fuzzy(cls):
        """lies_within on fuzzy features is expected to emit a UserWarning."""
        feats = cls.sprot.features
        with pytest.warns(UserWarning):
            feats[4].lies_within(30, 42)
            feats[5].lies_within(0, 10)

    @classmethod
    def test_overlaps(cls):
        """overlaps on the exact feature 'd3' (19..30), with ranges and single points."""
        d3 = cls.sprot.features[2]
        assert d3.overlaps(20, 25)
        assert d3.overlaps(20, 40)
        assert d3.overlaps(20)
        assert not d3.overlaps(35)
        assert not d3.overlaps(2, 5)

    @classmethod
    def test_overlaps_fuzzy(cls):
        """overlaps on fuzzy features is expected to emit a UserWarning."""
        feats = cls.sprot.features
        with pytest.warns(UserWarning):
            feats[4].overlaps(35)
            feats[5].overlaps(3)

    @classmethod
    def test_covers(cls):
        """covers on the exact feature 'd4' (6..23)."""
        d4 = cls.sprot.features[3]
        assert d4.covers(15, 20)
        assert not d4.covers(4, 20)

    @classmethod
    def test_covers_fuzzy(cls):
        """covers on fuzzy features is expected to emit a UserWarning."""
        feats = cls.sprot.features
        with pytest.warns(UserWarning):
            feats[4].covers(35, 38)
            feats[5].covers(3, 4)

    @classmethod
    def test_intersect(cls):
        """intersect yields a feature whose location is the common region."""
        feats = cls.sprot.features
        d5_and_d8 = feats[4].intersect(feats[7])
        assert d5_and_d8.location == FeatureLocation(34, 37)
        d3_and_d4 = feats[2].intersect(feats[3])
        assert d3_and_d4.location == feats[6].location
        d2_and_d4 = feats[1].intersect(feats[3])
        assert d2_and_d4.location == FeatureLocation(8, 18)

    @classmethod
    def test_intersect_errors(cls):
        """Intersecting with a feature built without a parent raises ValueError."""
        orphan = SeqFeatureEM2(location=FeatureLocation(30, 37))
        with pytest.raises(ValueError, match=r'Undetermined .*'):
            cls.sprot.features[0].intersect(orphan)

    @classmethod
    def test_intersect_fuzzy(cls):
        """intersect involving a fuzzy feature is expected to emit a UserWarning."""
        feats = cls.sprot.features
        with pytest.warns(UserWarning):
            feats[5].intersect(feats[0])

    @classmethod
    def test_move(cls):
        """move(5) returns a feature shifted by five positions."""
        shifted = cls.sprot.features[0].move(5)
        assert shifted.location == FeatureLocation(5, 16)
Esempio n. 23
0
def gene2features(r, gene, gene2position, gene2product, start, end, gcode,
                  partialyes, verbose):
    """Add the gene, mRNA and CDS features of ``gene`` to record ``r``.

    A ``gene`` feature is added only for identifiers ending in ``'.t1'``; the
    ``mRNA`` feature is always added. The CDS is translated with table
    ``gcode``: a translation lacking a leading ``M`` and/or a trailing ``*``
    is treated as partial and gets fuzzy boundaries. CDSs with internal stop
    codons are skipped, as are partial CDSs when ``partialyes`` is false; in
    both cases the gene/mRNA features added before remain in the record.

    Args:
        r: record offering ``.seq`` and a ``.features`` list.
        gene: gene/transcript identifier, key of ``gene2position``.
        gene2position: maps gene -> (contig, CDSs, strand, function, frames);
            strand ``'1'`` or ``'+'`` means forward, anything else reverse.
        gene2product: maps gene -> product; missing genes are reported as
            "hypothetical protein".
        start, end: gene boundaries (``start - 1`` is used as the 0-based
            location start).
        gcode: translation table handed to ``translate``.
        partialyes: keep partial CDS features when true.
        verbose: write warnings to stderr when true.

    Returns:
        The record ``r``.
    """
    _contig, cds_spans, gffstrand, function, _frames = gene2position[gene]
    if gffstrand in ('1', '+'):
        strand = +1
    else:
        strand = -1
        # reverse a copy: the entry stored in gene2position belongs to the
        # caller and must not be flipped again on every call
        cds_spans = list(reversed(cds_spans))
    # NOTE: the stop codon is not added to the CDS boundaries here
    cdsloc, mrnaloc = get_locations(cds_spans, start, end, strand)
    #add gene
    geneid = gene
    #get product
    product = gene2product.get(geneid, "hypothetical protein") \
        if hasattr(gene2product, 'get') else (
            gene2product[geneid] if geneid in gene2product
            else "hypothetical protein")
    if gene.endswith('.t1'):
        # the strand belongs to the location; SeqFeature's own ``strand``
        # argument is deprecated in Biopython
        sf = SeqFeature(FeatureLocation(BeforePosition(start - 1),
                                        AfterPosition(end),
                                        strand=strand),
                        type='gene',
                        id=geneid)
        sf.qualifiers = {
            "locus_tag": geneid,
            "gene": geneid,
            "product": product
        }
        r.features.append(sf)
    #get mRNA sf
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers = {
        "locus_tag": geneid,
        "gene": geneid,
        "product": product
    }
    r.features.append(sf)
    #get CDS sf
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    #get translation
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))
    #solve non-triplets issue: trim the incomplete codon off the 3' end
    overhang = len(seq) % 3
    if overhang:
        if strand == 1:
            end -= overhang
        else:
            start += overhang
    ##check for partial sequence - no M as first or no * as last aa
    # startswith/endswith also cope with an empty translation
    missing_start = not aa.startswith("M")
    missing_stop = not aa.endswith("*")
    partial = missing_start or missing_stop
    # NOTE(review): a partial CDS is replaced by one span from start to end,
    # which drops the exon structure of multi-exon CDSs - confirm intended
    if missing_start and missing_stop:
        #both ends partial
        sf.location = FeatureLocation(BeforePosition(start - 1),
                                      AfterPosition(end),
                                      strand=strand)
    elif (missing_start and strand == 1) or (missing_stop and strand == -1):
        #left end partial
        sf.location = FeatureLocation(BeforePosition(start - 1),
                                      end,
                                      strand=strand)
    elif (missing_stop and strand == 1) or (missing_start and strand == -1):
        #right end partial
        sf.location = FeatureLocation(start - 1,
                                      AfterPosition(end),
                                      strand=strand)
    #strip stop codon
    aa = aa.strip("*")
    #skip entries with internal stop codons
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" %
                             gene)
        return r
    sf.qualifiers = {
        'transl_table': gcode,
        "locus_tag": geneid,
        "gene": geneid,
        "product": product,
        "translation": aa
    }
    if function:
        sf.qualifiers['note'] = function
    #inform about partial entries
    if partial:
        #skip if not partial are allowed
        if not partialyes:
            return r
        if not aa.startswith("M"):
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene, ))
    #add to features
    r.features.append(sf)
    return r
Esempio n. 24
0
def scan_orfs(seq: str, direction: int, offset: int = 0) -> List[FeatureLocation]:
    """ Scan for open reading frames on a given sequence.
        Skips all complete ORFs of 60 bases or less (stop codon not counted).

        ORFs that run off either end of the sequence are reported with a
        fuzzy (Before/After) boundary at that end.

        Arguments:
            seq: the sequence to examine; for direction -1 the coordinates
                 found on it are mirrored (seq_len - position) before the
                 offset is added
            direction: the search direction to use (all ORFs will use this as the strand)
            offset: an offset to add to any location discovered

        Returns:
            a list of FeatureLocations for each ORF, ordered by ascending position
    """
    seq = seq.upper()
    start_codons = ('ATG', 'GTG', 'TTG')
    stop_codons = ('TAA', 'TAG', 'TGA')
    matches = []
    # cache the sequence length
    seq_len = len(seq)
    for frame in (0, 1, 2):
        i = frame
        # -1 means "no stop codon seen yet in this frame"; 0 is not usable as
        # the marker because a stop codon can sit at index 0
        last_stop = -1
        while i < seq_len - 2:
            codon = seq[i:i + 3]
            if codon in stop_codons and last_stop < 0:
                # special case for unstarted stops: the ORF began before the
                # sequence and ends with this stop codon
                last_stop = i
                stop_end = i + 2 + 1
                if direction == -1:
                    # mirror the coordinates first and add the offset once,
                    # exactly as for the complete reverse ORFs below
                    new_orf = FeatureLocation(seq_len + offset - stop_end,
                                              AfterPosition(seq_len + offset),
                                              strand=direction)
                else:
                    new_orf = FeatureLocation(BeforePosition(offset),
                                              offset + stop_end, direction)
                matches.append(new_orf)
            if codon not in start_codons:
                i += 3
                continue
            # Look for the next stop codon in this frame
            for j in range(i, seq_len - 2, 3):
                if seq[j:j + 3] not in stop_codons:
                    continue
                last_stop = j
                # Skip ORFs of 20 AA / 60 bases or less; nothing longer can
                # start here since this is the first stop after the start
                if j - i > 60:
                    start = i
                    end = j + 2 + 1
                    if direction == 1:
                        new_orf = FeatureLocation(offset + start,
                                                  offset + end, direction)
                    else:
                        # reversed, so convert back to the forward positions
                        new_orf = FeatureLocation(seq_len + offset - end,
                                                  seq_len + offset - start, direction)
                    matches.append(new_orf)
                break

            # if we found a matching stop, carry on looking for starts after this stop
            if last_stop > i:
                i = last_stop
                continue

            # Save orfs ending at the end of the sequence without stop codon
            if direction == 1:
                new_orf = FeatureLocation(i + offset, AfterPosition(seq_len + offset), direction)
            else:
                # reversed, so convert back to the forward positions
                new_orf = FeatureLocation(BeforePosition(offset), offset + seq_len - i, direction)
            matches.append(new_orf)
            # since there are no stop codons, just stop here
            break
    return sorted(matches, key=lambda x: min(x.start, x.end))
Esempio n. 25
0
 def test_end_without_start(self):
     """A lone stop codon gives one ORF with a fuzzy start, ending after the stop."""
     unstarted_orf = FeatureLocation(BeforePosition(0),
                                     ExactPosition(6),
                                     strand=1)
     self.run_both_dirs([unstarted_orf], "NNNTAGNNN")
Esempio n. 26
0
                # Find and fix peptide sequence
                if pep_id not in prot_seq_dict:
                    raise RuntimeError(
                        "Could not find protein sequence for id '%s'" % pep_id)

                pep_seq = str(prot_seq_dict[pep_id].seq)

                # If the protein doesn't start with methionine, it is probably a partial one
                fuzzy_start = False
                fuzzy_end = False
                if pep_seq[0] != 'M':
                    fuzzy_start = True
                    if sf.strand == 1:
                        cds_locs[0] = FeatureLocation(
                            BeforePosition(cds_locs[0].start), cds_locs[0].end,
                            sf.strand)
                        cds_quals['codon_start'] = 1
                    else:
                        cds_locs[-1] = FeatureLocation(
                            cds_locs[-1].start,
                            AfterPosition(cds_locs[-1].end), sf.strand)
                        cds_quals['codon_start'] = 1
                # If the protein doesn't end with stop codon, it is probably a partial one
                if not args.no_stop_codon and pep_seq[-1] not in ('.', '*'):
                    fuzzy_end = True
                    if sf.strand == 1:
                        cds_locs[-1] = FeatureLocation(
                            cds_locs[-1].start,
                            AfterPosition(cds_locs[-1].end), sf.strand)
                        cds_quals['codon_start'] = 1