Ejemplo n.º 1
0
        def _get_nucseq(feature, seq_record, extension):
            """Extract a feature together with up to ``extension`` flanking bases.

            The requested window is clipped to the bounds of ``seq_record``.
            ``cds_start``/``cds_end`` are the offsets of the original feature
            inside the extracted, strand-oriented sequence.

            Returns an ``Extended_CDS`` built from the feature id, the extended
            nucleotide string, the two offsets, the genomic window start/end
            and the strand.
            """
            location = feature.location
            # Read the strand from the location everywhere, rather than mixing
            # ``feature.strand`` and ``feature.location.strand``.
            strand = location.strand

            start = location.start - extension
            cds_start = extension
            cds_end = cds_start + len(location)
            if start < 0:
                # The window ran off the left edge: clip it to 0 and move the
                # offsets left by the number of bases that were lost.
                clipped = -start
                start += clipped
                cds_start -= clipped
                cds_end -= clipped

            # Clip the right edge. Offsets are measured from the left edge, so
            # they need no adjustment here.
            end = min(location.end + extension, len(seq_record))

            if strand == -1:
                # The extracted sequence is reverse-complemented, so the
                # offsets have to be mirrored within the window.
                length = end - start
                cds_start, cds_end = length - cds_end, length - cds_start

            extended_location = FeatureLocation(start=start, end=end, strand=strand)
            nucseq = extended_location.extract(seq_record)

            # Sanity check: the offsets must recover exactly the original feature.
            assert str(nucseq.seq)[cds_start:cds_end] == str(feature.extract(seq_record).seq)
            return Extended_CDS(feature.id, str(nucseq.seq), int(cds_start), int(cds_end),
                                int(start), int(end), int(strand))
Ejemplo n.º 2
0
    def get_aa_translation_from_location(
            self,
            location: FeatureLocation,
            transl_table: Union[str, int] = None) -> Seq:
        """ Obtain the translation for a feature based on its location

            Arguments:
                location: the location to extract from this record's sequence
                transl_table: the translation table to use; when None the
                              record's own ``_transl_table`` is used

            Returns:
                a Seq of the translation in which stop codons and ambiguous
                residues (any of ``*BJOUZ``) are replaced by ``X`` and gap
                characters are removed

            Raises:
                ValueError: if the location ends beyond the available sequence
        """
        if location.end > len(self.seq):
            raise ValueError("location outside available sequence")
        if transl_table is None:
            transl_table = self._transl_table
        extracted = location.extract(self.seq).ungap('-')
        # drop any trailing partial codon so that only whole codons are translated
        remainder = len(extracted) % 3
        if remainder:
            extracted = extracted[:-remainder]
        seq = extracted.translate(to_stop=True, table=transl_table)
        if not seq:
            # go past stop codons and hope for something to work with
            seq = extracted.translate(table=transl_table)

        # Replace stops/ambiguous residues with an explicit unknown and drop
        # gaps in a single pass. The gap removal used to be applied to a value
        # that was never returned, so it had no effect.
        cleanup = str.maketrans("*BJOUZ", "XXXXXX", "-")
        return Seq(str(seq).translate(cleanup))
Ejemplo n.º 3
0
 def test_reference_in_location_record(self):
     """Check extraction when a location refers to a different record."""
     host = SeqRecord.SeqRecord(seq=Seq.Seq("actg"))
     foreign = SeqRecord.SeqRecord(seq=Seq.Seq("gtcagctac"))
     referring = FeatureLocation(5, 8, ref="ANOTHER.7")

     # No references supplied at all.
     no_references = r"Feature references another sequence \(ANOTHER\.7\), references mandatory"
     with self.assertRaisesRegex(ValueError, no_references):
         referring.extract(host)

     # References supplied, but not under the key the location asks for.
     wrong_key = r"Feature references another sequence \(ANOTHER\.7\), not found in references"
     with self.assertRaisesRegex(ValueError, wrong_key):
         referring.extract(host, references={"SOMEOTHER.2": foreign})

     # Matching key: positions 5..8 are taken from the referenced record.
     extracted = referring.extract(host, references={"ANOTHER.7": foreign})
     self.assertEqual(extracted, "cta")
Ejemplo n.º 4
0
 def test_reference_in_location_sequence(self):
     """Check extraction when a location refers to a different plain sequence."""
     host = Seq.Seq("actg")
     foreign = Seq.Seq("gtcagctac")
     referring = FeatureLocation(5, 8, ref="ANOTHER.7")

     # Positions 5..8 are taken from the referenced sequence, not the host.
     lookup = {"ANOTHER.7": foreign}
     extracted = referring.extract(host, references=lookup)
     self.assertEqual(extracted, "cta")
Ejemplo n.º 5
0
def createSeqFromTblastn(subject_fna, sseq_seq_faa, exonerate_target_id, start_match, end_match,
                         flank=10000):
    """
    Use the result from the tBlastn to extract a region from the subject genome.
    The region extracted corresponds to the match region plus ``flank`` bases
    before and after it, clipped to the bounds of the contig.

    Nothing is done when ``sseq_seq_faa`` already exists. The match coordinates
    may be given in either order; the region is always extracted from the
    forward strand (it is not reverse-complemented). When ``start_match`` equals
    ``end_match`` the whole contig is written unchanged.

    Parameters
    ----------
    subject_fna: str
        path to subject fasta sequence (genome)
    sseq_seq_faa: str
        path to output fasta sequence
    exonerate_target_id: str
        ID of the contig/scaffold/chromosome where a match has been found
    start_match: int
        start of the match
    end_match: int
        end of the match
    flank: int
        number of bases to include on each side of the match (default 10000)

    Raises
    ------
    IndexError
        if no record with ID ``exonerate_target_id`` exists in ``subject_fna``
    """
    if os.path.exists(sseq_seq_faa):
        # Already extracted on a previous run.
        return

    with open(subject_fna, "r") as fna:
        # Stop parsing as soon as the wanted record is found instead of loading
        # every record of the genome into a list.
        sseq_seq = next(
            (seq_record for seq_record in SeqIO.parse(fna, "fasta")
             if seq_record.id == exonerate_target_id),
            None)
    if sseq_seq is None:
        # IndexError is kept for backward compatibility with the former
        # ``[...][0]`` lookup.
        raise IndexError(
            "sequence '{}' not found in '{}'".format(exonerate_target_id, subject_fna))

    sseq_seq.description = "tblastn identified sequence"
    # The ID records the match coordinates exactly as they were given.
    sseq_seq.id = exonerate_target_id + "_" + str(start_match) + "_" + str(end_match)

    if start_match != end_match:
        region_start = max(min(start_match, end_match) - flank, 0)
        region_end = min(max(start_match, end_match) + flank, len(sseq_seq.seq))
        seq_location = FeatureLocation(region_start, region_end)
        sseq_seq.seq = seq_location.extract(sseq_seq.seq)

    SeqIO.write(sseq_seq, sseq_seq_faa, "fasta")
Ejemplo n.º 6
0
 def test_reference_in_compound_location_sequence(self):
     """Check compound-location extraction when one part refers to another sequence."""
     host = Seq.Seq("aaccaaccaaccaaccaa")
     foreign = Seq.Seq("ttggttggttggttggtt")

     # The first part is read from the host, the second from the referenced sequence.
     local_part = FeatureLocation(2, 6)
     foreign_part = FeatureLocation(5, 8, ref="ANOTHER.7")
     compound = local_part + foreign_part

     joined = compound.extract(host, references={"ANOTHER.7": foreign})
     self.assertEqual(joined, "ccaatgg")
Ejemplo n.º 7
0
    def __init__(self, logger, sequences, reference, dateFormat):
        """Build the sequence set from parsed JSON input.

        Args:
            logger: stored on the instance as ``self.log``.
            sequences: mapping of name -> dict with a ``"seq"`` string and an
                ``"attributes"`` dict that contains ``"raw_date"``.
            reference: dict with ``"included"``, ``"strain"`` and ``"seq"``,
                and optionally ``"genes"`` mapping gene name -> dict with
                ``"start"``, ``"end"`` and ``"strand"``.
            dateFormat: passed through to ``parse_date``.
        """
        super(sequence_set, self).__init__()
        self.log = logger

        # load sequences from the (parsed) JSON - don't forget to sort out dates
        # ``items()`` works on both Python 2 and 3, unlike ``iteritems()``.
        self.seqs = {}
        for name, data in sequences.items():
            record = SeqRecord(Seq(data["seq"], generic_dna),
                               id=name,
                               name=name,
                               description=name)
            self.seqs[name] = record
            record.attributes = data["attributes"]
            # tidy up dates
            date_struc = parse_date(record.attributes["raw_date"], dateFormat)
            record.attributes["num_date"] = date_struc[1]
            record.attributes["date"] = date_struc[2]

        # if the reference is to be analysed it'll already be in the (filtered & subsampled)
        # sequences, so no need to add it here, and no need to care about attributes etc
        # we do, however, need it for alignment
        self.reference_in_dataset = reference["included"]
        name = reference["strain"]
        self.reference_seq = SeqRecord(Seq(reference["seq"], generic_dna),
                                       id=name,
                                       name=name,
                                       description=name)
        if reference.get("genes"):
            self.proteins = {}
            reference_str = str(self.reference_seq.seq)
            for k, v in reference["genes"].items():
                start, end, strand = v["start"], v["end"], v["strand"]
                feature = FeatureLocation(start=start, end=end, strand=strand)

                # Translate sequences to identify any proteins ending with a stop codon.
                translation = Seq.translate(Seq(feature.extract(reference_str)))
                if translation.endswith("*"):
                    # Truncate the last codon of the protein to omit the stop codon.
                    # On the minus strand the last codon sits at the low-coordinate
                    # side of the location, so the start is moved instead of the end
                    # (shortening the end there would remove the first codon).
                    if strand == -1:
                        feature = FeatureLocation(start=start + 3, end=end, strand=strand)
                    else:
                        feature = FeatureLocation(start=start, end=end - 3, strand=strand)

                self.proteins[k] = feature
        else:
            self.proteins = None

        # other things:
        self.run_dir = '_'.join([
            'temp',
            time.strftime('%Y%m%d-%H%M%S', time.gmtime()),
            str(random.randint(0, 1000000))
        ])
        self.nthreads = 2  # should load from config file
Ejemplo n.º 8
0
 def test_reference_in_compound_location_record(self):
     """Check compound-location extraction when one part refers to another record."""
     host = SeqRecord.SeqRecord(Seq.Seq("aaccaaccaaccaaccaa"))
     foreign = SeqRecord.SeqRecord(Seq.Seq("ttggttggttggttggtt"))
     local_part = FeatureLocation(2, 6)
     foreign_part = FeatureLocation(5, 8, ref="ANOTHER.7")
     compound = local_part + foreign_part

     # No references supplied at all.
     no_references = r"Feature references another sequence \(ANOTHER\.7\), references mandatory"
     with self.assertRaisesRegex(ValueError, no_references):
         compound.extract(host)

     # References supplied, but not under the key the location asks for.
     wrong_key = r"Feature references another sequence \(ANOTHER\.7\), not found in references"
     with self.assertRaisesRegex(ValueError, wrong_key):
         compound.extract(host, references={"SOMEOTHER.2": foreign})

     # Matching key: the two parts are joined and the sequence of the result is compared.
     joined = compound.extract(host, references={"ANOTHER.7": foreign})
     self.assertEqual(joined.seq, "ccaatgg")
Ejemplo n.º 9
0
def extractUpstream(r, f, leftmost=200):
    """Extract the region immediately upstream of feature ``f`` from ``r``.

    Args:
        r: the parent record/sequence handed to ``FeatureLocation.extract``.
        f: the feature; its ``location`` and its ``locus_tag`` qualifier are read.
        leftmost: number of bases to take upstream of the feature (default 200).

    Returns:
        The extracted upstream region, with ``id`` set to the feature's first
        ``locus_tag`` value and ``name``/``description`` blanked.
    """
    from Bio.SeqFeature import FeatureLocation

    location = f.location
    start, end, strand = location.start, location.end, location.strand
    if strand == 1:
        # Clamp at 0 so a feature close to the beginning of the sequence does
        # not produce a negative start coordinate.
        start_, end_ = max(start - leftmost, 0), start
    else:
        # Biopython locations are 0-based and end-exclusive, so the first base
        # past the feature is at index ``end`` (not ``end + 1``).
        start_, end_ = end, end + leftmost

    fl = FeatureLocation(start_, end_, strand)
    upstream = fl.extract(r)
    upstream.id = f.qualifiers['locus_tag'][0]
    upstream.name, upstream.description = '', ''
    return upstream
def find_cds():
    """Check the CDS annotated in the current record's description for a stop codon.

    The description of ``record_dict[keys]`` is split on ``|``; the first field
    that starts with ``CDS`` is expected to look like ``CDS:<start>-<end>``
    (1-based, inclusive coordinates).

    Returns:
        1 if the translated CDS contains a stop codon (``*``), otherwise 0.
        0 is also returned when no CDS field is present.
    """
    seq_des = str(record_dict[keys].description).split("|")
    for i in seq_des:
        if re.match("CDS", i):
            feature, cds_start, cds_end = re.split(":|-", i)
            # Convert the 1-based inclusive start to a 0-based location.
            f = FeatureLocation(int(cds_start) - 1, int(cds_end))
            cds_sequence = f.extract(record_dict[keys].seq)
            protein_sequence = cds_sequence.translate()
            if "*" not in protein_sequence:
                return 0
            return 1
    # Only give up after every field has been inspected; returning from inside
    # the loop would mean that only the first field is ever looked at.
    return 0
def find_cds():
    """Classify the CDS annotated in the current record's description.

    The description of ``record_dict[keys]`` is split on ``|``; the first field
    that starts with ``CDS:`` is expected to look like ``CDS:<start>-<end>``
    (1-based, inclusive coordinates). The record id is printed when such a
    field is found.

    Returns:
        3 if the translated CDS contains no stop codon (``*``),
        1 if it contains one,
        0 if no field starts with ``CDS:``.
    """
    seq_des = str(record_dict[keys].description).split("|")
    for des in seq_des:
        if re.match("CDS:", des) is None:
            continue
        # Function-call form prints the same on Python 2 and Python 3.
        print(record_dict[keys].id)
        feature, cds_start, cds_end = re.split(":|-", des)
        # Convert the 1-based inclusive start to a 0-based location.
        f = FeatureLocation(int(cds_start) - 1, int(cds_end))
        cds_sequence = f.extract(record_dict[keys].seq)
        protein_sequence = cds_sequence.translate()
        if "*" not in protein_sequence:
            return 3
        return 1
    # Always return an int: previously a "CDS:" that appeared only in the
    # middle of a field fell through and returned None.
    return 0
Ejemplo n.º 12
0
def get_stop(
    cdss: List[GFF3Record],
    seq: SeqRecord,
    strand: Strand,
) -> Tuple[str, int, int]:
    """Extract the three bases at the terminal end of a list of CDS records.

    For ``Strand.MINUS`` the three bases beginning at the ``start`` of the
    first CDS are taken on the reverse strand; for any other strand the three
    bases ending at the ``end`` of the last CDS are taken on the forward strand.
    NOTE(review): presumably ``cdss`` is sorted by coordinate - confirm with callers.

    Returns the extracted bases as a string, followed by the start and end
    coordinates that were used.
    """
    assert len(cdss) > 0

    on_minus = strand == Strand.MINUS
    if on_minus:
        start = cdss[0].start
        end = start + 3
    else:
        end = cdss[-1].end
        start = end - 3

    codon_location = FeatureLocation(start, end, -1 if on_minus else +1)
    codon = codon_location.extract(seq)
    return str(codon.seq), start, end
Ejemplo n.º 13
0
 def get_aa_translation_from_location(
         self,
         location: FeatureLocation,
         transl_table: Union[str, int] = None) -> Seq:
     """ Translate the part of this record's sequence covered by a location.

         Falls back to the record's own ``_transl_table`` when no table is
         given. Stop markers in the result are replaced by ``X`` and gap
         characters are removed.
     """
     table = self._transl_table if transl_table is None else transl_table

     nucleotides = location.extract(self.seq).ungap('-')
     # only whole codons are translated
     leftover = len(nucleotides) % 3
     if leftover:
         nucleotides = nucleotides[:-leftover]

     # translate up to the first stop; if that yields nothing, go past stop
     # codons and hope for something to work with
     seq = nucleotides.translate(to_stop=True, table=table) or nucleotides.translate(table=table)

     for unwanted, substitute in (("*", "X"), ("-", "")):
         if unwanted in str(seq):
             seq = Seq(str(seq).replace(unwanted, substitute), Bio.Alphabet.generic_protein)
     return seq
Ejemplo n.º 14
0
def getgenefromgbk(gbkfile, location):  # change to work with locations
    """parses a genesequence from a gbk file using the gene location
    parameters
    ----------
    gbkfile
        string, path to gbk file + file
    location
        string of four comma-separated fields
        "scaffold_number,start,end,strand"; start may carry a "<" and end
        a ">" to mark a fuzzy boundary, example: "12,<100,460>,1"
    returns
    ----------
    ret = DNA sequence of housekeepinggene from featurelocation
          coordinates
    abs_loc = validation, contains the location of HG on specific
              scaffold. [scaffold, start, end]
    raises
    ----------
    ValueError if no record in the gbk file matches the scaffold number
    """
    scaff_number, start, end, strand = location.split(",")
    scaff_number = int(scaff_number)

    # Making the FeatureLocation
    f_start = BeforePosition(
        start.strip("<")) if "<" in start else ExactPosition(start)
    f_end = AfterPosition(end.strip(">")) if ">" in end else ExactPosition(end)
    f = FeatureLocation(f_start, f_end, int(strand))

    DNA = None
    for record in SeqIO.parse(gbkfile, "genbank"):
        record_no = record.name.split(".")[0]
        # the scaffold number is taken from the last three characters of the
        # record name (without any ".suffix")
        scaff_check = int(record_no[-3:])
        if scaff_check == scaff_number:
            DNA = record.seq
    if DNA is None:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "scaffold {} not found in {}".format(scaff_number, gbkfile))
    ret = f.extract(DNA)  # The DNA sequence of the housekeepinggene

    # VALIDATION
    start = int(start.replace(">", "").replace("<", ""))
    end = int(end.replace(">", "").replace("<", ""))
    abs_loc = [scaff_number, start, end]
    return (ret, abs_loc)
Ejemplo n.º 15
0
    def __init__(self, logger, sequences, reference, dateFormat):
        """Build the sequence set from parsed JSON input.

        Args:
            logger: stored on the instance as ``self.log``.
            sequences: mapping of name -> dict with a ``"seq"`` string and an
                ``"attributes"`` dict that contains ``"raw_date"``.
            reference: dict with ``"included"``, ``"strain"`` and ``"seq"``,
                and optionally ``"genes"`` mapping gene name -> dict with
                ``"start"``, ``"end"`` and ``"strand"``.
            dateFormat: passed through to ``parse_date``.
        """
        super(sequence_set, self).__init__()
        self.log = logger

        # load sequences from the (parsed) JSON - don't forget to sort out dates
        self.seqs = {}
        for name, data in sequences.items():
            record = SeqRecord(Seq(data["seq"], generic_dna),
                               id=name, name=name, description=name)
            self.seqs[name] = record
            record.attributes = data["attributes"]
            # tidy up dates
            date_struc = parse_date(record.attributes["raw_date"], dateFormat)
            record.attributes["num_date"] = date_struc[1]
            record.attributes["date"] = date_struc[2]

        # if the reference is to be analysed it'll already be in the (filtered & subsampled)
        # sequences, so no need to add it here, and no need to care about attributes etc
        # we do, however, need it for alignment
        self.reference_in_dataset = reference["included"]
        name = reference["strain"]
        self.reference_seq = SeqRecord(Seq(reference["seq"], generic_dna),
                                       id=name, name=name, description=name)
        if reference.get("genes"):
            self.proteins = {}
            reference_str = str(self.reference_seq.seq)
            for k, v in reference["genes"].items():
                start, end, strand = v["start"], v["end"], v["strand"]
                feature = FeatureLocation(start=start, end=end, strand=strand)

                # Translate sequences to identify any proteins ending with a stop codon.
                translation = Seq.translate(Seq(feature.extract(reference_str)))
                if translation.endswith("*"):
                    # Truncate the last codon of the protein to omit the stop codon.
                    # On the minus strand the last codon sits at the low-coordinate
                    # side of the location, so the start is moved instead of the end
                    # (shortening the end there would remove the first codon).
                    if strand == -1:
                        feature = FeatureLocation(start=start + 3, end=end, strand=strand)
                    else:
                        feature = FeatureLocation(start=start, end=end - 3, strand=strand)

                self.proteins[k] = feature
        else:
            self.proteins = None

        # other things:
        self.run_dir = '_'.join([
            'temp',
            time.strftime('%Y%m%d-%H%M%S', time.gmtime()),
            str(random.randint(0, 1000000)),
        ])
        self.nthreads = 2  # should load from config file
Ejemplo n.º 16
0
def cds_extract(seq, location, codon_start=1):
    """Extract a CDS and its translation, both returned as plain strings.

    When the start of ``location`` is not an ``ExactPosition`` the sequence is
    extracted from ``codon_start - 1`` up to the location's end instead of
    from the location's own start. An incomplete trailing codon is either
    completed with ``N`` (when ``aa_or_X_given_codon`` resolves it to a residue
    other than ``X``) or trimmed off. A trailing stop is removed from both the
    CDS and the translation.
    """
    start_is_exact = location.start.__class__.__name__ == 'ExactPosition'
    if start_is_exact:
        source_location = location
    else:
        source_location = FeatureLocation(
                codon_start - 1,
                location.end,
                location.strand,
                location.ref,
                location.ref_db)
    cds_Seq = source_location.extract(seq)

    cds = str(cds_Seq)
    cds_translation = str(cds_Seq.translate())

    # Deal with a non-triplet tail at the 3' end.
    overhang = len(cds) % 3
    if overhang:
        hanging_codon = cds[-overhang:]
        hanging_aa = aa_or_X_given_codon[hanging_codon]  # E.g. 'CG(N)' -> 'R'
        if hanging_aa == 'X':
            # the partial codon cannot be resolved: drop it
            cds = cds[:-overhang]
        else:
            # the partial codon is resolved: pad it to a full triplet
            assert len(hanging_codon) == 2
            cds += 'N'
            cds_translation += hanging_aa

    # Strip a terminal stop codon and its '*'.
    if cds_translation.endswith('*'):
        cds, cds_translation = cds[:-3], cds_translation[:-1]
    return cds, cds_translation
Ejemplo n.º 17
0
        def _concatenate_features(left_id, right_id):
            '''
            Let the N-term part 'parent' and C-term part 'child'.
            Two features are concatenated based on the parent feature.
            Child features will be removed in the downstream process.

            The parent is the left feature on the plus strand and the right
            feature on the minus strand. The parent's location, translation,
            transl_except and note qualifiers are updated in place, and its
            hits are reset.

            Returns parent's feature_id and child's feature_id

            If consistency check fails, returns None. The check fails when
            either feature is not a CDS, the features lie on different
            sequences or strands, the merged translation does not contain
            exactly one internal stop codon, or the parent's terminal codon
            is neither TGA nor TAG.
            '''
            # stop codon -> (residue name, one-letter code, description used in the note)
            readthrough = {
                "TGA": ("Sec", "U", "selenocysteine opal"),  # opal > Selenocysteine, Sec, U
                "TAG": ("Pyl", "O", "pyrrolysine amber"),    # amber > pyrrolysine, Pyl, O
            }

            left_feature = self.genome.features[left_id]
            right_feature = self.genome.features[right_id]
            if left_feature.type != "CDS" or right_feature.type != "CDS" or left_feature.seq_id != right_feature.seq_id:
                return None
            if left_feature.strand == right_feature.strand == 1:
                parent, child = left_feature, right_feature
                stop_codon_location = FeatureLocation(start=parent.location.end - 3, end=parent.location.end, strand=1)
            elif left_feature.strand == right_feature.strand == -1:
                parent, child = right_feature, left_feature
                stop_codon_location = FeatureLocation(start=parent.location.start, end=parent.location.start + 3, strand=-1)
            else:
                return None

            concatenated_location = FeatureLocation(start=left_feature.location.start, end=right_feature.location.end,
                                                    strand=left_feature.strand)

            seq_id = parent.seq_id
            whole_seq = self.genome.seq_records[seq_id]
            # The qualifier key was misspelled ("trasl_table"), so the lookup
            # never matched and table 11 was always used.
            transl_table = parent.qualifiers.get("transl_table", [11])[0]  # if not available, use translation table 11.

            extracted_seq = concatenated_location.extract(whole_seq.seq)
            # The terminal stop is stripped, so any '*' left is an internal stop.
            translated_seq = str(extracted_seq.translate(table=transl_table).rstrip("*"))
            if translated_seq.count("*") != 1:
                return None  # Only one stop codon must be included in aa.
            stop_codon_pos = translated_seq.index("*") + 1

            stop_codon = str(stop_codon_location.extract(whole_seq.seq))
            if stop_codon.upper() not in readthrough:
                return None  # stop codon must be either of TGA/TAG
            residue, letter, description = readthrough[stop_codon.upper()]

            # e.g. /transl_except=(pos:complement(5272379..5272381),aa:Sec)
            #      /transl_except=(pos:213..215,aa:Pyl)
            transl_except = "(pos:{},aa:{})".format(get_location_string(stop_codon_location), residue)
            translated_seq = translated_seq.replace("*", letter)
            note_value = "codon on position {} is {} codon.".format(stop_codon_pos, description)

            # todo: Change this to hit object
            parent.location = concatenated_location
            parent.qualifiers["translation"] = [translated_seq]
            parent.qualifiers["transl_except"] = [transl_except]
            parent.qualifiers.setdefault("note", []).append(note_value)
            parent.primary_hit, parent.secondary_hits = None, []

            return parent.id, child.id
def check_genomewide(refseq, VERBOSE=0):
    '''Check the integrity of all genes in the genomewide consensus

    The checks run in three stages: the single-exon genes listed in
    length_tolerance, then vif, then the two-exon genes tat and rev (each exon
    separately, followed by the joined gene).

    Parameters:
       - refseq: the genomewide consensus; it is sliced and its .seq is read
       - VERBOSE: verbosity level, forwarded to the helper checks

    Returns:
       True if every check passes, False as soon as one of them fails.

    NOTE: this function uses Python 2 syntax (print statements, iteritems,
    xrange).
    '''
    # Check single-exon genes
    # gene name -> maximal length difference to the HXB2 gene (passed as maxdiff)
    length_tolerance = {'gag': 30, 'pol': 30, 'env': 70, 'vpr': 15, 'vpu': 15}
    for genename, tol in length_tolerance.iteritems():
        (start, end,
         start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'
        
        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start: end]
        gene = geneseq.seq
        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes the gene ends a few nucleotides upstream, and there is a
            # frameshift mutation that screws up
            # Translate from the gene start to the end of the reference and
            # look for the first stop codon.
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            # difference between the end implied by that stop codon and the located end
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
            else:
                return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if (not check):
            # a failed start check is tolerated for vpu only
            if genename != 'vpu':
                return False
            else:
                print 'ERROR IN VPU STARTING CODON, CONTINUING!'

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes a gene is a bit longer
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            end_diff = start + (3 * end_new + 3) - end
            # accept a stop codon less than 90 nucleotides away from the
            # located end, in either direction
            if -90 < end_diff < 0:
                print genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            elif 0 < end_diff < 90:
                print genename.upper()+' ENDS '+str(end_new + 1 - (end - start) // 3)+' AMINO ACIDS DOWNSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            else:
                return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    # Vif is special because it can be longer than in HXB2
    genename = 'vif'
    (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print 'ERROR: '+genename+' not found in genomewide!'
        return False
    elif VERBOSE >= 3:
        print 'OK: start and end of '+genename+' found'
    
    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                     VERBOSE=VERBOSE, maxdiff=15)
    if not check:
        return False

    geneseq = refseq[start: end]
    gene = geneseq.seq
    check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    prot = gene.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # VERBOSE=0 here: a failure is followed by the extension attempts below
    check = check_has_end(prot, genename, VERBOSE=0)
    if not check:
        # Vif tends to be a bit longer than in HXB2
        # try extending the gene by 1, 2 and 3 codons
        for nc in xrange(1, 4):
            gene_ext = refseq[start: end + 3 * nc].seq
            prot_ext = gene_ext.translate()
            check = check_has_end(prot_ext, genename, VERBOSE=0)
            if check:
                gene = gene_ext
                prot = prot_ext
                if VERBOSE:
                    print 'WARNING: '+genename+' actually ends '+str(nc)+' codons downstream'
                break
        else:
            # for/else: reached only when no extension passed the end check
            print 'ERROR: '+genename+' does not end, not even slightly downstream'
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check 2-exon genes
    for genename_whole in ('tat', 'rev'):
        # first exon
        genename = genename_whole+'1'
        (start, end, start_found, end_found) = locate_gene(refseq, genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'
        
        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=15)
        if not check:
            return False

        geneseq = refseq[start: end]
        # drop any trailing incomplete codon before translating
        geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon1 = start
        end_exon1 = end

        # second exon: searched for only from 2000 positions past the end of exon 1
        genename = genename_whole+'2'
        (start, end, start_found, end_found) = locate_gene(refseq[end_exon1 + 2000:], genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'

        # shift the coordinates back from the sliced search window to refseq
        start += end_exon1 + 2000
        end += end_exon1 + 2000

        # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions
        if genename == 'rev2':
            tol = 45
        else:
            tol = 15
        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2), genename, VERBOSE=VERBOSE, maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start: end]
        # skip the leading positions reported by get_frame before translating
        frame = get_frame(geneseq, gene_HXB2, genename, VERBOSE=VERBOSE)
        geneseq = geneseq[frame:]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            if genename != 'rev2':
                return False

            else:
                # rev2 can end a bit early
                end_new = prot.rfind('*')
                if end_new != -1:
                    # accept the last stop codon if fewer than 20 amino acids follow it
                    if len(prot) - 1 - end_new < 20:
                        print 'REV2 ENDS '+str(len(prot) - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                        prot = prot[:end_new + 1]
                        end = start + frame + 3 * (end_new + 1)
                    else:
                        return False
                else:
                    # rev2 can also end quite a bit late
                    gene_new = refseq.seq[start:]
                    gene_new = gene_new[(end - start) % 3:]
                    gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
                    prot_new = gene_new.translate()
                    end_new = prot_new.find('*')

                    if (start + 3 * end_new) - end < 200:
                        print 'REV2 ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!'
                        prot = prot_new[:end_new + 1]
                        end = start + ((end - start) % 3) + 3 * (end_new + 1)
                    else:
                        return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon2 = start
        end_exon2 = end

        # whole gene: join the two exons and repeat the checks on the result
        genename = genename_whole
        gene_HXB2 = get_gene_HXB2(genename)

        from Bio.SeqFeature import FeatureLocation
        gene_loc = FeatureLocation(start_exon1, end_exon1, strand=+1) + \
                   FeatureLocation(start_exon2, end_exon2, strand=+1)
        geneseq = gene_loc.extract(refseq)
        gene = geneseq.seq

        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    return True
Ejemplo n.º 19
0
 def extract_sequence(self, upstream=0, downstream=0):
     """Return this feature's sequence, optionally widened on either side.

     ``upstream`` is subtracted from the location's start and ``downstream``
     is added to its end, whatever the strand; the widened location is then
     extracted from the chromosome's nucleic sequence on ``self.strand``.
     """
     window_start = self.location.start - upstream
     window_end = self.location.end + downstream
     window = FeatureLocation(window_start, window_end, self.strand)
     return window.extract(self.chromosome.nucleic_sequence)
Ejemplo n.º 20
0
 # Classify feature `f` of `record` by its type and label, and append one
 # entry per recognised part to SequenceDictionary. Each entry is
 # [name, sequence, kind, is_reverse_strand, start + 1, end].
 q = f.qualifiers
 loc = f.location
 start = loc.start
 end = loc.end
 s = loc.strand
 if str(f.type).lower() == "rbs & cds":
         
         # the label has the form "<rbs name>_<cds name>"
         types = q['label'][0].split("_")
         # normalise the spelling "PhIF" (capital I) to "PhlF" (lowercase L)
         if types[1] == "PhIF":
             types[1] = "PhlF"
         rbs = pdict[types[0]]
         cds = pdict[types[1]]
         
         # Split the feature into two locations using the length of the known
         # RBS sequence: for strand 1 the RBS is the first len(rbs) positions,
         # otherwise it is the last len(rbs) positions.
         r = FeatureLocation(start if s==1 else end-len(rbs), start+len(rbs) if s==1 else end ,strand=s)
         c = FeatureLocation(start+len(rbs) if s==1 else start, end if s==1 else end-len(rbs),strand=s)
         # print whether the extracted parts equal the sequences stored in pdict
         print(pdict[types[0]] == str((r.extract(record).seq)))
         print(pdict[types[1]] == str((c.extract(record).seq)))
         SequenceDictionary  += [[types[0], str((r.extract(record).seq)), "rbs", (r.strand == -1), r.start + 1, r.end]]
         SequenceDictionary  += [[types[1], str((c.extract(record).seq)), "cds", (c.strand == -1), c.start + 1, c.end]]
 elif "scar" in q['label'][0].lower() and (end-start) <= 4 :
         # scar of at most 4 positions: the entry name is looked up in scardict
         # by the extracted sequence; the cleaned `label` is not used in the
         # lines visible here
         scars = True
         label = q['label'][0]
         label = re.sub(r'\W+', '', label)
         SequenceDictionary  += [[scardict[str((f.extract(record).seq))], str((f.extract(record).seq)), str(f.type).lower(), (loc.strand == -1), start+1, end+0]]
 elif str(f.type).lower() in ["misc_feature"] and 'promoter' in re.sub(r'\W+', '', q['label'][0]):
         # misc_feature whose label (non-word characters removed) contains "promoter"
         label = re.sub(r'\W+', '', q['label'][0])
         SequenceDictionary  += [[label, str((f.extract(record).seq)), "promoter", (loc.strand == -1), start+1, end+0]]
 elif str(f.type).lower() not in ["source", "primer", "primer_bind", "rep_origin", "misc_feature", "repressor"]:
         
         # any remaining type outside the ignore list; the rest of this branch
         # is not visible in this excerpt
         label = q['label'][0]
         label = re.sub(r'\W+', '', label)
Ejemplo n.º 21
0
def main():
    """
    This script extracts a gene sequence from a Genbank file and its promoter region.

    The name of the gene sequence to be extracted must be provided using the -name argument.
    If the gene name is stored in a GBK qualifier different than the "gene" qualifier (e.g.
    locus_tag) must be indicated using the -qual option.

    Output: a fasta file is created (geneName_outName.fna) with two sequences. The first sequence
    correspond to the gene (ORF) and the second to the promoter region (ORF +- 1000 bp)
    
    """
    parser = ArgumentParser(description=main.__doc__)
    parser.add_argument("-gbk", dest="genbank", help="annotated genbank file", type=str)
    parser.add_argument("-name", dest="gene_name", help="name of the gene sequence", type=str)
    parser.add_argument("-qual", dest="tag",
                        help="gene/locustag qualifier of the gene", type=str, default="gene")
    parser.add_argument("-out", dest="out", help="name of the output e.g. spp/strain name", type=str)
    args = parser.parse_args()

    # "locustag" is the historical spelling accepted by this script; the
    # GenBank spelling "locus_tag" (the one the description above mentions) is
    # accepted too.  Any other value keeps matching on the "gene" qualifier.
    qualifier = "locus_tag" if args.tag in ("locustag", "locus_tag") else "gene"
    flank = 1000
    out_path = args.gene_name + "_" + args.out + ".fna"

    # Parse genbank file
    found = False
    for rec in SeqIO.parse(args.genbank, "genbank"):
        for feat in rec.features:
            if feat.type != "CDS":
                continue
            bases = feat.location.extract(rec.seq)
            if not gen_ok(bases):
                continue
            names = feat.qualifiers.get(qualifier)
            if not names or names[0] != args.gene_name:
                continue
            found = True

            # Clamp the flanked window to the record: a negative start would
            # be read as an offset from the END of the sequence when sliced,
            # yielding a wrong (usually empty) flanked sequence for genes
            # closer than `flank` bases to the record start.
            start = max(0, int(feat.location.start) - flank)
            end = min(len(rec.seq), int(feat.location.end) + flank)
            promoter_loc = FeatureLocation(start, end, strand=feat.location.strand)

            gene_record = SeqRecord(Seq(str(bases)),
                                    id=args.gene_name+"_"+args.out, description="")
            promoter_record = SeqRecord(Seq(str(promoter_loc.extract(rec.seq))),
                                        id=args.gene_name+"+/-1000bp"+"_"+args.out,
                                        description="")

            # Append mode is kept so every match ends up in the same file;
            # the context manager closes it even if writing fails.
            with open(out_path, "a") as output:
                # write gene seq to file
                SeqIO.write(gene_record, output, "fasta")
                # write gene seq + promoter to file
                SeqIO.write(promoter_record, output, "fasta")

    if not found:
        print("gene not found")
Ejemplo n.º 22
0
# For every EMBL genome in in_files, write each CDS whose "gene" qualifier
# equals the first command-line argument to results/<gene>.fna.  Repeated
# hits get a numeric suffix taken from the `stored` counter.
for f in in_files:
    for record in SeqIO.parse(f, "embl"):
        for feat in record.features:
            if feat.type != 'CDS' or 'gene' not in feat.qualifiers:
                continue
            gene = feat.qualifiers['gene'][0]
            if gene != sys.argv[1]:
                continue

            position = feat.location
            s, e, strand = position.start, position.end, position.strand

            # Header layout: >gene,start..end(strand),[file name]
            header = "".join([
                '>', gene, ",",
                str(s + 1), "..", str(e),
                "(", str(strand), ")",
                ",", "[", f.replace("genomes/", ""), "]",
            ])
            out_seq = FeatureLocation(s, e, strand).extract(record.seq)

            # The file name is the header's first comma-separated field.
            fname = header[1:].split(',')[0] + ".fna"
            if fname in stored:
                seen = stored[fname]
                stored[fname] = seen + 1
                fname = fname.replace(".fna", "_" + str(seen) + ".fna")
            else:
                stored[fname] = 1

            with open(os.path.join('results', fname), 'w') as out:
                out.write(header + '\n')
                out.write(str(out_seq) + '\n')
import os

# Write every 'mobile_element' feature of the EMBL genomes under genomes/ to
# its own FASTA file in results/, including flanking_region bases on each side.
in_files = glob.glob('genomes/*.embl')
flanking_region = 100
try:
    os.mkdir("results")
except OSError:
    # Single-argument parenthesised print behaves identically on Python 2 and 3.
    print("a 'results' dir already exists")
    print("Overwriting")
stored = {}
for f in in_files:
    cur_genome = SeqIO.parse(f, "embl")
    # The record loop is nested inside the file loop so that EVERY genome is
    # processed; left at top level it only saw the parser of the last file
    # (and raised NameError when no file matched).
    for record in cur_genome:
        for feat in record.features:
            if feat.type != 'mobile_element':
                continue
            s, e, strand = feat.location.start, feat.location.end, feat.location.strand
            element = feat.qualifiers['mobile_element_type'][0].split(':')[-1]
            header = '>'+element+","+element+".."+str(s+1)+".."+str(e)+"("+str(strand)+"),""100bp flanked,[EC958 IS]"
            # Clamp the flanked window to the record: a negative start would
            # be interpreted as an offset from the end of the sequence.
            flank_start = max(0, s-flanking_region)
            flank_end = min(len(record.seq), e+flanking_region)
            flanked = FeatureLocation(flank_start, flank_end, strand)
            out_seq = flanked.extract(record.seq)
            # Build a file-system friendly name from the element type.
            fname = header[1:].split(',')[0].replace('unclassified','unc').replace('family', 'fam').replace('(', '').replace('partial', 'p').replace(')', '').replace(' ', '_').replace('/', '-').strip()+'.fna'
            if fname in stored:
                # Repeated element type: suffix the name with the running count.
                old = fname
                fname = fname.replace(".fna", "_"+str(stored[fname])+".fna")
                stored[old] = stored[old]+1
            else:
                stored[fname] = 1
            with open(os.path.join('results', fname), 'w') as out:
                out.write(header+'\n')
                out.write(str(out_seq)+'\n')
Ejemplo n.º 24
0
def check_genomewide(refseq, VERBOSE=0):
    '''Check the integrity of all genes in the genomewide consensus

    The genes are checked in three groups: the single-exon genes listed in
    length_tolerance, then vif, then the two-exon genes tat and rev.  Each
    gene is located with locate_gene, compared in length against the HXB2
    gene returned by get_gene_HXB2, translated, and run through the check_*
    helpers (complete codons, start amino acid, terminal stop, premature
    stops).

    Parameters:
       - refseq: sequence object that supports slicing and has a .seq
         attribute (presumably a Biopython SeqRecord -- TODO confirm)
       - VERBOSE: verbosity level, forwarded to the helpers; values >= 3 also
         print an OK line for each gene found

    Returns:
       True if every gene passed, False as soon as one check fails.

    Note: this is Python 2 code (print statements, dict.iteritems, xrange).
    '''
    # Check single-exon genes
    # Maximal length difference to HXB2 tolerated for each gene (passed as
    # maxdiff to check_has_similar_length)
    length_tolerance = {'gag': 30, 'pol': 30, 'env': 70, 'vpr': 15, 'vpu': 15}
    for genename, tol in length_tolerance.iteritems():
        (start, end, start_found, end_found) = locate_gene(refseq,
                                                           genename,
                                                           VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: ' + genename + ' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of ' + genename + ' found'

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start,
                                         len(gene_HXB2),
                                         genename,
                                         VERBOSE=VERBOSE,
                                         maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start:end]
        gene = geneseq.seq
        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes the gene ends a few nucleotides upstream, and there is a
            # frameshift mutation that screws up
            # Retranslate from start to the end of refseq (trimmed to whole
            # codons) and take the first stop codon as the new gene end
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            # NOTE(review): find() returns -1 if there is no stop codon at
            # all; that case is not handled separately here -- confirm
            end_new = prot_new.find('*')
            # Distance in nucleotides between the new end and the located end
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print genename.upper() + ' ENDS ' + str(
                    (end - start) // 3 - end_new -
                    1) + ' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
            else:
                return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if (not check):
            # A failed start check is tolerated for vpu only
            if genename != 'vpu':
                return False
            else:
                print 'ERROR IN VPU STARTING CODON, CONTINUING!'

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes a gene is a bit longer
            # Same retranslation as above, but the first stop codon may now
            # lie on either side of the located end (less than 90 nucleotides)
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print genename.upper() + ' ENDS ' + str(
                    (end - start) // 3 - end_new -
                    1) + ' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            elif 0 < end_diff < 90:
                print genename.upper() + ' ENDS ' + str(
                    end_new + 1 -
                    (end - start) // 3) + ' AMINO ACIDS DOWNSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            else:
                return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    # Vif is special because it can be longer than in HXB2
    genename = 'vif'
    (start, end, start_found, end_found) = locate_gene(refseq,
                                                       genename,
                                                       VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print 'ERROR: ' + genename + ' not found in genomewide!'
        return False
    elif VERBOSE >= 3:
        print 'OK: start and end of ' + genename + ' found'

    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start,
                                     len(gene_HXB2),
                                     genename,
                                     VERBOSE=VERBOSE,
                                     maxdiff=15)
    if not check:
        return False

    geneseq = refseq[start:end]
    gene = geneseq.seq
    check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    prot = gene.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # The end check is called with VERBOSE=0 here because a failure is
    # retried below with an extended gene
    check = check_has_end(prot, genename, VERBOSE=0)
    if not check:
        # Vif tends to be a bit longer than in HXB2
        # Try extending the gene by 1, 2 and 3 codons; the for/else branch
        # runs only if none of the extensions passes the end check
        for nc in xrange(1, 4):
            gene_ext = refseq[start:end + 3 * nc].seq
            prot_ext = gene_ext.translate()
            check = check_has_end(prot_ext, genename, VERBOSE=0)
            if check:
                gene = gene_ext
                prot = prot_ext
                if VERBOSE:
                    print 'WARNING: ' + genename + ' actually ends ' + str(
                        nc) + ' codons downstream'
                break
        else:
            print 'ERROR: ' + genename + ' does not end, not even slightly downstream'
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check 2-exon genes
    for genename_whole in ('tat', 'rev'):
        # First exon: located on the whole refseq
        genename = genename_whole + '1'
        (start, end, start_found, end_found) = locate_gene(refseq,
                                                           genename,
                                                           VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: ' + genename + ' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of ' + genename + ' found'

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start,
                                         len(gene_HXB2),
                                         genename,
                                         VERBOSE=VERBOSE,
                                         maxdiff=15)
        if not check:
            return False

        geneseq = refseq[start:end]
        # Trim to whole codons before translating; only the start amino acid
        # is checked for the first exon
        geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon1 = start
        end_exon1 = end

        # Second exon: searched only in the part of refseq that starts 2000
        # positions after the end of the first exon
        genename = genename_whole + '2'
        (start, end, start_found,
         end_found) = locate_gene(refseq[end_exon1 + 2000:],
                                  genename,
                                  VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: ' + genename + ' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of ' + genename + ' found'

        # Convert the coordinates found in the sliced sequence back to
        # refseq coordinates
        start += end_exon1 + 2000
        end += end_exon1 + 2000

        # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions
        if genename == 'rev2':
            tol = 45
        else:
            tol = 15
        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start,
                                         len(gene_HXB2),
                                         genename,
                                         VERBOSE=VERBOSE,
                                         maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start:end]
        # Drop the leading nucleotides indicated by get_frame before
        # translating the second exon
        frame = get_frame(geneseq, gene_HXB2, genename, VERBOSE=VERBOSE)
        geneseq = geneseq[frame:]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            if genename != 'rev2':
                return False

            else:
                # rev2 can end a bit early
                # Accept a last stop codon that lies less than 20 amino acids
                # before the end of the translation
                end_new = prot.rfind('*')
                if end_new != -1:
                    if len(prot) - 1 - end_new < 20:
                        print 'REV2 ENDS ' + str(len(prot) - end_new -
                                                 1) + ' AMINO ACIDS UPSTREAM!'
                        prot = prot[:end_new + 1]
                        end = start + frame + 3 * (end_new + 1)
                    else:
                        return False
                else:
                    # rev2 can also end quite a bit late
                    # NOTE(review): the reading frame used here is
                    # (end - start) % 3 rather than `frame` from get_frame
                    # above -- confirm that this is intended
                    gene_new = refseq.seq[start:]
                    gene_new = gene_new[(end - start) % 3:]
                    gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
                    prot_new = gene_new.translate()
                    # NOTE(review): find() returns -1 if no stop codon
                    # exists; the test below would still pass in that case
                    # and prot would become empty -- confirm
                    end_new = prot_new.find('*')

                    if (start + 3 * end_new) - end < 200:
                        print 'REV2 ENDS ' + str(end_new - len(prot) + 1
                                                 ) + ' AMINO ACIDS DOWNSTREAM!'
                        prot = prot_new[:end_new + 1]
                        end = start + ((end - start) % 3) + 3 * (end_new + 1)
                    else:
                        return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon2 = start
        end_exon2 = end

        # Whole gene: join both exons and run the full set of checks on the
        # spliced sequence
        genename = genename_whole
        gene_HXB2 = get_gene_HXB2(genename)

        from Bio.SeqFeature import FeatureLocation
        gene_loc = FeatureLocation(start_exon1, end_exon1, strand=+1) + \
                   FeatureLocation(start_exon2, end_exon2, strand=+1)
        geneseq = gene_loc.extract(refseq)
        gene = geneseq.seq

        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    return True
Ejemplo n.º 25
0
def crispy_scan(
    haystack: List[SeqRecord],
    needle: SeqRecord,
    pam: str = "GG",
    unique_size: int = 13,
    full_size: int = 23,
    threads: int = -1,
) -> List[Dict[str, Union[str, int, List[int]]]]:
    """Find PAM-adjacent windows in ``needle`` and report their hit counts.

    Both strands of ``needle`` are searched for ``pam``.  For every match the
    ``unique_size`` characters in front of it (shifted back by one position)
    are counted against a comparison text built from ``haystack``, and a
    ``full_size`` wide window ending at the PAM is reported.

    Args:
        haystack: records handed to ``build_comparison_text``.
        needle: record whose sequence is scanned.
        pam: the text searched for.
        unique_size: width of the window used for counting repeats.
        full_size: width of the window reported for each hit.
        threads: forwarded unchanged to ``Searcher.find_repeat_counts``.

    Returns:
        One dict per reported window with the keys ``start``, ``end``,
        ``strand``, ``sequence``, ``pam``, ``all_hits`` and one ``<n>bpmm``
        key per element of the hit-count result, ordered by ``all_hits`` and
        then by ``start``.

    Raises:
        ValueError: if ``unique_size`` is below 1 or ``full_size`` is below
            ``unique_size``.
    """
    if unique_size < 1:
        raise ValueError("unique size cannot be below 1")
    if full_size < unique_size:
        raise ValueError("full size cannot be below unique size")

    def build_json_base(location, seq_section,
                        result) -> Dict[str, Union[str, int, List[int]]]:
        """Build the JSON-ready dict for one window and its hit counts."""
        # NOTE(review): the last 3 characters are reported as the PAM, which
        # matches the default two-letter pam plus one leading base; confirm
        # before using a pam of a different length.
        base = {
            'start': location.start,
            'end': location.end,
            'strand': location.strand,
            'sequence': str(seq_section[:-3]),
            'pam': str(seq_section[-3:]),
            'all_hits': result,  # new to JSON, for handy sorting
            '0bpmm': result[0] - 1,  # remove self-hit
        }
        # add remaining mismatch info
        for mismatches, count in enumerate(result[1:], start=1):
            base['{}bpmm'.format(mismatches)] = count
        return base

    # set the size of the window to the unique size
    # and shift one back since in the previous system it skipped a leading N
    before_window = (-unique_size - 1, -1)

    comparison_text = build_comparison_text(haystack, unique_size)
    needle_length = len(needle.seq)
    pam_length = len(pam)

    final_result = []
    for strand in (1, -1):
        # The reverse strand is searched on the reverse complement, so its
        # match positions are mirrored back to forward coordinates below.
        if strand == -1:
            searcher = Searcher(str(needle.seq.reverse_complement()))
        else:
            searcher = Searcher(str(needle.seq))
        results = searcher.find_repeat_counts(target=pam,
                                              before_window=before_window,
                                              other_text=comparison_text,
                                              threads=threads)

        for pam_start, result in sorted(results.items(), key=lambda x: x[1]):
            # set the window location, accounting for strand
            if strand == -1:
                start = needle_length - pam_start - pam_length
                end = start + full_size
            else:
                start = pam_start - full_size + pam_length
                end = pam_start + pam_length
            # skip anything for which the full window shown would be truncated
            # NOTE(review): `>=` also drops a window ending exactly at the
            # end of the needle; kept as in the original -- confirm intent.
            if start < 0 or end >= needle_length:
                continue

            location = FeatureLocation(start, end, strand)
            seq = location.extract(needle.seq)
            final_result.append(build_json_base(location, seq, result))

    # order by lowest hits, then by start position
    final_result.sort(key=lambda x: (x["all_hits"], x["start"]))

    return final_result
Ejemplo n.º 26
0
 def extract_sequence(self, start, end):
     """Return the part of ``self.nucleic_sequence`` between start and end.

     The span is wrapped in a ``FeatureLocation`` built without a strand
     argument and extracted from ``self.nucleic_sequence``.
     """
     span = FeatureLocation(start, end)
     sequence = span.extract(self.nucleic_sequence)
     return sequence