コード例 #1
0
def getPrimerSeqs(dataInfo):
    """Return, for every primer, the restriction fragment end that contains it.

    Two candidate fragments are built per primer from the sequence returned
    by ``prep.getFastaSequence``:

    * left: the sequence requested for ``prm_start - 300 .. prm_end``, cut at
      the last occurrence of the restriction site and reverse-complemented;
    * right: the sequence requested for ``prm_start .. prm_end + 300``, cut
      just after the first occurrence of the restriction site.

    The primer must occur in exactly one of the two fragments.

    :param dataInfo: mapping providing ``prm_seq``, ``prm_start`` and
        ``prm_end`` (one entry per primer) plus ``genome_build``, ``vp_chr``
        and ``re_seq`` (only the first entry of each is used).

    :returns: list with one fragment string per primer, in primer order.

    :raises AssertionError: if a primer is found in neither fragment
        ("wrong") or in both fragments ("ambiguous").
    """
    primerSeqs = []
    for i, primer in enumerate(dataInfo['prm_seq']):
        reSeq = dataInfo['re_seq'][0]

        leftSeq = prep.getFastaSequence(dataInfo['genome_build'][0],
                                        dataInfo['vp_chr'][0],
                                        dataInfo['prm_start'][i] - 300,
                                        dataInfo['prm_end'][i]).upper()
        leftIndex = leftSeq.rfind(reSeq)
        # str() works on every Biopython release; Seq.tostring() was removed.
        leftPrimerSeq = str(Seq(leftSeq[leftIndex:]).reverse_complement())

        rightSeq = prep.getFastaSequence(dataInfo['genome_build'][0],
                                         dataInfo['vp_chr'][0],
                                         dataInfo['prm_start'][i],
                                         dataInfo['prm_end'][i] + 300).upper()
        rightIndex = rightSeq.find(reSeq) + len(reSeq)
        rightPrimerSeq = rightSeq[:rightIndex]

        # Search each fragment once and reuse the result below.
        leftHit = leftPrimerSeq.find(primer)
        rightHit = rightPrimerSeq.find(primer)

        assert max(leftHit, rightHit) >= 0, \
            'Primer sequence is wrong\n' + str(primer)
        # find() returns 0 for a match at the very start, so "found in only
        # one fragment" means the smaller index is strictly negative.
        assert min(leftHit, rightHit) < 0, \
            'Primer sequence is ambigious\n' + str(primer)

        if rightHit >= 0:
            primerSeqs.append(rightPrimerSeq)
        else:
            primerSeqs.append(leftPrimerSeq)

    return primerSeqs
コード例 #2
0
def find_coding_sequences(dna_sequence, amino_sequence):
    """Print every DNA encoding of a peptide that occurs in a DNA sequence.

    All codon combinations that encode ``amino_sequence`` are enumerated,
    converted from RNA to DNA letters (U -> T) and searched for in
    ``dna_sequence`` and in its reverse complement. Each hit is reported
    with ``print``; nothing is returned.

    :param dna_sequence: DNA string to search in.
    :param amino_sequence: iterable of amino-acid codes; the upper-cased code
        must be a key of ``ReverseGeneticCode``.
    :raises KeyError: if an amino-acid code is not in ``ReverseGeneticCode``.
    """
    # Codon alternatives for each amino acid, in peptide order.
    codon_list = [ReverseGeneticCode[aa.upper()] for aa in amino_sequence]
    if not codon_list:
        # An empty peptide has no encoding to look for.
        return

    # Extend the partial encodings one amino acid at a time.
    rna_list = codon_list[0]
    for codons in codon_list[1:]:
        rna_list = add_next_level(rna_list, codons)

    # Change the RNA encodings into DNA encodings.
    dna_sequences = [rna.replace('U', 'T') for rna in rna_list]

    # Search the given strand.
    myseq = Seq(dna_sequence, IUPAC.unambiguous_dna)
    for dna in dna_sequences:
        if myseq.find(dna) != -1:
            print(f'Found {dna}')

    # Search the reverse-complement strand.
    myseq = myseq.reverse_complement()
    for dna in dna_sequences:
        if myseq.find(dna) != -1:
            print(f'Found {dna}')
コード例 #3
0
def def_extract(rootdir, file_name_list, fwd_primer, rev_primer, limit):
    """
    Extracts the DNA sequence inserts based on 3' and 5' primer recognition sequences
    """

    print "def_extract"
    
    #Extraction primer dictionary (NEEDS TO BE DELETED BEFORE PUBLICATION)
    stat_extract_dict = {}
    rc_primer  = str(Seq(rev_primer).reverse_complement())
    
    #Opening input files and creating output files
    if not os.path.exists(rootdir + "/DNA"): os.makedirs(rootdir + "/DNA")
    for filenames in file_name_list:
        print filenames
        sequences = open(rootdir + "/DEMULTIPLEXED/" + filenames, "rU")
        output = open(rootdir + "/DNA/" + filenames.strip(".csv") + "-DNA.csv","w")
        stat_extract_dict[filenames] = 0
    
    #Searching primers
        for i,line in enumerate(sequences): #enumerate: adds a counter to each element (i is the line number in this case)
            start = 0
            end = 0
            if line.find(rc_primer) > 0:
                line = Seq(line).reverse_complement()
            start = line.find(fwd_primer)
            end = line.find(rev_primer)
            extract = str(line[start + len(fwd_primer):end])
    
    #Extracting inserts
            if start >= 0 and end >= 0:
            #if len(extract) >= limit_low:
                output.write (extract)
                output.write("\n")
                stat_extract_dict[filenames] += 1
    
            #if i == limit:
                #break
        output.close()
        sequences.close()
    
    #Extraction Statisitcs
    if not os.path.exists(rootdir + "/STATISTICS"): os.makedirs(rootdir + "/STATISTICS")
    stat = open(rootdir + "/STATISTICS/" + "Statistics-extract.txt","w")
    stat.write("\t DNA extracted \n")
    
    for k, v in stat_extract_dict.iteritems():
        stat.write(str(k))
        stat.write("\t")
        stat.write(str(v))
        stat.write("\n")
    stat.close()
コード例 #4
0
def test_seq():
    """Demonstrate basic ``Seq`` operations by printing their results.

    Covers: length and alphabet, G/C counting and GC percentage,
    reverse/complement/reverse-complement, a palindrome check, locating every
    (overlapping) occurrence of ``AAA``, and the fact that a ``Seq`` must be
    converted with ``tomutable()`` before a base can be replaced.
    """
    print(f'\n\n{Bio.__version__}')

    myseq = Seq("GATCGAAATGGGCCTAAAAATATAGGATCGAAAATCGC",
                IUPAC.unambiguous_dna)

    print(myseq.alphabet)
    print(myseq)
    print(len(myseq))

    # You can calculate the GC ratio yourself or use a special function
    g_count = myseq.count("G")
    c_count = myseq.count("C")
    print(f'The number of G\'s is: {g_count}')
    print(f'The number of C\'s is: {c_count}')
    print(100 * (g_count + c_count) / len(myseq))
    print(f'The ratio GC in the string is: {GC(myseq)}')

    # Sequences can be inverted and complemented
    print(f'The string is:\t\t\t\t {myseq}')
    print(f'The reverse is:\t\t\t\t {myseq[::-1]}')
    print(f'The complement is:\t\t\t {myseq.complement()}')
    print(f'The reverse complement is:\t {myseq.reverse_complement()}')

    # A simple function to determine if a string is palindromic
    print(f'GAAG is palindrome: {ispalindromic("GAAG")}')
    print(f'GTAG is palindrome: {ispalindromic("GTAG")}')

    # Find all occurrences of AAA: restart each search one position after
    # the previous hit so overlapping matches are reported too.
    positions = []
    pos = myseq.find('AAA')
    while pos != -1:
        positions.append(pos)
        pos = myseq.find('AAA', pos + 1)
    print(positions)

    # A demo to show that Seq is not mutable: item assignment is rejected
    # with a TypeError.
    try:
        myseq[0] = 'C'
    except TypeError:
        print("Oops! As Seq is immutable ")

    # If you need to change a Seq, make it mutable first.
    mutable_seq = myseq.tomutable()
    mutable_seq[0] = 'C'
    print(mutable_seq)
0
ファイル: blastoys.py プロジェクト: dueberlab/labtools
	def transextend(self, filepath, temppath, revcom):
		"""Collect reads from a FASTA file that contain the current search seed.

		The seed is ``self.transcript[self.kbegin:self.kend]``. ``grep -B1`` is
		run on ``filepath`` (with the reverse-complemented seed when ``revcom``
		equals ``True``) and its output is stored in ``<temppath>/temp``. Every
		header/sequence pair in that output whose id is not yet in ``self.ids``
		is appended to ``self.reads``, ``self.ids``, ``self.aligns`` and
		``self.ends``.

		The FASTA file must keep each sequence on a single line (no 60-column
		wrapping); otherwise grep cannot be used for the search.

		Returns the number of newly added reads.
		"""
		seed = self.transcript[self.kbegin:self.kend]
		seed_offset = self.transcript.find(seed)
		use_revcom = (revcom == True)
		scratch_path = temppath + '/temp'

		# Step 1: let grep dump "header + matching line" pairs to the scratch file.
		with open(scratch_path, 'w') as sink:
			pattern = str(seed.reverse_complement()) if use_revcom else str(seed)
			subprocess.Popen(['grep', '-B1', pattern, filepath], stdout=sink).wait()
			sink.flush()

		# Step 2: walk the dump with a two-line window (previous, current).
		found = 0
		with open(scratch_path, 'r') as hits:
			previous = ''
			current = ''
			for raw in hits:
				previous, current = current, raw.rstrip('\n')
				if ">" not in previous:
					continue
				read_id = previous[1:]
				read = Seq(current, generic_dna)
				if use_revcom:
					read = read.reverse_complement()
				if read_id in self.ids:
					continue
				# Offset of the read relative to the transcript, based on
				# where the seed sits in each of them.
				shift = seed_offset - read.find(seed)
				self.reads.append(read)
				self.ids.append(read_id)
				self.aligns.append(shift)
				self.ends.append(shift + len(read))
				found += 1
		return found
コード例 #6
0
ファイル: NoStop.py プロジェクト: JCGiron/SIBL
def NoStop(input_file_path, out_name):
    """Mask in-frame stop codons in every record of a FASTA file.

    Each record is read in steps of three bases starting at position 0 (the
    sequences are assumed to be in frame). A codon equal to one of the
    hard-coded stop codons (change the list for other taxa/phyla) is
    replaced by ``???``.

    Two files are appended to in the current working directory, where
    ``<base>`` is the input file name without directory and extension:

    * ``<base>_<out_name>.fasta`` - the masked record in FASTA format;
    * ``<base>_log.csv`` - one line per record with the base name, the
      record name, the number of masked codons and the position of the first
      one (-1 if none).

    :param input_file_path: path of the FASTA file to process.
    :param out_name: suffix appended to the input base name for the output.
    """
    codon_stop_array = ["TGA", "TAG", "TAA", "UGA", "UAA", "UAG"]
    base_name, _ext = os.path.splitext(os.path.basename(input_file_path))

    # NB/caution: assumes sequences are in frame!
    for record in SeqIO.parse(input_file_path, "fasta", generic_alphabet):
        temp_seq = Seq("", generic_alphabet)
        for index in range(0, len(record.seq), 3):
            codon = record.seq[index:index + 3]
            if codon in codon_stop_array:
                codon = UnknownSeq(3, character='?')
            temp_seq += codon

        # Append the masked record; the with-block closes the file even when
        # the write fails.
        with open("%s_%s.fasta" % (base_name, out_name), "a+") as fasta_out:
            fasta_out.write(">%s\n%s\n" % (record.name, temp_seq))

        # Change log: base name, taxon name, number of masked stop codons and
        # position of the first one.
        with open("%s_log.csv" % base_name, "a+") as log_out:
            log_out.write("%s , %s , %s , %s\n" % (
                base_name, record.name,
                temp_seq.count("???"), temp_seq.find("???")))
    return
コード例 #7
0
def find_frameshift(desired_protein: Seq, orfs: list, g_sequence: Seq) -> Seq:
    """Return ``g_sequence`` starting 60 positions before the desired ORF.

    The first entry of ``orfs`` equal to ``desired_protein`` is located in
    ``g_sequence`` with ``find`` and the position found is printed.

    :param desired_protein: sequence to look for among ``orfs``.
    :param orfs: candidate sequences.
    :param g_sequence: sequence that is searched and sliced.
    :returns: ``g_sequence`` from 60 positions upstream of the hit to the
        end. The whole sequence is returned when the hit lies within the
        first 60 positions, when no ORF matches, or when the matching ORF
        does not occur in ``g_sequence``.
    """
    start_index = 0
    for orf in orfs:
        if orf == desired_protein:
            start_index = g_sequence.find(orf)
            print(start_index)
            break
    # Clamp at 0: a negative slice start would count from the END of the
    # sequence and silently return only its tail.
    return g_sequence[max(start_index - 60, 0):]
コード例 #8
0
def getPrimerSeqs(dataInfo):
    """Check the primer sequences and return the fragment end containing each.

    Two candidate fragments are built per primer from the sequence returned
    by ``prep.getFastaSequence``:

    * left: the sequence requested for ``prm_start - 300 .. prm_end``, cut at
      the last occurrence of the restriction site and reverse-complemented;
    * right: the sequence requested for ``prm_start .. prm_end + 300``, cut
      just after the first occurrence of the restriction site.

    A primer is accepted when it occurs in exactly one of the two fragments.

    :param dataInfo: The settings dictionary created by loading the ini. The
        keys ``prm_seq``, ``prm_start``, ``prm_end`` (one entry per primer)
        and ``genome_build``, ``vp_chr``, ``re_seq`` (first entry only) are
        read.

    :returns: A list with, per primer, the fragment string in which the
        primer was found.

    :raises AssertionError: If a primer is found in neither fragment
        ("wrong") or in both fragments ("ambiguous").

    :TODO: Improve error message on faulty primer sequences
    """
    primerSeqs = []
    for i, primer in enumerate(dataInfo['prm_seq']):
        reSeq = dataInfo['re_seq'][0]

        leftSeq = prep.getFastaSequence(dataInfo['genome_build'][0],
                                        dataInfo['vp_chr'][0],
                                        dataInfo['prm_start'][i] - 300,
                                        dataInfo['prm_end'][i]).upper()
        leftIndex = leftSeq.rfind(reSeq)
        # str() works on every Biopython release; Seq.tostring() was removed.
        leftPrimerSeq = str(Seq(leftSeq[leftIndex:]).reverse_complement())

        rightSeq = prep.getFastaSequence(dataInfo['genome_build'][0],
                                         dataInfo['vp_chr'][0],
                                         dataInfo['prm_start'][i],
                                         dataInfo['prm_end'][i] + 300).upper()
        rightIndex = rightSeq.find(reSeq) + len(reSeq)
        rightPrimerSeq = rightSeq[:rightIndex]

        # Search each fragment once and reuse the result below.
        leftHit = leftPrimerSeq.find(primer)
        rightHit = rightPrimerSeq.find(primer)

        assert max(leftHit, rightHit) >= 0, \
            'Primer sequence is wrong\n' + str(primer)
        # find() returns 0 for a match at the very start, so "found in only
        # one fragment" means the smaller index is strictly negative.
        assert min(leftHit, rightHit) < 0, \
            'Primer sequence is ambigious\n' + str(primer)

        if rightHit >= 0:
            primerSeqs.append(rightPrimerSeq)
        else:
            primerSeqs.append(leftPrimerSeq)

    return primerSeqs
コード例 #9
0
    def find_sequence(self, sequence_to_find):
        """Locate every exact occurrence of a sequence on both strands.

        ``self.sequence`` is searched first, then its reverse complement.
        Overlapping occurrences are reported.

        Returns a list of ``(position, sequence_to_find, strand)`` tuples
        with strand ``'+'`` for hits in ``self.sequence`` and ``'-'`` for
        hits in its reverse complement. A ``'-'`` position is an index into
        the reverse-complemented sequence.
        """
        forward = Seq(self.sequence, generic_dna)
        reverse = forward.reverse_complement()

        hits = []
        for strand_label, strand_seq in (('+', forward), ('-', reverse)):
            offset = strand_seq.find(sequence_to_find)
            while offset >= 0:
                hits.append((offset, sequence_to_find, strand_label))
                # resume one position further so overlapping hits are kept
                offset = strand_seq.find(sequence_to_find, offset + 1)

        return hits
コード例 #10
0
def main(*args, **kwargs):
    fpath = os.path.join(os.getcwd(),args[-1])
    s,t = StrongHold.two_dna(fpath)
    s,t = Seq(s),Seq(t)
    idx = 0
    start=0

    while idx!=-1:
        idx = s.find(t, start=start)
        start = idx+1
        if idx!=-1:
            print idx+1,
コード例 #11
0
    def manage_protein(data):
        """Build a ``Processed_protein`` from ``data`` and extract its data.

        ``data`` must provide ``sequence``, ``creation_date`` (an object with
        ``strftime``) and ``translation_table``. ``protein`` is the full
        sequence as a string; ``protein_to_stop`` is the part before the
        first ``*``, or the full sequence when there is no ``*``. Both are
        plain strings.

        Returns the result of ``Sequencer.extract_sequence_data``.
        """
        sequence = Seq(data.sequence, IUPAC.protein)
        protein = str(sequence)

        # Cut at the first "*"; always hand over a str (the former code
        # passed a Seq slice in this branch and a str in the other).
        stop_index = protein.find("*")
        protein_to_stop = protein if stop_index == -1 else protein[:stop_index]

        treated_data = Processed_protein(
            creation_date=data.creation_date.strftime(
                "%d/%m/%Y, %H:%M:%S UTC"),
            translation_table=data.translation_table,
            protein=protein,
            protein_to_stop=protein_to_stop)

        return Sequencer.extract_sequence_data(treated_data)
コード例 #12
0
ファイル: code.py プロジェクト: zouyuesong/ROSALIND_solutions
def translate(DNA):
    """Return the distinct candidate proteins of ``DNA``, one per line.

    Both ``DNA`` and its reverse complement are scanned for ``ATG``. From
    each ``ATG`` the remainder of the strand is translated; if the
    translation contains a stop (``*``), the part before the first stop is a
    candidate. Duplicates are removed through a set, so the order of the
    lines is the set's iteration order.
    """
    candidates = []

    def collect(strand):
        # i is the index just past the three bases being tested
        for i in range(3, len(DNA)):
            if strand[i - 3:i] != 'ATG':
                continue
            protein = Seq(strand[i - 3:], IUPAC.unambiguous_dna).translate()
            stop = protein.find('*')
            if stop != -1:
                candidates.append(str(protein)[:stop])

    collect(DNA)
    collect(reverse_complement(DNA))

    return '\n'.join(set(candidates))
コード例 #13
0
ファイル: bio.py プロジェクト: ShalekLab/tcrgo
def find_all(seq: Seq, residue: str,
             start_index: int, end_slice: int) -> Tuple[Optional[int], ...]:
    """Return the positions of every occurrence of ``residue`` in a window.

    The search covers ``seq[start_index:end_slice]`` and resumes one position
    after each hit, so overlapping occurrences are included. The positions
    are returned as a tuple in ascending order; the tuple is empty when
    nothing is found.
    """

    def hits():
        cursor = start_index
        while cursor < end_slice:
            hit = seq.find(residue, cursor, end_slice)
            if hit == -1:
                return
            yield hit
            cursor = hit + 1

    return tuple(hits())
コード例 #14
0
def extract_amino_acid_position(df, record_dict, pep_idx):
    """
    Add the column ``AbsPos<pep_idx>`` to ``df``.

    For every row, the peptide in ``seq<pep_idx>`` is located with
    ``Seq.find()`` inside the sequence of ``record_dict[prot<pep_idx>]``; the
    position found is added to the numeric value of ``top<pep_idx>``.
    The data frame is modified in place and nothing is returned.
    """
    proteins = list(df['prot%s' % pep_idx])
    peptides = list(df['seq%s' % pep_idx])
    topologies = list(df['top%s' % pep_idx].apply(pd.to_numeric))

    df['AbsPos%s' % pep_idx] = [
        Seq(str(record_dict[prot].seq)).find(peptide) + topology
        for prot, peptide, topology in zip(proteins, peptides, topologies)
    ]
コード例 #15
0
ファイル: dseq.py プロジェクト: uswa1/pydna
    def find(self, sub, start=0, end=_sys.maxsize):
        """Return the index of the first occurrence of ``sub``, or -1.

        Works like the Python string method of the same name: the search is
        limited to the (sub)sequence given by [start:end]. For a sequence
        that is not linear, the search runs over the sequence string joined
        to itself, so a match spanning the origin is found as well.

        Parameters
        ----------

        sub : string or Seq object
            a string or another Seq object to look for.

        start : int, optional
            slice start.

        end : int, optional
            slice end.

        Examples
        --------
        >>> from pydna.dseq import Dseq
        >>> seq = Dseq("atcgactgacgtgtt")
        >>> seq
        Dseq(-15)
        atcgactgacgtgtt
        tagctgactgcacaa
        >>> seq.find("gac")
        3
        >>> seq = Dseq(watson="agt",crick="actta",ovhg=-2)
        >>> seq
        Dseq(-7)
        agt
          attca
        >>> seq.find("taa")
        2
        """

        if not self.linear:
            needle = self._get_seq_str_and_check_alphabet(sub)
            doubled = _pretty_str(self) + _pretty_str(self)
            return doubled.find(needle, start, end)

        return _Seq.find(self, sub, start, end)
コード例 #16
0
ファイル: 15_ORF2.py プロジェクト: selinj/rosalind
def trans_RF(start,strand):
    """Translate ``strand[start:]`` and record it if it is a complete ORF.

    The slice is padded with ``N`` to a multiple of three before
    translation. If the translation starts with ``M`` and contains a stop
    (``*``), the part before the first stop is appended to the module-level
    list ``reading_frames``. Nothing is returned.
    """
    rf = Seq(strand[start:])

    # remainder 2 needs one N, remainder 1 needs two; remainder 0 needs none
    padding = {2: 'N', 1: 'NN'}.get(len(rf) % 3)
    if padding is not None:
        rf = Seq(str(rf) + padding)

    protein = str(rf.translate())
    stop = protein.find('*')

    if stop != -1 and protein[0:1] == 'M':
        reading_frames.append(protein[:stop])
コード例 #17
0
def checkhex(seq, dorev):
    """Return the first hexamer from a fixed list that occurs in ``seq``.

    When ``dorev`` is true, the reverse complement of ``seq`` is searched
    instead. The hexamers are tried in list order, so the result is the
    first listed hexamer that is present, not the left-most one in the
    sequence. Returns the string ``"NONE"`` when no hexamer is found.
    """
    target = Seq(seq)
    if dorev:
        target = target.reverse_complement()

    hexes = (
        "AATAAA", "ATTAAA", "AGTAAA", "TATAAA", "CATAAA", "GATAAA", "AATATA",
        "AATACA", "AATAGA", "AATGAA", "ACTAAA", "AACAAA", "TTTAAA",
    )

    return next((hexamer for hexamer in hexes if target.find(hexamer) > -1),
                "NONE")
コード例 #18
0
from Bio.Seq import Seq

# Extract the region from the first "ATG" up to and including the first
# "TAG" that follows it. The search is case-sensitive, so the lower-case
# prefix of the sequence cannot match.
tatabox_seq = Seq("tataaaggcAATATGCAGTAG")
# Index of the first "ATG" (-1 if absent).
start_idx = tatabox_seq.find("ATG")
# Index of the first "TAG" at or after start_idx; the search is not
# restricted to the reading frame of the ATG.
end_idx = tatabox_seq.find("TAG", start_idx)
# "+ 3" includes the three bases of the stop codon in the slice.
orf = tatabox_seq[start_idx:end_idx + 3]
print(orf)
コード例 #19
0
import numpy as np
import pandas as pd
import Bio
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_dna
from Bio.SeqUtils import GC


#Given two strings s and t, t is a substring of s if t is contained as a contiguous collection of symbols in s (as a result, t must be no longer than s).

s = Seq("TGAGAAGTAGGAGCAAGCAGCTTGCAAGCAGCCCCAAGCAGGACAAGCAGCAAGCAGCCAAGCAGGACAAGCAGTAAGCTCAAGCAGGGCAAGCAGACAAGCAGGAGAAGACAAGCAGCCATGCACAAGCAGCAAGCAGACCTGCAAGCAGCAAGCAGCTCAAGCAGTTAATCAAGCAGCAAGCAGCAAGCAGATCAAGCAGTCACAGATCGGACAAGCAGTTTGCAAGCAGCAAGCAGACAAGCAGCCTCAAGCAGCAAGCAGCTTCAAGCAGTGTACAAGCAGTTACCAAGCAGATCCATATCAAGCAGGGCACAAGCAGACCCAGAGTCAAGCAGGAATCAAGCAGCCAAGCAGCAAGCAGCAAGCAGTACAAGCAGCAAGCAGCAAGCAGCCAAGCAGTTCCAAGCAGCAAGCAGGGTAAGTGTCAAGCAGCAAGCAGGGTGCAGCAAGCAGTGCAAGCAGCAAGCAGCCCAAGCAGCAAGCAGCCAAGCAGCTCCCAAGCAGCCAAGCAGCAACAAGCAGAAGATCAAGCAGTGCCGCAAGCAGCATCCTTCAAGCAGACCAAGCAGCTCCACAAGCAGTATAGGGGGGGAGGCAAGCAGAAGGTCAAGCAGTCAAGCAGCGCACCCAAGCAGAAGCAAGCAGACAAGCAGCAAGCAGACAAGCAGTCAAGCAGACCAAGCAGAGCAAGCAGCCAAGCAGTCCCCAAGCAGACAAGCAGAGTCCCGTTCCAAGCAGGAAACAAGCAGCAAGCAGCAAGCAGTAAAGTCCCACAAGCAGGTTCCAAGCAGCAAGCAGCAACAAGCAGTGGCAAGCAGACAAGCAGTTCCAAGCAGCAAGCAG")
t = Seq("CAAGCAGCA")

# Collect the 1-based start position of every occurrence of t in s,
# including overlapping ones.
locations = []
count = 0  # 0-based index at which the next search starts
loc = 0

while count < len(s):
    loc = s.find(t, start=count)
    if loc == -1:
        # No occurrence at or after `count`: nothing further can match.
        break
    # A hit at index 0 is valid too, hence the test against -1 above
    # rather than `loc > 0`.
    count = loc + 1
    # plus 1 since the result needs to be in count-from-1 format
    locations.append(loc + 1)

print(' '.join(map(str, locations)))
コード例 #20
0
# Extract the region between `preseq` and `postseq` from every sequence line
# of a two-lines-per-record input file (header line, then sequence line) and
# write each distinct region once, in FASTA format.
preseq = "YYC"
postseq = "WGQ"

filein = open(sys.argv[1])
fileout = open(sys.argv[2], "w")

try:
    linecount = 0
    for line in filein:
        linecount += 1
        if linecount % 2 == 1:
            # Odd lines are headers; turn "@" into the FASTA ">" marker.
            header = line.rstrip()
            fastaheader = header.replace("@", ">")
        else:
            sequence = Seq(line.rstrip())
            prepos = sequence.find(preseq)
            endaa = sequence.find(postseq)
            # Test the raw find() results: once the motif length has been
            # added, a missing preseq can no longer be recognised as -1.
            if prepos == -1 or endaa == -1:
                continue
            startaa = prepos + len(preseq)
            targetbit = line.rstrip()[startaa:endaa]
            # `bigset` (regions already written) is not defined in this
            # snippet - presumably a set created earlier in the script.
            if targetbit in bigset or len(targetbit) > 50 or len(targetbit) < 1:
                continue
            bigset.add(targetbit)
            fileout.write(fastaheader + "\n")
            fileout.write(targetbit + "\n")
finally:
    # Close both files even when a line cannot be processed.
    filein.close()
    fileout.close()

#print("Got total number of CDR3s: "+str(len(bigset)))
コード例 #21
0
# Translate the region between `startseq` and `endseq` of every read in a
# four-lines-per-record input (`filein`) and write header + protein to
# `fileout` for reads that pass all filters. `filein` and `fileout` are
# expected to be open file objects created before this point.
startseq = "ATGGCACAG"
endseq = "ACCGTCTCCTCA"

readsread = 0
readspassing = 0

linecounter = 0
for linecounter, line in enumerate(filein, 1):
    phase = linecounter % 4
    if phase == 1:
        # first line of a record: the header
        readsread += 1
        headerline = line
    elif phase == 2:
        # second line of a record: the bases
        dna = Seq(line.rstrip())
        startbase = dna.find(startseq)
        endbase = dna.find(endseq)
        # read fails if missing the start or end sequence
        if startbase != -1 and endbase != -1:
            # NOTE(review): the slice ends one base past the end of endseq
            # ("+ 1") - confirm this extra base is intended.
            targetregion = Seq(
                line.rstrip()[startbase:endbase + 1 + len(endseq)],
                generic_dna)
            # read fails if the grabbed region is not a multiple of three
            if len(str(targetregion)) % 3 == 0:
                translated = targetregion.translate()
                # read fails if it has a stop codon
                if "*" not in str(translated):
                    readspassing += 1
                    fileout.write(headerline)
                    fileout.write(str(translated) + "\n")
コード例 #22
0
ファイル: SeqEval_V41.py プロジェクト: DavidBulger/AbiTracer
        data4r = data4[::-1]
        pos = record.POS                #Base positions in trace files
        posr = pos[::-1]                #Reverse base positions
    print file_name

    # gk loop in order to batch process all mutations
    n = -1
    for gklist in mutation:
        n+=1 # counter

        # Query sequence variables
        gkf = Seq(gklist[1].upper()) # million mutation project gk allele forward sequence 20 bp before mutation
        WTf = Seq(gklist[2].upper()) # wild type nucleotide at mutation site 

        # Find sequence
        lgkf = seq.find(gkf)  # lgkf = location of gkf in input file str1
        lgkr = rseq.find(gkf) # lgkr = location of gkr in input file str1

        # Display results
        if lgkf >= 0:
            print_details(file_name,data1,data2,data3,data4,phd,WTf,lgkf,gkf,fwo,pos)
            continue
        elif lgkr >= 0:
            gkr = gkn.reversecomplement()
            gkr_ = gkr[n]
            gkrF = gkr_[1:20]
            WTrF = gkr_[0]
            lgkrF = seq.find(gkrF)
            if lgkrF >= 0:
                print_rdetails(file_name,data1,data2,data3,data4,phd,WTrF,lgkrF,fwo)
                continue
コード例 #23
0
Similarly, an error will be thrown if you try to do something like translate
a protein sequence.
"""


"""
The Seq object has a number of methods which act just like those of a Python
string (For example, the find and count methods).
"""

#from Bio.Seq import Seq
#from Bio.Alphabet import generic_dna
# Build a DNA sequence and print it (Python 2 print statement).
my_dna = Seq("AGTACACTGGT", generic_dna)
print my_dna
#Seq('AGTACACTGGT', DNAAlphabet())
# find() gives the 0-based index of the first match ...
my_dna.find("ACT")
#5
# ... and -1 when the sub-sequence does not occur.
my_dna.find("TAG")
#-1
# count() gives the number of non-overlapping matches ("GG" occurs once).
my_dna.count("GG")
#note that count is non-overlapping
# Same behaviour as the plain string method: 3 for seven A's.
"AAAAAAA".count("AA")


"""
BioPython has several built-in functions for biological applications:
complement, reverse complement, translation, back translation
"""

#from Bio.Seq import Seq
#from Bio.Alphabet import generic_dna
コード例 #24
0
ファイル: crisprdesign.py プロジェクト: nmorris/crisprdesign
class HrTemplate:
	"""Holds information about a template for homologous recombination.
	Args:
		seq(Bio.Seq): sequence of the HR template
		target_location(GenomicLocation): genomic location the template is targeting
		target(Bio.Seq): Entire sequence of the targeted site
		frame(int): can be +1, +2, +3, -1, -2, -3, or 0 (non-coding)	
	"""
	def __init__(self, seq):
		# instance variables
		self.seq = Seq(str(seq), generic_dna) # sequence of the hr template itself
		self.target_site = None # GenomicLocation object
		self.target = None # genomic sequence of the region this template targets (not necessarily identical to seq of template)
		self.frame = 1 # can be +1, +2, +3, -1, -2, -3, or 0 (non-coding)
	def __eq__(self, other):
		return (( self.seq, self.target_location, self.target, self.frame ) == (other.seq, other.target_location, other.target, other.frame ))
	def __ne__(self, other):
		return not self == other

	def find_frame(self): #assume that protein seq with fewest number of stops is correct
		revcomp = self.seq.reverse_complement()
		forward_stops = []
		reverse_stops = []
		min_stops = 1000
		for i in range(3):
			#print self.seq[i:].translate()
			#print revcomp[i:].translate()
			#print
			forward_stops = self.seq[i:].translate().count("*")
			reverse_stops = revcomp[i:].translate().count("*")
			if forward_stops < min_stops:
				self.frame = i+1
				min_stops = forward_stops
			if reverse_stops < min_stops:
				self.frame = (-1*i)-1
				min_stops = reverse_stops
				
	def remove_pam( self, sgrna ):
		# find location based on genomic sequence we're targeting
		# then perform mutation based on existing HR template
		# this allows mutations in HR template, serial removal of multiple PAMs, etc.
		protospacer = sgrna.protospacer.back_transcribe()
		sgrna_target_index = self.target.seq.find( protospacer ) # where are we in the whole target sequence?
		sgrna_template_index = self.seq.find( protospacer ) # where are we in the HDR template?
		if sgrna_template_index != -1 and sgrna_target_index != -1:
			strand = "+"
		else:
			strand = "-"
			sgrna_target_index = self.target.seq.find(protospacer.reverse_complement())
			sgrna_template_index = self.seq.find(protospacer.reverse_complement())
		if sgrna_target_index != -1 and sgrna_template_index == -1:
			# this can only happen if the introduced mutation ablates the protospacer
			print "Could not find protospacer in HR template sequence. Mutation ablated protospacer or protospacer lies in the ablated PAM of another guide."
			return True
		elif sgrna_target_index == -1 and sgrna_template_index == -1:
			# Should never get here, if sgrna_template_index is != -1
			print "Could not find protospacer in HR target sequence!"
			return False
		# print "Strand %s" % strand
		if strand == "+":
			possible_bases3 = ["A", "T", "C"] # 3rd base of pam
			possible_bases2 = ["T", "C" ] # 2nd base of pam
			template_pos2,template_pos3 = range(sgrna_template_index + len(protospacer)+1, sgrna_template_index + len(protospacer)+3) # only get the last 2 bases of the PAM
			target_pos2,target_pos3 = range(sgrna_target_index + len(protospacer)+1, sgrna_target_index + len(protospacer)+3) # only get the last 2 bases of the PAM
		else:
			possible_bases3 = ["A", "T", "G"] # 3rd base of pam
			possible_bases2 = ["A", "G" ] # 2nd base of pam
			template_pos3, template_pos2 = range(sgrna_template_index-3, sgrna_template_index-1) # only get the last two bases of the PAM
			target_pos3, target_pos2 = range(sgrna_target_index-3, sgrna_target_index-1) # only get the last two bases of the PAM
		#print pos2, pos3
		# from here on, whether + or - strand, pam_indices order is [pos2,pos3] = 2nd base of PAM, 3rd base of PAM
		# print pam_indices
		if self.frame == 0: # if we're in a noncoding region, replace with a randomly allowed base
			base = random.choice(possible_bases3)
			seq_copy = self.seq[:template_pos3] + base + self.seq[template_pos3+1:]
			self.seq = seq_copy
			return True
		# SHOULD THE CASE BELOW JUST BE USED EVERY TIME?
		else:	# if we're in a coding region, possible_bases is shuffled and we iterate
			random.shuffle( possible_bases3 )
			random.shuffle( possible_bases3 )
			random.shuffle( possible_bases2 )
			possible_bases2.insert(0, self.seq[ template_pos2 ] ) # add existing 2nd PAM base to the list. that way we start with identity at pos2 and vary pos3, then move on to varying both
			ref_translations = []
			# TODO work on pam removal if we don't have feature info.
			if self.target.features == None:
				if strand == "+":
					ref_translations.append(str(self.target[abs(self.frame)-1:].translate()))
				else:
					ref_translations.append(str(self.target.reverse_complement()[abs(self.frame)-1:].translate()))
			# we have feature information, e.g. from a genbank file
			if self.target.features != None:
				for feature in self.target.features:
					if feature.type == "CDS":
						ref_translations.append( str(feature.extract(self.target).seq.translate()) )
			for base3 in possible_bases3:
				for base2 in possible_bases2:
					if strand == "+":
						target_copy = self.target[:target_pos2] + base2 + base3 + self.target[target_pos3+1:] # copy of the full target seq with PAM bases replaced
					else:
						target_copy = self.target[:target_pos3] + base3 + base2 + self.target[target_pos2+1:] # copy of the full target seq with PAM bases replaced
					test_translations = []
					target_copy.features = self.target.features
					if target_copy.features == None: # fasta
						if strand == "+":
							test_translations.append(str(target_copy[abs(self.frame)-1:].translate()))
						else:
							test_translations.append(str(target_copy.reverse_complement()[abs(self.frame)-1:].translate()))
					else: # genbank
						for feature in target_copy.features:
							if feature.type == "CDS":
								test_translations.append(str(feature.extract(target_copy).seq.translate()))
					synonymous = set(test_translations).issubset(set(ref_translations)) # are all of the test translations found in the reference translations?
					if synonymous == True:
						if strand == "+":
							self.seq = self.seq[:template_pos2] + base2 + base3 + self.seq[template_pos3+1:] # copy of the HR template seq with PAM bases replaced
						else:
							self.seq = self.seq[:template_pos3] + base3 + base2 + self.seq[template_pos2+1:]
						return True
#					print protospacer
#					print test_translation
#					print ref_translation
#					print
		#print "Could not remove PAM!"
		return False			
コード例 #25
0
    book = xlrd.open_workbook(xlsfile)
    sheet = book.sheet_by_index(0)
    nrows = sheet.nrows
    #print (nrows)
    #print (xlsfile)
    #print sheet.cell_value(rowx=0, colx=2)

    oligo_row = 0
    oligo_col = 2
    name_col = 1

    for oligo in range(sheet.nrows):
        cell = sheet.cell_value(rowx=oligo_row, colx=oligo_col)
        if oligo_row < nrows:
            oligo_caps = cell.upper()
            oligo_find = ref_seq.find(oligo_caps)
            oligo_rev_find = ref_rev_comp.find(oligo_caps)

            if oligo_find == -1 and oligo_rev_find == -1 or cell == '':
                oligo_row += 1
            elif oligo_find != -1 or oligo_rev_find != -1:
                #print ('elif1')
                name = sheet.cell_value(rowx=oligo_row, colx=name_col)
                name_match_list.extend((name, ))
                no_match += 1
                oligo_row += 1
                #print (name)
                #print (oligo_caps)
                #print ("GOT IT")

    print "this is name_match_list % s" % name_match_list
コード例 #26
0
G | GTG V(s)| GCG A   | GAG E   | GGG G   | G
--+---------+---------+---------+---------+--
"""

# 6 : find ORF from sequence object

# from Bio.Seq import Seq
PHYB = Seq(
    "AAAAAAGTCGCAGAAAATATATGAGGAAACAAAAAGCGAAGACGACAAAAAAAAAAAAAACTCTGATTTTTTTTTGTTATCTCTCTCTATCTGAGAGGCACACATTT\
TGCTTCGTCTTCTTCAATTTATTTTATTGGTTTCTCCACTTATCTCCGATCTCAATTCTCCCCATTTTCTTCTTCCTCAAGTTCAAAATTCTTGAGAATTTAGCTCTACCAGAATTCGT\
CTCCGATAACTAGTGGATGATGATTCACCCTAAATCCTTCCTTGTCTCAAGGTAATTCTGAGAAATTTCTCAAATTCAAAATCAAACGGCATGGTTTCCGGAGTCGGGGGTAGTGGCGG\
TGGCCGTGGCGGTGGCCGTGGCGGAGAAGAAGAACCGTCGTCAAGTCACACTCCTAATAACCGAAGAGGAGGAGAACAAGCTCAATCGTCGGGAACGAAATCTCTCAGACCAAGAAGCA\
ACACTGAATCAATGAGCAAAGCAATTCAACAGTACACCGTCGACGCAAGACTCCACGCCGTTTTCGAACAATCCGGCGAATCAGGGAAATCATTCGACTACTCACAATCACTCAAAAC"
)
# 582 base from PHYB
# Index of the first "ATG" in PHYB (-1 if absent).
start_idx = PHYB.find("ATG")
# First "TAG" at or after the ATG; the search is not limited to the reading
# frame of the ATG.
end_idx = PHYB.find("TAG", start_idx)  # skipped TAA, TGA for convenience
# "+ 3" keeps the three bases of the stop codon in the slice.
orf = PHYB[start_idx:end_idx + 3]
print(orf)
# ATGAGGAAACAAAAAGCGAAGACGACAAAAAAAAAAAAAACTCTGATTTTTTTTTGTTATCTCTCTCTATCTGAGAGGCACACATTTTGCTTCGTCTTCTTCAATTTATTTT
# ATTGGTTTCTCCACTTATCTCCGATCTCAATTCTCCCCATTTTCTTCTTCCTCAAGTTCAAAATTCTTGAGAATTTAG

# 7 : Weight of sequence

# from Bio.Seq import Seq
# from Bio.Alphabet import IUPAC
from Bio.SeqUtils import molecular_weight

# The same letters wrapped three ways: without an alphabet, as unambiguous
# DNA, and as protein.
# NOTE(review): IUPAC is used here but its import above is commented out -
# confirm it is imported elsewhere in the file.
seq1 = Seq("ATGCAGTAG")
seq2 = Seq("ATGCAGTAG", IUPAC.unambiguous_dna)
seq3 = Seq("ATGCAGTAG", IUPAC.protein)
コード例 #27
0
ファイル: get_Record.py プロジェクト: sharifmamun/Exon_Finder
def ungap(seq):
    """Return *seq* with the gap characters '-' and '~' removed.

    The input is returned untouched when it contains neither character;
    otherwise a new ``Seq`` is built from the cleaned-up text.
    """
    for gap_char in ('-', '~'):
        if seq.find(gap_char) >= 0:
            seq = Seq(str(seq).replace(gap_char, ''))
    return seq
コード例 #28
0
# Notebook-style walkthrough: the bare expressions below only display a value
# in an interactive session; run as a script they compute and discard it.

# Show the installed Biopython version
Bio.__version__

## Sequence Operations

# Build the example sequence
seq = Seq("GGACCTGGAACAGGCTGAACCCTTTATCCACCTCTCTCCAATTATACCTATCATCCTAACTTCTCAGTGGACCTAACAATCTTCTCCCTTCATCTAGCAGGAGTC")

# Alphabet attached to the sequence
# NOTE(review): the `.alphabet` attribute presumably exists only in older
# Biopython releases -- confirm against the version in use.
seq.alphabet

# Type of the alphabet object
type(seq.alphabet)

# Find a sub-sequence: evaluates to the position of the first match,
# or -1 when the sub-sequence does not occur
seq.find("ATC")

seq.find("ATGC")

# Number of `A`
seq.count("A")

# Number of `C`
seq.count("C")

# Number of `T`
seq.count("T")

# Number of `G`
seq.count("G")
コード例 #29
0
def _parse_cds_header(record_id):
    """Split a FASTA record id into ``(mrna_id, coding_start, coding_end)``.

    The id is expected to look like ``<mrna_id>|...CDS:<start>-<end>|...``.
    Returns ``None`` when the ``CDS:`` field or the ``-`` between start and
    end is missing.
    """
    flag = record_id.find("|")
    mrna_id = record_id[:flag]
    tmp = record_id[flag + 1:]
    cds = tmp.find('CDS:')
    if cds == -1:
        return None
    tmp = tmp[cds:]
    cds_end = tmp.find('|')
    tmp = tmp[tmp.find(':') + 1:cds_end]

    split_flag = tmp.find('-')
    if split_flag == -1:
        return None
    return mrna_id, int(tmp[:split_flag]), int(tmp[split_flag + 1:])


def _translate_window(nucleotides, strand):
    """Translate a nucleotide window to a protein string (up to the first stop).

    For any strand other than '+' the window is complemented before it is
    transcribed and translated. Every literal 'None' substring is stripped
    from the result.
    """
    dna = Seq(str(nucleotides), IUPAC.ambiguous_dna)
    if strand != '+':
        dna = dna.complement()
    protein = str(dna.transcribe().translate(to_stop=True))
    while protein.find('None') != -1:
        protein = protein.replace('None', '')
    return protein


def get_new_sequence(dfname, dbname, rna_db, protein_db, dataset_name):
    """Build mutated protein sequences and write them to a FASTA file.

    Steps, as implemented below:

    1. Collect strand, protein id and gene id for every ``transcript`` record
       of ``dbname`` whose type is ``protein_coding`` and whose transcript id
       is in the list obtained from ``protein_db``.
    2. Read ``rna_db`` (FASTA) and keep, per mRNA id, the CDS start/end parsed
       from the record id together with the sequence.
    3. Read the tab-separated, header-less table ``dfname`` and turn it into
       per-transcript changes via ``extract_transcript_change``.
    4. For each transcript, apply all homozygous ('hom') changes to the CDS,
       then walk over it in 1800-base windows that advance by 900 bases.
       Each window yields one record carrying the homozygous changes that
       fall inside it, plus one record per other (heterozygous) change that
       falls inside it.
    5. Write every record to ``<dataset_name>_all_mutation_<version>.fasta``
       and print summary counts.

    Args:
        dfname: path of the tab-separated variant table (read without header).
        dbname: annotation source handed to ``DataIterator``.
        rna_db: FASTA file whose record ids contain ``CDS:<start>-<end>``.
        protein_db: protein database handed to ``get_version`` and
            ``get_protein_coding_list_from_db``.
        dataset_name: prefix of the output FASTA file name.

    Returns:
        ``0`` when a record id in ``rna_db`` lacks a parsable CDS range
        (after printing a message); otherwise ``None``.

    Raises:
        KeyError: a transcript with annotation has no sequence in ``rna_db``.
    """
    version = get_version(protein_db)
    if (version == 'swissprot'):
        db = 'sp'
    else:
        db = version

    protein_coding_list = get_protein_coding_list_from_db(protein_db)
    strand_dict = {}
    protein_id_dict = {}
    for record in DataIterator(dbname):
        if record[2] != 'transcript':
            continue
        if 'transcript_type' not in record.attributes:
            continue
        if record.attributes['transcript_type'][0] != 'protein_coding':
            continue
        transcript_id = record.attributes['transcript_id'][0]
        if transcript_id in protein_coding_list:
            strand_dict[transcript_id] = record[6]
            protein_id_dict[transcript_id] = (
                record.attributes['protein_id'][0],
                record.attributes['gene_id'][0])

    print("protein_id_dict ready")

    # SeqIO.parse yields records only once. Validating in one loop and
    # collecting in a second loop over the same iterator left sequence_dict
    # empty, so validation and collection now share a single pass.
    sequence_dict = {}
    for seq in SeqIO.parse(rna_db, 'fasta'):
        parsed = _parse_cds_header(seq.id)
        if parsed is None:
            print("The format of file of parameter -r(--rna) is incorrect!")
            return 0
        mrna_id, coding_start, coding_end = parsed
        sequence_dict[mrna_id] = (coding_start, seq.seq, coding_end)

    print("sequence_dict ready")

    df = pd.read_csv(dfname, sep='\t', header=None)
    change_df = extract_transcript_change(df)

    # Row positions of change_df grouped by transcript id.
    trans_index_dict = {}
    for i in range(0, change_df.shape[0]):
        trans_index_dict.setdefault(change_df.iloc[i]['mrna'], []).append(i)

    my_seqs = []
    k_cnt = 0
    hom_cnt = 0
    het_cnt = 0

    for k in trans_index_dict:
        if k not in protein_id_dict:
            continue
        k_cnt += 1
        pid = protein_id_dict[k][0]
        gid = protein_id_dict[k][1]
        strand = strand_dict[k]

        if (strand == '+'):
            transcript = str(sequence_dict[k][1])
        else:
            # NOTE(review): the minus strand is complemented, not
            # reverse-complemented -- confirm this is intended.
            transcript = str(
                Seq(str(sequence_dict[k][1]),
                    IUPAC.ambiguous_dna).complement())

        # The CDS coordinates are 1-based and inclusive.
        coding_start = int(sequence_dict[k][0]) - 1
        coding_end = int(sequence_dict[k][2]) - 1
        transcript = transcript[coding_start:coding_end + 1]

        shift = 0  # net length change caused by the indels applied so far
        des = ""  # description of all homozygous changes, '_'-terminated
        het_list = []
        hom_position_list = []
        for i in trans_index_dict[k]:
            row = change_df.iloc[i]
            if (row['snp_type'] != 'hom'):
                het_list.append(i)
                het_cnt += 1
                continue

            hom_cnt += 1
            c_start = int(row['c_start'])
            c_end = int(row['c_end'])
            mutation_type = row['mutation_type']
            hom_position_list.append((c_start, mutation_type))

            if (strand == '-'):
                content = str(Seq(str(row['c_content'])).complement())
            else:
                content = row['c_content']
            transcript = change_seq(transcript, c_start + shift,
                                    c_end + shift, content, mutation_type)
            des += (mutation_type + ":" + str(row['c_start']) + '-' +
                    str(row['c_end']) + str(row['c_content']) + '_')

            if (mutation_type.find('del') != -1):
                shift -= (c_end - c_start + 1)
            elif (mutation_type.find('ins') != -1):
                shift += (c_end - c_start + 1)

        # Windows start every 900 bases and span up to 1800 bases.
        count = max(1, len(transcript) // 900)
        cnt = 0

        for window in range(0, count):
            start = window * 900
            stop = min(start + 1799, len(transcript) - 1)

            # One record for the homozygous changes inside this window.
            if (len(hom_position_list) != 0):
                new_sequence = ""
                new_des = ""
                for p in hom_position_list:
                    if (p[0] - 1 >= start) and (p[0] - 1 <= stop):
                        # Recover this change's text from the accumulated
                        # description by searching for its start position.
                        flag = des.find(str(p[0]))
                        tmp = des[flag:]
                        flag = tmp.find('_')
                        new_des += str(p[1]) + ":" + tmp[:flag + 1]
                        new_sequence = transcript

                if (len(new_sequence) != 0):
                    new_seq = _translate_window(
                        new_sequence[start:stop + 1], strand)
                    cnt += 1
                    my_seqs.append(
                        SeqRecord(Seq(str(new_seq), IUPAC.protein),
                                  id=db + '|' + pid + '|' + k + '|' + gid +
                                  '_' + str(cnt) + ':' + str(start + 1) +
                                  '-' + str(stop + 1),
                                  description=new_des))

            # One record per heterozygous change inside this window, each
            # applied on top of the homozygous-mutated transcript.
            for het_index in het_list:
                het_row = change_df.iloc[het_index]
                new_sequence = ""
                new_des = ""

                # NOTE(review): this range test uses c_start as-is, while the
                # homozygous test above uses c_start - 1 -- confirm.
                if ((int(het_row['c_start']) >= start)
                        and (int(het_row['c_start']) <= stop)):
                    if (strand == '-'):
                        content = str(
                            Seq(str(het_row['c_content'])).complement())
                    else:
                        content = het_row['c_content']
                    new_sequence = change_seq(
                        transcript,
                        int(het_row['c_start']) + shift,
                        int(het_row['c_end']) + shift, content,
                        het_row['mutation_type'])
                    new_des = (des + het_row['mutation_type'] + ":" +
                               str(het_row['c_start']) + '-' +
                               str(het_row['c_end']) +
                               str(het_row['c_content']))

                if (len(new_sequence) != 0):
                    new_seq = _translate_window(
                        new_sequence[start:stop + 1], strand)
                    cnt += 1
                    my_seqs.append(
                        SeqRecord(Seq(str(new_seq), IUPAC.protein),
                                  id=db + '|' + pid + '|' + k + '|' + gid +
                                  '_' + str(cnt) + ':' + str(start + 1) +
                                  '-' + str(stop + 1),
                                  description=new_des))

    print("The number of proteins related is " + str(k_cnt))
    print("The number of sequences generated is " + str(len(my_seqs)))
    print("The number of homozygous is " + str(hom_cnt))
    print("The number of heterozygous is " + str(het_cnt))

    hom_seq = 0
    het_seq = 0
    hom_het_seq = 0
    # The context manager guarantees the FASTA file is flushed and closed.
    with open(dataset_name + "_all_mutation_" + version + ".fasta",
              "w") as handle:
        for sequence in my_seqs:
            description = str(sequence.description)
            if description.endswith('_'):
                # Only homozygous changes: the description ends with '_'.
                hom_seq += 1
            elif description.find('_') == -1:
                # No '_' at all: a single heterozygous change.
                het_seq += 1
            else:
                hom_het_seq += 1

            SeqIO.write(sequence, handle, "fasta")
    print("The number of homozygous sequences is " + str(hom_seq))
    print("The number of heterozygous sequences is " + str(het_seq))
    print("The number of mixed sequences is " + str(hom_het_seq))
コード例 #30
0
 linecount += 1
 if linecount % 2 == 1:
     header = line.rstrip()
     fastaheader = header.replace(
         "@",
         ">") + "_S" + args.sample + "_I" + args.immuno + "_B" + args.boost
     identifier = header.replace("@", "")
 else:
     sequence = Seq(line.rstrip())
     ## modified pattern search using a motif, i.e. introduced a regex
     ## meaning the regex itself might not yield any results, so additional
     ## conditionals need to be introduced before using .find()
     ## NB: always finds using first regex match
     ### startaa = sequence.find(preseq) + 3
     if len(re.findall(r"T.{2}Y.{1}C", str(sequence))) > 0:
         startaa = sequence.find(
             re.findall(r"T.{2}Y.{1}C", str(sequence))[0]) + 3
     else:
         startaa = -1
     ### endaa = sequence.find(postseq)
     if len(re.findall(r".{6}TVSS", str(sequence))) > 0:
         endaa = sequence.find(re.findall(r".{6}TVSS", str(sequence))[0])
     else:
         endaa = -1
     if startaa == -1 or endaa == -1:
         tsvfile.write(identifier + "\t" + "NA" + "\t" + line.rstrip() +
                       "\tno-cdr3\n")
         continue
     targetbit = line.rstrip()[startaa:endaa]
     if targetbit in bigset or len(targetbit) > 50 or len(targetbit) < 1:
         tsvfile.write(identifier + "\t" + targetbit + "\t" +
                       line.rstrip() + "\tnon-unique\n")
コード例 #31
0
# Collect the chosen V and J gene of every realization.
with open(realize_file, 'r') as realizations:
    realizations.readline()  # drop the header row
    for entry in realizations:
        fields = entry.split(';')
        # Indices are written in parentheses, e.g. "(12)".
        v_pos = int(fields[1].replace('(', '').replace(')', ''))
        v_choice.append(v_genes[v_pos])
        j_pos = int(fields[2].replace('(', '').replace(')', ''))
        j_choice.append(j_genes[j_pos])

# Walk through the CDR3 file and print a clonotype (V gene, J gene,
# amino-acid CDR3) only when all of the following hold:
# (a) the J gene is in frame
# (b) the CDR3 anchors were found
# (c) the translated CDR3 has no stop codon

cdr3_aa = []
with open(cdr3_file, 'r') as cdr3_rows:
    cdr3_rows.readline()  # drop the header row
    for entry in cdr3_rows:
        idx_field, cdr3_nt, anchor_field, frame_field = entry.split(',')
        clone_idx = int(idx_field)
        has_anchors = int(anchor_field)
        in_frame = int(frame_field)
        if not (has_anchors and in_frame):
            continue
        peptide = Seq(cdr3_nt, IUPAC.unambiguous_dna).translate()
        if peptide.find('*') != -1:
            continue
        print(v_choice[clone_idx], j_choice[clone_idx], peptide)
コード例 #32
0
ファイル: gRNA.py プロジェクト: rhong3/gRNAsearch
def search(syn, tag, save=True, output_dir="~", output_name="gRNA"):
    """Search a synthetic sequence for gRNA candidates around PCR tags.

    For every row of the tag table, the forward tag is looked up in the
    sequence and the reverse tag in its reverse complement. Each hit gives a
    window (25 bases before the tag, 26 after it) and a wild-type version of
    that window with the tag replaced by its WT counterpart. Every ``NGG``
    in a window is then compared against the ``GG`` sites of the WT window
    and reported when the 20 bases upstream differ.

    Args:
        syn: FASTA file with the synthetic sequence. When it holds several
            records, the last one is used.
        tag: CSV file with the columns "Amplicon", "Forward Syn",
            "Forward WT", "Reverse Syn" and "Reverse WT".
        save: when true, write both result tables as CSV files.
        output_dir: directory for the CSV files.
        output_name: file-name prefix for the CSV files.

    Returns:
        ``(summary, frames)``: the candidate table and the intermediate
        window table, both as pandas DataFrames.

    Raises:
        FileNotFoundError: ``syn`` or ``tag`` does not exist.
        ValueError: ``syn`` contains no FASTA record.
        Exception: any other error raised while loading either file; it is
            re-raised after the diagnostic message is printed. Previously
            the message was printed and the function failed later with an
            unrelated ``UnboundLocalError``.
    """
    ori_seq = None
    try:
        for record in SeqIO.parse(syn, "fasta"):
            ori_seq = Seq(str(record.seq), IUPAC.unambiguous_dna)
            rev_com_seq = str(ori_seq.reverse_complement())
    except FileNotFoundError:
        print("Sequence file not found!")
        raise
    except Exception:
        # Continuing here could pair ori_seq with the reverse complement of
        # a different record, so the error is propagated.
        print("Sequence file error!")
        raise
    if ori_seq is None:
        print("Sequence file error!")
        raise ValueError("no FASTA record found in {}".format(syn))

    try:
        dic = pd.read_csv(tag, header=0)
    except FileNotFoundError:
        print("PCR Tag file not found!")
        raise
    except Exception:
        print("PCR Tag file error!")
        raise

    total_len = len(ori_seq)
    foundlist = []
    # make WT reference table
    # NOTE(review): a tag found within the first 25 bases gives a negative
    # window start, which Python slicing treats as "from the end" -- confirm
    # whether such hits can occur.
    for idx, row in dic.iterrows():
        ori_result = ori_seq.find(row["Forward Syn"])
        if ori_result != -1:
            WT = str(ori_seq[int(ori_result - 25):ori_result] +
                     row['Forward WT'] +
                     ori_seq[int(ori_result +
                                 len(row["Forward Syn"])):ori_result +
                             len(row["Forward Syn"]) + 26])
            foundlist.append([
                row["Amplicon"],
                int(ori_result - 25),
                int(ori_result + len(row["Forward Syn"]) + 26), "Forward", WT
            ])
        else:
            print(idx, row["Forward Syn"], " not found!")
        com_result = rev_com_seq.find(row["Reverse Syn"])
        if com_result != -1:
            WT = str(rev_com_seq[int(com_result - 25):com_result] +
                     row['Reverse WT'] +
                     rev_com_seq[int(com_result +
                                     len(row["Reverse Syn"])):com_result +
                                 len(row["Reverse Syn"]) + 26])
            # Coordinates are stored relative to the forward strand.
            foundlist.append([
                row["Amplicon"],
                int(total_len -
                    int(com_result + len(row["Reverse Syn"]) + 26)),
                int(total_len - int(com_result - 25)), "Reverse", WT
            ])
        else:
            print(idx, row["Reverse Syn"], " not found!")
    frames = pd.DataFrame(foundlist,
                          columns=[
                              'Amplicon', 'syn_FWD_start', 'syn_FWD_end',
                              'strand', 'WT_5_to_3'
                          ])

    output = []
    for idx, row in frames.iterrows():
        WT = row['WT_5_to_3']
        strand = row['strand']
        if strand == "Forward":
            seq = ori_seq
            start = row["syn_FWD_start"]
            end = row["syn_FWD_end"]
        elif strand == "Reverse":
            # Convert the forward-strand coordinates back to positions on
            # the reverse complement.
            seq = rev_com_seq
            start = total_len - row["syn_FWD_end"]
            end = total_len - row["syn_FWD_start"]
        else:
            # Cannot happen for the table built above; kept as a guard.
            print("Deprecated intermediate file at line {}!".format(idx))
            raise SystemExit(1)

        frame = str(seq[start:end])
        target = [n.start() for n in re.finditer(".GG", frame)]
        if not target:  # make sure NGG exists on syn
            print("No NGG found at line {}.".format(idx))
            continue

        # GG sites on WT depend only on the row, not on the NGG hit.
        GGpos = [n.start() for n in re.finditer("GG", WT)]
        for f in target:
            f_abs = f + start
            if strand == "Forward":
                newstart = f_abs - 20
                newend = f_abs + 3
            else:
                newstart = total_len - (f_abs + 3)
                newend = total_len - (f_abs - 20)
            candidate = str(seq[f_abs - 20:f_abs + 3])

            if not GGpos:  # GG not found on WT
                pattern = "GG different"
            else:
                ce = 0  # GG sites whose -8..-1 bases differ
                ct = 0  # GG sites whose -20..-1 bases differ somewhere
                for GG in GGpos:
                    eight = WT[GG - 9:GG - 1]  # get -8 to -1
                    if eight == str(seq[f_abs - 8:f_abs]):  # -8 to -1 same
                        twelve = WT[GG - 21:GG - 9]  # get -20 to -9
                        if twelve == str(seq[f_abs - 20:f_abs - 8]):
                            # identical 20-mer on WT: candidate is unusable
                            break
                        else:  # -20 to -9 different for this GG
                            ct += 1
                    else:  # -8 to -1 different for this GG
                        ce += 1
                        ct += 1
                if ce == len(GGpos) and ct == len(GGpos):
                    pattern = "8 different"  # -8 to -1 differ for all GG
                elif ct == len(GGpos):
                    pattern = "12 different"  # use with caution
                else:
                    continue  # an identical site exists on WT: do not use

            output.append([
                candidate, pattern, strand, row['Amplicon'], newstart, newend
            ])

    summary = pd.DataFrame(output,
                           columns=[
                               '5_to_3_sequence', 'pattern', 'strand',
                               'amplicon', 'syn_FWD_start', 'syn_FWD_end'
                           ])
    if save:
        frames.to_csv("{}/{}_intermediate.csv".format(output_dir, output_name),
                      index=False)
        summary.to_csv("{}/{}_search.csv".format(output_dir, output_name),
                       index=False)
        print("Done! Results are in {}".format(output_dir))
    return summary, frames
コード例 #33
0
    nrows = sheet.nrows
    #print (nrows)
    #print (xlsfile)
    #print sheet.cell_value(rowx=0, colx=2)



    oligo_row = 0
    oligo_col = 2
    name_col = 1

    for oligo in range(sheet.nrows):
        cell = sheet.cell_value(rowx=oligo_row, colx=oligo_col)
        if oligo_row < nrows:
            oligo_caps = cell.upper()
            oligo_find = ref_seq.find(oligo_caps)
            oligo_rev_find = ref_rev_comp.find(oligo_caps)

            if oligo_find == -1 and oligo_rev_find == -1 or cell == '':
                oligo_row += 1
            elif oligo_find != -1 or oligo_rev_find != -1:
                #print ('elif1')
                name = sheet.cell_value(rowx=oligo_row, colx=name_col)
                name_match_list.extend((name,))
                no_match += 1
                oligo_row += 1
                #print (name)
                #print (oligo_caps)
                #print ("GOT IT")

コード例 #34
0
ファイル: RNA to Protein.py プロジェクト: ishansri17/ROSALIND
    'UGU': 'C',     'CGU': 'R',     'AGU': 'S',     'GGU': 'G',
    'UGC': 'C',     'CGC': 'R',     'AGC': 'S',     'GGC': 'G',
    'UGA': 'Stop',  'CGA': 'R',     'AGA': 'R',     'GGA': 'G',
    'UGG': 'W',     'CGG': 'R',     'AGG': 'R',     'GGG': 'G'
}

str1 = "AUGUCCGGCCUCAUCAAUACCUAUUGUCGGGAUGCCUUCUCUACCCCACCGCCCUUAACGGCUACGUGCACAAGAGGGGGAUAUAUUCCAGCGGCCAAUCUCCUCUCUCCUAGCUUCAGGACUUCAUUUGGGGGAGAAUCGCUGAAUGGCCUUCUGGUCGGAGGGAGAAUAGUUCCGAAUGCUUCCGUCCGUCGGUCGCAGCAAGGUAGAGACGGCCUUUGCGUUGGCGUGUCUCUGUCUUGUUGGACGUACGCCGAAUUUCUGAUACGACACGACCAGAUCCGGUCUCUCUGCGAAAGGCUGAACUCCUUUGUACCAGACUCCAUAAGAGUCAGGUCAAUGAGCGACGCCGCAGCUGAAUAUACCAUACUAAGUGAGCAAGUUCCAGACACGAAGAUUGCGUCCCGGUGCGACUGGGUAGCAAACACAAAUUUUCACAAUCGUUGUACGCCUCACGAGUCCAUGCACUGGACGUGUCCCCCUAGUUUCUAUUCGCCUACCGUAACUCUUUCACCGAUCAGUAUCAGGUAUGGCCAGAAAUCCAGGCUAGCGAUGAUACUGUAUGUGAUAAAAUUACGAAGCGUACGCGUCCUUGUAUCAAUGCAAGCAAGCCCGGCAUCAGGACCUCGACGCCUUUUCACUAGCCCCGGCCCGCGAUCGUCCGGAUAUCGACAUAGCUCUCUCCGCCACCGGGAGUUGUACAUCGUGUUGUCAUGCACUAGGUGCUCUGACCUACGACUUGAGAUCAAGCGCUCGCCUUUAUUUCGCUCUUCCCCUGACGUUCGCCGUAGCCGCCAGAUCUCCUGGUCUAGGGGGAUUGGAACGGAGGCUGGUGCCUUUUCAGGGCGAUUGCGUGCGGUGAACCGCCAAGUCGACUUACUGAAGGAGUCAGUUAGUGUCAGGUUGUAUAAGGUGGGAGCUUGGGGUGGUUUGCGCAUUACGUUUCACAUCAUAUCGCCGGAUUAUACCGAGUGGCGUGGGUGGGAUGUUCCCAUUCUAAGUUCAUUGACUCAUCGACGCGGCAGCGACCUGACGAGGCGCAUAAAAAUUCGGGCUAGGUUGCAAACUUAUGGUCAUGCUGGCUGGCGCUGUCUACACGUAAGCUACCCGGCCCUGGUCCGUCGCAGAGCAGGACGGGCUAGUGACUUACGUCGCUUCCGUCAGAAGUUCCAUCUUAAAAAGCGGAUACUGCACGUUCAGACGGUAGGGCUAUGCAGGCCACCGAUUCGAACUCGUGUUAGAUGCGACCGACUUAUAGAAAGGUUCUUAACAAGAAGGCCUAUCGGCCCCCUGAUAAAAUCCCUCUCUAUUCAGUCCCUUUGGACAGGACGCCGCGGUACGGCUACGCAAGCAUCCCCAAAACUUGAUGCCAAAUCCAUGUAUUCAUUGAUAGGUAGGCGAGCGACCCGGGAGUCCGCGCUUAUAUACUUUCCUACCGCAUCCUGGAAUAAGGUCACUCUCAUUAUAGUGGCGGGGCGUCCUGUAGUUGUUUCCGGUCGCCCUCUCUCUACCAUAACCCUCGACGAGGGAGCGAUUCUAGCCAAGGGCGCAACUCCCCGAGAGACUCAGGUUCUUAACUACUCUGUUCAUGCGUAUUUUCAGACCCCCUCGCCGGAGGGUGGGUACAAAACACUGCCGGACCUCCAGAACGCGAGCCUCAAAGGACCUUUCAAGGCAGUCAGUCACGCGCUUCUCGCACCCGCAGUGCGGUGCGUCCUGCUUGUUUAUCGACUCUUUGCGACCAGCGGUAUACACGUGUACAAAUACCAUUAUAUUGUAUAUUGCGUCAAUAGAGUAGAGUGUACUGCCGAUAAACAUCCUCUAAUUGGAGGGCCCAGGAGUUGCUGGACUAACGCGCAACAAUUGGGCGAACGCGUAGGAAGUCGGCAAAGCAUACUAUACAGUGAUGGUCACCAACUUUGCGUUCAUGGGGAGCACUUACGUAGCCCCCUUUGUCCGAAACGCGCGCUAGCGAGGGACGCACGUUAUGCG
AGUACAACGUCGAUUCAACCCCGGGGCAACGAAUGGUGGGGCAUCGAUUCCGAGACUCGGCAGCGGACCUAUCGGACUCAAGCCCCUUUAAAAACUUGGAUGCCGGACACAGAAUUGCUAUCGAAGGGGCCAACCUGUUGCUCCCACCGAGACCCUGCGCGGCAUUAUACCGUCCCUAGUUGCCAGGAACGUUGGGUAGAGCAACUCCUCCAACGUUCUGGCCACCCCCUUCAACAUUGCAUUGUGGCGGUAGGGGAUGACUGGUCUAGCAAAUGCAUGUUCCAUAUGAUCCACUUGUCGCUCCGGGGGAUCGAGAAGAGCACAGCUAAUAUACGAAAUAACAACAUGCGCGUCGCGCAUGGAAGAAGAACCUCGCCCCUCGGGACUUGCUUACUUCCGAUCUCAUCUGUCCCCUUCCGACUAUGCUGUUGGUCGCUAAUGUCAGACUACCCGCCAGCUGGCCACACCAAUCACAAUCGUGAAGCUCAUAAAGACAAGGUAAAAACGCGAAGAAUCUGUCUGCCUUGCCCCGCACCGAUUGGGCCCCAGGCCGUUGCUAGCCUGGUUAUCAGGGGUCUCACACAUGGAGGCUCCCUAAUGGCGCCCAACACAAUAACGGUAGCGAUGAUGCACGUCCAUACGCAUGUCUAUCCUGACAUAUACAUCUCUCAACACCUAUCUGAGGCGACGGUUUCUGCGUCUCGCCAGAUAUGGACUGGUGUGCAGACUUUUGCGCGGCUUAGACGGACCUCUAACAGCAUGAUUUCUAUUGAUGCUGACUUCACACUGGGAGUUUCCCCUACCGGAACUACGUUCGACGCUGCCCCUUCUAAUGUACCUUCCCUCAGGUUUGGAGGGAGGUGCUGUCAACAUUACCGGCACAACGCGGACGGUAGUUUUCCUGCGAGAAAAAAGCCGAGUAGGCUACAUAGAACCCCAGCUGAAAGCACAUUAGUUUUAAUUAGCGGGACAGGAUCUCAAAAAGCAAUUCGACGUCUCAUAAACAUUAGGCGUCGACCUGCCAGGAGUGAACAUGGUCGUAUUAUGAAAGCCCGAAACCCUCAUCAAUGGAUAUUAUUUCGAAUAUUCGUGAGUUGGCAGUUUUGCCAUACCACACCAGCCAACUGCUGUUUAAUCGGGGGUCAAUUCCUCGGAGACGUUUCGGGUCAUACGGUCGGCGGCAAUGAAAGCGAUAAAACUAAGCACACGCCAGCUAUCUCUUCAGGGAACCCUAUUAUAGCCUCCCUAGCUGCUUCCGUAUUUCUUGUUGAUUGCUGUUGGUCACAAUACCCCUUCCUUAUAACCUGCCUUAAAGGUGAAUCCAGUGAGAGGGGAGAUUCUACUUCCGCCAAACCAGAGAACUAUUCCAGUGCCGCCAAAAUCAGCGUGAAGUUCUUUACGACCUCAAAUGUUAGAUGUUAUCACGACGUUCUCCCAUUCUUGAUUCCUCGGAUUCAUCUAAGUCAUAUGAAAGCCCGACUACACCUGUUAUUAUGUGGAACCGGUAUAAGGUCCCGACUAGACAAGUUUUUUACAACCCGUGCUAACGAGCUCACAUUAAGGAUUCCGUACGAAAUGAGAGCUUGCCAAGUAAGGCGGCGCAGUUUAGACGCAAGAUACAAGGAUCGACAUGCAGUCGUCAUGAAAUUCAGGUUUAUCGCUCCGCGCUGCUACCCAAUCACCGUGAUUCAGUGUCCUCGCGCGAUGGUUCCACGGACUCCUAAUCGGUCGAGGACUCAUGACUGUGCACUCUAUACUCGUAAAAGUAGCUCUAUGGAAAAAGUACGCGCUGCGGCCGUUGGAGUAGCUUCAGGUUCUGCAAAUCAGGGCCUCCUUAGAUUGCGCUGCCGUUUUGUUAUAAUGAGCCUAGACGUUCCCCCCCCCGCGUCAAAGGACAGCCCGUGGUUAUUUGACACAGGCGGCCAACGGAGCAGUCAUGUGAAGACGUCUCAAGUGCGACGCGGAUUACCAACUAAAAAAUGGGCUCAGAGCGUGAUCCU
GGGAAUGAACCGUAAUGGAGGCCGUACUUGUGGAGAAACAAGCUUGAAGCACUUCUACCACCUUUUCCUGGAGAUUUAUCCCGGUAAUCGAUGGCGGCGGGCUAGUUGUCAAUGGAGAAUUAAAUUGGGUCAGUCUAAUCUUUGCCCAACAUUGAGUGUUAAUGGCCCCCCAUUUAGGUCUACGUUCCGGCAACCGAAACUUCCGCACCAGUGUCCACCUGGGGGAUUACUACUCACCGUGAGCCGUGCGCCUUGGCUGGAAUCAUCGCCUCAAACACGGGGUAACCUUGACCCCCUGUCGAGCAUCCUUUCCGUGCUGUUGACACUUGACAGGCUAGCUCUUGGAGGGGAAAACCAGAGGAGACCCCGAUUGGUCAACGAAACAGCCGUGGGUGUGCAGUGUAUCAGGACACGUGCUCUACCAACGCCGGAUACCCCCCUCUGGGGCACUCUUGCUGGAAGAGCCUUAGUCGGCACCCAGGCGGCACCACAGAAACUUAGCAGCCGGCGAAUUGGGUUCCGUGCCAAUAGCAAAGCGGUGUCUGGGCCCCAUCGAGACUCAAUUCCAGGCCGUAAUCACCAGGGUCACCUAGAGCGUUAUUUUAAGUGCUCUGCAAAUUUGCGGCCGUCGCAGCGCAGUCCACGCAGGGUGCCACGACCCACAAGCGGCCAGUGUGUCUUUUCUCCUAACCAGUUGUGGUGUCUGUAUUUGCUUUGGCUUGCGCCGUGCCAUGCCACACACGCUUUUGGUAGGUUCGCCGUAAGGCGAGCCUGCAUGUUAUUUAUUGUCAAUCCGGAGCGUCUCCGCAUCGGAUGGAAAUCCACCAAGACAUCCCAAUAUCAGCUCGAAACAGUACCCGCUACUUAUACAUGUAGCAGAUCCACAGUAGCUGCGGUGGUCAUACACGGCAAAGCAUAUGUCGUCGGCGUCAAAGAGUUCACUGGUAUGCACUUUGCGCGUGAUUUAGACCGGCGCAGGAGUAACUCCGGGUACGUCGGAGCAUUAGAACCUGUGAGGGCCGGUGGGCUGAAGGCCAUCGAAGCCAUAGUCCAUUUGUGUUCGAAGACAGUGCUGUCAAGGUCGUGGUCUGGUACUCACCCGUUUCCAAUCGCAAGAAAGAAGAAGAAACGUUGUACUGACCCUGAGGUAAGAGUCAGGUGUCGGACCGGGGGCGGGAACUACAUAAGUAGGCACCUGAGUCUAUGCGCGACUUGUCGCCAAAAAUGGCGCAAGAUAUACGAAUUUCUAUACAAGUUGUACUCACCAAGUGAUACAUCGAGUCAAACAGAAAUGGCUCUCGUUGGACAGGAAUGUUCAAUCAGCGGGAUUCGUGCCAGUAAGCAUCGUCAUAGGGAAACCGAGGAAUCACCAUGGGAUUCGGGCCCGAGUUGGAGCGAAUCGACAAACGCAAUCGGCCUUGGCCUUUUGGCUAGUCAACACGCUCCACACCUAGUUGGCAUCUGGUUUCACUGUCCACUCCCAAAACUUACCAGAAACGACACGAGCCUUAGCACGACCGGUUAUUCGACAAGUGGUCAAAAUGAGCGCUCUGCAUACUGCAGUUUACGCGCCCCGGACCCCGGGAACGGACCUGGACAUAUCCACCAGGGCACGGCCUCUAGACAAAUCGGGACCGACAGUACGUCAAAACUCGGUGAGCCACAUGCCAGGGGGUCUAUGAUGCGACAUUUGAUCCACGUAGAACUUGUCGCGUAUACGACCGCCCUAUUCCUAUUGCUAGCAACAACAGGGGGUGCACUACUGUGCAUGACGCGACACUUCACCUUGGACGUCCGAUCGAAGCUUGGUUCGUUCGUUCUAGGAGCCGGCCAGGAGGGCUGCCCUCGUGUAGGUUAUCUAUCGCAUUUGGGCCUAUGUCUUAGACGGAUCACAAAAGCAUGUGCAAUCAGACGCGCUGAUAUACCCCACUGCGCGACUGUGCCAUUGACGACUCACCUUGUUGCCCUAGACGCAAUCCAACAUGGAGACCGUUCCUAUG
CGUUCUGUCUUAUUGUUGGACUCAGUGGACUUAUGGAUUGUUACGUGGAAAUCAACGUUAUAUAUCCUAUAAUUUACAUUCGACGCCCCAGACAGCAUACUGGGCGUGCAAUGGGGGAGUGGAUCCGCUCGAUACAUGCAGCUGAGGACUAUCAGAGAGAACAGCAUCAUUUUGCAAGUAUUCGCUGGCACUCACAUGCUACCGGCAGAUCCUUCCUUGCAGUCCGUCGUUUUGUUGGAGUUAAAGUUGCGUGGCCCUUAGUUGCUCUUGAGUCCCUGACGCUAGAAAGAGAUCGCUCCGUACAAAUGCCUCUCUGGUCCCUGCUAGAGCAAGCGUACACGAGGGUUGUAUUUUUCACCCCAAGAAAUGGGUUAUGUGUUCUAGACCAACAUUGCAGGUGGGCUGCGGCGACGCCAUCCAAGUCAUCGCGUAUGGAUCUUUUCUACGGCGAGCCAGUUUCUUGUGAAUGGGUCGCCGUCAGGCCUUCCAGAUAUGCUCAAGUAAGAAUAGGGAACGAUCACAGGGUCUCAUAUGGUACGAUUAGCGACUGGACCUCGCAUGAGUUAGUCAAUUACCUCCAAAACAAUCGGGAGGCGUACCUACCUUCCAUUCUACCCUCCAGCCUGGUAGCAGAUCCCACGGGUAGCGACCGGGAAACAUCUAUUAAACUUUGCACAAGCCGCGCAUACUCCGCGCACUUGAAGGAUUCGUGUCCCUCUGUUUGCCCUCUUAUACCAGAGCGUUCGCCGCGACCCGACAUUAAGUUUCAGCCGAAGAUGCCACGUGUACACGACAUGGCAUUCUGCGAUGUACUUCCGGUCGUCUGCCUAAUAUUGACCUCGACCCUGCCCGCCAAUAGGACCUCCGUGAAGAGUGCUCAAAAGGCGGGCCCCGCUUCUAAGGAGGAACCUACUCAGGACCCGAUGCGAGUCGUGGCUUCCGGGGAUGCGCUCGUAAGUACUAUUUCGUGCCCUAUUAUCGAUAUUUGGAGCAAAUACAAACAUCCUAAGGAAAGCGCUCGAAAAAACGUUAAACAGCAUCCAAGCCGCUGGUCGCGCGGAAAUGAAUGUCAACAGUCCGCGGGGUAUUAUGAUGAUAGGGACGGCCGCUGUGUUAAUUAUACCGGUCAUCUCCGAUACUUCCCGCUGCCGGGAGUGACUCGAUUUGUUAACGGCGAUUAUAUACUGCAAACCAACAUCACUAAGAGAACUGUGUCUUUGAUAAUUGGGCGCGCCAGAACCGCGCUAGCUAUAUUUACCCCGAGAAUUACUUGCGAUGGAAGUCGUAAUUUAAAGCCAAUAAUUGAAAAAAGCGGCCCAGGAAAGAGUCCUUUCGCACCAGGUUGGACCCGCAUGUCGUUGCCUCAACCACAGUGCAGGUUGUGCCCAACCGAAGCCCCGUGUUGUGCAGCCGAAUGGCCGUCAAUGCCACCCCUCGUCGUGUGGCCCCACUCCUUAGCUUUUUGGCCUCACGAAAGCUUGGAUGUCGUAUACAUUUUUAUCAUGACUUCAACCCCAGGCGUCUCCCGCAAUCUAUCGGGAUAUUUUAGUUGCCCAGUAGUUGCACGAGGACACGGUACGCACGCUAUGGCUCGUACCGUAAAACGACGUUUACAUAAGUUUCUGUCGUUUAGCGGUCACACAAAACGAUUUGAUGAACUAACUGCCACGGUCUGGCUUAUCAAGAUUAUAGCCCGAGUCUCCGUCACAGGUUCACUGGAGCGGGCUAGUCGCUCGAAUUUCGUAAGGGAGUGCUUCAGAUGUGUAGGCCGUCGCUCCGGGCGAUGGAUGACGUCUCGGUGGUCAAAUAUGACGCGAGAAUGGCCGCAUACAGGCCGAAGAUUUUGGAUUCCCUAUGAGCGUAUAGGUUGUCAAGUCAAGUGGAGGGUCACACAUUUUAGGUGGGUAACGGAAUGGGACUCAAAAACUGCCAUUCACGCGUCCUUACCGCCGUUAACCGACCUUCGAGGAUACGGCGUAUCGUACCCGCAACCAACUUGC
UUGACACUCAGUUGGCUUAGGCGCUGCCUACGGCGGACGGUGGUGAAAAGUGGUGAUGCUUACCCCUAUGCAUGCAUUGAUCUCGCGUACGUUAAUAAUGUAUGUAAUGCCUGGCGGCGGGUGAUUUGCUCACUGUGGACGCUAGCUAACGGUCCUAUGUCCGGUUAUAACUAUCAGAGGGCACUGGGCAGAGGGUCAAAUAGGAUUGUCAUGACCAAUUCUUCCACGUGCUGGCGAAGCCCGAUGUUUAUAAUACAUGUCUCAGAAGGCGAAGUUCGAUUAUCCGUGAGGCAAGGACAUCGCACAAUUCUGCUAUGCAACCCUGGGGCCUAUUCCCAAAGUCAUCGCUCUUUUUACGCUGAUGACCAAAACGGAAUUAACACGUGCUGUACGAUCGCGGAUUCGAAAACACAAGCUGAAGGUCUUGUGUUAGUGCGCUUCGAUUGCAAGCAAAUUGUCAAACUCACACGGCAGCGUCAGACUAGGAACACUUGGGACUACACUACAAUGGGAAGCGCCGAGAAAAGGUUCCAUGCGCGCCCCCGCACCAACAUAUCUCUACUGAAUGUAUACGACGCGACACCAUACAAUGGGCAUGGAGUGCAUGGGCCCAGUACGUUCCAAACGAACCUUGGACCGAACCGACGCUACCCGGUUCCCUGUCGAUUGCCCCCUCUAACUGCGAUAUUAACCAUAGGUCCGGGGCGGGUUGGAGAAUCCGCGCUCGGAAUAUUACGGAUCGUGGAUGAUGUGAUGUUGAAUAUAACGGGCCAUAAAGUAUUGACUACCCAUAUCAAGACAGGUGGGUAUGUCUCUGGUCGCGGGUUUCUGAUUGGUCGUCUUCAGGCGGACGGGUAUACGUUAAUUGCUAUUUUUCGGCCUACAUGGUACGACUGCUUCCCUCGAGUCUGUUUUCUGAUUCCAGUUAGGGCUUCCCCUCCAAUUAAGGGACUAGUCUGCUCUCCGGACACAGCCACCGAAUUCGCAACAAUGUCGGGUCUAUCAAAGCAGGACGCAGGAAACUCCGUGUCCACGUUAUCGACCCCACUCUCGCCGCCGGUGCUACGCAGGCACGGCCUCAACAAGCGGUGUCCAGAGACUGGAGUCUCAUUCAGCACCGGCCUACUAAGUCAAGUAAAUGAUAGCCAAGCUAGCGUAUCAAUAUACGUAGCUAAUCCAAAUACAAAGUGGUUCAGUCGAGGUGAGAACGUAAGUGGGUCACUCAUUGCAUGUGUCCGAUACAUUCGGACCGCCGCGAUGGGCUUAUGGUCAUGGUCAAUGUUUUUGGCGAUCGUAUUCAUCACUGACAGGAUCUUAACUUUACAGUUGAUGGGGGUAAAGUACGCCUCUGCUGUUAUGACGGCACCGAGAGGGCGCCAAAUGCCCCACAACCUUUCCUGUACUCAGAAUUGGACGCUUGUGCGUAAAGUUACUCCGGCUGUUACUGGCUCUCGAAUACCUCAGAGGACAGCAGGCUGUCUCAGGAAUCUAUACUGGUGUUCGUCCGAUAGCCCGACACUUUUCCGUGGAGGCCAGAUAAUGCCAAACUUUUACCGUAGGUCCUCACUCGUGCCUCGUCCAAUAGCGCGGAGUCACCGGCGGAUUGUACUGUCGGAACAAAUGAUCAACAUCGAUUGGGAGAGCAUCUGUUCACUACGGCACACGACAUGCUCGCAAAGGGCUUUAUGUCAUGAGUGGGCGAGGGGCUACGCUCAAGCGGCUACGUUAGCUAUCUUAAUUCACAGUGCGAUCCAAGACUUGUUCUAUCAGCUACUCAGCGCAGUGAUAUUUUUUUUUCCCAUUACGAUCGGACCUCGUAGAUUUGUCAAAUACGUUCCAAGAUCACCGCUAUGUCACUUGCGAUUAGUACUACGGCAUUGCUUUGAGCGUGAAGCUUUUGGCGGUCAUCCGAAUAGACCUGUUGUCAACACUCAUGCUCGUCAAUGUAUGGGUACGAAGACAGAGUGA"

# Translate the hardcoded RNA sequence to protein with Biopython's
# Seq.translate, stopping at the first stop codon.
str1 = Seq(str1)
protein = str1.translate(to_stop=True)
print(protein)

# Manual translation of the same sequence, read codon by codon from index 0
# until the first codon that RNA_Dict maps to "Stop".
# NOTE(review): assumes RNA_Dict is keyed by three-letter codon strings and
# maps them to one-letter residues or "Stop" -- confirm against its definition.
new = ""
for i in range(0, len(str1), 3):
    # The table must be indexed by the codon text, not by its positions.
    codon = str(str1[i:i+3])
    if len(codon) < 3:
        # Trailing partial codon: nothing left to translate.
        break
    symbol = RNA_Dict[codon]
    if symbol == "Stop":
        break
    new += symbol

# Collect every peptide read in the frame of the first AUG: each "Stop" codon
# closes the current peptide and starts a new one. A peptide still being built
# when the sequence ends is not appended to `out`.
out = []
protein = ""
start = str1.find('AUG')
tr = str1[start:]
for n in range(0, len(tr), 3):
    # Codons missing from the table (e.g. a trailing partial codon) are skipped.
    if tr[n:n+3] in RNA_Dict:
        if RNA_Dict[tr[n:n+3]] =="Stop":
            out.append(protein)
            protein = ""
        else:
            protein += RNA_Dict[tr[n:n+3]]
コード例 #35
0
    def a(self):
        """Map peptides onto their mRNA and list the covered positions.

        ``self.x`` supplies mRNA records of the form ``header@sequence``; the
        first space-separated token of the header is used as the accession.
        ``self.y`` supplies lines of the form ``accession|peptide|PSM``.

        For each peptide whose accession is known, the mRNA is translated in
        three forward frames and the first frame containing the peptide gives
        its 1-based start nucleotide. One tab-separated line (the ``loc:``
        value from the header, accession, peptide, position, PSM) is produced
        for every position from that start up to the first nucleotide of the
        peptide's last codon. Lines are written to ``pep_pos_in_mRNA.txt`` and
        also returned.

        Peptides that occur in none of the three frames are skipped.

        Returns:
            list of str: the output lines, without trailing newlines.
        """
        from Bio.Seq import Seq
        import re

        # Compiled once; captures the text between "loc:" and the next "|".
        loc_pattern = re.compile(r"loc:(.*?)\|")
        outputlist = []

        # Index the mRNA records by accession (first token of the header).
        dic = {}
        for x in self.x:  # rna sequence containing file
            header1 = x.split("@")[0]
            dic[header1.split(" ")[0]] = x

        # The context manager guarantees the file is flushed and closed even
        # if a malformed record raises part-way through.
        with open("pep_pos_in_mRNA.txt", "w") as outputfile:
            for num_y, y in enumerate(self.y):  # RNA acce|Pepseq|PSM
                if num_y % 100 == 0:
                    print(num_y)  # progress indicator
                y_ls = y.strip().split("|")
                mrnaacc10 = y_ls[0].strip()
                pepseq10 = y_ls[1].strip()
                psm10 = y_ls[2].strip()

                v = dic.get(mrnaacc10)
                if v is None:
                    continue

                v_ls = v.split("@")
                mrnaseq = v_ls[1].strip()
                chrom3 = loc_pattern.findall(v_ls[0])[0]

                # Pad with N so the length is a whole number of codons.
                mrnaseq1 = mrnaseq.upper() + "N" * ((-len(mrnaseq)) % 3)

                # Frames 2 and 3 drop leading bases and trim the tail so each
                # slice is still a multiple of three long.
                frames = (
                    Seq(mrnaseq1).translate(),
                    Seq(mrnaseq1[1:-2]).translate(),
                    Seq(mrnaseq1[2:-1]).translate(),
                )

                # Reset per record so a miss can never reuse the position
                # left over from an earlier peptide.
                pepstpos = None
                for offset, frame in enumerate(frames):
                    if pepseq10 in frame:
                        # 1-based position of the first nucleotide of the
                        # peptide's first codon, shifted by the frame offset.
                        pepstpos = frame.find(pepseq10) * 3 + 1 + offset
                        break
                if pepstpos is None:
                    continue

                # First nucleotide of the peptide's last codon.
                # NOTE(review): the last two bases of that codon are not
                # emitted -- confirm this is the intended end position.
                pepsppos = pepstpos + (len(pepseq10) - 1) * 3
                while pepstpos <= pepsppos:
                    result = "\t".join(
                        [chrom3, mrnaacc10, pepseq10, str(pepstpos), str(psm10)])
                    outputlist.append(result)
                    outputfile.write(result + "\n")
                    pepstpos += 1

        return outputlist
コード例 #36
0
class seq_utilitiesTest(unittest.TestCase):
    """Unit tests for the stop-codon, trimming and ambiguity helpers.

    Each helper is exercised with plain strings, ``Seq`` objects and
    ``SeqRecord`` objects built from the same four fixture sequences.
    """

    def setUp(self):
        """Build the string, Seq, SeqRecord and FASTQ-like fixtures."""
        # s1: in-frame TAG at index 3, followed by more sequence.
        # s2: the only in-frame stop is the final codon (index 15).
        # s3: no in-frame stop codon.
        # s4: s3 with the ambiguity code W in place of one base.
        self.s1 = 'GACTAGACTTAGT'
        self.s2 = 'GACTATAACTTAATATAG'
        self.s3 = 'GACTATAACTTAATATAC'
        self.s4 = 'GACTATAACTTAATAWAC'
        self.seq1 = Seq(self.s1)
        self.seq2 = Seq(self.s2)
        self.seq3 = Seq(self.s3)
        self.seq4 = Seq(self.s4)
        self.sr1 = SeqRecord(self.seq1)
        self.sr2 = SeqRecord(self.seq2)
        self.sr3 = SeqRecord(self.seq3)
        self.sr4 = SeqRecord(self.seq4)
        # One quality value per base of seq2.
        self.fastq2 = SeqRecord(self.seq2,
                                letter_annotations={
                                    "phred_quality": [
                                        1, 2, 4, 10, 20, 25, 33, 22, 33, 35,
                                        18, 11, 23, 8, 2, 1, 2, 0
                                    ]
                                })

    def test_find(self):
        """find_first_stop / internal_stop on str, Seq and SeqRecord input."""
        self.assertEqual(find_first_stop(self.s1), 3, "Misplaced stop codon")
        self.assertEqual(find_first_stop(self.seq1), 3,
                         "Misplaced stop codon in seq1: " + str(self.seq1))
        self.assertEqual(find_first_stop(self.sr1), 3, "Misplaced stop codon")
        self.assertTrue(internal_stop(self.s1))

        self.assertEqual(find_first_stop(self.s2), 15, "Misplaced stop codon")
        self.assertEqual(find_first_stop(self.seq2), 15,
                         "Misplaced stop codon in seq2: " + str(self.seq2))
        self.assertEqual(find_first_stop(self.sr2), 15, "Misplaced stop codon")
        # A stop that is the last codon does not count as internal.
        self.assertFalse(internal_stop(self.seq2))

        self.assertIsNone(find_first_stop(self.s3), "Misplaced stop codon")
        self.assertIsNone(find_first_stop(self.seq3),
                          "Misplaced stop codon in seq3: " + str(self.seq3))
        self.assertIsNone(find_first_stop(self.sr3), "Misplaced stop codon")
        self.assertFalse(internal_stop(self.sr3))

    def test_trimORF(self):
        """trim_at_first_stop keeps everything before the first stop codon."""
        t1 = 'GAC'
        self.assertEqual(trim_at_first_stop(self.s1), t1,
                         "Misplaced stop codon")
        self.assertEqual(trim_at_first_stop(str(self.seq1)), t1,
                         "Misplaced stop codon in seq1: " + str(self.seq1))
        self.assertEqual(trim_at_first_stop(str(self.sr1.seq)), t1,
                         "Misplaced stop codon")

        t2 = 'GACTATAACTTAATA'
        self.assertEqual(trim_at_first_stop(self.s2), t2,
                         "Misplaced stop codon")
        self.assertEqual(trim_at_first_stop(str(self.seq2)), t2,
                         "Misplaced stop codon in seq2: " + str(self.seq2))
        self.assertEqual(trim_at_first_stop(str(self.sr2.seq)), t2,
                         "Misplaced stop codon")

        # No stop codon: the sequence comes back unchanged.
        t3 = 'GACTATAACTTAATATAC'
        self.assertEqual(trim_at_first_stop(self.s3), t3,
                         "Misplaced stop codon")
        self.assertEqual(trim_at_first_stop(str(self.seq3)), t3,
                         "Misplaced stop codon in seq3: " + str(self.seq3))
        self.assertEqual(trim_at_first_stop(str(self.sr3.seq)), t3,
                         "Misplaced stop codon")

    def test_trimFASTQ(self):
        """trimFASTQtoFirstBase at several quality thresholds."""
        trimmed = trimFASTQtoFirstBase(self.fastq2, 10)
        self.assertEqual(self.seq2.find(trimmed.seq), 3,
                         "Left trim is wrong 1")
        self.assertEqual(
            self.seq2.reverse_complement().find(
                trimmed.seq.reverse_complement()), 5, "Right trim is wrong")

        # No base reaches quality 40, so nothing should be returned.
        # Identity check: does not rely on == between a record and None.
        trimmed = trimFASTQtoFirstBase(self.fastq2, 40)
        self.assertIsNone(trimmed, "Sequence should be discarded")

        trimmed = trimFASTQtoFirstBase(self.fastq2, 22)
        self.assertEqual(self.seq2.find(trimmed.seq), 5,
                         "Left trim is wrong 2")
        self.assertEqual(
            self.seq2.reverse_complement().find(
                trimmed.seq.reverse_complement()), 5, "Right trim is wrong")

        trimmed = trimFASTQtoFirstBase(self.fastq2, 23)
        self.assertEqual(self.seq2.find(trimmed.seq), 5,
                         "Left trim is wrong 3")
        self.assertEqual(
            self.seq2.reverse_complement().find(
                trimmed.seq.reverse_complement()), 5, "Right trim is wrong")

        # Only the base at index 9 has quality 35.
        trimmed = trimFASTQtoFirstBase(self.fastq2, 35)
        self.assertEqual(len(trimmed), 1, "trimmed too long")
        self.assertEqual(trimmed[0], self.fastq2[10], "Wrong base recovered")

    def test_Ambig(self):
        """unambiguous_sequence accepts s1-s3 and rejects s4 (contains W)."""
        self.assertTrue(unambiguous_sequence(self.s1))
        self.assertTrue(unambiguous_sequence(self.seq1))
        self.assertTrue(unambiguous_sequence(self.sr1))
        self.assertTrue(unambiguous_sequence(self.s2))
        self.assertTrue(unambiguous_sequence(self.seq2))
        self.assertTrue(unambiguous_sequence(self.sr2))
        self.assertTrue(unambiguous_sequence(self.s3))
        self.assertTrue(unambiguous_sequence(self.seq3))
        self.assertTrue(unambiguous_sequence(self.sr3))
        self.assertFalse(unambiguous_sequence(self.s4))
        self.assertFalse(unambiguous_sequence(self.seq4))
        self.assertFalse(unambiguous_sequence(self.sr4))
コード例 #37
0
def generate_se_sequence_combination(df_SE, dfname, dbname, rna_db, het,
                                     exclude, output_name):

    version = get_version(rna_db)
    if (version == 'swissprot'):
        db = 'sp'
    else:
        db = version

    trans_seq_dict, trans_coding_dict = generate_all_se_db.get_trans_seq_dict(
        rna_db)
    print("transcript sequence dict ready")
    print(len(list(trans_seq_dict.items())))

    df = pd.read_csv(dfname, sep='\t', header=None)
    change_df = generate_all_mutation_db.extract_transcript_change(df)
    trans_index_dict = {}
    for i in range(0, change_df.shape[0]):
        if (change_df.iloc[i]['mrna'] not in trans_index_dict.keys()):
            trans_index_dict[change_df.iloc[i]['mrna']] = [i]
        else:
            trans_index_dict[change_df.iloc[i]['mrna']].append(i)

    trans_records, strand_dict, protein_id_dict = generate_all_se_db.get_trans_records(
        dbname, trans_seq_dict)

    splicing_event_dict = generate_all_se_db.get_splicing_event_dict(
        df_SE, trans_records)
    print("splicing events dict ready")

    trans_exon_dict = generate_all_se_db.get_transcript_exon_dict(
        trans_records)
    print("transcript exon dict ready")

    middle = time.time()
    print("---- %s minutes ----" % ((middle - begin) / 60))

    trans_db_annotation_dict = {}  ##output
    cnt = 0
    trans_records = trans_records.sort_values(by=['exon_id'], axis=0)

    for k in splicing_event_dict.keys():
        cnt += 1
        if (cnt % 1000 == 0):
            print(cnt)
        down_id = splicing_event_dict[k][0]
        skip_id = splicing_event_dict[k][1]
        up_id = splicing_event_dict[k][2]
        for key in trans_exon_dict.keys():
            T = trans_exon_dict[key]
            if ((down_id in T) and (up_id in T)):
                #print(key)
                down_index = T.index(down_id)
                #print(down_index)
                up_index = T.index(up_id)
                low_index = min(down_index, up_index)
                high_index = max(down_index, up_index)

                tmp_exon_coordinate_dict = {}
                tmp_exon_position_dict = {}
                tmp_pos = 0
                for item in range(1, len(T)):
                    ei = generate_all_se_db.find_key_index(
                        'exon_id', T[item], trans_records)
                    if (ei == -1):
                        print("the exon id cannot be found")
                    else:
                        tmp_start = trans_records.iloc[ei]['start']
                        tmp_end = trans_records.iloc[ei]['end']
                        tmp_exon_coordinate_dict[T[item]] = (
                            trans_records.iloc[ei]['chr'], tmp_start, tmp_end)

                        tmp_exon_position_dict[item] = (tmp_pos,
                                                        tmp_pos + tmp_end -
                                                        tmp_start + 1)
                        tmp_pos += tmp_end - tmp_start + 1

                tmp_seq = trans_seq_dict[key]
                coding_start = trans_coding_dict[key][0]
                coding_end = trans_coding_dict[key][1]

                het_list = []
                hom_position_list = []
                if ((key in trans_index_dict.keys())
                        and (key in strand_dict.keys())):
                    shift = 0
                    transcript = tmp_seq
                    hom_des = []

                    for i in trans_index_dict[key]:
                        if (change_df.iloc[i]['mutation_type'].find('snv') !=
                                -1):
                            if (change_df.iloc[i]['snp_type'] == 'hom'):
                                hom_position_list.append(
                                    (int(change_df.iloc[i]['c_start']),
                                     change_df.iloc[i]['mutation_type']))

                                if (strand_dict[key] == '-'):
                                    tmp = Seq(
                                        str(tmp_seq),
                                        IUPAC.ambiguous_dna).complement()
                                    tmp = str(tmp)
                                    transcript=generate_all_mutation_db.change_seq(tmp,int(change_df.iloc[i]['c_start'])+shift+coding_start-1,\
                                                          int(change_df.iloc[i]['c_end'])+shift+coding_start-1,str(Seq(str(change_df.iloc[i]['c_content'])).complement()),\
                                                          change_df.iloc[i]['mutation_type'])
                                else:
                                    transcript=generate_all_mutation_db.change_seq(tmp_seq,int(change_df.iloc[i]['c_start'])+shift+coding_start-1,\
                                                          int(change_df.iloc[i]['c_end'])+shift+coding_start-1,change_df.iloc[i]['c_content'],\
                                                          change_df.iloc[i]['mutation_type'])
                                hom_des.append(change_df.iloc[i]['mutation_type']+":"+str(change_df.iloc[i]['c_start'])+'-'+\
                                str(change_df.iloc[i]['c_end'])+str(change_df.iloc[i]['c_content'])+'_')

#                                if(change_df.iloc[i]['mutation_type'].find('del')!=-1):
#                                    shift-=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
#                                elif(change_df.iloc[i]['mutation_type'].find('ins')!=-1):
#                                    shift+=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
                            else:
                                het_list.append(i)

                    het_number = len(het_list)
                    het_des = []
                    het_seqs = []
                    if (len(hom_position_list) != 0):
                        if (strand_dict[key] == '-'):
                            tmp = Seq(str(transcript),
                                      IUPAC.ambiguous_dna).complement()
                            tmp = str(tmp)
                        else:
                            tmp = transcript
                        het_seqs.append(tmp)

                    else:
                        if (strand_dict[key] == '-'):
                            transcript = Seq(str(transcript),
                                             IUPAC.ambiguous_dna).complement()
                            transcript = str(transcript)
                        het_seqs.append(transcript)
                    het_des.append("")

                    if (het == 1):
                        for n in range(0, het_number):
                            tmp_het_des = []
                            new_sequence = ""
                            splicing_position = 0
                            if ((high_index - low_index) == 1):
                                splicing_position = tmp_exon_position_dict[
                                    low_index][1]
                            elif (T[low_index + 1] == skip_id):
                                splicing_position = tmp_exon_position_dict[
                                    low_index + 1][0]
                            if (abs(
                                    int(change_df.iloc[het_list[n]]['c_start'])
                                    - splicing_position) <= 900):
                                if ((int(change_df.iloc[het_list[n]]
                                         ['c_start']) >= coding_start - 1) &
                                    (int(change_df.iloc[het_list[n]]
                                         ['c_start']) <= coding_end)):
                                    if (strand_dict[key] == '-'):
                                        new_sequence=generate_all_mutation_db.change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift+coding_start-1,\
                                                              int(change_df.iloc[het_list[n]]['c_end'])+shift+coding_start-1,\
                                                              str(Seq(str(change_df.iloc[het_list[n]]['c_content'])).complement()),\
                                                              change_df.iloc[het_list[n]]['mutation_type'])
                                    else:
                                        new_sequence=generate_all_mutation_db.change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift+coding_start-1,\
                                                              int(change_df.iloc[het_list[n]]['c_end'])+shift+coding_start-1,change_df.iloc[het_list[n]]['c_content'],\
                                                              change_df.iloc[het_list[n]]['mutation_type'])
                                    tmp_het_des.append(change_df.iloc[het_list[n]]['mutation_type']+":"+\
                                    str(change_df.iloc[het_list[n]]['c_start'])+'-'+str(change_df.iloc[het_list[n]]['c_end'])+\
                                    str(change_df.iloc[het_list[n]]['c_content']))

    #                                if(change_df.iloc[i]['mutation_type'].find('del')!=-1):
    #                                    shift-=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
    #                                elif(change_df.iloc[i]['mutation_type'].find('ins')!=-1):
    #                                    shift+=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
                                het_des.append(tmp_het_des)
                                if (strand_dict[key] == '-'):
                                    new_seq = Seq(
                                        str(new_sequence),
                                        IUPAC.ambiguous_dna).complement()
                                    new_seq = str(new_seq)
                                else:
                                    new_seq = new_sequence
                                het_seqs.append(new_seq)
                    elif (het == 2):
                        for n in range(0, het_number):
                            tmp_het_des = []
                            new_sequence = ""
                            splicing_position = 0
                            if ((high_index - low_index) == 1):
                                splicing_position = tmp_exon_position_dict[
                                    low_index][1]
                            elif (T[low_index + 1] == skip_id):
                                splicing_position = tmp_exon_position_dict[
                                    low_index + 1][0]
                            if (abs(
                                    int(change_df.iloc[het_list[n]]['c_start'])
                                    - splicing_position) <= 900):
                                if ((int(change_df.iloc[het_list[n]]
                                         ['c_start']) >= coding_start - 1) &
                                    (int(change_df.iloc[het_list[n]]
                                         ['c_start']) <= coding_end)):
                                    if (strand_dict[key] == '-'):
                                        new_sequence=generate_all_mutation_db.change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift+coding_start-1,\
                                                              int(change_df.iloc[het_list[n]]['c_end'])+shift+coding_start-1,\
                                                              str(Seq(str(change_df.iloc[het_list[n]]['c_content'])).complement()),\
                                                              change_df.iloc[het_list[n]]['mutation_type'])
                                    else:
                                        new_sequence=generate_all_mutation_db.change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift+coding_start-1,\
                                                              int(change_df.iloc[het_list[n]]['c_end'])+shift+coding_start-1,change_df.iloc[het_list[n]]['c_content'],\
                                                              change_df.iloc[het_list[n]]['mutation_type'])
                                    tmp_het_des.append(change_df.iloc[het_list[n]]['mutation_type']+":"+\
                                    str(change_df.iloc[het_list[n]]['c_start'])+'-'+str(change_df.iloc[het_list[n]]['c_end'])+\
                                    str(change_df.iloc[het_list[n]]['c_content']))

    #                                if(change_df.iloc[i]['mutation_type'].find('del')!=-1):
    #                                    shift-=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
    #                                elif(change_df.iloc[i]['mutation_type'].find('ins')!=-1):
    #                                    shift+=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
                                het_des.append(tmp_het_des)

                            for j in range(n, len(het_list)):
                                tmp_new_sequence = new_sequence
                                tmp_new_des = tmp_het_des

                                if (j != n):
                                    if (abs(
                                            int(change_df.iloc[het_list[j]]
                                                ['c_start']) -
                                            splicing_position) <= 900):
                                        if ((int(change_df.iloc[het_list[j]]
                                                 ['c_start']) >=
                                             coding_start - 1) &
                                            (int(change_df.iloc[het_list[j]]
                                                 ['c_start']) <= coding_end)):
                                            if (strand_dict[key] == '-'):
                                                tmp_new_sequence=generate_all_mutation_db.change_seq(tmp_new_sequence,int(change_df.iloc[het_list[j]]['c_start'])+shift,\
                                                                      int(change_df.iloc[het_list[j]]['c_end'])+shift,\
                                                                      str(Seq(str(change_df.iloc[het_list[j]]['c_content'])).complement()),\
                                                                      change_df.iloc[het_list[j]]['mutation_type'])
                                            else:
                                                tmp_new_sequence=generate_all_mutation_db.change_seq(tmp_new_sequence,int(change_df.iloc[het_list[j]]['c_start'])+shift,\
                                                                      int(change_df.iloc[het_list[j]]['c_end'])+shift,change_df.iloc[het_list[j]]['c_content'],\
                                                                      change_df.iloc[het_list[j]]['mutation_type'])
                                            tmp_new_des.append(change_df.iloc[het_list[j]]['mutation_type']+":"+\
                                                str(change_df.iloc[het_list[j]]['c_start'])+'-'+str(change_df.iloc[het_list[j]]['c_end'])+\
                                                str(change_df.iloc[het_list[j]]['c_content']))
                                        het_des.append(tmp_new_des)

                                        if (strand_dict[key] == '+'):
                                            new_seq = tmp_new_sequence
                                        else:
                                            new_seq = str(
                                                Seq(str(tmp_new_sequence),
                                                    IUPAC.ambiguous_dna).
                                                complement())
                                        het_seqs.append(new_seq)

                    for c in range(
                            0, len(het_seqs)
                    ):  ##tmp_cnt is the number of modified sequences
                        tmp_seq = het_seqs[c]

                        if ((high_index - low_index) == 1):

                            position = tmp_exon_position_dict[low_index][1]
                            if (position >= coding_start):
                                db_seq = tmp_seq[coding_start - 1:]
                                if (db_seq not in
                                        trans_db_annotation_dict.keys()):
                                    trans_db_annotation_dict[db_seq] = [
                                        key,
                                        str(k) + "_exclusive_from_" +
                                        str(position - coding_start + 1)
                                    ]
                                else:
                                    des_tmp = str(
                                        k) + "_exclusive_from_" + str(
                                            position - coding_start + 1)
                                    if (des_tmp not in
                                            trans_db_annotation_dict[db_seq]):
                                        trans_db_annotation_dict[
                                            db_seq].append(des_tmp)

                                tmp_anno = ""
                                for d in hom_des:
                                    tmp_anno += d

                                for d in het_des[c]:
                                    tmp_anno += d
                                trans_db_annotation_dict[db_seq].append(
                                    tmp_anno)
                                #tmp_info=exon_coordinate_dict[skip_id]
                                tmp_skip_id_index = generate_all_se_db.find_key_index(
                                    'exon_id', skip_id, trans_records)
                                tmp_info=(trans_records.iloc[tmp_skip_id_index]['chr'],trans_records.iloc[tmp_skip_id_index]['start'],\
                                          trans_records.iloc[tmp_skip_id_index]['end'])

                                skip_seq = generate_all_se_db.fetch_exon_seq(
                                    tmp_info[0], tmp_info[1], tmp_info[2],
                                    T[0])
                                trans_prime_seq = generate_all_se_db.add_exon(
                                    tmp_seq, skip_seq, position)
                                db_prime_seq = trans_prime_seq[coding_start -
                                                               1:]
                                if (db_prime_seq not in
                                        trans_db_annotation_dict.keys()):
                                    trans_db_annotation_dict[db_prime_seq]=[key,"modified_"+str(k)+"_inclusive_from_"\
                                                            +str(position-coding_start+1)+"_to_"+\
                                                            str(position-coding_start+1+len(skip_seq))]
                                else:
                                    des_tmp="modified_"+str(k)+"_inclusive_from_"+\
                                                            str(position-coding_start+1)+"_to_"+\
                                                            str(position-coding_start+1+len(skip_seq))
                                    if (des_tmp
                                            not in trans_db_annotation_dict[
                                                db_prime_seq]):
                                        trans_db_annotation_dict[
                                            db_prime_seq].append(des_tmp)

                                tmp_anno = ""
                                for d in hom_des:
                                    tmp_anno += d
                                for d in het_des[c]:
                                    tmp_anno += d
                                trans_db_annotation_dict[db_prime_seq].append(
                                    tmp_anno)

                        elif (T[low_index + 1] == skip_id):

                            skip_exon_start = tmp_exon_position_dict[low_index
                                                                     + 1][0]
                            skip_exon_end = tmp_exon_position_dict[low_index +
                                                                   1][1]

                            if (skip_exon_start >= coding_start):
                                db_seq = tmp_seq[coding_start - 1:]
                                if (db_seq not in
                                        trans_db_annotation_dict.keys()):
                                    trans_db_annotation_dict[db_seq]=[key,str(k)+"_inclusive_from_"+str(skip_exon_start-coding_start+1)\
                                                            +"_to_"+str(skip_exon_end-coding_start+1)]
                                else:
                                    des_tmp=str(k)+"_inclusive_from_"+str(skip_exon_start-coding_start+1)\
                                                            +"_to_"+str(skip_exon_end-coding_start+1)
                                    if (des_tmp not in
                                            trans_db_annotation_dict[db_seq]):
                                        trans_db_annotation_dict[
                                            db_seq].append(des_tmp)

                                tmp_anno = ""
                                for d in hom_des:
                                    tmp_anno += d
                                for d in het_des[c]:
                                    tmp_anno += d
                                trans_db_annotation_dict[db_seq].append(
                                    tmp_anno)

                                trans_prime_seq = generate_all_se_db.remove_exon(
                                    tmp_seq, skip_exon_start, skip_exon_end)
                                db_prime_seq = trans_prime_seq[coding_start -
                                                               1:]
                                if (db_prime_seq not in
                                        trans_db_annotation_dict.keys()):
                                    trans_db_annotation_dict[db_prime_seq] = [
                                        key, "modified_" + str(k) +
                                        "_exclusive_from_" +
                                        str(skip_exon_start - coding_start + 1)
                                    ]
                                else:
                                    des_tmp="modified_"+str(k)+"_exclusive_from_"+\
                                                            str(skip_exon_start-coding_start+1)
                                    if (des_tmp
                                            not in trans_db_annotation_dict[
                                                db_prime_seq]):
                                        trans_db_annotation_dict[
                                            db_prime_seq].append(des_tmp)

                                tmp_anno = ""
                                for d in hom_des:
                                    p = get_variant_pos_from_des(d)
                                    if p not in range(skip_exon_start,
                                                      skip_exon_end):
                                        tmp_anno += d
                                for d in het_des[c]:
                                    p = get_variant_pos_from_des(d)
                                    if p not in range(skip_exon_start,
                                                      skip_exon_end):
                                        tmp_anno += d
                                trans_db_annotation_dict[db_prime_seq].append(
                                    tmp_anno)

                if ((len(hom_position_list) == 0) and (len(het_list) == 0)):
                    coding_start = trans_coding_dict[key][0]

                    tmp_exon_coordinate_dict = {}
                    tmp_exon_position_dict = {}
                    tmp_pos = 0
                    for item in range(1, len(T)):
                        ei = generate_all_se_db.find_key_index(
                            'exon_id', T[item], trans_records)
                        if (ei == -1):
                            print("the exon id cannot be found")
                        else:
                            tmp_start = trans_records.iloc[ei]['start']
                            tmp_end = trans_records.iloc[ei]['end']
                            tmp_exon_coordinate_dict[T[item]] = (
                                trans_records.iloc[ei]['chr'], tmp_start,
                                tmp_end)

                            tmp_exon_position_dict[item] = (tmp_pos,
                                                            tmp_pos + tmp_end -
                                                            tmp_start + 1)
                            tmp_pos += tmp_end - tmp_start + 1

                    if ((high_index - low_index) == 1):
                        position = tmp_exon_position_dict[low_index][1]
                        if (position >= coding_start):
                            db_seq = tmp_seq[coding_start - 1:]
                            if (db_seq not in trans_db_annotation_dict.keys()):
                                trans_db_annotation_dict[db_seq] = [
                                    key,
                                    str(k) + "_exclusive_from_" +
                                    str(position - coding_start + 1)
                                ]
                            else:
                                des_tmp = str(k) + "_exclusive_from_" + str(
                                    position - coding_start + 1)
                                if (des_tmp not in
                                        trans_db_annotation_dict[db_seq]):
                                    trans_db_annotation_dict[db_seq].append(
                                        des_tmp)

                            tmp_skip_id_index = generate_all_se_db.find_key_index(
                                'exon_id', skip_id, trans_records)
                            tmp_info=(trans_records.iloc[tmp_skip_id_index]['chr'],trans_records.iloc[tmp_skip_id_index]['start'],\
                                      trans_records.iloc[tmp_skip_id_index]['end'])

                            skip_seq = generate_all_se_db.fetch_exon_seq(
                                tmp_info[0], tmp_info[1], tmp_info[2], T[0])
                            trans_prime_seq = generate_all_se_db.add_exon(
                                tmp_seq, skip_seq, position)
                            db_prime_seq = trans_prime_seq[coding_start - 1:]
                            if (db_prime_seq
                                    not in trans_db_annotation_dict.keys()):
                                trans_db_annotation_dict[db_prime_seq]=[key,"modified_"+str(k)+"_inclusive_from_"\
                                                        +str(position-coding_start+1)+"_to_"+\
                                                        str(position-coding_start+1+len(skip_seq))]
                            else:
                                des_tmp="modified_"+str(k)+"_inclusive_from_"+\
                                                        str(position-coding_start+1)+"_to_"+\
                                                        str(position-coding_start+1+len(skip_seq))
                                if (des_tmp not in trans_db_annotation_dict[
                                        db_prime_seq]):
                                    trans_db_annotation_dict[
                                        db_prime_seq].append(des_tmp)
                    elif (T[low_index + 1] == skip_id):
                        skip_exon_start = tmp_exon_position_dict[low_index +
                                                                 1][0]
                        skip_exon_end = tmp_exon_position_dict[low_index +
                                                               1][1]

                        if (skip_exon_start >= coding_start):
                            db_seq = tmp_seq[coding_start - 1:]
                            if (db_seq not in trans_db_annotation_dict.keys()):
                                trans_db_annotation_dict[db_seq]=[key,str(k)+"_inclusive_from_"+str(skip_exon_start-coding_start+1)\
                                                        +"_to_"+str(skip_exon_end-coding_start+1)]
                            else:
                                des_tmp=str(k)+"_inclusive_from_"+str(skip_exon_start-coding_start+1)\
                                                        +"_to_"+str(skip_exon_end-coding_start+1)
                                if (des_tmp not in
                                        trans_db_annotation_dict[db_seq]):
                                    trans_db_annotation_dict[db_seq].append(
                                        des_tmp)

                            trans_prime_seq = generate_all_se_db.remove_exon(
                                tmp_seq, skip_exon_start, skip_exon_end)
                            db_prime_seq = trans_prime_seq[coding_start - 1:]
                            if (db_prime_seq
                                    not in trans_db_annotation_dict.keys()):
                                trans_db_annotation_dict[db_prime_seq] = [
                                    key,
                                    "modified_" + str(k) + "_exclusive_from_" +
                                    str(skip_exon_start - coding_start + 1)
                                ]
                            else:
                                des_tmp="modified_"+str(k)+"_exclusive_from_"+\
                                                        str(skip_exon_start-coding_start+1)
                                if (des_tmp not in trans_db_annotation_dict[
                                        db_prime_seq]):
                                    trans_db_annotation_dict[
                                        db_prime_seq].append(des_tmp)

    trans_records = trans_records.sort_values(by=['trans_id'])
    my_seqs = []
    # my_transcripts=[]
    modified_trans = []
    mutation_seq_cnt = 0
    for k in trans_db_annotation_dict.keys():
        annotation_list = trans_db_annotation_dict[k]
        trans_id = annotation_list[0]
        modified_trans.append(trans_id)
        #coding_start=trans_coding_dict[trans_id][0]
        #print(trans_id)
        pid = protein_id_dict[trans_id][0]
        gid = protein_id_dict[trans_id][1]

        tmp_id = db + "|" + pid + '|' + trans_id + '|' + gid

        des = ''
        flag = 0
        for an in range(1, len(annotation_list)):
            if (annotation_list[an].find('snv') != -1):
                flag = 1
            des += annotation_list[an] + '|'
#            my_transcripts.append(SeqRecord(Seq(str(k),IUPAC.ambiguous_dna),id=tmp_id,description=des))
        if (flag == 1):
            mutation_seq_cnt += 1
        new_seq = str(
            Seq(str(k),
                IUPAC.ambiguous_dna).transcribe().translate(to_stop=True))

        while (new_seq.find('None') != -1):
            new_seq = new_seq.replace('None', '')
        my_seqs.append(
            SeqRecord(Seq(str(new_seq), IUPAC.protein),
                      id=tmp_id,
                      description=des))

    print("The number of sequences with mutation generated is " +
          str(mutation_seq_cnt))

    if (exclude != True):
        original_cnt = 0
        for t in trans_seq_dict.keys():
            if t not in modified_trans:

                coding_start = int(trans_coding_dict[t][0])
                coding_end = int(trans_coding_dict[t][1])
                trans_seq = str(trans_seq_dict[t])
                trans_seq = trans_seq[coding_start - 1:]
                if (trans_seq not in trans_db_annotation_dict.keys()):

                    pid = protein_id_dict[t][0]
                    gid = protein_id_dict[t][1]
                    new_sequence = trans_seq

                    new_seq = str(
                        Seq(str(new_sequence),
                            IUPAC.ambiguous_dna).transcribe().translate(
                                to_stop=True))
                    #            else:
                    #                new_seq=str(Seq(str(new_sequence),IUPAC.ambiguous_dna).complement().transcribe().translate(to_stop=True))

                    my_seqs.append(SeqRecord(Seq(str(new_seq),IUPAC.protein),\
                                                 id=db+'|'+pid+'|'+t+'|'+gid+'_0:'+str(coding_start)+'-'+str(coding_end),\
                                             description="no splicing"))
                    original_cnt += 1

        print("The number of original sequences is " + str(original_cnt))

    print("The number of sequences generated is " + str(len(my_seqs)))

    handle = open(output_name + ".fasta", "w")
    #handle=open(dataset_name+"_all_se_db"+".fasta","w")
    for sequence in my_seqs:
        SeqIO.write(sequence, handle, "fasta")
    handle.close()


#    handle_trans=open("../data/"+dataset_name+"/"+dataset_name+"_all_se_transcripts"+".fasta","w")
#    for sequence in my_transcripts:
#        SeqIO.write(sequence,handle_trans,"fasta")
#    handle_trans.close()
#return my_seqs

#dbname="../data/gencode.v28.basic.annotation.gff3"
#SE_name=argv[1]
#dfname=argv[2]
#rna_db=argv[3]
#output_name=argv[4]
#
#df_SE=pd.read_csv(SE_name,sep='\t')
#generate_se_sequence_combination(df_SE,dfname,dbname,rna_db,output_name)
##
#finish=time.time()
#print("---- %s minutes ----" % ((finish-begin)/60))
コード例 #38
0
# 4.4.9.orf_finder.py

from Bio.Seq import Seq

# Toy ORF finder: slice from the first "ATG" through the first "TAG" found
# at or after it in a short example sequence.
tatabox_seq = Seq("tataaaggcAATATGCAGTAG")
start_idx = tatabox_seq.find("ATG")
end_idx = tatabox_seq.find("TAG", start_idx)  # TAG is used as the stop codon purely to keep the example simple.
orf = tatabox_seq[start_idx:end_idx + 3]  # Seq objects can be sliced the same way as Python strings.
print(orf)  # ATGCAGTAG
コード例 #39
0
def _translate_for_strand(dna, strand):
    """Translate ``dna`` up to the first stop codon.

    For any ``strand`` other than '+', the sequence is complemented before
    it is transcribed and translated.
    """
    template = Seq(str(dna), IUPAC.ambiguous_dna)
    if strand != '+':
        template = template.complement()
    return str(template.transcribe().translate(to_stop=True))


def _drop_none_tokens(text):
    """Remove every occurrence of the literal substring 'None' from ``text``."""
    # Repeat until stable: one replace pass can splice a new 'None' together.
    while text.find('None') != -1:
        text = text.replace('None', '')
    return text


def get_new_sequence(dfname, dbname, rna_db, het, exclude, output_name):
    """Write a protein FASTA built from transcripts and their variants.

    The coding region of every transcript in ``rna_db`` is cut out, the
    variants listed in ``dfname`` are applied to it with ``change_seq``, and
    the translated results are written to ``output_name + ".fasta"``.
    Progress and summary counters are printed to stdout.

    Args:
        dfname: Path to a tab-separated, header-less table; it is read with
            pandas and handed to ``extract_transcript_change``.
        dbname: Annotation source passed to ``DataIterator``. Only records
            whose third field is 'transcript' and that carry a
            'transcript_type' attribute are used.
        rna_db: FASTA file whose record ids look like
            ``<mrna_id>|...CDS:<start>-<end>|...``.
        het: When equal to 1, additionally emit one sequence per
            heterozygous variant per window (plus a randomised companion).
        exclude: When False, also emit the unmodified ("no variant")
            translation of transcripts without homozygous variants.
        output_name: Output path without the ".fasta" suffix.

    Returns:
        0 if a record id in ``rna_db`` lacks the 'CDS:' field or the '-'
        inside its range (nothing is written in that case); otherwise None.
    """
    version = get_version(rna_db)
    db = 'sp' if version == 'swissprot' else version

    sequence_dict = {}
    # First pass: reject the whole file before doing any work if a single
    # record id does not carry a "CDS:<start>-<end>" field.
    for correct in SeqIO.parse(rna_db, 'fasta'):
        tmp = correct.id
        flag = tmp.find("|")
        tmp = tmp[flag + 1:]
        cds = tmp.find('CDS:')
        if (cds == -1):
            print("The format of file of parameter -r(--rna) is incorrect!")
            return 0
        tmp = tmp[cds:]
        cds_end = tmp.find('|')
        tmp = tmp[tmp.find(':') + 1:cds_end]

        split_flag = tmp.find('-')
        if (split_flag == -1):
            print("The format of file of parameter -r(--rna) is incorrect!")
            return 0

    # Second pass: mrna_id -> (1-based CDS start, full sequence, CDS end).
    for seq in SeqIO.parse(rna_db, 'fasta'):
        tmp = seq.id
        flag = tmp.find("|")
        mrna_id = tmp[:flag]
        tmp = tmp[flag + 1:]
        cds = tmp.find('CDS:')
        tmp = tmp[cds:]
        cds_end = tmp.find('|')
        tmp = tmp[tmp.find(':') + 1:cds_end]

        split_flag = tmp.find('-')
        coding_start = int(tmp[:split_flag])
        coding_end = int(tmp[split_flag + 1:])
        sequence_dict[mrna_id] = (coding_start, seq.seq, coding_end)

    print("sequence_dict ready")

    # Strand and (protein_id, gene_id) for every annotated transcript that
    # also has a sequence.
    strand_dict = {}
    protein_id_dict = {}
    for record in DataIterator(dbname):
        if (record[2] != 'transcript'
                or 'transcript_type' not in record.attributes):
            continue
        transcript_id = record.attributes['transcript_id'][0]
        if transcript_id in sequence_dict:
            strand_dict[transcript_id] = record[6]
            protein_id_dict[transcript_id] = (
                record.attributes['protein_id'][0],
                record.attributes['gene_id'][0])

    print("protein_id_dict ready")

    df = pd.read_csv(dfname, sep='\t', header=None)
    change_df = extract_transcript_change(df)
    # mrna id -> row positions of its variants in change_df, in table order.
    trans_index_dict = {}
    for i in range(0, change_df.shape[0]):
        trans_index_dict.setdefault(change_df.iloc[i]['mrna'], []).append(i)

    my_seqs = []
    k_cnt = 0

    hom_only_cnt = 0
    hom_het_cnt = 0
    het_only_cnt = 0
    original_cnt = 0
    random_cnt = 0

    for k in trans_index_dict.keys():
        if k not in protein_id_dict:
            continue
        k_cnt += 1
        if (k_cnt % 1000 == 0):
            print(k_cnt)
        pid, gid = protein_id_dict[k]
        strand = strand_dict[k]
        cds_start, full_seq, cds_end = sequence_dict[k]
        id_prefix = db + '|' + pid + '|' + k + '|' + gid

        # Non-'+' transcripts are kept complemented while variants are
        # applied; _translate_for_strand complements them back.
        if (strand == '+'):
            transcript = str(full_seq)
        else:
            transcript = str(
                Seq(str(full_seq), IUPAC.ambiguous_dna).complement())
        # 1-based inclusive CDS coordinates -> Python slice.
        transcript = transcript[cds_start - 1:cds_end]

        shift = 0
        des = ""
        het_list = []
        hom_position_list = []
        for i in trans_index_dict[k]:
            row = change_df.iloc[i]
            if (row['snp_type'] != 'hom'):
                het_list.append(i)
                continue

            mutation_type = row['mutation_type']
            hom_position_list.append((int(row['c_start']), mutation_type))
            content = row['c_content']
            if (strand == '-'):
                content = str(Seq(str(content)).complement())
            transcript = change_seq(transcript,
                                    int(row['c_start']) + shift,
                                    int(row['c_end']) + shift, content,
                                    mutation_type)
            des += mutation_type + ":" + str(row['c_start']) + '-' + \
                str(row['c_end']) + str(row['c_content']) + '_'
            # Track how far later coordinates have moved.
            span = int(row['c_end']) - int(row['c_start']) + 1
            if (mutation_type.find('del') != -1):
                shift -= span
            elif (mutation_type.find('ins') != -1):
                shift += span

        if (len(hom_position_list) != 0):
            new_des = ""
            for p in hom_position_list:
                # Re-extract this variant's entry from des by searching for
                # the first occurrence of its start position.
                flag = des.find(str(p[0]))
                tmp = des[flag:]
                flag = tmp.find('_')
                new_des += str(p[1]) + ":" + tmp[:flag + 1]
            new_sequence = transcript

            if (len(new_sequence) != 0):
                # NOTE(review): a negative shift (net deletion) keeps only
                # the last |shift| bases here, and a positive one drops the
                # first `shift` bases -- looks unintended; confirm before
                # changing. Behaviour is preserved as-is.
                new_sequence = new_sequence[shift:]
                new_seq = _drop_none_tokens(
                    _translate_for_strand(new_sequence, strand))
                my_seqs.append(
                    SeqRecord(Seq(str(new_seq), IUPAC.protein),
                              id=id_prefix + '_0:' + str(cds_start + shift) +
                              '-' + str(cds_end + shift) + '_' + new_des,
                              description=new_des))
                hom_only_cnt += 1

        else:
            if (exclude == False):
                new_seq = _translate_for_strand(transcript, strand)
                my_seqs.append(
                    SeqRecord(Seq(str(new_seq), IUPAC.protein),
                              id=id_prefix + '_0:' + str(cds_start) + '-' +
                              str(cds_end) + '_no_variant',
                              description="no variant"))
                original_cnt += 1

        if (het != 1):
            continue

        # Windows of up to 1800 bases that start every 900 bases.
        count = int(len(transcript) / 900)
        if (len(transcript) <= 900):
            count = 1
        cnt = 0

        for window in range(0, count):
            start = window * 900
            if (start + 1799 < len(transcript)):
                stop = start + 1799
            else:
                stop = len(transcript) - 1

            for het_index in het_list:
                row = change_df.iloc[het_index]
                if not (start <= int(row['c_start']) <= stop):
                    continue

                mutation_type = row['mutation_type']
                content = row['c_content']
                if (strand == '-'):
                    content = str(Seq(str(content)).complement())
                new_sequence = change_seq(transcript,
                                          int(row['c_start']) + shift,
                                          int(row['c_end']) + shift, content,
                                          mutation_type)
                new_des = des + mutation_type + ":" + \
                    str(row['c_start']) + '-' + str(row['c_end']) + \
                    str(row['c_content'])
                if (len(new_sequence) == 0):
                    continue

                new_sequence = new_sequence[start:stop + 1]
                new_seq = _drop_none_tokens(
                    _translate_for_strand(new_sequence, strand))
                cnt += 1
                window_id = id_prefix + '_' + str(cnt) + ':' + \
                    str(start + 1) + '-' + str(stop + 1) + '_'
                my_seqs.append(
                    SeqRecord(Seq(str(new_seq), IUPAC.protein),
                              id=window_id + new_des,
                              description=new_des))
                if (len(hom_position_list) != 0):
                    hom_het_cnt += 1
                else:
                    het_only_cnt += 1

                # Every het sequence gets a randomised companion built from
                # the same window of the (hom-edited) transcript.
                if (mutation_type == 'snv'):
                    random_seq, random_des = generate_random_SNV_site(
                        strand, k, transcript[start:stop + 1],
                        int(row['c_start']) - start)
                else:
                    random_seq, random_des = generate_random_fs(
                        strand, transcript[start:stop + 1], mutation_type,
                        len(row['c_content']),
                        int(row['c_start']) - start)
                my_seqs.append(
                    SeqRecord(Seq(str(random_seq), IUPAC.protein),
                              id=window_id + des + random_des,
                              description=des + random_des))
                random_cnt += 1

    if (exclude == False):
        for key in sequence_dict.keys():
            if (key in trans_index_dict):
                continue
            # Transcripts without an annotation record have no protein/gene
            # id; skip them here just as the variant loop does, instead of
            # failing with KeyError after all the work above.
            if (key not in protein_id_dict):
                continue
            pid, gid = protein_id_dict[key]
            coding_start = int(sequence_dict[key][0]) - 1
            coding_end = int(sequence_dict[key][2]) - 1
            new_sequence = sequence_dict[key][1][coding_start:coding_end + 1]

            new_seq = str(
                Seq(str(new_sequence),
                    IUPAC.ambiguous_dna).transcribe().translate(
                        to_stop=True))
            my_seqs.append(
                SeqRecord(Seq(str(new_seq), IUPAC.protein),
                          id=db + '|' + pid + '|' + key + '|' + gid + '_0:' +
                          str(coding_start + 1) + '-' + str(coding_end + 1) +
                          '_no_variant',
                          description="no variant"))
            original_cnt += 1

    print("The number of proteins related is " + str(k_cnt))
    print("The number of sequences generated is " + str(len(my_seqs)))

    # The context manager guarantees the FASTA is flushed and closed.
    with open(output_name + ".fasta", "w") as handle:
        for sequence in my_seqs:
            SeqIO.write(sequence, handle, "fasta")

    print("The number of sequences containing hom only is " +
          str(hom_only_cnt))
    print("The number of sequences containing het only is " +
          str(het_only_cnt))
    print("The number of mixed sequences is " + str(hom_het_cnt))
    print("The number of original sequences is " + str(original_cnt))
    print("The number of random sequences is " + str(random_cnt))
コード例 #40
0
ファイル: 318_test1.py プロジェクト: cgregg/codonmassager
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio.Alphabet import generic_rna
from Bio.Alphabet import generic_protein

# Scratch tour of basic Seq operations on a short coding sequence.
# print(...) with a single argument behaves the same on Python 2 and 3,
# so the script no longer depends on the Python 2 print statement.
my_dna = Seq("ATGGGGAGAAGGCCGTAG", generic_dna)

# Substring search, display, base count and length.
print(my_dna.find('AGG'))
print(my_dna.find('AGA'))
print(my_dna)
print(my_dna.count('A'))
print(len(my_dna))

your_dna = my_dna.complement()
print(your_dna)
my_rna = my_dna.transcribe()
print(my_rna)

my_protr = my_rna.translate(table=1, to_stop=True)
#table = 1 is default std genetic code, http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi#SG1
#to_stop=True tells it to stop at stops
print(my_protr)
my_protd = my_dna.translate(to_stop=True)
print(my_protd)

#playing with complete CDS'
#yaaX = Seq("GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGGTCGCTCCCATGGCA" + \
コード例 #41
0
ファイル: views.py プロジェクト: Ziska220/MolBioTools
def _matching_oligo_names(sheet, search_targets, oligo_col=2, name_col=0):
    """Return the names of all sheet rows whose oligo occurs in a target.

    Args:
        sheet: Worksheet object exposing ``nrows`` and
            ``cell_value(rowx=..., colx=...)``.
        search_targets: Sequences (anything with a ``find`` method that
            returns -1 on failure) to search for each oligo.
        oligo_col: Column index holding the oligo sequence.
        name_col: Column index holding the oligo name.

    Returns:
        A list with ``str(name)`` for every matching row, in row order.
    """
    matched_names = []
    for row in range(sheet.nrows):
        cell = sheet.cell_value(rowx=row, colx=oligo_col)
        # str() keeps non-text cells (e.g. numbers) from crashing on .upper();
        # they simply will not be found in the reference.
        oligo_caps = str(cell).upper().replace(" ", "")
        if not oligo_caps:
            # Empty or whitespace-only cell: find('') would return 0 and be
            # misreported as a match, so skip the row.
            continue
        if any(target.find(oligo_caps) != -1 for target in search_targets):
            matched_names.append(str(sheet.cell_value(rowx=row, colx=name_col)))
    return matched_names


def import_excel_view(request):
    """Match oligos from uploaded Excel files against a reference sequence.

    On POST the view needs a valid oligo upload (``UploadFileForm``) plus
    exactly one valid reference source: a pasted sequence (``RefForm``) or a
    chromosome location (``ChrLocForm``) whose sequence is downloaded from the
    UCSC DAS URL. For every uploaded workbook the first sheet is scanned;
    column 2 is read as the oligo sequence and column 0 as the oligo name. An
    oligo matches when it is found in the reference or in the reference's
    reverse complement.

    Args:
        request: The request object; ``method``, ``POST`` and ``FILES`` are
            read from it.

    Returns:
        The rendered ``match_oligo/output.html`` page when a search was run,
        otherwise the rendered ``match_oligo/user_input.html`` page with the
        three forms.

    Raises:
        forms.ValidationError: If the upload is valid and both reference
            forms are valid at the same time.
    """
    # Separator placed between the result groups of consecutive files.
    new_line_char = "--"

    if request.method == "POST":
        form1 = UploadFileForm(request.POST, request.FILES)
        form2 = RefForm(request.POST)
        form3 = ChrLocForm(request.POST)

        # The reference forms are only validated once the upload is valid.
        upload_ok = form1.is_valid()
        pasted_ok = upload_ok and form2.is_valid()
        location_ok = upload_ok and form3.is_valid()

        if pasted_ok and location_ok:
            # NOTE(review): raised from the view itself rather than from a
            # form's clean(); confirm how callers expect this to surface.
            raise forms.ValidationError('OOPS! You submitted two types of reference data. Either paste your reference or identify a chromosome location.')

        if pasted_ok or location_ok:
            oligo_input = request.FILES.getlist('file')
            name_match_list = []
            sheet_info_list = []
            reference_info = []

            if pasted_ok:
                # Reference pasted by the user: normalise case and spaces.
                reference = form2.cleaned_data['reference']
                ref_seq = Seq(reference.upper().replace(" ", ""))
                reference_info.append(
                    "The following number of nucleotides were searched: {}".format(len(ref_seq)))
            else:
                # Reference fetched from the UCSC DAS server.
                # NOTE(review): these are the raw POST values, not
                # form3.cleaned_data, and are interpolated into the URL.
                chrom = request.POST['chr']
                loc_start = request.POST['loc_start']
                loc_stop = request.POST['loc_stop']
                url = "http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=chr{}:{},{}".format(chrom, loc_start, loc_stop)
                # Context manager closes the HTTP response once it is read.
                with urllib.request.urlopen(url) as chr_url:
                    chr_url_decode = chr_url.read().decode('utf-8')
                # Drop the markup between <>, then newlines and spaces, and
                # upper-case what is left.
                chr_input = re.sub('<.+>', '', chr_url_decode)
                chr_input_caps = chr_input.replace('\n', '').upper().replace(" ", "")
                ref_seq = Seq(chr_input_caps)
                reference_info.append("Chromosome {}: {}-{}".format(chrom, loc_start, loc_stop))
                reference_info.append("{}".format(url))

            # Every oligo is looked up on both strands of the one reference
            # that was supplied.
            search_targets = (ref_seq, ref_seq.reverse_complement())

            previous_file_matched = False
            for xlsfile in oligo_input:
                if previous_file_matched:
                    # Close the previous file's group of matches.
                    name_match_list.append(new_line_char)

                # read() hands the uploaded file's bytes to xlrd; only the
                # first sheet of each workbook is searched.
                book = xlrd.open_workbook(file_contents=xlsfile.read())
                sheet = book.sheet_by_index(0)

                sheet_info_list.append("{}".format(xlsfile))
                sheet_info_list.append("Sheet: {}".format(sheet.name))
                sheet_info_list.append("Total number of oligos searched: {}".format(sheet.nrows))
                sheet_info_list.append(new_line_char)

                matched_names = _matching_oligo_names(sheet, search_targets)
                if matched_names:
                    # The file name is listed once, ahead of its matches.
                    name_match_list.append("%s:" % xlsfile)
                    name_match_list.extend(matched_names)
                previous_file_matched = bool(matched_names)

            return render(request, 'match_oligo/output.html', {'var': name_match_list, 'search_param': sheet_info_list, 'ref_info': reference_info})

    else:
        form1 = UploadFileForm()
        form2 = RefForm()
        form3 = ChrLocForm()

    return render(request, 'match_oligo/user_input.html', {'form1': form1, 'form2': form2, 'form3': form3})
コード例 #42
0
ファイル: oligo_find.py プロジェクト: Ziska220/MolBioTools
    search_seq = Column[2]
    #identifies column with oligo sequence (2 is actually 3 because numbering starts at 0)

    full_name = Column[1]
    #identifies column with oligo names (1 is actually 2 because numbering starts at 0)

    oligo_key = Column[0]
    #identifies column with oligo identifier eg KH23 (0 is actually 1 because  numbering starts at 0)

    matching_search = Seq(CAPS_reference_seq)
    #opens BioPython's Seq function on reference file and names it

    reverse_compliment = matching_search.reverse_complement()
    #creates a BioPython matching search for the reverse compliment of the reference file

    oligo_find = matching_search.find(search_seq)
    #Uses find function to search for oligos in reference file

    reverse_oligo_find = reverse_compliment.find(search_seq)
    #Uses find function to search for oligos in reverse compliment of reference file

    if oligo_find != -1 :
    #if the results is not -1 then move on with the script. -1 means failure or there was no match.

        if oligo_find != 0 :
        #if the result is not 0 then move on with the script

            OutputString = ("Here is the location oligo name and the location: %s, %s \n" % (oligo_key, oligo_find))
            #defines the oligos that pass the above tests or match to the reference file

            OutFile2.write(OutputString)