コード例 #1
0
ファイル: test_event_mapping.py プロジェクト: Phelimb/dtwmap
class TestEventMapping():
    """Tests that a set of events maps best onto itself.

    Four event series are derived from one short sequence (forward and
    reverse-complement, each with the template and the complement model).
    Every test maps all four series against one reference series and checks
    that the series identical to the reference gives a diagonal path and the
    smallest cost and distance.
    """

    def __init__(self):
        self.template_model = EventModel("models/template_model_5.model")
        self.complement_model = EventModel("models/complement_model_5.model")
        self.event_mapper = EventMapper()

        self.my_seq = Seq("CAAAACGTGT")

        # Events predicted with the template model.
        self.forward_template_events = Seq2Events(
            self.my_seq, self.template_model).events
        self.reverse_template_events = Seq2Events(
            self.my_seq.reverse_complement(), self.template_model).events

        # Events predicted with the complement model.
        self.reverse_complement_events = Seq2Events(
            self.my_seq.reverse_complement(), self.complement_model).events
        self.forward_complement_events = Seq2Events(
            self.my_seq, self.complement_model).events

        self.complement_seq = self.my_seq.reverse_complement()

    def _map_all_against(self, reference_events):
        """Map the four event series against *reference_events*.

        Returns the results in the fixed order forward-template,
        reverse-template, forward-complement, reverse-complement.
        """
        queries = (
            self.forward_template_events,
            self.reverse_template_events,
            self.forward_complement_events,
            self.reverse_complement_events,
        )
        return [self.event_mapper.map(query, reference_events)
                for query in queries]

    @staticmethod
    def _check_self_mapping_is_best(self_result, all_results):
        """Assert *self_result* is a 6-step diagonal and the cheapest mapping."""
        diagonal = [0, 1, 2, 3, 4, 5]

        assert len(self_result.path[0]) == 6

        assert self_result.path[0].tolist() == diagonal
        assert self_result.path[1].tolist() == diagonal

        total_costs = [sum(sum(result.cost)) for result in all_results]
        assert min(total_costs) == sum(sum(self_result.cost))

        distances = [result.dist for result in all_results]
        assert min(distances) == self_result.dist

    def test_make_simple_reference(self):
        # Reference: forward strand, template model.
        results = self._map_all_against(self.forward_template_events)
        self._check_self_mapping_is_best(results[0], results)

    def test_complement_mapping(self):
        # Reference: forward strand, complement model.
        results = self._map_all_against(self.forward_complement_events)
        self._check_self_mapping_is_best(results[2], results)
コード例 #2
0
ファイル: make_ortho_table.py プロジェクト: hjanime/Spanki
def intron_sequence_single(juncid, f):
    """
    Return the donor/acceptor dinucleotide string for a single junction.

    Arguments:
    - `juncid`: junction identifier, parsed with Junctionid; the parsed
      object supplies `chr`, `start`, `end` and `strand`
    - `f`: sequence lookup indexed as f[chr][start:end]
      (presumably an indexed FASTA -- confirm against the caller)

    Returns the donor motif, the literal '..' and the acceptor motif joined
    into one upper-cased sequence, read in the orientation of the
    junction's strand.
    """
    j1 = Junctionid(juncid)
    chrom = f[j1.chr]
    # The two bases just inside each end of the junction, in genome
    # (plus-strand) orientation.
    left_motif = Seq(chrom[j1.start:j1.start + 2], IUPAC.unambiguous_dna).upper()
    right_motif = Seq(chrom[j1.end - 2:j1.end], IUPAC.unambiguous_dna).upper()
    if j1.strand == '+':
        # Plus strand: the donor is the left motif, the acceptor the right.
        return left_motif + '..' + right_motif
    # Any other strand value is treated as minus: the donor is the right
    # motif, and both motifs are read on the opposite strand.
    return right_motif.reverse_complement() + '..' + left_motif.reverse_complement()
コード例 #3
0
ファイル: jsViews.py プロジェクト: graik/labrack
def searchDnaParts(request, sequence_text, displayIdDnaComponent):
    """Return JSON describing inserts, part types and chassis for a sequence.

    `sequence_text` is expected as '<full sequence>__<vector sequence>'.
    The vector sequence is removed from the full sequence and the remainder,
    plus its reverse complement, is looked up with
    getInsertDBAnnotationBySequence. If the '__' separator is absent, the
    vector is treated as empty.

    For AJAX requests the response body is a JSON object with the keys
    list_dnas, extra_values, parttypes_values, optimizedfor_values,
    reverse_list_dnas and reverse_extra_values; parttypes_values and
    optimizedfor_values are themselves JSON-encoded lists. For any other
    request the body is the JSON string "None".
    """
    if not request.is_ajax():
        return HttpResponse(simplejson.dumps("None"), mimetype='application/json')

    pieces = sequence_text.split('__')
    full_sequence = pieces[0]
    vector_sequence = pieces[1] if len(pieces) > 1 else ''
    seq_exceptVector = full_sequence.replace(vector_sequence, '')
    revseq_exceptVect = Seq(seq_exceptVector, IUPAC.unambiguous_dna).reverse_complement()

    message = {
        "list_dnas": "",
        "extra_values": "",
        "parttypes_values": "",
        "optimizedfor_values": "",
        "reverse_list_dnas": "",
        "reverse_extra_values": "",
    }

    # Potential inserts on both strands.
    message['extra_values'] = getInsertDBAnnotationBySequence(
        seq_exceptVector, '+', displayIdDnaComponent)
    message['reverse_extra_values'] = getInsertDBAnnotationBySequence(
        str(revseq_exceptVect), '-', displayIdDnaComponent)

    # All part types. Serialising through simplejson (rather than string
    # concatenation) keeps the payload valid when a name contains quotes
    # or backslashes.
    part_types = [
        {"id": str(part_type.id), "name": part_type.name}
        for part_type in M.DnaComponentType.objects.all()
    ]
    message['parttypes_values'] = simplejson.dumps(part_types)

    # All chassis, preceded by an empty entry as the first element.
    chassis_entries = [{"id": "", "name": ""}]
    for chassis in M.Chassis.objects.all():
        chassis_entries.append({
            "id": str(chassis.id),
            "name": chassis.name,
            "displayId": chassis.displayId,
        })
    message['optimizedfor_values'] = simplejson.dumps(chassis_entries)

    return HttpResponse(simplejson.dumps(message), mimetype='application/json')
コード例 #4
0
ファイル: make_ortho_table.py プロジェクト: hjanime/Spanki
def intron_sequence(myjuncs, f):
    """
    Return the splice-site dinucleotides and 10-base flanks for each junction.

    Arguments:
    - `myjuncs`: iterable of junction identifiers, each parsed with
      Junctionid (the parsed object supplies `chr`, `start`, `end`, `strand`)
    - `f`: sequence lookup indexed as f[chr][start:end]
      (presumably an indexed FASTA -- confirm against the caller)

    Returns a nested defaultdict keyed by junction id, each entry holding
    'dinucleotide' (donor + '..' + acceptor), 'flank5' and 'flank3', all in
    the orientation of the junction's strand.
    """
    INTSEQ = collections.defaultdict(lambda: collections.defaultdict(dict))
    for juncid in myjuncs:
        j1 = Junctionid(juncid)
        chrom = f[j1.chr]
        # Everything below is first extracted in genome (plus-strand) order.
        upstream = Seq(chrom[j1.start - 10:j1.start], IUPAC.unambiguous_dna)
        downstream = Seq(chrom[j1.end:j1.end + 10], IUPAC.unambiguous_dna)
        left_motif = Seq(chrom[j1.start:j1.start + 2], IUPAC.unambiguous_dna).upper()
        right_motif = Seq(chrom[j1.end - 2:j1.end], IUPAC.unambiguous_dna).upper()
        if j1.strand == '+':
            fiveprimeflank = upstream
            threeprimeflank = downstream
            dastring = left_motif + '..' + right_motif
        else:
            # Any other strand value is treated as minus: swap the two
            # sides and read them on the opposite strand.
            fiveprimeflank = downstream.reverse_complement()
            threeprimeflank = upstream.reverse_complement()
            dastring = (right_motif.reverse_complement() + '..'
                        + left_motif.reverse_complement())
        INTSEQ[juncid]['dinucleotide'] = dastring
        INTSEQ[juncid]['flank5'] = fiveprimeflank
        INTSEQ[juncid]['flank3'] = threeprimeflank
    return INTSEQ
コード例 #5
0
def get_reads_seqs(bamfile, rnames):
    """
    Collect the sequences of the requested reads from a bam file.

    Arguments:
    - `bamfile`: The pysam file
    - `rnames`: reads names

    Returns two dicts keyed by read name: one for reads flagged as read 1
    and one for all other reads. Read-1 sequences are reverse complemented
    when the read is NOT flagged reverse; the other sequences are reverse
    complemented when the read IS flagged reverse.
    """
    alignments = defaultdict(list)
    seen_names = set()
    for aln in bamfile.fetch(until_eof=True):
        seen_names.add(aln.qname)
        alignments[aln.qname].append(aln)

    r1_seqs, r2_seqs = {}, {}
    for name in set(rnames) & seen_names:
        for aln in alignments[name]:
            if aln.is_read1:
                flip, target = not aln.is_reverse, r1_seqs
            else:
                flip, target = aln.is_reverse, r2_seqs
            outseq = Seq(aln.seq)
            if flip:
                outseq = outseq.reverse_complement()
            target[aln.qname] = str(outseq)
    # r1_seqs is the 3' end of the second fused RNA, r2_seqs is the 5' of the
    # first fused RNA
    return r1_seqs, r2_seqs
コード例 #6
0
ファイル: primers.py プロジェクト: xapple/fasta
class TwoPrimers(object):
    """Holds the forward and reverse primer of a sample.

    For each primer the constructor stores the raw string, its length, the
    ambiguous-DNA Seq object, the search pattern produced by iupac_pattern
    (also for the reverse complement), and compiled regular expressions for
    the pattern as-is and with every 'T' replaced by 'U'.
    """

    def __init__(self, fwd_str, rev_str):
        # Strings #
        self.fwd_str, self.rev_str = fwd_str, rev_str
        # Lengths #
        self.fwd_len = len(fwd_str)
        self.rev_len = len(rev_str)
        # Sequences #
        fwd_seq = Seq(fwd_str, IUPAC.ambiguous_dna)
        rev_seq = Seq(rev_str, IUPAC.ambiguous_dna)
        self.fwd_seq, self.rev_seq = fwd_seq, rev_seq
        # Search patterns (the reverse primer is NOT reverse complemented
        # here; use the *_revcompl attributes for that) #
        fwd_pattern = iupac_pattern(fwd_seq)
        rev_pattern = iupac_pattern(rev_seq)
        self.fwd_pattern, self.rev_pattern = fwd_pattern, rev_pattern
        # Search patterns reverse complemented #
        self.fwd_pattern_revcompl = iupac_pattern(fwd_seq.reverse_complement())
        self.rev_pattern_revcompl = iupac_pattern(rev_seq.reverse_complement())
        # Search expression without mismatches #
        self.fwd_regex = re.compile(fwd_pattern)
        self.rev_regex = re.compile(rev_pattern)
        # Uracil instead of thymine #
        self.fwd_regex_uracil = re.compile(fwd_pattern.replace('T', 'U'))
        self.rev_regex_uracil = re.compile(rev_pattern.replace('T', 'U'))

    def __len__(self):
        # A pair always contains exactly two primers.
        return 2
コード例 #7
0
ファイル: bamtrim2.py プロジェクト: talonsensei/Bfx_scripts
def extractRegion(bamfile):
    """Write the reads overlapping the NS5B region of a BAM file as FASTQ.

    The BAM file is indexed, the region boundaries are looked up by the
    name of the first reference in the file, and every read fetched from
    that region is written to '<bamfile>.extracted.fastq'. Reads extending
    past the region on one side are trimmed on that side. Reads flagged as
    reverse are reverse complemented (and their qualities reversed) before
    writing.

    Raises KeyError if the first reference of the BAM file is not listed in
    the coordinate table below.
    """
    # Need to keep this dictionary up-to-date with references you expect to see
    gene_pos = {'1b_Con1_full_reference_seq': {'ns5b': {'nterm': 7599, 'cterm': 9371}},
                '1a_H77_full_reference_seq': {'ns5b': {'nterm': 7602, 'cterm': 9374}},
                'H77_genome': {'ns5b': {'nterm': 7602, 'cterm': 9374}},
                'JFH-1_genome': {'ns5b': {'nterm': 7666, 'cterm': 9443}}}

    def fastq_record(read, lo=None, hi=None):
        # Format one read (optionally sliced to [lo:hi]) as a FASTQ record.
        bases = read.query[lo:hi]
        quals = read.qqual[lo:hi]
        if read.is_reverse:
            # all reverse reads in a bam file have been reverse complemented
            # already so they need to be reverse complemented again, along
            # with the quality scores, to write correctly to the fastq
            bases = str(Seq(bases).reverse_complement())
            quals = reverseString(quals)
        return "@" + read.qname + "\n" + bases + "\n+\n" + quals + "\n"

    # must create a .bai index for any bam file to be read or fetch won't
    # work, and it must be done before the bam file is opened
    pysam.index(bamfile)
    bam = pysam.Samfile(bamfile, 'rb')
    try:
        ref = bam.references[0]     # name of the reference the reads align to
        nterm = gene_pos[ref]['ns5b']['nterm']
        cterm = gene_pos[ref]['ns5b']['cterm']

        with open(bamfile + ".extracted.fastq", 'w') as outFASTQfile:
            for read in bam.fetch(ref, nterm, cterm):
                if read.pos >= nterm and read.aend <= cterm:
                    # Read lies completely within the region: write as is.
                    outFASTQfile.write(fastq_record(read))
                elif read.pos < nterm:
                    # Read starts before the region: drop the leading bases.
                    outFASTQfile.write(fastq_record(read, lo=nterm - read.pos - 1))
                elif ((read.pos - read.qstart) + len(read.seq)) > cterm:
                    # Read runs past the region: keep the leading bases only.
                    if read.pos <= cterm:
                        outFASTQfile.write(fastq_record(read, hi=cterm - read.pos))
    finally:
        bam.close()
    return
コード例 #8
0
ファイル: TEprofiler.py プロジェクト: karroje/TESearchTool
def readFamilySequences(file, temparr, family):
    """Collect the aligned sequence pairs of one family from an alignment file.

    The file is read line by line. Lines starting with '#' are skipped. A
    line with more than one whitespace-separated field is a header: when its
    second field equals `family`, its third and fourth fields are stored as
    integer coordinates and the next two single-field lines are taken as the
    organism sequence and the original sequence.

    The original sequence is lower-cased, stripped of dashes and searched in
    `temparr`. If it is not found, all three sequences are reverse
    complemented. The matching stretch of `temparr` is then replaced by the
    (dashed) original sequence to locate its start offset.

    Returns a list of
    [original_sequence, organism_sequence, start, start, coord0, coord1].

    Raises AttributeError if the original sequence cannot be located in
    `temparr` in either orientation.
    """
    arrforall = []      # all collected elements
    coor_arr = []       # the coordinates from the current header line
    # 0: not in the family, 1: header seen (organism line expected),
    # 2: organism line seen (original line expected)
    correctFamily2 = 0
    with open(file) as fileptr2:
        for line in fileptr2:
            if line[0] == '#':    # allows lines to be commented out for testing
                continue
            line = line.rstrip()
            arr = re.split(r"\s+", line)
            if len(arr) > 1:
                if arr[1] == family:
                    coor_arr.append(int(arr[2]))
                    coor_arr.append(int(arr[3]))
                    correctFamily2 = 1
                else:
                    correctFamily2 = 0
            elif correctFamily2 == 1:        # organism sequence line
                correctFamily2 = 2
                organism_sequence = line
            elif correctFamily2 == 2:        # original sequence line
                origi_sequence = line
                # lower-case and remove the dashes before searching
                line = "".join(re.split("-+", line.lower()))
                if re.search(line, temparr) is None:
                    # not found as given: switch everything to the other strand
                    line = str(Seq(line).reverse_complement())
                    organism_sequence = str(Seq(organism_sequence).reverse_complement())
                    origi_sequence = str(Seq(origi_sequence).reverse_complement())

                # put the dashed original into temparr and find where it starts
                temparr2 = temparr.replace(line, origi_sequence)
                start = re.search(origi_sequence, temparr2).start()
                # NOTE(review): the start offset is stored twice; the third and
                # fourth fields may have been meant as (start, end) -- confirm
                # with the callers before changing.
                arrforall.append([origi_sequence, organism_sequence, start, start,
                                  coor_arr[0], coor_arr[1]])
                coor_arr = []
    return arrforall
コード例 #9
0
ファイル: simForward.py プロジェクト: koadman/proxigenomics
def find_priming_sites(oligo, seq):
    """For supplied priming sequence, find positions of all matches in a given sequence.

    Both the oligo and its reverse complement are searched (as regular
    expressions) in str(seq).

    Returns a list with the end offset of every match: first the matches of
    the oligo itself, then those of its reverse complement.
    """
    target = str(seq)
    # reverse_complement() returns a new object, so its result must be kept;
    # discarding it would search the forward oligo a second time.
    rc_oligo = str(Seq(oligo).reverse_complement())
    sites = [m.end() for m in re.finditer(oligo, target)]
    sites.extend(m.end() for m in re.finditer(rc_oligo, target))
    return sites
コード例 #10
0
ファイル: find_TERE.py プロジェクト: gturco/FFL_tools
def get_prom(f, gene):
    """Return the 3000-base region upstream of a gene on both strands.

    Arguments:
    - `f`: object whose sequence() method takes a dict with 'chr', 'start',
      'stop' (and optionally 'strand') and returns the sequence text
    - `gene`: mapping with 'seqid', 'strand', 'start' and 'end'

    For a '+' gene the region ends at the gene start (and is clipped at 0);
    for a '-' gene it starts at the gene end.

    Returns (promf, promr) as strings, where promf is the sequence fetched
    for a '+' gene (or the reverse complement of the fetched sequence for a
    '-' gene) and promr is the reverse complement of promf.

    Raises ValueError if the strand is neither '+' nor '-'.
    """
    seqid = str(gene["seqid"])
    strand = gene["strand"]
    if strand == "+":
        gene_start = int(gene["start"])
        prom_start = max(0, gene_start - 3000)
        promf = f.sequence({'chr': seqid, 'start': prom_start, 'stop': gene_start})
        promr = Seq(promf, generic_dna).reverse_complement()
    elif strand == "-":
        gene_end = int(gene["end"])
        promr = f.sequence({'chr': seqid, 'start': gene_end,
                            'stop': gene_end + 3000, 'strand': '+'})
        promf = Seq(promr, generic_dna).reverse_complement()
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError("unsupported strand %r for gene on %s" % (strand, seqid))
    return str(promf), str(promr)
コード例 #11
0
ファイル: pep_change.py プロジェクト: sara62/transabyss
def translate(sequence, min_protein_length=1, orient=None, frame=None, full=False, all=False):
    """Translates cdna sequence into protein.

    Arguments:
    - `sequence`: nucleotide sequence text
    - `min_protein_length`: minimum number of residues for an ORF to be kept
    - `orient`: '+' or '-' to translate one strand only; any other value
      translates both strands
    - `frame`: 0, 1 or 2 to translate a single frame; None for all three
    - `full`: return the whole ORF tuple instead of only the protein
    - `all`: return every ORF found instead of only the longest

    Each ORF is a tuple (start, end, strand, protein) with coordinates on
    the input sequence. Returns None when no ORF qualifies, the list of ORFs
    when `all` is true, otherwise the longest ORF (tuple if `full`, protein
    string if not). Among equally long ORFs the one found first wins.
    """
    seq = Seq(sequence)
    orfs = []
    seq_len = len(seq)

    if orient == "+":
        strand_and_base = [(+1, seq)]
    elif orient == "-":
        strand_and_base = [(-1, seq.reverse_complement())]
    else:
        strand_and_base = [(+1, seq), (-1, seq.reverse_complement())]

    for strand, nuc in strand_and_base:
        for fm in range(3):
            if frame is not None and fm != frame:
                continue

            trans = str(nuc[fm:].translate())
            trans_len = len(trans)

            aa_start = 0
            aa_end = 0
            while aa_start < trans_len:
                aa_end = trans.find("*", aa_start)
                if aa_end == -1:
                    # NOTE(review): without a stop codon the last residue
                    # index is used as the end, so the slice below leaves the
                    # final residue out of the protein -- confirm intended.
                    aa_end = trans_len - 1
                if aa_end - aa_start >= min_protein_length:
                    if strand == 1:
                        start = fm + aa_start * 3
                        end = min(seq_len - 1, fm + aa_end * 3 + 3 - 1)
                    else:
                        # Map reverse-strand offsets back onto the input.
                        end = seq_len - 1 - fm - aa_start * 3
                        start = end + 1 - (aa_end - aa_start) * 3 - 3

                    orfs.append((start, end, strand, trans[aa_start:aa_end]))
                aa_start = aa_end + 1

    if not orfs:
        return None
    if all:
        return orfs

    # Longest protein first. A key function replaces the Python-2-only cmp
    # argument; the sort is stable, so ties keep their discovery order.
    orfs.sort(key=lambda orf: len(orf[3]), reverse=True)
    if full:
        return orfs[0]
    return orfs[0][-1]
コード例 #12
0
ファイル: primers.py プロジェクト: Xiuying/illumitag
class TwoPrimers(object):
    """Holds the forward and reverse primer of a pool.

    The primer names and sequences are read from parent.info['primers'].
    For each primer the constructor stores the name, raw string, length,
    ambiguous-DNA Seq object, a character-class search pattern built from
    the `iupac` table, and compiled regular expressions for the pattern
    as-is and with every 'T' replaced by 'U'. The reverse primer's pattern
    is built from its reverse complement.
    """

    def __init__(self, parent):
        self.parent = parent
        self.pool = parent
        self.info = parent.info['primers']
        # Basic #
        self.name = self.info.get('name')
        # Names #
        forward_info = self.info['forward']
        reverse_info = self.info['reverse']
        self.fwd_name = forward_info['name']
        self.rev_name = reverse_info['name']
        # Strings #
        self.fwd_str = forward_info['sequence']
        self.rev_str = reverse_info['sequence']
        # Lengths #
        self.fwd_len = len(self.fwd_str)
        self.rev_len = len(self.rev_str)
        # Sequences #
        self.fwd_seq = Seq(self.fwd_str, IUPAC.ambiguous_dna)
        self.rev_seq = Seq(self.rev_str, IUPAC.ambiguous_dna)
        # Search patterns: one bracketed character class per base #
        fwd_classes = []
        for base in self.fwd_seq:
            fwd_classes.append('[' + iupac[base] + ']')
        self.fwd_pattern = ''.join(fwd_classes)
        rev_classes = []
        for base in self.rev_seq.reverse_complement():
            rev_classes.append('[' + iupac[base] + ']')
        self.rev_pattern = ''.join(rev_classes)
        # Search expression #
        self.fwd_regex = re.compile(self.fwd_pattern)
        self.rev_regex = re.compile(self.rev_pattern)
        # Uracil instead of thymine #
        self.fwd_regex_uracil = re.compile(self.fwd_pattern.replace('T', 'U'))
        self.rev_regex_uracil = re.compile(self.rev_pattern.replace('T', 'U'))

    def __repr__(self):
        return '<%s object for pool %s>' % (self.__class__.__name__, self.parent.id_name)

    def __len__(self):
        # A pair always contains exactly two primers.
        return 2
コード例 #13
0
ファイル: views.py プロジェクト: riforin/synbiocad
def stitch(fragments):
    """Build the primer descriptions for joining sequence records in order.

    The fragments are concatenated into one donor record. The first fragment
    yields the 'Lup'/'Rup' primers, the last one the 'Ldown'/'Rdown'
    primers, and every fragment in between the 'L'/'R' primers (when there
    are several, only those of the last one are kept).

    Returns a tuple of eight strings: Lup, Rup, Ldown, Rdown, L, R, the
    donor length and the donor sequence.
    """
    Nfrags = len(fragments)

    # Start from an empty sequence and append every fragment to it.
    donor = Seq("")
    print("")
    for fragment in fragments:
        donor = donor + fragment

    # Defaults for the primers that are not produced for this input.
    Lup = Rup = Ldown = Rdown = L = R = ""

    last_index = Nfrags - 1
    for i, current in enumerate(fragments):
        if i == 0:
            Lup = "Lup" + current.id + " " + getPrimer(donor)
            following = fragments[i + 1]
            Rup = ("Rup" + current.id + "(" + following.id + ") "
                   + overhangPrimer(current.reverse_complement(),
                                    following.reverse_complement()))
        elif i == last_index:
            previous = fragments[i - 1]
            Ldown = ("Ldown" + current.id + "(" + previous.id + ") "
                     + overhangPrimer(current, previous))
            Rdown = "Rdown" + current.id + " " + getPrimer(donor.reverse_complement())
        else:
            previous = fragments[i - 1]
            L = ("L" + current.id + "(" + previous.id + ") "
                 + overhangPrimer(current, previous))
            following = fragments[i + 1]
            R = ("R" + current.id + "(" + following.id + ") "
                 + overhangPrimer(current.reverse_complement(),
                                  following.reverse_complement()))

    length_text = "Sequence Length: " + str(len(donor.seq))
    sequence_text = "Sequence: " + str(donor.seq)
    return (str(Lup), str(Rup), str(Ldown), str(Rdown), str(L), str(R),
            length_text, sequence_text)
コード例 #14
0
def insert_element(pos, ref, ofile):
    """Insert a repeat element into a reference chromosome and log it as GFF.

    Arguments:
    - `pos`: indexable record laid out as (chromosome name, GFF start,
      GFF end / insertion index, strand, element id, element sequence,
      TSD, element name)
    - `ref`: dict of chromosome name -> sequence string; the entry for the
      chromosome is replaced by the sequence with the element inserted
    - `ofile`: writable text handle receiving one GFF line

    The element is placed at index pos[2], followed by a copy of the
    reference bases immediately before that index (as many as the supplied
    TSD is long). For any strand other than '+' the element is reverse
    complemented first.
    """
    chrn = pos[0]
    insert_at = pos[2]
    strand = pos[3]
    repid = pos[4]
    repseq = pos[5]
    reptsd = pos[6]
    repname = pos[7]

    chrseq = ref[chrn]
    half1 = chrseq[:insert_at]
    half2 = chrseq[insert_at:]

    gff_newline = '%s\tPseudoGenome\tTransposable_element\t%s\t%s\t%s\t.\t.\tID=%s_%s_%s;Original_ID=%s;TE=%s;TSD=%s;' % (
        chrn, pos[1], insert_at, strand, chrn, pos[1], insert_at, repid, repname, reptsd)
    # write() instead of the Python-2-only "print >>" form
    ofile.write(gff_newline + '\n')

    # The sequence at the target site is used as TSD, not the one provided.
    # Clamp at 0 so an insertion near the chromosome start does not turn
    # into a negative (wrap-around) slice index.
    tsdstart = max(0, insert_at - len(reptsd))
    tsdseq = chrseq[tsdstart:insert_at]

    if strand != '+':
        repseq = str(Seq(repseq).reverse_complement())
    ref[chrn] = half1 + repseq + tsdseq + half2
コード例 #15
0
ファイル: zalipon_kmerR.py プロジェクト: nugaziy/retropipe
def main(inputtable, referencefile, window, k_min, k_max, outputdir, outputtable):
    """Count the k-mers found next to the unannotated positions of a table.

    Arguments:
    - `inputtable`: tab-separated table with the columns CHR, POS, STRAND,
      Alu_hg38 and Alu_dbRIP_hg38
    - `referencefile`: FASTA file opened with pysam
    - `window`: number of reference bases fetched per position
    - `k_min`, `k_max`: inclusive range of k-mer lengths
    - `outputdir`: output directory, created when missing
    - `outputtable`: name of the output file inside `outputdir`

    Only rows whose two Alu columns both read 'Unknown' are used. For '+'
    rows the window starts at POS and is reverse complemented; for all
    other rows it ends one base before POS. The output has a
    'KMER<TAB>AMOUNT' header followed by one line per distinct k-mer.
    """
    outputdir += "/"

    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    megatable = pd.read_table(inputtable)
    reference = pysam.Fastafile(referencefile)

    # Count while scanning instead of collecting every k-mer in a list first.
    kmer_count = Counter()
    for index, row in log_progress(megatable.iterrows(), name = inputtable, every = 250, size = len(megatable)):
        if (str(row['Alu_hg38']) != 'Unknown') or (str(row['Alu_dbRIP_hg38']) != 'Unknown'):
            continue
        pos = int(row['POS'])
        if row['STRAND'] == '+':
            start = pos
            end = pos + window
            seq = Seq(reference.fetch(row['CHR'], start, end))
            seq = str(seq.reverse_complement()).upper()
        else:
            start = pos - window - 1
            end = pos - 1
            seq = reference.fetch(row['CHR'], start, end).upper()
        for k in range(k_min, k_max + 1):
            # "+ 1" so the k-mer ending on the last base is counted as well
            kmer_count.update(seq[j : j + k] for j in range(len(seq) - k + 1))

    with open(outputdir + outputtable, 'w') as table:
        table.write('KMER' + '\t' + 'AMOUNT' + '\n')
        for key, value in kmer_count.items():
            table.write(str(key) + '\t' + str(value) + '\n')
コード例 #16
0
 def _process_single_end(self, input_fh, output_fh):
     for header, seq, qualities in self._parse_sequences(input_fh):
         raw_seq_len = len(seq)
         self._stats["total_no_of_reads"] += 1
         if self._fastq and not self._min_phred_score is None:
             seq = self._trim_by_quality(seq, qualities)
         if self._reverse_complement:
             seq = Seq(seq)
             seq = str(seq.reverse_complement())
         if not self._adapter is None:
             seq = self._clip_adapter(seq)
         if self._poly_a_clipping:
             seq = self._poly_a_clipper.clip_poly_a_strech(seq)
             seq = self._poly_a_clipper.remove_3_prime_a(seq)
         clipped_seq_len = len(seq)
         if clipped_seq_len == raw_seq_len - 1:
             self._stats["single_a_removed"] += 1
         elif clipped_seq_len < raw_seq_len - 1:
             self._stats["polya_removed"] += 1
         else:
             self._stats["unmodified"] += 1
         if clipped_seq_len < self._min_read_length:
             self._stats["too_short"] += 1
             continue
         self._stats["long_enough"] += 1
         self._stats["read_length_before_processing_and_freq"][
             raw_seq_len] += 1
         self._stats["read_length_after_processing_and_freq"][
             clipped_seq_len] += 1
         # Encoding to bytes is necessary due to saving via gzip
         output_fh.write(str.encode(">%s\n%s\n" % (header, seq)))
コード例 #17
0
def findFragendSites(fasta, resite):
    ''' Function creates FragendDict object. The object contains
    the location of all fragends for each strand of all
    chromosomes within a FASTA file.

    Arguments:
    - `fasta`: path of the FASTA file
    - `resite`: restriction enzyme recognition sequence

    Returns a dict holding the upper-cased site under 'resite' and, for
    every record, a list of positions under (name, '+') and (name, '-').
    For '+' each nt_search hit has the site length added; for '-' each hit
    of the reverse-complemented site has 1 added.
    '''
    # Process restriction enzyme site and create output dictionary
    resite = resite.upper()
    frags = {'resite': resite}
    # Create sequence object for resite and reverse complement
    standard = Seq(resite)
    revcomp = standard.reverse_complement()
    # Open and parse fasta file; the handle is closed even if parsing fails
    with open(fasta) as fastaHandle:
        # Loop through fasta file and extract fragend information for each chromosome
        for record in SeqIO.parse(fastaHandle, 'fasta'):
            # Extract name and sequence
            fName, fSequence = str(record.id), str(record.seq).upper()
            # The first element returned by nt_search is dropped; the rest
            # are the hit positions, converted to a 1 based index
            forward = nt_search(fSequence, standard)[1:]
            frags[(fName, '+')] = [x + len(resite) for x in forward]
            reverse = nt_search(fSequence, revcomp)[1:]
            frags[(fName, '-')] = [x + 1 for x in reverse]
    return(frags)
コード例 #18
0
def scanSequences(title, sequence, quality):
    """Trim a read and its qualities according to an hmmscan search.

    The sequence and its reverse complement are written to a FASTA file
    named 'hmm.seq<random digits>' in the current directory, scanned with
    doHMMScan, and the hits are read back with processHMMresult.

    Returns (title, trimmed_sequence, trimmed_quality). When neither the
    'ns5b_5prime' nor the 'ns5b_3prime' region was found, the sequence and
    quality are returned unchanged.
    """
    tmp = str(random.random())[2:]
    seq_filename = "hmm.seq" + tmp
    # str() replaces Seq.tostring(), which newer Biopython no longer has
    reverse = str(Seq(sequence).reverse_complement())
    # writing full length seq to file for hmmscan
    with open(seq_filename, "w") as tempfile:
        tempfile.write(">forward\n" + sequence + "\n>reverse\n" + reverse)

    local_path = os.getcwd() + "/"
    hmmscan_bin = "/usr/local/bin/hmmscan"
    hmmresult_filename = doHMMScan(seq_filename, local_path, hmmscan_bin)
    target_cregions = processHMMresult(hmmresult_filename, local_path)
    s = q = ''
    # NOTE(review): target_cregions is indexed like a mapping, yet the
    # branches below compare the whole object to 'forward' / 'reverse'.
    # For a plain dict that comparison is never true, which would leave
    # s and q empty whenever a region was found -- confirm which field
    # holds the orientation.
    if target_cregions['ns5b_5prime'] == '' and target_cregions['ns5b_3prime'] == '':
        s = sequence
        q = quality
    elif target_cregions['ns5b_5prime'] != '' and target_cregions == 'forward':
        x = target_cregions['ns5b_5prime'][0]
        s = sequence[x:]
        q = quality[x:]
    elif target_cregions['ns5b_3prime'] != '' and target_cregions == 'forward':
        x = target_cregions['ns5b_3prime'][0]
        s = sequence[:x]
        q = quality[:x]
    elif target_cregions['ns5b_5prime'] and target_cregions == 'reverse':
        x = len(sequence) - target_cregions['ns5b_5prime'][0]
        s = sequence[:x]
        q = quality[:x]
    elif target_cregions['ns5b_3prime'] and target_cregions == 'reverse':
        # This is a little tricky to compensate for 0-based index of string
        x = len(sequence) - target_cregions['ns5b_3prime'][0] + 1
        s = sequence[x:]
        q = quality[x:]

    return (title, s, q)
コード例 #19
0
def stitch(fragments):
    """Print the primers for joining sequence records, then the joined donor.

    The fragments are concatenated in order into one donor record. The first
    fragment gets the 'Lup'/'Rup' primers, the last one 'Ldown'/'Rdown', and
    every fragment in between 'L'/'R'. Afterwards the donor length and
    sequence are printed.
    """
    Nfrags = len(fragments)

    # Start from an empty sequence and append every fragment to it.
    donor = Seq("")
    print("")
    for fragment in fragments:
        donor = donor + fragment

    last_index = Nfrags - 1
    for i, current in enumerate(fragments):
        if i == 0:
            print("Lup" + current.name + " " + getPrimer(donor))
            following = fragments[i + 1]
            overhang = overhangPrimer(current.reverse_complement(),
                                      following.reverse_complement())
            print("Rup" + current.name + "(" + following.name + ") " + overhang)
        elif i == last_index:
            previous = fragments[i - 1]
            overhang = overhangPrimer(current, previous)
            print("Ldown" + current.name + "(" + previous.name + ") " + overhang)
            print("Rdown" + current.name + " " + getPrimer(donor.reverse_complement()))
        else:
            previous = fragments[i - 1]
            overhang = overhangPrimer(current, previous)
            print("L" + current.name + "(" + previous.name + ") " + overhang)
            following = fragments[i + 1]
            overhang = overhangPrimer(current.reverse_complement(),
                                      following.reverse_complement())
            print("R" + current.name + "(" + following.name + ") " + overhang)

    print("")
    print("Your donor DNA cassette, has the following bp length and sequence:")

    print("")
    print(len(donor.seq))
    print("")

    print(donor.seq)

    print("")
    print("You might want to copy this entire prompt and save it for your records.")
コード例 #20
0
ファイル: bamextract.py プロジェクト: talonsensei/Bfx_scripts
def extractRegion(bamfile, start, stop, output):
    """Write the reads overlapping a region of a BAM file as FASTQ or FASTA.

    Arguments:
    - `bamfile`: path of the BAM file (it is indexed first)
    - `start`, `stop`: region boundaries passed to fetch on the first
      reference of the file
    - `output`: 'fastq' or 'fasta'; also used as the output file extension.
      For any other value the output file is created but nothing is written.

    The reads are written to '<bamfile>.extracted.<output>'. Reads flagged
    as reverse are reverse complemented (and their qualities reversed)
    before writing.
    """
    # must create a .bai index for any bam file to be read or fetch won't
    # work, and it must be done before the bam file is opened
    pysam.index(bamfile)
    bam = pysam.Samfile(bamfile, 'rb')
    try:
        with open(bamfile + ".extracted." + output, 'w') as outfile:
            # Get the reads in region of interest
            for read in bam.fetch(bam.references[0], start, stop):
                bases = read.query
                if read.is_reverse:
                    # all reverse reads in a bam file have been reverse
                    # complemented already so they need to be reverse
                    # complemented again to write correctly
                    bases = str(Seq(bases).reverse_complement())
                if output == 'fastq':
                    quals = read.qqual
                    if read.is_reverse:
                        quals = reverseString(quals)
                    outfile.write("@" + read.qname + "\n" + bases + "\n+\n" + quals + "\n")
                elif output == 'fasta':
                    outfile.write('>' + read.qname + '\n' + bases + '\n')
    finally:
        bam.close()
    return
コード例 #21
0
def prepend_barcode(seqfile, bcfile, rc, text=''):
    """Prepend each read's barcode to the read, rewriting ``seqfile`` in place.

    Both FASTQ files are walked in parallel. For every read the barcode file
    is advanced until the first whitespace-separated token of the two headers
    is equal; the barcode sequence and quality are then written in front of
    the read's sequence and quality. Output goes to ``<seqfile>.tmp``, which
    replaces ``seqfile`` at the end.

    If the barcode file runs out while searching for a matching header, the
    current read is still written with the last barcode that was read, and
    processing stops after that record.

    Args:
        seqfile: Path of the FASTQ file with the reads (overwritten).
        bcfile: Path of the FASTQ file with the barcodes.
        rc: If true, the barcode is reverse complemented and its quality
            string reversed before being prepended.
        text: Optional suffix appended to the read name as ``<name>.<text>``.

    Raises:
        StopIteration: If either input file contains no record.
    """
    tmp_path = seqfile + '.tmp'
    with open(tmp_path, 'w') as tmph, \
            open(seqfile) as seq_handle, \
            open(bcfile) as bc_handle:
        itr1 = FastqGeneralIterator(seq_handle)
        itr2 = FastqGeneralIterator(bc_handle)
        # Builtin next() works on both Python 2.6+ and Python 3.
        (h1, s1, q1) = next(itr1)
        (h2, s2, q2) = next(itr2)
        while True:
            h1 = h1.split()[0]
            h2 = h2.split()[0]
            # Skip barcodes that have no corresponding read.
            while h1 != h2:
                try:
                    (h2, s2, q2) = next(itr2)
                    h2 = h2.split()[0]
                except (StopIteration, IOError):
                    break
            if rc:
                rcs = Seq(s2, generic_dna)
                s2 = rcs.reverse_complement()
                q2 = q2[::-1]
            if text:
                h1 = h1+'.'+text
            tmph.write("@%s\n%s%s\n+\n%s%s\n" %(h1, s2, s1, q2, q1))
            try:
                (h1, s1, q1) = next(itr1)
                (h2, s2, q2) = next(itr2)
            except (StopIteration, IOError):
                break
    # All handles are closed before the temporary file replaces the input.
    os.rename(tmp_path, seqfile)
コード例 #22
0
def generateSeqHandles(anIndexCfg):
    """Build the handle sequences for every index, plus their reverse complements.

    ``anIndexCfg`` is the parsed YAML configuration, shaped like:

    handles:
        prefix: "TTAGTCTCCGACGGCAGGCTTCAAT"
        postfix: "ACGCACCCACCGGGACTCAG"
    indexes: [
        "ACAGTC",
        "TGATGC",
        "TCTCAG"
    ]

    A handle is the concatenation prefix + index + postfix, e.g.
    TTAGTCTCCGACGGCAGGCTTCAAT-ACAGTC-ACGCACCCACCGGGACTCAG
              prefix         -index -      postfix

    Returns a tuple ``(forward, reverse)``: the list of handle strings in the
    order of ``indexes``, and the list of their reverse complements in the
    same order.
    """
    handle_cfg = anIndexCfg["handles"]
    prefix = handle_cfg["prefix"]
    postfix = handle_cfg["postfix"]

    forward_handles = [prefix + idx + postfix for idx in anIndexCfg["indexes"]]
    reverse_handles = [
        str(Seq(handle).reverse_complement()) for handle in forward_handles
    ]
    return (forward_handles, reverse_handles)
コード例 #23
0
def make_consensus( rev_string, for_string, seqfile):
    """Align a reverse and a forward read and return their consensus sequence.

    Newlines, carriage returns and spaces are stripped from both inputs and
    the reverse read is reverse complemented. The pair is written to
    ``results/<seqfile>.fasta``, aligned with the external ``clustalw``
    program into ``results/<seqfile>.aln``, and the consensus is computed
    from that alignment.

    Args:
        rev_string: Reverse read as text.
        for_string: Forward read as text.
        seqfile: Base name (no extension) for the files created in ``results/``.

    Returns:
        The value of ``dumb_consensus(ambiguous="N", threshold=0.0,
        require_multiple=0)`` for the alignment.
    """
    # make fasta file for each paired sequence
    rev_clean = rev_string.replace("\n", "").replace('\r', '').replace(' ', '')
    for_clean = for_string.replace("\n", "").replace('\r', '').replace(' ', '')
    rev_sequence = Seq(rev_clean, IUPAC.ambiguous_dna).reverse_complement()
    for_sequence = Seq(for_clean, IUPAC.ambiguous_dna)
    paired_sequences = [SeqRecord(rev_sequence, id="rev"), SeqRecord(for_sequence, id="for")]
    if not os.path.exists("results/"):
        os.makedirs("results/")
    fasta_file = "results/" + seqfile + ".fasta"
    SeqIO.write(paired_sequences, fasta_file, "fasta")

    # align the paired sequences
    aln_file = "results/" + seqfile + ".aln"
    clustalw_cline = ClustalwCommandline("clustalw", infile=fasta_file, outfile=aln_file, pwgapopen=100, gapopen=100)
    clustalw_cline()

    # hack so that dumb_consensus will accept 1 base call against N:
    # rewrite the alignment file with every 'N' replaced by '.'
    with open(aln_file, 'r') as f:
        contents = f.read()
    with open(aln_file, 'w') as f:
        f.write(contents.replace('N', '.'))

    # read in alignment file and generate consensus
    alignment = AlignIO.read(aln_file, "clustal")
    summary_align = AlignInfo.SummaryInfo(alignment)
    return summary_align.dumb_consensus(ambiguous = "N", threshold=0.0, require_multiple=0)
コード例 #24
0
ファイル: index2barcode.py プロジェクト: MG-RAST/pipeline
def main(args):
    """Write the unique index sequences of a FASTQ file as a barcode list.

    Options are parsed with ``parser.parse_args()`` called without arguments,
    so they come from ``sys.argv``; the ``args`` parameter itself is not
    consulted and is rebound to the leftover positional arguments.

    Every sequence in the input FASTQ is upper-cased and de-duplicated. Each
    unique sequence (optionally reverse complemented) is written on its own
    line, either bare or as ``<prefix>.<n><TAB><sequence>`` when a prefix is
    given.

    Returns:
        0 on success. Missing input or output triggers ``parser.error``.
    """
    usage  = "usage: %prog [options] -i <input index file> -o <output barcode file>"+__doc__
    parser = OptionParser(usage)
    parser.add_option("-i", "--input", dest="input", default=None, help="Input index fastq file.")
    parser.add_option("-o", "--output", dest="output", default=None, help="Output barcode file.")
    parser.add_option("-p", "--prefix", dest="prefix", default=None, help="Optional string to prepend to names.")
    parser.add_option("-r", "--revcomp", dest="revcomp", action="store_true", default=False, help="Print reverse complement of index sequences for barcodes [default is same].")

    (opts, args) = parser.parse_args()
    if not (opts.input and os.path.isfile(opts.input) and opts.output):
        parser.error("Missing input and/or output")

    # parse index file - build map of unique upper-cased sequences
    # NOTE(review): mode 'rU' is kept as in the original; confirm the target
    # Python version before changing it.
    barcodes  = {}
    with open(opts.input, 'rU') as input_hdl:
        for rec in FastqGeneralIterator(input_hdl):
            barcodes[rec[1].upper()] = 1

    # print to output
    with open(opts.output, 'w') as output_hdl:
        for i, bc in enumerate(barcodes.keys()):
            if opts.revcomp:
                # Convert back to a plain string so both write branches below
                # receive text rather than a Seq object.
                bc = str(Seq(bc, generic_dna).reverse_complement())
            if opts.prefix:
                output_hdl.write("%s.%d\t%s\n"%(opts.prefix, i+1, bc))
            else:
                output_hdl.write(bc+"\n")

    return 0
コード例 #25
0
def calc_repeat_rev_comp(seq, window):
    """Find windows of ``seq`` that also occur in its reverse complement.

    Every ``window``-long substring of the upper-cased sequence and of its
    upper-cased reverse complement is indexed by start offset. For each
    substring present in both, the offsets found in the reverse complement
    are mapped to ``len(seq) - offset - 1``.

    Returns:
        A tuple ``(repeats_list, repeats_binary_list)``: a list of
        ``{substring: sorted positions}`` dicts, and a list with one entry per
        base that is 0 at every reported position and 1 elsewhere.
    """
    forward = Seq(seq)
    total_len = len(forward)
    forward_text = str(forward).upper()
    revcomp_text = str(forward.reverse_complement()).upper()

    def index_windows(text):
        # Map each window-long substring of text to the list of its offsets.
        offsets_by_window = {}
        for offset in range(len(text) - window + 1):
            offsets_by_window.setdefault(text[offset:offset + window], []).append(offset)
        return offsets_by_window

    dict_one = index_windows(forward_text)
    dict_two = index_windows(revcomp_text)

    # Now find any sub-sequences found in both sequences
    matches = set(dict_one).intersection(dict_two)

    repeats_list = []
    repeats_binary_list = [1] * total_len
    for section in matches:
        # Offsets in the reverse complement, converted with len - offset - 1.
        repeat_positions = set(total_len - j - 1 for j in dict_two[section])
        for position in repeat_positions:
            repeats_binary_list[position] = 0
        if repeat_positions:
            repeats_list.append({section: sorted(repeat_positions)})

    return repeats_list, repeats_binary_list
コード例 #26
0
ファイル: ensembl_ops.py プロジェクト: rosinaSav/RBP_motifs
def get_MSA(coords, method, species_set, query_species, version, force_strand = True):
    '''
    Fetch the genome alignment blocks overlapping a sequence region.

    Runs the external "MSA.pl" script with coords[0], coords[2] and coords[3],
    then parses its output: blocks are separated by "|||", entries within a
    block by ">", and each entry is a name line followed by sequence lines.
    The name line is split on "/": the first part is the species name, the
    remaining parts joined with "-" form the coordinate key.

    Returns a dict of dicts: {species name: {coordinate key: sequence}} with
    upper-cased sequences. If coords[6] is "-" and force_strand is true, each
    sequence is reverse complemented.
    '''
    reverse = bool(coords[6] == "-" and force_strand)

    raw_output = run_process(["perl", "MSA.pl", method, species_set, version, coords[0], coords[2], coords[3], query_species])

    # Break the output into blocks, each a list of entries, each a list of lines.
    blocks = []
    for chunk in raw_output.split("|||"):
        if not chunk:
            continue
        blocks.append([entry.split("\n") for entry in chunk.split(">") if entry])

    MSA_dict = {}
    for block in blocks:
        for entry_lines in block:
            name_parts = entry_lines[0].split("/")
            true_name = name_parts[0]
            region_key = "-".join(name_parts[1:])
            per_species = MSA_dict.setdefault(true_name, {})
            sequence = "".join(entry_lines[1:]).upper()
            if reverse:
                sequence = str(Seq(sequence, IUPAC.unambiguous_dna).reverse_complement())
            per_species[region_key] = sequence
    return(MSA_dict)
コード例 #27
0
	def is_site_confirmed(self, mt_id2sites_ls, line, max_mis_match_perc, min_no_of_mismatches, max_esc_length):
		"""Check one '|'-separated match line against the known sites of its matrix.

		The last character of ``line`` is dropped and the rest is split on
		'|'. Field 0 is the matrix id, field 1 carries the strand as its
		second-to-last character, and field 4 is the matched sequence. A
		sequence on the '-' strand is reverse complemented, then upper-cased.

		The allowed number of mismatches comes from
		``get_no_of_mismatches_allowed``. ``mt_id2sites_ls[mt_id][0]`` selects
		the comparison: 0 compares against a consensus
		(``get_no_of_mismatches_for_consensus``), 1 against the individual
		sites (``get_no_of_mismatches_for_site``).

		Returns:
			Whatever the selected comparison method returns, or None (after a
			message on stderr) when the site-list type is neither 0 nor 1.
		"""
		### 1st parse (copied from transfacdb.py)
		fields = line[:-1].split('|')
		mt_id = fields[0].strip()	#remove spaces
		strand = fields[1].strip()[-2]
		sequence = fields[4].strip()

		if strand == '-':	#take the reverse complement
			# str() replaces Seq.tostring(), which current Biopython no longer has.
			sequence = str(Seq(sequence).reverse_complement())
		#transform it into upper case
		sequence = sequence.upper()

		no_of_mismatches_allowed = self.get_no_of_mismatches_allowed(
			sequence, max_mis_match_perc, min_no_of_mismatches, max_esc_length)

		#check the no_of_mismatches
		sites_ls = mt_id2sites_ls[mt_id]
		if sites_ls[0] == 0:	#it's consensus
			return self.get_no_of_mismatches_for_consensus(
				sequence, sites_ls[1], no_of_mismatches_allowed, max_esc_length)
		elif sites_ls[0] == 1:	#it's the sequence where the consensus is derived
			return self.get_no_of_mismatches_for_site(
				sequence, sites_ls, no_of_mismatches_allowed, max_esc_length)
		else:
			sys.stderr.write("Wrong type of sites_ls of mt_id2sites_ls: %s.\n"%sites_ls[0])
			return None
コード例 #28
0
ファイル: seq_utils.py プロジェクト: synbiochem/synbiochem-py
def translate(seq, trans_table=CodonTable.unambiguous_dna_by_name["Standard"],
              min_prot_len=128):
    '''Translates supplied nucleotide sequence in all 6 reading frames.

    Each of the three frames of the sequence ('+') and of its reverse
    complement ('-') is trimmed to whole codons and translated with
    trans_table. Every stretch between stop codons ("*") that is at least
    min_prot_len residues long is reported.

    Returns a list of tuples
    (start, end, strand, frame, protein length, protein string), where start
    and end are nucleotide offsets on the strand that was translated.
    '''
    result = []

    seq = Seq(seq)

    for strand, nuc in [('+', seq), ('-', seq.reverse_complement())]:
        for frame in range(3):
            # Trim the frame to whole codons using an explicit end index. The
            # previous negative end "-(len % 3)" evaluated to -0 == 0 when the
            # frame length was already a multiple of 3, which produced an
            # empty slice and silently dropped that entire frame.
            usable = max(len(nuc) - frame, 0)
            codon_end = frame + usable - usable % 3
            trans = str(nuc[frame:codon_end].translate(trans_table))
            trans_len = len(trans)
            aa_start = 0
            aa_end = 0

            while aa_start < trans_len:
                aa_end = trans.find("*", aa_start)
                if aa_end == -1:
                    # No further stop codon: the stretch runs to the end.
                    aa_end = trans_len
                if aa_end - aa_start >= min_prot_len:
                    start = frame + aa_start * 3
                    end = frame + aa_end * 3

                    result.append((start, end, strand, frame,
                                   len(trans[aa_start:aa_end]),
                                   trans[aa_start:aa_end]))
                aa_start = aa_end + 1
    return result
コード例 #29
0
def find_and_score(sequence):
	"""Score every 30-nt window with 'GG' at offsets 25-26 on both strands.

	The scoring model is unpickled from 'crispr_app/V3_model_nopos.pickle'.
	The sequence is scanned first, then its reverse complement. For each
	30-nt window whose characters 25-26 are 'GG', the window is scored with
	``calculateScore`` and its characters 4-23 and 24-26 are recorded.

	Returns:
		Four parallel lists ``(scores, indices, complements, PAMs)``.
		``indices`` holds ``i + 21`` for a forward window starting at ``i``
		and ``len(sequence) - (i + 21)`` for a reverse-complement window.
	"""
	scores = []
	indices = []
	complements = []
	PAMs = []

	#Load scoring file (closed as soon as the model is read)
	with open('crispr_app/V3_model_nopos.pickle', 'rb') as model_file:
		model = pickle.load(model_file)

	reverseComp = str(Seq(sequence).reverse_complement())

	#Score in 5-->3 direction, then in 3-->5 (reverse complement) direction
	for strand_seq, is_forward in ((sequence, True), (reverseComp, False)):
		# "+ 1" so the last full 30-nt window is scanned too; the previous
		# range(len - 30) stopped one window short.
		for i in range(len(strand_seq) - 30 + 1):
			toScore = strand_seq[i:i+30]
			if toScore[25:27] != 'GG':
				continue
			complements.append(toScore[4:24])
			PAMs.append(toScore[24:27])
			scores.append(calculateScore(toScore, model))
			if is_forward:
				indices.append(i+21)
			else:
				indices.append(len(sequence)-(i+21))
	return scores, indices, complements, PAMs
コード例 #30
0
ファイル: orfs.py プロジェクト: Chris7/edge
def detect_orfs(seq):
    """Find open reading frames in all six frames of a nucleotide sequence.

    Each of the three frames of ``seq`` and of its reverse complement is
    translated, and every stretch that runs from an 'M' to the next stop
    codon ('*') and spans at least ``min_protein_len`` residues is reported.

    ``trans_table`` and ``min_protein_len`` are not defined in this function;
    they must be provided by the enclosing module.

    Returns:
        A list of dicts with keys ``name`` ('ORF frame 1'..'ORF frame 3'),
        ``start``, ``end`` (1-based coordinates) and ``strand`` (+1 or -1).
    """
    orf_list = []

    seq = Seq(seq)
    seq_len = len(seq)
    # number of whole codons in the input sequence
    aa_len = int(math.floor(seq_len/3.0))

    for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
        for frame in range(3):
            trans = str(nuc[frame:].translate(trans_table))
            trans_len = len(trans)
            aa_start = 0
            aa_end = 0

            # go through the translation and find end codons that follow a
            # start codon.
            while aa_start < trans_len and aa_start < aa_len:
                aa_end = trans.find("*", aa_start)
                # has_stop is always 1 here: the loop breaks below when no
                # stop codon is found, so the stop codon is always counted.
                has_stop = 1
                if aa_end == -1:
                    # no more stop codon, just abort...
                    break

                # we start looking for a M at the earliest at aa_end-aa_len+1,
                # since we don't want an ORF that's actually bigger than the
                # original sequence
                if aa_start < aa_end-aa_len+1:
                    aa_start = aa_end-aa_len+1
                start_codon = trans.find('M', aa_start, aa_end)

                # is there a start codon? and is it before end of sequence
                # (remember we doubled up the sequence earlier to detect orfs
                # crossing boundaries)
                # NOTE(review): no doubling of the sequence is visible in this
                # function - confirm whether callers pass a doubled sequence.
                if start_codon == -1 or start_codon >= aa_len:
                    assert(aa_end != -1)
                    aa_start = aa_end+1
                    continue

                if aa_end-start_codon >= min_protein_len:
                    # the following start and end need to start with
                    # 1, not 0.
                    if strand == 1:
                        start = frame+start_codon*3+1
                        end = frame+aa_end*3+has_stop*3
                        # size is computed but not used afterwards
                        size = end-start+1
                        # wrap an end that lies past the sequence length
                        if end > seq_len:
                            end = end % seq_len
                    else:
                        # map reverse-complement offsets back onto the input
                        start = seq_len-frame-aa_end*3-has_stop*3+1
                        end = seq_len-frame-start_codon*3
                        # size is computed but not used afterwards
                        size = end-start+1
                        # wrap a negative start around the sequence length
                        if start < 0:
                            start = seq_len+start

                    f = dict(name='ORF frame '+str(frame+1), start=start, end=end, strand=strand)
                    orf_list.append(f)

                # continue scanning after this stop codon
                aa_start = aa_end+1

    return orf_list
コード例 #31
0
ファイル: thingDoer2000.py プロジェクト: apmills/blumeria
        ])

# Extract every region listed in `extract` from the genome FASTA and write
# the sequences (reverse complemented for '-' strand entries) to bluGenes.fa.
total = len(extract)
current = 0
with open('../db/blumeria/latest/Bgt_genome_v2_1.fa', 'r') as genomeFile:
    genomeLines = genomeFile.readlines()
for start, stop, contig, name, strand in extract:
    print(str(round(((current / total) * 100), 4)) + '% Done', end='\r')
    # NOTE(review): place / thisContig are reset for every region but not read
    # in this block; presumably genomeSlicer uses them as module globals -
    # confirm before removing.
    place = 1
    thisContig = False
    output = [genomeSlicer(start, stop, x, contig) for x in genomeLines]
    selection = [x for x in output if x is not None]
    bluSeq = ''.join(selection)
    if strand == '-':
        bluSeqC = Seq(bluSeq)
        bluSeqC = bluSeqC.reverse_complement()
        geneGenie.append([name, str(bluSeqC)])
    else:
        geneGenie.append([name, str(bluSeq)])
    current += 1

# print (upstreamGirl)
with open('bluGenes.fa', 'w') as outfile:
    for name, seq in geneGenie:
        outfile.write('>' + name + '\n')
        outfile.write(seq + '\n')
# NOTE(review): time.clock() was removed in Python 3.8; it is kept here
# because `tic` is set outside this block and both must use the same clock.
toc = time.clock()
print('And it only took ' + str(toc - tic) + ' seconds')
コード例 #32
0
def rev_comp(seq):
    """Return the reverse complement of a DNA string as a plain string."""
    flipped = Seq(seq, generic_dna).reverse_complement()
    return str(flipped)
コード例 #33
0
ファイル: 1h.py プロジェクト: roshan2004/ROSALIND-1
def reverse_complement(seq):
    """Return the reverse complement of the DNA string ``seq`` as a string."""
    return str(Seq(seq, generic_dna).reverse_complement())
コード例 #34
0
# Without Biopython: build the reverse complement by hand.
dna = "AACTCCGTATCGGCTTAGCGCCTGACTTAACCACAGACCCGCCTTATGAGCTTCAACGGAAAGTATGTATCGGCCCCTTCCTATTTGATGTTAATCGCTACTTGGTATTGGCTGATTGCTTCCCTATTTATTTGAAGAGAAGTACCTGTTTCCTTGAAAGTTTGTATTCTTCCCATAAGTACCTTTTCCAAACTTACAGGACACCTCGTTGAATGGCCTAAGCCTAGCTGGCATACTAAGGCTAGTGTGTTAGGTTACAAGTTGGCTCCCTCCCGACGTAGCAGGCGGGTTGGTCTGAGCAGGTCAACCTCGTTCAGTGGCGATTTGAGAGCGAGGTTCTCTGACAAGCGCCCTTTGCCGTATCGTGGCCGCAAGGAGTTCCCGATATCGCGTTGAGTGTCGTAGGAGAGACTCGGAGCTAATGATGCACTTCCTGGGCACCATGGGGCAGCCCCCTTGGGTAACGCCGGAGCATAATAATATCCCCAAAGTAGCAGTGTATACGAATGGCTACGGTCGACATAGCATTATCAATTAAGTGATTTTATGTAAAAAGCGACCTTTTTTTGCCCTTGTACCCGGGCTGAGTCCTGTCGCGGCGGTGGGAGCCCCACTGTAGTCGGGGTTATGTGCTAGTACACCTAAAGTTAGATGGATGTCTAGTCCCTCCAACAATACCCCTAGCGCTGAGGTTCTTTGACTTCCTTTGATTTTTCAACCGAGCTTAATCACACAAACGGTCAGGATAAGTTATCAAACATTCCCTCGTGTAATTCCTCAACGCACTCGTCATACACGGATGGGCAGTACACGCAGCCCTGCGTCCGGCCACCTTGCAAGCCATGGGCGCATTCCCATGTGGAATTCCAGTGTAAGCACCACAGGCAGTGGTTTATATCCTATACCACTCTTGTTAGTGCGAACCTAGGTACGCGACAGCCTCGACCGAGGTCCCTATCACAACCGGAAATTTGCCGATGA"
dna_reverse = dna[::-1]

# Watson-Crick partner of each base.
complement_of = {"A": "T", "T": "A", "C": "G", "G": "C"}

complemented = []
for i in dna_reverse:
    if i in complement_of:
        complemented.append(complement_of[i])
    else:
        # Any character other than A/T/C/G is reported and skipped.
        print("error")
dna_reverse_complement = "".join(complemented)

print(dna_reverse_complement)

# With Biopython: Seq provides reverse_complement() directly.
from Bio.Seq import Seq

dna = Seq("ACATCAGCATGCATGCATGCATCGATCGATGCATCGATGCATCGATGCATGCATCGATCGATCGATCT")
print(dna.reverse_complement())
コード例 #35
0
 def _reverse_complement(self, sequence):
     """Return the reverse complement of ``sequence`` as a Biopython ``Seq``."""
     return Seq(sequence).reverse_complement()
コード例 #36
0
def reverse(seq):
    """Return the reverse complement (not just the reverse) of ``seq`` as a string."""
    return str(Seq(seq, IUPAC.unambiguous_dna).reverse_complement())
コード例 #37
0
def _first_base_totals(seqs, counts, lengths, sign):
    """Sum read counts by first base (A/C/G/T/N) for each length in ``lengths``.

    Returns a dict mapping each base letter to a list aligned with
    ``lengths``; every total is multiplied by ``sign``. Reads whose first
    base is not one of the five letters contribute to no total.
    """
    bases = 'ACGTN'
    by_length = {}
    for read, count in zip(seqs, counts):
        tally = by_length.setdefault(len(read), dict.fromkeys(bases, 0))
        firstbase = read[0]
        for base in bases:
            if firstbase == base:
                tally[base] += sign * count
                break
    no_reads = dict.fromkeys(bases, 0)
    return dict(
        (base, [by_length.get(n, no_reads)[base] for n in lengths])
        for base in bases
    )


def bam_5prime_stranded(file, minlength=0, maxlength=1000, unique=False):
    """Count the first base of sense and antisense reads, grouped by read length.

    Each record of the BAM file is converted with ``str(read)`` and split on
    tabs. Only records whose field 5 (the CIGAR column) ends in 'M' and whose
    sequence length lies in ``minlength``..``maxlength`` are kept. A record
    whose field 1 is '16' is treated as antisense and its sequence is reverse
    complemented; every other record is treated as sense.

    Args:
        file: Path of the BAM file.
        minlength: Smallest sequence length to keep (inclusive).
        maxlength: Largest sequence length to keep (inclusive).
        unique: If True every read counts once; otherwise the count is the
            integer after the first '-' in the read name.

    Returns:
        A pandas DataFrame with one row per observed read length (ascending)
        and columns Length, senseA/C/G/T/N and antisenseA/C/G/T/N. Antisense
        totals are negative so they plot below the axis.
    """
    sense_seqs = []
    sense_counts = []
    antisense_seqs = []
    antisense_counts = []

    bamfile = pysam.AlignmentFile(file, "rb")
    try:
        for read in bamfile:
            # convert the record to text and split it on tabs
            linesplit = str(read).split("\t")
            cigar = linesplit[5]
            if not cigar.endswith('M'):
                continue
            seq = linesplit[9]
            if unique == True:  # noqa: E712 - keep the original equality test
                count = 1
            else:
                count = int(linesplit[0].split('-')[1])
            # this is where the length filtering happens
            if minlength <= len(seq) <= maxlength:
                # this splits up sense and antisense reads
                if linesplit[1] == '16':
                    antisense_seqs.append(Seq(seq).reverse_complement())
                    antisense_counts.append(count)
                else:
                    sense_seqs.append(seq)
                    sense_counts.append(count)
    finally:
        # Release the BAM handle even if parsing a record fails.
        bamfile.close()

    # every read length seen on either strand, in ascending order
    lengths = sorted(
        set(len(s) for s in sense_seqs) | set(len(s) for s in antisense_seqs)
    )

    # The stored per-read counts already equal 1 when unique is True, so the
    # tallies use them directly; antisense totals are negated for plotting.
    sense = _first_base_totals(sense_seqs, sense_counts, lengths, 1)
    antisense = _first_base_totals(antisense_seqs, antisense_counts, lengths, -1)

    print('Bases counted')

    # format the dataframe
    formatted = {'Length': lengths}
    for base in 'ACGTN':
        formatted['sense' + base] = sense[base]
        formatted['antisense' + base] = antisense[base]
    basecounts = pd.DataFrame(formatted,
                              columns=[
                                  'Length', 'senseA', 'senseC', 'senseG',
                                  'senseT', 'senseN', 'antisenseA',
                                  'antisenseC', 'antisenseG', 'antisenseT',
                                  'antisenseN'
                              ])
    return (basecounts)
コード例 #38
0
ファイル: NetDNA-1.0.py プロジェクト: manojmw/NetDNA-1.0
            'C') - 16.4) / (DNA_Sequence.count('A') + DNA_Sequence.count('T') +
                            DNA_Sequence.count('G') + DNA_Sequence.count('C'))
    except:
        Tm_more = 0
    st.write("-The Melting Temperature(Tm) of the given DNA sequence:",
             Tm_more, "°C")

# Separator between report sections (a line of three asterisks).
st.write("""
***
""")

# Section [7]: reverse complement of the given DNA sequence.
# DNA_Sequence is rebound to a Seq object here; the sections below rely on it.
DNA_Sequence = Seq(DNA_Sequence)

st.subheader("[7] Reverse Complement of the given DNA Sequence")
st.write(DNA_Sequence.reverse_complement())

# Separator between report sections.
st.write("""
***
""")

# Section [8]: transcription of the given sequence (DNA -> RNA).
st.subheader("[8] Transcribed Sequence (DNA -> RNA)")
st.write(DNA_Sequence.transcribe())

# Separator between report sections.
st.write("""
***
""")

# Section [9]: translation (RNA -> protein); only the heading is written here.
st.subheader("[9] Translated Sequence (RNA -> Protein)")
コード例 #39
0
# Number of barcodes in use, taken from the fourth command-line argument.
barcode_type = int(sys.argv[4])

# dictionary mapping a barcode label to the output file its reads go to
pointer_dict={
    'F':out_f,
    'R':out_r}

fcount = 0
rcount = 0

# set up the search for the reverse complement
# sequences of the barcodes
if barcode_type == 2:
    print('two barcodes:' 'Remember: input both barcode seqences in forward orientation')
    # forward barcode, reverse complemented, and its output file
    fr_barcode = f_barcode.reverse_complement()
    out_fr = out_base+'_FR.bam'
    out_fr = pysam.Samfile(out_fr, "wb", template = bamfile)
    # reverse barcode, reverse complemented, and its output file
    rr_barcode = r_barcode.reverse_complement()
    out_rr = out_base+'_RR.bam'
    out_rr = pysam.Samfile(out_rr, "wb", template = bamfile)
    # register the two additional output files; 'RR' must point at out_rr
    # (it previously pointed at out_fr, so the _RR.bam file was never used)
    pointer_dict['FR']=out_fr
    pointer_dict['RR']=out_rr
    frcount = 0
    rrcount = 0
elif barcode_type == 1:
    print('one barcode:' 'Remember: input the forward and reverse complement sequences')
else:
    print('something odd, i shuld not be here with barcode type')
コード例 #40
0
                        print "\nmismatches_in_hsp: " + str(mismatches_in_hsp)
                        print "\nQuery:   " + query.upper()
                        print "Subject: " + str(lookup_mature[mature_title])
                        print rec.query

                        count = count + 1
                        found = True
                        counter = 1

                        if (int(
                                strand.split(',')[1].split(' ')[1].split(')')
                            [0]) == -1):
                            mature_fasta.write('>>' + str(rec.query) + '-rc' +
                                               '\n')
                            mature_fasta.write(
                                (str(query_seq.reverse_complement()) + '\n'))
                        else:
                            mature_fasta.write('>>' + str(rec.query) + '\n')
                            mature_fasta.write(
                                str(record_index[rec.query].seq) + '\n')
                        mature_fasta.write(str(hit) + '\n')
                        mature_fasta.write(
                            str(lookup_mature[mature_title]) + '\n')

                        mature_summary.write(
                            str(count) + ',>>' + str(rec.query) + ',' + hit +
                            ',' + str(hsps[10]) + ',' + str(hsps[12]) + ',' +
                            str(hsps[-3]) + ',' + str(hsps[-1]) + ',' +
                            strand_csv + ',' + out_score + ',' +
                            str(hsps[5]).split(',')[0] + ',' +
                            str(len(query)) + ',' + str(len(x.match)) + ',' +
コード例 #41
0
# Pri načrtovanju oligonukleotidov je priporočljivo upoštevati nekaj osnovnih pravil, ki pripomorejo k večji uspešnosti reakcije in manjši količini nespecifičnih produktov. Z iskanjem po spletu lahko najdemo kar nekaj priporočil, na primer [navodila podjetja Addgene](https://www.addgene.org/protocols/primer-design/), nekaj osnovnih pa je tukaj:
# * temperatura tališča ($T_m$) smernega in protismernega oligonukleotida se naj ne bi razlikovala za več kot 5 °C (kar nam omogoča njuno optimalno vezavo na matrico pri enaki temperaturi prileganja ($T_a$)), hkrati pa naj bo njuna temperatura tališča nekje med 50 in 60 °C,
# * delež parov GC naj bo med 40 in 60 %.
#
# ---
# ## Primeri kode
#
# Spodaj je predstavljen zgled, kako definiramo nukleotidno zaporedje in dobimo komplementarno ter obratno-komplementarno zaporedje. Prikazano je tudi, kako preštejemo določen nukleotid v zaporedju.

# In[1]:

from Bio.Seq import Seq
my_seq = Seq('AGTACACTGGT')
print(my_seq)
print(my_seq.complement())
print(my_seq.reverse_complement())
print(my_seq.count('A'))  # count one particular nucleotide
print(len(my_seq))  # length of the sequence

# Another way to count nucleotides is to build a dictionary, for example:

# In[2]:

freq = {}
for x in my_seq:
    freq[x] = my_seq.count(x)
print(freq)  # prints the dictionary
print('A:', freq['A'])  # prints how many A's the sequence contains

# Pogosto želimo, da so vsi nukleotidi napisani bodisi z velikimi ali malimi črkami. Za ta namen lahko uporabimo `upper` ali `lower`:
コード例 #42
0
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.Seq import MutableSeq

# Print Seq's base classes (to check whether it subclasses str).
print(Seq.__bases__)

# A Seq object can be interpolated into an f-string.
my_seq = Seq("AGTACACTGGT", IUPAC.unambiguous_dna)
print(f"The value of my_seq is {my_seq}.")

# A Seq object can also be passed to str.format().
print("my_seq can be expressed as a string using the format method: {}".format(
    my_seq))

# Rebind the name to the reverse complement and print the new value.
my_seq = my_seq.reverse_complement()
print(f"The value of my_seq is {my_seq}.")

# List the attributes and methods that MutableSeq exposes.
print(f"MutableSeq attributes and methods are {dir(MutableSeq)}.")
コード例 #43
0
ファイル: simulations.py プロジェクト: Steap/Circle-Map
    def simulate_read_with_errors(self, right_read, left_read, common_id,
                                  ins_rate1, ins_rate2, del_rate1, del_rate2,
                                  pid):
        """Simulate sequencing errors on a read pair with ``art_illumina``.

        Works inside ``temp_files_<pid>``: each read is written as a
        one-record FASTA file, the external ``art_illumina`` command is run
        on it through the shell, and the resulting FASTQ is read back. The
        right read is reverse complemented before simulation. The original
        working directory is always restored on exit.

        Args:
            right_read: Sequence of the right read.
            left_read: Sequence of the left read.
            common_id: Identifier shared by both reads of the pair.
            ins_rate1, ins_rate2: Values passed to ``-ir`` and ``-ir2``.
            del_rate1, del_rate2: Values passed to ``-dr`` and ``-dr2``.
            pid: Suffix of the temporary directory to work in.

        Returns:
            ``(left_record, right_record)`` as parsed by ``SeqIO.read``, or
            None (after a warning) when parsing either FASTQ raises
            ``ValueError``.
        """
        original_dir = os.getcwd()
        os.chdir("temp_files_%s" % pid)
        try:
            # unique identifiers for right and left reads; "space" is a
            # placeholder that is replaced once the simulated FASTQ is read
            right_read_id = "2:N:0:CGCTGTG"
            right_id = common_id + "space" + right_read_id
            left_read_id = "1:N:0:CGCTGTG"
            left_id = common_id + "space" + left_read_id

            # use art to simulate the quality scores and the error rate:
            # create a one read genome
            with open("left_read_%s.fa" % (self.process), "w") as left_fasta:
                left_fasta.write(">" + left_id + "\n" + str(left_read) + "\n")

            # sim the read with art
            sp.call(
                "art_illumina -q -na -ss HS25  -ir %s -ir2 %s -dr %s -dr2 %s -nf 0 -i left_read_%s.fa -l %s -f 1 -o left%s"
                % (ins_rate1, ins_rate2, del_rate1, del_rate2, self.process,
                   self.read_length, self.process),
                shell=True,
                stdout=sp.DEVNULL,
                stderr=sp.STDOUT)

            with open("left%s.fq" % (self.process), 'r') as left:
                left_read = left.read().replace('space', '   ').replace(
                    '1:N:0:CGCTGTG-1', '1:N:0:CGCTGTG')

            # get the reverse complement of the right read
            right_read = Seq(right_read, generic_dna)
            right_read = right_read.reverse_complement()

            with open("right_read_%s.fa" % (self.process), "w") as right_fasta:
                right_fasta.write(">" + right_id + "\n" + str(right_read) + "\n")

            # sim the read with art
            sp.call(
                "art_illumina -na -q -ss HS25  -ir %s -ir2 %s -dr %s -dr2 %s -nf 0 -i right_read_%s.fa -l %s -f 1 -o right%s"
                % (ins_rate1, ins_rate2, del_rate1, del_rate2, self.process,
                   self.read_length, self.process),
                shell=True,
                stdout=sp.DEVNULL,
                stderr=sp.STDOUT)

            with open("right%s.fq" % (self.process), 'r') as right:
                right_read = right.read().replace('space', '   ').replace(
                    '1:N:0:CGCTGTG-1', '2:N:0:CGCTGTG')

            # sometimes the reading fails; catch it and skip this round
            try:
                right_record = SeqIO.read(StringIO(right_read), "fastq")
                left_record = SeqIO.read(StringIO(left_read), "fastq")
                return (left_record, right_record)
            except ValueError:
                warnings.warn('Catched ValueError in a sampling round. Skipping')
                return (None)
        finally:
            # Restore the caller's working directory on every exit path,
            # including unexpected exceptions.
            os.chdir(original_dir)
コード例 #44
0
def reverse_complement(seq):
    """Return the reverse complement of ``seq`` (ambiguous DNA alphabet) as a string."""
    return str(Seq(seq, IUPAC.ambiguous_dna).reverse_complement())
コード例 #45
0
ファイル: bio.py プロジェクト: Breenori/Development
# Print the first element of mySequence (defined earlier in the script).
print(mySequence[0])
stringSequence = str(mySecondSequence)

# Build a FASTA-style record by string formatting and print it.
format_fast_string = ">Genname\n%s\n" % mySecondSequence
print(format_fast_string)

# Joining / concatenating sequences
# CAUTION when working with different kinds of Seq: check the types
dna_seq = Seq("ACGTA")
protein_seq = Seq("EVRNAK")

print ("Sum: ", protein_seq + dna_seq)
print(dna_seq)
print(dna_seq.complement())
print(dna_seq.reverse_complement())

# Transcription and translation
coding_dna = Seq("ATGGCCATTGTAATG")
# template_dna is assigned here but not used in the lines below
template_dna = coding_dna.reverse_complement()
messenger_rna = transcribe(coding_dna)
print(messenger_rna)

print(back_transcribe(messenger_rna))
print(translate(messenger_rna))

# Print the value returned by GC() for a sequence.
myThirdSequence = Seq("GATCGATGGGGGCTATCC")
print(GC(myThirdSequence))

# MutableSeq objects
print(dna_seq)
コード例 #46
0
ファイル: revc.py プロジェクト: CharuniKaushalya/ROSALIND
from Bio.Seq import Seq
import Bio.Alphabet

# read the DNA sequence from the input file, dropping newline characters;
# the file is closed as soon as it has been read
with open('rosalind_revc.txt', 'r') as f:
    data = f.read().replace('\n', '')

# treat the text as an unambiguous DNA sequence
t = Seq(data, Bio.Alphabet.IUPAC.unambiguous_dna)
# write the reverse complement of the sequence to the output file
with open('_REVC.txt', 'w') as o:
    o.write(str(t.reverse_complement()))
コード例 #47
0
def rev_seq(seq):
	"""Return the reverse complement of DNA string ``seq`` as a ``str``."""
	reverse_complemented = Seq(seq, generic_dna).reverse_complement()
	return str(reverse_complemented)
コード例 #48
0
# Read the whole sequence file and glue its lines into one string.
# splitlines() already drops the line breaks, so the replace() is a no-op.
sequence_lines = SEQUENCE_FILE.read().splitlines()
seq = "".join(sequence_lines).replace('\n', '')

# Gene table: one gene per line, tab-separated as start, end, strand.
GENES_FILE = open(args['GENES_FILE'], 'r')
gene_lines = GENES_FILE.read().splitlines()

for line in gene_lines:
    data = line.split('\t')
    start, end, strand = int(data[0]), int(data[1]), data[2]
    if strand == "+":
        # Direct strand: show the NUM characters of seq ending at `end`.
        print "gene: " + str(start) + "-" + str(
            end) + " DIRECT: ..." + seq[end - NUM:end]
    elif strand == "-":
        # Complementary strand: take NUM characters beginning at 1-based
        # position `start`, wrap them in a Seq and reverse complement them.
        subseq = Seq(seq[start - 1:start + NUM - 1], unambiguous_dna)
        subseq = str(Seq.reverse_complement(subseq))
        print "gene: " + str(start) + "-" + str(
            end) + " COMPLEMENTARY: ..." + subseq
############END########
# The script stops here; everything below this call is unreachable.
sys.exit(0)

gene_lines, gene_filetype = geneTools.readORFLines(args['GENES_FILE'])

# Draw random genes until NUM_GENES distinct ones have been collected.
NUM_GENES = 100
gene_lines_rand = []
# NOTE(review): `attempts` is never updated in the visible code, so the loop
# cannot finish if fewer than NUM_GENES distinct genes exist -- confirm.
attempts = 0
while len(gene_lines_rand) < NUM_GENES:
    index = random.randint(0, len(gene_lines) - 1)
    gene = geneTools.getLineData(gene_lines[index], gene_filetype)
    if gene not in gene_lines_rand:
        gene_lines_rand.append(gene)
コード例 #49
0
def get_reverse_complement(nuc_seq):
    """Return the reverse complement of nucleotide string ``nuc_seq``.

    The input is treated as generic DNA and the result is a plain ``str``.
    """
    return str(Seq(nuc_seq, generic_dna).reverse_complement())
コード例 #50
0
ファイル: translate.py プロジェクト: emmahodcroft/augurlinos
 def str_reverse_comp(str_seq):
     """Return the reverse complement of string ``str_seq`` as a string."""
     return str(Seq(str_seq).reverse_complement())
コード例 #51
0
def main():
    """Build a McDonald-Kreitman style count table for one scaffold.

    Expects exactly four command-line arguments:

    1. path of the gffutils feature database,
    2. reference genome FASTA file,
    3. allele table (position, reference base, then allele / allele
       frequency column pairs; only the ingroup and the first outgroup
       are analysed),
    4. name of the chromosome/scaffold to genotype.

    For every allele-table position that lies in a coding region, the codon
    is rebuilt from the reference with the ingroup and outgroup bases
    substituted, and the site is tallied per gene as synonymous or
    nonsynonymous, polymorphic or fixed.

    Writes ``MKtable_<scaffold>.txt`` and ``out_<scaffold>.txt`` to the
    current directory and always terminates through ``sys.exit()``.
    """
    # parse command-line arguments
    if len(sys.argv) != 5:
        print("Didn't specify right number of input arguments, exiting...")
        sys.exit()
    gff3DB = sys.argv[1]
    refGenomeFile = sys.argv[2]
    alleleTableFilename = sys.argv[3]
    chromToGenotype = sys.argv[4]

    # Column indices inside each mkDict entry.
    SYN_POLY, NONSYN_POLY, SYN_FIXED, NONSYN_FIXED = 0, 1, 2, 3

    db = gffutils.FeatureDB(
        gff3DB)  # this database was created in previous step
    # Assumption in analyses below is that exons are at least 1 a.a. residue
    # long (3 bp); confirm this.
    areExonsAtLeast3bp(db)

    # create data structs to convert position -> functional effect
    # Pos_2_CDS[pos] = [ sorted CDS ids ]
    # CDSinfo[firstCDS] = (0:geneName, 1:strand, 2:frame, 3:startPos,
    #                      4:endPos, 5:RefSeqScaffold, 6:UniqueGeneName)
    Pos_2_CDS, CDSinfo = shFn.constructCdsDicts(db, chromToGenotype)

    # refGenomeSeqDict[scaffoldName].seq = sequence of that scaffold
    refGenomeSeqDict = SeqIO.to_dict(SeqIO.parse(refGenomeFile, "fasta"))
    translateCodonDict = shFn.constructTranslateCodonDict()

    # mkDict[gene] = [SynPoly, NonsynPoly, SynFixed, NonsynFixed]
    mkDict = {}
    for gene in db.features_of_type('gene'):
        if gene.seqid == chromToGenotype:
            mkDict[gene['Name'][0]] = [0, 0, 0, 0]

    if not os.path.isfile(alleleTableFilename):
        print(
            "Alleletable -> MKtable script cannot locate the allele table! exiting..."
        )
        sys.exit()

    biCoding = 0
    numOutgroupPolymorphic = 0
    numOutgroupFixed = 0
    # File structure:
    # Col 1: position
    # Col 2: Refbase
    # Col 3: Species1 allele
    # Col 4: Species1 allele frequency
    # ...
    # Col n-1: SpeciesX allele
    # Col n: SpeciesX allele frequency
    with open(alleleTableFilename, 'r') as f:
        for line in f:
            if line.startswith('Position'):
                continue  # header row
            mutInfo = line.split()
            if not mutInfo:
                continue  # blank line, nothing to parse
            posInScaff = int(mutInfo[0])
            refBase = str(mutInfo[1])
            # NOTE: only analysing ingroup and a single outgroup, for now
            ingroupBase = str(mutInfo[2])
            ingroupAF = float(mutInfo[3])
            outgroupBase = str(mutInfo[4])
            outgroupAF = float(mutInfo[5])

            # Distinct alleles at this site across reference, ingroup and
            # outgroup; used to tell biallelic from multiallelic sites.
            numAlleles = len({refBase, ingroupBase, outgroupBase})

            if posInScaff not in Pos_2_CDS:
                continue  # position is not in a coding region
            biCoding += 1

            # An allele frequency strictly between 0 and 1 means polymorphic;
            # anything else is treated as fixed.
            inPolymorphic = 0.0 < ingroupAF < 1.0
            inFixed = not inPolymorphic
            outPolymorphic = 0.0 < outgroupAF < 1.0
            outFixed = not outPolymorphic
            if outPolymorphic:
                numOutgroupPolymorphic += 1
            else:
                numOutgroupFixed += 1

            # first of the sorted CDS ids this position corresponds to
            firstCDS = Pos_2_CDS[posInScaff][0]
            geneName = CDSinfo[firstCDS][6]
            strand = CDSinfo[firstCDS][1]
            refseqScaffold = CDSinfo[firstCDS][5]
            if strand == "+":
                p1, p2, p3 = shFn.getCodonPositionsInReference_forwardStrand(
                    posInScaff, firstCDS, CDSinfo)
            elif strand == "-":
                p1, p2, p3 = shFn.getCodonPositionsInReference_reverseStrand(
                    posInScaff, firstCDS, CDSinfo)
            else:
                # Unknown strand: skip rather than reuse the codon positions
                # left over from a previous line.
                continue

            # ignore sites in which flanking bases could not be found, i.e.
            # flanking exon not located from partial gene model
            if p1 is None:
                continue

            posInCodonToChange = shFn.getPosInCodon(p1, p2, p3, posInScaff)

            # scaffolds in gff3 RefSeq IDs, scaffolds in reference genome
            # Genbank IDs
            scaffoldSeq = refGenomeSeqDict[refseqScaffold].seq
            codonRef = [str(scaffoldSeq[(p - 1):p]).upper()
                        for p in (p1, p2, p3)]
            if "N" in codonRef:
                continue

            # Work on copies so the reference codon stays untouched.
            codonIngroup = list(codonRef)
            codonOutgroup = list(codonRef)
            codonIngroup[posInCodonToChange] = ingroupBase.upper()
            codonOutgroup[posInCodonToChange] = outgroupBase.upper()

            codonRefStr = ''.join(codonRef)
            codonIngroupStr = ''.join(codonIngroup)
            codonOutgroupStr = ''.join(codonOutgroup)

            if strand == "-":
                # Codons were read off the forward strand; flip them so they
                # read in the gene's own direction.
                codonRefStr = str(
                    Seq(codonRefStr, generic_dna).reverse_complement())
                codonIngroupStr = str(
                    Seq(codonIngroupStr, generic_dna).reverse_complement())
                codonOutgroupStr = str(
                    Seq(codonOutgroupStr, generic_dna).reverse_complement())

            counts = mkDict[geneName]

            if inFixed:
                # Only a fixed difference between ingroup and outgroup is
                # counted. A polymorphic outgroup is ignored for now (it could
                # be a fixed difference still segregating in the outgroup, or
                # an outgroup polymorphism where nothing happened in the
                # ingroup); identical fixed codons carry no information.
                if outFixed and codonIngroupStr != codonOutgroupStr:
                    if (translateCodonDict[codonIngroupStr] ==
                            translateCodonDict[codonOutgroupStr]):
                        counts[SYN_FIXED] += 1
                    else:
                        counts[NONSYN_FIXED] += 1
                continue

            # From here on the ingroup is polymorphic.
            if numAlleles == 2:
                # biallelic site: one polymorphism
                if (translateCodonDict[codonIngroupStr] ==
                        translateCodonDict[codonOutgroupStr]):
                    counts[SYN_POLY] += 1
                else:
                    counts[NONSYN_POLY] += 1
            elif numAlleles > 2:
                # multiallelic: ingroup alternate allele != outgroup allele;
                # these sites count as 1 polymorphism, 1 divergence
                # add 1 to polymorphism
                if (translateCodonDict[codonIngroupStr] ==
                        translateCodonDict[codonRefStr]):
                    counts[SYN_POLY] += 1
                else:
                    counts[NONSYN_POLY] += 1
                # add 1 to divergence, split in two halves: ingroup vs
                # outgroup and reference vs outgroup
                if (translateCodonDict[codonIngroupStr] ==
                        translateCodonDict[codonOutgroupStr]):
                    counts[SYN_FIXED] += 0.5
                else:
                    counts[NONSYN_FIXED] += 0.5
                if (translateCodonDict[codonRefStr] ==
                        translateCodonDict[codonOutgroupStr]):
                    counts[SYN_FIXED] += 0.5
                else:
                    counts[NONSYN_FIXED] += 0.5

    with open('MKtable_%s.txt' % chromToGenotype, 'w') as mkFile:
        print(
            "GeneName\tSynPolymorphic\tNonsynPolymorphic\tSynFixed\tNonsynFixed",
            file=mkFile)
        for gene in mkDict.keys():
            print(gene, "\t", end="", file=mkFile)
            for i in (0, 1, 2, 3):
                print(mkDict[gene][i], end="\t", file=mkFile)
            print(file=mkFile)

    with open('out_%s.txt' % chromToGenotype, 'w') as outFile:
        print("Coding biallelic polymorphic sites: :", biCoding, file=outFile)
        print("Number of sites with polymorphic outgroup: ",
              numOutgroupPolymorphic,
              file=outFile)
        print("Number of sites with fixed outgroup: ",
              numOutgroupFixed,
              file=outFile)
    sys.exit()
コード例 #52
0
class NextOrf:
    """Find open reading frames in every record of a FASTA file and print them.

    ``options`` is a mapping read with these keys: ``table`` (genetic code
    id), ``strand`` (``'both'``, ``'plus'`` or ``'minus'``), ``start`` and
    ``stop`` (slice applied to each record), ``nostart``, ``minlength``,
    ``maxlength``, ``output`` (``'aa'``, ``'nt'`` or ``'pos'``) and ``gc``.

    Constructing an instance immediately processes the whole file.
    """

    def __init__(self, file, options):
        """Store the settings, build the codon table and process ``file``."""
        self.options = options
        self.file = file
        self.genetic_code = int(self.options['table'])
        self.table = makeTableX(CodonTable.ambiguous_dna_by_id[self.genetic_code])
        self.counter = 0
        self.ReadFile()

    def ReadFile(self):
        """Scan each FASTA record for ORFs on the requested strand(s)."""
        # The context manager guarantees the input file is closed.
        with open(self.file) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                self.header = record.id
                strand_option = self.options['strand']
                plus = strand_option in ['both', 'plus']
                minus = strand_option in ['both', 'minus']
                start, stop = int(self.options['start']), int(self.options['stop'])
                s = str(record.seq).upper()
                # A non-positive stop means "up to the end of the record".
                if stop > 0:
                    s = s[start:stop]
                else:
                    s = s[start:]
                self.seq = Seq(s, IUPAC.ambiguous_dna)
                self.length = len(self.seq)
                self.rseq = None
                CDS = []
                if plus:
                    CDS.extend(self.GetCDS(self.seq))
                if minus:
                    self.rseq = self.seq.reverse_complement()
                    CDS.extend(self.GetCDS(self.rseq, strand=-1))
                self.Output(CDS)

    def ToFasta(self, header, seq):
        """Return ``seq`` as FASTA text with a line break after every 60 characters."""
        seq = re.sub('(.{60})', '\\1\n', seq)
        return '>%s\n%s' % (header, seq)

    def Gc(self, seq):
        """Return the GC percentage of ``seq`` rounded to one decimal.

        Only A, T, G and C are counted. Returns 0 when there is no G or C.
        """
        d = dict((nt, seq.count(nt)) for nt in 'ATGC')
        gc = d['G'] + d['C']
        if gc == 0:
            return 0
        return round(gc * 100.0 / (d['A'] + d['T'] + gc), 1)

    def Gc2(self, seq):
        """Return overall and per-codon-position GC percentages as a string.

        The result has the form ``'all%, pos1%, pos2%, pos3%'``. Characters
        other than A, T, G and C are ignored; if none are present every
        value is reported as 0.0% instead of raising ZeroDivisionError.
        """
        d = dict((nt, [0, 0, 0]) for nt in 'ATGC')
        for i in range(0, len(seq), 3):
            # A trailing partial codon simply contributes fewer positions.
            for pos, nt in enumerate(seq[i:i + 3]):
                if nt in d:
                    d[nt][pos] += 1

        gc = {}
        gcall = 0
        nall = 0
        for i in range(0, 3):
            gc_here = d['G'][i] + d['C'][i]
            n = gc_here + d['T'][i] + d['A'][i]
            gc[i] = gc_here * 100.0 / n if n else 0
            gcall += gc_here
            nall += n

        gcall = 100.0 * gcall / nall if nall else 0.0
        return '%.1f%%, %.1f%%, %.1f%%, %.1f%%' % (gcall, gc[0], gc[1], gc[2])

    def GetOrfCoordinates(self, seq):
        """Locate start and stop codons in the three forward frames of ``seq``.

        Returns a list with one entry per frame; each entry is a list of
        ``(position, kind, codon)`` tuples where ``position`` is 1-based and
        ``kind`` is 1 for a start codon and 0 for a stop codon.
        """
        s = seq.data
        n = len(seq)
        start_codons = self.table.start_codons
        stop_codons = self.table.stop_codons
        frame_coordinates = []
        for frame in range(0, 3):
            coordinates = []
            for i in range(frame, n - n % 3, 3):
                codon = s[i:i + 3]
                if codon in start_codons:
                    coordinates.append((i + 1, 1, codon))
                elif codon in stop_codons:
                    coordinates.append((i + 1, 0, codon))
            frame_coordinates.append(coordinates)
        return frame_coordinates

    def GetCDS(self, seq, strand=1):
        """Pair start and stop codons into ORFs within the length limits.

        Returns a list of ``(start, stop, length, subsequence, frame)``
        tuples, where ``frame`` is the 1-based frame number multiplied by
        ``strand``.
        """
        frame_coordinates = self.GetOrfCoordinates(seq)
        START, STOP = 1, 0
        so = self.options
        nostart = so['nostart']
        minlength, maxlength = int(so['minlength']), int(so['maxlength'])
        CDS = []
        for f, frame in enumerate(frame_coordinates, 1):
            start_site = 0
            if nostart == '1':
                # ORFs need no start codon: begin at the first position.
                start_site = 1
            # Sentinel pseudo-stop at the sequence end so an ORF that is
            # still open there gets closed.
            frame.append((self.length, 0, 'XXX'))
            for pos, codon_type, codon in frame:
                if codon_type == START:
                    if start_site == 0:
                        start_site = pos
                elif codon_type == STOP:
                    if start_site == 0:
                        continue
                    stop = pos + 2
                    length = stop - start_site + 1
                    if minlength <= length <= maxlength:
                        if nostart == '1' and start_site == 1:
                            start_site = start_site + f - 1
                        if codon == 'XXX':
                            # Trim to a whole number of codons. Floor
                            # division keeps `stop` an int on Python 3 too.
                            stop = start_site + 3 * (((stop - 1) - start_site) // 3)
                        s = seq[start_site - 1:stop]
                        CDS.append((start_site, stop, length, s, strand * f))
                    # Whether or not an ORF was recorded, begin a new search.
                    start_site = 0
                    if nostart == '1':
                        start_site = stop + 1
        return CDS

    def Output(self, CDS):
        """Print every ORF in ``CDS`` in the format chosen by options['output'].

        ``'aa'`` prints the translation as FASTA, ``'nt'`` the nucleotide
        sequence as FASTA and ``'pos'`` only the header line. Coordinates of
        negative-frame ORFs are converted back to the forward strand.
        """
        out = self.options['output']
        n = len(self.seq)
        for start, stop, length, subs, strand in CDS:
            self.counter += 1
            if strand > 0:
                head = 'orf_%s:%s:%d:%d:%d' % (self.counter, self.header, strand, start, stop)
            else:
                head = 'orf_%s:%s:%d:%d:%d' % (self.counter, self.header, strand, n - stop + 1, n - start + 1)
            if self.options['gc']:
                head = '%s:%s' % (head, self.Gc2(subs.data))

            # print(<single expression>) behaves identically on Python 2 and 3.
            if out == 'aa':
                orf = subs.translate(table=self.genetic_code)
                print(self.ToFasta(head, orf.data))
            elif out == 'nt':
                print(self.ToFasta(head, subs.data))
            elif out == 'pos':
                print(head)
コード例 #53
0
from Bio.Seq import Seq

my_dna = Seq(
    "AACATGCGTCGAATTCCGGTCCAAAACCAAGAAGCTATGGAGAAGCTTGGTGCAAAAGGAGAATCTCGTAATCGTTGGTATACAAAACCATGTTCTTGGATCGAAATGAGTTGGACTTTTAACACTGAGCTGCTAACTGATGTCTCTTACTAGCGATTCGACGTCCATGGTCGTGCAGCGGCATTAGCCTGACCGCATGATGCACTCTTTCTAGTGCGTCTGTCGGTGACTACTTAACTTGGTTGGTTCACATGATCCACTAAGGGCGTTTCTGCGGACCTGAGAACTCCGGCAATGTTAGTTACGCTGAGCTATTATGGTGAGTCCACCGTCGGGACAGCCACGCAGACGCTGGTTTGGAACCCTTGAAATATCCTGCACGCGATAGGATGTCAATATTGAATTATTAATCAACACCGTCCTTCCAGTTTTGCGCTCGCACTGCCAGTATGTACGAACAATACCTTTGTGATGCAAATACGTAAAAGTTGTGATCTGATCTCAACACCTGGCGCTTTCCTGCCGGAAAGATTCTCTTTTGAATGCCGCGGCGGACCCTAGAGTAGGACTAGTTCCTACTTGCGCGGCAAGTTTCAAATCTACAAGAATTAACGCATTCACCTCACACGAACGAGCCTGGTCGACTCACTATTACTCCCATCCGGAGCCTCCTACCCATTCTAGTGATATATTCCGGCAGTAGAGACGGATGGCTTGCCCAAGGTTGACGGCAGCGATTAAATCGTTGAGGGTGTTTAGGACCTGAAATACGGACTGATTCACGCGTTTTTGGCTGTTTCGTTTGAGACACCCTTCTCGCGCTCTGGCATTTATGAACCTAGTTTCACTGAGGCAACTACCGCAGGAACTTCTGATTCGCCTTCCACACAATATCTGGACATGTAGCCATCTTAATTTGCAGTGGCACAAGACAAATTACCCACGGTGATGCCCCAGTTATTCAGATCGCCCAACCCTAGTCACCGTAAACTGTCACCGTACGCTTAATTGGTTCGATACTTTCGCCTAACTTAAACTACCGGGGACTCGGTCTGGTACGGGAATTGCGAACGTAGATCCTATGAGCTTCGCAGATATGGCCCAACCACCAAAGACCTTACAGAGATACGGCTGATGGCCATGAGTGATCGATCCTACCAACGCGGGCAATCGATCTTTAGTAGTGCTCTCGGGAAGAGCATACAGCCGGCGAGCAGAATCTGGGTCGGAACTCAACAAGAGTGGTCACTGAGCAATAACAGTCGAACTCACAGATGAATTTATCAAACGGGGTATCCGCTGTGGCGGCCATCCAGACGCGGGTAGTAAGAGGTGCTCTACGCAGCCCTCTCGACGATTATTGTATCGATTTTCGACTCCAGTTATCAGGTTTCTATATCAAGGCTATATATTTTGACCTGGCCCCTCAGTACTCATATAGTCTCATCGAAAGGTGGTTGTCTGAGCTGTCAAAAAGCACCCGATCTGCCCCGCTCAACCCATGCCTATCGTCTTGGTTGCGTGGCGCGTTTCTGTAGTGGCTGGCAAGTTGCGATCGTAGCCTCCCGGTCTTGCCGGGACGCGGCTTTACTCCGGAGGCAAGGAATGTGTCTCTGGCTGTGGCGGAAGGATTTGACGTTCAAGGTTAACCATAATCTCCATGCGTGAGTGTTCAGCGCATGTAAGATGAGAAGATTTCCGACCTAATGGATCGTCGTCCAGCAGCGAGCGCCGCGATCAGACTAGGCATATACAAAGTTCCATGCTATTGAATCGCCCGACGTAAGACTGCCAACCAGCCTTTTCGTGCGTATCTAACGCGTTACTATGTTGCAACACCCATGGTTAGGTATAGTATATCTACACATTGAGGGCACTATAAGAGTGACGGGCGAGGCTAAAAGCAACACTTATTGTGCTGGCGTCATCGAGGACGTAACACAATACCTCAGCTACCCGATTAGATGGGTATCTTGGGAGTAGTCGTAAGCTAGA
CATGAAATCTAGGCCACTCTCGCTCTCTTTCGTCGTTAGAAATACTTATGACGCATTTTATTAACAAGACAGCGGCCATCTGCGAGCGCTGCGAGATTCACCCAGGTTTCATCTCACTCGGGCTTGGCTGGAGACGTACAAGGAGCAGGGGGAGGCTACAGATAGTTGCGCAAATGGGCTTGAGTAGCAAGTCCTTGGCACGATACTAATTACCCAATAGTTAATCTAAAGCATCTCTCGGATGTAAAGCTTAACTTAGAAATCTCTACGATCTATAGAACAAAGAGATTTACCTAGCGCTAAGTTTTTTCATAGGAGAAAGTACACCCCGGATAGGAGATTGGCACTACTTAGAGATACTGCGAACTTCCCTCACTCCTTGTGTTCTCGTGGAGTATACTCTACTTTCGAGAACAAATTGACACGGGGCGTCAACTCCGTATCTAACTGTAATATACGTCTCATCGAGCGAACGACGCGTATCAAACATGAGATTCGACATTGTCGCGCTGAAGGATTGGTGTTGGGATCCTGAACAAAAGTTCCCTGAGCGCGCTAAGCGTGATGTATAGTCGAGTTTTGGGACCACTAGACTAACTGGTCCTGTGCGGGAGGCACTAATTTGAGCGACACCGCGAACCCCGCGCCCCATTTACTTGGGTCCAAATTACCCACCCAGAACAGGGCGGACCAGTATGGACTTTAATCACAACGGGTGCCCCTTCAACGCTCATGGTGGGGCCCCAACCCCACACGACAATTTGGGTAAGCGCCGAGCGTGCTCGTTGGTCCGAAGCTTTCGTTTAGGATCAATTTGCTGGAAGAATTCTGTACGACCATCAAAATCCCCCATAGCTATCCAGTTCAGTACGACAGCCAGGGGACGCGGGAGGTCTCGTCGCGCTAGGAATTAGCCAGAATTTTATGGTACAACAATGCTAGTCTACGTTCCCAGATCAATCACATGGCGCCCGACCCCACGCAGTATGAACCTAGCGCCTCACGCAACGATTCAGCTATGGCGCTCAAAACTTTGCAAGGAACGGCTTGCCAGGTCTTCAGTACGAATCATAAAATACTGTGCCGTACTGCCTCTAGTGAACCTTCGGTGGCGACCGGTTCCGTGGTACTTCATCTAGTTAGCCTGGGCTCAGCTAAAATGTAATCCGATATGTCGTTCGCGCTCCCGTTAGGGCATACTTACCTCAGAGCGGGGAAGGGATAAAATTTGAAAGCACCCGGGCCCAGGACTCTCTTTGTCTGAACGAATTGCTGCGAGTGCTGTGGCTGAGCTGGCCGTCACCACCCTAGCTGCCATGTAAATGAACTATTGGCATTTATTATAAGTTCCCCCCCTGAGTACCCGATGTTGGTTCTCGCGGACACTAAGCGGTGCGGAGACGCGTTTTCGCTGGAATGAATGGGCCACAAAAGCAAGCGCCGATCATACCGTTCTCATGCATCGGTCGTGCGGGATACCAGGTCGAAAAGCGCACCGGAGTTAATTCTCGGCATGCTTAGAGTGGGCCTGGTTACGTCGAGACGCTATCTGCCCCTACCGGCTGATGTATTTCGATAATCCGAGTCTCGAGGCCTTGGATACACCCCAGTGTACAATTGTAGGACGTAGAACGTGATGTCTGACGCGTTGAGTGCTTTATCATGCGGACACTCCCATGATTTCTATGATGGGACGTCTGAGGTGTCCGCTGGCGAGTATATGATCAACCGTCGGGTTATTTTGAGTGGTGGGTTGTGCGCGAAGTAGTTATTGTGCTTGGAAATTTAGGTAGATGGTTTTCTGCCGAGACATAGAGCGCTTCTTAGTATTTTTGGGGCCGGGTCAAACCTTCCGACCCCGCGTAACTTCAAAGTGCAAGGACTACCTAGCAACCGATTAGCTTCAAGTGCGGGCGTCAGAGTTTAGATAAAGGGGCCGTTAATGCGCGTATCACCACATGCTATATACACTCGGCCCTGTATACTCCTCCTAATATCGC
TTGATGAACGCGTTCTACGAGCGCCTCGTACATAACCGAGGAGCCCCCCTGCCCCTGGCTATCCCCGCCACCAAGCTGTCCAAAACCCTACCCCAGGCCGGAACACCTTCTGGCATTAACCAGCAGCGCCAGTGGTAGCAATCTCCTGGGATCCTATGAGACGACGTATCGCTGTTTTTAAGCTTGCGACTGTCGGCCCGACTTCCCGGCAAGAAAAGTTAGGGTATGTGATCCGCCACACGAATCTGGTAGTTCATGCCTTTGCGCGACCGTGAGATACGCAGACTTGAAACCTCTTAGTAATCCATAACGACAATCCCTGCACGGCCACCAGCGAAAACTCTGTATAGTCTAACCGATATTGAACCATGGACACATATTTGCATGGCCGCTGGTTTTTCTCCGTATAATACTCCTTTGCGCTCCCCGATCATGATAAGGCGGGCCTCATAGTGAAACGCTGCCCGGACGCGTTCTGGATCTCGAGTCTCAACCTTTAGGTGCGCTCATGGGACGCCCCACCTTTCGTAGCAGGGGTTAGCGTTTGAACGGGACGCCCAGTGCGCCTATCTCCAGCACGGTAACCTCAACGAATCTTGCGGTGTTGTGAGATTATATAGATGTTCGGTACTTGTGTAAGATGGCAATAGGGACATAGATCTCAACTCAGTCTGCGGACGCCTCCCGGTGCGTGGCTTCAGCCGGGCGGGAGGTCGGGCAGCGTTAGGCCCTGATCACAAGTCATAGAAAGGGGGAGTGTCTGGTCTTCGAGGTAACACTTTGGTTTGACAGAACAATACCCAATAAATGTGTACTAACCACCGCAACATCGAAAGTCAACCAGCGCAGCTGAAATGATCATAGTGGGGTAGTGCGCGACTACTATAAGCACTTACCCGTTACGTTGTTATGTAGACGGTAATTCTTCCTTGGGCACCGCCGCATAGTATCTCCGATTGCGTCTCTGACAACGGTCTCGAGTCTAGCTAGTACCGGTGTCTAAGTGCATGCCTACTCTAGTGTGGACGTCTCTCAGTGTTTAGTCGAATGGTCACCCACCATGTTAGATGGGCCGTAAGTTTTAGTGGTGACGTGTGCTCGTTGTAAACCCCGAACAACCGGTTACGCCATATCTAAACCCGTTCCACGGATTCGAGCCCGAGCATGATGGTGCCCTAACCGCCAATGACGTGCCGAAACCGTATTAGATCCGCTATTACCATAAACTCCCCGGGTTTCTTACAACACATGGTCTCTACCAATATATTGTACCTAGTCGCGAATTGGACACGTTTGCCTGCTTTTTTTAGTTCCACGAATGTGCTTAGCAGCTTTACCAAAAGCGACCTCCGTAATATCGAGAAGTTTAGACTGCCTCCGTGCCTCGCAACTTGGTAAATCTGTCCGGTACTACTTAGAGTGATTGATATCGGCCTATCCCGTACACACAGTACTACAGAAATCGTTGTATCCGTACGAATACAGACCACTCGATAATGATGGGTAAACAGTCAGATTACACACCTCCTAACCCAGTCCAGTGGGGGTCTAGAACGACGTTTCTATGCAATAATGAACGAAGAACTGAGCGATAGAGACTTTAAGAGTGCCACACACGCCGCATGGGTGTTTAGAACGCTTACCGGTAGAACAGCTGTCGAGAACTGTAAAAGAAAACACAAACGTTAGCGTGCACTAAGCAATAGCTCAGATGCTATTACCATCTCTAGGATAGCGCTACAACAGGACCCTCTGGACCACCGCGCAACGTATGCGTACTTCGATCGGGGGTAGGACTCCGTTAGCTGAGGCTGCGGCATGCGGAGCACTGTAGTTTCCTGGCTGCATGTTACTCTATGTGCATGATTGTATGACGTGACATTCTCTTGAGGTAAACCGAATAAAAAGTAAATTCTACTTTAACATCACCTGCTCGACTGTTCCGCAACGCCCCCTTGGCGTCGGAGACTGCGATTAGCTCGACTAAATCCTATGTGCGA
TTATGATTGGGCTACAGCGACCCGTGCTAGACCTCGTGATCTGGAAAGGGCCTGCACAGGGAGAACATGGTGAGCCGTTTGCTGGTAATGTACCGAGACGCACGTGTGCACTTATACGCAATATGGAAGTAGGCTCGCACTAGTGCACGCTGGACCAATCGGTGTTTCCCCTAACCCCAGAAGCAGTGCATCTCTCATCGATTCATCGGACGTACATACACGGCCCTTGTTCACCATATCCGTGGATCCATGTCGCCTTACCCTCAAGGGCAGCTCCCGGGACAGTCTATAGGAAAAGGGACAGCCGGTCCCAGGTTCTATCCATAGTAGAGATAAGACCTAAAGCATTAAACTACTGAGGGTGAGCCTGAGCTAATCCCTGCATATAAGACCATAAAAGCTGAGCAAGGAGCTTAGATTTAGCTAAGCCTCGGAAACGGATCTATTTAGTCTCAGGTGAACTGCCTCATGGGGTTCACAAGCAAGGCGTCCCTAAGGCGTTTATGCACGTTCTATTAAGCCTTCGTGCTTATAGGTCTACAGCGCATGGCTTATGAGAGCGAGCGGCGGAACGTAATCCCAGCGCAAGGTACGTCTTAGCCTCCTTCGCTCGCCACGAAGATCTTATCGATTCCGTATTCTTGGAGCATACGGAGTTCTTGCATCAGTAGAATATTGCTGAGCAAGATCTGACTTTACGTTCCAGGACGCCGCAAGACGACTATAGCGTAACGTGGCCAAAGATTCCGTCCCTTCCTCGTAAGTTTTCATGGCAAGCTAGATTTTTCGACAACATTTACGACTGAGCAGCTAGTCCAGAGGGCTACCCGGATGTATCGACGGAGAAGCAGATTATTCTTCTGGCTCTCCTGAGAGAGGCTCACCCGGCTACCTCATAGCTGTGCAAAGCTCCCAGGTAGTTAAGAGCTGGATGTATATCTATCTATACGGGTAGAGGGGAGTTGCCATCGGCATTAACCGTAGACTGTAGGGCAAGACTCGCGTTTGGAGAACTTGGCGGATGCGCTTGTTCGGTAGGCAGGTGTCCACATTTAGATCACTGGTTTGGTTTCGGTTGGGGGGATTTTATGGCTGTGAAAGACTAAGTGTCCCGTTTCGCGTGTTCGTGTTGGGGTCTGCCGCCTAGGTGCACGCAACTTTCATCGTTTCGCCTCCTGTGAAGAACCCGCTATGCCAGCTAGAACAACATCCAAAGGACTCTTGTCATTAAGTCGTAGCGAGGTCCTTCGCTTACGCTTCAATGTCGAGTGTCAGGCTCACATTCGGGCCAAAGGTACCCGTCCCATTAAGTGCATTCGAACTCATACTGGCCCTGTCGTGTAGCAAAGACACACACTCTTCGTTTCCTCTTCCTAGTACGACCCTGGAATGAGTATGTGATCATTCACAGGCTGACTTGACTGCAAGGCGGCCCGCGTTAATTTAAGTGAATTACGAGTAATAACGCCCTCCTTGGGTCCTTGTGGGGAGGTATGATAATCAGCAATCTACCGATACACAAGGTCCGAGGTCGCGTCACGAAACGTCGGCTGCCTAGGGGACCGGCATGATACGTAATACGTCTACCTGCCGGCATCGCTATGCCGGTGGTTAGTAGGGGGTCGATATTTTTGTTTCTCCTTGCGTCTCAGTCAGCGGGTTCTACCTGTTGGATATCCTATTCAGATTGTCAGAGCAGTCCTTCTATTCCAGGATCATGCTTTATTTCCATCGCTCCAGTCTTGTCCGGGCCGACGGCCCGACTATCGTGGGCTGAAGGGTCAGCTGAATTTGTGTACTTGAATTACCATGAACTGTGAAAATCTATGACTACGAGTATAACGTTTAAAAGATGAATGCTTTTCGCACACTGTACACTGCTCATAAACTAAATGCAGGCTCGTTATCAGTTTCTGTTACATCGTTTACACTTGGTACATAGTTAAAACGGTTCCCTTAGGGGGAACATATCTACTACCTATTTCACGACAGACGCAAAC
TGAGTAAACATTGGATTGAGCCTTCCTTGAGTTTCCAATAGCGAGGTACTTTATTAGAGGACGTAGGAATGCTCTTTCACGAACCACTGACGGTCGTGCAGATAGTGCTTAGATTTTTGTGCTCTGGGCCTCACGATTGTAGCGTCTAACCAGGCGCCCATTTAACTCGCAGGCCCTTTCAATAATCTACCTTTTAAGACCCGGCTAGCCAGCTAAAATTAGATACTTCGTCACTTTGTGCTCGGTAGGCGTTTGGGCATCGCGAAATGACTATACTAACTTTTTTCACTACGACTGACACGCGGATAGAGACATGCATGTAAAAAGGTTATCAAAATGATGGTCTTCGGG"
)
# Any DNA sequence could be used here.
# Seq.reverse_complement() returns the reverse complement: the sequence is
# reversed and each base is swapped for its partner (C <-> G, A <-> T).
A = my_dna.reverse_complement()
print(A)
コード例 #54
0
#!/usr/bin/env python3

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# In ORF (Stronghold) we did enumerate ORFs in a straightforward O(n^2) way
# (as there are anyway O(n) of them of total size O(n^2) in the worst case).
# Here we enumerate *maximal* ORFs (that cannot be extended to the left
# from an earlier start) in O(n) (there are O(n) of them of total size O(n)
# in the worst case).

# One-letter symbols that open and close an ORF in the translated frames.
start = 'M'
stop = '*'

# The DNA string is read from standard input.
DNA = Seq(input(), IUPAC.unambiguous_dna)
DNArc = DNA.reverse_complement()
N = len(DNA)
# Translate all three reading-frame offsets of the string and of its reverse
# complement in full; each slice is cut to a whole number of codons.
P = []
for i0 in range(3):
    P.append(DNA[i0:N - ((N - i0) % 3)].translate())
    P.append(DNArc[i0:N - ((N - i0) % 3)].translate())
# Search for the maximal ORFs (those that cannot be extended to the left).
ir, ml, mr = None, 0, 0
for rf in range(6):
    l = None
    for i in range(len(P[rf])):
        if P[rf][i] == start and l == None:
            l = i
        elif P[rf][i] == stop and l != None:
コード例 #55
0
from Bio.Seq import Seq

# Example sequence used for the complement demonstrations below.
minha_sequencia = Seq("AGTACACTGGT")
print(minha_sequencia)

# Complementary sequence
print(minha_sequencia.complement())

# Reverse-complement sequence
print(minha_sequencia.reverse_complement())

# Transcription
dna = Seq("ATGGCCATTCGCAAGGGTGCCCGATAG")
print("DNA:" + dna)

rna = dna.transcribe()
print("RNA:" + rna)

# Back-transcription of the RNA to DNA
dna2 = rna.back_transcribe()
print("DNA:" + dna2)

# Translation (of the RNA and of the DNA)
print(rna.translate())
print(dna.translate())
コード例 #56
0
# Because python numbering starts at 0:
start_pos_2 = start_pos - 1
end_pos_2 = end_pos - 1
# NOTE(review): the slice [start_pos_2:end_pos_2] stops one base before
# end_pos; if end_pos is meant to be inclusive the upper bound should be
# end_pos -- confirm with the coordinate convention of the caller.

# Need to extract the invertible region and reverse complement it:

with open("out_file.fa", "w") as f:
    for seq_record in SeqIO.parse(in_file, "fasta"):
        # writes the sequence from start to end pos
        f.write(str(seq_record.seq[start_pos_2:end_pos_2]))

# The mode used to be "w'" (stray quote), which makes open() raise
# ValueError; the region file is now also closed after reading.
with open("reverse_file.fa", "w") as f:
    with open("out_file.fa") as q:
        r = Seq(q.read(), generic_dna)
    # Reverse complement of the invertible region of interest
    inv_region = r.reverse_complement()
    f.write(str(inv_region))

# Now we should have two files - out_file.fa containing the invertible DNA region, and
# reverse_file.fa containing the reverse complement of the former

# Extract flanking regions as well:

# Clamp at 0: a negative index would count from the end of the sequence and
# yield the wrong (usually empty) flank when start_pos < 1000.
left_flank = max(start_pos - 1000, 0)
right_flank = end_pos + 1000

with open("left_flank.fa", "w") as f:
    for seq_record in SeqIO.parse(in_file, "fasta"):
        f.write(str(seq_record.seq[left_flank:start_pos_2]))

with open("right_flank.fa", "w") as f:
コード例 #57
0
ファイル: ruutils.py プロジェクト: tianyunwang/RUscripts
def _strand_kmer_means(seq, model_kmer_means, kmer_len):
    # Look up the model mean of every k-mer window of seq, left to right.
    return [
        float(model_kmer_means[str(seq[x:x + kmer_len])])
        for x in range(len(seq) + 1 - kmer_len)
    ]


def get_custom_fasta(ref_fasta, subsectionlist, args, model_kmer_means,
                     kmer_len):
    """Build scaled k-mer mean profiles for selected reference subsections.

    For every sequence id in ``subsectionlist`` the matching record of
    ``ref_fasta`` is looked up and the listed ``(start, end)`` sections are
    concatenated. The concatenated sequence and its reverse complement are
    converted to lists of ``model_kmer_means`` values (one per k-mer window
    of length ``kmer_len``) and standardised with
    ``sklearn.preprocessing.scale``.

    Returns ``(seqids, array)`` where ``seqids`` is a tuple of ids and
    ``array`` is a float32 numpy array built from the output of
    ``processItems`` for each id.
    """
    if (args.verbose is True):
        print("Generating a custom fasta")
    section_pieces = dict()
    record_lengths = dict()
    for sequence in subsectionlist:
        if (args.verbose is True):
            print(sequence)
        for record in SeqIO.parse(ref_fasta, 'fasta'):
            if record.id != sequence:
                continue
            # Remember the record length now; `record` is rebound on every
            # iteration and must not be consulted after this loop.
            record_lengths[sequence] = len(record.seq)
            pieces = section_pieces.setdefault(sequence, [])
            for sections in subsectionlist[sequence]:
                pieces.append(
                    str(record.seq[sections[0] - 1:sections[1] - 1]))
    sequencedict = dict(
        (name, "".join(pieces)) for name, pieces in section_pieces.items())

    if (args.verbose is True):
        print("processing the custom fasta")
    kmer_means = dict()
    for sequence in sequencedict:
        # Results and diagnostics are keyed on `sequence`. Previously they
        # used the loop-leftover `record`, so every sequence overwrote the
        # entry of whichever record was parsed last.
        print("ID %s" % sequence)
        print("length %s" % record_lengths[sequence])
        print("FORWARD STRAND")
        seq = Seq(sequencedict[sequence], generic_dna)
        forward_means = _strand_kmer_means(seq, model_kmer_means, kmer_len)
        print("REVERSE STRAND")
        seq = seq.reverse_complement()
        reverse_means = _strand_kmer_means(seq, model_kmer_means, kmer_len)
        tmp2 = dict()
        tmp2["Fprime"] = sklearn.preprocessing.scale(forward_means,
                                                     axis=0,
                                                     with_mean=True,
                                                     with_std=True,
                                                     copy=True)
        tmp2["Rprime"] = sklearn.preprocessing.scale(reverse_means,
                                                     axis=0,
                                                     with_mean=True,
                                                     with_std=True,
                                                     copy=True)
        kmer_means[sequence] = tmp2

    # From this dictionary we return a pair consisting of a list of keys
    # (lookup for sequence name) and a 3D array, each slice of which relates
    # to the seqid, forward and reverse, and then the values.
    # Caution - the dictionary returns in the wrong order.
    items = kmer_means.items()
    items_ = map(processItems, items)
    seqids, arrays = zip(*items_)
    z = len(seqids)
    print(arrays)
    r, c = list(arrays)[0].shape
    threedarray = multiprocessing.Array(ctypes.c_double, z * r * c)
    threedarrayshared_array = np.ctypeslib.as_array(threedarray.get_obj())
    a = np.array(arrays, dtype=np.float32)
    # NOTE(review): this rebinds the name to the ordinary float32 array, so
    # the shared-memory buffer allocated above is never filled or returned.
    # Left as-is because callers receive `a`; confirm whether a shared array
    # was intended.
    threedarrayshared_array = a
    return seqids, threedarrayshared_array
コード例 #58
0
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from snakemake import shell

if snakemake.params['amplicon_type'] == "ITS":
    # ITS branch: trim R1 with cutadapt, anchoring the forward primer as a 5'
    # adapter (-g) and the reverse-complemented reverse primer as a 3' adapter
    # (-a). Reads in which no adapter is found are dropped (--discard-untrimmed).
    print("ITS Trimming")
    # NOTE(review): forward_primer_compl is not referenced by the command in
    # this branch; it is kept in case code outside this view relies on it.
    forward_primer_compl = Seq.reverse_complement(
        Seq(snakemake.params['forward_primer'], IUPAC.ambiguous_dna))
    # The command below interpolates {reverse_primer_compl}, so the name must
    # exist before shell() formats the string. It is derived from the
    # 'reverse_primer' param exactly as the 16S branch does.
    reverse_primer_compl = Seq.reverse_complement(
        Seq(snakemake.params['reverse_primer'], IUPAC.ambiguous_dna))
    shell("""cutadapt \
    --cores {snakemake.threads} \
    --error-rate 0.1 \
    --times 2 \
    --overlap 3 \
    -o {snakemake.output[R1_trimmed_reads]} \
    -g '{snakemake.params[forward_primer]}' \
    -a '{reverse_primer_compl}' \
    --match-read-wildcards \
    --discard-untrimmed \
    {snakemake.input[R1_raw_reads]} >> {snakemake.log[0]}""")

elif snakemake.params['amplicon_type'] == "16S":
    print("16S Trimming")
    reverse_primer_compl = Seq.reverse_complement(
        Seq(snakemake.params['reverse_primer'], IUPAC.ambiguous_dna))
    shell("""cutadapt \
    --cores {snakemake.threads} \
    --error-rate 0.1 \
    --times 1 \
    --overlap 3 \
    -o {snakemake.output[R1_trimmed_reads]} \
    -g '{snakemake.params[forward_primer]}' \
コード例 #59
0
def bam_seqlogo(file, outfilename, minlength=0, maxlength=1000, unique=False):
    """Draw sense and antisense sequence logos for the reads of a BAM file.

    Every alignment is parsed from its tab-separated string form. A read is
    kept when its CIGAR field ends in ``M`` (the original author's way of
    dropping unmapped reads) and its length lies within
    ``minlength``..``maxlength`` inclusive. Read names are expected to look
    like ``<name>-<count>``. Reads whose flag field is exactly ``16`` are
    reverse-complemented and collected as antisense; every other kept read is
    collected as sense. ``T`` is replaced by ``U`` before plotting.

    Within each strand, reads are right-padded with ``N`` to the longest read
    so that weblogo receives equal-length sequences, then weblogo is run to
    write ``<outfilename>_sense.pdf`` and ``<outfilename>_antisense.pdf``.
    A strand without any sequence to plot is skipped instead of failing.

    Args:
        file: Path of the BAM file to read.
        outfilename: Prefix of the two PDF files to create.
        minlength: Shortest read length to keep (inclusive).
        maxlength: Longest read length to keep (inclusive).
        unique: If true, each read is written once; otherwise it is written
            ``<count>`` times, with ``_<i>`` appended to its name.

    Returns:
        The string ``'SeqLogos constructed'``.

    Raises:
        IndexError: If a read name has no ``-<count>`` suffix or an alignment
            line has fewer fields than expected.
        ValueError: If the ``<count>`` suffix of a read name is not an integer.
    """
    posnames, posseqs, poscounts = [], [], []
    negnames, negseqs, negcounts = [], [], []
    mappedtotal = 0
    # The context manager closes the BAM handle even if parsing fails midway.
    with pysam.AlignmentFile(file, "rb") as bamfile:
        for read in bamfile:
            fields = str(read).split("\t")
            if not fields[5].endswith('M'):
                continue
            seq = fields[9].replace('T', 'U')
            name_parts = fields[0].split('-')
            count = int(name_parts[1])
            name = name_parts[0]
            if not minlength <= len(seq) <= maxlength:
                continue
            mappedtotal += 1
            if fields[1] == '16':
                # Antisense read: store its reverse complement as plain text.
                negnames.append(name)
                negseqs.append(str(Seq(seq).reverse_complement()))
                negcounts.append(count)
            else:
                posnames.append(name)
                posseqs.append(seq)
                poscounts.append(count)

    postotal = len(posseqs)
    negtotal = len(negseqs)
    print('Total mapped reads = ' + str(mappedtotal) + '\nSense reads = ' +
          str(postotal) + '\nAntisense reads = ' + str(negtotal) +
          '\nMissing mapped reads = ' + str(mappedtotal -
                                            (postotal + negtotal)))

    strands = (
        ('sense', posnames, posseqs, poscounts),
        ('antisense', negnames, negseqs, negcounts),
    )
    for label, names, seqs, counts in strands:
        pdf_path = outfilename + '_' + label + '.pdf'
        fasta_text = _seqlogo_fasta(names, seqs, counts, unique)
        if not fasta_text:
            # Nothing to draw for this strand; weblogo cannot use empty input.
            print('No ' + label + ' reads to plot; skipping ' + pdf_path)
            continue
        _seqlogo_render(fasta_text, pdf_path)
    return ('SeqLogos constructed')


def _seqlogo_fasta(names, seqs, counts, unique):
    """Return FASTA text for one strand, N-padding each read to the longest."""
    # default=0 keeps an empty strand from raising, unlike indexing a sorted list.
    width = max(map(len, seqs), default=0)
    records = []
    for name, seq, count in zip(names, seqs, counts):
        padded = seq.ljust(width, 'N')
        if unique:
            records.append('>' + name + '\n' + padded + '\n')
        else:
            records.extend('>' + name + '_' + str(copy) + '\n' + padded + '\n'
                           for copy in range(count))
    return ''.join(records)


def _seqlogo_render(fasta_text, pdf_path):
    """Write fasta_text to a temporary file and run weblogo to create pdf_path."""
    import os
    import tempfile

    # A private temporary file avoids clobbering (and later deleting) a
    # user's own pos.fas/neg.fas in the working directory.
    handle = tempfile.NamedTemporaryFile(mode='wt', suffix='.fas', delete=False)
    try:
        with handle:
            handle.write(fasta_text)
        # Argument list instead of a shell string: output prefixes containing
        # spaces or shell metacharacters are passed through untouched.
        subprocess.run([
            'weblogo', '-f', handle.name, '-D', 'fasta', '-o', pdf_path,
            '-F', 'pdf', '-A', 'RNA', '-a', 'ACGU', '-c', 'classic',
            '--yaxis', '1', '--errorbars', 'NO',
        ])
    finally:
        # Always clean up, including when writing or weblogo raised.
        os.remove(handle.name)
コード例 #60
0
ファイル: primer.py プロジェクト: jasminks/biopython
print ">forward", name , #frwd_raw_res                                    #prints the fasta tag for forwward primer with name of sequance and the cuting site
print frwd                                                            #prints the forward primer (eventully with the primer)
frwd_G = frwd.count("G")                                              #counts G's
#print "G: ", frwd_G                                                  #prints number of G's
frwd_C = frwd.count("C")                                              #counts the number of C's
#print "C: ", frwd_C                                                  #prints number of C's
frwd_A = frwd.count("A")                                              #counts number of A's
frwd_T = frwd.count("T")                                              #counts the number of T's
frwdG_C = 100 * float(frwd_G + frwd_C) / len(frwd)                    # GC content (G/C)/len*100
print "GC content", frwdG_C, "%"                                      #prints the gc contents in a precent
frwd_MT = 64.9 + 41 *float( (frwd_G + frwd_C - 16.4) / (frwd_A + frwd_T + frwd_G + frwd_C))   #http://www.biophp.org/minitools/melting_temperature/demo.php?formula=basic
                                                                      #the melting temp equation
print  "melting temp", frwd_MT                                        #prints the melting temp

revs = Seq(sequance_sites[-20:])                                       #sequace for the reverse primer
revs_revs_comp = revs.reverse_complement()                            #reverse complement of the last 20 characters
                                                                       #documentation same as forward

print  ">reverse", name 
print revs_revs_comp

revs_G = revs_revs_comp.count("G") 
revs_C = revs_revs_comp.count("C")
revs_A = revs_revs_comp.count("A")
revs_T = revs_revs_comp.count("T")
revsG_C = 100 * float (revs_G + revs_C ) / len(revs_revs_comp)
print "GC content: ",revsG_C, "%"
revs_MT =  64.9 + 41 *float( (revs_G + revs_C - 16.4) / (revs_A + revs_T + revs_G + revs_C))    #http://www.biophp.org/minitools/melting_temperature/demo.php?formula=basic
print "melting temp",  revs_MT