コード例 #1
0
ファイル: box_mutation.py プロジェクト: rcoor/phaistos_proj
def make_mutation(csv_file,save_folder,tru):
    """Write the wild-type and every point-mutant sequence listed in a CSV file.

    Only the first ``;``-separated column of each row is read. The first row
    holds the wild-type protein sequence (lower-cased before use); every
    following row holds one mutation written as ``<letters><digits><letters>``
    (e.g. ``A12G``), where the digits are the 1-based position to replace and
    the trailing letters are written into the sequence as given.

    Files created inside ``save_folder`` (created if missing):

    * ``aaawildtype`` - the wild-type sequence,
    * one file per mutation, named after the mutation string,
    * ``sum_all_mutations.csv`` - a ``Mutation,Sequence`` table of all of them.

    Every sequence is written with its first ``int(tru)`` residues cut off.

    Rows that do not look like a mutation are skipped. (Previously the
    indentation placed the mutation code outside the ``if match:`` test, so
    such a row silently reused the previous row's substitution.)

    :param csv_file: path of the ``;``-delimited input file
    :param save_folder: output directory
    :param tru: number of leading residues to drop (anything ``int()`` accepts)
    :raises IndexError: if the file has no rows, or a mutation position lies
        outside the wild-type sequence
    """
    import csv
    import os
    import re

    offset = int(tru)

    # Blank rows carry no first column, so they are ignored.
    with open(csv_file, newline='') as handle:
        entries = [row[0] for row in csv.reader(handle, delimiter=";") if row]
    if not entries:
        raise IndexError("no wild-type sequence found in %r" % (csv_file,))

    prot_seq = entries[0].lower()
    mutations = entries[1:]

    os.makedirs(save_folder, exist_ok=True)

    wildtype = prot_seq[offset:]
    table_of_mutations = [("aaawildtype", wildtype)]
    mutation_pattern = re.compile(r"([a-z]+)([0-9]+)([a-z]+)", re.I)

    for mutation in mutations:
        match = mutation_pattern.match(mutation)
        if match is None:
            continue

        position = int(match.group(2))
        # Position 0 would otherwise wrap around to the last residue.
        if not 1 <= position <= len(prot_seq):
            raise IndexError(
                "mutation %r is outside the %d-residue sequence"
                % (mutation, len(prot_seq)))

        residues = list(prot_seq)
        residues[position - 1] = match.group(3)
        mutated = "".join(residues)[offset:]
        table_of_mutations.append((mutation, mutated))

        # One file per mutant, used as Scwrl input.
        with open(os.path.join(save_folder, mutation), 'w') as temp_file:
            temp_file.write(mutated)

    with open(os.path.join(save_folder, "aaawildtype"), 'w') as temp_file:
        temp_file.write(wildtype)

    # newline='' lets the csv module control line endings itself.
    summary_path = os.path.join(save_folder, 'sum_all_mutations.csv')
    with open(summary_path, 'w', newline='') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['Mutation','Sequence'])
        csv_out.writerows(table_of_mutations)
コード例 #2
0
 def translate_using_sequence(self, contig):
     """
     Translate the SNPs at this position and give the proportion.

     For every entry returned by ``self.group_environment()`` the nucleotide
     ``entry[2]`` is substituted into the gene covering ``self.position``,
     the gene is translated with codon table 11, and one tab-separated line
     (contig, position, "-", amino acid, count / number of reads) is printed.

     :param contig: A Contig object with associated Gene(s)
     :return:A line to stdout
     """
     import sys

     if type(contig).__name__ != "Contig":
         Error.error("Genomic_environment: translate_using_sequence: The object passed is not of type Contig")
     #For each gene in Contig... we search for the one that contain the position
     #(the last matching gene wins). None stands in for "no gene found";
     #the previous Gene.__init__() call had no instance to initialise.
     gene = None
     for orf in contig.genes:
         if orf.start <= self.position <= orf.end:
             gene = orf
     if gene is None or gene.sequence == "":
         Error.error("Genomic_environment: translate_using_sequence: There is no sequence in the selected gene")
         #Error.error is presumably fatal; this return only guards the case where it is not.
         return
     orf_seq = Seq(gene.sequence, IUPAC.unambiguous_dna).tomutable()
     nucleotides_sequences = self.group_environment()
     total = len(self.reads)
     header = "Contig\tSNP position(contig)\tFrame\tAmino acid\tProportion\n"
     #We put - in the frame number column.
     #The header already ends with a newline, so write it without adding one.
     sys.stdout.write(header)
     #0-based offset of the SNP inside the gene and of its codon in the protein.
     snp_offset = self.position - gene.start
     codon_index = snp_offset // 3
     for entry in nucleotides_sequences:
         mutation = entry[2]
         mutated_seq = copy.copy(orf_seq)
         mutated_seq[snp_offset] = mutation
         #Translate a read-only copy: not every Biopython release gives MutableSeq a translate().
         mutated_prot_seq = str(mutated_seq.toseq().translate(table=11))
         changed_amino_acid = mutated_prot_seq[codon_index]
         #self.position is numeric (it is compared and subtracted above), so it needs str().
         line = "\t".join([self.contig, str(self.position), "-", changed_amino_acid,
                           str(float(nucleotides_sequences[entry])/total)])
         print(line)
コード例 #3
0
def evolution_of_a_genome(gen_size, number_of_generations, step):
    """Mutate a generated genome repeatedly and plot its alignment score over time.

    A genome of ``gen_size`` is produced by ``genome_generation_v3``; each
    generation passes the current sequence through ``q3_muta_ponct`` and, every
    ``step`` generations, the result is scored against the starting genome with
    ``alignment_gen1``. The scores are plotted next to the line
    ``gen_size - generation``, sampled at the same generations.
    """
    started = time.time()
    reference = Seq(genome_generation_v3(gen_size))
    print("J'ai fait le genome en %.4f secondes." % (time.time() - started))

    # Theoretical curve: one identical base fewer per generation, every step-th one kept.
    all_expected = [int(gen_size - generation)
                    for generation in range(number_of_generations)]
    expected = all_expected[0:number_of_generations:step]
    print("J'ai fait y")

    current = reference.tomutable()
    scores = []
    for generation in range(number_of_generations):
        current = q3_muta_ponct(current)
        if generation % step == 0:
            scores.append(int(alignment_gen1(reference, current)))
    print("J'ai fait les alignements")

    # Show how both series evolve.
    curves = (
        (scores, "score de pairwise2xx"),
        (expected, "Nombre de bases identiques theorique"),
    )
    for series, legend in curves:
        plt.plot(np.arange(0, number_of_generations, step),
                 series,
                 label=legend)
    plt.xlabel("Nombre de mutations")
    plt.ylabel("Bases identiques")
    plt.legend()
    plt.show()
コード例 #4
0
def test_padding_truncating():
    """A 6-base sequence normalised to ``segment_size=12`` gets three N on each side."""
    from Bio.Seq import Seq

    mutable_input = Seq('ACTCGA').tomutable()
    normalised = padding_truncating(mutable_input, segment_size=12)

    expected = 'NNNACTCGANNN'
    assert str(normalised) == expected
コード例 #5
0
def RandomProbe(probe, MutateNum):
    """Return a mutable copy of the fixed probe with selected bases replaced.

    The positions come from ``RandomPosition(probe, MutateNum)`` and are also
    stored in the module-level ``Pos``. At each of them the current base is
    replaced by ``RandomBase(current_base)``. Note that the probe sequence
    itself is hard-coded; ``probe`` is only forwarded to ``RandomPosition``.
    """
    global Pos

    # Fixed probe sequence, made editable so single bases can be swapped.
    mutable_probe = Seq("AGGCCACAACCTCCAAGTAG").tomutable()

    Pos = RandomPosition(probe, MutateNum)
    for slot in range(len(Pos)):
        site = Pos[slot]
        current_base = mutable_probe[site]
        mutable_probe[site] = RandomBase(current_base)
    return mutable_probe
コード例 #6
0
ファイル: aimap_tools.py プロジェクト: bioone/aimap
def _changed_residues(old_pro, new_pro):
    """Yield ``<old residue><1-based index><new residue>`` for each position that differs."""
    for n in range(len(old_pro)):
        if old_pro[n] != new_pro[n]:
            yield old_pro[n] + str(n + 1) + new_pro[n]


def get_founction_Prokaryote(infile,outdir,outname,genomefile,anno_file):
    """Annotate every site of a tab-separated table with the gene it falls in.

    Each line of ``infile`` supplies accession, position, old base and new base
    in columns 0-3; columns 5 and 4 are copied to the output as "Coverage" and
    "Edit_level". For every gene feature of ``anno_file`` on the same accession
    whose span contains the position:

    * ``protein_coding`` genes: the new base is substituted into the gene
      sequence read from ``genomefile``, both versions are translated (after
      reverse-complementing for ``-`` strand genes) and one row is written per
      amino acid that differs;
    * any other biotype: one row with the biotype and gene name.

    A site covered by no gene on its own accession gets an "Intergenic region"
    row. Results go to ``<outdir>/<outname>_result.txt``.

    Differences from the previous version:

    * the result file is now closed (``result_file.close`` was never called);
    * line endings are stripped, so the last column no longer carries a
      newline into the middle of an output row, and blank lines are skipped;
    * the intergenic test also compares the accession. Before, a site lying
      inside the coordinates of a gene on a *different* accession produced no
      output row at all.
    """
    header = ["Accession", "Position", "Old_base", "New_base", "Coverage",
              "Edit_level", "Gene_biotype", "Gene_name", "Product",
              "Amino acid_change"]
    with open("%s/%s_result.txt"%(outdir,outname),"w") as result_file:
        result_file.write("\t".join(header)+"\n")
        db = gffutils.create_db("%s"% anno_file, ':memory:', force=True, keep_order=True,merge_strategy='merge', id_spec="ID",sort_attribute_values=True)
        # The gene list does not change between lines, so query it once.
        genes = list(db.all_features(featuretype='gene'))
        with open('%s'% infile, 'r') as pileup:
            for line in pileup:
                line = line.rstrip("\r\n")
                if not line:
                    continue
                linsplit = line.split("\t")
                accession = linsplit[0]
                position = int(linsplit[1])
                oldbase = linsplit[2]
                newbase = linsplit[3]
                # Leading columns shared by every row written for this site.
                site = [accession, str(position), oldbase, newbase, linsplit[5], linsplit[4]]
                in_gene = False
                for gene in genes:
                    if not (gene.start <= position <= gene.end) or gene.seqid != accession:
                        continue
                    in_gene = True
                    biotype = gene.attributes['gene_biotype'][0]
                    if biotype != "protein_coding":
                        result_file.write("\t".join(site+[biotype, gene.attributes['Name'][0], "-", "-"])+"\n")
                        continue
                    change_loc = position-gene.start
                    old_seq = Seq(gene.sequence("%s"% genomefile, use_strand=False),IUPAC.unambiguous_dna)
                    new_seq = old_seq.tomutable()
                    new_seq[change_loc] = newbase
                    new_seq = new_seq.toseq()
                    CDS_id = [h.id for h in db.children(gene.id,featuretype="CDS")][0]
                    product = db[CDS_id].attributes['product'][0]
                    if gene.strand not in ("+", "-"):
                        continue
                    if gene.strand == "-":
                        # The sequence was fetched in genome orientation (use_strand=False).
                        old_seq = old_seq.reverse_complement()
                        new_seq = new_seq.reverse_complement()
                    for change in _changed_residues(old_seq.translate(), new_seq.translate()):
                        result_file.write("\t".join(site+["CDS", gene.attributes['Name'][0], str(product), change])+"\n")
                if not in_gene:
                    result_file.write("\t".join(site+["Intergenic region", "-", "-", "-"])+"\n")
コード例 #7
0
def MutableGeneSeq():
    """Show the two ways of obtaining an editable sequence and print each after one edit."""
    bases = "GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA"
    label = 'mutable_seq = '

    # A plain Seq refuses item assignment ('Seq' object does not support
    # item assignment), so the edit is made on a mutable copy.
    converted = Seq(bases, IUPAC.unambiguous_dna).tomutable()
    converted[1] = 'T'
    print(label, converted)

    # Alternatively build the mutable sequence directly.
    direct = MutableSeq(bases,
                        IUPAC.unambiguous_dna)
    direct[0] = 'T'
    print(label, direct)

    # Convert back to a read-only Seq.
    new_seq = direct.toseq()
コード例 #8
0
def test_seq():
    """Print a tour of basic ``Seq`` operations on one fixed DNA sequence.

    Covers the alphabet, length, G/C counts and GC ratio, reverse / complement /
    reverse complement, the ``ispalindromic`` helper, finding every (possibly
    overlapping) ``AAA``, and editing a base through a mutable copy.
    """
    print(f'\n\n{Bio.__version__}')

    myseq = Seq("GATCGAAATGGGCCTAAAAATATAGGATCGAAAATCGC",
                IUPAC.unambiguous_dna)

    print(myseq.alphabet)
    print(myseq)
    print(len(myseq))

    # You can calculate the GC ratio yourself or use a dedicated function
    print(f'The number of G\'s is: {myseq.count("G")}')
    print(f'The number of C\'s is: {myseq.count("C")}')
    print(100 * (myseq.count("G") + myseq.count("C")) / len(myseq))
    print(f'The ratio GC in the string is: {GC(myseq)}')

    # Sequences can be inverted and complemented
    print(f'The string is:\t\t\t\t {myseq}')
    print(f'The reverse is:\t\t\t\t {myseq[::-1]}')
    print(f'The complement is:\t\t\t {myseq.complement()}')
    print(f'The reverse complement is:\t {myseq.reverse_complement()}')

    # A simple function to determine if a string is palindromic.
    # The second label used to say GAAG although GTAG is what gets tested.
    print(f'GAAG is palindrome: {ispalindromic("GAAG")}')
    print(f'GTAG is palindrome: {ispalindromic("GTAG")}')

    # Find every occurrence of AAA: start at the first hit, then keep
    # searching one position further so overlapping hits are found too
    positions = []
    pos = myseq.find('AAA')
    while pos != -1:
        positions.append(pos)
        pos = myseq.find('AAA', pos + 1)
    print(positions)

    # A demo to show that Seq is not mutable: item assignment raises TypeError.
    # If you need to change a Seq, make it mutable first
    try:
        myseq[0] = 'C'
    except TypeError:
        print("Oops! As Seq is immutable ")

    mutable_copy = myseq.tomutable()
    mutable_copy[0] = 'C'
    print(mutable_copy)
コード例 #9
0
def createMutantAllChromsInOneFastaFile(mutant):
    """
    Build one SeqRecord holding the concatenated chromosomes of a mutant.

    For each chromosome "0A" .. "0H" the reference EMBL file is read, every
    SNP of the mutant located on that chromosome is written into the sequence
    (``snp.pos`` is 1-based) and the result is appended to the output sequence.

    The record id is ``mutant.nom``. The description keeps its historical
    wording, which names only the last chromosome label of the loop.

    Single-argument ``print(...)`` is used so the function parses under both
    Python 2 and Python 3.
    """
    snps=createSNPsListFromMutant(mutant)
    mutname=mutant.nom
    # A former "seqemblout.tomutable()" call here discarded its result and so
    # had no effect; the accumulator is rebuilt by "+" on every chromosome.
    seqemblout= Seq("", IUPAC.unambiguous_dna)
    for chrom in ["0A","0B","0C","0D","0E","0F","0G","0H"]:
        emblFile=chooseRightEMBLFile(mutant.strain.emblrep,mutant.strain.specy,chrom)
        record=SeqIO.read(emblFile,"embl")
        seqembl=record.seq.tomutable()
        for snp in snps:
            if chrom==snp.chrom:
                seqembl[snp.pos-1]=snp.seqmut
        print(chrom)
        seqemblout=seqemblout+seqembl
    rec = SeqRecord(seqemblout,
                 id=mutname,
                 description= "wholegenome %s of mutant %s from strain %s"%(chrom,mutname,mutant.strain.specy))
    print("fini!")
    return rec
コード例 #10
0
 def translate_using_sequence(self, contig):
     """
     Translate the SNPs at this position and give the proportion.

     For every entry returned by ``self.group_environment()`` the nucleotide
     ``entry[2]`` is substituted into the gene covering ``self.position``,
     the gene is translated with codon table 11, and one tab-separated line
     (contig, position, "-", amino acid, count / number of reads) is printed.

     :param contig: A Contig object with associated Gene(s)
     :return:A line to stdout
     """
     import sys

     if type(contig).__name__ != "Contig":
         Error.error(
             "Genomic_environment: translate_using_sequence: The object passed is not of type Contig"
         )
     #For each gene in Contig... we search for the one that contain the position
     #(the last matching gene wins). None stands in for "no gene found";
     #the previous Gene.__init__() call had no instance to initialise.
     gene = None
     for orf in contig.genes:
         if orf.start <= self.position <= orf.end:
             gene = orf
     if gene is None or gene.sequence == "":
         Error.error(
             "Genomic_environment: translate_using_sequence: There is no sequence in the selected gene"
         )
         #Error.error is presumably fatal; this return only guards the case where it is not.
         return
     orf_seq = Seq(gene.sequence, IUPAC.unambiguous_dna).tomutable()
     nucleotides_sequences = self.group_environment()
     total = len(self.reads)
     header = "Contig\tSNP position(contig)\tFrame\tAmino acid\tProportion\n"
     #We put - in the frame number column.
     #The header already ends with a newline, so write it without adding one.
     sys.stdout.write(header)
     #0-based offset of the SNP inside the gene and of its codon in the protein.
     snp_offset = self.position - gene.start
     codon_index = snp_offset // 3
     for entry in nucleotides_sequences:
         mutation = entry[2]
         mutated_seq = copy.copy(orf_seq)
         mutated_seq[snp_offset] = mutation
         #Translate a read-only copy: not every Biopython release gives MutableSeq a translate().
         mutated_prot_seq = str(mutated_seq.toseq().translate(table=11))
         changed_amino_acid = mutated_prot_seq[codon_index]
         #self.position is numeric (it is compared and subtracted above), so it needs str().
         line = "\t".join([
             self.contig,
             str(self.position),
             "-",
             changed_amino_acid,
             str(float(nucleotides_sequences[entry]) / total),
         ])
         print(line)
コード例 #11
0

# Direct translation (DNA -> Protein)
from Bio.Seq import Seq, translate
from Bio.Alphabet import IUPAC
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
translate(coding_dna)

# we can specify other translation tables by name
translate(coding_dna, table="Vertebrate Mitochondrial")
# or by NCBI number
translate(coding_dna, table=2)

# 3.9 Transcription and Translation

# 3.10 Mutable Seqs
# convert an existing sequence to a mutable one
# (this line used to read ``my_seq``, a name this script never defines)
mutable_seq = coding_dna.tomutable()
# or directly create a mutable one
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)

# now we can do
mutable_seq[5] = "T"

# and convert it back to an immutable seq
new_seq = mutable_seq.toseq()


コード例 #12
0
	def _constructSNPAnnotation(self, session, snp_info, snps_context_wrapper, gene_annotation, snp_annotation_short_name2id):
		"""
		Classify every SNP of snp_info against the gene models and add the results to the session.

		For each SNP the genes returned by snps_context_wrapper.returnGeneLs(chr, pos) are looked up in
		gene_annotation.gene_id2model. For a gene commentary that has protein boxes, box_ls is walked until
		the box containing the SNP is found and the SNP is recorded as one of:
			- 5UTR / 3UTR (untranslated exon),
			- intron, plus splice-donor / splice-acceptor when within two nucleotides of an intron end,
			- synonymous / non-synonymous (both alleles are translated with the standard table), plus
			  premature-stop-codon and init-Met where they apply,
			- intergenic when the SNP is outside every box or beyond the CDS.
		A commentary without protein boxes is recorded under the gene type ('pseudo') or the commentary type.
		A SNP with no gene context at all is recorded as intergenic.

		Every record becomes a Stock_250kDB.SNPAnnotation; unknown annotation type names are inserted as
		Stock_250kDB.SNPAnnotationType and cached in snp_annotation_short_name2id (which is modified in place).

		Changes in this revision:
			- the loop that counts the remaining introns on the '-1' strand now reads box_ls[j]; it used
			  to re-read box_ls[i] (the SNP's own box) on every pass.
			- SNP_index_in_CDS//3 replaces /3, identical under Python 2 integer division.

		2009-2-5
			bug fixed. when adding a box as UTR, make sure after the forloop above, the current box is still the UTR.
		2009-1-5
		"""
		sys.stderr.write("Constructing SNPAnnotation ...\n")
		standard_translator = Translate.unambiguous_dna_by_id[1]
		counter = 0
		real_counter = 0
		for i in range(len(snp_info.chr_pos_ls)):
			counter += 1
			snp_db_entry = snp_info.data_ls[i]
			snps_id=snp_db_entry.id
			chr=snp_db_entry.chromosome
			pos = snp_db_entry.position
			allele1 = snp_db_entry.allele1
			allele2 = snp_db_entry.allele2
			
			snps_context_matrix = snps_context_wrapper.returnGeneLs(chr, pos)
			snp_annotation_type_short_name_ls = []	#each element is (snp_annotation_type_short_name, gene_id, gene_commentary_id, 
					# which_exon_or_intron, pos_within_codon)
			if snps_context_matrix:
				for snps_context in snps_context_matrix:
					snps_id, disp_pos, gene_id = snps_context
					gene_model = gene_annotation.gene_id2model.get(gene_id)
					if gene_model is None:
						continue
					for gene_commentary in gene_model.gene_commentaries:
						if gene_commentary.protein_box_ls:
							which_intron = -1	#which intron the SNP resides, starting from 1
							which_coding_exon = -1	#which exon the SNP resides in terms of the CDS sequence, starting from 1
							accum_intron_len = 0	#the cumulative length of all introns from the one \
								# after the first coding exon up till the SNP's position (including the intron the SNP is in if it's under the rule).
							exon_5_end_pos = -1	#5' end position of the whole CDS sequence. bad name 'exon'
							UTR_2nd = 0	#whether this is the 2nd UTR
							box_type = None
							UTR_type = None
							is_SNP_within_box = 0	#protect against the possibility that SNP overshoots box_ls
							#note: this inner loop reuses the name i of the outer SNP loop; the outer loop only reads i before this point.
							for i in range(len(gene_commentary.box_ls)):
								box = gene_commentary.box_ls[i]
								start, stop, box_type, is_translated, protein_box_index = box
								if box_type=='exon' and is_translated==0:
									if UTR_2nd==0:
										if gene_model.strand=='-1':
											UTR_type='3UTR'
										else:
											UTR_type='5UTR'
										UTR_2nd += 1
									else:
										if gene_model.strand=='-1':
											UTR_type='5UTR'
										else:
											UTR_type='3UTR'
								elif box_type=='exon' and is_translated==1:
									if exon_5_end_pos==-1:
										exon_5_end_pos=start
									which_coding_exon += 1
								elif box_type=='intron':
									if which_coding_exon!=-1:	#exclude introns that stand before the 1st coding exon
										accum_intron_len += abs(stop-start)+1	#+1 because stop-start is intron_length-1
									which_intron += 1
								
								if pos>=start and pos<=stop:	#with this box
									is_SNP_within_box = 1
									break
							
							if gene_model.strand=='-1':
								#continue to read the box_ls to count no_of_introns
								no_of_introns = which_intron+1
								for j in range(i+1, len(gene_commentary.box_ls)):
									box = gene_commentary.box_ls[j]	#the boxes after the SNP's box, one by one
									if box[2]=='intron':
										no_of_introns += 1
							
							if box_type!=None and is_SNP_within_box==1:	#box_type is the type of the box where the SNP resides
								if UTR_type!=None and box_type=='exon' and is_translated==0:	#it's UTR. bug fixed. make sure after the forloop above, 
										# the current box is still the UTR.
									snp_annotation_type_short_name_ls.append((UTR_type, gene_id, gene_commentary.gene_commentary_id))
								else:
									if gene_model.strand=='-1':	#reverse the order of exon/intron
										no_of_coding_exons = len(gene_commentary.protein_box_ls)
										#no_of_introns = len(gene_commentary.mrna_box_ls)-no_of_coding_exons	#not right
										which_coding_exon = no_of_coding_exons-which_coding_exon
										which_intron = no_of_introns-which_intron
									else:
										which_coding_exon += 1
										which_intron += 1
									if box_type=='intron':
										snp_annotation_type_short_name_ls.append(('intron', gene_id, gene_commentary.gene_commentary_id, which_intron))
										if pos-start<=1:	#within the donor/acceptor two-nucleotide
											if gene_model.strand=='-1':
												snp_annotation_type_short_name = 'splice-acceptor'	#on the 3' of this intron
											else:
												snp_annotation_type_short_name = 'splice-donor'	#on the 5' of this intron
											snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, \
																					gene_commentary.gene_commentary_id, which_intron))
										elif stop-pos<=1:
											if gene_model.strand=='-1':
												snp_annotation_type_short_name = 'splice-donor'	#on the 5' of this intron
											else:
												snp_annotation_type_short_name = 'splice-acceptor'	#on the 3' of this intron
											snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, \
																					gene_commentary.gene_commentary_id, which_intron))
									elif box_type=='exon':	#must be translated
										try:
											SNP_index_in_CDS = pos - exon_5_end_pos - accum_intron_len
											if gene_model.strand=='-1':	#reverse
												SNP_index_in_CDS = len(gene_commentary.cds_sequence)-SNP_index_in_CDS-1	#-1 because SNP_index_in_CDS starts from 0
												gene_allele1 = nt2complement[allele1]
												gene_allele2 = nt2complement[allele2]
											else:
												gene_allele1 = allele1
												gene_allele2 = allele2
											SNP_index_in_CDS = int(SNP_index_in_CDS)	
											# SNP_index_in_CDS is type long. without int(), cds_seq[SNP_index_in_CDS] returns a Bio.Seq with one nucleotide, 
											# rather than a single-char string
											SNP_index_in_peptide = SNP_index_in_CDS//3	#floor division: three nucleotides per codon
											
											SNP_index_in_peptide = int(SNP_index_in_peptide)	#ditto as int(SNP_index_in_CDS), not necessary
											
											pos_within_codon = SNP_index_in_CDS%3+1	#pos_within_codon starts from 1
											cds_seq = Seq(gene_commentary.cds_sequence, IUPAC.unambiguous_dna)
											if SNP_index_in_CDS>=len(cds_seq):
												sys.stderr.write("Warning: SNP (%s, %s), SNP_index_in_CDS=%s, is beyond any of the boxes from gene %s (chr=%s, %s-%s), \
														gene_commentary_id %s (%s-%s), box_ls=%s, cds-length=%s. counted as intergenic.\n"%\
														(chr, pos, SNP_index_in_CDS, gene_id, gene_model.chromosome, gene_model.start, gene_model.stop, \
														gene_commentary.gene_commentary_id, gene_commentary.start, gene_commentary.stop, repr(gene_commentary.box_ls), \
														len(cds_seq)))
												snp_annotation_type_short_name_ls.append(['intergenic'])
												continue	#moves on to the next gene_commentary
											if cds_seq[SNP_index_in_CDS]!=gene_allele1 and cds_seq[SNP_index_in_CDS]!=gene_allele2:
												sys.stderr.write("Error: Neither allele (%s, %s) from SNP (%s,%s) matches the nucleotide, %s, from the cds seq of gene %s \
													(gene_commentary_id=%s).\n"%\
													(gene_allele1, gene_allele2, chr, pos, cds_seq[SNP_index_in_CDS], gene_id, gene_commentary.gene_commentary_id))
												sys.exit(3)
											cds_mut_ar = cds_seq.tomutable()
											cds_mut_ar[SNP_index_in_CDS] = gene_allele1
											peptide = standard_translator.translate(cds_mut_ar.toseq())
											
											alt_cds_mut_ar = cds_seq.tomutable()
											alt_cds_mut_ar[SNP_index_in_CDS] = gene_allele2
											alt_peptide = standard_translator.translate(alt_cds_mut_ar.toseq())
											aa = peptide[SNP_index_in_peptide]
											alt_aa = alt_peptide[SNP_index_in_peptide]
											if aa != alt_aa:
												snp_annotation_type_short_name = 'non-synonymous'
												comment = '%s->%s'%(aa, alt_aa)
											else:
												snp_annotation_type_short_name = 'synonymous'
												comment = None
											snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary.gene_commentary_id, \
																					which_coding_exon, pos_within_codon, comment))
											
											if aa != alt_aa:
												if aa=='*' or alt_aa=='*':
													snp_annotation_type_short_name = 'premature-stop-codon'	#could also be the last stop codon changing to something else 
													# and thereby extending the cds
													snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary.gene_commentary_id, \
																							which_coding_exon, pos_within_codon, comment))
												if SNP_index_in_peptide==0:
													snp_annotation_type_short_name = 'init-Met'
													snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary.gene_commentary_id, \
																							which_coding_exon, pos_within_codon, comment))
										except:
											# NOTE(review): a bare except also traps the SystemExit raised by sys.exit(3) above,
											# so an allele mismatch is logged here and processing goes on. Left unchanged on purpose;
											# switch to "except Exception:" if the exit is meant to take effect.
											traceback.print_exc()
											sys.stderr.write('%s.\n'%repr(sys.exc_info()))
											sys.stderr.write("Except encountered for SNP (%s, %s), gene %s (chr=%s, %s-%s), gene_commentary_id %s (%s-%s), box_ls=%s.\n"%\
												(chr, pos, gene_id, gene_model.chromosome, gene_model.start, gene_model.stop, \
												gene_commentary.gene_commentary_id, gene_commentary.start, gene_commentary.stop, repr(gene_commentary.box_ls)))
							elif box_type!=None and is_SNP_within_box==0:
								#SNP is over the range of the gene. it happens when one gene has multiple alternative splicing forms.
								# snps_context_wrapper uses the largest span to represent the gene.
								snp_annotation_type_short_name_ls.append(['intergenic'])
							else:
								sys.stderr.write("Warning: SNP (%s, %s) not in any of the boxes from gene %s (chr=%s, %s-%s), gene_commentary_id %s (%s-%s), box_ls=%s.\n"%\
												(chr, pos, gene_id, gene_model.chromosome, gene_model.start, gene_model.stop, \
												gene_commentary.gene_commentary_id, gene_commentary.start, gene_commentary.stop, repr(gene_commentary.box_ls)))
						else:
							if gene_model.type_of_gene=='pseudo':
								snp_annotation_type_short_name = gene_model.type_of_gene
							else:
								snp_annotation_type_short_name = gene_commentary.gene_commentary_type
							snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary.gene_commentary_id, None))
			else:	#integenic
				snp_annotation_type_short_name_ls.append(['intergenic'])
			
			#now save everything into db
			for snp_annotation_type_tup in snp_annotation_type_short_name_ls:
				snp_annotation_type_short_name = snp_annotation_type_tup[0]
				if snp_annotation_type_short_name not in snp_annotation_short_name2id:
					ty = Stock_250kDB.SNPAnnotationType(short_name=snp_annotation_type_short_name)
					session.add(ty)
					session.flush()
					snp_annotation_short_name2id[snp_annotation_type_short_name] = ty.id
				#the tuples have different lengths; missing trailing fields are stored as None
				if len(snp_annotation_type_tup)>=3:
					gene_id = snp_annotation_type_tup[1]
					gene_commentary_id = snp_annotation_type_tup[2]
				else:
					gene_id = None
					gene_commentary_id = None
				if len(snp_annotation_type_tup)>=4:
					which_exon_or_intron = snp_annotation_type_tup[3]
				else:
					which_exon_or_intron = None
				if len(snp_annotation_type_tup)>=5:
					pos_within_codon = snp_annotation_type_tup[4]
				else:
					pos_within_codon = None
				if len(snp_annotation_type_tup)>=6:
					comment = snp_annotation_type_tup[5]
				else:
					comment = None
				entry = Stock_250kDB.SNPAnnotation(snps_id=snps_id, gene_id=gene_id, gene_commentary_id=gene_commentary_id, \
												which_exon_or_intron=which_exon_or_intron, pos_within_codon=pos_within_codon,\
												comment=comment)
				entry.snp_annotation_type_id = snp_annotation_short_name2id[snp_annotation_type_short_name]
				session.add(entry)
				real_counter += 1
			if self.report and counter%2000==0:
				sys.stderr.write("%s%s\t%s"%('\x08'*40, counter, real_counter))
				session.flush()
		if self.report:
			sys.stderr.write("%s%s\t%s\n"%('\x08'*40, counter, real_counter))
		sys.stderr.write("Done.\n")
コード例 #13
0
    print("str(seq1) == str(seq2)")
else:
    print("str(seq1) != str(seq2)")

# MutableSeq objects
#
# Just like a normal Python string, a Seq object is "read only" - immutable,
# in Python terms. Apart from making Seq behave like a string, this is a
# useful default, because in biological applications you usually want to be
# sure you have not altered your sequence data.
print("\n###############\n8. 可变对象\n---------------")
my_seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)
try:
    my_seq[5] = "G"
    print("可以改变碱基")
except TypeError:
    # Item assignment on a Seq raises TypeError; only that case is expected here.
    print("无法改变碱基")
mutable_seq = my_seq.tomutable() # convert to a mutable object
print("可变对象(转变):", repr(mutable_seq))
from Bio.Seq import MutableSeq
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna) # create a mutable object directly
print("可变对象(创建):", repr(mutable_seq))
try:
    mutable_seq[5] = "C"
    print("可以改变碱基")
except TypeError:
    print("无法改变碱基")

# convert back to a read-only object
new_seq = mutable_seq.toseq()
コード例 #14
0
5'   ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG   3'
     |||||||||||||||||||||||||||||||||||||||
3'   TACCGGTAACATTACCCGGCGACTTTCCCACGGGCTATC   5'


|
Transcription
↓
5’   AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG   3’

                   messenger-RNA
"""
from Bio.Seq import Seq, MutableSeq
from Bio.Alphabet import IUPAC

dna = Seq(
    "TACATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAGTAA",
    IUPAC.unambiguous_dna,
)
print(repr(dna))

# Editing a base needs a mutable copy; ``dna`` itself is left untouched.
dna_m = dna.tomutable()
dna_m[8] = "A"
print(repr(dna_m))

# Transcription of the complement, then translation of the resulting mRNA:
# once with the default codon table and once with the "Bacterial" table
# (the latter is stored under the name ``mitochondrial_protein``).
m_rna = dna.complement().transcribe()
protein = m_rna.translate()
mitochondrial_protein = m_rna.translate(table="Bacterial")
for stage in (m_rna, protein, mitochondrial_protein):
    print(repr(stage))
コード例 #15
0
# Iterating over a sequence (kept for reference, not executed):
# my_seq = Seq("GATCG", IUPAC.unambiguous_dna)
# for index, letter in enumerate(my_seq):
#    print("%i %s" % (index, letter))

# Codon tables can be looked up by name (or by NCBI id: by_id[1], by_id[2]).
tables_by_name = CodonTable.unambiguous_dna_by_name
standard_table = tables_by_name["Standard"]
mito_table = tables_by_name["Vertebrate Mitochondrial"]
# standard_table.stop_codons
# print(standard_table.stop_codons)

# Two sequences that differ only in the second-to-last codon position.
j1 = Seq(
    "ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
    IUPAC.unambiguous_dna,
)
j2 = Seq(
    "ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATTG",
    IUPAC.unambiguous_dna,
)

# Compare sequences as strings (the result is not stored).
str(j1) == str(j2)

# A regular Seq cannot be changed through indexing; a mutable copy can.
j3 = j2.tomutable()
j3[0::1]  # full slice; the result is not stored
j3.pop()  # drops the last base in place

# A mutable sequence can also be created directly.
mutable_seq = MutableSeq(
    "GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA",
    IUPAC.unambiguous_dna,
)

# print(mutable_seq.remove("G"))
コード例 #16
0
ファイル: ref_func.py プロジェクト: Char-Al/NEAT
def read_ref(ref_path, ref_inds_i, n_handling, n_unknowns=True, quiet=False):
    """Read one sequence from a (possibly gzipped) reference and locate its N regions.

    ``ref_inds_i`` supplies the name (index 0, only printed) and the start and
    end file offsets (indexes 1 and 2) of the sequence. The text between the
    offsets is read, newlines are removed and the rest is upper-cased.

    A run of ``N`` - or, when ``n_unknowns`` is true, of any character not in
    ``OK_CHR_ORD`` - is an "N region". ``n_handling`` decides what happens:

    * ``('random', max_len)``: regions of at most ``max_len`` bases are
      filled with ``random.choice(ALLOWED_NUCL)`` per base;
    * ``('allChr', max_len, base)`` with ``base`` in ``OK_CHR_ORD``: such
      regions are filled with ``base``;
    * ``('ignore', ...)``: nothing is replaced and every region counts as big.

    Any other mode prints an error and exits with status 1.

    The replacement is done on a single mutable copy, converted back once;
    the sequence and order of ``random.choice`` calls are the same as before.
    The file is closed as soon as the data has been read, so it is also
    closed on the error-exit path.

    :return: ``(sequence, n_info)``; ``n_info['all']`` and ``n_info['big']``
        are flat ``[start, end, start, end, ...]`` lists of all / unreplaced
        regions, ``n_info['non_N']`` is a list of ``(start, end)`` spans
        between the big regions.
    """
    tt = time.time()
    if not quiet:
        print('reading ' + ref_inds_i[0] + '... ')

    absolute_reference_path = pathlib.Path(ref_path)
    opener = gzip.open if absolute_reference_path.suffix == '.gz' else open

    # TODO convert to SeqIO containers
    with opener(absolute_reference_path, 'rt') as ref_file:
        ref_file.seek(ref_inds_i[1])
        raw = ref_file.read(ref_inds_i[2] - ref_inds_i[1])
    my_dat = Seq(''.join(raw.split('\n')).upper())

    # find N regions
    # data explanation: my_dat[n_atlas[0][0]:n_atlas[0][1]] = solid block of Ns
    n_atlas = []
    run_start = None
    for i, base in enumerate(str(my_dat)):
        if base == 'N' or (n_unknowns and base not in OK_CHR_ORD):
            if run_start is None:
                run_start = i
        elif run_start is not None:
            n_atlas.append((run_start, i))
            run_start = None
    if run_start is not None:
        # the sequence ends inside an N region
        n_atlas.append((run_start, len(my_dat)))

    # handle N base-calls as desired
    n_info = {'all': [], 'big': [], 'non_N': []}
    mode = n_handling[0]
    if mode == 'random' or (mode == 'allChr' and n_handling[2] in OK_CHR_ORD):
        editable = None  # created lazily: most sequences need no replacement
        for region in n_atlas:
            n_info['all'].extend(region)
            if region[1] - region[0] <= n_handling[1]:
                if editable is None:
                    editable = my_dat.tomutable()
                for i in range(region[0], region[1]):
                    if mode == 'random':
                        editable[i] = random.choice(ALLOWED_NUCL)
                    else:
                        editable[i] = n_handling[2]
            else:
                n_info['big'].extend(region)
        if editable is not None:
            my_dat = editable.toseq()
    elif mode == 'ignore':
        for region in n_atlas:
            n_info['all'].extend(region)
            n_info['big'].extend(region)
    else:
        print('\nERROR: UNKNOWN N_HANDLING MODE\n')
        sys.exit(1)

    # spans between consecutive big regions (and before the first / after the last)
    habitable_regions = []
    if not n_info['big']:
        n_info['non_N'] = [(0, len(my_dat))]
    else:
        for i in range(0, len(n_info['big']), 2):
            if i == 0:
                habitable_regions.append((0, n_info['big'][0]))
            else:
                habitable_regions.append((n_info['big'][i - 1], n_info['big'][i]))
        habitable_regions.append((n_info['big'][-1], len(my_dat)))
    for n in habitable_regions:
        if n[0] != n[1]:
            n_info['non_N'].append(n)

    if not quiet:
        print('{0:.3f} (sec)'.format(time.time() - tt))

    return my_dat, n_info
コード例 #17
0
# Single-argument print(...) calls are used throughout so the script parses
# under both Python 2 and Python 3.
seq=Seq('CCGGUU',IUPAC.IUPACUnambiguousRNA())	#constructor class IUPAC...RNA
print(seq)
print(seq.back_transcribe())	#must be RNA to backtranscribe to DNA

seq=Seq('ATGGTCTTTCCAGACGCG',IUPAC.unambiguous_dna)
print(Seq.transcribe(seq))	#called as a function here; above the method form is used

print(seq[:5])	#string-like methods work
print(len(seq))
#seq[0]='C'	#not allowed: Seq objects are not mutable
st=str(seq)		#convert to a plain string
print(st)

#editable sequence data type
from Bio.Seq import MutableSeq
mut_seq=seq.tomutable()	#convert to the mutable sequence type
print(mut_seq)
mut_seq[0]='C'
print(mut_seq)
mut_seq=MutableSeq('ATGCCG',IUPAC.IUPACUnambiguousDNA())
#has methods as a list: append(), insert(), pop(), remove()
mut_seq[1:3]='TTT'
mut_seq.reverse()
mut_seq.complement()
print(mut_seq)
mut_seq.reverse_complement()
print(mut_seq)

#sequence metadata data type
from Bio.SeqRecord import SeqRecord
seqrec=SeqRecord(seq,id='001', name='My Secuencia')
コード例 #18
0
             if codon_1.translate(table="Bacterial") != (Seq(str(codon_1_ms),generic_dna)).translate(table="Bacterial"):
                 nonsyn_subst_int.append(1)
                 nonsyn_position_int[str(perm[l][l2]+1)] = nonsyn_position_int[str(perm[l][l2]+1)]+1
                 if (CODON[perm[l][l2]] == "A" or CODON[perm[l][l2]] == "T") and (codon_1_ms[perm[l][l2]] == "G" or codon_1_ms[perm[l][l2]] == "C"):
                     nonsyn_subst_to_gc_int.append(1)
                     nonsyn_position_to_gc_int[str(perm[l][l2]+1)] = nonsyn_position_to_gc_int[str(perm[l][l2]+1)]+1
                     gc_dict_int[str(perm[l][l2]+1)] = gc_dict_int[str(perm[l][l2]+1)]+1
             else:
                 syn_subst_int.append(1)
                 syn_position_int[str(perm[l][l2]+1)] = syn_position_int[str(perm[l][l2]+1)]+1
                 if (CODON[perm[l][l2]] == "A" or CODON[perm[l][l2]] == "T") and (codon_1_ms[perm[l][l2]] == "G" or codon_1_ms[perm[l][l2]] == "C"):
                     syn_subst_to_gc_int.append(1)
                     syn_position_to_gc_int[str(perm[l][l2]+1)] = syn_position_to_gc_int[str(perm[l][l2]+1)]+1
                     gc_dict_int[str(perm[l][l2]+1)] = gc_dict_int[str(perm[l][l2]+1)]+1
         codon_1 = Seq(str(codon_1_ms))
     codon_1_ms, codon_2_ms = codon_1.tomutable(), codon_2.tomutable()
 
 # Score correction
 
 syn_subst.append(sum(syn_subst_int)*(float(len(perm))/(sum(syn_subst_int)+sum(nonsyn_subst_int))))
 nonsyn_subst.append((sum(nonsyn_subst_int)*(float(len(perm))/(sum(syn_subst_int)+sum(nonsyn_subst_int)))))
 syn_subst_to_gc.append((sum(syn_subst_to_gc_int)*(float(len(perm))/(sum(syn_subst_int)+sum(nonsyn_subst_int)))))
 nonsyn_subst_to_gc.append((sum(nonsyn_subst_to_gc_int)*(float(len(perm))/(sum(syn_subst_int)+sum(nonsyn_subst_int)))))
 for x in range(0,3):
     gc_dict[str(x+1)] = gc_dict[str(x+1)]+(gc_dict_int[str(x+1)]*(float(len(perm))/(sum(syn_subst_int)+sum(nonsyn_subst_int))))
     codon_gc_dict[str(x+1)] = codon_gc_dict[str(x+1)]+(codon_gc_dict_int[str(x+1)]*(float(len(perm))/(sum(syn_subst_int)+sum(nonsyn_subst_int))))
     syn_position[str(x+1)] = syn_position[str(x+1)]+(syn_position_int[str(x+1)]*(float(len(perm))/(sum(syn_subst_int)+sum(nonsyn_subst_int))))
     nonsyn_position[str(x+1)] = nonsyn_position[str(x+1)]+(nonsyn_position_int[str(x+1)]*(float(len(perm))/(sum(syn_subst_int)+sum(nonsyn_subst_int))))
     syn_position_to_gc[str(x+1)] = syn_position_to_gc[str(x+1)]+(syn_position_to_gc_int[str(x+1)]*(float(len(perm))/(sum(syn_subst_int)+sum(nonsyn_subst_int))))
     nonsyn_position_to_gc[str(x+1)] = nonsyn_position_to_gc[str(x+1)]+(nonsyn_position_to_gc_int[str(x+1)]*(float(len(perm))/(sum(syn_subst_int)+sum(nonsyn_subst_int))))
     
コード例 #19
0
ファイル: seqparse.py プロジェクト: DadongZ/python_tutorial
# Walkthrough of basic Seq operations. Seq and generic_dna are used
# without a visible import in this excerpt.
my_gene = Seq("ACTAGCAGCGGA", generic_dna)
print(type(my_gene))
# Collect the public (non-underscore) attribute names of a Seq object.
attributes = [a for a in dir(my_gene) if not a.startswith("_")]
print(attributes)

# Transcribe the gene and show the alphabet attached to the result.
my_transcript = my_gene.transcribe()
print(my_transcript)
print(my_transcript.alphabet)

# Translate the gene and show the alphabet attached to the result.
my_protein = my_gene.translate()
print(my_protein)
print(my_protein.alphabet)

coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", generic_dna)
# to_stop=True: presumably stops translating at the first stop codon -- see Biopython docs.
myprot = coding_dna.translate(to_stop=True)
print(myprot)

seq1 = Seq("AAACGGA", generic_dna)
seq2 = Seq("GGAGAT", generic_dna)
mut_seq = seq1.tomutable()
# Bare expression: only echoes a value in an interactive session; no effect in a script.
mut_seq
mut_seq[0] = "G"
print(mut_seq)

myseq = Seq("CCAGAAACCCGGAA", generic_dna)
# find the first occurrence of the pattern
print(myseq.find("GAA"))
# find the number of non-overlapping occurrences of a pattern
print(myseq.count("GAA"))
コード例 #20
0
ファイル: lecture13.py プロジェクト: villegar/BIO792
# Lecture transcript: comparing sequences with different alphabets, then
# MutableSeq. The bare expressions only echo values in an interactive session.
# generic_dna / generic_protein are used without a visible import in this excerpt.
dna_seq = Seq("ACGT", generic_dna)
prot_seq = Seq("ACGT", generic_protein)
# Same letters, different alphabets: the comparison emits the warning quoted below.
dna_seq == prot_seq
## BiopythonWarning: Incompatible alphabets DNAAlphabet() and ProteinAlphabet()


# MutableSeq objects
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
my_seq = Seq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)

# my_seq[5] = "G"
## Error expected: a plain Seq does not support item assignment


# Round trip: Seq -> MutableSeq -> Seq.
mutable_seq = my_seq.tomutable()
mutable_seq
new_seq = mutable_seq.toseq()
new_seq

# A MutableSeq can also be constructed directly from a string.
from Bio.Seq import MutableSeq
from Bio.Alphabet import IUPAC
mutable_seq = MutableSeq("GCCATTGTAATGGGCCGCTGAAAGGGTGCCCGA", IUPAC.unambiguous_dna)

mutable_seq
mutable_seq[5] = "C"
mutable_seq
mutable_seq.remove("T")
mutable_seq
mutable_seq.reverse()
mutable_seq
コード例 #21
0
from Bio.Seq import Seq
# Three toy input sequences for the comparison helpers defined below.
seq1, seq2, seq3 = Seq("ACSA"), Seq("AST"), Seq("ACCST")

# Shared, initially empty mutable sequence that the helpers append state letters to.
hmm1 = Seq("").tomutable()

# "batas" is Indonesian for "limit"; its use is not visible in this excerpt.
batas = 4

def kurangDari(seqAwal,seqAkhir):
    """Walk seqAwal against seqAkhir and append state letters to the global hmm1.

    For every position of seqAwal:
      * symbol equals the current seqAkhir symbol -> append 'M', advance seqAkhir;
      * otherwise, on the last position of seqAwal -> append 'M';
      * otherwise, if the *next* seqAwal symbol equals the current seqAkhir
        symbol -> append 'D';
      * otherwise -> append 'D' then 'I' and advance seqAkhir.

    The letters presumably stand for match / delete / insert states; the
    function returns None and only mutates the module-level hmm1.
    (Indonesian names: kurangDari = "less than", seqAwal = "first sequence",
    seqAkhir = "last sequence".)
    """
    cursor = 0
    total = len(seqAwal)
    for idx in range(total):
        if seqAwal[idx] == seqAkhir[cursor]:
            hmm1.append('M')
            cursor += 1
            continue
        if idx + 1 >= total:
            # Last symbol of seqAwal with no match.
            hmm1.append('M')
            continue
        # Evaluate the look-ahead before appending so a failing lookup
        # leaves hmm1 untouched.
        next_matches = seqAwal[idx + 1] == seqAkhir[cursor]
        hmm1.append('D')
        if not next_matches:
            hmm1.append('I')
            cursor += 1

def lebihDari(seqAwal,seqAkhir):
    y = 0
    for x in range(len(seqAkhir)):
        if seqAwal[y] == seqAkhir[x]:
            hmm1.append('M')
            y = y + 1
        elif x + 1 < len(seqAkhir):
コード例 #22
0
ファイル: b.py プロジェクト: abinthomas744/Perl_Bio-Info_L-S6
# Demo of Seq immutability, MutableSeq editing, and a GC-percentage calculation.
# Seq and IUPAC are used without a visible import in this excerpt.
my_seq = Seq("GCCATTGTAATGGAATTAGTGGGTAACCCAGGGTAAACCTACCACCCCAGCCACCTCAG",
             IUPAC.unambiguous_dna)

# my_seq[5] = "G" would not work here because a Seq object is immutable.
from Bio.Seq import MutableSeq

mutable_seq = MutableSeq(
    "GCCATTGTAATGGAATTAGTGGGTAACCCAGGGTAAACCTACCACCCCAGCCACCTCAG",
    IUPAC.unambiguous_dna)
mutable_seq[5] = "C"
mutable_seq.remove("T")

mutable_seq.reverse()

# Strip every "G" from a mutable copy: one remove() call per counted "G".
k = my_seq.count('G')
mut_newseq = my_seq.tomutable()
for i in range(k):
    mut_newseq.remove("G")

my_seq = Seq("AGCTTCCATTTGGGTCATGATCC")

print(my_seq.complement())
my_seq.count("T")

# GC percentage = 100 * (G + C) / length. The G and C counts must be summed
# before scaling and dividing; without the parentheses only the C count was
# divided by the length and 100 * G was added on top.
GC_count = 100 * float(my_seq.count("G") + my_seq.count("C")) / len(my_seq)
print(GC_count)

# Slicing a sequence
my_seq1 = Seq("GATTATTTCCCCGCGCCCAGTCAAGGTAGTGCCATAACCGTCCCTG",
              IUPAC.unambiguous_dna)
print(my_seq1[4:12])
コード例 #23
0
ファイル: bio.py プロジェクト: Breenori/Development
# Transcription and translation
# transcribe / back_transcribe / translate / GC / SeqIO / MutableSeq and the
# variable dna_seq are all defined outside this excerpt.
coding_dna = Seq("ATGGCCATTGTAATG")
template_dna = coding_dna.reverse_complement()
messenger_rna = transcribe(coding_dna)
print(messenger_rna)

print(back_transcribe(messenger_rna))
print(translate(messenger_rna))

myThirdSequence = Seq("GATCGATGGGGGCTATCC")
print(GC(myThirdSequence))

# MutableSeq objects
print(dna_seq)
# dna_seq[0]="T" --> not allowed: a Seq cannot be modified in place
mutable_seq = dna_seq.tomutable()
mutable_seq[0] = "T"
print(mutable_seq)

# A MutableSeq can also be built directly from a string.
mutableSeq = MutableSeq("GCCCATC")
mutableSeq[1] = "A"
print(mutableSeq)
print('\n')

#----------------------------------------

print("FASTA File")
# NOTE(review): no close() for this handle is visible in the excerpt --
# confirm it is closed further down, or switch to a `with` block.
handle = open("ecoli.fasta")

# Print the id of every record in the FASTA file.
for seq_record in SeqIO.parse(handle, 'fasta'):
    print(seq_record.id)
コード例 #24
0
	def _constructSNPAnnotation(self, session, snp_info, snps_context_wrapper, gene_annotation, snp_annotation_short_name2id):
		"""
		Classify every SNP in snp_info against the gene models it overlaps and save the annotations via session.

		For each SNP, the gene contexts returned by snps_context_wrapper.returnGeneLs(chr, pos) are walked
		box by box (gene_commentary.box_ls). One or more annotation tuples are collected per SNP:
			- '5UTR' / '3UTR' for untranslated exons,
			- 'intron', plus 'splice-donor' / 'splice-acceptor' when the SNP is within
			  one position of an intron end,
			- 'synonymous' / 'non-synonymous', plus 'premature-stop-codon' and 'init-Met' where applicable,
			- the gene type ('pseudo') or the gene commentary type for genes without protein boxes,
			- 'intergenic' when no gene context exists or the SNP falls outside all boxes.
		Every tuple is then stored as a Stock_250kDB.SNPAnnotation; a missing annotation type is
		created on the fly as a Stock_250kDB.SNPAnnotationType.

		Arguments:
			session: object providing save() and flush().
			snp_info: provides chr_pos_ls and data_ls; each data_ls entry unpacks to
				(snps_id, chr, pos, allele1, allele2).
			snps_context_wrapper: returnGeneLs(chr, pos) yields (snps_id, disp_pos, gene_id) triples, or a falsy value.
			gene_annotation: provides the gene_id2model mapping.
			snp_annotation_short_name2id: dict of annotation short name -> type id; updated in place
				whenever a new annotation type is created.

		Returns None. Progress is reported on stderr when self.report is set.

		2009-2-5
			bug fixed. when adding a box as UTR, make sure after the forloop above, the current box is still the UTR.
		2009-1-5
		"""
		sys.stderr.write("Constructing SNPAnnotation ...\n")
		standard_translator = Translate.unambiguous_dna_by_id[1]
		counter = 0
		real_counter = 0
		for i in range(len(snp_info.chr_pos_ls)):
			counter += 1
			snps_id, chr, pos, allele1, allele2 = snp_info.data_ls[i]
			snps_context_matrix = snps_context_wrapper.returnGeneLs(chr, pos)
			snp_annotation_type_short_name_ls = []	#each element is (snp_annotation_type_short_name, gene_id, gene_commentary_id, which_exon_or_intron, pos_within_codon)
			if snps_context_matrix:
				for snps_context in snps_context_matrix:
					snps_id, disp_pos, gene_id = snps_context
					gene_model = gene_annotation.gene_id2model.get(gene_id)
					if gene_model is None:
						continue
					for gene_commentary in gene_model.gene_commentaries:
						if gene_commentary.protein_box_ls:
							which_intron = -1	#which intron the SNP resides, starting from 1
							which_coding_exon = -1	#which exon the SNP resides in terms of the CDS sequence, starting from 1
							accum_intron_len = 0	#the cumulative length of all introns from the one after the first coding exon up till the SNP's position (including the intron the SNP is in if it's under the rule).
							exon_5_end_pos = -1	#5' end position of the whole CDS sequence. bad name 'exon'
							UTR_2nd = 0	#whether this is the 2nd UTR
							box_type = None
							UTR_type = None
							is_SNP_within_box = 0	#protect against the possibility that SNP overshoots box_ls
							# NOTE: this loop reuses the name `i` of the outer SNP loop. That is harmless because the
							# outer loop reads `i` only at the top of each iteration, and the value left here
							# (index of the box holding the SNP) is needed right after the loop.
							for i in range(len(gene_commentary.box_ls)):
								box = gene_commentary.box_ls[i]
								start, stop, box_type, is_translated, protein_box_index = box
								if box_type=='exon' and is_translated==0:
									if UTR_2nd==0:
										if gene_model.strand=='-1':
											UTR_type='3UTR'
										else:
											UTR_type='5UTR'
										UTR_2nd += 1
									else:
										if gene_model.strand=='-1':
											UTR_type='5UTR'
										else:
											UTR_type='3UTR'
								elif box_type=='exon' and is_translated==1:
									if exon_5_end_pos==-1:
										exon_5_end_pos=start
									which_coding_exon += 1
								elif box_type=='intron':
									if which_coding_exon!=-1:	#exclude introns that stand before the 1st coding exon
										accum_intron_len += abs(stop-start)+1	#+1 because stop-start is intron_length-1
									which_intron += 1
								
								if pos>=start and pos<=stop:	#with this box
									is_SNP_within_box = 1
									break
							
							if gene_model.strand=='-1':
								#continue to read the box_ls to count no_of_introns
								no_of_introns = which_intron+1
								for j in range(i+1, len(gene_commentary.box_ls)):
									# Inspect the remaining boxes (index j). Indexing with i re-tested the SNP's own
									# box on every pass, so the intron total was wrong on the minus strand.
									# A separate name keeps start/stop/box_type of the SNP's box intact.
									remaining_box = gene_commentary.box_ls[j]
									if remaining_box[2]=='intron':
										no_of_introns += 1
							
							if box_type is not None and is_SNP_within_box==1:	#box_type is the type of the box where the SNP resides
								if UTR_type is not None and box_type=='exon' and is_translated==0:	#it's UTR. bug fixed. make sure after the forloop above, the current box is still the UTR.
									snp_annotation_type_short_name_ls.append((UTR_type, gene_id, gene_commentary.gene_commentary_id))
								else:
									if gene_model.strand=='-1':	#reverse the order of exon/intron
										no_of_coding_exons = len(gene_commentary.protein_box_ls)
										#no_of_introns = len(gene_commentary.mrna_box_ls)-no_of_coding_exons	#not right
										which_coding_exon = no_of_coding_exons-which_coding_exon
										which_intron = no_of_introns-which_intron
									else:
										which_coding_exon += 1
										which_intron += 1
									if box_type=='intron':
										snp_annotation_type_short_name_ls.append(('intron', gene_id, gene_commentary.gene_commentary_id, which_intron))
										if pos-start<=1:	#within the donor/acceptor two-nucleotide
											if gene_model.strand=='-1':
												snp_annotation_type_short_name = 'splice-acceptor'	#on the 3' of this intron
											else:
												snp_annotation_type_short_name = 'splice-donor'	#on the 5' of this intron
											snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary.gene_commentary_id, which_intron))
										elif stop-pos<=1:
											if gene_model.strand=='-1':
												snp_annotation_type_short_name = 'splice-donor'	#on the 5' of this intron
											else:
												snp_annotation_type_short_name = 'splice-acceptor'	#on the 3' of this intron
											snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary.gene_commentary_id, which_intron))
									elif box_type=='exon':	#must be translated
										try:
											SNP_index_in_CDS = pos - exon_5_end_pos - accum_intron_len
											if gene_model.strand=='-1':	#reverse
												SNP_index_in_CDS = len(gene_commentary.cds_sequence)-SNP_index_in_CDS-1	#-1 because SNP_index_in_CDS starts from 0
												gene_allele1 = nt2complement[allele1]
												gene_allele2 = nt2complement[allele2]
											else:
												gene_allele1 = allele1
												gene_allele2 = allele2
											SNP_index_in_CDS = int(SNP_index_in_CDS)	#SNP_index_in_CDS is type long. without int(), cds_seq[SNP_index_in_CDS] returns a Bio.Seq with one nucleotide, rather than a single-char string
											SNP_index_in_peptide = SNP_index_in_CDS/3
											
											SNP_index_in_peptide = int(SNP_index_in_peptide)	#ditto as int(SNP_index_in_CDS), not necessary
											
											pos_within_codon = SNP_index_in_CDS%3+1	#pos_within_codon starts from 1
											cds_seq = Seq(gene_commentary.cds_sequence, IUPAC.unambiguous_dna)
											if SNP_index_in_CDS>=len(cds_seq):
												sys.stderr.write("Warning: SNP (%s, %s), SNP_index_in_CDS=%s, is beyond any of the boxes from gene %s (chr=%s, %s-%s), gene_commentary_id %s (%s-%s), box_ls=%s, cds-length=%s. counted as intergenic.\n"%\
																(chr, pos, SNP_index_in_CDS, gene_id, gene_model.chromosome, gene_model.start, gene_model.stop, \
																gene_commentary.gene_commentary_id, gene_commentary.start, gene_commentary.stop, repr(gene_commentary.box_ls), len(cds_seq)))
												snp_annotation_type_short_name_ls.append(['intergenic'])
												continue
											if cds_seq[SNP_index_in_CDS]!=gene_allele1 and cds_seq[SNP_index_in_CDS]!=gene_allele2:
												sys.stderr.write("Error: Neither allele (%s, %s) from SNP (%s,%s) matches the nucleotide, %s, from the cds seq of gene %s (gene_commentary_id=%s).\n"%\
																	(gene_allele1, gene_allele2, chr, pos, cds_seq[SNP_index_in_CDS], gene_id, gene_commentary.gene_commentary_id))
												sys.exit(3)
											# Translate the CDS once per allele and compare the affected amino acid.
											cds_mut_ar = cds_seq.tomutable()
											cds_mut_ar[SNP_index_in_CDS] = gene_allele1
											peptide = standard_translator.translate(cds_mut_ar.toseq())
											
											alt_cds_mut_ar = cds_seq.tomutable()
											alt_cds_mut_ar[SNP_index_in_CDS] = gene_allele2
											alt_peptide = standard_translator.translate(alt_cds_mut_ar.toseq())
											aa = peptide[SNP_index_in_peptide]
											alt_aa = alt_peptide[SNP_index_in_peptide]
											if aa != alt_aa:
												snp_annotation_type_short_name = 'non-synonymous'
												comment = '%s->%s'%(aa, alt_aa)
											else:
												snp_annotation_type_short_name = 'synonymous'
												comment = None
											snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary.gene_commentary_id, which_coding_exon, pos_within_codon, comment))
											
											if aa != alt_aa:
												if aa=='*' or alt_aa=='*':
													snp_annotation_type_short_name = 'premature-stop-codon'	#could also be the last stop codon changing to something else and thereby extending the cds
													snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary.gene_commentary_id, which_coding_exon, pos_within_codon, comment))
												if SNP_index_in_peptide==0:
													snp_annotation_type_short_name = 'init-Met'
													snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary.gene_commentary_id, which_coding_exon, pos_within_codon, comment))
										# NOTE(review): a bare except also catches the SystemExit raised by sys.exit(3) above,
										# so the allele-mismatch "Error" is logged but does not stop the run. Kept as-is to
										# preserve the existing best-effort behaviour; narrow to `except Exception:` if the
										# exit is meant to take effect.
										except:
											traceback.print_exc()
											sys.stderr.write('%s.\n'%repr(sys.exc_info()))
											sys.stderr.write("Except encountered for SNP (%s, %s), gene %s (chr=%s, %s-%s), gene_commentary_id %s (%s-%s), box_ls=%s.\n"%\
												(chr, pos, gene_id, gene_model.chromosome, gene_model.start, gene_model.stop, \
												gene_commentary.gene_commentary_id, gene_commentary.start, gene_commentary.stop, repr(gene_commentary.box_ls)))
							elif box_type is not None and is_SNP_within_box==0:	#SNP is over the range of the gene. it happens when one gene has multiple alternative splicing forms. snps_context_wrapper uses the largest span to represent the gene.
								snp_annotation_type_short_name_ls.append(['intergenic'])
							else:
								sys.stderr.write("Warning: SNP (%s, %s) not in any of the boxes from gene %s (chr=%s, %s-%s), gene_commentary_id %s (%s-%s), box_ls=%s.\n"%\
												(chr, pos, gene_id, gene_model.chromosome, gene_model.start, gene_model.stop, \
												gene_commentary.gene_commentary_id, gene_commentary.start, gene_commentary.stop, repr(gene_commentary.box_ls)))
						else:
							if gene_model.type_of_gene=='pseudo':
								snp_annotation_type_short_name = gene_model.type_of_gene
							else:
								snp_annotation_type_short_name = gene_commentary.gene_commentary_type
							snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary.gene_commentary_id, None))
			else:	#integenic
				snp_annotation_type_short_name_ls.append(['intergenic'])
			
			#now save everything into db
			# The tuples have variable length; fields that are absent are stored as None.
			for snp_annotation_type_tup in snp_annotation_type_short_name_ls:
				snp_annotation_type_short_name = snp_annotation_type_tup[0]
				if snp_annotation_type_short_name not in snp_annotation_short_name2id:
					ty = Stock_250kDB.SNPAnnotationType(short_name=snp_annotation_type_short_name)
					session.save(ty)
					session.flush()
					snp_annotation_short_name2id[snp_annotation_type_short_name] = ty.id
				if len(snp_annotation_type_tup)>=3:
					gene_id = snp_annotation_type_tup[1]
					gene_commentary_id = snp_annotation_type_tup[2]
				else:
					gene_id = None
					gene_commentary_id = None
				if len(snp_annotation_type_tup)>=4:
					which_exon_or_intron = snp_annotation_type_tup[3]
				else:
					which_exon_or_intron = None
				if len(snp_annotation_type_tup)>=5:
					pos_within_codon = snp_annotation_type_tup[4]
				else:
					pos_within_codon = None
				if len(snp_annotation_type_tup)>=6:
					comment = snp_annotation_type_tup[5]
				else:
					comment = None
				entry = Stock_250kDB.SNPAnnotation(snps_id=snps_id, gene_id=gene_id, gene_commentary_id=gene_commentary_id, \
												which_exon_or_intron=which_exon_or_intron, pos_within_codon=pos_within_codon,\
												comment=comment)
				entry.snp_annotation_type_id = snp_annotation_short_name2id[snp_annotation_type_short_name]
				session.save(entry)
				session.flush()
				real_counter += 1
			if self.report and counter%2000==0:
				sys.stderr.write("%s%s\t%s"%('\x08'*40, counter, real_counter))
		if self.report:
			sys.stderr.write("%s%s\t%s\n"%('\x08'*40, counter, real_counter))
		sys.stderr.write("Done.\n")
コード例 #25
0
#from Bio import motifs
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Seq import MutableSeq

# Parse the 16S sequences and print the length of every record.
for rna in SeqIO.parse("vibrio.fasta", "fasta"):
    #print(rna.seq)
    #print(rna.id)
    print(len(rna))

# Probe sequence
Probe = Seq("AGGCCACAACCTCCAAGTAG")

# Convert the probe Seq to a mutable object so single bases can be replaced.
mutable_probe = Probe.tomutable()
# NOTE: a `global` statement at module level has no effect; MutNum is not
# assigned anywhere in this excerpt.
global MutNum


#generate the random mutation position
def RandomPosition(Probe, MutateNum):
    """Return MutateNum distinct random indices into Probe, in ascending order.

    The indices are drawn without replacement with np.random.choice and
    returned as a plain Python list, sorted so they are easy to check by eye.
    """
    picked = np.random.choice(range(len(Probe)), MutateNum, replace=False)
    return sorted(picked.tolist())


#generate the random mutation base
def RandomBase(ExistBase):
    Base = ["A", "T", "C", "G"]
    Base.remove(ExistBase)
コード例 #26
0
	def _constructSNPAnnotation(self, db_vervet, locus=None, oneGeneData=None, locus_context=None, locus_annotation_short_name2db_entry=None,\
							geneCommentaryRBDict=None, geneSegmentKey = None, compareIns=None, param_obj=None):
		"""
		Annotate one SNP locus against one gene segment and store the annotations through db_vervet.

		The segment the locus falls into is described by geneSegmentKey (label, is_translated, start/stop and
		the intron/cds/utr/exon numbers). Depending on that segment the locus is annotated as:
			- the segment label itself for untranslated UTR segments,
			- 'intron', plus 'splice-donor' / 'splice-acceptor' when within one position of an intron end,
			- 'synonymous' / 'non-synonymous', plus 'premature-stop-codon' and 'init-Met' for CDS segments,
			- 'pseudo' or the gene commentary type name for genes without protein boxes.

		Arguments:
			db_vervet: provides getLocusAnnotationType() and getLocusAnnotation().
			locus: is of type VervetDB.Locus; chromosome, start, id, ref_seq.sequence and alt_seq.sequence are read.
			oneGeneData: PassingData(strand = row.strand, gene_id = row.id, gene_start = row.start, \
										gene_stop = row.stop, geneCommentaryRBDictLs=[],\
										ncbi_gene_id=row.ncbi_gene_id)
			locus_context: provides gene_id, disp_pos and id.
			locus_annotation_short_name2db_entry: dict of annotation short name -> annotation-type db entry;
				filled in place for names not seen before.
			geneCommentaryRBDict: provides gene_commentary_id, box_ls, protein_box_ls, cds_sequence,
				CDS_5_end_pos, no_of_introns (and gene_commentary_type_name for non-coding genes).
			geneSegmentKey: the gene segment containing the locus.
			compareIns: not used in this method.
			param_obj: its counters no_of_locus_annotations_already_in_db, no_of_into_db and
				no_of_total_annotations are incremented.

		Returns the list of values returned by db_vervet.getLocusAnnotation(), one per annotation.
		Exits the process with status 3 when the SNP index lies beyond the CDS or neither allele
		matches the CDS nucleotide.

		2012.5.18
			add which_codon into the db_vervet.getLocusAnnotation()
		2012.5.14
			adapted from variation/src/ConstructSNPAnnotation.py
		2009-2-5
			bug fixed. when adding a box as UTR, make sure after the forloop above, the current box is still the UTR.
		2009-1-5
		"""
		sys.stderr.write("Constructing LocusAnnotation for SNP ...\n")
		#standard_translator = Translate.unambiguous_dna_by_id[1]	#2012.5.23 Translate is to be deprecated.
		counter = 0
		real_counter = 0
		counter += 1
		chr=locus.chromosome
		pos = locus.start
		allele1 = locus.ref_seq.sequence.encode('ascii')	#2012.5.17 unicode is not accepted by cds_seq.tomutable()
		allele2 = locus.alt_seq.sequence.encode('ascii')
		snp_annotation_type_short_name_ls = []	#each element is (snp_annotation_type_short_name, gene_id, gene_commentary_id, 
				# which_exon_or_intron, pos_within_codon)
		disp_pos = locus_context.disp_pos
		gene_id = locus_context.gene_id
		#gene_box_node_ls = []
		#geneCommentaryRBDict.findNodes(segmentKey, node_ls=gene_box_node_ls, compareIns=compareIns)
		gene_commentary_id = geneCommentaryRBDict.gene_commentary_id
		box_ls = geneCommentaryRBDict.box_ls
		protein_box_ls = geneCommentaryRBDict.protein_box_ls
		#gene_commentary = GeneCommentary.get(gene_commentary_id)
		#box_ls = gene_commentary.construct_annotated_box()
		cds_sequence = geneCommentaryRBDict.cds_sequence
		CDS_5_end_pos = geneCommentaryRBDict.CDS_5_end_pos
		no_of_introns = geneCommentaryRBDict.no_of_introns
		
		detailed_box_type = geneSegmentKey.label
		is_translated = geneSegmentKey.is_translated
		which_intron = geneSegmentKey.intron_number	#which intron the SNP resides, starting from 1
		which_coding_exon = geneSegmentKey.cds_number	#which exon the SNP resides in terms of the CDS sequence, starting from 1
		cumulativeWithinCDSUTRAndIntronLen = geneSegmentKey.cumulativeWithinCDSUTRAndIntronLen
		gene_segment_id = geneSegmentKey.gene_segment_id
		
		if protein_box_ls:	#this is a protein coding gene
			# An untranslated segment whose label contains 'UTR' is a UTR. The earlier condition additionally
			# required the label to equal 'exon', which can never hold together with containing 'UTR',
			# so UTR loci silently received no annotation.
			if 'UTR' in detailed_box_type and is_translated==0:
				snp_annotation_type_short_name_ls.append((detailed_box_type, gene_id, gene_commentary_id, None , None, None))
			else:
				if oneGeneData.strand=='-1':	#reverse the order of exon/intron
					no_of_coding_exons = len(protein_box_ls)
					#no_of_introns = len(gene_commentary.mrna_box_ls)-no_of_coding_exons	#not right
					which_coding_exon = no_of_coding_exons-which_coding_exon+1
					which_intron = no_of_introns-which_intron + 1
				if detailed_box_type=='intron':
					snp_annotation_type_short_name_ls.append(('intron', gene_id, gene_commentary_id, which_intron, None, None))
					if pos-geneSegmentKey.start<=1:	#within the donor/acceptor two-nucleotide
						if oneGeneData.strand=='-1':
							snp_annotation_type_short_name = 'splice-acceptor'	#on the 3' of this intron
						else:
							snp_annotation_type_short_name = 'splice-donor'	#on the 5' of this intron
						snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, \
																gene_commentary_id, which_intron, None, None))
					elif geneSegmentKey.stop-pos<=1:
						if oneGeneData.strand=='-1':
							snp_annotation_type_short_name = 'splice-donor'	#on the 5' of this intron
						else:
							snp_annotation_type_short_name = 'splice-acceptor'	#on the 3' of this intron
						snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, \
																gene_commentary_id, which_intron, None, None))
				elif detailed_box_type=='CDS':	#must be translated
					SNP_index_in_CDS = pos - CDS_5_end_pos - cumulativeWithinCDSUTRAndIntronLen
					if oneGeneData.strand=='-1':	#reverse
						SNP_index_in_CDS = len(cds_sequence)-SNP_index_in_CDS-1	#-1 because SNP_index_in_CDS starts from 0
						gene_allele1 = nt2complement[allele1]
						gene_allele2 = nt2complement[allele2]
					else:
						gene_allele1 = allele1
						gene_allele2 = allele2
					SNP_index_in_CDS = int(SNP_index_in_CDS)	
					# SNP_index_in_CDS is type long. without int(), cds_seq[SNP_index_in_CDS] returns a Bio.Seq with one nucleotide, 
					# rather than a single-char string
					SNP_index_in_peptide = SNP_index_in_CDS/3
					
					SNP_index_in_peptide = int(SNP_index_in_peptide)	#ditto as int(SNP_index_in_CDS), not necessary
					
					pos_within_codon = SNP_index_in_CDS%3+1	#pos_within_codon starts from 1
					cds_seq = Seq(cds_sequence, IUPAC.unambiguous_dna)
					if SNP_index_in_CDS>=len(cds_seq):
						sys.stderr.write("Error: SNP (%s, %s), SNP_index_in_CDS=%s, is beyond any of the boxes from gene %s (chr=%s, %s-%s), \
								gene_commentary_id %s (%s-%s), box_ls=%s, cds-length=%s. counted as intergenic.\n"%\
								(chr, pos, SNP_index_in_CDS, gene_id, oneGeneData.chromosome, oneGeneData.gene_start, oneGeneData.gene_stop, \
								gene_commentary_id, geneCommentaryRBDict.start, geneCommentaryRBDict.stop, repr(box_ls), \
								len(cds_seq)))
						# Fatal: nothing after this call runs (an 'intergenic' append that used to follow was unreachable).
						sys.exit(3)
					if cds_seq[SNP_index_in_CDS]!=gene_allele1 and cds_seq[SNP_index_in_CDS]!=gene_allele2:
						sys.stderr.write("Error: Neither allele (%s, %s) from SNP (%s,%s) matches the nucleotide, %s, from the cds seq of gene %s \
							(gene_commentary_id=%s).\n"%\
							(gene_allele1, gene_allele2, chr, pos, cds_seq[SNP_index_in_CDS], gene_id, gene_commentary_id))
						sys.exit(3)
					# Translate the CDS once per allele and compare the affected amino acid.
					cds_mut_ar = cds_seq.tomutable()
					cds_mut_ar[SNP_index_in_CDS] = gene_allele1
					peptide = cds_mut_ar.toseq().translate()	#2012.5.23 no more translator. table=1
					
					alt_cds_mut_ar = cds_seq.tomutable()
					alt_cds_mut_ar[SNP_index_in_CDS] = gene_allele2
					alt_peptide = alt_cds_mut_ar.toseq().translate()
					aa = peptide[SNP_index_in_peptide]
					alt_aa = alt_peptide[SNP_index_in_peptide]
					if aa != alt_aa:
						snp_annotation_type_short_name = 'non-synonymous'
						comment = '%s->%s'%(aa, alt_aa)
					else:
						snp_annotation_type_short_name = 'synonymous'
						comment = None
					snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary_id, \
															which_coding_exon, pos_within_codon, comment, SNP_index_in_peptide))
					
					if aa != alt_aa:
						if aa=='*' or alt_aa=='*':
							snp_annotation_type_short_name = 'premature-stop-codon'	#could also be the last stop codon changing to something else 
							# and thereby extending the cds
							snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary_id, \
															which_coding_exon, pos_within_codon, comment, SNP_index_in_peptide))
						if SNP_index_in_peptide==0:
							snp_annotation_type_short_name = 'init-Met'
							snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary_id, \
															which_coding_exon, pos_within_codon, comment, SNP_index_in_peptide))
					"""
					except:
						traceback.print_exc()
						sys.stderr.write('%s.\n'%repr(sys.exc_info()))
						sys.stderr.write("Except encountered for SNP (%s, %s), gene %s (chr=%s, %s-%s), gene_commentary_id %s (%s-%s), box_ls=%s.\n"%\
							(chr, pos, gene_id, locus.chromosome, oneGeneData.start, oneGeneData.stop, \
							gene_commentary_id, geneCommentaryRBDict.start, geneCommentaryRBDict.stop, repr(box_ls)))
					"""
		else:
			if oneGeneData.type_of_gene=='pseudo':
				snp_annotation_type_short_name = oneGeneData.type_of_gene
			else:
				snp_annotation_type_short_name = geneCommentaryRBDict.gene_commentary_type_name
			snp_annotation_type_short_name_ls.append((snp_annotation_type_short_name, gene_id, gene_commentary_id, \
													None, None, None))
		#else:	#integenic
		#	snp_annotation_type_short_name_ls.append(['intergenic'])
			
		#now save everything into db
		# The tuples have variable length; fields that are absent are stored as None.
		locus_annotation_ls = []
		for snp_annotation_type_tup in snp_annotation_type_short_name_ls:
			snp_annotation_type_short_name = snp_annotation_type_tup[0]
			if snp_annotation_type_short_name not in locus_annotation_short_name2db_entry:
				ty = db_vervet.getLocusAnnotationType(locus_annotation_type_short_name=snp_annotation_type_short_name)
				locus_annotation_short_name2db_entry[snp_annotation_type_short_name] = ty
			if len(snp_annotation_type_tup)>=3:
				gene_id = snp_annotation_type_tup[1]
				gene_commentary_id = snp_annotation_type_tup[2]
			else:
				gene_id = None
				gene_commentary_id = None
			if len(snp_annotation_type_tup)>=4:
				which_exon_or_intron = snp_annotation_type_tup[3]
			else:
				which_exon_or_intron = None
			if len(snp_annotation_type_tup)>=5:
				pos_within_codon = snp_annotation_type_tup[4]
			else:
				pos_within_codon = None
			if len(snp_annotation_type_tup)>=6:
				comment = snp_annotation_type_tup[5]
			else:
				comment = None
			if len(snp_annotation_type_tup)>=7:
				which_codon = snp_annotation_type_tup[6] +1	#[6] is the SNP_index_in_peptide
			else:
				which_codon = None
			locus_annotation_type = locus_annotation_short_name2db_entry.get(snp_annotation_type_short_name)
			
			locus_annotation = db_vervet.getLocusAnnotation(locus_id=locus.id, locus_context_id=locus_context.id, \
													locus_context=locus_context, gene_id=gene_id, \
													gene_commentary_id=gene_commentary_id, \
													gene_segment_id=gene_segment_id, \
													locus_annotation_type=locus_annotation_type, locus_annotation_type_id=locus_annotation_type.id,\
													which_exon_or_intron=which_exon_or_intron, pos_within_codon=pos_within_codon, \
													which_codon=which_codon, label=geneSegmentKey.label, \
													utr_number=geneSegmentKey.utr_number, cds_number=geneSegmentKey.cds_number, \
													intron_number=geneSegmentKey.intron_number,\
													exon_number=geneSegmentKey.exon_number, overlap_length=None, \
													overlap_fraction_in_locus=None, overlap_fraction_in_gene=None,\
													comment=comment)
			if locus_annotation:
				param_obj.no_of_locus_annotations_already_in_db += 1
			else:
				param_obj.no_of_into_db += 1
			param_obj.no_of_total_annotations += 1
			locus_annotation_ls.append(locus_annotation)
			real_counter += 1
		"""
		if self.report and counter%2000==0:
			sys.stderr.write("%s%s\t%s"%('\x08'*40, counter, real_counter))
			session.flush()
		if self.report:
			sys.stderr.write("%s%s\t%s\n"%('\x08'*40, counter, real_counter))
		"""
		sys.stderr.write("Done.\n")
		return locus_annotation_ls
コード例 #27
0
# Python 2 walkthrough: codon tables, sequence comparison, MutableSeq.
# CodonTable, Seq and IUPAC are used without a visible import in this excerpt.
#print CodonTable.unambiguous_dna_by_id[2]
print CodonTable.unambiguous_dna_by_id[1].stop_codons
print CodonTable.unambiguous_dna_by_id[2].start_codons
print CodonTable.unambiguous_dna_by_id[1].forward_table['ACG']    # which amino acid this codon encodes

# Comparing sequences
seq1 = Seq('ACGT',IUPAC.unambiguous_dna)
seq2 = Seq('ACGT',IUPAC.unambiguous_dna)
seq3 = Seq('ACGT',IUPAC.protein)
print id(seq1) == id(seq2)    # identity check: seq1 and seq2 are two separate objects
print str(seq1) == str(seq2)    # convert to strings to compare the letters
print str(seq1) == str(seq3)    # as plain strings the DNA and the protein are the same text

# MutableSeq
from Bio.Seq import MutableSeq
mutseq = seq1.tomutable()    # convert to MutableSeq
print mutseq, type(mutseq)
mutSeq = MutableSeq('CGTTTAAGCTGC',IUPAC.unambiguous_dna)
print mutSeq, type(mutSeq)
mutseq[1]='T'    # impossible on a plain Seq
print mutseq
seq1 = mutseq.toseq()    # convert back to Seq
mutSeq.remove('A')    # remove the first A
mutSeq[2:-5]='TTTT'
mutSeq.reverse()    # reverse() and reverse_complement() change the object itself
print mutSeq
# A MutableSeq can't be a dictionary key; a Seq or a string can

# UnknownSeq
# Subclass of Seq for when you know the length but not the characters; saves memory
from Bio.Seq import UnknownSeq