def translateSixFrame(seq):
    """Print and return the six-frame translation of ``seq``.

    The translation is produced by PyCogent's ``DEFAULT`` genetic code,
    imported locally as ``standard_code``.

    :param seq: DNA sequence accepted by ``standard_code.sixframes``
    :return: the value produced by ``standard_code.sixframes(seq)``. Earlier
        versions only printed it and returned None, so callers that ignore
        the result are unaffected.
    """
    from cogent.core.genetic_code import DEFAULT as standard_code

    translations = standard_code.sixframes(seq)
    # Single-argument call form: prints the same text under Python 2 and 3.
    print(translations)
    return translations
def translate_seq(sequence, strand):
    '''
    Translate a DNA sequence, taking its strand into account.

    :param sequence: DNA sequence
    :param strand: strand of the DNA sequence; it is compared, as a string,
                   against "1"
    :return: translation of the sequence itself when strand is "1",
             otherwise translation of its reverse complement
    '''
    # Anything other than strand "1" is read from the opposite strand.
    if str(strand) != "1":
        sequence = reverse_complement(sequence)
    return standard_code.translate(sequence)
def ensembl_construct_sequences(psm_hash, ensembl, transcript_ids, database_v,
                                species, three_frame_translation, mode):
    '''
    Fill every psm_hash entry with transcript/protein sequence, chromosome
    and strand from BioMart, then fetch the exons of the transcripts.

    :param psm_hash: dictionary with protein / ensembl information
                     (see prepareAnnotationENSEMBL)
    :param ensembl: ensembl genome, handed to get_ensembl_exons
    :param transcript_ids: list of transcript ids (converted from protein IDs)
    :param database_v: database version
    :param species: species name
    :param three_frame_translation: forwarded unchanged to the BioMart query
    :param mode: forwarded unchanged to get_ensembl_exons
    :return: list [psm_hash, exon_hash]
    '''
    print("Commencing transcript and protein sequence retrieval")

    # Keys whose sequence has to be rebuilt from exons further down.
    # Nothing in this function adds to the list.
    no_protein_seq = []

    # transcript id -> psm_hash key, plus the id list sent to BioMart
    biomart_key_hash = {}
    stable_transcript_ids = []
    for psm_key in psm_hash:
        transcript_id = psm_hash[psm_key]['transcript_id']
        biomart_key_hash[transcript_id] = psm_key
        stable_transcript_ids.append(transcript_id)

    # Retrieve cds, transcript_id, chr and strand from biomart
    biomart_result = proBAM_biomart.retrieve_data_from_biomart(
        database_v, species, stable_transcript_ids, three_frame_translation)
    for line in biomart_result:
        fields = line.split("\t")
        try:
            record = psm_hash[biomart_key_hash[fields[1]]]
            record['transcript_seq'] = fields[0]
            record['protein_seq'] = standard_code.translate(fields[0])
            # TODO what to do with "special" ensembl chromosomes: currently
            # TODO leave them out => bam conversion considers these psms unmapped
            # if "_" in fields[2]:
            #     print fields[1], fields[2]
            record['chr'] = fields[2]
            record['strand'] = fields[3]
        except IndexError:
            # Rows with too few columns are skipped; whatever was stored
            # before the missing column was reached stays in place.
            pass
    del biomart_result

    # get exons directly from core database
    exon_lookup = get_ensembl_exons(ensembl, transcript_ids, psm_hash, mode)
    exon_hash, psm_hash = exon_lookup[0], exon_lookup[1]
    del exon_lookup

    # retrieve protein sequences for transcripts where the protein sequence
    # could not be fetched automatically
    for psm_key in no_protein_seq:
        entry = psm_hash[psm_key]
        entry['transcript_seq'] = retrieve_protein_seq(
            entry['transcript_seq'],
            exon_hash[entry['transcript_id']],
            entry['5UTR_offset'],
            entry['start_exon_rank'])
        # translate till stop codon
        translated = standard_code.translate(entry['transcript_seq'])
        entry['protein_seq'] = translated.partition('*')[0]

    return [psm_hash, exon_hash]
def findBestSeq(seqobject):
    """Choose the forward reading frame with the longest peptide before a stop.

    The sequence is translated with ``standard_code.sixframes``; each
    translation is cut at its first ``*`` and only the first three
    translations are compared. Ties go to the lowest frame offset.

    :param seqobject: object exposing ``seq`` (converted with ``str``) and
        ``id``; both are passed to ``DNA.makeSequence``
    :return: tuple ``(peptide, dna)`` where ``peptide`` is the longest
        stop-free leading peptide and ``dna`` is the sequence with the
        matching number (0, 1 or 2) of leading nucleotides removed. When
        every candidate peptide is empty the result is ``('', <full
        sequence>)``; the previous implementation raised UnboundLocalError
        in that case because the DNA result was never assigned.
    """
    my_seq = DNA.makeSequence(str(seqobject.seq), seqobject.id)

    # As before, index x of the sixframes() result is paired with my_seq[x:],
    # i.e. the first three translations are taken to be the forward frames
    # at offsets 0, 1 and 2.
    leading_peptides = [
        translation.split('*')[0]
        for translation in standard_code.sixframes(my_seq)[:3]
    ]

    # max() returns the first maximal element, which reproduces the strict
    # "<" comparison of the original loop and yields offset 0 when all
    # candidates are empty.
    best_offset = max(range(len(leading_peptides)),
                      key=lambda offset: len(leading_peptides[offset]))
    return leading_peptides[best_offset], my_seq[best_offset:]
def map_peptide_to_protein_3frame(peptide_seq, transcript_seq,
                                  allowed_mismatches, strand):
    '''
    Locate a peptide in the three forward reading frames of a transcript.

    :param peptide_seq: peptide sequence (string)
    :param transcript_seq: transcript sequence (string)
    :param allowed_mismatches: number of allowed mismatches
    :param strand: chromosome strand (kept for interface compatibility; it
                   is not used by this function)
    :return: [hits, pre_post_aa]. hits is a list of
             [nucleotide offset of the hit in transcript_seq, mismatches].
             pre_post_aa holds the up-to-two residues directly before and
             after the LAST hit found, "*" standing in for "no residue";
             it stays ['', ''] when there is no hit at all.
    '''
    hits = []
    pre_post_aa = ['', '']
    pep_length = len(peptide_seq)

    # Frame k is the translation after skipping k leading nucleotides.
    frames = [standard_code.translate(transcript_seq[shift:])
              for shift in range(3)]

    for frame_shift, protein in enumerate(frames):
        # NOTE(review): the last possible window (i == len(protein) -
        # pep_length) is not scanned, exactly as in the original code.
        # Confirm whether that is meant to keep a trailing stop out of the
        # comparison before widening the range.
        for i in range(len(protein) - pep_length):
            end = i + pep_length
            # Distance is computed once and reused for both test and record.
            mismatches = hamming(peptide_seq, protein[i:end])
            if mismatches <= allowed_mismatches:
                hits.append([(i * 3) + frame_shift, mismatches])
                # Up to 2 preceding AA; "*" when the hit starts the frame.
                pre_post_aa[0] = protein[max(i - 2, 0):i] or "*"
                # Up to 2 following AA. Slicing clamps at the frame end,
                # which is what the old start-based branch conditions
                # effectively relied on for every non-empty peptide.
                pre_post_aa[1] = protein[end:end + 2] or "*"
    return [hits, pre_post_aa]
def map_peptide_to_protein_3frame(peptide_seq, transcript_seq,
                                  allowed_mismatches, strand):
    '''
    Locate a peptide in the three forward reading frames of a transcript.

    :param peptide_seq: peptide sequence (string)
    :param transcript_seq: transcript sequence (string)
    :param allowed_mismatches: number of allowed mismatches
    :param strand: chromosome strand (kept for interface compatibility; it
                   is not used by this function)
    :return: [hits, pre_post_aa]. hits is a list of
             [nucleotide offset of the hit in transcript_seq, mismatches].
             pre_post_aa holds the up-to-two residues directly before and
             after the LAST hit found, "*" standing in for "no residue";
             it stays ['', ''] when there is no hit at all.
    '''
    hits = []
    pre_post_aa = ['', '']
    pep_length = len(peptide_seq)

    # Frame k is the translation after skipping k leading nucleotides.
    frames = [standard_code.translate(transcript_seq[shift:])
              for shift in range(3)]

    for frame_shift, protein in enumerate(frames):
        # NOTE(review): the last possible window (i == len(protein) -
        # pep_length) is not scanned, exactly as in the original code.
        # Confirm whether that is meant to keep a trailing stop out of the
        # comparison before widening the range.
        for i in range(len(protein) - pep_length):
            end = i + pep_length
            # Distance is computed once and reused for both test and record.
            mismatches = hamming(peptide_seq, protein[i:end])
            if mismatches <= allowed_mismatches:
                hits.append([(i * 3) + frame_shift, mismatches])
                # Up to 2 preceding AA; "*" when the hit starts the frame.
                pre_post_aa[0] = protein[max(i - 2, 0):i] or "*"
                # Up to 2 following AA. Slicing clamps at the frame end,
                # which is what the old start-based branch conditions
                # effectively relied on for every non-empty peptide.
                pre_post_aa[1] = protein[end:end + 2] or "*"
    return [hits, pre_post_aa]
def ensembl_construct_sequences(
        psm_hash,
        mysql_db,
        transcript_ids,
        database_v,
        species,
        three_frame_translation,
        mode,
):
    '''
    Fill every psm_hash entry with transcript/protein sequence, frame shift,
    chromosome and strand from BioMart (queried in chunks), then fetch the
    exons of the transcripts.

    :param psm_hash: dictionary with protein / ensembl information
                     (see prepareAnnotationENSEMBL)
    :param mysql_db: database handle, handed to get_ensembl_exons
    :param transcript_ids: list of transcript ids (converted from protein IDs)
    :param database_v: database version
    :param species: species name
    :param three_frame_translation: forwarded unchanged to the BioMart query
    :param mode: forwarded unchanged to get_ensembl_exons
    :return: list [psm_hash, exon_hash]
    '''
    print("Commencing transcript and protein sequence retrieval")

    # Keys whose sequence has to be rebuilt from exons further down.
    # Nothing in this function adds to the list.
    no_protein_seq = []

    # transcript id -> psm_hash key, plus the id list sent to BioMart
    biomart_key_hash = {}
    stable_transcript_ids = []
    for psm_key in psm_hash:
        transcript_id = psm_hash[psm_key]['transcript_id']
        biomart_key_hash[transcript_id] = psm_key
        stable_transcript_ids.append(transcript_id)

    process = 0
    for chunk in chunkIt(stable_transcript_ids, 10):
        # Retrieve cds, transcript_id, chr and strand from biomart;
        # an AttributeError is answered with one retry after a minute.
        try:
            biomart_result = proBAM_biomart.retrieve_data_from_biomart(
                database_v, species, chunk, three_frame_translation)
        except AttributeError:
            time.sleep(60)
            print("BioMart connection timeout, reconnecting to BioMart")
            biomart_result = proBAM_biomart.retrieve_data_from_biomart(
                database_v, species, chunk, three_frame_translation)

        for line in biomart_result:
            fields = line.split("\t")
            try:
                record = psm_hash[biomart_key_hash[fields[1]]]
                record['transcript_seq'] = fields[0]
                record['shift'] = _calc_seq_shift_(fields[0])
                record['protein_seq'] = standard_code.translate(fields[0])
                record['chr'] = fields[2]
                record['strand'] = fields[3]
            except IndexError:
                # Rows with too few columns are skipped; whatever was stored
                # before the missing column was reached stays in place.
                pass
        del biomart_result

        # progress indicator, capped at 100%
        if process < 100:
            process += 10
            # Python 2 print statement: the trailing comma keeps the
            # cursor on the same line.
            print(str(process) + "% "),
    print(" ")

    # get exons directly from core database
    exon_lookup = get_ensembl_exons(mysql_db, transcript_ids, psm_hash, mode)
    exon_hash, psm_hash = exon_lookup[0], exon_lookup[1]
    del exon_lookup

    # retrieve protein sequences for transcripts where the protein sequence
    # could not be fetched automatically
    for psm_key in no_protein_seq:
        entry = psm_hash[psm_key]
        entry['transcript_seq'] = retrieve_protein_seq(
            entry['transcript_seq'],
            exon_hash[entry['transcript_id']],
            entry['5UTR_offset'],
            entry['start_exon_rank'])
        entry['shift'] = _calc_seq_shift_(entry['transcript_seq'])
        # translate till stop codon
        translated = standard_code.translate(entry['transcript_seq'])
        entry['protein_seq'] = translated.partition('*')[0]

    return [psm_hash, exon_hash]
if not protein in dna_sequence_dic: nucleotide_not_found.append(protein) continue sequence_with_stop_codons = DNA.makeSequence(dna_sequence_dic[protein]) #Check if the sequence is the right one, and check for in frame stops #It seems that in JGI annotation, when scaffolds are joined, the resulted proteins do not match #the DNA sequence #Right now, I'll just remove those sequences, and deal with that later if len(sequence_with_stop_codons) % 3 == 0: seq_no_stop_codon = sequence_with_stop_codons.withoutTerminalStopCodon() #Chec for inframe stop codons stops_frame = standard_code.getStopIndices(seq_no_stop_codon, start=0) if len(stops_frame) > 0: inframe_stops.append([cluster, genome_id, protein_id]) else: curated_protein_list[protein] = seq_no_stop_codon else: frameshift_cases.append([cluster, genome_id, protein_id]) if len(curated_protein_list) < 2: # Only take those clusters with 3 sequences or more clusters_too_short.append(cluster) continue #Alignments and output data
def assign_dna_reads_to_protein_database(query_fasta_fp, database_fasta_fp,
                                         output_fp, temp_dir="/tmp",
                                         params=None):
    """Assign DNA reads to a database fasta of protein sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed. A temporary
    file must be written containing the translated sequences from the input
    query fasta file because BLAT cannot do this automatically.

    query_fasta_fp: absolute path to the query fasta file containing DNA
                    sequences.
    database_fasta_fp: absolute path to the database fasta file containing
                       protein sequences.
    output_fp: absolute path where the output file will be generated.
    temp_dir: optional. Change the location where the translated sequences
              will be written before being used as the query. Defaults to
              /tmp.
    params: optional. dict containing parameter settings to be used
            instead of default values. Cannot change database or query
            file types from protein and dna, respectively.

    Raises ApplicationError if temp_dir is not an absolute path or if
    params contains '-t' or '-q'.

    This method returns an open file object. The output format
    defaults to blast9 and should be parsable by the PyCogent BLAST parsers.

    The query fasta handle and the temporary file are closed, and the
    temporary file is deleted, even when translation or the downstream
    assignment raises.
    """
    if params is None:
        params = {}

    # Both sides are protein for BLAT: the DNA query is translated below
    # before it is handed over.
    my_params = {'-t': 'prot', '-q': 'prot'}

    # make sure temp_dir specifies an absolute path
    if not isabs(temp_dir):
        raise ApplicationError("temp_dir must be an absolute path.")

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError("Cannot change database or query types "
                               "when using "
                               "assign_dna_reads_to_protein_database. "
                               "Use assign_reads_to_database instead.")

    my_params.update(params)

    # get six-frame translation of the input DNA sequences and write them to
    # temporary file.
    tmp = get_tmp_filename(tmp_dir=temp_dir, result_constructor=str)
    tmp_out = open(tmp, 'w')
    # The temporary file exists from here on, so it is always removed.
    try:
        with tmp_out:
            with open(query_fasta_fp) as query_file:
                for label, sequence in MinimalFastaParser(query_file):
                    seq_id = label.split()[0]
                    translations = standard_code.sixframes(
                        DNA.makeSequence(sequence))
                    # Frame labels are paired positionally with the
                    # sixframes() output; entries are written in ascending
                    # label order (-3 .. 3).
                    frames = [1, 2, 3, -1, -2, -3]
                    for frame, translation in sorted(zip(frames,
                                                         translations)):
                        tmp_out.write(
                            '>{seq_id}_frame_{frame}\n{trans}\n'.format(
                                seq_id=seq_id, frame=frame,
                                trans=translation))

        return assign_reads_to_database(tmp, database_fasta_fp, output_fp,
                                        params=my_params)
    finally:
        remove(tmp)
continue sequence_with_stop_codons = DNA.makeSequence( dna_sequence_dic[protein]) #Check if the sequence is the right one, and check for in frame stops #It seems that in JGI annotation, when scaffolds are joined, the resulted proteins do not match #the DNA sequence #Right now, I'll just remove those sequences, and deal with that later if len(sequence_with_stop_codons) % 3 == 0: seq_no_stop_codon = sequence_with_stop_codons.withoutTerminalStopCodon( ) #Chec for inframe stop codons stops_frame = standard_code.getStopIndices(seq_no_stop_codon, start=0) if len(stops_frame) > 0: inframe_stops.append([cluster, genome_id, protein_id]) else: curated_protein_list[protein] = seq_no_stop_codon else: frameshift_cases.append([cluster, genome_id, protein_id]) if len(curated_protein_list ) < 2: # Only take those clusters with 3 sequences or more clusters_too_short.append(cluster) continue