def translate_seq(sequence, strand):
    '''
    Translate a DNA sequence, taking its strand into account.

    :param sequence: DNA sequence
    :param strand: strand of the DNA sequence; it is converted with str()
                   and compared to "1"
    :return: translation of the sequence when the strand is "1", otherwise
             the translation of the reverse complement of the sequence
    '''
    on_strand_one = (str(strand) == "1")
    # Any strand value other than "1" is translated from the reverse complement.
    template = sequence if on_strand_one else reverse_complement(sequence)
    return standard_code.translate(template)
def ensembl_construct_sequences(psm_hash, ensembl, transcript_ids, database_v,
                                species, three_frame_translation, mode,):
    '''
    Attach transcript/protein sequences and exon information to psm_hash.

    :param psm_hash: dictionary whose values are dictionaries holding at
                     least a 'transcript_id' entry
                     (see prepareAnnotationENSEMBL)
    :param ensembl: ensembl genome, handed to get_ensembl_exons
    :param transcript_ids: list of transcript ids (converted from protein IDs)
    :param database_v: database version, handed to the BioMart query
    :param species: species name, handed to the BioMart query
    :param three_frame_translation: handed unchanged to the BioMart query
    :param mode: handed unchanged to get_ensembl_exons
    :return: list [psm_hash, exon_hash]
    '''
    print("Commencing transcript and protein sequence retrieval")

    # Nothing in this function adds to this list, so the fallback loop at the
    # end only runs if that changes.
    no_protein_seq = []

    # transcript id -> psm_hash key, plus the ids in psm_hash iteration order
    key_by_transcript = {}
    requested_ids = []
    for psm_key in psm_hash.keys():
        transcript_id = psm_hash[psm_key]['transcript_id']
        key_by_transcript[transcript_id] = psm_key
        requested_ids.append(transcript_id)

    # Retrieve cds, chr, transcript_id and strand from biomart
    biomart_rows = proBAM_biomart.retrieve_data_from_biomart(
        database_v, species, requested_ids, three_frame_translation)
    for line in biomart_rows:
        fields = line.split("\t")
        try:
            entry = psm_hash[key_by_transcript[fields[1]]]
            entry['transcript_seq'] = fields[0]
            entry['protein_seq'] = standard_code.translate(fields[0])
            # TODO what to do with "special" ensembl chromosomes: currently
            # TODO leave them out => bam conversion considers these psms
            # TODO unmapped
            entry['chr'] = fields[2]
            entry['strand'] = fields[3]
        except IndexError:
            # Rows with too few tab-separated fields are skipped.
            pass
    del biomart_rows

    # get exons directly from core database
    exon_result = get_ensembl_exons(ensembl, transcript_ids, psm_hash, mode)
    exon_hash, psm_hash = exon_result[0], exon_result[1]
    del exon_result

    # retrieve protein sequences for transcripts where the protein sequence
    # could not be fetched automatically
    for psm_key in no_protein_seq:
        entry = psm_hash[psm_key]
        entry['transcript_seq'] = retrieve_protein_seq(
            entry['transcript_seq'],
            exon_hash[entry['transcript_id']],
            entry['5UTR_offset'],
            entry['start_exon_rank'])
        # translate till stop codon
        translated = standard_code.translate(entry['transcript_seq'])
        entry['protein_seq'] = translated.partition('*')[0]

    return [psm_hash, exon_hash]
def map_peptide_to_protein_3frame(peptide_seq, transcript_seq,
                                  allowed_mismatches, strand):
    '''
    Locate a peptide in the three forward-frame translations of a transcript.

    The transcript is translated starting at offsets 0, 1 and 2. Every
    window of the peptide's length in each translation is compared to the
    peptide with hamming(); windows with at most allowed_mismatches
    mismatches are reported as hits. The window that ends on the last
    residue of a frame is included in the scan.

    :param peptide_seq: peptide sequence (string)
    :param transcript_seq: transcript sequence (string)
    :param allowed_mismatches: number of allowed mismatches
    :param strand: chromosome strand (not used by this function)
    :return: list [hits, pre_post_aa]; hits is a list of
             [nucleotide_position, mismatches] pairs, where the position is
             (residue index * 3) + frame offset; pre_post_aa is
             [preceding, following], the up to two residues on either side
             of the last hit found ("*" when there is none on that side,
             '' on both sides when there are no hits)
    '''
    hits = []
    pre_post_aa = ['', '']
    pep_length = len(peptide_seq)

    for frame_offset in range(3):
        protein = standard_code.translate(transcript_seq[frame_offset:])
        # "+ 1" so the window ending on the final residue is tested as well.
        for i in range(0, len(protein) - pep_length + 1):
            window_end = i + pep_length
            mismatches = hamming(peptide_seq, protein[i:window_end])
            if mismatches <= allowed_mismatches:
                hits.append([(i * 3) + frame_offset, mismatches])
                # Up to 2 preceding AA; slicing clamps at the frame start.
                pre_post_aa[0] = protein[max(i - 2, 0):i] or "*"
                # Up to 2 following AA; slicing clamps at the frame end.
                pre_post_aa[1] = protein[window_end:window_end + 2] or "*"

    return [hits, pre_post_aa]
def map_peptide_to_protein_3frame(peptide_seq,transcript_seq,allowed_mismatches,strand):
    '''
    Search the three forward reading frames of a transcript for a peptide.

    Each frame (transcript offset 0, 1 and 2) is translated and scanned with
    a sliding window of the peptide's length, including the window that ends
    on the frame's last residue. A window is a hit when hamming() reports at
    most allowed_mismatches mismatches against the peptide.

    :param peptide_seq: peptide sequence (string)
    :param transcript_seq: transcript sequence (string)
    :param allowed_mismatches: number of allowed mismatches
    :param strand: chromosome strand (not used by this function)
    :return: list [hits, pre_post_aa]; each hit is
             [(residue index * 3) + frame offset, mismatches]; pre_post_aa
             holds the up to two residues before and after the last hit
             found, "*" where no residue exists on that side, and stays
             ['', ''] when nothing matched
    '''
    hits = []
    pre_post_aa = ['', '']
    pep_length = len(peptide_seq)

    for size_adjust in range(3):
        f = standard_code.translate(transcript_seq[size_adjust:])
        last_start = len(f) - pep_length
        # last_start itself is a valid window start, hence the "+ 1".
        for i in range(0, last_start + 1):
            end = i + pep_length
            distance = hamming(peptide_seq, f[i:end])
            if distance <= allowed_mismatches:
                adjusted_hit_pos = (i * 3) + size_adjust
                hits.append([adjusted_hit_pos, distance])
                # compute (up to) 2 preceding AA
                preceding = f[max(i - 2, 0):i]
                pre_post_aa[0] = preceding if preceding else "*"
                # compute (up to) 2 following AA
                following = f[end:end + 2]
                pre_post_aa[1] = following if following else "*"

    return [hits, pre_post_aa]
def ensembl_construct_sequences(
        psm_hash,
        mysql_db,
        transcript_ids,
        database_v,
        species,
        three_frame_translation,
        mode,
):
    '''
    Attach transcript/protein sequences and exon information to psm_hash.

    Transcript ids are sent to BioMart in chunks produced by chunkIt; a
    chunk whose retrieval raises AttributeError is retried once after a
    60 second pause.

    :param psm_hash: dictionary whose values are dictionaries holding at
                     least a 'transcript_id' entry
                     (see prepareAnnotationENSEMBL)
    :param mysql_db: handed to get_ensembl_exons to read exons from the
                     core database
    :param transcript_ids: list of transcript ids (converted from protein IDs)
    :param database_v: database version, handed to the BioMart query
    :param species: species name, handed to the BioMart query
    :param three_frame_translation: handed unchanged to the BioMart query
    :param mode: handed unchanged to get_ensembl_exons
    :return: list [psm_hash, exon_hash]
    '''
    print("Commencing transcript and protein sequence retrieval")

    # Nothing in this function adds to this list, so the fallback loop at the
    # end only runs if that changes.
    no_protein_seq = []

    # transcript id -> psm_hash key, plus the ids in psm_hash iteration order
    key_by_transcript = {}
    requested_ids = []
    for psm_key in psm_hash.keys():
        transcript_id = psm_hash[psm_key]['transcript_id']
        key_by_transcript[transcript_id] = psm_key
        requested_ids.append(transcript_id)

    progress = 0
    for id_chunk in chunkIt(requested_ids, 10):
        # Retrieve cds, chr, transcript_id and strand from biomart
        try:
            biomart_rows = proBAM_biomart.retrieve_data_from_biomart(
                database_v, species, id_chunk, three_frame_translation)
        except AttributeError:
            # Single retry after waiting; a second failure propagates.
            time.sleep(60)
            print("BioMart connection timeout, reconnecting to BioMart")
            biomart_rows = proBAM_biomart.retrieve_data_from_biomart(
                database_v, species, id_chunk, three_frame_translation)

        for line in biomart_rows:
            fields = line.split("\t")
            try:
                entry = psm_hash[key_by_transcript[fields[1]]]
                entry['transcript_seq'] = fields[0]
                entry['shift'] = _calc_seq_shift_(fields[0])
                entry['protein_seq'] = standard_code.translate(fields[0])
                entry['chr'] = fields[2]
                entry['strand'] = fields[3]
            except IndexError:
                # Rows with too few tab-separated fields are skipped.
                pass
        del biomart_rows

        if progress < 100:
            progress += 10
            # Python 2 print statement: the trailing comma keeps the
            # progress figures on one line.
            print(str(progress) + "% "),
    print(" ")

    # get exons directly from core database
    exon_result = get_ensembl_exons(mysql_db, transcript_ids, psm_hash, mode)
    exon_hash, psm_hash = exon_result[0], exon_result[1]
    del exon_result

    # retrieve protein sequences for transcripts where the protein sequence
    # could not be fetched automatically
    for psm_key in no_protein_seq:
        entry = psm_hash[psm_key]
        entry['transcript_seq'] = retrieve_protein_seq(
            entry['transcript_seq'],
            exon_hash[entry['transcript_id']],
            entry['5UTR_offset'],
            entry['start_exon_rank'])
        entry['shift'] = _calc_seq_shift_(entry['transcript_seq'])
        # translate till stop codon
        translated = standard_code.translate(entry['transcript_seq'])
        entry['protein_seq'] = translated.partition('*')[0]

    return [psm_hash, exon_hash]