Beispiel #1
0
def translateSixFrame(seq):
    """Translate seq in 6 frames, print the result and return it.

    :param seq: sequence handed unchanged to ``standard_code.sixframes``
    :return: the value produced by ``standard_code.sixframes(seq)``; the same
        value is also printed to stdout
    """
    # Imported locally so the function stays self-contained.
    from cogent.core.genetic_code import DEFAULT as standard_code

    translations = standard_code.sixframes(seq)
    # The parenthesised form prints identically under Python 2 and Python 3.
    print(translations)
    # Hand the translations back so the computed result is not thrown away.
    return translations
Beispiel #2
0
def translateSixFrame(seq):
    """Translate seq in 6 frames, print the result and return it.

    :param seq: sequence handed unchanged to ``standard_code.sixframes``
    :return: the value produced by ``standard_code.sixframes(seq)``; the same
        value is also printed to stdout
    """
    # Imported locally so the function stays self-contained.
    from cogent.core.genetic_code import DEFAULT as standard_code

    translations = standard_code.sixframes(seq)
    # The parenthesised form prints identically under Python 2 and Python 3.
    print(translations)
    # Hand the translations back so the computed result is not thrown away.
    return translations
Beispiel #3
0
def translate_seq(sequence, strand):
    '''
    Translate a DNA sequence, reverse-complementing it first unless strand is 1.

    :param sequence: DNA sequence
    :param strand: strand of the DNA sequence; tested as str(strand) == "1"
    :return: result of standard_code.translate on the selected template
    '''
    on_strand_one = str(strand) == "1"
    # Any strand value other than 1 / "1" is translated from the reverse complement.
    template = sequence if on_strand_one else reverse_complement(sequence)
    return standard_code.translate(template)
def ensembl_construct_sequences(psm_hash,ensembl,transcript_ids,database_v,species,three_frame_translation,mode,):
    '''
    Add transcript sequence, protein sequence, chromosome and strand to every
    psm_hash entry BioMart returns a row for, then fetch the exons.

    :param psm_hash: dictionary with protein / ensembl information; every value must hold a 'transcript_id'
    :param ensembl: ensembl genome, forwarded to get_ensembl_exons
    :param transcript_ids: list of transcript ids, forwarded to get_ensembl_exons
    :param database_v: database version, forwarded to the BioMart query
    :param species: species name, forwarded to the BioMart query
    :param three_frame_translation: forwarded unchanged to the BioMart query
    :param mode: forwarded unchanged to get_ensembl_exons
    :return: list [psm_hash, exon_hash], both taken from the get_ensembl_exons result
    '''
    print("Commencing transcript and protein sequence retrieval")

    # NOTE: nothing in this function appends to no_protein_seq, so the
    # fallback loop near the end currently never runs.
    no_protein_seq = []
    biomart_key_hash = {}
    stable_transcript_ids = []

    # Reverse lookup: transcript id -> psm_hash key, plus the id list for the query.
    for psm_key, psm_entry in psm_hash.items():
        transcript_id = psm_entry['transcript_id']
        biomart_key_hash[transcript_id] = psm_key
        stable_transcript_ids.append(transcript_id)

    # Retrieve cds, transcript_id, chr and strand from biomart (tab-separated rows).
    biomart_rows = proBAM_biomart.retrieve_data_from_biomart(
        database_v, species, stable_transcript_ids, three_frame_translation)
    for line in biomart_rows:
        fields = line.split("\t")
        try:
            entry = psm_hash[biomart_key_hash[fields[1]]]
            entry['transcript_seq'] = fields[0]
            entry['protein_seq'] = standard_code.translate(fields[0])
            # TODO what to do with "special" ensembl chromosomes: currently leave them out => bam conversion
            # TODO considers these psms unmapped
            entry['chr'] = fields[2]
            entry['strand'] = fields[3]
        except IndexError:
            # A row with too few fields keeps whatever was assigned before the failure.
            pass
    # Release the raw BioMart result before the exon lookup.
    del biomart_rows

    # get exons directly from core database
    exon_lookup = get_ensembl_exons(ensembl, transcript_ids, psm_hash, mode)
    exon_hash = exon_lookup[0]
    psm_hash = exon_lookup[1]
    del exon_lookup

    # retrieve protein sequences for transcripts where the protein sequence
    # could not be fetched automatically
    for key in no_protein_seq:
        entry = psm_hash[key]
        entry['transcript_seq'] = retrieve_protein_seq(
            entry['transcript_seq'],
            exon_hash[entry['transcript_id']],
            entry['5UTR_offset'],
            entry['start_exon_rank'])
        # translate till stop codon
        translated = standard_code.translate(entry['transcript_seq'])
        entry['protein_seq'] = translated.partition('*')[0]
    return [psm_hash, exon_hash]
def findBestSeq(seqobject):
    """Pick the longest stop-free translation prefix among the first three frames.

    :param seqobject: object exposing ``.seq`` (converted with ``str``) and
        ``.id``; both are passed to ``DNA.makeSequence``
    :return: tuple ``(longest_prefix, dna_from_offset)`` where
        ``longest_prefix`` is the text before the first ``'*'`` of the chosen
        translation and ``dna_from_offset`` is the DNA sequence sliced from
        the matching offset (0, 1 or 2)

    When every candidate prefix is empty, offset 0 is returned with an empty
    prefix; the previous implementation raised UnboundLocalError in that case.
    """
    my_seq = DNA.makeSequence(str(seqobject.seq), seqobject.id)

    # Keep only the part of each translation that precedes its first stop codon.
    prefixes = [frame.split('*')[0]
                for frame in standard_code.sixframes(my_seq)]

    # Only entries 0..2 are candidates (presumably the forward frames, since
    # index x is paired with my_seq[x:] -- confirm against sixframes()).
    # max() returns the earliest offset on ties, which matches the strict "<"
    # comparison of the loop this replaces.
    best_offset = max(range(3), key=lambda offset: len(prefixes[offset]))
    return prefixes[best_offset], my_seq[best_offset:]
Beispiel #6
0
def map_peptide_to_protein_3frame(peptide_seq, transcript_seq,
                                  allowed_mismatches, strand):
    '''
    Locate a peptide in the translations of a transcript at offsets 0, 1 and 2.

    :param peptide_seq: peptide sequence (string)
    :param transcript_seq: transcript sequence (string)
    :param allowed_mismatches: maximum hamming distance for a window to count as a hit
    :param strand: chromosome strand (not used by this function)
    :return: list [hits, pre_post_aa]; hits is a list of
             [position, mismatches] with position = window index * 3 + frame
             offset, and pre_post_aa holds up to two residues before / after
             the LAST hit found ("*" when there are none on that side)
    '''
    hits = []
    pre_post_aa = ['', '']
    pep_length = len(peptide_seq)
    # Translate all three offsets up front, before any matching starts.
    frames = [standard_code.translate(transcript_seq[offset:])
              for offset in range(3)]

    for offset, frame in enumerate(frames):
        # "+ 1" so the window that ends on the last residue is examined too;
        # without it a peptide at the very end of a frame is never reported.
        for i in range(len(frame) - pep_length + 1):
            window_end = i + pep_length
            # Compute the distance once and reuse it for the hit record.
            mismatches = hamming(peptide_seq, frame[i:window_end])
            if mismatches > allowed_mismatches:
                continue

            hits.append([(i * 3) + offset, mismatches])

            # Up to 2 preceding AA; slicing clamps at the frame start.
            pre_post_aa[0] = frame[max(i - 2, 0):i] or "*"
            # Up to 2 following AA; slicing clamps at the frame end.
            pre_post_aa[1] = frame[window_end:window_end + 2] or "*"
    return [hits, pre_post_aa]
Beispiel #7
0
def map_peptide_to_protein_3frame(peptide_seq,transcript_seq,allowed_mismatches,strand):
    '''
    Locate a peptide in the translations of a transcript at offsets 0, 1 and 2.

    :param peptide_seq: peptide sequence (string)
    :param transcript_seq: transcript sequence (string)
    :param allowed_mismatches: maximum hamming distance for a window to count as a hit
    :param strand: chromosome strand (not used by this function)
    :return: list [hits, pre_post_aa]; hits is a list of
             [position, mismatches] with position = window index * 3 + frame
             offset, and pre_post_aa holds up to two residues before / after
             the LAST hit found ("*" when there are none on that side)
    '''
    hits = []
    pre_post_aa = ['', '']
    pep_length = len(peptide_seq)
    # Translate all three offsets up front, before any matching starts.
    frames = [standard_code.translate(transcript_seq[offset:])
              for offset in range(3)]

    for offset, frame in enumerate(frames):
        # "+ 1" so the window that ends on the last residue is examined too;
        # without it a peptide at the very end of a frame is never reported.
        for i in range(len(frame) - pep_length + 1):
            window_end = i + pep_length
            # Compute the distance once and reuse it for the hit record.
            mismatches = hamming(peptide_seq, frame[i:window_end])
            if mismatches > allowed_mismatches:
                continue

            hits.append([(i * 3) + offset, mismatches])

            # Up to 2 preceding AA; slicing clamps at the frame start.
            pre_post_aa[0] = frame[max(i - 2, 0):i] or "*"
            # Up to 2 following AA; slicing clamps at the frame end.
            pre_post_aa[1] = frame[window_end:window_end + 2] or "*"

    return [hits,pre_post_aa]
Beispiel #8
0
def ensembl_construct_sequences(
    psm_hash,
    mysql_db,
    transcript_ids,
    database_v,
    species,
    three_frame_translation,
    mode,
):
    '''
    :param psm_hash: dictionair with protein / ensembl information ( see prepareAnnotationENSEMBL)
    :param ensembl:ensembl genome
    :param transcript_ids: list of transcrip ids (converted from protein IDs)
    :param database_v: database version
    :param species: species name
    :return: dictionairy mapping proteins into ENSEMBL
    '''
    print "Commencing transcript and protein sequence retrieval"

    no_protein_seq = []
    biomart_key_hash = {}
    stable_transcript_ids = []

    for key in psm_hash.keys():
        biomart_key_hash[psm_hash[key]['transcript_id']] = key
        stable_transcript_ids.append(psm_hash[key]['transcript_id'])

    chunked_stable_transcript_id = chunkIt(stable_transcript_ids, 10)
    process = 0
    c = 0
    for chunk in chunked_stable_transcript_id:
        # Retrieve cds,chr,transcript_id and strand from biomart
        try:
            biomart_result = proBAM_biomart.retrieve_data_from_biomart(
                database_v, species, chunk, three_frame_translation)
        except AttributeError:
            time.sleep(60)
            print "BioMart connection timeout, reconnecting to BioMart"
            biomart_result = proBAM_biomart.retrieve_data_from_biomart(
                database_v, species, chunk, three_frame_translation)
        for row in biomart_result:
            row = row.split("\t")
            try:
                psm_hash[biomart_key_hash[row[1]]]['transcript_seq'] = row[0]
                psm_hash[biomart_key_hash[row[1]]]['shift'] = _calc_seq_shift_(
                    row[0])
                psm_hash[biomart_key_hash[
                    row[1]]]['protein_seq'] = standard_code.translate(row[0])
                psm_hash[biomart_key_hash[row[1]]]['chr'] = row[2]
                psm_hash[biomart_key_hash[row[1]]]['strand'] = row[3]
                del row
            except IndexError:
                pass
        del biomart_result
        if process < 100:
            process += 10
            print str(process) + "% ",
    print " "

    # get exons directly from core database
    temp_exon_hash = get_ensembl_exons(mysql_db, transcript_ids, psm_hash,
                                       mode)
    exon_hash = temp_exon_hash[0]
    psm_hash = temp_exon_hash[1]
    del temp_exon_hash
    # retrieve protein sequences for transcript where the protein sequence could not be fetched automatically
    for key in no_protein_seq:
        psm_hash[key]['transcript_seq'] = retrieve_protein_seq(
            psm_hash[key]['transcript_seq'],
            exon_hash[psm_hash[key]['transcript_id']],
            psm_hash[key]['5UTR_offset'], psm_hash[key]['start_exon_rank'])
        psm_hash[key]['shift'] = _calc_seq_shift_(
            psm_hash[key]['transcript_seq'])
        #translate till stop codon
        psm_hash[key]['protein_seq'] = standard_code.translate(
            psm_hash[key]['transcript_seq']).partition('*')[0]
    return [psm_hash, exon_hash]
            if not protein in dna_sequence_dic:
                nucleotide_not_found.append(protein)
                continue

            sequence_with_stop_codons = DNA.makeSequence(dna_sequence_dic[protein])

            #Check if the sequence is the right one, and check for in frame stops
            #It seems that in JGI annotation, when scaffolds are joined, the resulted proteins do not match
            #the DNA sequence
            #Right now, I'll just remove those sequences, and deal with that later

            if len(sequence_with_stop_codons) % 3 == 0:
                seq_no_stop_codon = sequence_with_stop_codons.withoutTerminalStopCodon()

                #Check for in-frame stop codons
                stops_frame = standard_code.getStopIndices(seq_no_stop_codon, start=0)

                if len(stops_frame) > 0:
                    inframe_stops.append([cluster, genome_id, protein_id])

                else:
                    curated_protein_list[protein] = seq_no_stop_codon

            else:
                frameshift_cases.append([cluster, genome_id, protein_id])

        if len(curated_protein_list) < 2:  # Only take those clusters with 3 sequences or more
            clusters_too_short.append(cluster)
            continue

        #Alignments and output data
Beispiel #10
0
def assign_dna_reads_to_protein_database(query_fasta_fp, database_fasta_fp, 
                        output_fp, temp_dir = "/tmp", params = None):
    """Assign DNA reads to a database fasta of protein sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed. A temporary
    file must be written containing the translated sequences from the input
    query fasta file because BLAT cannot do this automatically.

    query_fasta_fp: absolute path to the query fasta file containing DNA
                   sequences.
    database_fasta_fp: absolute path to the database fasta file containing
                      protein sequences.
    output_fp: absolute path where the output file will be generated.
    temp_dir: optional. Change the location where the translated sequences
              will be written before being used as the query. Defaults to 
              /tmp.
    params: optional. dict containing parameter settings to be used
                  instead of default values. Cannot change database or query
                  file types from protein and dna, respectively.

    Raises ApplicationError if temp_dir is not an absolute path or if params
    contains '-t' or '-q'.

    Returns the result of assign_reads_to_database (described upstream as an
    open file object). The output format defaults to blast9 and should be
    parsable by the PyCogent BLAST parsers.

    The temporary file is removed even when translation or the search
    raises, and the query fasta handle is closed once it has been read.
    """
    if params is None:
        params = {}

    # Both sides are protein for BLAT: the DNA query is translated below.
    my_params = {'-t': 'prot',
              '-q': 'prot'
             }

    # make sure temp_dir specifies an absolute path
    if not isabs(temp_dir):
        raise ApplicationError("temp_dir must be an absolute path.")

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError("Cannot change database or query types " + \
                                "when using " + \
                                "assign_dna_reads_to_protein_database. " + \
                                "Use assign_reads_to_database instead.")
    
    my_params.update(params)

    # get six-frame translation of the input DNA sequences and write them to
    # temporary file.
    frames = [1,2,3,-1,-2,-3]
    entry_template = '>{seq_id}_frame_{frame}\n{trans}\n'

    tmp = get_tmp_filename(tmp_dir=temp_dir, result_constructor=str)
    # Opened before the try so a failed open does not trigger remove() on a
    # file that was never created.
    tmp_out = open(tmp, 'w')
    try:
        with tmp_out:
            with open(query_fasta_fp) as query_file:
                for label, sequence in MinimalFastaParser(query_file):
                    seq_id = label.split()[0]

                    s = DNA.makeSequence(sequence)
                    translations = standard_code.sixframes(s)

                    # Frame labels are unique, so sorting the pairs orders the
                    # entries by frame: -3, -2, -1, 1, 2, 3.
                    for frame, translation in sorted(zip(frames, translations)):
                        tmp_out.write(entry_template.format(
                            seq_id=seq_id, frame=frame, trans=translation))

        result = assign_reads_to_database(tmp, database_fasta_fp, output_fp, \
                                          params = my_params)
    finally:
        # Never leave the translated query behind, success or failure.
        remove(tmp)

    return result
Beispiel #11
0
                continue

            sequence_with_stop_codons = DNA.makeSequence(
                dna_sequence_dic[protein])

            #Check if the sequence is the right one, and check for in frame stops
            #It seems that in JGI annotation, when scaffolds are joined, the resulted proteins do not match
            #the DNA sequence
            #Right now, I'll just remove those sequences, and deal with that later

            if len(sequence_with_stop_codons) % 3 == 0:
                seq_no_stop_codon = sequence_with_stop_codons.withoutTerminalStopCodon(
                )

                #Check for in-frame stop codons
                stops_frame = standard_code.getStopIndices(seq_no_stop_codon,
                                                           start=0)

                if len(stops_frame) > 0:
                    inframe_stops.append([cluster, genome_id, protein_id])

                else:
                    curated_protein_list[protein] = seq_no_stop_codon

            else:
                frameshift_cases.append([cluster, genome_id, protein_id])

        if len(curated_protein_list
               ) < 2:  # Only take those clusters with 3 sequences or more
            clusters_too_short.append(cluster)
            continue