コード例 #1
0
ファイル: flaskapp.py プロジェクト: arnaoutleen/crRNA_Go
def designer2():
    """Render the designer page, embedding the submitted sequence in the crRNA chassis.

    On a POST request the ``input`` form field is reverse-complemented,
    upper-cased, converted to RNA letters (T -> U) and appended to the fixed
    chassis sequence.  For any other request method the template is rendered
    with an empty ``output`` instead of failing on an unassigned variable.

    Returns:
        The rendered ``designer.html`` template.
    """
    # Default so that non-POST requests (e.g. the initial GET) still render.
    output = ""
    if request.method == 'POST':
        old_seq = request.form['input']
        new_seq = str(Seq(old_seq).reverse_complement())
        # Upper-case first: replacing 'T' before upper-casing would leave
        # lowercase 't' from the user's input as 'T' in the RNA output.
        new_seq = new_seq.upper().replace('T', 'U')
        # Splice the user's sequence into the chassis of the crRNA.
        output = "Here is your resulting sequence: 3'-ACAACAUUAUCGGGGGUUUUGACCUGGAAGGUGUUG" + new_seq + "-5'"
    return render_template('designer.html', output=output)
コード例 #2
0
def generate_se_sequence_combination(df_SE, dfname, dbname, rna_db, het,
                                     exclude, output_name):
    """Write a protein FASTA combining exon-skipping events with SNVs.

    For every splicing event derived from ``df_SE`` and every transcript
    whose exon list contains both the event's first and third exon id, the
    transcript sequence (optionally edited with the SNVs read from
    ``dfname``) is recorded together with a counterpart in which the event's
    middle exon has been inserted or removed.  The part of each recorded
    sequence starting at the coding start is translated (up to the first
    stop) and written to ``output_name + ".fasta"``.

    Args:
        df_SE: splicing-event table, handed to
            ``generate_all_se_db.get_splicing_event_dict``.
        dfname: path of a tab-separated, header-less file read with pandas
            and converted to the variant table by
            ``generate_all_mutation_db.extract_transcript_change``.
        dbname: handed to ``generate_all_se_db.get_trans_records``.
        rna_db: handed to ``get_version`` and
            ``generate_all_se_db.get_trans_seq_dict``.
        het: ``1`` applies each non-'hom' SNV on its own; ``2`` additionally
            tries pairs of them; any other value adds no such variants.
        exclude: unless ``exclude == True``, transcripts that produced no
            entry above are also translated and appended with the
            description ``"no splicing"``.
        output_name: output path without the ``.fasta`` suffix.

    Returns:
        None.  The result is the FASTA file written to disk; progress and
        counts are printed.

    NOTE(review): reads a name ``begin`` that is not assigned in this
    function (presumably a module-level start timestamp — confirm).
    """

    version = get_version(rna_db)
    # Prefix used in the FASTA ids: 'sp' for swissprot, else the version
    # string itself.
    if (version == 'swissprot'):
        db = 'sp'
    else:
        db = version

    trans_seq_dict, trans_coding_dict = generate_all_se_db.get_trans_seq_dict(
        rna_db)
    print("transcript sequence dict ready")
    print(len(list(trans_seq_dict.items())))

    df = pd.read_csv(dfname, sep='\t', header=None)
    change_df = generate_all_mutation_db.extract_transcript_change(df)
    # Group the row positions of change_df by their 'mrna' value so the
    # variants of one transcript can be looked up directly.
    trans_index_dict = {}
    for i in range(0, change_df.shape[0]):
        if (change_df.iloc[i]['mrna'] not in trans_index_dict.keys()):
            trans_index_dict[change_df.iloc[i]['mrna']] = [i]
        else:
            trans_index_dict[change_df.iloc[i]['mrna']].append(i)

    trans_records, strand_dict, protein_id_dict = generate_all_se_db.get_trans_records(
        dbname, trans_seq_dict)

    splicing_event_dict = generate_all_se_db.get_splicing_event_dict(
        df_SE, trans_records)
    print("splicing events dict ready")

    trans_exon_dict = generate_all_se_db.get_transcript_exon_dict(
        trans_records)
    print("transcript exon dict ready")

    middle = time.time()
    print("---- %s minutes ----" % ((middle - begin) / 60))

    # Maps a coding sequence string -> [transcript id, description, ...].
    trans_db_annotation_dict = {}  ##output
    cnt = 0
    # presumably find_key_index relies on this ordering — TODO confirm
    trans_records = trans_records.sort_values(by=['exon_id'], axis=0)

    for k in splicing_event_dict.keys():
        cnt += 1
        # Progress indicator every 1000 events.
        if (cnt % 1000 == 0):
            print(cnt)
        # Each event value is indexed as (down exon, skipped exon, up exon).
        down_id = splicing_event_dict[k][0]
        skip_id = splicing_event_dict[k][1]
        up_id = splicing_event_dict[k][2]
        for key in trans_exon_dict.keys():
            T = trans_exon_dict[key]
            # Only transcripts containing both flanking exons are relevant.
            if ((down_id in T) and (up_id in T)):
                #print(key)
                down_index = T.index(down_id)
                #print(down_index)
                up_index = T.index(up_id)
                low_index = min(down_index, up_index)
                high_index = max(down_index, up_index)

                # Walk the exons T[1:] and record, per index in T, the
                # (start, end) offsets of that exon in the concatenated
                # sequence.  T[0] is not an exon here; it is later passed as
                # the last argument of fetch_exon_seq.
                # tmp_exon_coordinate_dict is filled but not read in this
                # function.
                tmp_exon_coordinate_dict = {}
                tmp_exon_position_dict = {}
                tmp_pos = 0
                for item in range(1, len(T)):
                    ei = generate_all_se_db.find_key_index(
                        'exon_id', T[item], trans_records)
                    if (ei == -1):
                        print("the exon id cannot be found")
                    else:
                        tmp_start = trans_records.iloc[ei]['start']
                        tmp_end = trans_records.iloc[ei]['end']
                        tmp_exon_coordinate_dict[T[item]] = (
                            trans_records.iloc[ei]['chr'], tmp_start, tmp_end)

                        tmp_exon_position_dict[item] = (tmp_pos,
                                                        tmp_pos + tmp_end -
                                                        tmp_start + 1)
                        tmp_pos += tmp_end - tmp_start + 1

                tmp_seq = trans_seq_dict[key]
                coding_start = trans_coding_dict[key][0]
                coding_end = trans_coding_dict[key][1]

                het_list = []
                hom_position_list = []
                # Variant-aware path: only for transcripts that have rows in
                # change_df and a known strand.
                if ((key in trans_index_dict.keys())
                        and (key in strand_dict.keys())):
                    # shift is never modified below (the code that would
                    # update it is commented out), so it stays 0.
                    shift = 0
                    transcript = tmp_seq
                    hom_des = []

                    # Only rows whose mutation_type contains 'snv' are used:
                    # 'hom' ones are applied right away, all others are
                    # queued in het_list.
                    # NOTE(review): every 'hom' edit starts again from
                    # tmp_seq (or its complement) rather than from
                    # `transcript`, so `transcript` ends up holding only the
                    # most recent 'hom' edit — confirm this is intended.
                    for i in trans_index_dict[key]:
                        if (change_df.iloc[i]['mutation_type'].find('snv') !=
                                -1):
                            if (change_df.iloc[i]['snp_type'] == 'hom'):
                                hom_position_list.append(
                                    (int(change_df.iloc[i]['c_start']),
                                     change_df.iloc[i]['mutation_type']))

                                # Minus strand: edit the complemented
                                # sequence with the complemented content.
                                if (strand_dict[key] == '-'):
                                    tmp = Seq(
                                        str(tmp_seq),
                                        IUPAC.ambiguous_dna).complement()
                                    tmp = str(tmp)
                                    transcript=generate_all_mutation_db.change_seq(tmp,int(change_df.iloc[i]['c_start'])+shift+coding_start-1,\
                                                          int(change_df.iloc[i]['c_end'])+shift+coding_start-1,str(Seq(str(change_df.iloc[i]['c_content'])).complement()),\
                                                          change_df.iloc[i]['mutation_type'])
                                else:
                                    transcript=generate_all_mutation_db.change_seq(tmp_seq,int(change_df.iloc[i]['c_start'])+shift+coding_start-1,\
                                                          int(change_df.iloc[i]['c_end'])+shift+coding_start-1,change_df.iloc[i]['c_content'],\
                                                          change_df.iloc[i]['mutation_type'])
                                hom_des.append(change_df.iloc[i]['mutation_type']+":"+str(change_df.iloc[i]['c_start'])+'-'+\
                                str(change_df.iloc[i]['c_end'])+str(change_df.iloc[i]['c_content'])+'_')

#                                if(change_df.iloc[i]['mutation_type'].find('del')!=-1):
#                                    shift-=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
#                                elif(change_df.iloc[i]['mutation_type'].find('ins')!=-1):
#                                    shift+=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
                            else:
                                het_list.append(i)

                    het_number = len(het_list)
                    # het_seqs[c] is a candidate sequence and het_des[c] the
                    # description of the extra variants it carries; entry 0
                    # is the baseline with an empty description.
                    het_des = []
                    het_seqs = []
                    if (len(hom_position_list) != 0):
                        # Minus strand: complement the edited sequence back
                        # before storing it as the baseline.
                        if (strand_dict[key] == '-'):
                            tmp = Seq(str(transcript),
                                      IUPAC.ambiguous_dna).complement()
                            tmp = str(tmp)
                        else:
                            tmp = transcript
                        het_seqs.append(tmp)

                    else:
                        # NOTE(review): here the complemented sequence itself
                        # is stored as the baseline for '-' transcripts,
                        # unlike the branch above which stores the
                        # re-complemented one — confirm.
                        if (strand_dict[key] == '-'):
                            transcript = Seq(str(transcript),
                                             IUPAC.ambiguous_dna).complement()
                            transcript = str(transcript)
                        het_seqs.append(transcript)
                    het_des.append("")

                    # het == 1: one candidate per queued variant.
                    if (het == 1):
                        for n in range(0, het_number):
                            tmp_het_des = []
                            new_sequence = ""
                            # Offset of the splice junction in the
                            # concatenated sequence; stays 0 when neither
                            # layout below matches.
                            splicing_position = 0
                            if ((high_index - low_index) == 1):
                                splicing_position = tmp_exon_position_dict[
                                    low_index][1]
                            elif (T[low_index + 1] == skip_id):
                                splicing_position = tmp_exon_position_dict[
                                    low_index + 1][0]
                            # Keep only variants within 900 positions of the
                            # junction ...
                            if (abs(
                                    int(change_df.iloc[het_list[n]]['c_start'])
                                    - splicing_position) <= 900):
                                # ... and apply them only when c_start lies
                                # in [coding_start - 1, coding_end].
                                if ((int(change_df.iloc[het_list[n]]
                                         ['c_start']) >= coding_start - 1) &
                                    (int(change_df.iloc[het_list[n]]
                                         ['c_start']) <= coding_end)):
                                    if (strand_dict[key] == '-'):
                                        new_sequence=generate_all_mutation_db.change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift+coding_start-1,\
                                                              int(change_df.iloc[het_list[n]]['c_end'])+shift+coding_start-1,\
                                                              str(Seq(str(change_df.iloc[het_list[n]]['c_content'])).complement()),\
                                                              change_df.iloc[het_list[n]]['mutation_type'])
                                    else:
                                        new_sequence=generate_all_mutation_db.change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift+coding_start-1,\
                                                              int(change_df.iloc[het_list[n]]['c_end'])+shift+coding_start-1,change_df.iloc[het_list[n]]['c_content'],\
                                                              change_df.iloc[het_list[n]]['mutation_type'])
                                    tmp_het_des.append(change_df.iloc[het_list[n]]['mutation_type']+":"+\
                                    str(change_df.iloc[het_list[n]]['c_start'])+'-'+str(change_df.iloc[het_list[n]]['c_end'])+\
                                    str(change_df.iloc[het_list[n]]['c_content']))

    #                                if(change_df.iloc[i]['mutation_type'].find('del')!=-1):
    #                                    shift-=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
    #                                elif(change_df.iloc[i]['mutation_type'].find('ins')!=-1):
    #                                    shift+=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
                                # NOTE(review): these appends also run when
                                # the coding-range test above failed, in
                                # which case new_sequence is still "".
                                het_des.append(tmp_het_des)
                                if (strand_dict[key] == '-'):
                                    new_seq = Seq(
                                        str(new_sequence),
                                        IUPAC.ambiguous_dna).complement()
                                    new_seq = str(new_seq)
                                else:
                                    new_seq = new_sequence
                                het_seqs.append(new_seq)
                    # het == 2: same single-variant edit as above, then
                    # every later queued variant is tried on top of it.
                    elif (het == 2):
                        for n in range(0, het_number):
                            tmp_het_des = []
                            new_sequence = ""
                            splicing_position = 0
                            if ((high_index - low_index) == 1):
                                splicing_position = tmp_exon_position_dict[
                                    low_index][1]
                            elif (T[low_index + 1] == skip_id):
                                splicing_position = tmp_exon_position_dict[
                                    low_index + 1][0]
                            if (abs(
                                    int(change_df.iloc[het_list[n]]['c_start'])
                                    - splicing_position) <= 900):
                                if ((int(change_df.iloc[het_list[n]]
                                         ['c_start']) >= coding_start - 1) &
                                    (int(change_df.iloc[het_list[n]]
                                         ['c_start']) <= coding_end)):
                                    if (strand_dict[key] == '-'):
                                        new_sequence=generate_all_mutation_db.change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift+coding_start-1,\
                                                              int(change_df.iloc[het_list[n]]['c_end'])+shift+coding_start-1,\
                                                              str(Seq(str(change_df.iloc[het_list[n]]['c_content'])).complement()),\
                                                              change_df.iloc[het_list[n]]['mutation_type'])
                                    else:
                                        new_sequence=generate_all_mutation_db.change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift+coding_start-1,\
                                                              int(change_df.iloc[het_list[n]]['c_end'])+shift+coding_start-1,change_df.iloc[het_list[n]]['c_content'],\
                                                              change_df.iloc[het_list[n]]['mutation_type'])
                                    tmp_het_des.append(change_df.iloc[het_list[n]]['mutation_type']+":"+\
                                    str(change_df.iloc[het_list[n]]['c_start'])+'-'+str(change_df.iloc[het_list[n]]['c_end'])+\
                                    str(change_df.iloc[het_list[n]]['c_content']))

    #                                if(change_df.iloc[i]['mutation_type'].find('del')!=-1):
    #                                    shift-=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
    #                                elif(change_df.iloc[i]['mutation_type'].find('ins')!=-1):
    #                                    shift+=(int(change_df.iloc[i]['c_end'])-int(change_df.iloc[i]['c_start'])+1)
                                # NOTE(review): het_des grows here without a
                                # matching het_seqs.append, so the two lists
                                # can fall out of step for the het_des[c]
                                # lookups further down — confirm.
                                het_des.append(tmp_het_des)

                            for j in range(n, len(het_list)):
                                tmp_new_sequence = new_sequence
                                # NOTE(review): plain assignment, so
                                # tmp_new_des and tmp_het_des are the same
                                # list object; appends below accumulate in
                                # it across iterations of j.
                                tmp_new_des = tmp_het_des

                                if (j != n):
                                    if (abs(
                                            int(change_df.iloc[het_list[j]]
                                                ['c_start']) -
                                            splicing_position) <= 900):
                                        if ((int(change_df.iloc[het_list[j]]
                                                 ['c_start']) >=
                                             coding_start - 1) &
                                            (int(change_df.iloc[het_list[j]]
                                                 ['c_start']) <= coding_end)):
                                            # NOTE(review): unlike the other
                                            # change_seq calls, positions
                                            # here are not offset by
                                            # coding_start - 1 — confirm.
                                            if (strand_dict[key] == '-'):
                                                tmp_new_sequence=generate_all_mutation_db.change_seq(tmp_new_sequence,int(change_df.iloc[het_list[j]]['c_start'])+shift,\
                                                                      int(change_df.iloc[het_list[j]]['c_end'])+shift,\
                                                                      str(Seq(str(change_df.iloc[het_list[j]]['c_content'])).complement()),\
                                                                      change_df.iloc[het_list[j]]['mutation_type'])
                                            else:
                                                tmp_new_sequence=generate_all_mutation_db.change_seq(tmp_new_sequence,int(change_df.iloc[het_list[j]]['c_start'])+shift,\
                                                                      int(change_df.iloc[het_list[j]]['c_end'])+shift,change_df.iloc[het_list[j]]['c_content'],\
                                                                      change_df.iloc[het_list[j]]['mutation_type'])
                                            tmp_new_des.append(change_df.iloc[het_list[j]]['mutation_type']+":"+\
                                                str(change_df.iloc[het_list[j]]['c_start'])+'-'+str(change_df.iloc[het_list[j]]['c_end'])+\
                                                str(change_df.iloc[het_list[j]]['c_content']))
                                        het_des.append(tmp_new_des)

                                        if (strand_dict[key] == '+'):
                                            new_seq = tmp_new_sequence
                                        else:
                                            new_seq = str(
                                                Seq(str(tmp_new_sequence),
                                                    IUPAC.ambiguous_dna).
                                                complement())
                                        het_seqs.append(new_seq)

                    # For every candidate sequence, register it and its
                    # spliced counterpart.  het_des[c] is either "" or a
                    # list of strings; both iterate fine below.
                    for c in range(
                            0, len(het_seqs)
                    ):  ##tmp_cnt is the number of modified sequences
                        tmp_seq = het_seqs[c]

                        # Case 1: the flanking exons are adjacent in this
                        # transcript, i.e. it lacks the skipped exon.  The
                        # sequence as-is is the "exclusive" form; the
                        # "inclusive" form is built by inserting the exon.
                        if ((high_index - low_index) == 1):

                            position = tmp_exon_position_dict[low_index][1]
                            # Events located before the coding start are
                            # ignored.
                            if (position >= coding_start):
                                db_seq = tmp_seq[coding_start - 1:]
                                if (db_seq not in
                                        trans_db_annotation_dict.keys()):
                                    trans_db_annotation_dict[db_seq] = [
                                        key,
                                        str(k) + "_exclusive_from_" +
                                        str(position - coding_start + 1)
                                    ]
                                else:
                                    des_tmp = str(
                                        k) + "_exclusive_from_" + str(
                                            position - coding_start + 1)
                                    if (des_tmp not in
                                            trans_db_annotation_dict[db_seq]):
                                        trans_db_annotation_dict[
                                            db_seq].append(des_tmp)

                                # Concatenate hom and het descriptions into
                                # one annotation string.
                                tmp_anno = ""
                                for d in hom_des:
                                    tmp_anno += d

                                for d in het_des[c]:
                                    tmp_anno += d
                                trans_db_annotation_dict[db_seq].append(
                                    tmp_anno)
                                #tmp_info=exon_coordinate_dict[skip_id]
                                tmp_skip_id_index = generate_all_se_db.find_key_index(
                                    'exon_id', skip_id, trans_records)
                                tmp_info=(trans_records.iloc[tmp_skip_id_index]['chr'],trans_records.iloc[tmp_skip_id_index]['start'],\
                                          trans_records.iloc[tmp_skip_id_index]['end'])

                                skip_seq = generate_all_se_db.fetch_exon_seq(
                                    tmp_info[0], tmp_info[1], tmp_info[2],
                                    T[0])
                                trans_prime_seq = generate_all_se_db.add_exon(
                                    tmp_seq, skip_seq, position)
                                db_prime_seq = trans_prime_seq[coding_start -
                                                               1:]
                                if (db_prime_seq not in
                                        trans_db_annotation_dict.keys()):
                                    trans_db_annotation_dict[db_prime_seq]=[key,"modified_"+str(k)+"_inclusive_from_"\
                                                            +str(position-coding_start+1)+"_to_"+\
                                                            str(position-coding_start+1+len(skip_seq))]
                                else:
                                    des_tmp="modified_"+str(k)+"_inclusive_from_"+\
                                                            str(position-coding_start+1)+"_to_"+\
                                                            str(position-coding_start+1+len(skip_seq))
                                    if (des_tmp
                                            not in trans_db_annotation_dict[
                                                db_prime_seq]):
                                        trans_db_annotation_dict[
                                            db_prime_seq].append(des_tmp)

                                tmp_anno = ""
                                for d in hom_des:
                                    tmp_anno += d
                                for d in het_des[c]:
                                    tmp_anno += d
                                trans_db_annotation_dict[db_prime_seq].append(
                                    tmp_anno)

                        # Case 2: the skipped exon directly follows the
                        # lower flanking exon in this transcript.  The
                        # sequence as-is is the "inclusive" form; the
                        # "exclusive" form is built by removing the exon.
                        elif (T[low_index + 1] == skip_id):

                            skip_exon_start = tmp_exon_position_dict[low_index
                                                                     + 1][0]
                            skip_exon_end = tmp_exon_position_dict[low_index +
                                                                   1][1]

                            if (skip_exon_start >= coding_start):
                                db_seq = tmp_seq[coding_start - 1:]
                                if (db_seq not in
                                        trans_db_annotation_dict.keys()):
                                    trans_db_annotation_dict[db_seq]=[key,str(k)+"_inclusive_from_"+str(skip_exon_start-coding_start+1)\
                                                            +"_to_"+str(skip_exon_end-coding_start+1)]
                                else:
                                    des_tmp=str(k)+"_inclusive_from_"+str(skip_exon_start-coding_start+1)\
                                                            +"_to_"+str(skip_exon_end-coding_start+1)
                                    if (des_tmp not in
                                            trans_db_annotation_dict[db_seq]):
                                        trans_db_annotation_dict[
                                            db_seq].append(des_tmp)

                                tmp_anno = ""
                                for d in hom_des:
                                    tmp_anno += d
                                for d in het_des[c]:
                                    tmp_anno += d
                                trans_db_annotation_dict[db_seq].append(
                                    tmp_anno)

                                trans_prime_seq = generate_all_se_db.remove_exon(
                                    tmp_seq, skip_exon_start, skip_exon_end)
                                db_prime_seq = trans_prime_seq[coding_start -
                                                               1:]
                                if (db_prime_seq not in
                                        trans_db_annotation_dict.keys()):
                                    trans_db_annotation_dict[db_prime_seq] = [
                                        key, "modified_" + str(k) +
                                        "_exclusive_from_" +
                                        str(skip_exon_start - coding_start + 1)
                                    ]
                                else:
                                    des_tmp="modified_"+str(k)+"_exclusive_from_"+\
                                                            str(skip_exon_start-coding_start+1)
                                    if (des_tmp
                                            not in trans_db_annotation_dict[
                                                db_prime_seq]):
                                        trans_db_annotation_dict[
                                            db_prime_seq].append(des_tmp)

                                # Keep only descriptions whose position (as
                                # returned by get_variant_pos_from_des) lies
                                # outside [skip_exon_start, skip_exon_end),
                                # i.e. outside the removed exon.
                                tmp_anno = ""
                                for d in hom_des:
                                    p = get_variant_pos_from_des(d)
                                    if p not in range(skip_exon_start,
                                                      skip_exon_end):
                                        tmp_anno += d
                                for d in het_des[c]:
                                    p = get_variant_pos_from_des(d)
                                    if p not in range(skip_exon_start,
                                                      skip_exon_end):
                                        tmp_anno += d
                                trans_db_annotation_dict[db_prime_seq].append(
                                    tmp_anno)

                # Variant-free path: runs whenever no 'hom' and no other SNV
                # was collected, which includes transcripts that skipped the
                # variant-aware block above.  Same two cases as above but
                # without variant annotations.
                if ((len(hom_position_list) == 0) and (len(het_list) == 0)):
                    coding_start = trans_coding_dict[key][0]

                    # Rebuilds the same exon offset table as above.
                    tmp_exon_coordinate_dict = {}
                    tmp_exon_position_dict = {}
                    tmp_pos = 0
                    for item in range(1, len(T)):
                        ei = generate_all_se_db.find_key_index(
                            'exon_id', T[item], trans_records)
                        if (ei == -1):
                            print("the exon id cannot be found")
                        else:
                            tmp_start = trans_records.iloc[ei]['start']
                            tmp_end = trans_records.iloc[ei]['end']
                            tmp_exon_coordinate_dict[T[item]] = (
                                trans_records.iloc[ei]['chr'], tmp_start,
                                tmp_end)

                            tmp_exon_position_dict[item] = (tmp_pos,
                                                            tmp_pos + tmp_end -
                                                            tmp_start + 1)
                            tmp_pos += tmp_end - tmp_start + 1

                    # Transcript lacks the skipped exon: record it and the
                    # exon-inserted version.
                    if ((high_index - low_index) == 1):
                        position = tmp_exon_position_dict[low_index][1]
                        if (position >= coding_start):
                            db_seq = tmp_seq[coding_start - 1:]
                            if (db_seq not in trans_db_annotation_dict.keys()):
                                trans_db_annotation_dict[db_seq] = [
                                    key,
                                    str(k) + "_exclusive_from_" +
                                    str(position - coding_start + 1)
                                ]
                            else:
                                des_tmp = str(k) + "_exclusive_from_" + str(
                                    position - coding_start + 1)
                                if (des_tmp not in
                                        trans_db_annotation_dict[db_seq]):
                                    trans_db_annotation_dict[db_seq].append(
                                        des_tmp)

                            tmp_skip_id_index = generate_all_se_db.find_key_index(
                                'exon_id', skip_id, trans_records)
                            tmp_info=(trans_records.iloc[tmp_skip_id_index]['chr'],trans_records.iloc[tmp_skip_id_index]['start'],\
                                      trans_records.iloc[tmp_skip_id_index]['end'])

                            skip_seq = generate_all_se_db.fetch_exon_seq(
                                tmp_info[0], tmp_info[1], tmp_info[2], T[0])
                            trans_prime_seq = generate_all_se_db.add_exon(
                                tmp_seq, skip_seq, position)
                            db_prime_seq = trans_prime_seq[coding_start - 1:]
                            if (db_prime_seq
                                    not in trans_db_annotation_dict.keys()):
                                trans_db_annotation_dict[db_prime_seq]=[key,"modified_"+str(k)+"_inclusive_from_"\
                                                        +str(position-coding_start+1)+"_to_"+\
                                                        str(position-coding_start+1+len(skip_seq))]
                            else:
                                des_tmp="modified_"+str(k)+"_inclusive_from_"+\
                                                        str(position-coding_start+1)+"_to_"+\
                                                        str(position-coding_start+1+len(skip_seq))
                                if (des_tmp not in trans_db_annotation_dict[
                                        db_prime_seq]):
                                    trans_db_annotation_dict[
                                        db_prime_seq].append(des_tmp)
                    # Transcript contains the skipped exon: record it and
                    # the exon-removed version.
                    elif (T[low_index + 1] == skip_id):
                        skip_exon_start = tmp_exon_position_dict[low_index +
                                                                 1][0]
                        skip_exon_end = tmp_exon_position_dict[low_index +
                                                               1][1]

                        if (skip_exon_start >= coding_start):
                            db_seq = tmp_seq[coding_start - 1:]
                            if (db_seq not in trans_db_annotation_dict.keys()):
                                trans_db_annotation_dict[db_seq]=[key,str(k)+"_inclusive_from_"+str(skip_exon_start-coding_start+1)\
                                                        +"_to_"+str(skip_exon_end-coding_start+1)]
                            else:
                                des_tmp=str(k)+"_inclusive_from_"+str(skip_exon_start-coding_start+1)\
                                                        +"_to_"+str(skip_exon_end-coding_start+1)
                                if (des_tmp not in
                                        trans_db_annotation_dict[db_seq]):
                                    trans_db_annotation_dict[db_seq].append(
                                        des_tmp)

                            trans_prime_seq = generate_all_se_db.remove_exon(
                                tmp_seq, skip_exon_start, skip_exon_end)
                            db_prime_seq = trans_prime_seq[coding_start - 1:]
                            if (db_prime_seq
                                    not in trans_db_annotation_dict.keys()):
                                trans_db_annotation_dict[db_prime_seq] = [
                                    key,
                                    "modified_" + str(k) + "_exclusive_from_" +
                                    str(skip_exon_start - coding_start + 1)
                                ]
                            else:
                                des_tmp="modified_"+str(k)+"_exclusive_from_"+\
                                                        str(skip_exon_start-coding_start+1)
                                if (des_tmp not in trans_db_annotation_dict[
                                        db_prime_seq]):
                                    trans_db_annotation_dict[
                                        db_prime_seq].append(des_tmp)

    trans_records = trans_records.sort_values(by=['trans_id'])
    # Turn every recorded coding sequence into a protein SeqRecord.
    my_seqs = []
    # my_transcripts=[]
    modified_trans = []
    mutation_seq_cnt = 0
    for k in trans_db_annotation_dict.keys():
        annotation_list = trans_db_annotation_dict[k]
        # Element 0 is the transcript id; the rest are description strings.
        trans_id = annotation_list[0]
        modified_trans.append(trans_id)
        #coding_start=trans_coding_dict[trans_id][0]
        #print(trans_id)
        pid = protein_id_dict[trans_id][0]
        gid = protein_id_dict[trans_id][1]

        tmp_id = db + "|" + pid + '|' + trans_id + '|' + gid

        # Join the descriptions with '|' and flag sequences whose
        # descriptions mention an 'snv'.
        des = ''
        flag = 0
        for an in range(1, len(annotation_list)):
            if (annotation_list[an].find('snv') != -1):
                flag = 1
            des += annotation_list[an] + '|'
#            my_transcripts.append(SeqRecord(Seq(str(k),IUPAC.ambiguous_dna),id=tmp_id,description=des))
        if (flag == 1):
            mutation_seq_cnt += 1
        new_seq = str(
            Seq(str(k),
                IUPAC.ambiguous_dna).transcribe().translate(to_stop=True))

        # Strip any literal 'None' substrings from the translated string.
        while (new_seq.find('None') != -1):
            new_seq = new_seq.replace('None', '')
        my_seqs.append(
            SeqRecord(Seq(str(new_seq), IUPAC.protein),
                      id=tmp_id,
                      description=des))

    print("The number of sequences with mutation generated is " +
          str(mutation_seq_cnt))

    # Optionally add the unmodified translation of every transcript that did
    # not yield an entry above and whose coding sequence is not already
    # recorded.
    if (exclude != True):
        original_cnt = 0
        for t in trans_seq_dict.keys():
            if t not in modified_trans:

                coding_start = int(trans_coding_dict[t][0])
                coding_end = int(trans_coding_dict[t][1])
                trans_seq = str(trans_seq_dict[t])
                trans_seq = trans_seq[coding_start - 1:]
                if (trans_seq not in trans_db_annotation_dict.keys()):

                    pid = protein_id_dict[t][0]
                    gid = protein_id_dict[t][1]
                    new_sequence = trans_seq

                    new_seq = str(
                        Seq(str(new_sequence),
                            IUPAC.ambiguous_dna).transcribe().translate(
                                to_stop=True))
                    #            else:
                    #                new_seq=str(Seq(str(new_sequence),IUPAC.ambiguous_dna).complement().transcribe().translate(to_stop=True))

                    my_seqs.append(SeqRecord(Seq(str(new_seq),IUPAC.protein),\
                                                 id=db+'|'+pid+'|'+t+'|'+gid+'_0:'+str(coding_start)+'-'+str(coding_end),\
                                             description="no splicing"))
                    original_cnt += 1

        print("The number of original sequences is " + str(original_cnt))

    print("The number of sequences generated is " + str(len(my_seqs)))

    # Write all records, one by one, to <output_name>.fasta.
    handle = open(output_name + ".fasta", "w")
    #handle=open(dataset_name+"_all_se_db"+".fasta","w")
    for sequence in my_seqs:
        SeqIO.write(sequence, handle, "fasta")
    handle.close()


#    handle_trans=open("../data/"+dataset_name+"/"+dataset_name+"_all_se_transcripts"+".fasta","w")
#    for sequence in my_transcripts:
#        SeqIO.write(sequence,handle_trans,"fasta")
#    handle_trans.close()
#return my_seqs

#dbname="../data/gencode.v28.basic.annotation.gff3"
#SE_name=argv[1]
#dfname=argv[2]
#rna_db=argv[3]
#output_name=argv[4]
#
#df_SE=pd.read_csv(SE_name,sep='\t')
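#NOTE: stale example - generate_se_sequence_combination now also takes het and exclude (before output_name)
#generate_se_sequence_combination(df_SE,dfname,dbname,rna_db,output_name)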
##
#finish=time.time()
#print("---- %s minutes ----" % ((finish-begin)/60))
コード例 #3
0
                                else:
                                    start = "<" + col4
                                    ext = "5' Partial CDS"
                                stop = col5
                            if col7 == "-":
                                if dico_end[col9] in stopCodons:
                                    stop = col4
                                else:
                                    stop = ">" + col4
                                    if ext != "":
                                        ext = "Partial CDS"
                                    else:
                                        ext = "3' Partial CDS"
                                start = col5
                            list_tmp_gene.append(
                                int(start.replace("<", "").replace(">", "")))
                            list_tmp_gene.append(
                                int(stop.replace("<", "").replace(">", "")))
                            list_tmp_cds.append(start + "\t" + stop + "\t" +
                                                "CDS")
                            dico_tmp_gff[
                                int(col4) +
                                1] = col1 + "\t" + col2 + "\t" + "CDS" + "\t" + col4 + "\t" + col5 + "\t" + col6 + "\t" + col7 + "\t" + col8 + "\t" + "Parent=" + col9 + ".1 CDS " + ext

                    if c > 1 and c < int(dicog.get(col9)):
                        if size % 3 == 0:
                            if col7 == "+":
                                start = col4
                                stop = col5
                            if col7 == "-":
                                start = col5
コード例 #4
0
                PlusTwoORFs.remove(orf)
        elif orf2[1] in orf[1]:
            if orf2 in PlusTwoORFs:
                PlusTwoORFs.remove(orf2)

# To inspect the number of ORFs found in each reading frame, uncomment the
# print statements below.

#print('Number of +1 ORFs: %s' %len(PlusOneORFs))
#print('Number of +2 ORFs: %s' %len(PlusTwoORFs))

# The results are printed for the user. The output can be saved to a tab-delimited
# file by running `python3 GeneratePreexistingORFControls.py > [filename]` in the terminal.
# Emit one tab-separated row per +1-frame ORF. Columns: orf[0], the ORF
# sequence (orf[1]), its translation, the translation with every 'C' removed
# (presumably cysteine, going by the variable name), the frame label '+1',
# and the length of the 'C'-free translation.
for orf in PlusOneORFs:
    orfSeq = orf[1]
    # Translate and convert to a plain string in one step.
    protein = str(Seq(orfSeq).translate())
    noCysProtein = protein.replace('C', '')
    noCysProteinLen = len(noCysProtein)
    output = [orf[0], orfSeq, protein, noCysProtein, '+1', noCysProteinLen]
    # Every field is stringified and joined with tabs, one row per line.
    print('\t'.join(map(str, output)))

# Assemble one row per +2-frame ORF. Columns: orf[0], the ORF sequence
# (orf[1]), its translation, the translation with every 'C' removed, the frame
# label '+2', and the length of the 'C'-free translation.
# The row is built but not emitted: the print call below is commented out.
# Uncomment it to write the +2 rows as tab-separated lines.
for orf in PlusTwoORFs:
    orfSeq = orf[1]
    # Translate and convert to a plain string in one step.
    protein = str(Seq(orfSeq).translate())
    noCysProtein = protein.replace('C', '')
    noCysProteinLen = len(noCysProtein)
    output = [orf[0], orfSeq, protein, noCysProtein, '+2', noCysProteinLen]
    #print(*output,sep='\t')