def gff3_to_gtf(gff3_file):

    dialect = {
        'field separator': '; ',
        'fmt': 'gtf',
        'keyval separator': ' ',
        'leading semicolon': False,
        'multival separator': ',',
        'quoted GFF2 values': True,
        'order': ['gene_id', 'transcript_id'],
        'repeated keys': False,
        'trailing semicolon': True
    }

    out_file = os.path.splitext(gff3_file)[0] + ".gtf"
    if file_exists(out_file):
        return out_file

    print "Converting %s to %s." % (gff3_file, out_file)

    db = gffutils.create_db(gff3_file, ":memory:")
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in DataIterator(db.features_of_type("exon"),
                                        dialect=dialect):
                transcript_id = feature["Parent"][0]
                gene_id = db[transcript_id]["Parent"][0]
                attr = {"transcript_id": transcript_id, "gene_id": gene_id}
                attributes = gffutils.attributes.Attributes(attr)
                feature.attributes = attributes
                print(feature, file=out_handle)
    return out_file
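
A brief usage sketch for the function above, with a hypothetical input path and assuming the helper imports used by the function (os, gffutils, DataIterator, file_exists, file_transaction) are available; the GTF is written next to the GFF3 (the conversion is skipped when the output already exists) and the output path is returned:

gtf_path = gff3_to_gtf("annotation/mouse.gff3")
print(gtf_path)  # annotation/mouse.gtf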
Example 2
def _output_ncbi_gff3(gff3_file, out_file, dialect):
    gene_key = "gene"
    id_spec = {"gene": gene_key}
    db = gffutils.create_db(gff3_file, ":memory:", id_spec=id_spec)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in DataIterator(db.features_of_type("exon"),
                                        dialect=dialect):
                # Gnomon features are often missing a transcript id
                # some malformed features are also missing the gene key
                try:
                    transcript_id = feature["transcript_id"]
                except KeyError:
                    try:
                        transcript_id = feature[gene_key]
                    except KeyError:
                        continue
                gene_id = feature[gene_key]
                try:
                    biotype = feature["gene_biotype"]
                except KeyError:
                    biotype = "unknown"
                attr = {
                    "transcript_id": transcript_id,
                    "gene_id": gene_id,
                    "gene_biotype": biotype
                }
                attributes = gffutils.attributes.Attributes(attr)
                feature.attributes = attributes
                print(feature, file=out_handle, end="")
Example 3
def _output_gff3(gff3_file, out_file, dialect):
    db = gffutils.create_db(gff3_file, ":memory:")
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in DataIterator(db.features_of_type("exon"), dialect=dialect):
                transcript_id = feature["Parent"][0]
                gene_id = db[transcript_id]["Parent"][0]
                attr = {"transcript_id": transcript_id, "gene_id": gene_id}
                attributes = gffutils.attributes.Attributes(attr)
                feature.attributes = attributes
                print(feature, file=out_handle, end="")
Example 4
def get_trans_records(dbname, trans_seq_dict):
    records = DataIterator(dbname)

    e = 0
    chromosome = []
    strand = []
    start = []
    end = []
    trans_id = []
    exon_number = []
    exon_id = []
    gene_id = []
    protein_id = []
    strand_dict = {}
    protein_id_dict = {}

    for record in records:
        if ('transcript_type' in record.attributes):
            if (record.attributes['transcript_id'][0]
                    in trans_seq_dict.keys()):
                strand_dict[record.attributes['transcript_id'][0]] = record[6]
                protein_id_dict[record.attributes['transcript_id'][0]] = (
                    record.attributes['protein_id'][0],
                    record.attributes['gene_id'][0])
                if (record[2] == 'exon'):
                    chromosome.append(record[0])
                    strand.append(record[6])
                    start.append(int(record[3]))
                    end.append(int(record[4]))
                    tmp = record.attributes['ID'][0]
                    flag = tmp.find(':')
                    tmp = tmp[flag + 1:]
                    s = tmp.split(':', 1)
                    if (s[0].find('_') != -1):
                        flag = s[0].find('_')
                        s[0] = s[0][:flag]
                    trans_id.append(s[0])
                    exon_number.append(int(s[1]))
                    gene_id.append(record.attributes['gene_id'][0])
                    exon_id.append(record.attributes['exon_id'][0])
                    protein_id.append(record.attributes['protein_id'][0])

                    e += 1

    #print(e)
    exon_records = {'chr': chromosome, 'strand': strand, 'start': start,
                    'end': end, 'trans_id': trans_id,
                    'exon_number': exon_number, 'exon_id': exon_id,
                    'protein_id': protein_id, 'gene_id': gene_id}
    trans_records = pd.DataFrame(exon_records,
                                 columns=['chr', 'strand', 'start', 'end',
                                          'trans_id', 'exon_number', 'exon_id',
                                          'protein_id', 'gene_id'])
    trans_records = trans_records.sort_values(by=['trans_id', 'exon_number'],
                                              axis=0)

    return trans_records, strand_dict, protein_id_dict
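
For reference, a minimal sketch (with a hypothetical annotation.gtf) of the access patterns used above: gffutils records can be indexed by GFF column position (0 = seqid, 2 = feature type, 3/4 = start/end, 6 = strand) and expose the parsed attributes as lists of values:

from gffutils.iterators import DataIterator

for record in DataIterator("annotation.gtf"):
    seqid, feature_type = record[0], record[2]
    start, end, strand = int(record[3]), int(record[4]), record[6]
    gene_id = record.attributes['gene_id'][0]  # attribute values are lists
    print(seqid, feature_type, start, end, strand, gene_id)
    break  # only inspect the first record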
Example 5
def get_new_sequence(dfname, dbname, rna_db, het, exclude, output_name):
    version = get_version(rna_db)
    if (version == 'swissprot'):
        db = 'sp'
    else:
        db = version

    sequence_dict = {}
    rna_seq = SeqIO.parse(rna_db, 'fasta')
    # Check that the RNA FASTA headers are correctly formatted.
    for correct in rna_seq:
        tmp = correct.id
        flag = tmp.find("|")
        tmp = tmp[flag + 1:]
        cds = tmp.find('CDS:')
        if (cds == -1):
            print("The format of file of parameter -r(--rna) is incorrect!")
            return 0
        tmp = tmp[cds:]
        cds_end = tmp.find('|')
        tmp = tmp[tmp.find(':') + 1:cds_end]

        split_flag = tmp.find('-')
        if (split_flag == -1):
            print("The format of file of parameter -r(--rna) is incorrect!")
            return 0

    rna_seqs = SeqIO.parse(rna_db, 'fasta')
    for seq in rna_seqs:
        tmp = seq.id
        flag = tmp.find("|")
        mrna_id = tmp[:flag]
        tmp = tmp[flag + 1:]
        cds = tmp.find('CDS:')
        tmp = tmp[cds:]
        cds_end = tmp.find('|')
        tmp = tmp[tmp.find(':') + 1:cds_end]

        split_flag = tmp.find('-')
        coding_start = int(tmp[:split_flag])
        coding_end = int(tmp[split_flag + 1:])
        sequence_dict[mrna_id] = (coding_start, seq.seq, coding_end)

    print("sequence_dict ready")

    records = DataIterator(dbname)
    strand_dict = {}
    protein_id_dict = {}
    for record in records:
        if (record[2] == 'transcript'):
            if ('transcript_type' in record.attributes):
                if (record.attributes['transcript_id'][0]
                        in sequence_dict.keys()):
                    strand_dict[record.attributes['transcript_id'][0]] = record[6]
                    protein_id_dict[record.attributes['transcript_id'][0]] = (
                        record.attributes['protein_id'][0],
                        record.attributes['gene_id'][0])

    print("protein_id_dict ready")

    df = pd.read_csv(dfname, sep='\t', header=None)
    change_df = extract_transcript_change(df)
    trans_index_dict = {}
    for i in range(0, change_df.shape[0]):
        if (change_df.iloc[i]['mrna'] not in trans_index_dict.keys()):
            trans_index_dict[change_df.iloc[i]['mrna']] = [i]
        else:
            trans_index_dict[change_df.iloc[i]['mrna']].append(i)

    my_seqs = []
    k_cnt = 0

    hom_only_cnt = 0
    hom_het_cnt = 0
    het_only_cnt = 0
    original_cnt = 0
    random_cnt = 0

    for k in trans_index_dict.keys():

        if (k in protein_id_dict.keys()):
            k_cnt += 1
            if (k_cnt % 1000 == 0):
                print(k_cnt)
            pid = protein_id_dict[k][0]
            gid = protein_id_dict[k][1]

            if (strand_dict[k] == '+'):

                transcript = str(sequence_dict[k][1])
            else:
                tmp = Seq(str(sequence_dict[k][1]),
                          IUPAC.ambiguous_dna).complement()
                transcript = str(tmp)
            coding_start = int(sequence_dict[k][0]) - 1
            coding_end = int(sequence_dict[k][2]) - 1

            transcript = transcript[coding_start:coding_end + 1]

            shift = 0
            des = ""
            het_list = []
            hom_position_list = []
            for i in trans_index_dict[k]:

                if (change_df.iloc[i]['snp_type'] == 'hom'):
                    hom_position_list.append(
                        (int(change_df.iloc[i]['c_start']),
                         change_df.iloc[i]['mutation_type']))
                    #                    if(change_df.iloc[i]['mutation_type']=='snv'):
                    #                        transcript=change_seq(transcript,int(change_df.iloc[i]['c_start'])+shift,\
                    #                                                  int(change_df.iloc[i]['c_end'])+shift,change_df.iloc[i]['c_content'],'snv')
                    #                        des+="snv:"+str(change_df.iloc[i]['c_start'])+change_df.iloc[i]['c_content']+'_'
                    #                    else:
                    if (strand_dict[k] == '-'):
                        transcript=change_seq(transcript,int(change_df.iloc[i]['c_start'])+shift,\
                                              int(change_df.iloc[i]['c_end'])+shift,str(Seq(str(change_df.iloc[i]['c_content'])).complement()),\
                                              change_df.iloc[i]['mutation_type'])
                    else:
                        transcript=change_seq(transcript,int(change_df.iloc[i]['c_start'])+shift,\
                                              int(change_df.iloc[i]['c_end'])+shift,change_df.iloc[i]['c_content'],\
                                              change_df.iloc[i]['mutation_type'])
                    des+=change_df.iloc[i]['mutation_type']+":"+str(change_df.iloc[i]['c_start'])+'-'+\
                    str(change_df.iloc[i]['c_end'])+str(change_df.iloc[i]['c_content'])+'_'
                    if (change_df.iloc[i]['mutation_type'].find('del') != -1):
                        shift -= (int(change_df.iloc[i]['c_end']) -
                                  int(change_df.iloc[i]['c_start']) + 1)
                    elif (change_df.iloc[i]['mutation_type'].find('ins') !=
                          -1):
                        shift += (int(change_df.iloc[i]['c_end']) -
                                  int(change_df.iloc[i]['c_start']) + 1)
                else:
                    het_list.append(i)

            if (len(hom_position_list) != 0):
                new_sequence = ""
                new_des = ""
                for p in hom_position_list:

                    flag = des.find(str(p[0]))
                    tmp = des[flag:]
                    flag = tmp.find('_')
                    new_des += str(p[1]) + ":" + tmp[:flag + 1]
                    new_sequence = transcript

                if (len(new_sequence) != 0):
                    new_sequence = new_sequence[shift:]

                    if (strand_dict[k] == '+'):
                        new_seq = str(
                            Seq(str(new_sequence),
                                IUPAC.ambiguous_dna).transcribe().translate(
                                    to_stop=True))
                    else:
                        new_seq = str(
                            Seq(str(new_sequence), IUPAC.ambiguous_dna).
                            complement().transcribe().translate(to_stop=True))

                    while (new_seq.find('None') != -1):
                        new_seq = new_seq.replace('None', '')
                    my_seqs.append(SeqRecord(Seq(str(new_seq),IUPAC.protein),\
                                             id=db+'|'+pid+'|'+k+'|'+gid+'_0:'+str(int(sequence_dict[k][0])+shift)+'-'+str(int(sequence_dict[k][2])+shift)+'_'+new_des,\
                                         description=new_des))
                    hom_only_cnt += 1

            else:
                if (exclude == False):
                    new_sequence = transcript
                    if (strand_dict[k] == '+'):
                        new_seq = str(
                            Seq(str(new_sequence),
                                IUPAC.ambiguous_dna).transcribe().translate(
                                    to_stop=True))
                    else:
                        new_seq = str(
                            Seq(str(new_sequence), IUPAC.ambiguous_dna).
                            complement().transcribe().translate(to_stop=True))

                    my_seqs.append(SeqRecord(Seq(str(new_seq),IUPAC.protein),\
                                                 id=db+'|'+pid+'|'+k+'|'+gid+'_0:'+str(int(sequence_dict[k][0]))+'-'+str(int(sequence_dict[k][2]))+'_no_variant',\
                                             description="no variant"))
                    original_cnt += 1

            if (het == 1):
                coding_start = int(sequence_dict[k][0]) + shift - 1
                coding_end = int(sequence_dict[k][2]) + shift - 1
                count = int(len(transcript) / 900)
                if (len(transcript) <= 900):
                    count = 1
                cnt = 0

                for l in range(0, count):
                    l = l * 900
                    start = l
                    if (start + 1799 < len(transcript)):
                        stop = start + 1799
                    else:
                        stop = len(transcript) - 1

                    het_number = len(het_list)
                    for n in range(0, het_number):
                        new_sequence = ""
                        new_des = ""

                        if ((int(change_df.iloc[het_list[n]]['c_start']) >=
                             start) &
                            (int(change_df.iloc[het_list[n]]['c_start']) <=
                             stop)):
                            if (strand_dict[k] == '-'):
                                new_sequence=change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift,\
                                                      int(change_df.iloc[het_list[n]]['c_end'])+shift,\
                                                      str(Seq(str(change_df.iloc[het_list[n]]['c_content'])).complement()),\
                                                      change_df.iloc[het_list[n]]['mutation_type'])
                            else:
                                new_sequence=change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift,\
                                                      int(change_df.iloc[het_list[n]]['c_end'])+shift,change_df.iloc[het_list[n]]['c_content'],\
                                                      change_df.iloc[het_list[n]]['mutation_type'])
                            new_des=des+change_df.iloc[het_list[n]]['mutation_type']+":"+\
                            str(change_df.iloc[het_list[n]]['c_start'])+'-'+str(change_df.iloc[het_list[n]]['c_end'])+\
                            str(change_df.iloc[het_list[n]]['c_content'])
                        if (len(new_sequence) != 0):
                            new_sequence = new_sequence[start:stop + 1]

                            if (strand_dict[k] == '+'):
                                new_seq = str(
                                    Seq(str(new_sequence), IUPAC.ambiguous_dna
                                        ).transcribe().translate(to_stop=True))
                            else:
                                new_seq = str(
                                    Seq(str(new_sequence),
                                        IUPAC.ambiguous_dna).complement().
                                    transcribe().translate(to_stop=True))
                            cnt += 1
                            while (new_seq.find('None') != -1):
                                new_seq = new_seq.replace('None', '')
                            my_seqs.append(SeqRecord(Seq(str(new_seq),IUPAC.protein),\
                                                     id=db+'|'+pid+'|'+k+'|'+gid+'_'+str(cnt)+':'+str(start+1)+'-'+str(stop+1)+'_'+new_des,\
                                                 description=new_des))
                            if (len(hom_position_list) != 0):
                                hom_het_cnt += 1
                            else:
                                het_only_cnt += 1
                            if (change_df.iloc[het_list[n]]['mutation_type'] ==
                                    'snv'):
                                random_seq, random_des = generate_random_SNV_site(
                                    strand_dict[k], k,
                                    transcript[start:stop + 1],
                                    int(change_df.iloc[het_list[n]]['c_start'])
                                    - start)
                                my_seqs.append(SeqRecord(Seq(str(random_seq),IUPAC.protein),\
                                                     id=db+'|'+pid+'|'+k+'|'+gid+'_'+str(cnt)+':'+str(start+1)+'-'+str(stop+1)+'_'+des+random_des,\
                                                 description=des+random_des))
                                random_cnt += 1

                            else:
                                random_seq,random_des=generate_random_fs(strand_dict[k],transcript[start:stop+1],\
                                                                         change_df.iloc[het_list[n]]['mutation_type'],\
                                                                         len(change_df.iloc[het_list[n]]['c_content']),\
                                                                         int(change_df.iloc[het_list[n]]['c_start'])-start)
                                #                            if(len(random_seq)!=0):
                                my_seqs.append(SeqRecord(Seq(str(random_seq),IUPAC.protein),\
                                                     id=db+'|'+pid+'|'+k+'|'+gid+'_'+str(cnt)+':'+str(start+1)+'-'+str(stop+1)+'_'+des+random_des,\
                                                 description=des+random_des))
                                random_cnt += 1

    if (exclude == False):
        for key in sequence_dict.keys():
            if (key not in trans_index_dict.keys()):
                pid = protein_id_dict[key][0]
                gid = protein_id_dict[key][1]
                coding_start = int(sequence_dict[key][0]) - 1
                coding_end = int(sequence_dict[key][2]) - 1
                new_sequence = sequence_dict[key][1][coding_start:coding_end +
                                                     1]

                new_seq = str(
                    Seq(str(new_sequence),
                        IUPAC.ambiguous_dna).transcribe().translate(
                            to_stop=True))
                my_seqs.append(SeqRecord(Seq(str(new_seq),IUPAC.protein),\
                                             id=db+'|'+pid+'|'+key+'|'+gid+'_0:'+str(coding_start+1)+'-'+str(coding_end+1)+'_no_variant',\
                                         description="no variant"))
                original_cnt += 1

    print("The number of proteins related is " + str(k_cnt))
    print("The number of sequences generated is " + str(len(my_seqs)))

    # Use a context manager so the output FASTA handle is closed after writing.
    with open(output_name + ".fasta", "w") as handle:
        for sequence in my_seqs:
            SeqIO.write(sequence, handle, "fasta")

    print("The number of sequences containing hom only is " +
          str(hom_only_cnt))
    print("The number of sequences containing het only is " +
          str(het_only_cnt))
    print("The number of mixed sequences is " + str(hom_het_cnt))
    print("The number of original sequences is " + str(original_cnt))
    print("The number of random sequences is " + str(random_cnt))
Example 6
# The start of this example was truncated; the imports and argparse setup
# below are a minimal reconstruction (the '--genome' and '--gff' option names
# are inferred from the conf.genome and conf.gff references further down).
import argparse

import numpy as np
from gffutils.iterators import DataIterator

#-----------------------------------------------------
# Step 1
# Parse command line arguments
#-----------------------------------------------------

ap = argparse.ArgumentParser()
ap.add_argument(
    '--genome',
    type=str,
    help='A fasta file of the genome the gff features refer to.')
ap.add_argument(
    '--gff',
    type=str,
    help=
    'A gff file containing intervals within which gc content can be determined.'
)

conf = ap.parse_args()

#-----------------------------------------------------
# Step 2
# Identify the gc content of features in the gff file
#-----------------------------------------------------

genome_file = conf.genome
gff_file = conf.gff

for feature in DataIterator(gff_file):
    contig_id = str(feature.seqid)
    feat_start = str(feature.start)
    feat_stop = str(feature.stop)
    sequence = feature.sequence(genome_file)

    g_count = sequence.count('G')
    c_count = sequence.count('C')
    n_count = sequence.count('N')
    gc_count = float(g_count + c_count)
    seq_len = int(len(sequence) - n_count)
    gc_frac = np.divide(gc_count, seq_len)

    gc_perc = int(np.round(np.multiply(gc_frac, 100), decimals=0))
    outline = [contig_id, feat_start, feat_stop, str(gc_perc)]
    print("\t".join(outline))
Example 7
def get_new_sequence(dfname, dbname, rna_db, protein_db, dataset_name):
    version = get_version(protein_db)
    if (version == 'swissprot'):
        db = 'sp'
    else:
        db = version

    protein_coding_list = get_protein_coding_list_from_db(protein_db)
    records = DataIterator(dbname)
    strand_dict = {}
    protein_id_dict = {}
    for record in records:
        if (record[2] == 'transcript'):
            if ('transcript_type' in record.attributes):
                if (record.attributes['transcript_type'][0] == 'protein_coding'
                    ):
                    if (record.attributes['transcript_id'][0]
                            in protein_coding_list):
                        strand_dict[record.attributes['transcript_id'][0]] = record[6]
                        protein_id_dict[record.attributes['transcript_id'][0]] = (
                            record.attributes['protein_id'][0],
                            record.attributes['gene_id'][0])

    print("protein_id_dict ready")

    sequence_dict = {}
    rna_seqs = SeqIO.parse(rna_db, 'fasta')
    # Check that the RNA FASTA headers are correctly formatted.
    for correct in rna_seqs:
        tmp = correct.id
        flag = tmp.find("|")
        mrna_id = tmp[:flag]
        tmp = tmp[flag + 1:]
        cds = tmp.find('CDS:')
        if (cds == -1):
            print("The format of file of parameter -r(--rna) is incorrect!")
            return 0
        tmp = tmp[cds:]
        cds_end = tmp.find('|')
        tmp = tmp[tmp.find(':') + 1:cds_end]

        split_flag = tmp.find('-')
        if (split_flag == -1):
            print("The format of file of parameter -r(--rna) is incorrect!")
            return 0
        coding_start = int(tmp[:split_flag])
        coding_end = int(tmp[split_flag + 1:])
    # Re-open the FASTA: the validation loop above exhausted the rna_seqs iterator.
    rna_seqs = SeqIO.parse(rna_db, 'fasta')
    for seq in rna_seqs:
        tmp = seq.id
        flag = tmp.find("|")
        mrna_id = tmp[:flag]
        tmp = tmp[flag + 1:]
        cds = tmp.find('CDS:')
        tmp = tmp[cds:]
        cds_end = tmp.find('|')
        tmp = tmp[tmp.find(':') + 1:cds_end]

        split_flag = tmp.find('-')
        coding_start = int(tmp[:split_flag])
        coding_end = int(tmp[split_flag + 1:])
        sequence_dict[mrna_id] = (coding_start, seq.seq, coding_end)

    print("sequence_dict ready")

    df = pd.read_csv(dfname, sep='\t', header=None)
    change_df = extract_transcript_change(df)
    trans_index_dict = {}
    for i in range(0, change_df.shape[0]):
        if (change_df.iloc[i]['mrna'] not in trans_index_dict.keys()):
            trans_index_dict[change_df.iloc[i]['mrna']] = [i]
        else:
            trans_index_dict[change_df.iloc[i]['mrna']].append(i)

    my_seqs = []
    k_cnt = 0
    hom_cnt = 0
    het_cnt = 0

    for k in trans_index_dict.keys():

        if (k in protein_id_dict.keys()):
            k_cnt += 1
            #            if(k_cnt%1000==0):
            #                print(k_cnt)
            pid = protein_id_dict[k][0]
            gid = protein_id_dict[k][1]

            if (strand_dict[k] == '+'):

                transcript = str(sequence_dict[k][1])
            else:
                tmp = Seq(str(sequence_dict[k][1]),
                          IUPAC.ambiguous_dna).complement()
                transcript = str(tmp)
            coding_start = int(sequence_dict[k][0]) - 1
            coding_end = int(sequence_dict[k][2]) - 1

            transcript = transcript[coding_start:coding_end + 1]

            shift = 0
            des = ""
            het_list = []
            hom_position_list = []
            for i in trans_index_dict[k]:

                if (change_df.iloc[i]['snp_type'] == 'hom'):
                    hom_cnt += 1
                    hom_position_list.append(
                        (int(change_df.iloc[i]['c_start']),
                         change_df.iloc[i]['mutation_type']))
                    #                    if(change_df.iloc[i]['mutation_type']=='snv'):
                    #                        transcript=change_seq(transcript,int(change_df.iloc[i]['c_start'])+shift,\
                    #                                                  int(change_df.iloc[i]['c_end'])+shift,change_df.iloc[i]['c_content'],'snv')
                    #                        des+="snv:"+str(change_df.iloc[i]['c_start'])+change_df.iloc[i]['c_content']+'_'
                    #                    else:
                    if (strand_dict[k] == '-'):
                        transcript=change_seq(transcript,int(change_df.iloc[i]['c_start'])+shift,\
                                              int(change_df.iloc[i]['c_end'])+shift,str(Seq(str(change_df.iloc[i]['c_content'])).complement()),\
                                              change_df.iloc[i]['mutation_type'])
                    else:
                        transcript=change_seq(transcript,int(change_df.iloc[i]['c_start'])+shift,\
                                              int(change_df.iloc[i]['c_end'])+shift,change_df.iloc[i]['c_content'],\
                                              change_df.iloc[i]['mutation_type'])
                    des+=change_df.iloc[i]['mutation_type']+":"+str(change_df.iloc[i]['c_start'])+'-'+\
                    str(change_df.iloc[i]['c_end'])+str(change_df.iloc[i]['c_content'])+'_'
                    if (change_df.iloc[i]['mutation_type'].find('del') != -1):
                        shift -= (int(change_df.iloc[i]['c_end']) -
                                  int(change_df.iloc[i]['c_start']) + 1)
                    elif (change_df.iloc[i]['mutation_type'].find('ins') !=
                          -1):
                        shift += (int(change_df.iloc[i]['c_end']) -
                                  int(change_df.iloc[i]['c_start']) + 1)
                else:
                    het_list.append(i)
                    het_cnt += 1

            coding_start = int(sequence_dict[k][0]) + shift - 1
            coding_end = int(sequence_dict[k][2]) + shift - 1
            count = int(len(transcript) / 900)
            if (len(transcript) <= 900):
                count = 1
            cnt = 0

            for l in range(0, count):
                l = l * 900
                start = l
                if (start + 1799 < len(transcript)):
                    stop = start + 1799
                else:
                    stop = len(transcript) - 1
#                while(coding_start>stop):
#                    l=l+1
#                    start=l*1800
#                    if(l+1799<len(transcript)):
#                        stop=l+1799
#                    else:
#                        stop=len(transcript)-1
#                #if(start>coding_end+shift)

                if (len(hom_position_list) != 0):
                    new_sequence = ""
                    new_des = ""
                    for p in hom_position_list:
                        if ((p[0] - 1 >= start) & (p[0] - 1 <= stop)):
                            flag = des.find(str(p[0]))
                            tmp = des[flag:]
                            flag = tmp.find('_')
                            new_des += str(p[1]) + ":" + tmp[:flag + 1]
                            new_sequence = transcript

                    if (len(new_sequence) != 0):

                        new_sequence = new_sequence[start:stop + 1]

                        if (strand_dict[k] == '+'):
                            new_seq = str(
                                Seq(str(new_sequence), IUPAC.ambiguous_dna).
                                transcribe().translate(to_stop=True))
                        else:
                            new_seq = str(
                                Seq(str(new_sequence),
                                    IUPAC.ambiguous_dna).complement().
                                transcribe().translate(to_stop=True))
                        cnt += 1
                        while (new_seq.find('None') != -1):
                            new_seq = new_seq.replace('None', '')
                        my_seqs.append(SeqRecord(Seq(str(new_seq),IUPAC.protein),\
                                                 id=db+'|'+pid+'|'+k+'|'+gid+'_'+str(cnt)+':'+str(start+1)+'-'+str(stop+1),\
                                             description=new_des))
                het_number = len(het_list)
                for n in range(0, het_number):
                    new_sequence = ""
                    new_des = ""

                    if ((int(change_df.iloc[het_list[n]]['c_start']) >= start)
                            &
                        (int(change_df.iloc[het_list[n]]['c_start']) <= stop)):
                        if (strand_dict[k] == '-'):
                            new_sequence=change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift,\
                                                  int(change_df.iloc[het_list[n]]['c_end'])+shift,\
                                                  str(Seq(str(change_df.iloc[het_list[n]]['c_content'])).complement()),\
                                                  change_df.iloc[het_list[n]]['mutation_type'])
                        else:
                            new_sequence=change_seq(transcript,int(change_df.iloc[het_list[n]]['c_start'])+shift,\
                                                  int(change_df.iloc[het_list[n]]['c_end'])+shift,change_df.iloc[het_list[n]]['c_content'],\
                                                  change_df.iloc[het_list[n]]['mutation_type'])
                        new_des=des+change_df.iloc[het_list[n]]['mutation_type']+":"+\
                        str(change_df.iloc[het_list[n]]['c_start'])+'-'+str(change_df.iloc[het_list[n]]['c_end'])+\
                        str(change_df.iloc[het_list[n]]['c_content'])
                    if (len(new_sequence) != 0):
                        new_sequence = new_sequence[start:stop + 1]

                        if (strand_dict[k] == '+'):
                            new_seq = str(
                                Seq(str(new_sequence), IUPAC.ambiguous_dna).
                                transcribe().translate(to_stop=True))
                        else:
                            new_seq = str(
                                Seq(str(new_sequence),
                                    IUPAC.ambiguous_dna).complement().
                                transcribe().translate(to_stop=True))
                        cnt += 1
                        while (new_seq.find('None') != -1):
                            new_seq = new_seq.replace('None', '')
                        my_seqs.append(SeqRecord(Seq(str(new_seq),IUPAC.protein),\
                                                 id=db+'|'+pid+'|'+k+'|'+gid+'_'+str(cnt)+':'+str(start+1)+'-'+str(stop+1),\
                                             description=new_des))

    print("The number of proteins related is " + str(k_cnt))
    print("The number of sequences generated is " + str(len(my_seqs)))
    print("The number of homozygous is " + str(hom_cnt))
    print("The number of heterozygous is " + str(het_cnt))
    #return my_seqs
    handle = open(dataset_name + "_all_mutation_" + version + ".fasta", "w")
    hom_seq = 0
    het_seq = 0
    hom_het_seq = 0
    for sequence in my_seqs:
        if (str(sequence.description)[-1] == '_'):
            hom_seq += 1
        else:
            if (str(sequence.description).find('_') == -1):
                het_seq += 1
            else:
                hom_het_seq += 1

        SeqIO.write(sequence, handle, "fasta")
    handle.close()
    print("The number of homozygous sequences is " + str(hom_seq))
    print("The number of heterozygous sequences is " + str(het_seq))
    print("The number of mixed sequences is " + str(hom_het_seq))
Example 8
def read_taxon_id(run_folder):
    """
    Search for Taxon ID in genbank or GFF files.
    For a GenBank file, search for the 'taxon:' key in the 'db_xref' qualifier.
    For a GFF file, search for 'taxon' in the Dbxref attribute of the region feature.

    Args:
        run_folder (str): path to the input folder
    """
    taxon_ids = {}

    for input_folder in os.listdir(run_folder):
        input_folder_path = os.path.join(run_folder, input_folder)
        for input_file in os.listdir(input_folder_path):
            if '.gbk' in input_file:
                gbk_pathname = os.path.join(input_folder_path, input_file)
                # Take the taxon ID from the GenBank file.
                with open(gbk_pathname, "r") as gbk:
                    # Take the first record of the GenBank file (first contig/chromosome).
                    first_seq_record = next(SeqIO.parse(gbk, "genbank"))
                    # Take the source feature of the first record.
                    # This feature contains the taxon ID in the db_xref qualifier.
                    src_features = [
                        feature for feature in first_seq_record.features
                        if feature.type == "source"
                    ]
                    for src_feature in src_features:
                        try:
                            src_dbxref_qualifiers = src_feature.qualifiers[
                                'db_xref']
                            for src_dbxref_qualifier in src_dbxref_qualifiers:
                                if 'taxon:' in src_dbxref_qualifier:
                                    taxon_id = src_dbxref_qualifier.replace(
                                        'taxon:', '')
                        except KeyError:
                            logger.info(
                                'No taxon ID in the GenBank {0}. In the FEATURES source you must have: /db_xref="taxon:taxonid", where taxonid is the ID of your organism. You can find it on the NCBI.'
                                .format(gbk_pathname))

            elif '.gff' in input_file:
                gff_pathname = os.path.join(input_folder_path, input_file)

                # Instead of parsing and creating a database from the GFF, parse the file and extract the first region feature.
                try:
                    region_feature = [
                        feature for feature in DataIterator(gff_pathname)
                        if feature.featuretype == 'region'
                    ][0]
                except IndexError:
                    raise IndexError(
                        'No region feature in the GFF file of {0}, GFF file must have region features.'
                        .format(input_folder))

                try:
                    region_feature.attributes['Dbxref']
                except KeyError:
                    raise KeyError(
                        'No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                        .format(input_folder))

                for dbxref in region_feature.attributes['Dbxref']:
                    if 'taxon' in dbxref:
                        taxon_id = dbxref.split('taxon:')[1]

            elif '.pf' in input_file:
                logger.info(
                    'No taxon ID associated to a PathoLogic Format. {0} will have a missing taxon_id'
                    .format(input_folder))
                taxon_id = "missing"
        taxon_ids[input_folder] = taxon_id

    return taxon_ids
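
A brief usage sketch, assuming a hypothetical input/ directory containing one sub-folder per species with its .gbk, .gff or .pf files:

taxon_ids = read_taxon_id("input/")
for species_folder, taxon_id in taxon_ids.items():
    print(species_folder, taxon_id)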
Example 9
def create_flats_and_lisp(run_folder, taxon_file):
    """
    Read GenBank/GFF/PF files and create the files needed by Pathway Tools.
    Also create a lisp file used to create flat files from the Pathway Tools results.
    The name of the PGDB created by Pathway Tools will be the name of the species with '_' instead of spaces.

    Create organism-params.dat:
    ID  pgdb_id
    STORAGE FILE
    NCBI-TAXON-ID   taxon_id
    NAME    species_name

    Create genetic-elements.dat:
    NAME    
    ANNOT-FILE  gbk_name
    //

    Create flat_files_creation.lisp:
    (in-package :ecocyc)
    (select-organism :org-id 'pgdb_id)
    (create-flat-files-for-current-kb)

    Args:
        run_folder (str): ID of a species of the input folder
        taxon_file (bool): Boolean indicating if a taxon_file must be used
    Returns:
        bool: True if all the needed files have been created, None if an error occurred
    """
    # Look for a Genbank/GFF files in the run folder.
    # PGDB ID corresponds to the name of the species folder.
    pgdb_id = os.path.basename(run_folder)
    gbk_name = pgdb_id + ".gbk"
    gbk_pathname = os.path.join(run_folder, gbk_name)
    gbff_name = pgdb_id + ".gbff"
    gbff_pathname = os.path.join(run_folder, gbff_name)
    gff_name = pgdb_id + ".gff"
    gff_pathname = os.path.join(run_folder, gff_name)

    organism_dat = os.path.join(run_folder, 'organism-params.dat')
    genetic_dat = os.path.join(run_folder, 'genetic-elements.dat')
    lisp_pathname = os.path.join(run_folder, 'flat_files_creation.lisp')

    fasta_extensions = ['.fasta', '.fsa']

    taxon_id = ""
    taxon_error = False
    species_name = ""
    taxon_datas = {}

    if os.path.isfile(gbk_pathname) or os.path.isfile(gbff_pathname):
        if os.path.isfile(gbk_pathname):
            input_name = gbk_name
            input_path = gbk_pathname
        else:
            input_name = gbff_name
            input_path = gbff_pathname
        # Take the species name and the taxon id from the genbank file.
        with open(input_path, "r") as gbk:
            # Take the first record of the genbank (first contig/chromosome) to retrieve the species name.
            try:
                first_seq_record = next(SeqIO.parse(gbk, "genbank"))
            except StopIteration:
                logger.critical(
                    'Issue with the genbank {0}, it can be empty or malformatted.'
                    .format(input_path))
                return None

            try:
                species_name = first_seq_record.annotations['organism']
            except KeyError:
                logger.critical(
                    'No organism in the Genbank {0} In the SOURCE you must have: ORGANISM  Species name'
                    .format(pgdb_id))
                return None

            # Take the source feature of the first record.
            # This feature contains the taxon ID in the db_xref qualifier.
            src_features = [
                feature for feature in first_seq_record.features
                if feature.type == "source"
            ]
            for src_feature in src_features:
                if 'db_xref' in src_feature.qualifiers:
                    src_dbxref_qualifiers = src_feature.qualifiers['db_xref']
                    for src_dbxref_qualifier in src_dbxref_qualifiers:
                        if 'taxon:' in src_dbxref_qualifier:
                            taxon_id = src_dbxref_qualifier.replace(
                                'taxon:', '')
                if not taxon_id:
                    logger.info(
                        'No taxon ID in the GenBank {0}. In the FEATURES source you must have: /db_xref="taxon:taxonid", where taxonid is the ID of your organism. You can find it on the NCBI.'
                        .format(gbk_pathname))
                    logger.info('Try to look in the taxon_id.tsv file')
                    taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                        run_folder, pgdb_id, taxon_id, taxon_file)
            if taxon_file:
                taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                    run_folder, pgdb_id, taxon_id, taxon_file)

    elif os.path.isfile(gff_pathname):
        input_name = gff_name
        # Check if there is a fasta file.
        gff_fasta = None
        for fasta_extension in fasta_extensions:
            fasta_input_name = input_name.replace('.gff', fasta_extension)
            fasta_path = os.path.join(run_folder, fasta_input_name)
            if os.path.exists(fasta_path):
                gff_fasta = fasta_input_name
        if not gff_fasta:
            logger.critical(
                'No fasta file (.fasta or .fsa) with the GFF of {0}'.format(
                    pgdb_id))
            return None

        # Instead of parsing and creating a database from the GFF, parse the file and extract the first region feature.
        try:
            region_feature = [
                feature for feature in DataIterator(gff_pathname)
                if feature.featuretype == 'region'
            ][0]
        except IndexError:
            logger.critical(
                'No region feature in the GFF file of {0}, GFF file must have region features.'
                .format(pgdb_id))
            return None

        try:
            region_feature.attributes['Dbxref']
        except KeyError:
            logger.critical(
                'No Dbxref in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                .format(pgdb_id))

        for dbxref in region_feature.attributes['Dbxref']:
            if 'taxon' in dbxref:
                taxon_id = dbxref.split('taxon:')[1]
        if not taxon_id or taxon_file:
            if not taxon_id:
                logger.info(
                    'Missing "taxon:" in GFF file of {0} GFF file must have a ;Dbxref=taxon:taxonid; in the region feature.'
                    .format(pgdb_id))
                logger.info('Try to look in the taxon_id.tsv file')
            taxon_error, taxon_id, taxon_datas = extract_taxon_id(
                run_folder, pgdb_id, taxon_id, taxon_file)

    # Look for PF files.
    elif any('.pf' in species_file or '.fasta' in species_file
             or '.fsa' in species_file
             for species_file in os.listdir(run_folder)):
        for species_file in os.listdir(run_folder):
            if '.pf' in species_file:
                # Check if there is a fasta file.
                pf_fasta = None
                for fasta_extension in fasta_extensions:
                    fasta_species_name = species_file.replace(
                        '.pf', fasta_extension)
                    fasta_path = os.path.join(run_folder, fasta_species_name)
                    if os.path.exists(fasta_path):
                        pf_fasta = fasta_species_name
                if not pf_fasta:
                    logger.critical(
                        'No fasta file (.fasta or .fsa) with the Pathologic file of {0}, this could lead to warnings in Pathway Tools.'
                        .format(pgdb_id))

        taxon_error, taxon_id, taxon_datas = extract_taxon_id(
            run_folder, pgdb_id, taxon_id, taxon_file)

    if taxon_error == True:
        logger.critical('Issue with taxon ID of {0}.'.format(run_folder))
        return None

    # Create the organism-params dat file.
    with open(organism_dat, 'w', encoding='utf-8') as organism_file:
        organism_writer = csv.writer(organism_file,
                                     delimiter='\t',
                                     lineterminator='\n')
        organism_writer.writerow(['ID', pgdb_id])
        organism_writer.writerow(['STORAGE', "FILE"])
        organism_writer.writerow(['NCBI-TAXON-ID', taxon_id])
        organism_writer.writerow(['NAME', species_name])
        if 'reference_pgdbs' in taxon_datas:
            for reference_pgdb in taxon_datas['reference_pgdbs']:
                organism_writer.writerow(['REF-ORGID', reference_pgdb])

    # Create the genetic-elements dat file.
    with open(genetic_dat, 'w', encoding='utf-8') as genetic_file:
        if os.path.isfile(gff_pathname) or os.path.isfile(
                gbk_pathname) or os.path.isfile(gbff_pathname):
            genetic_writer = csv.writer(genetic_file,
                                        delimiter='\t',
                                        lineterminator='\n')
            genetic_writer.writerow(['NAME', ''])
            genetic_writer.writerow(['ANNOT-FILE', input_name])
            if os.path.isfile(gff_pathname):
                genetic_writer.writerow(['SEQ-FILE', gff_fasta])
            if 'circular' in taxon_datas:
                circular = taxon_datas['circular']
                genetic_writer.writerow(['CIRCULAR?', circular])
            if 'element_type' in taxon_datas:
                element_type = taxon_datas['element_type']
                genetic_writer.writerow(['TYPE', element_type])
            if 'codon_table' in taxon_datas:
                codon_table = taxon_datas['codon_table']
                genetic_writer.writerow(['CODON-TABLE', codon_table])
            genetic_writer.writerow(['//'])
        elif any('.pf' in species_file or '.fasta' in species_file
                 or '.fsa' in species_file
                 for species_file in os.listdir(run_folder)):
            genetic_writer = csv.writer(genetic_file,
                                        delimiter='\t',
                                        lineterminator='\n')
            for species_file in os.listdir(run_folder):
                if '.pf' in species_file:
                    species_file_name = os.path.splitext(species_file)[0]
                    genetic_writer.writerow(
                        ['NAME', species_file.replace('.pf', '')])
                    genetic_writer.writerow(
                        ['ID', species_file.replace('.pf', '')])
                    genetic_writer.writerow(['ANNOT-FILE', species_file])
                    fasta_path = os.path.join(
                        run_folder, species_file.replace('.pf', '.fasta'))
                    fsa_path = os.path.join(
                        run_folder, species_file.replace('.pf', '.fsa'))
                    if os.path.exists(fasta_path):
                        genetic_writer.writerow([
                            'SEQ-FILE',
                            species_file.replace('.pf', '.fasta')
                        ])
                    elif os.path.exists(fsa_path):
                        genetic_writer.writerow(
                            ['SEQ-FILE',
                             species_file.replace('.pf', '.fsa')])

                    if species_file_name in taxon_datas:
                        if 'circular' in taxon_datas[species_file_name]:
                            circular = taxon_datas[species_file_name][
                                'circular']
                            genetic_writer.writerow(['CIRCULAR?', circular])
                        if 'element_type' in taxon_datas[species_file_name]:
                            element_type = taxon_datas[species_file_name][
                                'element_type']
                            genetic_writer.writerow(['TYPE', element_type])
                        if 'codon_table' in taxon_datas[species_file_name]:
                            codon_table = taxon_datas[species_file_name][
                                'codon_table']
                            genetic_writer.writerow(
                                ['CODON-TABLE', codon_table])
                    else:
                        if 'circular' in taxon_datas:
                            circular = taxon_datas['circular']
                            genetic_writer.writerow(['CIRCULAR?', circular])
                        if 'element_type' in taxon_datas:
                            element_type = taxon_datas['element_type']
                            genetic_writer.writerow(['TYPE', element_type])
                        if 'codon_table' in taxon_datas:
                            codon_table = taxon_datas['codon_table']
                            genetic_writer.writerow(
                                ['CODON-TABLE', codon_table])
                    genetic_writer.writerow(['//'])
    # Create the lisp script.
    check_lisp_file = create_flat_creation_script(pgdb_id, lisp_pathname)

    return all([
        os.path.isfile(organism_dat),
        os.path.isfile(genetic_dat), check_lisp_file
    ])
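
A brief usage sketch, assuming a hypothetical input/my_species folder holding my_species.gbk (or my_species.gff plus a .fasta/.fsa, or .pf files); the function returns True only if organism-params.dat, genetic-elements.dat and flat_files_creation.lisp were all created, and None on error:

created = create_flats_and_lisp("input/my_species", taxon_file=False)
if not created:
    print("Pathway Tools input files could not be created for my_species")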