Ejemplo n.º 1
0
def extract_feature_from_seq(seq,stt,stp,c_tab,g_tab):
    '''Extract features from a single nucleotide sequence.

    Parameters:
        seq:   nucleotide sequence string; it is upper-cased before the
               ORF search.
        stt:   comma-separated start codons (e.g. "ATG").
        stp:   comma-separated stop codons (e.g. "TAG,TAA,TGA").
        c_tab: passed unchanged to FrameKmer.kmer_ratio (presumably the
               coding hexamer table -- confirm against the caller).
        g_tab: passed unchanged to FrameKmer.kmer_ratio (presumably the
               non-coding hexamer table -- confirm against the caller).

    Returns:
        (mRNA_size, CDS_size, fickett_score, hexamer) where mRNA_size is
        len(seq), CDS_size is the size reported by ORFFinder.longest_orf
        for the "+" direction, and the last two values are computed from
        the ORF sequence it returned.
    '''
    start_codons = stt.strip().split(',')
    stop_codons = stp.strip().split(',')

    mRNA_seq = seq.upper()
    mRNA_size = len(seq)

    # Only the forward strand is searched, so no complement table is
    # needed here (the one previously built was never used).
    finder = orf.ORFFinder(mRNA_seq)
    CDS_size, _CDS_frame, CDS_seq = finder.longest_orf(
        direction="+", start_coden=start_codons, stop_coden=stop_codons)

    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (mRNA_size, CDS_size, fickett_score, hexamer)
Ejemplo n.º 2
0
def extract_feature_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab):
    '''Extract features of the transcript described by one BED line.

    Parameters:
        inbed:     one whitespace-separated BED line. Columns 0 (chrom),
                   1 (start), 3 (name), 5 (strand), 10 (block sizes) and
                   11 (block starts, relative to column 1) are used;
                   columns 2 and 9 must parse as integers.
        refgenome: reference FASTA path handed to pysam.faidx.
        stt:       comma-separated start codons.
        stp:       comma-separated stop codons.
        c_tab:     passed unchanged to FrameKmer.kmer_ratio.
        g_tab:     passed unchanged to FrameKmer.kmer_ratio.

    Returns:
        (geneName, mRNA_size, CDS_size, fickett_score, hexamer), or None
        when the line is blank or cannot be parsed (a "Wrong format!"
        message is written to stderr in the latter case).
    '''
    stt_coden = stt.strip().split(',')
    stp_coden = stp.strip().split(',')

    if not inbed.strip():
        return None

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        int(fields[2])  # tx_end: unused, but must be a valid integer
        geneName = fields[3]
        strand = fields[5]
        int(fields[9])  # exon count: unused, but must be a valid integer
        exon_sizes = [int(x) for x in fields[10].rstrip(',\n').split(',')]
        exon_starts = [tx_start + int(x)
                       for x in fields[11].rstrip(',\n').split(',')]
    except (IndexError, ValueError):
        # Too few columns or a non-numeric coordinate.
        print("Wrong format!" + inbed, file=sys.stderr)
        return None

    # zip() stops at the shorter list, as the original two-argument map did.
    exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]
    mRNA_size = sum(exon_sizes)

    pieces = []
    for st, end in zip(exon_starts, exon_ends):
        # Region string uses start + 1, i.e. a 1-based inclusive start.
        exon_coord = chrom + ':' + str(st + 1) + '-' + str(end)
        fasta = pysam.faidx(refgenome, exon_coord)
        # Older pysam returned a list of lines; newer releases return the
        # whole FASTA record as one string. Normalise to lines so the
        # header line (index 0) is skipped in both cases.
        if isinstance(fasta, str):
            fasta = fasta.splitlines()
        pieces.extend(line.rstrip('\n\r') for line in fasta[1:])

    # Upper-case on both strands: previously only '-' transcripts were
    # normalised, so lower-case (soft-masked) '+' sequence was handed to
    # ORFFinder as-is, unlike extract_feature_from_seq.
    mRNA_seq = ''.join(pieces).upper()
    if strand == '-':
        # Reverse complement.
        transtab = str.maketrans("ACGTNX", "TGCANX")
        mRNA_seq = mRNA_seq.translate(transtab)[::-1]

    finder = orf.ORFFinder(mRNA_seq)
    (CDS_size, _CDS_frame, CDS_seq) = finder.longest_orf(
        direction="+", start_coden=stt_coden, stop_coden=stp_coden)
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
Ejemplo n.º 3
0
def mainProcess(input, output, number, c_tab, g_tab, codonArr, hash_matrix,
                classifier):
    """Compute per-transcript features, classify them and write the result.

    Records come as alternating lines: a label line followed by its
    sequence line. A label line is whitespace-split; the first token is
    the transcript id and every following token is parsed as an integer
    segment length used to slice the sequence consecutively (exon lengths,
    judging by the variable names).

    Parameters:
        input:       when ``number > 1``, a newline-joined string whose
                     last split element is discarded; when ``number == 1``,
                     an already-split sequence of lines.
        output:      output path. For ``number > 1`` the file written is
                     ``<output>_Tmp_Dir/<output><number>`` instead.
        number:      worker/chunk number selecting the two modes above.
        c_tab:       passed unchanged to ``mcssProcess``.
        g_tab:       passed unchanged to ``mcssProcess``.
        codonArr:    unused by this function.
        hash_matrix: passed unchanged to ``HexamerFeatures``.
        classifier:  object providing ``predict_proba`` and ``predict``.

    Raises:
        ValueError: if ``number`` is less than 1, or if a record yields no
            non-empty segment (``max()`` of an empty list).
    """
    if number > 1:
        score_path = output + '_Tmp_Dir' + '/' + output + str(number)
        # Drop the last split element (the original deleted it by index).
        sequence_Arr = input.split('\n')[:-1]
    elif number == 1:
        score_path = output
        sequence_Arr = input
    else:
        # Previously this fell through and failed on an unbound variable.
        raise ValueError("number must be >= 1, got %r" % (number,))

    # Even positions hold label lines, odd positions hold sequences.
    label_Arr_tmp = sequence_Arr[0::2]
    FastA_seq_Arr_tmp = sequence_Arr[1::2]

    # Loop-invariant: anything that is not a/g/c/t/n is replaced by 'n'.
    non_base = re.compile('[^agctn]')

    # The context manager closes the file even if feature extraction or
    # the classifier raises.
    with open(score_path, 'w') as SCORE:
        data = []
        ids = []
        for i, label_line in enumerate(label_Arr_tmp):
            lower_seq = non_base.sub(
                'n', FastA_seq_Arr_tmp[i].lower().replace('u', 't'))
            upper_seq = lower_seq.upper()

            finder = orf.ORFFinder(upper_seq)
            (CDS_start, CDS_stop, CDS_size, CDS_frame,
             CDS_seq) = finder.longest_orf(direction="+")
            (MCS, CSL, CP) = mcssProcess(upper_seq, c_tab, g_tab)
            fickett_score = fickett.fickett_value(CDS_seq)
            (orfscore, orfdistance) = HexamerFeatures(CDS_seq.lower(),
                                                      hash_matrix)

            labels_Arr = label_line.split()
            ids.append(labels_Arr[0])

            Exons_mscore = []
            Exons_distance = []
            Exons_GC = []
            Site_start = 0
            for length_token in labels_Arr[1:]:
                seg_len = int(length_token)
                seg = lower_seq[Site_start:Site_start + seg_len]
                if not seg:
                    # Empty slice: skipped without advancing the offset.
                    continue
                Exons_GC.append(
                    (seg.count('c') + seg.count('g')) / float(len(seg)))
                (mscore, distance) = HexamerFeatures(seg, hash_matrix)
                Exons_mscore.append(mscore)
                Exons_distance.append(distance)
                Site_start += seg_len

            orf_ratio = CDS_size / float(len(lower_seq))
            data.append([
                CDS_size, orf_ratio, fickett_score, orfscore, orfdistance,
                max(Exons_mscore), max(Exons_distance), max(Exons_GC),
                MCS, CSL, CP
            ])

        # One row of 11 features per record.
        testing_data = np.array(data).reshape(len(label_Arr_tmp), 11)
        del data
        prob = classifier.predict_proba(testing_data)
        labels = classifier.predict(testing_data)
        PrintResult(ids, labels, prob[:, 1], SCORE)
Ejemplo n.º 4
0
def mainProcess(input,output,number,c_tab,g_tab,codonArr,hash_matrix,mRNA_num):
    """Compute per-transcript features and write them as labelled rows.

    Records come as alternating lines: a label line followed by its
    sequence line. A label line is whitespace-split; every token after
    the first is parsed as an integer segment length used to slice the
    sequence consecutively (exon lengths, judging by the variable names).

    Each output row is ``<label> f1 f2 ... f11`` separated by single
    spaces, where ``<label>`` is ``+1`` for records whose index is below
    ``mRNA_num / 2`` and ``-1`` otherwise.

    Parameters:
        input:       when ``number > 1``, a newline-joined string whose
                     last split element is discarded; when ``number == 1``,
                     an already-split sequence of lines.
        output:      output path. For ``number > 1`` the file written is
                     ``<output>_Tmp_Dir/<output><number>`` instead.
        number:      worker/chunk number selecting the two modes above.
        c_tab:       passed unchanged to ``mcssProcess``.
        g_tab:       passed unchanged to ``mcssProcess``.
        codonArr:    unused by this function.
        hash_matrix: passed unchanged to ``HexamerFeatures``.
        mRNA_num:    halved to obtain the number of leading records that
                     receive the ``+1`` label (presumably the line count
                     of the mRNA input, two lines per record -- confirm
                     against the caller).

    Raises:
        ValueError: if ``number`` is less than 1, or if a record yields no
            non-empty segment (``max()`` of an empty list).
    """
    mRNA_num = mRNA_num/2

    if number > 1:
        feature_path = output + '_Tmp_Dir' + '/' + output + str(number)
        # Drop the last split element (the original deleted it by index).
        sequence_Arr = input.split('\n')[:-1]
    elif number == 1:
        feature_path = output
        sequence_Arr = input
    else:
        # Previously this fell through and failed on an unbound variable.
        raise ValueError("number must be >= 1, got %r" % (number,))

    # Even positions hold label lines, odd positions hold sequences.
    label_Arr_tmp = sequence_Arr[0::2]
    FastA_seq_Arr_tmp = sequence_Arr[1::2]

    # Loop-invariant: anything that is not a/g/c/t/n is replaced by 'n'.
    non_base = re.compile('[^agctn]')

    # The context manager closes the file even if feature extraction
    # raises part-way through.
    with open(feature_path, 'w') as DATA:
        for i, label_line in enumerate(label_Arr_tmp):
            lower_seq = non_base.sub(
                'n', FastA_seq_Arr_tmp[i].lower().replace('u', 't'))
            upper_seq = lower_seq.upper()

            finder = orf.ORFFinder(upper_seq)
            (CDS_start, CDS_stop, CDS_size, CDS_frame,
             CDS_seq) = finder.longest_orf(direction="+")
            (MCS, CSL, CP) = mcssProcess(upper_seq, c_tab, g_tab)
            fickett_score = fickett.fickett_value(CDS_seq)
            (orfscore, orfdistance) = HexamerFeatures(CDS_seq.lower(),
                                                      hash_matrix)

            labels_Arr = label_line.split()
            Exons_mscore = []
            Exons_distance = []
            Exons_GC = []
            Site_start = 0
            for length_token in labels_Arr[1:]:
                seg_len = int(length_token)
                seg = lower_seq[Site_start:Site_start + seg_len]
                if not seg:
                    # Empty slice: skipped without advancing the offset.
                    continue
                Exons_GC.append(
                    (seg.count('c') + seg.count('g')) / float(len(seg)))
                (mscore, distance) = HexamerFeatures(seg, hash_matrix)
                Exons_mscore.append(mscore)
                Exons_distance.append(distance)
                Site_start += seg_len

            Max_Mscore_exon = max(Exons_mscore)
            Max_distance = max(Exons_distance)
            Max_GCcontent = max(Exons_GC)
            orf_ratio = CDS_size / float(len(lower_seq))

            # The two branches of the original differed only in this prefix.
            class_label = '+1' if i < mRNA_num else '-1'
            features = (CDS_size, orf_ratio, fickett_score, orfscore,
                        orfdistance, Max_Mscore_exon, Max_distance,
                        Max_GCcontent, MCS, CSL, CP)
            DATA.write(
                class_label + ' ' + ' '.join(str(v) for v in features) + '\n')