コード例 #1
0
ファイル: cpat.py プロジェクト: pombredanne/presentations-9
def extract_feature_from_seq(seq, stt, stp, c_tab, g_tab):
    """Compute the feature tuple for one FASTA sequence.

    Args:
        seq: nucleotide sequence; it is upper-cased before the ORF search.
        stt: comma-separated start codons (split on "," after stripping).
        stp: comma-separated stop codons (split on "," after stripping).
        c_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.
        g_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.

    Returns:
        ``(mRNA_size, CDS_size, fickett_score, hexamer)`` where ``mRNA_size``
        is ``len(seq)`` and the other values are derived from the longest
        ORF found in the "+" direction.
    """
    start_codons = stt.strip().split(",")
    stop_codons = stp.strip().split(",")

    # Only the "+" direction is searched, so no complement table is needed.
    finder = orf.ORFFinder(seq.upper())
    cds_size, _cds_frame, cds_seq = finder.longest_orf(
        direction="+", start_coden=start_codons, stop_coden=stop_codons)

    fickett_score = fickett.fickett_value(cds_seq)
    hexamer = FrameKmer.kmer_ratio(cds_seq, 6, 3, c_tab, g_tab)
    return (len(seq), cds_size, fickett_score, hexamer)
コード例 #2
0
def extract_feature_from_seq(seq, stt, stp, c_tab, g_tab):
    """Extract the features of one sequence taken from a FASTA entry.

    Args:
        seq: nucleotide sequence; upper-cased before it is handed to the
            ORF finder.
        stt: start codons as one comma-separated string.
        stp: stop codons as one comma-separated string.
        c_tab: passed through to ``FrameKmer.kmer_ratio``.
        g_tab: passed through to ``FrameKmer.kmer_ratio``.

    Returns:
        Tuple ``(mRNA_size, CDS_size, fickett_score, hexamer)``;
        ``mRNA_size`` is the length of ``seq``.
    """
    mRNA_size = len(seq)
    stt_coden = stt.strip().split(',')
    stp_coden = stp.strip().split(',')

    # The search is restricted to direction "+", so the sequence is never
    # reverse-complemented and no translation table has to be built.
    orf_finder = orf.ORFFinder(seq.upper())
    CDS_size1, _frame, CDS_seq1 = orf_finder.longest_orf(
        direction="+", start_coden=stt_coden, stop_coden=stp_coden)

    fickett_score1 = fickett.fickett_value(CDS_seq1)
    hexamer = FrameKmer.kmer_ratio(CDS_seq1, 6, 3, c_tab, g_tab)
    return (mRNA_size, CDS_size1, fickett_score1, hexamer)
コード例 #3
0
ファイル: make_logitModel.py プロジェクト: liguowang/cpat
def extract_feature_from_seq(seq, stt, stp, c_tab, g_tab, min_orf):
    """Compute features for one FASTA sequence from its first ORF candidate.

    Args:
        seq: nucleotide sequence; upper-cased before the ORF search.
        stt: start codon(s), forwarded as ``start_coden``.
        stp: stop codon(s), forwarded as ``stop_coden``.
        c_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.
        g_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.
        min_orf: forwarded to ``find_orfs.ORFFinder`` as ``min_orf``.

    Returns:
        ``(mRNA_size, CDS_size, fickett_score, hexamer)``, or ``None`` when
        the finder reports no ORF candidate.
    """
    upper_seq = seq.upper()
    seq_len = len(seq)

    finder = find_orfs.ORFFinder(upper_seq, min_orf=min_orf)
    candidates = finder.orf_candidates(
        start_coden=stt, stop_coden=stp, antisense=False, n_candidate=3)
    if not len(candidates):
        return None

    # Up to three candidates are requested, but only the first is scored.
    _strand, _frame, _orf_start, _orf_end, cds_size, cds_seq = candidates[0]

    fickett_score = fickett.fickett_value(cds_seq)
    hexamer_score = FrameKmer.kmer_ratio(cds_seq, 6, 3, c_tab, g_tab)
    return (seq_len, cds_size, fickett_score, hexamer_score)
コード例 #4
0
ファイル: make_logitModel.py プロジェクト: liguowang/cpat
def extract_feature_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab):
    """Extract sequence features for the transcript on one BED line.

    The exon blocks of the line are fetched from ``refgenome`` with
    ``pysam.faidx``, concatenated, reverse-complemented for "-" strand
    records, and the longest "+"-direction ORF of that sequence is scored.

    Args:
        inbed: one BED line; fields 0, 1, 2, 3, 5, 9, 10 and 11 are read.
        refgenome: reference genome path handed to ``pysam.faidx``.
        stt: comma-separated start codons.
        stp: comma-separated stop codons.
        c_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.
        g_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.

    Returns:
        ``(geneName, mRNA_size, CDS_size, fickett_score, hexamer)``, or
        ``None`` when the line is blank or cannot be parsed (a
        "Wrong format!" message is then written to stderr).
    """
    stt_coden = stt.strip().split(',')
    stp_coden = stp.strip().split(',')
    if not inbed.strip():
        return None

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        int(fields[2])  # end coordinate: parsed only to validate the column
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        int(fields[9])  # block count: parsed only to validate the column
        exon_sizes = [int(x) for x in fields[10].rstrip(',\n').split(',')]
        exon_offsets = [int(x) for x in fields[11].rstrip(',\n').split(',')]
    except (IndexError, ValueError):
        # Too few columns or a non-integer value; anything else (including
        # KeyboardInterrupt) is allowed to propagate.
        print("Wrong format!" + inbed, file=sys.stderr)
        return None

    exon_starts = [tx_start + off for off in exon_offsets]
    exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]
    mRNA_size = sum(exon_sizes)

    exon_seqs = []
    for st, end in zip(exon_starts, exon_ends):
        # 0-based start -> 1-based inclusive region string
        exon_coord = chrom + ':' + str(st + 1) + '-' + str(end)
        fetched = pysam.faidx(refgenome, exon_coord)
        # Accept both a list of lines and a single newline-separated string;
        # in either form the first element is the FASTA header and is skipped.
        lines = fetched.split() if isinstance(fetched, str) else fetched
        exon_seqs.append(''.join(i.rstrip('\n\r') for i in lines[1:]))
    mRNA_seq = ''.join(exon_seqs)

    if strand == '-':
        transtab = str.maketrans("ACGTNX", "TGCANX")
        mRNA_seq = mRNA_seq.upper().translate(transtab)[::-1]

    finder = orf.ORFFinder(mRNA_seq)
    CDS_size, _cds_frame, CDS_seq = finder.longest_orf(direction="+",
                                                       start_coden=stt_coden,
                                                       stop_coden=stp_coden)
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
コード例 #5
0
ファイル: make_logitModel.py プロジェクト: liguowang/cpat
def extract_feature_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab,
                             min_orf):
    """Extract sequence features for the transcript on one BED line.

    Exon blocks are fetched from ``refgenome`` via ``pysam.faidx``, joined,
    reverse-complemented for "-" strand records, and the first ORF candidate
    returned by ``find_orfs.ORFFinder`` is scored.

    Args:
        inbed: one BED line; fields 0, 1, 3, 5, 10 and 11 are read.
        refgenome: reference genome path handed to ``pysam.faidx``.
        stt: start codon(s), forwarded as ``start_coden``.
        stp: stop codon(s), forwarded as ``stop_coden``.
        c_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.
        g_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.
        min_orf: forwarded to ``find_orfs.ORFFinder`` as ``min_orf``.

    Returns:
        ``(geneName, mRNA_size, CDS_size, fickett_score, hexamer)``, or
        ``None`` when the line is blank, cannot be parsed (a "Wrong format!"
        message is written to stderr), or yields no ORF candidate.
    """
    if not inbed.strip():
        return None

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        exon_sizes = [int(x) for x in fields[10].rstrip(',\n').split(',')]
        exon_offsets = [int(x) for x in fields[11].rstrip(',\n').split(',')]
    except (IndexError, ValueError):
        # Too few columns or a non-integer value; other exceptions propagate.
        print("Wrong format!" + inbed, file=sys.stderr)
        return None

    exon_starts = [tx_start + off for off in exon_offsets]
    exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]
    mRNA_size = sum(exon_sizes)

    exon_seqs = []
    for st, end in zip(exon_starts, exon_ends):
        # 0-based start -> 1-based inclusive region string
        exon_coord = chrom + ':' + str(st + 1) + '-' + str(end)
        fetched = pysam.faidx(refgenome, exon_coord)
        # The first whitespace-separated token is the FASTA header.
        exon_seqs.append(
            ''.join(i.rstrip('\n\r') for i in fetched.split()[1:]))
    mRNA_seq = ''.join(exon_seqs)

    if strand == '-':
        transtab = str.maketrans("ACGTNX", "TGCANX")
        mRNA_seq = mRNA_seq.upper().translate(transtab)[::-1]

    finder = find_orfs.ORFFinder(mRNA_seq, min_orf=min_orf)
    ORFs = finder.orf_candidates(start_coden=stt,
                                 stop_coden=stp,
                                 antisense=False,
                                 n_candidate=1)
    if len(ORFs) == 0:
        return None
    (_direction, _frame, _orf_start, _orf_end, CDS_size, CDS_seq) = ORFs[0]

    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
コード例 #6
0
def extract_feature_from_seq(seq, c_tab, g_tab):
    """Compute the nine-value feature record for one FASTA sequence.

    Args:
        seq: nucleotide sequence; upper-cased before analysis.
        c_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.
        g_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.

    Returns:
        The tuple ``(mRNA_size, orf_size, mean_orf_length, orf_coverage,
        fickett_score, hexamer, k34, k21, k6)``, or a list of nine zeros
        when ``find_longest()`` returns ``-1``.
    """
    upper_seq = seq.upper()
    seq_len = len(seq)

    longest = orf_extraction.ORFFinder(upper_seq).find_longest()

    # -1 is returned when no start codon has been found
    if longest == -1:
        return [0] * 9

    (start_positions, orf_seq, orf_size,
     mean_orf_length, orf_coverage) = longest

    fickett_score = fickett.fickett_value(orf_seq)
    kozak_features = kozak.find_kozak_feat(upper_seq, start_positions)
    k34, k21, k6 = kozak_features
    hexamer_score = FrameKmer.kmer_ratio(orf_seq, 6, 3, c_tab, g_tab)

    return (seq_len, orf_size, mean_orf_length, orf_coverage,
            fickett_score, hexamer_score, k34, k21, k6)
コード例 #7
0
ファイル: make_logitModel.py プロジェクト: liguowang/cpat
def extract_CDS_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab, min_orf):
    """Extract the annotated CDS of one BED line and compute its features.

    For every exon block, the part overlapping ``[cdsStart, cdsEnd)``
    (BED fields 6 and 7) is fetched from ``refgenome`` with ``pysam.faidx``;
    the pieces are concatenated and reverse-complemented for "-" strand
    records.

    Args:
        inbed: one BED line; fields 0, 1, 3, 5, 6, 7, 10 and 11 are read.
        refgenome: reference genome path handed to ``pysam.faidx``.
        stt: unused; kept so the signature matches existing callers.
        stp: unused; kept so the signature matches existing callers.
        c_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.
        g_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.
        min_orf: unused; kept so the signature matches existing callers.

    Returns:
        ``(geneName, mRNA_size, CDS_size, fickett_score, hexamer)``, or
        ``None`` when the line is blank or cannot be parsed (a
        "Wrong format!" message is then written to stderr).
    """
    if not inbed.strip():
        return None

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        cdsStart = int(fields[6])
        cdsEnd = int(fields[7])
        exon_sizes = [int(x) for x in fields[10].rstrip(',\n').split(',')]
        exon_offsets = [int(x) for x in fields[11].rstrip(',\n').split(',')]
    except (IndexError, ValueError):
        # Too few columns or a non-integer value; other exceptions propagate.
        print("Wrong format!" + inbed, file=sys.stderr)
        return None

    exon_starts = [tx_start + off for off in exon_offsets]
    mRNA_size = sum(exon_sizes)

    cds_parts = []
    for base, size in zip(exon_starts, exon_sizes):
        cds_exon_start = max(base, cdsStart)
        cds_exon_end = min(base + size, cdsEnd)
        # Skip exons with no CDS overlap.  Using >= also rejects exons that
        # merely touch the CDS boundary (and records with cdsStart == cdsEnd),
        # which would otherwise request a region whose start exceeds its end.
        if cds_exon_start >= cds_exon_end:
            continue
        # 0-based start -> 1-based inclusive region string
        exon_coord = (chrom + ':' + str(cds_exon_start + 1) + '-' +
                      str(cds_exon_end))
        fetched = pysam.faidx(refgenome, exon_coord)
        # The first whitespace-separated token is the FASTA header.
        cds_parts.append(
            ''.join(i.rstrip('\n\r') for i in fetched.split()[1:]))
    CDS_seq = ''.join(cds_parts)

    if strand == '-':
        transtab = str.maketrans("ACGTNX", "TGCANX")
        CDS_seq = CDS_seq.upper().translate(transtab)[::-1]

    CDS_size = len(CDS_seq)
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
コード例 #8
0
ファイル: cpat.py プロジェクト: pombredanne/presentations-9
def extract_feature_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab):
    """Extract sequence features for the transcript on one BED line.

    Exon blocks are fetched from ``refgenome`` with ``pysam.faidx``, joined,
    reverse-complemented for "-" strand records, and the longest
    "+"-direction ORF of the result is scored.

    Args:
        inbed: one BED line; fields 0, 1, 2, 3, 5, 9, 10 and 11 are read.
        refgenome: reference genome path handed to ``pysam.faidx``.
        stt: comma-separated start codons.
        stp: comma-separated stop codons.
        c_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.
        g_tab: forwarded unchanged to ``FrameKmer.kmer_ratio``.

    Returns:
        ``(geneName, mRNA_size, CDS_size, fickett_score, hexamer)``, or
        ``None`` when the line is blank or cannot be parsed (a
        "Wrong format!" message is then written to stderr).
    """
    stt_coden = stt.strip().split(",")
    stp_coden = stp.strip().split(",")
    if not inbed.strip():
        return None

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        int(fields[2])  # end coordinate: parsed only to validate the column
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        int(fields[9])  # block count: parsed only to validate the column
        # List comprehensions instead of map(): the results are iterated more
        # than once and must be real lists on every Python version.
        exon_sizes = [int(x) for x in fields[10].rstrip(",\n").split(",")]
        exon_offsets = [int(x) for x in fields[11].rstrip(",\n").split(",")]
    except (IndexError, ValueError):
        # Too few columns or a non-integer value; other exceptions propagate.
        sys.stderr.write("Wrong format!" + inbed + "\n")
        return None

    exon_starts = [tx_start + off for off in exon_offsets]
    exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]
    mRNA_size = sum(exon_sizes)

    exon_seqs = []
    for st, end in zip(exon_starts, exon_ends):
        # 0-based start -> 1-based inclusive region string
        exon_coord = chrom + ":" + str(st + 1) + "-" + str(end)
        fetched = pysam.faidx(refgenome, exon_coord)
        # NOTE(review): assumes pysam.faidx returns a list of lines whose
        # first element is the FASTA header -- confirm for the pysam in use.
        exon_seqs.append("".join([i.rstrip("\n\r") for i in fetched[1:]]))
    mRNA_seq = "".join(exon_seqs)

    if strand == "-":
        transtab = maketrans("ACGTNX", "TGCANX")
        mRNA_seq = mRNA_seq.upper().translate(transtab)[::-1]

    finder = orf.ORFFinder(mRNA_seq)
    CDS_size, _cds_frame, CDS_seq = finder.longest_orf(
        direction="+", start_coden=stt_coden, stop_coden=stp_coden)
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
コード例 #9
0
ファイル: lncScore.py プロジェクト: zhaodoctor/lncScore
def mainProcess(input, output, number, c_tab, g_tab, codonArr, hash_matrix,
                classifier):
    """Compute features for a batch of transcripts and write predictions.

    The records alternate between a label line and a sequence line.  A
    label line is whitespace-separated: the first token is the record id,
    the remaining tokens are integer segment lengths used to cut the
    sequence into consecutive pieces (called exons in this code).

    Args:
        input: when ``number > 1`` a newline-terminated string of lines;
            when ``number == 1`` an already-split sequence of lines.
        output: output file path (``number == 1``), or the base name from
            which ``<output>_Tmp_Dir/<output><number>`` is built.
        number: batch selector, see ``input`` / ``output``.
        c_tab: forwarded unchanged to ``mcssProcess``.
        g_tab: forwarded unchanged to ``mcssProcess``.
        codonArr: unused; kept so the signature matches existing callers.
        hash_matrix: forwarded unchanged to ``HexamerFeatures``.
        classifier: object providing ``predict_proba`` and ``predict``.

    Raises:
        ValueError: if ``number`` is below 1, or a record has no non-empty
            segment (``max()`` of an empty list).
        IndexError: if a label line has no matching sequence line.
    """
    if number > 1:
        temp_score = output + '_Tmp_Dir' + '/' + output + str(number)
        SCORE = open(temp_score, 'w')
        sequence_Arr = input.split('\n')
        del sequence_Arr[-1]  # drop the piece after the final newline
    elif number == 1:
        SCORE = open(output, 'w')
        sequence_Arr = input
    else:
        raise ValueError("number must be >= 1, got %r" % (number,))

    # Compiled once instead of once per record.
    non_base = re.compile('[^agctn]')

    try:
        # Even positions hold label lines, odd positions sequence lines.
        label_lines = sequence_Arr[0::2]
        seq_lines = sequence_Arr[1::2]

        data = []
        ids = []
        for i, label_line in enumerate(label_lines):
            # Normalise: lower-case, RNA 'u' -> 't', anything else -> 'n'.
            lower_seq = non_base.sub('n',
                                     seq_lines[i].lower().replace('u', 't'))
            upper_seq = lower_seq.upper()

            finder = orf.ORFFinder(upper_seq)
            (_cds_start, _cds_stop, CDS_size, _cds_frame,
             CDS_seq) = finder.longest_orf(direction="+")
            (MCS, CSL, CP) = mcssProcess(upper_seq, c_tab, g_tab)
            fickett_score = fickett.fickett_value(CDS_seq)
            (orfscore, orfdistance) = HexamerFeatures(CDS_seq.lower(),
                                                      hash_matrix)

            label_fields = label_line.split()
            ids.append(label_fields[0])

            Exons_mscore = []
            Exons_distance = []
            Exons_GC = []
            site_start = 0
            for length_token in label_fields[1:]:
                seg_len = int(length_token)
                segment = lower_seq[site_start:site_start + seg_len]
                if len(segment) > 0:
                    gc_count = segment.count('c') + segment.count('g')
                    Exons_GC.append(gc_count / float(len(segment)))
                    (mscore, distance) = HexamerFeatures(segment, hash_matrix)
                    Exons_mscore.append(mscore)
                    Exons_distance.append(distance)
                    # The cursor only advances past non-empty segments.
                    site_start += seg_len

            orf_ratio = CDS_size / float(len(lower_seq))
            data.append([
                CDS_size, orf_ratio, fickett_score, orfscore, orfdistance,
                max(Exons_mscore), max(Exons_distance), max(Exons_GC),
                MCS, CSL, CP
            ])

        testing_data = np.array(data)
        del data
        testing_data = testing_data.reshape(len(label_lines), 11)
        prob = classifier.predict_proba(testing_data)
        labels = classifier.predict(testing_data)
        PrintResult(ids, labels, prob[:, 1], SCORE)
    finally:
        # Close the output even when feature extraction or prediction fails.
        SCORE.close()
コード例 #10
0
ファイル: lncScore.py プロジェクト: xiaofengsong/lncScore
def mainProcess(input,output,number,c_tab,g_tab,codonArr,hash_matrix,classifier):
    """Score a batch of transcripts and write the classifier output.

    Input lines alternate: label line, sequence line, label line, ...
    A label line is whitespace-separated; token 0 is the record id and
    every further token is an integer length used to slice the sequence
    into consecutive segments (called exons in this code).

    Args:
        input: newline-terminated string of lines when ``number > 1``;
            an already-split sequence of lines when ``number == 1``.
        output: output path (``number == 1``) or base name used to build
            ``<output>_Tmp_Dir/<output><number>``.
        number: batch selector, see ``input`` / ``output``.
        c_tab: forwarded unchanged to ``mcssProcess``.
        g_tab: forwarded unchanged to ``mcssProcess``.
        codonArr: unused; kept so the signature matches existing callers.
        hash_matrix: forwarded unchanged to ``HexamerFeatures``.
        classifier: object providing ``predict_proba`` and ``predict``.

    Raises:
        ValueError: if ``number`` is below 1, or a record yields no
            non-empty segment (``max()`` of an empty list).
        IndexError: if a label line has no matching sequence line.
    """
    if number > 1:
        Temp_Dir = output + '_Tmp_Dir'
        SCORE = open(Temp_Dir + '/' + output + str(number), 'w')
        sequence_Arr = input.split('\n')
        sequence_Arr.pop()  # discard the piece after the final newline
    elif number == 1:
        SCORE = open(output, 'w')
        sequence_Arr = input
    else:
        raise ValueError("number must be >= 1, got %r" % (number,))

    # Hoisted out of the per-record loop.
    strinfo = re.compile('[^agctn]')

    try:
        # Labels sit at even positions, sequences at odd positions.
        label_Arr_tmp = sequence_Arr[0::2]
        FastA_seq_Arr_tmp = sequence_Arr[1::2]

        data = []
        ids = []
        for i, label in enumerate(label_Arr_tmp):
            # lower-case, RNA 'u' -> 't', every other symbol -> 'n'
            tran_sec_seq = strinfo.sub(
                'n', FastA_seq_Arr_tmp[i].lower().replace('u', 't'))
            tran_sec_seq2 = tran_sec_seq.upper()

            tmp = orf.ORFFinder(tran_sec_seq2)
            (_start, _stop, CDS_size, _frame,
             CDS_seq) = tmp.longest_orf(direction="+")
            (MCS, CSL, CP) = mcssProcess(tran_sec_seq2, c_tab, g_tab)
            fickett_score = fickett.fickett_value(CDS_seq)
            (orfscore, orfdistance) = HexamerFeatures(CDS_seq.lower(),
                                                      hash_matrix)

            labels_Arr = label.split()
            ids.append(labels_Arr[0])

            Exons_mscore = []
            Exons_distance = []
            Exons_GC = []
            Site_start = 0
            for token in labels_Arr[1:]:
                exon_len = int(token)
                exon_seq = tran_sec_seq[Site_start:Site_start + exon_len]
                if len(exon_seq) > 0:
                    GCnum = exon_seq.count('c') + exon_seq.count('g')
                    Exons_GC.append(GCnum / float(len(exon_seq)))
                    (mscore, distance) = HexamerFeatures(exon_seq,
                                                         hash_matrix)
                    Exons_mscore.append(mscore)
                    Exons_distance.append(distance)
                    # Only non-empty segments move the cursor forward.
                    Site_start = Site_start + exon_len

            orf_ratio = CDS_size / float(len(tran_sec_seq))
            data.append([CDS_size, orf_ratio, fickett_score, orfscore,
                         orfdistance, max(Exons_mscore), max(Exons_distance),
                         max(Exons_GC), MCS, CSL, CP])

        testing_data = np.array(data)
        del data
        testing_data = testing_data.reshape(len(label_Arr_tmp), 11)
        prob = classifier.predict_proba(testing_data)
        labels = classifier.predict(testing_data)
        PrintResult(ids, labels, prob[:, 1], SCORE)
    finally:
        # Always release the output handle, also on errors.
        SCORE.close()
コード例 #11
0
def mainProcess(input,output,number,c_tab,g_tab,codonArr,hash_matrix,mRNA_num):
    """Write one labelled feature line per transcript of a batch.

    Input lines alternate: label line, sequence line, label line, ...
    In a label line every whitespace-separated token after the first is an
    integer length used to slice the sequence into consecutive segments
    (called exons in this code).

    Each output line is a class label followed by eleven space-separated
    feature values.  Records whose index is below ``mRNA_num / 2`` are
    labelled ``+1``, all others ``-1``.

    Args:
        input: newline-terminated string of lines when ``number > 1``;
            an already-split sequence of lines when ``number == 1``.
        output: output path (``number == 1``) or base name used to build
            ``<output>_Tmp_Dir/<output><number>``.
        number: batch selector, see ``input`` / ``output``.
        c_tab: forwarded unchanged to ``mcssProcess``.
        g_tab: forwarded unchanged to ``mcssProcess``.
        codonArr: unused; kept so the signature matches existing callers.
        hash_matrix: forwarded unchanged to ``HexamerFeatures``.
        mRNA_num: twice the number of leading records labelled ``+1``.

    Raises:
        ValueError: if ``number`` is below 1, or a record yields no
            non-empty segment (``max()`` of an empty list).
        IndexError: if a label line has no matching sequence line.
    """
    mRNA_num = mRNA_num/2
    if number > 1:
        Temp_Dir = output + '_Tmp_Dir'
        DATA = open(Temp_Dir + '/' + output + str(number), 'w')
        sequence_Arr = input.split('\n')
        del sequence_Arr[-1]  # drop the piece after the final newline
    elif number == 1:
        DATA = open(output, 'w')
        sequence_Arr = input
    else:
        raise ValueError("number must be >= 1, got %r" % (number,))

    # Compiled once instead of once per record.
    strinfo = re.compile('[^agctn]')

    try:
        # Labels sit at even positions, sequences at odd positions.
        label_Arr_tmp = sequence_Arr[0::2]
        FastA_seq_Arr_tmp = sequence_Arr[1::2]

        for i, label in enumerate(label_Arr_tmp):
            # lower-case, RNA 'u' -> 't', every other symbol -> 'n'
            tran_sec_seq = strinfo.sub(
                'n', FastA_seq_Arr_tmp[i].lower().replace('u', 't'))
            tran_sec_seq2 = tran_sec_seq.upper()

            tmp = orf.ORFFinder(tran_sec_seq2)
            (_start, _stop, CDS_size, _frame,
             CDS_seq) = tmp.longest_orf(direction="+")
            (MCS, CSL, CP) = mcssProcess(tran_sec_seq2, c_tab, g_tab)
            fickett_score = fickett.fickett_value(CDS_seq)
            (orfscore, orfdistance) = HexamerFeatures(CDS_seq.lower(),
                                                      hash_matrix)

            Exons_mscore = []
            Exons_distance = []
            Exons_GC = []
            Site_start = 0
            for token in label.split()[1:]:
                exon_len = int(token)
                exon_seq = tran_sec_seq[Site_start:Site_start + exon_len]
                if len(exon_seq) > 0:
                    GCnum = exon_seq.count('c') + exon_seq.count('g')
                    Exons_GC.append(GCnum / float(len(exon_seq)))
                    (mscore, distance) = HexamerFeatures(exon_seq,
                                                         hash_matrix)
                    Exons_mscore.append(mscore)
                    Exons_distance.append(distance)
                    # Only non-empty segments move the cursor forward.
                    Site_start = Site_start + exon_len

            orf_ratio = CDS_size / float(len(tran_sec_seq))
            features = [CDS_size, orf_ratio, fickett_score, orfscore,
                        orfdistance, max(Exons_mscore), max(Exons_distance),
                        max(Exons_GC), MCS, CSL, CP]

            class_label = '+1' if i < mRNA_num else '-1'
            DATA.write(' '.join([class_label] + [str(v) for v in features])
                       + '\n')
    finally:
        # Always release the output handle, also on errors.
        DATA.close()
コード例 #12
0
ファイル: cpat.py プロジェクト: liguowang/cpat
def _load_hexamer_tables(path):
    """Read a hexamer frequency table into (coding, noncoding) dicts.

    Each data row is whitespace-separated: hexamer, coding value, noncoding
    value.  The header row (first field ``hexamer``) and blank lines are
    skipped.
    """
    coding = {}
    noncoding = {}
    with open(path) as handle:
        for line in handle:
            fields = line.split()
            if not fields or fields[0] == 'hexamer':
                continue
            coding[fields[0]] = float(fields[1])
            noncoding[fields[0]] = float(fields[2])
    return coding, noncoding


def _report_orf_candidates(name, seq, options, start_codons, stop_codons,
                           coding, noncoding, info_out, seq_out):
    """Write the ORF candidates of one RNA to the info table and FASTA file.

    Returns True when at least one ORF candidate was found, else False.
    """
    RNA_len = len(seq)
    finder = find_orfs.ORFFinder(seq=seq, min_orf=options.min_orf_len)
    ORFs = finder.orf_candidates(antisense=options.antisense,
                                 n_candidate=options.n_top_orf,
                                 start_coden=start_codons,
                                 stop_coden=stop_codons)
    if len(ORFs) == 0:
        return False

    # orf_sn: ORF serial number, starting from 1
    for orf_sn, cand in enumerate(ORFs, start=1):
        # cand = [direction, frame_number+1, orf_start, orf_end, L, sequence]
        orf_seq = cand[-1]
        if cand[0] == '+':
            # change 0-based into 1-based to be consistent with NCBI ORFfinder
            # output (https://www.ncbi.nlm.nih.gov/orffinder/)
            cand[2] = cand[2] + 1
        elif cand[0] == '-':
            cand[2] = RNA_len - (cand[2])
            cand[3] = RNA_len - cand[3] + 1

        orf_id = name + '_ORF_' + str(orf_sn) + '\t' + str(
            RNA_len) + '\t' + '\t'.join([str(i) for i in cand[:-1]])

        fickett_score = fickett.fickett_value(orf_seq)
        hexamer_score = FrameKmer.kmer_ratio(orf_seq, 6, 3, coding, noncoding)
        print(orf_id + '\t' + str(fickett_score) + '\t' + str(hexamer_score),
              file=info_out)

        print(">" + orf_id, file=seq_out)
        print('\n'.join(wrap(orf_seq, width=options.line_width)),
              file=seq_out)
    return True


def main():
    """Command-line entry point.

    Parses the options, searches every input RNA (FASTA, or BED plus a
    reference genome) for ORF candidates, writes
    ``<prefix>.ORF_seqs.fa``, ``<prefix>.ORF_info.tsv`` and
    ``<prefix>.no_ORF.txt``, runs ``coding_prediction`` on the info table,
    and finally writes the best ORF per sequence to
    ``<prefix>.ORF_prob.best.tsv``.

    Exits via ``sys.exit(0)`` when a required option is missing or an
    option value / input format is not accepted.
    """
    usage = "\n%prog  [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-g",
        "--gene",
        action="store",
        type="string",
        dest="gene_file",
        help=
        "Genomic sequnence(s) of RNA in FASTA (https://en.wikipedia.org/wiki/FASTA_format) or standard 12-column BED (https://genome.ucsc.edu/FAQ/FAQformat.html#format1) format. It is recommended to use *short* and *unique* sequence identifiers (such as Ensembl transcript id) in FASTA and BED file. If this is a BED file, reference genome ('-r/--ref') should be specified. The input FASTA or BED file could be a regular text file or compressed file (*.gz, *.bz2) or accessible URL (http://, https://, ftp://). URL file cannot be a compressed file."
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      help="The prefix of output files.")
    parser.add_option(
        "-d",
        "--logitModel",
        action="store",
        dest="logit_model",
        help=
        "Logistic regression model. The prebuilt models for Human, Mouse, Fly, Zebrafish are availablel. Run 'make_logitModel.py' to build logistic regression model for your own training datset."
    )
    parser.add_option(
        "-x",
        "--hex",
        action="store",
        dest="hexamer_dat",
        help=
        "The hexamer frequency table. The prebuilt tables for Human, Mouse, Fly, Zebrafish are availablel. Run 'make_hexamer_tab.py' to make this table for your own training dataset."
    )
    parser.add_option(
        "-r",
        "--ref",
        action="store",
        dest="ref_genome",
        help=
        "Reference genome sequences in FASTA format. Reference genome file will be indexed automatically if the index file ( *.fai) does not exist. Will be ignored if FASTA file was provided to '-g/--gene'."
    )
    parser.add_option(
        "--antisense",
        action="store_true",
        dest="antisense",
        default=False,
        help=
        "Logical to determine whether to search for ORFs from the anti-sense strand. *Sense strand* (or coding strand) is DNA strand that carries the translatable code in the 5′ to 3′ direction. default=False (i.e. only search for ORFs from the sense strand)"
    )
    parser.add_option(
        "--start",
        action="store",
        type="string",
        dest="start_codons",
        default='ATG',
        help=
        "Start codon (use 'T' instead of 'U') used to define the start of open reading frame (ORF). default=%default"
    )
    parser.add_option(
        "--stop",
        action="store",
        type="string",
        dest="stop_codons",
        default='TAG,TAA,TGA',
        help=
        "Stop codon (use 'T' instead of 'U') used to define the end of open reading frame (ORF). Multiple stop codons are separated by ','. default=%default"
    )
    parser.add_option(
        "--min-orf",
        action="store",
        type="int",
        dest="min_orf_len",
        default=75,
        help="Minimum ORF length in nucleotides.  default=%default")
    parser.add_option(
        "--top-orf",
        action="store",
        type="int",
        dest="n_top_orf",
        default=5,
        help=
        "Number of ORF candidates reported. RNAs may have dozens of putative ORFs, in most cases, the real ORF is ranked (by size) in the top several. It is not necessary to calculate \"Fickett score\", \"Hexamer score\" and \"coding probability\" for every ORF. default=%default"
    )
    parser.add_option(
        "--width",
        action="store",
        type="int",
        dest="line_width",
        default=100,
        help="Line width of output ORFs in FASTA format.  default=%default")
    parser.add_option("--log-file",
                      action="store",
                      type="string",
                      dest="log_file",
                      default='CPAT_run_info.log',
                      help="Name of log file. default=\"%default\"")
    parser.add_option(
        "--best-orf",
        action="store",
        type="string",
        dest="mode",
        default='p',
        help=
        "Criteria to select the best ORF: \"l\"=length, selection according to the \"ORF length\"; \"p\"=probability, selection according to the \"coding probability\". default=\"%default\""
    )
    parser.add_option(
        "--verbose",
        action="store_true",
        dest="debug",
        default=False,
        help=
        "Logical to determine if detailed running information is printed to screen."
    )
    (options, args) = parser.parse_args()

    # All four of these options are required.
    required = [
        options.gene_file, options.hexamer_dat, options.logit_model,
        options.out_file
    ]
    if not all(required):
        parser.print_help()
        sys.exit(0)

    if options.line_width < 1:
        print("--width must be a positive integer.", file=sys.stderr)
        sys.exit(0)

    if options.mode not in ["p", "l"]:
        print("Please specifiy either \"p\" or \"l\" to --best-orf.",
              file=sys.stderr)
        sys.exit(0)

    # logging to file; --verbose only changes the level
    log_format = "%(asctime)s [%(levelname)s]  %(message)s"
    date_format = '%Y-%m-%d %I:%M:%S'
    logging.basicConfig(
        filename=options.log_file,
        filemode='w',
        format=log_format,
        datefmt=date_format,
        level=logging.DEBUG if options.debug else logging.INFO)
    # logging to console
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(
        logging.Formatter(log_format, datefmt=date_format))
    logging.getLogger().addHandler(consoleHandler)

    logging.info("Running CPAT version %s..." % (__version__))
    start_codons = options.start_codons.replace(' ', '').split(',')
    stop_codons = options.stop_codons.replace(' ', '').split(',')

    ## do NOT change these labels, they are R variable names in the model.
    header = "\t".join([
        "ID", "mRNA", "ORF_strand", "ORF_frame", "ORF_start", "ORF_end",
        "ORF", "Fickett", "Hexamer"
    ])

    # The with-block guarantees that all three outputs (including the no-ORF
    # list) are flushed and closed before coding_prediction reads the table.
    with open(options.out_file + '.ORF_seqs.fa', 'w') as SEQOUT, \
            open(options.out_file + '.ORF_info.tsv', 'w') as INFOUT, \
            open(options.out_file + '.no_ORF.txt', 'w') as NOORF:

        logging.info("Start codons used: [%s]" % ','.join(start_codons))
        logging.info("Stop codons used: [%s]" % ','.join(stop_codons))

        # build hexamer table from hexamer frequency file
        logging.info("Reading %s" % options.hexamer_dat)
        coding, noncoding = _load_hexamer_tables(options.hexamer_dat)

        count = 0
        logging.info("Checking format of \"%s\"" % options.gene_file)
        file_format = bed_or_fasta(options.gene_file)
        if file_format == 'UNKNOWN':
            logging.error("Unknown file format:%s" % options.gene_file)
            sys.exit(0)

        elif file_format == 'FASTA':
            logging.info("Input gene file is in FASTA format")
            if options.ref_genome:
                logging.warning(
                    "\"%s\" is a sequence file. The reference genome file \"%s\" will be ignored."
                    % (options.gene_file, options.ref_genome))

            logging.info("Searching for ORFs ...")
            print(header, file=INFOUT)
            for name, seq in FrameKmer.seq_generator(options.gene_file):
                count += 1
                if not _report_orf_candidates(name, seq, options,
                                              start_codons, stop_codons,
                                              coding, noncoding, INFOUT,
                                              SEQOUT):
                    logging.warning("No ORFs found for %s" % name)
                    print(name, file=NOORF)
                    continue
                print("%d sequences finished\r" % count,
                      end=' ',
                      file=sys.stderr)
            print("\n", file=sys.stderr)

        elif file_format == 'BED':
            logging.info("Input gene file is in BED format")
            if not options.ref_genome:
                logging.error(
                    "Reference genome file (-r/--ref) must be provided.")
                parser.print_help()
                sys.exit(0)

            logging.info("Searching for ORFs ...")
            print(header, file=INFOUT)

            index_fasta(options.ref_genome)

            for line in ireader.reader(options.gene_file):
                count += 1
                # skip track/browser definitions and comment lines
                if line.startswith(('track', '#', 'browser')):
                    continue
                name, seq = seq_from_bed(line, options.ref_genome)
                if not _report_orf_candidates(name, seq, options,
                                              start_codons, stop_codons,
                                              coding, noncoding, INFOUT,
                                              SEQOUT):
                    logging.warning("No ORFs found for %s" % name)
                    print(line, file=NOORF)
                    continue
                print("%d rows finished\r" % count, end=' ', file=sys.stderr)
            print("\n", file=sys.stderr)

    logging.info("Calculate coding probability ...")
    coding_prediction(
        options.logit_model, options.out_file + '.ORF_info.tsv',
        options.out_file)  #output options.out_file + '.ORF_prob.tsv'

    # Column of the probability table used to rank the ORFs of a sequence.
    if options.mode == 'p':
        logging.info("Select ORF with the highest coding probability ...")
        col_index = 9
    elif options.mode == 'l':
        logging.info("Select the longest ORF ...")
        col_index = 6

    best_candidates = {}
    with open(options.out_file + '.ORF_prob.best.tsv', 'w') as BEST, \
            open(options.out_file + '.ORF_prob.tsv', 'r') as PROB:
        for l in PROB:
            l = l.strip()
            if not l:
                continue
            if l.startswith('ID'):
                print("seq_ID\t" + l, file=BEST)
                continue
            f = l.split('\t')
            seq_id = f[0].split('_ORF_')[0]
            score = float(f[col_index])
            # keep the first row per sequence unless a later one ranks higher
            if (seq_id not in best_candidates
                    or score > float(best_candidates[seq_id][col_index])):
                best_candidates[seq_id] = f

        for k, v in best_candidates.items():
            print(k + '\t' + '\t'.join(v), file=BEST)

    logging.info("Done!")

    finish_up(options.out_file, options.n_top_orf, options.min_orf_len)