def extract_feature_from_seq(seq, stt, stp, c_tab, g_tab):
    """Extract coding-potential features from one FASTA sequence.

    Args:
        seq: Nucleotide sequence; it is upper-cased before the ORF search.
        stt: Comma-separated start codons, e.g. "ATG".
        stp: Comma-separated stop codons, e.g. "TAG,TAA,TGA".
        c_tab: Passed to ``FrameKmer.kmer_ratio`` as the first table.
        g_tab: Passed to ``FrameKmer.kmer_ratio`` as the second table.

    Returns:
        Tuple ``(mRNA_size, CDS_size, fickett_score, hexamer)`` where
        ``mRNA_size`` is ``len(seq)`` and the remaining values describe the
        longest ORF reported by ``orf.ORFFinder.longest_orf`` on the "+"
        direction.
    """
    start_codons = stt.strip().split(",")
    stop_codons = stp.strip().split(",")

    # The original also built a complement table with the Python 2-only
    # ``maketrans`` here, but never used it; only the "+" direction is searched.
    finder = orf.ORFFinder(seq.upper())
    cds_size, _cds_frame, cds_seq = finder.longest_orf(
        direction="+", start_coden=start_codons, stop_coden=stop_codons)

    fickett_score = fickett.fickett_value(cds_seq)
    hexamer = FrameKmer.kmer_ratio(cds_seq, 6, 3, c_tab, g_tab)
    return (len(seq), cds_size, fickett_score, hexamer)
def extract_feature_from_seq(seq, stt, stp, c_tab, g_tab):
    '''Extract features of a sequence taken from a FASTA entry.

    ``stt`` and ``stp`` are comma-separated codon strings; they are split into
    lists and handed to ``orf.ORFFinder.longest_orf`` (searched on the "+"
    direction only). ``c_tab`` and ``g_tab`` are forwarded unchanged to
    ``FrameKmer.kmer_ratio``.

    Returns the tuple ``(mRNA_size, CDS_size, fickett_score, hexamer)``, with
    ``mRNA_size`` equal to ``len(seq)``.
    '''
    stt_coden = stt.strip().split(',')
    stp_coden = stp.strip().split(',')
    mRNA_size = len(seq)

    # No reverse-complement table is needed: the unused ``maketrans`` call
    # (a Python 2-only name) was dropped together with the unused frame value.
    longest = orf.ORFFinder(seq.upper()).longest_orf(
        direction="+", start_coden=stt_coden, stop_coden=stp_coden)
    (CDS_size1, _unused_frame, CDS_seq1) = longest

    fickett_score1 = fickett.fickett_value(CDS_seq1)
    hexamer = FrameKmer.kmer_ratio(CDS_seq1, 6, 3, c_tab, g_tab)
    return (mRNA_size, CDS_size1, fickett_score1, hexamer)
def extract_feature_from_seq(seq, stt, stp, c_tab, g_tab, min_orf):
    '''Compute features for a single FASTA sequence.

    The upper-cased sequence is scanned by ``find_orfs.ORFFinder`` (sense
    strand only, up to three candidates requested) and the first candidate
    returned is scored.

    Returns ``None`` when no ORF candidate is reported, otherwise the tuple
    ``(mRNA_size, CDS_size, fickett_score, hexamer)``.
    '''
    upper_seq = seq.upper()
    transcript_len = len(seq)

    finder = find_orfs.ORFFinder(upper_seq, min_orf=min_orf)
    candidates = finder.orf_candidates(
        start_coden=stt, stop_coden=stp, antisense=False, n_candidate=3)
    if len(candidates) == 0:
        return None

    # Each candidate is a 6-item record; only its size and sequence are used.
    _strand, _frame, _orf_start, _orf_end, orf_len, orf_seq = candidates[0]

    fickett_score = fickett.fickett_value(orf_seq)
    hexamer_score = FrameKmer.kmer_ratio(orf_seq, 6, 3, c_tab, g_tab)
    return (transcript_len, orf_len, fickett_score, hexamer_score)
def extract_feature_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab):
    '''Extract coding-potential features for the transcript on one BED line.

    Args:
        inbed: One line of a 12-column BED file.
        refgenome: Reference genome FASTA path handed to ``pysam.faidx``.
        stt: Comma-separated start codons.
        stp: Comma-separated stop codons.
        c_tab: Passed to ``FrameKmer.kmer_ratio`` as the first table.
        g_tab: Passed to ``FrameKmer.kmer_ratio`` as the second table.

    Returns:
        ``(geneName, mRNA_size, CDS_size, fickett_score, hexamer)``, or
        ``None`` when the line is blank or cannot be parsed (a parse failure
        is also reported on stderr).
    '''
    # A blank line used to fall through to an UnboundLocalError further down.
    if not inbed.strip():
        return None

    stt_coden = stt.strip().split(',')
    stp_coden = stp.strip().split(',')

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        int(fields[2])  # column 3 is not used, only checked to be an integer
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        int(fields[9])  # column 10 is not used, only checked to be an integer
        exon_sizes = [int(n) for n in fields[10].rstrip(',\n').split(',')]
        exon_starts = [int(s) + tx_start
                       for s in fields[11].rstrip(',\n').split(',')]
    except (IndexError, ValueError):
        print("Wrong format!" + inbed, file=sys.stderr)
        return None

    exon_ends = [start + size for start, size in zip(exon_starts, exon_sizes)]
    mRNA_size = sum(exon_sizes)

    exon_seqs = []
    for st, end in zip(exon_starts, exon_ends):
        # BED starts are 0-based; the faidx region string is 1-based inclusive.
        exon_coord = chrom + ':' + str(st + 1) + '-' + str(end)
        fetched = pysam.faidx(refgenome, exon_coord)
        # Accept either a single FASTA-formatted string or a list of lines;
        # slicing a plain string with [1:] would leak the header into the
        # sequence.
        fasta_lines = fetched.split() if isinstance(fetched, str) else fetched
        exon_seqs.extend(line.rstrip('\n\r') for line in fasta_lines[1:])
    mRNA_seq = ''.join(exon_seqs)

    if strand == '-':
        transtab = str.maketrans("ACGTNX", "TGCANX")
        mRNA_seq = mRNA_seq.upper().translate(transtab)[::-1]

    finder = orf.ORFFinder(mRNA_seq)
    (CDS_size, _CDS_frame, CDS_seq) = finder.longest_orf(
        direction="+", start_coden=stt_coden, stop_coden=stp_coden)
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
def extract_feature_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab, min_orf):
    '''Extract coding-potential features for the transcript on one BED line.

    Args:
        inbed: One line of a 12-column BED file.
        refgenome: Reference genome FASTA path handed to ``pysam.faidx``.
        stt: Start codons, forwarded to ``orf_candidates`` as ``start_coden``.
        stp: Stop codons, forwarded to ``orf_candidates`` as ``stop_coden``.
        c_tab: Passed to ``FrameKmer.kmer_ratio`` as the first table.
        g_tab: Passed to ``FrameKmer.kmer_ratio`` as the second table.
        min_orf: Forwarded to ``find_orfs.ORFFinder`` as ``min_orf``.

    Returns:
        ``(geneName, mRNA_size, CDS_size, fickett_score, hexamer)``, or
        ``None`` when the line is blank, cannot be parsed (also reported on
        stderr), or yields no ORF candidate.
    '''
    # A blank line used to fall through to an UnboundLocalError further down.
    if not inbed.strip():
        return None

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        exon_sizes = [int(n) for n in fields[10].rstrip(',\n').split(',')]
        exon_starts = [int(s) + tx_start
                       for s in fields[11].rstrip(',\n').split(',')]
    except (IndexError, ValueError):
        print("Wrong format!" + inbed, file=sys.stderr)
        return None

    exon_ends = [start + size for start, size in zip(exon_starts, exon_sizes)]
    mRNA_size = sum(exon_sizes)

    exon_seqs = []
    for st, end in zip(exon_starts, exon_ends):
        # BED starts are 0-based; the faidx region string is 1-based inclusive.
        exon_coord = chrom + ':' + str(st + 1) + '-' + str(end)
        fetched = pysam.faidx(refgenome, exon_coord)
        # The first whitespace-separated token is the FASTA header; skip it.
        exon_seqs.extend(i.rstrip('\n\r') for i in fetched.split()[1:])
    mRNA_seq = ''.join(exon_seqs)

    if strand == '-':
        transtab = str.maketrans("ACGTNX", "TGCANX")
        mRNA_seq = mRNA_seq.upper().translate(transtab)[::-1]

    finder = find_orfs.ORFFinder(mRNA_seq, min_orf=min_orf)
    ORFs = finder.orf_candidates(
        start_coden=stt, stop_coden=stp, antisense=False, n_candidate=1)
    if not ORFs:
        return None

    # Each candidate is a 6-item record; only its size and sequence are used.
    (_direction, _frame, _ORF_start, _ORF_end, CDS_size, CDS_seq) = ORFs[0]
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
def extract_feature_from_seq(seq, c_tab, g_tab):
    '''Compute the nine per-transcript features for one FASTA sequence.

    Returns a list of nine zeros when ``find_longest`` reports ``-1``;
    otherwise the tuple ``(mRNA_size, orf_size, mean_orf_length,
    orf_coverage, fickett_score, hexamer, k34, k21, k6)``.
    '''
    transcript = seq.upper()
    transcript_len = len(seq)

    longest = orf_extraction.ORFFinder(transcript).find_longest()
    # -1 is returned in the case where a start codon has not been found.
    if longest == -1:
        return [0] * 9

    starts, orf_seq, orf_size, mean_orf_length, orf_coverage = longest

    fickett_score = fickett.fickett_value(orf_seq)
    k34, k21, k6 = kozak.find_kozak_feat(transcript, starts)
    hexamer = FrameKmer.kmer_ratio(orf_seq, 6, 3, c_tab, g_tab)

    return (
        transcript_len,
        orf_size,
        mean_orf_length,
        orf_coverage,
        fickett_score,
        hexamer,
        k34,
        k21,
        k6,
    )
def extract_CDS_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab, min_orf):
    '''Extract the annotated CDS of one BED line and compute its features.

    The CDS is taken from the thickStart/thickEnd columns (7 and 8) and
    assembled from the parts of each exon that overlap that interval.

    Args:
        inbed: One line of a 12-column BED file.
        refgenome: Reference genome FASTA path handed to ``pysam.faidx``.
        stt, stp, min_orf: Unused here; kept so the signature matches the
            sibling BED extractors.
        c_tab: Passed to ``FrameKmer.kmer_ratio`` as the first table.
        g_tab: Passed to ``FrameKmer.kmer_ratio`` as the second table.

    Returns:
        ``(geneName, mRNA_size, CDS_size, fickett_score, hexamer)``, or
        ``None`` when the line is blank or cannot be parsed (a parse failure
        is also reported on stderr).
    '''
    # A blank line used to fall through to an UnboundLocalError further down.
    if not inbed.strip():
        return None

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        cdsStart = int(fields[6])
        cdsEnd = int(fields[7])
        exon_sizes = [int(n) for n in fields[10].rstrip(',\n').split(',')]
        exon_starts = [int(s) + tx_start
                       for s in fields[11].rstrip(',\n').split(',')]
    except (IndexError, ValueError):
        print("Wrong format!" + inbed, file=sys.stderr)
        return None

    mRNA_size = sum(exon_sizes)

    cds_parts = []
    for base, offset in zip(exon_starts, exon_sizes):
        # Intersect the half-open exon [base, base + offset) with the
        # half-open CDS [cdsStart, cdsEnd).
        cds_exon_start = max(base, cdsStart)
        cds_exon_end = min(base + offset, cdsEnd)
        # An exon that only touches the CDS boundary (or an empty CDS) has no
        # coding bases; the previous strict </> tests let such exons through
        # and requested an empty "start+1-start" region.
        if cds_exon_start >= cds_exon_end:
            continue
        exon_coord = (chrom + ':' + str(cds_exon_start + 1) + '-'
                      + str(cds_exon_end))
        fetched = pysam.faidx(refgenome, exon_coord)
        # The first whitespace-separated token is the FASTA header; skip it.
        cds_parts.extend(i.rstrip('\n\r') for i in fetched.split()[1:])
    CDS_seq = ''.join(cds_parts)

    if strand == '-':
        transtab = str.maketrans("ACGTNX", "TGCANX")
        CDS_seq = CDS_seq.upper().translate(transtab)[::-1]

    CDS_size = len(CDS_seq)
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
def extract_feature_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab):
    """Extract coding-potential features for the transcript on one BED line.

    Written so that parsing and error reporting behave the same on Python 2
    and Python 3 (no ``print >>`` statement, no slicing of ``map`` results).

    Args:
        inbed: One line of a 12-column BED file.
        refgenome: Reference genome FASTA path handed to ``pysam.faidx``.
        stt: Comma-separated start codons.
        stp: Comma-separated stop codons.
        c_tab: Passed to ``FrameKmer.kmer_ratio`` as the first table.
        g_tab: Passed to ``FrameKmer.kmer_ratio`` as the second table.

    Returns:
        ``(geneName, mRNA_size, CDS_size, fickett_score, hexamer)``, or
        ``None`` when the line is blank or cannot be parsed (a parse failure
        is also reported on stderr).
    """
    # A blank line used to fall through to an unbound-variable error.
    if not inbed.strip():
        return None

    stt_coden = stt.strip().split(",")
    stp_coden = stp.strip().split(",")

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        int(fields[2])  # column 3 is not used, only checked to be an integer
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        int(fields[9])  # column 10 is not used, only checked to be an integer
        exon_sizes = [int(n) for n in fields[10].rstrip(",\n").split(",")]
        exon_starts = [int(s) + tx_start
                       for s in fields[11].rstrip(",\n").split(",")]
    except (IndexError, ValueError):
        sys.stderr.write("Wrong format!" + inbed + "\n")
        return None

    exon_ends = [start + size for start, size in zip(exon_starts, exon_sizes)]
    mRNA_size = sum(exon_sizes)

    exon_seqs = []
    for st, end in zip(exon_starts, exon_ends):
        # BED starts are 0-based; the faidx region string is 1-based inclusive.
        exon_coord = chrom + ":" + str(st + 1) + "-" + str(end)
        fetched = pysam.faidx(refgenome, exon_coord)
        # Accept either a single FASTA-formatted string or a list of lines;
        # slicing a plain string with [1:] would leak the header into the
        # sequence.
        fasta_lines = fetched.split() if isinstance(fetched, str) else fetched
        exon_seqs.extend(line.rstrip("\n\r") for line in fasta_lines[1:])
    mRNA_seq = "".join(exon_seqs)

    if strand == "-":
        # NOTE(review): `maketrans` is resolved from module scope (presumably
        # Python 2's string.maketrans) - confirm against the file's imports.
        transtab = maketrans("ACGTNX", "TGCANX")
        mRNA_seq = mRNA_seq.upper().translate(transtab)[::-1]

    finder = orf.ORFFinder(mRNA_seq)
    (CDS_size, _CDS_frame, CDS_seq) = finder.longest_orf(
        direction="+", start_coden=stt_coden, stop_coden=stp_coden)
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
def mainProcess(input, output, number, c_tab, g_tab, codonArr, hash_matrix, classifier):
    """Compute features for a batch of transcripts, classify them and write scores.

    Args:
        input: When ``number == 1``, a sequence of alternating header and
            sequence lines. When ``number > 1``, the same content as a single
            newline-joined string whose last ``'\\n'``-separated piece is
            discarded.
        output: Output file name (``number == 1``), or the base name used to
            build ``<output>_Tmp_Dir/<output><number>`` (``number > 1``).
        number: Batch number; must be >= 1.
        c_tab: Forwarded to ``mcssProcess``.
        g_tab: Forwarded to ``mcssProcess``.
        codonArr: Unused; kept for interface compatibility.
        hash_matrix: Forwarded to ``HexamerFeatures``.
        classifier: Object providing ``predict_proba`` and ``predict``.

    Each header line is split on whitespace: the first token is the
    transcript id and every following token is parsed as an integer segment
    length used to cut the sequence into consecutive pieces.

    Raises:
        ValueError: If ``number`` is less than 1 (previously an
            UnboundLocalError), or if a header yields no non-empty segment
            (``max()`` of an empty list).
    """
    if number > 1:
        score_path = '' + output + '_Tmp_Dir' + '/' + output + str(number)
        sequence_Arr = input.split('\n')
        # Drop the piece after the final newline.
        del sequence_Arr[len(sequence_Arr) - 1]
    elif number == 1:
        score_path = output
        sequence_Arr = input
    else:
        raise ValueError("number must be >= 1, got %r" % (number,))

    # Even positions hold headers, odd positions hold sequences.
    label_Arr_tmp = sequence_Arr[0::2]
    FastA_seq_Arr_tmp = sequence_Arr[1::2]

    # Compiled once instead of once per sequence.
    non_base = re.compile('[^agctn]')

    # The context manager closes the score file even if scoring fails.
    with open(score_path, 'w') as SCORE:
        data = []
        ids = []
        for i, label_line in enumerate(label_Arr_tmp):
            # Normalise to lowercase DNA: U -> T, anything unexpected -> N.
            tran_sec_seq = non_base.sub(
                'n', FastA_seq_Arr_tmp[i].lower().replace('u', 't'))
            tran_sec_seq2 = tran_sec_seq.upper()

            finder = orf.ORFFinder(tran_sec_seq2)
            (_CDS_start, _CDS_stop, CDS_size, _CDS_frame,
             CDS_seq) = finder.longest_orf(direction="+")
            (MCS, CSL, CP) = mcssProcess(tran_sec_seq2, c_tab, g_tab)
            fickett_score = fickett.fickett_value(CDS_seq)
            (orfscore, orfdistance) = HexamerFeatures(CDS_seq.lower(), hash_matrix)

            labels_Arr = label_line.split()
            ids.append(labels_Arr[0])

            Exons_mscore = []
            Exons_distance = []
            Exons_GC = []
            Site_start = 0
            for length_token in labels_Arr[1:]:
                seg_len = int(length_token)
                seg = tran_sec_seq[Site_start:Site_start + seg_len]
                if len(seg) == 0:
                    # Empty segments are skipped without advancing the offset.
                    continue
                Exons_GC.append(
                    (seg.count('c') + seg.count('g')) / float(len(seg)))
                (mscore, distance) = HexamerFeatures(seg, hash_matrix)
                Exons_mscore.append(mscore)
                Exons_distance.append(distance)
                Site_start = Site_start + seg_len

            Max_Mscore_exon = max(Exons_mscore)
            Max_distance = max(Exons_distance)
            Max_GCcontent = max(Exons_GC)
            orf_ratio = CDS_size / float(len(tran_sec_seq))

            data.append([
                CDS_size, orf_ratio, fickett_score, orfscore, orfdistance,
                Max_Mscore_exon, Max_distance, Max_GCcontent, MCS, CSL, CP
            ])

        testing_data = np.array(data)
        del data
        testing_data = testing_data.reshape(len(label_Arr_tmp), 11)
        prob = classifier.predict_proba(testing_data)
        labels = classifier.predict(testing_data)
        PrintResult(ids, labels, prob[:, 1], SCORE)
def mainProcess(input, output, number, c_tab, g_tab, codonArr, hash_matrix, classifier):
    '''Extract features for each transcript, run the classifier, write the scores.

    ``input`` is a sequence of alternating header/sequence lines when
    ``number == 1`` (results go to ``output``), or one newline-joined string
    when ``number > 1`` (results go to ``<output>_Tmp_Dir/<output><number>``;
    the piece after the last newline is discarded).

    A header line is whitespace-split: token 0 is the transcript id and the
    remaining tokens are integer segment lengths used to slice the sequence.
    ``c_tab``/``g_tab`` are forwarded to ``mcssProcess``, ``hash_matrix`` to
    ``HexamerFeatures``; ``codonArr`` is unused. ``classifier`` must provide
    ``predict_proba`` and ``predict``.

    Raises ValueError when ``number`` is below 1 (this used to surface as an
    UnboundLocalError) or when a header has no non-empty segment.
    '''
    if number > 1:
        Temp_Dir = output + '_Tmp_Dir'
        score_path = '' + Temp_Dir + '/' + output + str(number)
        sequence_Arr = input.split('\n')
        sequence_Arr.pop()  # discard the piece after the last newline
    elif number == 1:
        score_path = output
        sequence_Arr = input
    else:
        raise ValueError("number must be >= 1, got %r" % (number,))

    label_Arr_tmp = sequence_Arr[0::2]       # headers: even positions
    FastA_seq_Arr_tmp = sequence_Arr[1::2]   # sequences: odd positions

    # Hoisted out of the per-sequence loop.
    strinfo = re.compile('[^agctn]')

    data = []
    ids = []
    # `with` guarantees the score file is closed if anything below raises.
    with open(score_path, 'w') as SCORE:
        for i, header in enumerate(label_Arr_tmp):
            Seq = FastA_seq_Arr_tmp[i]
            # Lowercase DNA alphabet: U becomes T, unknown symbols become N.
            tran_sec_seq = strinfo.sub('n', Seq.lower().replace('u', 't'))
            tran_sec_seq2 = tran_sec_seq.upper()

            longest = orf.ORFFinder(tran_sec_seq2).longest_orf(direction="+")
            (_start, _stop, CDS_size, _frame, CDS_seq) = longest
            (MCS, CSL, CP) = mcssProcess(tran_sec_seq2, c_tab, g_tab)
            fickett_score = fickett.fickett_value(CDS_seq)
            (orfscore, orfdistance) = HexamerFeatures(CDS_seq.lower(), hash_matrix)

            labels_Arr = header.split()
            ids.append(labels_Arr[0])

            Exons_mscore = []
            Exons_distance = []
            Exons_GC = []
            Site_start = 0
            for token in labels_Arr[1:]:
                step = int(token)
                piece = tran_sec_seq[Site_start:Site_start + step]
                if len(piece) > 0:
                    GCnum = piece.count('c') + piece.count('g')
                    Exons_GC.append(GCnum / float(len(piece)))
                    (mscore, distance) = HexamerFeatures(piece, hash_matrix)
                    Exons_mscore.append(mscore)
                    Exons_distance.append(distance)
                    # The offset only moves past non-empty pieces.
                    Site_start = Site_start + step

            full_len = len(tran_sec_seq)
            data.append([
                CDS_size,
                CDS_size / float(full_len),
                fickett_score,
                orfscore,
                orfdistance,
                max(Exons_mscore),
                max(Exons_distance),
                max(Exons_GC),
                MCS,
                CSL,
                CP,
            ])

        testing_data = np.array(data)
        del data
        testing_data = testing_data.reshape(len(label_Arr_tmp), 11)
        prob = classifier.predict_proba(testing_data)
        labels = classifier.predict(testing_data)
        PrintResult(ids, labels, prob[:, 1], SCORE)
def mainProcess(input, output, number, c_tab, g_tab, codonArr, hash_matrix, mRNA_num):
    '''Extract features for each transcript and write one labelled row per transcript.

    ``input`` is a sequence of alternating header/sequence lines when
    ``number == 1`` (rows go to ``output``), or one newline-joined string
    when ``number > 1`` (rows go to ``<output>_Tmp_Dir/<output><number>``;
    the piece after the last newline is discarded).

    A header line is whitespace-split; tokens after the first are integer
    segment lengths used to slice the sequence. ``c_tab``/``g_tab`` are
    forwarded to ``mcssProcess``, ``hash_matrix`` to ``HexamerFeatures``;
    ``codonArr`` is unused.

    Transcripts whose index within this batch is below ``mRNA_num / 2`` are
    written with label ``+1``, all others with ``-1``. Each row is the label
    followed by the eleven feature values, space-separated.

    Raises ValueError when ``number`` is below 1 (this used to surface as an
    UnboundLocalError) or when a header has no non-empty segment.
    '''
    mRNA_num = mRNA_num / 2

    if number > 1:
        Temp_Dir = output + '_Tmp_Dir'
        feature_path = '' + Temp_Dir + '/' + output + str(number)
        sequence_Arr = input.split('\n')
        sequence_Arr.pop()  # discard the piece after the last newline
    elif number == 1:
        feature_path = output
        sequence_Arr = input
    else:
        raise ValueError("number must be >= 1, got %r" % (number,))

    label_Arr_tmp = sequence_Arr[0::2]       # headers: even positions
    FastA_seq_Arr_tmp = sequence_Arr[1::2]   # sequences: odd positions

    # Hoisted out of the per-sequence loop.
    strinfo = re.compile('[^agctn]')

    # `with` guarantees the feature file is closed if anything below raises.
    with open(feature_path, 'w') as DATA:
        for i, header in enumerate(label_Arr_tmp):
            Seq = FastA_seq_Arr_tmp[i]
            # Lowercase DNA alphabet: U becomes T, unknown symbols become N.
            tran_sec_seq = strinfo.sub('n', Seq.lower().replace('u', 't'))
            tran_sec_seq2 = tran_sec_seq.upper()

            longest = orf.ORFFinder(tran_sec_seq2).longest_orf(direction="+")
            (_start, _stop, CDS_size, _frame, CDS_seq) = longest
            (MCS, CSL, CP) = mcssProcess(tran_sec_seq2, c_tab, g_tab)
            fickett_score = fickett.fickett_value(CDS_seq)
            (orfscore, orfdistance) = HexamerFeatures(CDS_seq.lower(), hash_matrix)

            labels_Arr = header.split()
            Exons_mscore = []
            Exons_distance = []
            Exons_GC = []
            Site_start = 0
            for token in labels_Arr[1:]:
                step = int(token)
                piece = tran_sec_seq[Site_start:Site_start + step]
                if len(piece) > 0:
                    GCnum = piece.count('c') + piece.count('g')
                    Exons_GC.append(GCnum / float(len(piece)))
                    (mscore, distance) = HexamerFeatures(piece, hash_matrix)
                    Exons_mscore.append(mscore)
                    Exons_distance.append(distance)
                    # The offset only moves past non-empty pieces.
                    Site_start = Site_start + step

            Max_Mscore_exon = max(Exons_mscore)
            Max_distance = max(Exons_distance)
            Max_GCcontent = max(Exons_GC)
            full_len = len(tran_sec_seq)
            orf_ratio = CDS_size / float(full_len)

            # Only the class label differs between the two kinds of row.
            class_label = '+1' if i < mRNA_num else '-1'
            features = [
                CDS_size, orf_ratio, fickett_score, orfscore, orfdistance,
                Max_Mscore_exon, Max_distance, Max_GCcontent, MCS, CSL, CP
            ]
            DATA.write(
                class_label + ' ' + ' '.join(str(v) for v in features) + '\n')
def _report_orfs(name, seq, no_orf_record, options, start_codons, stop_codons,
                 coding, noncoding, SEQOUT, INFOUT, NOORF):
    """Find the top ORF candidates of one RNA and append them to the outputs.

    Writes one row per ORF to INFOUT and one FASTA record per ORF to SEQOUT.
    When no ORF is found, logs a warning, writes ``no_orf_record`` to NOORF
    and returns False; otherwise returns True.
    """
    RNA_len = len(seq)
    finder = find_orfs.ORFFinder(seq=seq, min_orf=options.min_orf_len)
    ORFs = finder.orf_candidates(antisense=options.antisense,
                                 n_candidate=options.n_top_orf,
                                 start_coden=start_codons,
                                 stop_coden=stop_codons)
    if len(ORFs) == 0:
        logging.warning("No ORFs found for %s" % name)
        print(no_orf_record, file=NOORF)
        return False

    # ORF serial number, starting from 1
    for orf_sn, orf in enumerate(ORFs, start=1):
        # (direction, frame_number+1, orf_start, orf_end, L, sequence)
        orf_seq = orf[-1]
        if orf[0] == '+':
            # change 0-based into 1-based to be consistent with NCBI ORFfinder
            # output (https://www.ncbi.nlm.nih.gov/orffinder/)
            orf[2] = orf[2] + 1
        elif orf[0] == '-':
            orf[2] = RNA_len - (orf[2])
            orf[3] = RNA_len - orf[3] + 1

        orf_id = (name + '_ORF_' + str(orf_sn) + '\t' + str(RNA_len) + '\t'
                  + '\t'.join([str(i) for i in orf[:-1]]))
        fickett_score = fickett.fickett_value(orf_seq)
        hexamer_score = FrameKmer.kmer_ratio(orf_seq, 6, 3, coding, noncoding)
        print(orf_id + '\t' + str(fickett_score) + '\t' + str(hexamer_score),
              file=INFOUT)
        print(">" + orf_id, file=SEQOUT)
        print('\n'.join(wrap(orf_seq, width=options.line_width)), file=SEQOUT)
    return True


def main():
    """Command-line entry point: find ORFs, score them, and pick the best per RNA.

    Steps, in order:
      1. Parse and validate the command-line options and set up logging to
         the log file and to the console.
      2. Read the hexamer frequency table.
      3. For every RNA in the FASTA or BED input, write its ORF candidates to
         ``<prefix>.ORF_seqs.fa`` and ``<prefix>.ORF_info.tsv``; RNAs without
         ORFs are listed in ``<prefix>.no_ORF.txt``.
      4. Call ``coding_prediction`` to produce ``<prefix>.ORF_prob.tsv``.
      5. Write the best ORF of each RNA (by coding probability or by ORF
         length, per ``--best-orf``) to ``<prefix>.ORF_prob.best.tsv``.
      6. Call ``finish_up``.

    All validation failures exit with status 0, as before.
    """
    usage = "\n%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-g", "--gene", action="store", type="string", dest="gene_file",
        help="Genomic sequnence(s) of RNA in FASTA (https://en.wikipedia.org/wiki/FASTA_format) or standard 12-column BED (https://genome.ucsc.edu/FAQ/FAQformat.html#format1) format. It is recommended to use *short* and *unique* sequence identifiers (such as Ensembl transcript id) in FASTA and BED file. If this is a BED file, reference genome ('-r/--ref') should be specified. The input FASTA or BED file could be a regular text file or compressed file (*.gz, *.bz2) or accessible URL (http://, https://, ftp://). URL file cannot be a compressed file.")
    parser.add_option(
        "-o", "--outfile", action="store", type="string", dest="out_file",
        help="The prefix of output files.")
    parser.add_option(
        "-d", "--logitModel", action="store", dest="logit_model",
        help="Logistic regression model. The prebuilt models for Human, Mouse, Fly, Zebrafish are availablel. Run 'make_logitModel.py' to build logistic regression model for your own training datset.")
    parser.add_option(
        "-x", "--hex", action="store", dest="hexamer_dat",
        help="The hexamer frequency table. The prebuilt tables for Human, Mouse, Fly, Zebrafish are availablel. Run 'make_hexamer_tab.py' to make this table for your own training dataset.")
    parser.add_option(
        "-r", "--ref", action="store", dest="ref_genome",
        help="Reference genome sequences in FASTA format. Reference genome file will be indexed automatically if the index file ( *.fai) does not exist. Will be ignored if FASTA file was provided to '-g/--gene'.")
    parser.add_option(
        "--antisense", action="store_true", dest="antisense", default=False,
        help="Logical to determine whether to search for ORFs from the anti-sense strand. *Sense strand* (or coding strand) is DNA strand that carries the translatable code in the 5′ to 3′ direction. default=False (i.e. only search for ORFs from the sense strand)")
    parser.add_option(
        "--start", action="store", type="string", dest="start_codons",
        default='ATG',
        help="Start codon (use 'T' instead of 'U') used to define the start of open reading frame (ORF). default=%default")
    parser.add_option(
        "--stop", action="store", type="string", dest="stop_codons",
        default='TAG,TAA,TGA',
        help="Stop codon (use 'T' instead of 'U') used to define the end of open reading frame (ORF). Multiple stop codons are separated by ','. default=%default")
    parser.add_option(
        "--min-orf", action="store", type="int", dest="min_orf_len",
        default=75,
        help="Minimum ORF length in nucleotides. default=%default")
    parser.add_option(
        "--top-orf", action="store", type="int", dest="n_top_orf", default=5,
        help="Number of ORF candidates reported. RNAs may have dozens of putative ORFs, in most cases, the real ORF is ranked (by size) in the top several. It is not necessary to calculate \"Fickett score\", \"Hexamer score\" and \"coding probability\" for every ORF. default=%default")
    parser.add_option(
        "--width", action="store", type="int", dest="line_width", default=100,
        help="Line width of output ORFs in FASTA format. default=%default")
    parser.add_option(
        "--log-file", action="store", type="string", dest="log_file",
        default='CPAT_run_info.log',
        help="Name of log file. default=\"%default\"")
    parser.add_option(
        "--best-orf", action="store", type="string", dest="mode", default='p',
        help="Criteria to select the best ORF: \"l\"=length, selection according to the \"ORF length\"; \"p\"=probability, selection according to the \"coding probability\". default=\"%default\"")
    parser.add_option(
        "--verbose", action="store_true", dest="debug", default=False,
        help="Logical to determine if detailed running information is printed to screen.")

    (options, args) = parser.parse_args()

    required = [options.gene_file, options.hexamer_dat, options.logit_model,
                options.out_file]
    if not all(required):
        parser.print_help()
        sys.exit(0)

    if options.line_width < 1:
        # This used to exit without any explanation.
        print("--width must be a positive integer.", file=sys.stderr)
        sys.exit(0)

    if options.mode not in ["p", "l"]:
        print("Please specifiy either \"p\" or \"l\" to --best-orf.",
              file=sys.stderr)
        sys.exit(0)

    # logging to file
    log_level = logging.DEBUG if options.debug else logging.INFO
    logging.basicConfig(filename='%s' % options.log_file,
                        filemode='w',
                        format="%(asctime)s [%(levelname)s]  %(message)s",
                        datefmt='%Y-%m-%d %I:%M:%S',
                        level=log_level)
    # logging to console
    logFormat = logging.Formatter("%(asctime)s [%(levelname)s]  %(message)s",
                                  datefmt='%Y-%m-%d %I:%M:%S')
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormat)
    logging.getLogger().addHandler(consoleHandler)

    logging.info("Running CPAT version %s..." % (__version__))
    start_codons = options.start_codons.replace(' ', '').split(',')
    stop_codons = options.stop_codons.replace(' ', '').split(',')

    # The context manager closes all three outputs (the no-ORF list was never
    # closed before) on every exit path, including the sys.exit() calls below.
    with open(options.out_file + '.ORF_seqs.fa', 'w') as SEQOUT, \
            open(options.out_file + '.ORF_info.tsv', 'w') as INFOUT, \
            open(options.out_file + '.no_ORF.txt', 'w') as NOORF:

        logging.info("Start codons used: [%s]" % ','.join(start_codons))
        logging.info("Stop codons used: [%s]" % ','.join(stop_codons))

        # build hexamer table from hexamer frequency file
        logging.info("Reading %s" % options.hexamer_dat)
        coding = {}
        noncoding = {}
        with open(options.hexamer_dat) as hexamer_file:
            for line in hexamer_file:
                fields = line.split()
                # Skip blank lines (previously an IndexError) and the header.
                if not fields or fields[0] == 'hexamer':
                    continue
                coding[fields[0]] = float(fields[1])
                noncoding[fields[0]] = float(fields[2])

        ## do NOT change these labels, they are R variable names in the model.
        info_header = "\t".join([
            "ID", "mRNA", "ORF_strand", "ORF_frame", "ORF_start", "ORF_end",
            "ORF", "Fickett", "Hexamer"
        ])

        count = 0
        logging.info("Checking format of \"%s\"" % options.gene_file)
        file_format = bed_or_fasta(options.gene_file)
        if file_format == 'UNKNOWN':
            logging.error("Unknown file format:%s" % options.gene_file)
            sys.exit(0)

        elif file_format == 'FASTA':
            logging.info("Input gene file is in FASTA format")
            if options.ref_genome:
                logging.warning(
                    "\"%s\" is a sequence file. The reference genome file \"%s\" will be ignored."
                    % (options.gene_file, options.ref_genome))

            logging.info("Searching for ORFs ...")
            print(info_header, file=INFOUT)
            for name, seq in FrameKmer.seq_generator(options.gene_file):
                count += 1
                found = _report_orfs(name, seq, name, options, start_codons,
                                     stop_codons, coding, noncoding,
                                     SEQOUT, INFOUT, NOORF)
                if found:
                    print("%d sequences finished\r" % count, end=' ',
                          file=sys.stderr)
            print("\n", file=sys.stderr)

        elif file_format == 'BED':
            logging.info("Input gene file is in BED format")
            if not options.ref_genome:
                logging.error(
                    "Reference genome file (-r/--ref) must be provided.")
                parser.print_help()
                sys.exit(0)

            logging.info("Searching for ORFs ...")
            print(info_header, file=INFOUT)

            index_fasta(options.ref_genome)

            for line in ireader.reader(options.gene_file):
                count += 1
                if line.startswith(('track', '#', 'browser')):
                    continue
                name, seq = seq_from_bed(line, options.ref_genome)
                # For BED input the whole input line is recorded when no ORF
                # is found.
                found = _report_orfs(name, seq, line, options, start_codons,
                                     stop_codons, coding, noncoding,
                                     SEQOUT, INFOUT, NOORF)
                if found:
                    print("%d rows finished\r" % count, end=' ',
                          file=sys.stderr)
            print("\n", file=sys.stderr)

    logging.info("Calculate coding probability ...")
    # output options.out_file + '.ORF_prob.tsv'
    coding_prediction(options.logit_model,
                      options.out_file + '.ORF_info.tsv',
                      options.out_file)

    if options.mode == 'p':
        logging.info("Select ORF with the highest coding probability ...")
        col_index = 9
    elif options.mode == 'l':
        logging.info("Select the longest ORF ...")
        col_index = 6

    best_candidates = {}
    with open(options.out_file + '.ORF_prob.best.tsv', 'w') as BEST, \
            open(options.out_file + '.ORF_prob.tsv', 'r') as PROB:
        for l in PROB:
            l = l.strip()
            if not l:
                continue
            f = l.split('\t')
            # The header is the row whose first column is exactly "ID". A
            # prefix test would also swallow data rows whose sequence ID
            # merely starts with "ID".
            if f[0] == 'ID':
                print("seq_ID\t" + l, file=BEST)
                continue
            # ORF ids are "<seq_id>_ORF_<n>"; split on the last marker so a
            # sequence ID that itself contains "_ORF_" is kept intact.
            seq_id = f[0].rsplit('_ORF_', 1)[0]
            prob = float(f[col_index])
            if (seq_id not in best_candidates
                    or prob > float(best_candidates[seq_id][col_index])):
                best_candidates[seq_id] = f

        for k, v in best_candidates.items():
            print(k + '\t' + '\t'.join(v), file=BEST)

    logging.info("Done!")

    finish_up(options.out_file, options.n_top_orf, options.min_orf_len)