def extract_feature_from_seq(seq, stt, stp, c_tab, g_tab):
    """Extract features of one sequence taken from a FASTA entry.

    Args:
        seq: Transcript sequence; it is upper-cased before the ORF search.
        stt: Comma-separated start codons (e.g. "ATG").
        stp: Comma-separated stop codons (e.g. "TAG,TAA,TGA").
        c_tab: First hexamer table handed to FrameKmer.kmer_ratio.
        g_tab: Second hexamer table handed to FrameKmer.kmer_ratio.

    Returns:
        Tuple (mRNA_size, CDS_size, fickett_score, hexamer) where mRNA_size
        is len(seq) and CDS_size is the size reported by
        ORFFinder.longest_orf.
    """
    stt_coden = stt.strip().split(",")
    stp_coden = stp.strip().split(",")
    mRNA_seq = seq.upper()
    mRNA_size = len(seq)

    # No complement table is built here: longest_orf is only asked for
    # direction "+", so the sequence is never reverse-complemented.
    finder = orf.ORFFinder(mRNA_seq)
    CDS_size1, _CDS_frame1, CDS_seq1 = finder.longest_orf(
        direction="+", start_coden=stt_coden, stop_coden=stp_coden)

    fickett_score1 = fickett.fickett_value(CDS_seq1)
    hexamer = FrameKmer.kmer_ratio(CDS_seq1, 6, 3, c_tab, g_tab)
    return (mRNA_size, CDS_size1, fickett_score1, hexamer)
def extract_feature_from_seq(seq, stt, stp, c_tab, g_tab):
    '''Extract features of one sequence taken from a FASTA entry.

    seq is the transcript sequence, stt / stp are comma-separated start and
    stop codons, and c_tab / g_tab are the two hexamer tables forwarded to
    FrameKmer.kmer_ratio.

    Returns the tuple (mRNA_size, CDS_size, fickett_score, hexamer);
    mRNA_size is len(seq) and CDS_size comes from ORFFinder.longest_orf.
    '''
    start_codons = stt.strip().split(',')
    stop_codons = stp.strip().split(',')
    mRNA_seq = seq.upper()

    # Only direction "+" is requested, so no complement translation table
    # is needed (the previously built one was never used).
    longest = orf.ORFFinder(mRNA_seq).longest_orf(
        direction="+", start_coden=start_codons, stop_coden=stop_codons)
    CDS_size1, _CDS_frame1, CDS_seq1 = longest

    fickett_score1 = fickett.fickett_value(CDS_seq1)
    hexamer = FrameKmer.kmer_ratio(CDS_seq1, 6, 3, c_tab, g_tab)
    return (len(seq), CDS_size1, fickett_score1, hexamer)
def extract_feature_from_seq(seq, stt, stp, c_tab, g_tab, min_orf):
    '''Compute the feature tuple for one FASTA sequence.

    The upper-cased sequence is handed to find_orfs.ORFFinder (with the
    given min_orf); the first candidate returned by orf_candidates is used
    as the CDS.

    Returns None when no candidate is reported, otherwise the tuple
    (mRNA_size, CDS_size, fickett_score, hexamer).
    '''
    transcript = seq.upper()
    transcript_len = len(seq)

    finder = find_orfs.ORFFinder(transcript, min_orf=min_orf)
    candidates = finder.orf_candidates(start_coden=stt,
                                       stop_coden=stp,
                                       antisense=False,
                                       n_candidate=3)
    if len(candidates) == 0:
        return None

    # Each candidate is a 6-tuple; only its size and sequence are needed.
    _direction, _frame, _orf_start, _orf_end, cds_len, cds = candidates[0]

    fickett_score = fickett.fickett_value(cds)
    hexamer_score = FrameKmer.kmer_ratio(cds, 6, 3, c_tab, g_tab)
    return (transcript_len, cds_len, fickett_score, hexamer_score)
def extract_feature_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab):
    '''Extract features of the transcript described by one BED12 line.

    Args:
        inbed: One line of a 12-column BED file.
        refgenome: Path of the reference genome FASTA given to pysam.faidx.
        stt: Comma-separated start codons.
        stp: Comma-separated stop codons.
        c_tab: First hexamer table forwarded to FrameKmer.kmer_ratio.
        g_tab: Second hexamer table forwarded to FrameKmer.kmer_ratio.

    Returns:
        Tuple (geneName, mRNA_size, CDS_size, fickett_score, hexamer), or
        None when the line is blank or cannot be parsed.
    '''
    # A blank line has nothing to parse; previously this fell through to an
    # UnboundLocalError on exon_sizes.
    if not inbed.strip():
        return None

    stt_coden = stt.strip().split(',')
    stp_coden = stp.strip().split(',')
    transtab = str.maketrans("ACGTNX", "TGCANX")

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        int(fields[2])  # txEnd: parsed only to validate the column
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        int(fields[9])  # blockCount: parsed only to validate the column
        exon_sizes = [int(i) for i in fields[10].rstrip(',\n').split(',')]
        exon_offsets = [int(i) for i in fields[11].rstrip(',\n').split(',')]
    except (IndexError, ValueError):
        print("Wrong format!" + inbed, file=sys.stderr)
        return None

    # blockStarts are relative to chromStart; convert to genome coordinates.
    exon_starts = [tx_start + offset for offset in exon_offsets]
    exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]
    mRNA_size = sum(exon_sizes)

    pieces = []
    for st, end in zip(exon_starts, exon_ends):
        # BED starts are 0-based, faidx regions are 1-based inclusive.
        exon_coord = chrom + ':' + str(st + 1) + '-' + str(end)
        fetched = pysam.faidx(refgenome, exon_coord)
        # pysam.faidx may hand back one string instead of a list of lines;
        # slicing a string would drop a single character, not the header.
        if isinstance(fetched, str):
            fetched = fetched.split()
        pieces.extend(i.rstrip('\n\r') for i in fetched[1:])

    # Upper-case on both strands, as extract_feature_from_seq does for
    # FASTA input (genome FASTA files may contain lower-case bases).
    mRNA_seq = ''.join(pieces).upper()
    if strand == '-':
        mRNA_seq = mRNA_seq.translate(transtab)[::-1]

    finder = orf.ORFFinder(mRNA_seq)
    CDS_size, _CDS_frame, CDS_seq = finder.longest_orf(
        direction="+", start_coden=stt_coden, stop_coden=stp_coden)
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
def extract_feature_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab, min_orf):
    '''Extract features of the transcript described by one BED12 line.

    The exon sequences are fetched from refgenome with pysam.faidx, joined
    into the mRNA sequence (reverse-complemented for '-' strand) and the
    first candidate reported by find_orfs.ORFFinder is used as the CDS.

    Args:
        inbed: One line of a 12-column BED file.
        refgenome: Path of the reference genome FASTA.
        stt: Start codons, forwarded unchanged to orf_candidates.
        stp: Stop codons, forwarded unchanged to orf_candidates.
        c_tab: First hexamer table forwarded to FrameKmer.kmer_ratio.
        g_tab: Second hexamer table forwarded to FrameKmer.kmer_ratio.
        min_orf: Minimum ORF length given to find_orfs.ORFFinder.

    Returns:
        Tuple (geneName, mRNA_size, CDS_size, fickett_score, hexamer), or
        None when the line is blank, malformed, or no ORF is found.
    '''
    # A blank line has nothing to parse; previously this fell through to an
    # UnboundLocalError on exon_sizes.
    if not inbed.strip():
        return None

    transtab = str.maketrans("ACGTNX", "TGCANX")
    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        exon_sizes = [int(i) for i in fields[10].rstrip(',\n').split(',')]
        exon_offsets = [int(i) for i in fields[11].rstrip(',\n').split(',')]
    except (IndexError, ValueError):
        print("Wrong format!" + inbed, file=sys.stderr)
        return None

    # blockStarts are relative to chromStart; convert to genome coordinates.
    exon_starts = [tx_start + offset for offset in exon_offsets]
    exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]
    mRNA_size = sum(exon_sizes)

    pieces = []
    for st, end in zip(exon_starts, exon_ends):
        # BED starts are 0-based, faidx regions are 1-based inclusive.
        exon_coord = chrom + ':' + str(st + 1) + '-' + str(end)
        tmp1 = pysam.faidx(refgenome, exon_coord)
        # First whitespace-separated token is the FASTA header; skip it.
        pieces.extend(tmp1.split()[1:])

    # Upper-case on both strands, as extract_feature_from_seq does for
    # FASTA input (genome FASTA files may contain lower-case bases).
    mRNA_seq = ''.join(pieces).upper()
    if strand == '-':
        mRNA_seq = mRNA_seq.translate(transtab)[::-1]

    finder = find_orfs.ORFFinder(mRNA_seq, min_orf=min_orf)
    ORFs = finder.orf_candidates(start_coden=stt,
                                 stop_coden=stp,
                                 antisense=False,
                                 n_candidate=1)
    if not ORFs:
        return None

    _direction, _frame, _ORF_start, _ORF_end, CDS_size, CDS_seq = ORFs[0]
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
def extract_feature_from_seq(seq, c_tab, g_tab):
    '''Compute the nine-feature record for one FASTA sequence.

    Returns a list of nine zeros when ORFFinder.find_longest() reports -1,
    otherwise the tuple (mRNA_size, orf_size, mean_orf_length,
    orf_coverage, fickett_score, hexamer, k34, k21, k6).
    '''
    mRNA_seq = seq.upper()
    mRNA_size = len(seq)

    longest = orf_extraction.ORFFinder(mRNA_seq).find_longest()
    # -1 is returned in the case where no start codon has been found.
    if longest == -1:
        return [0] * 9

    starts, orf_seq, orf_size, mean_orf_length, orf_coverage = longest

    fickett_score = fickett.fickett_value(orf_seq)
    kozak_features = kozak.find_kozak_feat(mRNA_seq, starts)
    k34, k21, k6 = kozak_features
    hexamer = FrameKmer.kmer_ratio(orf_seq, 6, 3, c_tab, g_tab)

    return (
        mRNA_size,
        orf_size,
        mean_orf_length,
        orf_coverage,
        fickett_score,
        hexamer,
        k34,
        k21,
        k6,
    )
def extract_CDS_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab, min_orf):
    '''Extract the annotated CDS of one BED12 line and compute its features.

    The CDS is the part of each exon that overlaps the thickStart/thickEnd
    interval (BED columns 7 and 8); no ORF search is performed.

    Args:
        inbed: One line of a 12-column BED file.
        refgenome: Path of the reference genome FASTA given to pysam.faidx.
        stt: Unused; kept so the signature matches extract_feature_from_bed.
        stp: Unused; kept for the same reason.
        c_tab: First hexamer table forwarded to FrameKmer.kmer_ratio.
        g_tab: Second hexamer table forwarded to FrameKmer.kmer_ratio.
        min_orf: Unused; kept for signature compatibility.

    Returns:
        Tuple (geneName, mRNA_size, CDS_size, fickett_score, hexamer), or
        None when the line is blank or cannot be parsed.
    '''
    # A blank line has nothing to parse; previously this fell through to an
    # UnboundLocalError on exon_sizes.
    if not inbed.strip():
        return None

    transtab = str.maketrans("ACGTNX", "TGCANX")
    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        cdsStart = int(fields[6])
        cdsEnd = int(fields[7])
        exon_sizes = [int(i) for i in fields[10].rstrip(',\n').split(',')]
        exon_offsets = [int(i) for i in fields[11].rstrip(',\n').split(',')]
    except (IndexError, ValueError):
        print("Wrong format!" + inbed, file=sys.stderr)
        return None

    # blockStarts are relative to chromStart; convert to genome coordinates.
    exon_starts = [tx_start + offset for offset in exon_offsets]
    mRNA_size = sum(exon_sizes)

    pieces = []
    for base, offset in zip(exon_starts, exon_sizes):
        cds_exon_start = max(base, cdsStart)
        cds_exon_end = min(base + offset, cdsEnd)
        # Intervals are half-open, so an exon that merely touches the CDS
        # boundary (or an empty CDS) has no overlap.  The old "<" / ">"
        # tests let those through and requested a region with start > end.
        if cds_exon_start >= cds_exon_end:
            continue
        # BED starts are 0-based, faidx regions are 1-based inclusive.
        exon_coord = chrom + ':' + str(cds_exon_start + 1) + '-' + str(cds_exon_end)
        tmp1 = pysam.faidx(refgenome, exon_coord)
        # First whitespace-separated token is the FASTA header; skip it.
        pieces.extend(tmp1.split()[1:])

    # Upper-case on both strands (genome FASTA files may contain
    # lower-case bases, which the complement table does not cover).
    CDS_seq = ''.join(pieces).upper()
    if strand == '-':
        CDS_seq = CDS_seq.translate(transtab)[::-1]

    CDS_size = len(CDS_seq)
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
def extract_feature_from_bed(inbed, refgenome, stt, stp, c_tab, g_tab):
    """Extract features of the transcript described by one BED12 line.

    Ported from Python-2-only constructs (``print >>``, slicing the result
    of ``map``, bare ``maketrans``) to code that runs under Python 3 like
    the rest of this file.

    Args:
        inbed: One line of a 12-column BED file.
        refgenome: Path of the reference genome FASTA given to pysam.faidx.
        stt: Comma-separated start codons.
        stp: Comma-separated stop codons.
        c_tab: First hexamer table forwarded to FrameKmer.kmer_ratio.
        g_tab: Second hexamer table forwarded to FrameKmer.kmer_ratio.

    Returns:
        Tuple (geneName, mRNA_size, CDS_size, fickett_score, hexamer), or
        None when the line is blank or cannot be parsed.
    """
    # A blank line has nothing to parse; previously this fell through to an
    # unbound exon_sizes.
    if not inbed.strip():
        return None

    stt_coden = stt.strip().split(",")
    stp_coden = stp.strip().split(",")
    transtab = str.maketrans("ACGTNX", "TGCANX")

    try:
        fields = inbed.split()
        chrom = fields[0]
        tx_start = int(fields[1])
        int(fields[2])  # txEnd: parsed only to validate the column
        geneName = fields[3]
        strand = fields[5].replace(" ", "_")
        int(fields[9])  # blockCount: parsed only to validate the column
        exon_sizes = [int(i) for i in fields[10].rstrip(",\n").split(",")]
        exon_offsets = [int(i) for i in fields[11].rstrip(",\n").split(",")]
    except (IndexError, ValueError):
        # write() behaves the same on Python 2 and 3, unlike ``print >>``.
        sys.stderr.write("Wrong format!" + inbed + "\n")
        return None

    # blockStarts are relative to chromStart; convert to genome coordinates.
    exon_starts = [tx_start + offset for offset in exon_offsets]
    exon_ends = [st + size for st, size in zip(exon_starts, exon_sizes)]
    mRNA_size = sum(exon_sizes)

    pieces = []
    for st, end in zip(exon_starts, exon_ends):
        # BED starts are 0-based, faidx regions are 1-based inclusive.
        exon_coord = chrom + ":" + str(st + 1) + "-" + str(end)
        fetched = pysam.faidx(refgenome, exon_coord)
        # pysam.faidx may hand back one string instead of a list of lines;
        # slicing a string would drop a single character, not the header.
        if isinstance(fetched, str):
            fetched = fetched.split()
        pieces.extend(i.rstrip("\n\r") for i in fetched[1:])

    # Upper-case on both strands, as extract_feature_from_seq does for
    # FASTA input (genome FASTA files may contain lower-case bases).
    mRNA_seq = "".join(pieces).upper()
    if strand == "-":
        mRNA_seq = mRNA_seq.translate(transtab)[::-1]

    finder = orf.ORFFinder(mRNA_seq)
    CDS_size, _CDS_frame, CDS_seq = finder.longest_orf(
        direction="+", start_coden=stt_coden, stop_coden=stp_coden)
    fickett_score = fickett.fickett_value(CDS_seq)
    hexamer = FrameKmer.kmer_ratio(CDS_seq, 6, 3, c_tab, g_tab)
    return (geneName, mRNA_size, CDS_size, fickett_score, hexamer)
def main():
    """Command-line entry point: build the training table and logit model.

    Reads the coding ('-c') and non-coding ('-n') transcript files (BED or
    FASTA), computes (mRNA size, ORF size, Fickett score, hexamer score) for
    every transcript, writes them with a 0/1 label to
    '<out_file>.feature.xls' and finally calls make_logit().

    Prints the help text and exits with status 0 when a required option is
    missing, when a file format is unknown, or when a BED input is given
    without a reference genome.
    """
    usage = "\n%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-c", "--cgene", action="store", dest="coding_file",
        help="Protein coding transcripts (used to build logit model) either in BED format or mRNA sequences in FASTA format: If this is BED format file, '-r' must be specified; if this is mRNA sequence file in FASTA format, ignore the '-r' option. The input BED or FASTA file could be regular text file or compressed file (*.gz, *.bz2) or accessible url. NOTE: transcript ID should be unique.")
    parser.add_option(
        "-n", "--ngene", action="store", dest="noncoding_file",
        help="Non protein coding transcripts (used to build logit model) either in BED format or mRNA sequences in FASTA format: If this is BED format file, '-r' must be specified; if this is mRNA sequence file in FASTA format, ignore the '-r' option. The input BED or FASTA file could be regular text file or compressed file (*.gz, *.bz2) or accessible url. NOTE: transcript ID should be unique.")
    parser.add_option("-o", "--outfile", action="store", dest="out_file",
                      help="output prefix.")
    parser.add_option(
        "-x", "--hex", action="store", dest="hexamer_dat",
        help="Prebuilt hexamer frequency table (Human, Mouse, Fly, Zebrafish). Run 'make_hexamer_tab.py' to generate this table.")
    parser.add_option(
        "-r", "--ref", action="store", dest="ref_genome",
        help="Reference genome sequences in FASTA format. Ignore this option if mRNA sequences file was provided to '-g'. Reference genome file will be indexed automatically (produce *.fai file along with the original *.fa file within the same directory) if hasn't been done.")
    parser.add_option(
        "-s", "--start", action="store", dest="start_codons", default='ATG',
        help="Start codon (DNA sequence, so use 'T' instead of 'U') used to define open reading frame (ORF). default=%default")
    parser.add_option(
        "-t", "--stop", action="store", dest="stop_codons",
        default='TAG,TAA,TGA',
        help="Stop codon (DNA sequence, so use 'T' instead of 'U') used to define open reading frame (ORF). Multiple stop codons should be separated by ','. default=%default")
    (options, args) = parser.parse_args()

    # check input and output files
    required = (options.coding_file, options.noncoding_file,
                options.out_file, options.hexamer_dat)
    if not all(required):
        parser.print_help()
        sys.exit(0)

    # data used to build logit model
    train_data = []
    coding_label = 1
    noncoding_label = 0
    header = ['ID', 'mRNA', 'ORF', 'Fickett', 'Hexamer', 'Label']

    # build hexamer table from hexamer frequency file
    coding = {}
    noncoding = {}
    with open(options.hexamer_dat) as hexamer_fh:
        for line in hexamer_fh:
            fields = line.split()
            # Skip blank lines (previously an IndexError) and the header row.
            if not fields or fields[0] == 'hexamer':
                continue
            coding[fields[0]] = float(fields[1])
            noncoding[fields[0]] = float(fields[2])

    def _collect(path, label, flag, banner):
        """Append one labelled feature row per transcript in *path*."""
        count = 0
        print(banner + path, file=sys.stderr)
        file_format = bed_or_fasta(path)
        if file_format == 'UNKNOWN':
            # Name the option that was actually used; this parser has no '-g'.
            print("\nError: unknown file format of '%s'\n" % flag,
                  file=sys.stderr)
            parser.print_help()
            sys.exit(0)
        elif file_format == 'BED':
            print("Input gene file is in BED format", file=sys.stderr)
            if not options.ref_genome:
                print("\nError: Reference genome file must be provided\n",
                      file=sys.stderr)
                parser.print_help()
                sys.exit(0)
            index_fasta(options.ref_genome)
            for line in ireader.reader(path):
                count += 1
                if line.startswith(('track', '#', 'browser')):
                    continue
                if not line.strip():
                    continue
                features = extract_feature_from_bed(
                    line, options.ref_genome, options.start_codons,
                    options.stop_codons, coding, noncoding)
                # None signals a malformed line (already reported on
                # stderr); unpacking it used to raise TypeError.
                if features is None:
                    continue
                (gene_id, mRNA_size, CDS_size, fickett_score,
                 hexamer) = features
                train_data.append([gene_id, mRNA_size, CDS_size,
                                   fickett_score, hexamer, label])
                print("%d genes finished\r" % count, end=' ',
                      file=sys.stderr)
        elif file_format == 'FASTA':
            if options.ref_genome:
                print(
                    "Reference genome sequence [-r] and conservation score [-c] will be ignored when input genes are fasta format.",
                    file=sys.stderr)
            print("Input gene file is in FASTA format", file=sys.stderr)
            for sname, seq in FrameKmer.seq_generator(path):
                count += 1
                (mRNA_size, CDS_size, fickett_score,
                 hexamer) = extract_feature_from_seq(
                     seq=seq, stt=options.start_codons,
                     stp=options.stop_codons, c_tab=coding, g_tab=noncoding)
                train_data.append([sname, mRNA_size, CDS_size,
                                   fickett_score, hexamer, label])
                print("%d genes finished\r" % count, end=' ',
                      file=sys.stderr)

    # process protein coding transcripts
    _collect(options.coding_file, coding_label, '-c',
             "Process protein coding transcripts: ")
    # process non-protein coding transcripts
    _collect(options.noncoding_file, noncoding_label, '-n',
             "Process non coding transcripts: ")

    # writing data; the file is opened only now so that no handle is left
    # open if feature extraction fails
    with open(options.out_file + '.feature.xls', 'w') as TMP:
        print('\t'.join(header), file=TMP)
        for row in train_data:
            print('\t'.join([str(j) for j in row]), file=TMP)

    print("build logi model ...", file=sys.stderr)
    make_logit(options.out_file + '.feature.xls',
               options.out_file + '.make_logitModel.r',
               options.out_file + '.logit.RData', header)
def main():
    """Command-line entry point: score transcripts with a prebuilt model.

    Reads the gene file ('-g', BED or FASTA), writes one tab-separated row
    (ID, mRNA size, ORF size, Fickett score, hexamer score) per transcript
    to '<out_file>.dat' and then calls coding_prediction() with the logit
    model, that table and the output name.

    Prints the help text and exits with status 0 when a required option is
    missing, when the file format is unknown, or when a BED input is given
    without a reference genome.
    """
    usage = "\n%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-g", "--gene", action="store", dest="gene_file",
        help="RNAs either in BED or FASTA format: If this is BED format file, '-r/--ref' must also be specified; if this is RNA sequence file in FASTA format, ignore the ' r/--ref ' option. The input BED or FASTA file could be regular text file or compressed file (*.gz, *.bz2) or accessible url (http://, https://, ftp://).")
    parser.add_option(
        "-o", "--outfile", action="store", dest="out_file",
        help="output file. Tab separated text file: geneID <tab> mRNA size <tab> ORF size <tab> Fickett Score <tab> Hexamer Score<tab>Coding Probability.")
    parser.add_option(
        "-x", "--hex", action="store", dest="hexamer_dat",
        help="Prebuilt hexamer frequency table (Human, Mouse, Fly, Zebrafish). Run 'make_hexamer_tab.py' to make this table out of your own training dataset.")
    parser.add_option(
        "-d", "--logitModel", action="store", dest="logit_model",
        help="Prebuilt training model (Human, Mouse, Fly, Zebrafish). Run 'make_logitModel.py' to build logit model out of your own training datset")
    parser.add_option(
        "-r", "--ref", action="store", dest="ref_genome",
        help="Reference genome sequences in FASTA format. Ignore this option if FASTA file was provided to '-g/--gene'. Reference genome file will be indexed automatically (produce *.fai file along with the original *.fa file within the same directory) if hasn't been done.")
    parser.add_option(
        "-s", "--start", action="store", dest="start_codons", default='ATG',
        help="Start codon (DNA sequence, so use 'T' instead of 'U') used to define open reading frame (ORF). default=%default")
    parser.add_option(
        "-t", "--stop", action="store", dest="stop_codons",
        default='TAG,TAA,TGA',
        help="Stop codon (DNA sequence, so use 'T' instead of 'U') used to define open reading frame (ORF). Multiple stop codons should be separated by ','. default=%default")
    (options, args) = parser.parse_args()

    # check input and output files
    required = (options.gene_file, options.out_file, options.logit_model,
                options.hexamer_dat)
    if not all(required):
        parser.print_help()
        sys.exit(0)

    # build hexamer table from hexamer frequency file
    coding = {}
    noncoding = {}
    with open(options.hexamer_dat) as hexamer_fh:
        for line in hexamer_fh:
            fields = line.split()
            # Skip blank lines (previously an IndexError) and the header row.
            if not fields or fields[0] == 'hexamer':
                continue
            coding[fields[0]] = float(fields[1])
            noncoding[fields[0]] = float(fields[2])

    count = 0
    # The context manager closes the table even when we exit early below.
    with open(options.out_file + '.dat', 'w') as TMP:
        file_format = bed_or_fasta(options.gene_file)
        if file_format == 'UNKNOWN':
            print("\nError: unknown file format of '-g'\n", file=sys.stderr)
            parser.print_help()
            sys.exit(0)
        elif file_format == 'BED':
            print("Input gene file is in BED format", file=sys.stderr)
            if not options.ref_genome:
                print("\nError: Reference genome file must be provided\n",
                      file=sys.stderr)
                parser.print_help()
                sys.exit(0)
            index_fasta(options.ref_genome)
            for line in ireader.reader(options.gene_file):
                count += 1
                if line.startswith(('track', '#', 'browser')):
                    continue
                if not line.strip():
                    continue
                features = extract_feature_from_bed(
                    line, options.ref_genome, options.start_codons,
                    options.stop_codons, coding, noncoding)
                # None signals a malformed line (already reported on
                # stderr); unpacking it used to raise TypeError.
                if features is None:
                    continue
                print('\t'.join(str(i) for i in features), file=TMP)
                print("%d genes finished\r" % count, end=' ',
                      file=sys.stderr)
        elif file_format == 'FASTA':
            if options.ref_genome:
                print(
                    "Reference genome sequence [-r] and conservation score [-c] will be ignored when input genes are fasta format.",
                    file=sys.stderr)
            print("Input gene file is in FASTA format", file=sys.stderr)
            for sname, seq in FrameKmer.seq_generator(options.gene_file):
                count += 1
                (mRNA_size, CDS_size, fickett_score,
                 hexamer) = extract_feature_from_seq(
                     seq=seq, stt=options.start_codons,
                     stp=options.stop_codons, c_tab=coding, g_tab=noncoding)
                print('\t'.join(
                    str(i) for i in (sname, mRNA_size, CDS_size,
                                     fickett_score, hexamer)), file=TMP)
                print("%d genes finished\r" % count, end=' ',
                      file=sys.stderr)

    coding_prediction(options.logit_model, options.out_file + '.dat',
                      options.out_file)
def main():
    """Command-line entry point: build the training table and logit model.

    Coding transcripts ('-c') and non-coding transcripts ('-n') may each be
    BED or FASTA.  For a coding BED file the CDS annotated in the BED line
    is used (extract_CDS_from_bed); in every other case an ORF search is
    run.  The labelled feature rows are written to
    '<out_file>.feature.xls' and make_logit() is called on that file.

    Progress and warnings go to the log file ('--log-file') and to the
    console.  Prints the help text and exits with status 0 when a required
    option is missing, when a file format is unknown, or when a BED input
    is given without a reference genome.
    """
    usage = "\n%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-c", "--cgene", action="store", dest="coding_file",
        help="Genomic sequnences of protein-coding RNAs in FASTA (https://en.wikipedia.org/wiki/FASTA_format) or standard 12-column BED (https://genome.ucsc.edu/FAQ/FAQformat.html#format1) format. It is recommended to use *short* and *unique* sequence identifiers (such as Ensembl transcript id) in FASTA and BED file. The input FASTA or BED file could be a regular text file or compressed file (*.gz, *.bz2) or accessible URL (http://, https://, ftp://). When BED file is provided, use the ORF defined in the BED file (the 7th and 8th columns in BED file define the positions of 'start codon, and 'stop codon', respectively). When FASTA file is provided, searching for the longet ORF. For well annotated genome, we recommend using BED file as input because the longest ORF predicted from RNA sequence might not be the real ORF. If this is a BED file, reference genome ('-r/--ref') should be specified.")
    parser.add_option(
        "-n", "--ngene", action="store", dest="noncoding_file",
        help="Genomic sequences of non-coding RNAs in FASTA (https://en.wikipedia.org/wiki/FASTA_format) or standard 12-column BED (https://genome.ucsc.edu/FAQ/FAQformat.html#format1) format. It is recommended to use *short* and *unique* sequence identifiers (such as Ensembl transcript id) in FASTA and BED file. The input FASTA or BED file could be a regular text file or compressed file (*.gz, *.bz2) or accessible URL (http://, https://, ftp://). If this is a BED file, reference genome ('-r/--ref') should be specified.")
    parser.add_option("-o", "--outfile", action="store", dest="out_file",
                      help="The prefix of output files.")
    parser.add_option(
        "-x", "--hex", action="store", dest="hexamer_dat",
        help="Hexamer frequency table. CPAT has prebuilt hexamer frequency tables for Human, Mouse, Fly, Zebrafish. Run 'make_hexamer_tab.py' to generate this table.")
    parser.add_option(
        "-r", "--ref", action="store", dest="ref_genome",
        help="Reference genome sequences in FASTA format. Ignore this option if mRNA sequences file was provided to '-g'. Reference genome file will be indexed automatically if the index file *.fai) does not exist.")
    parser.add_option(
        "-s", "--start", action="store", dest="start_codons", default='ATG',
        help="Start codon (use 'T' instead of 'U') used to define the start of open reading frame (ORF). default=%default")
    parser.add_option(
        "-t", "--stop", action="store", dest="stop_codons",
        default='TAG,TAA,TGA',
        help="Stop codon (use 'T' instead of 'U') used to define the end of open reading frame (ORF). Multiple stop codons are separated by ','. default=%default")
    parser.add_option(
        "--min-orf", action="store", type="int", dest="min_orf_len",
        default=30,
        help="Minimum ORF length in nucleotides. default=%default")
    parser.add_option("--log-file", action="store", type="string",
                      dest="log_file",
                      default='make_logitModel_run_info.log',
                      help="Name of log file. default=\"%default\"")
    parser.add_option(
        "--verbose", action="store_true", dest="debug", default=False,
        help="Logical to determine if detailed running information is printed to screen.")
    (options, args) = parser.parse_args()

    # check input and output files
    required = (options.coding_file, options.noncoding_file,
                options.out_file, options.hexamer_dat)
    if not all(required):
        parser.print_help()
        sys.exit(0)

    # The two original basicConfig calls differed only in the level.
    logging.basicConfig(filename='%s' % options.log_file,
                        filemode='w',
                        format="%(asctime)s [%(levelname)s] %(message)s",
                        datefmt='%Y-%m-%d %I:%M:%S',
                        level=logging.DEBUG if options.debug else logging.INFO)
    # logging to console
    logFormat = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s",
                                  datefmt='%Y-%m-%d %I:%M:%S')
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormat)
    logging.getLogger().addHandler(consoleHandler)

    start_codons = options.start_codons.strip().split(',')
    stop_codons = options.stop_codons.strip().split(',')
    logging.info("Start codons used: [%s]", ','.join(start_codons))
    logging.info("Stop codons used: [%s]", ','.join(stop_codons))

    # data used to build logit model
    train_data = []
    coding_label = 1
    noncoding_label = 0
    header = ['ID', 'mRNA', 'ORF', 'Fickett', 'Hexamer', 'Label']

    # build hexamer table from hexamer frequency file
    logging.info("Reading hexamer frequency table file: \"%s\"",
                 options.hexamer_dat)
    coding = {}
    noncoding = {}
    with open(options.hexamer_dat, 'r') as hexamer_fh:
        for line in hexamer_fh:
            fields = line.split()
            # Skip blank lines (previously an IndexError) and the header row.
            if not fields or fields[0] == 'hexamer':
                continue
            coding[fields[0]] = float(fields[1])
            noncoding[fields[0]] = float(fields[2])

    ###########################################################################
    # process protein coding transcripts
    ###########################################################################
    count = 0
    logging.info("Process protein-coding RNA file: \"%s\"",
                 options.coding_file)
    file_format = bed_or_fasta(options.coding_file)
    if file_format == 'UNKNOWN':
        logging.error("Error: unknown file format \"%s\"",
                      options.coding_file)
        parser.print_help()
        sys.exit(0)
    elif file_format == 'BED':
        logging.info("Protein-coding RNA file \"%s\" is in BED format",
                     options.coding_file)
        if not options.ref_genome:
            logging.error("Error: Reference genome file must be provided")
            parser.print_help()
            sys.exit(0)
        index_fasta(options.ref_genome)
        for line in ireader.reader(options.coding_file):
            count += 1
            if count % 10 == 0:
                print("%d rows finished\r" % count, end=' ', file=sys.stderr)
            if line.startswith(('track', '#', 'browser')):
                continue
            if not line.strip():
                continue
            # The CDS is taken directly from the BED annotation rather than
            # from an ORF search over the spliced mRNA.
            features = extract_CDS_from_bed(line,
                                            refgenome=options.ref_genome,
                                            stt=start_codons,
                                            stp=stop_codons,
                                            c_tab=coding,
                                            g_tab=noncoding,
                                            min_orf=options.min_orf_len)
            # None signals a malformed line; unpacking it used to raise
            # TypeError.
            if features is None:
                logging.warning("Wrong format, skipped: %s",
                                '\t'.join(line.split()[0:6]))
                continue
            (gene_id, mRNA_size, CDS_size, fickett_score, hexamer) = features
            train_data.append([gene_id, mRNA_size, CDS_size, fickett_score,
                               hexamer, coding_label])
        logging.info("Total %d coding rows finished.", count)
    elif file_format == 'FASTA':
        if options.ref_genome:
            logging.warning(
                "Reference genome sequence [-r] will be ignored when input file is in FASTA format.")
        logging.info("Protein-coding RNA file \"%s\" is in FASTA format",
                     options.coding_file)
        for sname, seq in FrameKmer.seq_generator(options.coding_file):
            count += 1
            if count % 10 == 0:
                print("%d sequences finished\r" % count, end=' ',
                      file=sys.stderr)
            features = extract_feature_from_seq(seq=seq,
                                                stt=start_codons,
                                                stp=stop_codons,
                                                c_tab=coding,
                                                g_tab=noncoding,
                                                min_orf=options.min_orf_len)
            if features is None:
                continue
            (mRNA_size, CDS_size, fickett_score, hexamer) = features
            train_data.append([sname, mRNA_size, CDS_size, fickett_score,
                               hexamer, coding_label])
        logging.info("Total %d coding sequences finished.", count)

    ###########################################################################
    # process Non-protein coding transcripts
    ###########################################################################
    count = 0
    logging.info("Process non-coding RNA file: \"%s\"",
                 options.noncoding_file)
    file_format = bed_or_fasta(options.noncoding_file)
    if file_format == 'UNKNOWN':
        logging.error("Error: unknown file format \"%s\"",
                      options.noncoding_file)
        parser.print_help()
        sys.exit(0)
    elif file_format == 'BED':
        logging.info("Non-coding RNA file \"%s\" is in BED format",
                     options.noncoding_file)
        if not options.ref_genome:
            logging.error("Error: Reference genome file must be provided")
            parser.print_help()
            sys.exit(0)
        index_fasta(options.ref_genome)
        for line in ireader.reader(options.noncoding_file):
            count += 1
            if count % 10 == 0:
                print("%d genes finished\r" % count, end=' ', file=sys.stderr)
            if line.startswith(('track', '#', 'browser')):
                continue
            if not line.strip():
                continue
            fields = line.split()
            # A non-coding row is expected to have thickStart == chromStart.
            # Short or non-numeric rows are left for
            # extract_feature_from_bed to report instead of crashing here.
            try:
                looks_coding = int(fields[1]) != int(fields[6])
            except (IndexError, ValueError):
                looks_coding = False
            if looks_coding:
                logging.warning("This seems to be protein-coding:%s",
                                '\t'.join(fields[0:6]))
            features = extract_feature_from_bed(line,
                                                refgenome=options.ref_genome,
                                                stt=start_codons,
                                                stp=stop_codons,
                                                c_tab=coding,
                                                g_tab=noncoding,
                                                min_orf=options.min_orf_len)
            if features is None:
                logging.warning("No ORF found for: %s",
                                '\t'.join(fields[0:6]))
                continue
            (gene_id, mRNA_size, CDS_size, fickett_score, hexamer) = features
            train_data.append([gene_id, mRNA_size, CDS_size, fickett_score,
                               hexamer, noncoding_label])
        logging.info("Total %d non-coding rows finished.", count)
    elif file_format == 'FASTA':
        if options.ref_genome:
            logging.warning(
                "Reference genome sequence [-r] will be ignored when input file is in FASTA format.")
        logging.info("Non-coding RNA file \"%s\" is in FASTA format",
                     options.noncoding_file)
        for sname, seq in FrameKmer.seq_generator(options.noncoding_file):
            count += 1
            if count % 10 == 0:
                print("%d sequences finished\r" % count, end=' ',
                      file=sys.stderr)
            features = extract_feature_from_seq(seq=seq,
                                                stt=start_codons,
                                                stp=stop_codons,
                                                c_tab=coding,
                                                g_tab=noncoding,
                                                min_orf=options.min_orf_len)
            if features is None:
                continue
            (mRNA_size, CDS_size, fickett_score, hexamer) = features
            train_data.append([sname, mRNA_size, CDS_size, fickett_score,
                               hexamer, noncoding_label])
        logging.info("Total %d non-coding sequences finished.", count)

    ###########################################################################
    # writing data
    ###########################################################################
    feature_file = options.out_file + '.feature.xls'
    logging.info("Wrting to \"%s\"", feature_file)
    with open(feature_file, 'w') as TMP:
        print('\t'.join(header), file=TMP)
        for row in train_data:
            print('\t'.join([str(j) for j in row]), file=TMP)

    make_logit(feature_file,
               options.out_file + '.make_logitModel.r',
               options.out_file + '.logit.RData')
def _read_hexamer_table(path):
    """Load the hexamer frequency table stored at *path*.

    Every data row is read as whitespace-separated fields: the hexamer,
    its frequency in coding sequences and its frequency in non-coding
    sequences.  The header row (first field ``hexamer``) and blank lines
    are skipped.

    Returns:
        A ``(coding, noncoding)`` pair of dicts mapping hexamer -> float.
    """
    coding = {}
    noncoding = {}
    with open(path) as handle:
        for line in handle:
            fields = line.split()
            # Blank lines used to raise IndexError on fields[0].
            if not fields or fields[0] == 'hexamer':
                continue
            coding[fields[0]] = float(fields[1])
            noncoding[fields[0]] = float(fields[2])
    return coding, noncoding


def main():
    """Command-line entry point: find ORFs, score them and pick the best one.

    Reads the transcripts given with ``-g`` (FASTA, or BED together with a
    reference genome), searches each one for ORF candidates and writes,
    using the ``-o`` prefix:

    * ``<prefix>.ORF_seqs.fa``       - sequences of the ORF candidates
    * ``<prefix>.ORF_info.tsv``      - per-ORF coordinates, Fickett and
      hexamer scores
    * ``<prefix>.no_ORF.txt``        - transcripts without any ORF
    * ``<prefix>.ORF_prob.best.tsv`` - the best ORF of every transcript,
      selected from ``<prefix>.ORF_prob.tsv`` (produced by
      ``coding_prediction``)

    The process exits with a non-zero status when required options are
    missing, an option value is invalid, the input format is unknown or a
    BED input is given without a reference genome.
    """
    usage = "\n%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-g",
        "--gene",
        action="store",
        type="string",
        dest="gene_file",
        help="Genomic sequnence(s) of RNA in FASTA (https://en.wikipedia.org/wiki/FASTA_format) or standard 12-column BED (https://genome.ucsc.edu/FAQ/FAQformat.html#format1) format. It is recommended to use *short* and *unique* sequence identifiers (such as Ensembl transcript id) in FASTA and BED file. If this is a BED file, reference genome ('-r/--ref') should be specified. The input FASTA or BED file could be a regular text file or compressed file (*.gz, *.bz2) or accessible URL (http://, https://, ftp://). URL file cannot be a compressed file."
    )
    parser.add_option("-o",
                      "--outfile",
                      action="store",
                      type="string",
                      dest="out_file",
                      help="The prefix of output files.")
    parser.add_option(
        "-d",
        "--logitModel",
        action="store",
        dest="logit_model",
        help="Logistic regression model. The prebuilt models for Human, Mouse, Fly, Zebrafish are availablel. Run 'make_logitModel.py' to build logistic regression model for your own training datset."
    )
    parser.add_option(
        "-x",
        "--hex",
        action="store",
        dest="hexamer_dat",
        help="The hexamer frequency table. The prebuilt tables for Human, Mouse, Fly, Zebrafish are availablel. Run 'make_hexamer_tab.py' to make this table for your own training dataset."
    )
    parser.add_option(
        "-r",
        "--ref",
        action="store",
        dest="ref_genome",
        help="Reference genome sequences in FASTA format. Reference genome file will be indexed automatically if the index file ( *.fai) does not exist. Will be ignored if FASTA file was provided to '-g/--gene'."
    )
    parser.add_option(
        "--antisense",
        action="store_true",
        dest="antisense",
        default=False,
        help="Logical to determine whether to search for ORFs from the anti-sense strand. *Sense strand* (or coding strand) is DNA strand that carries the translatable code in the 5′ to 3′ direction. default=False (i.e. only search for ORFs from the sense strand)"
    )
    parser.add_option(
        "--start",
        action="store",
        type="string",
        dest="start_codons",
        default='ATG',
        help="Start codon (use 'T' instead of 'U') used to define the start of open reading frame (ORF). default=%default"
    )
    parser.add_option(
        "--stop",
        action="store",
        type="string",
        dest="stop_codons",
        default='TAG,TAA,TGA',
        help="Stop codon (use 'T' instead of 'U') used to define the end of open reading frame (ORF). Multiple stop codons are separated by ','. default=%default"
    )
    parser.add_option(
        "--min-orf",
        action="store",
        type="int",
        dest="min_orf_len",
        default=75,
        help="Minimum ORF length in nucleotides. default=%default")
    parser.add_option(
        "--top-orf",
        action="store",
        type="int",
        dest="n_top_orf",
        default=5,
        help="Number of ORF candidates reported. RNAs may have dozens of putative ORFs, in most cases, the real ORF is ranked (by size) in the top several. It is not necessary to calculate \"Fickett score\", \"Hexamer score\" and \"coding probability\" for every ORF. default=%default"
    )
    parser.add_option(
        "--width",
        action="store",
        type="int",
        dest="line_width",
        default=100,
        help="Line width of output ORFs in FASTA format. default=%default")
    parser.add_option("--log-file",
                      action="store",
                      type="string",
                      dest="log_file",
                      default='CPAT_run_info.log',
                      help="Name of log file. default=\"%default\"")
    parser.add_option(
        "--best-orf",
        action="store",
        type="string",
        dest="mode",
        default='p',
        help="Criteria to select the best ORF: \"l\"=length, selection according to the \"ORF length\"; \"p\"=probability, selection according to the \"coding probability\". default=\"%default\""
    )
    parser.add_option(
        "--verbose",
        action="store_true",
        dest="debug",
        default=False,
        help="Logical to determine if detailed running information is printed to screen."
    )

    (options, args) = parser.parse_args()

    # ---- validate options (logging is not configured yet, so use stderr) ----
    required = (options.gene_file, options.hexamer_dat, options.logit_model,
                options.out_file)
    if not all(required):
        parser.print_help()
        sys.exit(1)
    if options.line_width < 1:
        # Used to exit silently with status 0.
        print("Error: --width must be a positive integer.", file=sys.stderr)
        sys.exit(1)
    if options.mode not in ["p", "l"]:
        print("Please specifiy either \"p\" or \"l\" to --best-orf.",
              file=sys.stderr)
        sys.exit(1)

    # ---- logging: always to the log file, mirrored to the console ----
    log_format = "%(asctime)s [%(levelname)s] %(message)s"
    date_format = '%Y-%m-%d %I:%M:%S'
    logging.basicConfig(
        filename='%s' % options.log_file,
        filemode='w',
        format=log_format,
        datefmt=date_format,
        level=logging.DEBUG if options.debug else logging.INFO)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(
        logging.Formatter(log_format, datefmt=date_format))
    logging.getLogger().addHandler(console_handler)

    logging.info("Running CPAT version %s...", __version__)
    start_codons = options.start_codons.replace(' ', '').split(',')
    stop_codons = options.stop_codons.replace(' ', '').split(',')
    logging.info("Start codons used: [%s]", ','.join(start_codons))
    logging.info("Stop codons used: [%s]", ','.join(stop_codons))

    # build hexamer table from hexamer frequency file
    logging.info("Reading %s", options.hexamer_dat)
    coding, noncoding = _read_hexamer_table(options.hexamer_dat)

    # ---- check the input format before creating any output file ----
    logging.info("Checking format of \"%s\"", options.gene_file)
    file_format = bed_or_fasta(options.gene_file)
    if file_format == 'UNKNOWN':
        logging.error("Unknown file format:%s", options.gene_file)
        sys.exit(1)
    elif file_format == 'FASTA':
        logging.info("Input gene file is in FASTA format")
        if options.ref_genome:
            logging.warning(
                "\"%s\" is a sequence file. The reference genome file \"%s\" will be ignored.",
                options.gene_file, options.ref_genome)
    elif file_format == 'BED':
        logging.info("Input gene file is in BED format")
        if not options.ref_genome:
            logging.error("Reference genome file (-r/--ref) must be provided.")
            parser.print_help()
            sys.exit(1)

    ## do NOT change these labels, they are R variable names in the model.
    info_header = "\t".join([
        "ID", "mRNA", "ORF_strand", "ORF_frame", "ORF_start", "ORF_end",
        "ORF", "Fickett", "Hexamer"
    ])

    count = 0
    with open(options.out_file + '.ORF_seqs.fa', 'w') as seq_out, \
            open(options.out_file + '.ORF_info.tsv', 'w') as info_out, \
            open(options.out_file + '.no_ORF.txt', 'w') as no_orf_out:

        def report_orfs(name, seq, no_orf_record):
            """Score and write the ORF candidates of one transcript.

            *no_orf_record* is the text written to the no-ORF file when
            the transcript has no candidate.  Returns True when at least
            one ORF was written, False otherwise.
            """
            rna_len = len(seq)
            finder = find_orfs.ORFFinder(seq=seq, min_orf=options.min_orf_len)
            candidates = finder.orf_candidates(
                antisense=options.antisense,
                n_candidate=options.n_top_orf,
                start_coden=start_codons,
                stop_coden=stop_codons)
            if not candidates:
                logging.warning("No ORFs found for %s", name)
                print(no_orf_record, file=no_orf_out)
                return False
            # ORF serial number, starting from 1
            for orf_sn, candidate in enumerate(candidates, start=1):
                # candidate = (direction, frame_number+1, orf_start, orf_end, L, sequence)
                orf_seq = candidate[-1]
                fields = list(candidate[:-1])
                if fields[0] == '+':
                    # change 0-based into 1-based to be consistent with NCBI
                    # ORFfinder output (https://www.ncbi.nlm.nih.gov/orffinder/)
                    fields[2] = fields[2] + 1
                elif fields[0] == '-':
                    fields[2] = rna_len - fields[2]
                    fields[3] = rna_len - fields[3] + 1
                orf_id = (name + '_ORF_' + str(orf_sn) + '\t' + str(rna_len) +
                          '\t' + '\t'.join([str(i) for i in fields]))
                fickett_score = fickett.fickett_value(orf_seq)
                hexamer_score = FrameKmer.kmer_ratio(orf_seq, 6, 3, coding,
                                                     noncoding)
                print(orf_id + '\t' + str(fickett_score) + '\t' +
                      str(hexamer_score),
                      file=info_out)
                print(">" + orf_id, file=seq_out)
                print('\n'.join(wrap(orf_seq, width=options.line_width)),
                      file=seq_out)
            return True

        if file_format == 'FASTA':
            logging.info("Searching for ORFs ...")
            print(info_header, file=info_out)
            for name, seq in FrameKmer.seq_generator(options.gene_file):
                count += 1
                if report_orfs(name, seq, name):
                    print("%d sequences finished\r" % count,
                          end=' ',
                          file=sys.stderr)
            print("\n", file=sys.stderr)
        elif file_format == 'BED':
            logging.info("Searching for ORFs ...")
            print(info_header, file=info_out)
            index_fasta(options.ref_genome)
            for line in ireader.reader(options.gene_file):
                count += 1
                if line.startswith(('track', '#', 'browser')):
                    continue
                name, seq = seq_from_bed(line, options.ref_genome)
                if report_orfs(name, seq, line):
                    print("%d rows finished\r" % count,
                          end=' ',
                          file=sys.stderr)
            print("\n", file=sys.stderr)

    logging.info("Calculate coding probability ...")
    # output: options.out_file + '.ORF_prob.tsv'
    coding_prediction(options.logit_model,
                      options.out_file + '.ORF_info.tsv', options.out_file)

    # Column of '.ORF_prob.tsv' used for ranking.  Index 6 is the "ORF"
    # column of the header written above; index 9 is presumably the
    # probability column appended by coding_prediction - TODO confirm.
    if options.mode == 'p':
        logging.info("Select ORF with the highest coding probability ...")
        col_index = 9
    elif options.mode == 'l':
        logging.info("Select the longest ORF ...")
        col_index = 6

    best_candidates = {}
    with open(options.out_file + '.ORF_prob.best.tsv', 'w') as best_out, \
            open(options.out_file + '.ORF_prob.tsv', 'r') as prob_in:
        for line_no, row in enumerate(prob_in):
            row = row.strip()
            if not row:
                continue
            # Only the first line can be the header; testing every row with
            # startswith('ID') dropped transcripts named e.g. "IDH1".
            if line_no == 0 and row.startswith('ID'):
                print("seq_ID\t" + row, file=best_out)
                continue
            f = row.split('\t')
            # Strip only the trailing "_ORF_<n>" suffix so that ids which
            # themselves contain "_ORF_" stay intact.
            seq_id = f[0].rsplit('_ORF_', 1)[0]
            score = float(f[col_index])
            # Strict '>' keeps the first ORF seen on ties.
            if (seq_id not in best_candidates
                    or score > float(best_candidates[seq_id][col_index])):
                best_candidates[seq_id] = f
        for seq_id, fields in best_candidates.items():
            print(seq_id + '\t' + '\t'.join(fields), file=best_out)

    logging.info("Done!")
    finish_up(options.out_file, options.n_top_orf, options.min_orf_len)
# Fill the hexamer lookup tables (`coding` / `noncoding` must already exist)
# from the hexamer frequency file: hexamer, coding freq, non-coding freq.
with open(options.hexamer_dat) as hexamer_file:
    for line in hexamer_file:
        fields = line.split()
        # Skip the header row and blank lines (a blank line used to raise
        # IndexError on fields[0]).
        if not fields or fields[0] == 'hexamer':
            continue
        coding[fields[0]] = float(fields[1])
        noncoding[fields[0]] = float(fields[2])

# Per-transcript exon statistics, looked up by sequence name below.
exon_max, exon_mean, exon_num = gtf_exons.gtf_parser(
    options.gtf, int(options.lines_drop), options.fformat)

column_names = ("sname", "mRNA_size", "ORF_size", "mean_orf_length",
                "orf_coverage", "fickett_score", "hexamer", "gc_content",
                "kozak34", "kozak21", "kozak6", "exon_max", "exon_mean",
                "exon_num")

count = 0
# The context manager guarantees the feature table is flushed and closed;
# the handle was previously never closed.
with open(options.out_file + '.txt', 'w') as TMP:
    TMP.write('\t'.join(column_names) + '\n')
    for sname, seq in FrameKmer.seq_generator(options.gene_file):
        gc_content = count_gc(seq)
        count += 1
        (mRNA_size, orf_size, mean_orf_length, orf_coverage, fickett_score,
         hexamer, k34, k21, k6) = extract_feature_from_seq(
             seq=seq, c_tab=coding, g_tab=noncoding)
        row = (sname, mRNA_size, orf_size, mean_orf_length, orf_coverage,
               fickett_score, hexamer, gc_content, k34, k21, k6,
               # transcripts absent from the exon tables get 0
               exon_max.get(sname, 0), exon_mean.get(sname, 0),
               exon_num.get(sname, 0))
        TMP.write('\t'.join(str(i) for i in row) + '\n')
        if count % 100 == 0:
            # Progress goes to stderr.  The previous
            # `print(sys.stderr, ...)` printed the repr of the stream
            # object to stdout instead.
            sys.stderr.write("%d genes finished\r" % count)
def main():
    """Command-line entry point of the legacy CPAT workflow.

    Reads transcripts given with ``-g`` (BED plus a reference genome, or
    FASTA), writes one tab-separated feature row per transcript (id, mRNA
    size, ORF size, Fickett score, hexamer score) to ``<outfile>.dat`` and
    then calls ``coding_prediction`` on that file.

    Diagnostics and progress are written with ``stream.write`` so the
    function behaves the same on Python 2 and Python 3; the former
    ``print >>stream, ...`` statements raise TypeError on Python 3.

    The process exits with a non-zero status when required options are
    missing, the input format is unknown or a BED input is given without
    a reference genome.
    """
    usage = "\n%prog [options]"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option(
        "-g",
        "--gene",
        action="store",
        dest="gene_file",
        help="Transcripts either in BED format or mRNA sequences in FASTA format: If this is BED format file, '-r' must be specified; if this is mRNA sequence file in FASTA format, ignore the '-r' option. The input BED or FASTA file could be regular text file or compressed file (*.gz, *.bz2) or accessible url.",
    )
    parser.add_option(
        "-o",
        "--outfile",
        action="store",
        dest="out_file",
        help="output file. Tab separated text file: geneID <tab> mRNA size <tab> ORF size <tab> Fickett Score <tab> Hexamer Score<tab>Coding Probability.",
    )
    parser.add_option(
        "-x",
        "--hex",
        action="store",
        dest="hexamer_dat",
        help="Prebuilt hexamer frequency table (Human, Mouse, Fly, Zebrafish). Run 'make_hexamer_tab.py' to make this table out of your own training dataset.",
    )
    parser.add_option(
        "-d",
        "--logitModel",
        action="store",
        dest="logit_model",
        help="Prebuilt training model (Human, Mouse, Fly, Zebrafish). Run 'make_logitModel.py' to build logit model out of your own training datset",
    )
    parser.add_option(
        "-r",
        "--ref",
        action="store",
        dest="ref_genome",
        help="Reference genome sequences in FASTA format. Ignore this option if mRNA sequences file was provided to '-g'. Reference genome file will be indexed automatically (produce *.fai file along with the original *.fa file within the same directory) if hasn't been done.",
    )
    parser.add_option(
        "-s",
        "--start",
        action="store",
        dest="start_codons",
        default="ATG",
        help="Start codon (DNA sequence, so use 'T' instead of 'U') used to define open reading frame (ORF). default=%default",
    )
    parser.add_option(
        "-t",
        "--stop",
        action="store",
        dest="stop_codons",
        default="TAG,TAA,TGA",
        help="Stop codon (DNA sequence, so use 'T' instead of 'U') used to define open reading frame (ORF). Multiple stop codons should be separated by ','. default=%default",
    )

    (options, args) = parser.parse_args()

    def report(message):
        """Write one diagnostic line to stderr (message + newline)."""
        sys.stderr.write(message + "\n")

    # check input and output files
    required = (options.gene_file, options.out_file, options.logit_model,
                options.hexamer_dat)
    if not all(required):
        parser.print_help()
        sys.exit(1)

    # build hexamer table from hexamer frequency file
    coding = {}
    noncoding = {}
    with open(options.hexamer_dat) as hexamer_file:
        for line in hexamer_file:
            fields = line.split()
            # Skip the header row and blank lines.
            if not fields or fields[0] == "hexamer":
                continue
            coding[fields[0]] = float(fields[1])
            noncoding[fields[0]] = float(fields[2])

    # Validate the input before the output file is created.
    file_format = bed_or_fasta(options.gene_file)
    if file_format == "UNKNOWN":
        report("\nError: unknown file format of '-g'\n")
        parser.print_help()
        sys.exit(1)
    elif file_format == "BED":
        report("Input gene file is in BED format")
        if not options.ref_genome:
            report("\nError: Reference genome file must be provided\n")
            parser.print_help()
            sys.exit(1)
    elif file_format == "FASTA":
        if options.ref_genome:
            report("Reference genome sequence [-r] and conservation score [-c] will be ignored when input genes are fasta format.")
        report("Input gene file is in FASTA format")

    count = 0
    with open(options.out_file + ".dat", "w") as TMP:
        if file_format == "BED":
            index_fasta(options.ref_genome)
            for line in ireader.reader(options.gene_file):
                count += 1
                if line.startswith(("track", "#", "browser")):
                    continue
                features = extract_feature_from_bed(
                    line, options.ref_genome, options.start_codons,
                    options.stop_codons, coding, noncoding)
                if features is None:
                    # The extractor returns None for a malformed row (and
                    # reports it itself); unpacking None used to crash.
                    continue
                (gene_id, mRNA_size, CDS_size, fickett_score,
                 hexamer) = features
                TMP.write("\t".join(
                    str(i) for i in (gene_id, mRNA_size, CDS_size,
                                     fickett_score, hexamer)) + "\n")
                # '\r' keeps the progress counter on a single console line.
                sys.stderr.write("%d genes finished\r" % count)
        elif file_format == "FASTA":
            for sname, seq in FrameKmer.seq_generator(options.gene_file):
                count += 1
                features = extract_feature_from_seq(
                    seq=seq,
                    stt=options.start_codons,
                    stp=options.stop_codons,
                    c_tab=coding,
                    g_tab=noncoding)
                if features is None:
                    continue
                (mRNA_size, CDS_size, fickett_score, hexamer) = features
                TMP.write("\t".join(
                    str(i) for i in (sname, mRNA_size, CDS_size,
                                     fickett_score, hexamer)) + "\n")
                sys.stderr.write("%d genes finished\r" % count)

    # The feature file is closed (and therefore flushed) before it is read.
    coding_prediction(options.logit_model, options.out_file + ".dat",
                      options.out_file)