def align_transcriptome(in_fasta, prefix, aligner, num_threads, t_alnm, ref_t, g_alnm, ref_g, post=True):
    """Align reads to the transcriptome and the genome, then post-process.

    An aligner is only launched for a target whose alignment-file argument is
    the empty string; a non-empty argument is used as the alignment file.

    Args:
        in_fasta: Read FASTA file handed to the aligner.
        prefix: Prefix of every file created by this function.
        aligner: "minimap2" or "LAST"; any other value runs no alignment.
        num_threads: Thread count; it is concatenated into the command line,
            so it has to be a string.
        t_alnm: Transcriptome alignment file, or '' to create one.
        ref_t: Reference transcriptome passed to the aligner.
        g_alnm: Genome alignment file, or '' to create one.
        ref_g: Reference genome passed to the aligner.
        post: When false, return directly after the alignment step.

    Returns:
        ``(t_alnm, g_alnm)`` when ``post`` is false. Otherwise
        ``(t_alnm_ext, unaligned_length, g_alnm, t_alnm, strandness)``, where
        ``unaligned_length`` and ``strandness`` are whatever the transcriptome
        post-processing helper returned. They are only assigned for "maf" and
        "sam" files; any other extension makes the final return raise
        UnboundLocalError.
    """
    def _announce(message):
        # Timestamped progress line on stdout.
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": " + message + "\n")

    # ---- alignment to the reference transcriptome ----
    if t_alnm == '':
        if aligner == "minimap2":
            t_alnm = prefix + "_transcriptome_alnm.sam"
            _announce("Alignment with minimap2 to reference transcriptome")
            call(" ".join(["minimap2 --cs -ax map-ont -t", num_threads, ref_t, in_fasta, ">", t_alnm]),
                 shell=True)
        elif aligner == "LAST":
            t_alnm = prefix + "_transcriptome_alnm.maf"
            _announce("Alignment with LAST to reference transcriptome")
            call("lastdb ref_transcriptome " + ref_t, shell=True)
            call(" ".join(["lastal -a 1 -P", num_threads, "ref_transcriptome", in_fasta, ">", t_alnm]),
                 shell=True)

    # ---- alignment to the reference genome ----
    if g_alnm == '':
        if aligner == "minimap2":
            g_alnm = prefix + "_genome_alnm.sam"
            # [EDIT] I may change the options for minimap2 when dealing with cDNA and dRNA reads.
            _announce("Alignment with minimap2 to reference genome")
            call(" ".join(["minimap2 --cs -ax splice -t", num_threads, ref_g, in_fasta, ">", g_alnm]),
                 shell=True)
        elif aligner == "LAST":
            g_alnm = prefix + "_genome_alnm.maf"
            _announce("Alignment with LAST to reference genome")
            call("lastdb ref_genome " + ref_g, shell=True)
            call(" ".join(["lastal -a 1 -P", num_threads, "ref_genome", in_fasta, ">", g_alnm]),
                 shell=True)

    if not post:
        return t_alnm, g_alnm

    # ---- post-process the transcriptome alignment ----
    t_alnm_ext = os.path.splitext(t_alnm)[1][1:]
    _announce("Processing transcriptome alignment file: " + t_alnm_ext)
    transcriptome_prefix = prefix + "_transcriptome"
    if t_alnm_ext == "maf":
        # Only the sequence ("s ") lines of the MAF file are kept.
        sequence_lines_t = prefix + "_transcriptome_alnm_processed.maf"
        call("grep '^s ' " + t_alnm + " > " + sequence_lines_t, shell=True)
        unaligned_length, strandness = get_besthit_maf.besthit_and_unaligned(
            in_fasta, sequence_lines_t, transcriptome_prefix)
    elif t_alnm_ext == "sam":
        unaligned_length, strandness = get_primary_sam.primary_and_unaligned(
            t_alnm, transcriptome_prefix)

    # ---- post-process the genome alignment (helper results are discarded) ----
    g_alnm_ext = os.path.splitext(g_alnm)[1][1:]
    _announce("Processing genome alignment file: " + g_alnm_ext)
    genome_prefix = prefix + "_genome"
    if g_alnm_ext == "maf":
        sequence_lines_g = prefix + "_processed.maf"
        call("grep '^s ' " + g_alnm + " > " + sequence_lines_g, shell=True)
        get_besthit_maf.besthit_and_unaligned(in_fasta, sequence_lines_g, genome_prefix)
    elif g_alnm_ext == "sam":
        get_primary_sam.primary_and_unaligned(g_alnm, genome_prefix)

    return t_alnm_ext, unaligned_length, g_alnm, t_alnm, strandness
def align_genome(in_fasta, prefix, aligner, num_threads, g_alnm, ref_g):
    """Align reads to the reference genome (or reuse an alignment) and post-process.

    Args:
        in_fasta: Read FASTA file handed to the aligner and to the MAF helper.
        prefix: Prefix of every file created by this function.
        aligner: "minimap2" (also selected by '') or "LAST".
        num_threads: Thread count; it is concatenated into the command line,
            so it has to be a string.
        g_alnm: Existing alignment file (.maf or .sam), or '' to run the
            aligner.
        ref_g: Reference genome passed to the aligner.

    Returns:
        ``(file_extension, unaligned_length, strandness)``; the last two are
        whatever the post-processing helper returned. When the supplied file
        is neither "maf" nor "sam", or the aligner name is not recognised,
        nothing is assigned and the return raises UnboundLocalError.
    """
    # if an alignment file is provided
    if g_alnm != '':
        pre, file_ext = os.path.splitext(g_alnm)
        file_extension = file_ext[1:]
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Processing alignment file: " + file_extension + "\n")
        if file_extension == "maf":
            # Keep only the sequence ("s ") lines of the MAF file.
            processed_maf = prefix + "_processed.maf"
            call("grep '^s ' " + g_alnm + " > " + processed_maf, shell=True)

            # get best hit and unaligned reads
            unaligned_length, strandness = get_besthit_maf.besthit_and_unaligned(in_fasta, processed_maf, prefix)
        elif file_extension == "sam":
            # get the primary alignments and define unaligned reads.
            unaligned_length, strandness = get_primary_sam.primary_and_unaligned(g_alnm, prefix)

    # if alignment file is not provided
    else:
        if aligner == "minimap2" or aligner == "":  # Align with minimap2 by default
            file_extension = "sam"
            out_sam = prefix + "_genome_alnm.sam"
            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Alignment with minimap2\n")
            call("minimap2 --cs -ax map-ont -t " + num_threads + " " + ref_g + " " + in_fasta + " > " + out_sam,
                 shell=True)

            # get primary alignments and unaligned reads
            unaligned_length, strandness = get_primary_sam.primary_and_unaligned(out_sam, prefix)
        elif aligner == "LAST":
            file_extension = "maf"
            out_maf = prefix + "_genome_alnm.maf"
            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Alignment with LAST\n")
            # The database is built from ref_g, the reference this function receives.
            call("lastdb ref_genome " + ref_g, shell=True)
            call("lastal -a 1 -P " + num_threads + " ref_genome " + in_fasta + " | grep '^s ' > " + out_maf,
                 shell=True)
            unaligned_length, strandness = get_besthit_maf.besthit_and_unaligned(in_fasta, out_maf, prefix)

    return file_extension, unaligned_length, strandness
def align_genome(in_fasta, prefix, aligner, num_threads, g_alnm, ref_g, chimeric, quantification=None, q_mode=False):
    """Align reads to the reference genome if needed, then post-process the alignment.

    Args:
        in_fasta: Read FASTA file handed to the aligner and to the MAF helper.
        prefix: Prefix of every file created by this function.
        aligner: "minimap2" or "LAST"; only consulted when ``g_alnm`` is ''.
        num_threads: Thread count; it is concatenated into the command line,
            so it has to be a string.
        g_alnm: Existing alignment file (.maf or .sam), or '' to run the
            aligner.
        ref_g: Reference genome passed to the aligner.
        chimeric: When true, SAM files are processed with
            ``primary_and_unaligned_chimeric`` instead of
            ``primary_and_unaligned``.
        quantification: Forwarded unchanged to the SAM helpers.
        q_mode: Forwarded unchanged to the chimeric SAM helper.

    Returns:
        ``(file_extension, unaligned_length, strandness)``; the last two are
        whatever the post-processing helper returned. For an extension other
        than "maf"/"sam" they are never assigned and the return raises
        UnboundLocalError.
    """
    # if an alignment file is not provided
    if g_alnm == '':
        if aligner == "minimap2":  # Align with minimap2 by default
            g_alnm = prefix + "_genome_alnm.sam"
            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Alignment with minimap2\n")
            sys.stdout.flush()
            call("minimap2 --cs -ax map-ont -t " + num_threads + " " + ref_g + " " + in_fasta + " > " + g_alnm,
                 shell=True)

        elif aligner == "LAST":
            g_alnm = prefix + "_genome_alnm.maf"
            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Alignment with LAST\n")
            sys.stdout.flush()
            call("lastdb ref_genome " + ref_g, shell=True)
            # lastal prints alignments on stdout, so the MAF file has to be
            # produced with a redirect; the post-processing below reads it.
            call("lastal -a 1 -P " + num_threads + " ref_genome " + in_fasta + " > " + g_alnm, shell=True)

    # post-process
    pre, file_ext = os.path.splitext(g_alnm)
    file_extension = file_ext[1:]
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Processing alignment file: " + file_extension + "\n")
    sys.stdout.flush()
    if file_extension == "maf":
        # Keep only the sequence ("s ") lines of the MAF file.
        processed_maf = prefix + "_processed.maf"
        call("grep '^s ' " + g_alnm + " > " + processed_maf, shell=True)

        # get best hit and unaligned reads
        unaligned_length, strandness = get_besthit_maf.besthit_and_unaligned(in_fasta, processed_maf, prefix)

    elif file_extension == "sam":
        # get the primary alignments and define unaligned reads.
        if chimeric:
            unaligned_length, strandness = get_primary_sam.primary_and_unaligned_chimeric(
                g_alnm, prefix, quantification, q_mode)
        else:
            unaligned_length, strandness = get_primary_sam.primary_and_unaligned(g_alnm, prefix, quantification)

    return file_extension, unaligned_length, strandness
def main(argv):
    """Run the read-characterization pipeline from getopt-style arguments.

    Recognised options:
        -h                 print usage and exit
        -i / --infile      training reads (FASTA), required
        -r / --ref         reference genome, required
        -a                 aligner: "minimap2" (default when empty) or "LAST"
        -m                 existing alignment file (.maf or .sam); mutually
                           exclusive with -a
        -o / --prefix      output prefix (default "training")
        --no_model_fit     skip the model-fitting step
        -b                 number of bins (default 20, at least 1)
        -t                 number of threads (default "1")

    Steps, in order: rewrite the reads into ``<prefix>_processed.fasta``,
    obtain/post-process the alignment, run the aligned-read analysis, write
    ``<prefix>_unaligned_length_ecdf``, build the error histograms and,
    unless disabled, fit the models.

    Args:
        argv: Argument list without the program name.
    """
    # Parse input and output files
    infile = ''
    prefix = 'training'
    ref = ''
    aligner = ''
    alnm_file = ''
    model_fit = True
    num_threads = '1'
    num_bins = 20
    try:
        opts, args = getopt.getopt(argv, "hi:r:a:o:m:b:t:", ["infile=", "ref=", "prefix=", "no_model_fit"])
    except getopt.GetoptError:
        usage()
        sys.exit(1)
    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit(0)
        elif opt in ("-i", "--infile"):
            infile = arg
        elif opt in ("-r", "--ref"):
            ref = arg
        elif opt == "-a":
            aligner = arg
        elif opt == "-m":
            alnm_file = arg
        elif opt in ("-o", "--prefix"):
            prefix = arg
        elif opt == "--no_model_fit":
            model_fit = False
        elif opt == "-b":
            num_bins = max(int(arg), 1)
        elif opt == "-t":
            num_threads = arg
        else:
            usage()
            sys.exit(1)

    if infile == '' or ref == '':
        print("Please specify the training reads and its reference genome!")
        usage()
        sys.exit(1)

    if aligner != '' and alnm_file != '':
        print("Please specify either an alignment file (-m ) OR an aligner to use for alignment (-a )")
        usage()
        sys.exit(1)

    # READ PRE-PROCESS AND ALIGNMENT ANALYSIS
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Read pre-process and unaligned reads analysis\n")

    # Read pre-process: whitespace inside a read name is replaced by '-' and
    # multi-line sequences are joined onto one line.
    in_fasta = prefix + "_processed.fasta"  # use the prefix of input fasta file for processed fasta file
    dic_reads = {}
    # The input is read completely before the output is opened, so the input
    # is not truncated if both paths happen to be the same file.
    with open(infile, 'r') as f:
        for line in f:
            if line[0] == '>':
                name = '-'.join(line.strip()[1:].split())
                dic_reads[name] = ""
            else:
                dic_reads[name] += line.strip()
    with open(in_fasta, 'w') as processed_fasta:
        for k, v in dic_reads.items():
            processed_fasta.write('>' + k + '\n' + v + '\n')
    del dic_reads

    # if an alignment file is provided
    if alnm_file != '':
        pre, file_ext = os.path.splitext(alnm_file)
        file_extension = file_ext[1:]
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Processing alignment file: " + file_extension + "\n")
        if file_extension == "maf":
            # Keep only the sequence ("s ") lines of the MAF file.
            processed_maf = prefix + "_processed.maf"
            call("grep '^s ' " + alnm_file + " > " + processed_maf, shell=True)

            # get best hit and unaligned reads
            unaligned_length = get_besthit_maf.besthit_and_unaligned(in_fasta, processed_maf, prefix)
        elif file_extension == "sam":
            # get the primary alignments and define unaligned reads.
            unaligned_length = get_primary_sam.primary_and_unaligned(alnm_file, prefix)
        else:
            print("Please specify an acceptable alignment format! (maf or sam)\n")
            usage()
            sys.exit(1)

    # if alignment file is not provided
    else:
        if aligner == "minimap2" or aligner == "":  # Align with minimap2 by default
            file_extension = "sam"
            out_sam = prefix + ".sam"
            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Alignment with minimap2\n")
            # Honour -t here as well, as the LAST branch already does.
            call("minimap2 --cs -ax map-ont -t " + num_threads + " " + ref + " " + in_fasta + " > " + out_sam,
                 shell=True)

            # get primary alignments and unaligned reads
            unaligned_length = get_primary_sam.primary_and_unaligned(out_sam, prefix)
        elif aligner == "LAST":
            file_extension = "maf"
            out_maf = prefix + ".maf"
            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Alignment with LAST\n")
            call("lastdb ref_genome " + ref, shell=True)
            call("lastal -a 1 -P " + num_threads + " ref_genome " + in_fasta + " | grep '^s ' > " + out_maf,
                 shell=True)
            unaligned_length = get_besthit_maf.besthit_and_unaligned(in_fasta, out_maf, prefix)
        else:
            print("Please specify an acceptable aligner (minimap2 or LAST)\n")
            usage()
            sys.exit(1)

    # Aligned reads analysis
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Aligned reads analysis\n")
    num_aligned = align.head_align_tail(prefix, num_bins, file_extension)

    # Length distribution of unaligned reads
    num_unaligned = len(unaligned_length)
    with open(prefix + "_unaligned_length_ecdf", 'w') as out_unaligned_ecdf:
        if num_unaligned != 0:
            max_length = max(unaligned_length)
            # 50-wide bins; density * bin width summed cumulatively gives the ECDF.
            hist_unaligned, edges_unaligned = numpy.histogram(
                unaligned_length, bins=numpy.arange(0, max_length + 50, 50), density=True)
            cdf = numpy.cumsum(hist_unaligned * 50)
            out_unaligned_ecdf.write(
                "Aligned / Unaligned ratio:" + "\t" + str(num_aligned * 1.0 / num_unaligned) + '\n')
            out_unaligned_ecdf.write("bin\t0-" + str(max_length) + '\n')
            for i, cumulative in enumerate(cdf):
                out_unaligned_ecdf.write(
                    str(edges_unaligned[i]) + '-' + str(edges_unaligned[i + 1]) + "\t" + str(cumulative) + '\n')
        else:
            out_unaligned_ecdf.write("Aligned / Unaligned ratio:\t100%\n")
    del unaligned_length

    # MATCH AND ERROR MODELS
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": match and error models\n")
    error_model.hist(prefix, file_extension)

    if model_fit:
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Model fitting\n")
        model_fitting.model_fitting(prefix, int(num_threads))

    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
def align_transcriptome(in_fasta, prefix, aligner, num_threads, t_alnm, ref_t, g_alnm=None, ref_g=None):
    """Produce (or reuse) transcriptome and, optionally, genome alignments.

    The transcriptome alignment is always post-processed to obtain the
    unaligned read lengths and strandness. The genome part only runs when
    ``g_alnm`` or ``ref_g`` is truthy, and its output is not analysed here.

    Args:
        in_fasta: Read FASTA file handed to the aligner and to the MAF helper.
        prefix: Prefix of every file created by this function.
        aligner: "minimap2" or "LAST"; used wherever no alignment file is given.
        num_threads: Thread count; it is joined into the command line, so it
            has to be a string.
        t_alnm: Existing transcriptome alignment (.maf/.sam), or "" to align.
        ref_t: Reference transcriptome passed to the aligner.
        g_alnm: Optional existing genome alignment file.
        ref_g: Optional reference genome; used when ``g_alnm`` is falsy.

    Returns:
        ``(t_alnm_ext, unaligned_length, out_g, out_t, strandness)`` where
        ``out_t``/``out_g`` are the alignment file paths (``out_g`` is None
        when the genome part is skipped). Unsupported extensions or aligner
        names leave some of these unassigned, in which case the return raises
        UnboundLocalError.
    """
    def _stamp(message):
        # Timestamped progress line on stdout.
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": " + message + "\n")

    def _shell(*pieces):
        # Run the space-joined pieces through the shell.
        call(" ".join(pieces), shell=True)

    out_g = None

    # ---------------- transcriptome ----------------
    if t_alnm != "":
        out_t = t_alnm
        t_alnm_ext = os.path.splitext(t_alnm)[1][1:]
        _stamp("Processing the transcriptome alignment file: " + t_alnm_ext)
        if t_alnm_ext == "maf":
            # Keep only the sequence ("s ") lines of the MAF file.
            filtered_maf_t = prefix + "_transcriptome_alnm_processed.maf"
            _shell("grep '^s '", t_alnm, ">", filtered_maf_t)
            unaligned_length, strandness = get_besthit_maf.besthit_and_unaligned(
                in_fasta, filtered_maf_t, prefix)
        elif t_alnm_ext == "sam":
            unaligned_length, strandness = get_primary_sam.primary_and_unaligned(t_alnm, prefix)
    elif aligner == "minimap2":
        t_alnm_ext = "sam"
        out_t = prefix + "_transcriptome_alnm.sam"
        _stamp("Alignment with minimap2 to reference transcriptome")
        _shell("minimap2 --cs -ax map-ont -t", num_threads, ref_t, in_fasta, ">", out_t)
        unaligned_length, strandness = get_primary_sam.primary_and_unaligned(out_t, prefix)
    elif aligner == "LAST":
        t_alnm_ext = "maf"
        out_t = prefix + "_transcriptome_alnm.maf"
        _stamp("Alignment with LAST to reference transcriptome")
        _shell("lastdb ref_transcriptome", ref_t)
        _shell("lastal -a 1 -P", num_threads, "ref_transcriptome", in_fasta, "| grep '^s ' >", out_t)
        unaligned_length, strandness = get_besthit_maf.besthit_and_unaligned(in_fasta, out_t, prefix)

    # ---------------- genome (optional) ----------------
    if g_alnm or ref_g:
        if g_alnm:
            out_g = g_alnm
            genome_ext = os.path.splitext(g_alnm)[1][1:]
            _stamp("Processing the genome alignment file: " + genome_ext)
            if genome_ext == "maf":
                _shell("grep '^s '", g_alnm, ">", prefix + "_genome_alnm_processed.maf")
        elif aligner == "minimap2":
            out_g = prefix + "_genome_alnm.sam"
            # [EDIT] I may change the options for minimap2 when dealing with cDNA and dRNA reads.
            _stamp("Alignment with minimap2 to reference genome")
            _shell("minimap2 -ax splice -t", num_threads, ref_g, in_fasta, ">", out_g)
        elif aligner == "LAST":
            out_g = prefix + "_genome_alnm.maf"
            _stamp("Alignment with LAST to reference genome")
            _shell("lastdb ref_genome", ref_g)
            _shell("lastal -a 1 -P", num_threads, "ref_genome", in_fasta, "| grep '^s ' >", out_g)

    return t_alnm_ext, unaligned_length, out_g, out_t, strandness
def main():
    """Characterize transcriptome ONT reads according to command-line arguments.

    Steps, in order:
      1. Quantify transcript abundance with minimap2 and
         ``nanopore_transcript_abundance.py`` (exit afterwards with --quantify).
      2. Rewrite the reads into a processed FASTA file.
      3. Convert a GTF annotation to GFF3 if needed and add intron features.
      4. Read reference transcript lengths.
      5. Use the supplied genome/transcriptome alignments or run the aligner.
      6. With --detect_IR, run the intron-retention detection and exit.
      7. Analyse aligned/unaligned read lengths, build error histograms,
         model intron retention and fit the models (unless disabled).
      8. Delete LAST database files and ``*.pyc`` files found below the
         current directory.
    """
    parser = argparse.ArgumentParser(
        description='Given the read profiles from characterization step, '
                    'simulate transcriptome ONT reads and output error profiles',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('-i', '--read', help='Input read for training.', required=True)
    parser.add_argument('-rg', '--ref_g', help='Reference genome.', required=True)
    parser.add_argument('-rt', '--ref_t', help='Reference Transcriptome.', required=True)
    parser.add_argument('-annot', '--annot', help='Annotation file in ensemble GTF/GFF formats.', required=True)
    parser.add_argument('-a', '--aligner', help='The aligner to be used minimap2 or LAST (Default = minimap2)',
                        default='minimap2')
    parser.add_argument('-ga', '--g_alnm', help='Genome alignment file in sam or maf format (optional)',
                        default='')
    parser.add_argument('-ta', '--t_alnm', help='Transcriptome alignment file in sam or maf format (optional)',
                        default='')
    parser.add_argument('-o', '--output', help='The output name and location for profiles', default="training")
    parser.add_argument('--no_model_fit', help='Disable model fitting step', action='store_true')
    parser.add_argument('--no_intron_retention', help='Disable Intron Retention analysis', action='store_true')
    parser.add_argument('--detect_IR', help='Detect Intron Retention events using input reads and exit',
                        action='store_true')
    # type=int: without it a value given on the command line is a string and
    # cannot be compared with 1 or used as a number later on.
    parser.add_argument('-b', '--num_bins', help='Number of bins to be used (Default = 20)', type=int, default=20)
    parser.add_argument('-t', '--num_threads',
                        help='Number of threads to be used in alignments and model fitting (Default = 1)',
                        type=int, default=1)
    parser.add_argument('--quantify', help='Quantify expression profile of input reads', action='store_true')

    args = parser.parse_args()

    infile = args.read
    ref_g = args.ref_g
    ref_t = args.ref_t
    annot = args.annot
    aligner = args.aligner
    g_alnm = args.g_alnm
    t_alnm = args.t_alnm
    outfile = args.output
    num_bins = max(args.num_bins, 1)
    num_threads = max(args.num_threads, 1)
    threads_arg = str(num_threads)  # string form for building command lines
    model_fit = not args.no_model_fit
    intron_retention = not args.no_intron_retention
    detect_IR = args.detect_IR
    quantify = args.quantify

    print("Running the characterization step with following arguments: \n")
    print("infile", infile)
    print("ref_g", ref_g)
    print("ref_t", ref_t)
    print("annot", annot)
    print("aligner", aligner)
    print("g_alnm", g_alnm)
    print("t_alnm", t_alnm)
    print("outfile", outfile)
    print("model_fit", model_fit)
    print("num_bins", num_bins)
    print("num_threads", num_threads)
    print("detect_IR", detect_IR)
    print("quantify", quantify)

    # Quantifying the transcript abundance from input read
    sys.stdout.write('Quantifying transcripts abundance: \n')
    call("minimap2 -t " + threads_arg + " -x map-ont -p0 " + ref_t + " " + infile + " > " + outfile + "_mapping.paf",
         shell=True)
    call("python nanopore_transcript_abundance.py -i " + outfile + "_mapping.paf > " + outfile + "_abundance.tsv",
         shell=True)
    sys.stdout.write('Finished! \n')

    if quantify:
        # Quantification-only run finished normally: report success.
        sys.exit(0)

    # Exactly one of the two alignment files was given.
    if (g_alnm == '') != (t_alnm == ''):
        print("Please specify either both alignment files (-ga and -ta) OR an aligner to use for alignment (-a)")
        usage()
        sys.exit(1)

    if g_alnm != "" and t_alnm != "":
        g_alnm_ext = os.path.splitext(g_alnm)[1][1:]
        t_alnm_ext = os.path.splitext(t_alnm)[1][1:]
        if g_alnm_ext != t_alnm_ext:
            print("Please provide both alignments in a same format: sam OR maf\n")
            usage()
            sys.exit(1)

    # READ PRE-PROCESS AND UNALIGNED READS ANALYSIS
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Read pre-process and unaligned reads analysis\n")

    # Read pre-process: whitespace inside a read name is replaced by '-' and
    # multi-line sequences are joined onto one line.
    in_fasta = outfile + ".fasta"
    if in_fasta == infile:
        in_fasta = outfile + "_processed.fasta"
    dic_reads = {}
    with open(infile, 'r') as f:
        for line in f:
            if line[0] == '>':
                name = '-'.join(line.strip()[1:].split())
                dic_reads[name] = ""
            else:
                dic_reads[name] += line.strip()
    with open(in_fasta, 'w') as out_fasta:
        for k, v in dic_reads.items():
            out_fasta.write('>' + k + '\n' + v + '\n')
    del dic_reads

    # Read the annotation GTF/GFF3 file
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Parse the annotation file (GTF/GFF3)\n")
    # If gtf provided, convert to GFF3 (gt gtf_to_gff3)
    annot_file_extension = os.path.splitext(annot)[1][1:]
    if annot_file_extension.upper() == "GTF":
        gff3_annot = outfile + ".gff3"
        # The space separates the -o output path from the input annotation.
        call("gt gtf_to_gff3 -tidy -o " + gff3_annot + " " + annot, shell=True)
    else:
        gff3_annot = annot

    # Next, add intron info into gff3 (reads the file converted above, or the
    # annotation itself when it was not a GTF file):
    call("gt gff3 -tidy -retainids -checkids -addintrons -o " + outfile + "_addedintron.gff3 " + gff3_annot,
         shell=True)

    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Read the length of reference transcripts \n")
    # Read the length of reference transcripts from the reference transcriptome
    dict_ref_len = {}
    with open(ref_t) as f:
        for line in f:
            if line.startswith(">"):
                ref_id = line.split()[0][1:]
                dict_ref_len[ref_id] = 0
            else:
                dict_ref_len[ref_id] += len(line.strip())

    # If both alignment files are provided:
    if g_alnm != "" and t_alnm != "":
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Processing the alignment files: " + t_alnm_ext + "\n")
        if t_alnm_ext == "maf":
            outmaf_g = outfile + "_genome_alnm.maf"
            outmaf_t = outfile + "_transcriptome_alnm.maf"
            # Never let the filtered copy overwrite the file it is read from.
            if outmaf_g == g_alnm:
                outmaf_g = outfile + "_genome_alnm_processed.maf"
            if outmaf_t == t_alnm:
                outmaf_t = outfile + "_transcriptome_alnm_processed.maf"

            call("grep '^s ' " + g_alnm + " > " + outmaf_g, shell=True)
            call("grep '^s ' " + t_alnm + " > " + outmaf_t, shell=True)

            unaligned_length = list(get_besthit_maf.besthit_and_unaligned(in_fasta, outmaf_t, outfile))

        elif t_alnm_ext == "sam":
            unaligned_length = list(get_primary_sam.primary_and_unaligned(g_alnm, t_alnm, outfile))

        else:
            # Without this branch unaligned_length would be undefined below.
            print("Please specify an acceptable alignment format! (maf or sam)\n")
            usage()
            sys.exit(1)

    else:
        if aligner == "minimap2":
            g_alnm_ext = "sam"
            t_alnm_ext = "sam"
            outsam_g = outfile + "_genome_alnm.sam"
            outsam_t = outfile + "_transcriptome_alnm.sam"

            # Alignment to reference genome
            # [EDIT] I should change the options for minimap when dealing with cDNA and dRNA reads.
            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Alignment with minimap2 to reference genome\n")
            call("minimap2 -ax splice -t " + threads_arg + " " + ref_g + " " + in_fasta + " > " + outsam_g,
                 shell=True)

            # Alignment to reference transcriptome
            sys.stdout.write(
                strftime("%Y-%m-%d %H:%M:%S") + ": Alignment with minimap2 to reference transcriptome\n")
            call("minimap2 --cs -ax map-ont -t " + threads_arg + " " + ref_t + " " + in_fasta + " > " + outsam_t,
                 shell=True)

            # [EDIT] I may add a script to remove minimap2/LAST post-alignment files after alignment.
            unaligned_length = list(get_primary_sam.primary_and_unaligned(outsam_g, outsam_t, outfile))

        elif aligner == "LAST":
            g_alnm_ext = "maf"
            t_alnm_ext = "maf"
            outmaf_g = outfile + "_genome_alnm.maf"
            outmaf_t = outfile + "_transcriptome_alnm.maf"

            # Alignment to reference genome
            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Alignment with LAST to reference genome\n")
            call("lastdb ref_genome " + ref_g, shell=True)
            call("lastal -a 1 -P " + threads_arg + " ref_genome " + in_fasta + " | grep '^s ' > " + outmaf_g,
                 shell=True)

            # Alignment to reference transcriptome
            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Alignment with LAST to reference transcriptome\n")
            call("lastdb ref_transcriptome " + ref_t, shell=True)
            call("lastal -a 1 -P " + threads_arg + " ref_transcriptome " + in_fasta + " | grep '^s ' > " + outmaf_t,
                 shell=True)

            unaligned_length = list(get_besthit_maf.besthit_and_unaligned(in_fasta, outmaf_t, outfile))

        else:
            print("Please specify an acceptable aligner (minimap2 or LAST)\n")
            usage()
            sys.exit(1)

    if detect_IR:
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") + ": Detecting Intron Retention events using input reads\n")
        model_ir.intron_retention(outfile, ref_t)
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished\n")
        # Detection-only run finished normally: report success.
        sys.exit(0)

    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Reads length distribution analysis\n")
    # Aligned reads length distribution analysis
    count_aligned = align.head_align_tail(outfile, num_bins, t_alnm_ext, dict_ref_len)

    # Unaligned reads length distribution analysis
    count_unaligned = len(unaligned_length)
    with open(outfile + "_unaligned_length_ecdf", 'w') as out1:
        if count_unaligned != 0:
            max_length = max(unaligned_length)
            # 50-wide bins; density * bin width summed cumulatively gives the ECDF.
            hist_unaligned, edges_unaligned = numpy.histogram(
                unaligned_length, bins=numpy.arange(0, max_length + 50, 50), density=True)
            cdf = numpy.cumsum(hist_unaligned * 50)
            out1.write("Aligned / Unaligned ratio:" + "\t" + str(count_aligned * 1.0 / count_unaligned) + '\n')
            out1.write("bin\t0-" + str(max_length) + '\n')
            for i, cumulative in enumerate(cdf):
                out1.write(
                    str(edges_unaligned[i]) + '-' + str(edges_unaligned[i + 1]) + "\t" + str(cumulative) + '\n')
        else:
            out1.write("Aligned / Unaligned ratio:\t100%\n")

    # MATCH AND ERROR MODELS
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": match and error models\n")
    error_model.hist(outfile, t_alnm_ext)

    if intron_retention:
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Modeling Intron Retention\n")
        model_ir.intron_retention(outfile, ref_t)

    if model_fit:
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Model fitting\n")
        model_fitting.model_fitting(outfile, int(num_threads))

    # Clean-up below the current directory. Raw strings keep the backslash
    # that stops the shell from expanding the leading '*'.
    call(r"find . -name \*ref_genome.* -delete", shell=True)
    call(r"find . -name \*ref_transcriptome.* -delete", shell=True)
    call(r"find . -name \*.pyc -delete", shell=True)
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")