def main(argv):
    """Run the read characterization step from the command line.

    Four sub-commands are supported: ``genome``, ``transcriptome``,
    ``quantify`` and ``detect_ir``. ``quantify`` and ``detect_ir`` terminate
    the process when they are done; ``genome`` and ``transcriptome`` continue
    into the shared profiling steps (strandness, unaligned read lengths,
    error model and optional model fitting).

    Args:
        argv: Kept for interface compatibility only. The arguments are parsed
            from ``sys.argv`` by argparse, exactly as before.

    Raises:
        SystemExit: With status 1 on usage errors, status 0 after a
            successful ``quantify`` or ``detect_ir`` run.
    """

    def preprocess_reads(reads_path, fasta_path):
        # Rewrite the reads as FASTA, joining header fields with dashes so
        # that every read name is a single whitespace-free token.
        with open(reads_path, 'r') as reads, open(fasta_path, 'w') as processed_fasta:
            for seqN, seqS, seqQ in readfq(reads):
                info = re.split(r'[_\s]\s*', seqN)
                chr_name = "-".join(info)
                processed_fasta.write('>' + chr_name + '\n' + seqS + '\n')

    def add_introns(annot_path, out_prefix):
        # Produce <out_prefix>_addedintron_final.gff3 with intron features.
        annot_file_extension = os.path.splitext(annot_path)[1][1:]
        if annot_file_extension.upper() == "GTF":
            # If gtf provided, convert to GFF3 (gt gtf_to_gff3)
            call("gt gtf_to_gff3 -tidy -force -o " + out_prefix + ".gff3 " + annot_path,
                 shell=True)
            gff3_path = out_prefix + ".gff3"
        else:
            # Use the supplied file as-is instead of assuming that its
            # extension is spelled exactly ".gff3".
            gff3_path = annot_path

        # Next, add intron info into gff3:
        call("gt gff3 -tidy -retainids -checkids -addintrons -sort -force -o " +
             out_prefix + "_addedintron_temp.gff3 " + gff3_path, shell=True)

        # Inherit "transcript_id" information for intron features from exon info
        call("gt bequeath.lua transcript_id < " + out_prefix +
             "_addedintron_temp.gff3 > " + out_prefix + "_addedintron_final.gff3",
             shell=True)

    def make_output_dir(out_prefix):
        # A bare prefix has no directory part; "mkdir -p" without an operand
        # would only produce an error message.
        dir_name = os.path.dirname(out_prefix)
        if dir_name != '':
            call(["mkdir", "-p", dir_name])

    model_fit = True
    intron_retention = True

    parser = argparse.ArgumentParser(
        description='Given the read profiles from characterization step, '
                    'simulate genomic/transcriptic ONT reads and output error profiles',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers(
        help="You may run the simulator on transcriptome or genome mode. You may also only quanity expression profiles.",
        dest='mode')

    parser_g = subparsers.add_parser('genome', help="Run the simulator on genome mode.")
    parser_g.add_argument('-i', '--read', help='Input read for training.', required=True)
    parser_g.add_argument('-rg', '--ref_g', help='Reference genome.', required=True)
    parser_g.add_argument('-a', '--aligner',
                          help='The aligner to be used minimap2 or LAST (Default = minimap2)',
                          default='minimap2')
    parser_g.add_argument('-ga', '--g_alnm',
                          help='Genome alignment file in sam or maf format (optional)',
                          default='')
    parser_g.add_argument('-o', '--output',
                          help='The output name and location for profiles',
                          default="training")
    parser_g.add_argument('--no_model_fit', help='Disable model fitting step',
                          action='store_true')
    # type=int: without it a value given on the command line stays a string
    # and max(args.num_threads, 1) fails on Python 3.
    parser_g.add_argument('-t', '--num_threads',
                          help='Number of threads to be used in alignments and model fitting (Default = 1)',
                          type=int, default=1)

    parser_t = subparsers.add_parser('transcriptome',
                                     help="Run the simulator on transcriptome mode.")
    parser_t.add_argument('-i', '--read', help='Input read for training.', required=True)
    parser_t.add_argument('-rg', '--ref_g', help='Reference genome.', required=False,
                          default='')
    parser_t.add_argument('-rt', '--ref_t', help='Reference Transcriptome.', required=True)
    parser_t.add_argument('-annot', '--annot',
                          help='Annotation file in ensemble GTF/GFF formats.',
                          required=True, default='')
    parser_t.add_argument('-a', '--aligner',
                          help='The aligner to be used: minimap2 or LAST (Default = minimap2)',
                          default='minimap2')
    parser_t.add_argument('-ga', '--g_alnm',
                          help='Genome alignment file in sam or maf format (optional)',
                          default='')
    parser_t.add_argument('-ta', '--t_alnm',
                          help='Transcriptome alignment file in sam or maf format (optional)',
                          default='')
    parser_t.add_argument('-o', '--output',
                          help='The output name and location for profiles',
                          default="training")
    parser_t.add_argument('--no_model_fit', help='Disable model fitting step',
                          action='store_true')
    parser_t.add_argument('--no_intron_retention',
                          help='Disable Intron Retention analysis', action='store_true')
    parser_t.add_argument('-t', '--num_threads',
                          help='Number of threads to be used in alignments and model fitting (Default = 1)',
                          type=int, default=1)

    parser_e = subparsers.add_parser('quantify',
                                     help="Quantify expression profile of transcripts")
    parser_e.add_argument('-o', '--output', help='The output name and location',
                          default="expression")
    parser_e.add_argument('-i', '--read', help='Input reads to use for quantification.',
                          required=True)
    parser_e.add_argument('-rt', '--ref_t', help='Reference Transcriptome.', required=True)
    parser_e.add_argument('-t', '--num_threads',
                          help='Number of threads to be used (Default = 1)',
                          type=int, default=1)

    parser_ir = subparsers.add_parser(
        'detect_ir', help="Detect Intron Retention events using the alignment file")
    parser_ir.add_argument('-annot', '--annot',
                           help='Annotation file in ensemble GTF/GFF formats.',
                           required=True)
    parser_ir.add_argument('-o', '--output', help='The output name and location',
                           default="ir_info")
    parser_ir.add_argument('-ga', '--g_alnm',
                           help='Genome alignment file in sam or maf format',
                           default='', required=True)
    parser_ir.add_argument('-ta', '--t_alnm',
                           help='Transcriptome alignment file in sam or maf format',
                           default='', required=True)

    # The usage shortcuts have to run before parse_args(): every sub-command
    # has required options, so parse_args() would exit with its own error
    # before a bare "<prog> <mode>" invocation could be answered with help.
    mode_parsers = {
        "genome": parser_g,
        "transcriptome": parser_t,
        "detect_ir": parser_ir,
        "quantify": parser_e,
    }
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    if len(sys.argv) == 2 and sys.argv[1] in mode_parsers:
        mode_parsers[sys.argv[1]].print_help(sys.stderr)
        sys.exit(1)

    args = parser.parse_args()

    # parse quantify mode arguments
    if args.mode == "quantify":
        infile = args.read
        ref_t = args.ref_t
        prefix = args.output
        num_threads = max(args.num_threads, 1)

        print("\nrunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_t", ref_t)
        print("prefix", prefix)
        print("num_threads", num_threads)

        # Quantifying the transcript abundance from input read
        sys.stdout.write('Quantifying transcripts abundance: \n')
        call("minimap2 -t " + str(num_threads) + " -x map-ont -p0 " + ref_t + " " +
             infile + " > " + prefix + "_mapping.paf", shell=True)
        # NOTE(review): no script path follows "python" in this command, so
        # the abundance step cannot work as written. The script name is not
        # visible in this file - confirm and restore it.
        call("python -i " + prefix + "_mapping.paf > " + prefix + "_abundance.tsv",
             shell=True)
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
        sys.exit(0)

    if args.mode == "detect_ir":
        annot = args.annot
        prefix = args.output
        g_alnm = args.g_alnm
        t_alnm = args.t_alnm

        if g_alnm == "" or t_alnm == "":
            print("Please provide both alignments in sam format\n")
            parser_ir.print_help(sys.stderr)
            sys.exit(1)

        print("\nrunning the code with following parameters:\n")
        print("annot", annot)
        print("g_alnm", g_alnm)
        print("t_alnm", t_alnm)
        print("prefix", prefix)

        # Read the annotation GTF/GFF3 file
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") +
                         ": Parse the annotation file (GTF/GFF3)\n")
        add_introns(annot, prefix)

        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Modeling Intron Retention\n")
        model_ir.intron_retention(prefix, prefix + "_addedintron_final.gff3", g_alnm, t_alnm)
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
        sys.exit(0)

    if args.mode == "genome":
        infile = args.read
        ref_g = args.ref_g
        aligner = args.aligner
        g_alnm = args.g_alnm
        prefix = args.output
        num_threads = str(max(args.num_threads, 1))
        if args.no_model_fit:
            model_fit = False

        if aligner not in ['minimap2', 'LAST', '']:
            print("Please specify an acceptable aligner (minimap2 or LAST)\n")
            parser_g.print_help(sys.stderr)
            sys.exit(1)

        if g_alnm != '':
            file_extension = os.path.splitext(g_alnm)[1][1:]
            if file_extension not in ['maf', 'sam']:
                print("Please specify an acceptable alignment format! (.maf or .sam)\n")
                parser_g.print_help(sys.stderr)
                sys.exit(1)

        print("\nrunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_g", ref_g)
        print("aligner", aligner)
        print("g_alnm", g_alnm)
        print("prefix", prefix)
        print("num_threads", num_threads)
        print("model_fit", model_fit)

        make_output_dir(prefix)

        # READ PRE-PROCESS AND ALIGNMENT ANALYSIS
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") +
                         ": Read pre-process and unaligned reads analysis\n")
        in_fasta = prefix + "_processed.fasta"
        preprocess_reads(infile, in_fasta)

        alnm_ext, unaligned_length, strandness = align_genome(
            in_fasta, prefix, aligner, num_threads, g_alnm, ref_g)

        # Aligned reads analysis
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Aligned reads analysis\n")
        num_aligned = align.head_align_tail(prefix, alnm_ext, args.mode)

    if args.mode == "transcriptome":
        infile = args.read
        ref_g = args.ref_g
        ref_t = args.ref_t
        annot = args.annot
        aligner = args.aligner
        g_alnm = args.g_alnm
        t_alnm = args.t_alnm
        prefix = args.output
        num_threads = str(max(args.num_threads, 1))
        if args.no_model_fit:
            model_fit = False
        if args.no_intron_retention:
            intron_retention = False

        if aligner not in ['minimap2', 'LAST', '']:
            print("\nPlease specify an acceptable aligner (minimap2 or LAST)\n")
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        if (g_alnm != '' and t_alnm == '') or (g_alnm == '' and t_alnm != ''):
            print("\nPlease specify either both alignment files (-ga and -ta) OR an aligner to use for alignment (-a)")
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        if g_alnm != "" and t_alnm != "":
            g_alnm_ext = os.path.splitext(g_alnm)[1][1:]
            t_alnm_ext = os.path.splitext(t_alnm)[1][1:]
            if g_alnm_ext != t_alnm_ext:
                print("\nPlease provide both alignments in a same format: sam OR maf\n")
                parser_t.print_help(sys.stderr)
                sys.exit(1)
            # development: model IR using MAF alignment formats as well
            if g_alnm_ext == t_alnm_ext == "maf" and intron_retention:
                print("\nThe intron retention only works with sam alignment files for now. Thanks\n")
                parser_t.print_help(sys.stderr)
                sys.exit(1)

        if intron_retention and (ref_g == '' or annot == ''):
            print("\nPlease also input reference genome and annotation file for Intron Retention modeling\n")
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        print("\nrunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_g", ref_g)
        print("ref_t", ref_t)
        print("annot", annot)
        print("aligner", aligner)
        print("g_alnm", g_alnm)
        print("t_alnm", t_alnm)
        print("prefix", prefix)
        print("num_threads", num_threads)
        print("model_fit", model_fit)
        print("intron_retention", intron_retention)

        make_output_dir(prefix)

        # READ PRE-PROCESS AND ALIGNMENT ANALYSIS
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") +
                         ": Read pre-process and unaligned reads analysis\n")
        in_fasta = prefix + "_processed.fasta"
        preprocess_reads(infile, in_fasta)

        # Read the length of reference transcripts from the reference transcriptome
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") +
                         ": Read the length of reference transcripts \n")
        dict_ref_len = {}
        with open(ref_t) as f:
            for seqN, seqS, seqQ in readfq(f):
                info = re.split(r'[_\s]\s*', seqN)
                chr_name = "-".join(info)
                dict_ref_len[chr_name] = len(seqS)

        # Read the annotation GTF/GFF3 file
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") +
                         ": Parse the annotation file (GTF/GFF3)\n")
        add_introns(annot, prefix)

        if intron_retention:
            alnm_ext, unaligned_length, out_g, out_t, strandness = align_transcriptome(
                in_fasta, prefix, aligner, num_threads, t_alnm, ref_t, g_alnm, ref_g)

            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Modeling Intron Retention\n")
            model_ir.intron_retention(prefix, prefix + "_addedintron_final.gff3",
                                      out_g, out_t)
        else:
            alnm_ext, unaligned_length, out_g, out_t, strandness = align_transcriptome(
                in_fasta, prefix, aligner, num_threads, t_alnm, ref_t,
                g_alnm=None, ref_g=None)

        # Aligned reads analysis
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Aligned reads analysis\n")
        num_aligned = align.head_align_tail(prefix, alnm_ext, args.mode, dict_ref_len)

    # strandness of the aligned reads
    with open(prefix + "_strandness_rate", 'w') as strandness_rate:
        strandness_rate.write("strandness:\t" + str(round(strandness, 3)))

    # Length distribution of unaligned reads
    num_unaligned = len(unaligned_length)
    with open(prefix + "_reads_alignment_rate", 'w') as alignment_rate:
        if num_unaligned != 0:
            alignment_rate.write("Aligned / Unaligned ratio:" + "\t" +
                                 str(num_aligned * 1.0 / num_unaligned) + '\n')
            # KernelDensity is fitted on a column vector, one row per read
            unaligned_length_2d = unaligned_length[:, numpy.newaxis]
            kde_unaligned = KernelDensity(bandwidth=10).fit(unaligned_length_2d)
            joblib.dump(kde_unaligned, prefix + "_unaligned_length.pkl")
        else:
            alignment_rate.write("Aligned / Unaligned ratio:\t100%\n")

    del unaligned_length

    # MATCH AND ERROR MODELS
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": match and error models\n")
    error_model.hist(prefix, alnm_ext)

    if model_fit:
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Model fitting\n")
        model_fitting.model_fitting(prefix, int(num_threads))

    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
def main():
    """Run the read characterization step from the command line.

    Four sub-commands are supported: ``genome``, ``transcriptome``,
    ``quantify`` and ``detect_ir``. ``quantify`` and ``detect_ir`` return as
    soon as they are done; ``genome`` and ``transcriptome`` continue into the
    shared profiling steps (strandness, unaligned read lengths, error model
    and optional model fitting).

    Raises:
        SystemExit: With status 1 when no arguments are given or when the
            supplied arguments are inconsistent.
    """

    def preprocess_reads(reads_path, fasta_path):
        # Replace spaces in sequence headers with dashes to create unique
        # header for each read
        with open(reads_path, 'r') as reads, open(fasta_path, 'w') as processed_fasta:
            for seqN, seqS, seqQ in readfq(reads):
                info = re.split(r'[_\s]\s*', seqN)
                chr_name = "-".join(info)
                processed_fasta.write('>' + chr_name + '\n' + seqS + '\n')

    def make_output_dir(out_prefix):
        # Only create a directory when the prefix actually contains one
        dir_name = os.path.dirname(out_prefix)
        if dir_name != '':
            call(["mkdir", "-p", dir_name])

    def check_alnm_format(alnm_path, mode_parser):
        # Exit with the mode's help text unless the file ends in .maf or .sam
        file_extension = os.path.splitext(alnm_path)[1][1:]
        if file_extension not in ['maf', 'sam']:
            print("Please specify an acceptable alignment format! (.maf or .sam)\n")
            mode_parser.print_help(sys.stderr)
            sys.exit(1)

    parser = argparse.ArgumentParser(
        description=dedent('''
        Read characterization step
        -----------------------------------------------------------
        Given raw ONT reads, reference genome and/or transcriptome,
        learn read features and output error profiles
        '''),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-v', '--version', action='version', version='NanoSim ' + VERSION)
    subparsers = parser.add_subparsers(dest='mode', description=dedent('''
        There are four modes in read_analysis.
        For detailed usage of each mode:
            mode -h
        -------------------------------------------------------
        '''))

    parser_g = subparsers.add_parser('genome', help="Run the simulator on genome mode")
    parser_g.add_argument('-i', '--read', help='Input read for training', required=True)
    parser_g.add_argument('-rg', '--ref_g',
                          help='Reference genome, not required if genome alignment file is provided',
                          default='')
    parser_g.add_argument('-a', '--aligner',
                          help='The aligner to be used, minimap2 or LAST (Default = minimap2)',
                          choices=['minimap2', 'LAST'], default='minimap2')
    parser_g.add_argument('-ga', '--g_alnm',
                          help='Genome alignment file in sam or maf format (optional)',
                          default='')
    parser_g.add_argument('-o', '--output',
                          help='The location and prefix of outputting profiles (Default = training)',
                          default='training')
    # store_false + dest: the parsed attribute directly answers "fit the
    # model?" (True unless the --no_* switch is given).
    parser_g.add_argument('--no_model_fit', help='Disable model fitting step',
                          action='store_false', default=True, dest='model_fit')
    parser_g.add_argument('-t', '--num_threads',
                          help='Number of threads for alignment and model fitting (Default = 1)',
                          type=int, default=1)

    parser_t = subparsers.add_parser('transcriptome',
                                     help="Run the simulator on transcriptome mode")
    parser_t.add_argument('-i', '--read', help='Input read for training', required=True)
    parser_t.add_argument('-rg', '--ref_g', help='Reference genome', required=True)
    parser_t.add_argument('-rt', '--ref_t', help='Reference Transcriptome', required=True)
    parser_t.add_argument('-annot', '--annotation',
                          help='Annotation file in ensemble GTF/GFF formats, '
                               'required for intron retention detection',
                          default='')
    parser_t.add_argument('-a', '--aligner',
                          help='The aligner to be used: minimap2 or LAST (Default = minimap2)',
                          choices=['minimap2', 'LAST'], default='minimap2')
    parser_t.add_argument('-ga', '--g_alnm',
                          help='Genome alignment file in sam or maf format (optional)',
                          default='')
    parser_t.add_argument('-ta', '--t_alnm',
                          help='Transcriptome alignment file in sam or maf format (optional)',
                          default='')
    parser_t.add_argument('-o', '--output',
                          help='The location and prefix of outputting profiles (Default = training)',
                          default='training')
    parser_t.add_argument('--no_model_fit', help='Disable model fitting step',
                          action='store_false', default=True, dest='model_fit')
    parser_t.add_argument('--no_intron_retention',
                          help='Disable Intron Retention analysis',
                          action='store_false', default=True, dest='intron_retention')
    parser_t.add_argument('-t', '--num_threads',
                          help='Number of threads for alignment and model fitting (Default = 1)',
                          type=int, default=1)

    parser_e = subparsers.add_parser('quantify',
                                     help="Quantify expression profile of transcripts")
    parser_e.add_argument('-i', '--read', help='Input reads for quantification',
                          required=True)
    parser_e.add_argument('-rt', '--ref_t', help='Reference Transcriptome', required=True)
    parser_e.add_argument('-o', '--output',
                          help='The location and prefix of outputting profile (Default = expression)',
                          default='expression')
    parser_e.add_argument('-t', '--num_threads',
                          help='Number of threads for alignment (Default = 1)',
                          type=int, default=1)

    parser_ir = subparsers.add_parser(
        'detect_ir', help="Detect Intron Retention events using the alignment file")
    parser_ir.add_argument('-annot', '--annotation',
                           help='Annotation file in ensemble GTF/GFF formats',
                           required=True)
    parser_ir.add_argument('-i', '--read',
                           help='Input read for training, not required if alignment files are provided',
                           default='')
    parser_ir.add_argument('-rg', '--ref_g',
                           help='Reference genome, not required if genome alignment file is provided',
                           default='')
    parser_ir.add_argument('-rt', '--ref_t',
                           help='Reference Transcriptome, not required if transcriptome alignment '
                                'file is provided',
                           default='')
    parser_ir.add_argument('-a', '--aligner',
                           help='The aligner to be used: minimap2 or LAST (Default = minimap2)',
                           choices=['minimap2', 'LAST'], default='minimap2')
    parser_ir.add_argument('-o', '--output', help='The output name and location',
                           required=False, default='ir_info')
    parser_ir.add_argument('-ga', '--g_alnm',
                           help='Genome alignment file in sam or maf format (optional)',
                           default='')
    parser_ir.add_argument('-ta', '--t_alnm',
                           help='Transcriptome alignment file in sam or maf format (optional)',
                           default='')
    parser_ir.add_argument('-t', '--num_threads',
                           help='Number of threads for alignment (Default = 1)',
                           type=int, default=1)

    # Answer a bare invocation with the help text before argparse gets a
    # chance to complain about a missing sub-command.
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    args = parser.parse_args()

    # parse quantify mode arguments
    if args.mode == "quantify":
        infile = args.read
        ref_t = args.ref_t
        prefix = args.output
        num_threads = str(max(args.num_threads, 1))

        print("\nrunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_t", ref_t)
        print("prefix", prefix)
        print("num_threads", num_threads)

        make_output_dir(prefix)

        # Quantifying the transcript abundance from input read
        sys.stdout.write('Quantifying transcripts abundance: \n')
        map_file = prefix + '_mapping.paf'
        call("minimap2 -t " + num_threads + " -x map-ont -p0 " + ref_t + " " + infile +
             " > " + map_file, shell=True)

        # Get the script path
        script_dir = os.path.dirname(os.path.realpath(__file__))
        out_file = prefix + '_abundance.tsv'
        # NOTE(review): the command names only the script directory, not a
        # script inside it, so the abundance step cannot work as written.
        # The script name is not visible in this file - confirm and restore.
        call("python " + script_dir + "/ -i " + map_file + " > " + out_file, shell=True)
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
        return

    # parse detect_ir mode arguments
    if args.mode == "detect_ir":
        annot = args.annotation
        infile = args.read
        prefix = args.output
        aligner = args.aligner
        ref_g = args.ref_g
        ref_t = args.ref_t
        g_alnm = args.g_alnm
        t_alnm = args.t_alnm
        num_threads = str(max(args.num_threads, 1))

        if g_alnm == '' and ref_g == '':
            print("Please supply a reference genome or genome alignment file\n")
            parser_ir.print_help(sys.stderr)
            sys.exit(1)

        if t_alnm == '' and ref_t == '':
            print("Please supply a reference transcriptome or transcriptome alignment file\n")
            parser_ir.print_help(sys.stderr)
            sys.exit(1)

        # Reads are optional only when both alignments are supplied
        if infile == '' and (g_alnm == '' or t_alnm == ''):
            print("Please supply input reads when alignment files are not provided\n")
            parser_ir.print_help(sys.stderr)
            sys.exit(1)

        # check validity of parameters
        if g_alnm != '':
            check_alnm_format(g_alnm, parser_ir)
        if t_alnm != '':
            check_alnm_format(t_alnm, parser_ir)

        print("\nrunning the code with following parameters:\n")
        print("annot", annot)
        print("infile", infile)
        print("aligner", aligner)
        print("ref_g", ref_g)
        print("ref_t", ref_t)
        print("g_alnm", g_alnm)
        print("t_alnm", t_alnm)
        print("prefix", prefix)

        make_output_dir(prefix)

        # Alignment if maf/sam file not provided, and post process them to
        # include only primary alignments.
        # NOTE(review): unpacked as five values to agree with the
        # transcriptome-mode call of the same helper below - confirm against
        # align_transcriptome's return statement.
        _, _, g_alnm, t_alnm, _ = align_transcriptome(
            infile, prefix, aligner, num_threads, t_alnm, ref_t, g_alnm, ref_g)

        # Add introns to annotation file
        add_intron(annot, prefix)

        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Modeling Intron Retention\n")
        model_ir.intron_retention(prefix, prefix + "_added_intron_final.gff3", g_alnm, t_alnm)

        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
        return

    if args.mode == "genome":
        infile = args.read
        ref_g = args.ref_g
        aligner = args.aligner
        g_alnm = args.g_alnm
        prefix = args.output
        num_threads = str(max(args.num_threads, 1))
        model_fit = args.model_fit

        # check validity of parameters
        if g_alnm != '':
            check_alnm_format(g_alnm, parser_g)

        if g_alnm == '' and ref_g == '':
            print("Please supply a reference genome or genome alignment file\n")
            parser_g.print_help(sys.stderr)
            sys.exit(1)

        print("\nRunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_g", ref_g)
        print("aligner", aligner)
        print("g_alnm", g_alnm)
        print("prefix", prefix)
        print("num_threads", num_threads)
        print("model_fit", model_fit)

        make_output_dir(prefix)

        # READ PRE-PROCESS AND ALIGNMENT ANALYSIS
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Read pre-process\n")
        in_fasta = prefix + "_processed.fasta"
        preprocess_reads(infile, in_fasta)

        alnm_ext, unaligned_length, strandness = align_genome(
            in_fasta, prefix, aligner, num_threads, g_alnm, ref_g)

        # Aligned reads analysis
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Aligned reads analysis\n")
        num_aligned = align.head_align_tail(prefix, alnm_ext, args.mode)

    if args.mode == "transcriptome":
        infile = args.read
        ref_g = args.ref_g
        ref_t = args.ref_t
        annot = args.annotation
        aligner = args.aligner
        g_alnm = args.g_alnm
        t_alnm = args.t_alnm
        prefix = args.output
        num_threads = str(max(args.num_threads, 1))
        model_fit = args.model_fit
        ir = args.intron_retention

        # Usage errors in this mode show the transcriptome help, not the
        # detect_ir help.
        if ir and g_alnm == '' and ref_g == '':
            print("For intron retention function, please supply a reference genome or genome alignment file\n")
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        if t_alnm == '' and ref_t == '':
            print("Please supply a reference transcriptome or transcriptome alignment file\n")
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        if g_alnm != '' and t_alnm != '':
            g_alnm_ext = os.path.splitext(g_alnm)[1][1:]
            t_alnm_ext = os.path.splitext(t_alnm)[1][1:]
            if g_alnm_ext != t_alnm_ext or g_alnm_ext not in ['maf', 'sam']:
                print("\nPlease provide both alignments in a same format: sam OR maf\n")
                parser_t.print_help(sys.stderr)
                sys.exit(1)
            # Development: model IR using MAF alignment formats as well
            if g_alnm_ext == t_alnm_ext == "maf" and ir:
                print("\nThe intron retention only works with sam alignment files for now. Thanks\n")
                parser_t.print_help(sys.stderr)
                sys.exit(1)

        if ir and (ref_g == '' or annot == ''):
            print("\nPlease also input reference genome and annotation file for Intron Retention modeling\n")
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        print("\nrunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_g", ref_g)
        print("ref_t", ref_t)
        print("annot", annot)
        print("aligner", aligner)
        print("g_alnm", g_alnm)
        print("t_alnm", t_alnm)
        print("prefix", prefix)
        print("num_threads", num_threads)
        print("model_fit", model_fit)
        print("intron_retention", ir)

        make_output_dir(prefix)

        # READ PRE-PROCESS AND ALIGNMENT ANALYSIS
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") +
                         ": Read pre-process and unaligned reads analysis\n")
        in_fasta = prefix + "_processed.fasta"
        preprocess_reads(infile, in_fasta)

        # Read the length of reference transcripts from the reference transcriptome
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") +
                         ": Read the length of reference transcripts \n")
        dict_ref_len = {}
        with open(ref_t) as f:
            for seqN, seqS, seqQ in readfq(f):
                info = re.split(r'[_\s]\s*', seqN)
                chr_name = "-".join(info)
                dict_ref_len[chr_name] = len(seqS)

        alnm_ext, unaligned_length, g_alnm, t_alnm, strandness = \
            align_transcriptome(in_fasta, prefix, aligner, num_threads, t_alnm, ref_t,
                                g_alnm, ref_g)

        if ir:
            # Add introns to annotation file
            add_intron(annot, prefix)

            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Modeling Intron Retention\n")
            model_ir.intron_retention(prefix, prefix + "_added_intron_final.gff3",
                                      g_alnm, t_alnm)

        # Aligned reads analysis
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Aligned reads analysis\n")
        num_aligned = align.head_align_tail(prefix + "_transcriptome", alnm_ext, args.mode,
                                            dict_ref_len)

    # strandness of the aligned reads
    with open(prefix + "_strandness_rate", 'w') as strandness_rate:
        strandness_rate.write("strandness:\t" + str(round(strandness, 3)))

    # Length distribution of unaligned reads
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Unaligned reads analysis\n")
    num_unaligned = len(unaligned_length)
    with open(prefix + "_reads_alignment_rate", 'w') as alignment_rate:
        if num_unaligned != 0:
            alignment_rate.write("Aligned / Unaligned ratio:" + "\t" +
                                 str(num_aligned * 1.0 / num_unaligned) + '\n')
            # KernelDensity is fitted on a column vector, one row per read
            unaligned_length_2d = unaligned_length[:, numpy.newaxis]
            kde_unaligned = KernelDensity(bandwidth=10).fit(unaligned_length_2d)
            joblib.dump(kde_unaligned, prefix + "_unaligned_length.pkl")
        else:
            alignment_rate.write("Aligned / Unaligned ratio:\t100%\n")

    del unaligned_length

    # MATCH AND ERROR MODELS
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": match and error models\n")
    if args.mode == "transcriptome":
        # Use primary genome alignment for error profiling
        error_model.hist(prefix + "_genome", alnm_ext)
    else:
        error_model.hist(prefix, alnm_ext)

    if model_fit:
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Model fitting\n")
        model_fitting.model_fitting(prefix, int(num_threads))

    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
def _log_step(message):
    """Write a timestamped progress line ("<time>: <message>" + newline) to stdout."""
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": " + message + "\n")


def _write_processed_fasta(infile, in_fasta):
    """Rewrite the FASTA file `infile` into `in_fasta` with one sequence line per read.

    Header whitespace is collapsed into '-' so that every read name is a single
    token. Sequence text that appears before the first header is ignored.
    """
    dic_reads = {}
    name = None
    with open(infile, 'r') as f:
        for line in f:
            if line.startswith('>'):
                name = '-'.join(line.strip()[1:].split())
                dic_reads[name] = []
            elif name is not None:
                dic_reads[name].append(line.strip())

    with open(in_fasta, 'w') as out_fasta:
        for read_name, chunks in dic_reads.items():
            out_fasta.write('>' + read_name + '\n' + ''.join(chunks) + '\n')


def _read_reference_lengths(ref_t):
    """Return a dict mapping each FASTA record id in `ref_t` to its sequence length.

    The id is the first whitespace-delimited token of the header, without '>'.
    Sequence text that appears before the first header is ignored.
    """
    dict_ref_len = {}
    ref_id = None
    with open(ref_t) as f:
        for line in f:
            if line.startswith(">"):
                ref_id = line.split()[0][1:]
                dict_ref_len[ref_id] = 0
            elif ref_id is not None:
                dict_ref_len[ref_id] += len(line.strip())
    return dict_ref_len


def main():
    """Command-line entry point for the transcriptome characterization step.

    Parses the command line (``sys.argv``) and then, in order:

    1. quantifies transcript abundance (exits afterwards when ``--quantify``
       is given),
    2. validates the optional user-supplied alignment files,
    3. normalizes the input reads into a processed FASTA file,
    4. converts/augments the annotation with the external ``gt`` tool,
    5. uses the supplied alignments or runs minimap2 / LAST,
    6. optionally detects intron retention and exits (``--detect_IR``),
    7. writes the unaligned-read length ECDF, the error model, the intron
       retention model and the fitted models,
    8. removes temporary LAST database files and ``*.pyc`` files below the
       current directory.

    Exits with status 0 after a successful ``--quantify`` / ``--detect_IR``
    run, status 1 on invalid option combinations, and status 2 (argparse) on
    malformed command lines.
    """
    parser = argparse.ArgumentParser(
        description='Given the read profiles from characterization step, '
                    'simulate transcriptome ONT reads and output error profiles',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('-i', '--read', help='Input read for training.',
                        required=True)
    parser.add_argument('-rg', '--ref_g', help='Reference genome.',
                        required=True)
    parser.add_argument('-rt', '--ref_t', help='Reference Transcriptome.',
                        required=True)
    parser.add_argument('-annot', '--annot',
                        help='Annotation file in ensemble GTF/GFF formats.',
                        required=True)
    parser.add_argument(
        '-a', '--aligner',
        help='The aligner to be used minimap2 or LAST (Default = minimap2)',
        default='minimap2')
    parser.add_argument(
        '-ga', '--g_alnm',
        help='Genome alignment file in sam or maf format (optional)',
        default='')
    parser.add_argument(
        '-ta', '--t_alnm',
        help='Transcriptome alignment file in sam or maf format (optional)',
        default='')
    parser.add_argument('-o', '--output',
                        help='The output name and location for profiles',
                        default="training")
    parser.add_argument('--no_model_fit', help='Disable model fitting step',
                        action='store_true')
    parser.add_argument('--no_intron_retention',
                        help='Disable Intron Retention analysis',
                        action='store_true')
    parser.add_argument(
        '--detect_IR',
        help='Detect Intron Retention events using input reads and exit',
        action='store_true')
    # type=int so that values given on the command line are integers, like the
    # defaults; otherwise max() below would compare a str with an int.
    parser.add_argument('-b', '--num_bins',
                        help='Number of bins to be used (Default = 20)',
                        type=int, default=20)
    parser.add_argument(
        '-t', '--num_threads',
        help='Number of threads to be used in alignments and model fitting (Default = 1)',
        type=int, default=1)
    parser.add_argument('--quantify',
                        help='Quantify expression profile of input reads',
                        action='store_true')

    args = parser.parse_args()

    # The reads come from -i/--read (not from the reference genome).
    infile = args.read
    ref_g = args.ref_g
    ref_t = args.ref_t
    annot = args.annot
    aligner = args.aligner
    g_alnm = args.g_alnm
    t_alnm = args.t_alnm
    outfile = args.output
    num_bins = max(args.num_bins, 1)
    num_threads = max(args.num_threads, 1)
    model_fit = not args.no_model_fit
    intron_retention = not args.no_intron_retention
    detect_IR = args.detect_IR
    quantify = args.quantify

    print("Running the characterization step with following arguments: \n")
    print("infile", infile)
    print("ref_g", ref_g)
    print("ref_t", ref_t)
    print("annot", annot)
    print("aligner", aligner)
    print("g_alnm", g_alnm)
    print("t_alnm", t_alnm)
    print("outfile", outfile)
    print("model_fit", model_fit)
    print("num_bins", num_bins)
    print("num_threads", num_threads)
    print("detect_IR", detect_IR)
    print("quantify", quantify)

    # Quantifying the transcript abundance from input read
    sys.stdout.write('Quantifying transcripts abundance: \n')
    call("minimap2 -t " + str(num_threads) + " -x map-ont -p0 " + ref_t + " " +
         infile + " > " + outfile + "_mapping.paf", shell=True)
    # NOTE(review): "python -i <paf>" names no script, so this looks like the
    # abundance script path was lost; confirm the intended command.
    call("python -i " + outfile + "_mapping.paf > " + outfile +
         "_abundance.tsv", shell=True)
    sys.stdout.write('Finished! \n')

    if quantify:
        # Quantification-only run completed successfully.
        sys.exit(0)

    if (g_alnm != '') != (t_alnm != ''):
        print(
            "Please specify either both alignment files (-ga and -ta) OR an aligner to use for alignment (-a)"
        )
        usage()
        sys.exit(1)

    alignments_provided = g_alnm != "" and t_alnm != ""
    t_alnm_ext = ""
    if alignments_provided:
        g_alnm_ext = os.path.splitext(g_alnm)[1][1:]
        t_alnm_ext = os.path.splitext(t_alnm)[1][1:]
        # Both files must share one of the two supported formats; anything
        # else would leave the unaligned-read lengths undefined further down.
        if g_alnm_ext != t_alnm_ext or t_alnm_ext not in ("sam", "maf"):
            print(
                "Please provide both alignments in a same format: sam OR maf\n"
            )
            usage()
            sys.exit(1)

    # READ PRE-PROCESS AND UNALIGNED READS ANALYSIS
    _log_step("Read pre-process and unaligned reads analysis")

    # Read pre-process; never overwrite the input file itself.
    in_fasta = outfile + ".fasta"
    if in_fasta == infile:
        in_fasta = outfile + "_processed.fasta"
    _write_processed_fasta(infile, in_fasta)

    # Read the annotation GTF/GFF3 file
    _log_step("Parse the annotation file (GTF/GFF3)")

    # If gtf provided, convert to GFF3 (gt gtf_to_gff3)
    annot_filename, annot_file_extension = os.path.splitext(annot)
    annot_file_extension = annot_file_extension[1:]
    if annot_file_extension.upper() == "GTF":
        gff3_annot = outfile + ".gff3"
        call("gt gtf_to_gff3 -tidy -o " + gff3_annot + " " + annot,
             shell=True)
    else:
        gff3_annot = annot_filename + ".gff3"

    # Next, add intron info into gff3 (reads the converted file when the
    # annotation was supplied as GTF):
    call("gt gff3 -tidy -retainids -checkids -addintrons -o " + outfile +
         "_addedintron.gff3 " + gff3_annot, shell=True)

    # Read the length of reference transcripts from the reference transcriptome
    _log_step("Read the length of reference transcripts ")
    dict_ref_len = _read_reference_lengths(ref_t)

    if alignments_provided:
        # Both alignment files are provided
        _log_step("Processing the alignment files: " + t_alnm_ext)
        if t_alnm_ext == "maf":
            outmaf_g = outfile + "_genome_alnm.maf"
            outmaf_t = outfile + "_transcriptome_alnm.maf"
            if outmaf_g == g_alnm:
                outmaf_g = outfile + "_genome_alnm_processed.maf"
            if outmaf_t == t_alnm:
                outmaf_t = outfile + "_transcriptome_alnm_processed.maf"

            call("grep '^s ' " + g_alnm + " > " + outmaf_g, shell=True)
            call("grep '^s ' " + t_alnm + " > " + outmaf_t, shell=True)

            unaligned_length = list(
                get_besthit_maf.besthit_and_unaligned(in_fasta, outmaf_t,
                                                      outfile))
        else:  # "sam", guaranteed by the validation above
            unaligned_length = list(
                get_primary_sam.primary_and_unaligned(g_alnm, t_alnm,
                                                      outfile))
    elif aligner == "minimap2":
        t_alnm_ext = "sam"
        outsam_g = outfile + "_genome_alnm.sam"
        outsam_t = outfile + "_transcriptome_alnm.sam"

        # Alignment to reference genome
        # [EDIT] I should change the options for minimap when dealing with cDNA and dRNA reads.
        _log_step("Alignment with minimap2 to reference genome")
        call("minimap2 -ax splice " + ref_g + " " + in_fasta + " > " +
             outsam_g, shell=True)

        # Alignment to reference transcriptome
        _log_step("Alignment with minimap2 to reference transcriptome")
        call("minimap2 --cs -ax map-ont " + ref_t + " " + in_fasta + " > " +
             outsam_t, shell=True)

        # [EDIT] I may add a script to remove minimap2/LAST post-alignment files after alignment.
        unaligned_length = list(
            get_primary_sam.primary_and_unaligned(outsam_g, outsam_t,
                                                  outfile))
    elif aligner == "LAST":
        t_alnm_ext = "maf"
        outmaf_g = outfile + "_genome_alnm.maf"
        outmaf_t = outfile + "_transcriptome_alnm.maf"

        # Alignment to reference genome
        _log_step("Alignment with LAST to reference genome")
        call("lastdb ref_genome " + ref_g, shell=True)
        call("lastal -a 1 -P " + str(num_threads) + " ref_genome " +
             in_fasta + " | grep '^s ' > " + outmaf_g, shell=True)

        # Alignment to reference transcriptome
        _log_step("Alignment with LAST to reference transcriptome")
        call("lastdb ref_transcriptome " + ref_t, shell=True)
        call("lastal -a 1 -P " + str(num_threads) + " ref_transcriptome " +
             in_fasta + " | grep '^s ' > " + outmaf_t, shell=True)

        unaligned_length = list(
            get_besthit_maf.besthit_and_unaligned(in_fasta, outmaf_t,
                                                  outfile))
    else:
        print("Please specify an acceptable aligner (minimap2 or LAST)\n")
        usage()
        sys.exit(1)

    if detect_IR:
        _log_step("Detecting Intron Retention events using input reads")
        model_ir.intron_retention(outfile, ref_t)
        _log_step("Finished")
        # Detection-only run completed successfully.
        sys.exit(0)

    _log_step("Reads length distribution analysis")

    # Aligned reads length distribution analysis
    count_aligned = align.head_align_tail(outfile, num_bins, t_alnm_ext,
                                          dict_ref_len)

    # Unaligned reads length distribution analysis
    count_unaligned = len(unaligned_length)
    with open(outfile + "_unaligned_length_ecdf", 'w') as out1:
        if count_unaligned != 0:
            max_length = max(unaligned_length)
            # Density histogram over 50-wide bins; density * bin width summed
            # cumulatively gives the empirical CDF per bin.
            hist_unaligned, edges_unaligned = numpy.histogram(
                unaligned_length,
                bins=numpy.arange(0, max_length + 50, 50),
                density=True)
            cdf = numpy.cumsum(hist_unaligned * 50)
            out1.write("Aligned / Unaligned ratio:" + "\t" +
                       str(count_aligned * 1.0 / count_unaligned) + '\n')
            out1.write("bin\t0-" + str(max_length) + '\n')
            for i, cumulative in enumerate(cdf):
                out1.write(
                    str(edges_unaligned[i]) + '-' +
                    str(edges_unaligned[i + 1]) + "\t" + str(cumulative) +
                    '\n')
        else:
            out1.write("Aligned / Unaligned ratio:\t100%\n")

    # MATCH AND ERROR MODELS
    _log_step("match and error models")
    error_model.hist(outfile, t_alnm_ext)

    if intron_retention:
        _log_step("Modeling Intron Retention")
        model_ir.intron_retention(outfile, ref_t)

    if model_fit:
        _log_step("Model fitting")
        model_fitting.model_fitting(outfile, int(num_threads))

    # Clean up LAST database files and compiled Python files below the
    # current directory (raw strings keep the shell escape "\*" intact).
    call(r"find . -name \*ref_genome.* -delete", shell=True)
    call(r"find . -name \*ref_transcriptome.* -delete", shell=True)
    call(r"find . -name \*.pyc -delete", shell=True)

    _log_step("Finished!")