Example #1
0
def _write_processed_fasta(infile, in_fasta):
    """Write the reads of ``infile`` to ``in_fasta`` as FASTA with rewritten headers.

    Every read name is split on underscores/whitespace and re-joined with
    dashes before it is written as the FASTA header line.
    """
    # The output is opened first (same order as the previous inline code);
    # the ``with`` statement closes both handles even if parsing fails.
    with open(in_fasta, 'w') as processed_fasta, open(infile, 'r') as f:
        for seqN, seqS, seqQ in readfq(f):
            info = re.split(r'[_\s]\s*', seqN)
            chr_name = "-".join(info)
            processed_fasta.write('>' + chr_name + '\n' + seqS + '\n')


def _add_introns_to_annotation(annot, prefix):
    """Create ``<prefix>_addedintron_final.gff3`` from a GTF/GFF3 annotation.

    Runs the external ``gt`` command (presumably GenomeTools -- confirm it is
    on PATH) to convert GTF to GFF3 when needed, add intron features, and
    copy ``transcript_id`` onto the intron features.

    Returns the path of the final GFF3 file.
    """
    sys.stdout.write(
        strftime("%Y-%m-%d %H:%M:%S") +
        ": Parse the annotation file (GTF/GFF3)\n")

    annot_file_extension = os.path.splitext(annot)[1][1:]
    if annot_file_extension.upper() == "GTF":
        # If gtf provided, convert to GFF3 (gt gtf_to_gff3)
        call("gt gtf_to_gff3 -tidy -force -o " + prefix + ".gff3 " + annot,
             shell=True)
        gff3_file = prefix + ".gff3"
    else:
        # Use the annotation path exactly as given. Rebuilding it as
        # "<stem>.gff3" pointed at a nonexistent file for inputs such as
        # "x.gff" or "x.GFF3".
        gff3_file = annot

    # Next, add intron info into gff3:
    call(
        "gt gff3 -tidy -retainids -checkids -addintrons -sort -force -o " +
        prefix + "_addedintron_temp.gff3 " + gff3_file,
        shell=True)

    # Inherit "transcript_id" information for intron features from exon info
    final_gff3 = prefix + "_addedintron_final.gff3"
    call("gt bequeath.lua transcript_id < " + prefix +
         "_addedintron_temp.gff3 > " + final_gff3,
         shell=True)
    return final_gff3


def main(argv):
    """Command-line entry point for read characterization.

    Four sub-commands are supported:

    * ``genome`` / ``transcriptome``: pre-process the reads, align them (or
      use the supplied alignment files), then write strandness, alignment
      rate, unaligned-length and error-model outputs under the output
      prefix, followed by model fitting unless ``--no_model_fit`` is given.
    * ``quantify``: map reads with minimap2 and run
      ``nanopore_transcript_abundance.py`` on the resulting PAF file.
    * ``detect_ir``: add introns to the annotation and model intron
      retention from the supplied alignment files.

    Args:
        argv: Kept for interface compatibility. It is not read: the
            arguments are taken from ``sys.argv`` (``parse_args()`` is called
            without parameters and ``len(sys.argv)`` drives the help paths).

    Exits with status 1 on usage/validation errors and with status 0 after a
    successful ``quantify`` or ``detect_ir`` run. The ``genome`` and
    ``transcriptome`` modes return normally.
    """
    prefix = 'training'
    model_fit = True
    intron_retention = True

    parser = argparse.ArgumentParser(
        description=('Given the read profiles from characterization step, '
                     'simulate genomic/transcriptic ONT reads and output error profiles'),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    subparsers = parser.add_subparsers(
        help="You may run the simulator on transcriptome or genome mode. You may also only quanity expression profiles.",
        dest='mode')

    # ---- genome mode -------------------------------------------------------
    parser_g = subparsers.add_parser('genome',
                                     help="Run the simulator on genome mode.")
    parser_g.add_argument('-i', '--read',
                          help='Input read for training.',
                          required=True)
    parser_g.add_argument('-rg', '--ref_g',
                          help='Reference genome.',
                          required=True)
    parser_g.add_argument(
        '-a', '--aligner',
        help='The aligner to be used minimap2 or LAST (Default = minimap2)',
        default='minimap2')
    parser_g.add_argument(
        '-ga', '--g_alnm',
        help='Genome alignment file in sam or maf format (optional)',
        default='')
    parser_g.add_argument('-o', '--output',
                          help='The output name and location for profiles',
                          default="training")
    parser_g.add_argument('--no_model_fit',
                          help='Disable model fitting step',
                          action='store_true')
    # type=int: without it a user-supplied value stays a string and the later
    # max(args.num_threads, 1) raises TypeError on Python 3.
    parser_g.add_argument(
        '-t', '--num_threads',
        help='Number of threads to be used in alignments and model fitting (Default = 1)',
        type=int,
        default=1)

    # ---- transcriptome mode ------------------------------------------------
    parser_t = subparsers.add_parser(
        'transcriptome', help="Run the simulator on transcriptome mode.")
    parser_t.add_argument('-i', '--read',
                          help='Input read for training.',
                          required=True)
    parser_t.add_argument('-rg', '--ref_g',
                          help='Reference genome.',
                          required=False,
                          default='')
    parser_t.add_argument('-rt', '--ref_t',
                          help='Reference Transcriptome.',
                          required=True)
    parser_t.add_argument('-annot', '--annot',
                          help='Annotation file in ensemble GTF/GFF formats.',
                          required=True,
                          default='')
    parser_t.add_argument(
        '-a', '--aligner',
        help='The aligner to be used: minimap2 or LAST (Default = minimap2)',
        default='minimap2')
    parser_t.add_argument(
        '-ga', '--g_alnm',
        help='Genome alignment file in sam or maf format (optional)',
        default='')
    parser_t.add_argument(
        '-ta', '--t_alnm',
        help='Transcriptome alignment file in sam or maf format (optional)',
        default='')
    parser_t.add_argument('-o', '--output',
                          help='The output name and location for profiles',
                          default="training")
    parser_t.add_argument('--no_model_fit',
                          help='Disable model fitting step',
                          action='store_true')
    parser_t.add_argument('--no_intron_retention',
                          help='Disable Intron Retention analysis',
                          action='store_true')
    parser_t.add_argument(
        '-t', '--num_threads',
        help='Number of threads to be used in alignments and model fitting (Default = 1)',
        type=int,
        default=1)

    # ---- quantify mode -----------------------------------------------------
    parser_e = subparsers.add_parser(
        'quantify', help="Quantify expression profile of transcripts")
    parser_e.add_argument('-o', '--output',
                          help='The output name and location',
                          default="expression")
    parser_e.add_argument('-i', '--read',
                          help='Input reads to use for quantification.',
                          required=True)
    parser_e.add_argument('-rt', '--ref_t',
                          help='Reference Transcriptome.',
                          required=True)
    parser_e.add_argument('-t', '--num_threads',
                          help='Number of threads to be used (Default = 1)',
                          type=int,
                          default=1)

    # ---- detect_ir mode ----------------------------------------------------
    parser_ir = subparsers.add_parser(
        'detect_ir',
        help="Detect Intron Retention events using the alignment file")
    parser_ir.add_argument('-annot', '--annot',
                           help='Annotation file in ensemble GTF/GFF formats.',
                           required=True)
    parser_ir.add_argument('-o', '--output',
                           help='The output name and location',
                           default="ir_info")
    parser_ir.add_argument('-ga', '--g_alnm',
                           help='Genome alignment file in sam or maf format',
                           default='',
                           required=True)
    parser_ir.add_argument(
        '-ta', '--t_alnm',
        help='Transcriptome alignment file in sam or maf format',
        default='',
        required=True)

    args = parser.parse_args()

    # No arguments at all: show the top-level help.
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    # Exactly one argument: show the help of the selected mode, falling back
    # to the top-level help when no known mode was parsed.
    if len(sys.argv) == 2:
        mode_parsers = {
            "genome": parser_g,
            "transcriptome": parser_t,
            "detect_ir": parser_ir,
            "quantify": parser_e,
        }
        mode_parsers.get(args.mode, parser).print_help(sys.stderr)
        sys.exit(1)

    # parse quantify mode arguments
    if args.mode == "quantify":
        infile = args.read
        ref_t = args.ref_t
        prefix = args.output
        num_threads = max(args.num_threads, 1)

        print("\nrunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_t", ref_t)
        print("prefix", prefix)
        print("num_threads", num_threads)

        # Quantifying the transcript abundance from input read
        sys.stdout.write('Quantifying transcripts abundance: \n')
        call("minimap2 -t " + str(num_threads) + " -x map-ont -p0 " + ref_t +
             " " + infile + " > " + prefix + "_mapping.paf",
             shell=True)
        call("python nanopore_transcript_abundance.py -i " + prefix +
             "_mapping.paf > " + prefix + "_abundance.tsv",
             shell=True)
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
        # Successful completion: report status 0 (it used to exit with 1).
        sys.exit(0)

    if args.mode == "detect_ir":
        annot = args.annot
        prefix = args.output
        g_alnm = args.g_alnm
        t_alnm = args.t_alnm

        if g_alnm == "" or t_alnm == "":
            print("Please provide both alignments in sam format\n")
            parser_ir.print_help(sys.stderr)
            sys.exit(1)

        print("\nrunning the code with following parameters:\n")
        print("annot", annot)
        print("g_alnm", g_alnm)
        print("t_alnm", t_alnm)
        print("prefix", prefix)

        # Read the annotation GTF/GFF3 file and add intron features to it
        final_gff3 = _add_introns_to_annotation(annot, prefix)

        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") + ": Modeling Intron Retention\n")
        model_ir.intron_retention(prefix, final_gff3, g_alnm, t_alnm)

        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
        # Successful completion: report status 0 (it used to exit with 1).
        sys.exit(0)

    if args.mode == "genome":
        infile = args.read
        ref_g = args.ref_g
        aligner = args.aligner
        g_alnm = args.g_alnm
        prefix = args.output
        num_threads = str(max(args.num_threads, 1))
        if args.no_model_fit:
            model_fit = False

        if aligner not in ['minimap2', 'LAST', '']:
            print("Please specify an acceptable aligner (minimap2 or LAST)\n")
            parser_g.print_help(sys.stderr)
            sys.exit(1)

        if g_alnm != '':
            file_extension = os.path.splitext(g_alnm)[1][1:]
            if file_extension not in ['maf', 'sam']:
                print(
                    "Please specify an acceptable alignment format! (.maf or .sam)\n"
                )
                parser_g.print_help(sys.stderr)
                sys.exit(1)

        print("\nrunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_g", ref_g)
        print("aligner", aligner)
        print("g_alnm", g_alnm)
        print("prefix", prefix)
        print("num_threads", num_threads)
        print("model_fit", model_fit)

        # Only create the output directory when the prefix has one;
        # "mkdir -p" with an empty operand fails.
        dir_name = os.path.dirname(prefix)
        if dir_name != '':
            call("mkdir -p " + dir_name, shell=True)

        # READ PRE-PROCESS AND ALIGNMENT ANALYSIS
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Read pre-process and unaligned reads analysis\n")
        in_fasta = prefix + "_processed.fasta"
        _write_processed_fasta(infile, in_fasta)

        alnm_ext, unaligned_length, strandness = align_genome(
            in_fasta, prefix, aligner, num_threads, g_alnm, ref_g)

        # Aligned reads analysis
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") + ": Aligned reads analysis\n")
        num_aligned = align.head_align_tail(prefix, alnm_ext, args.mode)

    if args.mode == "transcriptome":
        infile = args.read
        ref_g = args.ref_g
        ref_t = args.ref_t
        annot = args.annot
        aligner = args.aligner
        g_alnm = args.g_alnm
        t_alnm = args.t_alnm
        prefix = args.output
        num_threads = str(max(args.num_threads, 1))
        if args.no_model_fit:
            model_fit = False
        if args.no_intron_retention:
            intron_retention = False

        if aligner not in ['minimap2', 'LAST', '']:
            print(
                "\nPlease specify an acceptable aligner (minimap2 or LAST)\n")
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        # Exactly one of the two alignment files was supplied.
        if (g_alnm != '' and t_alnm == '') or (g_alnm == '' and t_alnm != ''):
            print(
                "\nPlease specify either both alignment files (-ga and -ta) OR an aligner to use for alignment (-a)"
            )
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        if g_alnm != "" and t_alnm != "":
            g_alnm_ext = os.path.splitext(g_alnm)[1][1:]
            t_alnm_ext = os.path.splitext(t_alnm)[1][1:]
            if g_alnm_ext != t_alnm_ext:
                print(
                    "\nPlease provide both alignments in a same format: sam OR maf\n"
                )
                parser_t.print_help(sys.stderr)
                sys.exit(1)
            # development: model IR using MAF alignment formats as well
            if g_alnm_ext == t_alnm_ext == "maf" and intron_retention:
                print(
                    "\nThe intron retention only works with sam alignment files for now. Thanks\n"
                )
                parser_t.print_help(sys.stderr)
                sys.exit(1)

        if intron_retention and (ref_g == '' or annot == ''):
            print(
                "\nPlease also input reference genome and annotation file for Intron Retention modeling\n"
            )
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        print("\nrunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_g", ref_g)
        print("ref_t", ref_t)
        print("annot", annot)
        print("aligner", aligner)
        print("g_alnm", g_alnm)
        print("t_alnm", t_alnm)
        print("prefix", prefix)
        print("num_threads", num_threads)
        print("model_fit", model_fit)
        print("intron_retention", intron_retention)

        # Only create the output directory when the prefix has one;
        # "mkdir -p" with an empty operand fails.
        dir_name = os.path.dirname(prefix)
        if dir_name != '':
            call("mkdir -p " + dir_name, shell=True)

        # READ PRE-PROCESS AND ALIGNMENT ANALYSIS
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Read pre-process and unaligned reads analysis\n")
        in_fasta = prefix + "_processed.fasta"
        _write_processed_fasta(infile, in_fasta)

        # Read the length of reference transcripts from the reference transcriptome
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Read the length of reference transcripts \n")
        dict_ref_len = {}
        with open(ref_t) as f:
            for seqN, seqS, seqQ in readfq(f):
                # Same header rewrite as applied to the reads.
                info = re.split(r'[_\s]\s*', seqN)
                chr_name = "-".join(info)
                dict_ref_len[chr_name] = len(seqS)

        # Read the annotation GTF/GFF3 file and add intron features to it
        # (done in this mode whether or not intron retention is enabled).
        final_gff3 = _add_introns_to_annotation(annot, prefix)

        if intron_retention:
            alnm_ext, unaligned_length, out_g, out_t, strandness = align_transcriptome(
                in_fasta, prefix, aligner, num_threads, t_alnm, ref_t, g_alnm,
                ref_g)

            sys.stdout.write(
                strftime("%Y-%m-%d %H:%M:%S") +
                ": Modeling Intron Retention\n")
            model_ir.intron_retention(prefix, final_gff3, out_g, out_t)

        else:
            # Without intron retention no genome inputs are handed on.
            alnm_ext, unaligned_length, out_g, out_t, strandness = align_transcriptome(
                in_fasta,
                prefix,
                aligner,
                num_threads,
                t_alnm,
                ref_t,
                g_alnm=None,
                ref_g=None)

        # Aligned reads analysis
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") + ": Aligned reads analysis\n")
        num_aligned = align.head_align_tail(prefix, alnm_ext, args.mode,
                                            dict_ref_len)

    # strandness of the aligned reads
    with open(prefix + "_strandness_rate", 'w') as strandness_rate:
        strandness_rate.write("strandness:\t" + str(round(strandness, 3)))

    # Length distribution of unaligned reads
    with open(prefix + "_reads_alignment_rate", 'w') as alignment_rate:
        num_unaligned = len(unaligned_length)
        if num_unaligned != 0:
            alignment_rate.write("Aligned / Unaligned ratio:" + "\t" +
                                 str(num_aligned * 1.0 / num_unaligned) + '\n')
            # KernelDensity is fitted on a column vector of the lengths.
            unaligned_length_2d = unaligned_length[:, numpy.newaxis]
            kde_unaligned = KernelDensity(bandwidth=10).fit(unaligned_length_2d)
            joblib.dump(kde_unaligned, prefix + "_unaligned_length.pkl")
        else:
            alignment_rate.write("Aligned / Unaligned ratio:\t100%\n")

    del unaligned_length

    # MATCH AND ERROR MODELS
    sys.stdout.write(
        strftime("%Y-%m-%d %H:%M:%S") + ": match and error models\n")
    error_model.hist(prefix, alnm_ext)

    if model_fit:
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Model fitting\n")
        model_fitting.model_fitting(prefix, int(num_threads))

    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
Example #2
0
def _write_processed_fasta(infile, in_fasta):
    """Write the reads of ``infile`` to ``in_fasta`` as FASTA with rewritten headers.

    Every read name is split on underscores/whitespace and re-joined with
    dashes, to create a unique header for each read.
    """
    # The output is opened first (same order as the previous inline code);
    # the ``with`` statement closes both handles even if parsing fails.
    with open(in_fasta, 'w') as processed_fasta, open(infile, 'r') as f:
        for seqN, seqS, seqQ in readfq(f):
            info = re.split(r'[_\s]\s*', seqN)
            chr_name = "-".join(info)
            processed_fasta.write('>' + chr_name + '\n' + seqS + '\n')


def main():
    """Command-line entry point for the read characterization step.

    Arguments are read from ``sys.argv``. Four sub-commands are supported:

    * ``genome`` / ``transcriptome``: pre-process the reads, align them (or
      use the supplied alignment files), then write strandness, alignment
      rate, unaligned-length and error-model outputs under the output
      prefix, followed by model fitting unless ``--no_model_fit`` is given.
    * ``quantify``: map reads with minimap2 and run
      ``nanopore_transcript_abundance.py`` (located next to this script) on
      the resulting PAF file.
    * ``detect_ir``: align if needed, add introns to the annotation and
      model intron retention.

    Exits with status 1 on usage/validation errors; returns ``None`` when a
    mode completes.
    """
    parser = argparse.ArgumentParser(
        description=dedent('''
        Read characterization step
        -----------------------------------------------------------
        Given raw ONT reads, reference genome and/or transcriptome,
        learn read features and output error profiles
        '''),
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-v', '--version', action='version', version='NanoSim ' + VERSION)
    subparsers = parser.add_subparsers(dest='mode', description=dedent('''
        There are four modes in read_analysis.
        For detailed usage of each mode:
            read_analysis.py mode -h
        -------------------------------------------------------
        '''))

    # ---- genome mode -------------------------------------------------------
    parser_g = subparsers.add_parser('genome', help="Run the simulator on genome mode")
    parser_g.add_argument('-i', '--read', help='Input read for training', required=True)
    parser_g.add_argument('-rg', '--ref_g', help='Reference genome, not required if genome alignment file is provided',
                          default='')
    parser_g.add_argument('-a', '--aligner', help='The aligner to be used, minimap2 or LAST (Default = minimap2)',
                          choices=['minimap2', 'LAST'], default='minimap2')
    parser_g.add_argument('-ga', '--g_alnm', help='Genome alignment file in sam or maf format (optional)', default='')
    parser_g.add_argument('-o', '--output', help='The location and prefix of outputting profiles (Default = training)',
                          default='training')
    # store_false: args.no_model_fit is True while model fitting is ENABLED.
    parser_g.add_argument('--no_model_fit', help='Disable model fitting step', action='store_false', default=True)
    parser_g.add_argument('-t', '--num_threads', help='Number of threads for alignment and model fitting (Default = 1)',
                          type=int, default=1)

    # ---- transcriptome mode ------------------------------------------------
    parser_t = subparsers.add_parser('transcriptome', help="Run the simulator on transcriptome mode")
    parser_t.add_argument('-i', '--read', help='Input read for training', required=True)
    parser_t.add_argument('-rg', '--ref_g', help='Reference genome', required=True)
    parser_t.add_argument('-rt', '--ref_t', help='Reference Transcriptome', required=True)  # ?
    parser_t.add_argument('-annot', '--annotation', help='Annotation file in ensemble GTF/GFF formats, '
                                                         'required for intron retention detection', default='')
    parser_t.add_argument('-a', '--aligner', help='The aligner to be used: minimap2 or LAST (Default = minimap2)',
                          choices=['minimap2', 'LAST'], default='minimap2')
    parser_t.add_argument('-ga', '--g_alnm', help='Genome alignment file in sam or maf format (optional)', default='')
    parser_t.add_argument('-ta', '--t_alnm', help='Transcriptome alignment file in sam or maf format (optional)',
                          default='')
    parser_t.add_argument('-o', '--output', help='The location and prefix of outputting profiles (Default = training)',
                          default='training')
    # Both flags below are store_false: the attribute is True while the
    # corresponding step is ENABLED.
    parser_t.add_argument('--no_model_fit', help='Disable model fitting step', action='store_false', default=True)
    parser_t.add_argument('--no_intron_retention', help='Disable Intron Retention analysis', action='store_false',
                          default=True)
    parser_t.add_argument('-t', '--num_threads', help='Number of threads for alignment and model fitting (Default = 1)',
                          type=int, default=1)

    # ---- quantify mode -----------------------------------------------------
    parser_e = subparsers.add_parser('quantify', help="Quantify expression profile of transcripts")
    parser_e.add_argument('-i', '--read', help='Input reads for quantification', required=True)
    parser_e.add_argument('-rt', '--ref_t', help='Reference Transcriptome', required=True)
    parser_e.add_argument('-o', '--output', help='The location and prefix of outputting profile (Default = expression)',
                          default='expression')
    parser_e.add_argument('-t', '--num_threads', help='Number of threads for alignment (Default = 1)', type=int,
                          default=1)

    # ---- detect_ir mode ----------------------------------------------------
    parser_ir = subparsers.add_parser('detect_ir', help="Detect Intron Retention events using the alignment file")
    parser_ir.add_argument('-annot', '--annotation', help='Annotation file in ensemble GTF/GFF formats', required=True)
    parser_ir.add_argument('-i', '--read', help='Input read for training, not required if alignment files are provided',
                           default='')
    parser_ir.add_argument('-rg', '--ref_g', help='Reference genome, not required if genome alignment file is provided',
                           default='')
    parser_ir.add_argument('-rt', '--ref_t', help='Reference Transcriptome, not required if transcriptome alignment '
                                                  'file is provided', default='')
    parser_ir.add_argument('-a', '--aligner', help='The aligner to be used: minimap2 or LAST (Default = minimap2)',
                           choices=['minimap2', 'LAST'], default='minimap2')
    parser_ir.add_argument('-o', '--output', help='The output name and location', required=False, default='ir_info')
    parser_ir.add_argument('-ga', '--g_alnm', help='Genome alignment file in sam or maf format (optional)', default='')
    parser_ir.add_argument('-ta', '--t_alnm', help='Transcriptome alignment file in sam or maf format (optional)',
                           default='')
    parser_ir.add_argument('-t', '--num_threads', help='Number of threads for alignment (Default = 1)', type=int,
                           default=1)

    args = parser.parse_args()

    # No arguments at all: show the top-level help.
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    # parse quantify mode arguments
    if args.mode == "quantify":
        infile = args.read
        ref_t = args.ref_t
        prefix = args.output
        num_threads = str(max(args.num_threads, 1))

        print("\nrunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_t", ref_t)
        print("prefix", prefix)
        print("num_threads", num_threads)

        dir_name = os.path.dirname(prefix)
        if dir_name != '':
            call("mkdir -p " + dir_name, shell=True)

        # Quantifying the transcript abundance from input read
        sys.stdout.write('Quantifying transcripts abundance: \n')
        map_file = prefix + '_mapping.paf'
        call("minimap2 -t " + num_threads + " -x map-ont -p0 " + ref_t + " " + infile + " > " + map_file,
             shell=True)

        # The abundance script is looked up next to this script, so the
        # command works from any working directory.
        script_dir = os.path.dirname(os.path.realpath(__file__))
        out_file = prefix + '_abundance.tsv'
        call("python " + script_dir + "/nanopore_transcript_abundance.py -i " + map_file + " > " + out_file, shell=True)
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
        return

    # parse detect_ir mode arguments
    if args.mode == "detect_ir":
        annot = args.annotation
        infile = args.read
        prefix = args.output
        aligner = args.aligner
        ref_g = args.ref_g
        ref_t = args.ref_t
        g_alnm = args.g_alnm
        t_alnm = args.t_alnm
        num_threads = str(max(args.num_threads, 1))

        if g_alnm == '' and ref_g == '':
            print("Please supply a reference genome or genome alignment file\n")
            parser_ir.print_help(sys.stderr)
            sys.exit(1)

        if t_alnm == '' and ref_t == '':
            print("Please supply a reference transcriptome or transcriptome alignment file\n")
            parser_ir.print_help(sys.stderr)
            sys.exit(1)

        # check validity of parameters: any supplied alignment file must be
        # .maf or .sam (genome file checked first, then transcriptome).
        for alnm_file in (g_alnm, t_alnm):
            if alnm_file != '':
                file_extension = os.path.splitext(alnm_file)[1][1:]
                if file_extension not in ['maf', 'sam']:
                    print("Please specify an acceptable alignment format! (.maf or .sam)\n")
                    parser_ir.print_help(sys.stderr)
                    sys.exit(1)

        print("\nrunning the code with following parameters:\n")
        print("annot", annot)
        print("infile", infile)
        print("aligner", aligner)
        print("ref_g", ref_g)
        print("ref_t", ref_t)
        print("g_alnm", g_alnm)
        print("t_alnm", t_alnm)
        print("prefix", prefix)

        dir_name = os.path.dirname(prefix)
        if dir_name != '':
            call("mkdir -p " + dir_name, shell=True)

        # Alignment if maf/sam file not provided, and post process them to include only primary alignments
        # NOTE(review): transcriptome mode below unpacks five values from
        # align_transcriptome while this call unpacks two -- confirm against
        # the helper's definition.
        t_alnm, g_alnm = align_transcriptome(infile, prefix, aligner, num_threads, t_alnm, ref_t, g_alnm, ref_g)

        # Add introns to annotation file
        add_intron(annot, prefix)

        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Modeling Intron Retention\n")
        model_ir.intron_retention(prefix, prefix + "_added_intron_final.gff3", g_alnm, t_alnm)

        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
        return

    if args.mode == "genome":
        infile = args.read
        ref_g = args.ref_g
        aligner = args.aligner
        g_alnm = args.g_alnm
        prefix = args.output
        num_threads = str(max(args.num_threads, 1))
        model_fit = args.no_model_fit  # True unless --no_model_fit was passed

        # check validity of parameters
        if g_alnm != '':
            file_extension = os.path.splitext(g_alnm)[1][1:]
            if file_extension not in ['maf', 'sam']:
                print("Please specify an acceptable alignment format! (.maf or .sam)\n")
                parser_g.print_help(sys.stderr)
                sys.exit(1)
        if g_alnm == '' and ref_g == '':
            print("Please supply a reference genome or genome alignment file\n")
            parser_g.print_help(sys.stderr)
            sys.exit(1)

        print("\nRunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_g", ref_g)
        print("aligner", aligner)
        print("g_alnm", g_alnm)
        print("prefix", prefix)
        print("num_threads", num_threads)
        print("model_fit", model_fit)

        dir_name = os.path.dirname(prefix)
        if dir_name != '':
            call("mkdir -p " + dir_name, shell=True)

        # READ PRE-PROCESS AND ALIGNMENT ANALYSIS
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Read pre-process\n")
        in_fasta = prefix + "_processed.fasta"
        _write_processed_fasta(infile, in_fasta)

        alnm_ext, unaligned_length, strandness = align_genome(in_fasta, prefix, aligner, num_threads, g_alnm, ref_g)

        # Aligned reads analysis
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Aligned reads analysis\n")
        num_aligned = align.head_align_tail(prefix, alnm_ext, args.mode)

    if args.mode == "transcriptome":
        infile = args.read
        ref_g = args.ref_g
        ref_t = args.ref_t
        annot = args.annotation
        aligner = args.aligner
        g_alnm = args.g_alnm
        t_alnm = args.t_alnm
        prefix = args.output
        num_threads = str(max(args.num_threads, 1))
        model_fit = args.no_model_fit  # True unless --no_model_fit was passed
        ir = args.no_intron_retention  # True unless --no_intron_retention was passed

        # Validation failures in this mode print the transcriptome usage
        # (the first two checks used to print the detect_ir usage).
        if ir and g_alnm == '' and ref_g == '':
            print("For intron retention function, please supply a reference genome or genome alignment file\n")
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        if t_alnm == '' and ref_t == '':
            print("Please supply a reference transcriptome or transcriptome alignment file\n")
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        if g_alnm != '' and t_alnm != '':
            g_alnm_ext = os.path.splitext(g_alnm)[1][1:]
            t_alnm_ext = os.path.splitext(t_alnm)[1][1:]
            if g_alnm_ext != t_alnm_ext or g_alnm_ext not in ['maf', 'sam']:
                print("\nPlease provide both alignments in a same format: sam OR maf\n")
                parser_t.print_help(sys.stderr)
                sys.exit(1)
            # Development: model IR using MAF alignment formats as well
            if g_alnm_ext == t_alnm_ext == "maf" and ir:
                print("\nThe intron retention only works with sam alignment files for now. Thanks\n")
                parser_t.print_help(sys.stderr)
                sys.exit(1)

        if ir and (ref_g == '' or annot == ''):
            print("\nPlease also input reference genome and annotation file for Intron Retention modeling\n")
            parser_t.print_help(sys.stderr)
            sys.exit(1)

        print("\nrunning the code with following parameters:\n")
        print("infile", infile)
        print("ref_g", ref_g)
        print("ref_t", ref_t)
        print("annot", annot)
        print("aligner", aligner)
        print("g_alnm", g_alnm)
        print("t_alnm", t_alnm)
        print("prefix", prefix)
        print("num_threads", num_threads)
        print("model_fit", model_fit)
        print("intron_retention", ir)

        dir_name = os.path.dirname(prefix)
        if dir_name != '':
            call("mkdir -p " + dir_name, shell=True)

        # READ PRE-PROCESS AND ALIGNMENT ANALYSIS
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Read pre-process and unaligned reads analysis\n")
        in_fasta = prefix + "_processed.fasta"
        _write_processed_fasta(infile, in_fasta)

        # Read the length of reference transcripts from the reference transcriptome
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Read the length of reference transcripts \n")
        dict_ref_len = {}
        with open(ref_t) as f:
            for seqN, seqS, seqQ in readfq(f):
                # Same header rewrite as applied to the reads.
                info = re.split(r'[_\s]\s*', seqN)
                chr_name = "-".join(info)
                dict_ref_len[chr_name] = len(seqS)

        alnm_ext, unaligned_length, g_alnm, t_alnm, strandness = \
            align_transcriptome(in_fasta, prefix, aligner, num_threads, t_alnm, ref_t, g_alnm, ref_g)

        if ir:
            # Add introns to annotation file
            add_intron(annot, prefix)

            sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Modeling Intron Retention\n")
            model_ir.intron_retention(prefix, prefix + "_added_intron_final.gff3", g_alnm, t_alnm)

        # Aligned reads analysis
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Aligned reads analysis\n")
        num_aligned = align.head_align_tail(prefix + "_transcriptome", alnm_ext, args.mode, dict_ref_len)

    # strandness of the aligned reads
    with open(prefix + "_strandness_rate", 'w') as strandness_rate:
        strandness_rate.write("strandness:\t" + str(round(strandness, 3)))

    # Length distribution of unaligned reads
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Unaligned reads analysis\n")
    with open(prefix + "_reads_alignment_rate", 'w') as alignment_rate:
        num_unaligned = len(unaligned_length)
        if num_unaligned != 0:
            alignment_rate.write("Aligned / Unaligned ratio:" + "\t" + str(num_aligned * 1.0 / num_unaligned) + '\n')
            # KernelDensity is fitted on a column vector of the lengths.
            unaligned_length_2d = unaligned_length[:, numpy.newaxis]
            kde_unaligned = KernelDensity(bandwidth=10).fit(unaligned_length_2d)
            joblib.dump(kde_unaligned, prefix + "_unaligned_length.pkl")
        else:
            alignment_rate.write("Aligned / Unaligned ratio:\t100%\n")

    del unaligned_length

    # MATCH AND ERROR MODELS
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": match and error models\n")
    if args.mode == "transcriptome":
        error_model.hist(prefix + "_genome", alnm_ext)  # Use primary genome alignment for error profiling
    else:
        error_model.hist(prefix, alnm_ext)

    if model_fit:
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Model fitting\n")
        model_fitting.model_fitting(prefix, int(num_threads))

    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")
Example #3
0
def _build_parser():
    """Return the argparse parser for the transcriptome characterization step.

    ``--num_bins`` and ``--num_threads`` are declared with ``type=int`` so a
    value given on the command line is an integer rather than a string; a
    non-integer value is rejected by argparse itself.
    """
    parser = argparse.ArgumentParser(
        description='Given the read profiles from characterization step, '
                    'simulate transcriptome ONT reads and output error profiles',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('-i',
                        '--read',
                        help='Input read for training.',
                        required=True)
    parser.add_argument('-rg',
                        '--ref_g',
                        help='Reference genome.',
                        required=True)
    parser.add_argument('-rt',
                        '--ref_t',
                        help='Reference Transcriptome.',
                        required=True)
    parser.add_argument('-annot',
                        '--annot',
                        help='Annotation file in ensemble GTF/GFF formats.',
                        required=True)
    parser.add_argument(
        '-a',
        '--aligner',
        help='The aligner to be used minimap2 or LAST (Default = minimap2)',
        default='minimap2')
    parser.add_argument(
        '-ga',
        '--g_alnm',
        help='Genome alignment file in sam or maf format (optional)',
        default='')
    parser.add_argument(
        '-ta',
        '--t_alnm',
        help='Transcriptome alignment file in sam or maf format (optional)',
        default='')
    parser.add_argument('-o',
                        '--output',
                        help='The output name and location for profiles',
                        default="training")
    parser.add_argument('--no_model_fit',
                        help='Disable model fitting step',
                        action='store_true')
    parser.add_argument('--no_intron_retention',
                        help='Disable Intron Retention analysis',
                        action='store_true')
    parser.add_argument(
        '--detect_IR',
        help='Detect Intron Retention events using input reads and exit',
        action='store_true')
    parser.add_argument('-b',
                        '--num_bins',
                        help='Number of bins to be used (Default = 20)',
                        type=int,
                        default=20)
    parser.add_argument(
        '-t',
        '--num_threads',
        help=
        'Number of threads to be used in alignments and model fitting (Default = 1)',
        type=int,
        default=1)
    parser.add_argument('--quantify',
                        help='Quantify expression profile of input reads',
                        action='store_true')
    return parser


def _write_single_line_fasta(infile, in_fasta):
    """Rewrite the reads of ``infile`` into ``in_fasta``, one sequence line each.

    Whitespace inside a header is replaced by '-'. When a read name occurs
    more than once only the last record is kept. Exits with an error message
    if sequence data appears before the first '>' header.
    """
    reads = {}
    name = None
    with open(infile, 'r') as f:
        for line in f:
            if line.startswith('>'):
                name = '-'.join(line.strip()[1:].split())
                reads[name] = []
            elif name is not None:
                # Collect the pieces and join once instead of repeated '+='.
                reads[name].append(line.strip())
            elif line.strip():
                sys.exit("Input reads file " + infile +
                         " has sequence data before the first '>' header")

    with open(in_fasta, 'w') as out_fasta:
        for name, chunks in reads.items():
            out_fasta.write('>' + name + '\n' + ''.join(chunks) + '\n')


def _read_reference_lengths(ref_t):
    """Return a dict mapping each FASTA record id in ``ref_t`` to its length.

    The id is the first whitespace-delimited token of the header without the
    leading '>'. Exits with an error message if sequence data appears before
    the first header.
    """
    dict_ref_len = {}
    ref_id = None
    with open(ref_t) as f:
        for line in f:
            if line.startswith(">"):
                ref_id = line.split()[0][1:]
                dict_ref_len[ref_id] = 0
            elif ref_id is not None:
                dict_ref_len[ref_id] += len(line.strip())
            elif line.strip():
                sys.exit("Reference transcriptome " + ref_t +
                         " has sequence data before the first '>' header")
    return dict_ref_len


def main():
    """Run the transcriptome characterization pipeline from the command line.

    Steps, in order:
      1. Quantify transcript abundance with minimap2 (exit afterwards when
         ``--quantify`` is given).
      2. Validate the alignment-related options.
      3. Normalise the input reads into a single-line FASTA file.
      4. Convert the annotation to GFF3 if it is a GTF and add introns to it.
      5. Use the supplied alignments, or align the reads to the genome and
         the transcriptome with minimap2 or LAST.
      6. Optionally detect intron retention events and exit (``--detect_IR``).
      7. Write the unaligned-read length ECDF, build the error model, model
         intron retention and fit the models.
      8. Delete LAST database files and ``*.pyc`` files under the current
         directory.

    Exits with status 1 on invalid option combinations and with status 0
    after a successful ``--quantify`` or ``--detect_IR`` run.
    """
    parser = _build_parser()
    args = parser.parse_args()

    infile = args.read
    ref_g = args.ref_g
    ref_t = args.ref_t
    annot = args.annot
    aligner = args.aligner
    g_alnm = args.g_alnm
    t_alnm = args.t_alnm
    outfile = args.output
    num_bins = max(args.num_bins, 1)
    num_threads = max(args.num_threads, 1)

    model_fit = not args.no_model_fit
    intron_retention = not args.no_intron_retention
    detect_IR = args.detect_IR
    quantify = args.quantify

    print("Running the characterization step with following arguments: \n")
    for label, value in (("infile", infile),
                         ("ref_g", ref_g),
                         ("ref_t", ref_t),
                         ("annot", annot),
                         ("aligner", aligner),
                         ("g_alnm", g_alnm),
                         ("t_alnm", t_alnm),
                         ("outfile", outfile),
                         ("model_fit", model_fit),
                         ("num_bins", num_bins),
                         ("num_threads", num_threads),
                         ("detect_IR", detect_IR),
                         ("quantify", quantify)):
        print(label, value)

    # Quantifying the transcript abundance from input read
    sys.stdout.write('Quantifying transcripts abundance: \n')
    call("minimap2 -t " + str(num_threads) + " -x map-ont -p0 " + ref_t + " " +
         infile + " > " + outfile + "_mapping.paf",
         shell=True)
    call("python nanopore_transcript_abundance.py -i " + outfile +
         "_mapping.paf > " + outfile + "_abundance.tsv",
         shell=True)
    sys.stdout.write('Finished! \n')

    if quantify:
        # Quantification was all that was requested: this is a success.
        sys.exit(0)

    have_g_alnm = g_alnm != ''
    have_t_alnm = t_alnm != ''
    if have_g_alnm != have_t_alnm:
        print(
            "Please specify either both alignment files (-ga and -ta) OR an aligner to use for alignment (-a)"
        )
        parser.print_help()
        sys.exit(1)
    if have_g_alnm and have_t_alnm:
        g_alnm_ext = os.path.splitext(g_alnm)[1][1:]
        t_alnm_ext = os.path.splitext(t_alnm)[1][1:]
        # Both files must share one of the two supported formats; any other
        # extension would leave the unaligned-read lengths undefined later.
        if g_alnm_ext != t_alnm_ext or t_alnm_ext not in ("sam", "maf"):
            print(
                "Please provide both alignments in a same format: sam OR maf\n"
            )
            parser.print_help()
            sys.exit(1)
    elif aligner not in ("minimap2", "LAST"):
        # Reject an unknown aligner before doing any pre-processing work.
        print("Please specify an acceptable aligner (minimap2 or LAST)\n")
        parser.print_help()
        sys.exit(1)

    # READ PRE-PROCESS AND UNALIGNED READS ANALYSIS
    sys.stdout.write(
        strftime("%Y-%m-%d %H:%M:%S") +
        ": Read pre-process and unaligned reads analysis\n")

    # Read pre-process; never overwrite the input file itself.
    in_fasta = outfile + ".fasta"
    if in_fasta == infile:
        in_fasta = outfile + "_processed.fasta"
    _write_single_line_fasta(infile, in_fasta)

    # Read the annotation GTF/GFF3 file
    sys.stdout.write(
        strftime("%Y-%m-%d %H:%M:%S") +
        ": Parse the annotation file (GTF/GFF3)\n")
    # If gtf provided, convert to GFF3 (gt gtf_to_gff3)
    annot_file_extension = os.path.splitext(annot)[1][1:]
    if annot_file_extension.upper() == "GTF":
        gff3_annot = outfile + ".gff3"
        call("gt gtf_to_gff3 -tidy -o " + gff3_annot + " " + annot,
             shell=True)
    else:
        gff3_annot = annot

    # Next, add intron info into gff3 (the converted file when a GTF was given)
    call("gt gff3 -tidy -retainids -checkids -addintrons -o " + outfile +
         "_addedintron.gff3 " + gff3_annot,
         shell=True)

    sys.stdout.write(
        strftime("%Y-%m-%d %H:%M:%S") +
        ": Read the length of reference transcripts \n")
    # Read the length of reference transcripts from the reference transcriptome
    dict_ref_len = _read_reference_lengths(ref_t)

    if have_g_alnm and have_t_alnm:
        # Both alignment files are provided
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Processing the alignment files: " + t_alnm_ext + "\n")
        if t_alnm_ext == "maf":
            outmaf_g = outfile + "_genome_alnm.maf"
            outmaf_t = outfile + "_transcriptome_alnm.maf"
            if outmaf_g == g_alnm:
                outmaf_g = outfile + "_genome_alnm_processed.maf"
            if outmaf_t == t_alnm:
                outmaf_t = outfile + "_transcriptome_alnm_processed.maf"

            call("grep '^s ' " + g_alnm + " > " + outmaf_g, shell=True)
            call("grep '^s ' " + t_alnm + " > " + outmaf_t, shell=True)

            unaligned_length = list(
                get_besthit_maf.besthit_and_unaligned(in_fasta, outmaf_t,
                                                      outfile))
        else:  # "sam", guaranteed by the validation above
            unaligned_length = list(
                get_primary_sam.primary_and_unaligned(g_alnm, t_alnm, outfile))

    elif aligner == "minimap2":
        g_alnm_ext = "sam"
        t_alnm_ext = "sam"
        outsam_g = outfile + "_genome_alnm.sam"
        outsam_t = outfile + "_transcriptome_alnm.sam"

        # [EDIT] I should change the options for minimap when dealing with cDNA and dRNA reads.
        # Alignment to reference genome
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Alignment with minimap2 to reference genome\n")
        call("minimap2 -t " + str(num_threads) + " -ax splice " + ref_g +
             " " + in_fasta + " > " + outsam_g,
             shell=True)
        # Alignment to reference transcriptome
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Alignment with minimap2 to reference transcriptome\n")
        call("minimap2 -t " + str(num_threads) + " --cs -ax map-ont " +
             ref_t + " " + in_fasta + " > " + outsam_t,
             shell=True)

        # [EDIT] I may add a script to remove minimap2/LAST post-alignment files after alignment.
        unaligned_length = list(
            get_primary_sam.primary_and_unaligned(outsam_g, outsam_t,
                                                  outfile))

    else:  # aligner == "LAST", guaranteed by the validation above
        g_alnm_ext = "maf"
        t_alnm_ext = "maf"
        outmaf_g = outfile + "_genome_alnm.maf"
        outmaf_t = outfile + "_transcriptome_alnm.maf"
        # Alignment to reference genome
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Alignment with LAST to reference genome\n")
        call("lastdb ref_genome " + ref_g, shell=True)
        call("lastal -a 1 -P " + str(num_threads) + " ref_genome " +
             in_fasta + " | grep '^s ' > " + outmaf_g,
             shell=True)
        # Alignment to reference transcriptome
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Alignment with LAST to reference transcriptome\n")
        call("lastdb ref_transcriptome " + ref_t, shell=True)
        call("lastal -a 1 -P " + str(num_threads) + " ref_transcriptome " +
             in_fasta + " | grep '^s ' > " + outmaf_t,
             shell=True)

        unaligned_length = list(
            get_besthit_maf.besthit_and_unaligned(in_fasta, outmaf_t,
                                                  outfile))

    if detect_IR:
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") +
            ": Detecting Intron Retention events using input reads\n")
        model_ir.intron_retention(outfile, ref_t)
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished\n")
        # Detection was all that was requested: this is a success.
        sys.exit(0)

    sys.stdout.write(
        strftime("%Y-%m-%d %H:%M:%S") +
        ": Reads length distribution analysis\n")
    # Aligned reads length distribution analysis
    count_aligned = align.head_align_tail(outfile, num_bins, t_alnm_ext,
                                          dict_ref_len)

    # Unaligned reads length distribution analysis
    bin_width = 50
    with open(outfile + "_unaligned_length_ecdf", 'w') as out1:
        count_unaligned = len(unaligned_length)
        if count_unaligned != 0:
            max_length = max(unaligned_length)
            hist_unaligned, edges_unaligned = numpy.histogram(
                unaligned_length,
                bins=numpy.arange(0, max_length + bin_width, bin_width),
                density=True)
            # density * bin width = probability mass per bin; cumsum -> ECDF
            cdf = numpy.cumsum(hist_unaligned * bin_width)
            out1.write("Aligned / Unaligned ratio:" + "\t" +
                       str(count_aligned * 1.0 / count_unaligned) + '\n')
            out1.write("bin\t0-" + str(max_length) + '\n')
            for i, cumulative in enumerate(cdf):
                out1.write(
                    str(edges_unaligned[i]) + '-' +
                    str(edges_unaligned[i + 1]) + "\t" + str(cumulative) +
                    '\n')
        else:
            out1.write("Aligned / Unaligned ratio:\t100%\n")

    # MATCH AND ERROR MODELS
    sys.stdout.write(
        strftime("%Y-%m-%d %H:%M:%S") + ": match and error models\n")
    error_model.hist(outfile, t_alnm_ext)

    if intron_retention:
        sys.stdout.write(
            strftime("%Y-%m-%d %H:%M:%S") + ": Modeling Intron Retention\n")
        model_ir.intron_retention(outfile, ref_t)

    if model_fit:
        sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Model fitting\n")
        model_fitting.model_fitting(outfile, num_threads)

    # Remove LAST database files and compiled Python files below the current
    # directory. The patterns are single-quoted so the shell hands them to
    # find unexpanded.
    call("find . -name '*ref_genome.*' -delete", shell=True)
    call("find . -name '*ref_transcriptome.*' -delete", shell=True)
    call("find . -name '*.pyc' -delete", shell=True)
    sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ": Finished!\n")