def main():

    FORMAT = "%(asctime)-15s %(levelname)s %(module)s.%(name)s.%(funcName)s at %(lineno)d :\n\t%(message)s\n"
    global logger
    logger = logging.getLogger()
    logging.basicConfig(filename='variant_calling.log',
                        format=FORMAT,
                        filemode='w',
                        level=logging.DEBUG)
    # add a new Handler to print all INFO and above messages to stdout
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)
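    # note: ch has no formatter set, so console output is the bare message;
    # the log file gets the full timestamped FORMAT above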

    parser = argparse.ArgumentParser(description=str(
        "This script requires you have the following dependencies:\n" +
        "Samtools: \"samtools\" in your path\n" +
        "Java: \"java\" in your path\n" +
        "Picard-Tools: env var \"$PICARD_HOME\" with the path to Picard-Tools's bin\n"
        + "STAR: \"STAR\" in your path\n" +
        "GATK: env var \"$GATK_HOME\" with the path to GATK's bin\n"),
                                     epilog="",
                                     formatter_class=argparse.
                                     RawTextHelpFormatter)

    parser.add_argument('--st_fa',
                        '--supertranscript_fasta',
                        dest="st_fa",
                        type=str,
                        required=True,
                        help="Path to the SuperTranscripts fasta file.")

    parser.add_argument('--st_gtf',
                        '--supertranscript_gtf',
                        dest="st_gtf",
                        type=str,
                        required=True,
                        help="Path to the SuperTranscript gtf file.")

    group = parser.add_mutually_exclusive_group(required=True)

    group.add_argument('-p',
                       '--paired',
                       dest="paired_reads",
                       type=str,
                       nargs=2,
                       help="Pair of paired ends read files.")

    group.add_argument('-s',
                       '--single',
                       dest="single_reads",
                       type=str,
                       help="Single reads file.")

    parser.add_argument(
        '-o',
        '--output',
        dest="out_path",
        type=str,
        required=True,
        help="Path to the folder where to generate the output.")

    parser.add_argument(
        '-l',
        '--sjdbOverhang',
        dest="sjdbOverhang",
        default=150,
        type=int,
        help="Size of the reads (used for STAR --sjdbOverhang). default=150")

    parser.add_argument(
        '-t',
        '--threads',
        dest="nthreads",
        type=str,
        default="4",
        help="Number of threads to use for tools that are multithreaded.")

    parser.add_argument(
        '-m',
        '--maxram',
        dest="maxram",
        type=str,
        default="50000000000",
        help=
        "Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value)."
    )

    args = parser.parse_args()

    PICARD_HOME = os.getenv("PICARD_HOME")
    if not PICARD_HOME:
        exit("Error, missing path to Picard-Tools in $PICARD_HOME.")

    GATK_HOME = os.getenv("GATK_HOME")
    if not GATK_HOME:
        exit("Error, missing path to GATK in $GATK.")

    # get real paths before changing working directory in case they are relative paths
    if args.paired_reads:
        reads_paths = [os.path.realpath(f) for f in args.paired_reads]
    else:
        reads_paths = [os.path.realpath(args.single_reads)]

    st_fa_path = os.path.realpath(args.st_fa)

    st_gtf_path = os.path.realpath(args.st_gtf)

    # check if output directory exists, if not create
    real_path = os.path.realpath(args.out_path)
    if not os.path.isdir(real_path):
        os.makedirs(real_path)

    # move to output folder
    os.chdir(real_path)

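    # Pipeliner records a checkpoint file per command in checkpoint_dir;
    # commands whose ".ok" checkpoint already exists are presumably skipped
    # on rerun, which makes the pipeline resumable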
    checkpoint_dir = os.path.abspath(
        os.path.basename(st_fa_path)) + ".gatk_chkpts"
    pipeliner = Pipeliner.Pipeliner(checkpoint_dir)

    # generate supertranscript index
    logger.info("Generating SuperTranscript index.")
    pipeliner.add_commands([
        Pipeliner.Command("samtools faidx {}".format(st_fa_path),
                          "samtools_faidx_st.ok")
    ])
    pipeliner.run()

    # generate supertranscript Picard dictionary
    logger.info("Generating Picard dictionary.")
    dict_file = re.sub(r"\.[^\.]+$", ".dict", st_fa_path)
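    # if the .dict already exists, just drop the checkpoint file so Pipeliner
    # treats the Picard step as complete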
    if os.path.isfile(dict_file):
        open(checkpoint_dir + "/picard_dict_st.ok", 'a').close()
    else:
        pipeliner.add_commands([
            Pipeliner.Command(
                "java -jar " + PICARD_HOME + "/picard.jar" +
                " CreateSequenceDictionary R=" + st_fa_path + " O=" +
                dict_file + " VALIDATION_STRINGENCY=LENIENT ",
                "picard_dict_st.ok")
        ])
        pipeliner.run()

    # generate genome folder for STAR's first pass
    logger.info("Generating genome folder for STAR")

    star_genome_generate_cmd = str(
        "STAR --runThreadN " + args.nthreads + " --outSAMmapqUnique 60" +
        " --runMode genomeGenerate" + " --genomeDir star_genome_idx " +
        " --genomeFastaFiles {} ".format(st_fa_path) +
        " --sjdbGTFfile {} ".format(st_gtf_path) +
        " --sjdbOverhang {} ".format(args.sjdbOverhang) +
        " --limitGenomeGenerateRAM {}".format(args.maxram))

    pipeliner.add_commands([
        Pipeliner.Command("mkdir star_genome_idx", "mkdir_star_genome_idx.ok"),
        Pipeliner.Command(star_genome_generate_cmd, "star_genome_generate.ok")
    ])
    pipeliner.run()

    # run STAR's alignment
    logger.info("Running STAR alignment.")
    cmd = str("STAR --runThreadN " + args.nthreads +
              " --genomeDir star_genome_idx " + " --runMode alignReads " +
              " --twopassMode Basic " + " --alignSJDBoverhangMin 10 " +
              " --outSAMmapqUnique 60" +
              " --outSAMtype BAM SortedByCoordinate " +
              " --limitBAMsortRAM {} ".format(args.maxram) +
              " --readFilesIn " + " ".join(reads_paths))

    if re.search("\.gz$", reads_paths[0]):
        cmd += " --readFilesCommand 'gunzip -c' "

    pipeliner.add_commands([Pipeliner.Command(cmd, "star_aln.ok")])
    pipeliner.run()

    # clean and convert sam file with Picard-Tools
    logger.info("Cleaning and Converting sam file with Picard-Tools.")

    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + PICARD_HOME + "/picard.jar " +
            " AddOrReplaceReadGroups " + "I=Aligned.sortedByCoord.out.bam " +
            "O=rg_added_sorted.bam " + " VALIDATION_STRINGENCY=SILENT " +
            "SO=coordinate RGID=id RGLB=library RGPL=platform RGPU=machine RGSM=sample",
            "add_read_groups.ok"),
        Pipeliner.Command(
            "java -jar " + PICARD_HOME + "/picard.jar " + " MarkDuplicates " +
            "I=rg_added_sorted.bam O=dedupped.bam " +
            "CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics",
            "mark_dups.ok"),
        Pipeliner.Command("java -jar " + PICARD_HOME +
                          "/picard.jar ValidateSamFile " + "I=dedupped.bam " +
                          "IGNORE_WARNINGS=true " + "MAX_OUTPUT=100000 " +
                          "IGNORE=MATE_NOT_FOUND " +
                          "O=dedupped.bam.validation",
                          "bam_validate.ok",
                          ignore_error=True),
        Pipeliner.Command(
            UTILDIR +
            "/clean_bam.pl dedupped.bam dedupped.bam.validation dedupped.valid.bam",
            "make_valid_dedupped_bam.ok"),
        # the option -U ALLOW_N_CIGAR_READS is not required in GATK 4
        # the option -RMQF 255 -RMQT 60 : use --outSAMmapqUnique 60 in STAR : https://software.broadinstitute.org/gatk/blog?id=4285
        # Cufflinks requires a MAPQ score of 255 so that will need to be changed if cufflinks is used for downstream analysis
        #  ReassignOneMappingQuality read filter reassigns all good alignments to the default value of 60.
        #" -RF MappingQualityAvailableReadFilter --read-validation-stringency LENIENT",
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/gatk-package-4.0.1.2-local.jar " +
            " SplitNCigarReads -R " + st_fa_path +
            " -I dedupped.valid.bam -O splitNCigar.bam " +
            " --read-validation-stringency LENIENT", "splitNCigarReads.ok")
    ])

    pipeliner.run()

    # do the actual variant calling
    logger.info("Variant Calling using Haplotype Caller.")

    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/gatk-package-4.0.1.2-local.jar " +
            "HaplotypeCaller -R " + st_fa_path +
            " -I ./splitNCigar.bam --dont-use-soft-clipped-bases true -stand-call-conf 20 -O output.vcf",
            "haplotypecaller.ok")
    ])
    pipeliner.run()

    # do some basic filtering
    logger.info("Doing some basic filtering of vcf.")

    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/gatk-package-4.0.1.2-local.jar " +
            " VariantFiltration -R " + st_fa_path +
            " -V output.vcf -window 35 -cluster 3 " +
            "--filter-name FS -filter \"FS > 30.0\" " +
            "--filter-name QD -filter \"QD < 2.0\" -O filtered_output.vcf",
            "variant_filt.ok")
    ])

    pipeliner.run()

    logger.info("Done!")
def main():

    FORMAT = "%(asctime)-15s %(levelname)s %(module)s.%(name)s.%(funcName)s at %(lineno)d :\n\t%(message)s\n"
    global logger
    logger = logging.getLogger()
    logging.basicConfig(
        filename="variant_calling.log", format=FORMAT, filemode="w", level=logging.DEBUG
    )
    # add a new Handler to print all INFO and above messages to stdout
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    parser = argparse.ArgumentParser(
        description=str(
            "This script requires you have the following dependencies:\n"
            + 'Samtools: "samtools" in your path\n'
            + 'Java: "java" in your path\n'
            + 'Picard-Tools: env var "$PICARD_HOME" with the path to Picard-Tools\'s bin\n'
            + 'STAR: "STAR" in your path\n'
            + 'GATK: env var "$GATK_HOME" with the path to GATK\'s bin\n'
        ),
        epilog="",
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "--st_fa",
        "--supertranscript_fasta",
        dest="st_fa",
        type=str,
        required=True,
        help="Path to the SuperTranscripts fasta file.",
    )

    parser.add_argument(
        "--st_gtf",
        "--supertranscript_gtf",
        dest="st_gtf",
        type=str,
        required=True,
        help="Path to the SuperTranscript gtf file.",
    )

    group = parser.add_mutually_exclusive_group(required=True)

    group.add_argument(
        "-p",
        "--paired",
        dest="paired_reads",
        type=str,
        nargs=2,
        help="Pair of paired ends read files.",
    )

    group.add_argument(
        "-s", "--single", dest="single_reads", type=str, help="Single reads file."
    )

    group.add_argument(
        "-S",
        "--samples_file",
        dest="samples_file",
        type=str,
        help="Trinity samples file (fmt: condition_name replicate_name /path/to/reads_1.fq /path/to/reads_2.fq (tab-delimited, single sample per line))",
    )

    parser.add_argument(
        "-o",
        "--output",
        dest="out_path",
        type=str,
        required=True,
        help="Path to the folder where to generate the output.",
    )

    parser.add_argument(
        "-l",
        "--sjdbOverhang",
        dest="sjdbOverhang",
        default=150,
        type=int,
        help="Size of the reads (used for STAR --sjdbOverhang). default=150",
    )

    parser.add_argument(
        "-t",
        "--threads",
        dest="nthreads",
        type=str,
        default="4",
        help="Number of threads to use for tools that are multithreaded.",
    )

    parser.add_argument(
        "-m",
        "--maxram",
        dest="maxram",
        type=str,
        default="50000000000",
        help="Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value).",
    )

    parser.add_argument("--rg_id", default="id", help="bam RGID field value")
    parser.add_argument("--rg_sample", default="sample", help="bam RGSM value")

    parser.add_argument(
        "--STAR_genomeGenerate_opts",
        type=str,
        default="",
        help="options to pass through to STAR's genomeGenerate function",
    )

    args = parser.parse_args()

    PICARD_HOME = os.getenv("PICARD_HOME")
    if not PICARD_HOME:
        exit("Error, missing path to Picard-Tools in $PICARD_HOME.")

    GATK_HOME = os.getenv("GATK_HOME")
    if not GATK_HOME:
        exit("Error, missing path to GATK in $GATK.")

    # identify gatk4_jar file
    gatk4_jar = glob.glob(os.path.join(GATK_HOME, "gatk-package-4.*-local.jar"))
    if len(gatk4_jar) != 1:
        raise RuntimeError(
            "Error, cannot locate single gatk-package-4.*-local.jar in {}".format(
                GATK_HOME
            )
        )
    gatk_path = os.path.abspath(gatk4_jar[0])

    # get real paths before changing working directory in case they are relative paths
    if args.paired_reads:
        reads_paths = [os.path.realpath(f) for f in args.paired_reads]
    elif args.single_reads:
        reads_paths = [os.path.realpath(args.single_reads)]
    elif args.samples_file:
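        # build STAR-style comma-separated read lists from the samples file
        # (tab-delimited fields: condition, replicate, left_fq[, right_fq])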
        left_fq_list = list()
        right_fq_list = list()
        with open(args.samples_file) as fh:
            for line in fh:
                line = line.rstrip()
                if not re.match("\w", line):
                    continue
                fields = line.split("\t")
                left_fq = fields[2]
                left_fq_list.append(os.path.realpath(left_fq))
                if len(fields) > 3:
                    right_fq = fields[3]
                    right_fq_list.append(os.path.realpath(right_fq))
        reads_paths = [",".join(left_fq_list)]
        if right_fq_list:
            reads_paths.append(",".join(right_fq_list))
    else:
        raise RuntimeError("no reads specified")  # should never get here

    st_fa_path = os.path.realpath(args.st_fa)

    st_gtf_path = os.path.realpath(args.st_gtf)

    # check if output directory exists, if not create
    real_path = os.path.realpath(args.out_path)
    if not os.path.isdir(real_path):
        os.makedirs(real_path)

    # move to output folder
    os.chdir(real_path)

    checkpoint_dir = os.path.abspath(os.path.basename(st_fa_path)) + ".gatk_chkpts"
    pipeliner = Pipeliner.Pipeliner(checkpoint_dir)

    # generate supertranscript index
    logger.info("Generating SuperTranscript index.")
    pipeliner.add_commands(
        [
            Pipeliner.Command(
                "samtools faidx {}".format(st_fa_path), "samtools_faidx_st.ok"
            )
        ]
    )
    pipeliner.run()

    # generate supertranscript Picard dictionary
    logger.info("Generating Picard dictionary.")
    dict_file = re.sub(r"\.[^\.]+$", ".dict", st_fa_path)
    if os.path.isfile(dict_file):
        open(checkpoint_dir + "/picard_dict_st.ok", "a").close()
    else:
        pipeliner.add_commands(
            [
                Pipeliner.Command(
                    "java -jar "
                    + PICARD_HOME
                    + "/picard.jar"
                    + " CreateSequenceDictionary R="
                    + st_fa_path
                    + " O="
                    + dict_file
                    + " VALIDATION_STRINGENCY=LENIENT ",
                    "picard_dict_st.ok",
                )
            ]
        )
        pipeliner.run()

    # generate genome folder for STAR's first pass
    logger.info("Generating genome folder for STAR")

    # from Alex D.:
    # scale the --genomeSAindexNbases parameter down to ~log2(GenomeLength)/2,
    # using the fasta file size in bytes as a proxy for genome length

    genomeSAindexNbases = int(math.log(os.path.getsize(st_fa_path)) / math.log(2) / 2)

    star_genome_generate_cmd = str(
        "STAR --runThreadN "
        + args.nthreads
        + " --runMode genomeGenerate"
        + " --genomeDir star_genome_idx "
        + " --genomeFastaFiles {} ".format(st_fa_path)
        + " --genomeSAindexNbases {} ".format(genomeSAindexNbases)
        + " --sjdbGTFfile {} ".format(st_gtf_path)
        + " --sjdbOverhang {} ".format(args.sjdbOverhang)
        + " --limitGenomeGenerateRAM {}".format(args.maxram)
        + " {} ".format(args.STAR_genomeGenerate_opts)
    )

    pipeliner.add_commands(
        [
            Pipeliner.Command("mkdir star_genome_idx", "mkdir_star_genome_idx.ok"),
            Pipeliner.Command(star_genome_generate_cmd, "star_genome_generate.ok"),
        ]
    )
    pipeliner.run()

    # run STAR's alignment
    logger.info("Running STAR alignment.")
    cmd = str(
        "STAR --runThreadN "
        + args.nthreads
        + " --genomeDir star_genome_idx "
        + " --runMode alignReads "
        + " --twopassMode Basic "
        + " --alignSJDBoverhangMin 10 "
        + " --outSAMtype BAM SortedByCoordinate "
        + " --limitBAMsortRAM {} ".format(args.maxram)
        + " --readFilesIn "
        + " ".join(reads_paths)
    )

    if re.search("\.gz$", reads_paths[0]):
        cmd += " --readFilesCommand 'gunzip -c' "

    pipeliner.add_commands([Pipeliner.Command(cmd, "star_aln.ok")])
    pipeliner.run()

    ##
    ## GATK settings based on best practices:
    ## https://software.broadinstitute.org/gatk/documentation/article.php?id=3891
    ##

    # clean and convert sam file with Picard-Tools
    logger.info("Cleaning and Converting sam file with Picard-Tools.")

    pipeliner.add_commands(
        [
            Pipeliner.Command(
                "java -jar "
                + PICARD_HOME
                + "/picard.jar "
                + " AddOrReplaceReadGroups "
                + "I=Aligned.sortedByCoord.out.bam "
                + "O=rg_added_sorted.bam "
                + " VALIDATION_STRINGENCY=SILENT "
                + "SO=coordinate RGID={} RGLB=library RGPL=platform RGPU=machine RGSM={}".format(
                    args.rg_id, args.rg_sample
                ),
                "add_read_groups.ok",
            ),
            Pipeliner.Command(
                "java -jar "
                + PICARD_HOME
                + "/picard.jar "
                + " MarkDuplicates "
                + "I=rg_added_sorted.bam O=dedupped.bam "
                + "CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics",
                "mark_dups.ok",
            ),
            Pipeliner.Command(
                "java -jar "
                + PICARD_HOME
                + "/picard.jar ValidateSamFile "
                + "I=dedupped.bam "
                + "IGNORE_WARNINGS=true "
                + "MAX_OUTPUT=100000 "
                + "IGNORE=MATE_NOT_FOUND "
                + "O=dedupped.bam.validation",
                "bam_validate.ok",
                ignore_error=True,
            ),
            Pipeliner.Command(
                UTILDIR
                + "/clean_bam.pl dedupped.bam dedupped.bam.validation dedupped.valid.bam",
                "make_valid_dedupped_bam.ok",
            ),
            Pipeliner.Command(
                "java -jar "
                + gatk_path
                + " SplitNCigarReads -R "
                + st_fa_path
                + " -I dedupped.valid.bam -O splitNCigar.bam "
                + " --read-validation-stringency LENIENT",
                "splitNCigarReads.ok",
            ),
        ]
    )
    pipeliner.run()

    # do the actual variant calling
    logger.info("Variant Calling using Haplotype Caller.")

    pipeliner.add_commands(
        [
            Pipeliner.Command(
                "java -jar "
                + gatk_path
                + " HaplotypeCaller -R "
                + st_fa_path
                + " -I ./splitNCigar.bam -dont-use-soft-clipped-bases -stand-call-conf 20.0 -O output.vcf",
                "haplotypecaller.ok",
            )
        ]
    )
    pipeliner.run()

    # do some basic filtering
    logger.info("Doing some basic filtering of vcf.")

    pipeliner.add_commands(
        [
            Pipeliner.Command(
                "java -jar "
                + gatk_path
                + " VariantFiltration -R "
                + st_fa_path
                + " -V output.vcf -window 35 -cluster 3 "
                + '--filter-name FS -filter "FS > 30.0" '
                + '--filter-name QD -filter "QD < 2.0" -O filtered_output.vcf',
                "variant_filt.ok",
            )
        ]
    )

    pipeliner.run()

    logger.info("Done!")
def main():

    FORMAT = "%(asctime)-15s %(levelname)s %(module)s.%(name)s.%(funcName)s at %(lineno)d :\n\t%(message)s\n"
    global logger
    logger = logging.getLogger()
    logging.basicConfig(filename='variant_calling.log',
                        format=FORMAT,
                        filemode='w',
                        level=logging.DEBUG)
    # add a new Handler to print all INFO and above messages to stdout
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    parser = argparse.ArgumentParser(description=str(
        "This script requires you have the following dependencies:\n" +
        "Samtools: \"samtools\" in your path\n" +
        "Java: \"java\" in your path\n" +
        "Picard-Tools: env var \"$PICARD_HOME\" with the path to Picard-Tools's bin\n"
        + "STAR: \"STAR\" in your path\n" +
        "GATK: env var \"$GATK_HOME\" with the path to GATK's bin\n"),
                                     epilog="",
                                     formatter_class=argparse.
                                     RawTextHelpFormatter)

    parser.add_argument('--st_fa',
                        '--supertranscript_fasta',
                        dest="st_fa",
                        type=str,
                        required=True,
                        help="Path to the SuperTranscripts fasta file.")

    parser.add_argument('--st_gtf',
                        '--supertranscript_gtf',
                        dest="st_gtf",
                        type=str,
                        required=True,
                        help="Path to the SuperTranscript gtf file.")

    group = parser.add_mutually_exclusive_group(required=True)

    group.add_argument('-p',
                       '--paired',
                       dest="paired_reads",
                       type=str,
                       nargs=2,
                       help="Pair of paired ends read files.")

    group.add_argument('-s',
                       '--single',
                       dest="single_reads",
                       type=str,
                       help="Single reads file.")

    group.add_argument(
        "-S",
        "--samples_file",
        dest="samples_file",
        type=str,
        help=
        "Trinity samples file (fmt: condition_name replicate_name /path/to/reads_1.fq /path/to/reads_2.fq (tab-delimited, single sample per line))"
    )

    parser.add_argument(
        '-o',
        '--output',
        dest="out_path",
        type=str,
        required=True,
        help="Path to the folder where to generate the output.")

    parser.add_argument(
        '-l',
        '--sjdbOverhang',
        dest="sjdbOverhang",
        default=150,
        type=int,
        help="Size of the reads (used for STAR --sjdbOverhang). default=150")

    parser.add_argument(
        '-t',
        '--threads',
        dest="nthreads",
        type=str,
        default="4",
        help="Number of threads to use for tools that are multithreaded.")

    parser.add_argument(
        '-m',
        '--maxram',
        dest="maxram",
        type=str,
        default="50000000000",
        help=
        "Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value)."
    )

    parser.add_argument(
        "--STAR_genomeGenerate_opts",
        type=str,
        default="",
        help="options to pass through to STAR's genomeGenerate function")

    args = parser.parse_args()

    PICARD_HOME = os.getenv("PICARD_HOME")
    if not PICARD_HOME:
        exit("Error, missing path to Picard-Tools in $PICARD_HOME.")

    GATK_HOME = os.getenv("GATK_HOME")
    if not GATK_HOME:
        exit("Error, missing path to GATK in $GATK.")

    # get real paths before changing working directory in case they are relative paths
    if args.paired_reads:
        reads_paths = [os.path.realpath(f) for f in args.paired_reads]
    elif args.single_reads:
        reads_paths = [os.path.realpath(args.single_reads)]
    elif args.samples_file:
        left_fq_list = list()
        right_fq_list = list()
        with open(args.samples_file) as fh:
            for line in fh:
                line = line.rstrip()
                if not re.match("\w", line):
                    continue
                fields = line.split("\t")
                left_fq = fields[2]
                left_fq_list.append(os.path.realpath(left_fq))
                if len(fields) > 3:
                    right_fq = fields[3]
                    right_fq_list.append(os.path.realpath(right_fq))
        reads_paths = [",".join(left_fq_list)]
        if right_fq_list:
            reads_paths.append(",".join(right_fq_list))
    else:
        raise RuntimeError("no reads specified")  # should never get here

    st_fa_path = os.path.realpath(args.st_fa)

    st_gtf_path = os.path.realpath(args.st_gtf)

    # check if output directory exists, if not create
    real_path = os.path.realpath(args.out_path)
    if not os.path.isdir(real_path):
        os.makedirs(real_path)

    # move to output folder
    os.chdir(real_path)

    checkpoint_dir = os.path.abspath(
        os.path.basename(st_fa_path)) + ".gatk_chkpts"
    pipeliner = Pipeliner.Pipeliner(checkpoint_dir)

    # generate supertranscript index
    logger.info("Generating SuperTranscript index.")
    pipeliner.add_commands([
        Pipeliner.Command("samtools faidx {}".format(st_fa_path),
                          "samtools_faidx_st.ok")
    ])
    pipeliner.run()

    # generate supertranscript Picard dictionary
    logger.info("Generating Picard dictionary.")
    dict_file = re.sub(r"\.[^\.]+$", ".dict", st_fa_path)
    if os.path.isfile(dict_file):
        open(checkpoint_dir + "/picard_dict_st.ok", 'a').close()
    else:
        pipeliner.add_commands([
            Pipeliner.Command(
                "java -jar " + PICARD_HOME + "/picard.jar" +
                " CreateSequenceDictionary R=" + st_fa_path + " O=" +
                dict_file + " VALIDATION_STRINGENCY=LENIENT ",
                "picard_dict_st.ok")
        ])
        pipeliner.run()

    # generate genome folder for STAR's first pass
    logger.info("Generating genome folder for STAR")

    star_genome_generate_cmd = str(
        "STAR --runThreadN " + args.nthreads + " --runMode genomeGenerate" +
        " --genomeDir star_genome_idx " +
        " --genomeFastaFiles {} ".format(st_fa_path) +
        " --genomeSAindexNbases 8 " +  # as per A. Dobin
        " --sjdbGTFfile {} ".format(st_gtf_path) +
        " --sjdbOverhang {} ".format(args.sjdbOverhang) +
        " --limitGenomeGenerateRAM {}".format(args.maxram) +
        " {} ".format(args.STAR_genomeGenerate_opts))

    pipeliner.add_commands([
        Pipeliner.Command("mkdir star_genome_idx", "mkdir_star_genome_idx.ok"),
        Pipeliner.Command(star_genome_generate_cmd, "star_genome_generate.ok")
    ])
    pipeliner.run()

    # run STAR's alignment
    logger.info("Running STAR alignment.")
    cmd = str("STAR --runThreadN " + args.nthreads +
              " --genomeDir star_genome_idx " + " --runMode alignReads " +
              " --twopassMode Basic " + " --alignSJDBoverhangMin 10 " +
              " --outSAMtype BAM SortedByCoordinate " +
              " --limitBAMsortRAM {} ".format(args.maxram) +
              " --readFilesIn " + " ".join(reads_paths))

    if re.search("\.gz$", reads_paths[0]):
        cmd += " --readFilesCommand 'gunzip -c' "

    pipeliner.add_commands([Pipeliner.Command(cmd, "star_aln.ok")])
    pipeliner.run()

    # clean and convert sam file with Picard-Tools
    logger.info("Cleaning and Converting sam file with Picard-Tools.")

    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + PICARD_HOME + "/picard.jar " +
            " AddOrReplaceReadGroups " + "I=Aligned.sortedByCoord.out.bam " +
            "O=rg_added_sorted.bam " + " VALIDATION_STRINGENCY=SILENT " +
            "SO=coordinate RGID=id RGLB=library RGPL=platform RGPU=machine RGSM=sample",
            "add_read_groups.ok"),
        Pipeliner.Command(
            "java -jar " + PICARD_HOME + "/picard.jar " + " MarkDuplicates " +
            "I=rg_added_sorted.bam O=dedupped.bam " +
            "CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics",
            "mark_dups.ok"),
        Pipeliner.Command("java -jar " + PICARD_HOME +
                          "/picard.jar ValidateSamFile " + "I=dedupped.bam " +
                          "IGNORE_WARNINGS=true " + "MAX_OUTPUT=100000 " +
                          "IGNORE=MATE_NOT_FOUND " +
                          "O=dedupped.bam.validation",
                          "bam_validate.ok",
                          ignore_error=True),
        Pipeliner.Command(
            UTILDIR +
            "/clean_bam.pl dedupped.bam dedupped.bam.validation dedupped.valid.bam",
            "make_valid_dedupped_bam.ok"),
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/GenomeAnalysisTK.jar " +
            "-T SplitNCigarReads -R " + st_fa_path +
            " -I dedupped.valid.bam -o splitNCigar.bam " +
            " -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS  --validation_strictness LENIENT",
            "splitNCigarReads.ok")
    ])
    pipeliner.run()

    # do the actual variant calling
    logger.info("Variant Calling using Haplotype Caller.")

    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/GenomeAnalysisTK.jar " +
            "-T HaplotypeCaller -R " + st_fa_path +
            " -I ./splitNCigar.bam -dontUseSoftClippedBases -stand_call_conf 20.0 -o output.vcf",
            "haplotypecaller.ok")
    ])
    pipeliner.run()

    # do some basic filtering
    logger.info("Doing some basic filtering of vcf.")

    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/GenomeAnalysisTK.jar " +
            "-T VariantFiltration -R " + st_fa_path +
            " -V output.vcf -window 35 -cluster 3 " +
            "-filterName FS -filter \"FS > 30.0\" " +
            "-filterName QD -filter \"QD < 2.0\" -o filtered_output.vcf",
            "variant_filt.ok")
    ])

    pipeliner.run()

    logger.info("Done!")
def main():

    parser = argparse.ArgumentParser(
        description="capture gene to intron usage stats",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    ctat_genome_lib = os.environ.get("CTAT_GENOME_LIB", None)
    parser.add_argument(
        "--ctat_genome_lib",
        dest="ctat_genome_lib",
        type=str,
        required=False,
        default=ctat_genome_lib,
        help="ctat genome lib build dir",
    )
    parser.add_argument(
        "--SJ_tab_file",
        dest="SJ_tab_file",
        type=str,
        required=True,
        help="STAR SJ.out.tab file",
    )
    parser.add_argument(
        "--chimJ_file",
        dest="chimJ_file",
        type=str,
        required=False,
        default=None,
        help="STAR Chimeric.out.junction file",
    )
    parser.add_argument(
        "--output_prefix",
        dest="output_prefix",
        type=str,
        required=True,
        help="prefix for all output files",
    )
    parser.add_argument(
        "--min_total_reads",
        dest="min_total_reads",
        type=int,
        required=False,
        default=5,
        help="minimum reads supporting cancer intron",
    )

    parser.add_argument(
        "--vis",
        action="store_true",
        default=False,
        help=
        "Generate igv html ctat splicing visualization (requires --bam_file to be set)",
    )
    parser.add_argument(
        "--bam_file",
        dest="bam_file",
        type=str,
        required=False,
        default=None,
        help="STAR generated BAM file",
    )
    parser.add_argument(
        "--sample_name",
        dest="vis_sample_name",
        type=str,
        required=False,
        default="",
        help="sample name for vis title",
    )

    args = parser.parse_args()

    ctat_genome_lib = args.ctat_genome_lib

    if ctat_genome_lib is None or not os.path.exists(ctat_genome_lib):
        raise RuntimeError("Error, must set --ctat_genome_lib ")

    SJ_tab_file = args.SJ_tab_file
    chimJ_file = args.chimJ_file
    output_prefix = args.output_prefix
    bam_file = args.bam_file
    VIS_flag = args.vis
    min_total_reads = args.min_total_reads
    vis_sample_name = args.vis_sample_name

    if VIS_flag and not bam_file:
        raise RuntimeError("Error, if --vis, must specify --bam_file ")

    if not os.path.exists(SJ_tab_file):
        raise RuntimeError(
            "Error, cannot locate expected splice junction tab file: {} ".
            format(SJ_tab_file))

    chckpts_dir = output_prefix + ".chckpts"
    if not os.path.exists(chckpts_dir):
        os.makedirs(chckpts_dir)

    pipeliner = Pipeliner(chckpts_dir)

    introns_output_file = output_prefix + ".introns"
    introns_output_file_chckpt = os.path.join(chckpts_dir, "introns.ok")
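    # the introns table below is built in-process rather than through
    # Pipeliner, so its checkpoint file is checked and touched manually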

    if not os.path.exists(introns_output_file_chckpt):

        targets_list_file = os.path.join(ctat_genome_lib,
                                         "ref_annot.gtf.mini.sortu")
        chr_intron_bounds = ioc.populate_intron_bounds(targets_list_file)
        introns_dict = ioc.map_introns_from_splice_tab(SJ_tab_file,
                                                       chr_intron_bounds)

        if chimJ_file is not None:
            if not os.path.exists(chimJ_file):
                raise RuntimeError(
                    "Error, cannot locate expected chimeric Junctiom out file: {} "
                    .format(chimJ_file))

            # must make splice file:
            chimJ_introns_file = (output_prefix + "." +
                                  os.path.basename(chimJ_file) +
                                  ".introns.tmp")
            cmd = str(
                os.path.join(utildir, "STAR_chimeric_junctions_to_introns.pl")
                + " -J {} > {}".format(chimJ_file, chimJ_introns_file))
            subprocess.check_call(cmd, shell=True)

            introns_dict = ioc.supplement_introns_from_chimeric_junctions_file(
                chimJ_introns_file, introns_dict, chr_intron_bounds)

        with open(introns_output_file, "wt") as ofh:
            # write header
            ofh.write("\t".join(
                ["intron", "strand", "genes", "uniq_mapped", "multi_mapped"]) +
                      "\n")

            for intron in introns_dict.values():
                ofh.write("\t".join([
                    "{}:{}-{}".format(intron.chromosome, intron.lend,
                                      intron.rend),
                    intron.strand,
                    intron.genes,
                    str(intron.uniq_mapped),
                    str(intron.multi_mapped),
                ]) + "\n")

        # done, add checkpoint
        subprocess.check_call("touch {}".format(introns_output_file_chckpt),
                              shell=True)

    # annotate for cancer introns.
    cancer_introns_file_prelim = output_prefix + ".cancer.introns.prelim"
    cmd = str(
        os.path.join(utildir, "annotate_cancer_introns.pl") +
        " --introns_file {} ".format(introns_output_file) +
        " --ctat_genome_lib {} ".format(ctat_genome_lib) + " --intron_col 0 " +
        " > {} ".format(cancer_introns_file_prelim))

    pipeliner.add_commands([Command(cmd, "prelim_introns.ok")])

    # filter for min support
    cancer_introns_file = output_prefix + ".cancer.introns"
    cmd = str(
        os.path.join(utildir, "filter_by_min_total_reads.py") +
        " --cancer_intron_candidates {}".format(cancer_introns_file_prelim) +
        " --min_total_reads {} ".format(min_total_reads) +
        " > {} ".format(cancer_introns_file))

    pipeliner.add_commands([Command(cmd, "introns_filtered.ok")])

    pipeliner.run()

    if VIS_flag:

        # generate the intron/junctions bed needed by igv
        igv_introns_bed_file = introns_output_file + ".for_IGV.bed"
        cmd = str(
            os.path.join(utildir, "make_igv_splice_bed.py") +
            " --all_introns {} ".format(introns_output_file) +
            " --cancer_introns {} ".format(cancer_introns_file) +
            " --genome_lib_dir {} ".format(ctat_genome_lib) +
            " --output_bed {} ".format(igv_introns_bed_file))

        pipeliner.add_commands([Command(cmd, "intron_igv_bed.ok")])
        pipeliner.run()

        igv_tracks_config_file = write_igv_config(
            output_prefix,
            ctat_genome_lib,
            igv_introns_bed_file,
            bam_file,
            os.path.join(utildir, "misc/igv.tracks.json"),
            pipeliner,
        )

        # Create the IGV Reports
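        # ("create_report" is the igv-reports command-line tool, assumed to be in PATH)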
        cmd = str(
            "create_report {} ".format(igv_introns_bed_file) +
            " {} ".format(os.path.join(ctat_genome_lib, "ref_genome.fa")) +
            " --type junction " +
            " --output {}.ctat-splicing.igv.html ".format(output_prefix) +
            " --track-config {} ".format(igv_tracks_config_file) +
            " --info-columns gene variant_name uniquely_mapped multi_mapped TCGA GTEx "
            + " --title 'CTAT_Splicing: {}' ".format(vis_sample_name))

        pipeliner.add_commands([Command(cmd, "igv_create_html.ok")])
        pipeliner.run()

    logger.info("done.")

    sys.exit(0)