Example #1
def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.
       Using only first paired reads.
    """
    logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]),ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    stats = out = out_stats = None
    db = data["config"]["algorithm"]["kraken"]
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genomes", "kraken", "minikraken")
    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        files = data["files"]
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cl = " ".join([config_utils.get_program("kraken", data["config"]),
                               "--db", db, "--quick",
                               "--preload", "--min-hits", "2", "--threads", str(num_cores),
                               "--out", out, files[0], "2>", out_stats])
                do.run(cl,"kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out,db,data)
    return metrics
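
The _parse_kraken_output helper is never shown in these examples. As a rough illustration only, here is a minimal sketch under the assumption that kraken_stats captured kraken's stderr summary, which includes a line like "  1234 sequences classified (12.34%)"; the function name and returned keys are hypothetical, not the real bcbio implementation.

import os
import re

def _parse_kraken_output_sketch(kraken_out, db, data):
    """Hypothetical stand-in for _parse_kraken_output (not shown above).

    Assumes kraken_stats holds kraken's stderr summary.
    """
    metrics = {"kraken_report": os.path.join(kraken_out, "kraken_out")}
    stats_file = os.path.join(kraken_out, "kraken_stats")
    if os.path.exists(stats_file):
        with open(stats_file) as in_handle:
            for line in in_handle:
                match = re.search(r"(\d+) sequences classified \(([\d.]+)%\)", line)
                if match:
                    metrics["kraken_classified_reads"] = int(match.group(1))
                    metrics["kraken_classified_pct"] = float(match.group(2))
    return metrics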
Example #2
def _prepare_samples(args):
    """
    create dict for each sample having all information
    """
    if args.galaxy:
        system_config = args.galaxy
    else:
        system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    with open(system_config) as in_handle:
        config = yaml.safe_load(in_handle)
    config['algorithm'] = {}
    data = []
    vcf_files = [fn for fn in args.files if fn.endswith('vcf')]
    bam_files = [fn for fn in args.files if fn.endswith('bam')]
    fastq_files = [fn for fn in args.files if is_fastq(fn)]
    if not fastq_files:
        fastq_files = vcf_files
    for sample in fastq_files:
        dt = {}
        dt['name'] = splitext_plus(op.basename(sample))[0]
        dt['config'] = config
        dt['fastq'] = op.abspath(sample)
        if bam_files:
            dt['bam'] = _find_bam(bam_files, sample)
        data.append([dt])
    return data
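
For orientation, a hedged sketch of the structure the function above returns; the file names and paths are invented for illustration, not taken from the source.

from argparse import Namespace

# Hypothetical arguments; galaxy=None falls back to the installed bcbio_system.yaml.
args = Namespace(galaxy=None, files=["sampleA_1.fastq.gz", "sampleA.bam"])
# _prepare_samples(args) would return one single-item list per fastq (or vcf) file:
# [[{'name': 'sampleA_1',
#    'config': {...},  # parsed bcbio_system.yaml with an empty 'algorithm' dict
#    'fastq': '/abs/path/sampleA_1.fastq.gz',
#    'bam': '/abs/path/sampleA.bam'}]]  # matched by _find_bam when bam files are given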
Example #3
def _config(args):
    if args.galaxy:
        system_config = args.galaxy
    else:
        system_config = op.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    with open(system_config) as in_handle:
        config = yaml.safe_load(in_handle)
    config['algorithm'] = {}
    return config
Example #4
def update_samples(data, resources, args):
    """
    Update algorithm dict with new cores set
    """
    if args.galaxy:
        system_config = args.galaxy
    else:
        system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    with open(system_config) as in_handle:
        config = yaml.safe_load(in_handle)
    config['algorithm'] = {}

    new_data = []
    for sample in data:
        sample['config'] = config
        sample['config']['algorithm'] = resources
        new_data.append([sample])
    return new_data
Example #5
def _prepare_samples(args):
    """
    create dict for each sample having all information
    """
    if args.galaxy:
        system_config = args.galaxy
    else:
        system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    with open(system_config) as in_handle:
        config = yaml.safe_load(in_handle)
    config['algorithm'] = {}
    data = []
    for sample in args.files:
        dt = {}
        dt['name'] = splitext_plus(op.basename(sample))[0]
        dt['config'] = config
        dt['bed'] = op.abspath(sample)
        data.append([dt])
    return data
Example #6
def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.
       Using only first paired reads.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" %
                str(data["name"]))
    qc_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    out = out_stats = None
    db = data['config']["algorithm"]["kraken"]
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genomes", "kraken", "minikraken")

    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}

    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fasta files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--out {out} --fastq-input /dev/stdin  2> {out_stats}"
                      ).format(**locals())
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
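
To make the streaming concrete: for a gzipped fastq the template above builds a pipeline that feeds decompressed reads to kraken on /dev/stdin, so no uncompressed copy ever touches disk, and the same command handles plain fastq via cat. A sketch of the expansion with hypothetical paths (values are illustrative, not from the source):

cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
      "--preload --min-hits 2 "
      "--threads {num_cores} "
      "--out {out} --fastq-input /dev/stdin  2> {out_stats}").format(
          cat="zcat", fn_file="sample_1.fastq.gz", kraken_cmd="kraken",
          db="/data/genomes/kraken/minikraken", num_cores=4,
          out="kraken_out", out_stats="kraken_stats")
# cl == "zcat sample_1.fastq.gz | kraken --db /data/genomes/kraken/minikraken --quick "
#       "--preload --min-hits 2 --threads 4 --out kraken_out "
#       "--fastq-input /dev/stdin  2> kraken_stats"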
Example #7
def run(_, data, out_dir):
    """Run kraken, generating report in specified directory and parsing metrics.
       Using only first paired reads.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (dd.get_sample_name(data), ratio))
    logger.info("Running kraken to determine contaminant: %s" %
                dd.get_sample_name(data))
    # ratio = bam.get_aligned_reads(bam_file, data)
    out = out_stats = None
    db = tz.get_in(["config", "algorithm", "kraken"], data)
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(install._get_data_dir(), "genomes", "kraken",
                          "minikraken")

    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}

    if not os.path.exists(os.path.join(out_dir, "kraken_out")):
        work_dir = os.path.dirname(out_dir)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files_orig"][0] if dd.get_save_diskspace(
            data) else data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--output {out} --fastq-input /dev/stdin  2> {out_stats}"
                      ).format(**locals())
                do.run(cl, "kraken: %s" % dd.get_sample_name(data))
                if os.path.exists(out_dir):
                    shutil.rmtree(out_dir)
                shutil.move(tx_tmp_dir, out_dir)
    metrics = _parse_kraken_output(out_dir, db, data)
    return metrics
Example #8
def run(_, data, out_dir):
    """Run kraken, generating report in specified directory and parsing metrics.
       Using only first paired reads.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (dd.get_sample_name(data), ratio))
    logger.info("Running kraken to determine contaminant: %s" % dd.get_sample_name(data))
    # ratio = bam.get_aligned_reads(bam_file, data)
    out = out_stats = None
    db = tz.get_in(["config", "algorithm", "kraken"], data)
    if db and isinstance(db, (list, tuple)):
        db = db[0]
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(install._get_data_dir(), "genomes", "kraken", "minikraken")

    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}

    if not os.path.exists(os.path.join(out_dir, "kraken_out")):
        work_dir = os.path.dirname(out_dir)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files_orig"][0] if dd.get_save_diskspace(data) else data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--output {out} --fastq-input /dev/stdin  2> {out_stats}").format(**locals())
                do.run(cl, "kraken: %s" % dd.get_sample_name(data))
                if os.path.exists(out_dir):
                    shutil.rmtree(out_dir)
                shutil.move(tx_tmp_dir, out_dir)
    metrics = _parse_kraken_output(out_dir, db, data)
    return metrics
Example #9
def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.
       Using only first paired reads.
    """
    logger.info("Number of aligned reads < than 0.60 in %s: %s" %
                (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" %
                str(data["name"]))
    qc_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    out = out_stats = None
    db = data["config"]["algorithm"]["kraken"]
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genomes", "kraken", "minikraken")
    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        files = data["files"]
        if files[0].endswith("bam"):
            logger.info("kraken: need fasta files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cl = (" ").join([
                    config_utils.get_program("kraken", data["config"]), "--db",
                    db, "--quick", "--preload", "--min-hits", "2", "--threads",
                    str(num_cores), "--out", out, files[0], " 2>", out_stats
                ])
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics

def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.
       Using only first paired reads.
    """
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    out = out_stats = None
    db = data['config']["algorithm"]["kraken"]
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genomes", "kraken", "minikraken")

    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}

    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files"][0]
        if fn_file.endswith("bam"):
            logger.info("kraken: need fasta files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--out {out} --fastq-input /dev/stdin  2> {out_stats}").format(**locals())
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
    parser.add_argument("-q", "--queue", help="Queue to submit jobs to.")
    parser.add_argument("-p",
                        "--tag",
                        help="Tag name to label jobs on the cluster",
                        default="bcb-prep")
    parser.add_argument("-t",
                        "--paralleltype",
                        choices=["local", "ipython"],
                        default="local",
                        help="Run with iptyhon")

    args = parser.parse_args()
    out_dir = os.path.abspath(args.out)
    utils.safe_makedir(out_dir)
    try:
        system_config = os.path.join(_get_data_dir(), "galaxy",
                                     "bcbio_system.yaml")
    except ValueError as err:
        print(err)
        print("WARNING: Attempting to read bcbio_system.yaml in the current directory.")
        system_config = "bcbio_system.yaml"

    with open(system_config) as in_handle:
        config = yaml.safe_load(in_handle)
        res = {'cores': args.cores_per_job}
        config["algorithm"] = {"num_cores": args.cores_per_job}
        config["resources"].update({'sambamba': res, 'samtools': res})
        config["log_dir"] = os.path.join(os.path.abspath(os.getcwd()), "log")
    parallel = clargs.to_parallel(args)
    parser.add_argument("--timeout", default=15, help="Time to wait before giving up starting.")
    parser.add_argument("--retries", default=0, type=int,
                        help=("Number of retries of failed tasks during "
                              "distributed processing. Default 0 "
                              "(no retries)"))
    parser.add_argument("-s", "--scheduler", help="Type of scheduler to use.",
                        choices=["lsf", "slurm", "torque", "sge", "pbspro"])
    parser.add_argument("-r", "--resources", help="Extra scheduler resource flags.", default=[], action="append")
    parser.add_argument("-q", "--queue", help="Queue to submit jobs to.")
    parser.add_argument("-p", "--tag", help="Tag name to label jobs on the cluster", default="bcb-prep")
    parser.add_argument("-t", "--paralleltype",
                        choices=["local", "ipython"],
                        default="local", help="Run with iptyhon")

    args = parser.parse_args()
    system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    if args.galaxy:
        system_config = args.galaxy
    with open(system_config) as in_handle:
        config = yaml.safe_load(in_handle)

    parallel = clargs.to_parallel(args)
    parallel.update({'progs': args.progs})
    dirs = {'work': os.path.abspath(os.getcwd())}
    if args.sys_info.find(";") > -1:
        info = args.sys_info.split(";")
        sysinfo = {'cores': int(info[0]), 'memory': float(info[1])}
    else:
        if utils.file_exists(args.sys_info):
            sysinfo = yaml.load(open(args.sys_info))[0]
    print "system info %s" % sysinfo
        raise ValueError("--mirbase and --srna_gtf both need a value.")

    env.hosts = ["localhost"]
    env.cores = args.cores
    os.environ["PATH"] += os.pathsep + os.path.dirname(sys.executable)
    cbl = get_cloudbiolinux(REMOTES)
    sys.path.insert(0, cbl["dir"])
    genomemod = __import__("cloudbio.biodata", fromlist=["genomes"])
    # monkey patch cloudbiolinux to use this indexing command instead
    genomes = getattr(genomemod, 'genomes')
    genomes._index_w_command = _index_w_command
    fabmod = __import__("cloudbio", fromlist=["fabutils"])
    fabutils = getattr(fabmod, 'fabutils')
    fabutils.configure_runsudo(env)

    system_config = os.path.join(_get_data_dir(), "galaxy",
                                 "bcbio_system.yaml")
    with open(system_config) as in_handle:
        config = yaml.safe_load(in_handle)
    env.picard_home = config_utils.get_program("picard", config, ptype="dir")

    genome_dir = os.path.abspath(os.path.join(_get_data_dir(), "genomes"))
    args.fasta = os.path.abspath(args.fasta)
    args.gtf = os.path.abspath(args.gtf) if args.gtf else None
    if args.gff3:
        args.gtf = gff3_to_gtf(args.gtf)

    # always make a sequence dictionary
    if "seq" not in args.indexes:
        args.indexes.append("seq")
    def error(self, message):
        self.print_help()
        galaxy_base = os.path.join(_get_data_dir(), "galaxy")
        print("\nCurrent genomes\n")
        print(open(loc.get_loc_file(galaxy_base, "samtools")).read())
        sys.exit(0)
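
This error override only makes sense as a method on an argparse.ArgumentParser subclass; a minimal sketch of the presumed wiring, where the class name is an assumption and _get_data_dir/loc come from the surrounding module:

import argparse
import os
import sys

class GenomeArgParser(argparse.ArgumentParser):  # hypothetical class name
    """On bad arguments, print help plus the currently installed genomes."""
    def error(self, message):
        self.print_help()
        galaxy_base = os.path.join(_get_data_dir(), "galaxy")
        print("\nCurrent genomes\n")
        print(open(loc.get_loc_file(galaxy_base, "samtools")).read())
        sys.exit(0)

# parser = GenomeArgParser(description="...")  # then add_argument calls as below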
    parser.add_argument("--mirbase", help="species in mirbase for smallRNAseq data.")
    parser.add_argument("--srna_gtf", help="gtf to use for smallRNAseq data.")

    args = parser.parse_args()
    # if not all([args.mirbase, args.srna_gtf]) and any([args.mirbase, args.srna_gtf]):
    #     raise ValueError("--mirbase and --srna_gtf both need a value.")

    os.environ["PATH"] += os.pathsep + os.path.dirname(sys.executable)
    cbl = get_cloudbiolinux(REMOTES)
    sys.path.insert(0, cbl["dir"])
    genomemod = __import__("cloudbio.biodata", fromlist=["genomes"])
    # monkey patch cloudbiolinux to use this indexing command instead
    genomes = getattr(genomemod, 'genomes')
    genomes._index_w_command = _index_w_command

    genome_dir = os.path.abspath(os.path.join(_get_data_dir(), "genomes"))
    args.fasta = os.path.abspath(args.fasta)
    if not file_exists(args.fasta):
        print("%s does not exist, exiting." % args.fasta)
        sys.exit(1)

    args.gtf = os.path.abspath(args.gtf) if args.gtf else None
    if args.gtf and not file_exists(args.gtf):
        print("%s does not exist, exiting." % args.gtf)
        sys.exit(1)
    args.srna_gtf = os.path.abspath(args.srna_gtf) if args.srna_gtf else None

    gtf_file = args.gtf
    if args.gff3:
        gtf_file = extract_if_gzipped(gtf_file)
        gtf_file = gff3_to_gtf(gtf_file)
Example #16
    def error(self, message):
        self.print_help()
        galaxy_base = os.path.join(_get_data_dir(), "galaxy")
        print("\nCurrent genomes\n")
        print(open(loc.get_loc_file(galaxy_base, "samtools")).read())
        sys.exit(0)
        raise ValueError("--mirbase and --srna_gtf both need a value.")

    env.hosts = ["localhost"]
    env.cores = args.cores
    os.environ["PATH"] += os.pathsep + os.path.dirname(sys.executable)
    cbl = get_cloudbiolinux(REMOTES)
    sys.path.insert(0, cbl["dir"])
    genomemod = __import__("cloudbio.biodata", fromlist=["genomes"])
    # monkey patch cloudbiolinux to use this indexing command instead
    genomes = getattr(genomemod, 'genomes')
    genomes._index_w_command = _index_w_command
    fabmod = __import__("cloudbio", fromlist=["fabutils"])
    fabutils = getattr(fabmod, 'fabutils')
    fabutils.configure_runsudo(env)

    system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    with open(system_config) as in_handle:
        config = yaml.safe_load(in_handle)
    env.picard_home = config_utils.get_program("picard", config, ptype="dir")

    genome_dir = os.path.abspath(os.path.join(_get_data_dir(), "genomes"))
    args.fasta = os.path.abspath(args.fasta)
    args.gtf = os.path.abspath(args.gtf) if args.gtf else None
    if args.gff3:
        args.gtf = gff3_to_gtf(args.gtf)

    # always make a sequence dictionary
    if "seq" not in args.indexes:
        args.indexes.append("seq")

    env.system_install = genome_dir