Example #1
0
def _get_samples_to_process(fn, out_dir, config, force_single):
    """Parse a CSV file with one line per input file and group by sample.

    Each line is ``file,name[,annotations...]``; all lines sharing the same
    description name (second column) are merged into one sample entry.

    Args:
        fn: path to the CSV file describing the inputs.
        out_dir: directory where merged outputs will be written.
        config: configuration dict passed through to each sample entry.
        force_single: forwarded to ``_check_paired`` to disable pairing.

    Returns:
        list of single-item lists, one dict per sample, describing the
        merge function to run, output file, annotation columns and config.

    Raises:
        ValueError: if a non-empty line has fewer than 2 comma-separated
            values, or a referenced file has an unrecognized type.
    """
    out_dir = os.path.abspath(out_dir)
    samples = defaultdict(list)
    with open(fn) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # skip blank/trailing lines instead of raising ValueError
                # (split(",") always yields at least one column, so the
                # original len(cols) > 0 guard never filtered them out)
                continue
            cols = line.split(",")
            if len(cols) < 2:
                raise ValueError("Line needs 2 values: file and name.")
            if utils.file_exists(cols[0]) or is_gsm(cols[0]):
                if cols[0].find(" ") > -1:
                    # spaces break downstream shell commands, so symlink to
                    # a space-free name and use that instead
                    new_name = os.path.abspath(cols[0].replace(" ", "_"))
                    logger.warning("Space finds in %s. Linked to %s." % (cols[0], new_name))
                    logger.warning("Please, avoid names with spaces in the future.")
                    utils.symlink_plus(os.path.abspath(cols[0]), new_name)
                    cols[0] = new_name
                samples[cols[1]].append(cols)
            else:
                logger.info("skipping %s, File doesn't exist." % cols[0])
    for sample, items in samples.items():
        # choose the merge strategy from the first file of the group
        if is_fastq(items[0][0], True):
            merge_fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            merge_fn = "bam_merge"
            ext = ".bam"
        elif is_gsm(items[0][0]):
            merge_fn = "query_gsm"
            ext = ".fastq.gz"
        else:
            # previously this fell through with the merge function and
            # extension unset, causing a confusing NameError later; fail
            # with a clear message instead
            raise ValueError("Unrecognized file type for %s" % items[0][0])
        # GSM accessions are identifiers, not paths, so leave them as-is
        files = [os.path.abspath(fn_file[0]) if not is_gsm(fn_file[0]) else fn_file[0]
                 for fn_file in items]
        samples[sample] = [{'files': _check_paired(files, force_single),
                           'out_file': os.path.join(out_dir, sample + ext),
                           'fn': merge_fn,
                           'anno': items[0][2:],
                           'config': config,
                           'name': sample,
                           'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
Example #2
0
def _prepare_samples(args):
    """
    Create the per-sample data dictionaries used downstream.

    Reads the bcbio system YAML configuration (either the one passed via
    ``args.galaxy`` or the default under the data directory), then builds
    one dict per fastq input file (falling back to vcf inputs when no
    fastqs are given).

    Args:
        args: parsed command-line arguments; uses ``args.galaxy`` and
            ``args.files``.

    Returns:
        list of single-item lists, one dict per sample with keys
        ``name``, ``config``, ``fastq`` and optionally ``bam``.
    """
    if args.galaxy:
        system_config = args.galaxy
    else:
        system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    # use a context manager so the config file handle is closed promptly;
    # the original leaked the handle returned by open() inside yaml.load()
    with open(system_config) as in_handle:
        config = yaml.load(in_handle)
    config['algorithm'] = {}
    data = []
    vcf_files = [fn for fn in args.files if fn.endswith('vcf')]
    bam_files = [fn for fn in args.files if fn.endswith('bam')]
    fastq_files = [fn for fn in args.files if is_fastq(fn)]
    if not fastq_files:
        # no fastq inputs: treat the vcf files as the primary inputs
        fastq_files = vcf_files
    for sample in fastq_files:
        dt = {}
        dt['name'] = splitext_plus(op.basename(sample))[0]
        dt['config'] = config
        dt['fastq'] = op.abspath(sample)
        if bam_files:
            # pair the sample with its matching bam, if one exists
            dt['bam'] = _find_bam(bam_files, sample)
        data.append([dt])
    return data
Example #3
0
def _get_samples_to_process(fn, out_dir, config):
    """parse csv file with one line per file. It will merge
    all files that have the same description name.

    Each row is ``file,name[,annotations...]``; rows sharing the same
    name (second column) are grouped into one merge task. Returns a
    list of single-item lists, one task dict per sample.
    """
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            # skip the header row
            if not l.startswith("samplename"):
                cols = l.strip().split(",")
                # group rows by the sample/description name column
                samples[cols[1]].append(cols)
    # NOTE(review): iteritems() is Python 2 only — confirm before porting
    for sample, items in samples.iteritems():
        # pick the merge strategy from the group's first file; if it is
        # neither fastq nor bam, `fn`/`ext` stay unset -> NameError below
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        files = [os.path.abspath(fn_file[0]) for fn_file in items]
        samples[sample] = [{
            'files': _check_paired(files),
            'out_file': os.path.join(out_dir, sample + ext),
            'fn': fn,
            'anno': items[0][2:],
            'config': config,
            'name': sample,
            'out_dir': out_dir
        }]
    return [samples[sample] for sample in samples]
def _get_samples_to_process(fn, out_dir, config):
    """parse csv file with one line per file. It will merge
    all files that have the same description name.

    Each row is ``file,name[,annotations...]``; rows sharing the same
    name are merged into one task dict (merge function, output file,
    annotations, config).

    Raises:
        ValueError: for a non-empty row with fewer than two columns, or
            a first file that is neither fastq nor bam.
    """
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            if not l.strip():
                # tolerate blank/trailing lines instead of raising
                continue
            cols = l.strip().split(",")
            if len(cols) < 2:
                raise ValueError("Line needs 2 values: file and name.")
            if utils.file_exists(cols[0]):
                samples[cols[1]].append(cols)
            else:
                logger.info("skipping %s, File doesn't exist." % cols[0])
    # .items() works on both py2 and py3 (iteritems is py2-only) and
    # matches the style used elsewhere in this module
    for sample, items in samples.items():
        if is_fastq(items[0][0], True):
            merge_fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            merge_fn = "bam_merge"
            ext = ".bam"
        else:
            # previously fell through with merge_fn/ext unset -> NameError
            raise ValueError("Unrecognized file type for %s" % items[0][0])
        files = [os.path.abspath(fn_file[0]) for fn_file in items]
        samples[sample] = [{
            'files': _check_paired(files),
            'out_file': os.path.join(out_dir, sample + ext),
            'fn': merge_fn,
            'anno': items[0][2:],
            'config': config,
            'name': sample,
            'out_dir': out_dir
        }]
    return [samples[sample] for sample in samples]
Example #5
0
def trim_adapters(data):
    """Trim adapter sequence and low-quality ends from the sample's
    fastq inputs; returns the file list unchanged when nothing is
    trimmable."""
    input_files = data["files"]
    fastq_inputs = [f for f in input_files if f is not None and is_fastq(f)]
    if not fastq_inputs:
        return input_files

    logger.info("Trimming low quality ends and read through adapter "
                "sequence from %s." % (", ".join(fastq_inputs)))
    trimmed_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = utils.safe_makedir(trimmed_dir)
    return _trim_adapters(fastq_inputs, out_dir, data)
Example #6
0
def trim_adapters(data):
    """Trim adapter sequence and low-quality ends from the fastq files
    in ``data["files"]``; returns the file list unchanged when no fastq
    inputs are present."""
    # only fastq inputs can be trimmed; ignore None placeholders
    to_trim = [x for x in data["files"] if x is not None and is_fastq(x)]
    if not to_trim:
        return data["files"]

    logger.info("Trimming low quality ends and read through adapter "
                "sequence from %s." % (", ".join(to_trim)))
    # trimmed outputs go under <work_dir>/trimmed (created if missing)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trimmed"))
    return _trim_adapters(to_trim, out_dir, data)
Example #7
0
def _gzip_fastq(in_file):
    """
    Return a gzipped version of ``in_file`` when it is an uncompressed
    fastq file; other inputs pass through unchanged. An existing ``.gz``
    sibling is reused instead of recompressing.
    """
    if not fastq.is_fastq(in_file) or utils.is_gzipped(in_file):
        return in_file
    gzipped_file = in_file + ".gz"
    if file_exists(gzipped_file):
        return gzipped_file
    message = "gzipping {in_file}.".format(in_file=in_file)
    do.run("gzip -c {in_file} > {gzipped_file}".format(**locals()), message)
    return gzipped_file
Example #8
0
def _gzip_fastq(in_file):
    """
    gzip a fastq file if it is not already gzipped.

    Remote inputs (paths starting with a supported remote prefix) are
    left untouched; an already-existing ``.gz`` sibling is reused.
    """
    # str.startswith accepts a tuple of prefixes (utils.SUPPORTED_REMOTES)
    if (fastq.is_fastq(in_file) and not utils.is_gzipped(in_file)
          and not in_file.startswith(utils.SUPPORTED_REMOTES)):
        gzipped_file = in_file + ".gz"
        if file_exists(gzipped_file):
            # reuse previous output instead of recompressing
            return gzipped_file
        message = "gzipping {in_file}.".format(in_file=in_file)
        do.run("gzip -c {in_file} > {gzipped_file}".format(**locals()), message)
        return gzipped_file
    return in_file
Example #9
0
def _gzip_fastq(in_file):
    """
    Gzip ``in_file`` when it is a local, uncompressed fastq file.

    Already-gzipped, non-fastq, or remote (object-store) inputs are
    returned unchanged; an existing ``.gz`` sibling is reused.
    """
    needs_gzip = (fastq.is_fastq(in_file)
                  and not utils.is_gzipped(in_file)
                  and not objectstore.is_remote(in_file))
    if not needs_gzip:
        return in_file
    gzipped_file = in_file + ".gz"
    if file_exists(gzipped_file):
        return gzipped_file
    message = "gzipping {in_file}.".format(in_file=in_file)
    do.run("gzip -c {in_file} > {gzipped_file}".format(**locals()),
           message)
    return gzipped_file
Example #10
0
def _bzip_gzip(in_file):
    """
    Convert a bzip2-compressed fastq file to gzip compression.

    Returns the gzipped path when a conversion happens or the converted
    file already exists; otherwise returns ``in_file`` unchanged.
    """
    base, first_ext = os.path.splitext(in_file)
    gzipped_file = base + ".gz"
    # BUG FIX: the early return was inverted (``if not file_exists``),
    # which returned a path to a non-existent file and made the actual
    # bunzip2|gzip conversion below unreachable.
    if file_exists(gzipped_file):
        return gzipped_file
    if (fastq.is_fastq(base) and utils.is_bzipped(in_file)
            and not objectstore.is_remote(in_file)):
        message = "gzipping {in_file}.".format(in_file=in_file)
        do.run("bunzip2 -c {in_file} | gzip > {gzipped_file}".format(**locals()), message)
        return gzipped_file
    return in_file
Example #11
0
def _gzip_fastq(in_file):
    """
    Ensure a local fastq file is gzip-compressed: bzip2 inputs are
    converted via ``_bzip_gzip`` and plain files are gzipped inside a
    file transaction. Remote or non-fastq inputs pass through unchanged.
    """
    if not fastq.is_fastq(in_file) or objectstore.is_remote(in_file):
        return in_file
    if utils.is_bzipped(in_file):
        return _bzip_gzip(in_file)
    if utils.is_gzipped(in_file):
        return in_file
    gzipped_file = in_file + ".gz"
    if file_exists(gzipped_file):
        return gzipped_file
    message = "gzipping {in_file}.".format(in_file=in_file)
    with file_transaction(gzipped_file) as tx_gzipped_file:
        do.run("gzip -c {in_file} > {tx_gzipped_file}".format(**locals()),
               message)
    return gzipped_file
Example #12
0
def _bzip_gzip(in_file):
    """
    Recompress a bzip2 fastq file as gzip, reusing an existing output.
    Non-bzipped, non-fastq, or remote inputs are returned unchanged.
    """
    if not utils.is_bzipped(in_file):
        return in_file
    base = os.path.splitext(in_file)[0]
    if not fastq.is_fastq(base) or objectstore.is_remote(in_file):
        return in_file
    gzipped_file = base + ".gz"
    if file_exists(gzipped_file):
        return gzipped_file
    message = "gzipping {in_file}.".format(in_file=in_file)
    with file_transaction(gzipped_file) as tx_gzipped_file:
        do.run("bunzip2 -c {in_file} | gzip > {tx_gzipped_file}".format(**locals()), message)
    return gzipped_file
Example #13
0
def _gzip_fastq(in_file):
    """
    gzip a fastq file if it is not already gzipped, handling conversion
    from bzip to gzipped files.

    Remote (object-store) and non-fastq inputs are returned unchanged;
    an existing ``.gz`` sibling is reused.
    """
    if fastq.is_fastq(in_file) and not objectstore.is_remote(in_file):
        if utils.is_bzipped(in_file):
            # delegate bz2 -> gz conversion
            return _bzip_gzip(in_file)
        elif not utils.is_gzipped(in_file):
            gzipped_file = in_file + ".gz"
            if file_exists(gzipped_file):
                # reuse previous output instead of recompressing
                return gzipped_file
            message = "gzipping {in_file}.".format(in_file=in_file)
            # file_transaction makes the write atomic: output appears
            # only after the gzip command finishes successfully
            with file_transaction(gzipped_file) as tx_gzipped_file:
                do.run("gzip -c {in_file} > {tx_gzipped_file}".format(**locals()),
                       message)
            return gzipped_file
    return in_file
def _get_samples_to_process(fn, out_dir, config):
    """Group the csv rows (one input file per line) by description name
    and build one merge-task dict per sample."""
    samples = defaultdict(list)
    with open(fn) as handle:
        for line in handle:
            # skip the header row
            if line.startswith("samplename"):
                continue
            cols = line.strip().split(",")
            samples[cols[1]].append(cols)
    for sample, items in samples.iteritems():
        first_file = items[0][0]
        if is_fastq(first_file, True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(first_file):
            fn = "bam_merge"
            ext = ".bam"
        files = [os.path.abspath(item[0]) for item in items]
        samples[sample] = [{'files': _check_paired(files),
                           'out_file': os.path.join(out_dir, sample + ext),
                           'fn': fn,
                           'anno': items[0][2:],
                           'config': config,
                           'name': sample,
                           'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
Example #15
0
def _bzip_gzip(in_file, out_dir=None):
    """
    Recompress a bzip2 fastq file as gzip, optionally writing the
    output into ``out_dir``. Returns ``in_file`` unchanged unless it is
    a local bzipped fastq; an existing output is reused.
    """
    if not utils.is_bzipped(in_file):
        return in_file
    base = os.path.splitext(in_file)[0]
    if out_dir:
        gzipped_file = os.path.join(out_dir, os.path.basename(base) + ".gz")
    else:
        gzipped_file = base + ".gz"
    if not fastq.is_fastq(base) or objectstore.is_remote(in_file):
        return in_file
    if file_exists(gzipped_file):
        return gzipped_file
    message = "gzipping {in_file} to {gzipped_file}.".format(
        in_file=in_file, gzipped_file=gzipped_file)
    with file_transaction(gzipped_file) as tx_gzipped_file:
        do.run("bunzip2 -c {in_file} | gzip > {tx_gzipped_file}".format(**locals()), message)
    return gzipped_file
Example #16
0
def run_sailfish(data):
    """Run sailfish transcript quantification for one sample.

    Returns the (wrapped) sample unchanged when the first input is not
    fastq; otherwise records the sailfish output file and directory back
    into ``data``.
    """
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    # paired-end when exactly two files are given, otherwise single-end
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    if not fastq.is_fastq(fq1):
        return [[data]]
    sailfish_dir = os.path.join(work_dir, "sailfish", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    stranded = dd.get_strandedness(data).lower()
    out_file = sailfish(fq1, fq2, sailfish_dir, gtf_file, fasta_file, stranded, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, sailfish_dir)
    return [[data]]
Example #17
0
def run_sailfish(data):
    """Quantify transcript expression with sailfish for one sample.

    Skips quantification (returning the sample unchanged) when the
    input is not fastq; otherwise stores the sailfish output file and
    directory back into ``data``.
    """
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    if not fastq.is_fastq(fq1):
        return [[data]]
    sailfish_dir = os.path.join(work_dir, "sailfish", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    stranded = dd.get_strandedness(data).lower()
    out_file = sailfish(fq1, fq2, sailfish_dir, gtf_file, fasta_file,
                        stranded, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, sailfish_dir)
    return [[data]]
Example #18
0
def _bzip_gzip(in_file, out_dir=None):
    """
    convert from bz2 to gz.

    When ``out_dir`` is given the gzipped output is written there;
    otherwise it is placed next to the input. Non-bzipped, non-fastq,
    or remote inputs are returned unchanged.
    """
    if not utils.is_bzipped(in_file):
        return in_file
    base, _ = os.path.splitext(in_file)
    if out_dir:
        gzipped_file = os.path.join(out_dir, os.path.basename(base) + ".gz")
    else:
        gzipped_file = base + ".gz"
    # the fastq test is on ``base`` (the name with .bz2 stripped)
    if (fastq.is_fastq(base) and not objectstore.is_remote(in_file)):
        if file_exists(gzipped_file):
            # reuse previous output instead of reconverting
            return gzipped_file
        message = "gzipping {in_file} to {gzipped_file}.".format(
            in_file=in_file, gzipped_file=gzipped_file)
        # file_transaction makes the write atomic on success
        with file_transaction(gzipped_file) as tx_gzipped_file:
            do.run("bunzip2 -c {in_file} | gzip > {tx_gzipped_file}".format(**locals()), message)
        return gzipped_file
    return in_file
Example #19
0
def get_files(target_files, config):
    """Resolve each target file name to existing paths on disk.

    Names not found directly are searched for (with a wildcard suffix)
    under each directory in ``config["inputs"]``, keeping bam/fastq
    matches. Raises AssertionError when a name resolves nowhere.
    """
    out = []
    for fname_in in target_files.keys():
        if isinstance(fname_in, (list, tuple)):
            fnames = fname_in
        else:
            fnames = fname_in.split(";")
        for fname in fnames:
            if os.path.exists(fname):
                out.append(fname)
                continue
            matches = []
            for dirname in config["inputs"]:
                pattern = os.path.join(dirname, fname) + "*"
                for candidate in glob.glob(pattern):
                    if ((bam.is_bam(candidate) or fastq.is_fastq(candidate))
                            and os.path.exists(candidate)):
                        matches.append(candidate)
            assert matches, "Did not find files %s in directories %s" % (
                fname, config["inputs"])
            out.extend(matches)
    return out
def _get_samples_to_process(fn, out_dir, config):
    """parse csv file with one line per file. It will merge
    all files that have the same description name.

    Each row is ``file,name[,annotations...]``; rows with a missing
    file are skipped, and the rest are grouped by name into one merge
    task per sample.
    """
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            cols = l.strip().split(",")
            if len(cols) < 2:
                raise ValueError("Line needs 2 values: file and name.")
            if utils.file_exists(cols[0]):
                samples[cols[1]].append(cols)
            else:
                logger.info("skipping %s, File doesn't exist." % cols[0])
    # NOTE(review): iteritems() is Python 2 only — confirm before porting
    for sample, items in samples.iteritems():
        # pick the merge strategy from the group's first file; if it is
        # neither fastq nor bam, `fn`/`ext` stay unset -> NameError below
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        files = [os.path.abspath(fn_file[0]) for fn_file in items]
        samples[sample] = [{'files': _check_paired(files), 'out_file': os.path.join(out_dir, sample + ext), 'fn': fn, 'anno': items[0][2:], 'config': config, 'name': sample, 'out_dir': out_dir}]
    return [samples[sample] for sample in samples]