def _get_samples_to_process(fn, out_dir, config, force_single):
    """Parse a CSV file with one line per input file.

    Lines sharing the same description name (second column) are merged
    into a single sample entry.

    Args:
        fn: path to the CSV file (file,name[,annotations...] per line).
        out_dir: directory where merged outputs will be written.
        config: configuration dict attached to every sample.
        force_single: passed through to _check_paired to force
            single-end handling.

    Returns:
        list of single-item lists, one dict per merged sample.

    Raises:
        ValueError: if a non-empty line has fewer than two columns, or a
            file is neither FASTQ, BAM nor a GSM accession.
    """
    out_dir = os.path.abspath(out_dir)
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            # Skip blank lines (e.g. a trailing newline). Previously
            # split("") produced [""] so len(cols) > 0 was always true
            # and blank lines raised ValueError.
            if not l.strip():
                continue
            cols = l.strip().split(",")
            if len(cols) < 2:
                raise ValueError("Line needs 2 values: file and name.")
            if utils.file_exists(cols[0]) or is_gsm(cols[0]):
                if cols[0].find(" ") > -1:
                    # Spaces break downstream tools: symlink to an
                    # underscore-named path and use that instead.
                    new_name = os.path.abspath(cols[0].replace(" ", "_"))
                    logger.warning("Spaces found in %s. Linked to %s." % (cols[0], new_name))
                    logger.warning("Please, avoid names with spaces in the future.")
                    utils.symlink_plus(os.path.abspath(cols[0]), new_name)
                    cols[0] = new_name
                samples[cols[1]].append(cols)
            else:
                logger.info("skipping %s, File doesn't exist." % cols[0])
    for sample, items in samples.items():
        # Decide how to merge based on the first file of the group.
        # merge_fn no longer shadows the `fn` parameter.
        if is_fastq(items[0][0], True):
            merge_fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            merge_fn = "bam_merge"
            ext = ".bam"
        elif is_gsm(items[0][0]):
            merge_fn = "query_gsm"
            ext = ".fastq.gz"
        else:
            # Previously this fell through and crashed with a NameError.
            raise ValueError("Unsupported input file type: %s" % items[0][0])
        # GSM accessions are identifiers, not paths, so leave them as-is.
        files = [os.path.abspath(fn_file[0]) if not is_gsm(fn_file[0]) else fn_file[0]
                 for fn_file in items]
        samples[sample] = [{'files': _check_paired(files, force_single),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': merge_fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
def _prepare_samples(args):
    """Create a dict for each sample with all information needed downstream.

    Args:
        args: argparse namespace with ``galaxy`` (optional system config
            path) and ``files`` (list of input file paths).

    Returns:
        list of single-item lists, one dict per fastq (or vcf) input.
    """
    if args.galaxy:
        system_config = args.galaxy
    else:
        system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml")
    # Use a context manager so the handle is closed, and safe_load to
    # avoid executing arbitrary tags from the YAML file (yaml.load
    # without a Loader is deprecated and unsafe).
    with open(system_config) as in_handle:
        config = yaml.safe_load(in_handle)
    config['algorithm'] = {}
    data = []
    vcf_files = [fn for fn in args.files if fn.endswith('vcf')]
    bam_files = [fn for fn in args.files if fn.endswith('bam')]
    fastq_files = [fn for fn in args.files if is_fastq(fn)]
    # Fall back to VCF inputs when no fastq files were given.
    if not fastq_files:
        fastq_files = vcf_files
    for sample in fastq_files:
        dt = {}
        dt['name'] = splitext_plus(op.basename(sample))[0]
        dt['config'] = config
        dt['fastq'] = op.abspath(sample)
        if bam_files:
            # Pair the sample with its matching BAM when one exists.
            dt['bam'] = _find_bam(bam_files, sample)
        data.append([dt])
    return data
def _get_samples_to_process(fn, out_dir, config):
    """Parse a CSV file with one line per file.

    All files that share the same description name (second column) are
    merged into a single sample entry.

    Args:
        fn: path to the CSV file (file,name[,annotations...] per line).
        out_dir: directory where merged outputs will be written.
        config: configuration dict attached to every sample.

    Returns:
        list of single-item lists, one dict per merged sample.

    Raises:
        ValueError: if a file is neither FASTQ nor BAM.
    """
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            # Skip the header line and blank lines (a trailing newline
            # would otherwise raise IndexError on cols[1]).
            if not l.strip() or l.startswith("samplename"):
                continue
            cols = l.strip().split(",")
            samples[cols[1]].append(cols)
    # .items() instead of the Python 2-only .iteritems() so this also
    # runs under Python 3 (works identically on Python 2).
    for sample, items in samples.items():
        # Decide how to merge based on the first file of the group.
        if is_fastq(items[0][0], True):
            merge_fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            merge_fn = "bam_merge"
            ext = ".bam"
        else:
            # Previously this fell through and crashed with a NameError.
            raise ValueError("Unsupported input file type: %s" % items[0][0])
        files = [os.path.abspath(fn_file[0]) for fn_file in items]
        samples[sample] = [{'files': _check_paired(files),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': merge_fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
def _get_samples_to_process(fn, out_dir, config):
    """Parse a CSV file with one line per file.

    All files that share the same description name (second column) are
    merged into a single sample entry; missing files are skipped with a
    log message.

    Args:
        fn: path to the CSV file (file,name[,annotations...] per line).
        out_dir: directory where merged outputs will be written.
        config: configuration dict attached to every sample.

    Returns:
        list of single-item lists, one dict per merged sample.

    Raises:
        ValueError: if a non-empty line has fewer than two columns, or a
            file is neither FASTQ nor BAM.
    """
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            # Skip blank lines (e.g. a trailing newline) instead of
            # raising ValueError for them.
            if not l.strip():
                continue
            cols = l.strip().split(",")
            if len(cols) < 2:
                raise ValueError("Line needs 2 values: file and name.")
            if utils.file_exists(cols[0]):
                samples[cols[1]].append(cols)
            else:
                logger.info("skipping %s, File doesn't exist." % cols[0])
    # .items() instead of the Python 2-only .iteritems() so this also
    # runs under Python 3 (works identically on Python 2).
    for sample, items in samples.items():
        # Decide how to merge based on the first file of the group.
        if is_fastq(items[0][0], True):
            merge_fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            merge_fn = "bam_merge"
            ext = ".bam"
        else:
            # Previously this fell through and crashed with a NameError.
            raise ValueError("Unsupported input file type: %s" % items[0][0])
        files = [os.path.abspath(fn_file[0]) for fn_file in items]
        samples[sample] = [{'files': _check_paired(files),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': merge_fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
def trim_adapters(data):
    """Trim low quality ends and adapter read-through from a sample's
    fastq files.

    Returns the sample's files unchanged when none of them are fastq;
    otherwise delegates to _trim_adapters with a "trimmed" work
    subdirectory.
    """
    input_files = data["files"]
    fastq_inputs = [fname for fname in input_files
                    if fname is not None and is_fastq(fname)]
    if not fastq_inputs:
        return input_files
    logger.info("Trimming low quality ends and read through adapter "
                "sequence from %s." % (", ".join(fastq_inputs)))
    trimmed_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = utils.safe_makedir(trimmed_dir)
    return _trim_adapters(fastq_inputs, out_dir, data)
def _gzip_fastq(in_file):
    """Return a gzipped version of a fastq file, gzipping it if needed.

    Non-fastq or already-gzipped inputs are returned untouched.
    """
    if not fastq.is_fastq(in_file) or utils.is_gzipped(in_file):
        return in_file
    gzipped_file = in_file + ".gz"
    # Reuse an existing .gz next to the input rather than recompressing.
    if file_exists(gzipped_file):
        return gzipped_file
    message = "gzipping {in_file}.".format(in_file=in_file)
    do.run("gzip -c {in_file} > {gzipped_file}".format(**locals()), message)
    return gzipped_file
def _gzip_fastq(in_file):
    """Return a gzipped version of a fastq file, gzipping it if needed.

    Non-fastq, already-gzipped or remote (object store) inputs are
    returned untouched.
    """
    if not fastq.is_fastq(in_file):
        return in_file
    if utils.is_gzipped(in_file):
        return in_file
    if in_file.startswith(utils.SUPPORTED_REMOTES):
        return in_file
    gzipped_file = in_file + ".gz"
    # Reuse an existing .gz next to the input rather than recompressing.
    if file_exists(gzipped_file):
        return gzipped_file
    message = "gzipping {in_file}.".format(in_file=in_file)
    do.run("gzip -c {in_file} > {gzipped_file}".format(**locals()), message)
    return gzipped_file
def _gzip_fastq(in_file):
    """Return a gzipped version of a fastq file, gzipping it if needed.

    Non-fastq, already-gzipped or remote (object store) inputs are
    returned untouched.
    """
    if not fastq.is_fastq(in_file):
        return in_file
    if utils.is_gzipped(in_file):
        return in_file
    if objectstore.is_remote(in_file):
        return in_file
    gzipped_file = in_file + ".gz"
    # Reuse an existing .gz next to the input rather than recompressing.
    if file_exists(gzipped_file):
        return gzipped_file
    message = "gzipping {in_file}.".format(in_file=in_file)
    do.run("gzip -c {in_file} > {gzipped_file}".format(**locals()), message)
    return gzipped_file
def _bzip_gzip(in_file):
    """Convert a bzip2-compressed fastq file to gzip.

    Returns the .gz path for local bzipped fastq input, or the input
    path unchanged for anything else.
    """
    base, first_ext = os.path.splitext(in_file)
    gzipped_file = base + ".gz"
    if (fastq.is_fastq(base) and utils.is_bzipped(in_file)
            and not objectstore.is_remote(in_file)):
        # BUG FIX: the old code had `if not file_exists(gzipped_file):
        # return gzipped_file` before this check, which returned a
        # non-existent path and made the conversion below unreachable.
        # Now we only short-circuit when the .gz already exists.
        if file_exists(gzipped_file):
            return gzipped_file
        message = "gzipping {in_file}.".format(in_file=in_file)
        do.run("bunzip2 -c {in_file} | gzip > {gzipped_file}".format(**locals()),
               message)
        return gzipped_file
    return in_file
def _gzip_fastq(in_file):
    """Return a gzipped version of a fastq file.

    Handles bzip2 inputs by converting them to gzip; remote (object
    store) or non-fastq inputs are returned untouched.
    """
    if not fastq.is_fastq(in_file) or objectstore.is_remote(in_file):
        return in_file
    if utils.is_bzipped(in_file):
        # bz2 inputs are re-compressed as gzip by the helper.
        return _bzip_gzip(in_file)
    if utils.is_gzipped(in_file):
        return in_file
    gzipped_file = in_file + ".gz"
    # Reuse an existing .gz next to the input rather than recompressing.
    if file_exists(gzipped_file):
        return gzipped_file
    message = "gzipping {in_file}.".format(in_file=in_file)
    with file_transaction(gzipped_file) as tx_gzipped_file:
        do.run("gzip -c {in_file} > {tx_gzipped_file}".format(**locals()),
               message)
    return gzipped_file
def _bzip_gzip(in_file):
    """Convert a bzip2-compressed fastq file to gzip.

    Returns the .gz path for local bzipped fastq input, or the input
    path unchanged for anything else.
    """
    if not utils.is_bzipped(in_file):
        return in_file
    base, _ext = os.path.splitext(in_file)
    if not fastq.is_fastq(base) or objectstore.is_remote(in_file):
        return in_file
    gzipped_file = base + ".gz"
    # Reuse a previously converted .gz rather than recompressing.
    if file_exists(gzipped_file):
        return gzipped_file
    message = "gzipping {in_file}.".format(in_file=in_file)
    with file_transaction(gzipped_file) as tx_gzipped_file:
        do.run("bunzip2 -c {in_file} | gzip > {tx_gzipped_file}".format(**locals()),
               message)
    return gzipped_file
def _get_samples_to_process(fn, out_dir, config):
    """Parse a CSV file with one line per file.

    All files that share the same description name (second column) are
    merged into a single sample entry.

    Args:
        fn: path to the CSV file (file,name[,annotations...] per line).
        out_dir: directory where merged outputs will be written.
        config: configuration dict attached to every sample.

    Returns:
        list of single-item lists, one dict per merged sample.

    Raises:
        ValueError: if a file is neither FASTQ nor BAM.
    """
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            # Skip the header line and blank lines (a trailing newline
            # would otherwise raise IndexError on cols[1]).
            if not l.strip() or l.startswith("samplename"):
                continue
            cols = l.strip().split(",")
            samples[cols[1]].append(cols)
    # .items() instead of the Python 2-only .iteritems() so this also
    # runs under Python 3 (works identically on Python 2).
    for sample, items in samples.items():
        # Decide how to merge based on the first file of the group.
        if is_fastq(items[0][0], True):
            merge_fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            merge_fn = "bam_merge"
            ext = ".bam"
        else:
            # Previously this fell through and crashed with a NameError.
            raise ValueError("Unsupported input file type: %s" % items[0][0])
        files = [os.path.abspath(fn_file[0]) for fn_file in items]
        samples[sample] = [{'files': _check_paired(files),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': merge_fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
def _bzip_gzip(in_file, out_dir=None):
    """Convert a bzip2-compressed fastq file to gzip.

    The .gz file is written into ``out_dir`` when given, otherwise next
    to the input. Non-bzipped, non-fastq or remote inputs are returned
    untouched.
    """
    if not utils.is_bzipped(in_file):
        return in_file
    base, _ = os.path.splitext(in_file)
    gzipped_file = (os.path.join(out_dir, os.path.basename(base) + ".gz")
                    if out_dir else base + ".gz")
    if not fastq.is_fastq(base) or objectstore.is_remote(in_file):
        return in_file
    # Reuse a previously converted .gz rather than recompressing.
    if file_exists(gzipped_file):
        return gzipped_file
    message = "gzipping {in_file} to {gzipped_file}.".format(
        in_file=in_file, gzipped_file=gzipped_file)
    with file_transaction(gzipped_file) as tx_gzipped_file:
        do.run("bunzip2 -c {in_file} | gzip > {tx_gzipped_file}".format(**locals()),
               message)
    return gzipped_file
def run_sailfish(data):
    """Quantify transcripts with sailfish for one sample.

    Skips non-fastq inputs. Stores the sailfish output file and
    directory back on the sample via the dd accessors.
    """
    sample_name = dd.get_sample_name(data)
    seq_files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    # Paired-end when exactly two sequence files, otherwise single-end.
    fq1, fq2 = seq_files if len(seq_files) == 2 else (seq_files[0], None)
    if not fastq.is_fastq(fq1):
        return [[data]]
    sailfish_dir = os.path.join(work_dir, "sailfish", sample_name)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    stranded = dd.get_strandedness(data).lower()
    out_file = sailfish(fq1, fq2, sailfish_dir, gtf_file, fasta_file,
                        stranded, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, sailfish_dir)
    return [[data]]
def get_files(target_files, config):
    """Resolve the keys of ``target_files`` to concrete input files.

    Keys may be lists/tuples of paths or ';'-joined path strings. Names
    that do not exist on disk are searched for (as filename prefixes) in
    the directories listed under ``config["inputs"]``, keeping any BAM
    or fastq matches. Fails with an assertion when a name resolves to
    nothing.
    """
    found = []
    for entry in target_files:
        names = entry if isinstance(entry, (list, tuple)) else entry.split(";")
        for name in names:
            if os.path.exists(name):
                found.append(name)
                continue
            matched = False
            for search_dir in config["inputs"]:
                pattern = os.path.join(search_dir, name) + "*"
                for candidate in glob.glob(pattern):
                    # Only keep sequence files that really exist.
                    if ((bam.is_bam(candidate) or fastq.is_fastq(candidate))
                            and os.path.exists(candidate)):
                        found.append(candidate)
                        matched = True
            assert matched, "Did not find files %s in directories %s" % (
                name, config["inputs"])
    return found
def _get_samples_to_process(fn, out_dir, config):
    """Parse a CSV file with one line per file.

    All files that share the same description name (second column) are
    merged into a single sample entry; missing files are skipped with a
    log message.

    Args:
        fn: path to the CSV file (file,name[,annotations...] per line).
        out_dir: directory where merged outputs will be written.
        config: configuration dict attached to every sample.

    Returns:
        list of single-item lists, one dict per merged sample.

    Raises:
        ValueError: if a non-empty line has fewer than two columns, or a
            file is neither FASTQ nor BAM.
    """
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            # Skip blank lines (e.g. a trailing newline) instead of
            # raising ValueError for them.
            if not l.strip():
                continue
            cols = l.strip().split(",")
            if len(cols) < 2:
                raise ValueError("Line needs 2 values: file and name.")
            if utils.file_exists(cols[0]):
                samples[cols[1]].append(cols)
            else:
                logger.info("skipping %s, File doesn't exist." % cols[0])
    # .items() instead of the Python 2-only .iteritems() so this also
    # runs under Python 3 (works identically on Python 2).
    for sample, items in samples.items():
        # Decide how to merge based on the first file of the group.
        if is_fastq(items[0][0], True):
            merge_fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            merge_fn = "bam_merge"
            ext = ".bam"
        else:
            # Previously this fell through and crashed with a NameError.
            raise ValueError("Unsupported input file type: %s" % items[0][0])
        files = [os.path.abspath(fn_file[0]) for fn_file in items]
        samples[sample] = [{'files': _check_paired(files),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': merge_fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]