def call_macs_peaks(exp_bed, back_bed, out_base, ref, config):
    """Call peaks with MACS (``macs14``), skipping the run if output exists.

    Args:
        exp_bed: BED file of experimental reads, passed to MACS as ``-t``.
        back_bed: BED file of background reads, passed to MACS as ``-c``.
        out_base: Base name for the outputs; spaces become underscores.
        ref: Reference path. Its basename becomes part of the output name and
            the genome size is derived from it through ``_get_ref_fai`` and
            ``_size_from_fai``.
        config: Mapping providing ``peak_dir`` and an ``algorithm`` mapping
            with the optional keys ``subpeaks``, ``shiftsize`` and
            ``largelambda``.

    Returns:
        Path of the expected peak file inside ``config["peak_dir"]``
        (``*_peaks.subpeaks.bed`` when subpeaks are requested, otherwise
        ``*_peaks.xls``).

    Raises:
        subprocess.CalledProcessError: If ``macs14`` exits with an error.
    """
    peak_dir = config["peak_dir"]
    subpeaks = config["algorithm"].get("subpeaks", False)
    shiftsize = config["algorithm"].get("shiftsize", None)
    largelambda = config["algorithm"].get("largelambda", None)
    if not os.path.exists(peak_dir):
        os.makedirs(peak_dir)
    genome_size = _size_from_fai(_get_ref_fai(ref))
    out_name = os.path.join(peak_dir, "%s-%s-macs" %
                            (out_base.replace(" ", "_"), os.path.basename(ref)))
    ext = "peaks.subpeaks.bed" if subpeaks else "peaks.xls"
    peak_file = "%s_%s" % (out_name, ext)
    # An existing peak file is treated as a finished run and is not redone.
    if not os.path.exists(peak_file):
        cl = ["macs14", "-t", _full_path(exp_bed), "-c", _full_path(back_bed),
              "--name=%s" % os.path.basename(out_name),
              "--format=BED", "--gsize=%s" % genome_size]
        if largelambda:
            cl += ["--llocal", str(largelambda)]
        if shiftsize:
            cl += ["--nomodel", "--shiftsize=%s" % shiftsize]
        if subpeaks:
            cl += ["--call-subpeaks", "--wig"]
        # MACS is given only the basename via --name, so run it from the
        # output directory.
        with chdir(os.path.dirname(out_name)):
            # Parenthesised single-argument form prints the same text on
            # Python 2 and is valid syntax on Python 3.
            print(" ".join(cl))
            subprocess.check_call(cl)
    return peak_file
def split_by_barcode(fastq1, fastq2, multiplex, base_name, config):
    """Demultiplex fastq input into one file set per barcode.

    Without multiplex information the inputs are passed through untouched as
    a single ``("", "", fastq1, fastq2)`` entry. Otherwise the configured
    barcode program is run inside ``<base_name>_barcode`` (unless the
    unmatched-reads file is already there) and the per-barcode file names
    are returned.

    Returns:
        List of ``(barcode_id, name, fastq1_file, fastq2_file)`` tuples;
        ``fastq2_file`` is ``None`` when no second fastq was supplied.
    """
    if not multiplex:
        return [("", "", fastq1, fastq2)]

    barcode_dir = "%s_barcode" % base_name
    unmatched = "%s_1_unmatched_fastq.txt" % base_name
    with utils.chdir(barcode_dir):
        tags = _make_tag_file(multiplex)
        cmd = [config["program"]["barcode"], tags,
               "%s_--b--_--r--_fastq.txt" % base_name, fastq1]
        if fastq2:
            cmd.append(fastq2)
        cmd.append("--mismatch=%s" % config["algorithm"]["bc_mismatch"])
        if int(config["algorithm"]["bc_read"]) == 2:
            cmd.append("--second")
        if int(config["algorithm"]["bc_position"]) == 5:
            cmd.append("--five")
        # The unmatched file doubles as the "already split" marker.
        if not os.path.exists(unmatched):
            subprocess.check_call(cmd)

    results = []
    for item in multiplex:
        barcode = item["barcode_id"]
        read1 = os.path.join(
            barcode_dir, "%s_%s_%s_fastq.txt" % (base_name, barcode, "1"))
        if fastq2:
            read2 = os.path.join(
                barcode_dir, "%s_%s_%s_fastq.txt" % (base_name, barcode, "2"))
        else:
            read2 = None
        results.append((barcode, item["name"], read1, read2))
    return results
def _make_blastdb(self, seq):
    """Build a nucleotide BLAST database from a single sequence.

    The upper-cased sequence is written as FASTA record ``ref`` to
    ``ref.fa`` inside ``self._tmpdir``; ``makeblastdb`` is then run there
    with its standard output discarded.

    Args:
        seq: Sequence to index; must support ``.upper()``.

    Returns:
        ``ref.fa`` joined onto ``self._tmpdir`` (used as both the FASTA file
        name and the database name).

    Raises:
        subprocess.CalledProcessError: If ``makeblastdb`` fails.
    """
    out_file = "ref.fa"
    with chdir(self._tmpdir):
        with open(out_file, "w") as out_handle:
            out_handle.write(">ref\n%s\n" % seq.upper())
        cl = ["makeblastdb", "-in", out_file, "-dbtype", "nucl",
              "-out", out_file]
        # os.devnull is the platform's null device; a hard-coded "/dev/null"
        # only exists on POSIX systems.
        with open(os.devnull, "w") as stdout:
            subprocess.check_call(cl, stdout=stdout)
    return os.path.join(self._tmpdir, out_file)
def make_search_db(seq_recs, ids, target_org, tmp_dir):
    """Create a protein BLAST database from a subset of sequence records.

    The records ``seq_recs[i]`` for each ``i`` in ``ids`` are written as
    FASTA to ``<target_org>-db.fa`` (spaces replaced by underscores) inside
    ``tmp_dir``, and ``makeblastdb`` is run on that file.

    Returns:
        The database name (FASTA file name without its extension) joined
        onto ``tmp_dir``.
    """
    fasta_name = "%s-db.fa" % target_org.replace(" ", "_")
    db_prefix = os.path.splitext(fasta_name)[0]
    with chdir(tmp_dir):
        with open(fasta_name, "w") as fasta_handle:
            selected = (seq_recs[rec_id] for rec_id in ids)
            SeqIO.write(selected, fasta_handle, "fasta")
        subprocess.check_call(["makeblastdb",
                               "-in", fasta_name,
                               "-dbtype", "prot",
                               "-out", db_prefix,
                               "-title", target_org])
    return os.path.join(tmp_dir, db_prefix)
def make_search_db(seq_recs, ids, target_org, tmp_dir):
    """Write selected records to FASTA and index them with ``makeblastdb``.

    Args:
        seq_recs: Indexable collection of sequence records.
        ids: Keys into ``seq_recs`` choosing the records to include.
        target_org: Organism name; used as the database title and, with
            spaces turned into underscores, as the file name stem.
        tmp_dir: Directory in which the FASTA file and database are built.

    Returns:
        ``tmp_dir`` joined with the database name.
    """
    org_stem = target_org.replace(" ", "_")
    fasta_file = "%s-db.fa" % org_stem
    database, _unused_ext = os.path.splitext(fasta_file)
    with chdir(tmp_dir):
        with open(fasta_file, "w") as handle:
            SeqIO.write((seq_recs[key] for key in ids), handle, "fasta")
        command = ["makeblastdb"]
        command += ["-in", fasta_file]
        command += ["-dbtype", "prot"]
        command += ["-out", database]
        command += ["-title", target_org]
        subprocess.check_call(command)
    return os.path.join(tmp_dir, database)
def _generate_fastq(fc_dir):
    """Create fastq files for a flowcell directory when they are missing.

    Nothing is run if the fastq directory equals ``fc_dir`` or already
    exists. Otherwise ``solexa_qseq_to_fastq.py`` is invoked from the parent
    of the fastq directory for every lane found among the ``*qseq.txt``
    files there.

    Returns:
        The fastq directory reported by ``get_fastq_dir``.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_label = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    if fastq_dir == fc_dir or os.path.exists(fastq_dir):
        return fastq_dir

    with utils.chdir(os.path.dirname(fastq_dir)):
        # The lane is the second underscore-separated field of a qseq name.
        lane_ids = set()
        for qseq_name in glob.glob("*qseq.txt"):
            lane_ids.add(qseq_name.split("_")[1])
        subprocess.check_call(["solexa_qseq_to_fastq.py", run_label,
                               ",".join(sorted(lane_ids))])
    return fastq_dir
def _files_to_copy(directory):
    """Retrieve files that should be remotely copied.

    All glob patterns are evaluated inside ``utils.chdir(directory)``, so
    the returned entries are the relative paths as matched there. The fixed
    entries (``RunInfo.xml``, the plots/reports directories and the fastq
    directory) are listed whether or not they exist.

    Args:
        directory: Directory in which the patterns are evaluated.

    Returns:
        A 2-tuple of sorted lists: first the image/parameter files plus the
        qseq-related files, then the report files plus the fastq directory.
    """
    with utils.chdir(directory):
        # Plain list concatenation instead of reduce(operator.add, ...):
        # same result, and `reduce` is not a builtin on Python 3.
        image_redo_files = (glob.glob("*.params") +
                            glob.glob("Images/L*/C*") +
                            ["RunInfo.xml"])
        qseqs = (glob.glob("Data/Intensities/*.xml") +
                 glob.glob("Data/Intensities/BaseCalls/*qseq.txt"))
        reports = (glob.glob("Data/Intensities/BaseCalls/*.xml") +
                   glob.glob("Data/Intensities/BaseCalls/*.xsl") +
                   glob.glob("Data/Intensities/BaseCalls/*.htm") +
                   ["Data/Intensities/BaseCalls/Plots", "Data/reports"])
        fastq = ["Data/Intensities/BaseCalls/fastq"]
    return sorted(image_redo_files + qseqs), sorted(reports + fastq)
def _generate_fastq(fc_dir, config):
    """Create fastq files for a flowcell, generating qseq files first.

    Nothing is run if the fastq directory equals ``fc_dir`` or already
    exists. Otherwise ``_generate_qseq`` is called on the parent of the
    fastq directory, after which ``solexa_qseq_to_fastq.py`` converts every
    lane found among the ``*qseq.txt`` files there.

    Returns:
        The fastq directory reported by ``get_fastq_dir``.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_label = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.dirname(fastq_dir)
    if fastq_dir == fc_dir or os.path.exists(fastq_dir):
        return fastq_dir

    log.info("Generating fastq files for %s" % fc_dir)
    _generate_qseq(basecall_dir, config)
    log.info("Qseq files generated.")
    with utils.chdir(basecall_dir):
        # The lane is the second underscore-separated field of a qseq name.
        lane_ids = set(name.split("_")[1] for name in glob.glob("*qseq.txt"))
        convert_cmd = ["solexa_qseq_to_fastq.py", run_label,
                       ",".join(sorted(lane_ids))]
        log.info("Converting qseq to fastq on all lanes.")
        subprocess.check_call(convert_cmd)
        log.info("Qseq to fastq conversion completed.")
    return fastq_dir
def _generate_qseq(bc_dir, config):
    """Produce qseq files from Illumina bcl files when none are present.

    Newer Illumina software no longer writes qseq files; they can be rebuilt
    from the bcl, intensity and filter files using the offline base caller
    (OLB). If ``bc_dir`` already holds ``*qseq.txt`` files nothing is done.
    """
    existing = glob.glob(os.path.join(bc_dir, "*qseq.txt"))
    if existing:
        return

    log.info("Generating qseq files at %s" % bc_dir)
    setup_script = os.path.join(config["program"]["olb"], "bin",
                                "setupBclToQseq.py")
    subprocess.check_call([setup_script,
                           "-i", bc_dir,
                           "-o", bc_dir,
                           "-p", os.path.split(bc_dir)[0],
                           "--in-place", "--overwrite"])
    with utils.chdir(bc_dir):
        # Fall back to 8 parallel jobs when no core count is configured.
        try:
            num_jobs = config["algorithm"]["num_cores"]
        except KeyError:
            num_jobs = 8
        make_cmd = config["program"].get("olb_make", "make").split()
        make_cmd += ["-j", str(num_jobs)]
        subprocess.check_call(make_cmd)