def _download_ucsc_genome_and_index(build="hg19", chr="chr11", start=0, end=2000000):
    """Download a chromosome from UCSC, extract a region and index it.

    The gzipped fasta for ``chr`` is fetched with curl into the ``seq``
    directory of the build (unless it is already there), the slice
    ``[start:end]`` of the sequence is written as an uncompressed fasta file
    next to it (unless that file already exists), and the resulting fasta is
    indexed with ``_index_bwa`` and ``_index_bowtie``. One line per index is
    appended to the ``sam``, ``bwa`` and ``bowtie`` buffers in ``index_files``.

    :param build: genome build; used in the UCSC url and as key into ``genomes``
    :param chr: chromosome file name stem at UCSC, e.g. "chr11"
    :param start: slice start applied to the downloaded sequence
    :param end: slice end applied to the downloaded sequence
    """
    ucsc = "http://hgdownload.cse.ucsc.edu/goldenPath/{}/chromosomes/".format(build)
    url = os.path.join(ucsc, "{}.fa.gz".format(chr))
    genomedir = os.path.join(GENOMES, genomes[build]['species'], build, "seq")
    gzfile = os.path.join(genomedir, os.path.basename(url))
    fasta = os.path.join(genomedir, os.path.basename(url).replace(".gz", ""))
    if not os.path.exists(genomedir):
        safe_makedir(genomedir)

    LOG.info("Downloading {} from {} with curl".format(gzfile, url))
    if not os.path.exists(gzfile):
        try:
            subprocess.check_call(["curl", url, "-o", gzfile])
        except (subprocess.CalledProcessError, OSError) as e:
            ## Best effort: report the failure instead of hiding it; the
            ## extraction step below fails if the file is really missing
            LOG.warning("Download of {} failed: {}".format(url, e))

    if not os.path.exists(fasta):
        ## try/finally rather than 'with': GzipFile is not a context manager
        ## on older python 2 releases
        gzh = gzip.open(gzfile, "r")
        try:
            rec = SeqIO.read(gzh, "fasta")
        finally:
            gzh.close()
        with open(fasta, "w") as outh:
            SeqIO.write(SeqRecord(rec.seq[start:end], rec.id, '', ''), outh, "fasta")

    outfile = _index_bwa(fasta)
    index_files['sam']['data'].write("index\t{}\t{}\n".format(build, fasta))
    index_files['bwa']['data'].write("{}\t{}\t{}\t{}\n".format(build, build, genomes[build]['label'], outfile))
    outfile = _index_bowtie(fasta)
    index_files['bowtie']['data'].write("{}\t{}\t{}\t{}\n".format(build, build, genomes[build]['label'], outfile))
def _install_dbsnp_file(build="hg19"):
    """Download a (large) dbsnp file and extract a region from chr 11.

    The gzipped vcf is fetched with curl unless it is already present in the
    ``variation`` directory of the build. Header lines (starting with "#") are
    copied as they are. Records on chromosome "11" with a position below
    2000000 are written with the chromosome renamed to "chr11"; reading stops
    at the first chromosome 11 record at or beyond that position.

    :param build: genome build; key into ``genomes``
    :returns: path to the extracted vcf file
    """
    variationdir = os.path.join(GENOMES, genomes[build]['species'], build, "variation")
    url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/dbsnp132_20101103.vcf.gz"
    fn = os.path.join(variationdir, "dbsnp132.vcf.gz")
    dbsnp = os.path.join(variationdir, "dbsnp132_chr11.vcf")
    if not os.path.exists(variationdir):
        safe_makedir(variationdir)

    LOG.info("Downloading {} from {} with curl".format(fn, url))
    if not os.path.exists(fn):
        try:
            subprocess.check_call(["curl", url, "-o", fn])
        except (subprocess.CalledProcessError, OSError) as e:
            ## Best effort: report the failure instead of hiding it
            LOG.warning("Download of {} failed: {}".format(url, e))

    ## Nothing to extract: do not open the large gzip file at all
    if os.path.exists(dbsnp):
        return dbsnp

    fh = gzip.open(fn, "r")
    try:
        with open(dbsnp, "w") as of:
            for r in fh:
                if r.startswith("#"):
                    of.write(r)
                    continue
                vals = r.split()
                ## Blank lines have no fields; skip them like other chromosomes
                if not vals or vals[0] != "11":
                    continue
                if int(vals[1]) >= 2000000:
                    break
                vals[0] = "chr{}".format(vals[0])
                of.write("\t".join(vals))
                of.write("\n")
    finally:
        fh.close()
    return dbsnp
def _install_dbsnp_entrez(build="hg19"):
    """Install a subset of snps using Entrez queries.

    If the target vcf does not exist yet, dbsnp is searched for snps on
    chromosome 11 with a position between 1 and 2000000. The hits are fetched
    in flat-file format in batches of ``delta`` ids, each record is converted
    with ``_dbsnp_line``, and the converted records are written, sorted on
    their second whitespace-separated column, after ``dbsnp_header`` and
    ``vcfheader``. If the Entrez query fails a warning is logged and the file
    is written with whatever records were collected up to that point.

    :param build: genome build; key into ``genomes``
    :returns: path to the vcf file
    """
    LOG.info("Installing dbsnp file for {}".format(genomes[build]['species']))
    variationdir = os.path.join(GENOMES, genomes[build]['species'], build, "variation")
    if not os.path.exists(variationdir):
        safe_makedir(variationdir)
    fn = os.path.join(variationdir, "dbsnp132_chr11.vcf")
    if os.path.exists(fn):
        return fn

    ## Defined before the try block so that the write step below never hits
    ## an unbound name when the search itself fails
    records = []
    try:
        # http://www.ncbi.nlm.nih.gov/books/NBK44454/#Search.how_do_i_search_dbsnp_for_the_tot
        ## This will actually only download a subset of snps
        ## NOTE(review): the organism read "H**o sapiens" here, which looks
        ## like a mangled "Homo sapiens"; restored on that assumption
        handle = Entrez.esearch(db="snp", retmax=8000, term="\"Homo sapiens\"[Organism] AND (11[CHR] AND (1[CHRPOS] : 2000000[CHRPOS])")
        record = Entrez.read(handle)
        ## For some reason the first entries are more or less empty
        start = 4000
        delta = 200
        for i in xrange(start, len(record['IdList']), delta):
            LOG.info("retrieving dbsnp records {} - {}".format(i, i+delta))
            h = Entrez.efetch(db="snp", id=record['IdList'][i:i+delta], rettype="flt", retmax=delta)
            lbuffer = None
            lines = []
            while True:
                ## A line that started a new record is held back in lbuffer
                ## and re-read here once the previous record has been flushed
                if lbuffer:
                    line = lbuffer
                    lbuffer = None
                else:
                    line = h.readline()
                ## A record ends where the next "rs" line begins, or after
                ## more than 20 collected lines. The latter is also what ends
                ## the loop: at end of input readline() keeps returning "",
                ## and the loop stops when _dbsnp_line returns None
                if (line.startswith("rs") or len(lines) > 20) and lines:
                    lbuffer = line
                    rec = _dbsnp_line(lines)
                    if rec is None:
                        break
                    records.append(rec)
                    lines = []
                else:
                    lines.append(line.rstrip())
    except Exception as e:
        ## Best effort: keep whatever was collected before the failure
        LOG.warning("Entrez query failed: {}".format(e))

    LOG.info("Writing file {}".format(fn))
    with open(fn, "w") as fh:
        fh.write(dbsnp_header)
        ## Header must be tab-separated, otherwise GATK complains...
        fh.write(vcfheader)
        for rec in sorted(records, key=lambda x: int(x.split()[1])):
            fh.write(rec)
            fh.write("\n")
    return fn
def _index_bowtie2(fn, label="bowtie2"):
    """Index a fasta file with bowtie2-build.

    The index is built in a directory named ``label`` that is a sibling of
    the directory holding ``fn``. ``fn`` is symlinked into that directory and
    the index is built from the link. Nothing is run if
    ``<name without extension>.1.bt2`` already exists there.

    :param fn: path to the fasta file
    :param label: name of the index directory
    :returns: index prefix, i.e. the path of the link without its extension
    :raises subprocess.CalledProcessError: if bowtie2-build fails
    """
    outdir = os.path.join(os.path.dirname(fn), os.pardir, label)
    link = os.path.join(outdir, os.path.basename(fn))
    prefix = os.path.splitext(link)[0]
    if not os.path.exists(outdir):
        safe_makedir(outdir)
    ## Create the link whenever it is missing, not only when outdir is new;
    ## otherwise an existing but empty outdir leaves bowtie2-build without input
    if not os.path.lexists(link):
        os.symlink(fn, link)
    if os.path.exists("{}.1.bt2".format(prefix)):
        return prefix
    cl = ["bowtie2-build", os.path.abspath(link), os.path.splitext(os.path.abspath(link))[0]]
    subprocess.check_call(cl)
    return prefix
def _index_bowtie(fn, label="bowtie"):
    """Index a fasta file with bowtie-build.

    The index is built in a directory named ``label`` that is a sibling of
    the directory holding ``fn``. ``fn`` is symlinked into that directory and
    the index is built from the link. Nothing is run if
    ``<name without extension>.1.ebwt`` already exists there.

    :param fn: path to the fasta file
    :param label: name of the index directory
    :returns: index prefix, i.e. the path of the link without its extension
    :raises subprocess.CalledProcessError: if bowtie-build fails
    """
    LOG.info("Indexing {} with bowtie".format(fn))
    outdir = os.path.join(os.path.dirname(fn), os.pardir, label)
    link = os.path.join(outdir, os.path.basename(fn))
    prefix = os.path.splitext(link)[0]
    if not os.path.exists(outdir):
        safe_makedir(outdir)
    ## Create the link whenever it is missing, not only when outdir is new;
    ## otherwise an existing but empty outdir leaves bowtie-build without input
    if not os.path.lexists(link):
        os.symlink(fn, link)
    if os.path.exists("{}.1.ebwt".format(prefix)):
        LOG.info("{} exists; not doing anything".format(fn))
        return prefix
    cl = ["bowtie-build", os.path.abspath(link), os.path.splitext(os.path.abspath(link))[0]]
    subprocess.check_call(cl)
    LOG.info("Finished indexing {} with bowtie".format(fn))
    return prefix
def _index_bwa(fn, label="bwa"):
    """Index a fasta file with bwa.

    The index is built in a directory named ``label`` that is a sibling of
    the directory holding ``fn``. ``fn`` is symlinked into that directory and
    ``bwa index`` is run on the link. Nothing is run if ``<name>.amb``
    already exists there.

    :param fn: path to the fasta file
    :param label: name of the index directory
    :returns: path of the link inside the index directory
    :raises subprocess.CalledProcessError: if bwa fails
    """
    LOG.info("Indexing {} with bwa".format(fn))
    outdir = os.path.join(os.path.dirname(fn), os.pardir, label)
    link = os.path.join(outdir, os.path.basename(fn))
    if not os.path.exists(outdir):
        safe_makedir(outdir)
    ## Create the link whenever it is missing, not only when outdir is new;
    ## otherwise an existing but empty outdir leaves bwa without input
    if not os.path.lexists(link):
        os.symlink(fn, link)
    if os.path.exists("{}.amb".format(link)):
        LOG.info("{} exists; not doing anything".format(fn))
        return link
    cl = ["bwa", "index", os.path.abspath(link)]
    subprocess.check_call(cl)
    LOG.info("Finished indexing {} with bwa".format(fn))
    return link
def _install_1000g_test_files(data_dir):
    """Fetch 1000 genomes exome reads and turn them into test flowcells.

    An index of recent sequencing runs is available at
    ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/sequence_indices/20120522.sequence.index
    To select an individual based on sequencing platform, load it into R::

        df <- read.table("20120522.sequence.index", header=TRUE, sep="\t", fill=TRUE, as.is=TRUE)
        df$INDIVIDUAL = gsub("/sequence.*", "", gsub("data/", "", df$FASTQ_FILE))
        tapply(df$INSTRUMENT_MODEL, df$INDIVIDUAL, function(x) {levels(as.factor(x))})

    Individual NA21137 was (arbitrarily) chosen; sequencing was done at the
    BROAD institute on an Illumina HiSeq 2000.

    What the code does: the leading byte range ``0-CURLFILESIZE`` of the
    chromosome 11 exome bam file is downloaded with curl into ``tmpdir``
    (skipped if the small bam file already exists), converted to fastq with
    ``_bam_to_fastq``, paired with ``_pair_fastq_files``, and finally
    distributed over the two test flowcells with
    ``_make_casava_archive_files``.

    :param data_dir: not used by the body; all files are placed in ``tmpdir``
    """
    individual = "NA21137"
    base_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data/{}".format(individual)
    bam_name = "{}.chrom11.ILLUMINA.bwa.GIH.exome.20111114.bam".format(individual)
    bam_url = os.path.join(base_url, "exome_alignment", bam_name)
    if not os.path.exists(tmpdir):
        safe_makedir(tmpdir)
    bamfile = os.path.join(tmpdir, os.path.basename(bam_url))
    ## The check ought to be made on the pipeline input files, downloading the
    ## bam file and generating fastq files only when those are missing
    ## FIXME: checks should be done on output from _pair_fastq_files
    smallbamfile = bamfile.replace(".bam", ".small.bam")
    if not os.path.exists(smallbamfile):
        LOG.info("downloading {} from {}".format(bamfile, base_url))
        byte_range = "0-{}".format(CURLFILESIZE)
        subprocess.check_call(["curl", bam_url, "-o", smallbamfile, "-r", byte_range])
        LOG.info("finished creating {}".format(smallbamfile))

    _bam_to_fastq(smallbamfile, os.path.join(tmpdir, "reads"))
    read1 = os.path.join(tmpdir, "reads_1.fq")
    read2 = os.path.join(tmpdir, "reads_2.fq")
    _pair_fastq_files(read1, read2, os.path.join(tmpdir, "seqs"))
    ## FIXME: startiter doesn't work, so both flowcells get identical files
    for ssname in ("C003CCCXX", "B002BBBXX"):
        _make_casava_archive_files(FLOWCELL[ssname], ssname, os.path.join(tmpdir, "seqs"))
def setUpClass(self):
    """Create the test directory tree and touch two files in every directory."""
    dirs = [
        "data",
        "data/alignments",
        "data/nophix",
        "data/fastqc",
        "data/fastqc/nophix",
        "data/nophix/fastqc",
    ]
    for d in dirs:
        safe_makedir(d)
    ## file1.txt is touched in all directories before file2.txt
    for fname in ("file1.txt", "file2.txt"):
        for d in dirs:
            subprocess.check_call(["touch", os.path.join(d, fname)])
def _install_phix():
    """Install the phix reference sequence and index it with bwa and bowtie.

    The sequence (nucleotide id 9626372) is fetched as fasta text via Entrez
    unless ``phix.fa`` already exists in the ``seq`` directory of the build.
    The fasta file is then indexed with ``_index_bwa`` and ``_index_bowtie``
    and one line per index is appended to the ``sam``, ``bwa`` and ``bowtie``
    buffers in ``index_files``.

    If the fetch fails a warning is logged and no fasta file is left behind,
    so that a later run retries the download.
    """
    LOG.info("Installing phix")
    build = "phix"
    genomedir = os.path.join(GENOMES, genomes[build]['species'], build, "seq")
    fn = os.path.join(genomedir, "phix.fa")
    if not os.path.exists(genomedir):
        LOG.info("Creating {}".format(genomedir))
        safe_makedir(genomedir)
    if not os.path.exists(fn):
        try:
            LOG.info("Opening file {}".format(fn))
            ## Fetch before opening the output file: a failed fetch must not
            ## leave an empty phix.fa that later runs mistake for the genome
            handle = Entrez.efetch(db="nucleotide", id="9626372", rettype="fasta", retmode="text")
            rec = "".join(handle.readlines())
            with open(fn, "w") as fh:
                fh.write(rec)
        except Exception as e:
            ## Best effort: report the failure instead of hiding it
            LOG.warning("Could not install {}: {}".format(fn, e))
    outfile = _index_bwa(fn, label="bwa")
    index_files['sam']['data'].write("index\t{}\t{}\n".format(build, fn))
    index_files['bwa']['data'].write("{}\t{}\t{}\t{}\n".format(build, build, genomes[build]['label'], outfile))
    outfile = _index_bowtie(fn, label="bowtie")
    index_files['bowtie']['data'].write("{}\t{}\t{}\t{}\n".format(build, build, genomes[build]['label'], outfile))
def _make_casava_archive_files(fc, ssname, prefix, startiter = 1, nseqout=1000):
    """Create a casava-like flowcell directory in the archive.

    Writes the samplesheet, ``RunInfo.xml`` and ``runParameters.xml`` to the
    flowcell directory, creates the ``Unaligned/Basecall_Stats_<ssname>``
    directories, and for every samplesheet row creates
    ``Unaligned/Project_<col 5>/Sample_<col 2>`` holding a one-row
    ``SampleSheet.csv``. Finally the reads in ``<prefix>_1.fastq`` and
    ``<prefix>_2.fastq`` are distributed over the per-sample fastq files with
    ``_write_sample_fastq``.

    If the read 1 file of any sample already exists the function returns
    right away without writing sequences.

    :param fc: flowcell directory name, relative to ``ARCHIVE``
    :param ssname: key into ``SAMPLESHEETS``; also names the samplesheet file
    :param prefix: path prefix of the input fastq files
    :param startiter: passed on to ``_write_sample_fastq``
    :param nseqout: passed on to ``_write_sample_fastq``
    """
    fc_dir = os.path.join(ARCHIVE, fc)
    if not os.path.exists(fc_dir):
        safe_makedir(fc_dir)
    with open(os.path.join(fc_dir, "{}.csv".format(ssname)), "w") as fh:
        fh.write(SAMPLESHEETS[ssname])
    ## Both xml templates are rendered from the same values
    run_context = {'flowcell':os.path.basename(fc), 'fc_id':fc_id(fc), 'date':fc_parts(fc)[0], 'instrument':fc.split("_")[1]}
    with open(os.path.join(fc_dir, "RunInfo.xml"), "w") as fh:
        fh.write(RUNINFO.render(**run_context))
    with open(os.path.join(fc_dir, "runParameters.xml"), "w") as fh:
        fh.write(RUNPARAMETERS.render(**run_context))

    outf1 = []
    outf2 = []
    basecall_stats_dir = os.path.join(fc_dir, "Unaligned", "Basecall_Stats_{}".format(ssname))
    if not os.path.exists(basecall_stats_dir):
        safe_makedir(basecall_stats_dir)
    for d in [os.path.join(basecall_stats_dir, x) for x in ["css", "Plots"]]:
        if not os.path.exists(d):
            safe_makedir(d)

    for row in SAMPLESHEETS[ssname].split("\n"):
        ## str.split(",") never returns an empty list, so blank rows (e.g.
        ## after a trailing newline) have to be detected on the row itself
        if not row.strip():
            continue
        vals = row.split(",")
        if vals[0] == "FCID":
            ## The header row has to precede the sample rows; it is repeated
            ## in every per-sample samplesheet
            header = row
            continue
        outdir = os.path.join(fc_dir, "Unaligned", "Project_{}".format(vals[5]), "Sample_{}".format(vals[2]))
        if not os.path.exists(outdir):
            safe_makedir(outdir)
        with open(os.path.join(outdir, "SampleSheet.csv"), "w") as fh:
            LOG.info("Writing to {}".format(os.path.join(outdir, "SampleSheet.csv")))
            fh.write("{}\n".format(header))
            fh.write("{}\n".format(row))
        r1 = os.path.join(outdir, "{}_{}_L00{}_R1_001.fastq.gz".format(vals[2], vals[4], vals[1]))
        r2 = os.path.join(outdir, "{}_{}_L00{}_R2_001.fastq.gz".format(vals[2], vals[4], vals[1]))
        if os.path.exists(r1):
            LOG.info("{} already exists: if you want to rerun file generation remove {}".format(r1, r1))
            return
        outf1.append(r1)
        outf2.append(r2)

    ## Write sequences
    with open("{}_1.fastq".format(prefix), "r") as fh:
        _write_sample_fastq(fh, outf1, startiter=startiter, nseqout=nseqout)
    with open("{}_2.fastq".format(prefix), "r") as fh:
        _write_sample_fastq(fh, outf2, startiter=startiter, nseqout=nseqout)
def setUpClass(self):
    """Create the test directory tree and touch two files in every directory."""
    dirs = [
        "data",
        "data/alignments",
        "data/nophix",
        "data/fastqc",
        "data/fastqc/nophix",
        "data/nophix/fastqc",
    ]
    for d in dirs:
        safe_makedir(d)
    ## file1.txt is touched in all directories before file2.txt
    for fname in ("file1.txt", "file2.txt"):
        for d in dirs:
            subprocess.check_call(["touch", os.path.join(d, fname)])
def setUpModule():
    """Set up test files for scilifelab pipeline tests.

    The setup covers some typical situations, such as multiplexing,
    samples run on several flowcells, and same sample being run on
    several lanes in one flowcell.

    In short, the setup

    - downloads data from 1000 genomes (exome data from chr11, 0-2Mb)
    - generates fastq files in an archive folder
    - installs genome references (phix, hg19)
    - downloads dbsnp data for chr11, 0-2Mb
    - runs run_bcbb_pipeline.py -s to install fastq files to production folder
    - runs automated_initial_analysis.py

    Nothing is done if the number of files matching "14_write_metrics.txt"
    found below the project directories already equals NSAMPLES.
    """
    pattern = "14_write_metrics.txt"
    ## filter_fn looks up 'pattern' when it is called, not when it is defined:
    ## the reassignment of 'pattern' further down changes what it matches
    def filter_fn(f):
        return re.search(pattern, f) != None

    ## Count the finished samples over all projects
    n = sum([len(filtered_walk(os.path.join(PROJECTDIR, x), filter_fn)) for x in PROJECTS])
    if n == NSAMPLES:
        LOG.info("All samples have been run, requirements for downstream tests satisfied")
        return
    LOG.info("Running setUpModule")
    _check_requirements()
    ## Add function to check existence of output files
    _install_1000g_test_files(os.path.join(os.path.dirname(__file__), "data", "production"))
    _install_phix()
    dbsnp = _install_dbsnp_entrez()
    (omni_out, hapmap_out, mills_out) = _install_training_data()
    _download_ucsc_genome_and_index()
    ## Install post_process file
    fh = open(POSTPROCESS, "w")
    fh.write(PPTEMPLATE.render(**{'store_dir':ARCHIVE, 'base_dir':PRODUCTION, 'dbsnp':dbsnp, 'omni':omni_out, 'hapmap':hapmap_out, 'mills':mills_out}))
    fh.close()
    ## Install index files
    ## Each entry holds a target path ('file') and a buffer ('data') that the
    ## install functions above have written their lines to
    for k, v in index_files.iteritems():
        if not os.path.exists(os.path.dirname(v['file'])):
            safe_makedir(os.path.dirname(v['file']))
        fh = open(v['file'], "w")
        fh.write(v['data'].getvalue())
        fh.close()
    ## Make production dir
    if not os.path.exists(PRODUCTION):
        safe_makedir(PRODUCTION)
    ## Install files in production with run_bcbb_pipeline.py
    for k in FLOWCELL.keys():
        ## A flowcell is installed as soon as one of its samples lacks a
        ## read 1 or read 2 file in the production directory
        install = False
        for ss in SAMPLESHEETS[k].split("\n"):
            vals = ss.split(",")
            ## Skip the samplesheet header row
            if vals[0]=="FCID":
                continue
            outdir = os.path.join(PRODUCTION, "{}".format(vals[5].replace("__", ".")), "{}".format(vals[2]),
                                  "{}_{}".format(FLOWCELL[k].split("_")[0],FLOWCELL[k].split("_")[-1]))
            r1 = os.path.join(outdir, "{}_{}_L00{}_R1_001.fastq.gz".format(vals[2], vals[4], vals[1]))
            r2 = os.path.join(outdir, "{}_{}_L00{}_R2_001.fastq.gz".format(vals[2], vals[4], vals[1]))
            LOG.info("Looking for {} and {}".format(r1, r2))
            if not os.path.exists(r1) or not os.path.exists(r2):
                install = True
                break
        if install:
            LOG.info("Installing files with run_bcbb_pipeline.py for flowcell {}".format(k))
            cl = ["run_bcbb_pipeline.py", "-s", "-g", POSTPROCESS, os.path.join(ARCHIVE, FLOWCELL[k])]
            subprocess.check_call(cl)
        else:
            LOG.info("All files present; not running run_bcbb_pipeline.py")
    ## Run pipeline on samples
    ## From here on filter_fn selects the bcbb configuration files
    pattern = "-bcbb-config.yaml$"
    yamlfiles = []
    ## http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
    ## [item for sublist in l for item in sublist]
    yamlfiles = [item for sublist in [filtered_walk(os.path.join(PROJECTDIR, x), filter_fn) for x in PROJECTS] for item in sublist]
    orig_dir = os.path.abspath(os.curdir)
    for yamlconfig in yamlfiles:
        try:
            ## NOTE(review): this first message reports the directory being
            ## left, not the one being entered
            LOG.info("cding to {}".format(os.path.abspath(os.curdir)))
            os.chdir(os.path.dirname(yamlconfig))
            LOG.info("cding to {}".format(os.path.dirname(yamlconfig)))
            cl = ["automated_initial_analysis.py", POSTPROCESS, os.path.join(os.path.pardir, os.path.basename(os.path.dirname(yamlconfig))), yamlconfig]
            ## The pipeline is only run where the metrics file is missing
            if not os.path.exists(os.path.join(os.path.dirname(yamlconfig), "14_write_metrics.txt")):
                LOG.info("Running pipeline: {}".format(" ".join(cl)))
                subprocess.check_call(cl)
        finally:
            ## Always return to the starting directory, also when the pipeline fails
            os.chdir(orig_dir)
            LOG.info("Finished pipeline run and cd back to {}".format(orig_dir))