Beispiel #1
0
def _download_ucsc_genome_and_index(build="hg19", chr="chr11", start=0, end=2000000):
    """Download a chromosome from UCSC, extract a region and index it.

    The gzipped chromosome fasta is fetched with curl into the ``seq``
    directory of the genome build unless it is already present. The slice
    ``[start:end]`` of the sequence is written to an uncompressed fasta
    file (unless that file exists), which is then indexed with bwa and
    bowtie. One line per index is appended to the ``sam``, ``bwa`` and
    ``bowtie`` buffers in ``index_files``.

    :param build: genome build; part of the download url and key into ``genomes``
    :param chr: chromosome name, i.e. the stem of the remote ``.fa.gz`` file
    :param start: slice start of the region to keep
    :param end: slice end of the region to keep
    """
    ucsc = "http://hgdownload.cse.ucsc.edu/goldenPath/{}/chromosomes/".format(build)
    url = os.path.join(ucsc, "{}.fa.gz".format(chr))
    genomedir = os.path.join(GENOMES, genomes[build]['species'], build, "seq")
    ## Build the two local paths once instead of at every use
    gzfile = os.path.join(genomedir, os.path.basename(url))
    fafile = os.path.join(genomedir, os.path.basename(url).replace(".gz", ""))
    if not os.path.exists(genomedir):
        safe_makedir(genomedir)
    if not os.path.exists(gzfile):
        LOG.info("Downloading {} from {} with curl".format(gzfile, url))
        try:
            subprocess.check_call(["curl", url, "-o", gzfile])
        except (subprocess.CalledProcessError, OSError) as e:
            ## Still best effort, but leave a trace of what went wrong
            LOG.warning("Download of {} failed: {}".format(url, e))
    if not os.path.exists(fafile):
        with gzip.open(gzfile, "r") as inh:
            rec = SeqIO.read(inh, "fasta")
        with open(fafile, "w") as outh:
            SeqIO.write(SeqRecord(rec.seq[start:end], rec.id, '', ''), outh, "fasta")
    outfile = _index_bwa(fafile)
    index_files['sam']['data'].write("index\t{}\t{}\n".format(build, fafile))
    index_files['bwa']['data'].write("{}\t{}\t{}\t{}\n".format(build, build, genomes[build]['label'], outfile))
    outfile = _index_bowtie(fafile)
    index_files['bowtie']['data'].write("{}\t{}\t{}\t{}\n".format(build, build, genomes[build]['label'], outfile))
Beispiel #2
0
def _install_dbsnp_file(build="hg19"):
    """Download a (large) dbsnp file and extract a region from chr 11.

    The gzipped vcf is fetched with curl into the ``variation`` directory of
    the genome build unless it is already present. Header lines and the
    records on chromosome "11" with a position below 2000000 are copied to
    ``dbsnp132_chr11.vcf``, with the chromosome renamed to "chr11". Reading
    stops at the first chromosome 11 record at or beyond that position.

    :param build: genome build; key into ``genomes`` and part of the output path
    :returns: path to the extracted vcf file
    """
    variationdir = os.path.join(GENOMES, genomes[build]['species'], build, "variation")
    url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/dbsnp132_20101103.vcf.gz"
    fn = os.path.join(variationdir, "dbsnp132.vcf.gz")
    dbsnp = os.path.join(variationdir, "dbsnp132_chr11.vcf")
    if not os.path.exists(variationdir):
        safe_makedir(variationdir)
    if not os.path.exists(fn):
        LOG.info("Downloading {} from {} with curl".format(fn, url))
        try:
            subprocess.check_call(["curl", url, "-o", fn])
        except (subprocess.CalledProcessError, OSError) as e:
            ## Still best effort, but leave a trace of what went wrong
            LOG.warning("Download of {} failed: {}".format(url, e))
    ## Nothing to read if the extract is already in place
    if os.path.exists(dbsnp):
        return dbsnp
    with gzip.open(fn, "r") as fh, open(dbsnp, "w") as of:
        for r in fh:
            if r.startswith("#"):
                of.write(r)
                continue
            vals = r.split()
            ## Blank lines have no fields to inspect
            if not vals or vals[0] != "11":
                continue
            if int(vals[1]) >= 2000000:
                break
            vals[0] = "chr{}".format(vals[0])
            of.write("\t".join(vals))
            of.write("\n")
    return dbsnp
Beispiel #3
0
def _install_dbsnp_entrez(build="hg19"):
    """Install a subset of snps using Entrez queries.

    Unless ``dbsnp132_chr11.vcf`` already exists in the ``variation``
    directory of the genome build, snp ids are searched with Entrez, fetched
    in batches in "flt" format, converted with ``_dbsnp_line`` and written,
    sorted on the integer value of their second whitespace-separated field,
    after ``dbsnp_header`` and ``vcfheader``. If the Entrez communication
    fails a warning is logged and the records collected so far (possibly
    none) are written.

    :param build: genome build; key into ``genomes`` and part of the output path
    :returns: path to the vcf file
    """
    LOG.info("Installing dbsnp file for {}".format(genomes[build]['species']))
    variationdir = os.path.join(GENOMES, genomes[build]['species'], build, "variation")
    if not os.path.exists(variationdir):
        safe_makedir(variationdir)
    fn = os.path.join(variationdir, "dbsnp132_chr11.vcf")
    if os.path.exists(fn):
        return fn
    ## Bound before the try block so that a failing search cannot leave
    ## the name undefined when the file is written below
    records = []
    try:
        # http://www.ncbi.nlm.nih.gov/books/NBK44454/#Search.how_do_i_search_dbsnp_for_the_tot
        ## This will actually only download a subset of snps
        handle = Entrez.esearch(db="snp", retmax=8000, term="\"Homo sapiens\"[Organism] AND (11[CHR] AND (1[CHRPOS] : 2000000[CHRPOS]))")
        record = Entrez.read(handle)
        ## For some reason the first entries are more or less empty
        start = 4000
        delta = 200
        for i in xrange(start, len(record['IdList']), delta):
            LOG.info("retrieving dbsnp records {} - {}".format(i, i+delta))
            h = Entrez.efetch(db="snp", id=record['IdList'][i:i+delta], rettype="flt", retmax=delta)
            lbuffer = None
            lines = []
            ## A line starting with "rs", or more than 20 buffered lines,
            ## closes the current record. The line that triggered the flush
            ## is kept in lbuffer and becomes the first line of the next
            ## record. The loop only ends when _dbsnp_line returns None.
            while True:
                if lbuffer:
                    l = lbuffer
                    lbuffer = None
                else:
                    l = h.readline()
                if l.startswith("rs") or len(lines) > 20:
                    if lines:
                        lbuffer = l
                        rec = _dbsnp_line(lines)
                        if rec is None:
                            break
                        records.append(rec)
                        lines = []
                    else:
                        lines.append(l.rstrip())
                else:
                    lines.append(l.rstrip())
    except Exception as e:
        LOG.warning("Entrez query failed: {}".format(e))
    LOG.info("Writing file {}".format(fn))
    with open(fn, "w") as fh:
        fh.write(dbsnp_header)
        ## Header must be tab-separated, otherwise GATK complains...
        fh.write(vcfheader)
        for rec in sorted(records, key=lambda x: int(x.split()[1])):
            fh.write(rec)
            fh.write("\n")

    return fn
Beispiel #4
0
def _index_bowtie2(fn, label="bowtie2"):
    """Build a bowtie2 index for a fasta file.

    The index is placed in a directory named ``label`` next to the directory
    that holds ``fn``. The fasta file is symlinked into that directory and
    ``bowtie2-build`` is run on the link, unless ``<name>.1.bt2`` is already
    present there.

    :param fn: path to the fasta file
    :param label: name of the index directory
    :returns: index prefix, i.e. the path of the linked fasta file without extension
    :raises subprocess.CalledProcessError: if bowtie2-build exits with an error
    """
    outdir = os.path.join(os.path.dirname(fn), os.pardir, label)
    link = os.path.join(outdir, os.path.basename(fn))
    prefix = os.path.splitext(link)[0]
    if not os.path.exists(outdir):
        safe_makedir(outdir)
    ## Link whenever the link is missing, not only when the directory was
    ## just created. The target is made absolute because a relative target
    ## would be resolved against outdir and leave the link dangling.
    if not os.path.lexists(link):
        os.symlink(os.path.abspath(fn), link)
    if os.path.exists("{}.1.bt2".format(prefix)):
        return prefix
    cl = ["bowtie2-build", os.path.abspath(link), os.path.splitext(os.path.abspath(link))[0]]
    subprocess.check_call(cl)
    return prefix
Beispiel #5
0
def _index_bowtie(fn, label="bowtie"):
    """Build a bowtie index for a fasta file.

    The index is placed in a directory named ``label`` next to the directory
    that holds ``fn``. The fasta file is symlinked into that directory and
    ``bowtie-build`` is run on the link, unless ``<name>.1.ebwt`` is already
    present there.

    :param fn: path to the fasta file
    :param label: name of the index directory
    :returns: index prefix, i.e. the path of the linked fasta file without extension
    :raises subprocess.CalledProcessError: if bowtie-build exits with an error
    """
    LOG.info("Indexing {} with bowtie".format(fn))
    outdir = os.path.join(os.path.dirname(fn), os.pardir, label)
    link = os.path.join(outdir, os.path.basename(fn))
    prefix = os.path.splitext(link)[0]
    if not os.path.exists(outdir):
        safe_makedir(outdir)
    ## Link whenever the link is missing, not only when the directory was
    ## just created. The target is made absolute because a relative target
    ## would be resolved against outdir and leave the link dangling.
    if not os.path.lexists(link):
        os.symlink(os.path.abspath(fn), link)
    if os.path.exists("{}.1.ebwt".format(prefix)):
        LOG.info("{} exists; not doing anything".format(fn))
        return prefix
    cl = ["bowtie-build", os.path.abspath(link), os.path.splitext(os.path.abspath(link))[0]]
    subprocess.check_call(cl)
    LOG.info("Finished indexing {} with bowtie".format(fn))
    return prefix
Beispiel #6
0
def _index_bwa(fn, label="bwa"):
    """Build a bwa index for a fasta file.

    The index is placed in a directory named ``label`` next to the directory
    that holds ``fn``. The fasta file is symlinked into that directory and
    ``bwa index`` is run on the link, unless ``<basename>.amb`` is already
    present there.

    :param fn: path to the fasta file
    :param label: name of the index directory
    :returns: path of the linked fasta file inside the index directory
    :raises subprocess.CalledProcessError: if bwa exits with an error
    """
    LOG.info("Indexing {} with bwa".format(fn))
    outdir = os.path.join(os.path.dirname(fn), os.pardir,label)
    link = os.path.join(outdir, os.path.basename(fn))
    if not os.path.exists(outdir):
        safe_makedir(outdir)
    ## Link whenever the link is missing, not only when the directory was
    ## just created. The target is made absolute because a relative target
    ## would be resolved against outdir and leave the link dangling.
    if not os.path.lexists(link):
        os.symlink(os.path.abspath(fn), link)
    if os.path.exists("{}.amb".format(link)):
        LOG.info("{} exists; not doing anything".format(fn))
        return link
    cl = ["bwa", "index", os.path.abspath(link)]
    subprocess.check_call(cl)
    LOG.info("Finished indexing {} with bwa".format(fn))
    return link
Beispiel #7
0
def _install_1000g_test_files(data_dir):
    """Fetch 1000 genomes exome reads and generate test flowcell archives.

    Steps performed:

    1. unless the ``.small.bam`` file is already in ``tmpdir``, fetch the
       byte range ``0-CURLFILESIZE`` of the chromosome 11 exome alignment
       of individual NA21137 with curl
    2. convert the bam file to fastq with ``_bam_to_fastq`` (prefix ``reads``)
    3. pair ``reads_1.fq`` and ``reads_2.fq`` with ``_pair_fastq_files``
       (prefix ``seqs``)
    4. generate casava archive files for flowcells C003CCCXX and B002BBBXX
       from the paired sequences

    Notes on how the individual was chosen: see
    ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/sequence_indices/20120522.sequence.index
    for an index of recent sequencing runs. Load the data into R

    df <- read.table("20120522.sequence.index", header=TRUE, sep="\t", fill=TRUE, as.is=TRUE)
    df$INDIVIDUAL =gsub("/sequence.*", "", gsub("data/", "", df$FASTQ_FILE))

    and select an individual based on sequencing platform

    tapply(df$INSTRUMENT_MODEL, df$INDIVIDUAL, function(x) {levels(as.factor(x))})

    NA21137 was picked arbitrarily; according to the original author the
    sequencing was done at the BROAD institute on an Illumina HiSeq 2000.

    :param data_dir: not used by the function body; all files go to ``tmpdir``
    """
    individual = "NA21137"
    base_url = "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data/{}".format(individual)
    bam_name = "{}.chrom11.ILLUMINA.bwa.GIH.exome.20111114.bam".format(individual)
    bam_url = os.path.join(base_url, "exome_alignment", bam_name)
    if not os.path.exists(tmpdir):
        safe_makedir(tmpdir)
    bamfile = os.path.join(tmpdir, os.path.basename(bam_url))
    ## Here check should be done on input files to pipeline; if not present, then
    ## download bamfile and generate fastq files
    ## FIXME: checks should be done on output from _pair_fastq_files
    partial_bam = bamfile.replace(".bam", ".small.bam")
    if not os.path.exists(partial_bam):
        LOG.info("downloading {} from {}".format(bamfile, base_url))
        byte_range = "0-{}".format(CURLFILESIZE)
        subprocess.check_call(["curl", bam_url, "-o", partial_bam, "-r", byte_range])
        LOG.info("finished creating {}".format(partial_bam))

    _bam_to_fastq(partial_bam, os.path.join(tmpdir, "reads"))
    reads = [os.path.join(tmpdir, "reads_{}.fq".format(i)) for i in (1, 2)]
    _pair_fastq_files(reads[0], reads[1], os.path.join(tmpdir, "seqs"))

    ## FIXME: startiter doesn't work, now generating identical files
    for fc_key in ("C003CCCXX", "B002BBBXX"):
        _make_casava_archive_files(FLOWCELL[fc_key], fc_key, os.path.join(tmpdir, "seqs"))
Beispiel #8
0
 def setUpClass(self):
     """Create the test directory tree and touch two files in every directory."""
     dirs = [
         "data",
         "data/alignments",
         "data/nophix",
         "data/fastqc",
         "data/fastqc/nophix",
         "data/nophix/fastqc",
     ]
     for directory in dirs:
         safe_makedir(directory)
     # Same order as before: file1.txt everywhere first, then file2.txt
     for fname in ("file1.txt", "file2.txt"):
         for directory in dirs:
             subprocess.check_call(["touch", os.path.join(directory, fname)])
Beispiel #9
0
def _install_phix():
    """Install the phix reference sequence and index it with bwa and bowtie.

    Unless ``phix.fa`` already exists in the ``seq`` directory of the phix
    build, the nucleotide record 9626372 is fetched from Entrez in fasta
    format and written to that file. The file is then indexed and one line
    per index is appended to the ``sam``, ``bwa`` and ``bowtie`` buffers in
    ``index_files``.
    """
    LOG.info("Installing phix")
    build = "phix"
    genomedir = os.path.join(GENOMES, genomes[build]['species'], build, "seq")
    fn = os.path.join(genomedir, "phix.fa")
    if not os.path.exists(genomedir):
        LOG.info("Creating {}".format(genomedir))
        safe_makedir(genomedir)
    if not os.path.exists(fn):
        ## Fetch first and create the file only once the sequence is at
        ## hand. Opening the file up front would leave an empty phix.fa
        ## behind on failure, and that file would block any later download.
        try:
            handle = Entrez.efetch(db="nucleotide", id="9626372", rettype="fasta", retmode="text")
            try:
                rec = "".join(handle.readlines())
            finally:
                handle.close()
        except Exception as e:
            LOG.warning("Could not fetch phix sequence from Entrez: {}".format(e))
        else:
            LOG.info("Opening file {}".format(fn))
            with open(fn, "w") as fh:
                fh.write(rec)
    outfile = _index_bwa(fn, label="bwa")
    index_files['sam']['data'].write("index\t{}\t{}\n".format(build, fn))
    index_files['bwa']['data'].write("{}\t{}\t{}\t{}\n".format(build, build, genomes[build]['label'], outfile))
    outfile = _index_bowtie(fn, label="bowtie")
    index_files['bowtie']['data'].write("{}\t{}\t{}\t{}\n".format(build, build, genomes[build]['label'], outfile))
Beispiel #10
0
def _make_casava_archive_files(fc, ssname, prefix, startiter = 1, nseqout=1000):
    """Generate a casava-like flowcell directory in the archive.

    Writes the samplesheet, ``RunInfo.xml`` and ``runParameters.xml`` to the
    flowcell directory, creates the ``Basecall_Stats`` directories and, for
    every samplesheet row, a ``Project_*/Sample_*`` directory with its own
    ``SampleSheet.csv``. Finally the sequences in ``<prefix>_1.fastq`` and
    ``<prefix>_2.fastq`` are distributed over the sample fastq files with
    ``_write_sample_fastq``.

    If the read 1 file of any sample already exists the function returns
    without writing any sequences.

    :param fc: flowcell directory name, relative to ``ARCHIVE``
    :param ssname: key into ``SAMPLESHEETS``; also the samplesheet file stem
    :param prefix: path prefix of the input fastq files
    :param startiter: passed on to ``_write_sample_fastq``
    :param nseqout: passed on to ``_write_sample_fastq``
    """
    fc_dir = os.path.join(ARCHIVE, fc)
    if not os.path.exists(fc_dir):
        safe_makedir(fc_dir)
    with open(os.path.join(fc_dir, "{}.csv".format(ssname)), "w") as fh:
        fh.write(SAMPLESHEETS[ssname])
    ## Both xml templates are rendered with the same context
    context = {'flowcell':os.path.basename(fc), 'fc_id':fc_id(fc), 'date':fc_parts(fc)[0], 'instrument':fc.split("_")[1]}
    with open(os.path.join(fc_dir, "RunInfo.xml"), "w") as fh:
        fh.write(RUNINFO.render(**context))
    with open(os.path.join(fc_dir, "runParameters.xml"), "w") as fh:
        fh.write(RUNPARAMETERS.render(**context))

    outf1 = []
    outf2 = []
    basecall_stats_dir = os.path.join(fc_dir, "Unaligned", "Basecall_Stats_{}".format(ssname))
    if not os.path.exists(basecall_stats_dir):
        safe_makedir(basecall_stats_dir)
    for d in [os.path.join(basecall_stats_dir, x) for x in ["css", "Plots"]]:
        if not os.path.exists(d):
            safe_makedir(d)

    for row in SAMPLESHEETS[ssname].split("\n"):
        ## "".split(",") is [""], so blank rows (e.g. after a trailing
        ## newline) must be detected on the row itself
        if not row.strip():
            continue
        vals = row.split(",")
        if vals[0] == "FCID":
            header = row
            continue
        outdir = os.path.join(fc_dir, "Unaligned", "Project_{}".format(vals[5]), "Sample_{}".format(vals[2]))
        if not os.path.exists(outdir):
            safe_makedir(outdir)
        with open(os.path.join(outdir, "SampleSheet.csv"), "w") as fh:
            LOG.info("Writing to {}".format(os.path.join(outdir, "SampleSheet.csv")))
            fh.write("{}\n".format(header))
            fh.write("{}\n".format(row))
        r1 = os.path.join(outdir, "{}_{}_L00{}_R1_001.fastq.gz".format(vals[2], vals[4], vals[1]))
        r2 = os.path.join(outdir, "{}_{}_L00{}_R2_001.fastq.gz".format(vals[2], vals[4], vals[1]))
        if os.path.exists(r1):
            LOG.info("{} already exists: if you want to rerun file generation remove {}".format(r1, r1))
            return
        outf1.append(r1)
        outf2.append(r2)

    ## Write sequences
    with open("{}_1.fastq".format(prefix), "r") as fh:
        _write_sample_fastq(fh, outf1, startiter=startiter, nseqout=nseqout)
    with open("{}_2.fastq".format(prefix), "r") as fh:
        _write_sample_fastq(fh, outf2, startiter=startiter, nseqout=nseqout)
Beispiel #11
0
 def setUpClass(self):
     """Create the test directories and touch file1.txt and file2.txt in each."""
     dirs = ["data", "data/alignments", "data/nophix", "data/fastqc", "data/fastqc/nophix", "data/nophix/fastqc"]
     for path in dirs:
         safe_makedir(path)
     # All file1.txt files are touched before any file2.txt, as before
     for path in dirs:
         subprocess.check_call(["touch", os.path.join(path, "file1.txt")])
     for path in dirs:
         subprocess.check_call(["touch", os.path.join(path, "file2.txt")])
Beispiel #12
0
def setUpModule():
    """Set up test files for scilifelab pipeline tests. The setup
    covers some typical situations, such as multiplexing, samples run
    on several flowcells, and same sample being run on several lanes
    in one flowcell.

    In short, the setup
    - downloads data from 1000 genomes (exome data from chr11, 0-2Mb)
    - generates fastq files in an archive folder
    - installs genome references (phix, hg19)
    - downloads dbsnp data for chr11, 0-2Mb
    - runs run_bcbb_pipeline.py -s to install fastq files to production folder
    - runs automated_initial_analysis.py

    Nothing is done if the number of ``14_write_metrics.txt`` files found
    below the project directories already equals ``NSAMPLES``.
    """
    def _walk_projects(pattern):
        """Return the files below all project directories that match pattern."""
        def filter_fn(f):
            return re.search(pattern, f) is not None
        found = []
        for x in PROJECTS:
            found.extend(filtered_walk(os.path.join(PROJECTDIR, x), filter_fn))
        return found

    n = len(_walk_projects(r"14_write_metrics\.txt"))
    if n == NSAMPLES:
        LOG.info("All samples have been run, requirements for downstream tests satisfied")
        return
    LOG.info("Running setUpModule")
    _check_requirements()
    ## Add function to check existence of output files
    _install_1000g_test_files(os.path.join(os.path.dirname(__file__), "data", "production"))
    _install_phix()
    dbsnp = _install_dbsnp_entrez()
    (omni_out, hapmap_out, mills_out) = _install_training_data()

    _download_ucsc_genome_and_index()
    ## Install post_process file
    with open(POSTPROCESS, "w") as fh:
        fh.write(PPTEMPLATE.render(**{'store_dir':ARCHIVE, 'base_dir':PRODUCTION, 'dbsnp':dbsnp, 'omni':omni_out, 'hapmap':hapmap_out, 'mills':mills_out}))
    ## Install index files
    for v in index_files.values():
        if not os.path.exists(os.path.dirname(v['file'])):
            safe_makedir(os.path.dirname(v['file']))
        with open(v['file'], "w") as fh:
            fh.write(v['data'].getvalue())
    ## Make production dir
    if not os.path.exists(PRODUCTION):
        safe_makedir(PRODUCTION)

    ## Install files in production with run_bcbb_pipeline.py
    for k in FLOWCELL:
        install = False
        for ss in SAMPLESHEETS[k].split("\n"):
            ## Blank rows (e.g. after a trailing newline) have no fields
            if not ss.strip():
                continue
            vals = ss.split(",")
            if vals[0]=="FCID":
                continue
            outdir = os.path.join(PRODUCTION, "{}".format(vals[5].replace("__", ".")), "{}".format(vals[2]), "{}_{}".format(FLOWCELL[k].split("_")[0],FLOWCELL[k].split("_")[-1]))
            r1 = os.path.join(outdir, "{}_{}_L00{}_R1_001.fastq.gz".format(vals[2], vals[4], vals[1]))
            r2 = os.path.join(outdir, "{}_{}_L00{}_R2_001.fastq.gz".format(vals[2], vals[4], vals[1]))
            LOG.info("Looking for {} and {}".format(r1, r2))
            if not os.path.exists(r1) or not os.path.exists(r2):
                install = True
                break
        if install:
            LOG.info("Installing files with run_bcbb_pipeline.py for flowcell {}".format(k))
            cl = ["run_bcbb_pipeline.py", "-s", "-g", POSTPROCESS, os.path.join(ARCHIVE, FLOWCELL[k])]
            subprocess.check_call(cl)
        else:
            LOG.info("All files present; not running run_bcbb_pipeline.py")

    ## Run pipeline on samples
    yamlfiles = _walk_projects(r"-bcbb-config\.yaml$")
    orig_dir = os.path.abspath(os.curdir)
    for yamlconfig in yamlfiles:
        try:
            LOG.info("cding to {}".format(os.path.dirname(yamlconfig)))
            os.chdir(os.path.dirname(yamlconfig))
            cl = ["automated_initial_analysis.py", POSTPROCESS, os.path.join(os.path.pardir, os.path.basename(os.path.dirname(yamlconfig))), yamlconfig]
            if not os.path.exists(os.path.join(os.path.dirname(yamlconfig), "14_write_metrics.txt")):
                LOG.info("Running pipeline: {}".format(" ".join(cl)))
                subprocess.check_call(cl)
        finally:
            ## Always return to the starting directory, also on failure
            os.chdir(orig_dir)
            LOG.info("Finished pipeline run and cd back to {}".format(orig_dir))