Example #1
def GATKBaseRecal(infile,
                  outfile,
                  genome,
                  intervals,
                  padding,
                  dbsnp,
                  solid_options=""):
    '''Recalibrates base quality scores using GATK'''

    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''GenomeAnalysisTK
                    -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -L %(intervals)s
                    -ip %(padding)s
                    -I %(infile)s
                    --knownSites %(dbsnp)s %(solid_options)s ;
                    ''' % locals()

    statement += '''GenomeAnalysisTK
                    -T PrintReads -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s ;
                    ''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()
    P.run(statement)
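A minimal usage sketch for the helper above; every file name and value below is a hypothetical placeholder, not taken from the original pipeline:

# Hypothetical invocation; all paths are placeholders.
GATKBaseRecal(infile="sample.bam",
              outfile="sample.bqsr.bam",
              genome="hg38.fa",
              intervals="targets.interval_list",
              padding=100,
              dbsnp="dbsnp_138.vcf")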
Example #2
def runTomTom(infile, outfile):
    '''Compare ab initio motifs against known motif databases using TomTom.'''

    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "tomtom", outfile)

    if iotools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        iotools.touch_file(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
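The try/except around os.makedirs above tolerates a pre-existing export directory. On Python 3 the same intent can be expressed without the exception handler; a minimal sketch:

# Equivalent, race-free directory creation (Python >= 3.2)
os.makedirs(os.path.dirname(target_path), exist_ok=True)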
Example #3
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to reference fasta and adds read groups using
       SAMtools, realigns around indels and recalibrates base quality scores
       using GATK'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir()
    job_memory = PARAMS["gatk_memory"]

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    exome.GATKReadGroups(infile, outfile1, genome,
                         PARAMS["readgroup_library"],
                         PARAMS["readgroup_platform"],
                         PARAMS["readgroup_platform_unit"])

    exome.GATKIndelRealign(outfile1, outfile2, genome,
                           PARAMS["gatk_threads"])

    iotools.zap_file(outfile1)

    exome.GATKBaseRecal(outfile2, outfile, genome,
                        PARAMS["gatk_dbsnp"],
                        PARAMS["gatk_solid_options"])
    iotools.zap_file(outfile2)
Example #4
def deduplicate_reads(infile, outfile):
    '''Remove duplicate reads with Picard MarkDuplicates and index the result.'''

    tmpdir = P.get_temp_dir(dir=PARAMS["shared_tmpdir"])

    statement = '''
                    MarkDuplicates
                    I=%(infile)s
                    O=%(outfile)s
                    ASSUME_SORTED=True
                    VALIDATION_STRINGENCY=LENIENT
                    METRICS_FILE=%(outfile)s.stats
                    REMOVE_DUPLICATES=True
                    TMP_DIR=%(tmpdir)s > %(outfile)s.log

                    &&

                    samtools index %(outfile)s

                    &&

                    rm -r %(tmpdir)s '''

    job_memory = "16G"

    P.run(statement)
Example #5
def mergeSampleBams(infile, outfile):
    '''merge control and tumor bams'''
    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.get_temp_dir(shared=True)

    outfile_tumor = outfile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])
    infile_tumor = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    infile_base = os.path.basename(infile)
    infile_tumor_base = infile_base.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    control_id = "Control.bam"
    tumor_id = control_id.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    statement = '''picard AddOrReplaceReadGroups
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    ID=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(infile_tumor)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track_tumor)s
                    ID=%(track_tumor)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    # -r attaches an RG tag inferred from each input file name; -f overwrites
    statement += '''samtools merge -rf
                    %(outfile)s
                    %(tmpdir_gatk)s/%(infile_base)s
                    %(tmpdir_gatk)s/%(infile_tumor_base)s;'''
    statement += "samtools index %(outfile)s; "
    statement += "rm -rf %(tmpdir_gatk)s ;"
    P.run(statement)
    iotools.zap_file(infile)
    iotools.zap_file(infile_tumor)
Example #6
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the
    top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
        -mod %(meme_model)s
        -nmotifs %(meme_nmotifs)s
        -oc %(tmpdir)s
        -maxsize %(meme_max_size)s
        %(meme_options)s
        > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
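The docstring above mentions converting softmasked (lowercase) sequence to hardmasked sequence before motif discovery. The pipeline delegates this to writeSequencesForIntervals and the configured masker; purely as an illustration of the idea, a minimal sketch:

# Illustrative only: replace softmasked (lowercase) bases with N so that
# motif finders ignore repeat-masked regions.
def hardmask(sequence):
    return "".join("N" if base.islower() else base for base in sequence)

assert hardmask("ACGTacgtACGT") == "ACGTNNNNACGT"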
Example #7
    def __init__(self,
                 save=True,
                 summarize=False,
                 threads=1,
                 qual_format='phred64',
                 *args, **kwargs):
        self.save = save
        self.summarize = summarize
        self.threads = threads
        if self.save:
            self.outdir = "processed.dir"
        else:
            self.outdir = P.get_temp_dir(shared=True)

        self.processors = []
        self.qual_format = qual_format
Example #8
def runPicardOnRealigned(infile, outfile):
    '''Run Picard alignment statistics on realigned control and tumour BAMs.'''
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.get_temp_dir()

    outfile_tumor = outfile.replace(PARAMS["sample_control"],
                                    PARAMS["sample_tumour"])
    infile_tumor = infile.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(PARAMS["sample_control"],
                                PARAMS["sample_tumour"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    mappingqc.buildPicardAlignmentStats(infile, outfile, genome)
    mappingqc.buildPicardAlignmentStats(infile_tumor, outfile_tumor, genome)
Example #9
def picardMarkDuplicates(infile, outfile):
    '''
    Generate duplication metrics using Picard MarkDuplicates.
    '''

    out_dir = os.path.dirname(outfile)

    bam_in = os.path.join(os.path.dirname(outfile),
                          "outs/possorted_genome_bam.bam")

    base_bam = 'marked_duplicates.bam'
    base_metrics = os.path.basename(outfile)

    picard_options = PARAMS["picard_markduplicate_options"]
    barcode_tag = PARAMS["picard_barcode_tag"]
    read_one_barcode_tag = PARAMS["picard_read_one_barcode_tag"]
    read_two_barcode_tag = PARAMS["picard_read_two_barcode_tag"]
    validation_stringency = PARAMS["picard_validation_stringency"]

    job_threads = PICARD_THREADS
    job_memory = PICARD_MEMORY

    local_tmpdir = P.get_temp_dir()

    statement = '''picard_out=`mktemp -d -p %(local_tmpdir)s`;
                   MarkDuplicates
                   I=%(bam_in)s
                   O=${picard_out}/%(base_bam)s
                   M=${picard_out}/%(base_metrics)s
                   BARCODE_TAG=%(barcode_tag)s
                   READ_ONE_BARCODE_TAG=%(read_one_barcode_tag)s
                   READ_TWO_BARCODE_TAG=%(read_two_barcode_tag)s
                   VALIDATION_STRINGENCY=%(validation_stringency)s
                   %(picard_options)s;
                   grep . ${picard_out}/%(base_metrics)s
                   | grep -v "#"
                   | head -n2
                   > %(outfile)s;
                   rm -rv ${picard_out}
                '''

    P.run(statement)
Example #10
def GATKReadGroups(infile,
                   outfile,
                   genome,
                   library="unknown",
                   platform="Illumina",
                   platform_unit="1",
                   track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups'''

    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''picard ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%(genome)s
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                 ''' % locals()

    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(outfile)s
                    RGLB=%(library)s
                    RGPL=%(platform)s
                    RGPU=%(platform_unit)s
                    RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;''' % locals()

    statement += '''samtools index %(outfile)s ;
                 ''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()

    P.run(statement)
Example #11
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # configure job_threads with fastq_screen_options from P.get_params()
    job_threads = re.findall(r'--threads \d+', P.get_params()['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))

    tempdir = P.get_temp_dir(".")
    conf_fn = os.path.join(tempdir, "fastq_screen.conf")
    with iotools.open_file(conf_fn, "w") as f:
        for key, value in P.get_params().items():
            if key.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (key[22:], value))

    m = mapping.FastqScreen(config_filename=conf_fn)
    statement = m.build((infiles,), outfile)
    P.run(statement, job_memory="8G")
    shutil.rmtree(tempdir)
    iotools.touch_file(outfile)
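The configuration loop above assumes parameter keys of the form fastq_screen_database_<name>: the slice key[22:] strips the 22-character prefix "fastq_screen_database_". A small sketch of the generated DATABASE lines, with hypothetical database names and paths:

# Hypothetical parameters; real values come from the pipeline configuration.
params = {"fastq_screen_database_human": "/path/to/human_index",
          "fastq_screen_database_mouse": "/path/to/mouse_index"}
for key, value in params.items():
    if key.startswith("fastq_screen_database"):
        print("DATABASE\t%s\t%s" % (key[22:], value))
# DATABASE  human  /path/to/human_index
# DATABASE  mouse  /path/to/mouse_index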
Example #12
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used.  Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
        return

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(motifs_max_size)s
    %(meme_options)s
       > %(outfile)s.log
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
Example #13
    def run(self, infile, outfile, params):

        if not os.path.exists(params.reference_bam):
            raise OSError("reference bam file {} does not exist".format(
                params.reference_bam))

        tmpdir = P.get_temp_dir(clear=True)

        statement = (
            "mkdir {tmpdir}; "
            "samtools sort -n {infile} > {tmpdir}/comp.bam; "
            "samtools sort -n {params.reference_bam} > {tmpdir}/ref.bam; "
            "{params.path} bam-compare-alignments "
            "--output-filename-pattern={outfile}.daisy_bam_compare_alignments_%%s.tsv "
            "{params.options} "
            "--input-bam={tmpdir}/comp.bam "
            "--reference-bam={tmpdir}/ref.bam "
            ">& {outfile}; "
            "rm -rf {tmpdir}; ".format(**locals()))

        retval = P.run(statement)

        return retval
Example #14
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.

    '''

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if iotools.is_empty(dbfile) or len(motiffiles) == 0:
        iotools.touch_file(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if iotools.is_empty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    P.run("gzip < %(tmpfile)s > %(outfile)s")

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
Example #15
    def buildStatement(self, *args, **PARAMS):
        """
        Generate run statement for processing single, paired, or paired
        + singleton samples. 

        Required arguments: 
        index
        reference
        
        """

        run_options = PARAMS["sortmerna_run_options"]
        threads = PARAMS["sortmerna_threads"]

        # A comma separated list of references
        references = PARAMS["sortmerna_reference"]
        references = ' --ref '.join(references.split(','))
        # All listed references must be pre-indexed in this location
        index_dir = PARAMS[
            "sortmerna_index"]  # Check this isn't automatically passed.

        tmpf = P.get_temp_dir('.')
        tmpf_kvdb = os.path.join(tmpf, 'kvdb')
        tmpf_readb = os.path.join(tmpf, 'readb')

        if not self.fastn2:
            # Run sortMeRNA for single reads
            in_fastn1 = self.fastn1
            in_prefix = P.snip(in_fastn1, self.fn_suffix, strip_path=True)
            out_prefix = os.path.join(self.outdir, in_prefix)

            statement = (
                "sortmerna"
                " --index 0"  # skip indexing, assume in idx-dir
                " --fastx"
                " --reads %(in_fastn1)s"
                " --ref %(references)s"
                " --idx-dir %(index_dir)s"  # location of reference indexes
                " --aligned %(out_prefix)s_aligned"  # output location of aligned seq
                " --other %(out_prefix)s_unaligned"  # output location of unalinged seq
                " --readb %(tmpf_readb)s"  # location of tmp file for reads
                " --kvdb %(tmpf_kvdb)s"  # location of tmp file for kv pairs
                " --threads %(threads)s"
                " --zip-out" % locals())

        else:
            # Run sortMeRNA for paired reads
            in_fastn1 = self.fastn1
            in_fastn2 = self.fastn2
            in_prefix = P.snip(in_fastn1, self.fn_suffix, strip_path=True)
            out_prefix = os.path.join(self.outdir, in_prefix)
            statement = (
                "sortmerna"
                " --index 0"  # skip indexing, assume in idx-dir
                " --fastx"
                " --reads %(in_fastn1)s"  # First read file
                " --reads %(in_fastn2)s"  # Second read file
                " --ref %(references)s"
                " --idx-dir %(index_dir)s"  # location of reference indexes
                " --aligned %(out_prefix)s_aligned"  # output location of aligned seq
                " --other %(out_prefix)s_unaligned"  # output location of unalinged seq
                " --readb %(tmpf_readb)s"  # location of tmp file for reads
                " --kvdb %(tmpf_kvdb)s"  # location of tmp file for kv pairs
                " --paired_in"  # If one read is aligned, both are output to aligned file
                " --out2"  # Output paired reads to separate files
                " --threads %(threads)s"
                " --zip-out" % locals())

        if self.fastn3 and not PARAMS.get('sortmerna_skip_singletons', False):
            in_fastn3 = self.fastn3
            statement_2 = (
                "sortmerna"
                " --index 0"  # skip indexing, assume in idx-dir
                " --fastx"
                " --reads %(in_fastn3)s"
                " --idx-dir %(index_dir)s"  # location of reference indexes
                " --ref %(references)s"
                " --aligned %(out_prefix)s_aligned_singleton"  # output location of aligned seq
                " --other  %(out_prefix)s_unaligned_singleton"  # output location of unalinged seq
                " --readb %(tmpf_readb)s"  # location of tmp file for reads
                " --kvdb %(tmpf_kvdb)s"  # location of tmp file for kv pairs
                " --threads %(threads)s"
                " --zip-out" % locals())

            statement = " && ".join([
                statement,
                "rm -rf %(tmpf)s/*" % locals(),  # location of tmp_readb & kvdb
                statement_2,
                "rm -rf %(tmpf)s" % locals()
            ])

        return statement, run_options
Example #16
def redirect2mounts(config,
                    mountpoint=None,
                    debug=None,
                    mount_write=False,
                    substitute_only=False,
                    always_mount=False):
    """redirect filenames in dictionary config to a mount-point.

    Mount points in the config are indicated by the `arv=` prefix. If
    no option in config requires mounting, no mounting will be done and
    the method returns None.

    :param config: dictionary with config values. Will be modified in-place.
    :param mountpoint: if given, paths will be substituted by mountpoint. If None,
        a new mountpoint will be created.
    :param debug: if given, mount in debug mode and save log to filename.
    :param mount_write: if True, mount in --read-write mode.
    :param substitute_only: if True, only perform substitution, do not mount anything
        even if mountpoint is None.
    :param always_mount: if True, always mount, no matter if arv= prefix is present.

    :return: the mountpoint

    """
    arvados_options = ["--disable-event-listening"]
    if debug:
        arvados_options.append(" --debug --logfile={}".format(debug))

    if mount_write:
        arvados_options.append("--read-write")
        arvados_options = " ".join(arvados_options)
        if not mountpoint:
            mountpoint = P.get_temp_dir() + "/"
            E.info("redirect2mounts: mounting arvados at {} with --read-write".
                   format(mountpoint))
            E.run("arv-mount {} {}".format(arvados_options, mountpoint))
            E.info("redirect2mounts: arvados mounted at {} with --read-write".
                   format(mountpoint))
    else:
        arvados_options.append("--read-only")
        if always_mount:
            mountpoint = P.get_temp_dir() + "/"
            do_mount = True
        else:
            do_mount = False

        for d, key, value in IOTools.nested_iter(config):
            if isinstance(value, str):
                if "arv=" in value:
                    if substitute_only and mountpoint is None:
                        continue
                    if not mountpoint:
                        mountpoint = P.get_temp_dir() + "/"
                        do_mount = True
                    d[key] = re.sub("arv=", mountpoint, value)

        if do_mount:
            raise NotImplementedError("arvados support disabled")
            # NOTE: the code below is unreachable while arvados support
            # is disabled by the raise above.
            # if not arvados.have_arvados():
            #     raise ValueError(
            #         "config file requires arvados access, but arvados not available")
            arvados_options = " ".join(arvados_options)
            E.debug("redirect2mounts: mounting arvados at {} with options {}".
                    format(mountpoint, arvados_options))
            E.run("arv-mount {} {}".format(arvados_options, mountpoint))
            E.debug(
                "redirect2mounts: arvados mounted at {}".format(mountpoint))

    return mountpoint
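A minimal usage sketch for redirect2mounts, assuming a config dictionary whose values carry the arv= prefix described in the docstring; the keys, paths and mount point below are hypothetical:

# Hypothetical config; "arv=" marks values to redirect to the mount point.
config = {"reference": "arv=keep/abc123/genome.fa", "threads": 4}
mountpoint = redirect2mounts(config,
                             mountpoint="/mnt/arvados/",
                             substitute_only=True)
# config["reference"] is now "/mnt/arvados/keep/abc123/genome.fa"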
Example #17
    def run(self, infiles, outfile, params):

        tmpdir = P.get_temp_dir(clear=True)

        statements = ["mkdir {}".format(tmpdir)]

        if params.remove_fields:
            cleanup_statement = ("| {params.path} annotate "
                                 "-x {params.remove_fields} "
                                 "2> {outfile}_annotate.log ".format(
                                     **locals()))
        else:
            cleanup_statement = ""

        # the current pattern is probably overly specific and
        # substitutes ./. with 0/0
        if params.set_missing_genotype_to_reference:
            # raw string avoids invalid escape-sequence warnings
            set_genotype = r"| perl -p -e 's/\.\/\./0\/0/g'"
        else:
            set_genotype = ""

        with IOTools.open_file(outfile + ".filelist_blocks", "w") as blockf:

            for start in range(0, len(infiles), self.block_size):

                fn = outfile + ".filelist_{}".format(start)
                fn_vcf = os.path.join(tmpdir, "block_{}.vcf.gz".format(start))
                with IOTools.open_file(fn, "w") as outf:
                    end = start + self.block_size
                    outf.write("\n".join(infiles[start:end]) + "\n")

                statements.append("{params.path} merge "
                                  "{params.options} "
                                  "-O v "
                                  "--file-list {outfile}.filelist_{start} "
                                  "2> {outfile}_merge_{start}.log "
                                  "{cleanup_statement} "
                                  "{set_genotype} "
                                  "| bgzip "
                                  "> {fn_vcf}; "
                                  "tabix -p vcf {fn_vcf}".format(**locals()))

                blockf.write(fn_vcf + "\n")

        if params.restrict_to_all:
            filter_statement = ("| {params.path} filter "
                                "--include \"FORMAT/GT != '.'\" "
                                "-O v "
                                "2> {outfile}_filter.log ".format(**locals()))
        else:
            filter_statement = ""

        statements.append("{params.path} merge "
                          "{params.options} "
                          "-O v "
                          "--file-list {outfile}.filelist_blocks "
                          "2> {outfile}_merge.log "
                          "{filter_statement} "
                          "| bgzip "
                          "> {outfile}; "
                          "tabix -p vcf {outfile} ".format(**locals()))

        statements.append("rm -rf {}".format(tmpdir))

        statement = "; ".join(statements)

        retvals = P.run(statement, **params._asdict())

        return retvals
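The blocking logic above merges VCFs hierarchically: inputs are grouped into blocks of self.block_size, each block is merged into an intermediate bgzipped VCF, and the intermediates are merged in a final pass. A minimal sketch of just the grouping, with hypothetical file names:

# Hypothetical inputs illustrating the block partitioning.
infiles = ["sample%d.vcf.gz" % i for i in range(5)]
block_size = 2
for start in range(0, len(infiles), block_size):
    print(infiles[start:start + block_size])
# ['sample0.vcf.gz', 'sample1.vcf.gz']
# ['sample2.vcf.gz', 'sample3.vcf.gz']
# ['sample4.vcf.gz']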
Example #18
def removeHost(fastq1, outfile):
    '''Remove host contamination using bmtagger'''

    outf_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.txt'
    outf_host_stub = P.snip(outf_host, '.txt') + '_toremove'

    # Currently disabled. Has no effect. See drop_fastq.py
    # # Whether to keep pair if a read is identified as host.
    # if PARAMS['bmtagger_keep_pairs']:
    #     keep_pairs = True
    #     E.info("BMTagger: reads with a pair identified as host will be"
    #            " discarded")
    # else:
    #     keep_pairs = False
    #     E.info("BMTagger: reads with a pair identified as host will be"
    #            " kept as singletons (assuming they are not also identified"
    #            " as host)")

    if IS_PAIRED:
        fastq2 = P.snip(fastq1, '.1.gz') + '.2.gz'
        fastq3 = P.snip(fastq1, '.1.gz') + '.3.gz'

        to_remove_paired = P.get_temp_filename('.')
        to_remove_singletons = P.get_temp_filename('.')

        # In some cases, it may be desirable to screen against multiple hosts.
        indexes = zip(PARAMS['bmtagger_bitmask'].split(','),
                      PARAMS['bmtagger_srprism'].split(','))
        for n, index_pair in enumerate(indexes, 1):
            n = str(n)
            bitmask, srprism = index_pair

            # Screen the paired reads, then singletons
            tmpdir1 = P.get_temp_dir('.')
            tmpdir2 = P.get_temp_dir('.')

            tmpf1 = P.get_temp_filename('.')
            tmpf2 = P.get_temp_filename('.')
            tmpf3 = P.get_temp_filename('.')

            # bmtagger truncates fasta headers...  sed 's/[[:space:]]\+/__/g'
            # It won't accept... sed 's|[[:space:]].*$|/1|'
            # It also fails if fastq1 header differs from fastq2
            statement1 = (
                "zcat %(fastq1)s > %(tmpf1)s &&"
                " zcat %(fastq2)s > %(tmpf2)s &&"
                " bmtagger.sh"
                "  -b %(bitmask)s"
                "  -x %(srprism)s"
                "  -T %(tmpdir1)s"
                "  -q1"  # Input is fastq
                "  -1 %(tmpf1)s"
                "  -2 %(tmpf2)s"
                "  -o %(outf_host_stub)s_paired%(n)s"
                "  &> %(outfile)s.log &&"
                " cat %(outf_host_stub)s_paired%(n)s"
                "  >> %(to_remove_paired)s &&"
                " rm -rf %(tmpdir1)s %(tmpf1)s %(tmpf2)s"
                "  %(outf_host_stub)s_paired%(n)s")

            # Screen the singletons
            if IOTools.open_file(fastq3).read(1):
                statement2 = (
                    "zcat %(fastq3)s > %(tmpf3)s &&"
                    " bmtagger.sh"
                    "  -b %(bitmask)s"
                    "  -x %(srprism)s"
                    "  -T %(tmpdir2)s"
                    "  -q1"  # Input is fastq
                    "  -1 %(tmpf3)s"
                    "  -o %(outf_host_stub)s_singletons%(n)s"
                    " &>> %(outfile)s.log &&"
                    " cat %(outf_host_stub)s_singletons%(n)s"
                    "  >> %(to_remove_singletons)s &&"
                    " rm -rf %(tmpdir2)s %(tmpf3)s"
                    "  %(outf_host_stub)s_singletons%(n)s")
            else:
                statement2 = ("touch  %(to_remove_singletons)s &&"
                              " rm -rf %(tmpdir2)s %(tmpf3)s")

            statement = " && ".join([statement1, statement2])

            P.run(statement, job_options=PARAMS['bmtagger_run_options'])

        # Drop host contaminated reads
        # A hack due to the fact that BMTagger truncates fastq identifiers
        # TO DO: Look at bmtagger/.../bin/extract_fullseq
        drop_script = os.path.join(
            os.path.splitext(__file__)[0], 'drop_fastqs.py')

        fastq1_out = outfile
        fastq2_out = P.snip(outfile, '.1.gz') + '.2.gz'
        fastq3_out = P.snip(outfile, '.1.gz') + '.3.gz'

        fastq1_host = P.snip(outfile,
                             '_dehost.fastq.1.gz') + '_host.fastq.1.gz'
        fastq2_host = P.snip(outfile,
                             '_dehost.fastq.1.gz') + '_host.fastq.2.gz'
        fastq3_host = P.snip(outfile,
                             '_dehost.fastq.1.gz') + '_host.fastq.3.gz'

        statement = ("python %(drop_script)s"
                     " --fastq1 %(fastq1)s"
                     " --fastq2 %(fastq2)s"
                     " --fastq3 %(fastq3)s"
                     " --to-drop-paired %(to_remove_paired)s"
                     " --to-drop-single %(to_remove_singletons)s"
                     " --fastq-out1 %(fastq1_out)s"
                     " --fastq-out2 %(fastq2_out)s"
                     " --fastq-out3 %(fastq3_out)s"
                     " --fastq-drop1 %(fastq1_host)s"
                     " --fastq-drop2 %(fastq2_host)s"
                     " --fastq-drop3 %(fastq3_host)s"
                     " &>> %(outfile)s.log")

        P.run(statement)

        os.unlink(to_remove_paired)
        os.unlink(to_remove_singletons)

    else:
        indexes = zip(PARAMS['bmtagger_bitmask'].split(','),
                      PARAMS['bmtagger_srprism'].split(','))
        to_remove = P.get_temp_filename('.')

        for n, index_pair in enumerate(indexes, 1):
            n = str(n)
            bitmask, srprism = index_pair
            # Screen the singletons
            tmpdir1 = P.get_temp_dir('.')
            tmpf = P.get_temp_filename('.')

            statement = (
                "zcat %(fastq1)s > %(tmpf)s &&"
                " bmtagger.sh"
                "  -b %(bitmask)s"
                "  -x %(srprism)s"
                "  -T %(tmpdir1)s"
                "  -q1"  # Input is fastq
                "  -1 %(tmpf)s"
                "  -o %(outf_host_stub)s_%(n)s"
                "  &>> %(outfile)s.log &&"
                " cat %(outf_host_stub)s_%(n)s >> %(to_remove)s"
                " rm -rf %(tmpdir1)s %(tmpf)s %(outf_host_stub)s_%(n)s")

            P.run(statement, job_options=PARAMS['bmtagger_run_options'])

        # Drop host contaminated reads
        drop_script = os.path.join(
            os.path.splitext(__file__)[0], 'drop_single_fastqs.py')

        fastq_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.fastq.1.gz'

        statement = ("python %(drop_script)s"
                     " --fastq1 %(fastq1)s"
                     " --to-drop-single %(to_remove)s"
                     " --fastq-out1 %(outfile)s"
                     " --fastq-drop1 %(fastq_host)s"
                     " &>> %(outfile)s.log")
        P.run(statement)

        os.unlink(to_remove)
Example #19
    def setUp(self):
        # ignore command line arguments for pytest
        P.initialize(argv=["test"])
        self.work_dir = P.get_temp_dir(shared=True)
Example #20
    def setUp(self):
        self.work_dir = P.get_temp_dir(shared=True)
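Both fixtures create a shared temporary working directory but neither snippet shows cleanup; a matching tearDown might look like the sketch below (it assumes shutil is imported at module level, and the cleanup itself is an assumption, not part of the original tests):

    def tearDown(self):
        # remove the shared temporary directory created in setUp
        shutil.rmtree(self.work_dir, ignore_errors=True)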