Example #1
def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''run picard:CollectHsMetrics

    Generate coverage statistics for regions of interest from a bed
    file using Picard.
    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits : :term:`bed` formatted file of bait regions
    regions : :term:`bed` formatted file of target regions
    '''

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectHsMetrics
    BAIT_INTERVALS=%(baits)s
    TARGET_INTERVALS=%(regions)s
    INPUT=%(infile)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=LENIENT''' % locals()
    P.run(statement)
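The statement templates above rely on old-style %-interpolation against the local namespace (and, in cgat-core, P.run() can also substitute locals and pipeline parameters itself, which is why some later examples omit the explicit % locals()). A minimal, self-contained sketch of the idiom, with invented filenames:

infile = "sample.bam"
outfile = "sample.hsmetrics"
picard_opts = "-Xmx8g"

# %(name)s placeholders are filled from whatever is bound in locals()
statement = '''picard %(picard_opts)s CollectHsMetrics
INPUT=%(infile)s
OUTPUT=%(outfile)s''' % locals()

print(statement)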
Example #2
def loadFastqc(infile, outfile):
    '''load FASTQC stats into database.'''
    track = P.snip(infile, ".fastqc")
    filename = os.path.join(PARAMS["exportdir"], "fastqc", track + "*_fastqc",
                            "fastqc_data.txt")
    PipelineReadqc.loadFastqc(filename, database_url=PARAMS["database"]["url"])
    IOTools.touch_file(outfile)
Example #3
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # variables required for statement built by FastqScreen()
    tempdir = P.get_temp_dir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # configure job_threads with fastq_screen_options from PARAMS
    job_threads = re.findall(r'--threads \d+', PARAMS['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.yml
    conf_fn = os.path.join(tempdir, "fastq_screen.conf")
    with IOTools.open_file(conf_fn, "w") as f:
        for i, k in PARAMS.items():
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    # pass the config file explicitly, as in the later variant of this
    # function (Example #23); otherwise the file written above is unused
    m = PipelineMapping.FastqScreen(config_filename=conf_fn)
    statement = m.build((infiles,), outfile)
    P.run(statement, job_memory="8G")
    shutil.rmtree(tempdir)
    IOTools.touch_file(outfile)
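The --threads parsing above takes two regex passes (findall, then sub). A hedged sketch of the same logic with a single capturing pattern; parse_threads is an illustrative name, not part of the pipeline:

import re

def parse_threads(options):
    """Extract the value of a single --threads flag from an option string.

    Raises ValueError when the flag is missing or given more than once.
    """
    matches = re.findall(r'--threads\s+(\d+)', options)
    if len(matches) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")
    return int(matches[0])

assert parse_threads("--subset 100000 --threads 8") == 8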
Example #4
def buildPicardGCStats(infile, outfile, genome_file):
    """picard:CollectGCBiasMetrics
    Collect GC bias metrics.
    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    """

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectGcBiasMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    CHART_OUTPUT=%(outfile)s.pdf
    SUMMARY_OUTPUT=%(outfile)s.summary
    >& %(outfile)s'''

    P.run(statement)
Example #5
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.get_temp_filename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=P.get_params()["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s
        -o %(outfile)s > %(outfile)s.log
        '''
        P.run(statement)

    os.unlink(tmpfasta)
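If P.run() raises here, tmpfasta is never unlinked. A small, self-contained sketch of the same cleanup made exception-safe with try/finally, using the standard library in place of P.get_temp_filename:

import os
import tempfile

fd, tmpfasta = tempfile.mkstemp(suffix=".fa", dir=".")
os.close(fd)
try:
    pass  # write sequences and run BioProspector here
finally:
    # runs whether or not the command above failed
    os.unlink(tmpfasta)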
Example #6
def buildPicardRnaSeqMetrics(infiles, strand, outfile):
    '''run picard:CollectRnaSeqMetrics

    Arguments
    ---------
    infiles : list
        Input filename in :term:`BAM` format and genome file in
        refFlat format
        (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat).
    strand : string
        Strand specificity, passed through to the STRAND option.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3
    infile, genome = infiles

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectRnaSeqMetrics
    REF_FLAT=%(genome)s
    INPUT=%(infile)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    STRAND=%(strand)s
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run(statement)
Example #7
def loadGO(infile, outfile, tablename):
    """import GO results into individual tables.

    This method concatenates all the results from
    a GO analysis and uploads into a single table.

    """

    indir = infile + ".dir"

    if not os.path.exists(indir):
        IOTools.touch_file(outfile)
        return

    load_statement = P.build_load_statement(tablename=tablename,
                                            options="--allow-empty-file "
                                            "--add-index=category "
                                            "--add-index=goid ")

    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run(statement)
Example #8
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences* sequences. If fewer are
       available, all are used.
    4. At most *motifs_max_size* sequences will be output.

    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substitute_parameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.as_list(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
Example #9
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from UCSC database and save as a :term:`bed`
    formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    outfile : string
       Filename of output file in :term:`bed` format.
    '''

    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
    FROM %(table)s ORDER by chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc = dbhandle.execute(sql)
        # use a separate handle name so that outfile still refers to the
        # filename if the except branch needs to touch it
        with IOTools.open_file(outfile, "w") as outf:
            for data in cc.fetchall():
                outf.write("\t".join(map(str, data)) + "\n")
    except Exception:
        E.warn("Failed to connect to table %s. %s is empty" % (table, outfile))
        IOTools.touch_file(outfile)
Example #10
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s CollectInsertSizeMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run(statement, job_memory=PICARD_MEMORY)
Example #11
def buildPicardDuplicateStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard and keep the
    duplicate-marked :term:`bam` file. Pair duplication is properly
    handled, including inter-chromosomal cases; SE data is also
    handled. The metrics include a histogram that estimates the
    return from additional sequencing. Note that Picard reports
    "reads", but the counts are in fact alignments.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    statement = '''picard %(picard_opts)s MarkDuplicates
    INPUT=%(infile)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s.duplicate_metrics
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s.log &&
    samtools index %(outfile)s'''
    P.run(statement)
Example #12
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    if IOTools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        IOTools.touch_file(outfile)
        return

    # only create the temporary directory once we know there is work to do
    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "tomtom", outfile)

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results; create parent directories as needed
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
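The makedirs/rmtree/move sequence above is a publish-results pattern that several tasks share. A compact sketch of it as a helper (publish_results is an invented name, not a pipeline function):

import os
import shutil

def publish_results(tmpdir, target_path):
    # create parent directories, replace any previous results,
    # then move the temporary results directory into place
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)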
Example #13
def runGOFromDatabase(outfile,
                      outdir,
                      statement_fg,
                      statement_bg,
                      go_file,
                      ontology_file=None,
                      samples=1000):
    """check for GO enrichment.

    Gene lists are extracted from a database.
    This method is a wrapper for `runGO.py`.

    Arguments
    ---------
    outfile : string
        Output filename
    outdir : string
        Output directory for auxiliary files
    statement_fg : string
        SQL statement to select genes of foreground set.
    statement_bg : string
        SQL statement to select genes in background set.
    go_file : string
        Filename with Gene-to-GO assignments
    ontology_file : string
        Filename with ontology information.
    samples : int
        Number of samples for empirical FDR. If not given, use
        BH FDR.
    """

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    cc = dbhandle.cursor()
    fg = {x[0] for x in cc.execute(statement_fg).fetchall()}
    bg = {x[0] for x in cc.execute(statement_bg).fetchall()}

    if len(fg) == 0:
        IOTools.touch_file(outfile)
        return

    fg_file = os.path.join(outdir, "foreground")
    bg_file = os.path.join(outdir, "background")
    with open(fg_file, "w") as outf:
        outf.write("\n".join(map(str, fg)) + "\n")
    with open(bg_file, "w") as outf:
        outf.write("\n".join(map(str, bg)) + "\n")

    runGOFromFiles(outfile,
                   outdir,
                   fg_file,
                   bg_file,
                   go_file,
                   ontology_file=ontology_file,
                   samples=samples)
Example #14
    def test_touch_file_updates_existing_file(self):
        with IOTools.open_file(self.filename, "w") as outf:
            outf.write("some data\n")
        created = os.stat(self.filename).st_mtime
        # sleep so that the new mtime differs even on filesystems
        # with coarse (one-second) timestamp resolution
        time.sleep(1)
        IOTools.touch_file(self.filename)
        modified = os.stat(self.filename).st_mtime
        self.assertGreater(modified, created)
        with IOTools.open_file(self.filename) as inf:
            data = inf.read()
        self.assertEqual(data, "some data\n")
Example #15
    def test_touch_file_creates_empty_file(self):
        self.assertFalse(os.path.exists(self.filename))
        IOTools.touch_file(self.filename)
        self.assertTrue(os.path.exists(self.filename))
        if self.filename.endswith(".gz"):
            self.assertFalse(IOTools.is_empty(self.filename))
        else:
            self.assertTrue(IOTools.is_empty(self.filename))

        with IOTools.open_file(self.filename) as inf:
            data = inf.read()
        self.assertEqual(len(data), 0)
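The .gz branch reads as it does because, presumably, touch_file writes a valid (empty) gzip stream for filenames ending in .gz: such a file is non-empty on disk even though decompressing it yields nothing. A self-contained demonstration with the standard library:

import gzip
import os

with gzip.open("empty.txt.gz", "wb"):
    pass  # write no payload at all

print(os.path.getsize("empty.txt.gz"))   # > 0: gzip header and trailer
with gzip.open("empty.txt.gz", "rb") as inf:
    assert inf.read() == b""             # but the decompressed content is empty
os.remove("empty.txt.gz")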
Example #16
def summarizeFastqScreen(infiles, outfiles):
    '''concatenate fastq_screen results into summary and detail tables.'''
    all_files = []
    for infile in infiles:
        all_files.extend(glob.glob(IOTools.snip(infile, "screen") + "*_screen.txt"))
    if len(all_files) == 0:
        E.warn("no fastq_screen results to concatenate")
        for x in outfiles:
            IOTools.touch_file(x)
        return
    df_summary, df_details = PipelineReadqc.read_fastq_screen(
        all_files)
    df_summary.to_csv(outfiles[0], sep="\t", index=True)
    df_details.to_csv(outfiles[1], sep="\t", index=True)
Example #17
def buildPicardDuplicationStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard, the marked records
    are discarded.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    # currently, MarkDuplicates cannot handle split alignments from gsnap
    # these can be identified by the custom XT tag.
    if ".gsnap.bam" in infile:
        tmpf = P.get_temp_file(".")
        tmpfile_name = tmpf.name
        statement = '''samtools view -h %(infile)s
        | awk "!/\\tXT:/"
        | samtools view /dev/stdin -S -b > %(tmpfile_name)s;
        ''' % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    statement += '''picard %(picard_opts)s MarkDuplicates
    INPUT=%(data_source)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s
    OUTPUT=/dev/null
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run(statement)

    if ".gsnap.bam" in infile:
        os.unlink(tmpfile_name)
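The awk pre-filter drops any alignment whose SAM line carries an XT tag. For comparison, a hedged sketch of the same filter done in pysam (the pipeline itself shells out to awk; drop_xt_reads is an invented name and assumes pysam is installed):

import pysam

def drop_xt_reads(infile, outfile):
    # copy alignments, skipping reads that carry an XT tag
    with pysam.AlignmentFile(infile, "rb") as inf, \
         pysam.AlignmentFile(outfile, "wb", template=inf) as outf:
        for read in inf:
            if not read.has_tag("XT"):
                outf.write(read)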
Example #18
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the
    top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
        -mod %(meme_model)s
        -nmotifs %(meme_nmotifs)s
        -oc %(tmpdir)s
        -maxsize %(meme_max_size)s
        %(meme_options)s
        > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
Example #19
def buildIntronLevelReadCounts(infiles, outfile):
    '''count reads overlapping introns.

    Count the reads from a :term:`bam` file which overlap the
    positions of introns in a :term:`gtf` format transcripts file.

    Parameters
    ----------
    infiles : list of str
       infile :term:`str`
          Input filename in :term:`bam` format
       geneset :term:`str`
          Input filename in :term:`gtf` format
    outfile : str
       Output filename in :term:`tsv` format

    .. note::
       In paired-end data sets each mate will be counted. Thus
       the actual read counts are approximately twice the fragment
       counts.
    '''
    infile, exons = infiles

    job_memory = "4G"

    if "transcriptome.dir" in infile:
        IOTools.touch_file(outfile)
        return

    statement = '''
    zcat %(exons)s
    | awk -v OFS="\\t" -v FS="\\t" '{$3="exon"; print}'
    | cgat gtf2table
          --reporter=genes
          --bam-file=%(infile)s
          --counter=length
          --column-prefix="introns_"
          --counter=read-counts
          --column-prefix=""
          --counter=read-coverage
          --column-prefix=coverage_
    | gzip
    > %(outfile)s
    '''

    P.run(statement)
Example #20
def loadTomTom(infile, outfile):
    '''load tomtom results'''

    tablename = P.to_table(outfile)

    resultsdir = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                              infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        IOTools.touch_file(outfile)
        return

    # get the motif name from the xml file

    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.iter("motif"):
        name = motif.get("name")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.get_temp_file(".")

    # parse the text file
    for line in IOTools.open_file(infile):
        if line.startswith("#Query"):
            tmpfile.write(
                "target_name\tquery_id\ttarget_id\toptimal_offset\tpvalue\tevalue\tqvalue\tOverlap\tquery_consensus\ttarget_consensus\torientation\n"
            )
            continue
        data = line[:-1].split("\t")
        target_name = name2alt[data[1]]
        tmpfile.write("%s\t%s" % (target_name, line))
    tmpfile.close()

    P.load(tmpfile.name, outfile)

    os.unlink(tmpfile.name)
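The name-to-alt lookup built from tomtom.xml can be exercised standalone; the XML fragment below is invented for illustration and is not real tomtom output:

import xml.etree.ElementTree as ET

xml_text = '''
<tomtom>
  <targets>
    <motif name="m1" alt="motif_alpha"/>
    <motif name="m2" alt="motif_beta"/>
  </targets>
</tomtom>
'''

root = ET.fromstring(xml_text)
# iter() walks all <motif> elements below <targets>
name2alt = {m.get("name"): m.get("alt")
            for m in root.find("targets").iter("motif")}
print(name2alt)   # {'m1': 'motif_alpha', 'm2': 'motif_beta'}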
Example #21
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''run picard:CollectMultipleMetrics

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    # Picard seems to have problem if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitly.
    statement = '''cat %(infile)s
    | cgat bam2bam
    -v 0
    --method=set-sequence
    --output-sam
    --log=%(outfile)s.bam2bam.log
    | picard %(picard_opts)s CollectMultipleMetrics
    INPUT=/dev/stdin
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run(statement)
Example #22
def compute_file_metrics(infile, outfile, metric, suffixes):
    """apply a tool to compute metrics on all files matching the
    given suffixes."""

    if suffixes is None or len(suffixes) == 0:
        E.info("No metrics computed for {}".format(outfile))
        IOTools.touch_file(outfile)
        return

    track = P.snip(infile, ".log")

    # convert regex patterns to a suffix match:
    # prepend a .*
    # append a $
    regex_pattern = " -or ".join(
        ["-regex .*{}$".format(pipes.quote(x)) for x in suffixes])

    E.debug("applying metric {} to files matching {}".format(
        metric, regex_pattern))

    if metric == "file":
        statement = '''find %(track)s.dir
        -type f
        -not -regex '.*\/report.*'
        -not -regex '.*\/_.*'
        \( %(regex_pattern)s \)
        | sort -k1,1
        > %(outfile)s'''
    else:
        statement = '''find %(track)s.dir
        -type f
        -not -regex '.*\/report.*'
        -not -regex '.*\/_.*'
        \( %(regex_pattern)s \)
        -exec %(scriptsdir)s/cgat_file_apply.sh {} %(metric)s \;
        | perl -p -e "s/ +/\\t/g"
        | sort -k1,1
        > %(outfile)s'''

    P.run(statement)
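The -regex alternation above quotes each suffix with pipes.quote; shlex.quote is the modern equivalent. A self-contained sketch of the predicate construction (build_suffix_regex is an invented name):

import shlex

def build_suffix_regex(suffixes):
    # anchor each suffix as '.*suffix$' and join with find's -or
    return " -or ".join(
        "-regex .*{}$".format(shlex.quote(x)) for x in suffixes)

print(build_suffix_regex([".bam", ".bed.gz"]))
# -regex .*.bam$ -or -regex .*.bed.gz$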
Example #23
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # configure job_threads with fastq_screen_options from PARAMS
    job_threads = re.findall(r'--threads \d+', PARAMS['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))

    tempdir = P.get_temp_dir(".")
    conf_fn = os.path.join(tempdir, "fastq_screen.conf")
    with IOTools.open_file(conf_fn, "w") as f:
        for i, k in PARAMS.items():
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen(config_filename=conf_fn)
    statement = m.build((infiles,), outfile)
    P.run(statement, job_memory="8G")
    shutil.rmtree(tempdir)
    IOTools.touch_file(outfile)
Example #24
def plotDETagStats(infile, composition_file, outfile):
    '''plot differential expression statistics

    Arguments
    ---------
    infile : string
        Filename with :term:`tsv` formatted list of differential
        methylation results output from :doc:`scripts/runExpression`.
    composition_file : string
        Filename with :term:`tsv` formatted data about nucleotide
        compositions of windows tested.
    outfile : string
        Output filename, used as sentinel only.
    '''

    Expression.plotDETagStats(infile,
                              outfile,
                              additional_file=composition_file,
                              join_columns=("contig", "start", "end"),
                              additional_columns=("CpG_density", "length"))

    IOTools.touch_file(outfile)
Example #25
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used.  Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
        return

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(motifs_max_size)s
    %(meme_options)s
       > %(outfile)s.log
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
Example #26
def buildDiscoverySequences(infile, outfile, npeaks, width, masker):
    '''get the peak sequences, masked or unmasked as specified
    in the ini file.
    '''

    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=[masker],
        halfwidth=width,
        maxsize=int(PARAMS["motifs_max_size"]),
        proportion=None,
        num_sequences=npeaks,
        order='peakval')

    if nseq == 0:
        E.warn("%s: no sequences in foreground" % outfile)
        IOTools.touch_file(outfile)
Example #27
def buildSpikeResults(infile, outfile):
    '''build matrices with results from spike-in and upload
    into database.

    The method will output several files:

    .spiked.gz: Number of intervals that have been spiked-in
               for each bin of expression and fold-change

    .power.gz: Global power analysis - aggregates over all
        ranges of fold-change and expression and outputs the
        power, the proportion of intervals overall that
        could be detected as differentially methylated.

        This is a table with the following columns:

        fdr - fdr threshold
        power - power level, number of intervals detectable
        intervals - number of intervals in observed data at given
                    level of fdr and power.
        intervals_percent - percentage of intervals in observed data
              at given level of fdr and power

    The method will also upload the results into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    outfile : string
        Output filename in :term:`tsv` format.

    '''

    expression_nbins = 10
    fold_nbins = 10

    spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz'

    if not os.path.exists(spikefile):
        E.warn('no spike data: %s' % spikefile)
        IOTools.touch_file(outfile)
        return

    ########################################
    # output and load spiked results
    tmpfile_name = P.get_temp_filename(shared=True)

    statement = '''zcat %(spikefile)s
    | grep -e "^spike" -e "^test_id"
    > %(tmpfile_name)s
    '''
    P.run(statement)

    E.debug("outputting spiked counts")
    (spiked, spiked_d2hist_counts, xedges, yedges,
     spiked_l10average, spiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".spiked.gz",
            infile_name=tmpfile_name,
            expression_nbins=expression_nbins,
            fold_nbins=fold_nbins)

    ########################################
    # output and load unspiked results
    statement = '''zcat %(infile)s
    | grep -v -e "^spike"
    > %(tmpfile_name)s
    '''
    P.run(statement)
    E.debug("outputting unspiked counts")

    (unspiked, unspiked_d2hist_counts, unspiked_xedges,
     unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz",
            infile_name=tmpfile_name,
            expression_bins=xedges,
            fold_bins=yedges)

    E.debug("computing power")

    assert numpy.array_equal(xedges, unspiked_xedges)

    tmpfile = IOTools.open_file(tmpfile_name, "w")
    tmpfile.write("\t".join(("expression", "fold", "fdr", "counts",
                             "percent")) + "\n")

    fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1))
    power_thresholds = numpy.arange(0.1, 1.1, 0.1)

    spiked_total = float(spiked_d2hist_counts.sum().sum())
    unspiked_total = float(unspiked_d2hist_counts.sum().sum())

    outf = IOTools.open_file(outfile, "w")
    outf.write("fdr\tpower\tintervals\tintervals_percent\n")

    # significant results
    for fdr in fdr_thresholds:
        take = spiked['qvalue'] < fdr

        # compute 2D histogram in spiked data below fdr threshold
        spiked_d2hist_fdr, xedges, yedges = \
            numpy.histogram2d(spiked_l10average[take],
                              spiked_l2fold[take],
                              bins=(xedges, yedges))

        # convert to the proportion of spike-ins recovered per bin
        # (written out as percentages further below)
        spiked_d2hist_fdr_normed = spiked_d2hist_fdr / spiked_d2hist_counts
        spiked_d2hist_fdr_normed = numpy.nan_to_num(spiked_d2hist_fdr_normed)

        # set values without data to -1
        spiked_d2hist_fdr_normed[spiked_d2hist_counts == 0] = -1.0

        # output to table for database upload
        for x, y in itertools.product(list(range(len(xedges) - 1)),
                                      list(range(len(yedges) - 1))):
            tmpfile.write("\t".join(
                map(str, (xedges[x], yedges[y], fdr, spiked_d2hist_fdr[x, y],
                          100.0 * spiked_d2hist_fdr_normed[x, y]))) + "\n")

        # take elements in spiked_hist_fdr above a certain threshold
        for power in power_thresholds:
            # select 2D bins at a given power level
            power_take = spiked_d2hist_fdr_normed >= power

            # select the counts in the unspiked data according
            # to this level
            power_counts = unspiked_d2hist_counts[power_take]

            outf.write("\t".join(
                map(str, (fdr, power, power_counts.sum().sum(), 100.0 *
                          power_counts.sum().sum() / unspiked_total))) + "\n")

    tmpfile.close()
    outf.close()

    # upload into table
    method = P.snip(os.path.dirname(outfile), ".dir")
    tablename = P.to_table(
        P.snip(outfile, "power.gz") + method + ".spike.load")

    P.load(tmpfile_name,
           outfile + ".log",
           tablename=tablename,
           options="--add-index=fdr")

    os.unlink(tmpfile_name)
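The power computation boils down to: bin intervals in 2D (expression x fold-change), estimate the per-bin spike-in recovery rate below an FDR cut-off, and count how many observed intervals fall in bins whose recovery reaches a given power level. A toy numpy sketch of that core step, with invented data standing in for the spiked and unspiked results:

import numpy

rng = numpy.random.default_rng(0)
expression = rng.normal(size=1000)
fold = rng.normal(size=1000)

counts, xedges, yedges = numpy.histogram2d(expression, fold, bins=(10, 10))

# stand-in for the spiked counts passing the fdr threshold in each bin
recovered = counts * rng.uniform(size=counts.shape)
# recovery rate per bin; bins without data are set to -1 as above
rate = numpy.divide(recovered, counts,
                    out=numpy.full_like(counts, -1.0), where=counts > 0)

for power in (0.5, 0.8):
    detectable = counts[rate >= power].sum()
    print(power, 100.0 * detectable / counts.sum())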
Example #28
def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase
      "pseudo". This information is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding
    transcripts.

    Arguments
    ---------
    infiles : list
       Filenames of ENSEMBL geneset in :term:`gtf` format
       and associated peptide sequences in :term:`fasta` format.
    outfile : filename
       Output in :term:`gtf` format with inferred or annotated
       pseudogenes.
    dbhandle : object
       Database handle for extracting transcript biotypes.
    '''

    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = """zcat %(infile_gtf)s
        |awk '$2 ~ /pseudogene/'
        | gzip
        > %(outfile)s"""
        P.run(statement)
        return

    tmpfile1 = P.get_temp_filename(shared=True)

    # collect processed transcripts and save as fasta sequences
    statement = '''
    zcat %(infile_gtf)s
    | awk '$2 ~ /processed/'
    | cgat gff2fasta
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''

    P.run(statement)

    if IOTools.is_empty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        IOTools.touch_file(outfile)
        return

    model = "protein2dna"

    # map processed transcripts against peptide sequences
    statement = '''
    cat %(tmpfile1)s
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=100
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model %(model)s
              --bestn 1
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\"
              --showalignment no --showsugar no --showcigar no --showvulgar no
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run(statement)

    os.unlink(tmpfile1)

    inf = IOTools.open_file("%s.links.gz" % outfile)
    best_matches = {}
    for line in inf:
        peptide_id, transcript_id, score = line[:-1].split("\t")
        score = int(score)
        if transcript_id in best_matches and \
           best_matches[transcript_id][0] > score:
            continue
        best_matches[transcript_id] = (score, peptide_id)

    inf.close()

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    known_pseudos = set([
        x[0] for x in cc.execute("""SELECT DISTINCT transcript_id
        FROM transcript_info
        WHERE transcript_biotype like '%pseudo%' OR
        gene_biotype like '%pseudo%' """)
    ])

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" %
           ((len(new_pseudos), len(known_pseudos),
             len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    outf = IOTools.open_file(outfile, "w")
    inf = GTF.iterator(IOTools.open_file(infile_gtf))
    for gtf in inf:
        c.input += 1
        if gtf.transcript_id not in all_pseudos:
            continue
        c.output += 1
        outf.write("%s\n" % gtf)
    outf.close()

    E.info("exons: %s" % str(c))
Example #29
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against the genome using exonerate_. This can take a while.

    Arguments
    ---------
    infile : string
       Ignored.
    outfile : filename
       Output in :term:`gtf` format with potential NUMTs.

    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        IOTools.touch_file(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" % PARAMS["numts_mitochrom"])
        IOTools.touch_file(outfile)
        return

    tmpfile_mito = P.get_temp_filename(".")

    statement = '''
    cgat index_fasta
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''

    P.run(statement)

    if IOTools.is_empty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        IOTools.touch_file(outfile)
        return

    format = ("qi", "qS", "qab", "qae", "ti", "tS", "tab", "tae", "s", "pi",
              "C")

    format = "\\\\t".join(["%%%s" % x for x in format])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=1
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no
              --showvulgar no
              --ryo \\"%(format)s\\n\\"
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run(statement)

    # convert to gtf
    inf = IOTools.open_file("%s.links.gz" % outfile)
    outf = IOTools.open_file(outfile, "w")

    min_score = PARAMS["numts_score"]

    c = E.Counter()

    for line in inf:
        (query_contig, query_strand, query_start, query_end, target_contig,
         target_strand, target_start, target_end, score, pid,
         alignment) = line[:-1].split("\t")

        c.input += 1
        score = int(score)
        if score < min_score:
            c.skipped += 1
            continue

        if target_strand == "-":
            target_start, target_end = target_end, target_start

        gff = GTF.Entry()
        gff.contig = target_contig
        gff.start, gff.end = int(target_start), int(target_end)
        assert gff.start < gff.end

        gff.strand = target_strand
        gff.score = int(score)
        gff.feature = "numts"
        gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        gff.transcript_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        outf.write("%s\n" % str(gff))
        c.output += 1

    inf.close()
    outf.close()

    E.info("filtering numts: %s" % str(c))

    os.unlink(tmpfile_mito)
Example #30
def makeAdaptorFasta(infile, outfile, track, dbh, contaminants_file):
    '''Generate a .fasta file of adaptor sequences that are
    overrepresented in the reads from a sample.

    Requires cutadapt >= 1.7.

    Arguments
    ---------
    infile : string
        Input filename that has been QC'ed. The filename is used to
        check if the input was a :term:`sra` file and guess the
        number of tracks to check.
    outfile : string
        Output filename in :term:`fasta` format.
    track : string
        Track name, used to access FastQC results in database.
    dbh : object
        Database handle.
    contaminants_file : string
        Path of file containing contaminants used for screening by
        Fastqc.

    '''
    tracks = [track]

    if infile.endswith(".sra"):
        # patch for SRA files, look at multiple tracks
        f, fastq_format, datatype = Sra.peek(infile)
        if len(f) == 2:
            tracks = [track + "_fastq_1", track + "_fastq_2"]
    elif infile.endswith(".fastq.1.gz"):
        tracks = [track + "_fastq_1", track + "_fastq_2"]
    elif infile.endswith(".fastq.gz"):
        tracks = [track]

    found_contaminants = []

    for t in tracks:
        table = PipelineTracks.AutoSample(os.path.basename(t)).asTable()

        # if sample name starts with a number, sql table will have
        # prepended "_"
        if re.match(r"^\d+.*", table):
            table = "_" + table

        query = '''SELECT Possible_Source, Sequence FROM
        %s_fastqc_Overrepresented_sequences;''' % table

        cc = dbh.cursor()

        # if there is no contamination table for even a single sample
        # it will prevent the whole pipeline progressing
        try:
            found_contaminants.extend(cc.execute(query).fetchall())
        except sqlite3.OperationalError:
            E.warn("No table found for {}".format(t))

    if len(found_contaminants) == 0:
        IOTools.touch_file(outfile)
        return

    # read contaminants from existing file
    with IOTools.open_file(contaminants_file, "r") as inf:
        known_contaminants = [l.split() for l in inf
                              if not l.startswith("#") and l.strip()]
        known_contaminants = {" ".join(x[:-1]): x[-1]
                              for x in known_contaminants}

    # output the full sequence of the contaminant if found
    # in the list of known contaminants, otherwise don't report!

    matched_contaminants = set()
    with IOTools.open_file(outfile, "w") as outf:
        for found_source, found_seq in found_contaminants:
            possible_source = found_source.split(" (")[0]

            if possible_source in known_contaminants:
                matched_contaminants.add(possible_source)

        if len(matched_contaminants) > 0:
            for match in matched_contaminants:
                outf.write(">%s\n%s\n" % (match.replace(" ,", ""),
                                          known_contaminants[match]))