Example 1
def checkFileExistence(infile, outfile):
    '''check whether files exist.

    Files are uncompressed before checking existence.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(infile,
                         outfile,
                         metric="file",
                         suffixes=P.as_list(
                             PARAMS.get('%s_regex_exist' % track, "")))
Example 2
def buildCheckSums(infile, outfile):
    '''build checksums for files in the build directory.

    Files are uncompressed before computing the checksum
    as gzip stores meta information such as the time stamp.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(infile,
                         outfile,
                         metric="md5sum",
                         suffixes=P.as_list(
                             PARAMS.get('%s_regex_md5' % track, "")))
Example 3
def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''
    track = P.snip(infile, ".log")
    compute_file_metrics(infile,
                         outfile,
                         metric="wc -l",
                         suffixes=P.as_list(
                             PARAMS.get('%s_regex_linecount' % track, "")))
Example 4
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against known motif databases using tomtom.'''

    if IOTools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(PARAMS["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                               outfile)

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
Example 5
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences* sequences. If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substitute_parameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.as_list(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
Example 6
def run_test(infile, outfile):
    '''run a test.

    Multiple targets are run iteratively.
    '''

    track = P.snip(outfile, ".log")
    pipeline_name = PARAMS.get("%s_pipeline" % track, track[len("test_"):])

    pipeline_targets = P.as_list(PARAMS.get("%s_target" % track, "full"))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    #to_cluster = False

    template_statement = ("cd %%(track)s.dir; "
                          "xvfb-run -d cgatflow %%(pipeline_name)s "
                          "%%(pipeline_options)s "
                          "%%(workflow_options)s make %s "
                          "-L ../%%(outfile)s "
                          "-S ../%%(outfile)s.stdout "
                          "-E ../%%(outfile)s.stderr")

    if len(pipeline_targets) == 1:
        statement = template_statement % pipeline_targets[0]
        P.run(statement, ignore_errors=True, job_memory="unlimited")
    else:
        statements = []
        for pipeline_target in pipeline_targets:
            statements.append(template_statement % pipeline_target)
        P.run(statements, ignore_errors=True, job_memory="unlimited")
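The doubled percent signs in template_statement above are an escaping trick: the first, plain %-formatting pass fills in the pipeline target while leaving the %(...)s placeholders intact, and those are substituted afterwards by P.run's parameter interpolation from the caller's variables. A small sketch of the first pass:

# Sketch of the double-"%" escaping used in template_statement above.
template = ("cd %%(track)s.dir; "
            "cgatflow %%(pipeline_name)s make %s -L ../%%(outfile)s")
statement = template % "full"          # first pass fills in the target ...
print(statement)
# cd %(track)s.dir; cgatflow %(pipeline_name)s make full -L ../%(outfile)s
# ... the remaining %(...)s placeholders are filled in later when the
# statement is handed to P.run().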
Example 7
def importRNAAnnotationFromUCSC(outfile):
    """This task downloads UCSC repetitive RNA types.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(
        dbhandle=connectToUCSC(),
        repclasses=P.as_list(PARAMS["ucsc_rnatypes"]),
        outfile=outfile,
        remove_contigs_regex=PARAMS["ncbi_remove_contigs"],
        job_memory=PARAMS["job_memory"])
Example 8
def importRepeatsFromUCSC(outfile):
    """This task downloads UCSC repeat types as identified
    in the configuration file.
    """
    PipelineGtfsubset.getRepeatDataFromUCSC(dbhandle=connectToUCSC(),
                                            repclasses=P.as_list(
                                                PARAMS["ucsc_repeattypes"]),
                                            outfile=outfile,
                                            job_memory=PARAMS["job_memory"])
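Both UCSC import tasks above rely on P.as_list to turn the comma-separated class list from the configuration into a Python list. A minimal sketch of that behaviour, using a simplified stand-in rather than the real cgatcore function:

def as_list(value):
    # Simplified, hypothetical stand-in for P.as_list, for illustration only.
    if isinstance(value, (list, tuple)):
        return list(value)
    if value is None or value == "":
        return []
    return [x.strip() for x in str(value).split(",") if x.strip()]

print(as_list("LINE, SINE, LTR"))      # ['LINE', 'SINE', 'LTR']
print(as_list(as_list("LINE,SINE")))   # ['LINE', 'SINE'] - lists pass through unchanged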
Example 9
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is
    not run on all intervals; only the top fraction of
    intervals ranked by peakval (*meme_proportion*) is used.
    Also, only the segment of +/- *meme_halfwidth* bp around
    the peak is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
Example 10
def exportIntervalSequences(infile, outfile, track, method):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences* sequences. If there are not enough
       sequences to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    '''
    dbhandle = connect()

    try:
        halfwidth = int(PARAMS[method+"_halfwidth"])
        full = False
    except ValueError:
        full = True
        halfwidth = None

    try:
        maxsize = int(PARAMS[method+"_max_size"])
    except ValueError:
        maxsize = None

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=full,
        masker=P.as_list(PARAMS[method+'_masker']),
        halfwidth=halfwidth,
        maxsize=maxsize,
        num_sequences=PARAMS[method+"_num_sequences"],
        proportion=PARAMS[method+"_proportion"],
        min_sequences=PARAMS[method+"_min_sequences"],
        order=PARAMS[method+'_score'])

    if nseq == 0:
        E.warn("%s: no sequences - %s skipped" % (outfile, method))
        P.touch(outfile)
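The try/except blocks above let a configuration leave *_halfwidth or *_max_size unset: a non-numeric value makes int() raise ValueError, which is taken to mean "use the full interval" or "no size limit". A small sketch of that fallback, using a hypothetical helper name:

def parse_halfwidth(raw):
    # Hypothetical helper mirroring the ValueError fallback above.
    try:
        return int(raw), False   # numeric -> window of +/- halfwidth, full=False
    except (ValueError, TypeError):
        return None, True        # empty or missing -> export the full interval

print(parse_halfwidth("200"))  # (200, False)
print(parse_halfwidth(""))     # (None, True)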
Example 11
def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for ``track.bam`` file in the
    current directory and returns an offset of 0.

    Associations can be defined in the .yml file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets
    need to be defined in the [offsets] sections. If no offsets
    are defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    returns a list of BAM files and offsets.

    Default tracks and offsets can be specified using a placeholder ``%``. The
    following will associate all tracks with the same bam file::

        [bams]
        %=all.bam

    '''
    fn = track.asFile()
    bamfiles = glob.glob("%s.bam" % fn)

    if bamfiles == []:
        if "bams_%s" % fn.lower() in PARAMS:
            for ff in P.as_list(PARAMS["bams_%s" % fn.lower()]):
                bamfiles.extend(glob.glob(ff))
        else:
            for pattern, value in P.CONFIG.items("bams"):
                if "%" in pattern:
                    p = re.sub("%", r"\S+", pattern)
                    if re.search(p, fn, re.IGNORECASE):
                        bamfiles.extend(glob.glob(value))

    offsets = []
    if "offsets_%s" % fn.lower() in PARAMS:
        offsets = list(map(int, P.as_list(PARAMS["offsets_%s" % fn.lower()])))
    else:
        for pattern, value in P.CONFIG.items("offsets"):
            if "%" in pattern:
                p = re.sub("%", r"\S+", pattern)
                if re.search(p, fn, re.IGNORECASE):
                    offsets.extend(list(map(int, value.split(","))))

    if offsets == []:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError("number of BAM files %s is not the "
                         "same as number of offsets: %s" %
                         (str(bamfiles), str(offsets)))

    return bamfiles, offsets
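The "%" placeholder described in the docstring is resolved by turning the configuration key into a regular expression and matching it against the track name, exactly as in the loop above. A standalone sketch with made-up [bams] entries:

import re

# Made-up [bams] entries, for illustration of the "%" placeholder matching only.
config_bams = {"%": "all.bam", "liver%": "liver_merged.bam"}

def match_bams(track_name):
    hits = []
    for pattern, value in config_bams.items():
        if "%" in pattern:
            regex = re.sub("%", r"\S+", pattern)  # "%" acts as a wildcard
            if re.search(regex, track_name, re.IGNORECASE):
                hits.append(value)
    return hits

print(match_bams("liver-R1"))  # ['all.bam', 'liver_merged.bam']
print(match_bams("brain-R1"))  # ['all.bam']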
Example 12
def compareCheckSums(infiles, outfile):
    '''compare checksum files against existing reference data.
    '''

    outf = IOTools.open_file(outfile, "w")
    outf.write("\t".join((
        ("track", "status", "job_finished", "nfiles", "nref", "missing",
         "extra", "different", "different_md5", "different_lines", "same",
         "same_md5", "same_lines", "same_exist", "files_missing",
         "files_extra", "files_different_md5", "files_different_lines"))) +
               "\n")

    for infile in infiles:
        E.info("working on {}".format(infile))
        track = P.snip(infile, ".stats")

        logfiles = glob.glob(track + "*.log")
        job_finished = True
        for logfile in logfiles:
            is_complete = IOTools.is_complete(logfile)
            E.debug("logcheck: {} = {}".format(logfile, is_complete))
            job_finished = job_finished and is_complete

        reffile = track + ".ref"

        # regular expression of files to test only for existence
        regex_exist = PARAMS.get('%s_regex_exist' % track, None)
        if regex_exist:
            regex_exist = re.compile("|".join(P.as_list(regex_exist)))

        regex_linecount = PARAMS.get('%s_regex_linecount' % track, None)
        if regex_linecount:
            regex_linecount = re.compile("|".join(P.as_list(regex_linecount)))

        regex_md5 = PARAMS.get('%s_regex_md5' % track, None)
        if regex_md5:
            regex_md5 = re.compile("|".join(P.as_list(regex_md5)))

        if not os.path.exists(reffile):
            raise ValueError('no reference data defined for %s' % track)

        cmp_data = pandas.read_csv(IOTools.open_file(infile),
                                   sep="\t",
                                   index_col=0)

        ref_data = pandas.read_csv(IOTools.open_file(reffile),
                                   sep="\t",
                                   index_col=0)

        shared_files = set(cmp_data.index).intersection(ref_data.index)
        missing = set(ref_data.index).difference(cmp_data.index)
        extra = set(cmp_data.index).difference(ref_data.index)

        different = set(shared_files)

        # remove those for which only check for existence
        if regex_exist:
            same_exist = set([x for x in different if regex_exist.search(x)])

            different = set(
                [x for x in different if not regex_exist.search(x)])
        else:
            same_exist = set()

        # select those for which only check for number of lines
        if regex_linecount:
            check_lines = [x for x in different if regex_linecount.search(x)]

            dd = (cmp_data['nlines'][check_lines] !=
                  ref_data['nlines'][check_lines])
            different_lines = set(dd.index[dd])
            different = different.difference(check_lines)

            dd = (cmp_data['nlines'][check_lines] ==
                  ref_data['nlines'][check_lines])
            same_lines = set(dd.index[dd])

        else:
            different_lines = set()
            same_lines = set()

        # remainder - check md5
        if regex_md5:
            check_md5 = [x for x in different if regex_md5.search(x)]

            dd = (cmp_data['md5'][check_md5] != ref_data['md5'][check_md5])
            different_md5 = set(dd.index[dd])

            dd = (cmp_data['md5'][check_md5] == ref_data['md5'][check_md5])
            same_md5 = set(dd.index[dd])

        else:
            different_md5 = set()
            same_md5 = set()

        if job_finished and (len(missing) + len(extra) + len(different_md5) +
                             len(different_lines) == 0):
            status = "OK"
        else:
            status = "FAIL"

        outf.write("\t".join(
            map(str, (
                track,
                status,
                job_finished,
                len(cmp_data),
                len(ref_data),
                len(missing),
                len(extra),
                len(different_md5) + len(different_lines),
                len(different_md5),
                len(different_lines),
                len(same_md5) + len(same_lines) + len(same_exist),
                len(same_md5),
                len(same_lines),
                len(same_exist),
                ",".join(missing),
                ",".join(extra),
                ",".join(different_md5),
                ",".join(different_lines),
            ))) + "\n")

    outf.close()
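The heart of the comparison above is partitioning the files shared between the run and the reference into three groups, existence-only, line-count and md5, driven by the per-track regular expressions. A toy illustration of that set arithmetic:

import re

# Toy file names and regexes mirroring the partitioning logic above.
shared = {"report.html", "counts.tsv", "results.bam"}
regex_exist = re.compile(r"\.html$")
regex_linecount = re.compile(r"\.tsv$")
regex_md5 = re.compile(r"\.bam$")

different = set(shared)
same_exist = {f for f in different if regex_exist.search(f)}
different -= same_exist
check_lines = {f for f in different if regex_linecount.search(f)}
different -= check_lines
check_md5 = {f for f in different if regex_md5.search(f)}

print(same_exist)   # {'report.html'} - only checked for existence
print(check_lines)  # {'counts.tsv'}  - compared by line count
print(check_md5)    # {'results.bam'} - compared by md5 checksum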
Example 13
PARAMS = P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml",
    "pipeline.yml"
])

# WARNING: pipeline names containing underscores are not allowed
TESTS = sorted(
    set([
        "test_{}".format(x.split("_")[1]) for x in PARAMS.keys()
        if x.startswith("test_")
    ]))
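# The comprehension above keeps only the first underscore-delimited component
# after "test_", so hypothetical keys such as
#
#     ["test_mapping_target", "test_annotations_pipeline", "report_engine"]
#
# yield TESTS == ["test_annotations", "test_mapping"]; a key like
# "test_rna_seq_target" would be truncated to "test_rna", which is why
# pipeline names containing underscores are not allowed.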


# obtain prerequisite generic data
@files([(None, "%s.tgz" % x)
        for x in P.as_list(PARAMS.get("prerequisites", ""))])
def setupPrerequisites(infile, outfile):
    '''setup pre-requisites.

    These are tar-balls that are unpacked, but not run.
    '''

    #to_cluster = False
    track = P.snip(outfile, ".tgz")

    # obtain data - should overwrite pipeline.yml file
    statement = '''
    wget --no-check-certificate -O %(track)s.tgz %(data_url)s/%(track)s.tgz'''
    P.run(statement)

    # unpack the downloaded tar-ball into the working directory
    tf = tarfile.open(outfile)
    tf.extractall()
    tf.close()
Example 14
    def processReads(infile, outfiles):
        '''process reads from .fastq and other sequence files.
        '''
        trimmomatic_options = PARAMS["trimmomatic_options"]

        if PARAMS["auto_remove"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                "contaminants.fasta", PARAMS["trimmomatic_mismatches"],
                PARAMS["trimmomatic_p_thresh"], PARAMS["trimmomatic_c_thresh"],
                PARAMS["trimmomatic_min_adapter_len"],
                PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options

        elif PARAMS["trimmomatic_adapter"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                PARAMS["trimmomatic_adapter"],
                PARAMS["trimmomatic_mismatches"],
                PARAMS["trimmomatic_p_thresh"], PARAMS["trimmomatic_c_thresh"],
                PARAMS["trimmomatic_min_adapter_len"],
                PARAMS["trimmomatic_keep_both_reads"]) + trimmomatic_options

        job_threads = PARAMS["threads"]
        job_memory = "12G"

        track = re.match(REGEX_TRACK, infile).groups()[0]

        m = PipelinePreprocess.MasterProcessor(
            save=PARAMS["save"],
            summarize=PARAMS["summarize"],
            threads=PARAMS["threads"],
            qual_format=PARAMS['qual_format'])

        for tool in P.as_list(PARAMS["preprocessors"]):

            if tool == "fastx_trimmer":
                m.add(
                    PipelinePreprocess.FastxTrimmer(
                        PARAMS["fastx_trimmer_options"],
                        threads=PARAMS["threads"]))
            elif tool == "trimmomatic":
                m.add(
                    PipelinePreprocess.Trimmomatic(trimmomatic_options,
                                                   threads=PARAMS["threads"]))
            elif tool == "sickle":
                m.add(
                    PipelinePreprocess.Sickle(PARAMS["sickle_options"],
                                              threads=PARAMS["threads"]))
            elif tool == "trimgalore":
                m.add(
                    PipelinePreprocess.Trimgalore(PARAMS["trimgalore_options"],
                                                  threads=PARAMS["threads"]))
            elif tool == "flash":
                m.add(
                    PipelinePreprocess.Flash(PARAMS["flash_options"],
                                             threads=PARAMS["threads"]))
            elif tool == "reversecomplement":
                m.add(
                    PipelinePreprocess.ReverseComplement(
                        PARAMS["reversecomplement_options"]))
            elif tool == "pandaseq":
                m.add(
                    PipelinePreprocess.Pandaseq(PARAMS["pandaseq_options"],
                                                threads=PARAMS["threads"]))
            elif tool == "cutadapt":
                cutadapt_options = PARAMS["cutadapt_options"]
                if PARAMS["auto_remove"]:
                    cutadapt_options += " -a file:contaminants.fasta "
                m.add(
                    PipelinePreprocess.Cutadapt(
                        cutadapt_options,
                        threads=PARAMS["threads"],
                        untrimmed=PARAMS['cutadapt_reroute_untrimmed'],
                        process_paired=PARAMS["cutadapt_process_paired"]))
            else:
                raise NotImplementedError("tool '%s' not implemented" % tool)

        statement = m.build((infile, ), "processed.dir/trimmed-", track)
        P.run(statement)
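The loop above follows a simple chaining pattern: each tool wrapper is registered on a master processor, which then emits a single shell statement that pipes the reads through every step. The sketch below uses hypothetical stand-in classes, not the real PipelinePreprocess API, to show the idea:

# Stripped-down mock of the processor-chaining pattern above (hypothetical API).
class MockTool:
    def __init__(self, name, options=""):
        self.name, self.options = name, options

    def build(self, infile, outfile):
        return "%s %s %s > %s" % (self.name, self.options, infile, outfile)

class MockMasterProcessor:
    def __init__(self):
        self.tools = []

    def add(self, tool):
        self.tools.append(tool)

    def build(self, infile, prefix, track):
        statements, current = [], infile
        for step, tool in enumerate(self.tools):
            outfile = "%s%s.step%i" % (prefix, track, step)
            statements.append(tool.build(current, outfile))
            current = outfile
        return " && ".join(statements)

m = MockMasterProcessor()
m.add(MockTool("trimmomatic", "ILLUMINACLIP:adapters.fa:2:30:10"))
m.add(MockTool("cutadapt", "-a file:contaminants.fasta"))
print(m.build("sample.fastq.gz", "processed.dir/trimmed-", "sample"))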
Example 15
           regex(".*/(.*).bed.gz"),
           r"motifs/\1.control.fasta")
def exportMotifControlSequences(infile, outfile):
    '''for each interval, export the left and right
    sequence segment of the same size.
    '''
    PipelineMotifs.exportSequencesFromBedFile(
        infile, outfile,
        masker=PARAMS['motifs_masker'],
        mode="leftright")


############################################################
############################################################
############################################################
@active_if("meme" in P.as_list(PARAMS["methods"]) or
           "disc_meme" in P.as_list(PARAMS["methods"]))
@transform(loadIntervals,
           suffix("_intervals.load"),
           ".meme.fasta")
def exportMemeIntervalSequences(infile, outfile):
    '''export interval sequences for MEME motif discovery.'''
    track = os.path.basename(P.snip(infile, "_intervals.load"))

    exportIntervalSequences(infile, outfile, track, "meme")


############################################################
@follows(mkdir("meme.dir"))
@active_if("meme" in P.as_list(PARAMS["methods"]))
@transform(exportMemeIntervalSequences, regex("(.+).meme.fasta"),
Example 16
PARAMS = P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0], "../pipeline.yml",
    "pipeline.yml"
])

dbname = PARAMS['db_name']
unmapped = PipelineEnrichment.getUnmapped(PARAMS)
outfilesuffixes = [
    "_genestoterms.tsv", "_termstogenes.tsv", "_termstodetails.tsv",
    "_termstoont.tsv"
]

unmappedouts = [["annotations.dir/%s%s" % (u, s) for s in outfilesuffixes]
                for u in unmapped]

hpatissues = P.as_list(PARAMS.get('hpa_tissue', {}))
hpatissues = [
    'clean_backgrounds.dir/%s_hpa_background.tsv' % tissue.replace(" ", "_")
    for tissue in hpatissues
]

########################################################
# Set up database connection
########################################################


def connect():
    '''utility function to connect to database.

    Use this method to connect to the pipeline database.
    Additional databases can be attached here as well.