Code example #1
File: motifs.py Project: kevinrue/cgat-flow
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "tomtom", outfile)

    if iotools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        iotools.touch_file(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
Code example #2
def create_files(outfile):

    E.debug("creating output file {}".format(outfile))
    with open(outfile, "w") as outf:
        outf.write("\n".join(
            map(
                str,
                numpy.random.normal(P.get_params()["mu"],
                                    P.get_params()["sigma"],
                                    P.get_params()["num_samples"]))) + "\n")
Code example #3
File: motifs.py Project: tw7649116/cgat-flow
def runGLAM2(infile, outfile, dbhandle):
    '''run glam2 on all intervals and motifs.

    In order to increase the signal/noise ratio, GLAM2 is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used.  Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    '''
    to_cluster = True

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "glam2", outfile)
    track = infile[:-len(".fasta")]

    tmpdir = tempfile.mkdtemp()
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = motifs.writeSequencesForIntervals(
        track, tmpfasta,
        dbhandle,
        full=False,
        halfwidth=int(
            P.get_params()["meme_halfwidth"]),
        maxsize=int(
            P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"])

    min_sequences = int(nseq / 10.0)
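    # glam2 invocation (an assumption based on glam2's command line):
    # -2 examines both strands, -O sets the output directory, -z sets
    # the minimum number of sequences in the alignment, and 'n'
    # selects the nucleotide alphabet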
    statement = '''
    %(execglam2)s -2 -O %(tmpdir)s %(glam2_options)s -z %(min_sequences)i n %(tmpfasta)s > %(outfile)s.log
    '''
    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "glam2.txt"), outfile)
Code example #4
def reconcileReads(infile, outfile):
    if P.get_params()["reconcile"] == 1:
        in1 = infile
        in2 = infile.replace(".fastq.1.gz", ".fastq.2.gz")
        outfile = outfile.replace(".fastq.1.gz",  "")
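        # %%s is doubled so that P.run's parameter interpolation leaves
        # a literal %s in the output pattern, which fastqs2fastqs
        # presumably fills with the read number (1 or 2)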

        statement = """cgat fastqs2fastqs
            --method=reconcile
            --output-filename-pattern=%(outfile)s.fastq.%%s.gz
            %(in1)s %(in2)s"""

        P.run(statement,
              job_threads=P.get_params()["threads"],
              job_memory="8G")
Code example #5
def main(argv=None):
    workflow_options = []
    if "--local" in argv:
        workflow_options.append("--local")
    workflow_options.append("-p {}".format(
        P.get_params()["cluster"]["num_jobs"]))

    P.get_params()["workflow_options"] == "".join(workflow_options)
    # manually set location of test scripts - this needs to be better organized
    # 1. make scripts live alongside pipeline_testing.py
    # 2. make scripts available via cgatflow CLI
    # 3. include scripts in pipeline_testing
    P.get_params()["test_scriptsdir"] = os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "scripts")
    P.main(argv)
Code example #6
def connect():
    '''
    Setup a connection to an sqlite database
    '''

    dbh = sqlite3.connect(P.get_params()['database'])
    return dbh
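
A minimal usage sketch, assuming the configuration contains a database entry pointing at an sqlite file and that a <track>_intervals table exists (hypothetical names throughout):

import sqlite3

dbh = sqlite3.connect("csvdb")  # hypothetical database file
cc = dbh.execute("SELECT contig, start, end FROM sample1_intervals")
for contig, start, end in cc:
    print(contig, start, end)
dbh.close()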
Code example #7
File: motifs.py Project: kevinrue/cgat-flow
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.get_temp_filename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=P.get_params()["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        iotools.touch_file(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
        '''
        P.run(statement)

    os.unlink(tmpfasta)
Code example #8
File: motifs.py Project: tw7649116/cgat-flow
def loadBioProspector(infile, outfile):
    '''load results from bioprospector.'''

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "bioprospector")

    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]

    results = Bioprospector.parse(iotools.open_file(infile, "r"))

    tmpfile = P.get_temp_file()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for x, motifs in enumerate(results):
        outname = os.path.join(target_path, "%s_%02i.png" % (track, x))
        Bioprospector.build_logo([y.sequence for y in motifs.matches],
                                 outname)

        for match in motifs.matches:

            distance = abs(
                match.start + match.width1 - (match.end - match.width2))

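            # classify the arrangement of the two motif blocks:
            # opposite strands presumably mark an everted/inverted
            # repeat (ER), matching strands a direct repeat (DR);
            # anything else is treated as a single motif (SM)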
            if match.strand in ("+-", "-+"):
                arrangement = "ER"
            elif match.strand in ("++", "--"):
                arrangement = "DR"
            else:
                arrangement = "SM"
                distance = 0

            arrangement += "%i" % distance
            strand = match.strand[0]

            id = re.sub(".*_", "", match.id)
            tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                          (id,
                           x,
                           match.start,
                           match.end,
                           strand,
                           arrangement))
    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
Code example #9
def runFastQC(infiles, outfile):
    '''run FastQC on each input file.

    convert sra files to fastq and check mapping qualities are in
    solexa format.  Perform quality control checks on reads from
    .fastq files.

    '''
    # only pass the contaminants file list if requested by the user
    if P.get_params()['use_custom_contaminants']:
        m = mapping.FastQC(nogroup=P.get_params()["readqc_no_group"],
                           outdir=os.path.dirname(outfile),
                           contaminants=P.get_params()['contaminants_path'],
                           qual_format=P.get_params()['qual_format'])
    else:
        m = mapping.FastQC(nogroup=P.get_params()["readqc_no_group"],
                           outdir=os.path.dirname(outfile),
                           qual_format=P.get_params()['qual_format'])

    if P.get_params()["reconcile"] == 1:
        infiles = infiles.replace("processed.dir/trimmed",
                                  "reconciled.dir/trimmed")

    statement = m.build((infiles,), outfile)
    P.run(statement)
Code example #10
        def makeAdaptorFasta(infile, outfile):
            '''Make a single fasta file per sample containing all
            contaminant adaptor sequences for removal.
            '''

            preprocess.makeAdaptorFasta(
                infile=infile,
                outfile=outfile,
                track=re.match(REGEX_TRACK, infile).groups()[0],
                dbh=connect(),
                contaminants_file=P.get_params()['contaminants_path'])
Code example #11
File: motifs.py Project: kevinrue/cgat-flow
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the
    top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
Code example #12
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # configure job_threads with fastq_screen_options from P.get_params()
    job_threads = re.findall(r'--threads \d+', P.get_params()['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))

    tempdir = P.get_temp_dir(".")
    conf_fn = os.path.join(tempdir, "fastq_screen.conf")
    with iotools.open_file(conf_fn, "w") as f:
        for i, k in P.get_params().items():
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = mapping.FastqScreen(config_filename=conf_fn)
    statement = m.build((infiles,), outfile)
    P.run(statement, job_memory="8G")
    shutil.rmtree(tempdir)
    iotools.touch_file(outfile)
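
For illustration, the DATABASE lines in the generated fastq_screen.conf are derived from configuration keys carrying the fastq_screen_database_ prefix; a small sketch with hypothetical entries:

# hypothetical configuration entries
params = {"fastq_screen_database_human": "/path/to/human",
          "fastq_screen_database_phix": "/path/to/phix"}

for key, value in params.items():
    if key.startswith("fastq_screen_database"):
        # "fastq_screen_database_" is 22 characters long, so key[22:]
        # yields the database label
        print("DATABASE\t%s\t%s" % (key[22:], value))
# DATABASE  human  /path/to/human
# DATABASE  phix   /path/to/phix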
Code example #13
File: motifs.py Project: kevinrue/cgat-flow
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))
    outs = iotools.open_file(outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator(iotools.open_file(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end))
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))

    outs.close()
Code example #14
def loadFastQC(infile, outfile):
    '''load FASTQC stats into database.'''

    # a check to make sure the file isn't empty
    n = 0
    with iotools.open_file(infile) as f:
        for line in f:
            n += 1
    if n > 0:
        P.load(infile, outfile, options="--add-index=track")
    else:
        table_name = infile.replace(".tsv.gz", "")
        database_sql = P.get_params()["database"]["url"]
        database_name = os.path.basename(database_sql)
        statement = """sqlite3 %(database_name)s
                       'DROP TABLE IF EXISTS %(table_name)s;
                       CREATE TABLE %(table_name)s
                       ("track" text PRIMARY KEY, "Sequence" text,
                       "Count" integer, "Percentage" integer, "Possible Source" text);'
                       'INSERT INTO %(table_name)s VALUES ("NA", "NA", 0, 0, "NA");'"""

        P.run(statement)
Code example #15
File: motifs.py Project: kevinrue/cgat-flow
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used.  Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
        return

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(motifs_max_size)s
    %(meme_options)s
       > %(outfile)s.log
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
Code example #16
def main(argv):

    options = P.initialize(argv, config_file="benchmark.yml")

    # compatibility with cgatcore < 0.6.3
    if isinstance(options, tuple):
        options = options[0]

    # not sure what this does
    # if not options.config_file:
    #     P.get_parameters(options.config_file)
    # else:
    #     sys.exit(P.main(options, args))

    params = P.get_params()

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = params.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        # A selection of command line arguments is added to PARAMS
        # as 'extras', since these are not implemented in ruffus 2.6.3
        kwargs = collections.defaultdict(dict)
        if options.only_info:
            kwargs["extras"].update({'only_info': True})
            P.PARAMS["only_info"] = True
        if options.is_test:
            kwargs["extras"].update({'is_test': True})
            P.PARAMS["is_test"] = True

        E.debug("construction of workflow started")
        pipeline = ruffus.Pipeline('benchmark')
        # Tool execution
        suffix, tool_runners = add_tools_to_pipeline(pipeline,
                                                     map_tool_to_runner,
                                                     config=P.PARAMS,
                                                     **kwargs)

        E.debug("added {} tools to workflow".format(len(tool_runners)))
        # Optionally, add externally computed files as
        # pseudo-tools:
        if "external" in P.PARAMS["setup"]:
            external_runners = add_external_data_to_pipeline(pipeline,
                                                             config=P.PARAMS,
                                                             **kwargs)
            tool_runners.extend(external_runners)

        # Optionally, combine tool runs into aggregate
        # outputs. The type of the output is preserved
        # (VCF -> VCF, etc.)
        # For example, call individual members in a trio
        # and then build a combined VCF to analyse mendelian
        # inconsistencies.
        if "collate" in P.PARAMS["setup"]:
            collate_runners = add_collations_to_pipeline(
                pipeline,
                map_collate_to_runner,
                P.PARAMS["setup"]["collate"],
                tasks=tool_runners,
                config=P.PARAMS)
            if P.PARAMS["setup"].get("only_collate", False):
                tool_runners = []
            if P.PARAMS["setup"].get("no_collate_metrics", False):
                collate_runners = []
            E.debug("added {} collators to workflow".format(
                len(collate_runners)))
        else:
            collate_runners = []

        # Optionally, split up the output before applying
        # additional analyses. The type of the output is preserved
        # (VCF -> VCF, etc).
        # For example, identify false positives, false negatives
        # and true positives and collect metrics individually.
        if "split" in P.PARAMS["setup"]:
            split_runners = add_splits_to_pipeline(pipeline,
                                                   map_split_to_runner,
                                                   tool_runners,
                                                   P.PARAMS["setup"]["split"],
                                                   tasks=tool_runners,
                                                   config=P.PARAMS)
            if P.PARAMS["setup"].get("only_split", False):
                tool_runners = []
            E.debug("added {} splitters to workflow".format(
                len(split_runners)))
        else:
            split_runners = []

        metric_runners = []
        for prefix, r in zip(["tool", "collate", "split"],
                             [tool_runners, collate_runners, split_runners]):
            if not r:
                continue

            metrics = None

            if prefix == "collate" and "collate_metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["collate_metrics"]
            elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["split_metrics"]
            elif "metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["metrics"]
            else:
                raise KeyError(
                    "configuration file requires a 'setup:metrics' section")

            # Metric execution
            mm = add_metrics_to_pipeline(pipeline,
                                         metrics,
                                         map_metric_to_runner,
                                         r,
                                         suffix=suffix,
                                         prefix=prefix + "_",
                                         config=P.PARAMS,
                                         **kwargs)

            if len(mm) == 0:
                raise ValueError(
                    "workflow construction error: "
                    "no metric tasks result for metrics {}".format(metrics))

            metric_runners.extend(mm)
            E.debug("added {} {}_metrics to workflow".format(len(mm), prefix))

        # add plot task
        if "aggregate" in P.PARAMS["setup"]:
            aggregate_metrics = add_collations_to_pipeline(
                pipeline,
                map_collate_to_runner,
                P.PARAMS["setup"]["aggregate"],
                metric_runners,
                config=P.PARAMS)

            E.debug("added metric aggregation to workflow")
        else:
            aggregate_metrics = []

        add_upload_to_pipeline(pipeline, metric_runners + aggregate_metrics,
                               P.PARAMS)
        E.debug("added upload to workflow".format(prefix))

        # add export task
        export = P.PARAMS["setup"].get("export", ["tools", "collate", "split"])
        map_export2runner = {
            "collate": collate_runners,
            "tools": tool_runners,
            "split": split_runners
        }

        export_runners = []
        for e in export:
            try:
                export_runners.extend(map_export2runner[e])
            except KeyError:
                raise KeyError("unknown export section: {}".format(e))

        add_export_to_pipeline(pipeline,
                               export_runners,
                               suffix=suffix,
                               config=P.PARAMS)

        E.debug("added export to workflow")

        add_all_task_to_pipeline(pipeline, metric_runners + aggregate_metrics)

        # Collate output files to facilitate analysis
        if "collation" in P.PARAMS:
            collators = add_collations_to_pipeline(pipeline,
                                                   map_collate_to_runner,
                                                   P.PARAMS["collation"],
                                                   config=P.PARAMS)

        E.debug("construction of workflow completed")

        E.debug("starting workflow")
        P.run_workflow(options, pipeline=pipeline)
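
Taken together, the branches above imply that a minimal configuration needs at least a setup section with a metrics entry (or the collate/split variants); a hypothetical sketch of the corresponding parameter dictionary:

# hypothetical minimal content of P.PARAMS after loading benchmark.yml
PARAMS = {
    "setup": {
        "metrics": ["metric_a"],  # required unless collate_metrics/split_metrics apply
        # "collate": {...},       # optional: combine tool outputs
        # "split": {...},         # optional: split outputs before metrics
        # "export": ["tools"],    # optional; defaults to tools, collate and split
    }
}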
Code example #17
File: motifs.py Project: kevinrue/cgat-flow
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=[],
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery. Intervals are taken from
    the table <track>_intervals in the database *dbhandle* and saved to
    *filename* in :term:`fasta` format.

    If *shuffled* is set, the sequences are randomly shuffled before
    output. Shuffling is done on the unmasked sequences; masking is
    applied afterwards, so that masked positions are not interspersed
    with real sequence.

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to avoid creating jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of
        * dust, dustmasker: apply dustmasker
        * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order) and 'max' (peak
    score, descending order).

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual
    interval. The intervals will be centered around the mid-point and
    truncated the same way as the main intervals.

    '''

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter
    FROM %(tablename)s
    ''' % locals() + orderby

    cc = dbhandle.execute(statement)
    data = cc.fetchall()
    cc.close()

    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)

    L.info(
        "writeSequencesForIntervals %s: using at most %i sequences for pattern finding"
        % (track, cutoff))

    data = data[:cutoff]

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    # modify the ranges
    if shift == "leftright":
        new_data = [(contig, start - (end - start), start,
                     str(interval_id) + "_left", peakcenter)
                    for contig, start, end, interval_id, peakcenter in data]
        new_data.extend([
            (contig, end, end + (end - start), str(interval_id) + "_right",
             peakcenter)
            for contig, start, end, interval_id, peakcenter in data])
        data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth,
                 interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            L.info(
                "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored"
                % (track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info(
                "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)"
                % (track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1

    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = iotools.open_file(filename, "w")
    for m in masker:
        if m not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, m)

    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, id, contig = d
        id = "%s_%s %s:%i-%i" % (track, str(id), contig, start, end)
        outs.write(">%s\n%s\n" % (id, sequence))
        c.output += 1
    outs.close()

    E.info("%s" % c)

    return c.output
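
A worked example of the interval arithmetic above, with hypothetical coordinates:

# hypothetical interval row: contig, start, end, interval_id, peakcenter
contig, start, end, interval_id, peakcenter = "chr1", 1000, 2000, 7, 1500

# with halfwidth=100 the output is recentered around the peak
halfwidth = 100
window = (contig, peakcenter - halfwidth, peakcenter + halfwidth, interval_id)
# -> ("chr1", 1400, 1600, 7)

# with shift="leftright", two flanking intervals of the same length
# are generated instead
left = (contig, start - (end - start), start)   # ("chr1", 0, 1000)
right = (contig, end, end + (end - start))      # ("chr1", 2000, 3000)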
Code example #18
    def processReads(infile, outfiles):
        '''process reads from .fastq and other sequence files.
        '''
        trimmomatic_options = P.get_params()["trimmomatic_options"]

        if P.get_params()["auto_remove"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                "contaminants.fasta",
                P.get_params()["trimmomatic_mismatches"],
                P.get_params()["trimmomatic_p_thresh"],
                P.get_params()["trimmomatic_c_thresh"],
                P.get_params()["trimmomatic_min_adapter_len"],
                P.get_params()["trimmomatic_keep_both_reads"]) + trimmomatic_options

        elif P.get_params()["trimmomatic_adapter"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                P.get_params()["trimmomatic_adapter"],
                P.get_params()["trimmomatic_mismatches"],
                P.get_params()["trimmomatic_p_thresh"],
                P.get_params()["trimmomatic_c_thresh"],
                P.get_params()["trimmomatic_min_adapter_len"],
                P.get_params()["trimmomatic_keep_both_reads"]) + trimmomatic_options

        job_threads = P.get_params()["threads"]
        job_memory = "12G"

        track = re.match(REGEX_TRACK, infile).groups()[0]

        m = preprocess.MasterProcessor(
            save=P.get_params()["save"],
            summarize=P.get_params()["summarize"],
            threads=P.get_params()["threads"],
            qual_format=P.get_params()['qual_format'])

        for tool in P.as_list(P.get_params()["preprocessors"]):

            if tool == "fastx_trimmer":
                m.add(preprocess.FastxTrimmer(
                    P.get_params()["fastx_trimmer_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "trimmomatic":
                m.add(preprocess.Trimmomatic(
                    trimmomatic_options,
                    threads=P.get_params()["threads"]))
            elif tool == "sickle":
                m.add(preprocess.Sickle(
                    P.get_params()["sickle_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "trimgalore":
                m.add(preprocess.Trimgalore(
                    P.get_params()["trimgalore_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "flash":
                m.add(preprocess.Flash(
                    P.get_params()["flash_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "reversecomplement":
                m.add(preprocess.ReverseComplement(
                    P.get_params()["reversecomplement_options"]))
            elif tool == "pandaseq":
                m.add(preprocess.Pandaseq(
                    P.get_params()["pandaseq_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "cutadapt":
                cutadapt_options = P.get_params()["cutadapt_options"]
                if P.get_params()["auto_remove"]:
                    cutadapt_options += " -a file:contaminants.fasta "
                m.add(preprocess.Cutadapt(
                    cutadapt_options,
                    threads=P.get_params()["threads"],
                    untrimmed=P.get_params()['cutadapt_reroute_untrimmed'],
                    process_paired=P.get_params()["cutadapt_process_paired"]))
            else:
                raise NotImplementedError("tool '%s' not implemented" % tool)

        statement = m.build((infile,), "processed.dir/trimmed-", track)
        P.run(statement)
Code example #19
File: motifs.py Project: kevinrue/cgat-flow
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same
    table.

    Add columns for the control data as well.
    '''

    tablename = P.to_table(outfile)

    tmpfile = P.get_temp_file(".")

    tmpfile.write(MAST.Match().header + "\tmotif\tcontig"
                  "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
                  "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
                  "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    lines = iotools.open_file(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.get_temp_file(".")
        try:
            motif, part = re.match(r":: motif = (\S+) - (\S+) ::",
                                   lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError("parsing error in line '%s'" %
                             lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(iotools.open_file(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split background match id

        has three parts: track _ id _ pos

        track might contain '_'.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    for chunk in range(0, len(chunks) - 1, 2):

        motif_fg, part, mast_fg = readChunk(lines, chunk)
        assert part == "foreground"
        motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
        assert part == "background"
        assert motif_fg == motif_bg

        # index control data
        controls = collections.defaultdict(dict)
        for match in mast_bg.matches:
            track, id, pos = splitId(match.id, "bg")
            controls[id][pos] = (match.evalue, match.pvalue, match.nmotifs,
                                 match.length, match.start, match.end)

        for match in mast_fg.matches:
            # remove track and pos
            track, match.id = splitId(match.id, "fg")
            # move to genomic coordinates
            contig, start, end = re.match(r"(\S+):(\d+)\.\.(\d+)",
                                          match.description).groups()
            if match.nmotifs > 0:
                start, end = int(start), int(end)
                match.start += start
                match.end += start
                match.positions = [x + start for x in match.positions]

            id = match.id
            if id not in controls:
                P.warn("no controls for %s - increase MAST evalue" % id)

            if "l" not in controls[id]:
                controls[id]["l"] = (float(P.get_params()["mast_evalue"]), 1,
                                     0, 0, 0, 0)
            if "r" not in controls[id]:
                controls[id]["r"] = (float(P.get_params()["mast_evalue"]), 1,
                                     0, 0, 0, 0)

            min_evalue = min(controls[id]["l"][0], controls[id]["r"][0])
            min_pvalue = min(controls[id]["l"][1], controls[id]["r"][1])
            max_nmatches = max(controls[id]["l"][2], controls[id]["r"][2])

            tmpfile.write(
                str(match) + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                    motif_fg,
                    contig,
                    "\t".join(map(str, controls[id]["l"])),
                    "\t".join(map(str, controls[id]["r"])),
                    str(min_evalue),
                    str(min_pvalue),
                    str(max_nmatches),
                ) + "\n")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)
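
To illustrate splitId above: background ids end in an id and a position, foreground ids only in an id, and the track itself may contain underscores. A standalone sketch with hypothetical ids:

def split_id(s, mode):
    # mirrors splitId in loadMAST above
    d = s.split("_")
    if mode == "bg":
        return "_".join(d[:-2]), d[-2], d[-1]
    return "_".join(d[:-1]), d[-1]

print(split_id("exp_rep1_12_l", "bg"))  # -> ('exp_rep1', '12', 'l')
print(split_id("exp_rep1_12", "fg"))    # -> ('exp_rep1', '12')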
Code example #20
# unprocessed files
REGEX_TRACK_BOTH = r"(processed.dir/)*([^/]+)\.(fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"

SEQUENCEFILES_REGEX = r"([^/]+)\.(?P<suffix>fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"
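
A quick sketch of how these patterns extract the track name, using a hypothetical file name:

import re

# REGEX_TRACK_BOTH as defined above
m = re.match(r"(processed.dir/)*([^/]+)\.(fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)",
             "processed.dir/sample1.fastq.1.gz")
print(m.group(2))  # -> "sample1"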


def connect():
    '''
    Setup a connection to an sqlite database
    '''

    dbh = sqlite3.connect(P.get_params()['database'])
    return dbh


@transform(P.get_params()["input_globs"].get("default", INPUT_FORMATS),
           regex("(.*)"), r"\1")
def unprocessReads(infiles, outfiles):
    """dummy task - no processing of reads."""


# if preprocess tools are specified, preprocessing is done on output that has
# already been generated in the first run
if P.get_params().get("preprocessors", None):
    if P.get_params()["auto_remove"]:
        # check if FastQC has been run
        for x in iotools.flatten([glob.glob(y) for y in
                                  P.get_params()["input_globs"].get("default", INPUT_FORMATS)]):
            f = "fastqc.dir/" + re.match(REGEX_TRACK, x).group(1) + ".fastqc"
            if not os.path.exists(f):
                raise ValueError(
                    "fastqc output %s is missing - run FastQC before "
                    "preprocessing with auto_remove" % f)
Code example #21
File: report.py Project: kevinrue/cgat-flow
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run cgatreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.

    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(targetdir, "pipeline.%s" % pipeline_status_format),
            pipeline_status_format, ["full"],
            checksum_level=params["ruffus_checksums_level"])

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs",
                          iotools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = iotools.which("xvfb-run")

    # permit multiple servers using -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but
    # exits with error when killing process. Thus, ignore return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c %(conf_dir)s
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    P.run(statement)

    E.info(
        'the report is available at %s' %
        os.path.abspath(os.path.join(params['report_html'], "contents.html")))