Python Pipeline.touch 예제들, CGATCore.Pipeline.touch Python 예제들

예제 #1

0

파일 보기

파일: PipelineDeNovoMotifs_python3.py 프로젝트: sudlab/pipeline_denovo_motifs

def generatePSP(positives, negatives, outfile):
    ''' generate a discrimitative PSP file from
    the positives and negatives that can be used
    to do descriminative MEME '''

    psp_options = PARAMS["psp_options"]

    nseqs_pos = int(FastaIterator.count(positives))
    nseqs_neg = int(FastaIterator.count(negatives))

    if nseqs_pos < 2 or nseqs_neg < 2:
        E.warn("%s: input files do not have sufficent sequences"
               "to run psp-gen, skipping" % outfile)
        P.touch(outfile)
        return

    # get appropriate options from meme options
    if PARAMS.get("meme_revcomp", True):
        psp_options += " -revcomp"

    statement = '''psp-gen -pos %(positives)s
                           -neg %(negatives)s
                           %(psp_options)s
                   > %(outfile)s '''

    P.run(statement)

예제 #2

0

파일 보기

파일: pipeline_rrbs.py 프로젝트: logust79/cgat-flow

def calculateM3DSpikeClustersPvalue(infiles, outfile):
    job_options = "-l mem_free=4G -pe dedicated 1"
    design = infiles[-1]
    infiles = infiles[:-1]
    RRBS.calculateM3DSpikepvalue(infiles, outfile, design,
                                 submit=True, job_options=job_options)
    P.touch(outfile)

예제 #3

0

파일 보기

def removeBamfiles(infiles, outfile):
    for bamfile in infiles:
        bam_index = bamfile + ".bai"
        os.unlink(bamfile)
        if os.path.exists(bam_index):
            os.unlink(bam_index)
    P.touch(outfile)

예제 #4

0

파일 보기

파일: pipeline_rrbs.py 프로젝트: logust79/cgat-flow

def makeSummaryPlots(infile, outfile):

    job_options = "-l mem_free=48G"

    RRBS.summaryPlots(infile, outfile,
                      submit=True, job_options=job_options)
    P.touch(outfile)

예제 #5

0

파일 보기

파일: PipelineDeNovoMotifs_python3.py 프로젝트: sudlab/pipeline_denovo_motifs

def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(PARAMS["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                               outfile)

    if IOTools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)

예제 #6

0

파일 보기

        def splitFiles(infile, outfile):
            '''
            Arbitrarily split files into chunks for parallelisation
            '''

            Timeseries.splitFiles(infile=infile,
                                  nchunks=PARAMS['resampling_chunks'],
                                  out_dir="parallel_files.dir")
            P.touch(outfile)

예제 #7

0

파일 보기

def reMergeBamfiles(infiles, sentinel):
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinel)

예제 #8

0

파일 보기

def poolSampleBamfiles(infiles, sentinel):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    IDR.mergeBams(infiles, outfile)

    P.touch(sentinel)

예제 #9

0

파일 보기

def callPeaksOnIndividualReplicates(infile, outfile):
    infile = P.snip(infile, ".sentinel") + ".bam"
    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks
    IDR.callIDRPeaks(infile, outfile, PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"], PARAMS_PEAKCALLER)

    P.touch(outfile)

예제 #10

0

파일 보기

def splitPooledBamfiles(infile, sentinel):
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)

    P.touch(sentinel)

예제 #11

0

파일 보기

파일: PipelineDeNovoMotifs_python3.py 프로젝트: sudlab/pipeline_denovo_motifs

def runFIMO(motifs, database, outfile, exportdir, options={}):
    '''run fimo to look for occurances of motifs supplied in sequence database.
    :param:`motifs` is the path to a MEME formated motif file.
    :param:`database` is a fasta file.
    :param:`outfile` is the text output from fimo
    :param:`exportdir` specifies the directory to put exported files (html,gff)
    :param:options is a dictionary: {'option':'value'} will be passed as
                    --option=value and will overwrite options specified in the
                     PARAMs'''

    # if the motifs file is empty, then fimo will return an error
    # this isn't very useful behavoir.

    inlines = IOTools.open_file(motifs).read()
    #print inlines
    if not re.search("MOTIF", inlines):
        E.warning("No motifs found in %s" % motifs)
        P.touch(outfile)
        return
    else:
        E.debug("%s: %i motifs found" %
                (motifs, len(re.findall("MOTIF", inlines))))

    fimo_options = PARAMS.get("fimo_options", "")
    for option, value in options.iteritems():
        fimo_options = re.sub("%s=\S+" % option, "", fimo_options)
        if value is None:
            fimo_options += " --%s" % option
        else:
            fimo_options += " --%s=%s" % (option, value)

    tmpout = P.get_temp_filename()

    track = os.path.basename(outfile)
    exportdir = os.path.abspath(exportdir)

    xmlout = P.snip(outfile, ".txt") + ".xml"
    logfile = P.snip(outfile, ".txt") + ".log"
    gffout = os.path.join(exportdir, track + ".gff")
    htmlout = os.path.join(exportdir, track + ".html")

    statement = ''' fimo --oc %(tmpout)s
                         %(fimo_options)s
                         %(motifs)s
                         %(database)s &> %(logfile)s;
                     mv %(tmpout)s/fimo.txt %(outfile)s;
                     mv %(tmpout)s/fimo.xml %(xmlout)s;
                     mv %(tmpout)s/fimo.gff %(gffout)s;
                     mv %(tmpout)s/fimo.html %(htmlout)s;
                     rm -r %(tmpout)s '''

    P.run(statement)

예제 #12

0

파일 보기

def callPeaksOnPooledReplicates(infile, outfile):
    # fetch peak calling parameters
    PARAMS_PEAKCALLER = get_peak_caller_parameters(
        PARAMS["options_peak_caller"])

    # call peaks on pseudoreplicates
    IDR.callIDRPeaks(infile,
                     outfile,
                     PARAMS["options_peak_caller"],
                     PARAMS["options_control_type"],
                     PARAMS_PEAKCALLER,
                     pseudoreplicate=False)

    P.touch(outfile)

예제 #13

0

파일 보기

    def genReplicateData(infile, outfile):
        '''
        Split each replicate into a separate file for clustering
        within each replicate.  Relies on each replicate being the
        same across the whole time series.
        '''

        outdir = outfile.split("/")[0]
        Timeseries.splitReplicates(infile=infile,
                                   axis="column",
                                   group_var="replicates",
                                   outdir=outdir)

        P.touch(outfile)

예제 #14

0

파일 보기

def splitBamfiles(infile, sentinel):
    """
    For all tracks, split the filtered bamfile in two using pysam
    """
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)

    P.touch(sentinel)

예제 #15

0

파일 보기

파일: PipelineDeNovoMotifs_python3.py 프로젝트: sudlab/pipeline_denovo_motifs

def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the
    top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)

예제 #16

0

파일 보기

def poolInputBamfiles(infiles, sentinel):
    """
    Merge filtered input files for each tissue, with the option of excluding
    undesirable libraries.
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["filter_remove_inputs"].split(",")

    if len(infiles) > 1:
        to_merge = IDR.filterBadLibraries(infiles, bad_samples)
        IDR.mergeBams(to_merge, outfile)
    else:
        os.symlink(os.path.abspath(infiles[0]), outfile)
        os.symlink(os.path.abspath(infiles[0]) + ".bai", outfile + ".bai")

    P.touch(sentinel)

예제 #17

0

파일 보기

def exportIntervalSequences(infile, outfile, track, method):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    '''
    dbhandle = connect()

    try:
        halfwidth = int(PARAMS[method+"_halfwidth"])
        full = False
    except ValueError:
        full = True
        halfwidth = None

    try:
        maxsize = int(PARAMS[method+"_max_size"])
    except ValueError:
        maxsize = None

    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=full,
        masker=P.as_list(PARAMS[method+'_masker']),
        halfwidth=halfwidth,
        maxsize=maxsize,
        num_sequences=PARAMS[method+"_num_sequences"],
        proportion=PARAMS[method+"_proportion"],
        min_sequences=PARAMS[method+"_min_sequences"],
        order=PARAMS[method+'_score'])

    if nseq == 0:
        E.warn("%s: no sequences - %s skipped" % (outfile, method))
        P.touch(outfile)

예제 #18

0

파일 보기

파일: pipeline_promotors.py 프로젝트: logust79/cgat-flow

def loadGeneSummary(infile, outfile):
    '''summarize binding information per gene.'''

    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    cc.execute("""DROP TABLE IF EXISTS %(table)s """ % locals())
    cc.execute("""CREATE TABLE %(table)s AS
            SELECT gene_id, SUM( tata ) AS tata, SUM( cpg ) AS cpg 
                   FROM promotorinfo_transcripts AS p,
                        annotations.transcript_info as i
                   WHERE i.transcript_id = p.transcript_id
                   GROUP BY gene_id""" % locals())
    cc.close()

    P.touch(outfile)

예제 #19

0

파일 보기

파일: pipeline_rrbs.py 프로젝트: logust79/cgat-flow

def summariseReadStart(infile, outfile):
    # this only works for fastq files. Fails with .sra files
    # this function and the next section should be replaced with a call to
    # fastq-dump if the file ends with .sra and then use the functions of
    # the fastq module to count the first bases in the fastq records.
    # for now, create empty outfile
    if infile.endswith(".sra"):
        P.touch(outfile)
    else:
        statement = '''zcat %(infile)s |
        paste - - - - | cut -f2 | cut -c1-3 | sort | uniq -c |
        sort -nk1 | awk -F' ' 'BEGIN{total=0; sum=0}
        {total+=$1; OFS"\\t";
        if($2=="CGG"||$2=="TGG"||$2=="CGA"||$2=="TGA")
        {sum+=$1; print $1, $2}}
        END {print total-sum,"others"}' > %(outfile)s ''' % locals()

        P.run()

예제 #20

0

파일 보기

def timePointDiffExpression(infile, outfile):
    '''
    Within each condition test for differentially expressed
    genes against the baseline time point.  Uses DESeq2.
    '''

    statement = '''
    cgat timeseries2diffgenes
    --log=%(outfile)s.log
    --method=timepoint
    --alpha=%(deseq_alpha)s
    --results-directory=diff_timepoints.dir
    %(infile)s
    '''

    P.run()

    P.touch(outfile)

예제 #21

0

파일 보기

파일: PipelineDeNovoMotifs_python3.py 프로젝트: sudlab/pipeline_denovo_motifs

def loadTomTom(infile, outfile):
    '''load tomtom results'''

    tablename = P.to_table(outfile)

    resultsdir = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                              infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file

    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.getiterator("motif"):
        name = motif.get("id")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.get_temp_file(".")

    # parse the text file
    for line in IOTools.open_file(infile):
        if line.startswith("#Query"):
            tmpfile.write('\t'.join(("target_name", "query_id", "target_id",
                                     "optimal_offset", "pvalue", "evalue",
                                     "qvalue", "Overlap", "query_consensus",
                                     "target_consensus", "orientation")) +
                          "\n")
            continue
        data = line[:-1].split("\t")
        target_name = name2alt[data[1]]
        tmpfile.write("%s\t%s" % (target_name, line))
    tmpfile.close()

    P.load(tmpfile.name, outfile)

    os.unlink(tmpfile.name)

예제 #22

0

파일 보기

파일: PipelineDeNovoMotifs_python3.py 프로젝트: sudlab/pipeline_denovo_motifs

def collectMEMEResults(tmpdir, target_path, outfile, method="meme"):
    '''collect output from a MEME run in tmpdir
    and copy all over to target_path

    convert images output by MEME (.eps files) to 
    .png files.'''

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    E.debug("tmpdir is" + tmpdir + "      target_path is " + target_path)
    shutil.move(tmpdir, target_path)

    if method == "dreme":
        shutil.copyfile(os.path.join(target_path, "dreme.txt"), outfile)
    elif method == "meme":
        shutil.copyfile(os.path.join(target_path, "meme.txt"), outfile)
    elif method == "memechip":
        try:
            shutil.copyfile(os.path.join(target_path, "combined.meme"),
                            outfile)
        except IOError:
            E.warn("%s: No motifs found")
            P.touch(outfile)

    # convert images to png
    epsfiles = glob.glob(os.path.join(target_path, "*.eps"))

    statement = []
    for epsfile in epsfiles:
        b, ext = os.path.splitext(epsfile)
        pngfile = b + ".png"
        statement.append('''convert %(epsfile)s %(pngfile)s ''')

    if len(statement) > 0:
        statement = " && ".join(statement)
        P.run(statement)

예제 #23

0

파일 보기

파일: PipelineChipseq.py 프로젝트: logust79/cgat-flow

def subtractBedFiles(infile, subtractfile, outfile):
    '''subtract intervals in *subtractfile* from *infile*
    and store in *outfile*.
    '''

    if IOTools.isEmpty(subtractfile):
        shutil.copyfile(infile, outfile)
        return
    elif IOTools.isEmpty(infile):
        P.touch(outfile)
        return

    statement = '''
        intersectBed -v -a %(infile)s -b %(subtractfile)s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %(outfile)s ; tabix -p bed %(outfile)s
        '''

    P.run()

예제 #24

0

파일 보기

def conditionDiffExpression(infile, outfile):
    '''
    Call DEGs showing statistically significantly
    different expression based on interaction terms between condition
    and time point.  Uses DESeq2.
    '''

    job_options = "-l mem_free=4G"

    statement = '''
    zcat %(infile)s |
    cgat timeseries2diffgenes
    --log=%(outfile)s.log
    --method=condition
    --alpha=%(deseq_alpha)s
    --results-directory=diff_condition.dir
    '''

    P.run()

    P.touch(outfile)

예제 #25

0

파일 보기

파일: PipelineChipseq.py 프로젝트: logust79/cgat-flow

def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''

    if len(infiles) == 1:

        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run()

    else:

        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(infiles[0]):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            if IOTools.isEmpty(infiles[0]):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)

예제 #26

0

파일 보기

파일: PipelineDeNovoMotifs_python3.py 프로젝트: sudlab/pipeline_denovo_motifs

def runMemeCHIP(infile, outfile, motifs=None):
    '''Run the MEME-CHiP pipeline on the input files.
    optional motifs files can be supplied as a list'''

    if motifs:
        motifs = " ".join("-db %s" % motif for motif in motifs)
    else:
        motifs = " "

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme-chip skipped")
        P.touch(outfile)
        return

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    meme-chip %(infile)s
             -p %(meme_threads)s 
             -oc %(tmpdir)s
             -nmeme %(memechip_nmeme)s
             %(memechip_options)s     
             %(motifs)s > %(outfile)s.log '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in cluster_options
    # through job_options
    if int(PARAMS["memechip_threads"]) != 1:
        job_options = str(PARAMS["memechip_job_options"])
        job_threads = int(PARAMS["memechip_threads"])
        cluster_parallel_environment = str(
            PARAMS["memechip_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile, method="memechip")

예제 #27

0

파일 보기

def summarizeEffectsPerGene(infile, outfile):
    '''summarize effects on a per-gene level.'''

    tablename = outfile[:-len(".load")]
    track = infile[:-len("_effects.load")]

    dbhandle = connect()

    statement = '''
    CREATE TABLE %(tablename)s AS
    SELECT DISTINCT
           gene_id,
           COUNT(*) AS ntranscripts,
           MIN(e.nalleles) AS min_nalleles,
           MAX(e.nalleles) AS max_nalleles,
           MIN(e.stop_min) AS min_stop_min,
           MAX(e.stop_min) AS max_stop_min,
           MIN(e.stop_max) AS min_stop_max,
           MAX(e.stop_max) AS max_stop_max,
           SUM( CASE WHEN stop_min > 0 AND cds_len - stop_min * 3 < last_exon_start THEN 1
                     ELSE 0 END) AS nmd_knockout,
           SUM( CASE WHEN stop_max > 0 AND cds_len - stop_max * 3 < last_exon_start THEN 1
                     ELSE 0 END) AS nmd_affected
    FROM annotations.transcript_info as i,
         %(track)s_effects AS e
    WHERE i.transcript_id = e.transcript_id
    GROUP BY i.gene_id
    ''' % locals()

    Database.executewait(dbhandle,
                         "DROP TABLE IF EXISTS %(tablename)s" % locals())
    Database.executewait(dbhandle, statement)
    Database.executewait(
        dbhandle,
        "CREATE INDEX %(tablename)s_gene_id ON %(tablename)s (gene_id)" %
        locals())
    dbhandle.commit()

    P.touch(outfile)

예제 #28

0

파일 보기

    def genResampleData(infile, outfile):
        '''
        Resample the data n-times with replacement - generates
        n flat files which are then propagated at later stages.
        Files are generally small though.
        '''

        time_agg = list(TIME.__dict__['track2groups'].keys())
        time_points = [int(str(x).split("-")[1]) for x in time_agg]
        time_points.sort()
        time_points = list(set(time_points))
        rep_agg = list(REPLICATE.__dict__['track2groups'].keys())
        replicates = [str(x).split("-")[2] for x in rep_agg]
        time_rep_comb = [x for x in itertools.product(time_points, replicates)]
        time_cond = ro.StrVector([x[0] for x in time_rep_comb])
        rep_cond = ro.StrVector([x[1] for x in time_rep_comb])
        ref_gtf = str(infile).split("-")[1]
        condition = (str(infile).split("-")[0]).strip("deseq.dir/")

        time_points = ",".join([str(i) for i in time_points])
        replicates = ",".join(replicates)

        statement = '''
        cgat data2resamples
        --log=%(outfile)s.log
        --time=%(time_points)s
        --replicates=%(replicates)s
        --condition=%(condition)s
        --resamples=%(resampling_resample)s
        --input-gtf=%(ref_gtf)s
        --output-file-directory=clustering.dir
        --seed=%(resampling_seed)s
        %(infile)s
        '''
        P.run()

        P.touch(outfile)

예제 #29

0

파일 보기

파일: PipelineDeNovoMotifs_python3.py 프로젝트: sudlab/pipeline_denovo_motifs

def runDREME(infile, outfile, neg_file="", options=""):
    ''' Run DREME on fasta file. If a neg_file is passed
    then DREME will use this as the negative set, otherwise
    the default is to shuffle the input '''

    nseqs_pos = int(FastaIterator.count(infile))
    if nseqs_pos < 2:
        E.warn("%s: less than 2 sequences - dreme skipped" % outfile)
        P.touch(outfile)
        return

    if neg_file:
        nseqs_neg = int(FastaIterator.count(neg_file))
        if nseqs_neg < 2:
            E.warn(
                "%s: less than 2 sequences in negatives file - dreme skipped" %
                outfile)
            P.touch(outfile)
            return
        else:
            neg_file = "-n %s" % neg_file

    logfile = outfile + ".log"
    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    dreme -p %(infile)s %(neg_file)s -png
        -oc %(tmpdir)s
            %(dreme_options)s
            %(options)s
       > %(logfile)s
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile, method="dreme")

예제 #30

0

파일 보기

파일: PipelineDeNovoMotifs_python3.py 프로젝트: sudlab/pipeline_denovo_motifs

def runMEMEOnSequences(infile, outfile, background=None, psp=None):
    '''run MEME on fasta sequences to find motifs
   
    By defualt MEME calculates a zero-th order background
    model from the nucleotide frequencies in the input set.

    To use a different background set, a background
    file created by fasta-get-markov must be supplied.

    To perform descrimantive analysis a position specific
    prior (psp) file must be provided. This can be generated
    used generatePSP.

    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs < 2:
        E.warn("%s: less than 2 sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    # Get the total length of the sequences to decide the memory
    total_seqs_length = 0

    with IOTools.open_file(infile, "r") as fasta_reader:

        iterator_fasta = FastaIterator.iterate(fasta_reader)

        for fasta_seq in iterator_fasta:
            total_seqs_length += len(fasta_seq.sequence)

    fasta_reader.close()

    # If the length of all sequences is higher than 160,000bp
    # Up the memory
    job_memory = "2G"

    if (total_seqs_length > 160000):
        job_memory = "4G"

    if PARAMS.get("meme_revcomp", True):
        revcomp = "-revcomp"
    else:
        revcomp = ""

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.get_temp_dir(".")
    if background:
        background_model = "-bfile %s" % background
    else:
        background_model = ""

    if psp:
        E.info("Running MEME in descriminative mode")
        psp_file = "-psp %s" % psp
    else:
        psp_file = ""

    statement = '''
    meme %(infile)s -dna %(revcomp)s
    -p %(meme_threads)s
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(meme_max_size)s
    %(background_model)s
    %(psp_file)s
    %(meme_options)s
       2> %(outfile)s.log
    '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in cluster_options
    # through job_options
    if int(PARAMS["meme_threads"]) != 1:
        job_options = str(PARAMS["meme_job_options"])
        job_threads = int(PARAMS["meme_threads"])
        cluster_parallel_environment = str(
            PARAMS["meme_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)