def GATKBaseRecal(infile, outfile, genome, dbsnp, solid_options=""):
    '''Recalibrates base quality scores using GATK'''

    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''GenomeAnalysisTK
                    -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s
                    --knownSites %(dbsnp)s %(solid_options)s ;
                    checkpoint ;''' % locals()

    statement += '''GenomeAnalysisTK
                    -T PrintReads
                    -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s ;
                    checkpoint ;''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()
    P.run()
def convertPslToChain(infile, outfile):
    '''convert a psl to a chain file.

    see http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver
    '''

    to_cluster = True

    target, query = extractGenomes(infile)

    tmpfilename1 = P.getTempFilename(".")
    tmpfilename2 = P.getTempFilename(".")

    writeContigSizes(target, tmpfilename1)
    writeContigSizes(query, tmpfilename2)

    statement = '''gunzip
    < %(infile)s
    | pslSwap stdin stdout
    | cgat psl2chain --log=%(outfile)s.log
    | chainSort stdin stdout
    | gzip
    > %(outfile)s.sorted.chain.gz;
    checkpoint;
    gunzip < %(outfile)s.sorted.chain.gz
    | chainNet stdin %(tmpfilename1)s %(tmpfilename2)s stdout /dev/null
    | netChainSubset stdin <( zcat %(outfile)s.sorted.chain.gz ) stdout
    | gzip
    > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename1)
    os.unlink(tmpfilename2)
def extractPairwiseAlignmentSingleFile(infiles, outfile, track):
    '''build pairwise genomic alignment from maf files.'''

    try:
        os.remove(outfile)
    except OSError:
        pass

    genomefile = PARAMS["%s_genome" % track]

    to_cluster = True

    for infile in infiles:

        E.info("adding %s" % infile)

        statement = '''gunzip < %(infile)s
             | cgat maf2psl
                  --query=%(track)s
                  --target=%(maf_master)s
                  --log=%(outfile)s.log
             | cgat psl2psl
                  --method=filter-fasta
                  --method=sanitize
                  --queries-tsv-file=%(genomefile)s
                  --target-psl-file=%(genome)s
                  --log=%(outfile)s.log
             | gzip
             >> %(outfile)s
             '''
        P.run()
def GATKReadGroups(infile, outfile, genome,
                   library="unknown", platform="Illumina",
                   platform_unit="1", track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups'''

    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%(genome)s
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                    checkpoint ;''' % locals()

    statement += '''AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(outfile)s
                    RGLB=%(library)s
                    RGPL=%(platform)s
                    RGPU=%(platform_unit)s
                    RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;''' % locals()

    statement += '''samtools index %(outfile)s ;
                    checkpoint ;''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()

    P.run()
def buildIndirectMaps(infile, outfile, track):
    '''build a map between query and target, linking
    via intermediate targets.'''

    to_cluster = True
    path = P.asList(PARAMS["%s_path" % track])

    E.info("path=%s" % str(path))

    statement = []

    for stage, part in enumerate(path):
        filename = part + ".over.psl.gz"
        if not os.path.exists(filename):
            raise ValueError(
                "required file %s for %s (stage %i) does not exist." %
                (filename, outfile, stage))

        if stage == 0:
            statement.append('''gunzip < %(filename)s''' % locals())
        else:
            statement.append('''
               pslMap stdin <(gunzip < %(filename)s) stdout
            ''' % locals())

    statement.append("gzip")

    statement = " | ".join(statement) + " > %(outfile)s " % locals()

    P.run()
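# A hypothetical pipeline.ini entry consumed by buildIndirectMaps: for an
# indirect map "hg19ToMm10" routed via panTro4, one might configure:
#
#   [hg19ToMm10]
#   path=hg19ToPanTro4,panTro4ToMm10
#
# Each part listed in path must have a corresponding "<part>.over.psl.gz"
# file on disk. The track, assembly and file names above are illustrative
# only, not taken from an actual configuration.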
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(
                os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id
                           FROM (SELECT contig_id, AVG(coverage) as ave
                                 FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])

    outf = open(outfile, "w")
    E.info("%i contigs pass the coverage filter" % len(contigs))
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
def spikeInCounts(infiles, outfile):
    '''
    Perform spike-in across a specific range of fold changes or
    absolute count differences.  Counts table generated from
    original input counts data.
    '''

    counts_file = infiles[0]
    design_file = infiles[1]

    statement = '''
    zcat %(counts_file)s |
    python %(scriptsdir)s/counts2counts.py
    --design-tsv-file=%(design_file)s
    --method="spike"
    --spike-type="row"
    --spike-change-bin-max=3.0
    --spike-change-bin-width=0.1
    --spike-change-bin-min=0.1
    --spike-initial-bin-width=1
    --spike-initial-bin-min=1
    --spike-initial-bin-max=200000
    --spike-minimum=1
    --spike-maximum=1000000
    --random-seed=%(random_seed)i
    --spike-iterations=%(spike_iterations)i
    -v 5
    --log=%(outfile)s.log
    | gzip > %(outfile)s
    '''
    P.run()
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)
    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s
                   > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename)
def calculateFalsePositiveRate(infiles, outfile):
    '''
    taxonomy false positives and negatives etc
    '''
    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    levels = ["phylum", "class", "order", "family", "genus", "species"]
    tablename_true = P.toTable(infiles[0])

    # get corresponding estimate file
    tablename_estimate = P.toTable(os.path.basename(
        [inf for inf in infiles[1:]
         if os.path.basename(inf)[len("metaphlan_"):] ==
         os.path.basename(infiles[0])][0]))

    outf = open(outfile, "w")
    track = P.snip(os.path.basename(infiles[0]), ".taxonomy.relab.load")
    for level in levels:
        for cutoff in [0, 1]:
            true_set = set()
            estimate_set = set()
            for taxa in cc.execute(
                    """SELECT taxa FROM %s
                       WHERE level == '%s' AND relab > %f"""
                    % (tablename_true, level, float(cutoff) / 100)):
                true_set.add(taxa[0])
            for taxa in cc.execute(
                    """SELECT taxon FROM %s
                       WHERE taxon_level == '%s' AND rel_abundance > %f"""
                    % (tablename_estimate, level, float(cutoff))):
                estimate_set.add(taxa[0])

            total_true = len(true_set)
            total_estimate = len(estimate_set)

            tp = true_set.intersection(estimate_set)
            fp = estimate_set.difference(true_set)

            fp_rate = float(len(fp)) / total_estimate
            tp_rate = float(len(tp)) / total_true

            outf.write("%s\t%f\t%f\t%s\t%s\n" %
                       (level, fp_rate, tp_rate, track, str(cutoff)))
    outf.close()
def buildBAMStats(infile, outfile):
    '''Count number of reads mapped, duplicates, etc.'''

    to_cluster = USECLUSTER
    scriptsdir = PARAMS["general_scriptsdir"]

    statement = '''cgat bam2stats
                   --force-output
                   --output-filename-pattern=%(outfile)s.%%s
                   < %(infile)s
                   > %(outfile)s'''
    P.run()
def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")

    statement = '''CollectAlignmentSummaryMetrics
                   INPUT=%(infile)s
                   REFERENCE_SEQUENCE=%%(samtools_genome)s
                   ASSUME_SORTED=true
                   OUTPUT=%(outfile)s
                   VALIDATION_STRINGENCY=SILENT''' % locals()
    P.run()
def getCpGIslandsFromUCSC(dbhandle, outfile):
    '''get CpG islands from UCSC database and save as a :term:`bed`
    formatted file.

    The name column in the bed file will be set to the UCSC name.

    Arguments
    ---------
    dbhandle : object
        Database handle to UCSC mysql database
    outfile : string
        Filename of output file in :term:`bed` format.
    '''

    cc = dbhandle.cursor()
    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name
             FROM %(table)s ORDER by chrom, chromStart"""
    sql = sql % locals()

    E.debug("executing sql statement: %s" % sql)
    try:
        cc.execute(sql)
        # use a separate name for the file handle so that `outfile`
        # still refers to the filename if the query fails
        outf = IOTools.openFile(outfile, "w")
        for data in cc.fetchall():
            outf.write("\t".join(map(str, data)) + "\n")
        outf.close()
    except Exception:
        E.warn("Failed to connect to table %s. %s is empty" %
               (table, outfile))
        P.touch(outfile)
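# Usage sketch for getCpGIslandsFromUCSC (illustrative only): this assumes
# the PipelineUCSC helper used elsewhere in this codebase, valid UCSC mysql
# credentials in PARAMS, and a hypothetical output filename.
#
#   import CGATPipelines.PipelineUCSC as PipelineUCSC
#
#   dbhandle = PipelineUCSC.connectToUCSC(
#       host=PARAMS["ucsc_host"],
#       user=PARAMS["ucsc_user"],
#       database="hg19")
#   getCpGIslandsFromUCSC(dbhandle, "cpgislands.bed.gz")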
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = open('dupstats.txt', 'w')

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(statfile, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                   | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s
                   > %(outfile)s'''
    P.run()
def buildRawGenomeAlignment(infiles, outfile):
    '''build pairwise genomic alignment from maf files.'''

    try:
        os.remove(outfile)
    except OSError:
        pass

    for infile in infiles:
        # skip maf files without Hsap on top.
        if "other" in infile or "supercontig" in infile:
            continue

        E.info("adding %s" % infile)

        genome_query, genome_target = getGenomes()

        statement = '''gunzip < %(infile)s
             | python %(scriptsdir)s/maf2psl.py
                  --query=%(maf_name_query)s
                  --target=%(maf_name_target)s
                  --log=%(outfile)s.log
             | python %(scriptsdir)s/psl2psl.py
                  --method=filter-fasta
                  --method=sanitize
                  --queries-tsv-file=%(genome_query)s
                  --target-psl-file=%(genome_target)s
                  --log=%(outfile)s.log
             | gzip
             >> %(outfile)s
             '''
        P.run()
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''gather BAM file alignment statistics using Picard'''

    job_options = getPicardOptions()
    job_threads = 3

    if getNumReadsFromBAMFile(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard seems to have a problem if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitly.
    statement = '''cat %(infile)s
    | python %(scriptsdir)s/bam2bam.py -v 0
          --method=set-sequence --output-sam
    | CollectMultipleMetrics
          INPUT=/dev/stdin
          REFERENCE_SEQUENCE=%(genome_file)s
          ASSUME_SORTED=true
          OUTPUT=%(outfile)s
          VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''
    P.run()
def extractLncRNAFastaAlignments(infiles, outfile):
    """Receives a MAF file containing pairwise alignments and a bed12
    file containing intervals. Outputs a single fasta file containing
    aligned sequence for each interval.
    """

    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)

    os.unlink(maf_tmp)
def loadAnnotations(infile, outfile):
    '''load variant annotations into database'''
    P.load(infile, outfile,
           options="--map=gene_id:str "
           "--add-index=gene_id "
           "--map=base_qualities:text ")
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | python %(scriptsdir)s/gtf2gtf.py
                          --method=sort --sort-order=gene
                          --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''
            zcat %(infile)s | python %(scriptsdir)s/gtf2gtf.py
                          --method=renumber-genes
                          --pattern-identifier=%(gene_pattern)s%%i
                        | python %(scriptsdir)s/gtf2gtf.py
                          --method=renumber-transcripts
                          --pattern-identifier=%(transcript_pattern)s%%i
                        | python %(scriptsdir)s/gtf2gtf.py
                          --method=sort --sort-order=gene
                          --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        # the first record is sufficient to decide which branch to take
        break

    P.run()
def buildFilteredLncRNAGeneSet(infile, outfile):
    '''
    Depending on filtering_remove_single_exon, will:
    i) remove all single exon transcripts from all lncrna models
       (transcripts)
    ii) remove lncrna loci that only contain single exon transcripts
       (loci)
    iii) leave all single-exon and multi-exon loci in outfile
       (None)

    An illustrative GTF record showing the status attributes the filters
    below rely on is given after this function.
    '''

    if not PARAMS["filtering_remove_single_exon"]:
        E.info("Both multi-exon and single-exon lncRNA are retained!")
        statement = ("cp %(infile)s %(outfile)s")
    elif PARAMS["filtering_remove_single_exon"] == "loci":
        E.info("Warning: removing loci that contain only single-exon"
               " transcripts")
        statement = ("zcat %(infile)s |"
                     " grep -v 'exon_status_locus \"s\"' |"
                     " gzip > %(outfile)s")
    elif PARAMS["filtering_remove_single_exon"] == "transcripts":
        E.info("Warning: removing all single-exon"
               " transcripts from lncRNA set")
        statement = ("zcat %(infile)s |"
                     " grep -v 'exon_status \"s\"' |"
                     " gzip > %(outfile)s")
    else:
        raise ValueError("Unrecognised parameter %s"
                         % PARAMS["filtering_remove_single_exon"])
    P.run()
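# The grep filters above assume status attributes of the kind set earlier in
# the lncRNA pipeline. An illustrative (made-up) GTF record:
#
#   chr1  lncRNA  exon  1000  2000  .  +  .  gene_id "G1"; transcript_id "T1"; exon_status "s"; exon_status_locus "m";
#
# 'exon_status "s"' marks a single-exon transcript and 'exon_status_locus "s"'
# a locus whose transcripts are all single-exon; grep -v drops matching lines.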
def mapReadsWithBowtie(infiles, outfile):
    """map reads with bowtie"""

    inifile, infile = infiles

    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    statement = """
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --threads %(bowtie_threads)s
           %(bowtie_options)s
           %(bowtie_genome_dir)s/%(genome)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py
           --output-sam
           --method=set-nh
           --log=%(outfile)s.log
    | gzip
    > %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    """
    P.run()
def loadGO(infile, outfile, tablename):
    """import GO results into individual tables.

    This method concatenates all the results from a GO analysis and
    uploads them into a single table.
    """

    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    load_statement = P.build_load_statement(
        tablename=tablename,
        options="--allow-empty-file "
        "--add-index=category "
        "--add-index=goid ")

    statement = '''
    python %(toolsdir)s/cat_tables.py %(indir)s/*.overall
    | %(load_statement)s
    > %(outfile)s
    '''
    P.run()
def buildAllStats(infiles, outfile):
    '''
    paste stats together
    '''
    statement = '''paste %s > %s''' % (" ".join(infiles), outfile)
    P.run()
def loadSummariseReadsContributingToTranscripts(infile, outfile):
    '''
    loads the summary of reads contributing to transcripts
    '''
    tablename = P.toTable(outfile.replace("/", "_"))
    statement = '''cgat csv2db
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s
                   > %(outfile)s'''
    P.run()
def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''
    pulls out the multi-exonic and the single exonic lincRNA
    transcripts from the lincrna.gtf.gz
    '''

    inf = gzip.open(infile)
    multi = gzip.open(P.snip(infile, ".gtf.gz") + ".multi_exon.gtf.gz", "w")
    single = gzip.open(P.snip(infile, ".gtf.gz") + ".single_exon.gtf.gz", "w")

    for entry in GTF.transcript_iterator(GTF.iterator(inf)):
        if len(entry) > 1:
            for exon in entry:
                multi.write("\t".join(map(str, [exon.contig,
                                                exon.source,
                                                exon.feature,
                                                exon.start,
                                                exon.end,
                                                ".",
                                                exon.strand,
                                                "."])) +
                            "\t" + exon.attributes + "\n")
        elif len(entry) == 1:
            for exon in entry:
                single.write("\t".join(map(str, [exon.contig,
                                                 exon.source,
                                                 exon.feature,
                                                 exon.start,
                                                 exon.end,
                                                 ".",
                                                 exon.strand,
                                                 "."])) +
                             "\t" + exon.attributes + "\n")

    # close the gzip handles so the output files are not truncated
    multi.close()
    single.close()

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()
def loadCountSingleAndMultiExonLincRNA(infile, outfile):
    '''
    load the counts for the multi and single exon lincRNA
    '''
    tablename = P.toTable(outfile.replace("/", "_")) + ".count"
    statement = '''cgat csv2db
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s
                   > %(outfile)s'''
    P.run()
def buildBenchmarkInput(infile, outfile):

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database_name"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    tmpfile.close()

    tmpfilename = tmpfile.name

    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta
    | python %(scriptsdir)s/fasta2variants.py --is-cds
    | python %(scriptsdir)s/substitute_tokens.py
             --map-tsv-file=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a fasta file,
    write a CpG content file for those intervals
    '''

    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
                  -s na -s cpg -s length
                  --log=%(outfile)s.log
    > %(outfile)s'''
    P.run()
def loadNumberExonsLengthSummaryStats(infile, outfile):
    '''
    load the table of exon counts and transcript lengths
    '''
    tablename = P.toTable(outfile.replace("/", "_")) + "_stats"
    statement = '''cgat csv2db
                   -t %(tablename)s
                   --log=%(outfile)s.log
                   < %(infile)s
                   > %(outfile)s'''
    P.run()
def importRepeatsFromUCSC(infile, outfile,
                          ucsc_database, repeattypes, genome):
    '''import repeats from a UCSC formatted file.

    The repeats are stored as a :term:`gff` formatted file.
    '''

    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    dbhandle = PipelineUCSC.connectToUCSC(
        host=PARAMS["ucsc_host"],
        user=PARAMS["ucsc_user"],
        database=ucsc_database)

    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    tmpfile = P.getTempFile(shared=True)

    total_repeats = 0
    for table in tables:
        E.info("%s: loading repeats from %s" % (ucsc_database, table))
        cc = dbhandle.execute(
            """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd,
               '.', strand, '.',
               CONCAT('class \\"', repClass, '\\"; family \\"',
                      repFamily, '\\";')
               FROM %(table)s
               WHERE repClass in ('%(repclasses)s') """ % locals())
        n = 0
        for data in cc.fetchall():
            n += 1
            tmpfile.write("\t".join(map(str, data)) + "\n")
        E.info("%s: %s=%i repeats downloaded" % (ucsc_database, table, n))
        total_repeats += n

    if total_repeats == 0:
        raise ValueError("did not find any repeats for %s" % ucsc_database)

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''cat %(tmpfilename)s
    | %(pipeline_scriptsdir)s/gff_sort pos
    | cgat gff2gff
      --method=sanitize
      --sanitize-method=genome
      --skip-missing
      --genome-file=%(genome)s
      --log=%(outfile)s.log
    | gzip
    > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename)
def mergeAndLoad(infiles, outfile, suffix):
    """load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.
    """
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """python %(scriptsdir)s/combine_tables.py
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                      %(filenames)s
                | perl -p -e "s/bin/track/"
                | python %(scriptsdir)s/table2table.py --transpose
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s
                > %(outfile)s
            """
    P.run()
def publish_report():
    '''publish report in the CGAT downloads directory.'''
    E.info("publishing report")
    P.publish_report()
def loadConsensusMetrics(infile, outfile):
    P.load(infile, outfile)


def loadConsensusClustering(infile, outfile):
    P.load(infile, outfile)


def loadClusterEigengenes(infile, outfile):
    '''
    Load module eigengene expression profiles into DB
    '''
    P.load(infile, outfile)


def loadMatchClusterExpression(infile, outfile):
    P.load(infile, outfile)
import itertools
import os
import re
import sqlite3
import glob
import pandas as pd
import rpy2.robjects as ro
import CGAT.Experiment as E
import CGAT.Timeseries as Timeseries
import CGATPipelines.PipelineTracks as PipelineTracks

###################################################
# Pipeline configuration
# load options from the config file
import CGATPipelines.Pipeline as P

P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini"])

PARAMS = P.PARAMS

###################################################################
# Helper functions mapping tracks to conditions, etc
GENESETS = PipelineTracks.Tracks(PipelineTracks.Sample).loadFromDirectory(
    glob.glob("*.gtf.gz"), "(\S+).gtf.gz")
TRACKS3 = PipelineTracks.Tracks(PipelineTracks.Sample3)
TRACKS = TRACKS3.loadFromDirectory(glob.glob("*.bam"), "(\S+).bam")
REPLICATE = PipelineTracks.Aggregate(TRACKS, labels=("replicate", ))
TIME = PipelineTracks.Aggregate(TRACKS, labels=("condition", "tissue"))


def connect():
def matchBgSequenceComposition(gc_load_files,
                               background,
                               foreground,
                               fasta_file,
                               outfile,
                               database="csvdb",
                               header_line=True,
                               bg_stat="pCpG",
                               stat="fisher"):
    '''
    take the background set and subset it for intervals with a
    sequence composition distribution that is the same as the
    foreground set.

    Subsetting is done without replacement. This requires that the
    background set is sufficiently large; if the returned
    matched_background set is <90% of the size of the foreground set,
    the pipeline will crash.
    '''

    background_file = open(background)
    foreground_file = open(foreground)
    if header_line:
        background_file.readline()
        foreground_file.readline()

    background_set = set()
    foreground_set = set()
    for interval_id in background_file.readlines():
        background_set.add(interval_id[:-1])
    for interval_id in foreground_file.readlines():
        foreground_set.add(interval_id[:-1])

    dbh = sqlite3.connect(database)
    cc = dbh.cursor()

    tablenames = [filenameToTablename(P.snip(os.path.basename(x), ".load"))
                  for x in gc_load_files]

    # jj: cpg scores rounded to three dp.
    # background dict - key <cpg score>: val <set of gene_ids with that score>
    # foreground dict - key <gene_id>: val <cpg score>
    gc = {"background": collections.defaultdict(set), "foreground": {}}

    for tablename in tablenames:
        # MM: need to make sure `-` in filenames don't break the sql statement
        tablename = tablename.replace("-", "_")
        for data in cc.execute("""SELECT * FROM %s;""" % tablename):
            interval_id = data[3].split(" ")[0]
            cpg = data[2]
            # jj: bin background by cpg score rounded to three dp
            cpg_str = "%.3f" % cpg
            if re.search("background", tablename):
                if interval_id in background_set:
                    gc["background"][cpg_str].add(interval_id)
            elif re.search("foreground", tablename):
                if interval_id in foreground_set:
                    gc["foreground"][interval_id] = cpg_str
            else:
                raise ValueError("Unrecognized table name %s. Should contain"
                                 " 'foreground' or 'background'" % tablename)

    # debug: pickle and dump the gc dict
    pickle_file = P.snip(foreground, ".foreground.tsv") + ".p"
    pickle.dump(gc, open(pickle_file, "wb"))

    # match the background set to the foreground set by taking a random
    # background interval with the same sequence composition as each
    # foreground interval.
    outf = open(outfile, "w")
    if header_line:
        outf.write("gene_id\n")

    # jj: sample background gene_ids without replacement
    matched_background = set()
    X = 0
    for interval, cpg in gc["foreground"].iteritems():
        if cpg in gc["background"]:
            # get set of bg gene_ids with relevant cpg score,
            # excluding any foreground genes
            bg_gene_ids = gc["background"][cpg] - foreground_set
            if bg_gene_ids:
                # select one gene_id to add to matched_background
                bg_id = random.sample(bg_gene_ids, 1)[0]
                matched_background.add(bg_id)
                # remove selected background gene_id from set
                gc["background"][cpg].remove(bg_id)
            else:
                X += 1
                E.warn("Missing background gene for %s %s, no gene with"
                       " matching %s" %
                       (foreground_file.name, interval, bg_stat))
        else:
            X += 1
            E.warn("Missing background gene for %s %s, no gene with"
                   " matching %s" %
                   (foreground_file.name, interval, bg_stat))

    # MM: only need to check sufficient background size for Fisher's exact
    # test: the matched background may be at most 10% smaller than the
    # foreground
    if stat == "fisher":
        assert len(matched_background) > 0.9 * len(foreground_set), (
            "There are insufficient genes with matched background to perform"
            " test for sample %s" % foreground_file.name)

    E.info("Number of genes with no available background: %i" % X)
    E.info("Foreground set: %i" % len(foreground_set))
    E.info("Background set: %i" % len(matched_background))

    outf.write("\n".join(matched_background) + "\n")
    outf.close()
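# A minimal, self-contained sketch of the matching strategy used in
# matchBgSequenceComposition: foreground intervals are binned by a rounded
# composition score and, for each one, a background interval with the same
# binned score is drawn without replacement. Identifiers and scores below
# are invented for illustration.


def match_background_sketch(foreground, background):
    '''foreground: dict of id -> binned score;
    background: dict of id -> binned score.
    Returns a background set matched on the binned score,
    sampled without replacement.'''
    import collections
    import random

    by_score = collections.defaultdict(set)
    for interval_id, score in background.items():
        by_score[score].add(interval_id)

    matched = set()
    for interval_id, score in foreground.items():
        # never match a foreground interval to another foreground interval
        candidates = by_score[score] - set(foreground)
        if candidates:
            choice = random.sample(sorted(candidates), 1)[0]
            matched.add(choice)
            # remove the chosen id so it cannot be drawn again
            by_score[score].remove(choice)
    return matched


# example: each foreground gene finds a distinct background partner
# print(match_background_sketch({"fg1": "0.450", "fg2": "0.500"},
#                               {"bg1": "0.450", "bg2": "0.500",
#                                "bg3": "0.500"}))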
Code
====

"""

from ruffus import *

import sys
import os
import sqlite3
import CGAT.Experiment as E
import CGATPipelines.Pipeline as P

# load options from the config file
PARAMS = P.getParameters(
    ["%s/pipeline.ini" % os.path.splitext(__file__)[0],
     "../pipeline.ini",
     "pipeline.ini"])

# add configuration values from associated pipelines
#
# 1. pipeline_annotations: any parameters will be added with the
#    prefix "annotations_". The interface will be updated with
#    "annotations_dir" to point to the absolute path names.
PARAMS.update(P.peekParameters(
    PARAMS["annotations_dir"],
    "pipeline_annotations.py",
    on_error_raise=__name__ == "__main__",
    prefix="annotations_",
    update_interface=True))

# if necessary, update the PARAMS dictionary in any modules file.
def update_report():
    '''update report.'''
    E.info("updating report")
    P.run_report(clean=False)


def loadWordCounts(infile, outfile):
    '''load results of word counting into database.'''
    P.load(infile, outfile, "--add-index=word")


def loadCpGIslands(infile, outfile):
    '''load CpG Islands information.'''
    P.load(infile, outfile, "--add-index=transcript_id")


def publish_report():
    '''publish report.'''
    E.info("publishing report")
    P.publish_report()
def findTATABox(infiles, outfile):
    '''find TATA box in promoters. There are several matrices to
    choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file - look for TATA box
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | python %(scriptsdir)s/bed2fasta.py
           --use-strand
           --genome=%(genome_dir)s/%(genome)s
           --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''
    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
 MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with IOTools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
    %(match_executable)s
    %(match_matrix)s
    %(outfile)s.fasta
    %(outfile)s.match
    %(match_profile)s -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(IOTools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            "(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity"
        " sequence")

    def _grouper(infile):
        r = []
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                r = []
                pid = re.match(
                    "Inspecting sequence ID\s+(\S+)", line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if line[:-1].strip() == "":
                continue
            transfac_id, v, core_similarity, matrix_similarity, sequence = \
                [x.strip() for x in line[:-1].split("|")]
            pos, strand = re.match("(\d+) \((\S)\)", v).groups()
            r.append(MATCH._make((pid, transfac_id, int(pos), strand,
                                  float(core_similarity),
                                  float(matrix_similarity),
                                  sequence)))

        yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = IOTools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand",
                          "start", "end",
                          "relative_start", "relative_end",
                          "transfac_id",
                          "core_similarity",
                          "matrix_similarity",
                          "sequence")) + "\n")

    bedf = IOTools.openFile(outfile, "w")

    c = E.Counter()
    found = set()
    for transcript_id, matches in _grouper(
            IOTools.openFile(outfile + ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(map(str, (
                transcript_id, strand,
                genome_start, genome_end,
                relative_start, relative_end,
                match.transfac_id,
                match.core_similarity,
                match.matrix_similarity,
                match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(map(str, (
                contig, genome_start, genome_end,
                transcript_id, strand,
                match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
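# A standalone sketch of the strand-dependent arithmetic used in findTATABox
# to place a motif match back onto the genome: on '+' the match position
# counts forward from the extracted sequence start, on '-' it counts
# backwards from the sequence end. Coordinates below are invented for
# illustration.


def _match_to_genome_sketch(seq_start, seq_end, strand, match_pos, match_len):
    '''convert a match position within an extracted (stranded) sequence
    into genomic coordinates.'''
    if strand == "+":
        genome_start = seq_start + match_pos
    else:
        genome_start = seq_end - match_pos - match_len
    return genome_start, genome_start + match_len


# a 6 bp match at offset 10 within a promoter spanning 1000-1300:
assert _match_to_genome_sketch(1000, 1300, "+", 10, 6) == (1010, 1016)
assert _match_to_genome_sketch(1000, 1300, "-", 10, 6) == (1284, 1290)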
def build_report():
    '''build report from scratch.'''
    E.info("starting report build process from scratch")
    P.run_report(clean=True)
                        'pipeline_docs', 'themes')
logopath = os.path.join(themedir, "cgat_logo.png")

################################################################
# Import pipeline configuration from pipeline.ini in the current
# directory and the common one.

# PATH where code for pipelines is stored
pipelinesdir = os.path.dirname(CGATPipelines.__file__)

# The default configuration file - 'inifile' is read by
# sphinx-report.
inifile = os.path.join(os.path.dirname(CGATPipelines.__file__),
                       'configuration',
                       'pipeline.ini')

PARAMS = P.getParameters([inifile, "pipeline.ini"])

# Definition now part of CGATReport
# def setup(app):
#     app.add_config_value('PARAMS', {}, True)

################################################################
# The pipeline assumes that sphinxreport is called within the
# working directory. If the report is in a separate build directory,
# change the paths below.
#
# directory with export directory from pipeline
# This should be a directory in the build directory - you can
# link from here to a directory outside the build tree, though.
def loadTATABox(infile, outfile):
    '''load TATA box information.'''
    P.load(infile + ".table.gz",
           outfile,
           "--add-index=transcript_id")
def loadHypergeometricAnalysis(infile, outfile):
    '''load GO results.'''

    track = P.toTable(outfile)
    tablename = 'hypergeometric_%s_summary' % track
    P.load(infile, outfile, tablename=tablename)

    dbh = connect()
    ontologies = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT ontology FROM %s''' % tablename).fetchall()]

    genelists = [x[0] for x in Database.executewait(
        dbh,
        '''SELECT DISTINCT genelist FROM %s''' % tablename).fetchall()]

    # output files from runGO.py
    sections = ('results', 'parameters', 'withgenes')

    for section in sections:
        tablename = 'hypergeometric_%s_%s' % (track, section)
        load_statement = P.build_load_statement(tablename=tablename)
        statement = '''
        python %(scriptsdir)s/combine_tables.py
        --cat=track
        --regex-filename="hypergeometric.dir/%(track)s.tsv.dir/(\S+).%(section)s"
        hypergeometric.dir/%(track)s.tsv.dir/*.%(section)s
        | %(load_statement)s
        >> %(outfile)s'''
        P.run()

    for ontology in ontologies:

        fn = os.path.join(infile + ".dir", "all_alldesc.%s.l2fold" % ontology)

        if not os.path.exists(fn):
            E.warn("file %s does not exist" % fn)
            continue

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l2fold' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10pvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10pvalue' % (track, ontology),
               options='--allow-empty-file')

        fn = os.path.join(
            infile + ".dir", "all_alldesc.%s.l10qvalue" % ontology)

        P.load(fn,
               outfile,
               tablename='hypergeometric_%s_%s_l10qvalue' % (track, ontology),
               options='--allow-empty-file')
import os

import CGAT.Experiment as E
import CGAT.IOTools as IOTools
import CGAT.FastaIterator as FastaIterator
import CGAT.Bed as Bed
import CGATPipelines.PipelineGeneset as PipelineGeneset

###################################################
# Pipeline configuration
###################################################
# load options from the config file
import CGATPipelines.Pipeline as P

P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0],
    "../pipeline.ini",
    "pipeline.ini"])

PARAMS = P.PARAMS

PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

PipelineGeneset.PARAMS = PARAMS

###################################################################


def connect():
    '''connect to database.
def loadGeneListMatrix(infile, outfile):
    '''load gene list matrix into table.'''
    track = P.snip(infile, ".tsv.gz")
    P.load(infile,
           outfile,
           tablename="%s_foreground" % track)
    P.load(infile + ".bg.tsv.gz",
           outfile,
           tablename="%s_background" % track)
def loadHypergeometricResultsSummary(infiles, outfile):
    '''load GO summary results.'''
    infiles = glob.glob("hypergeometric.dir/*/*.parameters")
    P.mergeAndLoad(infiles, outfile)
def loadGeneLists(infile, outfile):
    '''load gene list data into database.'''
    P.load(infile, outfile,
           tablename="genelist_%s" % P.toTable(outfile))


def loadPathways(infile, outfile):
    '''load pathway information into database.'''
    P.load(infile, outfile,
           "--add-index=gene_id --add-index=go_id")


def convertBedGraph(infile, outfile):
    '''convert bedgraph to bigwig.'''
    contig_file = os.path.join(PARAMS["annotations_dir"], "contigs.tsv")
    statement = ("bedGraphToBigWig %(infile)s %(contig_file)s %(outfile)s")
    P.run()
def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by applying thresholds
    to the input data set. The thresholds are defined in the
    configuration file.
    '''

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        genelist = pandas.read_csv(
            IOTools.openFile(infile),
            index_col=0,
            sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        field = PARAMS[P.matchParameter("%s_foreground_field" % track)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_foreground_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_foreground_max_threshold" % track)]
        genesets.append(set(genelist[
            (genelist[field] >= min_threshold) &
            (genelist[field] <= max_threshold)].index))

        E.info('%s: foreground: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))

        field = PARAMS[P.matchParameter("%s_background_field" % track)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_background_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_background_max_threshold" % track)]

        E.info('%s: background: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))

        backgrounds.append(set(genelist[
            (genelist[field] >= min_threshold) &
            (genelist[field] <= max_threshold)].index))

        E.info("%s: fg=%i, bg=%i" %
               (track, len(genesets[-1]), len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
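# A compact sketch of the thresholding performed in buildGeneListMatrix on a
# toy data frame; the column name and cut-offs below are invented and simply
# mirror the "<track>_foreground_field" / min / max values read from the
# configuration file.
#
#   import pandas
#
#   genelist = pandas.DataFrame(
#       {"l2fold": [-2.0, 0.1, 1.5, 3.2]},
#       index=["geneA", "geneB", "geneC", "geneD"])
#   field, min_threshold, max_threshold = "l2fold", 1.0, 10.0
#   foreground = set(genelist[(genelist[field] >= min_threshold) &
#                             (genelist[field] <= max_threshold)].index)
#   # foreground == set(["geneC", "geneD"])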
def loadNPeaksForPooledPseudoreplicates(infile, outfile):
    P.load(infile, outfile)


def publish():
    '''publish report and data.'''
    E.info("publishing report")
    P.publish_report()


def loadIDROnPooledPseudoreplicates(infile, outfile):
    P.load(infile, outfile)
def generatePeakSets(infile, outfiles):
    outf_con, outf_opt = outfiles

    # retrieve maximum number of peaks obtained from inter-replicate IDR
    # (table created by loadNPeaksForIndividualReplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " max(n_peaks) AS nPeaks"
                 " FROM individual_replicates_nPeaks"
                 " GROUP BY experiment")
    df = Database.fetch_DataFrame(statement,
                                  dbhandle=PARAMS['database_name'])
    # reassign experiment as index
    df = df.set_index("Experiment")

    # retrieve number of peaks obtained from pooled_pseudoreplicate IDR
    # (table created by loadNPeaksForPooledPseudoreplicates)
    statement = ("SELECT"
                 " Experiment,"
                 " n_peaks AS nPeaks"
                 " FROM pooled_pseudoreplicates_nPeaks")
    df2 = Database.fetch_DataFrame(statement,
                                   dbhandle=PARAMS['database_name'])
    # reassign experiment as index
    df2 = df2.set_index("Experiment")

    # split the infile name to obtain experiment
    sample_id = os.path.basename(infile).split("_VS_")[0]
    sample = sample_id.split("-")
    experiment = "_".join([sample[0], sample[1]])

    # retrieve max_numPeaks for experiment
    nPeaks = int(df.loc[experiment])
    # retrieve numPeaks_Rep0 for experiment
    nPeaks_rep0 = int(df2.loc[experiment])
    # retrieve maximum of the two
    nPeaks_max = max(nPeaks, nPeaks_rep0)

    # establish which column to sort by
    if PARAMS["idr_options_ranking_measure"] == "signal.value":
        sort_statement = "sort -k7nr,7nr"
    elif PARAMS["idr_options_ranking_measure"] == "p.value":
        sort_statement = "sort -k8nr,8nr"
    elif PARAMS["idr_options_ranking_measure"] == "q.value":
        sort_statement = "sort -k9nr,9nr"
    else:
        raise ValueError("Unrecognised ranking_measure %s:"
                         " don't know which column to sort on" %
                         PARAMS["idr_options_ranking_measure"])

    # sort infile by column and write top nPeaks to outfile (conservative)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks)s |"
                 " gzip > %(outf_con)s")
    P.run()

    # sort infile by column and write top nPeaks_max to outfile (optimum)
    ignore_pipe_errors = True
    statement = ("zcat %(infile)s |"
                 " %(sort_statement)s |"
                 " head -%(nPeaks_max)s |"
                 " gzip > %(outf_opt)s")
    P.run()
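# For reference: the sort columns above follow the narrowPeak convention,
# where column 7 is signalValue, column 8 is -log10(p-value) and column 9 is
# -log10(q-value). A table-driven sketch equivalent in effect to the
# dispatch above:


NARROWPEAK_SORT = {
    "signal.value": "sort -k7nr,7nr",
    "p.value": "sort -k8nr,8nr",
    "q.value": "sort -k9nr,9nr",
}


def _get_sort_statement_sketch(measure):
    '''return the sort command for a ranking measure, as above.'''
    try:
        return NARROWPEAK_SORT[measure]
    except KeyError:
        raise ValueError("Unrecognised ranking_measure %s" % measure)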
def loadIDROnIndividualReplicates(infile, outfile):
    P.load(infile, outfile)


def loadNPeaksForIndividualReplicates(infile, outfile):
    P.load(infile, outfile)