Code example #1
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, removes any whose
    names match the supplied regular expression, converts the rest to a
    single maf file using axtToMaf, and filters out maf alignments
    under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_dir"],
                                  PARAMS["annotations_interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS["annotations_interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then convert to a single maf file
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
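A convention worth spelling out, since every snippet on this page relies on it: `P.run()` is called with no arguments and instead picks up the variable named `statement` from the calling function, interpolating `%(name)s` placeholders from the caller's locals and the global `PARAMS` dictionary (job-control variables such as `to_cluster`, `job_memory` and `job_threads` are picked up the same way). A minimal sketch of that mechanism, assuming a plain `subprocess` backend rather than CGAT's cluster submission, logging and error handling:

```python
import inspect
import subprocess

PARAMS = {"genome": "hg38"}  # hypothetical global parameter dictionary


def run():
    # Minimal sketch of CGAT's P.run(): fetch 'statement' from the
    # caller's frame and fill %(name)s placeholders from PARAMS plus
    # the caller's locals (locals take precedence).
    caller = inspect.currentframe().f_back
    namespace = dict(PARAMS)
    namespace.update(caller.f_locals)
    statement = namespace["statement"] % namespace
    subprocess.check_call(statement, shell=True)


def example(infile, outfile):
    statement = "gunzip < %(infile)s > %(outfile)s"
    run()  # finds 'statement', 'infile' and 'outfile' in this frame
```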
Code example #2
def convertPslToChain(infile, outfile):
    '''convert a psl to a chain file.

    see http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver
    '''

    to_cluster = True

    target, query = extractGenomes(infile)

    tmpfilename1 = P.getTempFilename(".")
    tmpfilename2 = P.getTempFilename(".")

    writeContigSizes(target, tmpfilename1)
    writeContigSizes(query, tmpfilename2)

    statement = '''gunzip
    < %(infile)s
    | pslSwap stdin stdout
    | cgat psl2chain --log=%(outfile)s.log
    | chainSort stdin stdout
    | gzip
    > %(outfile)s.sorted.chain.gz;
    checkpoint;
    gunzip < %(outfile)s.sorted.chain.gz
    | chainNet stdin %(tmpfilename1)s %(tmpfilename2)s stdout /dev/null
    | netChainSubset stdin <( zcat %(outfile)s.sorted.chain.gz ) stdout
    | gzip
    > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename1)
    os.unlink(tmpfilename2)
Code example #3
def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Receives a MAF file containing pairwise alignments and a bed12 file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)
Code example #4
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
        '''
        P.run()

    os.unlink(tmpfasta)
Code example #5
def loadSleuthTable(infile, outfile, transcript_info, gene_biotypes,
                    database, annotations_database):

    tmpfile = P.getTempFilename("/ifs/scratch/")

    table = os.path.basename(transcript_info)

    if gene_biotypes:
        where_cmd = "WHERE " + " OR ".join(
            ["gene_biotype = '%s'" % x
             for x in gene_biotypes.split(",")])
    else:
        where_cmd = ""

    select = """SELECT DISTINCT
    transcript_id, transcript_biotype, gene_id, gene_name
    FROM annotations.%(table)s
    %(where_cmd)s""" % locals()

    df1 = pd.read_table(infile, sep="\t")
    df1.set_index("transcript_id", drop=True, inplace=True)

    df2 = pd.read_sql(select, connect(database, annotations_database))
    df2.set_index("transcript_id", drop=False, inplace=True)

    df = df1.join(df2)
    df.to_csv(tmpfile, sep="\t", index=True)

    options = "--add-index=transcript_id"
    P.load(tmpfile, outfile, options=options)
    os.unlink(tmpfile)
Code example #6
def mapReadsWithBowtie(infiles, outfile):
    """map reads with bowtie"""

    inifile, infile = infiles

    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    statement = """
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --threads %(bowtie_threads)s
           %(bowtie_options)s
           %(bowtie_genome_dir)s/%(genome)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py --output-sam --method=set-nh --log=%(outfile)s.log
    | gzip
    > %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    """

    P.run()
Code example #7
def testMotifDisruptingSnpsNotEnriched(infiles, outfile):
    '''
    test SNPs for motif disrupting effects using
    motifbreakR
    '''

    infile = infiles[0]
    annot_file = infiles[1]
    job_memory = "6G"

    tmp = P.getTempFilename(shared=True)

    statement = '''
    comm -23 %(infile)s <(zcat %(annot_file)s | cut -f 4 | sort)
    > %(tmp)s; checkpoint;
    python /ifs/devel/projects/proj045/enrichment_pipeline/snps2motif.py
    --log=%(outfile)s.log
    --snp-column=0
    --R-scripts-directory=%(r_scripts)s
    --R-script=%(motifs_script)s
    --additional-motif=%(motifs_pwms)s
    --image-directory=plots.dir
    %(tmp)s
    > %(outfile)s; checkpoint;
    rm -f %(tmp)s'''

    P.run()
Code example #8
def buildGff(infile, outfile):
    '''Creates a gff for DEXSeq

    This takes the gtf and flattens it to the exon-based input
    required by DEXSeq. The required python script is provided by DEXSeq
    and uses HTSeq.

    Parameters
    ----------

    infile : string
       Input filename in :term:`gtf` format

    outfile : string
        A :term:`gff` file for use in DEXSeq

    annotations_interface_geneset_all_gtf : string
       :term:`PARAMS`. Filename of :term:`gtf` file containing
       all ensembl annotations
    '''

    tmpgff = P.getTempFilename(".")
    statement = "gunzip -c %(infile)s > %(tmpgff)s"
    P.run()

    ps = PYTHONSCRIPTSDIR
    statement = '''python %(ps)s/dexseq_prepare_annotation.py
                %(tmpgff)s %(outfile)s'''
    P.run()

    os.unlink(tmpgff)
Code example #9
def loadRepeatInformation(infiles, outfile):
    '''load repeat information.'''

    to_cluster = True

    table = outfile[:-len(".load")]

    repeatsfile, indexfile = infiles

    tmpfilename = P.getTempFilename(".")

    statement = '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}' < %(indexfile)s > %(tmpfilename)s'''
    P.run()

    statement = '''
    gunzip < %(repeatsfile)s
    | cgat gff2bed -v 0
    | coverageBed -a stdin -b %(tmpfilename)s
    | awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}'
    | cgat csv2db %(csv2db_options)s
    --table=%(table)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
Code example #10
def testMotifDisruptingSnpsEnrichedAnnotations(infile, outfile):
    '''
    test SNPs for motif disrupting effects using
    motifbreakR
    '''

    job_memory = "6G"

    tmp = P.getTempFilename(shared=True)

    statement = '''
    zcat %(infile)s
    | grep -P %(annotations_regex)s
    | grep -P %(annotations_cell_regex)s
    > %(tmp)s; checkpoint;
    python /ifs/devel/projects/proj045/enrichment_pipeline/snps2motif.py
    --log=%(outfile)s.log
    --snp-column=3
    --R-scripts-directory=%(r_scripts)s
    --R-script=%(motifs_script)s
    --additional-motif=%(motifs_pwms)s
    --image-directory=plots.dir
    %(tmp)s
    > %(outfile)s; checkpoint;
    rm -f %(tmp)s'''

    P.run()
Code example #11
def aggregateWindowsReadCounts(infiles,
                               outfile,
                               regex="(.*)\..*"):
    '''aggregate several results from coverageBed
    into a single file.

    *regex* is used to extract the track name from the filename.
    The default removes any suffix.

    coverageBed outputs the following columns:
    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    For bed4: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Windows without any counts will not be output.
    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join(['''<( zcat %s |
              awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' %
                    (x, column) for x in infiles])
    tmpfile = P.getTempFilename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    # build track names
    tracks = [re.search(regex, os.path.basename(x)).groups()[0]
              for x in infiles]

    outf = IOTools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))

    outf.close()

    os.unlink(tmpfile)
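The `paste` trick lines the per-track counts up by row order, and the Python loop then checks that every column agrees on the interval id. For clarity, here is a shell-free sketch of the same aggregation; `aggregate_counts` is a hypothetical helper and assumes each input is an uncompressed two-column "interval_id<TAB>count" file, i.e. the output of the awk step above:

```python
import csv


def aggregate_counts(infiles, tracks, outfile):
    # read each per-track "interval_id<TAB>count" file ...
    columns = []
    for fn in infiles:
        with open(fn) as inf:
            columns.append([(row[0], int(row[1]))
                            for row in csv.reader(inf, delimiter="\t")])

    with open(outfile, "w") as outf:
        outf.write("interval_id\t%s\n" % "\t".join(tracks))
        # ... then walk the files in parallel, as paste does
        for rows in zip(*columns):
            ids = set(interval_id for interval_id, count in rows)
            assert len(ids) == 1, "inputs not in the same order: %s" % ids
            values = [count for interval_id, count in rows]
            if sum(values) == 0:
                continue  # windows without any counts are not output
            outf.write("%s\t%s\n" %
                       (ids.pop(), "\t".join(map(str, values))))
```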
Code example #12
def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''

    if len(infiles) == 1:

        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run()

    else:

        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(infiles[0]):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            if IOTools.isEmpty(fn):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)
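The incremental `mergeBed`/`intersectBed` loop in the last branch is a reduce over the input files: normalize each file (merge its overlapping intervals), then repeatedly keep only those intervals of the running result that overlap the next file. An illustrative, self-contained sketch of that logic on plain `(contig, start, end)` tuples, not a faithful reproduction of bedtools in every corner case:

```python
from functools import reduce


def merge_overlaps(intervals):
    # normalize one file: merge overlapping intervals (mergeBed)
    merged = []
    for contig, start, end in sorted(intervals):
        if merged and merged[-1][0] == contig and start <= merged[-1][2]:
            merged[-1] = (contig, merged[-1][1], max(merged[-1][2], end))
        else:
            merged.append((contig, start, end))
    return merged


def intersect_u(a, b):
    # keep intervals of `a` overlapping at least one interval of `b`,
    # like ``intersectBed -u``; coordinates of `a` are retained
    def overlaps(x, y):
        return x[0] == y[0] and x[1] < y[2] and y[1] < x[2]
    return [x for x in a if any(overlaps(x, y) for y in b)]


def intersect_all(interval_sets):
    # reduce: coordinates come from the first set, as the docstring says
    return reduce(intersect_u, (merge_overlaps(s) for s in interval_sets))


print(intersect_all([
    [("chr1", 0, 100), ("chr1", 200, 300)],
    [("chr1", 50, 60)],
]))  # -> [('chr1', 0, 100)]
```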
Code example #13
    def mergeSingleExpressionTables(infile, outfile):
        '''
        Merge refcoding and lncRNA count tables from a single condition
        if there are separate input reference gtfs.
        '''

        file1 = infile[0]
        file2 = infile[1]

        tmpfile = P.getTempFilename(shared=True)

        df1 = pd.read_table(file1,
                            sep="\t",
                            index_col=0,
                            header=0,
                            compression="gzip")

        df2 = pd.read_table(file2,
                            sep="\t",
                            index_col=0,
                            header=0,
                            compression="gzip")

        out_frame = df1.append(df2)

        out_frame.to_csv(tmpfile, sep="\t")

        statement = '''cat %(tmpfile)s | gzip > %(outfile)s; rm -rf %(tmpfile)s'''

        P.run()
Code example #14
def buildReferenceGeneSet(infile, outfile):
    """ filter full gene set and add attributes to create the reference gene set

    Performs merge and filter operations:
       * Merge exons separated by small introns (< 5bp).
       * Remove transcripts with very long introns (`max_intron_size`)
       * Remove transcripts located on contigs to be ignored (`remove_contigs`)
         (usually: chrM, _random, ...)
       * (Optional) Remove transcripts overlapping repetitive sequences
         (`rna_file`)

    This preserves all features in a gtf file (exon, CDS, ...)

    Runs cuffcompare with `infile` against itself to add
    attributes such as p_id and tss_id.

    Parameters
    ----------
    infile : str
       Input filename in :term:`gtf` format
    outfile : str
       Output filename in :term:`gtf` format
    annotations_interface_rna_gff : str
       :term:`PARAMS`. Filename of :term:`gtf` file containing
       repetitive rna annotations
    genome_dir : str
       :term:`PARAMS`. Directory of :term:`fasta` formatted files
    genome : str
       :term:`PARAMS`. Genome name (e.g hg38)
    """

    tmp_mergedfiltered = P.getTempFilename(".")

    if "geneset_remove_repetetive_rna" in PARAMS:
        rna_file = PARAMS["annotations_interface_rna_gff"]
    else:
        rna_file = None

    gene_ids = PipelineMapping.mergeAndFilterGTF(
        infile,
        tmp_mergedfiltered,
        "%s.removed.gz" % outfile,
        genome=os.path.join(PARAMS["genome_dir"], PARAMS["genome"]),
        max_intron_size=PARAMS["max_intron_size"],
        remove_contigs=PARAMS["geneset_remove_contigs"],
        rna_file=rna_file,
    )

    # Add tss_id and p_id
    PipelineMapping.resetGTFAttributes(
        infile=tmp_mergedfiltered,
        genome=os.path.join(PARAMS["genome_dir"], PARAMS["genome"]),
        gene_ids=gene_ids,
        outfile=outfile,
    )

    os.unlink(tmp_mergedfiltered)
Code example #15
def prepareBAMs(infile, outfile):
    '''filter bam files for medip-seq analysis.

    Optional steps include:

    * deduplication - remove duplicate reads
    * quality score filtering - remove reads below a certain quality score.

    '''
    to_cluster = True
    track = P.snip(outfile, ".bam")

    tmpdir = P.getTempFilename()

    current_file = infile

    nfiles = 0
    statement = ["mkdir %(tmpdir)s"]

    if "filtering_quality" in PARAMS and PARAMS["filtering_quality"] > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append( '''samtools view -q %%(filtering_quality)i -b 
                             %(current_file)s 
                             2>> %%(outfile)s.log 
                             > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if "filtering_dedup" in PARAMS and PARAMS["filtering_dedup"]:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        dedup_method = PARAMS["filtering_dedup_method"]

        if dedup_method == 'samtools':
            statement.append('''samtools rmdup %(current_file)s %(next_file)s
                                2>> %%(outfile)s.log ''' % locals())

        elif dedup_method == 'picard':
            statement.append('''MarkDuplicates INPUT=%(current_file)s
                                               OUTPUT=%(next_file)s
                                               ASSUME_SORTED=true 
                                               METRICS_FILE=%(outfile)s.duplicate_metrics
                                               REMOVE_DUPLICATES=TRUE 
                                               VALIDATION_STRINGENCY=SILENT
                                               2>> %%(outfile)s.log ''' % locals() )
        nfiles += 1
        current_file = next_file

    statement.append("mv %%(current_file)s %(outfile)s" % locals())
    statement.append("rm -rf %(tmpdir)s")
    statement.append("samtools index %(outfile)s")

    statement = " ; ".join(statement)

    P.run()

Code example #16
def download(self, genes=None, fields=None, scope=None, species=None):
    '''
    download an up-to-date ontology file, parse the xml data into a
    Python "ElementTree" and delete the ontology file.
    '''
    ontologyfile = P.getTempFilename(".")
    os.system("wget -O %s %s" % (ontologyfile, self.datasource))
    tree = ET.parse(ontologyfile)
    os.remove(ontologyfile)
    self.dataset = tree
Code example #17
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    ''' Make a BAM file suitable for peak calling.

        Infiles are merged and unmapped reads removed.

        If specified, duplicate reads are removed.
        This method uses Picard.

        If a mask is specified, reads falling within
        the mask are filtered out. 

        This uses bedtools.

        The mask is a bed file containing
        the regions from which reads are to be excluded.
    '''

    # open the infiles, if more than one merge and sort first using samtools.

    samfiles = []
    num_reads = 0
    nfiles = 0

    statement = []

    tmpfile = P.getTempFilename(".")

    if len(infiles) > 1 and not isinstance(infiles, str):
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        statement.append('''samtools merge @OUT@ %s''' % (" ".join(infiles)))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        statement.append('''MarkDuplicates
        INPUT=@IN@
        ASSUME_SORTED=true
        REMOVE_DUPLICATES=true
        QUIET=true
        OUTPUT=@OUT@
        METRICS_FILE=%(outfile)s.picardmetrics
        VALIDATION_STRINGENCY=SILENT
        > %(outfile)s.picardlog ''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    statement = P.joinStatements(statement, infiles)
    P.run()
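`P.joinStatements()` chains the collected fragments by rewriting the `@IN@`/`@OUT@` placeholders: the first `@IN@` is bound to the input files and each subsequent `@IN@` to the previous step's `@OUT@`, routed through intermediate temporary files. A rough sketch of that expansion, hedged; the real CGAT helper differs in temp-file naming and cleanup:

```python
import tempfile


def join_statements(statements, infile):
    # Chain shell fragments: @IN@ of each step is the @OUT@ of the
    # previous one; steps without @OUT@ (e.g. 'mv @IN@ ...') pass the
    # current file through unchanged.
    chained = []
    current = infile
    for stmt in statements:
        if "@OUT@" in stmt:
            nxt = tempfile.mktemp(suffix=".chain")  # sketch only
            stmt = stmt.replace("@OUT@", nxt)
        else:
            nxt = current
        stmt = stmt.replace("@IN@", current)
        chained.append(stmt)
        current = nxt
    return "; ".join(chained)
```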
Code example #18
File: PipelineiCLIP.py Project: sudlab/iCLIPlib
def clustersToBigBed(infile, genome_file, outfile):
    '''Convert beds to bigbed '''

    checkParams()

    tmp = P.getTempFilename()

    statement = ''' zcat %(infile)s | sort -k1,1 -k2,2n 
                    | awk 'BEGIN{OFS="\\t"} $5=1' > %(tmp)s;
                    checkpoint;
                    bedToBigBed %(tmp)s %(genome_file)s %(outfile)s;
                    checkpoint;
                    rm %(tmp)s'''
    P.run()
Code example #19
def loadManualAnnotations(infile, outfile):

    tmp = P.getTempFilename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with IOTools.openFile(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with IOTools.openFile(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)
Code example #20
        def aggregateAdaptors(infiles, outfile):
            '''
            Collate fasta files into a single contaminants file for
            adapter removal.
            '''
            tempfile = P.getTempFilename()
            infiles = " ".join(infiles)

            statement = """
            cat %(infiles)s | fastx_reverse_complement > %(tempfile)s;
            cat %(tempfile)s %(infiles)s | fastx_collapser > %(outfile)s;
            rm -f %(tempfile)s
            """
            P.run()
Code example #21
def mapReadsWithTophat(infiles, outfile):
    """map reads with tophat

    """
    inifile, infile = infiles

    local_params = P.loadParameters(inifile)

    job_options = "-l mem_free=16G"
    job_threads = PARAMS["tophat_threads"]

    tmpfile = P.getTempFilename(".")

    # qualfile = P.snip(infile, "csfasta.gz" ) + "qual.gz"
    """
    gunzip < %(infile)s > %(tmpfile)s.csfasta;
    checkpoint;
    gunzip < %(qualfile)s > %(tmpfile)s.qual;
    checkpoint;
    """

    statement = """
    zcat %(infile)s 
    | python %(scriptsdir)s/fastq2solid.py 
           --method=change-format --target-format=integer
           --pattern-identifier="%(tmpfile)s.%%s" >& %(outfile)s.log;
    checkpoint;
    tophat --output-dir %(outfile)s.dir                    
           --num-threads %(tophat_threads)s  
           --library-type %(tophat_library_type)s
           --color
           --quals
           --integer-quals
           %(tophat_options)s
           %(tophat_genome_dir)s/%(genome)s_cs
           %(tmpfile)s.csfasta %(tmpfile)s.qual
           >> %(outfile)s.log;
    checkpoint;
    mv %(outfile)s.dir/accepted_hits.bam %(outfile)s;
    checkpoint;
    samtools index %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s.csfasta %(tmpfile)s.qual
    """

    # use local parameters to overwrite default ones.
    P.run(**local_params)

    os.unlink(tmpfile)
Code example #22
def buildRefFlat(infile, outfile):
    '''build flat geneset for Picard RnaSeqMetrics.
    '''

    tmpflat = P.getTempFilename(".")

    job_memory = PARAMS["job_memory"]

    statement = '''
    gtfToGenePred -genePredExt -geneNameAsName2 %(infile)s %(tmpflat)s;
    paste <(cut -f 12 %(tmpflat)s) <(cut -f 1-10 %(tmpflat)s)
    > %(outfile)s
    '''
    P.run()
    os.unlink(tmpflat)
Code example #23
def makeCytoscapeInputs(infiles, outfile):
    infile = infiles[1]
    T = P.getTempFilename(".")
    statement = """
    awk -F "\\t" '{printf("%%%%s\\t%%%%s\\t%%%%s\\t%%%%s\\t+1\\n",\
    $1, $12, $8, $9)}' %(infile)s > %(T)s""" % locals()
    P.run()
    typ = infile.split("_")[-3]
    keep = [line.strip() for line in
            IOTools.openFile(PARAMS['cytoscape_%s' % typ]).readlines()]
    tab = pd.read_csv(T, sep="\t")
    tab = tab[tab['term_id'].isin(keep)]
    tab.columns = ['ID', 'Description', 'pvalue', 'padj', 'Phenotype']
    tab.to_csv(outfile, sep="\t", index=None)
    os.remove(T)
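The quadruple percent signs in the awk call are not a typo: the statement is %-interpolated twice, once explicitly by `% locals()` and once more inside `P.run()`, and each pass halves `%%` to `%`, so `%%%%s` is what finally reaches awk's `printf` as `%s`. A quick demonstration of the two passes:

```python
# '%%%%s' -> '%%s' after the first %-interpolation -> '%s' after the
# second, which is what awk's printf needs to see.
template = 'awk \'{printf("%%%%s\\n", $1)}\' %(infile)s'
once = template % {"infile": "input.tsv"}
twice = once % {}
print(once)   # awk '{printf("%%s\n", $1)}' input.tsv
print(twice)  # awk '{printf("%s\n", $1)}' input.tsv
```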
Code example #24
def loadClusterCounts(infiles, outfile):
    '''Find the number of significant clusters found in each sample'''

    tmp = P.getTempFilename(shared=True)
    results = []
    for infile in infiles:
        count = IOTools.getNumLines(infile)
        method, track = re.match(
            "dedup_(.+).dir/(.+)\.clusters.bedgraph", infile).groups()
        results.append((method, track, count))
        
    IOTools.writeLines(tmp, results, header=["method", "track", "count"])

    P.load(tmp, outfile)
    os.unlink(tmp)
Code example #25
File: pipeline_cram2fastq.py Project: snsansom/scseq
def validateCramFiles(infile, outfiles):
    '''Validate CRAM files by exit status of
       cramtools qstat. Save the quality scores of cram files.
    '''

    outfile, outfile_quality = outfiles

    temp_quality = P.getTempFilename()
    statement = '''cramtools qstat -I %(infile)s > %(temp_quality)s;
                   echo $? > %(outfile)s;
                   cat %(temp_quality)s
                   | awk '{OFS="\\t"} {print $1,$2}'
                   > %(outfile_quality)s;
                '''
    P.run()
Code example #26
def clustersToBigBed(infile, outfile):
    '''Convert beds to bigbed '''

    checkParams()

    tmp = P.getTempFilename()
    genome_file = os.path.join(PARAMS["annotations_dir"],
                               PARAMS_ANNOTATIONS["interface_contigs_tsv"])
    statement = ''' zcat %(infile)s | sort -k1,1 -k2,2n 
                    | awk 'BEGIN{OFS="\\t"} $5=1' > %(tmp)s;
                    checkpoint;
                    bedToBigBed %(tmp)s %(genome_file)s %(outfile)s;
                    checkpoint;
                    rm %(tmp)s'''
    P.run()
Code example #27
def aggregateTiledReadCounts(infiles, outfile):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:
    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from features in A.

    For bed4: use column 5
    For bed6: use column 7
    For bed12: use column 13

    This method uses the maximum number of reads found in any interval as the tag count.

    Tiles with no counts will not be output.
    '''

    to_cluster = True

    src = " ".join(
        [ '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$4 );}' ) ''' % x for x in infiles] )
    tmpfile = P.getTempFilename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    tracks = [re.sub("\..*", '', os.path.basename(x)) for x in infiles]

    outf = IOTools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(
            genes) == 1, "paste command failed, wrong number of genes per line: '%s'" % line
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))

    outf.close()

    os.unlink(tmpfile)
Code example #28
def bed2BigWig(infiles, outfile):
    infile, sizes = infiles
    infile = infile.replace(".bismark.cov", ".bedGraph")

    # need to sort first, can do this with tmp file
    tmp_infile = P.getTempFilename()

    statement = '''
    sort -k1,1 -k2,2n %(infile)s |
    awk '{OFS="\t"; $3 = $3 + 1; print $1,$2,$3,$4}' > %(tmp_infile)s;
    checkpoint;
    bedGraphToBigWig %(tmp_infile)s %(sizes)s %(outfile)s;
    checkpoint;
    rm -rf %(tmp_infile)s'''

    P.run()
Code example #29
def mapReadsWithBowtieAgainstTranscriptome(infiles, outfile):
    """map reads from short read archive sequence using bowtie against
    transcriptome data.
    """

    # Mapping will permit up to one mismatch. This is sufficient
    # as the downstream filter in bams2bam requires the
    # number of mismatches to be less than the genomic number of mismatches.
    # Change this, if the number of permitted mismatches for the genome
    # increases.

    # Output all valid matches in the best stratum. This will
    # inflate the file sizes due to matches to alternative transcripts
    # but otherwise matches to paralogs will be missed (and such
    # reads would be filtered out).
    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    statement = """
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --un /dev/null
           --threads %(bowtie_threads)s
           %(transcriptome_options)s 
           --best --strata -a
           %(prefix)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py --output-sam --method=set-nh --log=%(outfile)s.log
    | perl -p -e "if (/^\\@HD/) { s/\\bSO:\S+/SO:coordinate/}"
    | samtools import %(contigs)s - -
    | samtools sort - %(track)s;
    checkpoint;
    samtools index %(outfile)s
    checkpoint;
    rm -f %(tmpfile)s
    """

    P.run()
Code example #30
def runCufflinks(infiles, outfile):
    '''estimate expression levels in each set.
    '''

    gtffile, bamfile = infiles

    job_threads = PARAMS["cufflinks_threads"]

    track = os.path.basename(P.snip(gtffile, ".gtf.gz"))

    tmpfilename = P.getTempFilename(".")
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    gtffile = os.path.abspath(gtffile)
    bamfile = os.path.abspath(bamfile)
    outfile = os.path.abspath(outfile)

    # note: cufflinks adds \0 bytes to gtf file - replace with '.'
    # increase max-bundle-length to 4.5Mb due to Galnt-2 in mm9 with a 4.3Mb
    # intron.

    # AH: removed log messages about BAM record error
    # These cause logfiles to grow several Gigs and are
    # frequent for BAM files not created by tophat.
    # Error is:
    # BAM record error: found spliced alignment without XS attribute
    statement = '''mkdir %(tmpfilename)s;
    cd %(tmpfilename)s;
    cufflinks --label %(track)s
              --GTF <(gunzip < %(gtffile)s)
              --num-threads %(cufflinks_threads)i
              --frag-bias-correct %(bowtie_index_dir)s/%(genome)s.fa
              --library-type %(cufflinks_library_type)s
              %(cufflinks_options)s
              %(bamfile)s
    | grep -v 'BAM record error'
    >& %(outfile)s;
    perl -p -e "s/\\0/./g" < transcripts.gtf | gzip > %(outfile)s.gtf.gz;
    gzip < isoforms.fpkm_tracking > %(outfile)s.fpkm_tracking.gz;
    gzip < genes.fpkm_tracking > %(outfile)s.genes_tracking.gz;
    '''

    P.run()

    shutil.rmtree(tmpfilename)
Code example #31
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against genome using exonerate_. This can take a while.

    Arguments
    ---------
    infile : string
       Ignored.
    outfile : filename
       Output in :term:`gtf` format with potential NUMTs.

    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        P.touch(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" % PARAMS["numts_mitochrom"])
        P.touch(outfile)
        return

    tmpfile_mito = P.getTempFilename(".")

    statement = '''
    cgat index_fasta
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''

    P.run()

    if IOTools.isEmpty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        P.touch(outfile)
        return

    format = ("qi", "qS", "qab", "qae",
              "ti", "tS", "tab", "tae",
              "s",
              "pi",
              "C")

    format = "\\\\t".join(["%%%s" % x for x in format])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=1
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no
              --showvulgar no
              --ryo \\"%(format)s\\n\\"
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run()

    # convert to gtf
    inf = IOTools.openFile("%s.links.gz" % outfile)
    outf = IOTools.openFile(outfile, "w")

    min_score = PARAMS["numts_score"]

    c = E.Counter()

    for line in inf:
        (query_contig, query_strand, query_start, query_end,
         target_contig, target_strand, target_start, target_end,
         score, pid, alignment) = line[:-1].split("\t")

        c.input += 1
        score = int(score)
        if score < min_score:
            c.skipped += 1
            continue

        if target_strand == "-":
            target_start, target_end = target_end, target_start

        gff = GTF.Entry()
        gff.contig = target_contig
        gff.start, gff.end = int(target_start), int(target_end)
        assert gff.start < gff.end

        gff.strand = target_strand
        gff.score = int(score)
        gff.feature = "numts"
        gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        gff.transcript_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        outf.write("%s\n" % str(gff))
        c.output += 1

    inf.close()
    outf.close()

    E.info("filtering numts: %s" % str(c))

    os.unlink(tmpfile_mito)
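The `"\\\\t".join(...)` above plays a similar escaping game with backslashes: the Python literal `"\\\\t"` is the three-character string `\\t`, the shell's double-quoted `--ryo` argument reduces that to `\t`, and exonerate finally expands `\t` to a real tab in its output. (The `%qi`, `%tS` and friends survive `P.run()` untouched because they are substituted *into* the statement rather than scanned by it.) A short demonstration of the Python layer:

```python
# Python source "\\\\t" is the 3-character string  \\t  - one escape
# level is consumed by Python, the next by the shell, the last by
# exonerate's --ryo parser.
sep = "\\\\t"
print(len(sep), repr(sep))      # 3 '\\\\t'
fields = ("qi", "ti", "s")
ryo = sep.join("%%%s" % x for x in fields)
print(ryo)                      # %qi\\t%ti\\t%s  (before the shell sees it)
```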
Code example #32
def buildGenomicContext(infiles, outfile, distance=10):
    '''build a :term:`bed` formatted file with genomic context.

    The output is a bed formatted file, annotating genomic segments
    according to whether they are any of the ENSEMBL annotations.

    The function also adds the RNA and repeats annotations from the
    UCSC. The annotations can be partially or fully overlapping.
    Adjacent features (less than 10 bp apart) of the same type are
    merged.

    Arguments
    ---------
    infiles : list
       A list of input files to generate annotations from. The contents are
       1. ``repeats``, a :term:`gff` formatted file with repeat annotations

       2. ``rna``, a :term:`gff` formatted file with small, repetitive
          RNA annotations

       3. ``annotations``, a :term:`gtf` formatted file with genomic
            annotations, see :func:`annotateGenome`.

       4. ``geneset_flat``, a flattened gene set in :term:`gtf` format, see
            :func:`buildFlatGeneSet`.

       5. ``cpgisland_bed``, a :term:`bed` formatted file with CpG islands.

       6. ``go_tsv``, a :term:`tsv` formatted file with GO term
            assignments (gene ids in the second column).

    outfile : string
       Output filename in :term:`bed` format.
    distance : int
       Merge adjacent features of the same type within this distance.

    '''

    repeats_gff, rna_gff, annotations_gtf, geneset_flat_gff, \
        cpgisland_bed, go_tsv = infiles

    tmpfile = P.getTempFilename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(6)]

    # add ENSEMBL annotations
    statement = """
    zcat %(annotations_gtf)s
    | cgat gtf2gtf
    --method=sort --sort-order=gene
    | cgat gtf2gtf
    --method=merge-exons --log=%(outfile)s.log
    | cgat gff2bed
    --set-name=gene_biotype --is-gtf
    --log=%(outfile)s.log
    | sort -k 1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_0
    """
    P.run()

    # rna
    statement = '''
    zcat %(repeats_gff)s %(rna_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_1'''
    P.run()

    # add aggregate intervals for repeats
    statement = '''
    zcat %(repeats_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | awk -v OFS="\\t" '{$4 = "repeats"; print}'
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_2'''
    P.run()

    # add aggregate intervals for rna
    statement = '''
    zcat %(rna_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | awk -v OFS="\\t" '{$4 = "repetetive_rna"; print}'
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_3 '''
    P.run()

    # add ribosomal protein coding genes
    goids = ("GO:0003735", )

    patterns = "-e %s" % ("-e ".join(goids))

    statement = '''
    zcat %(geneset_flat_gff)s
    | cgat gtf2gtf
    --map-tsv-file=<(zcat %(go_tsv)s | grep %(patterns)s | cut -f 2 | sort | uniq)
    --method=filter --filter-method=gene
    --log=%(outfile)s.log
    | cgat gff2bed
    --log=%(outfile)s.log
    | awk -v OFS="\\t" '{$4 = "ribosomal_coding"; print}'
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_4
    '''
    P.run()

    # CpG islands
    statement = '''
    zcat %(cpgisland_bed)s
    | awk '{printf("%%s\\t%%i\\t%%i\\tcpgisland\\n", $1,$2,$3 )}'
    > %(tmpfile)s_5
    '''
    P.run()

    # sort and merge
    # remove strand information as bedtools
    # complains if there are annotations with
    # different number of field
    files = " ".join(tmpfiles)
    statement = '''
    sort --merge -k1,1 -k2,2n %(files)s
    | cut -f 1-4
    | gzip
    > %(outfile)s
    '''
    P.run()

    for x in tmpfiles:
        os.unlink(x)
Code example #33
def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase
      "pseudo". This information is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding
    transcripts.

    Arguments
    ---------
    infiles : list
       Filenames of ENSEMBL geneset in :term:`gtf` format
       and associated peptide sequences in :term:`fasta` format.
    outfile : filename
       Output in :term:`gtf` format with inferred or annotated
       pseudogenes.
    dbhandle : object
       Database handle for extracting transcript biotypes.
    '''

    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = """zcat %(infile_gtf)s
        |awk '$2 ~ /pseudogene/'
        | gzip
        > %(outfile)s"""
        P.run()
        return

    tmpfile1 = P.getTempFilename(shared=True)

    # collect processed transcripts and save as fasta sequences
    statement = '''
    zcat %(infile_gtf)s
    | awk '$2 ~ /processed/'
    | cgat gff2fasta
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''

    P.run()

    if IOTools.isEmpty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        P.touch(outfile)
        return

    model = "protein2dna"

    # map processed transcripts against peptide sequences
    statement = '''
    cat %(tmpfile1)s
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=100
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model %(model)s
              --bestn 1
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\"
              --showalignment no --showsugar no --showcigar no --showvulgar no
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run()

    os.unlink(tmpfile1)

    inf = IOTools.openFile("%s.links.gz" % outfile)
    best_matches = {}
    for line in inf:
        peptide_id, transcript_id, score = line[:-1].split("\t")
        score = int(score)
        if transcript_id in best_matches and \
           best_matches[transcript_id][0] > score:
            continue
        best_matches[transcript_id] = (score, peptide_id)

    inf.close()

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    known_pseudos = set([x[0] for x in cc.execute(
        """SELECT DISTINCT transcript_id
        FROM transcript_info
        WHERE transcript_biotype like '%pseudo%' OR
        gene_biotype like '%pseudo%' """)])

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" % (
               (len(new_pseudos),
                len(known_pseudos),
                len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    outf = IOTools.openFile(outfile, "w")
    inf = GTF.iterator(IOTools.openFile(infile_gtf))
    for gtf in inf:
        c.input += 1
        if gtf.transcript_id not in all_pseudos:
            continue
        c.output += 1
        outf.write("%s\n" % gtf)
    outf.close()

    E.info("exons: %s" % str(c))
Code example #34
def plotHeatmap(results, norm_matrix, threshold_stat, p_threshold,
                fc_threshold, outfile):
    '''
    plot heatmap of differentially abundant genes
    '''
    if threshold_stat == "p":
        p = "P.Value"
    elif threshold_stat == "padj":
        p = "adj.P.Val"
    else:
        p = "adj.P.Val"

    temp = P.getTempFilename(".")
    R('''library(gplots)''')
    R('''library(gtools)''')
    E.info("reading data")
    R('''mat <- read.csv("%s",
                         header = T,
                         stringsAsFactors = F,
                         sep = "\t")''' % norm_matrix)
    R('''rownames(mat) <- mat$taxa
         mat <- as.matrix(mat[,1:ncol(mat)-1])''')
    R('''dat <- read.csv("%s",
                         header = T,
                         stringsAsFactors = F,
                         sep = "\t")''' % results)
    E.info("data loaded")

    R('''t <- dat$taxa[dat$%s < %f & abs(dat$logFC) > %f]''' %
      (p, p_threshold, fc_threshold))
    R('''diff.genes <- unique(t)''')

    ##############################
    # this is a hack
    # to avoid errors when
    # a single differential
    # abundant feature is found
    ##############################
    R('''write.table(diff.genes,
                     file = "%s",
                     row.names = F,
                     sep = "\t")''' % temp)

    tmp = open(temp)
    tmp.readline()
    if len(tmp.readlines()) == 1:
        P.touch(outfile)
    else:
        R('''mat <- mat[as.character(diff.genes), ]
             samples <- colnames(mat)
             mat <- as.data.frame(t(apply(mat, 1, scale)))
             colnames(mat) <- samples
         mat <- mat[, mixedsort(colnames(mat))]
         colours = colorRampPalette(c("blue", "white", "red"))(75)
         pdf("%s", height = 12, width = 12)
         heatmap.2(as.matrix(mat),
                   trace = "none",
                   scale = "none",
                   col = colours,
                   Colv = F,
                   dendrogram = "row",
                   margins = c(18, 18))
             dev.off()''' % outfile)

    os.unlink(temp)
Code example #35
def loadCuffdiff(dbhandle, infile, outfile, min_fpkm=1.0):
    '''load results from cuffdiff analysis to database

    This functions parses and loads the results of a cuffdiff differential
    expression analysis.
    Parsing is performed by the parseCuffdiff function.

    Multiple tables will be created as cuffdiff outputs information
    on gene, isoform, tss, etc. levels.

    The method converts from ln(fold change) to log2 fold change.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    `min_fpkm`) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    infile : string
        Input filename, output from cuffdiff
    outfile : string
        Output filename in :term:`tsv` format.
    min_fpkm : float
        Minimum fpkm. Genes with an fpkm lower than this will
        be set to status `NOCALL`.

    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued...

    tmpname = P.getTempFilename(shared=True)

    # ignore promoters and splicing - no fold change column, but  sqrt(JS)
    for fn, level in (("cds_exp.diff.gz", "cds"),
                      ("gene_exp.diff.gz", "gene"),
                      ("isoform_exp.diff.gz", "isoform"),
                      # ("promoters.diff.gz", "promotor"),
                      # ("splicing.diff.gz", "splice"),
                      ("tss_group_exp.diff.gz", "tss")):

        tablename = prefix + "_" + level + "_diff"

        infile = os.path.join(indir, fn)

        results = parseCuffdiff(infile, min_fpkm=min_fpkm)
        Expression.writeExpressionResults(tmpname, results)
        P.load(tmpname, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=treatment_name "
               "--add-index=control_name "
               "--add-index=test_id")

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"
        infile = os.path.join(indir, fn)

        P.load(infile, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=tracking_id "
               "--add-index=control_name "
               "--add-index=test_id")

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    inf.readline()
    sample_lookup = {}

    for line in inf:
        line = line.split("\t")
        our_sample_name = IOTools.snip(line[0])
        our_sample_name = re.sub("-", "_", our_sample_name)
        cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
        sample_lookup[cuffdiff_sample_name] = our_sample_name

    inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_sample_fpkms"

        tmpf = P.getTempFilename(".")
        inf = IOTools.openFile(os.path.join(indir, fn)).readlines()
        outf = IOTools.openFile(tmpf, "w")

        samples = []
        genes = {}

        is_first = True
        for line in inf:

            if is_first:
                is_first = False
                continue

            line = line.split()
            gene_id = line[0]
            condition = line[1]
            replicate = line[2]
            fpkm = line[6]
            status = line[8]

            sample_id = condition + "_" + replicate

            if sample_id not in samples:
                samples.append(sample_id)

            # IMS: The following block keeps getting its indenting messed
            # up. It is not part of the 'if sample_id not in samples' block
            # please make sure it does not get made part of it
            if gene_id not in genes:
                genes[gene_id] = {}
                genes[gene_id][sample_id] = fpkm
            else:
                if sample_id in genes[gene_id]:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                else:
                    if status != "OK":
                        genes[gene_id][sample_id] = status
                    else:
                        genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if not cds has been
        # calculated for the genes in the long term need to add CDS
        # annotation to denovo predicted genesets in meantime just
        # skip if cds tracking file is empty

        if len(samples) == 0:
            continue

        headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples])
        outf.write(headers + "\n")

        for gene in genes.iterkeys():
            outf.write(gene + "\t")
            s = 0
            while s < len(samples) - 1:
                outf.write(genes[gene][samples[s]] + "\t")
                s += 1

            # IMS: Please be careful with this line. It keeps getting moved
            # into the above while block where it does not belong
            outf.write(genes[gene][samples[len(samples) - 1]] + "\n")

        outf.close()

        P.load(tmpf,
               outfile,
               tablename=tablename,
               options="--allow-empty-file "
               " --add-index=gene_id")

        os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
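The long bookkeeping loop above reshapes cuffdiff's `read_group_tracking` files into a gene x sample FPKM matrix, keeping the status flag in place of the value for non-"OK" entries. Assuming the columns the loop indexes (tracking id first, then condition and replicate, FPKM in column 7, status in column 9), the same reshape can be sketched with a pandas pivot; `tracking_to_matrix` is a hypothetical helper, not part of the pipeline:

```python
import pandas as pd


def tracking_to_matrix(infile):
    # one row per (gene, condition, replicate) becomes a
    # gene x sample matrix; pivot also raises on duplicate
    # (gene, sample) pairs, like the ValueError in the loop above
    df = pd.read_csv(infile, sep="\t")
    df["sample_id"] = (df.iloc[:, 1].astype(str)
                       + "_" + df.iloc[:, 2].astype(str))
    fpkm, status = df.iloc[:, 6], df.iloc[:, 8]
    # keep the FPKM where status is OK, else keep the status flag
    df["value"] = fpkm.where(status == "OK", status)
    return df.pivot(index=df.columns[0], columns="sample_id",
                    values="value")
```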
Code example #36
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.

    '''

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            E.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
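Since each motif's foreground and background hits are appended to a single gzipped file behind `:: motif = ... ::` marker lines, anything downstream has to split the concatenation up again. A hypothetical parser sketch for that ad-hoc format (not part of the pipeline):

```python
import gzip
import re

MARKER = re.compile(r":: motif = (.+) - (foreground|background) ::")


def iter_mast_sections(filename):
    # yield (motif, section, text) for each ':: motif = ... ::' block
    motif, section, lines = None, None, []
    with gzip.open(filename, "rt") as inf:
        for line in inf:
            match = MARKER.match(line)
            if match:
                if motif is not None:
                    yield motif, section, "".join(lines)
                motif, section = match.groups()
                lines = []
            else:
                lines.append(line)
    if motif is not None:
        yield motif, section, "".join(lines)
```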
Code example #37
def resetGTFAttributes(infile, genome, gene_ids, outfile):
    """set GTF attributes in :term:`gtf` formatted file so that they are
    compatible with cufflinks.
    This method runs cuffcompare with `infile` against itself to add
    attributes such as p_id and tss_id.
    Arguments
    ---------
    infile : string
        Filename of :term:`gtf`-formatted input file
    genome : string
       Filename (without extension) of indexed genome file
       in :term:`fasta` format.
    gene_ids : dict
       Dictionary mapping transcript ids to gene ids.
    outfile : string
       Output filename in :term:`gtf` format
    """
    tmpfile1 = P.getTempFilename(".")
    tmpfile2 = P.getTempFilename(".")

    #################################################
    E.info("adding tss_id and p_id")

    # The p_id attribute is set if the fasta sequence is given.
    # However, there might be some errors in cuffdiff downstream:
    #
    # cuffdiff: bundles.cpp:479: static void HitBundle::combine(const std::
    # vector<HitBundle*, std::allocator<HitBundle*> >&, HitBundle&): Assertion
    # `in_bundles[i]->ref_id() == in_bundles[i-1]->ref_id()' failed.
    #
    # I was not able to resolve this, it was a complex
    # bug dependent on both the read libraries and the input reference gtf
    # files
    job_memory = "5G"

    statement = '''
    cuffcompare -r <( gunzip < %(infile)s )
         -T
         -s %(genome)s.fa
         -o %(tmpfile1)s
         <( gunzip < %(infile)s )
         <( gunzip < %(infile)s )
    > %(outfile)s.log
    '''
    P.run()

    #################################################
    E.info("resetting gene_id and transcript_id")

    # reset gene_id and transcript_id to ENSEMBL ids
    # cufflinks patch:
    # make tss_id and p_id unique for each gene id
    outf = IOTools.openFile(tmpfile2, "w")
    map_tss2gene, map_pid2gene = {}, {}
    inf = IOTools.openFile(tmpfile1 + ".combined.gtf")

    def _map(gtf, key, val, m):
        if val in m:
            while gene_id != m[val]:
                val += "a"
                if val not in m:
                    break
        m[val] = gene_id

        gtf.setAttribute(key, val)

    for gtf in GTF.iterator(inf):
        transcript_id = gtf.oId
        gene_id = gene_ids[transcript_id]
        gtf.setAttribute("transcript_id", transcript_id)
        gtf.setAttribute("gene_id", gene_id)

        # set tss_id
        try:
            tss_id = gtf.tss_id
        except AttributeError:
            tss_id = None
        try:
            p_id = gtf.p_id
        except AttributeError:
            p_id = None

        if tss_id:
            _map(gtf, "tss_id", tss_id, map_tss2gene)
        if p_id:
            _map(gtf, "p_id", p_id, map_pid2gene)

        outf.write(str(gtf) + "\n")

    outf.close()

    # sort gtf file
    PipelineGeneset.sortGTF(tmpfile2, outfile)

    # tmpfile1 must be a non-empty string, otherwise the glob below
    # would remove far too many files
    assert tmpfile1
    for x in glob.glob(tmpfile1 + "*"):
        os.unlink(x)
    os.unlink(tmpfile2)
Code example #38
def summarizeTagsWithinContext(tagfile,
                               contextfile,
                               outfile,
                               min_overlap=0.5,
                               job_memory="15G"):
    '''count occurrences of tags in genomic context.

    Examines the genomic context to which tags align.

    A tag is assigned to the genomic context that it
    overlaps by at least 50%. Thus some reads mapping to
    several contexts might be dropped.

    Arguments
    ---------
    tagfile : string
        Filename with tags. The file can be :term:`bam` or :term:`bed` format.
    contextfile : string
        Filename of :term:`bed` formatted files with named intervals (BED4).
    outfile : string
        Output in :term:`tsv` format.
    min_overlap : float
        Minimum overlap (fraction) to count features as overlapping.
    job_memory : string
        Memory to reserve.
    '''

    tmpfile = P.getTempFilename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(2)]
    statement = '''
    cgat bam_vs_bed
    --min-overlap=%(min_overlap)f
    --log=%(outfile)s.log
    %(tagfile)s %(contextfile)s
    > %(tmpfile)s_0
    '''

    P.run()

    statement = '''
    printf "intergenic\\t" >> %(tmpfile)s_1'''

    P.run()

    statement = '''
    bedtools intersect -a %(tagfile)s
    -b %(contextfile)s
    -bed -v | wc -l
    | xargs printf
    >> %(tmpfile)s_1
    '''
    P.run()

    files = " ".join(tmpfiles)
    statement = '''
    sort --merge  %(files)s
    | gzip > %(outfile)s
    '''
    P.run()

    for x in tmpfiles:
        os.unlink(x)
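
# --- Added illustration (not part of the original pipeline) ---
# A sketch of the overlap rule described in the docstring above. The
# actual assignment is done by `cgat bam_vs_bed`; this toy version only
# shows what "assigned to a context it overlaps by at least 50%" means.
def assign_context(read_start, read_end, contexts, min_overlap=0.5):
    # contexts: list of (start, end, name) intervals on the same contig
    read_length = read_end - read_start
    for start, end, name in contexts:
        overlap = min(read_end, end) - max(read_start, start)
        if overlap >= min_overlap * read_length:
            return name
    # reads overlapping no context sufficiently are counted as intergenic
    return "intergenic"

assert assign_context(100, 200, [(150, 300, "exon")]) == "exon"        # 50% overlap
assert assign_context(100, 200, [(190, 300, "exon")]) == "intergenic"  # only 10%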
コード例 #39
def aggregateWindowsTagCounts(infiles, outfile, regex="(.*)\..*"):
    '''aggregate output from several ``bedtools coverage`` results.

    ``bedtools coverage`` outputs the following columns for a bed4
    file::

    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    This method autodetects the number of columns in the :term:`infiles`
    and selects:

    * bed4: use column 5
    * bed6: use column 7
    * bed12: use column 13

    Arguments
    ---------
    infiles : list
        Input filenames with the output from ``bedtools coverage``
    outfile : string
        Output filename in :term:`tsv` format.
    regex : string
        Regular expression used to extract the track name from the
        filename.  The default removes any suffix.

    '''

    # get the number of columns in the coverage output
    bed_columns = Bed.getNumColumns(infiles[0])
    # the count is the first of the four columns appended by
    # ``bedtools coverage``; +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join([
        """<( zcat %s |
              awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}')""" %
        (x, column) for x in infiles
    ])
    tmpfile = P.getTempFilename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    # build track names
    tracks = [
        re.search(regex, os.path.basename(x)).groups()[0] for x in infiles
    ]

    outf = IOTools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    # filter for uniqueness - lines with the same interval id as the
    # previous line will be skipped.
    last_gene = None
    c = E.Counter()
    for line in open(tmpfile, "r"):
        c.input += 1
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]

        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        if genes[0] == last_gene:
            c.duplicates += 1
            continue
        c.output += 1
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))
        last_gene = genes[0]

    outf.close()

    os.unlink(tmpfile)

    E.info("aggregateWindowsTagCounts: %s" % c)
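
# --- Added illustration (not part of the original pipeline) ---
# A toy version of the aggregation step above: `paste` interleaves
# (interval_id, count) column pairs from each input, so interval ids sit
# at even indices and counts at odd indices of each line.
pasted_line = "chr1:0-100\t5\tchr1:0-100\t7"
data = pasted_line.split("\t")
interval_ids = list(set(data[0::2]))    # ids from even columns
counts = [int(x) for x in data[1::2]]   # counts from odd columns
assert interval_ids == ["chr1:0-100"] and counts == [5, 7]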
コード例 #40
def convertReadsToIntervals(bamfile,
                            bedfile,
                            filtering_quality=None,
                            filtering_dedup=None,
                            filtering_dedup_method='picard',
                            filtering_nonunique=False):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for
    counting based methods.

    This method is not appropriate for RNA-Seq.

    Optional steps include quality filtering, removal of reads that
    do not map uniquely, and deduplication. For paired end data,
    pairs are merged and optionally filtered by insert size.

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set, remove reads with a quality score below given threshold.
    filtering_dedup : bool
        If True, deduplicate data.
    filtering_dedup_method : string
        Deduplication method. Possible options are ``picard`` and
        ``samtools``.
    filtering_nonunique : bool
        If True, remove non-uniquely matching reads.

    '''
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.isPaired(bamfile)
    current_file = bamfile
    tmpdir = P.getTempFilename()
    os.unlink(tmpdir)
    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    if filtering_quality is not None and filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view
        -q %(filtering_quality)i -b
        %(current_file)s
        2>> %%(bedfile)s.quality.log
        > %(next_file)s ''' % locals())

        nfiles += 1
        current_file = next_file

    if filtering_nonunique:

        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''cat %(current_file)s
        | cgat bam2bam
        --method=filter
        --filter-method=unique,mapped
        --log=%%(bedfile)s.nonunique.log
        > %(next_file)s ''' % locals())

        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requries an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        if filtering_dedup_method == 'samtools':
            # statements are chained via files rather than pipes, so
            # rmdup needs explicit input and output files
            statement.append('''samtools rmdup
            %(current_file)s %(next_file)s ''' % locals())

        elif filtering_dedup_method == 'picard':
            statement.append('''MarkDuplicates
            INPUT=%(current_file)s
            OUTPUT=%(next_file)s
            ASSUME_SORTED=TRUE
            METRICS_FILE=%(bedfile)s.duplicate_metrics
            REMOVE_DUPLICATES=TRUE
            VALIDATION_STRINGENCY=SILENT
            2>> %%(bedfile)s.markdup.log ''' % locals())

        nfiles += 1
        current_file = next_file

    if is_paired:
        statement.append('''cat %(current_file)s
            | cgat bam2bed
              --merge-pairs
              --min-insert-size=%(filtering_min_insert_size)i
              --max-insert-size=%(filtering_max_insert_size)i
              --log=%(bedfile)s.bam2bed.log
              -
            | cgat bed2bed
              --method=sanitize-genome
              --genome-file=%(genome_dir)s/%(genome)s
              --log=%(bedfile)s.sanitize.log
            | cut -f 1,2,3,4
            | sort -k1,1 -k2,2n
            | bgzip > %(bedfile)s''')
    else:
        statement.append('''cat %(current_file)s
            | cgat bam2bed
              --log=%(bedfile)s.bam2bed.log
              -
            | cgat bed2bed
              --method=sanitize-genome
              --genome-file=%(genome_dir)s/%(genome)s
              --log=%(bedfile)s.sanitize.log
            | cut -f 1,2,3,4
            | sort -k1,1 -k2,2n
            | bgzip > %(bedfile)s''')

    statement.append("tabix -p bed %(bedfile)s")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; checkpoint; ".join(statement)
    P.run()
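
# --- Added illustration (not part of the original pipeline) ---
# The filters above chain through numbered temporary bam files rather
# than pipes, so each optional step reads the previous step's output.
# A minimal sketch of the same pattern; the commands are hypothetical.
def build_filter_chain(bamfile, tmpdir, filters):
    # filters: list of (name, shell template with {inf}/{outf}) pairs
    statements, current_file = [], bamfile
    for nfiles, (name, template) in enumerate(filters):
        next_file = "%s/bam_%i.bam" % (tmpdir, nfiles)
        statements.append(template.format(inf=current_file, outf=next_file))
        current_file = next_file
    return statements, current_file

statements, final = build_filter_chain(
    "input.bam", "/tmp/work",
    [("quality", "samtools view -q 10 -b {inf} > {outf}")])
assert final == "/tmp/work/bam_0.bam"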
コード例 #41
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run()

    else:

        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(infiles[0]):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            if IOTools.isEmpty(fn):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)
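
# --- Added illustration (not part of the original pipeline) ---
# The incremental merge above is conceptually a running intersection.
# A toy equivalent with exact-coordinate sets; note the real code
# intersects by genomic overlap via intersectBed, not exact equality.
beds = [
    {("chr1", 0, 100), ("chr1", 200, 300)},
    {("chr1", 0, 100), ("chr2", 0, 50)},
    {("chr1", 0, 100)},
]
shared = beds[0]
for other in beds[1:]:
    shared &= other   # keep only intervals present in every file
assert shared == {("chr1", 0, 100)}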
コード例 #42
def buildGenomicContext(infiles, outfile, distance=10):
    '''build a :term:`bed` formatted file with genomic context.

    The output is a bed formatted file, annotating genomic segments
    according to whether they are any of the ENSEMBL annotations.
    It also adds the RNA and repeats annotations from the UCSC.

    The annotations can be partially or fully overlapping. Adjacent
    features (less than *distance* bp apart) of the same type are merged
    (see the sketch after this function).

    Arguments
    ---------
    infiles : list
       A list of input files to generate annotations from. The contents are
       1. ``repeats``, a :term:`gff` formatted file with repeat annotations
       2. ``rna``, a :term:`gff` formatted file with small, repetitive
          RNA annotations
       3. ``annotations``, a :term:`gtf` formatted file with genomic
          annotations, see :func:`annotateGenome`.
       4. ``utr``, a :term:`gtf` formatted file with UTR annotations
       5. ``intron``, a :term:`gtf` formatted file with intron annotations
    outfile : string
       Output filename in :term:`bed` format.
    distance : int
       Merge adjacent features of the same type within this distance.
    '''

    repeats_gff, rna_gff, annotations_gtf, utr_gtf, intron_gtf = infiles

    tmpfile = P.getTempFilename(shared=True)
    tmpfiles = ["%s_%i" % (tmpfile, x) for x in range(4)]

    # add ENSEMBL annotations
    statement = """
    zcat %(annotations_gtf)s
    | cgat gtf2gtf
    --method=sort --sort-order=gene
    | cgat gtf2gtf
    --method=merge-exons --log=%(outfile)s.log
    | cgat gff2bed
    --set-name=gene_biotype --is-gtf
    --log=%(outfile)s.log
    | sort -k 1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_0
    """
    P.run()

    # add repeats and small RNA annotations (UCSC)
    statement = '''
    zcat %(repeats_gff)s %(rna_gff)s
    | cgat gff2bed --set-name=family --is-gtf -v 0
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_1'''
    P.run()

    # utr
    statement = '''zcat %(utr_gtf)s
    | cgat gff2bed --is-gtf --set-name=feature
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_2'''
    P.run()

    # intron
    statement = '''zcat %(intron_gtf)s
    | cgat gff2bed --is-gtf --set-name=feature
    | sort -k1,1 -k2,2n
    | cgat bed2bed --method=merge --merge-by-name
    --merge-distance=%(distance)i --log=%(outfile)s.log
    > %(tmpfile)s_3'''
    P.run()

    # sort and merge
    # remove strand information as bedtools
    # complains if there are annotations with
    # different numbers of fields
    files = " ".join(tmpfiles)
    statement = '''
    sort --merge -k1,1 -k2,2n %(files)s
    | cut -f 1-4
    | gzip
    > %(outfile)s
    '''
    P.run()

    for x in tmpfiles:
        os.unlink(x)
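
# --- Added illustration (not part of the original pipeline) ---
# A sketch of the "merge adjacent same-type features within *distance*"
# behaviour that is delegated to `cgat bed2bed --method=merge` above.
def merge_adjacent(intervals, distance=10):
    # intervals: position-sorted (start, end, name) tuples on one contig
    merged = []
    for start, end, name in intervals:
        if merged and name == merged[-1][2] and \
                start - merged[-1][1] <= distance:
            merged[-1] = (merged[-1][0], max(end, merged[-1][1]), name)
        else:
            merged.append((start, end, name))
    return merged

assert merge_adjacent([(0, 100, "LINE"), (105, 200, "LINE"), (500, 600, "SINE")]) == \
    [(0, 200, "LINE"), (500, 600, "SINE")]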
コード例 #43
def buildSpikeResults(infile, outfile):
    '''build matrices with results from spike-in and upload
    into database.

    The method will output several files:

    .spiked.gz: Number of intervals that have been spiked-in
               for each bin of expression and fold-change

    .power.gz: Global power analysis - aggregates over all
        ranges of fold-change and expression and outputs the
        power, the proportion of intervals overall that
        could be detected as differentially methylated.

        This is a table with the following columns:

        fdr - fdr threshold
        power - power level, number of intervals detectable
        intervals - number of intervals in observed data at given
                    level of fdr and power.
        intervals_percent - percentage of intervals in observed data
              at given level of fdr and power

    The method will also upload the results into the database.

    Arguments
    ---------
    infile : string
        Input filename in :term:`tsv` format. Usually the output of
        :mod:`scripts/runExpression`.
    outfile : string
        Output filename in :term:`tsv` format.

    '''

    expression_nbins = 10
    fold_nbins = 10

    spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz'

    if not os.path.exists(spikefile):
        E.warn('no spike data: %s' % spikefile)
        P.touch(outfile)
        return

    ########################################
    # output and load spiked results
    tmpfile_name = P.getTempFilename(shared=True)

    statement = '''zcat %(spikefile)s
    | grep -e "^spike" -e "^test_id"
    > %(tmpfile_name)s
    '''
    P.run()

    E.debug("outputting spiked counts")
    (spiked, spiked_d2hist_counts, xedges, yedges,
     spiked_l10average, spiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".spiked.gz",
            infile_name=tmpfile_name,
            expression_nbins=expression_nbins,
            fold_nbins=fold_nbins)

    ########################################
    # output and load unspiked results
    statement = '''zcat %(infile)s
    | grep -v -e "^spike"
    > %(tmpfile_name)s
    '''
    P.run()
    E.debug("outputting unspiked counts")

    (unspiked, unspiked_d2hist_counts, unspiked_xedges,
     unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \
        outputSpikeCounts(
            outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz",
            infile_name=tmpfile_name,
            expression_bins=xedges,
            fold_bins=yedges)

    E.debug("computing power")

    assert (xedges == unspiked_xedges).all()

    tmpfile = IOTools.openFile(tmpfile_name, "w")
    tmpfile.write("\t".join(
        ("expression",
         "fold",
         "fdr",
         "counts",
         "percent")) + "\n")

    fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1))
    power_thresholds = numpy.arange(0.1, 1.1, 0.1)

    spiked_total = float(spiked_d2hist_counts.sum().sum())
    unspiked_total = float(unspiked_d2hist_counts.sum().sum())

    outf = IOTools.openFile(outfile, "w")
    outf.write("fdr\tpower\tintervals\tintervals_percent\n")

    # significant results
    for fdr in fdr_thresholds:
        take = spiked['qvalue'] < fdr

        # compute 2D histogram in spiked data below fdr threshold
        spiked_d2hist_fdr, xedges, yedges = \
            numpy.histogram2d(spiked_l10average[take],
                              spiked_l2fold[take],
                              bins=(xedges, yedges))

        # convert to the fraction of spike-ins per bin that pass
        # the fdr threshold
        spiked_d2hist_fdr_normed = spiked_d2hist_fdr / spiked_d2hist_counts
        spiked_d2hist_fdr_normed = numpy.nan_to_num(spiked_d2hist_fdr_normed)

        # set values without data to -1
        spiked_d2hist_fdr_normed[spiked_d2hist_counts == 0] = -1.0

        # output to table for database upload
        for x, y in itertools.product(list(range(len(xedges) - 1)),
                                      list(range(len(yedges) - 1))):
            tmpfile.write("\t".join(map(
                str, (xedges[x], yedges[y],
                      fdr,
                      spiked_d2hist_fdr[x, y],
                      100.0 * spiked_d2hist_fdr_normed[x, y]))) + "\n")

        # take elements in spiked_hist_fdr above a certain threshold
        for power in power_thresholds:
            # select 2D bins at a given power level
            power_take = spiked_d2hist_fdr_normed >= power

            # select the counts in the unspiked data according
            # to this level
            power_counts = unspiked_d2hist_counts[power_take]

            outf.write("\t".join(map(
                str, (fdr, power,
                      power_counts.sum().sum(),
                      100.0 * power_counts.sum().sum() /
                      unspiked_total))) + "\n")

    tmpfile.close()
    outf.close()

    # upload into table
    method = P.snip(os.path.dirname(outfile), ".dir")
    tablename = P.toTable(
        P.snip(outfile, "power.gz") + method + ".spike.load")

    P.load(tmpfile_name,
           outfile + ".log",
           tablename=tablename,
           options="--add-index=fdr")

    os.unlink(tmpfile_name)
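
# --- Added illustration (not part of the original pipeline) ---
# A toy recomputation of the power step above: per (expression, fold)
# bin, power is the fraction of spike-ins recovered below the fdr
# threshold; observed intervals in bins at or above a power level are
# then counted as detectable. All numbers below are made up.
import numpy

spiked_counts = numpy.array([[10., 10.], [10., 0.]])    # spike-ins per bin
spiked_fdr = numpy.array([[9., 4.], [2., 0.]])          # spike-ins below fdr
unspiked_counts = numpy.array([[100., 50.], [30., 20.]])

with numpy.errstate(invalid="ignore"):
    power_per_bin = numpy.nan_to_num(spiked_fdr / spiked_counts)
power_per_bin[spiked_counts == 0] = -1.0                # flag empty bins

detectable = unspiked_counts[power_per_bin >= 0.5].sum()
assert detectable == 100.0   # only the bin with 90% recovery qualifies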
コード例 #44
def loadLncRNAPhyloCSF(infile, outfile):
    '''parse phyloCSF output and load the results into the database.'''
    tmpf = P.getTempFilename("/ifs/scratch")
    PipelineLncRNA.parsePhyloCSF(infile, tmpf)
    P.load(tmpf, outfile, options="--add-index=gene_id")
    os.unlink(tmpf)