Example #1
def convertPslToChain(infile, outfile):
    '''convert a psl to a chain file.

    see http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver
    '''

    to_cluster = True

    target, query = extractGenomes(infile)

    tmpfilename1 = P.getTempFilename(".")
    tmpfilename2 = P.getTempFilename(".")

    writeContigSizes(target, tmpfilename1)
    writeContigSizes(query, tmpfilename2)

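    # swap query and target in the psl, convert to a sorted chain, then
    # net the chains against both sets of contig sizes and extract the
    # single-coverage subset required for liftOver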
    statement = '''gunzip
    < %(infile)s
    | pslSwap stdin stdout
    | cgat psl2chain --log=%(outfile)s.log
    | chainSort stdin stdout
    | gzip
    > %(outfile)s.sorted.chain.gz;
    checkpoint;
    gunzip < %(outfile)s.sorted.chain.gz
    | chainNet stdin %(tmpfilename1)s %(tmpfilename2)s stdout /dev/null
    | netChainSubset stdin <( zcat %(outfile)s.sorted.chain.gz ) stdout
    | gzip
    > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename1)
    os.unlink(tmpfilename2)
Example #2
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, excludes any whose
    names match the supplied regular expression, converts the rest to a
    single maf file using axtToMaf, and filters out maf alignments
    below a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_dir"],
                                  PARAMS["annotations_interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS["annotations_interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate the axt files, then convert them to a single maf
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

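    # split the MAF into blocks that pass or fail the length filter;
    # filterMAF returns the counts of (removed, output) blocks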
    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2, outfile, removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = "gzip %(outfile)s; gzip %(removed)s"
    P.run()
Example #3
def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Receives a MAF file containing pairwise alignments and a bed12 file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

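    # extract the aligned sequence for each interval and write it to
    # the output fasta file, one record per gene model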
    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)
Example #4
def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''

    if len(infiles) == 1:

        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
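            # intersect the two files, keep the first five columns,
            # renumber intervals sequentially and compress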
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run()

    else:

        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(infiles[0]):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            if IOTools.isEmpty(fn):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s
            | intersectBed -u -a %(tmpfile)s -b stdin
            > %(tmpfile)s.tmp;
            mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)
Example #5
def mapReadsWithBowtie(infiles, outfile):
    '''map reads with bowtie'''

    inifile, infile = infiles

    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

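    # uncompress the reads, map in colorspace (-C) against the _cs
    # index, then set NH tags and compress the output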
    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --threads %(bowtie_threads)s
           %(bowtie_options)s
           %(bowtie_genome_dir)s/%(genome)s_cs
           %(tmpfile)s
    | cgat bam2bam --output-sam --method=set-nh --log=%(outfile)s.log
    | gzip
    > %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    '''

    P.run()
Example #6
def loadRepeatInformation(infiles, outfile):
    '''load genome information.'''

    to_cluster = True

    table = outfile[:-len(".load")]

    repeatsfile, indexfile = infiles

    tmpfilename = P.getTempFilename(".")

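    # build a bed file covering each contig in full (contig, 0, size)
    # from the genome index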
    statement = '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}' < %(indexfile)s > %(tmpfilename)s'''
    P.run()

    statement = '''
    gunzip < %(repeatsfile)s
    | cgat gff2bed -v 0
    | coverageBed -a stdin -b %(tmpfilename)s
    | awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}'
    | cgat csv2db %(csv2db_options)s
    --table=%(table)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
Example #7
def loadSleuthTableGenes(infile, outfile, gene_info, gene_biotypes, database,
                         annotations_database):

    tmpfile = P.getTempFilename("/ifs/scratch/")

    table = os.path.basename(gene_info)

    if gene_biotypes:
        where_cmd = "WHERE " + " OR ".join(
            ["gene_biotype = '%s'" % x for x in gene_biotypes.split(",")])
    else:
        where_cmd = ""

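    # fetch the gene_id -> gene_name mapping, restricted to the
    # requested gene biotypes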
    select = """SELECT DISTINCT
        gene_id, gene_name
        FROM annotations.%(table)s
        %(where_cmd)s""" % locals()

    df1 = pd.read_table(infile, sep="\t")
    df1.set_index("test_id", drop=False, inplace=True)

    df2 = pd.read_sql(select, connect(database, annotations_database))
    df2.set_index("gene_id", drop=False, inplace=True)

    df = df1.join(df2)
    df.to_csv(tmpfile, sep="\t", index=True)

    options = "--add-index=gene_id"
    P.load(tmpfile, outfile, options=options)
    os.unlink(tmpfile)
Example #8
    def mergeSingleExpressionTables(infile, outfile):
        '''
        Merge refcoding and lncRNA count tables from a single condition
        if there are separate input reference gtfs.
        '''

        file1 = infile[0]
        file2 = infile[1]

        tmpfile = P.getTempFilename(shared=True)

        df1 = pd.read_table(file1,
                            sep="\t",
                            index_col=0,
                            header=0,
                            compression="gzip")

        df2 = pd.read_table(file2,
                            sep="\t",
                            index_col=0,
                            header=0,
                            compression="gzip")

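        # stack the two count tables row-wise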
        out_frame = df1.append(df2)

        out_frame.to_csv(tmpfile, sep="\t")

        statement = '''cat %(tmpfile)s | gzip > %(outfile)s; rm -rf %(tmpfile)s'''

        P.run()
Example #9
def prepareBAMs(infile, outfile):
    '''filter bam files for medip-seq analysis.

    Optional steps include:

    * deduplication - remove duplicate reads
    * quality score filtering - remove reads below a certain quality score.

    '''
    to_cluster = True
    track = P.snip(outfile, ".bam")

    tmpdir = P.getTempFilename()

    current_file = infile

    nfiles = 0
    statement = ["mkdir %(tmpdir)s"]

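    # each optional filtering step reads current_file and writes a
    # fresh numbered bam into the temporary directory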
    if "filtering_quality" in PARAMS and PARAMS["filtering_quality"] > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view -q %%(filtering_quality)i -b 
                             %(current_file)s 
                             2>> %%(outfile)s.log 
                             > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if "filtering_dedup" in PARAMS and PARAMS["filtering_dedup"]:
        # Picard's MarkDuplicates requries an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        dedup_method = PARAMS["filtering_dedup_method"]

        if dedup_method == 'samtools':
            statement.append('''samtools rmdup %(current_file)s %(next_file)s'''
                             % locals())

        elif dedup_method == 'picard':
            statement.append('''MarkDuplicates INPUT=%(current_file)s
                                               OUTPUT=%(next_file)s
                                               ASSUME_SORTED=true 
                                               METRICS_FILE=%(outfile)s.duplicate_metrics
                                               REMOVE_DUPLICATES=TRUE 
                                               VALIDATION_STRINGENCY=SILENT
                                               2>> %%(outfile)s.log ''' %
                             locals())
        nfiles += 1
        current_file = next_file

    statement.append("mv %%(current_file)s %(outfile)s" % locals())
    statement.append("rm -rf %(tmpdir)s")
    statement.append("samtools index %(outfile)s")

    statement = " ; ".join(statement)

    P.run()

Example #10
def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    ''' Make a BAM file suitable for peak calling.

        Infiles are merged and unmapped reads removed.

        If specified, duplicate reads are removed.
        This method uses Picard.

        If a mask is specified, reads falling within
        the mask are filtered out. 

        This uses bedtools.

        The mask is a bed file of regions from which
        reads are to be excluded.
    '''

    # open the infiles, if more than one merge and sort first using samtools.

    statement = []

    if len(infiles) > 1 and not isinstance(infiles, str):
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        statement.append('''samtools merge @OUT@ %s''' % (" ".join(infiles)))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        statement.append('''MarkDuplicates
        INPUT=@IN@
        ASSUME_SORTED=true
        REMOVE_DUPLICATES=true
        QUIET=true
        OUTPUT=@OUT@
        METRICS_FILE=%(outfile)s.picardmetrics
        VALIDATION_STRINGENCY=SILENT
        > %(outfile)s.picardlog ''')

    if mask:
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

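    # P.joinStatements chains the steps, wiring each @OUT@ to the
    # following step's @IN@ via intermediate files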
    statement = P.joinStatements(statement, infiles)
    P.run()
Example #11
def aggregateTiledReadCounts(infiles, outfile):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:
    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    This method uses the maximum number of reads found in any interval as the tag count.

    Tiles with no counts will not be output.
    '''

    to_cluster = True

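    # turn each bed file into "contig:start-end\tcount" lines and
    # paste the streams side by side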
    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$4 );}' ) '''
        % x for x in infiles
    ])
    tmpfile = P.getTempFilename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    tracks = [re.sub(r"\..*", '', os.path.basename(x)) for x in infiles]

    outf = IOTools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(
            genes
        ) == 1, "paste command failed, wrong number of genes per line: '%s'" % line
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))

    outf.close()

    os.unlink(tmpfile)
Example #12
def mapReadsWithTophat(infiles, outfile):
    '''map reads with tophat

    '''
    inifile, infile = infiles

    local_params = P.loadParameters(inifile)

    job_options = "-l mem_free=16G"
    job_threads = PARAMS["tophat_threads"]

    tmpfile = P.getTempFilename(".")


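    # convert the fastq input to colorspace csfasta/qual files, then
    # run tophat in colorspace mode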
    statement = '''
    zcat %(infile)s
    | cgat fastq2solid
           --method=change-format --target-format=integer
           --pattern-identifier="%(tmpfile)s.%%s" >& %(outfile)s.log;
    checkpoint;
    tophat --output-dir %(outfile)s.dir
           --num-threads %(tophat_threads)s
           --library-type %(tophat_library_type)s
           --color
           --quals
           --integer-quals
           %(tophat_options)s
           %(tophat_genome_dir)s/%(genome)s_cs
           %(tmpfile)s.csfasta %(tmpfile)s.qual
           >> %(outfile)s.log;
    checkpoint;
    mv %(outfile)s.dir/accepted_hits.bam %(outfile)s;
    checkpoint;
    samtools index %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s.csfasta %(tmpfile)s.qual
    '''

    # use local parameters to overwrite default ones.
    P.run(**local_params)

    os.unlink(tmpfile)
Example #13
def bed2BigWig(infiles, outfile):
    infile, sizes = infiles
    infile = infile.replace(".bismark.cov", ".bedGraph")

    # need to sort first, can do this with tmp file
    tmp_infile = P.getTempFilename()

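    # sort by contig and start and increment the end coordinate so the
    # intervals conform to what bedGraphToBigWig expects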
    statement = '''
    sort -k1,1 -k2,2n %(infile)s |
    awk '{OFS="\t"; $3 = $3 + 1; print $1,$2,$3,$4}' > %(tmp_infile)s;
    checkpoint;
    bedGraphToBigWig %(tmp_infile)s %(sizes)s %(outfile)s;
    checkpoint;
    rm -rf %(tmp_infile)s'''

    P.run()
Example #14
def mapReadsWithBowtieAgainstTranscriptome(infiles, outfile):
    '''map reads from short read archive sequence using bowtie against
    transcriptome data.
    '''

    # Mapping will permit up to one mismatch. This is sufficient
    # as the downstream filter in bams2bam requires the number of
    # mismatches to be no greater than the genomic number of
    # mismatches. Change this if the number of permitted mismatches
    # for the genome increases.

    # Output all valid matches in the best stratum. This will
    # inflate the file sizes due to matches to alternative transcripts
    # but otherwise matches to paralogs will be missed (and such
    # reads would be filtered out).
    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

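    # map in colorspace, set NH tags, patch the sort-order flag in the
    # header and coordinate-sort the output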
    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --un /dev/null
           --threads %(bowtie_threads)s
           %(transcriptome_options)s
           --best --strata -a
           %(prefix)s_cs
           %(tmpfile)s
    | cgat bam2bam --output-sam --method=set-nh --log=%(outfile)s.log
    | perl -p -e "if (/^\\@HD/) { s/\\bSO:\S+/SO:coordinate/}"
    | samtools import %(contigs)s - -
    | samtools sort - %(track)s;
    checkpoint;
    samtools index %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    '''

    P.run()
Example #15
def buildFeatureCounts(infiles, outfile):
    '''counts reads falling into "features", which by default are genes.

    A read overlaps if at least one bp overlaps.

    Pairs and strandedness can be used to resolve reads falling into
    more than one feature. Reads that cannot be resolved to a single
    feature are ignored.

    '''

    infile, annotations = infiles

    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    annotations_tmp = P.getTempFilename()

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    if PARAMS['featurecounts_paired'] == "1":
        paired = "-p -B"
    else:
        paired = ""

    job_options = "-pe dedicated %i" % PARAMS['featurecounts_threads']

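    # featureCounts cannot read gzipped annotations, so uncompress
    # first and recompress the counts table afterwards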
    statement = '''
    zcat %(annotations)s > %(annotations_tmp)s;
    checkpoint;
    featureCounts %(featurecounts_options)s
    -T %(featurecounts_threads)s
    -s %(featurecounts_strand)s
    %(paired)s
    -b
    -a %(annotations_tmp)s
    -o %(outfile)s
    %(infile)s
    > %(outfile)s.log;
    checkpoint;
    gzip %(outfile)s;
    checkpoint;
    rm %(annotations_tmp)s '''

    P.run()
Example #16
def mapReadsWithBowtieAgainstJunctions(infiles, outfile):
    '''map reads from short read archive sequence using bowtie against
    splice junctions.

    The reads are converted to genomic coordinates.
    '''

    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

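    # map against the junction index, set NH tags and project the
    # alignments back to genomic coordinates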
    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --un /dev/null
           --threads %(bowtie_threads)s
           %(transcriptome_options)s
           --best --strata -a
           %(prefix)s_cs
           %(tmpfile)s
    | cgat bam2bam
    --method=set-nh --log=%(outfile)s.log
    | cgat rnaseq_junction_bam2bam
    --contigs-tsv-file=%(contigs)s --log=%(outfile)s.log
    | samtools sort - %(track)s;
    checkpoint;
    samtools index %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    '''

    P.run()

Example #17
def buildRefcodingGeneSetStats(infile, outfile):
    '''
    counts:
    no. of transcripts
    no. genes
    average number of exons per transcript
    average number of exons per gene
    no. multi-exon transcripts
    no. single exon transcripts
    no. multi-exon genes
    no. single exon genes

    in the coding and lncRNA genesets
    '''

    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

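    # write a single-row summary table with one column per counter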
    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join([
        "no_transcripts", "no_genes", "no_exons_per_transcript",
        "no_exons_per_gene", "no_single_exon_transcripts",
        "no_multi_exon_transcripts", "no_single_exon_genes",
        "no_multi_exon_genes"
    ]) + "\n")
    outf.write("\t".join(
        map(str, [
            PipelineLncRNA.CounterTranscripts(tmpf).count(),
            PipelineLncRNA.CounterGenes(tmpf).count(),
            PipelineLncRNA.CounterExonsPerTranscript(tmpf).count(),
            PipelineLncRNA.CounterExonsPerGene(tmpf).count(),
            PipelineLncRNA.CounterSingleExonTranscripts(tmpf).count(),
            PipelineLncRNA.CounterMultiExonTranscripts(tmpf).count(),
            PipelineLncRNA.CounterSingleExonGenes(tmpf).count(),
            PipelineLncRNA.CounterMultiExonGenes(tmpf).count()
        ])) + "\n")
    outf.close()

    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))
Example #18
def extractControllLncRNAFastaAlignments(infiles, outfile):
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("/ifs/scratch")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

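    # same procedure as extractLncRNAFastaAlignments, applied to the
    # control intervals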
    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)
Example #19
def buildBigBed(infile, outfile):
    '''build a bigBed file with intervals that are covered by reads in any of the experiments.
    '''

    to_cluster = False

    tmpfile = P.getTempFilename()

    contig_sizes = os.path.join(PARAMS["annotations_dir"],
                                PARAMS_ANNOTATIONS["interface_contigs"])

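    # bedToBigBed cannot read compressed input, so uncompress to a
    # temporary file first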
    statement = '''
    zcat %(infile)s > %(tmpfile)s;
    bedToBigBed %(tmpfile)s %(contig_sizes)s %(outfile)s;
    rm -f %(tmpfile)s
    '''
    P.run()

    try:
        os.unlink(tmpfile)
    except OSError:
        pass
Example #20
def plotHeatmap(results,
                norm_matrix,
                threshold_stat,
                p_threshold,
                fc_threshold,
                outfile):
    '''
    plot heatmap of differentially abundant genes
    '''
    if threshold_stat == "p":
        p = "P.Value"
    elif threshold_stat == "padj":
        p = "adj.P.Val"
    else:
        p = "adj.P.Val"

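    # write the differentially abundant features to a temporary file
    # so they can be counted before plotting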
    temp = P.getTempFilename(".")
    R('''library(gplots)''')
    R('''library(gtools)''')
    E.info("reading data")
    R('''mat <- read.csv("%s",
                         header = T,
                         stringsAsFactors = F,
                         sep = "\t")''' % norm_matrix)
    R('''rownames(mat) <- mat$taxa
         mat <- as.matrix(mat[, 1:(ncol(mat) - 1)])''')
    R('''dat <- read.csv("%s",
                         header = T,
                         stringsAsFactors = F,
                         sep = "\t")''' % results)
    E.info("data loaded")

    R('''t <- dat$taxa[dat$%s < %f & abs(dat$logFC) > %f]''' % (
        p, p_threshold, fc_threshold))
    R('''diff.genes <- unique(t)''')

    ##############################
    # this is a hack
    # to avoid errors when
    # a single differential
    # abundant feature is found
    ##############################
    R('''write.table(diff.genes,
                     file = "%s",
                     row.names = F,
                     sep = "\t")''' % temp)

    tmp = open(temp)
    tmp.readline()
    nlines = len(tmp.readlines())
    tmp.close()

    if nlines == 1:
        P.touch(outfile)
    else:
        R('''mat <- mat[as.character(diff.genes), ]
             samples <- colnames(mat)
             mat <- as.data.frame(t(apply(mat, 1, scale)))
             colnames(mat) <- samples
         mat <- mat[, mixedsort(colnames(mat))]
         colours = colorRampPalette(c("blue", "white", "red"))(75)
         pdf("%s", height = 12, width = 12)
         heatmap.2(as.matrix(mat),
                   trace = "none",
                   scale = "none",
                   col = colours,
                   Colv = F,
                   dendrogram = "row",
                   margins = c(18, 18))
             dev.off()''' % outfile)

    os.unlink(temp)
Example #21
def loadLncRNAPhyloCSF(infile, outfile):
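    # parse the PhyloCSF output into a flat table and load it with an
    # index on gene_id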
    tmpf = P.getTempFilename("/ifs/scratch")
    PipelineLncRNA.parsePhyloCSF(infile, tmpf)
    P.load(tmpf, outfile, options="--add-index=gene_id")
    os.unlink(tmpf)
Example #22
def buildNormalizedBAM(infiles, outfile, normalize=True):
    '''build a normalized BAM file.

    Infiles are merged and duplicated reads are removed.  If
    *normalize* is set, reads are removed such that all files will
    have approximately the same number of reads.

    Note that the duplicate removal here is imperfect, as
    strandedness is not taken into account.

    '''

    min_reads = getMinimumMappedReads(glob.glob("*.readstats"))

    samfiles = []
    num_reads = 0
    for infile, statsfile in infiles:
        samfiles.append(pysam.Samfile(infile, "rb"))
        num_reads += getMappedReads(statsfile)

    threshold = float(min_reads) / num_reads
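    # probability of keeping each read so that roughly min_reads reads
    # are written in total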

    E.info("%s: min reads: %i, total reads=%i, threshold=%f" %
           (infiles, min_reads, num_reads, threshold))

    pysam_out = pysam.Samfile(outfile, "wb", template=samfiles[0])

    ninput, noutput, nduplicates = 0, 0, 0

    # iterate over mapped reads
    last_contig, last_pos = None, None
    for pysam_in in samfiles:
        for read in pysam_in.fetch():

            ninput += 1
            if read.rname == last_contig and read.pos == last_pos:
                nduplicates += 1
                continue

            if normalize and random.random() <= threshold:
                pysam_out.write(read)
                noutput += 1

            last_contig, last_pos = read.rname, read.pos

        pysam_in.close()

    pysam_out.close()

    logs = IOTools.openFile(outfile + ".log", "w")
    logs.write("# min_reads=%i, threshold= %5.2f\n" %
               (min_reads, threshold))
    logs.write("set\tcounts\tpercent\n")
    logs.write("ninput\t%i\t%5.2f%%\n" % (ninput, 100.0))
    nwithout_dups = ninput - nduplicates
    logs.write("duplicates\t%i\t%5.2f%%\n" %
               (nduplicates, 100.0 * nduplicates / ninput))
    logs.write("without duplicates\t%i\t%5.2f%%\n" %
               (nwithout_dups, 100.0 * nwithout_dups / ninput))
    logs.write("target\t%i\t%5.2f%%\n" %
               (min_reads, 100.0 * min_reads / nwithout_dups))
    logs.write("noutput\t%i\t%5.2f%%\n" %
               (noutput, 100.0 * noutput / nwithout_dups))

    logs.close()

    # if more than one samfile: sort
    if len(samfiles) > 1:
        tmpfilename = P.getTempFilename(".")
        pysam.sort(outfile, tmpfilename)
        shutil.move(tmpfilename + ".bam", outfile)
        os.unlink(tmpfilename)

    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, min_reads))
Example #23
def buildIntervalCounts(infile, outfile, track, fg_replicates, bg_replicates):
    '''count read density in bed files comparing stimulated versus unstimulated binding.
    '''
    samfiles_fg, samfiles_bg = [], []

    # collect foreground and background bam files
    for replicate in fg_replicates:
        samfiles_fg.append("%s.call.bam" % replicate.asFile())

    for replicate in bg_replicates:
        samfiles_bg.append("%s.call.bam" % replicate.asFile())

    samfiles_fg = [x for x in samfiles_fg if os.path.exists(x)]
    samfiles_bg = [x for x in samfiles_bg if os.path.exists(x)]

    samfiles_fg = ",".join(samfiles_fg)
    samfiles_bg = ",".join(samfiles_bg)

    tmpfile1 = P.getTempFilename(os.getcwd()) + ".fg"
    tmpfile2 = P.getTempFilename(os.getcwd()) + ".bg"

    # start counting
    to_cluster = True

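    # count read coverage over the intervals for the foreground samples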
    statement = """
    zcat < %(infile)s 
    | cgat bed2gff --as-gtf 
    | cgat gtf2table 
                --counter=read-coverage 
                --log=%(outfile)s.log 
                --bam-file=%(samfiles_fg)s 
    > %(tmpfile1)s"""
    P.run()

    if samfiles_bg:
        statement = """
        zcat < %(infile)s 
        | cgat bed2gff --as-gtf 
        | cgat gtf2table 
                    --counter=read-coverage 
                    --log=%(outfile)s.log 
                    --bam-file=%(samfiles_bg)s 
        > %(tmpfile2)s"""
        P.run()

        statement = '''
        python %(toolsdir)s/combine_tables.py 
               --add-file-prefix 
               --regex-filename="[.](\S+)$" 
        %(tmpfile1)s %(tmpfile2)s > %(outfile)s
        '''

        P.run()

        os.unlink(tmpfile2)

    else:
        statement = '''
        python %(toolsdir)s/combine_tables.py 
               --add-file-prefix 
               --regex-filename="[.](\S+)$" 
        %(tmpfile1)s > %(outfile)s
        '''

        P.run()

    os.unlink(tmpfile1)