Exemple #1
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Write a spreadsheet summarizing bowtie1 alignment rates.

        The log files are located with
        _find_output_logs(in_data.identifier); each must be named
        <sample>.log and is parsed with alignlib.parse_bowtie1_output.
        A tab-delimited table (one row per sample, sorted by sample
        name) is written to summary.txt in the current directory and
        then converted to outfile by the txt2xls program.

        network, out_attributes, user_options, and num_cores are not
        used by this method.
        """
        import os
        from genomicode import filelib
        from genomicode import parselib
        from genomicode import alignlib
        from genomicode import config
        from genomicode import parallel

        log_filenames = _find_output_logs(in_data.identifier)
        assert log_filenames

        results = {}  # dict of sample -> dictionary of output
        for filename in log_filenames:
            # <path>/<sample>.log
            file_ = os.path.split(filename)[1]
            sample, ext = os.path.splitext(file_)
            assert ext == ".log"
            results[sample] = alignlib.parse_bowtie1_output(filename)

        # Make table where the rows are the samples and the columns
        # are the statistics.
        header = "Sample", "Aligned Reads", "Total Reads", "Perc Aligned"
        table = [header]
        for sample in sorted(results):
            stats = results[sample]
            total_reads = stats["reads_processed"]
            aligned_reads = stats["aligned_reads"]
            # A sample without any reads should not abort the whole
            # summary with a ZeroDivisionError; report it as 0%.
            perc_aligned = 0.0
            if total_reads:
                perc_aligned = float(aligned_reads) / total_reads * 100

            row = (
                sample,
                parselib.pretty_int(aligned_reads),
                parselib.pretty_int(total_reads),
                "%.2f%%" % perc_aligned,
            )
            table.append(row)

        # Write out the table as text file.  The with-statement makes
        # sure the file is flushed and closed before txt2xls reads it,
        # even if a row fails to format.
        TXT_FILE = "summary.txt"
        with open(TXT_FILE, 'w') as handle:
            for row in table:
                handle.write("\t".join(row) + "\n")

        # Quote outfile as well, so that a path with spaces or shell
        # metacharacters is passed through the shell intact.
        txt2xls = filelib.which_assert(config.txt2xls)
        os.system("%s -b %s > %s" % (
            parallel.quote(txt2xls), TXT_FILE, parallel.quote(outfile)))
Exemple #2
0
def read_matrices(filenames):
    """Read the matrices in filenames and return the aligned versions.

    Returns an empty list when no filenames are given.  Otherwise the
    files are loaded with matrixlib.read_matrices, the dimensions of
    each one are printed to stdout, and the aligned matrices are
    returned.
    """
    from genomicode import parselib
    from genomicode import matrixlib

    if not filenames:
        return []

    DATA, ALIGNED = matrixlib.read_matrices(filenames)
    pretty = parselib.pretty_int

    # Report the size of every input matrix, labelled by its file name.
    for matrix, path in zip(DATA, filenames):
        basename = os.path.basename(path)
        num_genes = pretty(matrix.nrow())
        num_samples = pretty(matrix.ncol())
        print("%s has %s genes and %s samples." % (
            basename, num_genes, num_samples))

    # The merged size is only reported when more than one file was read.
    if len(filenames) > 1:
        merged_genes = pretty(ALIGNED[0].nrow())
        print("The merged file has %s genes." % merged_genes)
    sys.stdout.flush()

    return ALIGNED
Exemple #3
0
def resolve_sequence(
    name, bp_upstream, length, ra_chrom_path, knowngene_file,
    default_transcript=None, skip_unknown_genes=False):
    """Locate the promoter region for a gene name.

    name is either "<symbol>" or "<symbol>,<transcript index>".  When
    no index is given, default_transcript is used.  The promoters are
    looked up with genomelib.get_promoters.

    Returns a tuple (gene_symbol, chrom, prom_base, prom_length,
    txn_strand, tss), or None if the gene has no promoters and
    skip_unknown_genes is true.  If several promoters exist and no
    transcript was selected, the candidates are printed and the
    program exits with status 0.
    """
    from genomicode import parselib
    from genomicode import genomelib

    if "," in name:
        # An explicit transcript index overrides the default.
        gene_symbol, index_str = name.split(",", 1)
        assert index_str, "Invalid gene symbol: %r" % name
        transcript_num = int(index_str)
    else:
        gene_symbol = name
        transcript_num = default_transcript

    proms = genomelib.get_promoters(
        gene_symbol, -bp_upstream, length, gene_file=knowngene_file,
        ra_path=ra_chrom_path)
    if skip_unknown_genes and not proms:
        return None
    assert proms, "I could not find gene: %s" % gene_symbol

    if transcript_num is None:
        if len(proms) == 1:
            # Only one choice, so no need to ask the user.
            transcript_num = 0
        elif len(proms) > 1:
            # Ambiguous: list the candidates and stop.
            print("Multiple transcripts for %s." % gene_symbol)
            print("Please specify a specific transcript.")
            columns = "Index", "Chrom", "Strand", "TSS"
            print("\t".join([str(c) for c in columns]))
            for index, prom in enumerate(proms):
                chrom, tss, strand, _start, _length, _seq = prom
                row = index, chrom, strand, parselib.pretty_int(tss)
                print("\t".join([str(v) for v in row]))
            sys.exit(0)

    assert transcript_num is not None
    assert 0 <= transcript_num < len(proms), \
           "Invalid transcript: %s" % transcript_num

    chosen = proms[transcript_num]
    chrom, tss, txn_strand, prom_base, prom_length, prom_seq = chosen
    return gene_symbol, chrom, prom_base, prom_length, txn_strand, tss
Exemple #4
0
def pretty_runtime(start_time, end_time):
    """Format the elapsed time between two timestamps given in seconds.

    Returns "<N>s" (whole seconds, formatted by parselib.pretty_int)
    when the run took less than a minute, "MM:SS.mmm" when it took
    less than an hour, and "HH:MM:SS.mmm" otherwise.
    """
    elapsed = end_time - start_time
    whole_secs = int(elapsed)
    millis = int((elapsed - whole_secs) * 1000)

    # divmod always does integer (floor) division on ints.  Plain "/"
    # would yield fractional hours and minutes under true division,
    # which breaks the "== 0" tests below.
    num_hours, remainder = divmod(whole_secs, 3600)
    num_mins, num_secs = divmod(remainder, 60)

    if num_hours == 0 and num_mins == 0:
        # Only this branch needs parselib, so import it here.
        from genomicode import parselib
        return "%ss" % parselib.pretty_int(num_secs)
    if num_hours == 0:
        return "%02d:%02d.%03d" % (num_mins, num_secs, millis)
    return "%02d:%02d:%02d.%03d" % (num_hours, num_mins, num_secs, millis)
Exemple #5
0
def summarize_report(filenames, matrices, num_factors, start_time,
                     file_layout):
    """Write the BFRMNormalize HTML report to file_layout.REPORT.

    filenames     Paths of the input data sets; only the file names
                  are shown in the report.
    matrices      One matrix per filename; ncol() gives the number of
                  samples.
    num_factors   Number of factors used for the normalization.
    start_time    Time the analysis started, in seconds (compared
                  against time.time() to compute the run time).
    file_layout   Object providing the paths DS_PROC, DS_FINAL,
                  DS_PROC_HEATMAP, DS_FINAL_HEATMAP, DS_PROC_SCATTER,
                  DS_FINAL_SCATTER and REPORT.  Links in the report
                  use only the file-name part of each path.

    Raises AssertionError if filenames and matrices differ in length
    or if the hostname cannot be determined.
    """
    import time
    import subprocess
    from genomicode import htmllib
    from genomicode import parselib

    #def highlight(s):
    #    return htmllib.SPAN(s, style="background-color:yellow")

    def image_cell(caption, path, height):
        # A centered, captioned thumbnail that links to the full image.
        x = os.path.split(path)[1]
        return htmllib.CENTER(
            htmllib.B(caption) + htmllib.BR() +
            htmllib.A(htmllib.IMG(height=height, src=x), href=x))

    assert len(filenames) == len(matrices)

    lines = []
    w = lines.append
    w("<HTML>")
    w(htmllib.HEAD(htmllib.TITLE("BFRMNormalize Report")))
    w("<BODY>")
    w(htmllib.CENTER(htmllib.H1(htmllib.EM("BFRMNormalize") + " Report")))

    w(htmllib.H3("I.  Overview"))
    files = [os.path.split(x)[1] for x in filenames]
    x1 = "one data set"
    if len(files) > 1:
        x1 = "the following data sets"
    x2 = "factor"
    if num_factors > 1:
        x2 = "factors"
    items = ["I normalized %s using %d %s." % (x1, num_factors, x2)]
    for name, matrix in zip(files, matrices):
        x = "%s (%d samples)" % (name, matrix.ncol())
        items.append(htmllib.LI() + x)
    w(htmllib.UL("\n".join(items)))

    w(htmllib.P())
    x = os.path.split(file_layout.DS_PROC)[1]
    w("The merged gene expression data set is available at " +
      htmllib.A(x, href=x) + ".")
    w(htmllib.BR())
    x = os.path.split(file_layout.DS_FINAL)[1]
    w("The normalized data set is available at " + htmllib.A(x, href=x) + ".")

    w(htmllib.P())
    w(htmllib.H3("II.  Results"))

    # Make the table of the heatmaps.
    x1 = image_cell("Before Normalization", file_layout.DS_PROC_HEATMAP, 480)
    x2 = image_cell("After Normalization", file_layout.DS_FINAL_HEATMAP, 480)
    row1 = htmllib.TR(htmllib.TD(x1) + htmllib.TD(x2))

    caption = (
        "These heatmaps show the expression patterns in the data before "
        "and after normalization.  "
        "The rows contain the %d genes that exhibit the highest variance "
        "in gene expression across the original data set.  "
        "The columns contain the samples in the data sets provided.  "
        "The genes and samples are in the same order in both heatmaps.  "
        "Warm colors indicate high expression of the gene, and cool colors "
        "indicate low expression." % NUM_FILTERED_GENES)
    x = htmllib.TD(
        htmllib.B("Figure 1: Heatmaps. ") + caption, colspan=2)
    row2 = htmllib.TR(x)

    # NOTE(review): "50%%" is passed verbatim (no %-formatting happens
    # on this line); confirm htmllib expects the doubled percent sign.
    w(htmllib.TABLE(row1 + row2, border=0, cellspacing=10, width="50%%"))

    w(htmllib.P())

    # Make the table of the scatter plots.
    x1 = image_cell("Before Normalization", file_layout.DS_PROC_SCATTER, 400)
    x2 = image_cell("After Normalization", file_layout.DS_FINAL_SCATTER, 400)
    row1 = htmllib.TR(htmllib.TD(x1) + htmllib.TD(x2))

    x1 = (
        "These plots show the samples projected onto the first two principal "
        "components of the expression profiles of the %d genes that "
        "exhibit the highest variance across the original data set.  " %
        NUM_FILTERED_GENES)
    x2 = ("Each point represents a sample, and samples from the same data "
          "set have the same color.  "
          "If there are batch effects, the samples from the same data set "
          "(the same color) will cluster together.  "
          "If there are no batch effects, the colors should be mixed.")
    if len(filenames) == 1:
        # The batch-effect explanation only makes sense for 2+ data sets.
        x2 = ""
    x = htmllib.TD(htmllib.B("Figure 2: PCA Plots. ") + x1 + x2, colspan=2)
    row2 = htmllib.TR(x)

    w(htmllib.TABLE(row1 + row2, border=0, cellspacing=10, width="50%%"))

    # Format the current time.
    end_time = time.time()
    time_str = parselib.pretty_date(start_time)
    # divmod keeps the minutes an integer regardless of division mode.
    num_min, num_secs = divmod(int(end_time - start_time), 60)
    if num_min == 0:
        run_time = "%ss" % parselib.pretty_int(num_secs)
    else:
        run_time = "%sm %ss" % (parselib.pretty_int(num_min), num_secs)

    # Get the hostname.  communicate() closes stdin, reads all of the
    # output and waits for the child, so no process is left unreaped.
    cmd = "hostname"
    p = subprocess.Popen(cmd,
                         shell=True,
                         bufsize=0,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         close_fds=True)
    hostname = p.communicate()[0].strip()
    assert hostname, "I could not get the hostname."

    w(htmllib.P())
    w(htmllib.HR())
    w(
        htmllib.EM(
            "This analysis was run on %s on %s.  It took %s to complete." %
            (time_str, hostname, run_time)))

    w("</BODY>")
    w("</HTML>")

    # Close the report explicitly so that it is flushed to disk when
    # this function returns.
    x = "\n".join(lines) + "\n"
    outfile = file_layout.REPORT
    with open(outfile, 'w') as handle:
        handle.write(x)
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Write a spreadsheet summarizing tophat alignment rates.

        Every align_summary.txt found under in_data.identifier must
        live in a directory named <sample>.tophat; it is parsed with
        alignlib.parse_tophat_align_summary.  A tab-delimited table
        (one row per sample, sorted by sample name) is written to
        summary.txt in the current directory and then converted to
        outfile by the txt2xls program.

        network, out_attributes, user_options, and num_cores are not
        used by this method.
        """
        import os
        from genomicode import filelib
        from genomicode import parselib
        from genomicode import alignlib
        from genomicode import config
        from genomicode import parallel

        TOPHAT_EXT = ".tophat"

        align_node = in_data
        align_filenames = filelib.list_files_in_path(
            align_node.identifier, endswith="align_summary.txt")
        assert align_filenames, "Missing align_summary.txt"

        results = {}  # dict of sample -> dictionary of output
        for filename in align_filenames:
            # Names must be in the format:
            # <path>/<sample>.tophat/align_summary.txt
            # full_path   <path>/<sample>.tophat
            # tophat_dir  <sample>.tophat
            # file_       align_summary.txt
            # sample      <sample>
            full_path, file_ = os.path.split(filename)
            tophat_dir = os.path.split(full_path)[1]
            assert file_ == "align_summary.txt"
            assert tophat_dir.endswith(TOPHAT_EXT)
            sample = tophat_dir[:-len(TOPHAT_EXT)]

            results[sample] = alignlib.parse_tophat_align_summary(filename)

        # Make table where the rows are the samples and the columns
        # are the statistics.
        header = "Sample", "Aligned Reads", "Total Reads", "Perc Aligned"
        table = [header]
        for sample in sorted(results):
            stats = results[sample]
            total_reads = stats["reads_processed"]
            aligned_reads = stats["aligned_reads"]
            # A sample without any reads should not abort the whole
            # summary with a ZeroDivisionError; report it as 0%.
            perc_aligned = 0.0
            if total_reads:
                perc_aligned = float(aligned_reads) / total_reads * 100

            row = (
                sample,
                parselib.pretty_int(aligned_reads),
                parselib.pretty_int(total_reads),
                "%.2f%%" % perc_aligned,
            )
            table.append(row)

        # Write out the table as text file.  The with-statement makes
        # sure the file is flushed and closed before txt2xls reads it,
        # even if a row fails to format.
        TXT_FILE = "summary.txt"
        with open(TXT_FILE, 'w') as handle:
            for row in table:
                handle.write("\t".join(row) + "\n")

        # Quote outfile as well, so that a path with spaces or shell
        # metacharacters is passed through the shell intact.
        txt2xls = filelib.which_assert(config.txt2xls)
        os.system("%s -b %s > %s" % (
            parallel.quote(txt2xls), TXT_FILE, parallel.quote(outfile)))
Exemple #7
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Count reads and alignments per sample and write a spreadsheet.

        antecedents is a (fastq_node, sample_node, bam_node) triple.
        For every BAM file under bam_node.identifier, the reads in the
        first FASTQ file of the matching sample are counted with
        count_reads and the alignments in the BAM file with
        count_alignments.  The counts are computed in parallel on at
        most MAX_CORES processes, written to summary.txt in the
        current directory, and converted to outfile with txt2xls.

        Returns a metadata dictionary holding the number of cores
        used ("num_cores").  network, out_attributes, and user_options
        are not used by this method.
        """
        from genomicode import parselib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        MAX_CORES = 4  # I/O intensive.

        fastq_node, sample_node, bam_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        sample2fastq = mlib.find_merged_fastq_files(sample_node.identifier,
                                                    fastq_node.identifier,
                                                    as_dict=True)

        metadata = {}

        jobs = []  # list of (sample, bam_file, fastq_file)
        for filename in bam_filenames:
            path, sample, ext = mlib.splitpath(filename)
            assert sample in sample2fastq, "Missing fastq: %s" % sample
            # Only the first FASTQ file of the sample is counted.
            fastq1, _fastq2 = sample2fastq[sample]
            jobs.append((sample, filename, fastq1))

        # Two function calls per job, in a fixed order, so that the
        # results can be paired up again by position below.
        funcalls = []
        for sample, bam_filename, fastq_filename in jobs:
            # Count the number of reads.
            funcalls.append((count_reads, (fastq_filename, ), {}))
            # Count the number of alignments.
            funcalls.append((count_alignments, (bam_filename, ), {}))
        assert len(funcalls) == len(jobs) * 2

        nc = min(num_cores, MAX_CORES)
        results = parallel.pyfun(funcalls, num_procs=nc)
        metadata["num_cores"] = nc

        # list of (sample, aligns, aligned_reads, total_reads, perc_aligned).
        summary = []
        for i, job in enumerate(jobs):
            sample = job[0]
            total_reads = results[i * 2]
            aligned_reads, alignments = results[i * 2 + 1]
            # A sample without any reads should not abort the whole
            # summary with a ZeroDivisionError; report it as 0%.
            perc_aligned = 0.0
            if total_reads:
                perc_aligned = float(aligned_reads) / total_reads
            summary.append(
                (sample, alignments, aligned_reads, total_reads,
                 perc_aligned))

        # sort by sample name
        summary.sort()

        # Make table where the rows are the samples and the columns
        # are the statistics.
        header = ("Sample", "Alignments", "Aligned Reads", "Total Reads",
                  "Perc Aligned")
        table = [header]
        for x in summary:
            sample, alignments, aligned_reads, total_reads, perc_aligned = x
            row = (
                sample,
                parselib.pretty_int(alignments),
                parselib.pretty_int(aligned_reads),
                parselib.pretty_int(total_reads),
                "%.2f%%" % (perc_aligned * 100),
            )
            assert len(row) == len(header)
            table.append(row)

        # Write out the table as text file.  The with-statement makes
        # sure the file is flushed and closed before txt2xls reads it,
        # even if a row fails to format.
        TXT_FILE = "summary.txt"
        with open(TXT_FILE, 'w') as handle:
            for row in table:
                handle.write("\t".join(row) + "\n")

        # Quote outfile, so that a path with spaces or shell
        # metacharacters is passed through the shell intact.
        txt2xls = mlib.findbin("txt2xls", quote=True)
        parallel.sshell("%s -b %s > %s" % (
            txt2xls, TXT_FILE, parallel.quote(outfile)))
        return metadata
Exemple #8
0
def main():
    """Find transcription factor binding sites near genes and plot them.

    Each command-line argument is resolved with resolve_symbol_or_file
    into gene symbols of the form <GENE SYMBOL>[,#].  For every gene,
    the region from --upstream bases before to --downstream bases
    after the TSS is scored with motiflib.score_tfbs_genome against
    the requested matrices.  The hits are printed to stdout (as a
    tab-delimited table with --output_as_table) and drawn into a
    single image written to <jobname>.png or <jobname>.svg.
    """
    from optparse import OptionParser, OptionGroup

    import math
    from genomicode import genomelib
    from genomicode import motiflib
    from genomicode import parselib

    usage = "usage: %prog [options] <GENE SYMBOL>[,#] [...]"
    parser = OptionParser(usage=usage, version="%prog 01")

    # Matrix options.
    parser.add_option(
        "-m", "--matrix", dest="matrices", default=[], action="append",
        help="Plot binding sites for this matrix or gene.  "
        "Format: <matrix>[,<color>[,<f|o>].  Color is 0xRRGGBB format.  "
        "f(illed)|o(outline); def=f.")
    parser.add_option(
        "--all_matrices", default=False, action="store_true",
        help="Search all matrices.")

    # Gene options.
    parser.add_option(
        "--genome", default="hg19",
        help="Genome to search, e.g. hg19 (default), mm11.")
    parser.add_option(
        "--upstream", dest="upstream", default=250, type="int",
        help="Number of base pairs upstream of the TSS (def 250).")
    parser.add_option(
        "--downstream", dest="downstream", default=50, type="int",
        help="Number of base pairs downstream of the TSS (def 50).")
    parser.add_option(
        "-s", dest="strict", default=False, action="store_true",
        help="Use strict checking of gene names.")

    # Plotting options.
    parser.add_option(
        "--output_as_table", default=False, action="store_true")
    parser.add_option(
        "--pvalue", dest="pvalue_cutoff", default=None, type="float",
        help="p-value cutoff.")
    parser.add_option(
        "--max_genes", dest="max_genes", default=None, type="int",
        help="Maximum number of genes to plot.")
    parser.add_option(
        "-l", dest="label", default=False, action="store_true",
        help="Label the gene names.")
    parser.add_option(
        "-g", dest="gain", default=1, type="float",
        help="Increase the gain of the colors of the TFBS (def 1).")
    parser.add_option(
        "--format", dest="image_format", type="choice",
        choices=["png", "svg"], default="png",
        help="Image format: png (default) or svg.")

    # Running options.
    parser.add_option(
        "-j", "--jobname", dest="jobname", type="string", default="out",
        help="Name of output file.")
    parser.add_option(
        "--num_procs", dest="num_procs", type="int", default=1,
        help="Number of jobs to run in parallel.")

    options, args = parser.parse_args()
    if not args:
        parser.error("Please specify a gene to analyze.")
    if options.num_procs < 1 or options.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")

    ra_chrom_path, knowngene_file = resolve_genome_files(options.genome)

    gene_symbols = []
    for symbol_or_file in args:
        x = resolve_symbol_or_file(symbol_or_file)
        gene_symbols.extend(x)

    # Without -s, fall back to the first transcript and silently skip
    # genes that cannot be found.
    default_transcript = None
    skip_unknown_genes = False
    if not options.strict:
        default_transcript = 0
        skip_unknown_genes = True

    if not options.matrices and not options.all_matrices:
        parser.error("Please specify a matrix to plot.")
    if options.upstream < 0:
        parser.error("Upstream should be 0 or positive.")
    if options.downstream < 0:
        parser.error("Downstream should be 0 or positive.")
    seq_length = options.upstream + options.downstream
    if seq_length <= 0:
        parser.error("No sequence.")

    # Choose a plotting library.
    if options.jobname.lower().endswith(".png"):
        options.jobname = options.jobname[:-4]
    if options.image_format == "svg":
        plotlib = __import__(
            "genomicode.svgplot", globals(), locals(), ["svgplot"])
        outfile = "%s.svg" % options.jobname
        outfile_mode = "w"
    else:
        plotlib = __import__(
            "genomicode.pilplot", globals(), locals(), ["pilplot"])
        outfile = "%s.png" % options.jobname
        # PNG data is binary; text mode can corrupt it on platforms
        # that translate newlines.
        outfile_mode = "wb"

    # Figure out the genes to plot.
    GENES = []   # list of gene_symbol, chrom, start, length, strand, tss
    for name in gene_symbols:
        # Figure out where the gene lies on the chromosome.
        x = resolve_sequence(
            name, options.upstream, seq_length, ra_chrom_path, knowngene_file,
            default_transcript=default_transcript,
            skip_unknown_genes=skip_unknown_genes)
        if x is None:
            continue
        GENES.append(x)

    if options.max_genes is not None:
        if options.max_genes <= 0:
            parser.error("Invalid max_genes.")
        GENES = GENES[:options.max_genes]

    # Figure out the matrices to search for.
    x = resolve_matrices(options.matrices, options.all_matrices)
    matrices, matrix2color, matrix2style = x

    # The cutoff is the same for every gene, so convert the p-value to
    # a negative log p-value once.  No p-value (or 0) means no cutoff.
    nlp_cutoff = 0
    if options.pvalue_cutoff:
        nlp_cutoff = -math.log(options.pvalue_cutoff)

    # Constants governing how the figures are drawn.
    PIXELS_PER_BP = 4         # How many pixels for each base pair.
    X_BORDER = 5              # Number of pixels of border around the figure.
    Y_BORDER = 5
    GENE_BUFFER = 4           # Buffer between genes, in pixels.

    SEQ_COLOR = (0, 0, 0)     # Color for the sequence.
    SEQ_WIDTH = 1             # Width of line for sequence, in pixels.
    TSS_HEIGHT = 3            # Height of the TSS, in base pairs.
    TSS_WIDTH = TSS_HEIGHT*2  # Width of the TSS
    TFBS_HEIGHT = 2           # Height of TFBS, in base pairs.
    HASH_INTERVAL = 50        # Number of base pairs per hash mark.
    HASH_HEIGHT = 0.5         # Height of hash marks, in base pairs
    LABEL_HEIGHT = 5          # Height of label, in base pairs.

    NUM_GENES = len(GENES)

    # Figure out the geometry for a single gene.
    plot_width = int(PIXELS_PER_BP * seq_length)
    # Hack: Add some to TFBS_HEIGHT to handle overlapping TFBS.
    plot_height = int(PIXELS_PER_BP * max(TSS_HEIGHT, TFBS_HEIGHT+6)*2)
    # Figure out the geometry for the entire figure.
    total_width = X_BORDER*2 + plot_width
    total_height = (
        Y_BORDER*2 + plot_height*NUM_GENES + GENE_BUFFER*(NUM_GENES-1))

    image = plotlib.image(total_width, total_height)

    if options.output_as_table:
        header = [
            "Gene Symbol", "Chromosome", "Strand", "Transcription Start",
            "Matrix ID", "Matrix Gene Symbol", "TFBS Pos", "TFBS Strand",
            "TFBS Offset", "P-value", "Sequence"]
        print("\t".join(header))

    # Plot each gene.
    for gene_num, x in enumerate(GENES):
        gene_symbol, chrom, gen_start, gen_length, txn_strand, tss = x
        if not options.output_as_table:
            print("%s: %+d to %+d relative to TSS at chr%s:%s:%s." % (
                gene_symbol, -options.upstream, -options.upstream+seq_length,
                chrom, parselib.pretty_int(tss), txn_strand))

        # Get the TFBS from that site.
        data = motiflib.score_tfbs_genome(
            chrom, gen_start, gen_length, matrices=matrices, nlp=nlp_cutoff,
            num_procs=options.num_procs, ra_path=ra_chrom_path)

        ## # If multiple matrices for the same gene symbol hit the same
        ## # place, then keep the one with the highest NLP.
        ## # Sort by gene_symbol, position, decreasing NLP.
        ## x = [(matid2info[x[0]].Gene_Symbol, x[3], -x[4], x) for x in data]
        ## x.sort()
        ## data = [x[-1] for x in x]
        ## i = 0
        ## while i < len(data)-1:
        ##     di, dj = data[i], data[i+1]
        ##     gs0 = matid2info[di[0]].Gene_Symbol
        ##     gs1 = matid2info[dj[0]].Gene_Symbol
        ##     # Same matrix and position.
        ##     if gs0 == gs1 and di[3] == dj[3]:
        ##         del data[i+1]
        ##     else:
        ##         i += 1

        # Sort by position relative to TSS, decreasing NLP
        decorated = [
            (genomelib.genbase2tssbase(hit[3], tss, txn_strand), -hit[4], hit)
            for hit in data]
        decorated.sort()
        data = [d[-1] for d in decorated]

        # Inspect the data to improve the formatting of the results.
        name_len = 0
        gs_len = 0
        tss_dist_len = 0
        #NLP_len = 0
        for x in data:
            matrix, chrom, strand, pos, NLP = x

            m = motiflib.matid2matrix(matrix)

            name_len = max(name_len, len(matrix))
            gs_len = max(gs_len, len(m.gene_symbol))
            tss_dist = genomelib.calc_tss_seq_dist(
                tss, txn_strand, pos, m.length)
            tss_dist_len = max(
                tss_dist_len, len(parselib.pretty_int(tss_dist)))
            #NLP_len = max(NLP_len, len("%.2f" % NLP))

        # Print the data.
        binding_sites = []
        for x in data:
            matrix, chrom, strand, pos, NLP = x

            m = motiflib.matid2matrix(matrix)

            x = matrix, pos, m.length, strand, NLP
            binding_sites.append(x)

            # Get the matrix sequence, with some flanking bases.  The
            # left flank is clipped so that the sequence never starts
            # before position 0.
            FLANK = int(math.ceil((20-m.length)/2.0))
            FLANK = max(FLANK, 2)
            left_flank = min(FLANK, pos)
            right_flank = FLANK
            seq_pos = pos - left_flank
            seq_len = m.length + left_flank + right_flank
            seq = genomelib.get_sequence(
                chrom, seq_pos, seq_len, ra_path=ra_chrom_path)
            # Show the binding site in upper case, the flanks in lower.
            s1 = seq[:left_flank]
            s2 = seq[left_flank:left_flank+m.length]
            s3 = seq[left_flank+m.length:]
            seq = s1.lower() + s2.upper() + s3.lower()

            # Print to stdout.
            mat_strand = "+"
            if strand != txn_strand:
                mat_strand = "-"
            tss_dist = genomelib.calc_tss_seq_dist(
                tss, txn_strand, pos, m.length)
            pvalue = parselib.pretty_pvalue(math.exp(-NLP), nsig=2)
            if options.output_as_table:
                x = (gene_symbol, chrom, txn_strand, tss,
                     matrix, m.gene_symbol,
                     pos, strand, tss_dist, pvalue, seq)
                assert len(x) == len(header)
                print("\t".join(map(str, x)))
            else:
                #print "  %-*s [%*s] %s:%s:%s (%*s:%s) %*.2f %s" % (
                print("  %-*s [%*s] %s:%s (%*s) %-8s %s" % (
                    gs_len, m.gene_symbol, name_len, matrix,
                    parselib.pretty_int(pos), strand, tss_dist_len,
                    parselib.pretty_int(tss_dist), pvalue, seq))

        # Initialize some objects to plot the figures.
        bp_start = gen_start
        if txn_strand == "-":
            bp_start = gen_start+gen_length
        x_bp = bp_start
        y_bp = 0
        x_pix = X_BORDER
        y_pix = (
            Y_BORDER + gene_num*(plot_height+GENE_BUFFER) + plot_height//2)

        bp2pix = BasePair2PixConverter(
            PIXELS_PER_BP, x_bp, y_bp, x_pix, y_pix, txn_strand)
        seq_layout = SequenceLayout(
            bp2pix, SEQ_WIDTH, TSS_HEIGHT, TSS_WIDTH,
            HASH_INTERVAL, HASH_HEIGHT, SEQ_COLOR)
        tfbs_layout = TFBSLayout(
            bp2pix, TFBS_HEIGHT, matrix2color, matrix2style, options.gain)
        label_layout = None
        if options.label:
            label_layout = LabelLayout(bp2pix, LABEL_HEIGHT)

        # Actually plot the figures.
        plot(
            plotlib, image, seq_layout, tfbs_layout, label_layout,
            gene_symbol, gen_start, gen_length, tss, binding_sites)
        sys.stdout.flush()

    # Close the handle explicitly so the image is completely flushed
    # to disk before the program exits.
    with open(outfile, outfile_mode) as handle:
        plotlib.write(image, handle)
        genes = genomelib.filter_unique_tss(x)
        assert genes

        # If there is only 1 gene, then use that one.
        if len(genes) == 1 and transcript_num is None:
            transcript_num = 0

        if len(genes) > 1 and transcript_num is None:
            print "Multiple transcripts for %s." % gene_symbol
            print "Please specify a specific transcript."
            x = "Index", "Chrom", "Strand", "TSS"
            print "\t".join(map(str, x))
            for i in range(len(genes)):
                tss = genomelib.transcript2tss(
                    genes[i].txn_start, genes[i].txn_length,genes[i].strand)
                tss = parselib.pretty_int(tss)
                x = i, genes[i].chrom, genes[i].strand, tss
                print "\t".join(map(str, x))
            return
        
        assert transcript_num is not None
        assert transcript_num >= 0 and transcript_num < len(genes), \
               "Invalid transcript: %s" % transcript_num
        gene = genes[transcript_num]

        x = genomelib.transcript2promoter(
            gene.txn_start, gene.txn_length, gene.strand, gene_offset,
            gene_length)
        base, length, strand = x
        chrom = gene.chrom
        chrom_base = base
Exemple #10
0
def main():
    """Split a FASTQ file into matching and leftover reads.

    A read matches if its title is one of the --titles, or if
    aptamerlib.guess_sequence_orientation returns -1 or 1 for its
    sequence against the library in --library_file.  Reads shorter
    than --min_seqlen are discarded.  Matching reads are written to
    --match_file (required) and all other reads to --leftover_file
    (optional), both in FASTQ format.  A progress message is printed
    to stdout at most once every 5 seconds.

    Raises AssertionError if an argument is invalid or if an output
    file already exists and --clobber was not given.
    """
    import os
    import sys
    import time
    import argparse
    #import multiprocessing

    from genomicode import aptamerlib
    from genomicode import parselib

    parser = argparse.ArgumentParser(description="Pull out a subset of reads.")
    parser.add_argument("sequence_file", help="FASTQ-formatted sequence file.")
    parser.add_argument("-j",
                        dest="num_procs",
                        type=int,
                        default=1,
                        help="Number of jobs to run in parallel.")

    parser.add_argument("--match_file",
                        help="File for reads that match this library.")
    parser.add_argument("--leftover_file",
                        help="File for leftover reads that don't match.")
    parser.add_argument("--clobber", default=False, action="store_true")

    parser.add_argument(
        "--min_seqlen",
        type=int,
        default=None,
        help="Discard sequences less than this minimum length.")
    parser.add_argument("--library_file",
                        help="Want reads that match this library.")
    parser.add_argument(
        "--titles",
        default=[],
        action="append",
        help="Want reads with these titles.  "
        "Comma-separated titles, parameter can be used multiple times.")

    args = parser.parse_args()

    # Check the inputs.
    assert os.path.exists(args.sequence_file), \
           "File not found: %s" % args.sequence_file
    assert args.num_procs >= 1 and args.num_procs < 256
    assert args.min_seqlen is None or (args.min_seqlen >= 0
                                       and args.min_seqlen < 100)
    assert not args.library_file or os.path.exists(args.library_file), \
           "File not found: %s" % args.library_file

    assert args.match_file, "Please specify a match_file."
    if not args.clobber and (args.match_file
                             and os.path.exists(args.match_file)):
        raise AssertionError("match_file %s exists.  "
                             "Please use --clobber to overwrite." %
                             args.match_file)
    if not args.clobber and (args.leftover_file
                             and os.path.exists(args.leftover_file)):
        raise AssertionError("leftover_file %s exists.  "
                             "Please use --clobber to overwrite." %
                             args.leftover_file)

    titles = _parse_titles(args.titles)

    # Load the library before touching the output files, so that a
    # bad library file does not leave a truncated match_file behind.
    library = None
    if args.library_file:
        library = aptamerlib.read_library(args.library_file)

    #manager = multiprocessing.Manager()
    #lock = manager.Lock()
    #pool = multiprocessing.Pool(args.num_procs)

    TIME_FORMAT = "%m/%d/%Y %H:%M:%S"
    FASTQ_RECORD = "%s\n%s\n+\n%s\n"  # title, sequence, quality

    match_handle = open(args.match_file, 'w')
    leftover_handle = None
    # try/finally guarantees that both output files are closed (and
    # flushed) even if parsing or matching raises part way through.
    try:
        if args.leftover_file:
            leftover_handle = open(args.leftover_file, 'w')

        last_time = None
        for i, x in enumerate(aptamerlib.parse_fastq(args.sequence_file)):
            title, sequence, quality = x

            # Report progress at most once every 5 seconds.
            t = time.time()
            if last_time is None or t > last_time + 5:
                last_time = t
                now = time.strftime(TIME_FORMAT, time.localtime(t))
                print("%s\t%s" % (now, "Extracting read %s." %
                                  parselib.pretty_int(i + 1)))
                sys.stdout.flush()

            if args.min_seqlen is not None and len(sequence) < args.min_seqlen:
                continue

            is_match = False

            # Keep if either the title matches or the library matches.
            if titles:
                if title in titles:
                    is_match = True
            if library:
                orientation = aptamerlib.guess_sequence_orientation(
                    sequence, library)
                assert orientation in [-1, 0, 1]
                if orientation in [-1, 1]:
                    is_match = True

            if is_match:
                match_handle.write(
                    FASTQ_RECORD % (title, sequence, quality))
            elif leftover_handle:
                leftover_handle.write(
                    FASTQ_RECORD % (title, sequence, quality))
    finally:
        match_handle.close()
        if leftover_handle:
            leftover_handle.close()
def main():
    from optparse import OptionParser, OptionGroup
    from genomicode import genomelib
    from genomicode import primer3
    from genomicode import parselib

    # gene    E2F1 ENSA,0 (<gene>,<tss>).
    usage = "usage: %prog [options] <gene>"
    parser = OptionParser(usage=usage, version="%prog 01")

    parser.add_option("--product_size",
                      dest="product_size",
                      default=[],
                      action="append",
                      help="Add a product size to search, e.g. 75-100.")
    parser.add_option("-n",
                      dest="num_primers",
                      type="int",
                      default=None,
                      help="Number of primer pairs to pick.")
    parser.add_option("-v",
                      dest="verbose",
                      action="store_true",
                      default=False,
                      help="Make output verbose.")

    options, args = parser.parse_args()
    if len(args) != 1:
        print usage
        sys.exit(-1)
    gene, = args

    assert options.num_primers is None or options.num_primers > 0
    if not options.product_size:
        # Default Product Size Range for web is:
        # 150-250 100-300 301-400 401-500 501-600 601-700 701-850 851-1000
        options.product_size = [
            (50, 100),
            (101, 150),
            (151, 200),
            (201, 300),
        ]
    for mn, mx in options.product_size:
        assert mn > 0
        assert mn < mx

    gene_symbol, transcript_num = gene.upper(), None
    # See if a transcript num was specified.
    if "," in gene_symbol:
        gene_symbol, x = gene_symbol.split(",", 1)
        transcript_num = int(x)
        assert transcript_num >= 0

    genes = genomelib.get_gene_coords(gene_symbol)
    assert genes

    # If there is only 1 gene, then use that one.
    if len(genes) == 1 and transcript_num is None:
        transcript_num = 0

    if len(genes) > 1 and transcript_num is None:
        print "Multiple transcripts for %s." % gene_symbol
        print "Please specify a specific transcript."
        x = "Index", "Chrom", "Strand", "TSS"
        print "\t".join(map(str, x))
        for i in range(len(genes)):
            tss = genomelib.transcript2tss(genes[i].txn_start,
                                           genes[i].txn_length,
                                           genes[i].strand)
            tss = parselib.pretty_int(tss)
            x = i, genes[i].chrom, genes[i].strand, tss
            print "\t".join(map(str, x))
        return

    assert transcript_num is not None
    assert transcript_num >= 0 and transcript_num < len(genes), \
           "Invalid transcript: %s" % transcript_num
    gene = genes[transcript_num]

    seq = genomelib.get_transcript(gene.chrom, gene.strand, gene.txn_start,
                                   gene.txn_length, gene.exon_starts,
                                   gene.exon_lengths)
    #print gene.txn_start, gene.txn_length
    #genomelib.write_fasta("HELLO", seq)

    # Search for primers on just the exons.
    exon_seq = ExonSequence(seq)
    primers = primer3.primer3(exon_seq.sub_seq,
                              product_size=options.product_size,
                              num_return=options.num_primers)
    if not primers:
        print "No primers found."
        return

    # Revcomp the right primer.
    for d1, d2, size in primers:
        d2.seq_rc = genomelib.revcomp(d2.seq)

    if options.verbose:
        x = [
            "Index", "L_Seq", "L_Pos", "L_Length", "L_Tm", "L_GC", "L_Exon",
            "R_Seq", "R_Pos", "R_Length", "R_Tm", "R_GC", "R_Exon",
            "Genome Size", "Product Size"
        ]
        print "\t".join(x)

    for zzz, x in enumerate(primers):
        d1, d2, size = x

        # Calculate the size of the genomic product.
        gen_left = exon_seq.full_seq.find(d1.seq.upper())
        gen_right = exon_seq.full_seq.find(d2.seq_rc.upper())
        gen_size = 0
        if gen_left >= 0 and gen_right >= 0:
            gen_size = gen_right - gen_left + len(d2.seq_rc)

        # Calculate the size of the PCR product.
        pcr_left = exon_seq.sub_seq.find(d1.seq.upper())
        pcr_right = exon_seq.sub_seq.find(d2.seq_rc.upper())
        pcr_size = pcr_right - pcr_left + len(d2.seq)
        assert pcr_size == size

        # Figure out the exons of the primers.
        L_exon_ID = [
            exon_seq.sub_exonid[pcr_left + i] for i in range(len(d1.seq))
        ]
        R_exon_ID = [
            exon_seq.sub_exonid[pcr_right + i] for i in range(len(d2.seq))
        ]
        L_exon_ID_str = format_exon_ids(L_exon_ID)
        R_exon_ID_str = format_exon_ids(R_exon_ID)

        # Ignore primers on the same exon.
        if L_exon_ID == R_exon_ID:
            continue

        if options.verbose:
            x = (zzz + 1, d1.seq, pcr_left, d1.length, d1.tm, d1.gc_percent,
                 L_exon_ID_str, d2.seq, pcr_right, d2.length, d2.tm,
                 d2.gc_percent, R_exon_ID_str, gen_size, size)
            print "\t".join(map(str, x))
        else:
            print d1.seq
            print d2.seq
            print "Product Size=%d" % size
            print