Example 1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--bam-file",
                      dest="bam_file",
                      type="string",
                      help="supply input bam file name")

    parser.add_option("-g",
                      "--gtf-file",
                      dest="gtf_file",
                      type="string",
                      help="supply input gtf file name")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="supply output file name")

    parser.add_option(
        "-G",
        "--reference-gtf-file",
        dest="reference_gtf",
        type="string",
        help=
        "supply reference gtf for context of reads not contributing to transcripts"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ######################################################
    ######################################################
    # for all alignments
    ######################################################
    ######################################################

    # open outfile and prepare headers
    outf = IOTools.openFile(options.outfile, "w")
    outf.write("\t".join([
        "total alignments", "aligments in transcripts",
        "percent alignments in transcripts", "total spliced alignments",
        "spliced alignments in transcripts",
        "percent spliced alignments in transcripts"
    ]) + "\n")

    # calculate coverage over transcript file - NB split reads contribute twice to the transcript
    # use BedTool object
    pybedbamfile = pybedtools.BedTool(options.bam_file)

    # count alignments
    E.info("counting total number of alignments and spliced alignments")
    total_alignments = 0
    spliced_alignments = 0

    for alignment in pybedbamfile:
        cigar = alignment[5]
        if cigar.find("N") != -1:  # N signifies split read
            total_alignments += 1
            spliced_alignments += 1
        else:
            total_alignments += 1

    # merge the gtf file to avoid double counting of exons in different
    # transcripts - converts to a bed file
    gtffile = pybedtools.BedTool(options.gtf_file).merge()

    E.info("computing coverage of aligments in %s over intervals in %s" %
           (options.bam_file, options.gtf_file))
    cover = pybedbamfile.coverage(gtffile)

    # make sure that the exons aren't being counted twice - shouldn't be
    # because of merge
    E.info("counting reads contributing to transcripts")
    c = 0
    for entry in cover:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    # sum the coverage across exons from all transcripts
    coverage_in_transcripts = c

    ######################################################
    ######################################################
    # for spliced alignments
    ######################################################
    ######################################################

    # count total number of spliced alignments
    # requires that the CIGAR string 'N' is present

    # uses pysam to write out a bam file of the spliced reads only
    allreads = pysam.Samfile(options.bam_file)
    spliced_bamname = IOTools.snip(options.bam_file,
                                   ".bam") + "_spliced_reads.bam"

    # open file for outputting spliced alignments
    splicedreads = pysam.Samfile(spliced_bamname, "wb", template=allreads)

    # cigar string in pysam for spliced alignment is (3, int)
    spliced = collections.defaultdict(list)
    for read in allreads:
        for cigar_tag in read.cigar:
            if cigar_tag[0] == 3:
                spliced[read].append(cigar_tag)

    # write out spliced alignments
    for read in list(spliced.keys()):
        splicedreads.write(read)
    splicedreads.close()
    allreads.close()

    # index splice reads bam file
    pysam.sort(spliced_bamname, P.snip(spliced_bamname, ".bam"))
    pysam.index(spliced_bamname)

    # read in the spliced reads as a BedTool object
    splicedbam = pybedtools.BedTool(spliced_bamname)

    # perform coverage of spliced reads over intervals - will be twice
    # as many as there should be due to counting both exons
    # overlapping
    spliced_coverage = splicedbam.coverage(gtffile)

    # avoid double counting exons
    E.info("counting spliced reads contributing to transcripts")
    spliced_exons = {}
    c = 0
    for entry in spliced_coverage:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    spliced_coverage_in_transcripts = c

    # NOTE: the counting of spliced alignments is not accurate

    spliced_coverage_in_transcripts = float(
        spliced_coverage_in_transcripts) / 2

    ###########################
    # write out the results
    ###########################

    outf.write(str(int(total_alignments)) + "\t")
    # remove half of the coverage assigned to spliced reads
    coverage_in_transcripts = (coverage_in_transcripts) - (
        spliced_coverage_in_transcripts)
    outf.write(
        str(
            int(coverage_in_transcripts) -
            int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((coverage_in_transcripts / total_alignments) * 100)) + "\t")

    # write out spliced counts
    outf.write(str(int(spliced_alignments)) + "\t")
    outf.write(str(int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((spliced_coverage_in_transcripts / spliced_alignments) * 100)))

    outf.close()

    ############################
    # contextualise those that
    # don't fall in transcripts
    ############################

    if options.reference_gtf:
        context_summary = IOTools.openFile(
            IOTools.snip(options.bam_file, ".bam") + ".excluded.context", "w")
        context_summary.write("\t".join(["Feature", "number"]) + "\n")

        # write out the read info as well
        context_file = IOTools.openFile(
            IOTools.snip(options.bam_file, ".bam") + ".excluded", "w")

        context_dict = collections.defaultdict(int)
        # intersect bam - write non-overlapping with transcripts - intersect
        # with reference - write out
        context = pybedbamfile.intersect(gtffile, v=True, bed=True).intersect(
            pybedtools.BedTool(options.reference_gtf), wb=True)
        for entry in context:
            feature = entry[8]
            context_dict[feature] += 1
            context_file.write("\t".join([e for e in entry]) + "\n")

        for feature, value in context_dict.items():
            context_summary.write("\t".join([feature, str(value)]) + "\n")

        context_file.close()
        context_summary.close()

    # write footer and output benchmark information.
    E.Stop()
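
The counting logic above hinges on the CIGAR string: an 'N' operation (op code 3 in pysam) marks a spliced alignment. Below is a minimal, self-contained sketch of just that counting step; the BAM path is hypothetical and the calls assume the current pysam API (AlignmentFile/cigartuples) rather than the older Samfile interface used above.

import pysam

def count_spliced(bam_path):
    """Count total and spliced (CIGAR 'N') alignments in a BAM file."""
    total = spliced = 0
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        for read in bam:
            if read.is_unmapped or read.cigartuples is None:
                continue
            total += 1
            # op code 3 == 'N': skipped region from the reference, i.e. a splice
            if any(op == 3 for op, length in read.cigartuples):
                spliced += 1
    return total, spliced

# hypothetical input file
# total, spliced = count_spliced("example.bam")
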
Example 2
def peekParameters(workingdir,
                   pipeline,
                   on_error_raise=None,
                   prefix=None,
                   update_interface=False,
                   restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in workingdir, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the parameters
       are added to the configuration dictionary of the calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    '''
    caller_locals = getCallerLocals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not isTest() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    # Attempt to locate directory with pipeline source code. This is a
    # patch as pipelines might be called within the repository
    # directory or from an installed location
    dirname = PARAMS["pipelinedir"]

    # called without a directory, use current directory
    if dirname == "":
        dirname = os.path.abspath(".")
    else:
        # if not exists, assume we want version located
        # in directory of calling script.
        if not os.path.exists(dirname):
            # directory is path of calling script
            dirname = os.path.dirname(caller_locals['__file__'])

    pipeline = os.path.join(dirname, pipeline)
    if not os.path.exists(pipeline):
        if on_error_raise:
            raise ValueError(
                "can't find pipeline at %s" % (pipeline))
        else:
            return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified
    # working dir is set to "?!"
    if "config" in sys.argv or "check" in sys.argv or "clone" in sys.argv and workingdir == "?!":
        workingdir = os.path.join(PARAMS.get("pipelinedir"),
                                  IOTools.snip(pipeline, ".py"))

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError(
                "can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "python %s -f -v 0 dump" % pipeline

    os.environ.update({'BASH_ENV': os.path.join(os.environ['HOME'],'.bashrc')})
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               env=os.environ.copy())

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") %
            (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()
    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]
    if len(stdout) > 1:
        raise ValueError("received multiple configurations")
    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                dump[key] = os.path.join(workingdir, value)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())])

    return dump
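
A minimal usage sketch for peekParameters follows; the working directory, pipeline name and prefix are hypothetical, and PARAMS stands for the calling pipeline's configuration dictionary as in the code above.

# Hypothetical call: import only the [interface] entries of an external
# annotations pipeline, with its file paths made absolute relative to the
# external working directory.
external_params = peekParameters(
    "/path/to/annotations",        # workingdir of the external pipeline
    "pipeline_annotations.py",     # pipeline script, resolved next to this one
    on_error_raise=False,          # return {} instead of raising if missing
    prefix="annotations_",         # namespace the imported keys
    update_interface=True,
    restrict_interface=True)

PARAMS.update(external_params)
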
Example 3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a",
        "--first-gtf-file",
        dest="gtf_a",
        type="string",
        help="supply a gtf file - will compress uncompressed files")
    parser.add_option(
        "-b",
        "--second-gtf-file",
        dest="gtf_b",
        type="string",
        help="supply a second gtf file - will compress uncompressed files")
    parser.add_option("-s",
                      "--scripts-dir",
                      dest="scripts_dir",
                      type="string",
                      help="supply a location for accessory scripts")
    parser.add_option("--no-venn",
                      dest="no_venn",
                      action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    gtf_files = [options.gtf_a, options.gtf_b]

    merged_files = []
    prefices = []
    E.info("merging gtf files")
    for gtf in gtf_files:
        if gtf.endswith(".gtf.gz"):
            outfile = IOTools.snip(gtf, ".gtf.gz") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf.gz"))
            merged_files.append(outfile)
            statement = '''zcat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            P.execute(statement)
        elif gtf.endswith(".gtf"):
            outfile = IOTools.snip(gtf, ".gtf") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf"))
            merged_files.append(outfile)
            statement = '''cat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip  > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            E.execute(statement)
        else:
            raise ValueError("cannot perform merge on %s: is not a gtf file" %
                             gtf)

    for prefix in prefices:
        if options.gtf_a.find(prefix) != -1:
            gtf_a = prefix + ".merged.gtf.gz"
            prefix_a = prefix
        elif options.gtf_b.find(prefix) != -1:
            gtf_b = prefix + ".merged.gtf.gz"
            prefix_b = prefix

    E.info("intersecting gtf files")
    # intersect the resulting merged files

    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join([prefix_a, prefix_b
                                    ]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
                 | python %(scriptsdir)s/gtf2gtf.py --method=merge-transcripts --log=log | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..." %
               (options.gtf_a, options.gtf_b))
        # produce the venn diagram
        intersection_file = intersection_out
        gtf_a_merged = gtf_a
        gtf_b_merged = gtf_b

        # create dictionary key
        gtf_pair = (gtf_a_merged, gtf_b_merged)

        # containers for counts
        count_gtf_merged_a = 0
        count_gtf_merged_b = 0
        count_intersection = 0

        # create GTF iterator objects
        gtf_iterator_a = GTF.iterator(IOTools.openFile(gtf_pair[0]))
        gtf_iterator_b = GTF.iterator(IOTools.openFile(gtf_pair[1]))
        gtf_iterator_intersection = GTF.iterator(
            IOTools.openFile(intersection_file))

        # do the counts for each file
        E.info("counting entries in %s" % gtf_a)
        for entry in gtf_iterator_a:
            count_gtf_merged_a += 1
        print("counts for gtf-a: ", count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        for entry in gtf_iterator_b:
            count_gtf_merged_b += 1
        print("counts for gtf-b: ", count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_file)
        for entry in gtf_iterator_intersection:
            count_intersection += 1
        print("counts for intersection: ", count_intersection)

        # This is the important bit: take an arbitrary list of identifiers to
        # represent the lincRNA in the refnoncoding set, then use the
        # intersection count to represent the overlapping section of the
        # lincRNA set, and add a set of random numbers to make up the
        # remaining, non-overlapping part.

        result = {}
        E.info("assembling count lists")
        result[gtf_pair] = {
            "gtf-b":
            list(map(str, range(count_gtf_merged_b))),
            "gtf-a":
            list(map(str, range(count_intersection))) + list(
                map(str, [
                    random.random()
                    for i in range(count_intersection, count_gtf_merged_a)
                ]))
        }

        R_source = os.path.join(os.path.abspath(options.scripts_dir),
                                "venn_diagram.R")
        R.source(R_source)

        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)
        E.info("drawing venn diagram to %s" %
               (prefix_a + "_vs_" + prefix_b + ".overlap.png"))

        R["venn.diagram2"](R.list(A=result[gtf_pair]["gtf-a"],
                                  B=result[gtf_pair]["gtf-b"]),
                           prefix_a + "_vs_" + prefix_b + ".overlap.png", **{
                               'cat.cex':
                               1.5,
                               'main.fontfamily':
                               "Arial",
                               'cat.pos':
                               FloatVector((0, 0)),
                               'cat.fontfamily':
                               "Arial",
                               'main.cex':
                               1.8,
                               'height':
                               1000,
                               'width':
                               1000,
                               'cex':
                               2,
                               'fontfamily':
                               "Arial",
                               'lwd':
                               R.c(1, 1),
                               'fill':
                               R.c(R.rgb(0, 0, 0.5, 0.5),
                                   R.rgb(0.5, 0, 0, 0.5)),
                               'category.names':
                               R.c(prefix_a, prefix_b),
                               'margin':
                               R.c(0.1, 0.1, 0.1, 0.1)
                           })

    # write footer and output benchmark information.
    E.Stop()
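
The count lists assembled above can look opaque: the venn drawing only needs two lists whose shared elements number count_intersection, so arbitrary identifiers are used for the rest. A small illustrative sketch of the same idea with made-up counts:

# Illustrative only: 100 features in gtf-b, 80 in gtf-a, 30 in the intersection.
count_b, count_a, count_shared = 100, 80, 30

shared = [str(i) for i in range(count_shared)]   # "0" .. "29"
set_b = [str(i) for i in range(count_b)]         # "0" .. "99", contains shared
set_a = shared + ["a_%d" % i for i in range(count_a - count_shared)]  # 30 shared + 50 unique

# the two lists overlap in exactly count_shared elements
assert len(set(set_a) & set(set_b)) == count_shared
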
Example 4
def loadCuffdiff(dbhandle, infile, outfile, min_fpkm=1.0):
    '''load results from cuffdiff analysis to database

    This functions parses and loads the results of a cuffdiff differential
    expression analysis.
    Parsing is performed by the parseCuffdiff function.

    Multiple tables will be created as cuffdiff outputs information
    on gene, isoform, tss, etc. levels.

    The method converts from ln(fold change) to log2 fold change.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    `min_fpkm`) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    infile : string
        Input filename, output from cuffdiff
    outfile : string
        Output filename in :term:`tsv` format.
    min_fpkm : float
        Minimum fpkm. Genes with an fpkm lower than this will
        be set to status `NOCALL`.

    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued...

    tmpname = P.getTempFilename(shared=True)

    # ignore promoters and splicing - no fold change column, but  sqrt(JS)
    for fn, level in (("cds_exp.diff.gz", "cds"),
                      ("gene_exp.diff.gz", "gene"),
                      ("isoform_exp.diff.gz", "isoform"),
                      # ("promoters.diff.gz", "promotor"),
                      # ("splicing.diff.gz", "splice"),
                      ("tss_group_exp.diff.gz", "tss")):

        tablename = prefix + "_" + level + "_diff"

        infile = os.path.join(indir, fn)

        results = parseCuffdiff(infile, min_fpkm=min_fpkm)
        Expression.writeExpressionResults(tmpname, results)
        P.load(tmpname, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=treatment_name "
               "--add-index=control_name "
               "--add-index=test_id")

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"
        infile = os.path.join(indir, fn)

        P.load(infile, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=tracking_id "
               "--add-index=control_name "
               "--add-index=test_id")

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    inf.readline()
    sample_lookup = {}

    for line in inf:
        line = line.split("\t")
        our_sample_name = IOTools.snip(line[0])
        our_sample_name = re.sub("-", "_", our_sample_name)
        cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
        sample_lookup[cuffdiff_sample_name] = our_sample_name

    inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "sample_fpkms"

        tmpf = P.getTempFilename(".")
        inf = IOTools.openFile(os.path.join(indir, fn)).readlines()
        outf = IOTools.openFile(tmpf, "w")

        samples = []
        genes = {}

        is_first = True
        for line in inf:

            if is_first:
                is_first = False
                continue

            line = line.split()
            gene_id = line[0]
            condition = line[1]
            replicate = line[2]
            fpkm = line[6]
            status = line[8]

            sample_id = condition + "_" + replicate

            if sample_id not in samples:
                samples.append(sample_id)

            # IMS: The following block keeps getting its indenting messed
            # up. It is not part of the 'if sample_id not in samples' block
            # please make sure it does not get made part of it
            if gene_id not in genes:
                genes[gene_id] = {}
                genes[gene_id][sample_id] = fpkm
            else:
                if sample_id in genes[gene_id]:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                else:
                    if status != "OK":
                        genes[gene_id][sample_id] = status
                    else:
                        genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if no CDS has been calculated
        # for the genes. In the long term we need to add CDS annotation
        # to de novo predicted genesets; in the meantime just skip if
        # the cds tracking file is empty

        if len(samples) == 0:
            continue

        headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples])
        outf.write(headers + "\n")

        for gene in genes.keys():
            outf.write(gene + "\t")
            s = 0
            while s < len(samples) - 1:
                outf.write(genes[gene][samples[s]] + "\t")
                s += 1

            # IMS: Please be careful with this line. It keeps getting moved
            # into the above while block where it does not belong
            outf.write(genes[gene][samples[len(samples) - 1]] + "\n")

        outf.close()

        P.load(tmpf,
               outfile,
               tablename=tablename,
               options="--allow-empty-file "
               " --add-index=gene_id")

        os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
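
The read_group_tracking loop above reshapes a long table (one row per gene and sample) into a wide gene-by-sample FPKM matrix. A compact sketch of the same reshaping with pandas, assuming the cuffdiff column layout used above (tracking_id, condition, replicate, FPKM, status); this is an illustration, not the pipeline's own code:

import pandas as pd

def reshape_read_group_tracking(path):
    """Pivot a cuffdiff read_group_tracking table to gene x sample FPKM values."""
    df = pd.read_csv(path, sep="\t")
    df["sample_id"] = df["condition"] + "_" + df["replicate"].astype(str)
    # keep the FPKM value, but fall back to the status string for failed tests,
    # mirroring the behaviour of the loop above
    df["value"] = df.apply(
        lambda row: row["FPKM"] if row["status"] == "OK" else row["status"],
        axis=1)
    # pivot raises if a (gene, sample) pair appears twice, analogous to the
    # ValueError raised above
    return df.pivot(index="tracking_id", columns="sample_id", values="value")
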
Example 5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-a", "--first-gtf-file", dest="gtf_a", type="string",
                      help="supply a gtf file - will compress uncompressed files")
    parser.add_option("-b", "--second-gtf-file", dest="gtf_b", type="string",
                      help="supply a second gtf file - will compress uncompressed files")
    parser.add_option("-s", "--scripts-dir", dest="scripts_dir", type="string",
                      help="supply a location for accessory scripts")
    parser.add_option("--no-venn", dest="no_venn", action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    gtf_files = [options.gtf_a, options.gtf_b]

    merged_files = []
    prefices = []
    E.info("merging gtf files")
    for gtf in gtf_files:
        if gtf.endswith(".gtf.gz"):
            outfile = IOTools.snip(gtf, ".gtf.gz") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf.gz"))
            merged_files.append(outfile)
            statement = '''zcat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            P.execute(statement)
        elif gtf.endswith(".gtf"):
            outfile = IOTools.snip(gtf, ".gtf") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf"))
            merged_files.append(outfile)
            statement = '''cat %s | python %s/gtf2gtf.py --method=merge-transcripts --log=%s.log | gzip  > %s''' % (
                gtf, options.scripts_dir, outfile, outfile)
            E.execute(statement)
        else:
            raise ValueError(
                "cannot perform merge on %s: is not a gtf file" % gtf)

    for prefix in prefices:
        if options.gtf_a.find(prefix) != -1:
            gtf_a = prefix + ".merged.gtf.gz"
            prefix_a = prefix
        elif options.gtf_b.find(prefix) != -1:
            gtf_b = prefix + ".merged.gtf.gz"
            prefix_b = prefix

    E.info("intersecting gtf files")
    # intersect the resulting merged files

    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join(
        [prefix_a, prefix_b]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
                 | python %(scriptsdir)s/gtf2gtf.py --method=merge-transcripts --log=log | gzip > %(intersection_out)s'''
    P.run()

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..." %
               (options.gtf_a, options.gtf_b))
        # produce the venn diagram
        intersection_file = intersection_out
        gtf_a_merged = gtf_a
        gtf_b_merged = gtf_b

        # create dictionary key
        gtf_pair = (gtf_a_merged, gtf_b_merged)

        # containers for counts
        count_gtf_merged_a = 0
        count_gtf_merged_b = 0
        count_intersection = 0

        # create GTF iterator objects
        gtf_iterator_a = GTF.iterator(IOTools.openFile(gtf_pair[0]))
        gtf_iterator_b = GTF.iterator(IOTools.openFile(gtf_pair[1]))
        gtf_iterator_intersection = GTF.iterator(
            IOTools.openFile(intersection_file))

        # do the counts for each file
        E.info("counting entries in %s" % gtf_a)
        for entry in gtf_iterator_a:
            count_gtf_merged_a += 1
        print("counts for gtf-a: ", count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        for entry in gtf_iterator_b:
            count_gtf_merged_b += 1
        print("counts for gtf-b: ", count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_file)
        for entry in gtf_iterator_intersection:
            count_intersection += 1
        print("counts for intersection: ", count_intersection)

        # This is the important bit: take an arbitrary list of identifiers to
        # represent the lincRNA in the refnoncoding set, then use the
        # intersection count to represent the overlapping section of the
        # lincRNA set, and add a set of random numbers to make up the
        # remaining, non-overlapping part.

        result = {}
        E.info("assembling count lists")
        result[gtf_pair] = {"gtf-b": list(map(str, range(count_gtf_merged_b))), "gtf-a": list(map(str, range(count_intersection))) +
                            list(map(str, [random.random() for i in range(count_intersection, count_gtf_merged_a)]))}

        R_source = os.path.join(
            os.path.abspath(options.scripts_dir), "venn_diagram.R")
        R.source(R_source)

        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)
        E.info("drawing venn diagram to %s" %
               (prefix_a + "_vs_" + prefix_b + ".overlap.png"))

        R["venn.diagram2"](R.list(A=result[gtf_pair]["gtf-a"], B=result[gtf_pair]["gtf-b"]), prefix_a + "_vs_" + prefix_b + ".overlap.png", **{'cat.cex': 1.5, 'main.fontfamily': "Arial", 'cat.pos': FloatVector((0, 0)), 'cat.fontfamily': "Arial", 'main.cex': 1.8, 'height': 1000, 'width': 1000, 'cex': 2, 'fontfamily': "Arial", 'lwd': R.c(1, 1), 'fill': R.c(R.rgb(0, 0, 0.5, 0.5), R.rgb(0.5, 0, 0, 0.5)), 'category.names': R.c(prefix_a, prefix_b), 'margin': R.c(0.1, 0.1, 0.1, 0.1)
                                                                                                                                               })

    # write footer and output benchmark information.
    E.Stop()
Example 6
def runDE(design_file,
          counts_file,
          outfile,
          outdir,
          method="deseq",
          spike_file=None):
    '''run DESeq, DESeq2 or EdgeR through :mod:`scripts/runExpression.py`

    The job is split into smaller sections. The order of the input
    data is randomized in order to avoid any biases due to chromosomes
    and break up local correlations.

    At the end, a q-value is computed from all results.

    Arguments
    ---------
    design_file : string
        Filename with experimental design
    counts_file : string
        :term:`tsv` formatted file with counts per window
    outfile : string
       Output filename in :term:`tsv` format.
    outdir : string
       Directory for additional output files.
    method : string
       Method to use. See :mod:`scripts/runExpression.py`.
    spike_file : string
       Filename with spike-in data to add before processing.
    '''

    if spike_file is None:
        statement = "zcat %(counts_file)s"
    else:
        statement = '''python %(scriptsdir)s/combine_tables.py
        --missing-value=0
        --cat=filename
        --log=%(outfile)s.log
        %(counts_file)s %(spike_file)s
        | python %(scriptsdir)s/csv_cut.py
        --remove filename
        --log=%(outfile)s.log
        '''

    prefix = IOTools.snip(os.path.basename(outfile))
    E.info(prefix)

    # the post-processing strips away the warning,
    # renames the qvalue column to old_qvalue
    # and adds a new qvalue column after recomputing
    # over all windows.
    statement += '''
    | perl %(scriptsdir)s/randomize_lines.pl -h
    | %(cmd-farm)s
    --input-header
    --output-header
    --split-at-lines=200000
    --cluster-options="-l mem_free=16G"
    --log=%(outfile)s.log
    --output-filename-pattern=%(outdir)s/%%s
    --subdirs
    --output-regex-header="^test_id"
    "python %(scriptsdir)s/runExpression.py
              --method=%(method)s
              --tags-tsv-file=-
              --design-tsv-file=%(design_file)s
              --output-filename-pattern=%%DIR%%%(prefix)s_
              --deseq-fit-type=%(deseq_fit_type)s
              --deseq-dispersion-method=%(deseq_dispersion_method)s
              --deseq-sharing-mode=%(deseq_sharing_mode)s
              --edger-dispersion=%(edger_dispersion)f
              --deseq2-design-formula=%(deseq2_model)s
              --deseq2-contrasts=%(deseq2_contrasts)s
              --filter-min-counts-per-row=%(tags_filter_min_counts_per_row)i
              --filter-min-counts-per-sample=%(tags_filter_min_counts_per_sample)i
              --filter-percentile-rowsums=%(tags_filter_percentile_rowsums)i
              --log=%(outfile)s.log
              --fdr=%(edger_fdr)f
              --deseq2-plot=0"
    | perl -p -e "s/qvalue/old_qvalue/"
    | python %(scriptsdir)s/table2table.py
    --log=%(outfile)s.log
    --method=fdr
    --column=pvalue
    --fdr-method=BH
    --fdr-add-column=qvalue
    | gzip
    > %(outfile)s '''
    E.info(statement)

    P.run()
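
The %(...)s placeholders in the statement are not expanded by runDE itself; in the CGAT pipelines P.run() interpolates them from the calling scope and the global parameter dictionary. A simplified sketch of that substitution mechanism (hypothetical values, not the actual implementation):

# Simplified illustration of placeholder interpolation as used above.
PARAMS = {"edger_fdr": 0.05, "scriptsdir": "/path/to/scripts"}  # hypothetical

def render_statement(statement, local_vars, params):
    """Fill %(key)s placeholders from local variables first, then params."""
    substitutions = dict(params)
    substitutions.update(local_vars)
    return statement % substitutions

outfile = "deseq.dir/results.tsv.gz"
print(render_statement("--fdr=%(edger_fdr)f --log=%(outfile)s.log",
                       {"outfile": outfile}, PARAMS))
# --fdr=0.050000 --log=deseq.dir/results.tsv.gz.log
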
Example 7
def runDE(design_file,
          counts_file,
          outfile,
          outdir,
          method="deseq",
          spike_file=None):
    '''run DESeq, DESeq2 or EdgeR through :mod:`scripts/runExpression.py`

    The job is split into smaller sections. The order of the input
    data is randomized in order to avoid any biases due to chromosomes
    and break up local correlations.

    At the end, a q-value is computed from all results.

    Arguments
    ---------
    design_file : string
        Filename with experimental design
    counts_file : string
        :term:`tsv` formatted file with counts per window
    outfile : string
       Output filename in :term:`tsv` format.
    outdir : string
       Directory for additional output files.
    method : string
       Method to use. See :mod:`scripts/runExpression.py`.
    spike_file : string
       Filename with spike-in data to add before processing.
    '''

    if spike_file is None:
        statement = "zcat %(counts_file)s"
    else:
        statement = '''python %(scriptsdir)s/combine_tables.py
        --missing-value=0
        --cat=filename
        --log=%(outfile)s.log
        %(counts_file)s %(spike_file)s
        | python %(scriptsdir)s/csv_cut.py
        --remove filename
        --log=%(outfile)s.log
        '''

    prefix = IOTools.snip(os.path.basename(outfile))
    E.info(prefix)

    # the post-processing strips away the warning,
    # renames the qvalue column to old_qvalue
    # and adds a new qvalue column after recomputing
    # over all windows.
    statement += '''
    | perl %(scriptsdir)s/randomize_lines.pl -h
    | %(cmd-farm)s
    --input-header
    --output-header
    --split-at-lines=200000
    --cluster-options="-l mem_free=16G"
    --log=%(outfile)s.log
    --output-filename-pattern=%(outdir)s/%%s
    --subdirs
    --output-regex-header="^test_id"
    "python %(scriptsdir)s/runExpression.py
              --method=%(method)s
              --tags-tsv-file=-
              --design-tsv-file=%(design_file)s
              --output-filename-pattern=%%DIR%%%(prefix)s_
              --deseq-fit-type=%(deseq_fit_type)s
              --deseq-dispersion-method=%(deseq_dispersion_method)s
              --deseq-sharing-mode=%(deseq_sharing_mode)s
              --edger-dispersion=%(edger_dispersion)f
              --deseq2-design-formula=%(deseq2_model)s
              --deseq2-contrasts=%(deseq2_contrasts)s
              --filter-min-counts-per-row=%(tags_filter_min_counts_per_row)i
              --filter-min-counts-per-sample=%(tags_filter_min_counts_per_sample)i
              --filter-percentile-rowsums=%(tags_filter_percentile_rowsums)i
              --log=%(outfile)s.log
              --fdr=%(edger_fdr)f
              --deseq2-plot=0"
    | perl -p -e "s/qvalue/old_qvalue/"
    | python %(scriptsdir)s/table2table.py
    --log=%(outfile)s.log
    --method=fdr
    --column=pvalue
    --fdr-method=BH
    --fdr-add-column=qvalue
    | gzip
    > %(outfile)s '''
    E.info(statement)

    P.run()
Example 8
def peekParameters(workingdir,
                   pipeline,
                   on_error_raise=None,
                   prefix=None,
                   update_interface=False,
                   restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in workingdir, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the parameters
       are added to the configuration dictionary of the calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    '''
    caller_locals = getCallerLocals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not isTest() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    # Attempt to locate directory with pipeline source code. This is a
    # patch as pipelines might be called within the repository
    # directory or from an installed location
    dirname = PARAMS["pipelinedir"]

    # called without a directory, use current directory
    if dirname == "":
        dirname = os.path.abspath(".")
    else:
        # if not exists, assume we want version located
        # in directory of calling script.
        if not os.path.exists(dirname):
            # directory is path of calling script
            dirname = os.path.dirname(caller_locals['__file__'])

    pipeline = os.path.join(dirname, pipeline)
    if not os.path.exists(pipeline):
        if on_error_raise:
            raise ValueError("can't find pipeline at %s" % (pipeline))
        else:
            return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified
    # working dir is set to "?!"
    if "config" in sys.argv or "check" in sys.argv or "clone" in sys.argv and workingdir == "?!":
        workingdir = os.path.join(PARAMS.get("pipelinedir"),
                                  IOTools.snip(pipeline, ".py"))

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError("can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "python %s -f -v 0 dump" % pipeline
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") % (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()
    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]
    if len(stdout) > 1:
        raise ValueError("received multiple configurations")
    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                dump[key] = os.path.join(workingdir, value)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())])

    return dump
Example 9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bam-file", dest="bam_file", type="string",
                      help="supply input bam file name")

    parser.add_option("-g", "--gtf-file", dest="gtf_file", type="string",
                      help="supply input gtf file name")

    parser.add_option("-o", "--outfile", dest="outfile", type="string",
                      help="supply output file name")

    parser.add_option(
        "-G", "--reference-gtf-file", dest="reference_gtf", type="string",
        help="supply reference gtf for context of reads not contributing to transcripts")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ######################################################
    ######################################################
    # for all alignments
    ######################################################
    ######################################################

    # open outfile and prepare headers
    outf = open(options.outfile, "w")
    outf.write("\t".join(["total alignments", "aligments in transcripts", "percent alignments in transcripts",
                          "total spliced alignments", "spliced alignments in transcripts", "percent spliced alignments in transcripts"]) + "\n")

    # calculate coverage over transcript file - NB split reads contribute twice to the transcript
    # use BedTool object
    pybedbamfile = pybedtools.BedTool(options.bam_file)

    # count alignments
    E.info("counting total number of alignments and spliced alignments")
    total_alignments = 0
    spliced_alignments = 0

    for alignment in pybedbamfile:
        cigar = alignment[5]
        if cigar.find("N") != -1:  # N signifies split read
            total_alignments += 1
            spliced_alignments += 1
        else:
            total_alignments += 1

    # merge the gtf file to avoid double counting of exons in different
    # transcripts - converts to a bed file
    gtffile = pybedtools.BedTool(options.gtf_file).merge()

    E.info("computing coverage of aligments in %s over intervals in %s" %
           (options.bam_file, options.gtf_file))
    cover = pybedbamfile.coverage(gtffile)

    # make sure that the exons aren't being counted twice - shouldn't be
    # because of merge
    E.info("counting reads contributing to transcripts")
    c = 0
    for entry in cover:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    # sum the coverage across exons from all transcripts
    coverage_in_transcripts = c

    ######################################################
    ######################################################
    # for spliced alignments
    ######################################################
    ######################################################

    # count total number of spliced alignments
    # requires that the CIGAR string 'N' is present

    # uses pysam to write out a bam file of the spliced reads only
    allreads = pysam.Samfile(options.bam_file)
    spliced_bamname = IOTools.snip(options.bam_file, ".bam") + "_spliced_reads.bam"

    # open file for outputting spliced alignments
    splicedreads = pysam.Samfile(spliced_bamname, "wb", template=allreads)

    # cigar string in pysam for spliced alignment is (3, int)
    spliced = collections.defaultdict(list)
    for read in allreads:
        for cigar_tag in read.cigar:
            if cigar_tag[0] == 3:
                spliced[read].append(cigar_tag)

    # write out spliced alignments
    for read in spliced.keys():
        splicedreads.write(read)
    splicedreads.close()
    allreads.close()

    # index splice reads bam file
    pysam.sort(spliced_bamname, P.snip(spliced_bamname, ".bam"))
    pysam.index(spliced_bamname)

    # read in the spliced reads as a BedTool object
    splicedbam = pybedtools.BedTool(spliced_bamname)

    # perform coverage of spliced reads over intervals - will be twice
    # as many as there should be due to counting both exons
    # overlapping
    spliced_coverage = splicedbam.coverage(gtffile)

    # avoid double counting exons
    E.info("counting spliced reads contributing to transcripts")
    spliced_exons = {}
    c = 0
    for entry in spliced_coverage:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    spliced_coverage_in_transcripts = c

    # NOTE: the counting of spliced alignments is not accurate

    spliced_coverage_in_transcripts = float(
        spliced_coverage_in_transcripts) / 2

    ###########################
    # write out the results
    ###########################

    outf.write(str(int(total_alignments)) + "\t")
    # remove half of the coverage assigned to spliced reads
    coverage_in_transcripts = (
        coverage_in_transcripts) - (spliced_coverage_in_transcripts)
    outf.write(
        str(int(coverage_in_transcripts) - int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((coverage_in_transcripts / total_alignments) * 100)) + "\t")

    # write out spliced counts
    outf.write(str(int(spliced_alignments)) + "\t")
    outf.write(str(int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(
        str(int((spliced_coverage_in_transcripts / spliced_alignments) * 100)))

    outf.close()

    ############################
    # contextualise those that
    # don't fall in transcripts
    ############################

    if options.reference_gtf:
        context_summary = open(
            IOTools.snip(options.bam_file, ".bam") + ".excluded.context", "w")
        context_summary.write("\t".join(["Feature", "number"]) + "\n")

        # write out the read info as well
        context_file = open(
            IOTools.snip(options.bam_file, ".bam") + ".excluded", "w")

        context_dict = collections.defaultdict(int)
        # intersect bam - write non-overlapping with transcripts - intersect
        # with reference - write out
        context = pybedbamfile.intersect(gtffile, v=True, bed=True).intersect(
            pybedtools.BedTool(options.reference_gtf), wb=True)
        for entry in context:
            feature = entry[8]
            context_dict[feature] += 1
            context_file.write("\t".join([e for e in entry]) + "\n")

        for feature, value in context_dict.iteritems():
            context_summary.write("\t".join([feature, str(value)]) + "\n")

        context_file.close()
        context_summary.close()

    # write footer and output benchmark information.
    E.Stop()
Example 10
def loadCuffdiff(dbhandle, infile, outfile, min_fpkm=1.0):
    '''load results from cuffdiff analysis to database

    This functions parses and loads the results of a cuffdiff differential
    expression analysis.
    Parsing is performed by the parseCuffdiff function.

    Multiple tables will be created as cuffdiff outputs information
    on gene, isoform, tss, etc. levels.

    The method converts from ln(fold change) to log2 fold change.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    `min_fpkm`) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    Arguments
    ---------
    dbhandle : object
        Database handle.
    infile : string
        Input filename, output from cuffdiff
    outfile : string
        Output filename in :term:`tsv` format.
    min_fpkm : float
        Minimum fpkm. Genes with an fpkm lower than this will
        be set to status `NOCALL`.

    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued...

    tmpname = P.getTempFilename(shared=True)

    # ignore promoters and splicing - no fold change column, but  sqrt(JS)
    for fn, level in (("cds_exp.diff.gz", "cds"),
                      ("gene_exp.diff.gz", "gene"),
                      ("isoform_exp.diff.gz", "isoform"),
                      # ("promoters.diff.gz", "promotor"),
                      # ("splicing.diff.gz", "splice"),
                      ("tss_group_exp.diff.gz", "tss")):

        tablename = prefix + "_" + level + "_diff"

        infile = os.path.join(indir, fn)

        results = parseCuffdiff(infile, min_fpkm=min_fpkm)
        Expression.writeExpressionResults(tmpname, results)
        P.load(tmpname, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=treatment_name "
               "--add-index=control_name "
               "--add-index=test_id")

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"
        infile = os.path.join(indir, fn)

        P.load(infile, outfile,
               tablename=tablename,
               options="--allow-empty-file "
               "--add-index=tracking_id "
               "--add-index=control_name "
               "--add-index=test_id")

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    inf.readline()
    sample_lookup = {}

    for line in inf:
        line = line.split("\t")
        our_sample_name = IOTools.snip(line[0])
        our_sample_name = re.sub("-", "_", our_sample_name)
        cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
        sample_lookup[cuffdiff_sample_name] = our_sample_name

    inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "sample_fpkms"

        tmpf = P.getTempFilename(".")
        inf = IOTools.openFile(os.path.join(indir, fn)).readlines()
        outf = IOTools.openFile(tmpf, "w")

        samples = []
        genes = {}

        is_first = True
        for line in inf:

            if is_first:
                is_first = False
                continue

            line = line.split()
            gene_id = line[0]
            condition = line[1]
            replicate = line[2]
            fpkm = line[6]
            status = line[8]

            sample_id = condition + "_" + replicate

            if sample_id not in samples:
                samples.append(sample_id)

            # IMS: The following block keeps getting its indenting messed
            # up. It is not part of the 'if sample_id not in samples' block
            # please make sure it does not get made part of it
            if gene_id not in genes:
                genes[gene_id] = {}
                genes[gene_id][sample_id] = fpkm
            else:
                if sample_id in genes[gene_id]:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                else:
                    if status != "OK":
                        genes[gene_id][sample_id] = status
                    else:
                        genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if no CDS has been calculated
        # for the genes. In the long term we need to add CDS annotation
        # to de novo predicted genesets; in the meantime just skip if
        # the cds tracking file is empty

        if len(samples) == 0:
            continue

        headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples])
        outf.write(headers + "\n")

        for gene in genes.keys():
            outf.write(gene + "\t")
            s = 0
            while s < len(samples) - 1:
                outf.write(genes[gene][samples[s]] + "\t")
                s += 1

            # IMS: Please be careful with this line. It keeps getting moved
            # into the above while block where it does not belong
            outf.write(genes[gene][samples[len(samples) - 1]] + "\n")

        outf.close()

        P.load(tmpf,
               outfile,
               tablename=tablename,
               options="--allow-empty-file "
               " --add-index=gene_id")

        os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
Example 11
def loadCuffdiff(infile, outfile, min_fpkm=1.0):
    '''load results from differential expression analysis and produce
    summary plots.

    Note: converts from ln(fold change) to log2 fold change.

    The cuffdiff output is parsed.

    Pairwise comparisons in which one gene is not expressed (fpkm <
    fpkm_silent) are set to status 'NOCALL'. These transcripts might
    nevertheless be significant.

    This requires the cummeRbund library to be present in R.

    '''

    prefix = P.toTable(outfile)
    indir = infile + ".dir"

    if not os.path.exists(indir):
        P.touch(outfile)
        return

    # E.info( "building cummeRbund database" )
    # R('''library(cummeRbund)''')
    # cuff = R('''readCufflinks(dir = %(indir)s, dbfile=%(indir)s/csvdb)''' )
    # to be continued

    dbhandle = sqlite3.connect(PARAMS["database"])

    tmpname = P.getTempFilename(".")

    # ignore promoters and splicing - no fold change column, but  sqrt(JS)
    for fn, level in (("cds_exp.diff.gz", "cds"),
                      ("gene_exp.diff.gz", "gene"),
                      ("isoform_exp.diff.gz", "isoform"),
                      # ("promoters.diff.gz", "promotor"),
                      # ("splicing.diff.gz", "splice"),
                      ("tss_group_exp.diff.gz", "tss")):

        tablename = prefix + "_" + level + "_diff"

        infile = os.path.join(indir, fn)
        results = parseCuffdiff(infile,
                                min_fpkm=min_fpkm)

        Expression.writeExpressionResults(tmpname, results)

        statement = '''cat %(tmpname)s
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --allow-empty-file
              --add-index=treatment_name
              --add-index=control_name
              --add-index=test_id
              --table=%(tablename)s
         >> %(outfile)s.log
         '''

        P.run()

    for fn, level in (("cds.fpkm_tracking.gz", "cds"),
                      ("genes.fpkm_tracking.gz", "gene"),
                      ("isoforms.fpkm_tracking.gz", "isoform"),
                      ("tss_groups.fpkm_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "_levels"

        statement = '''zcat %(indir)s/%(fn)s
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --allow-empty-file
              --add-index=tracking_id
              --table=%(tablename)s
         >> %(outfile)s.log
         '''

        P.run()

    # Jethro - load tables of sample specific cuffdiff fpkm values into csvdb
    # IMS: First read in lookup table for CuffDiff/Pipeline sample name
    # conversion
    inf = IOTools.openFile(os.path.join(indir, "read_groups.info.gz"))
    inf.readline()
    sample_lookup = {}

    for line in inf:
        line = line.split("\t")
        our_sample_name = IOTools.snip(line[0])
        our_sample_name = re.sub("-", "_", our_sample_name)
        cuffdiff_sample_name = "%s_%s" % (line[1], line[2])
        sample_lookup[cuffdiff_sample_name] = our_sample_name

    inf.close()

    for fn, level in (("cds.read_group_tracking.gz", "cds"),
                      ("genes.read_group_tracking.gz", "gene"),
                      ("isoforms.read_group_tracking.gz", "isoform"),
                      ("tss_groups.read_group_tracking.gz", "tss")):

        tablename = prefix + "_" + level + "sample_fpkms"

        tmpf = P.getTempFilename(".")
        inf = IOTools.openFile(os.path.join(indir, fn)).readlines()
        outf = IOTools.openFile(tmpf, "w")

        samples = []
        genes = {}

        x = 0
        for line in inf:
            if x == 0:
                x += 1
                continue
            line = line.split()
            gene_id = line[0]
            condition = line[1]
            replicate = line[2]
            fpkm = line[6]
            status = line[8]

            sample_id = condition + "_" + replicate

            if sample_id not in samples:
                samples.append(sample_id)

            # IMS: The following block keeps getting its indenting messed
            # up. It is not part of the 'if sample_id not in samples' block
            # please make sure it does not get made part of it
            if gene_id not in genes:
                genes[gene_id] = {}
                genes[gene_id][sample_id] = fpkm
            else:
                if sample_id in genes[gene_id]:
                    raise ValueError(
                        'sample_id %s appears twice in file for gene_id %s'
                        % (sample_id, gene_id))
                else:
                    if status != "OK":
                        genes[gene_id][sample_id] = status
                    else:
                        genes[gene_id][sample_id] = fpkm

        samples = sorted(samples)

        # IMS - CDS files might be empty if no CDS has been calculated
        # for the genes. In the long term we need to add CDS annotation
        # to de novo predicted genesets; in the meantime just skip if
        # the cds tracking file is empty

        if len(samples) == 0:
            continue

        headers = "gene_id\t" + "\t".join([sample_lookup[x] for x in samples])
        outf.write(headers + "\n")

        for gene in genes.iterkeys():
            outf.write(gene + "\t")
            x = 0
            while x < len(samples) - 1:
                outf.write(genes[gene][samples[x]] + "\t")
                x += 1

            # IMS: Please be careful with this line. It keeps getting moved
            # into the above while block where it does not belong
            outf.write(genes[gene][samples[len(samples) - 1]] + "\n")

        outf.close()

        statement = ("cat %(tmpf)s |"
                     " python %(scriptsdir)s/csv2db.py "
                     "  %(csv2db_options)s"
                     "  --allow-empty-file"
                     "  --add-index=gene_id"
                     "  --table=%(tablename)s"
                     " >> %(outfile)s.log")
        P.run()

        os.unlink(tmpf)

    # build convenience table with tracks
    tablename = prefix + "_isoform_levels"
    tracks = Database.getColumnNames(dbhandle, tablename)
    tracks = [x[:-len("_FPKM")] for x in tracks if x.endswith("_FPKM")]

    tmpfile = P.getTempFile(dir=".")
    tmpfile.write("track\n")
    tmpfile.write("\n".join(tracks) + "\n")
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)