def split_gtf_by_category(infiles, outfiles, catname):
    '''split a GTF file into one output GTF per category.

    *catfile* maps transcript or gene identifiers to a category name;
    GTF lines whose transcript_id or gene_id is not in the mapping are
    skipped.
    '''

    catfile, gtffile = infiles

    # series mapping transcript/gene id -> category
    categories = pd.read_csv(catfile, index_col=0, squeeze=True, sep="\t")

    # create output filepool
    outpool = IOTools.FilePool("{}_%s.gtf.gz".format(catname), force=True)

    gtffile = IOTools.openFile(gtffile)

    for gtfline in GTF.iterator(gtffile):

        try:
            transcript_id = gtfline.transcript_id
        except AttributeError:
            transcript_id = None

        try:
            gene_id = gtfline.gene_id
        except AttributeError:
            gene_id = None

        if transcript_id in categories.index:
            outpool.write(categories[transcript_id], str(gtfline) + "\n")
        elif gene_id in categories.index:
            outpool.write(categories[gene_id], str(gtfline) + "\n")

    outpool.close()
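
Most examples on this page follow the same IOTools.FilePool pattern: build the pool with a filename pattern containing a %s placeholder (or pass complete filenames to a default pool), route each record with write(key, line), and call close() once at the end. A minimal sketch of that pattern follows; the import path and the toy records are assumptions for illustration, not part of the example above.

# Minimal FilePool sketch; the import path varies between CGAT releases
# and the records below are invented for illustration.
import CGAT.IOTools as IOTools

records = [("promoter", "chr1\t100\t200\n"),
           ("enhancer", "chr1\t500\t600\n"),
           ("promoter", "chr2\t300\t400\n")]

pool = IOTools.FilePool("regions_%s.bed")   # %s is replaced by the write() key
for category, line in records:
    pool.write(category, line)              # appends to regions_<category>.bed
pool.close()
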
Example #2
def chunk_iterator_column(infile, args, prefix, use_header=False):
    """split at column.

    The table need not be sorted by this column.
    If num_files is given, files will randomly created
    and tags according to column randomly assigned.


    """

    column, max_files = args
    files = IOTools.FilePool()
    header = False

    if max_files:
        map_tag2file = {}

    for line in infile:
        if line[0] == "#":
            continue

        if not header and use_header:
            files.setHeader(line)
            header = True
            continue

        key = line[:-1].split("\t")[column]
        if max_files:
            if key in map_tag2file:
                key = map_tag2file[key]
            else:
                n = "%010i" % (len(map_tag2file) % max_files)
                map_tag2file[key] = n
                key = n

        files.write("%s/%s.in" % (prefix, key), line)

    for filename, count in list(files.items()):
        E.info("created file %s with %i items" % (filename, count))
        yield filename
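
chunk_iterator_column() yields the name of each chunk file after the whole table has been distributed. Below is a hedged sketch of how a caller might drive it; the input file, the column index and the prefix directory are invented, and the prefix directory is assumed to exist already.

# Hypothetical driver for chunk_iterator_column(); file name, column
# index and prefix are invented, and ./chunks is assumed to exist.
import CGAT.IOTools as IOTools

infile = IOTools.openFile("data.tsv.gz")

# split on column 2 (0-based), using at most 10 chunk files
for chunk in chunk_iterator_column(infile, args=(2, 10),
                                   prefix="./chunks",
                                   use_header=True):
    print("chunk written: %s" % chunk)   # e.g. ./chunks/0000000003.in
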
Example #3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--min-overlap",
                      dest="min_overlap",
                      type="int",
                      help="minimum overlap")

    parser.add_option("-w",
                      "--pattern-window",
                      dest="pattern_window",
                      type="string",
                      help="regular expression to extract window "
                      "coordinates from test id [%default]")

    parser.add_option("-i",
                      "--invert",
                      dest="invert",
                      action="store_true",
                      help="invert direction of fold change [%default]")

    parser.set_defaults(min_overlap=10,
                        invert=False,
                        pattern_window=r"(\S+):(\d+)-(\d+)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    outfiles = IOTools.FilePool(options.output_filename_pattern)

    if options.invert:
        test_f = lambda l2fold: l2fold < 0
    else:
        test_f = lambda l2fold: l2fold > 0

    def read():

        rx_window = re.compile(options.pattern_window)
        # filter out any DESeq/EdgeR messages that end up at the top
        # of the output file

        for data in IOTools.iterate(options.stdin):

            contig, start, end = rx_window.match(data.test_id).groups()
            start, end = map(int, (start, end))

            yield DATA._make(
                (contig, start, end,
                 data.treatment_name,
                 float(data.treatment_mean),
                 float(data.treatment_std),
                 data.control_name,
                 float(data.control_mean),
                 float(data.control_std),
                 float(data.pvalue),
                 float(data.qvalue),
                 float(data.l2fold),
                 float(data.fold),
                 int(data.significant),
                 data.status, 0))

    def grouper(data, distance=10):

        last = next(data)
        entries = [last]

        while 1:
            # an exhausted iterator raises StopIteration rather than
            # returning None, so catch it to yield the final group
            try:
                d = next(data)
            except StopIteration:
                break
            if d.contig == last.contig and d.start < last.start:
                raise ValueError("error not sorted by start")

            if ((d.contig != last.contig) or (d.start - last.end > distance)
                    or (d.status != last.status)
                    or (d.significant != last.significant)
                    or (d.l2fold * last.l2fold < 0)):
                yield entries
                entries = []

            entries.append(d)
            last = d

        yield entries

    counter = E.Counter()

    options.stdout.write("\t".join(DATA._fields) + "\n")

    # set of all sample names - used to create empty files
    samples = set()

    # need to sort by coordinate
    all_data = list(read())
    all_data.sort(key=lambda x: (x.contig, x.start))

    for group in grouper(iter(all_data), distance=options.min_overlap):

        start, end = group[0].start, group[-1].end
        assert start < end, 'start >= end: %s' % str(group)
        n = float(len(group))
        counter.input += n

        g = group[0]

        if g.l2fold < 0:
            l2fold = max([x.l2fold for x in group])
            fold = max([x.fold for x in group])
        else:
            l2fold = min([x.l2fold for x in group])
            fold = min([x.fold for x in group])

        outdata = DATA._make(
            (g.contig, start, end,
             g.treatment_name,
             sum([x.treatment_mean for x in group]) / n,
             max([x.treatment_std for x in group]),
             g.control_name,
             sum([x.control_mean for x in group]) / n,
             max([x.control_std for x in group]),
             max([x.pvalue for x in group]),
             max([x.qvalue for x in group]),
             l2fold, fold, g.significant, g.status, int(n)))

        samples.add(g.treatment_name)
        samples.add(g.control_name)
        if g.significant:
            if test_f(g.l2fold):
                # treatment lower methylation than control
                outfiles.write(
                    g.treatment_name, "%s\t%i\t%i\t%s\t%f\n" %
                    (g.contig, g.start, g.end, g.treatment_name,
                     sum([x.treatment_mean for x in group]) / n))

            else:
                outfiles.write(
                    g.control_name, "%s\t%i\t%i\t%s\t%f\n" %
                    (g.contig, g.start, g.end, g.control_name,
                     sum([x.control_mean for x in group]) / n))

        options.stdout.write("\t".join(map(str, outdata)) + "\n")

        counter.output += 1

    for sample in samples:
        outfiles.write(sample, "")

    outfiles.close()
    E.info("%s" % counter)

    # write footer and output benchmark information.
    E.Stop()
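
The core of the script above is the grouper() generator: after sorting by contig and start, consecutive windows are merged as long as they stay on the same contig, start within min_overlap bases of the previous window's end, and agree in status, significance and direction of fold change. The following is a toy, self-contained illustration of that grouping rule; the Window tuple and the coordinates are invented (the real script works on the module-level DATA namedtuple).

# Toy illustration of the interval-grouping rule used above; the
# Window namedtuple and the coordinates are invented for this sketch.
import collections

Window = collections.namedtuple("Window", "contig start end l2fold significant status")

windows = [Window("chr1", 100, 200,  1.2, 1, "OK"),
           Window("chr1", 205, 300,  0.8, 1, "OK"),   # within 10 bp -> same group
           Window("chr1", 500, 600, -0.5, 1, "OK"),   # sign flips   -> new group
           Window("chr2", 100, 200, -0.4, 1, "OK")]   # new contig   -> new group

groups, current = [], [windows[0]]
for last, d in zip(windows, windows[1:]):
    if (d.contig != last.contig or d.start - last.end > 10 or
            d.status != last.status or d.significant != last.significant or
            d.l2fold * last.l2fold < 0):
        groups.append(current)
        current = []
    current.append(d)
groups.append(current)
print([len(g) for g in groups])   # [2, 1, 1]
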
def buildExpressionTracks(infile, outfiles, map_exp2columns, suffix):
    '''build expression tracks.

    read the analysis from FILENAME_EXPRESSION

    .. note::
       The file A589_Data_RMA.csv does NOT always contain the probeset_id
       in the first column; it might instead contain the transcript_cluster_id.
       A possible explanation is that if several probesets map to the same
       transcript cluster, the transcript cluster is normalized.

       The sets of cluster_ids and probeset ids are completely non-overlapping.

    Hence, the :term:`cluster_id` will be used.
    '''

    E.info("importing expression data from %s" % infile)

    dbhandle = sqlite3.connect(PARAMS["database"])

    cc = dbhandle.cursor()
    statement = "SELECT DISTINCT probeset, cluster_id, transcript_id FROM probeset2transcript"
    cc.execute(statement)
    map_cluster2transcript, map_probeset2cluster = {}, {}
    for probeset, cluster, transcript_id in cc.fetchall():
        map_probeset2cluster[probeset] = cluster
        map_cluster2transcript[cluster] = transcript_id

    reader = csv.reader(open(infile, "rU"))

    first = True
    # do not delete old files as this function is called several times
    output_files = IOTools.FilePool(output_pattern="exp%s.data", force=False)

    headers = (("Probe Set ID", "cluster_id"),
               ("Gene Symbol", "genesymbol"),
               ("mRna - Description", "description"),
               ("mRNA Accession", "mrna_id"),
               ("mRNA  Source", "source"),
               ("mRNA - xhyb", "xhyb"),
               ("GO Biological Process ID", "go_biol_id"),
               ("GO Biological Process Term", "go_biol_term"),
               ("GO Cellular Component ID", "go_cell_id"),
               ("GO Cellular Component Term", "go_cell_term"),
               ("GO Molecular Function ID", "go_mol_id"),
               ("GO Molecular Function Term", "go_mol_term"),
               ("Pathway Source", "pw_source"),
               ("Pathway Name", "pw_name"))

    old_headers = set([x[0] for x in headers])
    new_headers = [x[1] for x in headers]
    take = []
    index_source, index_accession, index_probeset = None, None, None
    counts = E.Counter()
    found = set()

    outf = open(outfiles[0] + suffix, "w")
    outf.write("# %s\n" % infile)
    outs = open(outfiles[1] + suffix, "w")
    outs.write("# %s\n" % infile)

    writer = csv.writer(outf)

    for row in reader:
        if first:
            first = False
            writer.writerow(row)

            for x, old_header in enumerate(row):
                if old_header == "mRNA  Source": index_source = len(take)
                if old_header == "mRNA Accession": index_accession = len(take)
                if old_header == "Probe Set ID": index_probeset = len(take)
                if old_header in old_headers: take.append(x)

            # write headers to all files
            outs.write("\t".join(new_headers) + "\n")

            for exp, columns in map_exp2columns.items():
                output_files.write(
                    exp, "\t".join(
                        ("cluster_id", Stats.Summary().getHeader(), "\t".join(
                            ["R%i" % i for i in range(len(columns))]))) + "\n")
        else:
            new_row = []
            for x in take:
                if row[x].strip() != "---":
                    new_row.append(row[x].strip())
                else:
                    new_row.append("")

            probeset = new_row[index_probeset].strip()
            if probeset in map_probeset2cluster:
                probeset = map_probeset2cluster[probeset]
                counts.mapped_to_cluster += 1

            if probeset not in map_cluster2transcript:
                writer.writerow(row)
                counts.skipped += 1
                continue
            else:
                if probeset in found:
                    counts.duplicates += 1
                counts.output += 1
                found.add(probeset)

            outs.write("\t".join(new_row) + "\n")

            for exp, cols in map_exp2columns.items():
                data = [row[x] for x in cols]
                output_files.write(
                    exp, "\t".join(
                        (probeset, str(Stats.Summary(
                            [float(x)
                             for x in data])), "\t".join(data))) + "\n")

    outf.close()
    if counts.duplicates > 0:
        P.warn("duplicate probeset/clusters")

    P.info("probeset source information: %s" % str(counts))
    output_files.close()
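
buildExpressionTracks() constructs its FilePool with force=False so that repeated calls keep appending to the same exp<name>.data files instead of truncating them; that reading is based on the comment "do not delete old files as this function is called several times". A small hedged sketch of that append-across-calls usage; the experiment name and rows are invented.

# Sketch of the force=False behaviour assumed above: a new FilePool
# created with force=False should keep what an earlier call already wrote.
import CGAT.IOTools as IOTools

def write_batch(rows):
    pool = IOTools.FilePool(output_pattern="exp%s.data", force=False)
    for exp, line in rows:
        pool.write(exp, line)
    pool.close()

write_batch([("A", "cluster1\t1.0\n")])
write_batch([("A", "cluster2\t2.0\n")])   # expected to append to expA.data
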
Example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    if DISABLE:
        print("# tophat_segment_juncs.py disabled")
        argv[0] = "segment_juncs.original"
        runCommand(argv, "segment_juncs.log")
        return 0

    E.Start(no_parsing=True)

    # collect arguments
    parser = argparse.ArgumentParser(description='Process tophat options.')
    parser.add_argument('-p',
                        '--num-threads',
                        metavar='N',
                        type=int,
                        dest='nthreads',
                        help='number of threads')
    parser.add_argument('--version', action='version', version='%(prog)s')
    options, args = parser.parse_known_args(argv[1:])

    E.info("parallelizing segment juncs with %i threads" % options.nthreads)

    x = argv.index("--ium-reads") + 1

    all_options = argv[1:x]

    (input_missing_reads, input_genome, output_junctions, output_insertions,
     output_deletions, input_left_all_reads, input_left_all_map,
     input_left_segments_maps) = argv[x:x + 8]

    input_left_segments_maps = input_left_segments_maps.split(",")

    if len(argv) > x + 8:
        (input_right_all_reads, input_right_all_map,
         input_right_segments_maps) = argv[x + 8:x + 11]
        input_right_segments_maps = input_right_segments_maps.split(",")
    else:
        input_right_all_reads = ""
        input_right_all_map = ""
        input_right_segments_maps = []

    keys = set()

    # some filenames might appear multiple times
    files_to_split = set(
        [input_left_all_map, input_right_all_map] +
        input_left_segments_maps +
        input_right_segments_maps)

    E.info("splitting %i files" % len(files_to_split))

    ## split all map files by chromosome
    for filename in files_to_split:
        if filename == "": continue
        E.info("splitting %s" % filename)
        base, ext = os.path.splitext(filename)

        f = glob.glob("%s.input.*%s" % (filename, ext))
        if f:
            E.info("files already exist - skipping")
            keys.update([
                re.match(r"%s.input.(\S+)%s" % (filename, ext), x).groups()[0]
                for x in f
            ])
            continue

        infile = IOTools.openFile(filename)

        outfiles = IOTools.FilePool(filename + ".input.%s" + ext)

        for line in infile:
            key = line.split("\t")[2]
            keys.add(key)
            outfiles.write(key, line)

        outfiles.close()

    # keys = set( ["chr1", "chr2", "chr3", "chr4", "chr5",
    #              "chr6", "chr7", "chr8", "chr9", "chr10",
    #              "chr11", "chr12", "chr13", "chr14", "chr15",
    #              "chr16", "chr17", "chr18", "chr19", "chr20",
    #              "chr21", "chr22", "chrX", "chrY", "chrM" ] )

    E.info("working on %i contigs: %s" % (len(keys), list(keys)))

    pool = multiprocessing.pool.ThreadPool(options.nthreads)
    #pool = threadpool.ThreadPool( THREADS )

    tmpdir = os.path.dirname(input_left_all_reads)
    logdir = os.path.join(tmpdir[:-len("tmp")], "logs")

    if not os.path.exists(logdir):
        raise IOError("can not find logdir %s" % logdir)

    args = []
    for key in keys:

        def modout(old, key):
            if not old: return ""
            _, ext = os.path.splitext(old)
            return old + ".output.%s%s" % (key, ext)

        def modin(old, key):
            if not old: return ""
            _, ext = os.path.splitext(old)
            return old + ".input.%s%s" % (key, ext)

        def modgenome(old, key):
            dirname, filename = os.path.split(old)
            genome, ext = os.path.splitext(filename)
            if genome.lower().endswith("_cs"): genome = genome[:-3]
            new = os.path.join(dirname, genome + ".perchrom", key + ext)
            if not os.path.exists(new):
                raise ValueError("can not find chromosome file %s" % new)
            return new

        cmd = ["segment_juncs"] + all_options + [
            input_missing_reads,
            modgenome(input_genome, key),
            modout(output_junctions, key),
            modout(output_insertions, key),
            modout(output_deletions, key),
            input_left_all_reads,
            modin(input_left_all_map, key),
            ",".join([modin(x, key) for x in input_left_segments_maps]),
            input_right_all_reads,
            modin(input_right_all_map, key),
            ",".join([modin(x, key) for x in input_right_segments_maps])]

        logfile = os.path.join(logdir, "segment_juncs_%s.log" % key)
        args.append((cmd, logfile))

    E.info("submitting %i jobs" % len(keys))

    pool.map(runCommand, args, chunksize=1)
    pool.close()
    pool.join()

    E.info("all jobs finished successfully")

    E.info("merging results")
    ## merge results
    for filename in (output_junctions, output_insertions, output_deletions):
        outfile = open(filename, "w")
        for inf in glob.glob(filename + ".output.*"):
            infile = open(inf, "r")
            outfile.write(infile.read())
            infile.close()
        outfile.close()

    E.info("results merged")

    ## cleaning up is done automatically by tophat
    E.info("cleaning up")
    for f in (glob.glob(os.path.join(tmpdir, "*.output.*")) +
              glob.glob(os.path.join(tmpdir, "*.input.*"))):
        os.remove(f)

    ## write footer and output benchmark information.
    E.Stop()
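
The pattern in this last example is fan-out / fan-in: split every map file by chromosome with a FilePool keyed on the contig column, run one segment_juncs job per contig in a thread pool, then concatenate the per-contig outputs. Below is a toy sketch of that pattern with an invented worker standing in for the real segment_juncs command; file names and keys are illustrative only.

# Toy sketch of the fan-out / fan-in pattern used above: process each
# contig in a thread pool, then merge the per-contig outputs. The
# worker and the file names are invented.
import glob
import multiprocessing.pool

def worker(job):
    key, logfile = job
    # the real script runs segment_juncs here and logs to `logfile`
    with open("junctions.output.%s" % key, "w") as out:
        out.write("results for %s\n" % key)

keys = ["chr1", "chr2", "chrX"]
pool = multiprocessing.pool.ThreadPool(3)
pool.map(worker, [(k, "segment_juncs_%s.log" % k) for k in keys], chunksize=1)
pool.close()
pool.join()

# fan-in: merge the per-contig outputs into one file
with open("junctions", "w") as outfile:
    for inf in glob.glob("junctions.output.*"):
        outfile.write(open(inf).read())
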