def split_gtf_by_category(infiles, outfiles, catname):
    '''split a GTF file into one output GTF per category.

    *catfile* maps transcript or gene identifiers to a category name;
    GTF lines whose transcript_id or gene_id is not in the mapping are
    skipped.
    '''

    catfile, gtffile = infiles

    # series mapping transcript/gene id -> category
    categories = pd.read_csv(catfile, index_col=0, squeeze=True, sep="\t")

    # create output filepool
    outpool = IOTools.FilePool("{}_%s.gtf.gz".format(catname), force=True)

    gtffile = IOTools.openFile(gtffile)

    for gtfline in GTF.iterator(gtffile):

        try:
            transcript_id = gtfline.transcript_id
        except AttributeError:
            transcript_id = None

        try:
            gene_id = gtfline.gene_id
        except AttributeError:
            gene_id = None

        if transcript_id in categories.index:
            outpool.write(categories[transcript_id], str(gtfline) + "\n")
        elif gene_id in categories.index:
            outpool.write(categories[gene_id], str(gtfline) + "\n")

    outpool.close()
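
Most examples on this page follow the same IOTools.FilePool pattern: build the pool with a filename pattern containing a %s placeholder (or pass complete filenames to a default pool), route each record with write(key, line), and call close() once at the end. A minimal sketch of that pattern follows; the import path and the toy records are assumptions for illustration, not part of the example above.

# Minimal FilePool sketch; the import path varies between CGAT releases
# and the records below are invented for illustration.
import CGAT.IOTools as IOTools

records = [("promoter", "chr1\t100\t200\n"),
           ("enhancer", "chr1\t500\t600\n"),
           ("promoter", "chr2\t300\t400\n")]

pool = IOTools.FilePool("regions_%s.bed")   # %s is replaced by the write() key
for category, line in records:
    pool.write(category, line)              # appends to regions_<category>.bed
pool.close()
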
Example #2
def chunk_iterator_column(infile, args, prefix, use_header=False):
    """split at column.

    The table need not be sorted by this column.
    If num_files is given, files will randomly created
    and tags according to column randomly assigned.


    """

    column, max_files = args
    files = IOTools.FilePool()
    header = False

    if max_files:
        map_tag2file = {}

    for line in infile:
        if line[0] == "#":
            continue

        if not header and use_header:
            files.setHeader(line)
            header = True
            continue

        key = line[:-1].split("\t")[column]
        if max_files:
            if key in map_tag2file:
                key = map_tag2file[key]
            else:
                n = "%010i" % (len(map_tag2file) % max_files)
                map_tag2file[key] = n
                key = n

        files.write("%s/%s.in" % (prefix, key), line)

    for filename, count in list(files.items()):
        E.info("created file %s with %i items" % (filename, count))
        yield filename
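
chunk_iterator_column() yields the name of each chunk file after the whole table has been distributed. Below is a hedged sketch of how a caller might drive it; the input file, the column index and the prefix directory are invented, and the prefix directory is assumed to exist already.

# Hypothetical driver for chunk_iterator_column(); file name, column
# index and prefix are invented, and ./chunks is assumed to exist.
import CGAT.IOTools as IOTools

infile = IOTools.openFile("data.tsv.gz")

# split on column 2 (0-based), using at most 10 chunk files
for chunk in chunk_iterator_column(infile, args=(2, 10),
                                   prefix="./chunks",
                                   use_header=True):
    print("chunk written: %s" % chunk)   # e.g. ./chunks/0000000003.in
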
Example #3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--min-overlap",
                      dest="min_overlap",
                      type="int",
                      help="minimum overlap")

    parser.add_option("-w",
                      "--pattern-window",
                      dest="pattern_window",
                      type="string",
                      help="regular expression to extract window "
                      "coordinates from test id [%default]")

    parser.add_option("-i",
                      "--invert",
                      dest="invert",
                      action="store_true",
                      help="invert direction of fold change [%default]")

    parser.set_defaults(min_overlap=10,
                        invert=False,
                        pattern_window=r"(\S+):(\d+)-(\d+)")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    outfiles = IOTools.FilePool(options.output_filename_pattern)

    if options.invert:
        test_f = lambda l2fold: l2fold < 0
    else:
        test_f = lambda l2fold: l2fold > 0

    def read():

        rx_window = re.compile(options.pattern_window)
        # filter out any DESeq/EdgeR messages that end up at the top
        # of the output file

        for data in IOTools.iterate(options.stdin):

            contig, start, end = rx_window.match(data.test_id).groups()
            start, end = map(int, (start, end))

            yield DATA._make(
                (contig, start, end,
                 data.treatment_name,
                 float(data.treatment_mean),
                 float(data.treatment_std),
                 data.control_name,
                 float(data.control_mean),
                 float(data.control_std),
                 float(data.pvalue),
                 float(data.qvalue),
                 float(data.l2fold),
                 float(data.fold),
                 int(data.significant),
                 data.status, 0))

    def grouper(data, distance=10):

        last = next(data)
        entries = [last]

        while 1:
            # an exhausted iterator raises StopIteration rather than
            # returning None, so catch it to yield the final group
            try:
                d = next(data)
            except StopIteration:
                break
            if d.contig == last.contig and d.start < last.start:
                raise ValueError("error not sorted by start")

            if ((d.contig != last.contig) or (d.start - last.end > distance)
                    or (d.status != last.status)
                    or (d.significant != last.significant)
                    or (d.l2fold * last.l2fold < 0)):
                yield entries
                entries = []

            entries.append(d)
            last = d

        yield entries

    counter = E.Counter()

    options.stdout.write("\t".join(DATA._fields) + "\n")

    # set of all sample names - used to create empty files
    samples = set()

    # need to sort by coordinate
    all_data = list(read())
    all_data.sort(key=lambda x: (x.contig, x.start))

    for group in grouper(iter(all_data), distance=options.min_overlap):

        start, end = group[0].start, group[-1].end
        assert start < end, 'start >= end: %s' % str(group)
        n = float(len(group))
        counter.input += n

        g = group[0]

        if g.l2fold < 0:
            l2fold = max([x.l2fold for x in group])
            fold = max([x.fold for x in group])
        else:
            l2fold = min([x.l2fold for x in group])
            fold = min([x.fold for x in group])

        outdata = DATA._make(
            (g.contig, start, end,
             g.treatment_name,
             sum([x.treatment_mean for x in group]) / n,
             max([x.treatment_std for x in group]),
             g.control_name,
             sum([x.control_mean for x in group]) / n,
             max([x.control_std for x in group]),
             max([x.pvalue for x in group]),
             max([x.qvalue for x in group]),
             l2fold, fold, g.significant, g.status, int(n)))

        samples.add(g.treatment_name)
        samples.add(g.control_name)
        if g.significant:
            if test_f(g.l2fold):
                # treatment lower methylation than control
                outfiles.write(
                    g.treatment_name, "%s\t%i\t%i\t%s\t%f\n" %
                    (g.contig, g.start, g.end, g.treatment_name,
                     sum([x.treatment_mean for x in group]) / n))

            else:
                outfiles.write(
                    g.control_name, "%s\t%i\t%i\t%s\t%f\n" %
                    (g.contig, g.start, g.end, g.control_name,
                     sum([x.control_mean for x in group]) / n))

        options.stdout.write("\t".join(map(str, outdata)) + "\n")

        counter.output += 1

    for sample in samples:
        outfiles.write(sample, "")

    outfiles.close()
    E.info("%s" % counter)

    # write footer and output benchmark information.
    E.Stop()
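
The core of the script above is the grouper() generator: after sorting by contig and start, consecutive windows are merged as long as they stay on the same contig, start within min_overlap bases of the previous window's end, and agree in status, significance and direction of fold change. The following is a toy, self-contained illustration of that grouping rule; the Window tuple and the coordinates are invented (the real script works on the module-level DATA namedtuple).

# Toy illustration of the interval-grouping rule used above; the
# Window namedtuple and the coordinates are invented for this sketch.
import collections

Window = collections.namedtuple("Window", "contig start end l2fold significant status")

windows = [Window("chr1", 100, 200,  1.2, 1, "OK"),
           Window("chr1", 205, 300,  0.8, 1, "OK"),   # within 10 bp -> same group
           Window("chr1", 500, 600, -0.5, 1, "OK"),   # sign flips   -> new group
           Window("chr2", 100, 200, -0.4, 1, "OK")]   # new contig   -> new group

groups, current = [], [windows[0]]
for last, d in zip(windows, windows[1:]):
    if (d.contig != last.contig or d.start - last.end > 10 or
            d.status != last.status or d.significant != last.significant or
            d.l2fold * last.l2fold < 0):
        groups.append(current)
        current = []
    current.append(d)
groups.append(current)
print([len(g) for g in groups])   # [2, 1, 1]
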
def buildExpressionTracks(infile, outfiles, map_exp2columns, suffix):
    '''build expression tracks.

    read the analysis from FILENAME_EXPRESSION

    .. note::
       The file A589_Data_RMA.csv does NOT always contain the probeset_id
       in the first column; it might instead contain the transcript_cluster_id.
       A possible explanation is that if several probesets map to the same
       transcript cluster, the transcript cluster is normalized.

       The sets of cluster_ids and probeset ids are completely non-overlapping.

    Hence, the :term:`cluster_id` will be used.
    '''

    E.info("importing expression data from %s" % infile)

    dbhandle = sqlite3.connect(PARAMS["database"])

    cc = dbhandle.cursor()
    statement = "SELECT DISTINCT probeset, cluster_id, transcript_id FROM probeset2transcript"
    cc.execute(statement)
    map_cluster2transcript, map_probeset2cluster = {}, {}
    for probeset, cluster, transcript_id in cc.fetchall():
        map_probeset2cluster[probeset] = cluster
        map_cluster2transcript[cluster] = transcript_id

    reader = csv.reader(open(infile, "rU"))

    first = True
    # do not delete old files as this function is called several times
    output_files = IOTools.FilePool(output_pattern="exp%s.data", force=False)

    headers = (("Probe Set ID", "cluster_id"),
               ("Gene Symbol", "genesymbol"),
               ("mRna - Description", "description"),
               ("mRNA Accession", "mrna_id"),
               ("mRNA  Source", "source"),
               ("mRNA - xhyb", "xhyb"),
               ("GO Biological Process ID", "go_biol_id"),
               ("GO Biological Process Term", "go_biol_term"),
               ("GO Cellular Component ID", "go_cell_id"),
               ("GO Cellular Component Term", "go_cell_term"),
               ("GO Molecular Function ID", "go_mol_id"),
               ("GO Molecular Function Term", "go_mol_term"),
               ("Pathway Source", "pw_source"),
               ("Pathway Name", "pw_name"))

    old_headers = set([x[0] for x in headers])
    new_headers = [x[1] for x in headers]
    take = []
    index_source, index_accession, index_probeset = None, None, None
    counts = E.Counter()
    found = set()

    outf = open(outfiles[0] + suffix, "w")
    outf.write("# %s\n" % infile)
    outs = open(outfiles[1] + suffix, "w")
    outs.write("# %s\n" % infile)

    writer = csv.writer(outf)

    for row in reader:
        if first:
            first = False
            writer.writerow(row)

            for x, old_header in enumerate(row):
                if old_header == "mRNA  Source": index_source = len(take)
                if old_header == "mRNA Accession": index_accession = len(take)
                if old_header == "Probe Set ID": index_probeset = len(take)
                if old_header in old_headers: take.append(x)

            # write headers to all files
            outs.write("\t".join(new_headers) + "\n")

            for exp, columns in map_exp2columns.items():
                output_files.write(
                    exp, "\t".join(
                        ("cluster_id", Stats.Summary().getHeader(), "\t".join(
                            ["R%i" % i for i in range(len(columns))]))) + "\n")
        else:
            new_row = []
            for x in take:
                if row[x].strip() != "---":
                    new_row.append(row[x].strip())
                else:
                    new_row.append("")

            probeset = new_row[index_probeset].strip()
            if probeset in map_probeset2cluster:
                probeset = map_probeset2cluster[probeset]
                counts.mapped_to_cluster += 1

            if probeset not in map_cluster2transcript:
                writer.writerow(row)
                counts.skipped += 1
                continue
            else:
                if probeset in found:
                    counts.duplicates += 1
                counts.output += 1
                found.add(probeset)

            outs.write("\t".join(new_row) + "\n")

            for exp, cols in map_exp2columns.items():
                data = [row[x] for x in cols]
                output_files.write(
                    exp, "\t".join(
                        (probeset, str(Stats.Summary(
                            [float(x)
                             for x in data])), "\t".join(data))) + "\n")

    outf.close()
    if counts.duplicates > 0:
        P.warn("duplicate probeset/clusters")

    P.info("probeset source information: %s" % str(counts))
    output_files.close()
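
buildExpressionTracks() constructs its FilePool with force=False so that repeated calls keep appending to the same exp<name>.data files instead of truncating them; that reading is based on the comment "do not delete old files as this function is called several times". A small hedged sketch of that append-across-calls usage; the experiment name and rows are invented.

# Sketch of the force=False behaviour assumed above: a new FilePool
# created with force=False should keep what an earlier call already wrote.
import CGAT.IOTools as IOTools

def write_batch(rows):
    pool = IOTools.FilePool(output_pattern="exp%s.data", force=False)
    for exp, line in rows:
        pool.write(exp, line)
    pool.close()

write_batch([("A", "cluster1\t1.0\n")])
write_batch([("A", "cluster2\t2.0\n")])   # expected to append to expA.data
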
Example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    if DISABLE:
        print("# tophat_segment_juncs.py disabled")
        argv[0] = "segment_juncs.original"
        runCommand(argv, "segment_juncs.log")
        return 0

    E.Start(no_parsing=True)

    # collect arguments
    parser = argparse.ArgumentParser(description='Process tophat options.')
    parser.add_argument('-p',
                        '--num-threads',
                        metavar='N',
                        type=int,
                        dest='nthreads',
                        help='number of threads')
    parser.add_argument('--version', action='version', version='%(prog)s')
    options, args = parser.parse_known_args(argv[1:])

    E.info("parallelizing segment juncs with %i threads" % options.nthreads)

    x = argv.index("--ium-reads") + 1

    all_options = argv[1:x]

    (input_missing_reads, input_genome, output_junctions, output_insertions,
     output_deletions, input_left_all_reads, input_left_all_map,
     input_left_segments_maps) = argv[x:x + 8]

    input_left_segments_maps = input_left_segments_maps.split(",")

    if len(argv) > x + 8:
        (input_right_all_reads, input_right_all_map,
         input_right_segments_maps) = argv[x + 8:x + 11]
        input_right_segments_maps = input_right_segments_maps.split(",")
    else:
        input_right_all_reads = ""
        input_right_all_map = ""
        input_right_segments_maps = []

    keys = set()

    # some filenames might appear multiple times
    files_to_split = set(
        [input_left_all_map, input_right_all_map] +
        input_left_segments_maps +
        input_right_segments_maps)

    E.info("splitting %i files" % len(files_to_split))

    ## split all map files by chromosome
    for filename in files_to_split:
        if filename == "": continue
        E.info("splitting %s" % filename)
        base, ext = os.path.splitext(filename)

        f = glob.glob("%s.input.*%s" % (filename, ext))
        if f:
            E.info("files already exist - skipping")
            keys.update([
                re.match(r"%s.input.(\S+)%s" % (filename, ext), x).groups()[0]
                for x in f
            ])
            continue

        infile = IOTools.openFile(filename)

        outfiles = IOTools.FilePool(filename + ".input.%s" + ext)

        for line in infile:
            key = line.split("\t")[2]
            keys.add(key)
            outfiles.write(key, line)

        outfiles.close()

    # keys = set( ["chr1", "chr2", "chr3", "chr4", "chr5",
    #              "chr6", "chr7", "chr8", "chr9", "chr10",
    #              "chr11", "chr12", "chr13", "chr14", "chr15",
    #              "chr16", "chr17", "chr18", "chr19", "chr20",
    #              "chr21", "chr22", "chrX", "chrY", "chrM" ] )

    E.info("working on %i contigs: %s" % (len(keys), list(keys)))

    pool = multiprocessing.pool.ThreadPool(options.nthreads)
    #pool = threadpool.ThreadPool( THREADS )

    tmpdir = os.path.dirname(input_left_all_reads)
    logdir = os.path.join(tmpdir[:-len("tmp")], "logs")

    if not os.path.exists(logdir):
        raise IOError("can not find logdir %s" % logdir)

    args = []
    for key in keys:

        def modout(old, key):
            if not old: return ""
            _, ext = os.path.splitext(old)
            return old + ".output.%s%s" % (key, ext)

        def modin(old, key):
            if not old: return ""
            _, ext = os.path.splitext(old)
            return old + ".input.%s%s" % (key, ext)

        def modgenome(old, key):
            dirname, filename = os.path.split(old)
            genome, ext = os.path.splitext(filename)
            if genome.lower().endswith("_cs"): genome = genome[:-3]
            new = os.path.join(dirname, genome + ".perchrom", key + ext)
            if not os.path.exists(new):
                raise ValueError("can not find chromosome file %s" % new)
            return new

        cmd = ["segment_juncs"] + all_options + [
            input_missing_reads,
            modgenome(input_genome, key),
            modout(output_junctions, key),
            modout(output_insertions, key),
            modout(output_deletions, key),
            input_left_all_reads,
            modin(input_left_all_map, key),
            ",".join([modin(x, key) for x in input_left_segments_maps]),
            input_right_all_reads,
            modin(input_right_all_map, key),
            ",".join([modin(x, key) for x in input_right_segments_maps])]

        logfile = os.path.join(logdir, "segment_juncs_%s.log" % key)
        args.append((cmd, logfile))

    E.info("submitting %i jobs" % len(keys))

    pool.map(runCommand, args, chunksize=1)
    pool.close()
    pool.join()

    E.info("all jobs finished successfully")

    E.info("merging results")
    ## merge results
    for filename in (output_junctions, output_insertions, output_deletions):
        outfile = open(filename, "w")
        for inf in glob.glob(filename + ".output.*"):
            infile = open(inf, "r")
            outfile.write(infile.read())
            infile.close()
        outfile.close()

    E.info("results merged")

    ## cleaning up is done automatically by tophat
    E.info("cleaning up")
    for f in (glob.glob(os.path.join(tmpdir, "*.output.*")) +
              glob.glob(os.path.join(tmpdir, "*.input.*"))):
        os.remove(f)

    ## write footer and output benchmark information.
    E.Stop()
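
The pattern in this last example is fan-out / fan-in: split every map file by chromosome with a FilePool keyed on the contig column, run one segment_juncs job per contig in a thread pool, then concatenate the per-contig outputs. Below is a toy sketch of that pattern with an invented worker standing in for the real segment_juncs command; file names and keys are illustrative only.

# Toy sketch of the fan-out / fan-in pattern used above: process each
# contig in a thread pool, then merge the per-contig outputs. The
# worker and the file names are invented.
import glob
import multiprocessing.pool

def worker(job):
    key, logfile = job
    # the real script runs segment_juncs here and logs to `logfile`
    with open("junctions.output.%s" % key, "w") as out:
        out.write("results for %s\n" % key)

keys = ["chr1", "chr2", "chrX"]
pool = multiprocessing.pool.ThreadPool(3)
pool.map(worker, [(k, "segment_juncs_%s.log" % k) for k in keys], chunksize=1)
pool.close()
pool.join()

# fan-in: merge the per-contig outputs into one file
with open("junctions", "w") as outfile:
    for inf in glob.glob("junctions.output.*"):
        outfile.write(open(inf).read())
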