def annotate(infile, outfile, geneset):
    '''annotate NOGs into functional categories'''
    annotation = {}
    E.info("loading geneset")
    anno = IOTools.openFile(geneset)
    for line in anno:
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = IOTools.openFile(infile)
    header = inf.readline()
    outf = IOTools.openFile(outfile, "w")
    # the appended column holds the functional category of each NOG
    outf.write(header[:-1] + "\tfunction\n")
    for line in inf:
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            funccat = annotation[nog]
        except KeyError:
            funccat = "Function unknown"
        outf.write(line[:-1] + "\t" + funccat + "\n")
    outf.close()

def filterGTF(gtf, filterstring, tempout):

    if "!=" in filterstring:
        column, value = filterstring.split("!=")
        value = value.split("+")
        filtertype = "notin"

    elif "=" in filterstring:
        column, value = filterstring.split("=")
        value = value.split("+")
        filtertype = "in"

    elif "-in_file-" in filterstring:
        column, value = filterstring.split("-in_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "in_file"

    elif "-notin_file-" in filterstring:
        column, value = filterstring.split("-notin_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "notin_file"

    elif "-morethan-" in filterstring:
        column, value = filterstring.split("-morethan-")
        value = float(value)
        filtertype = "morethan"

    elif "-lessthan-" in filterstring:
        column, value = filterstring.split("-lessthan-")
        value = float(value)
        filtertype = "lessthan"

    else:
        raise ValueError("could not parse filter '%s'" % filterstring)

    gfile = IOTools.openFile(gtf)
    G = GTF.iterator(gfile)

    out = IOTools.openFile(tempout, "w")
    for item in G:
        D = item.asDict()
        D['contig'] = item.contig
        D['source'] = item.source
        D['feature'] = item.feature
        D['start'] = item.start
        D['end'] = item.end
        D['strand'] = item.strand
        D['frame'] = item.frame

        if filtertype == "in" or filtertype == "in_file":
            if D[column] in value:
                out.write("%s\n" % str(item))
        elif filtertype == "notin" or filtertype == "notin_file":
            if D[column] not in value:
                out.write("%s\n" % str(item))
        elif filtertype == "morethan":
            if float(D[column]) > value:
                out.write("%s\n" % str(item))
        elif filtertype == "lessthan":
            if float(D[column]) < value:
                out.write("%s\n" % str(item))

    out.close()
    gfile.close()

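# A minimal sketch of the filterstring mini-language accepted above; the
# file names and attribute names are illustrative only. Values for "=" /
# "!=" filters are joined with "+", numeric filters take a single float:
#
#   filterGTF("in.gtf.gz", "source=protein_coding+lincRNA", "tmp.gtf")
#   filterGTF("in.gtf.gz", "gene_id-in_file-keep_genes.txt", "tmp.gtf")
#   filterGTF("in.gtf.gz", "FPKM-morethan-1.0", "tmp.gtf")
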
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = IOTools.openFile(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)

    outfile.close()
    yield filename

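# A hedged usage sketch: the iterator yields each chunk filename as soon
# as the chunk is complete, so downstream work can start before the input
# is exhausted. "data.tsv", "tmpdir" and the chunk size of 1000 lines are
# illustrative; submit_job is a hypothetical downstream step.
#
#   with IOTools.openFile("data.tsv") as inf:
#       for chunk in chunk_iterator_lines(inf, [1000], prefix="tmpdir",
#                                         use_header=True):
#           submit_job(chunk)
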
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true."""

    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]

    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")

    for line in infile:
        if line[0] == "#":
            continue
        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = IOTools.openFile(filename, "w")
                nlines = 0
            n += 1
        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename

def __call__(self, track, slice=None):

    if slice == "transcript":
        lengths_transcripts = []
        for transcript in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in transcript])
            lengths_transcripts.append(length)
        counts, lower, dx, _ = scipy.stats.cumfreq(
            lengths_transcripts, numbins=40, defaultreallimits=(0, 20000))
        x = np.arange(counts.size) * dx + lower
        return odict((("length", x),
                      ("cumulative frequency",
                       counts / len(lengths_transcripts))))

    elif slice == "gene":
        lengths_genes = []
        for gene in GTF.flat_gene_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in gene])
            lengths_genes.append(length)
        counts, lower, dx, _ = scipy.stats.cumfreq(
            lengths_genes, numbins=40, defaultreallimits=(0, 20000))
        x = np.arange(counts.size) * dx + lower
        return odict((("length", x),
                      ("cumulative frequency",
                       counts / len(lengths_genes))))

def convertGo2Goslim(options):
    """read gene list with GO assignments and convert to GO slim
    categories."""

    E.info("reading GO assignments from stdin")
    gene2gos, go2infos = ReadGene2GOFromFile(options.stdin)
    input_genes, input_goids = countGOs(gene2gos)

    #############################################################
    # read GO ontology from file
    assert options.filename_ontology, "please supply a GO ontology"
    E.info("reading ontology from %s" % (options.filename_ontology))

    infile = IOTools.openFile(options.filename_ontology)
    ontology = readOntology(infile)
    infile.close()

    go2infos = collections.defaultdict(dict)
    # substitute go2infos
    for go in list(ontology.values()):
        go2infos[go.mNameSpace][go.mId] = GOInfo(go.mId,
                                                 go_type=go.mNameSpace,
                                                 description=go.mName)

    E.info("reading GO assignments from %s" % options.filename_slims)
    go_slims = GetGOSlims(IOTools.openFile(options.filename_slims, "r"))

    if options.loglevel >= 1:
        v = set()
        for x in list(go_slims.values()):
            for xx in x:
                v.add(xx)
        E.info("read go slims from %s: go=%i, slim=%i" %
               (options.filename_slims, len(go_slims), len(v)))

    output_goids, output_genes = set(), set()
    noutput = 0

    options.stdout.write("\t".join(("go_type", "gene_id", "go_id",
                                    "description", "evidence")) + "\n")

    for category, gene2go in sorted(gene2gos.items()):
        gene2go = MapGO2Slims(gene2go, go_slims, ontology)
        for gene_id, values in sorted(gene2go.items()):
            output_genes.add(gene_id)
            for go in sorted(values, key=lambda x: x.mGOId):
                output_goids.add(go.mGOId)
                options.stdout.write("%s\t%s\t%s\t%s\t%s\n" %
                                     (go.mGOType,
                                      gene_id,
                                      go.mGOId,
                                      go.mDescription,
                                      "NA"))
                noutput += 1

    E.info(("ninput_genes=%i, ninput_goids=%i, noutput_gene=%i, "
            "noutput_goids=%i, noutput=%i") %
           (len(input_genes), len(input_goids),
            len(output_genes), len(output_goids), noutput))

def __call__(self, track, slice=None):
    c_transcript = []
    c_gene = []
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_transcript.append(len(transcript))
    for gene in GTF.flat_gene_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_gene.append(len(gene))

    return odict((("transcript", np.mean(c_transcript)),
                  ("gene", np.mean(c_gene))))

def makeCpgIslandsBed(outfile):
    infile = PARAMS["methylation_summary_cpgislands"]
    out = IOTools.openFile(outfile, "w")
    with IOTools.openFile(infile, "r") as f:
        for line in f:
            # this assumes a fixed column order for contig, start, end
            contig, start, end = line.split()[1:4]
            # skip the header row
            if contig != "chrom":
                out.write("%s\t%s\t%s\n" % (contig, start, end))
    out.close()

def make1basedCpgIslands(infile, outfile):

    # outfile, loadfile = outfiles
    out = IOTools.openFile(outfile, "w")
    out.write("%s\t%s\t%s\n" % ("contig", "position", "cpgi"))

    with IOTools.openFile(infile, "r") as f:
        for line in f:
            contig, start, stop = line.split()
            for position in range(int(start), int(stop) + 2):
                out.write("%s\t%s\t%s\n" % (contig, position, "CpGIsland"))
    out.close()

def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.
    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks',
              'st_ctime', 'st_dev', 'st_gid', 'st_ino',
              'st_mode', 'st_mtime', 'st_nlink',
              'st_rdev', 'st_size', 'st_uid')

    dry_run = PARAMS.get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = IOTools.openFile(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = IOTools.openFile(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = IOTools.zapFile(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write("%s\t%s\t%s\t%s\n" % (
                    fn,
                    time.asctime(time.localtime(time.time())),
                    linkdest,
                    "\t".join([str(getattr(stat, x)) for x in fields])))

    E.info("zapped: %s" % (c))
    # the logfile is only opened outside of dry runs
    if not dry_run:
        outfile.close()

    return c

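# A hedged usage sketch; the glob pattern and logfile name are
# illustrative. With PARAMS["dryrun"] set, files are only counted, not
# zapped.
#
#   import glob
#   counter = clean(glob.glob("export/*.bam"), "zap.log")
#   E.info("considered %i files" % counter.files)
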
def readDefinitions(filename):
    '''read definitions from a :term:`yaml` file.'''

    with IOTools.openFile(filename) as f:
        # safe_load avoids arbitrary object construction from the yaml file
        config = yaml.safe_load(f)

    if config is None:
        raise IOError("could not read data from '%s'" % filename)

    return config

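# A minimal sketch of a definitions file this would accept; the keys are
# hypothetical, not a required schema:
#
#   # definitions.yml:
#   #   tool: bwa
#   #   options:
#   #       threads: 4
#
#   config = readDefinitions("definitions.yml")
#   # config == {"tool": "bwa", "options": {"threads": 4}}
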
def openOutputFile(section, mode="w"):
    """open file for writing substituting section in the
    output_pattern (if defined).

    If the filename ends with ".gz", the output is opened as a gzip'ed
    file.

    Arguments
    ---------
    section : string
        section will replace any %s in the pattern for output files.
    mode : char
        file opening mode

    Returns
    -------
    File
        an opened file
    """

    fn = getOutputFile(section)
    try:
        if fn == "-":
            return global_options.stdout
        else:
            if not global_options.output_force and os.path.exists(fn):
                raise OSError(
                    ("file %s already exists, use --force-output to "
                     "overwrite existing files.") % fn)
            return IOTools.openFile(fn, mode)
    except AttributeError:
        return global_options.stdout

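# A hedged sketch of the calling convention: with an output pattern such
# as "results_%s.tsv.gz" (hypothetical), each section is written to its
# own gzip'ed file, falling back to stdout when no pattern is set.
#
#   outf = openOutputFile("summary")
#   outf.write("metric\tvalue\n")
#   if outf is not global_options.stdout:
#       outf.close()
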
def __call__(self, track, slice=None):

    classes = ["antisense",
               "antisense_upstream",
               "antisense_downstream",
               "sense_upstream",
               "sense_downstream",
               "intergenic",
               "sense_intronic",
               "antisense_intronic"]

    coding_set = {}
    for gtf in GTF.iterator(
            IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz")):
        coding_set[gtf.transcript_id] = gtf.source

    result = {"noncoding": {}, "coding": collections.defaultdict(int)}
    total_nc = float(self.getValue(
        "SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'"))
    for c in classes:
        result["noncoding"][c] = (float(self.getValue(
            """SELECT COUNT(*) FROM lncrna_final_class as a,
            %s_cpc_result as b
            WHERE a.class = '%s'
            AND b.C_NC = 'noncoding'
            AND a.transcript_id = b.transcript_id""" %
            (track, c))) / total_nc) * 100

    total_c = len(coding_set)
    # the set of coding transcript_ids is the same for every class, so
    # fetch it once
    ids = self.getValues(
        "SELECT transcript_id FROM %(track)s_cpc_result "
        "WHERE C_NC = 'coding'")
    for c in classes:
        for i in ids:
            if i in coding_set and coding_set[i] == c:
                result["coding"][c] += 1

    for x, y in result["coding"].items():
        result["coding"][x] = (float(y) / total_c) * 100

    return result

def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    get species level relative abundances for the simulated data.
    This involves creating maps between different identifiers from the
    NCBI taxonomy. This is so that the results are comparable to species
    level analysis from metaphlan
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]

    taxa = open(infiles[1])
    header = taxa.readline()
    gi2taxa = collections.defaultdict(list)
    for line in taxa:
        data = line[:-1].split("\t")
        gi, strain, species, genus, family, order, _class, phylum = data[:8]
        gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    outf = open(outfile, "w")
    outf.write("level\ttaxa\trelab\n")
    for i in range(len(levels)):
        total = 0
        result = collections.defaultdict(int)
        for fastq in Fastq.iterate(IOTools.openFile(infiles[0])):
            total += 1
            gi = fastq.identifier.split("|")[1]
            result[gi2taxa[gi][i]] += 1
        for taxa, value in result.items():
            outf.write("%s\t%s\t%s\n" %
                       (levels[i], taxa, float(value) / total))
    outf.close()

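# The split on "|" above assumes NCBI-style read identifiers of the form
# "gi|12345|ref|NC_000001.1|", where field 1 is the GI number; reads
# named differently would need a different parsing rule.
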
def __call__(self, filenames, outfile, options):

    for fi, fn in filenames:
        E.debug("# merging %s" % fn)
        infile = IOTools.openFile(fn, "r")

        if options.output_header:
            self.parseHeader(infile, outfile, options)

        for l in infile:
            nfields = l.count("\t")
            if l[0] == "#":
                options.stdlog.write(l)
            elif self.nfields is not None and nfields != self.nfields:
                # validate number of fields in row, raise warning
                # for those not matching and skip.
                E.warn(
                    "# line %s has unexpected number of fields: %i != %i" %
                    (l[:-1], nfields, self.nfields))
            else:
                if self.mFieldIndex is not None:
                    data = l[:-1].split("\t")
                    try:
                        data[self.mFieldIndex] = self.mMapper(
                            fi, data[self.mFieldIndex])
                    except IndexError:
                        raise IndexError(
                            "can not find field %i in %s" %
                            (self.mFieldIndex, l))
                    l = "\t".join(data) + "\n"
                outfile.write(l)
        infile.close()

def getMappedReads(infile):
    '''return number of reads mapped.'''
    for line in IOTools.openFile(infile, "r"):
        data = line[:-1].split("\t")
        if data[1].startswith("without duplicates"):
            return int(data[0])
    return None

def buildSimpleNormalizedBAM(infiles, outfile, nreads):
    '''normalize a bam file to given number of counts
    by random sampling
    '''
    infile, countfile = infiles

    pysam_in = pysam.Samfile(infile, "rb")

    fh = IOTools.openFile(countfile, "r")
    readcount = int(fh.read())
    fh.close()

    # each read is kept with probability nreads / readcount
    threshold = float(nreads) / float(readcount)

    pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in)

    # iterate over mapped reads thinning by the threshold
    ninput, noutput = 0, 0
    for read in pysam_in.fetch():
        ninput += 1
        if random.random() <= threshold:
            pysam_out.write(read)
            noutput += 1

    pysam_in.close()
    pysam_out.close()
    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, nreads))

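# Thinning with probability nreads/readcount only hits the target count
# in expectation; the output size is binomially distributed around
# nreads. A sketch of the expected spread, assuming independent draws
# and illustrative numbers:
#
#   import math
#   readcount, nreads = 10000000, 1000000
#   p = float(nreads) / readcount
#   sd = math.sqrt(readcount * p * (1 - p))  # ~949 reads here
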
def exportPeaksAsBed(infile, outfile):
    '''export peaks as bed files.'''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    if infile.endswith("_macs.load"):
        track = infile[:-len("_macs.load")]
    else:
        track = infile[:-len("_intervals.load")]

    if track.startswith("control"):
        return

    peakwidth = PARAMS["peakwidth"]

    cc = dbhandle.cursor()
    statement = '''SELECT contig, peakcenter - %(peakwidth)i,
    peakcenter + %(peakwidth)i, interval_id, peakval
    FROM %(track)s_intervals ORDER by contig, start''' % locals()
    cc.execute(statement)

    outs = IOTools.openFile(outfile, "w")

    for result in cc:
        contig, start, end, interval_id, peakval = result
        # peakval is truncated at 1000 as this is the maximum permitted
        # score in a bed file.
        peakval = int(min(peakval, 1000))
        outs.write("%s\t%i\t%i\t%s\t%i\n" %
                   (contig, start, end, str(interval_id), peakval))

    cc.close()
    outs.close()

def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | cgat gtf2gtf
                        --method=sort --sort-order=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''
            zcat %(infile)s
            | cgat gtf2gtf --method=renumber-genes
            --pattern-identifier=%(gene_pattern)s%%i
            | cgat gtf2gtf --method=renumber-transcripts
            --pattern-identifier=%(transcript_pattern)s%%i
            | cgat gtf2gtf --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''

    P.run()

def buildGeneOntology(infile, outfile):
    '''create an output file akin to GO ontology files to be
    used with GO.py
    '''
    table = P.toTable(infile)
    columns = ("cpg", "tata")
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    i = 1
    for c in columns:
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s" %
                   locals())
        outf.write("".join(["promotor\t%s\tGO:%07i\twith_%s\tNA\n" %
                            (x[0], i, c) for x in cc]))
        i += 1
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0" %
                   locals())
        outf.write("".join(["promotor\t%s\tGO:%07i\twithout_%s\tNA\n" %
                            (x[0], i, c) for x in cc]))
        i += 1

    outf.close()

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    dir2files = {}
    for root, dirnames, files in os.walk("."):
        dir2files[root] = files

    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    filename = "CWD_%s" % st
    E.info("outputting directory state to %s" % filename)

    with IOTools.openFile(filename, "w") as outf:
        outf.write("##contents of cwd on %s\n\n" % st)
        for directory, files in dir2files.items():
            for file in files:
                path = os.path.join(directory, file)
                outf.write(path + "\n")

    # write footer and output benchmark information.
    E.Stop()

def ReadGeneLists(filename_genes, gene_pattern=None):
    """read gene lists from filename in matrix.

    returns a tuple (list of all genes, dictionary of gene lists)
    """

    if filename_genes == "-":
        infile = sys.stdin
    else:
        infile = IOTools.openFile(filename_genes, "r")

    headers, table = CSV.readTable(infile.readlines(), as_rows=False)

    if filename_genes != "-":
        infile.close()

    all_genes = table[0]

    # if there is only a single column, add a dummy column
    if len(table) == 1:
        table.append([1] * len(table[0]))
        headers.append("foreground")

    E.info("read %i genes from %s" % (len(all_genes), filename_genes))

    if gene_pattern:
        rx = re.compile(gene_pattern)
        all_genes = [rx.search(x).groups()[0] for x in all_genes]

    gene_lists = collections.OrderedDict()
    for header, col in zip(headers[1:], table[1:]):
        s = list(set([x for x, y in zip(all_genes, col) if y != "0"]))
        gene_lists[header] = set(s)

    return all_genes, gene_lists

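# For illustration, a tab-separated gene list matrix of the kind parsed
# above; a non-zero entry puts the gene in that column's list (the file
# name and column labels are hypothetical):
#
#   gene_id    upregulated    downregulated
#   ENSG01     1              0
#   ENSG02     0              1
#
#   all_genes, gene_lists = ReadGeneLists("lists.tsv")
#   # gene_lists["upregulated"] == {"ENSG01"}
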
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"],
                     PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id FROM
                           (SELECT contig_id, AVG(coverage) as ave
                            FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])

    outf = open(outfile, "w")
    E.info("%i contigs passed the coverage filter" % len(contigs))
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()

def __call__(self, filenames, outfile, options):
    for fi, fn in filenames:
        infile = IOTools.openFile(fn, "r")
        outfile.write(
            "######### logging output for %s ###################\n" % fi)
        for l in infile:
            outfile.write(l)
        infile.close()

def __call__(self, track, slice=None):

    if slice == "transcript":
        lengths_transcripts = []
        for transcript in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in transcript])
            lengths_transcripts.append(length)
        return np.mean(lengths_transcripts)

    elif slice == "gene":
        lengths_genes = []
        for gene in GTF.flat_gene_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in gene])
            lengths_genes.append(length)
        return np.mean(lengths_genes)

def summarizePeaksForPooledPseudoreplicates(infiles, outfile):
    outf = IOTools.openFile(outfile, "w")
    outf.write("Sample_id\t"
               "Experiment\t"
               "Tissue\t"
               "Condition\t"
               "Pseudoreplicate\t"
               "n_peaks\n")
    IDR.countPeaks(infiles, outf)

def __call__(self, track, slice=None):
    transcript_counts = collections.defaultdict(set)
    counts = []
    for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
        transcript_counts[gtf.gene_id].add(gtf.transcript_id)
    for gene, transcripts in transcript_counts.items():
        counts.append(len(transcripts))
    return counts

def writeContigSizes(genome, outfile):
    '''write contig sizes to outfile for UCSC tools.'''

    outf = IOTools.openFile(outfile, "w")
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], genome))
    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outf.write("%s\t%i\n" % (contig, size))
    outf.close()

def chunk_iterator_regex_group(infile, args, prefix, use_header=False):
    """group by regular expression is true.

    Entries need to be consecutive.
    """

    rex = args[0]
    column = args[1]
    chunk_size = args[2]
    last = None
    header = None
    n = chunk_size
    outfile = None
    filename = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and use_header:
            header = line
            continue

        try:
            this = rex.search(line[:-1]).groups()[0]
        except (IndexError, AttributeError):
            # no match: write the line to the current chunk, if any
            if outfile:
                outfile.write(line)
            continue

        if last != this and n >= chunk_size:
            if last:
                outfile.close()
                yield filename
            last = this
            filename = "%s/%s.in" % (prefix, this)
            outfile = IOTools.openFile(filename, "w")
            if header:
                outfile.write(header)
            n = 0

        outfile.write(line)
        n += 1

    if outfile:
        outfile.close()
        yield filename

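# A hedged usage sketch: args packs a compiled regex whose first group
# identifies the chunk key, plus a column index and chunk size, matching
# the positional access above. File names are illustrative and process
# is a hypothetical downstream step.
#
#   import re
#   rex = re.compile(r"^(\S+)\t")  # group by first column
#   with IOTools.openFile("sorted_input.tsv") as inf:
#       for chunk in chunk_iterator_regex_group(inf, (rex, 0, 1000),
#                                               prefix="tmpdir"):
#           process(chunk)
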
def makeExpressionSummaryPlots(counts_inf, design_inf, logfile):
    '''use the plotting methods for Counts object to make summary plots'''

    with IOTools.openFile(logfile, "w") as log:

        plot_prefix = P.snip(logfile, ".log")

        # need to manually read in data as index column is not the first
        # column
        counts = Counts.Counts(pd.read_table(counts_inf, sep="\t"))
        counts.table.set_index(["transcript_id"], inplace=True)

        design = Expression.ExperimentalDesign(design_inf)

        # make certain counts table only includes samples in design
        counts.restrict(design)

        cor_outfile = plot_prefix + "_pairwise_correlations.png"
        pca_var_outfile = plot_prefix + "_pca_variance.png"
        pca1_outfile = plot_prefix + "_pc1_pc2.png"
        pca2_outfile = plot_prefix + "_pc3_pc4.png"
        heatmap_outfile = plot_prefix + "_heatmap.png"

        counts_log10 = counts.log(base=10, pseudocount=0.1, inplace=False)

        # keep the 500 most highly expressed transcripts for the heatmap
        counts_highExp = counts_log10.clone()
        counts_highExp.table['order'] = counts_highExp.table.apply(
            np.mean, axis=1)
        counts_highExp.table.sort_values(["order"], ascending=False,
                                         inplace=True)
        counts_highExp.table = counts_highExp.table.iloc[0:500, :]
        counts_highExp.table.drop("order", axis=1, inplace=True)

        log.write("plot correlations: %s\n" % cor_outfile)
        counts_log10.plotPairwiseCorrelations(cor_outfile, subset=1000)

        log.write("plot pc1,pc2: %s\n" % pca1_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca1_outfile,
                             x_axis="PC1", y_axis="PC2",
                             colour="group", shape="group")

        log.write("plot pc3,pc4: %s\n" % pca2_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca2_outfile,
                             x_axis="PC3", y_axis="PC4",
                             colour="group", shape="group")

        log.write("plot heatmap: %s\n" % heatmap_outfile)
        counts_highExp.heatmap(heatmap_outfile)