Example #1
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true.
    """

    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]

    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = iotools.openFile(filename, "w")

    for line in infile:

        if line[0] == "#":
            continue
        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = iotools.openFile(filename, "w")
                nlines = 0

            n += 1

        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename
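A hypothetical driver for the generator above (not part of the source; the file and directory names are placeholders, and the cgatcore iotools module used inside the function is assumed to be importable). The args tuple packs the compiled regex at index 0, the chunk size at index 2 and the line cap at index 3; each yielded path names a finished chunk.

import re

with open("records.txt") as infile:
    # rex, unused slot, chunk_size, max_lines; the "chunks" directory must already exist
    args = (re.compile(r"^>"), None, 10, 100000)
    for chunk in chunk_iterator_regex_split(infile, args, prefix="chunks"):
        print("chunk ready:", chunk)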
Example #2
def annotate(infile, outfile, geneset):
    '''
    annotate NOGs into functional categories
    '''
    annotation = {}
    E.info("loading geneset")
    anno = iotools.openFile(geneset)
    for line in anno.readlines():
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = iotools.openFile(infile)
    header = inf.readline()
    outf = iotools.openFile(outfile, "w")
    outf.write(header[:-1] + "\ttaxa\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            pathway = annotation[nog]
        except KeyError:
            pathway = "Function unknown"
        outf.write(line[:-1] + "\t" + pathway + "\n")
    outf.close()
Example #3
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = iotools.openFile(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = iotools.openFile(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)
    outfile.close()
    yield filename
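For comparison, a self-contained sketch of the same split-by-lines idea built on itertools.islice; plain open() stands in for iotools.openFile and the chunk naming simply mirrors the function above.

import itertools
import os

def split_by_lines(path, outdir, chunk_size):
    """Yield the path of each chunk_size-line chunk written from path."""
    with open(path) as infile:
        n = 0
        while True:
            chunk = list(itertools.islice(infile, chunk_size))
            if not chunk:
                break
            filename = os.path.join(outdir, "%010i.in" % n)
            with open(filename, "w") as outfile:
                outfile.writelines(chunk)
            yield filename
            n += len(chunk)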
Example #4
def makeCpgIslandsBed(outfile):
    infile = PARAMS["methylation_summary_cpgislands"]
    out = iotools.openFile(outfile, "w")
    with iotools.openFile(infile, "r") as f:
        for line in f.readlines():
            # this assumes location of req. values
            contig, start, end = line.split()[1:4]
            if contig != "chrom":
                out.write("%s\t%s\t%s\n" % (contig, start, end))
    out.close()
Example #5
def make1basedCpgIslands(infile, outfile):

    # outfile, loadfile = outfiles

    out = iotools.openFile(outfile, "w")
    out.write("%s\t%s\t%s\n" % ("contig", "position", "cpgi"))

    with iotools.openFile(infile, "r") as f:
        lines = f.readlines()
        for line in lines:
            contig, start, stop = line.split()
            for position in range(int(start), int(stop) + 2):
                out.write("%s\t%s\t%s\n" % (contig, position, "CpGIsland"))
    out.close()
Example #6
    def __call__(self, filenames, outfile, options):

        for fi, fn in filenames:
            E.debug("# merging %s" % fn)
            infile = iotools.openFile(fn, "r")

            if options.output_header:
                self.parseHeader(infile, outfile, options)

            for l in infile:
                nfields = l.count("\t")

                if l[0] == "#":
                    options.stdlog.write(l)
                elif self.nfields is not None and nfields != self.nfields:
                    # validate number of fields in row, raise warning
                    # for those not matching and skip.
                    E.warn(
                        "# line %s has unexpected number of fields: %i != %i" %
                        (l[:-1], nfields, self.nfields))
                else:
                    if self.mFieldIndex is not None:
                        data = l[:-1].split("\t")
                        try:
                            data[self.mFieldIndex] = self.mMapper(
                                fi, data[self.mFieldIndex])
                        except IndexError:
                            raise IndexError("can not find field %i in %s" %
                                             (self.mFieldIndex, l))
                        l = "\t".join(data) + "\n"

                    outfile.write(l)
            infile.close()
Example #7
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename,
                                                PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(iotools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
Example #8
def getMappedReads(infile):
    '''return number of reads mapped. '''
    for lines in iotools.openFile(infile, "r"):
        data = lines[:-1].split("\t")
        if data[1].startswith("without duplicates"):
            return int(data[0])
    return
Example #9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    dir2files = {}
    for root, directory, files in os.walk("."):
        dir2files[root] = files

    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    filename = "CWD_%s" % st
    E.info("outputting directory state to %s" % filename)
    with iotools.openFile(filename, "w") as outf:
        outf.write("##contents of cwd on %s\n\n" % st)
        for directory, files in dir2files.items():
            for file in files:
                path = os.path.join(directory, file)
                outf.write(path + "\n")

    # write footer and output benchmark information.
    E.Stop()
Example #10
def buildGeneOntology(infile, outfile):
    '''create an output file akin to GO ontology files to be
    used with GO.py
    '''

    table = P.toTable(infile)
    columns = ("cpg", "tata")
    dbh = connect()
    cc = dbh.cursor()

    outf = iotools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    i = 1
    for c in columns:
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s" %
                   locals())
        outf.write("".join([
            "promotor\t%s\tGO:%07i\twith_%s\tNA\n" % (x[0], i, c) for x in cc
        ]))
        i += 1
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0" %
                   locals())
        outf.write("".join([
            "promotor\t%s\tGO:%07i\twithout_%s\tNA\n" % (x[0], i, c)
            for x in cc
        ]))
        i += 1

    outf.close()
Example #11
def exportPeaksAsBed(infile, outfile):
    '''export peaks as bed files.'''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    if infile.endswith("_macs.load"):
        track = infile[:-len("_macs.load")]
    else:
        track = infile[:-len("_intervals.load")]

    if track.startswith("control"):
        return

    peakwidth = PARAMS["peakwidth"]

    cc = dbhandle.cursor()
    statement = '''SELECT contig, peakcenter - %(peakwidth)i, peakcenter + %(peakwidth)i,
                          interval_id, peakval FROM %(track)s_intervals ORDER by contig, start''' % locals()
    cc.execute(statement)

    outs = iotools.openFile(outfile, "w")

    for result in cc:
        contig, start, end, interval_id, peakval = result
        # peakval is truncated at 1000 as this is the maximum permitted
        # score in a bed file.
        peakval = int(min(peakval, 1000))
        outs.write("%s\t%i\t%i\t%s\t%i\n" %
                   (contig, start, end, str(interval_id), peakval))

    cc.close()
    outs.close()
Example #12
def buildSimpleNormalizedBAM(infiles, outfile, nreads):
    '''normalize a bam file to a given number of reads
       by random sampling.
    '''
    infile, countfile = infiles

    pysam_in = pysam.Samfile(infile, "rb")

    fh = iotools.openFile(countfile, "r")
    readcount = int(fh.read())
    fh.close()

    threshold = float(nreads) / float(readcount)

    pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in)

    # iterate over mapped reads thinning by the threshold
    ninput, noutput = 0, 0
    for read in pysam_in.fetch():
        ninput += 1
        if random.random() <= threshold:
            pysam_out.write(read)
            noutput += 1

    pysam_in.close()
    pysam_out.close()
    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, nreads))
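The same Bernoulli-thinning idea as a standalone sketch, assuming pysam is installed and the input BAM is coordinate-sorted and indexed; file names and read counts are placeholders, not values from the pipeline.

import random
import pysam

def downsample_bam(inbam, outbam, target_reads, total_reads):
    # keep each read independently with probability target/total
    keep = float(target_reads) / total_reads
    with pysam.AlignmentFile(inbam, "rb") as bam_in, \
            pysam.AlignmentFile(outbam, "wb", template=bam_in) as bam_out:
        for read in bam_in.fetch():
            if random.random() <= keep:
                bam_out.write(read)
    pysam.index(outbam)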
Example #13
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    Transcripts need to be renamed because they may use the same
    Cufflinks identifiers as those generated in this analysis. Skip the
    renaming if they already carry an Ensembl id; in that case the set
    is only sorted.
    '''
    inf = iotools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | cgat gtf2gtf
                        --method=sort --sort-order=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''
            zcat %(infile)s | cgat gtf2gtf
            --method=renumber-genes
            --pattern-identifier=%(gene_pattern)s%%i
            | cgat gtf2gtf
            --method=renumber-transcripts
            --pattern-identifier=%(transcript_pattern)s%%i
            | cgat gtf2gtf
            --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''

    P.run()
Example #14
def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    get species-level relative abundances for the simulated
    data. This involves creating maps between different identifiers
    from the NCBI taxonomy, so that the results are comparable
    to the species-level analysis from MetaPhlAn.
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]
    taxa = open(infiles[1])
    header = taxa.readline()
    gi2taxa = collections.defaultdict(list)
    for line in taxa.readlines():
        data = line[:-1].split("\t")
        (gi, strain, species, genus, family,
         order, _class, phylum) = data[:8]
        gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    outf = open(outfile, "w")
    outf.write("level\ttaxa\trelab\n")
    for i in range(len(levels)):
        total = 0
        result = collections.defaultdict(int)
        for fastq in Fastq.iterate(iotools.openFile(infiles[0])):
            total += 1
            gi = fastq.identifier.split("|")[1]
            result[gi2taxa[gi][i]] += 1
        for taxa, value in result.items():
            outf.write("%s\t%s\t%s\n" %
                       (levels[i], taxa, float(value) / total))
    outf.close()
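The relative-abundance arithmetic used above in miniature (illustrative values only, not pipeline data): per-taxon read counts divided by the total number of reads at each level.

counts = {"Escherichia": 300, "Bacteroides": 700}
total = sum(counts.values())
relab = {taxon: n / float(total) for taxon, n in counts.items()}
# {'Escherichia': 0.3, 'Bacteroides': 0.7}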
Example #15
    def __call__(self, filenames, outfile, options):
        for fi, fn in filenames:
            infile = iotools.openFile(fn, "r")
            outfile.write(
                "######### logging output for %s ###################\n" % fi)
            for l in infile:
                outfile.write(l)
            infile.close()
Example #16
def summarizePeaksForPooledPseudoreplicates(infiles, outfile):
    outf = iotools.openFile(outfile, "w")
    outf.write("Sample_id\t"
               "Experiment\t"
               "Tissue\t"
               "Condition\t"
               "Pseudoreplicate\t"
               "n_peaks\n")
    IDR.countPeaks(infiles, outf)
Example #17
def writeContigSizes(genome, outfile):
    '''write contig sizes to outfile for UCSC tools.
    '''

    outf = iotools.openFile(outfile, "w")
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], genome))
    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outf.write("%s\t%i\n" % (contig, size))
    outf.close()
Example #18
def chunk_iterator_regex_group(infile, args, prefix, use_header=False):
    """group by regular expression is true.

    Entries need to be consecutive.
    """

    rex = args[0]
    column = args[1]
    chunk_size = args[2]
    last = None
    header = None
    n = chunk_size
    outfile = None
    filename = None

    for line in infile:

        if line[0] == "#":
            continue

        if not header and use_header:
            header = line
            continue

        try:
            this = rex.search(line[:-1]).groups()[0]
        except (AttributeError, IndexError):
            # no match (AttributeError) or no capturing group (IndexError):
            # keep the line in the current chunk, if any
            if outfile:
                outfile.write(line)
            continue

        if last != this and n >= chunk_size:
            if last:
                outfile.close()
                yield filename

            last = this

            filename = "%s/%s.in" % (prefix, this)
            outfile = iotools.openFile(filename, "w")
            if header:
                outfile.write(header)
            n = 0

        outfile.write(line)
        n += 1

    if outfile:
        outfile.close()
        yield filename
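A hypothetical call to the grouping iterator above (the input file, directory and regex are illustrative; the cgatcore iotools module is assumed to be importable): args packs the compiled regex, the column index and the minimum chunk size, and rows sharing the captured group must already be consecutive in the input.

import re

with open("sorted_by_gene.tsv") as infile:
    args = (re.compile(r"^(\S+)"), 0, 1)  # rex, column, chunk_size
    # the "chunks" directory must already exist
    for chunk in chunk_iterator_regex_group(infile, args, prefix="chunks",
                                            use_header=True):
        print("wrote group file:", chunk)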
Example #19
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets
    '''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed.gz")
        tablename = "%s_intervals" % P.tablequote(track)
        cc = dbhandle.cursor()
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals(
        )
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        idx.append(ix)
        tracks.append(track)
    outs = iotools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

    for bed in Bed.iterator(infile=iotools.openFile(reference, "r")):

        row = []
        for ix in idx:
            try:
                intervals = list(ix.get(bed.contig, bed.start, bed.end))
            except KeyError:
                row.append("")
                continue

            if len(intervals) == 0:
                peakval = ""
            else:
                peakval = str(max([x[2] for x in intervals]))
            row.append(peakval)

        outs.write(str(bed) + "\t" + "\t".join(row) + "\n")

    outs.close()
Example #20
def readHierarchy(mapfile):
    '''
    read the hierarchy into a dictionary
    '''
    hierarchy = collections.defaultdict(list)
    inf = iotools.openFile(mapfile)
    inf.readline()
    for line in inf.readlines():
        data = line.strip("\n").split("\t")
        kingdom = data[0]
        hierarchy[kingdom].extend(data[1:])
    return hierarchy
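Hypothetical usage of readHierarchy (the mapfile layout shown here is an assumption, not documented in the source): a header line followed by tab-separated rows whose first field is the kingdom and whose remaining fields extend that kingdom's list.

# taxon_map.tsv (assumed layout), tab-separated:
# kingdom   level_1         level_2
# Bacteria  Proteobacteria  Gammaproteobacteria
hierarchy = readHierarchy("taxon_map.tsv")
print(hierarchy["Bacteria"])   # ['Proteobacteria', 'Gammaproteobacteria']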
Example #21
def buildMRBed(infile, outfile):
    '''output bed6 file with methylated regions.

    All regions are output, even the insignificant ones.

    The score is the log fold change.
    '''

    outf = iotools.openFile(outfile, "w")
    c = E.Counter()
    for row in csv.DictReader(iotools.openFile(infile), dialect="excel-tab"):
        c.input += 1

        contig, start, end = re.match(r"(.*):(\d+)-(\d+)",
                                      row["interval_id"]).groups()
        c.output += 1
        outf.write("\t".join((contig, start, end, str(c.input),
                              row["lfold"])) + "\n")

    outf.close()

    E.info("%s" % str(c))
Example #22
def findNPeaks(infiles, outfile, params):
    outf = iotools.openFile(outfile, "w")
    outf.write("Tissue\t"
               "Condition\t"
               "Experiment\t"
               "idr_comp\t"
               "sample_1\t"
               "sample_2\t"
               "n_peaks\n")
    idr_threshold = float(params[0])

    # Hack: for only one infile, P.submit returns a string rather than a list
    if isinstance(infiles, str):
        infiles = [infiles]

    for inf in infiles:
        inf_name = P.snip(os.path.basename(inf), "-overlapped-peaks.txt")
        tissue = inf_name.split("-")[0]
        condition = inf_name.split("-")[1]
        experiment = "_".join([tissue, condition])
        sample1, sample2 = inf_name.split("_vs_")
        n_peaks = 0
        header = True
        for line in iotools.openFile(inf):
            if header:
                header = False
                continue
            line = line.split()
            if float(line[10]) <= idr_threshold:
                n_peaks += 1
            else:
                continue
        outf.write(tissue + "\t" + condition + "\t" + experiment + "\t" +
                   inf_name + "\t" + sample1 + "\t" + sample2 + "\t" +
                   str(n_peaks) + "\n")

    outf.close()
Example #23
def countPeaks(infiles, outf):
    """
    Count the number of peaks in each narrowPeak file
    """
    for infile in infiles:
        sample_id = os.path.basename(infile).split("_VS_")[0]
        tissue, condition, replicate = sample_id.split("-")
        experiment = tissue + "_" + condition
        n_peaks = str(len(iotools.openFile(infile).readlines()))
        outf.write("\t".join(
            [sample_id, experiment, tissue, condition, replicate, n_peaks]) +
                   "\n")
    outf.close()
Example #24
    def __call__(self, filenames, outfile, options):
        for fi, fn in filenames:
            infile = iotools.openFile(fn, "r")
            for l in infile:
                if l[0] == "#":
                    options.stdlog.write(l)
                    continue
                elif l[0] == ">":
                    x = re.search(r">(\S+)", l[:-1])
                    id = self.mMapper(fi, x.groups()[0])
                    l = ">%s%s" % (id, l[x.end(0):])
                outfile.write(l)
            infile.close()
Example #25
def aggregateTiledReadCounts(infiles, outfile):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:
    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    This method uses the maximum number of reads found in any interval as the tag count.

    Tiles with no counts will not be output.
    '''

    to_cluster = True

    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$4 );}' ) '''
        % x for x in infiles
    ])
    tmpfile = P.getTempFilename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    tracks = [re.sub(r"\..*", '', os.path.basename(x)) for x in infiles]

    outf = iotools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(genes) == 1, (
            "paste command failed, wrong number of genes per line: '%s'" % line)
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))

    outf.close()

    os.unlink(tmpfile)
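As a point of comparison only, a pure-Python sketch of what the shell paste step assembles: per-track counts keyed on an interval id of the form contig:start-end. Track names and paths are placeholders, and gzip.open stands in for iotools.openFile.

import collections
import gzip

def merge_interval_counts(bedfiles):
    """bedfiles maps track name -> gzipped bed4 file of per-interval counts."""
    counts = collections.defaultdict(dict)
    for track, path in bedfiles.items():
        with gzip.open(path, "rt") as inf:
            for line in inf:
                contig, start, end, count = line.split()[:4]
                counts["%s:%s-%s" % (contig, start, end)][track] = int(count)
    return counts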
Example #26
def summarizeMACSFDR(infiles, outfile):
    '''compile table with peaks that would remain after filtering
    by fdr.
    '''

    fdr_thresholds = numpy.arange(0, 1.05, 0.05)

    outf = iotools.openFile(outfile, "w")
    outf.write("track\t%s\n" % "\t".join(map(str, fdr_thresholds)))

    for infile in infiles:
        called = []
        track = P.snip(os.path.basename(infile), ".macs")
        infilename = infile + "_peaks.xls.gz"
        inf = iotools.openFile(infilename)
        peaks = list(WrapperMACS.iteratePeaks(inf))

        for threshold in fdr_thresholds:
            called.append(len([x for x in peaks if x.fdr <= threshold]))

        outf.write("%s\t%s\n" % (track, "\t".join(map(str, called))))

    outf.close()
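The threshold sweep in miniature (illustrative FDR values only): for each cut-off, count how many peaks fall at or below it.

import numpy

peak_fdrs = [0.01, 0.04, 0.20, 0.60]
fdr_thresholds = numpy.arange(0, 1.05, 0.05)
called = [len([f for f in peak_fdrs if f <= t]) for t in fdr_thresholds]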
Example #27
def normaliseKraken(infile, outfile):
    '''
    normalise kraken counts by nreads/million mapped
    '''
    inf = iotools.openFile(infile)
    header = inf.readline().replace("rel_abundance", "rpm")
    mapped = 0

    # will have to iterate over the file twice
    for line in inf.readlines():
        data = line[:-1].split("\t")
        count = int(data[-1])
        mapped += count
    inf.close()

    inf = iotools.openFile(infile)
    inf.readline()
    outf = iotools.openFile(outfile, "w")
    outf.write(header)
    for line in inf.readlines():
        data = line[:-1].split("\t")
        count = int(data[-1]) / (float(mapped) / 1000000)
        outf.write("\t".join(map(str, data[:-1] + [count])) + "\n")
    outf.close()
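The reads-per-million arithmetic applied above, as a tiny standalone helper (a sketch; the pipeline takes the raw count from the last column of each row).

def to_rpm(count, total_mapped):
    # reads per million = raw count / (total mapped reads / 1e6)
    return count / (total_mapped / 1e6)

assert to_rpm(250, 5000000) == 50.0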
Example #28
def buildQuicksectMask(bed_file):
    '''return a Quicksect object containing the specified regions;
       takes a bed file listing the regions to mask.
    '''
    mask = IndexedGenome.Quicksect()

    n_regions = 0
    for bed in Bed.iterator(iotools.openFile(bed_file)):
        # it is necessary to extend the region to make an accurate mask
        mask.add(bed.contig, (bed.start - 1), (bed.end + 1), 1)
        n_regions += 1

    E.info("Built Quicksect mask for %i regions" % n_regions)

    return mask
Example #29
def buildExpectedCoverageOverGenomes(infiles, outfile):
    '''
    take sequence files and estimate the theoretical
    coverage we would get over the genomes in the
    sample, i.e. at 1X coverage
    '''

    # if paired end then will have to multiply
    # by two
    multiply = False
    if infiles[0].endswith(".fastq.1.gz"):
        multiply = True

    # the theoretical coverage is defined as
    # (read length (L) * no. reads (N)) / genome size (G) (bp)

    # get genome sizes into memory
    genomes = open(infiles[1])
    header = genomes.readline()
    genome_sizes = {}
    for line in genomes.readlines():
        data = line[:-1].split("\t")
        gi = data[0].split("_")[1]
        size = data[1]
        genome_sizes[gi] = size

    # get the expected genome size
    expected_genome_sizes = collections.defaultdict(int)
    E.info("iterating over fastq file")
    for fastq in Fastq.iterate(iotools.openFile(infiles[0])):
        gi = fastq.identifier.split("|")[1]
        expected_genome_sizes[gi] += 1
    E.info("iterating over fastq file: DONE")

    # get the proportion of each genome covered
    outf = open(outfile, "w")
    outf.write("gi\texpected_coverage\n")
    for gi, size in expected_genome_sizes.items():
        if multiply:
            size = size * 2
        if gi not in genome_sizes:
            E.warn("could not find gi no. %s in dictionary" % gi)
            continue
        proportion_coverage = float(size) / float(genome_sizes[gi])
        if proportion_coverage > 1:
            proportion_coverage = 1
        outf.write("%s\t%f\n" % (gi, proportion_coverage))
    outf.close()
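The theoretical-coverage formula quoted in the comment above, as a standalone helper (a sketch; the function itself works from per-genome read counts and only doubles them for paired-end data).

def expected_coverage(read_length, n_reads, genome_size):
    # coverage = (read length L * number of reads N) / genome size G
    return (read_length * n_reads) / float(genome_size)

# e.g. 100 bp reads and 1,000,000 reads over a 5 Mb genome -> 20x
assert expected_coverage(100, 1000000, 5000000) == 20.0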
Example #30
def makeExpressionSummaryPlots(counts_inf, design_inf, logfile):
    ''' use the plotting methods for Counts object to make summary plots'''

    with iotools.openFile(logfile, "w") as log:

        plot_prefix = P.snip(logfile, ".log")

        # need to manually read in data as index column is not the first column
        counts = Counts.Counts(pd.read_table(counts_inf, sep="\t"))
        counts.table.set_index("transcript_id", inplace=True)

        design = Expression.ExperimentalDesign(design_inf)

        # make certain counts table only include samples in design
        counts.restrict(design)

        cor_outfile = plot_prefix + "_pairwise_correlations.png"
        pca_var_outfile = plot_prefix + "_pca_variance.png"
        pca1_outfile = plot_prefix + "_pc1_pc2.png"
        pca2_outfile = plot_prefix + "_pc3_pc4.png"
        heatmap_outfile = plot_prefix + "_heatmap.png"

        counts_log10 = counts.log(base=10, pseudocount=0.1, inplace=False)

        counts_highExp = counts_log10.clone()
        counts_highExp.table['order'] = counts_highExp.table.apply(
            np.mean, axis=1)
        counts_highExp.table.sort_values(
            "order", ascending=False, inplace=True)
        counts_highExp.table = counts_highExp.table.iloc[0:500, :]
        counts_highExp.table.drop("order", axis=1, inplace=True)

        log.write("plot correlations: %s\n" % cor_outfile)
        counts_log10.plotPairwiseCorrelations(cor_outfile, subset=1000)

        log.write("plot pc3,pc4: %s\n" % pca1_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca1_outfile,
                             x_axis="PC1", y_axis="PC2",
                             colour="group", shape="group")

        log.write("plot pc3,pc4: %s\n" % pca2_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca2_outfile,
                             x_axis="PC3", y_axis="PC4",
                             colour="group", shape="group")

        log.write("plot heatmap: %s\n" % heatmap_outfile)
        counts_highExp.heatmap(heatmap_outfile)