def annotate(infile, outfile, geneset):
    '''annotate NOGs into functional categories'''
    annotation = {}
    E.info("loading geneset")
    anno = IOTools.openFile(geneset)
    for line in anno:
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = IOTools.openFile(infile)
    header = inf.readline()
    outf = IOTools.openFile(outfile, "w")
    # the appended column holds the functional category of each NOG
    outf.write(header[:-1] + "\tfunction\n")
    for line in inf:
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            funccat = annotation[nog]
        except KeyError:
            funccat = "Function unknown"
        outf.write(line[:-1] + "\t" + funccat + "\n")
    outf.close()

def filterGTF(gtf, filterstring, tempout):

    if "!=" in filterstring:
        column, value = filterstring.split("!=")
        value = value.split("+")
        filtertype = "notin"

    elif "=" in filterstring:
        column, value = filterstring.split("=")
        value = value.split("+")
        filtertype = "in"

    elif "-in_file-" in filterstring:
        column, value = filterstring.split("-in_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "in_file"

    elif "-notin_file-" in filterstring:
        column, value = filterstring.split("-notin_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "notin_file"

    elif "-morethan-" in filterstring:
        column, value = filterstring.split("-morethan-")
        value = float(value)
        filtertype = "morethan"

    elif "-lessthan-" in filterstring:
        column, value = filterstring.split("-lessthan-")
        value = float(value)
        filtertype = "lessthan"

    else:
        raise ValueError("could not parse filter '%s'" % filterstring)

    gfile = IOTools.openFile(gtf)
    G = GTF.iterator(gfile)

    out = IOTools.openFile(tempout, "w")
    for item in G:
        D = item.asDict()
        D['contig'] = item.contig
        D['source'] = item.source
        D['feature'] = item.feature
        D['start'] = item.start
        D['end'] = item.end
        D['strand'] = item.strand
        D['frame'] = item.frame

        if filtertype == "in" or filtertype == "in_file":
            if D[column] in value:
                out.write("%s\n" % str(item))
        elif filtertype == "notin" or filtertype == "notin_file":
            if D[column] not in value:
                out.write("%s\n" % str(item))
        elif filtertype == "morethan":
            if float(D[column]) > value:
                out.write("%s\n" % str(item))
        elif filtertype == "lessthan":
            if float(D[column]) < value:
                out.write("%s\n" % str(item))

    out.close()
    gfile.close()

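# A minimal sketch of the filterstring mini-language accepted above; the
# file names and attribute names are illustrative only. Values for "=" /
# "!=" filters are joined with "+", numeric filters take a single float:
#
#   filterGTF("in.gtf.gz", "source=protein_coding+lincRNA", "tmp.gtf")
#   filterGTF("in.gtf.gz", "gene_id-in_file-keep_genes.txt", "tmp.gtf")
#   filterGTF("in.gtf.gz", "FPKM-morethan-1.0", "tmp.gtf")
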
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = IOTools.openFile(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)

    outfile.close()
    yield filename

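# A hedged usage sketch: the iterator yields each chunk filename as soon
# as the chunk is complete, so downstream work can start before the input
# is exhausted. "data.tsv", "tmpdir" and the chunk size of 1000 lines are
# illustrative; submit_job is a hypothetical downstream step.
#
#   with IOTools.openFile("data.tsv") as inf:
#       for chunk in chunk_iterator_lines(inf, [1000], prefix="tmpdir",
#                                         use_header=True):
#           submit_job(chunk)
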
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true."""

    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]

    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")

    for line in infile:
        if line[0] == "#":
            continue
        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = IOTools.openFile(filename, "w")
                nlines = 0
            n += 1
        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename

def __call__(self, track, slice=None):

    if slice == "transcript":
        lengths_transcripts = []
        for transcript in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in transcript])
            lengths_transcripts.append(length)
        counts, lower, dx, _ = scipy.stats.cumfreq(
            lengths_transcripts, numbins=40, defaultreallimits=(0, 20000))
        x = np.arange(counts.size) * dx + lower
        return odict((("length", x),
                      ("cumulative frequency",
                       counts / len(lengths_transcripts))))

    elif slice == "gene":
        lengths_genes = []
        for gene in GTF.flat_gene_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in gene])
            lengths_genes.append(length)
        counts, lower, dx, _ = scipy.stats.cumfreq(
            lengths_genes, numbins=40, defaultreallimits=(0, 20000))
        x = np.arange(counts.size) * dx + lower
        return odict((("length", x),
                      ("cumulative frequency",
                       counts / len(lengths_genes))))

def convertGo2Goslim(options):
    """read gene list with GO assignments and convert to GO slim
    categories."""

    E.info("reading GO assignments from stdin")
    gene2gos, go2infos = ReadGene2GOFromFile(options.stdin)
    input_genes, input_goids = countGOs(gene2gos)

    #############################################################
    # read GO ontology from file
    assert options.filename_ontology, "please supply a GO ontology"
    E.info("reading ontology from %s" % (options.filename_ontology))

    infile = IOTools.openFile(options.filename_ontology)
    ontology = readOntology(infile)
    infile.close()

    go2infos = collections.defaultdict(dict)
    # substitute go2infos
    for go in list(ontology.values()):
        go2infos[go.mNameSpace][go.mId] = GOInfo(go.mId,
                                                 go_type=go.mNameSpace,
                                                 description=go.mName)

    E.info("reading GO assignments from %s" % options.filename_slims)
    go_slims = GetGOSlims(IOTools.openFile(options.filename_slims, "r"))

    if options.loglevel >= 1:
        v = set()
        for x in list(go_slims.values()):
            for xx in x:
                v.add(xx)
        E.info("read go slims from %s: go=%i, slim=%i" %
               (options.filename_slims, len(go_slims), len(v)))

    output_goids, output_genes = set(), set()
    noutput = 0

    options.stdout.write("\t".join(("go_type", "gene_id", "go_id",
                                    "description", "evidence")) + "\n")

    for category, gene2go in sorted(gene2gos.items()):
        gene2go = MapGO2Slims(gene2go, go_slims, ontology)
        for gene_id, values in sorted(gene2go.items()):
            output_genes.add(gene_id)
            for go in sorted(values, key=lambda x: x.mGOId):
                output_goids.add(go.mGOId)
                options.stdout.write("%s\t%s\t%s\t%s\t%s\n" %
                                     (go.mGOType,
                                      gene_id,
                                      go.mGOId,
                                      go.mDescription,
                                      "NA"))
                noutput += 1

    E.info(("ninput_genes=%i, ninput_goids=%i, noutput_gene=%i, "
            "noutput_goids=%i, noutput=%i") %
           (len(input_genes), len(input_goids),
            len(output_genes), len(output_goids), noutput))

def __call__(self, track, slice=None):
    c_transcript = []
    c_gene = []
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_transcript.append(len(transcript))
    for gene in GTF.flat_gene_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_gene.append(len(gene))

    return odict((("transcript", np.mean(c_transcript)),
                  ("gene", np.mean(c_gene))))

def makeCpgIslandsBed(outfile):
    infile = PARAMS["methylation_summary_cpgislands"]
    out = IOTools.openFile(outfile, "w")
    with IOTools.openFile(infile, "r") as f:
        for line in f:
            # this assumes a fixed column order for contig, start, end
            contig, start, end = line.split()[1:4]
            # skip the header row
            if contig != "chrom":
                out.write("%s\t%s\t%s\n" % (contig, start, end))
    out.close()

def make1basedCpgIslands(infile, outfile):

    # outfile, loadfile = outfiles
    out = IOTools.openFile(outfile, "w")
    out.write("%s\t%s\t%s\n" % ("contig", "position", "cpgi"))

    with IOTools.openFile(infile, "r") as f:
        for line in f:
            contig, start, stop = line.split()
            for position in range(int(start), int(stop) + 2):
                out.write("%s\t%s\t%s\n" % (contig, position, "CpGIsland"))
    out.close()

def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.
    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks',
              'st_ctime', 'st_dev', 'st_gid', 'st_ino',
              'st_mode', 'st_mtime', 'st_nlink',
              'st_rdev', 'st_size', 'st_uid')

    dry_run = PARAMS.get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = IOTools.openFile(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = IOTools.openFile(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = IOTools.zapFile(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write("%s\t%s\t%s\t%s\n" % (
                    fn,
                    time.asctime(time.localtime(time.time())),
                    linkdest,
                    "\t".join([str(getattr(stat, x)) for x in fields])))

    E.info("zapped: %s" % (c))
    # the logfile is only opened outside of dry runs
    if not dry_run:
        outfile.close()

    return c

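# A hedged usage sketch; the glob pattern and logfile name are
# illustrative. With PARAMS["dryrun"] set, files are only counted, not
# zapped.
#
#   import glob
#   counter = clean(glob.glob("export/*.bam"), "zap.log")
#   E.info("considered %i files" % counter.files)
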
def readDefinitions(filename):
    '''read definitions from a :term:`yaml` file.'''

    with IOTools.openFile(filename) as f:
        # safe_load avoids arbitrary object construction from the yaml file
        config = yaml.safe_load(f)

    if config is None:
        raise IOError("could not read data from '%s'" % filename)

    return config

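# A minimal sketch of a definitions file this would accept; the keys are
# hypothetical, not a required schema:
#
#   # definitions.yml:
#   #   tool: bwa
#   #   options:
#   #       threads: 4
#
#   config = readDefinitions("definitions.yml")
#   # config == {"tool": "bwa", "options": {"threads": 4}}
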
def openOutputFile(section, mode="w"):
    """open file for writing substituting section in the
    output_pattern (if defined).

    If the filename ends with ".gz", the output is opened as a gzip'ed
    file.

    Arguments
    ---------
    section : string
        section will replace any %s in the pattern for output files.
    mode : char
        file opening mode

    Returns
    -------
    File
        an opened file
    """

    fn = getOutputFile(section)
    try:
        if fn == "-":
            return global_options.stdout
        else:
            if not global_options.output_force and os.path.exists(fn):
                raise OSError(
                    ("file %s already exists, use --force-output to "
                     "overwrite existing files.") % fn)
            return IOTools.openFile(fn, mode)
    except AttributeError:
        return global_options.stdout

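# A hedged sketch of the calling convention: with an output pattern such
# as "results_%s.tsv.gz" (hypothetical), each section is written to its
# own gzip'ed file, falling back to stdout when no pattern is set.
#
#   outf = openOutputFile("summary")
#   outf.write("metric\tvalue\n")
#   if outf is not global_options.stdout:
#       outf.close()
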
def __call__(self, track, slice=None):

    classes = ["antisense",
               "antisense_upstream",
               "antisense_downstream",
               "sense_upstream",
               "sense_downstream",
               "intergenic",
               "sense_intronic",
               "antisense_intronic"]

    coding_set = {}
    for gtf in GTF.iterator(
            IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz")):
        coding_set[gtf.transcript_id] = gtf.source

    result = {"noncoding": {}, "coding": collections.defaultdict(int)}
    total_nc = float(self.getValue(
        "SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'"))
    for c in classes:
        result["noncoding"][c] = (float(self.getValue(
            """SELECT COUNT(*) FROM lncrna_final_class as a,
            %s_cpc_result as b
            WHERE a.class = '%s'
            AND b.C_NC = 'noncoding'
            AND a.transcript_id = b.transcript_id""" %
            (track, c))) / total_nc) * 100

    total_c = len(coding_set)
    # the set of coding transcript_ids is the same for every class, so
    # fetch it once
    ids = self.getValues(
        "SELECT transcript_id FROM %(track)s_cpc_result "
        "WHERE C_NC = 'coding'")
    for c in classes:
        for i in ids:
            if i in coding_set and coding_set[i] == c:
                result["coding"][c] += 1

    for x, y in result["coding"].items():
        result["coding"][x] = (float(y) / total_c) * 100

    return result

def buildTrueTaxonomicRelativeAbundances(infiles, outfile):
    '''
    get species level relative abundances for the simulated data.
    This involves creating maps between different identifiers from the
    NCBI taxonomy. This is so that the results are comparable to species
    level analysis from metaphlan
    '''
    levels = ["species", "genus", "family", "order", "class", "phylum"]

    taxa = open(infiles[1])
    header = taxa.readline()
    gi2taxa = collections.defaultdict(list)
    for line in taxa:
        data = line[:-1].split("\t")
        gi, strain, species, genus, family, order, _class, phylum = data[:8]
        gi2taxa[gi] = (species, genus, family, order, _class, phylum)

    outf = open(outfile, "w")
    outf.write("level\ttaxa\trelab\n")
    for i in range(len(levels)):
        total = 0
        result = collections.defaultdict(int)
        for fastq in Fastq.iterate(IOTools.openFile(infiles[0])):
            total += 1
            gi = fastq.identifier.split("|")[1]
            result[gi2taxa[gi][i]] += 1
        for taxa, value in result.items():
            outf.write("%s\t%s\t%s\n" %
                       (levels[i], taxa, float(value) / total))
    outf.close()

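# The split on "|" above assumes NCBI-style read identifiers of the form
# "gi|12345|ref|NC_000001.1|", where field 1 is the GI number; reads
# named differently would need a different parsing rule.
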
def __call__(self, filenames, outfile, options):

    for fi, fn in filenames:
        E.debug("# merging %s" % fn)
        infile = IOTools.openFile(fn, "r")

        if options.output_header:
            self.parseHeader(infile, outfile, options)

        for l in infile:
            nfields = l.count("\t")
            if l[0] == "#":
                options.stdlog.write(l)
            elif self.nfields is not None and nfields != self.nfields:
                # validate number of fields in row, raise warning
                # for those not matching and skip.
                E.warn(
                    "# line %s has unexpected number of fields: %i != %i" %
                    (l[:-1], nfields, self.nfields))
            else:
                if self.mFieldIndex is not None:
                    data = l[:-1].split("\t")
                    try:
                        data[self.mFieldIndex] = self.mMapper(
                            fi, data[self.mFieldIndex])
                    except IndexError:
                        raise IndexError(
                            "can not find field %i in %s" %
                            (self.mFieldIndex, l))
                    l = "\t".join(data) + "\n"
                outfile.write(l)
        infile.close()

def getMappedReads(infile):
    '''return number of reads mapped.'''
    for line in IOTools.openFile(infile, "r"):
        data = line[:-1].split("\t")
        if data[1].startswith("without duplicates"):
            return int(data[0])
    return None

def buildSimpleNormalizedBAM(infiles, outfile, nreads):
    '''normalize a bam file to given number of counts
    by random sampling
    '''
    infile, countfile = infiles

    pysam_in = pysam.Samfile(infile, "rb")

    fh = IOTools.openFile(countfile, "r")
    readcount = int(fh.read())
    fh.close()

    # each read is kept with probability nreads / readcount
    threshold = float(nreads) / float(readcount)

    pysam_out = pysam.Samfile(outfile, "wb", template=pysam_in)

    # iterate over mapped reads thinning by the threshold
    ninput, noutput = 0, 0
    for read in pysam_in.fetch():
        ninput += 1
        if random.random() <= threshold:
            pysam_out.write(read)
            noutput += 1

    pysam_in.close()
    pysam_out.close()
    pysam.index(outfile)

    E.info("buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" %
           (ninput, noutput, 100.0 * noutput / ninput, nreads))

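# Thinning with probability nreads/readcount only hits the target count
# in expectation; the output size is binomially distributed around
# nreads. A sketch of the expected spread, assuming independent draws
# and illustrative numbers:
#
#   import math
#   readcount, nreads = 10000000, 1000000
#   p = float(nreads) / readcount
#   sd = math.sqrt(readcount * p * (1 - p))  # ~949 reads here
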
def exportPeaksAsBed(infile, outfile):
    '''export peaks as bed files.'''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    if infile.endswith("_macs.load"):
        track = infile[:-len("_macs.load")]
    else:
        track = infile[:-len("_intervals.load")]

    if track.startswith("control"):
        return

    peakwidth = PARAMS["peakwidth"]

    cc = dbhandle.cursor()
    statement = '''SELECT contig, peakcenter - %(peakwidth)i,
    peakcenter + %(peakwidth)i, interval_id, peakval
    FROM %(track)s_intervals ORDER by contig, start''' % locals()
    cc.execute(statement)

    outs = IOTools.openFile(outfile, "w")

    for result in cc:
        contig, start, end, interval_id, peakval = result
        # peakval is truncated at 1000 as this is the maximum permitted
        # score in a bed file.
        peakval = int(min(peakval, 1000))
        outs.write("%s\t%i\t%i\t%s\t%i\n" %
                   (contig, start, end, str(interval_id), peakval))

    cc.close()
    outs.close()

def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | cgat gtf2gtf
                        --method=sort --sort-order=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''
            zcat %(infile)s
            | cgat gtf2gtf --method=renumber-genes
            --pattern-identifier=%(gene_pattern)s%%i
            | cgat gtf2gtf --method=renumber-transcripts
            --pattern-identifier=%(transcript_pattern)s%%i
            | cgat gtf2gtf --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''

    P.run()

def buildGeneOntology(infile, outfile):
    '''create an output file akin to GO ontology files to be
    used with GO.py
    '''
    table = P.toTable(infile)
    columns = ("cpg", "tata")
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    i = 1
    for c in columns:
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s" %
                   locals())
        outf.write("".join(["promotor\t%s\tGO:%07i\twith_%s\tNA\n" %
                            (x[0], i, c) for x in cc]))
        i += 1
        cc.execute("SELECT DISTINCT gene_id FROM %(table)s WHERE %(c)s = 0" %
                   locals())
        outf.write("".join(["promotor\t%s\tGO:%07i\twithout_%s\tNA\n" %
                            (x[0], i, c) for x in cc]))
        i += 1

    outf.close()

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    dir2files = {}
    for root, dirnames, files in os.walk("."):
        dir2files[root] = files

    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')
    filename = "CWD_%s" % st
    E.info("outputting directory state to %s" % filename)

    with IOTools.openFile(filename, "w") as outf:
        outf.write("##contents of cwd on %s\n\n" % st)
        for directory, files in dir2files.items():
            for file in files:
                path = os.path.join(directory, file)
                outf.write(path + "\n")

    # write footer and output benchmark information.
    E.Stop()

def ReadGeneLists(filename_genes, gene_pattern=None):
    """read gene lists from filename in matrix.

    returns a tuple (list of all genes, dictionary of gene lists)
    """

    if filename_genes == "-":
        infile = sys.stdin
    else:
        infile = IOTools.openFile(filename_genes, "r")

    headers, table = CSV.readTable(infile.readlines(), as_rows=False)

    if filename_genes != "-":
        infile.close()

    all_genes = table[0]

    # if there is only a single column, add a dummy column
    if len(table) == 1:
        table.append([1] * len(table[0]))
        headers.append("foreground")

    E.info("read %i genes from %s" % (len(all_genes), filename_genes))

    if gene_pattern:
        rx = re.compile(gene_pattern)
        all_genes = [rx.search(x).groups()[0] for x in all_genes]

    gene_lists = collections.OrderedDict()
    for header, col in zip(headers[1:], table[1:]):
        s = list(set([x for x, y in zip(all_genes, col) if y != "0"]))
        gene_lists[header] = set(s)

    return all_genes, gene_lists

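# For illustration, a tab-separated gene list matrix of the kind parsed
# above; a non-zero entry puts the gene in that column's list (the file
# name and column labels are hypothetical):
#
#   gene_id    upregulated    downregulated
#   ENSG01     1              0
#   ENSG02     0              1
#
#   all_genes, gene_lists = ReadGeneLists("lists.tsv")
#   # gene_lists["upregulated"] == {"ENSG01"}
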
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"],
                     PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id FROM
                           (SELECT contig_id, AVG(coverage) as ave
                            FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, fcoverage)
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])

    outf = open(outfile, "w")
    E.info("%i contigs passed the coverage filter" % len(contigs))
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()

def __call__(self, filenames, outfile, options):
    for fi, fn in filenames:
        infile = IOTools.openFile(fn, "r")
        outfile.write(
            "######### logging output for %s ###################\n" % fi)
        for l in infile:
            outfile.write(l)
        infile.close()

def __call__(self, track, slice=None):

    if slice == "transcript":
        lengths_transcripts = []
        for transcript in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in transcript])
            lengths_transcripts.append(length)
        return np.mean(lengths_transcripts)

    elif slice == "gene":
        lengths_genes = []
        for gene in GTF.flat_gene_iterator(
                GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            length = sum([gtf.end - gtf.start for gtf in gene])
            lengths_genes.append(length)
        return np.mean(lengths_genes)

def summarizePeaksForPooledPseudoreplicates(infiles, outfile):
    outf = IOTools.openFile(outfile, "w")
    outf.write("Sample_id\t"
               "Experiment\t"
               "Tissue\t"
               "Condition\t"
               "Pseudoreplicate\t"
               "n_peaks\n")
    IDR.countPeaks(infiles, outf)

def __call__(self, track, slice=None):
    transcript_counts = collections.defaultdict(set)
    counts = []
    for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
        transcript_counts[gtf.gene_id].add(gtf.transcript_id)
    for gene, transcripts in transcript_counts.items():
        counts.append(len(transcripts))
    return counts

def writeContigSizes(genome, outfile):
    '''write contig sizes to outfile for UCSC tools.'''

    outf = IOTools.openFile(outfile, "w")
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], genome))
    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outf.write("%s\t%i\n" % (contig, size))
    outf.close()

def chunk_iterator_regex_group(infile, args, prefix, use_header=False):
    """group by regular expression is true.

    Entries need to be consecutive.
    """

    rex = args[0]
    column = args[1]
    chunk_size = args[2]
    last = None
    header = None
    n = chunk_size
    outfile = None
    filename = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and use_header:
            header = line
            continue

        try:
            this = rex.search(line[:-1]).groups()[0]
        except (IndexError, AttributeError):
            # no match: write the line to the current chunk, if any
            if outfile:
                outfile.write(line)
            continue

        if last != this and n >= chunk_size:
            if last:
                outfile.close()
                yield filename
            last = this
            filename = "%s/%s.in" % (prefix, this)
            outfile = IOTools.openFile(filename, "w")
            if header:
                outfile.write(header)
            n = 0

        outfile.write(line)
        n += 1

    if outfile:
        outfile.close()
        yield filename

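# A hedged usage sketch: args packs a compiled regex whose first group
# identifies the chunk key, plus a column index and chunk size, matching
# the positional access above. File names are illustrative and process
# is a hypothetical downstream step.
#
#   import re
#   rex = re.compile(r"^(\S+)\t")  # group by first column
#   with IOTools.openFile("sorted_input.tsv") as inf:
#       for chunk in chunk_iterator_regex_group(inf, (rex, 0, 1000),
#                                               prefix="tmpdir"):
#           process(chunk)
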
def makeExpressionSummaryPlots(counts_inf, design_inf, logfile):
    '''use the plotting methods for Counts object to make summary plots'''

    with IOTools.openFile(logfile, "w") as log:

        plot_prefix = P.snip(logfile, ".log")

        # need to manually read in data as index column is not the first
        # column
        counts = Counts.Counts(pd.read_table(counts_inf, sep="\t"))
        counts.table.set_index(["transcript_id"], inplace=True)

        design = Expression.ExperimentalDesign(design_inf)

        # make certain counts table only includes samples in design
        counts.restrict(design)

        cor_outfile = plot_prefix + "_pairwise_correlations.png"
        pca_var_outfile = plot_prefix + "_pca_variance.png"
        pca1_outfile = plot_prefix + "_pc1_pc2.png"
        pca2_outfile = plot_prefix + "_pc3_pc4.png"
        heatmap_outfile = plot_prefix + "_heatmap.png"

        counts_log10 = counts.log(base=10, pseudocount=0.1, inplace=False)

        # keep the 500 most highly expressed transcripts for the heatmap
        counts_highExp = counts_log10.clone()
        counts_highExp.table['order'] = counts_highExp.table.apply(
            np.mean, axis=1)
        counts_highExp.table.sort_values(["order"], ascending=False,
                                         inplace=True)
        counts_highExp.table = counts_highExp.table.iloc[0:500, :]
        counts_highExp.table.drop("order", axis=1, inplace=True)

        log.write("plot correlations: %s\n" % cor_outfile)
        counts_log10.plotPairwiseCorrelations(cor_outfile, subset=1000)

        log.write("plot pc1,pc2: %s\n" % pca1_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca1_outfile,
                             x_axis="PC1", y_axis="PC2",
                             colour="group", shape="group")

        log.write("plot pc3,pc4: %s\n" % pca2_outfile)
        counts_log10.plotPCA(design,
                             pca_var_outfile, pca2_outfile,
                             x_axis="PC3", y_axis="PC4",
                             colour="group", shape="group")

        log.write("plot heatmap: %s\n" % heatmap_outfile)
        counts_highExp.heatmap(heatmap_outfile)