def buildPFAMDomains( infiles, outfile ):
    '''map PFAM domains onto current sequence collection.

    The mapping is done by ID lookup: the table ``nrdb50.fasta.tsv``
    (read from the working directory) supplies ``repid`` -> ``nid``.

    :param infiles: sequence of filenames; only the first one (a fasta
        file whose titles look like ``<id>/<start>-<end> <pfam_id>;...;``)
        is read.
    :param outfile: tab-separated output with the columns
        nid, start (zero-based), end, pfam_id. No header is written.

    Entries whose id is not present in the lookup table are counted as
    ``missed`` and skipped. A title that does not match the expected
    pattern raises AttributeError (``rx.match`` returns None).
    '''
    infile = infiles[0]

    with IOTools.openFile("nrdb50.fasta.tsv") as inf:
        reader = csv.DictReader(inf, dialect='excel-tab')
        map_id2nid = dict((row['repid'], row['nid']) for row in reader)

    rx = re.compile(r"(\S+)\/(\d+)-(\d+)\s+(\S+);(.*);")
    c = E.Counter()

    # both handles are managed by ``with`` so that they are released
    # even if a malformed title aborts the loop.
    with IOTools.openFile(outfile, "w") as outf:
        with IOTools.openFile(infile) as inf:
            for entry in FastaIterator.iterate(inf):
                c.input += 1
                pid, start, end, pfam_id, description = rx.match(
                    entry.title).groups()
                try:
                    nid = map_id2nid[pid]
                except KeyError:
                    c.missed += 1
                    continue
                # start is converted from one-based to zero-based
                outf.write("%s\t%i\t%i\t%s\n" %
                           (nid, int(start) - 1, int(end), pfam_id))
                c.output += 1

    E.info(c)
def __call__(self, track, slice = None):
    '''return the mean number of GTF entries per transcript and per gene.

    The file returned by ``self.getFilename(track)`` is read twice, once
    grouped by ``GTF.transcript_iterator`` and once by
    ``GTF.flat_gene_iterator``. *slice* is ignored.

    Returns an ordered dictionary with the keys ``transcript`` and
    ``gene``. With no transcripts/genes ``np.mean`` of an empty list
    (nan) is returned.
    '''
    filename = self.getFilename(track)

    # open the file separately for each pass and close it afterwards
    with IOTools.openFile(filename) as inf:
        c_transcript = [len(transcript) for transcript in
                        GTF.transcript_iterator(GTF.iterator(inf))]

    with IOTools.openFile(filename) as inf:
        c_gene = [len(gene) for gene in
                  GTF.flat_gene_iterator(GTF.iterator(inf))]

    return odict((("transcript", np.mean(c_transcript)),
                  ("gene", np.mean(c_gene))))
def __call__(self, track, slice = None):
    '''return the mean summed interval length per transcript or gene.

    :param track: passed to ``self.getFilename`` to locate the GTF file.
    :param slice: ``"transcript"`` groups entries with
        ``GTF.transcript_iterator``, ``"gene"`` with
        ``GTF.flat_gene_iterator``. Any other value returns None
        without opening the file.

    The length of a group is the sum of ``end - start`` over its entries.
    '''
    if slice == "transcript":
        grouper = GTF.transcript_iterator
    elif slice == "gene":
        grouper = GTF.flat_gene_iterator
    else:
        return None

    with IOTools.openFile(self.getFilename(track)) as inf:
        lengths = [sum(gtf.end - gtf.start for gtf in group)
                   for group in grouper(GTF.iterator(inf))]

    return np.mean(lengths)
def __call__(self, track, slice = None):
    '''return the percentage of transcripts per lncRNA class.

    Returns a dictionary with two entries:

    ``noncoding``
        for each class, the number of transcripts of that class in
        ``lncrna_final_class`` that are flagged ``noncoding`` in
        ``<track>_cpc_result``, as a percentage of all noncoding
        transcripts of the track.

    ``coding``
        for each class that occurs, the number of ``coding`` transcript
        ids whose source in ``gtfs/lncrna_filtered.class.gtf.gz`` equals
        the class, as a percentage of all transcripts in that file.

    Raises ZeroDivisionError if the track has no noncoding transcripts.
    '''
    classes = ["antisense",
               "antisense_upstream",
               "antisense_downstream",
               "sense_upstream",
               "sense_downstream",
               "intergenic",
               "sense_intronic",
               "antisense_intronic"]

    # transcript_id -> class (stored in the source field of the gtf)
    coding_set = {}
    with IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz") as inf:
        for gtf in GTF.iterator(inf):
            coding_set[gtf.transcript_id] = gtf.source

    result = {"noncoding": {}, "coding": collections.defaultdict(int)}

    # NOTE(review): the %(track)s placeholders below are not filled in
    # here - presumably getValue()/getValues() interpolate them; confirm.
    total_nc = float(self.getValue(
        "SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'"))
    for c in classes:
        count = float(self.getValue(
            """SELECT COUNT(*) FROM lncrna_final_class as a, %s_cpc_result as b
            WHERE a.class = '%s' AND b.C_NC = 'noncoding'
            AND a.transcript_id = b.transcript_id""" % (track, c)))
        result["noncoding"][c] = (count / total_nc) * 100

    total_c = len(coding_set)

    # the list of coding ids does not depend on the class: query it once
    # and classify each id with a single dictionary lookup.
    ids = self.getValues(
        "SELECT transcript_id FROM %(track)s_cpc_result WHERE C_NC = 'coding'")
    known_classes = set(classes)
    for i in ids:
        cls = coding_set.get(i)
        if cls in known_classes:
            result["coding"][cls] += 1

    # total_c cannot be 0 here: counts only exist for ids in coding_set
    for x in list(result["coding"]):
        result["coding"][x] = (float(result["coding"][x]) / total_c) * 100

    return result
def buildSummaryCpGCoverage( infiles, outfile ):
    '''build summary of CpG coverage across all tracks.

    For every track in TRACKS, all tables in the attached database
    ``medip_<track>`` whose name contains ``coveredpos`` are read and
    their ``coverage``, ``ncovered`` and ``pcovered`` columns are written
    to *outfile*, prefixed by the track (``metatrack``) and the table
    name (``track``).

    *infiles* is not used.
    '''
    dbh = connect()
    cc = dbh.cursor()

    # the context manager closes the output even if a query fails
    with IOTools.openFile(outfile, "w") as outf:
        outf.write("metatrack\ttrack\tcoverage\tncovered\tpcovered\n")

        for track in TRACKS:
            tables = [x[0] for x in cc.execute(
                """SELECT name FROM medip_%s.sqlite_master
                WHERE type='table' and name LIKE '%%coveredpos%%' """
                % track).fetchall()]

            for table in tables:
                # database and table names are identifiers and cannot be
                # bound as SQL parameters, hence the interpolation.
                statement = """SELECT '%(track)s' as metatrack,
                '%(table)s' as track, coverage, ncovered, pcovered
                FROM medip_%(track)s.%(table)s""" % {"track": track,
                                                     "table": table}
                for row in cc.execute(statement):
                    outf.write("\t".join(map(str, row)) + "\n")
def buildSummaryCalledDMRs( infiles, outfile ):
    '''build summary of differentially methylated regions.

    For every track in TRACKS, all tables in the attached database
    ``medip_<track>`` whose definition mentions both ``control_mean``
    and ``treatment_mean`` are summarised in one output row:

    ntested
        number of rows
    nok
        rows with ``status = 'OK'``
    nsignificant
        rows where ``significant`` is true
    n2fold
        significant rows with ``l2fold`` below -1 or above 1

    *infiles* is not used.
    '''
    dbh = connect()
    cc = dbh.cursor()

    # the context manager closes the output even if a query fails
    with IOTools.openFile(outfile, "w") as outf:
        outf.write("metatrack\ttest\tntested\tnok\tnsignificant\tn2fold\n")

        for track in TRACKS:
            tables = [x[0] for x in cc.execute(
                """SELECT name FROM medip_%s.sqlite_master
                WHERE type='table' and sql LIKE '%%control_mean%%'
                and sql LIKE '%%treatment_mean%%'""" % track).fetchall()]

            for table in tables:
                # identifiers cannot be bound as SQL parameters
                statement = """SELECT COUNT(*) as ntested,
                SUM(CASE WHEN status='OK' THEN 1 ELSE 0 END) AS nok,
                SUM(CASE WHEN significant THEN 1 ELSE 0 END) AS nsignificant,
                SUM(CASE WHEN significant AND (l2fold < -1 OR l2fold > 1) THEN 1 ELSE 0 END) as n2fold
                FROM medip_%(track)s.%(table)s""" % {"track": track,
                                                     "table": table}

                ntested, nok, nsignificant, n2fold = cc.execute(
                    statement).fetchone()

                outf.write("\t".join(map(str, (
                    track, table, ntested, nok, nsignificant, n2fold)))
                    + "\n")
def buildSummaryMapping( infiles, outfile ):
    '''build summary of mapping statistics across all tracks.

    The table ``bam_stats`` of every attached database ``medip_<track>``
    (one per entry in TRACKS) is concatenated into *outfile*, with the
    track prepended as column ``metatrack``. The header is taken from
    the first table; all tables must have identical columns.

    *infiles* is not used. If TRACKS is empty, an empty file is written.
    '''
    dbh = connect()
    cc = dbh.cursor()

    table = "bam_stats"
    colnames = None

    # the context manager closes the output even if a table is missing
    # or the column check below fails
    with IOTools.openFile(outfile, "w") as outf:
        for track in TRACKS:
            statement = """SELECT * FROM medip_%(track)s.%(table)s""" % {
                "track": track, "table": table}
            data = cc.execute(statement).fetchall()
            _colnames = [x[0] for x in cc.description]

            if colnames is None:
                colnames = _colnames
                outf.write("\t".join(["metatrack"] + colnames) + "\n")

            assert colnames == _colnames, \
                "columns of %s differ between tracks" % table

            for row in data:
                outf.write("\t".join(map(str, (track,) + row)) + "\n")
def buildSummaryCpGCoverage(infiles, outfile):
    '''build summary of CpG coverage across all tracks.

    Each table named like ``*coveredpos*`` in the attached database
    ``medip_<track>`` (one database per entry in TRACKS) contributes its
    ``coverage``, ``ncovered`` and ``pcovered`` columns. Rows are written
    to *outfile* together with the track (column ``metatrack``) and the
    table name (column ``track``).

    *infiles* is not used.
    '''
    dbh = connect()
    cc = dbh.cursor()

    find_tables = """SELECT name FROM medip_%s.sqlite_master
    WHERE type='table' and name LIKE '%%coveredpos%%' """

    # identifiers (database/table names) cannot be bound as parameters
    select_rows = """SELECT '%(track)s' as metatrack, '%(table)s' as track,
    coverage, ncovered, pcovered
    FROM medip_%(track)s.%(table)s"""

    # ``with`` guarantees that the output is closed if a query fails
    with IOTools.openFile(outfile, "w") as outf:
        outf.write("metatrack\ttrack\tcoverage\tncovered\tpcovered\n")

        for track in TRACKS:
            tables = [x[0] for x in
                      cc.execute(find_tables % track).fetchall()]
            for table in tables:
                statement = select_rows % {"track": track, "table": table}
                for row in cc.execute(statement):
                    outf.write("\t".join(map(str, row)) + "\n")
def buildSummaryCalledDMRs(infiles, outfile):
    '''build summary of differentially methylated regions.

    Tables are picked from the attached database ``medip_<track>`` of
    each entry in TRACKS if their definition mentions ``control_mean``
    and ``treatment_mean``. One row is written per table with

    * ``ntested``: number of rows,
    * ``nok``: rows with ``status = 'OK'``,
    * ``nsignificant``: rows where ``significant`` is true,
    * ``n2fold``: significant rows with ``l2fold`` < -1 or > 1.

    *infiles* is not used.
    '''
    dbh = connect()
    cc = dbh.cursor()

    find_tables = """SELECT name FROM medip_%s.sqlite_master
    WHERE type='table' and sql LIKE '%%control_mean%%'
    and sql LIKE '%%treatment_mean%%'"""

    # identifiers (database/table names) cannot be bound as parameters
    summarise = """SELECT COUNT(*) as ntested,
    SUM(CASE WHEN status='OK' THEN 1 ELSE 0 END) AS nok,
    SUM(CASE WHEN significant THEN 1 ELSE 0 END) AS nsignificant,
    SUM(CASE WHEN significant AND (l2fold < -1 OR l2fold > 1) THEN 1 ELSE 0 END) as n2fold
    FROM medip_%(track)s.%(table)s"""

    # ``with`` guarantees that the output is closed if a query fails
    with IOTools.openFile(outfile, "w") as outf:
        outf.write("metatrack\ttest\tntested\tnok\tnsignificant\tn2fold\n")

        for track in TRACKS:
            tables = [x[0] for x in
                      cc.execute(find_tables % track).fetchall()]
            for table in tables:
                ntested, nok, nsignificant, n2fold = cc.execute(
                    summarise % {"track": track, "table": table}).fetchone()
                outf.write("\t".join(map(str, (
                    track, table, ntested, nok, nsignificant, n2fold)))
                    + "\n")
def buildSummaryMapping(infiles, outfile):
    '''build summary of mapping statistics across all tracks.

    Concatenates the ``bam_stats`` table of every attached database
    ``medip_<track>`` (one per entry in TRACKS) into *outfile*. The track
    is prepended to each row as column ``metatrack``. The header comes
    from the first table and every further table must have the same
    columns.

    *infiles* is not used. If TRACKS is empty, an empty file is written.
    '''
    dbh = connect()
    cc = dbh.cursor()

    table = "bam_stats"
    colnames = None

    # ``with`` guarantees that the output is closed if a query or the
    # column check fails
    with IOTools.openFile(outfile, "w") as outf:
        for track in TRACKS:
            statement = """SELECT * FROM medip_%(track)s.%(table)s""" % {
                "track": track, "table": table}
            data = cc.execute(statement).fetchall()
            _colnames = [x[0] for x in cc.description]

            if colnames is None:
                # first track: remember the columns and emit the header
                colnames = _colnames
                outf.write("\t".join(["metatrack"] + colnames) + "\n")

            assert colnames == _colnames, \
                "columns of %s differ between tracks" % table

            for row in data:
                outf.write("\t".join(map(str, (track,) + row)) + "\n")
def __call__(self, track, slice=None):
    '''return a peak shape matrix, sub-sampled to at most ``self.scale`` rows.

    The matrix is read from
    ``DATADIR/replicated_intervals/<track>.peakshape.gz.matrix_<slice>.gz``.

    Returns None if the file does not exist or contains no rows,
    otherwise an ordered dictionary with the keys ``matrix``, ``rows``
    and ``columns``.
    '''
    fn = os.path.join(
        DATADIR,
        "replicated_intervals/%(track)s.peakshape.gz.matrix_%(slice)s.gz" %
        {"track": track, "slice": slice})
    if not os.path.exists(fn):
        return

    # close the file as soon as the matrix has been read
    with IOTools.openFile(fn) as inf:
        matrix, rownames, colnames = IOTools.readMatrix(inf)

    nrows = len(rownames)
    if nrows == 0:
        return

    if nrows > self.scale:
        # evenly spaced row indices; the step uses nrows + 1 so that
        # the largest index stays below nrows
        take = numpy.array(numpy.floor(
            numpy.arange(0, nrows, float(nrows + 1) / self.scale)),
            dtype=int)
        rownames = [rownames[i] for i in take]
        matrix = matrix[take]

    return odict((('matrix', matrix),
                  ('rows', rownames),
                  ('columns', colnames)))
def getReferenceLincRNA(self, reference_gtf):
    '''return the number of distinct gene ids with source ``lincRNA``.

    :param reference_gtf: filename of the GTF file to scan.
    '''
    # a set gives constant-time membership tests; the original list
    # made the scan quadratic in the number of genes
    lincs = set()
    with IOTools.openFile(reference_gtf) as inf:
        for entry in GTF.iterator(inf):
            if entry.source == "lincRNA":
                lincs.add(entry.gene_id)
    return len(lincs)
def __call__(self, track, slice=None):
    '''return the matrix stored in ``ortholog_pairs_with_feature.matrix2``.

    The file is looked up in the current working directory; *track* and
    *slice* are ignored. Returns None if the file does not exist,
    otherwise an ordered dictionary with the keys ``matrix``, ``rows``
    and ``columns``.
    '''
    fn = "ortholog_pairs_with_feature.matrix2"
    if not os.path.exists(fn):
        return

    # close the file as soon as the matrix has been read
    with IOTools.openFile(fn) as inf:
        matrix, rownames, colnames = IOTools.readMatrix(inf)

    return odict((("matrix", matrix),
                  ("rows", rownames),
                  ("columns", colnames)))
def __call__(self, track, slice=None):
    '''read ``ortholog_pairs_with_feature.matrix2`` from the working directory.

    *track* and *slice* are not used.

    Returns None when the file is absent. Otherwise returns an ordered
    dictionary holding the matrix (``matrix``), its row names (``rows``)
    and its column names (``columns``).
    '''
    fn = "ortholog_pairs_with_feature.matrix2"
    if not os.path.exists(fn):
        return

    # the handle is only needed while the matrix is parsed
    with IOTools.openFile(fn) as inf:
        matrix, rownames, colnames = IOTools.readMatrix(inf)

    return odict(
        (('matrix', matrix), ('rows', rownames), ('columns', colnames)))
def __call__(self,track, slice = None):
    '''return the number of distinct transcripts for each gene.

    The GTF file returned by ``self.getFilename(track)`` is scanned and
    transcript ids are collected per gene id. *slice* is ignored.

    Returns a list with one count per gene, in dictionary order.
    '''
    transcript_counts = collections.defaultdict(set)

    with IOTools.openFile(self.getFilename(track)) as inf:
        for gtf in GTF.iterator(inf):
            transcript_counts[gtf.gene_id].add(gtf.transcript_id)

    # values() works on both python 2 and 3 (iteritems does not)
    return [len(transcripts) for transcripts in transcript_counts.values()]
def checkBlastRuns( infiles, outfile ):
    '''check if output files are complete.

    For every ``<chunk>.blast.gz`` file in *infiles* one row is written
    to *outfile* containing

    * the first and last query in the corresponding ``<chunk>.fasta``,
    * the smallest and largest query id found in the results, the
      number of distinct query ids and the number of result lines,
    * whether the last run recorded in ``<infile>.log`` has finished,
      the number of runs in the log and the fields of the last run.

    The first line of each result file is skipped, as are lines
    starting with ``#//``.

    Raises ValueError if a result file contains no result lines
    (``min()`` of an empty set) and IndexError if a log file contains
    no runs.
    '''
    # the context manager closes the output even if one chunk fails
    with IOTools.openFile(outfile, "w") as outf:
        outf.write(
            "chunkid\tquery_first\tquery_last\tfound_first\tfound_last\tfound_total\tfound_results\thas_finished\tattempts\t%s\n" %
            "\t".join(Logfile.RuntimeInformation._fields))

        for infile in infiles:
            E.debug("processing %s" % infile)
            chunkid = P.snip(os.path.basename(infile), ".blast.gz")
            logfile = infile + ".log"
            chunkfile = P.snip(infile, ".blast.gz") + ".fasta"

            with IOTools.openFile(infile) as inf:
                # skip the first line of the result file
                inf.readline()
                ids = set()
                total_results = 0
                for l in inf:
                    if l.startswith("#//"):
                        continue
                    ids.add(int(l.split("\t")[0]))
                    total_results += 1

            found_first = min(ids)
            found_last = max(ids)
            found_total = len(ids)

            # strip the leading ">" and the trailing newline
            l = IOTools.getFirstLine(chunkfile)
            query_first = l[1:-1]

            # second to last line is the title of the last sequence
            l2 = IOTools.getLastLine(chunkfile, nlines=2).split("\n")
            query_last = l2[0][1:]

            logresults = Logfile.parse(logfile)
            last_run = logresults[-1]

            outf.write("\t".join(map(str, (
                chunkid,
                query_first,
                query_last,
                found_first,
                found_last,
                found_total,
                total_results,
                last_run.has_finished,
                len(logresults),
                "\t".join(map(str, last_run))))) + "\n")
def __call__(self, track, slice = None):
    '''return the cumulative frequency of transcript or gene lengths.

    :param track: passed to ``self.getFilename`` to locate the GTF file.
    :param slice: ``"transcript"`` groups entries with
        ``GTF.transcript_iterator``, ``"gene"`` with
        ``GTF.flat_gene_iterator``. Any other value returns None
        without opening the file.

    The length of a group is the sum of ``end - start`` over its
    entries. Lengths are binned into 40 bins between 0 and 20000.

    Returns an ordered dictionary with the lower bin boundaries
    (``length``) and the cumulative counts divided by the number of
    groups (``cumulative frequency``).
    '''
    if slice == "transcript":
        grouper = GTF.transcript_iterator
    elif slice == "gene":
        grouper = GTF.flat_gene_iterator
    else:
        return None

    with IOTools.openFile(self.getFilename(track)) as inf:
        lengths = [sum(gtf.end - gtf.start for gtf in group)
                   for group in grouper(GTF.iterator(inf))]

    counts, lower, dx, _ = scipy.stats.cumfreq(
        lengths, numbins=40, defaultreallimits=(0, 20000))
    x = np.arange(counts.size) * dx + lower

    return odict((("length", x),
                  ("cumulative frequency", counts / len(lengths))))
def __call__(self,track, slice = None):
    '''return the cumulative frequency of transcripts per gene.

    The GTF file returned by ``self.getFilename(track)`` is scanned and
    the number of distinct transcript ids is counted for each gene id.
    Counts are binned into 40 bins between 1 and 15. *slice* is ignored.

    Returns an ordered dictionary with the lower bin boundaries
    (``transcript number``) and the cumulative counts divided by the
    number of genes (``cumulative frequency``).
    '''
    transcript_counts = collections.defaultdict(set)

    with IOTools.openFile(self.getFilename(track)) as inf:
        for gtf in GTF.iterator(inf):
            transcript_counts[gtf.gene_id].add(gtf.transcript_id)

    # values() works on both python 2 and 3 (iteritems does not)
    counts = [len(transcripts) for transcripts in transcript_counts.values()]

    count, lower, dx, _ = scipy.stats.cumfreq(
        counts, numbins=40, defaultreallimits=(1, 15))
    x = np.arange(count.size) * dx + lower

    return odict((("transcript number", x),
                  ("cumulative frequency", count / len(counts))))
def buildNrdb50( infile, outfile ):
    '''build nrdb50

    Renumber sequences.

    Every entry of the fasta file *infile* receives a consecutive
    numerical identifier (``nid``), starting at 1. Two files are
    written:

    *outfile*
        fasta file with the nid as title.

    *outfile*.tsv
        table mapping the nid to the fields parsed from the original
        title (cluster name, description, cluster size, taxon, repid)
        and to the hash of the sequence computed by ``computeHID``.

    A title that does not match the expected
    ``<name> <description> n=<size> Tax=<taxon> RepID=<id>`` layout
    raises AttributeError.
    '''
    rx = re.compile(r"(\S+) (.*) n=(\d+) Tax=(.*) RepID=(\S+)")

    # all three handles are released even if a title fails to parse
    with IOTools.openFile(outfile, "w") as outf_fasta:
        with IOTools.openFile(outfile + ".tsv", "w") as outf_table:
            outf_table.write(
                "nid\tpid\thid\tdescription\tcluster_size\ttaxon\trepid\n")

            with IOTools.openFile(infile) as inf:
                for nid, entry in enumerate(FastaIterator.iterate(inf), 1):
                    outf_fasta.write(">%i\n%s\n" % (nid, entry.sequence))
                    cluster_name, description, cluster_size, taxon, repid = \
                        rx.match(entry.title).groups()
                    hid = computeHID(entry.sequence)
                    outf_table.write("\t".join(
                        (str(nid), cluster_name, hid, description,
                         cluster_size, taxon, repid)) + "\n")
def getNumColumns( filename ):
    '''count the tab-separated fields of the first data line in a bed-file.

    Lines starting with ``#`` or ``track`` are not data lines and are
    passed over. The last character of the data line (the line
    terminator) is dropped before the line is split.

    Returns 0 if the file contains no data line.
    '''
    with IOTools.openFile(filename) as handle:
        for record in handle:
            if record.startswith(("#", "track")):
                continue
            fields = record[:-1].split("\t")
            return len(fields)
    return 0
def getNumColumns(filename):
    '''return the number of columns of a bed-file.

    Only the first line that starts with neither ``#`` nor ``track`` is
    inspected: its final character is removed and the remainder is
    split at tabs.

    Returns 0 if there is no such line.
    '''
    with IOTools.openFile(filename) as inf:
        entries = (text for text in inf
                   if not text.startswith("#")
                   and not text.startswith("track"))
        first_entry = next(entries, None)

    if first_entry is None:
        return 0
    return len(first_entry[:-1].split("\t"))
def checkBlastRun( infiles, outfile ):
    '''build summary stats on file.

    :param infiles: tuple of (pairsdb links file, fasta file). Titles of
        the fasta file are numerical ids; the first two columns of the
        links file are the query and sbjct id.
    :param outfile: receives a table of counts (``nids``, ``links``,
        ``self``, ``queries``, ``sbjcts``).

    Two additional files are written: *outfile*.missing_queries.gz and
    *outfile*.missing_sbjcts.gz list the ids of the fasta file that
    never appear as query or sbjct, respectively.

    The first line of the links file is skipped, as are lines starting
    with ``#//``.
    '''
    pairsdbfile, seqfile = infiles

    nids = set()
    with IOTools.openFile(seqfile) as inf:
        for r in FastaIterator.iterate(inf):
            nids.add(int(r.title))

    query_ids, sbjct_ids = set(), set()
    total_results, self_links = 0, 0
    with IOTools.openFile(pairsdbfile) as inf:
        # Skip the first line once, as checkBlastRuns does. The
        # original called readline() inside the loop, which threw away
        # every other line and failed at the end of the file.
        inf.readline()
        for l in inf:
            if l.startswith("#//"):
                continue
            query_id, sbjct_id = l.split("\t")[:2]
            query_ids.add(int(query_id))
            sbjct_ids.add(int(sbjct_id))
            if query_id == sbjct_id:
                self_links += 1
            total_results += 1

    with IOTools.openFile(outfile, "w") as outf:
        outf.write("category\tcounts\n")
        for category, count in (('nids', len(nids)),
                                ('links', total_results),
                                ('self', self_links),
                                ('queries', len(query_ids)),
                                ('sbjcts', len(sbjct_ids))):
            outf.write("\t".join(map(str, (category, count))) + "\n")

    for suffix, found in (('.missing_queries.gz', query_ids),
                          ('.missing_sbjcts.gz', sbjct_ids)):
        with IOTools.openFile(outfile + suffix, 'w') as outf:
            outf.write('nid\n')
            outf.write("\n".join(
                map(str, sorted(nids.difference(found)))) + "\n")
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome

    One row (genome name, sequence length) is written per fasta entry
    in *infile*. The genome name is the basename of *infile* without
    the suffix ``.fna``, so a file with several entries yields several
    rows with the same name.
    '''
    with open(outfile, "w") as outf:
        outf.write("genome\tlength\n")
        with IOTools.openFile(infile) as inf:
            # assume single fasta entry
            for fasta in FastaIterator.iterate(inf):
                name = P.snip(os.path.basename(infile), ".fna")
                outf.write("%s\t%s\n" % (name, str(len(fasta.sequence))))
def buildPFAMFamilies( infiles, outfile ):
    '''build a table of PFAM families.

    The second file in *infiles* is scanned for the annotation lines
    ``#=GF AC`` (accession), ``#=GF ID`` (short name) and ``#=GF DE``
    (description). One row per accession is written to *outfile* with
    the columns family, short and description.

    An input without any ``#=GF AC`` line produces a table with a
    header only.
    '''
    infile = infiles[1]

    family, description, short = None, None, None
    c = E.Counter()

    with IOTools.openFile(outfile, "w") as outf:
        outf.write("family\tshort\tdescription\n")

        with IOTools.openFile(infile) as inf:
            # NOTE(review): a record is flushed when the next "#=GF AC"
            # line is seen. If "#=GF ID" precedes "#=GF AC" within a
            # record, the short name of the following record has
            # already been read at that point - confirm the field order
            # of the input.
            for line in inf:
                if line.startswith("#=GF AC"):
                    if family:
                        # fields are written in header order
                        outf.write("%s\n" % "\t".join(
                            (family, short, description)))
                        c.output += 1
                    family = re.match(
                        r"#=GF AC\s+(\S+)", line[:-1]).groups()[0]
                elif line.startswith("#=GF DE"):
                    description = re.match(
                        r"#=GF DE\s+(.+)", line[:-1]).groups()[0]
                elif line.startswith("#=GF ID"):
                    short = re.match(
                        r"#=GF ID\s+(.+)", line[:-1]).groups()[0]

        # flush the last record, if there was any
        if family:
            outf.write("%s\n" % "\t".join((family, short, description)))
            c.output += 1

    E.info(c)
def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases that are aligned to
    the genomes

    One row (genome, size) is written per file in *infiles*. The genome
    name is the basename of the file without the suffix ``.bed.gz``; the
    size is the sum of ``end - start`` over all intervals in the file
    (overlapping intervals are counted repeatedly).
    '''
    with open(outfile, "w") as outf:
        outf.write("genome\tsize\n")
        for infile in infiles:
            genome = P.snip(os.path.basename(infile), ".bed.gz")
            # close each bed file once it has been summed up
            with IOTools.openFile(infile) as inf:
                c = sum(bed.end - bed.start for bed in Bed.iterator(inf))
            outf.write("%s\t%s\n" % (genome, str(c)))
def buildMatrixFromTables( infiles, column, column_header = 0, dtype = float, default = None ):
    '''build a matrix from a column called *column* in a series of input files.

    The row name of each value is taken from *column_header*. If
    *column_header* is None, the first column is used. If it is an
    integer that is not a column label of the table, it is used as a
    column position, so the default of 0 selects the first column.

    The columns of the matrix are given by the order of the input files.

    *dtype* and *default* are passed on to ``buildMatrixFromLists``.
    The default *dtype* is the builtin ``float``; the alias
    ``numpy.float`` is no longer provided by current numpy releases.

    returns matrix, row_headers
    '''
    lists = []
    for infile in infiles:
        with IOTools.openFile(infile) as inf:
            data = pandas.read_table(inf)

        if column_header is None:
            headers = data.iloc[:, 0]
        elif isinstance(column_header, int) and \
                column_header not in data.columns:
            # positional fallback, a label lookup would raise KeyError
            headers = data.iloc[:, column_header]
        else:
            headers = data[column_header]

        # materialize the pairs: zip() is lazy on python 3
        lists.append(list(zip(list(headers), list(data[column]))))

    return buildMatrixFromLists(lists, dtype=dtype, default=default)
def buildTrueTaxonomicRelativeAbundances(infile, outfile):
    '''
    get species level relative abundances for the simulated data.

    This involves creating maps between different identifiers from the
    NCBI taxonomy, so that the results are comparable to the species
    level analysis from metaphlan.

    The gi of each read is the second ``|``-separated field of the
    fastq identifier. Each gi is mapped gi -> taxid -> species_id ->
    scientific name using the tables ``gi_taxid_nucl``, ``categories``
    and ``names`` of the database ``PARAMS["database"]``.

    The output has the columns ``species_name`` (spaces replaced by
    underscores) and ``relab`` (fraction of reads).

    The gi_taxid_nucl table is huge and therefore this function takes
    an age to run - can think of optimising this somehow.

    Raises TypeError if one of the lookups returns no row.
    '''
    total = 0
    rel_abundance = collections.defaultdict(int)
    with IOTools.openFile(infile) as inf:
        for fastq in Fastq.iterate(inf):
            total += 1
            gi = fastq.identifier.split("|")[1]
            rel_abundance[gi] += 1

    # convert read counts to fractions
    for gi in list(rel_abundance):
        rel_abundance[gi] = float(rel_abundance[gi]) / total

    result = collections.defaultdict(float)
    dbh = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbh.cursor()
        for gi, abundance in rel_abundance.items():
            E.info("processing gi %s" % gi)

            # Values are bound as parameters: the gi comes straight
            # from the read identifiers. They are bound as strings to
            # keep the comparison identical to the quoted literals used
            # previously.
            taxid = cc.execute(
                """SELECT taxid FROM gi_taxid_nucl WHERE gi == ?""",
                (str(gi),)).fetchone()[0]
            species_id = cc.execute(
                """SELECT species_id FROM categories WHERE taxid == ?""",
                (str(taxid),)).fetchone()[0]
            species_name = cc.execute(
                """SELECT taxname FROM names WHERE taxid == ?
                AND description == 'scientific name'""",
                (str(species_id),)).fetchone()[0]

            E.info("mapped gi %s to taxid: %s, species_id: %s, species_name: %s" %
                   (str(gi), str(taxid), str(species_id), species_name))
            result[species_name] += abundance
    finally:
        dbh.close()

    with open(outfile, "w") as outf:
        outf.write("species_name\trelab\n")
        for species_name, abundance in result.items():
            # create names consistent with metaphlan
            species_name = species_name.replace(" ", "_")
            outf.write("%s\t%f\n" % (species_name, abundance))
def __call__(self, track):
    '''return transcript lengths (log10) together with their CP_score.

    Lengths are computed from ``gtfs/lncrna_filtered.gtf.gz`` as the sum
    of ``end - start`` over all entries of a transcript. Scores are read
    from the table ``lncrna_filtered_cpc_result`` in the database
    ``csvdb``. Both paths are relative to the working directory;
    *track* is ignored.

    Returns a dictionary with the parallel lists ``length`` and
    ``score``.

    Raises KeyError if a transcript in the gtf file has no score.
    '''
    length = {}
    with IOTools.openFile("gtfs/lncrna_filtered.gtf.gz") as inf:
        for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
            length[transcript[0].transcript_id] = sum(
                gtf.end - gtf.start for gtf in transcript)

    dbh = sqlite3.connect("csvdb")
    try:
        cc = dbh.cursor()
        score = dict(
            (data[0], data[1]) for data in cc.execute(
                "SELECT transcript_id, CP_score FROM lncrna_filtered_cpc_result"))
    finally:
        dbh.close()

    result = {"length": [], "score": []}
    for transcript, value in length.items():
        result["length"].append(np.log10(value))
        result["score"].append(score[transcript])

    return result
def __call__(self, track, slice = None):
    '''return a peak shape matrix, sub-sampled to at most 1000 rows.

    The matrix is read from
    ``DATADIR/<track>.peakshape.tsv.gz.matrix_<slice>.gz``.

    Returns None if the file does not exist or contains no rows,
    otherwise an ordered dictionary with the keys ``matrix``, ``rows``
    and ``columns``.
    '''
    max_rows = 1000

    fn = os.path.join(
        DATADIR,
        "%(track)s.peakshape.tsv.gz.matrix_%(slice)s.gz" %
        {"track": track, "slice": slice})
    if not os.path.exists(fn):
        return

    # close the file as soon as the matrix has been read
    with IOTools.openFile(fn) as inf:
        matrix, rownames, colnames = IOTools.readMatrix(inf)

    nrows = len(rownames)
    if nrows == 0:
        return

    if nrows > max_rows:
        # Evenly spaced row indices. The step is computed in floating
        # point: an integer step (nrows / 1000 on python 2) is 1 for
        # anything below 2000 rows and thus does not reduce the matrix.
        # Using nrows + 1 keeps the largest index below nrows.
        take = numpy.array(numpy.floor(
            numpy.arange(0, nrows, float(nrows + 1) / max_rows)),
            dtype=int)
        rownames = [rownames[i] for i in take]
        matrix = matrix[take]

    return odict((('matrix', matrix),
                  ('rows', rownames),
                  ('columns', colnames)))
def __call__(self, track, slice = None):
    '''return a peak shape matrix, sub-sampled to at most ``self.scale`` rows.

    The matrix is read from
    ``DATADIR/liver_vs_testes/<track><self.pattern>.matrix_<slice>.gz``.

    Returns None if the file does not exist or contains no rows,
    otherwise an ordered dictionary with the keys ``matrix``, ``rows``
    and ``columns``.
    '''
    fn = os.path.join(
        DATADIR,
        "liver_vs_testes/%(track)s%(pattern)s.matrix_%(slice)s.gz" %
        {"track": track, "pattern": self.pattern, "slice": slice})
    if not os.path.exists(fn):
        return

    # close the file as soon as the matrix has been read
    with IOTools.openFile(fn) as inf:
        matrix, rownames, colnames = IOTools.readMatrix(inf)

    nrows = len(rownames)
    if nrows == 0:
        return

    if nrows > self.scale:
        # evenly spaced row indices; the step uses nrows + 1 so that
        # the largest index stays below nrows
        take = numpy.array(numpy.floor(
            numpy.arange(0, nrows, float(nrows + 1) / self.scale)),
            dtype=int)
        rownames = [rownames[i] for i in take]
        matrix = matrix[take]

    return odict((('matrix', matrix),
                  ('rows', rownames),
                  ('columns', colnames)))
def buildMatrixFromTables(infiles, column, column_header=0,
                          dtype=float, default=None):
    '''build a matrix from a column called *column* in a series of input files.

    Row names are read from *column_header*:

    * None selects the first column of each table,
    * an integer that is not a column label is interpreted as a column
      position (the default of 0 thus selects the first column),
    * anything else is used as a column label.

    The columns of the matrix are given by the order of the input files.

    *dtype* and *default* are handed to ``buildMatrixFromLists``.
    ``float`` replaces the former default ``numpy.float``, an alias of
    the builtin that current numpy releases do not provide any more.

    returns matrix, row_headers
    '''
    lists = []
    for infile in infiles:
        with IOTools.openFile(infile) as inf:
            data = pandas.read_table(inf)

        by_position = column_header is None or (
            isinstance(column_header, int)
            and column_header not in data.columns)
        if by_position:
            # a label lookup of a position would raise KeyError
            headers = data.iloc[:, column_header or 0]
        else:
            headers = data[column_header]

        # zip() is lazy on python 3, store real lists
        lists.append(list(zip(list(headers), list(data[column]))))

    return buildMatrixFromLists(lists, dtype=dtype, default=default)
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects two positional arguments: a bed file with exon coordinates
    (the name field holds the gene id) and a bam file. For each gene the
    number of distinct reads overlapping its exons and the summed exon
    length are written to stdout.

    Entries of the same gene must be adjacent in the bed file, otherwise
    the gene is reported several times.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) < 2:
        raise ValueError(
            "please specify a bed file with coordinates and a bam file")

    coords_file = args[0]
    bamfile = pysam.Samfile(args[1], 'rb')

    try:
        options.stdout.write("gene_id\tcounts\tlength\n")

        with IOTools.openFile(coords_file) as inf:
            for gene_id, exons in itertools.groupby(
                    Bed.iterator(inf), lambda x: x.name):

                # a read is identified by its name and its mate, so that
                # reads overlapping several exons are counted once
                anames = set()
                lgene = 0

                for bed in exons:
                    lgene += bed.end - bed.start
                    for alignedread in bamfile.fetch(
                            bed.contig, bed.start, bed.end):
                        anames.add((alignedread.qname,
                                    alignedread.is_read1))

                options.stdout.write("\t".join(
                    (gene_id, str(len(anames)), str(lgene))) + "\n")
    finally:
        bamfile.close()

    # write footer and output benchmark information.
    E.Stop()
def buildTrueTaxonomicRelativeAbundances(infile, outfile):
    '''
    get species level relative abundances for the simulated data.

    This involves creating maps between different identifiers from the
    NCBI taxonomy, so that the results are comparable to the species
    level analysis from metaphlan.

    Reads are assigned to a gi (second ``|``-separated field of the
    fastq identifier). The tables ``gi_taxid_nucl``, ``categories`` and
    ``names`` of the database ``PARAMS["database"]`` translate the gi
    into a taxid, a species_id and finally a scientific name.

    *outfile* receives the columns ``species_name`` (spaces replaced by
    underscores) and ``relab`` (fraction of reads).

    The gi_taxid_nucl table is huge and therefore this function takes
    an age to run - can think of optimising this somehow.

    Raises TypeError if one of the lookups returns no row.
    '''
    total = 0
    rel_abundance = collections.defaultdict(int)
    with IOTools.openFile(infile) as inf:
        for fastq in Fastq.iterate(inf):
            total += 1
            gi = fastq.identifier.split("|")[1]
            rel_abundance[gi] += 1

    # counts -> fractions; iterate over a copy of the keys
    for gi in list(rel_abundance):
        rel_abundance[gi] = float(rel_abundance[gi]) / total

    def _lookup(cursor, statement, value):
        # Return the first column of the first row. The value is bound
        # as a string parameter, which compares exactly like the quoted
        # literal that was interpolated previously.
        return cursor.execute(statement, (str(value),)).fetchone()[0]

    result = collections.defaultdict(float)
    dbh = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbh.cursor()
        for gi, abundance in rel_abundance.items():
            E.info("processing gi %s" % gi)
            taxid = _lookup(
                cc,
                """SELECT taxid FROM gi_taxid_nucl WHERE gi == ?""",
                gi)
            species_id = _lookup(
                cc,
                """SELECT species_id FROM categories WHERE taxid == ?""",
                taxid)
            species_name = _lookup(
                cc,
                """SELECT taxname FROM names WHERE taxid == ?
                AND description == 'scientific name'""",
                species_id)
            E.info("mapped gi %s to taxid: %s, species_id: %s, species_name: %s" %
                   (str(gi), str(taxid), str(species_id), species_name))
            result[species_name] += abundance
    finally:
        dbh.close()

    with open(outfile, "w") as outf:
        outf.write("species_name\trelab\n")
        for species_name, abundance in result.items():
            # create names consistent with metaphlan
            species_name = species_name.replace(" ", "_")
            outf.write("%s\t%f\n" % (species_name, abundance))
def main(argv = None):
    '''script main.

    Builds the command line parser, creates a ``CBioPortal`` instance
    and runs every query command named in the positional arguments
    (e.g. ``getCancerStudies``, ``getProfileData``). The results are
    written to ``--output_file`` or to stdout.

    Exits after writing a message to stderr if no recognised query
    command was given.
    '''
    parser = E.OptionParser(
        version="%prog version: $Id: CBioPortal.py 2888 2012-06-07 15:52:00Z ians $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--output_file", type="string", default=None,
        help="[Optional] Filename to output results to. [default=STDOUT]")
    parser.add_option(
        "-u", "--url", type="string",
        default="http://www.cbioportal.org/public-portal/webservice.do",
        help="[Optional] Url to the cBioPortal webservice [default=%default]")

    cqueryopts = optparse.OptionGroup(
        parser, "Common parameters", "Common arguments to the query")
    cqueryopts.add_option(
        "-s", "--study_id", dest="study_id", type="string", default=None,
        help="[Required/OPtional] cBioPortal ID for study [default=%default].\n This or study_name required for: getGeneticProfiles, getCaseLists, getProteinArrayInfo, getLink,getOncoprintHTML, getPercentAltered, getTotalAltered")
    cqueryopts.add_option(
        "-n", "--study_name", dest="study_name", type="string", default=None,
        help="[Required/Optional] cBioPortal Name for study [defualt=%default].\n See above for which commands require this.")
    cqueryopts.add_option(
        "-c", "--case_set_id", dest="case_set_id", type="string",
        default=None,
        help="[Required for some] cBioPortal case_set_id specifying the case list to use.\nRequired for getProfileData, getMutationData, getClincalData, getProteinArrayData, getPercentAltered, getTotalAltered. Default is case_set_id for case list 'All Tumours' ")
    cqueryopts.add_option(
        "-g", "--gene_list", dest="gene_list", type="string", default=None,
        help="[Required for some] Comma seperated list of HUGO gene symbols or Entrez gene IDs.\nRequired for getProfileData, getMutationData, getLink, getOncoprintHTML")
    cqueryopts.add_option(
        "-f", "--gene_list_file", dest="gene_list_file", type="string",
        default=None,
        help="[Optional] Filename to read in gene_list from")
    cqueryopts.add_option(
        "-p", "--profile_id", dest="profile_id", type="string",
        help="[Optional] Comma seperated list of cBioPortal genetic_profile_ids. If none are specified then the list of profiles for the study where display in analysis is True is used.")

    squeryopts = optparse.OptionGroup(
        parser, "Query specific parameters",
        "Arguments specific to a particular query")
    squeryopts.add_option(
        "--protein_array_type", dest="protein_array_type", type="string",
        default="protein_level",
        help="[Optional] Either protein_level or phosphorylation [default=%default]")
    squeryopts.add_option(
        "--protein_array_id", dest="protein_array_id", type="string",
        help="[Required for some] comma seperated list of one or more protein array IDs")
    squeryopts.add_option(
        "--array_info", dest="protein_array_info", type="int", default=0,
        help="[Optional] If 1, antibody infomation will also be exported in a getProteinArrayData query [default=%default]")
    squeryopts.add_option(
        "--report", dest="report", type="string", default="full",
        help="[Optional] Report type to display for getLink. Either full or oncoprint_html [default=%default] ")
    squeryopts.add_option(
        "--threshold", dest="threshold", type="int", default=2,
        help="[Optional] Threshold for deciding if an alteration is significant for continuous metrics [default=%default]")

    parser.add_option_group(cqueryopts)
    parser.add_option_group(squeryopts)

    (options, args) = E.Start(
        parser, add_pipe_options=False, add_output_options=False, argv=argv)

    portal = CBioPortal(url=options.url,
                        study=options.study_id,
                        study_name=options.study_name,
                        case_list_id=options.case_set_id)

    results = []

    # gene_list stays None if neither option is given; previously the
    # name was unbound and any query using it failed with a NameError.
    gene_list = None
    if options.gene_list_file:
        with IOTools.openFile(options.gene_list_file) as infile:
            gene_list = [x.strip() for x in infile]
    elif options.gene_list:
        gene_list = options.gene_list.split(",")

    if options.profile_id:
        profile_id = options.profile_id.split(",")
    else:
        profile_id = None

    if "getCancerStudies" in args:
        results.append(portal.getCancerStudies())

    if "getGeneticProfiles" in args:
        results.append(portal.getGeneticProfiles())

    if "getCaseLists" in args:
        results.append(portal.getCaseLists())

    if "getProfileData" in args:
        results.append(portal.getProfileData(
            gene_list=gene_list, genetic_profile_id=profile_id))

    if "getMutationData" in args:
        results.append(portal.getMutationData(
            gene_list=gene_list, genetic_profile_id=profile_id))

    if "getClinicalData" in args:
        results.append(portal.getClinicalData())

    if "getProteinArrayInfo" in args:
        results.append(portal.getProteinArrayInfo(
            gene_list=gene_list,
            protein_array_type=options.protein_array_type))

    if "getProteinArrayData" in args:
        # the option --array_info is stored as protein_array_info
        results.append(portal.getProteinArrayData(
            protein_array_id=options.protein_array_id,
            array_info=options.protein_array_info))

    if "getPercentAltered" in args:
        results.append(portal.getPercentAltered(
            gene_list=gene_list, genetic_profile_id=profile_id,
            threshold=options.threshold))

    if "getLink" in args:
        results.append(portal.getLink(
            gene_list=gene_list, report=options.report))

    if "getOncoprintHTML" in args:
        results.append(portal.getOncoprintHTML(gene_list=gene_list))

    if len(results) == 0:
        sys.stderr.write("No recognised query commands provided")
        sys.exit()

    if options.output_file:
        outf = IOTools.openFile(options.output_file, "w")
    else:
        outf = sys.stdout

    try:
        for result in results:
            # results that can not be formatted as a table (e.g. a
            # link or html text) are written as they are
            try:
                text = tableToString(result)
            except Exception:
                text = result
            outf.write(text)
    finally:
        # never close stdout, only a file opened here
        if outf is not sys.stdout:
            outf.close()

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Runs a MEDIPS analysis (through R) on a single sample given as bam
    or bed file. Depending on ``--toolset`` the saturation analysis, the
    CpG coverage analysis, the calibration plot and wig files with rpm
    and rms values are produced. A summary is always written to
    ``summary.tsv.gz``.

    Temporary files are created in a temporary directory that is
    removed when the function exits.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f", "--input-format", dest="input_format",
                      type="choice", choices=("bed", "bam"),
                      help="input file format [default=%default].")

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-e", "--extension", dest="extension", type="int",
                      help="extension size [default=%default].")

    parser.add_option("-b", "--bin-size", dest="bin_size", type="int",
                      help="bin size of genome vector [default=%default].")

    # the help text used to be a copy of the one for --bin-size
    parser.add_option("-l", "--fragment-length", dest="fragment_length",
                      type="int",
                      help="fragment length [default=%default].")

    parser.add_option(
        "-s", "--saturation-iterations", dest="saturation_iterations",
        type="int",
        help="iterations for saturation analysis [default=%default].")

    # "calibration" is tested for below and was missing from the choices
    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "calibration",
                               "rms", "rpm", "all"),
                      help="actions to perform [default=%default].")

    parser.add_option(
        "-w", "--bigwig", dest="bigwig", action="store_true",
        help="store wig files as bigwig files - requires a genome file [default=%default]")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="hg19",
        genome_file=None,
        extension=400,
        bin_size=50,
        saturation_iterations=10,
        fragment_length=700,
        toolset=[],
        bigwig=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) != 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    filename_sample = args[0]

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.Hsapiens.UCSC.%s' % options.ucsc_genome
    R.library(genome_file)

    # the names below are interpolated into R statements via locals()
    bin_size = options.bin_size
    extension = options.extension
    fragment_length = options.fragment_length
    saturation_iterations = options.saturation_iterations

    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)

    # make sure that the temporary directory is removed on errors, too
    try:
        if options.input_format == "bam":
            E.info("converting bam files")
            filename_sample = bamToMEDIPS(
                filename_sample, os.path.join(tmpdir, "sample.medips"))
        elif options.input_format == "bed":
            E.info("converting bed files")
            filename_sample = bedToMEDIPS(
                filename_sample, os.path.join(tmpdir, "sample.medips"))

        E.info("loading data")
        R('''CONTROL.SET = MEDIPS.readAlignedSequences( BSgenome = "%(genome_file)s", file = "%(filename_sample)s" ) ''' % locals())

        # (slot of the R object, label in the summary, format)
        slotnames = (("extend", "extend", "%i"),
                     ("distFunction", "distance_function", "%s"),
                     ("slope", "slope", "%f"),
                     ("fragmentLength", "fragment_length", "%i"),
                     ("bin_size", "bin_size", "%i"),
                     ("seq_pattern", "pattern", "%s"),
                     ("number_regions", "nregions", "%i"),
                     ("number_pattern", "npatterns", "%i"),
                     ("cali_chr", "calibration_contig", "%s"),
                     ("genome_name", "genome", "%s"))

        E.info("computing genome vector")
        R('''CONTROL.SET = MEDIPS.genomeVector(data = CONTROL.SET, bin_size = %(bin_size)i, extend=%(extension)i )''' % locals())

        E.info("computing CpG positions")
        R('''CONTROL.SET = MEDIPS.getPositions(data = CONTROL.SET, pattern = "CG")''')

        E.info("compute coupling vector")
        R('''CONTROL.SET = MEDIPS.couplingVector(data = CONTROL.SET, fragmentLength = %(fragment_length)i, func = "count")''' % locals())

        E.info("compute calibration curve")
        R('''CONTROL.SET = MEDIPS.calibrationCurve(data = CONTROL.SET)''')

        E.info("normalizing")
        R('''CONTROL.SET = MEDIPS.normalize(data = CONTROL.SET)''')

        with IOTools.openFile(E.getOutputFile("summary.tsv.gz"), "w") as outfile:
            outfile.write("category\tvalue\n")

            if "saturation" in options.toolset or do_all:
                E.info("saturation analysis")
                R('''sr.control = MEDIPS.saturationAnalysis(data = CONTROL.SET, bin_size = %(bin_size)i, extend = %(extension)i, no_iterations = %(saturation_iterations)i, no_random_iterations = 1)''' % locals())

                R.png(E.getOutputFile("saturation.png"))
                R('''MEDIPS.plotSaturation(sr.control)''')
                R('''dev.off()''')

                R('''write.csv( sr.control$estimation, file ='%s' )''' %
                  E.getOutputFile("saturation_estimation.csv"))
                outfile.write("estimated_correlation\t%f\n" %
                              R('''sr.control$maxEstCor''')[1])
                outfile.write("true_correlation\t%f\n" %
                              R('''sr.control$maxTruCor''')[1])

            if "coverage" in options.toolset or do_all:
                E.info("CpG coverage analysis")
                R('''cr.control = MEDIPS.coverageAnalysis(data = CONTROL.SET, extend = %(extension)i, no_iterations = 10)''' % locals())

                R.png(E.getOutputFile("cpg_coverage.png"))
                R('''MEDIPS.plotCoverage(cr.control)''')
                R('''dev.off()''')

                # three rows
                R('''write.csv( cr.control$coveredPos, file ='%s' )''' %
                  E.getOutputFile("saturation_coveredpos.csv"))
                # coverage threshold
                # number of CpG covered
                # percentage of CpG covered
                R('''write.csv( cr.control$matrix, file ='%s' )''' %
                  E.getOutputFile("saturation_matrix.csv"))

                # R('''er.control = MEDIPS.CpGenrich(data = CONTROL.SET)''')

            if "calibration" in options.toolset or do_all:
                E.info("plotting calibration")
                R.png(E.getOutputFile("calibration.png"))
                R('''MEDIPS.plotCalibrationPlot(data = CONTROL.SET, linearFit = T, xrange=250)''')
                R('''dev.off()''')

            # export the parameters stored in the R object; empty slots
            # are skipped
            for slotname, label, pattern in slotnames:
                value = tuple(R('''CONTROL.SET@%s''' % slotname))
                if len(value) == 0:
                    continue
                outfile.write("%s\t%s\n" % (label, pattern % value[0]))

        if "rpm" in options.toolset or do_all:
            outputfile = E.getOutputFile("rpm.wig")
            R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = T, descr = "rpm")''' % locals())
            if options.bigwig:
                bigwig(outputfile, contig_sizes)
            else:
                compress(outputfile)

        if "rms" in options.toolset or do_all:
            outputfile = E.getOutputFile("rms.wig")
            R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = F, descr = "rms")''' % locals())
            if options.bigwig:
                bigwig(outputfile, contig_sizes)
            else:
                compress(outputfile)
    finally:
        shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.Stop()
def buildSCOPDomains(infiles, outfile):
    '''reconcile mapped domains into a single domain file.

    * fragments are removed - a domain must map at least 90%
      of its length.

    * domains overlapping on the same sequence with the same
      superfamily classification are merged.

    *infiles* is a pair ``(linksfile, fastafile)``:

    * ``fastafile`` is read with ``FastaIterator``; each title is
      parsed as ``<id> <classification> <description>`` and the
      sequence length is recorded per id.
    * ``linksfile`` is a whitespace-separated table with twelve
      columns per row; rows starting with ``query_nid`` or ``#``
      are skipped.

    *outfile* receives a tab-separated table with the columns
    ``nid``, ``start``, ``end`` and ``family``.

    Raises KeyError if a domain id in ``linksfile`` is absent from
    ``fastafile`` and ValueError if a row does not have exactly
    twelve columns.
    '''

    linksfile, fastafile = infiles

    # filtering criteria
    min_coverage = 0.9

    # only take first four fold classes
    classes = 'abcd'

    rx = re.compile(r'(\S+)\s(\S+)\s(.*)')
    id2class = {}
    with IOTools.openFile(fastafile) as inf:
        for x in FastaIterator.iterate(inf):
            pid, cls, description = rx.match(x.title).groups()
            id2class[pid] = (cls, len(x.sequence))

    E.info('read mappings for %i sequences' % len(id2class))
    counter = E.Counter()

    nid2domains = collections.defaultdict(list)
    with IOTools.openFile(linksfile) as inf:
        for line in inf:
            if line.startswith('query_nid'):
                continue
            if line.startswith('#'):
                continue
            counter.links += 1

            # unpacking all twelve columns doubles as a format check
            (domain_id, nid, evalue, domain_start, domain_end,
             sbjct_start, sbjct_end,
             block_sizes, domain_starts, sbjct_starts,
             bitscore, pid) = line[:-1].split()

            nid, domain_start, domain_end, sbjct_start, sbjct_end = map(
                int,
                (nid, domain_start, domain_end, sbjct_start, sbjct_end))

            family, length = id2class[domain_id]
            cls, fold, superfamily, family = family.split('.')

            if cls not in classes:
                continue

            # remove fragments
            if float(domain_end - domain_start) / length < min_coverage:
                continue

            counter.unmerged_domains += 1
            superfamily = '00%c%03i%03i' % (cls, int(fold), int(superfamily))
            nid2domains[nid].append((superfamily, sbjct_start, sbjct_end))

    counter.sequences = len(nid2domains)
    E.info('merging %i domains in %i sequences' %
           (counter.unmerged_domains, counter.sequences))

    outf = IOTools.openFile(outfile, 'w')
    try:
        outf.write('nid\tstart\tend\tfamily\n')
        for nid, dd in sorted(nid2domains.items()):
            # groupby only groups *consecutive* items with equal keys,
            # so the domains must be ordered by superfamily first.
            # Otherwise hits of the same superfamily that are not
            # adjacent in the input are never merged.
            dd = sorted(dd, key=lambda x: x[0])
            for family, domains in itertools.groupby(dd, key=lambda x: x[0]):
                unmerged_domains = [(x[1], x[2]) for x in domains]
                merged_domains = Intervals.combine(unmerged_domains)
                for start, end in merged_domains:
                    counter.domains += 1
                    outf.write('%i\t%i\t%i\t%s\n' % (nid, start, end, family))
    finally:
        outf.close()

    E.info(counter)
def main(argv=None):
    '''command line interface to the cBioPortal web service.

    Positional arguments name the queries to run (``getCancerStudies``,
    ``getGeneticProfiles``, ``getCaseLists``, ``getProfileData``,
    ``getMutationData``, ``getClinicalData``, ``getProteinArrayInfo``,
    ``getProteinArrayData``, ``getPercentAltered``, ``getLink``,
    ``getOncoprintHTML``). Several may be given; they are executed in
    the fixed order listed here and their results are written one
    after the other to ``--output_file`` or stdout.

    If no recognised query is given, a message is written to stderr
    and the script exits.
    '''

    parser = E.OptionParser(
        version="%prog version: $Id: CBioPortal.py 2888 2012-06-07 15:52:00Z ians $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--output_file", type="string", default=None,
        help="[Optional] Filename to output results to. [default=STDOUT]")
    parser.add_option(
        "-u", "--url", type="string",
        default="http://www.cbioportal.org/public-portal/webservice.do",
        help="[Optional] Url to the cBioPortal webservice [default=%default]")

    cqueryopts = optparse.OptionGroup(
        parser, "Common parameters", "Common arguments to the query")
    cqueryopts.add_option(
        "-s", "--study_id", dest="study_id", type="string", default=None,
        help="[Required/OPtional] cBioPortal ID for study [default=%default].\n "
        "This or study_name required for: getGeneticProfiles, getCaseLists, "
        "getProteinArrayInfo, getLink,getOncoprintHTML, getPercentAltered, "
        "getTotalAltered")
    cqueryopts.add_option(
        "-n", "--study_name", dest="study_name", type="string", default=None,
        help="[Required/Optional] cBioPortal Name for study [defualt=%default].\n "
        "See above for which commands require this.")
    cqueryopts.add_option(
        "-c", "--case_set_id", dest="case_set_id", type="string", default=None,
        help="[Required for some] cBioPortal case_set_id specifying the case "
        "list to use.\nRequired for getProfileData, getMutationData, "
        "getClincalData, getProteinArrayData, getPercentAltered, "
        "getTotalAltered. "
        "Default is case_set_id for case list 'All Tumours' ")
    cqueryopts.add_option(
        "-g", "--gene_list", dest="gene_list", type="string", default=None,
        help="[Required for some] Comma seperated list of HUGO gene symbols "
        "or Entrez gene IDs.\nRequired for getProfileData, getMutationData, "
        "getLink, getOncoprintHTML")
    cqueryopts.add_option(
        "-f", "--gene_list_file", dest="gene_list_file", type="string",
        default=None,
        help="[Optional] Filename to read in gene_list from")
    cqueryopts.add_option(
        "-p", "--profile_id", dest="profile_id", type="string",
        help="[Optional] Comma seperated list of cBioPortal "
        "genetic_profile_ids. If none are specified then the list of "
        "profiles for the study where display in analysis is True is used.")

    squeryopts = optparse.OptionGroup(
        parser, "Query specific parameters",
        "Arguments specific to a particular query")
    squeryopts.add_option(
        "--protein_array_type", dest="protein_array_type", type="string",
        default="protein_level",
        help="[Optional] Either protein_level or phosphorylation "
        "[default=%default]")
    squeryopts.add_option(
        "--protein_array_id", dest="protein_array_id", type="string",
        help="[Required for some] comma seperated list of one or more "
        "protein array IDs")
    squeryopts.add_option(
        "--array_info", dest="protein_array_info", type="int", default=0,
        help="[Optional] If 1, antibody infomation will also be exported in "
        "a getProteinArrayData query [default=%default]")
    squeryopts.add_option(
        "--report", dest="report", type="string", default="full",
        help="[Optional] Report type to display for getLink. "
        "Either full or oncoprint_html [default=%default] ")
    squeryopts.add_option(
        "--threshold", dest="threshold", type="int", default=2,
        help="[Optional] Threshold for deciding if an alteration is "
        "significant for continuous metrics [default=%default]")

    parser.add_option_group(cqueryopts)
    parser.add_option_group(squeryopts)

    (options, args) = E.Start(
        parser, add_pipe_options=False, add_output_options=False, argv=argv)

    portal = CBioPortal(url=options.url,
                        study=options.study_id,
                        study_name=options.study_name,
                        case_list_id=options.case_set_id)

    # gene_list must always be bound: queries below reference it even
    # when neither --gene_list nor --gene_list_file was supplied.
    # NOTE(review): None is passed through in that case - confirm that
    # the CBioPortal methods validate a missing gene list themselves.
    gene_list = None
    if options.gene_list_file:
        infile = IOTools.openFile(options.gene_list_file)
        try:
            gene_list = [x.strip() for x in infile]
        finally:
            infile.close()
    elif options.gene_list:
        gene_list = options.gene_list.split(",")

    if options.profile_id:
        profile_id = options.profile_id.split(",")
    else:
        profile_id = None

    # (command, query) pairs in execution order; the lambdas defer the
    # web service call until the command has been requested.
    queries = (
        ("getCancerStudies",
         lambda: portal.getCancerStudies()),
        ("getGeneticProfiles",
         lambda: portal.getGeneticProfiles()),
        ("getCaseLists",
         lambda: portal.getCaseLists()),
        ("getProfileData",
         lambda: portal.getProfileData(gene_list=gene_list,
                                       genetic_profile_id=profile_id)),
        ("getMutationData",
         lambda: portal.getMutationData(gene_list=gene_list,
                                        genetic_profile_id=profile_id)),
        ("getClinicalData",
         lambda: portal.getClinicalData()),
        ("getProteinArrayInfo",
         lambda: portal.getProteinArrayInfo(
             gene_list=gene_list,
             protein_array_type=options.protein_array_type)),
        # --array_info is stored under dest "protein_array_info"
        ("getProteinArrayData",
         lambda: portal.getProteinArrayData(
             protein_array_id=options.protein_array_id,
             array_info=options.protein_array_info)),
        ("getPercentAltered",
         lambda: portal.getPercentAltered(gene_list=gene_list,
                                          genetic_profile_id=profile_id,
                                          threshold=options.threshold)),
        ("getLink",
         lambda: portal.getLink(gene_list=gene_list,
                                report=options.report)),
        ("getOncoprintHTML",
         lambda: portal.getOncoprintHTML(gene_list=gene_list)),
    )

    results = [query() for command, query in queries if command in args]

    if len(results) == 0:
        sys.stderr.write("No recognised query commands provided")
        # NOTE(review): exits with status 0 and without E.Stop();
        # kept as is because callers may rely on it.
        sys.exit()

    if options.output_file:
        outf = IOTools.openFile(options.output_file, "w")
    else:
        outf = sys.stdout

    try:
        for result in results:
            # Results that cannot be rendered as a table are written
            # unchanged.
            try:
                text = tableToString(result)
            except Exception:
                text = result
            outf.write(text)
    finally:
        if outf is not sys.stdout:
            outf.close()

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects exactly one positional argument, the file with the sample
    data (bam or bed, see ``--input-format``). The data are converted
    into a temporary directory, loaded into MEDIPS through R and
    processed (genome vector, CpG positions, coupling vector,
    calibration curve, normalization). Depending on ``--toolset`` the
    saturation, coverage and calibration analyses are run and rpm/rms
    wiggle tracks are exported. A table of MEDIPS slot values is
    written to the ``summary.tsv.gz`` output file.

    Raises ValueError if the number of positional arguments is not one
    or if ``--bigwig`` is requested without ``--genome-file``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f", "--input-format", dest="input_format",
                      type="choice", choices=("bed", "bam"),
                      help="input file format [default=%default].")

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-e", "--extension", dest="extension", type="int",
                      help="extension size [default=%default].")

    parser.add_option("-b", "--bin-size", dest="bin_size", type="int",
                      help="bin size of genome vector [default=%default].")

    # help text used to be a copy of the --bin-size description
    parser.add_option("-l", "--fragment-length", dest="fragment_length",
                      type="int",
                      help="fragment length [default=%default].")

    parser.add_option("-s", "--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis [default=%default].")

    # "calibration" is tested for below and therefore has to be a
    # permitted choice; previously it could only be run through "all".
    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "calibration",
                               "rms", "rpm", "all"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig", dest="bigwig", action="store_true",
                      help="store wig files as bigwig files - requires a genome file [default=%default]")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="hg19",
        genome_file=None,
        extension=400,
        bin_size=50,
        saturation_iterations=10,
        fragment_length=700,
        toolset=[],
        bigwig=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) != 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    # contig_sizes is only needed (and only defined) for bigwig output,
    # which is guaranteed to have a genome file by the check above.
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    filename_sample = args[0]

    if not options.toolset:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.Hsapiens.UCSC.%s' % options.ucsc_genome
    R.library(genome_file)

    tmpdir = tempfile.mkdtemp()

    E.debug("temporary files are in %s" % tmpdir)

    # The R snippets below are filled in through "% locals()", hence
    # these local names must not be renamed.
    bin_size = options.bin_size
    extension = options.extension
    fragment_length = options.fragment_length
    saturation_iterations = options.saturation_iterations

    if options.input_format == "bam":
        E.info("converting bam files")
        filename_sample = bamToMEDIPS(
            filename_sample, os.path.join(tmpdir, "sample.medips"))
    elif options.input_format == "bed":
        E.info("converting bed files")
        filename_sample = bedToMEDIPS(
            filename_sample, os.path.join(tmpdir, "sample.medips"))

    E.info("loading data")
    R('''CONTROL.SET = MEDIPS.readAlignedSequences(
                       BSgenome = "%(genome_file)s",
                       file = "%(filename_sample)s" ) ''' % locals())

    # (MEDIPS slot name, label in summary table, format of value)
    slotnames = (("extend", "extend", "%i"),
                 ("distFunction", "distance_function", "%s"),
                 ("slope", "slope", "%f"),
                 ("fragmentLength", "fragment_length", "%i"),
                 ("bin_size", "bin_size", "%i"),
                 ("seq_pattern", "pattern", "%s"),
                 ("number_regions", "nregions", "%i"),
                 ("number_pattern", "npatterns", "%i"),
                 ("cali_chr", "calibration_contig", "%s"),
                 ("genome_name", "genome", "%s"))

    E.info("computing genome vector")
    R('''CONTROL.SET = MEDIPS.genomeVector(data = CONTROL.SET,
                       bin_size = %(bin_size)i,
                       extend=%(extension)i )''' % locals())

    E.info("computing CpG positions")
    R('''CONTROL.SET = MEDIPS.getPositions(data = CONTROL.SET, pattern = "CG")''')

    E.info("compute coupling vector")
    R('''CONTROL.SET = MEDIPS.couplingVector(data = CONTROL.SET,
                       fragmentLength = %(fragment_length)i,
                       func = "count")''' % locals())

    E.info("compute calibration curve")
    R('''CONTROL.SET = MEDIPS.calibrationCurve(data = CONTROL.SET)''')

    E.info("normalizing")
    R('''CONTROL.SET = MEDIPS.normalize(data = CONTROL.SET)''')

    outfile = IOTools.openFile(E.getOutputFile("summary.tsv.gz"), "w")
    # make sure the summary is closed even if one of the analyses fails
    try:
        outfile.write("category\tvalue\n")

        if "saturation" in options.toolset or do_all:
            E.info("saturation analysis")
            R('''sr.control = MEDIPS.saturationAnalysis(data = CONTROL.SET,
                              bin_size = %(bin_size)i,
                              extend = %(extension)i,
                              no_iterations = %(saturation_iterations)i,
                              no_random_iterations = 1)''' % locals())

            R.png(E.getOutputFile("saturation.png"))
            R('''MEDIPS.plotSaturation(sr.control)''')
            R('''dev.off()''')

            R('''write.csv( sr.control$estimation, file ='%s' )''' %
              E.getOutputFile("saturation_estimation.csv"))
            outfile.write("estimated_correlation\t%f\n" %
                          R('''sr.control$maxEstCor''')[1])
            outfile.write("true_correlation\t%f\n" %
                          R('''sr.control$maxTruCor''')[1])

        if "coverage" in options.toolset or do_all:
            E.info("CpG coverage analysis")
            R('''cr.control = MEDIPS.coverageAnalysis(data = CONTROL.SET,
                              extend = %(extension)i,
                              no_iterations = 10)''' % locals())

            R.png(E.getOutputFile("cpg_coverage.png"))
            R('''MEDIPS.plotCoverage(cr.control)''')
            R('''dev.off()''')

            # three rows
            R('''write.csv( cr.control$coveredPos, file ='%s' )''' %
              E.getOutputFile("saturation_coveredpos.csv"))
            # coverage threshold
            # number of CpG covered
            # percentage of CpG covered
            R('''write.csv( cr.control$matrix, file ='%s' )''' %
              E.getOutputFile("saturation_matrix.csv"))

            # R('''er.control = MEDIPS.CpGenrich(data = CONTROL.SET)''')

        if "calibration" in options.toolset or do_all:
            E.info("plotting calibration")
            R.png(E.getOutputFile("calibration.png"))
            R('''MEDIPS.plotCalibrationPlot(data = CONTROL.SET, linearFit = T, xrange=250)''')
            R('''dev.off()''')

        for slotname, label, pattern in slotnames:
            value = tuple(R('''CONTROL.SET@%s''' % slotname))
            # empty slots are not reported
            if len(value) == 0:
                continue
            outfile.write("%s\t%s\n" % (label, pattern % value[0]))
    finally:
        outfile.close()

    # export wiggle tracks; "raw" is the R logical handed to exportWIG
    for descr, raw in (("rpm", "T"), ("rms", "F")):
        if not (descr in options.toolset or do_all):
            continue

        outputfile = E.getOutputFile("%s.wig" % descr)
        R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = %(raw)s, descr = "%(descr)s")''' % locals())
        if options.bigwig:
            bigwig(outputfile, contig_sizes)
        else:
            compress(outputfile)

    shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects two positional arguments: the file with the sample data
    and the name of the output file. Runs the zinba R package on the
    sample (and optional control given by ``--control-filename``).
    ``--action`` selects what is done:

    * ``count``: build window data only
    * ``model``: model selection through ``run.zinba``
    * ``predict``: peak prediction with the best model read from the
      model file written to ``<output>_files/``
    * ``full``: complete ``zinba`` run (default)

    Temporary files are written to ``--temp-dir`` if given, otherwise
    to a new temporary directory which is removed at the end unless
    ``--keep-temp`` is set.

    Raises ValueError if the number of positional arguments is not two
    and OSError if ``--action=predict`` is requested but the model file
    does not exist.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f", "--input-format", dest="input_format",
                      type="choice", choices=("bed", "bam"),
                      help="input file format [default=%default].")

    parser.add_option("-s", "--fragment-size", dest="fragment_size",
                      type="int",
                      help="fragment size [default=%default].")

    parser.add_option("-m", "--mappability-dir", dest="mappability_dir",
                      type="string",
                      help="mappability_dir [default=%default].")

    parser.add_option("-b", "--bit-filename", dest="bit_filename",
                      type="string",
                      help="2bit genome filename [default=%default].")

    parser.add_option("-c", "--control-filename", dest="control_filename",
                      type="string",
                      help="filename of input/control data in bed format [default=%default].")

    parser.add_option("-i", "--index-dir", dest="index_dir", type="string",
                      help="index directory [default=%default].")

    parser.add_option("-t", "--threads", dest="threads", type="int",
                      help="number of threads to use [default=%default].")

    parser.add_option("-q", "--fdr-threshold", dest="fdr_threshold",
                      type="float",
                      help="fdr threshold [default=%default].")

    parser.add_option("-a", "--alignability-threshold",
                      dest="alignability_threshold", type="int",
                      help="alignability threshold [default=%default].")

    parser.add_option("-p", "--per-contig", dest="per_contig",
                      action="store_true",
                      help="run analysis per chromosome [default=%default]")

    parser.add_option("-w", "--temp-dir", dest="tempdir", type="string",
                      help="use existing directory as temporary directory [default=%default].")

    parser.add_option("--keep-temp", dest="keep_temp", action="store_true",
                      help="keep temporary directory [default=%default]")

    parser.add_option("--action", dest="action", type="choice",
                      choices=("full", "count", "predict", "model"),
                      help="action to perform [default=%default]")

    parser.add_option("--improvement", dest="improvement", type="float",
                      help="relative improvement of likelihood until convergence [default=%default]")

    parser.set_defaults(
        input_format="bed",
        fragment_size=200,
        mappability_dir=None,
        threads=1,
        alignability_threshold=1,
        bit_filename=None,
        fdr_threshold=0.05,
        tempdir=None,
        winsize=250,
        offset=125,
        cnvWinSize=1e+05,
        cnvOffset=2500,
        per_contig=False,
        keep_temp=False,
        filelist="files.list",
        action="full",
        improvement=0.00001,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("please specify a filename with sample data and an output file")

    filename_sample, filename_output = args[0], args[1]
    filename_control = options.control_filename

    # load Zinba
    R.library('zinba')

    if not options.tempdir:
        tmpdir = tempfile.mkdtemp()
    else:
        tmpdir = options.tempdir

    E.debug("temporary files are in %s" % tmpdir)

    if options.input_format == "bam":
        E.info("converting bam files to bed")
        # conversions are skipped if the result is already present,
        # e.g. when re-using a directory given by --temp-dir
        if not os.path.exists(os.path.join(tmpdir, "sample.bed")):
            filename_sample = bamToBed(
                filename_sample, os.path.join(tmpdir, "sample.bed"))
        else:
            E.info("using existing file %(tmpdir)s/sample.bed" % locals())
            filename_sample = os.path.join(tmpdir, "sample.bed")
        if filename_control:
            if not os.path.exists(os.path.join(tmpdir, "control.bed")):
                filename_control = bamToBed(
                    filename_control, os.path.join(tmpdir, "control.bed"))
            else:
                E.info("using existing file %(tmpdir)s/control.bed" % locals())
                filename_control = os.path.join(tmpdir, "control.bed")

    # The R snippets and shell commands below are filled in through
    # "% locals()", hence these local names must not be renamed.
    fragment_size = options.fragment_size
    threads = options.threads
    bit_filename = options.bit_filename
    mappability_dir = options.mappability_dir
    fdr_threshold = options.fdr_threshold
    tol = options.improvement

    E.run("twoBitInfo %(bit_filename)s %(tmpdir)s/contig_sizes" % locals())
    with IOTools.openFile(os.path.join(tmpdir, "contig_sizes")) as inf:
        contig2size = dict(x.split() for x in inf)

    outdir = filename_output + "_files"
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    filelist = os.path.join(outdir, filename_output + ".list")
    modelfile = os.path.join(outdir, filename_output + ".model")

    # window parameters handed to zinba
    winSize = 250
    offset = 125
    cnvWinSize = 100000
    cnvOffset = 0
    winGap = 0
    peakconfidence = 1.0 - fdr_threshold

    if not os.path.exists(os.path.join(tmpdir, "basecount")):
        E.info("computing counts")

        R('''basealigncount( inputfile='%(filename_sample)s',
                             outputfile='%(tmpdir)s/basecount',
                             extension=%(fragment_size)i,
                             filetype='bed',
                             twoBitFile='%(bit_filename)s' )
          ''' % locals())
    else:
        E.info("using existing counts")

    if options.action == "count":

        E.info("computing window counts only - saving results in %s" % outdir)
        R('''buildwindowdata(
                 seq='%(filename_sample)s',
                 align='%(mappability_dir)s',
                 input='%(filename_control)s',
                 twoBit='%(bit_filename)s',
                 winSize=%(winSize)i,
                 offset=%(offset)i,
                 cnvWinSize=%(cnvWinSize)i,
                 cnvOffset=%(cnvOffset)i,
                 filelist='%(filelist)s',
                 filetype='bed',
                 extension=%(fragment_size)s,
                 outdir='%(outdir)s/')
          ''' % locals())

    elif options.action == "model":

        # The important option is buildwin = 0
        # parameterized for broad == FALSE and input present
        # see zinba.R
        # model selection only on chr19.
        R('''run.zinba(
                 filelist='%(filelist)s',
                 formula=NULL,formulaE=NULL,formulaZ=NULL,
                 outfile='%(filename_output)s',
                 seq='%(filename_sample)s',
                 input='%(filename_control)s',
                 filetype='bed',
                 align='%(mappability_dir)s',
                 twoBit='%(bit_filename)s',
                 extension=%(fragment_size)s,
                 winSize=%(winSize)i,
                 offset=%(offset)i,
                 cnvWinSize=%(cnvWinSize)i,
                 cnvOffset=%(cnvOffset)i,
                 basecountfile='%(tmpdir)s/basecount',
                 buildwin=0,
                 threshold=%(fdr_threshold)f,
                 pquant=1,
                 peakconfidence=%(peakconfidence)f,
                 winGap=%(winGap)i,
                 tol=%(tol)f,
                 initmethod="count",
                 method="mixture",
                 numProc=%(threads)i,
                 printFullOut=1,
                 interaction=FALSE,
                 selectmodel=TRUE,
                 selectchr='chr19',
                 selectcovs=c("input_count"),
                 selecttype="complete",
                 FDR=TRUE)''' % locals())

    elif options.action == "predict":

        # The important option is buildwin = 0 and selectmodel = FALSE
        # parameterized for broad == FALSE and input present
        # see zinba.R
        # model selection only on chr19.
        if not os.path.exists(modelfile):
            raise OSError("model file %s does not exist" % modelfile)

        E.info("reading model from %s" % modelfile)

        # Pick the converged model with the lowest BIC and report the
        # three formulas that will be used for prediction.
        R('''
        final=read.table('%(modelfile)s', header=T, sep="\t")
        final=final[final$fail==0,]
        bestBIC=which.min(final$BIC)
        formula=as.formula(paste("exp_count~",final$formula[bestBIC]))
        formulaE=as.formula(paste("exp_count~",final$formulaE[bestBIC]))
        formulaZ=as.formula(paste("exp_count~",final$formulaZ[bestBIC]))
        cat("Background formula is:\n\t")
        print(formula)
        cat("Enrichment formula is:\n\t")
        print(formulaE)
        cat("Zero-inflated formula is:\n\t")
        print(formulaZ)
        ''' % locals())

        E.info("predicting peaks")

        R('''run.zinba(
                 filelist='%(filelist)s',
                 outfile='%(filename_output)s',
                 seq='%(filename_sample)s',
                 input='%(filename_control)s',
                 filetype='bed',
                 align='%(mappability_dir)s',
                 twoBit='%(bit_filename)s',
                 extension=%(fragment_size)s,
                 winSize=%(winSize)i,
                 offset=%(offset)i,
                 cnvWinSize=%(cnvWinSize)i,
                 cnvOffset=%(cnvOffset)i,
                 basecountfile='%(tmpdir)s/basecount',
                 buildwin=0,
                 threshold=%(fdr_threshold)f,
                 pquant=1,
                 winGap=%(winGap)i,
                 initmethod="count",
                 tol=%(tol)f,
                 method="mixture",
                 numProc=%(threads)i,
                 printFullOut=1,
                 interaction=FALSE,
                 selectmodel=FALSE,
                 formula=formula,
                 formulaE=formulaE,
                 formulaZ=formulaZ,
                 peakconfidence=%(peakconfidence)f,
                 FDR=TRUE)''' % locals())

    elif options.action == "per_contig":

        # NOTE(review): "per_contig" is not among the choices accepted
        # by --action and the --per-contig flag is never consulted, so
        # this branch cannot currently be reached. It is also limited
        # to chr16. Left untouched pending a decision on whether to
        # remove it or wire it up.
        E.info("processing per chromosome")
        for contig in contig2size:
            if contig not in ("chr16",):
                continue

            E.info("processing contig %s" % contig)
            filename_sample_contig = filename_sample + "_%s" % contig
            filename_control_contig = filename_control + "_%s" % contig
            if not os.path.exists(filename_output + "_files"):
                os.mkdir(filename_output + "_files")
            filename_output_contig = os.path.join(
                filename_output + "_files", contig)
            filename_basecounts_contig = os.path.join(
                tmpdir, "basecount_%s" % contig)

            E.run("grep %(contig)s < %(filename_sample)s > %(filename_sample_contig)s" % locals())
            E.run("grep %(contig)s < %(filename_control)s > %(filename_control_contig)s" % locals())

            if not os.path.exists(filename_basecounts_contig):
                E.info("computing counts")

                R('''basealigncount( inputfile='%(filename_sample_contig)s',
                                     outputfile='%(filename_basecounts_contig)s',
                                     extension=%(fragment_size)i,
                                     filetype='bed',
                                     twoBitFile='%(bit_filename)s' )
                  ''' % locals())
            else:
                E.info("using existing counts")

            # run zinba, do not build window data
            R('''zinba( refinepeaks=1,
                        seq='%(filename_sample_contig)s',
                        input='%(filename_control_contig)s',
                        filetype='bed',
                        align='%(mappability_dir)s',
                        twoBit='%(bit_filename)s',
                        outfile='%(filename_output_contig)s',
                        extension=%(fragment_size)s,
                        basecountfile='%(filename_basecounts_contig)s',
                        numProc=%(threads)i,
                        threshold=%(fdr_threshold)f,
                        broad=FALSE,
                        printFullOut=0,
                        interaction=FALSE,
                        mode='peaks',
                        FDR=TRUE)
              ''' % locals())

    elif options.action == "full":

        # run zinba, do not build window data
        R('''zinba( refinepeaks=1,
                    seq='%(filename_sample)s',
                    input='%(filename_control)s',
                    filetype='bed',
                    align='%(mappability_dir)s',
                    twoBit='%(bit_filename)s',
                    outfile='%(filename_output)s',
                    extension=%(fragment_size)s,
                    basecountfile='%(tmpdir)s/basecount',
                    numProc=%(threads)i,
                    threshold=%(fdr_threshold)f,
                    broad=FALSE,
                    printFullOut=0,
                    interaction=FALSE,
                    mode='peaks',
                    FDR=TRUE)
          ''' % locals())

    # only remove directories created here, never a user-supplied one
    if not (options.tempdir or options.keep_temp):
        shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.Stop()