def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    """Iterate over the rows of a biomart query.

    *columns* is a list of biomart attribute names to retrieve.
    *biomart*, *dataset* and *host* select the mart that is queried
    through the R ``biomaRt`` package.

    Yields one dictionary per result row, mapping every entry of
    *columns* to the value found in that row.
    """
    R.library("biomaRt")
    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    attributes = rpy2.robjects.vectors.StrVector(columns)
    table = R.getBM(attributes=attributes, mart=mart)

    # ``table`` is an R data frame: ``rx(name)`` yields a one-column
    # data frame and ``[0]`` pulls the column vector out of it.
    column_vectors = [table.rx(name)[0] for name in columns]
    for row in zip(*column_vectors):
        yield dict(zip(columns, row))
def plot(self, matrix, row_headers, col_headers, path):
    """Draw *matrix* as a heatmap using ``heatmap.2`` from gplots.

    *row_headers* and *col_headers* are used as row and column labels.
    Returns whatever ``self.endPlot(None, path)`` returns.

    NOTE(review): this method issues a single ``heatmap.2`` call; any
    splitting of large matrices would have to happen elsewhere.
    """
    self.debug("HeatmapPlot started")
    self.startPlot()

    R.library('gplots')
    heatmap = R["heatmap.2"]
    heatmap(matrix,
            trace='none',
            dendrogram='none',
            col=R.bluered(75),
            symbreaks=True,
            symkey=True,
            cexCol=0.5,
            cexRow=0.5,
            labRow=row_headers,
            labCol=col_headers,
            mar=ro.IntVector((10, 10)),
            keysize=1)

    self.debug("HeatmapPlot finished")
    return self.endPlot(None, path)
def tune_model(scenario, params, log):
    """Run irace and export its final elite configurations as CSV.

    The command line is built by ``create_command(scenario, params, log)``.
    After irace finishes, the R data file at *log* is loaded, the elite
    configurations are extracted and written to ``<log stem>.csv`` next
    to *log*, with the ``.ID.`` and ``.PARENT.`` columns removed.
    """
    command = create_command(scenario, params, log)

    # Run irace and wait for completion; the captured stdout is not used.
    irace = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
    irace.communicate()

    # Load the irace output into R and pull out the elite configurations.
    pandas2ri.activate()
    r.load(log.as_posix())
    r.library('irace')
    elites = r.getFinalElites(r.iraceResults, n=0)
    with localconverter(default_converter + pandas2ri.converter):
        r_frame = elites

    csv_path = Path(log.parent, log.stem + '.csv')
    r_frame.to_csvfile(csv_path.as_posix(), sep=',')

    # Re-read with pandas for the pythonic clean-up.
    frame = pd.read_csv(csv_path)
    frame = frame.drop(columns=['.ID.', '.PARENT.'])

    # Rebuild every column value by value, in index order, so the
    # exported table is laid out afresh.
    rebuilt = {}
    for name in frame.columns:
        column = frame[name]
        rebuilt[name] = [column[key] for key in column.keys()]
    pd.DataFrame(data=rebuilt).to_csv(csv_path)
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org'):
    """Query biomart through R and iterate over the returned rows.

    *columns* lists the biomart attributes to fetch; *biomart*,
    *dataset* and *host* identify the mart to connect to.

    This is a generator: each item is a dict keyed by the names in
    *columns*.
    """
    R.library("biomaRt")
    connection = R.useMart(biomart=biomart, dataset=dataset, host=host)
    frame = R.getBM(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        mart=connection)

    # The query result is an R data frame. ``rx(field)`` returns a data
    # frame holding just that field, and its first element is the vector.
    per_column = []
    for field in columns:
        per_column.append(frame.rx(field)[0])

    for values in zip(*per_column):
        yield dict(zip(columns, values))
def q_value_analysis(p_values):
    """Run the R ``qvalue`` function on *p_values*.

    *p_values* is converted to an R float vector. The object returned
    by R's ``qvalue`` is handed back unchanged.
    """
    logging.info('q-value analysis on %d p-values', len(p_values))

    # rpy2 is imported here so it is only needed when this is called.
    from rpy2.robjects import r, FloatVector

    r.library('qvalue')
    return r.qvalue(FloatVector(p_values))
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    """Download a dataset from biomart and write it as a tab-separated table.

    *columns* is a dictionary mapping biomart columns to the column
    names used in the output table. *biomart*, *dataset* and *host*
    denote the database and dataset to get the data from.

    The table is written to *outfile*; nothing is returned.
    """
    R.library("biomaRt")

    # Materialise the keys once: the same ordered sequence is handed to
    # R and reused for the header and for the rows.
    keys = list(columns.keys())

    mart = R.useMart(biomart=biomart, dataset=dataset, host=host)
    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")
        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        # Release the handle even if writing a row fails.
        outf.close()
def computeExpressionLevels(infiles, outfiles):
    """Normalize data using the gcrma library.

    *outfiles* is a pair ``(outfile_r, outfile_table)``: the first
    receives the saved R objects (``gcrma.eset`` and ``raw.data``), the
    second a human readable tab-separated expression table.

    NOTE(review): *infiles* is not used; ``ReadAffy()`` is called
    without arguments, so the input presumably comes from the working
    directory - confirm.
    """
    outfile_r, outfile_table = outfiles

    R.library("simpleaffy")
    R.library("gcrma")

    E.info("reading data")
    R("""raw.data = ReadAffy()""")

    E.info("normalization")
    R("""gcrma.eset = call.exprs( raw.data, "%(normalization_method)s" )""" % PARAMS)

    E.info("saving data")
    R("""save( gcrma.eset, raw.data, file = "%s") """ % outfile_r)

    expression = R("""as.list(assayData(gcrma.eset))""")["exprs"]
    probesets, headers = R("""dimnames( assayData(gcrma.eset)$exprs )""")

    # Escape the dot so that only a literal ".CEL" is removed from the
    # sample names (an unescaped "." would match any character).
    headers = [re.sub(r"\.CEL", "", x) for x in headers]

    # The context manager closes the table even if a write fails.
    with open(outfile_table, "w") as outf:
        outf.write("probeset\t%s\n" % "\t".join(headers))
        for probeset, values in zip(probesets, expression):
            outf.write("%s\t%s\n" % (probeset, "\t".join(map(str, values))))
def import_cummeRbund_library():
    """Load the R package cummeRbund, offering to install it if missing.

    Runs ``r.library('cummeRbund')``. If R reports that the package does
    not exist, the user is asked interactively whether it should be
    installed; either answer ends the process via ``exit(0)``. Any other
    R error is re-raised.
    """
    try:
        r.library('cummeRbund')
    except RRuntimeError as exc:
        # Only the "package not installed" failure is handled here.
        if "there is no package called" not in str(exc):
            raise exc

        print('\n\nIt looks like you have not installed "cummeRbund" yet.\nWould you like me to try to install it for you now?\n')
        print('(This will require an active internet connection)')

        # Keep asking until a valid answer is given.
        while True:
            answer = raw_input('y/n: ')
            if answer == 'n':
                print('\nThis script requires that cummeRbund be installed. Please install it manually and try again.\n\nGoodbye.')
                exit(0)
            elif answer == 'y':
                print('\nOK, I am going to hand you off to the R install process. Answer any prompts it asks you, and I will see you on the other side.\n\n')
                raw_input('Press "Enter" when ready...')
                run_cummeRbund_install()
                print('\nGreat. Now lets try this again, and hopefully we will be good to go!\n\nSee you soon!')
                exit(0)
            else:
                print("Please type only 'y' or 'n'.")
def computeExpressionLevels(infiles, outfiles):
    '''Normalize data using the gcrma library.

    *outfiles* unpacks into two file names: one for the saved R objects
    (``gcrma.eset`` and ``raw.data``) and one for a human readable,
    tab-separated table of expression values.

    NOTE(review): *infiles* is never read in this function;
    ``ReadAffy()`` is invoked without arguments - presumably it picks
    up the files from the current directory, confirm.
    '''
    outfile_r, outfile_table = outfiles

    R.library("simpleaffy")
    R.library("gcrma")

    E.info("reading data")
    R('''raw.data = ReadAffy()''')

    E.info("normalization")
    R('''gcrma.eset = call.exprs( raw.data, "%(normalization_method)s" )''' % PARAMS)

    E.info("saving data")
    R('''save( gcrma.eset, raw.data, file = "%s") ''' % outfile_r)

    exprs = R('''as.list(assayData(gcrma.eset))''')['exprs']
    probesets, headers = R('''dimnames( assayData(gcrma.eset)$exprs )''')

    # The dot is escaped: the previous pattern ".CEL" also matched any
    # character followed by "CEL" inside a sample name.
    headers = [re.sub(r"\.CEL", "", x) for x in headers]

    # ``with`` guarantees the table is closed when a write raises.
    with open(outfile_table, "w") as outf:
        outf.write("probeset\t%s\n" % "\t".join(headers))
        for probeset, row in zip(probesets, exprs):
            outf.write("%s\t%s\n" % (probeset, "\t".join(map(str, row))))
def r_pca_plot(x, argAxes=[1,2], argChoix="ind"):
    """Plot a PCA result with the R function ``plot.PCA``.

    The "FactoMineR" package is required. *argAxes* is converted with
    ``r_vector`` and passed as ``axes``; *argChoix* is passed as
    ``choix``. Returns the value of the R call.
    """
    r.library("FactoMineR")
    plot_pca = r["plot.PCA"]
    # The default list is only read here, never modified in this function.
    axes = r_vector(argAxes)
    return plot_pca(x, choix=argChoix, axes=axes)
def r_conPgsql(user, passwd, dbname):
    """Open one PostgreSQL connection from within R.

    Loads "RPostgreSQL", instantiates the "PostgreSQL" driver and
    returns the result of R's ``dbConnect`` for the given credentials.
    """
    r.library("RPostgreSQL")
    make_driver = r["dbDriver"]
    connect = r["dbConnect"]
    driver = make_driver("PostgreSQL")
    return connect(driver, user=user, password=passwd, dbname=dbname)
def q_value_analysis(p_values):
    """Compute q-values for a sequence of p-values via R.

    The values are wrapped in an rpy2 ``FloatVector`` and passed to
    ``qvalue`` from the R package of the same name; the resulting R
    object is returned as is.
    """
    logging.info("q-value analysis on %d p-values", len(p_values))

    # Deferred import: rpy2 is required only for this analysis.
    from rpy2.robjects import r, FloatVector

    r.library("qvalue")
    r_p_values = FloatVector(p_values)
    q_values = r.qvalue(r_p_values)
    return q_values
def build_bardmap(self, config):
    """
    Build the BARD reporting base maps.

    Parameters:
        config - The XML configuration.

    Returns:
        True if the base map was computed and written, False if any
        step of the R computation raised an error.
    """
    # The first geolevel is the base geolevel of EVERYTHING
    lbody = LegislativeBody.objects.all()[0]
    basegl = Geolevel.objects.get(id=lbody.get_base_geolevel())
    gconfig = config.xpath('//GeoLevels/GeoLevel[@name="%s"]' % basegl.name)[0]
    shapefile = gconfig.xpath('Shapefile')[0].get('path')
    srs = DataSource(shapefile)[0].srs
    bconfig = config.find('//Reporting/BardConfigs/BardConfig')

    if srs.name == 'WGS_1984_Web_Mercator_Auxiliary_Sphere':
        # because proj4 doesn't have definitions for this ESRI def,
        # but it does understand 3785
        srs = SpatialReference(3785)

    try:
        # We don't need to install rpy unless we're doing this
        from rpy2.robjects import r

        r.library('rgeos')
        logger.debug("Loaded rgeos library.")
        r.library('BARD')
        logger.debug("Loaded BARD library.")
        sdf = r.readShapePoly(shapefile, proj4string=r.CRS(srs.proj))
        logger.debug("Read shapefile '%s'.", shapefile)

        # Computing neighborhoods first (poly_findInBoxGEOS / poly2nb)
        # makes the basemap computation much faster, but requires vast
        # amounts of memory, so the plain call is used.
        bardmap = r.spatialDataFrame2bardBasemap(sdf, keepgeom=r(False))
        logger.debug("Created bardmap.")

        r.writeBardMap(bconfig.get('shape'), bardmap)
        logger.debug("Wrote bardmap to disk.")
    except Exception:
        # Only ordinary errors are reported as a failed build;
        # KeyboardInterrupt and SystemExit propagate to the caller.
        logger.info(""" ERROR: The BARD map could not be computed. Please check the configuration settings and try again. """)
        logger.debug(
            "The following traceback may provide more information:")
        logger.debug(traceback.format_exc())
        return False

    return True
def build_bardmap(self, config):
    """
    Build the BARD reporting base maps.

    Parameters:
        config - The XML configuration.

    Returns:
        False when loading the R libraries, reading the shapefile or
        computing/writing the base map fails; True otherwise.
    """
    # The first geolevel is the base geolevel of EVERYTHING
    lbody = LegislativeBody.objects.all()[0]
    basegl = Geolevel.objects.get(id=lbody.get_base_geolevel())
    gconfig = config.xpath('//GeoLevels/GeoLevel[@name="%s"]' % basegl.name)[0]
    shapefile = gconfig.xpath('Shapefile')[0].get('path')
    srs = DataSource(shapefile)[0].srs
    bconfig = config.find('//Reporting/BardConfigs/BardConfig')

    if srs.name == 'WGS_1984_Web_Mercator_Auxiliary_Sphere':
        # because proj4 doesn't have definitions for this ESRI def,
        # but it does understand 3785
        srs = SpatialReference(3785)

    try:
        # We don't need to install rpy unless we're doing this
        from rpy2.robjects import r

        r.library('rgeos')
        logger.debug("Loaded rgeos library.")
        r.library('BARD')
        logger.debug("Loaded BARD library.")
        sdf = r.readShapePoly(shapefile, proj4string=r.CRS(srs.proj))
        logger.debug("Read shapefile '%s'.", shapefile)

        # A neighborhood index (poly_findInBoxGEOS + poly2nb) would
        # speed up the basemap computation considerably, at the price
        # of vast amounts of memory; it is therefore not used.
        bardmap = r.spatialDataFrame2bardBasemap(sdf, keepgeom=r(False))
        logger.debug("Created bardmap.")

        r.writeBardMap(bconfig.get('shape'), bardmap)
        logger.debug("Wrote bardmap to disk.")
    except Exception:
        # ``except Exception`` instead of a bare ``except`` lets
        # KeyboardInterrupt/SystemExit through instead of reporting
        # them as a failed map build.
        logger.info(""" ERROR: The BARD map could not be computed. Please check the configuration settings and try again. """)
        logger.debug("The following traceback may provide more information:")
        logger.debug(traceback.format_exc())
        return False
    else:
        return True
def computePresenceCalls(infiles, outfile):
    """Calculate MAS5 detection p-values and Present/Marginal/Absent calls.

    Modified workflow from
    https://stat.ethz.ch/pipermail/bioconductor/2005-July/009758.html

    Note that this analysis only applies to 3' arrays, but not to other
    arrays like HuGene. It requires PM and MM probes, which might not
    exist on other arrays.

    *infiles* is a pair ``(infile_r, infile_table)``; only the R data
    file is loaded here (it provides ``raw.data`` and ``gcrma.eset``).
    Calls and p-values are written to ``<outfile>.calls`` and
    ``<outfile>.pval``; the list of probesets to keep goes to *outfile*.
    """
    R.library("simpleaffy")
    R.library("gcrma")

    infile_r, infile_table = infiles

    # load the normalized data
    # (raw.data and gcrma.eset)
    R("""load( '%s' )""" % infile_r)

    # calculate detection calls
    R("""call.eset = detection.p.val( raw.data, alpha1=%(mas5_alpha1)f, alpha2=%(mas5_alpha2)f, )""" % PARAMS)

    # output to files
    R("""write.table( call.eset$call, '%s.calls', sep='\t', quote=FALSE, col.names = NA, row.names = TRUE)""" % outfile)
    R("""write.table( call.eset$pval, '%s.pval', sep='\t', quote=FALSE, col.names = NA, row.names = TRUE)""" % outfile)

    # build list of probesets to keep
    # keep all probesets where at least 3 out of 4 replicates
    # are indicated as present (P/M) in at least one treatment group
    R("""absent = rowSums( call.eset$call == 'A')""")
    keep = R("""remove = names(absent[absent <= length(colnames(assayData( gcrma.eset )$exprs))-4] )""")

    # ``with`` closes the file even when a write fails.
    with open(outfile, "w") as outf:
        outf.write("probeset\n")
        outf.write("\n".join(map(str, keep)) + "\n")
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    """Import the KEGG annotations from the R KEGG.db annotations package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future releases.

    Entrez gene ids are mapped to ensembl gene ids via biomart
    (*mart*, *host*, *biomart_dataset*). One line per gene/pathway pair
    is written to *outfile* with the columns ontology, gene_id,
    kegg_ID, kegg_name and evidence.
    """
    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")
    entrez2ensembl = PipelineBiomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")
    entrez2ensembl = dict(
        (x['entrezgene'], x['ensembl_gene_id']) for x in entrez2ensembl)
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):
            try:
                gene = int(gene)
            except ValueError:
                # skip identifiers that are not numeric
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path[gene_column]:
                pathid = re.match("[a-z]+([0-9]+)", pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway), pathname, "NA"]) +
                    "\n")
    finally:
        # The file was previously left open; close it so buffered
        # output is flushed deterministically.
        outf.close()
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    """Import the KEGG annotations from the R KEGG.db annotations package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future releases.

    The entrez-to-ensembl mapping is fetched with biomaRt from the mart
    given by *mart*, *host* and *biomart_dataset*. For every entrez gene
    with an ensembl id, one line per KEGG pathway is written to
    *outfile* (columns: ontology, gene_id, kegg_ID, kegg_name, evidence).
    """
    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)
    entrez2ensembl = R.getBM(
        attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
        mart=mart)
    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:
            try:
                gene = int(gene)
            except ValueError:
                # skip identifiers that are not numeric
                continue

            if gene not in entrez2ensembl:
                continue
            ensid = entrez2ensembl[gene]

            for pathway in entrez2path.rx2(str(gene)):
                pathid = re.match("[a-z]+([0-9]+)", pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway), pathname, "NA"]) +
                    "\n")
    finally:
        # Close the output explicitly; it used to be left open.
        outf.close()
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''Import the KEGG annotations from the R KEGG.db annotations package.

    Note that since KEGG is no longer publicly available, this is not
    up-to-date and may be removed from bioconductor in future releases.

    *mart*, *host* and *biomart_dataset* select the biomart used to
    translate entrez gene ids into ensembl gene ids. The resulting
    gene-to-pathway table is written tab-separated to *outfile*.
    '''
    R.library("KEGG.db")
    R.library("biomaRt")

    E.info("getting entrez to ensembl mapping ...")
    mart = R.useMart(biomart=mart,
                     host=host,
                     path="/biomart/martservice",
                     dataset=biomart_dataset)
    entrez2ensembl = R.getBM(
        attributes=ro.StrVector(["ensembl_gene_id", "entrezgene"]),
        mart=mart)
    entrez = entrez2ensembl.rx2("entrezgene")
    ensembl = entrez2ensembl.rx2("ensembl_gene_id")
    entrez2ensembl = dict(zip(entrez, ensembl))
    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(zip(pathnames.names, R.unlist(pathnames)))
    E.info("Done")

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        for gene in entrez2path.names:
            try:
                gene = int(gene)
            except ValueError:
                # identifiers that are not plain integers are ignored
                continue

            if gene in entrez2ensembl:
                ensid = entrez2ensembl[gene]
            else:
                continue

            for pathway in entrez2path.rx2(str(gene)):
                pathid = re.match("[a-z]+([0-9]+)", pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway), pathname, "NA"]) +
                    "\n")
    finally:
        # Previously the handle was never closed; make sure buffered
        # lines reach the file whatever happens above.
        outf.close()
def __init__(self, *args, **kwargs):
    """Initialise both base classes and store the ggplot statement.

    Raises ValueError if no ``statement`` keyword option is given.
    ggplot2 is loaded only when ``R`` is truthy.
    """
    Renderer.__init__(self, *args, **kwargs)
    Plotter.__init__(self, *args, **kwargs)

    if "statement" in kwargs:
        self.statement = kwargs['statement']
    else:
        raise ValueError("r-ggplot renderer requires a statement option")

    if R:
        R.library('ggplot2')
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    """Download a dataset from biomart and iterate over its rows.

    *columns* is a list with the fields to obtain. *biomart*,
    *dataset*, *host*, *path* and *archive* denote the database and
    dataset to get the data from. *filters* (a sequence of filter
    names) and *values* optionally restrict the query; when omitted an
    empty string is passed to ``getBM`` for each.

    Yields one dict per row, keyed by the entries of *columns*.
    Raises AssertionError if the column names returned by R differ
    from *columns*.
    """
    R.library("biomaRt")

    mart = R.useMart(biomart=biomart,
                     dataset=dataset,
                     host=host,
                     path=path,
                     archive=archive)

    if filters is not None:
        # ``StrVector`` is the rpy2 vector class, not an R function, so
        # it has to come from rpy2 rather than be looked up through ``R``.
        filter_names = rpy2.robjects.vectors.StrVector(filters)
    else:
        filter_names = ""

    if values is not None:
        filter_values = values
    else:
        filter_values = ""

    # result is a dataframe
    result = R.getBM(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        filters=filter_names,
        values=filter_values,
        mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus try
    # numeric access
    assert tuple(result.colnames) == tuple(columns),\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(tuple(result.colnames)), tuple(columns))

    for data in zip(*[result[x] for x in range(len(columns))]):
        yield dict(zip(columns, data))
def add_result_for_graph(self, G):
    """Cluster the embedding of graph *G* with Mclust and record the result.

    Appends to ``self.mcr`` (the fit), ``self.khat`` (its ``'G'``
    entry), ``self.ari`` (adjusted Rand score) and ``self.error``
    (``num_diff_w_perms``), comparing the predicted classification with
    the node attribute ``'block'``.
    """
    embed.embed(G)
    x = embed.get_embedding(self.d, self.scale)

    r.library('mclust')
    fit = r.Mclust(x, G=self.kRange)
    self.mcr.append(fit)
    self.khat.append(fit['G'])

    # NOTE(review): if ``nx`` is networkx, its public helper is spelled
    # ``get_node_attributes`` and returns a dict - confirm that this
    # call resolves.
    true_labels = nx.get_node_attribute(G, 'block')
    # Shift the classification down by one to make the labels 0-based.
    predicted_labels = np.array(fit['classification']).astype(int) - 1

    self.ari.append(adjusted_rand_score(true_labels, predicted_labels))
    self.error.append(num_diff_w_perms(true_labels, predicted_labels))
def computePresenceCalls(infiles, outfile):
    '''Calculate MAS5 detection p-values and Present/Marginal/Absent calls.

    Modified workflow from
    https://stat.ethz.ch/pipermail/bioconductor/2005-July/009758.html

    Note that this analysis only applies to 3' arrays, but not to other
    arrays like HuGene. It requires PM and MM probes, which might not
    exist on other arrays.

    *infiles* unpacks into ``(infile_r, infile_table)``; the R data
    file must provide ``raw.data`` and ``gcrma.eset``. The calls and
    p-values are written to ``<outfile>.calls`` and ``<outfile>.pval``
    and the probesets to keep are listed in *outfile*.
    '''
    R.library("simpleaffy")
    R.library("gcrma")

    infile_r, infile_table = infiles

    # load the normalized data
    # (raw.data and gcrma.eset)
    R('''load( '%s' )''' % infile_r)

    # calculate detection calls
    R('''call.eset = detection.p.val( raw.data, alpha1=%(mas5_alpha1)f, alpha2=%(mas5_alpha2)f, )''' % PARAMS)

    # output to files
    R('''write.table( call.eset$call, '%s.calls', sep='\t', quote=FALSE, col.names = NA, row.names = TRUE)''' % outfile)
    R('''write.table( call.eset$pval, '%s.pval', sep='\t', quote=FALSE, col.names = NA, row.names = TRUE)''' % outfile)

    # build list of probesets to keep
    # keep all probesets where at least 3 out of 4 replicates
    # are indicated as present (P/M) in at least one treatment group
    R('''absent = rowSums( call.eset$call == 'A')''')
    keep = R(
        '''remove = names(absent[absent <= length(colnames(assayData( gcrma.eset )$exprs))-4] )'''
    )

    # The context manager releases the handle even if a write raises.
    with open(outfile, "w") as outf:
        outf.write("probeset\n")
        outf.write("\n".join(map(str, keep)) + "\n")
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''Download a dataset from biomart and output as a tab-separated table.

    Arguments
    ---------
    outfile : string
        Filename of output file
    columns : dict
        Dictionary mapping biomart columns to columns in the output table.
    biomart : string
        Biomart name
    dataset : string
        Biomart dataset
    host : string
        Biomart host
    '''
    R.library("biomaRt")

    # Materialise the keys as a list so that R receives a plain
    # sequence and header and rows share one fixed order.
    keys = list(columns.keys())

    # The default value for host in the biomaRt package is
    # www.biomart.org, but R errors if that default is specified
    # explicitly, while any other valid host is fine. Hence host is
    # only passed when it differs from the default.
    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    try:
        outf.write("\t".join([columns[x] for x in keys]) + "\n")
        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        # Do not leak the handle if writing fails part-way.
        outf.close()
def importFromBiomart(outfile,
                      columns,
                      biomart="ensembl",
                      dataset="hsapiens_gene_ensembl",
                      host='www.biomart.org'):
    '''Download a dataset from biomart and output as a tab-separated table.

    Arguments
    ---------
    outfile : string
        Filename of output file
    columns : dict
        Dictionary mapping biomart columns to columns in the output table.
    biomart : string
        Biomart name
    dataset : string
        Biomart dataset
    host : string
        Biomart host
    '''
    R.library("biomaRt")

    keys = list(columns.keys())

    # biomaRt's own default host is www.biomart.org, yet R errors when
    # that value is passed explicitly; any other valid host works.
    # Therefore host is only forwarded when it is not the default.
    if host == 'www.biomart.org':
        mart = R.useMart(biomart=biomart, dataset=dataset)
    else:
        mart = R.useMart(biomart=biomart, dataset=dataset, host=host)

    result = R.getBM(attributes=keys, mart=mart)

    outf = IOTools.openFile(outfile, "w")
    try:
        header = [columns[x] for x in keys]
        outf.write("\t".join(header) + "\n")
        for data in zip(*[result[x] for x in keys]):
            outf.write("\t".join(map(str, data)) + "\n")
    finally:
        # Guarantee the file is closed even if a row cannot be written.
        outf.close()
def rconnect(self, creator=None):
    """Open a connection to the database from within R.

    Does nothing if ``self.rdb`` is already set. Otherwise RSQLite is
    loaded and ``self.rdb`` becomes the result of *creator()* if a
    creator is given, or an SQLite connection derived from
    ``self.backend``.

    Raises NotImplementedError for non-sqlite backends.
    """
    if self.rdb:
        return

    R.library('RSQLite')

    if creator:
        self.rdb = creator()
    elif self.backend.startswith('sqlite'):
        # Strip the URL prefix to obtain the database file name.
        dbname = re.sub("sqlite:///./", "", self.backend)
        self.rdb = R.dbConnect(R.SQLite(), dbname=dbname)
    else:
        raise NotImplementedError(
            "can not connect to %s in R" % self.backend)
def mog_cross_validate(x):
    """Cross-validate an Mclust mixture model on *x*.

    For every (training, validation) split produced by
    ``k_fold_cross_validation(x, options.x_validate_groups)`` a model
    is fitted with R's ``Mclust`` on ``inv_sigmoid(training)`` and the
    density of the validation data under that model is evaluated with
    ``dens``. Returns the list of log densities over all folds.
    """
    import rpy2.robjects as robjects
    from rpy2.robjects import r as R
    from rpy2.robjects.packages import importr

    R.library('mclust')

    log_predictions = []
    folds = k_fold_cross_validation(x, options.x_validate_groups)
    for training, validation in folds:
        training_vector = robjects.FloatVector(inv_sigmoid(training))
        fitted = R.Mclust(training_vector)
        densities = R.dens(
            modelName=fitted.rx2('modelName'),
            data=robjects.FloatVector(validation),
            parameters=fitted.rx2('parameters'))
        log_predictions.extend(np.log(densities))
    return log_predictions
def estimateExpression(infiles, outfile):
    '''Estimate expression levels with RMA from the R "affy" package.

    Reads *infiles* with ``ReadAffy``, normalizes with ``rma``, draws
    boxplots of the raw and normalized data and prints the assay data
    as a list. *outfile* is not written by this function.
    '''
    R.library("affy")

    E.info("reading data")
    raw = R.ReadAffy(infiles)

    E.info("RMA normalization")
    normalized = R.rma(raw)

    R.boxplot(raw)
    R.boxplot(normalized)

    print(R.as_list(R.assayData(normalized)))
def estimateExpression(infiles, outfile):
    """Estimate expression levels.

    The arrays in *infiles* are read and RMA-normalized through the R
    "affy" package; boxplots are drawn before and after normalization
    and the normalized assay data is printed. Nothing is written to
    *outfile* here.
    """
    R.library("affy")

    E.info("reading data")
    raw_data = R.ReadAffy(infiles)

    E.info("RMA normalization")
    eset = R.rma(raw_data)

    # One boxplot per data set: raw first, then normalized.
    for dataset in (raw_data, eset):
        R.boxplot(dataset)

    assay_data = R.assayData(eset)
    print(R.as_list(assay_data))
def clusterize_r_em(*args):
    """Cluster and plot each argument with an EM Gaussian mixture (R mclust).

    Every positional argument is fitted with ``Mclust``; the model and
    its summary are printed and plotted in a quartz device, then the
    function waits for the user before moving on.

    Exits the process with status -1 if rpy2 cannot be imported.
    """
    try:
        from rpy2.robjects import r
        import rpy2.robjects.numpy2ri
        rpy2.robjects.numpy2ri.activate()
    except ImportError:
        # Only a missing rpy2 is reported this way; other failures
        # (and KeyboardInterrupt/SystemExit) are no longer disguised
        # as a missing dependency.
        print("You need rpy2")
        sys.exit(-1)

    r.library("mclust")
    for arg in args:
        model = r.Mclust(arg)
        print(model)
        print(r.summary(model))
        r.quartz("plot")
        r.plot(model, arg)
        # Pause so the plot can be inspected.
        print(raw_input("any key to pass"))
def render(self, dataframe, path):
    """Build a ggplot object from *dataframe* and ``self.statement``.

    The index of *dataframe* is turned into columns (in place), the
    frame is converted to an R data frame and ``gp + <statement>`` is
    evaluated in R. Returns ``ResultBlocks`` holding one block that
    carries the plot (``rggplot``) and its figure name (``figname``).

    Raises ValueError if R cannot interpret the statement.
    """
    R.library('ggplot2')

    # add all indices as columns
    dataframe.reset_index(inplace=True)
    rframe = pandas.rpy.common.convert_to_r_dataframe(dataframe)

    # Reset the class of every column by storage type; see
    # http://stackoverflow.com/questions/12865218/getting-rid-of-asis-class-attribute
    strip_asis = R('''function (x) {
        if(typeof(x) %in% c("integer","double")) {
            class(x) <- "numeric"
            return (x)} else if (typeof(x) == "character") {
            class(x) <- "character"
            return (x) } else {
            return(x) } }''')
    as_data_frame = R["as.data.frame"]
    rframe = as_data_frame(R.lapply(rframe, strip_asis))
    R.assign("rframe", rframe)

    # start plot
    R('''gp = ggplot(rframe)''')

    # add aesthetics and geometries
    try:
        plot = R('''gp + %s ''' % self.statement)
    except ValueError as msg:
        raise ValueError(
            "could not interprete R statement: "
            "gp + %s; msg=%s" % (self.statement, msg))

    figname = re.sub('/', '_', path2str(path))
    block = ResultBlock('#$ggplot %s$#' % figname, title=path2str(path))
    block.rggplot = plot
    block.figname = figname
    return ResultBlocks(block)
def fit(self, X_train, y_train, subset_no):
    """Compute feature importances with the R package rmcfs.

    *X_train* is temporarily modified in place: its columns are renamed
    to ``f1..fn`` and *y_train* is added as column ``TARGET`` before
    the frame is handed to R. The original layout is restored
    afterwards, also when the R computation fails.

    The relative importances reported by ``mcfs`` are stored in
    ``self.features_importances`` in the original column order.
    *subset_no* is not used.
    """
    from rpy2.robjects import r, pandas2ri
    import rpy2.robjects as ro
    from rpy2.robjects.conversion import localconverter

    print('.libPaths( c( "{}" , .libPaths() ) )'.format(scratch_path))
    r('.libPaths( c( "{}" , .libPaths() ) )'.format(scratch_path))
    r('install.packages("rmcfs", repos="https://cloud.r-project.org/")')
    r.library('rmcfs')
    r.library('dplyr')

    original_features = X_train.columns.values
    X_train.columns = [
        'f' + str(i + 1) for i in range(len(original_features))
    ]
    try:
        X_train[TARGET] = y_train

        with localconverter(ro.default_converter + pandas2ri.converter):
            r_from_pd_df = ro.conversion.py2rpy(X_train)
        r.assign('df1', r_from_pd_df)

        r('result <- mcfs({} ~ ., df1, cutoffPermutations = {}, seed = 2, threadsNumber = 16)'
          .format(TARGET, self.cutoff_permutations))
        r('RI <- result$RI')
        ri = r['RI']

        with localconverter(ro.default_converter + pandas2ri.converter):
            pd_from_r_df = ro.conversion.rpy2py(ri)

        features_to_ri = dict()
        for _, row in pd_from_r_df.iterrows():
            features_to_ri[row['attribute']] = row['RI']
    finally:
        # Always hand the caller's frame back in its original shape,
        # even if conversion or the R call raised.
        if TARGET in X_train.columns:
            del X_train[TARGET]
        X_train.columns = original_features

    self.features_importances = [
        features_to_ri['f' + str(i + 1)]
        for i in range(0, len(original_features))
    ]
    print('RMCFS:FEATURE:IMPORTANCES', self.features_importances)
def hclust(pts, distance, clust_method="single", use_great_circle=True):
    """Hierarchical clustering; exposes R's ``hclust`` in Python.

    Algorithm:
     - compute the distance between every pair of points in *pts* with
       ``spDists`` from the R package sp (``longlat=use_great_circle``)
     - cluster the points with ``hclust`` using *clust_method*
     - cut the tree with ``cutree`` at height *distance*

    Per the original notes, *distance* is in kilometers when
    *use_great_circle* is true, otherwise in the metric of *pts*.

    Returns a list of clusters, each a list of indices into *pts*.
    """
    rpy2.robjects.numpy2ri.activate()
    coordinates = numpy.array(pts)

    r.library('sp')
    pairwise = r.spDists(coordinates, longlat=use_great_circle)
    as_dist = r('as.dist')
    dist_object = as_dist(pairwise)
    tree = r.hclust(dist_object, method=clust_method)
    labels = r.cutree(tree, h=distance)

    # ``labels`` holds one cluster number per input point, e.g.
    # [1, 3, 3, 1, 2, 1] puts points 0, 3 and 5 into cluster 1.
    # Invert that into one index list per cluster. R numbers clusters
    # from 1, Python lists from 0 - hence the "- 1".
    groups = [[] for _ in range(max(labels))]
    for point_index, label in enumerate(labels):
        groups[label - 1].append(point_index)
    return groups
def getProbeset2Gene(database):
    '''Build a map relating each probeset to its ENSEMBL gene_ids.

    *database* is the name of an R annotation package ending in
    ``.db``; the Bimap ``<prefix>ENSEMBL`` of that package is read.

    Returns a defaultdict mapping probe_id to a list of ensembl ids.
    '''
    bimap_name = database[:-len(".db")] + "ENSEMBL"

    R.library(database)

    # the named object is a Bimap; toTable flattens it
    table = R.toTable(R(bimap_name))

    probeset2genes = collections.defaultdict(list)
    pairs = zip(table["probe_id"], table["ensembl_id"])
    for probeset_id, gene_id in pairs:
        probeset2genes[probeset_id].append(gene_id)

    E.info("obtained %i mappings: probes=%i, genes=%i" %
           (len(table),
            len(set(table["probe_id"])),
            len(set(table["ensembl_id"]))))
    return probeset2genes
def getProbeset2Gene(database):
    """Map probesets to ENSEMBL gene ids using an R annotation package.

    The package *database* (name ending in ``.db``) is loaded and its
    ``<prefix>ENSEMBL`` Bimap converted to a table with the columns
    ``probe_id`` and ``ensembl_id``.

    Returns a defaultdict(list): probe_id -> list of ensembl ids.
    """
    prefix = database[: -len(".db")]
    R.library(database)

    # the ENSEMBL object is a Bimap
    bimap = R(prefix + "ENSEMBL")
    result = R.toTable(bimap)

    probeset2gene = collections.defaultdict(list)
    for probeset_id, gene_id in zip(result["probe_id"], result["ensembl_id"]):
        probeset2gene[probeset_id].append(gene_id)

    n_mappings = len(result)
    n_probes = len(set(result["probe_id"]))
    n_genes = len(set(result["ensembl_id"]))
    E.info(
        "obtained %i mappings: probes=%i, genes=%i"
        % (n_mappings, n_probes, n_genes)
    )
    return probeset2gene
def render(self, dataframe, path):
    """Create a ggplot from *dataframe* using ``self.statement``.

    All index levels of *dataframe* become columns (the frame is
    modified in place), the frame is converted to R with pandas2ri and
    the R expression ``gp + <statement>`` is evaluated.

    Returns ``ResultBlocks`` with a single block that stores the plot
    object in ``rggplot`` and the figure name in ``figname``.
    Raises ValueError if R rejects the statement.
    """
    R.library('ggplot2')

    # add all indices as columns
    dataframe.reset_index(inplace=True)
    rframe = rpy2.robjects.pandas2ri.py2ri(dataframe)

    # Drop the "AsIs" class from every column; see
    # http://stackoverflow.com/questions/12865218/getting-rid-of-asis-class-attribute
    drop_asis = R('''function (x) {
        if("AsIs" %in% class(x)) {
            class(x) <- class(x)[-match("AsIs", class(x))] }
        return (x) } ''')
    to_data_frame = R["as.data.frame"]
    rframe = to_data_frame(R.lapply(rframe, drop_asis))
    R.assign("rframe", rframe)

    # start plot
    R('''gp = ggplot(rframe)''')

    # add aesthetics and geometries
    try:
        ggplot_object = R('''gp + %s ''' % self.statement)
    except ValueError as msg:
        raise ValueError(
            "could not interprete R statement: "
            "gp + %s; msg=%s" % (self.statement, msg))

    figname = re.sub('/', '_', path2str(path))
    result = ResultBlock('#$ggplot %s$#' % figname, title=path2str(path))
    result.rggplot = ggplot_object
    result.figname = figname
    return ResultBlocks(result)
def clusterize_r_em(*args, **kwargs):
    """Cluster and plot each argument with an EM Gaussian mixture (R mclust).

    With the keyword option ``clf_on_pca`` set to a true value, each
    argument is first projected onto two principal components. The
    fitted model and its summary are printed and plotted in a quartz
    device; the function then waits for the user.

    Exits the process with status -1 if a required package cannot be
    imported.
    """
    try:
        from rpy2.robjects import r
        import rpy2.robjects.numpy2ri
        rpy2.robjects.numpy2ri.activate()
        from sklearn.decomposition import PCA
    except ImportError:
        # Restricting the handler to ImportError keeps unrelated errors
        # (and KeyboardInterrupt/SystemExit) from being reported as a
        # missing dependency. Note that a missing scikit-learn also
        # ends up here, although the message only names rpy2.
        print("You need rpy2")
        sys.exit(-1)

    r.library("mclust")
    for arg in args:
        if kwargs.get('clf_on_pca', False):
            pca = PCA(2)
            arg = pca.fit(arg).transform(arg)
        model = r.Mclust(arg)
        print(model)
        print(r.summary(model))
        r.quartz("plot")
        r.plot(model, arg)
        # Pause so the plot can be inspected.
        print(raw_input("press any key to pass"))
def render(self, dataframe, path):
    """Render *dataframe* as a ggplot defined by ``self.statement``.

    The frame is converted to an R data frame, column classes are
    normalized and ``gp + <statement>`` is evaluated in R. Returns
    ``ResultBlocks`` wrapping one block with the plot (``rggplot``)
    and its figure name (``figname``).

    Raises ValueError if the statement cannot be interpreted by R.
    """
    R.library('ggplot2')

    rframe = pandas.rpy.common.convert_to_r_dataframe(dataframe)

    # Normalize the class of each column according to its storage
    # type (numeric for integer/double, character for character).
    normalize_class = R('''function (x) {
        if(typeof(x) %in% c("integer","double")) {
            class(x) <- "numeric"
            return (x)} else if (typeof(x) == "character") {
            class(x) <- "character"
            return (x) } else {
            return(x) } }''')
    columns = R.lapply(rframe, normalize_class)
    rframe = R["as.data.frame"](columns)
    R.assign("rframe", rframe)

    # start plot
    R('''gp = ggplot( rframe )''')

    # add aesthetics and geometries
    try:
        plot_object = R('''gp + %s ''' % self.statement)
    except ValueError as msg:
        raise ValueError(
            "could not interprete R statement: gp + %s; msg=%s" %
            (self.statement, msg))

    figname = re.sub('/', '_', path2str(path))
    block = ResultBlock('#$ggplot %s$#' % figname, title=path2str(path))
    block.rggplot = plot_object
    block.figname = figname
    return ResultBlocks(block)
def getProbeset2Location(database="hgu133plus2.db"):
    '''Build a map with genomic coordinates for each probeset.

    The mapping is not necessarily unique: every probeset maps to a
    list of ``(contig, start, end)`` tuples. Coordinates come from the
    ``CHRLOC``/``CHRLOCEND`` Bimaps and contig lengths from
    ``CHRLENGTHS`` of the R annotation package *database*.
    '''
    R.library(database)

    prefix = database[:-len(".db")]
    contig_lengths = dict(R(prefix + "CHRLENGTHS"))

    # CHRLOC and CHRLOCEND are Bimap objects
    starts = R.toTable(R(prefix + "CHRLOC"))
    ends = R.toTable(R(prefix + "CHRLOCEND"))

    locations = collections.defaultdict(list)

    # make sure order is the same
    assert starts["probe_id"] == ends["probe_id"]

    rows = zip(starts["probe_id"],
               starts["Chromosome"],
               starts["start_location"],
               ends["end_location"])
    for probeset_id, contig, start, end in rows:
        if start < 0:
            # NOTE(review): negative starts are rewritten relative to
            # the contig length - confirm the intended sign convention.
            length = contig_lengths[contig]
            start = length - start
            end = length - end
        locations[probeset_id].append((contig, start, end))

    E.info("mappings: probes=%i, contigs=%i" % (
        len(set(starts["probe_id"])),
        len(set(starts["Chromosome"])),
    ))
    return locations
def getProbeset2Location(database="hgu133plus2.db"):
    '''collect genomic coordinates for every probeset of *database*.

    Returns a dictionary of probeset id to a list of
    ``(contig, start, end)`` tuples; the mapping is not necessarily
    unique.
    '''
    R.library(database)
    prefix = database[:-len(".db")]

    def fetch_table(suffix):
        # the annotation object is a Bimap; toTable turns it into a table
        return R.toTable(R(prefix + suffix))

    contigs = dict(R(prefix + "CHRLENGTHS"))
    result2start = fetch_table("CHRLOC")
    result2end = fetch_table("CHRLOCEND")

    mapping = collections.defaultdict(list)

    # make sure order is the same
    assert result2start["probe_id"] == result2end["probe_id"]

    for record in zip(result2start["probe_id"],
                      result2start["Chromosome"],
                      result2start["start_location"],
                      result2end["end_location"]):
        probeset_id, contig, start, end = record
        if not start < 0:
            mapping[probeset_id].append((contig, start, end))
            continue
        # negative start: both coordinates become contig length minus value
        start = contigs[contig] - start
        end = contigs[contig] - end
        mapping[probeset_id].append((contig, start, end))

    counts = (len(set(result2start["probe_id"])),
              len(set(result2start["Chromosome"])))
    E.info("mappings: probes=%i, contigs=%i" % counts)

    return mapping
def CalculateEVDParameters(MaxSimilarities):
    """Fit an extreme value distribution to *MaxSimilarities* with R.

    The scores are sorted, handed to ``gev.fit`` from the R package
    ``ismev`` and the fitted parameters are read back by position.

    Returns the tuple ``(Xi, Mu, Sigma, eXi, eMu, eSigma)`` where Mu is
    the location, Sigma the scale, Xi the shape and the ``e``-prefixed
    values are the corresponding standard errors.
    """
    import rpy2.robjects as robjects
    from rpy2.robjects import r

    # R vector holding the scores in ascending order
    sorted_scores = robjects.FloatVector(sorted(MaxSimilarities))

    # maximum likelihood estimates via the "ismev" package
    r.library("ismev")
    fit = robjects.r['gev.fit'](sorted_scores)

    # element 6 carries the estimates, element 8 their standard errors;
    # both are ordered (Mu, Sigma, Xi)
    estimates = fit[6]
    errors = fit[8]
    Mu, Sigma, Xi = estimates[0], estimates[1], estimates[2]
    eMu, eSigma, eXi = errors[0], errors[1], errors[2]

    return (Xi, Mu, Sigma, eXi, eMu, eSigma)
from rpy2.robjects import r
import os
import rpy2.robjects.packages as rpackages
import rpy2.robjects as robj

utils=rpackages.importr('utils') # load the R base package "utils"

# Prepare the embedded R session: working directory, plyr, the saved
# workspace and an empty data frame named table_region.
robj.r('setwd("./")')
r.library('plyr')
# presumably main.RData provides `table_united`, which the helpers below
# read - TODO confirm
robj.r('load("~/MediWeb-master/R_file/main.RData")')
robj.r('table_region<-data.frame()')

##### Category by Region #####
# start
# R helper start(sido_s, sggu_s): returns the index of the first row of
# table_united whose item.sidoCdNm and item.sgguCdNm both equal the
# arguments.
robj.r('start<-function(sido_s, sggu_s){\n'
       'for(i in 1:nrow(table_united)){\n'
       'if((sido_s==as.character(table_united$item.sidoCdNm[i])) && '
       '(sggu_s==as.character(table_united$item.sgguCdNm[i]))) {\n'
       'start_num=i\n'
       'return(start_num)}}}')

# end
# R helper end(sido_e, sggu_e, start_num): scans forward from start_num and
# returns the index just before the first row that no longer matches both
# arguments.
robj.r('end<-function(sido_e, sggu_e, start_num){\n'
       'for(i in start_num:nrow(table_united)){\n'
       'if(!((sido_e==as.character(table_united$item.sidoCdNm[i])) && '
       '(sggu_e==as.character(table_united$item.sgguCdNm[i])))) {\n'
       'end_num=i-1\n'
       'return(end_num)}}}')
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Runs the R package ``spp`` on a ChIP sample and a control: estimates
    binding characteristics, writes a cross-correlation plot, broad
    peaks, summits and narrow peaks.  Two positional arguments are
    required: the sample file and the output prefix.  The control file
    is given with ``--control-filename`` and is mandatory.

    Raises ValueError if the positional arguments or the control file
    are missing.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--input-format", dest="input_format",
                      type="choice",
                      choices=("bam", ),
                      help="input file format [default=%default].")

    parser.add_option("-w", "--window-size", dest="window_size",
                      type="int",
                      help="window size [default=%default].")

    parser.add_option("-c", "--control-filename",
                      dest="control_filename",
                      type="string",
                      help="filename of input/control data in "
                      "bed format [default=%default].")

    parser.add_option("-t", "--threads", dest="threads", type="int",
                      help="number of threads to use [default=%default].")

    parser.add_option("-q", "--fdr-threshold",
                      dest="fdr_threshold", type="float",
                      help="fdr threshold [default=%default].")

    parser.add_option("-z", "--spp-z-threshold",
                      dest="z_threshold", type="float",
                      help="z threshold [default=%default].")

    parser.add_option("--bin", dest="bin", type="int",
                      help="bin tags within the specified number "
                      " of basepairs to speed up calculation;"
                      " increasing bin size decreases the accuracy "
                      "of the determined parameters [default=%default]")

    parser.add_option("--spp-srange-min", dest="srange_min", type="float",
                      help="srange gives the possible range for the "
                      " size of the protected region;"
                      " srange should be higher than tag length; "
                      " making the upper boundary too high"
                      " will increase calculation time [%default]")

    parser.add_option("--spp-srange-max", dest="srange_max", type="float",
                      help="srange gives the possible range for the "
                      " size of the protected region;"
                      " srange should be higher than tag length; "
                      " making the upper boundary too high"
                      " will increase calculation time [%default]")

    parser.set_defaults(
        input_format="bam",
        threads=1,
        fdr_threshold=0.05,
        window_size=1000,
        offset=125,
        srange_min=50,
        srange_max=500,
        bin=5,
        z_threshold=3,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please specify a filename with sample data and an output file")

    filename_sample, filename_output = args[0], args[1]
    filename_control = options.control_filename

    # The control is read unconditionally below and used by every peak
    # calling step; without this check the literal 'None' would be handed
    # to read.bam.tags and fail inside R with an unrelated message.
    if not filename_control:
        raise ValueError(
            "please specify a control file with --control-filename")

    # load spp and snow (snow provides the cluster used for parallel runs)
    R.library('spp')
    R.library('snow')

    # read data
    E.info("reading data")
    R('''chip.data <- read.bam.tags('%s')''' % filename_sample)
    R('''input.data <- read.bam.tags('%s')''' % filename_control)
    R('''cluster = makeCluster( %i )''' % (options.threads))

    E.info("computing binding characteristics")
    # get binding info from cross-correlation profile

    # srange gives the possible range for the size of the protected region;
    # srange should be higher than tag length; making the upper boundary too
    # high will increase calculation time

    # bin - bin tags within the specified number of basepairs to speed
    # up calculation; increasing bin size decreases the accuracy of
    # the determined parameters

    # NOTE: the local names below are interpolated through locals() into
    # the R statements - do not rename them.
    srange_min, srange_max = options.srange_min, options.srange_max
    bin = options.bin
    R('''binding.characteristics <- get.binding.characteristics(chip.data,
    srange=c(%(srange_min)i,%(srange_max)i),
    bin=%(bin)s,
    cluster=cluster);''' % locals())

    # print out binding peak separation distance
    options.stdout.write(
        "shift\t%i\n" % R('''binding.characteristics$peak$x''')[0])

    ##################################################
    ##################################################
    ##################################################
    E.info("plot cross correlation profile")
    # plot cross-correlation profile
    R('''pdf(file="%s.crosscorrelation.pdf",width=5,height=5)''' %
      filename_output)
    R('''par(mar = c(3.5,3.5,1.0,0.5), mgp = c(2,0.65,0), cex = 0.8);''')
    R('''plot(binding.characteristics$cross.correlation,
    type='l',
    xlab="strand shift",
    ylab="cross-correlation");''')
    R('''abline(v=binding.characteristics$peak$x,lty=2,col=2)''')
    R('''dev.off();''')

    E.info("selecting informative tags based on the binding characteristics")
    # select informative tags based on the binding characteristics
    R('''chip.data <- select.informative.tags(
    chip.data,binding.characteristics);''')
    R('''input.data <- select.informative.tags(
    input.data,binding.characteristics);''')

    E.info("outputting broad peaks")
    window_size, z_threshold = options.window_size, options.z_threshold
    R('''broad.clusters <- get.broad.enrichment.clusters(chip.data,input.data,
    window.size=%(window_size)i,
    z.thr=%(z_threshold)f,
    tag.shift=round(binding.characteristics$peak$x/2))''' % locals())
    # write out in broadPeak format
    R('''write.broadpeak.info(broad.clusters,"%s.broadpeak.txt")''' %
      filename_output)

    # binding detection parameters desired FDR (1%). Alternatively, an
    # E-value can be supplied to the method calls below instead of the
    # fdr parameter the binding.characteristics contains the optimized
    # half-size for binding detection window
    R('''detection.window.halfsize <- binding.characteristics$whs;''')

    # determine binding positions using wtd method
    E.info("determining binding positions using wtd method")
    fdr = options.fdr_threshold
    R('''bp <- find.binding.positions(
    signal.data=chip.data,control.data=input.data,
    fdr=%(fdr)f,whs=detection.window.halfsize,cluster=cluster)''' % locals())
    options.stdout.write(
        "detected_peaks\t%i\n" %
        R('''sum(unlist(lapply(bp$npl,function(d) length(d$x))))''')[0])

    # output detected binding positions
    R('''output.binding.results(bp,"%s.summit.txt");''' % filename_output)

    R('''bp <- add.broad.peak.regions(chip.data,input.data,bp,
    window.size=%(window_size)i,z.thr=%(z_threshold)f)''' % locals())
    # output using narrowPeak format
    R('''write.narrowpeak.binding(bp,"%s.narrowpeak.txt")''' %
      filename_output)

    # write footer and output benchmark information.
    E.stop()
# <headingcell level=4> # ggplot2 in python with Rpy2 # <markdowncell> # Thanks to [Fei Yu](http://www.thefeiyu.com/) for this vignette. # <codecell> import rpy2.robjects as robjects from rpy2.robjects.packages import importr r = robjects.r r.library("ggplot2") rnorm = r["rnorm"] dtf = robjects.DataFrame({ 'x': rnorm(300, mean=0) + rnorm(100, mean=3), 'y': rnorm(300, mean=0) + rnorm(100, mean=3) }) robjects.globalenv['dtf'] = dtf # assign to R's environment r("gp = ggplot(dtf, aes(x, y))") r("pp = gp + geom_point(size=0.6)") # r("plot(pp)") # I can't get this to work on my system, but saving the plot is just as good. r("ggsave(filename='test.png', plot=pp, scale=0.3)") from IPython.core.display import Image Image(filename='test.png')
(O_Name2,O_Ext2) = os.path.splitext(O_File2)

# Read the second trail file once.  Each line holds two tab-separated
# numbers that form one 2-D vector.  The file is closed as soon as its
# lines are in memory; the line count is taken from the same read instead
# of opening the file a second time and leaving that handle unclosed.
with open(options.TxtFile2,'r') as TrailFile2:
    TrailLines2 = TrailFile2.readlines()
LineCount2 = len(TrailLines2)

TrailVectorsAll_2 = []
for line2 in TrailLines2:
    line2 = line2.rstrip('\n')
    element2 = line2.split('\t')
    TrailVectorsAll_2.append( [float(element2[0]),float(element2[1])] )

import rpy2.robjects as robjects
import rpy2.rpy_classic as rpy
from rpy2.robjects import r
r.library("CircStats")

# Angle (as returned by PS_Maths), the same angle scaled by pi/180, and
# magnitude of every vector of the first trail.
Angle1 = []
Radians1 = []
Mags1 = []
i = 0
while i < len(TrailVectorsAll_1):
    # compute the angle once and reuse it for the radian conversion
    VectorAngle1 = PS_Maths.CalculateVectorAngle( TrailVectorsAll_1[i] )
    Angle1.append( float(VectorAngle1) )
    Radians1.append( float(((math.pi/180)*VectorAngle1)) )
    Mags1.append( PS_Maths.CalculateVectorMagnitude( TrailVectorsAll_1[i] ))
    i += 1
rRadians1 = robjects.FloatVector(Radians1)

Angle2 = []
Radians2 = []
Mags2 = []
pd_from_r_df = ro.conversion.rpy2py(df) return pd_from_r_df

def get_fungi_mart(ds):
    """Return the biomaRt mart for dataset *ds* of ``fungi_mart`` on the
    Ensembl Fungi host."""
    return R.useMart(biomart = 'fungi_mart', host = "https://fungi.ensembl.org/", dataset=ds)

def return_bm_df(atts, mart):
    """Fetch the attributes *atts* from *mart* with ``getBM`` and return the
    result converted by ``r2pd_dataframe``."""
    atts_vector = StrVector([x for x in atts])
    BM = R.getBM(attributes = atts_vector, mart = mart)
    return r2pd_dataframe(BM)

# first command line argument; not used in the part of the script shown here
base = sys.argv[1]
R.library("biomaRt")
print(R.listMarts(host = "https://fungi.ensembl.org/"))

# Get the fungi marts
mart_names = ['fculmorum_eg_gene' ,'fgraminearum_eg_gene', 'fpseudograminearum_eg_gene']
marts = [get_fungi_mart(mart) for mart in mart_names]
f_culmorum_mart, f_graminearum_mart, f_pseudogram_mart = marts[0], marts[1], marts[2]

# Get the attributes and the filters
# NOTE(review): the third name is spelled `f_pseudogram_attriibtes`; left
# untouched because code outside this view may refer to it.
f_culmorum_attributes, f_graminearum_attributes, f_pseudogram_attriibtes = R.listAttributes(f_culmorum_mart), R.listAttributes(f_graminearum_mart), R.listAttributes(f_pseudogram_mart)
f_cul_att_df = r2pd_dataframe(f_culmorum_attributes)
# attribute names that start with "f"
gram_df = f_cul_att_df.name[f_cul_att_df.name.str.startswith('f')]
# NOTE(review): the returned string is discarded here - presumably meant to
# be printed or displayed; confirm.
gram_df.to_string()
# Contig sizes are only available when a genome file was given.
if options.genome_file:
    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

# No tool selected means: run everything.
if len(options.toolset) == 0:
    options.toolset = ["all"]

do_all = "all" in options.toolset

# Build the optional chr.select=c("a","b"), argument fragment for R from a
# comma-separated list of chromosomes; empty when no restriction is given.
if options.chroms is None:
    chrstring = ""
else:
    chroms = options.chroms.split(",")
    chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
# load MEDIPS
R.library('MEDIPS')
# the BSgenome package name is derived from the UCSC genome identifier
genome_file = 'BSgenome.%s' % options.ucsc_genome
R.library(genome_file)

# Copy options into plain local names.
# NOTE(review): presumably these are interpolated into R statements later
# (e.g. via locals()), so the names should not be changed - confirm.
window_size = options.window_size
extend = options.extend
shift = options.shift
saturation_iterations = options.saturation_iterations

uniq = float(options.unique)

# R boolean literal for the bwa flag
if options.bwa is True:
    BWA = "TRUE"
else:
    BWA = "FALSE"
class GENES_TO_UNIPROT_CLASS:
    #Takes list of genes and returns uniprots. Details of output in each definition
    """Implementation for the biomaRt (R) gene id to uniprot using the hsapiens_gene_ensembl

    Instantiating the class queries biomaRt for the ``hgnc_symbol`` /
    ``uniprot_swissprot_accession`` pairs of the given genes, then looks
    up the genes without an accession in a pickled dictionary downloaded
    from a fixed URL.  The query methods below only read the pair lists
    built by ``__init__``.

    Attributes set by ``__init__``:
        GENE_UNIPROT_PAIRS: every ``[gene, uniprot]`` pair from biomaRt.
        PRE_GENE_UNIPROT_PAIRS_FILTERED: biomaRt pairs with a non-empty
            uniprot entry.
        PLUS_GENE_UNIPROT_PAIRS: pairs found in the downloaded dictionary.
        GENE_UNIPROT_PAIRS_FILTERED: the two lists above, concatenated.

    Note that biomaRt is loaded and the mart is selected when the class
    body is executed, i.e. at import time.
    """

    #Load rpy2 needed libraries
    from rpy2 import robjects
    from rpy2.robjects import r

    #Declare r objects
    c = robjects.r["c"]

    #Load biomaRt
    r.library("biomaRt")

    #Choose biomaRt hsapiens
    ensembl = robjects.r["useMart"]("ensembl", dataset="hsapiens_gene_ensembl")

    def __init__(self, GENES):
        self.robjects = GENES_TO_UNIPROT_CLASS.robjects
        self.c = GENES_TO_UNIPROT_CLASS.c
        self.GENES = GENES
        self.ensembl = GENES_TO_UNIPROT_CLASS.ensembl

        #Vectorize gene list
        self.gene_vector = self.robjects.StrVector(self.GENES)

        #Call biomaRt
        RESULTS = self.robjects.r["getBM"](
            attributes=self.c("hgnc_symbol", "uniprot_swissprot_accession"),
            filters="hgnc_symbol",
            values=self.gene_vector,
            mart=self.ensembl)

        #Make RESULTS pythonic
        hgnc = list(RESULTS.rx2(1))
        uniprot = list(RESULTS.rx2(2))
        self.GENE_UNIPROT_PAIRS = [list(x) for x in zip(hgnc, uniprot)]

        #To filter out record that do not have a uniprot in file
        self.PRE_GENE_UNIPROT_PAIRS_FILTERED = [
            pair for pair in self.GENE_UNIPROT_PAIRS if len(pair[1]) > 0]

        #Further search against personal database
        # The matched genes are collected once; the previous per-gene
        # zip(...)[0] raised IndexError when biomaRt returned no usable pair.
        matched = self._genes_of(self.PRE_GENE_UNIPROT_PAIRS_FILTERED)
        LEFTOVER_GENE_SET = [gene for gene in self.GENES
                             if gene not in matched]

        import urllib2
        import pickle
        from contextlib import closing

        # NOTE(review): security - unpickling data fetched over the network
        # executes whatever the remote file contains.  Kept because callers
        # rely on this lookup; consider a safe format or a checksum.
        with closing(urllib2.urlopen(
                "https://sites.google.com/site/jdatabases/home/uniprot/DICT_UNIPROT_TO_GENES_REVIEWED.pi?attredirects=0&d=1"
        )) as REVIEWED_FILE:
            REVIEWED_DICT = pickle.load(REVIEWED_FILE)

        self.PLUS_GENE_UNIPROT_PAIRS = []
        for gene in LEFTOVER_GENE_SET:
            for key, value in REVIEWED_DICT.iteritems():
                # the lower-case test accounts for lower case records
                if gene in value or gene.lower() in value:
                    self.PLUS_GENE_UNIPROT_PAIRS.append([gene, key])

        #Add values found in personal database to biomart found values
        self.GENE_UNIPROT_PAIRS_FILTERED = (
            self.PRE_GENE_UNIPROT_PAIRS_FILTERED +
            self.PLUS_GENE_UNIPROT_PAIRS)

    def _genes_of(self, pairs):
        """Return the set of genes (first element) of *pairs*; empty input
        gives an empty set."""
        return set(pair[0] for pair in pairs)

    def has_uniprot_genes(self):
        """Return the genes of the query that have a uniprot match."""
        GENE_PAIRS = self._genes_of(self.GENE_UNIPROT_PAIRS_FILTERED)
        self.found = [gene for gene in self.GENES if gene in GENE_PAIRS]
        return self.found

    def get_uniprots(self):
        """Return a tuple with the uniprot entry of every filtered pair."""
        self.UNIPROTS = tuple(
            pair[1] for pair in self.GENE_UNIPROT_PAIRS_FILTERED)
        return self.UNIPROTS

    def no_uniprots(self):
        """Return the genes of the query for which no uniprot was found."""
        #CHECK BELOW FOR REASONS
        GENE_PAIRS = self._genes_of(self.GENE_UNIPROT_PAIRS_FILTERED)
        self.not_found = [gene for gene in self.GENES
                          if gene not in GENE_PAIRS]
        return self.not_found

    def not_on_database(self):
        """Return the genes with no record in either the ensembl (hgnc)
        result or the personal database; double check them."""
        in_biomart = self._genes_of(self.GENE_UNIPROT_PAIRS)
        in_personal = self._genes_of(self.PLUS_GENE_UNIPROT_PAIRS)
        self.not_present = [
            gene for gene in self.GENES
            if gene not in in_biomart and gene not in in_personal]
        return self.not_present

    def empty_gene(self):
        """Return the genes that are in the biomaRt result but have no
        uniprot record.

        BE WARY OF RESULTS: this relies on the output format of biomaRt;
        depending on the size of the input it may give "NA" or leave the
        result empty, giving the false impression that a gene is in the
        biomaRt database when in fact it does not exist.
        """
        GENE_PAIRS_UNFILTERED = self._genes_of(self.GENE_UNIPROT_PAIRS)
        GENE_PAIRS_FILTERED = self._genes_of(self.GENE_UNIPROT_PAIRS_FILTERED)
        self.empty = [
            gene for gene in self.GENES
            if gene in GENE_PAIRS_UNFILTERED
            and gene not in GENE_PAIRS_FILTERED]
        return self.empty
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA."""

__author__ = "Shantanu H. Joshi"
__copyright__ = "Copyright 2014, Shantanu H. Joshi Ahmanson-Lovelace Brain Mapping Center, \
University of California Los Angeles"
__email__ = "*****@*****.**"
__credits__ = 'Inspired by the stats package rshape by Roger P. Woods'

import rpy2.robjects as robjects
import numpy as np
from stats_output import StatsOutput
from rpy2.robjects import r
# import-time side effect: attaches the R package data.table
r.library("data.table")
from sys import stdout


def corr_r_func():
    """Return the R source of the correlation helper.

    The result is a one-entry dictionary: the key ``'corr_r_func'`` is the
    R function name and the value is the text that follows the name in an
    R assignment (it starts with ``<-``).  The R function takes
    ``x``, ``y`` and ``methodstr`` and returns
    ``cor.test(x, y, method=methodstr)``.
    """
    corr_r_funct_str = {'corr_r_func': '''<- function(x, y, methodstr)
    {
        return(cor.test(x, y, method=methodstr))
    }
    '''}
    return corr_r_funct_str


def corr_shape_r(model, sdata): statsout = StatsOutput(dim=sdata.phenotype_array.shape[1])
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Runs the R package ``zinba`` on a sample (and optional control) and
    writes results under the output prefix.  Two positional arguments
    are required: the sample file and the output file.  ``--action``
    selects the step: ``count`` (window counts only), ``model`` (model
    selection), ``predict`` (peak prediction from an existing model
    file) or ``full`` (everything).

    The work is done inside a temporary directory, which is removed at
    the end unless ``--temp-dir`` or ``--keep-temp`` was given.

    Raises ValueError if the positional arguments are missing and
    OSError if ``predict`` is requested without a model file.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--input-format", dest="input_format",
                      type="choice",
                      choices=("bed", "bam"),
                      help="input file format [default=%default].")

    parser.add_option("-s", "--fragment-size", dest="fragment_size",
                      type="int",
                      help="fragment size, used for the extension parameter "
                      "in Zinba [default=%default].")

    parser.add_option("-m", "--zinba-mappability-dir",
                      dest="mappability_dir", type="string",
                      help="mappability_dir [default=%default].")

    parser.add_option("-b", "--bit-file", dest="bit_filename",
                      type="string",
                      help="2bit genome filename [default=%default].")

    parser.add_option("-c", "--control-filename", dest="control_filename",
                      type="string",
                      help="filename of input/control data in bed format "
                      "[default=%default].")

    parser.add_option("-i", "--zinba-index-dir", dest="index_dir",
                      type="string",
                      help="index directory [default=%default].")

    parser.add_option("-t", "--threads", dest="threads", type="int",
                      help="number of threads to use [default=%default].")

    parser.add_option("-q", "--fdr-threshold", dest="fdr_threshold",
                      type="float",
                      help="fdr threshold [default=%default].")

    parser.add_option("-a", "--zinba-alignability-threshold",
                      dest="alignability_threshold", type="int",
                      help="alignability threshold [default=%default].")

    parser.add_option("-p", "--aggregate-by-contig", dest="per_contig",
                      action="store_true",
                      help="run analysis per chromosome [default=%default]")

    parser.add_option("-w", "--temp-dir", dest="tempdir", type="string",
                      help="use existing directory as temporary directory "
                      "[default=%default].")

    parser.add_option("--keep-temp", dest="keep_temp", action="store_true",
                      help="keep temporary directory [default=%default]")

    parser.add_option("--action", dest="action", type="choice",
                      choices=("full", "count", "predict", "model"),
                      help="action to perform [default=%default]")

    parser.add_option("--zinba-improvement", dest="improvement",
                      type="float",
                      help="relative improvement of likelihood until "
                      "convergence [default=%default]")

    parser.add_option("--min-insert-size", dest="min_insert_size",
                      type="int",
                      help="minimum insert size for paired end data "
                      "[default=%default]")

    parser.add_option("--max-insert-size", dest="max_insert_size",
                      type="int",
                      help="maximum insert size for paired end data "
                      "[default=%default]")

    parser.set_defaults(
        input_format="bed",
        fragment_size=200,
        mappability_dir=None,
        threads=1,
        alignability_threshold=1,
        bit_filename=None,
        fdr_threshold=0.05,
        tempdir=None,
        winsize=250,
        offset=125,
        cnvWinSize=1e+05,
        cnvOffset=2500,
        per_contig=False,
        keep_temp=False,
        min_insert_size=0,
        max_insert_size=1000,
        filelist="files.list",
        selectchr="chr19",
        action="full",
        improvement=0.00001,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please specify a filename with sample data and an output file")

    filename_sample, filename_output = args[0], args[1]

    # absolute paths are needed because the working directory changes below
    filename_sample = os.path.abspath(filename_sample)
    filename_output = os.path.abspath(filename_output)

    if options.control_filename:
        filename_control = os.path.abspath(options.control_filename)
    else:
        filename_control = None

    # load Zinba
    R.library('zinba')

    if not options.tempdir:
        tmpdir = tempfile.mkdtemp()
    else:
        tmpdir = options.tempdir

    E.info("changing to temporary directory %s" % tmpdir)
    os.chdir(tmpdir)

    if options.input_format == "bam":
        E.info("converting bam files to bed")
        # conversions already present in the temporary directory are reused
        if not os.path.exists(os.path.join(tmpdir, "sample.bed")):
            filename_sample = bamToBed(
                filename_sample,
                os.path.join(tmpdir, "sample.bed"))
        else:
            E.info("using existing file %(tmpdir)s/sample.bed" % locals())
            filename_sample = os.path.join(tmpdir, "sample.bed")
        if filename_control:
            if not os.path.exists(os.path.join(tmpdir, "control.bed")):
                filename_control = bamToBed(
                    filename_control,
                    os.path.join(tmpdir, "control.bed"))
            else:
                E.info("using existing file %(tmpdir)s/control.bed" %
                       locals())
                filename_control = os.path.join(tmpdir, "control.bed")

    # NOTE: the local names below are interpolated through locals() into
    # the shell and R statements - do not rename them.
    fragment_size = options.fragment_size
    threads = options.threads
    bit_filename = options.bit_filename
    mappability_dir = options.mappability_dir
    fdr_threshold = options.fdr_threshold
    tol = options.improvement

    # twoBitInfo writes "<contig> <size>" lines that are read back below
    contigs = E.run(
        "twoBitInfo %(bit_filename)s %(tmpdir)s/contig_sizes" % locals())
    contig2size = dict([
        x.split()
        for x in IOTools.openFile(os.path.join(tmpdir, "contig_sizes"))
    ])

    outdir = filename_output + "_files"
    E.info('saving intermediate results in %s' % outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    # NOTE(review): filename_output is absolute, so os.path.join ignores
    # outdir here and these files end up next to the output file - confirm
    # whether that is intended before changing it.
    filelist = os.path.join(outdir, filename_output + ".list")
    modelfile = os.path.join(outdir, filename_output + ".model")
    winfile = os.path.join(outdir, filename_output + ".wins")

    winSize = 250
    offset = 125
    cnvWinSize = 100000
    cnvOffset = 0
    winGap = 0

    peakconfidence = 1.0 - fdr_threshold
    selectchr = options.selectchr

    if not os.path.exists(os.path.join(tmpdir, "basecount")):
        E.info("computing counts")
        R('''basealigncount(inputfile='%(filename_sample)s',
        outputfile='%(tmpdir)s/basecount',
        extension=%(fragment_size)i,
        filetype='bed',
        twoBitFile='%(bit_filename)s' )
        ''' % locals())
    else:
        E.info("using existing counts")

    # tried incremental updates
    # for contig, size in contig2size.iteritems():
    #     for size in
    #     fn = os.path.join( tmpdir, "sample_%(contig)s_win%(size)ibp_offset(offset)ibp.txt" % locals() )

    if options.action == "count":
        E.info("computing window counts only - saving results in %s" %
               outdir)
        R('''buildwindowdata(
        seq='%(filename_sample)s',
        align='%(mappability_dir)s',
        input='%(filename_control)s',
        twoBit='%(bit_filename)s',
        winSize=%(winSize)i,
        offset=%(offset)i,
        cnvWinSize=%(cnvWinSize)i,
        cnvOffset=%(cnvOffset)i,
        filelist='%(filelist)s',
        filetype='bed',
        extension=%(fragment_size)s,
        outdir='%(outdir)s/')
        ''' % locals())

    elif options.action == "model":
        # The important option is buildwin = 0
        # parameterized for broad == FALSE and input present
        # see zinba.R
        # model selection only on chr19.
        R('''run.zinba(
        filelist='%(filelist)s',
        formula=NULL,formulaE=NULL,formulaZ=NULL,
        outfile='%(filename_output)s',
        seq='%(filename_sample)s',
        input='%(filename_control)s',
        filetype='bed',
        align='%(mappability_dir)s',
        twoBit='%(bit_filename)s',
        extension=%(fragment_size)s,
        winSize=%(winSize)i,
        offset=%(offset)i,
        cnvWinSize=%(cnvWinSize)i,
        cnvOffset=%(cnvOffset)i,
        basecountfile='%(tmpdir)s/basecount',
        buildwin=0,
        threshold=%(fdr_threshold)f,
        pquant=1,
        peakconfidence=%(peakconfidence)f,
        winGap=%(winGap)i,
        tol=%(tol)f,
        initmethod="count",
        method="mixture",
        numProc=%(threads)i,
        printFullOut=1,
        interaction=FALSE,
        selectmodel=TRUE,
        selectchr='%(selectchr)s',
        selectcovs=c("input_count"),
        selecttype="complete",
        FDR=TRUE)''' % locals())

    elif options.action == "predict":
        # The important option is buildwin = 0 and selectmodel = FALSE
        # parameterized for broad == FALSE and input present
        # see zinba.R
        # model selection only on chr19.
        if not os.path.exists(modelfile):
            raise OSError("model file %s does not exist" % modelfile)

        E.info("reading model from %s" % modelfile)

        # Pick the model with the lowest BIC among the runs that did not
        # fail and echo the three formulas.  The last print used to show
        # formulaE again under the zero-inflated heading; it now shows
        # formulaZ.
        R('''
        final=read.table('%(modelfile)s', header=T, sep="\t")
        final=final[final$fail==0,]
        bestBIC=which.min(final$BIC)
        formula=as.formula(paste("exp_count~",final$formula[bestBIC]))
        formulaE=as.formula(paste("exp_count~",final$formulaE[bestBIC]))
        formulaZ=as.formula(paste("exp_count~",final$formulaZ[bestBIC]))
        cat("Background formula is:\n\t")
        print(formula)
        cat("Enrichment formula is:\n\t")
        print(formulaE)
        cat("Zero-inflated formula is:\n\t")
        print(formulaZ)
        ''' % locals())

        E.info("predicting peaks")

        R('''run.zinba(
        filelist='%(filelist)s',
        outfile='%(filename_output)s',
        seq='%(filename_sample)s',
        input='%(filename_control)s',
        filetype='bed',
        align='%(mappability_dir)s',
        twoBit='%(bit_filename)s',
        extension=%(fragment_size)s,
        winSize=%(winSize)i,
        offset=%(offset)i,
        cnvWinSize=%(cnvWinSize)i,
        cnvOffset=%(cnvOffset)i,
        basecountfile='%(tmpdir)s/basecount',
        buildwin=0,
        threshold=%(fdr_threshold)f,
        pquant=1,
        winGap=%(winGap)i,
        initmethod="count",
        selectchr='%(selectchr)s',
        tol=%(tol)f,
        method="mixture",
        numProc=%(threads)i,
        printFullOut=1,
        interaction=FALSE,
        selectmodel=FALSE,
        formula=formula,
        formulaE=formulaE,
        formulaZ=formulaZ,
        peakconfidence=%(peakconfidence)f,
        FDR=TRUE)''' % locals())

    # NOTE(review): "per_contig" is not among the --action choices, so this
    # branch cannot be reached from the command line, and the loop only
    # processes chr16.  Left unchanged - confirm whether it should be wired
    # to --aggregate-by-contig or removed.
    elif options.action == "per_contig":

        E.info("processing per chromosome")
        for contig, size in contig2size.items():
            if contig not in ("chr16", ):
                continue

            E.info("processing contig %s" % contig)
            filename_sample_contig = filename_sample + "_%s" % contig
            filename_control_contig = filename_control + "_%s" % contig
            if not os.path.exists(filename_output + "_files"):
                os.mkdir(filename_output + "_files")
            filename_output_contig = os.path.join(
                filename_output + "_files", contig)
            filename_basecounts_contig = os.path.join(
                tmpdir, "basecount_%s" % contig)

            E.run(
                "grep %(contig)s < %(filename_sample)s > %(filename_sample_contig)s"
                % locals())
            E.run(
                "grep %(contig)s < %(filename_control)s > %(filename_control_contig)s"
                % locals())

            if not os.path.exists(filename_basecounts_contig):
                E.info("computing counts")

                R('''basealigncount(
                inputfile='%(filename_sample_contig)s',
                outputfile='%(filename_basecounts_contig)s',
                extension=%(fragment_size)i,
                filetype='bed',
                twoBitFile='%(bit_filename)s' )
                ''' % locals())
            else:
                E.info("using existing counts")

            # run zinba, do not build window data
            R('''zinba(
            refinepeaks=1,
            seq='%(filename_sample_contig)s',
            input='%(filename_control_contig)s',
            filetype='bed',
            align='%(mappability_dir)s',
            twoBit='%(bit_filename)s',
            outfile='%(filename_output_contig)s',
            extension=%(fragment_size)s,
            basecountfile='%(filename_basecounts_contig)s',
            numProc=%(threads)i,
            threshold=%(fdr_threshold)f,
            broad=FALSE,
            printFullOut=0,
            interaction=FALSE,
            mode='peaks',
            FDR=TRUE)
            ''' % locals())

    elif options.action == "full":

        # run zinba, build window data and refine peaks

        # Note that zinba() uses 'chr22' to select model
        # which is not present in mouse. So call run.zinba
        # directly.
        R('''run.zinba(
        refinepeaks=1,
        buildwin=1,
        seq='%(filename_sample)s',
        input='%(filename_control)s',
        filetype='bed',
        align='%(mappability_dir)s',
        twoBit='%(bit_filename)s',
        outfile='%(filename_output)s',
        extension=%(fragment_size)s,
        winSize=%(winSize)i,
        offset=%(offset)i,
        basecountfile='%(tmpdir)s/basecount',
        numProc=%(threads)i,
        threshold=%(fdr_threshold)f,
        pquant=1,
        winGap=%(winGap)i,
        selectchr='%(selectchr)s',
        interaction=FALSE,
        method="mixture",
        cnvWinSize=%(cnvWinSize)i,
        cnvOffset=%(cnvOffset)i,
        selectmodel=TRUE,
        selectcovs=c("input_count"),
        selecttype="complete",
        initmethod="count",
        printFullOut=1,
        diff=0,
        pWinSize=200,
        peakconfidence=%(peakconfidence)f,
        FDR=TRUE)
        ''' % locals())

    # only a directory created by this run is removed
    if not (options.tempdir or options.keep_temp):
        shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.Stop()
def importKEGGAssignments(outfile, mart, host, biomart_dataset):
    '''import the KEGG annotations from the R KEGG.db annotations
    package.

    .. note::

        Since KEGG is no longer publically available, this is not
        up-to-date and maybe removed from bioconductor in future
        releases

    The table written to outfile has the following columns:
    ``ontology``, ``gene_id``, ``kegg_ID``, ``kegg_name``,
    ``evidence``.

    Entrez gene ids from KEGG.db are translated to ensembl gene ids
    through biomart; genes without a numeric entrez id or without an
    ensembl mapping are skipped.

    Arguments
    ---------
    outfile : string
        Output filename in :term:`tsv` format.
    mart : string
        Name of the biomart
    host : string
        Host name of the biomart server
    biomart_dataset : string
        Biomart dataset

    '''

    if not re.match("rnorvegicus|scerevisiae|hsapiens|mmusculus",
                    biomart_dataset):
        E.warn("KEGG.db doesn't map Entrez ids for %s, %s will"
               " likely be empty" % (biomart_dataset, outfile))

    R.library("KEGG.db")

    E.info("getting entrez to ensembl mapping ...")

    # Generates an iterator containing the data from biomart
    entrez2ensembl = Biomart.biomart_iterator(
        ("ensembl_gene_id", "entrezgene"),
        biomart=mart,
        dataset=biomart_dataset,
        host=host,
        path="/biomart/martservice")

    entrez2ensembl = dict((x['entrezgene'], x['ensembl_gene_id'])
                          for x in entrez2ensembl)

    E.info("Done")

    E.info("getting entrez to kegg mapping ... ")
    entrez2path = R('as.list(KEGGEXTID2PATHID)')
    E.info("Done")

    E.info("Getting KEGG names")
    pathnames = R('as.list(KEGGPATHID2NAME)')
    pathid2name = dict(list(zip(pathnames.names, R.unlist(pathnames))))
    E.info("Done")

    # the context manager closes the file even if a lookup below fails;
    # previously the handle was never closed explicitly
    with IOTools.openFile(outfile, "w") as outf:
        outf.write("ontology\tgene_id\tkegg_ID\tkegg_name\tevidence\n")

        # rx2 did not work in rpy2 2.4.2 - workaround uses
        # absolute indices
        for gene_column, gene in enumerate(entrez2path.names):

            try:
                gene = int(gene)
            except ValueError:
                continue

            if gene in entrez2ensembl:
                ensid = entrez2ensembl[gene]
            else:
                continue

            for pathway in entrez2path[gene_column]:
                # strip the alphabetic prefix to get the numeric pathway id
                pathid = re.match("[a-z]+([0-9]+)", pathway).groups()[0]
                pathname = pathid2name[pathid]
                outf.write(
                    "\t".join(["kegg", ensid, str(pathway), pathname, "NA"])
                    + "\n")
# 1. 파일 선택 # 2. 분석, 시각화 from rpy2.robjects import r import os import rpy2.robjects.packages as rpackages import rpy2.robjects as robj # R 기본 패키지 utils=rpackages.importr('utils') file_path = input("파일이 있는 경로를 입력하세요(양 끝에 \"입력) : ") file_name = input("파일 이름을 입력하세요(양 끝에 \" 및 확장자 입력) : ") # 경로 지정 및 관련 라이브러리 호출 robj.r('setwd('+file_path+')') robj.r('Sys.setenv(JAVA_HOME="C:/Program Files (x86)/Java/jre1.8.0_161")') r.library('rJava') r.library('wordcloud') r.library('RColorBrewer') r.library('KoNLP') #robj.r('useSejongDic()') # 데이터 분석 - 파일 불러오기, 명사 추출, 및 기타 필터링 print("데이터 분석 중입니다...[파일 불러오기]") robj.r('data1<-readLines('+file_name+', encoding = "utf-8")') print("데이터 분석 중입니다...[명사 추출 및 필터링]") robj.r('data2<-sapply(data1, extractNoun, USE.NAMES = F)') robj.r('data3<-unlist(data2)') robj.r('data3<-Filter(function(x){nchar(x)>=2}, data3)') robj.r('data3<-gsub("\\\\d+","",data3)') robj.r('data3<-gsub("\\\\(","",data3)') robj.r('data3<-gsub("\\\\)","",data3)')
def biomart_iterator(columns,
                     biomart="ensembl",
                     dataset="hsapiens_gene_ensembl",
                     host='www.biomart.org',
                     path="/biomart/martservice",
                     filters=None,
                     values=None,
                     archive=False):
    '''iterate over the rows of a biomart query.

    Arguments
    ---------
    columns : list
        Names of the fields to obtain.
    biomart : string
        Biomart name
    dataset : string
        Biomart dataset
    host : string
        Biomart host
    path : string
        Path of the mart service on the host
    filters : list
        Names of the filters to apply
    values : list
        Values for the filters
    archive : bool
        If True, use archived version

    Returns
    -------
    iterator
        Yields one dictionary per row, mapping each entry of
        *columns* to its value in that row.

    '''

    R.library("biomaRt")

    # The default value for host in the biomaRt package is
    # www.biomart.org but for some reason R errors if you specify
    # host manually but then use the default - but it is fine if
    # host is anything valid apart from www.biomart.org.  So host is
    # only passed on when it differs from the default KB
    mart_options = dict(biomart=biomart,
                        dataset=dataset,
                        path=path,
                        archive=archive)
    if host != 'www.biomart.org':
        mart_options["host"] = host
    mart = R.useMart(**mart_options)

    # empty strings stand in for absent filters / values
    filter_names = "" if filters is None \
        else rpy2.robjects.vectors.StrVector(filters)
    filter_values = "" if values is None else values

    # result is a dataframe
    result = R.getBM(
        attributes=rpy2.robjects.vectors.StrVector(columns),
        filters=filter_names,
        values=filter_values,
        mart=mart)

    # access via result.rx was broken in rpy2 2.4.2, thus try
    # numeric access
    observed = tuple(result.colnames)
    expected = tuple(columns)
    assert observed == expected,\
        "colnames in dataframe: %s different from expected: %s" % \
        (str(observed), expected)

    column_vectors = [result[index] for index in range(len(columns))]
    for row in zip(*column_vectors):
        yield dict(zip(columns, row))
import os
import time
import pickle
from os.path import join, dirname
import pandas as pd
import numpy as np
from rpy2.robjects import r, pandas2ri, numpy2ri
import pandas.rpy.common as com
import multiprocessing as mp
from copy import copy, deepcopy
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
import lawstructural.lawstructural.utils as lu
import lawstructural.lawstructural.constants as lc

# Import-time side effects: load the R package VGAM into the embedded R
# session and activate rpy2's pandas and numpy converters.
r.library('VGAM')
pandas2ri.activate()
numpy2ri.activate()


#TODO: add in tests for first-stage
class FirstStage(object):
    """ Estimate first stage functions for BBL

    The three constructor arguments are stored on the instance unchanged
    as ``data_p``, ``react`` and ``opts``.
    """

    def __init__(self, data_p, react, opts):
        """ Keep references to the inputs; nothing is copied or validated.

        NOTE(review): the expected types of data_p, react and opts are not
        visible in this part of the file - confirm against the callers.
        """
        self.data_p = data_p
        self.react = react
        self.opts = opts

    def reaction(self):
        """ Estimate reaction functions

        NOTE(review): the code visible here only prints a progress
        message; the estimation itself is not part of this excerpt.
        """
        print(" * Estimating reaction functions")
# NOTE(review): the six statements below are the tail of a function whose
# "def" line is not part of this excerpt - presumably Grab_Subset, which is
# called further down; confirm against the full file.
    # Apply R's Table() to RFrame, then keep elements IdxA..IdxB-1 of it.
    DFrame = r.Table(RFrame)
    subset = DFrame[IdxA:IdxB]
    # Convert the R object to a Python object and wrap it in a numpy matrix.
    NdArr = pandas2ri.ri2py(subset)
    Matri = np.matrix(NdArr)
    # Select the first (IdxB - IdxA) rows and the column(s) given by Col.
    Cset = Matri[0:(IdxB - IdxA), Col]
    return Cset


# Handles to the R packages 'base' and 'utils'.
base = importr('base')
utils = importr('utils')

# Source the Bioconductor installer script over HTTP, then use it to
# install GEOquery. This runs (and needs network access) on every execution.
base.source("http://www.bioconductor.org/biocLite.R")
biocInstaller = importr("BiocInstaller")
biocInstaller.biocLite("GEOquery")
GEOquery = r.library("GEOquery")

# Fetch four GEO datasets by accession.
gds1 = r.getGEO("GDS4280")
gds2 = r.getGEO("GDS3329")
gds3 = r.getGEO("GDS4182")
gds4 = r.getGEO("GDS4181")

# Extract one sub-matrix per dataset.
# NOTE(review): the arguments presumably map to (RFrame, IdxA, IdxB, Col) -
# verify against the Grab_Subset signature.
GDS1_Sub = Grab_Subset(gds1, 2, 26, 14823)
GDS2_Sub = Grab_Subset(gds2, 2, 81, 14823)
GDS3_Sub = Grab_Subset(gds3, 2, 98, 14823)
GDS4_Sub = Grab_Subset(gds4, 2, 82, 14823)

# Only the shapes of the first two subsets are printed; printing the
# matrices themselves is disabled.
#print(GDS1_Sub)
print(GDS1_Sub.shape)
#print(GDS2_Sub)
print(GDS2_Sub.shape)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Runs a MEDIPS analysis inside the embedded R session on a single
    sample file (the one positional argument). The input is converted to
    MEDIPS format in a temporary directory, loaded into the R object
    ``CONTROL.SET``, and then the tools selected with ``--toolset``
    (default: ``all``) are run:

    * ``saturation`` - saturation plot and estimation table; the
      correlations are appended to the summary file.
    * ``coverage`` - CpG coverage plot and tables.
    * ``calibration`` - calibration plot.
    * ``rpm`` / ``rms`` - wiggle export, converted to bigwig with
      ``--bigwig`` and compressed otherwise.

    The non-empty slots of ``CONTROL.SET`` listed in ``slotnames`` are
    always written to ``summary.tsv.gz``.

    Raises
    ------
    ValueError
        If not exactly one sample file is given, or if ``--bigwig`` is
        requested without ``--genome-file``.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f", "--input-format", dest="input_format",
                      type="choice", choices=("bed", "bam"),
                      help="input file format [default=%default].")

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-e", "--extension", dest="extension", type="int",
                      help="extension size [default=%default].")

    parser.add_option("-b", "--bin-size", dest="bin_size", type="int",
                      help="bin size of genome vector [default=%default].")

    # The help text used to be a copy of the --bin-size one.
    parser.add_option("-l", "--fragment-length", dest="fragment_length",
                      type="int",
                      help="fragment length used to compute the coupling "
                      "vector [default=%default].")

    parser.add_option(
        "-s", "--saturation-iterations", dest="saturation_iterations",
        type="int",
        help="iterations for saturation analysis [default=%default].")

    # "calibration" is tested for further down, so it has to be selectable;
    # previously it could only be reached through "all".
    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "calibration",
                               "rms", "rpm", "all"),
                      help="actions to perform [default=%default].")

    parser.add_option(
        "-w", "--bigwig", dest="bigwig", action="store_true",
        help="store wig files as bigwig files - requires a genome file [default=%default]")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="hg19",
        genome_file=None,
        extension=400,
        bin_size=50,
        saturation_iterations=10,
        fragment_length=700,
        toolset=[],
        bigwig=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) != 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    filename_sample = args[0]

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.Hsapiens.UCSC.%s' % options.ucsc_genome
    R.library(genome_file)

    # NOTE: the R snippets below are filled in with "% locals()", so the
    # names of these local variables are part of the R code - do not
    # rename them without adjusting the templates.
    bin_size = options.bin_size
    extension = options.extension
    fragment_length = options.fragment_length
    saturation_iterations = options.saturation_iterations

    tmpdir = tempfile.mkdtemp()
    try:
        E.debug("temporary files are in %s" % tmpdir)

        if options.input_format == "bam":
            E.info("converting bam files")
            filename_sample = bamToMEDIPS(
                filename_sample, os.path.join(tmpdir, "sample.medips"))
        elif options.input_format == "bed":
            E.info("converting bed files")
            filename_sample = bedToMEDIPS(
                filename_sample, os.path.join(tmpdir, "sample.medips"))

        E.info("loading data")
        R('''CONTROL.SET = MEDIPS.readAlignedSequences( BSgenome = "%(genome_file)s", file = "%(filename_sample)s" ) ''' % locals())

        # (slot in CONTROL.SET, label in the summary file, format)
        slotnames = (("extend", "extend", "%i"),
                     ("distFunction", "distance_function", "%s"),
                     ("slope", "slope", "%f"),
                     ("fragmentLength", "fragment_length", "%i"),
                     ("bin_size", "bin_size", "%i"),
                     ("seq_pattern", "pattern", "%s"),
                     ("number_regions", "nregions", "%i"),
                     ("number_pattern", "npatterns", "%i"),
                     ("cali_chr", "calibration_contig", "%s"),
                     ("genome_name", "genome", "%s"))

        E.info("computing genome vector")
        R('''CONTROL.SET = MEDIPS.genomeVector(data = CONTROL.SET, bin_size = %(bin_size)i, extend=%(extension)i )''' % locals())

        E.info("computing CpG positions")
        R('''CONTROL.SET = MEDIPS.getPositions(data = CONTROL.SET, pattern = "CG")''')

        E.info("compute coupling vector")
        R('''CONTROL.SET = MEDIPS.couplingVector(data = CONTROL.SET, fragmentLength = %(fragment_length)i, func = "count")''' % locals())

        E.info("compute calibration curve")
        R('''CONTROL.SET = MEDIPS.calibrationCurve(data = CONTROL.SET)''')

        E.info("normalizing")
        R('''CONTROL.SET = MEDIPS.normalize(data = CONTROL.SET)''')

        outfile = IOTools.openFile(E.getOutputFile("summary.tsv.gz"), "w")
        # try/finally rather than "with": the summary must be closed even
        # when one of the R steps raises, without assuming that the object
        # returned by IOTools.openFile is a context manager.
        try:
            outfile.write("category\tvalue\n")

            if "saturation" in options.toolset or do_all:
                E.info("saturation analysis")
                R('''sr.control = MEDIPS.saturationAnalysis(data = CONTROL.SET, bin_size = %(bin_size)i, extend = %(extension)i, no_iterations = %(saturation_iterations)i, no_random_iterations = 1)''' % locals())

                R.png(E.getOutputFile("saturation.png"))
                R('''MEDIPS.plotSaturation(sr.control)''')
                R('''dev.off()''')

                R('''write.csv( sr.control$estimation, file ='%s' )''' %
                  E.getOutputFile("saturation_estimation.csv"))
                # index 1 is the second element of the R vector
                outfile.write("estimated_correlation\t%f\n" %
                              R('''sr.control$maxEstCor''')[1])
                outfile.write("true_correlation\t%f\n" %
                              R('''sr.control$maxTruCor''')[1])

            if "coverage" in options.toolset or do_all:
                E.info("CpG coverage analysis")
                R('''cr.control = MEDIPS.coverageAnalysis(data = CONTROL.SET, extend = %(extension)i, no_iterations = 10)''' % locals())

                R.png(E.getOutputFile("cpg_coverage.png"))
                R('''MEDIPS.plotCoverage(cr.control)''')
                R('''dev.off()''')

                # three rows
                R('''write.csv( cr.control$coveredPos, file ='%s' )''' %
                  E.getOutputFile("saturation_coveredpos.csv"))
                # coverage threshold
                # number of CpG covered
                # percentage of CpG covered
                R('''write.csv( cr.control$matrix, file ='%s' )''' %
                  E.getOutputFile("saturation_matrix.csv"))

                # disabled:
                # R('''er.control = MEDIPS.CpGenrich(data = CONTROL.SET)''')

            if "calibration" in options.toolset or do_all:
                E.info("plotting calibration")
                R.png(E.getOutputFile("calibration.png"))
                R('''MEDIPS.plotCalibrationPlot(data = CONTROL.SET, linearFit = T, xrange=250)''')
                R('''dev.off()''')

            # dump the non-empty slots of CONTROL.SET into the summary
            for slotname, label, pattern in slotnames:
                value = tuple(R('''CONTROL.SET@%s''' % slotname))
                if not value:
                    continue
                # reuse the value fetched above instead of asking R again
                outfile.write("%s\t%s\n" % (label, pattern % value[0]))
        finally:
            outfile.close()

        if "rpm" in options.toolset or do_all:
            outputfile = E.getOutputFile("rpm.wig")
            R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = T, descr = "rpm")''' % locals())
            if options.bigwig:
                # contig_sizes exists here: --bigwig without --genome-file
                # is rejected above
                bigwig(outputfile, contig_sizes)
            else:
                compress(outputfile)

        if "rms" in options.toolset or do_all:
            outputfile = E.getOutputFile("rms.wig")
            R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = F, descr = "rms")''' % locals())
            if options.bigwig:
                bigwig(outputfile, contig_sizes)
            else:
                compress(outputfile)
    finally:
        # remove the temporary directory even if an analysis step failed
        shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.Stop()
#!/usr/bin/env python import numpy import pandas from copy import copy, deepcopy import rpy2.robjects as ro from rpy2.robjects import r, pandas2ri, numpy2ri from rpy2.robjects.conversion import localconverter pandas2ri.activate() numpy2ri.activate() r.library("lme4") class lmer(object): """ Mixed effect regression Wrapper around lme4's lmer class to enable: - training - predicting """ # Class wide constants/parameters def __init__( self, target=None, fixed_effects=[], re_features=[], re_terms=[], ): # Target
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Drives the R package MEDIPS through the embedded R session. The tools
    selected with ``--toolset`` (default: ``all``) are run on the BAM files
    given with ``--treatment`` and ``--control``:

    * ``convert`` - no R involved: read a MEDIPS result table from stdin,
      write it to stdout as expression results, and return.
    * ``saturation`` - per file: saturation plot, estimation table and a
      small summary table.
    * ``coverage`` - per file: CpG coverage pie chart, histogram and
      coverage table.
    * ``enrichment`` - one table with the CpG enrichment values of all
      files.
    * ``dmr`` - build MEDIPS sets, test for differential methylation,
      write the full table and - only when ``dmr`` is named explicitly -
      the significant windows plus merged gain/loss windows.

    With ``--rdata-file`` a saved R session is loaded instead of building
    the sets and running the test again.

    Raises
    ------
    ValueError
        If no treatment file is given, if ``--bigwig-file`` is requested
        without ``--genome-file``, or (``convert``) if an input line
        cannot be parsed.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    # dest has to be "extend": that is the attribute given a default in
    # set_defaults() and read below. It used to be "extension", so a value
    # passed on the command line was stored but never used.
    parser.add_option("--extend", dest="extend", type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size", dest="shift", type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size", dest="window_size", type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations", type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t", "--toolset", dest="toolset", type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment",
                               "dmr", "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w", "--bigwig-file", dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment", dest="treatment_files", type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control", dest="control_files", type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input", dest="input_files", type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip", action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    # help text: the option name was misspelled ("--ouptut-...")
    parser.add_option("--output-rdata", dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --output-filename-pattern [%default].")

    parser.add_option("--rdata-file", dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method", dest="fdr_method", type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa", dest="bwa", action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    # help text: a stray backslash used to split the word "pileups"
    parser.add_option("--unique", dest="unique", type="float",
                      help="Threshold p-value to determine which read "
                      "pileups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms", dest="chroms", type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="Hsapiens.UCSC.hg19",
        genome_file=None,
        extend=0,
        shift=0,
        window_size=300,
        saturation_iterations=10,
        toolset=[],
        bigwig=False,
        treatment_files=[],
        control_files=[],
        input_files=[],
        output_rdata=False,
        input_rdata=None,
        is_medip=True,
        fdr_threshold=0.1,
        fdr_method="BH",
        bwa=False,
        unique=0.001,
        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in csv.DictReader(options.stdin, dialect="excel-tab"):

            # windows without a test result are skipped
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" % (line['chr'],
                                      int(line['start']),
                                      int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        # "1" if the adjusted p-value is below the
                        # threshold, "0" otherwise
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    # optional extra argument for the MEDIPS calls restricting the
    # analysis to the given chromosomes
    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    # NOTE: the R snippets below are filled in with "% locals()", so the
    # names of these local variables (and of fn, paired, x, ...) are part
    # of the R code - do not rename them without adjusting the templates.
    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, uniq=%(uniq)s, nit = %(saturation_iterations)i, paired = %(paired)s, bwa = %(BWA)s, %(chrstring)s nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')

            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            # "with" closes the summary even if reading from R fails
            with iotools.open_file(
                    E.get_output_file("%s_saturation.tsv" % fn),
                    "w") as outfile:
                outfile.write("category\tvalues\n")
                outfile.write(
                    "estimated_correlation\t%s\n" %
                    ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
                outfile.write(
                    "true_correlation\t%s\n" %
                    ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
                outfile.write(
                    "nreads\t%s\n" %
                    ",".join(["%i" % x for x in R('''sr$numberReads''')]))

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage( file='%(fn)s', BSgenome='%(genome_file)s', pattern='CG', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'), sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        # (element of the CpGenrich result, column name, format)
        slotnames = (("regions.CG", "regions_CG", "%i"),
                     ("regions.C", "regions_C", "%s"),
                     ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH", "%i"),
                     ("regions.GoGe", "regions_GoGe", "%i"),
                     ("genome.CG", "genome_CG", "%s"),
                     ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"),
                     ("genome.relH", "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        with iotools.open_file(E.get_output_file("enrichment.tsv.gz"),
                               "w") as outfile:
            outfile.write("\t".join(['sample'] +
                                    [x[1] for x in slotnames]) + "\n")
            for fn in options.treatment_files + options.control_files:
                paired = isPaired(fn)
                R('''ce = MEDIPS.CpGenrich( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals())

                outfile.write("%s" % fn)
                for slotname, label, pattern in slotnames:
                    value = tuple(R('''ce$%s''' % slotname))
                    if len(value) == 0:
                        # A missing value becomes an empty column. This
                        # used to index into an empty string and raise
                        # IndexError.
                        outfile.write("\t")
                        continue
                    outfile.write("\t%s" % pattern % value[0])
                outfile.write("\n")

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        # NOTE: "correlation" is not among the --toolset choices, so that
        # branch is only reached through "all".
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' %
              ",".join(["treatment_R%i" % x
                        for x in range(len(options.treatment_files))]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' %
                  ",".join(["control_R%i" % x
                            for x in range(len(options.control_files))]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG", refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation( c(treatment_set, control_set))''')

                R('''write.table(cor.matrix, file='%s', sep="\t")''' %
                  E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"

                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth( MSet1 = treatment_set, MSet2 = control_set, CSet = CS, ISet1 = NULL, ISet2 = NULL, p.adj = "%(fdr_method)s", diff.method = "edgeR", MeDIP = %(medip)s, CNV = F, minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth, file=gzfile('%s', 'w'), sep="\t", row.names=F, quote=F)''' %
                  E.get_output_file("data.tsv.gz"))

            # save R session
            if options.output_rdata:
                R('''save.image(file='%s', safe=FALSE)''' %
                  E.get_output_file("session.RData"))

    # DMR analysis - test for windows and output
    # Runs only when "dmr" is named explicitly ("all" does not trigger it)
    # and works on "meth" from either the test above or a loaded session.
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth, adj=T, ratio=NULL, p.value=%(fdr_threshold)f, bg.counts=NULL, CNV=F)''' % locals())

        R('''write.table(tested, file=gzfile('%s', 'w'), sep="\t", quote=F)''' %
          E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        # Best effort: a failure in R is logged as a warning and the run
        # continues.
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),]; gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')),
                    str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w'); write.table(gain_merged, file=of, sep="\t", quote=F, row.names=FALSE, col.names=FALSE); close(of)''' %
              E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)

        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),]; loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')),
                    str(R('''dim(loss_merged)'''))))
            R('''of=gzfile('%s', 'w'); write.table(loss_merged, file=of, sep="\t", quote=F, row.names=F, col.names=F); close(of)''' %
              E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # NOTE: "rpm" and "rms" are accepted by --toolset, but the export code
    # is disabled:
    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()