def fill_hdf_from_Rdata(self, table):
    """
    Load an R data frame from an ``.Rdata`` file and write it to the HDF5 file.

    The description of ``table`` is read from ``self.tables[table]``:
    ``"Rdata_file"`` is the file handed to R's ``load`` and ``"Rdata_table"``
    is the name of the R object read back with ``com.load_data``.  When the
    description has a ``'variables'`` entry, only those of the listed columns
    that exist in the loaded data frame are stored.

    The data frame is written to ``self.hdf5_file_path`` under the key
    ``table``, replacing any existing content at that key (``append=False``).

    Raises
    ------
    AssertionError
        If ``table`` is not a key of ``self.tables``.
    Exception
        If the ``.Rdata`` file does not exist.
    """
    import pandas.rpy.common as com
    import rpy2.rpy_classic as rpy
    rpy.set_default_mode(rpy.NO_CONVERSION)

    assert table in self.tables, "Table {} is not a filed table".format(table)
    table_description = self.tables[table]
    Rdata_table = table_description["Rdata_table"]
    Rdata_file = table_description["Rdata_file"]
    # The optional column selection belongs to the table's own description.
    # The previous test looked for a table literally named 'variables' in
    # self.tables, so the selection was effectively never applied.
    variables = table_description.get('variables')

    if not os.path.isfile(Rdata_file):
        raise Exception("file_path do not exists")

    rpy.r.load(Rdata_file)
    stored_dataframe = com.load_data(Rdata_table)
    store_path = table

    log.info("Inserting {} in HDF file {} at point {}".format(
        Rdata_table,
        self.hdf5_file_path,
        table,
        )
    )
    if variables is not None:
        log.info('variables asked by the user: {}'.format(variables))
        # Keep only the requested columns that are actually present.
        variables_stored = list(set(variables).intersection(set(stored_dataframe.columns)))
        log.info('variables stored: {}'.format(variables_stored))
        stored_dataframe = stored_dataframe[variables_stored].copy()

    stored_dataframe.to_hdf(self.hdf5_file_path, store_path, format='table', append=False)
    gc.collect()
def testFunctionCall(self):
    # Exercise R function calls made through rpy with BASIC_CONVERSION active.
    rpy.set_default_mode(rpy.BASIC_CONVERSION)

    # Positional arguments only.
    # Is indexing what happens with rpy, or is the list automatically dropped?
    total = rpy.r.sum(1, 2)[0]
    self.assertEqual(3, total)

    # Positional arguments combined with a keyword argument.
    expected_sequence = [1.0, 1.5, 2.0, 2.5, 3.0]
    sequence = rpy.r.seq(1, 3, by=0.5)
    self.assertEqual(expected_sequence, sequence)
def testFunctionCall(self):
    # Check that R functions can be called through rpy with positional and
    # keyword arguments while BASIC_CONVERSION is the default mode.
    rpy.set_default_mode(rpy.BASIC_CONVERSION)

    # positional only
    three = rpy.r.sum(1, 2)
    # is this what is happening w/ rpy, or the list is
    # ...automatically dropped ?
    three = three[0]
    # assertEqual replaces the deprecated assertEquals alias, which no longer
    # exists in Python 3.12's unittest.
    self.assertEqual(3, three)

    # positional + keywords
    onetwothree = rpy.r.seq(1, 3, by=0.5)
    self.assertEqual([1.0, 1.5, 2.0, 2.5, 3.0], onetwothree)
def pValue(c, a, b, t):
    """
    Test a 2x2 table derived from four counts.

    The arguments are turned into the table ``[[c, a - c], [b - c,
    t - a - b + c]]``.  When the four cells sum to 100000 or more the table is
    passed to R's ``chisq.test`` through rpy2; otherwise it is passed to
    ``scipy.stats.fisher_exact``.

    NOTE(review): presumably ``c`` is the size of the overlap between two
    sets of sizes ``a`` and ``b`` drawn from ``t`` items -- confirm with the
    callers.

    Returns
    -------
    The value returned by ``rpy.r.chisq_test`` (large tables) or the result
    of ``scipy.stats.fisher_exact`` (small tables).  The two have different
    shapes; callers must handle both.
    """
    # Prints the same text as the Python 2 statement ``print c, a, b, t``
    # while remaining valid syntax on Python 3.
    print("%s %s %s %s" % (c, a, b, t))

    t = t - a - b + c
    a = a - c
    b = b - c
    if sum([c, t, a, b]) >= 100000:
        # Imported lazily so that small tables do not require R at all.
        import rpy2.rpy_classic as rpy
        import rpy2.robjects as robjects
        m = robjects.r("matrix(c(" + str(c) + "," + str(a) + "," + str(b) + "," + str(t) + "),byrow=TRUE,nrow=2)")
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        val = rpy.r.chisq_test(m)
        return val
    else:
        return scipy.stats.fisher_exact([[c, a], [b, t]])
def build_erf_aggregates():
    """
    Fetch the relevant aggregates from erf data

    For each year in 2006-2007, loads ``menageYY.Rdata`` from the ``R/erf``
    directory next to ``DATA_DIR`` and prints, for every variable of
    ``Aggregates.varlist`` that has an ``m_<name>m`` column in the household
    table, the ``wprm``-weighted sum of that column divided by 1e9.
    """
    # Uses rpy2.
    # On MS Windows, the environment variables R_HOME and R_USER should be set
    import pandas.rpy.common as com
    import rpy2.rpy_classic as rpy
    rpy.set_default_mode(rpy.NO_CONVERSION)

    country = 'france'
    for year in range(2006, 2008):
        menageXX = "menage" + str(year)[2:]
        menageRdata = menageXX + ".Rdata"
        filename = os.path.join(os.path.dirname(DATA_DIR), 'R', 'erf', str(year), menageRdata)
        yr = str(year)

        simu = SurveySimulation()
        simu.set_config(year=yr, country=country)
        simu.set_param()

        agg = Aggregates()
        agg.set_simulation(simu)

        rpy.r.load(filename)
        menage = com.load_data(menageXX)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(year)

        # Keep the erf counterparts ("m_<name>m") that exist in the table.
        erf_vars = ("m_" + col + "m" for col in agg.varlist)
        cols = [erf_var for erf_var in erf_vars if erf_var in menage.columns]

        df = menage[cols]
        wprm = menage["wprm"]
        for col in df.columns:
            tot = (df[col] * wprm).sum() / 1e9
            # Same text as the Python 2 statement ``print col, tot``.
            print("%s %s" % (col, tot))
def build_erf_aggregates():
    """
    Fetch the relevant aggregates from erf data

    Loops over the years 2006 and 2007.  For each one it loads the R object
    ``menageYY`` from ``<parent of DATA_DIR>/R/erf/<year>/menageYY.Rdata``,
    selects the ``m_<name>m`` columns matching ``Aggregates.varlist`` and
    prints each column's total weighted by ``wprm``, divided by 1e9.
    """
    # Uses rpy2.
    # On MS Windows, the environment variables R_HOME and R_USER should be set
    import pandas.rpy.common as com
    import rpy2.rpy_classic as rpy
    rpy.set_default_mode(rpy.NO_CONVERSION)

    country = 'france'
    for year in range(2006, 2008):
        yr = str(year)
        menageXX = "menage" + yr[2:]
        menageRdata = menageXX + ".Rdata"
        filename = os.path.join(os.path.dirname(DATA_DIR), 'R', 'erf', yr, menageRdata)

        simu = SurveySimulation()
        simu.set_config(year=yr, country=country)
        simu.set_param()

        agg = Aggregates()
        agg.set_simulation(simu)

        rpy.r.load(filename)
        menage = com.load_data(menageXX)

        # print(x) with one argument is valid and identical on Python 2 and 3.
        print(year)

        # Only variables with an "m_<name>m" column in the erf table are kept.
        candidates = ["m_" + col + "m" for col in agg.varlist]
        cols = [name for name in candidates if name in menage.columns]

        df = menage[cols]
        wprm = menage["wprm"]
        for col in df.columns:
            tot = (df[col] * wprm).sum() / 1e9
            # Formats exactly like the Python 2 statement ``print col, tot``.
            print("%s %s" % (col, tot))
# s'inspire fortee;ent de build_from_sources import pandas.rpy.common as com import rpy2.rpy_classic as rpy import pdb rpy.set_default_mode(rpy.NO_CONVERSION) data = "T:/Myliam2/Patrimoine/ici.Rdata" rpy.r.load(data) pdb.set_trace() menage = com.load_data('person')
from encode_gsc.base_types import * verbose = False output = sys.stdout import string import itertools from random import uniform, randint from math import sqrt from operator import itemgetter rpy = None try: import rpy2.rpy_classic as rpy rpy.set_default_mode(rpy.BASIC_CONVERSION) except ImportError: pass class sample(tuple): """Hold sample data from which we will calculate the test stat. """ def __init__(self, *args, **kwargs): # tuple.__init__(self, *args, **kwargs) tuple.__init__(self) def __mul__(self, scalar): """Multiply a sample by a scalar.
def main():
    """
    Compute summary statistics of a column expression with R.

    Command line: ``gsummary.py input_file ouput_file expression``.

    The expression may only contain words listed in ``S3_METHODS()['Math']``
    and operators listed in ``S3_METHODS()['Ops']``; anything else is reported
    through ``stop_err``.  Column references of the form ``cN`` (1-based) are
    extracted from the expression, the matching fields of every non-empty,
    non-comment line of the tab-separated input are copied to a temporary
    file, and lines where one of those fields is not a number are skipped.
    R then evaluates the expression on that data and the sum, mean, standard
    deviation and quantiles are written to the output file as one
    tab-separated row below a ``#`` header row.
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    math_allowed = S3_METHODS()['Math']
    ops_allowed = S3_METHODS()['Ops']

    # Check for invalid expressions
    for word in re.compile('[a-zA-Z]+').findall(expression):
        if word and word not in math_allowed:
            stop_err("Invalid expression '%s': term '%s' is not recognized or allowed" % (expression, word))
    symbols = set()
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    for symbol in re.compile(r'[^a-z0-9\s]+').findall(expression):
        if symbol and symbol not in ops_allowed:
            stop_err("Invalid expression '%s': operator '%s' is not recognized or allowed" % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err("Invalid columns '%s': this tool requires a single column or expression" % expression)

    # Find all column references in the expression.  The pattern guarantees
    # that everything after the leading "c" is made of digits.
    column_pattern = re.compile('c[0-9]+')
    cols = [int(col[1:]) - 1 for col in column_pattern.findall(expression)]

    # Text mode, because str data is written below.
    tmp_file = tempfile.NamedTemporaryFile('w+')
    # Write the R header row to the temporary file
    hdr_str = "\t".join("c%s" % str(col + 1) for col in cols)
    tmp_file.write("%s\n" % hdr_str)

    skipped_lines = 0
    first_invalid_line = 0
    i = 0
    with open(datafile) as data_in:
        for i, line in enumerate(data_in):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            fields = line.split('\t')
            try:
                # Every referenced column must exist and hold a number.
                for col in cols:
                    float(fields[col])
            except (ValueError, IndexError):
                skipped_lines += 1
                if not first_invalid_line:
                    first_invalid_line = i + 1
                continue
            # Write the R data row to the temporary file
            data_str = "\t".join(fields[col] for col in cols)
            tmp_file.write("%s\n" % data_str)
    tmp_file.flush()

    if skipped_lines == i + 1:
        stop_err("Invalid column or column data values invalid for computation. See tool tips and syntax for data requirements.")
    else:
        # summary function and return labels
        set_default_mode(NO_CONVERSION)
        summary_func = r(
            "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), "
            "mean=mean( as.numeric( x ), na.rm=T ), "
            "stdev=sd( as.numeric( x ), na.rm=T ), "
            "quantile( as.numeric( x ), na.rm=TRUE ) ) }"
        )
        headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
        headings_str = "\t".join(headings)

        r_data_frame = r.read_table(tmp_file.name, header=True, sep="\t")

        outfile = open(outfile_name, 'w')
        try:
            # Expose every referenced column to R under its "cN" name.
            for col in column_pattern.findall(expression):
                r.assign(col, r["$"](r_data_frame, col))
            try:
                summary = summary_func(r(expression))
            except RException as s:
                outfile.close()
                stop_err("Computation resulted in the following error: %s" % str(s))
            summary = summary.as_py(BASIC_CONVERSION)
            outfile.write("#%s\n" % headings_str)
            if isinstance(summary, dict):
                # using rpy
                outfile.write("%s\n" % "\t".join(["%g" % summary[k] for k in headings]))
            else:
                # using rpy2
                outfile.write("%s\n" % "\t".join(["%g" % k for k in summary]))
        finally:
            # Closing twice is harmless; this covers every error path.
            outfile.close()

        if skipped_lines:
            # Single-argument print(...) is identical on Python 2 and 3.
            print("Skipped %d invalid lines beginning with line #%d. See tool tips for data requirements." % (skipped_lines, first_invalid_line))
def testAttributeExpansion(self):
    # Looking up ``wilcox_test`` as an attribute of rpy.r must give an Robj.
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    looked_up = rpy.r.wilcox_test
    is_robj = isinstance(looked_up, rpy.Robj)
    self.assertTrue(is_robj)
def testCallable(self):
    # In rpy-1.x everything is callable, so both ``seq`` and ``pi`` are
    # checked.
    rpy.set_default_mode(rpy.NO_CONVERSION)
    for r_name in ("seq", "pi"):
        self.assertTrue(callable(getattr(rpy.r, r_name)))
def main():
    """
    Compute summary statistics of a column expression with R.

    Command line: ``gsummary.py input_file ouput_file expression``.

    Steps:

    1. Validate the expression: every alphabetic word must be listed in
       ``S3_METHODS()['Math']`` and every operator in ``S3_METHODS()['Ops']``;
       a bare comma-separated column list is rejected.
    2. Copy the fields referenced as ``cN`` (1-based) from each non-empty,
       non-comment line of the tab-separated input to a temporary file,
       skipping lines where one of those fields is not a number.
    3. Let R evaluate the expression on that data and write the sum, mean,
       standard deviation and quantiles as a single tab-separated row,
       preceded by a ``#`` header row.

    Problems are reported through ``stop_err``.
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err('Usage: python gsummary.py input_file ouput_file expression')

    math_allowed = S3_METHODS()['Math']
    ops_allowed = S3_METHODS()['Ops']

    # Check for invalid expressions
    for word in re.compile('[a-zA-Z]+').findall(expression):
        if word and word not in math_allowed:
            stop_err("Invalid expression '%s': term '%s' is not recognized or allowed" % (expression, word))
    symbols = set()
    # Raw string avoids the invalid '\s' escape of a plain string literal.
    for symbol in re.compile(r'[^a-z0-9\s]+').findall(expression):
        if symbol and symbol not in ops_allowed:
            stop_err("Invalid expression '%s': operator '%s' is not recognized or allowed" % (expression, symbol))
        else:
            symbols.add(symbol)
    if len(symbols) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err("Invalid columns '%s': this tool requires a single column or expression" % expression)

    # Find all column references in the expression.  The pattern only matches
    # "c" followed by digits, so the int() conversion cannot fail.
    column_refs = re.compile('c[0-9]+').findall(expression)
    cols = [int(ref[1:]) - 1 for ref in column_refs]

    tmp_file = tempfile.NamedTemporaryFile('w+')
    # Write the R header row to the temporary file
    hdr_str = "\t".join("c%s" % str(col + 1) for col in cols)
    tmp_file.write("%s\n" % hdr_str)

    skipped_lines = 0
    first_invalid_line = 0
    i = 0
    # The context manager closes the input file, which was previously leaked.
    with open(datafile) as data_in:
        for i, line in enumerate(data_in):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            fields = line.split('\t')
            try:
                # A missing field or a non-numeric value invalidates the line.
                for col in cols:
                    float(fields[col])
            except (ValueError, IndexError):
                skipped_lines += 1
                if not first_invalid_line:
                    first_invalid_line = i + 1
                continue
            # Write the R data row to the temporary file
            data_str = "\t".join(fields[col] for col in cols)
            tmp_file.write("%s\n" % data_str)
    tmp_file.flush()

    if skipped_lines == i + 1:
        stop_err("Invalid column or column data values invalid for computation. See tool tips and syntax for data requirements.")
    else:
        # summary function and return labels
        set_default_mode(NO_CONVERSION)
        summary_func = r(
            "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), "
            "mean=mean( as.numeric( x ), na.rm=T ), "
            "stdev=sd( as.numeric( x ), na.rm=T ), "
            "quantile( as.numeric( x ), na.rm=TRUE ) ) }"
        )
        headings = ['sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%']
        headings_str = "\t".join(headings)

        r_data_frame = r.read_table(tmp_file.name, header=True, sep="\t")

        outfile = open(outfile_name, 'w')
        try:
            # Make each referenced column available to R under its "cN" name.
            for col in column_refs:
                r.assign(col, r["$"](r_data_frame, col))
            try:
                summary = summary_func(r(expression))
            except RException as s:
                outfile.close()
                stop_err("Computation resulted in the following error: %s" % str(s))
            summary = summary.as_py(BASIC_CONVERSION)
            outfile.write("#%s\n" % headings_str)
            if isinstance(summary, dict):
                # using rpy
                outfile.write("%s\n" % "\t".join(["%g" % summary[k] for k in headings]))
            else:
                # using rpy2
                outfile.write("%s\n" % "\t".join(["%g" % k for k in summary]))
        finally:
            # Guarantees the output file is closed on every path; a second
            # close() is a no-op.
            outfile.close()

        if skipped_lines:
            print("Skipped %d invalid lines beginning with line #%d. See tool tips for data requirements." % (skipped_lines, first_invalid_line))
### Documentation ### Github/Shark import os import sys import rpy2 import rpy2.robjects as robjects import rpy2.rpy_classic as rpy import rpy2.robjects.numpy2ri from rpy2.robjects.packages import importr rpy2.robjects.numpy2ri.activate() rpy.set_default_mode(0) import itertools import urllib r = robjects.r pvalue = 0.05 ### Get data from db2db def getData(mylist, myoutput, genesDEdata): ids = "" Idtype ="" for ii in range (0, len(mylist)): Idtype = "EnsemblGeneID" ids = ids + "," + mylist[ii].strip().split(".")[0] ids = ids[1:]
def file2h5(fpath, input_dir='', buffersize=10 * 2 ** 20):
    """
    Import the data described by a YAML file into a new HDF5 file.

    The YAML file at ``fpath`` is loaded and checked with ``validate_dict``
    against ``yaml_layout``.  The HDF5 file named by its ``output`` entry
    (resolved relative to the YAML file's directory with ``complete_path``) is
    opened for writing and filled with:

    * a ``globals`` group, when the YAML file declares globals, holding one
      array or table per global;
    * an ``entities`` group holding one table per entity, read either from a
      ``.csv`` file or from an ``.Rdata`` file through rpy.

    Parameters
    ----------
    fpath : path of the YAML import description.
    input_dir : string prepended to ``<entity name>.csv`` to build an entity's
        default input file name, and tried first as a prefix when loading an
        ``.Rdata`` file.
    buffersize : passed through unchanged to ``stream_to_table``.
    """
    with open(fpath) as f:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; prefer yaml.safe_load if the file is untrusted.
        content = yaml.load(f)

    # Expected structure of the YAML document.
    # NOTE(review): presumably a leading '#' marks a required key and '*'
    # stands for any key -- confirm against validate_dict.
    yaml_layout = {
        '#output': str,
        'compression': str,
        'globals': {
            'periodic': {
                'path': str,
                'fields': [{
                    '*': str
                }],
                'oldnames': {
                    '*': str
                },
                'invert': [str],
                'transposed': bool
            },
            '*': {
                'path': str,
                'type': str,
                'fields': [{
                    '*': str
                }],
                'oldnames': {
                    '*': str
                },
                'invert': [str],
                'transposed': bool
            }
        },
        '#entities': {
            '*': {
                'path': str,
                'fields': [{
                    '*': str
                }],
                'oldnames': {
                    '*': str
                },
                'newnames': {
                    '*': str
                },
                'invert': [str],
                'transposed': bool,
                'files': None,
                # {
                #    '*': None
                # }
                'interpolate': {
                    '*': str
                }
            }
        }
    }
    validate_dict(content, yaml_layout)
    localdir = os.path.dirname(os.path.abspath(fpath))

    h5_filename = content['output']
    compression = content.get('compression')
    h5_filepath = complete_path(localdir, h5_filename)
    print("Importing in", h5_filepath)
    try:
        # NOTE(review): if openFile itself raises, h5file is unbound when the
        # finally clause runs and a NameError hides the real error.
        h5file = tables.openFile(h5_filepath, mode="w", title="CSV import")

        globals_def = content.get('globals', {})
        if globals_def:
            print()
            print("globals")
            print("-------")
            const_node = h5file.createGroup("/", "globals", "Globals")
            for global_name, global_def in globals_def.iteritems():
                print()
                print(" %s" % global_name)
                # Only the 'periodic' global is given a required field.
                req_fields = ([('PERIOD', int)] if global_name == 'periodic' else [])

                kind, info = load_def(localdir, global_name, global_def, req_fields)
                if kind == 'ndarray':
                    array_to_disk_array(h5file, const_node, global_name, info, title=global_name, compression=compression)
                else:
                    assert kind == 'table'
                    fields, numlines, datastream, csvfile = info
                    stream_to_table(h5file, const_node, global_name, fields, datastream, numlines, title="%s table" % global_name, buffersize=buffersize, compression=compression)
                    if csvfile is not None:
                        csvfile.close()

        print()
        print("entities")
        print("--------")
        ent_node = h5file.createGroup("/", "entities", "Entities")
        for ent_name, entity_def in content['entities'].iteritems():
            print()
            print(" %s" % ent_name)
            # Without an explicit 'path', the entity is read from
            # "<input_dir><entity name>.csv".
            input_filename = entity_def.get('path', input_dir + ent_name + ".csv")
            if input_filename[-4:]=='.csv':
                kind, info = load_def(localdir, ent_name, entity_def, [('period', int), ('id', int)])
                assert kind == "table"
                fields, numlines, datastream, csvfile = info

                stream_to_table(h5file, ent_node, ent_name, fields, datastream, numlines, title="%s table" % ent_name, invert=entity_def.get('invert', []), buffersize=buffersize, compression=compression)
                if csvfile is not None:
                    csvfile.close()

            if input_filename[-6:]=='.Rdata':
                # 'files' names the R object to read; it defaults to the
                # entity name.
                files_def = entity_def.get('files')
                if files_def is None:
                    files_def = ent_name
                print(" - reading", input_filename, ",file", files_def)
                rpy.set_default_mode(rpy.NO_CONVERSION)
                msg, filters = compression_str2filter(compression)

                # Try the file under input_dir first, then the bare name.
                # NOTE(review): the bare except also hides errors unrelated to
                # the file location.
                try:
                    rpy.r.load(input_dir + input_filename)
                except:
                    rpy.r.load(input_filename)
                print(" - storing %s..." % msg)

                array_pandas = com.load_data(files_def)
                fields_def = entity_def.get('fields')
                if fields_def is not None:
                    for fdef in fields_def:
                        if isinstance(fdef, basestring):
                            raise SyntaxError("invalid field declaration: '%s', you are "
                                              "probably missing a ':'" % fdef)
                    fields = fields_yaml_to_type(fields_def)
                    # Declared fields plus the 'id' and 'period' columns.
                    columns = [col[0] for col in fields] +['id','period']
                else:
                    fields = None
                    columns = array_pandas.columns

                array_pandas = array_pandas.loc[:,columns]
                # NOTE(review): this dtype is overwritten two statements
                # below, so the declared field types are not enforced.
                dtype = np.dtype(fields)
                # TODO: handle conflicts
                dtype = array_pandas.to_records(index=False).dtype
                # NOTE(review): discards the filters computed from
                # `compression` above, so the table is created with
                # filters=None.
                filters=None
                table = h5file.createTable(ent_node, ent_name, dtype, title="%s table" % ent_name, filters=filters)
                table.append(array_pandas.to_records(index=False))
                table.flush()
    finally:
        h5file.close()
    print()
    print("done.")
def testFunctionCallWithRObj(self):
    # With conversion disabled, the object returned by one R call is fed
    # directly into a second R call.
    rpy.set_default_mode(rpy.NO_CONVERSION)
    r_sequence = rpy.r.seq(1, 2)
    r_total = rpy.r.sum(r_sequence)

    # Back to basic conversion before inspecting the underlying value.
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    first_value = r_total.sexp[0]
    self.assertEqual(3, first_value)
def testFunctionCallWithRObj(self):
    # The result of an R call made in NO_CONVERSION mode is passed as an
    # argument to another R call, and its value is then read through ``sexp``.
    rpy.set_default_mode(rpy.NO_CONVERSION)
    onetwo = rpy.r.seq(1, 2)
    three = rpy.r.sum(onetwo)
    rpy.set_default_mode(rpy.BASIC_CONVERSION)
    # assertEqual replaces the deprecated assertEquals alias, which no longer
    # exists in Python 3.12's unittest.
    self.assertEqual(3, three.sexp[0])
def testSexp(self):
    # The ``sexp`` attribute of an R object obtained without conversion is an
    # rinterface Sexp, and it cannot be reassigned.
    rpy.set_default_mode(rpy.NO_CONVERSION)
    r_pi = rpy.r.pi
    wrapped = r_pi.sexp
    self.assertTrue(isinstance(wrapped, rpy2.rinterface.Sexp))
    with self.assertRaises(AttributeError):
        setattr(r_pi, 'sexp', None)
# Version-control keyword placeholder.
__source__ = '$URL:: $'

# Nothing is exported by ``from <module> import *``.
__all__ = []

#-------------------------------------------------------------------------------
# standard library imports:
#
import os

#-------------------------------------------------------------------------------
# extension module imports:
#
import rpy2.robjects as robjects
import rpy2.rpy_classic as rpy
import rpy2.robjects.numpy2ri

# Select rpy's BASIC_CONVERSION mode as the module-wide default and expose the
# rpy_classic R interface under the short name ``r``.
rpy.set_default_mode(rpy.BASIC_CONVERSION)
r = rpy.r

#-------------------------------------------------------------------------------
# cecog imports:
#

#-------------------------------------------------------------------------------
# constants:
#

#-------------------------------------------------------------------------------
# functions:
#
# Copyright © #2013 Clément Schaff, Mahdi Ben Jelloul
# Licensed under the terms of the GVPLv3 or later license
# (see openfisca/__init__.py for details)

import os
import gc
from openfisca_core import SRC_PATH
from pandas import HDFStore
from openfisca_france.utils import check_consistency

# Uses rpy2.
# On MS Windows, the environment variables R_HOME and R_USER should be set
try:
    import pandas.rpy.common as com
    import rpy2.rpy_classic as rpy
    rpy.set_default_mode(rpy.NO_CONVERSION)
except Exception:
    # R support is optional: the module must stay importable when rpy2 or R
    # is missing or misconfigured.  ``Exception`` rather than a bare
    # ``except`` so that KeyboardInterrupt and SystemExit still propagate.
    pass

from openfisca_france.data.sources.config import DATA_DIR

ERF_HDF5_DATA_DIR = os.path.join(SRC_PATH, 'countries', 'france', 'data', 'erf')


class SurveyDescription(object):
    """
    An object to describe survey data
    """
    def __init__(self):
        # Year of the survey; None until it is set.
        self.survey_year = None
        # Table descriptions, keyed by table name.
        self.tables = dict()
def coxuh(gene_name, expn_value, surv_time, surv_censor, feature_names, features):
    """
    Fit a Cox proportional-hazards model in R for one gene.

    Samples whose expression value is NaN are removed from ``expn_value``,
    ``surv_time``, ``surv_censor`` and (along axis 1) from ``features``.  The
    remaining data is assigned to R variables and
    ``coxph(Surv(time, censor) ~ gene + <features>)`` from the ``survival``
    package is summarised.

    Parameters
    ----------
    gene_name : stored under the ``'name'`` key of the result.
    expn_value : expression values, one per sample.
    surv_time, surv_censor : survival time and censoring indicator per sample.
    feature_names : names of the extra covariates.  A name of the form
        ``factor{<reference>}: <name>`` is passed to R as a factor releveled
        to ``<reference>``; any other name is passed as a float vector.
    features : covariate values, one row per entry of ``feature_names``.

    Returns
    -------
    dict
        ``{}`` when more than half of the expression values are missing;
        ``{'error': '-1'}`` when R raises ``RRuntimeError``; otherwise a dict
        with ``'name'``, ``'n'`` and one ``{'z': ..., 'p': ...}`` entry per
        row of the coefficient table.
    """
    rpy.set_default_mode(rpy.NO_CONVERSION)
    r_old.library('survival')

    # remove missing data
    skip_cols = [i for i, value in enumerate(expn_value) if np.isnan(value)]
    if len(skip_cols) > (len(expn_value)/2):
        return {}

    expn_value = np.delete(expn_value, skip_cols)
    surv_time = np.delete(surv_time, skip_cols)
    surv_censor = np.delete(surv_censor, skip_cols)
    if len(feature_names) >= 1:
        # Samples are along axis 1 of the feature matrix.
        features = np.delete(features, skip_cols, 1)

    r.assign('time', surv_time)
    r.assign('censor', surv_censor)

    safe_feature_names = []
    for idx, feature_name in enumerate(feature_names):
        if 'factor{' in feature_name:
            match = re.search('factor{(.*)}: (.*)', feature_name)
            reference = match.group(1)
            factor_feature_name = safe_string(match.group(2))
            feature = features[idx].astype(str)
            r.assign(factor_feature_name, robjects.FactorVector(feature))
            # Once we have a feature set up in R, we need to set the reference level:
            # r(feature_name <- relevel(feature_name, reference_level))
            r(factor_feature_name + ' <- relevel('+ factor_feature_name +', "' + reference + '")')
            safe_feature_names.append(factor_feature_name)
        else:
            # The builtin float is what the removed ``np.float`` alias
            # referred to.
            feature = features[idx].astype(float)
            safe_feature_names.append(safe_string(feature_name))
            r.assign(safe_string(feature_name), feature)

    if len(safe_feature_names) >= 1:
        formula_string = 'gene + ' + ' + '.join(safe_feature_names)
        data_frame_string = 'gene, '+ ', '.join(safe_feature_names)
    else:
        formula_string = 'gene'
        data_frame_string = 'gene'

    r.assign('gene', expn_value)
    r('data = data.frame(' + data_frame_string + ')')

    try:
        coxuh_output = r('summary( coxph(formula = Surv(time, censor) ~ ' +
                         formula_string + ', ' +
                         'data = data, model=FALSE, x=FALSE, y=FALSE))')

        # Locate the named components of the R summary object.
        coef_ind = list(coxuh_output.names).index('coefficients')
        coeffs = coxuh_output[coef_ind]

        patient_count_ind = list(coxuh_output.names).index('n')
        patient_count = coxuh_output[patient_count_ind][0]

        cox_dict = {
            'name': gene_name,
            'n': patient_count
        }
        for multivariate in coeffs.rownames:
            cox_dict[multivariate] = {
                'z': coeffs.rx(multivariate, 'z')[0],
                'p': coeffs.rx(multivariate, 'Pr(>|z|)')[0]
            }
        return cox_dict
    except RRuntimeError:
        # R could not fit or summarise the model.
        return {'error': '-1'}