Example #1
class LocfitEstimator(BaseEstimator):
    """
    An R locfit estimator that pretends to work like a sklearn estimator.
    """
    def __init__(self):
        rfile = "/net/noble/vol2/home/katecook/proj/2015HiC-differential/src/call_locfit.R"
        with open(rfile) as rfh:
            string = rfh.read()
        self.locfit = SignatureTranslatedAnonymousPackage(string, "locfit")

    def fit(self, X, y):
        X_robj = robjects.FloatVector(X)
        y_robj = robjects.FloatVector(y)
        self.fit_ = self.locfit.do_fit(X_robj, y_robj)
        return self  # scikit-learn convention: fit returns the estimator

    def predict(self, X):
        check_is_fitted(self, "fit_")
        shape = X.shape
        flat = X.flatten().T
        #logging.debug(flat.shape)
        #logging.debug(flat)
        X_robj = robjects.FloatVector(flat)
        #logging.debug(X_robj)
        y_robj = self.locfit.safepredict(self.fit_, X_robj)
        y_ = np.array(y_robj).T.reshape(shape)
        logging.debug(y_)
        np.savetxt("y.txt", y_, delimiter='\t')
        return y_

    def score(self, X, y):
        yest = self.predict(X)  # safepredict lives on the R package, not on self
        return r2_score(y, yest, sample_weight=None)
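
A minimal usage sketch (hypothetical data; assumes numpy is imported as np, as in the class body):

import numpy as np

# Illustrative only: fit locfit on noisy 1-D data and report the R^2 score.
X = np.linspace(0, 10, 200)
y = np.sin(X) + np.random.normal(scale=0.1, size=200)
est = LocfitEstimator()
est.fit(X, y)
print(est.score(X, y))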
Example #2
def run_hydrology(init_gwstorage, init_C, init_Nash, init_Qq, init_Qs,
                  climate_type):

    if "hydrological" in CONFIG.paths:
        path = CONFIG.paths['hydrological']
    else:
        path = os.path.dirname(__file__)
    #end if

    r_path = os.path.join(path, 'WrappableRunIhacresGw.R')
    with open(r_path) as r_file:
        """
		import .R file and call function
		"""
        string = r_file.read()
        IhacresGW = SignatureTranslatedAnonymousPackage(string, "IhacresGW")

        # Reuse the path resolved above; ensure a trailing slash so datadir can be appended
        workingdir = path if path.endswith("/") else path + "/"

        #workingdir = os.path.dirname(__file__)
        # workingdir = "~/Dropbox/integrated/Mike/hydrological"
        # datadir = workingdir + "/Maules_19690101_20100302"

        datadir = workingdir + "data"
        workingdir = workingdir[:-1]  # Drop the trailing slash; RunIhacresGw expects the bare path

        # sim, tdat = IhacresGW.RunIhacresGw(workingdir, datadir)
        return IhacresGW.RunIhacresGw(workingdir, datadir,
                                      init_gwstorage, init_C,
                                      FloatVector(init_Nash), init_Qq, init_Qs,
                                      climate_type)
Example #3
    def generate_solutions_tables(self):
        ''' Code from Adam: uses rpy2 to execute R code that reads a solutions file into pandas. '''
        col_names = [
            'alpha', 'tau', 'AT', 'b', 'delta', 'LL', 'mode_curv',
            'genome mass', 'sigma.h.hat', 'theta.z.hat', 'sigma.A.hat',
            'theta.Q.hat', 'lambda.hat', 'theta.0', 'frac.het', 'SCNA_LL',
            'entropy', 'Kar_LL', 'WGD', 'combined_LL', 'SSNV_LL',
            'SCNA_Theta_integral', 'dens'
        ]

        # Build R function to be used as a python package
        load_RData_func_str = """
                       load_RData <- function(file_path) {
                          load(file_path)
                          head_name <- ls()[1]
                          file_name <- names(`segobj.list`)[1]
                          r_data <- `segobj.list`[[file_name]]$mode.res$mode.tab
                          return(r_data)
                      }
                      """
        # Pack the function above as a package
        r_pack = SignatureTranslatedAnonymousPackage(load_RData_func_str,
                                                     "r_pack")
        print('Generating absolute tables for %d samples' % len(self.data_table))
        pandas2ri.activate()
        for index, row in self.data_table.iterrows():
            if np.mod(index, 100) == 0:
                print('%d/%d' % (index, len(self.data_table)))
            r_data = r_pack.load_RData(row['absolute_summary_data'])
            abs_table = pd.DataFrame(pandas2ri.ri2py(r_data),
                                     columns=col_names)
            self.pp_modes_tables[row['pair_id']] = abs_table
        pandas2ri.deactivate()
Example #4
def extract_strain_id(busco_result):
    '''
    Parse a BUSCO result table and return the strains whose BUSCO
    completeness is at least 95%.
    input: busco_result (path to the BUSCO result table)
    return: list of strain IDs passing the 95% threshold
    '''
    R_extract_95_code = """
    extract_95 <- function(busco_result) {
        require(tidyverse)
        in_fl <- read.table(busco_result)
        in_fl_95 <- in_fl %>%
        filter(V2 >= 95)
        # print(in_fl_95$V1)
        pb_protein_id=in_fl_95[grep("_PB",in_fl_95$V1,ignore.case = F),1]
        need_remove=gsub("_PB","",pb_protein_id)
        in_fl_95_removed_pb=in_fl_95[!(in_fl_95$V1%in%need_remove),]
        # print(in_fl_95_removed_pb$V1)
        return(as.character(in_fl_95_removed_pb$V1))
    }
    """
    R_extract_95 = SignatureTranslatedAnonymousPackage(R_extract_95_code,
                                                       "R_extract_95")
    R_95_strain = R_extract_95.extract_95(busco_result)
    strain_95_list = list(R_95_strain)
    strain_95_list.remove("70-15")
    strain_95_list.remove("HO_busco")
    strain_95_list.remove("PH42_busco")
    strain_95_list.append("magnaporthe_oryzae_70-15_8_proteins_T0")
    strain_95_list.append("HO")
    strain_95_list.append("PH42")
    return strain_95_list
Example #5
def call_r(df):
    '''
    Arguments:
    df: A string replicating a CSV file. The observations for the dependent
        variable MUST be in the FIRST COLUMN.

    Returns: an rpy2 FloatVector holding the coefficients of the
    linear regression.
    '''
    from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
    from io import StringIO
    from rpy2.robjects import DataFrame
    from rpy2.robjects import FloatVector
    import rpy2.rinterface as ri

    ri.initr()
    file_like_obj = StringIO(df)
    constructor_dict = parser(file_like_obj)
    rpy2_dataframe = DataFrame(constructor_dict)
    with open(r'regression_app\linear_modeler_function.R') as f:
        r_src = f.read()  # avoid shadowing the builtin str
    mod = SignatureTranslatedAnonymousPackage(r_src, 'mod')
    a = mod.linear_modeler(rpy2_dataframe)
    del mod
    return a
Example #6
def remove_MGG_unpresent_Augustus(
        pav_orthofinder, MGG_unpresent_Augustus_unassianed_list_file_name,
        pav_orthofinder_1574):
    '''
    Remove from pav_orthofinder the 1574 genes that appear in the unassigned gene list.
    input 1: pav_orthofinder
    input 2: MGG_unpresent_Augustus_unassianed_list
    output 1: pav_orthofinder_1574
    '''
    R_code_remove_MGG_unpresent_Augustus = '''
    R_remove_MGG_unpresent_Augustus=function(
    pav_orthofinder_file_name,
    MGG_unpresent_Augustus_unassianed_list_file_name,
    pav_orthofinder_1574_file_name
    ){
        require(readxl)
        require(WriteXLS)
        require(dplyr)
        MGG_unpresent_Augustus_unassianed_list=read.table(MGG_unpresent_Augustus_unassianed_list_file_name)
        pav_orthofinder=read_xlsx(pav_orthofinder_file_name)
        pav_orthofinder_1574=pav_orthofinder %>% 
            filter(!(protein_id %in% MGG_unpresent_Augustus_unassianed_list$V1))
        pav_orthofinder_1574=pav_orthofinder_1574[,-1]
        pav_orthofinder_1574=pav_orthofinder_1574[,c(1,158,2:157)]
        WriteXLS::WriteXLS(pav_orthofinder_1574,pav_orthofinder_1574_file_name)
    }
    '''
    R_remove_MGG_unpresent_Augustus = SignatureTranslatedAnonymousPackage(
        R_code_remove_MGG_unpresent_Augustus,
        "R_remove_MGG_unpresent_Augustus")
    R_remove_MGG_unpresent_Augustus.R_remove_MGG_unpresent_Augustus(
        str(pav_orthofinder),
        str(MGG_unpresent_Augustus_unassianed_list_file_name),
        str(pav_orthofinder_1574))
Example #7
def predictPrices(path):
    r = robjects.r
    sourcepath = os.path.abspath("rpy2/project/R/predict.R")
    source = r.source(sourcepath)
    from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
    project = SignatureTranslatedAnonymousPackage(
        "predictPrice <- " + str(source[0]), "project")
    return project.predictPrice(path)
Example #8
def convertRtoPandas(file_path):
    # Pack the module-level load_RData function string (see Example #3) as a package
    r_pack = SignatureTranslatedAnonymousPackage(load_RData_func_str, "r_pack")

    pandas2ri.activate()
    r_data = r_pack.load_RData(file_path)
    py_data = pd.DataFrame(pandas2ri.ri2py(r_data), columns=col_names)
    pandas2ri.deactivate()

    return py_data
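
A hedged usage sketch (the file name is a placeholder; assumes load_RData_func_str and col_names are defined at module level, as in Example #3):

# Illustrative only: the .RData file must contain the `segobj.list` object
# that load_RData_func_str expects.
abs_table = convertRtoPandas("sample_absolute_summary.RData")
print(abs_table.head())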
Example #9
    def getFDRCorrection(pvals):
        rcode = """
            fdr <- function(pvals) {
                return(p.adjust(pvals, method = "fdr"))
            }
            """
        rStats = SignatureTranslatedAnonymousPackage(rcode, "rStats")

        pvals_r = robjects.FloatVector(pvals)
        return rStats.fdr(pvals_r)
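
A short usage sketch (assumes the function is callable as written, e.g. as a static helper, and that robjects is imported):

# Illustrative only: adjust a handful of raw p-values.
raw_pvals = [0.001, 0.02, 0.04, 0.30]
adjusted = getFDRCorrection(raw_pvals)
print(list(adjusted))  # FloatVector -> plain Python floats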
Example #10
def r_cal(df):
    string = """
    ptsPPP <- function(df) {
        require(spatstat)  # ppp() comes from the spatstat package
        X <- with(df, ppp(x, y, c(-25,25), c(-25,25)))
        plot(X)
        return(X)
    }
    """
    sp = SignatureTranslatedAnonymousPackage(string, "powerpack")
    pandas2ri.activate()
    r_num_meanDis_DF = pandas2ri.py2ri(df[["x", "y"]])
    ptsPPP = sp.ptsPPP(r_num_meanDis_DF)
    return ptsPPP
Example #11
	def run(self):

		try:
			result = "0"
			# gray out the calibration buttons
			self.disalbledButtonsCalibration()
			self.ui.resetCalibrationPushButton.setDisabled(True)
			self.actualizeOutFiles()

			with open("fonctions_apprentissage.r", "r", encoding="utf-8") as apprentissageRopen:
				apprentissage = "".join(apprentissageRopen.readlines())

			apprentissage = SignatureTranslatedAnonymousPackage(apprentissage, "apprentissage")

			if args.debug: print("{}\n{}".format(apprentissage, dir(apprentissage)))

			# Check whether the RData file already exists; if so, ask the user whether to overwrite it or stop the analysis
			if os.path.exists(self.calibrationFilesOut["RData"]):
				reply = QMessageBox.question(self, 'WARNING', 'File will be overwritten.\nDo you still want to proceed?', QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
				if reply == QMessageBox.Yes:
					for key, path in self.calibrationFilesOut.items():
						os.remove(path)
					reloadCalibration = True
				elif reply == QMessageBox.No:
					reloadCalibration = False
			else:
				reloadCalibration = True

			if reloadCalibration:
				self.ui.statusbar.showMessage(str("Running calibration, please waiting ...."),9600)
				#doivent être (dans cet ordre) le nom (relatif) des sous-répertoires fond, limbe, lésions
				result , good = apprentissage.apprentissage(self.calibrationInOutPath,"background", "leaf", "lesion").r_repr().replace('"','').replace("c(","").replace(")","").split(",")
				self.calibrationFileOpenLineEdit.setText(self.calibrationFilesOut["RData"])
				#print(result, good)
			if result == "1" and os.path.exists(self.calibrationFilesOut["RData"]):
				print(result, self.calibrationFilesOut["RData"])
				self.infoDialogue(status = "new")
				self.ui.statusbar.showMessage(str("FINISH, files were product on : %s" % self.calibrationInOutPath),9600)
				self.ui.resetCalibrationPushButton.setEnabled(True)
			elif  result == "0" and os.path.exists(self.calibrationFilesOut["RData"]):
				self.infoDialogue(status = "already")
				print(result, self.calibrationFilesOut["RData"])
				self.calibrationFileOpenLineEdit.setText("")
				self.ui.resetCalibrationPushButton.setEnabled(True)
				self.resetLoadFolder()
				self.enableButtonsCalibration()
			elif result == "0" and not os.path.exists(self.calibrationFilesOut["RData"]):
				self.displayError(typeError = "ERROR:", message = "Error when running R code....")
				self.resetLoadFolder()
		except Exception as e:
			self.displayError(typeError = "ERROR:", message = "Error when running R code....\n" + str(e))
			self.resetLoadFolder()
			self.ui.resetCalibrationPushButton.setEnabled(True)
Example #12
	def run(self):
		try:
			result = "0"
			# gray out the calibration buttons
			self.disalbledButtonsCalibration()
			self.ui.resetCalibrationPushButton.setDisabled(True)

			with open("fonctions_apprentissage.r", "r", encoding="utf-8") as apprentissageRopen:
				apprentissage = "".join(apprentissageRopen.readlines())

			apprentissage = SignatureTranslatedAnonymousPackage(apprentissage, "apprentissage")

			self.CalibrationOutPath = "/".join(str(self.dicoFoldersCalibration["leaf"]).split("/")[:-1])
			self.CalibrationBasename = self.CalibrationOutPath.split("/")[-1]
			self.actualizeOutFiles()

			if debug: print("{}\n{}".format(apprentissage, dir(apprentissage)))

			# Check whether the RData file already exists; if so, ask the user whether to overwrite it or stop the analysis
			if os.path.exists(self.CalibrationFilesOut["RData"]):
				reply = QMessageBox.question(self, 'Warning', 'File will be overwritten.\nDo you still want to proceed?', QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
				if reply == QMessageBox.Yes:
					for key, path in self.CalibrationFilesOut.items():
						os.remove(path)
					reloadCalibration = True
				elif reply == QMessageBox.No:
					reloadCalibration = False
			else:
				reloadCalibration = True

			if reloadCalibration:
				self.ui.statusbar.showMessage(str("Running Calibration, please waiting ...."),9600)
				#result , self.CalibrationFilesOut["RData"] = apprentissage.apprentissage(self.dicoObjectOpenLineEditCalibration["leaf"],self.dicoObjectOpenLineEditCalibration["symptom"],self.dicoObjectOpenLineEditCalibration["background"]).r_repr().replace('"','').replace("c(","").replace(")","").split(",")
				result , good = apprentissage.apprentissage(self.CalibrationOutPath).r_repr().replace('"','').replace("c(","").replace(")","").split(",")
				self.calibrationFileOpenLineEdit.setText(self.CalibrationFilesOut["RData"])
			if result == "1" and os.path.exists(self.CalibrationFilesOut["RData"]):
				print(result, self.CalibrationFilesOut["RData"])
				self.infoDialogue(status = "new")
				self.ui.statusbar.showMessage(str("FINISH, files were product on : %s" % self.CalibrationOutPath),9600)
				self.ui.resetCalibrationPushButton.setEnabled(True)
			elif  result == "0" and os.path.exists(self.CalibrationFilesOut["RData"]):
				self.infoDialogue(status = "already")
				print(result, self.CalibrationFilesOut["RData"])
				self.calibrationFileOpenLineEdit.setText("")
				self.ui.resetCalibrationPushButton.setEnabled(True)
				self.resetLoadFolder()
				self.enableButtonsCalibration()
			elif result == "0" and not os.path.exists(self.CalibrationFilesOut["RData"]):
				self.displayError(error = "Error when running R code....")

		except Exception as e:
			self.displayError(error = e)
Example #13
def mkSimSem(variances=[0.5, 1.1, 0.8, 0.4, 0.4, 0.8, 0.8, 0.5, 0.6]):
    string = """
    library(lavaan)
    mkdata=function(n){
    popModel <- "
        f1 =~ 1*y1 + 0.6*y2 + 0.7*y3
    f2 =~ 1*y4 + 1.1*y5 + 0.9*y6
    f3 =~ 1*y7 + 1.2*y8 + 1.1*y9
    f1 ~~ 0.8*f1
    f2 ~~ 0.9*f2
    f3 ~~ 0.4*f3
    f1 ~~ 0.4*f2
    f1 ~~ 0.2*f3
    f2 ~~ 0.3*f3
    y1 ~~ %f*y1
    y2 ~~ %f*y2
    y3 ~~ %f*y3
    y4 ~~ %f*y4
    y5 ~~ %f*y5
    y6 ~~ %f*y6
    y7 ~~ %f*y7
    y8 ~~ %f*y8
    y9 ~~ %f*y9
    "
    analyzeModel <- "
        f1 =~ y1 + y2 + y3
    f2 =~ y4 + y5 + y6
    f3 =~ y7 + y8 + y9
    "

    s=simulateData(popModel,sample.nobs=n)
    return(s)
    }""" % tuple((i for i in variances))

    return SignatureTranslatedAnonymousPackage(string, "semsimdata")
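
A short usage sketch (the sample size is illustrative; requires the lavaan R package):

# Illustrative only: build the package, then simulate n = 200 observations.
semsim = mkSimSem()
simulated = semsim.mkdata(200)  # R data.frame with columns y1..y9
print(simulated)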
Example #14
def get_taxon_abundance_stacked_bar_plot():
    box_plot_fnc = """
        require("dplyr")
        require("ggplot2")

        taxon_abundance_stacked_bar_plot <- function(data, plot_file_path, title, xlabel, ylabel) {

            temp <- data[order(data$variant_allele_count),] #sort by variant_allele_count
            temp$genotype <- factor(temp$genotype,levels=unique(temp$genotype)) #use reordered genotypes as levels

            #creates a new data frame with median abundance from each combo
            result <- temp %>%
                group_by(genotype, gene, taxon) %>%
                    summarize(medianAbundance = median(abundance))
            #If you want the heights of the bars to represent values in the data,
            #use stat="identity" and map a value to the y aesthetic.

            pdf(plot_file_path, width=8, height=4)

            ap <- ggplot(data=result, aes(x=genotype,y=medianAbundance,fill=taxon)) +
                geom_bar(stat='identity') +
                ggtitle(title)
            ap <- ap + labs(x=xlabel, y=ylabel)
            ap <- ap + theme(legend.direction = 'vertical', legend.position = 'bottom')
            ap <- ap + guides(fill = guide_legend(reverse = TRUE))
            print(ap)
            dev.off()
        }
        """
    pck = SignatureTranslatedAnonymousPackage(box_plot_fnc, 'pck')
    return pck.taxon_abundance_stacked_bar_plot
Example #15
def get_taxon_abundance_box_plot():
    box_plot_fnc = """
        require("dplyr")
        require("ggplot2")

        taxon_abundance_box_plot <- function(data, plot_file_path, title, xlabel, ylabel) {

            temp <- data[order(data$variant_allele_count),] #sort by variant_allele_count
            temp$genotype <- factor(temp$genotype,levels=unique(temp$genotype)) #use reordered genotypes as levels

            pdf(plot_file_path)

            ap <- ggplot(data=temp,
                         aes(x=genotype,y=abundance)
                  )
            ap <- ap + geom_boxplot()
            ap <- ap + ggtitle(title)
            ap <- ap + labs(x=xlabel, y=ylabel)
            ap <- ap + geom_jitter(position=position_jitter(w=0.1))

            print(ap)
            dev.off()
        }
        """
    pck = SignatureTranslatedAnonymousPackage(box_plot_fnc, 'pck')
    return pck.taxon_abundance_box_plot
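
A hedged usage sketch (toy data frame; the column names must match the R code, and pandas2ri.py2ri follows the rpy2 2.x API used elsewhere in these examples):

import pandas as pd
from rpy2.robjects import pandas2ri

# Illustrative only: three genotypes with made-up abundances.
df = pd.DataFrame({
    "variant_allele_count": [0, 1, 2, 0, 1, 2],
    "genotype": ["0/0", "0/1", "1/1", "0/0", "0/1", "1/1"],
    "abundance": [0.1, 0.4, 0.7, 0.2, 0.5, 0.6],
})
pandas2ri.activate()
box_plot = get_taxon_abundance_box_plot()
box_plot(pandas2ri.py2ri(df), "abundance.pdf", "Taxon abundance", "Genotype", "Abundance")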
Example #16
def set_minus_cut(start_point, pav_df_file_name, result_path):
    R_code_set_minus_cut='''
    Cut=function(start_point,pav_df_file_name,result_path){
    require(readxl)
    require(WriteXLS)
    require(tidyverse)
    pav_df=read_xlsx(pav_df_file_name)
    gene_is=pav_df %>% 
        filter((!!sym(start_point))==1) %>%
        # filter(`70-15`)==1
        select("protein_id") 
    
    minus_part=pav_df %>% 
        filter((!!sym(start_point))==1) %>% 
        column_to_rownames("protein_id")
    # pav_df_colsum=colSums(minus_part_num)
    # pav_df_colsum_sort=sort(pav_df_colsum)
    add_part=pav_df %>% 
        filter((!!sym(start_point))==0) %>% 
        column_to_rownames("protein_id")
    # add_part=add_part %>% 
    #   column_to_rownames(pav_df_raw$...2)
    add_part_num=sapply(add_part[2:157], function(x) as.numeric(x))
    pav_df_colsum=colSums(add_part_num)
    pav_df_colsum_sort=sort(pav_df_colsum)
    
    write.table(attributes(pav_df_colsum_sort),paste(result_path,sprintf("set_minus_sort_protein_id_%s.txt", start_point),sep = ""),append = F,quote = F,row.names = F,col.names = F)
    write.table(pav_df_colsum_sort,paste(result_path,sprintf("set_minus_sort_protein_id_num_%s.txt", start_point),sep = ""),append = F,quote = F,row.names = T,col.names = F)
    
    
    WriteXLS::WriteXLS(
        minus_part,
        paste(result_path,sprintf("set_minus_minus_%s.xlsx", start_point),sep = ""),
        col.names = T,
        row.names = T
    )
    WriteXLS::WriteXLS(
        add_part,
        paste(result_path,sprintf("set_minus_add_%s.xlsx", start_point),sep = ""),
        col.names = T,
        row.names = T
    )
    write.table(gene_is,paste(result_path,sprintf("set_minus_gene_id_%s.txt", start_point),sep = ""),append = F,quote = F,row.names = F,col.names = F)
    }
    '''
    R_set_minus_cut = SignatureTranslatedAnonymousPackage(R_code_set_minus_cut, "R_set_minus_cut")
    R_set_minus_cut.Cut(start_point, str(pav_df_file_name), result_path)
Example #17
def run_mccoil(barcode_file_lines, maf_file_lines=None, verbose=False):
    ## init barcode set
    barcode_set = Barcode.SetOfBarcodes()
    barcode_set.readBarcodeFileLines(barcode_file_lines)
    # validate
    is_valid_barcode_set, err_msg = barcode_set.validate()
    if not is_valid_barcode_set:
        print(err_msg)
        return []
    ## init mafs
    if not maf_file_lines:
        mafs = barcode_set.computeMAFFromBarcodes(1)
    else:
        mafs = MAF.MAF()
        mafs.readMAFFileLines(maf_file_lines)
    mafs_R_vector = robjects.FloatVector(mafs.minor_allele_freqs())
    # validate
    is_valid_mafs, err_msg = mafs.validate()
    if not is_valid_mafs:
        print(err_msg)
        return {}
    ## compute zygosity matrix, then convert to R DataFrame
    zygosity_matrix = barcode_set.to_zygosity_matrix(mafs,
                                                     header=True,
                                                     index=True)
    if verbose: print(zygosity_matrix)
    data = to_R_zygosity_df(zygosity_matrix)
    if verbose: print(data)
    ## import MCCOIL
    with open(mccoil_R) as f:
        mccoil_R_code = f.read()
    mccoil = SignatureTranslatedAnonymousPackage(mccoil_R_code, 'mccoil')
    ## compute result
    result = mccoil.McCOIL_categorical(data, P=mafs_R_vector)
    #print result
    ## get sites/samples
    sites, samples = list(result[-2]), list(result[-1])
    ## get maf/coi predictions, which map 1-to-1 sites/samples
    # (i.e. maf_prediction[i] is prediction for site[i])
    maf_predictions, coi_predictions = list(result[6]), list(result[5])
    return ({
        'mafs': list(zip(sites, maf_predictions)),
        'cois': list(zip(samples, coi_predictions))
    })
Example #18
def screw_around():

    pi = robj.r['pi']
    print(pi)
    print(pi[0])
    print(pi[0] + 2)  # index into the R vector before doing Python arithmetic

    #create fake binned array
    nrow = 5
    ncol = 10
    counter = 0
    binned = np.zeros((nrow, ncol), dtype="float64")
    for row in range(nrow):
        for col in range(ncol):
            binned[row, col] = counter
            counter += 1
    #print binned
    
    #get binned array into R data.frame
    #vec = robj.FloatVector([1.1, 2.2, 0, 4.4, 5.5, ])
    #print binned.shape
    print(numpy2ri(binned))
    rdf = robj.r['data.frame'](numpy2ri(binned), code="ID1000")
    #print rdf

    # now see if we can get R to use this dataframe 
    myRcode = """
    square <- function(rdf) {
        myv = rdf$X2 + rdf$X3
        return(myv)
    }
    doit <- function() {
        source("/srv/scratch/carolyn/Dengue_code/Rtest_rpy.R") 
        run_test_wrap(3)
    }
    """
    print "wwwwah"
    powerpack = SignatureTranslatedAnonymousPackage(myRcode, "powerpack")
    print powerpack._rpy2r.keys() #to reveal the functions within powerpack
    print powerpack.square(rdf) #to run the function "square" found in powerpack
    print powerpack.doit()
Example #19
 def get_arima_rsi(prices):
     df = pd.DataFrame(prices)
     pandas2ri.activate()
     calculate_models = """ calculate <- function(x, size=100){
                             x <- na.omit(x)
                             library(TTR)
                             library(stats)
                             x <- ts(x)
                             f <- function(m) class(try(solve(m),silent=T))=="matrix"
                             if(f(x)){
                             x[50] = x[50] + 2
                             }
                             arima <- arima(x, c(0,0,0))
                             rsi <- RSI(x, size-1)[size]
                             list <-c(arima$coef, rsi)
                             return(as.array(list))
                             }"""
     calculate = SignatureTranslatedAnonymousPackage(calculate_models, "calculate")
     stats = calculate.calculate(df, len(df))
     return stats
Example #20
    def __init__(self, r_filename):

        # Read in r file and init data engine
        string = ""
        with open(r_filename, "r") as myfile:
            string = ''.join(myfile.readlines())
        r = SignatureTranslatedAnonymousPackage(string, "r")

        robjects.r(string)

        self.r = r
Example #21
def load_r_file(filename, namespace):
    if namespace not in r_namespaces:
        import rpy2.robjects.numpy2ri
        rpy2.robjects.numpy2ri.activate()
        if PROJECT_DIR not in filename:
            filename = os.path.join(PROJECT_DIR, 'r_src', 'forAndrej',
                                    filename)
        with open(filename, 'r') as pout:
            source = pout.read()
        res = SignatureTranslatedAnonymousPackage(source, namespace)
        r_namespaces[namespace] = res
    return r_namespaces[namespace]
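
A short usage sketch ("model.R" is a placeholder script under r_src/forAndrej):

# Illustrative only: source the script once, then reuse the cached namespace.
pkg = load_r_file("model.R", "model")
assert load_r_file("model.R", "model") is pkg  # second call hits the cache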
Example #22
def arxiv_crawl(crawling_list,
                limit=None,
                batchsize=100,
                submission_range=None,
                update_range=None,
                delay=None):
    """
    This is a python wrapper for the aRxiv "arxiv_search" function.

    If submission_range or update_range are given, the results are filtered according to the date ranges.

    :param crawling_list: The subcategories to crawl. NOT "stat" -> USE "stat.AP" etc...
    :type crawling_list: dict of lists.
    :param limit: Max number of results to return.
    :type limit: int.
    :param batchsize: Number of queries per request.
    :type batchsize: int.
    :param submission_range: The range of submission dates.
    :type submission_range: Tuple (start,end).
    :param update_range: The range of last-update dates.
    :type update_range: Tuple (start,end).

    :returns:  The created folder
    """

    # Timestamp of starting datetime
    ts_start = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts_start).strftime(
        '%Y-%m-%d_%H-%M-%S')

    # Create folder structure
    working_folder = base_directory + timestamp
    os.makedirs(working_folder)
    os.makedirs(working_folder + "/temp_files")

    # Setup logging
    config = logging_confdict(working_folder, __name__)
    logging.config.dictConfig(config)
    arxiv_logger = logging.getLogger(__name__)

    arxiv_logger.info("Starting new crawl for {}".format(str(crawling_list)))
    arxiv_logger.info("Created new folder: <<" + working_folder + ">>")

    # Load R-scripts
    arxiv_logger.debug("Loading R-Scripts ...")
    try:
        with open('../r_scripts/arxiv.R', 'r') as f:
            string = ''.join(f.readlines())
        arxiv_crawler = SignatureTranslatedAnonymousPackage(
            string, "arxiv_crawler")
    except Exception as e:
        arxiv_logger.exception("Error while loading R-Scripts.")
        sys.exit('Could not load R-Scripts!')
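
A hedged usage sketch (the category list and limit are illustrative; assumes ../r_scripts/arxiv.R and base_directory are set up as above):

# Illustrative only: crawl two stat subcategories, 100 queries per request.
working_folder = arxiv_crawl({"stat": ["stat.AP", "stat.ML"]}, limit=500, batchsize=100)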
Example #23
def xml2df(url):
     # make some terrible R code
     from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
     from rpy2.robjects import pandas2ri

     string = """
     require(XML)
     require(plyr)

     getXML <- function(x) {
          xmlfile <- xmlTreeParse(x)
          temp = xmlToList(xmlfile, addAttributes = F)
          df <- ldply(temp, .fun=function(x) {data.frame(t(unlist(x)))})
          return(df)
     }
     """
     test = SignatureTranslatedAnonymousPackage(string, "test")

     # make a pandas DF out of the stupid R df
     pydf = pandas2ri.ri2py_dataframe(test.getXML(url))
     return pydf
Example #24
def file_to_anonymous_package(
        file: str) -> SignatureTranslatedAnonymousPackage:
    """
    Takes some file.R and sources it in rpy2 as an anonymous package
    Returns the R package as an object
    The name of the package is accessible by package.__rname__ as str
    """
    package_name = os.path.splitext(os.path.split(file)[1])[0]
    with open(file, "r") as r_package_file:
        r_package_src = r_package_file.read()
    package_src = SignatureTranslatedAnonymousPackage(r_package_src,
                                                      name=package_name)
    return package_src
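
A short usage sketch ("analysis.R" and my_function are placeholders):

# Illustrative only: the package name is derived from the file's basename.
pkg = file_to_anonymous_package("analysis.R")
print(pkg.__rname__)          # -> "analysis"
result = pkg.my_function(42)  # call an R function defined in the file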
Example #25
def main():
    filenames, outdir = parse_arguments() #filenames will be a list
    os.chdir(outdir) #change pwd to output directory
    start_time_overall = time.time()
  
    rt_grid_size = 50
    mz_grid_size = 50
    log_statement( "Number of mzml patient files: {}".format(len(filenames)) )
    
    #build empty np array to be filled with LCMS values for R prediction
        #dimension will be (# mzml files) by (# rt/mz bins + 1 for patient ID)
    floatD = np.zeros((len(filenames),rt_grid_size*mz_grid_size), dtype=float) #i vals
    strD = np.zeros((len(filenames),1), dtype='a6') #a6 is dtype for 6 char str
    respD = np.hstack((strD, floatD))

    #fill the array
    for filecount, filename in enumerate(filenames):
        if filecount<1000:
            respD = fill_row_of_lcms_matrix(respD, 
                    rt_grid_size, mz_grid_size, filecount, filename)
    print "\n Data to use in R: " , respD[0:5,0:20]

    log_statement("Time till beginning of R section: {} minutes".format(
    (time.time() - start_time_overall)/60. ) )

    #convert numpy array into data.frame recognized by R
    Rdf = robj.r['data.frame'](numpy2ri(respD))
    #use this dataframe in R prediction code 
    myRcode = """
    doR <- function(python_respD, lcms_run) {
        source("/srv/scratch/carolyn/Dengue_code/prediction_with_LCMS_from_python.R") 
        run_predictions_wrap(python_respD, lcms_run)
    }
    """
    Rpack = SignatureTranslatedAnonymousPackage(myRcode, "Rpack")
    print(Rpack.doR(Rdf, lcms_run=2))  #to run the function doR, found in Rpack
    
    log_statement("Total execution time: {} minutes".format(
        (time.time() - start_time_overall)/60. ) ) #40 min to create binned data using pickles
Example #26
def main():
    mydata = []
    # Open and Read the data file from a csv to convert it to a list
    with open('breakout_detection_wraper/fuel_data.csv', 'r') as csvfile:
        dat = csv.reader(csvfile)
        for line in dat:
            mydata.append(float(line[0]))

    # Define the parameters to configure the breakout detection algorithm
    minsize = 30
    method = 'multi'
    degree = 1
    # Open the R file to run it on the wrapper
    with open('breakout_detection_wraper/breakout_function.R') as code:
        rcode = code.read()  # read() avoids doubling line endings
        # Create the wrapper as an anonymous package signature
        wrapper = SignatureTranslatedAnonymousPackage(rcode,
                                                      "breakout_function")
    # Execute the method from the wrapper
    result = wrapper.Detect(FloatVector(mydata), minsize, method, degree)
    # Print the result returned from the R function
    print(result)
Example #27
def crossref_lookup(working_folder,
                    index,
                    authors,
                    titles,
                    submitted,
                    num_threads=1):
    # Load r-scripts
    print("\nLoading R-Scripts ...")
    with open('../r_scripts/doi_lookup.R', 'r') as f:
        string = ''.join(f.readlines())
    doi_lookuper = SignatureTranslatedAnonymousPackage(string, "doi_lookuper")

    cr_input_queue = Queue.Queue()
    cr_to_process = Queue.Queue()
    process_to_result = Queue.Queue()

    doc_count = 0
    for idx, author, title, date in zip(index, authors, titles, submitted):
        tokens = author.split("|")
        if len(tokens) >= 15:
            author = "|".join(tokens[:15])
        cr_input_queue.put((idx, author, title, date))
        doc_count += 1

    process_thread = ProcessingThread(working_folder, cr_to_process,
                                      process_to_result, doc_count)

    print("\nStarting crossref crawl process...")
    crossref_threads = []
    for i in range(num_threads):
        thread = CrossrefAPIThread(cr_input_queue, cr_to_process, doi_lookuper)
        thread.start()
        crossref_threads.append(thread)

    process_thread.start()

    for thread in crossref_threads:
        thread.event.set()

    for thread in crossref_threads:
        thread.join()

    process_thread.event.set()

    process_thread.join()

    results = []
    while not process_to_result.empty():
        results.append(process_to_result.get())

    return results
Example #28
 def __init__(self):
     self.pd_active()
     self._dotty = self.func('py_dotted_data', self.DOTTED_DATA)
     self.pdata = self.func('py_packdata', self.PACKDATA)
     self.available = self.func('py_avail',self.AVAIL)()
     self.installed = self.calc('installed.packages')
     self._histlines = self.func('py_draw_histlines', self.HISTLINES)
     self._lines = self.func('py_draw_lines', self.LINES)
     self._hist  = self.func('py_draw_hist', self.HIST)
     self._pareto = self.func('py_draw_pareto', self.PARETO)
     self._sleaf = self.func('py_draw_sleaf', self.SLEAF)
     self._csv   = self.func('py_r_csvread', self.CSV)
     self._tapply = self.func('py_r_tapply', self.TAPPLY)
     self._closest = self.func('py_r_closest', self.CLOSEST)
     self._jupyter_opt = self.func('py_r_jupyter', self.JUPYTER_OPT)
     self._readtab = self.func('py_r_readtab', self.READTAB)
     self._mul   = self.func('py_r_mul', self.MUL)
     self._exp   = self.func('py_r_exp', self.EXP)
     self._div   = self.func('py_r_div', self.DIV)
     self._add   = self.func('py_r_add', self.ADD)
     self._sub   = self.func('py_r_sub', self.SUB)
     self._str   = self.func('py_r_str', self.STR)
     self._cond   = self.func('py_r_cond', self.COND)
     self._lmplot   = self.func('py_r_lmplot', self.LMPLOT)
     self._plot   = self.func('py_r_plot', self.PLOT)
     self._splot   = self.func('py_r_splot', self.SPLOT)
     self._samps = self.func('py_sample_size', self.SAMP_SIZE)
     self._sample = self.func('py_sample', self.SAMPLE)
     self._randomsample = self.func('py_randomsample', self.RANDOMSAMPLE)
     self._column_ext = self.func('py_column_extract', self.COLUMN_EXTRACT)
     self._serror = self.func('py_serror_samp', self.SERROR_SAMP)
     self._pretty = self.func('py_pretty_frame', self.PRETTY_FRAME )
     self._show_row = self.func('py_show_row', self.SHOW_ROW)
     self._some_nums = self.func('py_some_numbers', self.SOME_NUMBERS)
     self._packs = self.func('py_packs', self.PACKS)
     self._help = self.func('py_help', self.HELP)
     self._contents = self.func('py_contents', self.PACK_CONTENTS)
     self.anon  = SignatureTranslatedAnonymousPackage(self.ANON_PACK, "anon_pack")
Example #29
def individual(ind,individual):
   t = (ind,individual)
   cur = con.cursor()
   cur.execute("select * FROM ERV WHERE repName=? AND genoStart=?", t)
   ind = cur.fetchone() #Since we expect only one result, fetchone() makes the search faster.
   
   with open('cariofunctions.R', 'r') as f: #Read the R script.
     string_again = f.read()

   imgs = STAP(string_again, "imgs") #Creates and object than later can be used to calle the functions inside of the R script.
   c = ind["genoName"][:5] #Get all the variables from the database.
   c = c.replace("_", "")  #KaryoplotteR recognizes the chromosome name like chr1, chr2, etc. Since the first letters of the name of the 
   start = int(ind["genoStart"]) #chromosomes from the database match with this nomenclature, we make sure to get the names correctly.  
   end = int(ind["genoEnd"])
   pngs = ""
   zpngs = ""
   if c!="chrUn":
      pngs = "static/img/"+c+str(start)+".png"  #Allows to name the final images and locate them in the right folder. 
      zpngs = "static/img/z"+c+str(start)+".png"
      cimg = imgs.localization(c,start,end,pngs) #Calls the R functions from the script. 
      zimg = imgs.zoom(c,start,end,zpngs)
   else:
      pngs =  "static/img/un.png" #In case that the chromosome is unknown (some cases in the database), an alternative image is displayed.
   return render_template('individual.html', ind = ind, pngs=pngs, zpngs=zpngs)
Example #30
def filter_pan_id(pan_id_file_name, length_table_file,
                  filtered_pan_id_file_name):
    '''
    input 1: pan_id_file_name
    input 2: length_table_file
    output 1: filtered_pan_id_file_name
    '''
    R_code = '''
    filter_pan_id=function(pan_id_file_name,length_table_file,filtered_pan_id_file_name){
    require(dplyr)
    pan_id=read.table(pan_id_file_name)
    length_table=read.table(length_table_file)
    length_table_filter=length_table %>% 
        filter(V2>20)
    pan_id_filter=merge(pan_id,length_table_filter,by.x = 2,by.y = 1,all.y = T)
    pan_id_filter=pan_id_filter[,-3]
    pan_id_filter=pan_id_filter[,c(2,1)]
    write.table(pan_id_filter,filtered_pan_id_file_name,sep = "\t",quote = F,row.names = F,col.names = F)
    }
    '''
    R_filter_pan_id = SignatureTranslatedAnonymousPackage(
        R_code, "R_filter_pan_id")
    R_filter_pan_id.filter_pan_id(pan_id_file_name, length_table_file,
                                  filtered_pan_id_file_name)
Example #31
def MGG_unpresent_Augustus_locate_in_pav_orthofinder(
        MGG_unpresent_Augustus_list_file_name, pav_orthofinder_file_name,
        gene_protein_mapping_table_file_name,
        pav_MGG_unpresent_Augustus_file_name,
        orthofinder_unassianed_tsv_file_name,
        MGG_unpresent_Augustus_unassianed_list_file_name):
    R_code = '''
    MGG_unpresent_Augustus_locate_in_pav_orthofinder=function(MGG_unpresent_Augustus_list_file_name,pav_orthofinder_file_name,gene_protein_mapping_table_file_name,pav_MGG_unpresent_Augustus_file_name,orthofinder_unassianed_tsv_file_name,MGG_unpresent_Augustus_unassianed_list_file_name){
        require(readxl)
        require(WriteXLS)
        require(dplyr)
        MGG_unpresent_Augustus_list=read.table(MGG_unpresent_Augustus_list_file_name,stringsAsFactors = F)
        pav_orthofinde=read_xlsx(pav_orthofinder_file_name)
        pav_orthofinde=pav_orthofinde[,-1]

        gene_protein_mapping_table=read.table(gene_protein_mapping_table_file_name)

        pav_orthofinde_protein=merge(MGG_unpresent_Augustus_list,gene_protein_mapping_table,by.x = 1,by.y = 1)
        pav_MGG_unpresent_Augustus =pav_orthofinde %>% 
        filter(protein_id %in% pav_orthofinde_protein$V2)
        pav_MGG_unpresent_Augustus=pav_MGG_unpresent_Augustus[,c(158,1:157)]
        WriteXLS::WriteXLS(pav_MGG_unpresent_Augustus,pav_MGG_unpresent_Augustus_file_name)

        orthofinder_unassianed=read.table(orthofinder_unassianed_tsv_file_name,sep = "\t",header = T,check.names = F)
        MGG_unpresent_Augustus_unassianed_list=intersect(pav_MGG_unpresent_Augustus$protein_id,orthofinder_unassianed$`70-15_protein`)
        write.table(MGG_unpresent_Augustus_unassianed_list,MGG_unpresent_Augustus_unassianed_list_file_name,quote = F,row.names = F,col.names = F)
}
    '''
    R_MGG_unpresent_Augustus_locate_in_pav_orthofinder = SignatureTranslatedAnonymousPackage(
        R_code, "R_MGG_unpresent_Augustus_locate_in_pav_orthofinder")
    R_MGG_unpresent_Augustus_locate_in_pav_orthofinder.MGG_unpresent_Augustus_locate_in_pav_orthofinder(
        MGG_unpresent_Augustus_list_file_name, pav_orthofinder_file_name,
        gene_protein_mapping_table_file_name,
        pav_MGG_unpresent_Augustus_file_name,
        orthofinder_unassianed_tsv_file_name,
        MGG_unpresent_Augustus_unassianed_list_file_name)
Example #32
    def fit_pogs(self, X, y, tau = 0.5):
        """
        Using POGS for quantile regression

        :param X: independent variables
        :param y: response variable
        :param tau: quantile
        :return: coefficients
        """
        X["Intercept"] = 1
        qr_pogs_model = qr_pogs()
        pb = SignatureTranslatedAnonymousPackage(qr_pogs_model.qr_pogs_def, "powerpack")

        t0 = time.time()
        ret = pb.qr_pogs(X=X, y=y, tau=tau)
        t1 = time.time()

        output = {"Coefficients":{}, "Time (s)":t1-t0}

        for i, column in enumerate(X.columns.values):
            output["Coefficients"][column] = ret[i]

        return output
Example #33
def matchit(outcome,
            treatment,
            data,
            method='nearest',
            distance='glm',
            replace=False):
    if replace:
        replace = 'TRUE'
    else:
        replace = 'FALSE'
    data.to_csv('data.csv', index=False)
    formula_cov = treatment + ' ~ '
    i = 0
    for cov in data.columns:
        if cov != outcome and cov != treatment:
            if i != 0:
                formula_cov += '+'
            formula_cov += str(cov)
            i += 1
    string = """
    library('MatchIt')
    data <- read.csv('data.csv')
    r <- matchit( %s,estimand="ATE", method = "%s", data = data, replace = %s)
    matrix <- r$match.matrix[,]
    names <- as.numeric(names(r$match.matrix[,]))
    mtch <- data[as.numeric(names(r$match.matrix[,])),]
    hh <- data[as.numeric(names(r$match.matrix[,])),'%s']- data[as.numeric(r$match.matrix[,]),'%s']
    
    data2 <- data
    data2$%s <- 1 - data2$%s
    r2 <- matchit( %s, estimand="ATE", method = "%s", data = data2, replace = %s)
    matrix2 <- r2$match.matrix[,]
    names2 <- as.numeric(names(r2$match.matrix[,]))
    mtch2 <- data2[as.numeric(names(r2$match.matrix[,])),]
    hh2 <- data2[as.numeric(r2$match.matrix[,]),'%s'] - data2[as.numeric(names(r2$match.matrix[,])),'%s']
    """ % (formula_cov, method, replace, outcome, outcome, treatment,
           treatment, formula_cov, method, replace, outcome, outcome)

    psnn = SignatureTranslatedAnonymousPackage(string, "powerpack")
    match = psnn.mtch
    match2 = psnn.mtch2
    t_hat = pd.DataFrame(np.hstack((np.array(psnn.hh), np.array(psnn.hh2))),
                         index=list(psnn.names.astype(int)) +
                         list(psnn.names2.astype(int)),
                         columns=['CATE'])
    ate = np.mean(t_hat['CATE'])
    return ate
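
A hedged usage sketch (toy data; requires the MatchIt R package, and the outcome/treatment/covariate names are placeholders):

import numpy as np
import pandas as pd

# Illustrative only: binary treatment 't', outcome 'y', two covariates.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    "y": rng.normal(size=100),
    "t": rng.integers(0, 2, size=100),
    "x1": rng.normal(size=100),
    "x2": rng.normal(size=100),
})
print(matchit("y", "t", df, method="nearest", replace=True))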
Example #34
def Bing_cust(lam1, lam2, lam3):
    string_rbing1 = """
    rbingham <- function(n, A) {
       p <- ncol(A)  ## dimensionality of A
       eig <- eigen(A)
       V <- eig$vectors  ## eigenvectors
       lam <- c(%f,%f,%f)
       lam <- lam - lam[p]
       lam <- lam[-p]
       ### f.rbing part
       lam <- sort(lam, decreasing = TRUE)  ## sort the eigenvalues in descending order
       nsamp <- 0
       X <- NULL
       lam.full <- c(lam, 0)
       qa <- length(lam.full)
       mu <- numeric(qa)
       sigacginv <- 1 + 2 * lam.full
       SigACG <- sqrt( 1 / ( 1 + 2 * lam.full ) )
       Ntry <- 0
    
       while (nsamp < n) {
         x.samp <- FALSE
         while ( !x.samp ) {
           yp <- rnorm(qa, mu, SigACG)
           y <- yp / sqrt( sum( yp^2 ) )
           lratio <-  - sum( y^2 * lam.full ) - qa/2 * log(qa) + 0.5 * (qa - 1) + qa/2 * log( sum(y^2 * sigacginv ) )
           if ( log(runif(1) ) < lratio) {
             X <- c(X, y)
             x.samp <- TRUE
             nsamp <- nsamp + 1
           }
           Ntry <- Ntry + 1
         }
       }
    
       x <- matrix(X, byrow = TRUE, ncol = qa)
       ## the avtry is the estimate of the M in rejection sampling
       ## 1/M is the probability of acceptance
       ## the x contains the simulated values
       tcrossprod(x, V) ## simulated data
     }
    """ % (lam1, lam2, lam3)  # 200,0.05

    powerpack1 = SignatureTranslatedAnonymousPackage(string_rbing1,
                                                     "powerpack")

    return powerpack1
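
A short usage sketch (the concentration parameters and the matrix A are illustrative):

from rpy2.robjects import r, FloatVector

# Illustrative only: draw 100 Bingham samples for a 3x3 symmetric matrix A.
powerpack = Bing_cust(1.0, 0.5, 0.1)
A = r["matrix"](FloatVector([2, 0, 0, 0, 1, 0, 0, 0, 0.5]), nrow=3)
samples = powerpack.rbingham(100, A)  # 100 x 3 matrix of unit row vectors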
Example #35
 def __init__(self, Rlibpath, search_terms, sites, language):
     self._RFmodel = None
     self._language = language
     self._search_terms = search_terms
     self._sites = sites
     #random forest model parameters
     self._threshold = 0.3
     self._splitratio = 0.7
     #base R assets
     self._utils = robjects.packages.importr("utils")
     self._utils.chooseCRANmirror(ind=5) #randomly chosen mirror
     self._base = robjects.packages.importr("base")
     #local library path
     self._base._libPaths(Rlibpath)
     #change locale to use utf-8 for r_repr()
     robjects.r['Sys.setlocale']("LC_CTYPE", "C") 
     
     # tm           - Framework for text mining.
     # SnowballC    - Stemming.
     # textcat      - Determining language of text.
     #
     # randomForest - Random forest.
     # caTools      - Splitting data into training and test sets intelligently.
     # stringr      - String manipulation
     needed_packages = ["tm", "SnowballC", "textcat", "randomForest", "caTools",
                        "stringr"]
     #install packages
     to_install = [package for package in needed_packages 
                        if not robjects.packages.isinstalled(package)]
     if len(to_install) > 0:
         self._utils.install_packages(StrVector(to_install))
     #load packages
     self._loaded_packages = [robjects.packages.importr(package) 
                             for package in needed_packages]
     self._loaded_packages = dict(zip(needed_packages,  self._loaded_packages))
     #load R functions
     self._R_functions = STAP(
             self.__R_functions_str, "R_functions")
import random

# based on http://stackoverflow.com/questions/15419740/calling-custom-functions-from-python-using-rpy2
# and http://rpy.sourceforge.net/rpy2/doc-dev/html/robjects_rpackages.html#importing-arbitrary-r-code-as-a-package

from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage as STAP

with open('electricityDoubleCES.R', 'r') as f:
    string = ''.join(f.readlines())

ces = STAP(string, "ces")

p1 = [30] * 24
p2 = [random.gauss(mu=30, sigma=5) for x in range(24)]

Theta = dict( theta = -0.075,
              alpha =  0.2,
              sigma =  50,
              gamma = 0.01
)
loads = list(range(600, 950+1, 50)) + [1000] * 5 + list(range(1100, 800-1, -100)) + [750] + [700] * 3 + [650, 650, 600]


print(ces.double_ces(p1, Theta, loads, 30))  # should return baseline loads; R's double.ces becomes double_ces
print(ces.double_ces(p2, Theta, loads, 30))  # example with random prices

print(ces.cs(1000, p2, Theta, loads, 30) - ces.cs(1000, p1, Theta, loads, 30))  # change in welfare from p1 to p2



Example #37
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage


# import R's "base" package
base = importr('base')

with open("fonctions_apprentissage.r", "r", encoding="utf-8") as apprentissageRopen:
	apprentissage = "".join(apprentissageRopen.readlines())
print(apprentissage)


apprentissage = SignatureTranslatedAnonymousPackage(apprentissage, "apprentissage")


path_sample = "/media/sebastien/Bayer/ScriptsSEB/scripts/GUI/EBimage/AnalyseImagesV2/Samples/5583"

print(dir(apprentissage))

print(apprentissage.apprentissage(path_sample))
Example #38
def main():
    filenames, outdir = parse_arguments() #filenames will be a list
    os.chdir(outdir) #change pwd to output directory
    #print "filenames", filenames

    # will need to get intensity_2D_binned into R data.frame
        # intensity_2D_binned was created in my_cython_functions.pyx as follows:
            # cdef np.ndarray[np.float_t, ndim=2] my2Da
            # my2Da = np.zeros((rt_grid_size, mz_grid_size)) """
    #create fake binned array
    nrow = 5
    ncol = 10
    counter = 0
    binned = np.zeros((nrow, ncol), dtype="float64")
    for row in range(nrow):
        for col in range(ncol):
            binned[row, col] = counter
            counter += 1

    ### Option 1 ###
    #turn each binned array into one row of what will be an R data.frame
        # then add ID to each row and combine these binned arrays
    nele = nrow*ncol
    rbinned1 = np.reshape(binned, nele)
    rbinned1c = np.hstack(( np.array(["ID1001"]), rbinned1 )) #concatenate also works
    rbinned2 = np.reshape(binned, nele)
    rbinned2c = np.hstack(( np.array(["ID1002"]), rbinned2 ))
    stacked = np.vstack((rbinned1c, rbinned2c))
    #print stacked
    #print stacked.shape #2 by 51
    rrdf1 = robj.r['data.frame'](numpy2ri(stacked))
    #print rrdf1

    ### Option 2 ###
    #build empty array that is 5 (# mzml files) by 51 (# rt/mz bins + 1 for patient ID)
        # fill each row with the binned data
    floatD = np.zeros((5,nrow*ncol), dtype="float64")
    strD = np.zeros((5,1), dtype='a6') #a6 is the dtype for a 6 character string
    respD = np.hstack((strD, floatD))
    print(respD.shape)

    for filecount, filename in enumerate(filenames):
        if filecount<2:
            respD = build_row_of_lcms_matrix(
                binned, respD, nrow, ncol, filecount, filename)
    #print respD
    df2 = robj.r['data.frame'](numpy2ri(respD))
    print(df2)

    # now see if we can get R to use this dataframe 
    myRcode = """
    doR <- function(python_respD, lcms_run) {
        source("/srv/scratch/carolyn/Dengue_code/prediction_with_LCMS_from_python.R") 
        run_predictions_wrap(python_respD, lcms_run)
    }
    """
    #Rpack = SignatureTranslatedAnonymousPackage(myRcode, "Rpack")
    #print Rpack._rpy2r.keys() #to reveal the functions within powerpack
    #3print Rpack.doR(df2, 1) #to run the function found in powerpack

    # now see if we can get R to use this dataframe 
    myRcode = """
    square <- function(rdf) {
        myv = rdf$X2 + rdf$X3
        return(myv)
    }
    doit <- function(input) {
        source("/srv/scratch/carolyn/Dengue_code/Rtest_rpy.R") 
        run_test_wrap(input)
    }
    """
    print "wwwwah"
    powerpack = SignatureTranslatedAnonymousPackage(myRcode, "powerpack")
    #print powerpack._rpy2r.keys() #to reveal the functions within powerpack
    #print powerpack.square(df2) #to run the function "square" found in powerpack
    print powerpack.doit(df2)
Example #39
	def run(self):
		print("RUN")
		try:
			warning = ""	# initialise le nombre d'erreur
			val = 0			# initialise le nombre d'erreur
			txtInfo = ""

			# import R's "base" package
			base = importr('base')

			with open("fonctions_apprentissage.r", "r", encoding="utf-8") as apprentissageRopen:
				apprentissage = "".join(apprentissageRopen.readlines())
			print(apprentissage)

			apprentissage = SignatureTranslatedAnonymousPackage(apprentissage, "apprentissage")

			#path_sample = "/media/sebastien/Bayer/ScriptsSEB/scripts/GUI/EBimage/AnalyseImagesV2/Samples/5583"
			path_sample = "/".join(str(self.dicoFoldersCallibration["leaf"]).split("/")[:-1])
			print(path_sample)
			print(dir(apprentissage))
			result , pathRdataFile= apprentissage.apprentissage(path_sample).r_repr().replace('"','').replace("c(","").replace(")","").split(",")
			print(result, pathRdataFile)
			if result == "1":
				reply = QMessageBox.question(self, 'Attention', 'File will be overwritten.\nDo you still want to proceed?', QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
				if reply == QMessageBox.Yes:
					print("OK")
				self.callibrationFileOpenLineEdit.setText(pathRdataFile)
			else:

				print("BAD")


			##gray out the buttons so the job cannot be relaunched
			#self.ui.frameRun.show()
			#self.ui.runPushButton.setDisabled(True)
			#self.ui.loadMatriceFilePushButton.setDisabled(True)
			#self.ui.loadOrderFilePushButton.setDisabled(True)
			#self.ui.PCAlineEdit.setDisabled(True)
			#self.ui.DAlineEdit.setDisabled(True)
			#self.ui.popMinLineEdit.setDisabled(True)
			#self.ui.popMaxLineEdit.setDisabled(True)
			#self.ui.rmOldCheckBox.setDisabled(True)
			#self.ui.expertCheckBox.setDisabled(True)
			#self.ui.expertFrame.setDisabled(True)
			#self.ui.graphTypeComboBox.setDisabled(True)
			#if self.expertMode == "True":
				#self.EBimagefix = str(self.ui.EBimagefixPlainTextEdit.toPlainText().toUtf8())
				#self.EBimagechange = str(self.ui.EBimagechangePlainTextEdit.toPlainText().toUtf8())

			#"""to run programme"""
			## création du pathout
			#if os.path.isdir(self.pathFileOut):
				#if self.rmOld == "True":
					#shutil.rmtree(str(self.pathFileOut))
					#os.mkdir(self.pathFileOut)
				#else:
					#warning += "Warnnig folder "+self.pathFileOut+" already exist,\nPlease remove/rename before run new analysis or use checkbox"
					#raise Exception(warning)
			#else:
				#os.mkdir(self.pathFileOut)

			################################################
			## code shared by graphical and command-line modes
			################################################

			## load the ordering to apply
			#self.orderList = loadInListCol(self.orderPathFile, 0)

			## copy the matrix into a dict
			#self.dicoMatrice = loadInDictLine(self.matricePathFile)

			## Count the number of individuals, markers and ncode:
			#self.nbindParam = len(self.dicoMatrice.keys())-1

			#if self.nbindParam != len(self.orderList):
				#txtInfo += "WARNING: More individu in Matrice file (%s) than Order label file (%s)!!!\n" % (self.nbindParam, len(self.orderList))

			#fileMat = open(self.matricePathFile,"r")
			#header = fileMat.readline()
			#self.nbmarkParam = len(header.split("\t"))
			#header = " \t"+"\t".join(header.split("\t")[1:])


			#nbcode = fileMat.readline().split("\t")[1]
			#while nbcode == "-9":
				#nbcode = fileMat.readline().split("\t")[1]
			#self.ncodeParam = len(nbcode)
			#fileMat.close()

			## open the new sorted file
			#with open(self.pathFileOut+self.basename+"_Reorder.tab","w") as reorderMatriceFile:
				#reorderMatriceFile.write(header)
				#for ind in self.orderList:
					#if ind not in self.dicoMatrice.keys():
						#error = "ERROR: The individu %s define in label file was not in the matrice file !!! Exit programme" % ind
						#raise Exception(error)

					#line = self.dicoMatrice[ind].split("\t")[0]+"\t"+"\t".join(self.dicoMatrice[ind].split("\t")[1:]).replace("999","-9")
					#reorderMatriceFile.write(line)

			#txtInfo += "Nb individus: %i\tNb markers: %i\tncodeParam: %i\tGraph type: %s\n" % (self.nbindParam,int(self.nbmarkParam)-1,self.ncodeParam, self.graphType)
			#if args.cmdMode:
				#pass
			#else:
				#self.ui.runningPlainTextEdit.setPlainText(txtInfo)

			##open the R script
			#Rscript = open(self.pathFileOut+self.basename+"_R_EBimage.R","w")
			#Rscript.write(installPackageR)

			## Add the matrix file path to EBimagefix
			## modify the R script to match the entered parameters
			#dictToReplace = {
			#"**MAKERS**"	:		str(self.nbmarkParam),
			#"**NCODE**"	:			str(self.ncodeParam),
			#"**INDIV**"	:			str(self.nbindParam),
			#"**PATHTOFILE**":		str(reorderMatriceFile.name),
			#"**current_dir**"	:	str(self.pathFileOut),
			#"**GRAPH**"	:	str(self.graphType)
			#}

			#EBimagefixModif = replace_all(dictToReplace, self.EBimagefix)

			##print(EBimagefixModif)

			#Rscript.write(EBimagefixModif)

			#for pop in range(int(self.popMinValue),int(self.popMaxValue)+1):
				##print(pop)
				#popstr=str(pop)
				#EBimagechange2 = self.EBimagechange.replace("**pop**",popstr).replace("**current_dir**",str(self.pathFileOut)).replace("**PCARETAIN**",str(self.PCAvalue)).replace("**DARETAIN**",str(self.DAvalue))
				##print(EBimagechange2)
				#Rscript.write(EBimagechange2)

			#Rscript.close()
			#self.ui.statusbar.showMessage(str("FINISH, script product on : %s" % self.pathFileOut),9600)

			#txtInfo += "FINISH, script product on :\n %s" % (self.pathFileOut)

			#if args.cmdMode:
				#print(txtInfo)
			#else:
				#self.ui.runningPlainTextEdit.setPlainText(txtInfo)

			## if errors occurred:

		except Exception as e:
			self.displayError(error = e)
Example #40
class JobAdClassification:
    """Classification of job ads using R and rpy2.

    This class trains a machine learning model used to recommend new job
    ads, and determines the language of job ads.

    Arguments
    ---------
    Rlibpath : str 
        Path to local R libraries.
    search_terms : list[str]
        All search terms used in job ad collections. Needed to include all factor
        levels in the machine learning model.
    sites : list[str]
        All job sites used in job ad collections. Needed to include all factor
        levels in the model.
    language : str
        Language of job ads / machine learning model. Currently only Finnish and
        English supported.
    """

    #Functions which were easier to implement in pure R than using rpy2.
    __R_functions_str = """ 
        cleanJobAds <- function(class_data, search_terms, sites) {
            # Cleans and transforms job ads.
            # More precisely:
            # - Joins title and description columns
            # - Removes rows with empty column(s), removes extra whitespaces
            # - Removes duplicates 
            # - Adds all factor levels to search terms and sites
            #
            # Returns a data frame with the site, searchterm, description and relevant columns.
            #
            # Arguments:
            # class_data   - dataframe with columns for site, title, description,
            #                searchterm and relevant.
            # search_terms - Character vector of all search terms used, needed for 
            #                factor levels.
            # sites        - Character vector of all job ad sites, needed for 
            #                factor levels.
            class_data$description <- paste(class_data$title, class_data$description)
            class_data$title <- NULL
            #get rid of rows with empty column(s)
            for (col in colnames(class_data)) {
                class_data <- class_data[!(class_data[col] == ""),]
                class_data <- class_data[!is.na(class_data[col]),]
            }
            class_data <- unique(class_data)
            #get rid of extra whitespace in description
            class_data$description <- str_trim(class_data$description)
            class_data$description <- gsub("\\\\s+", " ", class_data$description)
            #assign proper factor levels
            class_data$site <- as.factor(class_data$site)
            levels(class_data$site) <- 
                 c(levels(class_data$site), sites[!(sites %in% levels(class_data$site))])
            class_data$searchterm <- as.factor(class_data$searchterm)
            levels(class_data$searchterm) <- 
                 c(levels(class_data$searchterm), search_terms[!(search_terms %in% levels(class_data$searchterm))])

            return(class_data)
        }      
        createJoinDTM <- function(class_data, lang) {
            # Transforms and parses the description column for words.
            # Words are cleaned and stemmed, and finally added as columns 
            # to the dataframe.
            #
            # Returns dataframe with columns for all parsed words in the description
            # column. The description column itself is removed.
            #
            # Arguments:
            # class_data - Dataframe of cleaned job ads using R_function cleanJobAds.
            # lang       - Language of job ads, needed for stemming and removing 
            #              stopwords.
            
            #create corpus from descriptions
            corpus <- Corpus(VectorSource(class_data$description))
            #cleaning and stemming operations applied to corpus
            corpus <- tm_map(corpus, tolower)
            corpus <- tm_map(corpus, PlainTextDocument) #need to convert after tolower
            corpus <- tm_map(corpus, removePunctuation)
            corpus <- tm_map(corpus, removeWords, stopwords(lang))
            corpus <- tm_map(corpus, stemDocument, lang)
            #create document term matrix and remove sparse terms
            dtm <- DocumentTermMatrix(corpus)
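            #keep only terms with sparsity below 0.98, i.e. words that
            #appear in at least ~2% of the ads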
            dtm <- removeSparseTerms(dtm, 0.98)
            dtm <- as.data.frame(as.matrix(dtm))
            class_data <- cbind(class_data, dtm)
            class_data$description <- NULL
            colnames(class_data) <- make.names(colnames(class_data))
            return (class_data)
        }
        RFmodel <- function(train_data, cutoff) {
            # Trains random forest binary classification model using the provided
            # cutoffs.
            #
            # Returns model.
            #
            # Arguments:
            # train_data - Dataframe containing parsed words from job ads. 
            # cutoff     - Threshold for determining whether relevant or not.
            train_data$relevant <- as.factor(train_data$relevant)
            RFmodel <- randomForest(relevant ~ ., data=train_data, cutoff = cutoff)
            return(RFmodel)}
        RFpred <- function(RFmodel, test_data) {
             # Classifies job ads as relevant or not using provided model.
             # 
             # Returns factor of classifications.
             #
             # Arguments: 
             # RFmodel   - Model to use.
             # test_data - Dataframe containing parsed words from job ads as columns. 
             #
            return(predict(RFmodel, newdata=test_data))}
        splitTerms <- function(terms_data, bool) {
            #Helper for splitting data into training and testing sets;
            #bool is the string "TRUE" or "FALSE", compared against the
            #logical split vector via R's coercion rules.
            return (terms_data == bool)
            }
        model_eval <- function(predictions, actual, thold, printb=0) {
              # Calculates and optionally prints characteristics of model.
              # Returns the following in a numeric vector:
              # accuracy, sensitivity, error (sum of squared differences),
              # true positives, true negatives,
              # false positives, false negatives,
              # fscore
              #
              predictions <- as.numeric(predictions)
              actual <- as.numeric(actual)
              preds <- predictions
              if (thold != -1) {
                preds <- predictions >= thold
              }
              #factor levels come out as 2,1 instead of 0,1 (note: this
              #correction fails if actual levels are 0,1 but no 0 was predicted)
              if (max(predictions) == 2 || min(predictions) == 1) {
                    preds <- predictions - 1
              }
              
              TP <- sum(actual + preds == 2)
              TN <- sum(actual + preds == 0)
              FP <- sum(actual - preds == -1)
              FN <- sum(actual - preds == 1)

              #accuracy
              acc <- (TP + TN) / (TP + TN + FP + FN)
              #sensitivity
              sens <- (TP / (TP + FN))
              #fscore
              fscore <- 2*TP/(2*TP+FP+FN)
  
              #error measure: sum of squared differences
              err <- sum((actual - preds)^2)
              if (printb == 1) {
                cat("Model characteristics:", "\n")
                cat("Accuracy", acc, "\n")
                cat("Sensitivity", sens, "\n")
                cat("Fscore", fscore, "\n")
                cat("Error (SSE)", err, "\n")
              }
              return(c(acc, sens, err, TP, TN, FP, FN, fscore))
            }
        prepNewAds <- function(RFmodel, new_ads) {
              #Prepares new ads for classification by the model:
              #keeps only the words used by the model and adds zero-filled
              #columns for model words missing from the new ads.
  
              model_columns <- as.character(attr(RFmodel$terms, "variables"))
              new_ads <- new_ads[(names(new_ads) %in% model_columns)]
              for (col in model_columns[!(model_columns %in% names(new_ads))]) {
                new_ads[col] <- rep(0, nrow(new_ads))
              }
              return(new_ads)
            }
        saveFile <- function(object, filename) {
            save(object, file=filename)
        }
        """

    #columns needed for training model
    _train_columns = ["site", "searchterm", "title", "description", 
                      "relevant"]
    #columns needed for classifying new job ads
    _class_columns = ["id", "site", "searchterm", "title", "description"]

    def __init__(self, Rlibpath, search_terms, sites, language):
        self._RFmodel = None
        self._language = language
        self._search_terms = search_terms
        self._sites = sites
        #random forest model parameters
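        #threshold maps onto randomForest's two-class cutoff below: an ad is
        #classified as relevant when it gets more than this share of tree votes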
        self._threshold = 0.3
        self._splitratio = 0.7
        #base R assets
        self._utils = robjects.packages.importr("utils")
        self._utils.chooseCRANmirror(ind=5) #arbitrarily chosen mirror
        self._base = robjects.packages.importr("base")
        #local library path
        self._base._libPaths(Rlibpath)
        #change locale to use utf-8 for r_repr()
        robjects.r['Sys.setlocale']("LC_CTYPE", "C") 
        
        # tm           - Framework for text mining.
        # SnowballC    - Stemming.
        # textcat      - Determining language of text.
        #
        # randomForest - Random forest.
        # caTools      - Splitting data into training and test sets intelligently.
        # stringr      - String manipulation
        needed_packages = ["tm", "SnowballC", "textcat", "randomForest", "caTools",
                           "stringr"]
        #install packages
        to_install = [package for package in needed_packages 
                           if not robjects.packages.isinstalled(package)]
        if len(to_install) > 0:
            self._utils.install_packages(StrVector(to_install))
        #load packages
        self._loaded_packages = [robjects.packages.importr(package) 
                                for package in needed_packages]
        self._loaded_packages = dict(zip(needed_packages,  self._loaded_packages))
        #load R functions
        self._R_functions = STAP(
                self.__R_functions_str, "R_functions")

    def _remove_diacritics(self, string):
        """Removes all Swedish (Finnish) diacritics from a string.

        Arguments
        ----------
        string : str
            String to remove diacritics from.
        Returns
        ----------
        clean_string : str
            String without diacritics.
        """
        if isinstance(string, str):
            diacr = ["Ä", "ä", "Ö", "ö", "Å", "å"]
            replc = ["A", "a", "O", "o", "A", "a"]
            for old, new in zip(diacr, replc):
                string = string.replace(old, new)

        return string
    
    def _create_R_dataframe(self, job_ads, include_columns):
        """Converts job ads to R dataframe.

        Arguments
        ----------
        job_ads : list[:class:`JobAd`]
            List of :class:`JobAd` instances.
        include_columns : list[str]
            Defines which columns are included in the dataframe. 

        Returns
        ----------
        dataf : :class:`robjects.DataFrame`
            :class:`robjects.DataFrame` representing job ads.
        """
        
        if len(job_ads) == 0:
            raise ValueError("No job ads to convert to R dataframe.")

        #modify structure to type {column: [rows]}
        job_ads_dataf = {}
        for column in include_columns:
            job_ads_dataf[column] = [self._remove_diacritics(ad[column]) 
                                       for ad in job_ads]
            if (column == "relevant"):
                job_ads_dataf[column] = IntVector(job_ads_dataf[column])
            else:
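                #I() marks the vector as-is so R does not convert strings to factors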
                job_ads_dataf[column] = self._base.I(StrVector(job_ads_dataf[column]))
             
        return robjects.DataFrame(job_ads_dataf)

    def train_model(self, class_ads):
        """Trains a random forest model for classification of job ad relevance.

        Model is stored in the :class:`JobAdClassification` instance.

        Arguments
        ----------
        class_ads : list[:class:`JobAd`]
            List of :class:`JobAd` instances used to train model. Each instance
            should have site, searchterm, title, description and relevant defined.
        """
        ##parameters for training
        #typical value
        splitratio = self._splitratio
        #gave best F-score during parameter sweeping
        threshold = self._threshold

        #convert to dataframe and clean ads
        dataf = self._create_R_dataframe(class_ads, self._train_columns)
        dataf = self._R_functions.cleanJobAds(dataf, StrVector(self._search_terms),
                                              StrVector(self._sites))
        dataf = self._R_functions.createJoinDTM(dataf, self._language.lower())

        #create training and testing data sets
        if (splitratio != 1.0):
            split = robjects.r['sample.split'](dataf.rx2('relevant'), splitratio)
            train = robjects.r['subset'](dataf, self._R_functions.splitTerms(split, 'TRUE'))
            test = robjects.r['subset'](dataf, self._R_functions.splitTerms(split, 'FALSE'))
        else:
            train = dataf
        #train model
        self._RFmodel = self._R_functions.RFmodel(train, FloatVector([1-threshold, threshold])) 
        #evaluate on the testing set; model_eval prints the model characteristics
        if (splitratio != 1.0):
            pred = self._R_functions.RFpred(self._RFmodel, test)
            conf_matrix = self._R_functions.model_eval(pred, test.rx2('relevant'), -1, 1)

    def save_model(self, filename):
        """Saves :class:`JobAdClassification` instance model to file for later use.

        Arguments
        ----------
        filename : str
            Name of file to save model in.
        """

        self._R_functions.saveFile(self._RFmodel, filename)
        
    def load_model(self, filename):
        """Loads random forest classification model from file.

        Model is stored in :class:`JobAdClassification` instance.

        Arguments
        ----------
        filename : str
            Name of file to load model from.
        """

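        #saveFile() stored the model under the R variable name "object";
        #load() returns that name and get() fetches the model itself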
        self._RFmodel = robjects.r['get'](robjects.r['load'](filename))


    def recommend_ads(self, job_ads):
        """Provides recommendations for ads using instance model.

        Arguments
        ----------
        job_ads : list[:class:`JobAd`]
            Each instance should have id, site, searchterm, title 
            and description defined.

        Returns
        ----------
        results : list[:class:`JobAd`]
            Each instance has id and recommendation defined.
        """
        #convert to dataframe and clean ads
        dataf = self._create_R_dataframe(job_ads, self._class_columns)
        ids = dataf.rx2('id') 
        dataf = self._R_functions.cleanJobAds(dataf, StrVector(self._search_terms), 
                                              StrVector(self._sites))
        dataf = self._R_functions.createJoinDTM(dataf, self._language.lower())
        dataf = self._R_functions.prepNewAds(self._RFmodel, dataf)

        #classify ads
        pred = self._R_functions.RFpred(self._RFmodel, dataf)

        #combine predictions with ids in a list of JobAd instances
        results = [JobAd.create({"id": ids[i], "recommendation": int(pred[i]) - 1})
                   for i in range(len(ids))]
                           
        return results
        

    def _determine_lang(self, title, description):
        """Tries to determine which language a job ad is using the textcat package. 
        Only differentiates between Finnish and English; returns English if another
        language is recognized.

        Arguments
        ----------
        title : str
            Title of job ad.
        description : str
            Description of job ad.
        Returns
        ----------
        language : str
            Determined language of job ad.
        """
        language_both = self._loaded_packages["textcat"].textcat(
            " ".join([title, description])).r_repr().replace("\"", "")
        language_title = self._loaded_packages["textcat"].textcat(title).r_repr().replace("\"", "")
        language_descrip = self._loaded_packages["textcat"].textcat(
            description).r_repr().replace("\"", "")

        #English job titles combined with Finnish text are sometimes mistaken
        #for Danish, Frisian or Middle Frisian
        false_finnish = ["danish", "frisian", "middle_frisian"]

        if (language_both == "english" or language_both == "finnish"):
            return language_both[0].upper() + language_both[1:]
        elif (language_title == "english" or language_title == "finnish"):
            return language_title[0].upper() + language_title[1:]
        elif (language_descrip == "english" or language_descrip == "finnish"):
            return language_descrip[0].upper() + language_descrip[1:]
        elif (language_both in false_finnish or language_title in false_finnish or
                language_descrip in false_finnish):
            return "Finnish"
        else:
            return "English"

    def det_lang_ads(self, job_ads):
        """Attempts to determine language of job ads.

        Arguments
        ----------
        job_ads : list[:class:`JobAd`]
            List of :class:`JobAd` instances. Each instance should have 
            id, title and description defined.

        Returns
        ----------
        results : list[:class:`JobAd`]
            List of :class:`JobAd` instances. Each instance has id and
            language defined.
        """

        results = [JobAd.create({"id": ad["id"],
                                 "language": self._determine_lang(ad["title"],
                                                                  ad["description"])})
                   for ad in job_ads]

        return results
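
#Usage sketch (illustrative only): the library path, search terms, sites and
#the `labelled_ads` / `new_ads` lists below are hypothetical placeholders,
#not part of the original example.
if __name__ == "__main__":
    classifier = JobAdClassification(
        Rlibpath="~/R/library",
        search_terms=["python", "java", "analyst"],
        sites=["monster", "indeed"],
        language="English")
    #labelled_ads: JobAd instances with site, searchterm, title, description
    #and relevant defined
    #classifier.train_model(labelled_ads)
    #classifier.save_model("rf_model.RData")
    #new_ads: JobAd instances with id, site, searchterm, title and description
    #recommendations = classifier.recommend_ads(new_ads)
    #languages = classifier.det_lang_ads(new_ads)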