Ejemplo n.º 1
0
def convert_rdata_to_dataframe ( filename ) :
    """Load an RData file and return its first object as pandas DataFrames.

    The file is loaded twice on purpose: once with rpy2's default
    conversion, so the R objects still expose ``colnames``/``rownames``,
    and once with the pandas converter active, so the values arrive as
    numpy/pandas data.

    Only the first object named by ``R.load`` is used. If it is an R matrix
    it yields a single frame; otherwise it is treated as a collection and
    each element yields one frame.

    Parameters
    ----------
    filename : path of the RData file to read.

    Returns
    -------
    dict mapping a running integer (0, 1, ...) to a ``pandas.DataFrame``.

    There is currently no value/error checking of the file contents.
    """
    import pandas as pd
    from rpy2.robjects import r as R
    from rpy2.robjects import pandas2ri
    import rpy2.robjects as ro
    #
    print ( 'WARNING THIS PROGRAM NEED VALUE ERROR CHECKING' )
    #
    # FIRST PASS (NO PANDAS CONVERSION): COLLECT COLUMN AND ROW LABELS
    #
    rd_ = R.load( filename )
    r_object = R[rd_[0]]
    if 'matrix' in str( type( r_object ) ).lower() :
        column_names = [ r_object.colnames ]
        index_names  = [ r_object.rownames ]
    else :
        column_names = [ list( _rd_.colnames ) for _rd_ in r_object ]
        index_names  = [ list( _rd_.rownames ) for _rd_ in r_object ]
    #
    # SECOND PASS (PANDAS CONVERSION ACTIVE): COLLECT THE VALUES.
    # THE CONVERSION IS A GLOBAL SWITCH, SO IT IS ALWAYS TURNED OFF AGAIN,
    # EVEN WHEN BUILDING THE FRAMES FAILS.
    #
    pandas2ri.activate()
    try :
        rd = R.load( filename )
        converted = R[rd[0]]
        if 'ndarray' in str( type( converted ) ).lower() :
            raw_df_l = [ converted ]
        else :
            raw_df_l = list( ro.vectors.DataFrame( converted ) )
        full_df_dict = {}
        labelled = zip( raw_df_l , column_names , index_names )
        for i_ , ( raw_df , colnames , rownames ) in enumerate( labelled ) :
            full_df_dict[i_] = pd.DataFrame( raw_df , columns=colnames , index=rownames )
    finally :
        pandas2ri.deactivate()
    return ( full_df_dict )
Ejemplo n.º 2
0
def tune_model(scenario, params, log):
    """Run irace and export its final elite configurations as a CSV file.

    The command line comes from ``create_command(scenario, params, log)``.
    After the process finishes, ``log`` is loaded into R, the result of
    ``getFinalElites(iraceResults, n=0)`` is written to ``<log stem>.csv``
    next to ``log``, and that CSV is re-read with pandas, stripped of the
    ``.ID.`` and ``.PARENT.`` columns and written back in place.

    Parameters:
        scenario, params: forwarded to ``create_command``.
        log: path object of the irace log file (``as_posix``, ``parent``
            and ``stem`` are used).

    Returns:
        None.
    """
    command_args = create_command(scenario, params, log).split()

    # Run irace and wait for it; the captured stdout is not used.
    irace_process = subprocess.Popen(command_args, stdout=subprocess.PIPE)
    irace_process.communicate()

    # Load the irace output and transform it to a csv file
    pandas2ri.activate()
    r.load(log.as_posix())
    r.library('irace')
    elites = r.getFinalElites(r.iraceResults, n=0)

    with localconverter(default_converter + pandas2ri.converter):
        r_pd_params = elites
    out_csv = Path(log.parent, log.stem + '.csv')
    r_pd_params.to_csvfile(out_csv.as_posix(), sep=',')

    # Re-import with pandas for pythonic cleaning
    py_pd_params = pd.read_csv(out_csv).drop(columns=['.ID.', '.PARENT.'])

    # Rebuild every column as a plain list so the rows get the correct order
    correct_table = {
        column: [py_pd_params[column][key]
                 for key in py_pd_params[column].keys()]
        for column in py_pd_params.columns
    }
    pd.DataFrame(data=correct_table).to_csv(out_csv)
Ejemplo n.º 3
0
def some_rpy2():
	"""Draw a lattice histogram from the saved R data and render hist.html.

	Loads ``mtu_inf_111813.RData``, builds an R data.frame from the first
	element of ``predictor.mats`` and plots a histogram of its first column
	into ``hist.png``.

	Returns:
		The result of ``render_template("hist.html", ...)``.
	"""
	flash('Loading data...please wait')
	r.load('mtu_inf_111813.RData')
	pm = r['predictor.mats']
	df = r['data.frame'](pm.rx(1))

	# histogram() is a lattice function, so the package must be loaded.
	importr('lattice')
	rprint = robjects.globalenv.get("print")
	grdevices = importr('grDevices')

	# NOTE(review): the image is written to the working directory while the
	# template is given 'static/tmp/hist.png' -- confirm the two agree.
	filenm = 'hist.png' # why is this in tmp still???

	grdevices.png(file=filenm, width=512, height=512)
	try:
		p = r.histogram(df.rx2(1))
		# lattice plots are only drawn when printed
		rprint(p)
	finally:
		# Always close the device so a failed plot does not leave it open.
		grdevices.dev_off()

	return render_template("hist.html", image='static/tmp/hist.png')
Ejemplo n.º 4
0
def get_gene_model(gene, ref_panel, weights_dir='fusion_twas/WEIGHTS'):
    """Return the non-zero SNP weights of the best model for ``gene``.

    Loads ``{weights_dir}/{ref_panel}/{gene}_500kb.wgt.RDat`` into R and
    picks the column of ``wgt.matrix`` whose ``cv.performance`` (first row)
    is highest. If that column is index 2 (the top1 model, per the original
    author's note), the second-best column is used instead.

    Args:
        gene: gene identifier used in the weight file name.
        ref_panel: sub-directory of ``weights_dir`` holding the weights.
        weights_dir: root directory of the weight files.

    Returns:
        A ``pandas.Series`` of weights indexed by the ``V2`` column of the
        R ``snps`` object, without zero or NaN entries; or ``None`` (after
        printing a warning) when the weight file is missing or the chosen
        model's weights are all NaN or all zero.

    Raises:
        AssertionError: if the panel directory does not exist, or if the
            weight and SNP counts differ.
    """
    weight_file = f'{weights_dir}/{ref_panel}/{gene}_500kb.wgt.RDat'
    assert os.path.exists(os.path.dirname(weight_file))
    if not os.path.exists(weight_file):
        print(f'WARNING: weight file {weight_file} missing for gene {gene}!  '
              f'Removing from the analysis.')
        return None
    r.load(weight_file)
    performance = r['cv.performance'][0]
    sorted_order = np.argsort(performance)
    best_model_index = sorted_order[-1]  # model with highest performance
    if best_model_index == 2:
        best_model_index = sorted_order[-2]  # if top1 is best, take 2nd-best
    model_weights = r['wgt.matrix'][:, best_model_index]
    if np.isnan(model_weights).all():
        print(f'WARNING: Best model for gene {gene} has all-nan weights!  '
              f'Removing from the analysis.')
        return None
    if (model_weights == 0).all():
        print(f'WARNING: Best model for gene {gene} has all-0 weights!  '
              f'Removing from the analysis.')
        return None
    rs_numbers = r.snps['V2'].values
    assert len(model_weights) == len(rs_numbers)
    model_weights = pd.Series(data=model_weights, index=rs_numbers)
    # Remove SNPs with 0 weight or nan weight. NaN != 0 evaluates to True,
    # so NaN entries have to be excluded explicitly.
    keep = (model_weights != 0) & model_weights.notna()
    return model_weights[keep]
Ejemplo n.º 5
0
def plotIDR(output_file, input_prefixes):
    '''create IDR plots.

    This code is taken from the R script

    batch-consistency-plot.r

    within the IDR package.

    For every prefix the files ``<prefix>-uri.sav`` and ``<prefix>-em.sav``
    are loaded into R; the collected results are then plotted into the PDF
    ``output_file``.

    :param output_file: name of the PDF file to write.
    :param input_prefixes: sequence of filename prefixes, one per pair.
    '''

    dirname = os.path.dirname(__file__)
    R.source(os.path.join(dirname, "WrapperIDR.r"))

    R('''df.txt = 10''')

    R('''uri.list <- list()
         uri.list.match <- list()
         ez.list <- list()
         legend.txt <- c()
         em.output.list <- list()
         uri.output.list <- list()''')

    npair = len(input_prefixes)
    # R lists are 1-based, hence the start value.
    for i, input_prefix in enumerate(input_prefixes, start=1):

        R.load(input_prefix + "-uri.sav")
        R.load(input_prefix + "-em.sav")
        slot = {"i": i}

        R('''uri.output.list[[%(i)i]] <- uri.output;
              em.output.list[[%(i)i]] <- em.output;
              # reverse =T for error rate;''' % slot)
        R('''
              ez.list[[%(i)i]] <- get.ez.tt.all(em.output, uri.output.list[[%(i)i]]$data12.enrich$merge1,
                                        uri.output.list[[%(i)i]]$data12.enrich$merge2);'''
          % slot)
        R('''
              # URI for all peaks
              uri.list[[%(i)i]] <- uri.output$uri.n;

              # URI for matched peaks
              uri.match <- get.uri.matched(em.output$data.pruned, df=df.txt);
              uri.list.match[[%(i)i]] <- uri.match$uri.n;
         ''' % slot)

        legend = "%i = %s" % (i, input_prefix)
        R('''
              legend.txt[%(i)i] <- '%(legend)s';
        ''' % {"i": i, "legend": legend})

    R.pdf(output_file)
    try:
        R('''par(mfcol=c(2,3), mar=c(5,6,4,2)+0.1)''')
        R('''plot.uri.group(uri.list, NULL, file.name=NULL, c(1:%(npair)i), title.txt="all peaks");
         plot.uri.group(uri.list.match, NULL, file.name=NULL, c(1:%(npair)i), title.txt="matched peaks");
         plot.ez.group(ez.list, plot.dir=NULL, file.name=NULL, legend.txt=c(1:%(npair)i), y.lim=c(0, 0.6));
         plot(0, 1, type="n", xlim=c(0,1), ylim=c(0,1), xlab="", ylab="", xaxt="n", yaxt="n"); 
         legend(0, 1, legend.txt, cex=0.6);''' % {"npair": npair})
    finally:
        # Close the PDF device even if one of the plot calls fails.
        R["dev.off"]()
Ejemplo n.º 6
0
def plotIDR(output_file, input_prefixes):
    '''create IDR plots.

    This code is taken from the R script

    batch-consistency-plot.r

    within the IDR package.

    For every prefix the files ``<prefix>-uri.sav`` and ``<prefix>-em.sav``
    are loaded into R; the collected results are then plotted into the PDF
    ``output_file``.

    :param output_file: name of the PDF file to write.
    :param input_prefixes: sequence of filename prefixes, one per pair.
    '''

    dirname = os.path.dirname(__file__)
    R.source(os.path.join(dirname, "WrapperIDR.r"))

    R('''df.txt = 10''')

    R('''uri.list <- list()
         uri.list.match <- list()
         ez.list <- list()
         legend.txt <- c()
         em.output.list <- list()
         uri.output.list <- list()''')

    npair = len(input_prefixes)
    # R lists are 1-based, hence the start value.
    for i, input_prefix in enumerate(input_prefixes, start=1):

        R.load(input_prefix + "-uri.sav")
        R.load(input_prefix + "-em.sav")
        slot = {"i": i}

        R('''uri.output.list[[%(i)i]] <- uri.output;
              em.output.list[[%(i)i]] <- em.output;
              # reverse =T for error rate;''' % slot)
        R('''
              ez.list[[%(i)i]] <- get.ez.tt.all(em.output, uri.output.list[[%(i)i]]$data12.enrich$merge1,
                                        uri.output.list[[%(i)i]]$data12.enrich$merge2);''' % slot)
        R('''
              # URI for all peaks
              uri.list[[%(i)i]] <- uri.output$uri.n;

              # URI for matched peaks
              uri.match <- get.uri.matched(em.output$data.pruned, df=df.txt);
              uri.list.match[[%(i)i]] <- uri.match$uri.n;
         ''' % slot)

        legend = "%i = %s" % (i, input_prefix)
        R('''
              legend.txt[%(i)i] <- '%(legend)s';
        ''' % {"i": i, "legend": legend})

    R.pdf(output_file)
    try:
        R('''par(mfcol=c(2,3), mar=c(5,6,4,2)+0.1)''')
        R('''plot.uri.group(uri.list, NULL, file.name=NULL, c(1:%(npair)i), title.txt="all peaks");
         plot.uri.group(uri.list.match, NULL, file.name=NULL, c(1:%(npair)i), title.txt="matched peaks");
         plot.ez.group(ez.list, plot.dir=NULL, file.name=NULL, legend.txt=c(1:%(npair)i), y.lim=c(0, 0.6));
         plot(0, 1, type="n", xlim=c(0,1), ylim=c(0,1), xlab="", ylab="", xaxt="n", yaxt="n"); 
         legend(0, 1, legend.txt, cex=0.6);''' % {"npair": npair})
    finally:
        # Close the PDF device even if one of the plot calls fails.
        R["dev.off"]()
Ejemplo n.º 7
0
def pandas_load(name):
    '''
    loads .rdata file (R dataframe file) and returns it as Pandas dataframe.

    The first object created by loading the file is converted; any other
    objects already present in the R environment are ignored.

    :param name: .rdata filename (eg: 'subset.Rdata')
    :return: pandas dataframe object
    '''
    pandas2ri.activate()
    # R's load() returns the names of the objects it created. Using them
    # (rather than ls(), which lists everything in the environment in
    # alphabetical order) guarantees the object really came from this file.
    loaded_names = r.load(name)
    df = pandas2ri.ri2py(r[loaded_names[0]])
    return df
Ejemplo n.º 8
0
def get_rdata(url):
    """Download an RData file and return its data sets as lists of row dicts.

    The download is stored locally under ``"rdata"`` plus the URL with the
    known host prefixes stripped, loaded into R, and every loaded object is
    converted to a DataFrame (wrapping it when the conversion yields
    something else).

    :param url: address of the RData file.
    :return: dict mapping each loaded R object name to a list with one dict
        per row, each mapping column label to value.
    """
    # For testing, probably want to do this a different way in production TODO
    local_path = "rdata" + url.replace("http://data.war-on-ice.net", "").replace("http://war-on-ice.com", "")
    response = urllib2.urlopen(url)
    try:
        payload = response.read()
    finally:
        response.close()
    # RData is binary, so it must be written in binary mode; text mode can
    # corrupt it through newline translation.
    with open(local_path, "wb") as fp:
        fp.write(payload)
    robj = r.load(local_path)
    rdata = {}
    for sets in robj:
        myRData = pandas2ri.ri2py(r[sets])
        # convert to DataFrame
        if not isinstance(myRData, pd.DataFrame):
            myRData = pd.DataFrame(myRData)
        rows = []
        for element in myRData:
            for counter, value in enumerate(myRData[element]):
                # grow the row list lazily while walking the first column
                if counter >= len(rows):
                    rows.append({})
                rows[counter][element] = value
        rdata[sets] = rows
    return rdata
Ejemplo n.º 9
0
def get_rdata(url):
    """Download an RData file and return its data sets as lists of row dicts.

    The download is stored locally under ``"rdata"`` plus the URL with the
    known host prefixes stripped, loaded into R, and every loaded object is
    converted to a DataFrame (wrapping it when the conversion yields
    something else).

    :param url: address of the RData file.
    :return: dict mapping each loaded R object name to a list with one dict
        per row, each mapping column label to value.
    """
    # For testing, probably want to do this a different way in production TODO
    local_path = "rdata" + url.replace(
        "http://data.war-on-ice.net", "").replace("http://war-on-ice.com", "")
    response = urllib2.urlopen(url)
    try:
        payload = response.read()
    finally:
        response.close()
    # RData is binary, so it must be written in binary mode; text mode can
    # corrupt it through newline translation.
    with open(local_path, "wb") as fp:
        fp.write(payload)
    robj = r.load(local_path)
    rdata = {}
    for sets in robj:
        myRData = pandas2ri.ri2py(r[sets])
        # convert to DataFrame
        if not isinstance(myRData, pd.DataFrame):
            myRData = pd.DataFrame(myRData)
        rows = []
        for element in myRData:
            for counter, value in enumerate(myRData[element]):
                # grow the row list lazily while walking the first column
                if counter >= len(rows):
                    rows.append({})
                rows[counter][element] = value
        rdata[sets] = rows
    return rdata
Ejemplo n.º 10
0
    def __init__(self, path):
        """Remember ``path`` and load the file it points to into R.

        Sets ``self.path`` and ``self.contents`` (the value returned by
        ``r.load``).

        Raises:
            FileNotFoundError: if ``path`` is not an existing file.
        """
        if os.path.isfile(path):
            self.path = path
            self.contents = r.load(path)
        else:
            template = "PEXO output not found in the specified path: {}"
            raise FileNotFoundError(template.format(path))
Ejemplo n.º 11
0
def _download_and_import_RData_file(url):
    """Fetch the RData file at ``url`` and return its first object via pandas2ri.

    Only the first name reported by ``r.load`` is converted.
    """
    # urlretrieve returns (local filename, headers); only the path is needed.
    local_path = urlretrieve(url)[0]

    # r.load reports the names of the variables it created in R.
    loaded_names = r.load(local_path)

    return pandas2ri.ri2py(r[loaded_names[0]])
Ejemplo n.º 12
0
def prelude1_Rdata(connection):
    """Copy the data in ``flights.Rdata`` into SQLite, one table per chunk.

    Loads ``flights.Rdata`` into R and, for every chunk index ``i``, writes
    the frame returned by ``pandas_load_by_parts(r, i)`` to the table
    ``Delays_2nd_<i>``.

    :param connection: open ``sqlite3`` connection; its ``row_factory`` is
        set to ``sqlite3.Row``.
    """
    total_rows = 5819079
    chunk_size = 10000

    # loading the datasets:
    connection.row_factory = sqlite3.Row
    cur = connection.cursor()
    r.load('flights.Rdata')  # name = 'subset.Rdata'
    for i in range(total_rows // chunk_size + 1):
        # The table name is bound as a parameter; the previous hand-built
        # string was missing its closing quote and failed to parse.
        # NOTE(review): the query result is never read, and it looks up
        # 'Delays<i>' while the table written below is 'Delays_2nd_<i>' --
        # confirm whether an "already imported" check was intended.
        cur.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            ('Delays{}'.format(i),))

        delays = pandas_load_by_parts(r, i)
        # writing to sql:
        delays.to_sql('Delays_2nd_{}'.format(i), connection)
Ejemplo n.º 13
0
def _read_ExpressionSet_RData(RData):
    """Load an ExpressionSet from an RData file as rpy2 objects.

    RData: Path to the input RData file.
           ExpressionSet must be the only object in the RData.

    Return a 4-tuple: the eSet object itself, followed by the results of
    R's assayData(), fData() and pData() applied to it.
    """
    importr('Biobase')
    loaded_names = r.load(RData)
    # Assumed to be an rpy2 ExpressionSet object.
    eSet = r.get(loaded_names)
    return (
        eSet,
        r.assayData(eSet),  # rpy2 environment object
        r.fData(eSet),  # rpy2 DataFrame object
        r.pData(eSet),  # rpy2 DataFrame object
    )
Ejemplo n.º 14
0
def get_rdata(url):
    """Load an RData file straight from ``url`` and return it as row dicts.

    Every object loaded from the R connection is converted to a DataFrame
    (wrapping it when the conversion yields something else) and flattened
    into a list with one dict per row.

    :param url: address of the RData file, opened through R's ``url()``.
    :return: dict mapping each loaded R object name to a list of
        ``{column label: value}`` dicts, or ``None`` if anything fails.
    """
    try:
        try:
            robj = r.load(r.url(url))
            rdata = {}
            for sets in robj:
                myRData = pandas2ri.ri2py(r[sets])
                # convert to DataFrame
                if not isinstance(myRData, pd.DataFrame):
                    myRData = pd.DataFrame(myRData)
                rows = []
                for element in myRData:
                    for counter, value in enumerate(myRData[element]):
                        # grow the row list lazily on the first column
                        if counter >= len(rows):
                            rows.append({})
                        rows[counter][element] = value
                rdata[sets] = rows
        finally:
            # Release the R connection on failure as well as on success.
            r.closeAllConnections()
        return rdata
    except Exception:
        # Best effort: callers get None on failure. KeyboardInterrupt and
        # SystemExit are deliberately no longer swallowed.
        return None
Ejemplo n.º 15
0
 def fromRData(sce_class, rdata):
     """Build an ``sce_class`` instance from the first object in an RData file.

     The R object is passed to ``sce_class.fromRS4`` and also stored on the
     result as ``rs4``.
     """
     loaded_names = r.load(rdata)
     first_name = loaded_names[0]
     rs4_object = robjects.r[first_name]
     sce = sce_class.fromRS4(rs4_object)
     sce.rs4 = rs4_object
     return sce
Ejemplo n.º 16
0
 def matrix(self):
     """Return the first object stored in ./tests/gene_marker_matrix.rdata.

     pandas2ri conversion is activated before loading, and the shape of the
     loaded object is printed.
     """
     pandas2ri.activate()
     loaded_names = r.load("./tests/gene_marker_matrix.rdata")
     rho = robjects.r[loaded_names[0]]
     print("RHO", rho.shape)
     return rho
Ejemplo n.º 17
0
        # Directory holding the per-gene weight files of this reference panel.
        weight_dir = f'fusion_twas/WEIGHTS/{ref_panel}'
        # Summary-statistics file for each supported trait name.
        trait_to_sumstats = {
            'CAD': 'fusion_twas/cardiogramplusc4d/cad.sumstats',
            'LDL': 'fusion_twas/LDL/ldl.sumstats',
            'Crohns': 'fusion_twas/crohns/crohns.sumstats',
        }
        sumstats_file = trait_to_sumstats[trait_name]
        # Z column of the summary statistics, indexed by the SNP column.
        rs_to_z = pd.read_table(sumstats_file, usecols=['SNP', 'Z'], index_col='SNP')['Z']
        # Twice the tail probability on the side of each Z value, i.e. a
        # two-sided p-value (assumes `norm` is scipy.stats.norm -- confirm).
        rs_to_p = pd.Series(2 * np.where(rs_to_z > 0, norm.sf(rs_to_z), norm.cdf(rs_to_z)),
                            index=rs_to_z.index)
        # One result frame per gene.
        dfs = {}
        # NOTE(review): `models` is not used in the visible part of this
        # function; its order presumably matches the wgt.matrix columns
        # ('top1' at index 2, see below) -- confirm.
        models = 'blup', 'lasso', 'top1', 'enet', 'prs'
        for gene in locus_genes:
            weight_file = f'{weight_dir}/{gene}_500kb.wgt.RDat'
            # Loading puts cv.performance, wgt.matrix and snps into the R session.
            r.load(weight_file)
            performance = r['cv.performance'][0]
            sorted_order = np.argsort(performance)
            best_model_index = sorted_order[-1]  # model with highest performance
            if best_model_index == 2: best_model_index = sorted_order[-2]  # if top1 is best, take 2nd-best
            # Weights of the chosen model, indexed by the V2 column of `snps`.
            df = pd.DataFrame({'weight': r['wgt.matrix'][:, best_model_index]}, index=r.snps['V2'].values)
            # Keep only SNPs with a non-zero weight.
            df = df[df['weight'] != 0]
            df['p'] = df.index.map(rs_to_p.__getitem__)  # map() doesn't work with dicts when used on indices
            df = df.sort_values('p')
            # GWAS_hits is only (re)assigned on the causal gene's iteration:
            # the six SNPs with the smallest p. Other genes reuse the value
            # from that iteration, so unless GWAS_hits is defined above this
            # excerpt the causal gene has to come first in locus_genes.
            if gene == causal_gene:
                GWAS_hits = df.index[:6]
            # noinspection PyUnboundLocalVariable
            for GWAS_hit in GWAS_hits:
                # One column per GWAS hit, filled by get_LD for every SNP of this gene.
                df[f'LD with {GWAS_hit}'] = df.index.map(lambda rs: get_LD(
                    chrom, rs, GWAS_hit, use_STARNET=True))
            dfs[gene] = df
Ejemplo n.º 18
0
def items():
    """Convert the R object ``items123`` to a pandas Series and pickle it.

    The converted data is split in the middle: the first half (cast to int)
    becomes the index and the second half the values. The Series is written
    to ``items.pd`` and returned.
    """
    object_name = "items123"
    r.load("{}.rdata".format(object_name))
    d = pandas2ri.ri2py(r[object_name])
    half = len(d) // 2
    index_part = d[:half]
    value_part = d[half:]
    s = pd.Series(index=map(int, index_part), data=value_part)
    s.to_pickle("items.pd")
    return s
Ejemplo n.º 19
0
def convert_from_R(name):
    """Load ``<name>.RData``, convert its ``res_max2`` object and pickle it.

    The converted object is passed through ``convert_to_data_format``, saved
    as ``<name>.pd`` and returned.
    """
    rdata_path = "{}.RData".format(name)
    pickle_path = "{}.pd".format(name)
    r.load(rdata_path)
    converted = convert_to_data_format(pandas2ri.ri2py(r["res_max2"]))
    converted.to_pickle(pickle_path)
    return converted
Ejemplo n.º 20
0
 def read_comadre(self):
     """Load the file at ``self.path`` into R and keep its ``comadre`` object.

     The object is stored, unconverted, in ``self._raw_data``.
     """
     r.load(self.path)
     comadre = r.comadre
     self._raw_data = comadre
def get_data(cohort,
             assembly='tophatV04',
             normalization='raw',
             filter='none',
             counter='htseq',
             sync=True,
             metaCounts=False,
             countDir='data/in',
             phenoFile='data/in.RData',
             filterFile=None):
    """
    Loads data from RData count files in `countDir`
    and pheno data from `phenoFile`.

    Example
    -------
    >>> counts, pheno = get_data('ukd1')

    Parameters
    ----------
    cohort String
        Determines the cohort to load.
        (e.g. 'ukd1', 'ukd2', 'ukd4', 'osr1', 'osr2',
                 'ut1', 'prad', 'pcap')
    assembly String
        The assembly used to define the features
        that were counted. (e.g. 'tophatV04' or 'stringtieV05')
    normalization String
        Which normalization of the counts to use.
        (e.g. 'raw', 'cpm', 'tpm', 'vst', 'rlog', 'normTransform')
        Any unambiguous prefix of one of these names is accepted.
        'cpm' is computed here from the raw counts file.
    filter String
        The filter applied before normalizing the data.
        (e.g. 'none' or 'q9')
    counter String
        The tool and summarization to count the features.
        e.g. 'kalGene' stands for kallisto counts that were
        summarized to gene counts. 'kallisto' is an alias for 'kalGene'.
    sync Bool
        A boolean indicating whether counts and pheno data
        should be restricted to common samples and ordered
        equally.
    metaCounts Bool
        A boolean indicating whether features that
        start with '__' (e.g. '__alignment_not_unique') should
        be included.
    countDir Path
        A directory where the expression data is stored in files
        `<normalization>-<filter>-<cohort>-<assembly>-counts.RData`
        (with `-<counter>` inserted before `-counts` for every
        counter other than 'htseq').
    phenoFile Path
        Path to a file that contains the pheno data as
        R dataframe.
    filterFile Path
        A text file containing the names of features
        that should be used.

    Value
    -----
    The function returns the two objects `counts, pheno`.
    counts pandas.dataframe
        A pandas data frame containing the counts
        with samples as columns and features as rows.
    pheno pandas.dataframe
        A pandas data frame containing the phenotypic
        data with samples as rows. Empty when `sync` is False
        and `phenoFile` does not exist.

    Raises
    ------
    TypeError
        If an argument has the wrong type.
    ValueError
        If `normalization` matches none or more than one of the
        known normalizations.
    IOError
        If the counts file, the filter file, or (with `sync`)
        the pheno file does not exist.
    """

    # check types
    stringArgs = [cohort, assembly, normalization, filter, counter]
    if not all(isinstance(arg, str) for arg in stringArgs):
        raise TypeError(
            "All arguments but `sync` and `metaCounts` musst be strings.")
    if not isinstance(sync, bool):
        raise TypeError("The argument `sync` musst be boolean value.")
    if not isinstance(metaCounts, bool):
        raise TypeError("The argument `metaCounts` musst be boolean value.")

    # check normalization
    norms = [
        'raw', 'tpm-normalized', 'vst-normalized', 'rlog-normalized',
        'normTransform-normalized', 'cpm-normalized'
    ]
    matches = [n for n in norms if n.startswith(normalization)]
    if not matches:
        raise ValueError(
            "The argument `normalization` needs to be an abbreviation for one of "
            + ', '.join(norms))
    if len(matches) > 1:
        # Two hits are already ambiguous (e.g. 'r' matches both 'raw' and
        # 'rlog-normalized'); never silently pick the first one.
        raise ValueError(
            "The abbreviation in the argument `normalization` is ambiguous.")
    normTerm = matches[0]

    # CPM has no file of its own: load the raw counts and scale them below.
    doCPM = normTerm == 'cpm-normalized'
    if doCPM:
        normTerm = 'raw'

    if (counter == 'kallisto'):
        counter = 'kalGene'
    fnParts = [normTerm, filter, cohort, assembly]
    if (counter != 'htseq'):
        # file names for the default counter carry no counter suffix
        fnParts.append(counter)

    # look for counts file
    cFileName = '-'.join(fnParts) + '-counts.RData'
    countsFile = os.path.join(countDir, cFileName)
    if (not os.path.isfile(countsFile)):
        raise IOError('There is no counts file for the given parameters: ' +
                      countsFile)

    pandas2ri.activate()

    # look for pheno file
    if (sync and not os.path.isfile(phenoFile)):
        raise IOError('The pheno data file could not e found: ' + phenoFile)
    elif (not os.path.isfile(phenoFile)):
        warnings.warn('The pheno data file could not e found: ' + phenoFile)
        pheno = pd.DataFrame()
    else:
        r.load(phenoFile)
        r('ind <- sapply(pheno, mode) == "logical"')
        r('pheno[ind] = lapply(pheno[ind], as.numeric)'
          )  # only float arrays have `NaN`
        r('pheno <- subset(pheno, Cohort != "UKDP1")'
          )  # remove duplicate samples
        pheno = r('pheno').set_index('ID')

    # load counts file
    r.load(countsFile)
    r('colnames(Counts) <- make.names(colnames(Counts))')
    if filterFile is not None:
        if not os.path.isfile(filterFile):
            raise IOError(
                f'The given filter file `{filterFile}` does not exist.')
        r(f'features <- readLines("{filterFile}")')
        r('Counts <- subset(Counts, rownames(Counts) %in% features)')
    counts = pd.DataFrame(r('Counts'),
                          index=r('rownames(Counts)'),
                          columns=r('colnames(Counts)'))

    if (not metaCounts):
        normalFeatures = [not f.startswith('__') for f in counts.index.values]
        counts = counts.loc[normalFeatures, :]

    if (doCPM):
        # counts per million: scale every sample (column) by its total
        counts = 1e6 * counts / counts.apply(sum, axis=0)

    if (sync):
        commonSample = sorted(set(pheno.index).intersection(counts.columns))
        counts = counts[commonSample]
        pheno = pheno.loc[commonSample]

    return counts, pheno