Ejemplo n.º 1
0
    def get_federov_data(self, factors):
        """Describe each factor as one row of an R data frame.

        For every factor the row holds its name, the first entry of its
        parameter range as ``low``, the second entry minus one as ``high``,
        the second entry as ``nLevels``, zeros for ``center`` and ``round``,
        and ``False`` for ``factor`` and ``mix``.

        Args:
            factors: Factor names; each must be a key of
                ``self.parameter_ranges``.

        Returns:
            An R ``DataFrame`` whose columns are, in this order, ``var``,
            ``low``, ``high``, ``center``, ``nLevels``, ``round``,
            ``factor`` and ``mix``.
        """
        ranges = self.parameter_ranges
        column_order = ["var", "low", "high", "center",
                        "nLevels", "round", "factor", "mix"]

        lows = IntVector([ranges[name][0] for name in factors])
        highs = IntVector([ranges[name][1] - 1 for name in factors])
        centers = IntVector([0 for _ in factors])
        level_counts = IntVector([ranges[name][1] for name in factors])
        rounding = IntVector([0 for _ in factors])
        factor_flags = BoolVector([False for _ in factors])
        mix_flags = BoolVector([False for _ in factors])

        columns = dict(zip(column_order,
                           (StrVector(factors), lows, highs, centers,
                            level_counts, rounding, factor_flags, mix_flags)))

        # Select the columns explicitly so their order does not depend on
        # how the data frame was assembled from the dictionary.
        return DataFrame(columns).rx(StrVector(column_order))
Ejemplo n.º 2
0
def qcrop2(xlist, ylist, labels=None, nq=4.):
    """Build a faceted R data frame of raw and quantile-cropped points.

    Every (x, y) series is cropped against the limits returned by
    ``qlim1(series, nq)``: a point lying outside either the x or the y
    limits is replaced by ``(nan, nan)`` in the cropped columns, while the
    raw columns keep the original values.

    Args:
        xlist: Sequence of x-value lists, one per facet.
        ylist: Sequence of y-value lists, parallel to ``xlist``.
        labels: Facet labels, one per series. Defaults to the series
            positions ``"0"``, ``"1"``, ...
        nq: Forwarded unchanged to ``qlim1``.

    Returns:
        An R ``DataFrame`` with the columns ``x``, ``y``, ``xcrop``,
        ``ycrop`` and ``facet`` (a factor whose levels follow ``labels``).
    """
    if labels is None:
        # Must be a real list: the labels are indexed per series and then
        # reused as factor levels, which a lazy ``map`` object (Python 3)
        # supports neither.
        labels = [str(i) for i in range(len(xlist))]

    x, y = [], []
    xcrop, ycrop = [], []
    facet = []
    for i, (onex, oney) in enumerate(zip(xlist, ylist)):
        xmin, xmax = qlim1(onex, nq)
        ymin, ymax = qlim1(oney, nq)
        # An explicit loop (rather than unpacking ``zip(*pairs)``) also
        # copes with a series that contains no points.
        for vx, vy in zip(onex, oney):
            if vy > ymax or vy < ymin or vx < xmin or vx > xmax:
                xcrop.append(nan)
                ycrop.append(nan)
            else:
                xcrop.append(vx)
                ycrop.append(vy)
        x += onex
        y += oney
        facet += [labels[i]] * len(onex)

    return DataFrame({
        'x': FloatVector(x),
        'y': FloatVector(y),
        'xcrop': FloatVector(xcrop),
        'ycrop': FloatVector(ycrop),
        'facet': FactorVector(StrVector(facet), levels=StrVector(labels)),
    })
Ejemplo n.º 3
0
def bargraph_language(results):
    """Write one grouped bar chart of lines of code per language.

    For each entry of the module-level ``languages`` a PDF named
    ``bargraph-loc-lang-<language>.pdf`` is produced, showing the line
    counts per problem, with one bar per variation.

    Args:
        results: Mapping from ``(language, problem, variation)`` tuples to
            a line count.
    """
    r = robjects.r

    for language in languages:
        # Keep only this language's entries, in the mapping's own order.
        selected = [(pretty_varis[var], prob, loc)
                    for (lang, prob, var), loc in results.items()
                    if lang == language]
        varis = [entry[0] for entry in selected]
        probs = [entry[1] for entry in selected]
        locs = [entry[2] for entry in selected]

        r.pdf('bargraph-loc-lang-' + language + '.pdf',
              height=pdf_height(),
              width=pdf_width())
        df = robjects.DataFrame({
            'Variation': StrVector(varis),
            'Problem': StrVector(probs),
            'Lines': IntVector(locs),
        })

        pp = (
            ggplot2.ggplot(df)
            + ggplot2.aes_string(x='Problem', y='Lines', fill='Variation')
            + ggplot2.geom_bar(position='dodge', stat='identity')
            + ggplot2_options()
            + ggplot2_colors()
            + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
            + robjects.r('ylab("Lines of Code")')
        )
        pp.plot()
        r['dev.off']()
Ejemplo n.º 4
0
    def Run(self):
        """Read per-gene means and pass them to the R correlation plot.

        The input file ``self.gene_means`` is tab-separated and is parsed
        according to ``self.filetype``:

        * ``"gene_means"``: lines starting with ``#`` are comments; the last
          one supplies the sample names from its fourth column onwards.
          Every other line contributes the floats from its fourth column
          onwards.
        * ``"anova"`` / ``"zinb"``: the last line that starts with ``#`` or
          mentions both ``pval`` and ``padj`` is the header. The number of
          conditions ``n`` is derived from the column count, the means are
          columns 4 to 3+n, and only rows whose second-to-last column
          (q-value) is below 0.05 are kept.

        Any other file type prints a message and exits the process with
        status -1.
        """
        self.transit_message("Starting Corrplot")

        # assume first non-comment line is header
        headers = None
        data, means = [], []

        if self.filetype == "gene_means":
            # ``with`` closes the file even if a line fails to parse.
            with open(self.gene_means) as infile:
                for line in infile:
                    w = line.rstrip().split('\t')
                    if line[0] == '#':
                        headers = w[3:]
                        continue  # last comment line has names of samples
                    data.append(w)
                    means.append([float(x) for x in w[3:]])
        elif self.filetype in ("anova", "zinb"):
            n = -1  # number of conditions, determined on the first data row
            with open(self.gene_means) as infile:
                for line in infile:
                    w = line.rstrip().split('\t')
                    # check for 'pval' for backwards compatibility
                    if line[0] == '#' or ('pval' in line and 'padj' in line):
                        headers = w
                        continue  # keep last comment line as headers
                    if n == -1:
                        # ANOVA header line has names of conditions, organized as
                        # 3+2*n+3 (2 groups (means, LFCs) X n conditions);
                        # ZINB header line is organized as 3+4*n+3
                        # (4 groups X n conditions)
                        if self.filetype == "anova":
                            n = int((len(w) - 6) / 2)
                        elif self.filetype == "zinb":
                            n = int((len(headers) - 6) / 4)
                        headers = headers[3:3 + n]
                        headers = [x.replace("Mean_", "") for x in headers]
                    # take just the columns of means
                    vals = [float(x) for x in w[3:3 + n]]
                    qval = float(w[-2])
                    if qval < 0.05:
                        data.append(w)
                        means.append(vals)
        else:
            print("filetype not recognized: %s" % self.filetype)
            sys.exit(-1)
        print("correlations based on %s genes" % len(means))

        genenames = ["%s/%s" % (w[0], w[1]) for w in data]
        headers = [h.replace("Mean_", "") for h in headers]
        # One R numeric vector per sample/condition column.
        columns = {}
        for i, col in enumerate(headers):
            columns[col] = FloatVector([x[i] for x in means])
        df = DataFrame(columns)  # can't figure out how to set rownames

        corrplotFunc = self.make_corrplotFunc()
        # pass headers to put cols in order, since df comes from dict
        corrplotFunc(
            df, StrVector(headers), StrVector(genenames), self.outfile
        )

        self.finish()
        self.transit_message("Finished Corrplot")
Ejemplo n.º 5
0
    def _mark_timestamp(self, blSegsL):
        """
        Tag the non-baseline segments of the final sample with a timestamp.

        One GRanges object is built per entry of ``blSegsL`` and stored in an
        R GRangesList. The segments of the last segment pool
        (``self._segPoolL[-1].segments``) that are not in ``blSegsL[-1]`` are
        then overlapped against that list with ``minoverlap=5000``. Each
        overlapping segment gets its ``tag`` set to the largest 1-based list
        position it overlaps, as a string.

        blSegsL: sequence of baseline-segment collections; every segment is
            read through its ``chromName``, ``start`` and ``end`` attributes.
        """
        # R is used to solve this.

        # First, compute the set of baseline differences between each pair
        # of adjacent samples
        #
        # or simply list all baselines.

        # Then map those baseline differences onto the non-baseline segments
        # of the data to determine which baseline difference each one
        # belongs to
        #
        # or find the largest index among the baselines it falls into.

        # Finally, among all data points, the first one to fall into a
        # baseline difference gives the target timestamp
        #
        # i.e. that index is used as the timestamp.

        from rpy2.robjects.packages import importr
        from rpy2.robjects import IntVector, StrVector, globalenv
        import rpy2.robjects as robjects

        GR = importr('GenomicRanges')
        IR = importr('IRanges')

        # The list is grown inside R's global environment: each GRanges is
        # published as "tempGR" and appended with an R assignment.
        GRL = GR.GRangesList()
        globalenv["GRL"] = GRL
        for blSegs, idx in zip(blSegsL, range(len(blSegsL))):
            chromNames = StrVector([seg.chromName for seg in blSegs])
            starts = IntVector([seg.start for seg in blSegs])
            ends = IntVector([seg.end for seg in blSegs])
            tempGR = GR.GRanges(seqnames = chromNames, ranges=IR.IRanges(starts, ends))
            globalenv["tempGR"] = tempGR
            robjects.r("GRL[[{0}]]=tempGR".format(str(idx+1)))
            GRL = robjects.r["GRL"]

        # The list holds references to the target Seg objects, so updating
        # the entries of nonBlSegs is sufficient.
        nonBlSegs = list(set(self._segPoolL[-1].segments) - set(blSegsL[-1]))
        chromNames = StrVector([seg.chromName for seg in nonBlSegs])
        starts = IntVector([seg.start for seg in nonBlSegs])
        ends = IntVector([seg.end for seg in nonBlSegs])
        nonBlGR = GR.GRanges(seqnames = chromNames, ranges=IR.IRanges(starts, ends))

        # fo = IR.findOverlaps(nonBlGR, GRL)
        # For large SCNA
        fo = IR.findOverlaps(nonBlGR, GRL, minoverlap=5000)
        globalenv["fo"] = fo
        robjects.reval("fom <- as.matrix(fo)")
        # Flatten the R matrix and reshape it to its reversed dimensions, then
        # shift to 0-based indices. Row 0 is used below to index nonBlSegs and
        # row 1 supplies the matching list positions.
        overlapIdx = np.array(list(robjects.r.fom)).reshape(tuple(reversed(robjects.r.fom.dim))) - 1
        # [[2, 2, 3, 3],
        # [1, 2, 1, 2]]
        #
        print overlapIdx

        for index in set(overlapIdx[0,]):
            yIdxes = np.where(overlapIdx[0,]==index)[0]
            # +1 restores the 1-based list position before taking the maximum.
            ts = np.max(overlapIdx[1,yIdxes]+1)
            nonBlSegs[index].tag = str(ts)
Ejemplo n.º 6
0
def residues_groups(site_type, modified_residues):
    """Return the residue groups for a site type as an R character vector.

    Phosphorylation sites are split into the two groups ``S|T`` and ``Y``.
    Every other site type gets a single group that joins all modified
    residues with ``|``.
    """
    if site_type.name != 'phosphorylation':
        # TODO: better grouping of residues for site-specific enzymes:
        # for glycosylation there are ~16 enzymes; the idea would be to load
        # "site" : "terminal sugar" associations (e.g. from O-GlycBase) and
        # then map "terminal sugar" : "enzyme" for enzymes known to catalyze
        # glycosylation with the given "terminal sugar" (and for the given
        # link type). Some additional literature review might be needed.
        groups = ['|'.join(modified_residues)]
    else:
        groups = ['S|T', 'Y']
    return StrVector(groups)
Ejemplo n.º 7
0
def bargraph_variation_diff():
    """Plot the relative coding-time difference between expert and standard.

    For each of the pairs ``seq``/``expertseq`` and ``par``/``expertpar`` a
    PDF named ``bargraph-codingtime-diff-<standard>.pdf`` is written. The
    plotted value per language and problem is
    ``(expert + standard) / standard - 1``, or ``0`` when either time is
    missing from the module-level ``result`` mapping.
    """
    r = robjects.r

    for standard, expert in (('seq', 'expertseq'), ('par', 'expertpar')):
        langs, probs, diffs = [], [], []
        for lang in languages:
            for prob in problems:
                missing = False
                try:
                    base_time = result[lang][prob][standard]
                except KeyError:
                    missing = True
                try:
                    expert_time = result[lang][prob][expert]
                except KeyError:
                    missing = True

                if missing:
                    diff = 0
                else:
                    diff = (float(expert_time + base_time) / float(base_time) - 1)

                langs.append(pretty_langs[lang])
                probs.append(prob)
                diffs.append(diff)

        r.pdf('bargraph-codingtime-diff-' + standard + '.pdf',
              height=pdf_height(),
              width=pdf_width())
        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Difference': FloatVector(diffs),
        })

        pp = (
            ggplot2.ggplot(df)
            + ggplot2.aes_string(x='Problem', y='Difference', fill='Language')
            + ggplot2.geom_bar(position='dodge', stat='identity')
            + ggplot2_options()
            + ggplot2_colors()
            + robjects.r('ylab("Coding time difference (in percent)")')
            + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
            + robjects.r('scale_y_continuous(labels = percent_format())')
        )
        pp.plot()
        r['dev.off']()
Ejemplo n.º 8
0
    def __init__(self, item, names=None, verbose=False):
        """Make data instance from a python or R (rpy2) object input item.

        Args:
            item: The object to wrap. ``ListVector``, R ``DataFrame`` and
                ``dict`` inputs are stored in ``self.iloc`` as a dict of
                nested ``PyR`` objects; anything else is stored as a numpy
                array, reshaped column-major when the item has a
                multi-dimensional ``dim`` attribute.
            names: Optional names that override ``item.names``; converted
                with ``StrListVector``.
            verbose: When true, print a one-line summary of what was built.

        ``names``, ``colnames`` and ``rownames`` are only set as attributes
        when a source for them exists (the item itself, the ``names``
        argument, or a pandas index/columns/name).
        """
        self.verbose = verbose
        self.dim = ()

        # Gather names, colnames and rownames; later sources override
        # earlier ones.
        if hasattr(item, 'names'):
            self.names = item.names  # as StrVectors
        if names is not None:
            self.names = StrListVector(names)
        if hasattr(item, 'colnames'):
            self.colnames = item.colnames
        if hasattr(item, 'rownames'):
            self.rownames = item.rownames
        if isinstance(item, (Series, DataFrame)):
            self.rownames = StrVector(item.index)
        if isinstance(item, DataFrame):
            self.colnames = StrVector(item.columns)
        if isinstance(item, Series) and item.name is not None:
            self.colnames = StrVector([item.name])

        if isinstance(item, (ListVector, ro.vectors.DataFrame, dict)):
            # Dict-like input: prefer the stored names as keys and fall back
            # to the item's own keys when they are missing or unusable.
            try:
                names = [
                    self.names[i]
                    if isinstance(self.names, StrVector) else self.names[0][i]
                    for i in range(len(item))
                ]
            except Exception:  # no longer swallows KeyboardInterrupt/SystemExit
                names = [k for k, _ in item.items()]
            self.iloc = {n: PyR(v) for n, (k, v) in zip(names, item.items())}
            if verbose:
                print(f"PyR: dict (len={len(self.iloc)}){type(item)}")
        else:  # not dict-like, so convert to numpy array and apply shape dims
            self.iloc = np.array(item)
            if hasattr(item, 'dim'):
                self.dim = tuple(item.dim)
                if len(self.dim) > 1:
                    self.iloc = self.iloc.reshape(tuple(item.dim), order='F')
            self.dim = self.iloc.shape

            # Row/column names can only be inferred for multi-dimensional
            # data that carries a ListVector of names.
            if len(self.iloc.shape) > 1:
                if not hasattr(self, 'rownames'):
                    # ``names`` may never have been set (e.g. a plain
                    # ndarray), so read it defensively.
                    stored = getattr(self, 'names', None)
                    if stored and isinstance(stored, ListVector):
                        self.rownames = stored[0]  # try to infer rownames
                if not hasattr(self, 'colnames'):
                    stored = getattr(self, 'names', None)
                    if stored and isinstance(stored, ListVector):
                        self.colnames = stored[1]  # try to infer colnames
            if verbose:
                print(f"PyR: ndarray {self.iloc.shape} {type(item)}")
Ejemplo n.º 9
0
def draw_hist(length, pdfname='hist.pdf', b=25, m=700, wd=8, hd=6):
    """Draw a histogram of sequence lengths into a PDF using R.

    The arguments are published in R's global environment (as ``dd``,
    ``nm``, ``b``, ``m``, ``wd`` and ``hd``) and an R script then plots
    ``dd`` with the breaks ``seq(0, m, b)``.

    Args:
        length: Sequence lengths to plot.
        pdfname: Path of the PDF file to write.
        b: Bin width.
        m: Upper end of the break sequence.
        wd: PDF width passed to R's ``pdf``.
        hd: PDF height passed to R's ``pdf``.
    """
    r_env = robjects.globalenv
    r_env["dd"] = IntVector(length)
    r_env["nm"] = StrVector([pdfname])
    for r_name, value in (("b", b), ("m", m), ("wd", wd), ("hd", hd)):
        r_env[r_name] = IntVector([value])

    robjects.r('''
    library(ape)
    pdf(nm,width=wd,height=hd)
    xcol=seq(0,m,b)

    hist(dd,freq=TRUE,breaks=xcol,col='#228B22',xlab='Sequence Length',ylab='Sequence number',main='Distribution of Sequence Length')
    dev.off()       
    ''')
Ejemplo n.º 10
0
def StrListVector(strList):
    """Convert input to a StrVector, or a ListVector recursively.

    Args:
        strList: A string, a list-like of strings, a nested list-like, or
            an existing ``StrVector`` / ``ListVector``.

    Returns:
        ``NULL`` when the input is empty or has no length (``None``, ``''``,
        a non-string scalar, ...); a copy for existing R vectors; a
        ``StrVector`` for a string or a flat list-like of strings; otherwise
        a ``ListVector`` whose unnamed elements are converted recursively.
    """
    # An explicit check instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would let empty or unsized input through.
    try:
        is_empty = len(strList) == 0
    except Exception:  # NULL, None, non-str scalar etc. have no usable len
        return NULL
    if is_empty:
        return NULL

    if isinstance(strList, ListVector):  # already a ListVector
        return ListVector(strList)
    if isinstance(strList, StrVector):  # already a StrVector
        return StrVector(strList)
    if isinstance(strList, str):  # str scalar, so apply StrVector
        return StrVector([strList])
    if any(types.is_list_like(s) for s in strList):  # not the deepest list
        return ListVector([(None, StrListVector(s)) for s in strList])
    return StrVector(list(strList))  # is deepest list(-like) of str types
def create_roast_scorer(gene_sets='c2.cp.kegg',
                        id_type='entrez',
                        grouping='by_substance',
                        q_value_cutoff=0.1,
                        na_action='fill_0',
                        cache=True,
                        cache_signatures=False):
    """Create a scoring function based on ROAST gene-set results.

    Only cache signatures when doing permutations, otherwise it will only
    slow it down.

    Args:
        gene_sets: Name of the gene-set collection; also the name under
            which the collection is published in R's global environment.
        id_type: Gene identifier type used when loading the collection.
        grouping: Forwarded to ``scoring_function``.
        q_value_cutoff: Disease gene sets whose ``fdr_q-val`` exceeds this
            value are dropped.
        na_action: Forwarded to ``combine_gsea_results``.
        cache: Whether disease results use the cache.
        cache_signatures: Whether compound results use the cache too
            (only effective when ``cache`` is true).

    Returns:
        The object produced by ``scoring_function`` wrapping the scorer.
    """
    importr('limma')
    importr('Biobase')

    collection = db.load(gene_sets=gene_sets, id_type=id_type)
    gene_sets_r = ListVector({
        gene_set.name: StrVector(list(gene_set.genes))
        for gene_set in collection.gene_sets
    })

    def set_gene_set_collection():
        # Publish the collection in R under the collection's own name.
        globalenv[gene_sets] = gene_sets_r

    def roast_score(disease: ExpressionWithControls,
                    compound: ExpressionWithControls):
        # At least two cases and two controls are required.
        enough_samples = (len(compound.cases.columns) >= 2
                          and len(compound.controls.columns) >= 2)
        if not enough_samples:
            print(
                f'Skipping {compound} not enough degrees of freedom (no way to compute in-group variance)'
            )
            return None

        if cache:
            multiprocess_cache_manager.respawn_cache_if_needed()

        try:
            disease_gene_sets = roast(disease,
                                      gene_sets=gene_sets,
                                      use_cache=cache)
            rejected = disease_gene_sets[
                disease_gene_sets['fdr_q-val'] > q_value_cutoff]
            disease_gene_sets.drop(rejected.index, inplace=True)

            signature_gene_sets = roast(compound,
                                        gene_sets=gene_sets,
                                        use_cache=cache and cache_signatures)

            joined = combine_gsea_results(disease_gene_sets,
                                          signature_gene_sets, na_action)

            # Occasionally ask R to collect garbage.
            if randint(0, 100) == 1:
                r('gc()')

            return joined.score.mean()
        except RRuntimeError as error:
            print(error)
            return None

    return scoring_function(roast_score,
                            input=ExpressionWithControls,
                            grouping=grouping,
                            before_batch=set_gene_set_collection)
Ejemplo n.º 12
0
    def configure(self, params):
        """Configure the predictor and load the R model.

        After the parent configuration, unset class labels are replaced by
        R ``NULL`` (a set ``_class_labels`` becomes a ``StrVector``), the
        common and scoring R scripts are sourced, and the R handler is
        initialised. For unstructured targets both the load-model and the
        score-unstructured hooks must exist on the handler.

        Raises:
            DrumCommonException: A required hook is missing in unstructured
                mode.
        """
        super(RPredictor, self).configure(params)

        r_null = ro.rinterface.NULL
        if self._positive_class_label is None:
            self._positive_class_label = r_null
        if self._negative_class_label is None:
            self._negative_class_label = r_null
        self._class_labels = (r_null if self._class_labels is None
                              else StrVector(self._class_labels))

        r_handler.source(R_COMMON_PATH)
        r_handler.source(R_SCORE_PATH)
        r_handler.init(self._custom_model_path, self._target_type.value)

        if self._target_type == TargetType.UNSTRUCTURED:
            required_hooks = (CustomHooks.LOAD_MODEL,
                              CustomHooks.SCORE_UNSTRUCTURED)
            for hook_name in required_hooks:
                if hasattr(r_handler, hook_name):
                    continue
                raise DrumCommonException(
                    "In '{}' mode hook '{}' must be provided.".format(
                        TargetType.UNSTRUCTURED.value, hook_name))

        self._model = r_handler.load_serialized_model(self._custom_model_path)
Ejemplo n.º 13
0
def c_index_from_r(values,
                   isdead,
                   nbdays,
                   values_test,
                   isdead_test,
                   nbdays_test,
                   isfactor=False):
    """Fit a Cox model in R on the training data and score the test data.

    A ``coxph`` model ``Surv(nbdays, isdead) ~ values`` is fitted, its
    predictions for ``values_test`` are computed, and R's
    ``concordance.index`` (``method='noether'``) is evaluated against the
    test survival data.

    Args:
        values, isdead, nbdays: Training covariate, event indicator and
            survival time.
        values_test, isdead_test, nbdays_test: The same for the test set.
        isfactor: When true the covariate vectors are converted to string
            vectors after the float conversion.

    Returns:
        ``c_index[0][0]`` of the R result, or ``nan`` when computing the
        concordance index raises.
    """
    rob.r('set.seed(2016)')

    isdead = FloatVector(isdead)
    isdead_test = FloatVector(isdead_test)
    nbdays = FloatVector(nbdays)
    nbdays_test = FloatVector(nbdays_test)
    values = FloatVector(values)
    values_test = FloatVector(values_test)

    if isfactor:
        values, values_test = StrVector(values), StrVector(values_test)

    cox = Formula('Surv(nbdays, isdead) ~ values')
    for r_name, vector in (('nbdays', nbdays),
                           ('isdead', isdead),
                           ('values', values)):
        cox.environment[r_name] = vector

    fit = survival.coxph(cox)
    make_frame = rob.r('data.frame')
    predicted = rob.r.predict(fit, make_frame(values=values_test))
    concordance_index = rob.r('concordance.index')

    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            c_index = concordance_index(predicted,
                                        nbdays_test,
                                        isdead_test,
                                        method='noether')
    except Exception as error:
        print("exception found for c index!: {0}".format(error))
        return nan

    return c_index[0][0]
Ejemplo n.º 14
0
def bargraph_language(cfg, values):
    """Write one execution-time bar chart with error bars per language.

    For every language in ``cfg.languages`` a PDF named
    ``bargraph-executiontime-lang-<language>.pdf`` is produced. Each bar is
    the mean of the measurements for one problem/variation pair, and the
    error bar is half the width of the 99.9% confidence interval of a
    t-test on those measurements.

    Args:
        cfg: Configuration exposing ``languages``, ``problems`` and
            ``variations``.
        values: Nested mapping ``values[problem][variation][language][0]``
            holding the measured times.
    """
    r = robjects.r
    for lang in cfg.languages:
        times = []
        varss = []
        probs = []
        ses = []

        for prob in cfg.problems:
            for var in cfg.variations:
                # use the pretty variation names so the legend is readable
                varss.append(pretty_varis[var])
                probs.append(prob)

                data = FloatVector(values[prob][var][lang][0])
                times.append(r['mean'](data)[0])

                # The key must be exactly "conf.level". With a leading space
                # R does not match it to the parameter and silently falls
                # back to the default 95% interval.
                t_result = r['t.test'](data, **{
                    "conf.level": 0.999
                }).rx('conf.int')[0]
                ses.append((t_result[1] - t_result[0]) / 2)

        r.pdf('bargraph-executiontime-lang-' + lang + '.pdf',
              height=pdf_height(),
              width=pdf_width())
        df = robjects.DataFrame({
            'Variation': StrVector(varss),
            'Problem': StrVector(probs),
            'Time': FloatVector(times),
            'SE': FloatVector(ses)
        })

        limits = ggplot2.aes(ymax='Time + SE', ymin='Time - SE')
        dodge = ggplot2.position_dodge(width=0.9)

        gp = ggplot2.ggplot(df)

        pp = gp + \
            ggplot2.aes_string (x='Problem', y='Time', fill='Variation') + \
            ggplot2.geom_bar (position='dodge', stat='identity') + \
            ggplot2.geom_errorbar (limits, position=dodge, width=0.25) + \
            ggplot2_options () + \
            ggplot2_colors () + \
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') +\
            robjects.r('ylab("Execution time (in seconds)")')
        pp.plot()
        r['dev.off']()
Ejemplo n.º 15
0
def sites_mutated_ratio(path='static/plot.png',
                        width=1400,
                        height=900,
                        dpi=72,
                        exclude: List[str] = None,
                        glycosylation='together'):
    """Plot, per mutation source, the share of sites affected by mutations.

    The ratios come from ``sites_mutated_ratio_by_type`` for every confirmed
    source, once for disordered and once for ordered regions. They are drawn
    as stacked bars per site type, faceted by source.

    Args:
        path: Output file for ``ggsave``; nothing is saved when falsy.
        width: Image width in pixels.
        height: Image height in pixels.
        dpi: Resolution used to convert pixels to inches.
        exclude: Forwarded to ``sites_mutated_ratio_by_type``.
        glycosylation: Forwarded to ``sites_mutated_ratio_by_type``.
    """
    from pandas import DataFrame
    from helpers.ggplot2 import GG
    from rpy2.robjects.packages import importr
    from rpy2.robjects import StrVector

    records = []
    for in_disorder in (True, False):
        for source in source_manager.confirmed:
            ratios = sites_mutated_ratio_by_type(source.name,
                                                 disordered=in_disorder,
                                                 relative=False,
                                                 display=False,
                                                 exclude=exclude,
                                                 glycosylation=glycosylation)
            records.extend(
                {
                    'site_type': site_name,
                    'disordered_region': 'Yes' if in_disorder else 'No',
                    'percentage': percentage,
                    'source': source.name
                }
                for site_name, percentage in ratios.items()
            )

    df = DataFrame(records)
    ggplot2 = importr("ggplot2")
    theme_options = {
        'axis.text.x': ggplot2.element_text(angle=90, hjust=1),
        'axis.text': ggplot2.element_text(size=15),
        'text': ggplot2.element_text(size=14),
        'legend.text': ggplot2.element_text(size=14),
        'legend.position': 'bottom'
    }

    base = GG(
        ggplot2.ggplot(
            df,
            ggplot2.aes_string(
                x='site_type', y='percentage', fill='disordered_region')))
    # The chart is kept in a local; ggsave below is not given it explicitly.
    chart = (base
             + ggplot2.geom_bar(stat='identity',
                                position=ggplot2.position_stack(reverse=True))
             + ggplot2.facet_grid('~source')
             + ggplot2.theme(**theme_options)
             + ggplot2.labs(x='Site type',
                            y=r'Percentage of sites affected by mutations',
                            fill='Is site in disordered region?')
             + ggplot2.scale_fill_manual(
                 values=StrVector(["#998ec3", "#f1a340"])))

    if not path:
        return
    ggplot2.ggsave(str(path),
                   width=width / dpi,
                   height=height / dpi,
                   dpi=dpi,
                   units='in',
                   bg='transparent')
Ejemplo n.º 16
0
def uninstall_grf():
    """Remove the R package ``grf`` if it is installed, before a test runs."""
    if not rpackages.isinstalled("grf"):
        return

    robjects.r.options(download_file_method="curl")
    r_utils = rpackages.importr("utils")
    r_utils.chooseCRANmirror(ind=0)
    r_utils.remove_packages(StrVector(["grf"]))
Ejemplo n.º 17
0
    def python_type_to_R_type(cls, pobject=None):
        """Convert a Python sequence to the matching rpy2 vector type.

        Lists and pandas Series are first turned into numpy arrays; the
        array's dtype name then selects the R vector class.

        Args:
            pobject: The object to convert.

        Returns:
            An ``IntVector``, ``FloatVector``, ``StrVector`` or
            ``BoolVector`` for arrays whose dtype name starts with ``int``,
            ``float``, ``str`` or ``bool`` respectively. Arrays of any other
            dtype are returned as numpy arrays, and objects that are not a
            list, ndarray or Series are returned unchanged.
        """
        if not isinstance(pobject, (list, np.ndarray, pd.Series)):
            return pobject

        if isinstance(pobject, (list, pd.Series)):
            pobject = np.array(pobject)

        dtype_name = pobject.dtype.name
        if dtype_name.startswith('int'):
            return IntVector(pobject)
        if dtype_name.startswith('float'):
            return FloatVector(pobject)
        if dtype_name.startswith('str'):
            return StrVector(pobject)
        if dtype_name.startswith('bool'):
            # Booleans map to R logical vectors, not to character vectors.
            from rpy2.robjects import BoolVector
            return BoolVector(pobject)
        return pobject
def install_e1071():
    """Install the R package ``e1071`` from CRAN mirror number 1.

    This only needs to be called once per machine. Run it from an
    interactive session, because the installation might ask questions.
    """
    from rpy2.robjects import StrVector
    import rpy2.robjects.packages as rpackages

    r_utils = rpackages.importr('utils')
    r_utils.chooseCRANmirror(ind=1)
    wanted = StrVector(['e1071'])
    r_utils.install_packages(wanted)
Ejemplo n.º 19
0
def bargraph_language():
    """Write one coding-time bar chart per language.

    For each entry of the module-level ``languages`` a PDF named
    ``bargraph-codingtime-lang-<language>.pdf`` is produced from the
    module-level ``result`` mapping. Missing times count as ``0``, and the
    time of an ``expert*`` variation additionally includes the time of the
    corresponding non-expert variation when that one is available.
    """
    r = robjects.r

    for language in languages:
        varis, probs, times = [], [], []
        for prob in problems:
            for var in variations:
                try:
                    minutes = result[language][prob][var]
                except KeyError:
                    minutes = 0

                # expert times are reported on top of the non-expert times
                if var.startswith('expert'):
                    base_var = var.replace('expert', '')
                    try:
                        minutes = minutes + result[language][prob][base_var]
                    except KeyError:
                        pass

                varis.append(pretty_varis[var])
                probs.append(prob)
                times.append(minutes)

        r.pdf('bargraph-codingtime-lang-' + language + '.pdf',
              height=pdf_height(),
              width=pdf_width())
        df = robjects.DataFrame({
            'Variation': StrVector(varis),
            'Problem': StrVector(probs),
            'Time': IntVector(times),
        })

        pp = (
            ggplot2.ggplot(df)
            + ggplot2.aes_string(x='Problem', y='Time', fill='Variation')
            + ggplot2.geom_bar(position='dodge', stat='identity')
            + ggplot2_options()
            + ggplot2_colors()
            + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
            + robjects.r('ylab("Coding time (in minutes)")')
        )
        pp.plot()
        r['dev.off']()
Ejemplo n.º 20
0
def plot_bar(stats, output_file=None, **kw):
    """Draw box plots of goodness-of-fit p-values per model and dataset.

    stats: mapping whose values are lists of runs; each run is a sequence
        of result dicts read through the keys 'name', 'with_rate',
        'from_directory' and 'gs_p'.
    output_file: when given, the plot is saved there with ggsave; otherwise
        the plot is printed and the function waits for Enter.
    kw: accepted but not used in this function.

    NOTE(review): written for Python 2 (print statements, raw_input,
    indexing of ``stats.values()``).
    """
    # Model names are taken from the first run of the first entry; models
    # fitted with rates get a '+Gamma' suffix.
    names = [r['name'] for r in stats.values()[0][0]]
    with_rates = [r['with_rate'] for r in stats.values()[0][0]]
    names = [n + ('+Gamma' if w else '') for n, w in zip(names, with_rates)]

    # Group the runs by the directory recorded in their first result.
    by_dir = defaultdict(list)
    for triad in stats:
        for r in stats[triad]:
            by_dir[r[0]['from_directory']].append(r)

    # Transpose: afterwards each directory holds one tuple of p-values per
    # model position instead of one list per run.
    for d in by_dir:
        by_dir[d] = zip(*[[gs_p(_r['gs_p']) for _r in r] for r in by_dir[d]])

    runs = []
    g_stats = []
    data = []
    alpha = 0
    for d, v in by_dir.items():
        # The dataset label is derived from the directory path components.
        if 'exons' in d.split('/'):
            dataset = 'Nuclear'
        elif 'mtDNA' in d.split('/'):
            dataset = 'Mitochondrial'
        else:
            dataset = 'Microbial'
        print dataset
        for j, g in enumerate(v):
            g_stats += g
            data += [dataset] * len(g)
            runs += [j] * len(g)
            # NOTE(review): both operands are ints, so without
            # ``from __future__ import division`` this prints a floored
            # ratio under Python 2 -- confirm against the file's imports.
            print names[j], sum(1 for _g in g if _g > 0.05) / len(g)
            alpha = max(alpha, get_alpha(g))
        # Uses ``g`` from the last iteration of the loop above.
        print 'Samples', len(g)
    # The model names are spliced into an R expression() call for the
    # x-axis labels.
    labels = 'expression(' + ','.join(names) + ')'

    df = DataFrame({
        'run': IntVector(runs),
        'g_stat': FloatVector(g_stats),
        'data': StrVector(data)
    })
    globalenv['df'] = df
    R('library(scales)')
    #            'geom_jitter(alpha=0.2, size=1) + ' + \
    #            'geom_boxplot(fill=NA, outlier.size=0, size=1.5, color=alpha("white", 0.5)) + ' + \
    #            'geom_boxplot(alpha=0.8, outlier.size=0) + ' + \
    #            'geom_hline(yintercept=0.05, size=1.5, alpha=0.5, color="white") + ' + \
    #            'geom_hline(yintercept=0.05, color="black") + ' + \
    cmd = 'gg <- ggplot(df, aes(factor(run), g_stat)) + ' + \
            'ylab("Goodness-of-Fit p-value") + xlab("Model") + ' + \
            'geom_boxplot(outlier.size=1, outlier.colour=alpha("black",'+str(alpha)+')) + ' + \
            'scale_x_discrete(labels=' + labels + ') + ' + \
            'theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) + ' + \
            'facet_grid(. ~ data)'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print R['gg']
        raw_input('Press Enter to continue...')
Ejemplo n.º 21
0
    def generate_valid_sample(self, sample_size):
        """Draw random coordinates until enough distinct valid ones exist.

        Random coordinates from ``self.getRandomCoord()`` are converted to
        performance parameters and checked against ``self.constraint``.
        Every distinct valid point is recorded by the index of each
        parameter value within ``self.parameter_values``.

        Args:
            sample_size: Number of distinct valid points to collect.

        Returns:
            The result of ``self.encode_data`` applied to an R data frame
            with one integer column per axis name, in ``self.axis_names``
            order.
        """
        search_space_dataframe = {n: [] for n in self.axis_names}

        search_space = {}
        evaluated = 0

        info(
            "Generating valid search space of size {0} (does not spend evaluations)"
            .format(sample_size))

        # Progress is reported about every tenth of the sample. The step is
        # kept at 1 or more so sample sizes below ten do not cause a modulo
        # by zero.
        progress_step = max(1, int(sample_size / 10))

        while len(search_space) < sample_size:
            candidate_point = self.getRandomCoord()
            candidate_point_key = str(candidate_point)
            evaluated += 1

            if candidate_point_key not in search_space:
                perf_params = self.coordToPerfParams(candidate_point)

                # NOTE(review): eval() runs the constraint expression as
                # code; it must only ever come from a trusted source.
                is_valid = eval(self.constraint, copy.copy(perf_params),
                                dict(self.input_params))

                if is_valid:
                    search_space[candidate_point_key] = candidate_point

                    for n in perf_params:
                        candidate_value = self.parameter_values[n].index(
                            perf_params[n])
                        search_space_dataframe[n].append(candidate_value)

                    if len(search_space) % progress_step == 0:
                        info("Valid coordinates: " + str(len(search_space)) +
                             "/" + str(sample_size))
                        info("Tested coordinates: " + str(evaluated))

                if evaluated % 1000000 == 0:
                    info("Tested coordinates: " + str(evaluated))

        info("Valid/Tested configurations: " + str(len(search_space)) + "/" +
             str(evaluated))

        for k in search_space_dataframe:
            search_space_dataframe[k] = IntVector(search_space_dataframe[k])

        search_space_dataframe_r = DataFrame(search_space_dataframe)
        # Select the columns explicitly to fix their order.
        search_space_dataframe_r = search_space_dataframe_r.rx(
            StrVector(self.axis_names))

        info("Generated Search Space:")
        info(str(self.base.summary_default(search_space_dataframe_r)))

        coded_search_space_dataframe_r = self.encode_data(
            search_space_dataframe_r)

        return coded_search_space_dataframe_r
    def __init__(self, ytrue, ypred, cutoff=None, cutoffvariable="threshold"):
        """Compute ROC statistics for predictions using the R package pROC.

        Args:
            ytrue: True labels.
            ypred: Predicted scores.
            cutoff: Operating point passed to pROC's ``coords``; the
                ``"best"`` point is used when ``None``.
            cutoffvariable: Kind of value ``cutoff`` refers to. When it is
                ``"threshold"`` and a cutoff is given, ``self.binary``
                holds the predictions binarised with ``x > cutoff``.

        Sets ``auc``, ``ci``, ``lowci``, ``highci``, ``binary`` and the six
        cutoff metrics ``threshold``, ``specificity``, ``sensitivity``,
        ``accuracy``, ``ppv`` and ``npv``.
        """
        self.ytrue = ytrue
        self.ypred = ypred

        self.aucobj = self._get_auc_obj()
        self.auc = self.aucobj[8][0]
        self.ci = rpackage_pROC.ci(self.aucobj, x="best")
        self.lowci, self.highci = self.ci[0], self.ci[1]
        self.binary = None

        metric_names = ("threshold", "specificity", "sensitivity",
                        "accuracy", "ppv", "npv")
        operating_point = "best" if cutoff is None else cutoff
        cutoffmetrics = rpackage_pROC.coords(self.aucobj,
                                             operating_point,
                                             cutoffvariable,
                                             ret=StrVector(list(metric_names)))

        if cutoff is not None and cutoffvariable == "threshold":
            self.binary = [1 if score > cutoff else 0 for score in ypred]

        # The metrics come back in the order they were requested.
        for position, metric in enumerate(metric_names):
            setattr(self, metric, cutoffmetrics[position])
Ejemplo n.º 23
0
def install_telescope():
    '''
    Install the R "telescope" package from its GitHub master archive.

    Selects CRAN mirror index 1, installs the ``devtools`` and ``remotes``
    R packages, then calls ``remotes::install_url`` on the archive URL.

    NOTE: this function has not been tested and is not guaranteed to work.
    '''
    from rpy2.robjects import StrVector
    r_utils = rpackages.importr('utils')
    r_utils.chooseCRANmirror(ind=1)
    prerequisites = StrVector(('devtools', 'remotes'))
    r_utils.install_packages(prerequisites)
    rpy2.robjects.r('remotes::install_url(url="https://github.com/DescartesResearch/telescope/archive/master.zip", INSTALL_opt= "--no-multiarch")')
Ejemplo n.º 24
0
def bargraph_variation_diff(cfg, values):
    """Plot the relative execution-time change of expert vs. standard code.

    For each of the pairs ``seq``/``expertseq`` and ``par``/``expertpar`` a
    bar chart is written to ``bargraph-executiontime-diff-<standard>.pdf``.
    Each bar is ``mean(expert) / mean(standard) - 1`` for one
    problem/language combination.

    Args:
        cfg: Object exposing ``languages`` and ``problems`` iterables.
        values: Nested mapping indexed as
            ``values[problem][variation][language][0]``; that entry is
            turned into a ``FloatVector`` and averaged with R's ``mean``.
    """
    r = robjects.r

    for (standard, expert) in [('seq', 'expertseq'), ('par', 'expertpar')]:
        langs = []
        probs = []
        diffs = []
        for lang in cfg.languages:
            for prob in cfg.problems:
                data = FloatVector(values[prob][standard][lang][0])
                data_expert = FloatVector(values[prob][expert][lang][0])

                mean = r['mean'](data)[0]
                mean_expert = r['mean'](data_expert)[0]
                diff = (float(mean_expert) / float(mean) - 1)

                langs.append(pretty_langs[lang])
                probs.append(prob)
                diffs.append(diff)

        r.pdf('bargraph-executiontime-diff-' + standard + '.pdf',
              height=pdf_height(),
              width=pdf_width())
        # Always close the device: if building or drawing the plot fails,
        # an open PDF device would otherwise capture later plots.
        try:
            df = robjects.DataFrame({
                'Language': StrVector(langs),
                'Problem': StrVector(probs),
                'Difference': FloatVector(diffs),
            })

            gp = ggplot2.ggplot(df)

            pp = gp + \
                ggplot2.aes_string(x='Problem', y='Difference', fill='Language') + \
                ggplot2.geom_bar(position='dodge', stat='identity') + \
                ggplot2_options() + \
                ggplot2_colors() + \
                robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
                robjects.r('ylab("Execution time difference (in percent)")') + \
                robjects.r('scale_y_continuous(labels = percent_format())')
            pp.plot()
        finally:
            r['dev.off']()
Ejemplo n.º 25
0
def bargraph_variation_norm(results):
    """Plot lines of code per language, normalised to the smallest entry.

    For every entry of the module-level ``variations`` a bar chart is
    written to ``bargraph-loc-var-norm-<variation>.pdf``. Within each
    problem, every language's value is divided by the smallest value found
    for that problem and variation.

    Args:
        results: Mapping keyed by ``(language, problem, variation)`` whose
            values can be converted with ``float``.
    """
    r = robjects.r

    for variation in variations:
        langs = []
        probs = []
        locs = []
        for problem in problems:
            # Convert before taking the minimum so the comparison is
            # numeric even if the counts are not stored as numbers.
            counts = {
                lang: float(results[(lang, problem, variation)])
                for lang in languages
            }
            smallest = min(counts.values())

            for lang, count in counts.items():
                langs.append(pretty_langs[lang])
                probs.append(problem)
                locs.append(count / smallest)

        r.pdf('bargraph-loc-var-norm-' + variation + '.pdf',
              height=pdf_height(),
              width=pdf_width())
        # Always close the device: if building or drawing the plot fails,
        # an open PDF device would otherwise capture later plots.
        try:
            df = robjects.DataFrame({
                'Language': StrVector(langs),
                'Problem': StrVector(probs),
                'Lines': FloatVector(locs),
            })

            gp = ggplot2.ggplot(df)

            pp = gp + \
                ggplot2.aes_string(x='Problem', y='Lines', fill='Language') + \
                ggplot2.geom_bar(position='dodge', stat='identity') + \
                ggplot2_options() + \
                ggplot2_colors() + \
                robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
                robjects.r('ylab("Lines of Code (normalized to smallest)")')
            pp.plot()
        finally:
            r['dev.off']()
Ejemplo n.º 26
0
def tupls2RDataframe(data, titles):
    """Turn an iterable of row tuples into an R data frame of factors.

    Args:
        data: Iterable of rows; element ``i`` of each row goes into the
            column named ``titles[i]``.
        titles: Column names, one per column.

    Returns:
        A ``DataFrame`` whose columns are ``FactorVector`` objects built
        from the string form of each column.
    """
    columns = [[] for _ in titles]
    for row in data:
        for position, value in enumerate(row):
            columns[position].append(value)

    factor_columns = {}
    for title, column in zip(titles, columns):
        as_strings = StrVector(tuple(column))
        factor_columns[title] = FactorVector(as_strings)
    return DataFrame(factor_columns)
Ejemplo n.º 27
0
def bspl(Pi, Xi, nbas=20, fdn=('Temperature', 'Salinity')):
    """Fit a functional data object to ``Xi`` with ``fda.Data2fd``.

    Args:
        Pi: Sequence of argument values. Its first and last entries are
            passed to ``create_basis`` and the whole sequence is used as
            ``argvals``.
        Xi: Observations, converted to an R object through the numpy
            converter. NOTE(review): the summary printed below reads three
            dimensions from the result, so a 3-D array is presumably
            expected -- confirm.
        nbas: Third argument to ``create_basis`` (presumably the number of
            basis functions -- confirm against that helper).
        fdn: Iterable of names appended to ``'Level'`` and ``'Station'`` to
            form ``fdnames``. Any iterable is accepted; the default is a
            tuple so no mutable object is shared between calls.

    Returns:
        The object returned by ``fda.Data2fd``.
    """
    basis = create_basis(Pi[0], Pi[-1], nbas)
    with localconverter(numpy2ri.converter) as cv:
        Xi_R = cv.py2rpy(Xi)
    fdobj = fda.Data2fd(argvals=FloatVector(Pi),
                        y=Xi_R,
                        basisobj=basis,
                        fdnames=StrVector(['Level', 'Station'] + list(fdn)))
    # The first component of the result is inspected only for reporting.
    size = np.array(fdobj[0]).shape
    print("{0} B-splines computed for {1} variables.".format(size[1], size[2]))
    return fdobj
Ejemplo n.º 28
0
def mem_usage_graph(cfg):
    """Plot memory usage per problem and language, faceted by variation.

    For every variation/language/problem combination the file named by
    ``get_mem_output(lang, prob, var)`` is opened and its first line parsed
    as a float. All values are drawn into ``bargraph-memusage.pdf``.

    Args:
        cfg: Object exposing ``variations``, ``languages`` and ``problems``
            iterables.
    """
    r = robjects.r
    varis = []
    langs = []
    probs = []
    mems = []
    for var in cfg.variations:
        for lang in cfg.languages:
            for prob in cfg.problems:
                mem_filename = get_mem_output(lang, prob, var)
                with open(mem_filename, 'r') as mem_file:
                    mem = mem_file.readline()
                    mems.append(float(mem))
                varis.append(pretty_varis[var])
                langs.append(pretty_langs[lang])
                probs.append(prob)

    # memory usage is a simple histogram with all information in one graph.
    r.pdf('bargraph-memusage.pdf', height=pdf_height(), width=pdf_width())
    # Always close the device: if building or drawing the plot fails, an
    # open PDF device would otherwise capture later plots.
    try:
        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Variation': StrVector(varis),
            'Mem': FloatVector(mems)
        })

        gp = ggplot2.ggplot(df)

        # we rotate the x labels to make sure they don't overlap
        pp = gp + \
            ggplot2.opts(**{'axis.text.x': ggplot2.theme_text(angle=90, hjust=1)}) + \
            ggplot2.aes_string(x='Problem', y='Mem', fill='Language') + \
            ggplot2.geom_bar(position='dodge', stat='identity') + \
            ggplot2.facet_wrap('Variation') + \
            ggplot2_options() + \
            ggplot2_colors() + \
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
            robjects.r('ylab("Memory usage (in bytes)")')

        pp.plot()
    finally:
        r['dev.off']()
Ejemplo n.º 29
0
def bargraph_variation_diff(results):
    """Plot the relative lines-of-code change of expert vs. standard code.

    For each of the pairs ``seq``/``expertseq`` and ``par``/``expertpar`` a
    bar chart is written to ``bargraph-loc-diff-<standard>.pdf``. Each bar
    is ``expert / standard - 1`` for one language/problem combination taken
    from the module-level ``languages`` and ``problems``.

    Args:
        results: Mapping keyed by ``(language, problem, variation)`` whose
            values can be converted with ``float``.
    """
    r = robjects.r

    for (standard, expert) in [('seq', 'expertseq'), ('par', 'expertpar')]:
        langs = []
        probs = []
        diffs = []
        for lang in languages:
            for prob in problems:
                loc = results[(lang, prob, standard)]
                loc_expert = results[(lang, prob, expert)]
                diff = (float(loc_expert) / float(loc) - 1)

                langs.append(pretty_langs[lang])
                probs.append(prob)
                diffs.append(diff)

        r.pdf('bargraph-loc-diff-' + standard + '.pdf',
              height=pdf_height(),
              width=pdf_width())
        # Always close the device: if building or drawing the plot fails,
        # an open PDF device would otherwise capture later plots.
        try:
            df = robjects.DataFrame({
                'Language': StrVector(langs),
                'Problem': StrVector(probs),
                'Difference': FloatVector(diffs),
            })

            gp = ggplot2.ggplot(df)

            pp = gp + \
                ggplot2.aes_string(x='Problem', y='Difference', fill='Language') + \
                ggplot2.geom_bar(position='dodge', stat='identity') + \
                ggplot2_options() + \
                ggplot2_colors() + \
                robjects.r('ylab("Lines of code difference (in percent)")') + \
                robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') + \
                robjects.r('scale_y_continuous(labels = percent_format())')
            pp.plot()
        finally:
            r['dev.off']()
Ejemplo n.º 30
0
    def ro(self):
        """Expose a view as RObject, to be manipulated in R environment.

        ``self.iloc`` is converted as follows: a dict is wrapped in a
        ``ListVector``; otherwise it is flattened in column-major order
        into a ``FloatVector``, ``IntVector`` or ``StrVector`` depending on
        its dtype. If ``self.dim`` has more than one entry the vector is
        reshaped into an R array, and any ``rownames``, ``colnames`` and
        ``names`` attributes present on ``self`` are copied onto the result.
        """
        # Convert to R vector of correct data type.
        # The dict test must head the same if/elif chain as the dtype
        # tests: as a separate `if`, a dict fell through to the chain
        # below, whose branches all call `.reshape`, which a dict lacks.
        if isinstance(self.iloc, dict):
            # NOTE(review): iterating a dict yields its keys -- confirm
            # that keys, not values, are what should be wrapped here.
            out = ListVector([(None, PyR(v).ro) for v in self.iloc])
        elif types.is_float_dtype(self.iloc):
            out = FloatVector(self.iloc.reshape(-1, order='F'))
        elif types.is_integer_dtype(self.iloc):
            out = IntVector(self.iloc.reshape(-1, order='F'))
        else:
            out = StrVector(self.iloc.reshape(-1, order='F'))
        if len(self.dim) > 1:  # reshape to R Array if has non-trivial dim
            out = ro.r.array(out, dim=IntVector(self.dim))

        # Collect R object name attributes
        if hasattr(self, 'rownames'):
            out.rownames = StrVector(self.rownames)
        if hasattr(self, 'colnames'):
            out.colnames = StrVector(self.colnames)
        if hasattr(self, 'names'):
            out.names = ListVector(self.names) if isinstance(
                self.names, ListVector) else StrVector(self.names)
        return out