Example #1
    def get_federov_data(self, factors):
        low_level_limits  = IntVector([self.parameter_ranges[f][0] for f in factors])
        high_level_limits = IntVector([self.parameter_ranges[f][1] - 1 for f in factors])
        factor_centers    = IntVector([0 for f in factors])
        factor_levels     = IntVector([self.parameter_ranges[f][1] for f in factors])
        factor_round      = IntVector([0 for f in factors])
        is_factor         = BoolVector([False for f in factors])
        mix               = BoolVector([False for f in factors])

        opt_federov_data = {
                             "var": StrVector(factors),
                             "low": low_level_limits,
                             "high": high_level_limits,
                             "center": factor_centers,
                             "nLevels": factor_levels,
                             "round": factor_round,
                             "factor": is_factor,
                             "mix": mix
                           }

        opt_federov_dataframe = DataFrame(opt_federov_data)
        opt_federov_dataframe = opt_federov_dataframe.rx(StrVector(["var",
                                                                   "low",
                                                                   "high",
                                                                   "center",
                                                                   "nLevels",
                                                                   "round",
                                                                   "factor",
                                                                   "mix"]))
        return opt_federov_dataframe
def draw_hist(length,pdfname='hist.pdf',b=25,m=700,wd=8,hd=6): #length = d5
#    per-bin counts
#    for z1, z2 in groupby(sorted(length), key=lambda x: x//5):
#        print('{}-{}: {}'.format(z1*5, (z1+1)*5-1, len(list(z2))))
#    plotting with matplotlib
#    lenths = array(length)
#    pyplot.hist(x=lenths,bins=50)
#    pyplot.xlabel('Sequence Length')
#    pyplot.xlim(400,500)
#    pyplot.ylabel('Sequence Number')
#    pyplot.title('Sequence Length Distribution')
#    pyplot.show()    
    robjects.globalenv["dd"] = IntVector(length)
    robjects.globalenv["nm"] = StrVector([pdfname])
    robjects.globalenv["b"] = IntVector([b])
    robjects.globalenv["m"] = IntVector([m])
    robjects.globalenv["wd"] = IntVector([wd])
    robjects.globalenv["hd"] = IntVector([hd])
    
    r_script = '''
    library(ape)
    pdf(nm,width=wd,height=hd)
    xcol=seq(0,m,b)

    hist(dd,freq=TRUE,breaks=xcol,col='#228B22',xlab='Sequence Length',ylab='Sequence number',main='Distribution of Sequence Length')
    dev.off()       
    '''
    robjects.r(r_script)
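
# A hedged usage sketch for draw_hist (assumes the snippet's rpy2 imports are
# in scope and R can open a pdf device); the data below are illustrative only.
import random

lengths = [random.randint(0, 700) for _ in range(1000)]  # stay within seq(0, m, b)
draw_hist(lengths, pdfname='lengths.pdf', b=25, m=700)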
    def _mark_timestamp(self, blSegsL):
        """
        mark segs in final sample
        """
        # Solve this part with R.

        # First, compute the set of baseline differences between adjacent
        # data points,
        #
        # or simply list all baselines.

        # Then, map each non-baseline data point onto these baseline
        # differences to determine which one it belongs to,
        #
        # or find the largest index that falls within a baseline.

        # Finally, the first data point to fall into a baseline difference
        # is the target timestamp;
        #
        # use that index as the timestamp.

        from rpy2.robjects.packages import importr
        from rpy2.robjects import IntVector, StrVector, globalenv
        import rpy2.robjects as robjects

        GR = importr('GenomicRanges')
        IR = importr('IRanges')

        GRL = GR.GRangesList()
        globalenv["GRL"] = GRL
        for blSegs, idx in zip(blSegsL, range(len(blSegsL))):
            chromNames = StrVector([seg.chromName for seg in blSegs])
            starts = IntVector([seg.start for seg in blSegs])
            ends = IntVector([seg.end for seg in blSegs])
            tempGR = GR.GRanges(seqnames = chromNames, ranges=IR.IRanges(starts, ends))
            globalenv["tempGR"] = tempGR
            robjects.r("GRL[[{0}]]=tempGR".format(str(idx+1)))
            GRL = robjects.r["GRL"]

        # The list holds references to the target Seg objects, so updating
        # nonBlSegs in place is sufficient
        nonBlSegs = list(set(self._segPoolL[-1].segments) - set(blSegsL[-1]))
        chromNames = StrVector([seg.chromName for seg in nonBlSegs])
        starts = IntVector([seg.start for seg in nonBlSegs])
        ends = IntVector([seg.end for seg in nonBlSegs])
        nonBlGR = GR.GRanges(seqnames = chromNames, ranges=IR.IRanges(starts, ends))

        # fo = IR.findOverlaps(nonBlGR, GRL)
        # For large SCNA
        fo = IR.findOverlaps(nonBlGR, GRL, minoverlap=5000)
        globalenv["fo"] = fo
        robjects.reval("fom <- as.matrix(fo)")
        overlapIdx = np.array(list(robjects.r.fom)).reshape(tuple(reversed(robjects.r.fom.dim))) - 1
        # [[2, 2, 3, 3],
        # [1, 2, 1, 2]]
        #
        print(overlapIdx)

        for index in set(overlapIdx[0,]):
            yIdxes = np.where(overlapIdx[0,]==index)[0]
            ts = np.max(overlapIdx[1,yIdxes]+1)
            nonBlSegs[index].tag = str(ts)
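
# A minimal, hedged sketch of the overlap query used above (mirrors the calls
# in _mark_timestamp; assumes the GenomicRanges/IRanges R packages are installed).
from rpy2.robjects.packages import importr
from rpy2.robjects import IntVector, StrVector

GR = importr('GenomicRanges')
IR = importr('IRanges')

# query: two intervals on chr1; subject: one interval overlapping both
query = GR.GRanges(seqnames=StrVector(["chr1", "chr1"]),
                   ranges=IR.IRanges(IntVector([1, 100]), IntVector([50, 200])))
subject = GR.GRanges(seqnames=StrVector(["chr1"]),
                     ranges=IR.IRanges(IntVector([40]), IntVector([120])))
hits = IR.findOverlaps(query, subject)  # a Hits object: query/subject index pairs
print(hits)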
Example #4
def cox(hidden, survival, epoch, method='MG'):
    def clustering(hidden, method):
        if method == 'KNN':
            clf = cluster.KMeans(n_clusters=3)
            clf.fit(hidden)
            return clf.predict(hidden)

        if method == 'MG':
            clf = mixture.BayesianGaussianMixture(n_components=10,
                                                  n_init=10,
                                                  max_iter=500,
                                                  covariance_type='full')
            clf.fit(hidden)
            return clf.predict(hidden)

    predicts = clustering(hidden, method)
    T1, E1, G1 = [], [], []

    #print(predicts)
    unique, counts = np.unique(predicts, return_counts=True)

    #print(np.asarray((unique, counts/predicts.shape[0])).T)
    # easier to grep
    clusters = np.asarray((unique, counts / predicts.shape[0])).T
    for i in clusters:
        print('Epoch {} # CLUSTER: {} '.format(epoch, i))

    sscore = silhouette_score(hidden, predicts)
    print('Epoch {}  # SILHOUETTE:  {}'.format(epoch, sscore))
    for i in range(len(predicts)):
        T1.append(survival[i, -2])
        E1.append(survival[i, -1])
    #temp = np.array(T1)
    #print(temp)
    #print(temp.astype(int))
    # print(T1)
    # print('\n')
    # print(E1)
    # print('\n')
    # print(predicts)
    info = pd.DataFrame({'status': T1, 'survive': E1, 'clusters': predicts})
    ratio = 0
    formula = Formula('x~y')
    env = formula.environment
    env['y'] = IntVector(predicts)
    env['x'] = surv.Surv(IntVector(np.array(T1).astype(int)),
                         IntVector(np.array(E1).astype(int)))
    result = surv.survdiff(formula)
    p_value = 1 - np.array(stats.pchisq(result[4], len(set(predicts)) - 1))
    # R pvalue

    #result = multivariate_logrank_test(np.array(T1),  np.array(predicts).astype(int), np.array(E1) )

    #p_value = result.p_value
    return p_value, ratio, info
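
# A hedged smoke test for cox() with synthetic inputs (assumes the snippet's
# imports are in scope: sklearn's cluster/mixture, silhouette_score, pandas,
# numpy, rpy2's Formula/IntVector, stats = importr('stats') and
# surv = importr('survival')); shapes and values are illustrative.
import numpy as np

hidden = np.random.randn(90, 8)                        # latent representation
times = np.random.exponential(10, 90).astype(int) + 1  # survival[:, -2]
events = np.random.randint(0, 2, 90)                   # survival[:, -1]
survival = np.column_stack([times, events])
p_value, ratio, info = cox(hidden, survival, epoch=0, method='MG')
print(p_value)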
Example #5
def bargraph_language(results):
    r = robjects.r

    for language in languages:
        varis = []
        probs = []
        locs = []
        for (lang, prob, var) in results.keys():
            if lang == language:
                loc = results[(lang, prob, var)]
                varis.append(pretty_varis[var])
                probs.append(prob)
                locs.append(loc)
        r.pdf('bargraph-loc-lang-' + language + '.pdf',
              height=pdf_height(),
              width=pdf_width())
        df = robjects.DataFrame({
            'Variation': StrVector(varis),
            'Problem': StrVector(probs),
            'Lines': IntVector(locs),
        })

        #print (df)
        gp = ggplot2.ggplot(df)

        pp = gp + \
            ggplot2.aes_string (x='Problem', y='Lines', fill='Variation') + \
            ggplot2.geom_bar (position='dodge', stat='identity') + \
            ggplot2_options () + \
            ggplot2_colors () + \
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') +\
            robjects.r('ylab("Lines of Code")')
        pp.plot()
        r['dev.off']()
def getNonParametricPValue(labels, values, random_seed=0, printResults=True):
    '''Markers localization p-value calculation:
    Poisson pseudo-maximum likelihood estimation (PPML) by J.M.C. Santos Silva & Silvana Tenreyro, 2006.
    Implemented in R in "gravity: Estimation Methods for Gravity Models" at:
    https://rdrr.io/cran/gravity/man/ppml.html
    '''

    np.random.seed(random_seed)
    #np.random.shuffle(labels)

    dataf = DataFrame({
        'label': IntVector(tuple(labels)),
        'distance': FloatVector(tuple(values))
    })
    fit = R(
        'function(x) ppml(dependent_variable="label", distance="distance", additional_regressors=NULL, robust=TRUE, data=x)'
    )(dataf)

    # Deviance is -2.*log_likelihood
    altDeviance = list(fit[9].items())[0][1]
    nullDeviance = list(fit[11].items())[0][1]
    p_value = scipy.stats.chi2.sf(nullDeviance - altDeviance, 1)

    if printResults:
        print(
            'Non-parametric method:',
            '\n\t',
            #'  Robust PPML based (a.k.a. QMLE) deviances and their difference test (chi2 p-value):\n\t',
            #'Null deviance:\t', np.round(nullDeviance, 1), '\n\t',
            #'Alternative deviance:\t', np.round(altDeviance, 1), '\n\t',
            'p-value:\t',
            '%.1e' % p_value,
            '\n')

    return p_value
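
# A hedged usage sketch with synthetic data (assumes R's gravity package is
# installed; ppml needs strictly positive distances).
import random

labels = [random.randint(0, 1) for _ in range(200)]
distances = [random.random() + 0.1 for _ in range(200)]  # keep distances > 0
p = getNonParametricPValue(labels, distances)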
Example #7
def plot_bar(stats, output_file=None, **kw):
    names = [r['name'] for r in list(stats.values())[0][0]]
    with_rates = [r['with_rate'] for r in list(stats.values())[0][0]]
    names = [n + ('+Gamma' if w else '') for n, w in zip(names, with_rates)]

    by_dir = defaultdict(list)
    for triad in stats:
        for r in stats[triad]:
            by_dir[r[0]['from_directory']].append(r)

    for d in by_dir:
        by_dir[d] = zip(*[[gs_p(_r['gs_p']) for _r in r] for r in by_dir[d]])

    runs = []
    g_stats = []
    data = []
    alpha = 0
    for d, v in by_dir.items():
        if 'exons' in d.split('/'):
            dataset = 'Nuclear'
        elif 'mtDNA' in d.split('/'):
            dataset = 'Mitochondrial'
        else:
            dataset = 'Microbial'
        print(dataset)
        for j, g in enumerate(v):
            g_stats += g
            data += [dataset] * len(g)
            runs += [j] * len(g)
            print(names[j], sum(1 for _g in g if _g > 0.05) / len(g))
            alpha = max(alpha, get_alpha(g))
        print('Samples', len(g))
    labels = 'expression(' + ','.join(names) + ')'

    df = DataFrame({
        'run': IntVector(runs),
        'g_stat': FloatVector(g_stats),
        'data': StrVector(data)
    })
    globalenv['df'] = df
    R('library(scales)')
    #            'geom_jitter(alpha=0.2, size=1) + ' + \
    #            'geom_boxplot(fill=NA, outlier.size=0, size=1.5, color=alpha("white", 0.5)) + ' + \
    #            'geom_boxplot(alpha=0.8, outlier.size=0) + ' + \
    #            'geom_hline(yintercept=0.05, size=1.5, alpha=0.5, color="white") + ' + \
    #            'geom_hline(yintercept=0.05, color="black") + ' + \
    cmd = 'gg <- ggplot(df, aes(factor(run), g_stat)) + ' + \
            'ylab("Goodness-of-Fit p-value") + xlab("Model") + ' + \
            'geom_boxplot(outlier.size=1, outlier.colour=alpha("black",'+str(alpha)+')) + ' + \
            'scale_x_discrete(labels=' + labels + ') + ' + \
            'theme(axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5)) + ' + \
            'facet_grid(. ~ data)'
    R(cmd)
    if output_file:
        R('ggsave("' + output_file + '", gg, width=5, height=5)')
    else:
        print(R['gg'])
        input('Press Enter to continue...')
Example #8
    def generate_valid_sample(self, sample_size):
        search_space_dataframe = {}

        for n in self.axis_names:
            search_space_dataframe[n] = []

        search_space = {}
        evaluated = 0

        info(
            "Generating valid search space of size {0} (does not spend evaluations)"
            .format(sample_size))

        while len(search_space) < sample_size:
            candidate_point = self.getRandomCoord()
            candidate_point_key = str(candidate_point)
            evaluated += 1

            if candidate_point_key not in search_space:
                perf_params = self.coordToPerfParams(candidate_point)

                is_valid = eval(self.constraint, copy.copy(perf_params),
                                dict(self.input_params))

                if is_valid:
                    search_space[candidate_point_key] = candidate_point

                    for n in perf_params:
                        candidate_value = self.parameter_values[n].index(
                            perf_params[n])
                        search_space_dataframe[n].append(candidate_value)

                    if len(search_space) % max(1, sample_size // 10) == 0:
                        info("Valid coordinates: " + str(len(search_space)) +
                             "/" + str(sample_size))
                        info("Tested coordinates: " + str(evaluated))

                if evaluated % 1000000 == 0:
                    info("Tested coordinates: " + str(evaluated))

        info("Valid/Tested configurations: " + str(len(search_space)) + "/" +
             str(evaluated))

        for k in search_space_dataframe:
            search_space_dataframe[k] = IntVector(search_space_dataframe[k])

        search_space_dataframe_r = DataFrame(search_space_dataframe)
        search_space_dataframe_r = search_space_dataframe_r.rx(
            StrVector(self.axis_names))

        info("Generated Search Space:")
        info(str(self.base.summary_default(search_space_dataframe_r)))

        coded_search_space_dataframe_r = self.encode_data(
            search_space_dataframe_r)

        return coded_search_space_dataframe_r
Example #9
def draw_hist(length,pdfname='hist.pdf',b=5,m=700,wd=8,hd=6): #length = d5
  
    robjects.globalenv["dd"] = IntVector(length)
    robjects.globalenv["nm"] = StrVector([pdfname])
    robjects.globalenv["b"] = IntVector([b])
    robjects.globalenv["m"] = IntVector([m])
    robjects.globalenv["wd"] = IntVector([wd])
    robjects.globalenv["hd"] = IntVector([hd])
    
    r_script = '''
    library(ape)
    pdf(nm,width=wd,height=hd)
    xcol=seq(0,m,b)

    hist(dd,freq=TRUE,breaks=xcol,col='#228B22',xlab='Sequence Length',ylab='Sequence number',main='Distribution of Sequence Length')
    dev.off()       
    '''
    robjects.r(r_script)
Example #10
def rank_abundance_data(counter):
    n = len(counter)
    ranks = IntVector(range(1, n + 1))
    counts = [c for (i, c) in counter.most_common()]
    counts_sum = sum(counts)
    fracs_arr = [(c / counts_sum) for c in counts]
    fracs = FloatVector(fracs_arr)

    return ranks, fracs
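
# A usage sketch, assuming a collections.Counter of per-species counts.
from collections import Counter

counter = Counter({'sp1': 50, 'sp2': 30, 'sp3': 20})
ranks, fracs = rank_abundance_data(counter)
print(list(ranks))  # [1, 2, 3]
print(list(fracs))  # [0.5, 0.3, 0.2]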
Example #11
def loess_fromR(x, y, f, d=2):
    x_vector = IntVector(x)
    y_vector = FloatVector(y)
    globalenv["x_vector"] = x_vector
    globalenv["y_vector"] = y_vector
    globalenv["f"] = f
    a = round(f, 2) if round(f, 2) > 0.0 else f
    model = stats.loess('y_vector~x_vector', span=a, degree=d)
    return model.rx2('fitted')
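
# A hedged usage sketch (assumes the snippet's imports: globalenv, IntVector,
# FloatVector and stats = importr('stats')); the data are illustrative.
import math
import random

x = list(range(1, 101))
y = [math.sin(xi / 10.0) + random.gauss(0, 0.1) for xi in x]
smoothed = loess_fromR(x, y, f=0.3)
print(list(smoothed)[:5])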
Example #12
    def ro(self):
        """Expose a view as RObject, to be manipulated in R environment"""
        # Convert to R vector of correct data type
        if isinstance(self.iloc, dict):
            out = ListVector([(None, PyR(v).ro) for v in self.iloc])
        elif types.is_float_dtype(self.iloc):
            out = FloatVector(self.iloc.reshape(-1, order='F'))
        elif types.is_integer_dtype(self.iloc):
            out = IntVector(self.iloc.reshape(-1, order='F'))
        else:
            out = StrVector(self.iloc.reshape(-1, order='F'))
        if len(self.dim) > 1:  # reshape to R Array if has non-trivial dim
            out = ro.r.array(out, dim=IntVector(self.dim))

        # Collect R object name attributes
        if hasattr(self, 'rownames'):
            out.rownames = StrVector(self.rownames)
        if hasattr(self, 'colnames'):
            out.colnames = StrVector(self.colnames)
        if hasattr(self, 'names'):
            out.names = ListVector(self.names) if isinstance(
                self.names, ListVector) else StrVector(self.names)
        return out
Example #13
def estimate_cpu_writes(ops_observed):
    stats = importr('stats')

    ops = IntVector([
        5, 10, 25, 50, 75, 100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000,
        3500, 4000, 4500, 5000
    ])
    cpu_used = FloatVector([  # fractional values need FloatVector, not IntVector
        1.5733, 1.5944, 2.09, 2.346, 2.488, 2.596, 4.925, 6.956, 9.02, 10.75,
        16.06, 20.74, 25.2100192678, 30.2011560694, 35.0838150289,
        38.0040540541, 42.8837545126, 47.6525096525
    ])
    TYPE = StrVector(["monoH.FC"])

    ro.globalenv["ops"] = ops
    ro.globalenv["cpu_used"] = cpu_used
    ro.globalenv["TYPE"] = TYPE

    splinefun = ro.r['splinefun']
    sp_w = splinefun(ops, cpu_used, TYPE)

    res = float(sp_w(ops_observed).r_repr())
    print('\ncpu_writes: ' + str(res))
    return res
Example #14
    def python_type_to_R_type(cls, pobject=None):
        if isinstance(pobject,(list, np.ndarray, pd.Series)):
            if isinstance(pobject,(list, pd.Series)):
                pobject = np.array(pobject)

            if re.match('^int',pobject.dtype.name) is not None:
                return IntVector(pobject)
            elif re.match('^float',pobject.dtype.name) is not None:
                return FloatVector(pobject)
            elif re.match('^str',pobject.dtype.name) is not None:
                return StrVector(pobject)
            elif re.match('^bool',pobject.dtype.name) is not None:
                return StrVector(pobject)
            else:
                return pobject
        else:
            return pobject
Example #15
def bargraph_language():
    r = robjects.r

    for language in languages:
        varis = []
        probs = []
        times = []
        for prob in problems:
            for var in variations:
                try:
                    time = result[language][prob][var]
                except KeyError:
                    time = 0

                # for the expert times, add expert and non-expert times together
                if var.startswith('expert'):
                    try:
                        time = time + result[language][prob][var.replace(
                            'expert', '')]
                    except KeyError:
                        pass

                varis.append(pretty_varis[var])
                probs.append(prob)
                times.append(time)
        r.pdf('bargraph-codingtime-lang-' + language + '.pdf',
              height=pdf_height(),
              width=pdf_width())
        df = robjects.DataFrame({
            'Variation': StrVector(varis),
            'Problem': StrVector(probs),
            'Time': IntVector(times),
        })

        #print (df)
        gp = ggplot2.ggplot(df)

        pp = gp + \
            ggplot2.aes_string (x='Problem', y='Time', fill='Variation') + \
            ggplot2.geom_bar (position='dodge', stat='identity') + \
            ggplot2_options () + \
            ggplot2_colors () + \
            robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))') +\
            robjects.r('ylab("Coding time (in minutes)")')
        pp.plot()
        r['dev.off']()
Example #16
    def create_variables(self, variables=None):
        """ 在R中创建变量

        :param dict,pd.Dataframe variables: 变量数据
        :return: 无返回值
        """
        # 解析变量名,存入variable_names
        if isinstance(variables, dict):
            variable_names = variables.keys()
        elif isinstance(variables, pd.DataFrame):
            variable_names = list(variables.columns)
        else:
            print('vars is not dict or dataframe!')
            variable_names = []

        for var_name in variable_names:
            if isinstance(variables[var_name], (list,pd.Series)):
                if isinstance(variables[var_name][0],(int,np.int8)):
                    self.R.globalenv[var_name] = IntVector(variables[var_name])
                if isinstance(variables[var_name][0],(float,np.float32)):
                    self.R.globalenv[var_name] = FloatVector(variables[var_name])
            else:
                self.R.globalenv[var_name] = variables[var_name]
    import rpy2
    exact = importr("exact2x2")

    INF = float('inf')
    fstring = '%.' + str(pdecimal) + 'f'
    estring = '%.' + str(pdecimal) + 'e'
    # print(fstring)
    for line in sys.stdin:
        line = line.strip()
        if line:
            ss = line.split()
            try:
                c1 = [int(ss[x]) for x in t1Index]
                c2 = [int(ss[x]) for x in t2Index]
                # print(IntVector(c1+c2))
                m = rpy2.robjects.r.matrix(IntVector(c1 + c2),
                                           nrow=2,
                                           byrow="T")
                # print(ci_alpha)
                r = exact.exact2x2(m,
                                   tsmethod="minlike",
                                   alternative=alternative,
                                   conf_level=ci_alpha)

                out = []
                out.append(r[r.names.index('p.value')][0])
                out.append(r[r.names.index('estimate')][0])
                out.append(r[r.names.index('conf.int')][0])
                out.append(r[r.names.index('conf.int')][1])

                if out[1] == 0 or out[1] == INF:
Example #18
import rpy2.robjects as R
from rpy2.robjects import IntVector

# Ten guinea pigs underwent a fattening treatment with a certain feed.
# Their weights in grams before and after the test are given below
# (the weights are assumed to come from normal distributions).
# At 1% significance, can we conclude that the feed increased the animals' mean weight?

# Before (antes)
# 635, 704, 662, 560, 603, 745, 698, 575, 633, 669
# After (depois)
# 640, 712, 681, 558, 610, 740, 707, 585, 635, 682

# t test
antes = IntVector([635, 704, 662, 560, 603, 745, 698, 575, 633, 669])
depois = IntVector([640, 712, 681, 558, 610, 740, 707, 585, 635, 682])
result = R.r['t.test'](antes,
                       depois,
                       alternative='two.sided',
                       paired=True,
                       **{'conf.level': 0.99})  # t.test takes conf.level, not alpha
import ipdb

ipdb.set_trace()
print(result.rx('p.value')[0][0])
print(result.rx('statistic')[0][0])
print(result.rx('parameter')[0][0])
print(result.rx('estimate')[0][0])

final_result = 1 if result.rx('p.value')[0][0] < 0.01 else 0
Example #19
base = importr('base')
print(dir(base))
print(base.pi)

# Fetch a variable from R (this calls the __getitem__() method)
pi = robjects.r['pi']
print(pi, type(pi), len(pi), pi[0], pi.r_repr(), type(pi.r_repr()), float(pi.r_repr()))

# R vector construction
res = robjects.StrVector(['abc', 'def'])

# R functions: vector, dataframe or matrix arguments must be converted first
#rsort = robjects.r['sort']
rsort = importr('base').sort
# Wrong:rsort([3,2,4])
print(rsort(IntVector([3,2,4])), type(rsort(IntVector([3,2,4]))))
# When calling functions, R's '.' name separator is replaced by '_' in Python
print(base.rank(0, na_last = True))

# An OLS example
# Set up the R environment
stats = importr('stats')
# You can inspect an R function's argument list
print(tuple(stats.rnorm.formals().names))

# Create R-style variables
ctl = FloatVector([4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14])
trt = FloatVector([4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69])

# Call R's gl function
group = base.gl(2, 10, 20, labels = ["Ctl","Trt"])
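
# A hedged completion of the OLS example, mirroring the classic demo from R's
# ?lm help page; the model weight ~ group is an assumption about the intent.
from rpy2.robjects import Formula

fmla = Formula('weight ~ group')
fmla.environment['weight'] = base.c(ctl, trt)  # response: both groups concatenated
fmla.environment['group'] = group              # factor built by base.gl above
fit = stats.lm(fmla)
print(base.summary(fit).rx2('coefficients'))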
Example #20
def line_plot (cfg, var, control, change_name, changing, selector, base_selector, basis):
  speedups = []
  thrds = []
  changes = []
  lowers = []
  uppers = []

  for n in cfg.threads:
    probs.append ('ideal')
    langs.append ('ideal')
    speedups.append (n)
    thrds.append (n)
    changes.append ('ideal')
    lowers.append (n)
    uppers.append (n)
    
  for c in changing:
    sel  = selector (c)

    # sequential base
    base = FloatVector (base_selector(c))
    # base with p = 1
    base_p1 = FloatVector (sel(1))
    # use fastest sequential program
    if basis == 'fastest' and mean (base_p1) < mean(base):
      base = base_p1
    elif basis == 'seq':
      pass
    elif basis == 'p1':
      base = base_p1
      
    for n in cfg.threads:
      ntimes = FloatVector (sel(n))

      # ratio confidence interval
      labels = ['Base'] * r.length(base)[0] + ['N']*r.length (ntimes)[0]
      df = DataFrame ({'Times': base + ntimes, 
                       'Type': StrVector(labels)})
      ratio_test = r['pairwiseCI'] (r('Times ~ Type'), data=df,
                                    control='N',
                                    method='Param.ratio',
                                    **{'var.equal': False,
                                    'conf.level': 0.999})[0][0]

      lowers.append (ratio_test[1][0])
      uppers.append (ratio_test[2][0])

      mn = mean (ntimes)      
      speedups.append (mean(base) / mn)
      # plot slowdowns
      #speedups.append (-mn/base)#(base / mn)
      thrds.append (n)
      if change_name == 'Language':
        changes.append (pretty_langs [c])
      else:
        changes.append (c)

  df = DataFrame ({'Speedup': FloatVector (speedups),
                   'Threads': IntVector (thrds),
                   change_name: StrVector (changes),
                   'Lower': FloatVector (lowers),
                   'Upper': FloatVector (uppers)
                   })
  ideal_changing = ['ideal']
  if change_name == 'Language':
    ideal_changing.extend ([pretty_langs [c] for c in changing])
  else:
    ideal_changing.extend (changing)

  legendVec = IntVector (range (len (ideal_changing)))
  legendVec.names = StrVector (ideal_changing)

  gg = ggplot2.ggplot (df)

  limits = ggplot2.aes (ymax = 'Upper', ymin = 'Lower')
  dodge = ggplot2.position_dodge (width=0.9)

  pp = gg + \
      ggplot2.geom_line() + ggplot2.geom_point(size=3) +\
      ggplot2.aes_string(x='Threads', y='Speedup', 
                         group=change_name, color=change_name, 
                         shape=change_name) + \
      ggplot2.scale_shape_manual(values=legendVec) + \
      ggplot2.geom_errorbar (limits, width=0.25) + \
      ggplot2_options () + \
      ggplot2_colors () + \
      ggplot2.opts (**{'axis.title.x' : ggplot2.theme_text(family = 'serif', face = 'bold', size = 15, vjust=-0.2)}) + \
      robjects.r('ylab("Speedup")') + \
      robjects.r('xlab("Cores")')

      # ggplot2.xlim (min(threads), max(threads)) + ggplot2.ylim(min(threads), max(threads)) +\
  pp.plot()

  r['dev.off']()
Example #21
def as_dataframe(cfg, results, basis):
    r = robjects.r
    varis = []
    langs = []
    probs = []
    times = []
    threads = []

    # speedups, with upper and lower bounds below
    speedups = []
    speedup_lowers = []
    speedup_uppers = []

    ses = []  # standard errors
    mems = []  # memory usage

    langs_ideal = list(cfg.languages)
    langs_ideal.append('ideal')

    probs_ideal = list(cfg.problems)
    probs_ideal.append('ideal')

    for var in cfg.variations:
        for lang in langs_ideal:  # cfg.languages:
            for prob in probs_ideal:  # cfg.problems:
                for thread in cfg.threads:

                    if lang == 'ideal' and prob == 'ideal':
                        continue
                    elif lang == 'ideal' or prob == 'ideal':
                        varis.append(var)
                        langs.append(pretty_langs[lang])
                        probs.append(prob)
                        threads.append(thread)
                        speedups.append(thread)
                        speedup_lowers.append(thread)
                        speedup_uppers.append(thread)
                        times.append(0)
                        ses.append(0)
                        mems.append(0)
                        continue

                    varis.append(var)  # pretty_varis [var])
                    langs.append(pretty_langs[lang])
                    probs.append(prob)
                    threads.append(thread)

                    if var.find('seq') >= 0:
                        thread = cfg.threads[-1]

                    vals = FloatVector(results[thread][prob][var][lang][0])
                    time = mean(vals)
                    times.append(time)

                    #
                    # time confidence interval
                    #
                    t_result = r['t.test'](FloatVector(vals), **{
                        " conf.level": 0.999
                    }).rx('conf.int')[0]
                    ses.append((t_result[1] - t_result[0]) / 2)

                    #
                    # memory usage
                    #
                    mem_filename = get_mem_output(lang, prob, var)
                    with open(mem_filename, 'r') as mem_file:
                        mem = mem_file.readline()
                        mems.append(float(mem))

                    # we include dummy data for the sequential case to avoid the
                    # speedup calculation below
                    if var.find('seq') >= 0:
                        speedups.append(1)
                        speedup_lowers.append(1)
                        speedup_uppers.append(1)
                        continue

                    #
                    # speedup values and confidence intervals
                    #
                    seq_vals = results[cfg.threads[-1]][prob][var.replace(
                        'par', 'seq')][lang][0]

                    # sequential base
                    base = FloatVector(seq_vals)
                    # base with p = 1
                    base_p1 = FloatVector(results[1][prob][var][lang][0])
                    # use fastest sequential program
                    if basis == 'fastest' and mean(base_p1) < mean(base):
                        base = base_p1
                    elif basis == 'seq':
                        pass
                    elif basis == 'p1':
                        base = base_p1

                    labels = ['Base'
                              ] * r.length(base)[0] + ['N'] * r.length(vals)[0]
                    df = DataFrame({
                        'Times': base + vals,
                        'Type': StrVector(labels)
                    })
                    ratio_test = r['pairwiseCI'](r('Times ~ Type'),
                                                 data=df,
                                                 control='N',
                                                 method='Param.ratio',
                                                 **{
                                                     'var.equal': False
                                                 })[0][0]

                    speedups.append(mean(base) / time)
                    speedup_lowers.append(ratio_test[1][0])
                    speedup_uppers.append(ratio_test[2][0])

    df = robjects.DataFrame({
        'Language': StrVector(langs),
        'Problem': StrVector(probs),
        'Variation': StrVector(varis),
        'Threads': IntVector(threads),
        'Time': FloatVector(times),
        'SE': FloatVector(ses),
        'Speedup': FloatVector(speedups),
        'SpeedupLower': FloatVector(speedup_lowers),
        'SpeedupUpper': FloatVector(speedup_uppers),
        'Mem': FloatVector(mems)
    })

    r.assign('df', df)

    r('save (df, file="performance.Rda")')

    # reshape the data to make variation not a column itself, but a part of
    # the other columns describe ie, time, speedup, etc.
    #
    # also, remove the 'ideal' problem as we don't want it in this plot.
    df = r('''
redf = reshape (df, 
                timevar="Variation", 
                idvar = c("Language","Problem","Threads"), 
                direction="wide")
redf$Problem <- factor(redf$Problem, levels = c("randmat","thresh","winnow","outer","product","chain"))
redf[which(redf$Problem != "ideal"),]
''')

    r.pdf('speedup-expertpar-all.pdf', height=6.5, width=10)

    change_name = 'Language'

    legendVec = IntVector(range(len(langs_ideal)))
    legendVec.names = StrVector(langs_ideal)

    gg = ggplot2.ggplot(df)

    limits = ggplot2.aes(ymax='SpeedupUpper.expertpar',
                         ymin='SpeedupLower.expertpar')
    dodge = ggplot2.position_dodge(width=0.9)

    pp = gg + \
        ggplot2.geom_line() + ggplot2.geom_point(size=2.5) +\
        robjects.r('scale_color_manual(values = c("#ffcb7e", "#1da06b", "#b94646", "#00368a", "#CCCCCC"))') +\
        ggplot2.aes_string(x='Threads', y='Speedup.expertpar',
                           group=change_name, color=change_name,
                           shape=change_name) + \
        ggplot2.geom_errorbar (limits, width=0.25) + \
        ggplot2.opts (**{'axis.title.x' : ggplot2.theme_text(family = 'serif', face = 'bold', size = 10, vjust=-0.2),
                         'axis.title.y' : ggplot2.theme_text(family = 'serif', face = 'bold', size = 10, angle=90, vjust=0.2),
                         'axis.text.x' : ggplot2.theme_text(family = 'serif', size = 10),
                         'axis.text.y' : ggplot2.theme_text(family = 'serif', size = 10),
                         'legend.title' : ggplot2.theme_text(family = 'serif', face = 'bold', size = 10),
                         'legend.text' : ggplot2.theme_text(family = 'serif', size = 10),
                         'strip.text.x' : ggplot2.theme_text(family = 'serif', size = 10),
                         'aspect.ratio' : 1,
                         }) + \
        robjects.r('ylab("Speedup")') + \
        robjects.r('xlab("Number of cores")') + \
        ggplot2.facet_wrap ('Problem', nrow = 2)

    pp.plot()

    r['dev.off']()
Example #22
                    utils = importr('dplyr')
                    utils = importr('gravity')
                except Exception as exception:
                    print(exception)
                    import rpy2.robjects.packages as rpackages
                    utils = rpackages.importr('utils')
                    utils = rpackages.importr('gravity')
                    utils.chooseCRANmirror(ind=1)
                    utils.install_packages('dplyr')
                    utils.install_packages('gravity')
                finally:
                    from rpy2.robjects.packages import importr
                    utils = importr('dplyr')
                    utils = importr('gravity')

                dataf = DataFrame({'label': IntVector(tuple(labels)), 'distance': FloatVector(tuple(data.T[0]))})
                fit = R('function(x) ppml(dependent_variable="label", distance="distance", additional_regressors=NULL, robust=TRUE, data=x)')(dataf)
                #print(fit)

                # Deviance is -2.*log_likelihood
                altDeviance = list(fit[9].items())[0][1]
                nullDeviance = list(fit[11].items())[0][1]
                p_value = scipy.stats.chi2.sf(nullDeviance - altDeviance, 1)
                print('Poisson pseudo-maximum likelihood estimation (PPML) by J.M.C. Santos Silva & Silvana Tenreyro, 2006.')
                print('Implemented in R in "gravity: Estimation Methods for Gravity Models" at:')
                print('https://rdrr.io/cran/gravity/man/ppml.html')
                print('Robust PPML based (a.k.a. QMLE) deviances and their difference test (chi2 p-value):\n\t', 
                      'Null deviance:\t', np.round(nullDeviance, 1), '\n\t',
                      'Alternative deviance:\t', np.round(altDeviance, 1), '\n\t',
                      'p-value:\t', '%.1e' % p_value, '\n')
Example #23
def as_dataframe (cfg, results, basis):
  r = robjects.r
  varis = []
  langs = []
  probs = []
  times = []
  threads = []

  # speedups, with upper and lower bounds below
  speedups = [] 
  speedup_lowers = []
  speedup_uppers = []

  ses = [] # standard errors
  mems = [] # memory usage

  langs_ideal = list (cfg.languages)
  langs_ideal.append ('ideal')

  probs_ideal = list (cfg.problems)
  probs_ideal.append ('ideal')

  for var in cfg.variations:
    for lang in langs_ideal: # cfg.languages:
      for prob in probs_ideal: # cfg.problems:
        for thread in cfg.threads:

          if lang == 'ideal' and prob == 'ideal':
            continue
          elif lang == 'ideal' or prob == 'ideal':
            varis.append (var)
            langs.append (pretty_langs[lang])
            probs.append (prob)
            threads.append (thread)
            speedups.append (thread)
            speedup_lowers.append (thread)
            speedup_uppers.append (thread)
            times.append (0)
            ses.append(0)
            mems.append (0)
            continue

          varis.append (var) # pretty_varis [var])
          langs.append (pretty_langs [lang])
          probs.append (prob)
          threads.append (thread)
          
          if var.find('seq') >= 0:
            thread = cfg.threads[-1]

          vals = FloatVector (results[thread][prob][var][lang][0])
          time = mean (vals)
          times.append (time)

          #
          # time confidence interval
          #
          t_result = r['t.test'] (FloatVector(vals), 
                                  **{" conf.level": 0.999}).rx ('conf.int')[0]
          ses.append ((t_result[1] - t_result[0])/2)

          #
          # memory usage
          #
          mem_filename = get_mem_output (lang, prob, var)
          with open (mem_filename, 'r') as mem_file:
            mem = mem_file.readline()
            mems.append (float (mem))

          # we include dummy data for the sequential case to avoid the 
          # speedup calculation below
          if var.find('seq') >= 0:
            speedups.append (1)
            speedup_lowers.append (1)
            speedup_uppers.append (1)
            continue
            
          #
          # speedup values and confidence intervals
          #
          seq_vals = results[cfg.threads[-1]][prob][var.replace ('par', 'seq')][lang][0]

          # sequential base
          base = FloatVector (seq_vals)
          # base with p = 1
          base_p1 = FloatVector (results[1][prob][var][lang][0])
          # use fastest sequential program
          if basis == 'fastest' and mean (base_p1) < mean(base):
            base = base_p1
          elif basis == 'seq':
            pass
          elif basis == 'p1':
            base = base_p1
      

          labels = ['Base'] * r.length(base)[0] + ['N']*r.length (vals)[0]
          df = DataFrame ({'Times': base + vals, 
                           'Type': StrVector(labels)})
          ratio_test = r['pairwiseCI'] (r('Times ~ Type'), data=df,
                                        control='N',
                                        method='Param.ratio',
                                        **{'var.equal': False})[0][0]

          speedups.append (mean(base) / time)
          speedup_lowers.append (ratio_test[1][0])
          speedup_uppers.append (ratio_test[2][0])

  df = robjects.DataFrame({'Language': StrVector (langs),
                           'Problem': StrVector (probs),
                           'Variation' : StrVector (varis),
                           'Threads': IntVector (threads),
                           
                           'Time': FloatVector (times),
                           'SE': FloatVector (ses),
                           
                           'Speedup': FloatVector (speedups),
                           'SpeedupLower': FloatVector (speedup_lowers),
                           'SpeedupUpper': FloatVector (speedup_uppers),
                           
                           'Mem' : FloatVector (mems)
                           })


  r.assign ('df', df)

  r ('save (df, file="performance.Rda")')
  
  # reshape the data to make variation not a column itself, but a part of
  # the other columns describe ie, time, speedup, etc.
  #
  # also, remove the 'ideal' problem as we don't want it in this plot.
  df = r('''
redf = reshape (df, 
                timevar="Variation", 
                idvar = c("Language","Problem","Threads"), 
                direction="wide")
redf$Problem <- factor(redf$Problem, levels = c("randmat","thresh","winnow","outer","product","chain"))
redf[which(redf$Problem != "ideal"),]
''')
  
  r.pdf ('speedup-expertpar-all.pdf',
         height=6.5, width=10)

  change_name = 'Language'

  legendVec = IntVector (range (len (langs_ideal)))
  legendVec.names = StrVector (langs_ideal)

  gg = ggplot2.ggplot (df)

  limits = ggplot2.aes (ymax = 'SpeedupUpper.expertpar', ymin = 'SpeedupLower.expertpar')
  dodge = ggplot2.position_dodge (width=0.9)

  pp = gg + \
      ggplot2.geom_line() + ggplot2.geom_point(size=2.5) +\
      robjects.r('scale_color_manual(values = c("#ffcb7e", "#1da06b", "#b94646", "#00368a", "#CCCCCC"))') +\
      ggplot2.aes_string(x='Threads', y='Speedup.expertpar', 
                         group=change_name, color=change_name, 
                         shape=change_name) + \
      ggplot2.geom_errorbar (limits, width=0.25) + \
      ggplot2.opts (**{'axis.title.x' : ggplot2.theme_text(family = 'serif', face = 'bold', size = 10, vjust=-0.2),
                       'axis.title.y' : ggplot2.theme_text(family = 'serif', face = 'bold', size = 10, angle=90, vjust=0.2),
                       'axis.text.x' : ggplot2.theme_text(family = 'serif', size = 10),
                       'axis.text.y' : ggplot2.theme_text(family = 'serif', size = 10),
                       'legend.title' : ggplot2.theme_text(family = 'serif', face = 'bold', size = 10),
                       'legend.text' : ggplot2.theme_text(family = 'serif', size = 10),
                       'strip.text.x' : ggplot2.theme_text(family = 'serif', size = 10),
                       'aspect.ratio' : 1,
                       }) + \
      robjects.r('ylab("Speedup")') + \
      robjects.r('xlab("Number of cores")') + \
      ggplot2.facet_wrap ('Problem', nrow = 2)

  pp.plot()

  r['dev.off']()
Example #24
        #Mod
        soundType.append(1)
        #Saline
        muscimol.append(0)


import rpy2

from rpy2.robjects.packages import importr

lme4 = importr('lme4')

from rpy2.robjects import IntVector, Formula


allnCorr = IntVector(allnCorr)
allnVal = IntVector(allnVal)
allfracCorr = IntVector(allfracCorr)
sessionInds = IntVector(sessionInds)
animalInds = IntVector(animalInds)
soundType = IntVector(soundType)
muscimol = IntVector(muscimol)

model = Formula('allnCorr/allnVal ~ sessionInds + (1 | animalInds)')
# note: weights= and family= are glmer() arguments, not part of the R formula

env = model.environment
env['allnCorr'] = allnCorr
env['allnVal'] = allnVal
env['sessionInds'] = sessionInds
env['animalInds'] = animalInds
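
# A hedged completion (assumption: the intended model is a binomial GLMM, with
# weights and family passed to glmer() rather than embedded in the formula).
fit = lme4.glmer(model, weights=allnVal, family='binomial')
print(fit)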
Example #25
def line_plot(cfg, var, control, change_name, changing, selector,
              base_selector, basis):
    speedups = []
    thrds = []
    changes = []
    lowers = []
    uppers = []

    for n in cfg.threads:
        probs.append('ideal')
        langs.append('ideal')
        speedups.append(n)
        thrds.append(n)
        changes.append('ideal')
        lowers.append(n)
        uppers.append(n)

    for c in changing:
        sel = selector(c)

        # sequential base
        base = FloatVector(base_selector(c))
        # base with p = 1
        base_p1 = FloatVector(sel(1))
        # use fastest sequential program
        if basis == 'fastest' and mean(base_p1) < mean(base):
            base = base_p1
        elif basis == 'seq':
            pass
        elif basis == 'p1':
            base = base_p1

        for n in cfg.threads:
            ntimes = FloatVector(sel(n))

            # ratio confidence interval
            labels = ['Base'] * r.length(base)[0] + ['N'] * r.length(ntimes)[0]
            df = DataFrame({'Times': base + ntimes, 'Type': StrVector(labels)})
            ratio_test = r['pairwiseCI'](r('Times ~ Type'),
                                         data=df,
                                         control='N',
                                         method='Param.ratio',
                                         **{
                                             'var.equal': False,
                                             'conf.level': 0.999
                                         })[0][0]

            lowers.append(ratio_test[1][0])
            uppers.append(ratio_test[2][0])

            mn = mean(ntimes)
            speedups.append(mean(base) / mn)
            # plot slowdowns
            #speedups.append (-mn/base)#(base / mn)
            thrds.append(n)
            if change_name == 'Language':
                changes.append(pretty_langs[c])
            else:
                changes.append(c)

    df = DataFrame({
        'Speedup': FloatVector(speedups),
        'Threads': IntVector(thrds),
        change_name: StrVector(changes),
        'Lower': FloatVector(lowers),
        'Upper': FloatVector(uppers)
    })
    ideal_changing = ['ideal']
    if change_name == 'Language':
        ideal_changing.extend([pretty_langs[c] for c in changing])
    else:
        ideal_changing.extend(changing)

    legendVec = IntVector(range(len(ideal_changing)))
    legendVec.names = StrVector(ideal_changing)

    gg = ggplot2.ggplot(df)

    limits = ggplot2.aes(ymax='Upper', ymin='Lower')
    dodge = ggplot2.position_dodge(width=0.9)

    pp = gg + \
        ggplot2.geom_line() + ggplot2.geom_point(size=3) +\
        ggplot2.aes_string(x='Threads', y='Speedup',
                           group=change_name, color=change_name,
                           shape=change_name) + \
        ggplot2.scale_shape_manual(values=legendVec) + \
        ggplot2.geom_errorbar (limits, width=0.25) + \
        ggplot2_options () + \
        ggplot2_colors () + \
        ggplot2.opts (**{'axis.title.x' : ggplot2.theme_text(family = 'serif', face = 'bold', size = 15, vjust=-0.2)}) + \
        robjects.r('ylab("Speedup")') + \
        robjects.r('xlab("Cores")')

    # ggplot2.xlim (min(threads), max(threads)) + ggplot2.ylim(min(threads), max(threads)) +\
    pp.plot()

    r['dev.off']()
Example #26
def run_all(site_type: str,
            gene_sets: Mapping[str, Path] = GENE_SETS,
            gene_set_filter: Tuple[int, int] = (5, 1000),
            correct=False,
            **kwargs):
    """Runs all active_pathways combinations for given site_type.

    Uses pan_cancer/clinvar Active Driver analyses results
    and all provided GMT gene sets.

    Args:
        site_type: site filter which will be passed to ActiveDriver analysis
        gene_sets: gene sets to be considered
        gene_set_filter: a two-tuple: (min, max) number of genes required
            to be in a gene set. If not set, the default of (5, 1000) is used

    Results are saved in `output_dir`.

    Returns:
        Mapping of directories with newly computed ActivePathways results
    """
    data_table = importr('data.table')
    paths = {}

    kwargs['geneset.filter'] = IntVector(gene_set_filter)

    for analysis in [
            active_driver.pan_cancer_analysis, active_driver.clinvar_analysis
    ]:
        for gene_set in gene_sets:
            path = output_dir / analysis.name / gene_set / site_type

            # remove the old results (if any)
            rmtree(path, ignore_errors=True)
            # recreate dir
            path.mkdir(parents=True)

            path = path.absolute()

            ad_result = analysis(site_type)
            print(
                f'Preparing active pathways: {analysis.name} for {len(ad_result["all_gene_based_fdr"])} genes'
            )
            print(f'Gene sets/background: {gene_set}')

            gene_sets_path = gene_sets[gene_set]

            if callable(gene_sets_path):
                gene_sets_path = gene_sets_path()

            result = run_active_pathways(ad_result,
                                         str(gene_sets_path),
                                         cytoscape_dir=path,
                                         correct=correct,
                                         **kwargs)

            data_table.fwrite(result,
                              str(path / 'pathways.tsv'),
                              sep='\t',
                              sep2=r.c('', ',', ''))

            paths[(analysis, gene_set)] = path

    return paths
Example #27
def sequence_logo(pwm_or_seq,
                  path: Path = None,
                  width=369,
                  height=149,
                  dpi=72,
                  legend=False,
                  renumerate=True,
                  title: str = None,
                  **kwargs):
    """Generate a sequence logo from Position Weight Matrix (pwm)
    or a list of aligned sequences.

    and save it into a file if a path was provided.
    The logo will be generated with ggseqlogo (R).

    Args:
        pwm_or_seq: list of sequences or PWM matrix or dict where
            keys are names of facets and values are lists or PWMs
        path: where the file should be saved
        renumerate:
            change the labels of x axis to reflect relative position
            to the modified (central) residue (15-aa sequence is assumed)
        width: width in pixels
        height: height in pixels
        dpi: the DPI of the plotting device
        legend: whether and where the legend should be shown
        title: the title of the plot
    """
    gglogo = importr("ggseqlogo")
    ggplot2 = importr("ggplot2")

    if isinstance(pwm_or_seq, list):
        pwm_or_seq = StrVector(pwm_or_seq)
    elif isinstance(pwm_or_seq, dict):
        pwm_or_seq = TaggedList(pwm_or_seq.values(), pwm_or_seq.keys())

    theme_options = {
        'legend.position': legend or 'none',
        'legend.title': ggplot2.element_blank(),
        'legend.text': ggplot2.element_text(size=14),
        'legend.key.size': r.unit(0.2, 'in'),
        'plot.title': ggplot2.element_text(hjust=0.5, size=16),
        'axis.title.y': ggplot2.element_text(size=16),
        'text': ggplot2.element_text(size=20),
        'plot.margin': r.unit([0.03, 0.045, -0.2, 0.06], 'in'),
    }

    plot = GG(gglogo.ggseqlogo(pwm_or_seq, **kwargs)) + ggplot2.theme(
        **theme_options) + ggplot2.labs(y='bits')

    if renumerate:
        plot += ggplot2.scale_x_continuous(breaks=IntVector(range(1, 14 + 2)),
                                           labels=IntVector(range(-7, 7 + 1)))
    if title:
        plot += ggplot2.ggtitle(title)

    if path:
        ggplot2.ggsave(str(path),
                       width=width / dpi,
                       height=height / dpi,
                       dpi=dpi,
                       units='in',
                       bg='transparent')

    return plot
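
# A hedged usage sketch (assumes R's ggseqlogo and ggplot2 are installed; the
# sequences and output path are illustrative 15-aa windows).
from pathlib import Path

sequences = ['ARNDCEQGHILKMFP', 'ARNDSEQGHILKMFP', 'ARNDTEQGHILKMFP']
logo = sequence_logo(sequences, path=Path('logo.png'), title='Example motif')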
Example #28
    def __init__(self, params):
        self.base = importr("base")
        self.utils = importr("utils")
        self.stats = importr("stats")
        self.algdesign = importr("AlgDesign")
        self.car = importr("car")
        self.rsm = importr("rsm")
        self.dplyr = importr("dplyr")
        self.quantreg = importr("quantreg")
        self.dicekrig = importr("DiceKriging")
        self.diced = importr("DiceDesign")

        #numpy.random.seed(11221)
        #self.base.set_seed(11221)

        self.complete_design_data = None
        self.complete_search_space = None

        self.total_runs = 20
        orio.main.tuner.search.search.Search.__init__(self, params)

        self.name = "GPR"

        self.parameter_ranges = {}

        for i in range(len(self.params["axis_val_ranges"])):
            self.parameter_ranges[self.params["axis_names"][i]] = [
                0, len(self.params["axis_val_ranges"][i])
            ]

        info("Parameters: " + str(self.parameter_ranges))

        self.parameter_values = {}

        for i in range(len(self.params["axis_val_ranges"])):
            self.parameter_values[self.params["axis_names"]
                                  [i]] = self.params["axis_val_ranges"][i]

        info("Parameter Real Ranges: " + str(self.axis_val_ranges))
        info("Parameter Range Values: " + str(self.parameter_values))

        self.range_matrix = {}

        for i in range(len(self.axis_names)):
            self.range_matrix[self.axis_names[i]] = IntVector(
                self.axis_val_ranges[i])

        self.range_matrix = ListVector(self.range_matrix)
        info("DataFrame Ranges: " +
             str(self.base.summary_default(self.range_matrix)))

        self.starting_sample = int(round(len(self.params["axis_names"]) + 2))
        self.steps = 22
        self.extra_experiments = int(round(len(self.params["axis_names"]) * 1))
        self.testing_set_size = 300000
        self.failure_multiplier = 100

        self.__readAlgoArgs()

        self.experiment_data = None
        self.best_points_complete = None

        if self.time_limit <= 0 and self.total_runs <= 0:
            err(('%s search requires search time limit or ' +
                 'total number of search runs to be defined') %
                self.__class__.__name__)

        self.run_summary_database = dataset.connect("sqlite:///" +
                                                    'run_summary.db')
        self.summary = self.run_summary_database["dlmt_run_summary"]

        info("Starting sample: " + str(self.starting_sample))
        info("GPR steps: " + str(self.steps))
        info("Experiments added per step: " + str(self.extra_experiments))
        info("Initial Testing Set Size: " + str(self.testing_set_size))
        info("Constraints: " + str(self.constraint))
Example #29
    def factory(self, date, data, raw = False):
        if not raw:
            tdate = self.dateconvert
            date = IntVector([tdate(dt) for dt in date])
        #data = FloatVector(data)
        return self.r['zoo'](data,date)
Example #30
    def measure_design(self, encoded_design, step_number):
        design = self.rsm.decode_data(encoded_design)

        info("Measuring design of size " + str(len(design[0])))

        design_names = [
            str(n) for n in self.base.names(design) if n not in [
                "cost_mean", "predicted_mean", "predicted_sd",
                "predicted_mean_2s"
            ]
        ]
        initial_factors = self.params["axis_names"]
        measurements = []

        info("Current Design Names: " + str(design_names))

        info("Complete decoded design:")
        info(str(design))

        info("Complete original design:")
        info(str(encoded_design))

        for line in range(1, len(design[0]) + 1):
            if type(design.rx(line, True)[0]) is int:
                design_line = [v for v in design.rx(line, True)]
            else:
                design_line = [
                    int(round(float(v[0]))) for v in design.rx(line, True)
                ]

            candidate = [0] * len(initial_factors)

            for i in range(len(design_names)):
                #    if should_redecode:
                #        candidate[initial_factors.index(design_names[i])] = self.parameter_values[design_names[i]].index(design_line[i])
                #    else:
                candidate[initial_factors.index(
                    design_names[i])] = design_line[i]

            info("Evaluating candidate:")
            info(str(candidate))

            measurement = self.getPerfCosts([candidate])
            if measurement != {}:
                measurements.append(
                    float(numpy.mean(measurement[str(candidate)][0])))
            else:
                measurements.append(robjects.NA_Real)

        encoded_design = encoded_design.rx(
            True, IntVector(tuple(range(1,
                                        len(initial_factors) + 1))))

        info("Encoded design")
        info(str(encoded_design))
        info("Dims design")
        info(str(self.base.dim(encoded_design)))
        info("Measurements")
        info(str(measurements))
        info("FloatVector Measurements")
        info(str(FloatVector(measurements)))
        info("Attempting DF")
        info(
            str(
                self.base.dim(
                    DataFrame({"cost_mean": FloatVector(measurements)}))))
        info("Dims DF")
        info(
            str(
                self.base.dim(
                    DataFrame({"cost_mean": FloatVector(measurements)}))))

        encoded_design = self.dplyr.bind_cols(
            encoded_design, DataFrame({"cost_mean":
                                       FloatVector(measurements)}))

        info("Complete design, with measurements:")
        info(str(self.base.summary_default(encoded_design)))

        encoded_design = encoded_design.rx(
            self.stats.complete_cases(encoded_design), True)
        encoded_design = encoded_design.rx(
            self.base.is_finite(self.base.rowSums(encoded_design)), True)

        info("Clean encoded design, with measurements:")
        info(str(self.base.summary_default(encoded_design)))

        self.utils.write_csv(encoded_design,
                             "design_step_{0}.csv".format(step_number))

        if self.complete_design_data is None:
            self.complete_design_data = encoded_design
        else:
            info(str(self.complete_design_data))
            info(str(encoded_design))

            self.complete_design_data = self.base.rbind(
                self.complete_design_data, encoded_design)

        return encoded_design
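
The complete_cases/is_finite row filter used above can be exercised in
isolation. A minimal sketch, assuming rpy2 with R available; the column
names are made up for illustration:

from rpy2.robjects import DataFrame, FloatVector
from rpy2.robjects.packages import importr

base = importr('base')
stats = importr('stats')

df = DataFrame({"x": FloatVector([1.0, float('nan'), 3.0]),
                "cost_mean": FloatVector([0.5, 0.7, float('inf')])})
df = df.rx(stats.complete_cases(df), True)          # drop rows containing NA
df = df.rx(base.is_finite(base.rowSums(df)), True)  # drop rows that sum to Inf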
Example #31
        else:
            info("Using pre-generated space for this size")
            search_space_database = dataset.connect(
                "sqlite:///search_space_{0}.db".format(self.seed_space_size))
            for experiment in search_space_database['experiments']:
                search_space.append(eval(experiment["value"]))

        info("Starting DOPT-anova")

        r_search_space = {}
        for i in range(len(search_space[0])):
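            # Note: the first two entries appear to seed each column with the
            # factor's maximum (dim_uplimits[i] - 1) and minimum (0) before
            # the observed search-space values are appended.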
            r_row = [self.dim_uplimits[i] - 1, 0]
            for col in search_space:
                r_row.append(col[i])

            r_search_space[initial_factors[i]] = IntVector(r_row)

        data = DataFrame(r_search_space)
        data = data.rx(StrVector(initial_factors))

        self.dopt_anova(initial_factors, initial_inverse_factors, data)

        sys.exit()  # debugging stop: the code below is unreachable

        perf_cost, mean_perf_cost = self.MAXFLOAT, self.MAXFLOAT

        params = self.coordToPerfParams(coord)
        end_time = time.time()
        search_time = end_time - start_time
        speedup = float(eval_cost[0]) / float(best_perf_cost)
Example #32
    def run_zinb(self, data, genes, NZMeanByRep, LogZPercByRep,
                 RvSiteindexesMap, conditions, covariates, interactions):
        """
            Runs Zinb for each gene across conditions and returns p and q values
            ([[Wigdata]], [Gene], [Number], [Number], {Rv: [SiteIndex]}, [Condition], [Covar], [Interaction]) -> Tuple([Number], [Number], [Status])
            Wigdata :: [Number]
            Gene :: {start, end, rv, gene, strand}
            SiteIndex: Integer
            Condition :: String
            Covar :: String
            Interaction :: String
            Status :: String
        """

        count = 0
        self.progress_range(len(genes))
        pvals, Rvs, status = [], [], []
        r_zinb_signif = self.def_r_zinb_signif()
        if (self.winz):
            self.transit_message("Winsorizing and running analysis...")

        self.transit_message("Condition: %s" % self.condition)

        comp1a = "1+cond"
        comp1b = "1+cond"

        # include cond in mod0 only if testing interactions
        comp0a = "1" if len(self.interactions) == 0 else "1+cond"
        comp0b = "1" if len(self.interactions) == 0 else "1+cond"
        for I in self.interactions:
            comp1a += "*" + I
            comp1b += "*" + I
            comp0a += "+" + I
            comp0b += "+" + I
        for C in self.covars:
            comp1a += "+" + C
            comp1b += "+" + C
            comp0a += "+" + C
            comp0b += "+" + C
        zinbMod1 = "cnt~%s+offset(log(NZmean))|%s+offset(logitZperc)" % (
            comp1a, comp1b)
        zinbMod0 = "cnt~%s+offset(log(NZmean))|%s+offset(logitZperc)" % (
            comp0a, comp0b)

        nbMod1 = "cnt~%s" % (comp1a)
        nbMod0 = "cnt~%s" % (comp0a)
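        # Illustrative example (not from the source): with covars=["Cov1"] and
        # interactions=["Int1"], the formulas above evaluate to
        #   zinbMod1 = "cnt~1+cond*Int1+Cov1+offset(log(NZmean))|1+cond*Int1+Cov1+offset(logitZperc)"
        #   zinbMod0 = "cnt~1+cond+Int1+Cov1+offset(log(NZmean))|1+cond+Int1+Cov1+offset(logitZperc)"
        #   nbMod1   = "cnt~1+cond*Int1+Cov1"
        #   nbMod0   = "cnt~1+cond+Int1+Cov1"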
        def toRFloatOrStrVec(xs):
            # numeric columns map to R FloatVector, everything else to StrVector
            return (FloatVector([float(x) for x in xs])
                    if self.is_number(xs[0]) else StrVector(xs))

        for gene in genes:
            count += 1
            Rv = gene["rv"]
            ## Single gene case for debugging
            if (GENE):
                Rv = None
                if GENE in RvSiteindexesMap:
                    Rv = GENE
                else:
                    for g in genes:
                        if (g['gene'] == GENE):
                            Rv = g["rv"]
                            break
                if not Rv:
                    self.transit_error("Cannot find gene: {0}".format(GENE))
                    sys.exit(0)

            if (DEBUG):
                self.transit_message(
                    "======================================================================"
                )
                self.transit_message(gene["rv"] + " " + gene["gene"])

            if (len(RvSiteindexesMap[Rv]) <= 1):
                status.append("TA sites <= 1, not analyzed")
                pvals.append(1)
            else:
                # Slice out this gene's sites, winsorizing first if requested
                gene_data = list(
                    map(lambda wigData: wigData[RvSiteindexesMap[Rv]], data))
                norm_data = self.winsorize(gene_data) if self.winz else gene_data
                (readCounts, condition, covarsData, interactionsData,
                 NZmean, logitZPerc) = self.melt_data(
                     norm_data, conditions, covariates, interactions,
                     NZMeanByRep, LogZPercByRep)
                if (numpy.sum(readCounts) == 0):
                    status.append(
                        "pan-essential (no counts in all conditions) - not analyzed"
                    )
                    pvals.append(1)
                else:
                    df_args = {
                        'cnt': IntVector(readCounts),
                        'cond': toRFloatOrStrVec(condition),
                        'NZmean': FloatVector(NZmean),
                        'logitZperc': FloatVector(logitZPerc)
                    }
                    ## Add columns for covariates and interactions if they exist.
                    df_args.update({name: toRFloatOrStrVec(covarsData[i])
                                    for i, name in enumerate(self.covars)})
                    df_args.update({name: toRFloatOrStrVec(interactionsData[i])
                                    for i, name in enumerate(self.interactions)})

                    melted = DataFrame(df_args)
                    debugFlag = bool(DEBUG or GENE)
                    pval, msg = r_zinb_signif(melted, zinbMod1, zinbMod0,
                                              nbMod1, nbMod0, debugFlag)
                    status.append(msg)
                    pvals.append(float(pval))
                if (DEBUG or GENE):
                    self.transit_message(
                        "Pval for Gene {0}: {1}, status: {2}".format(
                            Rv, pvals[-1], status[-1]))
                if (GENE):
                    self.transit_message("Ran for single gene. Exiting...")
                    sys.exit(0)
            Rvs.append(Rv)
            # Update progress
            text = "Running ZINB Method... %5.1f%%" % (100.0 * count /
                                                       len(genes))
            self.progress_update(text, count)

        pvals = numpy.array(pvals)
        mask = numpy.isfinite(pvals)
        qvals = numpy.full(pvals.shape, numpy.nan)
        qvals[mask] = statsmodels.stats.multitest.fdrcorrection(
            pvals[mask])[1]  # Benjamini-Hochberg, default alpha=0.05

        p, q, statusMap = {}, {}, {}
        for i, rv in enumerate(Rvs):
            p[rv], q[rv], statusMap[rv] = pvals[i], qvals[i], status[i]
        return (p, q, statusMap)
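
A standalone sketch of the NaN-aware Benjamini-Hochberg step above; the
p-values are made up for illustration:

import numpy
import statsmodels.stats.multitest

pvals = numpy.array([0.01, numpy.nan, 0.04, 0.20])
mask = numpy.isfinite(pvals)
qvals = numpy.full(pvals.shape, numpy.nan)
# adjust only the finite p-values; NaN entries stay NaN
qvals[mask] = statsmodels.stats.multitest.fdrcorrection(pvals[mask])[1]
print(qvals)  # -> [0.03, nan, 0.06, 0.2]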