Example #1
 def _set_covmatrix(self, covmatrix=None):
     """Builds covmatrix from self.pars. If setting from an externally
     provided covariance matrix then updates self.pars for consistency"""
     # If covmatrix hasn't been provided, generate from self._pars
     # and set.
     if covmatrix is None:
         dx = self._pars[6]
         dv = self._pars[7]
         self._covmatrix = np.identity(6)
         self._covmatrix[:3, :3] *= dx ** 2
         self._covmatrix[3:, 3:] *= dv ** 2
     # If covmatrix has been provided, reverse engineer the most
     # suitable set of parameters and update self._pars accordingly
     # (e.g. take the geometric mean of the (square-rooted) velocity
     # eigenvalues as dv, as this at least ensures constant volume
     # in velocity space).
     else:
         self._covmatrix = np.copy(covmatrix)
         dx = gmean(np.sqrt(
             np.linalg.eigvalsh(self._covmatrix[:3, :3]))
         )
         dv = gmean(np.sqrt(
             np.linalg.eigvalsh(self._covmatrix[3:, 3:]))
         )
         self._pars[6] = dx
         self._pars[7] = dv
         self.set_sphere_stds()
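
A minimal standalone sketch (not part of the class above; names chosen for illustration) of why the geometric mean of the square-rooted eigenvalues is used as dv: it keeps the determinant, and hence the ellipsoid volume, of the velocity block unchanged when that block is replaced by a spherical one.

import numpy as np
from scipy.stats import gmean

rng = np.random.default_rng(0)
A = rng.normal(size=(3, 3))
vel_block = A @ A.T + np.eye(3)          # an arbitrary SPD velocity covariance block
dv = gmean(np.sqrt(np.linalg.eigvalsh(vel_block)))
iso_block = np.identity(3) * dv ** 2     # the spherical replacement

# dv**6 equals the product of the eigenvalues, so the determinants agree.
assert np.isclose(np.linalg.det(iso_block), np.linalg.det(vel_block))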
	def score(self, h, ref, postag=False, hpos=[], refpos=[], wts=[]):
		""" Weights are for ngram weights in the average
		"""
		score = 0.0

		if len(h) > 0:
			ngram_precisions = self.ngram_precisions(h, ref)
			bp = self.brevity_penalty(h, ref)

			if postag:
				postag_ngram_precisions = self.ngram_precisions(hpos, refpos)
				if wts:
					score = bp * (1-self.beta)*self.wgmean(ngram_precisions, wts) + \
						self.beta*self.wgmean(postag_ngram_precisions, wts)
				else:
					score = bp * (1-self.beta)*gmean(ngram_precisions) + \
						self.beta*gmean(postag_ngram_precisions)
			
			else:
				if wts:
					score = bp * self.wgmean(ngram_precisions, wts)
				else:
					score = bp * gmean(ngram_precisions)

		return score
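
The score() above relies on a self.wgmean helper that is not shown; a hedged sketch of what such a weighted geometric mean presumably computes (the exponential of the weighted average of the logs, with weights normalised to sum to one):

import numpy as np

def wgmean(values, weights):
    # Weighted geometric mean: exp of the weighted average of the logs.
    values = np.asarray(values, dtype=float)
    weights = np.asarray(weights, dtype=float)
    weights = weights / weights.sum()        # normalise the n-gram weights
    return float(np.exp(np.sum(weights * np.log(values))))

# With equal weights this reduces to scipy.stats.gmean:
# wgmean([0.5, 0.25, 0.125], [1, 1, 1]) ~= gmean([0.5, 0.25, 0.125])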
def impute_missing_total_reads(total_reads, missing_variant_confidence):
  # Change NaNs to masked values via numpy.ma.
  masked_total_reads = ma.fix_invalid(total_reads)

  # Going forward, suppose you have v variants and s samples in a v*s matrix of
  # read counts. Missing values are masked.

  # Calculate geometric mean of variant read depth in each sample. Result: s*1
  sample_means = gmean(masked_total_reads, axis=0)
  assert np.sum(sample_means <= 0) == np.sum(np.isnan(sample_means)) == 0
  # Divide every variant's read count by its mean sample read depth to get read
  # depth enrichment relative to other variants in sample. Result: v*s
  normalized_to_sample = np.dot(masked_total_reads, np.diag(1./sample_means))
  # For each variant, calculate geometric mean of its read depth enrichment
  # across samples. Result: v*1
  variant_mean_reads = gmean(normalized_to_sample, axis=1)
  assert np.sum(variant_mean_reads <= 0) == np.sum(np.isnan(variant_mean_reads)) == 0

  # Convert 1D arrays to vectors to permit matrix multiplication.
  imputed_counts = np.dot(variant_mean_reads.reshape((-1, 1)), sample_means.reshape((1, -1)))
  nan_coords = np.where(np.isnan(total_reads))
  total_reads[nan_coords] = imputed_counts[nan_coords]
  assert np.sum(total_reads <= 0) == np.sum(np.isnan(total_reads)) == 0

  total_reads[nan_coords] *= missing_variant_confidence
  return np.floor(total_reads).astype(np.int)
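
A small standalone sketch of the masking step above (assuming ma is numpy.ma and gmean is scipy.stats.mstats.gmean, the masked-array aware variant): masked entries are simply skipped, so missing counts do not drag the per-sample geometric means toward zero.

import numpy as np
import numpy.ma as ma
from scipy.stats.mstats import gmean

total_reads = np.array([[100., 200.],
                        [ 50., np.nan],
                        [ 25.,  50.]])
masked = ma.fix_invalid(total_reads)       # NaN becomes a masked value
print(gmean(masked, axis=0))               # [50. 100.]; the NaN cell is ignored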
Example #4
    def test_2D(self):
        a = ma.array(((1, 2, 3, 4), (1, 2, 3, 4), (1, 2, 3, 4)), mask=((0, 0, 0, 0), (1, 0, 0, 1), (0, 1, 1, 0)))
        actual = mstats.gmean(a)
        desired = np.array((1, 2, 3, 4))
        assert_array_almost_equal(actual, desired, decimal=14)

        desired1 = mstats.gmean(a, axis=0)
        assert_array_almost_equal(actual, desired1, decimal=14)

        actual = mstats.gmean(a, -1)
        desired = ma.array((np.power(1 * 2 * 3 * 4, 1.0 / 4.0), np.power(2 * 3, 1.0 / 2.0), np.power(1 * 4, 1.0 / 2.0)))
        assert_array_almost_equal(actual, desired, decimal=14)
Example #5
 def set_sphere_stds(self):
     """
     Set the spherical standard deviations in position space and
     velocity space. Calculated in such a way as to preserve the
     volume in position space and velocity space, respectively.
     Note that combined phase-space volume is not conserved by this
     implementation.
     """
     self._sphere_dx = gmean(np.sqrt(
             np.linalg.eigvalsh(self._covmatrix[:3, :3]))
     )
     self._sphere_dv = gmean(np.sqrt(
             np.linalg.eigvalsh(self._covmatrix[3:, 3:]))
     )
Example #6
def make_histos(v, s, nIon, size, nbins, ion):

    '''
    Bins up the x and y data into nbins bins.
    The value in each bin is the geometric mean of the column density.
    '''
    
    # Find column density
    column = np.zeros(len(size))
    for i, (n,l) in enumerate(zip(nIon, size)):
        # Take the cube root of the cell length and convert from kpc to cm
        length = l**(1./3.) * 3.086e21
        column[i] = (10**n) * length
    
    print min(column)
    print max(column)
    vmin = -100
    vmax = 100
    smin = -220
    smax = 220  

    print 'Making histogram'
    H, xed, yed = np.histogram2d(v,s,bins=nbins, range=[[vmin,vmax],[smin,smax]])
    h = np.zeros_like(H)
    print 'Histogram done\n'

    for i in range(0,H.shape[0]):
        for j in range(0,H.shape[1]):
            # #rows = shape[0]
            # #cols = shape[1]
            vmin = xed[j]
            vmax = xed[j+1]
            smin = yed[i]
            smax = yed[i+1]
            val = []
            for k, (vel,sloc) in enumerate(zip(v,s)):
                if vel>vmin and vel<vmax and sloc>smin and sloc<smax:
                    val.append(column[k])
            print np.log10(min(val))
            print np.log10(max(val))
            print np.log10(np.mean(val))
            print np.log10(gmean(val))
            print vmin, vmax
            print smin, smax
            h[i,j] = gmean(val)
            print h[i,j]
    h = np.log10(h)
    np.savetxt('{0:s}_velHist.out'.format(ion), h)
    return h,xed,yed
def geoMean(vals):
     vals = array(vals)
     if len(unique(sign(vals))) != 1:
          raise ArithmeticError("Sequence of numbers for geometric mean must be all positive or all negative")
     vals = numpy.abs(vals)
     m = gmean(vals)
     return m
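
A brief usage note for geoMean above (hedged: it assumes gmean and the bare NumPy names array, sign and unique used inside the function are importable as below): all-negative input is averaged by magnitude, while mixed signs are rejected.

import numpy
from numpy import array, sign, unique
from scipy.stats import gmean

print(geoMean([2, 8]))     # 4.0
print(geoMean([-2, -8]))   # 4.0, the magnitude of the all-negative sequence
try:
    geoMean([2, -8])
except ArithmeticError as err:
    print(err)             # mixed signs raise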
  def compute_result_dist_prodll_allt(self, all_variables):
    '''Given outputs from FitExperimentAllT, will compute the geometric mean of the LL.

    UGLY HACK: in order to keep track of the minLL, we return it here.
    You should have a cma_iter_function that cleans it before cma_es.tell() is called...
    '''
    if 'result_ll_sum' in all_variables:
      repetitions_axis = all_variables.get('repetitions_axis', -1)

      # Shift to get LL > 0 always
      currMinLL = np.min(all_variables['result_ll_sum'])
      if currMinLL < all_variables['all_parameters']['shiftMinLL']:
        all_variables['all_parameters']['shiftMinLL'] = currMinLL

      # Remove the current minLL, to make sure fitness > 0
      print 'Before: ', all_variables['result_ll_sum']
      all_variables['result_ll_sum'] -= all_variables['all_parameters'][
          'shiftMinLL']
      all_variables['result_ll_sum'] += 0.001
      print 'Shifted: ', all_variables['result_ll_sum']

      result_dist_nll_geom = -mstats.gmean(
          utils.nanmean(all_variables['result_ll_sum'], axis=repetitions_axis),
          axis=-1)

      print result_dist_nll_geom
      return np.array([
          result_dist_nll_geom, all_variables['all_parameters']['shiftMinLL']
      ])
    else:
      raise ValueError('result_ll_sum was not found in the outputs')
def testCombo(paramCombo, use_datasets, numExamples, compute_mistakes=False, verbose=False, parallelize=True):
    paramsStr = getParamsString(paramCombo)
    if parallelize:
        sys.stdout = open(str(os.getpid()) + ".out", "a")

    # Create length penalty function from params
    if 'lengthPenaltyParams' in paramCombo:
        power, firstDenom, secondDenom = paramCombo['lengthPenaltyParams']
        paramCombo['lengthPenaltyFn'] = lambda x: x**power/firstDenom if x<4 else x/secondDenom
        del paramCombo['lengthPenaltyParams']

    # Create ngram penalty and adjacency boost functions from params
    if 'ngramPenaltyParams' in paramCombo:
        constant = paramCombo['ngramPenaltyParams']
        paramCombo['ngramPenaltyFn'] = lambda length, count: constant * float(length) / np.sqrt(count)
        del paramCombo['ngramPenaltyParams']
    if 'ngramAdjacentBoostParams' in paramCombo:
        constant = paramCombo['ngramAdjacentBoostParams']
        paramCombo['ngramAdjacentBoostFn'] = lambda length, count: constant * np.sqrt(length * count)
        del paramCombo['ngramAdjacentBoostParams']

    constructor = paramCombo[MODEL_KEYWORD]
    del paramCombo[MODEL_KEYWORD]
    model = constructor(**paramCombo)
    paramCombo.update({MODEL_KEYWORD: constructor})

    results = model.evaluate(numExamples=numExamples, compute_mistakes=compute_mistakes, verbose=verbose, use_datasets=use_datasets)
    score = gmean([results[dataset][0] for dataset in use_datasets])

    print "Parameters:\n%s" % (paramsStr)
    print "Score: {}\n\n\n".format(score)
    return score, paramsStr, paramCombo 
    def getRatio(self, parent_count_dict, child_count_dict, values):
        ratio_list = []
        parent_sum = 0
        parent_count = 0

        for value in values:
            if parent_count_dict[value] == 0:
                if child_count_dict[value] == 0:
                    ratio_list.append(0)
                else:
                    ratio_list.append(float("inf"))
            else:
                ratio_list.append(float(child_count_dict[value]) / parent_count_dict[value])
            parent_count += 1
            parent_sum += parent_count_dict[value]
        ## FOR

        if len(ratio_list) == 0:
            ratio_list.append(0)
        ## IF

        if parent_count > 0:
            parent_average = float(parent_sum) / parent_count
        else:
            parent_average = 0
        ## IF

        return mstats.gmean(ratio_list), parent_average
Example #11
def fit_logistic_GLM(X, y, 
                     C_value = np.array([-4,5]), 
                     num_cv = 5, 
                     verbose = False, 
                     intercept_scaling = 10,
                     penalty = 'l1',
                     reg_strength = None,
                     plot_results = False
                     ):
    scores_to_return = []
    loss_score = []
    X = pp.scale(X)
    # If regularization strength isn't specified, CV to find it
    if reg_strength is None:
        kf = SKFold(y = y, n_folds = num_cv)
        C_values = np.logspace(C_value[0], C_value[1], 10)
        C_dict = {"C": C_values}
        best_param = []
    #------------------------------------------------------------------------------ 
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
            # Do grid search for regularization parameter
            clf = GSCV(
                        LR(C=1, penalty=penalty, dual=False,intercept_scaling=intercept_scaling),
                        C_dict,
                        cv=num_cv
                        )
            # Fit model
            clf.fit(X_train,y_train)
            best_param.append(clf.best_params_['C'])
            if verbose:
                for params, mean_score, scores in clf.grid_scores_:
                    print("%0.3f (+/-%0.03f) for %r"
                      % (mean_score, scores.std() / 2, params))
        if verbose:
            print np.mean(np.asarray(scores))
        reg_strength = gmean(best_param)
#------------------------------------------------------------------------------ 
    kf2 = SKFold(y = y, n_folds = num_cv)
    clf = []
    clf_temp = LR(
             penalty=penalty, 
             dual=False,
             C = reg_strength,
             intercept_scaling = intercept_scaling
             )
    for train, test in kf2:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        clf_temp.fit(X_train, y_train)
        scores_to_return.append(clf_temp.score(X_test, y_test))
        clf.append(clf_temp)
        pred = clf_temp.predict_proba(X_test)[:,1]
        loss_score.append(lossFx(y_test,pred))
#------------------------------------------------------------------------------ 
    # Plot results
    if plot_results:
        plot_logistic_fit(clf,X,kf2)
    # Returns model, scores of each CV, best C parameter, CV fold indices
    return clf, scores_to_return, loss_score, reg_strength, kf2
Example #12
    def test_1D(self):
        a = (1,2,3,4)
        actual = mstats.gmean(a)
        desired = np.power(1*2*3*4,1./4.)
        assert_almost_equal(actual, desired, decimal=14)

        desired1 = mstats.gmean(a,axis=-1)
        assert_almost_equal(actual, desired1, decimal=14)
        assert_(not isinstance(desired1, ma.MaskedArray))

        a = ma.array((1,2,3,4),mask=(0,0,0,1))
        actual = mstats.gmean(a)
        desired = np.power(1*2*3,1./3.)
        assert_almost_equal(actual, desired,decimal=14)

        desired1 = mstats.gmean(a,axis=-1)
        assert_almost_equal(actual, desired1, decimal=14)
def main(argv):
    parsed = parse_args(argv)

    # check directory
    if not parsed.dir_fimo.endswith("/"):
        parsed.dir_fimo += "/"

    # get the lists of tfs, targets
    rids = numpy.loadtxt(parsed.fn_rids, dtype=str)
    gids = numpy.loadtxt(parsed.fn_gids, dtype=str)
    adjmtr = numpy.zeros([len(rids), len(gids)])

    # build the adjmtr
    lines = open(parsed.fn_infer, "r").readlines()
    for i in range(1, len(lines)):

        # get inferred tf and database motif
        linesplit = lines[i].strip().split('\t')
        infer_tf = linesplit[0]
        zscore = float(linesplit[3])
        if parsed.inference_method == 'cisbp':
            infer_motifs = linesplit[1].split(',')
        elif parsed.inference_method == 'fire':
            infer_motifs = linesplit[0].split(',')
        else:
            sys.exit("Inference method not specified.")

        if zscore >= parsed.zscore_thld:
            # get fimo scores for the inferred motif
            index = numpy.where(rids == infer_tf)[0]
            if len(index) > 0:
                if len(infer_motifs) > 1:
                    temp_mtr = numpy.zeros([len(infer_motifs), len(gids)])
                    for j in range(len(infer_motifs)):
                        fn_motif = parsed.dir_fimo + infer_motifs[j] + parsed.summary_suffix
                        # fn_motif = parsed.dir_fimo + infer_motifs[j] + ".summary"
                        # fn_motif = parsed.dir_fimo + infer_motifs[j] + ".summary_mask3_cons_thd_0.5"
                        if os.path.isfile(fn_motif):
                            dict_scores = get_fimo_scores(fn_motif)
                            for k in range(len(gids)):
                                t = gids[k]
                                temp_mtr[j, k] = dict_scores[t] if t in dict_scores else 0
                    adjmtr[index[0], :] = gmean(temp_mtr).data

                else:
                    fn_motif = parsed.dir_fimo + infer_motifs[0] + parsed.summary_suffix
                    # fn_motif = parsed.dir_fimo + infer_motifs[0] + ".summary" 
                    # fn_motif = parsed.dir_fimo + infer_motifs[0] + ".summary_mask3_cons_thd_0.5"
                    if os.path.isfile(fn_motif):
                        dict_scores = get_fimo_scores(fn_motif)
                        for j in range(len(gids)):
                            t = gids[j]
                            adjmtr[index[0], j] = dict_scores[t] if t in dict_scores else 0
        
    # write adjmtr file
    write_adjmtr(adjmtr, parsed.fn_adjmtr)
Example #14
 def rotatecube(self, theta=0, phi=0, trim=0):
     "angles in degrees"
     if (int(theta) % 360) != 0:
         rho = rotate(self.rho, theta, (0, 2), mode="nearest", order=1)
         rhoN = rotate(self.rhoN, theta, (0, 2), mode="nearest", order=1)
         t = rotate(self.t, theta, (0, 2), mode="nearest", order=1)
         v = rotate(self.V, theta, (1, 3), mode="nearest", order=1)
         M = Ry(theta * pi / 180)
         M[
             abs(M) < (finfo(1.0).eps * 10)
         ] = 0  # set numbers with abs value less than 10 times the floating point epsilon to 0
         f = lambda x: (x * M).flat
         v = apply_along_axis(f, 0, v)
         if (int(phi) % 360) != 0:
             rho = rotate(rho, phi, (0, 1), mode="nearest", order=1)
             rhoN = rotate(rhoN, phi, (0, 1), mode="nearest", order=1)
             t = rotate(t, phi, (0, 1), mode="nearest", order=1)
             v = rotate(v, phi, (1, 2), mode="nearest", order=1)
             M = Rz(phi * pi / 180)
             M[abs(M) < (finfo(1.0).eps * 10)] = 0
             f = lambda x: (x * M).flat
             v = apply_along_axis(f, 0, v)
     elif (int(phi) % 360) != 0:
         rho = rotate(self.rho, phi, (0, 1), mode="nearest", order=1)
         rhoN = rotate(self.rhoN, phi, (0, 1), mode="nearest", order=1)
         t = rotate(self.t, phi, (0, 1), mode="nearest", order=1)
         v = rotate(self.V, phi, (1, 2), mode="nearest", order=1)
         M = Rz(phi * pi / 180)
         M[abs(M) < (finfo(1.0).eps * 10)] = 0
         f = lambda x: (x * M).flat
         v = apply_along_axis(f, 0, v)
     else:
         rho = self.rho.copy()
         t = self.t.copy()
         v = self.V.copy()
     rho[rho < 1e-30] = 1e-30
     t[t < 1] = 1
     thresh = rho
     if trim:
         for _ in rho.shape:
             thresh = gmean(thresh, axis=0)
         sl = trimCube(rho, thresh * 5)
         self.rho = rho[sl]
         self.rhoN = rhoN[sl]
         self.t = t[sl]
         sl = [slice(0, 3)] + sl
         self.V = v[sl]
     else:
         self.rho = rho
         self.rhoN = rhoN
         self.t = t
         self.V = v
     try:
         self.dt[...] = 0
     except AttributeError:
         pass
    def adaptive_bandwidths(self):
        """Computes the bandwidths for the adaptive KDE."""

        key = "adaptive_bandwidths"
        if key not in self.cache:
            KDE_list = self.KDE_of_training_list(fixed=True, approx=False)
            geom_mean = gmean(KDE_list)
            lambdas = np.power(KDE_list/geom_mean, -0.5)
            self.cache[key] = lambdas * self.h
        return self.cache[key]
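
A hedged standalone sketch of the Abramson-style rule the method above applies, with assumed pilot density values and an assumed fixed bandwidth h: points in sparse regions (small pilot density) get wider kernels, points in dense regions get narrower ones.

import numpy as np
from scipy.stats import gmean

pilot_density = np.array([0.02, 0.10, 0.40, 0.90])   # assumed pilot KDE values
h = 0.3                                              # assumed fixed bandwidth
local_h = h * np.power(pilot_density / gmean(pilot_density), -0.5)
print(local_h)   # largest bandwidth where the pilot density is smallest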
def independent_variable_model_collapse(model,independent_column_name="Frequency", **options):
    """Returns a model with a single set of independent variables. Default is to average values together
    but geometric mean, std, variance, rss, mad and median are options.
    Geometric means of odd number of negative values fails"""
    if isinstance(model,pandas.DataFrame):
        model_1 = DataFrame_to_AsciiDataTable(model)
    defaults = {"method": "mean"}
    # load other options from model
    for option, value in model.options.items():
        if not re.search('begin_line|end_line', option):
            defaults[option] = value
    for element in model.elements:
        if model.__dict__[element]:
            if re.search("meta", element, re.IGNORECASE):
                defaults["metadata"] = model.metadata.copy()
            else:
                defaults[element] = model.__dict__[element][:]
    # We need to preserve the frequency column some how
    collapse_options = {}
    for key, value in defaults.items():
        collapse_options[key] = value
    for key, value in options.items():
        collapse_options[key] = value
    unique_independent_variable_list = sorted(list(set(model[independent_column_name])))
    independent_variable_selector = model.column_names.index(independent_column_name)
    out_data = []
    for index, independent_variable in enumerate(unique_independent_variable_list):
        data_row = [x for x in model.data[:] if x[independent_variable_selector] == independent_variable]
        if re.search('mean|av', collapse_options["method"], re.IGNORECASE):
            new_row = np.mean(np.array(data_row), axis=0).tolist()
        elif re.search('median', collapse_options["method"], re.IGNORECASE):
            new_row = np.median(np.array(data_row), axis=0).tolist()
        elif re.search('geometric', collapse_options["method"], re.IGNORECASE):
            new_row = gmean(np.array(data_row), axis=0).tolist()
        elif re.search('st', collapse_options["method"], re.IGNORECASE):
            new_row = np.std(np.array(data_row), axis=0).tolist()
        elif re.search('var', collapse_options["method"], re.IGNORECASE):
            new_row = np.var(np.array(data_row), axis=0, dtype=np.float64).tolist()
        elif re.search('rms', collapse_options["method"], re.IGNORECASE):
            new_row = np.sqrt(np.mean(np.square(np.array(data_row)), axis=0, dtype=np.float64)).tolist()
        elif re.search('rss', collapse_options["method"], re.IGNORECASE):
            new_row = np.sqrt(np.sum(np.square(np.array(data_row)), axis=0, dtype=np.float64)).tolist()
        elif re.search('mad', collapse_options["method"], re.IGNORECASE):
            new_row = mad(np.array(data_row), axis=0).tolist()
        new_row[independent_variable_selector]=independent_variable
        out_data.append(new_row)

    collapse_options["data"] = out_data

    if collapse_options["specific_descriptor"]:
        collapse_options["specific_descriptor"] = collapse_options["method"] + "_" + \
                                                  collapse_options["specific_descriptor"]
    resulting_model = AsciiDataTable(None, **collapse_options)
    return resulting_model
Example #17
def centeredLogRatio(otu_table, otu_table_m):
    from scipy.stats.mstats import gmean
    noZeros = otu_table.copy().replace(0, np.nan)
    geomeans = np.repeat(np.nan, repeats = noZeros.shape[0])
    for i in range(0, noZeros.shape[0]):
        geomeans[i] = gmean(noZeros.ix[i, :].dropna())
    clr_table = np.log(noZeros.divide(geomeans, axis=0))
    clr_table.replace(np.nan, 0, inplace=True)
    clr_table_m = otu_table_m.copy()
    clr_table_m.ix[:, otu_table.columns] = clr_table
    return clr_table, clr_table_m
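
A hedged plain-NumPy sketch of the centred log-ratio transform the function above applies row-wise: divide each composition by its geometric mean and take logs, after which every row sums to zero.

import numpy as np
from scipy.stats import gmean

counts = np.array([[10.,  5., 85.],
                   [ 2.,  8., 90.]])
clr = np.log(counts / gmean(counts, axis=1)[:, None])
print(clr.sum(axis=1))   # ~[0, 0], up to floating point error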
Example #18
def summarise_metric(ys):
    flat = []
    for s in SERIES:
        flat += ys[s]
    m = gmean(flat)
    result = [0.0] * (len(flat) / len(SERIES))
    for s in SERIES:
        for i in range(len(result)):
            result[i] += ys[s][i] / m
    for i in range(len(result)):
        result[i] /= float(len(SERIES))
    return result
Example #19
def ratings(d, ks, rating_index):
  avgs = [] 
  for k in ks:
    if not k in d:
      if rating_index < 2: 
        avgs.append(50.0)
      else: 
        avgs.append(5.0)
    else: 
      v = d[k][rating_index]
      avgs.append(ms.gmean(v))
  return np.average(avgs)
Example #20
def clear_data( RFs, n ):
    p = 25
    Z, T = [], []
    Noise = np.load( 'noise.npy' ).reshape(n*n,p,p)
    cRFs = np.zeros((n*n,p,p))
    for i in range( n ):
        for j in range( n ):
            RF = RFs[i,j,...]

            # WARNING : Centering the RF
            s0,s1 = np.unravel_index(np.argmax(RF),RF.shape)
            RF = np.roll(RF,13-s0,axis=0)
            RF = np.roll(RF,13-s1,axis=1)
            # WARNING : Centering the RF

            # RF += Noise[i*n+j]
            # RF = gaussian_filter( RF, sigma=2.2 )

            RF += 1.5*Noise[i*n+j]
            RF = gaussian_filter( RF, sigma=1.5 )

            abs_max = np.max( np.abs( RF ) )
            RF[np.where( ( ( RF < +0.10*abs_max ) & (RF>0) ) | ( ( RF > -0.10*abs_max ) & (RF < 0) ) ) ]=0
            RF = locate_noise( RF )
            cRFs[i*n+j,...] = RF
            exc = 50.0 * ( RF > 0).sum()/( p * p )
            inh = 50.0 * ( RF < 0).sum()/( p * p )
            Z.append([exc,inh])

    Z = np.array(Z)
    np.nan_to_num(Z)
    print '------ Excitatory ------- Inhibitory -------'
    print 'Minimum :', Z[:,0].min(), Z[:,1].min()
    print 'Maximum :', Z[:,0].max(), Z[:,1].max()
    print 'Mean :', np.mean( Z[:,0] ), np.mean( Z[:,1] )
    print 'Mean :', np.mean( np.log10(Z[:,0]) ), np.mean( np.log10(Z[:,1]) )
    print 'SD : ', np.std( np.log10(Z[:,0]) ), np.std( np.log10(Z[:,1]) )
    print 'GMean :', gmean( Z[:,0] ), gmean( Z[:,1] )
    print "Pearson cor: ", pearsonr( Z[:,0], np.abs(Z[:,1]) )
    return Z, cRFs
Example #21
def gmean_bin(x, y, nIon, size, nbins, xlims, ylims, ion):
    '''
    Bins up the data according to x and y
    Value in each bin is the geometric mean of the column
    density contribution of cells in that bin, as determined
    by multiplying nIon by size**1/3
    '''

    # Calculate the column density
    column = np.zeros(len(size))
    f = open('{0:s}_column.out'.format(ion), 'w')
    for i, (n,l) in enumerate(zip(nIon, size)):
        # Take the cube root of the cell length and convert from kpc to cm
        length = l**(1./3.) * 3.086e21
        col = n * length
        column[i] = col
        f.write('{0:.4e}\n'.format(col))
    f.close()        

    # Make the bins
    xbins = np.linspace(xlims[0], xlims[1], nbins+1)
    ybins = np.linspace(ylims[0], ylims[1], nbins+1)

    # Determine what cells go in what bins
    xdig = np.digitize(x, xbins)
    ydig = np.digitize(y, ybins)

    # Fix the edge effects
    maxBinNum = len(xbins)
    for i in range(len(xdig)):
        if xdig[i]==maxBinNum:
            xdig[i] -= 1
        if ydig[i]==maxBinNum:
            ydig[i] -= 1
    
    # Create empty array
    h = np.zeros((nbins, nbins))

    # Loop through array
    for i in range(nbins):
        for j in range(nbins):
            # Find the indicies where x and y belong to this bin
            bits = np.bitwise_and( xdig==i+1, ydig==j+1)
            if True in bits:
                h[i,j] = np.log10( gmean( column[bits] ) )

    h = np.rot90(h)
    h = np.flipud(h)
    np.savetxt('{0:s}_velHist.out'.format(ion), h)
    print 'Max of h: ', np.max(h)
    print 'Mean of h: ', np.mean(h)
    return h, xbins, ybins
Example #22
def get_annot_kpts_baseline_weights(ibs, aid_list, config2_=None, config={}):
    r"""
    Returns weights based on distinctiveness and/or feature scores, or ones. Customized based on config.

    Args:
        qreq_ (QueryRequest):  query request object with hyper-parameters
        aid_list (int):  list of annotation ids
        config (dict):

    Returns:
        list: weights_list

    CommandLine:
        python -m ibeis.algo.hots.scoring --test-get_annot_kpts_baseline_weights

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.hots.scoring import *  # NOQA
        >>> qreq_, cm = plh.testdata_scoring('testdb1')
        >>> aid_list = cm.daid_list
        >>> config = qreq_.qparams
        >>> # execute function
        >>> config2_ = qreq_.qparams
        >>> kpts_list = qreq_.ibs.get_annot_kpts(aid_list, config2_=config2_)
        >>> weights_list = get_annot_kpts_baseline_weights(qreq_.ibs, aid_list, config2_, config)
        >>> # verify results
        >>> depth1 = ut.get_list_column(ut.depth_profile(kpts_list), 0)
        >>> depth2 = ut.depth_profile(weights_list)
        >>> assert depth1 == depth2
        >>> print(depth1)
        >>> result = str(depth2)
        >>> print(result)
    """
    # TODO: clip the fgweights? (dilation?)
    # TODO: normalize and parametrize and clean
    dcvs_on = config.get('dcvs_on')
    fg_on = config.get('fg_on')
    weight_lists = []
    if dcvs_on:
        qdstncvs_list = get_kpts_distinctiveness(ibs, aid_list, config2_, config)
        weight_lists.append(qdstncvs_list)
    if fg_on:
        qfgweight_list = ibs.get_annot_fgweights(aid_list, ensure=True, config2_=config2_)
        weight_lists.append(qfgweight_list)
    if len(weight_lists) == 0:
        baseline_weights_list = [np.ones(num, np.float) for num in ibs.get_annot_num_feats(aid_list, config2_=config2_)]
        #baseline_weights_list = [None] * len(aid_list)
    else:
        # geometric mean of the selected weights
        baseline_weights_list = [spmstat.gmean(weight_tup) for weight_tup in zip(*weight_lists)]
    return baseline_weights_list
    def get_probs(md5, average_type='gmean'):
        temp = []

        for position, row in enumerate(md5_2_ind_joined[md5]):
            temp += [ms[position][row]]

        if average_type == 'mean':
            temp = scipy.sparse.vstack(temp).mean(axis=0)
        elif average_type == 'gmean':
            temp = gmean(scipy.sparse.vstack(temp).todense() + 1e-15, axis=0)

        temp[temp < 1e-6] = 0

        return md5, csr_matrix(temp)
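
A hedged standalone sketch of the geometric-mean ensembling above for a single sample: several models' class-probability rows are combined with gmean, with a small epsilon so that a single zero does not wipe out a class, then thresholded as in the method.

import numpy as np
from scipy.stats import gmean

model_a = np.array([0.7, 0.3, 0.0])
model_b = np.array([0.6, 0.2, 0.2])
ensembled = gmean(np.vstack([model_a, model_b]) + 1e-15, axis=0)
ensembled[ensembled < 1e-6] = 0          # same thresholding as above
print(ensembled / ensembled.sum())       # renormalise if probabilities are needed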
Example #24
def calculate_nf(sample_frame, ref_targets, ref_sample):
    """Calculates a normalization factor from the geometric mean of the
    expression of all ref_targets, normalized to a reference sample.

    :param DataFrame sample_frame: A sample data frame.
    :param iterable ref_targets: A list or Series of target names.
    :param string ref_sample: The name of the sample to normalize against.
    :return: a Series indexed by sample name containing normalization factors
        for each sample.
    """
    grouped = sample_frame.groupby(['Target', 'Sample'])['Cq'].aggregate(average_cq)
    samples = sample_frame['Sample'].unique()
    nfs = gmean([pow(2, -grouped.ix[zip(repeat(ref_gene), samples)] + grouped.ix[ref_gene, ref_sample]) for ref_gene in ref_targets])
    return pd.Series(nfs, index=samples)
Example #25
def main():
    infile = open('analysis/PI_DataSet.txt')
    header = infile.readline().rstrip().rsplit()
    tpvdict = {}
    drvdict = {}
    mutlist = [
        '10F', '32I', '46I', '47V', '50V', '54L', '54M', '74P', '76V', '82T',
        '82F', '84V', '90M'
    ]
    for line in infile:
        line = line.rstrip().rsplit()
        muts = []
        for i, mut in enumerate(line[9:]):
            if str(i + 1) + mut in mutlist:
                muts.append(str(i + 1) + mut)
        if len(muts) == 1:
            mut = muts[0]
            if mut not in tpvdict:
                tpvdict[mut] = []
                drvdict[mut] = []
            if line[7] != 'NA':
                tpvdict[mut].append(float(line[7]))
            if line[8] != 'NA':
                drvdict[mut].append(float(line[8]))
    print tpvdict
    print drvdict
    infile.close()
    for mut in tpvdict:
        tpvdict[mut] = gmean(tpvdict[mut])
        drvdict[mut] = gmean(drvdict[mut])
    outfile = open('analysis/resistance_single.txt', 'w')
    outfile.write('mutation\tTPV\tDRV\n')
    for mut in tpvdict:
        outfile.write(mut + '\t' + str(tpvdict[mut]) + '\t' +
                      str(drvdict[mut]) + '\n')
    outfile.close()
def get_probs(i, average_type='gmean'):
    image_name = file_names.loc[i, 'file_name']
    temp = []

    for j, m in enumerate(ms):
        temp += [m[i]]

    if average_type == 'mean':
        temp = scipy.sparse.vstack(temp).mean(axis=0)
    elif average_type == 'gmean':
        temp = gmean(scipy.sparse.vstack(temp).todense() + 1e-15, axis=0)

    temp[temp < 1e-6] = 0

    return file_to_md5[image_name], csr_matrix(temp)
Example #28
 def compositional_transform(self, add_pseudocount:bool=False):
     """calculated the three Aitchison geometry transforms for the Ab counts.
     - alr uses the IgG1 counts as universal reference
     - ilr contrasts are based on the SVD of clr
     
     if add_pseudocount is set to true add 1 to prevent zero division, otherwise
     cells with zero counts in the denominator create inf and have to be filtered
     before downstream analysis, e.g.:
         clr_filter = np.isfinite(clr).all(axis=1)
         clr = clr[clr_filter,:]"""
     if add_pseudocount:
         self.clr_data = np.log((1+self.andat_raw.X)/gmean(self.andat_raw.X+1, axis=1).reshape(-1,1))
         self.alr_data = np.log((1+self.andat_raw.X)/(self.andat_raw.X[:,-1]+1).reshape(-1,1))
         
         U,s,Vt = np.linalg.svd(self.clr_data, full_matrices=False)
         self.ilr_data = np.dot(U*s,helmert(len(s)).T)
     else:
         self.clr_data = np.log(self.andat_raw.X/gmean(self.andat_raw.X, axis=1).reshape(-1,1))
         self.alr_data = np.log(self.andat_raw.X/(self.andat_raw.X[:,-1].reshape(-1,1)))
         
         finite_clr = np.isfinite(self.clr_data).all(axis=1)
         U,s,Vt = np.linalg.svd(self.clr_data[finite_clr,:], full_matrices=False)
         self.ilr_data = np.dot(U*s,helmert(len(s)).T)
     return
Example #29
 def run_IterLinQuadReg_matrix(self,
                               A,
                               B,
                               C,
                               dist_info_sharing='AM',
                               us_init=None):
     x_input, u_input = self.state_inputs()
     if np.ndim(A) != 2:
         if dist_info_sharing == 'GM':
             A = gmean(A, axis=0)
             B = gmean(B, axis=0)
             C = gmean(C, axis=0)
             print(A.shape, 'A', B.shape, 'B', C.shape, 'C')
         elif dist_info_sharing == 'AM':
             A = np.sum(A, axis=0) / A.shape[0]
             B = np.sum(B, axis=0, keepdims=True) / B.shape[0]
             B = B.T
             C = np.sum(C, axis=0) / C.shape[0]
     else:
         pass
     f = self.next_states_matrix(x_input, u_input, A, B, C)
     dynamics = AutoDiffDynamics(f, x_input, u_input)
     x_goal = self.augment_state(self.x_goal)
     if self.Q_terminal.all() == None:
         cost = QRCost(self.Q, self.R)
     else:
         cost = QRCost(self.Q,
                       self.R,
                       Q_terminal=self.Q_terminal,
                       x_goal=x_goal)
     x0 = self.augment_state(self.x0)
     if us_init is None:
         us_init = np.random.uniform(-1, 1, (self.N, dynamics.action_size))
     ilqr = iLQR(dynamics, cost, self.N)
     xs, us = ilqr.fit(x0, us_init, on_iteration=self.on_iteration)
     return xs, us
Example #30
    def scale_parameter(self, phi, psi, rotamer, residues, concentration,
                        alpha, quadtree):
        r"""
        Calculates query-dependent scale parameter at (``phi``, ``psi``).

        When calculating the mean χ angles of a rotamer at a particular (φ, ψ)
        point, each χ angle is calculated as a weighted mean:

        .. math::

            \mu(\chi|\phi, \psi, r) = \frac{%
                \sum_i^{N_r} K_m(\phi - \phi_i) K_m(\psi - \psi_i) \chi_i
            }{%
                \sum_i^{N_r} K_m(\phi - \phi_i) K_m(\psi - \psi_i)
            }.

        The scale factor for the kernels :math:`K_m` is evaluated once at
        each query (φ, ψ) point. We follow the method of Shapovalov and Dunbrack
        (2010) and ensure that the bandwidth of the kernel encompasses at least
        25 points.
        """

        pilot_estimates = to_numpy_array(
            residues, lambda res: res["rotamer_pilot"][rotamer])
        phi_psi_list = to_numpy_array(
            residues, lambda res: np.deg2rad(np.array(res["torsion"])))

        # Get the non-adaptive kde at the query point
        query_estimator = prob_density_kde(np.array([phi, psi]), phi_psi_list,
                                           concentration)
        scale_param = np.power(gmean(pilot_estimates) / query_estimator, alpha)

        # We want to expand the scale parameter in sparse regions.
        # We do this by following Shapovalov and Dunbrack.
        # First, we find the nearest 25 points. If the distance of the farthest
        # point is less than our required distance, we accept the current scale
        # parameter. Otherwise, we take the distance of the 25th point and
        # convert that into our scale factor.

        # The conversion between distances r and scale factors λ is given by
        # r = 1/√(k / λ). So λ = kr^2
        cutoff_distance = np.sqrt(scale_param / concentration)
        distances, _ = quadtree.query(np.array([phi, psi]), 25)

        if distances[-1] >= cutoff_distance:
            scale_param = concentration * distances[-1]**2

        return scale_param
Example #31
def compute_hdbscan(data):
    global hdbscan_cache
    if 4 in hdbscan_cache:
        return hdbscan_cache

    # Recompute raw dataset scales, as the normalization may not be scale
    scales = data.values.div(data.values.max(axis=1), axis=0)
    test_data = dataset.from_values(data.features, scales, data.values)

    # Build map from number of classes to configs
    ranges = {}
    for c, s in itertools.product(CLUSTERS, SAMPLES):
        clusterer = hdbscan.HDBSCAN(metric='l2',
                                    min_cluster_size=c,
                                    min_samples=s)
        clusterer.fit(data.normalized)
        # Labels are 0-indexed
        n_classes = clusterer.labels_.max() + 1
        chosen_labels = [
            np.argmax(mstats.gmean(x, axis=0)) for x in clusterer.exemplars_
        ]
        kernel_map = [data.normalized.columns[i] for i in chosen_labels]
        err = utils.geom_mean(
            utils.get_perfect_errors_for(kernel_map, test_data))
        #print("hdbscan {} classes for {}, {}. err {}".format(n_classes, c, s, err))
        #print('\n'.join(kernel_map))
        if n_classes in ranges.keys():
            ranges[n_classes] += [(c, s, err)]
        else:
            ranges[n_classes] = [(c, s, err)]

    for i in sorted(ranges.keys()):
        print("hdbscan: {} -> {}".format(i, ranges[i]))

    # Scan through map to get best trained config for each number of classes
    configs = {0: (15, 15)}
    for i in range(1, 16):
        if i in ranges:
            m = 0
            for c, s, e in ranges[i]:
                if e > m:
                    configs[i] = (c, s)
                    m = e
        else:
            configs[i] = configs[i - 1]

    hdbscan_cache = configs
    return configs
def get_statistics(exp_nm):
  '''
    Complete global editing statistics for each target site:
      - Base editing frequency among all non-noisy reads
      - Indel frequency among all non-noisy reads
    Can be used to calculate base editing to indel ratio (low biological replicability, due to low replicability of indel frequency).
  '''

  stats_df = pd.read_csv(inp_dir + f'{exp_nm}.csv', index_col = 0)

  mdf = stats_df

  # Filter readcount
  mdfs = mdf[mdf['Total count_indel'] >= 1000]

  # Filter target sites with no substrate nt in core
  has_c_in_core = lambda row: bool('C' in row['gRNA (20nt)'][3 : 7])
  mdfs['Has Core C'] = mdfs.apply(has_c_in_core, axis = 'columns')
  mdfs = mdfs[mdfs['Has Core C']]

  # Filter target sites with very low base editing fq
  mdfs['Base edit fq'] = (mdfs['Edited count']) / mdfs['Total count_indel']
  mdfs = mdfs[mdfs['Base edit fq'] > 0.025]
  # mdfs = mdfs[mdfs['Base edit fq'] > 0.01]

  # No pseudocounts
  mdfs = mdfs[mdfs['Indel count'] > 0]
  # # Pseudocount
  # mdfs['Edited count'] += 1
  # mdfs['Indel count'] += 1

  mdfs['Base edit to indel ratio'] = mdfs['Edited count'] / mdfs['Indel count']

  mdfs['Log10 base edit to indel ratio'] = np.log10(mdfs['Base edit to indel ratio'])
  mdfs.to_csv(out_dir + '%s.csv' % (exp_nm))

  from scipy.stats.mstats import gmean
  data = mdfs['Base edit to indel ratio']
  stats = {
    'Num. target sites': len(mdfs),
    'Geometric mean': gmean(data),
    'Median': np.median(data),
    '25th percentile': np.percentile(data, 25),
    '75th percentile': np.percentile(data, 75),
    'Geometric std': np.exp(np.std(np.log(data))),
  }

  return stats
Example #33
def multioutput_fscore(y_true, y_pred, beta=1):
    score_list = []
    if isinstance(y_pred, pd.DataFrame) == True:
        y_pred = y_pred.values
    if isinstance(y_true, pd.DataFrame) == True:
        y_true = y_true.values
    for column in range(0, y_true.shape[1]):
        score = fbeta_score(y_true[:, column],
                            y_pred[:, column],
                            beta,
                            average='weighted')
        score_list.append(score)
    f1score_numpy = np.asarray(score_list)
    f1score_numpy = f1score_numpy[f1score_numpy < 1]
    f1score = gmean(f1score_numpy)
    return f1score
Example #34
def _runBacktest(allocation_map, ticker_data, start_date_int, next_date_int):
    small_ticker_data = {
        ticker: data
        for ticker, data in ticker_data.items() if ticker in allocation_map
    }
    (small_ticker_tuple, small_data_matrix,
     small_expense_array) = data_cleaner.cleanAndConvertData(
         small_ticker_data, 30, next_date_int, first_date=start_date_int)
    allocation_map = defaultdict(int, allocation_map)
    small_allocation_array = np.array(
        [allocation_map[ticker] for ticker in small_ticker_tuple],
        dtype=np.float64)
    performance = gmean(np.matmul(small_data_matrix, small_allocation_array))
    expense = pow(1 - np.matmul(small_allocation_array, small_expense_array),
                  1 / config.TRADING_DAYS_PER_YEAR)
    return performance * expense
Example #35
 def mean_diversity(self,
                    path,
                    alpha=1.0,
                    include_sink=False,
                    method='arithmetic'):
     path = CheckPath(path)
     diversities = self.individual_diversities(path,
                                               alpha=alpha,
                                               include_sink=include_sink)
     # Computing the mean
     if method == 'arithmetic':
         return diversities.mean()
     elif method == 'geo':
         return gmean(diversities)
     elif method == 'wpm':
         raise ValueError('Weighted Power Mean Method not implemented yet.')
def search_combinations(grid, qos = 1.2):
	benchmarks = [x for x in grid if x.endswith('1') or x.endswith('2')]
	sens = dict((bench, gmean(grid[bench].values())) for bench in benchmarks)
	sens = map(lambda x: x[0], sorted(sens.items(), key = lambda x: x[1]))

	gridT = transpose_grid(grid)
	(cos, _, _) = classes(grid, [qos])

	best = ('', 0, [])
	for b1 in sens[:len(sens) - 1]:
		for b2 in sens[sens.index(b1) + 1:]:
			cont = contentiousness(b1, b2, grid)
			accuracy = validate(cont, cos)
			if accuracy > best[1]:
				best = (b1 + '_' + b2, accuracy, cont)
	return best[2]
Example #37
    def extract_preprocessed(self, X, y):
        stds = X.std(axis=0)
        stds = stds[stds > 0]
        std_ratio = gmean(stds)
        corr_mean = X.corr(method='pearson').abs().values.mean()
        skew_mean = X.skew(axis=0).mean()
        kurt_mean = X.kurtosis(axis=0).mean()

        self.meta_data.update({
            'STDRatio': std_ratio,
            'CorrelationMean': corr_mean,
            'SkewnessMean': skew_mean,
            'KurtosisMean': kurt_mean
        })

        self._extract_landmarks(X, y)
Example #38
def get_weight_attr(cov_u, cov_v, reads_weight, db_weight):
    cov_diff = 1.0 / (abs(cov_u - cov_v) + sys.float_info.epsilon)
    weight_attr = {
        'cov_diff':
        cov_diff,
        'reads_and_db':
        reads_weight + db_weight,
        'geometric_mean':
        gmean([cov_diff, reads_weight, db_weight]),
        'harmonic_mean':
        hmean([
            cov_diff, reads_weight + sys.float_info.epsilon,
            db_weight + sys.float_info.epsilon
        ])
    }
    return weight_attr
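
A small standalone comparison (illustrative values) of the two averages stored above: the harmonic mean is pulled toward the smallest value more strongly than the geometric mean, which is why both are kept as alternative edge weights.

from scipy.stats import gmean, hmean

vals = [0.1, 1.0, 10.0]
print(gmean(vals))   # 1.0
print(hmean(vals))   # ~0.27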
def main(argv):
    parsed = parse_args(argv)
    
    # get input networks
    fns = parsed.FILE
    networks = []
    for i in range(len(fns)-1):
        x = numpy.abs(numpy.loadtxt(fns[i]))
        x += numpy.min(x[numpy.nonzero(x)])
        networks.append(x)
    combined = gmean(networks)

    # write combined network
    fn_output = parsed.FILE[len(parsed.FILE)-1]
    # numpy.savetxt(fn_output, combined, fmt="%.10f", delimiter="\t", newline="\n")
    write_adjmtr(fn_output, combined)
Example #40
def calculate_sfm(frame):
    """Calculates the Spectral Flatness Measure of a signal

     The SFM is defined as the ratio of the geometrical mean by the
     arithmetical mean

    :param frame: frame of a discrete signal
    :return: the SFM of the frame
    """
    a = np.mean(frame)
    g = gmean(frame)
    if a == 0 or g/a <= 0:
        sfm = 0
    else:
        sfm = 10*np.log10(g/a)
    return sfm
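
A brief usage sketch for calculate_sfm above (assuming numpy and gmean are imported as the snippet implies): a flat magnitude spectrum gives 0 dB, since the geometric and arithmetic means coincide, while a peaky, tonal one gives a strongly negative SFM.

import numpy as np
from scipy.stats import gmean

flat_frame = np.ones(64)
tonal_frame = np.full(64, 1e-3)
tonal_frame[10] = 1.0
print(calculate_sfm(flat_frame))    # 0.0
print(calculate_sfm(tonal_frame))   # well below 0 dB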
Example #41
def attributes(location, kind):
    global fail
    global fallas
    img = cv2.imread(location, 0)

    #Preprocessing
    #If image is monochromatic
    hist = cv2.calcHist([img], [0], None, [256], [0, 256])
    #Else
    #Gray scale

    trace = hist.reshape(256)
    #trace[trace!=-10000]+=1

    #gTrace=trace[trace>0]

    #Getting atributes
    attributes = np.zeros(10, dtype='<U256')  #.astype(object)

    #Kurtosis
    attributes[0] = str(sts.kurtosis(trace))
    #Skewness
    attributes[1] = str(sts.skew(trace))
    #Std
    attributes[2] = str(np.std(trace))
    #Range
    attributes[3] = str(np.ptp(trace))
    #Median
    attributes[4] = str(np.median(trace))
    #Geometric_Mean
    attributes[5] = str(gmean(trace))
    #Hjorth
    a, mor, comp = hjorth_params(trace)
    #Mobility
    attributes[6] = str(mor)
    #Complexity
    attributes[7] = str(comp)
    attributes[8] = str(kind)

    attributes[9] = str(location)
    #print(attributes)
    if (str(comp) == 'nan' or str(mor) == 'nan'
            or str(attributes[5]) == "nan"):
        a = np.array((location, str(attributes[5]), mor, comp))
        fallas = np.vstack((fallas, a))
        fail += 1
    return attributes
Example #42
def calc_detectable_difference(data):
    '''Second part of performance indicator calculations adapted from Ruijter et al. (Methods, 2013)'''

    conc = np.repeat([15 * np.geomspace(1, 10000, num=5)], 3)

    conc_log = np.log10(conc)
    data_log = np.log10(data)

    steyx_arr = []

    for i in range(len(data_log[0, :])):
        steyx_i = calc_steyx(conc_log, data_log[:, i])
        steyx_arr.append(steyx_i)

    steyx = np.asarray(steyx_arr)

    intv = generate_intv()
    mean_arr = []

    for i in intv:
        mean_i = np.mean(data_log[i[0]:i[1]], axis=0)
        mean_arr.append(mean_i)

    mean_intv = np.asarray(mean_arr)

    mean_x = np.mean(conc_log)
    ss_x = np.var(conc_log, ddof=1) * 13

    conc_log_mean = conc_log[[0, 3, 6, 9, 12]]
    sq_part = np.sqrt(1. / 3. + (((conc_log_mean - mean_x)**2) / ss_x))
    se_yfit = np.outer(sq_part, steyx)

    t_intv = t.ppf(1 - 0.0125, 2)

    ci_y_upper = mean_intv + t_intv * se_yfit
    ci_y_lower = mean_intv - t_intv * se_yfit

    ci_y_upper_no_log = 10**ci_y_upper
    ci_y_lower_no_log = 10**ci_y_lower
    mean_intv_no_log = 10**mean_intv

    fold_up = ci_y_upper_no_log / mean_intv_no_log
    fold_down = mean_intv_no_log / ci_y_lower_no_log

    detectable_difference = gmean(fold_up, axis=0)

    return detectable_difference
Example #43
    def __init__(self, dataset, n_classes):
        # HDBScan is a better clustering algorithm that may give a better set
        # of representatives.
        c, s = compute_hdbscan(dataset)[n_classes]
        clusterer = hdbscan.HDBSCAN(metric='l2',
                                    min_cluster_size=c,
                                    min_samples=s)
        clusterer.fit(dataset.normalized)

        # For each cluster, choose a representative that gives the best overall
        # performance for the class exemplars.
        chosen_labels = [
            np.argmax(mstats.gmean(x, axis=0)) for x in clusterer.exemplars_
        ]
        kernel_map = [dataset.normalized.columns[i] for i in chosen_labels]
        self.classes = kernel_map
        self.name = "{}{}".format(self.cls_name, n_classes)
Example #44
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    slr = make_pipeline(MinMaxScaler(), LogisticRegression())
    plr = make_pipeline(PCA(), LogisticRegression())
    nb_bag = BaggingClassifier(base_estimator=GaussianNB())
    clfs = (
            GaussianNB(),
            #GridSearchCV(slr, dict(logisticregression__C=[1.0, 0.8])),
            make_pipeline(PCA(), GaussianNB()),
            GridSearchCV(plr, dict(pca__n_components=[None, 3, 8], logisticregression__C=[1.0, 0.7]), scoring='roc_auc'),
            GridSearchCV(nb_bag, dict(max_samples=[0.2, 0.4, 0.6], max_features=[0.3, 0.7]), scoring='roc_auc'),
            xgb.XGBClassifier(n_estimators=20, max_depth=3, colsample_bytree=0.7, subsample=0.6, learning_rate=0.1),
            #make_pipeline(KMeans(), GaussianNB()),
            #GridSearchCV(
            #    BaggingClassifier(),
            #    dict(base_estimator=[None, GaussianNB(), LogisticRegression()],
            #        n_estimators=[7, 10, 14],
            #        max_samples=[0.3, 0.6])),
            #GridSearchCV(xgb.XGBClassifier(), dict(n_estimators=[2, 3, 4], learning_rate=[0.01, 0.1], subsample=[0.5, 0.9])),
            #BaggingClassifier(base_estimator=SVC(), max_features=0.8, max_samples=2500, n_estimators=5),
    )
    preds = []
    for clf in clfs:
        print clf
        clf.fit(X_train, y_train)
        val_pred = clf.predict(X_val)
        print roc_auc_score(y_val, val_pred)
        clf.fit(X_val, y_val)
        train_pred = clf.predict(X_train)
        preds.append(np.concatenate((train_pred, val_pred)))
        print roc_auc_score(y_train, train_pred)
        print

    y_all = np.concatenate((y_train, y_val))
    preds = np.column_stack(preds)
    gm = gmean(preds, axis=1)
    hm = hmean(preds+1, axis=1)
    preds = np.column_stack((preds, gm, hm))
    print 'GM', roc_auc_score(y_all, gm)
    print 'HM', roc_auc_score(y_all, hm)
    meta = GaussianNB()
    meta = GridSearchCV(xgb.XGBClassifier(), dict(max_depth=[2, 3, 4], learning_rate=[0.01, 0.05, 0.1], n_estimators=[20, 40, 60]), scoring='roc_auc')
    meta.fit(preds, y_all)
    scores = cross_val_score(meta, preds, y_all, scoring='roc_auc', cv=5)
    print scores
    print scores.mean()
Example #45
def dominate(portfolio,
             mu,
             cov,
             cost,
             prices,
             risk_tolerance,
             single_period=False):
    """By Default, always multi-period"""

    # start date for the based portfolio to be determined ... always assign to past 6 months (ie rebalance the period)
    start_date = (datetime.now() -
                  relativedelta(months=6)).strftime("%Y-%m-%d")

    # get the number of days in the backtest period ... to determine target returns and variances later
    days = business_days(start_date, datetime.now().strftime("%Y-%m-%d"))

    # call backtest to get the value of the portfolio
    portfolio_value = back_test(portfolio,
                                start_date,
                                end_date=None,
                                dollars=None)[0].sum(axis=1)

    print(">>> portfolio_value: ", portfolio_value)

    # calculate portfolio returns
    portfolio_returns = (portfolio_value / portfolio_value.shift(1) -
                         1).dropna()

    print(">>> portfolio_returns: ", portfolio_returns)

    # assign the target return and variance
    target_returns = (gmean(portfolio_returns + 1, axis=0) - 1) * days
    target_variance = portfolio_returns.var() * days

    mu_p2 = mu[0] if single_period else mu[1]
    cov_p2 = cov[0] if single_period else cov[1]

    soln, agg_soln = optimize(mu=(mu[0], mu_p2),
                              sigma=(cov[0], cov_p2),
                              alpha=(0.05, 0.10),
                              return_target=(target_returns, target_returns),
                              costs=cost,
                              prices=prices,
                              gamma=risk_tolerance[2])

    return soln, agg_soln
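
A hedged standalone sketch of the annualisation step above: the geometric mean of the daily growth factors (returns plus one), minus one, is the compound per-day return, which is then scaled by the number of business days in the window.

import numpy as np
from scipy.stats import gmean

daily_returns = np.array([0.010, -0.020, 0.015])
compound_daily = gmean(daily_returns + 1) - 1
days = 126                                   # assumed ~6 months of trading days
print(compound_daily * days)                 # the target return used above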
Example #46
 def _all_shared_index(self):
     try:
         # Getting data of weighted Volume average of less than 5 mins
         wvl = []
         query = ("SELECT DISTINCT stock FROM RECORDS")
         self.dbRecord.cur.execute(query)
         result = self.dbRecord.cur.fetchall()
         print(result)
         for stock_record in range(len(result)):
             print(result[stock_record][0])
             w = self._get_volume_weighted_average(result[stock_record][0])
             wvl.append(w)
         res = gmean(wvl)
         return res
     except Exception as e:
         print("Database connectivity error" + str(e))
         return None
def get_data_online():
    """get equal length dividend yield and interest rate time series for longest period possible
    dividend yield comes from S&P 500; interest rate from Kenneth French's database"""

    dy_monthly_data = quandl.get("MULTPL/SP500_DIV_YIELD_MONTH")["Value"]

    # Resampling yearly dividend yield and cutting of 2021
    dy_data = dy_monthly_data.resample("Y").apply(lambda x: gmean(x))[:-1]

    datareader = pandas_datareader.famafrench.FamaFrenchReader(
        "F-F_Research_Data_Factors", freq="Y", start=1926)
    int_data = datareader.read()[1][
        "RF"]  # 0 for monthly data; 1 for yearly data

    min_len = min(len(dy_data), len(int_data))

    return (int_data[-min_len:].values / 100), dy_data[-min_len:].values / 100
Example #48
 def set_count(self):
     counts = np.array([gene.count for gene in self.genes.values()])
     stoichs = np.array(
         [gene.stoichiometry for gene in self.genes.values()])
     reactions = np.array(
         [gene.nz_reactions for gene in self.genes.values()])
     # if there is any non-zero protein that is used only in this reaction, filter other zeros
     exclusive_non_zero = False
     if len(counts) > 1 and 0. in counts:
         for (cnt, reac) in zip(counts, reactions):
             if reac == 1 and cnt != 0:
                 exclusive_non_zero = True
     counts = counts / stoichs
     #counts = counts / reactions
     if exclusive_non_zero:
         counts = filter(lambda x: x != 0, counts)
     self.count = gmean(counts)
def my_rescale_sin(value_at_each_time,
                   L=0.02,
                   R=0.98,
                   h=2.5,
                   l=0.2 / 2,
                   silent=True):
    if any(value_at_each_time != 0):
        # I compute the geometric mean from our estimator.
        G = gmean(value_at_each_time)

    else:  # G == 0; this happens if no norm was computed.
        # Then it has to return 0.01 so that it widens all the kernels.
        return np.full(len(value_at_each_time), 0.01)

    L_quant = np.quantile(value_at_each_time, L)
    R_quant = np.quantile(value_at_each_time, R)

    if not L_quant < G < R_quant:
        raise Error_not_allowed_input(
            "L < G < R for the well definiteness of the function.")

    if not silent:
        print("Left boundary : ", L_quant)
    if not silent:
        print("Right boundary : ", R_quant)

    xx = value_at_each_time - G

    ans = 0
    scaling1 = math.pi / (G - L_quant)
    scaling2 = math.pi / (R_quant - G)
    # I fix the part outside of my interest, to be the final value, h.
    # This part corresponds to math.pi.
    # I also need the scaling by +h/2 given by math.pi

    # xx2 and xx3 are the cosinus, but they are different cosinus.
    # So I fix them where I don't want them to move at 0 and then I can add the two functions.
    my_xx2 = np.where((xx * scaling1 > -math.pi) & (xx * scaling1 < 0),
                      xx * scaling1, math.pi)  # left
    my_xx3 = np.where((xx * scaling2 > 0) & (xx * scaling2 < math.pi),
                      xx * scaling2, math.pi)  # right
    ans += -(h - l) / 2 * np.cos(my_xx2)
    ans += -(h - l) / 2 * np.cos(my_xx3)

    ans += l  # avoid infinite width kernel, with a minimal value.
    return ans
Example #50
    def evaluate(self):
        prob = np.array([])

        for inputs, targets in self.data_loader:
            predicted = self.model.predict(inputs)

            if type(predicted) is list:
                # if predicted == [[label, label, ...], [...], ...]
                #    targets = [[label, label, ...], [...], ...]
                predicted = np.concatenate(predicted)
                targets = np.concatenate(targets)
            predicted = predicted[[range(len(targets)), targets]]
            if type(predicted) is not np.ndarray:
                predicted = predicted.cpu().numpy()
            prob = np.concatenate((prob, predicted))
        perplexity = gmean(1 / prob)
        return perplexity
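
A hedged sketch of the identity evaluate() above relies on: the geometric mean of the reciprocal target probabilities equals the exponential of the mean negative log-likelihood, which is the usual definition of perplexity.

import numpy as np
from scipy.stats import gmean

prob = np.array([0.25, 0.5, 0.125])
perplexity = gmean(1.0 / prob)
assert np.isclose(perplexity, np.exp(-np.mean(np.log(prob))))
print(perplexity)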
Example #51
    def __init__(self, n_model, norm=False):

        # retrieve sed and wavelength data from model
        self.n_model = n_model
        self.filepath = model_path + 'model' + str(
            n_model) + '/data_th/sed_rt.fits'

        hdulist = fits.open(self.filepath)
        sedlist = hdulist[0].data[0][0]
        self.wavelength = hdulist[1].data

        # normalize according to the highest inclination
        if norm:
            mean_range = np.where((2 < self.wavelength)
                                  & (self.wavelength < 10))
            norm_gmean = stats.gmean(sedlist[-1][mean_range])
            self.seds = [sed / norm_gmean for sed in sedlist]

        # for non-normalized case
        else:
            self.seds = sedlist

        # store the model image
        impath = self.filepath[:-19] + 'data_0.6/RT.fits'
        self.images = fits.open(impath)[0].data[0][0]

        # store convolved and rebinned images
        # convolve images with rebinned PSF (20mas/pixel)
        convolved_prebinned_images = [
            convolve_fft(i, PSF_20mas) for i in self.images
        ]
        # rebin to 40mas/pixel via averaging
        binpix = int((self.images[0].shape[0] - 1) / 2)
        self.convolved_images = [
            rebin(i[:-1, :-1], (binpix, binpix))
            for i in convolved_prebinned_images
        ]

        # generate model inclinations (are they stored in the files?)
        i_0, i_f = np.radians(45), np.radians(90)
        cosi = np.linspace(np.cos(i_0), np.cos(i_f), 15)
        self.inclinations = np.degrees(np.arccos(cosi))

        # store parameters
        self.parameters = get_model_parameters(n_model)
Beispiel #52
0
def get_colors(image: np.ndarray, n_colors: int, write_pca: bool = False):
    pca = PCA(n_components=3)
    print(image.shape)
    X = np.array(image).reshape(-1, 3)
    pca.fit(X)
    # Randomly keep roughly 1 pixel in 1000 to speed up PCA and clustering.
    samples = np.random.randint(-1000, 2, size=X.shape[0])
    index = np.where(samples > 0, np.ones(shape=X.shape[0]),
                     np.zeros(shape=X.shape[0])).astype(int).nonzero()

    Y = pca.transform(X[index]).astype(int)

    X_pca_0 = Y[:, 0]
    X_pca_1 = Y[:, 2]

    #     good_colors = np.where(X_pca_0 > 150)[0]
    more_good_colors = np.apply_along_axis(f2, 1, X[index]).nonzero()

    # plot samples in eigenspace
    fig = plt.figure(figsize=(12.8, 9.6))
    ax = fig.add_subplot(111)
    ax.scatter(X_pca_0, X_pca_1, c=X[index] / 255)
    if write_pca:
        ax.figure.savefig('./kmeans-pca_output.png', format='png')
    # plt.show()

    cluster = KMeans(n_colors)
    cluster.fit(X[index][more_good_colors])

    clustered_colors = cluster.predict(X[index][more_good_colors])
    print(clustered_colors.shape)
    color_map = dict()
    for label, color in zip(clustered_colors, X[index][more_good_colors]):
        try:
            color_map[label].append(color)
        except KeyError:
            color_map[label] = [color]

    print('cluster geometric means')
    for label, members in color_map.items():
        value = gmean(np.array(members))

        print(label, len(members), value)
        color_map[label] = value

    return color_map.values()
Beispiel #53
0
def analyze(predictions, test):
    frame = test["dataframe"]
    oracle = np.array(frame["oracle_enc"], dtype=bool)
    incorrect = np.logical_xor(predictions, oracle)
    correct = np.logical_not(incorrect)

    zero_r = Counter(oracle).most_common(1)[0][0]
    zero_r_key = enc2key(zero_r)

    speedups = np.array([
        min(d["runtime_cpu"], d["runtime_gpu"]) / d[enc2key(p)]
        for p, d in zip(predictions,
                        frame.T.to_dict().values())
    ])
    speedup_avg = speedups.mean()
    speedup_geo = gmean(speedups)

    accuracy = sum(correct) / len(test["dataframe"])

    confusion_matrix = np.zeros((2, 2), dtype="int32")
    confusion_matrix[0][0] = sum(
        np.logical_and(np.logical_not(predictions), np.logical_not(oracle)))
    confusion_matrix[0][1] = sum(
        np.logical_and(predictions, np.logical_not(oracle)))
    confusion_matrix[1][0] = sum(
        np.logical_and(np.logical_not(predictions), oracle))
    confusion_matrix[1][1] = sum(np.logical_and(predictions, oracle))

    assert (confusion_matrix.sum() == len(test["dataframe"]))
    assert (confusion_matrix[0][1] +
            confusion_matrix[1][1] == sum(predictions))
    assert (confusion_matrix[0][1] + confusion_matrix[1][0] == sum(incorrect))
    assert (confusion_matrix[0][0] + confusion_matrix[1][1] == sum(correct))
    print(confusion_matrix)

    return {
        "accuracy": accuracy,
        "correct": correct,
        "confusion_matrix": confusion_matrix,
        "speedups": speedups,
        "speedup_min": min(speedups),
        "speedup_max": max(speedups),
        "speedup_avg": speedup_avg,
        "speedup_geo": speedup_geo,
    }
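A short illustration, with made-up numbers rather than the benchmark data above, of why the report includes a geometric mean of the speedups next to the arithmetic mean: for ratios, the geometric mean treats an n-fold speedup and an n-fold slowdown symmetrically.

import numpy as np
from scipy.stats.mstats import gmean

speedups = np.array([2.0, 0.5])  # one 2x win, one 2x loss (made up)
print(speedups.mean())           # 1.25 -- suggests a net improvement
print(gmean(speedups))           # 1.0  -- reports no overall change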
Beispiel #54
0
def main():

    sol = dict()
    for method in ['dopri5', 'adams']:
        for tol in [1e-3, 1e-6, 1e-9]:
            print('======= {} | tol={:e} ======='.format(method, tol))
            nfes = []
            times = []
            errs = []
            for c in ['A', 'B', 'C', 'D', 'E']:
                for i in ['1', '2', '3', '4', '5']:
                    diffeq, init, _ = getattr(detest, c + i)()
                    t0, y0 = init()
                    diffeq = NFEDiffEq(diffeq)

                    if c + i not in sol:
                        sol[c + i] = odeint(diffeq,
                                            y0,
                                            torch.stack(
                                                [t0, torch.tensor(20.)]),
                                            atol=1e-12,
                                            rtol=1e-12,
                                            method='dopri5')[1]
                        diffeq.nfe = 0

                    start_time = time.time()
                    est = odeint(diffeq,
                                 y0,
                                 torch.stack([t0, torch.tensor(20.)]),
                                 atol=tol,
                                 rtol=tol,
                                 method=method)
                    time_spent = time.time() - start_time

                    error = torch.sqrt(torch.mean((sol[c + i] - est[1])**2))

                    errs.append(error.item())
                    nfes.append(diffeq.nfe)
                    times.append(time_spent)

                    print('{}: NFE {} | Time {} | Err {:e}'.format(
                        c + i, diffeq.nfe, time_spent, error.item()))

            print('Total NFE {} | Total Time {} | GeomAvg Error {:e}'.format(
                np.sum(nfes), np.sum(times), gmean(errs)))
def carga_pi(data):
    '''
    Receives a dataset for a specific product category of a region;
    cleans the data and determines the different varieties within the product
    category; loads the average prices of these products, computes their
    variation relative to the previous month, and takes the geometric mean of
    the per-variety variations. Returns a table with the "pit" values.
    '''
    x = data.split('_')
    x = x[1].split('/')
    producto = x[-1]
    df = limpia_data(data)
    variedades, n = encuentra_variedades(df)
    tabla_precios = precios_promedio(df, variedades)
    tabla_var = variaciones_precios_promedio(tabla_precios, n)
    tabla_var[producto] = gmean(tabla_var, axis=1)
    tabla_var = tabla_var[1:]
    return tabla_var
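A minimal sketch of the final aggregation step only, using a made-up table of month-over-month price relatives for three hypothetical varieties; the helpers limpia_data, encuentra_variedades, precios_promedio and variaciones_precios_promedio are assumed to produce a table of roughly this shape.

import pandas as pd
from scipy.stats.mstats import gmean

# Made-up month-over-month price relatives per variety.
tabla_var = pd.DataFrame({
    'variedad_1': [1.02, 0.99, 1.05],
    'variedad_2': [1.01, 1.03, 0.98],
    'variedad_3': [1.00, 1.02, 1.04],
}, index=['2020-02', '2020-03', '2020-04'])

# The geometric mean across varieties gives the category-level "pit" value.
tabla_var['producto'] = gmean(tabla_var, axis=1)
print(tabla_var)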
Beispiel #56
0
def saatiMethod():
    relationshipMatrix = []
    firstRow = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
    relationshipMatrix.append(firstRow)
    for i in range(2, 10):
        nextRow = []
        for j in range(9):
            value = firstRow[j] / i
            nextRow.append(value)
        relationshipMatrix.append(nextRow)
    gmeanList = []
    for row in relationshipMatrix:
        gmeanList.append(gmean(row))
    priorityVectors = []
    for g in gmeanList:
        pVector = g / sum(gmeanList)
        priorityVectors.append(pVector)
    return priorityVectors
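A quick usage sketch, assuming gmean is imported as in the other examples: the function appears to implement the row-geometric-mean prioritization of a Saaty-style pairwise-comparison matrix, so the returned values should form a weight vector that sums to one.

weights = saatiMethod()
print(len(weights))             # 9 priorities, one per matrix row
print(abs(sum(weights) - 1.0))  # close to zero: the weights are normalized
print(weights[0] > weights[-1]) # True: the first row dominates the matrix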
Beispiel #57
0
def normalize2(data, mask):
    (nC, nB) = mask.shape
    handled = full(shape=(nB,), fill_value=False, dtype=bool)
    print("Starting normalization: \n\n")
    for c in range(0, nC):
        if (all(handled)):
            break
        factor = stats.gmean(data[c, handled]) if any(handled) else 1
        print(" Handled: ", handled)
        handle = mask[c, :] & ~handled
        print("Handling: ", handle)
        if (any(handle)):
            data[:, handle] *= kron(factor / data[c, handle], ones((nC, 1)))
            handled = handled | handle

    return data, mask

def discardseasons(df, seasons, gdthres=2.0, smin=5):
    """
    Calculate peak variability and keep only seasons whose peak stays within a
    geometric deviation of gdthres from the geometric mean of the peaks.
    Always maintain at least smin seasons.

    :param df: data frame with seasons by columns
    :param seasons: list of column names corresponding to each season
    :param gdthres: maximum geometric deviation from median
    :param smin: minimum number of seasons maintained

    :return drop_seasons: list with seasons to be dropped
    """

    drop_seasons = []
    seasons = seasons.copy()

    # Drop null seasons
    series = df[seasons].max()
    drop_seasons = list(series[series == 0].index)
    series.drop(drop_seasons, axis=0, inplace=True)
    # If resulting data contains less than smin seasons, return
    nseasons = len(series)
    nmax = nseasons - smin
    if nmax <= 0:
        return drop_seasons

    ####### Test removing one by one ######
    # Take log of geometric deviation threshold for simplicity
    gdthres = np.log(gdthres)
    for n in range(nmax):
        # Current maxima
        tmp_series = df[list(set(seasons).difference(drop_seasons))].max()
        # Grab current geometric mean
        series_gmean = np.log(gmean(tmp_series))
        # Calculate maximum geometric deviation from geometric mean
        mgd = abs(np.log(tmp_series) - series_gmean).max()

        if mgd > gdthres:
            idx = abs(np.log(tmp_series) - series_gmean).idxmax()
            drop_seasons.append(idx)

    return drop_seasons
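A minimal usage sketch with made-up seasonal curves (column names and values are purely illustrative); discardseasons itself relies on numpy and gmean being imported at module level as in the snippet above. The all-zero season is dropped immediately, the season whose peak deviates from the geometric mean of the peaks by more than gdthres is flagged next, and at least smin seasons are kept.

import pandas as pd

# Made-up weekly incidence curves for four seasons.
df = pd.DataFrame({
    '2015': [1, 5, 10, 5, 1],
    '2016': [2, 6, 12, 6, 2],
    '2017': [10, 50, 100, 50, 10],  # peak roughly 10x the other seasons
    '2018': [0, 0, 0, 0, 0],        # null season
})

print(discardseasons(df, ['2015', '2016', '2017', '2018'], gdthres=2.0, smin=2))
# Expected to flag '2018' (null) and '2017' (outlier peak) for dropping.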
Beispiel #59
0
    def getGeometricMAP(self, depth=1000, trec_eval=True):
        """
            The Geometric Mean Average Precision uses the same per-topic values as MAP (mean average precision),\n
            but aggregates them with a geometric mean over topics instead of an arithmetic mean.
            Note that, as in the original trec_eval, the Geometric MAP is only reported in the summary over all topics, not
            for individual topics.

            Params
            -------
            depth: the evaluation depth. Default = 1000
            trec_eval: set to True if result should be the same as trec_eval, e.g., sort documents by score first. Default = True.

            Returns
            --------
            The Geometric Mean Average Precision for all topics. Topics with MAP = 0 are replaced by MAP = GMEAN_MIN (default = .00001)
        """
        from scipy.stats.mstats import gmean
        maps = self.getMAP(depth=depth, trec_eval=trec_eval, per_query=True)
        maps = maps.replace(0.0, self.GMEAN_MIN)
        return gmean(maps)[0]
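A stand-alone illustration of the aggregation described in the docstring, with made-up per-topic average-precision scores and the same zero-replacement idea (GMEAN_MIN is assumed to be 0.00001 here):

import numpy as np
from scipy.stats.mstats import gmean

GMEAN_MIN = 0.00001
# Made-up per-topic average precision; one topic scored zero.
per_topic_ap = np.array([0.40, 0.25, 0.0, 0.60])

map_score = per_topic_ap.mean()  # arithmetic MAP, about 0.31
gmap_score = gmean(np.where(per_topic_ap == 0.0, GMEAN_MIN, per_topic_ap))
print(map_score, gmap_score)     # the zero topic pulls the geometric MAP to about 0.03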