def get_correlations(y, yhat, yhat_sim, cbool, kernels=np.power(2, range(1, 10))):
    if yhat.ndim == 1:
        yhat = yhat[np.newaxis]
    if yhat_sim.ndim == 1:
        yhat_sim = yhat_sim[:, np.newaxis]
    spt = neo.SpikeTrain(np.where(y)[0] * pq.ms, sampling_rate=pq.kHz,
                         t_stop=y.shape[0] * pq.ms)
    rate = [elephant.statistics.instantaneous_rate(
                spt, sampling_period=pq.ms,
                kernel=elephant.kernels.GaussianKernel(x * pq.ms))
            for x in kernels]
    R = {}
    R['yhat'] = [scipy.corrcoef(x.magnitude.ravel()[cbool], yhat.ravel()[cbool])[0, 1]
                 for x in rate]
    sim_rate = np.mean(yhat_sim, axis=1)
    R['yhat_sim'] = [scipy.corrcoef(x.magnitude.ravel()[cbool], sim_rate.ravel()[cbool])[0, 1]
                     for x in rate]
    return (R, kernels)
def alignAndCompareMotifs(motif1, motif2, reportAll=False, tryAllAlignments=True,
                          reverseComp=True, quitThreshold=None, normalizeRows=True,
                          fillValue=.25):
    """
    Compare the PWM's for two motifs by calculating their correlation coefficient.
    By default, all possible alignments and orientations will be tried and the top
    coefficient will be reported.

    fillValue may be a number, or a 4-element array with nuc frequencies

    Returns (corrCoef, motif2_relative_posn, motif2_orientation) for the best
    alignment, or the entire list if reportAll=True.
    """
    pwm1, pwm2 = motif1.matrix, motif2.matrix
    if normalizeRows:
        # make sum in each row = 1
        pwm1, pwm2 = map(normalizePwmRows, [pwm1, pwm2])
    alignsToTry = xrange(-len(motif2) + 1, len(motif1) - 1) if tryAllAlignments else [0]  # all possible shifts or no shifting
    results = []
    for curOffset in alignsToTry:
        curPwm1, curPwm2 = map(scipy.array, extendPWMs(pwm1, pwm2, curOffset, fillValue))
        # flatten arrays and take 1-dimensional correlation between them
        corrCoef = scipy.corrcoef(curPwm1.ravel(), curPwm2.ravel())[0, 1]  # top-right is correlation between matrices
        results.append([corrCoef, curOffset, 1])
        if quitThreshold is not None and corrCoef > quitThreshold:
            # return immediately if quit threshold has been passed
            break
        if reverseComp:
            curPwm2 = scipy.array(reverseComplement(curPwm2))
            corrCoef = scipy.corrcoef(curPwm1.ravel(), curPwm2.ravel())[0, 1]  # top-right is correlation between matrices
            results.append([corrCoef, curOffset, -1])
            if quitThreshold is not None and corrCoef > quitThreshold:
                # return immediately if quit threshold has been passed
                break
    if not reportAll:
        results = scipy.array(results)
        best = results[results[:, 0].argmax(), :]  # choose the result (row) with the best corrCoef
        return best
    else:
        return results
def Corr(GDP, I, C):
    m = sp.shape(GDP)[1]
    GDPIcorr = []
    GDPCcorr = []
    for i in range(0, m):
        gdp = GDP[:, i]
        inv = I[:, i]
        con = C[:, i]
        # Correlation between output and investment for each series
        gdpi = sp.corrcoef(gdp, inv)
        GDPIcorr.append(gdpi[0, 1])
        # Correlation between output and consumption for each series
        gdpc = sp.corrcoef(gdp, con)
        GDPCcorr.append(gdpc[0, 1])
    # Mean and standard deviation of correlation between GDP and
    # Investment and Consumption over total number of simulations
    GDPICORR = sp.array(GDPIcorr)
    gdpimean = sp.mean(GDPICORR)
    gdpistdev = sp.std(GDPICORR)
    GDPCCORR = sp.array(GDPCcorr)
    gdpcmean = sp.mean(GDPCCORR)
    gdpcstdev = sp.std(GDPCCORR)
    sp.savetxt('GDPICORR.csv', GDPICORR)
    sp.savetxt('GDPCCORR.csv', GDPCCORR)
    print "The mean and standard deviation between GDP and"
    print "Investment and GDP and Consumption followed by"
    print "The lists of each correlation coefficient for"
    print "each series are saved in csv files"
    return gdpimean, gdpistdev, gdpcmean, gdpcstdev
def word_party_correlations(folder='model'):
    stopwords = codecs.open("stopwords.txt", "r", "utf-8").readlines()[5:]
    stops = map(lambda x: x.lower().strip(), stopwords)
    # using now stopwords and filtering out digits
    bow = TfidfVectorizer(min_df=2)
    datafn = folder + '/textdata/rawtext.pickle'
    data = cPickle.load(open(datafn))
    bow = bow.fit(chain.from_iterable(data.values()))
    # create numerical labels
    Y = hstack(map((lambda x: ones(len(data[data.keys()[x]])) * x), range(len(data))))
    # create data matrix
    for key in data.keys():
        data[key] = bow.transform(data[key])
    X = vstack(data.values())
    # map sentiment vector to bow space
    words = load_sentiment()
    sentiment_vec = zeros(X.shape[1])
    for key in words.keys():
        if bow.vocabulary_.has_key(key):
            sentiment_vec[bow.vocabulary_[key]] = words[key]
    # do sentiment analysis
    sentiments = X.dot(sentiment_vec)
    # compute label-BoW-tfidf-feature correlation
    lb = LabelBinarizer()
    partylabels = zscore(lb.fit_transform(Y), axis=0)
    # sentiment vs party correlation
    sentVsParty = corrcoef(partylabels.T, sentiments)[-1, :-1]
    fn = folder + '/sentiment_vs_party.json'
    for key in range(len(data.keys())):
        print "Sentiment vs Party %s: %0.2f" % (data.keys()[key], sentVsParty[key])
    json.dump(dict(zip(data.keys(), sentVsParty)), open(fn, 'wb'))
    wordidx2word = dict(zip(bow.vocabulary_.values(), bow.vocabulary_.keys()))
    allcors = dict(zip(data.keys(), [[]] * len(data.keys())))
    # this is extremely cumbersome and slow, ...
    # but computing the correlations naively on the matrices
    # requires densifying the matrix X, which is memory intense
    for partyidx in range(len(data.keys())):
        cors_words = []
        print 'Computing correlations for %s' % data.keys()[partyidx]
        for wordidx in range(X.shape[-1]):
            cors = corrcoef(X[:, wordidx].todense().flatten(), partylabels[:, partyidx])[1, 0]
            if abs(cors) > .01:
                cors_words.append((wordidx2word[wordidx], cors))
        allcors[data.keys()[partyidx]] = dict(cors_words)
    fn = folder + '/words_correlations.json'
    json.dump(dict(allcors), open(fn, 'wb'))
def weighted_average_aligned_runs(self, sources, mixing):
    '''
    Averages one aligned ICA run and calculates the reproducibility for each
    component. This version does not add only super-threshold CCs to the
    reproducibility index, and it uses a weighted average to form the average
    components. The weights are defined as w_i = sum_{j neq i} SCC(i,j).
    '''
    rep = np.triu(np.abs(corrcoef(sources)), 1).sum() / (0.5 * self.K * (self.K - 1))
    rWeights = np.asarray([(np.abs(corrcoef(sources)[j, :]).sum() - 1.0) / (sources.shape[0] - 1)
                           for j in range(0, sources.shape[0])])[:, np.newaxis]
    return (((rWeights * sources).sum(axis=0)) / (rWeights.sum()),
            ((mixing * rWeights.T).sum(axis=1)) / (rWeights.sum()),
            rep)
def selective_average_aligned_runs(self, sources, mixing):
    '''
    Averages one aligned ICA run and calculates a reproducibility index.
    This version uses the original definition in Yang et al.
    '''
    # threshold for inclusion
    thresh = 0.7
    corrsToSum = np.triu(np.abs(corrcoef(sources)), 1).flatten()
    rep = (corrsToSum[np.nonzero(corrsToSum > thresh)].sum()) / (0.5 * self.K * (self.K - 1))
    # now only add a component to the average if there is at least one correlation with the other RCs > threshold
    # the > 1 statement is because the diagonal elements are always 1.0, so there will always be at least one
    # cross-correlation (namely the self-correlation) which exceeds the threshold
    toInclude = ((np.abs(corrcoef(sources)) > thresh).sum(axis=0) > 1)
    return sources[toInclude, :].mean(axis=0), mixing[:, toInclude].mean(axis=1), rep
def multi_glasso_train(beta1_current, chrom_betas1, chrom_ld_dict, ld_radius,
                       n_indV, tune_idx, Y, X, lambda1, lambda2, chr_list,
                       num_iter=60):
    print "Starting training with lambda1 = %.5f and lambda2 = %.5f" % (lambda1, lambda2)
    n_tune = len(tune_idx)
    predicted0 = np.zeros(n_tune)
    for chrom_str in chromosomes_list:
        if chrom_str in chr_list:
            predicted0 += sp.dot(beta1_current[chrom_str][:, 0], X[chrom_str][:, tune_idx])
    tune_cor_old = sp.corrcoef(Y[tune_idx], predicted0)[0, 1]
    print "Tuning COR of initial: %.3f" % tune_cor_old
    for k in range(num_iter):
        predicted1 = np.zeros(n_tune)
        for chrom_str in chromosomes_list:
            if chrom_str in chr_list:
                beta1_current[chrom_str] = inner_iter(
                    beta_hats1=chrom_betas1[chrom_str],
                    n_indV=n_indV,
                    lambda1=lambda1,
                    lambda2=lambda2,
                    start_betas1=beta1_current[chrom_str],
                    ld_radius=ld_radius,
                    ld_dict1=chrom_ld_dict[chrom_str],
                )
                predicted1 += sp.dot(beta1_current[chrom_str][:, 0], X[chrom_str][:, tune_idx])
        #tune_err_new = np.mean((y1[tune_idx] - predicted)**2)
        tune_cor_new = sp.corrcoef(Y[tune_idx], predicted1)[0, 1]
        if np.isnan(tune_cor_new):
            break
        print "Tuning COR at step %d: %.3f" % (k, tune_cor_new)
        if tune_cor_new <= tune_cor_old:
            break
        else:
            tune_cor_old = tune_cor_new
    return beta1_current, tune_cor_new
def pearson(X, Y=None):
    """Compute the Pearson correlation between `X` and `Y`.

    Parameters
    ----------
    X : array_like or sparse matrix
        with shape (`n_fprints_X`, `n_bits`).
    Y : array_like or sparse matrix, optional
        with shape (`n_fprints_Y`, `n_bits`).

    Returns
    -------
    pearson : array of shape (`n_fprints_X`, `n_fprints_Y`)

    See Also
    --------
    soergel: Soergel similarity for non-binary data
    cosine, dice, tanimoto
    """
    X, Y = _check_array_pair(X, Y)
    Xlen = X.shape[0]
    if issparse(X):
        X = vstack((X, Y), format="csr")
        X = X - X.mean(axis=1)
        cov = (X * X.T) / (X.shape[1] - 1.0)
        d = np.sqrt(np.diag(cov))
        with np.errstate(divide="ignore"):  # handle 0 in denominator
            pearson = cov / np.outer(d, d)
    else:
        with np.errstate(divide="ignore"):  # handle 0 in denominator
            pearson = scipy.corrcoef(X, Y)
    return np.asarray(np.nan_to_num(pearson[:Xlen, Xlen:]))
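# Illustrative usage sketch (not from the original project): the dense branch of
# pearson() above delegates to corrcoef on the stacked fingerprints and then
# slices out the cross block between the X rows and the Y rows. The tiny
# fingerprints below are made-up data; np.corrcoef stands in for the
# module-level scipy.corrcoef import assumed by the function.
import numpy as np

X_demo = np.array([[1., 0., 1., 1.],
                   [0., 1., 1., 0.]])                  # two fingerprints
Y_demo = np.array([[1., 0., 0., 1.],
                   [1., 1., 1., 0.],
                   [0., 0., 1., 1.]])                  # three fingerprints
full = np.corrcoef(np.vstack((X_demo, Y_demo)))        # (2+3) x (2+3) correlation matrix
cross = np.nan_to_num(full[:X_demo.shape[0], X_demo.shape[0]:])  # 2 x 3 X-vs-Y block
print(cross.shape)                                     # (2, 3)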
def calculate_stock_correlation(data):
    """
    This function should take a list containing two lists of the form returned
    by get_yahoo_data (list of date, adj. close tuples) and return the
    correlation of the daily returns as defined above.
    """
    apple_returns = []
    google_returns = []
    apple_data = data[0]
    google_data = data[1]
    cm = apple_data[0][1]
    for i in range(1, len(apple_data)):
        cn = apple_data[i][1]
        daily_return = (cn - cm) / cm
        apple_returns.append(daily_return)
        cm = cn
    cm = google_data[0][1]
    for i in range(1, len(google_data)):
        cn = google_data[i][1]
        daily_return = (cn - cm) / cm
        google_returns.append(daily_return)
        cm = cn
    corr_matrix = scipy.corrcoef(google_returns, apple_returns)
    corr_value = corr_matrix[0][1]
    return corr_value
def get_correlations(self, pids=None):
    """
    Returns correlation matrix between traits.

    All traits are used if pids is left empty.
    """
    import bisect
    if not pids:
        pids = sorted(self.phen_dict.keys())
    num_traits = len(pids)
    corr_mat = sp.ones((num_traits, num_traits))
    for i, pid1 in enumerate(pids):
        pd = self.get_avg_value_dict(pid1)
        ets1 = pd['ecotypes']
        pvs1 = pd['values']
        for j, pid2 in enumerate(pids[:i]):
            pd = self.get_avg_value_dict(pid2)
            ets2 = pd['ecotypes']
            pvs2 = pd['values']
            common_ets = set(ets1).intersection(set(ets2))
            ets_ix1 = map(ets1.index, common_ets)
            ets_ix2 = map(ets2.index, common_ets)
            vs1 = [pvs1[et_i] for et_i in ets_ix1]
            vs2 = [pvs2[et_i] for et_i in ets_ix2]
            corr_mat[i, j] = sp.corrcoef(vs1, vs2)[0, 1]
            corr_mat[j, i] = corr_mat[i, j]
    return corr_mat, pids
def corr(X, Y):
    """Compare two histories event by event and give a similarity score.

    Warning
    -------
    Note the asymmetry of X and Y; the latter is inferred and can therefore
    contain ties. We add an additional variable to denote the time of birth
    of an edge.

    Parameters
    ----------
    X : list of tuples
        Reference history vector; tuples represent edges. Position corresponds to time.
    Y : list of pairs
        Inferred history vector with ranking information.
        The first entry of the pair contains an edge (pair).
        The second entry contains the rank of the edge (float).

    Return
    ------
    score : float
        Correlation of generated and inferred history.
    """
    # Augment reference history with arrival times
    X = [(_, t) for t, _ in enumerate(X)]
    # Sort based on edges
    X = sorted(X, key=lambda x: x[0])
    Y = sorted(Y, key=lambda x: x[0])
    corr = sp.corrcoef([x[1] for x in X], [y[1] for y in Y])[0, 1]
    return corr
def cal_coff(array, indicator):
    axis = indicator == 0
    if axis:
        length = array.shape[1]
    else:
        length = array.shape[0]
    for x in xrange(0, length):
        for y in xrange(0, length):
            if x != y:
                if axis:
                    yield sp.corrcoef(array[:, x], array[:, y])
                else:
                    yield sp.corrcoef(array[x, :], array[y, :])
def run_STM_CV(Xc, yc, cbool_bin, yhat):
    num_components = 3
    num_features = 24
    k = 20
    KF = sklearn.model_selection.KFold(k, shuffle=True)
    yhat_model = np.zeros(yc.shape[0])
    MODELS = []
    count = 0
    for train_index, test_index in KF.split(Xc):
        count += 1
        print('\t{} of {} crossvalidations'.format(count, k))
        model = cmt.models.STM(Xc.shape[1], 0,
                               num_components,
                               num_features,
                               cmt.nonlinear.ExponentialFunction,
                               cmt.models.Poisson)
        retval = model.train(Xc[train_index, :].T, yc[train_index, :].T, parameters=get_params())
        if not retval:
            print('Max_iter ({:.0f}) reached'.format(get_params()['max_iter']))
        MODELS.append(model)
        yhat_model[test_index] = model.predict(Xc[test_index].T)
    yhat[cbool_bin] = yhat_model
    yhat[yhat > binsize] = binsize
    r = scipy.corrcoef(yhat[cbool_bin].ravel(), yc.ravel())[0, 1]
    print('\t\t corrcoef = {}'.format(r))
    return r
def pcor(X, Y, Z):
    """
    computes the correlation matrix of X and Y conditioning on Z
    """
    if X.ndim == 1:
        X = X[:, SP.newaxis]
    if Y.ndim == 1:
        Y = Y[:, SP.newaxis]
    if Z is None:
        return STATS.pearsonr(X, Y)
    if Z.ndim == 1:
        Z = Z[:, SP.newaxis]
    nSamples = X.shape[0]
    betaX, _, _, _ = LA.lstsq(Z, X)
    betaY, _, _, _ = LA.lstsq(Z, Y)
    Xres = X - SP.dot(Z, betaX)
    Yres = Y - SP.dot(Z, betaY)
    corr_cond = SP.corrcoef(Xres[:, 0], Yres[:, 0])[0, 1]
    dz = Z.shape[1]  # dimension of conditioning variable
    df = max(nSamples - dz - 2, 0)  # degrees of freedom
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        tstat = corr_cond / SP.sqrt(1.0 - corr_cond ** 2)  # calculate t statistic
        tstat = math.sqrt(df) * tstat
        pv_cond = 2 * t.cdf(-abs(tstat), df, loc=0, scale=1)  # calculate p value
    return corr_cond, pv_cond
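# Illustrative sketch (synthetic data, not part of the original module) of the
# residualization idea behind pcor() above: regress x and y on the conditioning
# variable z, then correlate the residuals. All names and values here are made up.
import numpy as np

rng = np.random.RandomState(0)
z = rng.randn(500, 1)                          # shared driver
x = 2.0 * z[:, 0] + rng.randn(500)
y = -1.5 * z[:, 0] + rng.randn(500)
bx = np.linalg.lstsq(z, x[:, None])[0]         # regression coefficients of x on z
by = np.linalg.lstsq(z, y[:, None])[0]
x_res = x - np.dot(z, bx)[:, 0]
y_res = y - np.dot(z, by)[:, 0]
print(np.corrcoef(x, y)[0, 1])                 # raw correlation, dominated by z
print(np.corrcoef(x_res, y_res)[0, 1])         # conditional correlation, near zero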
def selectTraits(self, phenoMAF=None, corrMin=None, nUnique=False):
    """
    use only a subset of traits

    filter out all individuals that have missing values for the selected ones
    """
    self.idx_samples = SP.ones(self.n_s, dtype=bool)
    # filter out nan samples
    self.idx_samples[SP.isnan(self.Y[:, self.idx_traits]).any(1)] = False
    # filter out phenotypes that are not diverse enough
    if phenoMAF is not None:
        expr_mean = self.Y[self.idx_samples].mean(0)
        expr_std = self.Y[self.idx_samples].std(0)
        z_scores = SP.absolute(self.Y[self.idx_samples] - expr_mean) / SP.sqrt(expr_std)
        self.idx_traits[(z_scores > 1.5).mean(0) < phenoMAF] = False
    # use only correlated phenotypes
    if corrMin is not None and self.Y.shape[1] > 1:
        corr = SP.corrcoef(self.Y[self.idx_samples].T)
        corr -= SP.eye(corr.shape[0])
        self.idx_traits[SP.absolute(corr).max(0) < corrMin] = False
    # filter out binary phenotypes
    if nUnique and self.Y.shape[1] > 1:
        for i in range(self.Y.shape[1]):
            if len(SP.unique(self.Y[self.idx_samples][:, i])) <= nUnique:
                self.idx_traits[i] = False
    LG.debug('number of traits(before filtering): %d' % self.n_t)
    LG.debug('number of traits(after filtering): %d' % self.idx_traits.sum())
    LG.debug('number of samples(before filtering): %d' % self.n_s)
    LG.debug('number of samples(after filtering): %d' % self.idx_samples.sum())
def simple_supervised_demo():
    print "Simple demo of supervised factor inference"
    # simple object using default simulated dataset; see simple_unsupervised_demo for how it is constructed
    model = get_simple_model_object(expr_file='data/expression_sparse.csv')
    # and prior for which factor regulates which gene. This matrix has entries between 0 and 1.
    # The (g,k) entry represents the probability that gene g is affected by factor k.
    prior = SP.loadtxt("data/prior_sparse.csv", delimiter=",")
    model.setSparsityPrior(prior)  # prior on which factors affect which genes
    model.update()
    for i in range(prior.shape[1]):
        print "Correlation between factor", i, "prior and weight", SP.corrcoef(model.getW()[:, i], prior[:, i])[0, 1], "sum prior", sum(prior[:, i])
def portfolio_var(R, w):
    cor = sp.corrcoef(R.T)
    std_dev = sp.std(R, axis=0)
    n = len(w)  # number of assets
    var = 0.0
    for i in xrange(n):
        for j in xrange(n):
            var += w[i] * w[j] * std_dev[i] * std_dev[j] * cor[i, j]
    return var
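# Hedged check (synthetic returns, illustrative only): the nested loop over
# standard deviations and correlations in portfolio_var() is algebraically the
# same as w' * Cov(R) * w, so the two numbers printed below should agree
# (bias=True matches the ddof=0 default of sp.std used above). Names below are made up.
import numpy as np

rng = np.random.RandomState(1)
R_demo = rng.randn(250, 3) * 0.01              # 250 days of returns for 3 assets
w_demo = np.array([0.5, 0.3, 0.2])             # portfolio weights
cor = np.corrcoef(R_demo.T)
sd = np.std(R_demo, axis=0)
var_loop = sum(w_demo[i] * w_demo[j] * sd[i] * sd[j] * cor[i, j]
               for i in range(3) for j in range(3))
var_matrix = w_demo.dot(np.cov(R_demo.T, bias=True)).dot(w_demo)
print(var_loop, var_matrix)                    # identical up to floating point error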
def calc_vif(data_enc):
    cc = sp.corrcoef(data_enc.values, rowvar=False)
    if cc.ndim < 2:
        return []
    VIF = np.round(np.linalg.inv(cc).diagonal(), 6)
    return sorted(zip(data_enc.columns, VIF), key=lambda item: item[1], reverse=True)
def pearsCorrRavel(Y1, Y2):
    """
    Calculate the Pearson correlation between vec(Y1) and vec(Y2).
    """
    y1 = Y1.ravel()
    y2 = Y2.ravel()
    rv = SP.corrcoef(y1, y2)[0, 1]
    return rv
def correlationMatrix(mdata, linit, lend, nstep):
    lstep = (lend - linit) / nstep
    corr = np.zeros((mdata.shape[0], mdata.shape[0]))
    for length in range(linit, lend, lstep):
        corrs = corrcoef(mdata[:, length:length + lstep])
        corr += corrs
    corr /= nstep
    return corr
def summarize_accuracy(prs_files):
    true_phens = []
    prs_phens = []
    ldpred_phens = []
    tp_prs_rs = []
    tp_ldpred_rs = []
    for prsf in prs_files:
        if os.path.isfile(prsf):
            rt = pd.read_csv(prsf, skipinitialspace=True, index_col=False)
            true_phens.extend(rt['true_phens'])
            prs_phens.extend(rt['raw_effects_prs'])
            ldpred_phens.extend(rt['pval_derived_effects_prs'])
            tp_prs_rs.append(sp.corrcoef(rt['true_phens'], rt['raw_effects_prs'])[0, 1])
            tp_ldpred_rs.append(sp.corrcoef(rt['true_phens'], rt['pval_derived_effects_prs'])[0, 1])
    return (sp.mean(tp_prs_rs), sp.mean(tp_ldpred_rs))
def pca(dat, npca=None, verbose=False):
    if isinstance(dat, sp.ndarray):
        dat = pd.DataFrame(dat)
        names = []
        for i in range(dat.shape[1]):
            names.append("x" + str(i + 1))
        dat.columns = names
    names = list(dat.columns)
    nr = dat.shape[0]
    nc = dat.shape[1]
    r = sp.corrcoef(dat, rowvar=False)
    heikin = dat.mean(axis=0)
    bunsan = dat.var(axis=0, ddof=1)
    sd = sp.sqrt(bunsan)
    eval, evec = linalg.eig(r)
    eval = sp.real(eval)
    rank = rankdata(eval, method="ordinal")
    rank = nc + 1 - rank
    eval2 = eval.copy()
    evec2 = evec.copy()
    for i in range(nc):
        j = sp.where(rank == i + 1)[0][0]
        eval[i] = eval2[j]
        evec[:, i] = evec2[:, j]
    contr = eval / nc * 100
    cum_contr = sp.cumsum(contr)
    fl = (sp.sqrt(eval) * evec)
    for i in range(nc):
        dat.ix[:, i] = (dat.ix[:, i] - heikin[i]) / sd[i]
    fs = sp.dot(dat, evec * sp.sqrt(nr / (nr - 1)))
    if npca is None:
        npca = sp.sum(eval >= 1)
    eval = eval[0:npca]
    cont = eval / nc
    cumc = sp.cumsum(cont)
    fl = fl[:, 0:npca]
    rcum = sp.sum((fl ** 2), axis=1)
    if verbose:
        print(" ", end="")
        for j in range(npca):
            print("{0:>8s}".format("PC" + str(j + 1)), end="")
        print(" Contribution")
        for i in range(nc):
            print("{0:>12s}".format(names[i]), end="")
            for j in range(npca):
                print(" {0:7.3f}".format(fl[i, j]), end="")
            print(" {0:7.3f}".format(rcum[i]))
        print(" Eigenvalue", end="")
        for j in range(npca):
            print(" {0:7.3f}".format(eval[j]), end="")
        print("\nContribution", end="")
        for j in range(npca):
            print(" {0:7.3f}".format(cont[j]), end="")
        print("\nCum.contrib.", end="")
        for j in range(npca):
            print(" {0:7.3f}".format(cumc[j]), end="")
        print()
    return {"r": r, "fl": fl, "eval": eval, "fs": fs[:, 0:npca]}
def coefficients_VIF(self):
    #eps = 1e-20
    x = self._model.model.exog[:, 1:].copy()
    inv_corr = np.linalg.inv(sp.corrcoef(x, rowvar=False))
    diag = list(inv_corr.diagonal())
    if self.include_constant:
        diag = [np.nan] + diag
    return pd.Series(diag, index=self._params_idx)
def generate_vif(csv):
    """Return the Variance Inflation Factors (VIF) for the columns of a given csv."""
    # Create pandas dataframe from csv.
    data = pd.read_csv(csv)
    # Calculate VIF from dataframe.
    cc = sp.corrcoef(data.values, rowvar=False)
    vif = np.linalg.inv(cc)
    v = vif.diagonal()
    return str(v)
def pearsCorrMean(Y1, Y2):
    """
    Calculate the average Pearson correlation between corresponding columns of Y1 and Y2.
    """
    rv = 0
    for ic in range(Y1.shape[1]):
        rv += SP.corrcoef(Y1[:, ic], Y2[:, ic])[0, 1]
    rv /= float(Y1.shape[1])
    return rv
def supervised_prior_comparison_demo():
    print "Supervised factor inference demo, comparing different error rates in prior specification"
    prior = SP.loadtxt("data/prior_sparse.csv", delimiter=",")
    # compare outcomes of inference depending on uncertainty in prior
    for error in (0, 0.01, 0.1, 0.2):
        print "Prior error=", error
        # simple object using default simulated dataset; see simple_unsupervised_demo for how it is constructed
        model = get_simple_model_object(expr_file='data/expression_sparse.csv')
        p = prior
        p[p > 0.5] = (1 - error)
        p[p < 0.5] = error
        model.setSparsityPrior(p)  # prior on which factors affect which genes
        model.update()
        for i in range(prior.shape[1]):
            if SP.isnan(model.getW()).any():
                pdb.set_trace()
            elif SP.isnan(SP.corrcoef(model.getW()[:, i], prior[:, i])[0, 1]):
                pdb.set_trace()
            print "Correlation between factor", i, "prior and weight", SP.corrcoef(model.getW()[:, i], prior[:, i])[0, 1], "sum prior", sum(prior[:, i])
def repeats_test(rsType, num_wheels=1, num_spins_per_wheel=1):
    wheel_segments = numpy.random.randint(low=0, high=100, size=100)
    hits = [0] * len(wheel_segments)
    for i in xrange(num_wheels):
        rs = rsType(wheel_segments)
        for j in xrange(num_spins_per_wheel):
            hits[rs.spin(random.random())] += 1
    corr = scipy.corrcoef(wheel_segments, hits)[0, 1]
    print corr
    return corr
def centrality_correlation(G=None):
    import scipy, pylab, cPickle
    if G == None:
        npr.seed(3)
        random.seed(3)
        des = centralityTreeDesign()
        #des = cdaDesign()  #gives high correlation
        des.setParam('b', 15.)
        des.setParam('q', 0.7)  #gives low correlation
        #des.setParam('b', 2.0)
        #des.setParam('q', 3.0)
        des.fixedParams['nn'] = 300
        G = des.buildNet()
    results = []
    centralities = nx.centrality.brandes_betweenness_centrality(G)
    for node in G:
        if G.degree(node) == 1:
            continue
        #jitter for visualization:
        #results.append((G.degree(node)+npr.rand()*0.4, centralities[node]))
        results.append((G.degree(node), centralities[node]))
    results = np.array(results)
    pylab.rc('text', usetex=True)
    pylab.plot(results[:, 0], results[:, 1], '.')
    #pylab.title(r'lambda=%2.1f'%(rat,))
    pylab.xlabel('Degree')
    pylab.ylabel('Betweenness')
    #fixme: statistics question - what are all the other coefficients?
    corr = scipy.corrcoef(results[:, 0], results[:, 1])[0, 1]
    pylab.figtext(0.2, 0.8, 'Correlation=%f' % corr)
    filename = 'output/correlation_motion.vs.betweenness.pkl'
    outputFile = open(filename, 'wb')
    report = {'results': results, 'correlation': corr}
    cPickle.dump(report, outputFile)
    outputFile.close()
    print
    print 'Pickle: ' + filename + ' written!'
    results = np.array(results)
    print 'Correlation: %.4f' % corr
    try:
        pylab.savefig('output/correlation_motion.vs.betweenness_lambda=' + str(rat) + '_results.eps')
    except:
        print 'Unable to save figure...'
def plot_phen_relatedness(self, k, k_accessions, plot_file_prefix, pids=None):
    import kinship
    import pylab
    import scipy as sp
    from scipy import linalg
    if not pids:
        pids = self.get_pids()
    self.convert_to_averages(pids)
    self.filter_ecotypes_2(k_accessions, pids)
    for pid in pids:
        ets = self.get_ecotypes(pid)
        vals = self.get_values(pid)
        k_m = kinship.prepare_k(k, k_accessions, ets)
        c = sp.sum((sp.eye(len(k_m)) - (1.0 / len(k_m)) * sp.ones(k_m.shape)) * sp.array(k_m))
        k_scaled = (len(k) - 1) * k / c
        p_her = self.get_pseudo_heritability(pid, k_m)
        x_list = []
        y_list = []
        for i in range(len(ets)):
            for j in range(i):
                x_list.append(k_m[i, j])
                y_list.append(vals[i] - vals[j])
        ys = sp.array(y_list)
        ys = ys * ys
        xs = sp.array(x_list)
        phen_name = self.get_name(pid)
        phen_name = phen_name.replace("<i>", "")
        phen_name = phen_name.replace("</i>", "")
        phen_name = phen_name.replace("+", "_plus_")
        phen_name = phen_name.replace("/", "_div_")
        file_name = plot_file_prefix + "_%d_%s.png" % (pid, phen_name)
        pylab.figure()
        pylab.plot(xs, ys, "k.", alpha=0.2)
        pylab.xlabel("Relatedness")
        pylab.ylabel("Squared phenotypic difference")
        # Plot regression line
        Y_mat = sp.mat(ys).T
        X_mat = sp.hstack((sp.mat(sp.ones(len(xs))).T, sp.mat(xs).T))
        (betas, residues, rank, s) = linalg.lstsq(X_mat, Y_mat)
        x_min, x_max = pylab.xlim()
        pylab.plot([x_min, x_max], [betas[0] + x_min * betas[1], betas[0] + x_max * betas[1]])
        corr = sp.corrcoef(xs, ys)[0, 1]
        y_min, y_max = pylab.ylim()
        x_range = x_max - x_min
        y_range = y_max - y_min
        pylab.axis([x_min - 0.025 * x_range, x_max + 0.025 * x_range,
                    y_min - 0.025 * y_range, y_max + 0.15 * y_range])
        pylab.text(x_min + 0.1 * x_range, y_max + 0.03 * y_range, "Correlation: %0.4f" % (corr))
        pylab.text(x_min + 0.5 * x_range, y_max + 0.03 * y_range, "Pseudo-heritability: %0.4f" % (p_her))
        pylab.savefig(file_name)
        del k_m
        del k_scaled
def bowtie_polynom(modis_img, cs, folder):
    print 'Determine overlap pattern... '
    sw = 10000 / cs  # stripwidth
    overlaplist = []  # define list to store number of overlapped lines
    # divide in parts with a width of 40 pixel
    for i in sp.arange(0, modis_img.shape[1] - 40, 40):
        part = modis_img[:, i:i + 39]
        # search in every scanning strip
        samples = []
        for j in sp.arange(sw - 2, part.shape[0] - sw, sw):
            target = part[j - 1:j + 1, :]  # cut out a target, whose overlapped counter-part shall be found
            searchwindow = part[j + 2:j + sw + 2]  # cut out the window where the overlapped counter-part might be located
            # start the search
            c = []  # calculate correlation coefficients of every given offset from 3 to 11
            for offset in sp.arange(3, sw / 2 + 1):
                imgpart = searchwindow[offset - 3:offset - 1]  # cut out the image which has to be compared with the target
                c.append(sp.corrcoef(imgpart.flatten(), target.flatten())[0, 1])  # calculate correlation coefficient
            c = sp.array(c)
            overl = sp.ndimage.measurements.maximum_position(c)[0] + 3  # find the overlap with the highest correlation coefficient
            samples.append([overl, c.max()])  # attach overlap and correlation coefficient to the sample list
        samples = sp.array(samples)
        #print i, samples[:,1].mean()
        if samples[:, 1].mean() > 0.9:  # check the mean correlation coefficient
            #print('Bowtie Correlation high - removing effect')
            overlaplist.append([i + 20, samples[:, 0].mean()])  # save result, if correlation coefficient is high
            #print(overlaplist)
            o = sp.array(overlaplist)
            X = o[:, 0]
            overlap = o[:, 1]
            # Calculate a second order polynom to describe the overlap
            p = sp.polyfit(X, overlap, 2)
            #print 'done, Overlap polynom: '+str(p)
        else:
            #print('low Bowtie correlation')
            p = [1., 1., 1.]
            #overlaplist.append([i+20,1])
            #os.system('rm -r '+folder)
            #print('scene deleted')
    return p
    self.mapping[indexes[i]] = finalbeta[i]
    return self.mapping

def stats(self, startdate, enddate, mktbasket, output=False):
    """
    Calculates statistics for a fund over a period.

    Parameters
    ----------
    startdate : datetime
        beginning of statistic period
    enddate : datetime
        end of statistic period
    mktbasket : dict
        dictionary of market streams
    output : bool
        if True, output results to db

    Returns
    -------
    stats : dict
        dictionary of statistics
    """
    inputmatrix, fundreturns, indexes, daterange = self.align(startdate, enddate, mktbasket)
    if self.mapping and not (inputmatrix is None):
        weights = scipy.array([self.mapping[mykey] if mykey in self.mapping else 0.0
                               for mykey in mktbasket.keys()])
        projected = scipy.dot(inputmatrix, weights.reshape(len(indexes), 1)).flatten()
        actual = fundreturns.flatten()
        diff = actual - projected
        outdata = {
            'TE': scipy.std(diff) * 100.0 * 100.0,
            'BETA': scipy.cov(projected, actual)[1, 0] / scipy.var(projected),
            'ALPHA': (scipy.product(diff + 1.0)) ** (1.0 / diff.size) - 1.0,
            'VOL': scipy.std(actual) * scipy.sqrt(252.0),
            'PROJ': scipy.product(1.0 + projected) - 1.0,
            'ACT': scipy.product(1.0 + actual) - 1.0,
            'R2': 0.0 if scipy.all(actual == 0.0) else scipy.corrcoef(projected, actual)[1, 0] ** 2.0,
            'AV': self.av(startdate),
            'DELTA': self.deltaestimate(startdate)
        }
        outdata['DIFF'] = outdata['ACT'] - outdata['PROJ']
        outdata['PL'] = outdata['DELTA'] * outdata['DIFF'] * 100.0
        if output:
            cnxn = pyodbc.connect(ORACLESTRING)
            cursor = cnxn.cursor()
            sql = 'INSERT INTO FUNDOUTPUT VALUES ({0!s},{1!s},{2!s},{3!s},{4!s},{5!s},{6},{7},{8!s},{9!s},{10!s},{11!s},{12!s},{13!s});'
            sql = sql.format(self.fundcode, outdata['PROJ'], outdata['ACT'], outdata['DIFF'],
                             outdata['DELTA'], outdata['PL'], oracledatebuilder(startdate),
                             oracledatebuilder(enddate), outdata['TE'], outdata['R2'], outdata['BETA'],
                             outdata['ALPHA'], outdata['VOL'], outdata['AV'])
            cursor.execute(sql)
            cnxn.commit()
            cnxn.close()
def stats(self, startdate, enddate, mktbasket, avdate, output=False, mappingoverride=None):
    """
    Calculates statistics for a fund over a period.

    Parameters
    ----------
    startdate : datetime
        beginning of statistic period
    enddate : datetime
        end of statistic period
    mktbasket : dict
        dictionary of market streams
    output : bool
        if True, output results to db
    mappingoverride : None or mapping dictionary
        whether to override the db mapping

    Returns
    -------
    stats : dict
        dictionary of statistics
    """
    actualstream, projstream = self.project(mktbasket, mappingoverride)
    if actualstream[startdate:enddate] is None:
        return None
    if projstream[startdate:enddate] is None:
        return None
    actual = actualstream[startdate:enddate].returns
    projected = projstream[startdate:enddate].returns
    diff = actual - projected
    outdata = {
        'TE': scipy.std(diff) * 100.0 * 100.0,
        'BETA': scipy.cov(projected, actual, bias=1)[1, 0] / scipy.var(projected),
        'ALPHA': (scipy.product(diff + 1.0)) ** (1.0 / diff.size) - 1.0,
        'VOL': scipy.std(actual) * scipy.sqrt(252.0),
        'PROJ': scipy.product(1.0 + projected) - 1.0,
        'ACT': scipy.product(1.0 + actual) - 1.0,
        'R2': 0.0 if scipy.all(actual == 0.0) else scipy.corrcoef(projected, actual)[1, 0] ** 2.0,
        'AV': self.av(avdate),
        'DELTA': self.deltaestimate(avdate)
    }
    outdata['DIFF'] = outdata['ACT'] - outdata['PROJ']
    outdata['PL'] = outdata['DELTA'] * outdata['DIFF'] * 100.0
    if output:
        cnxn = pyodbc.connect(ORACLESTRING)
        cursor = cnxn.cursor()
        sql = 'INSERT INTO FUNDOUTPUT VALUES ({0!s},{1!s},{2!s},{3!s},{4!s},{5!s},{6},{7},{8!s},{9!s},{10!s},{11!s},{12!s},{13!s});'
        sql = sql.format(self.fundcode, outdata['PROJ'], outdata['ACT'], outdata['DIFF'],
                         outdata['DELTA'], outdata['PL'], oracledatebuilder(startdate),
                         oracledatebuilder(enddate), outdata['TE'], outdata['R2'], outdata['BETA'],
                         outdata['ALPHA'], outdata['VOL'], outdata['AV'])
        cursor.execute(sql)
        cnxn.commit()
        cnxn.close()
    return outdata
def Autocorr(GDP, I, C):
    m = sp.shape(GDP)[1]
    GDPauto = []
    Iauto = []
    Cauto = []
    for i in range(0, m):
        # GDP autocorrelation coefficients for each series appended
        # to the empty GDPauto list
        gdp = GDP[:, i]
        gauto = sp.corrcoef(gdp[0:-1], gdp[1:])
        GDPauto.append(gauto[0, 1])
        # Investment autocorrelation coefficients for each series
        # appended to the empty Iauto list
        invest = I[:, i]
        iauto = sp.corrcoef(invest[0:-1], invest[1:])
        Iauto.append(iauto[0, 1])
        # Consumption autocorrelation coefficients for each series
        # appended to the empty Cauto list
        consum = C[:, i]
        cauto = sp.corrcoef(consum[0:-1], consum[1:])
        Cauto.append(cauto[0, 1])
    # Calculate the mean and standard deviation of these moments
    # across the total number of simulations
    GDPAUTO = sp.array(GDPauto)
    gdpsimmean = sp.mean(GDPAUTO)
    gdpsimstdev = sp.std(GDPAUTO)
    IAUTO = sp.array(Iauto)
    isimmean = sp.mean(IAUTO)
    isimstdev = sp.std(IAUTO)
    CAUTO = sp.array(Cauto)
    csimmean = sp.mean(CAUTO)
    csimstdev = sp.std(CAUTO)
    sp.savetxt('GDPAUTO.csv', GDPAUTO)
    sp.savetxt('IAUTO.csv', IAUTO)
    sp.savetxt('CAUTO.csv', CAUTO)
    print "GDP/Investment/Consumption Simulations Mean/Standard Deviation"
    print "of Autocorrelation. The Autocorrelation Coefficients"
    print "of GDP,Investment,Consumption for each series have been saved"
    print "separately in csv files"
    return gdpsimmean, gdpsimstdev, isimmean, isimstdev, csimmean, csimstdev
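# Minimal sketch (made-up AR(1) series, not from the original model) of the
# lag-1 autocorrelation pattern sp.corrcoef(x[0:-1], x[1:])[0, 1] used
# repeatedly in Autocorr() above: the estimate should land close to the true
# persistence parameter 0.8.
import numpy as np

rng = np.random.RandomState(2)
x = np.zeros(2000)
for ti in range(1, 2000):
    x[ti] = 0.8 * x[ti - 1] + rng.randn()      # simulate an AR(1) process
print(np.corrcoef(x[0:-1], x[1:])[0, 1])       # roughly 0.8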
def MonteCarlo(P, Q, R, S, N, X0, Z, Xbar, alpha, delta, reps, T):
    C = sp.zeros((T, reps))
    L = sp.zeros((T, reps))
    Lauto = []
    Cauto = []
    Wauto = []
    LWcorr = []
    for i in range(reps):
        if i % 500 == 0:
            print 'Simulation #: ', i
        X = []
        X = Xgen(X0, Z[:, i], P, Q, Xbar)
        # find C,L for each time period
        consumption, labor, wage = CLgen(X, Z[:, i], alpha, delta)
        # Consumption autocorrelation coefficients for each series
        # appended to the empty Cauto list
        #cauto = sp.corrcoef(consumption[0:-1],consumption[1:])
        cauto = sp.mean(consumption)
        Cauto.append(cauto)
        # Labor autocorrelation coefficients for each series
        # appended to the empty Lauto list
        #lauto = sp.corrcoef(labor[0:-1],labor[1:])
        lauto = sp.mean(labor)
        Lauto.append(lauto)
        # Wage autocorrelation for each series
        # appended to the empty Wauto list
        #wauto = sp.corrcoef(wage[0:-1],wage[1:])[0,1]
        wauto = sp.mean(wage)
        Wauto.append(wauto)
        # Wage and Labor correlation coefficient for each series
        # appended to the empty LWcorr list
        lwcorr = sp.corrcoef(labor, wage)
        LWcorr.append(lwcorr)
    # What we will return, arrays of the auto and correlation coefficients
    CAUTO = sp.array(Cauto)
    LAUTO = sp.array(Lauto)
    WAUTO = sp.array(Wauto)
    LWCORR = sp.array(LWcorr)
    return CAUTO, LAUTO, WAUTO, LWCORR
def midparent_predictions(Y, parent1, parent2, Itrain=None, Itest=None):
    if Itrain is None:
        Itrain = sp.ones(Y.shape[0], dtype=bool)
        Itest = sp.ones(Y.shape[0], dtype=bool)
    modelmatrix = patsy.dmatrix("0 + parent1 + parent2")
    Rsquared = sp.zeros((Y.shape[1]))
    pred = sp.zeros_like(Y[Itest, :])
    for j in range(Y.shape[1]):
        lm = linear_model.LinearRegression()
        lm.fit(modelmatrix[Itrain, :], Y[Itrain, j])
        pred[:, j] = lm.predict(modelmatrix[Itest, :])
        Rsquared[j] = sp.corrcoef(pred[:, j], Y[Itest, j])[0, 1] ** 2
    return np.row_stack((Rsquared)), pred
def calculate_stock_correlation(data):
    """
    This function should take a list containing two lists of the form returned
    by get_yahoo_data (list of date, adj. close tuples) and return the
    correlation of the daily returns as defined above.
    """
    one = []
    two = []
    for i in xrange(1, len(data[0])):
        one.append((data[0][i][1] - data[0][i - 1][1]) / data[0][i - 1][1])
        two.append((data[1][i][1] - data[1][i - 1][1]) / data[1][i - 1][1])
    return scipy.corrcoef(one, two)[0][1]
def canonicalize_signs(self, sources, mixing):
    '''
    Accepts a set of sources and corresponding mixing matrices from an ICA
    component (should be a realization component, as this operation makes no
    sense for regular ICA realizations) and fixes the signs of the
    realizations, using the sign of the inter-source cross correlations.
    Specifically, the 0th source is arbitrarily deemed to have the canonical
    sign; components which correlate positively with this component keep the
    same signs, and those which negatively correlate have their signs
    reversed.

    This WILL NOT ensure all source-source correlations are positive, but
    will tend to cause the 'well matched' components to have the same sign.
    '''
    compSigns = np.sign(corrcoef(sources)[0, :])
    for i in range(1, sources.shape[0]):
        sources[i, :] = compSigns[i] * sources[i, :]
        mixing[:, i] = compSigns[i] * mixing[:, i]
    return sources, mixing
def correlationMatrix(mdata, linit, lend, nstep):
    lstep = (lend - linit) / nstep
    corr = np.zeros((mdata.shape[0], mdata.shape[0]))
    liter = [linit + (i * lstep) for i in range(nstep)]
    #print liter, len(liter), lend
    zz = 0
    for length in liter:
        corrs = corrcoef(mdata[:, length:length + lstep])
        corr += corrs
        zz += 1
        print '.',
    print
    corr /= nstep
    return corr
def portfolioVariance(R, w):
    # find the correlation coefficients; R is transposed so each row holds one stock's returns
    corr = sp.corrcoef(R.T)
    # find the standard deviation along the column axis
    standarDeviation = sp.std(R, axis=0)
    var = 0.0
    n = len(w)
    # with the weights and standard deviations in hand, pair every two stocks,
    # compute each pairwise variance term and sum them up
    for i in range(n):
        for j in range(n):
            var += w[i] * w[j] * standarDeviation[i] * standarDeviation[j] * corr[i, j]
    return var
def centrality_correlation(G):
    results = []
    centralities = nx.centrality.brandes_betweenness_centrality(G, normalized=False)
    for node in G:
        #if G.degree(node) == 1:  #if there is just one root node and the rest are leaves, get just one point
        #    continue
        #jitter for visualization:
        #results.append((G.degree(node)+npr.rand()*0.4, centralities[node]))
        results.append((G.degree(node), centralities[node]))
    results = np.array(results)
    corr = scipy.corrcoef(results[:, 0], results[:, 1])[0, 1]
    return corr
def main(events):
    data = get_event_data(events)
    matrix = event_data_to_matrix(data, events)
    print "Correlation coefficients"
    for event1, event2 in list_to_pairs(events):
        data1 = matrix[:, events.index(event1)]
        data2 = matrix[:, events.index(event2)]
        coeff = corrcoef(data1, data2)[0][1]
        print "%s\tx\t%s:\t%f" % (event1, event2, coeff)
def pred_accuracy(y_true, y_pred):
    y_true = sp.copy(y_true)
    if len(sp.unique(y_true)) == 2:
        print 'dichotomous trait, calculating AUC'
        y_min = y_true.min()
        y_max = y_true.max()
        if y_min != 0 or y_max != 1:
            y_true[y_true == y_min] = 0
            y_true[y_true == y_max] = 1
        fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred)
        auc = metrics.auc(fpr, tpr)
        return auc
    else:
        print 'continuous trait, calculating COR'
        cor = sp.corrcoef(y_true, y_pred)[0, 1]
        return cor
def corrParallelSym(Y, df=None):
    """
    computes the correlation matrix of Y
    """
    nSamples = Y.shape[1]
    corr = SP.corrcoef(Y)
    if df is None:
        df = max(nSamples - 2, 0)  # degrees of freedom
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        tstat = corr / SP.sqrt(1.0 - corr ** 2)  # calculate t statistic
        tstat = math.sqrt(df) * tstat
        pv = 2 * t.cdf(-abs(tstat), df, loc=0, scale=1)  # calculate p value
    return corr, pv
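# Hedged cross-check (synthetic data, illustrative only): for a single pair of
# rows, the t-transform applied to corrcoef in corrParallelSym() above should
# reproduce the two-sided p-value returned by scipy.stats.pearsonr.
import numpy as np
from scipy import stats

rng = np.random.RandomState(3)
Y_demo = rng.randn(2, 50)                      # two variables, 50 samples each
r = np.corrcoef(Y_demo)[0, 1]
dof = Y_demo.shape[1] - 2                      # degrees of freedom
tstat = np.sqrt(dof) * r / np.sqrt(1.0 - r ** 2)
p_manual = 2 * stats.t.cdf(-abs(tstat), dof)
print(p_manual, stats.pearsonr(Y_demo[0], Y_demo[1])[1])   # should match closely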
def get_r2(Y1, Y2):
    """
    return list of squared correlation coefficients (one per task)
    """
    if Y1.ndim == 1:
        Y1 = SP.reshape(Y1, (Y1.shape[0], 1))
    if Y2.ndim == 1:
        Y2 = SP.reshape(Y2, (Y2.shape[0], 1))
    t = Y1.shape[1]
    r2 = []
    for i in range(t):
        _r2 = SP.corrcoef(Y1[:, i], Y2[:, i])[0, 1] ** 2
        r2.append(_r2)
    r2 = SP.array(r2)
    return r2