def test_forwardS_same_as_old(self):
    self.forS.astep(0.)
    pS0_Test = self.forS.compute_S0_GIVEN_X0(
        self.forS.computeLikelihoodOfS(
            self.myTestPoint['X'],
            logistic.cdf(self.myTestPoint['B_logodds']),
            logistic.cdf(self.myTestPoint['B0_logodds'])))
    pS0_Correct = load(open('pS0_Test_data.pkl', 'rb'))
    pSnt_Test = np.zeros((self.nObs, self.M))
    for n in xrange(self.N):
        n0 = self.zeroIndices[n]
        for t in xrange(self.T[n] - 1):
            pSnt_Test[n0 + t] = self.forS.compute_pSt_GIVEN_St1(
                n0, t, self.myTestPoint['S'][n0 + t])
    pSnt_Test = pSnt_Test / pSnt_Test.sum(axis=1)[:, np.newaxis]
    pSnt_Correct = load(open('pSnt_Test_data.pkl', 'rb'))
    pSnt_Correct = np.concatenate(
        [pSnt_Correct[i, 0:self.T[i], :] for i in range(self.N)])
    pSnt_Correct = pSnt_Correct / pSnt_Correct.sum(axis=1)[:, np.newaxis]
    np.testing.assert_array_almost_equal(pS0_Test, pS0_Correct,
                                         err_msg="forwardS test off", decimal=6)
    np.testing.assert_array_almost_equal(pSnt_Test, pSnt_Correct,
                                         err_msg="forwardS test off", decimal=6)
def sim_t_fixed_data():
    n = 10000
    np.random.seed(1011)
    df = pd.DataFrame()
    df['W1'] = np.random.normal(size=n)
    df['W2'] = np.random.binomial(1, size=n, p=logistic.cdf(df['W1']))
    df['W3'] = np.random.normal(size=n)
    df['A'] = np.random.binomial(1, size=n, p=logistic.cdf(-1 + 2 * df['W1']**2))
    df['Ya1'] = np.random.binomial(
        1, size=n,
        p=logistic.cdf(-0.5 + 2 * df['W1']**2 + 0.5 * df['W2'] - 0.5 * 1 + 1.1 * df['W3']))
    df['Ya0'] = np.random.binomial(
        1, size=n,
        p=logistic.cdf(-0.5 + 2 * df['W1']**2 + 0.5 * df['W2'] + 1.1 * df['W3']))
    df['Y'] = np.where(df['A'] == 1, df['Ya1'], df['Ya0'])
    df['W1_sq'] = df['W1']**2
    df['t'] = 1
    df['t0'] = 0
    df['id'] = df.index
    return df
def train(iterations, confidence):
    # Scratch implementation to train the ANN
    no_units = 6
    no_features = 4
    W1, b1, W2, b2 = init_para(no_units, no_features)
    X, y = get_Xy()  # X ---> train_X, y ---> train_y
    m = 5  # training examples
    for i in range(iterations):
        Z1 = np.dot(W1, X) + b1  # forward prop begins...
        A1 = logistic.cdf(Z1)
        Z2 = np.dot(W2, A1) + b2
        A2 = logistic.cdf(Z2)  # forward prop ends...
        log = np.multiply(np.log(A2), y) + np.multiply((1 - y), np.log(1 - A2))  # cross entropy
        cost = -np.sum(log) / float(m)  # cost function...
        # dA2 = -(y/A2)+((1-y)/(1-A2))    # back prop begins...
        dZ2 = A2 - y
        dW2 = np.dot(dZ2, A1.T) / m
        db2 = np.sum(dZ2, axis=1, keepdims=True)
        # dA1 = dZ2*(W2)
        # derivative of the sigmoid activation A1 is A1 * (1 - A1)
        dZ1 = np.multiply(np.dot(W2.T, dZ2), A1 * (1 - A1))
        dW1 = np.dot(dZ1, X.T) / m
        db1 = np.sum(dZ1, axis=1, keepdims=True)  # backprop ends...
        print(cost)
        W1 = W1 - confidence * dW1  # update begins...
        b1 = b1 - confidence * db1
        W2 = W2 - confidence * dW2
        b2 = b2 - confidence * db2  # update ends...
    return W1, b1, W2, b2
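# A minimal usage sketch for the scratch ANN above. It assumes the helpers
# init_para and get_Xy (referenced but not shown in the snippet) behave roughly
# as below; the shapes and values here are illustrative assumptions only.
import numpy as np
from scipy.stats import logistic

def init_para(no_units, no_features):
    # Small random weights, zero biases: (hidden, input) and (1, hidden)
    return (np.random.randn(no_units, no_features) * 0.01, np.zeros((no_units, 1)),
            np.random.randn(1, no_units) * 0.01, np.zeros((1, 1)))

def get_Xy():
    # Four features by five examples, binary labels
    return np.random.randn(4, 5), np.random.randint(0, 2, size=(1, 5))

# W1, b1, W2, b2 = train(iterations=1000, confidence=0.5)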
def statin_dgm_truth(network, pr_a, shift=False, restricted=False):
    graph = network.copy()
    data = network_to_df(graph)

    # Running Data Generating Mechanism for A
    if shift:  # If a shift in the Odds distribution is instead specified
        prob = logistic.cdf(-5.3 + 0.2 * data['L'] + 0.15 * (data['A'] - 30)
                            + 0.4 * np.where(data['R_1'] == 1, 1, 0)
                            + 0.9 * np.where(data['R_2'] == 2, 1, 0)
                            + 1.5 * np.where(data['R_3'] == 3, 1, 0))
        odds = probability_to_odds(prob)
        pr_a = odds_to_probability(np.exp(np.log(odds) + pr_a))

    statin = np.random.binomial(n=1, p=pr_a, size=nx.number_of_nodes(graph))
    data['statin'] = statin

    if restricted:  # removing other observations from the restricted set
        attrs = exposure_restrictions(network=network.graph['label'], exposure='statin')
        exclude = list(attrs.keys())
        data = data.loc[~data.index.isin(exclude)].copy()

    # Running Data Generating Mechanism for Y
    pr_y = logistic.cdf(-5.05 - 0.8 * data['statin']
                        + 0.37 * np.sqrt(data['A'] - 39.9)
                        + 0.75 * data['R'] + 0.75 * data['L'])
    cvd = np.random.binomial(n=1, p=pr_y, size=data.shape[0])
    return np.mean(cvd)
def sofrygin_observational(graph):
    """Simulates the exposure and outcome according to the mechanisms specified in
    Sofrygin & van der Laan 2017

    A ~ Bernoulli(expit(-1.2 + 1.5*W + 0.6*map(W)))
    Y ~ Bernoulli(expit(-2.5 + 1.5*W + 0.5*A + 1.5*map(A) + 1.5*map(W)))

    Returns
    -------
    Network object with node attributes
    """
    n = len(graph.nodes())
    w = np.array([d['W'] for n, d in graph.nodes(data=True)])

    # Calculating map(W), generating A, and adding to network
    w_s = exp_map(graph, 'W', measure='sum')
    a = np.random.binomial(n=1, p=logistic.cdf(-1.2 + 1.5*w + 0.6*w_s), size=n)
    for node in graph.nodes():
        graph.node[node]['A'] = a[node]

    # Calculating map(A), generating Y, and adding to network
    a_s = exp_map(graph, 'A', measure='sum')
    y = np.random.binomial(n=1, p=logistic.cdf(-2.5 + 1.5*w + 0.5*a + 1.5*a_s + 1.5*w_s), size=n)
    for node in graph.nodes():
        graph.node[node]['Y'] = y[node]
    return graph
def yardage_distribution_table(yds_increment):
    # This creates a 3-dimensional array that indicates the probability
    # an offense gets a certain amount of yard chunks according to the
    # offensive & defensive playcalls
    global mu_array
    mu = mu_array
    global s_array
    s = s_array
    global turnover_array
    global d_list
    d = d_list
    global zero_index

    x_shape = np.shape(mu)[0]
    y_shape = np.shape(mu)[1]
    d_shape = np.shape(d)[0]
    table1 = np.empty((x_shape, y_shape, d_shape))
    for x in range(np.shape(mu)[0]):
        for y in range(np.shape(mu)[1]):
            for d_1 in range(np.shape(d)[0]):
                # Rounds yardage gained to the nearest chunk; note we are
                # using a logistic fit for this data
                table1[x, y, d_1] = (logistic.cdf(yds_increment*d[d_1] + yds_increment/2, mu[x, y], s[x, y])
                                     - logistic.cdf(yds_increment*d[d_1] - yds_increment/2, mu[x, y], s[x, y]))
            # Ensures each table sums to 1, for probability's sake
            normalizing_factor = sum(table1[x, y])
            table1[x, y] = table1[x, y] / normalizing_factor
    return table1
def vwma(vals: pd.Series, mean_alpha: float = 0.125, verbose: bool = False, inverse: bool = False): orig_idx = vals.index diff_vals = vals / vals.shift(1) if verbose: print(diff_vals) print(len(diff_vals)) diff_vals.dropna(inplace=True) scaler_std = sk_prep.StandardScaler() # normal_vol_ewma = vals.ewm(alpha=mean_alpha).std() # if verbose: # print(normal_vol_ewma) normal_vol_ewma = [ v[0] for v in scaler_std.fit_transform(diff_vals.values.reshape(-1, 1)) ] if inverse: normal_vol_ewma = [1 - logistic.cdf(v) for v in normal_vol_ewma] else: normal_vol_ewma = [logistic.cdf(v) for v in normal_vol_ewma] avg_ewm_factor = mean_alpha / 0.5 alphas = [v * avg_ewm_factor for v in normal_vol_ewma] alphas = [mean_alpha] + alphas if verbose: print('Length of alphas list: ', len(alphas)) print('Length of values list: ', len(vals)) final_data = pd.DataFrame(data=list(zip(vals, alphas)), columns=['vals', 'alpha'], index=orig_idx) cume_alphas = None last_vwma = None for idx, val, alpha in final_data.itertuples(): if not cume_alphas: cume_alphas = mean_alpha vwma = val else: cume_alphas += (alpha * (1 - cume_alphas)) adj_alpha = alpha / cume_alphas vwma = (val * adj_alpha) + (last_vwma * (1 - adj_alpha)) final_data.at[idx, 'cume_alphas'] = cume_alphas final_data.at[idx, 'vwma'] = vwma last_vwma = vwma # print(val, alpha) # print(sum(normal_vol_ewma)/len(normal_vol_ewma)) if verbose: print('==== Head ====') print(final_data.head(10)) print('==== Tail ====') print(final_data.tail(10)) print(len(final_data['vwma'])) # final_data.set_index(orig_idx) return final_data['vwma']
def test_forwardX_same_as_old(self):
    Psi = self.forX.computePsi(self.myTestPoint['S'],
                               logistic.cdf(self.myTestPoint['B_logodds']))
    LikelihoodOfX = self.forX.computeLikelihoodOfX(self.myTestPoint['X'], self.Z_original,
                                                   logistic.cdf(self.myTestPoint['L_logodds']))
    beta = self.forX.computeBeta(Psi, LikelihoodOfX)
    pX_Test = self.forX.computePX(beta, logistic.cdf(self.myTestPoint['B0_logodds']),
                                  self.myTestPoint['S'], self.myTestPoint['X'],
                                  LikelihoodOfX, Psi)
    pX_Test = pX_Test[:, :, 0] / (pX_Test[:, :, 0] + pX_Test[:, :, 1])
    pX_Correct = load(open('pX_Test_data.pkl', 'rb'))
    pX_Correct = np.concatenate([pX_Correct[i, 0:self.T[i], :, :] for i in range(self.N)])
    pX_Correct = pX_Correct[:, :, 0] / (pX_Correct[:, :, 0] + pX_Correct[:, :, 1])
    np.testing.assert_array_almost_equal(pX_Test, pX_Correct,
                                         err_msg="forwardX likelihood test off", decimal=6)
def prediction(self, modfv):
    """For each altMod, multiplies its feature vector with featureWeights,
    subtracts featureBias, applies sigmoid, and returns the sum."""
    assert len(modfv.sAltMods) == len(modfv.altModFvs)
    numSoftMods = 0
    for alt in xrange(len(modfv.sAltMods)):
        dot = np.asscalar(np.dot(modfv.altModFvs[alt], self.featureWeights))
        numSoftMods += logistic.cdf(dot - self.featureBias)
    # 1 - ... because we need to output noncomp confidence.
    return 1. - logistic.cdf(numSoftMods / self.numSoftModsScale - self.numSoftModsBias)
def continuous_data(self):
    n = 10000
    np.random.seed(1011)
    df = pd.DataFrame()
    df['W1'] = np.random.normal(size=n)
    df['W2'] = np.random.binomial(1, size=n, p=logistic.cdf(df['W1']))
    df['W3'] = np.random.normal(size=n)
    df['A'] = np.random.binomial(1, size=n, p=logistic.cdf(-1 + 2 * df['W1'] ** 2))
    df['Y'] = (-0.5 + 2*df['W1'] + 0.5*df['W2'] - 0.5*df['A'] + 1.1*df['W3']
               + np.random.normal(size=n))
    df['t'] = 1
    df['id'] = df.index
    return df
def weight_matrix(x_data, y_data, model, phi, s, alpha): """ Function to calculate Wmap given the input data and corresponsing labels Parameters: x_data - Independent variables y_data - labels model - Type of model (logistic, poisson, ordinal) phi - Threshold values for ordinal regression and empty list for others s and alpha - Control parameter for the spread of the distribution Returns: Wmap for the given data and number of iterations it took to converge """ #y_data = y_data.reshape(-1,1) w = np.zeros((x_data.shape[1], 1)) count = 0 while True: a = x_data.dot(w) if model == "logistic": yi = logistic.cdf(a) d = y_data - yi r = yi * (1 - yi) elif model == "poisson": yi = np.exp(a) d = np.subtract(y_data, yi) r = yi else: yi = logistic.cdf(s * (phi - a)) d = [ yi[i][y_data[i]] + yi[i][y_data[i] - 1] - 1 for i in range(len(x_data)) ] d = np.array(d) r = [ s**2 * ((yi[i][y_data[i]] * (1 - yi[i][y_data[i]])) + (yi[i][y_data[i] - 1] * (1 - yi[i][y_data[i] - 1]))) for i in range(len(x_data)) ] r = np.array(r) g = x_data.transpose().dot(d) - (alpha * w) r = np.diagflat(r) h_inv = inv(-x_data.transpose().dot(r).dot(x_data) - (alpha * np.identity(x_data.shape[1]))) w_new = w - h_inv.dot(g) if np.divide(norm(w_new - w, 2), norm(w, 2)) < 0.001 or count == 100: break w = w_new count += 1 return w_new, count
def naloxone_dgm_truth(network, pr_a, shift=False, restricted=False): graph = network.copy() data = network_to_df(graph) adj_matrix = nx.adjacency_matrix(graph, weight=None) data['O_sum'] = fast_exp_map(adj_matrix, np.array(data['O']), measure='sum') data['O_mean'] = fast_exp_map(adj_matrix, np.array(data['O']), measure='mean') data['G_sum'] = fast_exp_map(adj_matrix, np.array(data['G']), measure='sum') data['G_mean'] = fast_exp_map(adj_matrix, np.array(data['G']), measure='mean') # Running Data Generating Mechanism for A if shift: # If a shift in the Odds distribution is instead specified prob = logistic.cdf(-1.3 - 1.5 * data['P'] + 1.5 * data['P'] * data['G'] + 0.95 * data['O_mean'] + 0.95 * data['G_mean']) odds = probability_to_odds(prob) pr_a = odds_to_probability(np.exp(np.log(odds) + pr_a)) naloxone = np.random.binomial(n=1, p=pr_a, size=nx.number_of_nodes(graph)) data['naloxone'] = naloxone if restricted: # if we are in the restricted scenarios attrs = exposure_restrictions(network=network.graph['label'], exposure='naloxone') data.update( pd.DataFrame(list(attrs.values()), index=list(attrs.keys()), columns=['naloxone'])) exclude = list(attrs.keys()) # Creating network summary variables data['naloxone_sum'] = fast_exp_map(adj_matrix, np.array(data['naloxone']), measure='sum') # Running Data Generating Mechanism for Y pr_y = logistic.cdf(-1.1 - 0.2 * data['naloxone_sum'] + 1.7 * data['P'] - 0.9 * data['G'] + 0.75 * data['O_mean'] - 0.75 * data['G_mean']) overdose = np.random.binomial(n=1, p=pr_y, size=nx.number_of_nodes(graph)) if restricted: data['overdose'] = overdose data = data.loc[~data.index.isin(exclude)].copy() overdose = np.array(data['overdose']) return np.mean(overdose)
def naloxone_baseline_dgm(graph, number_of_nodes): """Simulates baseline variables for the naloxone & overdose data set G ~ Bernoulli(0.325) Uc ~ Bernoulli(0.65) P ~ Bernoulli(expit(B + B*G + B*sum(G))) O ~ Bernoulli(P==1: 0.1, P==0: 0.3) Returns ------- pandas DataFrame with the distribution of W """ # Gender g = np.random.binomial( n=1, p=0.325, size=number_of_nodes ) # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4454335/ for node, value in zip(graph.nodes(), g): graph.node[node]['G'] = value g_s = exp_map(graph, 'G', measure='mean') # Trust in authorities (unobserved variable) c = np.random.binomial(n=1, p=0.75, size=number_of_nodes) # Recently released from prison beta_p = {0: -1.1, 1: 0.5, 2: 0.1} # Beta parameters mp = logistic.cdf(beta_p[0] + beta_p[1] * g + beta_p[2] * g_s) # model p = np.random.binomial( n=1, p=mp, size=number_of_nodes) # Generating values from above # Prior overdose beta_o = {0: -1.7, 1: 0.1, 2: 0.1, 3: 0.6} # Beta parameters mo = logistic.cdf(beta_o[0] + beta_o[1] * g + beta_o[2] * g_s + beta_o[3] * p) # model o = np.random.binomial( n=1, p=mo, size=number_of_nodes) # Generating values from above # Output W distribution data set nodes = [] for nod, d in graph.nodes(data=True): nodes.append(nod) data = pd.DataFrame() data['id'] = nodes data['G'] = g data['Uc'] = c data['P'] = p data['O'] = o return data
def test_calc_b():
    from scipy.stats import logistic
    from update_params_linear_regression import calc_b
    p_2_new = np.array([1.0, 2.0, 3.0])
    p_3 = np.array([2.0, 1.0, 1.0])
    m_1 = np.array([2.0, 4.0, 3.0])
    v_1 = np.array([1.0, 1.0, 1.0])
    v_s = np.array([3.0, 1.0, 2.0])
    res = calc_b(p_2_new, p_3, m_1, v_1, v_s)
    expected_res = (np.array([0.0, 3.5, 2.0 / 3.0]) * logistic.cdf(np.array([3.0, 3.0, 4.0]))
                    + np.array([3.0, 15.0, 8.0]) * logistic.cdf(np.array([-3.0, -3.0, -4.0])))
    np.testing.assert_array_almost_equal(res, expected_res)
def run(self, x, y): # N: # of Examples, k: # of features (N, k) = x.shape cumu_false = 0.0 cumu_false_negative = 0.0 if self.w is None: self.w = csc_matrix(np.zeros((k, 1))) # Start PA for i in range(N): if i % 100 == 0: print('step: ', i) print('Cumulative Error Rate in this day', cumu_false / (i + 1)) print('Cumulative False Negative Rate in this day', cumu_false_negative / (i + 1)) xi = x[i, :].T yi = y[i, :][0] tmp = (self.w.T).dot(xi) prob_of_positive = logistic.cdf(tmp[0, 0]) predict = 1 if prob_of_positive >= 0.5 else -1 mistake = 1 if (predict != yi) else 0 false_negative = 1 if yi == 1 and mistake else 0 cumu_false += mistake cumu_false_negative += false_negative # update w self.w = self.w + self.gamma * xi * ((yi + 1) / 2 - prob_of_positive) return (cumu_false, cumu_false_negative)
def label_generator(problem, X, param, difficulty=1, beta=None, important=None):
    if important is None or important > X.shape[-1]:
        important = X.shape[-1]
    dim_latent = sum([important**i for i in range(1, difficulty+1)])
    if beta is None:
        beta = np.random.normal(size=[1, dim_latent])
    important_dims = np.random.choice(X.shape[-1], important, replace=False)
    funct_init = lambda inp: np.sum(beta * generate_features(inp[:, important_dims], difficulty), -1)
    batch_size = max(100, min(len(X), 10000000//dim_latent))
    y_true = np.zeros(len(X))
    while True:
        try:
            for itr in range(int(np.ceil(len(X)/batch_size))):
                y_true[itr * batch_size: (itr+1) * batch_size] = funct_init(
                    X[itr * batch_size: (itr+1) * batch_size])
            break
        except MemoryError:
            batch_size = batch_size//2
    mean, std = np.mean(y_true), np.std(y_true)
    funct = lambda x: (np.sum(beta * generate_features(
        x[:, important_dims], difficulty), -1) - mean) / std
    y_true = (y_true - mean)/std
    if problem == 'classification':
        y_true = logistic.cdf(param * y_true)
        y = (np.random.random(X.shape[0]) < y_true).astype(int)
    elif problem == 'regression':
        y = y_true + param * np.random.normal(size=len(y_true))
    else:
        raise ValueError('Invalid problem specified!')
    return beta, y, y_true, funct
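# Minimal usage sketch for label_generator above. generate_features is referenced
# but not shown in the snippet; the identity stand-in below is an assumption made
# only so the call runs end to end (it ignores the difficulty argument).
import numpy as np

def generate_features(X, difficulty):
    return X  # stand-in feature expansion for difficulty=1

X = np.random.normal(size=(200, 5))
beta, y, y_true, funct = label_generator('classification', X, param=2.0,
                                         difficulty=1, important=3)
print(y.mean())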
def getProblems(self): problems = {} def cycleGen(items, speed): i = 0 while True: item = items[i] for s in range(speed): yield i, item i += 1 if i >= len(items): i = 0 getQuestionDifficulty = cycleGen(self.questionDifficulty, 1) getQuestionSkill = cycleGen(self.questionSkill, 2) for pid in range(1, self.questionCount + 1): questionDifficultyGroup, questionDifficulty = next( getQuestionDifficulty) questionSkillGroup, questionSkill = next(getQuestionSkill) problem = { 'id': pid, 'title': str(pid), 'statement': 'none', 'performance': {}, 'difficulty': np.random.normal(questionDifficulty), 'difficultyGroup': questionDifficultyGroup, 'skillGroup': questionSkillGroup } problems[pid] = problem userSkills = np.random.normal(size=(self.userCount, len(self.questionSkill))) for u in range(self.userCount): if self.lambdaSolvedLevels: groups = self.lambdaSolvedLevels(u) else: groups = [] allGroups = list(range(len(self.questionDifficulty))) shuffle(allGroups) for group in allGroups: groups.append(group) if random() > self.probabilitySolvingNextLevel: break for problem in problems.values(): if problem['difficultyGroup'] not in groups: continue a = np.random.random() r = logistic.cdf(userSkills[u, problem['skillGroup']] - problem['difficulty']) problem['performance'][u] = 1.0 if a >= r else 0.0 return problems
def RF(self, args):
    ## Random Forest
    logger.info("Running Random Forest... ")
    if args.predictor.lower() == 'classifier':
        from sklearn.ensemble import RandomForestClassifier as randomforest
        rf = randomforest(  # n_estimators=5000,
            criterion='entropy',
            random_state=42)
    elif args.predictor.lower() == 'regressor':
        from sklearn.ensemble import RandomForestRegressor as randomforest
        ## Initialize RandomForest
        # 'entropy' is a classification-only criterion, so the regressor keeps
        # its default squared-error criterion
        rf = randomforest(n_estimators=5000,
                          min_samples_leaf=0.12,
                          warm_start=True,
                          max_depth=8)
    rf.fit(self.X_train, self.y_train)

    # Get the predicted values
    self.y_pred = rf.predict(self.X_data)
    if args.predictor.lower() == 'regressor':
        self.y_pred = logistic.cdf(self.y_pred)
    self.data['boosting_score'] = self.y_pred
    self.model = rf
    return self
def predict_proba(self, X):
    try:
        X = RegressionModel.augment_matrix(X)
        proba_y = logistic.cdf(np.matmul(X, self.w_))
        return proba_y
    except TypeError:
        raise RuntimeError("Unfitted model")
def diet_baseline_dgm(graph, number_of_nodes):
    """Simulates baseline variables for the diet & BMI data set

    Returns
    -------
    pandas DataFrame with the distribution of W
    """
    # Gender
    g = np.random.binomial(n=1, p=0.5, size=number_of_nodes)

    # Baseline BMI
    b = np.random.lognormal(3.4, sigma=0.2, size=number_of_nodes)

    # Exercise
    pe = logistic.cdf(-0.25)  # logistic.cdf(-0.25 + 0.3*g + -0.0515*b + 0.001*b*b)
    e = np.random.binomial(n=1, p=pe, size=number_of_nodes)  # Generating values from above

    # Output W distribution data set
    nodes = []
    for nod, d in graph.nodes(data=True):
        nodes.append(nod)

    data = pd.DataFrame()
    data['id'] = nodes
    data['G'] = g
    data['B'] = b
    data['E'] = e
    return data
def _sigmoid(self, sigmoid_spacing, n_drifts):
    """
    Generates the periodic sigmoid used to drive concept drift, as required.
    """
    period = (
        int((self.n_samples) / (n_drifts)) if n_drifts > 0 else int(self.n_samples)
    )
    css = sigmoid_spacing if sigmoid_spacing is not None else 9999
    _probabilities = (
        logistic.cdf(
            np.concatenate(
                [
                    np.linspace(
                        -css if i % 2 else css, css if i % 2 else -css, period
                    )
                    for i in range(n_drifts)
                ]
            )
        )
        if n_drifts > 0
        else np.ones(self.n_samples)
    )

    # Quick fix so that a drift count that does not divide evenly can be passed through
    probabilities = np.ones(self.n_chunks * self.chunk_size) * _probabilities[-1]
    probabilities[: _probabilities.shape[0]] = _probabilities

    return (period, probabilities)
def sigmoid(x):
    """
    Activation function for LSTM gate layers

    x: a vector, to pass through the sigmoid, squashing it to the range [0,1]
    """
    # Could experiment with more activation functions:
    # return np.tanh(x)
    return logistic.cdf(x)
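# A small check on the alternative mentioned in the comment above: tanh is just a
# rescaled logistic sigmoid, tanh(x) = 2*sigmoid(2x) - 1, so swapping activations
# mainly changes the output range from (0, 1) to (-1, 1). Illustrative sketch only.
import numpy as np
from scipy.stats import logistic

x = np.linspace(-3, 3, 7)
assert np.allclose(np.tanh(x), 2 * logistic.cdf(2 * x) - 1)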
def cal_single_user_error(self, r): err = 0.0 row = self.matrix[r] poshidprobs = np.zeros(self.numhids) poshidprobs += self.hidbiaises for i in range(len(row)): poshidprobs += self.Wijk[row[i][1] - 1][row[i][0]] poshidprobs = logistic.cdf(poshidprobs) #end of positive phase poshidstates = poshidprobs > np.random.rand(self.numhids) #print 'poshidstates', np.mean(poshidstates) for en in self.matrix1[r]: item = en[0] pred = en[1] negdata = np.zeros(self.five) for tmp in range(self.five): negdata[tmp] = np.sum(self.Wijk[tmp][item] * poshidstates) negdata[tmp] += self.visbiaises[tmp][item] negdata = np.exp(negdata) sum1 = np.sum(negdata) negdata /= sum1 tmp = np.zeros(self.five) for i in range(self.five): tmp[i] = i + 1 score = np.sum(negdata * tmp) err += abs(pred - score) return err
def diet_dgm(network, restricted=False): """ Parameters ---------- network: input network restricted: whether to use the restricted treatment assignment """ graph = network.copy() data = network_to_df(graph) adj_matrix = nx.adjacency_matrix(graph, weight=None) data['G_mean'] = fast_exp_map(adj_matrix, np.array(data['G']), measure='mean') data['E_mean'] = fast_exp_map(adj_matrix, np.array(data['E']), measure='mean') data['E_sum'] = fast_exp_map(adj_matrix, np.array(data['E']), measure='sum') data['B_mean_dist'] = fast_exp_map(adj_matrix, np.array(data['B']), measure='mean_dist') data['B_mean'] = fast_exp_map(adj_matrix, np.array(data['B']), measure='mean') # Running Data Generating Mechanism for A pr_a = logistic.cdf(-0.5 + 0.05 * (data['B'] - 30) + 0.25 * data['G'] * data['E'] + 0.05 * data['E_mean']) diet = np.random.binomial(n=1, p=pr_a, size=nx.number_of_nodes(graph)) data['diet'] = diet if restricted: # if we are in the restricted scenarios attrs = exposure_restrictions(network=network.graph['label'], exposure='diet') data.update( pd.DataFrame(list(attrs.values()), index=list(attrs.keys()), columns=['diet'])) data['diet_sum'] = fast_exp_map(adj_matrix, np.array(data['diet']), measure='sum') data['diet_t3'] = np.where(data['diet_sum'] > 3, 1, 0) # Running Data Generating Mechanism for Y bmi = (3 + data['B'] - 5 * data['diet'] - 5 * data['diet_t3'] + 3 * data['G'] - 3 * data['E'] - 0.5 * data['E_sum'] + data['B_mean_dist'] + np.random.normal(0, scale=1, size=nx.number_of_nodes(graph))) data['bmi'] = bmi # Adding node information back to graph for n in graph.nodes(): graph.nodes[n]['diet'] = int(data.loc[data.index == n, 'diet'].values) graph.nodes[n]['bmi'] = float(data.loc[data.index == n, 'bmi'].values) return graph
def weightWithDropslop(self, weighted, scale):
    'weight the adjacency matrix with the sudden drop of ts for each col'
    if weighted:
        colWeights = np.multiply(self.tspim.dropslops, self.tspim.dropfalls)
    else:
        colWeights = self.tspim.dropslops
    if scale == 'logistic':
        from scipy.stats import logistic
        from sklearn import preprocessing
        # zero-mean scale
        colWeights = preprocessing.scale(colWeights)
        colWeights = logistic.cdf(colWeights)
    elif scale == 'linear':
        from sklearn import preprocessing
        # add a base of suspicion for each edge
        colWeights = preprocessing.minmax_scale(colWeights) + 1
    elif scale == 'plusone':
        colWeights += 1
    elif scale == 'log1p':
        colWeights = np.log1p(colWeights) + 1
    else:
        print('[Warning] no scale for the prior weight')
    n = self.nV
    colDiag = lil_matrix((n, n))
    colDiag.setdiag(colWeights)
    self.graphr = self.graphr * colDiag.tocsr()
    self.graph = self.graphr.tocoo(copy=False)
    self.graphc = self.graph.tocsc(copy=False)
    print("finished computing weight matrix")
def RF(self, args):
    ## Random Forest
    logger.info("Running Random Forest... ")
    if args.predictor.lower() == 'classifier':
        from sklearn.ensemble import RandomForestClassifier as randomforest
        rf = randomforest(criterion='entropy', class_weight='balanced', random_state=42)
    elif args.predictor.lower() == 'regressor':
        from sklearn.ensemble import RandomForestRegressor as randomforest
        ## Initialize RandomForest
        rf = randomforest(n_estimators=20000, max_depth=4, random_state=42,
                          max_samples=0.6, n_jobs=-1)
    rf.fit(self.X_train, self.y_train)

    # Get the predicted values
    self.y_pred = rf.predict(self.X_data)
    if args.predictor.lower() == 'regressor':
        self.y_pred = logistic.cdf(self.y_pred)
    self.data['boosting_score'] = self.y_pred
    self.model = rf
    return self
def run(self, x, y, U): # N: # of Examples, k: # of features (N, k) = x.shape (tmp, kNew) = U.shape UT = U.T if self.w is None: self.w = csc_matrix(np.zeros((kNew, 1))) # Start PA for i in range(N): if i % 100 == 0: print ('step: ', i) print ('Cumulative Error Rate', self.cumu_false / self.cumu_data) print ('Cumulative False Negative Rate', self.cumu_false_negative / self.cumu_data) xi = x[i, :].T yi = y[i, :][0] xiNew = UT.dot(xi) tmp = (self.w.T).dot(xiNew) prob_of_positive = logistic.cdf(tmp[0, 0]) predict = 1 if prob_of_positive >= 0.5 else -1 mistake = 1 if (predict != yi) else 0 false_negative = 1 if yi == 1 and mistake else 0 self.cumu_false += mistake self.cumu_false_negative += false_negative self.cumu_data += 1 # update w self.w = self.w + self.gamma * xiNew * ((yi + 1) / 2 - prob_of_positive) return (self.cumu_false, self.cumu_false_negative, self.w)
def vaccine_baseline_dgm(graph, number_of_nodes):
    """Simulates baseline variables for the vaccine & infection data set

    A ~ Bernoulli(0.15)
    H ~ Bernoulli(expit(-0.15 + 0.1*A))

    Returns
    -------
    pandas DataFrame with the distribution of W
    """
    data = pd.DataFrame()
    nodes = []
    for nod, d in graph.nodes(data=True):
        nodes.append(nod)
    data['id'] = nodes

    # Asthma
    a = np.random.binomial(n=1, p=0.15, size=number_of_nodes)
    data['A'] = a

    # Hand hygiene
    d = np.random.binomial(n=1, p=logistic.cdf(-0.15 + 0.1 * a), size=number_of_nodes)
    data['H'] = d

    # Output W distribution data set
    return data
def get_data():
    np.random.seed(0)
    beta0 = 2
    beta = np.array([1] * 10 + [-1] * 10 + [0] * 80)[None, :]
    X = np.random.uniform(0, 1, p * n).reshape((p, n))
    f_true = logistic.cdf(beta0 + beta @ X)[0]
    return X, f_true
def random_data(N=5000, K=3, unobservables=False, **kwargs):
    """
    Function that generates data according to one of two simple models that
    satisfies the unconfoundedness assumption.

    The covariates and error terms are generated according to
        X ~ N(mu, Sigma), epsilon ~ N(0, Gamma).

    The counterfactual outcomes are generated by
        Y0 = X*beta + epsilon_0,
        Y1 = delta + X*(beta+theta) + epsilon_1.

    Selection is done according to the following propensity score function:
        P(D=1|X) = Lambda(X*beta).

    Here Lambda is the standard logistic CDF.

    Parameters
    ----------
    N: int
        Number of units to draw. Defaults to 5000.
    K: int
        Number of covariates. Defaults to 3.
    unobservables: bool
        Returns potential outcomes and true propensity score
        in addition to observed outcome and covariates if True.
        Defaults to False.
    mu, Sigma, Gamma, beta, delta, theta: NumPy ndarrays, optional
        Parameter values appearing in data generating process.

    Returns
    -------
    tuple
        A tuple in the form of (Y, D, X) or (Y, D, X, Y0, Y1) of
        observed outcomes, treatment indicators, covariate matrix,
        and potential outcomes.
    """
    mu = kwargs.get('mu', np.zeros(K))
    beta = kwargs.get('beta', np.ones(K))
    theta = kwargs.get('theta', np.ones(K))
    delta = kwargs.get('delta', 3)
    Sigma = kwargs.get('Sigma', np.identity(K))
    Gamma = kwargs.get('Gamma', np.identity(2))

    X = np.random.multivariate_normal(mean=mu, cov=Sigma, size=N)
    Xbeta = X.dot(beta)
    pscore = logistic.cdf(Xbeta)
    D = np.array([np.random.binomial(1, p, size=1) for p in pscore]).flatten()

    epsilon = np.random.multivariate_normal(mean=np.zeros(2), cov=Gamma, size=N)
    Y0 = Xbeta + epsilon[:, 0]
    Y1 = delta + X.dot(beta + theta) + epsilon[:, 1]
    Y = (1 - D) * Y0 + D * Y1

    if unobservables:
        return Y, D, X, Y0, Y1, pscore
    else:
        return Y, D, X
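# Minimal usage sketch for random_data above. The call signature comes from the
# snippet; the sample size and the naive difference-in-means comparison are
# illustrative assumptions only.
import numpy as np

Y, D, X = random_data(N=1000, K=3)
naive_ate = Y[D == 1].mean() - Y[D == 0].mean()  # biased, since D depends on X*beta
print(naive_ate)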
def plot_logistic_fit(models, data, CV_info, num_columns=2):
    num_cv = CV_info.n_folds
    num_rows = int(np.ceil(float(num_cv) / float(num_columns)))
    fig_temp = plot.subplots(nrows=num_rows, ncols=num_columns)
    fig = fig_temp[0]
    fig.tight_layout()
    axes = fig_temp[1]

    cv = 0
    for train, test in CV_info:
        row_n = int(np.ceil(cv / num_columns))
        col_n = int(np.mod(float(cv), float(num_columns)))
        axes[row_n, col_n].set_title('CV fold %i' % (cv + 1))
        intercept = models[cv].intercept_
        parameters = np.squeeze(np.asarray(models[cv].coef_))
        # ----------------------------------------------------------------------
        # For plotting data along collapsed dimension
        collapsed_x_data = intercept + np.dot(parameters, data[test].transpose())
        y_data = models[cv].predict(data[test])
        y_data = np.asarray(y_data)
        axes[row_n, col_n].scatter(collapsed_x_data, y_data)
        # ----------------------------------------------------------------------
        # For plotting function
        x_func = np.linspace(np.min(collapsed_x_data), np.max(collapsed_x_data), 100)
        y_func = logistic.cdf(x_func)
        axes[row_n, col_n].plot(x_func, y_func)
        # ----------------------------------------------------------------------
        cv += 1
    # --------------------------------------------------------------------------
    plot.show()
def SGBoost(self, args):
    ## Stochastic gradient Boosting
    logger.info("Running Stochastic Gradient Boosting ... ")
    if args.predictor.lower() == 'classifier':
        from sklearn.ensemble import GradientBoostingClassifier as sgbt
    elif args.predictor.lower() == 'regressor':
        from sklearn.ensemble import GradientBoostingRegressor as sgbt

    ## Initialize model
    sgbt = sgbt(max_depth=6, subsample=0.6, n_estimators=5000)

    ## Fit regressor to the training set
    sgbt.fit(self.X_train, self.y_train)

    ## Predict the labels
    self.y_pred = sgbt.predict(self.X_data)
    if args.predictor.lower() == 'regressor':
        self.y_pred = logistic.cdf(self.y_pred)
    self.data['boosting_score'] = self.y_pred
    self.model = sgbt
    return self
def fit(self, X, y, beta=0., max_iter=10000, eps=10e-6):
    X, num_features = self.init_params_(X)
    n_iter = 0
    conv_criterion = True
    l_w = LogisticRegression.log_likelihood(
        self.eta_, X, y) + beta * np.inner(self.w_, self.w_) / 2
    while conv_criterion and n_iter < max_iter:
        nabla_l_w = LogisticRegression.grad_log_likelihood(
            self.eta_, X, y) + beta * self.w_
        Hl_w = LogisticRegression.hess_log_likelihood(
            self.eta_, X) + beta * np.eye(num_features + 1)
        inv_Hl_w = np.linalg.inv(Hl_w)
        self.w_ = self.w_ - np.matmul(inv_Hl_w, nabla_l_w)
        n_iter += 1
        conv_criterion = l_w
        self.eta_ = logistic.cdf(np.matmul(X, self.w_))
        l_w = LogisticRegression.log_likelihood(
            self.eta_, X, y) + beta * np.inner(self.w_, self.w_) / 2
        det_H = np.linalg.det(Hl_w)
        conv_criterion = (np.abs(conv_criterion - l_w) > eps) and (np.abs(det_H) > eps)
def precompute_single_recommendations(self, user, N):
    user_ratings_so_far = self.matrix[user]
    # positive phase
    poshidprobs = np.zeros(self.numhids)
    poshidprobs += self.hidbiaises
    for i in range(len(user_ratings_so_far)):
        poshidprobs += self.Wijk[user_ratings_so_far[i][1] - 1][user_ratings_so_far[i][0]]
    poshidprobs = logistic.cdf(poshidprobs)
    poshidstates = poshidprobs > np.random.rand(self.numhids)
    print(np.mean(poshidstates))
    # start negative phase
    negdata = np.zeros([self.five, self.numdims])
    for tmp in range(self.five):
        negdata[tmp] = np.dot(self.Wijk[tmp], poshidstates)
        negdata[tmp] += self.visbiaises[tmp]
    negdata = np.exp(negdata)
    sum1 = np.sum(negdata, axis=0)
    negdata /= sum1
    tmp = np.zeros([self.five, self.numdims])
    for i in range(self.five):
        tmp[i] = i + 1
    score = np.sum(negdata * tmp, axis=0)
    self.score[user] = score
    print(np.mean(score))
    for en in user_ratings_so_far:
        score[en[0]] = -1
    return list(np.argsort(score)[-N:])
def XGBoost(self, args):
    ## Gradient Boosting
    logger.info("Running Gradient Boosting ... ")
    if args.predictor.lower() == 'classifier':
        from xgboost import XGBClassifier as xgb
    elif args.predictor.lower() == 'regressor':
        from xgboost import XGBRegressor as xgb

    xg_regression_model = xgb(objective='binary:logistic',
                              n_estimators=20000,
                              colsample_bytree=0.6,
                              max_depth=6)

    ## Fit the regressor to the training set
    xg_regression_model.fit(self.X_train, self.y_train)

    ## Predict the labels
    self.y_pred = xg_regression_model.predict(self.X_data)
    if args.predictor.lower() == 'regressor':
        self.y_pred = logistic.cdf(self.y_pred)
    self.data['boosting_score'] = self.y_pred
    self.model = xg_regression_model
    return self
def diff_to_prob(differentials, sigmoid_mean=0, sigmoid_slope=.19):
    # Ted's slope is 1/scale
    prob = logistic.cdf(differentials, loc=sigmoid_mean, scale=1/sigmoid_slope)
    return prob
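# Quick sanity check of the slope/scale relationship noted in the comment above:
# scipy's logistic.cdf(x, loc, scale) equals expit((x - loc)/scale), so passing
# scale = 1/slope is the same as expit(slope*(x - loc)). Illustrative values only.
import numpy as np
from scipy.stats import logistic
from scipy.special import expit

x = np.linspace(-10, 10, 5)
assert np.allclose(logistic.cdf(x, loc=0, scale=1/0.19), expit(0.19 * x))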
def test(self, data):
    (rows, columns) = data.shape
    data = numpy.insert(data, 0, numpy.ones(rows), axis=1)
    (rows, columns) = data.shape
    y = logistic.cdf(numpy.dot(data, self.w))
    pred = numpy.around(y)
    return (y, pred)
def train(self):
    self.theta = np.random.rand(self.Ndim)
    if self.wk_rank == 0:
        self.push_vector("theta", self.theta)
    self.sync()
    for i in range(self.iter_num):
        # print('iter: %d' % i)
        self.local_theta = np.zeros(self.Ndim)
        for x, y in self.train_set:
            coef = self.learning_rate * (y - logistic.cdf(np.inner(x, self.theta)))
            self.local_theta += (coef * x)
        self.push_vector("theta", self.local_theta)
        self.sync()
        self.theta = self.pull_vector("theta")
    if self.wk_rank == 0:
        print(self.theta)
def forward(x0, w):
    """
    NN forward propagation algorithm.

    Inputs (x0, w):
        x0: Network input values
        w: Tuple of weights, one for each layer

    Outputs (zL, xl_list):
        zL: Output neuron discriminative function (NOT passed through sigmoid)
        xl_list: list of activation values for each layer (YES passed through sigmoids)
    """
    xl = x0
    xl_list = [xl]
    for wl in w:
        zl = np.dot(wl, xl)
        xl = logistic.cdf(zl)
        xl_list.append(xl)
    return np.asscalar(zl), xl_list
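# Minimal usage sketch for forward() above: a 3-2-1 network with random weights.
# The weight shapes are assumptions chosen to satisfy np.dot(wl, xl); note that
# np.asscalar inside forward() requires NumPy < 1.23 (newer code would use .item()).
import numpy as np
from scipy.stats import logistic

x0 = np.array([0.5, -1.0, 2.0])
w = (np.random.randn(2, 3), np.random.randn(1, 2))
zL, activations = forward(x0, w)
print(zL, [a.shape for a in activations])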
def get_hull(x, y, n_bins):
    bins = np.linspace(-5., 5., n_bins)
    bins = logistic.cdf(bins) * (2) - 1
    bins = np.linspace(x.min(), x.max(), n_bins)

    down_hull = np.zeros((n_bins-1, ))
    up_hull = np.zeros((n_bins-1, ))
    x_hull = np.zeros((n_bins-1, ))

    for i in range(n_bins-1):
        down, up = bins[i], bins[i+1]
        bin_ids = (x >= down) & (x < up)
        down_id = y[bin_ids].argmin()
        up_id = y[bin_ids].argmax()
        down_hull[i] = y[bin_ids][down_id]
        up_hull[i] = y[bin_ids][up_id]
        x_hull[i] = x[bin_ids][down_id]

    x_hull = bins[:-1] + (bins[1] - bins[0])/2
    return x_hull, down_hull, up_hull
def calc_log_evidence(m, v_2, sigma_0, X, y, m_1, v_1, m_2, v, p, p_3, p_2, v_s, bias): ## TODO: Calculate properly with the bias!! sigma_0_inv = 1. / sigma_0 V_2 = np.diag(v_2) v_2_inv = 1. / v_2 V_2_inv = np.diag(v_2_inv) v_1_inv = 1. / v_1 v_inv = 1. / v m_1_s_v_1 = np.multiply(m_1 ** 2, v_1_inv) m_2_s_v_2 = np.multiply(m_2 ** 2, v_2_inv) m_s_v = np.multiply(m ** 2, v_inv) n, d = X.shape try: alpha = scipy.linalg.det(np.identity(d) + sigma_0_inv * np.dot(V_2, np.dot(X.T, X))) except ValueError: import ipdb; ipdb.set_trace() cdf_p3 = log.cdf(p_3) cdf_m_p3 = log.cdf(-p_3) cdf_p2 = log.cdf(p_2) cdf_m_p2 = log.cdf(-p_2) # import ipdb; ipdb.set_trace() c = cdf_p3 * norm.pdf(0, m_1[:d - bias], np.sqrt(v_1 + v_s)[: d - bias]) + \ cdf_m_p3 * norm.pdf(0, m_1[: d - bias], np.sqrt(v_1)[: d - bias]) c[np.where(c == 0)[0]] = 0.0000000001 log_s1 = 0.5 * (np.dot(m.T, np.dot(V_2_inv, m_2) + sigma_0_inv * np.dot(X.T, y)) - n * np.log(2 * np.pi * sigma_0) - sigma_0_inv * np.dot(y.T, y) - np.dot(m_2.T, np.dot(V_2_inv, m_2)) - np.log(alpha) + np.sum(np.log(1. + np.multiply(v_2, v_1_inv)) + m_1_s_v_1 + m_2_s_v_2 - m_s_v)) # import ipdb; ipdb.set_trace() log_s2 = 0.5 * np.sum(2. * np.log(c) + np.log(1. + np.multiply(v_1, v_2_inv)[: d - bias]) + m_1_s_v_1[: d - bias] + m_2_s_v_2[: d - bias] - m_s_v[: d - bias] + 2. * np.log(log.cdf(p) * cdf_m_p3 + log.cdf(-p) * cdf_p3) - 2. * np.log(cdf_m_p3 * cdf_p3)) res = log_s1 + log_s2 + 0.5 * d * np.log(2. * np.pi) + \ 0.5 * np.sum(np.log(v) + m_s_v - m_1_s_v_1 - m_2_s_v_2) + \ np.sum(np.log(np.multiply(cdf_p2, cdf_p3) + np.multiply(cdf_m_p2, cdf_m_p3))) if np.isinf(res) or np.isnan(res): import ipdb; ipdb.set_trace() return res
def train(self, data, target, lamda, iterations, tolerance, learning_rate):
    (rows, columns) = data.shape
    data = numpy.insert(data, 0, numpy.ones(rows), axis=1)
    (rows, columns) = data.shape
    self.w = numpy.zeros(columns)
    ew_old = -numpy.inf
    for i in range(iterations):
        yx = numpy.dot(data, self.w)
        s = logistic.cdf(yx)
        ew = ((target*numpy.log(s) + (1-target)*numpy.log(1-s)).sum()
              - (0.5*lamda)*numpy.matmul(numpy.transpose(self.w), self.w))
        print('Iteration: {}, Cost function: {}'.format(i, ew))
        if abs(ew - ew_old) < tolerance:
            break
        gradient = numpy.matmul(numpy.transpose(data), target-s) - lamda*self.w
        self.w = self.w + learning_rate*gradient
        ew_old = ew
def calc_b(p_2_new, p_3, m_1, v_1, v_s):
    tmp_1 = logistic.cdf(p_2_new + p_3) * (m_1 ** 2 - v_1 - v_s) / (v_1 + v_s) ** 2
    tmp_2 = logistic.cdf(- p_2_new - p_3) * (m_1 ** 2 * v_1 ** -2 - 1.0 / v_1)
    return tmp_1 + tmp_2
def sigmoid(value): return logistic.cdf(value)
def sigmoid(z):
    """Notice that z can be a scalar, a vector or a matrix;
    return the sigmoid of input z."""
    return logistic.cdf(z)
def __init__(self, num_features, num_output=1, hidden_layer=None, activation=("expit", 1), learn_rate=1, default_bias="random", max_epochs=10, scale=1, verbose=False, temperature=1, online=True): """Constructor for the NeuralNet class. num_features: The number of features that each sample has. This will equal the number of neurons in the input layer. num_output: The number of output labels each sample has. This will equal the number of neurons in the output layer. hidden_layer: A list containing the number of nodes in the (i+1)th hidden layer (for i starting at 0). If set to None (default value), then the neural network will have one hidden layer with num_features * 1.5 hidden nodes. activation: The default activation function to use in each neuron. Default is the inverse logistic function with temperature = 1: s(x) = 1/(1 + e^(-x)). This uses scipy for optimization purposes. learn_rate: The learning rate applied to the training process. Default value is 1. default_bias: The default weight assigned to the weight vector. Default value is random, uniformly between (-scale, scale). max_epochs: The max number of iterations on the training set. Default value is 10. scale: Determines the range of random values for the initial weights of the model. The value of the weights will range from (-scale, scale). For example, if scale=2, then the initial weights can range from (-scale, scale). Default value is 1. verbose: Used to see how fast the neural network is being trained. Indicates when an epoch has finished. online: Indicates that the weights should be updated for every training example. """ self.verbose = verbose self.num_features = num_features self.num_output = num_output self.hidden_layer = hidden_layer self.learn_rate = learn_rate self.default_bias = default_bias self.max_epochs = max_epochs self.scale = scale self.online = online # NOTE: There is no proven evidence that the ideal number of # nodes in the hidden layer is 1.5, but it is suggested. Citation needed. if self.hidden_layer is None: num_nodes = int(math.floor(num_features * 1.5)) self.hidden_layer = [num_nodes] # TODO: Address this. if len(self.hidden_layer) > 1: raise NotImplementedError(one_line("""Neural network containing more than one hidden layer has not been implemented yet.""")) # TODO: Address this. if self.num_output != 1: raise NotImplementedError(one_line("""Neural network containing more than one output label has not been implemented yet.""")) # Assign activation function here, depending on the argument. if activation[0] == "expit": temp = activation[1] # This is just an optimization using scipy if temp == 1: self.default_act = expit self.default_deriv = np.vectorize(lambda x: x * (1 - x)) else: self.default_act = lambda x: logistic.cdf(x, scale=temp) self.default_deriv = np.vectorize(lambda x: temp * x * (1 - x)) # elif activation[0] == "tanh": # if activation[1] != 1: print(one_line("""Warning: other temperatures # for the tanh activation function have not been implemented.""")) # self.default_act = lambda x: 2 * expit(x) - 1 # self.default_deriv = np.vectorize(lambda x: 2 * x * (1 - x)) else: raise NotImplementedError(one_line("""Activation function not supported yet: {0}""".format(activation))) self.init_weights()
import os
from scipy.stats import logistic

flag = "prod"
conf = Config(flag, "prod", 300)
model = np.load(conf.path_model_npy + ".npy")
word_embed = model[0]
prod_embed = model[1]
transfer_w = model[2]
transfer_b = model[3]
dp = DataProvider(conf)

weight = np.dot(word_embed, transfer_w)
weight = logistic.cdf(np.add(weight, transfer_b))

for topic_id in range(conf.dim_item):
    word_ids = weight[:, topic_id]
    word_ids = np.argsort(word_ids)[::-1][:50]
    words = [(dp.idx2word[word_id], weight[word_id, topic_id])
             for word_id in word_ids if weight[word_id, topic_id] > .9]
    print("Topic", topic_id)
    print(words)
    print("=========================\n")

print("finish")
def sigmoid(x): return logistic.cdf(x)
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import logistic

from sat_exp import saturating_exp

# theta = (a, b, l): location, scale, and lapse rate
F = lambda x, theta: 0.5 + (1 - 0.5 - theta[2]) * logistic.cdf(x, loc=theta[0], scale=theta[1])
Finv = lambda thresh_val, theta: logistic.ppf(
    (thresh_val - 0.5) / (1 - 0.5 - theta[2]), loc=theta[0], scale=theta[1])

def color_list(n, cmap=None):
    cm = plt.get_cmap("RdYlGn" if cmap is None else cmap)
    colors = [cm(i) for i in np.linspace(0, 1, n)]
    return colors*(n//len(colors)) + colors[:n % len(colors)]

def plot_pmf(cohs, pcor, res):
    cmap = color_list(len(res)+1, 'cool')
    xs = np.array(cohs)
    xsf = np.linspace(min(xs), max(xs), 50)
    for i, (theta, thresh) in enumerate(res):
        plt.scatter(cohs, pcor[i, :]/100.0, color=cmap[i])
        plt.plot(xsf, F(xsf, theta), color=cmap[i], linestyle='-')
    plt.xlim([0.01, None])
    plt.ylim([0.45, 1.05])
    plt.xscale('log')
    plt.xlabel('signal strength')
    plt.ylabel('accuracy')
    plt.show()

def plot_pmf_thresh(reses):
    cmap = color_list(len(reses)+1, 'Greens')
    for i, res in enumerate(reses):
        durs = np.arange(1, len(res)+1)
filename = "skeletons & matching cropped pics/cropped pics/v018-penn.9-1uB2D2-cropm.png" img = io.imread(filename) img = (img - np.mean(img)) / np.std(img) sigma = .75 Hxx, Hxy, Hyy = hessian_matrix(img, sigma=sigma, mode="wrap") e1, e2 = hessian_matrix_eigvals(Hxx, Hxy, Hyy) # How much bigger is the first eigenvalue's magnitude # compared with the second? log_condition = np.log(abs(e1/e2)) log_condition = log_condition / np.std(log_condition) out = logistic.cdf(log_condition) markers = np.zeros_like(out) markers[out < 0] = 1 markers[out > np.percentile(out, 90)] = 2 plt.imshow(out) plt.set_cmap('binary') plt.colorbar() plt.show()
def trainProx(wRows, wData, n, b, X, y, eta, l1, l2, outputFreq): nr,nc = X.shape nl = y.shape[1] assert y.shape[0] == nr assert b.size == nl if useAdaGrad: if useSharedStep: assert n.size == nc else: assert n.shape == (nc,nl) # vector of time step at which each coordinate is up-to-date tVec = np.zeros(nc, dtype=np.int64) onlineLoss = 0 totalOnlineLoss = 0 subEpoch = 0 for t in range(nr): if t % 100 == 0: print "training row: " + str(t) (row, xInds, xVals) = getSample(X, t) if xInds.size == 0: continue # 1. Lazily update relevant rows of w, storing them in tempW totalNnzW = sum(wRows[xInd].size for xInd in xInds) tempW = np.ndarray(totalNnzW) kVec = np.ndarray(totalNnzW, dtype=np.int64) if useAdaGrad: etaVec = np.ndarray(totalNnzW) else: etaVec = eta pos = 0 for xInd in xInds: numW = wRows[xInd].size endPos = pos+numW kVec[pos:endPos] = t - tVec[xInd] if useAdaGrad: if useSharedStep: etaVec[pos:endPos] = eta / (1 + math.sqrt(n[xInd])) else: etaVec[pos:endPos] = eta / (1 + np.sqrt(n[xInd,wRows[xInd]])) tempW[pos:endPos] = wData[xInd] pos = endPos tempW = iteratedProx(tempW, kVec, l1*etaVec, l2*etaVec) tVec[xInds] = t # 2. Compute scores scores = b.copy() pos = 0 for (xInd, xVal) in zip(xInds, xVals): numW = wRows[xInd].size endPos = pos+numW scores[wRows[xInd]] += tempW[pos:endPos] * xVal pos = endPos # 3. Compute loss and subtract labels from (transformed) scores for gradient (startY, endY) = y.indptr[row], y.indptr[row+1] yCols = y.indices[startY:endY] yVals = y.data[startY:endY] if useSqErr: # linear probability model # quadratic loss for incorrect prediction, no penalty for invalid (out of range) correct prediction scores[yCols] = yVals - scores[yCols] scores = np.clip(scores, 0, np.inf) scores[yCols] *= -1 loss = 0.5 * np.dot(scores, scores) onlineLoss += loss totalOnlineLoss += loss else: pos = logistic.logcdf(scores) neg = logistic.logcdf(-scores) pos -= neg scores = logistic.cdf(scores) loss = -np.dot(pos[yCols], yVals)-neg.sum() scores[yCols] -= yVals onlineLoss += loss totalOnlineLoss += loss # 4. Compute gradient as outer product # this will be dense in general, unfortunately g = np.outer(xVals, scores) # 5. Compute updated point (store it in g) if useAdaGrad: if useSharedStep: n[xInds] += np.square(g).sum(1) etaVec = np.tile(eta/(1+np.sqrt(n[xInds])), (nl,1)).T else: n[xInds,:] += np.square(g) etaVec = eta/(1+np.sqrt(n[xInds,:])) else: etaVec = eta g *= -etaVec pos = 0 for xI in range(xInds.size): xInd = xInds[xI] numW = wRows[xInd].size endPos = pos+numW g[xI,wRows[xInd]] += tempW[pos:endPos] pos = endPos # 6. Sparsify updated point and store it back to W # now g holds dense (over labels) W - eta*g reassignToConvertedW(wRows, wData, xInds, g) # Print output periodically if (t+1) % outputFreq == 0: bringAllUpToDate(wRows, wData, tVec, t+1) tVec = np.tile(t+1, nc) printOutputLine(subEpoch, wRows, wData, b, testX, testY, l1, l2, onlineLoss / outputFreq) subEpoch += 1 onlineLoss = 0 # print output for whole epoch if nr % outputFreq != 0: # otherwise we are already up to date bringAllUpToDate(wRows, wData, tVec, nr) printOutputLine("*", wRows, wData, b, testX, testY, l1, l2, totalOnlineLoss / nr) print
def _make_weight(year):
    scaled = 2*(year - YEARS[0]) / (YEARS[-1] - YEARS[0]) - 1
    scaled *= -4
    return logistic.cdf(scaled)
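# Small illustration of _make_weight above. It assumes YEARS is an ordered,
# module-level sequence of years (the name comes from the snippet; the values
# below are made up). The earliest year maps to logistic.cdf(4) ~ 0.982 and the
# latest to logistic.cdf(-4) ~ 0.018, so older years receive more weight.
from scipy.stats import logistic

YEARS = list(range(2000, 2021))
print(_make_weight(YEARS[0]), _make_weight(YEARS[-1]))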
def calc_a(p_2_new, p_3, m_1, v_1, v_s):
    tmp_1 = logistic.cdf(p_2_new + p_3) * m_1 / (v_1 + v_s)
    tmp_2 = logistic.cdf(-p_2_new - p_3) * m_1 / v_1
    return tmp_1 + tmp_2
def sigmoid(self): return FeatureObj(logistic.cdf(self.v), True) # TODO pay attention to rounding errors
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 15 15:16:27 2017

@author: michellezhao
"""

from scipy.stats import logistic
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots(1, 1)
x = np.linspace(0, 50)
ax.plot(x, logistic.cdf(x, loc=3, scale=1), 'r-', lw=5, alpha=0.6, label='logistic cdf')
plt.show()