def getSJSuggestion(jobPoolLocal):
    global userDF, appDF, jobDF, jobPool, liveJobDF, liveJobDict, LiveJobs, \
        JobUserSparseMatrix, LiveJobUserSparseMatrix
    jobPool = jobPoolLocal
    st = datetime.now()
    userDF = pd.DataFrame({'userId': appDF['userId'].unique()})
    appDF['userLookUp'] = pd.match(appDF['userId'], userDF['userId'])
    appDF['jobLookUp'] = pd.match(appDF['jobId'], jobDF['jobId'])
    appDF['liveJobLookUp'] = pd.match(appDF['jobId'], liveJobDF['jobId'])
    row = appDF['jobLookUp']
    col = appDF['userLookUp']
    data = np.repeat(1, appDF.shape[0])
    JobUserSparseMatrix = sp.coo_matrix(
        (np.array(data), (np.array(row), np.array(col))),
        shape=(jobDF.shape[0], userDF.shape[0]))  # 30L x 5L
    del row, col, data
    JobUserSparseMatrix = JobUserSparseMatrix.tocsr()
    row = appDF[~(appDF.liveJobLookUp == -1)]['liveJobLookUp']
    col = appDF[~(appDF.liveJobLookUp == -1)]['userLookUp']
    data = np.repeat(1, appDF[~(appDF.liveJobLookUp == -1)].shape[0])
    LiveJobUserSparseMatrix = sp.coo_matrix(
        (np.array(data), (np.array(row), np.array(col))),
        shape=(liveJobDF.shape[0], userDF.shape[0]))  # 30L x 1L
    del row, col, data
    LiveJobUserSparseMatrix = LiveJobUserSparseMatrix.tocsr()
    liveJobDict = dict(zip(liveJobDF.index, liveJobDF['jobId']))
    collectionJS.drop()
    collectionJSExport.drop()
    print 'JobSuggestions Count:', collectionJS.count(), 'JobSuggestionsExport Count:', collectionJSExport.count()
    st1 = datetime.now()
    for jobPosition in range(0, JobUserSparseMatrix.shape[0], jobPool):
        getSJSuggestionPoolWise(jobPosition)
        print datetime.now() - st1
    print 'JobSuggestions Count:', collectionJS.count(), 'JobSuggestionsExport Count:', collectionJSExport.count()
    print "Run time :" + str(datetime.now() - st)  # 10 min
def balanceFactors(data, sep, cost, factors, constraints, model): its = 0 cnvg = 1 while cnvg > .0001: its = its + 1 if model != 'attConstrained': calcAi(data, sep, cost, factors, model) AiBF = (data.groupby(data[constraints['production']].name).aggregate({"Ai": np.sum})) AiBF["Ai"] = 1/AiBF["Ai"] updates = AiBF.ix[pd.match(data[constraints['production']], AiBF.index), "Ai"] data["Ai"] = updates.reset_index(level=0, drop=True) if(updates.notnull().any()) else data["Ai"] if model == 'prodConstrained': break if its == 1: data["OldAi"] = data["Ai"] else: data["diff"] = abs((data["OldAi"] - data["Ai"])/data["OldAi"]) data["OldAi"] = data["Ai"] if model != 'prodConstrained': calcBj(data, sep, cost, factors, model) BjBF = data.groupby(data[constraints['attraction']].name).aggregate({"Bj": np.sum}) BjBF["Bj"] = 1/BjBF["Bj"] updates = BjBF.ix[pd.match(data[constraints['attraction']], BjBF.index), "Bj"] data["Bj"] = updates.reset_index(level=0, drop=True) if(updates.notnull().any()) else data["Bj"] if its == 1: if model == 'attConstrained': break data["OldBj"] = data["Bj"] else: data["diff"] = abs((data["OldBj"] - data["Bj"])/data["OldBj"]) data["OldBj"] = data["Bj"] cnvg = np.sum(data["diff"]) #print cnvg, its return data
def _load_data(self, x1, x2):
    final_data = np.zeros((x1.num_row, x2.num_col))
    final_row_labels = x1.row_labels
    final_col_labels = x2.col_labels
    prior_data = np.genfromtxt(self.data_file)
    prior_row_labels = self.prior_row_labels.tolist()
    assert prior_data.shape[0] == len(prior_row_labels)
    prior_col_labels = self.prior_col_labels.tolist()
    assert prior_data.shape[1] == len(prior_col_labels)
    prior_row_match_ind = pd.match(prior_row_labels, final_row_labels)
    prior_rows_to_transfer = [
        el for el in range(len(prior_row_labels))
        if prior_row_match_ind[el] != -1
    ]
    final_rows_to_fill = prior_row_match_ind[prior_row_match_ind != -1]
    prior_col_match_ind = pd.match(prior_col_labels, final_col_labels)
    prior_cols_to_transfer = [
        el for el in range(len(prior_col_labels))
        if prior_col_match_ind[el] != -1
    ]
    final_cols_to_fill = prior_col_match_ind[prior_col_match_ind != -1]
    final_data[np.ix_(final_rows_to_fill, final_cols_to_fill)] = \
        prior_data[prior_rows_to_transfer, :][:, prior_cols_to_transfer]
    self.row_labels = final_row_labels
    self.col_labels = final_col_labels
    self.data = final_data
def category_transformation(train_categoric, test_categoric, labels, type='std'):
    if type == 'freq':
        print("Encoding categories by frequency rank...")
        for c in train_categoric.columns:
            freqs = train_categoric[c].append(test_categoric[c]).value_counts()
            train_categoric[c] = pd.match(train_categoric[c].values,
                                          freqs[0:91].index)
            test_categoric[c] = pd.match(test_categoric[c].values,
                                         freqs[0:91].index)
    if type == 'std':
        print("Encoding categories by sklearn label encoder...")
        for c in train_categoric.columns:
            lbl = LabelEncoder()
            lbl.fit(list(train_categoric.ix[:, c]) + list(test_categoric.ix[:, c]))
            train_categoric.ix[:, c] = lbl.transform(train_categoric.ix[:, c])
            test_categoric.ix[:, c] = lbl.transform(test_categoric.ix[:, c])
    if type == 'tgtrate':
        print("Encoding categories by target rate...")
        for c in train_categoric.columns:
            train_categoric[c], test_categoric[c] = category_to_prob_weight(
                train_categoric, test_categoric, c, labels)
    if type == 'rank':
        print("Encoding categories by rank transformation...")
        for c in train_categoric.columns:
            rank = pd.concat([train_categoric[c], labels],
                             axis=1).groupby(c).mean().sort_values(
                                 by='target', ascending=False)
            train_categoric[c] = pd.match(train_categoric[c].values,
                                          rank[0:20000].index)
            test_categoric[c] = pd.match(test_categoric[c].values,
                                         rank[0:20000].index)
    if type == 'onehot':
        print("One hot... ")
        for c in train_categoric.columns:
            uniques = np.unique(train_categoric[c])
            if len(uniques) > 100:
                train_categoric.drop(c, axis=1, inplace=True)
                test_categoric.drop(c, axis=1, inplace=True)
        x_cat_train = train_categoric.T.to_dict().values()
        x_cat_test = test_categoric.T.to_dict().values()
        # vectorize
        vectorizer = DV(sparse=False)
        train_categoric = pd.DataFrame(vectorizer.fit_transform(x_cat_train))
        test_categoric = pd.DataFrame(vectorizer.transform(x_cat_test))
    return train_categoric, test_categoric
def balanceFactors(data, sep, cost, factors, constraints, model): """ calculate balancing factors and balance the balancing factors if doubly constrained model """ its = 0 cnvg = 1 while cnvg > .0001: its = its + 1 #If model is prod or doubly constrained if model != 'attConstrained': calcAi(data, sep, cost, factors, model) AiBF = (data.groupby( data[constraints['production']].name).aggregate({"Ai": np.sum})) AiBF["Ai"] = 1 / AiBF["Ai"] updates = AiBF.ix[ pd.match(data[constraints['production']], AiBF.index), "Ai"] data["Ai"] = updates.reset_index(level=0, drop=True) if ( updates.notnull().any()) else data["Ai"] #If model is prod constrained stop here - dont need to balance if model == 'prodConstrained': break if its == 1: data["OldAi"] = data["Ai"] else: data["diff"] = abs( (data["OldAi"] - data["Ai"]) / data["OldAi"]) data["OldAi"] = data["Ai"] #If model is att or doubly constrained if model != 'prodConstrained': calcBj(data, sep, cost, factors, model) BjBF = data.groupby( data[constraints['attraction']].name).aggregate({"Bj": np.sum}) BjBF["Bj"] = 1 / BjBF["Bj"] updates = BjBF.ix[ pd.match(data[constraints['attraction']], BjBF.index), "Bj"] data["Bj"] = updates.reset_index(level=0, drop=True) if ( updates.notnull().any()) else data["Bj"] if its == 1: #If model is att constrained stop here - dont need to balance if model == 'attConstrained': break data["OldBj"] = data["Bj"] else: data["diff"] = abs( (data["OldBj"] - data["Bj"]) / data["OldBj"]) data["OldBj"] = data["Bj"] cnvg = np.sum(data["diff"]) #print cnvg, its return data
def category_transformation(train_categoric, test_categoric, labels, type='std'):
    if type == 'freq':
        print("Encoding categories by frequency rank...")
        for c in train_categoric.columns:
            freqs = train_categoric[c].append(test_categoric[c]).value_counts()
            train_categoric[c] = pd.match(train_categoric[c].values,
                                          freqs[0:1000].index)
            test_categoric[c] = pd.match(test_categoric[c].values,
                                         freqs[0:1000].index)
    if type == 'tgtrate':
        print("Encoding categories by target rate...")
        for c in train_categoric.columns:
            train_categoric[c], test_categoric[c] = category_to_prob_weight(
                train_categoric, test_categoric, c, labels)
    return train_categoric, test_categoric
def get_highest_reviews(bid):
    other_reviewers_of_restaurant = list(yelp['user_id'][yelp['business_id'] == bid])
    uid = random.sample(other_reviewers_of_restaurant, 1)
    user_column = pandas.match(uid, users)[0]
    similarity_indices_for_user = list(df.ix[:, user_column])
    z = numpy.array(similarity_indices_for_user)
    most_similar_users = numpy.argsort(z)[0:10]
    most_similar_users = [users[most_similar_users[i]] for i in range(10)]
    name_rest = yelp['name.business'][yelp['business_id'] == bid].unique()[0]
    f = lambda row: row['user_id'] in most_similar_users and row['name.business'] in name_rest
    k = yelp.apply(f, axis=1)
    temp = yelp[k]
    temp.iloc[:, [1, 3, 6, 9, 11, 17, 21, 27]]
    x1 = list(temp['stars.review'])
    x2 = list(temp['richness'])
    x3 = list(temp['fans'])
    x4 = list(temp['review_count.review'])
    x5 = list(temp['stars.business'])
    predicted_values = list()
    for i in range(len(temp)):
        predicted_values.append(
            [predict_expected_value(x1[i], x2[i], x3[i], x4[i], x5[i]), i])
    predicted_values.sort(key=lambda x: x[0])
    predicted_values = predicted_values[-3:]
    predicted_values.sort(key=lambda x: x[1])
    reviews = list()
    for value in range(len(predicted_values)):
        row_of_review = predicted_values[value][1]
        print(row_of_review)
        reviews.append(temp['text'].iloc[row_of_review])
    return reviews
def total_flows(dt, f, locs):
    """
    sum rows or columns to derive total inflows or total outflows
    """
    totals = dt.groupby(locs).aggregate({f: np.sum})
    return totals.ix[pd.match(locs, totals.index.astype(str))].reset_index()[f]
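# A hypothetical usage sketch of total_flows (column names here are invented,
# not from the source): given a flow table with one row per origin-destination
# pair, total_flows(dt, 'flow', dt['origin']) sums the 'flow' column per origin
# and returns that total for every row, aligned back to the original row order
# via pd.match.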
def explain_prediction(bst, explainer, data):
    """
    :param bst:
    :type bst: xgb.Booster
    :param explainer:
    :type explainer: pd.DataFrame
    :param data:
    :return:
    """
    nodes = bst.predict(data, pred_leaf=True)
    colnames = list(explainer.columns.values)[:-2]
    preds_breakdown = pd.DataFrame(np.zeros((nodes.shape[0], len(colnames))),
                                   columns=colnames)
    print("Extracting the breakdown of each prediction...")
    num_trees = nodes.shape[1]
    with click.progressbar(range(num_trees), num_trees) as bar:
        for idx in bar:
            nodes_for_tree = nodes[:, idx]
            tree_breakdown = explainer[explainer["tree"] == idx].fillna(0)
            preds_breakdown_for_tree = tree_breakdown.loc[
                pd.match(nodes_for_tree, tree_breakdown["leaf"])][colnames] \
                .reset_index(drop=True)
            preds_breakdown = preds_breakdown + preds_breakdown_for_tree
    print("DONE!")
    return preds_breakdown
def parse_usearch_allpairs(filename, seqnames):
    """Read output of ``usearch -allpairs_global -blast6out`` and return a
    square distance matrix. ``seqnames`` determines the marginal order of
    sequences in the matrix.
    """
    data = pd.read_table(filename, header=None, names=BLAST6NAMES)
    data['dist'] = pd.Series(1.0 - data['pct_id'] / 100.0, index=data.index)

    # for each sequence pair, select the longest alignment if there is
    # more than one (chooses first occurrence if there are two the same
    # length).
    maxidx = data.groupby(['query', 'target']).apply(
        lambda x: x['align_len'].idxmax())
    data = data.iloc[maxidx]

    if set(seqnames) != set(data['query']) | set(data['target']):
        # shutil.copy(filename, '.')
        raise UsearchError(
            'some sequences are missing from the output ({})'.format(filename))

    nseqs = len(seqnames)
    distmat = numpy.repeat(0.0, nseqs ** 2)
    distmat.shape = (nseqs, nseqs)

    ii = pd.match(data['query'], seqnames)
    jj = pd.match(data['target'], seqnames)

    # usearch_allpairs_files returns comparisons corresponding to a
    # triangular matrix, whereas vsearch_allpairs_files returns all
    # comparisons. Here we convert both to a square matrix.
    if data.shape[0] == nseqs * nseqs:
        distmat[ii, jj] = data['dist']
    elif data.shape[0] == (nseqs * (nseqs - 1)) / 2:
        distmat[ii, jj] = data['dist']
        distmat[jj, ii] = data['dist']
    else:
        msg = 'not all pairwise comparisons are represented ({})'
        raise UsearchError(msg.format(filename))

    return distmat
def getUserAppsDF():
    global jobDF, userDF, userAppsDF, userAppsDict, UserAppsSparseMatrix
    userAppsDF = pd.read_csv(projectHomeSJ + "/Input/ApplicationData_sorted.csv",
                             names=['userId', 'jobId'])
    userAppsDF = userAppsDF[userAppsDF.jobId.isin(jobDF.job)]
    userDF = pd.DataFrame({'userId': userAppsDF.userId.unique()})
    userAppsDF['userLookUp'] = pd.match(userAppsDF['userId'], userDF['userId'])
    userAppsDF['jobLookUp'] = pd.match(userAppsDF['jobId'], jobDF['job'])
    userAppsDF_1 = pd.DataFrame(
        list(collectionCA.find({}, {
            '_id': 1,
            'userApps': 1
        })))
    userAppsDict = dict(zip(userAppsDF_1['_id'], userAppsDF_1['userApps']))
    del userAppsDF_1
    UserAppsSparseMatrix = sp.coo_matrix(
        (np.repeat(1, userAppsDF.shape[0]),
         (userAppsDF['userLookUp'], userAppsDF['jobLookUp'])),
        shape=(userDF.shape[0], jobDF.shape[0]))
    UserAppsSparseMatrix = UserAppsSparseMatrix.tocsr()
def memo_test(events, selected_genes, groups, permutations=10000):
    groups_memo = [pandas.match(group, selected_genes) for group in groups]
    events_selected = events[selected_genes]
    sampler = switching.EventMatrixSampler(events_selected.astype(int), "gobbi")
    coverages = numpy.array(
        [events_selected[i].any(0).sum() for i in groups_memo])
    higher_coverage = numpy.zeros_like(coverages)
    for i in xrange(permutations):
        null_sample = sampler.sample()
        higher_coverage += (numpy.array(
            [null_sample[i].any(0).sum() for i in groups_memo]) >= coverages).astype(int)
    return (higher_coverage + 1.0) / (permutations + 1.0)
def calc_Bj(self, dt, d, of, p, dc=False):
    """
    calculate Bj balancing factor
    """
    Bj = self.calc_dcy(self.c, self.cf, p)
    if of:
        for fx in of:
            Bj *= of[fx]**p[fx]
    if not dc:
        dt['Bj'] = Bj
    else:
        dt['Bj'] = Bj * dt['Ai'] * dt['Oi']
    Bj = (dt.groupby(d).aggregate({'Bj': np.sum}))
    Bj['Bj'] = 1 / Bj['Bj']
    Bj = Bj.ix[pd.match(d, Bj.index), 'Bj']
    return Bj.reset_index(level=0, drop=True)
def calc_Ai(self, dt, o, df, p, dc=False):
    """
    calculate Ai balancing factor
    """
    Ai = self.calc_dcy(self.c, self.cf, p)
    if df:
        for fx in df:
            Ai *= df[fx]**p[fx]
    if not dc:
        dt['Ai'] = Ai
    else:
        dt['Ai'] = Ai * dt['Bj'] * dt['Dj']
    Ai = (dt.groupby(o).aggregate({'Ai': np.sum}))
    Ai['Ai'] = 1 / Ai['Ai']
    Ai = Ai.ix[pd.match(o, Ai.index), 'Ai']
    return Ai.reset_index(level=0, drop=True)
def comp(vector):
    # Function that computes the distinct observations in a numeric vector.
    # It is based entirely on the "comp11" function from the BNPTSclust
    # package in R created by David Alejandro Martell Juarez
    #
    # IN:
    #
    # vector <- numeric vector.
    #
    # OUT:
    #
    # jstar <- variable that rearranges the input vector into a vector with
    #          only its unique values.
    # nstar <- frequency of each distinct observation in the input vector.
    # rstar <- number of distinct observations in the input vector.
    # gn    <- variable that indicates the group number to which every
    #          entry in the input vector belongs.
    n = len(vector)
    mat = vector[:, None] == vector
    jstar = np.repeat(False, n)
    led = np.repeat(False, n)
    for j in np.arange(0, n):
        if not led[j]:
            jstar[j] = True
            if j + 1 == n:
                break
            ji = np.arange(j + 1, n)
            tt = mat[ji, j] == True
            led[ji] = led[ji] | tt
            if all(np.delete(led, np.arange(0, j + 1))):
                break
    ystar = vector[jstar]
    nstar = np.apply_along_axis(np.sum, 0, mat[:, jstar])
    rstar = len(nstar)
    gn = pd.match(vector, ystar)
    return jstar, nstar, rstar, gn
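# A minimal worked example of comp (hypothetical input; assumes numpy as np and
# a pandas version that still provides pd.match, i.e. pandas < 0.23):
#
#   v = np.array([3, 1, 3, 2, 1])
#   jstar, nstar, rstar, gn = comp(v)
#   # jstar -> [True, True, False, True, False]  (first occurrence of each value)
#   # nstar -> [2, 2, 1]                         (counts of 3, 1 and 2)
#   # rstar -> 3                                 (number of distinct values)
#   # gn    -> [0, 1, 0, 2, 1]                   (group id of every entry)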
if s >= .6 : cn+=1 if cn > 0: res.append([se_id[i],unip,tname,cn]) return res compound_target_mapping = [] for i,r in target_pairs.iterrows(): if i%200 == 0 and i > 0: print i # target names/symbols t1 = r['TARGET_A'] t2 = r['TARGET_B'] # get corresponding uniprots unip1 = targets.iloc[pd.match([t1],targets['SYMBOL'])]['UNIPROT'].tolist()[0] unip2 = targets.iloc[pd.match([t2],targets['SYMBOL'])]['UNIPROT'].tolist()[0] # get twosides compounds that have >.6 similarity to # at least one target active in ChEMBL se_cp1 = get_se_compound(unip1,t1) se_cp2 = get_se_compound(unip2,t2) if len(se_cp1) > 0 and len(se_cp2) > 0: # compound_target_mapping.append(se_cp1+se_cp2) for v1 in se_cp1 : # for 1st target/stitch compounds association for v2 in se_cp2: # for 2nd target/stitch compounds association # if both the compound for the 1st target and the compound for the 2nd target # are in the twosides dataset if v1[0] in SE_data['stitch_id1'] and v2[0] in SE_data['stitch_id2']:
def test_match(self):
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        pd.match([1, 2, 3], [1])
def setup(data, trips, sep, cost, factors, constraints, prodCon, attCon, initialParams, Oi, Dj, totalFlows): """ set up all initial variables and balancing factors for mle calibration """ #The following setup is for within all models #There is always a beta parameter so set it to user's initial value and add to param list data['beta'] = initialParams['beta'] params = ['beta'] #This is the observed data for which we want to derive parameters if cost == 'exp': knowns = data[sep] elif cost == 'pow': knowns = np.log(data[sep]) else: sys.exit( sys.exit( "The distance/cost function must be either 'pow' or 'exp'.")) #For doubly constrained model if (prodCon == True) & (attCon == True): #Variables for constants and deriving them data["Bj"] = 1.0 data["Ai"] = 1.0 data["OldAi"] = 10.000000000 data["OldBj"] = 10.000000000 data["diff"] = abs((data["OldAi"] - data["Ai"]) / data["OldAi"]) #Calc total outflows and inflows if Oi: data["Oi"] = data[Oi] else: Oi = data.groupby(data[constraints['production']]).aggregate( {trips: np.sum}) data["Oi"] = Oi.ix[pd.match(data[constraints['production']], Oi.index)].reset_index()[trips] if Dj: data["Dj"] = data[Dj] else: Dj = data.groupby(data[constraints['attraction']]).aggregate( {trips: np.sum}) data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']], Dj.index)].reset_index()[trips] #For Production Constrained model if (prodCon == True) & (attCon == False): #Calc total outflows if factors == None: if not Dj: Dj = data.groupby(data[totalFlows]).aggregate({trips: np.sum}) data["Dj"] = Dj.ix[pd.match( data[totalFlows], Dj.index)].reset_index()[trips].sort_index() else: data["Dj"] = data[Dj] if not Oi: Oi = data.groupby(data[constraints['production']]).aggregate( {trips: np.sum}) data["Oi"] = Oi.ix[pd.match(data[constraints['production']], Oi.index)].reset_index()[trips] else: data['Oi'] = data[Oi] #For Attraction Constrained model if (prodCon == False) & (attCon == True): #Calc total inflows if factors == None: if not Oi: Oi = data.groupby(data[totalFlows]).aggregate({trips: np.sum}) data["Oi"] = Oi.ix[pd.match(data[totalFlows], Oi.index)].reset_index()[trips] else: data["Oi"] = data[Oi] if not Dj: Dj = data.groupby(data[constraints['attraction']]).aggregate( {trips: np.sum}) data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']], Dj.index)].reset_index()[trips] else: data["Dj"] = data[Dj] #For Unconstrained Model if (prodCon == False) & (attCon == False): for factor in factors['origins']: #Include that information in the model knowns = knowns + np.log(data[factor]) #Add to params list params.append(str(factor)) #variable param vector data[str(factor) + 'Param'] = initialParams[factor] for factor in factors['destinations']: #Include that informatio in the model knowns = knowns + np.log(data[factor]) #Add to params list params.append(str(factor)) #variable param vector data[str(factor) + 'Param'] = initialParams[factor] #For all models besides unconstrained - is probably redundant and can be refactored #If there are additional factors we will include that observed data, add it to param list, and add a data vector for the param if factors != None: if attCon != False: for factor in factors['origins']: #Include that information in the model knowns = knowns + np.log(data[factor]) #Add to params list params.append(str(factor)) #variable param vector data[str(factor) + 'Param'] = initialParams[factor] if prodCon != False: for factor in factors['destinations']: #Include that informatio in the model knowns = knowns + np.log(data[factor]) #Add to params list 
params.append(str(factor)) #variable param vector data[str(factor) + 'Param'] = initialParams[factor] #Observed information is sum of trips multiplied by the log of known information observed = np.sum(data[trips] * knowns) #return observed info, data, knownn info, and params list return observed, data, knowns, params
def load(m_params): num_features = m_params['n_features'] minbin = m_params['minbin'] getcached = m_params['getcached'] codetest = m_params['codetest'] trainfilename = 'train_' + str(num_features) + str(minbin) + '.h5' testfilename = 'test_' + str(num_features) + str(minbin) + '.h5' # Read HDF format file print("1a. Reading the train and test data...\n") if getcached and os.path.isfile('input/' + trainfilename): train = pd.read_hdf('input/' + trainfilename, 'train') test = pd.read_hdf('input/' + testfilename, 'test') labels = train['target'] test_ids = test['ID'] train.drop(['ID', 'target'], axis=1, inplace=True) test.drop(['ID'], axis=1, inplace=True) return train.values, labels.values, test.values, test_ids.values else: train = pd.read_hdf('input/train.h5', 'train') test = pd.read_hdf('input/test.h5', 'test') if codetest: train = train.ix[0:999, :] test = test.ix[0:999, :] labels = train['target'] test_ids = test['ID'] train_ids = train['ID'] train.drop(['ID', 'target'], axis=1, inplace=True) test.drop(['ID'], axis=1, inplace=True) print( "1c. Breaking dataframe into numeric, object and date parts...\n") train_numeric = train.select_dtypes(include=['float64', 'int64']) test_numeric = test.select_dtypes(include=['float64', 'int64']) train_categoric = train.select_dtypes(include=['object']) test_categoric = test.select_dtypes(include=['object']) train_dates = train.select_dtypes(include=['datetime64[ns]']) test_dates = test.select_dtypes(include=['datetime64[ns]']) # Zip code engineering print("2. Zip code engineering...\n") train['VAR_0241'] = train['VAR_0241'].fillna(99999) test['VAR_0241'] = test['VAR_0241'].fillna(99999) train_zips = np.empty([train.shape[0], 7]) test_zips = np.empty([test.shape[0], 7]) try: zp = train['VAR_0241'].astype('int64').astype(str) zp = zp.replace('', '99999') train_zips[:, 0] = zp.map(lambda x: x[:2]).astype('int32') train_zips[:, 1] = zp.map(lambda x: x[:1] + x[-1:]).astype('int32') train_zips[:, 2] = zp.map(lambda x: x[:3]).astype('int32') train_zips[:, 3] = zp.map(lambda x: x[1:3]).astype('int32') train_zips[:, 4] = zp.map(lambda x: x[1:4]).astype('int32') train_zips[:, 5] = zp.map(lambda x: x[2:4]).astype('int32') train_zips[:, 6] = zp.map(lambda x: x[3:5]).astype('int32') zp = test['VAR_0241'].astype('int64').astype(str) zp = zp.replace('', '99999') test_zips[:, 0] = zp.map(lambda x: x[:2]).astype('int32') test_zips[:, 1] = zp.map(lambda x: x[:1] + x[-1:]).astype('int32') test_zips[:, 2] = zp.map(lambda x: x[:3]).astype('int32') test_zips[:, 3] = zp.map(lambda x: x[1:3]).astype('int32') test_zips[:, 4] = zp.map(lambda x: x[1:4]).astype('int32') test_zips[:, 5] = zp.map(lambda x: x[2:4]).astype('int32') test_zips[:, 6] = zp.map(lambda x: x[3:5]).astype('int32') zipcolumns = [ 'zip0', 'zip1', 'zip2', 'zip3', 'zip4', 'zip5', 'zip6' ] train_zips = pd.DataFrame(train_zips, columns=zipcolumns) test_zips = pd.DataFrame(test_zips, columns=zipcolumns) except: print('Zip codes cant be encoded') exit() # Deal with categorical data print("3. Categorical variable encoding... \n") for c in train_categoric.columns: freqs = train_categoric[c].append(test_categoric[c]).value_counts() train_categoric[c] = pd.match(train_categoric[c].values, freqs[0:70].index) test_categoric[c] = pd.match(test_categoric[c].values, freqs[0:70].index) # Deal with categorical data print("4. Numeric Column Smoothing... 
\n") train_numeric = train_numeric.fillna(0) test_numeric = test_numeric.fillna(0) numeric_col_count = 0 if minbin > 1: for c in train_numeric.columns: train_numeric[c], test_numeric[c] = bin( train_numeric[c], test_numeric[c], labels, minbin) numeric_col_count += 1 if not (numeric_col_count % 10): print('Numeric Col Count: ', numeric_col_count) gc.collect() # Create new date transformations print('5. Create new date columns...\n') def tdtoint(td): if not pd.isnull(td): return td.astype('timedelta64[D]').astype(np.int32) else: return 0 # Diffs between important dates for i in [ 'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217', 'VAR_0169', 'VAR_0178', 'VAR_0166' ]: for j in [ 'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217', 'VAR_0169', 'VAR_0178', 'VAR_0166' ]: if i < j: keypair = i + '_' + j else: keypair = j + '_' + i if i != j and keypair not in train_dates.columns: train_dates[keypair] = train_dates[i] - train[j] train_dates[keypair] = train_dates[keypair].apply(tdtoint) test_dates[keypair] = test_dates[i] - test_dates[j] test_dates[keypair] = test_dates[keypair].apply(tdtoint) # Date Splits datecols = pd.read_pickle('input/datecols.pkl') for c in datecols['col'].values.tolist(): train_dates[c + '_y'] = train_dates[c].dt.year train_dates[c + '_m'] = train_dates[c].dt.month train_dates[c + '_d'] = train_dates[c].dt.day train_dates[c + '_wd'] = train_dates[c].dt.weekday train_dates[c + '_hr'] = train_dates[c].dt.hour test_dates[c + '_y'] = test_dates[c].dt.year test_dates[c + '_m'] = test_dates[c].dt.month test_dates[c + '_d'] = test_dates[c].dt.day test_dates[c + '_wd'] = test_dates[c].dt.weekday test_dates[c + '_hr'] = test_dates[c].dt.hour train_dates.drop(datecols['col'].values.tolist(), axis=1, inplace=True) test_dates.drop(datecols['col'].values.tolist(), axis=1, inplace=True) gc.collect() print("5. Merging arrays together...\n") # put seperate parts together again train = pd.concat( [train_categoric, train_dates, train_numeric, train_zips], axis=1) test = pd.concat([test_categoric, test_dates, test_numeric, test_zips], axis=1) # Get only top n features print("1b. Filtering by pickled important columns...\n") cols = pd.read_pickle("input/vars_importance.pkl") cols = list(cols.ix[0:num_features, "var"]) for c in cols: if c not in train.columns: cols.remove(c) train = train[cols].fillna(0) test = test[cols].fillna(0) gc.collect() try: print("6. Writing to hdf format...\n") pd.concat([train_ids, train, labels], axis=1).to_hdf('input/' + trainfilename, key='train', format='fixed', mode='w') pd.concat([test_ids, test], axis=1).to_hdf('input/' + testfilename, key='test', format='fixed', mode='w') except: error = sys.exc_info()[0] print("Error: %s" % error) return train.values, labels.values, test.values, test_ids.values
sider_se = sider_se[sider_se[3] == 'PT']
#sider_fq = pd.read_table('/home/az338/ucc-fileserver/sider/meddra_freq.tsv',header=None)
#sider_fq = sider_fq.drop(0,1)

# concatenate sider and offsides side effect
sider_se = sider_se.drop([3, 4], 1)
sider_se.columns = ['stitch_id', 'umls_id', 'event']
allSE = pd.concat([sider_se, offsides[['stitch_id', 'umls_id', 'event']]])
allSE.to_csv(DATA_DIR + 'sider_and_offsides_SE.csv')

# load stitch_id smiles mapping dataset
stitch_smiles = pd.read_table('/home/az338/ucc-fileserver/stitch_v4/chemicals.v4.0.tsv')

# match stitch id in SE dataset to smiles
idx_offsides = pd.match(offsides['stitch_id'], stitch_smiles['chemical'])
idx_offsides = idx_offsides[idx_offsides >= 0]
idx_sider = pd.match(sider_se['stitch_id'], stitch_smiles['chemical'])
idx_sider = idx_sider[idx_sider >= 0]

# concatenate sider and offsides structures
offsides_struct = stitch_smiles.iloc[idx_offsides][['chemical', 'SMILES_string']].drop_duplicates()
sider_struct = stitch_smiles.iloc[idx_sider][['chemical', 'SMILES_string']].drop_duplicates()
all_struct = pd.concat([offsides_struct, sider_struct]).drop_duplicates()
all_struct.to_csv(DATA_DIR + 'sider_offsides_smiles.csv')
import pandas as pd

DATA_DIR = '/home/az338/ucc-fileserver/AZ_challenge_data/'
SE_DATA_DIR = '/scratch/az338/ucc-fileserver/side_effects_data/'

# load twosides
SE_data = pd.read_table(SE_DATA_DIR + '3003377s-twosides.tsv')

# load compound structures
SE_struct = pd.read_table('/home/az338/ucc-fileserver/stitch_v4/chemicals.v4.0.tsv')

# get unique compound identifiers in twosides
twosides_compounds = list(set(SE_data['stitch_id1'].values.tolist() +
                              SE_data['stitch_id2'].values.tolist()))

# match these compounds to the OFFSIDES/SIDER structural dataset
# to match the stitch IDs to their structure
idx = pd.match(twosides_compounds, SE_struct['chemical'])
# keep matched compounds only; pd.match returns -1 for misses, so filter with
# >= 0 (a plain > 0 would also drop a valid match at position 0)
twosides_struct = SE_struct.iloc[idx[idx >= 0]]
twosides_struct = twosides_struct.drop('molecular_weight', axis=1)
twosides_struct.to_csv(SE_DATA_DIR + 'twosides_smiles.csv')
def time_match_string(self):
    with warnings.catch_warnings(record=True):
        pd.match(self.all, self.uniques)
def load(m_params): num_features = m_params['n_features'] minbin = m_params['minbin'] getcached = m_params['getcached'] t0 = time.time() trainfilename = 'train_' + str(num_features) + str(minbin) + '.h5' testfilename = 'test_' + str(num_features) + str(minbin) + '.h5' # Read HDF format file print("1. Reading the train and test data...\n") if getcached and os.path.isfile(trainfilename): train = pd.read_hdf(trainfilename, 'train') test = pd.read_hdf(testfilename, 'test') labels = train['target'] test_ids = test['ID'] train.drop(['ID', 'target'], axis=1, inplace=True) test.drop(['ID'], axis=1, inplace=True) return train.values, labels.values, test.values, test_ids.values elif getcached and os.path.isfile('train_binned_' + str(minbin) + '.h5'): train = pd.read_hdf('train_binned_' + str(minbin) + '.h5', 'train') test = pd.read_hdf('test_binned_' + str(minbin) + '.h5', 'test') labels = train['target'] test_ids = test['ID'] train.drop(['ID', 'target'], axis=1, inplace=True) test.drop(['ID'], axis=1, inplace=True) else: train = pd.read_hdf('train.h5', 'train') test = pd.read_hdf('test.h5', 'test') labels = train['target'] test_ids = test['ID'] gc.collect() print("Postcode column \n") print(train['VAR_0241'].dtype, len(np.unique(train['VAR_0241']))) print(test['VAR_0241'].dtype, len(np.unique(test['VAR_0241']))) # Zip code engineering print("4. Zip code engineering...\n") train['VAR_0241'] = train['VAR_0241'].fillna(99999) test['VAR_0241'] = test['VAR_0241'].fillna(99999) try: zp = train['VAR_0241'].astype('int64').astype(str) zp = zp.replace('', '99999') train['zip_00xxx'] = zp.map(lambda x: x[:2]).astype('int32') train['zip_0xxx0'] = zp.map(lambda x: x[:1] + x[-1:]).astype( 'int32') train['zip_000xx'] = zp.map(lambda x: x[:3]).astype('int32') train['zip_x00xx'] = zp.map(lambda x: x[1:3]).astype('int32') train['zip_x000x'] = zp.map(lambda x: x[1:4]).astype('int32') train['zip_xx00x'] = zp.map(lambda x: x[2:4]).astype('int32') train['zip_xxx00'] = zp.map(lambda x: x[3:5]).astype('int32') zp = test['VAR_0241'].astype('int64').astype(str) zp = zp.replace('', '99999') test['zip_00xxx'] = zp.map(lambda x: x[:2]).astype('int32') test['zip_0xxx0'] = zp.map(lambda x: x[:1] + x[-1:]).astype( 'int32') test['zip_000xx'] = zp.map(lambda x: x[:3]).astype('int32') test['zip_x00xx'] = zp.map(lambda x: x[1:3]).astype('int32') test['zip_x000x'] = zp.map(lambda x: x[1:4]).astype('int32') test['zip_xx00x'] = zp.map(lambda x: x[2:4]).astype('int32') test['zip_xxx00'] = zp.map(lambda x: x[3:5]).astype('int32') except: print('BOLLOCKS Zip codes cant be encoded') exit() # Deal with categorical data and smoothing print( "2. Categorical variable encoding and numeric col smoothing... \n") numeric_col_count = 0 for c in train.columns[1:-1]: if train[c].name != 'target': if train[c].dtype.name == 'object': freqs = train[c].append(test[c]).value_counts() train[c] = pd.match(train[c].values, freqs[0:70].index) test[c] = pd.match(test[c].values, freqs[0:70].index) elif train[c].dtype.name in ['int64', 'float64' ] and minbin > 1: # smooth numeric cols train[c] = bin(train[c], train[c], train['target'], minbin) test[c] = bin(test[c], train[c], train['target'], minbin) numeric_col_count += 1 if not (numeric_col_count % 10): print('Numeric Col Count: ', numeric_col_count) gc.collect() # Create new date transformations print('3. 
Create new date columns...\n') def tdtoint(td): if not pd.isnull(td): return td.astype('timedelta64[D]').astype(int) else: return 0 # Diffs between important dates for i in [ 'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217', 'VAR_0169', 'VAR_0178', 'VAR_0166' ]: for j in [ 'VAR_0073', 'VAR_0075', 'VAR_0176', 'VAR_0179', 'VAR_0217', 'VAR_0169', 'VAR_0178', 'VAR_0166' ]: if i < j: keypair = i + '_' + j else: keypair = j + '_' + i if i != j and keypair not in train.columns: train[keypair] = train[i] - train[j] train[keypair] = train[keypair].apply(tdtoint) test[keypair] = test[i] - test[j] test[keypair] = test[keypair].apply(tdtoint) # Date Splits datecols = pd.read_pickle('datecols.pkl') for c in datecols['col'].values.tolist(): train[c + '_y'] = train[c].dt.year train[c + '_m'] = train[c].dt.month train[c + '_d'] = train[c].dt.day train[c + '_wd'] = train[c].dt.weekday train[c + '_hr'] = train[c].dt.hour test[c + '_y'] = test[c].dt.year test[c + '_m'] = test[c].dt.month test[c + '_d'] = test[c].dt.day test[c + '_wd'] = test[c].dt.weekday test[c + '_hr'] = test[c].dt.hour train.drop(datecols['col'].values.tolist(), axis=1, inplace=True) gc.collect() # Fill any remaining N/As train = train.fillna(0) test = test.fillna(0) #print("4.5. Writing to hdf format...\n") #train.to_hdf('train_binned_' + str(minbin) + '.h5',key='train',format='fixed',mode='w') #test.to_hdf('test_binned_' + str(minbin) + '.h5',key='test',format='fixed',mode='w') # Get only top n features print("5. Filtering by pickled important columns...\n") cols = pd.read_pickle("vars_importance.pkl") cols = cols.ix[0:num_features, "var"].tolist() print("6. Writing to hdf format...\n") #zipcols = ['zip_00xxx', 'zip_0xxx0', 'zip_000xx', 'zip_x00xx', 'zip_x000x', 'zip_xx00x', 'zip_xxx00'] zipcols = [] train[cols + zipcols + ['ID', 'target']].to_hdf(trainfilename, key='train', format='fixed', mode='w') test[cols + zipcols + ['ID']].to_hdf(testfilename, key='test', format='fixed', mode='w') train = train[cols + zipcols] test = test[cols + zipcols] gc.collect() return train.values, labels.values, test.values, test_ids.values
path_var = 'E:\\Wojtek\\_DSCN_\\Analiza_danych\\Leas\\data_set\\'
var_df = pd.read_csv(path_var + 'variables.csv', sep=';')
var_df.columns
var_df.type_var.value_counts()
var_df.head(10)
var_df.type_pred.value_counts(dropna=False)

#predictors = var_df.variable[ (var_df.type_var == 'pred') | (var_df.variable == target_name) ]
predictors = var_df.variable[var_df.type_var == 'pred']
ind_target = data_frame.columns.get_loc(target_name)
type(predictors)
#type(list)(predictors)

pd.match(df.columns, predictors)  # returns the position of each occurrence: the R equivalent of which( vect1 %in% vec2 )
col = df.columns.isin(predictors) + df.columns.isin([target_name])  # this is not a particularly elegant solution

target_name = 'TR_D90M12'
df_filtered = df.loc[(df.FFINRPFH_czy == 1) &
                     (df.TR_ANEKS_RODZAJ_id == 0) &
                     (df.TR_POZIOM_wykonanie == 1) &
                     (df.PORECZYCIEL_CZY == 0) &
                     (pd.isnull(df.TR_FRAUD_DataStatusu)) &
                     (~ pd.isnull(df.loc[:, target_name])), col]
df_filtered.shape
df_filtered2 = df_filtered.dropna(axis=0, how='any')
df_filtered2.shape
def get_dense_specs(): train = pd.read_csv('../input/train_set.csv', parse_dates=[ 2, ]) test = pd.read_csv('../input/test_set.csv', parse_dates=[ 3, ]) tube = pd.read_csv('../input/tube.csv', true_values=['Y'], false_values=['N']) materials = pd.read_csv('../input/bill_of_materials.csv') aggs = pd.read_csv('../input/ta_aggs.csv') components = pd.read_csv('../input/components.csv') train = pd.merge(train, tube, on='tube_assembly_id') test = pd.merge(test, tube, on='tube_assembly_id') train = pd.merge(train, materials, on='tube_assembly_id') test = pd.merge(test, materials, on='tube_assembly_id') train = pd.merge(train, aggs, on='tube_assembly_id', how='left') test = pd.merge(test, aggs, on='tube_assembly_id', how='left') # create some new features train['year'] = train.quote_date.dt.year train['month'] = train.quote_date.dt.month test['year'] = test.quote_date.dt.year test['month'] = test.quote_date.dt.month train['odd'] = train.quantity % 2 test['odd'] = test.quantity % 2 train['div5'] = (train.quantity % 5) test['div5'] = (test.quantity % 5) train['material_id'].replace(np.nan, ' ', regex=True, inplace=True) test['material_id'].replace(np.nan, ' ', regex=True, inplace=True) train['bracket_pricing'] = train['bracket_pricing'].replace(['Yes', 'No'], [1, 0]) test['bracket_pricing'] = test['bracket_pricing'].replace(['Yes', 'No'], [1, 0]) fields_to_encode = [ 'supplier', 'material_id', 'end_a', 'end_x', 'end_a_1x', 'end_a_2x', 'end_x_1x', 'end_x_2x', 'bracket_pricing' ] for i in range(1, 9): column_label = 'component_id_' + str(i) fields_to_encode.append(column_label) tmp = pd.merge(train, components, left_on=column_label, right_on='component_id', how='left')['component_type_id'] train[column_label] = tmp tmp = pd.merge(test, components, left_on=column_label, right_on='component_id', how='left')['component_type_id'] test[column_label] = tmp train[column_label].replace(np.nan, ' ', regex=True, inplace=True) test[column_label].replace(np.nan, ' ', regex=True, inplace=True) for j, clf in enumerate(train.columns.tolist()): print(j, clf) ''' # label encode the categorical variables for i in fields_to_encode: print('Encoding',i) lbl = LabelEncoder() lbl.fit(list(train.ix[:,i]) + list(test.ix[:,i])) train.ix[:,i] = lbl.transform(train.ix[:,i]) test.ix[:,i] = lbl.transform(test.ix[:,i]) for i in fields_to_encode: print('Encoding',i) freqs = train[i].append(test[i]).value_counts() train[i] = pd.match(train[i].values, freqs[0:45].index) test[i] = pd.match(test[i].values, freqs[0:45].index) ''' for i in fields_to_encode: print('Encoding', i) rank = pd.concat([train[i], train['cost']], axis=1).groupby(i).mean().sort('cost', ascending=False) print(rank[0:20]) train[i] = pd.match(train[i].values, rank[0:45].index) test[i] = pd.match(test[i].values, rank[0:45].index) train.fillna(0, inplace=True) test.fillna(0, inplace=True) return train, test
core_variables = [
    'DO_mgL', 'satDO_mgL', 'DOsat_pct', 'WaterTemp_C', 'Depth_m', 'Level_m',
    'Discharge_m3s', 'Light_PAR', 'Light_lux'
]

varcells = []
for x in sitedata.Variables:
    if x is None:
        varcells.append(x)
    else:
        var_arr = np.asarray(x.split(','))
        isCore = np.in1d(var_arr, core_variables)
        core = var_arr[isCore]
        not_core = var_arr[~isCore]
        if any(core):
            core = core[np.argsort(pd.match(core, core_variables))]
        not_core.sort()
        var_arr = ', '.join(np.concatenate((core, not_core)))
        varcells.append(var_arr)

for i in xrange(len(varcells)):
    if varcells[i] is None:
        varcells[i] = '-'

sitedata.Variables = varcells

fr = sitedata['firstRecord'].dt.strftime('%Y-%m-%d')
lr = sitedata['lastRecord'].dt.strftime('%Y-%m-%d')
timerange = fr + ' to ' + lr
sitedata['Coverage'] = timerange.apply(lambda x: x if x != 'NaT to NaT' else '-')
index_end = int(np.minimum( (args.splitIndex) * np.ceil(float(h5f.shape[0] / 2) / args.splitFold), (h5f.shape[0] / 2))) else: index_start = 0 index_end = int(h5f.shape[0] / 2) snp_temp = (np.asarray(h5f[index_start:index_end,:])+ np.asarray(h5f[index_start+int(h5f.shape[0]/2):index_end+int(h5f.shape[0]/2),:]))/2.0 snpEffects.append(snp_temp) coor = pd.read_csv(args.coorFile,sep='\t',header=None) coor = coor.iloc[index_start:index_end,:] #Fetch the distance to TSS information gene = pd.read_csv(args.geneFile,sep='\t',header=None) geneinds = pd.match(coor.iloc[:,0].map(str).str.replace('chr','')+' '+coor.iloc[:,1].map(str), gene.iloc[:,0].map(str).str.replace('chr','')+' '+gene.iloc[:,2].map(str)) if np.any(geneinds==-1): raise ValueError("Gene association file does not match the vcf file.") if args.fixeddist == 0: dist = - np.asarray(gene.iloc[geneinds,-1]) else: dist = args.fixeddist genename = np.asarray(gene.iloc[geneinds,-2]) strand= np.asarray(gene.iloc[geneinds,-3]) #comptue expression effects snpExpEffects = compute_effects(snpEffects, \ dist, strand,\ models, maxshift=maxshift, nfeatures=args.nfeatures, batchSize = args.batchSize) #write output
    train[c + '_m'] = train[c].dt.month
    train[c + '_d'] = train[c].dt.day
    train[c + '_wd'] = train[c].dt.weekday
    train[c + '_hr'] = train[c].dt.hour
    test[c + '_y'] = test[c].dt.year
    test[c + '_m'] = test[c].dt.month
    test[c + '_d'] = test[c].dt.day
    test[c + '_wd'] = test[c].dt.weekday
    test[c + '_hr'] = test[c].dt.hour

train.drop(datecols['col'].values.tolist(), axis=1, inplace=True)

print("categorical variable encoding and cleaning...\n")
for c in train.columns[1:-1]:
    if train[c].dtype.name == 'object':
        freqs = train[c].append(test[c]).value_counts()
        train[c] = pd.match(train[c].values, freqs[0:70].index)
        test[c] = pd.match(test[c].values, freqs[0:70].index)

train = train.fillna(0)
test = test.fillna(0)

labels = train['target']
train.drop(['ID', 'target'], axis=1, inplace=True)
features = train.columns.values

print("filtering by pickled important columns...\n")
vars = pd.read_pickle("vars_importance.pkl")
train = train[vars.ix[0:1250, "var"].tolist()]
test = test[vars.ix[0:1250, "var"].tolist()]

print("converting to numpy array...\n")
def setup(data, trips, sep, cost, factors, constraints, prodCon, attCon, initialParams, Oi, Dj, totalFlows): #For doubly constrained model if (prodCon == True) & (attCon == True): #Variables for constants and deriving them data["Bj"] = 1.0 data["Ai"] = 1.0 data["OldAi"] = 10.000000000 data["OldBj"] = 10.000000000 data["diff"] = abs((data["OldAi"] - data["Ai"])/data["OldAi"]) #Calc total outflows and inflows if Oi: print '1' data["Oi"] = data[Oi] else: print '2' Oi = data.groupby(data[constraints['production']]).aggregate({trips: np.sum}) data["Oi"] = Oi.ix[pd.match(data[constraints['production']], Oi.index)].reset_index()[trips] if Dj: print '3' data["Dj"] = data[Dj] else: print '4' Dj = data.groupby(data[constraints['attraction']]).aggregate({trips: np.sum}) data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']], Dj.index)].reset_index()[trips] #For Production Constrained model if (prodCon == True) & (attCon == False): #Calc total outflows if factors == None: print Dj if not Dj: Dj = data.groupby(data[totalFlows]).aggregate({trips: np.sum}) data["Dj"] = Dj.ix[pd.match(data[totalFlows], Dj.index)].reset_index()[trips].sort_index() else: data["Dj"] = data[Dj] if not Oi: Oi = data.groupby(data[constraints['production']]).aggregate({trips: np.sum}) data["Oi"] = Oi.ix[pd.match(data[constraints['production']], Oi.index)].reset_index()[trips] else: data['Oi'] = data[Oi] #For Attraction Constrained model if (prodCon == False) & (attCon == True): #Calc total inflows if factors == None: if not Oi: Oi = data.groupby(data[totalFlows]).aggregate({trips: np.sum}) data["Oi"] = Oi.ix[pd.match(data[totalFlows], Oi.index)].reset_index()[trips] else: data["Oi"] = data[Oi] if not Dj: Dj = data.groupby(data[constraints['attraction']]).aggregate({trips: np.sum}) data["Dj"] = Dj.ix[pd.match(data[constraints['attraction']], Dj.index)].reset_index()[trips] else: data["Dj"] = data[Dj] #For Unconstrained Model if (prodCon == False) & (attCon == False): pass #The following setup is for within all models #There is always a beta parameter so set it to user's initial value and add to param list print initialParams data['beta'] = initialParams['beta'] params = ['beta'] #This is the observed data for which we want to derive parameters if cost == 'exp': knowns = data[sep] elif cost == 'pow': knowns = np.log(data[sep]) else: sys.exit(sys.exit("The distance/cost function must be either 'pow' or 'exp'.")) #If there are additional factors we will include that observed data, add it to param list, and add a data vector for the param if factors != None: if attCon != False: for factor in factors['origins']: #Include that information in the model knowns = knowns+np.log(data[factor]) #Add to params list params.append(str(factor)) #variable param vector data[str(factor) + 'Param'] = initialParams[factor] if prodCon != False: for factor in factors['destinations']: #Include that informatio in the model knowns = knowns+np.log(data[factor]) #Add to params list params.append(str(factor)) #variable param vector print initialParams data[str(factor) + 'Param'] = initialParams[factor] #Observed information is sum of trips multiplied by the log of known information observed = np.sum(data[trips]*knowns) #return observed info, data, knownn info, and params list return observed, data, knowns, params
import pandas as pd

DATA_DIR = '/scratch/az338/ucc-fileserver/AZ_challenge_data/'

# challenge training data, cell/disease area(DA) mapping and challenge cmp/target mapping
challenge_pairs = pd.read_csv(DATA_DIR + 'drug_synergy_data/ch1_train_combination_and_monoTherapy.csv')
cell_da_map = pd.read_csv(DATA_DIR + 'sanger_molecular_data/cell_info.csv')
challenge_cmp_target_map = pd.read_csv(DATA_DIR + 'drug_synergy_data/Drug_info_release_curated.csv')
challenge_cmp_target_map.columns = ['ChallengeName', 'Target'] + list(challenge_cmp_target_map.columns[2:])

# map disease area to corresponding cell-line
challenge_pairs['DISEASE_AREA'] = cell_da_map.iloc[pd.match(
    challenge_pairs['CELL_LINE'], cell_da_map['Sanger.Name'])]['Disease.Area'].tolist()

# make mapping flat (separate targets to different lines)
cmp_target_map = challenge_cmp_target_map[['ChallengeName', 'Target']]
flat_map = []
for i, r in cmp_target_map.iterrows():
    for t in r['Target'].split(','):
        flat_map.append([r['ChallengeName'], t.rstrip(' ').lstrip(' ')])
flat_map = pd.DataFrame(flat_map).drop_duplicates()
flat_map.columns = ['Compound', 'Target']

# convert compound-compound associations to target-target associations
synergy_scores = challenge_pairs[['DISEASE_AREA', 'COMPOUND_A', 'COMPOUND_B', 'SYNERGY_SCORE', 'QA', 'CELL_LINE']]
target_synergy = []
for i, r in synergy_scores.iterrows():
    #targets_A = flat_map.iloc[pd.match(r['COMPOUND_A'],flat_map['Compound'])]['Target']
    #targets_B = flat_map.iloc[pd.match(r['COMPOUND_B'],flat_map['Compound'])]['Target']
    targets_A = flat_map[flat_map['Compound'] == r['COMPOUND_A']]['Target']
path = os.getenv('HOME') + '/python/phylogeny/pavelMattis/vector_machines/'

for f in [x for x in os.listdir(path + 'data/list_length_project/sets/CognateData/output') if x != '.svn']:
    db = f.split('.')[0]
    data = pd.read_table(path + 'data/list_length_project/sets/CognateData/output/' + f, encoding='utf-8')
    data = data[['-' not in unicode(x) for x in data.cognate_class.values]]
    output = pd.DataFrame()
    output['ID'] = arange(len(data)) + 1
    output['Taxon'] = data.language.astype('string')
    output['Gloss'] = data.gloss.values
    output['GlossID'] = pd.match(data.gloss.values, data.gloss.unique()) + 1
    output['IPA'] = [re.sub(r"[ -]", "", unicode(x)) for x in data.transcription]
    output['Tokens'] = [' '.join(asjp2tokens(unicode(w))) for w in output.IPA]
    cClasses = array([x + ':' + unicode(y).strip('?')
                      for (x, y) in data[['gloss', 'cognate_class']].values])
    output['CogID'] = pd.match(cClasses, unique(cClasses))
    output[['Taxon', 'Gloss']] = output[['Taxon', 'Gloss']].astype('string')
    output['dbID'] = [db + '_' + str(x - 1) for x in output.ID.values]
    output.to_csv('reformattedData/asjp/' + db + '.tsv', encoding='utf-8',
                  sep='\t', index=False)

for f in [x for x in os.listdir(path + 'data/list_length_project/sets/mattis_new/output') if x != '.svn']:
    db = f.split('.')[0]
    data = pd.read_table(path + '/data/list_length_project/sets/mattis_new/output/' + f, encoding='utf-8')
    data = data[['-' not in unicode(x) for x in data.cognate_class.values]]
    output = pd.DataFrame()
def _create_pandas_frame(dataset_path, samples_path, targets_path): """ Creates and returns a pandas DataFrame object that includes the dataset's samples and targets. Also, the samples are augmented by calculating and adding the feature7 column. Note that the function requires paths as arguments instead of the data itself (which is why the temp dir is create in the calling add_feature7). This is for reasons that were once reasonable. """ fname = dataset_path.split('/')[-1] db = fname.split('.')[0] # read in wordlist wordlist = pd.read_table(dataset_path, encoding='utf-8', na_filter=False, dtype=object) # keep track of synonyms within the same language synDict = defaultdict(lambda: 0) synocc = [] for l, g in wordlist[['language', 'global_id']].values: synDict[l, g] += 1 synocc.append(unicode(synDict[l, g])) wordlist['synonym_number'] = synocc dDict = { 'sample_id': unicode, 'feature1': double, 'feature2': double, 'feature3': double, 'feature4': double, 'feature5': double, 'feature6': double, 'feature8': double } # read in feature matrix for word pairs vectors = pd.read_table(samples_path, encoding='utf-8', na_filter=False, dtype=dDict) # read in cognacy judgments labels = pd.read_table(targets_path, encoding='utf-8', na_filter=False, dtype={ 'sample_id': unicode, 'target': int }) # colect metadata for wordpairs in vectors metaRaw = array([x.split('/') for x in vectors.sample_id.values]) meta = pd.DataFrame(c_[metaRaw[:, 0], [x.split(',') for x in metaRaw[:, 1]], [x.split(',') for x in metaRaw[:, 2]]], columns=['global_id', 'l1', 'l2', 'id1', 'id2']) meta['sample_id'] = vectors.sample_id meta1 = pd.merge(wordlist[[ 'global_id', 'language', 'gloss', 'synonym_number', 'transcription', 'cognate_class' ]], meta, left_on=['global_id', 'language', 'synonym_number'], right_on=['global_id', 'l1', 'id1'])[[ 'sample_id', 'global_id', 'l1', 'l2', 'transcription', 'cognate_class', 'id2' ]] meta2 = pd.merge(wordlist[[ 'global_id', 'language', 'gloss', 'synonym_number', 'transcription', 'cognate_class' ]], meta1, left_on=['global_id', 'language', 'synonym_number'], right_on=['global_id', 'l2', 'id2'])[[ 'sample_id', 'gloss', 'l1', 'transcription_y', 'cognate_class_y', 'l2', 'transcription_x', 'cognate_class_x' ]] meta2.columns = [ 'sample_id', u'gloss', 'l1', u'w1', u'cc1', 'l2', u'w2', u'cc2' ] meta2 = meta2.ix[pd.match(vectors.sample_id, meta2.sample_id)] concepts = meta2.gloss.unique() feature7 = pd.Series([ abs( corrcoef( array( vectors[meta2.gloss == c][['feature2', 'feature4']].values, double).T)[0, 1]) for c in concepts ], index=concepts, dtype=double) feature7[feature7.isnull()] = 0 vectors['feature7'] = feature7.ix[meta2.gloss.values].values combined = pd.merge(pd.merge(meta2, vectors, on='sample_id'), labels, on='sample_id') combined = combined[combined.columns[1:]] combined['db'] = db return combined
'.wt1100.fasta.ref.vcf' ], shell=True) #create .evo1 .evo2 .evo3 try: check_call([ 'python evoevalues.production.py ' + sys.argv[1] + '.wt1100.fasta.ref.vcf' ], shell=True) #create .evo.evalues dataevoe = pd.read_csv(sys.argv[1] + '.wt1100.fasta.ref.vcf.evo.evalues', delimiter=',', header=None) dataevoe[0] = 'chr' + dataevoe[0].astype(str) matchedinds = pd.match( np.asarray(coordata['chr'].astype(str) + coordata['pos'].astype(str)), np.asarray(dataevoe[0].astype(str) + dataevoe[1].astype(str))) dataevoe = np.asarray(dataevoe.iloc[:, -4:]) dataevoe = dataevoe[matchedinds, :] #impute evolutionary feature E-values for rare cases that evolutionary features are not available dataevoe[matchedinds == -1, :] = np.asarray([1, 1, 1, 1])[np.newaxis, :] datadeepsea = np.exp( np.mean(np.log(datae), axis=1) + np.mean(np.log(dataevoe), axis=1)) except: datadeepsea = np.exp(np.mean(np.log(datae), axis=1)) temp = pd.DataFrame(datadeepsea[:, np.newaxis]) temp.columns = ['Functional significance score'] datadeepsea = pd.concat([coordata, temp], axis=1)
def testCluster(vdb, featureSubset=FEATURES, C=0.82, gamma=9e-04, kernel='linear', th=.34): """ Inference on test data. """ newWordList = pd.DataFrame() fitting = trainingVectors validation = test[test.db == vdb].copy() X = fitting[featureSubset].values y = fitting.target.values svClf = svm.SVC(kernel=kernel, C=C, gamma=gamma, probability=True) svClf.fit(X, y) svScores = svClf.predict_proba(validation[featureSubset].values)[:, 1] validation['svScores'] = svScores scores = pd.DataFrame() wordlist = pd.DataFrame() concepts = validation.gloss.unique() taxa = unique(validation[['l1', 'l2']].values.flatten()) dataWordlist = vstack([ validation[['gloss', 'l1', 'w1', 'cc1']].values, validation[['gloss', 'l2', 'w2', 'cc2']].values ]) dataWordlist = pd.DataFrame( dataWordlist, columns=['concept', 'doculect', 'counterpart', 'cc']) dataWordlist = dataWordlist.drop_duplicates() dataWordlist.index = [ '_'.join(map(unicode, x)) for x in dataWordlist[['concept', 'doculect', 'counterpart']].values ] validation['id_1'] = [ c + '_' + l + '_' + unicode(w) for (c, l, w) in validation[['gloss', 'l1', 'w1']].values ] validation['id_2'] = [ c + '_' + l + '_' + unicode(w) for (c, l, w) in validation[['gloss', 'l2', 'w2']].values ] for c in concepts: dataC = validation[validation.gloss == c].copy() dataC['id_1'] = [ x.replace(' ', '').replace(',', '') for x in dataC.id_1 ] dataC['id_2'] = [ x.replace(' ', '').replace(',', '') for x in dataC.id_2 ] wlC = dataWordlist[dataWordlist.concept == c].copy() if len(wlC) > 1: wlC.index = [ x.replace(' ', '').replace(',', '') for x in wlC.index ] svMtx = zeros((len(wlC.index), len(wlC.index))) svMtx[pd.match(dataC.id_1, wlC.index), pd.match(dataC.id_2, wlC.index)] = dataC.svScores.values svMtx[pd.match(dataC.id_2, wlC.index), pd.match(dataC.id_1, wlC.index)] = dataC.svScores.values svDistMtx = log(1 - svMtx) tth = log(th) - svDistMtx.min() svDistMtx -= svDistMtx.min() fill_diagonal(svDistMtx, 0) pDict = infomap_clustering(tth, svDistMtx) pArray = vstack( [c_[pDict[k], [k] * len(pDict[k])] for k in pDict.keys()]) partitionIM = pArray[argsort(pArray[:, 0]), 1] else: partitionIM = array([1]) wlC['inferredCC'] = [vdb + ':' + c + ':' + str(x) for x in partitionIM] wlC['db'] = vdb newWordList = pd.concat([newWordList, wlC]) newWordList.index = arange(len(newWordList)) return newWordList
from numpy import *
import pandas as pd

# this script computes Cronbach's alpha for all languages in the sample

data = pd.read_csv('conceptwiseSimilarities.csv', index_col=0)

concepts = array(data.columns[-40:])
taxa = unique(data[['language1', 'language2']].values.flatten())

nrMts = []
for c in concepts:
    cMtx = zeros((len(taxa), len(taxa)))
    ix1 = list(pd.match(data.language1, taxa))
    ix2 = list(pd.match(data.language2, taxa))
    cMtx[ix1, ix2] = data[c].values
    cMtx[ix2, ix1] = data[c].values
    nrMts.append(cMtx)

matrices = zeros((len(taxa), len(taxa), len(nrMts)))
for c in xrange(40):
    matrices[:, :, c] = nrMts[c]


def cronbach(x):
    itemwise = sum(apply_along_axis(var, 0, x))
    total = var(apply_along_axis(sum, 1, x))
    return 1. * len(x) / (len(x) - 1) * (1 - itemwise / total)
#https://stackoverflow.com/questions/36063251/python-pandas-how-can-i-group-by-and-assign-an-id-to-all-the-items-in-a-group
df["b"] = LabelEncoder().fit_transform(df['g'])  #int count from 0

#https://stackoverflow.com/questions/41594703/pandas-assign-an-index-to-each-group-identified-by-groupby
df['b'] = pd.Categorical(df['a'].astype(str)).codes
df['b'] = pd.Categorical(df['a'].astype(str) + df['c'].astype(str)).codes  #allow multiple col groups

#R: ind = order(v)
y = np.argsort(v)
y = v.argsort()

#R: match(v1, vdict) -> vdict[match(v1,vdict)] gives v1
np.searchsorted(vdict, v1)  #if vdict is sorted
vdict[np.searchsorted(vdict, v1)]  #gives v1
pd.match([1, 2, 3, 5, 8, 2], [1, 2, 4, 5, 9, 2])
#R: match(c(1,2,3,5,8,2), c(1,2,4,5,9,2))

#R: d[order(v),]
d.reindex(np.argsort(d['c'])).reset_index(drop=True)

#R: setcolorder(d,new_col_order)
#https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns
#d.reindex_axis(['a','b','c'], axis=1) #deprecated
d.reindex(['a', 'b', 'c'], axis=1)  #copy all data

d.sort_values(["a", "b"], ascending=[True, False], inplace=False)
d.sort_values("a", ascending=True)  #inplace is False by default
s.sort_values()  #series: no argument needed
d['b'] = d.a.sort_values()  #error: merges back to d by index, undoing the sort
d['b'] = d.a.sort_values().values  #correct
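# Note: pd.match is deprecated (it emits a FutureWarning, as the pandas test
# above shows) and was removed in later pandas versions. A sketch of a rough
# equivalent for the unique-lookup case is Index.get_indexer, which also
# returns -1 for misses:
#   pd.Index([10, 20, 40]).get_indexer([10, 20, 30])  # -> array([ 0,  1, -1])
# get_indexer requires the lookup index to be unique; for non-unique lookups
# (like the duplicate 2 in the example above) get_indexer_non_unique or a
# merge-based join would be needed instead.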
for i in range(1, sheet.nrows):
    if sheet.cell_value(i, 0) != '':
        date_value = sheet.cell_value(i, 0)
        ticker = sheet.cell_value(i, 2)
        dt = datetime.datetime(*xlrd.xldate_as_tuple(date_value, book.datemode))
        row_index.append(i)
        dates.append(dt.strftime("%Y%m%d"))
        tickers.append(ticker)

df = pd.DataFrame({'Date': pd.Series(dates, index=row_index),
                   'Ticker': pd.Series(tickers, index=row_index)})
df['Ticker'] = df['Ticker'].apply(lambda x: x.replace('-', ' '))

#Find Unique Tickers and Get Exchange Info from Yahoo!
unique_tickers = pd.unique(df['Ticker']).tolist()
ref_position = pd.match(df['Ticker'].tolist(), unique_tickers).tolist()
unique_tickers = [i.replace(' ', '-') for i in unique_tickers]

print('[Reading Exchange Information from Yahoo! Finance]')
exchange_info = []
for i in range(len(unique_tickers) / query_limit):
    print('Reading ' + str((i + 1) * query_limit))
    query_url = ('http://download.finance.yahoo.com/d/quotes.csv?s=' +
                 '+'.join(unique_tickers[i * query_limit:(i + 1) * query_limit]) + '&f=x')
    if len(exchange_info) == 0:
        exchange_info = pd.read_csv(query_url, header=None).iloc[:, 0].tolist()
    else:
        exchange_info.extend(pd.read_csv(query_url, header=None).iloc[:, 0].tolist())
if len(unique_tickers) % query_limit > 0:
    print('Reading ' + str(len(unique_tickers)))
def time_match_strings(self):
    pd.match(self.all, self.uniques)
fl = pd.read_csv(reportPath + fl)
fl = fl.replace(np.nan, 0).replace('', 0)
report = fl.reset_index(drop=True)

climb = climb.replace(np.nan, 0).replace('', 0)

## apply designated filters:
climb = climb[climb.vols1825 <= 0.7]
climb = climb[climb.rollingSTD730 <= 0.2]
#climb = climb[ climb['skew'] > 0]
#climb = climb[ climb.avgRectractAsGainPercent_180 <= 80]

climb = pd.merge(climb, coins, on='ticker')

for periodCol in ['180']:
    climb['change'] = ~(climb['climbOrRetract_' + periodCol] ==
                        report['climbOrRetract_' + periodCol][pd.match(climb.ticker, report.ticker)].values)
    changeReport = climb[['ticker', 'latestPrice', 'climbOrRetract_' + periodCol,
                          'avgRectractAsGainPercent_' + periodCol,
                          'currentAsPercentOfPrevious_' + periodCol,
                          'targetFib_' + periodCol, 'target_' + periodCol,
                          'targetGain_' + periodCol, 'avgDaysClimbing_' + periodCol,
                          'avgDaysRetracting_' + periodCol,
                          'gainFromMin_1_2', 'gainFromMin_2_4', 'gainFromMin_5_7', 'change']]
    changeReport = changeReport[changeReport.change == True].reset_index(drop=True)
    if changeReport.shape[0] > 0:
        changeReport.to_csv('C:\\Users\\Nick\\Documents\\project MONEY\\Output Reports\\crypto\\daily\\changeReport_daily'
                            + periodCol + '_' + dte + '.csv', sep=',', index=False)
        splitLen = int(np.ceil(changeReport.ticker.unique().shape[0] / 3))
        msgImageData = []
        n = 0
        for i in range(splitLen):
            if n < changeReport.ticker.unique().shape[0]:
]] meta2 = pd.merge(wordlist[[ 'global_id', 'language', 'gloss', 'synonym_number', 'transcription', 'cognate_class' ]], meta1, left_on=['global_id', 'language', 'synonym_number'], right_on=['global_id', 'l2', 'id2'])[[ 'sample_id', 'gloss', 'l1', 'transcription_y', 'cognate_class_y', 'l2', 'transcription_x', 'cognate_class_x' ]] meta2.columns = [ 'sample_id', u'gloss', 'l1', u'w1', u'cc1', 'l2', u'w2', u'cc2' ] meta2 = meta2.ix[pd.match(vectors.sample_id, meta2.sample_id)] concepts = meta2.gloss.unique() feature7 = pd.Series([ abs( corrcoef( array( vectors[meta2.gloss == c][['feature2', 'feature4']].values, double).T)[0, 1]) for c in concepts ], index=concepts, dtype=double) feature7[feature7.isnull()] = 0 vectors['feature7'] = feature7.ix[meta2.gloss.values].values combined = pd.merge(pd.merge(meta2, vectors, on='sample_id'), labels, on='sample_id')