def do_svm(rfam_id):
    """Print, per stem-loop of an Rfam family, a matrix of ungapped element lengths.

    The entry ``RF<rfam_id zero-padded to 5 digits>`` is fetched through
    ``Rfam``. The consensus secondary structure is applied to every RNA of
    the entry and, for each stem-loop, the lengths (gap characters ``-``
    removed) of both strands of every helix, of every inner-loop single
    strand and of the apical loop are collected. Finally one matrix per
    stem-loop position is printed, one column per structural element.

    Args:
        rfam_id: integer number of the Rfam family.
    """
    rfam = Rfam(use_website=True)
    rnas, organisms, consensus_2d = rfam.get_entry(rfam_id='RF%05u' % rfam_id)

    def ungapped_length(rna, location):
        # Locations are 1-based and inclusive, hence the -1 on the start only.
        return len(rna.sequence[location[0] - 1:location[1]].replace('-', ''))

    # One dict per stem-loop position: element name -> list of lengths.
    stem_loop_descriptions = []

    for i, rna in enumerate(rnas):
        ss = base_pairs_to_secondary_structure(rna, consensus_2d)
        ss.find_junctions()
        ss.find_stem_loops()
        if i == 0:
            # Single-argument print() behaves the same on Python 2 and 3.
            print(ss.stem_loops)

        for index, stem_loop in enumerate(ss.stem_loops):
            if index >= len(stem_loop_descriptions):
                stem_loop_descriptions.append({})
            description = stem_loop_descriptions[index]

            for helix in stem_loop['helices']:
                location = helix['location']
                description.setdefault(helix['name'] + '_strand_1', []).append(
                    ungapped_length(rna, location[0]))
                description.setdefault(helix['name'] + '_strand_2', []).append(
                    ungapped_length(rna, location[1]))

            for inner_loop in stem_loop['inner_loops']:
                for single_strand in inner_loop['single_strands']:
                    key = "inner_loop_%s" % single_strand['name']
                    description.setdefault(key, []).append(
                        ungapped_length(rna, single_strand['location']))

            apical_loop = stem_loop['apical_loop']['single_strands'][0]
            key = "apical_loop_%s" % apical_loop['name']
            description.setdefault(key, []).append(
                ungapped_length(rna, apical_loop['location']))

    for description in stem_loop_descriptions:
        df = DataFrame(description)
        print(df.columns)
        # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
        print(df.values)
def ListReading(filname, sname):
    """Read four lookup tables from one Excel sheet.

    The sheet is loaded with its first row as header. Row 0 of the resulting
    value matrix supplies the keys (columns 1 and up); rows 1-5 supply the
    values. Column 0 and any rows after row 5 are ignored.

    Args:
        filname: path of the Excel workbook.
        sname: name of the sheet to read.

    Returns:
        ``(NPOS, Bond, C0, C1)`` where ``NPOS``, ``C0`` and ``C1`` map
        ``key -> value`` (rows 1, 4 and 5) and ``Bond`` maps
        ``(key, 'Sin')`` / ``(key, 'Dou')`` to the values of rows 2 / 3.
        All four are empty when the sheet has no data rows.
    """
    # The sheet is passed positionally because the keyword was renamed from
    # ``sheetname`` to ``sheet_name`` between pandas releases.
    table = read_excel(filname, sname, header=0)
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    Matrix = table.values
    n, m = Matrix.shape

    NPOS = {}
    Bond = {}
    C0 = {}
    C1 = {}

    # Only rows 1..5 carry data that is stored.
    for i in range(1, min(n, 6)):
        for j in range(1, m):
            key = str(Matrix[0, j])
            value = Matrix[i, j]
            if i == 1:
                NPOS[key] = value
            elif i == 2:
                Bond[(key, 'Sin')] = value
            elif i == 3:
                Bond[(key, 'Dou')] = value
            elif i == 4:
                C0[key] = value
            else:
                C1[key] = value
    return NPOS, Bond, C0, C1
def __get_news_matrix(self, news: pd.DataFrame, max_len=1):
    """Turn a news frame into a numeric matrix and return its first rows.

    The frame is restricted to a fixed column order, the ``symbol``,
    ``symbol_pair`` and ``title`` columns are expanded through
    ``self.one_hot_from_all_items`` and the ``datetime`` column is dropped.

    Args:
        news: frame containing at least the columns listed in
            ``ordered_cols`` below.
        max_len: number of leading rows of the matrix to return.

    Returns:
        The first ``max_len`` rows of the resulting value matrix.
    """
    ordered_cols = [
        'datetime', 'symbol', 'title', 'actual', 'forecast', 'previous',
        'symbol_pair', 'preceding_price'
    ]
    news = news[ordered_cols]

    all_titles = self.prep_data_provider.get_all_titles()
    all_pairs = self.prep_data_provider.get_currency_pair_strings()
    all_currencies = self.prep_data_provider.get_all_currencies()

    news = news.reset_index(drop=True)

    # NOTE(review): the previous code evaluated
    # ``news['preceding_price'].apply(lambda x: x * 10)`` and threw the result
    # away, so the column was never scaled. The no-op is removed here without
    # changing the output; assign the result back if scaling was intended.
    news = self.one_hot_from_all_items(news, 'symbol', all_currencies)
    news = self.one_hot_from_all_items(news, 'symbol_pair', all_pairs)
    news = self.one_hot_from_all_items(news, 'title', all_titles)

    # ``axis`` must be a keyword in recent pandas.
    news = news.drop('datetime', axis=1)
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    return news.values[:max_len]
def pre_process(x_train: pd.DataFrame, ind: int):
    """Load and pre-process one annotated frame and build its label map.

    Row ``ind`` of ``x_train`` holds the image file name in column 0 and the
    box corners ``x1, y1, x2, y2`` in columns 1-4. The image path is derived
    by inserting ``/frames/<clip>`` after the ``dayClip<N>`` component of the
    file name. The image is cropped to its top half, resized to 400x200,
    Gaussian-blurred and converted from BGR to YUV.

    Args:
        x_train: annotation table as described above.
        ind: positional row index of the sample.

    Returns:
        ``(final_image, labels)`` where ``labels`` is a ``(-1, 2)`` array
        holding ``[1, 0]`` outside the box and ``[0, 1]`` inside it, built at
        the cropped size and resized to the size of the resized image.

    Raises:
        ValueError: if the file name contains no ``dayClip<N>`` component.
        FileNotFoundError: if OpenCV cannot read the image.
    """

    def labels(shape: Tuple, target_shape: Tuple):
        # One-hot, image-shaped label array: background first, box second.
        array = np.empty((shape[0], shape[1], 2))
        array[:, :, :] = [1, 0]
        array[y1:y2 + 1, x1:x2 + 1, :] = [0, 1]
        # cv2.resize expects (width, height).
        image = cv2.resize(array, (target_shape[1], target_shape[0]))
        return np.reshape(image, (-1, 2))

    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    row = x_train.values[ind]
    file = row[0]
    x1, y1, x2, y2 = row[1], row[2], row[3], row[4]

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'dayClip\d+', file)
    if match is None:
        raise ValueError("no 'dayClip<N>' component in file name: %r" % (file,))
    clip = match.group(0)
    formatted = file.replace(clip, clip + "/frames/" + clip)

    img = cv2.imread("./" + formatted)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("./" + formatted)
    shape = img.shape

    # Crop: keep the top half of the picture.
    cropped = img[0:shape[0] // 2]
    # Resize
    resized = cv2.resize(cropped, (400, 200), interpolation=cv2.INTER_AREA)
    # Blur
    blurred = cv2.GaussianBlur(resized, (5, 5), 0)
    # Convert color space
    final_image = cv2.cvtColor(blurred, cv2.COLOR_BGR2YUV)
    return final_image, labels(cropped.shape, resized.shape)
def transform(self):
    """Build one row per feedback event found in the files of ``self.flist``.

    ``self.subjsess_list[i]`` describes file ``self.flist[i]``: characters
    1-2 are parsed as the subject number and characters 8-9 as the session
    number. Each file is read with ``read_csv`` and every row whose
    ``FeedBackEvent`` column equals 1 yields one output row.

    Returns:
        2-D array with the columns ``subject``, ``sess``, ``feedback_num``
        (running number of the event within its file) and ``start_pos``
        (index of the event row in its file).
    """
    # Lists, not map(): on Python 3 a map object cannot be indexed.
    subjectlist = [int(name[1:3]) for name in self.subjsess_list]
    feedbacksesslist = [int(name[8:10]) for name in self.subjsess_list]

    xsubj = []
    xsess = []
    xfeedbacknum = []
    xstartpos = []

    for findex, fname in enumerate(self.flist):
        x_df = read_csv(fname)
        fb_indices = x_df[x_df["FeedBackEvent"] == 1].index.tolist()
        del x_df  # release the file contents before reading the next one

        n_events = len(fb_indices)
        xsubj.extend([subjectlist[findex]] * n_events)
        xsess.extend([feedbacksesslist[findex]] * n_events)
        xfeedbacknum.extend(range(n_events))
        xstartpos.extend(fb_indices)

    column_order = ["subject", "sess", "feedback_num", "start_pos"]
    X = DataFrame(
        {
            "subject": xsubj,
            "sess": xsess,
            "feedback_num": xfeedbacknum,
            "start_pos": xstartpos,
        },
        columns=column_order,
    )
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    return X.values
def transform(self, x):
    """Combine the matrix in X with the selected function.

    When ``self.columns`` and ``self.column_name`` are both set, ``x`` is
    passed to ``combine_matrix`` with ``self.op``. A non-DataFrame input is
    first wrapped in a DataFrame whose columns are named ``'0'``, ``'1'``,
    ...; in that case ``self.column_name`` is overwritten with
    ``str(n_features + 1)`` and the result is converted back to an array.

    Args:
        x: array of shape [n_samples, n_features]

    Returns:
        array of shape [n_samples, n_features + 1] with a extra featured
        with the combined matrix. A DataFrame input yields a DataFrame.
    """
    df = x
    converted = False
    if self.columns and self.column_name:
        if not isinstance(x, DataFrame):
            n_features = x.shape[1]
            self.column_name = str(n_features + 1)
            df = DataFrame(data=x,
                           columns=[str(c_i) for c_i in range(n_features)])
            converted = True
        df = combine_matrix(df, columns=self.columns,
                            column_result=self.column_name, func=self.op)

    if not converted:
        return df

    def numeric_first(label):
        # Column names are digit strings; sorting them as text would put
        # '10' before '2' and scramble the feature order past 10 columns.
        text = str(label)
        return (0, int(text), text) if text.isdigit() else (1, 0, text)

    ordered = sorted(df.columns, key=numeric_first)
    # DataFrame.as_matrix was removed in pandas 1.0; select, then .values.
    return df[ordered].values
def ParamReading(filen, sname):
    """Read a parameter sheet into an identifier set, a property set and a value map.

    The sheet is loaded with its first row as header. In the resulting value
    matrix, row 0 supplies the property names (columns 3 and up) and every
    later row is one entry whose first three cells, converted to strings,
    form its identifier.

    Args:
        filen: path of the Excel workbook.
        sname: name of the sheet to read.

    Returns:
        ``(SMI, Second, L)``: the set of identifier 3-tuples, the set of
        property names, and a dict mapping ``(identifier, property)`` to the
        cell value for every cell that is not NaN.
    """
    # The sheet is passed positionally because the keyword was renamed from
    # ``sheetname`` to ``sheet_name`` between pandas releases.
    table = read_excel(filen, sname, header=0)
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    Matrix = table.values
    n = len(Matrix)
    m = len(Matrix[0])

    # Built-in set replaces the ``sets.Set`` class, which is Python 2 only.
    SMI = set()      # identifier tuples
    Second = set()   # property names
    L = {}           # (identifier, property) -> value

    for i in range(1, n):
        identifier = tuple(str(Matrix[i, j]) for j in range(0, 3))
        SMI.add(identifier)
        for j in range(3, m):
            if not isnan(Matrix[i, j]):
                L[identifier, str(Matrix[0, j])] = Matrix[i, j]

    # Property names do not depend on the row, so collect them once.
    # NOTE(review): only filled when there is at least one entry row, as
    # before — confirm that is intended.
    if n > 1:
        Second.update(str(Matrix[0, j]) for j in range(3, m))
    return SMI, Second, L
def transform(self):
    """Build one row per feedback event found in the files of ``self.flist``.

    ``self.subjsess_list[i]`` describes file ``self.flist[i]``: characters
    1-2 are parsed as the subject number and characters 8-9 as the session
    number. Each file is read with ``read_csv`` and every row whose
    ``FeedBackEvent`` column equals 1 yields one output row.

    Returns:
        2-D array with the columns ``subject``, ``sess``, ``feedback_num``
        (running number of the event within its file) and ``start_pos``
        (index of the event row in its file).
    """
    # Lists, not map(): on Python 3 a map object cannot be indexed.
    subjectlist = [int(name[1:3]) for name in self.subjsess_list]
    feedbacksesslist = [int(name[8:10]) for name in self.subjsess_list]

    rows = {"subject": [], "sess": [], "feedback_num": [], "start_pos": []}

    for subject, session, fname in zip(subjectlist, feedbacksesslist, self.flist):
        x_df = read_csv(fname)
        fb_indices = x_df[x_df["FeedBackEvent"] == 1].index.tolist()
        del x_df  # release the file contents before reading the next one

        n_events = len(fb_indices)
        rows["subject"].extend([subject] * n_events)
        rows["sess"].extend([session] * n_events)
        rows["feedback_num"].extend(range(n_events))
        rows["start_pos"].extend(fb_indices)

    X = DataFrame(rows, columns=["subject", "sess", "feedback_num", "start_pos"])
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    return X.values
def Impute(data_as_DataFrame, kNNGraph, Method=IgnoringNan.mean, target=None):
    """Impute(data_as_DataFrame,Graph) -> pandas DataFrame with nan's imputed

    Imputation is via Graph Neighborhoods of kNNGraph: for every NaN cell of
    row ``i``, ``Method`` is applied to the array of values that the
    neighbours ``kNNGraph.neighbors(i)`` hold in the same column, and the
    result is truncated with ``int()`` before it is stored.

    Note: data_as_DataFrame can also be a numpy array.

    Args:
        data_as_DataFrame: DataFrame (copied, never modified) or anything
            ``DataFrame(...)`` accepts.
        kNNGraph: object whose ``neighbors(i)`` returns positional row indices.
        Method: reduction applied to each neighbourhood array.
        target: unused.

    Returns:
        A new DataFrame with the NaN cells filled in.
    """
    try:
        data_as_DataFrame.columns
        data_as_DataFrame.index
        DFrame = data_as_DataFrame.copy()
    except AttributeError:
        # Not DataFrame-like (e.g. a numpy array): wrap it.
        DFrame = DataFrame(data_as_DataFrame)

    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    # NOTE(review): depending on the frame's dtypes this may be a view, so
    # already imputed cells may or may not be seen by later rows — confirm
    # which behaviour is wanted.
    Data = DFrame.values
    m, n = DFrame.shape

    for i in range(m):
        nbrs = kNNGraph.neighbors(i)
        for j in range(n):
            if isnan(Data[i, j]):
                neighbourhood = array([Data[nbr, j] for nbr in nbrs])
                # .iat replaces DataFrame.set_value (removed in pandas 1.0)
                # and addresses the same (row i, column j) cell by position.
                DFrame.iat[i, j] = int(Method(neighbourhood))
    return DFrame
def __init__(self, notation=None, resolution=0.01, parameters=None):
    """Store parameters and notation and derive the time-marker table.

    Args:
        notation: experiment notation; kept as ``original_notation`` and
            handed to ``parse_notation`` / ``convert_notation`` through the
            instance attributes.
        resolution: stored as ``self.resolution``.
        parameters: optional dict of overrides merged into the defaults below.
    """
    self.parameters = {"beta_r": 0.5,
                       "beta_e": 1e-5,
                       "c": 1.0,
                       "alpha": 0.05,
                       "phi": 50.0,
                       "gamma": 0.2,
                       "r0": 0.05,
                       "A": 5.0,
                       "mu_poisson": 3.0,
                       "mu_wald": 0.68,
                       "lambda_wald": 0.93}
    if parameters is not None:
        self.parameters.update(parameters)

    # TODO(David): Completely change this, probably, because it sucks
    self.resolution = resolution
    self.original_notation = notation
    self.notation = notation  # in case we need to convert point events
    self.array_notation = notation  # numpy array for jit
    self.parse_notation()
    self.convert_notation()

    # TODO(David): Fix for multiple cycle types
    df_cycle_types = DataFrame(self.array_notation[:, :, 0],
                               columns=["event", "id"])
    is_marker = ((df_cycle_types.event == STIMULUS_ON)
                 | (df_cycle_types.event == STIMULUS_OFF))
    df_cycle_types = df_cycle_types[is_marker].drop_duplicates()
    df_cycle_types["row"] = arange(0, df_cycle_types.shape[0])

    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    self.time_marker_info = df_cycle_types.values
    self.n_time_markers = self.time_marker_info.shape[0]
    # Index of the "row" column appended above.
    self.time_marker_row = 2
def transform_target_vector(df: pd.DataFrame, binary=False) -> np.ndarray:
    """Return the ``returnQuantity`` column as a squeezed float32 array.

    Only used on data with known labels otherwise it will fail.

    Args:
        df: frame with a ``returnQuantity`` column. When ``binary`` is true
            the column is overwritten in place with the binarised values.
        binary: map every positive value to 1 and every other non-NaN value
            to 0; NaN is kept.

    Returns:
        The column values as float32, passed through ``np.squeeze``.
    """

    def detect(value):
        # NaN marks an unknown label and must survive binarisation.
        if np.isnan(value):
            return value
        return 1 if value > 0 else 0

    if binary:
        df.returnQuantity = df.returnQuantity.apply(detect)
    # DataFrame.as_matrix was removed in pandas 1.0; select, then .values.
    target = df[['returnQuantity']].values
    return np.squeeze(target).astype(np.float32)
def test_as_matrix(self):
    """Check ``as_matrix`` against element access, mixed dtypes and ``reindex``."""
    frame = self.frame
    columns = frame.columns
    matrix = frame.as_matrix()

    # Every cell must equal the frame element it came from (NaN matches NaN).
    for row_pos, row in enumerate(matrix):
        for col_pos, value in enumerate(row):
            label = columns[col_pos]
            if not np.isnan(value):
                assert value == frame[label][row_pos]
            else:
                assert np.isnan(frame[label][row_pos])

    # mixed type
    mixed = self.mixed_frame.as_matrix(['foo', 'A'])
    assert mixed[0, 0] == 'bar'

    complex_frame = DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]})
    complex_matrix = complex_frame.as_matrix()
    assert complex_matrix[0, 0] == 1j

    # single block corner case
    selected = ['A', 'B']
    result = self.frame.as_matrix(selected)
    expected = self.frame.reindex(columns=selected).values
    assert_almost_equal(result, expected)
def buildTransferEntropyMatrix(fname, debug=False):
    """Build a square matrix of pairwise transfer entropies for one file.

    The tab-separated file is read and its first and last columns are
    dropped. Entry ``[i, j]`` (``i != j``) is
    ``getTransferEntropy(..., column i, column j)``; diagonal entries are
    ``getShannonEntropy`` of the column.

    Args:
        fname: path of the tab-separated input file.
        debug: print a timestamped progress line per matrix row.

    Returns:
        The entropy matrix. When the data has exactly 265 columns, zero rows
        and columns are inserted before positions 25, 132 and 176.
    """
    print(fname)
    teCalcClass = JPackage(
        "infodynamics.measures.continuous.gaussian"
    ).TransferEntropyCalculatorGaussian
    teCalc = teCalcClass()
    seCalcClass = JPackage(
        "infodynamics.measures.continuous.kernel").EntropyCalculatorKernel
    seCalc = seCalcClass()

    raw_df = pd.read_table(fname, delimiter='\t')
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    data = raw_df.values[:, 1:-1]
    n_series = data.shape[1]

    # Start from -1 everywhere; every cell is overwritten below.
    mat = np.full((n_series, n_series), -1.0)
    print("Building matrix for " + fname)
    for i in range(n_series):
        if debug:
            print("Row: " + str(i) + ", Time: " + str(time.time()))
        for j in range(n_series):
            if i == j:
                # Keep in bits.
                mat[i, j] = getShannonEntropy(seCalc, data[:, i])
            else:
                mat[i, j] = getTransferEntropy(teCalcClass, teCalc,
                                               data[:, i], data[:, j])

    # NOTE(review): presumably pads three series missing from the
    # 265-column data sets — confirm.
    if n_series == 265:
        mat = np.insert(mat, (25, 132, 176), 0.0, axis=0)
        mat = np.insert(mat, (25, 132, 176), 0.0, axis=1)
    return mat
def build_wordencoder(embeddings: pd.DataFrame, transform: Callable[[str], str]) \
        -> TextEncoder:
    """
    Create a word-level encoder: a Callable, mapping strings to word vectors.
    Encoders dispatch on input type: if you pass a single string, you will
    get a 1D array, if you pass an Iterable of strings, you will get a 2D
    array, where row i encodes the i-th string in the Iterable.
    :param embeddings: a dataframe of word vectors indexed by words. The last
    vector (row) is used to encode OOV words.
    :param transform: applied to every word before it is looked up.
    :return: the encoder callable.
    :raises ValueError: if `embeddings` is empty or indexed by non-strings;
    the encoder itself raises it for empty words and empty iterables.
    """
    wordmap = {word: i for i, word in enumerate(embeddings.index)}
    if not wordmap:
        raise ValueError('empty `embeddings`')
    if not all(isinstance(word, str) for word in wordmap):
        raise ValueError('`embeddings` can be indexed by strings alone')
    oov = wordmap[embeddings.index[-1]]
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    vectors = embeddings.values

    def index(word: str) -> int:
        if not word:
            raise ValueError("can't encode empty words")
        return wordmap.get(transform(word), oov)

    def wordencoder(target: Union[str, Iterable[str]]) -> np.ndarray:
        if isinstance(target, str):
            return vectors[index(target)]
        indices = [index(word) for word in target]
        if not indices:
            raise ValueError('there are no `target`s')
        return np.vstack(vectors[indices])

    return wordencoder
def orient_undirected_graph(self, data, graph, CItest="gaussian",
                            method_indep='pcalg', alpha=0.01,
                            njobs=SETTINGS.NB_JOBS, verbose=False, **kwargs):
    """Run PC on an undirected graph.

    The adjacency matrix of ``graph`` is passed to ``run_pc`` as the fixed
    edges and its complement (``1 - adjacency``) as the fixed gaps.

    Args:
        data: DataFrame; its columns name the nodes of the result.
        graph: undirected networkx graph.
        CItest: key into ``self.CI_tests``.
        method_indep: key into ``self.method_indep``.
        alpha: stored as the ``{ALPHA}`` argument.
        njobs: stored as the ``{NJOBS}`` argument.
        verbose: forwarded to ``run_pc`` and stored as ``{VERBOSE}``.
        **kwargs: accepted but not used here.

    Returns:
        A ``networkx.DiGraph`` built from the ``run_pc`` output, with node
        ``i`` relabelled to ``data.columns[i]``.
    """
    # Building setup w/ arguments.
    self.arguments['{CITEST}'] = self.CI_tests[CItest]
    self.arguments['{METHOD_INDEP}'] = self.method_indep[method_indep]
    self.arguments['{DIRECTED}'] = 'TRUE'
    self.arguments['{ALPHA}'] = str(alpha)
    self.arguments['{NJOBS}'] = str(njobs)
    self.arguments['{VERBOSE}'] = str(verbose).upper()

    # adjacency_matrix is the canonical name; the adj_matrix alias was
    # dropped from networkx.
    fe = DataFrame(nx.adjacency_matrix(graph, weight=None).todense())
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    fg = DataFrame(1 - fe.values)

    results = self.run_pc(data, fixedEdges=fe, fixedGaps=fg, verbose=verbose)
    return nx.relabel_nodes(nx.DiGraph(results),
                            dict(enumerate(data.columns)))
def gonzales(data, k):
    """Choose ``k`` cluster centres by farthest-first traversal.

    The first point is the first centre. Each following centre is the point
    whose Euclidean distance to its nearest already chosen centre is largest
    (the earliest such point on ties). Distances use the first two
    coordinate columns only.

    This replaces an implementation built on pandas APIs that no longer
    exist (``.ix``, ``DataFrame.append``, ``set_value``, ``as_matrix``) and
    that raised IndexError for ``k >= 7``.

    Args:
        data: 2-D numpy array; column 0 is the point id, columns 1 and 2 are
            the coordinates.
        k: number of centres wanted.

    Returns:
        Array of shape ``(n_centres, 2)`` with the centre coordinates in the
        order they were chosen. ``n_centres`` is smaller than ``k`` when the
        data has fewer than ``k`` distinct points; ``k <= 0`` gives an empty
        array.
    """
    coords = np.asarray(data)[:, 1:3].astype(float)
    if k <= 0 or len(coords) == 0:
        return coords[:0]

    chosen = [0]
    # Distance of every point to its nearest chosen centre.
    nearest = np.linalg.norm(coords - coords[0], axis=1)

    while len(chosen) < k:
        # argmax returns the first maximum, i.e. the earliest point on ties.
        candidate = int(np.argmax(nearest))
        if not nearest[candidate] > 0:
            # Every remaining point coincides with an existing centre.
            break
        chosen.append(candidate)
        nearest = np.minimum(
            nearest, np.linalg.norm(coords - coords[candidate], axis=1))

    return coords[chosen]
def _to_xy(df: pd.DataFrame, target: str): """Converts a Pandas dataframe to the x,y inputs that TensorFlow needs""" result = [] for x in df.columns: if x != target: result.append(x) dummies = df[target] return df.as_matrix(result).astype(np.float32), dummies.as_matrix().flatten().astype(int)
def transform_target_vector(df: pd.DataFrame, binary=False) -> np.ndarray:
    """Return the ``returnQuantity`` column as a squeezed float32 array.

    Only used on data with known labels otherwise it will fail.

    Args:
        df: frame with a ``returnQuantity`` column. When ``binary`` is true
            the column is overwritten in place with the binarised values.
        binary: map every positive value to 1 and every other non-NaN value
            to 0; NaN is kept.

    Returns:
        The column values as float32, passed through ``np.squeeze``.
    """
    if binary:
        # NaN marks an unknown label and must survive binarisation.
        df.returnQuantity = df.returnQuantity.apply(
            lambda x: x if np.isnan(x) else (1 if x > 0 else 0))
    # DataFrame.as_matrix was removed in pandas 1.0; select, then .values.
    column = df[['returnQuantity']].values
    return np.squeeze(column).astype(np.float32)
def getData(training_file, time_steps, split_percent):
    """Build sliding-window samples from a CSV series and split them.

    Sample ``i`` covers rows ``i .. i + time_steps - 1`` of the first column
    of the series shifted down by one row (first row filled with 0). Every
    time step carries the file vector from ``getFileVector(training_file)``
    followed by that shifted value. The target of sample ``i`` is the
    unshifted first-column value at row ``i + time_steps``.

    Args:
        training_file: path of the CSV file.
        time_steps: window length.
        split_percent: fraction of the samples used for training.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` with X of shape
        ``(samples, time_steps, len(file_vector) + 1)`` and y of shape
        ``(samples, 1)``.
    """
    print("Train on file", training_file)
    series = read_csv(training_file)
    file_vector = getFileVector(training_file)
    print("File vector ", file_vector)
    file_vector_size = len(file_vector)

    # Shift by one step so each window only contains earlier values.
    series_dataframe = DataFrame(series)
    shifted_dataframe = series_dataframe.shift(1).fillna(0)

    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    series_dataframe_narray = series_dataframe.values
    shifted_dataframe_narray = shifted_dataframe.values

    # Create small samples of time_steps size
    split_length = time_steps
    num_samples = shifted_dataframe_narray.shape[0] - split_length
    num_features = file_vector_size + 1

    # Already 3-D (samples, steps, features), the layout an LSTM expects.
    X_samples = np.zeros([num_samples, split_length, num_features])
    y_samples = np.zeros([num_samples, 1])

    # The file vector is identical for every sample and time step, so it is
    # broadcast once instead of being materialised as a repeated array.
    X_samples[:, :, :file_vector_size] = np.array(file_vector).reshape(-1)
    for i in range(num_samples):
        X_samples[i, :, file_vector_size] = \
            shifted_dataframe_narray[i:i + split_length, 0]
        y_samples[i] = series_dataframe_narray[i + split_length, 0]

    # Split into training and test data
    split = int(split_percent * num_samples)
    X_train_samples = X_samples[:split]
    y_train_samples = y_samples[:split]
    X_test_samples = X_samples[split:]
    y_test_samples = y_samples[split:]
    return X_train_samples, y_train_samples, X_test_samples, y_test_samples
def gonzales(data , k):
    """Choose ``k`` cluster centres by farthest-first traversal.

    The first point is the first centre. Each following centre is the point
    whose Euclidean distance to its nearest already chosen centre is largest
    (the earliest such point on ties). Distances use the first two
    coordinate columns only.

    This replaces an implementation built on pandas APIs that no longer
    exist (``.ix``, ``DataFrame.append``, ``set_value``, ``as_matrix``) and
    that raised IndexError for ``k >= 7``.

    Args:
        data: 2-D numpy array; column 0 is the point id, columns 1 and 2 are
            the coordinates.
        k: number of centres wanted.

    Returns:
        Array of shape ``(n_centres, 2)`` with the centre coordinates in the
        order they were chosen. ``n_centres`` is smaller than ``k`` when the
        data has fewer than ``k`` distinct points; ``k <= 0`` gives an empty
        array.
    """
    points = np.asarray(data)[:, 1:3].astype(float)
    if k <= 0 or len(points) == 0:
        return points[:0]

    center_rows = [0]
    # Distance of every point to its nearest chosen centre.
    min_distance = np.linalg.norm(points - points[0], axis=1)

    for _ in range(1, k):
        # argmax returns the first maximum, i.e. the earliest point on ties.
        farthest = int(np.argmax(min_distance))
        if not min_distance[farthest] > 0:
            # Every remaining point coincides with an existing centre.
            break
        center_rows.append(farthest)
        to_new_center = np.linalg.norm(points - points[farthest], axis=1)
        min_distance = np.minimum(min_distance, to_new_center)

    return points[center_rows]
def df2array(stock_df: pd.DataFrame, X_feats: List[str], y_feat: str, rescale=False):
    """Extract a feature matrix and a binary label vector from a frame.

    Labels are ``sign(sign(y) + 1)``: negative values become 0, zero and
    positive values become 1. Rows whose label is not finite are dropped;
    NaN features are replaced through ``np.nan_to_num``.

    Args:
        stock_df: source frame.
        X_feats: names of the feature columns.
        y_feat: name of the column the labels are derived from.
        rescale: standardise every feature column to zero mean and unit
            standard deviation. Constant columns become all zeros instead of
            being divided by a zero standard deviation.

    Returns:
        ``(dataX, dataY)``.
    """
    # DataFrame.as_matrix was removed in pandas 1.0; select, then .values.
    dataX = stock_df[X_feats].values
    dataY = stock_df[[y_feat]].values.reshape(-1)
    dataY = np.sign(np.sign(dataY) + 1.0)  # float => label

    keep = np.isfinite(dataY)
    dataX = np.nan_to_num(dataX[keep, :])
    dataY = dataY[keep]

    if rescale:
        X_mean = np.mean(dataX, axis=0)
        X_std = np.std(dataX, axis=0)
        # A constant column has std 0; dividing by 1 leaves it at 0.
        X_std = np.where(X_std == 0, 1.0, X_std)
        dataX = (dataX - X_mean[np.newaxis, :]) / X_std[np.newaxis, :]
    return dataX, dataY
def pickleRawData():
    """Turn the raw price file into shuffled (input, target) window pairs.

    Every line of ``RAW_FILE_NAME`` is split on commas: fields 0 and 1 form
    the time stamp passed to ``makeTime``, fields 3-6 are highest, lowest,
    close and volume. For every window of ``SAMPLE_LENGTH`` consecutive
    lines the three prices are mapped onto ``[0, GRID_HIGH]`` using the
    window's highest high and lowest low; the window becomes a matrix with
    the rows h, l, c, v, t divided by ``GRID_HIGH``. Window ``i`` is paired
    with window ``i + PREDICT_LENGTH``, the pairs are shuffled and pickled
    to ``filename()``.
    """
    with open(RAW_FILE_NAME, 'r') as f:
        rawdata = f.readlines()

    middata = []  # [h l close vo time]
    print('read and format data:\n')
    for line in tqdm(rawdata):
        cell = line.split(',')
        stamp = cell[0] + ' ' + cell[1]
        highest = float(cell[3])
        lowest = float(cell[4])
        close = float(cell[5])
        volume = float(cell[6])
        middata.append([highest, lowest, close, volume, makeTime(stamp)])

    datalen = len(middata)
    assert(datalen > SAMPLE_LENGTH)
    num = datalen - SAMPLE_LENGTH + 1

    lastdata = []
    print('read completed! now enter core processing:\n')
    for i in tqdm(range(num)):
        df = DataFrame(middata[i:i + SAMPLE_LENGTH])
        max_v = max(df[0])
        min_v = min(df[1])
        f_k = lambda x: (GRID_HIGH*(x-min_v)/(max_v-min_v))
        df['h'] = df[0].apply(f_k)
        df['l'] = df[1].apply(f_k)
        df['c'] = df[2].apply(f_k)
        df['v'] = df[3]
        df['t'] = df[4]
        # Selecting the derived columns replaces deleting columns 0-4 in a
        # loop that reused the outer loop variable. A discarded
        # ``df['v'].astype('float')`` call was dropped as it had no effect.
        # DataFrame.as_matrix was removed in pandas 1.0.
        matrix = df[['h', 'l', 'c', 'v', 't']].values
        matrix = matrix.transpose()
        matrix = matrix / GRID_HIGH
        lastdata.append(matrix)

    lastlen = len(lastdata)
    assert(lastlen > PREDICT_LENGTH)
    lastnum = lastlen - PREDICT_LENGTH

    print('core completed!, make output tuple:\n')
    outxy = []
    for i in tqdm(range(lastnum)):
        outxy.append((lastdata[i], lastdata[i + PREDICT_LENGTH]))
    random.shuffle(outxy)

    with open(filename(), 'wb') as f:
        pickle.dump(outxy, f)
def fit(self, X: pd.DataFrame, w: np.ndarray):
    """Fit the perturbation covariance to weighted samples.

    Stores the sample matrix, scales the weighted covariance from
    ``smart_cov`` by the squared bandwidth factor and ``self.scaling``, and
    builds a zero-mean multivariate normal with that covariance.

    Args:
        X: samples, one row per particle.
        w: particle weights; the effective sample size is ``1 / sum(w**2)``.

    Raises:
        NotEnoughParticles: if ``X`` has no rows.
    """
    if len(X) == 0:
        raise NotEnoughParticles("Fitting not possible.")
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    self._X_arr = X.values

    sample_cov = smart_cov(self._X_arr, w)
    dim = sample_cov.shape[0]
    eff_sample_size = 1 / (w**2).sum()
    bw_factor = self.bandwidth_selector(eff_sample_size, dim)

    self.cov = sample_cov * bw_factor**2 * self.scaling
    self.normal = st.multivariate_normal(cov=self.cov, allow_singular=True)
def fit(self, X: pd.DataFrame, w: np.ndarray):
    """Fit the perturbation covariance to weighted samples.

    Stores the sample matrix, scales the weighted covariance from
    ``smart_cov`` by the bandwidth selector result and ``self.scaling``, and
    builds a zero-mean multivariate normal with that covariance.

    Args:
        X: samples, one row per particle.
        w: particle weights; the effective sample size is
            ``len(X) / (1 + w.var())``.

    Raises:
        NotEnoughParticles: if ``X`` has no rows.
    """
    if len(X) == 0:
        raise NotEnoughParticles("Fitting not possible.")
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    self._X_arr = X.values

    cov = smart_cov(self._X_arr, w)
    effective_sample_size = len(X) / (1 + w.var())
    dimension = cov.shape[0]
    bandwidth = self.bandwidth_selector(effective_sample_size, dimension)

    self.cov = cov * bandwidth * self.scaling
    self.normal = st.multivariate_normal(cov=self.cov, allow_singular=True)
def unfinished_tasks(all_configurations, data_frame: pd.DataFrame, conf_columns: List[str]):
    """Return the configurations that have no row in ``data_frame``.

    Args:
        all_configurations: iterable of configurations, each an iterable of
            values ordered like ``conf_columns``.
        data_frame: results table with one row per finished configuration.
        conf_columns: the columns of ``data_frame`` that identify a
            configuration.

    Returns:
        List of configuration tuples not yet present (in arbitrary order).

    Raises:
        AssertionError: if ``data_frame`` holds a configuration that is not
            in ``all_configurations``.
    """
    # DataFrame.as_matrix was removed in pandas 1.0; select, then .values.
    done_configs = {tuple(config) for config in data_frame[conf_columns].values}
    all_as_set = {tuple(config) for config in all_configurations}

    # NOTE(review): this makes the second message below always report 0;
    # kept because callers may rely on the failure.
    assert done_configs <= all_as_set

    print('found {}/{} is done.'.format(len(done_configs & all_as_set),
                                        len(all_as_set)))
    print('there are {} finished settings not in all_configurations.'.format(
        len(done_configs - all_as_set)))
    return list(all_as_set - done_configs)
def fit(self, news_df: pd.DataFrame, batch_size: int, epochs: int,
        company_alias: Dict = None, verbose: int = 0):
    """Tokenise the news, build the embedding matrix and train the model.

    Args:
        news_df: frame with the text to tokenise and a ``sentiment`` column
            used as the training target.
        batch_size: forwarded to the model's ``fit``.
        epochs: forwarded to the model's ``fit``.
        company_alias: forwarded to ``tokenize_csv_file``.
        verbose: forwarded to the model's ``fit``.
    """
    news_df_prs = tokenize_csv_file(news_df,
                                    should_replace_company=True,
                                    should_remove_NE=True,
                                    should_remove_numbers=False,
                                    company_alias=company_alias)
    self.emb_matrix = self.text_converter.fit(news_df_prs, 'text')
    self.model = nn_model(self.emb_matrix)

    X = self.text_converter.convert(news_df_prs, 'text')
    # DataFrame.as_matrix was removed in pandas 1.0; select, then .values
    # (keeps the 2-D, single-column shape).
    y = news_df[['sentiment']].values

    print('Embedding Matrix Size: {}\nTraining Data Size, X: {}, Y:{}'.format(np.shape(self.emb_matrix), np.shape(X), np.shape(y)))
    print('Start Training...')
    self.model.fit(X, y, batch_size=batch_size, epochs=epochs, verbose=verbose)
def construct_data_matrix(input_file_name, output_filename_python,
                          output_filename_matlab, has_header=True):
    """Build a ticker x analyst matrix of relative target prices and save it.

    For every CSV row dated 2014 with month >= 6 that has a numeric analyst
    target, the ratio ``target / current price`` is stored under
    ``data[ticker][analyst]``, keeping only the most recent row per analyst.
    ``data[ticker][RESPONSE_LABEL]`` holds ``price in a year / current
    price``. The nested dict is pickled to ``output_filename_python``; the
    matrix (missing cells filled with 0) plus its row and column labels are
    written to ``output_filename_matlab``.

    Args:
        input_file_name: CSV file with the price-change records.
        output_filename_python: destination of the pickled dict.
        output_filename_matlab: destination of the ``.mat`` file.
        has_header: skip the first line of the input.
    """
    fields = PRICE_CHANGE_FIELDS
    data = dict()        # used to store the data matrix Z
    data_dates = dict()  # used to store the dates used in the data matrix
    ticker = None

    with open(input_file_name, 'r') as infile:
        if has_header:
            infile.readline()
        for row in csv.reader(infile):
            cur_date = datetime.strptime(row[fields.Date.zvalue],
                                         PRICE_CHANGE_DATE_FORMAT)
            # Filter everything that's not from 2014 H2
            # NOTE(review): month < 6 keeps June as well — confirm.
            if cur_date.year != 2014 or cur_date.month < 6:
                continue
            # Filter lines without an analyst target price
            try:
                Pi = float(row[fields.New_Target.zvalue])  # Analyst prediction = Pi / Peoq
            except ValueError:
                continue
            Peoq = float(row[fields.Current_Price.zvalue])
            Pf = float(row[fields.Price_in_a_year.zvalue])  # true values y = Pf / Peoq
            analyst = row[fields.Firm.zvalue]

            if ticker != row[fields.Ticker.zvalue]:
                # NOTE(review): a ticker that reappears after another one has
                # its earlier entries reset; assumes rows are grouped by
                # ticker and that the response is recorded once per group —
                # confirm.
                ticker = row[fields.Ticker.zvalue]
                data[ticker] = dict()
                data_dates[ticker] = dict()
                data[ticker][RESPONSE_LABEL] = Pf / Peoq

            # Don't assume data is ordered latest first
            if analyst not in data[ticker] or data_dates[ticker][analyst] < cur_date:
                data[ticker][analyst] = Pi / Peoq
                data_dates[ticker][analyst] = cur_date

    # Context manager so the pickle file is flushed and closed.
    with open(output_filename_python, 'wb') as outfile:
        pickle.dump(data, outfile)

    df = DataFrame(data).T.fillna(0)
    column_labels = df.columns.tolist()  # Firms
    row_labels = df.index.tolist()       # Tickers
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    savedict = {"data": df.values,
                "column_labels": column_labels,
                "row_labels": row_labels}
    sio.savemat(output_filename_matlab, savedict)
def standard_som(path="data/dorothea_clean.csv"):
    """Train a 20x20 self-organising map on a CSV file and show its U-matrix.

    Args:
        path: header-less CSV file; rows containing missing values are
            dropped. Defaults to the data set that used to be hard-coded.
    """
    table = pd.read_csv(path, header=None)
    table_new = table.dropna(axis=0)
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    data = table_new.values

    my_som = SOM(20, 20, 5000)
    lattice = my_som.calc(data)
    u_matrix = create_u_matrix(lattice)

    plt.matshow(u_matrix.T, fignum=100, cmap='viridis')
    plt.show()
def orient_directed_graph(self, data, dag, alg='HC', **kwargs):
    """
    Improve a directed acyclic graph using CGNN

    The data is passed through ``scale`` (column names are kept, the index
    is not) before the search runs.

    :param data: data
    :param dag: directed acyclic graph to optimize
    :param alg: type of algorithm, ``'HC'`` (hill climbing) or ``'tabu'``
    :param kwargs: forwarded to the selected search function
    :return: improved directed acyclic graph
    :raises KeyError: if ``alg`` is not one of the supported names
    """
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    data = DataFrame(scale(data.values), columns=data.columns)
    alg_dic = {'HC': hill_climbing, 'tabu': tabu_search}
    search = alg_dic[alg]
    return search(dag, data, self.infer_graph, **kwargs)
def cal_cor():
    """Drop features weakly correlated with the return rate, save and plot the rest.

    Loads a frame from ``raw_data.pkl``. Every feature (all frame columns
    but the first, all rows but the last) is correlated with the
    ``ReturnRate`` column taken from the second row on. Features whose
    absolute correlation is below 0.03 are removed. The remaining data, the
    rates and the remaining names from the module-level ``index`` are
    pickled to ``filter_cor.pkl`` and plotted to PNG files.
    """
    cor = list()
    # NOTE: pickle.load must only be used on trusted files.
    with open('raw_data.pkl', 'rb') as source:
        df = pickle.load(source)

    data = np.matrix(df)
    data = (data.T)[1:, :-1]
    # DataFrame.as_matrix was removed in pandas 1.0; select, then .values.
    rate = df[['ReturnRate']].values[1:]

    # filter cor
    a = []
    for i in range(data.shape[0]):
        cor.append(np.corrcoef(rate.T, data[i])[0][1])
        if abs(cor[i]) < 0.03:
            a.append(i)
            print('del:' + index[i] + ':' + str(cor[i]))
        else:
            print('keep:----' + index[i] + ':' + str(cor[i]) + '----')
    data = np.delete(data, a, axis=0)  # delete filtered features in data

    # Pop from the back so the remaining positions stay valid.
    a.reverse()
    index_cor = copy(index)
    for i in a:
        index_cor.pop(i)

    pack = {}
    pack['result'] = rate
    pack['data'] = data
    pack['index'] = index_cor
    with open('filter_cor.pkl', 'wb') as output:
        pickle.dump(pack, output)

    # Plotting
    if True:
        fig = figure()
        # all six features in one figure
        for i in range(len(pack['index'])):
            plot(pack['data'].tolist()[i], label=pack['index'][i])
        plot(pack['result'].T.tolist()[0], linewidth=2.5, label='result')
        legend(loc='upper left')
        savefig(
            'C:\\Projects\\FuzzyNeuro\\FuzzyNeuro\\20170320\\1\\raw_data.png',
            dpi=200)
        # each of the six features in its own figure
        for i in range(len(pack['index'])):
            figure()
            plot(pack['data'].tolist()[i], label=pack['index'][i])
            legend(loc='upper left')
            savefig('C:\\Projects\\FuzzyNeuro\\FuzzyNeuro\\20170320\\1\\' +
                    pack['index'][i] + '_raw.png',
                    dpi=200)
def CorReading(filname, sname):
    """Read a two-way table from an Excel sheet into a dict.

    The sheet is loaded with its first row as header. In the resulting value
    matrix, row 0 supplies the first key component (columns 1 and up) and
    column 0 the second (rows 1 and up).

    Args:
        filname: path of the Excel workbook.
        sname: name of the sheet to read.

    Returns:
        Dict mapping ``(str(row-0 cell), str(column-0 cell))`` to the cell
        value, for every cell that is not NaN. Empty when the sheet has no
        data rows.
    """
    # The sheet is passed positionally because the keyword was renamed from
    # ``sheetname`` to ``sheet_name`` between pandas releases.
    table = read_excel(filname, sname, header=0)
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    Matrix = table.values
    n, m = Matrix.shape

    L = {}
    for i in range(1, n):
        second = str(Matrix[i, 0])
        for j in range(1, m):
            value = Matrix[i, j]
            if not isnan(value):
                L[(str(Matrix[0, j]), second)] = value
    return L
def cleanTrain():
    """Load the training CSV and, outside DEBUG mode, cache train and test as pickles.

    Returns:
        The training DataFrame. In DEBUG mode the function previously failed
        with a NameError because the returned variable was never assigned;
        it now returns the frame it loaded.
    """
    pdtrain = pd.read_csv('train_ver2.csv/train_ver2.csv', delimiter=',')

    if not DEBUG:
        pdtest = pd.read_csv('test_ver2.csv/test_ver2.csv', delimiter=',')
        # Context managers so both pickle files are flushed and closed.
        with open(r'RawTrain.pickle', "wb") as train_out:
            pickle.dump(pdtrain, train_out)
        with open(r'RawTest.pickle', 'wb') as test_out:
            pickle.dump(pdtest, test_out)

    # The DEBUG branch used to convert the frame with DataFrame.as_matrix
    # (removed in pandas 1.0) into a variable nothing read; dropped.
    return pdtrain
def sequential_forward_selection(clf, X: pd.DataFrame, y: pd.DataFrame, k) -> tuple:
    """
    Greedy forward feature selection scored by 3-fold cross-validated accuracy.

    Step ``i`` adds the not yet selected feature that gives the best
    accuracy together with the features chosen in steps ``0 .. i-1``, so
    larger sets contain the smaller ones. A column named ``'Unnamed: 0'`` is
    ignored. At most as many steps run as there are features.

    :param clf: classifier passed to ``cross_val_predict``.
    :param X: feature frame.
    :param y: labels (flattened with ``ravel``).
    :param k: number of features to select.
    :return: ``(bestIndexes, bestFeatures, bestScores)``: dict step ->
             selected column position, list of the selected column names in
             selection order, dict step -> accuracy reached.
    """
    X = X.loc[:, X.columns != 'Unnamed: 0']
    base = list(X.keys())
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    X = X.values
    y = y.values.ravel()

    bestIndexes = dict()
    bestScores = dict()
    chosen = []  # selected column positions, in selection order

    # Capping at the feature count avoids a KeyError once every feature
    # has been used.
    for i in range(min(k, len(base))):
        bestScore = None
        for j in range(len(base)):
            if j in chosen:
                continue
            currX = X[:, chosen + [j]]
            tempScore = metrics.accuracy_score(
                y, cross_val_predict(clf, currX, y, cv=3))
            # ``None`` start: a step always selects a feature, even when
            # every candidate scores 0.
            if bestScore is None or tempScore > bestScore:
                bestScore = tempScore
                bestIndexes[i] = j
        chosen.append(bestIndexes[i])
        bestScores[i] = bestScore

    print(bestScores)
    bestFeatures = [base[position] for position in chosen]
    return bestIndexes, bestFeatures, bestScores
def generateMatrix():
    """Load up to 1000 ratings from MySQL into a book x user score matrix.

    Returns:
        ``numpy`` matrix with one row per ``book_id`` and one column per
        ``user_id`` (both in order of first appearance); cells without a
        rating are NaN. A later rating of the same pair overwrites an
        earlier one.
    """
    # NOTE(review): credentials are hard-coded; move them to configuration.
    # Keyword arguments: recent pymysql no longer accepts positional ones.
    db = pymysql.connect(host="localhost", user="root", password="root",
                         database="bookRec")
    try:
        cursor = db.cursor()
        # fetch the user id, book id and score of every comment
        sql = "select user_id,book_id,score from comment limit 1000;"
        cursor.execute(sql)
        data = cursor.fetchall()
    finally:
        # Close the connection even when the query fails.
        db.close()

    frame = DataFrame()  # build the matrix
    for n, item in enumerate(data, start=1):
        print(n)
        # assigning to a new label grows the frame automatically
        frame.loc[item[1], item[0]] = item[2]
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    return mat(frame.values)
def find_top_k_chars(recording, model, k=3):
    """Predict the most likely characters for every sample of a recording.

    Args:
        recording: raw input handed to ``preprocess_data``.
        model: fitted classifier exposing ``classes_`` and ``predict_proba``.
        k: number of characters wanted.
            NOTE(review): currently not passed on to ``top_k_prediction`` —
            confirm whether that helper needs it.

    Returns:
        Dict mapping each label returned by ``preprocess_data`` to the
        corresponding entry of ``top_k_prediction``.
    """
    x_test, labels = preprocess_data(recording)

    # Convert the test data to a DataFrame
    test_dat = DataFrame(x_test)
    # Center and scale the data
    # NOTE(review): the return value is ignored, so this presumably scales
    # test_dat in place — confirm.
    center_scale(test_dat)

    # Positional id -> class label of the model's first output
    classes = model.classes_[0]
    id_to_char = dict(zip(range(len(classes)), classes))

    # Predict class probabilities and map them to characters.
    # DataFrame.as_matrix was removed in pandas 1.0; .values is equivalent.
    test_prediction = model.predict_proba(test_dat.values)
    return dict(zip(labels, top_k_prediction(test_prediction, id_to_char)))
def pca(df: DataFrame, file_path: str, eigenvalues_condition: Callable[[float], bool]):
    """Reduce the dimensionality of a dataset with PCA and save it as CSV.

    Implementation based on 'A tutorial on Principal Components Analysis'
    by Lindsay I Smith.

    :param df: dataset with numeric attributes only and without the target
        attribute
    :param file_path: relative path of the CSV file the result is written to
    :param eigenvalues_condition: boolean function used to filter the
        eigenvalues (and with them the associated eigenvectors) that build
        the row_feature_vector matrix (see the tutorial).
    """
    # The first step is skipped: the data is assumed to meet the
    # preconditions.

    # Second step: subtract the per-column mean.
    row_data_adjust = df - df.mean()

    # Third step: covariance matrix.
    C = row_data_adjust.cov()

    # Fourth step: eigenvalues and eigenvectors of the covariance matrix.
    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    U, Sigma, V = randomized_svd(C.values,
                                 n_components=C.shape[0],
                                 n_iter=5,
                                 random_state=None)

    # Fifth step: choose the components that form the feature vector,
    # ordered by decreasing eigenvalue.
    order = (-Sigma).argsort()
    Sigma = Sigma[order]
    U = U[:, order]
    filtered_indices = [
        i for i in range(len(Sigma)) if eigenvalues_condition(Sigma[i])
    ]
    row_feature_vector = U[:, filtered_indices].transpose()

    # Sixth step: derive the new dataset (one row per original sample).
    # noinspection PyUnresolvedReferences
    final_data = np.matmul(row_feature_vector,
                           row_data_adjust.values.transpose())
    final_data = final_data.transpose()

    # Save the result as CSV.
    DataFrame(final_data).to_csv(file_path, index=False, encoding='utf-8')
def __init__(self):
    """Load the public-building and OASA transit tables and index the stops.

    Reads four delimited files from ``data/`` and fits a ``NearestNeighbors``
    model on the stop coordinates.
    """
    # Load data
    self.public_sector = read_table(
        "data/dhmosia_kthria/dhmosia_kthria_attikis.csv",
        index_col="ktirio_ypiresia",
        sep=';')
    self.stops = read_csv("data/oasa/stops.txt")
    self.routes = read_csv("data/oasa/routes.txt",
                           index_col="route_short_name")
    self.stop_times = read_csv("data/oasa/stop_times.txt",
                               index_col="stop_id")

    # Keep only the (longitude, latitude) of the bus stops as the "training"
    # data. Column selection plus ``.values`` replaces
    # ``DataFrame.as_matrix``, which was removed in pandas 1.0.
    self.coordinates = self.stops[["stop_lon", "stop_lat"]].values
    self.nbrs = NearestNeighbors().fit(self.coordinates)
def kmeans_scikit(data, k):
    """Cluster points with scikit-learn's KMeans, then print and plot them.

    :param data: 2-D array; column 0 is used as the point index and the
        remaining columns as coordinates.
    :param k: number of clusters.
    """
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    mat = points_list.values
    print(mat)

    # Using sklearn
    km = sklearn.cluster.KMeans(n_clusters=k)
    km.fit(mat)

    # Get cluster assignment labels
    labels = km.labels_
    print(labels)
    print('==============')
    # The hard-coded probe point has two coordinates, so the data must be
    # two-dimensional for this call to work.
    print(km.predict([[20, -15]]))

    # Attach the labels and plot the first two coordinates coloured by cluster.
    points_list['labels'] = labels
    points_list.plot(kind='scatter', x=0, y=1, c='labels')
    plt.show()
    print(points_list)
def encode_features(df: pd.DataFrame, ft: str) -> csr_matrix:
    """Encode one feature column as a sparse matrix.

    Columns listed in neither ``encode_label`` nor ``encode_int`` are passed
    through unchanged as a single sparse column. Columns in ``encode_label``
    are label-encoded and then one-hot encoded; columns in ``encode_int`` are
    one-hot encoded directly. ``encode_label`` wins when a name is in both.

    :param df: frame containing the column ``ft``.
    :param ft: name of the feature column to encode.
    :return: sparse matrix with one row per row of ``df``.
    """
    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    if ft not in set(encode_label + encode_int):
        return csr_matrix(df[[ft]].values)

    # Sparse output is the encoder's default; relying on it avoids the
    # ``sparse`` keyword, which newer scikit-learn renamed to
    # ``sparse_output``.
    one_hot_enc = OneHotEncoder()

    if ft in encode_label:
        V_lab = LabelEncoder().fit_transform(df[ft].values).reshape(-1, 1)
        return one_hot_enc.fit_transform(V_lab)

    # Only ``encode_int`` features can reach this point.
    V = df[ft].values.reshape(-1, 1)
    return one_hot_enc.fit_transform(V)
def _train(self, model_dates: pd.Series, train_data: pd.DataFrame) -> ERModel:
    """Fit an ``ERModel`` on the rows inside the training window.

    The window is the half-open index range
    ``[model_dates.trainStart, model_dates.trainEnd)`` located by bisection.
    Column 0 of ``train_data`` is the target; the remaining columns are the
    features.

    :param model_dates: provides ``trainStart`` and ``trainEnd``.
    :param train_data: training frame; bisection assumes its index is sorted
        ascending.
    :return: the fitted model.
    """
    time_line = train_data.index
    left = bisect.bisect_left(time_line, model_dates.trainStart)
    right = bisect.bisect_left(time_line, model_dates.trainEnd)

    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    window = train_data.values[left:right]
    y = window[:, 0]
    x = window[:, 1:]

    model = ERModel()
    model.fit(x, y)
    return model
def kmeans_scikit(data, k):
    """Run scikit-learn KMeans on the given points, print and plot the result.

    :param data: 2-D array whose first column holds the point identifiers and
        whose other columns hold the coordinates.
    :param k: number of clusters to fit.
    """
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    # ``as_matrix()`` was removed in pandas 1.0; ``.values`` returns the same
    # NumPy array.
    mat = points_list.values
    print(mat)

    # Using sklearn
    km = sklearn.cluster.KMeans(n_clusters=k)
    km.fit(mat)

    # Get cluster assignment labels
    labels = km.labels_
    print(labels)
    print('==============')
    # The probe point below has two coordinates, so this only works for
    # two-dimensional data.
    print(km.predict([[20, -15]]))

    # Store the labels next to the points and draw a scatter plot of the
    # first two coordinates, coloured by cluster.
    points_list['labels'] = labels
    points_list.plot(kind='scatter', x=0, y=1, c='labels')
    plt.show()
    print(points_list)
def __kalman(df: pd.DataFrame) -> pd.DataFrame:
    """Kalman-smooth the columns ['masl', 'lat', 'lon', 'dmasl', 'dlat', 'dlon'].

    The filter parameters are first refined with 6 EM iterations on the
    observations; the smoothed state means are then written back to ``df``
    as ``k_masl``, ``k_lat``, ``k_lon``, ``k_dmasl``, ``k_dlat`` and
    ``k_dlong``.

    :param df: frame holding the six input columns; it is modified in place.
    :return: the same ``df`` with the six smoothed columns added.
    """
    columns = ['masl', 'lat', 'lon', 'dmasl', 'dlat', 'dlon']

    # Each position is advanced by its delta.
    # NOTE(review): the three delta rows are all zero — confirm this is
    # intended.
    trans_mat = np.array([
        [1, 0, 0, 1, 0, 0],
        [0, 1, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 1],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
    ])
    # Only the three position components are mapped into the observations.
    obs_mat = np.array([
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
    ])

    # Take the first row by position: ``df.masl[0]`` looked up the *label* 0
    # and therefore failed or picked the wrong row on frames whose index does
    # not start at 0.
    first_row = df[columns].iloc[0].tolist()

    kf = pykalman.KalmanFilter(
        transition_matrices=trans_mat,
        transition_covariance=1.0e-4 * np.eye(6),
        observation_matrices=obs_mat,
        observation_covariance=1.0e-1 * np.eye(6),
        initial_state_mean=first_row,
        initial_state_covariance=1.0e-3 * np.eye(6),
    )

    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    x = df[columns].values
    (state_means, state_covs) = kf.em(x, n_iter=6).smooth(x)

    # The last output column is named 'k_dlong' (not 'k_dlon'); the name is
    # kept because callers may rely on it.
    output_names = ['k_masl', 'k_lat', 'k_lon', 'k_dmasl', 'k_dlat', 'k_dlong']
    for position, name in enumerate(output_names):
        df[name] = state_means[:, position]
    return df
def cluster_comments(self):
    """
    Clusters comments based on their timestamps and assigns
    cluster-membership as attribute to nodes.

    Graphs with fewer than 7 nodes are not clustered; every node gets
    ``cluster_id`` None instead. Otherwise the timestamps are converted to
    seconds since the earliest comment and clustered with MeanShift. The
    process exits with status 1 if no bandwidth can be estimated.
    """
    the_nodes = self.graph.nodes()
    # NOTE(review): ``self.graph.node[...]`` is the networkx < 2.4 accessor;
    # it is kept because the installed networkx version is not visible here.
    if len(the_nodes) < 7:
        logging.warning(
            "Skipped clustering for %s, only %i comments",
            self.post_title, len(the_nodes))
        for node in the_nodes:
            self.graph.node[node]['cluster_id'] = None
        return

    # ``nodes(data=True)`` works on networkx 1.x and 2.x alike, unlike the
    # 1.x-only ``nodes_iter``.
    com_ids, stamps = zip(
        *((node, attrs["com_timestamp"])
          for node, attrs in self.graph.nodes(data=True)))
    data = DataFrame(
        {'timestamps': stamps}, index=com_ids).sort_values(
            by='timestamps')
    # After sorting, the first row is the earliest comment. ``.iloc`` takes
    # it by position; ``.ix`` was removed in pandas 1.0.
    epoch = data['timestamps'].iloc[0]
    data['timestamps'] = data['timestamps'].apply(
        lambda timestamp: (timestamp - epoch).total_seconds())
    # TODO: identify outliers (or sparse end of stamps) and set
    # cluster-id to None
    # (find sparse "end" by using the time-stamps as index
    # and ones as data,
    # pd.rolling_sum() for 1 day-window and create mask based on < 2)
    # then create cluster_data for data[data["cluster_id"] != None]
    # ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
    cluster_data = data.values

    # TODO: need more refined way to set quantile
    # Try increasingly coarse quantiles; give up when all of them fail.
    for quantile in (.05, .3, 1):
        try:
            bandwidth = estimate_bandwidth(
                cluster_data, quantile=quantile)
        except ValueError:
            logging.info(
                "Estimation with quantile %f failed", quantile)
        else:
            break
    else:
        logging.warning("Could not cluster %s", self.post_title)
        sys.exit(1)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        try:
            mshift = MeanShift(bandwidth=bandwidth, bin_seeding=False)
            mshift.fit(cluster_data)
        except ValueError:
            # Fall back to a fixed bandwidth when the estimate is rejected.
            mshift = MeanShift(bandwidth=0.5, bin_seeding=False)
            mshift.fit(cluster_data)

    labels = mshift.labels_
    unique_labels = np.sort(np.unique(labels))
    logging.info("Found %i clusters in %s",
                 len(unique_labels), self.post_title)

    # Explicit check instead of try/assert/except: assertions are stripped
    # under ``python -O`` and the caught AssertionError carried no message.
    if len(labels) != len(cluster_data):
        logging.warning(
            "Mismatch cluster-labels: %s",
            "%i labels for %i comments" % (len(labels), len(cluster_data)))
        print(unique_labels)
        print(labels)

    data['cluster_id'] = labels
    for com_id in data.index:
        # Label-based lookup; replaces the removed ``.ix`` accessor.
        self.graph.node[com_id]['cluster_id'] = data.loc[
            com_id, 'cluster_id']
""" Main """
# Combine sensor and campus readings on their shared index; overlapping
# campus column names get the 'Time_Stamp' suffix.
data = sensorData.join(campusData, how='inner', rsuffix='Time_Stamp')

# Change attributes order
feature_columns = ['Temperature', 'Humidity', 'WindSpeed', 'SolarRadiation']
data = DataFrame(data, columns=feature_columns + ['Temp1', 'Hum1', 'Temp2', 'Hum2'])

# Drops any NaN values after removal of outliers (precautionary).
# The defaults (axis=0, how='any') are exactly what was spelled out before;
# passing ``how`` together with ``thresh`` is rejected by pandas >= 2.0.
data = data.dropna()

# Column selection plus ``.values`` replaces ``as_matrix(columns=...)``,
# which was removed in pandas 1.0.
trainData = data[feature_columns].values
resultData = data[['Temp1']].values

# Min-max scale the target to [0, 1]. Reducing along axis 0 keeps the
# one-element array shape that the builtin max()/min() produced.
maxValue = resultData.max(axis=0)
minValue = resultData.min(axis=0)
resultData = (resultData - minValue) / (maxValue - minValue)

(w0To1, w1To2) = learnProcess(trainData, resultData)
# --- Example 3: two labelled classes (class 1 sampled at two densities)
# --- plus unlabelled points. Uses n0, n1 and nu as defined earlier.
# The order of the random draws is kept unchanged so that seeded runs
# reproduce the same data.
DoubleDensity1 = APM.randint(8, 20, 2 * n1) - 0.5
DoubleDensity1[n1:] = APM.randint(8, 15, n1)
DDy = [APM.randint(0, 9) + APM.rand() for i in range(2 * n1)]

# Each class array has three rows: x, y and the class label (nan = unlabelled).
# NOTE(review): the second draw of Class0 calls a bare ``randint`` while all
# other draws use ``APM.randint`` — confirm both refer to the same generator.
Class0 = array([APM.randint(4, 13, n0),
                randint(4, 9, n0) - 0.5,
                [0] * n0])
Class1 = array([DoubleDensity1,
                DDy,
                [1] * (2 * n1)])
ClassU = array([APM.randint(8, 12, nu),
                APM.randint(3, 6, nu) + APM.randint(0, 5, nu) * 0.2,
                [nan] * nu])

# Lay the three classes side by side, one column per point.
Exdat = zeros((3, n0 + 2 * n1 + nu))
Exdat[:, :n0] = Class0
Exdat[:, n0:(n0 + 2 * n1)] = Class1
Exdat[:, (n0 + 2 * n1):] = ClassU

Example3 = DataFrame(Exdat.T, columns=['x', 'y', 'class'])
# ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
Example3Data = Example3.values

# --- Next example: class 0 on an outer ring, class 1 on an inner ring,
# --- unlabelled points in between.
n0 = 50
n1 = 200
nu = 5

Class0 = array([[(APM.rand() + 7) * cos(2 * pi * i / n0) for i in range(n0)],
                [(APM.rand() + 7) * sin(2 * pi * i / n0) for i in range(n0)],
                [0] * n0])
Class1 = array([[(4 * APM.rand() + 2) * cos(2 * pi * i / n1) for i in range(n1)],
                [(4 * APM.rand() + 2) * sin(2 * pi * i / n1) for i in range(n1)],
                [1] * n1])
ClassU = array([[(6.1 + i / nu) * cos(2 * pi * i / nu) for i in range(nu)],
                [(6.1 + i / nu) * sin(2 * pi * i / nu) for i in range(nu)],
                [nan] * nu])

Exdat = zeros((3, n0 + n1 + nu))
Exdat[:, :n0] = Class0
Exdat[:, n0:(n0 + n1)] = Class1
Exdat[:, (n0 + n1):] = ClassU
from sklearn.naive_bayes import MultinomialNB

# Load the iris data: four numeric measurements followed by the class name.
feature_columns = ['petal1', 'petal2', 'sepal1', 'sepal2']
md = genfromtxt('/home/shubh/Downloads/iris.data', delimiter=',', dtype=None)

# Build both frames in one go. Appending one single-row frame per record
# copied the whole frame on every iteration, and ``DataFrame.append`` was
# removed in pandas 2.0.
data = DataFrame([[row[0], row[1], row[2], row[3]] for row in md],
                 columns=feature_columns)
tar = DataFrame({'target': [row[4] for row in md]})

target = tar['target'].values
# Column selection plus ``.values`` replaces ``as_matrix(columns)``, which
# was removed in pandas 1.0.
feed = data[feature_columns].values

cls = MultinomialNB()
cls.fit(feed, target)

a = [4.8, 3.0, 1.4, 0.3]
# scikit-learn expects a 2-D array of samples, so the single sample is
# wrapped in a list.
ts = cls.predict([a])
print(ts)
if int(info[0]) == item_id: f.close() return info[1] line = f.readline() f.close() return None if __name__ == '__main__': #データの読み込み original_data = np.loadtxt("u.data", delimiter="\t") original_users = original_data[:,0] original_items = original_data[:,1] score = original_data[:,2] #ユーザーid,映画idのリストから重複を削除 users = np.unique(original_users) items = np.unique(original_items) #ユーザー*アイテムのデータフレーム作成 #全要素を0で初期化 df = DataFrame(np.zeros((len(users), len(items))), index=users, columns=items) #評価点の代入 for i in range(len(original_users)): df.ix[original_users[i], original_items[i]] = score[i] data = df.as_matrix() MF = MatrixFactorization() MF.fit(data) index = convert_user_id_to_index(users, 2) #ユーザーidに対応するインデックスを取得 item_index, rate = MF.predict(data, index) #ユーザーのインデックスに対してレコメンドするアイテムのインデックスを取得 item_id = convert_index_to_item_id(items, item_index) #アイテムのインデックスに対応するidを取得 print(get_item_name(item_id), rate) #レコメンドされたアイテム名を出力 print("error=%d"%MF.get_result_error(data)) #RとU^TVの二乗誤差を出力
# Tutorial notes (Python 2 syntax): build a Series with an explicit string
# index and read single elements by label.
obj2 = Series([4, 5, 6, 7], index=['d', 'b', 'a', 'c'])
print obj2
# NOTE(review): the caption says "obj2.index:" but the whole Series is
# printed — presumably ``obj2.index`` was intended; confirm.
print "obj2.index:", obj2
print obj2['a']
print obj2['d']
# The two bare strings below act as section headings. The first one reads
# "Retrieving column data"; the second names the API being discussed.
''' 列数据的获取 '''
''' DataFrame.as_matrix(columns=None) '''
import pandas as pd
data1 = pd.DataFrame(...) # initialise any DataFrame that has 3 columns
data1.columns = ['a', 'b', 'c']
# NOTE(review): the two numbered lines below are not valid Python as written;
# they look like enumerated notes rather than statements.
1. data1['b'] # this gets the values of the 2nd column (column b)
2. data1.b # same effect as 1: gets the 2nd column (column b)
# Encode every column of ``data`` with the shared encoder, fitted on that
# column's known unique values.
ndata = DataFrame()
for col in data.columns:
    encoder.fit(uniques[col])
    ndata[col] = encoder.transform(data[col])

# Simple exploratory analysis
# Bar plots: number of rows per (value, class) pair for every column.
for col in data.columns:
    agg = data.groupby([col, 'class']).count()
    sns.set_style("whitegrid")
    # First count column, taken by position. ``.iloc`` replaces ``.ix``,
    # which was removed in pandas 1.0.
    # NOTE(review): assumes the column labels are not integers, in which
    # case ``.ix`` was positional as well — confirm.
    first_count = agg.iloc[:, [0]]
    ax = sns.barplot(x=first_count.index.values, y=first_count.values.T[0])
    plt.title("Distribution of " + col)
    plt.show()

# PCA
# ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
pca_cal(standardize_dataset(ndata.values), data['class'], data.columns,
        title="PCA with normalization")

# Separate dataset into test and train sets
kf = KFold(n=len(ndata), n_folds=10, shuffle=True)
train, test = kf.get_indices()
s = Score()

total_train_error = []
total_test_error = []
for k in range(1, 200, 3):
    train_error = []
    test_error = []
    for i in range(10):
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print("round %d %d" % (k, i))
        # ``.ix`` was label-based on an integer index, so ``.loc`` is the
        # matching replacement.
        train_data = ndata.loc[train[i]]
# Per-row summary statistics of D, stored as feature columns of ``data``.
data['min'] = D.min(axis=1)
data['max'] = D.max(axis=1)
data['median'] = D.median(axis=1)
data['sum'] = D.sum(axis=1)
data['skewness'] = D.skew(axis=1)
# Mean absolute deviation around the row mean. Spelled out because
# ``DataFrame.mad`` was removed in pandas 2.0.
data['mad'] = D.sub(D.mean(axis=1), axis=0).abs().mean(axis=1)
data['var'] = D.var(axis=1)
data['sem'] = D.sem(axis=1)
data['kurt'] = D.kurt(axis=1)
data['quantile_25'] = D.quantile(0.25, axis=1)
data['quantile_50'] = D.quantile(0.50, axis=1)
data['quantile_75'] = D.quantile(0.75, axis=1)

# ``.values`` replaces ``as_matrix()``, which was removed in pandas 1.0.
df = data.values

# Pairwise L1 (Manhattan) distance matrix between the rows. One vectorised
# row at a time replaces the element-by-element double loop.
n_rows = df.shape[0]
dist = np.zeros([n_rows, n_rows])
for i in range(n_rows):
    dist[i] = np.abs(df - df[i]).sum(axis=1)

# Compute and plot first dendrogram.
fig = plt.figure(figsize=(20, 20))
ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
# NOTE(review): ``linkage`` receives the square matrix; SciPy treats a 2-D
# input as observation vectors, not as precomputed distances — confirm
# whether a condensed matrix (``squareform``) was intended.
Y = sch.linkage(dist, method='centroid')
Z1 = sch.dendrogram(Y, orientation='right')
ax1.set_xticks([])
ax1.set_yticks([])