def prepare_template_vars(self, sq_m_avg: Series, flats_ttl: DataFrame,
                          flat_disp: DataFrame) -> Dict:
    # final report variables
    pocet_bytu = len(self.data.index)
    nejdrazsi_region = sq_m_avg.idxmax()
    nejvyssi_cena = sq_m_avg.loc[nejdrazsi_region]
    nejlevnejsi_region = sq_m_avg.idxmin()
    nejnizsi_cena = sq_m_avg.loc[nejlevnejsi_region]
    nejvice_bytu = flats_ttl.idxmax()['Počet bytů']
    nejvice_bytu_pct = int(flats_ttl.loc[nejvice_bytu]['Počet bytů'] / pocet_bytu * 100)
    nejcastejsi_dispozice = flat_disp.idxmax()['Počet bytů']
    total = self.get_sum(flat_disp)
    nejcastejsi_dispozice_pct = int(
        flat_disp.loc[nejcastejsi_dispozice]['Počet bytů'] / total * 100)
    return {
        "pocet_bytu": pocet_bytu,
        "nejdrazsi_region": nejdrazsi_region.upper(),
        "nejvyssi_cena": "{:,}".format(nejvyssi_cena).replace(',', ' '),
        "nejlevnejsi_region": nejlevnejsi_region.upper(),
        "nejnizsi_cena": "{:,}".format(nejnizsi_cena).replace(',', ' '),
        "nejvice_bytu": nejvice_bytu,
        "nejvice_bytu_pct": nejvice_bytu_pct,
        "nejcastejsi_dispozice": nejcastejsi_dispozice,
        "nejcastejsi_dispozice_pct": nejcastejsi_dispozice_pct,
        "vygenerovano": datetime.now().strftime("%d.%m.%Y %H:%M")
    }
def predict(model, result_dir):
    test_gen = data_generator(test_data_dir, shuffle=False)
    proba = model.predict_generator(test_gen, nb_test_samples)
    proba_df = DataFrame(proba, index=test_gen.filenames)
    proba_df.to_csv(os.path.join(result_dir, 'proba.csv'))
    proba_df.idxmax(axis=1).to_csv(os.path.join(result_dir, 'pred.csv'))
def practice_three():
    df = DataFrame(
        [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
        index=['a', 'b', 'c', 'd'],
        columns=['one', 'two'])
    df.sum()        # column sums
    df.sum(axis=1)  # row sums
    df.mean(axis=1, skipna=False)
    '''
    axis    0 for rows, 1 for columns
    skipna  exclude missing values
    level   group-wise reduction
    '''
    df.idxmax()    # .idxmax()/.idxmin() return the index of the max/min value
    df.cumsum()    # cumulative sum
    df.describe()  # produce multiple summary statistics in one shot
    '''
    Descriptive and summary statistics
    count           number of non-NA values
    describe        compute summary statistics for a Series or each DataFrame column
    min, max        compute minimum and maximum values
    argmin, argmax  compute index positions of the min and max values
    idxmin, idxmax  compute index labels of the min and max values
    quantile        compute sample quantile (from 0 to 1)
    sum             sum of values
    mean            mean of values
    median          arithmetic median (50% quantile) of values
    mad             mean absolute deviation from the mean
    var             sample variance of values
    std             sample standard deviation of values
    skew            sample skewness (third moment) of values
    kurt            sample kurtosis (fourth moment) of values
    cumsum          cumulative sum of values
    cummin, cummax  cumulative minimum and maximum of values
    cumprod         cumulative product of values
    diff            first differences (useful for time series)
    pct_change      percent changes
    '''
    # Correlation and covariance
    '''
    .tail()
    .corr()
    .cov()
    .corrwith()
    '''
    # Unique values, value counts, and membership
    obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
    obj.unique()                             # get the unique values
    obj.value_counts()                       # count how often each value occurs
    pd.value_counts(obj.values, sort=False)  # same, usable on any array or sequence
    obj.isin(['b', 'c'])                     # vectorized set membership check
    pass
def classify_spectro_df(spectro_df: pd.DataFrame):
    """Take a spectrogram df and classify it as inputs."""

    # This checks if there's a peak characteristic of a whistle.
    def get_prominence(series):
        return scipy.signal.peak_prominences(series, [series.argmax()])[0][0]

    # Make a DataFrame to hold information we're going to use to classify what
    # this audio means.
    df = pd.DataFrame(
        data=spectro_df.idxmax(),
        columns=['peak_freq'],
    )
    df['log_peak_freq'] = np.log(df.peak_freq)
    df['prominence'] = spectro_df.apply(get_prominence)
    df['whistle'] = df.prominence > cfg['prominence_threshold']

    # Get the regions where there's whistling to parse them
    active_regions = _get_active_regions(df)
    for start, end in active_regions:
        print(f'whistle detected: ({start}, {end})')
        keyboard = pynput.keyboard.Controller()
        keyboard.press(pynput.keyboard.Key.caps_lock)
        time.sleep(0.2)
        keyboard.release(pynput.keyboard.Key.caps_lock)

    # Drop anything which has been classified
    if active_regions:
        spectro_df = spectro_df.loc[:, active_regions[-1][-1]:]

    return spectro_df, df, active_regions
def bischoff(self, df: pd.DataFrame):
    """
    :param df: DataFrame mapping party names to their total received votes and
        currently allocated seats
    :return:
    """
    preallocate_seats = df["seats"].sum()
    for _ in tqdm(range(self.MAX_SEATS - preallocate_seats)):
        for index, row in df.iterrows():
            # print(index)
            df.at[index, self.Keywords.QUOTA.value] = math.floor(
                float(row[self.Keywords.TOTAL_VOTES.value]) /
                float(row[self.Keywords.SEATS.value] + 1))
        party_with_highest_quota = df.idxmax()[self.Keywords.QUOTA.value]
        df.at[party_with_highest_quota, self.Keywords.SEATS.value] += 1
        seats_distributed = df[self.Keywords.SEATS.value].sum()
        if seats_distributed == 200:
            print("Done")
            break
        elif seats_distributed > 200:
            print("Error distributing")
            print(df)
            break
        else:
            pass
            # print(seats_distributed)
    print(df)
def get_preds_probas(est: ClassifierMixin, X_test: DataFrame, y_test: Series,
                     mapper_dict: Dict) -> DataFrame:
    """
    Get prediction probabilities (if available) or return true and predicted
    labels
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)
    if hasattr(est.named_steps["clf"], "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)
        # Append prediction and prediction probabilities
        df_summ = concat([df_preds, df_probas], axis=1)
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(0, len(np.unique(y_test)))
        ]
        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)
        # Compare .predict_proba() and manually extracted prediction
        # probability
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).eq(True).all()
    else:
        df_summ = df_preds.copy()
    # Get true label
    df_summ.insert(0, "true_label", y_test)
    return df_summ
def testSingle(self, test, fold):
    #
    # devents = xgb.DMatrix( test[ self.variables ].values )
    # prediction = DataFrame( self.models[fold].predict( devents ) )
    #
    # return DataFrame(dtype = float, data = {"predicted_class": prediction.idxmax(axis=1).values,
    #                                         "predicted_prob": prediction.max(axis=1).values } )

    devents = xgb.DMatrix(test[self.variables].values)
    prediction = DataFrame(self.models[fold].predict(devents))

    # note: this uses idxmax (the column header of the max value) and tries to convert it to a float,
    # therefore renaming of the headers should be done AFTER extracting the predicted_class
    df = DataFrame(dtype=float,
                   data={
                       "predicted_frac_class": prediction.idxmax(axis=1).values,
                       "predicted_frac_prob": prediction.max(axis=1).values
                   })

    # header renaming
    headers = []
    for i in range(0, len(prediction.columns)):
        headers.append("predicted_frac_prob_" + str(i))
    prediction.columns = headers

    # horizontal concat (adding columns)
    result = concat([prediction, df], axis=1)
    return result
def transform_majority_label(rated_annotations: pd.DataFrame) -> pd.Series:
    """
    Reduce a Pandas.DataFrame showing consistency per class per data point to a
    data point - label association.

    :param rated_annotations:
    :return: pd.Series.
    """
    return rated_annotations.idxmax(axis=1)
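# Usage sketch for transform_majority_label (the item names and counts below are
# made up): rows are data points, columns are candidate labels, values are how
# many annotators picked each label, and idxmax(axis=1) yields the majority label.
import pandas as pd

rated = pd.DataFrame(
    {"cat": [3, 0, 1], "dog": [1, 4, 1], "bird": [0, 1, 2]},
    index=["item_1", "item_2", "item_3"],
)
print(transform_majority_label(rated))
# item_1 -> cat, item_2 -> dog, item_3 -> bird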
def select_signatures(W: pd.DataFrame, H: pd.DataFrame):
    """
    Scales NMF output by sample and feature totals to select Signatures.
    ------------------------
    Args:
        * W: input W matrix (K x n_features)
        * H: input H matrix (n_samples x K)

    Returns:
        * W: output W matrix with max_id, max, and max_norm columns
        * H: output H matrix with max_id, max, and max_norm columns
    """
    Wnorm = W.copy()
    Hnorm = H.copy()

    # Scale Matrix
    for j in range(W.shape[1]):
        Wnorm.iloc[:, j] *= H.sum(1).values[j]
        Hnorm.iloc[j, :] *= W.sum(0).values[j]

    # Normalize
    Wnorm = Wnorm.div(Wnorm.sum(1), axis=0)
    Hnorm = Hnorm.div(Hnorm.sum(0), axis=1)

    H = H.T
    Hnorm = Hnorm.T

    # Get Max Values
    H_max_id = H.idxmax(axis=1, skipna=True).astype('int')
    H['max'] = H.max(axis=1, skipna=True)
    H['max_id'] = H_max_id
    Hnorm['max_norm'] = Hnorm.max(axis=1, skipna=True)

    W_max_id = W.idxmax(axis=1, skipna=True).astype('int')
    W['max'] = W.max(axis=1, skipna=True)
    W['max_id'] = W_max_id
    Wnorm['max_norm'] = Wnorm.max(axis=1, skipna=True)

    H['max_norm'] = Hnorm['max_norm']
    W['max_norm'] = Wnorm['max_norm']

    _rename = {x: 'S' + x for x in list(H)[:-3]}
    H = H.rename(columns=_rename)
    W = W.rename(columns=_rename)

    return W, H
def pred_prob_to_pred_label(y_pred_proba: pd.DataFrame) -> List[str]:
    """
    Convert a dataframe of predicted probabilities (shape (n_samples, n_classes))
    to a list of predicted classes.
    """
    if len(y_pred_proba) == 0:
        return []
    return y_pred_proba.idxmax(axis=1).tolist()
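# Usage sketch for pred_prob_to_pred_label (class names and probabilities below
# are made up): the column with the highest probability in each row becomes the
# predicted class, and an empty frame yields an empty list.
import pandas as pd

proba = pd.DataFrame({"setosa": [0.8, 0.1],
                      "versicolor": [0.1, 0.7],
                      "virginica": [0.1, 0.2]})
print(pred_prob_to_pred_label(proba))            # ['setosa', 'versicolor']
print(pred_prob_to_pred_label(proba.iloc[0:0]))  # []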
def calConsensus(processedList):
    baseList = ["A", "C", "G", "T"]
    # profileList = []
    df = DataFrame()
    for n in baseList:
        List = [i.count(n) for i in processedList]
        df[n] = Series(List)
    for value in df.idxmax(axis=1):
        print(value, end="")
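# Usage sketch for calConsensus (the input below is made up, and the
# interpretation is an assumption): each element of processedList is taken to
# hold the bases observed at one alignment position, so the printed consensus is
# the most frequent base per position.
calConsensus(["AATA", "CCCG", "GTGG", "TTTT"])  # prints ACGT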
def testSingle(self, test, fold):
    devents = xgb.DMatrix(test[self.variables].values)
    prediction = DataFrame(self.models[fold].predict(devents))
    return DataFrame(dtype=float,
                     data={
                         "predicted_class": prediction.idxmax(axis=1).values,
                         "predicted_prob": prediction.max(axis=1).values
                     })
def _from_categorical(data: DataFrame, mapping: dict) -> DataFrame:
    """
    Based on the mapping computed with the _categorical_mapping function on a
    similar dataset, converts the encoded data back into the initial data.

    :param data: dataset to be converted back to its initial form
    :param mapping: the mapping computed with the _categorical_mapping function
    :return: reverted dataset
    """
    categories = data.idxmax(axis=1)  # get the categories
    return DataFrame([mapping[c] for c in categories
                      ])  # easily construct the dataframe from the list of mapped values
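# Usage sketch for _from_categorical (the mapping and values below are made up):
# one-hot columns are collapsed with idxmax and looked up in the mapping assumed
# to come from the _categorical_mapping step.
import pandas as pd

encoded = pd.DataFrame([[1, 0, 0], [0, 0, 1], [0, 1, 0]], columns=[0, 1, 2])
mapping = {0: "red", 1: "green", 2: "blue"}
print(_from_categorical(encoded, mapping))  # rows: red, blue, green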
def predict_ovr(newX=None, n_jobs=1):
    global _X
    global _Y
    global _newX
    global _coefs_ovr

    _newX = _X if newX is None else newX
    classes = np_sort(unique(_Y))
    with Pool(n_jobs) as pool:
        preds = pool.map(_predict_class_ovr, classes)
    preds = DataFrame(dict(zip(classes, preds)))
    return array(preds.idxmax(axis="columns"))
def undo_one_hot(
    df: pd.DataFrame, new_column_name: Optional[str] = None
) -> Union[pd.Series, pd.DataFrame]:
    """Undo one-hot encoding."""
    # we have to overwrite the column names because `idxmax` uses the column names
    df.columns = pd.Index(range(df.shape[1]))
    result = df.idxmax(axis="columns")
    if new_column_name is not None:
        return result.to_frame(name=new_column_name)
    else:
        return result
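# Usage sketch for undo_one_hot (column names below are made up): the column
# names are replaced by positional integers, so the result holds the position of
# the active column in each row.
import pandas as pd

one_hot = pd.DataFrame({"is_red": [1, 0], "is_green": [0, 1]})
print(undo_one_hot(one_hot))                              # Series: 0, 1
print(undo_one_hot(one_hot, new_column_name="color_id"))  # same values, one-column DataFrame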
def classify_sentences(sentences, model):
    if model == 'infersent':
        model = load_model('data/infersent_model.pkl',
                           glove_path='data/glove.840B.300d.txt',
                           infersent_path='data/infersent.allnli.pickle')
    else:
        model = load_model('data/bow_model.pkl',
                           glove_path=None,
                           infersent_path=None)
    df = DataFrame(model.predict_proba(sentences), columns=model.classes_)
    df['max'] = df.idxmax(axis=1)
    return df
def _to_labels(probabilities: pd.DataFrame) -> pd.Series:
    labels = probabilities.idxmax(axis='columns')

    # Find places where there are multiple maximum values
    max_probabilities = probabilities.max(axis='columns')
    is_max: pd.DataFrame = probabilities.eq(max_probabilities, axis='rows')
    number_of_max: pd.Series = is_max.sum(axis='columns')
    multiple_max: pd.Series = number_of_max.gt(1)

    # Set those locations as an 'undecided' label
    labels[multiple_max] = 'undecided'

    # TODO: emit a warning if any are set to 'undecided'
    return labels
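# Usage sketch for _to_labels (probabilities below are made up): the second row
# ties between the two columns and is therefore labelled 'undecided'.
import pandas as pd

probs = pd.DataFrame({"spam": [0.9, 0.5], "ham": [0.1, 0.5]})
print(_to_labels(probs))  # spam, undecided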
def experimentDropoutRate(self, args):
    # Load data
    X_train, X_dev, X_test, Y_train, Y_dev, classes = Data().load_data()
    nb_features = X_train.shape[1]
    print(nb_features, 'features')
    nb_classes = Y_train.shape[1]
    print(nb_classes, 'classes')
    args.epochs = 100
    model = None
    folds = Data().cross_validation_split(X_train, Y_train)
    metrics_per_set = DataFrame()
    for rate in arange(0.4, 0.9, 0.1):
        print("\nEvaluating dropout rate ", str(rate))
        count = 0
        mean_of_folds = DataFrame()
        for fold in folds:
            print("Training holding fold", str(count), "out..")
            if count == len(folds) - 1:
                early_stopping_fold = folds[0]
            else:
                early_stopping_fold = folds[count + 1]
            union_set = Data().construct_union_set(
                fold.copy(), early_stopping_fold.copy(), folds.copy())
            model = Classifier(type='DropoutAdam',
                               nb_features=nb_features,
                               nb_classes=nb_classes,
                               epochs=args.epochs,
                               batch_size=32,
                               classes=classes,
                               run_number=args.run,
                               rate=rate)
            model.fit(X_train=union_set[0],
                      Y_train=union_set[1],
                      X_dev=early_stopping_fold[0],
                      Y_dev=early_stopping_fold[1])
            mean_of_folds["Fold " + str(count + 1)] = model.predict(
                X_test=fold[0], X_dev=fold[0], Y_dev=fold[1])
            count += 1
        if model is not None:
            metrics_per_set["Rate: " + str(rate)] = mean_of_folds.mean(axis=1)
    # metrics_per_set['cols'] = ['acc', 'prec', 'rec', 'f1']
    print(metrics_per_set)
    print(metrics_per_set.idxmax(axis=1))
def bayesclass_predict(Class, model, data):
    x = data
    k = model.predict(x)
    df = DataFrame(index=Class.index.values, columns=x.index.values)
    for i in Class.index.values:
        df.loc[i] = norm.logpdf(x=np.ravel(k),
                                loc=Class.Mean.ix[i],
                                scale=Class.Variance.ix[i])
    condition = np.ravel([df.max() > -50])
    j = np.round(np.ravel(k))
    j = j * (~condition)
    j = j + np.ravel(df.idxmax()) * condition
    return j
def highest_density_interval(posteriors: pd.DataFrame, p=0.9) -> pd.DataFrame:
    """
    Get HDI

    posteriors: pandas DataFrame of posteriors
    p: confidence interval
    """
    # If we pass a DataFrame, just call this recursively on the columns
    if isinstance(posteriors, pd.DataFrame):
        return pd.DataFrame(
            [
                highest_density_interval(posteriors[col], p=p)
                for col in posteriors
            ],
            index=posteriors.columns,
        )

    cumsum = np.cumsum(posteriors.values)

    # N x N matrix of total probability mass for each low, high
    total_p = cumsum - cumsum[:, None]

    # Return all indices with total_p > p
    lows, highs = (total_p > p).nonzero()

    # Find the smallest range (highest density)
    best = (highs - lows).argmin()
    low = posteriors.index[lows[best]]
    most_likely = posteriors.idxmax(axis=0)
    high = posteriors.index[highs[best]]

    return pd.Series(
        [most_likely, low, high],
        index=["most_likely", f"low_{p*100:.0f}", f"high_{p*100:.0f}"],
    ).round(2)
import numpy as np
from pandas import Series, DataFrame
from numpy.random import randn
import matplotlib.pyplot as plt

array1 = np.array([[10, np.nan, 20], [30, 40, np.nan]])
print array1

df1 = DataFrame(array1, index=[1, 2], columns=list('ABC'))
print df1

# sum()
print "Sum of cols", df1.sum()  # sums along each column
print df1.sum(axis=1)           # sums along each index (row)

print "Min", df1.min()
print "Max", df1.max()
print df1.idxmax()
print df1.cumsum()
print df1.describe()

df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC'))
print df2
plt.plot(df2)
plt.legend(df2.columns, loc="lower right")
plt.savefig('samplepic.png')
plt.show()

series1 = Series(list('abcccaabd'))
print series1.unique()
print series1.value_counts()
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df)
print('\n')
print(df.sum())
print('\n')
print(df.sum(axis=1))
print('\n')
print(df.mean())
print('\n')
print(df.mean(axis=1, skipna=False))
print('\n')
print(df.idxmax())
print('\n')
print(df.cumsum())
print('\n')
print(df.cumsum(axis=1))
print('\n')
print(df.describe())
print('\n')

###############################################################

obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj)
print(obj.describe())
print('\n')
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum()        # column sums
df.sum(axis=1)  # sum row by row
df
(7.10 - 4.5) / 2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum()     # accumulation
df.describe()   # multiple summary statistics in one shot

obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()

## Correlation and Covariance
import pandas.io.data as web

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')

price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
frame.loc['a':'d', 'STL':]
frame.iloc[0:3, 1:2]
frame['UMST'] = 4
frame.reindex(index=['c', 'e', 'a'], columns=['UM', 'Washu'])
frame[frame < 0] = np.nan
frame.isnull()
frame.dropna()
frame.dropna(axis=1)
um = frame['UM']
um[um.notnull()]
frame.fillna(method='ffill', axis=0, limit=1, inplace=False)
frame.fillna(method='ffill', axis=1, limit=1)
frame.mean()
frame.mean(axis=1, skipna=False)
frame.idxmin()
frame.idxmax(axis=1)

frame2 = DataFrame(
    {
        'Washu': np.random.randn(5),
        'UM': np.random.randn(5),
        'UMST': np.random.randn(5)
    },
    index=list('abcde'))

frame3 = DataFrame({
    'a': {
        'Washu': 1,
        'UM': 3
    },
    'b': {
        'Washu': 2,
        'UM': 4
def centered_plot(best_values: pd.DataFrame, best_names: pd.DataFrame):
    # Things are normalized so classifiers are centered at normscore 0
    # Hence we only have to plot best_selector vs best_baseline
    best_values = best_values.drop(columns='classifiers')
    best_names = best_names.drop(columns='classifiers')

    selector_normscores = best_values['selectors'].values
    baseline_normscores = best_values['baselines'].values

    # We choose the label name of whichever performed better
    labels = [
        best_names.loc[task, category]
        for task, category in best_values.idxmax(axis=1).items()
    ]

    # Assign colors so selectors and baselines are visually distinct
    selector_names = set(filter(lambda name: 'selector' in name, labels))
    baseline_names = set(filter(lambda name: 'baseline' in name, labels))

    selector_colors = sns.color_palette('viridis_r', len(selector_names) * 2)
    baseline_colors = sns.color_palette('rocket', len(baseline_names))

    cmap = {
        **{name: selector_colors[i] for i, name in enumerate(selector_names)},
        **{name: baseline_colors[i] for i, name in enumerate(baseline_names)},
    }
    colors = [cmap[label] for label in labels]

    figsize = (8, 10)
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(1, 1, 1)

    xlims = (-1.3, 1.05)
    ylims = (-1.3, 1.8)
    ax.set_xlim(xlims)
    ax.set_ylim(ylims)

    # Axis lines within box of radius 1
    ax.plot((xlims[0], 1), (0, 0), c='black', linestyle=':', linewidth=0.5)
    ax.plot((0, 0), (ylims[0], 1), c='black', linestyle=':', linewidth=0.5)

    # Horizontal Oracle line
    ax.plot((xlims[0], 1), (1, 1), c='black', linestyle=':', linewidth=1.0)

    # Vertical line for outside box
    ax.plot((1, 1), (ylims[0], ylims[1]), c='black', linestyle=':', linewidth=0.5)

    # Diagonal line for marking which side is better
    ax.plot((xlims[0], 1), (ylims[0], 1), c='grey', linestyle='--', linewidth=0.2)

    # Text indicating the regions
    offsets = (0.1, 0.05)
    ax.text(0 + 0.3 + offsets[0], ylims[0] + offsets[1],
            "best baseline < single best\nbest selector > single best",
            fontsize=8)
    ax.text(xlims[0] + offsets[0], ylims[0] + offsets[1],
            "best baseline < single best\nbest selector < single best",
            fontsize=8)
    ax.text(xlims[0] + offsets[0], 0 + offsets[1],
            "best baseline > single best\nbest selector < single best",
            fontsize=8)
    ax.text(0 + 0.3 + offsets[0], 0 + offsets[1],
            "best baseline > single best\nbest selector > single best",
            fontsize=8)
    ax.text(0 + offsets[0], 1 + offsets[1], "baseline > oracle", fontsize=8)

    legend_lines = [
        Line2D([0], [0],
               color='w',
               marker='o',
               markerfacecolor=col,
               label=name.replace('_', ' '))
        for name, col in cmap.items()
    ]
    ax.legend(handles=legend_lines)

    ax.scatter(x=selector_normscores, y=baseline_normscores, c=colors)

    ax.set_xlabel('Selector normalized score')
    ax.set_ylabel('Baseline normalized score')
    # ax.axes.set_aspect('equal')
    ax.set_title('Selector/Baseline performances for 62 Datasets')

    return fig
def umap_dataset_properties_selectors_baselines(best_values: pd.DataFrame,
                                                best_names: pd.DataFrame,
                                                cached_metaprops: str,
                                                random_state=5):
    if len(best_values.index) < 4:
        print("Can't create meaningful UMAP of less than 4 points")
        return

    if os.path.exists(cached_metaprops):
        df_metaprops = pd.read_csv(cached_metaprops, index_col=0)
    else:
        tasks = list(map(int, best_values.index))
        dataset_ids = [
            openml.tasks.get_task(task).dataset_id for task in tasks
        ]

        # This will take a while to get
        # Hence the caching
        dataset_metaprops = [
            openml.datasets.get_dataset(dataset_id).qualities
            for dataset_id in dataset_ids
        ]
        available_keys = reduce(
            lambda acc, metaprops: acc.intersection(metaprops.keys()),
            dataset_metaprops, set(dataset_metaprops[0].keys()))

        dict_metaprops = {
            k: [metaprop[k] for metaprop in dataset_metaprops]
            for k in available_keys
        }
        df_metaprops = pd.DataFrame.from_dict(dict_metaprops,
                                              orient='index',
                                              columns=tasks)
        df_metaprops.to_csv(cached_metaprops)

    # Drop features that have more than 30% missing
    cut_percentage = 0.00  # Most features have 0%, 12% or 67% missing
    for row in df_metaprops.index:
        missing = sum(df_metaprops.loc[row].isnull()) / len(df_metaprops.loc[row])
        if missing > cut_percentage:
            df_metaprops.drop(index=row, inplace=True)

    # Convert the rest of the nans to the mean (8/62 had 24/48 missing features)
    df_metaprops = df_metaprops.apply(lambda row: row.fillna(row.mean()), axis=1)
    df_metaprops = df_metaprops.T  # Make the tasks be on the index

    # Scale data according to UMAP's recommendation
    df_scaled_metaprops = StandardScaler().fit_transform(df_metaprops)

    # Use UMAP to produce embedding
    # Doesn't really make sense with a low number of datasets
    n_datasets = len(df_metaprops)
    K = math.ceil(n_datasets / 2) if n_datasets < 20 else 10

    umapper = UMAP(n_neighbors=K, random_state=random_state)
    embeddings = umapper.fit_transform(df_scaled_metaprops)

    figsize = (10, 12)
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(1, 1, 1)

    # Take out classifiers
    best_values = best_values.drop(columns='classifiers')
    best_names = best_names.drop(columns='classifiers')

    # We choose the label name of whichever performed better
    labels = [
        best_names.loc[task, category]
        for task, category in best_values.idxmax(axis=1).items()
    ]

    # Assign colors so selectors and baselines are visually distinct
    selector_names = set(filter(lambda name: 'selector' in name, labels))
    baseline_names = set(filter(lambda name: 'baseline' in name, labels))

    selector_colors = sns.color_palette('viridis_r', len(selector_names) * 2)
    baseline_colors = sns.color_palette('rocket', len(baseline_names))

    cmap = {
        **{name: selector_colors[i] for i, name in enumerate(selector_names)},
        **{name: baseline_colors[i] for i, name in enumerate(baseline_names)},
    }
    colors = [cmap[label] for label in labels]

    ax.scatter(embeddings[:, 0], embeddings[:, 1], c=colors)

    legend_lines = [
        Line2D([0], [0],
               color='w',
               marker='o',
               markerfacecolor=col,
               label=name.replace('_', ' '))
        for name, col in cmap.items()
    ]
    ax.legend(handles=legend_lines)

    ax.set_xlabel('UMAP axis 1')
    ax.set_ylabel('UMAP axis 2')
    ax.set_title('UMAP projection of dataset meta-features - Selectors / AutoML')

    return fig
d    0.5
'''
print 'other functions'
print df
'''
   one  two
a  1.0  NaN
b  7.0  4.0
c  NaN  NaN
d  0.0  1.0
'''
print df.idxmax()  # index of the max value in each column
'''
one    b
two    b
'''
print df.cumsum()  # cumulative sum of each column
'''
   one  two
a  1.0  NaN
b  8.0  4.0
c  NaN  NaN
d  8.0  5.0
'''
print df.describe()  # compute summary statistics for each DataFrame column
from pandas import DataFrame

data = {
    'Speed': [101, 109, 106],
    'Temp': [34, 32, 45],
    'Humidity': [4500, 2300, 5800]
}
frame = DataFrame(data)
print(frame)

print(frame.sum())        # sum of each column
print(frame.sum(axis=1))  # sum of each row
print(frame.idxmax())     # index of the maximum value in each column
print(frame.idxmin())     # index of the minimum value in each column
dframe1 = DataFrame(arr, index=["A", "B"], columns=["One", "Two", "Three"])
dframe1

# Sum method
dframe1.sum()        # ignores null values (treats them as 0s)
dframe1.sum(axis=1)  # sum across rows

# Min method
dframe1.min()        # finds the minimum value in each column
dframe1.min(axis=1)  # minimum value of each row
dframe1.idxmin()     # find the index of the minimum value in each column

# Max method
dframe1.max()
dframe1.idxmax()

# Cumulative sum
dframe1.cumsum()     # accumulates along each column's values

# Describe method
dframe1.describe()   # summary statistics of dataframe (by columns)

# correlation and covariance
import pandas.io.data as pdweb
# import pandas_datareader.data as pdweb
import datetime

prices = pdweb.get_data_yahoo(
    ["CVX", "XOM", "BP"],
    start=datetime.datetime(2010, 1, 1),
    end=datetime.datetime(2013, 1, 1))
df1 = DataFrame(dic1)

# calculating the sum of individual columns
df1.sum()

# calculating the sum of individual rows
df1.sum(axis=1)  # here axis=1 represents the horizontal axis

# calculating the maximum value of each individual column;
# the result is displayed as index labels
df1.idxmax()

# similarly, the minimum value of each individual column
df1.idxmin()

# fundamental operations on DataFrames like addition, subtraction, etc.
dic2 = {
    "cse": [10, 13, 11],
    "maths": [11, 14, 17],
    "english": [5, 7, 9],
    "ece": [11, 13, 15]
}
df2 = DataFrame(dic2)
dframe1

# Let's see the sum() method in action
dframe1.sum()

# Notice how it ignores NaN values
dframe1.sum(axis=1)

# Can also grab min and max values of the dataframe
dframe1.min()

# As well as their index
dframe1.idxmin()
dframe1.idxmax()
dframe1.max()

dframe1

# Can also do an accumulation sum
dframe1.cumsum()

# A very useful feature is describe, which provides summary statistics
describe = dframe1.describe()

# We can also get information on correlation and covariance
# For more info on correlation and covariance, check out the videos below!
from IPython.display import YouTubeVideo
YouTubeVideo('xGbpuFNR1ME')
def main():
    """ Calculation and aggregation of summary statistics """

    # Summary statistics
    # the return value is not an ndarray
    df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                    [np.nan, np.nan], [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1)  # excludes nan
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()

    # values that are not numbers
    obj = Series(list('aabc') * 4)
    print obj.describe()

    methods = ['count', 'min', 'max',
               # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']
    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correlation and covariance
    all_data = {}
    lst = []  # ['AAPL', 'IBM', 'MSFT', 'GOOG']
    for ticket in lst:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')

    price = DataFrame({tic: data['Adj Close']
                       for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume']
                        for tic, data in all_data.iteritems()})

    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # unique values, frequency, membership
    print '', ''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)

    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]

    data = DataFrame({
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    })
    print data
    print data.apply(pd.value_counts).fillna(0)
# check whether the index is unique
obj.index.is_unique

# if an index maps to multiple values, that index returns multiple values
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])

# over columns
df.sum()
# over rows
df.sum(axis=1)

# NA values are excluded by default, but this can be disabled with skipna
df.mean(axis=1, skipna=False)

# return the index of the maximum value
df.idxmax()

# cumulative sum
df.cumsum()
df.describe()

# correlation coefficients
returns.MSFT.corr(returns.IBM)
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)

## Unique values, value counts, and membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()

# count occurrences
obj.value_counts()
# value_counts sorts by default, but sorting can also be turned off
### Descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum()
df.sum(axis=1)  # NB: for this one NaNs are treated as 0
df.cumsum()
df.mean(axis=1, skipna=False)
df.describe()   # also works on other objects
df.idxmax()     # returns the index label of the max

### Handling Missing Data
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()
string_data[0] = None
string_data.isnull()

data = Series([1, NA, 3.5, NA, 7])
data.dropna()
data[data.notnull()]  # another way to do it

data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()  # row-wise
data
# -*- coding:utf-8 -*-
import numpy as np
from pandas import Series, DataFrame

print('Sums')
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df)
print(df.sum())         # column sums
print(df.sum(axis=1))   # row sums

print('Means')
print(df.mean(axis=1, skipna=False))
print(df.mean(axis=1))

print('Others')
print(df.idxmax())
print(df.cumsum())
print(df.describe())

obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())
# -*- coding: utf-8 -*-
import numpy as np
from pandas import Series, DataFrame

print 'Sums'
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print df
print df.sum()          # column sums
print df.sum(axis=1)    # row sums
print

print 'Means'
print df.mean(axis=1, skipna=False)
print df.mean(axis=1)
print

print 'Others'
print df.idxmax()
print df.cumsum()
print df.describe()

obj = Series(['a', 'a', 'b', 'c'] * 4)
print obj.describe()