class Player:
    def __init__(self, first_name, last_name, id):
        self.first_name = first_name
        self.last_name = last_name
        self.id = id
        self.hrs = [0, 0, 0, 0, 0, 0]  # One for each month of the game
        self.hr_total = 0
        self.hr_series = Series()
        self.hr_total_series = Series()

    def __str__(self):
        return str.format('{0} : {1}', self.id, self.last_name)

    def __repr__(self):
        return self.__str__()

    def add_hrs(self, count, date):
        self.hr_total += count
        self.hr_total_series[date] = self.hr_series.sum() + count
        if self.hr_series.last_valid_index() == date:
            self.hr_series[date] = count + self.hr_series[date]
        else:
            self.hr_series[date] = count

    def name(self):
        return self.first_name + " " + self.last_name

    def get_player_hr_dataframe(self):
        return self.hr_series.to_frame(self.name())

    def get_player_hr_total_dataframe(self):
        return self.hr_total_series.to_frame(self.name())
def test_constructor_from_dense_series(self):
    # GH 19393
    # series with name
    x = Series(np.random.randn(10000), name='a')
    result = SparseDataFrame(x)
    expected = x.to_frame().to_sparse()
    tm.assert_sp_frame_equal(result, expected)

    # series with no name
    x = Series(np.random.randn(10000))
    result = SparseDataFrame(x)
    expected = x.to_frame().to_sparse()
    tm.assert_sp_frame_equal(result, expected)
def save_counts(file_read, file_write):
    data = df.from_csv(file_read)
    answer_count = data.groupby(['language'])['answer_count'].sum().to_frame()
    question_count = data.groupby(['language'])['post_type'].count().to_frame()
    answer_count = answer_count.reset_index()
    question_count = question_count.reset_index()
    question_count.columns = ['language', 'question_count']
    # merge the two counts on the shared 'language' column
    merge = df.merge(question_count, answer_count, on='language')
    merge['total'] = merge.question_count + merge.answer_count
    merge['ratio'] = merge.answer_count / merge.question_count
    merge.to_csv(file_write)
def test_equals_None_vs_float():
    # GH#44190
    left = Series([-np.inf, np.nan, -1.0, 0.0, 1.0, 10 / 3, np.inf], dtype=object)
    right = Series([None] * len(left))

    # these series were found to be equal due to a bug; check that they are
    # correctly found to be not equal
    assert not left.equals(right)
    assert not right.equals(left)
    assert not left.to_frame().equals(right.to_frame())
    assert not right.to_frame().equals(left.to_frame())
    assert not Index(left, dtype="object").equals(Index(right, dtype="object"))
    assert not Index(right, dtype="object").equals(Index(left, dtype="object"))
def match_data(bench: pd.Series, *args):
    """Align several pandas.Series on a common time index.

    Parameters
    ----------
    bench
        The benchmark series whose index the others are aligned to.
    args
        The series to align.

    Returns
    -------
    pandas.DataFrame
        The aligned DataFrame.
    """
    assert isinstance(bench.index, pd.DatetimeIndex), \
        "Index should be pandas.DatetimeIndex!"
    res = bench.to_frame()
    for series in args:
        res[series.name] = series
    return res
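# Usage sketch for match_data above (hypothetical data, not from the source):
# series passed after the benchmark are aligned onto the benchmark's
# DatetimeIndex, with NaN where a series has no value for a date.
import pandas as pd

idx = pd.date_range("2021-01-01", periods=5, freq="D")
bench = pd.Series(range(5), index=idx, name="bench")
other = pd.Series([1.0, 2.0], index=idx[:2], name="other")
aligned = match_data(bench, other)  # columns ["bench", "other"], NaN-padded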
def simulate_new_housing(
    indiv_hh: DataFrame,
    archetype_new_build: pd.Series,
    percentage_demand_met: float,
    projected_la_housing_demand: DataFrame,
    year: int,
    random_state: int = 42,
) -> DataFrame:
    if percentage_demand_met == 0:
        new_housing = pd.DataFrame()
    else:
        annual_demand = projected_la_housing_demand.loc[year]
        new_housing_by_la: List[DataFrame] = []
        for la, demand in annual_demand.items():
            la_total_new_buildings = int(demand * percentage_demand_met)
            la_new_housing = (
                indiv_hh.query("local_authority == @la")
                .sample(la_total_new_buildings, random_state=random_state)
                .loc[:, ["local_authority", "EDNAME"]]
            )
            new_housing_by_la.append(la_new_housing)
        new_housing_raw = pd.concat(new_housing_by_la).reset_index()
        # broadcast the archetype to the same length as the new housing
        archetype_properties = archetype_new_build.to_frame().T
        archetype_broadcast = pd.concat(
            [archetype_properties] * len(new_housing_raw)
        ).reset_index()
        new_housing = pd.concat(
            [new_housing_raw, archetype_broadcast], axis=1
        ).set_index("SMALL_AREA")
    return new_housing
def test_shift_dt64values_int_fill_deprecated(self):
    # GH#31971
    ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")])
    df = ser.to_frame()

    with tm.assert_produces_warning(FutureWarning):
        result = df.shift(1, fill_value=0)
    expected = Series([pd.Timestamp(0), ser[0]]).to_frame()
    tm.assert_frame_equal(result, expected)

    # axis = 1
    df2 = DataFrame({"A": ser, "B": ser})
    df2._consolidate_inplace()

    with tm.assert_produces_warning(FutureWarning):
        result = df2.shift(1, axis=1, fill_value=0)
    expected = DataFrame({"A": [pd.Timestamp(0), pd.Timestamp(0)], "B": df2["A"]})
    tm.assert_frame_equal(result, expected)

    # same thing but not consolidated
    # This isn't great that we get different behavior, but
    # that will go away when the deprecation is enforced
    df3 = DataFrame({"A": ser})
    df3["B"] = ser
    assert len(df3._mgr.arrays) == 2
    result = df3.shift(1, axis=1, fill_value=0)
    expected = DataFrame({"A": [0, 0], "B": df2["A"]})
    tm.assert_frame_equal(result, expected)
def to_csv(self, path: str, file_name: str, data_: pd.Series):
    data_path_ = os.path.join(path, file_name + '.csv')
    data_df = data_.to_frame().T
    header = not os.path.exists(data_path_)
    data_df.to_csv(data_path_, mode='a', header=header)
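# Usage sketch for the to_csv method above (hypothetical writer object and
# paths, illustrative only): each call appends the Series as one row, and the
# header is written only when the file does not exist yet.
#
#   writer.to_csv("/tmp/reports", "daily_stats",
#                 pd.Series({"open": 1.0, "close": 2.0}, name="2021-01-01"))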
def test_imethods_with_dups(self):
    # GH6493
    # iat/iloc with dups
    s = Series(range(5), index=[1, 1, 2, 2, 3], dtype="int64")
    result = s.iloc[2]
    assert result == 2
    result = s.iat[2]
    assert result == 2

    msg = "index 10 is out of bounds for axis 0 with size 5"
    with pytest.raises(IndexError, match=msg):
        s.iat[10]
    msg = "index -10 is out of bounds for axis 0 with size 5"
    with pytest.raises(IndexError, match=msg):
        s.iat[-10]

    result = s.iloc[[2, 3]]
    expected = Series([2, 3], [2, 2], dtype="int64")
    tm.assert_series_equal(result, expected)

    df = s.to_frame()
    result = df.iloc[2]
    expected = Series(2, index=[0], name=2)
    tm.assert_series_equal(result, expected)

    result = df.iat[2, 0]
    assert result == 2
def get_trend_variables(s: pd.Series, field_name: str, alpha: float = 0.25) \
        -> (pd.DataFrame, list):
    d = s.to_frame(name=field_name)
    ema_df = pd.DataFrame()
    new_name_trend_strength = field_name + '_trend_strength'
    new_name_trend_strength_weighted = field_name + '_trend_strength_weighted'
    range_len = 17
    index_range = range(1, range_len)
    for exp in index_range:
        h1 = alpha * (1 / exp)
        h2 = alpha * (1 / (exp + 1))
        ema1 = d[field_name].ewm(alpha=h1).mean()
        ema2 = d[field_name].ewm(alpha=h2).mean()
        ema_df[str(exp)] = ema1 / ema2
        # ema_df[col_name] = (ema_df[col_name] / ema_df[col_name].shift(1)).fillna(1)

    # for exp in range(1, 17):
    #     ema_df[str(exp)] = d[field_name].rolling(exp * 2).mean()
    #     ema_df[str(exp)] = (ema_df[str(exp)] / ema_df[str(exp)].shift(1)).fillna(1)

    # weights = [2**(v - 1) for v in index_range]
    weights = [v for v in index_range]
    weights = pd.Series(list(map(lambda x: x / sum(weights), weights)),
                        index=ema_df.columns.values)
    d[new_name_trend_strength_weighted] = ema_df.subtract(-1).dot(weights)

    neg = np.sum(ema_df.values < 1, axis=1)
    pos = np.sum(ema_df.values > 1, axis=1)
    # creates an index, 0-1, of trend strength
    d[new_name_trend_strength] = ((pos - neg) + range_len) / (2 * range_len)
    return d, [new_name_trend_strength, new_name_trend_strength_weighted]
def return_pv(pv: pd.Series, shr_mem: list, prog_mem: list, coords: tuple, i: int):
    """Convert pv back to an xarray-compatible frame (adding lat and lon),
    save it to shr_mem, and update and draw the progress bar.

    Parameters
    ----------
    pv : pandas.Series
        series containing the calculated pv values
    shr_mem : list
        shared memory where all the calculated pv time series are stored
    prog_mem : list
        list indicating the overall progress of the computation; the first
        value ([0]) is the total number of coordinate tuples to compute
    coords : tuple
        coordinates of the pv station (lat, lon)
    i : int
        index where in shr_mem to save pv, unique for every coordinate tuple
    """
    pv = pv.to_frame()
    pv.columns = ["pv"]
    pv = pv.reset_index()
    pv["lat"] = coords[0]
    pv["lon"] = coords[1]
    pv = pv.set_index(["lon", "lat", "time"])
    shr_mem[i] = pv.to_xarray()
    prog_mem.append(1)
    len_coord_list = prog_mem[0]
    progress_bar(len(prog_mem), len_coord_list)
def test_to_sql_series(self):
    s = Series(np.arange(5, dtype='int64'), name='series')
    sql.to_sql(s, "test_series", self.conn, flavor='sqlite', index=False)
    s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn,
                            flavor='sqlite')
    tm.assert_frame_equal(s.to_frame(), s2)
def calc_daily_qty(final_qty: float, trades: pd.Series,
                   start_date: datetime, end_date: datetime) -> pd.Series:
    """Calculate the daily position quantities from the final quantity and
    the trades that occurred during the period."""
    df = pd.concat([
        pd.DataFrame(data={'position': [np.nan, final_qty]},
                     index=[start_date, end_date]),
        trades.to_frame('trade_qty')
    ])  # type: pd.DataFrame
    df.sort_index(inplace=True)
    df = df.resample('1D').sum()
    df.index.name = 'dt'
    df.reset_index(inplace=True)

    # Global fillna won't work with pandas 0.18:
    # https://github.com/pandas-dev/pandas/issues/7630
    df['trade_qty'].fillna(0, inplace=True)
    df['position'].fillna(0, inplace=True)

    # FIXME: looping is not nice
    # https://stackoverflow.com/questions/34855859/
    # is-there-a-way-in-pandas-to-use-previous-row-value-
    # in-dataframe-apply-when-previ
    for i in reversed(range(len(df) - 1)):
        df.loc[i, 'position'] = \
            df.loc[i + 1, 'position'] - df.loc[i + 1, 'trade_qty']

    df.index = df['dt']
    df.index.name = None
    return df['position']
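# Usage sketch for calc_daily_qty above (hypothetical data): with a final
# position of 10 and a single buy of 4 during the period, the daily series is
# reconstructed backwards from the final quantity.
import pandas as pd
from datetime import datetime

trades = pd.Series([4.0], index=[datetime(2021, 1, 3)])
daily = calc_daily_qty(10.0, trades,
                       datetime(2021, 1, 1), datetime(2021, 1, 5))
# daily covers every calendar day; days before the trade show 6.0 (10 - 4)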
def object2proto(obj: pd.Series) -> PandasSeries_PB:
    """Convert a pd.Series to a PandasSeries_PB using pyarrow.

    Args:
        obj: target Series

    Returns:
        Serialized version of the Series, which will be used for
        reconstruction.
    """
    # https://arrow.apache.org/docs/python/pandas.html
    # A series must either be converted to a dataframe or use pa.Array;
    # however, pa.Array requires accounting for the null values yourself.
    dataframe = obj.to_frame()
    schema = pa.Schema.from_pandas(dataframe)
    table = pa.Table.from_pandas(dataframe)
    sink = pa.BufferOutputStream()

    writer = pa.ipc.new_file(sink, schema)
    writer.write(table)
    writer.close()

    buf = sink.getvalue()
    siz = len(buf)
    df_bytes = pa.compress(buf, asbytes=True)
    return PandasSeries_PB(series=df_bytes, decompressed_size=siz)
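# A hedged sketch of the reverse direction (assumes PandasSeries_PB carries
# the compressed IPC bytes and the decompressed size exactly as set above;
# this is an illustration, not the library's confirmed deserializer):
def proto2object_sketch(proto: PandasSeries_PB) -> pd.Series:
    buf = pa.decompress(proto.series,
                        decompressed_size=proto.decompressed_size,
                        asbytes=True)
    reader = pa.ipc.open_file(buf)
    dataframe = reader.read_pandas()
    # the Series was serialized as a single-column DataFrame
    return dataframe[dataframe.columns[0]]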
def test_imethods_with_dups(self):
    # GH6493
    # iat/iloc with dups
    s = Series(range(5), index=[1, 1, 2, 2, 3], dtype='int64')
    result = s.iloc[2]
    assert result == 2
    result = s.iat[2]
    assert result == 2

    msg = "index 10 is out of bounds for axis 0 with size 5"
    with pytest.raises(IndexError, match=msg):
        s.iat[10]
    msg = "index -10 is out of bounds for axis 0 with size 5"
    with pytest.raises(IndexError, match=msg):
        s.iat[-10]

    result = s.iloc[[2, 3]]
    expected = Series([2, 3], [2, 2], dtype='int64')
    tm.assert_series_equal(result, expected)

    df = s.to_frame()
    result = df.iloc[2]
    expected = Series(2, index=[0], name=2)
    tm.assert_series_equal(result, expected)

    result = df.iat[2, 0]
    assert result == 2
def test_imethods_with_dups(self):
    # GH6493
    # iat/iloc with dups
    s = Series(range(5), index=[1, 1, 2, 2, 3], dtype='int64')
    result = s.iloc[2]
    assert result == 2
    result = s.iat[2]
    assert result == 2

    pytest.raises(IndexError, lambda: s.iat[10])
    pytest.raises(IndexError, lambda: s.iat[-10])

    result = s.iloc[[2, 3]]
    expected = Series([2, 3], [2, 2], dtype='int64')
    tm.assert_series_equal(result, expected)

    df = s.to_frame()
    result = df.iloc[2]
    expected = Series(2, index=[0], name=2)
    tm.assert_series_equal(result, expected)

    result = df.iat[2, 0]
    assert result == 2
def s_plot(s: pd.Series) -> NotebookChart:
    "Generate a Chartify plot of this series, auto-detecting the plot type."
    x_axis_type = _detect_axis_type(s.index)
    ch = NotebookChart(x_axis_type=x_axis_type, blank_labels=True)
    df = s.to_frame().reset_index()
    df.columns = ['index', 'value']
    if x_axis_type == 'categorical':
        ch.plot.bar(
            df,
            categorical_columns='index',
            numeric_column='value',
        )
    else:
        ch.plot.line(
            df,
            x_column='index',
            y_column='value',
        )
    if s.name:
        ch.axes.set_yaxis_label(s.name)
    if s.index.name:
        ch.axes.set_xaxis_label(s.index.name)
    return ch
def generate_text_features(self, X: Series, feature: str) -> DataFrame:
    X: DataFrame = X.to_frame(name=feature)
    X[feature + '.char_count'] = [self.char_count(value) for value in X[feature]]
    X[feature + '.word_count'] = [self.word_count(value) for value in X[feature]]
    X[feature + '.capital_ratio'] = [self.capital_ratio(value) for value in X[feature]]
    X[feature + '.lower_ratio'] = [self.lower_ratio(value) for value in X[feature]]
    X[feature + '.digit_ratio'] = [self.digit_ratio(value) for value in X[feature]]
    X[feature + '.special_ratio'] = [self.special_ratio(value) for value in X[feature]]
    symbols = ['!', '?', '@', '%', '$', '*', '&', '#', '^', '.', ':', ' ',
               '/', ';', '-', '=']
    for symbol in symbols:
        X[feature + '.symbol_count.' + symbol] = [
            self.symbol_in_string_count(value, symbol) for value in X[feature]
        ]
        X[feature + '.symbol_ratio.' + symbol] = (
            X[feature + '.symbol_count.' + symbol] / X[feature + '.char_count']
        )
        X[feature + '.symbol_ratio.' + symbol].fillna(0, inplace=True)
    X = X.drop(feature, axis=1)
    return X
def statistically_significant_symbols(p: pd.Series, not_significant="ns",
                                      return_pvalues=False):
    """Map p-values to significance symbols.

    Lexicon:
        ns    P > 0.05
        *     P <= 0.05
        **    P <= 0.01
        ***   P <= 0.001
        ****  P <= 0.0001

    Future: Make this customizable
    """
    if not hasattr(p, "__iter__"):
        if p > 0.05:
            return not_significant
        symbol = ""
        if p <= 0.05:
            symbol += "*"
        if p <= 0.01:
            symbol += "*"
        if p <= 0.001:
            symbol += "*"
        if p <= 0.0001:
            symbol += "*"
        return symbol
    else:
        symbols = pd.Series(p).map(
            lambda x: statistically_significant_symbols(
                x, not_significant=not_significant))
        if return_pvalues:
            return pd.concat([symbols.to_frame("symbol"),
                              p.to_frame("p_value")], axis=1)
        else:
            return symbols
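# Usage sketch for statistically_significant_symbols above (hypothetical
# p-values): scalars return a single symbol string, iterables are mapped
# element-wise.
import pandas as pd

p = pd.Series([0.2, 0.03, 0.0005], index=["a", "b", "c"])
statistically_significant_symbols(0.2)  # -> "ns"
statistically_significant_symbols(p)    # -> Series(["ns", "*", "***"], ...)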
def _is_series_of_strings(series_to_test: pd.Series) -> bool:
    if not isinstance(series_to_test, pd.Series):
        return False
    elif series_to_test.to_frame().applymap(
            lambda x: not isinstance(x, str)).squeeze().any():
        return False
    return True
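# A hedged note on the check above: for Series input, the to_frame round-trip
# appears equivalent to mapping isinstance directly, e.g.:
#
#   series_to_test.map(lambda x: isinstance(x, str)).all()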
def test_where_datetimelike_noop(self, dtype):
    # GH#45135, analogue to GH#44181 for Period don't raise on no-op
    # For td64/dt64/dt64tz we already don't raise, but also are
    # checking that we don't unnecessarily upcast to object.
    ser = Series(np.arange(3) * 10**9, dtype=np.int64).view(dtype)
    df = ser.to_frame()
    mask = np.array([False, False, False])

    res = ser.where(~mask, "foo")
    tm.assert_series_equal(res, ser)

    mask2 = mask.reshape(-1, 1)
    res2 = df.where(~mask2, "foo")
    tm.assert_frame_equal(res2, df)

    res3 = ser.mask(mask, "foo")
    tm.assert_series_equal(res3, ser)

    res4 = df.mask(mask2, "foo")
    tm.assert_frame_equal(res4, df)

    # opposite case where we are replacing *all* values -> we downcast
    # from object dtype
    # GH#45768
    res5 = df.where(mask2, 4)
    expected = DataFrame(4, index=df.index, columns=df.columns)
    tm.assert_frame_equal(res5, expected)

    # unlike where, Block.putmask does not downcast
    df.mask(~mask2, 4, inplace=True)
    tm.assert_frame_equal(df, expected.astype(object))
def mock_stock_data_provider_closing_prices(
        closing_prices: pd.Series) -> MagicMock:
    with patch('tm.StockDataProvider') as mock:
        instance = mock.return_value
        type(instance).history = PropertyMock(
            return_value=closing_prices.to_frame(name='Close'))
        return instance
def _to_label_features_with_min_cut(
        self,
        feature_col: pd.Series,
        feature_name: str,
        category_min_cnt: int = None,
        category_min_rate: float = None,
        aggregation_value='others') -> pd.DataFrame:
    feature_col: pd.Series = feature_col.astype('object')
    if category_min_cnt is not None or category_min_rate is not None:
        category_cnt_series: pd.Series = feature_col.value_counts()
        sum_cnt = len(feature_col.values)
        for category_val, cnt in zip(category_cnt_series.index,
                                     category_cnt_series.values):
            if (category_min_cnt is not None and cnt < category_min_cnt) \
                    or (category_min_rate is not None
                        and float(cnt) / float(sum_cnt) < category_min_rate):
                feature_col.replace(category_val, aggregation_value,
                                    inplace=True)
    feature_col: pd.Series = feature_col.astype('category')
    feature_df = feature_col.to_frame(name=feature_name)
    dummy_df: pd.DataFrame = pd.get_dummies(feature_df, drop_first=False)
    self._feature_columns = dummy_df.columns
    return dummy_df
def event_plot(series: pd.Series) -> Axes:
    """Return matplotlib event plot"""
    plt.style.use("bmh")
    if series.empty:
        return empty_mpl_figure()
    df = series.to_frame(name="timestamp")
    df.loc[:, "event"] = 1 + (np.random.rand(df.size) - 0.5) / 5  # jitter
    ax = df.plot.scatter(
        x="timestamp",
        y="event",
        c="None",
        edgecolors="blue",
        alpha=1,
        s=80,
        figsize=(15, 5),
        legend=False,
        grid=True,
    )
    # set x axis
    ax.xaxis.set_major_formatter(DateFormatter("%b %d %H:%M"))
    buffer = ((series.max() - series.min()) / 30) + timedelta(seconds=1)
    ax.set_xlim(left=series.min() - buffer,
                right=pd.Timestamp.utcnow() + buffer)
    ax.set_xlabel("Timestamp (UTC)")
    # set y axis
    ax.set_ybound(0, 2)
    ax.axes.get_yaxis().set_visible(False)
    plt.tight_layout()
    return ax
def get_predict_increased_data_frame2(sel_frame, target, predict_start_day,
                                      shift_days, params):
    # Build the ARIMA model
    test_frame = sel_frame.set_index('createtime')
    test_frame: DataFrame = test_frame.asfreq('1H', method='bfill')
    print("pre_count:" + str(test_frame.count()))
    print("dup_count:" + str(test_frame.count()))
    model: ARIMA = ARIMA(test_frame, (params[0], params[1], params[2])).fit()
    # Produce a model report
    # print("model summary:\n" + str(model.summary2()))

    # Copy the last data point and add the results predicted by ARIMA
    en_day = datetime.datetime.strptime(
        predict_start_day, "%Y-%m-%d") + datetime.timedelta(days=shift_days)
    predict_dta: DataFrame = model.predict(
        start=predict_start_day, end=en_day).to_frame()
    predict_dta.columns = ['predict_' + target]
    print("predict_dta:" + str(predict_dta.head(10)))
    predict_dta = predict_dta.cumsum()
    print("predict_dta_cumsum:" + str(predict_dta.head(10)))
    last_value = test_frame.tail(1)
    final_predict_dta = predict_dta + last_value.values[0][0]
    print("final_predict_dta:" + str(final_predict_dta.head(10)))
    return final_predict_dta
def get_predict_increased_data_frame(sel_frame, target, predict_start_day,
                                     shift_days, params):
    # Build the ARIMA model
    test_frame = sel_frame.set_index('createtime')
    test_frame: DataFrame = test_frame.asfreq('1H', method='bfill')
    print("pre_count:" + str(test_frame.count()))
    # test_frame = duplicate_frame(test_frame, predict_start_day, shift_days)
    # print("dup_count:" + str(test_frame.count()))
    model: ARIMA = ARIMA(test_frame, (params[0], params[1], params[2])).fit()
    # Produce a model report
    # print("model summary:\n" + str(model.summary2()))

    en_day = datetime.datetime.strptime(
        predict_start_day, "%Y-%m-%d") + datetime.timedelta(
            days=shift_days) + datetime.timedelta(seconds=-1)
    times = pd.date_range(predict_start_day, en_day, freq='h')
    series = Series(model.forecast(shift_days * 24)[0], index=times)
    predict_dta: DataFrame = series.to_frame()
    predict_dta.columns = ['predict_' + target]
    print("predict_dta:" + str(
        model.predict(start=predict_start_day,
                      end=en_day + datetime.timedelta(seconds=1)).head(10)))
    print("forecast_dta:" + str(model.forecast(100)))
    max_value = test_frame.max()[0]
    min_value = test_frame.min()[0]
    predict_dta = predict_dta['predict_' + target].map(
        lambda a: a - max_value + min_value if a >= max_value else a)
    return predict_dta
def _get_first_names_probs(self, first_names: pd.Series) -> pd.DataFrame:
    first_names_probs = first_names.to_frame().merge(
        p_first_name_given_race_df,
        left_on='first_name',
        right_index=True,
        how='left',
    )
    return first_names_probs
def test_flags_identity(self, frame_or_series):
    s = Series([1, 2])
    if frame_or_series is DataFrame:
        s = s.to_frame()

    assert s.flags is s.flags
    s2 = s.copy()
    assert s2.flags is not s.flags
def _get_last_name_probs(self, last_names: pd.Series) -> pd.DataFrame:
    last_names_probs = last_names.to_frame().merge(
        self._PROB_RACE_GIVEN_SURNAME,
        left_on='last_name',
        right_index=True,
        how='left',
    )
    return last_names_probs
def log_series(s: pd.Series, message: str = None):
    if not isinstance(s, pd.Series):
        raise TypeError("Argument must be a pandas.Series")
    logger = logging.getLogger()
    logger.debug("{}\n\n{}\n".format(message if message else "",
                                     s.to_frame().to_markdown()))
def test_flags_identity(self, as_frame):
    s = Series([1, 2])
    if as_frame:
        s = s.to_frame()

    assert s.flags is s.flags
    s2 = s.copy()
    assert s2.flags is not s.flags
def test_flags_identity(self, frame_or_series):
    obj = Series([1, 2])
    if frame_or_series is DataFrame:
        obj = obj.to_frame()

    assert obj.flags is obj.flags
    obj2 = obj.copy()
    assert obj2.flags is not obj.flags
def test_ops_datetimelike_align(self):
    # GH 7500
    # datetimelike ops need to align
    dt = Series(date_range('2012-1-1', periods=3, freq='D'))
    dt.iloc[2] = np.nan
    dt2 = dt[::-1]

    expected = Series([timedelta(0), timedelta(0), pd.NaT])
    # name is reset
    result = dt2 - dt
    assert_series_equal(result, expected)

    expected = Series(expected, name=0)
    result = (dt2.to_frame() - dt.to_frame())[0]
    assert_series_equal(result, expected)
def _z_test_word_list(word_count_series_one: pd.Series,
                      word_count_series_two: pd.Series) -> pd.Series:
    """Run a z-test on all the words of two input word lists.

    :param word_count_series_one: a pandas series where:
        - the data is the word counts.
        - the index is the corresponding words.
        - the name depends on what the input is: if a file is given, the
          name is the string "File" plus the actual file name; if a class
          is given, the name is the string "class" plus the actual class
          name.
    :param word_count_series_two: a pandas series with the same structure
        as word_count_series_one.
    :return: a pandas series where:
        - the data is the z-scores.
        - the index is the corresponding words.
        - the name is a readable header for the analysis result.
    """
    # Find the sample population of the two input data sets.
    total_word_count_one = word_count_series_one.sum()
    total_word_count_two = word_count_series_two.sum()

    # Join the two input pandas series together to avoid making the
    # assumption that they are parallel arrays in future analysis.
    joined_data_frame = word_count_series_one.to_frame().join(
        word_count_series_two.to_frame())

    # Perform the z-test to detect word anomalies.
    # We are using a dict instead of a pandas series here, because this
    # method requires 'full_word_score_dict' to be sorted by the absolute
    # value of the z-scores (the 'value' of the dictionary).
    # For code clarity we use this as a temporary solution, but in the
    # future we can implement a 'sort_by' function for series in our
    # general functions if we need it for better performance.
    full_word_score_dict = \
        {word: TopwordModel._z_test(p1=count1 / total_word_count_one,
                                    p2=count2 / total_word_count_two,
                                    n1=total_word_count_one,
                                    n2=total_word_count_two)
         for word, [count1, count2] in joined_data_frame.iterrows()}

    # Filter out the insignificant results.
    sig_word_score_dict = \
        {word: z_score for word, z_score in full_word_score_dict.items()
         if abs(z_score) >= 1.96}

    # Sort 'sig_word_score_dict' by the absolute value of the z-scores in
    # descending order.
    sorted_dict = OrderedDict(sorted(sig_word_score_dict.items(),
                                     key=lambda item: abs(item[1]),
                                     reverse=True))

    # Convert the sorted result to a pandas series.
    result_series = pd.Series(sorted_dict)
    # Set the result series name.
    result_series.name = f"{word_count_series_one.name} compares to " \
                         f"{word_count_series_two.name}"

    return result_series
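# The TopwordModel._z_test helper referenced above is not shown; a minimal
# sketch of the standard two-proportion z-test it presumably implements
# (hypothetical name, stated as an assumption only):
import math

def _z_test_sketch(p1: float, p2: float, n1: int, n2: int) -> float:
    # pooled proportion under the null hypothesis that p1 == p2
    p_hat = (p1 * n1 + p2 * n2) / (n1 + n2)
    standard_error = math.sqrt(p_hat * (1 - p_hat) * (1 / n1 + 1 / n2))
    return (p1 - p2) / standard_error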
# (fragment: the block below presumably runs once per CV fold)
test_x_t = text.iloc[test_idx]
train_x_v = df.iloc[train_idx, :]
test_x_v = df.iloc[test_idx, :]
tf = tf_vectorizer.fit_transform(train_x_t.tolist())
train_x = hstack([csr_matrix(train_x_v), tf], format='csr')
# transform (not fit_transform) so test features use the training vocabulary
tf = tf_vectorizer.transform(test_x_t.tolist())
test_x = hstack([csr_matrix(test_x_v), tf], format='csr')
test_x_list.append(test_x)
test_y_all = test_y_all.append(test_y, ignore_index=True)
_ = sgdr.partial_fit(train_x, train_y, classes=[0.0, 1.0])
print(time() - t1)

test_x = vstack(test_x_list, format='csr')
test_y = test_y_all.to_frame('original')
# predict_proba returns one column per class; take the positive-class column
test_y['predicted'] = sgdr.predict_proba(test_x)[:, 1]
mad = (test_y['original'] - test_y['predicted']).abs().median()
type_s = (test_y['original'].gt(0.0) == test_y['predicted'].gt(0.0)).mean()
fn = (test_y['original'].gt(0.0) & test_y['predicted'].le(0.0)).mean()
fp = (test_y['original'].le(0.0) & test_y['predicted'].gt(0.0)).mean()

import seaborn as sns
sns.distplot(test_y['original'])
def test_to_sql_series(self):
    s = Series(np.arange(5, dtype='int64'), name='series')
    sql.to_sql(s, "test_series", self.conn, flavor='sqlite', index=False)
    s2 = sql.read_sql("SELECT * FROM test_series", self.conn,
                      flavor='sqlite')
    tm.assert_frame_equal(s.to_frame(), s2)
def alpha_correlation(output_dir: str,
                      alpha_diversity: pd.Series,
                      metadata: qiime2.Metadata,
                      method: str = 'spearman') -> None:
    try:
        alpha_correlation_fn = _alpha_correlation_fns[method]
    except KeyError:
        raise ValueError('Unknown alpha correlation method %s. The available '
                         'options are %s.' %
                         (method, ', '.join(_alpha_correlation_fns.keys())))

    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='numeric',
                                       drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata contains only non-numeric or empty columns. This "
            "visualizer requires at least one numeric metadata column to "
            "execute.")

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        # create a dataframe containing the data to be correlated, and drop
        # any samples that have no data in either column
        df = pd.concat([metadata_column.to_series(), alpha_diversity],
                       axis=1, join='inner')

        # compute correlation
        correlation_result = alpha_correlation_fn(df[metadata_column.name],
                                                  df[alpha_diversity.name])

        warning = None
        if alpha_diversity.shape[0] != df.shape[0]:
            warning = {'initial': alpha_diversity.shape[0],
                       'method': method.title(),
                       'filtered': df.shape[0]}

        escaped_column = quote(column)
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            df.to_json(fh, orient='split')
            fh.write(",")
            json.dump(warning, fh)
            fh.write(",")
            json.dump({
                'method': method.title(),
                'testStat': '%1.4f' % correlation_result[0],
                'pVal': '%1.4f' % correlation_result[1],
                'sampleSize': df.shape[0]}, fh)
            fh.write(");")

    index = os.path.join(TEMPLATES, 'alpha_correlation_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'filtered_columns': ', '.join(sorted(filtered_columns))})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_correlation_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    # Filter metadata to only include IDs present in the alpha diversity data.
    # Also ensures every alpha diversity ID is present in the metadata.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Metadata column filtering could be done in one pass, but this visualizer
    # displays separate warnings for non-categorical columns, and categorical
    # columns that didn't satisfy the requirements of the statistics being
    # computed.
    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = pre_filtered_cols - set(metadata.columns)

    pre_filtered_cols = set(metadata.columns)
    metadata = metadata.filter_columns(
        drop_all_unique=True, drop_zero_variance=True, drop_all_missing=True)
    filtered_columns = pre_filtered_cols - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # save out metadata for download in viz
    alpha_diversity.index.name = 'id'
    alpha = qiime2.Metadata(alpha_diversity.to_frame())
    md = metadata.merge(alpha)
    md.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column)
        metadata_column = metadata_column.drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat([alpha_diversity, metadata_column.to_series()],
                         axis=1, join='inner')
        filtered_data_length = data.shape[0]

        names = []
        groups = []
        for name, group in data.groupby(metadata_column.name):
            names.append('%s (n=%d)' % (name, len(group)))
            groups.append(list(group[metric_name]))

        escaped_column = quote(column)
        escaped_column = escaped_column.replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # perform Kruskal-Wallis across all groups
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # perform pairwise Kruskal-Wallis across all pairs of groups and
        # correct for multiple comparisons
        kw_H_pairwise = []
        for i in range(len(names)):
            for j in range(i):
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(groups[i],
                                                            groups[j])
                    kw_H_pairwise.append([names[j], names[i], H, p])
                except ValueError:
                    filtered_group_comparisons.append(
                        ['%s:%s' % (column, names[i]),
                         '%s:%s' % (column, names[j])])
        kw_H_pairwise = pd.DataFrame(
            kw_H_pairwise, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise.set_index(['Group 1', 'Group 2'], inplace=True)
        kw_H_pairwise['q-value'] = multipletests(
            kw_H_pairwise['p-value'], method='fdr_bh')[1]
        kw_H_pairwise.sort_index(inplace=True)
        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        pairwise_path = os.path.join(output_dir, pairwise_fn)
        kw_H_pairwise.to_csv(pairwise_path)

        with open(os.path.join(output_dir, filename), 'w') as fh:
            series = pd.Series(groups, index=names)

            fh.write("load_data('%s'," % column)
            series.to_json(fh, orient='split')
            fh.write(",")
            json.dump({'initial': initial_data_length,
                       'filtered': filtered_data_length}, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(
        TEMPLATES, 'alpha_group_significance_assets', 'index.html')
    q2templates.render(index, output_dir, context={
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
            '; '.join([' vs '.join(e) for e in filtered_group_comparisons])})

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def _25(series: pd.Series) -> TSVTaxonomyFormat:
    return _dataframe_to_tsv_taxonomy_format(series.to_frame())