def get_events(
        users: List[str],
        start: date = date(2018, 1, 1),
        end: date = date(2019, 1, 1),
) -> pd.DataFrame:
    """Load events for the given users and normalise the resulting frame.

    Runs ``events.sql`` with ``users``, ``start`` and ``end`` passed through
    as query arguments, then renames the columns via ``_rename_events`` and
    coerces them to the types declared by ``Cols.types()``.

    Args:
        users: Users to fetch events for, forwarded to the query as ``users``.
        start: Start of the date window, forwarded to the query as ``start``.
        end: End of the date window, forwarded to the query as ``end``.

    Returns:
        The renamed, type-coerced events frame.
    """
    raw_events = sql_to_df("events.sql", users=users, start=start, end=end)
    renamed_events = _rename_events(raw_events)
    return coerce_types(renamed_events, types=Cols.types())
def products_per_bag(start: date = date(2018, 12, 1),
                     end: date = date(2019, 2, 25)):
    """Plot the bag indicator against product views and show the figure.

    Runs ``views_per_bag.sql`` for the ``start``/``end`` window, casts the
    ``bag_indicator`` column to float and draws it against ``product`` as a
    seaborn line plot. The x axis is limited to the 0-25 range.

    Args:
        start: Start of the date window, forwarded to the query as ``start``.
        end: End of the date window, forwarded to the query as ``end``.
    """
    init_plt()

    # Load the data and make the indicator numeric so it can be plotted.
    stats = sql_to_df('views_per_bag.sql', start=start, end=end)
    indicator = stats["bag_indicator"].astype(float)
    stats["bag_indicator"] = indicator

    sns.lineplot(x="product", y="bag_indicator", data=stats)

    # Titles, labels and axis range for the current figure.
    figure = plt.gcf()
    figure.suptitle("Product Views vs Bags")
    plt.xlabel("Product Views")
    plt.ylabel("Likelihood of Bagging")
    plt.xlim(0, 25)

    plt.show()
def _join_first_order_facts(users: pd.DataFrame) -> pd.DataFrame:
    """Left-join first-order facts onto ``users`` by email.

    Loads ``first_order_facts.sql`` indexed by ``Cols.EMAIL``, strips every
    ``|unknown`` fragment from the first-order division, and derives two
    float flags (1.0 / 0.0) from the cleaned division:

    * ``Cols.FIRST_ORDER_HAS_BABY`` - division is ``baby`` or ``baby|kids``.
    * ``Cols.FIRST_ORDER_HAS_KIDS`` - division is ``kids`` or ``baby|kids``.

    Args:
        users: Frame with a ``Cols.EMAIL`` column to join on.

    Returns:
        ``users`` with the first-order columns added. Users without a
        matching row keep all their rows (left join) and get missing values.
    """
    data = sql_to_df("first_order_facts.sql").set_index(Cols.EMAIL)

    # Drop the "|unknown" part so e.g. "baby|unknown" is treated as "baby".
    data[Cols.FIRST_ORDER_DIVISION] = data[Cols.FIRST_ORDER_DIVISION].replace(
        regex=r"\|unknown", value="")

    # The boolean mask converts straight to 0.0 / 1.0; no intermediate int
    # cast or multiplication by 1.0 is needed.
    division = data[Cols.FIRST_ORDER_DIVISION]
    data[Cols.FIRST_ORDER_HAS_BABY] = division.isin(
        ["baby", "baby|kids"]).astype(float)
    data[Cols.FIRST_ORDER_HAS_KIDS] = division.isin(
        ["kids", "baby|kids"]).astype(float)

    # NOTE(review): assumes first_order_facts.sql yields one row per email;
    # duplicate emails would multiply the matching user rows - confirm.
    return users.join(data, on=Cols.EMAIL, how="left")
def _join_mixpanel_stats(users: pd.DataFrame) -> pd.DataFrame:
    """Left-join per-user visit statistics and derive retention ratios.

    Loads ``user_visit_stats.sql`` indexed by ``Cols.EMAIL``, joins it onto
    ``users`` and then, on the joined frame:

    * raises ``Cols.DAYS_RETAINED`` and ``Cols.DAYS_RETAINED_PER_VISIT`` to a
      minimum of 1.0;
    * sets ``Cols.QUARTERS_RETAINED`` to ``days / 90`` when at least 90 days
      were retained, otherwise 1.0, and ``Cols.YEARS_RETAINED`` likewise with
      365 days;
    * sets ``Cols.ORDERS_PER_QUARTER`` and ``Cols.ORDERS_PER_YEAR`` to
      ``Cols.LIFETIME_ORDERS`` divided by the respective retained periods.

    Missing (NaN) day counts compare as False against every threshold, so
    they stay NaN in the two day columns and become 1.0 in the quarter and
    year columns.

    Args:
        users: Frame with a ``Cols.EMAIL`` column to join on. The retention
            and ``Cols.LIFETIME_ORDERS`` columns must exist after the join.

    Returns:
        The joined frame with the derived columns added.
    """
    mixpanel_data = sql_to_df("user_visit_stats.sql").set_index(Cols.EMAIL)
    joined = users.join(mixpanel_data, on=Cols.EMAIL, how="left")

    # Single-visit users were retained for one day.
    # Vectorised floor: only values strictly below 1 are replaced, so NaN
    # entries are left untouched.
    for column in (Cols.DAYS_RETAINED, Cols.DAYS_RETAINED_PER_VISIT):
        values = joined[column]
        joined[column] = values.mask(values < 1, 1.0)

    # Read the floored day counts; every user counts for at least one whole
    # period, which also keeps the divisions below away from tiny divisors.
    days = joined[Cols.DAYS_RETAINED]

    joined[Cols.QUARTERS_RETAINED] = (days / 90.0).where(days >= 90.0, 1.0)
    joined[Cols.ORDERS_PER_QUARTER] = (
        joined[Cols.LIFETIME_ORDERS] / joined[Cols.QUARTERS_RETAINED])

    joined[Cols.YEARS_RETAINED] = (days / 365.0).where(days >= 365.0, 1.0)
    joined[Cols.ORDERS_PER_YEAR] = (
        joined[Cols.LIFETIME_ORDERS] / joined[Cols.YEARS_RETAINED])

    return joined
def get_transactions() -> pd.DataFrame:
    """Load transactions, coerce column types and add calculated fields.

    Runs ``transactions.sql``, coerces the result to the types declared by
    ``Cols.types()`` and passes it through ``_calculate_fields``.

    Returns:
        The type-coerced transactions frame as returned by
        ``_calculate_fields``.
    """
    raw_transactions = sql_to_df("transactions.sql")
    typed_transactions = coerce_types(raw_transactions, types=Cols.types())
    return _calculate_fields(typed_transactions)