def get_style_popular_hops(style: Style, use_filter: list) -> DataFrame: style_ids = style.get_ids_including_sub_styles() query = ''' SELECT rh.recipe_id, rh.kind_id, rh.amount_percent FROM recipe_db_recipehop AS rh JOIN recipe_db_recipe AS r ON rh.recipe_id = r.uid WHERE r.style_id IN ({}) '''.format(','.join('%s' for _ in style_ids)) if len(use_filter): query += " AND rh.use IN ({})".format(','.join('%s' for _ in use_filter)) df = pd.read_sql(query, connection, params=style_ids + use_filter) if len(df) == 0: return df # Aggregate amount per recipe hops = df.groupby(["recipe_id", "kind_id"]).agg({"amount_percent": "sum"}).reset_index() # Filter top hops top_hops_ids = hops["kind_id"].value_counts()[:10].index.values top_hops = hops[hops['kind_id'].isin(top_hops_ids)] # Get only the values of the mostly used hops # Calculate boxplot values agg = [lowerfence, q1, 'median', 'mean', q3, upperfence, 'count'] aggregated = top_hops.groupby('kind_id').agg({'amount_percent': agg}) aggregated = aggregated.reset_index() aggregated = aggregated.sort_values(by=('amount_percent', 'count'), ascending=False) # Finally, add hop names aggregated['hop'] = aggregated['kind_id'].map(get_hop_names_dict()) return aggregated
def get_style_metric_values(style: Style, metric: str) -> DataFrame: style_ids = style.get_ids_including_sub_styles() precision = METRIC_PRECISION[ metric] if metric in METRIC_PRECISION else METRIC_PRECISION['default'] query = ''' SELECT round({}, {}) as {} FROM recipe_db_recipe WHERE {} IS NOT NULL AND style_id IN ({}) '''.format(metric, precision, metric, metric, ','.join('%s' for _ in style_ids)) df = pd.read_sql(query, connection, params=style_ids) df = remove_outliers(df, metric, 0.02) bins = 16 if metric in ['og', 'fg'] and len(df) > 0: abs = df[metric].max() - df[metric].min() bins = max([1, round(abs / 0.002)]) if bins > 18: bins = round(bins / math.ceil(bins / 12)) histogram = df.groupby([pd.cut(df[metric], bins, precision=precision) ])[metric].agg(['count']) histogram = histogram.reset_index() histogram[metric] = histogram[metric].map(str) return histogram
def get_style_hop_pairings(style: Style) -> DataFrame: style_ids = style.get_ids_including_sub_styles() query = ''' SELECT rh.recipe_id, rh.kind_id, rh.amount_percent FROM recipe_db_recipehop AS rh JOIN recipe_db_recipe AS r ON rh.recipe_id = r.uid WHERE rh.kind_id IS NOT NULL AND r.style_id IN ({}) '''.format(','.join('%s' for _ in style_ids)) hops = pd.read_sql(query, connection, params=style_ids) return get_hop_pairings(hops)
def calculate_all_style_metrics(self, calculator: StyleMetricCalculator, style: Style) -> None: style.recipes_count = calculator.calc_recipes_count(style) for metric in calculator.available_metrics: self.stdout.write('Calculate {} for style {}'.format( metric.value, style.name)) (min, mean, max) = calculator.calc_metric(style, metric) self.stdout.write(str((min, mean, max))) setattr(style, "recipes_%s_min" % metric.value, None if math.isnan(min) else min) setattr(style, "recipes_%s_mean" % metric.value, None if math.isnan(mean) else mean) setattr(style, "recipes_%s_max" % metric.value, None if math.isnan(max) else max)
def get_style_popular_fermentables(style: Style, category_filter: list, type_filter: list) -> DataFrame: style_ids = style.get_ids_including_sub_styles() query = ''' SELECT rf.recipe_id, f.name AS fermentable, rf.amount_percent FROM recipe_db_recipefermentable AS rf JOIN recipe_db_recipe AS r ON rf.recipe_id = r.uid JOIN recipe_db_fermentable AS f ON rf.kind_id = f.id WHERE r.style_id IN ({}) '''.format(','.join('%s' for _ in style_ids)) if len(category_filter): query += " AND f.category IN ({})".format(','.join( '%s' for _ in category_filter)) if len(type_filter): query += " AND f.type IN ({})".format(','.join('%s' for _ in type_filter)) df = pd.read_sql(query, connection, params=style_ids + category_filter + type_filter) if len(df) == 0: return df # Aggregate amount per recipe fermentables = df.groupby(["recipe_id", "fermentable"]).agg({ "amount_percent": "sum" }).reset_index() top_fermentables_ids = fermentables["fermentable"].value_counts( )[:10].index.values top_fermentables = fermentables[fermentables['fermentable'].isin( top_fermentables_ids )] # Get only the values of the mostly used fermentable agg = [lowerfence, q1, 'median', 'mean', q3, upperfence, 'count'] aggregated = top_fermentables.groupby('fermentable').agg( {'amount_percent': agg}) aggregated = aggregated.reset_index() aggregated = aggregated.sort_values(by=('amount_percent', 'count'), ascending=False) return aggregated
def get_style_trending_hops(style: Style) -> DataFrame: style_ids = style.get_ids_including_sub_styles() recipes_per_month = get_style_num_recipes_per_month(style_ids) query = ''' SELECT date(r.created, 'start of month') AS month, rh.kind_id, count(DISTINCT r.uid) AS recipes FROM recipe_db_recipe AS r JOIN recipe_db_recipehop AS rh ON r.uid = rh.recipe_id WHERE r.created IS NOT NULL AND r.created > %s AND rh.kind_id IS NOT NULL AND r.style_id IN ({}) GROUP BY date(r.created, 'start of month'), rh.kind_id '''.format(','.join('%s' for _ in style_ids)) per_month = pd.read_sql(query, connection, params=[POPULARITY_MIN_MONTH] + style_ids) per_month = per_month.merge(recipes_per_month, on="month") per_month['month'] = pd.to_datetime(per_month['month']) per_month['recipes_percent'] = per_month['recipes'] / per_month[ 'total_recipes'] * 100 trending = filter_trending(per_month, 'kind_id', 'month', 'recipes_percent') trending = set_multiple_series_start(trending, 'kind_id', 'month', 'recipes_percent') # Finally, add hop names trending['hop'] = trending['kind_id'].map(get_hop_names_dict()) return trending
def load_styles(): csv_file = load_csv('styles.csv') header = next(csv_file) styles = {} for row in csv_file: if len(row) == 1: continue # Skip empty lines row = map(cast_values, row) data = dict(zip(header, row)) style_id = make_style_id(data['id']) parent_style_id = make_style_id(data['parent_style_id']) parent_style = None if data['og_min'] is not None: data['og_min'] /= 1000 if data['og_max'] is not None: data['og_max'] /= 1000 if data['fg_min'] is not None: data['fg_min'] /= 1000 if data['fg_max'] is not None: data['fg_max'] /= 1000 if parent_style_id is not None: if parent_style_id in styles: parent_style = styles[parent_style_id] else: raise Exception( "Could not find parent style {}".format(parent_style_id)) try: style = Style.objects.get(pk=style_id) except Style.DoesNotExist: style = Style() pass for field in data: setattr(style, field, data[field]) style.id = style_id style.parent_style = parent_style style.save() styles[style.id] = style
def get_style_popularity(style: Style) -> DataFrame: style_ids = style.get_ids_including_sub_styles() return calculate_styles_popularity(style_ids)