def do_single_search(request_form):
    """
    Run a single-term search and render the result page.

    Search method called from both welcome() and search().

    The term is queried restricted to the selected country and language, then
    compared against the background distribution (all documents under the same
    restriction) by gender, by age, by age within each gender, and by region
    (the 'nuts_3' column).

    :param request_form: mapping of submitted form fields. Reads
        "singleTermQuery" (lower-cased before use) and "languageAndRegion",
        which must have the form "<language>:<country>".
    :return: the rendered 'single_term_results.html' template, or the rendered
        'no_results.html' template when the term query raises KeyError or
        HTTPError, or when the gender-normalised counts sum to zero or NaN.
    """
    search_terms = request_form["singleTermQuery"].lower()
    language_var, country_var = request_form["languageAndRegion"].split(':', 1)

    def run_query(query):
        # Every query shares the same country/language restriction; the filter
        # list is rebuilt on each call so no list object is shared.
        return simple_query_totals({"query": query,
                                    "filter": ["country_s:%s" % country_var,
                                               "langid_s:%s" % language_var]})

    def no_results():
        return flask.render_template('no_results.html',
                                     query=search_terms,
                                     available_options=AVAILABLE_OPTIONS,
                                     search_mode='single')

    # NOTE(review): the user-supplied term is interpolated into the query
    # string unescaped -- confirm that simple_query_totals (or the caller)
    # sanitises it.
    try:
        specific_query = run_query("body_text_ws:%s" % search_terms)
    except (KeyError, HTTPError):
        return no_results()

    matches = specific_query['num_docs'].sum()

    #############################
    # GET TOTALS FOR EVERYTHING #
    #############################
    totals = run_query("*:*")
    gender_totals = totals.groupby('gender').num_docs.sum()
    age_totals = totals.groupby('age').num_docs.sum()
    age_totals = sort_and_filter_age(age_totals)
    age_totals_norm = age_totals / age_totals.sum()
    age_and_gender_totals = prepare_age_and_gender(totals)

    ###########
    # GENDER  #
    ###########
    gender_specific_query = specific_query.groupby('gender').num_docs.sum()
    # Share of each gender's background documents that match the term.
    abs_percentages = gender_specific_query / gender_totals
    abs_total = abs_percentages.sum()
    # The sum is a numpy scalar: dividing by a numpy zero yields inf/nan with a
    # RuntimeWarning rather than raising ZeroDivisionError, so the degenerate
    # case has to be tested explicitly.
    if pd.isnull(abs_total) or abs_total == 0:
        return no_results()
    # Rescale so the per-gender shares sum to 1.
    gender_query_adjusted = abs_percentages * (1.0 / abs_total)

    #######
    # AGE #
    #######
    age_specific_query = specific_query.groupby('age').num_docs.sum()
    age_specific_query = sort_and_filter_age(age_specific_query)
    age_specific_query_norm = age_specific_query / age_specific_query.sum()

    compare_age_df = pd.DataFrame({'background distribution': age_totals_norm,
                                   'query': pd.rolling_mean(age_specific_query_norm, ROLLING_MEAN_FRAME)})
    # The plots take their x values from a column, so expose the index as 'i'.
    compare_age_df['i'] = compare_age_df.index

    ##################
    # AGE AND GENDER #
    ##################
    age_and_gender_specific_query = prepare_age_and_gender(specific_query)

    # A gender with no matches is absent from the query results (KeyError);
    # in that case only the background distribution is plotted.
    try:
        age_specific_male_totals = gender_specific_query['M']
        compare_male_df = pd.DataFrame({'background distribution': age_and_gender_totals['M'],
                                        'query': pd.rolling_mean(age_and_gender_specific_query['M'],
                                                                 ROLLING_MEAN_FRAME)})
    except KeyError:
        age_specific_male_totals = 0
        compare_male_df = pd.DataFrame({'background distribution': age_and_gender_totals['M']})
    compare_male_df['i'] = compare_male_df.index

    try:
        age_specific_female_totals = gender_specific_query['F']
        compare_female_df = pd.DataFrame({'background distribution': age_and_gender_totals['F'],
                                          'query': pd.rolling_mean(age_and_gender_specific_query['F'],
                                                                   ROLLING_MEAN_FRAME)})
    except KeyError:
        age_specific_female_totals = 0
        compare_female_df = pd.DataFrame({'background distribution': age_and_gender_totals['F']})
    compare_female_df['i'] = compare_female_df.index

    ########
    # NUTS #
    ########
    nuts_query = specific_query.groupby('nuts_3').num_docs.sum()
    nuts_total = nuts_query.sum()
    nuts_query_norm = nuts_query / nuts_total

    # Regions whose share of the matches lies above the median share.
    special_regions = nuts_query_norm > nuts_query_norm.median()
    # NOTE(review): NUTS_NAMES[x] raises KeyError for a region code it does
    # not contain -- confirm that every 'nuts_3' value has a name.
    outliers = ', '.join(sorted('%s (%s)' % (NUTS_NAMES[x], x)
                                for x in special_regions.index
                                if special_regions.ix[x].any()))

    # TODO move plotting to its own function
    gender_plot = Bar(gender_query_adjusted,
                      title="Gender distribution",
                      ylabel="percentage",
                      logo=None,
                      toolbar_location="below",
                      webgl=False)

    age_plot = Line(compare_age_df,
                    x='i',
                    title="Age distribution",
                    x_range=Range1d(start=MIN_AGE, end=MAX_AGE),
                    xlabel='age',
                    ylabel="percentage",
                    logo=None,
                    toolbar_location="below",
                    legend='top_right',
                    color=['silver', 'red'],
                    webgl=False)

    age_gender_plot_M = Line(compare_male_df,
                             x='i',
                             title="Age distribution for men",
                             xlabel='age',
                             ylabel="percentage",
                             x_range=Range1d(start=MIN_AGE, end=MAX_AGE),
                             logo=None,
                             toolbar_location="below",
                             legend='top_right',
                             color=['silver', 'green'],
                             webgl=False)

    age_gender_plot_F = Line(compare_female_df,
                             x='i',
                             title="Age distribution for women",
                             xlabel='age',
                             ylabel="percentage",
                             x_range=Range1d(start=MIN_AGE, end=MAX_AGE),
                             logo=None,
                             toolbar_location="below",
                             legend='top_right',
                             color=['silver', 'blue'],
                             webgl=False)

    # components() returns the divs in the same order as the plots passed in.
    bokeh_script, (gender_plot_div, age_plot_div, age_gender_plot_F_div, age_gender_plot_M_div) = components(
        (gender_plot, age_plot, age_gender_plot_F, age_gender_plot_M))

    return flask.render_template('single_term_results.html',
                                 query=search_terms,
                                 matches=matches,
                                 bokeh_script=bokeh_script,
                                 gender_query_adjusted=gender_query_adjusted,
                                 gender_plot=gender_plot_div,
                                 age_plot=age_plot_div,
                                 age_gender_plot_F=age_gender_plot_F_div,
                                 age_gender_plot_M=age_gender_plot_M_div,
                                 country_code=country_var,
                                 map_views=MAP_VIEWS,
                                 nuts_query=nuts_query_norm.to_json(),
                                 outliers=outliers,
                                 gender_total=gender_specific_query.sum(),
                                 age_total=age_specific_query.sum(),
                                 age_total_M=age_specific_male_totals,
                                 age_total_F=age_specific_female_totals,
                                 nuts_total=nuts_total,
                                 available_options=AVAILABLE_OPTIONS)
def do_double_search(request_form):
    """
    Run a two-term comparison search and render the result page.

    Search method called from both welcome() and search().

    Both terms are queried restricted to the selected country and language and
    compared with each other by gender (with a chi-squared contingency test),
    by age (against the background distribution of all documents under the
    same restriction) and by region (the 'nuts_3' column).

    :param request_form: mapping of submitted form fields. Reads
        "doubleTermQuery1" and "doubleTermQuery2" (both lower-cased before
        use) and "languageAndRegion", which must have the form
        "<language>:<country>".
    :return: the rendered 'comparison_term_results.html' template, or the
        rendered 'no_results.html' template for the first term whose query
        raises KeyError or HTTPError, or whose gender-normalised counts sum
        to zero or NaN.
    """
    search_term1 = request_form["doubleTermQuery1"].lower()
    search_term2 = request_form["doubleTermQuery2"].lower()
    language_var, country_var = request_form["languageAndRegion"].split(':', 1)

    def run_query(query):
        # Every query shares the same country/language restriction; the filter
        # list is rebuilt on each call so no list object is shared.
        return simple_query_totals({"query": query,
                                    "filter": ["country_s:%s" % country_var,
                                               "langid_s:%s" % language_var]})

    def no_results(term):
        return flask.render_template('no_results.html',
                                     query=term,
                                     available_options=AVAILABLE_OPTIONS,
                                     search_mode='double')

    # NOTE(review): the user-supplied terms are interpolated into the query
    # string unescaped -- confirm that simple_query_totals (or the caller)
    # sanitises them.
    try:
        specific_query1 = run_query("body_text_ws:%s" % search_term1)
    except (KeyError, HTTPError):
        return no_results(search_term1)

    try:
        specific_query2 = run_query("body_text_ws:%s" % search_term2)
    except (KeyError, HTTPError):
        return no_results(search_term2)

    # need to check country again for some reason
    matches = [specific_query1['num_docs'].sum(), specific_query2['num_docs'].sum()]

    #############################
    # GET TOTALS FOR EVERYTHING #
    #############################
    totals = run_query("*:*")
    gender_totals = totals.groupby('gender').num_docs.sum()
    age_totals = totals.groupby('age').num_docs.sum()
    age_totals = sort_and_filter_age(age_totals)
    age_totals_norm = age_totals / age_totals.sum()

    ###########
    # GENDER  #
    ###########
    # Reindex onto both genders so that a gender without matches counts as 0.
    gender_specific_query1 = pd.DataFrame(data=specific_query1.groupby('gender').num_docs.sum(),
                                          index=['F', 'M']).fillna(0)
    gender_specific_query2 = pd.DataFrame(data=specific_query2.groupby('gender').num_docs.sum(),
                                          index=['F', 'M']).fillna(0)

    # Share of each gender's background documents that match each term.
    abs_percentages1 = gender_specific_query1.num_docs / gender_totals
    abs_percentages2 = gender_specific_query2.num_docs / gender_totals

    # The sums are numpy scalars: dividing by a numpy zero yields inf/nan with
    # a RuntimeWarning rather than raising ZeroDivisionError, so the
    # degenerate case has to be tested explicitly.
    abs_total1 = abs_percentages1.sum()
    if pd.isnull(abs_total1) or abs_total1 == 0:
        return no_results(search_term1)
    abs_total2 = abs_percentages2.sum()
    if pd.isnull(abs_total2) or abs_total2 == 0:
        return no_results(search_term2)

    # Rescale so each term's per-gender shares sum to 1.
    gender_query_adjusted1 = abs_percentages1 * (1.0 / abs_total1)
    gender_query_adjusted2 = abs_percentages2 * (1.0 / abs_total2)

    # One row per search term, one column per gender (raw counts).
    gender_comparison = pd.DataFrame(
        data={search_term1: gender_specific_query1.values.reshape(-1),
              search_term2: gender_specific_query2.values.reshape(-1)},
        index=['F', 'M']).T
    # Same layout, holding the renormalised shares.
    gender_comparison_adjusted = pd.DataFrame(
        data={search_term1: gender_query_adjusted1.values,
              search_term2: gender_query_adjusted2.values},
        index=['F', 'M']).T
    del gender_comparison.index.name

    _chi2, gender_pvalue, _dof, _expected = chi2_contingency(gender_comparison)
    # bisect() returns the position of the first level in P_LEVELS that is
    # greater than the p-value, or len(P_LEVELS) when there is none.
    gender_stats_level = bisect(P_LEVELS, gender_pvalue)

    if gender_stats_level == len(P_LEVELS):
        gender_stats_msg = "Gender difference is <em>not</em> statistically significant (Chi-squared contingency test with p > %.4f)" % (
            P_LEVELS[-1])
    else:
        gender_stats_msg = "Gender difference is statistically significant at p < %s (p = %.4f with Chi-squared contingency test)" % (
            P_LEVELS[gender_stats_level], gender_pvalue)

    # Flatten the term-by-gender table into long format for the grouped bar
    # chart: after unstack(), index level 0 is the gender and level 1 the term.
    stacked = pd.DataFrame(gender_comparison_adjusted.unstack())
    gender_long = pd.DataFrame(data={'variable': [stacked.index.levels[1][x] for x in stacked.index.labels[1]],
                                     'gender': [stacked.index.levels[0][x] for x in stacked.index.labels[0]],
                                     'count': stacked.values.T[0].tolist()})

    gender_plot = Bar(gender_long,
                      ylabel="percentage",
                      group='gender',
                      label='variable',
                      values='count',
                      title="Distribution by gender",
                      logo=None,
                      toolbar_location="below",
                      legend='top_right',
                      color=['blue', 'green'],
                      webgl=False)

    #######
    # AGE #
    #######
    age_specific_query1 = specific_query1.groupby('age').num_docs.sum()
    age_specific_query1 = sort_and_filter_age(age_specific_query1)
    age_specific_query_norm1 = age_specific_query1 / age_specific_query1.sum()

    age_specific_query2 = specific_query2.groupby('age').num_docs.sum()
    age_specific_query2 = sort_and_filter_age(age_specific_query2)
    age_specific_query_norm2 = age_specific_query2 / age_specific_query2.sum()

    compare_age_df = pd.DataFrame({'background distribution': age_totals_norm,
                                   'first term': pd.rolling_mean(age_specific_query_norm1, ROLLING_MEAN_FRAME),
                                   'second term': pd.rolling_mean(age_specific_query_norm2, ROLLING_MEAN_FRAME)
                                   })

    _r, age_pvalue = spearmanr(compare_age_df['first term'], compare_age_df['second term'])
    age_stats_level = bisect(P_LEVELS, age_pvalue)
    # NOTE(review): age_stats_msg is built here but is not passed to the
    # template below -- confirm whether it should be.
    if age_stats_level == len(P_LEVELS):
        age_stats_msg = "Age difference is <em>not</em> statistically significant (p > %s)" % (P_LEVELS[-1])
    else:
        age_stats_msg = "Age difference is <em>statistically significant</em> at p < %s (p = %s)" % (
            P_LEVELS[age_stats_level], age_pvalue)

    # The plot takes its x values from a column, so expose the index as 'i'.
    compare_age_df['i'] = compare_age_df.index

    age_plot = Line(compare_age_df,
                    x='i',
                    title="Age distribution",
                    ylabel="percentage",
                    xlabel='age',
                    logo=None,
                    toolbar_location="below",
                    legend='top_right',
                    color=['silver', 'blue', 'green'],
                    webgl=False)

    ########
    # NUTS #
    ########
    nuts_specific_query1 = specific_query1.groupby('nuts_3').num_docs.sum()
    nuts_specific_query2 = specific_query2.groupby('nuts_3').num_docs.sum()
    nuts_query_norm1 = nuts_specific_query1 / nuts_specific_query1.sum()
    nuts_query_norm2 = nuts_specific_query2 / nuts_specific_query2.sum()

    regions = sorted(set(nuts_specific_query1.index).union(set(nuts_specific_query2.index)))
    nutsdiff = pd.DataFrame(0, index=regions, columns=arange(1))
    # A region that occurs for only one term has a share of 0 for the other;
    # fill_value=0 keeps it in the comparison instead of turning the
    # difference into NaN.
    nutsdiff[0] = nuts_query_norm1.sub(nuts_query_norm2, fill_value=0)
    # Flag regions whose absolute difference exceeds the mean absolute difference.
    nutsdiff['G2'] = abs(nutsdiff[0]) > nutsdiff[0].abs().mean()
    outliers = sorted(x for x in regions if nutsdiff['G2'].ix[x].any())

    # A negative difference means the second term has the larger share.
    is_it_term2 = nutsdiff[0].ix[outliers] < 0
    # NOTE(review): NUTS_NAMES[x] raises KeyError for a region code it does
    # not contain -- confirm that every 'nuts_3' value has a name.
    outliers1 = ', '.join(sorted('%s (%s)' % (NUTS_NAMES[x], x)
                                 for x in is_it_term2.index if not is_it_term2[x]))
    outliers2 = ', '.join(sorted('%s (%s)' % (NUTS_NAMES[x], x)
                                 for x in is_it_term2.index if is_it_term2[x]))

    # NOTE(review): the search terms are user input and are interpolated into
    # an HTML fragment unescaped -- confirm the template escapes this value or
    # escape the terms here.
    outlier_description = []
    if outliers1:
        outlier_description.append(
            '<em>%s</em> is more prevalent than <em>%s</em> in regions %s' % (search_term1, search_term2, outliers1))
    if outliers2:
        if outlier_description:
            outlier_description.append(', while <br />')
        outlier_description.append(
            '<em>%s</em> is more prevalent than <em>%s</em> in regions %s' % (search_term2, search_term1, outliers2))
    outlier_description = ''.join(outlier_description)

    bokeh_script, (gender_plot_div, age_plot_div) = components((gender_plot, age_plot))

    return flask.render_template('comparison_term_results.html',
                                 query1=search_term1,
                                 query2=search_term2,
                                 matches=matches,
                                 gender_comparison=gender_comparison.to_html(justify='right'),
                                 gender_stats_msg=gender_stats_msg,
                                 bokeh_script=bokeh_script,
                                 gender_plot=gender_plot_div,
                                 age_plot=age_plot_div,
                                 country_code=country_var,
                                 outlier_description=outlier_description,
                                 gender_total1=gender_specific_query1.sum().num_docs,
                                 gender_total2=gender_specific_query2.sum().num_docs,
                                 age_total1=age_specific_query1.sum(),
                                 age_total2=age_specific_query2.sum(),
                                 nuts_total1=nuts_specific_query1.sum(),
                                 nuts_total2=nuts_specific_query2.sum(),
                                 available_options=AVAILABLE_OPTIONS
                                 )