def run_num_correlations(question_pairs, data):
    """Run a linear regression for every pair of questions.

    For each entry of ``question_pairs`` the responses are fetched with
    ``tools.get_responses_to_numbers``, their values are extracted, and the
    entries at indexes reported by
    ``tools.get_indexes_of_invalid_repsonse_types`` (called with ``[int]``,
    presumably the accepted response types -- confirm in ``tools``) for
    either answer list are removed from both lists before the remaining
    values are passed to ``stats.linregress``.

    Args:
        question_pairs: Iterable of question pairs; each item is passed
            unchanged to ``tools.get_responses_to_numbers`` and stored
            under the ``"questions"`` key of its result.
        data: Survey data handed through to the ``tools`` helpers.

    Returns:
        A list with one dict per pair, holding the keys ``"questions"``,
        ``"slope"``, ``"intercept"``, ``"r_value"``, ``"p_value"``,
        ``"std_err"`` and ``"r_squared"`` (``r_value`` squared).
    """
    print("Running correlations.")

    def _correlate(pair):
        # Regress the second question's cleaned answers on the first's.
        response_entries = tools.get_responses_to_numbers(pair, data)
        raw_a, raw_b = tools.extract_vals_from_responses(*response_entries)

        bad_a, bad_b = tools.get_indexes_of_invalid_repsonse_types(
            [int], raw_a, raw_b
        )
        # An index flagged for either list is dropped from both lists.
        bad_indexes = tools.merge_invalid_indexes(bad_a, bad_b)
        clean_a, clean_b = tools.remove_entries_at_indexes(
            bad_indexes, raw_a, raw_b
        )

        slope, intercept, r_value, p_value, std_err = stats.linregress(
            clean_a, clean_b
        )
        return {
            "questions": pair,
            "slope": slope,
            "intercept": intercept,
            "r_value": r_value,
            "p_value": p_value,
            "std_err": std_err,
            "r_squared": r_value ** 2,
        }

    correlation_results = [_correlate(pair) for pair in question_pairs]

    print("Finished running correlations.")
    return correlation_results
def plot_correlations(results, data, pdf):
    """Save one density-coloured scatter plot per correlation result to ``pdf``.

    For every result the two questions' responses are re-fetched through
    ``tools``, entries at indexes flagged by
    ``tools.get_indexes_of_invalid_repsonse_types`` (called with ``[int]``)
    for either question are removed from both lists, and the remaining
    points are drawn as a scatter plot. Points are coloured by a Gaussian
    kernel density estimate and sized proportionally to it (with a minimum
    size of 60). The line defined by the result's ``'slope'`` and
    ``'intercept'`` is drawn on top.

    Args:
        results: Sequence of result dicts; each must provide the keys
            ``'questions'`` (a pair of question identifiers), ``'slope'``,
            ``'intercept'`` and ``'r_squared'``.
        data: Survey data handed through to the ``tools`` helpers.
        pdf: Object whose ``savefig(fig)`` method receives each figure
            (presumably a matplotlib ``PdfPages`` -- confirm at the caller).

    Raises:
        Exception: Whatever ``stats.gaussian_kde`` raises is re-raised after
            the offending point matrix has been printed for diagnosis.
    """
    print("Saving {} result plots to pdf.".format(len(results)))
    for result in results:
        # Progress marker: written directly to stdout so that the dots stay
        # on a single line whether ``print`` is a statement or a function.
        sys.stdout.write('.')
        sys.stdout.flush()

        q1, q2 = result['questions']
        title_1 = tools.get_question_title(q1, data)
        title_2 = tools.get_question_title(q2, data)

        x_raw = tools.get_responses_to_number(q1, data)
        y_raw = tools.get_responses_to_number(q2, data)
        x, y = tools.extract_vals_from_responses(x_raw, y_raw)
        invalid_x, invalid_y = tools.get_indexes_of_invalid_repsonse_types(
            [int], x, y
        )
        # An index flagged for either question is dropped from both lists so
        # that x and y stay aligned.
        invalid_all = tools.merge_invalid_indexes(invalid_x, invalid_y)
        x, y = tools.remove_entries_at_indexes(invalid_all, x, y)

        # Calculate the point density.
        xy = np.vstack([x, y])
        try:
            z = stats.gaussian_kde(xy)(xy)
        except Exception:
            # Show the data that broke the estimate, then re-raise with the
            # original traceback intact.
            print(xy)
            raise
        # Marker area scales with density but never drops below 60.
        final_size = np.maximum(5000 * z, 60)

        # Calculate axis numbers: the data range padded by one on each side.
        x_range = (min(x) - 1, max(x) + 1)

        # Generate data for the best fit line across the padded x range.
        slope = result['slope']
        intercept = result['intercept']
        x_fit_points = x_range
        y_fit_points = (x_range[0] * slope + intercept,
                        x_range[1] * slope + intercept)

        fig = plt.figure()
        try:
            ax = fig.add_subplot(1, 1, 1)
            ax.set_title("{} vs {}\nr_squared = {:.4f}".format(
                title_1, title_2, result['r_squared']))
            ax.set_xlabel("{} (Q{})".format(title_1, q1))
            ax.set_ylabel("{} (Q{})".format(title_2, q2))
            # 'none' is Matplotlib's documented value for drawing markers
            # without an outline; an empty string is not a documented
            # colour value.
            ax.scatter(x, y, c=z, s=final_size, edgecolor='none')
            ax.plot(x_fit_points, y_fit_points, '-')
            pdf.savefig(fig)
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)
    print("\nDone saving plots to pdf.\n")