def correlation_coefficient(aquery, bquery):
    """Return the correlation coefficient between the data of two queries.

    The value is covariance(aquery, bquery) divided by the product of the
    standard deviations of each query's datapoints.

    Returns None when the coefficient is undefined: the covariance or
    either standard deviation is None, or the product of the standard
    deviations is zero (dividing by it would raise ZeroDivisionError).
    """
    cov = covariance(aquery, bquery)
    if cov is None:
        # Nothing to normalise, so skip the two datapoint fetches below.
        return None

    sd_a = standard_deviation(DataPoint.get_by_query(aquery))
    sd_b = standard_deviation(DataPoint.get_by_query(bquery))
    if sd_a is None or sd_b is None:
        return None

    denominator = sd_a * sd_b
    if denominator == 0:
        # A constant series has no spread; the coefficient is undefined.
        return None

    # NOTE(review): the standard deviations are taken over the raw
    # datapoints of each query, while covariance() may pair/bucket the
    # data first -- confirm both are meant to use the same samples.
    return cov / denominator
def crosssection_suite(query):
    """Build cross-section analytics of `query` against the user's other queries.

    Iterates over every query returned by Query.get_by_user(query.user):

    * 'number' queries with a different name than `query`: for every
      integer x in range(lo, hi), where (lo, hi) are the first two items
      of query_range(q), append an "Average when ..." row and a
      "Change from average when ..." row.
    * 'text' queries: for every non-empty word in the ', '-separated
      string returned by common_words(), append an "Average when ..." row.

    Returns a list of (label, formatted_value) tuples.
    """
    crosssection_list = []

    for q in Query.get_by_user(query.user):
        if q.format == 'number' and q.name != query.name:
            # Fetch the range once; each query_range() call re-reads the
            # query's datapoints.
            bounds = query_range(q)
            # NOTE(review): range() excludes bounds[1]; if that is the
            # maximum observed value it is never cross-sectioned -- confirm.
            for x in range(bounds[0], bounds[1]):
                avg_name = 'Average when ' + q.name + ' = ' + str(x)
                avg_value = float_str_format(
                    avg_int_on_sliced_int(query, q, x))
                crosssection_list.append((avg_name, avg_value))

                percent_name = ('Change from average when ' + q.name
                                + ' = ' + str(x))
                percent_value = float_str_format(
                    percent_from_avg_int_on_sliced_int(query, q, x))
                crosssection_list.append((percent_name, percent_value))
        elif q.format == 'text':
            words = common_words(DataPoint.get_by_query(q)).split(', ')
            for word in words:
                if not word:
                    # ''.split(', ') yields [''], so an empty common_words()
                    # result would otherwise produce a row for the word "".
                    continue
                avg_name = 'Average when "' + word + '" is in ' + q.name
                avg_value = float_str_format(
                    avg_int_on_sliced_text(query, q, word))
                crosssection_list.append((avg_name, avg_value))
                # TODO: no "Change from average" row is produced for text
                # slices yet.

    return crosssection_list
def analyze_text_query_data(query):
    """Return the analytics for a text query as a new list.

    The list holds whatever basic_suite() yields for the query's
    datapoints; no other suites are run for text queries.
    """
    return list(basic_suite(DataPoint.get_by_query(query)))
def refresh_datapoints(self, queries):
    """Re-populate the memcache entries holding each query's datapoints.

    For every query in `queries` two memcache keys are written, both
    prefixed with str(query.key()):

    * '<key>.datapoints'             -- DataPoint.JsonFromArray() of the
                                        query's datapoints
    * '<key>.datapoints-last-update' -- the refresh time as a string of
                                        whole seconds since the epoch

    Original author's note: if DataPoint.get_by_query cache-misses, the
    data is effectively refreshed twice in a row; on a cache hit it is
    refreshed once, which is the desired behaviour. This could arguably
    live in model.py, but a few extra memcache puts per day were judged
    acceptable.
    """
    # Local import keeps this block self-contained.
    import time

    for query in queries:
        datapoints = DataPoint.get_by_query(query)
        json_dps = DataPoint.JsonFromArray(datapoints)

        # Build the key prefix once instead of re-deriving it per use.
        query_key = str(query.key())

        memcache.set(
            key=query_key + '.datapoints',
            value=json_dps,
        )
        # strftime('%s') is a platform-specific extension that Python does
        # not document; time.time() gives the same epoch seconds portably.
        memcache.set(
            key=query_key + '.datapoints-last-update',
            value=str(int(time.time())),
        )

        logging.info("Refreshed datapoints for metric: " + query_key)
def avg_int_on_sliced_text(aquery, bquery, value):
    """Average `aquery`'s integer data over the slice selected by a word.

    `bquery` is a text query: its data goes through mapize_data() (not the
    integer variant), is bucketed to days, and is then filtered with
    text_cross_section(bdata, value). `aquery`'s integer data is bucketed
    to days and paired with that slice via symmettrysize() before being
    averaged.

    Returns the result of map_data_average() over the paired `aquery`
    data, or 0 when nothing is left after slicing (the same fallback
    avg_int_on_sliced_int uses).
    """
    adatapoints = DataPoint.get_by_query(aquery)
    bdatapoints = DataPoint.get_by_query(bquery)

    adata = bucket_to_days(mapize_int_data(adatapoints))
    bdata = bucket_to_days(mapize_data(bdatapoints))  # not int data!

    # Throw out all the datapoints that aren't `value`.
    bdata = text_cross_section(bdata, value)

    adata, bdata = symmettrysize(adata, bdata)

    if not adata:
        # NOTE(review): assumes map_data_average() cannot handle an empty
        # mapping, as the guards in sibling functions suggest -- confirm.
        return 0

    return map_data_average(adata)
def covariance(int_query_a, int_query_b):
    """Return the sample covariance of two integer queries' data.

    cov = sum over all i of (x_i - x_mean) * (y_i - y_mean), divided by N - 1

    Both datasets are mapped with mapize_int_data(), bucketed to days and
    then passed through symmettrysize() so the two mappings can be indexed
    by the same keys.

    Returns None when fewer than two paired points remain, since the
    N - 1 divisor would be zero or negative.
    """
    adatapoints = DataPoint.get_by_query(int_query_a)
    bdatapoints = DataPoint.get_by_query(int_query_b)

    adata = mapize_int_data(adatapoints)
    bdata = mapize_int_data(bdatapoints)

    # Per the original notes: reduce to a single point per day; the actual
    # times do not matter for the covariance.
    adata = bucket_to_days(adata)
    bdata = bucket_to_days(bdata)

    # Per the original notes: make the two sets map 1:1 onto each other.
    adata, bdata = symmettrysize(adata, bdata)

    n = len(adata)
    if n <= 1:
        # We divide by n - 1 just below.
        return None

    a_avg = map_data_average(adata)
    b_avg = map_data_average(bdata)

    # Start from 0.0 so the accumulation (and the division) stay in floats.
    total = sum(
        ((adata[key] - a_avg) * (bdata[key] - b_avg)
         for key in adata.keys()),
        0.0,
    )

    # n >= 2 here, so the divisor is always positive.
    return total / (n - 1)
def get(self):
    """Write the common-word frequencies of a query to the response.

    Returns without writing anything when self.get_user() is falsy.
    Otherwise the query named by the 'query_id' request parameter is
    loaded, common_word_frequencies() is run over its datapoints, and the
    JSON-encoded result is written wrapped in '[' and ']'.
    """
    if not self.get_user():
        return

    query = Query.get_by_id(self.request.get('query_id'))
    frequencies = common_word_frequencies(DataPoint.get_by_query(query))

    payload = json.dumps(frequencies)
    self.response.out.write('[' + payload + ']')
def avg_int_on_sliced_int(aquery, bquery, value):
    """Average `aquery`'s integer data over the slice where `bquery` is `value`.

    Both queries' data are mapped with mapize_int_data() and bucketed to
    days. `bquery`'s data is filtered with
    integer_cross_section(bdata, value), and `aquery`'s data is then
    paired with that slice via symmettrysize().

    Returns the result of map_data_average() over the paired `aquery`
    data, or 0 when nothing is left after slicing.
    """
    adatapoints = DataPoint.get_by_query(aquery)
    bdatapoints = DataPoint.get_by_query(bquery)

    adata = mapize_int_data(adatapoints)
    bdata = mapize_int_data(bdatapoints)

    # Bucket both datasets by day.
    bdata = bucket_to_days(bdata)
    adata = bucket_to_days(adata)

    # Throw out all the bquery entries that aren't `value`.
    bdata = integer_cross_section(bdata, value)

    # Keep the paired result. Previously the return value was discarded,
    # so unless symmettrysize() mutates its arguments in place the average
    # was taken over all of adata instead of just the slice.
    adata, bdata = symmettrysize(adata, bdata)

    if not adata:
        return 0

    # Average the aquery values that survived the slice.
    return map_data_average(adata)
def analyze_integer_query_data(query):
    """Return the full list of analytics for an integer query.

    The suites run in this order: basic and daily suites over the query's
    datapoints, then the covariance, correlation and cross-section suites
    over the query itself. Their results are concatenated into one list.
    """
    datapoints = DataPoint.get_by_query(query)

    results = []
    # Suites that only need this query's own datapoints.
    results += basic_suite(datapoints)
    results += daily_suite(datapoints)
    # Suites that take the query itself.
    results += covariance_suite(query)
    results += correlation_suite(query)
    results += crosssection_suite(query)
    return results
def get(self):
    """Write every datapoint of one query to the response as JSON.

    Returns without writing anything when self.get_user() is falsy.
    Otherwise the entity whose key is given by the 'query_id' request
    parameter is loaded with db.get(), and the to_dict() form of each of
    its datapoints is written as a JSON list.

    The 'user_email' request parameter is no longer read. The user it
    resolved to was never used, and the lookup overwrote the authenticated
    user at the cost of an extra datastore call.
    """
    if not self.get_user():
        return

    # NOTE(review): nothing here checks that the query belongs to the
    # signed-in user -- confirm whether that is enforced elsewhere.
    query = db.get(self.request.get('query_id'))

    datapoints = [dp.to_dict() for dp in DataPoint.get_by_query(query)]
    self.response.out.write(json.dumps(datapoints))
def get(self):
    """Write every datapoint of every query of a user as one JSON list.

    Returns without writing anything when self.get_user() is falsy.
    Otherwise the user is looked up from the 'user_email' request
    parameter, and the to_dict() form of each datapoint of each of that
    user's queries is collected and written as JSON.
    """
    if not self.get_user():
        return

    # NOTE(review): the exported user comes from the request parameter, not
    # from get_user() -- confirm a signed-in user may read another user's
    # data this way.
    owner = User.get_by_email(self.request.get('user_email'))

    exported = []
    for query in Query.get_by_user(owner):
        exported.extend(
            dp.to_dict() for dp in DataPoint.get_by_query(query))

    self.response.out.write(json.dumps(exported))
def get(self):
    """Write every datapoint of one query to the response as CSV text.

    Returns without writing anything when self.get_user() is falsy.
    Otherwise the query named by the 'query_id' request parameter is
    loaded, and the self.dp_to_csv() output for each of its datapoints is
    concatenated, in order, and written in a single call.
    """
    if not self.get_user():
        return

    query = Query.get_by_id(self.request.get('query_id'))
    datapoints = DataPoint.get_by_query(query)

    # Join once instead of growing a string with += per datapoint.
    csv_data = ''.join(self.dp_to_csv(dp) for dp in datapoints)
    self.response.out.write(csv_data)
def query_data_from_db(self, query):
    """Render the metric-data page for `query`.

    Reads the 'ui/html/metric_data.html' template and %-formats it with:

    * 'rows'     -- the concatenated self.data_point_to_row() output for
                    each of the query's datapoints
    * 'name'     -- query.name
    * 'query_id' -- query.key()

    Returns the rendered string.
    """
    # Close the template file explicitly rather than leaking the handle.
    template_file = open('ui/html/metric_data.html')
    try:
        query_template = template_file.read()
    finally:
        template_file.close()

    # One row per datapoint associated with the query, joined in one pass.
    rows = ''.join(
        self.data_point_to_row(dp) for dp in DataPoint.get_by_query(query))

    # NOTE(review): query.name is interpolated as-is; if the template is
    # served as HTML, confirm the name is escaped somewhere.
    params = {
        'rows': rows,
        'name': query.name,
        'query_id': query.key(),
    }
    return query_template % params
def post(self):
    """Delete a query together with all of its datapoints.

    Returns without doing anything when self.get_user() is falsy. The
    entity is loaded with db.get() from the 'query_id' request parameter;
    when nothing is found, 'failure!' is written to the response.
    Otherwise each datapoint of the query is deleted first, and the query
    itself last.
    """
    if not self.get_user():
        return

    # NOTE(review): nothing here checks that the entity belongs to the
    # signed-in user -- confirm whether that is enforced elsewhere.
    query = db.get(self.request.get('query_id'))

    if query:
        # Remove the datapoints first, then the query they hang off.
        for datapoint in DataPoint.get_by_query(query):
            datapoint.delete()
        query.delete()
    else:
        self.response.out.write('failure!')
def text_overview(query):
    """Return a one-line 'Common Words: ...' summary for a text query.

    The text after the label is whatever common_words() returns for the
    query's datapoints.
    """
    words = common_words(DataPoint.get_by_query(query))
    return 'Common Words: ' + words
def time_overview(query):
    """Return a one-line 'Average Time: ...' summary for a time query.

    The value after the label is str() of average_time() over the query's
    datapoints.
    """
    mean_time = average_time(DataPoint.get_by_query(query))
    return 'Average Time: ' + str(mean_time)
def query_average(query):
    """Return average() computed over all datapoints of `query`."""
    return average(DataPoint.get_by_query(query))
def integer_overview(query):
    """Return a one-line 'Average: ...' summary for an integer query.

    The value after the label is average() of the query's datapoints,
    formatted with float_str_format().
    """
    mean_value = average(DataPoint.get_by_query(query))
    return 'Average: ' + float_str_format(mean_value)
def query_range(query):
    """Return data_range() computed over all datapoints of `query`."""
    return data_range(DataPoint.get_by_query(query))