def execute(appconfig, query, args, fetcher):
    """Load campaign/advertiser rows from HBase for the requested time window.

    :param appconfig: config object; ``get("hbase", "hbase")`` selects the colo.
    :param query: unused here (kept for the shared handler signature).
    :param args: request args; reads 'id' (comma-separated), 'type',
                 'offsetdate' ('yesterday'/'lastweek'), and 'analysis'.
    :param fetcher: unused here (kept for the shared handler signature).
    :returns: whatever ``load_hbase`` returns for the chosen table.
    """
    start, end, timezone = util.parseTime(args)
    # Normalize the window to the requested timezone.
    offset = util.getOffsetSeconds(start, timezone)
    start, end = start - offset, end - offset
    # Optionally shift the window start back for day-over-day / week-over-week.
    # NOTE(review): only `start` is shifted, not `end` — confirm this widening
    # (rather than a parallel shift of both endpoints) is intended.
    offset_seconds = 0
    if args.get('offsetdate') == 'yesterday':
        offset_seconds = util.total_seconds(timedelta(days=1))
    elif args.get('offsetdate') == 'lastweek':
        offset_seconds = util.total_seconds(timedelta(days=7))
    start -= offset_seconds
    # HBase row keys are "<id>_<objtype>"; 256 = campaign, 257 = everything else.
    ids = args['id'].split(',')
    objtype = '256' if 'campaign' == args['type'] else '257'
    ids = [_id + "_" + objtype for _id in ids]
    try:
        colo = appconfig.get("hbase", "hbase")
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Fall back to the default colo on any config error.
        colo = 'blue_prod'
    keys = ",".join(ids)
    # 'analysis' selects the analysis table; otherwise serve from the cache table.
    if args.get('analysis') == "true":
        rets = load_hbase(colo, 'analysis', start, end, keys, offset)
    else:
        rets = load_hbase(colo, 'cache', start, end, keys, offset)
    return rets
def execute(self, appconfig, query, args):
    """Fetch today's data, plus day-over-day / week-over-week windows on demand.

    Populates ``ret`` via ``self.fetch_data`` (one call per enabled window),
    then joins any worker threads that parallel fetching spawned.
    """
    ret = {}
    start, end, timezone = util.parseTime(args)
    self.parallel = util.get(query, 'parallel', 'false')
    self.threads = []
    query['timezone'] = timezone
    query['conditions'] = self.get_additional_conditions(args, query)
    offset_minutes = util.get(query, 'offset_minutes', 0)
    # (enabled?, offset from 'now', result label) — 'today' always runs.
    windows = (
        (True, timedelta(minutes=offset_minutes), 'today'),
        ('true' == query.get('dod'), timedelta(days=1, minutes=offset_minutes), 'ystd'),
        ('true' == query.get('wow'), timedelta(days=7, minutes=offset_minutes), 'lastwk'),
    )
    for enabled, delta, label in windows:
        if enabled:
            self.fetch_data(appconfig, query, args, start, end,
                            util.total_seconds(delta), ret, label)
    # NOTE(review): self.parallel is a *string* ('false' by default) and any
    # non-empty string is truthy, so this joins whenever threads exist —
    # confirm that is the intended behavior.
    if self.parallel and self.threads:
        for worker in self.threads:
            worker.join()
    return ret
def __init__(self, *args):
    """Derive 'bucket_interval_period' (e.g. '5m') from the configured
    bucket_interval and validate that the query window is an exact multiple
    of the bucket interval."""
    super(BaseAggregationRule, self).__init__(*args)
    bucket_interval = self.rules.get('bucket_interval')
    if bucket_interval:
        # First matching unit wins; order mirrors the supported window sizes.
        for unit, suffix in (('seconds', 's'), ('minutes', 'm'),
                             ('hours', 'h'), ('days', 'd'), ('weeks', 'w')):
            if unit in bucket_interval:
                self.rules['bucket_interval_period'] = str(bucket_interval[unit]) + suffix
                break
        else:
            raise EAException("Unsupported window size")

        # The effective query span must divide evenly into buckets.
        if self.rules.get('use_run_every_query_size'):
            if total_seconds(self.rules['run_every']) % total_seconds(
                    self.rules['bucket_interval_timedelta']) != 0:
                raise EAException(
                    "run_every must be evenly divisible by bucket_interval if specified"
                )
        else:
            if total_seconds(self.rules['buffer_time']) % total_seconds(
                    self.rules['bucket_interval_timedelta']) != 0:
                raise EAException(
                    "Buffer_time must be evenly divisible by bucket_interval if specified"
                )
def _calc_sum_avg_std(self, values):
    """Return (sum, mean, population std-dev) of the timedeltas in *values*,
    or (None, None, None) when there is nothing to count."""
    count = self._count_times(values)
    if count == 0:
        return (None, None, None)
    total = self._sum_times(values)
    avg = total / count
    # Squared deviation of each non-None sample, carried as a timedelta so the
    # existing sum/count helpers can average them.
    squared_devs = [
        timedelta(seconds=math.pow(total_seconds(v - avg), 2))
        for v in self._non_nones(values)
    ]
    mean_sq_dev = self._sum_times(squared_devs) / self._count_times(squared_devs)
    std = timedelta(seconds=math.sqrt(total_seconds(mean_sq_dev)))
    return (total, avg, std)
def _calc_sum_avg_std(self, values):
    """Compute (total, average, standard deviation) over timedelta samples.

    Empty input (count of zero) yields (None, None, None).
    """
    if self._count_times(values) == 0:
        return (None, None, None)

    def mean_of(times):
        # Average via the class helpers so None handling stays consistent.
        return self._sum_times(times) / self._count_times(times)

    total = self._sum_times(values)
    avg = mean_of(values)
    deviations = [
        timedelta(seconds=math.pow(total_seconds(t - avg), 2))
        for t in self._non_nones(values)
    ]
    std = timedelta(seconds=math.sqrt(total_seconds(mean_of(deviations))))
    return (total, avg, std)
def _get_groups_in_seconds(self, data_array, columns=(3, None)):
    """Gets dates in seconds for each of the experiment's groups.

    Every time value in every dimension of every group is converted to
    float seconds via ``total_seconds``; the nested structure is preserved.
    """
    return [
        [[total_seconds(t) for t in dimension] for dimension in grp]
        for grp in self.get_groups(data_array, columns)
    ]
def _get_groups_in_seconds(self, data_array, columns=(3, None)):
    """Gets dates in seconds for each of the experiment's groups."""
    converted = []
    for raw_group in self.get_groups(data_array, columns):
        seconds_group = []
        for dimension in raw_group:
            # Replace each time value with its float-seconds equivalent.
            seconds_group.append([total_seconds(t) for t in dimension])
        converted.append(seconds_group)
    return converted
def __init__(self, *args):
    """Translate the rule's bucket_interval into an ES-style period string
    and enforce that the query span divides evenly into buckets."""
    super(BaseAggregationRule, self).__init__(*args)
    bucket_interval = self.rules.get('bucket_interval')
    if bucket_interval:
        unit_suffixes = (('seconds', 's'), ('minutes', 'm'), ('hours', 'h'),
                         ('days', 'd'), ('weeks', 'w'))
        # Scan in priority order; the first unit present wins.
        matches = [(u, s) for u, s in unit_suffixes if u in bucket_interval]
        if not matches:
            raise EAException("Unsupported window size")
        unit, suffix = matches[0]
        self.rules['bucket_interval_period'] = str(bucket_interval[unit]) + suffix

        interval_secs = total_seconds(self.rules['bucket_interval_timedelta'])
        if self.rules.get('use_run_every_query_size'):
            if total_seconds(self.rules['run_every']) % interval_secs != 0:
                raise EAException("run_every must be evenly divisible by bucket_interval if specified")
        elif total_seconds(self.rules['buffer_time']) % interval_secs != 0:
            raise EAException("Buffer_time must be evenly divisible by bucket_interval if specified")
def read(self, file_ref, tape_idx=None):
    '''Context manager produces a file-like for reading.

    Copies *file_ref* from tape into a spool file, yields an open handle to
    the spool copy, and removes the spool file (and releases the reserved
    spool space) when the caller is done — or on any failure during the copy.

    :param file_ref: file descriptor object; reads .size and .name here.
    :param tape_idx: optional tape index forwarded to tape_mgr.read_file.
    :raises ValueError: if the spool has not been initialized.
    '''
    if not self._is_init:
        raise ValueError("The spool has not been initialized")
    self._check_spool(file_ref)
    # Reserve spool capacity up front; released on error or after the yield.
    self._open_read_size += file_ref.size
    file_path = self.get_spooled_path(file_ref, read=True)
    # Read the file from tape and write to the spool file
    try:
        with self.tape_mgr.read_file(file_ref, tape_idx) as src_f:
            # NOTE(review): opened in text mode ('w'/'r' below) — for tape
            # data this looks like it should be binary; confirm upstream.
            with open(file_path, 'w') as dest_f:
                start = datetime.now()
                while True:
                    buf = src_f.read(self.block_size)
                    dest_f.write(buf)
                    # A short read marks end-of-data.
                    if len(buf) < self.block_size:
                        break
                end = datetime.now()
    except BaseException:
        # Even on SystemExit/KeyboardInterrupt: drop the partial spool file
        # and give back the reserved space before re-raising.
        os.remove(file_path)
        self._open_read_size -= file_ref.size
        raise
    # NOTE(review): if the copy completes within clock resolution this
    # divides by zero — confirm file sizes make that impossible in practice.
    bytes_per_sec = file_ref.size / total_seconds(end - start)
    logger.info('Read file %s (%d bytes) from tape at: %s', file_ref.name,
                file_ref.size, get_readable_bw(bytes_per_sec))
    # Open the spool file for reading and yield it
    f = open(file_path, 'r')
    try:
        yield f
    finally:
        # Caller is finished: close, delete the spool copy, release space.
        f.close()
        os.remove(file_path)
        self._open_read_size -= file_ref.size
def query_topdiffspender(config, start, end, param):
    """Build and run the top-diff-spender SQL for today / yesterday / last week.

    Fills *param* with formatted window boundaries (and per-window subqueries
    for the campaign/advertiser template), renders the appropriate SQL
    template, and returns the fetched rows.
    """
    # Clamp the window end to the latest data available in the table.
    db_end = util.query_end_time(param['table'], config, timezone=param['timezone'])
    end = min(end, db_end)

    window_starts = {
        'today': calendar.timegm(start.timetuple()),
        'ystd': calendar.timegm((start - timedelta(days=1)).timetuple()),
        'lastwk': calendar.timegm((start - timedelta(days=7)).timetuple()),
    }
    window_ends = {
        # Today's end snaps down to a 15-minute (900s) boundary.
        'today': calendar.timegm(end.timetuple()) / 900 * 900,
        'ystd': calendar.timegm((end - timedelta(days=1)).timetuple()),
        'lastwk': calendar.timegm((end - timedelta(days=7)).timetuple()),
    }
    for label in ('today', 'ystd', 'lastwk'):
        param[label + '_start'] = util.time2str(window_starts[label], DATE_FORMAT)
        param[label + '_end'] = util.time2str(window_ends[label], DATE_FORMAT)

    fields = 'time,id,today_spend,ystd_spend,lastwk_spend,today_click,ystd_click,lastwk_click,today_imp,ystd_imp,lastwk_imp,today_serve,ystd_serve,lastwk_serve,dod_delta,wow_delta,tot_3day_spend,adv'.split(
        ',')

    if param.get('tabid') == 'native_topdiffsection':
        command = SECTION_DIFF_TEMPLATE.format(**param)
    else:
        # Per-window day iteration feeds the combined campaign/advertiser diff.
        for key, label, template in (('today_query', 'today', TODAY_TEMPLATE),
                                     ('ystd_query', 'ystd', YEST_TEMPLATE),
                                     ('lastwk_query', 'lastwk', LASTWK_TEMPLATE)):
            param[key] = iterate_day(window_starts[label], window_ends[label],
                                     param, template)
        command = CMPGN_ADV_DIFF_TEMPLATE.format(**param)
    return util.fetch_sql_data(config, fields, command,
                               -util.total_seconds(timedelta(minutes=5)))
def run_tests(self): """Processes all the data, runs the statistical tests, and returns the results""" import numpy as np from stats import get_mww, get_ttest_equal_var, get_ttest_diff_var, get_levene, get_simple_stats, get_shapiro calculations = [] headers_task_durations = ['group', 'user', 'task', 'duration'] headers_activity_times = ['group', 'user'] + Measurements.to_list() tasks_data = ASADataSet(headers_task_durations, []) tasks_data_secs = ASADataSet(headers_task_durations, []) activities_data = ASADataSet(headers_activity_times, []) activities_data_secs = ASADataSet(headers_activity_times, []) stdout.write("Running tests for experiment '{0}'\n".format(self.name)) stdout.write("Loading tasks: ") stdout.flush() for group in self.groups: tasks_rows = [(group.name,) + row for row in group.get_task_times()] tasks_data.data.extend(tasks_rows) tasks_data_secs.data.extend([row[0:3] + tuple(total_seconds(v) for v in row[3:]) for row in tasks_rows]) stdout.write(".") stdout.flush() calculations.append(("task_times", tasks_data)) calculations.append(("task_times_secs", tasks_data_secs)) stdout.write(" [finished]\n") stdout.flush() stdout.write("Loading activities: ") stdout.flush() for group in self.groups: activities_rows = [[group.name] + row for row in group.get_activity_times()] activities_data.data.extend(activities_rows) activities_data_secs.data.extend([row[0:2] + [total_seconds(v) for v in row[2:]] for row in activities_rows]) stdout.write(".") stdout.flush() calculations.append(("activity_times", activities_data)) calculations.append(("activity_times_secs", activities_data_secs)) stdout.write(" [finished]\n") stdout.flush() ###### Run statistical tests ###### stdout.write("Running statistical tests") stdout.flush() tasks_times_array = np.array(tasks_data.data) activity_times_array = np.array(activities_data.data) sstats_headers = ['group', 'sum', 'average', 'mean', 'median', 'std', 'var', 'count'] mww_headers = ['u', 'p_value_twotailed', 'p_value_lessthan', 
'p_value_greaterthan'] ttest_headers = ['t_twotailed', 'p_value_twotailed', 't_lessthan', 'p_value_lessthan', 't_greaterthan', 'p_value_greaterthan', 'for_variance'] levene_headers = ['p_value', 'w'] shapiro_headers = ['group', 'activity', 'p_value', 'w'] ### task tests sstats_results = ASADataSet(['task'] + sstats_headers, []) mww_results = ASADataSet(['task'] + mww_headers, []) ttest_results = ASADataSet(['task'] + ttest_headers, []) levene_results = ASADataSet(['task'] + levene_headers, []) tasks_names = sorted(set(tasks_times_array[:, 2])) for task_name in tasks_names: task_array = tasks_times_array[tasks_times_array[:, 2] == task_name] groups_data = self._get_groups_in_seconds(task_array) group1_data = groups_data[0][0] # one "dimension" assumed, as task times imply only one column group2_data = groups_data[1][0] # one "dimension" assumed, as task times imply only one column sstats_results.data.append((task_name, self.groups[0].name) + get_simple_stats(group1_data)) sstats_results.data.append((task_name, self.groups[1].name) + get_simple_stats(group2_data)) mww_results.data.append((task_name,) + get_mww(group1_data, group2_data)) tasks_levene_result = get_levene(group1_data, group2_data) levene_results.data.append((task_name,) + tasks_levene_result) if tasks_levene_result[0] > 0.05: # equal variance ttest_results.data.append((task_name,) + get_ttest_equal_var(group1_data, group2_data) + ("equal",)) else: ttest_results.data.append((task_name,) + get_ttest_diff_var(group1_data, group2_data) + ("diff",)) calculations.append(("task_times_sstats", sstats_results)) calculations.append(("task_times_mww_test", mww_results)) calculations.append(("task_times_ttest_test", ttest_results)) calculations.append(("task_times_levene_test", levene_results)) #### totals task times tests groups_data = self._get_groups_in_seconds(tasks_times_array) group1_data = groups_data[0][0] # one "dimension" assumed, as task times imply only one column group2_data = groups_data[1][0] # one 
"dimension" assumed, as task times imply only one column calculations.append(("total_task_times_sstats", ASADataSet(sstats_headers, [(self.groups[0].name,) + get_simple_stats(group1_data), (self.groups[1].name,) + get_simple_stats(group2_data)]))) calculations.append(("total_task_times_mww_test", ASADataSet(mww_headers, [get_mww(group1_data, group2_data)]))) total_levene_result = [get_levene(group1_data, group2_data)] calculations.append(("total_task_times_levene_test", ASADataSet(levene_headers, total_levene_result))) if total_levene_result[0] > 0.05: # equal variance calculations.append(("total_task_times_ttest_test", ASADataSet(ttest_headers, [get_ttest_equal_var(group1_data, group2_data) + ("equal",)]))) else: calculations.append(("total_task_times_ttest_test", ASADataSet(ttest_headers, [get_ttest_diff_var(group1_data, group2_data) + ("diff",)]))) #### totals task times tests per subject ### (i.e., times that subjects took working on the entirety of the tasks, rather than the times they took on each task) from pandas import DataFrame total_task_times = np.array(DataFrame([row[0:2] + row[3:] for row in tasks_times_array.tolist()]).groupby([0,1], as_index=False).aggregate(np.sum).to_records(index=False).tolist()) calculations.append(("total_task_times_persubject", ASADataSet(['group', 'user', 'duration'], total_task_times))) groups_data = self._get_groups_in_seconds(total_task_times, columns=(2,3)) group1_data = groups_data[0][0] # one "dimension" assumed, as task times imply only one column group2_data = groups_data[1][0] # one "dimension" assumed, as task times imply only one column calculations.append(("total_task_times_persubject_sstats", ASADataSet(sstats_headers, [(self.groups[0].name,) + get_simple_stats(group1_data), (self.groups[1].name,) + get_simple_stats(group2_data)]))) calculations.append(("total_task_times_persubject_mww_test", ASADataSet(mww_headers, [get_mww(group1_data, group2_data)]))) total_levene_result = [get_levene(group1_data, 
group2_data)] calculations.append(("total_task_times_persubject_levene_test", ASADataSet(levene_headers, total_levene_result))) if total_levene_result[0] > 0.05: # equal variance calculations.append(("total_task_times_persubject_ttest_test", ASADataSet(ttest_headers, [get_ttest_equal_var(group1_data, group2_data) + ("equal",)]))) else: calculations.append(("total_task_times_persubject_ttest_test", ASADataSet(ttest_headers, [get_ttest_diff_var(group1_data, group2_data) + ("diff",)]))) #### activity tests # [group, user, wiki_view, wiki_edit, search, asa_artifact_view, asa_artifact_edit, asa_index, asa_search] groups_data = self._get_groups_in_seconds(activity_times_array, columns=(2,None)) group1_data = groups_data[0] group2_data = groups_data[1] intermediate_calcs = { "activity_times_sstats": ASADataSet(sstats_headers[:1] + ['activity'] + sstats_headers[1:], []), "activity_times_shapiro_test": ASADataSet(shapiro_headers, []), "activity_times_mww_test": ASADataSet(['activity'] + mww_headers, []), "activity_times_ttest_test": ASADataSet(['activity'] + ttest_headers, []), "activity_times_levene_test": ASADataSet(['activity'] + levene_headers, []), } for measurement_id, measurement in Measurements.to_ids_list_that_matter(): intermediate_calcs["activity_times_sstats"].data.extend( [ (self.groups[0].name, measurement) + get_simple_stats(group1_data[measurement_id]), (self.groups[1].name, measurement) + get_simple_stats(group2_data[measurement_id]) ] ) import warnings with warnings.catch_warnings(record=True) as w: # catch warnings intermediate_calcs["activity_times_shapiro_test"].data.append( (self.groups[0].name, measurement) + get_shapiro(group1_data[measurement_id]) ) if len(w) > 0: print '\x1b[31m' + "\n... 
Warning running shapiro-wilk on '{0}' for group '{1}': {2}".format(measurement, self.groups[0].name, w[-1].message) + '\033[0m' with warnings.catch_warnings(record=True) as w: # catch warnings intermediate_calcs["activity_times_shapiro_test"].data.append( (self.groups[1].name, measurement) + get_shapiro(group2_data[measurement_id]) ) if len(w) > 0: print '\x1b[31m' + "\n... Warning running shapiro-wilk on '{0}' for group '{1}': {2}".format(measurement, self.groups[1].name, w[-1].message) + '\033[0m' try: intermediate_calcs["activity_times_mww_test"].data.append( (measurement,) + get_mww(group1_data[measurement_id], group2_data[measurement_id]) ) except ValueError: # get_mww() returns a ValueError when the values on both groups are the same print "MWW raised a ValueError. Values on both groups are the same?" intermediate_calcs["activity_times_mww_test"].data.append((measurement, None, None)) activities_levene_result = get_levene(group1_data[measurement_id], group2_data[measurement_id]) intermediate_calcs["activity_times_levene_test"].data.append( (measurement,) + activities_levene_result ) if activities_levene_result[0] > 0.05: # equal variance intermediate_calcs["activity_times_ttest_test"].data.append( (measurement,) + get_ttest_equal_var(group1_data[measurement_id], group2_data[measurement_id]) + ("equal",) ) else: intermediate_calcs["activity_times_ttest_test"].data.append( (measurement,) + get_ttest_diff_var(group1_data[measurement_id], group2_data[measurement_id]) + ("diff",) ) measurement_id += 1 for icalc_tpl in intermediate_calcs.iteritems(): calculations.append(icalc_tpl) #### activity times by issue tests intermediate_calcs = { "issues_activity_times": ASADataSet(['group', 'user', 'duration_i2', 'duration_i6'], []), "issues_activity_times_sstats": ASADataSet(sstats_headers[:1] + ['issue'] + sstats_headers[1:], []), "issues_activity_times_mww_test": ASADataSet(['issue'] + mww_headers, []), "issues_activity_times_levene_test": ASADataSet(['issue'] + 
levene_headers, []), "issues_activity_times_ttest_test": ASADataSet(['issue'] + ttest_headers, []) } issues_activity_times = np.array([np.concatenate((row[0:2], [sum(row[[2,3,5,6]], timedelta())], [sum(row[[4,7,8]], timedelta())])) for row in activity_times_array]).tolist() intermediate_calcs["issues_activity_times"].data.extend(issues_activity_times) groups_data = self._get_groups_in_seconds(np.array(issues_activity_times), (2, None)) for idx, name in [(0, "understanding"), (1, "finding")]: group1_data = groups_data[0][idx] group2_data = groups_data[1][idx] intermediate_calcs["issues_activity_times_sstats"].data.extend( [(self.groups[0].name, name) + get_simple_stats(group1_data), (self.groups[1].name, name) + get_simple_stats(group2_data)] ) intermediate_calcs["issues_activity_times_mww_test"].data.extend( [(name,) + get_mww(group1_data, group2_data)] ) issues_levene_result = get_levene(group1_data, group2_data) intermediate_calcs["issues_activity_times_levene_test"].data.extend( [(name,) + issues_levene_result] ) if issues_levene_result[0] > 0.05: # equal variance intermediate_calcs["issues_activity_times_ttest_test"].data.extend( [(name,) + get_ttest_equal_var(group1_data, group2_data) + ("equal",)] ) else: intermediate_calcs["issues_activity_times_ttest_test"].data.extend( [(name,) + get_ttest_equal_var(group1_data, group2_data) + ("diff",)] ) for icalc_tpl in intermediate_calcs.iteritems(): calculations.append(icalc_tpl) #### totals activity times tests total_activity_times = np.array([np.concatenate((row[0:2], [sum(row[2:], timedelta())])) for row in activity_times_array]).tolist() calculations.append(("total_activity_times", ASADataSet(['group', 'user', 'duration'], total_activity_times))) groups_data = self._get_groups_in_seconds(np.array(total_activity_times), (2, None)) group1_data = groups_data[0][0] group2_data = groups_data[1][0] calculations.append(("total_activity_times_sstats", ASADataSet(sstats_headers, [(self.groups[0].name,) + 
get_simple_stats(group1_data), (self.groups[1].name,) + get_simple_stats(group2_data)]))) calculations.append(("total_activity_times_mww_test", ASADataSet(mww_headers, [get_mww(group1_data, group2_data)]))) total_levene_result = get_levene(group1_data, group2_data) calculations.append(("total_activity_times_levene_test", ASADataSet(levene_headers, [total_levene_result]))) if total_levene_result[0] > 0.05: # equal variance calculations.append(("total_activity_times_ttest_test", ASADataSet(ttest_headers, [get_ttest_equal_var(group1_data, group2_data) + ("equal",)]))) else: calculations.append(("total_activity_times_ttest_test", ASADataSet(ttest_headers, [get_ttest_diff_var(group1_data, group2_data) + ("diff",)]))) # questionnaires questionnaire_questions = ASADataSet(['question_number', 'question'], self.get_questionnaire_questions()) questionnaire_hypothesis = ASADataSet(['question_number', 'hypothesis'], self.get_questionnaire_hypothesis()) questionnaire_histogram = ASADataSet(['group', 'question', '1', '2', '3', '4', '5'], []) questionnaire_one_answer_per_row = ASADataSet(['group', 'user', 'question', 'answer'], []) questionnaire_one_answer_per_column = ASADataSet([], []) questionnaire_sstats = ASADataSet(['question'] + sstats_headers, []) questionnaire_mww_results = ASADataSet(['question'] + mww_headers, []) questionnaire_ttest_results = ASADataSet(['question'] + ttest_headers, []) questionnaire_levene_results = ASADataSet(['question'] + levene_headers, []) def get_question_and_answers(questionnaire_row): question = questionnaire_row[0] answers = questionnaire_row[1:-6] if type(answers[0]) is float: # discard questions with a non-numeric answer answers = [int(answer) if type(answer) is float else answer for answer in answers] # floats become ints answers_noned = [a if not a == "" else None for a in answers] # replace missing data values with None answers = [a for a in answers if not a == ""] # discard missing data values return question, answers, answers_noned 
return question, None, None group1_name = self.groups[0].name group2_name = self.groups[1].name group1_subjects = self.groups[0].get_questionnaire_subjects() group1_data = self.groups[0].get_questionnaire_questions_and_answers() group2_subjects = self.groups[1].get_questionnaire_subjects() group2_data = self.groups[1].get_questionnaire_questions_and_answers() questionnaire_one_answer_per_column.headers = ['question'] + group1_subjects + group2_subjects if not group1_data is None: for i in range(len(group1_data)): # for each question question_g1, answers_g1, answers_g1_noned = get_question_and_answers(group1_data[i]) question_g2, answers_g2, answers_g2_noned = get_question_and_answers(group2_data[i]) assert question_g1 == question_g2 if answers_g1 is None or answers_g2 is None: continue for i in range(len(group1_subjects)): if not answers_g1_noned[i] is None: questionnaire_one_answer_per_row.data.append((group1_name, group1_subjects[i], question_g1, answers_g1_noned[i])) for i in range(len(group2_subjects)): if not answers_g2_noned[i] is None: questionnaire_one_answer_per_row.data.append((group2_name, group2_subjects[i], question_g2, answers_g2_noned[i])) questionnaire_one_answer_per_column.data.append((question_g1,) + tuple(answers_g1_noned + answers_g2_noned)) questionnaire_histogram.data.append((group1_name, question_g1) + tuple(np.bincount(np.array(answers_g1), minlength=6)[1:])) questionnaire_histogram.data.append((group2_name, question_g2) + tuple(np.bincount(np.array(answers_g2), minlength=6)[1:])) questionnaire_sstats.data.append((question_g1, group1_name) + get_simple_stats(answers_g1)) questionnaire_sstats.data.append((question_g2, group2_name) + get_simple_stats(answers_g2)) questionnaire_mww_results.data.append((question_g1,) + get_mww(answers_g1, answers_g2)) quest_levene_result = get_levene(answers_g1, answers_g2) questionnaire_levene_results.data.append((question_g1,) + quest_levene_result) if quest_levene_result[0] > 0.05: # equal variance 
questionnaire_ttest_results.data.append((question_g1,) + get_ttest_equal_var(answers_g1, answers_g2) + ("equal",)) else: questionnaire_ttest_results.data.append((question_g1,) + get_ttest_diff_var(answers_g1, answers_g2) + ("diff",)) calculations.append(("questionnaire_questions", questionnaire_questions)) calculations.append(("questionnaire_hypothesis", questionnaire_hypothesis)) calculations.append(("questionnaire_histogram", questionnaire_histogram)) calculations.append(("questionnaire_one_answer_per_row", questionnaire_one_answer_per_row)) calculations.append(("questionnaire_one_answer_per_column", questionnaire_one_answer_per_column)) calculations.append(("questionnaire_sstats", questionnaire_sstats)) calculations.append(("questionnaire_mww_test", questionnaire_mww_results)) calculations.append(("questionnaire_levene_test", questionnaire_levene_results)) calculations.append(("questionnaire_ttest_test", questionnaire_ttest_results)) stdout.write(" [finished]\n") stdout.flush() return ASAExperimentCalculations(self, calculations)
def run_tests(self): """Processes all the data, runs the statistical tests, and returns the results""" import numpy as np from stats import get_mww, get_ttest_equal_var, get_ttest_diff_var, get_levene, get_simple_stats, get_shapiro calculations = [] headers_task_durations = ['group', 'user', 'task', 'duration'] headers_activity_times = ['group', 'user'] + Measurements.to_list() tasks_data = ASADataSet(headers_task_durations, []) tasks_data_secs = ASADataSet(headers_task_durations, []) activities_data = ASADataSet(headers_activity_times, []) activities_data_secs = ASADataSet(headers_activity_times, []) stdout.write("Running tests for experiment '{0}'\n".format(self.name)) stdout.write("Loading tasks: ") stdout.flush() for group in self.groups: tasks_rows = [(group.name, ) + row for row in group.get_task_times()] tasks_data.data.extend(tasks_rows) tasks_data_secs.data.extend([ row[0:3] + tuple(total_seconds(v) for v in row[3:]) for row in tasks_rows ]) stdout.write(".") stdout.flush() calculations.append(("task_times", tasks_data)) calculations.append(("task_times_secs", tasks_data_secs)) stdout.write(" [finished]\n") stdout.flush() stdout.write("Loading activities: ") stdout.flush() for group in self.groups: activities_rows = [[group.name] + row for row in group.get_activity_times()] activities_data.data.extend(activities_rows) activities_data_secs.data.extend([ row[0:2] + [total_seconds(v) for v in row[2:]] for row in activities_rows ]) stdout.write(".") stdout.flush() calculations.append(("activity_times", activities_data)) calculations.append(("activity_times_secs", activities_data_secs)) stdout.write(" [finished]\n") stdout.flush() ###### Run statistical tests ###### stdout.write("Running statistical tests") stdout.flush() tasks_times_array = np.array(tasks_data.data) activity_times_array = np.array(activities_data.data) sstats_headers = [ 'group', 'sum', 'average', 'mean', 'median', 'std', 'var', 'count' ] mww_headers = [ 'u', 'p_value_twotailed', 
'p_value_lessthan', 'p_value_greaterthan' ] ttest_headers = [ 't_twotailed', 'p_value_twotailed', 't_lessthan', 'p_value_lessthan', 't_greaterthan', 'p_value_greaterthan', 'for_variance' ] levene_headers = ['p_value', 'w'] shapiro_headers = ['group', 'activity', 'p_value', 'w'] ### task tests sstats_results = ASADataSet(['task'] + sstats_headers, []) mww_results = ASADataSet(['task'] + mww_headers, []) ttest_results = ASADataSet(['task'] + ttest_headers, []) levene_results = ASADataSet(['task'] + levene_headers, []) tasks_names = sorted(set(tasks_times_array[:, 2])) for task_name in tasks_names: task_array = tasks_times_array[tasks_times_array[:, 2] == task_name] groups_data = self._get_groups_in_seconds(task_array) group1_data = groups_data[0][ 0] # one "dimension" assumed, as task times imply only one column group2_data = groups_data[1][ 0] # one "dimension" assumed, as task times imply only one column sstats_results.data.append((task_name, self.groups[0].name) + get_simple_stats(group1_data)) sstats_results.data.append((task_name, self.groups[1].name) + get_simple_stats(group2_data)) mww_results.data.append((task_name, ) + get_mww(group1_data, group2_data)) tasks_levene_result = get_levene(group1_data, group2_data) levene_results.data.append((task_name, ) + tasks_levene_result) if tasks_levene_result[0] > 0.05: # equal variance ttest_results.data.append( (task_name, ) + get_ttest_equal_var(group1_data, group2_data) + ("equal", )) else: ttest_results.data.append( (task_name, ) + get_ttest_diff_var(group1_data, group2_data) + ("diff", )) calculations.append(("task_times_sstats", sstats_results)) calculations.append(("task_times_mww_test", mww_results)) calculations.append(("task_times_ttest_test", ttest_results)) calculations.append(("task_times_levene_test", levene_results)) #### totals task times tests groups_data = self._get_groups_in_seconds(tasks_times_array) group1_data = groups_data[0][ 0] # one "dimension" assumed, as task times imply only one column 
group2_data = groups_data[1][ 0] # one "dimension" assumed, as task times imply only one column calculations.append( ("total_task_times_sstats", ASADataSet( sstats_headers, [(self.groups[0].name, ) + get_simple_stats(group1_data), (self.groups[1].name, ) + get_simple_stats(group2_data)]))) calculations.append(("total_task_times_mww_test", ASADataSet(mww_headers, [get_mww(group1_data, group2_data)]))) total_levene_result = [get_levene(group1_data, group2_data)] calculations.append(("total_task_times_levene_test", ASADataSet(levene_headers, total_levene_result))) if total_levene_result[0] > 0.05: # equal variance calculations.append( ("total_task_times_ttest_test", ASADataSet(ttest_headers, [ get_ttest_equal_var(group1_data, group2_data) + ("equal", ) ]))) else: calculations.append( ("total_task_times_ttest_test", ASADataSet(ttest_headers, [ get_ttest_diff_var(group1_data, group2_data) + ("diff", ) ]))) #### totals task times tests per subject ### (i.e., times that subjects took working on the entirety of the tasks, rather than the times they took on each task) from pandas import DataFrame total_task_times = np.array( DataFrame([ row[0:2] + row[3:] for row in tasks_times_array.tolist() ]).groupby([0, 1], as_index=False).aggregate( np.sum).to_records(index=False).tolist()) calculations.append(("total_task_times_persubject", ASADataSet(['group', 'user', 'duration'], total_task_times))) groups_data = self._get_groups_in_seconds(total_task_times, columns=(2, 3)) group1_data = groups_data[0][ 0] # one "dimension" assumed, as task times imply only one column group2_data = groups_data[1][ 0] # one "dimension" assumed, as task times imply only one column calculations.append( ("total_task_times_persubject_sstats", ASADataSet( sstats_headers, [(self.groups[0].name, ) + get_simple_stats(group1_data), (self.groups[1].name, ) + get_simple_stats(group2_data)]))) calculations.append(("total_task_times_persubject_mww_test", ASADataSet(mww_headers, [get_mww(group1_data, 
group2_data)]))) total_levene_result = [get_levene(group1_data, group2_data)] calculations.append(("total_task_times_persubject_levene_test", ASADataSet(levene_headers, total_levene_result))) if total_levene_result[0] > 0.05: # equal variance calculations.append( ("total_task_times_persubject_ttest_test", ASADataSet(ttest_headers, [ get_ttest_equal_var(group1_data, group2_data) + ("equal", ) ]))) else: calculations.append( ("total_task_times_persubject_ttest_test", ASADataSet(ttest_headers, [ get_ttest_diff_var(group1_data, group2_data) + ("diff", ) ]))) #### activity tests # [group, user, wiki_view, wiki_edit, search, asa_artifact_view, asa_artifact_edit, asa_index, asa_search] groups_data = self._get_groups_in_seconds(activity_times_array, columns=(2, None)) group1_data = groups_data[0] group2_data = groups_data[1] intermediate_calcs = { "activity_times_sstats": ASADataSet(sstats_headers[:1] + ['activity'] + sstats_headers[1:], []), "activity_times_shapiro_test": ASADataSet(shapiro_headers, []), "activity_times_mww_test": ASADataSet(['activity'] + mww_headers, []), "activity_times_ttest_test": ASADataSet(['activity'] + ttest_headers, []), "activity_times_levene_test": ASADataSet(['activity'] + levene_headers, []), } for measurement_id, measurement in Measurements.to_ids_list_that_matter( ): intermediate_calcs["activity_times_sstats"].data.extend([ (self.groups[0].name, measurement) + get_simple_stats(group1_data[measurement_id]), (self.groups[1].name, measurement) + get_simple_stats(group2_data[measurement_id]) ]) import warnings with warnings.catch_warnings(record=True) as w: # catch warnings intermediate_calcs["activity_times_shapiro_test"].data.append( (self.groups[0].name, measurement) + get_shapiro(group1_data[measurement_id])) if len(w) > 0: print '\x1b[31m' + "\n... 
Warning running shapiro-wilk on '{0}' for group '{1}': {2}".format( measurement, self.groups[0].name, w[-1].message) + '\033[0m' with warnings.catch_warnings(record=True) as w: # catch warnings intermediate_calcs["activity_times_shapiro_test"].data.append( (self.groups[1].name, measurement) + get_shapiro(group2_data[measurement_id])) if len(w) > 0: print '\x1b[31m' + "\n... Warning running shapiro-wilk on '{0}' for group '{1}': {2}".format( measurement, self.groups[1].name, w[-1].message) + '\033[0m' try: intermediate_calcs["activity_times_mww_test"].data.append( (measurement, ) + get_mww(group1_data[measurement_id], group2_data[measurement_id])) except ValueError: # get_mww() returns a ValueError when the values on both groups are the same print "MWW raised a ValueError. Values on both groups are the same?" intermediate_calcs["activity_times_mww_test"].data.append( (measurement, None, None)) activities_levene_result = get_levene(group1_data[measurement_id], group2_data[measurement_id]) intermediate_calcs["activity_times_levene_test"].data.append( (measurement, ) + activities_levene_result) if activities_levene_result[0] > 0.05: # equal variance intermediate_calcs["activity_times_ttest_test"].data.append( (measurement, ) + get_ttest_equal_var(group1_data[measurement_id], group2_data[measurement_id]) + ("equal", )) else: intermediate_calcs["activity_times_ttest_test"].data.append( (measurement, ) + get_ttest_diff_var(group1_data[measurement_id], group2_data[measurement_id]) + ("diff", )) measurement_id += 1 for icalc_tpl in intermediate_calcs.iteritems(): calculations.append(icalc_tpl) #### activity times by issue tests intermediate_calcs = { "issues_activity_times": ASADataSet(['group', 'user', 'duration_i2', 'duration_i6'], []), "issues_activity_times_sstats": ASADataSet(sstats_headers[:1] + ['issue'] + sstats_headers[1:], []), "issues_activity_times_mww_test": ASADataSet(['issue'] + mww_headers, []), "issues_activity_times_levene_test": ASADataSet(['issue'] + 
levene_headers, []), "issues_activity_times_ttest_test": ASADataSet(['issue'] + ttest_headers, []) } issues_activity_times = np.array([ np.concatenate((row[0:2], [sum(row[[2, 3, 5, 6]], timedelta())], [sum(row[[4, 7, 8]], timedelta())])) for row in activity_times_array ]).tolist() intermediate_calcs["issues_activity_times"].data.extend( issues_activity_times) groups_data = self._get_groups_in_seconds( np.array(issues_activity_times), (2, None)) for idx, name in [(0, "understanding"), (1, "finding")]: group1_data = groups_data[0][idx] group2_data = groups_data[1][idx] intermediate_calcs["issues_activity_times_sstats"].data.extend([ (self.groups[0].name, name) + get_simple_stats(group1_data), (self.groups[1].name, name) + get_simple_stats(group2_data) ]) intermediate_calcs["issues_activity_times_mww_test"].data.extend([ (name, ) + get_mww(group1_data, group2_data) ]) issues_levene_result = get_levene(group1_data, group2_data) intermediate_calcs[ "issues_activity_times_levene_test"].data.extend([ (name, ) + issues_levene_result ]) if issues_levene_result[0] > 0.05: # equal variance intermediate_calcs[ "issues_activity_times_ttest_test"].data.extend([ (name, ) + get_ttest_equal_var(group1_data, group2_data) + ("equal", ) ]) else: intermediate_calcs[ "issues_activity_times_ttest_test"].data.extend([ (name, ) + get_ttest_equal_var(group1_data, group2_data) + ("diff", ) ]) for icalc_tpl in intermediate_calcs.iteritems(): calculations.append(icalc_tpl) #### totals activity times tests total_activity_times = np.array([ np.concatenate((row[0:2], [sum(row[2:], timedelta())])) for row in activity_times_array ]).tolist() calculations.append(("total_activity_times", ASADataSet(['group', 'user', 'duration'], total_activity_times))) groups_data = self._get_groups_in_seconds( np.array(total_activity_times), (2, None)) group1_data = groups_data[0][0] group2_data = groups_data[1][0] calculations.append( ("total_activity_times_sstats", ASADataSet( sstats_headers, 
[(self.groups[0].name, ) + get_simple_stats(group1_data), (self.groups[1].name, ) + get_simple_stats(group2_data)]))) calculations.append(("total_activity_times_mww_test", ASADataSet(mww_headers, [get_mww(group1_data, group2_data)]))) total_levene_result = get_levene(group1_data, group2_data) calculations.append(("total_activity_times_levene_test", ASADataSet(levene_headers, [total_levene_result]))) if total_levene_result[0] > 0.05: # equal variance calculations.append( ("total_activity_times_ttest_test", ASADataSet(ttest_headers, [ get_ttest_equal_var(group1_data, group2_data) + ("equal", ) ]))) else: calculations.append( ("total_activity_times_ttest_test", ASADataSet(ttest_headers, [ get_ttest_diff_var(group1_data, group2_data) + ("diff", ) ]))) # questionnaires questionnaire_questions = ASADataSet( ['question_number', 'question'], self.get_questionnaire_questions()) questionnaire_hypothesis = ASADataSet( ['question_number', 'hypothesis'], self.get_questionnaire_hypothesis()) questionnaire_histogram = ASADataSet( ['group', 'question', '1', '2', '3', '4', '5'], []) questionnaire_one_answer_per_row = ASADataSet( ['group', 'user', 'question', 'answer'], []) questionnaire_one_answer_per_column = ASADataSet([], []) questionnaire_sstats = ASADataSet(['question'] + sstats_headers, []) questionnaire_mww_results = ASADataSet(['question'] + mww_headers, []) questionnaire_ttest_results = ASADataSet(['question'] + ttest_headers, []) questionnaire_levene_results = ASADataSet(['question'] + levene_headers, []) def get_question_and_answers(questionnaire_row): question = questionnaire_row[0] answers = questionnaire_row[1:-6] if type(answers[0] ) is float: # discard questions with a non-numeric answer answers = [ int(answer) if type(answer) is float else answer for answer in answers ] # floats become ints answers_noned = [a if not a == "" else None for a in answers ] # replace missing data values with None answers = [a for a in answers if not a == ""] # discard missing data 
values return question, answers, answers_noned return question, None, None group1_name = self.groups[0].name group2_name = self.groups[1].name group1_subjects = self.groups[0].get_questionnaire_subjects() group1_data = self.groups[0].get_questionnaire_questions_and_answers() group2_subjects = self.groups[1].get_questionnaire_subjects() group2_data = self.groups[1].get_questionnaire_questions_and_answers() questionnaire_one_answer_per_column.headers = [ 'question' ] + group1_subjects + group2_subjects if not group1_data is None: for i in range(len(group1_data)): # for each question question_g1, answers_g1, answers_g1_noned = get_question_and_answers( group1_data[i]) question_g2, answers_g2, answers_g2_noned = get_question_and_answers( group2_data[i]) assert question_g1 == question_g2 if answers_g1 is None or answers_g2 is None: continue for i in range(len(group1_subjects)): if not answers_g1_noned[i] is None: questionnaire_one_answer_per_row.data.append( (group1_name, group1_subjects[i], question_g1, answers_g1_noned[i])) for i in range(len(group2_subjects)): if not answers_g2_noned[i] is None: questionnaire_one_answer_per_row.data.append( (group2_name, group2_subjects[i], question_g2, answers_g2_noned[i])) questionnaire_one_answer_per_column.data.append( (question_g1, ) + tuple(answers_g1_noned + answers_g2_noned)) questionnaire_histogram.data.append( (group1_name, question_g1) + tuple(np.bincount(np.array(answers_g1), minlength=6)[1:])) questionnaire_histogram.data.append( (group2_name, question_g2) + tuple(np.bincount(np.array(answers_g2), minlength=6)[1:])) questionnaire_sstats.data.append((question_g1, group1_name) + get_simple_stats(answers_g1)) questionnaire_sstats.data.append((question_g2, group2_name) + get_simple_stats(answers_g2)) questionnaire_mww_results.data.append( (question_g1, ) + get_mww(answers_g1, answers_g2)) quest_levene_result = get_levene(answers_g1, answers_g2) questionnaire_levene_results.data.append((question_g1, ) + quest_levene_result) 
if quest_levene_result[0] > 0.05: # equal variance questionnaire_ttest_results.data.append( (question_g1, ) + get_ttest_equal_var(answers_g1, answers_g2) + ("equal", )) else: questionnaire_ttest_results.data.append( (question_g1, ) + get_ttest_diff_var(answers_g1, answers_g2) + ("diff", )) calculations.append( ("questionnaire_questions", questionnaire_questions)) calculations.append( ("questionnaire_hypothesis", questionnaire_hypothesis)) calculations.append( ("questionnaire_histogram", questionnaire_histogram)) calculations.append(("questionnaire_one_answer_per_row", questionnaire_one_answer_per_row)) calculations.append(("questionnaire_one_answer_per_column", questionnaire_one_answer_per_column)) calculations.append(("questionnaire_sstats", questionnaire_sstats)) calculations.append( ("questionnaire_mww_test", questionnaire_mww_results)) calculations.append( ("questionnaire_levene_test", questionnaire_levene_results)) calculations.append( ("questionnaire_ttest_test", questionnaire_ttest_results)) stdout.write(" [finished]\n") stdout.flush() return ASAExperimentCalculations(self, calculations)
def humanize(self, other=None, locale='en_us'):
    ''' Returns a localized, humanized representation of a relative difference in time.

    :param other: (optional) an :class:`Arrow <arrow.arrow.Arrow>` or ``datetime`` object.
        Defaults to now in the current :class:`Arrow <arrow.arrow.Arrow>` object's timezone.
    :param locale: (optional) a ``str`` specifying a locale.  Defaults to 'en_us'.

    Usage::

        >>> earlier = arrow.utcnow().replace(hours=-2)
        >>> earlier.humanize()
        '2 hours ago'

        >>> later = later = earlier.replace(hours=4)
        >>> later.humanize(earlier)
        'in 4 hours'

    '''

    locale = locales.get_locale(locale)

    # Resolve the comparison point to a datetime in this object's timezone.
    if other is None:
        utc_now = datetime.utcnow().replace(tzinfo=dateutil_tz.tzutc())
        reference = utc_now.astimezone(self._datetime.tzinfo)
    elif isinstance(other, Arrow):
        reference = other._datetime
    elif isinstance(other, datetime):
        if other.tzinfo is None:
            reference = other.replace(tzinfo=self._datetime.tzinfo)
        else:
            reference = other.astimezone(self._datetime.tzinfo)
    else:
        raise TypeError()

    # Signed whole-second difference; sign says past (-1) vs future (+1).
    signed_delta = int(util.total_seconds(self._datetime - reference))
    sign = 1 if signed_delta >= 0 else -1
    delta = abs(signed_delta)

    # Threshold ladder: each guard returns the coarsest fitting granularity.
    if delta < 10:
        return locale.describe('now')
    if delta < 45:
        return locale.describe('seconds', sign)
    if delta < 90:
        return locale.describe('minute', sign)
    if delta < 2700:
        return locale.describe('minutes', sign * int(max(delta / 60, 2)))
    if delta < 5400:
        return locale.describe('hour', sign)
    if delta < 79200:
        return locale.describe('hours', sign * int(max(delta / 3600, 2)))
    if delta < 129600:
        return locale.describe('day', sign)
    if delta < 2160000:
        return locale.describe('days', sign * int(max(delta / 86400, 2)))
    if delta < 3888000:
        return locale.describe('month', sign)
    if delta < 29808000:
        # Month count from calendar months, not elapsed seconds.
        self_months = self._datetime.year * 12 + self._datetime.month
        other_months = reference.year * 12 + reference.month
        return locale.describe('months', sign * abs(other_months - self_months))
    if delta < 47260800:
        return locale.describe('year', sign)
    return locale.describe('years', sign * int(max(delta / 31536000, 2)))