Esempio n. 1
0
def execute(appconfig, query, args, fetcher):
    """Load per-object HBase rows for the requested time window.

    Handler-style entry point:
      appconfig -- config object exposing .get(section, key)
      query     -- unused here (kept for the handler interface)
      args      -- request args: 'id' (comma-separated), 'type',
                   optional 'offsetdate' and 'analysis', plus time spec
      fetcher   -- unused here (kept for the handler interface)

    Returns whatever load_hbase() returns for the resolved keys/window.
    """
    start, end, timezone = util.parseTime(args)
    # Shift the requested window from local time into the store's reference time.
    offset = util.getOffsetSeconds(start, timezone)
    start, end = start - offset, end - offset

    # Optionally rewind the window start for day-over-day / week-over-week views.
    offset_seconds = 0
    if args.get('offsetdate') == 'yesterday':
        offset_seconds = util.total_seconds(timedelta(days=1))
    elif args.get('offsetdate') == 'lastweek':
        offset_seconds = util.total_seconds(timedelta(days=7))
    start -= offset_seconds

    ids = args['id'].split(',')
    # Row-key suffix encodes the object type: 256 for campaigns, 257 otherwise.
    objtype = '256' if 'campaign' == args['type'] else '257'

    ids = [_id + "_" + objtype for _id in ids]

    # Fall back to the default colo when the config entry is missing.
    # FIX: narrowed the bare `except:` to `except Exception:` so that
    # KeyboardInterrupt / SystemExit are no longer silently swallowed.
    try:
        colo = appconfig.get("hbase", "hbase")
    except Exception:
        colo = 'blue_prod'

    keys = ",".join(ids)

    # 'analysis' requests hit the analysis table; everything else the cache.
    table = 'analysis' if args.get('analysis') == "true" else 'cache'
    return load_hbase(colo, table, start, end, keys, offset)
Esempio n. 2
0
    def execute(self, appconfig, query, args):
        """Fetch today's window plus optional day-over-day ('dod') and
        week-over-week ('wow') comparison windows, then join any worker
        threads spawned by fetch_data when parallel mode is enabled.
        """
        ret = {}

        start, end, timezone = util.parseTime(args)

        self.parallel = util.get(query, 'parallel', 'false')
        self.threads = []
        query['timezone'] = timezone
        query['conditions'] = self.get_additional_conditions(args, query)

        minutes_back = util.get(query, 'offset_minutes', 0)

        # Current window first.
        self.fetch_data(appconfig, query, args, start, end,
                        util.total_seconds(timedelta(minutes=minutes_back)),
                        ret, 'today')

        # Comparison windows: same span shifted back by N days.
        for flag, days_back, label in (('dod', 1, 'ystd'),
                                       ('wow', 7, 'lastwk')):
            if 'true' == query.get(flag):
                self.fetch_data(
                    appconfig, query, args, start, end,
                    util.total_seconds(
                        timedelta(days=days_back, minutes=minutes_back)),
                    ret, label)

        # In parallel mode fetch_data queued threads; wait for them all.
        if self.parallel and len(self.threads) > 0:
            for worker in self.threads:
                worker.join()

        return ret
Esempio n. 3
0
    def __init__(self, *args):
        """Translate the rule's bucket_interval into an aggregation period
        string and validate that the query window divides evenly into
        buckets."""
        super(BaseAggregationRule, self).__init__(*args)
        bucket_interval = self.rules.get('bucket_interval')
        if bucket_interval:
            # Map the first matching time unit onto its period suffix.
            for unit, suffix in (('seconds', 's'), ('minutes', 'm'),
                                 ('hours', 'h'), ('days', 'd'),
                                 ('weeks', 'w')):
                if unit in bucket_interval:
                    self.rules['bucket_interval_period'] = (
                        str(bucket_interval[unit]) + suffix)
                    break
            else:
                raise EAException("Unsupported window size")

            # Which window must align with the bucket size depends on whether
            # run_every is used as the query size.
            if self.rules.get('use_run_every_query_size'):
                window_key = 'run_every'
                message = "run_every must be evenly divisible by bucket_interval if specified"
            else:
                window_key = 'buffer_time'
                message = "Buffer_time must be evenly divisible by bucket_interval if specified"
            if total_seconds(self.rules[window_key]) % total_seconds(
                    self.rules['bucket_interval_timedelta']) != 0:
                raise EAException(message)
Esempio n. 4
0
    def _calc_sum_avg_std(self, values):
        """Return (sum, average, standard deviation) of a sequence of
        timedeltas, or (None, None, None) when there are no usable values.
        """
        def timedelta_avg(vals):
            # Average as a timedelta: total time divided by the value count.
            return self._sum_times(vals)/self._count_times(vals)

        if self._count_times(values) == 0:
            return (None, None, None)
        total = self._sum_times(values)
        avg = timedelta_avg(values)
        # FIX: use a list comprehension instead of map(). timedelta_avg()
        # iterates its argument twice (sum, then count); under Python 3
        # map() returns a one-shot iterator, so the second pass would see
        # it empty and divide by zero. A list is equivalent on Python 2.
        variance = [timedelta(seconds=math.pow(total_seconds(x - avg), 2))
                    for x in self._non_nones(values)]
        std = timedelta(seconds=math.sqrt(total_seconds(timedelta_avg(variance))))
        return (total, avg, std)
Esempio n. 5
0
    def _calc_sum_avg_std(self, values):
        """Return (sum, average, standard deviation) of a sequence of
        timedeltas, or (None, None, None) when there are no usable values.
        """
        def timedelta_avg(vals):
            # Average as a timedelta: total time divided by the value count.
            return self._sum_times(vals) / self._count_times(vals)

        if self._count_times(values) == 0:
            return (None, None, None)
        total = self._sum_times(values)
        avg = timedelta_avg(values)
        # FIX: use a list comprehension instead of map(). timedelta_avg()
        # iterates its argument twice (sum, then count); under Python 3
        # map() returns a one-shot iterator, so the second pass would see
        # it empty and divide by zero. A list is equivalent on Python 2.
        variance = [
            timedelta(seconds=math.pow(total_seconds(x - avg), 2))
            for x in self._non_nones(values)
        ]
        std = timedelta(
            seconds=math.sqrt(total_seconds(timedelta_avg(variance))))
        return (total, avg, std)
Esempio n. 6
0
 def _get_groups_in_seconds(self, data_array, columns=(3, None)):
     """Gets dates in seconds for each of the experiment's groups"""
     # Convert every timedelta in every dimension of every group to seconds.
     return [
         [[total_seconds(value) for value in dimension]
          for dimension in raw_group]
         for raw_group in self.get_groups(data_array, columns)
     ]
Esempio n. 7
0
 def _get_groups_in_seconds(self, data_array, columns=(3, None)):
     """Gets dates in seconds for each of the experiment's groups"""
     # Convert every timedelta in every dimension of every group to seconds.
     converted = []
     for raw_group in self.get_groups(data_array, columns):
         converted.append([[total_seconds(value) for value in dimension]
                           for dimension in raw_group])
     return converted
Esempio n. 8
0
    def __init__(self, *args):
        """Translate the rule's bucket_interval into an aggregation period
        string and validate that the query window divides evenly into
        buckets."""
        super(BaseAggregationRule, self).__init__(*args)
        bucket_interval = self.rules.get('bucket_interval')
        if bucket_interval:
            # Map the first matching time unit onto its period suffix.
            for unit, suffix in (('seconds', 's'), ('minutes', 'm'),
                                 ('hours', 'h'), ('days', 'd'),
                                 ('weeks', 'w')):
                if unit in bucket_interval:
                    self.rules['bucket_interval_period'] = str(bucket_interval[unit]) + suffix
                    break
            else:
                raise EAException("Unsupported window size")

            # Which window must align with the bucket size depends on whether
            # run_every is used as the query size.
            if self.rules.get('use_run_every_query_size'):
                window_key = 'run_every'
                message = "run_every must be evenly divisible by bucket_interval if specified"
            else:
                window_key = 'buffer_time'
                message = "Buffer_time must be evenly divisible by bucket_interval if specified"
            if total_seconds(self.rules[window_key]) % total_seconds(self.rules['bucket_interval_timedelta']) != 0:
                raise EAException(message)
Esempio n. 9
0
    def read(self, file_ref, tape_idx=None):
        '''Context manager produces a file-like for reading.

        Copies ``file_ref`` from tape into a local spool file, yields an open
        handle to that spool file, and removes the spool file (and releases
        the tracked read size) once the caller is done with it.

        NOTE(review): this is a generator, so a ``contextlib.contextmanager``
        decorator is presumably applied outside this view -- confirm at the
        definition site.
        '''
        if not self._is_init:
            raise ValueError("The spool has not been initialized")

        self._check_spool(file_ref)

        # Account for the spool space consumed by this in-flight read.
        self._open_read_size += file_ref.size
        file_path = self.get_spooled_path(file_ref, read=True)

        # Read the file from tape and write to the spool file
        try:
            with self.tape_mgr.read_file(file_ref, tape_idx) as src_f:
                # NOTE(review): the spool file is opened in text mode
                # ('w' here, 'r' below); if tape contents can be binary this
                # may corrupt data on some platforms -- confirm whether
                # 'wb'/'rb' is intended.
                with open(file_path, 'w') as dest_f:
                    start = datetime.now()
                    while True:
                        buf = src_f.read(self.block_size)
                        dest_f.write(buf)
                        # A short read marks the end of the source file.
                        if len(buf) < self.block_size:
                            break
                    end = datetime.now()
        except BaseException:
            # Remove the partial spool file and undo the size accounting,
            # then re-raise (BaseException so KeyboardInterrupt cleans up too).
            os.remove(file_path)
            self._open_read_size -= file_ref.size
            raise
        # NOTE(review): if the copy finishes within clock resolution this
        # divides by zero -- consider guarding total_seconds(end - start).
        bytes_per_sec = file_ref.size / total_seconds(end - start)
        logger.info('Read file %s (%d bytes) from tape at: %s',
                    file_ref.name,
                    file_ref.size,
                    get_readable_bw(bytes_per_sec))

        # Open the spool file for reading and yield it
        f = open(file_path, 'r')
        try:
            yield f
        finally:
            # Always release the spool file and its accounted size, even if
            # the consumer raised while using the handle.
            f.close()
            os.remove(file_path)
            self._open_read_size -= file_ref.size
Esempio n. 10
0
def query_topdiffspender(config, start, end, param):
    """Build and run the top-diff-spender SQL for today vs. yesterday vs.
    last week, returning the fetched rows."""

    def to_epoch(moment):
        # UTC epoch seconds for a datetime.
        return calendar.timegm(moment.timetuple())

    today_start = to_epoch(start)
    ystd_start = to_epoch(start - timedelta(days=1))
    lastwk_start = to_epoch(start - timedelta(days=7))

    # Clamp the window end to what the table actually contains.
    db_end = util.query_end_time(param['table'],
                                 config,
                                 timezone=param['timezone'])
    end = min(end, db_end)
    # Align today's end down to a 15-minute boundary.
    today_end = to_epoch(end) / 900 * 900

    ystd_end = to_epoch(end - timedelta(days=1))
    lastwk_end = to_epoch(end - timedelta(days=7))

    # Render every boundary into the query-parameter dict.
    for key, stamp in (('today_start', today_start),
                       ('ystd_start', ystd_start),
                       ('lastwk_start', lastwk_start),
                       ('today_end', today_end),
                       ('ystd_end', ystd_end),
                       ('lastwk_end', lastwk_end)):
        param[key] = util.time2str(stamp, DATE_FORMAT)

    fields = 'time,id,today_spend,ystd_spend,lastwk_spend,today_click,ystd_click,lastwk_click,today_imp,ystd_imp,lastwk_imp,today_serve,ystd_serve,lastwk_serve,dod_delta,wow_delta,tot_3day_spend,adv'.split(
        ',')

    if param.get('tabid') == 'native_topdiffsection':
        command = SECTION_DIFF_TEMPLATE.format(**param)
    else:
        param['today_query'] = iterate_day(today_start, today_end, param,
                                           TODAY_TEMPLATE)
        param['ystd_query'] = iterate_day(ystd_start, ystd_end, param,
                                          YEST_TEMPLATE)
        param['lastwk_query'] = iterate_day(lastwk_start, lastwk_end, param,
                                            LASTWK_TEMPLATE)
        command = CMPGN_ADV_DIFF_TEMPLATE.format(**param)

    return util.fetch_sql_data(config, fields, command,
                               -util.total_seconds(timedelta(minutes=5)))
Esempio n. 11
0
    def run_tests(self):
        """Processes all the data, runs the statistical tests, and returns the results"""
        import numpy as np
        from stats import get_mww, get_ttest_equal_var, get_ttest_diff_var, get_levene, get_simple_stats, get_shapiro

        calculations = []

        headers_task_durations = ['group', 'user', 'task', 'duration']
        headers_activity_times = ['group', 'user'] + Measurements.to_list()
        tasks_data = ASADataSet(headers_task_durations, [])
        tasks_data_secs = ASADataSet(headers_task_durations, [])
        activities_data = ASADataSet(headers_activity_times, [])
        activities_data_secs = ASADataSet(headers_activity_times, [])
        stdout.write("Running tests for experiment '{0}'\n".format(self.name))
        stdout.write("Loading tasks: ")
        stdout.flush()
        for group in self.groups:
            tasks_rows = [(group.name,) + row for row in group.get_task_times()]
            tasks_data.data.extend(tasks_rows)
            tasks_data_secs.data.extend([row[0:3] + tuple(total_seconds(v) for v in row[3:]) for row in tasks_rows])
            stdout.write(".")
            stdout.flush()
        calculations.append(("task_times", tasks_data))
        calculations.append(("task_times_secs", tasks_data_secs))
        stdout.write(" [finished]\n")
        stdout.flush()

        stdout.write("Loading activities: ")
        stdout.flush()
        for group in self.groups:
            activities_rows = [[group.name] + row for row in group.get_activity_times()]
            activities_data.data.extend(activities_rows)
            activities_data_secs.data.extend([row[0:2] + [total_seconds(v) for v in row[2:]] for row in activities_rows])
            stdout.write(".")
            stdout.flush()
        calculations.append(("activity_times", activities_data))
        calculations.append(("activity_times_secs", activities_data_secs))
        stdout.write(" [finished]\n")
        stdout.flush()


        ###### Run statistical tests ######
        stdout.write("Running statistical tests")
        stdout.flush()

        tasks_times_array = np.array(tasks_data.data)
        activity_times_array = np.array(activities_data.data)

        sstats_headers = ['group', 'sum', 'average', 'mean', 'median', 'std', 'var', 'count']
        mww_headers = ['u', 'p_value_twotailed', 'p_value_lessthan', 'p_value_greaterthan']
        ttest_headers = ['t_twotailed', 'p_value_twotailed', 't_lessthan', 'p_value_lessthan', 't_greaterthan', 'p_value_greaterthan', 'for_variance']
        levene_headers = ['p_value', 'w']
        shapiro_headers = ['group', 'activity', 'p_value', 'w']

        ### task tests

        sstats_results = ASADataSet(['task'] + sstats_headers, [])
        mww_results = ASADataSet(['task'] + mww_headers, [])
        ttest_results = ASADataSet(['task'] + ttest_headers, [])
        levene_results = ASADataSet(['task'] + levene_headers, [])

        tasks_names = sorted(set(tasks_times_array[:, 2]))
        for task_name in tasks_names:
            task_array = tasks_times_array[tasks_times_array[:, 2] == task_name]
            groups_data = self._get_groups_in_seconds(task_array)
            group1_data = groups_data[0][0] # one "dimension" assumed, as task times imply only one column
            group2_data = groups_data[1][0] # one "dimension" assumed, as task times imply only one column

            sstats_results.data.append((task_name, self.groups[0].name) + get_simple_stats(group1_data))
            sstats_results.data.append((task_name, self.groups[1].name) + get_simple_stats(group2_data))
            mww_results.data.append((task_name,) + get_mww(group1_data, group2_data))

            tasks_levene_result = get_levene(group1_data, group2_data)
            levene_results.data.append((task_name,) + tasks_levene_result)
            if tasks_levene_result[0] > 0.05: # equal variance
                ttest_results.data.append((task_name,) + get_ttest_equal_var(group1_data, group2_data) + ("equal",))
            else:
                ttest_results.data.append((task_name,) + get_ttest_diff_var(group1_data, group2_data) + ("diff",))


        calculations.append(("task_times_sstats", sstats_results))
        calculations.append(("task_times_mww_test", mww_results))
        calculations.append(("task_times_ttest_test", ttest_results))
        calculations.append(("task_times_levene_test", levene_results))

        #### totals task times tests

        groups_data = self._get_groups_in_seconds(tasks_times_array)
        group1_data = groups_data[0][0] # one "dimension" assumed, as task times imply only one column
        group2_data = groups_data[1][0] # one "dimension" assumed, as task times imply only one column
        calculations.append(("total_task_times_sstats", ASADataSet(sstats_headers, [(self.groups[0].name,) + get_simple_stats(group1_data), (self.groups[1].name,) + get_simple_stats(group2_data)])))
        calculations.append(("total_task_times_mww_test", ASADataSet(mww_headers, [get_mww(group1_data, group2_data)])))
        total_levene_result = [get_levene(group1_data, group2_data)]
        calculations.append(("total_task_times_levene_test", ASADataSet(levene_headers, total_levene_result)))
        if total_levene_result[0] > 0.05: # equal variance
            calculations.append(("total_task_times_ttest_test", ASADataSet(ttest_headers, [get_ttest_equal_var(group1_data, group2_data) + ("equal",)])))
        else:
            calculations.append(("total_task_times_ttest_test", ASADataSet(ttest_headers, [get_ttest_diff_var(group1_data, group2_data) + ("diff",)])))


        #### totals task times tests per subject
        ### (i.e., times that subjects took working on the entirety of the tasks, rather than the times they took on each task)

        from pandas import DataFrame
        total_task_times = np.array(DataFrame([row[0:2] + row[3:] for row in tasks_times_array.tolist()]).groupby([0,1], as_index=False).aggregate(np.sum).to_records(index=False).tolist())
        calculations.append(("total_task_times_persubject", ASADataSet(['group', 'user', 'duration'], total_task_times)))

        groups_data = self._get_groups_in_seconds(total_task_times, columns=(2,3))
        group1_data = groups_data[0][0] # one "dimension" assumed, as task times imply only one column
        group2_data = groups_data[1][0] # one "dimension" assumed, as task times imply only one column
        calculations.append(("total_task_times_persubject_sstats", ASADataSet(sstats_headers,
            [(self.groups[0].name,) + get_simple_stats(group1_data), (self.groups[1].name,) + get_simple_stats(group2_data)])))
        calculations.append(("total_task_times_persubject_mww_test", ASADataSet(mww_headers, [get_mww(group1_data, group2_data)])))
        total_levene_result = [get_levene(group1_data, group2_data)]
        calculations.append(("total_task_times_persubject_levene_test", ASADataSet(levene_headers, total_levene_result)))
        if total_levene_result[0] > 0.05: # equal variance
            calculations.append(("total_task_times_persubject_ttest_test", ASADataSet(ttest_headers, [get_ttest_equal_var(group1_data, group2_data) + ("equal",)])))
        else:
            calculations.append(("total_task_times_persubject_ttest_test", ASADataSet(ttest_headers, [get_ttest_diff_var(group1_data, group2_data) + ("diff",)])))


        #### activity tests

        # [group, user, wiki_view, wiki_edit, search, asa_artifact_view, asa_artifact_edit, asa_index, asa_search]
        groups_data = self._get_groups_in_seconds(activity_times_array, columns=(2,None))
        group1_data = groups_data[0]
        group2_data = groups_data[1]
        intermediate_calcs = {
            "activity_times_sstats": ASADataSet(sstats_headers[:1] + ['activity'] + sstats_headers[1:], []),
            "activity_times_shapiro_test": ASADataSet(shapiro_headers, []),
            "activity_times_mww_test": ASADataSet(['activity'] + mww_headers, []),
            "activity_times_ttest_test": ASADataSet(['activity'] + ttest_headers, []),
            "activity_times_levene_test": ASADataSet(['activity'] + levene_headers, []),
        }
        for measurement_id, measurement in Measurements.to_ids_list_that_matter():
            intermediate_calcs["activity_times_sstats"].data.extend(
                [
                    (self.groups[0].name, measurement) + get_simple_stats(group1_data[measurement_id]),
                    (self.groups[1].name, measurement) + get_simple_stats(group2_data[measurement_id])
                ]
            )

            import warnings
            with warnings.catch_warnings(record=True) as w: # catch warnings
                intermediate_calcs["activity_times_shapiro_test"].data.append(
                    (self.groups[0].name, measurement) + get_shapiro(group1_data[measurement_id])
                )
                if len(w) > 0:
                    print '\x1b[31m' + "\n... Warning running shapiro-wilk on '{0}' for group '{1}': {2}".format(measurement, self.groups[0].name, w[-1].message) + '\033[0m'
            with warnings.catch_warnings(record=True) as w: # catch warnings
                intermediate_calcs["activity_times_shapiro_test"].data.append(
                    (self.groups[1].name, measurement) + get_shapiro(group2_data[measurement_id])
                )
                if len(w) > 0:
                    print '\x1b[31m' + "\n... Warning running shapiro-wilk on '{0}' for group '{1}': {2}".format(measurement, self.groups[1].name, w[-1].message) + '\033[0m'

            try:
                intermediate_calcs["activity_times_mww_test"].data.append(
                    (measurement,) + get_mww(group1_data[measurement_id], group2_data[measurement_id])
                )
            except ValueError:
                # get_mww() returns a ValueError when the values on both groups are the same
                print "MWW raised a ValueError. Values on both groups are the same?"
                intermediate_calcs["activity_times_mww_test"].data.append((measurement, None, None))

            activities_levene_result = get_levene(group1_data[measurement_id], group2_data[measurement_id])
            intermediate_calcs["activity_times_levene_test"].data.append(
                (measurement,) + activities_levene_result
            )
            if activities_levene_result[0] > 0.05: # equal variance
                intermediate_calcs["activity_times_ttest_test"].data.append(
                    (measurement,) + get_ttest_equal_var(group1_data[measurement_id], group2_data[measurement_id]) + ("equal",)
                )
            else:
                intermediate_calcs["activity_times_ttest_test"].data.append(
                    (measurement,) + get_ttest_diff_var(group1_data[measurement_id], group2_data[measurement_id]) + ("diff",)
                )
            measurement_id += 1

        for icalc_tpl in intermediate_calcs.iteritems():
            calculations.append(icalc_tpl)


        #### activity times by issue tests
        intermediate_calcs = {
            "issues_activity_times": ASADataSet(['group', 'user', 'duration_i2', 'duration_i6'], []),
            "issues_activity_times_sstats": ASADataSet(sstats_headers[:1] + ['issue'] + sstats_headers[1:], []),
            "issues_activity_times_mww_test": ASADataSet(['issue'] + mww_headers, []),
            "issues_activity_times_levene_test": ASADataSet(['issue'] + levene_headers, []),
            "issues_activity_times_ttest_test": ASADataSet(['issue'] + ttest_headers, [])
        }

        issues_activity_times = np.array([np.concatenate((row[0:2], [sum(row[[2,3,5,6]], timedelta())], [sum(row[[4,7,8]], timedelta())])) for row in activity_times_array]).tolist()
        intermediate_calcs["issues_activity_times"].data.extend(issues_activity_times)

        groups_data = self._get_groups_in_seconds(np.array(issues_activity_times), (2, None))
        for idx, name in [(0, "understanding"), (1, "finding")]:
            group1_data = groups_data[0][idx]
            group2_data = groups_data[1][idx]
            intermediate_calcs["issues_activity_times_sstats"].data.extend(
                [(self.groups[0].name, name) + get_simple_stats(group1_data),
                 (self.groups[1].name, name) + get_simple_stats(group2_data)]
            )
            intermediate_calcs["issues_activity_times_mww_test"].data.extend(
                [(name,) + get_mww(group1_data, group2_data)]
            )
            issues_levene_result = get_levene(group1_data, group2_data)
            intermediate_calcs["issues_activity_times_levene_test"].data.extend(
                [(name,) +  issues_levene_result]
            )
            if issues_levene_result[0] > 0.05: # equal variance
                intermediate_calcs["issues_activity_times_ttest_test"].data.extend(
                    [(name,) + get_ttest_equal_var(group1_data, group2_data) + ("equal",)]
                )
            else:
                intermediate_calcs["issues_activity_times_ttest_test"].data.extend(
                    [(name,) + get_ttest_equal_var(group1_data, group2_data) + ("diff",)]
                )

        for icalc_tpl in intermediate_calcs.iteritems():
            calculations.append(icalc_tpl)


        #### totals activity times tests

        total_activity_times = np.array([np.concatenate((row[0:2], [sum(row[2:], timedelta())])) for row in activity_times_array]).tolist()
        calculations.append(("total_activity_times", ASADataSet(['group', 'user', 'duration'], total_activity_times)))

        groups_data = self._get_groups_in_seconds(np.array(total_activity_times), (2, None))
        group1_data = groups_data[0][0]
        group2_data = groups_data[1][0]
        calculations.append(("total_activity_times_sstats", ASADataSet(sstats_headers,
            [(self.groups[0].name,) + get_simple_stats(group1_data), (self.groups[1].name,) + get_simple_stats(group2_data)])))
        calculations.append(("total_activity_times_mww_test", ASADataSet(mww_headers, [get_mww(group1_data, group2_data)])))
        total_levene_result = get_levene(group1_data, group2_data)
        calculations.append(("total_activity_times_levene_test", ASADataSet(levene_headers, [total_levene_result])))
        if total_levene_result[0] > 0.05: # equal variance
            calculations.append(("total_activity_times_ttest_test", ASADataSet(ttest_headers, [get_ttest_equal_var(group1_data, group2_data) + ("equal",)])))
        else:
            calculations.append(("total_activity_times_ttest_test", ASADataSet(ttest_headers, [get_ttest_diff_var(group1_data, group2_data) + ("diff",)])))


        # questionnaires

        questionnaire_questions = ASADataSet(['question_number', 'question'], self.get_questionnaire_questions())
        questionnaire_hypothesis = ASADataSet(['question_number', 'hypothesis'], self.get_questionnaire_hypothesis())
        questionnaire_histogram = ASADataSet(['group', 'question', '1', '2', '3', '4', '5'], [])
        questionnaire_one_answer_per_row = ASADataSet(['group', 'user', 'question', 'answer'], [])
        questionnaire_one_answer_per_column = ASADataSet([], [])
        questionnaire_sstats = ASADataSet(['question'] + sstats_headers, [])
        questionnaire_mww_results = ASADataSet(['question'] + mww_headers, [])
        questionnaire_ttest_results = ASADataSet(['question'] + ttest_headers, [])
        questionnaire_levene_results = ASADataSet(['question'] + levene_headers, [])

        def get_question_and_answers(questionnaire_row):
            question = questionnaire_row[0]
            answers = questionnaire_row[1:-6]
            if type(answers[0]) is float: # discard questions with a non-numeric answer
                answers = [int(answer) if type(answer) is float else answer for answer in answers] # floats become ints
                answers_noned = [a if not a == "" else None for a in answers] # replace missing data values with None
                answers = [a for a in answers if not a == ""] # discard missing data values
                return question, answers, answers_noned
            return question, None, None

        group1_name = self.groups[0].name
        group2_name = self.groups[1].name
        group1_subjects = self.groups[0].get_questionnaire_subjects()
        group1_data = self.groups[0].get_questionnaire_questions_and_answers()
        group2_subjects = self.groups[1].get_questionnaire_subjects()
        group2_data = self.groups[1].get_questionnaire_questions_and_answers()

        questionnaire_one_answer_per_column.headers = ['question'] + group1_subjects + group2_subjects

        if not group1_data is None:
            for i in range(len(group1_data)): # for each question
                question_g1, answers_g1, answers_g1_noned = get_question_and_answers(group1_data[i])
                question_g2, answers_g2, answers_g2_noned = get_question_and_answers(group2_data[i])
                assert question_g1 == question_g2
                if  answers_g1 is None or answers_g2 is None:
                    continue
                for i in range(len(group1_subjects)):
                    if not answers_g1_noned[i] is None:
                        questionnaire_one_answer_per_row.data.append((group1_name, group1_subjects[i], question_g1, answers_g1_noned[i]))
                for i in range(len(group2_subjects)):
                    if not answers_g2_noned[i] is None:
                        questionnaire_one_answer_per_row.data.append((group2_name, group2_subjects[i], question_g2, answers_g2_noned[i]))

                questionnaire_one_answer_per_column.data.append((question_g1,) + tuple(answers_g1_noned + answers_g2_noned))
                questionnaire_histogram.data.append((group1_name, question_g1) + tuple(np.bincount(np.array(answers_g1), minlength=6)[1:]))
                questionnaire_histogram.data.append((group2_name, question_g2) + tuple(np.bincount(np.array(answers_g2), minlength=6)[1:]))
                questionnaire_sstats.data.append((question_g1, group1_name) + get_simple_stats(answers_g1))
                questionnaire_sstats.data.append((question_g2, group2_name) + get_simple_stats(answers_g2))
                questionnaire_mww_results.data.append((question_g1,) + get_mww(answers_g1, answers_g2))
                quest_levene_result = get_levene(answers_g1, answers_g2)
                questionnaire_levene_results.data.append((question_g1,) + quest_levene_result)
                if quest_levene_result[0] > 0.05: # equal variance
                    questionnaire_ttest_results.data.append((question_g1,) + get_ttest_equal_var(answers_g1, answers_g2) + ("equal",))
                else:
                    questionnaire_ttest_results.data.append((question_g1,) + get_ttest_diff_var(answers_g1, answers_g2) + ("diff",))

        calculations.append(("questionnaire_questions", questionnaire_questions))
        calculations.append(("questionnaire_hypothesis", questionnaire_hypothesis))
        calculations.append(("questionnaire_histogram", questionnaire_histogram))
        calculations.append(("questionnaire_one_answer_per_row", questionnaire_one_answer_per_row))
        calculations.append(("questionnaire_one_answer_per_column", questionnaire_one_answer_per_column))
        calculations.append(("questionnaire_sstats", questionnaire_sstats))
        calculations.append(("questionnaire_mww_test", questionnaire_mww_results))
        calculations.append(("questionnaire_levene_test", questionnaire_levene_results))
        calculations.append(("questionnaire_ttest_test", questionnaire_ttest_results))

        stdout.write(" [finished]\n")
        stdout.flush()

        return ASAExperimentCalculations(self, calculations)
Esempio n. 12
0
    def run_tests(self):
        """Processes all the data, runs the statistical tests, and returns the results"""
        import numpy as np
        from stats import get_mww, get_ttest_equal_var, get_ttest_diff_var, get_levene, get_simple_stats, get_shapiro

        calculations = []

        headers_task_durations = ['group', 'user', 'task', 'duration']
        headers_activity_times = ['group', 'user'] + Measurements.to_list()
        tasks_data = ASADataSet(headers_task_durations, [])
        tasks_data_secs = ASADataSet(headers_task_durations, [])
        activities_data = ASADataSet(headers_activity_times, [])
        activities_data_secs = ASADataSet(headers_activity_times, [])
        stdout.write("Running tests for experiment '{0}'\n".format(self.name))
        stdout.write("Loading tasks: ")
        stdout.flush()
        for group in self.groups:
            tasks_rows = [(group.name, ) + row
                          for row in group.get_task_times()]
            tasks_data.data.extend(tasks_rows)
            tasks_data_secs.data.extend([
                row[0:3] + tuple(total_seconds(v) for v in row[3:])
                for row in tasks_rows
            ])
            stdout.write(".")
            stdout.flush()
        calculations.append(("task_times", tasks_data))
        calculations.append(("task_times_secs", tasks_data_secs))
        stdout.write(" [finished]\n")
        stdout.flush()

        stdout.write("Loading activities: ")
        stdout.flush()
        for group in self.groups:
            activities_rows = [[group.name] + row
                               for row in group.get_activity_times()]
            activities_data.data.extend(activities_rows)
            activities_data_secs.data.extend([
                row[0:2] + [total_seconds(v) for v in row[2:]]
                for row in activities_rows
            ])
            stdout.write(".")
            stdout.flush()
        calculations.append(("activity_times", activities_data))
        calculations.append(("activity_times_secs", activities_data_secs))
        stdout.write(" [finished]\n")
        stdout.flush()

        ###### Run statistical tests ######
        stdout.write("Running statistical tests")
        stdout.flush()

        tasks_times_array = np.array(tasks_data.data)
        activity_times_array = np.array(activities_data.data)

        sstats_headers = [
            'group', 'sum', 'average', 'mean', 'median', 'std', 'var', 'count'
        ]
        mww_headers = [
            'u', 'p_value_twotailed', 'p_value_lessthan', 'p_value_greaterthan'
        ]
        ttest_headers = [
            't_twotailed', 'p_value_twotailed', 't_lessthan',
            'p_value_lessthan', 't_greaterthan', 'p_value_greaterthan',
            'for_variance'
        ]
        levene_headers = ['p_value', 'w']
        shapiro_headers = ['group', 'activity', 'p_value', 'w']

        ### task tests

        sstats_results = ASADataSet(['task'] + sstats_headers, [])
        mww_results = ASADataSet(['task'] + mww_headers, [])
        ttest_results = ASADataSet(['task'] + ttest_headers, [])
        levene_results = ASADataSet(['task'] + levene_headers, [])

        tasks_names = sorted(set(tasks_times_array[:, 2]))
        for task_name in tasks_names:
            task_array = tasks_times_array[tasks_times_array[:,
                                                             2] == task_name]
            groups_data = self._get_groups_in_seconds(task_array)
            group1_data = groups_data[0][
                0]  # one "dimension" assumed, as task times imply only one column
            group2_data = groups_data[1][
                0]  # one "dimension" assumed, as task times imply only one column

            sstats_results.data.append((task_name, self.groups[0].name) +
                                       get_simple_stats(group1_data))
            sstats_results.data.append((task_name, self.groups[1].name) +
                                       get_simple_stats(group2_data))
            mww_results.data.append((task_name, ) +
                                    get_mww(group1_data, group2_data))

            tasks_levene_result = get_levene(group1_data, group2_data)
            levene_results.data.append((task_name, ) + tasks_levene_result)
            if tasks_levene_result[0] > 0.05:  # equal variance
                ttest_results.data.append(
                    (task_name, ) +
                    get_ttest_equal_var(group1_data, group2_data) +
                    ("equal", ))
            else:
                ttest_results.data.append(
                    (task_name, ) +
                    get_ttest_diff_var(group1_data, group2_data) + ("diff", ))

        calculations.append(("task_times_sstats", sstats_results))
        calculations.append(("task_times_mww_test", mww_results))
        calculations.append(("task_times_ttest_test", ttest_results))
        calculations.append(("task_times_levene_test", levene_results))

        #### totals task times tests

        groups_data = self._get_groups_in_seconds(tasks_times_array)
        group1_data = groups_data[0][
            0]  # one "dimension" assumed, as task times imply only one column
        group2_data = groups_data[1][
            0]  # one "dimension" assumed, as task times imply only one column
        calculations.append(
            ("total_task_times_sstats",
             ASADataSet(
                 sstats_headers,
                 [(self.groups[0].name, ) + get_simple_stats(group1_data),
                  (self.groups[1].name, ) + get_simple_stats(group2_data)])))
        calculations.append(("total_task_times_mww_test",
                             ASADataSet(mww_headers,
                                        [get_mww(group1_data, group2_data)])))
        total_levene_result = [get_levene(group1_data, group2_data)]
        calculations.append(("total_task_times_levene_test",
                             ASADataSet(levene_headers, total_levene_result)))
        if total_levene_result[0] > 0.05:  # equal variance
            calculations.append(
                ("total_task_times_ttest_test",
                 ASADataSet(ttest_headers, [
                     get_ttest_equal_var(group1_data, group2_data) +
                     ("equal", )
                 ])))
        else:
            calculations.append(
                ("total_task_times_ttest_test",
                 ASADataSet(ttest_headers, [
                     get_ttest_diff_var(group1_data, group2_data) + ("diff", )
                 ])))

        #### totals task times tests per subject
        ### (i.e., times that subjects took working on the entirety of the tasks, rather than the times they took on each task)

        from pandas import DataFrame
        total_task_times = np.array(
            DataFrame([
                row[0:2] + row[3:] for row in tasks_times_array.tolist()
            ]).groupby([0, 1], as_index=False).aggregate(
                np.sum).to_records(index=False).tolist())
        calculations.append(("total_task_times_persubject",
                             ASADataSet(['group', 'user', 'duration'],
                                        total_task_times)))

        groups_data = self._get_groups_in_seconds(total_task_times,
                                                  columns=(2, 3))
        group1_data = groups_data[0][
            0]  # one "dimension" assumed, as task times imply only one column
        group2_data = groups_data[1][
            0]  # one "dimension" assumed, as task times imply only one column
        calculations.append(
            ("total_task_times_persubject_sstats",
             ASADataSet(
                 sstats_headers,
                 [(self.groups[0].name, ) + get_simple_stats(group1_data),
                  (self.groups[1].name, ) + get_simple_stats(group2_data)])))
        calculations.append(("total_task_times_persubject_mww_test",
                             ASADataSet(mww_headers,
                                        [get_mww(group1_data, group2_data)])))
        total_levene_result = [get_levene(group1_data, group2_data)]
        calculations.append(("total_task_times_persubject_levene_test",
                             ASADataSet(levene_headers, total_levene_result)))
        if total_levene_result[0] > 0.05:  # equal variance
            calculations.append(
                ("total_task_times_persubject_ttest_test",
                 ASADataSet(ttest_headers, [
                     get_ttest_equal_var(group1_data, group2_data) +
                     ("equal", )
                 ])))
        else:
            calculations.append(
                ("total_task_times_persubject_ttest_test",
                 ASADataSet(ttest_headers, [
                     get_ttest_diff_var(group1_data, group2_data) + ("diff", )
                 ])))

        #### activity tests

        # [group, user, wiki_view, wiki_edit, search, asa_artifact_view, asa_artifact_edit, asa_index, asa_search]
        groups_data = self._get_groups_in_seconds(activity_times_array,
                                                  columns=(2, None))
        group1_data = groups_data[0]
        group2_data = groups_data[1]
        intermediate_calcs = {
            "activity_times_sstats":
            ASADataSet(sstats_headers[:1] + ['activity'] + sstats_headers[1:],
                       []),
            "activity_times_shapiro_test":
            ASADataSet(shapiro_headers, []),
            "activity_times_mww_test":
            ASADataSet(['activity'] + mww_headers, []),
            "activity_times_ttest_test":
            ASADataSet(['activity'] + ttest_headers, []),
            "activity_times_levene_test":
            ASADataSet(['activity'] + levene_headers, []),
        }
        for measurement_id, measurement in Measurements.to_ids_list_that_matter(
        ):
            intermediate_calcs["activity_times_sstats"].data.extend([
                (self.groups[0].name, measurement) +
                get_simple_stats(group1_data[measurement_id]),
                (self.groups[1].name, measurement) +
                get_simple_stats(group2_data[measurement_id])
            ])

            import warnings
            with warnings.catch_warnings(record=True) as w:  # catch warnings
                intermediate_calcs["activity_times_shapiro_test"].data.append(
                    (self.groups[0].name, measurement) +
                    get_shapiro(group1_data[measurement_id]))
                if len(w) > 0:
                    print '\x1b[31m' + "\n... Warning running shapiro-wilk on '{0}' for group '{1}': {2}".format(
                        measurement, self.groups[0].name,
                        w[-1].message) + '\033[0m'
            with warnings.catch_warnings(record=True) as w:  # catch warnings
                intermediate_calcs["activity_times_shapiro_test"].data.append(
                    (self.groups[1].name, measurement) +
                    get_shapiro(group2_data[measurement_id]))
                if len(w) > 0:
                    print '\x1b[31m' + "\n... Warning running shapiro-wilk on '{0}' for group '{1}': {2}".format(
                        measurement, self.groups[1].name,
                        w[-1].message) + '\033[0m'

            try:
                intermediate_calcs["activity_times_mww_test"].data.append(
                    (measurement, ) + get_mww(group1_data[measurement_id],
                                              group2_data[measurement_id]))
            except ValueError:
                # get_mww() returns a ValueError when the values on both groups are the same
                print "MWW raised a ValueError. Values on both groups are the same?"
                intermediate_calcs["activity_times_mww_test"].data.append(
                    (measurement, None, None))

            activities_levene_result = get_levene(group1_data[measurement_id],
                                                  group2_data[measurement_id])
            intermediate_calcs["activity_times_levene_test"].data.append(
                (measurement, ) + activities_levene_result)
            if activities_levene_result[0] > 0.05:  # equal variance
                intermediate_calcs["activity_times_ttest_test"].data.append(
                    (measurement, ) +
                    get_ttest_equal_var(group1_data[measurement_id],
                                        group2_data[measurement_id]) +
                    ("equal", ))
            else:
                intermediate_calcs["activity_times_ttest_test"].data.append(
                    (measurement, ) +
                    get_ttest_diff_var(group1_data[measurement_id],
                                       group2_data[measurement_id]) +
                    ("diff", ))
            measurement_id += 1

        for icalc_tpl in intermediate_calcs.iteritems():
            calculations.append(icalc_tpl)

        #### activity times by issue tests
        intermediate_calcs = {
            "issues_activity_times":
            ASADataSet(['group', 'user', 'duration_i2', 'duration_i6'], []),
            "issues_activity_times_sstats":
            ASADataSet(sstats_headers[:1] + ['issue'] + sstats_headers[1:],
                       []),
            "issues_activity_times_mww_test":
            ASADataSet(['issue'] + mww_headers, []),
            "issues_activity_times_levene_test":
            ASADataSet(['issue'] + levene_headers, []),
            "issues_activity_times_ttest_test":
            ASADataSet(['issue'] + ttest_headers, [])
        }

        issues_activity_times = np.array([
            np.concatenate((row[0:2], [sum(row[[2, 3, 5, 6]], timedelta())],
                            [sum(row[[4, 7, 8]], timedelta())]))
            for row in activity_times_array
        ]).tolist()
        intermediate_calcs["issues_activity_times"].data.extend(
            issues_activity_times)

        groups_data = self._get_groups_in_seconds(
            np.array(issues_activity_times), (2, None))
        for idx, name in [(0, "understanding"), (1, "finding")]:
            group1_data = groups_data[0][idx]
            group2_data = groups_data[1][idx]
            intermediate_calcs["issues_activity_times_sstats"].data.extend([
                (self.groups[0].name, name) + get_simple_stats(group1_data),
                (self.groups[1].name, name) + get_simple_stats(group2_data)
            ])
            intermediate_calcs["issues_activity_times_mww_test"].data.extend([
                (name, ) + get_mww(group1_data, group2_data)
            ])
            issues_levene_result = get_levene(group1_data, group2_data)
            intermediate_calcs[
                "issues_activity_times_levene_test"].data.extend([
                    (name, ) + issues_levene_result
                ])
            if issues_levene_result[0] > 0.05:  # equal variance
                intermediate_calcs[
                    "issues_activity_times_ttest_test"].data.extend([
                        (name, ) +
                        get_ttest_equal_var(group1_data, group2_data) +
                        ("equal", )
                    ])
            else:
                intermediate_calcs[
                    "issues_activity_times_ttest_test"].data.extend([
                        (name, ) +
                        get_ttest_equal_var(group1_data, group2_data) +
                        ("diff", )
                    ])

        for icalc_tpl in intermediate_calcs.iteritems():
            calculations.append(icalc_tpl)

        #### totals activity times tests

        total_activity_times = np.array([
            np.concatenate((row[0:2], [sum(row[2:], timedelta())]))
            for row in activity_times_array
        ]).tolist()
        calculations.append(("total_activity_times",
                             ASADataSet(['group', 'user', 'duration'],
                                        total_activity_times)))

        groups_data = self._get_groups_in_seconds(
            np.array(total_activity_times), (2, None))
        group1_data = groups_data[0][0]
        group2_data = groups_data[1][0]
        calculations.append(
            ("total_activity_times_sstats",
             ASADataSet(
                 sstats_headers,
                 [(self.groups[0].name, ) + get_simple_stats(group1_data),
                  (self.groups[1].name, ) + get_simple_stats(group2_data)])))
        calculations.append(("total_activity_times_mww_test",
                             ASADataSet(mww_headers,
                                        [get_mww(group1_data, group2_data)])))
        total_levene_result = get_levene(group1_data, group2_data)
        calculations.append(("total_activity_times_levene_test",
                             ASADataSet(levene_headers,
                                        [total_levene_result])))
        if total_levene_result[0] > 0.05:  # equal variance
            calculations.append(
                ("total_activity_times_ttest_test",
                 ASADataSet(ttest_headers, [
                     get_ttest_equal_var(group1_data, group2_data) +
                     ("equal", )
                 ])))
        else:
            calculations.append(
                ("total_activity_times_ttest_test",
                 ASADataSet(ttest_headers, [
                     get_ttest_diff_var(group1_data, group2_data) + ("diff", )
                 ])))

        # questionnaires

        questionnaire_questions = ASADataSet(
            ['question_number', 'question'],
            self.get_questionnaire_questions())
        questionnaire_hypothesis = ASADataSet(
            ['question_number', 'hypothesis'],
            self.get_questionnaire_hypothesis())
        questionnaire_histogram = ASADataSet(
            ['group', 'question', '1', '2', '3', '4', '5'], [])
        questionnaire_one_answer_per_row = ASADataSet(
            ['group', 'user', 'question', 'answer'], [])
        questionnaire_one_answer_per_column = ASADataSet([], [])
        questionnaire_sstats = ASADataSet(['question'] + sstats_headers, [])
        questionnaire_mww_results = ASADataSet(['question'] + mww_headers, [])
        questionnaire_ttest_results = ASADataSet(['question'] + ttest_headers,
                                                 [])
        questionnaire_levene_results = ASADataSet(['question'] +
                                                  levene_headers, [])

        def get_question_and_answers(questionnaire_row):
            question = questionnaire_row[0]
            answers = questionnaire_row[1:-6]
            if type(answers[0]
                    ) is float:  # discard questions with a non-numeric answer
                answers = [
                    int(answer) if type(answer) is float else answer
                    for answer in answers
                ]  # floats become ints
                answers_noned = [a if not a == "" else None for a in answers
                                 ]  # replace missing data values with None
                answers = [a for a in answers
                           if not a == ""]  # discard missing data values
                return question, answers, answers_noned
            return question, None, None

        group1_name = self.groups[0].name
        group2_name = self.groups[1].name
        group1_subjects = self.groups[0].get_questionnaire_subjects()
        group1_data = self.groups[0].get_questionnaire_questions_and_answers()
        group2_subjects = self.groups[1].get_questionnaire_subjects()
        group2_data = self.groups[1].get_questionnaire_questions_and_answers()

        questionnaire_one_answer_per_column.headers = [
            'question'
        ] + group1_subjects + group2_subjects

        if not group1_data is None:
            for i in range(len(group1_data)):  # for each question
                question_g1, answers_g1, answers_g1_noned = get_question_and_answers(
                    group1_data[i])
                question_g2, answers_g2, answers_g2_noned = get_question_and_answers(
                    group2_data[i])
                assert question_g1 == question_g2
                if answers_g1 is None or answers_g2 is None:
                    continue
                for i in range(len(group1_subjects)):
                    if not answers_g1_noned[i] is None:
                        questionnaire_one_answer_per_row.data.append(
                            (group1_name, group1_subjects[i], question_g1,
                             answers_g1_noned[i]))
                for i in range(len(group2_subjects)):
                    if not answers_g2_noned[i] is None:
                        questionnaire_one_answer_per_row.data.append(
                            (group2_name, group2_subjects[i], question_g2,
                             answers_g2_noned[i]))

                questionnaire_one_answer_per_column.data.append(
                    (question_g1, ) +
                    tuple(answers_g1_noned + answers_g2_noned))
                questionnaire_histogram.data.append(
                    (group1_name, question_g1) +
                    tuple(np.bincount(np.array(answers_g1), minlength=6)[1:]))
                questionnaire_histogram.data.append(
                    (group2_name, question_g2) +
                    tuple(np.bincount(np.array(answers_g2), minlength=6)[1:]))
                questionnaire_sstats.data.append((question_g1, group1_name) +
                                                 get_simple_stats(answers_g1))
                questionnaire_sstats.data.append((question_g2, group2_name) +
                                                 get_simple_stats(answers_g2))
                questionnaire_mww_results.data.append(
                    (question_g1, ) + get_mww(answers_g1, answers_g2))
                quest_levene_result = get_levene(answers_g1, answers_g2)
                questionnaire_levene_results.data.append((question_g1, ) +
                                                         quest_levene_result)
                if quest_levene_result[0] > 0.05:  # equal variance
                    questionnaire_ttest_results.data.append(
                        (question_g1, ) +
                        get_ttest_equal_var(answers_g1, answers_g2) +
                        ("equal", ))
                else:
                    questionnaire_ttest_results.data.append(
                        (question_g1, ) +
                        get_ttest_diff_var(answers_g1, answers_g2) +
                        ("diff", ))

        calculations.append(
            ("questionnaire_questions", questionnaire_questions))
        calculations.append(
            ("questionnaire_hypothesis", questionnaire_hypothesis))
        calculations.append(
            ("questionnaire_histogram", questionnaire_histogram))
        calculations.append(("questionnaire_one_answer_per_row",
                             questionnaire_one_answer_per_row))
        calculations.append(("questionnaire_one_answer_per_column",
                             questionnaire_one_answer_per_column))
        calculations.append(("questionnaire_sstats", questionnaire_sstats))
        calculations.append(
            ("questionnaire_mww_test", questionnaire_mww_results))
        calculations.append(
            ("questionnaire_levene_test", questionnaire_levene_results))
        calculations.append(
            ("questionnaire_ttest_test", questionnaire_ttest_results))

        stdout.write(" [finished]\n")
        stdout.flush()

        return ASAExperimentCalculations(self, calculations)
Esempio n. 13
0
    def humanize(self, other=None, locale='en_us'):
        ''' Returns a localized, humanized representation of a relative difference in time.

        The difference is computed between this instant and ``other`` (or "now"
        in this instant's timezone when ``other`` is omitted), then mapped onto
        the closest human granularity ("seconds", "2 hours", "a month", ...)
        via the locale's ``describe`` method.

        :param other: (optional) an :class:`Arrow <arrow.arrow.Arrow>` or ``datetime`` object.
            Defaults to now in the current :class:`Arrow <arrow.arrow.Arrow>` object's timezone.
        :param locale: (optional) a ``str`` specifying a locale.  Defaults to 'en_us'.

        :raises TypeError: if ``other`` is neither ``None``, an ``Arrow``, nor a ``datetime``.

        Usage::

            >>> earlier = arrow.utcnow().replace(hours=-2)
            >>> earlier.humanize()
            '2 hours ago'

            >>> later = later = earlier.replace(hours=4)
            >>> later.humanize(earlier)
            'in 4 hours'

        '''

        locale = locales.get_locale(locale)

        # Resolve the reference moment, normalized to this instant's timezone.
        if other is None:
            utc_now = datetime.utcnow().replace(tzinfo=dateutil_tz.tzutc())
            reference = utc_now.astimezone(self._datetime.tzinfo)
        elif isinstance(other, Arrow):
            reference = other._datetime
        elif isinstance(other, datetime):
            if other.tzinfo is None:
                reference = other.replace(tzinfo=self._datetime.tzinfo)
            else:
                reference = other.astimezone(self._datetime.tzinfo)
        else:
            raise TypeError()

        seconds = int(util.total_seconds(self._datetime - reference))
        sign = -1 if seconds < 0 else 1
        magnitude = abs(seconds)

        # Threshold chain: each boundary is the midpoint between adjacent
        # granularities (e.g. 45 s -> "a minute", 90 s -> "minutes").
        if magnitude < 10:
            return locale.describe('now')
        if magnitude < 45:
            return locale.describe('seconds', sign)
        if magnitude < 90:
            return locale.describe('minute', sign)
        if magnitude < 2700:
            return locale.describe('minutes', sign * int(max(magnitude / 60, 2)))
        if magnitude < 5400:
            return locale.describe('hour', sign)
        if magnitude < 79200:
            return locale.describe('hours', sign * int(max(magnitude / 3600, 2)))
        if magnitude < 129600:
            return locale.describe('day', sign)
        if magnitude < 2160000:
            return locale.describe('days', sign * int(max(magnitude / 86400, 2)))
        if magnitude < 3888000:
            return locale.describe('month', sign)
        if magnitude < 29808000:
            # Month deltas use calendar months, not a fixed seconds-per-month.
            self_months = self._datetime.year * 12 + self._datetime.month
            other_months = reference.year * 12 + reference.month
            return locale.describe('months', sign * abs(other_months - self_months))
        if magnitude < 47260800:
            return locale.describe('year', sign)
        return locale.describe('years', sign * int(max(magnitude / 31536000, 2)))