def __init__(self, group_key='Group', outcome_key='Outcome',
             groups={'1': [], '2': []}, population_of_interest=None,
             *args, **kwargs):
    """Set up a two-group comparison from a CSV file or from explicit scores.

    Remaining positional/keyword arguments are forwarded to
    ``ExactTest.__init__``.  This method then reads ``self.input_file``
    (presumably set by that base initializer -- it is not defined in this
    block).

    Parameters:
        group_key: CSV column holding each row's group label.
        outcome_key: CSV column holding each row's outcome value.
        groups: dictionary of pairs ``Group Label : Outcome Values``; only
            used when no input file is configured.  The dictionary passed in
            is copied, never modified.
        population_of_interest: label of the group the test is about.  When
            it is falsy (or, in file mode, not found in the file) the first
            group is used instead.

    Raises:
        ValueError: a key is empty, the file yields no rows, there are not
            exactly two groups, a group has no scores, or (without an input
            file) ``population_of_interest`` is not one of the group labels.
        TypeError: ``groups`` is not a dict, or a group's scores are not
            array-like according to ``is_array_like``.
    """
    ExactTest.__init__(self, *args, **kwargs)

    # validate input parameters
    if self.input_file:
        if not group_key or not outcome_key:
            raise ValueError('Keys must be defined for group and outcome columns.')

        # Read the file once and make sure the handle is closed again.
        with open(self.input_file) as handle:
            rows = list(csv.DictReader(handle))

        # Collect labels in order of first appearance so that the default
        # population of interest is reproducible from run to run (a set's
        # iteration order is not).
        labels = []
        for row in rows:
            if row[group_key] not in labels:
                labels.append(row[group_key])
        if not labels:
            raise ValueError('No group assignments found in file %s.' % self.input_file)

        try:
            interest = labels[labels.index(population_of_interest)]
        except ValueError:
            # Unknown (or unspecified) population: fall back to the first label.
            interest = population_of_interest = labels[0]

        # Every row that does not belong to the group of interest is pooled
        # into a single complementary group.
        interest_scores = []
        other_scores = []
        for row in rows:
            if row[group_key] == interest:
                interest_scores.append(row[outcome_key])
            else:
                other_scores.append(row[outcome_key])
            if not row[group_key]:
                sys.stderr.write('Warning: row contains no group assignment in file %s: %s.\n' % (self.input_file, row))
        groups = {interest: interest_scores, 'Not %s' % interest: other_scores}

    if not isinstance(groups, dict):
        raise TypeError('Groups must be supplied as a dictionary of pairs Group Label : Outcome Values.')
    if len(groups) != 2:
        raise ValueError('There must be exactly 2 groups; found %s' % len(groups))

    # Work on a copy so neither the caller's dictionary nor the shared
    # default argument is modified by the list() normalisation below.
    groups = groups.copy()
    for label in list(groups):
        if not groups[label]:
            raise ValueError('No scores found for Group: %s.' % label)
        if not self.__class__.is_array_like(groups[label]):
            raise TypeError('Scores for Group: %s must be a list or tuple.' % label)
        groups[label] = list(groups[label])

    # list(dict) works on Python 2 and 3 alike; dict.keys() is only
    # indexable on Python 2.
    labels = list(groups)
    if not population_of_interest:
        population_of_interest = labels[0]
    elif population_of_interest not in labels:
        raise ValueError('population_of_interest %s is not one of the groups: %s'
                         % (population_of_interest, labels))
    self.population_of_interest = population_of_interest

    # store group scores as a data attribute
    self.groups = groups

    # set the test parameter to median of the population of interest
    self.test_parameter = 'M %s' % population_of_interest

    # store the outcome key as a data attribute
    self.outcome_key = outcome_key

    # store the key of the smaller group (the second group wins ties)
    first, second = labels
    self.smaller_group_key = first if len(groups[first]) < len(groups[second]) else second

    # TODO: need to adjust when taking actual hypothesized value into account
    # With exactly two labels, index - 1 always selects the *other* group.
    self.hypothesized_value = 'M %s' % labels[labels.index(population_of_interest) - 1]

    # order all of the scores based on rank; fall back to the raw values
    # when they cannot all be converted to float
    try:
        self.combined_scores = [float(x) for label in labels for x in groups[label]]
    except (TypeError, ValueError, OverflowError):
        self.combined_scores = [x for label in labels for x in groups[label]]
    self.combined_scores.sort()

    # store sample size
    self.n = len(self.combined_scores)
def p_upper(self, outcome):
    """Upper-tail probability of ``outcome``.

    Adds up ``self.pmf(score)`` over every possible score whose deviation
    from the expected value is greater than or equal to the deviation of
    ``outcome``.

    Raises:
        TypeError: ``outcome`` is rejected by ``ExactTest.is_numeric``.
    """
    if not ExactTest.is_numeric(outcome):
        raise TypeError('outcome must be a number')

    # Deviation of the requested outcome from the centre of the distribution.
    threshold = outcome - self.__expected_value

    # __possible_scores and __delta_values are parallel lists, so they are
    # walked in lockstep.
    paired = zip(self.__possible_scores, self.__delta_values)
    return sum(self.pmf(score) for score, delta in paired if delta >= threshold)
def __init__(self, pre_key='Pre', post_key='Post',  # keys for the two columns to compare in the csv file
             pre=[], post=[],  # optionally specify the pre and post scores manually
             hypothesized_value=0,  # the value under test
             *args, **kwargs):
    """Set up a paired (pre/post) comparison from a CSV file or explicit scores.

    Remaining positional/keyword arguments are forwarded to
    ``ExactTest.__init__``.  This method then reads ``self.input_file``
    (presumably set by that base initializer -- it is not defined in this
    block).  When an input file is configured, the ``pre`` and ``post``
    arguments are ignored and both lists are read from the file; rows in
    which only one of the two values is present are skipped with a warning
    on stderr.

    Parameters:
        pre_key: CSV column holding the pre-test scores.
        post_key: CSV column holding the post-test scores.
        pre: pre-test scores (used when no input file is configured).
        post: post-test scores, paired index by index with ``pre``.
        hypothesized_value: the value under test.
            NOTE(review): this argument is not read anywhere in this
            method -- confirm whether it should be stored or applied.

    Raises:
        ValueError: a key is empty or missing from the file, either score
            list is empty, or the two lists differ in length.
        TypeError: ``pre`` or ``post`` is not array-like according to
            ``is_array_like``.
        KeyError: re-raised with the file name when a column lookup fails.
    """
    ExactTest.__init__(self, *args, **kwargs)

    # validate input parameters
    if self.input_file:
        if not pre_key:
            raise ValueError('You must specify the name of the column containing pre-test scores.')
        if not post_key:
            raise ValueError('You must specify the name of the column containing post-test scores.')
        pre = []
        post = []
        try:
            # The context manager closes the file even if a row is rejected.
            with open(self.input_file) as handle:
                for row in csv.DictReader(handle):
                    if pre_key not in row:
                        raise ValueError('The pre-test key %s was not found.' % pre_key)
                    if post_key not in row:
                        # Report the key that is actually missing.
                        raise ValueError('The post-test key %s was not found.' % post_key)
                    pre_value = row[pre_key]
                    post_value = row[post_key]
                    if pre_value and post_value:
                        pre.append(float(pre_value))
                        post.append(float(post_value))
                    elif pre_value or post_value:
                        # Exactly one half of the pair is present.
                        sys.stderr.write('Row does not contain matched pairs (%s). Data is being skipped.\n' % row)
        except KeyError as e:
            raise KeyError('Unable to locate key %s in file %s.' % (e, self.input_file))

    if not pre or not post:
        raise ValueError('You must supply scores for both pre and post.')
    if not self.__class__.is_array_like(pre):
        raise TypeError('pre must be a list or tuple.')
    if not self.__class__.is_array_like(post):
        raise TypeError('post must be a list or tuple.')
    if len(pre) != len(post):
        raise ValueError('pre and post must be sequences of equal length.')

    # set the test parameter to median difference
    self.test_parameter = 'Md'

    # store difference scores as data attribute
    self.difference_scores = [after - before for before, after in zip(pre, post)]

    # sort difference scores by absolute value; key=abs is the stable,
    # Python 2/3 compatible equivalent of the former cmp-based sort
    self.difference_scores.sort(key=abs)

    # store sample size
    self.n = len(self.difference_scores)

    # generate replacement values and weights; replacement_values is the
    # same list object as difference_scores, not a copy
    self.replacement_values = self.difference_scores
    self.weights = [1.0 if x > 0.0 else (0.0 if x < 0.0 else 0.5)
                    for x in self.difference_scores]
def p_two(self, outcome):
    """Two-sided probability of ``outcome``.

    Adds up ``self.pmf(score)`` over every possible score that lies at
    least as far from the expected value as ``outcome`` does, in either
    direction.

    Raises:
        TypeError: ``outcome`` is rejected by ``ExactTest.is_numeric``.
    """
    if not ExactTest.is_numeric(outcome):
        raise TypeError('outcome must be a number')

    # Absolute distance of the requested outcome from the expected value.
    distance = abs(outcome - self.__expected_value)

    # __possible_scores and __delta_values are parallel lists, so they are
    # walked in lockstep.
    paired = zip(self.__possible_scores, self.__delta_values)
    return sum(self.pmf(score) for score, delta in paired if abs(delta) >= distance)
def __init__(self, replacement_values):
    """Build the exact sampling distribution for ``replacement_values``.

    Every one of the ``2**n`` ways of including or excluding each
    replacement value is enumerated; the score of a combination is the sum
    of the values it includes, and each combination carries the same
    probability mass.  The work therefore grows as ``n * 2**n``.

    Parameters:
        replacement_values: sequence of numeric values, one per observation.

    Raises:
        TypeError: ``replacement_values`` is rejected by
            ``ExactTest.is_array_like``.
    """
    # validate replacement_values
    if not ExactTest.is_array_like(replacement_values):
        raise TypeError('replacement_values must be a list or tuple')

    # create the exact sampling distribution of all possible combinations
    n = len(replacement_values)
    self.__total_num_outcomes = 2 ** n
    self.__score_frequencies = dict()
    for combination in range(self.__total_num_outcomes):
        # The zero-padded binary digits of the counter say which values are
        # included (1) or excluded (0), left-most digit first.
        bits = bin(combination)[2:].zfill(n)
        # The builtin sum over the full list of terms is kept on purpose so
        # floating-point scores match the original evaluation order.
        total_score = sum([value * int(bit)
                           for value, bit in zip(replacement_values, bits)])
        self.__score_frequencies[total_score] = self.__score_frequencies.get(total_score, 0) + 1

    individual_probability_mass = 1.0 / float(self.__total_num_outcomes)
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    self.__score_probabilities = dict(
        (score, count * individual_probability_mass)
        for score, count in self.__score_frequencies.items())
    self.__possible_scores = sorted(self.__score_frequencies)

    # store the expected value of the distribution (each value is included
    # in half of the combinations)
    self.__expected_value = 0.5 * sum(replacement_values)

    # retain an ordered list of delta values for possible scores
    self.__delta_values = [score - self.__expected_value
                           for score in self.__possible_scores]

    # store the variance of the distribution
    self.__variance = 0.25 * sum(x ** 2 for x in replacement_values)