Ejemplo n.º 1
0
 def __init__(self,
              group_key='Group',
              outcome_key='Outcome',
              groups=None,
              population_of_interest=None,
              *args,
              **kwargs
              ):
     """
     Set up a two-group comparison from a CSV file or from explicit scores.

     Parameters:
     group_key -- name of the CSV column holding each row's group label
                  (only used when self.input_file is set)
     outcome_key -- name of the CSV column holding each row's outcome value
     groups -- dictionary mapping exactly two group labels to non-empty
               lists/tuples of scores; replaced by the file contents when
               self.input_file is set. The caller's dictionary is copied,
               never modified.
     population_of_interest -- label of the group under test. When reading
               a file and the label is missing from the file, an arbitrary
               label found in the file is used and every other row is
               pooled under 'Not <label>'. When omitted, the first key of
               groups is used.
     *args, **kwargs -- forwarded unchanged to ExactTest.__init__
               (presumably the place where self.input_file is set).

     Raises:
     ValueError -- a column key is empty, the file has no data rows, there
                   are not exactly two groups, a group has no scores, or
                   population_of_interest is not one of the group labels
     TypeError -- groups is not a dictionary or a group's scores are not
                  array-like
     """
     ExactTest.__init__(self, *args, **kwargs)

     # a fresh dictionary per call instead of a shared mutable default
     if groups is None:
         groups = {'1': [], '2': []}

     # validate input parameters
     if self.input_file:
         if not group_key or not outcome_key:
             raise ValueError('Keys must be defined for group and outcome columns.')
         # read the file once and make sure the handle is closed
         with open(self.input_file) as handle:
             rows = list(csv.DictReader(handle))
         file_labels = list(set(row[group_key] for row in rows))
         if not file_labels:
             raise ValueError('No data rows found in file %s.' % self.input_file)
         try:
             target = file_labels[file_labels.index(population_of_interest)]
         except ValueError:
             # requested label is absent from the file: fall back to one that exists
             target = population_of_interest = file_labels[0]
         g1 = list()
         g2 = list()
         for row in rows:
             if row[group_key] == target:
                 g1.append(row[outcome_key])
             else:
                 g2.append(row[outcome_key])
             if not row[group_key]:
                 sys.stderr.write('Warning: row contains no group assignment in file %s: %s.\n'%(self.input_file, row))
         groups = {target: g1, 'Not %s' % (target,): g2}
     if not isinstance(groups, dict):
         raise TypeError('Groups must be supplied as a dictionary of pairs Group Label : Outcome Values.')
     if len(groups) != 2:
         raise ValueError('There must be exactly 2 groups; found %s'%len(groups))
     # take the key order once so every later lookup agrees on it
     labels = list(groups)
     scores_by_group = dict()
     for label in labels:
         scores = groups[label]
         if not scores:
             raise ValueError('No scores found for Group: %s.' % (label,))
         if not self.__class__.is_array_like(scores):
             raise TypeError('Scores for Group: %s must be a list or tuple.' % (label,))
         # copy so that neither the caller's dictionary nor its sequences are shared
         scores_by_group[label] = list(scores)
     if not population_of_interest:
         population_of_interest = labels[0]
     elif population_of_interest not in labels:
         raise ValueError('population_of_interest %r is not one of the groups %r.'
                          % (population_of_interest, labels))
     self.population_of_interest = population_of_interest

     # store group scores as a data attribute
     self.groups = scores_by_group
     # set the test parameter to median of the population of interest
     self.test_parameter = 'M %s' % (population_of_interest,)
     # store the outcome key as a data attribute
     self.outcome_key = outcome_key
     # store the key of the smaller group (the second key wins ties)
     first, second = labels
     if len(scores_by_group[first]) < len(scores_by_group[second]):
         self.smaller_group_key = first
     else:
         self.smaller_group_key = second
     # the hypothesized value is labelled after the group that is NOT under test
     # TODO: need to adjust when taking actual hypothesized value into account
     other = second if population_of_interest == first else first
     self.hypothesized_value = 'M %s' % (other,)
     # pool all scores, as floats when every score converts, otherwise as given
     pooled = [x for label in labels for x in scores_by_group[label]]
     try:
         self.combined_scores = [float(x) for x in pooled]
     except (TypeError, ValueError, OverflowError):
         self.combined_scores = pooled
     # order all of the scores based on rank
     self.combined_scores.sort()
     # store sample size
     self.n = len(self.combined_scores)
Ejemplo n.º 2
0
 def p_upper(self, outcome):
     """
     Return the upper-tail probability for outcome.

     Adds up self.pmf(score) over every possible score whose deviation
     from the expected value is greater than or equal to the deviation
     of outcome from the expected value.

     Raises TypeError when ExactTest.is_numeric rejects outcome.
     """
     if not ExactTest.is_numeric(outcome):
         raise TypeError('outcome must be a number')
     # deviation of the observed outcome from the centre of the distribution
     threshold = outcome - self.__expected_value
     total = 0
     for position, score in enumerate(self.__possible_scores):
         if self.__delta_values[position] >= threshold:
             total += self.pmf(score)
     return total
Ejemplo n.º 3
0
 def __init__(self,
              pre_key='Pre', post_key='Post',    # keys for the two columns to compare in the csv file
              pre=None, post=None,               # optionally specify the pre and post scores manually
              hypothesized_value=0,              # the value under test
              *args,
              **kwargs):
     """
     Set up a paired (pre/post) comparison from a CSV file or explicit scores.

     Parameters:
     pre_key, post_key -- names of the CSV columns holding the paired
                          scores (only used when self.input_file is set)
     pre, post -- equal-length lists/tuples of scores; replaced by the file
                  contents when self.input_file is set
     hypothesized_value -- the value under test.
                  NOTE(review): accepted but not used anywhere in this
                  constructor -- confirm whether it should be stored.
     *args, **kwargs -- forwarded unchanged to ExactTest.__init__
                  (presumably the place where self.input_file is set).

     Raises:
     ValueError -- a column key is empty or missing from the file, no
                   scores are available, a file value cannot be converted
                   to float, or pre and post differ in length
     TypeError -- pre or post is not array-like
     """
     ExactTest.__init__(self, *args, **kwargs)

     # validate input parameters
     if self.input_file:
         if not pre_key:
             raise ValueError('You must specify the name of the column containing pre-test scores.')
         if not post_key:
             raise ValueError('You must specify the name of the column containing post-test scores.')
         pre = list()
         post = list()
         # the context manager guarantees the file handle is closed
         with open(self.input_file) as handle:
             for row in csv.DictReader(handle):
                 # membership is checked explicitly, so the lookups below cannot raise KeyError
                 if pre_key not in row:
                     raise ValueError('The pre-test key %s was not found.'%pre_key)
                 if post_key not in row:
                     raise ValueError('The post-test key %s was not found.'%post_key)
                 pre_value = row[pre_key]
                 post_value = row[post_key]
                 if pre_value and post_value:
                     pre.append(float(pre_value))
                     post.append(float(post_value))
                 elif pre_value or post_value:
                     # exactly one half of the pair is present
                     sys.stderr.write('Row does not contain matched pairs (%s). Data is being skipped.\n'%row)
     if not pre or not post:
         raise ValueError('You must supply scores for both pre and post.')
     if not self.__class__.is_array_like(pre):
         raise TypeError('pre must be a list or tuple.')
     if not self.__class__.is_array_like(post):
         raise TypeError('post must be a list or tuple.')
     if len(pre) != len(post):
         raise ValueError('pre and post must be sequences of equal length.')

     # set the test parameter to median difference
     self.test_parameter = 'Md'
     # store difference scores (post minus pre) as data attribute
     self.difference_scores = [after - before for before, after in zip(pre, post)]
     # sort difference scores by absolute value (stable, so ties keep their order)
     self.difference_scores.sort(key=abs)
     # store sample size
     self.n = len(self.difference_scores)
     # generate replacement values and weights
     # (replacement_values is the same list object as difference_scores)
     self.replacement_values = self.difference_scores
     # weight 1.0 for a positive difference, 0.0 for a negative one, 0.5 for zero
     self.weights = [1.0 if x > 0.0 else (0.0 if x < 0.0 else 0.5) for x in self.difference_scores]
Ejemplo n.º 4
0
 def p_two(self, outcome):
     """
     Return the two-sided probability for outcome.

     Adds up self.pmf(score) over every possible score whose absolute
     deviation from the expected value is greater than or equal to the
     absolute deviation of outcome from the expected value.

     Raises TypeError when ExactTest.is_numeric rejects outcome.
     """
     if not ExactTest.is_numeric(outcome):
         raise TypeError('outcome must be a number')
     # how far the observed outcome lies from the centre, ignoring direction
     distance = abs(outcome - self.__expected_value)
     total = 0
     for position, score in enumerate(self.__possible_scores):
         if abs(self.__delta_values[position]) >= distance:
             total += self.pmf(score)
     return total
Ejemplo n.º 5
0
 def __init__(self, replacement_values):
     """
     Build the exact distribution of sums over subsets of replacement_values.

     Each of the 2**n ways of including or excluding every value
     (n = len(replacement_values)) is enumerated and treated as equally
     likely; the score of a combination is the sum of the included values.
     The enumeration is exponential in n.

     Parameters:
     replacement_values -- sequence of numbers accepted by
                           ExactTest.is_array_like

     Raises:
     TypeError -- replacement_values is not array-like
     """
     # validate replacement_values
     if not ExactTest.is_array_like(replacement_values):
         raise TypeError('replacement_values must be a list or tuple')
     # create the exact sampling distribution of all possible combinations
     n = len(replacement_values)
     self.__total_num_outcomes = 2**n
     self.__score_frequencies = dict()
     for i in range(self.__total_num_outcomes):
         # the binary digits of i, padded to n characters, say which values are included
         weights = bin(i)[2:].zfill(n)
         ts = sum(replacement_values[x]*int(weights[x]) for x in range(n))
         self.__score_frequencies[ts] = self.__score_frequencies.get(ts, 0) + 1
     individual_probability_mass = 1.0/float(self.__total_num_outcomes)
     # items() is available on both Python 2 and Python 3 (iteritems() is not)
     self.__score_probabilities = dict((k, v*individual_probability_mass) for k, v in self.__score_frequencies.items())
     self.__possible_scores = sorted(self.__score_frequencies)
     # store the expected value of the distribution
     self.__expected_value = 0.5*sum(replacement_values)
     # retain an ordered list of delta values for possible scores
     self.__delta_values = [x-self.__expected_value for x in self.__possible_scores]
     # store the variance of the distribution
     self.__variance = 0.25*sum(x**2 for x in replacement_values)