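# ProblemData expects its inputs organized per "scope" (one resolution
# level of the dataset per list entry). The shapes below are inferred
# from the constructor's validation and from build_scope_map; the ids
# are hypothetical and shown only for illustration:
#   group_to_object[scope]: {group_id: [object strings in that group]}
#   object_to_group[scope]: {object string: group_id}
#   sample_to_response:     {sample name: response value}
# Each object string is assumed to parse (via parse_object_string) to
# the name of the sample it was observed in.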
class ProblemData:
    def __init__(
        self,
        group_to_object,
        object_to_group,
        sample_to_response,
        n_processes,
        parse_object_string=parse_object_string_sample,
    ):
        """
        Builds a ProblemData object which is responsible for providing an
        interface to all aspects of a dataset.

        Args:
            group_to_object: list with one dict per scope, each mapping a
                group id to the objects belonging to that group.
            object_to_group: list with one dict per scope, each mapping an
                object string to the id of the group that contains it.
            sample_to_response: dict mapping each sample name to its
                response variable.
            n_processes: non-negative int giving the number of worker
                processes used to build the group records.
            parse_object_string: function taking an object string and
                returning the name of the sample it belongs to.
        """
        if not isinstance(group_to_object, types.ListType):
            raise InputTypeError("group_to_object should be a list type")
        if not isinstance(object_to_group, types.ListType):
            raise InputTypeError("object_to_group should be a list type")
        if not len(object_to_group) == len(group_to_object):
            raise InputTypeError("object_to_group and group_to_object should be the same length")
        if not all([isinstance(o_to_g, types.DictType) for o_to_g in object_to_group]):
            raise InputTypeError("object_to_group should be a list of dict types")
        if not all([isinstance(g_to_o, types.DictType) for g_to_o in group_to_object]):
            raise InputTypeError("group_to_object should be a list of dict types")
        if not isinstance(sample_to_response, types.DictType):
            raise InputTypeError("sample_to_response should be a dict type")
        if not isinstance(n_processes, types.IntType) or n_processes < 0:
            raise InputTypeError("n_processes should be a non-negative int")
        if not isinstance(parse_object_string, types.FunctionType):
            raise InputTypeError("parse_object_string should be a function")
        if len(inspect.getargspec(parse_object_string)[0]) < 1:
            raise InputTypeError("parse_object_string should take at least one argument")

        self.response_variables = np.array([sample_to_response[sample] for sample in sample_to_response.keys()])
        self.n_scopes = len(object_to_group)
        self.n_unmasked_samples = len(sample_to_response)
        # Keep a stack of masks for various levels of data partitions
        self.mask_stack = MaskStack(self.n_unmasked_samples)

        # Map each sample name to its row index in response_variables
        sample_indices = dict(
            [(sample_to_response.keys()[index], index) for index in range(len(sample_to_response.keys()))]
        )
        assert all(
            [
                self.response_variables[sample_indices[sample]] == sample_to_response[sample]
                for sample in sample_to_response.keys()
            ]
        ), "sample_indices are not able to map correctly back to the response variable"

        # Build the group records for each scope in parallel
        process_definitions = []
        results = []
        for scope in range(len(group_to_object)):
            process_definitions.append(
                ProcessDefinition(
                    build_group_records,
                    positional_arguments=(scope, group_to_object[scope], sample_indices, parse_object_string),
                    tag=scope,
                )
            )
        multiprocess_functions(process_definitions, results.append, n_processes)

        self.group_records = [None] * self.n_scopes
        for scope, result in results:
            self.group_records[scope] = result

        for scope in range(self.n_scopes):
            for group in self.group_records[scope]:
                self.build_scope_map(self.group_records[scope][group], group_to_object, object_to_group)

        for scope in range(self.n_scopes):
            for key in self.group_records[scope]:
                assert (
                    self.group_records[scope][key].feature_record.get_id() == key
                ), "feature_record had mismatched id to its key"
                assert (
                    self.group_records[scope][key].feature_record.get_scope() == scope
                ), "feature_record had mismatched scope to its key"

        # Note: this is a pretty good test of the scope_map that
        # was helpful in debugging, but it has nasty complexity
        # ==============================================================
        # for t in final_groups:
        #     assert isinstance(t, types.TupleType) and len(t) == 2,\
        #         "scope_map includes an entry that is not a scope/group pair"
        #     final_scope, final_group = t
        #     final_objs = group_to_object[final_scope][final_group]
        #     objs = group_to_object[scope][key[0]]
        #     assert any([final_obj in objs for final_obj in final_objs]),\
        #         "scope_map has an entry that lists a group which shares no objects"
        # ==============================================================

    def build_scope_map(self, group_record, group_to_object, object_to_group):
        scope_map = [None] * self.n_scopes
        scope = group_record.feature_record.get_scope()
        group = group_record.feature_record.get_id()
        objects = group_to_object[scope][group]
        for final_scope in range(self.n_scopes):
            if final_scope == scope:
                scope_map[final_scope] = [(scope, group)]
                continue
            final_group_set = set()
            for obj in objects:
                try:
                    # make sure not to reference the object_to_group string,
                    # or object_to_group won't be collected
                    final_group_id = object_to_group[final_scope][obj]
                    final_group = self.group_records[final_scope][final_group_id].feature_record.get_id()
                    final_group_set.add((final_scope, final_group))
                except KeyError:
                    continue
            scope_map[final_scope] = list(final_group_set)
        group_record.scope_map = scope_map

    def push_mask(self, mask):
        self.mask_stack.push_mask(mask)

    def pop_mask(self):
        self.mask_stack.pop_mask()

    def get_feature_abundance(self, scope, group):
        try:
            # is sum(self.get_feature_column(scope, group)) faster?
            mask = self.mask_stack.get_aggregate_mask()
            return sum(
                [
                    abundance
                    for index, abundance in self.group_records[scope][group].sparce_sample_abundances
                    if mask[index]
                ]
            )
        except KeyError:
            raise KeyError("group (%s) not found at scope (%s)" % (group, scope))

    def get_n_unmasked_samples(self):
        return self.n_unmasked_samples

    def get_max_scope(self):
        return self.n_scopes - 1

    def get_group_ids(self, scope):
        return [group_id for group_id in self.group_records[scope]]

    def get_feature_column(self, scope, group):
        if not isinstance(scope, types.IntType) or scope < 0 or scope > self.get_max_scope():
            raise InputTypeError("scope (%s) is not a valid scope index" % scope)
        try:
            group_record = self.group_records[scope][group]
            # convert from sparse to dense format
            feature_column = np.zeros(self.n_unmasked_samples)
            for index, abundance in group_record.sparce_sample_abundances:
                feature_column[index] = abundance
            mask = self.mask_stack.get_aggregate_mask()
            return feature_column[mask]
        except KeyError:
            raise KeyError("group (%s) not found at scope (%s)" % (group, scope))

    def get_feature_record(self, scope, group):
        if not isinstance(scope, types.IntType) or scope < 0 or scope > self.get_max_scope():
            raise InputTypeError("scope (%s) is not a valid scope index" % scope)
        try:
            return self.group_records[scope][group].feature_record
        except KeyError:
            raise KeyError("group (%s) not found at scope (%s)" % (group, scope))

    def get_split_groups(self, scope, group, final_scope):
        if not isinstance(scope, types.IntType) or scope < 0 or scope > self.get_max_scope():
            raise InputTypeError("scope (%s) is not a valid scope index" % scope)
        if not isinstance(final_scope, types.IntType) or final_scope < 0 or final_scope > self.get_max_scope():
            raise InputTypeError("final_scope (%s) is not a valid scope index" % final_scope)
        try:
            return self.group_records[scope][group].scope_map[final_scope]
        except KeyError:
            raise KeyError("(group, final_scope) (%s, %s) not found at scope (%s)" % (group, final_scope, scope))

    def get_response_variables(self):
        mask = self.mask_stack.get_aggregate_mask()
        return self.response_variables[mask]
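# ------------------------------------------------------------------
# Minimal usage sketch. The ids are hypothetical, and it assumes the
# helpers defined elsewhere in this module (parse_object_string_sample,
# build_group_records, MaskStack, multiprocess_functions) behave as the
# constructor expects; kept commented out rather than executed.
# ------------------------------------------------------------------
# group_to_object = [{"g0": ["sample_0_read_1", "sample_1_read_1"]}]
# object_to_group = [{"sample_0_read_1": "g0", "sample_1_read_1": "g0"}]
# sample_to_response = {"sample_0": 1.0, "sample_1": 0.0}
#
# data = ProblemData(group_to_object, object_to_group, sample_to_response,
#                    n_processes=0)
# data.push_mask(np.array([True, False]))  # restrict to the first sample
# print data.get_feature_column(0, "g0")   # dense abundances under the mask
# print data.get_response_variables()      # responses under the same mask
# data.pop_mask()                          # restore the previous partition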