def scope_optimization(
    initial_feature_vector,
    problem_data,
    group_vector_scorer,
    vector_generator,
    n_iterations,
    n_processes,
    n_maintain,
    save_iterations,
):
    """
    Iteratively generates, scores and prunes candidate feature vectors.

    The initial vector is scored first and seeds the pool of outcomes. Each
    later iteration asks the vector generator for new vectors based on the
    current pool, scores them through multiprocess_functions, and keeps the
    n_maintain outcomes with the highest prediction_quality.

    Args:
        initial_feature_vector: The feature vector the search starts from.
        problem_data: The problem data handed to the scorer with every
            vector.
        group_vector_scorer: An object whose
            score_feature_vector(problem_data, feature_vector) returns an
            outcome exposing a prediction_quality attribute.
        vector_generator: An object whose generate_vectors(outcomes) returns
            the candidate vectors to score next.
        n_iterations: The total number of iterations; the scoring of the
            initial vector counts as the first one.
        n_processes: Forwarded to multiprocess_functions.
        n_maintain: The number of outcomes kept after every iteration.
        save_iterations: If true, return the best outcome of every
            iteration instead of only the last one.

    Returns:
        A list with the best outcome of each iteration when save_iterations
        is true, otherwise the best outcome of the final iteration.
    """
    def quality_of(outcome):
        return outcome.prediction_quality

    seed_outcome = group_vector_scorer.score_feature_vector(problem_data, initial_feature_vector)
    outcome_set = [seed_outcome]
    best_per_iteration = [seed_outcome]

    # The seed scoring above is iteration 0, so only n_iterations - 1 rounds
    # of generation remain.
    for _round in range(n_iterations - 1):
        candidates = vector_generator.generate_vectors(outcome_set)

        scoring_jobs = []
        for candidate in candidates:
            job = (group_vector_scorer.score_feature_vector, (problem_data, candidate))
            scoring_jobs.append(job)

        # Every scored candidate is appended to the current pool.
        multiprocess_functions(scoring_jobs, outcome_set.append, n_processes)

        # Best first, then drop everything beyond the n_maintain best.
        outcome_set.sort(key=quality_of, reverse=True)
        outcome_set = outcome_set[:n_maintain]
        best_per_iteration.append(outcome_set[0])

    if save_iterations:
        return best_per_iteration
    return best_per_iteration[-1]
def scope_optimization_cross_validation(scope_optimization, initial_feature_vector, problem_data,
                                        vector_model, prediction_scoring_function,
                                        n_cross_folds, n_processes):
    """
    Runs scope optimization once per cross-validation fold and scores the
    feature vector of every iteration on that fold's train/test split.

    Args:
        scope_optimization: An object exposing an n_iterations attribute and
            an optimize_vector(initial_feature_vector, problem_data,
            save_iterations) method returning one outcome per iteration.
        initial_feature_vector: The feature vector each fold starts from.
        problem_data: The problem data; a mask is pushed onto it for the
            duration of each fold's optimization and popped afterwards.
        vector_model: Forwarded to build_model_outcome_with_matrices.
        prediction_scoring_function: Forwarded to
            build_model_outcome_with_matrices.
        n_cross_folds: The number of folds requested from KFold.
        n_processes: Forwarded to multiprocess_functions.

    Returns:
        A list indexed by iteration, each entry a list indexed by fold
        holding the test result for that (iteration, fold) pair.
    """
    masks = [(train, test) for train, test in
             KFold(problem_data.get_n_unmasked_samples(), n_folds=n_cross_folds)]
    n_iterations = scope_optimization.n_iterations

    # Perform the optimization process for each fold.
    training_outcomes = [[] for _ in range(n_iterations)]
    for _train_mask, test_mask in masks:
        # NOTE(review): presumably pushing the test mask hides the fold's
        # test samples from the optimizer -- confirm against push_mask.
        problem_data.push_mask(test_mask)
        try:
            iteration_outcomes = scope_optimization.optimize_vector(
                initial_feature_vector, problem_data, True)
        finally:
            # Always restore the mask stack so a failed fold does not leave
            # the caller's problem_data masked.
            problem_data.pop_mask()
        for iteration in range(n_iterations):
            training_outcomes[iteration].append(iteration_outcomes[iteration])

    # Score the resulting feature vectors for each fold in parallel.
    # Every mask pushed above has been popped again, so the response
    # variables are the same for all jobs and are fetched only once.
    y = problem_data.get_response_variables()
    process_definitions = []
    for iteration in range(n_iterations):
        for mask_index, (train_mask, test_mask) in enumerate(masks):
            feature_vector = training_outcomes[iteration][mask_index].feature_vector
            X = build_sample_matrix(problem_data, feature_vector)
            process_definitions.append(
                ProcessDefinition(
                    build_model_outcome_with_matrices,
                    positional_arguments=(feature_vector, vector_model,
                                          prediction_scoring_function,
                                          X[train_mask], y[train_mask],
                                          X[test_mask], y[test_mask]),
                    tag=(iteration, mask_index)))

    test_results = []
    multiprocess_functions(process_definitions, test_results.append, n_processes)

    # Results are consumed as (tag, result) pairs; the tag places each one
    # back at its (iteration, fold) position regardless of arrival order.
    test_outcomes = [[None] * len(masks) for _ in range(n_iterations)]
    for (iteration, mask_index), mask_result in test_results:
        test_outcomes[iteration][mask_index] = mask_result
    return test_outcomes
def __init__(self, group_to_object, object_to_group, sample_to_response, n_processes,
             parse_object_sample=parse_object_string_sample, mask=None):
    """
    Builds the per-feature sample columns, feature records and scope map.

    Args:
        group_to_object: Indexable by scope; each entry maps a group to the
            objects that group contains at that scope.
        object_to_group: Indexable by scope; each entry maps an object to
            its group at that scope.
        sample_to_response: A dict mapping a sample ID to its response
            variable.
        n_processes: Forwarded to multiprocess_functions.
        parse_object_sample: A function taking an object and returning the
            ID of the sample that object belongs to.
        mask: Stored unchanged as self.mask.
    """
    # The response array and the sample index below both follow the
    # iteration order of sample_to_response, so positions line up.
    self.response_variables = np.array([sample_to_response[sample]
                                        for sample in sample_to_response])
    self.max_scope = len(object_to_group) - 1
    self.mask = mask
    self.n_samples = len(sample_to_response)
    sample_indices = dict((sample, index)
                          for index, sample in enumerate(sample_to_response))

    self.feature_columns = dict()
    self.feature_records = dict()
    self.scope_map = dict()

    def build_scope_columns_records_splits(scope, group_to_object, sample_indices):
        """Builds the columns, records and scope map entries of one scope."""
        feature_records = dict()
        feature_columns = dict()
        scope_map = dict()
        n_scopes = len(group_to_object)

        for group in group_to_object[scope]:
            objects = group_to_object[scope][group]

            # Count, per sample, how many of the group's objects belong to
            # it; objects whose sample is unknown are ignored.
            feature_column = np.zeros(len(sample_indices))
            for obj in objects:
                sample_id = parse_object_sample(obj)
                if sample_id in sample_indices:
                    feature_column[sample_indices[sample_id]] += 1

            feature_columns[(scope, group)] = feature_column
            feature_records[(scope, group)] = FeatureRecord(group, scope,
                                                            feature_column.sum())

            for final_scope in range(n_scopes):
                if final_scope == scope:
                    # A group maps onto itself within its own scope. This
                    # goes into the returned dict like every other entry;
                    # writing to self.scope_map from here would be lost if
                    # the function runs in another process.
                    scope_map[(scope, group, final_scope)] = [(scope, group)]
                    continue
                final_groups = set(object_to_group[final_scope][obj]
                                   for obj in objects
                                   if obj in object_to_group[final_scope])
                scope_map[(scope, group, final_scope)] = [
                    (final_scope, final_group) for final_group in final_groups]

        return (feature_columns, feature_records, scope_map)

    functions = [(build_scope_columns_records_splits,
                  (scope, group_to_object, sample_indices))
                 for scope in range(len(group_to_object))]
    results = []
    multiprocess_functions(functions, results.append, n_processes)

    # Merge the per-scope results into the instance-wide lookups.
    for feature_columns, feature_records, scope_map in results:
        self.feature_columns.update(feature_columns)
        self.feature_records.update(feature_records)
        self.scope_map.update(scope_map)
def command_line_argument_wrapper(model, n_iterations, group_map_files, start_level,
                                  mapping_file, prediction_field, include_only, negate,
                                  n_maintain, n_generate, score_predictions_function,
                                  split_abun_coef, split_score_coef, merge_abun_coef,
                                  merge_score_coef, delete_abun_coef, delete_score_coef,
                                  split_proportion, merge_proportion, delete_proportion,
                                  n_cross_folds, n_processes, output_dir, n_trials):
    """
    Sets up and executes scope optimization on a given problem, runs testing,
    and writes to files.

    Builds the data structures and objects that are used by
    fresco.scope_optimization from command line friendly arguments. Then runs
    scope optimization, performs cross fold testing on the results if
    instructed, and writes the results to the files.

    Args:
        model: A string representing the machine learning classification
            model to be used for scoring within the optimization process
            and for testing its results.
        n_iterations: The number of iterations to be completed by the
            optimization process.
        group_map_files: A list of open files containing tab-separated lines
            mapping from groups to objects. For example:
                Group1    Object1    Object2    Object3
                Group2    Object4    Object5
            Map files should be ordered by decreasing level of scope, i.e.,
            most general to least general map files.
        start_level: The starting scope level (map file index) for the
            optimization process.
        mapping_file: An open file with tab separated lines mapping from a
            sample ID to its properties. The first line should be tab
            separated identifiers for the properties. For example:
                SAMPLE_ID    COLOR    TASTE
                APPLE_g      GREEN    AMAZING
                APPLE_r      RED      AWFUL
        prediction_field: Forwarded to build_problem_data. Presumably the
            mapping file property to predict -- confirm against
            build_problem_data.
        include_only: Forwarded to build_problem_data. Presumably restricts
            which samples are used -- confirm against build_problem_data.
        negate: Forwarded to build_problem_data. Presumably inverts the
            include_only selection -- confirm against build_problem_data.
        n_maintain: The number of vectors to be kept after every iteration
            of optimization.
        n_generate: The number of vectors to be generated for each input
            vector by the vector generator.
        score_predictions_function: A function which takes two lists of
            class predictions of equal length and returns a numerical score.
            For example:
                def score_predictions(real_classes, predicted_classes):
                    return sum([1 if real_classes[i] != predicted_classes[i]
                                else 0 for i in range(len(real_classes))])
        split_abun_coef: The abundance deviation coefficient for the
            splitting heuristic.
        split_score_coef: The prediction score deviation coefficient for the
            splitting heuristic.
        merge_abun_coef: The abundance deviation coefficient for the merging
            heuristic.
        merge_score_coef: The prediction score deviation coefficient for the
            merging heuristic.
        delete_abun_coef: The abundance deviation coefficient for the
            deletion heuristic.
        delete_score_coef: The prediction score deviation coefficient for
            the deletion heuristic.
        split_proportion: The proportion of total features to be split each
            iteration.
        merge_proportion: The proportion of total features to be merged each
            iteration.
        delete_proportion: The proportion of total features to be deleted
            each iteration.
        n_cross_folds: The number of cross folds to use in scoring the
            vectors for selection.
        n_processes: The number of additional processes to spawn, at
            maximum.
        output_dir: The directory that the output files will be put in. It
            is created if it does not exist.
        n_trials: The number of cross folds to use in scoring the vectors
            returned by the optimization process. If 0, no testing will be
            performed.
    """
    if not exists(output_dir):
        makedirs(output_dir)

    log_fp = join(output_dir, 'info.log')
    logging.basicConfig(filename=log_fp, filemode='w', level=logging.DEBUG,
                        format='%(asctime)s\t%(levelname)s\t%(message)s')
    logging.info('Started feature vector optimization process for \'%s\' '
                 'model', model)
    start_time = time()

    feature_vector_output_fp = join(output_dir, 'feature_vector_output.txt')

    vector_model = GroupVectorModel(parse_model_string(model))
    group_vector_scorer = CrossValidationGroupVectorScorer(score_predictions_function,
                                                           vector_model, n_cross_folds)
    problem_data, initial_feature_vector = build_problem_data(
        group_map_files, mapping_file, prediction_field, start_level,
        include_only, negate, n_processes)

    group_actions = [
        SplitAction(problem_data, split_proportion, split_abun_coef, split_score_coef),
        MergeAction(problem_data, merge_proportion, merge_abun_coef, merge_score_coef),
        DeleteAction(problem_data, delete_proportion, delete_abun_coef, delete_score_coef),
    ]
    vector_generator = ActionVectorGenerator(group_actions, n_generate)

    if n_trials > 0:
        # xfold_feature_vectors[iteration][fold] is the best feature vector
        # found at that iteration when optimizing on that fold.
        xfold_feature_vectors = [[] for _ in range(n_iterations)]
        masks = [(train, test) for train, test in
                 KFold(problem_data.get_n_samples(), n_folds=n_trials, indices=False)]

        n_recorded_iterations = 0
        for train_mask, _test_mask in masks:
            problem_data.set_mask(train_mask)
            iteration_outcomes = scope_optimization(
                initial_feature_vector, problem_data, group_vector_scorer,
                vector_generator, n_iterations, n_processes, n_maintain, True)
            # Remembered explicitly so the testing loop below does not rely
            # on a variable leaking out of this loop.
            n_recorded_iterations = len(iteration_outcomes)
            for iteration, outcome in enumerate(iteration_outcomes):
                xfold_feature_vectors[iteration].append(outcome.feature_vector)

        # One testing job per (iteration, fold); the trailing tuple is the
        # tag used to place the result afterwards.
        functions = []
        for iteration in range(n_recorded_iterations):
            for mask_index, mask in enumerate(masks):
                functions.append(
                    (mask_testing,
                     (problem_data, mask, vector_model, score_predictions_function,
                      xfold_feature_vectors[iteration][mask_index],
                      (iteration, mask_index))))
        mask_results = []
        multiprocess_functions(functions, mask_results.append, n_processes)

        test_outcomes = [[None] * len(masks) for _ in range(n_iterations)]
        for (iteration, mask_index), mask_result in mask_results:
            test_outcomes[iteration][mask_index] = mask_result

        prediction_testing_output_fp = join(output_dir, 'prediction_testing_output.txt')
        write_to_file(testing_output_lines(test_outcomes), prediction_testing_output_fp)

        # The reported feature vector comes from the final iteration's
        # per-fold outcomes.
        avg_outcome = stitch_avg_outcome(test_outcomes[-1], masks)
        write_to_file(feature_output_lines(avg_outcome), feature_vector_output_fp)
    else:
        outcome = scope_optimization(
            initial_feature_vector, problem_data, group_vector_scorer,
            vector_generator, n_iterations, n_processes, n_maintain, False)
        write_to_file(feature_output_lines(outcome), feature_vector_output_fp)

    elapsed_time = time() - start_time
    logging.info('Finished feature vector optimization process for \'%s\' '
                 'model', model)
    logging.info('Total elapsed time (in seconds): %d', elapsed_time)
def test_multiprocess_functions(self):
    """Run the fixture processes through multiprocess_functions and check the results."""
    expected_results = [2, 12]

    # 1 is passed as the process-count argument.
    multiprocess_functions(self.procs, self.result_handler, 1)

    # Sorted before comparing, so the order in which results arrive does
    # not matter.
    collected_results = sorted(self.results)
    self.assertEqual(collected_results, expected_results)
def __init__(
    self,
    group_to_object,
    object_to_group,
    sample_to_response,
    n_processes,
    parse_object_string=parse_object_string_sample,
):
    """
    Builds a ProblemData object which is responsible for providing an
    interface to all aspects of a dataset.

    Args:
        group_to_object: A list with one dict per scope, each mapping a
            group to its objects.
        object_to_group: A list with one dict per scope, each mapping an
            object to its group. Must be the same length as
            group_to_object.
        sample_to_response: A dict mapping a sample ID to its response
            variable.
        n_processes: A non-negative int, forwarded to
            multiprocess_functions.
        parse_object_string: A function taking at least one argument; it is
            forwarded to build_group_records.

    Raises:
        InputTypeError: If any argument fails the checks described above.
    """
    if not isinstance(group_to_object, list):
        raise InputTypeError("group_to_object should be a list type")
    if not isinstance(object_to_group, list):
        raise InputTypeError("object_to_group should be a list type")
    if len(object_to_group) != len(group_to_object):
        raise InputTypeError("object_to_group and group_to_object should be the same length")
    if not all(isinstance(o_to_g, dict) for o_to_g in object_to_group):
        raise InputTypeError("object_to_group should be a list of dict types")
    if not all(isinstance(g_to_o, dict) for g_to_o in group_to_object):
        raise InputTypeError("group_to_object should be a list of dict types")
    if not isinstance(sample_to_response, dict):
        raise InputTypeError("sample_to_response should be a dict type")
    if not isinstance(n_processes, int) or n_processes < 0:
        raise InputTypeError("n_processes should be a non-negative int")
    # The messages below name the actual parameter, parse_object_string.
    if not isinstance(parse_object_string, types.FunctionType):
        raise InputTypeError("parse_object_string should be a function")
    if len(inspect.getargspec(parse_object_string)[0]) < 1:
        raise InputTypeError("parse_object_string should take at least one argument")

    # The response array and the sample index below both follow the
    # iteration order of sample_to_response, so positions line up.
    self.response_variables = np.array([sample_to_response[sample]
                                        for sample in sample_to_response])
    self.n_scopes = len(object_to_group)
    self.n_unmasked_samples = len(sample_to_response)

    # Keep a stack of masks for various levels of data partitions
    self.mask_stack = MaskStack(self.n_unmasked_samples)

    sample_indices = dict((sample, index)
                          for index, sample in enumerate(sample_to_response))
    assert all(
        self.response_variables[sample_indices[sample]] == sample_to_response[sample]
        for sample in sample_to_response
    ), "sample_indices are not able to map correctly back to the response variable"

    # One job per scope; the tag carries the scope so each result can be
    # stored at the right position afterwards.
    process_definitions = [
        ProcessDefinition(
            build_group_records,
            positional_arguments=(scope, scope_groups, sample_indices, parse_object_string),
            tag=scope,
        )
        for scope, scope_groups in enumerate(group_to_object)
    ]
    results = []
    multiprocess_functions(process_definitions, results.append, n_processes)

    self.group_records = [None] * self.n_scopes
    for scope, result in results:
        self.group_records[scope] = result

    for scope in range(self.n_scopes):
        for group in self.group_records[scope]:
            self.build_scope_map(self.group_records[scope][group], group_to_object, object_to_group)

    # Sanity check: every record must agree with the scope and key it is
    # stored under.
    for scope in range(self.n_scopes):
        for key in self.group_records[scope]:
            feature_record = self.group_records[scope][key].feature_record
            assert (
                feature_record.get_id() == key
            ), "feature_record had mismatched id to it's key"
            assert (
                feature_record.get_scope() == scope
            ), "feature_record had mismatched scope to it's key"