Example #1
def scope_optimization(
    initial_feature_vector,
    problem_data,
    group_vector_scorer,
    vector_generator,
    n_iterations,
    n_processes,
    n_maintain,
    save_iterations,
):
    # Score the starting vector; outcome_set holds the current population of
    # scored outcomes, while return_set records the best outcome per iteration.
    outcome_set = [group_vector_scorer.score_feature_vector(problem_data, initial_feature_vector)]
    return_set = [outcome_set[0]]

    for iteration in range(1, n_iterations):
        # Generate candidate vectors from the surviving outcomes and score them
        # in parallel, appending each scored outcome as it completes.
        new_vector_set = vector_generator.generate_vectors(outcome_set)
        testing_functions = [
            (group_vector_scorer.score_feature_vector, (problem_data, new_vector))
            for new_vector in new_vector_set
        ]
        multiprocess_functions(testing_functions, outcome_set.append, n_processes)

        # Keep only the n_maintain best outcomes and record this iteration's winner.
        outcome_set.sort(key=lambda outcome: outcome.prediction_quality, reverse=True)
        outcome_set = outcome_set[:n_maintain]
        return_set.append(outcome_set[0])

    return return_set if save_iterations else return_set[-1]
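
Every example on this page funnels its parallel work through multiprocess_functions, whose implementation is not shown. A minimal stand-in consistent with the call sites above might look like the sketch below; it is an assumption, not fresco's actual code, and note that Examples #2 and #6 pass ProcessDefinition objects rather than bare (function, args) tuples.

# Hypothetical stand-in for multiprocess_functions, assuming each job is a
# (function, args) tuple and every result is passed to result_handler. Bound
# methods and nested functions are not picklable, so the real implementation
# must use some other dispatch; this sketch simply runs sequentially when
# n_processes <= 1.
from multiprocessing import Pool

def multiprocess_functions(functions, result_handler, n_processes):
    if n_processes <= 1:
        for function, arguments in functions:
            result_handler(function(*arguments))
        return
    pool = Pool(processes=n_processes)
    try:
        for async_result in [pool.apply_async(function, arguments)
                             for function, arguments in functions]:
            result_handler(async_result.get())
    finally:
        pool.close()
        pool.join()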
Example #2
def scope_optimization_cross_validation(
    scope_optimization,
    initial_feature_vector,
    problem_data,
    vector_model,
    prediction_scoring_function,
    n_cross_folds,
    n_processes,
):
    masks = list(KFold(problem_data.get_n_unmasked_samples(), n_folds=n_cross_folds))
    
    # Perform the optimization process for each fold.
    n_iterations = scope_optimization.n_iterations
    training_outcomes = [[] for i in range(n_iterations)]
    for train_mask, test_mask in masks:
        problem_data.push_mask(test_mask)
        iteration_outcomes = scope_optimization.optimize_vector(initial_feature_vector, problem_data, True)
        problem_data.pop_mask()
        for iteration in range(n_iterations):
            training_outcomes[iteration].append(iteration_outcomes[iteration])
    
    # Score the resulting feature vectors for each fold in parallel.
    process_definitions = []
    for iteration in range(n_iterations):
        for mask_index in range(len(masks)):
            X = build_sample_matrix(problem_data, training_outcomes[iteration][mask_index].feature_vector)
            y = problem_data.get_response_variables()
            train_mask, test_mask = masks[mask_index]
            
            process_definitions.append(ProcessDefinition(
                build_model_outcome_with_matrices,
                positional_arguments=(training_outcomes[iteration][mask_index].feature_vector,
                                      vector_model, prediction_scoring_function,
                                      X[train_mask], y[train_mask], X[test_mask], y[test_mask]),
                tag=(iteration, mask_index)))
    
    test_results = []
    multiprocess_functions(process_definitions, test_results.append, n_processes)

    test_outcomes = [[None] * len(masks) for i in range(n_iterations)]
    for tag, mask_result in test_results:
        iteration, mask_index = tag
        test_outcomes[iteration][mask_index] = mask_result

    return test_outcomes
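
Example #2 depends on ProcessDefinition to tag each job so that out-of-order parallel results can be slotted back into test_outcomes by (iteration, mask_index). A minimal sketch of that contract follows; it is hypothetical, and the real class may carry more options.

# Hypothetical sketch of the ProcessDefinition contract assumed above: bundle a
# function with its positional arguments and an opaque tag, and report results
# as (tag, result) pairs so the caller can reassemble them in order.
class ProcessDefinition(object):
    def __init__(self, function, positional_arguments=(), tag=None):
        self.function = function
        self.positional_arguments = positional_arguments
        self.tag = tag

    def process(self):
        result = self.function(*self.positional_arguments)
        return result if self.tag is None else (self.tag, result)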
Example #3
    def __init__(self, group_to_object, object_to_group, sample_to_response,
                 n_processes, parse_object_sample=parse_object_string_sample, mask=None):
        samples = list(sample_to_response.keys())
        self.response_variables = np.array([sample_to_response[sample] for sample in samples])
        self.max_scope = len(object_to_group) - 1
        self.mask = mask
        self.n_samples = len(sample_to_response)

        # Map each sample ID to its row index in the feature columns.
        sample_indices = dict((sample, index) for index, sample in enumerate(samples))

        self.feature_columns = dict()
        self.feature_records = dict()
        self.scope_map = dict()

        def build_scope_columns_records_splits(scope, group_to_object, sample_indices):
            feature_records = dict()
            feature_columns = dict()
            scope_map = dict()
            for group in group_to_object[scope]:
                objects = group_to_object[scope][group]

                # Count, per sample, how many of this group's objects belong to it.
                feature_column = np.zeros(len(sample_indices))
                for obj in objects:
                    sample_id = parse_object_sample(obj)

                    if sample_id in sample_indices:
                        sample_index = sample_indices[sample_id]
                        feature_column[sample_index] += 1
                feature_columns[(scope, group)] = feature_column

                feature_abundance = feature_column.sum()
                feature_id = group
                feature_records[(scope, group)] = FeatureRecord(feature_id, scope, feature_abundance)

                # Map this group to the overlapping groups at every other scope.
                # Write to the local scope_map (not self.scope_map) so the worker
                # function returns its results instead of mutating shared state.
                for final_scope in range(len(group_to_object)):
                    if final_scope == scope:
                        scope_map[(scope, group, final_scope)] = [(scope, group)]
                        continue

                    final_groups = list(set(object_to_group[final_scope][obj] for obj in objects
                                            if obj in object_to_group[final_scope]))
                    scope_map[(scope, group, final_scope)] = [(final_scope, final_group)
                                                              for final_group in final_groups]

            return (feature_columns, feature_records, scope_map)

        functions = []
        results = []
        for scope in range(len(group_to_object)):
            functions.append((build_scope_columns_records_splits, (scope, group_to_object, sample_indices)))
        multiprocess_functions(functions, results.append, n_processes)

        for feature_columns, feature_records, scope_map in results:
            self.feature_columns.update(feature_columns)
            self.feature_records.update(feature_records)
            self.scope_map.update(scope_map)
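
The shape of the inputs these constructors expect is easier to see with a small, made-up example, assuming the default parse_object_string_sample extracts the sample ID prefix from an object string.

# Illustrative, invented inputs: two scope levels ordered most general to least
# general; the prefix of each object string (before the final underscore) is
# assumed to be its sample ID.
group_to_object = [
    {'KingdomA': ['APPLE_g_1', 'APPLE_r_1']},                # scope 0
    {'SpeciesX': ['APPLE_g_1'], 'SpeciesY': ['APPLE_r_1']},  # scope 1
]
object_to_group = [
    {'APPLE_g_1': 'KingdomA', 'APPLE_r_1': 'KingdomA'},
    {'APPLE_g_1': 'SpeciesX', 'APPLE_r_1': 'SpeciesY'},
]
sample_to_response = {'APPLE_g': 'GREEN', 'APPLE_r': 'RED'}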
Example #4
def command_line_argument_wrapper(model, n_iterations, group_map_files,
        start_level, mapping_file, prediction_field, include_only, negate,
        n_maintain, n_generate, score_predictions_function, split_abun_coef,
        split_score_coef, merge_abun_coef, merge_score_coef, delete_abun_coef,
        delete_score_coef, split_proportion, merge_proportion,
        delete_proportion, n_cross_folds, n_processes, output_dir, n_trials):

    """
    Sets up and executes scope optimization on a given problem, runs testing, and writes to files.
    
    Builds the data structures and objects used by fresco.scope_optimization from
    command-line-friendly arguments, then runs scope optimization, performs cross-fold
    testing on the results if instructed, and writes the results to output files.
    
    Args:
        model: A string representing a machine learning classification model to be used
            both for testing and within the optimization process.
        n_iterations: The number of iterations to be completed by the optimization process.
        group_map_files: A list of open files containing tab-separated lines mapping from
            groups to objects. For example:
                Group1    Object1    Object2    Object3
                Group2    Object4    Object5
            Map files should be ordered by decreasing level of scope, i.e., most general to least
            general map files.
        start_level: The starting scope level (map file index) for the optimization process.
        mapping_file: An open file with tab separated lines mapping from a sample ID to its
            properties. The first line should be tab separated identifiers for the properties.
            For example:
                SAMPLE_ID    COLOR    TASTE
                APPLE_g    GREEN    AMAZING
                APPLE_r    RED    AWFUL
        prediction_field: The name of the property in mapping_file whose values are
            the classes to be predicted.
        n_maintain: The number of vectors to be kept after every iteration of optimization.
        n_generate: The number of vectors to be generated for each input vector by the
            vector generator.
        score_predictions_function: A function which takes two lists of class predictions of
            equal length and returns a numerical score. For example:
                def score_predictions(real_classes, predicted_classes):
                    return sum([1 if real_classes[i] != predicted_classes[i] else 0
                                for i in range(len(real_classes))])
        split_abun_coef: The abundance deviation coefficient for the splitting heuristic.
        split_score_coef: The prediction score deviation coefficient for the splitting heuristic.
        merge_abun_coef: The abundance deviation coefficient for the merging heuristic.
        merge_score_coef: The prediction score deviation coefficient for the merging heuristic.
        delete_abun_coef: The abundance deviation coefficient for the deletion heuristic.
        delete_score_coef: The prediction score deviation coefficient for the deletion heuristic.
        split_proportion: The proportion of total features to be split each iteration.
        merge_proportion: The proportion of total features to be merged each iteration.
        delete_proportion: The proportion of total features to be deleted each iteration.
        n_cross_folds: The number of cross folds to use in scoring the vectors for selection.
        n_processes: The number of additional processes to spawn, at maximum.
        output_dir: The directory that the output files will be put in.
        n_trials: The number of cross folds to use in scoring the vectors returned by the
            optimization process. If 0, no testing will be performed.
    """
    if not exists(output_dir):
        makedirs(output_dir)

    log_fp = join(output_dir, 'info.log')
    logging.basicConfig(filename=log_fp, filemode='w', level=logging.DEBUG,
                        format='%(asctime)s\t%(levelname)s\t%(message)s')
    logging.info('Started feature vector optimization process for \'%s\' '
                 'model' % model)
    start_time = time()

    feature_vector_output_fp = join(output_dir,
                                    'feature_vector_output.txt')

    vector_model = GroupVectorModel(parse_model_string(model))
    group_vector_scorer = CrossValidationGroupVectorScorer(score_predictions_function, vector_model, n_cross_folds)
    problem_data, initial_feature_vector = build_problem_data(group_map_files, mapping_file, prediction_field, start_level, include_only, negate, n_processes)
    group_actions = [SplitAction(problem_data, split_proportion, split_abun_coef, split_score_coef),
                     MergeAction(problem_data, merge_proportion, merge_abun_coef, merge_score_coef),
                     DeleteAction(problem_data, delete_proportion, delete_abun_coef, delete_score_coef)]
    vector_generator = ActionVectorGenerator(group_actions, n_generate)

    if n_trials > 0:
        xfold_feature_vectors = [[] for i in range(n_iterations)]
        masks = list(KFold(problem_data.get_n_samples(), n_folds=n_trials, indices=False))
        for train_mask, test_mask in masks:
            problem_data.set_mask(train_mask)
            iteration_outcomes = scope_optimization(initial_feature_vector, problem_data, group_vector_scorer, vector_generator, n_iterations, n_processes, n_maintain, True)
            for iteration in range(len(iteration_outcomes)):
                xfold_feature_vectors[iteration].append(iteration_outcomes[iteration].feature_vector)
        functions = []
        mask_results = []
        for iteration in range(n_iterations):
            for mask_index in range(len(masks)):
                functions.append((mask_testing,
                                  (problem_data, masks[mask_index], vector_model,
                                   score_predictions_function,
                                   xfold_feature_vectors[iteration][mask_index],
                                   (iteration, mask_index))))
        multiprocess_functions(functions, mask_results.append, n_processes)
        test_outcomes = [[None for x in range(len(masks))] for i in range(n_iterations)]
        for tag, mask_result in mask_results:
            iteration, mask_index = tag
            test_outcomes[iteration][mask_index] = mask_result

        prediction_testing_output_fp = join(output_dir,
                                            'prediction_testing_output.txt')
        write_to_file(testing_output_lines(test_outcomes),
                      prediction_testing_output_fp)
        
        avg_outcome = stitch_avg_outcome(test_outcomes[-1], masks)

        write_to_file(feature_output_lines(avg_outcome),
                      feature_vector_output_fp)
    else:
        outcome = scope_optimization(initial_feature_vector, problem_data, group_vector_scorer, vector_generator, n_iterations, n_processes, n_maintain, False)
        write_to_file(feature_output_lines(outcome), feature_vector_output_fp)

    end_time = time()
    elapsed_time = end_time - start_time
    logging.info('Finished feature vector optimization process for \'%s\' '
                 'model' % model)
    logging.info('Total elapsed time (in seconds): %d' % elapsed_time)
Example #5
    def test_multiprocess_functions(self):
        """Test running processes in parallel."""
        multiprocess_functions(self.procs, self.result_handler, 1)
        self.assertEqual(sorted(self.results), [2, 12])
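
The fixture this test relies on is not shown; below is a hypothetical setUp consistent with the assertion, where the helper names and operand values are invented so the collected results sort to [2, 12].

# Hypothetical fixture, not part of the original source: two jobs whose results,
# 1 + 1 = 2 and 3 * 4 = 12, are collected by result_handler.
import unittest

def _add(x, y):
    return x + y

def _multiply(x, y):
    return x * y

class MultiprocessFunctionsTests(unittest.TestCase):
    def setUp(self):
        self.procs = [(_add, (1, 1)), (_multiply, (3, 4))]
        self.results = []
        self.result_handler = self.results.append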
Example #6
    def __init__(
        self,
        group_to_object,
        object_to_group,
        sample_to_response,
        n_processes,
        parse_object_string=parse_object_string_sample,
    ):
        """
        Builds a ProblemData object which is responsible for providing an interface to all aspects of a dataset.
        
        Args: object_to_group, group_to_object, 
            
        """
        if not isinstance(group_to_object, types.ListType):
            raise InputTypeError("group_to_object should be a list type")
        if not isinstance(object_to_group, types.ListType):
            raise InputTypeError("object_to_group should be a list type")
        if not len(object_to_group) == len(group_to_object):
            raise InputTypeError("object_to_group and group_to_object should be the same length")
        if not all([isinstance(o_to_g, types.DictType) for o_to_g in object_to_group]):
            raise InputTypeError("object_to_group should be a list of dict types")
        if not all([isinstance(g_to_o, types.DictType) for g_to_o in group_to_object]):
            raise InputTypeError("group_to_object should be a list of dict types")
        if not isinstance(sample_to_response, types.DictType):
            raise InputTypeError("sample_to_response should be a dict type")
        if not isinstance(n_processes, types.IntType) or n_processes < 0:
            raise InputTypeError("n_processes should be a non-negative int")
        if not isinstance(parse_object_string, types.FunctionType):
            raise InputTypeError("parse_object_string should be a function")
        if len(inspect.getargspec(parse_object_string)[0]) < 1:
            raise InputTypeError("parse_object_string should take at least one argument")

        samples = list(sample_to_response.keys())
        self.response_variables = np.array([sample_to_response[sample] for sample in samples])
        self.n_scopes = len(object_to_group)
        self.n_unmasked_samples = len(sample_to_response)

        # Keep a stack of masks for various levels of data partitions
        self.mask_stack = MaskStack(self.n_unmasked_samples)

        # Map each sample ID to its row index in the response/feature arrays.
        sample_indices = dict((sample, index) for index, sample in enumerate(samples))

        assert all(
            self.response_variables[sample_indices[sample]] == sample_to_response[sample]
            for sample in samples
        ), "sample_indices does not map correctly back to the response variables"

        process_definitions = []
        results = []
        for scope in range(len(group_to_object)):
            process_definitions.append(
                ProcessDefinition(
                    build_group_records,
                    positional_arguments=(scope, group_to_object[scope], sample_indices, parse_object_string),
                    tag=scope,
                )
            )
        multiprocess_functions(process_definitions, results.append, n_processes)

        self.group_records = [None] * self.n_scopes
        for scope, result in results:
            self.group_records[scope] = result

        for scope in range(self.n_scopes):
            for group in self.group_records[scope]:
                self.build_scope_map(self.group_records[scope][group], group_to_object, object_to_group)

        for scope in range(self.n_scopes):
            for key in self.group_records[scope]:
                assert (
                    self.group_records[scope][key].feature_record.get_id() == key
                ), "feature_record had a mismatched id for its key"
                assert (
                    self.group_records[scope][key].feature_record.get_scope() == scope
                ), "feature_record had a mismatched scope for its key"