def command_line_argument_wrapper(model, n_iterations, group_map_files,
        start_level, mapping_file, prediction_field, include_only, negate,
        n_maintain, n_generate, score_predictions_function, split_abun_coef,
        split_score_coef, merge_abun_coef, merge_score_coef, delete_abun_coef,
        delete_score_coef, split_proportion, merge_proportion,
        delete_proportion, n_cross_folds, n_processes, output_dir, n_trials):

    """
    Sets up and executes scope optimization on a given problem, runs testing, and writes to files.
    
    Builds the data structures and objects used by fresco.scope_optimization from
    command-line-friendly arguments, then runs scope optimization, performs cross-fold testing
    on the results if instructed, and writes the results to output files.
    
    Args:
        model: A string representing a machine learning classification model to be used
            both for testing and within the optimization process.
        n_iterations: The number of iterations to be completed by the optimization process.
        group_map_files: A list of open files containing tab-separated lines mapping from
            groups to objects. For example:
                Group1    Object1    Object2    Object3
                Group2    Object4    Object5
            Map files should be ordered by decreasing level of scope, i.e., from most general
            to least general.
        start_level: The starting scope level (map file index) for the optimization process.
        mapping_file: An open file with tab-separated lines mapping from a sample ID to its
            properties. The first line should contain tab-separated identifiers for the properties.
            For example:
                SAMPLE_ID    COLOR    TASTE
                APPLE_g    GREEN    AMAZING
                APPLE_r    RED    AWFUL
        prediction_field: The name of the sample property (a column header in mapping_file)
            to be predicted.
        include_only: None, or a (field_name, values) pair, where field_name is a string and
            values is a list of strings; only samples whose field_name property is one of
            values are included.
        negate: A boolean; if True, the include_only filter is inverted.
        n_maintain: The number of vectors to be kept after every iteration of optimization.
        n_generate: The number of vectors to be generated for each input vector by the
            vector generator.
        score_predictions_function: A function which takes two lists of class predictions of
            equal length and returns a numerical score. For example:
                def score_predictions(real_classes, predicted_classes):
                    return sum([1 if real_classes[i] != predicted_classes[i] else 0
                                for i in range(len(real_classes))])
        split_abun_coef: The abundance deviation coefficient for the splitting heuristic.
        split_score_coef: The prediction score deviation coefficient for the splitting heuristic.
        merge_abun_coef: The abundance deviation coefficient for the merging heuristic.
        merge_score_coef: The prediction score deviation coefficient for the merging heuristic.
        delete_abun_coef: The abundance deviation coefficient for the deletion heuristic.
        delete_score_coef: The prediction score deviation coefficient for the deletion heuristic.
        split_proportion: The proportion of total features to be split each iteration.
        merge_proportion: The proportion of total features to be merged each iteration.
        delete_proportion: The proportion of total features to be deleted each iteration.
        n_cross_folds: The number of cross folds to use in scoring the vectors for selection.
        n_processes: The maximum number of additional processes to spawn.
        output_dir: The directory that the output files will be put in.
        n_trials: The number of cross folds to use in scoring the vectors returned by the
            optimization process. If 0, no testing will be performed.
    """
    simple_var_types = [
                 ("n_iterations", types.IntType),
                 ("n_maintain", types.IntType),
                 ("n_generate", types.IntType),
                 ("n_processes", types.IntType),
                 ("n_cross_folds", types.IntType),
                 ("n_trials", types.IntType),
                 ("start_level", types.IntType),
                 ("split_score_coef", (types.IntType, types.FloatType)),
                 ("split_abun_coef", (types.IntType, types.FloatType)),
                 ("merge_score_coef", (types.IntType, types.FloatType)),
                 ("merge_abun_coef", (types.IntType, types.FloatType)),
                 ("delete_score_coef", (types.IntType, types.FloatType)),   
                 ("delete_abun_coef", (types.IntType, types.FloatType)),
                 ("split_proportion", (types.IntType, types.FloatType)),
                 ("merge_proportion", (types.IntType, types.FloatType)),
                 ("delete_proportion", (types.IntType, types.FloatType)),
                 ("n_cross_folds", types.IntType),
                 ("mapping_file", types.FileType),
                 ("negate", types.BooleanType),
                 ("output_dir", types.StringType),
                 ("prediction_field", types.StringType),
                 ("include_only", (types.NoneType, types.ListType,
                                   types.TupleType)),
                 ("group_map_files", types.ListType)
                ]
    for var_name, var_type in simple_var_types:
        check_input_type(var_name, locals()[var_name], var_type)
    if len(inspect.getargspec(score_predictions_function)[0]) < 2:
        raise InputTypeError("score_predictions_function should take at least two parameters")
    if not all([isinstance(f, types.FileType) for f in group_map_files]):
        raise InputTypeError("group_map_files should be a list of open files")
    if include_only is not None:
        if not isinstance(include_only[0], types.StringType):
            raise InputTypeError("include_only[0] should be of type string")
        if not isinstance(include_only[1], types.ListType) or \
                not all([isinstance(value, types.StringType) for value in include_only[1]]):
            raise InputTypeError("include_only[1] should be a list of strings")

    if not exists(output_dir):
        makedirs(output_dir)

    log_fp = join(output_dir, 'info.log')
    logging.basicConfig(filename=log_fp, filemode='w', level=logging.DEBUG,
                        format='%(asctime)s\t%(levelname)s\t%(message)s')
    logging.info('Started feature vector optimization process for \'%s\' '
                 'model' % model)
    start_time = time()

    vector_model = GroupVectorModel(parse_model_string(model))
    group_vector_scorer = CrossValidationGroupVectorScorer(score_predictions_function, vector_model, n_cross_folds)
    
    problem_data = build_problem_data(group_map_files, mapping_file, prediction_field, start_level, include_only, negate, n_processes)
    assert isinstance(problem_data, ProblemData),\
        "build_problem_data did not return a GroupProblemData"
    
    initial_feature_ids = problem_data.get_group_ids(start_level)
    initial_feature_vector = FeatureVector([problem_data.get_feature_record(start_level, feature_id) for feature_id in initial_feature_ids])
    assert isinstance(initial_feature_vector, FeatureVector),\
        "build_problem_data did not return a FeatureVector"
    
    group_actions = [SplitAction(problem_data, split_proportion, split_abun_coef, split_score_coef),
                     MergeAction(problem_data, merge_proportion, merge_abun_coef, merge_score_coef),
                     DeleteAction(problem_data, delete_proportion, delete_abun_coef, delete_score_coef)]
    vector_generator = ActionVectorGenerator(group_actions, n_generate, problem_data)

    scope_optimization = ScopeOptimization(group_vector_scorer, vector_generator, n_processes, n_iterations, n_maintain)

    logging.info('Completed optimization initialization')
    if n_trials > 0:
        test_outcomes = scope_optimization_cross_validation(scope_optimization, initial_feature_vector, problem_data, vector_model, score_predictions_function, n_trials, n_processes)
        
        assert len(test_outcomes) == n_iterations, "test_outcomes does not contain an entry for each iteration"
        assert all([len(mask_outcomes) == n_trials for mask_outcomes in test_outcomes]), \
            "test_outcomes does not contain an outcome for each mask in each iteration"
        assert all([all([isinstance(outcome, ModelOutcome) for outcome in mask_outcomes])
                    for mask_outcomes in test_outcomes]), \
                    "test outcomes are not all of type ModelOutcome"
        
        prediction_testing_output_fp = join(output_dir,
                                            'prediction_testing_output.txt')
        write_to_file(testing_output_lines(test_outcomes),
                      prediction_testing_output_fp)
        assert isfile(prediction_testing_output_fp), \
            "Output file \"%s\"was not created" %prediction_testing_output_fp
            
        feature_vector_output_fp = join(output_dir,
                                        'fold_feature_vectors_output.txt')
        write_to_file(fold_features_output_lines(test_outcomes[-1], problem_data),
                      feature_vector_output_fp)
        assert isfile(feature_vector_output_fp), \
            "Output file \"%s\"was not created" %feature_vector_output_fp
    else:
        outcome = scope_optimization.optimize_vector(initial_feature_vector, problem_data, False)
        assert isinstance(outcome, ModelOutcome), "scope_optimization outcome is not a ModelOutcome"
        
        feature_vector_output_fp = join(output_dir,
                                        'feature_vector_output.txt')
        write_to_file(feature_output_lines(outcome, problem_data), feature_vector_output_fp)
        assert isfile(feature_vector_output_fp), \
            "Output file \"%s\"was not created" %feature_vector_output_fp

    end_time = time()
    elapsed_time = end_time - start_time
    logging.info('Finished feature vector optimization process for \'%s\' '
                 'model' % model)
    logging.info('Total elapsed time (in seconds): %d' % elapsed_time)
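

# A hypothetical invocation sketch (not part of fresco): the model string, file
# names, prediction field, and numeric settings below are placeholder assumptions
# chosen only to illustrate the argument shapes described in the docstring above.
if __name__ == '__main__':
    def score_predictions(real_classes, predicted_classes):
        # Fraction of predictions that match the true classes.
        matches = [1 if real_classes[i] == predicted_classes[i] else 0
                   for i in range(len(real_classes))]
        return float(sum(matches)) / len(real_classes)

    # Map files ordered from most general to least general scope.
    group_map_files = [open('phylum_map.txt'), open('genus_map.txt')]
    mapping_file = open('sample_mapping.txt')

    command_line_argument_wrapper(
        model='rf', n_iterations=10, group_map_files=group_map_files,
        start_level=0, mapping_file=mapping_file, prediction_field='COLOR',
        include_only=None, negate=False, n_maintain=5, n_generate=3,
        score_predictions_function=score_predictions,
        split_abun_coef=0.5, split_score_coef=0.5,
        merge_abun_coef=0.5, merge_score_coef=0.5,
        delete_abun_coef=0.5, delete_score_coef=0.5,
        split_proportion=0.1, merge_proportion=0.1, delete_proportion=0.1,
        n_cross_folds=5, n_processes=2, output_dir='fresco_output', n_trials=0)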


# Example 2

def command_line_argument_wrapper(model, n_iterations, group_map_files,
        start_level, mapping_file, prediction_field, include_only, negate,
        n_maintain, n_generate, score_predictions_function, split_abun_coef,
        split_score_coef, merge_abun_coef, merge_score_coef, delete_abun_coef,
        delete_score_coef, split_proportion, merge_proportion,
        delete_proportion, n_cross_folds, n_processes, output_dir, n_trials):

    """
    Sets up and executes scope optimization on a given problem, runs testing, and writes to files.
    
    Builds the data structures and objects used by fresco.scope_optimization from
    command-line-friendly arguments, then runs scope optimization, performs cross-fold testing
    on the results if instructed, and writes the results to output files.
    
    Args:
        model: A string representing a machine learning classification model to be used
            both for testing and within the optimization process.
        n_iterations: The number of iterations to be completed by the optimization process.
        group_map_files: A list of open files containing tab-separated lines mapping from
            groups to objects. For example:
                Group1    Object1    Object2    Object3
                Group2    Object4    Object5
            Map files should be ordered by decreasing level of scope, i.e., from most general
            to least general.
        start_level: The starting scope level (map file index) for the optimization process.
        mapping_file: An open file with tab-separated lines mapping from a sample ID to its
            properties. The first line should contain tab-separated identifiers for the properties.
            For example:
                SAMPLE_ID    COLOR    TASTE
                APPLE_g    GREEN    AMAZING
                APPLE_r    RED    AWFUL
        prediction_field: The name of the sample property (a column header in mapping_file)
            to be predicted.
        include_only: None, or a (field_name, values) pair, where field_name is a string and
            values is a list of strings; only samples whose field_name property is one of
            values are included.
        negate: A boolean; if True, the include_only filter is inverted.
        n_maintain: The number of vectors to be kept after every iteration of optimization.
        n_generate: The number of vectors to be generated for each input vector by the
            vector generator.
        score_predictions_function: A function which takes two lists of class predictions of
            equal length and returns a numerical score. For example:
                def score_predictions(real_classes, predicted_classes):
                    return sum([1 if real_classes[i] != predicted_classes[i] else 0
                                for i in range(len(real_classes))])
        split_abun_coef: The abundance deviation coefficient for the splitting heuristic.
        split_score_coef: The prediction score deviation coefficient for the splitting heuristic.
        merge_abun_coef: The abundance deviation coefficient for the merging heuristic.
        merge_score_coef: The prediction score deviation coefficient for the merging heuristic.
        delete_abun_coef: The abundance deviation coefficient for the deletion heuristic.
        delete_score_coef: The prediction score deviation coefficient for the deletion heuristic.
        split_proportion: The proportion of total features to be split each iteration.
        merge_proportion: The proportion of total features to be merged each iteration.
        delete_proportion: The proportion of total features to be deleted each iteration.
        n_cross_folds: The number of cross folds to use in scoring the vectors for selection.
        n_processes: The maximum number of additional processes to spawn.
        output_dir: The directory that the output files will be put in.
        n_trials: The number of cross folds to use in scoring the vectors returned by the
            optimization process. If 0, no testing will be performed.
    """
    if not exists(output_dir):
        makedirs(output_dir)

    log_fp = join(output_dir, 'info.log')
    logging.basicConfig(filename=log_fp, filemode='w', level=logging.DEBUG,
                        format='%(asctime)s\t%(levelname)s\t%(message)s')
    logging.info('Started feature vector optimization process for \'%s\' '
                 'model' % model)
    start_time = time()

    feature_vector_output_fp = join(output_dir,
                                    'feature_vector_output.txt')

    vector_model = GroupVectorModel(parse_model_string(model))
    group_vector_scorer = CrossValidationGroupVectorScorer(score_predictions_function, vector_model, n_cross_folds)
    problem_data, initial_feature_vector = build_problem_data(group_map_files, mapping_file, prediction_field, start_level, include_only, negate, n_processes)
    group_actions = [SplitAction(problem_data, split_proportion, split_abun_coef, split_score_coef),
                     MergeAction(problem_data, merge_proportion, merge_abun_coef, merge_score_coef),
                     DeleteAction(problem_data, delete_proportion, delete_abun_coef, delete_score_coef)]
    vector_generator = ActionVectorGenerator(group_actions, n_generate)

    if n_trials > 0:
        xfold_feature_vectors = [[] for i in range(n_iterations)]
        masks = [(train, test) for train, test in KFold(problem_data.get_n_samples(), n_folds=n_trials, indices=False)]
        for train_mask, test_mask in masks:
            problem_data.set_mask(train_mask)
            iteration_outcomes = scope_optimization(initial_feature_vector, problem_data, group_vector_scorer, vector_generator, n_iterations, n_processes, n_maintain, True)
            for iteration in range(len(iteration_outcomes)):
                xfold_feature_vectors[iteration].append(iteration_outcomes[iteration].feature_vector)
        functions = []
        mask_results = []
        for iteration in range(len(iteration_outcomes)):
            for mask_index in range(len(masks)):
                functions.append( (mask_testing, (problem_data, masks[mask_index], vector_model, score_predictions_function, xfold_feature_vectors[iteration][mask_index], (iteration, mask_index))) )
        multiprocess_functions(functions, mask_results.append, n_processes)
        test_outcomes = [[None for x in range(len(masks))] for i in range(n_iterations)]
        for tag, mask_result in mask_results:
            iteration, mask_index = tag
            test_outcomes[iteration][mask_index] = mask_result

        prediction_testing_output_fp = join(output_dir,
                                            'prediction_testing_output.txt')
        write_to_file(testing_output_lines(test_outcomes),
                      prediction_testing_output_fp)
        
        avg_outcome = stitch_avg_outcome(test_outcomes[-1], masks)

        write_to_file(feature_output_lines(avg_outcome),
                      feature_vector_output_fp)
    else:
        outcome = scope_optimization(initial_feature_vector, problem_data, group_vector_scorer, vector_generator, n_iterations, n_processes, n_maintain, False)
        write_to_file(feature_output_lines(outcome), feature_vector_output_fp)

    end_time = time()
    elapsed_time = end_time - start_time
    logging.info('Finished feature vector optimization process for \'%s\' '
                 'model' % model)
    logging.info('Total elapsed time (in seconds): %d' % elapsed_time)
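

# A small standalone sketch (an illustrative assumption, not fresco code) of the
# boolean-mask cross-fold pattern used in the n_trials branch above. Older
# scikit-learn versions of KFold(n, n_folds, indices=False) yielded boolean
# train/test masks rather than index arrays; the generator below reproduces that
# shape with plain numpy so the reassembly into test_outcomes[iteration][mask_index]
# is easier to follow.
import numpy as np

def boolean_kfold_masks(n_samples, n_folds):
    # Assign each sample to one of n_folds folds, then hold each fold out once.
    fold_ids = np.arange(n_samples) % n_folds
    for fold in range(n_folds):
        test_mask = fold_ids == fold
        train_mask = ~test_mask
        yield train_mask, test_mask

# Each sample appears in exactly one test mask across the folds.
for train_mask, test_mask in boolean_kfold_masks(6, 3):
    print("train=%s test=%s" % (train_mask, test_mask))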