Esempio n. 1
0
def main(argv):
    """
    Train a model, cross-validate it or predict commit message scores.

    Arguments:
        argv: Sequence of command line arguments, without the program name.
            Besides the option syntax shown by `--help`, the legacy
            positional form `GROUP [FOLDS|MODEL [PATH]]` is still accepted
            and translated to the equivalent options. The sequence passed
            in by the caller is never modified.

    Raises:
        ValueError: If predicting from or outputting one of the dump files
            raises a ValueError; the message names the offending file.
    """
    # Work on a private copy: the legacy argument rewriting below inserts
    # option flags, which must not leak into the caller's list (and would
    # fail outright on an immutable sequence such as a tuple).
    argv = list(argv)
    if len(argv) > 0 and not argv[0].startswith('-'):
        # Backward compatible arguments
        argv.insert(0, '--group')
        if len(argv) > 2:
            # A purely numeric second argument is a fold count, anything
            # else is taken to be a model file name.
            argv.insert(2, '--cv-folds' if argv[2].isdigit() else '--model')
        if len(argv) > 4:
            argv.insert(4, '--path')

    parser = argparse.ArgumentParser(description='Train a model, cross-validate or predict commit message scores')
    parser.add_argument('-g','--group', default='id', help="Group name to group on. May be one of the groups in the data set, such as 'id' or 'score'.")
    parser.add_argument('-m','--model', dest='model_file', default='', help='Model file to read a trained model from or save the model to after training.')
    parser.add_argument('-f','--cv-folds', dest='cv_folds', type=int, nargs='?', default=0, const=5, metavar='FOLDS', help='Perform cross-validation with number of folds to use.')
    parser.add_argument('-t','--only-train', dest='only_train', action='store_true', default=False, help='Stop after training the model. Ignored by --cv-folds.')
    parser.add_argument('-p','--path', default='', help='Path to read input files from.')

    # Additional algorithm parameters
    algos = Algorithms()
    parameters, algorithms = algos.read_manifest()

    parser.add_argument('--algorithm', default='RandomForestClassifier', choices=algorithms.keys(), help='Model algorithm to use for training and predictions')
    for parameter, versions in parameters.iteritems():
        kw = {
            "dest": parameter,
            "help": 'Only for {} {}'.format(', '.join(versions.keys()), 'algorithm' if len(versions) == 1 else 'algorithms')
        }

        # The first known value of the parameter (over all algorithms that
        # use it) decides how the option is parsed.
        values = list(itertools.chain(*versions.values()))
        if len(values) > 0:
            # Can't set a default here since it might depend per algorithm.
            sample = values[0]
            if isinstance(sample, (int, float)):
                kw["type"] = type(sample)
            elif isinstance(sample, (str, unicode)):
                # Remove duplicates
                values.sort()
                kw["choices"] = [k for k, v in itertools.groupby(values)]
            elif isinstance(sample, list):
                kw["nargs"] = len(sample)
                kw["type"] = type(sample[0])

        parser.add_argument('--{}'.format(parameter.replace('_', '-')), **kw)

    # Parse the arguments now that all arguments are known
    args = parser.parse_args(argv)
    given = vars(args)

    # Convert chosen algorithm to class and parameters
    algorithm = algorithms[args.algorithm]
    algorithm_class = Utilities.get_class(algorithm['module'], args.algorithm)

    algorithm_parameters = {}
    for parameter in algorithm['parameters']:
        if given[parameter] is None:
            # Option not given: fall back to the first value listed for
            # this algorithm in the manifest.
            algorithm_parameters[parameter] = parameters[parameter][args.algorithm][0]
        else:
            algorithm_parameters[parameter] = given[parameter]

    if args.only_train or args.cv_folds > 0:
        Utilities.print_algorithm(args.algorithm, algorithm_parameters)

    classifier = Classifier(args.group, args.model_file)
    classifier.create_model(train=not args.cv_folds, class_name=algorithm_class, parameters=algorithm_parameters, dense=algorithm['dense'])
    if args.cv_folds > 0:
        classifier.output_cross_validate(args.cv_folds)
    elif not args.only_train:
        if args.path != "" or sys.stdin.isatty():
            # Read dump files when a path is given explicitly or when
            # standard input is a terminal rather than piped data.
            path = args.path
            if path != "" and path[-1] != "/":
                path = path + "/"

            glob_pattern = 'commit_comments-dump.[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].json'
            # Dump files may live in numbered subdirectories or directly
            # in the given path.
            files = glob(path + '[0-9]*/' + glob_pattern) + glob(path + glob_pattern)
            for name in files:
                with open(name, 'rb') as dump_file:
                    try:
                        classifier.output(classifier.predict(dump_file))
                    except ValueError as e:
                        # Re-raise with the file name so the broken dump
                        # can be identified.
                        raise ValueError("File '{}' is incorrect: {}".format(name, e))
        else:
            classifier.output(classifier.predict(sys.stdin))
def main(argv):
    """
    Cross-validate every enabled algorithm of the manifest for all of its
    parameter combinations and store the scores.

    The algorithm descriptions are read from 'algorithms.json' and the
    measurements are merged into 'experiment_results.json', both in the
    current working directory. The results file is rewritten after each
    combination so that intermediate results are kept.

    Arguments:
        argv: Sequence of command line arguments, without the program name.
            The optional first item is the number of folds (default 5).
            The optional second item is a case-insensitive filter; only
            algorithms whose name, class name or module contains it are run.
    """
    folds = int(argv[0]) if len(argv) > 0 else 5
    name_filter = argv[1].lower() if len(argv) > 1 else ""

    # Fields to check whether the filter, if given, appears in.
    filter_fields = ['name', 'class_name', 'module']

    # Read the manifest containing algorithm descriptions.
    with open('algorithms.json', 'r') as manifest:
        algorithms = json.load(manifest)

    # Load previous results. Only a missing/unreadable file or invalid JSON
    # means "start from scratch"; anything else (e.g. an interrupt) must
    # not be silently swallowed.
    try:
        with open('experiment_results.json', 'r') as results_file:
            results = json.load(results_file)
    except (IOError, ValueError):
        results = {}

    for algorithm in algorithms:
        # Skip running the algorithm if it is disabled or the filter name does
        # not appear in any of the fields.
        if algorithm.get('disabled'):
            continue
        if name_filter and not any(
                name_filter in algorithm[field].lower()
                for field in filter_fields):
            continue

        # Convert manifest entries to classifier class and parameters
        class_name = Utilities.get_class(algorithm['module'],
                                         algorithm['class_name'])
        dense = algorithm.get('dense', False)
        manifest_parameters = algorithm['parameters']

        # Create all possible combinations of parameters.
        parameter_combinations = itertools.product(
            *manifest_parameters.values())

        # Parameters with only one candidate value, and parameters whose
        # values are strings; both lists are handed to
        # Utilities.get_parameter_string below.
        single_parameters = [
            param for param, values in manifest_parameters.iteritems()
            if len(values) == 1
        ]
        string_parameters = [
            param for param, values in manifest_parameters.iteritems()
            if isinstance(values[0], (str, unicode))
        ]
        for combination in parameter_combinations:
            classifier = Classifier('id')

            # Turn the selected parameter combination back into a dictionary
            parameters = dict(zip(manifest_parameters.keys(), combination))

            # Create the model according to the parameters
            classifier.create_model(train=False,
                                    class_name=class_name,
                                    parameters=parameters,
                                    dense=dense)

            Utilities.print_algorithm(algorithm['name'], parameters)
            parameter_string = Utilities.get_parameter_string(
                parameters, single_parameters + string_parameters)

            # Run cross-validation and print results
            result = classifier.output_cross_validate(folds)
            print('')

            # String-valued parameters become part of the result group name.
            name = algorithm['name'] + "".join(
                ", %s=%s" % (param, parameters[param])
                for param in string_parameters)

            # Write the result measurements into the results dictionary.
            if name not in results:
                results[name] = OrderedDict()

            results[name][parameter_string] = {
                'average': result.mean(),
                'standard_deviation': result.std()
            }

            # Write intermediate results (back) into a pretty-printed JSON file
            with open('experiment_results.json', 'w') as results_file:
                json.dump(results, results_file, indent=4,
                          separators=(',', ': '))
Esempio n. 3
0
def main(argv):
    """
    Run cross-validation experiments for the algorithms in the manifest.

    Reads algorithm descriptions from 'algorithms.json', runs every
    combination of the listed parameter values for each enabled algorithm
    and merges the mean and standard deviation of the cross-validation
    result into 'experiment_results.json'. Both files are located in the
    current working directory; the results file is written back after
    every combination.

    Arguments:
        argv: Sequence of command line arguments, without the program name:
            an optional number of folds (default 5), followed by an
            optional case-insensitive filter that must appear in the name,
            class name or module of an algorithm for it to be run.
    """
    folds = int(argv[0]) if len(argv) > 0 else 5
    wanted = argv[1].lower() if len(argv) > 1 else ""

    # Fields to check whether the filter, if given, appears in.
    filter_fields = ['name', 'class_name', 'module']

    # Read the manifest containing algorithm descriptions.
    with open('algorithms.json', 'r') as manifest:
        algorithms = json.load(manifest)

    # Load previous results
    try:
        with open('experiment_results.json', 'r') as previous:
            results = json.load(previous)
    except (IOError, ValueError):
        # No readable results file or no valid JSON in it: start with an
        # empty result set. Other exceptions (such as a keyboard interrupt)
        # are deliberately not caught here.
        results = {}

    for algorithm in algorithms:
        # Skip running the algorithm if it is disabled or the filter name does
        # not appear in any of the fields.
        if 'disabled' in algorithm and algorithm['disabled']:
            continue
        if wanted and not any(wanted in algorithm[k].lower() for k in filter_fields):
            continue

        # Convert manifest entries to classifier class and parameters
        class_name = Utilities.get_class(algorithm['module'], algorithm['class_name'])
        dense = algorithm['dense'] if 'dense' in algorithm else False
        options = algorithm['parameters']

        # Parameters that have a single candidate value and parameters with
        # string values; passed on to Utilities.get_parameter_string.
        single_parameters = [param for param, values in options.iteritems() if len(values) == 1]
        string_parameters = [param for param, values in options.iteritems() if isinstance(values[0], (str, unicode))]

        # Create all possible combinations of parameters.
        for combination in itertools.product(*options.values()):
            classifier = Classifier('id')

            # Turn the selected parameter combination back into a dictionary
            parameters = dict(zip(options.keys(), combination))

            # Create the model according to the parameters
            classifier.create_model(train=False, class_name=class_name, parameters=parameters, dense=dense)

            Utilities.print_algorithm(algorithm['name'], parameters)
            parameter_string = Utilities.get_parameter_string(parameters, single_parameters + string_parameters)

            # Run cross-validation and print results
            result = classifier.output_cross_validate(folds)
            print('')

            # Results are grouped per algorithm name, extended with the
            # chosen values of the string parameters.
            name = algorithm['name']
            for param in string_parameters:
                name += ", %s=%s" % (param, parameters[param])

            # Write the result measurements into the results dictionary.
            group = results.setdefault(name, OrderedDict())
            group[parameter_string] = {
                'average': result.mean(),
                'standard_deviation': result.std()
            }

            # Write intermediate results (back) into a pretty-printed JSON file
            with open('experiment_results.json', 'w') as output:
                json.dump(results, output, indent=4, separators=(',', ': '))