def main(argv):
    """
    Train a model, cross-validate it or predict commit message scores.

    `argv` is the list of command line arguments without the program name.
    Besides the options registered below, the legacy positional form
    `GROUP [FOLDS|MODEL [PATH]]` is still accepted. The given list is never
    modified.

    Depending on the arguments, this either cross-validates the model
    (`--cv-folds`), stops after training it (`--only-train`), or predicts
    the contents of the dump files found under `--path`. When no path is
    given and standard input is not a terminal, standard input is predicted
    instead.

    Raises a `ValueError` naming the offending file when predicting one of
    the input files fails with a `ValueError`. Invalid arguments are reported
    by `argparse`, which exits the program.
    """

    argv = _upgrade_legacy_arguments(argv)

    parser = argparse.ArgumentParser(
        description='Train a model, cross-validate or predict commit message scores'
    )
    parser.add_argument('-g', '--group', default='id',
                        help="Group name to group on. May be one of the groups in the data set, such as 'id' or 'score'.")
    parser.add_argument('-m', '--model', dest='model_file', default='',
                        help='Model file to read a trained model from or save the model to after training.')
    parser.add_argument('-f', '--cv-folds', dest='cv_folds', type=int,
                        nargs='?', default=0, const=5, metavar='FOLDS',
                        help='Perform cross-validation with number of folds to use.')
    parser.add_argument('-t', '--only-train', dest='only_train',
                        action='store_true', default=False,
                        help='Stop after training the model. Ignored by --cv-folds.')
    parser.add_argument('-p', '--path', default='',
                        help='Path to read input files from.')

    # Additional algorithm parameters. `parameters` maps each parameter name
    # to a dictionary of algorithm names and their lists of values, while
    # `algorithms` maps each algorithm name to its manifest entry.
    algos = Algorithms()
    parameters, algorithms = algos.read_manifest()
    parser.add_argument('--algorithm', default='RandomForestClassifier',
                        choices=list(algorithms.keys()),
                        help='Model algorithm to use for training and predictions')
    _add_parameter_arguments(parser, parameters)

    # Parse the arguments now that all arguments are known
    args = parser.parse_args(argv)

    # Convert chosen algorithm to class and parameters
    algorithm = algorithms[args.algorithm]
    algorithm_class = Utilities.get_class(algorithm['module'], args.algorithm)
    algorithm_parameters = {}
    for parameter in algorithm['parameters']:
        value = getattr(args, parameter)
        if value is None:
            # Not given on the command line: fall back to the first value
            # that the manifest lists for the chosen algorithm.
            value = parameters[parameter][args.algorithm][0]

        algorithm_parameters[parameter] = value

    if args.only_train or args.cv_folds > 0:
        Utilities.print_algorithm(args.algorithm, algorithm_parameters)

    classifier = Classifier(args.group, args.model_file)
    # Cross-validation performs its own training, so skip it here in that case.
    classifier.create_model(train=not args.cv_folds,
                            class_name=algorithm_class,
                            parameters=algorithm_parameters,
                            dense=algorithm['dense'])

    if args.cv_folds > 0:
        classifier.output_cross_validate(args.cv_folds)
    elif not args.only_train:
        if args.path != "" or sys.stdin.isatty():
            _predict_files(classifier, args.path)
        else:
            classifier.output(classifier.predict(sys.stdin))


def _upgrade_legacy_arguments(argv):
    """
    Return a copy of `argv` in which legacy positional arguments, if any,
    are preceded by the option names that they stand for.
    """

    argv = list(argv)
    if not argv or argv[0].startswith('-'):
        return argv

    # Backward compatible arguments: GROUP [FOLDS|MODEL [PATH]]
    argv.insert(0, '--group')
    if len(argv) > 2:
        argv.insert(2, '--cv-folds' if argv[2].isdigit() else '--model')
    if len(argv) > 4:
        argv.insert(4, '--path')

    return argv


def _add_parameter_arguments(parser, parameters):
    """
    Register one command line option on `parser` for each algorithm parameter
    in `parameters`, deriving its type or choices from the manifest values.
    """

    try:
        string_types = (str, unicode)
    except NameError:
        # Python 3 has no separate unicode type.
        string_types = (str,)

    for parameter, versions in parameters.items():
        kw = {
            "dest": parameter,
            "help": 'Only for {} {}'.format(
                ', '.join(versions.keys()),
                'algorithm' if len(versions) == 1 else 'algorithms'
            )
        }
        values = list(itertools.chain.from_iterable(versions.values()))
        if values:
            # Can't set a default here since it might depend per algorithm.
            first = values[0]
            if isinstance(first, (int, float)):
                # NOTE(review): booleans are integers as well, and argparse
                # converts any non-empty string to True for type=bool. Check
                # this if the manifest can contain boolean values.
                kw["type"] = type(first)
            elif isinstance(first, string_types):
                # Remove duplicates
                kw["choices"] = sorted(set(values))
            elif isinstance(first, list) and first:
                # An empty list provides neither an element type nor a
                # usable number of arguments, so leave the option plain then.
                kw["nargs"] = len(first)
                kw["type"] = type(first[0])

        parser.add_argument('--{}'.format(parameter.replace('_', '-')), **kw)


def _predict_files(classifier, path):
    """
    Predict and output every commit comments dump file found in `path` or in
    its subdirectories whose names start with a digit.
    """

    if path != "" and not path.endswith("/"):
        path = path + "/"

    glob_pattern = 'commit_comments-dump.[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].json'
    files = glob(path + '[0-9]*/' + glob_pattern) + glob(path + glob_pattern)
    for name in files:
        with open(name, 'rb') as dump_file:
            try:
                classifier.output(classifier.predict(dump_file))
            except ValueError as e:
                # Mention the file so that the broken dump can be found.
                raise ValueError("File '{}' is incorrect: {}".format(name, e))
def main(argv):
    """
    Cross-validate the algorithms from the manifest with all combinations of
    their parameters and store the measurements.

    `argv` is the list of command line arguments without the program name.
    The first argument is the number of folds, which defaults to 5. The
    optional second argument is a case-insensitive filter: only algorithms
    whose name, class name or module contains it are run.

    The algorithms are read from 'algorithms.json'. The measurements are
    merged into 'experiment_results.json', which is rewritten after every
    parameter combination so that intermediate results are kept.
    """

    folds = int(argv[0]) if len(argv) > 0 else 5
    name_filter = argv[1].lower() if len(argv) > 1 else ""

    # Fields to check whether the filter, if given, appears in.
    filter_fields = ['name', 'class_name', 'module']

    try:
        string_types = (str, unicode)
    except NameError:
        # Python 3 has no separate unicode type.
        string_types = (str,)

    # Read the manifest containing algorithm descriptions.
    with open('algorithms.json', 'r') as manifest:
        algorithms = json.load(manifest)

    # Load previous results. A missing or unreadable file simply means that
    # we start afresh, but interrupts and other errors must not be swallowed.
    # The order of the stored results is kept so that rewriting the file does
    # not shuffle the entries.
    try:
        with open('experiment_results.json', 'r') as results_file:
            results = json.load(results_file, object_pairs_hook=OrderedDict)
    except (IOError, ValueError):
        results = OrderedDict()

    for algorithm in algorithms:
        # Skip running the algorithm if it is disabled or the filter name does
        # not appear in any of the fields.
        if algorithm.get('disabled'):
            continue
        if name_filter and not any(name_filter in algorithm[field].lower()
                                   for field in filter_fields):
            continue

        # Convert manifest entries to classifier class and parameters
        class_name = Utilities.get_class(algorithm['module'],
                                         algorithm['class_name'])
        dense = algorithm.get('dense', False)

        # Create all possible combinations of parameters.
        manifest_parameters = algorithm['parameters']
        parameter_combinations = itertools.product(*manifest_parameters.values())

        single_parameters = [
            param for param, values in manifest_parameters.items()
            if len(values) == 1
        ]
        # A parameter without any values has no first value to inspect, and
        # it leads to no combinations to run either.
        string_parameters = [
            param for param, values in manifest_parameters.items()
            if values and isinstance(values[0], string_types)
        ]

        for combination in parameter_combinations:
            classifier = Classifier('id')

            # Turn the selected parameter combination back into a dictionary
            parameters = dict(zip(manifest_parameters.keys(), combination))

            # Create the model according to the parameters
            classifier.create_model(train=False, class_name=class_name,
                                    parameters=parameters, dense=dense)

            Utilities.print_algorithm(algorithm['name'], parameters)
            parameter_string = Utilities.get_parameter_string(
                parameters, single_parameters + string_parameters
            )

            # Run cross-validation and print results
            result = classifier.output_cross_validate(folds)
            print('')

            # String parameters become a part of the name of the result group.
            name = algorithm['name']
            for param in string_parameters:
                name += ", %s=%s" % (param, parameters[param])

            # Write the result measurements into the results dictionary.
            if name not in results:
                results[name] = OrderedDict()

            results[name].update({
                parameter_string: {
                    'average': result.mean(),
                    'standard_deviation': result.std()
                }
            })

            # Write intermediate results (back) into a pretty-printed JSON file
            with open('experiment_results.json', 'w') as results_file:
                json.dump(results, results_file, indent=4,
                          separators=(',', ': '))
def main(argv):
    """
    Cross-validate the algorithms from the manifest with all combinations of
    their parameters and store the measurements.

    `argv` is the list of command line arguments without the program name.
    The first argument is the number of folds, which defaults to 5. The
    optional second argument is a case-insensitive filter: only algorithms
    whose name, class name or module contains it are run.

    The algorithms are read from 'algorithms.json'. The measurements are
    merged into 'experiment_results.json', which is rewritten after every
    parameter combination so that intermediate results are kept.
    """

    folds = int(argv[0]) if len(argv) > 0 else 5
    name_filter = argv[1].lower() if len(argv) > 1 else ""

    # Fields to check whether the filter, if given, appears in.
    filter_fields = ['name', 'class_name', 'module']

    try:
        string_types = (str, unicode)
    except NameError:
        # Python 3 has no separate unicode type.
        string_types = (str,)

    # Read the manifest containing algorithm descriptions.
    with open('algorithms.json', 'r') as manifest:
        algorithms = json.load(manifest)

    # Load previous results. A missing or unreadable file simply means that
    # we start afresh, but interrupts and other errors must not be swallowed.
    # The order of the stored results is kept so that rewriting the file does
    # not shuffle the entries.
    try:
        with open('experiment_results.json', 'r') as results_file:
            results = json.load(results_file, object_pairs_hook=OrderedDict)
    except (IOError, ValueError):
        results = OrderedDict()

    for algorithm in algorithms:
        # Skip running the algorithm if it is disabled or the filter name does
        # not appear in any of the fields.
        if algorithm.get('disabled'):
            continue
        if name_filter and not any(name_filter in algorithm[field].lower()
                                   for field in filter_fields):
            continue

        # Convert manifest entries to classifier class and parameters
        class_name = Utilities.get_class(algorithm['module'],
                                         algorithm['class_name'])
        dense = algorithm.get('dense', False)

        # Create all possible combinations of parameters.
        manifest_parameters = algorithm['parameters']
        parameter_combinations = itertools.product(*manifest_parameters.values())

        single_parameters = [
            param for param, values in manifest_parameters.items()
            if len(values) == 1
        ]
        # A parameter without any values has no first value to inspect, and
        # it leads to no combinations to run either.
        string_parameters = [
            param for param, values in manifest_parameters.items()
            if values and isinstance(values[0], string_types)
        ]

        for combination in parameter_combinations:
            classifier = Classifier('id')

            # Turn the selected parameter combination back into a dictionary
            parameters = dict(zip(manifest_parameters.keys(), combination))

            # Create the model according to the parameters
            classifier.create_model(train=False, class_name=class_name,
                                    parameters=parameters, dense=dense)

            Utilities.print_algorithm(algorithm['name'], parameters)
            parameter_string = Utilities.get_parameter_string(
                parameters, single_parameters + string_parameters
            )

            # Run cross-validation and print results
            result = classifier.output_cross_validate(folds)
            print('')

            # String parameters become a part of the name of the result group.
            name = algorithm['name']
            for param in string_parameters:
                name += ", %s=%s" % (param, parameters[param])

            # Write the result measurements into the results dictionary.
            if name not in results:
                results[name] = OrderedDict()

            results[name].update({
                parameter_string: {
                    'average': result.mean(),
                    'standard_deviation': result.std()
                }
            })

            # Write intermediate results (back) into a pretty-printed JSON file
            with open('experiment_results.json', 'w') as results_file:
                json.dump(results, results_file, indent=4,
                          separators=(',', ': '))