Beispiel #1
0
def main(argv):
    # Constants for the analyzer and the classifier
    dataset = 'commit_comments-dump.2015-01-29.json'
    group = 'id'
    model_file = 'model.pickle'

    # Create the analyzer
    analyzer = Analyzer(group)

    # Create the classifier
    algorithm_class = RandomForestRegressor
    algorithm_parameters = {
        'n_estimators': 100,
        'n_jobs': 2,
        'min_samples_split': 10
    }
    classifier = Classifier(group, model_file)
    classifier.create_model(train=True,
                            class_name=algorithm_class,
                            parameters=algorithm_parameters)

    # Compare analyzer output with classifier output and identify differences
    unrecognized_negative = {}
    unrecognized_positive = {}
    predictions = classifier.predict()
    line = 0  # Dataset line
    i = 0  # Prediction ID (+1)
    file = open(dataset, 'rb')
    for data in Utilities.read_json(file, 'id', group):
        line = line + 1
        if line % 1000 == 0:
            print(line)
        if not classifier.filter(data):
            continue
        i = i + 1

        message = data['message']
        score = analyzer.analyze(message)[0]
        if score == 0:
            continue

        diff = predictions[i - 1] - score
        if abs(diff) < 1.0:
            continue

        target = unrecognized_negative if diff < 0 else unrecognized_positive
        target[line] = diff

    result = sorted(unrecognized_positive.items(), key=lambda x: x[1])
    for item in result:
        print("{}: {}: {}".format(item[0], item[1],
                                  linecache.getline(dataset, item[0])[:-1]))
def main(argv):
    folds = int(argv[0]) if len(argv) > 0 else 5
    filter = argv[1].lower() if len(argv) > 1 else ""

    # Fields to check whether the filter, if given, appears in.
    filter_fields = ['name', 'class_name', 'module']

    # Read the manifest containing algorithm descriptions.
    with open('algorithms.json', 'r') as manifest:
        algorithms = json.load(manifest)

    # Load previous results
    try:
        with open('experiment_results.json', 'r') as file:
            results = json.load(file)
    except:
        results = {}

    for algorithm in algorithms:
        # Skip running the algorithm if it is disabled or the filter name does
        # not appear in any of the fields.
        if 'disabled' in algorithm and algorithm['disabled']:
            continue
        if filter and all(
            [filter not in algorithm[k].lower() for k in filter_fields]):
            continue

        # Convert manifest entries to classifier class and parameters
        class_name = Utilities.get_class(algorithm['module'],
                                         algorithm['class_name'])
        dense = algorithm['dense'] if 'dense' in algorithm else False

        # Create all possible combinations of parameters.
        parameter_combinations = itertools.product(
            *algorithm['parameters'].values())

        single_parameters = [
            param for param, values in algorithm['parameters'].iteritems()
            if len(values) == 1
        ]
        string_parameters = [
            param for param, values in algorithm['parameters'].iteritems()
            if isinstance(values[0], (str, unicode))
        ]
        for combination in parameter_combinations:
            classifier = Classifier('id')

            # Turn the selected parameter combination back into a dictionary
            parameters = dict(zip(algorithm['parameters'].keys(), combination))

            # Create the model according to the parameters
            classifier.create_model(train=False,
                                    class_name=class_name,
                                    parameters=parameters,
                                    dense=dense)

            Utilities.print_algorithm(algorithm['name'], parameters)
            parameter_string = Utilities.get_parameter_string(
                parameters, single_parameters + string_parameters)

            # Run cross-validation and print results
            result = classifier.output_cross_validate(folds)
            print('')

            name = algorithm['name']
            for param in string_parameters:
                name += ", %s=%s" % (param, parameters[param])

            # Write the result measurements into the results dictionary.
            if name not in results:
                results[name] = OrderedDict()

            results[name].update({
                parameter_string: {
                    'average': result.mean(),
                    'standard_deviation': result.std()
                }
            })

            # Write intermediate results (back) into a pretty-printed JSON file
            with open('experiment_results.json', 'w') as file:
                json.dump(results, file, indent=4, separators=(',', ': '))
def main(argv):
    folds = int(argv[0]) if len(argv) > 0 else 5
    filter = argv[1].lower() if len(argv) > 1 else ""

    # Fields to check whether the filter, if given, appears in.
    filter_fields = ['name', 'class_name', 'module']

    # Read the manifest containing algorithm descriptions.
    with open('algorithms.json', 'r') as manifest:
        algorithms = json.load(manifest)

    # Load previous results
    try:
        with open('experiment_results.json', 'r') as file:
            results = json.load(file)
    except:
        results = {}

    for algorithm in algorithms:
        # Skip running the algorithm if it is disabled or the filter name does 
        # not appear in any of the fields.
        if 'disabled' in algorithm and algorithm['disabled']:
            continue
        if filter and all([filter not in algorithm[k].lower() for k in filter_fields]):
            continue

        # Convert manifest entries to classifier class and parameters
        class_name = Utilities.get_class(algorithm['module'], algorithm['class_name'])
        dense = algorithm['dense'] if 'dense' in algorithm else False

        # Create all possible combinations of parameters.
        parameter_combinations = itertools.product(*algorithm['parameters'].values())

        single_parameters = [param for param,values in algorithm['parameters'].iteritems() if len(values) == 1]
        string_parameters = [param for param,values in algorithm['parameters'].iteritems() if isinstance(values[0],(str,unicode))]
        for combination in parameter_combinations:
            classifier = Classifier('id')

            # Turn the selected parameter combination back into a dictionary
            parameters = dict(zip(algorithm['parameters'].keys(), combination))

            # Create the model according to the parameters
            classifier.create_model(train=False, class_name=class_name, parameters=parameters, dense=dense)

            Utilities.print_algorithm(algorithm['name'], parameters)
            parameter_string = Utilities.get_parameter_string(parameters, single_parameters + string_parameters)

            # Run cross-validation and print results
            result = classifier.output_cross_validate(folds)
            print('')

            name = algorithm['name']
            for param in string_parameters:
                name += ", %s=%s" % (param,parameters[param])

            # Write the result measurements into the results dictionary.
            if name not in results:
                results[name] = OrderedDict()
            
            results[name].update({
                parameter_string: {
                    'average': result.mean(),
                    'standard_deviation': result.std()
                }
            })

            # Write intermediate results (back) into a pretty-printed JSON file
            with open('experiment_results.json', 'w') as file:
                json.dump(results, file, indent=4, separators=(',', ': '))