Esempio n. 1
0
    def classify(self, data):
        """Classify a repository by the bag-of-words of its commit messages.

        Returns a class->probability dict; a zero dict when the model was
        never trained, the scikit-learn version differs from the one used
        for training, or the commits cannot be fetched.
        """
        config = self._model.config

        if 'version' not in config:
            logging.error(
                'Trying to use CommitMessageClassifier without learning first')
            return utility.get_zero_class_dict()

        if config['version'] != sklearn.__version__:
            logging.error(
                'Using CommitMessageClassifier with different scikit learn version (trained on: {}, used: {}) - relearn classifier first'
                .format(config['version'], sklearn.__version__))
            return utility.get_zero_class_dict()

        try:
            commits = data.get_commits()
        except github.GithubError:
            return utility.get_zero_class_dict()

        # Lazily unpickle the trained k-NN model on first use.
        if self._knn is None:
            serialized = base64.b64decode(config['knn'])
            self._knn = pickle.loads(serialized)

        # Mark every known vocabulary word that occurs in a commit message.
        bow = [False] * len(config['bow'])
        for commit in commits:
            for word in commit['commit']['message'].split():
                position = self._find_position(word.lower())
                if position != -1:
                    bow[position] = True

        probability = self._knn.predict_proba([bow])

        result = utility.get_zero_class_dict()
        for label, p in zip(self._knn.classes_, probability[0]):
            result[label] = p
        return result
Esempio n. 2
0
    def classify(self, data):
        """Classify a repository by nearest-neighbour edit distance on its name.

        Averages the learned class distributions of the closest trained
        repository name(s), including all names tied at the minimum
        distance. Returns a zero dict when the model was never trained or
        the repository data cannot be fetched.
        """
        # Bug fix: the original used `len(...) is 0`, an identity check on an
        # int that only works due to CPython small-int interning (and emits a
        # SyntaxWarning on Python 3.8+). Use a truthiness check instead.
        if not self._model.config:
            logging.error(
                'Trying to use NameClassifier without learning first')
            return utility.get_zero_class_dict()

        try:
            repo_name = data.get_repository_data()['name']
        except github.GithubError:
            return utility.get_zero_class_dict()

        # Edit distance from the queried name to every learned name.
        distances = [(target, utility.edit_distance(repo_name, target))
                     for target in self._model.config]
        distances.sort(key=lambda pair: pair[1])

        result = utility.get_zero_class_dict()

        # Number of nearest neighbours actually used (>=1, grows with ties).
        nn = 1

        for c in utility.get_classes():
            result[c] += self._model.config[distances[0][0]][c]

        # Also include every name tied with the nearest at the same distance.
        while nn < len(distances) and distances[nn - 1][1] == distances[nn][1]:
            for c in utility.get_classes():
                result[c] += self._model.config[distances[nn][0]][c]
            nn += 1

        # Average over the number of tied nearest neighbours.
        for c in utility.get_classes():
            result[c] /= nn

        return result
Esempio n. 3
0
    def classify(self, data):
        """Classify a repository from its language statistics via the trained tree.

        Returns a class->probability dict; a zero dict when the model was
        never trained, the scikit-learn version differs from the one used
        for training, or the languages cannot be fetched.
        """
        config = self._model.config

        if 'version' not in config:
            logging.error(
                'Trying to use LanguageDetailsClassifier without learning first'
            )
            return utility.get_zero_class_dict()

        if config['version'] != sklearn.__version__:
            logging.error(
                'Using LanguageDetailsClassifier with different scikit learn version (trained on: {}, used: {}) - relearn classifier first'
                .format(config['version'], sklearn.__version__))
            return utility.get_zero_class_dict()

        # Lazily unpickle the trained tree model on first use.
        if self._tree is None:
            serialized = base64.b64decode(config['tree'])
            self._tree = pickle.loads(serialized)

        try:
            languages = data.get_languages()
        except github.GithubError:
            return utility.get_zero_class_dict()

        entry = self._get_entry(languages, config['known_languages'])
        probability = self._tree.predict_proba([entry])

        result = utility.get_zero_class_dict()
        for label, p in zip(self._tree.classes_, probability[0]):
            result[label] = p
        return result
Esempio n. 4
0
    def classify(self, data):
        """Return the learned class distribution for the repository's main language.

        Falls back to a zero dict when the repository data cannot be
        fetched or the language was never seen during training.
        """
        try:
            language = data.get_repository_data()['language']
        except github.GithubError:
            return utility.get_zero_class_dict()

        # GitHub reports a missing language as None; the model stores it
        # under the '_None_' placeholder key.
        key = '_None_' if language is None else language

        if key not in self._model.config:
            return utility.get_zero_class_dict()
        # Copy so callers cannot mutate the learned model in place.
        return self._model.config[key].copy()
Esempio n. 5
0
    def learn(self, learn):
        """Learn a per-repository-name class distribution from labelled samples.

        *learn* is an iterable of (data, class) pairs; samples whose
        repository data cannot be fetched are skipped. The resulting model
        maps each repository name to a normalized class distribution.
        """
        self._model.clear()

        # Tally the class label of every sample under its repository name.
        for entry in learn:
            try:
                name = entry[0].get_repository_data()['name']
            except github.GithubError:
                continue

            counts = self._model.config.setdefault(
                name, utility.get_zero_class_dict())
            counts[entry[1]] += 1

        # Normalize each tally into a probability distribution.
        for name, counts in self._model.config.items():
            total = sum(counts.values())

            # Safeguard - should never be true
            if total == 0:
                logging.error(
                    'NameClassifier has zero count for {}'.format(name))
                continue

            for c in counts:
                counts[c] /= total

        self._model.save()
Esempio n. 6
0
    def learn(self, learn):
        """Learn a per-language class distribution from labelled samples.

        *learn* is an iterable of (data, class) pairs; samples whose
        repository data cannot be fetched are skipped. The resulting model
        maps each main language to a normalized class distribution.
        """
        self._model.clear()

        # Tally the class label of every sample under its main language.
        for entry in learn:
            try:
                language = entry[0].get_repository_data()['language']
            except github.GithubError:
                continue

            # GitHub reports a missing language as None; store it under the
            # '_None_' placeholder key.
            if language is None:
                language = '_None_'

            counts = self._model.config.setdefault(
                language, utility.get_zero_class_dict())
            counts[entry[1]] += 1

        # Normalize each tally into a probability distribution.
        for language, counts in self._model.config.items():
            total = sum(counts.values())

            # Safeguard - should never be true
            if total == 0:
                logging.error(
                    'LanguageClassifier has zero count for {}'.format(
                        language))
                continue

            for c in counts:
                counts[c] /= total

        self._model.save()
Esempio n. 7
0
    def learn(self, learn):
        """Learn class distributions keyed by lower-cased file extension.

        *learn* is an iterable of (data, class) pairs; samples whose file
        list cannot be fetched are skipped. Each file path contributes its
        extension (or its basename when there is no dot).
        """
        self._model.clear()

        for entry in learn:
            try:
                file_list = entry[0].get_all_files()
            except github.GithubError:
                continue

            for path in file_list:
                # Reduce the path to its lower-cased extension; for dotless
                # names this yields the whole basename.
                key = path.split('/')[-1].split('.')[-1].lower()
                counts = self._model.config.setdefault(
                    key, utility.get_zero_class_dict())
                counts[entry[1]] += 1

        # Normalize tallies; a total of 1 is already normalized, so entries
        # seen only once are left untouched.
        for counts in self._model.config.values():
            total = sum(counts.values())
            if total <= 1:
                continue
            for c in counts:
                counts[c] /= total

        self._model.save()
Esempio n. 8
0
    def test_get_zero_class_dict(self):
        """The zero class dict maps every known class to exactly 0.0."""
        expected = {c: 0.0 for c in utility.get_classes()}
        self.assertEqual(utility.get_zero_class_dict(), expected)
Esempio n. 9
0
    def classify(self, data):
        """Classify a repository from its metadata via the trained tree.

        Returns a class->probability dict; a zero dict when the model was
        never trained or the scikit-learn version differs from the one
        used for training.
        """
        config = self._model.config

        if 'version' not in config:
            logging.error(
                'Trying to use MetadataClassifier without learning first')
            return utility.get_zero_class_dict()

        if config['version'] != sklearn.__version__:
            logging.error(
                'Using MetadataClassifier with different scikit learn version (trained on: {}, used: {}) - relearn classifier first'
                .format(config['version'], sklearn.__version__))
            return utility.get_zero_class_dict()

        # Lazily unpickle the trained tree model on first use.
        if self._tree is None:
            serialized = base64.b64decode(config['tree'])
            self._tree = pickle.loads(serialized)

        probability = self._tree.predict_proba([self._get_input(data)])

        result = utility.get_zero_class_dict()
        for label, p in zip(self._tree.classes_, probability[0]):
            result[label] = p
        return result
Esempio n. 10
0
    def test_get_best_class(self):
        """get_best_class picks the key with the highest value as updates accumulate."""
        checks = [
            ('DEV', 0.1, 'DEV'),             # simple case
            ('HW', -0.5, 'DEV'),             # negative numbers do not win
            ('OTHER', 9999999999, 'OTHER'),  # huge numbers win
        ]

        class_dict = utility.get_zero_class_dict()
        for key, value, expected in checks:
            class_dict[key] = value
            self.assertEqual(utility.get_best_class(class_dict), expected)
Esempio n. 11
0
    def classify(self, data):
        """Classify a repository by the bag-of-words of its file-tree paths.

        Returns a class->probability dict; a zero dict when the model was
        never trained, the scikit-learn version differs from the one used
        for training, or the tree cannot be fetched.
        """
        config = self._model.config

        if 'version' not in config:
            logging.error(
                'Trying to use RepositoryStructureClassifier without learning first'
            )
            return utility.get_zero_class_dict()

        if config['version'] != sklearn.__version__:
            logging.error(
                'Using RepositoryStructureClassifier with different scikit learn version (trained on: {}, used: {}) - relearn classifier first'
                .format(config['version'], sklearn.__version__))
            return utility.get_zero_class_dict()

        try:
            tree = data.get_tree()
        except github.GithubError:
            return utility.get_zero_class_dict()

        # The repository's own name is replaced by a placeholder so learned
        # paths generalize across repositories.
        name = data.get_dev_repo()[1].lower()

        # Lazily unpickle the trained k-NN model on first use.
        if self._knn is None:
            serialized = base64.b64decode(config['knn'])
            self._knn = pickle.loads(serialized)

        bow = [False] * len(config['bow'])
        for node in tree['tree']:
            normalized = node['path'].lower().replace(name, '$REPO')
            position = self._find_position(normalized)
            if position != -1:
                bow[position] = True

        probability = self._knn.predict_proba([bow])

        result = utility.get_zero_class_dict()
        for label, p in zip(self._knn.classes_, probability[0]):
            result[label] = p
        return result
Esempio n. 12
0
def _batch_worker(queue_input, queue_output):
    """Worker loop: classify items from *queue_input* until it stays empty.

    For each item the per-classifier results are averaged into one summary
    distribution and (data, best_class, summary, per_classifier_results)
    is put on *queue_output*. The process exits once the input queue times
    out (1 second with no item).
    """
    classifiers = classifier.get_all_classifiers()
    total = len(classifiers)
    try:
        while True:
            data = queue_input.get(True, 1)
            sum_results = utility.get_zero_class_dict()
            classifier_results = {}
            for clf in classifiers:
                result = clf.classify(data)
                classifier_results[clf.name()] = result
                # Each classifier contributes an equal share of the average.
                for key in sum_results:
                    if key in result:
                        sum_results[key] += result[key] / total
            queue_output.put((data, utility.get_best_class(sum_results),
                              sum_results, classifier_results))
    except queue.Empty:
        sys.exit(0)
Esempio n. 13
0
    def test_classify(self):
        """Every classifier yields a complete, in-range class distribution."""
        for c in self.classifier:
            # First learn the models.
            # The training is covered in an other test case so it should be fine.
            c.learn([(x, 'DEV')
                     for x in self.github_list])  # correct class does not matter

            result = c.classify(self.github_list[0])

            # All classes must be represented.
            self.assertEqual(result.keys(),
                             utility.get_zero_class_dict().keys())

            # Every probability must lie within [0, 1].
            for result_class, value in result.items():
                self.assertTrue(
                    0.0 <= value <= 1.0,
                    'Class {} of classifier {} is out of range ({})'.format(
                        result_class, c.name(), value))
Esempio n. 14
0
    def classify(self, data):
        """Average the learned distributions of all file extensions in the repo.

        Returns a zero dict when the file list cannot be fetched or the
        repository contains no files.
        """
        result = utility.get_zero_class_dict()

        try:
            all_files = data.get_all_files()
        except github.GithubError:
            return result

        # Avoid dividing by zero for empty repositories.
        if not all_files:
            return result

        config = self._model.config
        for path in all_files:
            # Reduce the path to its lower-cased extension; for dotless
            # names this yields the whole basename.
            key = path.split('/')[-1].split('.')[-1].lower()
            known = config.get(key)
            if known is not None:
                for c in known:
                    result[c] += known[c]

        for key in result:
            result[key] /= len(all_files)

        return result
Esempio n. 15
0
def main():
    """Run k-fold cross-validation over the learning data and report
    per-class precision/recall to the log and, best-effort, to a file.
    """
    # Open Output file
    # NOTE(review): 'file' shadows the builtin and is never closed on the
    # early-return paths below - consider a try/finally or context manager.
    file = None
    try:
        file = open(configserver.get('output'), 'w')
    except OSError:
        # Best effort: validation still runs, results only go to the log.
        logging.error('Can not save results to {}'.format(
            configserver.get('output')))

    # Prepare data
    data = processor.dir_to_learning(configserver.get('learning_input'))
    if len(data) == 0:
        logging.error('No learning data - aborting')
        return

    k_fold = configserver.get('k-fold')

    # Cross-validation needs at least two partitions.
    if k_fold < 2:
        logging.error('k-cross must be at least 2 (is: {})'.format(k_fold))
        return

    logging.log(configserver.output_log_level(),
                'Starting validation ({}-cross validation)'.format(k_fold))
    logging.log(
        configserver.output_log_level(),
        'Depending on your system, the size of learning/validation data and the amount that needs to be downloaded this might take a while. Please wait.'
    )
    if file is not None:
        file.write(
            'Starting validation ({}-cross validation)\n'.format(k_fold))
        file.flush()

    # Randomly partition the samples into k datasets.
    # NOTE(review): random.randint gives no balance guarantee; a partition
    # can end up much smaller than the others, or even empty.
    datasets = [[] for i in range(k_fold)]

    for d in data:
        datasets[random.randint(0, k_fold - 1)] += [d]

    # Run k-fold cross-validation
    # Per-class sums across all runs; averaged after the loop.
    precision = utility.get_zero_class_dict()
    recall = utility.get_zero_class_dict()

    for run in range(k_fold):
        logging.log(configserver.output_log_level(),
                    'Starting validation run {}'.format(run + 1))
        if file is not None:
            file.write('Starting validation run {}\n'.format(run + 1))
            file.flush()

        learn = []
        truth = []

        # Create datasets for run: partition 'run' is held out as ground
        # truth, all other partitions form the training set.
        for i in range(k_fold):
            if i == run:
                truth = datasets[i]
            else:
                learn += datasets[i]

        # Remove labels (truth entries are (data, class) pairs)
        validate = [x[0] for x in truth]

        # Learn
        processor.learning(learn)

        # Calculate validation data set
        result = processor.batch(validate)

        # Cache results of this run (summed here, averaged after all runs)
        for c in utility.get_classes():
            precision_result = calculate_precision(truth, result, c)
            recall_result = calculate_recall(truth, result, c)

            if file is not None:
                file.write(
                    '{:6} - precision: {:6.4f}, recall: {:6.4f}\n'.format(
                        c, precision_result, recall_result))

            precision[c] += precision_result
            recall[c] += recall_result

        if file is not None:
            file.write('\n')
            file.flush()

    # Calculate average
    for c in utility.get_classes():
        precision[c] /= k_fold
        recall[c] /= k_fold

    # Print results
    logging.log(
        configserver.output_log_level(),
        'Average results from {}-fold cross-validation:'.format(k_fold))
    precision_avg = 0.0
    recall_avg = 0.0
    if file is not None:
        file.write(
            'Average results from {}-fold cross-validation:\n'.format(k_fold))
    for c in utility.get_classes():
        precision_avg += precision[c]
        recall_avg += recall[c]
        logging.log(
            configserver.output_log_level(),
            '{:6} - precision: {:6.4f}, recall: {:6.4f}'.format(
                c, precision[c], recall[c]))
        if file is not None:
            file.write('{:6} - precision: {:6.4f}, recall: {:6.4f}\n'.format(
                c, precision[c], recall[c]))

    # Overall averages across all classes.
    precision_avg /= len(utility.get_classes())
    recall_avg /= len(utility.get_classes())
    logging.log(
        configserver.output_log_level(),
        '{:6} - precision: {:6.4f}, recall: {:6.4f}'.format(
            'ALL', precision_avg, recall_avg))

    # Close file if open
    if file is not None:
        file.write('{:6} - precision: {:6.4f}, recall: {:6.4f}\n'.format(
            'ALL', precision_avg, recall_avg))
        file.write('\n')
        file.close()