def classify(self, data):
    """Classify *data* from the words of its commit messages.

    Returns a class -> probability dict. Falls back to the all-zero
    dict when no model has been learned, the model was trained with a
    different scikit-learn version, or GitHub data cannot be fetched.
    """
    config = self._model.config
    if 'version' not in config:
        logging.error(
            'Trying to use CommitMessageClassifier without learning first')
        return utility.get_zero_class_dict()
    if config['version'] != sklearn.__version__:
        logging.error(
            'Using CommitMessageClassifier with different scikit learn version (trained on: {}, used: {}) - relearn classifier first'
            .format(config['version'], sklearn.__version__))
        return utility.get_zero_class_dict()
    try:
        commits = data.get_commits()
    except github.GithubError:
        return utility.get_zero_class_dict()
    # Lazily unpickle the serialized k-NN model on first use.
    if self._knn is None:
        self._knn = pickle.loads(base64.b64decode(config['knn']))
    # Boolean bag-of-words vector over the learned vocabulary.
    bow = [False] * len(config['bow'])
    for commit in commits:
        for word in commit['commit']['message'].split():
            pos = self._find_position(word.lower())
            if pos != -1:
                bow[pos] = True
    probability = self._knn.predict_proba([bow])
    result = utility.get_zero_class_dict()
    for index, label in enumerate(self._knn.classes_):
        result[label] = probability[0][index]
    return result
def classify(self, data):
    """Classify *data* by nearest-neighbour edit distance on the repository name.

    Averages the learned class distributions of all names at the minimal
    edit distance (ties included). Returns the all-zero dict when no
    model has been learned or GitHub data cannot be fetched.
    """
    # BUG FIX: the original used `len(...) is 0`. Identity comparison with
    # an int literal is implementation-defined (and a SyntaxWarning since
    # Python 3.8); use an emptiness check instead.
    if not self._model.config:
        logging.error(
            'Trying to use NameClassifier without learning first')
        return utility.get_zero_class_dict()
    try:
        repo_name = data.get_repository_data()['name']
    except github.GithubError:
        return utility.get_zero_class_dict()
    # Edit distance from the new name to every learned name, ascending.
    distances = [(target, utility.edit_distance(repo_name, target))
                 for target in self._model.config]
    distances.sort(key=lambda x: x[1])
    result = utility.get_zero_class_dict()
    # Accumulate the distribution of the closest name, then of every
    # further name tied at the same (minimal) distance.
    nn = 1
    for c in utility.get_classes():
        result[c] += self._model.config[distances[0][0]][c]
    while nn < len(distances) and distances[nn - 1][1] == distances[nn][1]:
        for c in utility.get_classes():
            result[c] += self._model.config[distances[nn][0]][c]
        nn += 1
    # Average over the number of tied nearest neighbours.
    for c in utility.get_classes():
        result[c] /= nn
    return result
def classify(self, data):
    """Classify *data* from the byte share of each programming language.

    Returns a class -> probability dict. Falls back to the all-zero
    dict when no model has been learned, the model was trained with a
    different scikit-learn version, or GitHub data cannot be fetched.
    """
    config = self._model.config
    if 'version' not in config:
        logging.error(
            'Trying to use LanguageDetailsClassifier without learning first'
        )
        return utility.get_zero_class_dict()
    if config['version'] != sklearn.__version__:
        logging.error(
            'Using LanguageDetailsClassifier with different scikit learn version (trained on: {}, used: {}) - relearn classifier first'
            .format(config['version'], sklearn.__version__))
        return utility.get_zero_class_dict()
    # Lazily unpickle the serialized decision tree on first use.
    if self._tree is None:
        self._tree = pickle.loads(base64.b64decode(config['tree']))
    try:
        languages = data.get_languages()
    except github.GithubError:
        return utility.get_zero_class_dict()
    entry = self._get_entry(languages, config['known_languages'])
    probability = self._tree.predict_proba([entry])
    result = utility.get_zero_class_dict()
    for index, label in enumerate(self._tree.classes_):
        result[label] = probability[0][index]
    return result
def classify(self, data):
    """Classify *data* by its main repository language.

    Returns a copy of the learned class distribution for the language,
    or the all-zero dict for unknown languages / GitHub errors.
    """
    try:
        language = data.get_repository_data()['language']
    except github.GithubError:
        return utility.get_zero_class_dict()
    # GitHub reports None when no language could be detected.
    key = '_None_' if language is None else language
    if key not in self._model.config:
        return utility.get_zero_class_dict()
    # Copy so callers cannot mutate the learned model.
    return self._model.config[key].copy()
def learn(self, learn):
    """Learn a class distribution per repository name.

    *learn* is an iterable of (github_data, class_label) pairs; entries
    whose repository data cannot be fetched are skipped. The resulting
    model is normalised and persisted.
    """
    self._model.clear()
    # Tally class labels per repository name.
    for entry in learn:
        try:
            repo_name = entry[0].get_repository_data()['name']
        except github.GithubError:
            continue
        if repo_name not in self._model.config:
            self._model.config[repo_name] = utility.get_zero_class_dict()
        self._model.config[repo_name][entry[1]] += 1
    # Normalise every tally into a probability distribution.
    for repo_name in self._model.config:
        counts = self._model.config[repo_name]
        total = sum(counts.values())
        # Safeguard - should never be true
        if total == 0:
            logging.error(
                'NameClassifier has zero count for {}'.format(repo_name))
            continue
        for c in counts:
            counts[c] /= total
    self._model.save()
def learn(self, learn):
    """Learn a class distribution per main repository language.

    *learn* is an iterable of (github_data, class_label) pairs; entries
    whose repository data cannot be fetched are skipped. The resulting
    model is normalised and persisted.
    """
    self._model.clear()
    # Tally class labels per language.
    for entry in learn:
        try:
            language = entry[0].get_repository_data()['language']
        except github.GithubError:
            continue
        # GitHub reports None when no language could be detected.
        if language is None:
            language = '_None_'
        if language not in self._model.config:
            self._model.config[language] = utility.get_zero_class_dict()
        self._model.config[language][entry[1]] += 1
    # Normalise every tally into a probability distribution.
    for language in self._model.config:
        counts = self._model.config[language]
        total = sum(counts.values())
        # Safeguard - should never be true
        if total == 0:
            logging.error(
                'LanguageClassifier has zero count for {}'.format(
                    language))
            continue
        for c in counts:
            counts[c] /= total
    self._model.save()
def learn(self, learn):
    """Learn class distributions keyed by lower-cased file extension.

    *learn* is an iterable of (github_data, class_label) pairs; entries
    whose file list cannot be fetched are skipped. The resulting model
    is normalised and persisted.
    """
    self._model.clear()
    for entry in learn:
        try:
            file_list = entry[0].get_all_files()
        except github.GithubError:
            continue
        for path in file_list:
            # Reduce the path to its lower-cased extension (the full
            # basename when there is no dot).
            ext = path.split('/')[-1].split('.')[-1].lower()
            if ext not in self._model.config:
                self._model.config[ext] = utility.get_zero_class_dict()
            self._model.config[ext][entry[1]] += 1
    # Normalise the tallies; a total of 1 needs no division and a total
    # of 0 must not be divided by.
    for ext in self._model.config:
        counts = self._model.config[ext]
        total = sum(counts.values())
        if total <= 1:
            continue
        for c in counts:
            counts[c] /= total
    self._model.save()
def test_get_zero_class_dict(self):
    """The zero dict must map every known class to exactly 0.0."""
    expected = {c: 0.0 for c in utility.get_classes()}
    self.assertEqual(utility.get_zero_class_dict(), expected)
def classify(self, data):
    """Classify *data* from its repository metadata.

    Returns a class -> probability dict. Falls back to the all-zero
    dict when no model has been learned or the model was trained with a
    different scikit-learn version.
    """
    config = self._model.config
    if 'version' not in config:
        logging.error(
            'Trying to use MetadataClassifier without learning first')
        return utility.get_zero_class_dict()
    if config['version'] != sklearn.__version__:
        logging.error(
            'Using MetadataClassifier with different scikit learn version (trained on: {}, used: {}) - relearn classifier first'
            .format(config['version'], sklearn.__version__))
        return utility.get_zero_class_dict()
    # Lazily unpickle the serialized decision tree on first use.
    if self._tree is None:
        self._tree = pickle.loads(base64.b64decode(config['tree']))
    probability = self._tree.predict_proba([self._get_input(data)])
    result = utility.get_zero_class_dict()
    for index, label in enumerate(self._tree.classes_):
        result[label] = probability[0][index]
    return result
def test_get_best_class(self):
    """get_best_class must return the key holding the highest value."""
    scores = utility.get_zero_class_dict()
    scores['DEV'] = 0.1
    # Simple case: one positive value.
    self.assertEqual(utility.get_best_class(scores), 'DEV')
    # A negative value must not win.
    scores['HW'] = -0.5
    self.assertEqual(utility.get_best_class(scores), 'DEV')
    # A huge value must win.
    scores['OTHER'] = 9999999999
    self.assertEqual(utility.get_best_class(scores), 'OTHER')
def classify(self, data):
    """Classify *data* from the structure of its file tree.

    Returns a class -> probability dict. Falls back to the all-zero
    dict when no model has been learned, the model was trained with a
    different scikit-learn version, or GitHub data cannot be fetched.
    """
    config = self._model.config
    if 'version' not in config:
        logging.error(
            'Trying to use RepositoryStructureClassifier without learning first'
        )
        return utility.get_zero_class_dict()
    if config['version'] != sklearn.__version__:
        logging.error(
            'Using RepositoryStructureClassifier with different scikit learn version (trained on: {}, used: {}) - relearn classifier first'
            .format(config['version'], sklearn.__version__))
        return utility.get_zero_class_dict()
    try:
        tree = data.get_tree()
    except github.GithubError:
        return utility.get_zero_class_dict()
    name = data.get_dev_repo()[1].lower()
    # Lazily unpickle the serialized k-NN model on first use.
    if self._knn is None:
        self._knn = pickle.loads(base64.b64decode(config['knn']))
    # Bag-of-words over repository paths; the repository's own name is
    # masked so structurally similar repos match regardless of naming.
    bow = [False] * len(config['bow'])
    for node in tree['tree']:
        pos = self._find_position(node['path'].lower().replace(
            name, '$REPO'))
        if pos != -1:
            bow[pos] = True
    probability = self._knn.predict_proba([bow])
    result = utility.get_zero_class_dict()
    for index, label in enumerate(self._knn.classes_):
        result[label] = probability[0][index]
    return result
def _batch_worker(queue_input, queue_output):
    """Consume items from *queue_input*, classify each with every
    available classifier and push the combined verdict to *queue_output*.

    Terminates the worker process once the input queue has been empty
    for one second.
    """
    classifiers = classifier.get_all_classifiers()
    share = len(classifiers)
    try:
        while True:
            data = queue_input.get(True, 1)
            combined = utility.get_zero_class_dict()
            per_classifier = {}
            for clf in classifiers:
                verdict = clf.classify(data)
                per_classifier[clf.name()] = verdict
                # Every classifier contributes an equal share of its vote.
                for key in combined:
                    if key in verdict:
                        combined[key] += verdict[key] / share
            queue_output.put((data, utility.get_best_class(combined),
                              combined, per_classifier))
    except queue.Empty:
        # No work left - exit this worker cleanly.
        sys.exit(0)
def test_classify(self):
    """Every classifier must return a complete class dict with all
    probabilities in [0, 1]."""
    # First learn the models; training itself is covered in another
    # test case, so this only needs some fitted state to classify with.
    for c in self.classifier:
        # The correct class does not matter here.
        c.learn([(x, 'DEV') for x in self.github_list])
        result = c.classify(self.github_list[0])
        # All classes represented
        self.assertEqual(result.keys(),
                         utility.get_zero_class_dict().keys())
        # Each probability lies within range.
        for result_class, value in result.items():
            self.assertTrue(
                0.0 <= value <= 1.0,
                'Class {} of classifier {} is out of range ({})'.format(
                    result_class, c.name(), value))
def classify(self, data):
    """Classify *data* from the extensions of all files it contains.

    Sums the learned distribution of every known extension and averages
    over the total number of files. Returns the all-zero dict on GitHub
    errors or for an empty repository.
    """
    result = utility.get_zero_class_dict()
    try:
        all_files = data.get_all_files()
    except github.GithubError:
        return result
    if not all_files:
        return result
    for path in all_files:
        # Lower-cased extension (the full basename when there is no dot).
        ext = path.split('/')[-1].split('.')[-1].lower()
        if ext in self._model.config:
            for c in self._model.config[ext]:
                result[c] += self._model.config[ext][c]
    # Average the accumulated scores over the number of files.
    for key in result:
        result[key] /= len(all_files)
    return result
def main():
    """Run k-fold cross-validation over the configured learning data.

    Per-class precision/recall for every run, plus the averages across
    all runs, are logged and - when the output file can be opened -
    additionally written to the configured output file.
    """
    # Open output file; validation continues without it on failure.
    file = None
    try:
        file = open(configserver.get('output'), 'w')
    except OSError:
        logging.error('Can not save results to {}'.format(
            configserver.get('output')))
    # Prepare data
    data = processor.dir_to_learning(configserver.get('learning_input'))
    if len(data) == 0:
        logging.error('No learning data - aborting')
        # BUG FIX: the original leaked the open output file on this
        # early return.
        if file is not None:
            file.close()
        return
    k_fold = configserver.get('k-fold')
    if k_fold < 2:
        logging.error('k-cross must be at least 2 (is: {})'.format(k_fold))
        # BUG FIX: the original leaked the open output file on this
        # early return.
        if file is not None:
            file.close()
        return
    logging.log(configserver.output_log_level(),
                'Starting validation ({}-cross validation)'.format(k_fold))
    logging.log(
        configserver.output_log_level(),
        'Depending on your system, the size of learning/validation data and the amount that needs to be downloaded this might take a while. Please wait.'
    )
    if file is not None:
        file.write(
            'Starting validation ({}-cross validation)\n'.format(k_fold))
        file.flush()
    # Randomly partition the data into k disjoint subsets.
    datasets = [[] for _ in range(k_fold)]
    for d in data:
        datasets[random.randint(0, k_fold - 1)] += [d]
    # Run k-fold cross-validation
    precision = utility.get_zero_class_dict()
    recall = utility.get_zero_class_dict()
    for run in range(k_fold):
        logging.log(configserver.output_log_level(),
                    'Starting validation run {}'.format(run + 1))
        if file is not None:
            file.write('Starting validation run {}\n'.format(run + 1))
            file.flush()
        learn = []
        truth = []
        # Create datasets for run: subset `run` is held out as ground
        # truth, the rest is used for learning.
        for i in range(k_fold):
            if i == run:
                truth = datasets[i]
            else:
                learn += datasets[i]
        # Remove labels
        validate = [x[0] for x in truth]
        # Learn
        processor.learning(learn)
        # Calculate validation data set
        result = processor.batch(validate)
        # Accumulate per-class results of this run into the totals.
        for c in utility.get_classes():
            precision_result = calculate_precision(truth, result, c)
            recall_result = calculate_recall(truth, result, c)
            if file is not None:
                file.write(
                    '{:6} - precision: {:6.4f}, recall: {:6.4f}\n'.format(
                        c, precision_result, recall_result))
            precision[c] += precision_result
            recall[c] += recall_result
        if file is not None:
            file.write('\n')
            file.flush()
    # Calculate average over all runs
    for c in utility.get_classes():
        precision[c] /= k_fold
        recall[c] /= k_fold
    # Print results
    logging.log(
        configserver.output_log_level(),
        'Average results from {}-fold cross-validation:'.format(k_fold))
    precision_avg = 0.0
    recall_avg = 0.0
    if file is not None:
        file.write(
            'Average results from {}-fold cross-validation:\n'.format(k_fold))
    for c in utility.get_classes():
        precision_avg += precision[c]
        recall_avg += recall[c]
        logging.log(
            configserver.output_log_level(),
            '{:6} - precision: {:6.4f}, recall: {:6.4f}'.format(
                c, precision[c], recall[c]))
        if file is not None:
            file.write('{:6} - precision: {:6.4f}, recall: {:6.4f}\n'.format(
                c, precision[c], recall[c]))
    precision_avg /= len(utility.get_classes())
    recall_avg /= len(utility.get_classes())
    logging.log(
        configserver.output_log_level(),
        '{:6} - precision: {:6.4f}, recall: {:6.4f}'.format(
            'ALL', precision_avg, recall_avg))
    # Close file if open
    if file is not None:
        file.write('{:6} - precision: {:6.4f}, recall: {:6.4f}\n'.format(
            'ALL', precision_avg, recall_avg))
        file.write('\n')
        file.close()