def train(cls):
    """Train ``cls`` on the package classifications, creating them if needed.

    The classifications are cached on ``MachineLearning.PKGS_CLASSIFICATIONS``;
    they are only built (via ``MachineLearningData.create_data``) when that
    cache is still ``None``. ``cls.run_train`` is always invoked afterwards.

    :param cls: object exposing ``run_train(pkgs_classifications)``.
    """
    classifications = MachineLearning.PKGS_CLASSIFICATIONS

    if classifications is None:
        classifications = MachineLearningData().create_data(
            ['RU', 'U', 'NU'])
        MachineLearning.PKGS_CLASSIFICATIONS = classifications

    cls.run_train(classifications)
def train(cls):
    """Run ``cls.run_train`` using the shared package classifications.

    When ``MachineLearning.PKGS_CLASSIFICATIONS`` has not been populated yet
    (it is ``None``), it is first filled with the result of
    ``MachineLearningData.create_data`` for the labels 'RU', 'U' and 'NU'.

    :param cls: object exposing ``run_train(pkgs_classifications)``.
    """
    cached = MachineLearning.PKGS_CLASSIFICATIONS

    if cached is None:
        label_names = ['RU', 'U', 'NU']
        data_builder = MachineLearningData()
        cached = data_builder.create_data(label_names)
        MachineLearning.PKGS_CLASSIFICATIONS = cached

    cls.run_train(cached)
class PkgClassificationTests(unittest.TestCase):
    """Tests for the package feature helpers of ``MachineLearningData``.

    The debtags tests read the apt-xapian-index database installed on the
    machine; package descriptions are patched so the term-based tests do
    not depend on the installed description text.
    """

    _AXI_PATH = "/var/lib/apt-xapian-index/index"

    def setUp(self):
        """Build a fresh data helper and package cache for every test."""
        self.ml_data = MachineLearningData()
        self.cache = AptCache()

    def test_get_pkg_debtags(self):
        """Every expected debtag must be reported for the 'vim' package."""
        expected_debtags = [
            'devel::editor',
            'implemented-in::c',
            'interface::commandline',
            'interface::text-mode',
            'role::program',
            'scope::application',
            'uitoolkit::ncurses',
            'use::editing',
            'works-with::text',
            'works-with::unicode',
        ]

        axi = xapian.Database(self._AXI_PATH)
        found_debtags = self.ml_data.get_pkg_debtags(axi, 'vim')

        for debtag in expected_debtags:
            self.assertIn(debtag, found_debtags)

    @patch('apprecommender.ml.data.MachineLearningData.get_pkg_description')
    def test_get_pkg_terms(self, mock_description):
        """Terms are extracted from the (patched) package description."""
        mock_description.return_value = 'Vim is an text editor written in C'

        expected_terms = [u'vim', u'text', u'editor']
        found_terms = self.ml_data.get_pkg_terms(self.cache, 'vim')

        for term in expected_terms:
            self.assertIn(term, found_terms)

    def test_create_row_table_list(self):
        """Each label yields 1 when present in the package elements, else 0."""
        labels_name = ['devel::editor', 'implemented-in::c', 'complet',
                       'contain', 'syntax', 'unix', 'version']
        pkg_elements = ['implemented-in::c', 'complet']

        row_list = self.ml_data.create_row_table_list(labels_name,
                                                      pkg_elements)

        self.assertEqual([0, 1, 1, 0, 0, 0, 0], row_list)

    @patch('apprecommender.ml.data.MachineLearningData.get_pkg_description')
    def test_get_pkg_classification(self, mock_description):
        """The table row holds debtag flags, term flags, then the label."""
        mock_description.return_value = 'vim is an text editor written in c'

        axi = xapian.Database(self._AXI_PATH)
        pkgs = {'vim': 'EX'}
        debtags_name = ['devel::editor', 'implemented-in::c',
                        'devel::interpreter', 'devel::lang:python']
        terms_name = ['vim', 'editor', 'python']

        expected = {'vim': [1, 1, 0, 0, 1, 1, 0, 'EX']}
        actual = self.ml_data.get_pkgs_table_classification(
            axi, pkgs, self.cache, debtags_name, terms_name)

        self.assertEqual(expected, actual)
def __init__(self, content, profile_size, suggestion_size=200):
    """Set up the strategy state and open the package data sources.

    :param content: forwarded to ``ContentBased.__init__`` and stored.
    :param profile_size: forwarded to ``ContentBased.__init__`` and stored.
    :param suggestion_size: stored on the instance (defaults to 200).
    """
    ContentBased.__init__(self, content, profile_size)

    # Plain configuration values.
    self.content, self.description = content, 'Machine-learning'
    self.profile_size, self.suggestion_size = profile_size, suggestion_size

    # Data sources: APT cache, feature helper and the xapian index.
    self.cache = apt.Cache()
    self.ml_data = MachineLearningData()
    self.axi = xapian.Database(XAPIAN_DATABASE_PATH)
def train(cls):
    """Rebuild the package classifications and train ``cls`` on them.

    The freshly created data is always stored on
    ``MachineLearning.PKGS_CLASSIFICATIONS``. Training only happens when
    at least 10 entries were produced.

    :param cls: object exposing ``run_train(pkgs_classifications)``.
    :raises MachineLearningTrainError: when fewer than 10 entries exist.
    :raises IOError: propagated unchanged from the calls below.
    """
    classifications = MachineLearningData().create_data(['RU', 'U', 'NU'])
    MachineLearning.PKGS_CLASSIFICATIONS = classifications

    if len(classifications) < 10:
        raise MachineLearningTrainError()

    cls.run_train(classifications)
def ml_cross_validation(folder_path, ml_strategy_str):
    """Run a cross validation for one ML strategy and save its report.

    The report is written to a timestamped text file inside ``folder_path``
    (created, including missing parents, when it does not exist).

    :param folder_path: directory that receives the result file; a trailing
        path separator is optional.
    :param ml_strategy_str: strategy identifier passed to ``get_pkg_data``
        and ``get_strategy`` and embedded in the result file name.
    :returns: the cross-validation object after ``run`` has been called.
    """
    # NOTE: this changes the level of the *root* logger for the whole
    # process, so only CRITICAL records are emitted from here on.
    logger = logging.getLogger('')
    logger.setLevel(logging.CRITICAL)

    if not os.path.exists(folder_path):
        # makedirs (not mkdir) so nested output folders work as well.
        os.makedirs(folder_path)

    partition_size = 0.8
    rounds = 5
    metrics_list = [SimpleAccuracy(), Precision(), Recall(), FPR(),
                    F_score(1)]
    labels = ['RU', 'U', 'NU']

    ml_data = MachineLearningData()
    pkg_data = get_pkg_data(ml_strategy_str, ml_data, labels)
    # Named differently from the function to avoid shadowing it.
    cross_validation = get_strategy(ml_strategy_str, pkg_data,
                                    partition_size, rounds, metrics_list,
                                    labels)

    # The timestamp is taken before the run starts.
    result_file_name = 'cross_validation_result_{}_{}_{}_{}.txt'.format(
        ml_strategy_str, rounds, partition_size,
        dt.datetime.now().strftime('%Y%m%d%H%M'))

    cross_validation.run(None)

    # os.path.join keeps the file inside folder_path even when the caller
    # omits the trailing separator (plain concatenation did not).
    result_file_path = os.path.join(folder_path, result_file_name)
    with open(result_file_path, 'w') as result:
        result.write(str(cross_validation))

    return cross_validation
def __init__(self, content, profile_size, suggestion_size=200):
    """Initialise the base strategy, then attach the ML data sources.

    :param content: forwarded to ``ContentBased.__init__`` and stored.
    :param profile_size: forwarded to ``ContentBased.__init__`` and stored.
    :param suggestion_size: stored on the instance (defaults to 200).
    """
    ContentBased.__init__(self, content, profile_size)

    # Plain configuration values.
    self.content, self.description = content, 'Machine-learning'
    self.profile_size, self.suggestion_size = profile_size, suggestion_size

    # Data sources: package cache, feature helper and the xapian index.
    self.cache = AptCache()
    self.ml_data = MachineLearningData()
    self.axi = xapian.Database(XAPIAN_DATABASE_PATH)
def train_model(self, pkgs_list, axi, save_files=True):
    """Fit the vectorizer and a GaussianNB classifier on ``pkgs_list``.

    :param pkgs_list: packages handed to ``prepare_data`` and, when saving,
        to ``save_pkgs_features``.
    :param axi: index object forwarded to ``prepare_data``.
    :param save_files: when true, the used terms, the used debtags and the
        per-package feature rows are written out through the ``save_*``
        helpers.
    :returns: ``BagOfWords.CREATED_MODEL``.
    """
    descriptions, classifications = self.prepare_data(
        pkgs_list, axi, Cache(), MachineLearningData())

    feature_rows = self.vectorizer.fit_transform(descriptions).toarray()

    # Feature names are only available once the vectorizer has been fitted.
    feature_names = self.vectorizer.get_feature_names()
    terms, debtags = self.get_used_terms_and_debtags(feature_names)

    self.classifier = GaussianNB()
    self.classifier.fit(feature_rows, classifications)

    classification_path = BagOfWords.BAG_OF_WORDS_PKGS_CLASSIFICATION
    if save_files:
        self.save_features(terms, BagOfWords.BAG_OF_WORDS_TERMS)
        self.save_features(debtags, BagOfWords.BAG_OF_WORDS_DEBTAGS)
        self.save_pkgs_features(classification_path, pkgs_list,
                                feature_rows, classifications)

    return BagOfWords.CREATED_MODEL
class MachineLearning(ContentBased):
    """Abstract base for recommendation strategies backed by a classifier.

    ``run`` takes the content-based suggestions for a user, classifies each
    suggested package through the hooks implemented by subclasses and
    combines the classification with the content-based rank into a score.
    """

    __metaclass__ = ABCMeta

    # Result of the last MachineLearningData.create_data() call in train().
    PKGS_CLASSIFICATIONS = None

    def __init__(self, content, profile_size, suggestion_size=200):
        """Initialise the base strategy and open the package data sources.

        :param content: forwarded to ``ContentBased.__init__`` and stored.
        :param profile_size: forwarded to ``ContentBased.__init__`` and
            stored.
        :param suggestion_size: number of content-based suggestions that
            ``get_pkgs_and_scores`` asks for.
        """
        ContentBased.__init__(self, content, profile_size)
        self.content = content
        self.description = 'Machine-learning'
        self.profile_size = profile_size
        self.suggestion_size = suggestion_size
        self.cache = AptCache()
        self.ml_data = MachineLearningData()
        self.axi = xapian.Database(XAPIAN_DATABASE_PATH)

    def display_recommended_terms(self, terms_name, debtags_name, item_score,
                                  rec_size):
        """Print, per recommended package, its matching debtags and terms.

        The ``rec_size`` best scored packages are selected and printed from
        the lowest to the highest score among them.

        :param terms_name: terms a package term must belong to to be shown.
        :param debtags_name: debtags a package debtag must belong to.
        :param item_score: mapping of package name to score.
        :param rec_size: number of packages to display.
        """
        ranked = sorted(item_score.items(), key=operator.itemgetter(1))
        ranked.reverse()
        selected = [name for name, _ in ranked[:rec_size]]
        selected.reverse()

        for pkg in selected:
            pkg_terms = self.ml_data.get_pkg_terms(self.cache, pkg)
            pkg_debtags = self.ml_data.get_pkg_debtags(self.axi, pkg)

            terms_match = [term for term in pkg_terms if term in terms_name]
            debtags_match = [debtag for debtag in pkg_debtags
                             if debtag in debtags_name]

            # Single-argument parenthesised prints produce the same output
            # as the Python 2 print statement used by this module.
            print("\n\n=")
            print("{0}".format(pkg))
            print("debtags:")
            print(debtags_match)
            print("-")
            print("terms:")
            print(terms_match)
            print("=")

    def get_item_score(self, pkgs_score, pkgs_classifications):
        """Combine each package's classification with its content score.

        The classification contributes a base of 0 ('RU'), 1000 ('U') or
        2000 ('NU'), to which the package's entry in ``pkgs_score`` is
        added.

        :param pkgs_score: mapping of package name to numeric score.
        :param pkgs_classifications: mapping of package name to label.
        :returns: dict mapping package name to the combined score.
        :raises ValueError: if a label is not 'RU', 'U' or 'NU'.
        """
        order = ['RU', 'U', 'NU']
        order_values = [0, 1000, 2000]

        item_score = {}
        for pkg, classification in pkgs_classifications.iteritems():
            base = order_values[order.index(classification)]
            item_score[pkg] = base + pkgs_score[pkg]

        return item_score

    def get_pkgs_and_scores(self, rec, user):
        """Collect the content-based suggestions and a score for each.

        The textual form of the content-based result is parsed line by
        line; lines that do not look like ``<number>: <package>`` are
        skipped.

        :param rec: recommender providing ``items_repository`` and
            ``valid_tags``.
        :param user: user providing ``content_profile``.
        :returns: ``(pkgs, pkgs_score)`` where ``pkgs`` is the list of
            package names in suggestion order and ``pkgs_score`` maps each
            name to ``suggestion_size`` minus its parsed number.
        """
        profile = user.content_profile(rec.items_repository, self.content,
                                       self.suggestion_size, rec.valid_tags)
        content_based = self.get_sugestion_from_profile(
            rec, user, profile, self.suggestion_size, because=False)

        pkgs, pkgs_score = [], {}

        # The first line of the textual result is not a package entry.
        for pkg_line in str(content_based).splitlines()[1:]:
            match = re.search(r'\d+:\s([\w-]+)', pkg_line)
            # re.search returns None when nothing matches; the previous
            # check called .groups() on it and raised AttributeError.
            if match is None:
                continue

            pkg = match.group(1)
            pkg_score = int(pkg_line.split(':')[0].strip())

            pkgs.append(pkg)
            pkgs_score[pkg] = self.suggestion_size - pkg_score

        return pkgs, pkgs_score

    def get_pkgs_classifications(self, pkgs, terms_name, debtags_name):
        """Classify every package of ``pkgs`` that exists in the cache.

        :param pkgs: iterable of package names.
        :param terms_name: forwarded to ``prepare_pkg_data``.
        :param debtags_name: forwarded to ``prepare_pkg_data``.
        :returns: dict mapping package name to its classification; packages
            missing from ``self.cache`` are left out.
        """
        ml_strategy = self.get_ml_strategy()
        kwargs = {
            'terms_name': terms_name,
            'debtags_name': debtags_name,
            'ml_strategy': ml_strategy,
        }

        pkgs_classifications = {}
        for pkg in pkgs:
            if pkg not in self.cache:
                continue

            attribute_vector = self.prepare_pkg_data(pkg, **kwargs)
            pkgs_classifications[pkg] = self.get_pkg_classification(
                ml_strategy, attribute_vector)

        return pkgs_classifications

    def load_terms_and_debtags(self):
        """Load the pickled terms and debtags used by the strategy.

        :returns: ``(terms_name, debtags_name)`` as unpickled from the
            files given by ``get_terms_path`` and ``get_debtags_path``.
        """
        terms_path = self.get_terms_path()
        debtags_path = self.get_debtags_path()

        # NOTE(review): pickle.load can execute arbitrary code; these files
        # must only ever come from a trusted location.
        with open(terms_path, 'rb') as terms:
            terms_name = pickle.load(terms)
        with open(debtags_path, 'rb') as debtags:
            debtags_name = pickle.load(debtags)

        return terms_name, debtags_name

    @staticmethod
    def train(cls):
        """Rebuild the package classifications and train ``cls`` on them.

        This is a static method that takes the class explicitly, so callers
        must pass the strategy class as the argument.

        :param cls: object exposing ``run_train(pkgs_classifications)``.
        :raises MachineLearningTrainError: when fewer than 10 entries were
            created.
        :raises IOError: propagated unchanged from the calls below.
        """
        ml_data = MachineLearningData()
        labels = ['RU', 'U', 'NU']

        MachineLearning.PKGS_CLASSIFICATIONS = ml_data.create_data(labels)

        if len(MachineLearning.PKGS_CLASSIFICATIONS) < 10:
            raise MachineLearningTrainError()

        cls.run_train(MachineLearning.PKGS_CLASSIFICATIONS)

    @abstractmethod
    def get_debtags_path(self):
        """Return the path of the pickled debtags file."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_ml_strategy(self):
        """Return the strategy object used to classify packages."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_pkg_classification(self, ml_strategy, attribute_vector):
        """Return the classification of one prepared package."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_terms_path(self):
        """Return the path of the pickled terms file."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def prepare_pkg_data(self, pkg, **kwargs):
        """Return the attribute vector for ``pkg``.

        ``kwargs`` carries ``terms_name``, ``debtags_name`` and
        ``ml_strategy``.
        """
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def run_train(cls, pkgs_classifications):
        """Train on ``pkgs_classifications``; invoked by ``train``."""
        raise NotImplementedError("Method not implemented.")

    def run(self, rec, user, rec_size):
        """Produce the recommendation for ``user``.

        :param rec: recommender passed on to ``get_pkgs_and_scores``.
        :param user: user to recommend packages to.
        :param rec_size: limit passed to the recommendation result.
        :returns: a ``recommender.RecommendationResult``; it carries the
            user's package profile only when ``Config().because`` is set.
        """
        user_profile = None

        terms_name, debtags_name = self.load_terms_and_debtags()
        pkgs, pkgs_score = self.get_pkgs_and_scores(rec, user)
        pkgs_classifications = self.get_pkgs_classifications(
            pkgs, terms_name, debtags_name)
        item_score = self.get_item_score(pkgs_score, pkgs_classifications)

        if Config().because:
            user_profile = user.pkg_profile

        return recommender.RecommendationResult(
            item_score, limit=rec_size, user_profile=user_profile)
class PkgClassificationTests(unittest.TestCase):
    """Tests for the package feature helpers of ``MachineLearningData``.

    These tests read the APT cache and the apt-xapian-index database of
    the machine they run on.
    """

    _AXI_PATH = "/var/lib/apt-xapian-index/index"

    def setUp(self):
        """Build a fresh data helper and APT cache for every test."""
        self.ml_data = MachineLearningData()
        self.cache = apt.Cache()

    def test_get_pkg_debtags(self):
        """Every expected debtag must be reported for the 'vim' package."""
        expected_debtags = [
            'devel::editor',
            'implemented-in::c',
            'interface::commandline',
            'interface::text-mode',
            'role::program',
            'scope::application',
            'uitoolkit::ncurses',
            'use::editing',
            'works-with::text',
            'works-with::unicode',
        ]

        axi = xapian.Database(self._AXI_PATH)
        found_debtags = self.ml_data.get_pkg_debtags(axi, 'vim')

        for debtag in expected_debtags:
            self.assertIn(debtag, found_debtags)

    def test_get_pkg_terms(self):
        """Every expected term must be extracted for the 'vim' package."""
        expected_terms = [
            u'almost', u'compat', u'version', u'editor', u'new', u'featur',
            u'ad', u'multi', u'level', u'undo', u'syntax', u'highlight',
            u'command', u'line', u'histori', u'help', u'filenam',
            u'complet', u'block', u'oper', u'fold', u'support', u'etc',
            u'packag', u'contain', u'version', u'vim', u'compil', u'rather',
            u'standard', u'set', u'featur', u'packag', u'provid',
            u'version', u'packag', u'need', u'less',
        ]

        found_terms = self.ml_data.get_pkg_terms(self.cache, 'vim')

        for term in expected_terms:
            self.assertIn(term, found_terms)

    def test_create_row_table_list(self):
        """Each label yields 1 when present in the package elements, else 0."""
        labels_name = [
            'devel::editor', 'implemented-in::c', 'complet', 'contain',
            'syntax', 'unix', 'version',
        ]
        pkg_elements = ['implemented-in::c', 'complet']

        row_list = self.ml_data.create_row_table_list(labels_name,
                                                      pkg_elements)

        self.assertEqual([0, 1, 1, 0, 0, 0, 0], row_list)

    def test_get_pkg_classification(self):
        """The table row holds debtag flags, term flags, then the label."""
        axi = xapian.Database(self._AXI_PATH)
        pkgs = {'vim': 'EX'}
        debtags_name = [
            'devel::editor', 'implemented-in::c', 'devel::interpreter',
            'devel::lang:python',
        ]
        terms_name = ['contain', 'syntax', 'python']

        expected = {'vim': [1, 1, 0, 0, 1, 1, 0, 'EX']}
        actual = self.ml_data.get_pkgs_table_classification(
            axi, pkgs, self.cache, debtags_name, terms_name)

        self.assertEqual(expected, actual)
def setUp(self):
    """Create the data helper and the APT cache used by each test."""
    self.ml_data, self.cache = MachineLearningData(), apt.Cache()
def setUp(self):
    """Create the data helper and the package cache used by each test."""
    self.ml_data, self.cache = MachineLearningData(), AptCache()
class PkgClassificationTests(unittest.TestCase):
    """Tests for the package feature helpers of ``MachineLearningData``.

    These tests read the APT cache and the apt-xapian-index database of
    the machine they run on.
    """

    _AXI_PATH = "/var/lib/apt-xapian-index/index"

    def setUp(self):
        """Build a fresh data helper and APT cache for every test."""
        self.ml_data = MachineLearningData()
        self.cache = apt.Cache()

    def test_get_pkg_debtags(self):
        """Every expected debtag must be reported for the 'vim' package."""
        vim_debtags = ['devel::editor', 'implemented-in::c',
                       'interface::commandline', 'interface::text-mode',
                       'role::program', 'scope::application',
                       'uitoolkit::ncurses', 'use::editing',
                       'works-with::text', 'works-with::unicode']

        axi = xapian.Database(self._AXI_PATH)
        vim_debtags_result = self.ml_data.get_pkg_debtags(axi, 'vim')

        # assertIn reports the missing debtag instead of "False is not true".
        for debtag in vim_debtags:
            self.assertIn(debtag, vim_debtags_result)

    def test_get_pkg_terms(self):
        """Every expected term must be extracted for the 'vim' package."""
        vim_terms = [u'vim', u'compat', u'version', u'unix', u'editor',
                     u'vi', u'new', u'featur', u'ad', u'multi', u'level',
                     u'undo', u'syntax', u'highlight', u'command', u'line',
                     u'histori', u'line', u'help', u'filenam', u'complet',
                     u'block', u'oper', u'fold', u'unicod', u'support',
                     u'packag', u'contain', u'version', u'vim', u'compil',
                     u'standard', u'set', u'featur', u'packag', u'doe',
                     u'provid', u'gui', u'version', u'vim', u'vim',
                     u'packag', u'need']

        vim_terms_result = self.ml_data.get_pkg_terms(self.cache, 'vim')

        # The leftover debug print of the result was removed; assertIn
        # already names the missing term when the check fails.
        for term in vim_terms:
            self.assertIn(term, vim_terms_result)

    def test_create_row_table_list(self):
        """Each label yields 1 when present in the package elements, else 0."""
        labels_name = ['devel::editor', 'implemented-in::c', 'complet',
                       'contain', 'syntax', 'unix', 'version']
        pkg_elements = ['implemented-in::c', 'complet']
        row_list_to_assert = [0, 1, 1, 0, 0, 0, 0]

        row_list = self.ml_data.create_row_table_list(labels_name,
                                                      pkg_elements)

        self.assertEqual(row_list_to_assert, row_list)

    def test_get_pkg_classification(self):
        """The table row holds debtag flags, term flags, then the label."""
        axi = xapian.Database(self._AXI_PATH)
        pkgs = {'vim': 'EX'}
        debtags_name = ['devel::editor', 'implemented-in::c',
                        'devel::interpreter', 'devel::lang:python']
        terms_name = ['contain', 'syntax', 'python']
        assert_pkgs_classification = {'vim': [1, 1, 0, 0, 1, 1, 0, 'EX']}

        pkgs_classification = self.ml_data.get_pkgs_table_classification(
            axi, pkgs, self.cache, debtags_name, terms_name)

        self.assertEqual(assert_pkgs_classification, pkgs_classification)
class MachineLearning(ContentBased):
    """Abstract base for recommendation strategies backed by a classifier.

    ``run`` takes the content-based suggestions for a user, classifies each
    suggested package through the hooks implemented by subclasses and
    combines the classification with the content-based rank into a score.
    """

    __metaclass__ = ABCMeta

    # Filled by train() with the output of MachineLearningData.create_data().
    PKGS_CLASSIFICATIONS = None

    def __init__(self, content, profile_size, suggestion_size=200):
        """Initialise the base strategy and open the package data sources.

        :param content: forwarded to ``ContentBased.__init__`` and stored.
        :param profile_size: forwarded to ``ContentBased.__init__`` and
            stored.
        :param suggestion_size: number of content-based suggestions that
            ``get_pkgs_and_scores`` asks for.
        """
        ContentBased.__init__(self, content, profile_size)

        self.content, self.description = content, 'Machine-learning'
        self.profile_size, self.suggestion_size = (profile_size,
                                                   suggestion_size)

        self.cache = apt.Cache()
        self.ml_data = MachineLearningData()
        self.axi = xapian.Database(XAPIAN_DATABASE_PATH)

    def display_recommended_terms(self, terms_name, debtags_name, item_score,
                                  rec_size):
        """Print, per recommended package, its matching debtags and terms.

        The ``rec_size`` best scored packages are selected and printed from
        the lowest to the highest score among them.

        :param terms_name: terms a package term must belong to to be shown.
        :param debtags_name: debtags a package debtag must belong to.
        :param item_score: mapping of package name to score.
        :param rec_size: number of packages to display.
        """
        ranked = sorted(item_score.items(), key=operator.itemgetter(1))
        ranked.reverse()
        selected = [name for name, _ in ranked[:rec_size]]
        selected.reverse()

        for pkg in selected:
            pkg_terms = self.ml_data.get_pkg_terms(self.cache, pkg)
            pkg_debtags = self.ml_data.get_pkg_debtags(self.axi, pkg)

            terms_match = [term for term in pkg_terms if term in terms_name]
            debtags_match = [debtag for debtag in pkg_debtags
                             if debtag in debtags_name]

            # Single-argument parenthesised prints produce the same output
            # as the Python 2 print statement used by this module.
            print("\n\n=")
            print("{0}".format(pkg))
            print("debtags:")
            print(debtags_match)
            print("-")
            print("terms:")
            print(terms_match)
            print("=")

    def get_item_score(self, pkgs_score, pkgs_classifications):
        """Combine each package's classification with its content score.

        The classification contributes a base of 0 ('RU'), 1000 ('U') or
        2000 ('NU'), to which the package's entry in ``pkgs_score`` is
        added.

        :param pkgs_score: mapping of package name to numeric score.
        :param pkgs_classifications: mapping of package name to label.
        :returns: dict mapping package name to the combined score.
        """
        order = ['RU', 'U', 'NU']
        order_values = [0, 1000, 2000]

        item_score = {}
        for pkg, classification in pkgs_classifications.iteritems():
            base = order_values[order.index(classification)]
            item_score[pkg] = base + pkgs_score[pkg]

        return item_score

    def get_pkgs_and_scores(self, rec, user):
        """Collect the content-based suggestions and a score for each.

        Every line after the first of the textual content-based result is
        split on ':'; the first field is parsed as a number and the second
        field, minus its first character, is taken as the package name.

        :param rec: recommender providing ``items_repository`` and
            ``valid_tags``.
        :param user: user providing ``content_profile``.
        :returns: ``(pkgs, pkgs_score)`` where ``pkgs_score`` maps each
            package name to ``suggestion_size`` minus its parsed number.
        """
        profile = user.content_profile(rec.items_repository, self.content,
                                       self.suggestion_size, rec.valid_tags)
        content_based = self.get_sugestion_from_profile(
            rec, user, profile, self.suggestion_size)

        pkgs, pkgs_score = [], {}
        suggestion_lines = str(content_based).splitlines()[1:]

        for pkg_line in suggestion_lines:
            fields = pkg_line.split(':')
            pkg = fields[1][1:]
            pkg_score = int(fields[0].strip())

            pkgs.append(pkg)
            pkgs_score[pkg] = self.suggestion_size - pkg_score

        return pkgs, pkgs_score

    def get_pkgs_classifications(self, pkgs, terms_name, debtags_name):
        """Classify every package of ``pkgs`` that exists in the cache.

        :param pkgs: iterable of package names.
        :param terms_name: forwarded to ``prepare_pkg_data``.
        :param debtags_name: forwarded to ``prepare_pkg_data``.
        :returns: dict mapping package name to its classification; packages
            missing from ``self.cache`` are left out.
        """
        ml_strategy = self.get_ml_strategy()
        kwargs = {
            'terms_name': terms_name,
            'debtags_name': debtags_name,
            'ml_strategy': ml_strategy,
        }

        pkgs_classifications = {}
        for pkg in pkgs:
            if pkg in self.cache:
                attribute_vector = self.prepare_pkg_data(pkg, **kwargs)
                pkgs_classifications[pkg] = self.get_pkg_classification(
                    ml_strategy, attribute_vector)

        return pkgs_classifications

    def load_terms_and_debtags(self):
        """Load the pickled terms and debtags used by the strategy.

        :returns: ``(terms_name, debtags_name)`` as unpickled from the
            files given by ``get_terms_path`` and ``get_debtags_path``.
        """
        terms_path = self.get_terms_path()
        debtags_path = self.get_debtags_path()

        with open(terms_path, 'rb') as terms_file:
            terms_name = pickle.load(terms_file)

        with open(debtags_path, 'rb') as debtags_file:
            debtags_name = pickle.load(debtags_file)

        return terms_name, debtags_name

    @staticmethod
    def train(cls):
        """Train ``cls`` on the classifications, creating them if needed.

        This is a static method that takes the class explicitly, so callers
        must pass the strategy class as the argument. The data is only
        built when ``PKGS_CLASSIFICATIONS`` is still ``None``.

        :param cls: object exposing ``run_train(pkgs_classifications)``.
        """
        classifications = MachineLearning.PKGS_CLASSIFICATIONS

        if classifications is None:
            classifications = MachineLearningData().create_data(
                ['RU', 'U', 'NU'])
            MachineLearning.PKGS_CLASSIFICATIONS = classifications

        cls.run_train(classifications)

    @abstractmethod
    def get_debtags_path(self):
        """Return the path of the pickled debtags file."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_ml_strategy(self):
        """Return the strategy object used to classify packages."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_pkg_classification(self, ml_strategy, attribute_vector):
        """Return the classification of one prepared package."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def get_terms_path(self):
        """Return the path of the pickled terms file."""
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def prepare_pkg_data(self, pkg, **kwargs):
        """Return the attribute vector for ``pkg``.

        ``kwargs`` carries ``terms_name``, ``debtags_name`` and
        ``ml_strategy``.
        """
        raise NotImplementedError("Method not implemented.")

    @abstractmethod
    def run_train(cls, pkgs_classifications):
        """Train on ``pkgs_classifications``; invoked by ``train``."""
        raise NotImplementedError("Method not implemented.")

    def run(self, rec, user, rec_size):
        """Produce the recommendation for ``user``.

        :param rec: recommender passed on to ``get_pkgs_and_scores``.
        :param user: user to recommend packages to.
        :param rec_size: limit passed to the recommendation result.
        :returns: a ``recommender.RecommendationResult`` built from the
            combined item scores.
        """
        terms_name, debtags_name = self.load_terms_and_debtags()
        pkgs, pkgs_score = self.get_pkgs_and_scores(rec, user)
        pkgs_classifications = self.get_pkgs_classifications(
            pkgs, terms_name, debtags_name)
        item_score = self.get_item_score(pkgs_score, pkgs_classifications)

        return recommender.RecommendationResult(item_score, limit=rec_size)