Example #1
0
 def setUpClass(self):
     """Point the configuration at the sample data and build a Recommender.

     The popcon index, popcon directory and clusters directory are set to
     paths under ``test_data`` and ``popcon`` is set to 0 before
     ``Recommender()`` is created.
     """
     config = Config()
     overrides = (
         ("popcon_index", "test_data/.sample_pxi"),
         ("popcon_dir", "test_data/popcon_dir"),
         ("clusters_dir", "test_data/clusters_dir"),
         ("popcon", 0),
     )
     for option, value in overrides:
         setattr(config, option, value)
     # NOTE(review): Recommender() is built without arguments, so this
     # presumably relies on Config being shared -- confirm.
     self.rec = Recommender()
Example #2
0
    def test_error_train_on_run_apprec(self):
        """apprec.run() must return ERROR_TRAIN for a bad training path.

        The strategy is switched to 'mlbva' and the training file path is
        pointed at "error.txt" for the duration of the run.
        """
        config = Config()
        strategy = config.strategy
        training_path = MachineLearningData.MACHINE_LEARNING_TRAINING

        config.strategy = 'mlbva'
        MachineLearningData.MACHINE_LEARNING_TRAINING = "error.txt"
        try:
            result = apprec.run()
        finally:
            # Restore the shared state even when run() raises, so a failure
            # here cannot leak the overrides into other tests.
            config.strategy = strategy
            MachineLearningData.MACHINE_LEARNING_TRAINING = training_path

        self.assertEqual(apprec.ERROR_TRAIN, result)
Example #3
0
    def test_error_train_on_run_apprec(self):
        """apprec.run() must return ERROR_TRAIN for a bad training path.

        The strategy is switched to 'mlbva' and the training file path is
        pointed at "error.txt" for the duration of the run.
        """
        config = Config()
        strategy = config.strategy
        training_path = MachineLearningData.MACHINE_LEARNING_TRAINING

        config.strategy = 'mlbva'
        MachineLearningData.MACHINE_LEARNING_TRAINING = "error.txt"
        try:
            result = apprec.run()
        finally:
            # Restore the shared state even when run() raises, so a failure
            # here cannot leak the overrides into other tests.
            config.strategy = strategy
            MachineLearningData.MACHINE_LEARNING_TRAINING = training_path

        self.assertEqual(apprec.ERROR_TRAIN, result)
 def reset(self, params, rep):
     """Prepare a content-based experiment run from ``params``.

     Nothing is done unless params['name'] starts with "content". Reads
     params['weight'], params['strategy'] and params['sample']; ``rep`` is
     not used in this body.
     """
     if params['name'].startswith("content"):
         cfg = Config()
         # if the index was not built yet
         # app_axi = AppAptXapianIndex(cfg.axi,"results/arnaldo/AppAxi")
         cfg.axi = "data/AppAxi"
         cfg.index_mode = "old"
         cfg.weight = params['weight']
         # The configuration is adjusted first and then handed to the
         # Recommender constructor.
         self.rec = Recommender(cfg)
         self.rec.set_strategy(params['strategy'])
         self.repo_size = self.rec.items_repository.get_doccount()
         self.user = LocalSystem()
         self.user.app_pkg_profile(self.rec.items_repository)
         # Profile length scaled by params['sample'], truncated to an int.
         self.sample_size = int(
             len(self.user.pkg_profile) * params['sample'])
Example #5
0
 def __init__(self):
     """
     Load the configuration, xapian indexes, filters and weighting scheme.
     """
     self.cfg = Config()
     config = self.cfg

     # Only the "desktopapps" indexes are opened; loading of the
     # "programs" indexes and filter is disabled.
     self.axi_desktopapps = xapian.Database(config.axi_desktopapps)
     if config.popcon:
         self.popcon_desktopapps = xapian.Database(
             config.popcon_desktopapps)

     self.valid_desktopapps = []
     self.valid_tags = []
     logging.info("Loading recommender filters")

     def read_filter(filename):
         # Filter files hold one package or tag name per line; lines that
         # start with "#" are skipped.
         filter_path = os.path.join(config.filters_dir, filename)
         with open(filter_path) as entries:
             return [entry.strip() for entry in entries
                     if not entry.startswith("#")]

     self.valid_desktopapps = read_filter("desktopapps")
     self.valid_tags = read_filter("debtags")

     # Xapian weighting scheme: BM25 when configured, traditional otherwise.
     if config.weight == "bm25":
         self.weight = xapian.BM25Weight(config.bm25_k1, config.bm25_k2,
                                         config.bm25_k3, config.bm25_b,
                                         config.bm25_nl)
     else:
         self.weight = xapian.TradWeight()
     self.set_strategy(config.strategy)
Example #6
0
 def load_summary(self):
     """Set ``self.summary`` from DDE, or from the apt cache as fallback.

     DDE is used when ``connect_to_dde()`` returns a true value; otherwise
     the summary of the package's apt candidate version is used.
     """
     config = Config()
     if not self.connect_to_dde(config.dde_server, config.dde_port):
         candidate = apt.Cache()[self.name].candidate
         self.summary = candidate.summary
         return

     response = urllib.urlopen(config.dde_url % self.name)
     self.summary = json.load(response)['r']['description']
Example #7
0
    def stopwords(self):
        """Return ``self._stopwords``, filling it from disk while empty.

        When the cached collection is empty, every line of the file at
        ``Config().stopwords`` is stripped and added to it.
        """
        if self._stopwords:
            return self._stopwords

        with open(Config().stopwords, 'r') as stopwords_file:
            for line in stopwords_file:
                self._stopwords.add(line.strip())
        return self._stopwords
Example #8
0
 def setUpClass(self):
     """Build a ten-package sample index and a four-package test user."""
     config = Config()
     self.axi = xapian.Database(config.axi)

     sample_packages = [
         "gimp", "aaphoto", "eog", "emacs", "dia", "ferret", "festival",
         "file", "inkscape", "xpdf"
     ]
     sample_path = "apprecommender/tests/test_data/.sample_axi"
     self.sample_axi = SampleAptXapianIndex(sample_packages, self.axi,
                                            sample_path)

     # Every package in the user's profile gets the value 1.
     user_packages = ["gimp", "aaphoto", "eog", "emacs"]
     self.user = User(dict.fromkeys(user_packages, 1))
Example #9
0
    def __init__(self):
        """Open the xapian index and create the stemmer and filters.

        Valid debtags are read from the "debtags" file in the configured
        filters directory, skipping lines that start with "#".
        """
        self.axi = xapian.Database(MachineLearningData.XAPIAN_DATABASE_PATH)
        self.stemmer = Stemmer.Stemmer('english')

        valid_tags = []
        debtags_path = path.join(Config().filters_dir, "debtags")
        with open(debtags_path) as tags_file:
            for line in tags_file:
                if not line.startswith("#"):
                    valid_tags.append(line.strip())

        self.filter_tag = FilterTag(valid_tags)
        self.filter_description = FilterDescription()
Example #10
0
    def get_user(self, no_auto_pkg_profile):
        """Create a LocalSystem user filtered by the "desktopapps" filter.

        After filtering, ``maximal_pkg_profile()`` is applied; when
        ``no_auto_pkg_profile`` is true, ``no_auto_pkg_profile()`` is
        applied as well. Returns the LocalSystem instance.
        """
        config = Config()
        local_user = LocalSystem()

        desktopapps_filter = os.path.join(config.filters_dir, "desktopapps")
        local_user.filter_pkg_profile(desktopapps_filter)
        local_user.maximal_pkg_profile()
        if no_auto_pkg_profile:
            local_user.no_auto_pkg_profile()
        return local_user
 def setUpClass(self):
     """Point the configuration at the sample data and build a Recommender.

     The popcon index, popcon directory and clusters directory are set to
     paths under ``test_data`` and ``popcon`` is set to 0 before
     ``Recommender()`` is created.
     """
     config = Config()
     overrides = (
         ("popcon_index", "test_data/.sample_pxi"),
         ("popcon_dir", "test_data/popcon_dir"),
         ("clusters_dir", "test_data/clusters_dir"),
         ("popcon", 0),
     )
     for option, value in overrides:
         setattr(config, option, value)
     # NOTE(review): Recommender() is built without arguments, so this
     # presumably relies on Config being shared -- confirm.
     self.rec = Recommender()
Example #12
0
def run_apprecommender(options):
    """Run a recommendation and map known failures to status codes.

    ``options`` is not used in this body. Returns SUCCESS once
    ``make_recommendation()`` completes, ERROR_INIT when
    ``xapian.DatabaseOpeningError`` is raised, and ERROR_TRAIN when an
    ``IOError`` is raised while the configured strategy contains "ml".
    """
    try:
        # Fixed request: 20 recommendations, with the flag that get passed
        # to make_recommendation() as no_auto_pkg_profile set to True.
        recommendation_size = 20
        no_auto_pkg_profile = True

        app_recommender = AppRecommender()
        app_recommender.make_recommendation(recommendation_size,
                                            no_auto_pkg_profile)
        return SUCCESS
    except xapian.DatabaseOpeningError:
        return ERROR_INIT
    except IOError:
        # NOTE(review): an IOError with a non-"ml" strategy has no explicit
        # return here -- confirm that this is intended.
        if "ml" in Config().strategy:
            return ERROR_TRAIN
    def load(self):
        """Parse ``sys.argv[1:]`` with getopt and store values on Config.

        The parsed ``(option, value)`` pairs are kept in ``self.options``.
        On a syntax error the problem is logged, ``self.usage()`` is called
        and the process exits; ``-h``/``--help`` calls ``self.usage()`` and
        exits as well. ``--init``, ``--train`` and ``--contribute`` are
        accepted but not acted upon here.
        """
        config = Config()
        # NOTE(review): in this string "b" is followed by ":", so getopt
        # makes -b consume an argument although --because is a plain flag.
        short_options = 'hdvo:d:v:s:z:idvo:tdvo:b:n:cdvo'
        # The recommendation count needs a value, so its long options are
        # declared with "=". Both spellings are accepted: the originally
        # declared one and the one the handler below already tested for.
        long_options = ['help', 'debug', 'verbose', 'strategy=',
                        'profile_size=', 'init', 'train', 'because',
                        'nrecommendation=', 'num-recommendations=',
                        'contribute']
        try:
            opts, args = getopt.getopt(sys.argv[1:], short_options,
                                       long_options)
            self.options = opts
        except getopt.GetoptError as error:
            config.set_logger()
            logging.error('Bad syntax: {}'.format(str(error)))
            self.usage()
            sys.exit()

        for o, p in opts:
            if o in ('-h', '--help'):
                self.usage()
                sys.exit()
            elif o in ('-d', '--debug'):
                config.debug = 1
            elif o in ('-v', '--verbose'):
                config.verbose = 1
            elif o in ('-s', '--strategy'):
                config.strategy = p
            elif o in ('-z', '--profile_size'):
                config.profile_size = int(p)
            elif o in ('-i', '--init'):
                continue
            elif o in ('-t', '--train'):
                continue
            elif o in ('-b', '--because'):
                config.because = True
            elif o in ('-n', '--nrecommendation', '--num-recommendations'):
                config.num_recommendations = int(p)
            elif o in ('-c', '--contribute'):
                continue
            else:
                assert False, "unhandled option"
Example #14
0
    def load(self):
        """Parse ``sys.argv[1:]`` with getopt and store values on Config.

        The parsed ``(option, value)`` pairs are kept in ``self.options``.
        On a syntax error the problem is logged, ``self.usage()`` is called
        and the process exits; ``-h``/``--help`` calls ``self.usage()`` and
        exits as well. ``--init`` and ``--train`` are accepted but not
        acted upon here.
        """
        config = Config()
        short_options = "hdvo:f:b:a:e:p:m:u:l:c:x:w:s:z:r:n:idvo:tdvo"
        long_options = [
            "help", "debug", "verbose", "output=", "filtersdir=",
            "pkgsfilter=", "axi=", "dde=", "popconindex=", "popcondir=",
            "indexmode=", "clustersdir=", "kmedoids=", "maxpopcon=", "weight=",
            "strategy=", "profile_size=", "profiling=", "neighbors=", "init",
            "train"
        ]
        try:
            opts, args = getopt.getopt(sys.argv[1:], short_options,
                                       long_options)
            self.options = opts
        except getopt.GetoptError as error:
            config.set_logger()
            logging.error("Bad syntax: %s" % str(error))
            self.usage()
            sys.exit()

        for o, p in opts:
            if o in ("-h", "--help"):
                self.usage()
                sys.exit()
            elif o in ("-d", "--debug"):
                config.debug = 1
            elif o in ("-v", "--verbose"):
                config.verbose = 1
            elif o in ("-o", "--output"):
                config.output = p
            elif o in ("-f", "--filtersdir"):
                config.filters_dir = p
            elif o in ("-b", "--pkgsfilter"):
                config.pkgs_filter = p
            elif o in ("-a", "--axi"):
                config.axi = p
            elif o in ("-e", "--dde"):
                config.dde_url = p
            elif o in ("-p", "--popconindex"):
                config.popcon_index = p
            elif o in ("-m", "--popcondir"):
                config.popcon_dir = p
            # getopt reports long options under the names declared in
            # long_options ("--indexmode", "--maxpopcon"); the underscore
            # spellings are kept in the tuples but can never be returned.
            elif o in ("-u", "--indexmode", "--index_mode"):
                config.index_mode = p
            elif o in ("-l", "--clustersdir"):
                config.clusters_dir = p
            elif o in ("-c", "--kmedoids"):
                config.k_medoids = int(p)
            elif o in ("-x", "--maxpopcon", "--max_popcon"):
                config.max_popcon = int(p)
            elif o in ("-w", "--weight"):
                config.weight = p
            elif o in ("-s", "--strategy"):
                config.strategy = p
            elif o in ("-z", "--profile_size"):
                config.profile_size = int(p)
            # "-r" is the short option declared for profiling; "-z" is
            # already consumed by --profile_size above.
            elif o in ("-r", "--profiling"):
                config.profiling = p
            elif o in ("-n", "--neighbors"):
                config.k_neighbors = int(p)
            elif o in ("-i", "--init"):
                continue
            elif o in ("-t", "--train"):
                continue
            else:
                assert False, "unhandled option"
Example #15
0
class BagOfWords():
    """TF-IDF bag-of-words model with a Gaussian naive Bayes classifier.

    Package text is assembled from description terms, debtags and section,
    vectorized with ``TfidfVectorizer`` and classified with ``GaussianNB``.
    ``self.classifier`` only exists after ``train_model()`` has run.
    """

    # File locations, all derived from the configured user data directory.
    USER_DATA_DIR = Config().user_data_dir
    BAG_OF_WORDS_DIR = USER_DATA_DIR + 'bag_of_words/'
    BAG_OF_WORDS_MODEL = BAG_OF_WORDS_DIR + 'bag_of_words_model.pickle'
    BAG_OF_WORDS_TERMS = BAG_OF_WORDS_DIR + 'bag_of_words_terms.pickle'
    BAG_OF_WORDS_DEBTAGS = BAG_OF_WORDS_DIR + 'bag_of_words_debtags.pickle'
    BAG_OF_WORDS_PKGS_CLASSIFICATION = BAG_OF_WORDS_DIR + \
        'bow_pkgs_classification.pickle'

    # Status codes; train_model() returns CREATED_MODEL.
    MODEL_ALREADY_CREATED = 1
    CREATED_MODEL = 0

    @staticmethod
    def save(bag_of_words, file_path):
        """Pickle ``bag_of_words`` into ``file_path``."""
        with open(file_path, 'wb') as text:
            pickle.dump(bag_of_words, text)

    @staticmethod
    def load(file_path):
        """Return the object unpickled from ``file_path``."""
        with open(file_path, 'rb') as text:
            bag_of_words = pickle.load(text)

        return bag_of_words

    def __init__(self):
        """Create the TF-IDF vectorizer; no classifier is created yet."""
        self.vectorizer = TfidfVectorizer(max_df=0.8,
                                          max_features=5000,
                                          min_df=5,
                                          stop_words='english',
                                          use_idf=True)

    def check_dir(self):
        """Return True when the bag-of-words directory exists."""
        return os.path.exists(BagOfWords.BAG_OF_WORDS_DIR)

    def combine_pkg_info(self, description, debtags, section):
        """Append ``debtags`` and ``section`` to ``description``.

        ``description`` is modified in place and also returned.
        """
        description.extend(debtags)
        description.append(section)

        return description

    def classify_pkg(self, attribute_vector, transform=True):
        """Return the label predicted for one package.

        With ``transform`` true, ``attribute_vector`` is first run through
        the fitted vectorizer; otherwise it is passed to the classifier as
        given. Requires ``train_model()`` to have set ``self.classifier``.
        """
        if transform:
            pkg_feature = self.vectorizer.transform([attribute_vector])
            pkg_feature = pkg_feature.toarray()
        else:
            pkg_feature = attribute_vector

        label = self.classifier.predict(pkg_feature)

        return label[0]

    def create_pkg_data(self, pkg, axi, cache, ml_data):
        """Return the package's terms, debtags and section as one string."""
        description = self.get_pkg_description(pkg, cache, ml_data)
        debtags = self.get_pkg_debtags(pkg, axi, ml_data)
        section = self.get_pkg_section(pkg, cache, ml_data)

        return ' '.join(self.combine_pkg_info(description, debtags, section))

    def get_pkgs_classification(self, pkgs_list):
        """Return the stored classification of each package in order.

        The classification is the last element of each package's entry in
        the pickled ``MachineLearningData.PKGS_CLASSIFICATIONS`` file.
        Raises KeyError for a package missing from that file.
        """
        # Pickle data is binary, so read it in binary mode like load() does.
        with open(MachineLearningData.PKGS_CLASSIFICATIONS, 'rb') as pkgs:
            pkgs_data = pickle.load(pkgs)

        return [pkgs_data[pkg_name][-1] for pkg_name in pkgs_list]

    def get_pkg_description(self, pkg, cache, ml_data):
        """Return ``ml_data.get_pkg_terms(cache, pkg)``."""
        return ml_data.get_pkg_terms(cache, pkg)

    def get_pkg_debtags(self, pkg, axi, ml_data):
        """Return the package debtags with '::' replaced by '_'."""
        return [debtag.replace('::', '_')
                for debtag in ml_data.get_pkg_debtags(axi, pkg)]

    def get_pkg_section(self, pkg, cache, ml_data):
        """Return ``ml_data.get_pkg_section(cache, pkg)``."""
        return ml_data.get_pkg_section(cache, pkg)

    def get_used_terms_and_debtags(self, features_lists):
        """Split feature names into ``(terms, debtags)``.

        A feature containing '_' is treated as a debtag and gets its '_'
        turned back into '::'; every other feature is a term.
        """
        terms, debtags = [], []

        for feature in features_lists:
            if '_' in feature:
                debtags.append(feature.replace('_', '::'))
            else:
                terms.append(feature)

        return terms, debtags

    def prepare_data(self, pkg_list, axi, cache, ml_data):
        """Return ``(descriptions, classifications)`` for ``pkg_list``."""
        pkgs_description = [
            self.create_pkg_data(pkg, axi, cache, ml_data)
            for pkg in pkg_list
        ]
        pkgs_classification = self.get_pkgs_classification(pkg_list)

        return (pkgs_description, pkgs_classification)

    def save_features(self, features, path):
        """Pickle ``features`` into ``path``.

        The bag-of-words directory is created first when it is missing.
        """
        if not self.check_dir():
            os.mkdir(BagOfWords.BAG_OF_WORDS_DIR)

        # 'wb' replaces the former 'wa', which is not a valid open() mode
        # and is not binary.
        with open(path, 'wb') as feature_file:
            pickle.dump(features, feature_file)

    def save_pkgs_features(self, path, pkgs_list, features_array,
                           pkg_classification):
        """Pickle a dict of package -> feature row plus classification.

        Each value is the package's row of ``features_array`` as a list,
        with the package's classification appended as the last element.
        """
        pkgs_classification = {}

        for index, pkg in enumerate(pkgs_list):
            value = features_array[index, :].tolist()
            value.append(pkg_classification[index])

            pkgs_classification[pkg] = value

        # 'wb' replaces the former 'wa', which is not a valid open() mode
        # and is not binary.
        with open(path, 'wb') as bow_pkgs_classification:
            pickle.dump(pkgs_classification, bow_pkgs_classification)

    def train_model(self, pkgs_list, axi, save_files=True):
        """Fit the vectorizer and classifier on ``pkgs_list``.

        With ``save_files`` true, the used terms, the used debtags and the
        per-package feature rows are pickled to their class-level paths.
        Returns ``BagOfWords.CREATED_MODEL``.
        """
        cache = Cache()
        ml_data = MachineLearningData()

        pkgs_description, pkg_classification = self.prepare_data(
            pkgs_list, axi, cache, ml_data)
        pkg_features = self.vectorizer.fit_transform(pkgs_description)
        features_array = pkg_features.toarray()

        terms, debtags = self.get_used_terms_and_debtags(
            self.vectorizer.get_feature_names())

        self.classifier = GaussianNB()
        self.classifier.fit(features_array, pkg_classification)

        path = BagOfWords.BAG_OF_WORDS_PKGS_CLASSIFICATION

        if save_files:
            self.save_features(terms, BagOfWords.BAG_OF_WORDS_TERMS)
            self.save_features(debtags, BagOfWords.BAG_OF_WORDS_DEBTAGS)
            self.save_pkgs_features(path, pkgs_list, features_array,
                                    pkg_classification)

        return BagOfWords.CREATED_MODEL
Example #16
0
 def load_details(self):
     """Load details from DDE, or from apt as fallback.

     DDE is used when ``connect_to_dde()`` returns a true value.
     """
     config = Config()
     dde_available = self.connect_to_dde(config.dde_server, config.dde_port)
     if not dde_available:
         self.load_details_from_apt()
         return
     self.load_details_from_dde(config.dde_url)
Example #17
0
class MachineLearningData():
    """Build and persist the per-package feature tables used for training.

    Each package is described by a row of 0/1 flags, one per known debtag
    and one per known description term, followed by its classification.
    """

    XAPIAN_DATABASE_PATH = path.expanduser(
        '~/.app-recommender/axi_desktopapps/')
    USER_DATA_DIR = Config().user_data_dir
    BASE_DIR = Config().base_dir

    PKG_DATA_PATH = USER_DATA_DIR + 'pkg_data.txt'

    # Files written by create_data() through save_pkg_data() (pickled).
    PKGS_CLASSIFICATIONS = USER_DATA_DIR + 'pkgs_classifications.txt'
    MACHINE_LEARNING_TERMS = USER_DATA_DIR + 'machine_learning_terms.txt'
    MACHINE_LEARNING_DEBTAGS = USER_DATA_DIR + 'machine_learning_debtags.txt'
    MACHINE_LEARNING_TRAINING = USER_DATA_DIR + 'machine_learning_training.txt'

    def __init__(self):
        """Open the xapian index and create the stemmer and filters.

        Valid debtags are read from the "debtags" file in the configured
        filters directory, skipping lines that start with "#".
        """
        self.axi = xapian.Database(MachineLearningData.XAPIAN_DATABASE_PATH)
        self.stemmer = Stemmer.Stemmer('english')

        valid_tags = []
        with open(path.join(Config().filters_dir, "debtags")) as tags:
            valid_tags = [
                line.strip() for line in tags if not line.startswith("#")
            ]
        self.filter_tag = FilterTag(valid_tags)
        self.filter_description = FilterDescription()

    def create_data(self, labels):
        """Classify the packages, build their feature rows and save them.

        Creates the user data directory when missing, then writes the
        sorted term names, the sorted debtag names and the classification
        table to their class-level paths. Returns the classification table.
        """
        if not path.exists(MachineLearningData.USER_DATA_DIR):
            makedirs(MachineLearningData.USER_DATA_DIR)

        pkgs = self.get_pkgs_classification(data_cl.square_percent_function,
                                            labels)

        cache = apt.Cache()

        terms_name = self.get_terms_for_all_pkgs(cache, pkgs.keys())
        debtags_name = self.get_debtags_for_all_pkgs(self.axi, pkgs.keys())

        debtags_name = self.filter_debtags(debtags_name)
        debtags_name = sorted(debtags_name)
        terms_name = self.filter_terms(terms_name)
        terms_name = sorted(terms_name)

        pkgs_classifications = (self.get_pkgs_table_classification(
            self.axi, pkgs, cache, debtags_name, terms_name))

        self.save_pkg_data(terms_name,
                           MachineLearningData.MACHINE_LEARNING_TERMS)
        self.save_pkg_data(debtags_name,
                           MachineLearningData.MACHINE_LEARNING_DEBTAGS)
        self.save_pkg_data(pkgs_classifications,
                           MachineLearningData.PKGS_CLASSIFICATIONS)

        return pkgs_classifications

    def get_pkgs_classification(self, percent_function, labels):
        """Return a dict mapping every package to one of ``labels``.

        Packages are scored with ``percent_function(modify, access, now)``
        and ordered from the highest score to the lowest. The ordered list
        is cut into ``len(labels)`` equally sized slices, the i-th slice
        getting ``labels[i]``; packages left over by the integer division
        get the last label.

        Raises ZeroDivisionError when ``labels`` is empty.
        """
        pkgs_percent = {}
        pkgs_classification = {}
        time_now = calendar.timegm(time.gmtime())
        pkg_time = PkgTime()
        pkg_data = pkg_time.get_package_data()

        for name, time_values in pkg_data.iteritems():
            modify = time_values[0]
            access = time_values[1]

            pkgs_percent[name] = percent_function(modify, access, time_now)

        # Ascending sort followed by a reversal, so equal scores keep the
        # relative order they always had.
        pkgs = pkgs_percent.keys()
        pkgs = sorted(pkgs, key=lambda pkg: pkgs_percent[pkg])
        pkgs = list(reversed(pkgs))

        size = len(pkgs) // len(labels)
        for index, label in enumerate(labels):
            index_begin = size * index
            index_end = index_begin + size
            classifications = dict.fromkeys(pkgs[index_begin:index_end], label)
            pkgs_classification.update(classifications)

        # Leftover packages: compare against the number of packages (not
        # labels), take the remaining slice (not one name, whose characters
        # would become keys) and use the last label (not its last char).
        index_begin = size * len(labels)
        if index_begin < len(pkgs):
            classifications = dict.fromkeys(pkgs[index_begin:], labels[-1])
            pkgs_classification.update(classifications)

        return pkgs_classification

    def get_pkg_data(self, axi, pkg_name, data_type):
        """Return index terms of ``pkg_name`` selected by ``data_type``.

        The package is looked up through the term 'XP' + name. Terms that
        start with ``data_type`` are returned without that prefix; when
        ``data_type`` is 'term', the remaining terms whose first character
        is lowercase are returned unchanged.
        """
        pkg_name = 'XP' + pkg_name

        query = xapian.Query(xapian.Query.OP_OR, [pkg_name])
        enquire = xapian.Enquire(axi)
        enquire.set_query(query)

        mset = enquire.get_mset(0, 10)

        pkg_info = []
        for pkg in mset:
            for term in axi.get_document(pkg.docid).termlist():

                pkg_term = term.term

                if pkg_term.startswith(data_type):
                    pkg_info.append(pkg_term[len(data_type):])
                elif data_type == 'term':
                    if pkg_term[0].islower():
                        pkg_info.append(pkg_term)

        return pkg_info

    def get_pkg_debtags(self, axi, pkg_name):
        """Return the package's 'XT'-prefixed index terms, prefix removed."""
        return self.get_pkg_data(axi, pkg_name, 'XT')

    def get_pkg_terms(self, cache, pkg_name):
        """Return the stemmed, filtered words of the package description.

        Non-letters become spaces and the text is lowercased; tokens
        rejected by ``self.filter_description`` are dropped before
        stemming.
        """
        description = cache[pkg_name].candidate.description.strip()
        description = re.sub('[^a-zA-Z]', ' ', description)

        tokens = description.lower().split()
        stems = [
            self.stemmer.stemWord(token) for token in tokens
            if self.filter_description(token)
        ]

        return stems

    def get_pkg_section(self, cache, pkg_name):
        """Return ``cache[pkg_name].section``."""
        return cache[pkg_name].section

    def get_debtags_name(self, file_path):
        """Return the stripped lines of ``file_path`` as a list."""
        with open(file_path, 'r') as text:
            debtags_name = [debtag.strip() for debtag in text]

        return debtags_name

    def create_row_table_list(self, labels_name, pkg_elements):
        """Return one 0/1 flag per entry of ``labels_name``.

        A flag is 1 when the entry occurs in ``pkg_elements``.
        """
        # A set makes each membership test constant time; the flags are
        # the same as with the original list lookup.
        present = set(pkg_elements)
        return [1 if name in present else 0 for name in labels_name]

    def get_terms_for_all_pkgs(self, cache, pkgs):
        """Return the set of description terms used by any of ``pkgs``."""
        pkg_terms = set()
        for pkg in pkgs:
            pkg_terms.update(self.get_pkg_terms(cache, pkg))

        return pkg_terms

    def get_debtags_for_all_pkgs(self, axi, pkgs):
        """Return the set of debtags used by any of ``pkgs``."""
        pkg_debtags = set()
        for pkg in pkgs:
            pkg_debtags.update(self.get_pkg_debtags(axi, pkg))

        return pkg_debtags

    def filter_terms(self, terms):
        """Return the terms accepted by ``self.filter_description``."""
        return [term for term in terms if self.filter_description(term)]

    def filter_debtags(self, debtags):
        """Return the debtags accepted by ``self.filter_tag``.

        The filter is called with the 'XT' prefix added to each tag.
        """
        return [tag for tag in debtags if self.filter_tag('XT' + tag)]

    def get_pkgs_table_classification(self, axi, pkgs, cache, debtags_name,
                                      terms_name):
        """Return a dict of package -> feature row.

        A row holds the debtag flags, then the term flags, then the
        package's classification taken from ``pkgs``.
        """
        pkgs_classification = {}

        for key, value in pkgs.iteritems():

            pkgs_classification[key] = []

            debtags = self.get_pkg_debtags(axi, key)
            debtags = self.create_row_table_list(debtags_name, debtags)
            pkgs_classification[key].extend(debtags)

            terms = self.get_pkg_terms(cache, key)
            terms = self.create_row_table_list(list(terms_name), terms)
            pkgs_classification[key].extend(terms)

            pkgs_classification[key].append(value)

        return pkgs_classification

    def save_pkg_data(self, pkg_data, file_path):
        """Pickle ``pkg_data`` into ``file_path``."""
        with open(file_path, 'wb') as text:
            pickle.dump(pkg_data, text)
Example #18
0
 def setUpClass(self):
     """Open the xapian database found at ``Config().axi``."""
     self.axi = xapian.Database(Config().axi)
Example #19
0
import os
import re
import commands

from apprecommender.data_classification import get_time_from_package
from apprecommender.config import Config
from apprecommender.user import LocalSystem

# Value of Config().user_data_dir, captured once when this module is imported.
USER_DATA_DIR = Config().user_data_dir


class PkgTime:

    def __init__(self):
        """Create a PkgTime; the constructor sets no attributes."""

    def create_pkg_data(self):
        """Compute, save and return the times of the local user's packages.

        The package profile comes from a LocalSystem user after
        ``maximal_pkg_profile()`` and ``no_auto_pkg_profile()``.
        """
        local_user = LocalSystem()
        local_user.maximal_pkg_profile()
        local_user.no_auto_pkg_profile()

        pkgs_time = self.get_packages_time(local_user.pkg_profile)
        self.save_package_time(pkgs_time)
        return pkgs_time

    def get_best_time(self, pkg):
        valid_regex = re.compile(
            r'/usr/bin/|/usr/sbin|/usr/game/|/usr/lib/.+/')
        pkg_files = commands.getoutput('dpkg -L {}'.format(pkg))
    # iterations = 3
    # content_based = ['cb']
    # collaborative = ['knn_eset']
    # hybrid = ['knnco']
    # profile_size = [50,100]
    # neighbors = [50]
    iterations = 20
    content_based = ['cb', 'cbt', 'cbd', 'cbh',
                     'cb_eset', 'cbt_eset', 'cbd_eset', 'cbh_eset']
    collaborative = ['knn_eset', 'knn', 'knn_plus']
    hybrid = ['knnco', 'knnco_eset']
    profile_size = [10, 20, 40, 60, 80, 100, 140, 170, 200, 240]
    neighbors = [3, 5, 10, 20, 30, 50, 70, 100, 150, 200]

    cfg = Config()
    cfg.strategy = sys.argv[1]

    # user =
    # PopconSystem("/root/.app-recommender/popcon-entries/4a/4a67a295ec14826db2aa1d90be2f1623")
    user = PopconSystem(
        "/root/.app-recommender/popcon-entries/8b/8b44fcdbcf676e711a153d5db09979d7")  # noqa
    # user = PopconSystem(sys.argv[1])
    user.filter_pkg_profile(cfg.pkgs_filter)
    user.maximal_pkg_profile()

    if cfg.strategy in content_based:
        run_content(user, cfg)
    if cfg.strategy in collaborative:
        run_collaborative(user, cfg)
    if cfg.strategy in hybrid:
Example #21
0
    # collaborative_strategies = ['knn']

    strategy_category = sys.argv[1]
    if strategy_category == "content":
        strategies = content_strategies
        sizes = profile_size
        option_str = "profile"
    elif strategy_category == "collaborative":
        strategies = collaborative_strategies
        sizes = neighbor_size
        option_str = "neighborhood"
    else:
        print "Usage: profile-suite strategy_category sample_file"
        exit(1)

    cfg = Config()
    population_sample = []
    sample_file = sys.argv[2]
    sample_str = sample_file.split('/')[-1]
    with open(sample_file, 'r') as f:
        for line in f.readlines():
            user_id = line.strip('\n')
            population_sample.append(
                os.path.join(cfg.popcon_dir, user_id[:2], user_id))
    sample_dir = ("results/%s/%s" %
                  (strategy_category, sample_str))
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    for strategy in strategies:
        cfg.strategy = strategy
import os
import pickle
import sys
import getopt

sys.path.insert(0, "{0}/../".format(os.path.dirname(__file__)))

from apprecommender.ml.cross_validation import (CrossValidationBVA,
                                                CrossValidationBOW)
from apprecommender.evaluation import (SimpleAccuracy, Precision, Recall, FPR,
                                       F_score)
from apprecommender.ml.data import MachineLearningData
from apprecommender.ml.bag_of_words import BagOfWords
from apprecommender.config import Config

# Value of Config().base_dir, captured once when this module is imported.
BASE_DIR = Config().base_dir
# Sub-folder of BASE_DIR; presumably where cross validation results are
# kept -- its use is not visible here.
CROSS_VALIDATION_FOLDER = BASE_DIR + '/cross_validation_data/'


def get_strategy(ml_strategy_str, pkg_data, partition_size, rounds,
                 metrics_list, labels):
    """Instantiate the cross validation class for ``ml_strategy_str``.

    'bow' selects CrossValidationBOW; any other value selects
    CrossValidationBVA. The remaining arguments are passed through to the
    selected class unchanged.
    """
    if ml_strategy_str == 'bow':
        strategy_class = CrossValidationBOW
    else:
        strategy_class = CrossValidationBVA

    return strategy_class(pkg_data, partition_size, rounds,
                          metrics_list, labels)


def get_pkg_data(ml_strategy_str, ml_data, labels):
    if ml_strategy_str == 'bow':
Example #23
0
    def load(self):
        """Parse ``sys.argv[1:]`` with getopt and store values on Config.

        The parsed ``(option, value)`` pairs are kept in ``self.options``.
        On a syntax error the problem is logged, ``self.usage()`` is called
        and the process exits; ``-h``/``--help`` calls ``self.usage()`` and
        exits as well. ``--init`` and ``--train`` are accepted but not
        acted upon here.
        """
        config = Config()
        short_options = "hdvo:f:b:a:e:p:m:u:l:c:x:w:s:z:r:n:idvo:tdvo"
        long_options = ["help", "debug", "verbose", "output=", "filtersdir=",
                        "pkgsfilter=", "axi=", "dde=", "popconindex=",
                        "popcondir=", "indexmode=", "clustersdir=",
                        "kmedoids=", "maxpopcon=", "weight=", "strategy=",
                        "profile_size=", "profiling=", "neighbors=", "init",
                        "train"]
        try:
            opts, args = getopt.getopt(sys.argv[1:], short_options,
                                       long_options)
            self.options = opts
        except getopt.GetoptError as error:
            config.set_logger()
            logging.error("Bad syntax: %s" % str(error))
            self.usage()
            sys.exit()

        for o, p in opts:
            if o in ("-h", "--help"):
                self.usage()
                sys.exit()
            elif o in ("-d", "--debug"):
                config.debug = 1
            elif o in ("-v", "--verbose"):
                config.verbose = 1
            elif o in ("-o", "--output"):
                config.output = p
            elif o in ("-f", "--filtersdir"):
                config.filters_dir = p
            elif o in ("-b", "--pkgsfilter"):
                config.pkgs_filter = p
            elif o in ("-a", "--axi"):
                config.axi = p
            elif o in ("-e", "--dde"):
                config.dde_url = p
            elif o in ("-p", "--popconindex"):
                config.popcon_index = p
            elif o in ("-m", "--popcondir"):
                config.popcon_dir = p
            # getopt reports long options under the names declared in
            # long_options ("--indexmode", "--maxpopcon"); the underscore
            # spellings are kept in the tuples but can never be returned.
            elif o in ("-u", "--indexmode", "--index_mode"):
                config.index_mode = p
            elif o in ("-l", "--clustersdir"):
                config.clusters_dir = p
            elif o in ("-c", "--kmedoids"):
                config.k_medoids = int(p)
            elif o in ("-x", "--maxpopcon", "--max_popcon"):
                config.max_popcon = int(p)
            elif o in ("-w", "--weight"):
                config.weight = p
            elif o in ("-s", "--strategy"):
                config.strategy = p
            elif o in ("-z", "--profile_size"):
                config.profile_size = int(p)
            # "-r" is the short option declared for profiling; "-z" is
            # already consumed by --profile_size above.
            elif o in ("-r", "--profiling"):
                config.profiling = p
            elif o in ("-n", "--neighbors"):
                config.k_neighbors = int(p)
            elif o in ("-i", "--init"):
                continue
            elif o in ("-t", "--train"):
                continue
            else:
                assert False, "unhandled option"
Example #24
0
 def __init__(self):
     """Store a Config instance on ``self.config``."""
     config = Config()
     self.config = config
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import sys
import os
import logging

sys.path.insert(0, '../')

from apprecommender.config import Config
from apprecommender.data import PopconXapianIndex

if __name__ == '__main__':

    cfg = Config()
    cfg.index_mode = "recluster"
    logging.info("Starting clustering experiments")
    logging.info("Medoids: %d\t Max popcon:%d" %
                 (cfg.k_medoids, cfg.max_popcon))
    cfg.popcon_dir = os.path.expanduser(
        "~/org/popcon.debian.org/popcon-mail/popcon-entries/")
    cfg.popcon_index = cfg.popcon_index + ("_%dmedoids%dmax" %
                                           (cfg.k_medoids, cfg.max_popcon))
    cfg.clusters_dir = cfg.clusters_dir + ("_%dmedoids%dmax" %
                                           (cfg.k_medoids, cfg.max_popcon))
    pxi = PopconXapianIndex(cfg)
    logging.info("Overall dispersion: %f\n" % pxi.cluster_dispersion)
    # Write clustering log
    output = open(("results/clustering/%dmedoids%dmax" %
                  (cfg.k_medoids, cfg.max_popcon)), 'w')
Example #26
0
    def setUp(self):
        """Disable the root logger and remember ``axi_desktopapps``."""
        root_logger = logging.getLogger()
        root_logger.disabled = True

        config = Config()
        self.axi_desktopapps = config.axi_desktopapps
Example #27
0
 def tearDown(self):
     """Write the value kept on this test back to ``axi_desktopapps``."""
     saved_value = self.axi_desktopapps
     config = Config()
     config.axi_desktopapps = saved_value
Example #28
0
    def test_error_init_on_run_apprec(self):
        """apprec.run() must return ERROR_INIT for the index value "asd"."""
        config = Config()
        config.axi_desktopapps = "asd"

        outcome = apprec.run()
        self.assertEqual(apprec.ERROR_INIT, outcome)
Example #29
0
import sys
sys.path.insert(0, '../')
import logging
import datetime

from apprecommender.config import Config
from apprecommender.data import FilteredPopconXapianIndex

if __name__ == '__main__':
    # os is needed for the path handling below; importing it here keeps
    # this script block independent of a module-level import.
    import os

    # All inputs and the resulting index live under ~/.app-recommender/.
    base_dir = os.path.expanduser("~/.app-recommender/")
    axi_path = os.path.join(base_dir, "axi_XD")
    path = os.path.join(base_dir, "popcon_XD")
    popcon_dir = os.path.join(base_dir, "popcon-entries")
    tags_filter = os.path.join(base_dir, "filters/debtags")

    # set up config for logging
    cfg = Config()

    begin_time = datetime.datetime.now()
    logging.info("Popcon indexing started at %s" % begin_time)
    # use config file or command line options
    index = FilteredPopconXapianIndex(path, popcon_dir, axi_path, tags_filter)

    end_time = datetime.datetime.now()
    logging.info("Popcon indexing completed at %s" % end_time)
    logging.info("Number of documents (submissions): %d" %
                 index.get_doccount())

    delta = end_time - begin_time
    # total_seconds() covers the whole interval; timedelta.seconds leaves
    # out whole days and so under-reports runs longer than 24 hours.
    logging.info("Time elapsed: %d seconds." % delta.total_seconds())
            return 0
        return sum(self.f05) / len(self.f05)

    def get_mcc_summary(self):
        """Return the mean of ``self.mcc``, or 0 when it is empty."""
        values = self.mcc
        return sum(values) / len(values) if values else 0

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print "Usage: k-suite strategy_str sample_file"
        exit(1)
    threshold = 20
    iterations = 30
    neighbors = [3, 5, 10, 50, 100, 150, 200, 300, 400, 500]
    cfg = Config()
    cfg.strategy = sys.argv[1]
    sample_file = sys.argv[2]
    population_sample = []
    with open(sample_file, 'r') as f:
        for line in f.readlines():
            user_id = line.strip('\n')
            population_sample.append(
                os.path.join(cfg.popcon_dir, user_id[:2], user_id))
    # setup dictionaries and files
    roc_summary = {}
    recommended = {}
    precision_summary = {}
    f05_summary = {}
    mcc_summary = {}
    sample_dir = ("results/k-suite/%s" % sample_file.split('/')[-1])