Example #1
0
def main(db_fpath,
         db_name,
         cross_val_folder,
         param_value,
         est_name,
         rand_seed=None,
         num_cores=-1):
    '''
    Dispatches jobs in multiple cores.

    Arguments
    ---------
    db_fpath: path to the annotations database file
    db_name: table name to read inside the database
    cross_val_folder: folder with the train/validation/test split files
    param_value: hyper-parameter value forwarded to the estimator
    est_name: estimator to build ('lda' selects LDA, anything else Bayes)
    rand_seed: optional seed for the random number generator
    num_cores: kept for interface compatibility; not used by this body
    '''

    seed(rand_seed)

    #get cross validation dicts
    user_items_to_filter, user_validation_tags, user_test_tags = \
            load_train_test_validation(cross_val_folder)

    #all tags used by all users. Used to create a random set of tags
    #excluding these ones
    used_tags = set()
    for user in user_items_to_filter:
        used_tags.update(user_validation_tags[user])
        used_tags.update(user_test_tags[user])

    with AnnotReader(db_fpath) as reader:
        reader.change_table(db_name)

        annot_filter = FilteredUserItemAnnotations(user_items_to_filter)

        #Generate 50 random tags not used by any user the test set
        #Also creates some indexes used to define gamma items
        annotations = annot_filter.annotations(reader.iterate())
        user_to_item = defaultdict(set)
        items = set()
        tags = set()
        random_tags = []
        #companion set gives O(1) membership tests while random_tags keeps
        #insertion order for the shuffle below
        random_tags_seen = set()
        for annotation in annotations:
            user = annotation['user']
            item = annotation['item']
            tag = annotation['tag']

            user_to_item[user].add(item)
            items.add(item)
            tags.add(tag)

            #candidate random tag: never used in validation/test, not taken yet
            if tag not in used_tags and tag not in random_tags_seen:
                random_tags_seen.add(tag)
                random_tags.append(tag)

        shuffle(random_tags)
        random_tags = random_tags[:NUM_RANDOM_TAGS]

        #Gets number of tags and items
        num_items = len(items)
        num_tags = len(tags)

        #Create estimator. The annotation stream is re-created because the
        #previous generator was exhausted by the indexing loop above.
        annotations = annot_filter.annotations(reader.iterate())
        if est_name == 'lda':
            est = create_lda_estimator(annotations, param_value, num_items,
                                       num_tags)
        else:
            est = create_bayes_estimator(annotations, param_value)

        annotations = annot_filter.annotations(reader.iterate())
        value_calc = ValueCalculator(est, annotations)

        run_exp(user_items_to_filter, user_test_tags, user_to_item, num_items,
                random_tags, value_calc)
Example #2
0
def run_one(args):
    '''
    This method will be run by parallel processes. Basically, it is the
    main method for each possible parameter being tested. It will work as
    follows:

    1. Loads train, validation and test separation from files

    2. Values of p(i|u) are computed for the gamma items set for each user
       based on the train set. Gamma items is just every item excluding the
       user items.

    3. Computes p(i|t,u) for a set of tags gamma items for each user. The set
       of tags is composed of the previous user tags (those on the test set),
       the tags which were used on the validation set, the tags used on the
       train set and 50 random tags not previously used by the user.

    4. Saves p(i|u) and p(i|t,u) for items and tags considered above on the
       output folder. This provides sufficient information for choosing the best
       estimator (on the validation set) and performing further experiments
       (actually computing tag values) on the test set.

    Arguments
    ---------
    args: tuple of (db_fpath, db_name, output_folder, cross_val_folder,
          est_name, param_one, value_one, param_two, value_two); packed in a
          single tuple so the function can be mapped over a process pool
    '''

    #unbox arguments
    db_fpath, db_name, output_folder, cross_val_folder, est_name, \
            param_one, value_one, param_two, value_two = args

    #get cross validation dicts
    user_items_to_filter, user_validation_tags, user_test_tags = \
            load_train_test_validation(cross_val_folder)

    #all tags used by all users. Used to create a random set of tags
    #excluding these ones
    used_tags = set()
    for user in user_items_to_filter:
        used_tags.update(user_validation_tags[user])
        used_tags.update(user_test_tags[user])

    with AnnotReader(db_fpath) as reader:
        reader.change_table(db_name)

        annot_filter = FilteredUserItemAnnotations(user_items_to_filter)

        #Generate 50 random tags not used by any user in validation or test
        #Also creates some indexes used to define gamma items
        annotations = annot_filter.annotations(reader.iterate())
        user_to_item = defaultdict(set)
        items = set()
        tags = set()
        random_tags = []
        #companion set gives O(1) membership tests while random_tags keeps
        #insertion order for the shuffle below
        random_tags_seen = set()
        for annotation in annotations:
            user = annotation['user']
            item = annotation['item']
            tag = annotation['tag']

            user_to_item[user].add(item)
            items.add(item)
            tags.add(tag)

            #candidate random tag: never used in validation/test, not taken yet
            if tag not in used_tags and tag not in random_tags_seen:
                random_tags_seen.add(tag)
                random_tags.append(tag)

        shuffle(random_tags)
        random_tags = random_tags[:NUM_RANDOM_TAGS]

        #Gets number of tags and items
        num_items = len(items)
        num_tags = len(tags)

        #Create estimator. The annotation stream is re-created because the
        #previous generator was exhausted by the indexing loop above.
        annotations = annot_filter.annotations(reader.iterate())
        save_lhood = False
        if est_name == 'lda':
            est = create_lda_estimator(annotations, value_one, num_items,
                                       num_tags, value_two)
            save_lhood = True
        else:
            est = create_bayes_estimator(annotations, value_one, value_two)

        #one output folder per parameter combination being tested
        param_out_folder = os.path.join(output_folder, \
                'params-%s-%f_%s-%f' % \
                (param_one, value_one, param_two, value_two))

        os.mkdir(param_out_folder)
        run_exp(user_items_to_filter, user_validation_tags, user_test_tags,
                user_to_item, num_items, random_tags, est, param_out_folder,
                save_lhood)
Example #3
0
def run_one(args):
    """
    This method will be run by parallel processes. Basically, it is the
    main method for each possible parameter being tested. It will work as
    follows:

    1. Loads train, validation and test separation from files

    2. Values of p(i|u) are computed for the gamma items set for each user
       based on the train set. Gamma items is just every item excluding the
       user items.

    3. Computes p(i|t,u) for a set of tags gamma items for each user. The set
       of tags is composed of the previous user tags (those on the test set),
       the tags which were used on the validation set, the tags used on the
       train set and 50 random tags not previously used by the user.

    4. Saves p(i|u) and p(i|t,u) for items and tags considered above on the
       output folder. This provides sufficient information for choosing the best
       estimator (on the validation set) and performing further experiments
       (actually computing tag values) on the test set.

    Arguments
    ---------
    args: tuple of (db_fpath, db_name, output_folder, cross_val_folder,
        est_name, param_one, value_one, param_two, value_two); packed in a
        single tuple so the function can be mapped over a process pool
    """

    # unbox arguments
    db_fpath, db_name, output_folder, cross_val_folder, est_name, param_one, value_one, param_two, value_two = args

    # get cross validation dicts
    user_items_to_filter, user_validation_tags, user_test_tags = load_train_test_validation(cross_val_folder)

    # all tags used by all users. Used to create a random set of tags excluding
    # these ones
    used_tags = set()
    for user in user_items_to_filter:
        used_tags.update(user_validation_tags[user])
        used_tags.update(user_test_tags[user])

    with AnnotReader(db_fpath) as reader:
        reader.change_table(db_name)

        annot_filter = FilteredUserItemAnnotations(user_items_to_filter)

        # Generate 50 random tags not used by any user in validation or test
        # Also creates some indexes used to define gamma items
        annotations = annot_filter.annotations(reader.iterate())
        user_to_item = defaultdict(set)
        items = set()
        tags = set()
        random_tags = []
        # companion set gives O(1) membership tests while random_tags keeps
        # insertion order for the shuffle below
        random_tags_seen = set()
        for annotation in annotations:
            user = annotation["user"]
            item = annotation["item"]
            tag = annotation["tag"]

            user_to_item[user].add(item)
            items.add(item)
            tags.add(tag)

            # candidate random tag: never used in validation/test, not taken yet
            if tag not in used_tags and tag not in random_tags_seen:
                random_tags_seen.add(tag)
                random_tags.append(tag)

        shuffle(random_tags)
        random_tags = random_tags[:NUM_RANDOM_TAGS]

        # Gets number of tags and items
        num_items = len(items)
        num_tags = len(tags)

        # Create estimator. The annotation stream is re-created because the
        # previous generator was exhausted by the indexing loop above.
        annotations = annot_filter.annotations(reader.iterate())
        save_lhood = False
        if est_name == "lda":
            est = create_lda_estimator(annotations, value_one, num_items, num_tags, value_two)
            save_lhood = True
        else:
            est = create_bayes_estimator(annotations, value_one, value_two)

        # one output folder per parameter combination being tested
        param_out_folder = os.path.join(
            output_folder, "params-%s-%f_%s-%f" % (param_one, value_one, param_two, value_two)
        )

        os.mkdir(param_out_folder)
        run_exp(
            user_items_to_filter,
            user_validation_tags,
            user_test_tags,
            user_to_item,
            num_items,
            random_tags,
            est,
            param_out_folder,
            save_lhood,
        )
def main(db_fpath, db_name, cross_val_folder, param_value, est_name, 
         rand_seed=None, num_cores=-1):
    '''
    Dispatches jobs in multiple cores.

    Arguments
    ---------
    db_fpath: path to the annotations database file
    db_name: table name to read inside the database
    cross_val_folder: folder with the train/validation/test split files
    param_value: hyper-parameter value forwarded to the estimator
    est_name: estimator to build ('lda' selects LDA, anything else Bayes)
    rand_seed: optional seed for the random number generator
    num_cores: kept for interface compatibility; not used by this body
    '''

    seed(rand_seed)

    #get cross validation dicts
    user_items_to_filter, user_validation_tags, user_test_tags = \
            load_train_test_validation(cross_val_folder)

    #all tags used by all users. Used to create a random set of tags
    #excluding these ones
    used_tags = set()
    for user in user_items_to_filter:
        used_tags.update(user_validation_tags[user])
        used_tags.update(user_test_tags[user])

    with AnnotReader(db_fpath) as reader:
        reader.change_table(db_name)

        annot_filter = FilteredUserItemAnnotations(user_items_to_filter)

        #Generate 50 random tags not used by any user the test set
        #Also creates some indexes used to define gamma items
        annotations = annot_filter.annotations(reader.iterate())
        user_to_item = defaultdict(set)
        items = set()
        tags = set()
        random_tags = []
        #companion set gives O(1) membership tests while random_tags keeps
        #insertion order for the shuffle below
        random_tags_seen = set()
        for annotation in annotations:
            user = annotation['user']
            item = annotation['item']
            tag = annotation['tag']

            user_to_item[user].add(item)
            items.add(item)
            tags.add(tag)

            #candidate random tag: never used in validation/test, not taken yet
            if tag not in used_tags and tag not in random_tags_seen:
                random_tags_seen.add(tag)
                random_tags.append(tag)

        shuffle(random_tags)
        random_tags = random_tags[:NUM_RANDOM_TAGS]

        #Gets number of tags and items
        num_items = len(items)
        num_tags = len(tags)

        #Create estimator. The annotation stream is re-created because the
        #previous generator was exhausted by the indexing loop above.
        annotations = annot_filter.annotations(reader.iterate())
        if est_name == 'lda':
            est = create_lda_estimator(annotations, param_value, 
                num_items, num_tags)
        else:
            est = create_bayes_estimator(annotations, param_value)

        annotations = annot_filter.annotations(reader.iterate())
        value_calc = ValueCalculator(est, annotations)

        run_exp(user_items_to_filter, user_test_tags, user_to_item, num_items, 
                random_tags, value_calc)