def real_main(database, table, smooth_func, lambda_,
              out_folder):
    
    with AnnotReader(database) as reader:
        reader.change_table(table) 
        
        #Create Graph
        create_graph(reader.iterate(), out_folder)
      
        #Compute popularity
        tag_pop = collections.defaultdict(int)
        for annotation in reader.iterate():
            tag = annotation['tag']
            tag_pop[tag] += 1
            
        #Compute tag value
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(reader.iterate(), 
                                                        'tag', 'item')
        compute_tag_values(smooth_func, lambda_,
                           reader.iterate(), tag_to_item, tag_pop, out_folder)

        with io.open(os.path.join(out_folder, 'relevant_item.tags'), 'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in item_to_tag:
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' % (item, tag))
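Every example here leans on `index_creator.create_double_occurrence_index`, whose implementation is not shown. The unit test further down pins its contract; a minimal sketch consistent with that test, assuming annotations are dicts keyed by field name, is:

import collections

def create_double_occurrence_index(annotation_it, from_key, to_key):
    #Sketch only: one pass over the annotations builds the forward
    #index (from_key value -> set of to_key values) and its inverse
    from_to = collections.defaultdict(set)
    to_from = collections.defaultdict(set)
    for annotation in annotation_it:
        from_to[annotation[from_key]].add(annotation[to_key])
        to_from[annotation[to_key]].add(annotation[from_key])
    return from_to, to_from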
Example #2
def compute_for_user(database, table, user, relevant, annotated, 
                     smooth_func, lambda_, user_profile_size, out_folder):
    with AnnotReader(database) as reader:
        reader.change_table(table)
        
        #Relevant items by user are left out with this query
        query = {'$or' : [
                          { 'user':{'$ne'  : user} }, 
                          { 'item':{'$nin' : relevant} }
                         ]
                }
        
        #Probability estimator
        est = SmoothEstimator(smooth_func, lambda_, 
                              reader.iterate(query = query),
                              user_profile_size = user_profile_size)
        value_calc = value_calculator.ValueCalculator(est)
        
        fname = 'user_%d' % user
        user_folder = os.path.join(out_folder, fname)
        os.mkdir(user_folder)
        
        #Initial information
        with io.open(os.path.join(user_folder, 'info'), 'w') as info:
            info.write(u'#UID: %d\n' %user)
            
            relevant_str = ' '.join([str(i) for i in relevant])
            annotated_str = ' '.join([str(i) for i in annotated])
            
            info.write(u'# %d relevant  items: %s\n' % (len(relevant),
                                                        relevant_str))
            info.write(u'# %d annotated items: %s\n' % (len(annotated),
                                                        annotated_str))
        
        #Create occurrence indexes
        iterator = reader.iterate(query = query)
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(iterator, 
                                                         'tag', 'item')
            
        #Items to consider <-> Gamma items
        items_to_consider = set(xrange(est.num_items()))
        annotated_set = set(annotated)
        items_to_consider.difference_update(annotated_set)
        
        compute_tag_values(est, value_calc, tag_to_item, user, 
                           user_folder, 
                           np.array([i for i in items_to_consider]))
        
        relevant_tags_fpath = os.path.join(user_folder, 'relevant_item.tags')
        with io.open(relevant_tags_fpath, 'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in relevant:
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' %(item, tag))
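The `$or` query above drops exactly the annotations made by `user` on one of its `relevant` items (by De Morgan, an annotation is kept when the user differs or the item is not relevant). The real filtering happens inside MongoDB via `AnnotReader`; a plain-Python equivalent, assuming dict annotations (the helper name `leave_out` is ours), would be:

def leave_out(annotations, user, relevant):
    #Sketch only: mirrors the {'$or': [...]} query above
    relevant_set = set(relevant)
    for annot in annotations:
        if annot['user'] != user or annot['item'] not in relevant_set:
            yield annot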
def user_tag_pairs_to_filter(users_to_consider, annotations, perc_tags=.1,
                             num_random_tags=100):
    '''
    Gets user-tag pairs to filter. A tag is only a candidate for removal
    if it is used by more than one user. This method also returns random
    tags to compute value for.
    '''

    user_to_tags, tags_to_user = create_double_occurrence_index(
        annotations, 'user', 'tag')

    #Generate candidate tags for removal, they have to be used by more than
    #one user.
    tags_to_remove = {}
    for user in users_to_consider:
        possible_tags = []
        for tag in user_to_tags[user]:
            if len(tags_to_user[tag]) > 1:  #We only consider tags with >1 user
                possible_tags.append(tag)
                tags_to_user[tag].remove(user)  #Remove this user from the count

        #num tags to remove for this user
        num_tags = int(perc_tags * len(user_to_tags[user]))

        #Generate random candidates: shuffle before slicing, so that the
        #selection itself is random (slicing first would always pick the
        #same tags)
        shuffle(possible_tags)  #In place
        candidate_tags = possible_tags[:num_tags]

        tags_to_remove[user] = candidate_tags

    #Generate Random tags
    possible_tags = range(len(tags_to_user))
    shuffle(possible_tags)
    random_tags = []

    for tag in possible_tags:
        used_or_hidden = False

        for user in users_to_consider:
            #gets tags not used by any considered user (hidden or not)
            if tag in user_to_tags[user] or tag in tags_to_remove[user]:
                used_or_hidden = True
                break

        if not used_or_hidden:
            random_tags.append(tag)

        if len(random_tags) == num_random_tags:
            break

    return tags_to_remove, random_tags
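A hypothetical call with annotations as plain dicts (integer ids are assumed dense, since the random-tag step indexes `range(len(tags_to_user))`):

annotations = [
    {'user': 0, 'tag': 0}, {'user': 0, 'tag': 1},
    {'user': 1, 'tag': 1}, {'user': 1, 'tag': 2},
]
to_remove, random_tags = user_tag_pairs_to_filter([0], annotations,
                                                  perc_tags=.5)
#Tag 1 is the only tag shared by two users, so it is the sole removal
#candidate for user 0; tag 2 is untouched by user 0 and can come back
#as a random tag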
    def test_double_occurrence_index(self):
        no_impact = 1

        a1 = data_parser.to_json(1, no_impact, 1, no_impact)
        a2 = data_parser.to_json(1, no_impact, 2, no_impact)
        a3 = data_parser.to_json(1, no_impact, 1, no_impact)
        a4 = data_parser.to_json(2, no_impact, 2, no_impact)
        a5 = data_parser.to_json(2, no_impact, 3, no_impact)

        from_to, inv = create_double_occurrence_index([a1, a2, a3, a4, a5],
                                                      'user', 'tag')
        self.assertEqual(from_to[1], set([1, 2]))
        self.assertEqual(from_to[2], set([2, 3]))

        self.assertEqual(inv[1], set([1]))
        self.assertEqual(inv[2], set([1, 2]))
        self.assertEqual(inv[3], set([2]))
Example #7
0
def iedge_from_annotations(annotation_it, use=1, return_sink=True):
    '''
    Returns the edge list for the navigational graph.
    
    Arguments
    ---------
    annotation_it: iterator
        Iterator to annotations to use
    use: int {1, 2}
        Indicates whether to use items or users:
            1: Items
            2: Users
    return_sink: bool (defaults to True)
        Tells whether to return tag to sink edges
    '''
    choices = {1: 'item', 2: 'user'}
    dest = choices[use]

    tag_index, sink_index = create_double_occurrence_index(
        annotation_it, 'tag', dest)
    return iedge_from_indexes(tag_index, sink_index, return_sink)
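`iedge_from_indexes` is not shown in these examples. A hypothetical sketch of the edge construction, where the offset scheme for sink ids is an assumption rather than the project's confirmed layout:

def iedge_from_indexes(tag_index, sink_index, return_sink=True):
    #Sketch only: one directed edge per tag -> sink co-occurrence,
    #with sink ids shifted past the tag id space so the two node
    #sets do not collide; return_sink=False omits these edges
    num_tags = len(tag_index)
    edges = []
    if return_sink:
        for tag in tag_index:
            for sink in tag_index[tag]:
                edges.append((tag, num_tags + sink))
    return edges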
Example #9
def get_user_item_pairs_to_filter(users_to_consider,
                                  annotations,
                                  perc_items=.1):
    '''
    Gets user-item pairs to filter. A percentage (`perc_items`) of each
    user's items is filtered.

    The code guarantees that we do not delete items from the trace
    completely; that is, while removing items for users we never leave an
    item annotated by zero users. Thus, this code does not guarantee that
    exactly `perc_items` items will be removed per user.
    '''

    user_to_items, item_to_users = create_double_occurrence_index(
        annotations, 'user', 'item')

    user_item_pairs_to_filter = {}
    for user in users_to_consider:

        #num items to remove for this user
        num_item = int(perc_items * len(user_to_items[user]))

        #Generate random candidates
        user_items = [item for item in user_to_items[user]]
        shuffle(user_items)  #in place shuffle

        to_remove = []
        for item in user_items[:num_item]:
            if len(item_to_users[item]) > 1:  #at least one user left
                item_to_users[item].remove(user)
                to_remove.append(item)

        user_item_pairs_to_filter[user] = to_remove

    return user_item_pairs_to_filter
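A hypothetical application of the returned pairs, hiding the selected annotations from a trace (`users` and `annotations` are assumed inputs):

pairs = get_user_item_pairs_to_filter(users, annotations, perc_items=.1)
hidden = dict((user, set(items)) for user, items in pairs.items())
filtered = [annot for annot in annotations
            if annot['item'] not in hidden.get(annot['user'], set())]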
Example #11
def compute_for_user(database, table, user, relevant, annotated, smooth_func,
                     lambda_, user_profile_size, out_folder):
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Relevant items by user are left out with this query
        query = {
            '$or': [{
                'user': {
                    '$ne': user
                }
            }, {
                'item': {
                    '$nin': relevant
                }
            }]
        }

        #Probability estimator
        est = SmoothEstimator(smooth_func,
                              lambda_,
                              reader.iterate(query=query),
                              user_profile_size=user_profile_size)
        value_calc = value_calculator.ValueCalculator(est)

        fname = 'user_%d' % user
        user_folder = os.path.join(out_folder, fname)
        os.mkdir(user_folder)

        #Initial information
        with io.open(os.path.join(user_folder, 'info'), 'w') as info:
            info.write(u'#UID: %d\n' % user)

            relevant_str = ' '.join([str(i) for i in relevant])
            annotated_str = ' '.join([str(i) for i in annotated])

            info.write(u'# %d relevant  items: %s\n' %
                       (len(relevant), relevant_str))
            info.write(u'# %d annotated items: %s\n' %
                       (len(annotated), annotated_str))

        #Create Graph
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(reader.iterate(query = query),
                                                         'tag', 'item')

        create_graph(tag_to_item, item_to_tag, user_folder)

        #Items to consider <-> Gamma items
        annotated_set = set(annotated)
        iestimates = value_calc.item_value(user)

        #Filter top 10
        top_vals = iestimates.argsort()
        items_to_consider = set()
        for item in top_vals:
            if item in annotated_set:
                continue

            items_to_consider.add(item)
            if len(items_to_consider) == 10:
                break

        compute_tag_values(est, value_calc, tag_to_item, user, user_folder,
                           np.array([i for i in items_to_consider]))

        with io.open(os.path.join(user_folder, 'relevant_item.tags'),
                     'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in relevant:
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' % (item, tag))
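One caveat on the "#Filter top 10" step above: `np.argsort` sorts ascending, so iterating `top_vals` from the front visits the lowest-valued items first. Whether that is intended depends on `ValueCalculator`'s sign convention, which is not shown; if a larger `item_value` means better, the top items sit at the end:

import numpy as np

vals = np.array([0.3, 0.9, 0.1])
print(vals.argsort())        #[2 0 1] -> indexes in ascending value order
print(vals.argsort()[::-1])  #[1 0 2] -> descending, i.e. best first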