Example #1
0
def get_baselines(annot_filter, reader, user_to_tags):
    '''
    Computes two baselines over the filtered annotations.

    Returns
    -------
    idf: dict
        Maps each tag to 1.0 / frequency (inverse popularity).
    overlap: dict
        Maps (user, tag) to the number of occurrences of the tag on items
        the user annotated, counted only when the user never used the tag.
    '''
    #Each annotations() call builds a fresh generator over the reader,
    #since the previous one was consumed by the index creation.
    annotations = annot_filter.annotations(reader.iterate())
    user_to_item = create_occurrence_index(annotations, 'user', 'item')
    
    annotations = annot_filter.annotations(reader.iterate())
    item_to_tags = create_occurrence_index(annotations, 'item', 'tag')
    
    overlap = {}
    for user in user_to_tags:
        #BUG FIX: iterate over the current user's own item set; the
        #original looped over the KEYS of user_to_item (user ids) and
        #then indexed item_to_tags with user ids.
        for item in user_to_item.get(user, ()):
            for tag in item_to_tags[item]:
                if (user, tag) not in overlap:
                    overlap[user, tag] = 0
                    
                if tag not in user_to_tags[user]:
                    overlap[user, tag] += 1
    
    #Count raw tag frequency, then invert it
    idf = {}
    annotations = annot_filter.annotations(reader.iterate())
    for annot in annotations:
        tag = annot['tag']
        if tag not in idf:
            idf[tag] = 0
            
        idf[tag] += 1
    
    for tag in idf.keys():
        idf[tag] = 1.0 / idf[tag]
    
    return idf, overlap
Example #2
0
def get_baselines(annot_filter, reader, user_to_tags):
    '''
    Builds baseline statistics from the filtered annotation stream:
    an inverse-frequency score per tag (1.0 / count) and, for every
    (user, tag) pair, how often the tag annotates one of the user's
    items while the user himself never applied that tag.
    '''
    #annotations() must be re-created before each index: it is a generator
    annotations = annot_filter.annotations(reader.iterate())
    user_to_item = create_occurrence_index(annotations, 'user', 'item')
    
    annotations = annot_filter.annotations(reader.iterate())
    item_to_tags = create_occurrence_index(annotations, 'item', 'tag')
    
    overlap = {}
    for user in user_to_tags:
        #BUG FIX: the original iterated `for item in user_to_item`, which
        #yields user ids (the dict keys), not the user's items.
        for item in user_to_item.get(user, ()):
            for tag in item_to_tags[item]:
                pair = (user, tag)
                if pair not in overlap:
                    overlap[pair] = 0
                    
                if tag not in user_to_tags[user]:
                    overlap[pair] += 1
    
    #Tag popularity, inverted afterwards
    idf = {}
    annotations = annot_filter.annotations(reader.iterate())
    for annot in annotations:
        tag = annot['tag']
        if tag not in idf:
            idf[tag] = 0
            
        idf[tag] += 1
    
    for tag in idf.keys():
        idf[tag] = 1.0 / idf[tag]
    
    return idf, overlap
Example #3
0
def fetch_tags_and_items(reader, min_tag_freq=1):
    '''
    This method retrieves an array of every item id, another one for 
    every tag id and a dict mapping tag ids to the items ids annotated
    by every tag. We also return the popularity of each tag.
    
    Arguments
    ---------
    reader: `AnnotReader`
        reader which connects to DB
        
    min_tag_freq: int
        Indicates that we should ignore tags with a frequency lower than
        this argument.
    '''
    items = set()

    #First pass: collect item ids and count how often each tag occurs
    tag_pop = defaultdict(int)
    for annotation in reader.iterate():
        items.add(annotation['item'])
        tag_pop[annotation['tag']] += 1

    #Second pass: keep only sufficiently frequent tags (-1 disables filter)
    tags = []
    tag_to_item = {}
    occurrence = create_occurrence_index(reader.iterate(), 'tag', 'item')
    for tag in occurrence:
        if min_tag_freq == -1 or tag_pop[tag] >= min_tag_freq:
            tags.append(tag)
            tag_to_item[tag] = np.array(list(occurrence[tag]))

    item_array = np.arange(len(items))
    tag_array = np.array(sorted(tags), dtype='int64')
    return item_array, tag_array, tag_to_item, tag_pop
Example #4
0
def fetch_tags_and_items(reader, min_tag_freq=1):
    '''
    This method retrieves an array of every item id, another one for 
    every tag id and a dict mapping tag ids to the items ids annotated
    by every tag. We also return the popularity of each tag.
    
    Arguments
    ---------
    reader: `AnnotReader`
        reader which connects to DB
        
    min_tag_freq: int
        Indicates that we should ignore tags with a frequency lower than
        this argument.
    '''
    item_ids = set()

    #Filter some very infrequent tags?
    tag_pop = defaultdict(int)
    for row in reader.iterate():
        item_ids.add(row['item'])
        tag_pop[row['tag']] += 1

    #A tag survives if filtering is off (-1) or it is popular enough
    keep = lambda t: min_tag_freq == -1 or tag_pop[t] >= min_tag_freq

    occurrence = create_occurrence_index(reader.iterate(), 'tag', 'item')
    tags = [t for t in occurrence if keep(t)]
    tag_to_item = dict((t, np.array([i for i in occurrence[t]]))
                       for t in tags)

    return (np.arange(len(item_ids)),
            np.array(sorted(tags), dtype='int64'),
            tag_to_item,
            tag_pop)
Example #5
0
def main(library_thing_annotations_fpath,
         output_folder,
         num_users=20,
         perc_items=.1,
         estimator='lda',
         rand_seed=None):
    '''
    Hides a fraction of items from the most popular users, fits the chosen
    probability estimator on the remaining annotations and, for each user,
    saves P(i|u) over the items the user has NOT annotated, together with
    the hidden items and the gamma item ids.
    '''
    seed(rand_seed)

    #Basic asserts for the folder: it must exist and be empty
    assert os.path.isdir(output_folder)
    assert len(os.listdir(output_folder)) == 0

    #Load LT file
    base_annotations, user_ids, item_ids, tag_ids = \
            create_annotations(library_thing_annotations_fpath)

    #Get most popular users (by annotation count)
    user_pop = np.zeros(len(user_ids))
    for annot in base_annotations:
        user_pop[annot['user']] += 1
    users_to_consider = user_pop.argsort()[::-1][:num_users]

    user_item_pairs_to_filter = \
            get_user_item_pairs_to_filter(users_to_consider,
                    base_annotations)

    #Create estimator
    filtered_annotations = FilteredAnnotations(user_item_pairs_to_filter)
    annotations = filtered_annotations.annotations(base_annotations)
    if estimator == 'smooth':
        est = create_smooth_estimator(annotations)
    elif estimator == 'lda':
        est = create_lda_estimator(annotations, len(item_ids), len(tag_ids))
    else:
        raise Exception('Unknown estimator, please choose from {lda, smooth}')

    #Run experiment! (a fresh generator is needed, the last was consumed)
    annotations = filtered_annotations.annotations(base_annotations)
    user_to_item = create_occurrence_index(annotations, 'user', 'item')

    for user in users_to_consider:
        #BUG FIX: membership must be tested against the current USER's
        #item set; the original indexed user_to_item with `item`.
        gamma_items = [item for item in xrange(len(item_ids)) \
                                    if item not in user_to_item[user]]

        probs_i_given_u = est.prob_items_given_user(user,
                                                    np.asarray(gamma_items))

        piu_fpath = os.path.join(output_folder, 'probs-user-%d.dat' % user)
        np.savetxt(piu_fpath, probs_i_given_u)

        hidden_fpath = os.path.join(output_folder,
                                    'hidden-items-for-user-%d.dat' % user)
        np.savetxt(hidden_fpath, user_item_pairs_to_filter[user])

        item_ids_fpath = os.path.join(output_folder,
                                      'gamma-item-ids-user-%d.dat' % user)
        np.savetxt(item_ids_fpath, gamma_items)
def main(library_thing_annotations_fpath, output_folder, 
         num_users=20, perc_items=.1, estimator='lda', rand_seed=None):
    '''
    Hides item/user pairs for the most popular users, trains the selected
    estimator on the filtered annotations, and writes per-user files with
    P(i|u) over unannotated items, the hidden items, and the item ids.
    '''
    seed(rand_seed)
    
    #Basic asserts for the folder: it must exist and be empty
    assert os.path.isdir(output_folder)
    assert len(os.listdir(output_folder)) == 0
    
    #Load LT file
    base_annotations, user_ids, item_ids, tag_ids = \
            create_annotations(library_thing_annotations_fpath)

    #Get most popular users (by annotation count)
    user_pop = np.zeros(len(user_ids))
    for annot in base_annotations:
        user_pop[annot['user']] += 1
    users_to_consider = user_pop.argsort()[::-1][:num_users]

    user_item_pairs_to_filter = \
            get_user_item_pairs_to_filter(users_to_consider, 
                    base_annotations)

    #Create estimator
    filtered_annotations = FilteredAnnotations(user_item_pairs_to_filter)
    annotations = filtered_annotations.annotations(base_annotations)
    if estimator == 'smooth':
        est = create_smooth_estimator(annotations)
    elif estimator == 'lda':
        est = create_lda_estimator(annotations, len(item_ids), len(tag_ids))
    else:
        raise Exception('Unknown estimator, please choose from {lda, smooth}')

    #Run experiment! (re-create the generator, the previous was consumed)
    annotations = filtered_annotations.annotations(base_annotations)
    user_to_item = create_occurrence_index(annotations, 'user', 'item')
    
    for user in users_to_consider:
        #BUG FIX: the original tested `item not in user_to_item[item]`,
        #indexing the user->items dict with an item id.
        gamma_items = [item for item in xrange(len(item_ids)) \
                                    if item not in user_to_item[user]]

        probs_i_given_u = est.prob_items_given_user(user, 
                np.asarray(gamma_items))

        piu_fpath = os.path.join(output_folder, 'probs-user-%d.dat' % user)
        np.savetxt(piu_fpath, probs_i_given_u)

        hidden_fpath = os.path.join(output_folder, 
                'hidden-items-for-user-%d.dat' % user)
        np.savetxt(hidden_fpath, user_item_pairs_to_filter[user])
        
        item_ids_fpath = os.path.join(output_folder,
                'gamma-item-ids-user-%d.dat' % user)
        np.savetxt(item_ids_fpath, gamma_items)
def main(library_thing_annotations_fpath,
         output_folder,
         num_users=20,
         perc_tags=.1,
         estimator='lda',
         num_random_tags=100,
         rand_seed=None):
    '''
    Hides a fraction of tags from the most popular users, trains the
    selected estimator on the filtered annotations and runs the tag-value
    experiment (run_one_user) for each of those users.
    '''
    seed(rand_seed)

    #Basic asserts for the folder: it must exist and be empty
    assert os.path.isdir(output_folder)
    assert len(os.listdir(output_folder)) == 0

    #Load LT file
    base_annotations, user_ids, item_ids, tag_ids = \
            create_annotations(library_thing_annotations_fpath)

    #Get most popular users (by annotation count)
    user_pop = np.zeros(len(user_ids))
    for annot in base_annotations:
        user_pop[annot['user']] += 1
    users_to_consider = user_pop.argsort()[::-1][:num_users]

    #Get user tag pairs to filter and random tags
    user_to_hidden_tags, random_tags = \
            user_tag_pairs_to_filter(users_to_consider, base_annotations,
                                     perc_tags)

    #Create estimator
    filtered_annotations = FilteredAnnotations(user_to_hidden_tags)
    annotations = filtered_annotations.annotations(base_annotations)
    if estimator == 'smooth':
        est = create_smooth_estimator(annotations)
    elif estimator == 'lda':
        est = create_lda_estimator(annotations, len(item_ids), len(tag_ids))
    else:
        raise Exception('Unknown estimator, please choose from {lda, smooth}')

    #This next line is needed to create a new generator
    annotations = filtered_annotations.annotations(base_annotations)
    value_calculator = ValueCalculator(est, annotations)

    #Run experiment!
    annotations = filtered_annotations.annotations(base_annotations)
    user_to_item = create_occurrence_index(annotations, 'user', 'item')
    for user in users_to_consider:
        #BUG FIX: test membership against the current user's item set;
        #the original indexed user_to_item with `item`.
        gamma_items = [item for item in xrange(len(item_ids)) \
                                    if item not in user_to_item[user]]
        tags_hidden = user_to_hidden_tags[user]
        run_one_user(user, value_calculator, gamma_items, tags_hidden, \
                random_tags, output_folder)
 def test_occurence_index_user_to_item(self):
     '''Duplicated (user, item) pairs must collapse into one set entry.'''
     #Value for the fields which have no impact on this test
     no_impact = 1

     annots = [data_parser.to_json(1, 1, no_impact, no_impact),
               data_parser.to_json(1, 2, no_impact, no_impact),
               data_parser.to_json(1, 1, no_impact, no_impact),
               data_parser.to_json(2, 2, no_impact, no_impact),
               data_parser.to_json(2, 3, no_impact, no_impact)]

     index = create_occurrence_index(annots, 'user', 'item')
     self.assertEqual(index[1], set([1, 2]))
     self.assertEqual(index[2], set([2, 3]))
    def test_occurence_index_user_to_item(self):
        '''The index must map each user id to the set of its item ids.'''
        #Not the best of names, but we attribute this to fields
        #which have no impact on the test.
        no_impact = 1

        user_item_pairs = [(1, 1), (1, 2), (1, 1), (2, 2), (2, 3)]
        annots = [data_parser.to_json(u, i, no_impact, no_impact)
                  for u, i in user_item_pairs]

        index = create_occurrence_index(annots, 'user', 'item')
        self.assertEqual(index[1], set([1, 2]))
        self.assertEqual(index[2], set([2, 3]))
def main(library_thing_annotations_fpath, output_folder, 
         num_users=20, perc_tags=.1, estimator='lda', num_random_tags=100, 
         rand_seed=None):
    '''
    Hides a percentage of tags for the most popular users, fits the
    requested estimator on the remaining annotations and evaluates each
    user through run_one_user.
    '''
    seed(rand_seed)
    
    #Basic asserts for the folder: it must exist and be empty
    assert os.path.isdir(output_folder)
    assert len(os.listdir(output_folder)) == 0
    
    #Load LT file
    base_annotations, user_ids, item_ids, tag_ids = \
            create_annotations(library_thing_annotations_fpath)

    #Get most popular users (by annotation count)
    user_pop = np.zeros(len(user_ids))
    for annot in base_annotations:
        user_pop[annot['user']] += 1
    users_to_consider = user_pop.argsort()[::-1][:num_users]

    #Get user tag pairs to filter and random tags
    user_to_hidden_tags, random_tags = \
            user_tag_pairs_to_filter(users_to_consider, base_annotations, 
                                     perc_tags)

    #Create estimator
    filtered_annotations = FilteredAnnotations(user_to_hidden_tags)
    annotations = filtered_annotations.annotations(base_annotations)
    if estimator == 'smooth':
        est = create_smooth_estimator(annotations)
    elif estimator == 'lda':
        est = create_lda_estimator(annotations, len(item_ids), len(tag_ids))
    else:
        raise Exception('Unknown estimator, please choose from {lda, smooth}')

    #This next line is needed to create a new generator
    annotations = filtered_annotations.annotations(base_annotations)
    value_calculator = ValueCalculator(est, annotations)
    
    #Run experiment!
    annotations = filtered_annotations.annotations(base_annotations)
    user_to_item = create_occurrence_index(annotations, 'user', 'item')
    for user in users_to_consider:
        #BUG FIX: the original wrote `user_to_item[item]`, indexing the
        #user->items dict with an item id instead of the user id.
        gamma_items = [item for item in xrange(len(item_ids)) \
                                    if item not in user_to_item[user]]
        tags_hidden = user_to_hidden_tags[user]
        run_one_user(user, value_calculator, gamma_items, tags_hidden, \
                random_tags, output_folder)
Example #11
0
    def generator():
        '''Yields one parameter tuple per user holding 10+ items.'''
        with AnnotReader(database) as reader:
            reader.change_table(table)
            user_items = index_creator.create_occurrence_index(
                reader.iterate(), 'user', 'item')

            has_enough = lambda u: len(user_items[u]) >= 10
            for user in ifilter(has_enough, user_items.iterkeys()):
                items = list(user_items[user])
                split_at = len(items) // 2

                #First half is the relevant set, second half the annotated
                yield database, table, user, items[:split_at], \
                      items[split_at:], smooth_func, lambda_, \
                      user_profile_size, out_folder
Example #12
0
    def generator():
        '''Yields one parameter tuple per user holding 30+ items.'''
        with AnnotReader(database) as reader:
            reader.change_table(table)
            user_items = index_creator.create_occurrence_index(
                reader.iterate(), 'user', 'item')

            enough = lambda u: len(user_items[u]) >= 30
            for user in ifilter(enough, user_items.iterkeys()):
                items = list(user_items[user])

                #Last num_relevant items are relevant, the rest annotated
                cut_point = len(items) - num_relevant
                relevant = items[cut_point:]
                annotated = items[:cut_point]
                yield database, table, user, relevant, annotated, \
                      smooth_func, lambda_, user_profile_size, out_folder
Example #13
0
def run_exp(user_validation_tags, user_test_tags, user_test_items, est, 
            annot_filter, reader):
    '''
    For every valid user and every tag of that user that is not a
    validation tag, prints the precision/recall of the tag's item set
    against the user's test items and whether the tag was hidden.

    Output columns: '#user', 'tag', 'precision', 'recall', 'hidden'
    '''
    #Tags to evaluate per user: every tag minus the validation tags.
    #The script focuses on test tags.
    user_to_tags = {}
    for user in est.get_valid_users():
        user_to_tags[user] = [tag for tag in est.tags_for_user(user)
                              if tag not in user_validation_tags[user]]
    
    annotations = annot_filter.annotations(reader.iterate())
    tag_to_items = create_occurrence_index(annotations, 'tag', 'item')
    
    print('#user', 'tag', 'precision', 'recall', 'hidden')
    for user in est.get_valid_users():
        relevant = user_test_items[user]
        for tag in user_to_tags[user]:
            hidden = tag in user_test_tags[user]
            
            #A tag may annotate no item after filtering -- treat as empty
            retrieved = tag_to_items.get(tag, set())
            intersect = retrieved.intersection(relevant)
            
            #BUG FIX: force true division -- this codebase targets
            #Python 2 (it uses xrange), where len()/len() truncates to
            #an int -- and guard against empty sets.
            if retrieved:
                precision = float(len(intersect)) / len(retrieved)
            else:
                precision = 0.0
            if relevant:
                recall = float(len(intersect)) / len(relevant)
            else:
                recall = 0.0
            
            print(user, tag, precision, recall, hidden)