Code example #1
0
def real_main(database, table, smooth_func, lambda_,
              out_folder):
    '''
    Builds the tag/item graph, computes tag popularity and tag values
    for every annotation in `table`, writing the results to files
    inside `out_folder`.

    Arguments
    ---------
    database: str
        database with the annotations
    table: str
        table inside the database to read from
    smooth_func, lambda_:
        smoothing function and parameter forwarded to
        `compute_tag_values`
    out_folder: str
        existing folder where output files are written
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Create Graph
        create_graph(reader.iterate(), out_folder)

        #Compute popularity
        tag_pop = collections.defaultdict(int)
        for annotation in reader.iterate():
            tag = annotation['tag']
            tag_pop[tag] += 1

        #Compute tag value
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(reader.iterate(),
                                                         'tag', 'item')
        compute_tag_values(smooth_func, lambda_,
                           reader.iterate(), tag_to_item, tag_pop, out_folder)

        with io.open(os.path.join(out_folder, 'relevant_item.tags'), 'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in item_to_tag:
                #BUGFIX: was `item_to_tag[tag]`, indexing by the inner
                #loop variable (stale from the popularity loop) instead
                #of the current item.
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' %(item, tag))
Code example #2
0
File: DumpProbs.py  Project: flaviovdf/tag_assess
def main(database, table, smooth_func, lambda_, min_tag_freq):
    '''
    Prints, for every tag with popularity >= `min_tag_freq` (or for all
    tags when `min_tag_freq` is negative), the renormalized probability
    of each item given the tag, one dict per line.

    Arguments
    ---------
    database: str
        database with the annotations
    table: str
        table inside the database to read from
    smooth_func, lambda_:
        smoothing function and parameter for the `SmoothEstimator`
    min_tag_freq: int
        minimum tag popularity; a negative value means every tag
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Builds value calculator
        estimator = SmoothEstimator(smooth_func, lambda_, reader.iterate())
        calculator = ValueCalculator(estimator)

        #Determine tags which will be considered
        tags_to_consider = []
        if min_tag_freq < 0: #All tags
            tags_to_consider = range(estimator.num_tags())
        else:
            counter = Counter(annot['tag'] for annot in reader.iterate())
            for tag, pop in counter.iteritems():
                if pop >= min_tag_freq:
                    tags_to_consider.append(tag)

        #Dumps probabilities
        #BUGFIX: removed dead code - a `connection`/`database` pair that
        #was always None (the rebinding also clobbered the `database`
        #parameter) and a try/finally whose disconnect() could never run.
        items = np.arange(estimator.num_items())
        for tag in tags_to_consider:
            v_prob_it = calculator.rnorm_prob_items_given_tag(tag, items)
            for item in xrange(len(v_prob_it)):
                prob = float(v_prob_it[item])
                print({'tag':tag, 'item':item, 'prob_it':prob})
Code example #3
0
def main(database, table):
    '''Prints every annotation in `table` as CSV: tag, item, user, date.'''
    with AnnotReader(database) as reader:
        reader.change_table(table)
        print('#tag', 'item', 'user', 'date', sep=',')
        for annotation in reader.iterate():
            #Convert the stored epoch timestamp to local wall-clock time
            date_str = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(annotation['date']))
            print(annotation['tag'], annotation['item'], annotation['user'],
                  date_str, sep=',')
Code example #4
0
def real_main(database, table, smooth_func, lambda_, user):
    '''Prints the estimated value of every item for the given user.'''
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Train the estimator on every annotation in the table
        estimator = SmoothEstimator(smooth_func, lambda_, reader.iterate())
        calculator = value_calculator.ValueCalculator(estimator)

        for item, value in calculator.item_value(user).iteritems():
            print(item, value)
Code example #5
0
def compute_for_user(database, table, user, relevant, annotated,
                    smooth_func, lambda_, user_profile_size, out_folder):
    '''
    Computes personalized item probabilities and tag values for one
    user and writes them to ``out_folder/user_<uid>``.

    The user's relevant items are excluded from the estimator's
    training data (see `query` below) so the estimates are not biased
    by the items being evaluated.

    Output files (inside the per-user folder):
      * ``info`` - user id plus the relevant/annotated item lists
      * ``v_piu.dat`` - P(item | user) over the relevant items
      * ``v_dkl.dat`` - personalized value for every tag
      * ``v_pitu_tag_*_top_*.dat`` / ``..._bottom_*.dat`` -
        P(item | tag, user) for the five lowest- and five
        highest-ranked tags (NOTE(review): whether "top" means most or
        least valuable depends on the sign convention of
        `tag_value_personalized` - confirm against that function)

    Raises OSError if ``user_<uid>`` already exists (``os.mkdir``).
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)
        #Relevant items by user are left out with this query
        query = {'$or' : [
                          { 'user':{'$ne'  : user} }, 
                          { 'item':{'$nin' : relevant} }
                         ]
                }
        
        
        #Probability estimator
        est = SmoothEstimator(smooth_func, lambda_, reader.iterate(query=query),
                              user_profile_size = user_profile_size)
        value_calc = value_calculator.ValueCalculator(est)
        
        fname = 'user_%d' % user
        user_folder = os.path.join(out_folder, fname)
        os.mkdir(user_folder)
        
        #Initial information
        with open(os.path.join(user_folder, 'info'), 'w') as info:
            print('#UID: %d' %user, file=info)
            
            relevant_str = ' '.join([str(i) for i in relevant])
            annotated_str = ' '.join([str(i) for i in annotated])
            
            print('#%d relevant: %s' %(len(relevant), str(relevant_str)), 
                  file=info)
            print('#%d annotated: %s' %(len(annotated), str(annotated_str)), 
                  file=info)
        
        items = np.array(relevant, dtype='l')
        #Renormalized P(item | user) over the relevant items only
        v_piu = value_calc.rnorm_prob_items_given_user(user, items)
        #Personalized tag value, one entry per tag
        v_dkl = value_calc.tag_value_personalized(user, gamma_items=items)
        
        #argsort is ascending: the first five indices hold the smallest
        #values, the last five the largest
        v_dkl_argsort = v_dkl.argsort()
        top_5_tags = v_dkl_argsort[:5]
        bottom_5_tags = v_dkl_argsort[len(v_dkl) - 5:]
        
        write_points_file(v_piu, os.path.join(user_folder, 'v_piu.dat'))
        write_points_file(v_dkl, os.path.join(user_folder, 'v_dkl.dat'))
        
        #Dump P(item | tag, user) curves for the extreme tags
        for i, tag in enumerate(top_5_tags):
            v_pitu = value_calc.rnorm_prob_items_given_user_tag(user, tag, items)
            write_points_file(v_pitu, os.path.join(user_folder, 
                                                   'v_pitu_tag_%d_top_%d.dat'
                                                   % (tag, i + 1)))
        for i, tag in enumerate(bottom_5_tags):
            v_pitu = value_calc.rnorm_prob_items_given_user_tag(user, tag, items)
            write_points_file(v_pitu, os.path.join(user_folder, 
                                                   'v_pitu_tag_%d_bottom_%d.dat' 
                                                   % (tag, 5 - i)))
Code example #6
0
def write_good_annots(database, table, new_database, good_items):
    '''
    Copies the annotations whose item is in `good_items` from
    `database`/`table` into the same table of `new_database`, and
    returns the parser's user, item and tag id mappings.
    '''
    with AnnotReader(database) as reader, AnnotWriter(new_database) as writer:
        reader.change_table(table)
        writer.create_table(table)

        #Only annotations whose item survived the filter are copied
        annot_iter = reader.iterate(query={'item': {'$in': good_items}})

        parser = data_parser.Parser()
        for annotation in parser.iparse(annot_iter, data_parser.json_parser):
            writer.append_row(annotation)

    return parser.user_ids, parser.item_ids, parser.tag_ids
Code example #7
0
def main(database,
         table,
         smooth_func,
         lambda_,
         alpha,
         output_folder,
         min_tag_freq=1):
    '''
    Computes tag values against a synthetic zipf-distributed seeker
    profile and dumps three files into `output_folder`: tag values,
    tag/item pairs, and per-item probabilities.
    '''
    assert os.path.isdir(
        output_folder), '%s is not a directory' % output_folder

    #Output file paths
    tag_value_fpath = os.path.join(output_folder, 'tag.values')
    item_tag_fpath = os.path.join(output_folder, 'item_tag.pairs')
    item_probs_fpath = os.path.join(output_folder, 'item.probs')

    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Determine the items annotated by each tag and array of all items
        items_array, tags_array, tag_to_item, tag_pop = \
                fetch_tags_and_items(reader, min_tag_freq)

        #Average ten zipf draws, then normalize into a probability vector
        n_items = items_array.shape[0]
        n_dists = 10
        seeker_profile = np.zeros(n_items, dtype='float64')
        for _ in xrange(n_dists):
            seeker_profile += np.random.zipf(alpha, n_items)
        seeker_profile /= n_dists
        seeker_profile /= seeker_profile.sum()

        #Tag Value
        estimator = SmoothEstimator(smooth_func, lambda_, reader.iterate())
        with open(tag_value_fpath, 'w') as tag_value_file:
            tag_values(estimator, tags_array, items_array, tag_to_item,
                       seeker_profile, tag_pop, tag_value_file)

        #Item tag pairs
        with open(item_tag_fpath, 'w') as item_tag_file:
            print('#tag_id', 'item_id', file=item_tag_file)
            for tag_id, item_ids in tag_to_item.items():
                for item_id in item_ids:
                    print(tag_id, item_id, file=item_tag_file)

        #Item probabilities under the synthetic profile
        with open(item_probs_fpath, 'w') as item_probs_file:
            print('#item_id', 'prob', file=item_probs_file)
            for item_id, prob in enumerate(seeker_profile):
                print(item_id, prob, file=item_probs_file)
Code example #8
0
    def generator():
        '''Yields one parameter tuple per user with at least 10 items.'''
        with AnnotReader(database) as reader:
            reader.change_table(table)
            uitem_idx = index_creator.create_occurrence_index(
                reader.iterate(), 'user', 'item')

            #Only users with ten or more items are considered
            eligible = ifilter(lambda u: len(uitem_idx[u]) >= 10,
                               uitem_idx.iterkeys())
            for user in eligible:
                items = list(uitem_idx[user])
                half = len(items) // 2

                #First half is held out as relevant, second half stays
                #annotated
                yield database, table, user, items[:half], items[half:], \
                      smooth_func, lambda_, user_profile_size, out_folder
Code example #9
0
def determine_good_items(database, table, min_users_per_item):
    '''
    Filters the items used by a minimum number of users.

    Arguments
    ---------
    database: str
        database with the annotations
    table: str
        table inside the database to read from
    min_users_per_item: int
        minimum number of distinct users an item needs

    Returns
    -------
    Sorted list of the item ids annotated by at least
    `min_users_per_item` distinct users.
    '''
    pop_items = defaultdict(set)
    good_items = set()
    with AnnotReader(database) as reader:
        reader.change_table(table)
        for annotation in reader.iterate():
            user = annotation['user']
            item = annotation['item']

            #Once an item is known to be good we stop tracking its user
            #set, freeing the memory it used.
            if item not in good_items:
                pop_items[item].add(user)
                if len(pop_items[item]) >= min_users_per_item:
                    del pop_items[item]
                    good_items.add(item)

    #sorted() already returns a fresh list; the comprehension that
    #wrapped it was a redundant copy.
    return sorted(good_items)
Code example #10
0
def real_main(database, table, out_file, use):
    '''
    Builds the graph edge list from the annotations and writes it,
    preceded by a node/edge-count header, to `out_file`.

    Arguments
    ---------
    database: str
        database with the annotations
    table: str
        table inside the database to read from
    out_file: str
        path of the final edge-list file
    use:
        forwarded to `graph.iedge_from_annotations`
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)
        ntags, nsinks, iedges = \
         graph.iedge_from_annotations(reader.iterate(), use)
        n_nodes = ntags + nsinks

    #BUGFIX: tempfile.mktemp is insecure (its name can be claimed by
    #another process before we open it) and the temporary file was never
    #deleted. mkstemp creates the file atomically; we remove it when done.
    tmp_fd, tmp_fname = tempfile.mkstemp()
    os.close(tmp_fd)
    try:
        #First pass: write edges to the temp file while counting them,
        #since the header needs the edge count before the edges.
        n_edges = 0
        with open(tmp_fname, 'w') as tmp:
            for source, dest in sorted(iedges):
                print(source, dest, file=tmp)
                n_edges += 1

        #Second pass: emit header followed by the buffered edges
        with open(tmp_fname) as tmp:
            with open(out_file, 'w') as out:
                print('#Nodes:  %d'%n_nodes, file=out)
                print('#Edges:  %d'%n_edges, file=out)
                print('#Directed', file=out)
                for line in tmp:
                    print(line[:-1], file=out)
    finally:
        os.remove(tmp_fname)
Code example #11
0
def compute_for_user(database, table, user, relevant, annotated, smooth_func,
                     lambda_, user_profile_size, out_folder):
    '''
    Per-user analysis: trains a smoothed estimator with the user's
    relevant items held out, records the user's info, builds the
    tag/item graph and computes tag values over the user's top ten
    non-annotated items. Everything is written to a freshly created
    ``user_<uid>`` folder inside `out_folder`.
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Relevant items by user are left out with this query
        query = {
            '$or': [{
                'user': {
                    '$ne': user
                }
            }, {
                'item': {
                    '$nin': relevant
                }
            }]
        }

        #Probability estimator
        estimator = SmoothEstimator(smooth_func,
                                    lambda_,
                                    reader.iterate(query=query),
                                    user_profile_size=user_profile_size)
        calculator = value_calculator.ValueCalculator(estimator)

        user_folder = os.path.join(out_folder, 'user_%d' % user)
        os.mkdir(user_folder)

        #Initial information
        info_path = os.path.join(user_folder, 'info')
        with io.open(info_path, 'w') as info:
            info.write(u'#UID: %d\n' % user)

            relevant_str = ' '.join(str(i) for i in relevant)
            annotated_str = ' '.join(str(i) for i in annotated)

            info.write(u'# %d relevant  items: %s\n' %
                       (len(relevant), str(relevant_str)))
            info.write(u'# %d annotated items: %s\n' %
                       (len(annotated), str(annotated_str)))

        #Create Graph
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(
                reader.iterate(query=query), 'tag', 'item')
        create_graph(tag_to_item, item_to_tag, user_folder)

        #Items to consider <-> Gamma items: the first ten items in
        #argsort order that the user has not annotated
        annotated_set = set(annotated)
        iestimates = calculator.item_value(user)

        items_to_consider = set()
        for candidate in iestimates.argsort():
            if candidate in annotated_set:
                continue

            items_to_consider.add(candidate)
            if len(items_to_consider) == 10:
                break

        compute_tag_values(estimator, calculator, tag_to_item, user,
                           user_folder,
                           np.array([i for i in items_to_consider]))

        with io.open(os.path.join(user_folder, 'relevant_item.tags'),
                     'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in relevant:
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' % (item, tag))