def real_main(database, table, smooth_func, lambda_, out_folder):
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Create graph
        create_graph(reader.iterate(), out_folder)

        #Compute tag popularity
        tag_pop = collections.defaultdict(int)
        for annotation in reader.iterate():
            tag = annotation['tag']
            tag_pop[tag] += 1

        #Compute tag value
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(reader.iterate(),
                                                         'tag', 'item')
        compute_tag_values(smooth_func, lambda_, reader.iterate(), tag_to_item,
                           tag_pop, out_folder)

        with io.open(os.path.join(out_folder, 'relevant_item.tags'), 'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in item_to_tag:
                #Iterate the tags of this item (was item_to_tag[tag], a bug)
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' % (item, tag))
def main(database, table, smooth_func, lambda_, min_tag_freq):
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Builds the value calculator
        estimator = SmoothEstimator(smooth_func, lambda_, reader.iterate())
        calculator = ValueCalculator(estimator)

        #Determine which tags will be considered
        tags_to_consider = []
        if min_tag_freq < 0: #All tags
            tags_to_consider = range(estimator.num_tags())
        else:
            counter = Counter(annot['tag'] for annot in reader.iterate())
            for tag, pop in counter.iteritems():
                if pop >= min_tag_freq:
                    tags_to_consider.append(tag)

        #Dumps probabilities
        items = np.arange(estimator.num_items())
        for tag in tags_to_consider:
            v_prob_it = calculator.rnorm_prob_items_given_tag(tag, items)
            for item in xrange(len(v_prob_it)):
                prob = float(v_prob_it[item])
                print({'tag':tag, 'item':item, 'prob_it':prob})
def main(database, table):
    with AnnotReader(database) as reader:
        reader.change_table(table)
        print('#tag', 'item', 'user', 'date', sep=',')
        for row in reader.iterate():
            timestmp = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(row['date']))
            print(row['tag'], row['item'], row['user'], timestmp, sep=',')
def real_main(database, table, smooth_func, lambda_, user):
    with AnnotReader(database) as reader:
        reader.change_table(table)

        est = SmoothEstimator(smooth_func, lambda_, reader.iterate())
        vc = value_calculator.ValueCalculator(est)

        iitem_value = vc.item_value(user)
        for item, item_val in iitem_value.iteritems():
            print(item, item_val)
def compute_for_user(database, table, user, relevant, annotated, smooth_func,
                     lambda_, user_profile_size, out_folder):
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Relevant items by the user are left out with this query
        query = {'$or' : [
                          {'user':{'$ne'  : user}},
                          {'item':{'$nin' : relevant}}
                         ]
                }

        #Probability estimator
        est = SmoothEstimator(smooth_func, lambda_, reader.iterate(query=query),
                              user_profile_size=user_profile_size)
        value_calc = value_calculator.ValueCalculator(est)

        fname = 'user_%d' % user
        user_folder = os.path.join(out_folder, fname)
        os.mkdir(user_folder)

        #Initial information
        with open(os.path.join(user_folder, 'info'), 'w') as info:
            print('#UID: %d' % user, file=info)

            relevant_str = ' '.join([str(i) for i in relevant])
            annotated_str = ' '.join([str(i) for i in annotated])
            print('#%d relevant: %s' % (len(relevant), relevant_str), file=info)
            print('#%d annotated: %s' % (len(annotated), annotated_str),
                  file=info)

        items = np.array(relevant, dtype='l')
        v_piu = value_calc.rnorm_prob_items_given_user(user, items)
        v_dkl = value_calc.tag_value_personalized(user, gamma_items=items)

        v_dkl_argsort = v_dkl.argsort()
        top_5_tags = v_dkl_argsort[:5]
        bottom_5_tags = v_dkl_argsort[len(v_dkl) - 5:]

        write_points_file(v_piu, os.path.join(user_folder, 'v_piu.dat'))
        write_points_file(v_dkl, os.path.join(user_folder, 'v_dkl.dat'))

        for i, tag in enumerate(top_5_tags):
            v_pitu = value_calc.rnorm_prob_items_given_user_tag(user, tag, items)
            write_points_file(v_pitu, os.path.join(user_folder,
                              'v_pitu_tag_%d_top_%d.dat' % (tag, i + 1)))

        for i, tag in enumerate(bottom_5_tags):
            v_pitu = value_calc.rnorm_prob_items_given_user_tag(user, tag, items)
            write_points_file(v_pitu, os.path.join(user_folder,
                              'v_pitu_tag_%d_bottom_%d.dat' % (tag, 5 - i)))
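#`write_points_file` is called above but not defined in this section. A
#minimal sketch, assuming it dumps one "position value" pair per line; the
#repo's actual helper may use a different layout:
def write_points_file(values, fpath):
    '''Writes each (position, value) pair of `values` to `fpath`'''
    with open(fpath, 'w') as points_file:
        print('#POS VALUE', file=points_file)
        for pos, value in enumerate(values):
            print(pos, value, file=points_file)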
def write_good_annots(database, table, new_database, good_items):
    '''Writes new annotations based on filters'''
    with AnnotReader(database) as reader, AnnotWriter(new_database) as writer:
        reader.change_table(table)
        writer.create_table(table)

        iterator = reader.iterate(query={'item': {'$in': good_items}})
        parser = data_parser.Parser()
        iparse = parser.iparse(iterator, data_parser.json_parser)
        for new_annot in iparse:
            writer.append_row(new_annot)

        return parser.user_ids, parser.item_ids, parser.tag_ids
def main(database, table, smooth_func, lambda_, alpha, output_folder,
         min_tag_freq=1):
    assert os.path.isdir(output_folder), '%s is not a directory' % output_folder

    tag_value_fpath = os.path.join(output_folder, 'tag.values')
    item_tag_fpath = os.path.join(output_folder, 'item_tag.pairs')
    item_probs_fpath = os.path.join(output_folder, 'item.probs')

    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Determine the items annotated by each tag and an array of all items
        items_array, tags_array, tag_to_item, tag_pop = \
                fetch_tags_and_items(reader, min_tag_freq)

        #Generates a user profile based on zipf and computes value
        n_items = items_array.shape[0]
        seeker_profile = np.zeros(n_items, dtype='float64')
        n_dists = 10
        for i in xrange(n_dists):
            seeker_profile += np.random.zipf(alpha, n_items)

        #Average it out and transform to probabilities
        seeker_profile /= n_dists
        seeker_profile /= seeker_profile.sum()

        #Tag value
        estimator = SmoothEstimator(smooth_func, lambda_, reader.iterate())
        with open(tag_value_fpath, 'w') as tag_value_file:
            tag_values(estimator, tags_array, items_array, tag_to_item,
                       seeker_profile, tag_pop, tag_value_file)

        #Item tag pairs
        with open(item_tag_fpath, 'w') as item_tag_file:
            print('#tag_id', 'item_id', file=item_tag_file)
            for tag_id in tag_to_item:
                for item_id in tag_to_item[tag_id]:
                    print(tag_id, item_id, file=item_tag_file)

        #Item probabilities from the synthetic profile
        with open(item_probs_fpath, 'w') as item_probs_file:
            print('#item_id', 'prob', file=item_probs_file)
            for item_id, prob in enumerate(seeker_profile):
                print(item_id, prob, file=item_probs_file)
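#The synthetic seeker profile above is the average of ten Zipf samples,
#renormalized to sum to one. A self-contained demonstration of just that
#step; the 1000-item universe and alpha=2.0 are illustrative, not values
#from the scripts:
def demo_seeker_profile(n_items=1000, n_dists=10, alpha=2.0):
    profile = np.zeros(n_items, dtype='float64')
    for _ in xrange(n_dists):
        profile += np.random.zipf(alpha, n_items)
    profile /= n_dists           #average of the n_dists samples
    profile /= profile.sum()     #now a probability distribution over items
    assert abs(profile.sum() - 1.0) < 1e-9
    return profile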
def generator():
    '''Yields parameters for each user'''
    with AnnotReader(database) as reader:
        reader.change_table(table)
        uitem_idx = index_creator.create_occurrence_index(reader.iterate(),
                                                          'user', 'item')

        #Consider only users with at least 10 items
        filt = lambda u: len(uitem_idx[u]) >= 10
        for user in ifilter(filt, uitem_idx.iterkeys()):
            items = [item for item in uitem_idx[user]]
            half = len(items) // 2
            relevant = items[:half]
            annotated = items[half:]

            yield database, table, user, relevant, annotated, \
                  smooth_func, lambda_, user_profile_size, out_folder
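#A sketch of how the parameter generator above might feed `compute_for_user`
#in parallel. This Pool-based driver is an assumption, not code from the
#scripts; it presumes generator() and compute_for_user are in scope:
from multiprocessing import Pool

def run_all(num_procs=4):
    pool = Pool(num_procs)
    for params in generator():
        #Each yielded tuple matches compute_for_user's positional signature
        pool.apply_async(compute_for_user, params)
    pool.close()
    pool.join()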
def determine_good_items(database, table, min_users_per_item):
    '''Filters the items used by a minimum number of users'''
    pop_items = defaultdict(set)
    good_items = set()
    with AnnotReader(database) as reader:
        reader.change_table(table)
        for annotation in reader.iterate():
            user = annotation['user']
            item = annotation['item']

            if item not in good_items:
                pop_items[item].add(user)
                if len(pop_items[item]) >= min_users_per_item:
                    #Free the user set; the item is already known to be good
                    del pop_items[item]
                    good_items.add(item)

    return sorted(good_items)
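#How the two filters compose: find the items annotated by enough distinct
#users, then copy only their annotations to a new database via
#write_good_annots above. The database names and the 5-user threshold are
#illustrative only:
good_items = determine_good_items('annotations', 'bibsonomy', 5)
user_ids, item_ids, tag_ids = write_good_annots('annotations', 'bibsonomy',
                                                'annotations_filtered',
                                                good_items)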
def real_main(database, table, out_file, use):
    with AnnotReader(database) as reader:
        reader.change_table(table)
        ntags, nsinks, iedges = \
                graph.iedge_from_annotations(reader.iterate(), use)
        n_nodes = ntags + nsinks

        #Edges go to a temporary file first, since the edge count for the
        #header is only known after the iterator is consumed. mkstemp is
        #used instead of the insecure mktemp.
        tmp_fd, tmp_fname = tempfile.mkstemp()
        n_edges = 0
        with os.fdopen(tmp_fd, 'w') as tmp:
            for source, dest in sorted(iedges):
                print(source, dest, file=tmp)
                n_edges += 1

        with open(tmp_fname) as tmp:
            with open(out_file, 'w') as out:
                print('#Nodes: %d' % n_nodes, file=out)
                print('#Edges: %d' % n_edges, file=out)
                print('#Directed', file=out)
                for line in tmp:
                    print(line[:-1], file=out)
        os.remove(tmp_fname)
def compute_for_user(database, table, user, relevant, annotated, smooth_func,
                     lambda_, user_profile_size, out_folder):
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Relevant items by the user are left out with this query
        query = {'$or': [
                         {'user': {'$ne':  user}},
                         {'item': {'$nin': relevant}}
                        ]
                }

        #Probability estimator
        est = SmoothEstimator(smooth_func, lambda_, reader.iterate(query=query),
                              user_profile_size=user_profile_size)
        value_calc = value_calculator.ValueCalculator(est)

        fname = 'user_%d' % user
        user_folder = os.path.join(out_folder, fname)
        os.mkdir(user_folder)

        #Initial information
        with io.open(os.path.join(user_folder, 'info'), 'w') as info:
            info.write(u'#UID: %d\n' % user)

            relevant_str = ' '.join([str(i) for i in relevant])
            annotated_str = ' '.join([str(i) for i in annotated])
            info.write(u'# %d relevant items: %s\n' % (len(relevant),
                                                       relevant_str))
            info.write(u'# %d annotated items: %s\n' % (len(annotated),
                                                        annotated_str))

        #Create graph
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(
                    reader.iterate(query=query), 'tag', 'item')
        create_graph(tag_to_item, item_to_tag, user_folder)

        #Items to consider <-> gamma items
        annotated_set = set(annotated)
        iestimates = value_calc.item_value(user)

        #Take the first ten unannotated items in the value ordering
        top_vals = iestimates.argsort()
        items_to_consider = set()
        for item in top_vals:
            if item in annotated_set:
                continue

            items_to_consider.add(item)
            if len(items_to_consider) == 10:
                break

        compute_tag_values(est, value_calc, tag_to_item, user, user_folder,
                           np.array([i for i in items_to_consider]))

        with io.open(os.path.join(user_folder, 'relevant_item.tags'), 'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in relevant:
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' % (item, tag))
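#`compute_tag_values` is called above but not shown in this section (the
#same-named helper in the first script takes different arguments). A minimal
#sketch matching this call site, assuming it scores tags by their
#personalized value over the gamma items and dumps the scores; the filename
#and the unused `est`/`tag_to_item` handling are assumptions:
def compute_tag_values(est, value_calc, tag_to_item, user, user_folder,
                       gamma_items):
    v_dkl = value_calc.tag_value_personalized(user, gamma_items=gamma_items)
    with io.open(os.path.join(user_folder, 'tag.values'), 'w') as values_file:
        values_file.write(u'#TAG VALUE\n')
        for tag, value in enumerate(v_dkl):
            values_file.write(u'%d %.6f\n' % (tag, value))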