Beispiel #1
0
def main():

    # create and load
    ranker = RankerByTags()
    #filename = '../data/rpoland--1000.tagged_graph'
    #filename = '../data/rpoland--2000.tagged_graph'
    #filename = '../data/jcl5m--39370.tagged_graph'    
    #filename = '../data/MIX.tagged_graph'
    #filename = '../data/flickr.tagged_graph'
    #filename = '../data/jcl5m-cuantos.tagged_graph'
    #filename = '../data/flickr_med.tagged_graph'
    #filename = '../data/yt.tagged_graph'
    #filename = '../data/fr.tagged_graph'

    #filename = '../data/yt_nd.tagged_graph'
    filename = '../data/fr_nd.tagged_graph'

    top_tags_size = 20

    ranker.load(filename)
    
        # filter by tag boolean formula
    tag_form = TagBooleanFormula()
#    tag_form.run_tests()
#    and1 = TagBooleanConjunction()
#    and1.addAtom(TagBooleanAtom(True,'fun'))
#    tag_form.addTagAnd(and1)
#    and2 = TagBooleanConjunction()
#    and2.addAtom(TagBooleanAtom(True,'fun'))
#    tag_form.addTagAnd(and2)
    print str(tag_form)
    ranker.filter(tag_form)    

           
    # save new subgraph withou tags.
    #outfilename = filename + '-%s.graph' % str(tag_form)
    outfilename = filename
    #ranker.save(outfilename)
    ranker.save_edges(outfilename + '.edges')
    ranker.save_nwb(outfilename + '.nwb')
    
    # now save graphs of top tags
    tags = Tags(filename)
    top_tags = tags.get_top_tags(top_tags_size)
    #top_tags = map(lambda x: x[0], top_tags)
    #top_tags = ['music', 'funny']
    top_tags = ['blue', 'flower']

    for tag in top_tags:

            # filter by tag boolean formula
        tag_form = TagBooleanFormula()
    #    tag_form.run_tests()
        and1 = TagBooleanConjunction()
        and1.addAtom(TagBooleanAtom(True,tag))
        tag_form.addTagAnd(and1)
    #    and2 = TagBooleanConjunction()
    #    and2.addAtom(TagBooleanAtom(True,'fun'))
    #    tag_form.addTagAnd(and2)
        print str(tag_form)
        ranker.filter(tag_form)    
        # save new subgraph withou tags.
        #outfilename = filename + '-%s.graph' % str(tag_form)
        outfilename = filename
        #ranker.save(outfilename)
        ranker.save_edges(outfilename + '.'+ str(tag_form) + '.edges')
        ranker.save_nwb(outfilename + '.' + str(tag_form) + '.nwb')
    

    # now save graphs of top tags by pairs, ANDed.
    tags = Tags(filename)
    top_tags = tags.get_top_tags(top_tags_size)
    #top_tags = map(lambda x: x[0], top_tags)
    #top_tags = ['music', 'funny']
    top_tags = ['blue', 'flower']

    for tag1, i in zip(top_tags,range(len(top_tags))):
        for tag2 in top_tags[i+1:]:

	    # AND
                # filter by tag boolean formula
            tag_form = TagBooleanFormula()
        #    tag_form.run_tests()
            and1 = TagBooleanConjunction()
            and1.addAtom(TagBooleanAtom(True,tag1))
            and1.addAtom(TagBooleanAtom(True,tag2))
            tag_form.addTagAnd(and1)
#            and2 = TagBooleanConjunction()
#            and2.addAtom(TagBooleanAtom(True,tag2))
#            tag_form.addTagAnd(and2)
            print str(tag_form)
            ranker.filter(tag_form)    
            # save new subgraph withou tags.
            #outfilename = filename + '-%s.graph' % str(tag_form)
            outfilename = filename
            #ranker.save(outfilename)
            ranker.save_edges(outfilename + '.'+ str(tag_form) + '.edges')
            ranker.save_nwb(outfilename + '.' + str(tag_form) + '.nwb')
	    ranker.rank(10)
	    ranker.saveRank(outfilename + '.ranks/' + str(tag_form) + '.graph.rank' )
        
	    # OR
                # filter by tag boolean formula
            tag_form = TagBooleanFormula()
        #    tag_form.run_tests()
            and1 = TagBooleanConjunction()
            and1.addAtom(TagBooleanAtom(True,tag1))
	    tag_form.addTagAnd(and1)
            
            and2 = TagBooleanConjunction()
            and2.addAtom(TagBooleanAtom(True,tag2))
	    tag_form.addTagAnd(and2)
            
#	    and1.addAtom(TagBooleanAtom(True,tag2))
#            tag_form.addTagAnd(and1)
#            and2 = TagBooleanConjunction()
#            and2.addAtom(TagBooleanAtom(True,tag2))
#            tag_form.addTagAnd(and2)
            print str(tag_form)
            ranker.filter(tag_form)    
            # save new subgraph withou tags.
            #outfilename = filename + '-%s.graph' % str(tag_form)
            outfilename = filename
            #ranker.save(outfilename)
            ranker.save_edges(outfilename + '.'+ str(tag_form) + '.edges')
            ranker.save_nwb(outfilename + '.' + str(tag_form) + '.nwb')
	    ranker.rank()
	    ranker.saveRank(outfilename + '.ranks/' + str(tag_form) + '.graph.rank' )        
    
    print 'finish.'
Beispiel #2
0
class EggOMatic:
    
    def __build_map_user_tags(self):
        
        return EggOUserTagranksDict(self.__dataset)
        
#        map = {}
#
#        filename = self.__dataset + '.tagged_graph'
#        f = open(filename, 'r')
#        lines = f.readlines()
#        f.close()        
#        total_tags = []
#        pairs = 0
#        
#        i = 0
#        for line in lines:
#            if i % 10000 == 0:
#                print '%d lines read of %d'%(i,len(lines))
#            i += 1
#            cols = line.split('\t\t')
#            if len(cols) < 3:
#                continue
#            src = cols[0]
#            dst = cols[1]
#            cols[2] = cols[2].strip().lower()
#            tags = cols[2].split('|')
#                        
#            if not src in map:
#                map[src] = copy.deepcopy(tags)
#            else:
#                map[src] += copy.deepcopy(tags)
#
#            if not dst in map:
#                map[dst] = tags
#            else:
#                map[dst] += tags
#                
#            total_tags += tags
#        return map
    
    def __build_map_tag_users(self):
        
        return EggOTagUserDict(self.__dataset)
                
#        map = {}
#        i = 0        
#        for tag, userrank in self.__tag_userrank.iteritems():
#            if i % 1000 == 0:
#                print '%d users read of %d'%(i,len(self.__map_user_tags.keys()))
#            i += 1
#            #print 'user: %s  tags: %s' % (user,tags)
#            users = map(lambda x:x[0], userrank)
#            map[tag] = set(users)
#        return map

    
    def __build_map_tag_userrank(self):
 
        #return self.__load_obj('.tagged_graph.all_tags_ranks')
        return EggOTagUserrankDict(self.__dataset)       
        

    def __read_rank(self):
        return self.__map_tag_userrank['']
        
#        f = open(self.__dataset+'.tagged_graph-.graph.rank', 'r')
#        lines = f.readlines()
#        f.close()
#        return [(l.strip().split(' ')[0],float(l.strip().split(' ')[1])) for l in lines]

    def __read_tags(self):
        f = open(self.__dataset+ '.tagged_graph.ranks/all.tags', 'r')
        lines = f.readlines()
        f.close()
        return map(lambda line:line.strip().split(' ')[0], lines)

    def __read_clusters(self, clust_limit=5):
        clusters = list(self.__tags)
        return clusters

    def __build_map_tag_clusters(self):
        
        map = {}
        map[''] = ['']
        
        for tag in self.__tags:
            map[tag] = [tag]
            
        return map
        

    
    def __dump_obj(self, extension, obj):
        filename = self.__dataset + extension
        f = open(filename, 'w')
        pickle.dump(obj, f)
        f.close()
    
    def __load_obj(self, extension):
        filename = self.__dataset + extension
        f = open(filename, 'r')
        obj = pickle.load(f)
        f.close()
        return obj

    def __build_user_index(self):
        filename = self.__dataset + '.tagged_graph'
        tags_filename = filename + '.ranks/all.tags'
        f = open(tags_filename)
        lines = f.readlines()
        f.close()
        self.__users = set([])
        for line in lines:
                if line.strip() == '':
                    continue
                tag = line.strip().split(' ')[0]
                if len(tag) >= 64:
                    continue
                try: # some tags may be missing if we only rank top_tags
                    u_lines = open(filename + '.ranks/%s.graph.rank' % tag)
                    users = map(lambda line: line.strip().split(' ')[0], u_lines)
                    for user, pos in zip(users,range(1,len(users)+1)):
                        u_file = open(filename + '.ranks/%s.user.ranks' % user, 'aw')
                        u_file.write('%s %d\n' % (tag,pos))
                        u_file.close()
                        self.__users.add(user)
                except:
                    pass
        # sort user better ranks by position.
        users_file = open(filename + '.ranks/all.users','w')
        for user in self.__users:
            lines = open(filename + '.ranks/%s.user.ranks' % user).readlines()
            user_ranks = map(lambda x: tuple(x.strip().split(' ')), filter(lambda x:len(x.strip())>0, lines))
            user_ranks = map(lambda (tag,pos): (tag,int(pos)), user_ranks)
            user_ranks.sort(snd_cmp)
            u_file = open(filename + '.ranks/%s.user.ranks' % user, 'w')
            for tag, pos in user_ranks:
                u_file.write('%s %d\n' % (tag,pos))
            u_file.close()
            users_file.write('%s\n' % user)
        users_file.close()
    
    def __init__(self, dataset, compute_ranks=True, compute_mono_rank=False, build_index=False, max_per_rank=10000000000, top_tags=None):
        
        self.__max_per_rank = max_per_rank        
        self.__dataset = dataset
        self.__ranker = RankerByTags()
        self.__ranker.load(dataset + '.tagged_graph')
        self.__compute_ranks = compute_ranks
        self.__compute_mono_rank = compute_mono_rank

        filename = dataset + '.tagged_graph'            
        if self.__compute_ranks or self.__compute_mono_rank:
            self.__ranker.all_ranks(filename, compute_ranks, compute_mono_rank, top_tags)        
            self.__build_user_index()
    

        print 'loading map tag userrank'
        self.__map_tag_userrank = self.__build_map_tag_userrank()
        print 'loading rank'
        self.__rank = self.__read_rank()
        print 'loading tags'
        self.__tags = self.__read_tags()
        print 'loading map tag users'
        self.__map_tag_users = self.__build_map_tag_users()            
        print 'loading map user tags'
        self.__map_user_tags = self.__build_map_user_tags()            


        if build_index:
            
            
#            print 'saving pickles'
#            print 'saving tag userrank'
#            self.__dump_obj('.eggomatic_map_tag_userrank', self.__map_tag_userrank)
#            print 'saving rank'
#            self.__dump_obj('.eggomatic_rank', self.__rank)
#            print 'saving tags'
#            self.__dump_obj('.eggomatic_tags', self.__tags)
#            print 'saving map tag users'
#            self.__dump_obj('.eggomatic_map_tag_users', self.__map_tag_users)
#            print 'saving map user tags'
#            self.__dump_obj('.tagged_graph.ranks/eggomatic_map.user_tags', self.__map_user_tags)
            pass
        
        else: # used pickled objects

#            print 'loading pickles'
#            print 'loading map user tags'
#            self.__map_user_tags = self.__load_obj('.tagged_graph.ranks/eggomatic_map.user_tags')
            pass
        
        print 'finish EggOMatic initialization.'
        
    def good_tag(self, tag):
        return tag in self.__map_tag_userrank
        #return tag in self.__tags
        
    def best_tag(self, tag):
        return self.__ranker.best_tag(tag)
        
    def tag_weight(self, tag):
        return self.__ranker.tag_weight(tag)
        
    def has_bigger_cluster(self, tag):       
        return False 
#        for cluster in self.__map_tag_clusters[tag]:
#            if len(cluster) > 1:
#                return True
#        return False
        
    def has_many_clusters(self, tag):        
        return len(self.__map_tag_clusters[tag]) > 1
        
    def clusters(self, tag):
        return self.__map_tag_clusters[tag]
        
    def rank_by_tag(self, tag='', use_cluster=False, cluster_number=0):
#        if not tag in self.__tags:
#            return []
        if use_cluster:
            cluster = self.clusters(tag)[cluster_number]
            return self.rank_by_tags(cluster)
        rank = self.__map_tag_userrank[tag][:self.__max_per_rank]
        return rank, self.__merge_rank_and_monolitic([tag]), rank, rank, rank, rank, rank 
    
    def rank_by_tags(self, tags):
        
        if len(tags)==0:
            return []
        tags = list(tags)
        rank0, mono_rank, rank, rank2, rank3, rank4, rank6 = self.rank_by_tag(tags[0])        
        for tag in tags[1:]:            
            rank = self.__merge_rank_and(rank, self.rank_by_tag(tag)[2], '1')
            rank2 = self.__merge_rank_and(rank2, self.rank_by_tag(tag)[3], '2')
            rank3 = self.__merge_rank_and(rank3, self.rank_by_tag(tag)[4], '3')        
            rank4 = self.__merge_rank_and(rank4, self.rank_by_tag(tag)[5], '4', tag)
            rank6 = self.__merge_rank_and(rank6, self.rank_by_tag(tag)[6], '6')
        and_rank0 = add_pos(and_rank0_gold1(tags, self.__ranker))        
        and_rank5 = add_pos(and_rank5_gold2(tags, self.__ranker))
        mono_rank = self.__merge_rank_and_monolitic(tags)
        return and_rank0, mono_rank, rank, rank2, rank3, rank4, and_rank5, rank6

    def rank_by_tag_fast(self, tag='', use_cluster=False, cluster_number=0):
        rank = self.__map_tag_userrank[tag][:self.__max_per_rank]
	return rank, rank
    
    def rank_by_tags_fast(self, tags):
        
        if len(tags)==0:
            return []
        tags = list(tags)
        rank1, rank3  = self.rank_by_tag_fast(tags[0])        
        for tag in tags[1:]:            
            rank1 = self.__merge_rank_and(rank1, self.rank_by_tag_fast(tag)[0], '1')
            rank3 = self.__merge_rank_and(rank3, self.rank_by_tag_fast(tag)[1], '3')        
        return rank1, rank3

    def __merge_rank_and_monolitic(self, tags):
        users = self.__map_tag_users[tags[0]]
        for tag in tags[1:]:
            users = users.intersection(self.__map_tag_users[tag])
        mono_rank = []        
        for name, pagerank, pos in self.__rank:
            if name in users:
                mono_rank.append((name,pagerank))
                if self.__max_per_rank <= len(mono_rank):
                    break
        mono_rank = PageRank.normalize(mono_rank)
        return add_pos(mono_rank)
                                    

    def __merge_rank_and(self, rank1, rank2, and_type='1', tag=None):
        rank1_no_pos = [(name,pagerank) for name, pagerank, pos in rank1]
        rank2_no_pos = [(name,pagerank) for name, pagerank, pos in rank2]
        if and_type=='1':
            new_rank = and_rank(rank1_no_pos, rank2_no_pos)
        elif and_type=='2':
            new_rank = and_rank2(rank1_no_pos, rank2_no_pos)
        elif and_type=='3':
            new_rank = and_rank3(rank1_no_pos, rank2_no_pos)
        elif and_type=='4':
            new_rank = and_rank4(rank1_no_pos, rank2_no_pos, self.__ranker, tag, self.__max_per_rank)
        elif and_type=='6':
            new_rank = and_rank6(rank1_no_pos, rank2_no_pos)
        else:
            raise Exception('bad and_type in __merge_rank_and')
        new_rank_pos = add_pos(new_rank)
        return new_rank_pos

    def __merge_rank_or(self, rank1, rank2):
        
        pos = 0
        present_pagerank = -3.0
        rank = []
        index1, index2 = 0, 0
        while index1 < len(rank1) and index2 < len(rank2) and len(rank) < self.__max_per_rank:
            if rank1[index1][1] > rank2[index2][1] or (rank1[index1][1] == rank2[index2][1] and rank1[index1][0] <= rank2[index2][0]):
                name, pagerank, old_pos1 = rank1[index1]
                index1 += 1
            else:
                name, pagerank, old_pos2 = rank2[index2]
                index2 += 1
            if pagerank != present_pagerank:
                pos += 1
                present_pagerank = pagerank
            if len(rank)==0 or rank[-1][0] != name:
                rank.append((name,pagerank,pos))    

        if index1 < len(rank1):
            for name, pagerank, old_pos1 in rank1[index1:]:
                if pagerank != present_pagerank:
                    pos += 1
                    present_pagerank = pagerank
                if len(rank) >= self.__max_per_rank:
                    break
                if len(rank)==0 or rank[-1][0] != name:
                    rank.append((name,pagerank,pos))   
        elif index2 < len(rank2):
            for name, pagerank, old_pos2 in rank1[index2:]:
                if pagerank != present_pagerank:
                    pos += 1
                    present_pagerank = pagerank
                if len(rank) >= self.__max_per_rank:
                    break
                if len(rank)==0 or rank[-1][0] != name:
                    rank.append((name,pagerank,pos))   
                
        return rank
    
    def total_users(self):
        return len(self.__map_user_tags)        
        
    def total_tags(self):
        return len(self.__tags)
    
    def good_user(self, user):
        return user in self.__map_user_tags
    
    def __cmp_fst(self, A, B):
        if A[0][0] < B[0][0]:
            return -1
        elif A[0][0] > B[0][0]:
            return 1
        else:
            return 0
    
    def __sort_inside_pos(self, ranks):
        if len(ranks) == 0:
            return []
        ret = []
        aux, aux_pos = [ranks[0]], ranks[0][2]
        for tags, pr, pos in ranks[1:]:
            if pos == aux_pos:
                aux.append((tags,pr,pos))
            else:
                aux.sort(self.__cmp_fst)
                ret += aux
                aux, aux_pos = [(tags,pr,pos)], pos
        aux.sort()
        ret += aux
        return ret            
    
    def user_ranks(self, user, filter_set=None, max_per=500, sorting=True):
        user_ranks = self.__map_user_tags.__getitem__(user, max_per)
        ranks = map(lambda (tag, pos): ([tag], 0.0, int(pos)), user_ranks)
        if sorting:
            ranks = self.__sort_inside_pos(ranks)
        if filter_set:
            ranks = filter(lambda (tags,pr,pos): tags[0] in filter_set, ranks)
        return ranks
#        good_tags = []
#        ranks = []
#        for tag in tags:
#            tag_rank = self.__map_tag_userrank[tag]
#            for rank_user, pagerank, pos in tag_rank:
#                if rank_user == user:
#                    ranks.append((tag, pagerank, pos))
#        ranks = list(set(ranks))
#        ranks = map(lambda (tag, pr, pos): ([tag], pr, pos), ranks)        
#        ranks.sort(thrd_fst_cmp)
#        return ranks
    
    def user_ranks_clustering(self, user):        
        good_tags = []
        ranks = []
        added_clusters = set([])
        for cluster in self.__clusters:
            rank = self.rank_by_tags(cluster)
            for rank_user, pagerank, pos in rank:
                if rank_user == user and not '|'.join(cluster) in added_clusters:
                    ranks.append((cluster, pagerank, pos))
                    added_clusters.add('|'.join(cluster))        
        ranks.sort(thrd_fst_cmp)
        return ranks
class EggOMaticExperiments:

    __exp_OSim = True
    __exp_KSim = True
    
    __offline_list = ['offline1'] # or 'gold2'
    __online_list = ['online1']
    __top_tags = 20
    
    __begin_top_many_users = 1
    __end_top_many_users = 10
    # exponential step 2, 4 ,8 16
    __step_top_many_users = 2

    def set_top_tags(self, top_tags=20):
        self.__top_tags = top_tags

    def set_forb_tags(self, forb_tags):
        self.__forb_tags = forb_tags

    def set_offline_list(self, type=['offline1']):
        self.__offline_list = type

    def set_online_list(self, list):
        self.__online_list = list

    def set_error_list(self, list=['osim','ksim']):
        self.__error_list = list

    def __init__(self, dataset, compute_ranks=True,  compute_mono_rank=True, build_index=True, max_per_rank=10000000000, top_tags=None):
        self.__dataset = dataset
        self.__eggomatic = EggOMatic(dataset, compute_ranks, compute_mono_rank, build_index, max_per_rank, top_tags)
        
    def __calc_error_save(self, offline_type, online_type, type_rank, error_type):
        f = open(self.__dataset + '.' + offline_type + '_VS_' + online_type + '.' + error_type, 'a')
        print 'writing ' + self.__dataset + '.' + offline_type + '_VS_' + online_type + '.' + error_type
        
        offline_rank = type_rank[offline_type]
        online_rank = type_rank[online_type]
        for top_many_users in [self.__step_top_many_users**i for i in range(self.__begin_top_many_users,self.__end_top_many_users)]:
            float_error, info_val = eval('rank_dist_%s(offline_rank, online_rank, top_many_users)' % error_type)
            if info_val >= 0: 
                f.write('%d %d %f \n' % (top_many_users, len(offline_rank), float_error))

    def __empty_result_files(self):        
        for offline_type in self.__offline_list:            
            for online_type in self.__online_list:                
                for error_type in self.__error_list:
                    f = open(self.__dataset + '.' + offline_type + '_VS_' + online_type + '.' + error_type, 'w')
                    f.close()
        
    def run(self):
        filename = self.__dataset + '.tagged_graph'
        tags_filename = filename + '.ranks/all.tags'
        f = open(tags_filename)
        lines = []
        for i in range(self.__top_tags + 1000):
            lines.append(f.readline())
        f.close()
        top_tags = map(lambda line: line.strip().split(' ')[0], lines)
        top_tags = filter(lambda x:not x in self.__forb_tags, top_tags)[0:self.__top_tags]
        #top_tags = tags.get_top_tags(self.__top_tags)
        #top_tags = map(lambda x: x[0], top_tags)
        
        self.__ranker = RankerByTags()
        self.__ranker.load(filename)

#        tag_sets = []
#        for i in range(len(top_tags)):
#	    tag1 = top_tags[i]
#            for tag2 in top_tags[:i-1]:
#                tag_sets.append((tag1, tag2))

        # empty result files
        self.__empty_result_files()

	count = 0
        nro_exps = len(top_tags)*(len(top_tags)-1) / 2
        len_top_tags = len(top_tags)
        for i in range(1,len(top_tags)):
	    tag1 = top_tags[i]
            for tag2 in top_tags[:i]:
                ranks = self.__eggomatic.rank_by_tags([tag1, tag2])
                type_rank = {}
                type_rank['offline1'] = ranks[0]                 
                type_rank['mono'] = ranks[1]
                type_rank['online1'] = ranks[2]
                type_rank['online2'] = ranks[3]
                type_rank['online3'] = ranks[4]
                type_rank['online4'] = ranks[5]
                type_rank['offline2'] = ranks[6]
                type_rank['online6'] = ranks[7]
		if (tag1=='portugal' and tag2=='music') or (tag2=='portugal' and tag1=='music'):
			f = open('ranks.txt','w')
			for trank, rank in type_rank.iteritems():
				f.write('----------------------------------------\n')
				f.write('%s\n' % trank)
				for t in rank:
					f.write('%s\n' % str(t))
			f.close()
                for offline_type in self.__offline_list:                
                    for online_type in self.__online_list:                    
                        for error_type in self.__error_list:
                            print 'for tags: %s %s' % (tag1,tag2)
                            self.__calc_error_save(offline_type, online_type, type_rank, error_type)
                count += 1
                print 'NUMBER OF EXPERIMENTS/PAIRS COMPLETED: %d of %d' % (count, nro_exps)
            print 'EXPERIMENTS COMPLETED FOR THE TOP %d of %d TAGS (all pairs)' % (i,len_top_tags)