class ExtractTags:
    """Clean up crawled videos and attach word-cut comments plus TextRank tags."""

    client = None      # ConnectDB wrapper
    database = None    # database handle
    collection = None  # item collection handle

    def __init__(self):
        self.client = ConnectDB(DATABASE_TV, COLLECTION_ITEM)
        self.database, self.collection = self.client.get_handler()
        jieba.enable_parallel(5)  # parallel word segmentation, 5 workers

    def remove_duplicate(self):
        """Remove duplicate videos according to 'url'.

        Walks the collection backwards; the first time a url is seen it is
        kept (removed from the distinct list), every later occurrence is
        deleted from the collection.
        """
        processed, found, deleted = 0, 0, 0  # renamed: all/find/delete shadowed builtins
        unseen = self.collection.distinct('url')
        for i in range(self.collection.count(), 0, -1):
            processed += 1
            # FIX: fetch the document once — the original re-ran
            # self.collection.find()[i-1] a second time just to delete it.
            doc = self.collection.find()[i - 1]
            url = doc["url"]
            if url in unseen:
                unseen.remove(url)
                found += 1
            else:  # FIX: was a redundant `elif url not in a`
                self.collection.remove(doc)
                deleted += 1
        print("Unique:%d\t\tDelete:%d\t\tProcess:%d\t\t" % (found, deleted, processed))

    def remove_items(self):
        """Remove videos with fewer than 50 comments, logging each removed
        url to the 'remove_items' collection first."""
        items = self.collection.find({"$where": "this.comments.length < 50"})
        for item in items:
            self.database.get_collection('remove_items').insert({"url": item["url"]})
            self.collection.remove({"url": item["url"]})

    def cut_comments(self):
        """Concatenate each video's raw comments, word-cut them with jieba,
        store the '/'-joined cut back as `comments`, and attach the top-20
        TextRank keywords as `tags`."""
        var = 0
        for item in self.collection.find():
            comments = ""
            tags = []
            for comment in item["comments"]:
                try:
                    comments += str(comment["comment_content"])
                except TypeError:
                    # malformed comment entry: stop collecting for this item
                    break
            for char in REMOVE_CHAR:
                comments = comments.replace(char, "")
            # cut_a = '/'.join(jieba.cut_for_search(comments))  # search engine mode
            cut_a = '/'.join(jieba.cut(comments, cut_all=False))  # accurate mode
            # cut_a = '/'.join(jieba.cut(comments, cut_all=True))  # full mode
            self.collection.update({"url": item["url"]}, {"$unset": {"comments": ""}})
            self.collection.update({"url": item["url"]}, {"$set": {"comments": cut_a}})
            text_rank = jieba.analyse.textrank(cut_a, topK=20, withWeight=True,
                                               allowPOS=('ns', 'n', 'vn', 'v'))
            for tag in text_rank:
                tags.append({"tag": tag[0], "weight": tag[1]})
            self.collection.update({"url": item["url"]}, {"$set": {"tags": tags}})
            value = time.localtime(int(time.time()))
            dt = time.strftime(DATE_FORMAT, value)
            var += 1
            print("%s\t\tprocess %d" % (dt, var))

    def close(self):
        """Close the underlying database connection."""
        self.client.close()
def __init__(self, cut='search', similar=2):
    """Connect to the user database and select the item/group collections.

    `cut` picks which word-cut variant's group collection to use;
    `similar` is the starting similarity level.
    """
    self.client = ConnectDB(DATABASE_TV, 'users')
    self.database, self.users_c = self.client.get_handler()
    self.users = []
    self.all_videos = []
    self.similar = similar
    self.item_c = self.database.get_collection('WeiboItem_similar_search')
    self.group_c = self.database.get_collection('WeiboGroup_' + cut)
class Analyse:
    """Statistics over the per-video `relative` similarity buckets."""

    client = None      # ConnectDB wrapper
    database = None    # database handle
    collection = None  # item collection handle

    def __init__(self):
        self.client = ConnectDB(DATABASE_TV, COLLECTION_ITEM)
        self.database, self.collection = self.client.get_handler()

    def average(self):
        """Print the average number of related videos per threshold bucket."""
        result = {}
        for i in range(10):
            var = 0
            total = 0
            for item in self.collection.find():
                var += len(item["relative"][NUM[i]])
                total += 1
            # NOTE(review): raises ZeroDivisionError on an empty collection.
            result[NUM[i]] = var / total
        print(result)

    def export(self, num):
        """Dump the edges of bucket `num` to result.csv as source,target,weight."""
        # FIX: use a context manager so the file is closed even on error
        # (the original used a bare open()/close() pair).
        with open('result.csv', 'w', encoding='utf-8') as out:
            for item in self.collection.find():
                for url in item["relative"][NUM[num]]:
                    out.writelines([item["url"], ',', url["url"], ',',
                                    str(url["value"])[0:5], '\n'])
class Analyse:
    """Statistics, Gephi export, and per-user forward counts over the items."""

    client = None      # ConnectDB wrapper
    database = None    # database handle
    collection = None  # item collection handle

    def __init__(self):
        self.client = ConnectDB(DATABASE_TV, COLLECTION_ITEM)
        self.database, self.collection = self.client.get_handler()

    def average(self):
        """Print the average number of related videos per threshold bucket."""
        result = {}
        for i in range(10):
            var = 0
            total = 0
            for item in self.collection.find():
                var += len(item["relative"][NUM[i]])
                total += 1
            # NOTE(review): raises ZeroDivisionError on an empty collection.
            result[NUM[i]] = var / total
        print(result)

    def export(self, num):
        """Export bucket `num` as an undirected Gephi edge list, skipping the
        reverse direction of edges already written."""
        # FIX: context manager so the file is closed even on error.
        with open('result_%s.csv' % str(num), 'w', encoding='utf-8') as out:
            out.writelines("Source,Target,Weight,Type\n")
            temp = set()
            for item in self.collection.find():
                for url in item["relative"][NUM[num]]:
                    # (b, a) in temp means the undirected edge (a, b) was
                    # already written — skip the mirror copy
                    if (url["url"], item["url"]) in temp:
                        continue
                    temp.add((item["url"], url["url"]))
                    out.writelines([item["url"], ',', url["url"], ',',
                                    str(url["value"])[0:5], ',', 'undirected\n'])

    def count(self):
        """Count distinct videos forwarded per usercard and store every user
        with 10 or more forwards into the 'users' collection."""
        user = {}
        for item in self.collection.find():
            for forward in item["forwards"]:
                usercard = forward["forward_usercard"]
                if user.get(usercard) is None:
                    user[usercard] = set()
                user[usercard].add(item["url"])
        data = self.database.get_collection('users')
        var = 0
        for (u, v) in user.items():
            if len(v) > 9:
                # BUG FIX: the original called data.insert({"usercard": u},
                # {"$set": {...}}) — Collection.insert takes one document, so
                # the $set payload was silently dropped. The update-style
                # arguments show an upsert was intended.
                data.update({"usercard": u}, {"$set": {"forwards": len(v)}}, upsert=True)
                var += 1
        print(var)
def __init__(self):
    """Open the TV database and keep handles to it and the item collection."""
    connection = ConnectDB(DATABASE_TV, COLLECTION_ITEM)
    self.client = connection
    self.database, self.collection = connection.get_handler()
class Similar:
    """Pairwise cosine similarity between videos based on tag word frequencies."""

    client = None      # ConnectDB wrapper
    database = None    # database handle
    collection = None  # item collection handle

    def __init__(self):
        self.client = ConnectDB(DATABASE_TV, COLLECTION_ITEM)
        self.database, self.collection = self.client.get_handler()

    @staticmethod
    def cosine(a, b):
        """Cosine similarity of two numpy vectors."""
        return a.dot(b) / sqrt(a.dot(a)) / sqrt(b.dot(b))

    def add_tags(self, item, tag_set):
        """Add every tag name of `item` into `tag_set` and return it."""
        for t in item["tags"]:
            tag_set.add(t["tag"])
        return tag_set

    def cut_split(self, item):
        """Split a '/'-joined cut comment into (plain text, word list, char count)."""
        comment = item["comments"]
        cut = comment.split('/')
        comment = comment.replace('/', '')
        # FIX: `comment` already has the '/' marks stripped; the original
        # re-ran .replace('/', '') a second time for no effect.
        length = len(comment)
        return comment, cut, length

    def init_vec(self, tags):
        """Zero-initialised frequency vector keyed by tag."""
        # FIX: dict comprehension instead of a manual fill loop.
        return {tag: 0 for tag in tags}

    def frequence(self, cut, length, tags):
        """Relative-frequency numpy vector of `tags` over the word list `cut`.

        `length` is the comment's character count used as the normaliser.
        (Parameter renamed from `len`, which shadowed the builtin.)
        """
        vec = self.init_vec(tags)
        for word in cut:
            if word in tags:
                vec[word] += 1.0 / length
        # FIX: the original wrapped vec.values() in an identity map().
        return numpy.array(list(vec.values()))

    def process(self):
        """For every ordered pair of videos compute cosine similarity of their
        tag-frequency vectors and store the related videos bucketed by
        threshold (0.0, 0.1, ..., 0.9) under `relative`."""
        links = 0
        var = 0
        for item_a in self.collection.find():
            tag = self.add_tags(item_a, set())  # item_a's own tags
            relative = {}
            for i in range(10):
                relative[NUM[i]] = []
            comments_a, cut_a, len_a = self.cut_split(item_a)
            for item_b in self.collection.find():
                if item_a != item_b:
                    # union of both items' tags, from a copy of item_a's set
                    tags = self.add_tags(item_b, deepcopy(tag))
                    comments_b, cut_b, len_b = self.cut_split(item_b)
                    vec_a = self.frequence(cut_a, len_a, tags)
                    vec_b = self.frequence(cut_b, len_b, tags)
                    cos = self.cosine(vec_a, vec_b)
                    # record the pair in every bucket whose threshold it beats;
                    # stop at the first empty bucket (thresholds ascend)
                    for i in range(10):
                        if cos > 0.1 * i:
                            links += 1
                            relative[NUM[i]].append({"url": item_b["url"], "value": cos})
                        if len(relative[NUM[i]]) == 0:
                            break
            self.collection.update({"url": item_a["url"]}, {"$set": {"relative": relative}})
            var += 1
            value = time.localtime(int(time.time()))
            dt = time.strftime(DATE_FORMAT, value)
            print("%s\t\tprocesse %d\t\t" % (dt, var))
class Recommend:
    """Recommend videos to sampled users and score hit rates against a random
    baseline and a fixed top list.

    Relies on module-level globals: ALL_VIDEOS (candidate pool), PICKED/USERS
    (user sample shared across runs), RECOMMEND_TOP (baseline list) and
    `file` (an open results log) — all must exist before process() is called.
    """

    client = None
    database = None
    users_c = None
    users = None
    item_c = None
    group_c = None
    video_set = None
    similar = 0

    def __init__(self, cut='search', similar=2):
        self.client = ConnectDB(DATABASE_TV, 'users')
        self.database, self.users_c = self.client.get_handler()
        self.users = []
        self.item_c = self.database.get_collection('WeiboItem_similar_search')
        collection = 'WeiboGroup_' + cut
        self.group_c = self.database.get_collection(collection)
        self.similar = similar
        self.all_videos = []

    def pick_user(self, num=20):
        """Sample `num` usercards (repeats possible) from the users collection."""
        length = self.users_c.count()
        result = self.users_c.find()
        for i in range(num):
            self.users.append(result[random.randrange(length)]["usercard"])

    def pick_a_video(self, user):
        """Return the url of one random video the user forwarded."""
        self.video_set = self.item_c.find({"forwards.forward_usercard": user})
        ran = self.video_set.count()
        return self.video_set[random.randrange(ran)]["url"]

    def get_watched(self, user):
        """Urls of every video the user forwarded."""
        watched = []
        for video in self.item_c.find({"forwards.forward_usercard": user}):
            watched.append(video["url"])
        return watched

    def get_group(self, item):
        """Group id of `item` at the current similarity level.

        Falls back to coarser levels (mutating self.similar) down to level 0;
        raises ValueError when even level 0 is missing.
        """
        try:
            group = item[str(self.similar)]
        # FIX: was a bare `except:` — the lookup can only raise KeyError
        # (missing level) or TypeError (item is None from a failed find_one).
        except (KeyError, TypeError):
            if self.similar == 0:
                raise ValueError("wrong similar!")
            else:
                self.similar -= 1
                group = self.get_group(item)
        return group

    def get_videos(self, group, source):
        """Return (similarity-ranked recommendations, random baseline of 20)."""
        global ALL_VIDEOS
        temp = []
        source = self.item_c.find_one({"url": source["url"]})
        for item in self.group_c.find({str(self.similar): group}):
            temp.append(item["url"])
        all = {}
        # candidates: videos in the group that are also related to `source`
        for items in source["relative"]["zero"]:
            if items["url"] in temp:
                all[items["url"]] = items["value"]
        sort = sorted(all.items(), key=lambda item: item[1], reverse=True)
        recommend = []
        for (u, v) in sort:
            # NOTE(review): `> 20` lets a 21st item through — confirm whether
            # the intended cap was exactly 20.
            if len(recommend) > 20:
                break
            else:
                if v >= self.similar * 0.1:
                    recommend.append(u)
                else:
                    break  # sorted descending: everything after is below threshold
        recommend_random = []
        for i in range(20):
            ran = random.randint(0, len(ALL_VIDEOS) - 1)
            recommend_random.append(ALL_VIDEOS[ran])
        return recommend, recommend_random

    def process(self):
        """Run one experiment: per user compute precision/recall for the
        similarity recommender, the random baseline and the fixed top list,
        then print/log the averages and F1 scores."""
        global PICKED
        global USERS, RECOMMEND_TOP
        user_num = user_num_random = user_num_top = 30
        if not PICKED:
            # sample the users once and share them with subsequent runs
            self.pick_user(user_num)
            USERS = self.users
            PICKED = True
        else:
            self.users = USERS
        rate_p = 0
        rate_r = 0
        rate_p_random = 0
        rate_r_random = 0
        rate_p_top = 0
        rate_r_top = 0
        for user in self.users:
            url = self.pick_a_video(user)
            item = self.group_c.find_one({"url": url})
            group = self.get_group(item)
            recommend, recommend_random = self.get_videos(group, item)
            watched = self.get_watched(user)
            correct = 0
            correct_random = 0
            correct_top = 0
            total = len(recommend)
            total_random = len(recommend_random)
            total_top = 20  # NOTE(review): assumes len(RECOMMEND_TOP) == 20 — confirm
            t = len(watched)
            for video in recommend:
                if video in watched:
                    correct += 1
            try:
                rate_p += correct / total
                rate_r += correct / t
                file.write("%s\t\t%s\t\t%s\n" % (user, str(correct / total), str(correct / t)))
                print(user, correct / total, correct / t)
            except ZeroDivisionError:
                # empty recommendation list or empty watch history: drop user
                user_num -= 1
            for video in recommend_random:
                if video in watched and video != url:
                    correct_random += 1
            try:
                rate_p_random += correct_random / total_random
                rate_r_random += correct_random / t
                file.write("%s\t\t%s\t\t%s\n" % (user, str(correct_random / total_random), str(correct_random / t)))
                print(user, correct_random / total_random, correct_random / t)
            except ZeroDivisionError:
                user_num_random -= 1
            for video in RECOMMEND_TOP:
                if video in watched and video != url:
                    correct_top += 1
            try:
                rate_p_top += correct_top / total_top
                rate_r_top += correct_top / t
                file.write("%s\t\t%s\t\t%s\n" % (user, str(correct_top / total_top), str(correct_top / t)))
                print(user, correct_top / total_top, correct_top / t)
            except ZeroDivisionError:
                user_num_top -= 1
        p = rate_p / user_num
        r = rate_r / user_num
        print("======================================")
        try:
            print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p, r, 2 * p * r / (p + r)))
            file.write("average:%lf\t\t%lf\t\tscore:%lf\n" % (p, r, 2 * p * r / (p + r)))
        except ZeroDivisionError:
            print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p, r, 0))
            file.write("average:%lf\t\t%lf\t\tscore:%lf\n" % (p, r, 0))
        p_random = rate_p_random / user_num_random
        r_random = rate_r_random / user_num_random
        try:
            # FIX: the original printed `r` (overall recall) here instead of
            # r_random; the matching file.write already used r_random.
            print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p_random, r_random, 2 * p_random * r_random / (p_random + r_random)))
            file.write("random_average:%lf\t\t%lf\t\tscore:%lf\n" % (p_random, r_random, 2 * p_random * r_random / (p_random + r_random)))
        except ZeroDivisionError:
            print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p_random, r_random, 0))
            file.write("random_average:%lf\t\t%lf\t\tscore:%lf\n" % (p_random, r_random, 0))
        p_top = rate_p_top / user_num_top
        r_top = rate_r_top / user_num_top
        try:
            print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 2 * p_top * r_top / (p_top + r_top)))
            file.write("top_average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 2 * p_top * r_top / (p_top + r_top)))
        except ZeroDivisionError:
            print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 0))
            file.write("top_average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 0))
# NOTE(review): the statements down to the commented-out section reference
# names (rate_r_top, user_num_top, p_top, file, ...) that are not defined at
# this level — this looks like a stray fragment of Recommend.process left at
# module scope; confirm against version control before removing.
r_top = rate_r_top / user_num_top
try:
    print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 2 * p_top * r_top / (p_top + r_top)))
    file.write("top_average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 2 * p_top * r_top / (p_top + r_top)))
except ZeroDivisionError:
    print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 0))
    file.write("top_average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 0))

# Earlier experiment variants, kept for reference:
# file = open('search.txt','w')
# for i in range(10):
#     file.write("===================================================================\n")
#     pro = Recommend('search', i)
#     pro.process()
# file.close()
# file = open('full.txt','w')
# for i in range(10):
#     file.write("===================================================================\n")
#     pro = Recommend('full', i)
#     pro.process()
# file.close()

# Build the global candidate pool read by Recommend.get_videos.
ALL_VIDEOS = []
client = ConnectDB("WeiboTV", "WeiboItem")
d, c = client.get_handler()
for doc in c.find():
    ALL_VIDEOS.append(doc["url"])

# `file` is a module global written to by Recommend.process, so it must keep
# this exact name and stay open for the whole experiment loop.
file = open('result.txt', 'w')
for experiment in range(500):
    file.write("\nExperiment %d\n===================================================================\n" % experiment)
    pro = Recommend('accurate', experiment)
    pro.process()
file.close()
import igraph
from igraph import *
from database_utils import ConnectDB
from values import DATABASE_TV, COLLECTION_URL, DATE_FORMAT
import time

# Open the url collection and collect every node name for the crawl graph.
client = ConnectDB(DATABASE_TV, COLLECTION_URL)
database, collection = client.get_handler()

# Earlier dedup pass, kept for reference:
# all, find, delete = 0, 0, 0
# a = collection.distinct('url')
# for i in range(collection.count(), 0, -1):
#     all += 1
#     url = collection.find()[i - 1]["url"]
#     if url in a:
#         a.remove(url)
#         find += 1
#     elif url not in a:
#         collection.remove(collection.find()[i - 1])
#         delete += 1
#     print("Unique:%d\t\tDelete:%d\t\tProcess:%d\t\t" % (find, delete, all))

g = Graph(directed=True)
urls = {'index'}  # the crawl root node
re = collection.find()  # NOTE(review): appears unused in this chunk; may be consumed later in the file — verify before removing
for record in collection.find():
    urls.add(record["from_url"])
    urls.add(record["url"])
dic = {}
var = 0
except ZeroDivisionError: print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p_random, r, 0)) file.write("random_average:%lf\t\t%lf\t\tscore:%lf\n" % (p_random, r_random, 0)) p_top = rate_p_top / user_num_top r_top = rate_r_top / user_num_top try: print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 2 * p_top * r_top / (p_top + r_top))) file.write("top_average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 2 * p_top * r_top / (p_top + r_top))) except ZeroDivisionError: print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 0)) file.write("top_average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 0)) ALL_VIDEOS = [] client = ConnectDB(DB_Name, "WeiboItem") d, c = client.get_handler() for item in c.find(): ALL_VIDEOS.append(item["url"]) file = open('result.txt', 'w') for i in range(500): file.write( "\nExperiment %d\n===================================================================\n" % i) pro = Recommend('accurate', i) pro.process() file.close()
class Similar:
    """Pairwise cosine similarity between videos based on tag word frequencies."""

    client = None      # ConnectDB wrapper
    database = None    # database handle
    collection = None  # item collection handle

    def __init__(self):
        self.client = ConnectDB(DATABASE_TV, COLLECTION_ITEM)
        self.database, self.collection = self.client.get_handler()

    @staticmethod
    def cosine(a, b):
        """Evaluate the similarity (cosine of the angle) of two numpy vectors."""
        return a.dot(b) / sqrt(a.dot(a)) / sqrt(b.dot(b))

    def add_tags(self, item, tag_set):
        """Add item's tags into tag_set and return it."""
        for t in item["tags"]:
            tag_set.add(t["tag"])
        return tag_set

    def cut_split(self, item):
        """Comments have been word-cut with '/' as a separator; return the
        plain text, the word list, and the character count."""
        comment = item["comments"]
        cut = comment.split('/')  # comment to group of words
        comment = comment.replace('/', '')
        # FIX: `comment` already has the '/' marks stripped; the original
        # re-ran .replace('/', '') a second time for no effect.
        length = len(comment)  # character count of this comment
        return comment, cut, length

    def init_vec(self, tags):
        """Init a zeroed word-frequency vector keyed by tag."""
        # FIX: dict comprehension instead of a manual fill loop.
        return {tag: 0 for tag in tags}

    def frequence(self, cut, length, tags):
        """Evaluate the relative frequency of `tags` words over `cut`.

        `length` is the comment's character count used as the normaliser.
        (Parameter renamed from `len`, which shadowed the builtin.)
        """
        vec = self.init_vec(tags)
        for word in cut:
            if word in tags:
                vec[word] += 1.0 / length
        # FIX: the original wrapped vec.values() in an identity map().
        return numpy.array(list(vec.values()))

    def process(self):
        """For every ordered pair of videos compute similarity and record the
        related links, grouped by 0.1 threshold intervals (0~0.1, 0.1~0.2, ...)."""
        links = 0
        var = 0
        for item_a in self.collection.find():
            tag = self.add_tags(item_a, set())  # add tags into an empty set
            relative = {}  # record the videos relative to item_a
            # init the relative buckets
            for i in range(10):
                relative[NUM[i]] = []
            comments_a, cut_a, len_a = self.cut_split(item_a)
            for item_b in self.collection.find():
                if item_a != item_b:
                    # copy item_a's tag set and add item_b's tags into it
                    tags = self.add_tags(item_b, deepcopy(tag))
                    comments_b, cut_b, len_b = self.cut_split(item_b)
                    # evaluate the word (in the tags) frequencies
                    vec_a = self.frequence(cut_a, len_a, tags)
                    vec_b = self.frequence(cut_b, len_b, tags)
                    cos = self.cosine(vec_a, vec_b)  # evaluate the similarity
                    # record the link in every bucket whose threshold it beats;
                    # stop at the first empty bucket (thresholds ascend)
                    for i in range(10):
                        if cos > 0.1 * i:
                            links += 1
                            relative[NUM[i]].append({
                                "url": item_b["url"],
                                "value": cos
                            })
                        if len(relative[NUM[i]]) == 0:
                            break
            self.collection.update({"url": item_a["url"]},
                                   {"$set": {
                                       "relative": relative
                                   }})
            var += 1
            value = time.localtime(int(time.time()))
            dt = time.strftime(DATE_FORMAT, value)
            print("%s\t\tprocesse %d\t\t" % (dt, var))