Beispiel #1
0
class Analyse:
    """Summarise and export the ``relative`` link buckets stored on items.

    The methods read ``item["relative"][NUM[i]]`` from every document of the
    item collection; each bucket is a list of ``{"url": ..., "value": ...}``
    records (see ``export``).
    """

    # Populated in __init__; the class-level defaults are kept so attribute
    # access on the class itself keeps working.
    client = None
    database = None
    collection = None

    def __init__(self):
        self.client = ConnectDB(DATABASE_TV, COLLECTION_ITEM)
        self.database, self.collection = self.client.get_handler()

    @staticmethod
    def _mean_bucket_sizes(items, keys):
        """Return ``{key: mean(len(item["relative"][key]))}`` over *items*.

        The items are walked exactly once. When *items* is empty every mean
        is reported as 0 instead of raising ``ZeroDivisionError``.
        """
        totals = {key: 0 for key in keys}
        seen = 0
        for item in items:
            buckets = item["relative"]
            for key in keys:
                totals[key] += len(buckets[key])
            seen += 1
        if not seen:
            return {key: 0 for key in keys}
        return {key: totals[key] / seen for key in keys}

    def average(self):
        """Print and return the mean bucket size for ``NUM[0]`` .. ``NUM[9]``.

        Returns:
            dict mapping each bucket name to its mean length.
        """
        keys = [NUM[i] for i in range(10)]
        # One cursor for all ten buckets rather than one query per bucket.
        result = self._mean_bucket_sizes(self.collection.find(), keys)
        print(result)
        return result

    def export(self, num):
        """Write the links of bucket ``NUM[num]`` to ``result.csv``.

        Each output line is ``<item url>,<related url>,<value>``, where the
        value is the first five characters of ``str(value)``.
        """
        # NOTE(review): the 5-character cut mangles values that str() renders
        # in scientific notation (e.g. "1.5e-05" -> "1.5e-") - confirm
        # whether fixed-point formatting is wanted instead.
        # The context manager closes the file even if a write fails.
        with open('result.csv', 'w', encoding='utf-8') as out:
            for item in self.collection.find():
                for url in item["relative"][NUM[num]]:
                    out.writelines([
                        item["url"], ',', url["url"], ',',
                        str(url["value"])[0:5], '\n'
                    ])
Beispiel #2
0
class ExtractTags:
    """Clean the item collection and derive keyword tags from comments."""

    # Populated in __init__.
    client = None
    database = None
    collection = None

    def __init__(self):
        self.client = ConnectDB(DATABASE_TV, COLLECTION_ITEM)
        self.database, self.collection = self.client.get_handler()
        jieba.enable_parallel(5)

    def remove_duplicate(self):
        """Delete documents whose ``url`` repeats, keeping one per url.

        Documents are visited from the last position to the first, so the
        copy that survives is the one found latest in ``find()`` order, and
        deleting a document never shifts positions still to be visited.
        """
        processed = unique = deleted = 0
        # A set gives O(1) membership and removal for the "not seen yet" urls.
        unseen = set(self.collection.distinct('url'))
        for i in range(self.collection.count(), 0, -1):
            processed += 1
            # Fetch the document once and reuse it for the delete filter.
            doc = self.collection.find()[i - 1]
            url = doc["url"]
            if url in unseen:
                unseen.discard(url)
                unique += 1
            else:
                self.collection.remove(doc)
                deleted += 1
            print("Unique:%d\t\tDelete:%d\t\tProcess:%d\t\t" % (unique, deleted, processed))

    def remove_items(self):
        """Drop items with fewer than 50 comments, logging their urls.

        The url of every dropped item is inserted into the ``remove_items``
        collection before the item is removed.
        """
        items = self.collection.find({"$where": "this.comments.length  < 50"})
        removed_log = self.database.get_collection('remove_items')
        for item in items:
            removed_log.insert({"url": item["url"]})
            self.collection.remove({"url": item["url"]})

    def cut_comments(self):
        """Segment each item's comments with jieba and store TextRank tags.

        For every item the comment texts are concatenated, the substrings in
        ``REMOVE_CHAR`` are stripped, the text is segmented in accurate mode
        and stored back as a '/'-joined string in ``comments``, and the top
        20 TextRank keywords are stored in ``tags``.
        """
        var = 0
        for item in self.collection.find():
            pieces = []
            for comment in item["comments"]:
                try:
                    pieces.append(str(comment["comment_content"]))
                except TypeError:
                    # NOTE(review): presumably hit when ``comments`` is
                    # already a segmented string from an earlier run -
                    # confirm; collection stops at the first such entry.
                    break
            comments = "".join(pieces)
            for char in REMOVE_CHAR:
                comments = comments.replace(char, "")
            # Accurate mode; jieba.cut_for_search(...) and cut_all=True are
            # the search-engine and full-mode alternatives.
            cut_a = '/'.join(jieba.cut(comments, cut_all=False))
            self.collection.update({"url": item["url"]}, {"$unset": {"comments": ""}})
            self.collection.update({"url": item["url"]}, {"$set": {"comments": cut_a}})
            text_rank = jieba.analyse.textrank(cut_a, topK=20, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))
            tags = [{"tag": tag[0], "weight": tag[1]} for tag in text_rank]
            self.collection.update({"url": item["url"]}, {"$set": {"tags": tags}})
            var += 1
            # strftime without a time tuple formats the current local time.
            print("%s\t\tprocess %d" % (time.strftime(DATE_FORMAT), var))

    def close(self):
        """Close the underlying database client."""
        self.client.close()
Beispiel #3
0
class Analyse:
    """Summarise, export and count data stored on the item collection.

    ``average`` and ``export`` read ``item["relative"][NUM[i]]``; ``count``
    reads ``item["forwards"]`` and writes to the ``users`` collection.
    """

    # Populated in __init__.
    client = None
    database = None
    collection = None

    def __init__(self):
        self.client = ConnectDB(DATABASE_TV, COLLECTION_ITEM)
        self.database, self.collection = self.client.get_handler()

    @staticmethod
    def _mean_bucket_sizes(items, keys):
        """Return ``{key: mean(len(item["relative"][key]))}`` over *items*.

        The items are walked exactly once. When *items* is empty every mean
        is reported as 0 instead of raising ``ZeroDivisionError``.
        """
        totals = {key: 0 for key in keys}
        seen = 0
        for item in items:
            buckets = item["relative"]
            for key in keys:
                totals[key] += len(buckets[key])
            seen += 1
        if not seen:
            return {key: 0 for key in keys}
        return {key: totals[key] / seen for key in keys}

    def average(self):
        """Print and return the mean bucket size for ``NUM[0]`` .. ``NUM[9]``.

        Returns:
            dict mapping each bucket name to its mean length.
        """
        keys = [NUM[i] for i in range(10)]
        # One cursor for all ten buckets rather than one query per bucket.
        result = self._mean_bucket_sizes(self.collection.find(), keys)
        print(result)
        return result

    def export(self, num):
        """Write bucket ``NUM[num]`` as an undirected edge list.

        The output file is ``result_<num>.csv`` with the header
        ``Source,Target,Weight,Type``. An edge is skipped when the reverse
        pair (target, source) has already been written.
        """
        written = set()
        # The context manager closes the file even if a write fails.
        with open('result_%s.csv' % str(num), 'w', encoding='utf-8') as out:
            out.write("Source,Target,Weight,Type\n")
            for item in self.collection.find():
                for url in item["relative"][NUM[num]]:
                    if (url["url"], item["url"]) in written:
                        continue
                    written.add((item["url"], url["url"]))
                    out.writelines([item["url"], ',', url["url"], ',', str(url["value"])[0:5], ',', 'undirected\n'])

    def count(self):
        """Store every user who forwarded more than nine distinct items.

        Builds ``usercard -> set of item urls`` from ``item["forwards"]`` and
        inserts ``{"usercard": ..., "forwards": <distinct count>}`` into the
        ``users`` collection for each qualifying user. A running counter is
        printed per user examined.
        """
        user = {}
        for item in self.collection.find():
            for forward in item["forwards"]:
                user.setdefault(forward["forward_usercard"], set()).add(item["url"])

        data = self.database.get_collection('users')
        var = 0
        for usercard, urls in user.items():
            if len(urls) > 9:
                # The count belongs inside the inserted document. The
                # original passed a {"$set": ...} dict as insert()'s second
                # positional argument, which is not part of the document, so
                # the count was never stored.
                data.insert({"usercard": usercard, "forwards": len(urls)})
            var += 1
            print(var)
Beispiel #4
0
class Similar:
    """Score comment similarity between every pair of items.

    For each item, ``process`` stores a ``relative`` mapping whose keys are
    ``NUM[0]`` .. ``NUM[9]``; bucket ``NUM[i]`` lists every other item whose
    cosine similarity is greater than ``0.1 * i``.
    """

    # Populated in __init__.
    client = None
    database = None
    collection = None

    def __init__(self):
        self.client = ConnectDB(DATABASE_TV, COLLECTION_ITEM)
        self.database, self.collection = self.client.get_handler()

    @staticmethod
    def cosine(a, b):
        """Return the cosine similarity of vectors *a* and *b*.

        Returns 0.0 when either vector has zero length, instead of dividing
        by zero.
        """
        norm_a = sqrt(a.dot(a))
        norm_b = sqrt(b.dot(b))
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return a.dot(b) / norm_a / norm_b

    def add_tags(self, item, tag_set):
        """Add every ``tag`` of ``item["tags"]`` to *tag_set* and return it."""
        tag_set.update(t["tag"] for t in item["tags"])
        return tag_set

    def cut_split(self, item):
        """Split the '/'-segmented ``item["comments"]`` string.

        Returns:
            (text without separators, list of segments, length of that text
            in characters).
        """
        segmented = item["comments"]
        words = segmented.split('/')
        text = segmented.replace('/', '')
        return text, words, len(text)

    def init_vec(self, tags):
        """Return a dict mapping every tag to 0."""
        return {tag: 0 for tag in tags}

    def frequence(self, cut, len, tags):
        """Return the per-tag frequency vector of the segments in *cut*.

        Each occurrence of a tag adds ``1.0 / len``. The parameter name
        ``len`` (shadowing the builtin inside this method) is kept so calls
        by keyword keep working. The vector follows the iteration order of
        *tags*.
        """
        vec = self.init_vec(tags)
        for word in cut:
            if word in tags:
                vec[word] += 1.0 / len
        return numpy.array(list(vec.values()))

    def process(self):
        """Compute and store the ``relative`` buckets for every item."""
        keys = [NUM[i] for i in range(10)]
        var = 0
        for item_a in self.collection.find():
            tag = self.add_tags(item_a, set())
            relative = {key: [] for key in keys}
            comments_a, cut_a, len_a = self.cut_split(item_a)
            for item_b in self.collection.find():
                if item_a == item_b:
                    continue
                # The vocabulary for this pair is the union of both tag sets;
                # tags are plain values, so a shallow copy is enough.
                tags = self.add_tags(item_b, set(tag))
                comments_b, cut_b, len_b = self.cut_split(item_b)
                vec_a = self.frequence(cut_a, len_a, tags)
                vec_b = self.frequence(cut_b, len_b, tags)
                cos = self.cosine(vec_a, vec_b)
                for i, key in enumerate(keys):
                    # Thresholds only grow with i, so the first miss ends the
                    # scan. The original's break sat after the append and
                    # could never fire.
                    if not cos > 0.1 * i:
                        break
                    relative[key].append({"url": item_b["url"], "value": cos})
            self.collection.update({"url": item_a["url"]}, {"$set": {"relative": relative}})
            var += 1
            # strftime without a time tuple formats the current local time.
            print("%s\t\tprocesse %d\t\t" % (time.strftime(DATE_FORMAT), var))
Beispiel #5
0
class Recommend:
    """Evaluate similarity-based recommendations against two baselines.

    ``process`` scores three recommendation lists per user - the
    similarity-based one, a random one and the module-level
    ``RECOMMEND_TOP`` - by precision and recall against the videos the user
    forwarded, and writes the results to the module-level ``file`` object.
    """

    # Populated in __init__.
    client = None
    database = None
    users_c = None
    users = None
    item_c = None
    group_c = None
    video_set = None
    similar = 0

    def __init__(self, cut='search', similar=2):
        self.client = ConnectDB(DATABASE_TV, 'users')
        self.database, self.users_c = self.client.get_handler()
        self.users = []
        self.item_c = self.database.get_collection('WeiboItem_similar_search')
        self.group_c = self.database.get_collection('WeiboGroup_' + cut)
        self.similar = similar
        self.all_videos = []

    def pick_user(self, num=20):
        """Append *num* randomly chosen usercards (with repetition) to ``self.users``."""
        length = self.users_c.count()
        result = self.users_c.find()
        for _ in range(num):
            self.users.append(result[random.randrange(length)]["usercard"])

    def pick_a_video(self, user):
        """Return the url of a random item forwarded by *user*.

        The matching cursor is kept in ``self.video_set``.
        """
        self.video_set = self.item_c.find({"forwards.forward_usercard": user})
        ran = self.video_set.count()
        return self.video_set[random.randrange(ran)]["url"]

    def get_watched(self, user):
        """Return the urls of all items forwarded by *user*."""
        return [video["url"]
                for video in self.item_c.find({"forwards.forward_usercard": user})]

    def get_group(self, item):
        """Return ``item[str(self.similar)]``, lowering ``self.similar`` on a miss.

        ``self.similar`` is decremented until the key exists and is left at
        the level that matched.

        Raises:
            ValueError: when no level down to 0 is present in *item*.
        """
        # A loop replaces the original recursion, and a narrow except
        # replaces the bare one, which also swallowed unrelated errors.
        while True:
            try:
                return item[str(self.similar)]
            except (KeyError, TypeError):
                # TypeError covers item being None.
                if self.similar <= 0:
                    raise ValueError("wrong similar!")
                self.similar -= 1

    def get_videos(self, group, source):
        """Build the similarity-based and the random recommendation lists.

        Args:
            group: group id looked up under key ``str(self.similar)``.
            source: document with a ``url`` key identifying the seed video.

        Returns:
            (recommend, recommend_random): urls from the seed's
            ``relative["zero"]`` list that are in *group*, sorted by
            similarity descending and cut at the first value below
            ``self.similar * 0.1``; and 20 urls drawn from ``ALL_VIDEOS``.
        """
        source = self.item_c.find_one({"url": source["url"]})
        in_group = {item["url"]
                    for item in self.group_c.find({str(self.similar): group})}
        scores = {}
        for related in source["relative"]["zero"]:
            if related["url"] in in_group:
                scores[related["url"]] = related["value"]
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        threshold = self.similar * 0.1
        recommend = []
        for url, value in ranked:
            # NOTE(review): "> 20" lets the list reach 21 entries while both
            # baselines use 20 - confirm whether ">= 20" was intended.
            if len(recommend) > 20 or not value >= threshold:
                break
            recommend.append(url)
        recommend_random = []
        for _ in range(20):
            ran = random.randint(0, len(ALL_VIDEOS) - 1)
            recommend_random.append(ALL_VIDEOS[ran])
        return recommend, recommend_random

    @staticmethod
    def _f_score(p, r):
        """Return the harmonic mean ``2pr / (p + r)``, or 0 when ``p + r`` is 0."""
        try:
            return 2 * p * r / (p + r)
        except ZeroDivisionError:
            return 0

    def process(self):
        """Run one experiment and report precision/recall per strategy.

        The 30 users are picked once and shared through the module-level
        ``USERS`` / ``PICKED`` globals. For each user and each strategy a
        line ``user, precision, recall`` is printed and written to ``file``.
        A user whose precision or recall is undefined (empty list or no
        watched videos) is left out of that strategy's average. Three
        summary lines follow.
        """
        global PICKED
        global USERS, RECOMMEND_TOP
        user_num = 30
        if not PICKED:
            self.pick_user(user_num)
            USERS = self.users
            PICKED = True
        else:
            self.users = USERS

        # Index 0: similarity-based, 1: random baseline, 2: RECOMMEND_TOP.
        labels = ("average", "random_average", "top_average")
        precision_sum = [0, 0, 0]
        recall_sum = [0, 0, 0]
        counted = [user_num] * 3

        for user in self.users:
            url = self.pick_a_video(user)
            item = self.group_c.find_one({"url": url})
            group = self.get_group(item)
            recommend, recommend_random = self.get_videos(group, item)
            watched = self.get_watched(user)
            seen = set(watched)
            t = len(watched)
            # (hits, list size) per strategy. The two baselines do not count
            # the seed video itself; the top list uses a fixed size of 20.
            outcomes = (
                (sum(1 for video in recommend if video in seen),
                 len(recommend)),
                (sum(1 for video in recommend_random
                     if video in seen and video != url),
                 len(recommend_random)),
                (sum(1 for video in RECOMMEND_TOP
                     if video in seen and video != url),
                 20),
            )
            for idx, (correct, total) in enumerate(outcomes):
                try:
                    # Compute both ratios before touching the sums so a
                    # failure cannot leave one sum updated without the other.
                    precision = correct / total
                    recall = correct / t
                except ZeroDivisionError:
                    counted[idx] -= 1
                    continue
                precision_sum[idx] += precision
                recall_sum[idx] += recall
                file.write("%s\t\t%s\t\t%s\n" % (user, str(precision), str(recall)))
                print(user, precision, recall)

        print("======================================")
        for idx, label in enumerate(labels):
            if counted[idx]:
                p = precision_sum[idx] / counted[idx]
                r = recall_sum[idx] / counted[idx]
            else:
                # Every user was skipped: report zeros rather than crash.
                p = r = 0.0
            summary = "%lf\t\t%lf\t\tscore:%lf\n" % (p, r, self._f_score(p, r))
            # stdout labels every line "average"; the file uses the
            # per-strategy label. The random line now prints its own recall
            # (the original printed the similarity-based recall there).
            print("average:" + summary)
            file.write(label + ":" + summary)
Beispiel #6
0
        r_top = rate_r_top / user_num_top
        try:
            print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 2 * p_top * r_top / (p_top + r_top)))
            file.write("top_average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 2 * p_top * r_top / (p_top + r_top)))
        except ZeroDivisionError:
            print("average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 0))
            file.write("top_average:%lf\t\t%lf\t\tscore:%lf\n" % (p_top, r_top, 0))

# file = open('search.txt','w')
# for i in range(10):
#     file.write("===================================================================\n")
#     pro = Recommend('search', i)
#     pro.process()
# file.close()
# file = open('full.txt','w')
# for i in range(10):
#     file.write("===================================================================\n")
#     pro = Recommend('full', i)
#     pro.process()
# file.close()
# Cache the url of every item once. The Recommend code in this listing draws
# its random baseline from a module-level ALL_VIDEOS list.
ALL_VIDEOS = []
client = ConnectDB("WeiboTV", "WeiboItem")
d, c = client.get_handler()
for item in c.find():
    ALL_VIDEOS.append(item["url"])

# `file` has to stay a module-level name: the process() code shown in this
# listing writes its results through a bare `file` reference. The context
# manager closes result.txt even if one of the experiments raises.
with open('result.txt', 'w') as file:
    for i in range(500):
        file.write("\nExperiment %d\n===================================================================\n" % i)
        pro = Recommend('accurate', i)
        pro.process()
Beispiel #7
0
import igraph
from igraph import *
from database_utils import ConnectDB
from values import DATABASE_TV, COLLECTION_URL, DATE_FORMAT
import time

# Open the URL collection; get_handler() yields the database handle and the
# collection handle used below.
client = ConnectDB(DATABASE_TV, COLLECTION_URL)
database, collection = client.get_handler()

# Disabled one-off clean-up: walks the collection from the last document to
# the first and removes documents whose 'url' was already seen.
# all, find, delete = 0, 0, 0
# a = collection.distinct('url')
# for i in range(collection.count(), 0, -1):
#     all += 1
#     url = collection.find()[i - 1]["url"]
#     if url in a:
#         a.remove(url)
#         find += 1
#     elif url not in a:
#         collection.remove(collection.find()[i - 1])
#         delete += 1
#     print("Unique:%d\t\tDelete:%d\t\tProcess:%d\t\t" % (find, delete, all))

# Directed igraph graph; nothing is added to it in the visible part.
g = Graph(directed=True)
# Collect every distinct URL: the literal 'index' plus each document's
# 'from_url' and 'url'.
urls = set()
urls.add('index')
# NOTE(review): `re` holds a second cursor that is not used in the visible
# part of the script - confirm whether later code needs it.
re = collection.find()
for url in collection.find():
    urls.add(url["from_url"])
    urls.add(url["url"])
# Initialised for code that follows outside this excerpt (usage not shown).
dic = {}
var = 0
Beispiel #8
0
class Similar:
    """Score comment similarity between every pair of items.

    For each item, ``process`` stores a ``relative`` mapping whose keys are
    ``NUM[0]`` .. ``NUM[9]``; bucket ``NUM[i]`` lists every other item whose
    cosine similarity is greater than ``0.1 * i``.
    """

    # Populated in __init__.
    client = None
    database = None
    collection = None

    def __init__(self):
        self.client = ConnectDB(DATABASE_TV, COLLECTION_ITEM)
        self.database, self.collection = self.client.get_handler()

    @staticmethod
    def cosine(a, b):
        """Evaluate the cosine similarity of vectors *a* and *b*.

        A zero-length vector yields 0.0 rather than a division by zero.
        """
        length_a = sqrt(a.dot(a))
        length_b = sqrt(b.dot(b))
        if length_a == 0 or length_b == 0:
            return 0.0
        return a.dot(b) / length_a / length_b

    def add_tags(self, item, tag_set):
        """Add the item's tags to *tag_set* (in place) and return it."""
        for entry in item["tags"]:
            tag_set.add(entry["tag"])
        return tag_set

    def cut_split(self, item):
        """Split an already word-cut comment string that uses '/' as mark.

        Returns:
            (comment text without marks, list of words, number of characters
            in that text).
        """
        marked = item["comments"]
        cut = marked.split('/')  # comment to group of words
        comment = marked.replace('/', '')
        # The length is a character count of the unmarked text, not a word
        # count.
        return comment, cut, len(comment)

    def init_vec(self, tags):
        """Init the word vector: every tag mapped to 0."""
        return dict.fromkeys(tags, 0)

    def frequence(self, cut, len, tags):
        """Evaluate the frequency of every tag among the words in *cut*.

        Each hit adds ``1.0 / len``. The parameter keeps its original name
        ``len`` for keyword callers, although it shadows the builtin inside
        this method. The result follows the iteration order of *tags*.
        """
        vec = self.init_vec(tags)
        for word in cut:
            if word in tags:
                vec[word] += 1.0 / len
        return numpy.array(list(vec.values()))

    def process(self):
        """Compare every item with every other one and store the buckets."""
        bucket_names = [NUM[i] for i in range(10)]
        var = 0
        for item_a in self.collection.find():
            tag = self.add_tags(item_a, set())  # tags of item_a only
            # Videos related to item_a, grouped by 0.1 similarity steps.
            relative = {name: [] for name in bucket_names}
            comments_a, cut_a, len_a = self.cut_split(item_a)
            for item_b in self.collection.find():
                if item_a == item_b:
                    continue
                # Copy item_a's tags and add item_b's: the shared vocabulary.
                # The tags are plain values, so a shallow copy is sufficient.
                tags = self.add_tags(item_b, set(tag))
                comments_b, cut_b, len_b = self.cut_split(item_b)
                # Frequency of each vocabulary word in both comments.
                vec_a = self.frequence(cut_a, len_a, tags)
                vec_b = self.frequence(cut_b, len_b, tags)
                cos = self.cosine(vec_a, vec_b)
                # Record the link in every bucket whose lower bound it
                # exceeds. Bounds rise with i, so stop at the first miss; the
                # original break followed the append and never triggered.
                for i, name in enumerate(bucket_names):
                    if not cos > 0.1 * i:
                        break
                    relative[name].append({
                        "url": item_b["url"],
                        "value": cos
                    })
            self.collection.update({"url": item_a["url"]},
                                   {"$set": {
                                       "relative": relative
                                   }})
            var += 1
            # strftime without a time tuple formats the current local time.
            print("%s\t\tprocesse %d\t\t" % (time.strftime(DATE_FORMAT), var))