import codecs
from collections import Counter

import jieba  # Chinese word segmentation

# Fetcher and get_keywords are provided elsewhere in this project


def get_interest(uid, username, passwd):
    '''
        uid: user id (unicode)
        Print the top interest words for user uid, combining high-frequency
        words from the user's own weibos with keywords drawn from the weibos
        of the user's follows and fans.
    '''
    with codecs.open('stop_words.txt', 'r', 'utf-8') as fr:
        stop_words = [line.strip() for line in fr]
    stop_words.extend([' ', '\n', '\t'])

    myFetcher = Fetcher(username, passwd)
    myFetcher.login()

    follows_list = myFetcher.get_user_follows(uid)
    follows_list.append(uid)

    fans_list = myFetcher.get_user_fans(uid)
    # debug output: sizes of the follows and fans lists
    print len(follows_list)
    print len(fans_list)

    # keyword weights from the weibos of the user's follows
    follows_interest = dict(get_keywords(myFetcher, follows_list))

    # keyword weights from the weibos of the user's fans
    fans_interest = dict(get_keywords(myFetcher, fans_list))

    user_weibos = myFetcher.get_user_weibos(uid)
    weibos = ".".join(user_weibos)

    content_interest = {}  # interest keywords extracted from the user's own weibos
    words = jieba.cut(weibos)
    filtered_words = [
        word for word in words if word not in stop_words and len(word) > 1
    ]
    all_words_count = float(len(filtered_words))
    counter = Counter(filtered_words)
    key_words = counter.most_common(30)
    outputs = []
    for word, count in key_words:
        # jieba yields unicode tokens; only raw byte strings need decoding
        if isinstance(word, unicode):
            k_word = word
        else:
            k_word = word.decode('utf-8')
        weight = count / all_words_count
        outputs.append("%s\t%f\n" % (k_word, weight))
        content_interest[k_word] = weight

    # normalize the weights within each interest-word source
    max_weight_content = max(content_interest.values())
    max_weight_follows = max(follows_interest.values())
    max_weight_fans = max(fans_interest.values())

    for word in content_interest:
        content_interest[word] /= max_weight_content

    for word in follows_interest:
        follows_interest[word] /= max_weight_follows

    for word in fans_interest:
        fans_interest[word] /= max_weight_fans

    interest_words = {}
    all_words = (set(content_interest) | set(follows_interest)
                 | set(fans_interest))

    for word in all_words:
        content_weight = content_interest.get(word, 0)
        follows_weight = follows_interest.get(word, 0)
        fans_weight = fans_interest.get(word, 0)
        # fixed linear combination: follows 0.5, content 0.4, fans 0.1
        interest_words[word] = (0.5 * follows_weight + 0.4 * content_weight
                                + 0.1 * fans_weight)

    sorted_words = sorted(interest_words.iteritems(),
                          key=lambda item: item[1],
                          reverse=True)

    for word, weight in sorted_words[:30]:
        print word.encode('utf-8'), weight
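
A minimal usage sketch, assuming the surrounding module provides the Fetcher class and get_keywords helper used above; the uid and credentials below are placeholders:

if __name__ == '__main__':
    # hypothetical placeholder values; substitute a real weibo uid and account
    target_uid = u'1234567890'
    get_interest(target_uid, 'my_username', 'my_password')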
Example 2
    def fetch(self):
        """
        从队列中获取地址并查询内容
        :return: None
        """
        log.info("Running thread %s" % threading.current_thread().name)
        while True:
            try:
                cur_url, cur_depth = self._url_queue.get(timeout=1)
                log.debug("unfinished tasks: %d" %
                          self._url_queue.unfinished_tasks)
                cur_url = cur_url.strip()
            except Queue.Empty as e:
                log.warn(e)
                continue

            fetch_tool = Fetcher.Fetcher(cur_url, self._output_dir,
                                         self._timeout, self._cookie)

            if self.pro_match.match(cur_url):
                # product detail page: parse and persist the product info
                self.save_product_info_url(cur_url)
                content = fetch_tool.read_content()
                info = fetch_tool.get_product_info(content)
                product_info = list(info)
                with self._lock:  # mark this page as visited
                    self._url_visited.append(cur_url)

                self.save_product_info(product_info)
            elif self.fig_pattern.match(cur_url):
                # top-level product category page
                content = fetch_tool.read_content()
                if content is None or len(content) == 0:
                    self._url_queue.task_done()
                    continue

                # collect the sub-category pages under this category
                sub_urls = fetch_tool.get_sub_urls(content)
                if sub_urls is None:
                    self._url_queue.task_done()
                    continue
                for item in sub_urls:
                    with self._lock:  # check and mark visited atomically
                        if item in self._url_visited:
                            continue
                        self._url_visited.append(item)
                    try:
                        self._url_queue.put([item, cur_depth + 1], timeout=1)
                    except Queue.Full as e:
                        log.warn(e)
                        break
            else:
                # sub-category page: collect the product detail URLs
                content = fetch_tool.read_content()
                product_urls = fetch_tool.get_product_url(content)
                if product_urls is None:
                    self._url_queue.task_done()
                    continue
                for item in product_urls:
                    with self._lock:  # check and mark visited atomically
                        if item in self._url_visited:
                            continue
                        self._url_visited.append(item)
                    try:
                        self._url_queue.put([item, cur_depth + 1], timeout=1)
                    except Queue.Full as e:
                        log.warn(e)
                        break

            self._url_queue.task_done()
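
A minimal driver sketch, assuming the enclosing spider class exposes this fetch method and a _url_queue pre-seeded with (url, depth) pairs; the function name and thread count are placeholders:

import threading

def run_spider(spider, thread_num=4):
    # start daemon workers; each loops in fetch() until the process exits
    for i in range(thread_num):
        worker = threading.Thread(target=spider.fetch,
                                  name='fetcher-%d' % i)
        worker.setDaemon(True)
        worker.start()
    # block until every queued URL has been balanced by a task_done() call
    spider._url_queue.join()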