# -*- coding: utf-8 -*-
# Module-level imports assumed by this function; `Fetcher` and
# `get_keywords` come from this project's own modules.
import codecs
from collections import Counter

import jieba


def get_interest(uid, username, passwd):
    '''
    uid: user id (unicode).
    Extract the top high-frequency interest words from the weibos posted
    by user `uid`, combined with keywords from the user's follows and fans.
    '''
    # Load the stop-word list and add whitespace tokens.
    with codecs.open('stop_words.txt', 'r', 'utf-8') as fr:
        stop_words = [line.strip() for line in fr]
    stop_words.extend([' ', '\n', '\t'])

    myFetcher = Fetcher(username, passwd)
    myFetcher.login()
    follows_list = myFetcher.get_user_follows(uid)
    follows_list.append(uid)
    fans_list = myFetcher.get_user_fans(uid)
    print len(follows_list)
    print len(fans_list)

    # Keywords extracted from the users this user follows.
    follows_keywords = get_keywords(myFetcher, follows_list)
    follows_interest = dict(follows_keywords)

    # Keywords extracted from the user's fans.
    fans_keywords = get_keywords(myFetcher, fans_list)
    fans_interest = dict(fans_keywords)

    # Interest keywords extracted from the weibos the user posted.
    user_weibos = myFetcher.get_user_weibos(uid)
    weibos = ".".join(user_weibos)
    content_interest = {}
    words = jieba.cut(weibos)
    filtered_words = [word for word in words
                      if word not in stop_words and len(word) > 1]
    all_words_count = float(len(filtered_words))
    counter = Counter(filtered_words)
    key_words = counter.most_common(30)
    for k_word, count in key_words:
        # jieba yields unicode tokens, so no re-decoding is needed here;
        # calling decode('utf-8') on a unicode object would be wrong.
        content_interest[k_word] = count / all_words_count

    # Normalize the weights of each keyword source to [0, 1].
    # Rebinding the loop variable (weight /= max_weight) would not update
    # the dict, so the normalized values are written back by key.
    # `or [1.0]` guards against an empty source.
    max_weight_content = max(content_interest.values() or [1.0])
    max_weight_follows = max(follows_interest.values() or [1.0])
    max_weight_fans = max(fans_interest.values() or [1.0])
    for word in content_interest:
        content_interest[word] /= max_weight_content
    for word in follows_interest:
        follows_interest[word] /= max_weight_follows
    for word in fans_interest:
        fans_interest[word] /= max_weight_fans

    # Combine the three sources with fixed weights:
    # follows 0.5, content 0.4, fans 0.1.
    interest_words = {}
    all_words = (follows_interest.keys() + content_interest.keys()
                 + fans_interest.keys())
    for word in all_words:
        content_weight = content_interest.get(word, 0)
        follows_weight = follows_interest.get(word, 0)
        fans_weight = fans_interest.get(word, 0)
        all_weight = (0.5 * follows_weight + 0.4 * content_weight
                      + 0.1 * fans_weight)
        interest_words[word] = all_weight

    sorted_words = sorted(interest_words.iteritems(),
                          key=lambda item: item[1], reverse=True)
    for word, weight in sorted_words[:30]:
        print word.encode('utf-8'), weight
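# A minimal usage sketch for get_interest. The uid and credentials below
# are hypothetical placeholders, not values from the source; running this
# also assumes stop_words.txt exists alongside the script and that the
# Fetcher login succeeds.
if __name__ == '__main__':
    test_uid = u'1234567890'           # hypothetical target user id
    weibo_user = 'user@example.com'    # hypothetical login name
    weibo_pass = 'password'            # hypothetical password
    # Prints the user's top-30 interest words with their combined weights.
    get_interest(test_uid, weibo_user, weibo_pass)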
    # Method of the crawler class; the enclosing module is assumed to
    # import threading and Queue, and to provide `log` and `Fetcher`.
    def fetch(self):
        """
        Take URLs off the queue and fetch their content.
        :return: None
        """
        log.info("Running thread %s" % threading.current_thread().name)
        while True:
            try:
                cur_url, cur_depth = self._url_queue.get(timeout=1)
                print self._url_queue.unfinished_tasks
                cur_url = cur_url.strip()
            except Queue.Empty as e:
                log.warn(e)
                continue

            fetch_tool = Fetcher.Fetcher(cur_url, self._output_dir,
                                         self._timeout, self._cookie)
            try:
                if self.pro_match.match(cur_url):
                    # Product detail page.
                    self.save_product_info_url(cur_url)
                    content = fetch_tool.read_content()
                    info = fetch_tool.get_product_info(content)
                    product_info = list(info)
                    with self._lock:
                        self._url_visited.append(cur_url)
                    self.save_product_info(product_info)
                elif self.fig_pattern.match(cur_url):
                    # Top-level product category page.
                    content = fetch_tool.read_content()
                    if content is None or len(content) == 0:
                        continue
                    # Collect the sub-category pages under this category.
                    sub_urls = fetch_tool.get_sub_urls(content)
                    if sub_urls is None:
                        continue
                    for item in sub_urls:
                        # Check-and-add under a single lock acquisition; the
                        # original code could `continue` while still holding
                        # the lock, leaving it locked forever.
                        with self._lock:
                            if item in self._url_visited:
                                continue
                            self._url_visited.append(item)
                        try:
                            self._url_queue.put([item, cur_depth + 1],
                                                timeout=1)
                        except Queue.Full as e:
                            log.warn(e)
                            break
                else:
                    # Sub-category page: collect the product detail URLs.
                    content = fetch_tool.read_content()
                    product_urls = fetch_tool.get_product_url(content)
                    if product_urls is None:
                        continue
                    for item in product_urls:
                        with self._lock:
                            if item in self._url_visited:
                                continue
                            self._url_visited.append(item)
                        try:
                            self._url_queue.put([item, cur_depth + 1],
                                                timeout=1)
                        except Queue.Full as e:
                            log.warn(e)
                            break
            finally:
                # Always mark the dequeued task done, even on an early
                # `continue`, so that _url_queue.join() can return.
                self._url_queue.task_done()
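# A minimal sketch of driving fetch() with worker threads. The class name
# Spider, the seed URL, and the thread count are assumptions, not from the
# source; the crawler is assumed to expose the same _url_queue attribute
# that fetch() uses above.
if __name__ == '__main__':
    spider = Spider()  # hypothetical crawler class containing fetch()
    spider._url_queue.put(['http://example.com/', 0])  # placeholder seed URL
    for i in range(4):  # 4 worker threads (an assumption)
        t = threading.Thread(target=spider.fetch, name='fetcher-%d' % i)
        t.setDaemon(True)  # daemon threads let the process exit after join()
        t.start()
    # Blocks until every queued URL has been processed; this relies on
    # fetch() calling task_done() exactly once per dequeued item.
    spider._url_queue.join()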