Beispiel #1
0
 def get_topic_data(self, topic):
     """Fetch follower/question counts and related topic ids for a topic.

     Args:
         topic: Mapping providing ``name`` and ``topic_id``. It is updated
             in place with the crawled values.

     Returns:
         The same ``topic`` object with ``follower_num``, ``question_num``,
         ``parent_topic_ids`` and ``children_topic_ids`` set, or ``None``
         when the page could not be fetched or its number board could not
         be parsed.
     """
     logger.debug(f'Crawling data for topic {topic["name"]}')
     # topic data
     # Join URL parts with an explicit '/': os.path.join uses the platform
     # path separator, which would put a backslash in the URL on Windows.
     url = f'{self.main_url.rstrip("/")}/topic/{topic["topic_id"]}/hot'
     flag, result = get_http_respense(url, method='GET', rtype='HTML')
     if not flag:
         logger.error(f'Got unusual http response:\n{result}')
         return None
     tree = etree.HTML(result)
     try:
         number_board = tree.xpath('//strong[@class="NumberBoard-itemValue"]')
         # Exactly two values are expected (followers, questions). Keeping
         # the conversion inside the try means a missing element, a missing
         # 'title' attribute or a non-numeric title is logged as a layout
         # change instead of escaping as an unhandled exception.
         follower_num, question_num = (
             int(nb.get('title')) for nb in number_board
         )
     except Exception as e:
         logger.exception(f'number board was changed!\n{e}')
         return None
     logger.debug(f'follower_num: {follower_num}, question_num: {question_num}')
     # relative topic ids
     # NOTE: either lookup may yield None when one of its requests fails;
     # that value is stored as-is.
     parent_topic_ids = self._get_relative_topic_ids(topic['topic_id'], 'parent', 10)
     children_topic_ids = self._get_relative_topic_ids(topic['topic_id'], 'children', 10)
     logger.debug(f'parent_topic_ids: {parent_topic_ids}, children_topic_ids: {children_topic_ids}')
     # update data
     topic.update({
         'follower_num': follower_num,
         'question_num': question_num,
         'parent_topic_ids': parent_topic_ids,
         'children_topic_ids': children_topic_ids,
     })
     return topic
Beispiel #2
0
 def _get_relative_topic_ids(self, topic_id, relative_type, page_size):
     """Collect the ids of topics related to ``topic_id``, page by page.

     Args:
         topic_id: Id inserted into the API path.
         relative_type: Path segment selecting the relation, e.g.
             ``'parent'`` or ``'children'``.
         page_size: Value sent as ``limit`` and used as the offset step.

     Returns:
         A list of the ``id`` values of every item returned, gathered until
         a response carries no (or empty) ``data``. Returns ``None`` if any
         request fails; ids gathered up to that point are discarded.
     """
     # Join URL parts with an explicit '/': os.path.join uses the platform
     # path separator, which would put a backslash in the URL on Windows.
     base_url = f'{self.main_url.rstrip("/")}/api/v3/topics/{topic_id}/{relative_type}'
     offset = 0
     topic_ids = []
     while True:
         url = f'{base_url}?limit={page_size}&offset={offset}'
         flag, result = get_http_respense(url, method='GET', rtype='JSON')
         if not flag:
             logger.error(f'Got unusual http response:\n{result}')
             return None
         data = result.get('data')
         if not data:
             # An empty or missing page marks the end of the listing.
             return topic_ids
         topic_ids.extend(map(itemgetter('id'), data))
         offset += page_size
Beispiel #3
0
 def get_home_topics(self):
     """Fetch the topic category names listed on the ``topics`` page.

     Returns:
         A list holding the text of every ``<a>`` found directly under an
         ``<li class="zm-topic-cat-item">`` element, or ``None`` when the
         request fails or the page cannot be queried.
     """
     # Join URL parts with an explicit '/': os.path.join uses the platform
     # path separator, which would put a backslash in the URL on Windows.
     url = f'{self.main_url.rstrip("/")}/topics'
     flag, result = get_http_respense(url, method='GET', rtype='HTML')
     if not flag:
         logger.error(f'Got unusual http response:\n{result}')
         return None
     tree = etree.HTML(result)
     try:
         home_topic_elements = tree.xpath('//li[@class="zm-topic-cat-item"]/a')
     except Exception as e:
         logger.exception(f'Home topic page was changed!\n{e}')
         return None
     home_topics = [hte.text for hte in home_topic_elements]
     logger.info(f'home topics:\n{home_topics}')
     return home_topics