Example #1
0
 def get_urls(self):
     """
     Fetch the ranking start page and enqueue every book-detail URL found
     in the rank table onto chuangShiNovelUrlQueue.
     """
     page = get_element(targetUrl=self.start_url,
                        workLogger=novellogger,
                        headers={"User-Agent": user_agent()})
     # skip the first <tr>, which is the table header row
     rows = page.xpath('.//tbody[@id="rankList"]//tr')[1:]
     for row in rows:
         hrefs = row.xpath(".//a[@target='_blank']/@href")
         if hrefs:
             chuangShiNovelUrlQueue.put(hrefs[0])
def _check_and_save():
    """
    Worker loop: pull one record dict at a time from qiDianNovelDataQueue,
    deduplicate it via a fingerprint stored in the Redis set
    "novel_data_finger", and for unseen records fetch the detail page,
    enrich the dict, and persist it to MongoDB.
    """
    while True:
        record = qiDianNovelDataQueue.get()

        # fingerprint is taken from the list-page link, before it is
        # replaced below with the reading-page link
        finger = data_fingerprint(record["name"], record["author"],
                                  record["link"])

        # sadd returns 1 when the fingerprint is new, 0 when already present
        added = novel_redis_client.sadd("novel_data_finger", finger)

        if added == 0:
            novellogger.info(
                "data repeat which finger with :{}".format(finger))

        if added == 1:
            # fresh User-Agent per request; close connection to avoid
            # exhausting sockets in a long-running worker
            headers = {"User-Agent": user_agent(), "Connection": "close"}
            page = get_element(targetUrl=record["link"],
                               workLogger=novellogger,
                               headers=headers)

            words = page.xpath(
                './/div[@class="book-info "]//p[3]/em[1]/text()')
            summary = page.xpath(
                '//div[@class="book-content-wrap cf"]//div[@class="book-intro"]//text()'
            )
            read_link = page.xpath(
                '//a[@class="red-btn J-getJumpUrl "]/@href')

            record["source"] = "起点中文网"
            record["wordCount"] = float(words[0]) if words else None
            record["link"] = "https:" + read_link[0] if read_link else None
            record["intro"] = "".join(summary).strip() if summary else None
            novel_mongo_clinet.insert_one(record)
            novellogger.info("save data :{}".format(record))

        qiDianNovelDataQueue.task_done()
Example #3
0
    def parse_list_urls(self):
        """
        Run forever: hand qiDianNovelUrlQueue to parse_target_urls, which
        downloads each queued URL, deduplicates it by fingerprint in Redis
        (key "novel_url_finger"), logs through novellogger, and pushes the
        parsed page elements onto qiDianNovelElementQueue.
        """
        while True:
            # rotate the User-Agent on every pass; close connections so a
            # long-running loop does not pile up sockets
            ua_headers = {"User-Agent": user_agent(), "Connection": "close"}

            # helper imported from customTools.respDownloader
            parse_target_urls(urlQueue=qiDianNovelUrlQueue,
                              elementQueue=qiDianNovelElementQueue,
                              headers=ua_headers,
                              workLogger=novellogger,
                              redisClient=novel_redis_client,
                              redisKey="novel_url_finger")
Example #4
0
 def __init__(self):
     """Set the QiDian recommendation-rank URL template and default request headers."""
     self.headers = {"User-Agent": user_agent()}
     # page number is filled in later via str.format
     self.start_url = "https://www.qidian.com/rank/recom?dateType=2&chn=9&page={}"
Example #5
0
 def __init__(self):
     """Set the NetEase Music hot-playlist start URL and default request headers."""
     self.headers = {"User-Agent": user_agent()}
     self.start_url = "http://music.163.com/discover/playlist/?order=hot&cat=%E5%85%A8%E9%83%A8&limit=35&offset=0"