Example #1
    def proceed(self, url, level=0):
        self.fetchedCount += 1
        # print ('{} lemmas collected'.format(self.fetchedCount))
        crawler = WebCrawler(url)
        if crawler.response.url == BAIKE_404:
            print("url: {} returns 404".format(url))
            return
        # parse the downloaded page and extract the lemma title
        self.soup = BaikeSoup(crawler.source)
        self.soup.parse_current_page()
        lemmaName = self.soup.lemma.encode('utf-8')

        if not self.download_related:
            crawler.save_source_to_file(
                LEMMA_PATTERN_WITH_BOLD.format(lemmaName))
            return
        else:
            crawler.save_source_to_file(LEMMA_PATTERN.format(lemmaName))

        self.loadedUrls[url] = True
        self.loadedLemma.append(lemmaName)
        if url != crawler.response.url:
            self.loadedUrls[crawler.response.url] = True

        if self.soup.lemmaid == ID_UNSET:
            return
        tried = 0
        # query the related-lemmas API, retrying until it returns a JSON list
        while True:
            relatedApi = RELATED_URL_PATTERN.format(self.soup.lemmaid)
            crawler = WebCrawler(relatedApi)
            source = crawler.response.text.encode(crawler.response.encoding)
            jsonObj = json.loads(source)
            if isinstance(jsonObj, list):
                break
            else:
                tried += 1
                if tried > MAX_RETRY:
                    print(
                        'tried {} times but still got an error response, url: {}'.format(
                            MAX_RETRY, relatedApi))
                    return

        level += 1
        # visit every related lemma that has not been downloaded yet
        for relatedLemma in jsonObj[0]['data']:
            if os.path.isfile(
                    LEMMA_PATTERN.format(
                        relatedLemma['title'].encode('utf-8'))):
                pass
            # if (relatedLemma['title'].encode('utf-8') in self.loadedLemma):
            # print('{} already downloaded, will not start download').format(relatedLemma['title'].encode('utf-8'))
            elif level > MAX_RECURSION_DEPTH:
                # print('reach max recursion depth, will not start download')
                pass
            elif self.fetchedCount > MAX_DOWNLOAD_COUNT:
                # print('reach max search count, will not start download')
                pass
            elif relatedLemma['url'] in self.loadedUrls:
                # print('{} already downloaded, will not start download').format(relatedLemma['url'])
                pass
            else:
                self.proceed(relatedLemma['url'], level)
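
The class that owns proceed() is not shown in this example. A minimal sketch of how the recursive crawl could be started, assuming a hypothetical BaikeWorker class that supplies the attributes the method relies on (fetchedCount, download_related, loadedUrls, loadedLemma):

class BaikeWorker(object):  # hypothetical wrapper, not part of the original example
    def __init__(self, seed_url, download_related=True):
        self.fetchedCount = 0
        self.download_related = download_related
        self.loadedUrls = {}        # urls already crawled
        self.loadedLemma = []       # lemma names already saved
        self.proceed(seed_url)      # start the recursive crawl at level 0

    # proceed(self, url, level=0) as shown in Example #1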
Example #2
class BaiduWorker(object):
    def __init__(self, keyword):
        self.keyword = keyword
        self.totalDic = {}
        self.totalLemmas = []
        self.crawler = None
        self.soup = None
        self.fetchedCount = 0

        if not os.path.exists(FOLDER_PREFIX):
            os.makedirs(FOLDER_PREFIX)

        self.proceed()
        self.save_lemma_info()

    def proceed(self):
        # page through the search results until a stop condition is reached
        while True:
            self.crawler = WebCrawler(self.get_url())
            # self.crawler.save_source_to_file(BAIDU_RESULT_FOLDER.format(self.fetchedCount))
            self.soup = BaiduSoup(self.crawler.source)
            self.soup.parse_current_page()
            # try to find the intersection with already-collected lemmas
            for newLemma in self.soup.lemmas:
                # duplicated = 0
                if md5_unicode(newLemma[LEMMA_NAME]) in self.totalDic:
                    print('found duplicate lemma: {}, stop crawling'.format(
                        newLemma[LEMMA_NAME].encode('utf-8')))
                    # ++duplicated
                    return
                else:
                    self.save_lemma_page(newLemma)
                    self.totalLemmas.append(newLemma)
                    self.totalDic[md5_unicode(newLemma[LEMMA_NAME])] = True
                # if duplicated == LEMMAS_EVERY_PAGE:
                #     print ('find 10 duplicated items, return to 1st page, stop crawling')
                #     return
                if len(self.totalLemmas) > MAX_LEMMA_COUNT:
                    print('over max lemma count, stop crawling')
                    return
            if len(self.soup.lemmas) < LEMMAS_EVERY_PAGE:
                print('fewer than {} results on this page, stop searching'.format(
                    LEMMAS_EVERY_PAGE))
                return

            self.fetchedCount += LEMMAS_EVERY_PAGE

    def save_lemma_page(self, lemma):
        # print (lemma[LEMMA_URL])
        self.crawler = WebCrawler(lemma[LEMMA_URL])
        lemmaName = lemma[LEMMA_NAME].encode('utf-8')
        # '/' is not valid in file names, so replace it before building the path
        lemmaName = lemmaName.replace('/', '__')
        self.crawler.save_source_to_file(LEMMA_PATTERN.format(lemmaName))

    def save_lemma_info(self):
        json_str = json.dumps(self.totalLemmas,
                              ensure_ascii=False,
                              indent=4,
                              sort_keys=True)
        save_to_file(LEMMA_RECORD_PATH, json_str.encode('utf-8'))

    def get_url(self):
        url = SEARCH_QUERY.format(quote(self.keyword), self.fetchedCount)
        print('fetch baidu search url: {}'.format(url))
        return url
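
A short usage note based only on the class above: the constructor already calls proceed() and save_lemma_info(), so creating a BaiduWorker runs the whole crawl; the keyword below is just an illustration.

worker = BaiduWorker('machine learning')   # crawls the search results and saves each lemma page
print('{} lemmas collected'.format(len(worker.totalLemmas)))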