def parse_detail(self, response):
        """Parse a single book detail page into a WoaiduCrawlerItem.

        Extracts the book name, author, description and cover-image URL,
        plus one dict per download row (url/progress/update_time/source_site),
        then yields the populated item.
        """
        woaidu_item = WoaiduCrawlerItem()

        response_selector = HtmlXPathSelector(response)
        woaidu_item['book_name'] = list_first_item(
            response_selector.select(
                '//div[@class="zizida"][1]/text()').extract())
        # [5:] presumably strips a fixed-length label prefix (e.g. "作者:")
        # from the author line — TODO confirm against a live page.
        woaidu_item['author'] = [
            list_first_item(
                response_selector.select('//div[@class="xiaoxiao"][1]/text()').
                extract())[5:].strip(),
        ]
        woaidu_item['book_description'] = list_first_item(
            response_selector.select(
                '//div[@class="lili"][1]/text()').extract()).strip()
        woaidu_item['book_covor_image_url'] = list_first_item(
            response_selector.select(
                '//div[@class="hong"][1]/img/@src').extract())

        download = []
        # [1:] skips the first "xiazai_xiao" div, presumably a header row —
        # TODO confirm.  Each remaining row has 5 child divs: two mirror
        # download links, progress, update time, and the source site link.
        for i in response_selector.select(
                '//div[contains(@class,"xiazai_xiao")]')[1:]:
            download_item = {}
            # Collect both candidate links, drop duplicates and empty entries.
            download_item['url'] = strip_null(
                deduplication([
                    list_first_item(
                        i.select('./div')[0].select('./a/@href').extract()),
                    list_first_item(
                        i.select('./div')[1].select('./a/@href').extract())
                ]))

            download_item['progress'] = list_first_item(
                i.select('./div')[2].select('./text()').extract())
            download_item['update_time'] = list_first_item(
                i.select('./div')[3].select('./text()').extract())
            # Source site is stored as a [display_text, href] pair.
            download_item['source_site'] = \
                [
                    list_first_item(i.select('./div')[4].select('./a/text()').extract()), \
                    list_first_item(i.select('./div')[4].select('./a/@href').extract()) \
                ]

            download.append(download_item)

        woaidu_item['book_download'] = download
        woaidu_item['original_url'] = response.url

        yield woaidu_item
    def parse_detail(self, response):
        woaidu_item = WoaiduCrawlerItem()

        response_selector = HtmlXPathSelector(response)
        woaidu_item["book_name"] = list_first_item(
            response_selector.select('//div[@class="zizida"][1]/text()').extract()
        )
        woaidu_item["author"] = [
            list_first_item(response_selector.select('//div[@class="xiaoxiao"][1]/text()').extract())[5:].strip()
        ]
        woaidu_item["book_description"] = list_first_item(
            response_selector.select('//div[@class="lili"][1]/text()').extract()
        ).strip()
        woaidu_item["book_covor_image_url"] = list_first_item(
            response_selector.select('//div[@class="hong"][1]/img/@src').extract()
        )

        download = []
        for i in response_selector.select('//div[contains(@class,"xiazai_xiao")]')[1:]:
            download_item = {}
            download_item["url"] = strip_null(
                deduplication(
                    [
                        list_first_item(i.select("./div")[0].select("./a/@href").extract()),
                        list_first_item(i.select("./div")[1].select("./a/@href").extract()),
                    ]
                )
            )

            download_item["progress"] = list_first_item(i.select("./div")[2].select("./text()").extract())
            download_item["update_time"] = list_first_item(i.select("./div")[3].select("./text()").extract())
            download_item["source_site"] = [
                list_first_item(i.select("./div")[4].select("./a/text()").extract()),
                list_first_item(i.select("./div")[4].select("./a/@href").extract()),
            ]

            download.append(download_item)

        woaidu_item["book_download"] = download
        woaidu_item["original_url"] = response.url

        yield woaidu_item
Example #3
0
    def item_completed(self, results, item, info):
        """Attach the first successful download's id/url to the item.

        If no download succeeded, pop the next pending URL for this book
        from ``self.item_download`` and return a retry Request; otherwise
        return the item unchanged.
        """
        if self.LOG_FAILED_RESULTS:
            # Fixed typo: "proessing" -> "processing".
            msg = '%s found errors processing %s' % (self.__class__.__name__, item)
            for ok, value in results:
                if not ok:
                    log.err(value, msg, spider=info.spider)

        bookfile_ids_urls = [(x['book_file_id'], x['url']) for ok, x in results if ok]
        bookfile_id_url = list_first_item(bookfile_ids_urls)
        if bookfile_id_url:
            item['book_file_id'] = bookfile_id_url[0]
            item['book_file_url'] = bookfile_id_url[1]
            return item

        # No successful result: retry with the next queued URL, if any.
        pending = self.item_download[item['original_url']]
        if pending:
            # Renamed from `next`, which shadowed the builtin.
            next_url = list_first_item(pending)
            self.item_download[item['original_url']] = pending[1:]
            return Request(next_url)
        return item
Example #4
0
    def item_completed(self, results, item, info):
        """Attach the first successful download's stored path/url to the item.

        If no download succeeded, pop the next pending URL for this book
        from ``self.item_download`` and return a retry Request; otherwise
        return the item unchanged.
        """
        if self.LOG_FAILED_RESULTS:
            # Fixed typo: "proessing" -> "processing".
            msg = '%s found errors processing %s' % (self.__class__.__name__, item)
            for ok, value in results:
                if not ok:
                    log.err(value, msg, spider=info.spider)

        bookfile_paths_urls = [(x['path'], x['url']) for ok, x in results if ok]
        bookfile_path_url = list_first_item(bookfile_paths_urls)
        if bookfile_path_url:
            # Store the absolute on-disk location of the downloaded book file.
            item['book_file'] = os.path.join(os.path.abspath(self.bookfile_store), bookfile_path_url[0])
            item['book_file_url'] = bookfile_path_url[1]
            return item

        # No successful result: retry with the next queued URL, if any.
        pending = self.item_download[item['original_url']]
        if pending:
            # Renamed from `next`, which shadowed the builtin.
            next_url = list_first_item(pending)
            self.item_download[item['original_url']] = pending[1:]
            return Request(next_url)
        return item
    def parse(self, response):
        """Walk the listing pages and schedule every book detail page."""
        hxs = HtmlXPathSelector(response)

        # "下一页" means "next page": follow pagination when present.
        raw_next = list_first_item(
            hxs.select(u'//div[@class="k2"]/div/a[text()="下一页"]/@href').extract())
        if raw_next:
            yield Request(
                url=clean_url(response.url, raw_next, response.encoding),
                callback=self.parse)

        # Each search-result entry links to a book detail page.
        for href in hxs.select(u'//div[contains(@class,"sousuolist")]/a/@href').extract():
            if href:
                yield Request(
                    url=clean_url(response.url, href, response.encoding),
                    callback=self.parse_detail)
Example #6
0
    def item_completed(self, results, item, info):
        """Record the absolute path of the downloaded cover image on the item.

        Sets ``book_covor_image_path`` to the stored file's absolute path,
        or "" when no image download succeeded.
        """
        if self.LOG_FAILED_RESULTS:
            # Fixed typo: "proessing" -> "processing".
            msg = '%s found errors processing %s' % (self.__class__.__name__, item)
            for ok, value in results:
                if not ok:
                    log.err(value, msg, spider=info.spider)

        image_paths = [x['path'] for ok, x in results if ok]
        image_path = list_first_item(image_paths)
        item['book_covor_image_path'] = os.path.join(os.path.abspath(self.images_store),
                                                     image_path) if image_path else ""

        return item
Example #7
0
    def get_media_requests(self, item, info):
        """Request at most one download file per book.

        Flattens every candidate URL list from ``book_download``,
        deduplicates them, issues a Request for the first one, and stashes
        the rest in ``self.item_download`` so item_completed() can retry.
        """
        # XXX: to test a specific url you can use:
        # return Request("http://down.wmtxt.com/wmtxt/wmtxt/UploadFile/2010-6/%A1%B6%D3%F6%BC%FB%C4%E3%A1%B7.rar")
        entries = item.get('book_download')
        if not entries:
            return

        url_lists = [entry['url'] for entry in entries if entry['url']]
        candidates = list(set(itertools.chain(*url_lists)))
        first_url = list_first_item(candidates)
        self.item_download[item['original_url']] = candidates[1:]
        if first_url:
            return Request(first_url)
Example #8
0
    def get_media_requests(self, item, info):
        """Request at most one download file per book.

        Flattens all candidate URL lists from ``book_download`` into one
        deduplicated pool, requests the first URL, and keeps the remainder
        in ``self.item_download`` keyed by the book's original URL so that
        item_completed() can retry with the next candidate on failure.
        """

        # XXX:To test specific url,you can use the following method:
        # return Request("http://down.wmtxt.com/wmtxt/wmtxt/UploadFile/2010-6/%A1%B6%D3%F6%BC%FB%C4%E3%A1%B7.rar")
        if item.get('book_download'):
            # Each entry's 'url' is itself a list; chain(*...) flattens them.
            # NOTE(review): set() makes the "first" URL order-dependent on
            # hashing, so the chosen mirror is effectively arbitrary.
            downloadfile_urls = [i['url'] for i in item.get('book_download') if i['url']]
            downloadfile_urls = list(set(itertools.chain(*downloadfile_urls)))
            first_download_file = list_first_item(downloadfile_urls)
            self.item_download[item['original_url']] = downloadfile_urls[1:]
            if first_download_file:
                return Request(first_download_file)
Example #9
0
    def item_completed(self, results, item, info):
        """Record the absolute path of the downloaded cover image on the item.

        Sets ``book_covor_image_path`` to the stored file's absolute path,
        or "" when no image download succeeded.
        """
        if self.LOG_FAILED_RESULTS:
            # Fixed typo: "proessing" -> "processing".
            msg = '%s found errors processing %s' % (self.__class__.__name__,
                                                     item)
            for ok, value in results:
                if not ok:
                    log.err(value, msg, spider=info.spider)

        image_paths = [x['path'] for ok, x in results if ok]
        image_path = list_first_item(image_paths)
        item['book_covor_image_path'] = os.path.join(
            os.path.abspath(self.images_store),
            image_path) if image_path else ""

        return item
    def parse(self, response):
        """Walk the listing pages, scheduling pagination and detail pages."""
        response_selector = HtmlXPathSelector(response)
        # "下一页" means "next page": follow pagination when present.
        next_link = list_first_item(
            response_selector.select(
                u'//div[@class="k2"]/div/a[text()="下一页"]/@href').extract())
        if next_link:
            next_link = clean_url(response.url, next_link, response.encoding)
            yield Request(url=next_link, callback=self.parse)

        # Each search-result entry links to a book detail page.
        for detail_link in response_selector.select(
                u'//div[contains(@class,"sousuolist")]/a/@href').extract():
            if detail_link:
                detail_link = clean_url(response.url, detail_link,
                                        response.encoding)
                yield Request(url=detail_link, callback=self.parse_detail)
Example #11
0
    def stat_file(self, key, info):
        """Return stat info for the file stored under ``key``.

        The key maps to a directory that must contain exactly one file;
        any other count is treated as a corrupt store entry and wiped.

        Returns a dict with 'last_modified' and 'checksum', or {} when the
        entry is corrupt or the path cannot be stat'ed (forcing a re-fetch).
        """
        keydir = os.path.join(self.basedir, *key.split('/'))
        filenames = os.listdir(keydir)
        if len(filenames) != 1:
            # Corrupt store entry: remove it so the file is re-downloaded.
            shutil.rmtree(keydir, True)
            return {}
        else:
            filename = list_first_item(filenames)

        absolute_path = self._get_filesystem_path(key)
        try:
            last_modified = os.path.getmtime(absolute_path)
        except OSError:
            # Path vanished or is unreadable (os.path.getmtime raises
            # OSError); was a bare `except:` flagged by the old FIXME.
            return {}

        with open(os.path.join(absolute_path, filename), 'rb') as file_content:
            checksum = md5sum(file_content)

        return {'last_modified': last_modified, 'checksum': checksum}
Example #12
0
    def stat_file(self, key, info):
        """Return stat info for the file stored under ``key``.

        The key maps to a directory that must contain exactly one file;
        any other count is treated as a corrupt store entry and wiped.

        Returns a dict with 'last_modified' and 'checksum', or {} when the
        entry is corrupt or the path cannot be stat'ed (forcing a re-fetch).
        """
        keydir = os.path.join(self.basedir, *key.split('/'))
        filenames = os.listdir(keydir)
        if len(filenames) != 1:
            # Corrupt store entry: remove it so the file is re-downloaded.
            shutil.rmtree(keydir, True)
            return {}
        else:
            filename = list_first_item(filenames)

        absolute_path = self._get_filesystem_path(key)
        try:
            last_modified = os.path.getmtime(absolute_path)
        except OSError:
            # Path vanished or is unreadable (os.path.getmtime raises
            # OSError); was a bare `except:` flagged by the old FIXME.
            return {}

        with open(os.path.join(absolute_path, filename), 'rb') as file_content:
            checksum = md5sum(file_content)

        return {'last_modified': last_modified, 'checksum': checksum}