def parse(self, response): xs = Selector(response) base_objects = [] base_elem = self.scraper.get_base_elem() rpt = response.request.meta['rpt'] page_num = response.request.meta['page_num'] page = self.pages[page_num - 1] follow_page_num = response.request.meta['follow_page_num'] if rpt.page_type == 'MP': if self.current_output_num_mp_response_bodies < self.conf[ 'OUTPUT_NUM_MP_RESPONSE_BODIES']: self.current_output_num_mp_response_bodies += 1 self.log( "Response body ({url})\n\n***** RP_MP_{num}_START *****\n{resp_body}\n***** RP_MP_{num}_END *****\n\n" .format(url=response.url, resp_body=response.body, num=self.current_output_num_mp_response_bodies), logging.INFO) if rpt.content_type == 'J': json_resp = None try: json_resp = json.loads(response.body_as_unicode()) except ValueError: msg = "JSON response for MP could not be parsed!" self.log(msg, logging.ERROR) if json_resp: try: jsonpath_expr = parse(base_elem.x_path) except JsonPathLexerError: msg = "JsonPath for base elem could not be processed!" self.dds_logger.error(msg) raise CloseSpider() base_objects = [ match.value for match in jsonpath_expr.find(json_resp) ] if len(base_objects) > 0: base_objects = base_objects[0] else: base_objects = response.xpath(base_elem.x_path) if (len(base_objects) == 0): self.log( "{cs}No base objects found.{ce}".format( cs=self.bcolors["INFO"], ce=self.bcolors["ENDC"]), logging.ERROR) if (self.conf['MAX_ITEMS_READ']): items_left = min( len(base_objects), self.conf['MAX_ITEMS_READ'] - self.items_read_count) base_objects = base_objects[0:items_left] for obj in base_objects: item_num = self.items_read_count + 1 self.tmp_non_db_results[item_num] = {} page_str = str(page_num) + '(' + str(follow_page_num) + ')' self.dds_logger.info("") self.dds_logger.info( self.bcolors['BOLD'] + '--------------------------------------------------------------------------------------' + self.bcolors['ENDC']) self.struct_log( "{cs}Starting to crawl item {i} from page {p}.{ce}".format( i=str(item_num), p=page_str, cs=self.bcolors["HEADER"], ce=self.bcolors["ENDC"])) self.dds_logger.info( self.bcolors['BOLD'] + '--------------------------------------------------------------------------------------' + self.bcolors['ENDC']) item = self.parse_item(response, obj, rpt.page_type, item_num) item._dds_item_page = page item._dds_item_page_num = page_num item._dds_item_follow_page_num = follow_page_num item._dds_item_id = item_num item._dds_id_str = str(item._dds_item_page_num) + '(' + str( item._dds_item_follow_page_num) + ')-' + str(item._dds_item_id) if item: only_main_page_idfs = True idf_elems = self.scraper.get_id_field_elems() for idf_elem in idf_elems: if idf_elem.request_page_type != 'MP': only_main_page_idfs = False is_double = False if only_main_page_idfs: item, is_double = self._check_for_double_item(item) # Don't go on reading detail pages when... 
# No detail page URLs defined or # DOUBLE item with only main page IDFs and no standard update elements to be scraped from detail pages or # generally no attributes scraped from detail pages cnt_sue_detail = self.scraper.get_standard_update_elems_from_detail_pages( ).count() cnt_detail_scrape = self.scraper.get_from_detail_pages_scrape_elems( ).count() if self.scraper.get_detail_page_url_elems().count() == 0 or \ (is_double and cnt_sue_detail == 0) or cnt_detail_scrape == 0: self.non_db_results[id( item)] = self.tmp_non_db_results[item_num].copy() yield item else: #self.run_detail_page_request() url_elems = self.scraper.get_detail_page_url_elems() for url_elem in url_elems: if not url_elem.scraped_obj_attr.save_to_db: url_before = self.tmp_non_db_results[item_num][ url_elem.scraped_obj_attr.name] url, applied = self._replace_placeholders( url_before, item, item_num, True) self.tmp_non_db_results[item_num][ url_elem.scraped_obj_attr.name] = url else: url_before = item[url_elem.scraped_obj_attr.name] url, applied = self._replace_placeholders( url_before, item, item_num, True) item[url_elem.scraped_obj_attr.name] = url if len(applied) > 0: msg = "Detail page URL placeholder(s) applied (item {id}): {a}".format( a=str(applied), id=item._dds_id_str) self.log(msg, logging.DEBUG) self.log("URL before: " + url_before, logging.DEBUG) self.log("URL after : " + url, logging.DEBUG) dp_rpt = self.scraper.get_rpt_for_scraped_obj_attr( url_elem.scraped_obj_attr) kwargs = self.dp_request_kwargs[ dp_rpt.page_type].copy() if 'meta' not in kwargs: kwargs['meta'] = {} kwargs['meta']['page_num'] = page_num kwargs['meta']['follow_page_num'] = follow_page_num kwargs['meta']['item'] = item kwargs['meta']['from_page'] = dp_rpt.page_type kwargs['meta']['item_num'] = item_num kwargs['meta']['rpt'] = dp_rpt if 'headers' in kwargs: kwargs['headers'] = self._do_req_info_replacements( item, item_num, page, kwargs['headers'], "HEADERS") if 'body' in kwargs: body_before = kwargs['body'] kwargs['body'] = kwargs['body'].replace( '{page}', str(page)) kwargs[ 'body'], applied = self._replace_placeholders( kwargs['body'], item, item_num, True) if len(applied) > 0: msg = "Request info placeholder(s) applied (item {id}): {a}".format( a=str(applied), id=item._dds_id_str) self.log(msg, logging.DEBUG) self.log("BODY before: " + body_before, logging.DEBUG) self.log("BODY after : " + kwargs['body'], logging.DEBUG) if 'cookies' in kwargs: kwargs['cookies'] = self._do_req_info_replacements( item, item_num, page, kwargs['cookies'], "COOKIES") form_data = None if dp_rpt.request_type == 'F' and dp_rpt.form_data: form_data = json.loads(dp_rpt.form_data).copy() form_data = self._do_req_info_replacements( item, item_num, page, form_data, "FORM DATA") if url_elem == url_elems[len(url_elems) - 1]: kwargs['meta']['last'] = True else: kwargs['meta']['last'] = False self._set_meta_splash_args() #logging.info(str(kwargs)) self.log( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", logging.INFO) msg = "{cs}Calling {dp} URL for item {id}...{ce}".format( dp=dp_rpt.page_type, id=item._dds_id_str, cs=self.bcolors["HEADER"], ce=self.bcolors["ENDC"]) self.log(msg, logging.INFO) msg = "URL : {url}".format(url=url) self.log(msg, logging.INFO) self._log_request_info(dp_rpt, form_data, kwargs) self.log( ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", logging.INFO) if dp_rpt.request_type == 'R': yield response.follow( url, callback=self.parse_item, method=dp_rpt.method, 
dont_filter=dp_rpt.dont_filter, **kwargs) else: yield FormRequest(url, callback=self.parse_item, method=dp_rpt.method, formdata=form_data, dont_filter=dp_rpt.dont_filter, **kwargs) for key, value in list(item.items()): #Fixing some extremely weird Python 2 encoding failure, 2017-06-29 if type(value).__name__ == 'str': try: value = value.decode('utf-8') except AttributeError: pass if value and (type(value).__name__ in ['str', 'unicode']) and '{page}' in value: msg = "Applying page placeholder on {k}...".format( k=key) self.log(msg, logging.DEBUG) self.log("Value before: " + value, logging.DEBUG) value = value.replace('{page}', str(page)) item[key] = value self.log("Value after: " + value, logging.DEBUG) else: self.log("Item could not be read!", logging.ERROR) mir_reached = False if self.conf['MAX_ITEMS_READ'] and ( self.conf['MAX_ITEMS_READ'] - self.items_read_count <= 0): mir_reached = True if self.scraper.follow_pages_url_xpath and not mir_reached: if not self.conf['NUM_PAGES_FOLLOW'] or follow_page_num < self.conf[ 'NUM_PAGES_FOLLOW']: url = response.xpath( self.scraper.follow_pages_url_xpath).extract_first() if url is not None: self._set_meta_splash_args() follow_page = '' if self.scraper.follow_pages_page_xpath: follow_page = response.xpath( self.scraper.follow_pages_page_xpath ).extract_first() form_data_orig = None if self.scraper.get_follow_page_rpts().count() > 0: f_rpt = self.scraper.get_follow_page_rpts()[0] form_data_orig = self.scraper.get_follow_page_rpts( )[0].form_data else: f_rpt = self.scraper.get_main_page_rpt() form_data_orig = self.scraper.get_main_page_rpt( ).form_data kwargs, form_data = self._prepare_mp_req_data( self.fp_request_kwargs, form_data_orig, page, follow_page) follow_page_num += 1 kwargs['meta']['page_num'] = page_num kwargs['meta']['follow_page_num'] = follow_page_num kwargs['meta']['rpt'] = f_rpt self._log_page_info(page_num, follow_page_num, url, f_rpt, form_data, kwargs) if f_rpt.request_type == 'R': yield response.follow(url, callback=self.parse, method=f_rpt.method, dont_filter=f_rpt.dont_filter, **kwargs) else: url = response.urljoin(url) yield FormRequest(url, callback=self.parse, method=f_rpt.method, formdata=form_data, dont_filter=f_rpt.dont_filter, **kwargs)
def parse(self, response):
    type = response.css('head').xpath(
        './meta[@property="og:type"]/@content').get().split('.')[1]
    if not type == "movie":
        yield None
        return
    titleSection = response.css('.subpage_title_block')
    if titleSection is None:
        yield None
        return
    idMovie = titleSection.css(".parent").xpath(
        "./h3/a/@href").get().split('/')[2]
    if idMovie in self.moviesScrapped:
        yield None
        return
    movieYear = titleSection.css('.nobr').xpath(
        './text()').get().strip().replace(')', '(').split('(')[1].split(' ')[0]
    if movieYear is None:
        yield None
        return
    if (int(movieYear) < 1980 or int(movieYear) > 1989):
        yield None
        return
    movieName = titleSection.xpath('./div/h3/a/text()').get()
    actorList = response.css('.cast_list').xpath('./tr')[1::]
    nextScrap = []
    for c in actorList:
        if self.documentscount >= 5000:
            yield None
            raise CloseSpider('Number of documents reached')
        if (len(c.xpath('./td').getall()) < 3):
            continue
        actorURL = c.xpath('./td/a/@href').get()
        actorId = actorURL.split('/')[2]
        actorName = c.xpath(
            './td[@class="primary_photo"]//a/img/@alt').get()
        actorRole = c.xpath(
            './td[@class="character"]/text()').get().strip().replace("\n", "")
        if actorRole == '':
            actorRole = c.xpath('./td[@class="character"]/a/text()').get()
        if actorURL is not None:
            nextScrap.append({
                "url": self.allowed_domains[0] + actorURL,
                "id": actorId
            })
        yield {
            "movie_id": idMovie,
            "movie_name": movieName,
            "movie_year": movieYear,
            "actor_name": actorName,
            "actor_id": actorId,
            "role_name": actorRole
        }
        self.documentscount = self.documentscount + 1
    self.moviesScrapped.append(idMovie)
    for a in nextScrap:
        if a['id'] not in self.actorsScrapped:
            self.actorsScrapped.append(a['id'])
            next_page = "https://" + a['url']
            yield Request(next_page, callback=self.parse_artist)
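# Added note: the parse() callback above relies on spider-level state (self.moviesScrapped,
# self.actorsScrapped, self.documentscount) and a parse_artist callback that are not shown in
# this excerpt. The sketch below is an assumption of how that state could be initialized; the
# class name, domain list, and parse_artist stub are illustrative only, not from the original.
import scrapy


class ImdbDecadeSpider(scrapy.Spider):
    name = "imdb_decade"                  # hypothetical spider name
    allowed_domains = ["www.imdb.com"]    # self.allowed_domains[0] is used above to build actor URLs

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.moviesScrapped = []    # movie ids already yielded
        self.actorsScrapped = []    # actor ids already queued for parse_artist
        self.documentscount = 0     # records yielded so far (CloseSpider raised at 5000)

    def parse_artist(self, response):
        # placeholder for the actor-page callback requested above
        pass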
def parse_page(self, response): # inspect_response(response, self) if 'reaction_units/more' in response.url: json_data = json.loads(response.body_as_unicode().replace( 'for (;;);', '')) post_html = json_data.get('domops')[0][-1].get('__html') structural_json_data = self._create_structed_json_data(json_data) else: main_content_id = response.css( '#pagelet_timeline_main_column>div::attr(id)').extract_first() if not main_content_id: raise CloseSpider('Main content id not found') main_script = response.xpath( f'//script/text()[contains(.,"{main_content_id}") ' f'and contains(.,"content:")]').extract_first() main_id = re.search(r'container_id\:"(.*?)"', main_script).group(1) post_html = response.css(f'#{main_id}').extract_first() post_html = post_html.replace('-->', '').replace('<!--', '') sel = Selector(text=post_html) posts = sel.xpath( '//div[@class="_1xnd"]' '/div[@class and not(descendant::*[contains(@class,"uiMorePagerPrimary")])]' ) page_name = response.meta.get('page_name') or \ response.css('#pageTitle::text').extract_first() page_name = page_name.split('-')[0].rstrip() page_id = response.meta.get('page_id') for post in posts: loader = FacebookPostItemLoader(selector=post) loader.add_value('page_name', page_name) loader.add_value('page_id', page_id) loader.add_css('post_id', 'input[name*="identifier"]::attr(value)') post_id = loader.get_output_value('post_id') loader.add_value( 'post_url', f'https://www.facebook.com/{page_id}/posts/{post_id}') loader.add_xpath( 'post_text', './/div[@data-testid="post_message"]' '//text()[not(ancestor::span[@class="text_exposed_hide"])]') loader.add_css('image_urls', '.mtm a::attr(data-ploi)', MapCompose(lambda v: v.split('?')[0])) loader.add_css( 'video_url', '.fsm>a::attr(href)', MapCompose(response.urljoin, lambda v: v if 'videos' in v else None, lambda v: v.split('?')[0])) if 'reaction_units/more' in response.url: post_json_data = structural_json_data.get(post_id) else: # inspect_response(response, self) post_script = response.xpath( f'//script/text()[contains(.,"{post_id}") ' f'and (contains(.,"post_fbid") or contains(.,"photo_fbid"))]' ).extract_first() post_script = re.search( r'onPageletArrive\((\{.*\})', post_script).group(1).split('all_phases')[0] + '}' json_data = demjson.decode(post_script) json_data = json_data.get('jsmods').get( 'pre_display_requires')[0][3][1].get('__bbox') variables = json_data.get('variables') post_json_data = json_data.get('result').get('data').get( 'feedback') loader.add_value( 'comment_count', post_json_data.get('comment_count').get('total_count')) loader.add_value('reaction_count', post_json_data.get('reaction_count').get('count')) loader.add_value('share_count', post_json_data.get('share_count').get('count')) comment_json = post_json_data.get('display_comments') edges = comment_json.get('edges') for edge in edges: comment_loader = FacebookCommentItemLoader() node = edge.get('node') comment_loader.add_value('comment_id', node.get('id')) try: comment_loader.add_value('comment_text', node.get('body').get('text')) except AttributeError: pass author = node.get('author') comment_loader.add_value('author_name', author.get('name')) comment_loader.add_value('author_id', author.get('id')) comment_loader.add_value('author_url', author.get('www_url')) loader.add_value('comments', comment_loader.load_item()) yield loader.load_item() # TODO: Fetch first 50 comments # page_info = comment_json.get('page_info') # has_next_comment_page = page_info.get('has_next_page') # if has_next_comment_page: # end_cursor = 
page_info.get('end_cursor') # variables['after'] = end_cursor # variables['before'] = None # # # yield Request( # # url='https://www.facebook.com/api/graphql/', # # method='POST', # # body=json.dumps(body), # # callback=self.parse_next_comment, # # headers=headers, # # ) async_get_token = response.xpath( '//script/text()[contains(.,"async_get_token")]').extract_first( ) or response.body_as_unicode() async_get_token = re.search(r'"async_get_token"\:"(.*?)"', async_get_token).group(1) next_page = sel.css( '.uiMorePagerPrimary::attr(ajaxify)').extract_first() if next_page: next_url = response.urljoin(next_page) extra_params = urllib.parse.urlencode({ '__a': 1, 'fb_dtsg_ag': async_get_token }) next_url += '&' + extra_params yield Request(next_url, callback=self.parse_page, meta={ 'page_name': page_name, 'page_id': page_id })
class ZoominTvSpider(scrapy.Spider):
    name = "zoomin.tv"
    allowed_domains = ["zoomin.tv"]
    callbacked = False
    pids = [
        'corporateusahddp', 'corporateuk', 'corporateke', 'corporatees',
        'corporatelatamdp', 'corporatecataldp', 'corporatenl', 'corporatevla',
        'corporatede', 'corporateit', 'corporatefr', 'corporatewal',
        'corporatebradp', 'corporatetr', 'corporateswedp', 'corporateru',
        'corporatejp', 'corporatechinacndp', 'corporatearabdp'
    ]

    # start_urls = (
    #     'http://www.zoomin.tv/',
    # )
    # http://blackbird.zoomin.tv/ProgramXml/.json?feedtype=json&pid=corporateusahddp&vtype=direct&aid=754116
    # http://zoomin.tv/video/#!v/754116/

    def __init__(self, url, uuid, upload_url, callback, check_video_url=None, *args, **kwargs):
        super(ZoominTvSpider, self).__init__(*args, **kwargs)
        print 'init', url
        self.config = ConfigParser.ConfigParser()
        self.config.read("config/config.ini")
        self.uuid = uuid
        self.upload_url = upload_url
        self.callback = callback
        self.check_video_url = check_video_url
        # initialize db
        with open("config/database.cnf") as f:
            config = json.load(f)
        db_cls = get_database(config.get("database_type", None))
        self.db = db_cls(**config.get("database", {}))
        self.start_urls.append(url)

    def parse(self, response):
        print 'parsePlayurl', response.url
        try:
            video_id = self._match_id(self.start_urls[0])
        except AssertionError, e:
            raise CloseSpider('link not supported')
        logger.warn('[parse]' + self.start_urls[0] + ' [uuid]' + self.uuid + ' [video_id]' + video_id)
        if self.check_db():
            return
        video = None
        for pid in self.pids:
            getinfo_url = 'http://blackbird.zoomin.tv/ProgramXml/.json?feedtype=json&pid=%s&vtype=direct&aid=%s' % (
                pid, video_id)
            resp = requests.get(getinfo_url)
            info = resp.json()
            print info
            if len(info['programme']) > 0:
                video = info['programme'][0]
                break
        video_url = video['videourl']
        endpoint, backet, obj = service.utils.paseUploadUrl(self.upload_url)
        print endpoint, backet, obj
        result = service.utils.uploadVideoByUrl(video_url, endpoint, backet, obj)
        if not result:
            raise CloseSpider('upload oss failed')
        filesize = video['videosize']
        length = int(video['videoduration']) / 1000.0
        title = video['title']
        print 'filesize', filesize
        # callback
        data = {
            "video_id": self.uuid,
            "state": 1,
            "message": u'成功',
            "length": length,
            "play_id": self.uuid,
            "size": filesize,
            "cover": '',
            "title": title
        }
        self.callbacked = service.utils.callback_result(self.callback, data=data)
        logger.info('[finished]' + str(self.callbacked) + '[uuid]' + self.uuid)
        video_data = {
            'title': title,
            'video_id': video_id,
            'author': self.name,
            'publish': time.strftime('%Y-%m-%d %H:%M:%S'),
            'page_url': self.start_urls[0],
            'video_length': length,
            'video_size': filesize,
            'video_url': video_url,
            'easub_uuid': self.uuid
        }
        self.db.save_video(video_data)
def parse(self, response):
    # x_path test
    checker = response.request.meta['checker']
    rpt = response.request.meta['rpt']
    if self.conf['OUTPUT_RESPONSE_BODY']:
        self.log(
            "Response body ({url})\n\n***** RP_START *****\n{resp_body}\n***** RP_END *****\n\n"
            .format(url=response.url, resp_body=response.body.decode('utf-8')),
            logging.INFO)

    if checker.checker_type == '4':
        self.log(
            "{cs}No 404 result ({c} checker type).{ce}".format(
                c=str(checker), cs=self.bcolors["OK"], ce=self.bcolors["ENDC"]),
            logging.INFO)
        if self.conf['DO_ACTION']:
            self.dds_logger.info("{cs}Item kept.{ce}".format(
                cs=self.bcolors["OK"], ce=self.bcolors["ENDC"]))
        return

    if rpt.content_type == 'J':
        json_resp = json.loads(response.body_as_unicode())
        try:
            jsonpath_expr = parse(checker.checker_x_path)
        except JsonPathLexerError:
            msg = "Invalid checker JSONPath ({c})!".format(c=str(checker))
            self.dds_logger.error(msg)
            raise CloseSpider()
        test_select = [match.value for match in jsonpath_expr.find(json_resp)]
        #self.log(unicode(test_select), logging.INFO)
    else:
        try:
            test_select = response.xpath(checker.checker_x_path).extract()
        except ValueError:
            self.log("Invalid checker XPath ({c})!".format(c=str(checker)),
                     logging.ERROR)
            return

    if len(test_select) > 0 and checker.checker_x_path_result == '':
        self.log(
            "{cs}Elements for XPath found on page (no result string defined) ({c}). Delete reason.{ce}"
            .format(c=str(checker), cs=self.bcolors["ERROR"], ce=self.bcolors["ENDC"]),
            logging.INFO)
        if self.conf['DO_ACTION']:
            self._del_ref_object()
        return
    elif len(test_select) > 0 and test_select[0] == checker.checker_x_path_result:
        self.log(
            "{cs}XPath result string '{s}' found on page ({c}). Delete reason.{ce}"
            .format(s=checker.checker_x_path_result, c=str(checker),
                    cs=self.bcolors["ERROR"], ce=self.bcolors["ENDC"]),
            logging.INFO)
        if self.conf['DO_ACTION']:
            self._del_ref_object()
        return
    else:
        self.log(
            "{cs}XPath result string not found ({c}).{ce}".format(
                c=str(checker), cs=self.bcolors["OK"], ce=self.bcolors["ENDC"]),
            logging.INFO)
        if self.conf['DO_ACTION']:
            self.dds_logger.info("{cs}Item kept.{ce}".format(
                cs=self.bcolors["OK"], ce=self.bcolors["ENDC"]))
        return
def parse(self, response): ''' this part parses the response, then call the request again for the next pages and so on ''' print("") print("") print("") print("") print(" ======== " + self.name + " from " + str(self.page_0) + " to " + str(self.page_1) + " ========") print("page ============================ ", str(self.page)) print("page ============================ ", str(self.page)) print("iterations =======================", str(self.iters)) print("timestamp ======================= ", datetime.datetime.now()) print("time since start ==================== ", datetime.datetime.now() - self.start_time) #uncomment below to check IP one by one #yield scrapy.Request('http://checkip.dyndns.org/', headers = {'Connection': 'close'}, callback=self.check_ip, dont_filter = True) #uncomment to check IP one by one #randomizes the user agents to make detection harder ua_files = open('ua_files.txt').read().splitlines() user_agents = random.choice(ua_files) url = self.url.replace('__pagenum__', str((self.page * 50))) print('attempts on this page ============================', str(self.attempts + 1)) print("user agent ====================", user_agents) #headers for the request, might need checking once in a while whether it match the actual request headers headers = { 'accept': '*/*', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'en-US,en;q=0.9' #,'if-none-match-' : '55b03-20443a68390f59aa1bc448bc3b42fa6e' , 'referer': self.referer.replace('__pagenum__', str(self.page)), 'sec-fetch-dest': 'empty', 'sec-fetch-mode': 'cors', 'sec-fetch-site': 'same-origin', 'user-agent': user_agents, 'x-api-source': 'pc', 'x-requested-with': 'XMLHttpRequest', 'Connection': 'close' } yield scrapy.Request(url=url, callback=self.parse, headers=headers, dont_filter=True) data = json.loads(response.text) #print(data) #every region has different error patterns, as shown below. Need to be checked manually #give up after 50 tries, also the possibility of the category have less than 160 pages if self.attempts <= 10: if self.region in ['id', 'vn', 'th']: self.check_corrupt(data=data, zeroes=5, pagenum=50) elif self.region == 'ph': self.check_corrupt(data=data, zeroes=5, pagenum=45) elif self.region == 'my': self.check_corrupt(data=data, zeroes=3, pagenum=50) print("data corrupted ================ ", self.corrupt) else: print( "data corrupted ================ but gave up trying on this page" ) self.corrupt = 0 self.breaker = 1 pass print("data corrupted ================ ", self.corrupt) self.iters += 1 #if hit max page, that call the cleaning function ##if you're using unbatched pagination and wants to use single process cleaning, uncomment the cleaning func ##if you're using unbatched pagination and wants to use single process cleaning with an integrated dataframe (ie not reading the entire printed JSON object), uncomment the df processs ##the integrated df method should be more pythonic and efficient. 
It is still somewhat unstable though, use with caution if self.page >= self.page_max + 1: #if you're using unbatched pagination and wants to use single process cleaning, uncomment this #cleaning(self.name, self.output, self.region, self.category, self.subcategory, self.subsubcategory) ##if you're using unbatched pagination and wants to use single process cleaning with an integrated dataframe (ie not reading the entire printed JSON object), uncomment this '''if 'rank' not in self.df: self.df['rank'] = np.arange(len(self.df)) print(' =================== raw' + self.name + '.csv') self.df.to_csv('raw' + self.name + '.csv', index=False) print(' =================== raw' + self.name + '.csv')''' raise CloseSpider("====MAX PAGE HAS BEEN REACHED!==== ") ##if you're using unbatched pagination and wants to use single process cleaning, you can comment out this entire elif part as it becomes redundant elif self.page >= self.page_1: #self.cleaning(self.name, self.output, self.region, self.category, self.subcategory, self.subsubcategory) '''if 'rank' not in self.df: self.df['rank'] = np.arange(len(self.df)) print(' =================== raw' + self.name + '.csv') self.df.to_csv('raw' + self.name + '.csv', index=False) print(' =================== raw' + self.name + '.csv')''' raise CloseSpider("====MAX PAGE HAS BEEN REACHED!==== ") #if error occurs, and max threshold is hit, print JSON as-is elif self.corrupt == 0 and self.breaker == 1: cleaning(self.name, self.output, self.region, self.category, self.subcategory, self.subsubcategory) if data['items'] is None: raise CloseSpider("====NO DATA IS RETURNED!==== ") elif data['query_rewrite'] is None: raise CloseSpider("====DATA CORRUPTED!====") else: raise CloseSpider("==== UNKNOWN ERROR ==== ") #if error occurs, print out the corresponding error types, then loop to scrapthe same page again elif self.corrupt == 1: self.attempts += 1 print("Something went wrong!, retry attempts ===== ", self.attempts) if data['items'] is None: print( "Error =========== data[item] is None, no data is returned!" 
) time.sleep(5) elif self.corrupt == 1: print( "Error =========== data is corrupted!, retrying in 5 secs") time.sleep(5) #if OK, then print the acquired JSON data to a JSON file to be compiled later by the cleaning function else: with open(os.path.join( 'raw_shopee/raw_shopee_' + self.region + '/' + self.name, 'data_q_' + str(self.page) + '.json'), 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=4) #data = data['items'] #data.update({'page':self.page}) #data.update({'rating_star': obj['items'][k]['item_rating']['rating_star']}) #data.update({'timestamp':pd.datetime.now().replace(microsecond=0)}) #df = pd.DataFrame ##if you're using unbatched pagination and wants to use single process cleaning with an integrated dataframe (ie not reading the entire printed JSON object), uncomment this section '''k=0 for j in data['items']: j.update({'page_num':self.page}) j.update({'rating_star': data['items'][k]['item_rating']['rating_star']}) j.update({'timestamp':pd.datetime.now().replace(microsecond=0)}) for m in range(0,6): j.update({'star_' + str(5-m): data['items'][k]['item_rating']['rating_count'][m]}) self.df = self.df.append(j, ignore_index = True) k+=1 #if 'rank' not in df: # df['rank'] = np.arange(len(data)) if 'category' not in self.df: self.df['category'] = self.category if 'subcategory' not in self.df: self.df['subcategory'] = self.subcategory if 'subsubcategory' not in self.df: self.df['subsubcategory'] = self.subsubcategory if 'platform' not in self.df: self.df['platform'] = 'shopee' if 'region' not in data: self.df['region'] =self.region if 'engine_ver' not in self.df: self.df['engine_ver'] = 'v0.4.2' #print(self.df)''' self.page += 1 self.attempts = 0
def parse_weibo(self, response): """解析网页中的微博信息""" keyword = response.meta.get('keyword') for sel in response.xpath("//div[@class='card-wrap']"): info = sel.xpath( "div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']" ) if info: weibo = WeiboItem() weibo['id'] = sel.xpath('@mid').extract_first() weibo['bid'] = sel.xpath( '(.//p[@class="from"])[last()]/a[1]/@href').extract_first( ).split('/')[-1].split('?')[0] weibo['user_id'] = info[0].xpath( 'div[2]/a/@href').extract_first().split('?')[0].split( '/')[-1] weibo['screen_name'] = info[0].xpath( 'div[2]/a/@nick-name').extract_first() txt_sel = sel.xpath('.//p[@class="txt"]')[0] retweet_sel = sel.xpath('.//div[@class="card-comment"]') retweet_txt_sel = '' if retweet_sel and retweet_sel[0].xpath('.//p[@class="txt"]'): retweet_txt_sel = retweet_sel[0].xpath( './/p[@class="txt"]')[0] content_full = sel.xpath( './/p[@node-type="feed_list_content_full"]') is_long_weibo = False is_long_retweet = False if content_full: if not retweet_sel: txt_sel = content_full[0] is_long_weibo = True elif len(content_full) == 2: txt_sel = content_full[0] retweet_txt_sel = content_full[1] is_long_weibo = True is_long_retweet = True elif retweet_sel[0].xpath( './/p[@node-type="feed_list_content_full"]'): retweet_txt_sel = retweet_sel[0].xpath( './/p[@node-type="feed_list_content_full"]')[0] is_long_retweet = True else: txt_sel = content_full[0] is_long_weibo = True weibo['text'] = txt_sel.xpath( 'string(.)').extract_first().replace('\u200b', '').replace( '\ue627', '') weibo['article_url'] = self.get_article_url(txt_sel) weibo['location'] = self.get_location(txt_sel) if weibo['location']: weibo['text'] = weibo['text'].replace( '2' + weibo['location'], '') weibo['text'] = weibo['text'][2:].replace(' ', '') if is_long_weibo: weibo['text'] = weibo['text'][:-6] weibo['at_users'] = self.get_at_users(txt_sel) weibo['topics'] = self.get_topics(txt_sel) reposts_count = sel.xpath( './/a[@action-type="feed_list_forward"]/text()' ).extract_first() try: reposts_count = re.findall(r'\d+.*', reposts_count) except TypeError: print('cookie无效或已过期,请按照' 'https://github.com/dataabc/weibo-search#如何获取cookie' ' 获取cookie') raise CloseSpider() weibo['reposts_count'] = reposts_count[ 0] if reposts_count else '0' comments_count = sel.xpath( './/a[@action-type="feed_list_comment"]/text()' ).extract_first() comments_count = re.findall(r'\d+.*', comments_count) weibo['comments_count'] = comments_count[ 0] if comments_count else '0' attitudes_count = sel.xpath( '(.//a[@action-type="feed_list_like"])[last()]/em/text()' ).extract_first() weibo['attitudes_count'] = (attitudes_count if attitudes_count else '0') created_at = sel.xpath( '(.//p[@class="from"])[last()]/a[1]/text()').extract_first( ).replace(' ', '').replace('\n', '').split('前')[0] weibo['created_at'] = util.standardize_date(created_at) source = sel.xpath('(.//p[@class="from"])[last()]/a[2]/text()' ).extract_first() weibo['source'] = source if source else '' pics = '' is_exist_pic = sel.xpath( './/div[@class="media media-piclist"]') if is_exist_pic: pics = is_exist_pic[0].xpath('ul[1]/li/img/@src').extract() pics = [pic[2:] for pic in pics] pics = [ re.sub(r'/.*?/', '/large/', pic, 1) for pic in pics ] pics = ['http://' + pic for pic in pics] video_url = '' is_exist_video = sel.xpath( './/div[@class="thumbnail"]/a/@action-data') if is_exist_video: video_url = is_exist_video.extract_first() video_url = unquote( str(video_url)).split('video_src=//')[-1] video_url = 'http://' + video_url if not retweet_sel: 
weibo['pics'] = pics weibo['video_url'] = video_url else: weibo['pics'] = '' weibo['video_url'] = '' weibo['retweet_id'] = '' if retweet_sel and retweet_sel[0].xpath( './/div[@node-type="feed_list_forwardContent"]/a[1]'): retweet = WeiboItem() retweet['id'] = retweet_sel[0].xpath( './/a[@action-type="feed_list_like"]/@action-data' ).extract_first()[4:] retweet['bid'] = retweet_sel[0].xpath( './/p[@class="from"]/a/@href').extract_first().split( '/')[-1].split('?')[0] info = retweet_sel[0].xpath( './/div[@node-type="feed_list_forwardContent"]/a[1]' )[0] retweet['user_id'] = info.xpath( '@href').extract_first().split('/')[-1] retweet['screen_name'] = info.xpath( '@nick-name').extract_first() retweet['text'] = retweet_txt_sel.xpath( 'string(.)').extract_first().replace('\u200b', '').replace( '\ue627', '') retweet['article_url'] = self.get_article_url( retweet_txt_sel) retweet['location'] = self.get_location(retweet_txt_sel) if retweet['location']: retweet['text'] = retweet['text'].replace( '2' + retweet['location'], '') retweet['text'] = retweet['text'][2:].replace(' ', '') if is_long_retweet: retweet['text'] = retweet['text'][:-6] retweet['at_users'] = self.get_at_users(retweet_txt_sel) retweet['topics'] = self.get_topics(retweet_txt_sel) reposts_count = retweet_sel[0].xpath( './/ul[@class="act s-fr"]/li/a[1]/text()' ).extract_first() reposts_count = re.findall(r'\d+.*', reposts_count) retweet['reposts_count'] = reposts_count[ 0] if reposts_count else '0' comments_count = retweet_sel[0].xpath( './/ul[@class="act s-fr"]/li[2]/a[1]/text()' ).extract_first() comments_count = re.findall(r'\d+.*', comments_count) retweet['comments_count'] = comments_count[ 0] if comments_count else '0' attitudes_count = retweet_sel[0].xpath( './/a[@action-type="feed_list_like"]/em/text()' ).extract_first() retweet['attitudes_count'] = (attitudes_count if attitudes_count else '0') created_at = retweet_sel[0].xpath( './/p[@class="from"]/a[1]/text()').extract_first( ).replace(' ', '').replace('\n', '').split('前')[0] retweet['created_at'] = util.standardize_date(created_at) source = retweet_sel[0].xpath( './/p[@class="from"]/a[2]/text()').extract_first() retweet['source'] = source if source else '' retweet['pics'] = pics retweet['video_url'] = video_url retweet['retweet_id'] = '' yield {'weibo': retweet, 'keyword': keyword} weibo['retweet_id'] = retweet['id'] # print(weibo) yield {'weibo': weibo, 'keyword': keyword}
def parse_detail(self, response): if self.killed: raise CloseSpider("Spider already died.") if not response.body: self.error_count += 1 if self.error_count >= self.error_threshold: self.logger.error('[ JobPageRequestException ] {url}'.format(url=response.url.encode('utf-8'))) self.sqllogger.log_error_page( hash_code = hash_dn(response.url.encode('utf-8'),datetime.now().strftime('%Y%m%d%H%M%S')), web_id = self.web_id, url = response.url.encode('utf-8'), meta = response.meta, html_path = html_path, crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S'), job_status = 'FAILED', error_message= "Empty request's response" ) yield None return if self.use_proxy: proxy = choice(self.proxies) self.logger.info('[ JobPageRetry ] {url} with proxy {proxy}'.format(url=response.url.encode('utf-8'), proxy=proxy)) yield scrapy.Request(response.url, callback=self.parse_detail , meta={'proxy': proxy}) return else: self.logger.info('[ JobPageRetry ] {url}'.format(url=response.url.encode('utf-8'))) yield scrapy.Request(response.url, callback=self.parse_detail) return self.error_count = 0 try: html_path = self.html_path.format(dttm=datetime.now().strftime('%Y%m%d_%H%M%S')) with open(html_path, 'w') as f: f.write(response.text.encode('utf-8')) self.logger.info('[ HTMLArchived ] {url}'.format(url=response.url.encode('utf-8'))) except Exception as e: self.logger.error('[ HTMLArchiveException ] {url}'.format(url=response.url.encode('utf-8'))) try: ret = {} ret['company'] = response.xpath('.//h1[@itemprop="hiringOrganization"]/a/span/text()').extract_first() ret['pos'] = response.xpath('.//div[@class="job-detail-top col-xs-12"]/h2/a/text()').extract_first() ret['etype'] = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[0]) ret['loc'] = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[1]) ret['sal'] = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[2]) ret['hour'] = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[4]) ret['desc'] = '|'.join([i.strip() for i in response.xpath('.//div[@itemprop="responsibilities"]/text()').extract()]) ret['qual'] = '|'.join([ i for i in [self.clean_tag(i).strip() for i in response.xpath('.//div[@itemprop="skills"]').extract_first().split('\n')] if i]) ret['benef'] = '|'.join([ i for i in [self.clean_tag(i).strip() for i in response.xpath('.//div[@itemprop="incentives"]').extract_first().replace('<li>','\n').split('\n')] if i]) ret['pdate'] = self.convert_pdate(response.xpath('.//div[@itemprop="datePosted"]/text()').extract_first()) if ret['pdate'].split()[0].split('-')[0] == "2017": self.logger.info("[ JobEndReached ] 2017 reached") self.killed = 1 raise CloseSpider("2017 reached") for key in ret.keys(): if ret[key]: ret[key] = ret[key].strip().encode('utf-8') _hash = hash_dn(ret['desc'],ret['company']) #log result to MySQL try: self.sqllogger.log_crawled_page( hash_code = _hash, position = ret['pos'], employer = ret['company'], exp = '', salary = ret['sal'], location = ret['loc'], web_id = self.web_id, url = response.url.encode('utf-8'), meta = response.meta, html_path = html_path, crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S'), post_time = ret['pdate'], job_status = 'SUCCESS', error_message= '' ) self.logger.info('[ RDSLogged ] {url}'.format(url=response.url.encode('utf-8'))) except 
exc.IntegrityError as e: if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold: self.logger.info("[ JobEndReached ] crawled record reached exceeding threshold") self.killed = 1 raise CloseSpider("Crawled record reached") elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold: self.repeat_count += 1 self.logger.info("[ JobRepeat ] crawled record found within threshold #%d" % self.repeat_count) yield None return else: raise e self.repeat_count = 0 yield ret except CloseSpider as e: raise CloseSpider(e.message) except Exception as e: self.logger.error('[ JobDetailException ] {url} {html_path} {e}'.format(url=response.url.encode('utf-8'),html_path=html_path.encode('utf-8'),e=e)) self.sqllogger.log_error_page( hash_code = hash_dn(response.url.encode('utf-8'),datetime.now().strftime('%Y%m%d%H%M%S')), web_id = self.web_id, url = response.url.encode('utf-8'), meta = response.meta, html_path = html_path, crawl_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S'), job_status = 'FAILED', error_message= e )
            website_url=website_url,
            website_key=json_key,
            settings=settings)
        #logging.info('结束网站爬虫'+json_key+':'+url_key+':'+website_urls[url_key]+'-'+time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()))
        wait = runner.join()
        wait.addBoth(lambda _: reactor.stop())
        # Block the process until the spider run has finished.
        reactor.run()
        #end_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())
        #information = "开始爬虫时间:"+ begin_time + "\n爬虫结束时间: "+ end_time + " 凤凰类别数据爬虫完毕"
        #email_object.send_information(information,"完成凤凰类别数据爬虫通知",True)
        #print "通知成功"
        #end_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())  # end time
        #info_spider = ' begin at :'+begin_time+' end at :'+end_time
        #logging.info(info_spider)
        os._exit(0)
    except BaseException, error:
        end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        information = "time: " + end_time + "错误:" + str(error) + '\n'
        email_object.send_information(information)
        logging.exception(error)
        raise CloseSpider('爬虫识别')
        os._exit(1)
    finally:
        read_json_file.changejson(settings['SPLIT_JSON_FILE'])
def parse_detail(self, response): if self.killed: raise CloseSpider("Spider already died.") if not response.body: self.error_count += 1 if self.error_count >= self.error_threshold: self.logger.error('[ JobPageRequestException ] {url}'.format( url=response.url.encode('utf-8'))) self.sqllogger.log_error_page( hash_code=hash_dn(response.url.encode('utf-8'), datetime.now().strftime('%Y%m%d%H%M%S')), web_id=self.web_id, url=response.url.encode('utf-8'), meta=response.meta, html_path=html_path, crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), job_status='FAILED', error_message="Empty request's response") yield None return if self.use_proxy: proxy = choice(self.proxies) self.logger.info( '[ JobPageRetry ] {url} with proxy {proxy}'.format( url=response.url.encode('utf-8'), proxy=proxy)) yield scrapy.Request(response.url, callback=self.parse_detail, meta={'proxy': proxy}) return else: self.logger.info('[ JobPageRetry ] {url}'.format( url=response.url.encode('utf-8'))) yield scrapy.Request(response.url, callback=self.parse_detail) return self.error_count = 0 try: html_path = self.html_path.format( dttm=datetime.now().strftime('%Y%m%d_%H%M%S')) with open(html_path, 'w') as f: f.write(response.text.encode('utf-8')) self.logger.info('[ HTMLArchived ] {url}'.format( url=response.url.encode('utf-8'))) except Exception as e: self.logger.error('[ HTMLArchiveException ] {url}'.format( url=response.url.encode('utf-8'))) try: ret = {} head = {} row = response.xpath( '//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]/p|//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]/ul' )[1:] topic = response.xpath( '//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]//b/u/text()' ).extract() head['amnt'] = u'\u0e2d\u0e31\u0e15\u0e23\u0e32' head[ 'sal'] = u'\u0e40\u0e07\u0e34\u0e19\u0e40\u0e14\u0e37\u0e2d\u0e19' head[ 'benef'] = u'\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e34\u0e01\u0e32\u0e23' head[ 'req'] = u'\u0e04\u0e38\u0e13\u0e2a\u0e21\u0e1a\u0e31\u0e15\u0e34\u0e1c\u0e39\u0e49\u0e2a\u0e21\u0e31\u0e04\u0e23' head[ 'loc_det'] = u'\u0e2a\u0e16\u0e32\u0e19\u0e17\u0e35\u0e48\u0e1b\u0e0f\u0e34\u0e1a\u0e31\u0e15\u0e34\u0e07\u0e32\u0e19' head['loc'] = u'\u0e08\u0e31\u0e07\u0e2b\u0e27\u0e31\u0e14' ret['pos'], ret['desc'] = [ self.clean_tag(x) for x in response.xpath( '//div[@class="w3-theme-l4"]/div').extract() ] ret['pdate'] = self.cdate[response.url] ret['company'] = self.comnm[response.url] del self.cdate[response.url] del self.comnm[response.url] ret['loc'] = '' ret['sal'] = '' for key in head.keys(): try: idx = topic.index(head[key]) except ValueError: continue ret[key] = '|'.join([ i for i in [ remove_tags(i) for i in row[idx].xpath('./text()|./li').extract() ] if i ]) if ret['pdate'].split()[-1] == "2560": self.killed += 1 raise CloseSpider("2017 reached") for key in ret.keys(): if ret[key]: ret[key] = ' '.join( ret[key].strip().split()).encode('utf-8') _hash = hash_dn(ret['desc'], ret['company']) try: self.sqllogger.log_crawled_page( hash_code=_hash, position=ret['pos'], employer=ret['company'], exp='', salary=ret['sal'], location=ret['loc'], web_id=self.web_id, url=response.url.encode('utf-8'), meta=response.meta, html_path=html_path, crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), post_time=ret['pdate'], job_status='SUCCESS', error_message='') self.logger.info('[ RDSLogged ] {url}'.format( url=response.url.encode('utf-8'))) except exc.IntegrityError as e: if e.orig.args[ 0] == 1062 and self.repeat_count >= self.repeat_threshold: self.logger.info( "[ JobEndReached ] 
crawled record reached exceeding threshold" ) self.killed = 1 raise CloseSpider("Crawled record reached") elif e.orig.args[ 0] == 1062 and self.repeat_count < self.repeat_threshold: self.repeat_count += 1 self.logger.info( "[ JobRepeat ] crawled record found within threshold #%d" % self.repeat_count) yield None return else: raise e self.repeat_count = 0 for key in ret.keys(): if not ret[key]: del ret[key] yield ret except CloseSpider as e: raise CloseSpider(e.message) except Exception as e: self.logger.error( '[ JobDetailException ] {url} {html_path} {e}'.format( url=response.url.encode('utf-8'), html_path=html_path.encode('utf-8'), e=e)) self.sqllogger.log_error_page( hash_code=hash_dn(response.url.encode('utf-8'), datetime.now().strftime('%Y%m%d%H%M%S')), web_id=self.web_id, url=response.url.encode('utf-8'), meta=response.meta, html_path=html_path, crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'), job_status='FAILED', error_message=e)
def process_response(self, request, response, spider):
    if response.status == 402:
        raise CloseSpider('402 proxy no use')
    else:
        return response
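# Added note: a downloader middleware like the process_response() above only runs if it is
# enabled in the project settings. The snippet below is a minimal sketch of that registration;
# the module path "myproject.middlewares.ProxyStatusMiddleware" and the priority value 543 are
# illustrative assumptions, not taken from the original project.
#
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyStatusMiddleware': 543,
}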
def parse(self, response):
    global ult
    datas = response.selector.xpath('//tr/td[@class="date"]/text()').extract()
    links = response.selector.xpath('//tr/td[5]/a/@href').extract()
    descs = response.selector.xpath('//tr/td[5]/a[@href]/text()').extract()
    for data, desc, link in zip(datas, descs, links):
        if desc.encode('utf-8') != ult and ult == '':
            with open('baseDos.txt', 'a+') as arq:
                arq.write(data.strip() + '\n')
                arq.write(desc.encode('utf-8') + '\n')
                arq.write(link + '\n\n')
                arq.close()
            i = 0
        elif desc.encode('utf-8') != ult:
            with open('aux.txt', 'a+') as arq:
                arq.write(data.strip() + '\n')
                arq.write(desc.encode('utf-8') + '\n')
                arq.write(link + '\n\n')
                arq.close()
            i = 1
        else:
            i = 0
            break
    if not i:
        if os.path.exists('aux.txt'):
            os.remove('baseDos.txt')
            os.rename('aux.txt', 'baseDos.txt')
        # HERE YOU PUT THE TOKEN OF YOUR PAGE ON FACEBOOK
        access_token = "HERE YOU PUT THE TOKEN OF YOUR PAGE ON FACEBOOK"
        api = facebook.GraphAPI(access_token)
        total = 0
        arq = open('baseDos.txt', 'r')
        linhas = arq.readlines()
        for i in linhas:
            if i == '\n':
                total = total + 1
        a = 0
        b = 3
        lista = []
        for i in range(total):
            xxx = linhas[a:b]
            lista.append(xxx)
            xxx = ''
            a = b + 1
            b = a + 3
        lista.reverse()
        for i in lista:
            x = ''.join(i)
            api.put_wall_post(x)
        raise CloseSpider('[+] BASE ATUALIZADA [+]')
    else:
        try:
            proxima_pagina = response.xpath(
                '//a[@href and contains(.,"next")]/@href').extract()[0]
            if proxima_pagina:
                yield scrapy.Request(url=proxima_pagina, callback=self.parse)
        except:
            pass
def twse_mining_Data_Parse(self, response):
    if (not (self.is_TPEX_open and self.is_TWSE_open)):
        print(self.se_status)
        pass
    else:
        local_Co_ids = []
        if (self.TPEX_First_Run):
            local_Co_ids = self.Co_ids
        else:
            local_Co_ids = self.possible_Co_ids_TWSE
        for data in response.xpath('body'):
            domain = urlParse.urlparse(response.url).hostname
            print('First RUN:', self.TWSE_First_Run)
            print('爬取開始')
            print(f'網域:{domain}')
            for co_id in local_Co_ids:
                self.not_manual_cancel = sg.one_line_progress_meter(
                    '目前爬取進度', self.current, self.isExist - 1, 'Stock',
                    '運行時請勿點擊視窗,顯示沒有回應請勿關閉,為正常現象。\nElapsed Time 為已運行時間\nTime Remaining 為剩餘時間\nEstimated Total Time 為估計完成時間',
                    no_titlebar=False,
                    orientation='h')
                if (not self.not_manual_cancel and self.current < self.isExist - 1):
                    Button = sg.popup_yes_no('是否取消?', '取消爬取')
                    if (Button == 'Yes'):
                        sg.popup('已手動取消!')
                        raise CloseSpider("使用者取消!")
                items = StockPrice_items()
                print('First RUN:', self.TWSE_First_Run)
                print(co_id)
                twse_get = ''
                twse_get = str(
                    response.xpath(f'//td[text()="{co_id}"]//text()').get())
                twse_co_name = str(
                    data.xpath(
                        f'//td[text()="{co_id}"]/following-sibling::td[1]//text()').get())
                print(twse_get)
                if (twse_get == 'None' and (not twse_co_name.isnumeric())):
                    if (self.TPEX_First_Run):
                        print(f'股號 {co_id} 不存在於交易所,可能為TPEX的股號,丟入至暫存中...')
                        self.possible_Co_ids_TPEX.append(co_id)
                        continue
                    else:
                        print(f'股號 {co_id} 不存在兩邊交易所,丟入到未存在股號中...')
                        self.noExist.append(co_id)
                        continue
                else:
                    print('TWSE GET ITEMS')
                    self.current += 1
                    twse_price = str(
                        data.xpath(
                            f'//td[text()="{co_id}"]/following-sibling::td[6]//text()').get())
                    twse_price = twse_price.replace(',', '')
                    print(twse_price)
                    if (self.is_number(twse_price)):
                        twse_price = float(twse_price)
                    else:
                        twse_price = None
                    items['CO_ID'] = str(co_id)
                    items['CO_SHORT_NAME'] = str(twse_co_name)
                    items['Price'] = twse_price
                    items['SUB_DATA_TYPE'] = 'TWSE'
                    items['SYear'] = str(self.Year)
                    items['SDate'] = str(self.Date)
                    items['DATA_TYPE'] = self.Type
                    yield (items)
        self.TWSE_First_Run = False
        yield scrapy.Request(self.se_urls[0],
                             callback=self.tpex_mining_Data_Parse,
                             dont_filter=True)
def process_item(self, response, spider):
    self.count += 1
    if (self.count == 5):
        print("======in test pipeline========")
        raise CloseSpider("in exception")
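# Added note: for the counting pipeline above to receive items it has to be enabled in the
# project settings; also note that Scrapy documents the hook as process_item(self, item, spider),
# so the "response" parameter above actually receives the item. The registration below is a
# minimal sketch; the module path "myproject.pipelines.TestCountPipeline" and the order value 300
# are illustrative assumptions.
#
# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.TestCountPipeline': 300,
}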
def parse(self, response): # def get_proxy(): # return requests.get("http://127.0.0.1:5010/get/").content # # def delete_proxy(proxy): # requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) #或者可以设置随机ip #不需要在这里设置,在retry中间件中设置即可 #轮询使用ip,假设有500可用ip,一分钟500个页面,对服务器来说相当于每台主机访问一页面 # while response.status == 403 or response.status == 302: # # print(response.status) # # print(response.meta) # # # delete_proxy(response.headers) # # #删除proxy # # # 获取proxy # # proxy = get_proxy() # # print("使用新代理:" + str(proxy)) # # #如果proxy_pool耗尽,暂时暂停爬虫或者更换目标网站,移动端或者wap,或者各大网站的cache # # response = scrapy.Request(url=response.url, meta={'proxy':'http://' + str(proxy)}) # # print(type(response)) # print("有respose") item = LearningItem() #爬取书名 #作者有联合作者,会和译者一样放在一个span里面,单个作者单独放在文本为 作者 的span 的后面的同级a节点,所以也要分类讨论 #或者作者无链接——不会,会有search #单个作者也会用一组嵌套的span括住 #翻译者的链接也是author,既然是爬取图书,就没有关系了,如果要研究翻译相关的话,主数据库有译者字段 def is_exist(item_argv, xpath1, **xpath2): # item[item_argv] = info.xpath(xpath1).extract().strip() try: item[item_argv] = info.xpath(xpath1).extract() except: print(str(item_argv) + "出错") item[item_argv] = '' if len(item[item_argv]) == 1: item[item_argv] = item[item_argv][0].strip() # if len(item[item_argv]) == 0 and item[item_argv] != '': # # item[item_argv] = '' # return item[item_argv][0].strip() if len(item[item_argv]) == 1 else item[item_argv] return item[item_argv] # try: #先确定豆瓣会出错的几种方式 #返回403 #返回200,但需登陆 #返回此应用出错 # print("尝试爬取") # except: # print() # print("被ban!!!!!!!!!!!!!") #只会停止其中一个协程,其他要逐渐停止,强行ctrl + z 会导致后面的链接被添加到filter中,以后都不会再被爬取 if response.status != 200: #不知道会不会将缺少 '/"的页面重定向到别的地方,导致状态码变为301,改next_page的代码 #shell后发现不会,重定向会直接返回200的response,服务器补全了后面的 / raise CloseSpider('强制停止!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!') # time.sleep(600) # raise CloseSpider() # return ##这里写ADSL拨号或者换ip的逻辑 # print() # return print("此时的URL为:" + str(response.url)) # writer_link_list = [] # series_link_list = [] try: info = response.xpath(u'//*[@id="info"]')[0] except: raise CloseSpider("出现200以外的错误,此时的url为 %s" % response.url) #在这里一并处理了作者列表和翻译者列表 #判断有无作者 #判断有无翻译者 #翻译者以上的author link 的text 加入到作者列表中 #如无翻译者,则author link 的 text 默认为全是作者 #容易出错,比如出现个志愿者什么的,举例而已 #作者节点:作者节点的下一个同辈span节点的所有前同辈a节点,因为作者节点排第一,没有其他节点会影响它 #先确定是两种模式的哪一种 #直接写四种模式,用 a = b or c = d的写法,一句 #如果以某个字段为基准,比如出版社以上的a tag 为作者,以下为翻译者的话,当出版社字段不存在,就会出错,所以还是以自身为基准,爬虫会更具健壮性 #有冒号无嵌套 w_name1 = info.xpath( u'//span[./text()="作者:"]/following-sibling::span[1]/preceding-sibling::a' ) #有冒号有嵌套 w_name2 = info.xpath(u'//span[./text()="作者:"]/parent::span/a') #无冒号无嵌套 w_name3 = info.xpath( u'//span[./text()=" 作者"]/following-sibling::span[1]/preceding-sibling::a' ) #无冒号有嵌套 w_name4 = info.xpath(u'//span[./text()=" 作者"]/parent::span/a') if w_name1: item['writers'] = w_name1.xpath("./text()").extract() item['writers_link'] = w_name1.xpath("./@href").extract() elif w_name2: item['writers'] = w_name2.xpath("./text()").extract() item['writers_link'] = w_name2.xpath("./@href").extract() elif w_name3: item['writers'] = w_name3.xpath("./text()").extract() item['writers_link'] = w_name3.xpath("./@href").extract() elif w_name4: item['writers'] = w_name4.xpath("./text()").extract() item['writers_link'] = w_name4.xpath("./@href").extract() else: item['writers'] = '' item['writers_link'] = '' #————————————————————————————————————————————————————————————————————————————————————————————————————————————————# #译者 # contains(@name,'na') #有冒号无嵌套 t_name1 = info.xpath( u'//span[./text()="译者:"]/following-sibling::a[contains(@href,"search")]' ) #有冒号有嵌套 t_name2 = info.xpath( 
u'//span[./text()="译者:"]/following-sibling::a[contains(@href,"author")]' ) #无冒号无嵌套 #选中属性中包含某个字符串的href #链接可以直接爬取了,但是中文字段还是要靠后续的处理和提取 #出错 #仍有问题,无法替换和正确拼接 # t_name3 = info.xpath(u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"search") or contains(@href,"author")]') t_name3 = info.xpath( u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"search")]' ) #无冒号有嵌套 t_name4 = info.xpath( u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"author")]' ) if t_name4: item['translators'] = t_name4.xpath("./text()").extract() item['translators_link'] = t_name4.xpath("./@href").extract() elif t_name3: item['translators'] = t_name3.xpath("./text()").extract() item['translators_link'] = t_name3.xpath("./@href").extract() elif t_name2: item['translators'] = t_name2.xpath("./text()").extract() item['translators_link'] = t_name2.xpath("./@href").extract() elif t_name1: item['translators'] = t_name1.xpath("./text()").extract() item['translators_link'] = t_name1.xpath("./@href").extract() else: item['translators'] = '' item['translators_link'] = '' #————————————————————————————————————————————————————————————————————————————————————————————————————————————————# item["publish"] = is_exist( "publish", u'//span[./text()="出版社:"]/following::text()[1]') item["publish_date"] = is_exist( "publish_date", u'//span[./text()="出版年:"]/following::text()[1]') item["pages"] = is_exist( "pages", u'//span[./text()="页数:"]/following::text()[1]') item["price"] = is_exist( "price", u'//span[./text()="定价:"]/following::text()[1]') item["binding"] = is_exist( "binding", u'//span[./text()="装帧:"]/following::text()[1]') item["ISBN"] = is_exist( "ISBN", u'//span[./text()="ISBN:"]/following::text()[1]') item["orgin_name"] = is_exist( "orgin_name", u'//span[./text()="原作名:"]/following::text()[1]') item["series"] = is_exist( "series", u'//span[./text()="丛书:"]/following::a[1]/text()') item["series_link"] = is_exist( "series_link", u'//span[./text()="丛书:"]/following-sibling::a[1]/@href') # item["summary"] = is_exist("summary",) # item["w_summary"] = is_exist("w_summary",) item["catalog"] = is_exist("catalog", '//*[contains(@id,"dir_")]/text()') item["tag"] = is_exist("tag", '//*[@id="db-tags-section"]/div/span/a/text()') item["series_info"] = is_exist( "series_info", '//*[@id="content"]/div/div[1]/div[3]/div[@class="subject_show block5"]/div//text()' ) # item["readers"] = is_exist("readers",).extract().strip() # item["title"] = is_exist("title",).extract().strip() # item["url"] = is_exist("url",).extract().strip() # item["score"] = is_exist("score",).extract().strip() try: item['title'] = response.xpath( "//*[@id='wrapper']/h1/span/text()").extract_first() except: item['title'] = '' item['url'] = response.url.replace("https://book.douban.com/subject/", "").strip('/') try: item['score'] = response.css( '#interest_sectl > div > div.rating_self.clearfix > strong::text' ).extract_first().strip() if item['score'] == '': item['score'] = '0' except: item['score'] = '0' # try: # item['publish'] = info.xpath().extract_first().strip() # except: # item['publish'] = '' # try: # item['publish_date'] = info.xpath(u'//span[./text()="出版年:"]/following::text()[1]').extract_first().strip() # except: # item['publish_date'] = '' # try: # item['pages'] = info.xpath(u'//span[./text()="页数:"]/following::text()[1]').extract_first().strip() # except: # item['pages'] = '' # try: # item['price'] = info.xpath(u'//span[./text()="定价:"]/following::text()[1]').extract_first().strip() # except: # item['price'] = '' # try: # item['binding'] = 
info.xpath(u'//span[./text()="装帧:"]/following::text()[1]').extract_first().strip() # except: # item['binding'] = '' # try: # item['ISBN'] = info.xpath(u'//span[./text()="ISBN:"]/following::text()[1]').extract_first().strip() # except: # item['ISBN'] = '' # try: # item['orgin_name'] = info.xpath(u'//span[./text()="原作名:"]/following::text()[1]').extract_first().strip() # except: # item['orgin_name'] = '' # try: # item['series'] = info.xpath(u'//span[./text()="丛书:"]/following::a[1]/text()').extract_first().strip() # except: # item['series'] = '' # try: # item['series_link'] = info.xpath(u'//span[./text()="丛书:"]/following-sibling::a[1]/@href').extract_first().strip() # except: # item['series_link'] = '' #这里有两种情况,一种有折叠,一种没有,先提取包含折叠内容的,没有再提取另一个 try: summary = response.xpath( '//*[@id="link-report"]/span/div/div[@class="intro"]/p/text()') if summary: item['summary'] = summary.extract() else: item['summary'] = response.xpath( '//*[@id="link-report"]/div[1]/div/p/text()').extract() # if len(item['summary']) == 0 and item['summary'] != '': # # item['summary'] = '' except: item['summary'] = '' try: w_summary = response.css( '#content > div > div.article > div.related_info > div:nth-child(4) > span.all.hidden > div > p::text' ) if w_summary: item['w_summary'] = w_summary.extract() else: item['w_summary'] = response.css( '#content > div > div.article > div.related_info > div:nth-child(4) > span.short > div > p::text' ).extract() # if len(item['w_summary']) == 0 and item['w_summary'] != '': # # item['w_summary'] = '' except: item['w_summary'] = '' # try: # #出错 # # item['catalog'] = response.xpath('//*[contains(@id,"full") and contains(@id,"dir")]/text()').extract() # item['catalog'] = response.xpath('//*[contains(@id,"dir_")]/text()').extract() # except: # item['catalog'] = '' # try: # item['tag'] = response.xpath('//*[@id="db-tags-section"]/div/span/a/text()').extract() # except: # item['tag'] = '' # try: # #丛书信息会随机抽取 # item['series_info'] = response.xpath('//*[@id="content"]/div/div[1]/div[3]/div[@class="subject_show block5"]/div//text()').extract() # except: # item['series_info'] = '' try: item['readers'] = response.css( '#interest_sectl > div > div.rating_self.clearfix > div > div.rating_sum > span > a > span::text' ).extract_first() if item['readers'] is None: item['readers'] = '0' except: item['readers'] = '0' # '//*[@id="link-report"]/div[1]/div/p'/div/div[@class="intro"]/p/text() # if w_name_mode1: # # w_name = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()').extract_first().replace("\n","").replace(" ","") # w_name = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()') # #如果能捕获作者名字,则写入,否则,为span嵌套模式 # if w_name: # item['writer'] = w_name.extract() # else: # item['writer'] = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()') # / # writer_name_type2 = links.xpath('//span[./text()=" 作者"]/following-sibling::span[1]/preceding-sibling::a/text()').extract_first().replace("\n","").replace(" ","") # writer_name_type3 = # #单个作者节点已经完成,需要完成一组的作者节点,具体参考大学教材 # #一组作者节点同一组翻译者节点 # #翻译者节点:翻译者节点的下一个span节点 # #一组翻译者的已经解决,单个翻译者的参考傅雷 # # link_extract = item.extract() # if "author" in link: # # print(item.xpath('./@href').extract()) # #这里可以缩减 # writer_link_list.append(link) # #存储完整的网址,日后爬取可以少一个拼接网址的逻辑,加快爬取速度,硬盘开销不大 # if "search" in link: # link = "https://book.douban.com/" + link # writer_link_list.append(link) # if "series" in link: # series_link_list.append(link) # item['writer_link'] = writer_link_list # item['series_link'] = 
series_link_list # # item['writer'] = response.xpath(u'//span[./text()="作者:"]/following::a[2]') # # # // *[ @ id = "info"] / a[1] # # item['publish'] = response.xpath(u'//span[./text()="出版社:"]/following::text()[1]') # # item['orgin_name'] = response.xpath(u'//span[./text()="原作名:"]/following::text()[1]') # #这里只是其中一种情况,还有一种,要增加对应的try...except,以及中文图书没有翻译的问题,全半角符号的问题 # c = ""#单个翻译者 # try: # if a: # item['translator'] = a[0].xpath('./a/text()').extract() # if b: # item['translator'] = b[0].xpath('./a/text()').extract() # except: # item['translator'] = '' #有效评分人数 # if item['readers']: # v = int(item['readers']) # else: # v = 0 # #入选top250的最低人数 # m = 10000 # #书本得分 # if item['score']: # R = float(item['score']) # else: # R = 0 # # C是所有书本的得分平均分,都存在数据库中,取个大概值就行了 # C = 7 item["weighting"] = 0 item['seen'] = 0 yield item # item['p_date'] # item['total_pages'] # item['price'] # item['binding'] # item['series'] # item['ISBN'] # item['summary'] # item['w_introduce'] # item['ca'] # item['tag'] # item['s_info'] # item['score'] # item['readers'] # print(item['title']) # all = response.xpath("string(//*[@id='info'])") # all = # print(all.extract()) # print(all.extract()[0].replace("\n","")) # print(all.extract()[0].replace("\n","").replace(" ","")) # print(type(all.extract())) # yield item #id一般固定,可以忽略css的变化 #先不清洗,换取爬取的速度提升 # all = response.xpath('//*[@id="info"]') # all = all.extract()[0].replace("\n","").replace("\t","").split("<br>") # for item in all: # print(item.replace('<spanclass="pl">',"").replace("</span>","").replace("""<divid="info"class="">""","").replace("</div>","").replace("</a>","").replace("""<aclass=""href=""","").replace("<span>","").replace("<ahref=","")) # all = response.xpath(u'//span[./text()=" 作者"]/following::text()') # print(all) #mysql批量写入,不要每次写入 # #抽取"喜欢这本书的用户也喜欢"的链接 link = LinkExtractor( restrict_xpaths=('//*[@id="db-rec-section"]/div//dl//dd')) links = link.extract_links(response) #如果链接是直接相关的话,也可以用response.follow,会返回一个url实例,然后可以yield相关的url: # links = response.xpath('//*[@id="db-rec-section"]/div//dl//dd').extract() # for link in links: # yield response.follow(link,callback=self.parse) for link in links: # print("弹出一个url") # if link.url.endswith('/'): # pass # else: # link.url = link.url + "/" #没有"/"作为结尾的话,网址会重定向,不必要,但是可能是识别爬虫的依据 yield scrapy.Request(url=link.url, callback=self.parse)
def parse_item(self, response): if (self.count < int(self.limit)): item = MyItem() item['url'] = response.url p = r"^\S*article\/view\/\S*$" a = r"^(\s*Abstrak\s*$)|(^\s*Abstract\s*$)" if (re.match(p, item['url'])): journal = JournalItem() article = ArticleItem() references = ReferencesItem() author = AuthorItem() item['title'] = response.css('title::text').getall() dc = "//meta[@name='DC.{}']/@content" citation = "//meta[@name='citation_{}']/@content" author_name = response.xpath( dc.format('Creator.PersonalName')).extract() abstract = response.xpath( dc.format('Description')).extract_first() doi = response.xpath( dc.format('Identifier.DOI')).extract_first() issn = response.xpath(dc.format('Source.ISSN')).extract_first() issue = response.xpath( dc.format('Source.Issue')).extract_first() volume = response.xpath( dc.format('Source.Volume')).extract_first() title = response.xpath(dc.format('Title')).extract_first() uri = response.xpath( dc.format('Identifier.URI')).extract_first() journal_title = response.xpath( citation.format('journal_title')).extract_first() author_institution = response.xpath( citation.format('author_institution')).extract() date = response.xpath(citation.format('date')).extract_first() keyword = response.xpath( citation.format('keywords')).extract_first() pdf_uri = response.xpath( citation.format('pdf_url')).extract_first() language = response.xpath( citation.format('language')).extract_first() if not abstract: abstract = response.xpath( '//*[text()[re:test(., "{}")]]/parent::*//text()'. format(a)).extract() article['title'] = title article['abstract'] = abstract article['doi'] = doi article['uri'] = uri article['pdf_uri'] = pdf_uri article['publication_date'] = date article['keyword'] = keyword article['issn'] = issn article['language'] = language journal['title'] = journal_title journal['issn'] = issn journal['issue'] = issue journal['volume'] = volume author['name'] = author_name author['affiliate'] = author_institution #Match reference with regex pattern = "^(\s*References\s*$)|(^\s*Referensi\s*$)" pattern2 = r"^[a-zA-Z/[]|['__']{2}" pattern3 = r"\s?[a-zA-Z0-9\.\ ]{1}$" result = response.xpath( '//*[text()[re:test(., "{}")]]/parent::*//text()'.format( pattern)).extract() #Remove control character like \n,\t, etc. t = dict.fromkeys(range(32)) ref = [ x.translate(t) for x in result if x.translate(t) and x.translate(t) != "References" and x.translate(t) != "Referensi" and len(x) > 20 ] references['title'] = "" references['classification'] = "" if len(ref) > 0: data = pd.read_csv( '/home/bandreg/Skripsi/Program/JournalCrawler/scrapy_app/scrapy_app/spiders/data2.csv', index_col=None) vectorizer = CountVectorizer() X1 = vectorizer.fit_transform(data['Reference'].values) test = vectorizer.transform(ref) model = joblib.load( '/home/bandreg/Skripsi/Program/JournalCrawler/scrapy_app/scrapy_app/spiders/model.sav' ) result = model.predict(test) references['title'] = ref references['classification'] = result #Count item self.count += 1 yield { 'journal': journal, 'item': item, 'article': article, 'author': author, 'references': references } else: raise CloseSpider('limit reached')
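# The classification step above re-reads data2.csv, refits the CountVectorizer
# and reloads model.sav for every article that carries references, which is
# expensive. A hedged sketch of doing that work once and reusing it (class and
# attribute names are assumptions for illustration):
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

class ReferenceClassifier(object):
    """Fit the vectorizer and load the pickled model once per spider run."""

    def __init__(self, csv_path, model_path):
        data = pd.read_csv(csv_path, index_col=None)
        self.vectorizer = CountVectorizer()
        self.vectorizer.fit(data['Reference'].values)
        self.model = joblib.load(model_path)

    def classify(self, references):
        """Predict a label for each reference string."""
        return self.model.predict(self.vectorizer.transform(references))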
def parse_subscriptions_period_variants(self, response): if len(self.devices) < 1: self.log("[[ORANGECH]] No devices collected on previous steps. Stopping!") return self.current_step = 'PROCESS_PLANS_DEVICES' if not self._browser_load_page_with_tries(devices_url): self.errors.append("Failed to load page with PhantomJS: %s" % devices_url) raise CloseSpider("Failed to load page with PhantomJS: %s" % devices_url) # reset to SIM-only time.sleep(30) el = self._browser.find_element_by_xpath("//div[@class='product-item'][not(@id)]//button[contains(text(), 'Select')]") self._do_browser_action_tries(el.click) time.sleep(30) if not self._browser_load_page_with_tries(response.url): self.errors.append("Failed to load page with PhantomJS: %s" % response.url) raise CloseSpider("Failed to load page with PhantomJS: %s" % response.url) for self.current_period in ['12', '24']: if self.current_period not in self.processed_priceplans: self.processed_priceplans[self.current_period] = {} if len(self.priceplans) < 1: return self.log('[[ORANGECH]] Processing period: %s months' % self.current_period) drop_down_el = self._browser.find_element_by_xpath("//form[@id='form_subscription_length']//a[@class='select2-choice']") self._do_browser_action_tries(drop_down_el.click) el = self._browser.find_element_by_xpath("//ul[@id='select2-results-6']/li/div[contains(text(), '%s')]" % self.current_period) self._do_browser_action_tries(el.click) for plan_name_base in sorted(self.priceplans): # plan_formdata = self.priceplans_formdata[plan_name_base] self.log('[[ORANGECH]] Processing base price plan %s with period %s months' % (plan_name_base, self.current_period)) drop_down_el = self._browser.find_element_by_xpath("//form[@id='form_subscription_choice']//a[@class='select2-choice']") self._do_browser_action_tries(drop_down_el.click) el = self._browser.find_element_by_xpath("//ul[@id='select2-results-4']/li/div[contains(text(), '%s')]" % plan_name_base) self._do_browser_action_tries(el.click) if plan_name_base not in self.processed_priceplans[self.current_period]: self.processed_priceplans[self.current_period][plan_name_base] = set() for i, variant in enumerate(sorted(self.priceplans_variants[plan_name_base])): grouped_key = ";".join(["%s:%s" % (key, variant[key]) for key in sorted(variant.keys())]) if grouped_key in self.processed_priceplans[self.current_period][plan_name_base]: continue if 'young' in plan_name_base.lower(): plan_name = plan_name_base[:] for key, value in variant.items(): if 'Young' in self.priceplans[plan_name_base][key][value]['name']: plan_name = plan_name + ' ' + self.priceplans[plan_name_base][key][value]['name'].replace("Orange Young", "").replace("Young", "").strip() for key, value in variant.items(): if 'Young' not in self.priceplans[plan_name_base][key][value]['name']: plan_name = plan_name + ', ' + self.priceplans[plan_name_base][key][value]['name'] else: plan_name = plan_name_base + " " + ", ".join([self.priceplans[plan_name_base][key][variant[key]]['name'] for key in sorted(variant.keys())]) price = sum([int(self.priceplans[plan_name_base][key][variant[key]]['price']) for key in variant.keys()]) meta = { 'plan_name_base': plan_name_base, 'grouped_key': grouped_key, 'plan_name': plan_name, 'per_month': price } self.log('[[ORANGECH]] Selecting price plan %s with period %s months' % (plan_name, self.current_period)) for key, value in variant.items(): el = self._browser.find_element_by_xpath("//input[@name='%s'][@value='%s']" % (key, value)) self._do_browser_action_tries(el.click) time.sleep(5) 
self.log('[[ORANGECH]] Clicking period again: %s months' % self.current_period) el = self._browser.find_element_by_xpath("//select[@name='contract_length']/option[@value='%s']" % self.current_period) if not el.is_selected(): self._do_browser_action_tries(el.click) time.sleep(5) self.log('[[ORANGECH]] Loading device prices for price plan: %s, %s months' % (plan_name, self.current_period)) # time.sleep(30) if not self._browser_load_page_with_tries(devices_url): self.errors.append("Failed to load page with PhantomJS: %s" % devices_url) raise CloseSpider("Failed to load page with PhantomJS: %s" % devices_url) hxs = HtmlXPathSelector(text=self._browser.page_source) for item in self.parse_device_prices_for_priceplan(hxs, meta): yield item self.processed_priceplans[self.current_period][plan_name_base].add(grouped_key) if not self._browser_load_page_with_tries(subscriptions_url): self.errors.append("Failed to load page with PhantomJS: %s" % subscriptions_url) raise CloseSpider("Failed to load page with PhantomJS: %s" % subscriptions_url)
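# The browser steps above rely on fixed time.sleep() pauses between clicks.
# A hedged sketch of the same interaction with Selenium explicit waits, which
# return as soon as the element is actually clickable (helper name and the
# 30-second timeout are assumptions):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def click_when_clickable(browser, xpath, timeout=30):
    """Wait for the element located by xpath to become clickable, then click it."""
    element = WebDriverWait(browser, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    element.click()
    return element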
def parse_item_page(self, response): if self.close_down: raise CloseSpider() item = IpropertyItem() item['url'] = response.url item['scraped_date'] = time.strftime("%Y-%m-%d %H:%M:%S") # categories # pre-filled with None for x in xrange(1, 7): item['cat_{}'.format(x)] = None categories = [ x for x in response.css("div.breadcrumbs-ld a::text").extract() if x != 'Home' ] for index, cat in enumerate(categories): if index > 5: raise CloseSpider("Category tree too long: {}".format( ','.join(categories))) item['cat_{}'.format(index + 1)] = cat # unique ID result = re.search(r'.+-(\d+)$', response.url) if result: item['unique_id'] = result.group(1) # title item['title'] = next( iter(response.css("h1.main-title::text").extract()), '') if item['title'][-3:] == '...': item['title'] = next(iter(response.css("title ::text").extract()), '') # price item['price'] = next(iter(response.css("h2.price::text").extract()), '').replace('RM', '').replace(',', '').strip() # address item['address'] = next( iter(response.css(".building-info-one h2::attr(title)").extract()), '') # item details details = {} for d in response.css("ul.infos>li::text").extract(): if ':' not in d: details.setdefault('facility', []).append(d.strip()) else: splitted = d.split(' : ') if len(splitted) == 2: details[splitted[0].strip()] = splitted[1].strip() # bedroom if 'Bedrooms' in details: item['bedroom'] = details['Bedrooms'] else: item['bedroom'] = next( iter( response.css( ".ld_mis_detail p.room span.bedroom::attr(title)"). extract()), '').replace('Bedrooms', '').strip() # bathroom if 'Bathrooms' in details: item['bathroom'] = details['Bathrooms'] else: item['bathroom'] = next( iter( response.css( ".ld_mis_detail p.room span.bathroom::attr(title)"). extract()), '').replace('Bathrooms', '').strip() item['carpark'] = next( iter( response.css(".ld_mis_detail p.room span.garage::attr(title)"). extract()), '').replace('Car parks', '').strip() item['agent_name'] = next( iter(response.css("#agent-info .name a::text").extract()), '') item['agent_url'] = next( iter(response.css("#agent-info .name a::text").extract()), '') item['agent_phone'] = next( iter(response.css("#agentPhone::attr(value)").extract()), '') item['images'] = list( set(response.css("ul.gallery a::attr(href)").extract())) item['property_type'] = details.get('Property Type:', '') item['tenure'] = details.get('Tenure', '') item['land_area'] = details.get('Land Area', '') item['builtup'] = details.get('Built-Up', '') item['occupancy'] = details.get('Occupancy', '') item['furnishing'] = details.get('Furnishing', '') item['posted_date'] = details.get('Posted Date', '') item['facing_direction'] = details.get('Facing Direction', '') item['facility'] = details.get('facility', []) item['description'] = ' '.join([x for x in response.css("div.detail-info-wide ::text").extract() if x.strip() != ''])\ .replace("\n", ' ').replace("\r", " ").replace(" ", " ") # expired expired = False for tag in response.css("h6 ::text").extract(): if 'expired listing' in tag.lower(): expired = True break item['expired'] = expired yield item
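# Two observations on parse_item_page() above, with hedged sketches.
# First, agent_url is filled from the same "a::text" query as agent_name, so it
# ends up holding the agent's name rather than a link; the href attribute is
# presumably what was intended:
#     item['agent_url'] = response.css("#agent-info .name a::attr(href)").extract_first(default='')
# Second, the repeated next(iter(...extract()), '') idiom is equivalent to
# Scrapy's built-in default handling (css_first is an assumed helper name):
def css_first(response, query, default=''):
    """Shorthand for the next(iter(response.css(query).extract()), default) pattern."""
    return response.css(query).extract_first(default=default)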
def parse(self, response): # Parse articles flux_state_script = response \ .xpath("//script[contains(., 'window.FLUX_STATE')]/text()") if not flux_state_script: raise CloseSpider(reason='FLUX_STATE not found') flux_state_json = flux_state_script.extract_first()[20:] flux_state = json.loads(flux_state_json) articles = flux_state['adSearch']['data']['ads'] print(articles) for article in articles: yield { 'search': self.search_id, 'url': article['url'], 'original_id': article['list_id'], 'title': article['subject'], 'description': article['body'], 'price': article['price'][0], 'charges_included': LeboncoinSpider.get_attribute(article, 'charges_included', lambda x: bool(int(x))), 'publication_date': self.get_publication_date(article), 'real_estate_type': LeboncoinSpider.get_attribute(article, 'real_estate_type', None, None, True), 'rooms': LeboncoinSpider.get_attribute(article, 'rooms', int), 'furnished': LeboncoinSpider.get_attribute(article, 'furnished', lambda x: bool(int(x))), 'surface': LeboncoinSpider.get_attribute(article, 'square', int), 'images': LeboncoinSpider.get_images(article), 'zipcode': article['location']['zipcode'], 'city': article['location']['city'], 'ges': LeboncoinSpider.get_attribute(article, 'ges'), 'energy_rate': LeboncoinSpider.get_attribute(article, 'energy_rate'), } # Follow pagination (max=nbr_of_pages) if self.cur_nbr_of_pages < self.nbr_of_pages: self.cur_nbr_of_pages += 1 next_url = '{}/p-{}'.format(self.start_urls[0], self.cur_nbr_of_pages) yield response.follow(next_url, self.parse)
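# flux_state_script.extract_first()[20:] assumes the script text starts with
# exactly "window.FLUX_STATE = " (20 characters). A hedged sketch that pulls
# the JSON out with a regex instead, so extra whitespace or a trailing
# semicolon does not break parsing (extract_flux_state is an assumed name):
import json
import re

def extract_flux_state(script_text):
    """Return the object assigned to window.FLUX_STATE, or None if absent."""
    match = re.search(r'window\.FLUX_STATE\s*=\s*(\{.*\})\s*;?\s*$', script_text, re.S)
    if match is None:
        return None
    return json.loads(match.group(1))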
def you_get(self): command = ['you-get', '--json', self.start_urls[0]] print command stdout, stderr = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() print 'stdout', stdout, 'stderr', stderr if len(stdout) < 2: return False logger.info('[you-get]' + '[uuid]' + self.uuid) video = json.loads(stdout) if 'streams' not in video: return False title = video['title'] srcs = [] for key in video['streams'].keys(): print key if 'src' in video['streams'][key]: srcs = video['streams'][key]['src'] print srcs break concatfile = 'cache/' + self.uuid + '.txt' mp4file = 'cache/' + self.uuid + '.mp4' for idx, src in enumerate(srcs): src_path = 'cache/' + self.uuid + '_' + str(idx) + '.mp4' _, success = service.utils.download_file(src, src_path) if not success: return False open(concatfile, 'a+').write('file ' + string.replace(src_path, 'cache/', '') + "\n") length = service.utils.mergeVideo(mp4file, concatfile) print '[merged video duration]', length if length == 0: return False filesize = os.path.getsize(mp4file) endpoint, backet, obj = service.utils.paseUploadUrl(self.upload_url) print endpoint, backet, obj uploadResult = service.utils.uploadVideo(mp4file, endpoint, backet, obj) print 'uploadResult:', uploadResult if not uploadResult: return False logger.warn('[uploadVideo]' + '[uuid]' + self.uuid) data = { "video_id": self.uuid, "state": 1, "message": u'成功', "length": length, "play_id": self.uuid, "size": filesize, "cover": '', "title": title } self.callbacked = service.utils.callback_result(self.callback, data=data) logger.info('[finished]' + str(self.callbacked) + '[uuid]' + self.uuid) video_data = { 'title': title, 'video_id': self.video_id, 'author': self.name, 'publish': time.strftime('%Y-%m-%d %H:%M:%S'), 'page_url': self.start_urls[0], 'video_length': length, 'video_size': filesize, 'video_url': '', 'easub_uuid': self.uuid } self.db.save_video(video_data) raise CloseSpider('finished')
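# The you-get invocation above checks only the length of stdout. A hedged
# Python 3 sketch of the same call that also checks the exit code
# (run_you_get_json is an assumed name):
import json
import subprocess

def run_you_get_json(url):
    """Run `you-get --json <url>` and return the parsed metadata, or None."""
    proc = subprocess.run(['you-get', '--json', url],
                          capture_output=True, text=True)
    if proc.returncode != 0 or len(proc.stdout) < 2:
        return None
    return json.loads(proc.stdout)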
def process_item(self, item, spider): if isinstance(item, Huangye88KunmingItem): # sql = """insert into kuchuan_all(id, app_package, down, trend) VALUES(%s, %s, %s, %s) ON DUPLICATE KEY UPDATE app_package=VALUES(app_package), down=VALUES(down), down=VALUES(trend)""" sql = """insert into jianjie_huangye88_kunming (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, Huangye88LiuzhouItem): sql = """insert into jianjie_huangye88_liuzhou (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, ShunqiLiuzhouItem): sql = """insert into jianjie_shunqi_liuzhou (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, ShunqiKunmingItem): sql = """insert into jianjie_shunqi_kunming (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, MinglujiLiuzhouItem): sql = """insert into jianjie_mingluji_liuzhou (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, MinglujiKunmingItem): sql = """insert into jianjie_mingluji_kunming (comp_url, comp_name, intro) VALUES(%s, %s, %s)""" args = [item['comp_url'], item['comp_name'], item['intro']] elif isinstance(item, ShunqiAllItem): sql = """insert into jianjie_shunqi_all (comp_url, comp_name, intro, city) VALUES(%s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['intro'], item['city'] ] # print(str(item['comp_url']) + ' ' + str(item['comp_name'])) # if len(self.item_list) == 500: # sql = """insert into jianjie_shunqi_all_copy (comp_url, comp_name, intro, city) VALUES(%s, %s, %s, %s)""" # self.cursor.executemany(sql, self.item_list) # self.conn.commit() # self.item_list.clear() # print('200 insert') # else: # self.item_list.append([item['comp_url'], item['comp_name'], item['intro'], item['city']]) elif isinstance(item, Huangye88AllItem): sql = """insert into jianjie_huangye88_all (comp_url, comp_name, intro, city) VALUES(%s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['intro'], item['city'] ] elif isinstance(item, Huangye88AotuItem): sql = """insert into jianjie_huangye88_aotu (comp_url, comp_name, intro, posi, shengshi, cat) VALUES(%s, %s, %s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['intro'], item['posi'], item['shengshi'], item['cat'] ] elif isinstance(item, WuyouAllItem): sql = """insert into jianjie_wuyou_all (comp_url, comp_name, intro, area) VALUES(%s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['intro'], item['area'] ] elif isinstance(item, huang114AllItem): sql = """insert into jianjie_114_all_copy (comp_url, comp_name, link_man, tel, email, addr, intro) VALUES(%s, %s, %s, %s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['link_man'], item['tel'], item['email'], item['addr'], item['intro'] ] elif isinstance(item, ZhizaoAllItem): sql = """insert into jianjie_zhizao_all (comp_url, comp_name, addr, intro) VALUES(%s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['addr'], item['intro'] ] elif isinstance(item, Ca800Item): sql = """insert into jianjie_ca800_all (comp_url, comp_name, cat_url, cat, loc, sheng, shi, intro) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)""" args = [ item['comp_url'], item['comp_name'], item['cat_url'], item['cat'], item['loc'], 
item['sheng'], item['shi'], item['intro'] ] elif isinstance(item, JiqirenItem): sql = """insert into jianjie_jiqiren_all (zhuying, comp_url, comp_name, cat_url, cat, loc, sheng, shi, intro) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)""" args = [ item['zhuying'], item['comp_url'], item['comp_name'], item['cat_url'], item['cat'], item['loc'], item['sheng'], item['shi'], item['intro'] ] elif isinstance(item, JiqirenItem): sql = """insert into ChuanItem (zhuying, comp_url, comp_name, cat_url, cat, loc, sheng, shi, intro) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)""" args = [ item['zhuying'], item['comp_url'], item['comp_name'], item['cat_url'], item['cat'], item['loc'], item['sheng'], item['shi'], item['intro'] ] else: raise CloseSpider('no item match...') try: self.cursor.execute(sql, args) self.conn.commit() # print(str(item['comp_url']) + ' ' + str(item['comp_name'])) except pymysql.err.InterfaceError: print('reconnect mysql...') time.sleep(3) self.__init__() self.process_item(item, spider)
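# The isinstance() chain above repeats the same insert logic per item class and
# contains a second `elif isinstance(item, JiqirenItem)` branch that can never
# run, because the first JiqirenItem branch already matches. A hedged sketch of
# the same dispatch driven by a lookup table, which makes such duplicates
# impossible (TABLE_MAP and build_insert are assumed names; only a few entries
# are shown, and the item classes are the ones imported for the pipeline above):
TABLE_MAP = {
    Huangye88KunmingItem: ('jianjie_huangye88_kunming', ['comp_url', 'comp_name', 'intro']),
    Huangye88LiuzhouItem: ('jianjie_huangye88_liuzhou', ['comp_url', 'comp_name', 'intro']),
    ShunqiAllItem: ('jianjie_shunqi_all', ['comp_url', 'comp_name', 'intro', 'city']),
    # ...remaining item classes follow the same (table, columns) pattern
}

def build_insert(item):
    """Return (sql, args) for the table registered for this item's class."""
    for item_cls, (table, columns) in TABLE_MAP.items():
        if isinstance(item, item_cls):
            placeholders = ', '.join(['%s'] * len(columns))
            sql = 'insert into {} ({}) VALUES({})'.format(
                table, ', '.join(columns), placeholders)
            return sql, [item[col] for col in columns]
    raise CloseSpider('no item match...')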
def hooks(self, d): if d['status'] == 'finished': filename = d['filename'] l = filename.split('.') ext = l[len(l) - 1] print ext jsonfile = string.replace(filename, ext, 'info.json') info = json.loads(open(jsonfile).read()) outpath = 'cache/' + self.uuid + '_.mp4' length = service.utils.coverterMp4(filename, outpath) print length, outpath if not length: logger.error('error trancode mp4' + self.uuid) raise CloseSpider('covert failed') total_bytes = os.path.getsize(outpath) endpoint, backet, obj = service.utils.paseUploadUrl( self.upload_url) print endpoint, backet, obj result = service.utils.uploadVideo(outpath, endpoint, backet, obj) # os.remove('cache/' + info['id'] + '*') if not result: self.logger.error('upload video error', self.uuid) raise CloseSpider('upload oss failed') print 'easub_uuid', result if 'thumbnail' in info: cover = service.utils.get_clip_cover_url( info['thumbnail'], self.uuid) else: cover = '' data = { "video_id": self.uuid, "state": 1, "message": u'成功', "length": length, "play_id": self.uuid, "size": total_bytes, "cover": cover, "title": info['title'] } self.callbacked = service.utils.callback_result(self.callback, data=data) logger.info('[finished]' + str(self.callbacked) + '[uuid]' + self.uuid) video_data = { 'title': info['title'], 'video_id': self.video_id, 'author': info['extractor'], 'publish': time.strftime('%Y-%m-%d %H:%M:%S'), 'page_url': info['webpage_url'], 'video_length': length, 'video_size': total_bytes, 'video_url': '', 'easub_uuid': self.uuid, 'cover': cover } self.db.save_video(video_data) if d['status'] == 'error': print 'error', d['filename'] raise CloseSpider('download failed')
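# hooks() above is a youtube-dl progress hook: it receives a dict carrying at
# least 'status' and 'filename'. A hedged sketch of how such a hook is
# registered (the options and example URL are illustrative, not taken from the
# spider's actual configuration):
import youtube_dl

ydl_opts = {
    'outtmpl': 'cache/%(id)s.%(ext)s',
    'writeinfojson': True,               # produces the .info.json read in hooks()
    'progress_hooks': [lambda d: None],  # the spider would pass self.hooks here
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://example.com/some-video'])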
def parse(self, response):
    print 'parsePlayurl', response.url
    try:
        video_id = self._match_id(self.start_urls[0])
    except AssertionError, e:
        raise CloseSpider('link not supported')
def get_detail_post(self, response): """ get detail post :param response: :return: """ if self.close_down: raise CloseSpider('OVER NUMBER_POST') post_title = response.meta['post_title'] post_link = response.meta['post_link'] author = response.xpath( '//div[@class="details__author"]//a/img/@alt').extract_first() public_date = response.xpath( '//div[@class="details__meta"]/div[@class="meta"]/time/text()' ).extract_first() public_date = datetime.strptime(public_date, '%H:%M - %d/%m/%Y') public_date = public_date.timestamp() * 1000 div_body = response.xpath('//div[@class="pswp-content"]') arr_summary = div_body.xpath('//div[@class="sapo"]//text()').extract() summary = '' for i in arr_summary: i = re.sub('\s\s+', ' ', i) summary += i summary = summary.strip() div_content = div_body.xpath('//div[@class="cms-body detail"]/div/div') content = '' for _ in div_content: arr_content = _.xpath('//text()').extract() for i in arr_content: i = re.sub('\s\s+', ' ', i) content += i.strip() tag = '' try: div_tag = response.xpath('//div[@class="details__tags"]/a') for _ in div_tag: str_tag = _.xpath('//text()').extract_first() tag = str_tag.strip('') + '/' except: pass id_picture = str(uuid.uuid1()) + str(uuid.uuid1()) item = CrawlNewsItem() item_image = ImageItem() item['tbl_tag'] = 'tbl_news' item['id_picture'] = id_picture if 'source_title' in self.arr_detail: item['source_title'] = 'thanh nien' if 'source_link' in self.arr_detail: item['source_link'] = 'https://thanhnien.vn/' if 'category_title' in self.arr_detail: item['category_title'] = self.category_title if 'category_link' in self.arr_detail: item['category_link'] = self.category_link if 'post_title' in self.arr_detail: item['post_title'] = post_title if 'post_link' in self.arr_detail: item['post_link'] = post_link if 'sumary' in self.arr_detail: item['sumary'] = summary if 'content' in self.arr_detail: item['content'] = content if 'author' in self.arr_detail: item['author'] = author or '' if 'update_time' in self.arr_detail: item['update_time'] = int(round(time.time() * 1000)) if 'public_date' in self.arr_detail: item['public_date'] = public_date if 'tag' in self.arr_detail: item['tag'] = tag yield item arr_image = div_body.xpath('//img/@src').extract() arr_image = list(set(arr_image)) for i in arr_image: if i.find('https://image.thanhnien.vn') == 0: item_image['tbl_tag'] = 'tbl_images' item_image['id_picture'] = id_picture item_image['image'] = i yield item_image
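# In the tag loop above, `tag = str_tag.strip('') + '/'` overwrites `tag` on
# every pass (and strip('') removes nothing), so at most one tag survives;
# `_.xpath('//text()')` is also an absolute query, so it matches from the
# document root rather than the current <a>. A hedged sketch that gathers every
# tag text into one '/'-separated string (extract_tags is an assumed name):
def extract_tags(response):
    """Collect all tag texts from the details__tags block."""
    texts = response.xpath('//div[@class="details__tags"]/a//text()').extract()
    return '/'.join(t.strip() for t in texts if t.strip())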
def parse_page(self, response): ''' Parse the given page selecting the posts. Then ask recursively for another page. ''' # #open page in browser for debug # from scrapy.utils.response import open_in_browser # open_in_browser(response) #select all posts for post in response.xpath( "//div[contains(@data-ft,'top_level_post_id')]"): many_features = post.xpath('./@data-ft').get() date = [] date.append(many_features) date = parse_date(date, {'lang': self.lang}) current_date = datetime.strptime( date, '%Y-%m-%d %H:%M:%S') if date is not None else date if current_date is None: date_string = post.xpath('.//abbr/text()').get() date = parse_date2([date_string], {'lang': self.lang}) current_date = datetime(date.year, date.month, date.day) if date is not None else date date = str(date) #if 'date' argument is reached stop crawling if self.date > current_date: raise CloseSpider('Reached date: {}'.format(self.date)) new = ItemLoader(item=FbcrawlItem(), selector=post) if abs(self.count) + 1 > self.max: raise CloseSpider( 'Reached max num of post: {}. Crawling finished'.format( abs(self.count))) self.logger.info('Parsing post n = {}, post_date = {}'.format( abs(self.count) + 1, date)) new.add_xpath('comments', './div[2]/div[2]/a[1]/text()') new.add_value('date', date) new.add_xpath('post_id', './@data-ft') new.add_xpath('url', ".//a[contains(@href,'footer')]/@href") #page_url #new.add_value('url',response.url) #returns full post-link in a list post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() temp_post = response.urljoin(post[0]) self.count -= 1 yield scrapy.Request(temp_post, self.parse_post, priority=self.count, meta={'item': new}) #load following page, try to click on "more" #after few pages have been scraped, the "more" link might disappears #if not present look for the highest year not parsed yet #click once on the year and go back to clicking "more" #new_page is different for groups if self.group == 1: new_page = response.xpath( "//div[contains(@id,'stories_container')]/div[2]/a/@href" ).extract() else: new_page = response.xpath( "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href" ).extract() #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^ if not new_page: self.logger.info( '[!] "more" link not found, will look for a "year" link') #self.k is the year link that we look for if response.meta['flag'] == self.k and self.k >= self.year: xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str( self.k) + "')]/@href" new_page = response.xpath(xpath).extract() if new_page: new_page = response.urljoin(new_page[0]) self.k -= 1 self.logger.info( 'Found a link for year "{}", new_page = {}'.format( self.k, new_page)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k}) else: while not new_page: #sometimes the years are skipped this handles small year gaps self.logger.info( 'Link not found for year {}, trying with previous year {}' .format(self.k, self.k - 1)) self.k -= 1 if self.k < self.year: raise CloseSpider( 'Reached date: {}. 
Crawling finished'.format( self.date)) xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str( self.k) + "')]/@href" new_page = response.xpath(xpath).extract() self.logger.info( 'Found a link for year "{}", new_page = {}'.format( self.k, new_page)) new_page = response.urljoin(new_page[0]) self.k -= 1 yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k}) else: self.logger.info('Crawling has finished with no errors!') else: new_page = response.urljoin(new_page[0]) if 'flag' in response.meta: self.logger.info( 'Page scraped, clicking on "more"! new_page = {}'.format( new_page)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': response.meta['flag']}) else: self.logger.info( 'First page scraped, clicking on "more"! new_page = {}'. format(new_page)) yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag': self.k})
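# The year-fallback branch above rebuilds the same XPath string in two places.
# A hedged sketch of that lookup as one helper (find_year_link is an assumed
# name; the XPath is the one already used above):
def find_year_link(response, year):
    """Return the absolute URL of the 'year' pagination link, or None."""
    xpath = ("//div/a[contains(@href,'time') and contains(text(),'{}')]/@href"
             .format(year))
    href = response.xpath(xpath).extract_first()
    return response.urljoin(href) if href else None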
def start_requests(self): with open('config.json', 'r') as f: data = json.load(f) for i in data.items(): if i[0] == self.name: self.config.append(i) print(i[0]) f.close() for v in self.config: if len(v[1]) == 1: self.Index_Url = v[1][0]['Index_Url'] print( "At Time %s : 爬虫开始爬取层数为1的页面Title = %s , Index_Url = %s " % (time.ctime(), v[0], self.Index_Url), file=self.log) Max_Page = v[1][0]['Max_Page'] Final_Url = v[1][0]['Final_Url'] One_Xpath = v[1][0]['One_Xpath'] if Max_Page: headers = { 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36" } response = requests.get(self.Index_Url, headers=headers) soup = BeautifulSoup(response.content, "lxml") result = str(soup.select(Max_Page['soup'])) pageNums = re.search(Max_Page['re'], result).group() if Final_Url: url = re.sub(Final_Url, "{limit}", self.Index_Url) real_url = url.format(limit=pageNums) else: real_url = self.Index_Url request = Request(real_url, callback=self.parse) request.meta['One_Xpath'] = One_Xpath yield request if len(v[1]) == 2: self.Index_Url = v[1][0]['Index_Url'] print( "At Time %s : 爬虫开始爬取层数为2的页面Title = %s , Index_Url = %s " % (time.ctime(), v[0], self.Index_Url), file=self.log) print("!!!!!!!!!!!!!!!!!!!!!!!!!Index_Url = %s" % self.Index_Url) Max_Page = v[1][0]['Max_Page'] #Head_Url = v[1][0]['Head_Url'] Post_Data = v[1][0]['Post_Data'] Two_Xpath = v[1][1]['Two_Xpath'] headers = { 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36" } response = requests.get(self.Index_Url, headers=headers) soup = BeautifulSoup(response.content, "lxml") result = str(soup.select(Max_Page['soup'])) pageNums = re.search(Max_Page['re'], result).group() #urls = re.sub(Head_Url,"%s",self.Index_Url) if Post_Data: self.flag = 1 urls = get_HeadUrl(self.Index_Url, self.flag) if urls == -1: raise CloseSpider( "______________________________ 构造url失败,爬取结束,请查看日志!_____________________________" ) postdata = "" if Post_Data: keys = list(Post_Data.keys()) for key in keys: if Post_Data[key]: if re.search(Post_Data[key], str(soup)): postdata += (key + "=" + str( (re.search(Post_Data[key], str(soup)).group())).replace( "\"", "") + "&") else: postdata += (key + "=" + Post_Data[key] + "&") else: postdata += (key + "={page}&") if not postdata: urls = urls.replace("%s", "{page}") else: urls = urls % postdata for i in range(1, int(pageNums)): url = urls.format(page=str(i)) request = Request(url, callback=self.parse) request.meta['Two_Xpath'] = Two_Xpath yield request elif len(v[1]) == 3: self.Index_Url = v[1][0]['Index_Url'] print( "At Time %s : 爬虫开始爬取层数为3的页面Title = %s , Index_Url = %s " % (time.ctime(), v[0], self.Index_Url), file=self.log) Max_Page = v[1][0]['Max_Page'] #Head_Url = v[1][0]['Head_Url'] Post_Data = v[1][0]['Post_Data'] Valid_Url = v[1][1]['Valid_Url'] Three_Xpath = v[1][2]['Three_Xpath'] headers = { 'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36" } response = requests.get(self.Index_Url, headers=headers) soup = BeautifulSoup(response.content, "lxml") result = str(soup.select(Max_Page['soup'])) pageNums = re.search(Max_Page['re'], result).group() #urls = re.sub(Head_Url,"%s",self.Index_Url) print("最大页数是:%s" % pageNums) if Post_Data: self.flag = 1 urls = get_HeadUrl(self.Index_Url, self.flag) if urls == -1: raise CloseSpider( "______________________________ 构造url失败,爬取结束,请查看日志!_____________________________" ) #print urls postdata = "" if Post_Data: 
keys = list(Post_Data.keys()) for key in keys: if Post_Data[key]: if re.search(Post_Data[key], str(soup)): postdata += (key + "=" + quote_plus( (re.search(Post_Data[key], str(soup)).group()).replace( '"', "")) + "&") else: postdata += (key + "=" + Post_Data[key] + "&") else: postdata += (key + "={page}&") if not postdata: urls = urls.replace("%s", "{page}") else: urls = urls % postdata for i in range(1, int(pageNums)): url = urls.format(page=str(i)) request = Request(url, callback=self.parse_first) request.meta['Valid_Url'] = Valid_Url request.meta['Three_Xpath'] = Three_Xpath yield request
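# start_requests() above expects config.json to map each spider name to a list
# of one, two or three "layer" dicts. A hedged, illustrative example of the
# single-layer shape it reads (keys are the ones accessed in the code above;
# values are made-up placeholders). Two- and three-layer entries additionally
# carry Post_Data, Two_Xpath / Valid_Url and Three_Xpath dicts.
EXAMPLE_CONFIG = {
    "some_spider_name": [
        {
            "Index_Url": "http://example.com/list?page=1",
            "Max_Page": {"soup": "div.pager", "re": r"\d+"},
            "Final_Url": r"\d+$",
            # One_Xpath also holds the Lost_* selectors that parse() reads
            "One_Xpath": {"Lost_Xpath": "//div[@class='item']"},
        }
    ]
}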
def parse_item(self, response):
    item = item_Noticia()
    # HEADLINE
    item['titularNoticia'] = response.xpath(XPATH_NOTICIA_TITULO).extract()[0]
    # LINK
    item['linkNoticia'] = response.url
    # KEYWORDS
    # Keywords are stored in the format "A,B,C"
    item['keywordsNoticia'] = []
    try:
        keywords = response.xpath(XPATH_NOTICIA_KEYWORDS).extract()[0].split(",")
        for keyword in keywords:
            item['keywordsNoticia'].append(keyword.strip())
    except:
        item['keywordsNoticia'] = []
    # SUMMARY
    item['resumenNoticia'] = response.xpath(XPATH_NOTICIA_RESUMEN).extract()
    # AUTHORS
    # When there is more than one author, each one sits in a separate tag
    item['autorNoticia'] = []
    autores = response.xpath(XPATH_NOTICIA_AUTORES).extract()
    for autor in autores:
        item['autorNoticia'].append(autor.strip())
    # LOCATIONS
    # Not shown in the article. Sometimes the location appears next to the author, like this:
    # Juan Pérez. Barcelona
    # which clashes with authors who sign with the initials of their first and last names,
    # e.g. J. P.
    item['localizacionNoticia'] = []
    # DATE
    # Found inside the article as "YYYY-MM-ddThh:mm:ssZ"
    try:
        item['fechaPublicacionNoticia'] = response.xpath(
            XPATH_NOTICIA_FECHA_PUBLICACION).extract()[0]
    except:
        return
    # PHOTO CAPTION
    # 3 cases: 1) No photo. 2) Caption but NO credit. 3) Caption and credit
    try:
        pieDeFoto = response.xpath(XPATH_NOTICIA_FOTO_PIE).extract()[0].strip()
        item['pieDeFotoNoticia'] = pieDeFoto.split("(")[0].strip()
    except:
        item['pieDeFotoNoticia'] = ""
        item['firmaDeFotoNoticia'] = ""
    # PHOTO CREDIT
    try:
        item['firmaDeFotoNoticia'] = pieDeFoto.split("(")[1].split(")")[0].strip()
    except:
        item['firmaDeFotoNoticia'] = ""
    # BODY
    listPartesCuerpo = response.xpath(XPATH_NOTICIA_CUERPO).extract()
    cuerpoNoticia = "".join(listPartesCuerpo)
    cuerpoNoticia = TAG_RE.sub('', cuerpoNoticia)
    item['cuerpoNoticia'] = cuerpoNoticia
    # TAGS
    item['tagsNoticia'] = []
    tagsNoticia = response.xpath(XPATH_NOTICIA_TAGS).extract()
    for tag in tagsNoticia:
        item['tagsNoticia'].append(tag)
    # TEST ZONE
    #self.newsCount+=1
    if self.newsCount > 10:
        raise CloseSpider("\x1b[1;33m" + "Test news items collected" + "\033[0;m")
    yield item
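# A standalone version of the caption/credit split used for the photo fields
# above: "Caption text (Photographer)" -> ("Caption text", "Photographer").
# split_caption is an assumed name for illustration.
def split_caption(raw):
    caption = raw.split("(")[0].strip()
    credit = raw.split("(")[1].split(")")[0].strip() if "(" in raw else ""
    return caption, credit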
def parse(self, response): item = CollectorSpiderItem() One_Xpath = response.meta.get('One_Xpath', None) Two_Xpath = response.meta.get('Two_Xpath', None) Three_Xpath = response.meta.get('Three_Xpath', None) if One_Xpath: for i in response.xpath(One_Xpath['Lost_Xpath']): item['lost_url'] = response.url item['lost_from'] = "" if not re.search( One_Xpath['Lost_From'], response.url).group() else re.search( One_Xpath['Lost_From'], response.url).group() item['lost_id'] = format_string("" if not i.xpath( One_Xpath['Lost_Id'] if One_Xpath['Lost_Id'] else "/" ).extract() else i.xpath( One_Xpath['Lost_Id'] if One_Xpath['Lost_Id'] else "/"). extract()[0]) item['lost_title'] = format_string("" if not i.xpath( One_Xpath['Lost_Title'] if One_Xpath['Lost_Title'] else "/" ).extract() else i.xpath( One_Xpath['Lost_Title'] if One_Xpath['Lost_Title'] else "/" ).extract()[0]) item['lost_describe'] = format_string("" if not i.xpath( One_Xpath['Lost_Describe'] if One_Xpath['Lost_Describe'] else "/").extract() else i.xpath( One_Xpath['Lost_Describe'] if One_Xpath['Lost_Describe'] else "/").extract()[0]) item['lost_person'] = format_string("" if not i.xpath( One_Xpath['Lost_Person'] if One_Xpath['Lost_Person'] else "/").extract() else i.xpath( One_Xpath['Lost_Person'] if One_Xpath['Lost_Person'] else "/").extract()[0]) item['lost_time'] = format_time( format_string("" if not i.xpath( One_Xpath['Lost_Time'] if One_Xpath['Lost_Time'] else "/").extract() else i.xpath( One_Xpath['Lost_Time'] if One_Xpath['Lost_Time'] else "/").extract()[0])) item['lost_location'] = One_Xpath['Lost_Location'][1] + ( format_string("" if not i.xpath(One_Xpath['Lost_Location'][ 0] if One_Xpath['Lost_Location'][0] else "/").extract( ) else i.xpath(One_Xpath['Lost_Location'][0] if One_Xpath['Lost_Location'][0] else "/"). extract()[0])) item['lost_mid'] = hashlib.md5( (item['lost_from'] + item['lost_id'] + item['lost_describe'] + item['lost_time']).encode('utf-8')).hexdigest()[8:-8] if os.path.exists( "/home/hong/文档/sina_working/2to3_test/filter.bloom"): #token = str(item['lost_url'])+str(item['lost_id'])+str(item['lost_describe']) token = item['lost_mid'] if self.bf.__contains__(token): print( "\ntime waiting......\ntime waiting......\ntime waiting......\n\nAt Time %s , The spider TOKEN : %s has been destroied_______________" % (time.ctime(), token), file=self.log) self.log.close() #time.sleep(10) raise CloseSpider( "______________________________ item已经捕获重复,爬取结束!_____________________________" ) yield item elif Two_Xpath: for i in response.xpath(Two_Xpath['Lost_Xpath']): #item['lost_mid'] = resposne.url item['lost_url'] = response.url item['lost_from'] = "" if not re.search( Two_Xpath['Lost_From'], response.url).group() else re.search( Two_Xpath['Lost_From'], response.url).group() item['lost_id'] = format_string("" if not i.xpath( Two_Xpath['Lost_Id'] if Two_Xpath['Lost_Id'] else "/" ).extract() else i.xpath( Two_Xpath['Lost_Id'] if Two_Xpath['Lost_Id'] else "/"). 
extract()[0]) item['lost_title'] = format_string("" if not i.xpath( Two_Xpath['Lost_Title'] if Two_Xpath['Lost_Title'] else "/" ).extract() else i.xpath( Two_Xpath['Lost_Title'] if Two_Xpath['Lost_Title'] else "/" ).extract()[0]) item['lost_describe'] = format_string("" if not i.xpath( Two_Xpath['Lost_Describe'] if Two_Xpath['Lost_Describe'] else "/").extract() else i.xpath( Two_Xpath['Lost_Describe'] if Two_Xpath['Lost_Describe'] else "/").extract()[0]) item['lost_person'] = format_string("" if not i.xpath( Two_Xpath['Lost_Person'] if Two_Xpath['Lost_Person'] else "/").extract() else i.xpath( Two_Xpath['Lost_Person'] if Two_Xpath['Lost_Person'] else "/").extract()[0]) item['lost_time'] = format_time( format_string("" if not i.xpath( Two_Xpath['Lost_Time'] if Two_Xpath['Lost_Time'] else "/").extract() else i.xpath( Two_Xpath['Lost_Time'] if Two_Xpath['Lost_Time'] else "/").extract()[0])) item['lost_location'] = Two_Xpath['Lost_Location'][ 1] + format_string("" if not i.xpath( Two_Xpath['Lost_Location'][0] if Two_Xpath['Lost_Location'][0] else "/").extract( ) else i.xpath(Two_Xpath['Lost_Location'][0] if Two_Xpath['Lost_Location'][0] else "/"). extract()[0]) item['lost_mid'] = hashlib.md5( (item['lost_from'] + item['lost_id'] + item['lost_describe'] + item['lost_time']).encode('utf-8')).hexdigest()[8:-8] #time_temp = re.search(r'\d+-\d+-\d+',str(item['lost_time'])).group() #if not re.search(r'20',time_temp): # time_temp = "20"+time_temp #print "time_temp = %s"%time_temp #time_stamp = datetime.datetime(int(re.search(r'\d+',time_temp).group()),int(re.search(r'(?<=-)\d+',time_temp).group()),int(re.search(r'\d+$',time_temp).group())) #if time.mktime(time_stamp.timetuple()) < time.mktime(self.one_month_ago.timetuple()): # print >> self.log,"At Time %s , the item[%s] : the datetime is overtimed._____________"%(time.ctime(),time_stamp) # raise CloseSpider("_____________________________The datetime is overtimed,爬取结束!!_______________________") if os.path.exists( "/home/hong/文档/sina_working/2to3_test/filter.bloom"): #token = str(item['lost_url'])+str(item['lost_id'])+str(item['lost_describe']) token = item['lost_mid'] if self.bf.__contains__(token): #self.log.write("TRUE,存在重复元素,到达这里没有?") print( "\ntime waiting......\ntime waiting......\ntime waiting......\n\nAt Time %s , The spider TOKEN : %s has been destroied_______________" % (time.ctime(), token), file=self.log) self.log.close() #time.sleep(10) raise CloseSpider( "______________________________ item已经捕获重复,爬取结束!_____________________________" ) yield item else: item['lost_url'] = response.url item['lost_from'] = "" if not re.search( Three_Xpath['Lost_From'], response.url).group() else re.search( Three_Xpath['Lost_From'], response.url).group() item['lost_id'] = format_string("" if not response.xpath( Three_Xpath['Lost_Id'] if Three_Xpath['Lost_Id'] else "/" ).extract() else response.xpath( Three_Xpath['Lost_Id'] if Three_Xpath['Lost_Id'] else "/"). 
extract()[0]) item['lost_title'] = format_string("" if not response.xpath( Three_Xpath['Lost_Title'] if Three_Xpath['Lost_Title'] else "/" ).extract() else response.xpath( Three_Xpath['Lost_Title'] if Three_Xpath['Lost_Title'] else "/" ).extract()[0]) item['lost_describe'] = format_string("" if not response.xpath( Three_Xpath['Lost_Describe'] if Three_Xpath['Lost_Describe'] else "/").extract() else response.xpath( Three_Xpath['Lost_Describe'] if Three_Xpath['Lost_Describe'] else "/").extract()[0]) item['lost_person'] = format_string("" if not response.xpath( Three_Xpath['Lost_Person'] if Three_Xpath['Lost_Person'] else "/").extract() else response.xpath( Three_Xpath['Lost_Person'] if Three_Xpath['Lost_Person'] else "/").extract()[0]) item['lost_time'] = format_time( format_string("" if not response.xpath( Three_Xpath['Lost_Time'] if Three_Xpath['Lost_Time'] else "/").extract() else response.xpath( Three_Xpath['Lost_Time'] if Three_Xpath['Lost_Time'] else "/").extract()[0])) #print(type(Three_Xpath['Lost_Location'][1])) item['lost_location'] = Three_Xpath['Lost_Location'][ 1] + format_string("" if not response.xpath( Three_Xpath['Lost_Location'][0] if Three_Xpath['Lost_Location'][0] else "/" ).extract()[0] else response.xpath( Three_Xpath['Lost_Location'][0] if Three_Xpath['Lost_Location'][0] else "/").extract()[0]) item['lost_mid'] = hashlib.md5( (item['lost_from'] + item['lost_id'] + item['lost_describe'] + item['lost_time']).encode('utf-8')).hexdigest()[8:-8] if os.path.exists( "/home/hong/文档/sina_working/2to3_test/filter.bloom"): #token = str(item['lost_url']+item['lost_id']+item['lost_describe']) token = item['lost_mid'] if self.bf.__contains__(token): print( "\ntime waiting......\ntime waiting......\ntime waiting......\n\nAt Time %s , The spider TOKEN : %s has been destroied_______________" % (time.ctime(), token), file=self.log) self.log.close() #time.sleep(10) raise CloseSpider( "______________________________ url已经捕获重复,爬取结束!_____________________________" ) yield item
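# All three branches above build the same dedup token: an md5 of four item
# fields with the first and last eight hex characters dropped, which is then
# probed against the bloom filter. A hedged sketch of that token construction
# as one helper (make_mid is an assumed name):
import hashlib

def make_mid(item):
    """Build the 16-character md5 fragment used as lost_mid above."""
    raw = (item['lost_from'] + item['lost_id'] +
           item['lost_describe'] + item['lost_time']).encode('utf-8')
    return hashlib.md5(raw).hexdigest()[8:-8]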
def spider_closed(self, spider):
    self.file.close()
    raise CloseSpider('Shutdown by ctrl-c')
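# spider_closed() above runs from the spider_closed signal, so the spider is
# already shutting down when it fires and the CloseSpider it raises presumably
# adds nothing beyond a logged error; the file cleanup is what matters. A
# hedged sketch of the usual signal wiring (the class skeleton is illustrative):
import scrapy
from scrapy import signals

class ExampleSpider(scrapy.Spider):
    name = 'example'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ExampleSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        # cleanup only; nothing needs to be raised here
        pass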
def _set_start_urls(self, scrape_url): self.start_urls = [] if self.scraper.pagination_type in [ 'R', 'F', ]: if not self.scraper.pagination_page_replace: msg = 'Please provide a pagination_page_replace context corresponding to pagination_type!' self.dds_logger.error(msg) raise CloseSpider() if self.scraper.pagination_type == 'R': try: pages = self.scraper.pagination_page_replace pages = pages.split(',') if len(pages) > 3: raise Exception pages = list(range(*list(map(int, pages)))) except Exception: msg = 'Pagination_page_replace for pagination_type "RANGE_FUNCT" ' +\ 'has to be provided as python range function arguments ' +\ '[start], stop[, step] (e.g. "1, 50, 10", no brackets)!' self.dds_logger.error(msg) raise CloseSpider() pages = self.limit_page_nums(pages) if self.scraper.pagination_type == 'F': try: pages = self.scraper.pagination_page_replace pages = pages.strip(', ') pages = ast.literal_eval("[" + pages + ",]") except: msg = 'Wrong pagination_page_replace format for pagination_type "FREE_LIST", ' +\ "Syntax: 'Replace string 1', 'Another replace string 2', 'A number 3', ..." self.dds_logger.error(msg) raise CloseSpider() pages = self.limit_page_nums(pages) if self.scraper.pagination_type in [ 'R', 'F', ]: append_str = self.scraper.pagination_append_str if scrape_url[-1:] == '/' and append_str[0:1] == '/': append_str = append_str[1:] self.pages = pages if self.conf['MAX_PAGES_READ']: self.pages = self.pages[0:self.conf['MAX_PAGES_READ']] for page in self.pages: url = scrape_url + append_str.format(page=page) self.start_urls.append(url) if not self.scraper.pagination_on_start and not self.conf[ 'START_PAGE']: self.start_urls.insert(0, scrape_url) self.pages.insert(0, "") if self.scraper.pagination_type in [ 'N', 'O', ]: self.start_urls.append(scrape_url) self.pages = [ "", ] num = len(self.start_urls) if (num == 1): url_str = 'URL' else: url_str = 'URLs' self.log( "Scraper set to run on {num} start {url_str}.".format( num=num, url_str=url_str), logging.INFO)
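# A minimal standalone illustration of how the RANGE_FUNCT pagination above
# turns a "start, stop[, step]" string into appended start URLs (the URL and
# append string are made-up examples, not taken from a real scraper config):
replace = "1, 50, 10"
pages = list(range(*map(int, replace.split(','))))        # [1, 11, 21, 31, 41]
start_urls = ['https://example.com/list' + '?page={page}'.format(page=p)
              for p in pages]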