Example no. 1
0
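    # Parses a main or follow page: extracts the base objects (JSONPath for JSON
    # responses, XPath otherwise), builds one item per object, issues detail page
    # requests where configured and finally follows pagination if set up.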
    def parse(self, response):
        xs = Selector(response)
        base_objects = []
        base_elem = self.scraper.get_base_elem()
        rpt = response.request.meta['rpt']

        page_num = response.request.meta['page_num']
        page = self.pages[page_num - 1]
        follow_page_num = response.request.meta['follow_page_num']

        if rpt.page_type == 'MP':
            if self.current_output_num_mp_response_bodies < self.conf[
                    'OUTPUT_NUM_MP_RESPONSE_BODIES']:
                self.current_output_num_mp_response_bodies += 1
                self.log(
                    "Response body ({url})\n\n***** RP_MP_{num}_START *****\n{resp_body}\n***** RP_MP_{num}_END *****\n\n"
                    .format(url=response.url,
                            resp_body=response.body,
                            num=self.current_output_num_mp_response_bodies),
                    logging.INFO)

        if rpt.content_type == 'J':
            json_resp = None
            try:
                json_resp = json.loads(response.body_as_unicode())
            except ValueError:
                msg = "JSON response for MP could not be parsed!"
                self.log(msg, logging.ERROR)
            if json_resp:
                try:
                    jsonpath_expr = parse(base_elem.x_path)
                except JsonPathLexerError:
                    msg = "JsonPath for base elem could not be processed!"
                    self.dds_logger.error(msg)
                    raise CloseSpider()
                base_objects = [
                    match.value for match in jsonpath_expr.find(json_resp)
                ]
                if len(base_objects) > 0:
                    base_objects = base_objects[0]
        else:
            base_objects = response.xpath(base_elem.x_path)

        if (len(base_objects) == 0):
            self.log(
                "{cs}No base objects found.{ce}".format(
                    cs=self.bcolors["INFO"], ce=self.bcolors["ENDC"]),
                logging.ERROR)

        if (self.conf['MAX_ITEMS_READ']):
            items_left = min(
                len(base_objects),
                self.conf['MAX_ITEMS_READ'] - self.items_read_count)
            base_objects = base_objects[0:items_left]

        for obj in base_objects:
            item_num = self.items_read_count + 1
            self.tmp_non_db_results[item_num] = {}
            page_str = str(page_num) + '(' + str(follow_page_num) + ')'
            self.dds_logger.info("")
            self.dds_logger.info(
                self.bcolors['BOLD'] +
                '--------------------------------------------------------------------------------------'
                + self.bcolors['ENDC'])
            self.struct_log(
                "{cs}Starting to crawl item {i} from page {p}.{ce}".format(
                    i=str(item_num),
                    p=page_str,
                    cs=self.bcolors["HEADER"],
                    ce=self.bcolors["ENDC"]))
            self.dds_logger.info(
                self.bcolors['BOLD'] +
                '--------------------------------------------------------------------------------------'
                + self.bcolors['ENDC'])
            item = self.parse_item(response, obj, rpt.page_type, item_num)
            item._dds_item_page = page
            item._dds_item_page_num = page_num
            item._dds_item_follow_page_num = follow_page_num
            item._dds_item_id = item_num
            item._dds_id_str = str(item._dds_item_page_num) + '(' + str(
                item._dds_item_follow_page_num) + ')-' + str(item._dds_item_id)

            if item:
                only_main_page_idfs = True
                idf_elems = self.scraper.get_id_field_elems()
                for idf_elem in idf_elems:
                    if idf_elem.request_page_type != 'MP':
                        only_main_page_idfs = False

                is_double = False
                if only_main_page_idfs:
                    item, is_double = self._check_for_double_item(item)

                # Don't go on reading detail pages when:
                # - no detail page URLs are defined, or
                # - the item is a DOUBLE with only main page IDFs and no standard update
                #   elements to be scraped from detail pages, or
                # - generally no attributes are scraped from detail pages
                cnt_sue_detail = self.scraper.get_standard_update_elems_from_detail_pages(
                ).count()
                cnt_detail_scrape = self.scraper.get_from_detail_pages_scrape_elems(
                ).count()

                if self.scraper.get_detail_page_url_elems().count() == 0 or \
                    (is_double and cnt_sue_detail == 0) or cnt_detail_scrape == 0:
                    self.non_db_results[id(
                        item)] = self.tmp_non_db_results[item_num].copy()
                    yield item
                else:
                    #self.run_detail_page_request()
                    url_elems = self.scraper.get_detail_page_url_elems()
                    for url_elem in url_elems:
                        if not url_elem.scraped_obj_attr.save_to_db:
                            url_before = self.tmp_non_db_results[item_num][
                                url_elem.scraped_obj_attr.name]
                            url, applied = self._replace_placeholders(
                                url_before, item, item_num, True)
                            self.tmp_non_db_results[item_num][
                                url_elem.scraped_obj_attr.name] = url
                        else:
                            url_before = item[url_elem.scraped_obj_attr.name]
                            url, applied = self._replace_placeholders(
                                url_before, item, item_num, True)
                            item[url_elem.scraped_obj_attr.name] = url
                        if len(applied) > 0:
                            msg = "Detail page URL placeholder(s) applied (item {id}): {a}".format(
                                a=str(applied), id=item._dds_id_str)
                            self.log(msg, logging.DEBUG)
                            self.log("URL before: " + url_before,
                                     logging.DEBUG)
                            self.log("URL after : " + url, logging.DEBUG)
                        dp_rpt = self.scraper.get_rpt_for_scraped_obj_attr(
                            url_elem.scraped_obj_attr)
                        kwargs = self.dp_request_kwargs[
                            dp_rpt.page_type].copy()

                        if 'meta' not in kwargs:
                            kwargs['meta'] = {}
                        kwargs['meta']['page_num'] = page_num
                        kwargs['meta']['follow_page_num'] = follow_page_num
                        kwargs['meta']['item'] = item
                        kwargs['meta']['from_page'] = dp_rpt.page_type
                        kwargs['meta']['item_num'] = item_num

                        kwargs['meta']['rpt'] = dp_rpt

                        if 'headers' in kwargs:
                            kwargs['headers'] = self._do_req_info_replacements(
                                item, item_num, page, kwargs['headers'],
                                "HEADERS")
                        if 'body' in kwargs:
                            body_before = kwargs['body']
                            kwargs['body'] = kwargs['body'].replace(
                                '{page}', str(page))
                            kwargs[
                                'body'], applied = self._replace_placeholders(
                                    kwargs['body'], item, item_num, True)
                            if len(applied) > 0:
                                msg = "Request info placeholder(s) applied (item {id}): {a}".format(
                                    a=str(applied), id=item._dds_id_str)
                                self.log(msg, logging.DEBUG)
                                self.log("BODY before: " + body_before,
                                         logging.DEBUG)
                                self.log("BODY after : " + kwargs['body'],
                                         logging.DEBUG)
                        if 'cookies' in kwargs:
                            kwargs['cookies'] = self._do_req_info_replacements(
                                item, item_num, page, kwargs['cookies'],
                                "COOKIES")
                        form_data = None
                        if dp_rpt.request_type == 'F' and dp_rpt.form_data:
                            form_data = json.loads(dp_rpt.form_data).copy()
                            form_data = self._do_req_info_replacements(
                                item, item_num, page, form_data, "FORM DATA")

                        if url_elem == url_elems[len(url_elems) - 1]:
                            kwargs['meta']['last'] = True
                        else:
                            kwargs['meta']['last'] = False
                        self._set_meta_splash_args()
                        #logging.info(str(kwargs))
                        self.log(
                            ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>",
                            logging.INFO)
                        msg = "{cs}Calling {dp} URL for item {id}...{ce}".format(
                            dp=dp_rpt.page_type,
                            id=item._dds_id_str,
                            cs=self.bcolors["HEADER"],
                            ce=self.bcolors["ENDC"])
                        self.log(msg, logging.INFO)
                        msg = "URL     : {url}".format(url=url)
                        self.log(msg, logging.INFO)
                        self._log_request_info(dp_rpt, form_data, kwargs)
                        self.log(
                            ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>",
                            logging.INFO)

                        if dp_rpt.request_type == 'R':
                            yield response.follow(
                                url,
                                callback=self.parse_item,
                                method=dp_rpt.method,
                                dont_filter=dp_rpt.dont_filter,
                                **kwargs)
                        else:
                            yield FormRequest(url,
                                              callback=self.parse_item,
                                              method=dp_rpt.method,
                                              formdata=form_data,
                                              dont_filter=dp_rpt.dont_filter,
                                              **kwargs)
                for key, value in list(item.items()):
                    #Fixing some extremely weird Python 2 encoding failure, 2017-06-29
                    if type(value).__name__ == 'str':
                        try:
                            value = value.decode('utf-8')
                        except AttributeError:
                            pass
                    if value and (type(value).__name__
                                  in ['str', 'unicode']) and '{page}' in value:
                        msg = "Applying page placeholder on {k}...".format(
                            k=key)
                        self.log(msg, logging.DEBUG)
                        self.log("Value before: " + value, logging.DEBUG)
                        value = value.replace('{page}', str(page))
                        item[key] = value
                        self.log("Value after: " + value, logging.DEBUG)
            else:
                self.log("Item could not be read!", logging.ERROR)

        mir_reached = False
        if self.conf['MAX_ITEMS_READ'] and (
                self.conf['MAX_ITEMS_READ'] - self.items_read_count <= 0):
            mir_reached = True
        if self.scraper.follow_pages_url_xpath and not mir_reached:
            if not self.conf['NUM_PAGES_FOLLOW'] or follow_page_num < self.conf[
                    'NUM_PAGES_FOLLOW']:
                url = response.xpath(
                    self.scraper.follow_pages_url_xpath).extract_first()
                if url is not None:
                    self._set_meta_splash_args()
                    follow_page = ''
                    if self.scraper.follow_pages_page_xpath:
                        follow_page = response.xpath(
                            self.scraper.follow_pages_page_xpath
                        ).extract_first()
                    form_data_orig = None
                    if self.scraper.get_follow_page_rpts().count() > 0:
                        f_rpt = self.scraper.get_follow_page_rpts()[0]
                        form_data_orig = self.scraper.get_follow_page_rpts(
                        )[0].form_data
                    else:
                        f_rpt = self.scraper.get_main_page_rpt()
                        form_data_orig = self.scraper.get_main_page_rpt(
                        ).form_data
                    kwargs, form_data = self._prepare_mp_req_data(
                        self.fp_request_kwargs, form_data_orig, page,
                        follow_page)

                    follow_page_num += 1
                    kwargs['meta']['page_num'] = page_num
                    kwargs['meta']['follow_page_num'] = follow_page_num
                    kwargs['meta']['rpt'] = f_rpt

                    self._log_page_info(page_num, follow_page_num, url, f_rpt,
                                        form_data, kwargs)

                    if f_rpt.request_type == 'R':
                        yield response.follow(url,
                                              callback=self.parse,
                                              method=f_rpt.method,
                                              dont_filter=f_rpt.dont_filter,
                                              **kwargs)
                    else:
                        url = response.urljoin(url)
                        yield FormRequest(url,
                                          callback=self.parse,
                                          method=f_rpt.method,
                                          formdata=form_data,
                                          dont_filter=f_rpt.dont_filter,
                                          **kwargs)
Example no. 2
0
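    # Parses a movie page (the markup looks like IMDb's): skips non-movie pages and
    # movies outside 1980-1989, yields one record per cast member and queues each
    # actor's page for parse_artist.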
    def parse(self, response):
        type = response.css('head').xpath(
            './meta[@property="og:type"]/@content').get().split('.')[1]
        if not type == "movie":
            yield None
            return
        titleSection = response.css('.subpage_title_block')
        if titleSection is None:
            yield None
            return
        idMovie = titleSection.css(".parent").xpath(
            "./h3/a/@href").get().split('/')[2]
        if idMovie in self.moviesScrapped:
            yield None
            return
        movieYear = titleSection.css('.nobr').xpath('./text()').get()
        if movieYear is None:
            yield None
            return
        movieYear = movieYear.strip().replace(')', '(').split('(')[1].split(' ')[0]
        if (int(movieYear) < 1980 or int(movieYear) > 1989):
            yield None
            return

        movieName = titleSection.xpath('./div/h3/a/text()').get()
        actorList = response.css('.cast_list').xpath('./tr')[1::]
        nextScrap = []
        for c in actorList:
            if self.documentscount >= 5000:
                yield None
                raise CloseSpider('Number of documents reached')
            if (len(c.xpath('./td').getall()) < 3):
                continue
            actorURL = c.xpath('./td/a/@href').get()
            if actorURL is None:
                continue
            actorId = actorURL.split('/')[2]
            actorName = c.xpath(
                './td[@class="primary_photo"]//a/img/@alt').get()
            actorRole = c.xpath(
                './td[@class="character"]/text()').get().strip().replace(
                    "\n", "")
            if actorRole == '':
                actorRole = c.xpath('./td[@class="character"]/a/text()').get()

            nextScrap.append({
                "url": self.allowed_domains[0] + actorURL,
                "id": actorId
            })
            yield {
                "movie_id": idMovie,
                "movie_name": movieName,
                "movie_year": movieYear,
                "actor_name": actorName,
                "actor_id": actorId,
                "role_name": actorRole
            }
            self.documentscount = self.documentscount + 1
        self.moviesScrapped.append(idMovie)
        for a in nextScrap:
            if a['id'] not in self.actorsScrapped:
                self.actorsScrapped.append(a['id'])
                next_page = "https://" + a['url']
                yield Request(next_page, callback=self.parse_artist)
Example no. 3
0
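    # Parses a Facebook page timeline, either the initial HTML or the
    # 'reaction_units/more' AJAX payload: loads post data (text, media, counts) and
    # comments into item loaders, then requests the next timeline chunk.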
    def parse_page(self, response):
        # inspect_response(response, self)
        if 'reaction_units/more' in response.url:
            json_data = json.loads(response.body_as_unicode().replace(
                'for (;;);', ''))
            post_html = json_data.get('domops')[0][-1].get('__html')
            structural_json_data = self._create_structed_json_data(json_data)
        else:
            main_content_id = response.css(
                '#pagelet_timeline_main_column>div::attr(id)').extract_first()
            if not main_content_id:
                raise CloseSpider('Main content id not found')

            main_script = response.xpath(
                f'//script/text()[contains(.,"{main_content_id}") '
                f'and contains(.,"content:")]').extract_first()
            main_id = re.search(r'container_id\:"(.*?)"', main_script).group(1)
            post_html = response.css(f'#{main_id}').extract_first()
            post_html = post_html.replace('-->', '').replace('<!--', '')

        sel = Selector(text=post_html)
        posts = sel.xpath(
            '//div[@class="_1xnd"]'
            '/div[@class and not(descendant::*[contains(@class,"uiMorePagerPrimary")])]'
        )
        page_name = response.meta.get('page_name') or \
                    response.css('#pageTitle::text').extract_first()
        page_name = page_name.split('-')[0].rstrip()
        page_id = response.meta.get('page_id')

        for post in posts:
            loader = FacebookPostItemLoader(selector=post)
            loader.add_value('page_name', page_name)
            loader.add_value('page_id', page_id)
            loader.add_css('post_id', 'input[name*="identifier"]::attr(value)')
            post_id = loader.get_output_value('post_id')
            loader.add_value(
                'post_url',
                f'https://www.facebook.com/{page_id}/posts/{post_id}')
            loader.add_xpath(
                'post_text', './/div[@data-testid="post_message"]'
                '//text()[not(ancestor::span[@class="text_exposed_hide"])]')
            loader.add_css('image_urls', '.mtm a::attr(data-ploi)',
                           MapCompose(lambda v: v.split('?')[0]))
            loader.add_css(
                'video_url', '.fsm>a::attr(href)',
                MapCompose(response.urljoin, lambda v: v
                           if 'videos' in v else None,
                           lambda v: v.split('?')[0]))
            if 'reaction_units/more' in response.url:
                post_json_data = structural_json_data.get(post_id)
            else:
                # inspect_response(response, self)
                post_script = response.xpath(
                    f'//script/text()[contains(.,"{post_id}") '
                    f'and (contains(.,"post_fbid") or contains(.,"photo_fbid"))]'
                ).extract_first()
                post_script = re.search(
                    r'onPageletArrive\((\{.*\})',
                    post_script).group(1).split('all_phases')[0] + '}'
                json_data = demjson.decode(post_script)
                json_data = json_data.get('jsmods').get(
                    'pre_display_requires')[0][3][1].get('__bbox')
                variables = json_data.get('variables')
                post_json_data = json_data.get('result').get('data').get(
                    'feedback')

            loader.add_value(
                'comment_count',
                post_json_data.get('comment_count').get('total_count'))
            loader.add_value('reaction_count',
                             post_json_data.get('reaction_count').get('count'))
            loader.add_value('share_count',
                             post_json_data.get('share_count').get('count'))
            comment_json = post_json_data.get('display_comments')
            edges = comment_json.get('edges')
            for edge in edges:
                comment_loader = FacebookCommentItemLoader()
                node = edge.get('node')
                comment_loader.add_value('comment_id', node.get('id'))
                try:
                    comment_loader.add_value('comment_text',
                                             node.get('body').get('text'))
                except AttributeError:
                    pass
                author = node.get('author')
                comment_loader.add_value('author_name', author.get('name'))
                comment_loader.add_value('author_id', author.get('id'))
                comment_loader.add_value('author_url', author.get('www_url'))
                loader.add_value('comments', comment_loader.load_item())

            yield loader.load_item()

            # TODO: Fetch first 50 comments
            # page_info = comment_json.get('page_info')
            # has_next_comment_page = page_info.get('has_next_page')
            # if has_next_comment_page:
            #     end_cursor = page_info.get('end_cursor')
            #     variables['after'] = end_cursor
            #     variables['before'] = None
            #
            #     # yield Request(
            #     #     url='https://www.facebook.com/api/graphql/',
            #     #     method='POST',
            #     #     body=json.dumps(body),
            #     #     callback=self.parse_next_comment,
            #     #     headers=headers,
            #     # )

        async_get_token = response.xpath(
            '//script/text()[contains(.,"async_get_token")]').extract_first(
            ) or response.body_as_unicode()
        async_get_token = re.search(r'"async_get_token"\:"(.*?)"',
                                    async_get_token).group(1)

        next_page = sel.css(
            '.uiMorePagerPrimary::attr(ajaxify)').extract_first()
        if next_page:
            next_url = response.urljoin(next_page)
            extra_params = urllib.parse.urlencode({
                '__a': 1,
                'fb_dtsg_ag': async_get_token
            })
            next_url += '&' + extra_params
            yield Request(next_url,
                          callback=self.parse_page,
                          meta={
                              'page_name': page_name,
                              'page_id': page_id
                          })
Example no. 4
0
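# Resolves a zoomin.tv video through the blackbird.zoomin.tv JSON feed, uploads it to
# object storage, reports the outcome to a callback URL and stores the metadata in the
# configured database (Python 2 code).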
class ZoominTvSpider(scrapy.Spider):
    name = "zoomin.tv"
    allowed_domains = ["zoomin.tv"]
    callbacked = False
    pids = [
        'corporateusahddp', 'corporateuk', 'corporateke', 'corporatees',
        'corporatelatamdp', 'corporatecataldp', 'corporatenl', 'corporatevla',
        'corporatede', 'corporateit', 'corporatefr', 'corporatewal',
        'corporatebradp', 'corporatetr', 'corporateswedp', 'corporateru',
        'corporatejp', 'corporatechinacndp', 'corporatearabdp'
    ]

    # start_urls = (
    #     'http://www.zoomin.tv/',
    # )
    # http://blackbird.zoomin.tv/ProgramXml/.json?feedtype=json&pid=corporateusahddp&vtype=direct&aid=754116
    # http://zoomin.tv/video/#!v/754116/

    def __init__(self,
                 url,
                 uuid,
                 upload_url,
                 callback,
                 check_video_url=None,
                 *args,
                 **kwargs):
        super(ZoominTvSpider, self).__init__(*args, **kwargs)
        print 'init', url
        self.config = ConfigParser.ConfigParser()
        self.config.read("config/config.ini")
        self.uuid = uuid
        self.upload_url = upload_url
        self.callback = callback
        self.check_video_url = check_video_url
        # initialize db
        with open("config/database.cnf") as f:
            config = json.load(f)
        db_cls = get_database(config.get("database_type", None))
        self.db = db_cls(**config.get("database", {}))
        self.start_urls.append(url)

    def parse(self, response):
        print 'parsePlayurl', response.url
        try:
            video_id = self._match_id(self.start_urls[0])
        except AssertionError, e:
            raise CloseSpider('link not supported')

        logger.warn('[parse]' + self.start_urls[0] + ' [uuid]' + self.uuid +
                    ' [video_id]' + video_id)
        if self.check_db():
            return

        video = None
        for pid in self.pids:
            getinfo_url = 'http://blackbird.zoomin.tv/ProgramXml/.json?feedtype=json&pid=%s&vtype=direct&aid=%s' % (
                pid, video_id)
            resp = requests.get(getinfo_url)
            info = resp.json()
            print info
            if len(info['programme']) > 0:
                video = info['programme'][0]
                break

        if video is None:
            raise CloseSpider('video not found')
        video_url = video['videourl']
        endpoint, backet, obj = service.utils.paseUploadUrl(self.upload_url)
        print endpoint, backet, obj
        result = service.utils.uploadVideoByUrl(video_url, endpoint, backet,
                                                obj)
        if not result:
            raise CloseSpider('upload oss failed')

        filesize = video['videosize']
        length = int(video['videoduration']) / 1000.0
        title = video['title']
        print 'filesize', filesize
        # callback
        data = {
            "video_id": self.uuid,
            "state": 1,
            "message": u'成功',
            "length": length,
            "play_id": self.uuid,
            "size": filesize,
            "cover": '',
            "title": title
        }
        self.callbacked = service.utils.callback_result(self.callback,
                                                        data=data)
        logger.info('[finished]' + str(self.callbacked) + '[uuid]' + self.uuid)

        video_data = {
            'title': title,
            'video_id': video_id,
            'author': self.name,
            'publish': time.strftime('%Y-%m-%d %H:%M:%S'),
            'page_url': self.start_urls[0],
            'video_length': length,
            'video_size': filesize,
            'video_url': video_url,
            'easub_uuid': self.uuid
        }
        self.db.save_video(video_data)
Example no. 5
0
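    # Checker callback: decides whether the referenced item should be kept or deleted,
    # either via a 404 checker type or by testing an XPath/JSONPath expression (and an
    # optional result string) against the page.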
    def parse(self, response):
        # x_path test
        checker = response.request.meta['checker']
        rpt = response.request.meta['rpt']

        if self.conf['OUTPUT_RESPONSE_BODY']:
            self.log(
                "Response body ({url})\n\n***** RP_START *****\n{resp_body}\n***** RP_END *****\n\n"
                .format(url=response.url,
                        resp_body=response.body.decode('utf-8')), logging.INFO)

        if checker.checker_type == '4':
            self.log(
                "{cs}No 404 result ({c} checker type).{ce}".format(
                    c=str(checker),
                    cs=self.bcolors["OK"],
                    ce=self.bcolors["ENDC"]), logging.INFO)
            if self.conf['DO_ACTION']:
                self.dds_logger.info("{cs}Item kept.{ce}".format(
                    cs=self.bcolors["OK"], ce=self.bcolors["ENDC"]))
            return
        if rpt.content_type == 'J':
            json_resp = json.loads(response.body_as_unicode())
            try:
                jsonpath_expr = parse(checker.checker_x_path)
            except JsonPathLexerError:
                msg = "Invalid checker JSONPath ({c})!".format(c=str(checker))
                self.dds_logger.error(msg)
                raise CloseSpider()
            test_select = [
                match.value for match in jsonpath_expr.find(json_resp)
            ]
            #self.log(unicode(test_select), logging.INFO)
        else:
            try:
                test_select = response.xpath(checker.checker_x_path).extract()
            except ValueError:
                self.log("Invalid checker XPath ({c})!".format(c=str(checker)),
                         logging.ERROR)
                return

        if len(test_select) > 0 and checker.checker_x_path_result == '':
            self.log(
                "{cs}Elements for XPath found on page (no result string defined) ({c}). Delete reason.{ce}"
                .format(c=str(checker),
                        cs=self.bcolors["ERROR"],
                        ce=self.bcolors["ENDC"]), logging.INFO)
            if self.conf['DO_ACTION']:
                self._del_ref_object()
            return
        elif len(test_select
                 ) > 0 and test_select[0] == checker.checker_x_path_result:
            self.log(
                "{cs}XPath result string '{s}' found on page ({c}). Delete reason.{ce}"
                .format(s=checker.checker_x_path_result,
                        c=str(checker),
                        cs=self.bcolors["ERROR"],
                        ce=self.bcolors["ENDC"]), logging.INFO)
            if self.conf['DO_ACTION']:
                self._del_ref_object()
            return
        else:
            self.log(
                "{cs}XPath result string not found ({c}).{ce}".format(
                    c=str(checker),
                    cs=self.bcolors["OK"],
                    ce=self.bcolors["ENDC"]), logging.INFO)
            if self.conf['DO_ACTION']:
                self.dds_logger.info("{cs}Item kept.{ce}".format(
                    cs=self.bcolors["OK"], ce=self.bcolors["ENDC"]))
            return
Example no. 6
0
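    # Shopee category scraper: logs progress, picks a random user agent, requests the
    # next page of the JSON listing API, checks the returned data for corruption and
    # either retries, gives up, or dumps the page to a JSON file for later cleaning.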
    def parse(self, response):
        '''
        Parse the response, then issue the request again for the next pages, and so on.
        '''

        print("")
        print("")
        print("")
        print("")
        print(" ======== " + self.name + " from " + str(self.page_0) + " to " +
              str(self.page_1) + "  ========")
        print("page ============================ ", str(self.page))
        print("page ============================ ", str(self.page))
        print("iterations =======================", str(self.iters))
        print("timestamp ======================= ", datetime.datetime.now())
        print("time since start ==================== ",
              datetime.datetime.now() - self.start_time)

        #uncomment below to check IP one by one
        #yield scrapy.Request('http://checkip.dyndns.org/', headers = {'Connection': 'close'}, callback=self.check_ip, dont_filter = True) #uncomment to check IP one by one

        #randomizes the user agents to make detection harder
        ua_files = open('ua_files.txt').read().splitlines()
        user_agents = random.choice(ua_files)

        url = self.url.replace('__pagenum__', str((self.page * 50)))

        print('attempts on this page ============================',
              str(self.attempts + 1))

        print("user agent ====================", user_agents)

        #headers for the request; might need checking once in a while to make sure they still match the actual request headers
        headers = {
            'accept': '*/*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.9',
            #'if-none-match-': '55b03-20443a68390f59aa1bc448bc3b42fa6e',
            'referer': self.referer.replace('__pagenum__', str(self.page)),
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': user_agents,
            'x-api-source': 'pc',
            'x-requested-with': 'XMLHttpRequest',
            'Connection': 'close'
        }

        yield scrapy.Request(url=url,
                             callback=self.parse,
                             headers=headers,
                             dont_filter=True)

        data = json.loads(response.text)
        #print(data)

        #every region has different error patterns, as shown below; they need to be checked manually
        #give up after the retry threshold is reached (the category may also have fewer than 160 pages)
        if self.attempts <= 10:
            if self.region in ['id', 'vn', 'th']:
                self.check_corrupt(data=data, zeroes=5, pagenum=50)
            elif self.region == 'ph':
                self.check_corrupt(data=data, zeroes=5, pagenum=45)
            elif self.region == 'my':
                self.check_corrupt(data=data, zeroes=3, pagenum=50)
            print("data corrupted ================ ", self.corrupt)
        else:
            print(
                "data corrupted ================ but gave up trying on this page"
            )
            self.corrupt = 0
            self.breaker = 1
            pass

        print("data corrupted ================ ", self.corrupt)

        self.iters += 1

        #if the max page is hit, call the cleaning function

        ##if you're using unbatched pagination and want to use single-process cleaning, uncomment the cleaning func
        ##if you're using unbatched pagination and want to use single-process cleaning with an integrated dataframe (i.e. not reading the entire printed JSON object), uncomment the df process
        ##the integrated df method should be more pythonic and efficient. It is still somewhat unstable though, so use with caution

        if self.page >= self.page_max + 1:
            #if you're using unbatched pagination and want to use single-process cleaning, uncomment this
            #cleaning(self.name, self.output, self.region, self.category, self.subcategory, self.subsubcategory)

            ##if you're using unbatched pagination and want to use single-process cleaning with an integrated dataframe (i.e. not reading the entire printed JSON object), uncomment this
            '''if 'rank' not in self.df:
                self.df['rank'] = np.arange(len(self.df))

            print(' =================== raw' + self.name + '.csv')
            self.df.to_csv('raw' + self.name + '.csv', index=False)     
            print(' =================== raw' + self.name + '.csv')'''

            raise CloseSpider("====MAX PAGE HAS BEEN REACHED!==== ")

        ##if you're using unbatched pagination and want to use single-process cleaning, you can comment out this entire elif part as it becomes redundant
        elif self.page >= self.page_1:
            #self.cleaning(self.name, self.output, self.region, self.category, self.subcategory, self.subsubcategory)
            '''if 'rank' not in self.df:
                self.df['rank'] = np.arange(len(self.df))

            print(' =================== raw' + self.name + '.csv')
            self.df.to_csv('raw' + self.name + '.csv', index=False)     
            print(' =================== raw' + self.name + '.csv')'''

            raise CloseSpider("====MAX PAGE HAS BEEN REACHED!==== ")

        #if the max retry threshold was hit, give up: run the cleaning function and close the spider with the corresponding error
        elif self.corrupt == 0 and self.breaker == 1:
            cleaning(self.name, self.output, self.region, self.category,
                     self.subcategory, self.subsubcategory)
            if data['items'] is None:
                raise CloseSpider("====NO DATA IS RETURNED!==== ")
            elif data['query_rewrite'] is None:
                raise CloseSpider("====DATA CORRUPTED!====")
            else:
                raise CloseSpider("==== UNKNOWN ERROR ==== ")
        #if an error occurs, print the corresponding error type, then loop to scrape the same page again
        elif self.corrupt == 1:
            self.attempts += 1
            print("Something went wrong!, retry attempts ===== ",
                  self.attempts)
            if data['items'] is None:
                print(
                    "Error =========== data[item] is None, no data is returned!"
                )
                time.sleep(5)
            elif self.corrupt == 1:
                print(
                    "Error =========== data is corrupted!, retrying in 5 secs")
                time.sleep(5)
        #if OK, write the acquired JSON data to a JSON file to be compiled later by the cleaning function
        else:
            with open(os.path.join(
                    'raw_shopee/raw_shopee_' + self.region + '/' + self.name,
                    'data_q_' + str(self.page) + '.json'),
                      'w',
                      encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)

            #data = data['items']
            #data.update({'page':self.page})
            #data.update({'rating_star': obj['items'][k]['item_rating']['rating_star']})
            #data.update({'timestamp':pd.datetime.now().replace(microsecond=0)})

            #df = pd.DataFrame

            ##if you're using unbatched pagination and want to use single-process cleaning with an integrated dataframe (i.e. not reading the entire printed JSON object), uncomment this section
            '''k=0
            for j in data['items']:
                j.update({'page_num':self.page})
                j.update({'rating_star': data['items'][k]['item_rating']['rating_star']})
                j.update({'timestamp':pd.datetime.now().replace(microsecond=0)})
                for m in range(0,6):
                    j.update({'star_' + str(5-m): data['items'][k]['item_rating']['rating_count'][m]})
                self.df = self.df.append(j, ignore_index = True) 
                k+=1

            #if 'rank' not in df:
             #   df['rank'] = np.arange(len(data))
                
            if 'category' not in self.df:
                self.df['category'] = self.category

            if 'subcategory' not in self.df:
                self.df['subcategory'] = self.subcategory

            if 'subsubcategory' not in self.df:
                self.df['subsubcategory'] = self.subsubcategory
            
            if 'platform' not in self.df:
                self.df['platform'] = 'shopee'
            if 'region' not in data:
                self.df['region'] =self.region

            if 'engine_ver' not in self.df:
                self.df['engine_ver'] = 'v0.4.2'


            #print(self.df)'''

            self.page += 1
            self.attempts = 0
Example no. 7
0
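 # Parses the cards on a weibo search result page: extracts post metadata, text,
 # pictures and video, and yields the retweeted (original) post as a separate item
 # when present.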
 def parse_weibo(self, response):
     """解析网页中的微博信息"""
     keyword = response.meta.get('keyword')
     for sel in response.xpath("//div[@class='card-wrap']"):
         info = sel.xpath(
             "div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']"
         )
         if info:
             weibo = WeiboItem()
             weibo['id'] = sel.xpath('@mid').extract_first()
             weibo['bid'] = sel.xpath(
                 '(.//p[@class="from"])[last()]/a[1]/@href').extract_first(
                 ).split('/')[-1].split('?')[0]
             weibo['user_id'] = info[0].xpath(
                 'div[2]/a/@href').extract_first().split('?')[0].split(
                     '/')[-1]
             weibo['screen_name'] = info[0].xpath(
                 'div[2]/a/@nick-name').extract_first()
             txt_sel = sel.xpath('.//p[@class="txt"]')[0]
             retweet_sel = sel.xpath('.//div[@class="card-comment"]')
             retweet_txt_sel = ''
             if retweet_sel and retweet_sel[0].xpath('.//p[@class="txt"]'):
                 retweet_txt_sel = retweet_sel[0].xpath(
                     './/p[@class="txt"]')[0]
             content_full = sel.xpath(
                 './/p[@node-type="feed_list_content_full"]')
             is_long_weibo = False
             is_long_retweet = False
             if content_full:
                 if not retweet_sel:
                     txt_sel = content_full[0]
                     is_long_weibo = True
                 elif len(content_full) == 2:
                     txt_sel = content_full[0]
                     retweet_txt_sel = content_full[1]
                     is_long_weibo = True
                     is_long_retweet = True
                 elif retweet_sel[0].xpath(
                         './/p[@node-type="feed_list_content_full"]'):
                     retweet_txt_sel = retweet_sel[0].xpath(
                         './/p[@node-type="feed_list_content_full"]')[0]
                     is_long_retweet = True
                 else:
                     txt_sel = content_full[0]
                     is_long_weibo = True
             weibo['text'] = txt_sel.xpath(
                 'string(.)').extract_first().replace('\u200b', '').replace(
                     '\ue627', '')
             weibo['article_url'] = self.get_article_url(txt_sel)
             weibo['location'] = self.get_location(txt_sel)
             if weibo['location']:
                 weibo['text'] = weibo['text'].replace(
                     '2' + weibo['location'], '')
             weibo['text'] = weibo['text'][2:].replace(' ', '')
             if is_long_weibo:
                 weibo['text'] = weibo['text'][:-6]
             weibo['at_users'] = self.get_at_users(txt_sel)
             weibo['topics'] = self.get_topics(txt_sel)
             reposts_count = sel.xpath(
                 './/a[@action-type="feed_list_forward"]/text()'
             ).extract_first()
             try:
                 reposts_count = re.findall(r'\d+.*', reposts_count)
             except TypeError:
                 print('Cookie is invalid or expired. Please follow '
                       'https://github.com/dataabc/weibo-search#如何获取cookie'
                       ' to obtain a new cookie.')
                 raise CloseSpider()
             weibo['reposts_count'] = reposts_count[
                 0] if reposts_count else '0'
             comments_count = sel.xpath(
                 './/a[@action-type="feed_list_comment"]/text()'
             ).extract_first()
             comments_count = re.findall(r'\d+.*', comments_count)
             weibo['comments_count'] = comments_count[
                 0] if comments_count else '0'
             attitudes_count = sel.xpath(
                 '(.//a[@action-type="feed_list_like"])[last()]/em/text()'
             ).extract_first()
             weibo['attitudes_count'] = (attitudes_count
                                         if attitudes_count else '0')
             created_at = sel.xpath(
                 '(.//p[@class="from"])[last()]/a[1]/text()').extract_first(
                 ).replace(' ', '').replace('\n', '').split('前')[0]
             weibo['created_at'] = util.standardize_date(created_at)
             source = sel.xpath('(.//p[@class="from"])[last()]/a[2]/text()'
                                ).extract_first()
             weibo['source'] = source if source else ''
             pics = ''
             is_exist_pic = sel.xpath(
                 './/div[@class="media media-piclist"]')
             if is_exist_pic:
                 pics = is_exist_pic[0].xpath('ul[1]/li/img/@src').extract()
                 pics = [pic[2:] for pic in pics]
                 pics = [
                     re.sub(r'/.*?/', '/large/', pic, 1) for pic in pics
                 ]
                 pics = ['http://' + pic for pic in pics]
             video_url = ''
             is_exist_video = sel.xpath(
                 './/div[@class="thumbnail"]/a/@action-data')
             if is_exist_video:
                 video_url = is_exist_video.extract_first()
                 video_url = unquote(
                     str(video_url)).split('video_src=//')[-1]
                 video_url = 'http://' + video_url
             if not retweet_sel:
                 weibo['pics'] = pics
                 weibo['video_url'] = video_url
             else:
                 weibo['pics'] = ''
                 weibo['video_url'] = ''
             weibo['retweet_id'] = ''
             if retweet_sel and retweet_sel[0].xpath(
                     './/div[@node-type="feed_list_forwardContent"]/a[1]'):
                 retweet = WeiboItem()
                 retweet['id'] = retweet_sel[0].xpath(
                     './/a[@action-type="feed_list_like"]/@action-data'
                 ).extract_first()[4:]
                 retweet['bid'] = retweet_sel[0].xpath(
                     './/p[@class="from"]/a/@href').extract_first().split(
                         '/')[-1].split('?')[0]
                 info = retweet_sel[0].xpath(
                     './/div[@node-type="feed_list_forwardContent"]/a[1]'
                 )[0]
                 retweet['user_id'] = info.xpath(
                     '@href').extract_first().split('/')[-1]
                 retweet['screen_name'] = info.xpath(
                     '@nick-name').extract_first()
                 retweet['text'] = retweet_txt_sel.xpath(
                     'string(.)').extract_first().replace('\u200b',
                                                          '').replace(
                                                              '\ue627', '')
                 retweet['article_url'] = self.get_article_url(
                     retweet_txt_sel)
                 retweet['location'] = self.get_location(retweet_txt_sel)
                 if retweet['location']:
                     retweet['text'] = retweet['text'].replace(
                         '2' + retweet['location'], '')
                 retweet['text'] = retweet['text'][2:].replace(' ', '')
                 if is_long_retweet:
                     retweet['text'] = retweet['text'][:-6]
                 retweet['at_users'] = self.get_at_users(retweet_txt_sel)
                 retweet['topics'] = self.get_topics(retweet_txt_sel)
                 reposts_count = retweet_sel[0].xpath(
                     './/ul[@class="act s-fr"]/li/a[1]/text()'
                 ).extract_first()
                 reposts_count = re.findall(r'\d+.*', reposts_count)
                 retweet['reposts_count'] = reposts_count[
                     0] if reposts_count else '0'
                 comments_count = retweet_sel[0].xpath(
                     './/ul[@class="act s-fr"]/li[2]/a[1]/text()'
                 ).extract_first()
                 comments_count = re.findall(r'\d+.*', comments_count)
                 retweet['comments_count'] = comments_count[
                     0] if comments_count else '0'
                 attitudes_count = retweet_sel[0].xpath(
                     './/a[@action-type="feed_list_like"]/em/text()'
                 ).extract_first()
                 retweet['attitudes_count'] = (attitudes_count
                                               if attitudes_count else '0')
                 created_at = retweet_sel[0].xpath(
                     './/p[@class="from"]/a[1]/text()').extract_first(
                     ).replace(' ', '').replace('\n', '').split('前')[0]
                 retweet['created_at'] = util.standardize_date(created_at)
                 source = retweet_sel[0].xpath(
                     './/p[@class="from"]/a[2]/text()').extract_first()
                 retweet['source'] = source if source else ''
                 retweet['pics'] = pics
                 retweet['video_url'] = video_url
                 retweet['retweet_id'] = ''
                 yield {'weibo': retweet, 'keyword': keyword}
                 weibo['retweet_id'] = retweet['id']
             # print(weibo)
             yield {'weibo': weibo, 'keyword': keyword}
Example no. 8
0
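    # Job detail parser: retries empty responses (optionally through a proxy), archives
    # the page HTML, extracts the job fields, logs the result to MySQL and closes the
    # spider once 2017 postings or too many duplicate records are reached.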
    def parse_detail(self, response):

        if self.killed:
            raise CloseSpider("Spider already died.")

        # html_path is referenced by the error-logging branch below before it is
        # assigned; initialize it here so log_error_page() does not raise a NameError.
        html_path = ''

        if not response.body:
            self.error_count += 1

            if self.error_count >= self.error_threshold:
                self.logger.error('[ JobPageRequestException ] {url}'.format(url=response.url.encode('utf-8')))
                self.sqllogger.log_error_page(
                    hash_code    = hash_dn(response.url.encode('utf-8'),datetime.now().strftime('%Y%m%d%H%M%S')),
                    web_id       = self.web_id,
                    url          = response.url.encode('utf-8'),
                    meta         = response.meta,
                    html_path    = html_path,
                    crawl_time   = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    job_status   = 'FAILED',
                    error_message= "Empty request's response"
                )
                yield None
                return
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info('[ JobPageRetry ] {url} with proxy {proxy}'.format(url=response.url.encode('utf-8'), proxy=proxy))
                yield scrapy.Request(response.url, callback=self.parse_detail , meta={'proxy': proxy})
                return
            else:
                self.logger.info('[ JobPageRetry ] {url}'.format(url=response.url.encode('utf-8')))
                yield scrapy.Request(response.url, callback=self.parse_detail)
                return
        self.error_count = 0

        try:
            html_path = self.html_path.format(dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url}'.format(url=response.url.encode('utf-8')))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url}'.format(url=response.url.encode('utf-8')))

        try:
            ret = {}

            ret['company'] = response.xpath('.//h1[@itemprop="hiringOrganization"]/a/span/text()').extract_first()
            ret['pos']     = response.xpath('.//div[@class="job-detail-top col-xs-12"]/h2/a/text()').extract_first()
            ret['etype']   = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[0])
            ret['loc']     = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[1])
            ret['sal']     = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[2])
            ret['hour']    = self.clean_tag(response.xpath('.//div[@class="job-detail border-b col-xs-12"]/div[@class="col-xs-12"]/span').extract()[4])
            ret['desc']    = '|'.join([i.strip() for i in response.xpath('.//div[@itemprop="responsibilities"]/text()').extract()])
            ret['qual']    = '|'.join([ i for i in [self.clean_tag(i).strip() for i in response.xpath('.//div[@itemprop="skills"]').extract_first().split('\n')] if i])
            ret['benef']   = '|'.join([ i for i in [self.clean_tag(i).strip() for i in response.xpath('.//div[@itemprop="incentives"]').extract_first().replace('<li>','\n').split('\n')] if i])
            ret['pdate']   = self.convert_pdate(response.xpath('.//div[@itemprop="datePosted"]/text()').extract_first())

            if ret['pdate'].split()[0].split('-')[0] == "2017":
                self.logger.info("[ JobEndReached ] 2017 reached")
                self.killed = 1
                raise CloseSpider("2017 reached")

            for key in ret.keys():
                if ret[key]:
                    ret[key] = ret[key].strip().encode('utf-8')

            _hash = hash_dn(ret['desc'],ret['company'])

            #log result to MySQL
            try:
                self.sqllogger.log_crawled_page(
                    hash_code    = _hash,
                    position     = ret['pos'],
                    employer     = ret['company'],
                    exp          = '',
                    salary       = ret['sal'],
                    location     = ret['loc'],
                    web_id       = self.web_id,
                    url          = response.url.encode('utf-8'),
                    meta         = response.meta,
                    html_path    = html_path,
                    crawl_time   = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time    = ret['pdate'],
                    job_status   = 'SUCCESS',
                    error_message= ''
                )
                self.logger.info('[ RDSLogged ] {url}'.format(url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
                if e.orig.args[0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info("[ JobEndReached ] crawled record reached exceeding threshold")
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info("[ JobRepeat ] crawled record found within threshold #%d" % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
            self.repeat_count = 0

            yield ret

        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error('[ JobDetailException ] {url} {html_path} {e}'.format(url=response.url.encode('utf-8'),html_path=html_path.encode('utf-8'),e=e))
            self.sqllogger.log_error_page(
                hash_code    = hash_dn(response.url.encode('utf-8'),datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id       = self.web_id,
                url          = response.url.encode('utf-8'),
                meta         = response.meta,
                html_path    = html_path,
                crawl_time   = datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status   = 'FAILED',
                error_message= e
            )
                         website_url=website_url,
                         website_key=json_key,
                         settings=settings)
            #logging.info('Finished site spider '+json_key+':'+url_key+':'+website_urls[url_key]+'-'+time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()))

        wait = runner.join()

        wait.addBoth(lambda _: reactor.stop())

        #block the process until the spiders have finished
        reactor.run()

        #end_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())
        #information = "开始爬虫时间:"+ begin_time + "\n爬虫结束时间: "+ end_time + " 凤凰类别数据爬虫完毕"
        #email_object.send_information(information,"完成凤凰类别数据爬虫通知",True)
        #print "通知成功"
        #end_time = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime()) #结束时间
        #info_spider = ' begin at :'+begin_time+' end at :'+end_time
        #logging.info(info_spider)
        os._exit(0)
    except BaseException, error:
        end_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        information = "time: " + end_time + "错误:" + str(error) + '\n'
        email_object.send_information(information)
        logging.exception(error)
        raise CloseSpider('爬虫识别')  # i.e. "identified as a crawler"
        os._exit(1)

    finally:
        read_json_file.changejson(settings['SPLIT_JSON_FILE'])
Example no. 10
0
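    # Variant of the job detail parser above for a Thai job board: maps Thai section
    # headings to fields, logs results via the SQL logger and stops once postings from
    # B.E. 2560 (2017) are reached.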
    def parse_detail(self, response):

        if self.killed:
            raise CloseSpider("Spider already died.")

        # html_path is referenced by the error-logging branch below before it is
        # assigned; initialize it here so log_error_page() does not raise a NameError.
        html_path = ''

        if not response.body:
            self.error_count += 1

            if self.error_count >= self.error_threshold:
                self.logger.error('[ JobPageRequestException ] {url}'.format(
                    url=response.url.encode('utf-8')))
                self.sqllogger.log_error_page(
                    hash_code=hash_dn(response.url.encode('utf-8'),
                                      datetime.now().strftime('%Y%m%d%H%M%S')),
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path=html_path,
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    job_status='FAILED',
                    error_message="Empty request's response")
                yield None
                return
            if self.use_proxy:
                proxy = choice(self.proxies)
                self.logger.info(
                    '[ JobPageRetry ] {url} with proxy {proxy}'.format(
                        url=response.url.encode('utf-8'), proxy=proxy))
                yield scrapy.Request(response.url,
                                     callback=self.parse_detail,
                                     meta={'proxy': proxy})
                return
            else:
                self.logger.info('[ JobPageRetry ] {url}'.format(
                    url=response.url.encode('utf-8')))
                yield scrapy.Request(response.url, callback=self.parse_detail)
                return
        self.error_count = 0

        try:
            html_path = self.html_path.format(
                dttm=datetime.now().strftime('%Y%m%d_%H%M%S'))
            with open(html_path, 'w') as f:
                f.write(response.text.encode('utf-8'))
                self.logger.info('[ HTMLArchived ] {url}'.format(
                    url=response.url.encode('utf-8')))
        except Exception as e:
            self.logger.error('[ HTMLArchiveException ] {url}'.format(
                url=response.url.encode('utf-8')))

        try:
            ret = {}

            head = {}

            row = response.xpath(
                '//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]/p|//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]/ul'
            )[1:]
            topic = response.xpath(
                '//div[@class="w3-container w3-left-align w3-medium w3-theme-l5"]//b/u/text()'
            ).extract()
            head['amnt'] = u'\u0e2d\u0e31\u0e15\u0e23\u0e32'  # Thai: "rate" (number of openings)
            head['sal'] = u'\u0e40\u0e07\u0e34\u0e19\u0e40\u0e14\u0e37\u0e2d\u0e19'  # Thai: "salary"
            head['benef'] = u'\u0e2a\u0e27\u0e31\u0e2a\u0e14\u0e34\u0e01\u0e32\u0e23'  # Thai: "benefits"
            head['req'] = u'\u0e04\u0e38\u0e13\u0e2a\u0e21\u0e1a\u0e31\u0e15\u0e34\u0e1c\u0e39\u0e49\u0e2a\u0e21\u0e31\u0e04\u0e23'  # Thai: "applicant qualifications"
            head['loc_det'] = u'\u0e2a\u0e16\u0e32\u0e19\u0e17\u0e35\u0e48\u0e1b\u0e0f\u0e34\u0e1a\u0e31\u0e15\u0e34\u0e07\u0e32\u0e19'  # Thai: "work location"
            head['loc'] = u'\u0e08\u0e31\u0e07\u0e2b\u0e27\u0e31\u0e14'  # Thai: "province"

            ret['pos'], ret['desc'] = [
                self.clean_tag(x) for x in response.xpath(
                    '//div[@class="w3-theme-l4"]/div').extract()
            ]
            ret['pdate'] = self.cdate[response.url]
            ret['company'] = self.comnm[response.url]
            del self.cdate[response.url]
            del self.comnm[response.url]
            ret['loc'] = ''
            ret['sal'] = ''

            for key in head.keys():
                try:
                    idx = topic.index(head[key])
                except ValueError:
                    continue
                ret[key] = '|'.join([
                    i for i in [
                        remove_tags(i)
                        for i in row[idx].xpath('./text()|./li').extract()
                    ] if i
                ])

            if ret['pdate'].split()[-1] == "2560":
                self.killed += 1
                raise CloseSpider("2017 reached")

            for key in ret.keys():
                if ret[key]:
                    ret[key] = ' '.join(
                        ret[key].strip().split()).encode('utf-8')

            _hash = hash_dn(ret['desc'], ret['company'])

            try:
                self.sqllogger.log_crawled_page(
                    hash_code=_hash,
                    position=ret['pos'],
                    employer=ret['company'],
                    exp='',
                    salary=ret['sal'],
                    location=ret['loc'],
                    web_id=self.web_id,
                    url=response.url.encode('utf-8'),
                    meta=response.meta,
                    html_path=html_path,
                    crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    post_time=ret['pdate'],
                    job_status='SUCCESS',
                    error_message='')
                self.logger.info('[ RDSLogged ] {url}'.format(
                    url=response.url.encode('utf-8')))
            except exc.IntegrityError as e:
                if e.orig.args[
                        0] == 1062 and self.repeat_count >= self.repeat_threshold:
                    self.logger.info(
                        "[ JobEndReached ] crawled record reached exceeding threshold"
                    )
                    self.killed = 1
                    raise CloseSpider("Crawled record reached")
                elif e.orig.args[
                        0] == 1062 and self.repeat_count < self.repeat_threshold:
                    self.repeat_count += 1
                    self.logger.info(
                        "[ JobRepeat ] crawled record found within threshold #%d"
                        % self.repeat_count)
                    yield None
                    return
                else:
                    raise e
            self.repeat_count = 0

            for key in ret.keys():
                if not ret[key]:
                    del ret[key]

            yield ret

        except CloseSpider as e:
            raise CloseSpider(e.message)

        except Exception as e:
            self.logger.error(
                '[ JobDetailException ] {url} {html_path} {e}'.format(
                    url=response.url.encode('utf-8'),
                    html_path=html_path.encode('utf-8'),
                    e=e))
            self.sqllogger.log_error_page(
                hash_code=hash_dn(response.url.encode('utf-8'),
                                  datetime.now().strftime('%Y%m%d%H%M%S')),
                web_id=self.web_id,
                url=response.url.encode('utf-8'),
                meta=response.meta,
                html_path=html_path,
                crawl_time=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                job_status='FAILED',
                error_message=e)
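
The parse_detail above stops the spider with CloseSpider once it keeps hitting rows the database already holds: every MySQL duplicate-key error (errno 1062) bumps repeat_count, and crossing repeat_threshold is taken to mean the crawl has caught up with previously stored postings. Below is a minimal sketch of just that pattern, assuming a SQLAlchemy-backed writer; the mixin name, the log_or_stop helper and the threshold value are illustrative assumptions, not the original API.

from scrapy.exceptions import CloseSpider
from sqlalchemy import exc


class DuplicateStopMixin(object):
    """Close the spider once it keeps running into already-stored rows."""

    repeat_threshold = 10  # assumed value; the original reads it from config

    def __init__(self, *args, **kwargs):
        super(DuplicateStopMixin, self).__init__(*args, **kwargs)
        self.repeat_count = 0

    def log_or_stop(self, write_row):
        """write_row() performs one INSERT and may raise IntegrityError."""
        try:
            write_row()
        except exc.IntegrityError as e:
            if e.orig.args[0] != 1062:      # not a duplicate-key error
                raise
            self.repeat_count += 1          # another already-crawled record
            if self.repeat_count >= self.repeat_threshold:
                raise CloseSpider("crawled record reached")
            return False                    # duplicate, but keep crawling
        self.repeat_count = 0               # a fresh record resets the streak
        return True
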
Esempio n. 11
0
 def process_response(self, request, response, spider):
     if response.status == 402:
         raise CloseSpider('402 proxy no use')
     else:
         return response
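
Esempio n. 11 is the process_response hook of a downloader middleware that aborts the crawl when the proxy provider answers with HTTP 402. For the hook to run, the middleware has to be enabled in the project settings; a minimal sketch follows, where the module path and the priority value 543 are assumptions rather than anything taken from the original project.

# middlewares.py -- wrapping the method from Esempio n. 11 in a middleware class
from scrapy.exceptions import CloseSpider


class ProxyStatusMiddleware(object):
    def process_response(self, request, response, spider):
        if response.status == 402:
            raise CloseSpider('402 proxy no use')
        return response


# settings.py -- the middleware only runs if it is registered
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyStatusMiddleware': 543,  # path/priority assumed
}
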
Esempio n. 12
0
    def parse(self, response):
        global ult
        datas = response.selector.xpath(
            '//tr/td[@class="date"]/text()').extract()
        links = response.selector.xpath('//tr/td[5]/a/@href').extract()
        descs = response.selector.xpath('//tr/td[5]/a[@href]/text()').extract()

        for data, desc, link in zip(datas, descs, links):
            if desc.encode('utf-8') != ult and ult == '':
                with open('baseDos.txt', 'a+') as arq:
                    arq.write(data.strip() + '\n')
                    arq.write(desc.encode('utf-8') + '\n')
                    arq.write(link + '\n\n')
                    arq.close()
                i = 0
            elif desc.encode('utf-8') != ult:
                with open('aux.txt', 'a+') as arq:
                    arq.write(data.strip() + '\n')
                    arq.write(desc.encode('utf-8') + '\n')
                    arq.write(link + '\n\n')
                    arq.close()
                i = 1
            else:
                i = 0
                break

        if not i:
            if os.path.exists('aux.txt'):
                os.remove('baseDos.txt')
                os.rename('aux.txt', 'baseDos.txt')
                # HERE YOU PUT THE TOKEN OF YOUR PAGE ON FACEBOOK
                access_token = "HERE YOU PUT THE TOKEN OF YOUR PAGE ON FACEBOOK"
                api = facebook.GraphAPI(access_token)
                total = 0
                arq = open('baseDos.txt', 'r')
                linhas = arq.readlines()
                for i in linhas:
                    if i == '\n':
                        total = total + 1
                a = 0
                b = 3
                lista = []
                for i in range(total):
                    xxx = linhas[a:b]
                    lista.append(xxx)
                    xxx = ''
                    a = b + 1
                    b = a + 3
                lista.reverse()
                for i in lista:
                    x = ''.join(i)
                    api.put_wall_post(x)

            raise CloseSpider('[+] BASE ATUALIZADA [+]')
        else:
            try:
                proxima_pagina = response.xpath(
                    '//a[@href and contains(.,"next")]/@href').extract()[0]
                if proxima_pagina:
                    yield scrapy.Request(url=proxima_pagina,
                                         callback=self.parse)
            except:
                pass
Esempio n. 13
0
    def twse_mining_Data_Parse(self, response):
        if (not (self.is_TPEX_open and self.is_TWSE_open)):
            print(self.se_status)
            pass
        else:
            local_Co_ids = []
            if (self.TPEX_First_Run):
                local_Co_ids = self.Co_ids
            else:
                local_Co_ids = self.possible_Co_ids_TWSE
            for data in response.xpath('body'):
                domain = urlParse.urlparse(response.url).hostname
                print('First RUN:', self.TWSE_First_Run)
                print('爬取開始')
                print(f'網域:{domain}')

                for co_id in local_Co_ids:
                    self.not_manual_cancel = sg.one_line_progress_meter(
                        '目前爬取進度',
                        self.current,
                        self.isExist - 1,
                        'Stock',
                        '運行時請勿點擊視窗,顯示沒有回應請勿關閉,為正常現象。\nElapsed Time 為已運行時間\nTime Remaining 為剩餘時間\nEstimated Total Time 為估計完成時間',
                        no_titlebar=False,
                        orientation='h')
                    if (not self.not_manual_cancel
                            and self.current < self.isExist - 1):
                        Button = sg.popup_yes_no('是否取消?', '取消爬取')
                        if (Button == 'Yes'):
                            sg.popup('已手動取消!')
                            raise CloseSpider("使用者取消!")
                    items = StockPrice_items()
                    print('First RUN:', self.TWSE_First_Run)
                    print(co_id)
                    twse_get = ''
                    twse_get = str(
                        response.xpath(
                            f'//td[text()="{co_id}"]//text()').get())
                    twse_co_name = str(
                        data.xpath(
                            f'//td[text()="{co_id}"]/following-sibling::td[1]//text()'
                        ).get())
                    print(twse_get)
                    if (twse_get == 'None' and (not twse_co_name.isnumeric())):
                        if (self.TPEX_First_Run):
                            print(f'股號 {co_id} 不存在於交易所,可能為TPEX的股號,丟入至暫存中...')
                            self.possible_Co_ids_TPEX.append(co_id)
                            continue
                        else:
                            print(f'股號 {co_id} 不存在兩邊交易所,丟入到未存在股號中...')
                            self.noExist.append(co_id)
                            continue
                    else:
                        print('TWSE GET ITEMS')
                        self.current += 1
                        twse_price = str(
                            data.xpath(
                                f'//td[text()="{co_id}"]/following-sibling::td[6]//text()'
                            ).get())
                        twse_price = twse_price.replace(',', '')
                        print(twse_price)
                        if (self.is_number(twse_price)):
                            twse_price = float(twse_price)
                        else:
                            twse_price = None
                        items['CO_ID'] = str(co_id)
                        items['CO_SHORT_NAME'] = str(twse_co_name)
                        items['Price'] = twse_price
                        items['SUB_DATA_TYPE'] = 'TWSE'
                        items['SYear'] = str(self.Year)
                        items['SDate'] = str(self.Date)
                        items['DATA_TYPE'] = self.Type
                        yield (items)
            self.TWSE_First_Run = False
            yield scrapy.Request(self.se_urls[0],
                                 callback=self.tpex_mining_Data_Parse,
                                 dont_filter=True)
Esempio n. 14
0
 def process_item(self, response, spider):
     self.count += 1
     if (self.count == 5):
         print("======in test pipeline========")
         raise CloseSpider("in exception")
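
Esempio n. 14 counts items inside a pipeline and raises CloseSpider after the fifth one. Scrapy's bundled closespider extension (scrapy.extensions.closespider) can enforce the same kind of limit declaratively, which keeps counting logic out of the pipeline; a sketch of the relevant settings, where the value 5 simply mirrors the example:

# settings.py -- built-in alternative to counting items in a pipeline
CLOSESPIDER_ITEMCOUNT = 5      # close the spider after 5 scraped items
# other limits offered by the same extension:
# CLOSESPIDER_PAGECOUNT, CLOSESPIDER_TIMEOUT, CLOSESPIDER_ERRORCOUNT
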
Esempio n. 15
0
    def parse(self, response):

        # def get_proxy():
        #     return requests.get("http://127.0.0.1:5010/get/").content

        #
        # def delete_proxy(proxy):
        #     requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

        # Alternatively, a random IP could be set here
        # No need to set it here though; setting it in the retry middleware is enough
        # Rotate through the IPs: with ~500 usable IPs and 500 pages per minute, each host effectively requests one page per minute from the server's point of view

        # while response.status == 403 or response.status == 302:
        #
        #     print(response.status)
        #
        #     print(response.meta)
        #
        #     # delete_proxy(response.headers)
        #
        #     # delete the (bad) proxy
        #
        #     # fetch a new proxy
        #
        #     proxy = get_proxy()
        #
        #     print("使用新代理:" + str(proxy))
        #
        #     # If the proxy pool runs dry, pause the spider for a while or switch targets (mobile/WAP versions, or the big sites' caches)
        #
        #     response = scrapy.Request(url=response.url, meta={'proxy':'http://' + str(proxy)})
        #
        #     print(type(response))

        # print("有respose")

        item = LearningItem()

        # Scrape the book title

        # Joint authors are put together in one span, like translators; a single author sits in the sibling <a> right after the span whose text is 作者, so both layouts need separate handling
        # Could an author have no link at all? No — there is always at least a search link
        # A single author may also be wrapped in a nested pair of spans
        # Translator links also point to author pages; since only books are crawled this does not matter — the main database has a translator field if translation data is ever needed
        def is_exist(item_argv, xpath1, **xpath2):
            # item[item_argv] = info.xpath(xpath1).extract().strip()
            try:
                item[item_argv] = info.xpath(xpath1).extract()
            except:
                print(str(item_argv) + "出错")
                item[item_argv] = ''

            if len(item[item_argv]) == 1:

                item[item_argv] = item[item_argv][0].strip()

            # if len(item[item_argv]) == 0 and item[item_argv] != '':
            #
            #     item[item_argv] = ''

            # return item[item_argv][0].strip() if len(item[item_argv]) == 1 else item[item_argv]

            return item[item_argv]

        # try:
        # First, the ways Douban tends to fail:
        # - returns 403
        # - returns 200 but requires login
        # - returns an "application error" page
        # print("尝试爬取")

        # except:
        # print()
        # print("被ban!!!!!!!!!!!!!")
        # This only stops one of the coroutines; the others wind down gradually. Forcing Ctrl+Z causes the remaining links to be added to the dupe filter, so they will never be crawled again
        if response.status != 200:

            # Unsure whether pages missing the trailing '/' get redirected elsewhere with a 301, which would mean changing the next_page code
            # Checked in the shell: they do not — the redirect comes straight back as a 200 response, the server appends the trailing /
            raise CloseSpider('强制停止!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
            # time.sleep(600)
            # raise CloseSpider()
            # return
            ## ADSL re-dial or IP-switching logic would go here
            # print()
            # return

        print("此时的URL为:" + str(response.url))
        # writer_link_list = []
        # series_link_list = []
        try:
            info = response.xpath(u'//*[@id="info"]')[0]
        except:
            raise CloseSpider("出现200以外的错误,此时的url为 %s" % response.url)

            # The author list and translator list are handled together here

            # Check whether there are authors
            # Check whether there are translators
            # The text of author links above the translator marker is added to the author list
            # If there is no translator, all author-link text defaults to being authors
            # Easy to get wrong — e.g. a "volunteer" credit might appear; just an example

            # Author nodes: all preceding-sibling <a> nodes of the span that follows the author span; the author span comes first, so no other node interferes with it

            # First work out which of the two layouts this page uses

            # Just write out all four layouts, one per line, in the style a = b or c = d

        # If some other field were the anchor (say, <a> tags above the publisher are authors and those below are translators), the logic breaks whenever the publisher field is missing; anchoring on the field itself makes the spider more robust
        # with colon, not nested
        w_name1 = info.xpath(
            u'//span[./text()="作者:"]/following-sibling::span[1]/preceding-sibling::a'
        )
        # with colon, nested
        w_name2 = info.xpath(u'//span[./text()="作者:"]/parent::span/a')
        # no colon, not nested
        w_name3 = info.xpath(
            u'//span[./text()=" 作者"]/following-sibling::span[1]/preceding-sibling::a'
        )
        # no colon, nested
        w_name4 = info.xpath(u'//span[./text()=" 作者"]/parent::span/a')

        if w_name1:
            item['writers'] = w_name1.xpath("./text()").extract()
            item['writers_link'] = w_name1.xpath("./@href").extract()

        elif w_name2:
            item['writers'] = w_name2.xpath("./text()").extract()
            item['writers_link'] = w_name2.xpath("./@href").extract()

        elif w_name3:
            item['writers'] = w_name3.xpath("./text()").extract()
            item['writers_link'] = w_name3.xpath("./@href").extract()

        elif w_name4:
            item['writers'] = w_name4.xpath("./text()").extract()
            item['writers_link'] = w_name4.xpath("./@href").extract()

        else:
            item['writers'] = ''
            item['writers_link'] = ''

#————————————————————————————————————————————————————————————————————————————————————————————————————————————————#

# Translators
# contains(@name,'na')

# with colon, not nested
        t_name1 = info.xpath(
            u'//span[./text()="译者:"]/following-sibling::a[contains(@href,"search")]'
        )
        # with colon, nested
        t_name2 = info.xpath(
            u'//span[./text()="译者:"]/following-sibling::a[contains(@href,"author")]'
        )
        # no colon, not nested
        # Select the href attributes that contain a given substring
        # The links can now be crawled directly, but the Chinese text fields still need later processing and extraction

        # This was buggy
        # Still problematic: could not substitute and join the pieces correctly
        # t_name3 = info.xpath(u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"search") or contains(@href,"author")]')
        t_name3 = info.xpath(
            u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"search")]'
        )
        # no colon, nested
        t_name4 = info.xpath(
            u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"author")]'
        )

        if t_name4:
            item['translators'] = t_name4.xpath("./text()").extract()
            item['translators_link'] = t_name4.xpath("./@href").extract()

        elif t_name3:
            item['translators'] = t_name3.xpath("./text()").extract()
            item['translators_link'] = t_name3.xpath("./@href").extract()

        elif t_name2:
            item['translators'] = t_name2.xpath("./text()").extract()
            item['translators_link'] = t_name2.xpath("./@href").extract()

        elif t_name1:
            item['translators'] = t_name1.xpath("./text()").extract()
            item['translators_link'] = t_name1.xpath("./@href").extract()
        else:
            item['translators'] = ''
            item['translators_link'] = ''

#————————————————————————————————————————————————————————————————————————————————————————————————————————————————#

        item["publish"] = is_exist(
            "publish", u'//span[./text()="出版社:"]/following::text()[1]')

        item["publish_date"] = is_exist(
            "publish_date", u'//span[./text()="出版年:"]/following::text()[1]')
        item["pages"] = is_exist(
            "pages", u'//span[./text()="页数:"]/following::text()[1]')
        item["price"] = is_exist(
            "price", u'//span[./text()="定价:"]/following::text()[1]')
        item["binding"] = is_exist(
            "binding", u'//span[./text()="装帧:"]/following::text()[1]')
        item["ISBN"] = is_exist(
            "ISBN", u'//span[./text()="ISBN:"]/following::text()[1]')
        item["orgin_name"] = is_exist(
            "orgin_name", u'//span[./text()="原作名:"]/following::text()[1]')
        item["series"] = is_exist(
            "series", u'//span[./text()="丛书:"]/following::a[1]/text()')
        item["series_link"] = is_exist(
            "series_link",
            u'//span[./text()="丛书:"]/following-sibling::a[1]/@href')

        # item["summary"] = is_exist("summary",)
        # item["w_summary"] = is_exist("w_summary",)

        item["catalog"] = is_exist("catalog",
                                   '//*[contains(@id,"dir_")]/text()')
        item["tag"] = is_exist("tag",
                               '//*[@id="db-tags-section"]/div/span/a/text()')
        item["series_info"] = is_exist(
            "series_info",
            '//*[@id="content"]/div/div[1]/div[3]/div[@class="subject_show block5"]/div//text()'
        )

        # item["readers"] = is_exist("readers",).extract().strip()

        # item["title"] = is_exist("title",).extract().strip()
        # item["url"] = is_exist("url",).extract().strip()
        # item["score"] = is_exist("score",).extract().strip()

        try:
            item['title'] = response.xpath(
                "//*[@id='wrapper']/h1/span/text()").extract_first()
        except:
            item['title'] = ''

        item['url'] = response.url.replace("https://book.douban.com/subject/",
                                           "").strip('/')

        try:
            item['score'] = response.css(
                '#interest_sectl > div > div.rating_self.clearfix > strong::text'
            ).extract_first().strip()
            if item['score'] == '':
                item['score'] = '0'
        except:
            item['score'] = '0'

        # try:
        #     item['publish'] = info.xpath().extract_first().strip()
        # except:
        #     item['publish'] = ''
        # try:
        #     item['publish_date'] = info.xpath(u'//span[./text()="出版年:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['publish_date'] = ''

        # try:
        #     item['pages'] = info.xpath(u'//span[./text()="页数:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['pages'] = ''

        # try:
        #     item['price'] = info.xpath(u'//span[./text()="定价:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['price'] = ''
        # try:
        #     item['binding'] = info.xpath(u'//span[./text()="装帧:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['binding'] = ''
        # try:
        #     item['ISBN'] = info.xpath(u'//span[./text()="ISBN:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['ISBN'] = ''
        # try:
        #     item['orgin_name'] = info.xpath(u'//span[./text()="原作名:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['orgin_name'] = ''
        # try:
        #     item['series'] = info.xpath(u'//span[./text()="丛书:"]/following::a[1]/text()').extract_first().strip()
        # except:
        #     item['series'] = ''
        # try:
        #     item['series_link'] = info.xpath(u'//span[./text()="丛书:"]/following-sibling::a[1]/@href').extract_first().strip()
        # except:
        #     item['series_link'] = ''

        # Two cases here: the summary may or may not be folded; try the folded variant first and fall back to the other

        try:

            summary = response.xpath(
                '//*[@id="link-report"]/span/div/div[@class="intro"]/p/text()')

            if summary:
                item['summary'] = summary.extract()
            else:
                item['summary'] = response.xpath(
                    '//*[@id="link-report"]/div[1]/div/p/text()').extract()

            # if len(item['summary']) == 0 and item['summary'] != '':
            #
            #     item['summary'] = ''

        except:

            item['summary'] = ''

        try:
            w_summary = response.css(
                '#content > div > div.article > div.related_info > div:nth-child(4) > span.all.hidden > div > p::text'
            )

            if w_summary:
                item['w_summary'] = w_summary.extract()
            else:
                item['w_summary'] = response.css(
                    '#content > div > div.article > div.related_info > div:nth-child(4) > span.short > div > p::text'
                ).extract()

            # if len(item['w_summary']) == 0 and item['w_summary'] != '':
            #
            #     item['w_summary'] = ''
        except:
            item['w_summary'] = ''

        # try:
        #     # this was buggy
        #     # item['catalog'] = response.xpath('//*[contains(@id,"full") and contains(@id,"dir")]/text()').extract()
        #     item['catalog'] = response.xpath('//*[contains(@id,"dir_")]/text()').extract()
        # except:
        #     item['catalog'] = ''

        # try:

        #     item['tag'] = response.xpath('//*[@id="db-tags-section"]/div/span/a/text()').extract()
        # except:
        #     item['tag'] = ''

        # try:
        #     # the series-info block only appears on some pages (sampled randomly)
        #     item['series_info'] = response.xpath('//*[@id="content"]/div/div[1]/div[3]/div[@class="subject_show block5"]/div//text()').extract()
        # except:
        #     item['series_info'] = ''

        try:
            item['readers'] = response.css(
                '#interest_sectl > div > div.rating_self.clearfix > div > div.rating_sum > span > a > span::text'
            ).extract_first()

            if item['readers'] is None:
                item['readers'] = '0'
        except:
            item['readers'] = '0'

        # '//*[@id="link-report"]/div[1]/div/p'/div/div[@class="intro"]/p/text()

        # if w_name_mode1:
        #     # w_name = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()').extract_first().replace("\n","").replace(" ","")
        #     w_name = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()')

        #     # If the author name can be captured, write it; otherwise it is the nested-span layout
        #     if w_name:
        #         item['writer'] = w_name.extract()

        #     else:
        #         item['writer'] = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()')

        #     /
        #     writer_name_type2 = links.xpath('//span[./text()=" 作者"]/following-sibling::span[1]/preceding-sibling::a/text()').extract_first().replace("\n","").replace(" ","")
        #     writer_name_type3 =
        #     # The single-author node case is done; groups of author nodes still need handling (see a university-textbook page for a concrete case)
        #     # A group of author nodes works like a group of translator nodes
        #     # Translator node: the next span node after the translator span

        #     # Groups of translators are already solved; for a single translator see the 傅雷 (Fu Lei) page

        #     # link_extract = item.extract()
        #     if "author" in link:
        #         # print(item.xpath('./@href').extract())
        #         # this could be shortened
        #         writer_link_list.append(link)
        #     # Store the full URL so later crawls can skip the URL-joining step; slightly faster crawling at negligible disk cost
        #     if "search" in link:
        #         link = "https://book.douban.com/" + link
        #         writer_link_list.append(link)

        #     if "series" in link:
        #         series_link_list.append(link)

        # item['writer_link'] = writer_link_list
        # item['series_link'] = series_link_list
        #         # item['writer'] = response.xpath(u'//span[./text()="作者:"]/following::a[2]')
        # # # // *[ @ id = "info"] / a[1]
        # # item['publish'] = response.xpath(u'//span[./text()="出版社:"]/following::text()[1]')
        # # item['orgin_name'] = response.xpath(u'//span[./text()="原作名:"]/following::text()[1]')

        # # This covers only one of the cases; another still needs its own try...except, plus the issues of Chinese books with no translator and of full-width vs. half-width punctuation

        # c = ""  # single translator

        # try:
        #     if a:
        #         item['translator'] = a[0].xpath('./a/text()').extract()
        #     if b:
        #         item['translator'] = b[0].xpath('./a/text()').extract()
        # except:
        #     item['translator'] = ''

        # number of valid ratings
        # if item['readers']:

        #     v = int(item['readers'])

        # else:
        #     v = 0

        # # minimum number of raters for a book to qualify for the top 250
        # m = 10000

        # # the book's score
        # if item['score']:
        #     R = float(item['score'])
        # else:
        #     R = 0

        # # C is the average score over all books; they are all stored in the database, a rough value is good enough
        # C = 7

        item["weighting"] = 0
        item['seen'] = 0

        yield item

        # item['p_date']
        # item['total_pages']
        # item['price']
        # item['binding']
        # item['series']
        # item['ISBN']
        # item['summary']
        # item['w_introduce']
        # item['ca']
        # item['tag']
        # item['s_info']
        # item['score']
        # item['readers']
        # print(item['title'])
        # all = response.xpath("string(//*[@id='info'])")
        # all =
        # print(all.extract())
        # print(all.extract()[0].replace("\n",""))
        # print(all.extract()[0].replace("\n","").replace(" ",""))
        # print(type(all.extract()))
        # yield item
        # the id is usually stable, so CSS changes can be ignored
        # skip cleaning for now in exchange for faster crawling
        # all = response.xpath('//*[@id="info"]')
        # all = all.extract()[0].replace("\n","").replace("\t","").split("<br>")
        # for item in all:
        # print(item.replace('<spanclass="pl">',"").replace("</span>","").replace("""<divid="info"class="">""","").replace("</div>","").replace("</a>","").replace("""<aclass=""href=""","").replace("<span>","").replace("<ahref=",""))
        # all = response.xpath(u'//span[./text()=" 作者"]/following::text()')
        # print(all)

        # write to MySQL in batches rather than once per item

        #
        # extract the links from the "readers of this book also enjoy" section
        link = LinkExtractor(
            restrict_xpaths=('//*[@id="db-rec-section"]/div//dl//dd'))
        links = link.extract_links(response)

        # For directly related links response.follow could also be used; it builds the request from the URL, which can then be yielded:
        # links = response.xpath('//*[@id="db-rec-section"]/div//dl//dd').extract()

        # for link in links:
        #     yield response.follow(link,callback=self.parse)

        for link in links:
            # print("弹出一个url")

            # if link.url.endswith('/'):
            # pass
            # else:
            # link.url = link.url + "/"
            # Without a trailing "/" the URL gets redirected; avoiding that is not strictly necessary, but the redirect pattern might be used to fingerprint crawlers
            yield scrapy.Request(url=link.url, callback=self.parse)
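
The commented-out lines above already point at response.follow. In Scrapy 1.4 and later, response.follow accepts Link objects and relative URLs directly, so the closing loop of Esempio n. 15 could be written as below; this is only a rephrasing of the same logic and assumes it stays inside the same parse method, with LinkExtractor imported as in the original.

        # equivalent follow-up requests using response.follow (Scrapy >= 1.4)
        for link in LinkExtractor(
                restrict_xpaths='//*[@id="db-rec-section"]/div//dl//dd'
                ).extract_links(response):
            yield response.follow(link, callback=self.parse)
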
Esempio n. 16
0
    def parse_item(self, response):

        if (self.count < int(self.limit)):
            item = MyItem()
            item['url'] = response.url
            p = r"^\S*article\/view\/\S*$"
            a = r"^(\s*Abstrak\s*$)|(^\s*Abstract\s*$)"

            if (re.match(p, item['url'])):

                journal = JournalItem()
                article = ArticleItem()
                references = ReferencesItem()
                author = AuthorItem()

                item['title'] = response.css('title::text').getall()

                dc = "//meta[@name='DC.{}']/@content"
                citation = "//meta[@name='citation_{}']/@content"

                author_name = response.xpath(
                    dc.format('Creator.PersonalName')).extract()
                abstract = response.xpath(
                    dc.format('Description')).extract_first()
                doi = response.xpath(
                    dc.format('Identifier.DOI')).extract_first()
                issn = response.xpath(dc.format('Source.ISSN')).extract_first()
                issue = response.xpath(
                    dc.format('Source.Issue')).extract_first()
                volume = response.xpath(
                    dc.format('Source.Volume')).extract_first()
                title = response.xpath(dc.format('Title')).extract_first()
                uri = response.xpath(
                    dc.format('Identifier.URI')).extract_first()
                journal_title = response.xpath(
                    citation.format('journal_title')).extract_first()
                author_institution = response.xpath(
                    citation.format('author_institution')).extract()
                date = response.xpath(citation.format('date')).extract_first()
                keyword = response.xpath(
                    citation.format('keywords')).extract_first()
                pdf_uri = response.xpath(
                    citation.format('pdf_url')).extract_first()
                language = response.xpath(
                    citation.format('language')).extract_first()

                if not abstract:
                    abstract = response.xpath(
                        '//*[text()[re:test(., "{}")]]/parent::*//text()'.
                        format(a)).extract()

                article['title'] = title
                article['abstract'] = abstract
                article['doi'] = doi
                article['uri'] = uri
                article['pdf_uri'] = pdf_uri
                article['publication_date'] = date
                article['keyword'] = keyword
                article['issn'] = issn
                article['language'] = language

                journal['title'] = journal_title
                journal['issn'] = issn
                journal['issue'] = issue
                journal['volume'] = volume

                author['name'] = author_name
                author['affiliate'] = author_institution

                #Match reference with regex
                pattern = "^(\s*References\s*$)|(^\s*Referensi\s*$)"
                pattern2 = r"^[a-zA-Z/[]|['__']{2}"
                pattern3 = r"\s?[a-zA-Z0-9\.\ ]{1}$"

                result = response.xpath(
                    '//*[text()[re:test(., "{}")]]/parent::*//text()'.format(
                        pattern)).extract()

                #Remove control character like \n,\t, etc.
                t = dict.fromkeys(range(32))
                ref = [
                    x.translate(t) for x in result
                    if x.translate(t) and x.translate(t) != "References"
                    and x.translate(t) != "Referensi" and len(x) > 20
                ]

                references['title'] = ""
                references['classification'] = ""

                if len(ref) > 0:
                    data = pd.read_csv(
                        '/home/bandreg/Skripsi/Program/JournalCrawler/scrapy_app/scrapy_app/spiders/data2.csv',
                        index_col=None)

                    vectorizer = CountVectorizer()
                    X1 = vectorizer.fit_transform(data['Reference'].values)

                    test = vectorizer.transform(ref)
                    model = joblib.load(
                        '/home/bandreg/Skripsi/Program/JournalCrawler/scrapy_app/scrapy_app/spiders/model.sav'
                    )
                    result = model.predict(test)

                    references['title'] = ref
                    references['classification'] = result

                #Count item
                self.count += 1
                yield {
                    'journal': journal,
                    'item': item,
                    'article': article,
                    'author': author,
                    'references': references
                }
        else:
            raise CloseSpider('limit reached')
Esempio n. 17
0
    def parse_subscriptions_period_variants(self, response):
        if len(self.devices) < 1:
            self.log("[[ORANGECH]] No devices collected on previous steps. Stopping!")
            return
        self.current_step = 'PROCESS_PLANS_DEVICES'

        if not self._browser_load_page_with_tries(devices_url):
            self.errors.append("Failed to load page with PhantomJS: %s" % devices_url)
            raise CloseSpider("Failed to load page with PhantomJS: %s" % devices_url)

        # reset to SIM-only
        time.sleep(30)
        el = self._browser.find_element_by_xpath("//div[@class='product-item'][not(@id)]//button[contains(text(), 'Select')]")
        self._do_browser_action_tries(el.click)
        time.sleep(30)

        if not self._browser_load_page_with_tries(response.url):
            self.errors.append("Failed to load page with PhantomJS: %s" % response.url)
            raise CloseSpider("Failed to load page with PhantomJS: %s" % response.url)

        for self.current_period in ['12', '24']:

            if self.current_period not in self.processed_priceplans:
                self.processed_priceplans[self.current_period] = {}

            if len(self.priceplans) < 1:
                return

            self.log('[[ORANGECH]] Processing period: %s months' % self.current_period)

            drop_down_el = self._browser.find_element_by_xpath("//form[@id='form_subscription_length']//a[@class='select2-choice']")
            self._do_browser_action_tries(drop_down_el.click)
            el = self._browser.find_element_by_xpath("//ul[@id='select2-results-6']/li/div[contains(text(), '%s')]" % self.current_period)
            self._do_browser_action_tries(el.click)

            for plan_name_base in sorted(self.priceplans):
                # plan_formdata = self.priceplans_formdata[plan_name_base]
                self.log('[[ORANGECH]] Processing base price plan %s with period %s months' % (plan_name_base, self.current_period))
                drop_down_el = self._browser.find_element_by_xpath("//form[@id='form_subscription_choice']//a[@class='select2-choice']")
                self._do_browser_action_tries(drop_down_el.click)
                el = self._browser.find_element_by_xpath("//ul[@id='select2-results-4']/li/div[contains(text(), '%s')]" % plan_name_base)
                self._do_browser_action_tries(el.click)

                if plan_name_base not in self.processed_priceplans[self.current_period]:
                    self.processed_priceplans[self.current_period][plan_name_base] = set()

                for i, variant in enumerate(sorted(self.priceplans_variants[plan_name_base])):
                    grouped_key = ";".join(["%s:%s" % (key, variant[key]) for key in sorted(variant.keys())])

                    if grouped_key in self.processed_priceplans[self.current_period][plan_name_base]:
                        continue

                    if 'young' in plan_name_base.lower():
                        plan_name = plan_name_base[:]
                        for key, value in variant.items():
                            if 'Young' in self.priceplans[plan_name_base][key][value]['name']:
                                plan_name = plan_name + ' ' + self.priceplans[plan_name_base][key][value]['name'].replace("Orange Young", "").replace("Young", "").strip()
                        for key, value in variant.items():
                            if 'Young' not in self.priceplans[plan_name_base][key][value]['name']:
                                plan_name = plan_name + ', ' + self.priceplans[plan_name_base][key][value]['name']
                    else:
                        plan_name = plan_name_base + " " + ", ".join([self.priceplans[plan_name_base][key][variant[key]]['name'] for key in sorted(variant.keys())])

                    price = sum([int(self.priceplans[plan_name_base][key][variant[key]]['price']) for key in variant.keys()])

                    meta = {
                        'plan_name_base': plan_name_base,
                        'grouped_key': grouped_key,
                        'plan_name': plan_name,
                        'per_month': price
                    }

                    self.log('[[ORANGECH]] Selecting price plan %s with period %s months' % (plan_name, self.current_period))

                    for key, value in variant.items():
                        el = self._browser.find_element_by_xpath("//input[@name='%s'][@value='%s']" % (key, value))
                        self._do_browser_action_tries(el.click)
                        time.sleep(5)

                    self.log('[[ORANGECH]] Clicking period again: %s months' % self.current_period)
                    el = self._browser.find_element_by_xpath("//select[@name='contract_length']/option[@value='%s']" % self.current_period)
                    if not el.is_selected():
                        self._do_browser_action_tries(el.click)
                        time.sleep(5)

                    self.log('[[ORANGECH]] Loading device prices for price plan: %s, %s months' % (plan_name, self.current_period))
                    # time.sleep(30)

                    if not self._browser_load_page_with_tries(devices_url):
                        self.errors.append("Failed to load page with PhantomJS: %s" % devices_url)
                        raise CloseSpider("Failed to load page with PhantomJS: %s" % devices_url)
                    hxs = HtmlXPathSelector(text=self._browser.page_source)

                    for item in self.parse_device_prices_for_priceplan(hxs, meta):
                        yield item

                    self.processed_priceplans[self.current_period][plan_name_base].add(grouped_key)

                    if not self._browser_load_page_with_tries(subscriptions_url):
                        self.errors.append("Failed to load page with PhantomJS: %s" % subscriptions_url)
                        raise CloseSpider("Failed to load page with PhantomJS: %s" % subscriptions_url)
Esempio n. 18
0
    def parse_item_page(self, response):
        if self.close_down:
            raise CloseSpider()
        item = IpropertyItem()
        item['url'] = response.url
        item['scraped_date'] = time.strftime("%Y-%m-%d %H:%M:%S")

        # categories
        # pre-filled with None
        for x in xrange(1, 7):
            item['cat_{}'.format(x)] = None
        categories = [
            x for x in response.css("div.breadcrumbs-ld a::text").extract()
            if x != 'Home'
        ]
        for index, cat in enumerate(categories):
            if index > 5:
                raise CloseSpider("Category tree too long: {}".format(
                    ','.join(categories)))

            item['cat_{}'.format(index + 1)] = cat

        # unique ID
        result = re.search(r'.+-(\d+)$', response.url)
        if result:
            item['unique_id'] = result.group(1)

        # title
        item['title'] = next(
            iter(response.css("h1.main-title::text").extract()), '')
        if item['title'][-3:] == '...':
            item['title'] = next(iter(response.css("title ::text").extract()),
                                 '')

        # price
        item['price'] = next(iter(response.css("h2.price::text").extract()),
                             '').replace('RM', '').replace(',', '').strip()

        # address
        item['address'] = next(
            iter(response.css(".building-info-one h2::attr(title)").extract()),
            '')

        # item details
        details = {}
        for d in response.css("ul.infos>li::text").extract():
            if ':' not in d:
                details.setdefault('facility', []).append(d.strip())
            else:
                splitted = d.split(' : ')
                if len(splitted) == 2:
                    details[splitted[0].strip()] = splitted[1].strip()

        # bedroom
        if 'Bedrooms' in details:
            item['bedroom'] = details['Bedrooms']
        else:
            item['bedroom'] = next(
                iter(
                    response.css(
                        ".ld_mis_detail p.room span.bedroom::attr(title)").
                    extract()), '').replace('Bedrooms', '').strip()

        # bathroom
        if 'Bathrooms' in details:
            item['bathroom'] = details['Bathrooms']
        else:
            item['bathroom'] = next(
                iter(
                    response.css(
                        ".ld_mis_detail p.room span.bathroom::attr(title)").
                    extract()), '').replace('Bathrooms', '').strip()

        item['carpark'] = next(
            iter(
                response.css(".ld_mis_detail p.room span.garage::attr(title)").
                extract()), '').replace('Car parks', '').strip()
        item['agent_name'] = next(
            iter(response.css("#agent-info .name a::text").extract()), '')
        item['agent_url'] = next(
            iter(response.css("#agent-info .name a::text").extract()), '')
        item['agent_phone'] = next(
            iter(response.css("#agentPhone::attr(value)").extract()), '')
        item['images'] = list(
            set(response.css("ul.gallery a::attr(href)").extract()))
        item['property_type'] = details.get('Property Type:', '')
        item['tenure'] = details.get('Tenure', '')
        item['land_area'] = details.get('Land Area', '')
        item['builtup'] = details.get('Built-Up', '')
        item['occupancy'] = details.get('Occupancy', '')
        item['furnishing'] = details.get('Furnishing', '')
        item['posted_date'] = details.get('Posted Date', '')
        item['facing_direction'] = details.get('Facing Direction', '')
        item['facility'] = details.get('facility', [])
        item['description'] = ' '.join([x for x in response.css("div.detail-info-wide ::text").extract() if x.strip() != ''])\
            .replace("\n", ' ').replace("\r", " ").replace("  ", " ")

        # expired
        expired = False
        for tag in response.css("h6 ::text").extract():
            if 'expired listing' in tag.lower():
                expired = True
                break
        item['expired'] = expired

        yield item
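
Esempio n. 18 leans heavily on next(iter(selector.extract()), '') to take the first match or fall back to an empty string. The same idea as a named helper is sketched below for readability; recent Scrapy/parsel versions also expose it directly as SelectorList.get(default='').

def first_or_default(selector_list, default=''):
    """Same idea as the repeated next(iter(sel.extract()), '') calls above."""
    return next(iter(selector_list.extract()), default)

# Built-in shortcut in recent Scrapy/parsel:
#     response.css("h1.main-title::text").get(default='')
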
Esempio n. 19
0
    def parse(self, response):
        # Parse articles
        flux_state_script = response \
            .xpath("//script[contains(., 'window.FLUX_STATE')]/text()")

        if not flux_state_script:
            raise CloseSpider(reason='FLUX_STATE not found')

        flux_state_json = flux_state_script.extract_first()[20:]
        flux_state = json.loads(flux_state_json)
        articles = flux_state['adSearch']['data']['ads']

        print(articles)

        for article in articles:
            yield {
                'search':
                self.search_id,
                'url':
                article['url'],
                'original_id':
                article['list_id'],
                'title':
                article['subject'],
                'description':
                article['body'],
                'price':
                article['price'][0],
                'charges_included':
                LeboncoinSpider.get_attribute(article, 'charges_included',
                                              lambda x: bool(int(x))),
                'publication_date':
                self.get_publication_date(article),
                'real_estate_type':
                LeboncoinSpider.get_attribute(article, 'real_estate_type',
                                              None, None, True),
                'rooms':
                LeboncoinSpider.get_attribute(article, 'rooms', int),
                'furnished':
                LeboncoinSpider.get_attribute(article, 'furnished',
                                              lambda x: bool(int(x))),
                'surface':
                LeboncoinSpider.get_attribute(article, 'square', int),
                'images':
                LeboncoinSpider.get_images(article),
                'zipcode':
                article['location']['zipcode'],
                'city':
                article['location']['city'],
                'ges':
                LeboncoinSpider.get_attribute(article, 'ges'),
                'energy_rate':
                LeboncoinSpider.get_attribute(article, 'energy_rate'),
            }

        # Follow pagination (max=nbr_of_pages)
        if self.cur_nbr_of_pages < self.nbr_of_pages:
            self.cur_nbr_of_pages += 1
            next_url = '{}/p-{}'.format(self.start_urls[0],
                                        self.cur_nbr_of_pages)
            yield response.follow(next_url, self.parse)
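
Esempio n. 19 strips the "window.FLUX_STATE = " prefix by slicing off a fixed 20 characters before json.loads, which silently breaks if the prefix ever changes. A slightly more defensive extraction is sketched below; the helper name is made up, and the regex assumes the script tag holds only that assignment, as on the page targeted here.

import json
import re


def extract_flux_state(script_text):
    """Pull the JSON object assigned to window.FLUX_STATE from a <script> body."""
    match = re.search(r'window\.FLUX_STATE\s*=\s*(\{.*\})', script_text, re.DOTALL)
    if not match:
        raise ValueError('FLUX_STATE assignment not found')
    return json.loads(match.group(1))
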
Esempio n. 20
0
    def you_get(self):
        command = ['you-get', '--json', self.start_urls[0]]
        print command
        stdout, stderr = subprocess.Popen(
            command, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE).communicate()
        print 'stdout', stdout, 'stderr', stderr
        if len(stdout) < 2:
            return False

        logger.info('[you-get]' + '[uuid]' + self.uuid)
        video = json.loads(stdout)
        if 'streams' not in video:
            return False
        title = video['title']
        srcs = []
        for key in video['streams'].keys():
            print key
            if 'src' in video['streams'][key]:
                srcs = video['streams'][key]['src']
                print srcs
                break
        concatfile = 'cache/' + self.uuid + '.txt'
        mp4file = 'cache/' + self.uuid + '.mp4'
        for idx, src in enumerate(srcs):
            src_path = 'cache/' + self.uuid + '_' + str(idx) + '.mp4'
            _, success = service.utils.download_file(src, src_path)
            if not success:
                return False
            open(concatfile,
                 'a+').write('file ' + string.replace(src_path, 'cache/', '') +
                             "\n")
        length = service.utils.mergeVideo(mp4file, concatfile)
        print '[merged video duration]', length
        if length == 0:
            return False
        filesize = os.path.getsize(mp4file)
        endpoint, backet, obj = service.utils.paseUploadUrl(self.upload_url)
        print endpoint, backet, obj
        uploadResult = service.utils.uploadVideo(mp4file, endpoint, backet,
                                                 obj)
        print 'uploadResult:', uploadResult
        if not uploadResult:
            return False

        logger.warn('[uploadVideo]' + '[uuid]' + self.uuid)

        data = {
            "video_id": self.uuid,
            "state": 1,
            "message": u'成功',
            "length": length,
            "play_id": self.uuid,
            "size": filesize,
            "cover": '',
            "title": title
        }
        self.callbacked = service.utils.callback_result(self.callback,
                                                        data=data)
        logger.info('[finished]' + str(self.callbacked) + '[uuid]' + self.uuid)

        video_data = {
            'title': title,
            'video_id': self.video_id,
            'author': self.name,
            'publish': time.strftime('%Y-%m-%d %H:%M:%S'),
            'page_url': self.start_urls[0],
            'video_length': length,
            'video_size': filesize,
            'video_url': '',
            'easub_uuid': self.uuid
        }
        self.db.save_video(video_data)
        raise CloseSpider('finished')
Esempio n. 21
0
    def process_item(self, item, spider):
        if isinstance(item, Huangye88KunmingItem):
            # sql = """insert into kuchuan_all(id, app_package, down, trend) VALUES(%s, %s, %s, %s) ON DUPLICATE KEY UPDATE app_package=VALUES(app_package), down=VALUES(down), down=VALUES(trend)"""
            sql = """insert into jianjie_huangye88_kunming (comp_url, comp_name, intro) VALUES(%s, %s, %s)"""
            args = [item['comp_url'], item['comp_name'], item['intro']]
        elif isinstance(item, Huangye88LiuzhouItem):
            sql = """insert into jianjie_huangye88_liuzhou (comp_url, comp_name, intro) VALUES(%s, %s, %s)"""
            args = [item['comp_url'], item['comp_name'], item['intro']]
        elif isinstance(item, ShunqiLiuzhouItem):
            sql = """insert into jianjie_shunqi_liuzhou (comp_url, comp_name, intro) VALUES(%s, %s, %s)"""
            args = [item['comp_url'], item['comp_name'], item['intro']]
        elif isinstance(item, ShunqiKunmingItem):
            sql = """insert into jianjie_shunqi_kunming (comp_url, comp_name, intro) VALUES(%s, %s, %s)"""
            args = [item['comp_url'], item['comp_name'], item['intro']]
        elif isinstance(item, MinglujiLiuzhouItem):
            sql = """insert into jianjie_mingluji_liuzhou (comp_url, comp_name, intro) VALUES(%s, %s, %s)"""
            args = [item['comp_url'], item['comp_name'], item['intro']]
        elif isinstance(item, MinglujiKunmingItem):
            sql = """insert into jianjie_mingluji_kunming (comp_url, comp_name, intro) VALUES(%s, %s, %s)"""
            args = [item['comp_url'], item['comp_name'], item['intro']]
        elif isinstance(item, ShunqiAllItem):
            sql = """insert into jianjie_shunqi_all (comp_url, comp_name, intro, city) VALUES(%s, %s, %s, %s)"""
            args = [
                item['comp_url'], item['comp_name'], item['intro'],
                item['city']
            ]
        # print(str(item['comp_url']) + ' ' + str(item['comp_name']))
        # if len(self.item_list) == 500:
        # 	sql = """insert into jianjie_shunqi_all_copy (comp_url, comp_name, intro, city) VALUES(%s, %s, %s, %s)"""
        # 	self.cursor.executemany(sql, self.item_list)
        # 	self.conn.commit()
        # 	self.item_list.clear()
        # 	print('200 insert')
        # else:
        # 	self.item_list.append([item['comp_url'], item['comp_name'], item['intro'], item['city']])
        elif isinstance(item, Huangye88AllItem):
            sql = """insert into jianjie_huangye88_all (comp_url, comp_name, intro, city) VALUES(%s, %s, %s, %s)"""
            args = [
                item['comp_url'], item['comp_name'], item['intro'],
                item['city']
            ]
        elif isinstance(item, Huangye88AotuItem):
            sql = """insert into jianjie_huangye88_aotu (comp_url, comp_name, intro, posi, shengshi, cat) VALUES(%s, %s, %s, %s, %s, %s)"""
            args = [
                item['comp_url'], item['comp_name'], item['intro'],
                item['posi'], item['shengshi'], item['cat']
            ]
        elif isinstance(item, WuyouAllItem):
            sql = """insert into jianjie_wuyou_all (comp_url, comp_name, intro, area) VALUES(%s, %s, %s, %s)"""
            args = [
                item['comp_url'], item['comp_name'], item['intro'],
                item['area']
            ]
        elif isinstance(item, huang114AllItem):
            sql = """insert into jianjie_114_all_copy (comp_url, comp_name, link_man, tel, email, addr, intro) VALUES(%s, %s, %s, %s, %s, %s, %s)"""
            args = [
                item['comp_url'], item['comp_name'], item['link_man'],
                item['tel'], item['email'], item['addr'], item['intro']
            ]
        elif isinstance(item, ZhizaoAllItem):
            sql = """insert into jianjie_zhizao_all (comp_url, comp_name, addr, intro) VALUES(%s, %s, %s, %s)"""
            args = [
                item['comp_url'], item['comp_name'], item['addr'],
                item['intro']
            ]
        elif isinstance(item, Ca800Item):
            sql = """insert into jianjie_ca800_all (comp_url, comp_name, cat_url, cat, loc, sheng, shi, intro) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)"""
            args = [
                item['comp_url'], item['comp_name'], item['cat_url'],
                item['cat'], item['loc'], item['sheng'], item['shi'],
                item['intro']
            ]
        elif isinstance(item, JiqirenItem):
            sql = """insert into jianjie_jiqiren_all (zhuying, comp_url, comp_name, cat_url, cat, loc, sheng, shi, intro) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)"""
            args = [
                item['zhuying'], item['comp_url'], item['comp_name'],
                item['cat_url'], item['cat'], item['loc'], item['sheng'],
                item['shi'], item['intro']
            ]
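        # NOTE: the following branch repeats the JiqirenItem check and is therefore
        # unreachable as written; the table name suggests it was meant for a
        # different item class, but that class is not shown here.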
        elif isinstance(item, JiqirenItem):
            sql = """insert into ChuanItem (zhuying, comp_url, comp_name, cat_url, cat, loc, sheng, shi, intro) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)"""
            args = [
                item['zhuying'], item['comp_url'], item['comp_name'],
                item['cat_url'], item['cat'], item['loc'], item['sheng'],
                item['shi'], item['intro']
            ]

        else:
            raise CloseSpider('no item match...')
        try:
            self.cursor.execute(sql, args)
            self.conn.commit()
            # print(str(item['comp_url']) + ' ' + str(item['comp_name']))
        except pymysql.err.InterfaceError:
            print('reconnect mysql...')
            time.sleep(3)
            self.__init__()
            self.process_item(item, spider)
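
The long isinstance chain in Esempio n. 21 maps each item class to an INSERT statement plus an argument list. The same mapping can live in a dict, leaving the 'no item match...' CloseSpider as the single fallback; a sketch, assuming the item classes used above are importable from the project's items module (only the first entry is spelled out).

from scrapy.exceptions import CloseSpider
# from myproject.items import Huangye88KunmingItem, ...  # assumed import path

INSERT_MAP = {
    Huangye88KunmingItem: (
        "insert into jianjie_huangye88_kunming (comp_url, comp_name, intro) "
        "VALUES(%s, %s, %s)",
        ('comp_url', 'comp_name', 'intro'),
    ),
    # ... one entry per item class, mirroring the branches above ...
}


def build_insert(item):
    """Return (sql, args) for the item, or close the spider if it is unknown."""
    try:
        sql, fields = INSERT_MAP[type(item)]
    except KeyError:
        raise CloseSpider('no item match...')
    return sql, [item[f] for f in fields]
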
Esempio n. 22
0
    def hooks(self, d):

        if d['status'] == 'finished':
            filename = d['filename']
            l = filename.split('.')
            ext = l[len(l) - 1]
            print ext
            jsonfile = string.replace(filename, ext, 'info.json')
            info = json.loads(open(jsonfile).read())

            outpath = 'cache/' + self.uuid + '_.mp4'
            length = service.utils.coverterMp4(filename, outpath)
            print length, outpath
            if not length:
                logger.error('error trancode mp4' + self.uuid)
                raise CloseSpider('covert failed')
            total_bytes = os.path.getsize(outpath)

            endpoint, backet, obj = service.utils.paseUploadUrl(
                self.upload_url)
            print endpoint, backet, obj
            result = service.utils.uploadVideo(outpath, endpoint, backet, obj)
            # os.remove('cache/' + info['id'] + '*')
            if not result:
                self.logger.error('upload video error', self.uuid)
                raise CloseSpider('upload oss failed')

            print 'easub_uuid', result
            if 'thumbnail' in info:
                cover = service.utils.get_clip_cover_url(
                    info['thumbnail'], self.uuid)
            else:
                cover = ''
            data = {
                "video_id": self.uuid,
                "state": 1,
                "message": u'成功',
                "length": length,
                "play_id": self.uuid,
                "size": total_bytes,
                "cover": cover,
                "title": info['title']
            }
            self.callbacked = service.utils.callback_result(self.callback,
                                                            data=data)
            logger.info('[finished]' + str(self.callbacked) + '[uuid]' +
                        self.uuid)

            video_data = {
                'title': info['title'],
                'video_id': self.video_id,
                'author': info['extractor'],
                'publish': time.strftime('%Y-%m-%d %H:%M:%S'),
                'page_url': info['webpage_url'],
                'video_length': length,
                'video_size': total_bytes,
                'video_url': '',
                'easub_uuid': self.uuid,
                'cover': cover
            }
            self.db.save_video(video_data)

        if d['status'] == 'error':
            print('error', d['filename'])
            raise CloseSpider('download failed')
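
The `hooks` method above follows the youtube-dl progress-hook protocol: youtube-dl calls it with a status dict, and `d['status'] == 'finished'` marks the end of the download. A minimal, self-contained sketch of registering such a hook (the output template and URL are placeholders, not values from the original project):

import youtube_dl


def on_progress(d):
    # called by youtube-dl with a status dict during and after the download
    if d['status'] == 'finished':
        print('downloaded:', d['filename'])
    elif d['status'] == 'error':
        print('download failed:', d.get('filename'))


ydl_opts = {
    'outtmpl': 'cache/%(id)s.%(ext)s',  # placeholder cache layout
    'progress_hooks': [on_progress],
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://example.com/some-video'])  # placeholder URL
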
Esempio n. 23
0
    def parse(self, response):
        print('parsePlayurl', response.url)
        try:
            video_id = self._match_id(self.start_urls[0])
        except AssertionError:
            raise CloseSpider('link not supported')
Esempio n. 24
0
    def get_detail_post(self, response):
        """
        get detail post
        :param response:
        :return:
        """
        if self.close_down:
            raise CloseSpider('OVER NUMBER_POST')
        post_title = response.meta['post_title']
        post_link = response.meta['post_link']

        author = response.xpath(
            '//div[@class="details__author"]//a/img/@alt').extract_first()
        public_date = response.xpath(
            '//div[@class="details__meta"]/div[@class="meta"]/time/text()'
        ).extract_first()
        public_date = datetime.strptime(public_date, '%H:%M - %d/%m/%Y')
        public_date = public_date.timestamp() * 1000

        div_body = response.xpath('//div[@class="pswp-content"]')
        # use relative XPaths so extraction stays inside div_body instead of re-selecting the whole document
        arr_summary = div_body.xpath('.//div[@class="sapo"]//text()').extract()
        summary = ''
        for i in arr_summary:
            i = re.sub(r'\s\s+', ' ', i)
            summary += i
        summary = summary.strip()
        div_content = div_body.xpath('.//div[@class="cms-body detail"]/div/div')
        content = ''
        for part in div_content:
            arr_content = part.xpath('.//text()').extract()
            for i in arr_content:
                i = re.sub(r'\s\s+', ' ', i)
                content += i.strip()

        tag = ''
        try:
            div_tag = response.xpath('//div[@class="details__tags"]/a')
            for a_tag in div_tag:
                str_tag = a_tag.xpath('.//text()').extract_first()
                if str_tag:
                    tag += str_tag.strip() + '/'
        except Exception:
            pass
        id_picture = str(uuid.uuid1()) + str(uuid.uuid1())
        item = CrawlNewsItem()
        item_image = ImageItem()

        item['tbl_tag'] = 'tbl_news'
        item['id_picture'] = id_picture
        if 'source_title' in self.arr_detail:
            item['source_title'] = 'thanh nien'
        if 'source_link' in self.arr_detail:
            item['source_link'] = 'https://thanhnien.vn/'
        if 'category_title' in self.arr_detail:
            item['category_title'] = self.category_title
        if 'category_link' in self.arr_detail:
            item['category_link'] = self.category_link
        if 'post_title' in self.arr_detail:
            item['post_title'] = post_title
        if 'post_link' in self.arr_detail:
            item['post_link'] = post_link
        if 'sumary' in self.arr_detail:
            item['sumary'] = summary
        if 'content' in self.arr_detail:
            item['content'] = content
        if 'author' in self.arr_detail:
            item['author'] = author or ''
        if 'update_time' in self.arr_detail:
            item['update_time'] = int(round(time.time() * 1000))
        if 'public_date' in self.arr_detail:
            item['public_date'] = public_date
        if 'tag' in self.arr_detail:
            item['tag'] = tag
        yield item

        arr_image = div_body.xpath('//img/@src').extract()
        arr_image = list(set(arr_image))
        for i in arr_image:
            if i.startswith('https://image.thanhnien.vn'):
                item_image['tbl_tag'] = 'tbl_images'
                item_image['id_picture'] = id_picture
                item_image['image'] = i
                yield item_image
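
`get_detail_post` above aborts through `CloseSpider('OVER NUMBER_POST')` as soon as `self.close_down` is set, which presumably happens elsewhere once a configured post limit is reached. A minimal sketch of that flag-based limit (the attribute names, limit, and selectors are assumptions, not the original spider's):

import scrapy
from scrapy.exceptions import CloseSpider


class LimitedPostSpider(scrapy.Spider):
    """Sketch: flip a flag after NUMBER_POST links and let detail callbacks stop the crawl."""
    name = 'limited_posts'
    start_urls = ['https://example.com/news']  # placeholder

    NUMBER_POST = 50  # assumed limit
    post_count = 0
    close_down = False

    def parse(self, response):
        for link in response.xpath('//a[@class="post"]/@href').extract():
            self.post_count += 1
            if self.post_count > self.NUMBER_POST:
                self.close_down = True
            yield response.follow(link, callback=self.get_detail_post)

    def get_detail_post(self, response):
        if self.close_down:
            raise CloseSpider('OVER NUMBER_POST')
        yield {'post_link': response.url}
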
Esempio n. 25
0
    def parse_page(self, response):
        '''
        Parse the given page selecting the posts.
        Then ask recursively for another page.
        '''
        #        #open page in browser for debug
        #        from scrapy.utils.response import open_in_browser
        #        open_in_browser(response)

        #select all posts
        for post in response.xpath(
                "//div[contains(@data-ft,'top_level_post_id')]"):

            many_features = post.xpath('./@data-ft').get()
            date = []
            date.append(many_features)
            date = parse_date(date, {'lang': self.lang})
            current_date = datetime.strptime(
                date, '%Y-%m-%d %H:%M:%S') if date is not None else date

            if current_date is None:
                date_string = post.xpath('.//abbr/text()').get()
                date = parse_date2([date_string], {'lang': self.lang})
                current_date = datetime(date.year, date.month,
                                        date.day) if date is not None else date
                date = str(date)

            #if the 'date' argument is reached, stop crawling
            if self.date > current_date:
                raise CloseSpider('Reached date: {}'.format(self.date))

            new = ItemLoader(item=FbcrawlItem(), selector=post)
            if abs(self.count) + 1 > self.max:
                raise CloseSpider(
                    'Reached max num of post: {}. Crawling finished'.format(
                        abs(self.count)))
            self.logger.info('Parsing post n = {}, post_date = {}'.format(
                abs(self.count) + 1, date))
            new.add_xpath('comments', './div[2]/div[2]/a[1]/text()')
            new.add_value('date', date)
            new.add_xpath('post_id', './@data-ft')
            new.add_xpath('url', ".//a[contains(@href,'footer')]/@href")
            #page_url #new.add_value('url',response.url)

            #returns full post-link in a list
            post = post.xpath(".//a[contains(@href,'footer')]/@href").extract()
            temp_post = response.urljoin(post[0])
            self.count -= 1
            yield scrapy.Request(temp_post,
                                 self.parse_post,
                                 priority=self.count,
                                 meta={'item': new})

        #load the following page by trying to click on "more"
        #after a few pages have been scraped, the "more" link might disappear
        #if it is not present, look for the highest year not parsed yet,
        #click once on that year and go back to clicking "more"

        #new_page is different for groups
        if self.group == 1:
            new_page = response.xpath(
                "//div[contains(@id,'stories_container')]/div[2]/a/@href"
            ).extract()
        else:
            new_page = response.xpath(
                "//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href"
            ).extract()
            #this is why lang is needed                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^

        if not new_page:
            self.logger.info(
                '[!] "more" link not found, will look for a "year" link')
            #self.k is the year link that we look for
            if response.meta['flag'] == self.k and self.k >= self.year:
                xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                    self.k) + "')]/@href"
                new_page = response.xpath(xpath).extract()
                if new_page:
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
                else:
                    while not new_page:  #sometimes years are skipped; this handles small year gaps
                        self.logger.info(
                            'Link not found for year {}, trying with previous year {}'
                            .format(self.k, self.k - 1))
                        self.k -= 1
                        if self.k < self.year:
                            raise CloseSpider(
                                'Reached date: {}. Crawling finished'.format(
                                    self.date))
                        xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(
                            self.k) + "')]/@href"
                        new_page = response.xpath(xpath).extract()
                    self.logger.info(
                        'Found a link for year "{}", new_page = {}'.format(
                            self.k, new_page))
                    new_page = response.urljoin(new_page[0])
                    self.k -= 1
                    yield scrapy.Request(new_page,
                                         callback=self.parse_page,
                                         meta={'flag': self.k})
            else:
                self.logger.info('Crawling has finished with no errors!')
        else:
            new_page = response.urljoin(new_page[0])
            if 'flag' in response.meta:
                self.logger.info(
                    'Page scraped, clicking on "more"! new_page = {}'.format(
                        new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': response.meta['flag']})
            else:
                self.logger.info(
                    'First page scraped, clicking on "more"! new_page = {}'.
                    format(new_page))
                yield scrapy.Request(new_page,
                                     callback=self.parse_page,
                                     meta={'flag': self.k})
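
The stopping logic in `parse_page` boils down to two checks per post: compare the post date against `self.date` and the running counter against `self.max`, and raise `CloseSpider` when either limit is crossed. A stripped-down sketch of that pattern (selectors, date format, and thresholds are illustrative, not fbcrawl's actual values):

from datetime import datetime

import scrapy
from scrapy.exceptions import CloseSpider


class DateLimitedSpider(scrapy.Spider):
    """Sketch: stop crawling when posts get older than a cutoff or a budget is spent."""
    name = 'date_limited'
    start_urls = ['https://example.com/feed']  # placeholder

    cutoff = datetime(2019, 1, 1)  # assumed date threshold
    max_posts = 100                # assumed post budget
    count = 0

    def parse(self, response):
        for post in response.xpath('//article'):
            date_str = post.xpath('.//time/@datetime').get()
            if not date_str:
                continue
            post_date = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
            if post_date < self.cutoff:
                raise CloseSpider('Reached date: {}'.format(self.cutoff))
            self.count += 1
            if self.count > self.max_posts:
                raise CloseSpider('Reached max num of posts: {}'.format(self.max_posts))
            yield {
                'date': date_str,
                'url': response.urljoin(post.xpath('.//a/@href').get()),
            }
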
Esempio n. 26
0
    def start_requests(self):
        with open('config.json', 'r') as f:
            data = json.load(f)
            for i in data.items():
                if i[0] == self.name:
                    self.config.append(i)
                    print(i[0])
            # no explicit f.close() needed: the with-statement closes the file

        for v in self.config:
            if len(v[1]) == 1:
                self.Index_Url = v[1][0]['Index_Url']
                print(
                    "At Time %s : spider starts crawling a depth-1 page, Title = %s , Index_Url = %s " %
                    (time.ctime(), v[0], self.Index_Url),
                    file=self.log)
                Max_Page = v[1][0]['Max_Page']
                Final_Url = v[1][0]['Final_Url']
                One_Xpath = v[1][0]['One_Xpath']

                if Max_Page:
                    headers = {
                        'User-Agent':
                        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36"
                    }
                    response = requests.get(self.Index_Url, headers=headers)
                    soup = BeautifulSoup(response.content, "lxml")
                    result = str(soup.select(Max_Page['soup']))
                    pageNums = re.search(Max_Page['re'], result).group()
                if Final_Url:
                    url = re.sub(Final_Url, "{limit}", self.Index_Url)
                    real_url = url.format(limit=pageNums)
                else:
                    real_url = self.Index_Url
                request = Request(real_url, callback=self.parse)
                request.meta['One_Xpath'] = One_Xpath
                yield request

            if len(v[1]) == 2:
                self.Index_Url = v[1][0]['Index_Url']
                print(
                    "At Time %s : spider starts crawling a depth-2 page, Title = %s , Index_Url = %s " %
                    (time.ctime(), v[0], self.Index_Url),
                    file=self.log)
                print("!!!!!!!!!!!!!!!!!!!!!!!!!Index_Url = %s" %
                      self.Index_Url)
                Max_Page = v[1][0]['Max_Page']
                #Head_Url = v[1][0]['Head_Url']
                Post_Data = v[1][0]['Post_Data']

                Two_Xpath = v[1][1]['Two_Xpath']
                headers = {
                    'User-Agent':
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36"
                }
                response = requests.get(self.Index_Url, headers=headers)

                soup = BeautifulSoup(response.content, "lxml")
                result = str(soup.select(Max_Page['soup']))
                pageNums = re.search(Max_Page['re'], result).group()
                #urls = re.sub(Head_Url,"%s",self.Index_Url)
                if Post_Data:
                    self.flag = 1
                urls = get_HeadUrl(self.Index_Url, self.flag)
                if urls == -1:
                    raise CloseSpider(
                        "______________________________ failed to construct the url, crawl finished, please check the log! _____________________________"
                    )

                postdata = ""
                if Post_Data:
                    keys = list(Post_Data.keys())
                    for key in keys:
                        if Post_Data[key]:
                            if re.search(Post_Data[key], str(soup)):
                                postdata += (key + "=" + str(
                                    (re.search(Post_Data[key],
                                               str(soup)).group())).replace(
                                                   "\"", "") + "&")
                            else:
                                postdata += (key + "=" + Post_Data[key] + "&")
                        else:
                            postdata += (key + "={page}&")
                if not postdata:
                    urls = urls.replace("%s", "{page}")
                else:
                    urls = urls % postdata

                for i in range(1, int(pageNums)):
                    url = urls.format(page=str(i))
                    request = Request(url, callback=self.parse)
                    request.meta['Two_Xpath'] = Two_Xpath
                    yield request
            elif len(v[1]) == 3:
                self.Index_Url = v[1][0]['Index_Url']
                print(
                    "At Time %s : spider starts crawling a depth-3 page, Title = %s , Index_Url = %s " %
                    (time.ctime(), v[0], self.Index_Url),
                    file=self.log)
                Max_Page = v[1][0]['Max_Page']
                #Head_Url = v[1][0]['Head_Url']
                Post_Data = v[1][0]['Post_Data']

                Valid_Url = v[1][1]['Valid_Url']

                Three_Xpath = v[1][2]['Three_Xpath']
                headers = {
                    'User-Agent':
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36"
                }
                response = requests.get(self.Index_Url, headers=headers)

                soup = BeautifulSoup(response.content, "lxml")
                result = str(soup.select(Max_Page['soup']))
                pageNums = re.search(Max_Page['re'], result).group()
                #urls = re.sub(Head_Url,"%s",self.Index_Url)
                print("最大页数是:%s" % pageNums)
                if Post_Data:
                    self.flag = 1
                urls = get_HeadUrl(self.Index_Url, self.flag)
                if urls == -1:
                    raise CloseSpider(
                        "______________________________ failed to construct the url, crawl finished, please check the log! _____________________________"
                    )
                #print urls
                postdata = ""
                if Post_Data:
                    keys = list(Post_Data.keys())
                    for key in keys:
                        if Post_Data[key]:
                            if re.search(Post_Data[key], str(soup)):
                                postdata += (key + "=" + quote_plus(
                                    (re.search(Post_Data[key],
                                               str(soup)).group()).replace(
                                                   '"', "")) + "&")
                            else:
                                postdata += (key + "=" + Post_Data[key] + "&")
                        else:
                            postdata += (key + "={page}&")
                if not postdata:
                    urls = urls.replace("%s", "{page}")
                else:
                    urls = urls % postdata
                for i in range(1, int(pageNums)):
                    url = urls.format(page=str(i))
                    request = Request(url, callback=self.parse_first)
                    request.meta['Valid_Url'] = Valid_Url
                    request.meta['Three_Xpath'] = Three_Xpath
                    yield request
Esempio n. 27
0
    def parse_item(self, response):

        item = item_Noticia()

        # HEADLINE
        item['titularNoticia'] = response.xpath(
            XPATH_NOTICIA_TITULO).extract()[0]

        # LINK
        item['linkNoticia'] = response.url

        # KEYWORDS
        # Keywords come in the format "A,B,C"
        item['keywordsNoticia'] = []
        try:
            keywords = response.xpath(
                XPATH_NOTICIA_KEYWORDS).extract()[0].split(",")
            for keyword in keywords:
                item['keywordsNoticia'].append(keyword.strip())
        except:
            item['keywordsNoticia'] = []

        # DESCRIPTION
        item['resumenNoticia'] = response.xpath(
            XPATH_NOTICIA_RESUMEN).extract()

        # AUTHORS
        # When there is more than one author, each one appears in a separate tag
        item['autorNoticia'] = []
        autores = response.xpath(XPATH_NOTICIA_AUTORES).extract()
        for autor in autores:
            item['autorNoticia'].append(autor.strip())

        # LOCATIONS
        # They are not shown in the article. Sometimes they appear with the author, like this:
        # Juan Pérez. Barcelona
        # which clashes with authors who sign with the initials of their first and last names.
        # Example: J. P.
        item['localizacionNoticia'] = []

        # DATE
        # Found inside the article as "YYYY-MM-ddThh:mm:ssZ"
        try:
            item['fechaPublicacionNoticia'] = response.xpath(
                XPATH_NOTICIA_FECHA_PUBLICACION).extract()[0]
        except:
            return

        # PHOTO CAPTION
        # 3 cases: 1) No photo.  2) Caption but NO credit.  3) Caption and credit
        try:
            pieDeFoto = response.xpath(
                XPATH_NOTICIA_FOTO_PIE).extract()[0].strip()
            item['pieDeFotoNoticia'] = pieDeFoto.split("(")[0].strip()
        except:
            item['pieDeFotoNoticia'] = ""
            item['firmaDeFotoNoticia'] = ""

        # PHOTO CREDIT
        try:
            item['firmaDeFotoNoticia'] = pieDeFoto.split("(")[1].split(
                ")")[0].strip()
        except:
            item['firmaDeFotoNoticia'] = ""

        # BODY
        listPartesCuerpo = response.xpath(XPATH_NOTICIA_CUERPO).extract()
        cuerpoNoticia = "".join(listPartesCuerpo)
        cuerpoNoticia = TAG_RE.sub('', cuerpoNoticia)
        item['cuerpoNoticia'] = cuerpoNoticia

        # TAGS
        item['tagsNoticia'] = []
        tagsNoticia = response.xpath(XPATH_NOTICIA_TAGS).extract()
        for tag in tagsNoticia:
            item['tagsNoticia'].append(tag)

        # TEST ZONE
        #self.newsCount+=1
        if self.newsCount > 10:
            raise CloseSpider("\x1b[1;33m" + "Test articles collected" +
                              "\033[0;m")

        yield item
Esempio n. 28
0
    def parse(self, response):
        item = CollectorSpiderItem()
        One_Xpath = response.meta.get('One_Xpath', None)
        Two_Xpath = response.meta.get('Two_Xpath', None)
        Three_Xpath = response.meta.get('Three_Xpath', None)

        if One_Xpath:
            for i in response.xpath(One_Xpath['Lost_Xpath']):
                item['lost_url'] = response.url
                item['lost_from'] = "" if not re.search(
                    One_Xpath['Lost_From'],
                    response.url).group() else re.search(
                        One_Xpath['Lost_From'], response.url).group()
                item['lost_id'] = format_string("" if not i.xpath(
                    One_Xpath['Lost_Id'] if One_Xpath['Lost_Id'] else "/"
                ).extract() else i.xpath(
                    One_Xpath['Lost_Id'] if One_Xpath['Lost_Id'] else "/").
                                                extract()[0])
                item['lost_title'] = format_string("" if not i.xpath(
                    One_Xpath['Lost_Title'] if One_Xpath['Lost_Title'] else "/"
                ).extract() else i.xpath(
                    One_Xpath['Lost_Title'] if One_Xpath['Lost_Title'] else "/"
                ).extract()[0])
                item['lost_describe'] = format_string("" if not i.xpath(
                    One_Xpath['Lost_Describe'] if One_Xpath['Lost_Describe']
                    else "/").extract() else i.xpath(
                        One_Xpath['Lost_Describe']
                        if One_Xpath['Lost_Describe'] else "/").extract()[0])
                item['lost_person'] = format_string("" if not i.xpath(
                    One_Xpath['Lost_Person'] if One_Xpath['Lost_Person'] else
                    "/").extract() else i.xpath(
                        One_Xpath['Lost_Person']
                        if One_Xpath['Lost_Person'] else "/").extract()[0])
                item['lost_time'] = format_time(
                    format_string("" if not i.xpath(
                        One_Xpath['Lost_Time'] if One_Xpath['Lost_Time'] else
                        "/").extract() else i.xpath(
                            One_Xpath['Lost_Time']
                            if One_Xpath['Lost_Time'] else "/").extract()[0]))
                item['lost_location'] = One_Xpath['Lost_Location'][1] + (
                    format_string("" if not i.xpath(One_Xpath['Lost_Location'][
                        0] if One_Xpath['Lost_Location'][0] else "/").extract(
                        ) else i.xpath(One_Xpath['Lost_Location'][0] if
                                       One_Xpath['Lost_Location'][0] else "/").
                                  extract()[0]))
                item['lost_mid'] = hashlib.md5(
                    (item['lost_from'] + item['lost_id'] +
                     item['lost_describe'] +
                     item['lost_time']).encode('utf-8')).hexdigest()[8:-8]
                if os.path.exists(
                        "/home/hong/文档/sina_working/2to3_test/filter.bloom"):
                    #token = str(item['lost_url'])+str(item['lost_id'])+str(item['lost_describe'])
                    token = item['lost_mid']
                    if token in self.bf:
                        print(
                            "\ntime waiting......\ntime waiting......\ntime waiting......\n\nAt Time %s , The spider TOKEN : %s has been destroyed_______________"
                            % (time.ctime(), token),
                            file=self.log)
                        self.log.close()
                        #time.sleep(10)
                        raise CloseSpider(
                            "______________________________ duplicate item captured, crawl finished! _____________________________"
                        )
                yield item

        elif Two_Xpath:
            for i in response.xpath(Two_Xpath['Lost_Xpath']):
                #item['lost_mid'] = resposne.url
                item['lost_url'] = response.url
                item['lost_from'] = "" if not re.search(
                    Two_Xpath['Lost_From'],
                    response.url).group() else re.search(
                        Two_Xpath['Lost_From'], response.url).group()
                item['lost_id'] = format_string("" if not i.xpath(
                    Two_Xpath['Lost_Id'] if Two_Xpath['Lost_Id'] else "/"
                ).extract() else i.xpath(
                    Two_Xpath['Lost_Id'] if Two_Xpath['Lost_Id'] else "/").
                                                extract()[0])
                item['lost_title'] = format_string("" if not i.xpath(
                    Two_Xpath['Lost_Title'] if Two_Xpath['Lost_Title'] else "/"
                ).extract() else i.xpath(
                    Two_Xpath['Lost_Title'] if Two_Xpath['Lost_Title'] else "/"
                ).extract()[0])
                item['lost_describe'] = format_string("" if not i.xpath(
                    Two_Xpath['Lost_Describe'] if Two_Xpath['Lost_Describe']
                    else "/").extract() else i.xpath(
                        Two_Xpath['Lost_Describe']
                        if Two_Xpath['Lost_Describe'] else "/").extract()[0])
                item['lost_person'] = format_string("" if not i.xpath(
                    Two_Xpath['Lost_Person'] if Two_Xpath['Lost_Person'] else
                    "/").extract() else i.xpath(
                        Two_Xpath['Lost_Person']
                        if Two_Xpath['Lost_Person'] else "/").extract()[0])
                item['lost_time'] = format_time(
                    format_string("" if not i.xpath(
                        Two_Xpath['Lost_Time'] if Two_Xpath['Lost_Time'] else
                        "/").extract() else i.xpath(
                            Two_Xpath['Lost_Time']
                            if Two_Xpath['Lost_Time'] else "/").extract()[0]))
                item['lost_location'] = Two_Xpath['Lost_Location'][
                    1] + format_string("" if not i.xpath(
                        Two_Xpath['Lost_Location'][0]
                        if Two_Xpath['Lost_Location'][0] else "/").extract(
                        ) else i.xpath(Two_Xpath['Lost_Location'][0] if
                                       Two_Xpath['Lost_Location'][0] else "/").
                                       extract()[0])
                item['lost_mid'] = hashlib.md5(
                    (item['lost_from'] + item['lost_id'] +
                     item['lost_describe'] +
                     item['lost_time']).encode('utf-8')).hexdigest()[8:-8]
                #time_temp = re.search(r'\d+-\d+-\d+',str(item['lost_time'])).group()
                #if not re.search(r'20',time_temp):
                #	time_temp = "20"+time_temp
                #print "time_temp = %s"%time_temp
                #time_stamp = datetime.datetime(int(re.search(r'\d+',time_temp).group()),int(re.search(r'(?<=-)\d+',time_temp).group()),int(re.search(r'\d+$',time_temp).group()))
                #if time.mktime(time_stamp.timetuple()) < time.mktime(self.one_month_ago.timetuple()):
                #	print >> self.log,"At Time %s , the item[%s] : the datetime is overtimed._____________"%(time.ctime(),time_stamp)
                #	raise CloseSpider("_____________________________The datetime is overtimed,爬取结束!!_______________________")

                if os.path.exists(
                        "/home/hong/文档/sina_working/2to3_test/filter.bloom"):
                    #token = str(item['lost_url'])+str(item['lost_id'])+str(item['lost_describe'])
                    token = item['lost_mid']
                    if token in self.bf:
                        #self.log.write("TRUE, a duplicate element exists, did we get here?")
                        print(
                            "\ntime waiting......\ntime waiting......\ntime waiting......\n\nAt Time %s , The spider TOKEN : %s has been destroyed_______________"
                            % (time.ctime(), token),
                            file=self.log)
                        self.log.close()
                        #time.sleep(10)
                        raise CloseSpider(
                            "______________________________ duplicate item captured, crawl finished! _____________________________"
                        )

                yield item
        else:
            item['lost_url'] = response.url
            item['lost_from'] = "" if not re.search(
                Three_Xpath['Lost_From'], response.url).group() else re.search(
                    Three_Xpath['Lost_From'], response.url).group()
            item['lost_id'] = format_string("" if not response.xpath(
                Three_Xpath['Lost_Id'] if Three_Xpath['Lost_Id'] else "/"
            ).extract() else response.xpath(
                Three_Xpath['Lost_Id'] if Three_Xpath['Lost_Id'] else "/").
                                            extract()[0])
            item['lost_title'] = format_string("" if not response.xpath(
                Three_Xpath['Lost_Title'] if Three_Xpath['Lost_Title'] else "/"
            ).extract() else response.xpath(
                Three_Xpath['Lost_Title'] if Three_Xpath['Lost_Title'] else "/"
            ).extract()[0])
            item['lost_describe'] = format_string("" if not response.xpath(
                Three_Xpath['Lost_Describe'] if Three_Xpath['Lost_Describe']
                else "/").extract() else response.xpath(
                    Three_Xpath['Lost_Describe']
                    if Three_Xpath['Lost_Describe'] else "/").extract()[0])
            item['lost_person'] = format_string("" if not response.xpath(
                Three_Xpath['Lost_Person'] if Three_Xpath['Lost_Person'] else
                "/").extract() else response.xpath(
                    Three_Xpath['Lost_Person']
                    if Three_Xpath['Lost_Person'] else "/").extract()[0])
            item['lost_time'] = format_time(
                format_string("" if not response.xpath(
                    Three_Xpath['Lost_Time'] if Three_Xpath['Lost_Time'] else
                    "/").extract() else response.xpath(
                        Three_Xpath['Lost_Time']
                        if Three_Xpath['Lost_Time'] else "/").extract()[0]))
            #print(type(Three_Xpath['Lost_Location'][1]))
            item['lost_location'] = Three_Xpath['Lost_Location'][
                1] + format_string("" if not response.xpath(
                    Three_Xpath['Lost_Location'][0]
                    if Three_Xpath['Lost_Location'][0] else "/"
                ).extract()[0] else response.xpath(
                    Three_Xpath['Lost_Location'][0]
                    if Three_Xpath['Lost_Location'][0] else "/").extract()[0])
            item['lost_mid'] = hashlib.md5(
                (item['lost_from'] + item['lost_id'] + item['lost_describe'] +
                 item['lost_time']).encode('utf-8')).hexdigest()[8:-8]
            if os.path.exists(
                    "/home/hong/文档/sina_working/2to3_test/filter.bloom"):
                #token = str(item['lost_url']+item['lost_id']+item['lost_describe'])
                token = item['lost_mid']
                if token in self.bf:
                    print(
                        "\ntime waiting......\ntime waiting......\ntime waiting......\n\nAt Time %s , The spider TOKEN : %s has been destroyed_______________"
                        % (time.ctime(), token),
                        file=self.log)
                    self.log.close()
                    #time.sleep(10)
                    raise CloseSpider(
                        "______________________________ duplicate url captured, crawl finished! _____________________________"
                    )
            yield item
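
Esempio n. 28 deduplicates by hashing a few fields into `lost_mid` and probing a persisted Bloom filter (`self.bf`) before yielding, closing the spider on the first repeat. A compact sketch of that fingerprint-and-stop idea, assuming the `pybloom_live` package and an in-memory filter rather than the on-disk `filter.bloom` used above:

import hashlib

from pybloom_live import ScalableBloomFilter  # assumed Bloom filter implementation
from scrapy.exceptions import CloseSpider


class DedupeGuard(object):
    """Sketch: remember item fingerprints and stop the crawl on the first duplicate."""

    def __init__(self):
        self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def fingerprint(self, item):
        # same recipe as above: md5 of the concatenated fields, keeping the middle 16 hex chars
        raw = (item['lost_from'] + item['lost_id'] +
               item['lost_describe'] + item['lost_time'])
        return hashlib.md5(raw.encode('utf-8')).hexdigest()[8:-8]

    def check(self, item):
        token = self.fingerprint(item)
        if token in self.bf:
            raise CloseSpider('duplicate item captured, crawl finished')
        self.bf.add(token)
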
Esempio n. 29
0
    def spider_closed(self, spider):
        self.file.close()
        raise CloseSpider('Shutdown by ctrl-c')
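
`spider_closed` above is the usual Scrapy cleanup handler; note that raising `CloseSpider` inside it has no extra effect, since the spider is already shutting down. A minimal sketch of how such a handler is typically wired up through `from_crawler` (the spider name and output file are placeholders):

import scrapy
from scrapy import signals


class FileClosingSpider(scrapy.Spider):
    name = 'file_closing'
    start_urls = ['https://example.com/']  # placeholder

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(FileClosingSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider.file = open('output.jl', 'a')  # placeholder output file
        # run the cleanup handler when the spider_closed signal fires
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def parse(self, response):
        self.file.write(response.url + '\n')

    def spider_closed(self, spider):
        self.file.close()
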
Esempio n. 30
0
    def _set_start_urls(self, scrape_url):
        self.start_urls = []

        if self.scraper.pagination_type in [
                'R',
                'F',
        ]:
            if not self.scraper.pagination_page_replace:
                msg = 'Please provide a pagination_page_replace context corresponding to pagination_type!'
                self.dds_logger.error(msg)
                raise CloseSpider()

        if self.scraper.pagination_type == 'R':
            try:
                pages = self.scraper.pagination_page_replace
                pages = pages.split(',')
                if len(pages) > 3:
                    raise Exception
                pages = list(range(*list(map(int, pages))))
            except Exception:
                msg = 'Pagination_page_replace for pagination_type "RANGE_FUNCT" ' +\
                      'has to be provided as python range function arguments ' +\
                      '[start], stop[, step] (e.g. "1, 50, 10", no brackets)!'
                self.dds_logger.error(msg)
                raise CloseSpider()
            pages = self.limit_page_nums(pages)

        if self.scraper.pagination_type == 'F':
            try:
                pages = self.scraper.pagination_page_replace
                pages = pages.strip(', ')
                pages = ast.literal_eval("[" + pages + ",]")
            except:
                msg = 'Wrong pagination_page_replace format for pagination_type "FREE_LIST", ' +\
                      "Syntax: 'Replace string 1', 'Another replace string 2', 'A number 3', ..."
                self.dds_logger.error(msg)
                raise CloseSpider()
            pages = self.limit_page_nums(pages)

        if self.scraper.pagination_type in [
                'R',
                'F',
        ]:
            append_str = self.scraper.pagination_append_str
            if scrape_url[-1:] == '/' and append_str[0:1] == '/':
                append_str = append_str[1:]

            self.pages = pages
            if self.conf['MAX_PAGES_READ']:
                self.pages = self.pages[0:self.conf['MAX_PAGES_READ']]
            for page in self.pages:
                url = scrape_url + append_str.format(page=page)
                self.start_urls.append(url)
            if not self.scraper.pagination_on_start and not self.conf[
                    'START_PAGE']:
                self.start_urls.insert(0, scrape_url)
                self.pages.insert(0, "")

        if self.scraper.pagination_type in [
                'N',
                'O',
        ]:
            self.start_urls.append(scrape_url)
            self.pages = [
                "",
            ]
        num = len(self.start_urls)
        if (num == 1):
            url_str = 'URL'
        else:
            url_str = 'URLs'
        self.log(
            "Scraper set to run on {num} start {url_str}.".format(
                num=num, url_str=url_str), logging.INFO)
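
For the 'R' ("RANGE_FUNCT") branch above, `pagination_page_replace` holds comma-separated Python `range()` arguments and `pagination_append_str` is a format string with a `{page}` placeholder. A small standalone sketch of how such a configuration expands into start URLs (the URL and values are made up for illustration):

# assumed example values mirroring the 'R' pagination branch above
scrape_url = 'https://example.com/articles'
pagination_append_str = '/?page={page}'
pagination_page_replace = '1, 50, 10'

pages = list(range(*map(int, pagination_page_replace.split(','))))
start_urls = [scrape_url + pagination_append_str.format(page=page) for page in pages]

print(pages)          # [1, 11, 21, 31, 41]
print(start_urls[0])  # https://example.com/articles/?page=1
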