Example #1
    def parse_first(self, response):
        print('@@@start first page@@@')
        sel = scrapy.Selector(response)
        result_links = sel.xpath('//h3[@class="r"]')
        print("result count: " + str(len(result_links)))
        for result in result_links:
            url = result.xpath('./a/@href').extract()[0]
            if url.endswith(".pdf"):  # skip PDF results
                continue
            yield SplashRequest(
                url,
                self.parse,
                slot_policy=SlotPolicy.SINGLE_SLOT,
                args={
                    'wait': 25,
                    'timeout': 3600,
                })

        pages = sel.xpath('//td/a[@class="fl"]')
        for page in pages:
            url = 'https://www.google.com' + page.xpath('@href').extract()[0]
            yield SplashRequest(
                url,
                self.parse_page,
                slot_policy=SlotPolicy.SINGLE_SLOT,
                args={
                    'wait': 15,
                    'timeout': 3600,
                })

    def parse(self, response):
        global Page  # module-level page counter shared across callbacks
        sel = scrapy.Selector(response)

        image_urls = sel.xpath(
            '//ol[@class="commentlist"]//div[@class="row"]//div[@class="text"]//img/@src'
        ).extract()

        image_names = sel.xpath(
            '//ol[@class="commentlist"]//div[@class="row"]//div[@class="text"]//span[@class="righttext"]/a/text()'
        ).extract()

        new_urls = []
        for src in image_urls:
            # the page serves protocol-relative URLs ("//..."), so prepend a scheme
            new_urls.append('https:' + src)

        item = JandanPicItem()
        item['image_url'] = new_urls
        item['image_name'] = image_names

        yield item

        if Page < 3:
            next_page = response.xpath(
                '//div[@class="comments"]//div[@class="cp-pagenavi"]//a[@class="previous-comment-page"]/@href'
            )
            if next_page:
                # take the first matching link; indexing [1] breaks when only one pagination bar is present
                url = response.urljoin(next_page[0].extract())
                yield scrapy.Request(url, self.parse)
            Page += 1
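
SplashRequest assumes scrapy-splash is wired into the project settings; a minimal sketch following the scrapy-splash README (the Splash URL is illustrative):

    # settings.py
    SPLASH_URL = 'http://localhost:8050'
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    }
    SPIDER_MIDDLEWARES = {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100}
    DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'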
Example #3
    def parse(self, response):
        now = time.strftime('%Y-%m-%d %H:%M:%S')
        hxs = scrapy.Selector(response)

        for h in hxs.css('div.list-article > h1'):
            item = DapnewsItem()
            item['categoryId'] = '1'

            name = h.xpath('a/text()')
            if not name:
                print('DAP => [' + now + '] No title')
            else:
                item['name'] = name.extract_first()

            description = h.xpath(
                'following-sibling::div[@class="article-content"][1]/p/text()')
            if not description:
                print('DAP => [' + now + '] No description')
            else:
                item['description'] = description.extract_first()

            url = h.xpath("a/@href")
            if not url:
                print('DAP => [' + now + '] No url')
                continue  # without a link, the detail request below would raise KeyError
            item['url'] = url.extract_first()

            imageUrl = h.xpath(
                'following-sibling::div[@class="feature-image"][1]/img/@src')
            item['imageUrl'] = ''
            if not imageUrl:
                print('DAP => [' + now + '] No imageUrl')
            else:
                item['imageUrl'] = imageUrl.extract_first()

            request = scrapy.Request(item['url'], callback=self.parse_detail)
            request.meta['item'] = item
            yield request
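
Passing the half-built item through request.meta works, but meta is also visible to middlewares; since Scrapy 1.7, cb_kwargs is the cleaner channel. A minimal sketch (the parse_detail body is illustrative):

    # inside parse(): hand the item to the callback as a keyword argument
    yield scrapy.Request(item['url'], callback=self.parse_detail,
                         cb_kwargs={'item': item})

    # the callback receives it directly in its signature
    def parse_detail(self, response, item):
        item['body'] = response.xpath('string(//article)').get()
        yield item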
Example #4
    def _parseRectorOffice(self, response):
        '''
        Rector's Office
        :return:
        '''
        selector = scrapy.Selector(response)
        item = self._inititem()
        item["url"] = response.url

        name = selector.xpath(
            '//section[@class="eight phone-four columns "]/h1/text()').extract(
            )
        if name:
            item["name"] = StrUtil.delWhiteSpace(name[0])
            logger.debug('>>UNU>>leader>>name>>%s' % item["name"])
        else:
            logger.error('Failed to scrape the UNU leader name; the page structure may have changed, please check')

        work = selector.xpath(
            '//section[@class="eight phone-four columns "]/h4/text()').extract(
            )
        if work:
            item["work"] = StrUtil.delWhiteSpace(work[0])
        else:
            logger.error('Failed to scrape the rector office member position')

        resume = selector.xpath(
            '//section[@class="eight phone-four columns "]/div/ul/li/div'
        ).xpath('string(.)').extract()
        if resume:
            item["resume"] = StrUtil.delWhiteSpace(resume[0])
        else:
            logger.error('Failed to scrape the rector office member resume')

        logger.debug('>>>OECDleader>>>rector office work>>>%s' % item["work"])
        logger.debug('>>>OECDleader>>>rector office name>>>%s' % item["name"])
        logger.debug('>>>OECDleader>>>rector office resume>>>%s' % item["resume"])
        yield item
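
The chained xpath('string(.)') call above is the standard trick for flattening a node's whole text content; a self-contained sketch with illustrative HTML:

    from scrapy import Selector

    fragment = '<li><div>Prof. <b>Jane Doe</b>, Rector</div></li>'
    text = Selector(text=fragment).xpath('//li/div').xpath('string(.)').get()
    # -> 'Prof. Jane Doe, Rector': string(.) concatenates all descendant text nodes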
Example #5
 def _get_session_by_login(self):
     session = requests.session()
     url1 = 'https://account.sogou.com/connect/login?provider=weixin&client_id=2017&ru=https://weixin.sogou.com&third_appid=wx6634d697e8cc0a29&href=https://dlweb.sogoucdn.com/weixin/css/weixin_join.min.css?v=20170315'
     res1 = session.get(url1)
     url2 = res1.url
     state = re.findall(r'state=(.*?)&', url2)[0]  # take the captured value itself, not a one-element list
     url3 = 'https://pb.sogou.com/cl.gif?uigs_t=%s&uigs_productid=vs_web&terminal=web&vstype=weixin&pagetype=index&channel=index_pc&type=weixin_search_pc&wuid=00F83DEFAFA78A1A5C1BAF0649830928&snuid=&uigs_uuid=%s&login=0&uigs_cl=home_login_top&href=javascript:void(0);&uigs_refer=https://weixin.sogou.com/' % (
         str(int(round(
             time.time() * 1000))), str(int(round(time.time() * 1000000))))
     session.get(url3)
     res2 = session.get(url2)
     res2_html = self._response_decode(res2)
     selector = scrapy.Selector(text=res2_html)
     uuid = selector.xpath('//div[@class="wrp_code"]/img').attrib.get(
         'src').split('/')[-1]
     url4 = 'https://open.weixin.qq.com/connect/qrcode/' + uuid
     res4 = session.get(url4, headers={'Referer': url2})
     with open('./weixin/temp/qrcode.png', 'wb') as f:
         f.write(res4.content)
     img = Image.open('./weixin/temp/qrcode.png')
     img.show()
     time.sleep(10)
     ck = ''
     while ck != '405':  # wx_errcode 405 = QR code scanned and confirmed
         url5 = 'https://long.open.weixin.qq.com/connect/l/qrconnect?uuid=%s&_=%s' % (
             uuid, str(int(round(time.time() * 1000))))
         res5 = session.get(url5)
         fre = re.findall(
             r"window.wx_errcode=(\d{3});window.wx_code='(.*)'",
             res5.text)[0]
         ck = fre[0]
         code = fre[1]
         time.sleep(1)  # poll politely instead of hammering the endpoint
     img.close()
     url5 = 'https://account.sogou.com/connect/callback/weixin?code=%s&state=%s' % (
         code, state)
     session.get(url5)
     print('logged in')
     return session
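
scrapy.Selector accepts any HTML string, which is what lets it pair with a requests session as above; the minimal standalone pattern:

    import requests
    from scrapy import Selector

    html = requests.get('https://example.com').text
    print(Selector(text=html).xpath('//title/text()').get())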
Example #6
    def parse_location(self, response):
        json_data = response.meta["json"]
        address = json_data["address"]
        # decode entities
        name = scrapy.Selector(text=json_data["name"]).xpath("//text()").get()

        # These are weird enough that there's no hope of parsing them, but
        # clean the text up
        hours = response.xpath('//strong[text()="Hours:"]/../text()').extract()
        hours = ';'.join(s.strip().replace('\xa0', ' ') for s in hours)

        properties = {
            "ref": re.search(r"postid-(\d+)",
                             response.css("body").attrib["class"])[1],
            "lat": address["latitude"],
            "lon": address["longitude"],
            "addr_full": address["address"],
            "city": address["city"],
            "state": address["state"],
            "postcode": address["zip_code"],
            "name": name,
            "website": response.url,
            "phone": (response.xpath("//*[starts-with(@href, 'tel:')]/@href").get()
                      or "")[4:],
            "opening_hours": hours,
        }
        return GeojsonPointItem(**properties)
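
Round-tripping json_data["name"] through a Selector both strips tags and decodes entities, but //text() with .get() returns only the first text node; html.unescape decodes entities alone. A small sketch with an illustrative string:

    import html
    from scrapy import Selector

    raw = 'Joe&#8217;s <b>Diner</b>'
    Selector(text=raw).xpath('//text()').get()  # 'Joe’s ' -- tags dropped, first text node only
    html.unescape(raw)                          # 'Joe’s <b>Diner</b>' -- entities only, tags kept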
Example #7
    def parse(self, response):

        for reviewer in response.xpath('//tr[contains(@id, "reviewer")]/td[3]/a'):
            href = reviewer.xpath('@href').extract()

            rev_url = 'http://www.amazon.com' + href[0]

            self.driver.get(rev_url)
            rev_id = rev_url.split('/')[-1]
            if rev_id == '':
                rev_id = response.url.split('/')[-2]

            email_xpath = '//a[@id="/gp/profile/' + rev_id + '"]'
            email = ''

            try:
                email_link = self.driver.find_element_by_xpath(email_xpath)
                email_link.click()
                time.sleep(1)
            except Exception:  # no clickable email link on this profile
                email = '-'

            sel = scrapy.Selector(text=self.driver.page_source)

            if email != '-':
                email = sel.xpath(email_xpath + '/text()').extract()[0]
            name = sel.xpath('//h1/text()').extract()[0]

            item = AmazonItem()
            item['name'] = name
            item['email'] = email

            yield item

        self.i += 1
        if self.i <= self.end:
            yield scrapy.Request('http://www.amazon.com/review/top-reviewers?page=' + str(self.i), callback=self.parse)
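
The Selector(text=self.driver.page_source) hand-off is the general recipe for keeping XPath extraction while Selenium drives the browser; a minimal standalone sketch (URL illustrative):

    from scrapy import Selector
    from selenium import webdriver

    driver = webdriver.Chrome()
    driver.get('https://example.com')
    sel = Selector(text=driver.page_source)  # query the rendered DOM with XPath
    print(sel.xpath('//h1/text()').get())
    driver.quit()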
Example #8
    def _parseDirectors(self, response):
        '''
        Directors
        :return:
        '''
        selector = scrapy.Selector(response)
        item = self._inititem()
        item["work"] = "Directors"
        item["url"] = response.url

        name = selector.xpath(
            '//div[@class="col-sm-9 leftnav-content-wrapper"]/h1/text()'
        ).extract()
        if name:
            name[0] = name[0].replace('-', ',')
            try:
                item["name"] = StrUtil.delWhiteSpace(name[0].split(',')[0])
            except Exception:
                logger.warning('The directors page may have changed; please check')
                item["name"] = StrUtil.delWhiteSpace(name[0])
        elif response.url == "http://www.oecd.org/legal/nicola-bonucci-cv.htm":
            name = selector.xpath(
                '//div[@class="span-19 last"]/h1/text()').extract()[0]
            item["name"] = StrUtil.delWhiteSpace(name.split(',')[0])
        else:
            logger.error('Failed to scrape the director name')

        resume = selector.xpath('//div[@id="webEditContent"]').xpath(
            'string(.)').extract()
        if resume:
            item["resume"] = StrUtil.delWhiteSpace(resume[0])
        else:
            logger.error('Failed to scrape the director resume')

        logger.debug('>>>OECDleader>>>director work>>>%s' % item["work"])
        logger.debug('>>>OECDleader>>>director name>>>%s' % item["name"])
        logger.debug('>>>OECDleader>>>director resume>>>%s' % item["resume"])
        yield item
Example #9
    def parse(self, response):
        xs_item = XsmnItem()
        tmp_data = {}
        data_resp = scrapy.Selector(response)

        xs_item['xs_info'] = [
            # day of the week
            data_resp.xpath("//table[@id='MT0']/tr/th[1]/a/text()"
                            ).extract_first(),
            # date
            data_resp.xpath("//table[@id='MT0']/tr/th[1]/text()"
                            ).extract_first(),
            self.now.year
        ]

        for i in range(2, 5):
            # provinces listed in the lottery table
            tmp_location = data_resp.xpath(
                "//table[@id='MT0']/tr/th[{0}]/a/text()".format(
                    i)).extract_first()
            if tmp_location is None:
                continue
            tmp_data[tmp_location] = {}

            for j in range(2, 11):
                # prize rows, from the 8th prize up to the special prize
                tmp_giai = data_resp.xpath(
                    "//table[@id='MT0']/tr[{0}]/td[1]/text()".format(
                        j)).extract_first()
                # winning numbers in this province's column
                tmp_number = data_resp.xpath(
                    "//table[@id='MT0']/tr[{0}]/td[{1}]//text()".format(
                        j, i)).extract()
                tmp_data[tmp_location][tmp_giai] = ", ".join(tmp_number)

        xs_item['xs_data'] = tmp_data

        yield xs_item
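
extract_first() and extract() still work, but current parsel spells them get() and getall(); a quick sketch of the equivalence:

    from scrapy import Selector

    sel = Selector(text='<ul><li>8th prize</li><li>Special prize</li></ul>')
    sel.xpath('//li/text()').get()     # '8th prize'      -- alias of extract_first()
    sel.xpath('//li/text()').getall()  # both text nodes  -- alias of extract()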
Example #10
 def parse_humans_name(self, response):
     if not self.check_page(response.url):
         return
     html = scrapy.Selector(text=response.text)  # the text argument expects str, not bytes
     total_page = html.css(
         "#web-content > div > div > div.pl20.pr20.f14 > div.company_pager > div::text"
     ).extract()
     if len(total_page) != 0:
         pages = int(total_page[0]) + 1
         for page in range(1, pages):
             url = response.url + "/p" + str(page)
             seed = {
                 "url": url,
                 "formUrl": response.url,
                 "status": 0,
                 "ts": time.strftime("%Y-%m-%d %H:%M:%S",
                                     time.localtime(time.time())),
             }
             self.mongo.humans_page_seed_insert(seed)
         print(response.url + "------------------>" + total_page[0])
     else:
         url = response.url + "/p1"
         seed = {
             "url": url,
             "formUrl": response.url,
             "status": 0,
             "ts": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())),
         }
         self.mongo.humans_page_seed_insert(seed)
         print(response.url + "------------------>1")

 def parse_page(self, response):
     NewSeed = response.meta.get('item', '')
     selector = scrapy.Selector(response)
     for _ in selector.xpath('//div[@class="info-box"]/div[@class="info"]'):
         product = _.xpath('h1/text()').extract()
         NewSeed['product'] = ''.join(str(i).strip() for i in product)
         field = _.xpath('ul[@class="subinfo"]/li[@class="l"]/p[1]/a/text()').extract()
         NewSeed['field'] = ''.join(str(i).strip() for i in field)
         platform = _.xpath('ul[@class="subinfo"]/li[@class="l"]/p[2]/span[1]/text()').extract()
         NewSeed['platform'] = ''.join(str(i).strip() for i in platform)
         location = _.xpath('ul[@class="subinfo"]/li[@class="l"]/p[2]/span[2]/text()').extract()
         NewSeed['location'] = ''.join(str(i).strip() for i in location)
         homepage = _.xpath('ul[@class="subinfo"]/li[@class="l"]/p[3]/span[1]/descendant::text()').extract()
         NewSeed['homepage'] = ''.join(str(i).strip() for i in homepage)
         establish_time = _.xpath('ul[@class="subinfo"]/li[@class="r box-fix-r"]/p[1]/text()').extract()
         NewSeed['establish_time'] = ''.join(str(i).strip() for i in establish_time)
         status = _.xpath('ul[@class="subinfo"]/li[@class="r box-fix-r"]/p[2]/text()').extract()
         NewSeed['status'] = ''.join(str(i).strip() for i in status)
         tags = selector.xpath('//div[@class="project-top"]/div[@class="txt"]/div[1]/a/text()').extract()
         NewSeed['tags'] = ''.join(str(i).strip() for i in tags)
         description = selector.xpath('//div[@class="box-plate"]/div[@class="desc"]/text()').extract()
         NewSeed['description'] = re.sub(r'[\n\r ]', '', ''.join(str(i).strip() for i in description))
         contact = _.xpath(
             '//div[@class="project-status"]/div[@class="people-list"]/h4[@class="title"]/a/text()').extract()
         NewSeed['contact'] = ''.join(str(i).strip() for i in contact)
         NewSeed['project_status'] = _.xpath('//div[@class="project-status"]/a/text()').extract_first(default='N/A')
         leadership = selector.xpath(
             '//div[@class="item-list people-list"]/ul/li/div[2]/descendant::text()').extract()
         leadership = list(filter(lambda x: len(x) > 1, [str(_).strip() for _ in leadership]))
         NewSeed['leadership'] = ''.join(str(i).strip() for i in leadership)
         logo_url = selector.xpath('//div[@class="img"]/span[@class="img-middle"]/img/@src').extract()
         NewSeed['company_name'] = selector.xpath('//div[@class="company-box"]/dl[1]/p/a/text()').extract_first(
             default='N/A')
         brief_intro = selector.xpath('//div[@class="company-box"]/dl[1]/dd//text()').extract()
         NewSeed['brief_intro'] = re.sub(r'[\n\r ]', '', ''.join(str(i).strip() for i in brief_intro))
         NewSeed['logo_url'] = ''.join(str(i).strip() for i in logo_url)
         NewSeed['url'] = response.url
         yield NewSeed
Example #12
    def parse(self, response):
        sel = scrapy.Selector(response=response)

        nodes = sel.xpath('//ul[@class="listContent"]/li')
        for node in nodes:
            item = Residential_Brief()
            item['residential_id'] = node.xpath(
                './div[@class="info"]/div[@class="title"]/a/@href'
            ).extract_first(default='')
            item['residential_name'] = node.xpath(
                './div[@class="info"]/div[@class="title"]/a/text()'
            ).extract_first(default='')
            item['district'] = node.xpath(
                './div[@class="info"]/div[@class="positionInfo"]/a[@class="district"]/text()'
            ).extract_first(default='')
            item['bizcircle'] = node.xpath(
                './div[@class="info"]/div[@class="positionInfo"]/a[@class="bizcircle"]/text()'
            ).extract_first(default='')
            item['build_year'] = node.xpath(
                './div[@class="info"]/div[@class="positionInfo"]/text()'
            ).extract_first(default='')
            item['avg_price'] = node.xpath(
                './div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemPrice"]/div[@class="totalPrice"]/span/text()'
            ).extract_first(default='')
            item['avg_price_date'] = node.xpath(
                './div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemPrice"]/div[@class="priceDesc"]/text()'
            ).extract_first(default='')
            item['on_sale_count'] = node.xpath(
                './div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemSellCount"]/a[@class="totalSellCount"]/span/text()'
            ).extract_first(default='')
            yield item

        for node in nodes:
            url = node.xpath(
                './div[@class="info"]/div[@class="title"]/a/@href'
            ).extract_first(default='')
            if url:
                yield scrapy.Request(url, callback=self.resident_detail_parse)
Example #13
    def get_map(self, response):
        value = response.meta['value']
        city = response.meta['city']

        text = response.text
        text = re.sub(r'\\\n', '', text)
        data = json.loads(text)

        items = []

        if 'markers' in data:
            for d in data['markers']:
                address = d['address']
                address = address.replace('&nbsp;', ' ')

                # some addresses arrive as escaped markup, e.g.
                # u'&lt;p&gt;\r\nул.Торайгырова, 53/23\r\n&lt;/p&gt;'
                address = self.html_parser.unescape(address)
                address = scrapy.Selector(
                    text=address).xpath('//text()').get("").strip()
                address = address.replace(u'\xa0', ' ')

                name = d['name']
                name = name.replace(u'\xa0', ' ')

                items.append(
                    dict(
                        name=name.strip(),
                        address=address.strip(),
                        lat=d['lat'],
                        lon=d['lng'],
                    ))

        # item_type is not defined in this method; presumably a module-level constant
        return response.follow(
            'https://www.bcc.kz/local/tmpl/ajax/getmapdata.php?type={}&city={}&lang=s1'
            .format(item_type, value),
            self.get_map_data,
            meta=dict(items=items, city=city))
Example #14
 def parse(self, response):
     self.count += 1
     self.filename = "news.txt"
     URL = 'https://www.2cto.com'
     selector = scrapy.Selector(response)
     books = selector.xpath('//li[@class="clearfix"]')  # one news entry per <li>
     # each page holds 15 news items
     for each in books:
         tag = ""
         author = []
         auth = each.xpath('div/p[@class="tags"]/a').xpath(
             'string(.)').extract()
         for a in auth:
             tag = tag + a + ","
         author.append(tag)
         title = each.xpath('a/text()').extract()
         web = each.xpath('a/@href').extract()
         with open(self.filename, "a", encoding="utf8") as f:
             # append the scraped fields locally, separated by semicolons
             f.write(web[0] + ";")
             f.write(title[0] + ";")
             f.write(author[0] + "\n")
     print("scraped one page")
     sleep(0.1)
     nextPage = selector.xpath(
         '//div[@class="text-c"]/a[contains(text(),"下一页")]/@href').extract(
         )[0]
     if self.count <= 1317:
         next_url = URL + nextPage
         yield scrapy.http.Request(next_url, callback=self.parse)
     else:
         self.database(self.filename)  # past the last page: store the collected info in the database
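
Appending to news.txt by hand works, but if the spider yielded items instead, Scrapy's feed exports would handle serialization; a minimal sketch of the equivalent configuration (Scrapy 2.1+):

    # settings.py
    FEEDS = {
        'news.csv': {'format': 'csv', 'encoding': 'utf8'},
    }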
Example #15
 def explore(self, start_url, scrape_func, next_xpath, max_count=-1):
     try:
         self.browser.get(start_url)
         old_page = self.browser.page_source
         counter = 0
         while True:
             # refer to the following blog for wait trick:
             # http://www.obeythetestinggoat.com/how-to-get-selenium-to-wait-for-page-load-after-a-click.html
             WebDriverWait(self.browser, NEXT_WAIT_TIMEOUT) \
                 .until(EC.element_to_be_clickable((By.XPATH, next_xpath)))
             # always sleep for a while to be polite
             time.sleep(0.3)
             if old_page == self.browser.page_source or \
                 (max_count != -1 and counter >= max_count):
                 break
             else:
                 old_page = self.browser.page_source
             counter += 1
             response = scrapy.Selector(text=self.browser.page_source)
             yield scrape_func(response)
             next_elem = self.browser.find_element_by_xpath(next_xpath)
             cnt = 0
             while cnt < MAX_RETRY:
                 try:
                     ActionChains(self.browser).move_to_element(
                         next_elem).click().perform()
                     break
                 except WebDriverException as we:
                     time.sleep(1)
                     cnt += 1
     except TimeoutException as te:
         sys.stderr.write(
             "Fail to wait for page to be loaded. Error:{}\n".format(te))
     except Exception as oe:
         sys.stderr.write("unexpected exception:{}".format(oe))
         import traceback
         traceback.print_exc()
         raise
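
A usage sketch for the generator above; the hosting object, URL, and XPath are all illustrative:

    # hypothetical scrape function applied to each rendered page
    def scrape_titles(sel):
        return sel.xpath('//h2/a/text()').getall()

    for titles in crawler.explore('https://example.com/list', scrape_titles,
                                  next_xpath='//a[contains(text(), "Next")]',
                                  max_count=5):
        print(titles)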
Example #16
 def list_parse(self, response):
     check(response)
     data = json.loads(response.body)
     selector = scrapy.Selector(text=data['data'], type="html")
     cards = selector.xpath("//div[@class='card']")
     for card in cards:
         item = ListItem()
         item['image'] = card.xpath("a/img/@data-src").extract_first()
         item['code'] = card.xpath("div/a[1]/h4/text()").extract_first()
         if item['code'] in self.database:
             item['title'] = "Crawled"
             yield item
             continue
         item['url'] = "https://www5.javmost.com/{}/".format(item['code'])
         item['title'] = card.xpath("div/a[2]/h5/text()").extract_first()
         item['release_time'] = card.xpath(
             "div/p/text()[2]").extract_first().split('\t')[0].split(" ")[-1]
         item['rating'] = card.xpath(
             "div/p/text()[5]").extract_first().split('\t')[0].split(" ")[-1]
         item['duration'] = card.xpath("div/p/span/text()").extract_first()
         item['genre'] = card.xpath(
             "div/p/a[@class='btn btn-warning btn-xs m-r-5 m-t-2']/text()"
         ).extract()
         item['star'] = card.xpath(
             "div/p/a[@class='btn btn-danger btn-xs m-r-5 m-t-2']/text()"
         ).extract()
         item['maker'] = card.xpath(
             "div/p/a[@class='btn btn-info btn-xs m-r-5 m-t-2']/text()"
         ).extract()
         item['director'] = card.xpath(
             "div/p/a[@class='btn btn-success btn-xs m-r-5 m-t-2']/text()"
         ).extract()
         item['tags'] = card.xpath(
             "div/p/a[@class='btn btn-inverse btn-xs m-r-5 m-t-2']/text()"
         ).extract()
         yield scrapy.Request(item['url'],
                              callback=self.parse,
                              meta={'item': item})
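
The Selector(text=..., type="html") construction is the general pattern for HTML fragments delivered inside JSON; a self-contained sketch:

    import json
    from scrapy import Selector

    payload = '{"data": "<div class=\\"card\\"><h4>CODE-001</h4></div>"}'  # illustrative
    sel = Selector(text=json.loads(payload)['data'], type='html')
    print(sel.xpath("//div[@class='card']/h4/text()").get())  # CODE-001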
Example #17
    def request_captcha(self, response):
        selector = scrapy.Selector(response)
        captcha_url = selector.xpath("//img[@class='verifyimg']").xpath("./@src").extract_first()
        randomKey = selector.xpath("//input[@class='randomkey']").xpath("./@value").extract_first()

        full_captcha_url = self.host + captcha_url
        fileName = self.captcha_file_path()
        urlretrieve(full_captcha_url, fileName)

        # preview the captcha image ('open' is macOS-specific; use 'xdg-open' on Linux)
        open_image_command = "open " + fileName
        os.system(open_image_command)

        captcha_str = input("Enter the captcha: ")
        return scrapy.FormRequest.from_response(
            response,
            formdata={"regionCode": "+86",
                      "account": "手机号",
                      "password": "******",
                      "captcha": captcha_str,
                      "randomKey": randomKey},
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.after_login
        )
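
FormRequest.from_response copies every field already present in the page's form (hidden tokens included), so formdata only needs the overrides; a minimal sketch with illustrative field names:

    def request_login(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formdata={'account': 'user@example.com', 'password': 'secret'},
            callback=self.after_login,
        )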
Example #18
    def parse(self, response):
        sel = scrapy.Selector(response)
        title = sel.xpath('//div[@id="content"]')
        main_topic = sel.xpath(
            '//div[@class="topic-doc"]//div[@class="topic-content"]')
        reply = sel.xpath(
            '//ul[@class="topic-reply"]//div[@class="reply-doc content"]')
        items = []

        title_item = DoubantopicItem()
        title_item['content'] = title.xpath('h1/text()').extract()[0].strip()
        items.append(title_item)

        main_topic_item = DoubantopicItem()
        main_topic_item['content'] = main_topic.xpath('p/text()').extract()[0]
        items.append(main_topic_item)

        for each in reply:
            item = DoubantopicItem()
            item['content'] = each.xpath('p/text()').extract()[0].strip()
            items.append(item)

        return items
Example #19
    def data_parse(self, response):
        data = scrapy.Selector(response).xpath('//p/text()').extract_first()
        data = json.loads(data)['returndata']
        data_nodes = data['datanodes']
        tag_nodes = data['wdnodes'][0]['nodes']

        # store the indicator names
        for i in tag_nodes:
            item = DataNameItem()
            item['name'] = i['name']
            item['memo'] = i['memo']
            item['zb'] = i['code']
            yield item
        # store the data points
        for j in data_nodes:
            item = DataItem()
            item['zb'] = j['wds'][0]['valuecode']
            item['sj'] = j['wds'][1]['valuecode']
            if j['data']['hasdata']:
                item['data_str'] = j['data']['strdata']
            else:
                item['data_str'] = 'null'
            yield item
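
Here the JSON arrives wrapped in an HTML <p>, hence the Selector step; when an endpoint returns bare JSON, response.json() (Scrapy 2.2+) is enough:

    def data_parse(self, response):
        data = response.json()['returndata']  # only for responses whose body is pure JSON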
Example #20
    def parseBooks(self, response):
        selector = scrapy.Selector(response=response)
        VIEWSTATE = selector.xpath('//*[@id="__VIEWSTATE"]/@value').extract_first()
        EVENTVALIDATION = selector.xpath('//*[@id="__EVENTVALIDATION"]/@value').extract_first()
        VIEWSTATEGENERATOR = selector.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first()

        formdata = {
            # change pages here
            "__EVENTTARGET": "ctl00$ContentPlaceHolder1$ddbook",
            "__LASTFOCUS": "",
            "__VIEWSTATE": VIEWSTATE,
            "__VIEWSTATEGENERATOR": VIEWSTATEGENERATOR,
            "__EVENTVALIDATION": EVENTVALIDATION,
            "ctl00$ContentPlaceHolder1$ddbook": "1",
            "__ASYNCPOST": "true"
        }
        header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
            'Cookie': 'ASP.NET_SessionId=3ikackn3wx5ujb5hc2d4y3cx',
            'X-MicrosoftAjax': 'Delta=true',
            'X-Requested-With': 'XMLHttpRequest'
        }

        yield scrapy.FormRequest(url=self.url, formdata=formdata, headers=header, callback=self.parseSections)
Example #21
    def parse(self, response):
        data = response.body.decode()
        selector = scrapy.Selector(text=data)
        total = selector.xpath('//*[@id="main-container"]/div[2]/div')

        for article in total:
            item = MarvelpttItem()
            x1 = article.xpath('./div[2]/a/text()').extract()
            if len(x1) == 0:
                continue
            item['title'] = x1[0]
            item['postUser'] = article.xpath(
                './div[3]/div[1]/text()').extract()[0]
            item['time'] = article.xpath(
                './div[3]/div[3]/text()').extract()[0]

            x2 = article.xpath('./div/span/text()').extract()
            item['push'] = x2[0] if x2 else '0'
            yield item
Example #22
    def parse_result(self, response):
        json_data = json.loads(response.text)  # body_as_unicode() is deprecated; response.text is equivalent
        # extract the html fragment from the json response
        data_html = scrapy.Selector(text=json_data["html"], type="html")
        JOBPOSTS = data_html.xpath('//div[@class="iconcontentpanel"]')
        matched_jobs = {}

        for jobpost in JOBPOSTS:
            JOBTITLE_SELECTOR = 'div div div div h3 span a ::text'  # selects div containing job
            JOBLOC_SELECTOR = '.morelocation span ::text'  # selects span containing job location
            JOBID_SELECTOR = '.text ::text'  # selects element containing jobid
            job = jobpost.css(JOBTITLE_SELECTOR).extract_first(
            ) + " - " + jobpost.css(JOBID_SELECTOR).extract_first()
            matched_jobs[job] = jobpost.css(JOBLOC_SELECTOR).extract_first()

        # print results
        print()
        print("====================Search Result-Jobs====================")
        print()
        for job, location in matched_jobs.items():
            print(job, " - ", location)
        print("==========================================================")
        print()
Example #23
 def parse(self, response):
     sel = scrapy.Selector(response)
     articles = sel.css('h2 a[href^="/wenxue/"]').css('a[href$=".html"]')
     for article in articles:
         article_url = self.base_url + article.css('a::attr(href)').extract()[0]
         yield scrapy.Request(article_url,
                              meta={
                                  'dont_redirect': True,
                                  'handle_httpstatus_list': [302]
                              },
                              callback=self.parsearticle,
                              dont_filter=True)
     # note: time.sleep blocks Scrapy's event loop; DOWNLOAD_DELAY is the usual alternative
     time.sleep(3)
     next_links = sel.css('a[href^="/wenxue/"]')
     for ne in next_links:
         if ne.css("::text").extract()[0] == "下一页":  # link text meaning "next page"
             next_url = self.base_url + ne.css("::attr(href)").extract()[0]
             yield scrapy.Request(next_url,
                                  meta={
                                      'dont_redirect': True,
                                      'handle_httpstatus_list': [302]
                                  },
                                  callback=self.parse,
                                  dont_filter=True)
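
The time.sleep(3) above stalls the entire reactor; Scrapy's own throttling achieves the same politeness without blocking. A settings sketch:

    # settings.py
    DOWNLOAD_DELAY = 3           # seconds between requests to the same domain
    AUTOTHROTTLE_ENABLED = True  # optionally adapt the delay to server latency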
Example #24
 def parse(self, response):
     sel = scrapy.Selector(response)
     div_list = sel.xpath('//div[@class="zp-jobNavigater-popContainer"]')
     for div_item in div_list:
         zwlb_big = div_item.xpath(
             'div[@class="zp-jobNavigater-pop-title"]/text()').extract_first()
         for zwlb in div_item.xpath(
                 'div[@class="zp-jobNavigater-pop-list"]/a/text()').extract():
             url = 'https://fe-api.zhaopin.com/c/i/sou?start=0&pageSize=60&cityId=489&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw={}&kt=3'.format(
                 quote(zwlb))
             yield scrapy.Request(url=url,
                                  callback=self.parse_list,
                                  dont_filter=True,
                                  meta={
                                      'zwlb_big': zwlb_big,
                                      'zwlb': zwlb,
                                      'p': 1,
                                      'size': 60,
                                      'start': 60
                                  },
                                  headers=self.headers)
Example #25
 def parse_info(self, response):  # collect all of the info/*.html page links
     selector = scrapy.Selector(response)
     infos = selector.xpath('//li[re:test(@id, "line_u7_\\d*")]//a//@href'
                            ).extract()  # every info/*.html link on this menu page
     next_page = selector.xpath("//a[@class='Next']/@href").extract()
     for info in infos:
         if "../" in info:
             info = info.replace("../", "")
         if "http" not in info:
             info = self.url + info
         if "soc" in info:
             yield scrapy.Request(url=info, callback=self.parse_text)
     if next_page:  # keep crawling while there is a next page
         next_page = next_page[0]
         if "?" in next_page:
             head_url = response.url.split("?")[0]
         else:
             word = "".join(response.url.split("/")[-1:])
             head_url = response.url.replace(word, "")
         if "../" in next_page:
             next_page = next_page.replace("../", "")
         nextUrl = head_url + next_page
         yield scrapy.Request(url=nextUrl, callback=self.parse_info)
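
The re:test() predicate works because parsel registers the EXSLT regexp namespace by default; a self-contained sketch:

    from scrapy import Selector

    sel = Selector(text='<li id="line_u7_12"><a href="info/1088.html">notice</a></li>')
    print(sel.xpath('//li[re:test(@id, "line_u7_\\d+")]//a/@href').getall())
    # -> ['info/1088.html']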
Example #26
    def parse(self, response):
        sel = scrapy.Selector(response)
        # The result page comes in two formats:
        # if the company_name search returns exactly one result, extract that CIK
        if sel.xpath('//span[@class="companyName"]'):
            self.company_name = sel.xpath(
                '//span[@class="companyName"]/text()').extract_first()
            cik_temp = sel.xpath(
                '//span[@class="companyName"]/a/text()').extract_first()
            self.CIK = cik_temp.split(" ")[0]
            yield scrapy.Request(self.CIK_lookup_url % self.CIK,
                                 callback=self.CIK_parse)
        # if the search returns multiple results, extract the first result's CIK
        else:
            sites = sel.xpath('//div/table/tr')
            for site in sites:
                if site.xpath('td/a/@href'):
                    self.CIK = site.xpath('td[1]/a/text()').extract_first()
                    yield scrapy.Request(self.CIK_lookup_url % self.CIK,
                                         callback=self.CIK_parse)

                if self.CIK is not None:
                    break
Example #27
    def parse(self, response):
        print("CALLING PARSE")
        selector = scrapy.Selector(response)

        # print(response.body)

        urls = selector.xpath(
            '//a[@class="LkLjZd ScJHi U8Ww7d xjAeve nMZKrb  id-track-click "]/@href'
        ).extract()

        link_flag = 0

        links = []
        for link in urls:
            # print("LINK" + str(link))
            links.append(link)

        for each in urls:
            yield Request(url="http://play.google.com" + links[link_flag],
                          callback=self.parse_next,
                          dont_filter=True)
            print("http://play.google.com" + links[link_flag])
            link_flag += 1
Example #28
    def parse(self, response):
        data = json.loads(response.text)
        item = scrapy.Selector(text=data['content'], type="html")

        if not item.css('div.eventon_list_event p.no_events'):
            data = {
                '_type': 'event',
                'id': self._parse_id(item),
                'name': self._parse_name(item),
                'description': self._parse_description(item),
                'classification': self._parse_classification(item),
                'start_time': self._parse_start(item),
                'end_time': self._parse_end(item),
                'all_day': self._parse_all_day(item),
                'timezone': 'America/Chicago',
                'status': self._parse_status(item),
                'location': self._parse_location(item),
                'sources': self._parse_sources(item)
            }
            data['id'] = self._generate_id(data)
            yield data
Example #29
 def parse_youku(self, response):
     self.loggerWithTime("==================>>>>>>>>>>>>>")
     uid = response.meta["uid"]
     url = response.url
     firstresponse = scrapy.Selector(response)
     if response.status == 200:
         titleurl = firstresponse.xpath(
             '//div[@class="tvinfo"]/h2/a/@href').extract()
         # video title
         titlelist = firstresponse.xpath(
             '//div[@class="tvinfo"]/h2/a/text()').extract()
         title = titlelist[0] if titlelist else ''
         meta = {"uid": uid, "title": title, "url": url}
         fullurl = ('https:' + titleurl[0]) if titleurl else ''
         if fullurl != '':
             yield scrapy.http.Request(url=fullurl,
                                       callback=self.parse_youku_second,
                                       meta=meta,
                                       dont_filter=True)
         else:
             self.updateOnSuccess(response.meta["uid"], '', '', 99, '', '')
             self.loggerWithTime(u"youku-else-video[%s]" % url)
Example #30
    def parse(self, response):
        hxs = scrapy.Selector(response)
        if re.match(r'http://www.xiaohuar.com/list-1-\d+.html', response.url):
            items = hxs.xpath('//div[@class="item_list infinite_scroll"]/div')
            # XPath positions are 1-indexed, so enumerate from 1
            for i in range(1, len(items) + 1):
                src = hxs.xpath(
                    '//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/a/img/@src'
                    % i).extract()
                name = hxs.xpath(
                    '//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/span/text()'
                    % i).extract()
                school = hxs.xpath(
                    '//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/div[@class="btns"]/a/text()'
                    % i).extract()
                if src:
                    ab_src = "http://www.xiaohuar.com" + src[0]
                    file_name = "%s_%s.jpg" % (school[0], name[0])
                    file_path = os.path.join("/data/scrapy/pic/", file_name)
                    urllib.request.urlretrieve(ab_src, file_path)  # Python 3: urlretrieve lives in urllib.request
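
Fetching each image with urlretrieve blocks the crawl; the built-in ImagesPipeline downloads asynchronously instead. A minimal sketch using the pipeline's default field names:

    # settings.py
    ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
    IMAGES_STORE = '/data/scrapy/pic'

    # in the callback, yield the URLs and let the pipeline fetch them
    yield {'image_urls': [ab_src]}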