Code example #1
    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        meta = response.meta
        nodes = hxs.xpath("//div[@class='list-infoBox']")
        city = "".join(hxs.xpath('//a[@class="choose-city"]/span/text()').re(r'\S+'))
        for node in nodes:
            title = "".join(node.xpath('.//a[1]/@title').extract())
            nowprice = "".join(node.xpath(".//i[@class='fc-org priType']/text()").extract())
            url = urljoin(self.start_urls[0], "".join(node.xpath('.//a[1]/@href').extract()))
            oldprice = "".join(node.xpath('.//p[@class="priType-s"]/s/text()').extract())
            drivetime = "".join(node.xpath('.//p[@class="fc-gray"]/descendant::text()').extract())
            row = [url, title, nowprice, oldprice, drivetime, city, meta['brand_name']]
            # "writer" is assumed to be a module-level csv.writer (not shown in this excerpt);
            # every field is utf-8 encoded and stripped of all whitespace before writing
            writer.writerow([x.encode("utf8").replace("\n", "").replace("\t", "").replace("\r", "").replace(" ", "") for x in row])
        next_page = hxs.xpath('//a[@class="next"]/@href').extract()
        if next_page:
            url = urljoin(self.start_urls[0], next_page[0])
            yield Request(url, callback=self.parse_item, meta=meta)
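These excerpts reference HtmlXPathSelector, Request, urljoin and a module-level writer without showing any imports. A minimal sketch of the setup example #1 assumes, for Python 2 and the old Scrapy selector API; the output filename is a placeholder:

    # -*- coding: utf-8 -*-
    # Hypothetical module header assumed by parse_item above.
    import csv
    from urlparse import urljoin              # Python 2 stdlib location
    from scrapy.http import Request
    from scrapy.selector import HtmlXPathSelector

    # module-level csv writer that parse_item appends one row per listing to
    writer = csv.writer(open("items.csv", "ab"))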
Code example #2
File: jd_spider.py Project: yh623962721/QbSpider
    def parse_userinfo(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta["item"]
        item["userloginname"] = "".join(
            hxs.xpath("//div[@id='aliasBefore']/strong/text()").extract())
        item["usermail"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'邮箱:')]/following-sibling::div[1]/div/strong/text()"
            ).re(r'\S+'))
        item["userrealname"] = "".join(
            hxs.xpath("//input[@id='realName']/@value").extract())
        item["usertype"] = "".join(
            hxs.xpath(
                u"translate(//div[@class='info-m']/div[contains(text(),'会员类型:')]/text(),'会员类型:','')"
            ).extract())
        safetycenter_url = "https://safe.jd.com/user/paymentpassword/safetyCenter.action"
        yield Request(url=safetycenter_url,
                      callback=self.parse_safetycenter,
                      meta={"item": item})
Code example #3
File: spider_name.py Project: yh623962721/QbSpider
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        print "*" * 66
        print hxs.xpath("//script[@class='J_auto-load']/text()").extract()
        print "-" * 66
        return
Code example #4
    def parse_list(self, response):
        hxs = HtmlXPathSelector(response)
        all_brand_name = hxs.xpath('//span[contains(@class,"brand-all")]/descendant::a/text()').re(r'\S+')
        all_brand_url = hxs.xpath('//span[contains(@class,"brand-all")]/descendant::a/@href').extract()
        for item in zip(all_brand_name, all_brand_url):
            if len(item) == 2:
                yield Request(urljoin(self.start_urls[0], item[1]), callback=self.parse_item, meta={'brand_name': item[0]})
Code example #5
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        all_city_name = hxs.xpath('//div[@class="all-city"]/descendant::a[contains(@data-gzlog,"select_city")]/text()').re(r'\S+')
        all_city_url = hxs.xpath('//div[@class="all-city"]/descendant::a[contains(@data-gzlog,"select_city")]/@href').extract()
        for item in zip(all_city_name, all_city_url):
            if len(item) == 2:
                yield Request(urljoin(self.start_urls[0], item[1]), callback=self.parse_list)
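Examples #1, #4 and #5 read like three callbacks of a single spider that drills down city list, brand list, listing pages. A skeleton of that chain, with the class name and start URL as pure assumptions:

    # Hypothetical spider skeleton tying examples #5 -> #4 -> #1 together.
    import scrapy

    class CarListSpider(scrapy.Spider):
        name = "carlist"
        start_urls = ["http://www.example.com/"]  # placeholder entry point

        # parse (example #5): one Request per city       -> parse_list
        # parse_list (example #4): one Request per brand  -> parse_item
        # parse_item (example #1): write csv rows, follow the "next" link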
Code example #6
    def parse_developer(self, response):
        item = response.meta.get("item")
        hxs = HtmlXPathSelector(response)
        item["stars"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Stars")]/following-sibling::p[1]/text()'
        ).extract_first()
        item["watchers"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Watchers")]/following-sibling::p[1]/text()'
        ).extract_first()
        item["forks"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Forks")]/following-sibling::p[1]/text()'
        ).extract_first()
        item["merged_pull_requests"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Merged Pull Requests")]/following-sibling::p[1]/text()'
        ).extract_first()
        item["total_issues"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Total Issues")]/following-sibling::p[1]/text()'
        ).extract_first()
        item["closed_issues"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Closed Issues")]/following-sibling::p[1]/text()'
        ).extract_first()
        item["contributors"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Contributors")]/following-sibling::p[1]/text()'
        ).extract_first()
        item["total_new_commits"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Total new commits")]/following-sibling::p[1]/text()'
        ).extract_first()
        yield item
        self.item_counts += 1
        self.logger.info("current item counts <{}>".format(self.item_counts))
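The eight lookups in parse_developer differ only in the visible label, so they can collapse into a loop over a field map. A behavior-equivalent sketch, assuming the labels stay unique substrings of their "tab-title" divs:

    # Sketch: DRY variant of the repeated lookups in parse_developer.
    DEV_FIELDS = {
        "stars": "Stars",
        "watchers": "Watchers",
        "forks": "Forks",
        "merged_pull_requests": "Merged Pull Requests",
        "total_issues": "Total Issues",
        "closed_issues": "Closed Issues",
        "contributors": "Contributors",
        "total_new_commits": "Total new commits",
    }
    for key, label in DEV_FIELDS.items():
        item[key] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "%s")]'
            '/following-sibling::p[1]/text()' % label).extract_first()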
Code example #7
File: jd_spider.py Project: yh623962721/QbSpider
    def parse_safetycenter(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta["item"]
        item["userphone"] = "".join(
            hxs.xpath("//strong[@id='mobile']/text()").re(r'\S+'))
        item["useridcard"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'您认证的实名信息:')]/following::strong[2]/text()"
            ).extract())
        order_url = "https://order.jd.com/center/list.action"
        self.items.append(item)
        yield Request(url=order_url, callback=self.parse_order_year)
Code example #8
    def parse_renqizhishu(self, response):
        html = HtmlXPathSelector(response)
        popularity_ranking = ''.join(
            html.xpath(u"//*[contains(text(),'第')]/text()").extract())
        item = {"popularity_ranking": popularity_ranking}
        self.con.hmset(self.jobid, item)
        del response
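self.con here behaves like a Redis client: hmset stores the scraped dict into a hash keyed by the job id. A sketch of that assumption (connection parameters are placeholders; note that redis-py 3.x deprecates hmset in favour of hset with a mapping):

    # Sketch: the redis connection these spiders appear to assume.
    import redis

    con = redis.StrictRedis(host="localhost", port=6379, db=0)  # placeholder params
    con.hmset("some-jobid", {"popularity_ranking": u"第3名"})     # one hash per job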
Code example #9
File: jd_spider.py Project: yh623962721/QbSpider
    def parse_order_list(self, response):
        hxs = HtmlXPathSelector(response)
        orders_urls = hxs.xpath("//a[@name='orderIdLinks']/@href").extract()
        headers = dict(response.request.headers)
        headers.update({"Referer": None})
        # rebuild a cookie dict from the raw "Cookie" request header
        sess = {}
        cookie = response.request.headers.getlist('Cookie')[0].split(";")
        for cook in cookie:
            sess.update({
                cook[:cook.index("=")]:
                cook[cook.index("=") + 1:].replace('"', "")
            })
        for order_url in orders_urls:
            if "orderId=" in order_url or "orderid" in order_url:
                #self.queues.push(Request(url=urljoin(self.start_urls[0],order_url),meta={"jobid":self.jobid},headers=headers))
                yield Request(url=urljoin(self.start_urls[0], order_url),
                              cookies=sess,
                              meta={"jobid": self.jobid},
                              callback=self.parse_items)
        next_page_url = hxs.xpath("//a[@class='next']/@href").extract()
        if next_page_url:
            for next_url in next_page_url:
                yield Request(url=urljoin(self.start_urls[0], next_url),
                              callback=self.parse_order_list)
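Note that the manual Cookie-header split above leaves a leading space on every key after the first, because pairs are separated by "; ". A slightly more robust parser, shown as a sketch:

    # Sketch: whitespace-safe version of the cookie parsing in parse_order_list.
    def parse_cookie_header(raw):
        sess = {}
        for pair in raw.split(";"):
            name, _, value = pair.strip().partition("=")
            sess[name] = value.replace('"', "")
        return sess

    print parse_cookie_header('a=1; b="2"')  # {'a': '1', 'b': '2'}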
Code example #10
    def parse_community(self, response):
        item = response.meta.get("item")
        coin_name = response.meta.get("coin_name")
        hxs = HtmlXPathSelector(response)
        item['subscribers'] = hxs.xpath(
            '//a[@rel="nofollow"][contains(text(),"Subscribers")]/../following-sibling::p[1]/text()'
        ).extract_first()
        item["followers"] = hxs.xpath(
            '//a[@rel="nofollow"][contains(text(),"Followers")]/../following-sibling::p[1]/text()'
        ).extract_first()
        item["likes"] = hxs.xpath(
            '//a[@rel="nofollow"][contains(text(),"Likes")]/../following-sibling::p[1]/text()'
        ).extract_first()
        item["avg_users_online"] = hxs.xpath(
            '//div[contains(@class, "social-media")][contains(text(), "Online")]/p[1]/text()'
        ).extract_first()
        item["avg_new_hot_posts_per_hour"] = hxs.xpath(
            '//div[contains(@class, "social-media")][contains(text(), "New Hot")]/p[1]/text()'
        ).extract_first()
        item["avg_new_comments_on_hot_posts_per_hour"] = hxs.xpath(
            '//div[contains(@class, "col-md")][contains(text(), "Comments")]/p[1]/text()'
        ).extract_first()
        url = "https://www.coingecko.com/en/coins/{}/developer#panel".format(
            coin_name)
        yield Request(url=url,
                      callback=self.parse_developer,
                      meta={"item": item},
                      dont_filter=True)
Code example #11
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        for data in hxs.xpath("//*[@id='gecko-table']/tbody/tr"):
            item = IcoItem()
            item["name"] = data.xpath(
                './/td[@class="coin-name"]//span[@class="coin-content-name"]/text()'
            ).extract_first()
            item["symbol"] = data.xpath(
                './/td[@class="coin-name"]//span[@class="coin-content-symbol"]/text()'
            ).extract_first()
            item["img_url"] = "https:" + data.xpath(
                './/td[@class="coin-name"]//img/@data-src').extract_first()
            item["other"] = data.xpath(
                './/td[@class="coin-name"]//small/text()').extract_first()
            item["developer"] = data.xpath(
                './/td[@class="td-developer_score dev"]/div[1]/text()'
            ).extract_first()
            item["community"] = data.xpath(
                './/td[@class="td-community_score community"]/div[1]/text()'
            ).extract_first()
            item["public_interest"] = data.xpath(
                './/td[@class="td-public_interest_score pb-interest"]/div[1]/text()'
            ).extract_first()
            item["total"] = data.xpath(
                './/td[@class="total"]/div[1]/text()').extract_first()
            coin_name = data.xpath(
                './/td[@class="coin-name"]//a[@class="currency_exchangable_chart_link"]/@href'
            ).re(r'price_charts/([\S\s]+?)/usd')
            url = "https://www.coingecko.com/en/coins/{}#panel".format(
                coin_name[0])
            yield Request(url=url,
                          callback=self.parse_baseinfo,
                          meta={
                              "item": item,
                              "coin_name": coin_name[0]
                          },
                          dont_filter=True)
        next_page_url = hxs.xpath('//link[@rel="next"]/@href').extract_first()
        self.logger.info("current page url <{}>".format(next_page_url))
        if next_page_url:  # guard added: response.follow(None) would raise on the last page
            yield response.follow(next_page_url, self.parse)
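Examples #11, #14, #10 and #6 form one CoinGecko spider; each callback fills more fields onto the same item and forwards it through Request.meta before parse_developer finally yields it:

    # Callback chain used by this CoinGecko spider (from the excerpts above):
    #   parse           -> parse_baseinfo   .../en/coins/<name>#panel
    #   parse_baseinfo  -> parse_community  .../en/coins/<name>/social#panel
    #   parse_community -> parse_developer  .../en/coins/<name>/developer#panel
    #   parse_developer -> yield item
    # dont_filter=True bypasses Scrapy's duplicate filter for these requests.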
Code example #12
File: jd_spider.py Project: yh623962721/QbSpider
    def parse_order_year(self, response):
        hxs = HtmlXPathSelector(response)
        order_urls = hxs.xpath(
            "//div[@class='time-list']/ul/li[position()>1]/a/@_val").extract()[0:2]
        order_url = "https://order.jd.com/center/list.action?search=0&d="
        for urls in order_urls:
            yield Request(url=order_url + urls, callback=self.parse_order_list)
Code example #13
    def parse_district_num(self, response):
        html = HtmlXPathSelector(response)
        district_num = ''.join(
            html.xpath("//span[@class='num']/text()").extract()).replace(
                "(", "").replace(")", "")
        item = {"district_num": district_num}
        self.con.hmset(self.jobid, item)
        del response
Code example #14
    def parse_baseinfo(self, response):
        item = response.meta.get("item")
        coin_name = response.meta.get("coin_name")
        hxs = HtmlXPathSelector(response)
        item['liquidity'] = hxs.xpath(
            '//div[@class="score"][contains(text(), "Liquidity")]/span/text()'
        ).extract_first()
        item["hash_algorithm"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Hashing Algorithm")]/following-sibling::p[1]/text()'
        ).extract_first()
        item["hash_rate"] = hxs.xpath(
            '//div[@class="hashrate"]/p/text()').extract_first()
        item["block_time"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Block Time")]/following-sibling::p[1]/text()'
        ).extract_first()
        item["homepage"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Homepage")]/following-sibling::p[1]/a/text()'
        ).extract_first()
        item["block_chain_supply"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Blockchain/Supply")]/following-sibling::p[1]/a/text()'
        ).extract_first()
        item["discussion_forum"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Discussion Forum")]/following-sibling::p[1]/a/text()'
        ).extract_first()
        item["available_total_supply"] = hxs.xpath(
            '//div[@class="tab-title"][contains(text(), "Available/Total Supply")]/following-sibling::p[1]/text()'
        ).extract_first()
        url = "https://www.coingecko.com/en/coins/{}/social#panel".format(
            coin_name)
        yield Request(url=url,
                      callback=self.parse_community,
                      meta={
                          "item": item,
                          "coin_name": coin_name
                      },
                      dont_filter=True)
Code example #15
File: jd_spider.py Project: DahuK/Crawler
    def parse_detail(self, response):
        print "--------------------------"
        self.logger.info("--------------------------")
        hxs = HtmlXPathSelector(response)
        items = []
        price_url_pre = "http://p.3.cn/prices/mgets?skuIds=J_"
        for gl_item in hxs.xpath("//*[@id='plist']/ul/li[@class='gl-item']"):
            # self.logger.info("GGGGGGGGGGGGGGGGGGGGGGGGG: %s" % gl_item.extract())
            book_element = gl_item.xpath("div[@class='tab-content-item j-sku-item tab-cnt-i-selected']")
            if book_element is None or len(book_element) == 0:
                book_element = gl_item.xpath("div")

            data_sku_id = self.get_xpath_val(book_element, "@data-sku")
            price_url = price_url_pre + data_sku_id
            item = JdbookItem()
            item["name"] = self.get_xpath_val(book_element, "div[3]/a/em/text()")
            item["publisher"] = self.get_xpath_val(book_element, "div[4]/span[2]/a/text()")
            item["author"] = self.get_xpath_val(book_element, "div[4]/span[1]/span[1]/a[1]/text()")
            item["commit"] = self.get_xpath_val(book_element, "div[6]/strong/a/text()")
            item["shop"] = self.get_xpath_val(book_element, "div[7]/span/text()")
            r = Request(price_url, callback=self.parse_price, dont_filter=True, meta={"item": item})
            items.append(item)
            yield r
Code example #16
File: jd_spider.py Project: yh623962721/QbSpider
    def sendphonecode(self, response):
        hxs = HtmlXPathSelector(response)
        sendphonecodekey = "".join(
            hxs.xpath(
                "translate(//*[@id='sendMobileCode']/@href,'javascript:sendFindPwdCode(|);','')"
            ).extract()).replace("'", "")
        if self.vercode is None:
            item = JdItem()
            logging.warning(
                msg="Logging in to JD needs a phone vercode; sending the code to the user's phone")
            item["status"] = 2
            jobid = self.settings.get("jobid", None)
            item["jobid"] = jobid
            #self.con.lpush(jobid, item)
            self.items.append(item)
            yield Request(url=self.sendcodeurl % sendphonecodekey,
                          dont_filter=True)
        else:
            yield Request(url=self.validatecodeurl %
                          (self.vercode, sendphonecodekey),
                          callback=self.checkphonekey)
Code example #17
    def parse_starts(self, response):
        jobid = response.meta["jobid"]
        html = HtmlXPathSelector(response)
        meta = response.meta
        meta["start_5"] = ''.join(
            html.xpath(u"//a[text()='5星']/following-sibling::em/text()").extract()
        ).replace(")", "").replace("(", "")
        meta["start_4"] = ''.join(  # key was misspelled "strat_4"
            html.xpath(u"//a[text()='4星']/following-sibling::em/text()").extract()
        ).replace(")", "").replace("(", "")
        meta["start_3"] = ''.join(
            html.xpath(u"//a[text()='3星']/following-sibling::em/text()").extract()
        ).replace(")", "").replace("(", "")
        meta["start_2"] = ''.join(
            html.xpath(u"//a[text()='2星']/following-sibling::em/text()").extract()
        ).replace(")", "").replace("(", "")
        meta["start_1"] = ''.join(
            html.xpath(u"//a[text()='1星']/following-sibling::em/text()").extract()
        ).replace(")", "").replace("(", "")
        next_page = ''.join(html.xpath(u"//a[text()='下一页']/@href").extract())  # unused here; pagination is handled in parse_item
        item = {}
        item["business_details"] = meta
        self.items.append(item)
        yield Request(urljoin(meta["review_urls"], "?pageno=1"),
                      callback=self.parse_item,
                      meta=meta)
        del response
Code example #18
File: jd_spider.py Project: yh623962721/QbSpider
    def parse_ballancecount(self, response):
        item = JdItem()
        hxs = HtmlXPathSelector(response)
        item["spidertime"] = time.strftime('%Y%m%d%H%M%S',
                                           time.localtime(time.time()))
        item["username"] = self.username
        item["passwd"] = self.passwd
        item["usernickname"] = "".join(
            hxs.xpath("//div[@class='u-name']/a/text()").extract())
        item["userrank"] = "".join(
            hxs.xpath("//div[@class='u-level']/span/a/text()").extract())
        item["balance"] = "".join(
            hxs.xpath("//a[@id='BalanceCount']/text()").extract())
        item["baitiaobalance"] = "".join(
            hxs.xpath("//span[@class='baitiao-limit']/text()").extract())
        item["wallet"] = "".join(
            hxs.xpath("//div[@id='balance']/a[2]/em/text()").extract())
        item["yesprofit"] = "".join(
            hxs.xpath("//div[@class='ftx01 profit']/a/text()").extract())
        userinfo_url = 'https://i.jd.com/user/info'
        yield Request(url=userinfo_url,
                      callback=self.parse_userinfo,
                      meta={"item": item})
Code example #19
    def parse_item(self, response):
        jobid = response.meta["jobid"]
        html = HtmlXPathSelector(response)
        meta = response.meta

        def clean(text):
            # strip all whitespace characters from a joined text fragment
            return text.replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")

        item = {}
        _item = []
        for comment in html.xpath("//*[@class='comment-list']/ul/li"):
            comments = {}
            comments["start"] = clean(''.join(
                comment.xpath(".//*[@class='user-info']/span/@title").extract()))
            comments["taste"] = clean(''.join(
                comment.xpath(u".//*[contains(text(),'口味')]/em/text()").extract()
            ).replace(")", "").replace("(", ""))
            comments["environment"] = clean(''.join(
                comment.xpath(u".//*[contains(text(),'环境')]/em/text()").extract()
            ).replace(")", "").replace("(", ""))
            comments["service"] = clean(''.join(
                comment.xpath(u".//*[contains(text(),'服务')]/em/text()").extract()
            ).replace(")", "").replace("(", ""))
            comments["review_text"] = clean(''.join(
                comment.xpath(".//*[@class='comment-txt']/div/text()").extract()))
            comments["review_time"] = clean(''.join(
                comment.xpath(".//*[@class='time']/text()").extract()))
            comments["discussant"] = clean(''.join(
                comment.xpath(".//*[@class='name']/a/text()").extract()))
            comments["discussant_contribution"] = clean(''.join(
                comment.xpath(".//*[@class='contribution']/span/@title").extract()))
            _item.append(comments)
        item["review_details"] = _item
        item["review_page"] = ''.join(
            html.xpath("//span[@class='PageSel']/text()").extract())
        self.items.append(item)
        next_page = ''.join(html.xpath(u"//a[text()='下一页']/@href").extract())
        if next_page:
            yield Request(urljoin(meta["review_urls"], next_page),
                          callback=self.parse_item,
                          meta=meta)
        del response
Code example #20
    def parse_xiangxi(self, response):
        html = HtmlXPathSelector(response)
        meta = response.meta

        def clean(text):
            # strip all whitespace characters from a joined text fragment
            return text.replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")

        meta["title"] = clean(''.join(
            html.xpath("//*[@id='basic-info']/h1/text()").extract()))
        meta["start"] = clean(''.join(
            html.xpath("//*[@id='basic-info']/div[1]/span[1]/@title").extract()))
        meta["mean_price"] = clean(''.join(
            html.xpath("//*[@id='basic-info']/div[1]/span[3]/text()").extract()))
        meta["address"] = clean(''.join(
            html.xpath("//*[@id='basic-info']/div[2]/span[2]/text()").extract()))
        meta["taste"] = clean(''.join(
            html.xpath("//*[@id='basic-info']/div[1]/span[4]/text()").extract()))
        meta["environment"] = clean(''.join(
            html.xpath("//*[@id='basic-info']/div[1]/span[5]/text()").extract()))
        meta["service"] = clean(''.join(
            html.xpath("//*[@id='basic-info']/div[1]/span[6]/text()").extract()))
        meta["tel"] = clean(''.join(
            html.xpath("//*[@id='basic-info']/p[1]/span[2]/text()").extract()))
        meta["review_num"] = clean(''.join(
            html.xpath("//*[@id='comment']/h2/a[2]/span/text()").extract()
        ).replace(")", "").replace("(", ""))
        # more_review = ''.join(html.xpath("//*[@id='comment']/p/a/@href").extract())
        more_review = ''.join(
            html.xpath(u"//a[contains(text(),'更多点评')]/@href").extract())
        review_url = meta["url"] + "/review_more#start=10"
        meta["review_urls"] = meta["url"] + "/review_more"
        yield Request(review_url, callback=self.parse_starts, meta=meta)
        del response
Code example #21
    def parse_get_xinxi(self, response):
        logger.info(msg="<%s>,get HUNANGJJ XX!!!" % self.idcard)
        # f = open("parse_get_xinxi.html","w")
        # f.write(response.body.decode("gb2312", "ignore"))
        # f.close()
        html = HtmlXPathSelector(response)
        item = {}
        item["realName"] = ''.join(
            html.xpath(u"//td[text()='职工姓名']/../td[2]//text()").extract()).strip()
        #item["银行账号"] = ''.join(html.xpath("//tr[@class='jtpsoft'][1]/td[4]/font/text()").extract()).strip()
        item["idCard"] = ''.join(
            html.xpath(u"//td[text()='身份证号']/../td[2]//text()").extract()).strip()
        if item["idCard"]:
            item["birthday"] = item["idCard"][6:14]
            sex = item["idCard"][-2]
            if int(sex) % 2 == 1:  #### even digit = female, odd digit = male
                item["sex"] = u"男"
            elif int(sex) % 2 == 0:
                item["sex"] = u"女"
            else:
                item["sex"] = ""
        else:
            item["sex"] = ""
            item["birthday"] = ""
        #item["职工账号"] = ''.join(html.xpath("//tr[@class='jtpsoft'][2]/td[4]/font/text()").extract()).strip()
        item["comName"] = ''.join(
            html.xpath(u"//td[text()='所在单位']/../td[2]//text()").extract()).strip()
        item["officeName"] = ''.join(
            html.xpath(u"//td[text()='所属办事处']/../td[4]//text()").extract()).strip()
        item["startDate"] = ''.join(
            html.xpath(u"//td[text()='开户日期']/../td[2]//text()").extract()).strip()
        item["fundStatus"] = ''.join(
            html.xpath(u"//td[text()='当前状态']/../td[4]//text()").extract()).strip()
        item["monthPayBase"] = ''.join(
            html.xpath(u"//td[text()='月缴基数']/../td[2]//text()").extract()).strip()
        item["fundRatio"] = ''.join(
            html.xpath(u"//td[text()='缴存比例']/../td[5]//text()").extract()).strip()
        item["monthPayRmb"] = ''.join(
            html.xpath(u"//td[text()='月缴金额']/../td[2]//text()").extract()).strip()
        item["lastYearBalanceRmb"] = ''.join(
            html.xpath(u"//*[text()='上年余额']/../../td[4]//text()").extract()).strip()
        item["comRmb"] = ''.join(
            html.xpath(u"//td[text()='单位月缴额']/../td[2]//text()").extract()).strip()
        item["yearRepayRmb"] = ''.join(
            html.xpath(u"//*[text()='本年补缴']/../../td[4]//text()").extract()).strip()
        item["perRmb"] = ''.join(
            html.xpath(u"//td[text()='个人月缴额']/../td[2]//text()").extract()).strip()
        item["yearDrawRmb"] = ''.join(
            html.xpath(u"//*[text()='本年支取']/../../td[4]//text()").extract()).strip()
        item["yearPayRmb"] = ''.join(
            html.xpath(u"//td[text()='本年缴交']/../td[2]//text()").extract()).strip()
        item["yearAccrual"] = ''.join(
            html.xpath(u"//*[text()='本年利息']/../../td[4]//text()").extract()).strip()
        item["yearSwitchInRmb"] = ''.join(
            html.xpath(u"//td[text()='本年转入']/../td[2]//text()").extract()).strip()
        item["balance"] = ''.join(
            html.xpath(u"//*[text()='公积金余额']/../../td[4]//text()").extract()).strip()
        item["endDate"] = ''.join(
            html.xpath(u"//td[text()='缴至年月']/../td[2]//text()").extract()).strip()
        logger.info(msg="<%s>,get HUNANGJJ XX OVER!!!" % self.idcard)
        if not any(item.values()):  # was "if not item:", which is never true since the dict always has keys; test the scraped values instead
            self.sign = 0
            self.item_status["fatch_code"] = 2199
            self.con.hmset(self.key, dict(self.item_status))
            # print u"collection failed, no data! 2199"
            logger.info(msg="<%s>,get HUNANGJJ XX NO DATA!!!" % self.idcard)
        else:
            self.tb = HbClient()
            self.tb.insert(colname=u"公积金-湖南-信息查询 ",
                           url=response.url,
                           html=response.body,
                           struct_dic={"basic_info": [item]},
                           id=self.idcard,
                           post_dic=self.form_data,
                           token=self.token)
            logger.info(msg="<%s>,get HUNANGJJ XX OVER!!!" % self.idcard)
Code example #22
    def parse(self, response):
        html = HtmlXPathSelector(response)
        if u"没有找到相应的商户" in "".join(
                html.xpath(u"//h4[contains(text(),'没有找到相应的商户')]/text()")
                .extract()):  # .extract() was missing: join() needs strings, not selectors
            self.items[0]["status"] = 3  # 3 means no merchant was found
            logging.warning(msg="Did not find the corresponding merchant")
            return
        city_name = "".join(
            html.xpath("//a[@class='city J-city']/text()").extract())
        meta = {
            "city": city_name,
            "vendor_name": self.vendor_name,
            "district_name": self.district_name,
            "jobid": self.jobid,
            "base_url": "http://www.dianping.com",
            "type": ""
        }
        Effective_num = 0
        for items in html.xpath("//div[@id='shop-all-list']/ul/li"):
            title_name = "".join(
                items.xpath(".//div[@class='tit']/a/h4/text()").extract())
            if meta["vendor_name"] in title_name:
                Effective_num += 1
                title_url = "".join(
                    items.xpath(".//div[@class='tit']/a[position()=1]/@href").extract())
                meta["type"] = "".join(
                    items.xpath(
                        ".//div[@class='tag-addr']/a[position()=1]/span/text()"
                    ).extract()).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
                meta["business_district"] = "".join(
                    items.xpath(
                        ".//div[@class='tag-addr']/a[position()=2]/span/text()"
                    ).extract()).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
                meta["url"] = urljoin(meta["base_url"], title_url)
                yield Request(urljoin(meta["base_url"], title_url),
                              meta=meta,
                              callback=self.parse_xiangxi)
        if Effective_num == 0:
            self.items[0]["status"] = 5  # 5 means merchants were found, but not the specified one
            logging.warning(msg="Not a specified merchant")
            return
        next_page = "".join(html.xpath(u"//a[text()='下一页']/@href").extract())
        if next_page:
            yield Request(urljoin(meta["base_url"], next_page),
                          callback=self.parse,
                          meta=meta)
        del response
Code example #23
File: jd_spider.py Project: yh623962721/QbSpider
    def parse_items(self, response):
        self.jobid = response.meta["jobid"]
        item = {}
        hxs = HtmlXPathSelector(response)
        item["url"] = response.url
        # price_re is assumed to be a module-level regex for numeric amounts (not shown in this excerpt)
        item["orderid"] = "".join(
            hxs.xpath(
                u"//input[@id='orderid']/@value|//div[@class='w o-detail cj-share']/@orderid|//div[contains(text(),'订单号:')]/text()"
            ).re(r"%s" % price_re))
        if item["orderid"] == "":
            return Request(url=response.url,
                           meta={"jobid": self.jobid},
                           callback=self.parse_items,
                           dont_filter=True)
        item["ordertime"] = "".join(
            hxs.xpath(
                u"translate(//li[contains(text(),'下单时间:')]/text()|//input[contains(@id,'datesubmit-')]/@value|//td[contains(text(),'下单时间')]/following-sibling::td[1]/text(),'下单时间:','')"
            ).re(r'\S+'))
        item["ordercount"] = "".join(
            hxs.xpath(
                u"translate(//li[contains(text(),'充值面额:')]/text()|//span[contains(text(),'商品总额:')]/following-sibling::div[1]/span/text()|//td[contains(text(),'商品金额')]/following-sibling::td[1]/strong/text(),'充值面额:','')"
            ).re(r"%s" % price_re))
        if "orderId" in response.url:
            item["receivername"] = "".join(
                hxs.xpath(
                    u"//div[contains(text(),'收货人信息:')]/following-sibling::div[1]/text()"
                ).re(r'([\S\s]+?)('))
            item["receiverphone"] = "".join(
                hxs.xpath(
                    u"//div[contains(text(),'收货人信息:')]/following-sibling::div[1]/text()"
                ).re(r"(([\S\s]+?))"))
            item["receiveraddress"] = "".join(
                hxs.xpath(
                    u"//div[contains(text(),'收货地址:')]/following-sibling::div[1]/text()"
                ).re(r'\S+'))
            item["receiveridno"] = "".join(
                hxs.xpath(
                    u"translate(//div[contains(text(),'收货人信息:')]/following-sibling::div[2]/text(),',','')"
                ).re(r"\S+"))
            item["paycount"] = "".join(
                hxs.xpath(
                    u"//div[contains(text(),'应付金额:')]/following-sibling::div[1]/b/text()"
                ).re(r'%s' % price_re))
        else:
            item["paycount"] = "".join(
                hxs.xpath(
                    u"translate(//li[contains(text(),'在线支付:')]/text()|//span[contains(text(),'应支付金额:')]/following-sibling::div[1]/span/text()|//td[contains(text(),'商品金额')]/following-sibling::td[1]/strong/text(),'在线支付:','')"
                ).re(r'%s' % price_re))
            item["receiveridno"] = ""
            item["receivername"] = "".join(
                hxs.xpath(
                    u"//span[contains(text(),'收货人:')]/following-sibling::div[1]/text()|//td[contains(text(),'收货人姓名')]/following-sibling::td[1]/text()"
                ).re(r'\S+'))
            item["receiverphone"] = "".join(
                hxs.xpath(
                    u"translate(//li[contains(text(),'手机号码:')]/text()|//span[contains(text(),'手机号码:')]/following-sibling::div[1]/text()|//td[contains(text(),'固定电话')]/following-sibling::td[1]/text(),'手机号码:','')"
                ).re(r"\S+"))
            item["receiveraddress"] = "".join(
                hxs.xpath(
                    u"//span[contains(text(),'地址:')]/following-sibling::div[1]/text()|//td[contains(text(),'地址')]/following-sibling::td[1]/text()"
                ).re(r'\S+'))
        item["paytime"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'付款时间:')]/following-sibling::div[1]/text()|//td[contains(text(),'下单时间')]/following-sibling::td[1]/text()"
            ).re(r'\S+'))
        item["billtype"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'发票类型:')]/following-sibling::div[1]/text()|//td[contains(text(),'发票类型')]/following-sibling::td[1]/text()"
            ).re(r'\S+'))
        item["billtitle"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'发票抬头:')]/following-sibling::div[1]/text()|//td[contains(text(),'发票抬头')]/following-sibling::td[1]/text()"
            ).re(r'\S+'))
        item["billcontent"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'发票内容:')]/following-sibling::div[1]/text()|//td[contains(text(),'发票内容')]/following-sibling::td[1]/text()"
            ).re(r'\S+'))
        _item = []
        for goods in hxs.xpath(
                "//tr[contains(@class,'product-')]|//table[@class='tb-void tb-none']/tbody/tr|//td[@class='itemName']/../following-sibling::tr"
        ):
            _goods = {}
            _goods["itemname"] = "".join(
                goods.xpath(".//a[contains(@href,'item.jd')]/text()").re(r'\S+'))
            _goods["itemprice"] = "".join(
                goods.xpath(
                    ".//*[@class='f-price']/text()|.//td[3]/strong/text()|.//*[@class='jdPrice']/text()"
                ).re(r'%s' % price_re))
            _goods["itemnum"] = "".join(
                goods.xpath(".//td[5]/text()|.//*[@class='num']/text()").re(
                    r'%s' % price_re))
            _goods["itemid"] = "".join(
                goods.xpath(".//a[contains(@href,'item.jd')][1]/@href").re(
                    r'%s' % price_re))
            _goods["itemurl"] = urljoin(
                self.start_urls[0], "".join(
                    goods.xpath(
                        ".//a[contains(@href,'item.jd')][1]/@href").extract()))
            _item.append(_goods)
        item["items"] = _item
        item["jobid"] = self.jobid
        self.items.append(item)
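parse_items leans on a module-level price_re pattern that these excerpts never define. A plausible stand-in (an assumption, not the project's actual regex) that captures integer and decimal figures:

    # Hypothetical definition of the price_re used throughout parse_items.
    price_re = r"\d+\.?\d*"  # matches "128", "128.5", "0.99", ...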
Code example #24
    def parse_get_login(self, response):
        # f = open("parse_get_login.html","w")
        # f.write(response.body)
        # f.close()
        html = HtmlXPathSelector(response)
        self.cxyd = "当前年度"
        self.zgzh = ''.join(
            html.xpath("//*[@name='zgzh']/@value").extract())  ## employee account number
        self.sfzh = ''.join(
            html.xpath("//*[@name='sfzh']/@value").extract())  ## ID card number
        self.zgxm = ''.join(
            html.xpath("//*[@name='zgxm']/@value").extract())  ## employee name
        self.dwbm = ''.join(
            html.xpath("//*[@name='dwbm']/@value").extract())  ## employer code?
        self.zgzt = ''.join(
            html.xpath("//*[@name='zgzt']/@value").extract())  ## employee status (current status)
        # parentheses make the operator precedence explicit; the u-prefix on the
        # second '错误' check avoids an implicit ascii decode against unicode text
        if (not self.zgxm and not self.zgzh) or '错误' in response.body \
                or u'错误' in response.body.decode("gb2312", "ignore"):
            logger.error(msg="<%s>,Login HUNANGJJ error, password error!!!" %
                         self.idcard)
            self.sign = 0
            self.item_status["code"] = 2104
            self.con.hmset(self.key, dict(self.item_status))
            # print u"wrong password 2104"
            return
        else:
            self.item_status["code"] = 2102
            self.con.hmset(self.key, dict(self.item_status))
            logger.info(msg="<%s>,Login HUNANGJJ success!!!" % self.idcard)
            self.sign = 1
            # print u"login successful 2102"
        self.form_data = {
            "sfzh": str(self.sfzh),
            "zgxm": str(self.zgxm),
            "zgzh": str(self.zgzh),
            "dwbm": str(self.dwbm),
            "cxyd": str(self.cxyd),
            "zgzt": str(self.zgzt),
        }
        yield FormRequest(url=self.post_xx_url,
                          formdata=self.form_data,
                          callback=self.parse_get_xinxi,
                          dont_filter=True)
        yield FormRequest(url=self.post_mx_url,
                          formdata=self.form_data,
                          callback=self.parse_get_mx,
                          dont_filter=True)
        yield FormRequest(url=self.post_dk_url,
                          formdata=self.form_data,
                          callback=self.parse_get_daikuanxinxi,
                          dont_filter=True)
Code example #25
    def parse_get_daikuanxinxi(self, response):
        logger.info(msg="<%s>,get HUNANGJJ DKXX!!!" % self.idcard)
        # f = open("parse_get_daikuanxinxi.html", "w")
        # f.write(response.body.decode("gb2312", "ignore"))
        # f.close()
        if u"该职工没有贷款".encode("gb2312", "ignore") in response.body:  # u-prefix added: .encode() on a non-ascii byte string raises in Python 2
            logger.info(msg="<%s>,DKXX NO DATA!!!" % self.idcard)
            # print u"this employee has no loan"
            return
        html = HtmlXPathSelector(response)
        item = {}
        #item["贷款合同编号"] = ''.join(html.xpath("//tr[@class='jtpsoft'][1]/td[2]//text()").extract()).strip()
        item["userName"] = ''.join(
            html.xpath(u"//td[text()='姓名']/../td[4]//text()").extract()).strip()
        item["loanMoney"] = ''.join(
            html.xpath(u"//td[text()='贷款金额']/../td[2]//text()").extract()).strip()
        loanLimit = ''.join(
            html.xpath(u"//td[text()='贷款年限']/../td[4]//text()").extract()).strip()  ## loan term
        if loanLimit:
            loanLimit_year = loanLimit.replace(u"年", "")  # single unicode replace; the old mixed str/unicode double replace could raise
            item["loanLimit"] = int(loanLimit_year) * 12
        else:
            item["loanLimit"] = ""
        item["repaidPrincipal"] = ''.join(
            html.xpath(u"//td[text()='已还本金']/../td[2]//text()").extract()).strip()
        item["repaidInterest"] = ''.join(
            html.xpath(u"//td[text()='已还利息']/../td[4]//text()").extract()).strip()
        item["loanBalance"] = ''.join(
            html.xpath(u"//td[text()='贷款余额']/../td[2]//text()").extract()).strip()
        item["monthLeastRepayment"] = ''.join(
            html.xpath(u"//td[text()='月最低还款']/../td[4]//text()").extract()).strip()
        item["overdueMoney"] = ''.join(
            html.xpath(u"//td[text()='当前逾期金额']/../td[2]//text()").extract()).strip()
        item["overdueAccrual"] = ''.join(
            html.xpath(u"//td[text()='当前逾期利息']/../td[4]//text()").extract()).strip()
        item["repaidDay"] = ''.join(
            html.xpath(u"//td[text()='月还款日']/../td[2]//text()").extract()).strip()
        #item["还至年月"] = ''.join(html.xpath("//tr[@class='jtpsoft'][6]/td[4]//text()").extract()).strip()
        item["loanDay"] = ''.join(
            html.xpath(u"//td[text()='放款日期']/../td[2]//text()").extract()).strip()
        item["entrustBank"] = ''.join(
            html.xpath(u"//td[text()='受托银行']/../td[4]//text()").extract()).strip()
        item["loanInterestRate"] = ''.join(
            html.xpath(u"//td[text()='贷款利率']/../td[2]//text()").extract()).strip()
        item["overdueTimes"] = ''.join(
            html.xpath(u"//td[text()='当前逾期期数']/../td[4]//text()").extract()).strip()
        item["repaidType"] = ''.join(
            html.xpath(u"//td[text()='还款方式']/../td[2]//text()").extract()).strip()
        item["securityType"] = ''.join(
            html.xpath(u"//td[text()='担保方式']/../td[4]//text()").extract()).strip()
        item["loanType"] = ''.join(
            html.xpath(u"//td[text()='购房类型']/../td[2]//text()").extract()).strip()
        #item["历史逾期金额"] = ''.join(html.xpath("//tr[@class='jtpsoft'][10]/td[4]//text()").extract()).strip()
        #item["历史逾期期数"] = ''.join(html.xpath("//tr[@class='jtpsoft'][11]/td[2]//text()").extract()).strip()
        item["monthHedging"] = ''.join(
            html.xpath(u"//td[text()='是否办理月对冲']/../td[4]//text()").extract()).strip()
        logger.info(msg="<%s>,get HUNANGJJ DKXX OVER!!!" % self.idcard)
        self.tb = HbClient()
        self.tb.insert(colname=u"公积金-湖南-贷款信息查询 ",
                       url=response.url,
                       html=response.body,
                       struct_dic={"loan_info": [item]},
                       id=self.idcard,
                       post_dic=self.form_data,
                       token=self.token)
Code example #26
File: jd_spider.py Project: yh623962721/QbSpider
    def parse(self, response):
        self.passwd = urllib.unquote(self.settings.get(
            "PASSWD", None))  #.decode("ascii").encode("utf8")
        self.username = urllib.unquote(self.settings.get(
            "USERNAME", None))  #.decode("ascii").encode("utf8")
        self.jobid = urllib.unquote(self.settings.get(
            "JOBID", None))  #.decode("ascii").encode("utf8")
        self.vercode = urllib.unquote(self.settings.get(
            "VERCODE", None))  #.decode("ascii").encode("utf8")
        # self.passwd = "Zqp821907280&@#"
        # self.username = "******"
        # self.jobid = "y32783y2cnj2neckjn2c"
        # self.vercode = ""
        self.con.hmset(self.jobid, {"status": 0})
        hxs = HtmlXPathSelector(response)
        # encrypt the password with the RSA public key embedded in the login page
        pubKey = "".join(hxs.xpath('//input[@name="pubKey"]/@value').extract())
        keyDER = b64decode(pubKey)
        keyPub = RSA.importKey(keyDER)
        nloginpwd = b64encode(rsa.encrypt(b"%s" % self.passwd, keyPub))
        self.uuid = "".join(
            hxs.xpath('//input[@name="uuid"]/@value').extract())
        fp = "".join(hxs.xpath('//input[@name="fp"]/@value').extract())
        _t = "".join(hxs.xpath('//input[@name="_t"]/@value').extract())
        loginType = "".join(
            hxs.xpath('//input[@name="loginType"]/@value').extract())
        eid = "".join(hxs.xpath('//input[@name="eid"]/@value').extract())
        self.authcode = ""
        self.post_data = {
            "uuid": self.uuid,
            "eid": eid,
            "fp": fp,
            "_t": _t,
            "loginType": loginType,
            "loginname": "%s" % self.username,
            "nloginpwd": nloginpwd,
            "chkRememberMe": "on",
            "authcode": self.authcode,
        }
        auth_dat = {
            'loginName': self.username,
        }
        # rebuild the session cookies from the Set-Cookie response headers
        self.sess = {}
        cookie = [
            i.split(";")[0] for i in response.headers.getlist('Set-Cookie')
        ]
        for cook in cookie:
            self.sess.update(
                {cook[:cook.index("=")]: cook[cook.index("=") + 1:]})
        code_url = self.code_url % (random.random(), 2015)
        yield FormRequest(url=code_url,
                          cookies=self.sess,
                          formdata=auth_dat,
                          callback=self.checkauthcode)
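The RSA step above appears to mix two libraries: RSA.importKey comes from PyCrypto, while rsa.encrypt from the separate rsa package expects that package's own PublicKey type. A sketch of the same step using the rsa package alone, offered as an assumption about the intent rather than the project's actual imports:

    # Sketch: password encryption with the "rsa" package only.
    import rsa
    from base64 import b64decode, b64encode

    def encrypt_passwd(pub_key_b64, passwd):
        key_der = b64decode(pub_key_b64)                     # DER-encoded key from the login form
        pub = rsa.PublicKey.load_pkcs1_openssl_der(key_der)  # parse X.509 SubjectPublicKeyInfo
        return b64encode(rsa.encrypt(passwd, pub))           # PKCS#1 v1.5 encrypt, then base64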
Code example #27
    def parse_get_mx(self, response):
        logger.info(msg="<%s>,get HUNANGJJ MX!!!" % self.idcard)
        # f = open("parse_get_mx.html", "w")
        # f.write(response.body.decode("gb2312", "ignore"))
        # f.close()
        #print response.body.decode("gb2312", "ignore")
        html = HtmlXPathSelector(response)
        list_item = []
        for items in html.xpath("//*[@class='jtpsoft']"):
            item = {}
            item["payDate"] = ''.join(
                items.xpath(".//td[1]/text()").extract())  ## payment date
            if not item["payDate"]:
                continue
            item["balanceRmb"] = ''.join(
                items.xpath(".//td[4]/text()").extract())  ## balance
            item["payType"] = ''.join(
                items.xpath(".//td[6]/text()").extract())  ## payment type / summary
            item["debtorRmb"] = ''.join(
                items.xpath(".//td[2]/text()").extract())  ## debit amount
            item["lenderRmb"] = ''.join(
                items.xpath(".//td[3]/text()").extract())  ## credit amount
            item["trend"] = ''.join(
                items.xpath(".//td[5]/text()").extract())  ## debit/credit direction
            list_item.append(item)
        if not list_item:
            self.sign = 0
            self.item_status["fatch_code"] = 2199
            self.con.hmset(self.key, dict(self.item_status))
            logger.error(msg="<%s>,get HUNANGJJ MX NO DATA!!!" % self.idcard)
            # print u"collection failed, no data! 2199"
        else:
            dic_item = {"detail_info": list_item}
            self.tb = HbClient()
            self.tb.insert(colname=u"公积金-湖南-明细查询 ",
                           url=response.url,
                           html=response.body,
                           struct_dic=dic_item,
                           id=self.idcard,
                           post_dic=self.form_data,
                           token=self.token)
            logger.info(msg="<%s>,get HUNANGJJ MX OVER!!!" % self.idcard)