Beispiel #1
0
    def parse_start_url(self, response):
        """Yield one bidding item per anchor found in the ``newslist`` list.

        Args:
            response: Listing-page response; it must support ``xpath`` and
                ``urljoin`` and expose ``request.url``.

        Yields:
            BiddinginfospiderItem: item carrying ``code``, ``industry``,
            ``category``, ``title``, ``ctime`` and ``href``.  ``ctime`` is
            ``None`` when the anchor has no ``newsDate`` text.
        """
        print('request_url= ', response.request.url)
        for a in response.xpath('//ul[@class="newslist"]//a'):
            item = BiddinginfospiderItem()
            title = a.xpath("..//h1").xpath('normalize-space(string(.))').get()
            href = a.xpath('.//@href').get()
            if href:
                # Store an absolute link; a missing href stays None.
                href = response.urljoin(href)
            code = a.xpath(
                './/ul[@class="newsinfo"]//li[1]//span//text()').get()
            raw_date = a.xpath('.//div[@class="newsDate"]').xpath(
                'normalize-space(string(.))').get()

            # Only reshape the date when there is one: slicing None would
            # raise TypeError and abort the whole page.
            ctime = None
            if raw_date:
                raw_date = raw_date.replace(" ", "").replace("/", "-")
                # NOTE(review): this drops the character at index 3 and puts
                # "-" in its place; confirm against the page's date layout.
                ctime = raw_date[:3] + "-" + raw_date[4:]

            item.update(
                code=code,
                industry=self.industry,
                category=self.category,
                title=title,
                ctime=ctime,
                href=href,
            )
            yield item
Beispiel #2
0
    def parse_start_url(self, response):
        """Yield one bidding item per record of the JSON ``obj`` array.

        The detail link is ``?getNoticeDetail&url=<URL>&id=<ID>`` resolved
        against the response URL, and the record's ``TABLENAME`` is mapped
        through ``self.category_dict`` to obtain the category.

        Args:
            response: JSON response whose body is decoded as UTF-8.

        Yields:
            BiddinginfospiderItem: item carrying ``category``, ``title``,
            ``ctime``, ``href`` and ``code``.
        """
        print('request_url= ', response.request.url)
        payload = json.loads(str(response.body, "utf-8"))
        records = payload.get("obj")
        print(len(records))
        for record in records:
            item = BiddinginfospiderItem()
            detail_query = record.get("URL", "") + "&id=" + record.get("ID", "")
            href = response.urljoin("?getNoticeDetail&url=" + detail_query)

            print(href)
            item.update(
                category=self.category_dict[record.get("TABLENAME")],
                title=record.get("PROJECTNAME"),
                ctime=record.get("RECEIVETIME"),
                href=href,
                code=record.get("PROJECTCODE"),
            )
            yield item
Beispiel #3
0
    def parse_page(self, response):
        """Yield one bidding item per record of the JSON ``data`` array.

        ``city`` is the record's ``districtShow`` value, suffixed with ``-``
        and the result of ``self.getSHI(platformName)`` when that result is
        truthy.

        Args:
            response: JSON response whose body is decoded as UTF-8.  A falsy
                response, or a payload without ``data``, produces no items.

        Yields:
            BiddinginfospiderItem: item carrying ``city``, ``title``,
            ``ctime``, ``category``, ``href`` and ``industry``.
        """
        if not response:
            # A value returned from a generator never reaches the consumer,
            # so just stop instead of building a throwaway item.
            return

        print('request_url= ', response.request.url)
        body = json.loads(str(response.body, "utf-8"))
        # Tolerate a payload without "data" rather than failing on len(None).
        records = body.get("data") or []
        print("Num :", len(records))

        for record in records:
            item = BiddinginfospiderItem()
            district = record.get('districtShow')
            sub_city = self.getSHI(record.get('platformName'))
            if sub_city:
                city = district + "-" + sub_city
            else:
                city = district

            href = record.get("url")
            print("href is,", href)
            item.update(
                city=city,
                title=record.get("title"),
                ctime=record.get("timeShow"),
                category=record.get("classifyShow"),
                href=href,
                industry=record.get("tradeShow"),
            )
            print("ITEM IS")
            yield item
Beispiel #4
0
    def parse(self, response):
        """POST for each listing page and yield one item per table row.

        Pages ``1`` up to, but not including, ``self.endPageNum`` are fetched
        from ``self.tmpl_url`` with ``self.headers``.  The ``response``
        argument itself is not read.

        Args:
            response: Response that triggered this callback (unused).

        Yields:
            BiddinginfospiderItem: item carrying ``title`` and ``href``.
            Rows without an ``onclick`` attribute are skipped.
        """
        article_tmp_url = 'https://b2b.10086.cn/b2b/main/viewNoticeContent.html?noticeBean.id={0}'
        # NOTE(review): range() stops before self.endPageNum; confirm the
        # last page is meant to be excluded.
        for page_no in range(1, self.endPageNum):
            form_data = {
                "page.currentPage": str(page_no),
                "page.perPageSize": "20",
                "noticeBean.companyName": "",
                "noticeBean.title": "",
                "noticeBean.startDate": "",
                "noticeBean.endDate": "",
            }
            # NOTE(review): this is a synchronous HTTP call made from inside
            # a crawler callback; kept as-is.
            page = requests.post(self.tmpl_url,
                                 headers=self.headers,
                                 data=form_data)
            rows = scrapy.Selector(text=page.text).xpath(
                '//table[@class="jtgs_table"]//tr')
            # The first row of the table is not a data row.
            for row in rows[1:]:
                onclick = row.xpath('@onclick').get()
                if not onclick:
                    # No handler means no notice id to build a link from;
                    # slicing None here would abort the whole crawl.
                    continue
                item = BiddinginfospiderItem()
                # The id sits between a 14-character prefix and a
                # 2-character suffix of the onclick value.
                notice_id = onclick[14:-2]
                item.update(
                    title=row.xpath(".//a").xpath('.//text()').get(),
                    href=article_tmp_url.format(notice_id),
                )
                yield item
Beispiel #5
0
 def parse_page(self, response):
     """Yield one item per ``<li>`` of the ``filter-content`` list.

     Args:
         response: Listing-page response supporting ``xpath``/``urljoin``.

     Yields:
         BiddinginfospiderItem: item carrying ``title``, ``ctime`` (as
         produced by ``self.get_ctime``) and the absolute ``href``.
     """
     for entry in response.xpath('//div[@class="filter-content"]/ul/li'):
         item = BiddinginfospiderItem()
         link = entry.xpath('./a')
         title = link.xpath('.//@title').extract_first()
         href = response.urljoin(link.xpath('.//@href').extract_first())
         time_nodes = link.xpath('.//span[@class="time"]//text()')
         item['title'] = title
         item['ctime'] = self.get_ctime(time_nodes)
         item['href'] = href
         yield item
Beispiel #6
0
 def parse_page(self, response):
     """Yield one item per ``titlecss`` block of the page.

     Args:
         response: Listing-page response; wrapped in a ``scrapy.Selector``.

     Yields:
         BiddinginfospiderItem: item carrying ``title``, the absolute
         ``href`` and ``ctime`` (``self.get_ctime`` applied to the text of
         the first ``<td>`` following the block's parent).
     """
     selector = scrapy.Selector(response)
     for block in selector.xpath('//div[@class="titlecss"]'):
         item = BiddinginfospiderItem()
         link = block.xpath(".//a")
         item['title'] = link.xpath('.//@title').get()
         item['href'] = response.urljoin(link.xpath('.//@href').get())
         date_nodes = block.xpath('../following-sibling::td[1]//text()')
         item['ctime'] = self.get_ctime(date_nodes)
         yield item
Beispiel #7
0
    def parse_shangdong_xml(self, response):
        """Yield items from an XML listing, then follow its next-page link.

        The text of all ``<record>`` elements is concatenated and parsed as
        HTML; every ``<li>`` found there becomes one item.  When
        ``self.biddingInfo_update`` is falsy and the document has
        ``<nextgroup>`` text, the link inside it is requested with this
        method as the callback.

        Args:
            response: XML response supporting ``xpath``/``urljoin``.

        Yields:
            BiddinginfospiderItem for each entry, optionally followed by one
            request for the next page.
        """
        # The response has already been parsed as XML at this point.
        record_markup = ''.join(response.xpath('//record/text()').extract())

        # Emit one item per list entry.
        for entry in etree.HTML(record_markup).xpath('//li'):
            item = BiddinginfospiderItem()
            link = entry.xpath(".//a")[0]
            item.update(
                href=link.xpath(".//@href")[0],
                title=link.xpath(".//text()")[0],
                ctime=entry.xpath('.//span//text()')[0],
            )
            yield item

        # Only a full crawl continues with the next page.
        if self.biddingInfo_update:
            return

        next_group = etree.XML(response.body).xpath('//nextgroup/text()')
        if not next_group:
            return
        next_href = etree.HTML(next_group[0]).xpath('//a/@href')[0]
        request = self.make_requests_from_url(response.urljoin(next_href))
        request.callback = self.parse_shangdong_xml
        yield request
Beispiel #8
0
    def parse(self, response):
        """Yield one item per list anchor inside the ``W750 Right`` block.

        Args:
            response: Listing-page response supporting ``xpath``/``urljoin``.

        Yields:
            BiddinginfospiderItem: item carrying the absolute ``href``, the
            stripped anchor text as ``title``, the first ``<span>`` text
            found from the anchor's parent as ``ctime``, and a fixed ``city``.
        """
        for link in response.xpath('//div[@class="W750 Right"]//li//a'):
            item = BiddinginfospiderItem()
            item['href'] = response.urljoin(
                link.xpath('.//@href').extract_first())
            item['title'] = link.xpath(".//text()").extract_first().strip()
            item['ctime'] = link.xpath('..//..//span//text()').extract_first()
            item['city'] = '南方电网'
            yield item
Beispiel #9
0
    def parse_page(self, response):
        """Yield one bidding item per entry of ``data.datalist``.

        Args:
            response: JSON response whose body is UTF-8 encoded and whose
                top-level object has a ``data`` member.

        Yields:
            BiddinginfospiderItem: item carrying ``href`` (detail URL built
            from the entry's identifiers), ``title``, ``ctime`` and a fixed
            ``city``.  A payload without ``datalist`` yields nothing.
        """
        detail_url = 'http://www.wysggzy.cn:81/hyweb/wysebid/bidDetails.do?handle=1&tenderProjCode={0}&noticeType=1&flag=1&tenderProjId={1}&proj_id={2}&pre_evaId=&evaId={3}&signUpType={4}'

        payload = json.loads(response.body.decode('utf8'))
        # Fall back to an empty list: a False default cannot be iterated and
        # would raise TypeError when "datalist" is absent.
        entries = payload['data'].get("datalist") or []

        for entry in entries:
            item = BiddinginfospiderItem()
            item['href'] = detail_url.format(
                entry['tenderProjCode'], entry['tenderProjId'],
                entry['proj_id'], entry['evaId'], entry['signUpType'])
            item['title'] = entry['noticeTitle']
            item['ctime'] = entry['sendTime']
            item['city'] = '武夷山'
            yield item
Beispiel #10
0
 def parse_start_url(self, response):
     """Yield one item per ``now-hd-items clearfix`` list entry.

     Args:
         response: Listing-page response supporting ``xpath``/``urljoin``
             and exposing ``request.url``.

     Yields:
         BiddinginfospiderItem: item carrying ``title``, ``ctime`` (from
         ``self.get_ctime`` on the entry's ``<span>`` text) and the
         absolute ``href``.
     """
     print('request_url= ', response.request.url)
     for entry in response.xpath('//li[@class="now-hd-items clearfix"]'):
         item = BiddinginfospiderItem()
         link = entry.xpath('./a')
         item['title'] = link.xpath('.//@title').extract_first()
         href = response.urljoin(link.xpath('.//@href').extract_first())
         item['ctime'] = self.get_ctime(entry.xpath('.//span//text()'))
         item['href'] = href
         yield item
Beispiel #11
0
 def parse_start_url(self, response):
     """Yield one item per table row inside the ``list_service`` block.

     Args:
         response: Listing-page response supporting ``xpath``/``urljoin``
             and exposing ``request.url``.

     Yields:
         BiddinginfospiderItem: item carrying ``category`` (spider
         attribute), ``title``, ``ctime`` (from ``self.get_ctime`` on the
         second cell's text) and the absolute ``href``.
     """
     print('request_url= ', response.request.url)
     for row in response.xpath('//div[@class="list_service"]//tr'):
         item = BiddinginfospiderItem()
         link = row.xpath('.//a')
         title = link.xpath('.//@title').extract_first()
         href = response.urljoin(link.xpath('.//@href').extract_first())
         ctime = self.get_ctime(row.xpath(".//td[2]//text()"))
         item['category'] = self.category
         item['title'] = title
         item['ctime'] = ctime
         item['href'] = href
         yield item
Beispiel #12
0
 def parse_start_url(self, response):
     """Yield one item per anchor inside ``<li name="li_name">`` entries.

     Args:
         response: Listing-page response supporting ``xpath``/``urljoin``
             and exposing ``request.url``.

     Yields:
         BiddinginfospiderItem: item carrying ``ctime`` (from
         ``self.get_ctime`` on the first ``<em>`` text), the spider's
         ``industry`` and ``category``, ``title`` and the absolute ``href``.
     """
     print('request_url= ', response.request.url)
     for link in response.xpath('//li[@name="li_name"]//a'):
         item = BiddinginfospiderItem()
         title = link.xpath('@title').get()
         href = response.urljoin(link.xpath('.//@href').get())
         item['ctime'] = self.get_ctime(link.xpath('.//em[1]//text()'))
         item['industry'] = self.industry
         item['category'] = self.category
         item['title'] = title
         item['href'] = href
         yield item
Beispiel #13
0
    def parse_start_url(self, response):
        """Yield one item per ``<li>`` of the ``ewb-news-items`` list.

        Args:
            response: Listing-page response supporting ``xpath``/``urljoin``
                and exposing ``request.url``.

        Yields:
            BiddinginfospiderItem: item carrying a fixed ``city``, ``title``,
            ``ctime`` (from ``self.get_ctime`` on the anchor's ``<span>``
            text) and the absolute ``href``.
        """
        print('request_url= ', response.request.url)
        for entry in response.xpath('//ul[@class="ewb-news-items"]//li'):
            item = BiddinginfospiderItem()
            link = entry.xpath('.//a')
            title = link.xpath('.//@title').extract_first()
            href = response.urljoin(link.xpath('.//@href').extract_first())
            ctime = self.get_ctime(link.xpath('.//span//text()'))

            item['city'] = "湖北"
            item['title'] = title
            item['ctime'] = ctime
            item['href'] = href
            yield item
Beispiel #14
0
 def parse_start_url(self, response):
     """Yield one item per ``<li class="list-item">`` entry.

     Args:
         response: Listing-page response supporting ``xpath``/``urljoin``
             and exposing ``request.url``.

     Yields:
         BiddinginfospiderItem: item carrying the spider's ``industry`` and
         ``category``, ``title``, ``ctime`` (from ``self.get_ctime`` on the
         anchor's ``<span>`` text) and the absolute ``href``.
     """
     print('request_url= ', response.request.url)
     for entry in response.xpath('//li[@class="list-item"]'):
         item = BiddinginfospiderItem()
         link = entry.xpath('./a')
         title = link.xpath('.//@title').extract_first()
         href = response.urljoin(link.xpath('.//@href').extract_first())
         ctime = self.get_ctime(link.xpath('.//span//text()'))
         item['industry'] = self.industry
         item['category'] = self.category
         item['title'] = title
         item['ctime'] = ctime
         item['href'] = href
         yield item
Beispiel #15
0
 def parse_start_url(self, response):
     """Yield one item per table-cell anchor inside ``listChangeDiv``.

     Args:
         response: Listing-page response supporting ``xpath``/``urljoin``.

     Yields:
         BiddinginfospiderItem: item carrying the absolute ``href``, the
         text of the row's third cell as ``ctime`` and the anchor's
         ``title`` attribute.
     """
     for link in response.xpath('//div[@id="listChangeDiv"]//td//a'):
         item = BiddinginfospiderItem()
         item.update(
             href=response.urljoin(link.xpath('@href').get()),
             ctime=link.xpath('../../td[3]//text()').get(),
             title=link.xpath("@title").get(),
         )
         yield item
Beispiel #16
0
    def parse_start_url(self, response):
        """Yield one item per ``<li>`` of the ``lb-link`` list.

        Args:
            response: Listing-page response supporting ``xpath``/``urljoin``.

        Yields:
            BiddinginfospiderItem: item carrying the spider's ``industry``
            and ``category``, ``title``, ``ctime`` (from ``self.get_ctime``
            on the ``bidDate`` span text) and ``href``.  ``href`` is resolved
            against the page URL and stays ``None`` when the entry has no
            link.
        """
        for entry in response.xpath('//div[@class="lb-link"]/ul//li'):
            item = BiddinginfospiderItem()
            link = entry.xpath('.//a')
            title = link.xpath("@title").get()
            href = link.xpath("@href").get()
            if href:
                # A relative link is useless once detached from its page,
                # so store the absolute form.
                href = response.urljoin(href)
            ctime = self.get_ctime(
                entry.xpath('.//span[@class="bidDate"]//text()'))

            item.update(
                industry=self.industry,
                category=self.category,
                title=title,
                ctime=ctime,
                href=href,
            )
            yield item
Beispiel #17
0
    def parse_page(self, response):
        """Yield one item per anchor inside the ``wsbs-table`` table.

        Args:
            response: Listing-page response supporting ``xpath``/``urljoin``
                and exposing ``request.url``.

        Yields:
            BiddinginfospiderItem: item carrying the spider's ``category``
            and ``industry``, the anchor text as ``title``, ``ctime`` (from
            ``self.get_ctime`` on the row's cell text), the absolute ``href``
            and a fixed ``city``.
        """
        print(response.request.url)
        for link in response.xpath('//table[@class="wsbs-table"]//a'):
            item = BiddinginfospiderItem()

            title = link.xpath('.//text()').extract_first()
            href = response.urljoin(link.xpath('.//@href').extract_first())
            ctime = self.get_ctime(link.xpath('../../td//text()'))
            item['category'] = self.category
            item['industry'] = self.industry
            item['title'] = title
            item['ctime'] = ctime
            item['href'] = href
            item['city'] = "广东"
            yield item
Beispiel #18
0
    def parse(self, response):
        """Yield one item per list anchor of the ``newslist`` list.

        Args:
            response: Listing-page response supporting ``xpath``/``urljoin``.

        Yields:
            BiddinginfospiderItem: item carrying the absolute ``href``, the
            fourth text node under the anchor's ``<h1>`` (stripped) as
            ``title``, and the ``newsDate`` text with ``/`` and spaces
            replaced by ``-`` as ``ctime``.
        """
        # Anchor elements of the listing.
        for link in response.xpath('//ul[@class="newslist"]//li//a'):
            item = BiddinginfospiderItem()
            href = response.urljoin(link.xpath('.//@href').get())
            heading_texts = link.xpath(".//h1//text()").getall()
            raw_date = link.xpath(
                'normalize-space(string(.//div[@class="newsDate"]))').get()
            item.update(
                href=href,
                title=heading_texts[3].strip(),
                ctime=raw_date.replace('/', '-').replace(' ', '-'),
            )
            yield item
Beispiel #19
0
 def parse_start_url(self, response):
     """Yield one item per ``<li>`` inside ``listChangeDiv``.

     Args:
         response: Listing-page response supporting ``xpath``/``urljoin``
             and exposing ``request.url``.

     Yields:
         BiddinginfospiderItem: item carrying the absolute ``href``, the
         anchor's ``title`` attribute and the first ``<span>`` text as
         ``ctime``.
     """
     print('reponse_request_url= ', response.request.url)
     for entry in response.xpath('//div[@id="listChangeDiv"]//li'):
         item = BiddinginfospiderItem()
         link = entry.xpath('.//a')
         item.update(
             href=response.urljoin(link.xpath('@href').get()),
             title=link.xpath('@title').get(),
             ctime=entry.xpath(".//span/text()").get(),
         )
         yield item
Beispiel #20
0
 def parse_page(self, response):
     """Yield one item per ``gridview1_RowStyle`` table row.

     Args:
         response: Listing-page response; wrapped in a ``Selector``.

     Yields:
         BiddinginfospiderItem: item carrying the spider's ``category`` and
         ``industry``, the anchor text as ``title``, ``ctime`` (from
         ``self.get_ctime`` on the row's last ``gridview_RowTD`` cell) and
         the absolute ``href``.
     """
     selector = Selector(response)
     for row in selector.xpath('//tr[@class="gridview1_RowStyle"]'):
         item = BiddinginfospiderItem()
         link = row.xpath(".//a")
         title = link.xpath('.//text()').extract_first()
         href = response.urljoin(link.xpath('.//@href').extract_first())
         date_cell = row.xpath('.//td[@class="gridview_RowTD"][last()]')
         ctime = self.get_ctime(date_cell)
         item['category'] = self.category
         item['industry'] = self.industry
         item['title'] = title
         item['ctime'] = ctime
         item['href'] = href
         yield item
Beispiel #21
0
    def parse(self, response):
        """Yield one item per list anchor inside the ``lb-link`` block.

        Args:
            response: Listing-page response supporting ``xpath``/``urljoin``.

        Yields:
            BiddinginfospiderItem: item carrying the absolute ``href``, the
            stripped ``title`` attribute and the text of the anchor's third
            ``<span>`` as ``ctime``.
        """
        # Anchor elements of the listing.
        for link in response.xpath('//div[@class="lb-link"]//ul//li//a'):
            item = BiddinginfospiderItem()
            item.update(
                href=response.urljoin(link.xpath('.//@href').get()),
                title=link.xpath(".//@title").get().strip(),
                ctime=link.xpath('.//span[3]//text()').get(),
            )
            yield item
Beispiel #22
0
    def parse(self, response):
        """Yield one item per element of the top-level JSON array.

        The first ten characters of each ``publish_time`` are read as
        integer seconds and formatted as ``YYYY-MM-DD`` in local time.

        Args:
            response: JSON response whose body is UTF-8 encoded.

        Yields:
            BiddinginfospiderItem: item carrying ``href`` (``pdf_url``),
            ``title`` (``message_title``) and ``ctime``.
        """
        for record in json.loads(response.body.decode('utf8')):
            item = BiddinginfospiderItem()
            item['href'] = record['pdf_url']
            item['title'] = record['message_title']
            epoch_seconds = int(str(record['publish_time'])[0:10])
            item['ctime'] = time.strftime(
                "%Y-%m-%d", time.localtime(epoch_seconds))
            yield item
Beispiel #23
0
 def parse_start_url(self, response):
     """Yield one item per anchor in the markup embedded in the response.

     The response text minus its first 11 and last 2 characters is parsed
     as HTML.
     NOTE(review): presumably this strips a script/JSONP-style wrapper;
     confirm against the endpoint's output.

     Args:
         response: Response exposing ``text``.

     Yields:
         BiddinginfospiderItem: item carrying ``href`` joined onto the
         site root, the text of the row's third cell as ``ctime`` and the
         anchor's ``title`` attribute.
     """
     markup = Selector(text=response.text[11:-2])
     for link in markup.xpath('//a'):
         item = BiddinginfospiderItem()
         item.update(
             href=urljoin("http://qdstc.qingdao.gov.cn",
                          link.xpath('@href').get()),
             ctime=link.xpath('../../td[3]//text()').get(),
             title=link.xpath("@title").get(),
         )
         yield item
 def parse_start_url(self, response):
     """Yield one item per record of the JSON ``rows`` array.

     Args:
         response: JSON response whose body is decoded as UTF-8 and which
             exposes ``request.url``.

     Yields:
         BiddinginfospiderItem: item carrying the spider's ``industry`` and
         ``category``, ``title`` (``TITLE``), ``ctime`` (``CREATED_TIME``)
         and ``href`` (article URL built from ``ARTICLE_ID``).
     """
     print('request_url= ', response.request.url)
     payload = json.loads(str(response.body, "utf-8"))
     article_tmp_url = "http://epp.ctg.com.cn/infoview/?fileId={0}&openFor=ZBGG&typeFor=undefined"
     for row in payload.get("rows"):
         item = BiddinginfospiderItem()
         item['industry'] = self.industry
         item['category'] = self.category
         item['title'] = row.get('TITLE')
         item['ctime'] = row.get('CREATED_TIME')
         item['href'] = article_tmp_url.format(row.get('ARTICLE_ID'))
         yield item
Beispiel #25
0
    def parse_page(self, response):
        """Yield one item per abstract box of the listing page.

        Args:
            response: Listing-page response supporting ``xpath``/``urljoin``.

        Yields:
            BiddinginfospiderItem: item carrying ``title``, ``ctime`` (from
            ``self.get_ctime`` on the second ``<i>`` text), the absolute
            ``href`` and ``city`` (second ``<span>`` of the second ``<p>``
            in the box's second content column).
        """
        boxes = response.xpath(
            '//div[@class="abstract-box mg-t25 ebnew-border-bottom mg-r15"]')
        for box in boxes:
            item = BiddinginfospiderItem()
            link = box.xpath('.//a')

            item['title'] = link.xpath('.//@title').extract_first()
            href = response.urljoin(link.xpath('.//@href').extract_first())
            item['ctime'] = self.get_ctime(box.xpath('.//i[2]//text()'))
            item['href'] = href
            item['city'] = box.xpath(
                './/div[@class="abstract-content-items fl pd-l15 pd-t20 pd-b20 width-50"][2]//p[2]//span[2]//text()'
            ).extract_first()
            yield item
Beispiel #26
0
 def parse_page(self, response):
     """Yield one item per anchor inside ``listCss`` table cells.

     The article id is cut out of the anchor's ``href`` value: literal
     backslash-``r`` / backslash-``n`` sequences and all whitespace are
     removed, then characters ``[23:-3]`` are kept.

     Args:
         response: Listing-page response; wrapped in a ``scrapy.Selector``.

     Yields:
         BiddinginfospiderItem: item carrying the anchor's normalised text
         as ``title``, the article URL as ``href`` and ``ctime`` (from
         ``self.get_ctime`` on the first following sibling cell's text).
     """
     article_tmp_url = 'http://ec.ccccltd.cn/PMS/gysCggg.shtml?id={0}'
     selector = scrapy.Selector(response)
     for link in selector.xpath('//td[@class="listCss"]//a'):
         item = BiddinginfospiderItem()
         item['title'] = link.xpath('normalize-space(string(.))').get()
         raw_href = link.xpath('.//@href').get()
         unescaped = raw_href.replace("\\r", "").replace("\\n", "")
         article_id = "".join(unescaped.split())[23:-3]
         item['href'] = article_tmp_url.format(article_id)
         item['ctime'] = self.get_ctime(
             link.xpath('../following-sibling::td[1]//text()'))
         yield item
Beispiel #27
0
 def parse(self, response):
     """Yield one item per record of the JSON ``_rows`` array.

     Args:
         response: JSON response whose body is UTF-8 encoded.

     Yields:
         BiddinginfospiderItem: item carrying ``href`` (detail URL ending
         in the record's ``id``), ``title`` (``v_notice_title``) and
         ``ctime`` (``d_createdate``).
     """
     detail_prefix = 'http://ecm.crcc.cn/unlogin/queryPurchaseTenderDetailInit.jhtml?model=1&id='
     payload = json.loads(response.body.decode('utf8'))
     for row in payload['_rows']:
         item = BiddinginfospiderItem()
         item.update(
             href=detail_prefix + row['id'],
             title=row['v_notice_title'],
             ctime=row['d_createdate'],
         )
         yield item
Beispiel #28
0
 def parse(self, response):
     """Yield one item per record of the JSON ``_rows`` array.

     Args:
         response: JSON response whose body is UTF-8 encoded.

     Yields:
         BiddinginfospiderItem: item carrying ``href`` (detail URL ending
         in the stringified ``inviteid``), ``title`` (``texttitle``) and
         ``ctime`` (first ten characters of ``recordtime``).
     """
     detail_prefix = 'http://ece.crcc.cn/homepage/inviteInfo.jhtml?type=1&id='
     payload = json.loads(response.body.decode('utf8'))
     for row in payload['_rows']:
         item = BiddinginfospiderItem()
         item.update(
             href=detail_prefix + str(row['inviteid']),
             title=row['texttitle'],
             ctime=row['recordtime'][0:10],
         )
         yield item
Beispiel #29
0
 def parse_start_url(self, response):
     """Yield one item per data row of the ``table_text`` table.

     Args:
         response: Listing-page response supporting ``xpath``/``urljoin``
             and exposing ``request.url``.

     Yields:
         BiddinginfospiderItem: item carrying ``industry`` (second cell),
         ``title``, ``ctime`` (from ``self.get_ctime`` on the fifth cell's
         span text), the absolute ``href`` and ``city`` (``title``
         attribute of the third cell's span).
     """
     print('request_url= ', response.request.url)
     rows = response.xpath('//table[@class="table_text"]//tr')
     # The first row of the table is skipped.
     for row in rows[1:]:
         item = BiddinginfospiderItem()
         link = row.xpath('.//a')
         title = link.xpath('.//@title').extract_first()
         href = response.urljoin(link.xpath('.//@href').extract_first())
         # Cells belong to the row, not to the anchor: looking up td[5]
         # below the <a> element can never match, so query the row like the
         # industry and city columns do.
         ctime = self.get_ctime(row.xpath('.//td[5]//span//text()'))
         industry = row.xpath(".//td[2]//span//text()").extract_first()
         city = row.xpath(".//td[3]//span//@title").extract_first()
         item.update(
             industry=industry,
             title=title,
             ctime=ctime,
             href=href,
             city=city,
         )
         yield item
Beispiel #30
0
    def parse(self, response):
        """Yield one item per entry of the ``Table`` list in the payload.

        The JSON body's ``return`` member is a string that is evaluated as a
        Python expression; the ``Table`` member of the result is iterated.

        Args:
            response: JSON response whose body is UTF-8 encoded.

        Yields:
            BiddinginfospiderItem: item carrying the absolute ``href``,
            ``title``, ``ctime`` (``infodate``) and ``city`` (``infoc``).
        """
        rs = json.loads(response.body.decode('utf8'))
        # SECURITY NOTE(review): eval() executes text received from the
        # remote server as Python code, with this function's locals in
        # scope.  If the string is a plain literal, ast.literal_eval or
        # json.loads would be a safer replacement; confirm the payload
        # format before changing it.
        rs = eval(rs['return'])['Table']
        for a in rs:
            item = BiddinginfospiderItem()
            item['href'] = response.urljoin(a['href'])
            item['title'] = a['title']
            item['ctime'] = a['infodate']
            item['city'] = a['infoc']

            # yield scrapy.Request(url=item['href'], dont_filter=True, callback=self.parse_item, meta={'meta': item, })
            yield item