Example #1
    def parse_item(self, response):
        item = DongguanItem()

        # URL
        item['url'] = response.url
        # title
        title = response.xpath(
            '//div[@class="wzy1"]//td/span[@class="niae2_top"]/text()'
        ).extract()[0]
        item['title'] = title
        # number
        number = response.xpath('//div[@class="wzy1"]//td/span[2]/text()'
                                ).extract()[0].split(':')[-1]
        item['number'] = number
        # content
        content = response.xpath(
            '//div[@class="wzy1"]//tr[1]/td[@class="txt16_3"]/text()').extract(
            )

        # fall back to the contentext div when the plain text is missing or only &nbsp;
        if len(content) == 0 or (len(content) == 1
                                 and content[0].replace(u'\xa0', u'') == ''):
            content = response.xpath(
                '//div[@class="wzy1"]//tr[1]/td[@class="txt16_3"]/div[@class="contentext"]/text()'
            ).extract()
        item['content'] = "".join(content).strip()

        # hand the item off to the pipeline
        yield item
Example #2
    def parse_item(self, response):
        item = DongguanItem()

        # number
        item['number'] = response.xpath(
            "//div[@class='pagecenter p3']//strong/text()").extract(
            )[0].replace('\xa0', '').split(':')[-1]
        # URL
        item['url'] = response.url
        # title
        item['title'] = response.xpath(
            "//div[@class='pagecenter p3']//strong/text()").extract(
            )[0].replace('\xa0', '').split('编号')[0]
        # content: first grab the direct text of the c1 text14_2 div
        content = "".join(
            response.xpath(
                "//div[@class='pagecenter p3']//div[@class='c1 text14_2']/text()"
            ).extract()).replace("\xa0", "")
        # check whether anything was extracted
        if content != "":
            # non-empty: a plain-text post
            item['content'] = content
        else:
            # empty: the text lives in the nested contentext div (post with images)
            item['content'] = "".join(
                response.xpath(
                    "//div[@class='pagecenter p3']//div[@class='c1 text14_2']/div[@class='contentext']/text()"
                ).extract()).replace("\xa0", "")

        # hand the item off to the pipeline
        yield item
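
All of these callbacks fill a DongguanItem defined elsewhere in each project. A minimal sketch of what such an items.py could declare, with the field names inferred from the examples on this page (the exact field set varies per project, so treat this as an assumption rather than any project's real items.py):

# Hypothetical items.py sketch; field names are inferred from the parse
# callbacks on this page, not copied from any specific project.
import scrapy


class DongguanItem(scrapy.Item):
    title = scrapy.Field()    # post title
    number = scrapy.Field()   # post number parsed out of the title
    content = scrapy.Field()  # post body text
    url = scrapy.Field()      # detail-page URL
    status = scrapy.Field()   # handling status (used by some examples)
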
Example #3
 def parse(self, response):
     item = DongguanItem()
     node_list = response.xpath(
         '//*[@id="morelist"]/div/table[2]//tr/td/table')
     for node in node_list:
         id_list = node.xpath('.//tr/td[1]/text()').extract()
         title_list = node.xpath('.//tr/td[2]/a[2]/text()').extract()
         address_list = node.xpath('.//tr/td[2]/a[3]/text()').extract()
         handling_list = node.xpath('.//tr/td[3]/span/text()').extract()
         datime_list = node.xpath('.//tr/td[5]/text()').extract()
         # item['content'] = response.xpath('//div[@class="contentext"]/text() | //div[@class="c1 text14_2"]/text()').extract_first()
         for Id, title, address, handling, datime in zip(
                 id_list, title_list, address_list, handling_list,
                 datime_list):
             item['Id'] = Id
             item['title'] = title
             item['address'] = address
             item['handling'] = handling
             item['datime'] = datime
             yield item
     # paginate once, after all rows on this list page have been yielded
     if len(node_list) == 0:
         return
     if self.offset < 32460:
         self.offset += 30
         print('Crawling page ' + str(self.offset // 30) + '.....')
         yield scrapy.Request(
             url='http://wz.sun0769.com/index.php/question/questionType?type=4&page=' + str(self.offset),
             callback=self.parse)
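
The offset-based pagination in Example #3 assumes spider attributes along the lines below. The class name and the starting URL are assumptions inferred from the request URL in the example, not taken from the original project:

# Hypothetical spider skeleton assumed by the pagination in Example #3.
import scrapy


class SunSpider(scrapy.Spider):
    name = 'sun'   # assumed name, for illustration only
    offset = 0     # list-page offset, advanced by 30 per page
    start_urls = [
        'http://wz.sun0769.com/index.php/question/questionType?type=4&page=0',
    ]
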
Example #4
    def parse_item(self, response):
        # raw title looks like: "提问:梁屋山边路与梁屋北路违停严重  编号:194771" (question text, then the post number)
        item = DongguanItem()
        re_title = Selector(response=response).xpath(
            "/html/body/div[6]/div/div[1]/div[1]/strong/text()").extract()[0]
        # title =re.match(r'提问:(\S+) +编号:(\d+)',re_title).group(1)
        title = re_title.split(" ")[0].split(u':')[-1]
        # number =re.match(r'提问:(\S+) +编号:(\d+)',re_title).group(2)
        number = re_title.split(" ")[-1].split(u':')[-1]
        content = Selector(response=response).xpath(
            "//html/body/div[6]/div/div[2]/div[1]/text()").extract()
        # re_date_time=Selector(response=response).xpath("//p[@class='te12h']/text()").extract()
        # data_time = re.match(r'\S+ 发言时间:(\w \w)',re_date_time).group(1)
        # date_time = re_date_time.split(' ')[1].split(':')[-1]
        status = Selector(response=response).xpath(
            "//div[@class='audit']/div[@class='cleft']/span/text()").extract()

        item['title'] = title
        item['number'] = number
        item['content'] = content[0]
        # item['date_time'] = date_time
        item['status'] = status[0]

        print '\n', re_title, '\n'

        yield item
Example #5
    def parse_res(self, response):
        # print(response.text)
        # item = DongguanItem()
        # doc = pq(etree.HTML(response.text))
        # text = doc('.p3 .tgray14').text()
        # item['question'] = re.search('提问:(.*?)编号',text).group(1)
        # item['content'] = doc('.c1').text()
        # item['status'] = doc('.text14_2 .cleft').text().split(':')[-1]
        # item['time'] = doc('.text14_2 .cright .te12h').text().split(':')[-1]

        item = DongguanItem()
        item['question'] = response.xpath(
            '//div[@class="wzy1"]/table[1]//td[2]/span[1]/text()')[0].extract(
            ).strip().replace('\xa0', '')
        item['content'] = response.xpath(
            '//div[@class="wzy1"]/table[2]//tr[1]/td/text()')[0].extract(
            ).replace('\xa0', '')
        item['status'] = response.xpath(
            '//div[@class="wzy3_1"]/span/text()')[0].extract().replace(
                '\xa0', '')
        item['time'] = response.xpath(
            '//div[@class="wzy3_2"]/span[1]/text()')[0].extract().replace(
                '\xa0', '')

        yield item
Example #6
    def parse_item(self, response):
        item = DongguanItem()
        title_block = response.xpath(
            '//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()'
        ).extract()[0]
        title_list = title_block.split(
            u"\xa0\xa0")  # the HTML &nbsp; entities come through as \xa0 in unicode, so split on them
        # logging.debug(title_list)
        title = title_list[0].split(':')[1]
        code = title_list[1].split(':')[1]
        # logging.debug(title, code)

        # the content may include images
        # with images the text sits in div.contentext, otherwise in div.c1 text14_2
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        if len(content) == 0:
            content = response.xpath(
                '//div[@class="c1 text14_2"]/text()').extract()
        content = ''.join(content)
        content = content.replace('  ', '')
        # logging.debug(content)

        url = response.url

        item['title'] = title if title else ''
        item['content'] = content if content else ''
        item['url'] = url if url else ''
        item['code'] = code if code else ''

        yield item
Example #7
    def parse_item(self, response):

        item = DongguanItem()
        item['number'] = response.xpath(
            "//div[@class='pagecenter p3']//strong/text()").extract(
            )[0].replace('\xa0', '').split(':')[-1]
        item['url'] = response.url
        item['title'] = response.xpath(
            "//div[@class='pagecenter p3']//strong/text()").extract(
            )[0].replace('\xa0', '').split('编号')[0]
        # posts with images and plain-text posts use different markup, so check which layout this page has
        content = "".join(
            response.xpath(
                "//div[@class='pagecenter p3']//div[@class='c1 text14_2']/text()"
            ).extract()).replace("\xa0", "")
        # check whether anything was extracted
        if content != "":
            # non-empty: a plain-text post
            item['content'] = content
        else:
            # empty: the text lives in the nested contentext div (post with images)
            item['content'] = "".join(
                response.xpath(
                    "//div[@class='pagecenter p3']//div[@class='c1 text14_2']/div[@class='contentext']/text()"
                ).extract()).replace("\xa0", "")

        yield item
Example #8
    def parse_item(self, response):
        print("url:%s" % response.url)
        item = DongguanItem()
        # title
        item['title'] = response.xpath(
            '//head/title/text()').extract()[0].replace('_阳光热线问政平台', "")
        # number
        item['number'] = (
            response.xpath('//div[@class="pagecenter p3"]//strong//text()'
                           ).extract()[0]).split(':')[-1]

        # post content: first take the text list for the with-images layout
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # if there are no images, take the text list for the image-free layout
        if len(content) == 0:
            content = response.xpath(
                '//div[@class="c1 text14_2"]/text()').extract()

        item['content'] = "".join(content).strip()

        # URL
        item['url'] = response.url
        # post status
        item['status'] = response.xpath(
            '//div[@class="audit"]//span/text()').extract()[0]

        # poster
        item['net_friend'] = (
            response.xpath('//div[@class="cright"]//p//text()').extract()[0]
        ).split("发言时间")[0].split(":")[1].strip()
        # time
        item['time'] = (response.xpath('//div[@class="cright"]//p//text()').
                        extract()[0]).split("发言时间")[1].strip()
        yield item
Example #9
    def parse_item(self, response):
        print response.url
        item = DongguanItem()
        # title
        t_n = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()').extract(
            )[0]
        item['title'] = t_n.split()[-2][3:]
        # number
        item['number'] = t_n.split()[-1].split(":")[-1]

        # text content: first take the list for the with-images layout
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # if there are no images, take the list for the image-free layout
        if len(content) == 0:
            content = response.xpath(
                '//div[@class="c1 text14_2"]/text()').extract()
            # content is a list; join it into one string and strip leading/trailing whitespace
            item['content'] = "".join(content).strip()
        else:
            item['content'] = "".join(content).strip()

        # URL
        item['url'] = response.url

        yield item
Example #10
    def parse_item(self, response):
        item = DongguanItem()
        item['title'] = response.xpath(
            '//div[@class="greyframe"]//strong/text()').extract()[0]
        item['number'] = item['title'].split(' ')[-1].split(':')[-1]
        item['content'] = response.xpath(
            '//div[@class="contentext"]/text()').extract()[0]
        item['url'] = response.url

        yield item
Example #11
    def parse_item(self, response):
        item = DongguanItem()
        item['title'] = response.xpath('/html/body/div[6]/div/div[1]/div[1]/strong/text()').extract()[0]
        item['num'] = item['title'].split(" ")[-1].split(':')[-1]

        item['content'] = response.xpath('/html/body/div[6]/div/div[2]/div[1]/text()').extract()[0]
        item['url'] = response.url
        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        yield item
Example #12
    def parse_item(self, response):
        # i = {}
        # i['title'] = response.xpath('//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()').extract()[0].split()[0]
        # i['number'] = response.xpath('//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()').extract()[0].split()[-1].split(':')[-1]
        # i['content'] = response.xpath('string(//div[@class="c1 text14_2"])').extract()[0].split()
        # return i

        item = DongguanItem()
        item_loader = QItemLoader(item=DongguanItem(), response=response)

        item_loader.add_xpath(
            'title',
            '//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()')
        item_loader.add_xpath(
            'number',
            '//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()')
        item_loader.add_xpath('content', 'string(//div[@class="c1 text14_2"])')

        item = item_loader.load_item()  # build the populated item
        yield item
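
Example #12 relies on a custom QItemLoader that is not shown. A sketch of what such a loader might look like, assuming Scrapy's standard ItemLoader processors and the same title format the other examples split by hand; everything below is an assumption, not the project's actual loader:

# Hypothetical QItemLoader sketch; the real class is not shown above.
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst


class QItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
    # the raw title looks like u'提问:<question>\xa0\xa0编号:<number>'
    title_in = MapCompose(
        lambda s: s.replace(u'\xa0', u' ').split(u':')[1].split()[0])
    number_in = MapCompose(lambda s: s.split(u':')[-1])
    content_in = MapCompose(lambda s: s.strip())
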
Example #13
 def parse_details(self, response):
     item = DongguanItem()
     item['name'] = response.xpath(
         '//div[@class="pagecenter p3"]//strong[@class="tgray14"]/text()'
     ).extract()[0]
     print(item['name'])
     item['num'] = item['name'].split(' ')[-1].split(":")[-1]
     item['detail_link'] = response.url
     item['content'] = response.xpath(
         '//div[@class="content text14_2"]/div/text()').extract()[0]
     yield item
Example #14
    def parse_item(self, response):
        item = DongguanItem()
        item['title'] = response.xpath(
            '//div[contains(@class, "pagecenter p3")]//strong/text()').extract(
            )[0]
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        item['content'] = response.xpath(
            '//div[@class="c1 text14_2"]/text()').extract()[0]
        item['url'] = response.url

        yield item
Example #15
 def list_detail(self, response):
     item = DongguanItem()
     doc = pq(response.text)
     tr_list = doc("#houseTable_1 tr:gt(0)").items()  # every tr except the header row
     for tr in tr_list:
         item['projectUrl'] = self.base_url + tr('td:eq(1) a').attr('href')  # link href from td index 1
         item['projectTitle'] = tr('td:eq(1) a').text()
         yield scrapy.Request(
             item['projectUrl'],
             callback=self.floorsDetail,
             meta={'item':copy.deepcopy(item)}
         )
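
Example #15 reuses a single item instance across loop iterations, so it passes copy.deepcopy(item) through meta; without the copy, every queued request would end up seeing the last row's data. The floorsDetail callback it points at is not shown; a minimal sketch of how such a callback might pick the item back up (the body is an assumption):

 # Hypothetical floorsDetail sketch (a method on the same spider class);
 # only the meta hand-off is grounded in Example #15.
 def floorsDetail(self, response):
     item = response.meta['item']  # the deep-copied item from list_detail
     # ... fill in the detail-page fields here ...
     yield item
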
Example #16
    def parse_item(self, response):
        print response.url
        item = DongguanItem()
        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        item['number'] = item['title'].split(' ')[-1].split(':')[-1]
        item['content'] = response.xpath('//div[@class="c1 text14_2"]/text()').extract()[0]
        item['url'] = response.url

        yield item
Example #17
    def parse_item(self, response):
        item = DongguanItem()

        item['question'] = response.xpath(
            "//div[@class='wzy1']//td/span[1]/text()").extract()[0]
        item['num'] = response.xpath(
            "//div[@class='wzy1']//td/span[2]/text()").extract()[0]
        item['context'] = response.xpath(
            "//div[@class='wzy1']//tr[1]/td[@class='txt16_3']/text()").extract(
            )[0]
        item['url'] = response.url

        yield item
Example #18
    def parse_item(self, response):
        item = DongguanItem()
        # grab the title line, which contains both the question and the number
        title = response.xpath(
            '//div[@class="pagecenter p3"]//strong/text()').extract()[0]
        # slice out the question text
        item["question"] = title.split(' ')[1]
        # slice out the number
        item["number"] = title.split(' ')[-1].split(':')[-1]

        item["url"] = response.url
        item["answer"] = response.xpath(
            '//div[@class="c1 text14_2"]/text()').extract()[0]

        yield item
Example #19
 def parse_item(self, response):
     item = DongguanItem()
     item['title'] = response.xpath(
         '//div[contains(@class, "pagecenter p3")]//strong/text()').extract(
         )[0]
     item['number'] = item['title'].split(' ')[-1].split(':')[-1]
     content = response.xpath('//div[@class="contentext"]/text()').extract()
     if len(content) == 0:
         content = response.xpath(
             '//div[@class="c1 text14_2"]/text()').extract()
         item['content'] = "".join(content).strip()
     else:
         item['content'] = "".join(content).strip()
     item['url'] = response.url
     yield item
Example #20
    def parse_item(self, response):
#         print response.url
        item = DongguanItem()
        item['title'] = response.xpath("//div[@class='pagecenter p3']//strong/text()").extract()[0]
        item['titleId'] = item['title'].strip().split(" ")[-1].split(":")[-1]
        # when the post has images, a div with class contentext is present
        contentext = response.xpath("//div[@class='contentext']/text()").extract()
        if len(contentext) == 0:
            # no images
            contentext = response.xpath("//div[@class='c1 text14_2']/text()").extract()
            item['content'] = ''.join(contentext).strip()
        else:
            # with images
            item['content'] = ''.join(contentext).strip()
        item['url'] = response.url
        
        yield item
Example #21
    def parse_item(self, response):
        item = DongguanItem()

        # number:
        item['number_title'] = response.xpath(
            '//div[@class="wzy1"]//td/span[2]/text()').extract()[0].split(
                ':')[-1].split(':')[-1]

        other = response.xpath(
            '//div[@class="wzy3_2"]/span/text()').extract()[0]
        # poster:
        item['name_title'] = other.split()[0].split(':')[-1]
        # time:
        item['time_title'] = other.split()[1].split(
            ':')[-1] + ' ' + other.split()[-1]

        # title
        item['title'] = response.xpath(
            '//div[@class="wzy1"]//td/span[1]/text()').extract()[0].split(
                ':')[-1]

        # content (no images):
        # content = response.xpath('//div[@class="wzy1"]//tr/td[@class="txt16_3"]/text()').extract()[0]
        # content (with images):
        content_has = response.xpath(
            '//div[@class="wzy1"]//td/div[@class="contentext"]/text()'
        ).extract()
        content_no = response.xpath(
            '//div[@class="wzy1"]//tr/td[@class="txt16_3"]/text()').extract()
        string_content = ''

        if len(content_has) == 0:
            for i in content_no:
                string_content += i.strip()
            item['content'] = string_content
        else:
            for i in content_has:
                string_content += i.strip()
            item['content'] = string_content

        # handling status:
        item['parsetype'] = response.xpath(
            '//div[@class="wzy3_1"]/span/text()').extract()[0]
        item['url'] = response.url

        yield item
Example #22
    def parse_item(self, response):
        print response.url
        item = DongguanItem()
        title = response.xpath(
            "//div[@class='pagecenter p3']//strong/text()").extract()[0]
        item['title'] = title.split(u":")[-1].split(":")[0][:-2]
        item['url'] = response.url
        item['number'] = title.split(":")[-1]
        content = response.xpath("//div[@class='contentext']/text()").extract()
        if len(content) == 0:
            item['content'] = " ".join(
                response.xpath(
                    "//div[@class='c1 text14_2']/text()").extract()).strip()
        else:
            item['content'] = " ".join(content).strip()

        yield item
Example #23
    def parse_item(self, response):
        items = DongguanItem()
        temp = response.xpath(
            "//div[@class='pagecenter p3']//strong/text()").extract()
        # guard against an empty list: take the first element if present, otherwise use an empty string
        if temp:
            temp = temp[0]
        else:
            temp = ""
        items['title'] = temp.strip().split('\xa0')[0].split(':')[-1]
        items['number'] = temp.strip().split('\xa0')[-1].split(':')[-1]
        items['content'] = clear_list(
            response.xpath("//div[@class='c1 text14_2']/text()").extract(),
            response)
        items['url'] = response.url

        yield items
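
Example #23 calls a clear_list() helper imported from elsewhere in that project. A hypothetical stand-in that mirrors what the other examples do inline (join the text nodes, drop the \xa0 entities, strip whitespace); the signature is copied from the call site, the body is an assumption:

# Hypothetical clear_list helper; not the project's real implementation.
def clear_list(text_list, response):
    # response is accepted only because the call site passes it along
    return ''.join(text_list).replace(u'\xa0', '').strip()
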
Example #24
 def parse_item(self, response):
     content = ''.join(
         response.xpath(
             '//div[@class="contentext"]/text()').extract()).replace(
                 u'\xa0', '').strip()
     if len(content) == 0:
         content = ''.join(
             response.xpath(
                 '//div[@class="c1 text14_2"]/text()').extract()).replace(
                     u'\xa0', '').strip()
     item = DongguanItem()
     item['title'] = response.xpath(
         '//div[@class="pagecenter p3"]//strong/text()').extract_first(
         ).replace(u'\xa0', ' ').strip()
     item['num'] = item['title'].split(':')[-1].strip()
     item['content'] = content
     item['url'] = response.url
     yield item
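
Example #24 is the only one on this page that uses extract_first(), but it still chains .replace() straight onto the result, so a missing <strong> would surface as an AttributeError on None. A variant sketch that supplies an explicit default instead:

     # Sketch only: extract_first() with a default avoids calling
     # .replace() on None when the title element is missing.
     title = response.xpath(
         '//div[@class="pagecenter p3"]//strong/text()').extract_first(default='')
     item['title'] = title.replace(u'\xa0', u' ').strip()
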
Example #25
 def parse_item(self, response):
     item = DongguanItem()
     # title
     item['title'] = response.xpath(
         '//div[contains(@class, "pagecenter p3")]//strong/text()').extract(
         )[0]
     # number
     item['number'] = item['title'].split(' ')[-1].split(':')[-1]
     # content: first try the with-images rule; if it matches, it returns a list of all the text
     content = response.xpath('//div[@class="contentext"]/text()').extract()
     # if that returns an empty list, use the no-images rule instead
     if len(content) == 0:
         content = response.xpath(
             '//div[@class="c1 text14_2"]/text()').extract()
         item['content'] = "".join(content).strip()
     else:
         item['content'] = "".join(content).strip()
     item['url'] = response.url
     yield item
Example #26
    def parse(self, response):
        url_list = response.xpath("//ul[@class='carlist clearfix js-top']/li")
        print(url_list,"*"*100)
        # iterate over the li elements
        for li in url_list:
            item = DongguanItem()
            # item['carurl'] = li.xpath("")
            # car name
            item['carname'] = li.xpath(".//h2[@class='t']/text()").extract_first()
            # listing date
            item['date'] = li.xpath(".//div[@class='t-i']/text()").extract()
            # item['licheng'] = li.xpath(".//div[@class='t-i']/text()")
            item['prize'] = li.xpath(".//div[@class='t-price']/p/text()").extract_first()
            item['state'] = li.xpath(".//i[@class='i-orange']/text()").extract_first()
            yield item

        # request the next list page once, after all cars on this page are yielded
        if self.offset <= 161:
            self.offset += 1
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #27
    def parse(self, response):
        # print(response)
        tr_list = response.xpath('//table[2]/tr')
        # /td[3]/a[1]/text()
        for tr in tr_list:
            item = DongguanItem()
            item['num'] = tr.xpath('./td[1]/text()').extract_first()
            item['title'] = tr.xpath('./td[3]/a[1]/text()').extract_first()
            item['href'] = tr.xpath('./td[3]/a[1]/@href').extract_first()
            item['status'] = tr.xpath('./td[4]/span/text()').extract_first()
            item['name'] = tr.xpath('./td[5]/text()').extract_first()
            item['date'] = tr.xpath('./td[6]/text()').extract_first()
            yield scrapy.Request(item['href'],
                                 callback=self.detail_parse,
                                 meta={'item': item})

        next_url = response.xpath(
            r'//div[@class="pagination"]//a[text()=">"]/@href').extract_first(
            )
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)
Example #28
    def parse_item(self, response):
        #print response.url

        item = DongguanItem()
        item['title'] = response.xpath(
            '//div[@class="pagecenter p3"]//strong/text()').extract()[0]
        item['number'] = item['title'].split(' ')[-1].split(':')[-1]
        # first grab the with-images content; if the post has images this returns all of its text, otherwise an empty list
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # otherwise use the no-images rule below
        if len(content) == 0:
            content = response.xpath(
                '//div[@class="c1 text14_2"]/text()').extract()
            item['content'] = "".join(content).strip()
        else:
            item['content'] = "".join(content).strip()
        item['url'] = response.url
        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        yield item
Example #29
    def parse_item(self, response):

        item = DongguanItem()
        # URL
        item['url'] = response.url
        # title
        item['title'] = response.xpath(
            '//div[@class = "pagecenter p3"]//strong/text()').extract()[0]
        # number
        item['num'] = item['title'].split(' ')[-1].split(":")[-1]
        # content: first try the with-images rule; if it matches, it returns a list of all the text
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # if it returns an empty list, use the no-images rule
        if len(content) == 0:
            content = response.xpath(
                '//div[@class="c1 text14_2"]/text()').extract()
            item['content'] = "".join(content).strip()
        else:
            item['content'] = "".join(content).strip()

        yield item
Example #30
    def parse_item(self, response):
        item = DongguanItem()

        # title
        item["title"] = response.xpath(
            '//div[@class="pagecenter p3"]//strong//text()').extract()[0]
        # number
        item["number"] = item["title"].split('.')[-1].split(":")[-1]
        # content (returned as a list): non-empty when the with-images rule matches, empty otherwise
        content = response.xpath('//div[@class="contentext"]/text()').extract()

        # if it is empty, use the no-images rule (the markup differs between posts with and without images)
        if len(content) == 0:
            content = response.xpath(
                '//div[@class="c1 text14_2"]/text()').extract()
            item["content"] = "".join(content).strip()
        else:
            item["content"] = "".join(content).strip()

        # URL
        item["url"] = response.url

        yield item