Beispiel #1
0
 def parse(self, response):
     """Scrape one document page and yield the item built from it.

     Pages that contain no element with ``name="docdate"`` are ignored and
     nothing is yielded for them.  For matching pages the title comes from
     ``<title>``, four metadata values come from fixed cells of the table
     inside ``#ess_ctr445_ModuleContent``, and the body text plus file list
     come from ``textEdit.dealWithAll``.

     :param response: the downloaded page; must provide ``xpath``.
     :return: generator yielding the result of ``four.items.fillinData``.
     """
     if not response.xpath('//*[@name="docdate"]'):
         return

     title = response.xpath('//title/text()').extract_first()
     # extract_first() gives None when nothing matches; only strip when
     # there is a title so a page without one does not abort the callback.
     if title is not None:
         title = title.strip()

     # Every metadata cell lives in the same table; share the prefix.
     table = '//*[@id="ess_ctr445_ModuleContent"]/table'
     indexNumber = response.xpath(
         table + '/tr[1]/td[2]/text()').extract_first()
     print('title', title)
     print('index', indexNumber)
     IssuedNumber = response.xpath(
         table + '/tr[3]/td[2]/text()[2]').extract_first()
     print('Issued', IssuedNumber)
     publishDate = response.xpath(
         table + '/tr[4]/td[2]/text()').extract_first()
     print('publish', publishDate)
     IssuedOrgan = response.xpath(
         table + '/tr[5]/td[4]/text()').extract_first()
     print('Issued', IssuedOrgan)

     te = textEdit()
     text, files = te.dealWithAll(response,
                                  id='ess_ctr445_ModuleContent')
     print('text', text)
     print('files', files)

     # fillinData takes its fields positionally; unused slots stay ''.
     item_seventytwo = four.items.fillinData(title, '', '', '', '', '',
                                             indexNumber, '',
                                             IssuedOrgan, '',
                                             IssuedNumber, '', '', text,
                                             files, publishDate, '')
     yield item_seventytwo
Beispiel #2
0
 def parse(self, response):
     """Scrape title, index number, issuing organ, date and body text.

     The metadata values are read from fixed cells under
     ``#headContainer``; the body text and file list come from
     ``textEdit.dealWithAll`` for the ``content`` class.

     :param response: the downloaded page; must provide ``xpath``.
     :return: generator yielding the result of ``four.items.fillinData``.
     """
     title = response.xpath('//title/text()').extract_first()
     print(title)

     head = '//*[@id="headContainer"]/tbody'
     indexNumber = response.xpath(
         head + '/tr[1]/td/table/tr/td[1]/text()').extract_first()
     IssuingOrgan = response.xpath(
         head + '/tr[3]/td/table/tr/td[1]/span/text()').extract_first()
     publishDate = response.xpath(
         head + '/tr[3]/td/table/tr/td[2]/span/text()').extract_first()
     if publishDate:
         # Turn every non-digit into '-' and drop the separators left at
         # the end.  rstrip (rather than cutting exactly one character)
         # keeps the last digit when the text already ends in a digit.
         publishDate = re.sub(r'\D', '-', publishDate).rstrip('-')

     print('indexNumber:%s' % indexNumber)
     print('IssuingOrgan:%s' % IssuingOrgan)
     print('publishDate:%s' % publishDate)

     te = textEdit()
     text, files = te.dealWithAll(response, classname="content")
     print('text:%s' % text)
     print('files:%s' % files)

     # fillinData takes its fields positionally; unused slots stay ''.
     item_fortyfive = four.items.fillinData(title, '', '', '', '', '',
                                            indexNumber, '', IssuingOrgan,
                                            '', '', '', '', text, files,
                                            publishDate, '')
     yield item_fortyfive
Beispiel #3
0
 def parse(self, response):
     """Print the title, publish date, body text and files of a page.

     Nothing is returned or yielded; the extracted values are only
     written to standard output.
     """
     page_title = response.xpath(
         '//*[@name="title"]/@content'
     ).extract_first()
     print(page_title)

     time_text = response.xpath(
         '//*[@class="sp_time"]/font/text()'
     ).extract_first()
     # The date is the segment between the first and second ':'.
     pieces = time_text.split(':')
     published = pieces[1]
     print(published)

     body, attachments = textEdit().dealWithAll(response, id="zoom")
     print('text:%s' % body)
     print('file:%s' % attachments)
Beispiel #4
0
    def parse(self, response):
        """Scrape title, publish date and body text, and yield the item.

        The title is the full string value of the first ``<h1>``; the date
        is the first ``YYYY-MM-DD`` found in the text of the ``lyd`` class
        element, or ``None`` when that text is missing or holds no date.

        :param response: the downloaded page; must provide ``xpath``.
        :return: generator yielding the result of ``four.items.fillinData``.
        """
        title = response.xpath('//h1').xpath('string(.)').extract_first()
        merge = response.xpath('//*[@class="lyd"]/text()').extract_first()

        # search() on a possibly-empty string avoids both the TypeError on
        # a missing node and the IndexError on text without a date.
        found = re.search(r'\d{4}-\d{2}-\d{2}', merge or '')
        publishDate = found.group(0) if found else None

        te = textEdit()
        text, files = te.dealWithAll(response, 'con')
        print('text:%s' % text)
        print('file:%s' % files)

        # fillinData takes its fields positionally; unused slots stay ''.
        item_twentyeight = four.items.fillinData(title, '', '', '', '', '', '',
                                                 '', '', '', '', '', '', text,
                                                 files, publishDate, '')
        # Emit the item; without this the callback produces nothing.
        yield item_twentyeight
Beispiel #5
0
    def parse(self, response):
        """Scrape title, publish date and body text, and yield the item.

        Title and date are read from ``#con_title`` and ``#con_time``; the
        body text and file list come from ``textEdit.dealWithAll`` for
        ``#con_con``.

        :param response: the downloaded page; must provide ``xpath``.
        :return: generator yielding the result of ``four.items.fillinData``.
        """
        title = response.xpath('//*[@id="con_title"]/text()').extract_first()
        publishDate = response.xpath(
            '//*[@id="con_time"]/text()').extract_first()
        if publishDate:
            # Turn every non-digit into '-' and drop the separators left
            # at the end.  rstrip (rather than cutting exactly one
            # character) keeps the last digit when the text ends in one.
            publishDate = re.sub(r'\D', '-', publishDate).rstrip('-')

        te = textEdit()
        text, files = te.dealWithAll(response, id='con_con')

        # fillinData takes its fields positionally; unused slots stay ''.
        item_twentynine = four.items.fillinData(title, '', '', '', '', '', '',
                                                '', '', '', '', '', '', text,
                                                files, publishDate, '')
        yield item_twentynine
Beispiel #6
0
    def parse(self, response):
        """Scrape headline, date and body text, and yield the item.

        The body is looked up under the long TRS_UEDITOR class list first;
        when that gives no text, the ``TRS_Editor`` class is tried instead.
        """
        head = '//*[@class="pageHead"]'
        headline = response.xpath(head + '/h2/text()').extract_first()
        print(headline)
        published = response.xpath(
            head + '/h3/span[1]/text()').extract_first()
        print(published)

        editor = textEdit()
        candidate_classes = (
            "view TRS_UEDITOR trs_paper_default trs_word trs_key4format",
            "TRS_Editor",
        )
        # Stop at the first container that produces text; if none does,
        # the result of the last attempt is kept.
        for css_class in candidate_classes:
            body, attachments = editor.dealWithAll(response,
                                                   classname=css_class)
            if body:
                break
        print('text:%s' % body)
        print('file:%s' % attachments)

        # Twelve positional slots between title and text are left blank.
        blanks = [''] * 12
        yield four.items.fillinData(headline, *blanks, body, attachments,
                                    published, '')
Beispiel #7
0
    def parse(self, response):
        """Scrape title, date and source from meta tags, plus body text.

        The three metadata values are the ``content`` attributes of the
        elements named ``ArticleTitle``, ``PubData`` and ``ContentSource``;
        the body text and file list come from ``textEdit.dealWithAll`` for
        ``#zoom``.  The resulting item is yielded.
        """
        def meta_content(name):
            # First ``content`` attribute of the element with this name.
            query = '//*[@name="%s"]/@content' % name
            return response.xpath(query).extract_first()

        headline = meta_content('ArticleTitle')
        print(headline)
        published = meta_content('PubData')
        print(published)
        origin = meta_content('ContentSource')
        print(origin)

        body, attachments = textEdit().dealWithAll(response, id="zoom")
        print('text:%s' % body)
        print('file:%s' % attachments)

        # Twelve positional slots between title and text are left blank.
        blanks = [''] * 12
        yield four.items.fillinData(headline, *blanks, body, attachments,
                                    published, origin)
Beispiel #8
0
    def parse(self, response):
        """Scrape title, publish date and body text, and yield the item.

        The title is the first text node of the ``zsy_cotitle`` element;
        the date is the first ``YYYY-MM-DD`` found in its ``<p>`` text, or
        ``None`` when that text is missing or holds no date.

        :param response: the downloaded page; must provide ``xpath``.
        :return: generator yielding the result of ``four.items.fillinData``.
        """
        title = response.xpath(
            '//*[@class="zsy_cotitle"]/text()').extract_first()
        date_text = response.xpath(
            '//*[@class="zsy_cotitle"]/p/text()').extract_first()

        # search() on a possibly-empty string avoids both the TypeError on
        # a missing node and the IndexError on text without a date.
        found = re.search(r'\d{4}-\d{2}-\d{2}', date_text or '')
        publishDate = found.group(0) if found else None

        te = textEdit()
        text, files = te.dealWithAll(response, classname='zsy_comain')

        # fillinData takes its fields positionally; unused slots stay ''.
        item_thirtyone = four.items.fillinData(title, '', '', '', '', '', '',
                                               '', '', '', '', '', '', text,
                                               files, publishDate, '')
        yield item_thirtyone
Beispiel #9
0
    def parse(self, response):
        """Scrape title, publish date and body text, and yield the item.

        The title is the first ``<h1>`` text node and the date comes from
        the ``date`` class element; the body text and file list come from
        ``textEdit.dealWithAll`` for ``#forestry_content``.

        :param response: the downloaded page; must provide ``xpath``.
        :return: generator yielding the result of ``four.items.fillinData``.
        """
        title = response.xpath('//h1/text()').extract_first()
        print(title)
        publishDate = response.xpath(
            '//*[@class="date"]/text()').extract_first()
        if publishDate:
            # Turn every non-digit into '-' and drop the separators left
            # at the end.  rstrip (rather than cutting exactly one
            # character) keeps the last digit when the text ends in one.
            publishDate = re.sub(r'\D', '-', publishDate).rstrip('-')

        te = textEdit()
        text, files = te.dealWithAll(response, id='forestry_content')
        print('text:%s' % text)
        print('file:%s' % files)

        # fillinData takes its fields positionally; unused slots stay ''.
        item_thirtysix = four.items.fillinData(title, '', '', '', '', '', '',
                                               '', '', '', '', '', '', text,
                                               files, publishDate, '')
        yield item_thirtysix
Beispiel #10
0
    def parse(self, response):
        """Scrape title, publish date and body text, and yield the item.

        The date keeps only what precedes the first space of the
        ``articleAuthor`` text; a missing or empty value is passed through
        unchanged.
        """
        headline = response.xpath('//title/text()').extract_first()
        print(headline)

        stamp = response.xpath(
            '//*[@class="articleAuthor"]/span/strong/text()'
        ).extract_first()
        published = stamp.partition(' ')[0] if stamp else stamp
        print(published)

        body, attachments = textEdit().dealWithAll(
            response, classname="article art")
        print('text:%s' % body)
        print('file:%s' % attachments)

        # Twelve positional slots between title and text are left blank.
        blanks = [''] * 12
        yield four.items.fillinData(headline, *blanks, body, attachments,
                                    published, '')
Beispiel #11
0
    def parse(self, response):
        """Scrape title, publish date and body text, and yield the item.

        The title comes from the ``main_title`` element and the date is
        the part of the ``top_about`` link text before its first space.
        ``response.meta['url']`` is forwarded as the last argument.

        :param response: the downloaded page; must provide ``xpath`` and a
            ``meta`` mapping containing ``'url'``.
        :return: generator yielding the result of ``four.items.fillinData``.
        """
        title = response.xpath(
            '//*[@class="main_title"]/text()').extract_first()
        print('title', title)
        publishDate = response.xpath(
            '//*[@class="top_about"]/a/text()').extract_first()
        # A page without the date link gives None; leave it as is instead
        # of failing on None.split.
        if publishDate:
            publishDate = publishDate.split(' ')[0]
        print('publish', publishDate)

        te = textEdit()
        text, files = te.dealWithAll(response, classname='content_word')
        print('text:%s' % text)
        print('file:%s' % files)

        # fillinData takes its fields positionally; the fourth slot is
        # None here, the other unused slots stay ''.
        item_two = four.items.fillinData(title, '', '', None, '', '', '', '',
                                         '', '', '', '', '', text, files,
                                         publishDate, '', response.meta['url'])
        yield item_two
Beispiel #12
0
    def parse(self, response):
        """Scrape title, publish date and body text, and yield the item.

        The date is the part of the ``caijingt_wztop`` paragraph text
        before its first space, with non-digits replaced by ``-``.

        :param response: the downloaded page; must provide ``xpath``.
        :return: generator yielding the result of ``four.items.fillinData``.
        """
        title = response.xpath('//h1/text()').extract_first()
        print(title)
        publishDate = response.xpath(
            '//*[@class="caijingt_wztop"]/p/text()').extract_first()
        if publishDate:
            publishDate = publishDate.split(' ')[0]
            # Turn every non-digit into '-' and drop the separators left
            # at the end.  rstrip (rather than cutting exactly one
            # character) keeps the last digit when the text ends in one.
            publishDate = re.sub(r'\D', '-', publishDate).rstrip('-')
        print(publishDate)

        te = textEdit()
        text, files = te.dealWithAll(response, classname="caijingt_conmain")
        print('text:%s' % text)
        print('file:%s' % files)

        # fillinData takes its fields positionally; unused slots stay ''.
        item_fortyone = four.items.fillinData(title, '', '', '', '', '', '',
                                              '', '', '', '', '', '', text,
                                              files, publishDate, '')
        yield item_fortyone
Beispiel #13
0
    def parse(self, response):
        """Scrape title, publish date and body text, and yield the item.

        The date is cut out of the third ``div`` of the
        ``detail_main_right_conbg_tit`` element: the text before the first
        space is taken, then the segment between its first and second
        ``:``.
        """
        headline = response.xpath('//title/text()').extract_first()
        print(headline)

        info_line = response.xpath(
            '//*[@class="detail_main_right_conbg_tit"]/div[3]/text()'
        ).extract_first()
        first_field = info_line.split(' ')[0]
        published = first_field.split(':')[1]
        print('publishDate:%s' % published)

        body, attachments = textEdit().dealWithAll(
            response, classname="detail_main_right_conbg_con")
        print('text:%s' % body)
        print('files:%s' % attachments)

        # Twelve positional slots between title and text are left blank.
        blanks = [''] * 12
        yield four.items.fillinData(headline, *blanks, body, attachments,
                                    published, '')
Beispiel #14
0
    def parse(self, response):
        """Scrape title, publish date and body text, and yield the item.

        The title is the first ``<h2>`` text node; the date is the part of
        the ``#pubTime`` text before its first space, with non-digits
        replaced by ``-``.  ``response.meta['url']`` is forwarded as the
        last argument.

        :param response: the downloaded page; must provide ``xpath`` and a
            ``meta`` mapping containing ``'url'``.
        :return: generator yielding the result of ``four.items.fillinData``.
        """
        title = response.xpath('//h2/text()').extract_first()
        print('title', title)
        publishDate = response.xpath(
            '//*[@id="pubTime"]/text()').extract_first()
        if publishDate:
            publishDate = publishDate.split(' ')[0]
            # Turn every non-digit into '-' and drop the separators left
            # at the end.  rstrip (rather than cutting exactly one
            # character) keeps the last digit when the text ends in one.
            publishDate = re.sub(r'\D', '-', publishDate).rstrip('-')
        print('publish', publishDate)

        # NOTE(review): source is only printed, never passed to
        # fillinData below - confirm whether that is intentional.
        source = response.xpath('//*[@id="sourceName"]/text()').extract_first()
        print('source', source)

        te = textEdit()
        text, files = te.dealWithAll(response, classname='TRS_Editor')
        print('text:%s' % text)
        print('file:%s' % files)

        # fillinData takes its fields positionally; the fourth slot is
        # None here, the other unused slots stay ''.
        item_two = four.items.fillinData(title, '', '', None, '', '', '', '',
                                         '', '', '', '', '', text, files,
                                         publishDate, '', response.meta['url'])
        yield item_two
Beispiel #15
0
    def parse(self, response):
        """Scrape title, source, publish date and body text; yield the item.

        The title is the full string value of the first ``<h1>``.  Source
        and date are the first and third ``span`` of the ``fl`` class
        element; the date keeps only what precedes its first space.  The
        source is printed but is not part of the yielded item.
        """
        headline = response.xpath('//h1').xpath('string(.)').extract_first()
        print(headline)

        info = '//*[@class="fl"]'
        origin = response.xpath(info + '/span[1]/text()').extract_first()
        print(origin)
        stamp = response.xpath(info + '/span[3]/text()').extract_first()
        published = stamp.partition(' ')[0]
        print(published)

        body, attachments = textEdit().dealWithAll(
            response, classname="conbox2 boxcenter")
        print('text:%s' % body)
        print('file:%s' % attachments)

        # Twelve positional slots between title and text are left blank.
        blanks = [''] * 12
        yield four.items.fillinData(headline, *blanks, body, attachments,
                                    published, '')