Code Example #1
    def parse_each_pages(self, response):
        page_no = response.meta['page_no']
        last_page_no = response.meta['last_page_no']
        last = response.xpath('//*[@id="mainIndexTable"]/tbody/tr[2]/td[2]/nobr/text()').get()

        if page_no == last_page_no:
            category_last_no = int(last)
        else:
            first = response.xpath('//*[@id="mainIndexTable"]/tbody/tr[32]/td[2]/nobr/text()').get()
            category_last_no = int(last) - int(first) + 1

        category_no = 1
        while True:
            if category_no > category_last_no:
                break
            item = CrawlnkdbItem()
            post_title = response.xpath(
                '//*[@id="mainIndexTable"]/tbody/tr[' + str(2 * category_no) + ']/td[4]/a').xpath('string()').get()
            post_writer = response.xpath('//*[@id="mainIndexTable"]/tbody/tr[' + str(2*category_no) + ']/td[6]').xpath('string()').get()
            post_date = response.xpath('//*[@id="mainIndexTable"]/tbody/tr[' + str(2*category_no) + ']/td[7]').xpath('string()').get()
            item[config['VARS']['VAR1']] = post_title
            item[config['VARS']['VAR3']] = post_writer
            item[config['VARS']['VAR4']] = post_date

            print("###post_writer >>> ", post_writer)
            print("###post_date >>> ", post_date)

            category_link = response.xpath('//*[@id="mainIndexTable"]/tbody/tr[' + str(2*category_no) + ']/td[4]/a/@href').get()
            # print(category_link)
            url = 'http://www.nkis.kr/' + category_link
            print("###url >>> ", url)
            yield scrapy.Request(url, callback=self.parse_category, meta={'item':item})
            category_no += 1
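All of these snippets assume surrounding scaffolding that the listing does not show: a Scrapy spider class, a shared CrawlnkdbItem, and a config object whose ['VARS'] section maps VAR1, VAR2, ... to item field names. A minimal sketch of what that scaffolding might look like; the field list and config keys are inferred from how the snippets use them, not taken from the actual project:

    import configparser
    import re
    import scrapy

    # items.py (assumed): the fields the snippets write into
    class CrawlnkdbItem(scrapy.Item):
        post_title = scrapy.Field()
        post_body = scrapy.Field()
        post_writer = scrapy.Field()
        post_date = scrapy.Field()
        published_institution = scrapy.Field()
        published_institution_url = scrapy.Field()
        # ... plus the file-name/file-URL fields addressed via VAR9/VAR10

    # config.ini (assumed) maps VAR1..VARn to the field names above,
    # e.g. VAR1 = post_title, VAR4 = post_date
    config = configparser.ConfigParser()
    config.read('config.ini')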
Code Example #2
    def parse_each_pages(self, response):
        page_no = response.meta['page_no']
        last_page_no = response.meta['last_page_no']
        last = response.meta['last']
        first = response.meta['first']
        if page_no == last_page_no:
            last = response.xpath(
                '//*[@id="container"]/div[1]/div[2]/div[2]/div/div[4]/table/tbody/tr[1]/th/text()'
            ).get()
            category_last_no = int(last)
        else:
            category_last_no = int(last) - int(first) + 1
        category_no = 1
        while True:
            if (category_no > category_last_no):
                break
            item = CrawlnkdbItem()
            number = response.xpath(
                '//*[@id="container"]/div[1]/div[2]/div[2]/div/div[4]/table/tbody/tr['
                + str(category_no) + ']/th').get()
            #print(number)
            title = response.xpath(
                '//*[@id="container"]/div[1]/div[2]/div[2]/div/div[4]/table/tbody/tr['
                + str(category_no) + ']/td[1]/p/text()').get()
            # title = title.split(")", maxsplit=1)
            # title = title[1]
            # title = title.strip()
            # print(title)
            writer = "관리자"
            body = ""
            date = response.xpath(
                '//*[@id="container"]/div[1]/div[2]/div[2]/div/div[4]/table/tbody/tr['
                + str(category_no) +
                ']/td[1]/span[1]').xpath('string()').get()
            ### modify
            top_category = response.xpath(
                '//*[@id="container"]/div[1]/div[2]/div[2]/div/div[2]/ul/li[4]/a'
            ).xpath('string()').get()
            item[config['VARS']['VAR1']] = title
            item[config['VARS']['VAR3']] = writer
            item[config['VARS']['VAR2']] = body
            item[config['VARS']['VAR4']] = date
            item[config['VARS']['VAR5']] = "남북하나재단"
            item[config['VARS']['VAR6']] = "https://www.koreahana.or.kr/"
            item[config['VARS']['VAR7']] = top_category
            file_name = title
            download_url = response.xpath(
                '//*[@id="container"]/div[1]/div[2]/div[2]/div/div[4]/table/tbody/tr['
                + str(category_no) + ']/td[2]/button/@onclick').get()
            download_url = download_url.split("'")
            # print(download_url)
            file_download_url = 'https://www.koreahana.or.kr' + download_url[1]
            print(file_download_url)
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            category_no += 1

            yield scrapy.Request(file_download_url,
                                 callback=self.save_file,
                                 meta={'item': item})
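The button's onclick attribute embeds the download path between single quotes, which is why the snippet splits on "'" and takes element [1]. A quick illustration with a made-up onclick value:

    >>> onclick = "fileDown('/download/board.do?file_id=123');"
    >>> onclick.split("'")[1]
    '/download/board.do?file_id=123'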
Code Example #3
 def parse_each_pages(self, response):
     page_no = response.meta['page_no']
     last_page_no = response.meta['last_page_no']
     print("###pageno:  ", page_no)
     last = response.xpath(
         '//*[@id="cmsContent"]/div[2]/table/tbody/tr[1]/td[1]/text()').get()
     if page_no == last_page_no:
         category_last_no = int(last)
     else:
         first = response.xpath(
             '//*[@id="cmsContent"]/div[2]/table/tbody/tr[10]/td[1]/text()'
         ).get()
         category_last_no = int(last) - int(first) + 1
     category_no = 1
     while True:
         if (category_no > category_last_no):
             break
         category_link = response.xpath(
             '//*[@id="cmsContent"]/div[2]/table/tbody/tr[' +
             str(category_no) + ']/td[2]/a/@href').get()
         url = 'http://www.kinu.or.kr/www/jsp/prg/api/' + category_link
         # print(url)
         # number = response.xpath('//*[@id="boardActionFrm"]/div[2]/table/tbody/tr['+str(category_no)+']/td[1]').get()
         # print(number)
         item = CrawlnkdbItem()
         date = response.xpath(
             '//*[@id="cmsContent"]/div[2]/table/tbody/tr[' +
             str(category_no) + ']/td[3]').xpath('string()').get()
         item[config['VARS']['VAR4']] = date
         yield scrapy.Request(url,
                              callback=self.parse_post,
                              meta={'item': item})
         category_no += 1
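The int(last) - int(first) + 1 arithmetic that recurs throughout these spiders derives the row count from the descending post numbers shown in the top and bottom visible rows. A worked instance:

    # if the top row (tr[1]) shows post number 125 and the bottom row (tr[10]) shows 116:
    category_last_no = 125 - 116 + 1  # 10 rows to iterate on this page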
Code Example #4
    def parse_each_pages(self, response):
        page_no = response.meta['page_no']
        last_page_no = response.meta['last_page_no']

        last = response.xpath('//*[@id="div_article_contents"]/tr[1]/td[1]/text()').get()
        if page_no == last_page_no:
            first = 1
        else:
            first = response.xpath('//*[@id="div_article_contents"]/tr[29]/td[1]/text()').get()

        category_last_no = int(last) - int(first) + 1
        category_no = 1

        while 1:
            # store each post's URL in an item
            if category_no > category_last_no:
                break
            category_link = response.xpath('//*[@id="div_article_contents"]/tr[' + str(2*category_no-1) + ']/td[2]/font/a/@href').get()
            category_link = category_link.replace("./", "")
            url =  "http://www.nkorea.or.kr/board/" + category_link
            # print(url)
            date = response.xpath('//*[@id="div_article_contents"]/tr['+ str(2*category_no-1) +']/td[5]/text()').get()
            writer = response.xpath('//*[@id="div_article_contents"]/tr['+ str(2*category_no-1) +']/td[3]/text()').get()
            # create the item; .get() returns plain strings where .extract()
            # returned one-element lists
            item = CrawlnkdbItem()
            item["post_date"] = date
            item["post_writer"] = writer
            # attach the item to the detail-page request
            yield scrapy.Request(url, callback=self.parse_category, meta={'item':item})
            category_no += 1
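This board inserts a spacer row after every post, so the n-th post sits at tr[2*n-1]; compare Code Example #1, where the posts occupy the even rows at tr[2*n]. The stride in isolation:

    # posts on odd rows when each post row is followed by a spacer row
    [2 * n - 1 for n in range(1, 4)]  # -> [1, 3, 5]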
Code Example #5
    def parse_post(self, response):
        title = response.xpath('//*[@id="container"]/div/section/div[1]/div/table/tbody/tr[1]/td').xpath('string()').get()
        body = " "
        writer = response.xpath('//*[@id="container"]/div/section/div[1]/div/table/tbody/tr[4]/td').xpath('string()').get()
        date = response.xpath('//*[@id="container"]/div/section/div[1]/div/table/tbody/tr[6]/td').xpath('string()').get()
        top_category = response.xpath('//*[@id="container"]/div/nav/ul/li[2]/ul/li[5]/a').xpath('string()').get()
        print(top_category)

        item = CrawlnkdbItem()

        item[config['VARS']['VAR1']] = title
        item[config['VARS']['VAR2']] = body
        item[config['VARS']['VAR3']] = writer
        item[config['VARS']['VAR4']] = date
        item[config['VARS']['VAR7']] = top_category
        item[config['VARS']['VAR5']] = "통일부"
        item[config['VARS']['VAR6']] = "https://unibook.unikorea.go.kr/"
        file_name = response.xpath('//*[@id="container"]/div/section/div[1]/div/table/tbody/tr[3]/td/a').xpath('string()').get()
        file_download_url = response.xpath('//*[@id="container"]/div/section/div[1]/div/table/tbody/tr[3]/td/a/@href').get()
        if file_download_url:
            # prepend the host only once a link is known to exist; concatenating
            # first would turn a missing attachment into a TypeError and make
            # the truthiness check always pass
            file_download_url = "https://unibook.unikorea.go.kr" + file_download_url
            print(file_download_url)
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_download_url.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item':item})
            else:
                yield scrapy.Request(file_download_url, callback=self.save_file, meta={'item':item})
        else:
            #print("###############file does not exist#################")
            yield item
Code Example #6
    def parse_post(self, response):
        item = CrawlnkdbItem()
        #title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[1]/h1/a/span/text()').get()
        if title is None:
            title = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[1]/h1/a/text()').get()
        #table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()
        body = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[2]').get()
        #body = re.search('<body.*/body>', body, re.I | re.S)
        body = re.sub('<script.*?>.*?</script>', '', body, 0, re.I | re.S)
        body = re.sub('<.+?>', '', body, 0, re.I | re.S)
        body = re.sub('&nbsp;| |\t|\r|\n', " ", body)
        body = re.sub('\"', "'", body)
        print(body)


        #body = response.css('.descArea').xpath('string()').extract()

        date = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[1]/p[1]/text()').get()

        writer = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[1]/p[2]/a/text()').get()

        body_text = ''.join(body)

        top_category = "군사주의와 여성"

        item['post_title'] = title.strip()
        item['post_date'] = date.strip()
        item['post_writer'] = writer.strip()
        item['post_body'] = body_text.strip()
        item['published_institution'] = "평화를 만드는 여성회"
        item['published_institution_url'] = "http://www.peacewomen.or.kr/"
        item[config['VARS']['VAR7']] = top_category


        file_name = title

        file_icon = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[3]/div[1]/ul/li/a/text()').get()
        # downloads are deliberately disabled by forcing file_icon to None, so the
        # else branch below always yields the item; the extra unconditional yield
        # that used to sit here emitted every item twice
        file_icon = None

        if file_icon:
            file_download_url = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[3]/div[1]/ul/li/a/@href').extract()
            file_download_url = file_download_url[0]
            file_download_url = "http://www.peacewomen.or.kr/" + file_download_url
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item}, dont_filter=True)  #
            else:
                yield scrapy.Request(file_download_url, callback=self.save_file,
                                     meta={'item': item, 'file_download_url': file_download_url,
                                           'file_name': file_icon}, dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
Code Example #7
    def parse_each_pages(self, response):
        page_no = response.meta['page_no']
        last_page_no = response.meta['last_page_no']

        last = response.xpath(
            '//*[@id="contents"]/table/tbody/tr[1]/td[1]/text()').get()
        if page_no == last_page_no:
            first = 1
        else:
            first = response.xpath(
                '//*[@id="contents"]/table/tbody/tr[20]/td[1]/text()').get()

        category_last_no = int(last) - int(first) + 1
        category_no = 1

        while 1:
            # store each post's URL in an item
            if category_no > category_last_no:
                break
            category_link = response.xpath(
                '//*[@id="contents"]/table/tbody/tr[' + str(category_no) +
                ']/td[2]/a/@href').get()
            category_no += 1
            if category_link is None:
                continue
            # print(category_link)
            url = "http://nkd.or.kr" + category_link
            item = CrawlnkdbItem()
            yield scrapy.Request(url,
                                 callback=self.parse_category,
                                 meta={'item': item})
Code Example #8
File: boardbotColumn.py (project: qkrwlgml05/crawling)
 def parse_each_pages(self, response):
     user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
     headers = {'User-Agent': user_agent}
     page_no = response.meta['page_no']
     last_page_no = response.meta['last_page_no']
     
     last = response.xpath('//*[@id="frm"]/div/table/tbody/tr[1]/td[1]/text()').get()
     if page_no == last_page_no:
         first = 1
     else:
         first = response.xpath('//*[@id="frm"]/div/table/tbody/tr[10]/td[1]/text()').get()
         
     category_last_no = int(last) - int(first)+1
     category_no = 1
     
     while 1:
         # store each post's URL in an item
         if category_no > category_last_no:
             break

         category_link = response.xpath('//*[@id="frm"]/div/table/tbody/tr[' + str(category_no) + ']/td[2]/a/@href').get()
         url = "http://www.kolofo.org" + category_link
         # create the item and attach it to the detail-page request
         item = CrawlnkdbItem()
         yield scrapy.Request(url, headers=headers, callback=self.parse_category, meta={'item':item})
         category_no += 1
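Building the headers dict inside the parse method works, but it is rebuilt on every call. Scrapy also supports declaring the User-Agent once per spider via the standard custom_settings hook; a minimal sketch (the spider name here is made up):

    class ColumnSpider(scrapy.Spider):
        name = 'boardbot_column'  # hypothetical
        custom_settings = {
            'USER_AGENT': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/71.0.3578.98 Safari/537.36'),
        }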
Code Example #9
 def parse_each_pages(self, response):
     page_no = response.meta['page_no']
     last_page_no = response.meta['last_page_no']
     last = response.xpath(
         '//*[@id="smain_all"]/table[2]/tbody/tr[1]/td[1]/div/font/text()'
     ).get()
     if page_no == last_page_no:
         category_last_no = int(last)
     else:
         first = response.xpath(
             '//*[@id="smain_all"]/table[2]/tbody/tr[5]/td[1]/div/font/text()'
         ).get()
         category_last_no = int(last) - int(first) + 1
     category_no = 1
     while True:
         if (category_no > category_last_no):
             break
         category_link = '//*[@id="smain_all"]/table[2]/tbody/tr[' + str(
             category_no) + ']/td[3]/div/a/@onclick'
         onclick_text = response.xpath(category_link).extract()
         url = re.findall(r"\d+", str(onclick_text))
         url = 'http://www.nuac.go.kr/actions/BbsDataAction?cmd=view&menuid=G' + url[
             1] + '&bbs_id=G' + url[1] + '&bbs_idx=' + url[
                 0] + '&parent_idx=&_template=03&_max=05&_page=' + str(
                     page_no) + '&head='
         item = CrawlnkdbItem()  #
         yield scrapy.Request(url,
                              callback=self.parse_post,
                              meta={'item': item})
         category_no += 1
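Here the view URL is reassembled from the digit runs inside the onclick handler: re.findall(r"\d+", ...) returns them in document order, with url[0] feeding bbs_idx and url[1] the menu/board id. For instance, with a made-up onclick value:

    >>> re.findall(r"\d+", str(["goBbsView(52341, 'G187');"]))
    ['52341', '187']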
Code Example #10
File: peace1.py (project: qkrwlgml05/crawling)
    def parse_post(self, response):
        item = CrawlnkdbItem()

        title = response.xpath('//*[@id="mci_entrep"]/table/tbody/tr[1]/td').xpath('string()').get()
        body = response.xpath('//*[@id="mci_entrep"]/table/tbody/tr[5]').xpath('string()').get()
        writer = response.xpath('//*[@id="mci_entrep"]/table/tbody/tr[2]/td[1]').xpath('string()').get()
        date = response.xpath('//*[@id="mci_entrep"]/table/tbody/tr[2]/td[2]').xpath('string()').get()
        #######modify
        top_category = response.xpath('//*[@id="rep_tab_btn02"]/a').xpath('string()').get()

        item[config['VARS']['VAR1']] = title
        item[config['VARS']['VAR3']] = writer
        item[config['VARS']['VAR2']] = body
        item[config['VARS']['VAR4']] = date
        item[config['VARS']['VAR5']] = "평화재단"
        item[config['VARS']['VAR6']] = "http://www.pf.or.kr/wpages/01-3_research_1.php"
        item[config['VARS']['VAR7']] = top_category
        file_name = response.xpath('//*[@id="mci_entrep"]/table/tbody/tr[4]/td/a').xpath('string()').get()
        print("###file_name:  ", file_name)
        if file_name is not None:
            file_download_pre = response.xpath('//*[@id="mci_entrep"]/table/tbody/tr[4]/td/a/@href').get()
            file_download_url = "http://www.pf.or.kr" + file_download_pre
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            item[config['VARS']['VAR12']] = body
            print("@@@@@@file name ", file_name)
            yield item
        else:
            # no attachment on this post; yield the metadata-only item so the
            # post is not silently dropped
            yield item
Code Example #11
    def parse_each_pages(self, response):
        page_no = response.meta['page_no']
        last_page_no = response.meta['last_page_no']

        last = response.xpath(
            '//*[@id="frm"]/div/table/tbody/tr[1]/td[1]/text()').get()
        if page_no == last_page_no:
            first = 1
        else:
            first = response.xpath(
                '//*[@id="frm"]/div/table/tbody/tr[10]/td[1]/text()').get()

        category_last_no = int(last) - int(first) + 1
        category_no = 1

        while 1:
            # store each post's URL in an item
            if category_no > category_last_no:
                break

            category_link = response.xpath(
                '//*[@id="frm"]/div/table/tbody/tr[' + str(category_no) +
                ']/td[2]/a/@href').get()
            url = "http://www.kolofo.org" + category_link
            # create the item
            item = CrawlnkdbItem()
            # attach the item to the detail-page request
            yield scrapy.Request(url,
                                 headers=self.headers,
                                 callback=self.parse_category,
                                 meta={'item': item})
            category_no += 1
Code Example #12
    def parse_each_pages(self, response):
        link = response.meta['link']
        print("###link:  ", link)
        page_no = response.meta['page_no']
        last_page_no = response.meta['last_page_no']
        category_num = response.meta['category_num']
        print("###pageno:  ", page_no)

        if page_no == last_page_no:
            page_total_num = response.xpath(
                '//*[@id="container"]/div/section/div[1]/div[1]/header/h5'
            ).xpath('string()').get()
            page_total_num = re.findall(r"\d,\d+", str(page_total_num))
            page_total_num = str(page_total_num[0])
            page_total_num = page_total_num.replace(",", "")
            page_total_num = int(page_total_num)
            category_last_no = (last_page_no *
                                category_num) - int(page_total_num)
            print(category_last_no)
        else:
            category_last_no = category_num

        category_no = 1
        while True:
            if (category_no > category_last_no):
                break
            title = response.xpath('//*[@id="sublist"]/ul[' +
                                   str(category_no) +
                                   ']/li[2]/h6/a').xpath('string()').get()
            print(title)
            body = " "
            writer = response.xpath(
                '//*[@id="sublist"]/ul[' + str(category_no) +
                ']/li[2]/div/dl[1]/dd').xpath('string()').get()
            date = response.xpath('//*[@id="sublist"]/ul[' + str(category_no) +
                                  ']/li[2]/div/dl[3]/dd').xpath(
                                      'string()').get()

            item = CrawlnkdbItem()
            item[config['VARS']['VAR1']] = title
            item[config['VARS']['VAR2']] = body
            item[config['VARS']['VAR3']] = writer
            item[config['VARS']['VAR8']] = date
            item[config['VARS']['VAR7']] = "통일부 발간물"
            item[config['VARS']['VAR5']] = "통일부"
            item[config['VARS']['VAR6']] = "https://unibook.unikorea.go.kr/"
            item[config['VARS']['VAR9']] = title
            crawl_url = response.xpath('//*[@id="sublist"]/ul[' +
                                       str(category_no) +
                                       ']/li[2]/h6/a/@href').get()
            url = "https://unibook.unikorea.go.kr/material/" + crawl_url
            category_no += 1
            print("#############category_url", url)
            yield scrapy.Request(url,
                                 callback=self.parse_post,
                                 meta={'item': item})
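The page header carries the grand total as a comma-grouped number, so the snippet isolates it with a regex and strips the comma before int(). In short, with assumed sample text:

    >>> total = re.findall(r"\d,\d+", "전체 1,234건")[0]
    >>> int(total.replace(",", ""))
    1234

Note the pattern only matches totals that actually contain a comma group; a total below 1,000 would need a plain \d+ fallback.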
Code Example #13
File: utf3.py (project: qkrwlgml05/crawling)
    def parse_post(self, response):
        item = CrawlnkdbItem()
        # title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath('//*[@id="jwxe_main_content"]/div/div/div[1]/div/p[1]/text()').get()

        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()
        body = response.xpath('//*[@id="jwxe_main_content"]/div/div/div[1]/div/div[1]/pre/text()').get()
        #print(body)

        date = response.xpath('//*[@id="jwxe_main_content"]/div/div/div[1]/div/p[2]').get()
        date = re.sub('<script.*?>.*?</script>', '', date, 0, re.I | re.S)
        date = re.sub('<.+?>', '', date, 0, re.I | re.S)
        date = re.sub('&nbsp;| |\t|\r|\n', " ", date)
        date = re.sub('\"', "'", date)
        date = date.split('|')
        writer = date[0]
        date = date[1]

        #writer = response.xpath('//*[@id="jwxe_main_content"]/div/div/div[1]/div/p[2]/text()[1]/text()').get()

        body_text = ''.join(body)

        top_category = response.xpath('//*[@id="sub_middle"]/div[2]/div[1]/div[1]/h3/text()').get()

        item[config['VARS']['VAR1']] = title.strip()
        item[config['VARS']['VAR4']] = date.strip()
        item[config['VARS']['VAR3']] = writer.strip()
        item[config['VARS']['VAR2']] = body_text.strip()
        item[config['VARS']['VAR5']] = "국회 외교통일위원회"
        item[config['VARS']['VAR6']] = "https://uft.na.go.kr"
        item[config['VARS']['VAR7']] = top_category


        file_name = title
        file_icon = response.xpath('//*[@id="jwxe_main_content"]/div/div/div[1]/div/div[2]/p[3]/a[1]/text()').get()

        if file_icon:
            file_download_url = response.xpath('//*[@id="jwxe_main_content"]/div/div/div[1]/div/div[2]/p[3]/a[1]/@href').extract()
            file_download_url = file_download_url[0]
            file_download_url = "https://uft.na.go.kr:444/uft/reference/reference03.do" + file_download_url
            #print(file_download_url)

            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item})  #
            else:
                yield scrapy.Request(file_download_url, callback=self.save_file,
                                     meta={'item': item, 'file_download_url': file_download_url,
                                           'file_name': file_icon}, dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
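After the tag-stripping, the date paragraph collapses to a single "writer | date" string, so one split('|') yields both fields. With assumed sample text:

    >>> parts = "홍길동 | 2020-06-01".split('|')
    >>> parts[0].strip(), parts[1].strip()
    ('홍길동', '2020-06-01')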
Code Example #14
File: kinu4_1.py (project: qkrwlgml05/crawling)
    def parse_post(self, response):
        item = CrawlnkdbItem()
        category_no = response.meta['category_no']
        # title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[' + str(category_no) + ']/td[2]/text()').get()
        print(title)

        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()

        body = response.xpath('//*[@id="cmsContent"]/div[1]/div/div[2]/ul/li/text()').get()
        # body = "북한당국이 발간한 최초의 종합 통계집인 동시에 현재까지 입수 가능한 거의 유일한 북한 공식 통계집이다. 1945년 이후 1960년대 초까지 북한당국은 주기적으로 공식 통계를 발표해 왔는데, 동 통계집은 이렇게 발표된 통계를 '자연조건' 및 '행정구역'에서부터 '교육', '보건'에 이르기까지 각 항목별로 체계적으로 정리하고 있다. 이들 통계는 북한의 공식통계가 매우 희소한 오늘날의 북한을 이해하기 위해서도 매우 귀중한 자료라 할 수 있다."

        # body = response.css('.descArea').xpath('string()').extract()

        #date = response.xpath('//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[2]/td[1]/text()').get()
        date = "1940-60"
        #print(date)

        #writer = response.xpath('//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[1]/td/text()').get()
        writer = "북한당국"
        #print(writer)

        body_text = ''.join(body)

        top_category = response.xpath('//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()

        item['post_title'] = title.strip()
        item['post_date'] = date.strip()
        item['post_writer'] = writer.strip()
        item['post_body'] = body_text.strip()
        item['published_institution'] = "통일연구원"
        item['published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
        item[config['VARS']['VAR7']] = top_category


        file_name = title
        file_icon = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/img').get()
        if file_icon:
            file_download_url = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/@href').extract()
            file_download_url = file_download_url[0]
            file_download_url = "http://www.kinu.or.kr/" + file_download_url
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item})  #
            else:
                yield scrapy.Request(file_download_url, callback=self.save_file,
                                     meta={'item': item, 'file_download_url': file_download_url,
                                           'file_name': file_icon}, dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
Code Example #15
    def parse_post(self, response):
        item = CrawlnkdbItem()
        category_no = response.meta['category_no']
        category_no = int(category_no)
        title = response.xpath('//*[@id="content"]/div[2]/ul/li[' + str(category_no) + ']/div/div/h3/text()').get()
        print(title)
        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()

        body = "UNIFICATION WHITE PAPER"

        # body = response.css('.descArea').xpath('string()').extract()

        test = title.split('년')
        date = str(test[0])
        print(date)

        writer = "통일부"
        #print(writer)

        body_text = ''.join(body)

        top_category = "통일백서"

        item[config['VARS']['VAR1']] = title.strip()
        item[config['VARS']['VAR4']] = date.strip()
        item[config['VARS']['VAR3']] = writer.strip()
        item[config['VARS']['VAR2']] = body_text.strip()
        item[config['VARS']['VAR5']] = "통일부"
        item[config['VARS']['VAR6']] = "https://www.unikorea.go.kr"
        item[config['VARS']['VAR7']] = top_category

        file_name = title
        # file_icon = response.xpath('//*[@id="content"]/div[2]/ul/li[' + str(category_no) +']/div/div/div/a[2]/@href').extract()
        # no file-icon element exists on this board, so the title stands in as a truthy value
        file_icon = title
        # file_icon = None
        if file_icon:
            file_download_url = response.xpath(
                '//*[@id="content"]/div[2]/ul/li[' + str(category_no) + ']/div/div/div/a[2]/@href').extract()
            file_download_url = file_download_url[0]
            file_download_url = "https://www.unikorea.go.kr" + file_download_url
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item})  #
            else:
                yield scrapy.Request(file_download_url, callback=self.save_file,
                                     meta={'item': item, 'file_download_url': file_download_url,
                                           'file_name': file_icon}, dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
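Titles on this board follow the pattern "<year>년 통일백서" (the format is inferred from the split), so splitting on '년' leaves the year as the first piece:

    >>> "2019년 통일백서".split('년')[0]
    '2019'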
Code Example #16
File: kinu4_6.py (project: qkrwlgml05/crawling)
    def parse_post(self, response):
        item = CrawlnkdbItem()
        category_no = response.meta['category_no']
        # title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[' + str(category_no) + ']/td[2]/text()').get()
        print(title)

        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()

        #body = response.xpath('//*[@id="tab_con"]').get()
        body = "2002년 북한당국은 UN 경제사회 이사회 등에 자국의 인권상황과 관련된 공식의견을 표명하면서 이를 뒷받침하기 위한 수단의 하나로 당시까지의 주요 공식 통계를 제출하였다. 이들 통계는 현재까지 얻을 수 있는 가장 최근의 북한 통계들로서 1990년대 이후 북한의 모습을 반영하고 있다. 이하에 수록된 통계 자료들은 이렇게 제출된 북한 통계들 가운데 중요한 것들을 취합한 것이다."
        # body = response.css('.descArea').xpath('string()').extract()

        #date = response.xpath('//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[2]/td[1]/text()').get()
        date = "2002년"
        #print(date)

        #writer = response.xpath('//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[1]/td/text()').get()
        writer = "북한당국"
        #print(writer)

        body_text = ''.join(body)

        top_category = response.xpath('//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()

        item['post_title'] = title.strip()
        item['post_date'] = date.strip()
        item['post_writer'] = writer.strip()
        item['post_body'] = body_text.strip()
        item['published_institution'] = "통일연구원"
        item['published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
        item[config['VARS']['VAR7']] = top_category


        file_name = title
        file_icon = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/img').get()
        if file_icon:
            file_download_url = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/@href').extract()
            file_download_url = file_download_url[0]
            file_download_url = "http://www.kinu.or.kr/" + file_download_url
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item})  #
            else:
                yield scrapy.Request(file_download_url, callback=self.save_file,
                                     meta={'item': item, 'file_download_url': file_download_url,
                                           'file_name': file_icon}, dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
Code Example #17
    def parse_each_pages(self, response):
        link = response.meta['link']
        print("###link:  ", link)

        page_no = response.meta['page_no']
        print("###pageno:  ", page_no)
        last_page_no = response.meta['last_page_no']
        last_page_category_num = response.meta['last_page_category_num']

        if page_no == last_page_no:
            category_num = last_page_category_num
        else:
            category_num = 10
        category_no = 1
        while True:
            if (category_no > category_num):
                break
            title = response.xpath('//*[@id="sub_reports"]/ul[' +
                                   str(category_no) +
                                   ']/a/li').xpath('string()').get()
            print(title)
            writer = response.xpath('//*[@id="sub_reports"]/ul[' +
                                    str(category_no) +
                                    ']/li[3]').xpath('string()').get()
            writer = writer.replace("By : ", "")
            writer = writer.strip()
            date = response.xpath('//*[@id="sub_reports"]/ul[' +
                                  str(category_no) +
                                  ']/li[1]/span[1]').xpath('string()').get()
            date = date.replace("DATE : ", "")
            date = date.strip()

            item = CrawlnkdbItem()
            item[config['VARS']['VAR1']] = title
            item[config['VARS']['VAR3']] = writer
            item[config['VARS']['VAR4']] = date
            ### modify
            item[config['VARS']['VAR7']] = response.xpath(
                '//*[@id="left_menu"]/li[4]/a').xpath('string()').get()
            item[config['VARS']['VAR5']] = "제주평화연구원"
            item[config['VARS']
                 ['VAR6']] = "http://www.jpi.or.kr/kor/issue/reports.sky"
            item[config['VARS']['VAR9']] = title
            crawl_url = response.xpath('//*[@id="sub_reports"]/ul[' +
                                       str(category_no) + ']/a/@href').get()

            url = "http://www.jpi.or.kr" + crawl_url
            category_no += 1
            print("#############category_url", url)
            yield scrapy.Request(url,
                                 callback=self.parse_post,
                                 meta={'item': item})
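The replace() calls strip the "By : " and "DATE : " labels wherever they occur in the string; on Python 3.9+ str.removeprefix() is the more precise tool, since it only removes a leading label:

    >>> "DATE : 2020-06-01".removeprefix("DATE : ").strip()
    '2020-06-01'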
Code Example #18
File: kinu2.py (project: qkrwlgml05/crawling)
    def parse_post(self, response):
        item = CrawlnkdbItem()
        # title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath('//*[@id="cmsContent"]/div[1]/p/text()').get()
        print(title)

        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()

        body = "no text"

        # body = response.css('.descArea').xpath('string()').extract()

        date = response.xpath('//*[@id="cmsContent"]/div[2]/table/thead/tr[2]/td/text()').get()
        #print(date)

        writer = "KINU"
        #print(writer)

        body_text = ''.join(body)

        top_category = response.xpath('//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()

        item['post_title'] = title.strip()
        item['post_date'] = date.strip()
        item['post_writer'] = writer.strip()
        item['post_body'] = body_text.strip()
        item['published_institution'] = "통일연구원"
        item['published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
        item[config['VARS']['VAR7']] = top_category

        file_name = title
        file_icon = response.xpath('//*[@id="cmsContent"]/div[2]/table/thead/tr[4]/td/a/img').get()
        if file_icon:
            file_download_url = response.xpath('//*[@id="cmsContent"]/div[2]/table/thead/tr[4]/td/a/@href').extract()
            file_download_url = file_download_url[0]
            # prepend the host, as the sibling kinu spiders do; scrapy.Request
            # rejects a relative URL
            file_download_url = "http://www.kinu.or.kr/" + file_download_url
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item})  #
            else:
                yield scrapy.Request(file_download_url, callback=self.save_file,
                                     meta={'item': item, 'file_download_url': file_download_url,
                                           'file_name': file_icon}, dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
Code Example #19
File: nuac4.py (project: qkrwlgml05/crawling)
    def parse_each_pages(self, response):
        page_no = response.meta['page_no']
        last_page_no = response.meta['last_page_no']
        ### modify
        last = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr[1]/td[1]/div/font/text()').get()
        if page_no == last_page_no:
            category_last_no = int(last)
        else:
            ### modify
            first = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr[5]/td[1]/div/font/text()').get()
            category_last_no = int(last) - int(first) + 1
        category_no = 1
        while True:
            if category_no > category_last_no:
                break
            item = CrawlnkdbItem()
            ### modify
            title = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr['+str(category_no)+']/td[3]/div').xpath('string()').get()
            body = " "
            writer = "관리자"
            date = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr['+str(category_no)+']/td[6]/div/font').xpath('string()').get()
            top_category = response.xpath('//*[@id="main"]/div/div[1]/div[1]/a').xpath('string()').get()

            item[config['VARS']['VAR1']] = title
            item[config['VARS']['VAR2']] = body
            item[config['VARS']['VAR3']] = writer
            item[config['VARS']['VAR4']] = date
            item[config['VARS']['VAR5']] = "민주평화통일자문회의"
            item[config['VARS']['VAR6']] = "http://www.nuac.go.kr/actions/"
            item[config['VARS']['VAR7']] = top_category

            ### modify
            file_name = title
            file_download_url = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr['+str(category_no)+']/td[5]/div/a/@href').get()
            category_no += 1
            if file_download_url is not None:
                item[config['VARS']['VAR10']] = file_download_url
                item[config['VARS']['VAR9']] = file_name
                #print("@@@@@@file name ", file_name)
                if file_download_url.find("hwp") != -1:
                    #print('find hwp')
                    yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item':item}) #
                else:
                    yield scrapy.Request(file_download_url, callback=self.save_file, meta={'item':item})
            else:
                print("###############file does not exist#################")
                yield item
Code Example #20
File: boardbotNuac3.py (project: qkrwlgml05/crawling)
 def parse_each_pages(self, response):
     page_no = response.meta['page_no']
     last_page_no = response.meta['last_page_no']
     last = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr[1]/td[1]/div/font/text()').get()
     if page_no == last_page_no:
         category_last_no = int(last)
     else:
         first = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr[5]/td[1]/div/font/text()').get()
         category_last_no = int(last) - int(first) + 1
     category_no = 1
     while True:
         if category_no > category_last_no:
             break
         url = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr[' + str(category_no) + ']/td[3]/div/a/@href').get()
         item = CrawlnkdbItem() #
         yield scrapy.Request(url, callback=self.parse_post, meta={'item':item})
         category_no += 1
Code Example #21
File: kinu4_2.py (project: qkrwlgml05/crawling)
    def parse_post(self, response):
        item = CrawlnkdbItem()
        category_no = response.meta['category_no']
        # title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[' +
                               str(category_no) + ']/td[2]/text()').get()
        print(title)

        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()

        #body = response.xpath('//*[@id="tab_con"]').get()
        body = "1960년대 이후 1990년대 초까지 북한당국은 공식통계를 체계적으로 하지 않았다. 동 통계집은 이러한 북한통계의 공백기를 메우기 위해 한국 통일부가 북한의 각 문헌자료 속에 산재한 당시의 통계들을 하나로 모아 간행한 것이다. 여기에 수록된 통계들은 1990년대 이전까지의 북한과 관련된 거의 유일한 통계자료라고 할 수 있다."
        # body = response.css('.descArea').xpath('string()').extract()

        #date = response.xpath('//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[2]/td[1]/text()').get()
        date = "1960-90"
        #print(date)

        #writer = response.xpath('//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[1]/td/text()').get()
        writer = "통일부"
        #print(writer)

        body_text = ''.join(body)

        top_category = response.xpath(
            '//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()

        item['post_title'] = title.strip()
        item['post_date'] = date.strip()
        item['post_writer'] = writer.strip()
        item['post_body'] = body_text.strip()
        item['published_institution'] = "통일연구원"
        item[
            'published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
        item[config['VARS']['VAR7']] = top_category

        file_name = title
        file_icon = response.xpath(
            '//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/img'
        ).get()
        if file_icon:
            file_download_url = response.xpath(
                '//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/@href'
            ).extract()
            file_download_url = file_download_url[0]
            file_download_url = "http://www.kinu.or.kr/" + file_download_url
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file_hwp,
                                     meta={'item': item})  #
            else:
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file,
                                     meta={
                                         'item': item,
                                         'file_download_url':
                                         file_download_url,
                                         'file_name': file_icon
                                     },
                                     dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
Code Example #22
File: unikorea4.py (project: qkrwlgml05/crawling)
    def parse_post(self, response):
        item = CrawlnkdbItem()
        category_no = response.meta['category_no']
        category_no = int(category_no)
        title = response.xpath(
            '//*[@id="bbsForm"]/div/article/div[1]/h3/text()').get()
        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()

        body = response.xpath(
            '//*[@id="bbsForm"]/div/article/div[2]/div[1]/p/text()').get()

        # body = response.css('.descArea').xpath('string()').extract()

        date = response.xpath(
            '//*[@id="bbsForm"]/div/article/div[1]/div/dl[2]/dd/text()').get()

        writer = "통일부"
        #print(writer)

        body_text = ''.join(body)

        top_category = "북한동향"

        item[config['VARS']['VAR1']] = title.strip()
        item[config['VARS']['VAR4']] = date.strip()
        item[config['VARS']['VAR3']] = writer.strip()
        item[config['VARS']['VAR2']] = body_text.strip()
        item[config['VARS']['VAR5']] = "통일부"
        item[config['VARS']['VAR6']] = "https://www.unikorea.go.kr"
        item[config['VARS']['VAR7']] = top_category

        file_name = title
        file_icon = response.xpath(
            '//*[@id="bbsForm"]/div/article/div[2]/section/div[2]/ul/li[1]/a[1]/text()'
        ).get()
        # file_icon = None

        if file_icon:
            file_download_url = response.xpath(
                '//*[@id="bbsForm"]/div/article/div[2]/section/div[2]/ul/li[1]/a[1]/@href'
            ).extract()
            file_download_url = file_download_url[0]
            # dig the relative path out of the javascript: href; named 'parts'
            # to avoid shadowing the built-in slice()
            parts = file_download_url.split("javascript:Jnit_boardDownload(")
            parts = parts[1].split(";")
            parts = parts[0].split("'")
            file_download_url = "https://www.unikorea.go.kr/" + parts[1]
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file_hwp,
                                     meta={'item': item},
                                     dont_filter=True)  #
            else:
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file,
                                     meta={
                                         'item': item,
                                         'file_download_url':
                                         file_download_url,
                                         'file_name': file_icon
                                     },
                                     dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
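The attachment href is a javascript: URL, so three successive splits dig the relative path out of the Jnit_boardDownload(...) call. Step by step, with an assumed href:

    >>> href = "javascript:Jnit_boardDownload('/cms/fileDown.do?id=1');"
    >>> parts = href.split("javascript:Jnit_boardDownload(")[1].split(";")[0].split("'")
    >>> parts[1]
    '/cms/fileDown.do?id=1'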
Code Example #23
File: kinu4_4.py (project: qkrwlgml05/crawling)
    def parse_post(self, response):
        item = CrawlnkdbItem()
        category_no = response.meta['category_no']
        # title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[' +
                               str(category_no) + ']/td[2]/text()').get()
        print(title)

        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()

        #body = response.xpath('//*[@id="tab_con"]').get()
        body = "1998년 북한은 국제적 식량 및 농업지원을 목적으로 UNDP와 공동으로 \"Thematic Round Table Meeting on Agricultural Recovery and Environmental Protection For the DPRK\"를 개최하였다. 동 회의를 위해 북한은 사상 최초로 자국의 美달러화 표시 GDP 규모를 밝히는 등 여러 중요한 통계자료를 제출하였다. 이하에 수록된 통계 자료들은 이렇게 제출된 북한의 공식통계를 취합한 것이다."
        # body = response.css('.descArea').xpath('string()').extract()

        #date = response.xpath('//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[2]/td[1]/text()').get()
        date = "1998"
        #print(date)

        #writer = response.xpath('//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[1]/td/text()').get()
        writer = "북한당국"
        #print(writer)

        body_text = ''.join(body)

        top_category = response.xpath(
            '//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()

        item['post_title'] = title.strip()
        item['post_date'] = date.strip()
        item['post_writer'] = writer.strip()
        item['post_body'] = body_text.strip()
        item['published_institution'] = "통일연구원"
        item[
            'published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
        item[config['VARS']['VAR7']] = top_category

        file_name = title
        file_icon = response.xpath(
            '//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/img'
        ).get()
        if file_icon:
            file_download_url = response.xpath(
                '//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/@href'
            ).extract()
            file_download_url = file_download_url[0]
            file_download_url = "http://www.kinu.or.kr/" + file_download_url
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file_hwp,
                                     meta={'item': item})  #
            else:
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file,
                                     meta={
                                         'item': item,
                                         'file_download_url':
                                         file_download_url,
                                         'file_name': file_icon
                                     },
                                     dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
Code Example #24
    def parse_post(self, response):
        item = CrawlnkdbItem()
        category_no = response.meta['category_no']
        # title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[' +
                               str(category_no) + ']/td[2]/text()').get()
        print(title)

        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()

        #body = response.xpath('//*[@id="tab_con"]').get()
        body = "1997년 이후 북한당국은 국제기구 등이 중심이 되어 실시한 북한 어린이 영양실태 조사에 적극적으로 협조해 왓다. 동 조사의 결과는 조사에 참여한 국제기구 등에 의해 발표되었을 뿐만 아니라 동시에 북한 조선중앙통계국의 명의로도 발표되었다. 이하에 수록된 통계 자료들은 이렇게 발표된 조사결과를 취합한 것이다."
        # body = response.css('.descArea').xpath('string()').extract()

        #date = response.xpath('//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[2]/td[1]/text()').get()
        date = "1997년 이후"
        #print(date)

        #writer = response.xpath('//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[1]/td/text()').get()
        writer = "북한당국"
        #print(writer)

        body_text = ''.join(body)

        top_category = response.xpath(
            '//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()

        item['post_title'] = title.strip()
        item['post_date'] = date.strip()
        item['post_writer'] = writer.strip()
        item['post_body'] = body_text.strip()
        item['published_institution'] = "통일연구원"
        item[
            'published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
        item[config['VARS']['VAR7']] = top_category

        file_name = title
        file_icon = response.xpath(
            '//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/img'
        ).get()
        if file_icon:
            file_download_url = response.xpath(
                '//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/@href'
            ).extract()
            file_download_url = file_download_url[0]
            file_download_url = "http://www.kinu.or.kr/" + file_download_url
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file_hwp,
                                     meta={'item': item})  #
            else:
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file,
                                     meta={
                                         'item': item,
                                         'file_download_url':
                                         file_download_url,
                                         'file_name': file_icon
                                     },
                                     dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
Code Example #25
File: uniedu2.py (project: qkrwlgml05/crawling)
    def parse_post(self, response):
        item = CrawlnkdbItem()
        # title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath(
            '//*[@id="content_section"]/div[2]/div[1]/h4/text()').get()
        #print(title)

        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()

        body = response.xpath('//*[@id="content_section"]/div[2]/div[2]').get()
        if body is None:
            test1 = response.xpath(
                '//*[@id="content_section"]/div[2]/dl[1]').get()
            test1 = re.sub('<script.*?>.*?</script>', '', test1, 0,
                           re.I | re.S)
            test1 = re.sub('<.+?>', '', test1, 0, re.I | re.S)
            test1 = re.sub('&nbsp;| |\t|\r|\n', " ", test1)
            test1 = re.sub('\"', "'", test1)

            test2 = response.xpath(
                '//*[@id="content_section"]/div[2]/dl[2]').get()
            test2 = re.sub('<script.*?>.*?</script>', '', test2, 0,
                           re.I | re.S)
            test2 = re.sub('<.+?>', '', test2, 0, re.I | re.S)
            test2 = re.sub('&nbsp;| |\t|\r|\n', " ", test2)
            test2 = re.sub('\"', "'", test2)
            body = test1 + test2
        else:
            body = re.sub('<script.*?>.*?</script>', '', body, 0, re.I | re.S)
            body = re.sub('<.+?>', '', body, 0, re.I | re.S)
            body = re.sub('&nbsp;| |\t|\r|\n', " ", body)
            body = re.sub('\"', "'", body)
        if body is None:
            body = "No text"
        if body == '':
            body = "No text"

        #print(body)

        # body = response.css('.descArea').xpath('string()').extract()

        date = response.xpath(
            '//*[@id="content_section"]/div[2]/div/div[2]/p[1]/span/text()'
        ).get()
        #print(date)
        if date is None:
            date = "No date"

        writer = response.xpath(
            '//*[@id="content_section"]/div[2]/div/div[2]/p[2]/span/text()'
        ).get()
        #print(writer)
        if writer is None:
            writer = "No writer"

        body_text = ''.join(body)

        top_category = "도서/동영상자료"

        item[config['VARS']['VAR1']] = title.strip()
        item[config['VARS']['VAR4']] = date.strip()
        item[config['VARS']['VAR3']] = writer.strip()
        item[config['VARS']['VAR2']] = body_text.strip()
        item[config['VARS']['VAR5']] = "통일부"
        item[config['VARS']['VAR6']] = "https://www.uniedu.go.kr/"
        item[config['VARS']['VAR7']] = top_category
        file_name = title
        file_icon = response.xpath(
            '//*[@id="content_section"]/div[2]/div/div[3]/p[1]/a/text()').get(
            )
        if not file_icon:
            file_icon = response.xpath(
                '//*[@id="content_section"]/div[2]/div[1]/div[2]/p/a/text()'
            ).get()
            print(file_icon)
            # downloads deliberately disabled: forcing None routes every post
            # through the else branch, which yields the bare item
            file_icon = None
        if file_icon:
            file_download_url = response.xpath(
                '//*[@id="content_section"]/div[2]/div/div[3]/p[1]/a/@href'
            ).extract()
            # .extract() returns a (possibly empty) list, never None
            if not file_download_url:
                file_download_url = response.xpath(
                    '//*[@id="content_section"]/div[2]/div[1]/div[2]/p/a/@href'
                ).extract()
            file_download_url = file_download_url[0]
            file_download_url = "https://www.uniedu.go.kr" + file_download_url
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file_hwp,
                                     meta={'item': item})
            else:
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file,
                                     meta={
                                         'item': item,
                                         'file_download_url':
                                         file_download_url,
                                         'file_name': file_icon
                                     },
                                     dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
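
The same four-step re.sub chain for stripping scripts, tags, entities, and quotes recurs in nearly every parse_post in this listing. A minimal helper sketch that consolidates it; the name strip_html and the default value are ours, not from the snippets:

import re

def strip_html(fragment, default="No text"):
    """Hypothetical helper consolidating the re.sub chain repeated in the
    snippets above; the name and default value are illustrative, not the
    project's own."""
    if not fragment:
        return default
    text = re.sub('<script.*?>.*?</script>', '', fragment, 0, re.I | re.S)  # drop inline scripts
    text = re.sub('<.+?>', '', text, 0, re.I | re.S)  # strip remaining tags
    text = re.sub('&nbsp;|\xa0|\t|\r|\n', ' ', text)  # collapse entities and whitespace
    text = re.sub('\"', "'", text)  # normalize double quotes to single
    return text.strip() or default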
コード例 #26
    def parse_post(self, response):
        item = CrawlnkdbItem()
        # title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath(
            '//*[@id="kboard-default-document"]/div[2]/div[1]/p/text()').get()

        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()
        body = response.xpath(
            '//*[@id="kboard-default-document"]/div[2]/div[3]/div/text()').get(
            )
        if not body:
            body = "No text"

        # body = response.css('.descArea').xpath('string()').extract()

        date = response.xpath(
            '//*[@id="kboard-default-document"]/div[2]/div[2]/div[2]/div[2]/text()'
        ).get()

        writer = response.xpath(
            '//*[@id="kboard-default-document"]/div[2]/div[2]/div[1]/div[2]/text()'
        ).get()

        body_text = ''.join(body)

        top_category = response.xpath(
            '//*[@id="main"]/header/div/h1/text()').get()

        item[config['VARS']['VAR1']] = (title or "No title").strip()
        item[config['VARS']['VAR4']] = (date or "No date").strip()
        item[config['VARS']['VAR3']] = (writer or "No writer").strip()
        item[config['VARS']['VAR2']] = body_text.strip()
        item[config['VARS']['VAR5']] = "동국대학교 북한학연구소"
        item[config['VARS']['VAR6']] = "https://nkstudy.dongguk.edu"
        item[config['VARS']['VAR7']] = top_category

        file_name = title
        file_icon = response.xpath(
            '//*[@id="kboard-default-document"]/div[2]/div[4]/a/text()').get()
        file_icon = None  # attachment downloads intentionally disabled

        if file_icon:
            file_download_url = response.xpath(
                '//*[@id="kboard-default-document"]/div[2]/div[4]/a/@href'
            ).extract()
            file_download_url = file_download_url[0]
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file_hwp,
                                     meta={'item': item},
                                     dont_filter=True)
            else:
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file,
                                     meta={
                                         'item': item,
                                         'file_download_url':
                                         file_download_url,
                                         'file_name': file_icon
                                     },
                                     dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
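
Several of these parse_post variants call .strip() on values that are None whenever an XPath misses. parsel's .get() accepts a default, which removes the need for separate guards; a short sketch reusing the XPaths from the snippet above (the fallback strings are our assumptions):

# Sketch only: get(default=...) returns the fallback instead of None,
# so the .strip() calls below cannot raise AttributeError.
title = response.xpath(
    '//*[@id="kboard-default-document"]/div[2]/div[1]/p/text()'
).get(default='No title')
date = response.xpath(
    '//*[@id="kboard-default-document"]/div[2]/div[2]/div[2]/div[2]/text()'
).get(default='No date')
item[config['VARS']['VAR1']] = title.strip()
item[config['VARS']['VAR4']] = date.strip()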
コード例 #27
    def parse_post(self, response):
        item = CrawlnkdbItem()
        # title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath('//*[@id="cmsContent"]/div[1]/p/text()').get()
        #print(title)

        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()

        body = response.xpath('//*[@id="tab_con"]').get()
        if body is None:
            body = "no text"
        body = re.sub('<script.*?>.*?</script>', '', body, 0, re.I | re.S)
        body = re.sub('<.+?>', '', body, 0, re.I | re.S)
        body = re.sub('&nbsp;| |\t|\r|\n', " ", body)
        body = re.sub('\"', "'", body)

        file_download_url = response.xpath(
            '//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[5]/td/span/a/@href'
        ).get()  # first match or None; extract()[0] would raise IndexError when absent
        if file_download_url:
            body = "본문 : " + body + " URL : " + file_download_url
        print(body)

        # body = response.css('.descArea').xpath('string()').extract()

        date = response.xpath(
            '//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[2]/td[1]/text()'
        ).get()
        #print(date)

        writer = response.xpath(
            '//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[1]/td/text()'
        ).get()
        #print(writer)

        body_text = ''.join(body)

        top_category = response.xpath(
            '//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()

        item['post_title'] = (title or "No title").strip()
        item['post_date'] = (date or "No date").strip()
        item['post_writer'] = (writer or "No writer").strip()
        item['post_body'] = body_text.strip()
        item['published_institution'] = "통일연구원"
        item['published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
        item[config['VARS']['VAR7']] = top_category

        file_name = title
        file_icon = response.xpath(
            '//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[5]/td/span'
        ).get()
        file_icon = None  # block attachment downloads
        if file_icon:
            file_download_url = response.xpath(
                '//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[5]/td/span/a/@href'
            ).extract()
            file_download_url = file_download_url[0]
            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file_hwp,
                                     meta={'item': item})
            else:
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file,
                                     meta={
                                         'item': item,
                                         'file_download_url':
                                         file_download_url,
                                         'file_name': file_icon
                                     },
                                     dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
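
One pattern worth flagging in the snippet above: .extract() always returns a list (never None), so None checks on its result are dead code, and indexing [0] raises IndexError when nothing matches. A hedged alternative using .get() and response.urljoin(), both standard Scrapy/parsel calls, with the XPath taken from the snippet:

# Sketch: .get() yields the first matching href or None, and
# response.urljoin() resolves relative links against the page URL
# instead of a hand-built prefix.
href = response.xpath(
    '//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[5]/td/span/a/@href'
).get()
if href:
    file_download_url = response.urljoin(href)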
コード例 #28
    def parse_post(self, response):
        item = CrawlnkdbItem()
        # title = response.css('#main > table > thead > tr > th font::text').get()
        title = response.xpath(
            '//*[@id="subConts"]/section/article/header/h1/text()').get()

        # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
        # body = response.css('.descArea')[0].get_text()
        body = response.xpath(
            '//*[@id="subConts"]/section/article/section').get()
        if body is None:
            body = "No text"
        body = re.sub('<script.*?>.*?</script>', '', body, 0, re.I | re.S)
        body = re.sub('<.+?>', '', body, 0, re.I | re.S)
        body = re.sub('&nbsp;| |\t|\r|\n', " ", body)
        body = re.sub('\"', "'", body)
        #print(body)

        date = response.xpath(
            '//*[@id="subConts"]/section/article/header/address/p[2]/time/text()'
        ).get()

        writer = response.xpath(
            '//*[@id="subConts"]/section/article/header/address/p[1]/text()'
        ).get()

        body_text = ''.join(body)

        top_category = response.xpath('//*[@id="subConts"]/h1/text()').get()

        item['post_title'] = (title or "No title").strip()
        item['post_date'] = (date or "No date").strip()
        item['post_writer'] = (writer or "No writer").strip()
        item['post_body'] = body_text.strip()
        item['published_institution'] = "평화와 통일을 여는 사람들"
        item['published_institution_url'] = "http://www.spark946.org/data/"
        item[config['VARS']['VAR7']] = top_category

        file_name = title
        file_icon = response.xpath(
            '//*[@id="subConts"]/section/article/ul[1]/li/a/strong/text()'
        ).get()

        if file_icon:
            file_download_url = response.xpath(
                '//*[@id="subConts"]/section/article/ul[1]/li/a/@href'
            ).extract()
            file_download_url = file_download_url[0]
            file_download_url = "http://www.spark946.org/" + file_download_url

            item[config['VARS']['VAR10']] = file_download_url
            item[config['VARS']['VAR9']] = file_name
            print("@@@@@@file name ", file_name)
            if file_icon.find("hwp") != -1:
                print('find hwp')
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file_hwp,
                                     meta={'item': item})
            else:
                yield scrapy.Request(file_download_url,
                                     callback=self.save_file,
                                     meta={
                                         'item': item,
                                         'file_download_url':
                                         file_download_url,
                                         'file_name': file_icon
                                     },
                                     dont_filter=True)
        else:
            print("###############file does not exist#################")
            yield item
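
Every snippet hands attachments off to self.save_file or self.save_file_hwp, neither of which appears in these excerpts. A minimal sketch of what such a callback might look like inside the spider class; the body is an assumption for illustration, not the project's actual implementation:

import os

def save_file(self, response):
    # Assumed implementation, not from the excerpts: persist the downloaded
    # bytes and emit the item carried through the request meta.
    item = response.meta['item']
    file_name = response.meta.get('file_name') or 'attachment'
    os.makedirs('downloads', exist_ok=True)  # hypothetical target directory
    with open(os.path.join('downloads', file_name), 'wb') as f:
        f.write(response.body)  # raw bytes of the attachment
    yield item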