Ejemplo n.º 1
0
    def parse(self, response):
        """Parse the water-disruption notice table on the SAINS page.

        Walks every table row of the response, and for each row that has
        both a date (in a <strong>) and a body (<p>), builds a content
        record, wraps it in the SAINS category envelope and persists it
        as JSON via ``save_to_DB``.
        """
        def create_item(date, text, url, ctr):
            # One content record per table row. `url` is the page the
            # row was scraped from (fix: the original ignored this
            # parameter and read response.url from the closure instead).
            return {
                'title': 'Maklumat Gangguan Bekalan Air (' + str(
                    ctr) + ') ' + date,
                'text': text,
                'page_link': url,
                'file_link': '',
                'date': date,
            }

        ctr = 0
        for tr in response.xpath("//table/tr[*]"):
            # Extract once per row instead of probing and re-extracting.
            date = tr.xpath("td[*]/strong/text()").extract_first()
            text = tr.xpath("td[*]/p").extract_first()
            # Only rows carrying both a date and a notice body are items.
            if date is None or text is None:
                continue
            ctr += 1
            main = {
                'category': 'SAINS',
                'cat_desc': 'Syarikat Air Negeri Sembilan',
                'content': create_item(date=date, text=text,
                                       url=response.url, ctr=ctr),
            }
            jsonstr = json.dumps(main)

            # send content to Cache
            save_to_DB(jsonstr)
Ejemplo n.º 2
0
    def parse_full_article(self, response):
        """Scrape one full article page.

        Joins the first text node of every paragraph under the article
        body, strips newlines/tabs from the <h1> title, wraps the result
        in the SAINS category envelope and persists it via ``save_to_DB``.
        """
        # Collect paragraph texts in a list and join once at the end --
        # repeated `text = text + ...` is quadratic in the worst case.
        pieces = []
        for subcontent in response.xpath("//article/div/p"):
            textincontent = subcontent.xpath("text()").extract()
            if textincontent:
                pieces.append(textincontent[0])
        text = ''.join(pieces)

        # extract_first() returns None when the heading is missing; fall
        # back to '' so re.sub does not raise TypeError.
        raw_title = response.xpath("//article/h1/text()").extract_first() or ''
        title = re.sub(pattern=r'\n+|\t+', repl='', string=raw_title)

        item = {
            'title': title,
            'text': text,
            'page_link': response.url,
            'file_link': '',
            'date': '',
        }

        main = {
            'category': 'SAINS',
            'cat_desc': 'Syarikat Air Negeri Sembilan',
            'content': item,
        }
        jsonstr = json.dumps(main)

        # send content to Cache
        save_to_DB(jsonstr)
Ejemplo n.º 3
0
# Every <article> on the page; the page title is the text of the
# first paragraph inside the first article.
articles = soup.find_all('article')
first_article = articles[0]
title = str(first_article.p.get_text())

# article content


def get_content(soupObject):
    """Return *soupObject* pretty-printed with HTML entity escaping,
    collapsed onto a single line (all newlines removed).
    """
    pretty = soupObject.prettify(formatter="html")
    return pretty.replace('\n', '')


# Build the single content record for this page as one literal.
item = {
    'date': '',
    'title': title,
    'text': get_content(articles[0]),
    'file_link': '',
    'page_link': urlSAP,
}
contents = [item]

# wrap to JSON
for content in contents:
    main = {
        'category': 'SAP',
        'cat_desc': 'Syarikat Air Perlis',
        'content': content,
    }
    jsonstr = json.dumps(main)

    # send content to Cache
    save_to_DB(jsonstr)