Example 1
import json

from bs4 import BeautifulSoup

# HtmlParser, CrawlerNewspaper, PulsarStore, and Article are project-local
# helpers; their import statements are omitted in the original listing.

class FETimes:
    """
    Crawls the 금융경제신문 (FE Times) finance sections.
    """
    def __init__(self, conf):
        self.url1 = "http://www.fetimes.co.kr/news/articleList.html?sc_section_code=S1N16&view_type=sm"
        self.url2 = "http://www.fetimes.co.kr/news/articleList.html?sc_section_code=S1N17&view_type=sm"
        self.url3 = "http://www.fetimes.co.kr/news/articleList.html?sc_section_code=S1N18&view_type=sm"
        self.burl = "http://www.fetimes.co.kr"
        self.html = HtmlParser(conf)
        self.news = CrawlerNewspaper(conf)
        self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])

    def parseLinkArticle(self, url):
        # Collect absolute links to article pages from the section listing.
        buf = []
        doc = self.html.getHtml(url)
        soup = BeautifulSoup(doc, 'html.parser')
        ar = soup.find_all('a')
        for v in ar:
            ss = str(v)
            if 'news/articleView' in ss:
                aa = v.get("href")
                uu = self.burl + str(aa)
                buf.append(uu)
        return buf

    def getStringFilter(self, ss):
        # Escape single quotes; the original replace("'", "\'") was a no-op,
        # since "\'" and "'" denote the same one-character string.
        return ss.replace("'", "\\'")

    def parsingNews(self, url):
        ret = []
        links = self.parseLinkArticle(url)
        for v in links:
            (title, text) = self.news.crawling(v)
            text = text.replace("저작권자 © 금융경제신문 무단전재 및 재배포 금지", "")
            art = Article(self.getStringFilter(title),
                          self.getStringFilter(text), "금융경제")
            #print("title : ", title)
            #print("text : ", text)

            ret.append(art.toDic())
            print(art.toDic())
        return ret

    def crawling(self):
        ret = self.parsingNews(self.url1)
        ret.extend(self.parsingNews(self.url2))
        ret.extend(self.parsingNews(self.url3))

        cl = self.pulsar.getClient()
        pro = self.pulsar.createProcuder(cl, "newspaper")  # method name as spelled in PulsarStore
        for dd in ret:
            jj = json.dumps(dd)
            print("send data : ", jj)
            pro.send(jj.encode('utf-8'))
        cl.close()
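
A minimal usage sketch, assuming a dict-style conf whose pulsar section carries the broker address; the exact keys consumed by HtmlParser and CrawlerNewspaper are project-specific and hypothetical here:

# Hypothetical configuration: only the 'pulsar' keys appear in the
# examples; HtmlParser/CrawlerNewspaper may need additional entries.
conf = {'pulsar': {'ip': '127.0.0.1', 'port': '6650'}}

crawler = FETimes(conf)
crawler.crawling()  # scrape all three section URLs and publish to Pulsar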
Example 2
import json

from bs4 import BeautifulSoup

class HanKyung:
    """
    Crawls the 한국경제 (Korea Economic Daily) finance sections.
    """

    def __init__(self, conf):
        self.url1 = "https://www.hankyung.com/finance/0104"
        self.url2 = "https://www.hankyung.com/finance/0103"
        self.url3 = "https://www.hankyung.com/finance/0102"
        self.conf = conf
        self.html = HtmlParser(conf)
        self.news = CrawlerNewspaper(conf)
        self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])

    def parseLinkArticle(self, url):
        buf = []
        doc = self.html.getHtml(url)
        soup = BeautifulSoup(doc, 'html.parser')
        ar = soup.find_all('a')
        for v in ar:
            ss = str(v)
            if 'finance/article' in ss and "#" not in ss:
                aa = v.get("href")
                buf.append(str(aa))
        return buf

    def getStringFilter(self, ss):
        # Escape single quotes (the original "\'" replacement was a no-op).
        return ss.replace("'", "\\'")

    def parsingNews(self, url):
        ret = []
        links = self.parseLinkArticle(url)
        for v in links:
            (title, text) = self.news.crawling(v)
            art = Article(self.getStringFilter(title), self.getStringFilter(text), "한국경제")
            print("title : ", title)
            print("text : ", text)
            ret.append(art.toDic())
        return ret

    def crawling(self):
        wp = open("test.jsonl", 'w')
        ret = self.parsingNews(self.url1)
        ret.extend(self.parsingNews(self.url2))
        ret.extend(self.parsingNews(self.url3))

        cl = self.pulsar.getClient()
        pro = self.pulsar.createProcuder(cl, "newspaper")
        for dd in ret:
            jj = json.dumps(dd)
            print("send data : ", jj)
            wp.write(jj + "\n")
            pro.send(jj.encode('utf-8'))
        wp.close()
        cl.close()
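
Because crawling() also mirrors every published record to test.jsonl, the output can be inspected offline. A small replay sketch, assuming only the one-JSON-object-per-line format written above:

import json

with open("test.jsonl", encoding="utf-8") as fp:
    articles = [json.loads(line) for line in fp if line.strip()]
print(len(articles), "articles replayed from test.jsonl")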
Example 3
import json

from bs4 import BeautifulSoup

class Maeil:
    """
    Crawls the 매일경제 (Maeil Business Newspaper) economy section.
    """
    def __init__(self, conf):
        self.url1 = "https://www.mk.co.kr/news/economy"
        self.conf = conf
        self.html = HtmlParser(conf)
        self.news = CrawlerNewspaper(conf)
        self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])
        self.util = Utils()

    def parseLinkArticle(self, url):
        buf = []
        doc = self.html.getHtml(url)
        soup = BeautifulSoup(doc, 'html.parser')
        ar = soup.find_all('a')
        for v in ar:
            ss = str(v)
            if "view" in ss:
                aa = v.get("href")
                if "news/economy" in aa or "news/stock" in aa:
                    buf.append(str(aa))
        return buf

    def parsingNews(self, url):
        ret = []
        # The original passed self.url1 here, silently ignoring the url
        # parameter; use the argument so parsingNews honors its caller.
        links = self.parseLinkArticle(url)
        for v in links:
            try:
                (title, text) = self.news.crawling(v)
                print('title : ', title)
                print("text : ", text)
                art = Article(self.util.getStringFilter(title),
                              self.util.getStringFilter(text), "매일경제")
                ret.append(art.toDic())
            except Exception as e:
                # Skip articles that fail to download or parse.
                print('news crawling error:', e)
        return ret

    def crawling(self):
        ret = self.parsingNews(self.url1)
        print(ret)
        cl = self.pulsar.getClient()
        pro = self.pulsar.createProcuder(cl, "newspaper")
        for dd in ret:
            jj = json.dumps(dd)
            print("send data : ", jj)
            pro.send(jj.encode('utf-8'))
        cl.close()
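
All four crawlers repeat the same publish loop. Assuming PulsarStore behaves as these examples suggest (getClient() returns a closable client, and the producer exposes send(bytes)), the loop could be factored into a shared helper; a sketch, not part of the original code:

import json

def publish(pulsar_store, topic, records):
    """Serialize each record dict to JSON and publish it on the topic."""
    cl = pulsar_store.getClient()
    pro = pulsar_store.createProcuder(cl, topic)  # spelling as in PulsarStore
    try:
        for dd in records:
            pro.send(json.dumps(dd).encode('utf-8'))
    finally:
        cl.close()  # close the client even if a send fails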
Example 4
import json
from urllib.parse import urljoin

from bs4 import BeautifulSoup

class Seoul:
    """
    Crawls the 서울경제 (Seoul Economic Daily) mobile site.
    """
    def __init__(self, conf):
        self.url = "https://m.sedaily.com"
        self.html = HtmlParser(conf)
        self.news = CrawlerNewspaper(conf)
        self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])

    def parseLinkArticle(self, url):
        buf = []
        doc = self.html.getHtml(url)
        soup = BeautifulSoup(doc, 'html.parser')
        ar = soup.find_all('a')
        for v in ar:
            ss = str(v)
            if '/NewsView/' in ss:
                aa = v.get("href")
                hh = self.url + "/" + str(aa)
                buf.append(hh)
        return buf

    def getStringFilter(self, ss):
        # Escape single quotes (the original "\'" replacement was a no-op).
        return ss.replace("'", "\\'")

    def parsingNews(self, url):
        ret = []
        links = self.parseLinkArticle(url)
        for v in links:
            print(v)
            (title, text) = self.news.crawling(v)
            art = Article(self.getStringFilter(title),
                          self.getStringFilter(text), "서울경제")
            print("title : ", title)
            print("text : ", text)
            ret.append(art.toDic())
        return ret

    def crawling(self):
        ret = self.parsingNews(self.url)
        cl = self.pulsar.getClient()
        pro = self.pulsar.createProcuder(cl, "newspaper")
        for dd in ret:
            jj = json.dumps(dd)
            print("send data : ", jj)
            pro.send(jj.encode('utf-8'))
        cl.close()
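
Since all four classes expose the same crawling() entry point, a scheduler can drive them uniformly; a sketch under the same hypothetical conf as above:

conf = {'pulsar': {'ip': '127.0.0.1', 'port': '6650'}}  # hypothetical

for cls in (FETimes, HanKyung, Maeil, Seoul):
    try:
        cls(conf).crawling()
    except Exception as e:
        # Keep one failing site from aborting the whole run.
        print(cls.__name__, "failed:", e)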