def __init__(self, conf):
    """Prepare the SisaNews crawler: listing URLs, parsers, and Pulsar client.

    Args:
        conf: configuration dict; must provide conf['pulsar']['ip'] and
            conf['pulsar']['port'] for the Pulsar connection.
    """
    base = "http://www.sisanews.kr"
    listing = base + "/news/articleList.html?sc_section_code={}&view_type=sm"
    # Two section listing pages to crawl, plus the base URL for resolving links.
    self.url1 = listing.format("S1N17")
    self.url2 = listing.format("S1N16")
    self.burl = base
    self.html = HtmlParser(conf)
    self.news = CrawlerNewspaper(conf)
    self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])
def __init__(self, conf):
    """Prepare the Maeil crawler: economy listing URL, parsers, Pulsar client.

    Args:
        conf: configuration dict; must provide conf['pulsar']['ip'] and
            conf['pulsar']['port'] for the Pulsar connection.
    """
    self.conf = conf
    # Economy-section listing page of the Maeil Business Newspaper.
    self.url1 = "https://www.mk.co.kr/news/economy"
    self.html = HtmlParser(conf)
    self.news = CrawlerNewspaper(conf)
    self.util = Utils()
    self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])
def __init__(self, conf):
    """Prepare the HanKyung crawler: three finance sections, parsers, Pulsar.

    Args:
        conf: configuration dict; must provide conf['pulsar']['ip'] and
            conf['pulsar']['port'] for the Pulsar connection.
    """
    section = "https://www.hankyung.com/finance/{}"
    # Three finance section listing pages to crawl.
    self.url1 = section.format("0104")
    self.url2 = section.format("0103")
    self.url3 = section.format("0102")
    self.conf = conf
    self.html = HtmlParser(conf)
    self.news = CrawlerNewspaper(conf)
    self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])
class FETimes:
    """금융경제신문 (Financial & Economic Times) crawler.

    Collects article links from three section listing pages, scrapes each
    article, and publishes the results to the "newspaper" Pulsar topic.
    """

    def __init__(self, conf):
        """Set up listing URLs, HTML/article parsers, and the Pulsar store."""
        self.url1 = "http://www.fetimes.co.kr/news/articleList.html?sc_section_code=S1N16&view_type=sm"
        self.url2 = "http://www.fetimes.co.kr/news/articleList.html?sc_section_code=S1N17&view_type=sm"
        self.url3 = "http://www.fetimes.co.kr/news/articleList.html?sc_section_code=S1N18&view_type=sm"
        self.burl = "http://www.fetimes.co.kr"
        self.html = HtmlParser(conf)
        self.news = CrawlerNewspaper(conf)
        self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])

    def parseLinkArtcle(self, url):
        """Return absolute article URLs found on the listing page *url*."""
        buf = []
        doc = self.html.getHtml(url)
        soup = BeautifulSoup(doc, 'html.parser')
        for v in soup.find_all('a'):
            # Only anchors pointing at an article-view page are articles;
            # hrefs are site-relative, so prefix the base URL.
            if 'news/articleView' in str(v):
                buf.append(self.burl + str(v.get("href")))
        return buf

    def getStringFilter(self, ss):
        """Return *ss* unchanged.

        NOTE(review): "\'" == "'" in Python, so this replace is a no-op.
        Kept as-is because json.dumps performs any quoting needed downstream.
        """
        ss = ss.replace("'", "\'")
        return ss

    def parsingNews(self, url):
        """Scrape every article linked from *url*; return a list of article dicts."""
        ret = []
        for link in self.parseLinkArtcle(url):
            (title, text) = self.news.crawling(link)
            # Strip the boilerplate copyright footer from the article body.
            text = text.replace("저작권자 © 금융경제신문 무단전재 및 재배포 금지", "")
            art = Article(self.getStringFilter(title), self.getStringFilter(text), "금융경제")
            ret.append(art.toDic())
            print(art.toDic())
        return ret

    def crawling(self):
        """Crawl all three sections and publish each article to Pulsar.

        The client is now closed in a finally block so it is released even
        when producer creation or a send fails (was leaked on exception).
        """
        ret = self.parsingNews(self.url1)
        ret.extend(self.parsingNews(self.url2))
        ret.extend(self.parsingNews(self.url3))
        cl = self.pulsar.getClient()
        try:
            pro = self.pulsar.createProcuder(cl, "newspaper")
            for dd in ret:
                jj = json.dumps(dd)
                print("send data : ", jj)
                pro.send(jj.encode('utf-8'))
        finally:
            cl.close()
class HanKyung:
    """한국경제 (Korea Economic Daily) finance crawler.

    Collects article links from three finance section pages, scrapes each
    article, mirrors the payloads to a local ``test.jsonl`` file, and
    publishes them to the "newspaper" Pulsar topic.
    """

    def __init__(self, conf):
        """Set up section URLs, HTML/article parsers, and the Pulsar store."""
        self.url1 = "https://www.hankyung.com/finance/0104"
        self.url2 = "https://www.hankyung.com/finance/0103"
        self.url3 = "https://www.hankyung.com/finance/0102"
        self.conf = conf
        self.html = HtmlParser(conf)
        self.news = CrawlerNewspaper(conf)
        self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])

    def parseLinkArtcle(self, url):
        """Return article URLs found on the listing page *url*.

        Anchors containing a fragment ("#") are skipped — they are in-page
        links, not article pages. Hrefs here are already absolute.
        """
        buf = []
        doc = self.html.getHtml(url)
        soup = BeautifulSoup(doc, 'html.parser')
        for v in soup.find_all('a'):
            ss = str(v)
            if 'finance/article' in ss and "#" not in ss:
                buf.append(str(v.get("href")))
        return buf

    def getStringFilter(self, ss):
        """Return *ss* unchanged.

        NOTE(review): "\'" == "'" in Python, so this replace is a no-op.
        Kept as-is because json.dumps performs any quoting needed downstream.
        """
        ss = ss.replace("'", "\'")
        return ss

    def parsingNews(self, url):
        """Scrape every article linked from *url*; return a list of article dicts."""
        ret = []
        links = self.parseLinkArtcle(url)
        for v in links:
            (title, text) = self.news.crawling(v)
            art = Article(self.getStringFilter(title), self.getStringFilter(text), "한국경제")
            print("title : ", title)
            print("text : ", text)
            ret.append(art.toDic())
        return ret

    def crawling(self):
        """Crawl all three sections and publish each article to Pulsar.

        Fixes two resource leaks: the debug file handle is now managed by a
        ``with`` block (it was left open if anything raised), and the Pulsar
        client is closed in a finally block.
        """
        ret = self.parsingNews(self.url1)
        ret.extend(self.parsingNews(self.url2))
        ret.extend(self.parsingNews(self.url3))
        cl = self.pulsar.getClient()
        try:
            pro = self.pulsar.createProcuder(cl, "newspaper")
            # Debug capture: mirror everything sent to Pulsar into a local file.
            with open("test.jsonl", 'w') as wp:
                for dd in ret:
                    jj = json.dumps(dd)
                    print("send data : ", jj)
                    wp.write(jj + "\n")
                    pro.send(jj.encode('utf-8'))
        finally:
            cl.close()
class Maeil:
    """매일경제 (Maeil Business Newspaper) crawler.

    Collects economy/stock article links from the economy listing page,
    scrapes each article, and publishes the results to the "newspaper"
    Pulsar topic.
    """

    def __init__(self, conf):
        """Set up the listing URL, HTML/article parsers, and the Pulsar store."""
        self.url1 = "https://www.mk.co.kr/news/economy"
        self.conf = conf
        self.html = HtmlParser(conf)
        self.news = CrawlerNewspaper(conf)
        self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])
        self.util = Utils()

    def parseLinkArtcle(self, url):
        """Return economy/stock article URLs found on the listing page *url*."""
        buf = []
        doc = self.html.getHtml(url)
        soup = BeautifulSoup(doc, 'html.parser')
        for v in soup.find_all('a'):
            if "view" in str(v):
                aa = v.get("href")
                # Keep only economy and stock articles (hrefs are absolute here).
                if "news/economy" in aa or "news/stock" in aa:
                    buf.append(str(aa))
        return buf

    def parsingNews(self, url):
        """Scrape every article linked from *url*; return a list of article dicts.

        Failures on a single article are logged and skipped so one broken
        page does not abort the whole crawl.
        """
        ret = []
        # BUG FIX: previously always crawled self.url1, ignoring *url*.
        links = self.parseLinkArtcle(url)
        for v in links:
            try:
                (title, text) = self.news.crawling(v)
                print('title : ', title)
                print("text : ", text)
                art = Article(self.util.getStringFilter(title), self.util.getStringFilter(text), "매일경제")
                ret.append(art.toDic())
            except Exception as err:
                # Was a bare except: now narrowed so KeyboardInterrupt/SystemExit
                # propagate, and the actual error is visible in the log.
                print('new crawling error ', err)
        return ret

    def crawling(self):
        """Crawl the economy section and publish each article to Pulsar.

        The client is now closed in a finally block so it is released even
        when producer creation or a send fails (was leaked on exception).
        """
        ret = self.parsingNews(self.url1)
        print(ret)
        cl = self.pulsar.getClient()
        try:
            pro = self.pulsar.createProcuder(cl, "newspaper")
            for dd in ret:
                jj = json.dumps(dd)
                print("send data : ", jj)
                pro.send(jj.encode('utf-8'))
        finally:
            cl.close()
class Seoul:
    """서울경제 (Seoul Economic Daily) crawler, via the mobile site.

    Collects article links from the mobile front page, scrapes each article,
    and publishes the results to the "newspaper" Pulsar topic.
    """

    def __init__(self, conf):
        """Set up the base URL, HTML/article parsers, and the Pulsar store."""
        self.url = "https://m.sedaily.com"
        self.html = HtmlParser(conf)
        self.news = CrawlerNewspaper(conf)
        self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])

    def parseLinkArtcle(self, url):
        """Return absolute article URLs found on the page *url*."""
        buf = []
        doc = self.html.getHtml(url)
        soup = BeautifulSoup(doc, 'html.parser')
        for v in soup.find_all('a'):
            if '/NewsView/' in str(v):
                aa = v.get("href")
                # NOTE(review): if href starts with "/" this yields "//" in the
                # path; kept byte-identical since the site evidently tolerates it.
                hh = self.url + "/" + str(aa)
                buf.append(hh)
        return buf

    def getStringFilter(self, ss):
        """Return *ss* unchanged.

        NOTE(review): "\'" == "'" in Python, so this replace is a no-op.
        Kept as-is because json.dumps performs any quoting needed downstream.
        """
        ss = ss.replace("'", "\'")
        return ss

    def parsingNews(self, url):
        """Scrape every article linked from *url*; return a list of article dicts."""
        ret = []
        links = self.parseLinkArtcle(url)
        for v in links:
            print(v)
            (title, text) = self.news.crawling(v)
            art = Article(self.getStringFilter(title), self.getStringFilter(text), "서울경제")
            print("title : ", title)
            print("text : ", text)
            ret.append(art.toDic())
        return ret

    def crawling(self):
        """Crawl the front page and publish each article to Pulsar.

        The client is now closed in a finally block so it is released even
        when producer creation or a send fails (was leaked on exception).
        """
        ret = self.parsingNews(self.url)
        cl = self.pulsar.getClient()
        try:
            pro = self.pulsar.createProcuder(cl, "newspaper")
            for dd in ret:
                jj = json.dumps(dd)
                print("send data : ", jj)
                pro.send(jj.encode('utf-8'))
        finally:
            cl.close()
def __init__(self, conf):
    """Prepare the mobile Seoul Economic Daily crawler from *conf*.

    Args:
        conf: configuration dict; must provide conf['pulsar']['ip'] and
            conf['pulsar']['port'] for the Pulsar connection.
    """
    # Mobile front page is both the crawl entry point and the link base.
    self.url = "https://m.sedaily.com"
    self.news = CrawlerNewspaper(conf)
    self.html = HtmlParser(conf)
    self.pulsar = PulsarStore(conf['pulsar']['ip'], conf['pulsar']['port'])