import datetime
import json
import re
import time
from datetime import date
from functools import wraps
from urllib.parse import urljoin

import requests as r
from lxml import etree

# Project-internal objects assumed to be defined elsewhere in the repo:
# News, NetworkError, Similarity, header (shared request headers), and the
# config-driven ignore list, keyword dict and predict() classifier.


def revise(row: News):
    """Error correction: regroup a news item according to the keyword
    distribution in the config file."""
    title = row.title
    content = row.abstract or ""
    for ign in ignore:
        if ign in title:
            return None
    context = title + content
    pre = predict(context)
    if pre != "金融":
        return row
    company = keyword["商业"]
    japan = keyword["日本"]
    economy = keyword["宏观"]
    finance = keyword["金融"]
    for key in japan:
        if key in title:
            # Keep keyword a plain string; the original assigned ["日本"],
            # a list, unlike every other branch.
            row.keyword = "日本"
            return row
    for key in company:
        if key in title or key in content:
            row.keyword = "商业"
            return row
    for key in finance:
        if key in title or key in content:
            # Tentative label only: the macro keywords below still take
            # precedence, hence break instead of return.
            row.keyword = "金融"
            break
    for key in economy:
        if key in title or key in content:
            row.keyword = "宏观"
            return row
    return row
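# revise() leans on three module-level objects loaded from the project's
# config; their format is not shown in this excerpt. Below is a minimal
# stand-in sketch of the shapes the function assumes -- the keyword values
# and the predict() stub are illustrative assumptions, not the project's
# actual config or classifier.
ignore = ["广告", "直播"]  # drop any news whose title contains these

keyword = {  # section name -> keywords that reassign a story to that section
    "日本": ["东京", "日元"],
    "商业": ["公司", "营收"],
    "金融": ["股市", "基金"],
    "宏观": ["GDP", "央行"],
}

def predict(context: str) -> str:
    """Stand-in for the project's text classifier; returns a section name."""
    return "金融" if "基金" in context else "宏观"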
def extract_content(articles, keyword=None):
    news_collection = []
    for a in articles:
        head = a.cssselect("div.text > p.title > a")[0]
        title = head.text.strip()
        url = head.get("href")
        abstract = a.cssselect("div.text > p.info")[0]
        # Prefer the title attribute; fall back to the node text, then "".
        abstract = abstract.get("title") or abstract.text or ""
        abstract = re.sub(r"[\r\n\s]", "", abstract)
        abstract = abstract.split("】")[-1]
        pattern = re.compile(r"[(](.*?)[)]", re.S)
        source = re.findall(pattern, abstract) if abstract else None
        source = source[-1] if source else "东方财富"
        save_date = a.cssselect("div.text > p.time")[0].text.strip()
        save_date = save_date.replace("月", "/").replace("日", "")
        save_date = f"{date.today().year}/{save_date}"
        save_date = time.strptime(save_date, "%Y/%m/%d %H:%M")
        if keyword:
            news = News(title=title, abstract=abstract, url=url, source=source,
                        savedate=save_date, keyword=keyword)
        else:
            news = News(title=title, abstract=abstract, url=url, source=source,
                        savedate=save_date)
        news_collection.append(news)
    return news_collection
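# Hypothetical driver for extract_content(): select the article nodes from a
# listing page and hand them over. The URL and the CSS selector are
# illustrative assumptions, not the project's actual values.
response = r.get("https://finance.eastmoney.com/a/cjdd.html", headers=header)
listing = etree.HTML(response.content.decode("utf-8"))
articles = listing.cssselect("div.repeatList li")  # each li holds div.text > p.title/p.info/p.time
collected = extract_content(articles, keyword="金融")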
def collect(self):
    url = self.format_url()
    header["Referer"] = self.refer
    response = r.get(url, headers=header)
    if response.status_code != 200:
        raise NetworkError(
            f"Eastmoney request failed\n\tstatus_code: {response.status_code}\n\turl: {url}"
        )
    content = response.content.decode("utf-8")
    data = json.loads(content)["Data"]
    news_collection = []
    if not data:
        return news_collection
    for d in data:
        # Strip the <em>...</em> highlight tags; the original character class
        # r"[<em></em>]" also deleted plain letters e, m, <, > and / anywhere
        # in the title.
        title = re.sub(r"</?em>", "", d["Title"])
        abstract = d["Content"]
        url = d["ArticleUrl"]
        source = d["NickName"]
        save_date = d["ShowTime"]
        news = News(title=title, abstract=abstract, url=url, savedate=save_date,
                    source=source, keyword=self.keyword)
        if "期权" not in news.title:
            news_collection.append(news)
    return news_collection
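# Minimal fixture matching the fields this collect() reads from the Eastmoney
# search API; every value below is invented for illustration only.
sample = {"Data": [{
    "Title": "<em>宏观</em>经济数据点评",
    "Content": "……",
    "ArticleUrl": "https://finance.eastmoney.com/a/xxx.html",
    "NickName": "东方财富",
    "ShowTime": "2019-10-28 14:30",
}]}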
def collect(self):
    url = self.format_url()
    response = r.get(url, headers=header)
    if response.status_code != 200:
        raise NetworkError(
            f"China Securities Journal request failed\n\tstatus_code: {response.status_code}\n\turl: {url}"
        )
    html = etree.HTML(response.content)
    articles = html.cssselect("ul.list-lm > li")
    news_collections = []
    for a in articles:
        link = a.cssselect("a")[0]
        title = link.text
        # urljoin handles absolute and root-relative hrefs; os.path.join did not.
        url = urljoin(self.url, link.get("href"))
        save_date = a.cssselect("span")[0].text
        save_date = time.strptime(save_date, "%y-%m-%d %H:%M")
        response = r.get(url)
        html = etree.HTML(response.content)
        try:
            source = html.cssselect(
                "body > div:nth-child(9) > div.box835.hidden.left > div.article > div.info > "
                "p:nth-child(2) > em:nth-child(2)")[0].text[3:]
        except (IndexError, TypeError):
            source = "中国证券报·中证网"
        # Take the first non-empty paragraph as the abstract; some article
        # pages nest the paragraphs one <div> deeper.
        abstracts = html.cssselect("div.article-t.hidden > p")
        if not abstracts:
            abstracts = html.cssselect("div.article-t.hidden > div > p")
        abstract = None
        for p in abstracts:
            if p.text:
                abstract = p.text.strip()
                break
        news = News(title=title, abstract=abstract, url=url, savedate=save_date,
                    source=source, keyword=self.section)
        news_collections.append(news)
    return news_collections
def collect(self):
    url = self.url
    cookie = self.get_cookie()
    data = {
        "index": 0,
        "f": "3x",
        "sslm": "all",
        "ssfw": "ybbt",
        "gjz": self.keyword,
        "sjfw": 1,
        "page": self.page,
    }
    session = r.session()
    session.cookies = cookie
    resp = session.post(url, headers=header, data=data)
    content = resp.content.decode("utf-8")
    if not content:
        # An empty body means the cookie has expired; refresh it and return an
        # empty result instead of None so callers can iterate safely.
        self.get_cookie()
        return []
    html = etree.HTML(content)
    articles = html.cssselect("#table1 > tr")
    news_collections = []
    for a in articles:
        head = a.cssselect("td > div.tab_divttl > span:nth-child(2) > a")[0]
        url = "http://www.hibor.org" + head.get("href")
        title = head.get("title")
        save_date = a.cssselect(
            "td > div.tab_divtxt > span:nth-child(1) > strong")[0].text
        source = a.cssselect("td > div.tab_divtxt > span:nth-child(3)")[0].text
        source = f"慧博-{source[3:]}"
        page = session.post(url, headers=header).content
        page = str(page, encoding="gbk")
        page = etree.HTML(page)
        abstract = page.xpath("//div[@class='p_main']/p/span/text()")
        abstract = "".join(abstract)
        news = News(title=title, abstract=abstract, url=url, savedate=save_date,
                    source=source, keyword="资产配置")
        if "期货" not in title:
            news_collections.append(news)
        time.sleep(5)  # throttle the detail-page requests
    session.close()
    return news_collections
def collect(self):
    url = self.format_url()
    response = r.get(url, headers=header)
    if response.status_code != 200:
        raise NetworkError(
            f"Sina Finance request failed\n\tstatus_code: {response.status_code}\n\turl: {url}")
    data = json.loads(response.content)["result"]["data"]
    news_collections = []
    for d in data:
        save_date = datetime.datetime.fromtimestamp(int(d["mtime"]))
        news = News(title=d["title"], abstract=d["intro"], url=d["url"],
                    source=d["media_name"], savedate=save_date,
                    keyword=self.section)
        # Some entries put a bare link in the intro field; skip those.
        if not str(news.abstract).startswith("http"):
            news_collections.append(news)
    return news_collections
def collect(self):
    url = self.format_url()
    response = r.get(url, headers=header)
    if response.status_code != 200:
        raise NetworkError(
            f"Kyodo News request failed\n\tstatus_code: {response.status_code}\n\turl: {url}")
    html = etree.HTML(response.content.decode("utf-8"))
    articles = html.cssselect("#js-postListItems > li")
    news_collection = []
    for a in articles:
        title = a.cssselect("a > h3")[0].text
        url = "https://china.kyodonews.net" + a.cssselect("a")[0].get("href")
        save_date = a.cssselect("p.time")[0].text
        save_date = re.sub(r"[\s日|]", "", save_date)
        save_date = re.sub(r"[年月]", "/", save_date)
        save_date = time.strptime(save_date, "%Y/%m/%d-%H:%M")
        source = a.cssselect("a")[-1].text
        response = r.get(url)
        html = etree.HTML(response.content)
        abstract = html.cssselect("div.article-body > p:nth-child(1)")[0].text or ""
        abstract = re.sub(r"[\r\s\t]", "", abstract)
        abstract = abstract.split("】")[-1]
        news = News(title=title, abstract=abstract, url=url, savedate=save_date,
                    source=source, keyword=self.section)
        # if any({"日本" in news.title, "日本" in news.abstract}) and
        if "快讯" not in news.title:
            news_collection.append(news)
    return news_collection
def collect(self):
    url = self.format_url()
    header["Referer"] = url
    response = r.get(url, headers=header)
    if response.status_code != 200:
        raise NetworkError(
            f"Caixin request failed\n\tstatus_code: {response.status_code}\n\turl: {url}")
    html = etree.HTML(response.content.decode("utf-8"))
    articles = html.cssselect("#listArticle > div.boxa")
    news_collections = []
    for a in articles:
        title = a.cssselect("h4 > a")[0].text
        url = a.cssselect("h4 > a")[0].get("href")
        abstract = a.cssselect("p")[0].text
        save_date = a.cssselect("span")[0].text[-12:]
        save_date = save_date.replace("月", "/").replace("日", "")
        save_date = f"{date.today().year}/{save_date}"
        save_date = time.strptime(save_date, "%Y/%m/%d %H:%M")
        news = News(title=title, abstract=abstract, url=url, savedate=save_date,
                    source="财新网", keyword=self.section)
        news_collections.append(news)
    return news_collections
            # Tail of Similarity.compare: n iterates over the incoming news
            # items, simple holds the reduced titles seen so far, and all_news
            # collects the output.
            if n.keyword == "资产配置":
                print(n)
                all_news.append(n)
                continue  # presumably intended: without it the item is appended twice
            title = Similarity.reduce(n.title + str(n.abstract))
            for s in simple:
                ratio = Similarity.check(s, title)
                if ratio > 30:
                    break
            else:
                all_news.append(n)
        return all_news

    @staticmethod
    def check_similar(inner):
        @wraps(inner)
        def func(*args, **kwargs):
            ret = inner(*args, **kwargs)
            different = Similarity.compare(ret)
            return different
        return func


if __name__ == '__main__':
    print(
        Similarity.compare(
            News(title="区块链“弄潮” 基金揭秘三路径提振个股业绩",
                 abstract="10月28日,A股三大股指全线收涨,盘中最抢眼的当属区块链板块,早盘集合竞价便掀起涨停潮。"
                          "截至收盘,区块链指数单日上涨8.85%,板块中广博股份、晨鑫科技、金冠股份等40只个股涨停。"
                          "虽然市场热情高涨,但多位公私募基金经理却表示对区块链了解并不深入,也并无相关操作。")))
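# Hypothetical usage of the check_similar decorator: wrap an aggregate
# collector so its combined results are de-duplicated before being returned.
# collect_all and the two collector classes are illustrative names only.
@Similarity.check_similar
def collect_all():
    news = []
    for collector in (EastmoneyCollector(), CaixinCollector()):
        news.extend(collector.collect())
    return news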