Example #1
def revise(row: News):
    """Error correction: regroup a news item according to the keyword
    distribution in the config file."""
    title = row.title
    content = str(row.abstract)

    # Drop the item outright if the title contains an ignored word.
    for ign in ignore:
        if ign in title:
            return None

    context = title + content
    pre = predict(context)
    if pre != "金融":
        return row

    company = keyword["商业"]
    japan = keyword["日本"]
    economy = keyword["宏观"]
    finance = keyword["金融"]
    for key in japan:
        if key in title:
            row.keyword = "日本"
            return row
    for key in company:
        if key in title or key in content:
            row.keyword = "商业"
            return row
    # "金融" is only a tentative label: a later "宏观" match overrides it.
    for key in finance:
        if key in title or key in content:
            row.keyword = "金融"
            break
    for key in economy:
        if key in title or key in content:
            row.keyword = "宏观"
            return row
    return row
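
revise() leans on several names defined elsewhere in the project (News, keyword, ignore, predict). A minimal sketch of what they plausibly look like, assumed purely for reading the snippet:

# Assumed context for revise(); field names mirror the snippet, values are
# illustrative only.
from dataclasses import dataclass

@dataclass
class News:
    title: str
    abstract: str = ""
    url: str = ""
    source: str = ""
    savedate: object = None
    keyword: str = ""

# keyword maps a section name to the words that mark it; ignore lists
# title words that disqualify an item outright.
keyword = {
    "日本": ["东京", "日元"],
    "商业": ["并购", "营收"],
    "宏观": ["GDP", "货币政策"],
    "金融": ["股指", "债券"],
}
ignore = ["广告"]

def predict(text: str) -> str:
    """Stand-in for the real classifier; always answers "金融" here."""
    return "金融"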
Example #2
def extract_content(articles, keyword=None):
    news_collection = []
    for a in articles:
        head = a.cssselect("div.text > p.title > a")[0]
        title = head.text.strip()
        url = head.get("href")
        abstract = a.cssselect("div.text > p.info")[0]
        abstract = abstract.get("title") or abstract.text
        abstract = re.sub(r"[\r\n\s]", "", abstract)
        abstract = abstract.split("】")[-1]
        pattern = re.compile(r"[(](.*?)[)]", re.S)
        source = re.findall(pattern, abstract) if abstract else None
        source = source[-1] if source else "东方财富"
        save_date = a.cssselect("div.text > p.time")[0].text.strip()
        # "10月28日 14:30" -> "2019/10/28 14:30"; assumes the current year.
        save_date = save_date.replace("月", "/").replace("日", "")
        save_date = f"{date.today().year}/{save_date}"
        save_date = time.strptime(save_date, "%Y/%m/%d %H:%M")
        if keyword:
            news = News(title=title, abstract=abstract, url=url, source=source,
                        savedate=save_date, keyword=keyword)
        else:
            news = News(title=title, abstract=abstract, url=url, source=source,
                        savedate=save_date)

        news_collection.append(news)
    return news_collection
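
A hypothetical call site for extract_content, assuming lxml with the cssselect package installed; the list-page URL and the outer selector are stand-ins, not taken from the project:

from lxml import etree
import requests

resp = requests.get("https://example.invalid/news-list")  # placeholder URL
html = etree.HTML(resp.content)
articles = html.cssselect("ul.news-list > li")  # each item wraps a div.text block
items = extract_content(articles, keyword="金融")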
Example #3
    def collect(self):
        url = self.format_url()
        header["Referer"] = self.refer
        response = r.get(url, headers=header)
        if response.status_code != 200:
            raise NetworkError(
                f"东方财富网访问失败\n\tstatus_code: {response.status_code}\n\turl:{url}"
            )
        content = response.content.decode("utf-8")
        data = json.loads(content)["Data"]

        news_collection = []

        if not data:
            return news_collection

        for d in data:
            title = re.sub(r"</?em>", "", d["Title"])  # strip <em> highlight tags
            abstract = d["Content"]
            url = d["ArticleUrl"]
            source = d["NickName"]
            save_date = d["ShowTime"]
            news = News(title=title,
                        abstract=abstract,
                        url=url,
                        savedate=save_date,
                        source=source,
                        keyword=self.keyword)
            if "期权" not in news.title:
                news_collection.append(news)
        return news_collection
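
For reference, the payload shape this collect() expects, reconstructed from the keys it reads; the values are illustrative only:

sample = {
    "Data": [
        {
            "Title": "示例<em>标题</em>",  # highlight tags get stripped
            "Content": "摘要正文……",
            "ArticleUrl": "https://example.invalid/a/1.html",
            "NickName": "东方财富",
            "ShowTime": "2019-10-28 14:30:00",
        }
    ]
}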
Example #4
    def collect(self):
        url = self.format_url()
        response = r.get(url, headers=header)
        if response.status_code != 200:
            raise NetworkError(
                f"中国证券报访问失败\n\tstatus_code: {response.status_code}\n\turl:{url}"
            )
        content = response.content
        html = etree.HTML(content)
        articles = html.cssselect("ul.list-lm > li")

        news_collections = []

        for a in articles:
            title = a.cssselect("a")[0].text
            url = a.cssselect("a")[0].get("href")
            # urljoin (urllib.parse) resolves relative hrefs; os.path.join is
            # for filesystem paths and mishandles absolute URL fragments.
            url = urljoin(self.url, url)
            save_date = a.cssselect("span")[0].text
            save_date = time.strptime(save_date, "%y-%m-%d %H:%M")
            response = r.get(url)
            html = etree.HTML(response.content)
            try:
                source = html.cssselect(
                    "body > div:nth-child(9) > div.box835.hidden.left > div.article > div.info > "
                    "p:nth-child(2) > em:nth-child(2)")[0].text[3:]
            except IndexError:
                source = "中国证券报·中证网"

            # Take the first non-empty paragraph, trying the flat layout
            # first and the nested one as a fallback.
            abstract = None
            for selector in ("div.article-t.hidden > p",
                             "div.article-t.hidden > div > p"):
                for p in html.cssselect(selector):
                    if p.text:
                        abstract = p.text.strip()
                        break
                if abstract:
                    break

            news = News(title=title,
                        abstract=abstract,
                        url=url,
                        savedate=save_date,
                        source=source,
                        keyword=self.section)
            news_collections.append(news)
        return news_collections
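
The abstract lookup above is the fragile part of this scraper. The same pattern could be factored into a small helper; first_paragraph_text is a name invented here, not part of the project:

def first_paragraph_text(html, selectors):
    """Return the first non-empty paragraph text under any selector, else None."""
    for selector in selectors:
        for p in html.cssselect(selector):
            if p.text:
                return p.text.strip()
    return None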
Example #5
    def collect(self):
        url = self.url
        cookie = self.get_cookie()

        data = {
            "index": 0,
            "f": "3x",
            "sslm": "all",
            "ssfw": "ybbt",
            "gjz": self.keyword,
            "sjfw": 1,
            "page": self.page
        }
        session = r.session()
        session.cookies = cookie
        resp = session.post(url, headers=header, data=data)
        content = resp.content.decode("utf-8")
        if not content:
            # An empty body usually means the cookie has expired; refresh it
            # and return an empty batch so the caller can retry.
            self.get_cookie()
            return []
        content = etree.HTML(content)
        articles = content.cssselect("#table1 > tr")

        news_collections = []
        for a in articles:
            head = a.cssselect(
                "td > div.tab_divttl > span:nth-child(2) > a")[0]
            url = "http://www.hibor.org" + head.get("href")
            title = head.get("title")
            save_date = a.cssselect(
                "td > div.tab_divtxt > span:nth-child(1) > strong")[0].text
            source = a.cssselect(
                "td > div.tab_divtxt > span:nth-child(3)")[0].text
            source = f"慧博-{source[3:]}"
            page = session.post(url, headers=header).content
            page = str(page, encoding="gbk")
            page = etree.HTML(page)
            abstract = page.xpath("//div[@class='p_main']/p/span/text()")
            abstract = "".join(abstract)
            news = News(title=title,
                        abstract=abstract,
                        url=url,
                        savedate=save_date,
                        source=source,
                        keyword="资产配置")
            if "期货" not in title:
                news_collections.append(news)
            time.sleep(5)
        session.close()
        return news_collections
Example #6
    def collect(self):
        url = self.format_url()
        response = r.get(url, headers=header)
        if response.status_code != 200:
            raise NetworkError(f"新浪财经网访问失败\n\tstatus_code: {response.status_code}\n\turl:{url}")
        content = response.content
        content = json.loads(content)
        data = content["result"]["data"]
        data = [
            [d["title"], d["intro"], d["url"], d["media_name"],
             datetime.datetime.fromtimestamp(int(d["mtime"]))]
            for d in data
        ]

        news_collections = []

        for d in data:
            title, abstract, url, source, save_date = d
            news = News(title=title, abstract=abstract, url=url, source=source,
                        savedate=save_date, keyword=self.section)
            if not news.abstract.startswith("http"):  # drop items whose intro is just a link
                news_collections.append(news)
        return news_collections
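
The Sina feed shape this collect() reads, again reconstructed from the accessed keys; values are illustrative:

sample = {
    "result": {
        "data": [
            {
                "title": "示例标题",
                "intro": "摘要……",
                "url": "https://finance.sina.com.cn/...",
                "media_name": "新浪财经",
                "mtime": "1572249600",  # Unix timestamp as a string
            }
        ]
    }
}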
Example #7
    def collect(self):
        url = self.format_url()
        response = r.get(url, headers=header)
        if response.status_code != 200:
            raise NetworkError(
                f"共同网访问失败\n\tstatus_code: {response.status_code}\n\turl:{url}")
        content = response.content.decode("utf-8")
        html = etree.HTML(content)
        articles = html.cssselect("#js-postListItems > li")

        news_collection = []

        for a in articles:
            title = a.cssselect("a > h3")[0].text
            url = "https://china.kyodonews.net" + a.cssselect("a")[0].get(
                "href")
            save_date = a.cssselect("p.time")[0].text
            save_date = re.sub(r"[\s日|]", "", save_date)
            save_date = re.sub(r"[年月]", "/", save_date)
            save_date = time.strptime(save_date, "%Y/%m/%d-%H:%M")
            source = a.cssselect("a")[-1].text
            response = r.get(url)
            html = etree.HTML(response.content)
            abstract = html.cssselect(
                "div.article-body > p:nth-child(1)")[0].text
            abstract = re.sub(r"[\r\s\t]", "", abstract)
            abstract = abstract.split("】")[-1]
            news = News(title=title,
                        abstract=abstract,
                        url=url,
                        savedate=save_date,
                        source=source,
                        keyword=self.section)
            # if any({"日本" in news.title, "日本" in news.abstract}) and
            if "快讯" not in news.title:
                news_collection.append(news)
        return news_collection
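
The three date fixups are easiest to follow on a concrete value; the raw string below is an assumption about the Kyodo list-page format, inferred from the regexes:

import re
import time

raw = "2019年10月28日 - 14:30"      # assumed raw format
s = re.sub(r"[\s日|]", "", raw)     # "2019年10月28-14:30"
s = re.sub(r"[年月]", "/", s)       # "2019/10/28-14:30"
parsed = time.strptime(s, "%Y/%m/%d-%H:%M")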
Example #8
    def collect(self):
        url = self.format_url()
        header["Referer"] = url
        response = r.get(url, headers=header)
        if response.status_code != 200:
            raise NetworkError(f"财新网访问失败\n\tstatus_code: {response.status_code}\n\turl:{url}")
        content = response.content.decode("utf-8")
        html = etree.HTML(content)
        articles = html.cssselect("#listArticle > div.boxa")

        news_collections = []

        for a in articles:
            title = a.cssselect("h4 > a")[0].text
            url = a.cssselect("h4 > a")[0].get("href")
            abstract = a.cssselect("p")[0].text
            save_date = a.cssselect("span")[0].text[-12:]
            save_date = save_date.replace("月", "/")
            save_date = save_date.replace("日", "")
            save_date = f"{date.today().year}/{save_date}"
            save_date = time.strptime(save_date, "%Y/%m/%d %H:%M")
            news = News(title=title, abstract=abstract, url=url, savedate=save_date, source="财新网", keyword=self.section)
            news_collections.append(news)
        return news_collections
Example #9
            if n.keyword == "资产配置":
                print(n)
                all_news.append(n)
            title = Similarity.reduce(n.title + str(n.abstract))
            for s in simple:
                ratio = Similarity.check(s, title)
                if ratio > 30:
                    break
            else:
                all_news.append(n)
        return all_news

    @staticmethod
    def check_similar(inner):
        """Decorator: run `inner`, then prune near-duplicates from its result."""
        @wraps(inner)
        def func(*args, **kwargs):
            ret = inner(*args, **kwargs)
            different = Similarity.compare(ret)
            return different

        return func
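
check_similar is meant to wrap a collector; a hypothetical usage, where spiders stands in for the scraper instances from the earlier examples:

@Similarity.check_similar
def collect_all():
    news = []
    for spider in spiders:  # hypothetical list of scraper instances
        news.extend(spider.collect())
    return news

unique_news = collect_all()  # near-duplicates pruned by Similarity.compare()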


if __name__ == '__main__':
    print(
        Similarity.compare(
            News(title="区块链“弄潮” 基金揭秘三路径提振个股业绩",
                 abstract="10月28日,A股三大股指全线收涨,盘中最抢眼的当属区块链板块,早盘集合竞价便掀起涨停潮。"
                 "截至收盘,区块链指数单日上涨8.85%,板块中广博股份、晨鑫科技、金冠股份等40只个股涨停。"
                 "虽然市场热情高涨,但多位公私募基金经理却表示对区块链了解并不深入,也并无相关操作。")))