@classmethod
def news_analyze(cls, query_body):
    """
    Analyze a single news article: look up the article content precisely by
    URL, first in MySQL, then falling back to crawling it from the web.
    :param query_body: dict with "news_url" and "news_src" keys
    :return: the article content as a string
    """
    news_content = ""
    news_url = query_body["news_url"]
    news_src = query_body["news_src"]
    # Whitelist mapping from source name to its database.table.
    src_dict = {
        "新浪": "sina_news.sina_mid",
        "网易": "wangyi_news.wangyi_mid",
        "腾讯": "qq_news.qq_mid",
        "搜狐": "souhu_news.souhu_mid",
        "新华": "xinhua_news.xinhua_mid"
    }
    try:
        table = src_dict[news_src]
        # The table name comes from the whitelist above and is interpolated
        # directly; the URL is bound as a query parameter to avoid SQL injection.
        sql_statement = "SELECT * FROM %s WHERE news_link = %%s;" % table
        conn = MysqlHelper.create_conn()
        cur = conn.cursor()
        cur.execute(sql_statement, (news_url,))
        mysql_res = cur.fetchall()
        if len(mysql_res) > 0:
            # Column 3 holds the article body (see the import script below).
            news_content = mysql_res[0][3]
        else:
            # Not in the database: crawl the page from the web instead,
            # fetching it with the Xinhuanet session.
            news_content = Html2Article.url2article(news_url)
    except Exception:
        # Lookup failed: fall back to crawling the page from the web,
        # fetching it with the Xinhuanet session.
        news_content = Html2Article.url2article(news_url)
    return news_content
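
# Usage sketch (hypothetical class name and URL; the enclosing class is not
# shown in this excerpt):
#
#     query = {"news_src": "新浪", "news_url": "http://news.sina.com.cn/..."}
#     content = NewsAnalyzer.news_analyze(query)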
Example 2
# -*- coding:utf-8 -*-
import requests
import pymysql
from elasticsearch import Elasticsearch
from model.db_operate.mysql_helper import MysqlHelper

# Bulk-import every news row from MySQL into Elasticsearch.
conn = MysqlHelper.create_conn()
cur = conn.cursor()

es = Elasticsearch()

db_table_list = [
    "qq_news.qq_mid", "sina_news.sina_mid", "souhu_news.souhu_mid",
    "wangyi_news.wangyi_mid", "xinhua_news.xinhua_mid"
]
for db_table in db_table_list:
    sql_statement = "select * from %s;" % db_table
    cur.execute(sql_statement)
    res = cur.fetchall()

    for news in res:
        try:
            # Row layout: column 0 is presumably the primary key, then
            # title, publish time, content, source, and URL.
            title = news[1]
            pub_time = str(news[2]).split()[0]  # keep only the date part
            news_content = news[3]
            src = news[4]
            url = news[5]
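            # The original script is truncated here; what follows is a sketch
            # of the indexing step it implies. The index name "news" and the
            # document field names are assumptions, not taken from the source.
            doc = {
                "title": title,
                "pub_time": pub_time,
                "news_content": news_content,
                "src": src,
                "url": url
            }
            es.index(index="news", body=doc)
        except Exception:
            # Skip malformed rows rather than aborting the whole import.
            continue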