Code Example #1
    def __init__(self, *a, **kw):

        self.rconn = RedisSet().redisSet()
        self.dba = Dba()
        self.keyword = {
            "新浪网": "Sina",
            "环球网": "Huanqiu",
            "搜狐网": "Sohu",
            "网易": "WangYi",
            "凤凰网": "Ifeng",
            "新华网": "Xinhua",
            "篱笆网": "Liba",
            "新民网": "Xinmin",
            "看看新闻网": "KanKan",
            "中国天气网": "Weather",
            "东方网": "Eastday",
            "人民网-上海": "People",
            "上海热线": "Online",
            "上观": "ShangGuan",
            "上海新闻网": "ShangHaiNews",
            "腾讯大申网": "Tencent",
            "宽带山": "KuanDai",
            "中国广播网": "Radio"
        }
        self.current_date = time.strftime('%Y-%m-%d',
                                          time.localtime(time.time()))
        super(QxjSpider, self).__init__(*a, **kw)
Code Example #2
 def __init__(self):
     # client = pymongo.MongoClient("192.168.20.216", 27017)
     # db = client["SinaWebchatWeather"]
     # self.sinaComment = db["SinaComment"]
     # self.sinaContent = db["SinaContent"]
     # self.sogou = db["SoGouContent"]
     # self.tweets = db["Tweets"]
     # self.Info = db["Information"]
     self.ora = Dba()
     self.path = "/data1/crawler/andycrawler/NewsSpider/weatheroutput/weatheroutput/"
     # self.path = "weatheroutput/weatheroutput/"
     self.keys = self.load_key()
     self.wm = WeatherModel(
         self.path + "LinearSVCl2.model",  self.path + "vectorizer.data",
         self.path + "ch2.data", self.path + "weatherwords.txt")
     self.tags = jieba.analyse.extract_tags
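     # note: the %-formatted SQL strings below splice values straight into the
     # statement text and are open to SQL injection; bind variables would be safer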
     self.ConSql = """
         INSERT INTO  QXJ.QXJ_YQ_WEIBO_DAY (sinaothid, sinaname, contentid,sinaid,vermicelli,content,flag,dta_date)
         VALUES('%s','%s','%s','%s','%s','%s','%s', to_date('%s','yyyy-mm-dd hh24:mi:ss')) """
     self.old_key_sql = """
         insert into qxj.qxj_yq_weibo_keyword_day (weibo_id, contentid, keyword, num, ts, dta_date)
         VALUES('%s','%s','%s','%d', to_date('%s','yyyy-mm-dd hh24:mi:ss'), date'%s')"""
     self.all_key_sql = """
         insert into qxj.qxj_keyword_all_day (keyword, type, dta_date) VALUES ('%s','%s',date'%s')"""
     self.CommSql = """
         INSERT INTO  QXJ.QXJ_YQ_PINGLUN_DAY (username, contentid, userid,comments,commentid,dta_date)
         VALUES('%s','%s','%s','%s','%s', to_date('%s','yyyy-mm-dd hh24:mi:ss'))"""
     self.SinComSql = """
Code Example #3
File: SpiltKeyWord.py Project: chenxy0761/NewsSpider
class SpiltKeyWord(object):
    def __init__(self):
        self.dba = Dba()
        self.keys = self.load_key()
        self.tags = jieba.analyse.extract_tags

    def load_key(self):
        # read the weather keyword list, one keyword per line, closing the file afterwards
        with open("../weatheroutput/weatheroutput/weatherwords.txt") as fd:
            keyword = fd.readlines()
        keys = []
        for key in keyword:
            key = key.replace("\n", "").replace("\r", "")
            if key != "":
                keys.append(key)
        return keys

    def query_data(self):

        sql = """select dta_date, contentid, sinaid, content from  QXJ.QXJ_YQ_WEIBO_DAY where flag= 1"""
        ins_sql = """insert into qxj.qxj_yq_weibo_keyword_day (weibo_id, contentid, keyword, num, ts, dta_date)
                VALUES('%s','%s','%s', '%d',to_date('%s','yyyy-mm-dd hh24:mi:ss'), date'%s')"""
        ins_all = """
            insert into qxj.qxj_keyword_all_day (keyword, type, dta_date) VALUES ('%s','%s',date'%s')        
        """
        data = self.dba.query_data(sql)
        for i in data:
            # i[3] = ":双休日本市天空云系较多,并伴有短时阵雨,最高气温在29-30℃。其中,明天(周六)上海以多云到阴为主,有分散性短时阵雨。偏东风3-4级。气温在24-29℃左右。下周本市处在副热带高压边缘,多阵雨或雷雨天气。下周初气温开始上升,最高气温将重回30℃以上,其中,周二的最高气温将上升至34℃上下。​​​嗯。。。美的悄无声息[呵呵],不露痕迹!http://t.cn/RNX7sak"
            p = re.compile(r"\w*", re.L)
            sent = p.sub("", i[3])
            dta_date = str(i[0]).split(" ")[0]
            try:
                item = {}
                for key in self.keys:
                    num = sent.count(key)
                    if num != 0:
                        item[key] = num
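                # pair counts with keywords, sort descending, keep the 20 most frequent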
                key_tup = zip(item.values(), item.keys())
                key_sor = sorted(key_tup, reverse=True)
                for sor in key_sor[:20]:
                    ins_sqlq = ins_sql % (i[2], i[1], sor[1], sor[0], i[0],
                                          dta_date)
                    self.dba.cux_sql(self.dba.connect(), ins_sqlq)
            except Exception as e:
                print("keywords:::", e)
            tags = self.tags(sentence=sent, topK=None, withWeight=True)
            for k, v in tags:
                ins_allq = ins_all % (k, "sina", dta_date)
                try:
                    self.dba.cux_sql(self.dba.connect(), ins_allq)
                except Exception as e:
                    print("Newword:::", e)
            break  # NOTE: stops after the first row; looks like a debugging leftover
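
A minimal driver for this class might look like the following sketch (assuming the Dba credentials and the relative keyword-file path resolve on the host):

if __name__ == "__main__":
    worker = SpiltKeyWord()
    worker.query_data()  # read flagged weibo rows, write keyword counts back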
Code Example #4
File: QxjSpider.py Project: chenxy0761/NewsSpider2
class QxjSpider(CrawlSpider):

    name = "QxjSpider"

    def __init__(self, *a, **kw):

        self.rconn = RedisSet().redisSet()
        self.dba = Dba()
        self.keyword = {
            "新浪网": "Sina",
            "环球网": "Huanqiu",
            "搜狐网": "Sohu",
            "网易": "WangYi",
            "凤凰网": "Ifeng",
            "新华网": "Xinhua",
            "篱笆网": "Liba",
            "新民网": "Xinmin",
            "看看新闻网": "KanKan",
            "中国天气网": "Weather",
            "东方网": "Eastday",
            "人民网-上海": "People",
            "上海热线": "Online",
            "上观": "ShangGuan",
            "上海新闻网": "ShangHaiNews",
            "腾讯大申网": "Tencent",
            "宽带山": "KuanDai",
            "中国广播网": "Radio"
        }
        self.current_date = time.strftime('%Y-%m-%d',
                                          time.localtime(time.time()))
        super(QxjSpider, self).__init__(*a, **kw)

    def start_requests(self):
        # use a distinct name so the imported `time` module is not shadowed
        query_date = (datetime.datetime.now() +
                      datetime.timedelta(days=-4)).strftime("%Y-%m-%d")
        lists = self.dba.query(query_date)
        for i in lists:
            try:
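                # map the Chinese site name (i[0]) to its parsing rules; unknown names raise KeyError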
                htmlParse = self.parse_list()[self.keyword[i[0]]]
                data = {"msg": i, "htmlParse": htmlParse}
                yield Request(url=i[1],
                              callback=self.parse,
                              dont_filter=True,
                              meta={"data": data})
            except Exception as e:
                logger.error("No parsing rules defined for website <<%s>>" % e)
Code Example #5
File: QxjSpider.py Project: chenxy0761/NewsSpider
class QxjSpider(CrawlSpider):

    name = "QxjSpider"

    def __init__(self, *a, **kw):

        self.rconn = RedisSet().redisSet()
        self.dba = Dba()
        self.keyword = {"新浪网": "Sina", "环球网": "Huanqiu", "搜狐网": "Sohu", "网易": "WangYi",
                        "凤凰网": "Ifeng", "新华网": "Xinhua",  "篱笆网": "Liba", "新民网": "Xinmin",
                        "看看新闻网": "KanKan", "中国天气网": "Weather", "东方网": "Eastday",
                        "人民网-上海": "People", "上海热线": "Online", "上观": "ShangGuan",
                        "上海新闻网": "ShangHaiNews",  "腾讯大申网": "Tencent", "宽带山": "KuanDai",
                        "中国广播网": "Radio"}
        self.current_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        super(QxjSpider, self).__init__(*a, **kw)


    def start_requests(self):
        lists = self.dba.query("2018-02-01")
        for i in lists:
            try:
                htmlParse = self.parse_list()[self.keyword[i[0]]]
                data = {"msg": i, "htmlParse": htmlParse}
                yield Request(url=i[1], callback=self.parse, dont_filter=True, meta={"data": data})
            except Exception as e:
                logger.error("No definition of parsing rules for <<%s>> web" % e)


    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        data = response.meta["data"]
        try:
            title = soup.find("title").get_text(strip=True)
        except Exception:
            title = "Null"
        if title != "Null" or data["msg"][0] == "腾讯大申网":
            htmlParse = data["htmlParse"]
            try:
                try:
                    keywords = soup.find('meta', {"name": "keywords"})['content']
                except TypeError:
                    keywords = soup.find('meta', {"name": "Keywords"})['content']
            except Exception:
                keywords = "Null"
            try:
                try:
                    description = soup.find('meta', {"name": "description"})['content']
                except TypeError:
                    description = soup.find('meta', {"name": "Description"})['content']
            except Exception:
                description = "Null"
            lines = ""
            for parse in htmlParse:
                try:
                    zw = soup.find(parse[0], {parse[1]: parse[2]}).get_text(strip=True)
                    xx = u"([\u4e00-\u9fff]+)"
                    zws = re.findall(xx, zw)
                    for line in zws:
                        lines += line
                    break
                except Exception:
                    pass  # this rule did not match; try the next one

            if len(lines) > 5:
                item = EastItem()
                msg = data["msg"]
                item["web"] = msg[0]
                item["url"] = msg[1]
                item["datetime"] = msg[1]
                item['title'] = title
                item['keywords'] = keywords
                item['description'] = description
                item['content'] = lines
                yield item

    def parse_list(self):
        # Parsing rules: for each site, a list of [tag, attribute, value] triples tried in order
        htmlParse = {"Sina": [["div", "id", "artibody"], ["div", "id", "article"]],
                     "Huanqiu": [["div", "class", "text"], ["article", "class", "text"]],
                     "Liba": [["div", "class", "ui-topic-content fn-break", ], ["div", "class", "clearfix"]],
                     "Sohu": [["article", "class", "article"], ["div", "id", "main_content"]],
                     "Ifeng": [["div", "id", "artical_real"], ["div", "id", "picTxt"], ["div", "id", "yc_con_txt"]],
                     "Online": [["div", "class", "newsCon"], ["div", "id", "zoom"]],
                     "Tencent": [["div", "id", "Cnt-Main-Article-QQ"], ["div", "id", "contTxt"], ["div", "class", "article"], ],
                     "KanKan": [["div", "class", "textBody"]],
                     "WangYi": [["div", "class", "post_text"], ["div", "class", "viewport"], ["div", "id", "endText"]],
                     "Eastday": [["div", "id", "zw"], ["div", "class", "main"], ["div", "class", "zw"],
                                 ["div", "class", "article-content"], ["div", "class", "newsContent"]],
                     "Xinhua": [["div", "id", "p-detail"], ["div", "id", "article"], ["div", "id", "content"]],
                     "People": [["div", "class", "box_con"], ["div", "class", "clearfix"]],
                     "Xinmin": [["div", "class", "a_p"], ["article", "class", "padding15 content"],["div", "id", "MP_article"]],
                     "Weather": [["div", "class", "xyn-text"]],
                     "ShangGuan": [["div", "id", "newscontents"]],
                     "ShangHaiNews": [["div", "class", "cms-news-article-content-block"]],
                     "KuanDai": [["div", "class", "reply_message"]],
                     "Radio": [["div", "class", "TRS_Editor"]]
                     }
        return htmlParse
Code Example #6
class NewsspiderPipeline(object):


    def __init__(self):
        # client = pymongo.MongoClient("192.168.20.216", 27017)
        # db = client["SinaWebchatWeather"]
        # self.sinaComment = db["SinaComment"]
        # self.sinaContent = db["SinaContent"]
        # self.sogou = db["SoGouContent"]
        # self.tweets = db["Tweets"]
        # self.Info = db["Information"]
        self.ora = Dba()
        self.path = "/data1/crawler/andycrawler/NewsSpider/weatheroutput/weatheroutput/"
        # self.path = "weatheroutput/weatheroutput/"
        self.keys = self.load_key()
        self.wm = WeatherModel(
            self.path + "LinearSVCl2.model",  self.path + "vectorizer.data",
            self.path + "ch2.data", self.path + "weatherwords.txt")
        self.tags = jieba.analyse.extract_tags
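        # note: the %-formatted SQL strings below splice values straight into the
        # statement text and are open to SQL injection; bind variables would be safer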
        self.ConSql = """
            INSERT INTO  QXJ.QXJ_YQ_WEIBO_DAY (sinaothid, sinaname, contentid,sinaid,vermicelli,content,flag,dta_date)
            VALUES('%s','%s','%s','%s','%s','%s','%s', to_date('%s','yyyy-mm-dd hh24:mi:ss')) """
        self.old_key_sql = """
            insert into qxj.qxj_yq_weibo_keyword_day (weibo_id, contentid, keyword, num, ts, dta_date)
            VALUES('%s','%s','%s','%d', to_date('%s','yyyy-mm-dd hh24:mi:ss'), date'%s')"""
        self.all_key_sql = """
            insert into qxj.qxj_keyword_all_day (keyword, type, dta_date) VALUES ('%s','%s',date'%s')"""
        self.CommSql = """
            INSERT INTO  QXJ.QXJ_YQ_PINGLUN_DAY (username, contentid, userid,comments,commentid,dta_date)
            VALUES('%s','%s','%s','%s','%s', to_date('%s','yyyy-mm-dd hh24:mi:ss'))"""
        self.SinComSql = """
            INSERT INTO  QXJ.QXJ_YQ_PINGLUN_DAY (username, contentid, userid,comments,commentid,dta_date)
                      VALUES('%s','%s','%s','%s','%s',DATE '%s')"""

    def process_item(self, item, spider):
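        # route items by type; SinaContentItem rows go into QXJ.QXJ_YQ_WEIBO_DAY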
        if isinstance(item, SinaContentItem):
            try:
                ConSql = self.ConSql % (
                    item["SinaOthID"],
                    item["SinaName"],
                    item["contentId"],
                    item["SinaID"],
                    item["Vermicelli"],
                    item["content"], str(item['flag']),
                    item["pushTime"].split(" ")[0]
                )
                self.ora.cux_sql(self.ora.connect(), ConSql)
            except Exception as e:
                logger.error("ConSql: <<%s>>" % e)


            sent = self.format_string(item["content"])
            dta_date = item["pushTime"].split(" ")[0]
            try:
                item_key = {}
                for key in self.keys:
                    num = sent.count(key)
                    if num != 0:
                        item_key[key] = num
                key_tup = zip(item_key.values(), item_key.keys())
                key_sor = sorted(key_tup, reverse=True)
                for sor in key_sor[:20]:
                    old_sql = self.old_key_sql % (
                        item["SinaID"],
                        item["contentId"],
                        sor[1], sor[0],
                        item["pushTime"], dta_date
                    )
                    self.ora.cux_sql(self.ora.connect(), old_sql)
            except Exception as e:
                logger.error("old_sql: <<%s>>" % e)


            """提取关键字并存入oracle"""
            self.find_new_keyword(sent, dta_date)
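
The %-formatted SQL above breaks on quotes in scraped text and is open to SQL injection. A minimal sketch of the same weibo insert using bind variables, assuming cx_Oracle underlies the Dba wrapper (its internals are not shown); the connection string and row values are placeholders:

import cx_Oracle

conn = cx_Oracle.connect("user/password@host:1521/service")  # placeholder DSN

sql = """
    INSERT INTO QXJ.QXJ_YQ_WEIBO_DAY
        (sinaothid, sinaname, contentid, sinaid, vermicelli, content, flag, dta_date)
    VALUES (:1, :2, :3, :4, :5, :6, :7, to_date(:8, 'yyyy-mm-dd'))"""

row = ("othid", "name", "cid", "sid", "fans", "text with 'quotes'", "1", "2018-02-01")
cur = conn.cursor()
cur.execute(sql, row)  # values are bound by the driver, never spliced into the SQL text
conn.commit()
cur.close()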
Code Example #7
File: SpiltKeyWord.py Project: chenxy0761/NewsSpider
 def __init__(self):
     self.dba = Dba()
     self.keys = self.load_key()
     self.tags = jieba.analyse.extract_tags