class SpiltKeyWord(object):
    def __init__(self):
        self.dba = Dba()
        self.keys = self.load_key()
        self.tags = jieba.analyse.extract_tags

    def load_key(self):
        # weatherwords.txt holds one weather keyword per line
        fd = open("../weatheroutput/weatheroutput/weatherwords.txt")
        keyword = fd.readlines()
        keys = []
        for key in keyword:
            key = key.replace("\n", "").replace("\r", "")
            if key != "":
                keys.append(key)
        return keys

    def query_data(self):
        sql = """select dta_date, contentid, sinaid, content
                 from QXJ.QXJ_YQ_WEIBO_DAY where flag = 1"""
        ins_sql = """insert into qxj.qxj_yq_weibo_keyword_day
                     (weibo_id, contentid, keyword, num, ts, dta_date)
                     VALUES('%s','%s','%s','%d', to_date('%s','yyyy-mm-dd hh24:mi:ss'), date'%s')"""
        ins_all = """insert into qxj.qxj_keyword_all_day (keyword, type, dta_date)
                     VALUES ('%s','%s',date'%s')"""
        data = self.dba.query_data(sql)
        for i in data:
            # example of i[3] (the Weibo text the keywords are counted in):
            # i[3] = ":双休日本市天空云系较多,并伴有短时阵雨,最高气温在29-30℃。其中,明天(周六)上海以多云到阴为主,有分散性短时阵雨。偏东风3-4级。气温在24-29℃左右。下周本市处在副热带高压边缘,多阵雨或雷雨天气。下周初气温开始上升,最高气温将重回30℃以上,其中,周二的最高气温将上升至34℃上下。嗯。。。美的悄无声息[呵呵],不露痕迹!http://t.cn/RNX7sak"
            # strip ASCII word characters (URLs, Latin text, digits) so mostly Chinese text remains
            p = re.compile(r"\w*", re.L)
            sent = p.sub("", i[3])
            dta_date = str(i[0]).split(" ")[0]
            try:
                # count occurrences of each known weather keyword
                item = {}
                for key in self.keys:
                    num = sent.count(key)
                    if num != 0:
                        item[key] = num
                # keep the 20 most frequent keywords for this post
                key_tup = zip(item.values(), item.keys())
                key_sor = sorted(key_tup, reverse=True)
                for sor in key_sor[:20]:
                    ins_sqlq = ins_sql % (i[2], i[1], sor[1], sor[0], i[0], dta_date)
                    self.dba.cux_sql(self.dba.connect(), ins_sqlq)
            except Exception as e:
                print("keywords:::", e)
            # let jieba propose candidate keywords from the same text
            tags = self.tags(sentence=sent, topK=None, withWeight=True)
            for k, v in tags:
                ins_allq = ins_all % (k, "sina", dta_date)
                try:
                    self.dba.cux_sql(self.dba.connect(), ins_allq)
                except Exception as e:
                    print("Newword:::", e)
            break  # only the first row is processed; apparently a debugging leftover
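# A minimal usage sketch (not part of the original source): how SpiltKeyWord could be
# driven as a one-off backfill script. It assumes Dba() can reach the Oracle instance
# and that weatherwords.txt exists at the relative path used by load_key().
if __name__ == "__main__":
    spliter = SpiltKeyWord()
    spliter.query_data()  # re-scores flagged Weibo rows and writes keyword counts back to Oracle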
class QxjSpider(CrawlSpider):
    name = "QxjSpider"

    def __init__(self, *a, **kw):
        self.rconn = RedisSet().redisSet()
        self.dba = Dba()
        # map site names as stored in the database to the keys used by parse_list()
        self.keyword = {
            "新浪网": "Sina", "环球网": "Huanqiu", "搜狐网": "Sohu", "网易": "WangYi",
            "凤凰网": "Ifeng", "新华网": "Xinhua", "篱笆网": "Liba", "新民网": "Xinmin",
            "看看新闻网": "KanKan", "中国天气网": "Weather", "东方网": "Eastday",
            "人民网-上海": "People", "上海热线": "Online", "上观": "ShangGuan",
            "上海新闻网": "ShangHaiNews", "腾讯大申网": "Tencent", "宽带山": "KuanDai",
            "中国广播网": "Radio"
        }
        self.current_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        super(QxjSpider, self).__init__(*a, **kw)

    def start_requests(self):
        # fetch URLs recorded in the last four days
        # (named query_date to avoid shadowing the time module)
        query_date = (datetime.datetime.now() + datetime.timedelta(days=-4)).strftime("%Y-%m-%d")
        lists = self.dba.query(query_date)
        for i in lists:
            try:
                htmlParse = self.parse_list()[self.keyword[i[0]]]
                data = {"msg": i, "htmlParse": htmlParse}
                yield Request(url=i[1], callback=self.parse, dont_filter=True, meta={"data": data})
            except Exception as e:
                logger.error("No parsing rule defined for the <<%s>> site" % e)
class QxjSpider(CrawlSpider):
    name = "QxjSpider"

    def __init__(self, *a, **kw):
        self.rconn = RedisSet().redisSet()
        self.dba = Dba()
        self.keyword = {
            "新浪网": "Sina", "环球网": "Huanqiu", "搜狐网": "Sohu", "网易": "WangYi",
            "凤凰网": "Ifeng", "新华网": "Xinhua", "篱笆网": "Liba", "新民网": "Xinmin",
            "看看新闻网": "KanKan", "中国天气网": "Weather", "东方网": "Eastday",
            "人民网-上海": "People", "上海热线": "Online", "上观": "ShangGuan",
            "上海新闻网": "ShangHaiNews", "腾讯大申网": "Tencent", "宽带山": "KuanDai",
            "中国广播网": "Radio"
        }
        self.current_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        super(QxjSpider, self).__init__(*a, **kw)

    def start_requests(self):
        lists = self.dba.query("2018-02-01")
        for i in lists:
            try:
                htmlParse = self.parse_list()[self.keyword[i[0]]]
                data = {"msg": i, "htmlParse": htmlParse}
                yield Request(url=i[1], callback=self.parse, dont_filter=True, meta={"data": data})
            except Exception as e:
                logger.error("No parsing rule defined for the <<%s>> site" % e)

    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        data = response.meta["data"]
        try:
            title = soup.find("title").get_text(strip=True)
        except:
            title = "Null"
        if title != "Null" or data["msg"][0] == "腾讯大申网":
            htmlParse = data["htmlParse"]
            try:
                try:
                    keywords = soup.find('meta', {"name": "keywords"})['content']
                except TypeError:
                    keywords = soup.find('meta', {"name": "Keywords"})['content']
            except:
                keywords = "Null"
            try:
                try:
                    description = soup.find('meta', {"name": "description"})['content']
                except TypeError:
                    description = soup.find('meta', {"name": "Description"})['content']
            except:
                description = "Null"
            lines = ""
            # try each (tag, attribute, value) rule for this site until one matches,
            # then keep only the Chinese characters of the article body
            for parse in htmlParse:
                try:
                    zw = soup.find(parse[0], {parse[1]: parse[2]}).get_text(strip=True)
                    xx = u"([\u4e00-\u9fff]+)"
                    zws = re.findall(xx, zw)
                    for line in zws:
                        lines += line
                    break
                except Exception as e:
                    pass
            if len(lines) > 5:
                item = EastItem()
                msg = data["msg"]
                item["web"] = msg[0]
                item["url"] = msg[1]
                item["datetime"] = msg[1]  # note: currently mirrors the URL field (msg[1])
                item['title'] = title
                item['keywords'] = keywords
                item['description'] = description
                item['content'] = lines
                yield item

    def parse_list(self):
        # parsing rules per site: each entry lists [tag, attribute, value] candidates
        # that are tried in order by parse()
        htmlParse = {
            "Sina": [["div", "id", "artibody"], ["div", "id", "article"]],
            "Huanqiu": [["div", "class", "text"], ["article", "class", "text"]],
            "Liba": [["div", "class", "ui-topic-content fn-break"], ["div", "class", "clearfix"]],
            "Sohu": [["article", "class", "article"], ["div", "id", "main_content"]],
            "Ifeng": [["div", "id", "artical_real"], ["div", "id", "picTxt"], ["div", "id", "yc_con_txt"]],
            "Online": [["div", "class", "newsCon"], ["div", "id", "zoom"]],
            "Tencent": [["div", "id", "Cnt-Main-Article-QQ"], ["div", "id", "contTxt"], ["div", "class", "article"]],
            "KanKan": [["div", "class", "textBody"]],
            "WangYi": [["div", "class", "post_text"], ["div", "class", "viewport"], ["div", "id", "endText"]],
            "Eastday": [["div", "id", "zw"], ["div", "class", "main"], ["div", "class", "zw"],
                        ["div", "class", "article-content"], ["div", "class", "newsContent"]],
            "Xinhua": [["div", "id", "p-detail"], ["div", "id", "article"], ["div", "id", "content"]],
            "People": [["div", "class", "box_con"], ["div", "class", "clearfix"]],
            "Xinmin": [["div", "class", "a_p"], ["article", "class", "padding15 content"], ["div", "id", "MP_article"]],
            "Weather": [["div", "class", "xyn-text"]],
            "ShangGuan": [["div", "id", "newscontents"]],
            "ShangHaiNews": [["div", "class", "cms-news-article-content-block"]],
            "KuanDai": [["div", "class", "reply_message"]],
            "Radio": [["div", "class", "TRS_Editor"]],
        }
        return htmlParse
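# A small, self-contained illustration (assumed, not from the original source) of how
# each [tag, attribute, value] rule returned by parse_list() is applied in parse():
# soup.find(tag, {attribute: value}). The HTML string below is made up for demonstration.
from bs4 import BeautifulSoup

html = u'<html><body><div id="artibody">上海今天多云到阴,局部有短时阵雨。</div></body></html>'
soup = BeautifulSoup(html, "lxml")
rule = ["div", "id", "artibody"]  # the first "Sina" rule from parse_list()
text = soup.find(rule[0], {rule[1]: rule[2]}).get_text(strip=True)
print(text)  # prints only the article text inside the matched div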
class NewsspiderPipeline(object):
    def __init__(self):
        # client = pymongo.MongoClient("192.168.20.216", 27017)
        # db = client["SinaWebchatWeather"]
        # self.sinaComment = db["SinaComment"]
        # self.sinaContent = db["SinaContent"]
        # self.sogou = db["SoGouContent"]
        # self.tweets = db["Tweets"]
        # self.Info = db["Information"]
        self.ora = Dba()
        self.path = "/data1/crawler/andycrawler/NewsSpider/weatheroutput/weatheroutput/"
        # self.path = "weatheroutput/weatheroutput/"
        self.keys = self.load_key()
        self.wm = WeatherModel(
            self.path + "LinearSVCl2.model",
            self.path + "vectorizer.data",
            self.path + "ch2.data",
            self.path + "weatherwords.txt")
        self.tags = jieba.analyse.extract_tags
        self.ConSql = """
            INSERT INTO QXJ.QXJ_YQ_WEIBO_DAY
            (sinaothid, sinaname, contentid, sinaid, vermicelli, content, flag, dta_date)
            VALUES('%s','%s','%s','%s','%s','%s','%s', to_date('%s','yyyy-mm-dd hh24:mi:ss'))"""
        self.old_key_sql = """
            insert into qxj.qxj_yq_weibo_keyword_day
            (weibo_id, contentid, keyword, num, ts, dta_date)
            VALUES('%s','%s','%s','%d', to_date('%s','yyyy-mm-dd hh24:mi:ss'), date'%s')"""
        self.all_key_sql = """
            insert into qxj.qxj_keyword_all_day (keyword, type, dta_date)
            VALUES ('%s','%s',date'%s')"""
        self.CommSql = """
            INSERT INTO QXJ.QXJ_YQ_PINGLUN_DAY
            (username, contentid, userid, comments, commentid, dta_date)
            VALUES('%s','%s','%s','%s','%s', to_date('%s','yyyy-mm-dd hh24:mi:ss'))"""
        self.SinComSql = """
            INSERT INTO QXJ.QXJ_YQ_PINGLUN_DAY
            (username, contentid, userid, comments, commentid, dta_date)
            VALUES('%s','%s','%s','%s','%s', DATE '%s')"""

    def process_item(self, item, spider):
        if isinstance(item, SinaContentItem):
            # store the Weibo content row
            try:
                ConSql = self.ConSql % (
                    item["SinaOthID"], item["SinaName"], item["contentId"], item["SinaID"],
                    item["Vermicelli"], item["content"], str(item['flag']),
                    item["pushTime"].split(" ")[0]
                )
                self.ora.cux_sql(self.ora.connect(), ConSql)
            except Exception as e:
                logger.error("ConSql: <<%s>>" % e)
            sent = self.format_string(item["content"])
            dta_date = item["pushTime"].split(" ")[0]
            # count known weather keywords and store the 20 most frequent ones
            try:
                item_key = {}
                for key in self.keys:
                    num = sent.count(key)
                    if num != 0:
                        item_key[key] = num
                key_tup = zip(item_key.values(), item_key.keys())
                key_sor = sorted(key_tup, reverse=True)
                for sor in key_sor[:20]:
                    old_sql = self.old_key_sql % (
                        item["SinaID"], item["contentId"], sor[1], sor[0],
                        item["pushTime"], dta_date
                    )
                    self.ora.cux_sql(self.ora.connect(), old_sql)
            except Exception as e:
                logger.error("old_sql: <<%s>>" % e)
            # extract new candidate keywords and store them in Oracle
            self.find_new_keyword(sent, dta_date)
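# For the pipeline to receive items, it must be registered in the Scrapy project's
# settings.py. The dotted path "NewsSpider.pipelines.NewsspiderPipeline" is an assumption
# based on the paths above; adjust it to the actual package name.
ITEM_PIPELINES = {
    "NewsSpider.pipelines.NewsspiderPipeline": 300,  # lower numbers run earlier in the chain
}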