def echo(request):
    if request.is_websocket:
        try:
            clients = []
            clients.append(request.websocket)
            for message in request.websocket:
                print len(clients)
                # Parse the client's query; json.loads replaces the original
                # eval(), which would execute arbitrary expressions off the wire
                data = json.loads(message)
                # Push fresh results for this query once per second until the
                # socket errors out
                while True:
                    if data["type"] == 1:
                        sql_where = ""
                        if data["ipSelect"] != "":
                            sql_where += "where FserverIp='{}'".format(data["ipSelect"])
                        result = MysqlHelper.excuteFindPages(
                            "tb_task", "*", int(data["pageIndex"]),
                            int(data["pageSize"]), sql_where, "order by Fid asc")
                    elif data["type"] == 2:
                        sql_where = ""
                        # Join each spider to its collected-record count
                        table = "(SELECT tb_spider.Fid,tb_spider.FspiderName,tb_spider.Ftype,tb_spider.Fwebsite,tb_spider.FtimeInterval," \
                                "tb_spider.Fstate,tb_spider.FserverIp,COUNT(tb_data.Fid) as Fnum from tb_spider" \
                                " LEFT JOIN tb_data ON tb_spider.FspiderName=tb_data.Fsource GROUP BY tb_spider.FspiderName) as tb_spider"
                        if data["ipSelect"] != "":
                            sql_where += "where FserverIp='{}'".format(data["ipSelect"])
                        result = MysqlHelper.excuteFindPages(
                            table, "*", int(data["pageIndex"]),
                            int(data["pageSize"]), sql_where, "order by Fid asc")
                    for client in clients:
                        result_message = json.dumps(result, default=date_handler)
                        client.send(result_message)
                    time.sleep(1)
        except Exception as ex:
            print "Exception ----"
        finally:
            print "Closing connection"
            clients.remove(request.websocket)
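The json.dumps(result, default=date_handler) call above relies on a date_handler helper that is not shown in this section. A minimal sketch, assuming its only job is to make the datetime/date values coming back from MySQL JSON-serializable:

import datetime

def date_handler(obj):
    # json.dumps calls this for values it cannot serialize natively;
    # MySQL datetime/date columns are rendered as ISO-8601 strings
    if isinstance(obj, (datetime.datetime, datetime.date)):
        return obj.isoformat()
    raise TypeError("{} is not JSON serializable".format(type(obj)))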
def saveData(self, item):
    # Get the Elasticsearch connection
    es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
    # Get the MySQL connection
    conn = MysqlHelper.getMyConnect()
    cur = conn.cursor(cursorclass=MySQLdb.cursors.DictCursor)
    try:
        print "---- saving data ------"
        id = Utility().getId()
        # Insert into ES first. An index is roughly a MySQL database,
        # a doc_type roughly a table.
        istrue = es.create(index="scdel_index", id=id, doc_type="tb_data",
                           body=item)["created"]
        if istrue:
            # Mirror the document into MySQL
            value_str = dictToSqlvalues(item)
            sql = "insert into tb_data set " + value_str
            count = cur.execute(sql)
            if count > 0:
                return True
        return False
    except Exception as ex:
        print ex
        conn.rollback()
        # Mail the exception to the administrator
        Utility().sendEmail('*****@*****.**', '*****@*****.**', 'ggfcyiwvmtzgbaec',
                            self.spiderName + ' spider exception', ex)
        raise ex
    finally:
        cur.close()
        conn.commit()
        conn.close()
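dictToSqlvalues is not defined in this section. A plausible sketch of what it does, building the SET clause for the "insert into tb_data set ..." statement above; the naive quoting here is illustration only, and parameterized queries (cur.execute(sql, params)) would be the safer route:

def dictToSqlvalues(item):
    # {"Ftitle": "a", "isNegative": 1} -> "Ftitle='a',isNegative='1'"
    pairs = []
    for key, value in item.items():
        value = str(value).replace("'", "\\'")  # naive escaping, sketch only
        pairs.append("{}='{}'".format(key, value))
    return ",".join(pairs)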
def setUpBll(request):
    Fid = request.POST.get("Fid").encode("utf-8")
    # Collect the spider settings from the POST form
    mydata = {}
    fields = ["FspiderName", "FserverIp", "FtimeInterval", "FscriptAddress",
              "FuserName", "FpassWord", "Fport", "Fauthkey", "Fwebsite", "Ftype"]
    for field in fields:
        mydata[field] = request.POST.get(field).encode("utf-8")
    # Normalize Windows path separators in the script path
    mydata["FscriptAddress"] = mydata["FscriptAddress"].replace("\\", "/")
    print mydata["FscriptAddress"]
    istrue = MysqlHelper.excuteUpdate("tb_spider", mydata, "Fid={}".format(Fid))
    result = {}
    if istrue:
        result["istrue"] = True
        result["msg"] = "Settings saved!"
    else:
        result["istrue"] = False
        result["msg"] = "Settings update failed!"
    return result
def upTaskBll(request):
    Fstate = request.GET.get("Fstate")  # 0 = stop, 1 = start
    Fid = request.GET.get("Fid")
    task = MysqlHelper.excuteFindOne(
        "select * from tb_task where Fid={}".format(Fid))
    host = Linux("139.159.218.222", task["FuserName"], task["FpassWord"])
    host.connect()
    result = {}
    if int(Fstate) == 0:
        # ----------- kill the process bound to the task's port by pid -----------
        msg = 'netstat -apn | grep {}'.format(task["Fport"])
        port_result = host.send(msg)
        isEnd = False  # stays False when no python pid is found on the port
        for x in re.findall(r"(\d+)/python", port_result):
            res = host.send('kill -9 {}'.format(x.encode("utf-8")))
            # The shell prompt reappearing means the kill command returned
            isEnd = bool(re.findall("root@scdel-02:.*?#", res))
        if isEnd:
            # Mark the task and its spiders as stopped
            upsql_task = "update tb_task set Fstate=0 where Fid={}".format(Fid)
            upsql_spider = "update tb_spider set Fstate=0 where FtaskId={}".format(Fid)
            MysqlHelper.excuteManySqlReturnBool([upsql_task, upsql_spider])
            result["istrue"] = True
            result["msg"] = "Stopped successfully!"
        else:
            result["istrue"] = False
            result["msg"] = "Stop failed!"
    elif int(Fstate) == 1:
        msg = 'python ' + task["FscriptAddress"]
        isStart = host.send(msg)
        p = re.compile('root@scdel-02:.*?#')
        if p.search(isStart):
            MysqlHelper.excuteUpdate("tb_task", {"Fstate": 1}, "Fid={}".format(Fid))
            result["istrue"] = True
            result["msg"] = "Started successfully!"
        else:
            result["istrue"] = False
            result["msg"] = "Start failed!"
    host.close()
    return result
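The Linux class used here and in upSpiderBll is an SSH wrapper that is not shown. A minimal sketch of its connect/send/close interface, assuming it is built on paramiko's interactive shell; the prompt matching against "root@scdel-02:.*?#" in the callers suggests send returns raw shell output, prompt included:

import time
import paramiko

class Linux(object):
    def __init__(self, ip, username, password, timeout=30):
        self.ip = ip
        self.username = username
        self.password = password
        self.timeout = timeout

    def connect(self):
        # Open the SSH transport and an interactive shell channel
        self.client = paramiko.SSHClient()
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.client.connect(self.ip, username=self.username,
                            password=self.password, timeout=self.timeout)
        self.shell = self.client.invoke_shell()

    def send(self, cmd):
        # Run a command on the interactive shell and return the raw output,
        # prompt included (callers match on the prompt to detect completion)
        self.shell.send(cmd + '\n')
        time.sleep(2)
        return self.shell.recv(65535).decode('utf-8', 'ignore')

    def close(self):
        self.client.close()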
def upSpiderBll(request):
    Fstate = request.GET.get("Fstate")  # 0 = stop, 1 = start
    Fid = request.GET.get("Fid")
    spider = MysqlHelper.excuteFindOne(
        "select * from tb_spider where Fid={}".format(Fid))
    # Connect to the remote host running the spider
    host = Linux(spider["FserverIp"], spider["FuserName"], spider["FpassWord"])
    host.connect()
    result = {}
    if int(Fstate) == 0:
        istrue = MysqlHelper.excuteUpdate("tb_spider", {"Fstate": 0},
                                          "Fid={}".format(Fid))
        isEnd = host.send('killall -9 python ' + spider["FscriptAddress"])
        if istrue and isEnd:
            result["istrue"] = True
            result["msg"] = "Stopped successfully!"
        else:
            result["istrue"] = False
            result["msg"] = "Stop failed!"
    elif int(Fstate) == 1:
        task = MysqlHelper.excuteFindOne(
            "select * from tb_task where Fid={}".format(int(Fid)))
        if int(task["Fstate"]) != 0:
            istrue = MysqlHelper.excuteUpdate("tb_spider", {"Fstate": 1},
                                              "Fid={}".format(Fid))
            isStart = host.send('python ' + spider["FscriptAddress"])
            if istrue and isStart:
                result["istrue"] = True
                result["msg"] = "Started successfully!"
            else:
                result["istrue"] = False
                result["msg"] = "Start failed!"
        else:
            result["istrue"] = False
            result["msg"] = "Start failed, please dispatch a task first!"
    # Close the SSH connection on every path, not just the start branch
    host.close()
    return result
def saveDataBatch(self):
    # Get the Elasticsearch connection
    es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
    try:
        for item in self.data_list:
            # Insert into MySQL first; the helper returns the new row's id
            id = MysqlHelper.excuteInsertReturnId("tb_data", item)
            if id:
                # Insert into ES. An index is roughly a MySQL database,
                # a doc_type roughly a table.
                istrue = es.create(index="scdel_index", id=int(id),
                                   doc_type="tb_data", body=item)["created"]
                print "insert result:"
                print istrue
    except Exception as ex:
        # Mail the exception to the administrator
        Utility().sendEmail('*****@*****.**', '*****@*****.**', 'ggfcyiwvmtzgbaec',
                            self.spiderName + ' spider exception', ex)
        raise ex
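MysqlHelper.excuteInsertReturnId is not shown in this section. A sketch of one way it could work, assuming it inserts the dict with placeholders and returns MySQLdb's lastrowid; the connection settings and helper layout here are assumptions, not the project's actual code:

import MySQLdb

class MysqlHelper(object):
    @staticmethod
    def getMyConnect():
        # Connection settings are assumed for illustration
        return MySQLdb.connect(host='127.0.0.1', user='root',
                               passwd='password', db='scdel', charset='utf8')

    @staticmethod
    def excuteInsertReturnId(table, item):
        conn = MysqlHelper.getMyConnect()
        cur = conn.cursor()
        try:
            # Placeholders keep the values properly escaped
            columns = ",".join(item.keys())
            holders = ",".join(["%s"] * len(item))
            sql = "insert into {} ({}) values ({})".format(table, columns, holders)
            cur.execute(sql, item.values())
            conn.commit()
            return cur.lastrowid  # auto-increment id of the inserted row
        except Exception:
            conn.rollback()
            raise
        finally:
            cur.close()
            conn.close()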
            #     s.saveData()
            # (the enclosing loop and try block that pair with this except
            # are elided above)
            except Exception as ex:
                continue

    # Extract the URL of every post on the page
    def getPageInfoUrl(self):
        url_list = []
        text = self.getHtml(self.pageUrl).text
        # Use XPath to match the links we need
        selector = etree.HTML(text, parser=None, base_url=None)
        title_url_list = []
        title_url_list += selector.xpath('//th[@class="new"]//a[@target="_blank"]/@href')
        title_url_list += selector.xpath('//th[@class="hot"]//a[@target="_blank"]/@href')
        title_url_list += selector.xpath('//th[@class="common"]//a[@target="_blank"]/@href')
        for x in title_url_list:
            url = "http://club.history.sina.com.cn/" + x
            url_list.append(url)
            # # URL de-duplication filter (see the RedisHelper sketch below)
            # istrue = RedisHelper.urlFilter(url, "xinlang_url")
            # if istrue == False:
            #     url_list.append(url)
        return url_list

if __name__ == "__main__":
    try:
        task = MysqlHelper.excuteFindOne(
            "select * from tb_task where Fid={}".format("1003"))
        slave = Slave(task, "tb_spider")
        slave.start()
    except Exception as ex:
        print ex
        # On failure, mark the spider as stopped
        MysqlHelper.excuteUpdate("tb_spider", {"Fstate": 0}, "Fid={}".format("1003"))
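The commented-out de-duplication above calls a RedisHelper.urlFilter that is not shown. A plausible sketch, assuming it tracks seen URLs in a Redis set and returns True for URLs that were already seen (the caller keeps only URLs where it returns False); the connection settings are assumed:

import redis

class RedisHelper(object):
    _pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)  # assumed

    @staticmethod
    def urlFilter(url, set_name):
        # sadd returns 0 when the member already existed, so this returns
        # True for previously-seen URLs and False for new ones
        r = redis.Redis(connection_pool=RedisHelper._pool)
        return r.sadd(set_name, url) == 0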
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
sys.path.append('/home/spider_project')
from multiplexing_class.base_master import Master
from tianya_url import TianYaUrl
from tianya_spider import TianYaSpider
from multiplexing_class.mysql_helper import MysqlHelper

# Tianya community crawl-task dispatcher
if __name__ == "__main__":
    # Proxy IPs
    proxies = {
        "http": "http://115.220.3.253:808",
        "https": "http://121.31.147.192:8123",
    }
    params = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                            "(KHTML, like Gecko) Chrome/58.0.3000.4 Safari/537.36"}
    spider = TianYaSpider(proxies, params, "", "1002")
    file_url = "tianya_url.txt"
    keywords_list = MysqlHelper.excuteFindAll("select FkeyWord from tb_keywords")
    page_url_list = TianYaUrl(file_url, 3, keywords_list).getUrlAll()
    print len(page_url_list)
    master = Master(page_url_list, spider, "1002", "tb_task")
    master.start()
def getItem(self):
    # Get the negative-keyword list
    negative = NegativeKeyWords()
    negKwList = negative.getNegativeKeyWordsList()
    # Get the Elasticsearch connection
    es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
    url_list = self.getPageInfoUrl()  # URLs of every post on the page
    MysqlHelper.excuteUpdate("tb_spider", {"Fstate": 1}, "Fid={}".format(self.spiderId))
    for url in url_list:
        spider = MysqlHelper.excuteFindOne(
            "select Fnum from tb_spider where Fid={}".format(self.spiderId))
        num = int(spider["Fnum"])
        num += 1
        # Page source
        text = self.getHtml(url).text
        # Build a selector so XPath can match the data we need
        selector = etree.HTML(text, parser=None, base_url=None)
        # Title
        title = selector.xpath(u'//span[@class="s_title"]/span/text()')
        # Author
        p_Fauthor = u"<a href=\".*?\" target=\"_blank\" class=\"js-vip-check\" uid=\".*?\" uname=\".*?\">([\s\S]*?)</a>"
        author = re.findall(p_Fauthor, text)
        # Post time ("时间:" is the literal "Time:" label in the page markup)
        p_date = u"<span>时间:([\s\S]*?) </span>"
        date = re.findall(p_date, text)
        # Content
        p_content = u"<div class=\"bbs-content clearfix\">([\s\S]*?)</div>"
        content = re.findall(p_content, text)
        data = {}
        data["Ftitle"] = title[0] if title else ""
        data["Fdate"] = date[0] if date else "Null"
        data["Fcontent"] = content[0] if content else ""
        data["Flink"] = url
        data["Ftype"] = "论坛"  # "forum"
        data["Fsource"] = "天涯社区"  # "Tianya Club"; must match tb_spider.FspiderName for the stats join
        data["FcreateTime"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(time.time()))
        data["Fauthor"] = author[0] if author else ""

        # Flag negative information: 0 = positive, 1 = negative
        def isNegative(key):
            return bool(re.findall(key, data["Fcontent"]))

        if True in map(isNegative, negKwList):
            data["isNegative"] = 1
        else:
            data["isNegative"] = 0
        try:
            # Note: the sum of two randint(1, 9999) draws falls in 2..19998
            # and will collide quickly; see the id sketch below
            n = random.randint(1, 9999)
            b = random.randint(1, 9999)
            id = n + b
            # Insert into ES. An index is roughly a MySQL database,
            # a doc_type roughly a table.
            istrue = es.create(index="scdel_index", id=id, doc_type="tb_data",
                               body=data)["created"]
            print istrue
        except Exception as ex:
            print ex
            istrue = False
        if istrue:
            MysqlHelper.excuteUpdate("tb_spider", {"Fnum": num},
                                     "Fid={}".format(self.spiderId))
        time.sleep(1)
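A collision-resistant id in the spirit of the Utility().getId() helper that saveData uses would avoid the duplicate-id problem noted above. The real helper is not shown; this uuid-based version is an assumption:

import uuid

def getId():
    # A 128-bit random UUID rendered as a 32-character hex string;
    # collision probability is negligible, unlike randint(1, 9999) sums
    return uuid.uuid4().hex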
def getInfoByIdBll(request):
    Fid = request.GET.get("Fid")
    result = MysqlHelper.excuteFindOne(
        "select * from tb_task where Fid={}".format(Fid))
    return result