Exemple #1
0
def echo(request):
    """WebSocket view: pushes paginated task/spider rows to clients every second.

    The client sends a JSON message selecting a query type (1 = tasks,
    2 = spiders joined with their collected-row counts) plus paging and an
    optional server-IP filter; the matching page of rows is then re-queried
    and broadcast to every connected client once per second until the
    socket closes.
    """
    if request.is_websocket:
        # BUGFIX: define clients before the try so the finally block cannot
        # raise NameError when an early exception occurs.
        clients = []
        try:
            clients.append(request.websocket)
            for message in request.websocket:
                print(len(clients))
                # SECURITY FIX: the message comes from an untrusted client;
                # parse it as JSON instead of eval()'ing arbitrary code.
                data = json.loads(message)
                while True:
                    result = None
                    if data["type"] == 1:
                        sql_where = ""
                        # NOTE(review): ipSelect is interpolated into SQL
                        # unescaped -- the helper offers no binding; confirm
                        # it is validated upstream.
                        if data["ipSelect"] != "":
                            sql_where += "where FserverIp='{}'".format(data["ipSelect"])
                        result = MysqlHelper.excuteFindPages(
                            "tb_task", "*", int(data["pageIndex"]),
                            int(data["pageSize"]), sql_where, "order by Fid asc")
                    elif data["type"] == 2:
                        sql_where = ""
                        # Spiders joined with a per-spider count of collected rows.
                        table = "(SELECT tb_spider.Fid,tb_spider.FspiderName,tb_spider.Ftype,tb_spider.Fwebsite,tb_spider.FtimeInterval," \
                                "tb_spider.Fstate,tb_spider.FserverIp,COUNT(tb_data.Fid)as Fnum from tb_spider" \
                                " LEFT JOIN tb_data ON tb_spider.FspiderName=tb_data.Fsource GROUP BY tb_spider.FspiderName) as tb_spider"
                        if data["ipSelect"] != "":
                            sql_where += "where FserverIp='{}'".format(data["ipSelect"])
                        result = MysqlHelper.excuteFindPages(
                            table, "*", int(data["pageIndex"]),
                            int(data["pageSize"]), sql_where, "order by Fid asc")

                    # BUGFIX: only broadcast when a query actually ran; the
                    # original raised NameError on an unknown "type".
                    if result is not None:
                        result_message = json.dumps(result, default=date_handler)
                        for client in clients:
                            client.send(result_message)
                    time.sleep(1)
        except Exception as ex:
            print("异常----")
            # BUGFIX: log the actual exception instead of swallowing it.
            print(ex)
        finally:
            print("关闭")
            if request.websocket in clients:
                clients.remove(request.websocket)
Exemple #2
0
 def saveData(self, item):
     """Persist one crawled item to Elasticsearch and MySQL.

     The item is written to the ES index first; only when ES reports the
     document as created is the row inserted into ``tb_data``.  Returns
     True when the MySQL insert affected a row, False otherwise.  On any
     error the MySQL transaction is rolled back, the admin mailbox is
     notified, and the exception is re-raised.
     """
     # ES search-engine connection (index ~ database, doc_type ~ table).
     es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
     # MySQL connection with a dict cursor.
     conn = MysqlHelper.getMyConnect()
     cur = conn.cursor(cursorclass=MySQLdb.cursors.DictCursor)
     try:
         print("----数据保存中------")
         # Renamed from ``id`` to avoid shadowing the builtin.
         doc_id = Utility().getId()
         istrue = es.create(index="scdel_index",
                            id=doc_id,
                            doc_type="tb_data",
                            body=item)["created"]
         if istrue:
             value_str = dictToSqlvalues(item)
             sql = "insert into tb_data set " + value_str
             count = cur.execute(sql)
             # BUGFIX: commit on the success path only; the original
             # committed in ``finally`` even right after a rollback.
             conn.commit()
             return count > 0
         # BUGFIX: explicit False instead of an implicit None fall-through
         # when the ES create did not report "created".
         return False
     except Exception as ex:
         print(ex)
         conn.rollback()
         # Forward the failure to the admin mailbox before re-raising.
         Utility().sendEmail('*****@*****.**', '*****@*****.**',
                             'ggfcyiwvmtzgbaec', self.spiderName + ' 爬虫异常',
                             ex)
         raise ex
     finally:
         cur.close()
         conn.close()
Exemple #3
0
def setUpBll(request):
    """Update a spider's configuration row in ``tb_spider`` from POST data.

    Copies the form fields into an update dict, writes them for the given
    ``Fid``, and returns a dict with ``istrue`` (success flag) and ``msg``
    (user-facing message).
    """
    # SECURITY FIX: Fid is interpolated into the WHERE clause below and the
    # helper offers no parameter binding -- coerce it to int so a crafted
    # value cannot inject SQL.
    Fid = int(request.POST.get("Fid"))
    mydata = {}
    # Plain pass-through fields.
    for field in ("FspiderName", "FserverIp", "FtimeInterval", "FuserName",
                  "FpassWord", "Fport", "Fauthkey", "Fwebsite", "Ftype"):
        mydata[field] = request.POST.get(field).encode("utf-8")
    # Normalise Windows path separators in the script path.
    mydata["FscriptAddress"] = request.POST.get("FscriptAddress").encode(
        "utf-8").replace("\\", "/")
    print(mydata["FscriptAddress"])
    istrue = MysqlHelper.excuteUpdate("tb_spider", mydata,
                                      "Fid={}".format(Fid))
    result = {}
    if istrue:
        result["istrue"] = True
        result["msg"] = "设置成功!"
    else:
        result["istrue"] = False
        result["msg"] = "设置失败!"
    return result
Exemple #4
0
def upTaskBll(request):
    """Start or stop a crawl task on the remote host over SSH.

    ``Fstate`` query param: 0 = stop (kill the python process(es) bound to
    the task's port and mark task + spiders stopped), 1 = start (run the
    task's script and mark the task running).  Returns a dict with
    ``istrue``/``msg``.
    """
    Fstate = request.GET.get("Fstate")  # 0 = stop, 1 = start
    Fid = request.GET.get("Fid")
    task = MysqlHelper.excuteFindOne(
        "select * from tb_task where Fid={}".format(Fid))

    host = Linux("139.159.218.222", task["FuserName"], task["FpassWord"])
    host.connect()
    result = {}
    try:
        if int(Fstate) == 0:
            # Locate the python process(es) listening on the task's port.
            port_result = host.send('netstat -apn | grep {}'.format(task["Fport"]))
            # BUGFIX: initialise isEnd so an empty pid list (no matching
            # process) does not raise UnboundLocalError below.
            isEnd = False
            for pid in re.findall("(\d+)/python", port_result):
                res = host.send('kill -9 {}'.format(pid.encode("utf-8")))
                # The shell prompt reappearing signals the kill completed.
                isEnd = bool(re.findall("root@scdel-02:.*?#", res))
            if isEnd:
                # Mark the task and all its spiders as stopped.
                upsql_task = "update tb_task set Fstate=0 where Fid={}".format(Fid)
                upsal_spider = "update tb_spider set Fstate=0 where FtaskId={}".format(
                    Fid)
                MysqlHelper.excuteManySqlReturnBool([upsql_task, upsal_spider])
                result["istrue"] = True
                result["msg"] = "停止成功!"
            else:
                result["istrue"] = False
                result["msg"] = "停止失败!"
        elif int(Fstate) == 1:
            isStart = host.send('python ' + task["FscriptAddress"])
            if re.search('root@scdel-02:.*?#', isStart):
                MysqlHelper.excuteUpdate("tb_task", {"Fstate": 1},
                                         "Fid={}".format(Fid))
                result["istrue"] = True
                result["msg"] = "启动成功!"
            else:
                result["istrue"] = False
                result["msg"] = "启动失败!"
    finally:
        # BUGFIX: always release the SSH session, even when a query or the
        # remote command raises.
        host.close()
    return result
Exemple #5
0
def upSpiderBll(request):
    """Start or stop a single spider on its remote server over SSH.

    ``Fstate`` query param: 0 = stop (killall its script's python process
    and mark it stopped), 1 = start (only permitted when the owning task
    is itself running).  Returns a dict with ``istrue``/``msg``.
    """
    Fstate = request.GET.get("Fstate")  # 0 = stop, 1 = start
    Fid = request.GET.get("Fid")
    spider = MysqlHelper.excuteFindOne(
        "select * from tb_spider where Fid={}".format(Fid))
    # Connect to the spider's remote host.
    host = Linux(spider["FserverIp"], spider["FuserName"], spider["FpassWord"])
    host.connect()
    result = {}
    try:
        if int(Fstate) == 0:
            istrue = MysqlHelper.excuteUpdate("tb_spider", {"Fstate": 0},
                                              "Fid={}".format(Fid))
            isEnd = host.send('killall -9 python ' + spider["FscriptAddress"])
            if istrue and isEnd:
                result["istrue"] = True
                result["msg"] = "停止成功!"
            else:
                result["istrue"] = False
                result["msg"] = "停止失败!"
        elif int(Fstate) == 1:
            task = MysqlHelper.excuteFindOne(
                "select * from tb_task where Fid={}".format(int(Fid)))
            if int(task["Fstate"]) != 0:
                istrue = MysqlHelper.excuteUpdate("tb_spider", {"Fstate": 1},
                                                  "Fid={}".format(Fid))
                isStart = host.send('python ' + spider["FscriptAddress"])
                if istrue and isStart:
                    result["istrue"] = True
                    result["msg"] = "启动成功!"
                else:
                    result["istrue"] = False
                    result["msg"] = "启动失败!"
            else:
                result["istrue"] = False
                result["msg"] = "启动失败,请先去派发任务!"
    finally:
        # BUGFIX: the original only closed the session on the successful
        # start path, leaking it on the stop path and the "task not
        # running" path -- always close.
        host.close()
    return result
Exemple #6
0
 def saveDataBatch(self):
     """Write every queued item to MySQL, mirroring each stored row into ES.

     Items for which the MySQL insert returns no id are skipped for the
     ES step.  Any exception triggers an alert e-mail to the admin
     mailbox and is re-raised to the caller.
     """
     # ES search-engine connection (index ~ database, doc_type ~ table).
     es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
     try:
         for record in self.data_list:
             row_id = MysqlHelper.excuteInsertReturnId("tb_data", record)
             if not row_id:
                 # MySQL insert failed -- do not mirror this item to ES.
                 continue
             created = es.create(index="scdel_index",
                                 id=int(row_id),
                                 doc_type="tb_data",
                                 body=record)["created"]
             print("插入结果:")
             print(created)
     except Exception as ex:
         # Forward the failure to the admin mailbox, then re-raise.
         Utility().sendEmail('*****@*****.**', '*****@*****.**',
                             'ggfcyiwvmtzgbaec', self.spiderName + ' 爬虫异常',
                             ex)
         raise ex
Exemple #7
0
                # s.saveData()
            except Exception as ex:
                continue

    #提取每一页面所有信息的url
    def getPageInfoUrl(self):
        """Return absolute detail-page URLs found on ``self.pageUrl``.

        Fetches the listing page, matches the thread links in the "new",
        "hot" and "common" table rows with XPath, and prefixes each
        relative href with the site root.
        """
        page_text = self.getHtml(self.pageUrl).text
        selector = etree.HTML(page_text, parser=None, base_url=None)
        relative_hrefs = []
        # Same XPath pattern for each of the three row classes.
        for row_class in ("new", "hot", "common"):
            relative_hrefs += selector.xpath(
                '//th[@class="{}"]//a[@target="_blank"]/@href'.format(row_class))
        return ["http://club.history.sina.com.cn/" + href
                for href in relative_hrefs]
if __name__ == "__main__":
    # Launch the slave crawler for task 1003; if anything fails, flip the
    # spider's state back to stopped so the dashboard reflects reality.
    try:
        task_row = MysqlHelper.excuteFindOne(
            "select * from tb_task where Fid={}".format("1003"))
        Slave(task_row, "tb_spider").start()
    except Exception as ex:
        print(ex)
        MysqlHelper.excuteUpdate("tb_spider", {"Fstate": 0},
                                 "Fid={}".format("1003"))
Exemple #8
0
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
sys.path.append('/home/spider_project')
from multiplexing_class.base_master import Master
from tianya_url import TianYaUrl
from tianya_spider import TianYaSpider
from multiplexing_class.mysql_helper import MysqlHelper

#麻辣社区爬取任务派发
if __name__ == "__main__":
    # Outbound proxy endpoints used for the crawl requests.
    proxies = {
        "http": "http://115.220.3.253:808",
        "https": "http://121.31.147.192:8123",
    }
    # Browser-like headers to avoid trivial bot blocking.
    params = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3000.4 Safari/537.36"
    }
    spider = TianYaSpider(proxies, params, "", "1002")
    # Build the page-URL work list from the configured keywords.
    keyword_rows = MysqlHelper.excuteFindAll("select FkeyWord from tb_keywords")
    page_url_list = TianYaUrl("tianya_url.txt", 3, keyword_rows).getUrlAll()
    print(len(page_url_list))
    # Dispatch the URLs to the slaves via the master.
    master = Master(page_url_list, spider, "1002", "tb_task")
    master.start()
Exemple #9
0
    def getItem(self):
        """Crawl every detail page, flag negative sentiment, and index into ES.

        For each URL from ``getPageInfoUrl`` the title, author, date and
        content are extracted from the page HTML, the item is marked
        negative when any negative keyword matches the content, and the
        document is written to the ``scdel_index`` ES index; on a
        successful ES insert the spider's row counter in ``tb_spider`` is
        incremented.  Sleeps one second between pages.
        """
        # Negative-sentiment keyword patterns.
        negKwList = NegativeKeyWords().getNegativeKeyWordsList()
        # ES search-engine connection (index ~ database, doc_type ~ table).
        es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
        url_list = self.getPageInfoUrl()  # all detail-page URLs
        # Mark this spider as running.
        MysqlHelper.excuteUpdate("tb_spider", {"Fstate": 1},
                                 "Fid={}".format(self.spiderId))
        # PERF: compile the extraction regexes once, outside the per-URL loop.
        p_author = re.compile(u"<a href=\".*?\" target=\"_blank\" class=\"js-vip-check\" uid=\".*?\" uname=\".*?\">([\s\S]*?)</a>")
        p_date = re.compile(u"<span>时间:([\s\S]*?) </span>")
        p_content = re.compile(u"<div class=\"bbs-content clearfix\">([\s\S]*?)</div>")
        for url in url_list:
            # Current collected-row count for this spider.
            spider = MysqlHelper.excuteFindOne(
                "select Fnum from tb_spider where Fid={}".format(self.spiderId))
            num = int(spider["Fnum"]) + 1
            # Page source.
            text = self.getHtml(url).text
            selector = etree.HTML(text, parser=None, base_url=None)
            title = selector.xpath(u'//span[@class="s_title"]/span/text()')
            author = p_author.findall(text)
            date = p_date.findall(text)
            content = p_content.findall(text)

            data = {}
            data["Ftitle"] = title[0] if title else ""
            data["Fdate"] = date[0] if date else "Null"
            data["Fcontent"] = content[0] if content else ""
            data["Flink"] = url
            data["Ftype"] = "论坛"
            data["Fsource"] = "天涯社区"
            data["FcreateTime"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime(time.time()))
            data["Fauthor"] = author[0] if author else ""

            # 0 = positive, 1 = negative.  any() short-circuits on the
            # first matching keyword (the original mapped them all).
            data["isNegative"] = 1 if any(
                re.findall(key, data["Fcontent"]) for key in negKwList) else 0
            try:
                # NOTE(review): two small random ints can collide across
                # documents; es.create would then fail with a conflict --
                # consider a monotonically unique id source.
                doc_id = random.randint(1, 9999) + random.randint(1, 9999)
                istrue = es.create(index="scdel_index",
                                   id=doc_id,
                                   doc_type="tb_data",
                                   body=data)["created"]
                print(istrue)
            except Exception as ex:
                print(ex)
                istrue = False
            if istrue:
                # Persist the incremented row counter.
                MysqlHelper.excuteUpdate("tb_spider", {"Fnum": num},
                                         "Fid={}".format(self.spiderId))
            time.sleep(1)
Exemple #10
0
def getInfoByIdBll(request):
    """Look up a single task row in ``tb_task`` by the ``Fid`` query param.

    Returns whatever ``MysqlHelper.excuteFindOne`` yields for the row.
    """
    # SECURITY FIX: Fid is interpolated into SQL and the helper offers no
    # parameter binding -- coerce it to int so a crafted value cannot
    # inject SQL.
    Fid = int(request.GET.get("Fid"))
    return MysqlHelper.excuteFindOne(
        "select * from tb_task where Fid={}".format(Fid))