Exemple #1
0
class PageDict:
    """Crawl a paginated listing and store the rows of every page.

    ``db_pool`` and ``databaseInsertList`` are class attributes, so they are
    created once when the class is defined and shared by all instances.
    """

    # Shared database connection pool
    db_pool = MyPymysqlPool("default")
    databaseInsertList = DatabaseInsertList()

    # Collect the dictionary pages
    def run(self, url, conf):
        """Crawl ``url`` and each following page, inserting the rows found.

        Args:
            url: Address of the first page to crawl.
            conf: Crawl configuration. Reads ``columns`` (each entry has a
                ``名称`` key), ``tablename`` and the optional ``readtype``
                (default ``'rg'``) and ``charset`` (default ``"utf8"``).

        Crawling stops when the crawler reports no next page, when the next
        page equals the current one, or when fetching a page raises.
        """
        type_p = conf.get('readtype', 'rg')
        charset = conf.get('charset', "utf8")
        dic_list = [row['名称'] for row in conf['columns']]

        # Iterate instead of recursing so long listings cannot exhaust the
        # interpreter's recursion limit.
        while True:
            rule = Rule()
            try:
                result, nextPage = rule.crawler_list(url, conf, type_p, charset)
            except Exception as e:
                # Without a result there is nothing to insert and no next
                # page to follow; previously execution fell through and
                # failed on the unbound ``result`` name.
                print(e)
                return

            self.databaseInsertList.insertList(result=result, table=conf['tablename'],
                                               column_names=dic_list, db_pool=self.db_pool)
            if nextPage is None or url == nextPage:
                return
            url = nextPage
Exemple #2
0
class SysCrawlerTaskInfo:
    """Small data-access wrapper around the ``sys_crawler_task_info`` table."""

    def __init__(self):
        self.dbpool = MyPymysqlPool("default")

    def findAll(self):
        """Return the pool's ``getAll`` result for a select of every row."""
        sql = "select * from sys_crawler_task_info t"
        return self.dbpool.getAll(sql)

    def updateDelete(self, id, val):
        """Set ``delete_flag`` to ``val`` for the row whose ``uuid`` is ``id``.

        Args:
            id: Value compared against the ``uuid`` column.
            val: Integer written to ``delete_flag`` (formatted with ``%d``).

        Returns:
            Whatever the pool's ``update`` returns.
        """
        # NOTE(review): the values are interpolated into the SQL text; only
        # pass trusted identifiers until the pool's parameter API is confirmed.
        sql = " update sys_crawler_task_info set delete_flag = %d where uuid='%s'" % (
            val, id)
        return self.dbpool.update(sql)

    def __del__(self):
        # ``dbpool`` is missing when __init__ failed before assigning it;
        # a finalizer must not raise in that case.
        dbpool = getattr(self, "dbpool", None)
        if dbpool is not None:
            dbpool.dispose()
Exemple #3
0
def run():
    """Copy download addresses from ``电影天堂_detail`` rows into ``addr_thund``.

    Selects the detail rows that have a non-empty ``下载地址`` and no matching
    ``addr_thund`` row, parses that column as a JSON array of objects, and
    calls ``insertdata`` once per address found under each object's ``地址``
    key. The pool is disposed when the pass is finished.
    """
    dbconn = MyPymysqlPool("default")

    sql = "select  * from 电影天堂_detail t where 下载地址 !='' and not exists(select 1 from addr_thund a  where t.`主键` = a.`主键`)"
    result = dbconn.getAll(sql)

    # NOTE(review): other callers of getAll compare its result with False,
    # so it may not be iterable when nothing matched — TODO confirm.
    if result:
        for row in result:
            addrjson = row["下载地址"]
            # The column stores single-quoted pseudo-JSON; normalise the quotes
            # before parsing. json.loads no longer accepts an ``encoding``
            # keyword (removed in Python 3.9), and the input is already str.
            jsonarr = json.loads(str(addrjson).replace("'", "\""))
            for addrobj in jsonarr:
                addr = addrobj["地址"]
                if addr == "":
                    continue
                if isinstance(addr, list):
                    for add in addr:
                        insertdata(dbconn, row["主键"], add)
                else:
                    # Only scalar addresses are inserted as-is; previously a
                    # list was inserted again as a whole after its elements.
                    insertdata(dbconn, row["主键"], addr)
    dbconn.dispose()
Exemple #4
0
 def readExsistTop(self, table, top=10):
     """Return up to ``top`` pending rows, waiting while enough rows have statue 2.

     Counts the rows of ``table`` whose ``statue`` is 2. When fewer than
     ``top`` such rows exist, returns the pool's ``getAll`` result for the
     rows with ``statue`` NULL or 0 (ordered by ``采集时间``), limited so that
     the two groups together do not exceed ``top``. Otherwise sleeps five
     seconds and checks again.

     Args:
         table: Table name, interpolated into the SQL text.
         top: Maximum number of rows allowed in flight at once.
     """
     pending_sql = """ select * from %s where statue  is null or statue =0  order by 采集时间 limit 0,%d """
     # Loop rather than recurse: the wait is unbounded, so recursion could
     # eventually hit the interpreter's recursion limit.
     while True:
         db_pool = MyPymysqlPool("default")
         try:
             exsitsql = "select * from %s where statue=2 " % table
             dataList = db_pool.getAll(exsitsql)
             if dataList is False:
                 return db_pool.getAll(pending_sql % (table, top))
             if len(dataList) < top:
                 return db_pool.getAll(pending_sql % (table, top - len(dataList)))
         finally:
             # Release the pool on every path; previously only the waiting
             # branch disposed it and the returning branches leaked it.
             db_pool.dispose()
         time.sleep(5)
Exemple #5
0
# task_master.py
# coding=utf-8

# Distributed multi-process example
# Server side

from multiprocessing.managers import BaseManager
from multiprocessing import freeze_support  # server启动报错,提示需要引用此包
import random, time, queue
from common.Mysql_Utils import MyPymysqlPool

# Queue used to send out tasks
task_queue = queue.Queue()
# Queue used to receive results
result_queue = queue.Queue()
# Module-level pool, created when this module is imported
dbpool = MyPymysqlPool('default')


class QueueManager(BaseManager):
    """Manager type derived from ``BaseManager``; it adds no members of its own."""


# Win7 64-bit apparently cannot use an anonymous lambda as the callable,
# so the queue is exposed through a named function instead.
def return_task_queue():
    """Return the module-level ``task_queue`` object."""
    return task_queue


def return_result_queue():
    """Return the module-level ``result_queue`` object.

    Mirrors ``return_task_queue``; without the ``return`` the function
    yielded ``None`` instead of the queue.
    """
    return result_queue
Exemple #6
0
from common.Mysql_Utils import MyPymysqlPool
dbpool = MyPymysqlPool("default")

# Definition of the target table, left disabled:
#cr = "create table sys_dic_utf8_code(code int primary key,info varchar(3),delete_flag int); "
#dbpool.update(cr);

# Insert one row per code point from 1 up to, but not including, 0xD800 (55296).
for code_point in range(1, 0xD800):
    character = chr(code_point)
    print(code_point, character)
    # Double any backslash first, then backslash-escape a single quote, so the
    # character can be embedded in the quoted SQL literal below.
    escaped = character.replace("\\", "\\\\").replace("'", "\\'")
    sql = "insert into sys_dic_utf8_code(code,info,delete_flag) values(%d,'%s',0)" % (code_point, escaped)
    dbpool.insert(sql)

dbpool.dispose()

Exemple #7
0
 def __init__(self):
     """Create this instance's ``MyPymysqlPool`` using the "default" argument."""
     self.dbpool = MyPymysqlPool("default")
Exemple #8
0
class PageDict():
    """Crawl a paginated listing and upsert its rows into a MySQL table.

    ``db_pool`` is a class attribute: it is created once when the class is
    defined and shared by all instances.
    """

    db_pool = MyPymysqlPool("default")

    def runDict(self, url, conf):
        """Crawl ``url`` and every following page, storing each page's rows.

        Args:
            url: Address of the first page.
            conf: Crawl configuration; reads ``columns`` (entries with a
                ``名称`` key) and ``tablename``.
        """
        dic_list = [row['名称'] for row in conf['columns']]
        # Iterate instead of recursing so long listings cannot exhaust the
        # interpreter's recursion limit.
        while True:
            rule = Rule()
            result, nextPage = rule.crawler_list(url, conf, type_p='rg')
            print(nextPage)
            # Store the page's rows  TODO
            self.insertList(result=result, table=conf['tablename'], column_names=dic_list)
            if nextPage is None or url == nextPage:
                break
            url = nextPage

    @staticmethod
    def _quote(value):
        """Render ``value`` as a quoted SQL literal: ``'`` becomes ``’`` and backslashes are dropped."""
        return "'" + str(value).replace("'", "’").replace("\\", "") + "'"

    def _create_table(self, table, column_names):
        """Create ``table`` with its two fixed columns, then add one varchar column per name."""
        createsql = """create table """ + table + """ (`采集时间` varchar(20),`主键` varchar(32) primary key) DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci """
        print(createsql)
        self.db_pool.update(createsql)
        for column_name in column_names:
            altersql = " alter table " + table + " add column `" + column_name + "` varchar(255) ;"
            try:
                self.db_pool.update(altersql)
            except Exception as e:
                # 1060 is MySQL's "duplicate column name"; either way the
                # failure is only reported and the remaining columns are tried.
                if e.args and e.args[0] == 1060:
                    print(table, column_name, "字段已经存在!")
                else:
                    print(e.args, "更新表字段")

    def insertList(self, result='', table='', column_names=None):
        """Insert every row of ``result`` into ``table``, creating or updating as needed.

        Args:
            result: Iterable of mappings keyed by column name.
            table: Target table name, interpolated into the SQL text.
            column_names: Columns to write; defaults to no columns.

        Raises:
            pymysql.err.ProgrammingError: for any code other than 1146
                (table does not exist), which is handled by creating the table.
            pymysql.err.IntegrityError: for any code other than 1062
                (duplicate key), which is handled by updating the row.

        Any other exception propagates unchanged. The previous
        ``exc.with_traceback()`` calls had no argument and therefore raised
        ``TypeError`` instead of the real error.
        """
        column_names = [] if column_names is None else column_names
        columns = ",".join('`' + name + '`' for name in column_names)

        for row in result:
            values = ",".join(self._quote(row[name]) for name in column_names)
            sql = "insert into `" + table + "` (" + columns + ") values(" + values + ")"
            print(sql)
            try:
                self.db_pool.insert(sql=sql)
                self.db_pool._conn.commit()
            except pymysql.err.ProgrammingError as pye:
                if pye.args[0] != 1146:
                    raise
                # Missing table: build it, then repeat the insert once.
                self._create_table(table, column_names)
                self.db_pool.insert(sql)
                self.db_pool._conn.commit()
            except pymysql.err.IntegrityError as pye:
                if pye.args[0] != 1062:
                    raise
                # Primary key already present: overwrite the existing row.
                assignments = ",".join(
                    "`" + name + "` = " + self._quote(row[name]) for name in column_names)
                updatesql = "update " + table + " set " + assignments
                updatesql += " where `主键` = '" + row['主键'] + "'"
                print(updatesql)
                self.db_pool.update(updatesql)
                self.db_pool._conn.commit()
                print("主键重复", pye.args[1])
Exemple #9
0
class PageDetail:
    """Fetch detail pages for rows queued in a URL table and store the results.

    ``db_pool`` and ``databaseInsertList`` are class attributes created when
    the class is defined and shared by all instances.
    """

    db_pool = MyPymysqlPool("default")
    databaseInsertList = DatabaseInsertList()

    # Single-process collection
    def run(self, conf):
        """Process queued rows one at a time in the calling process.

        Reads ``columns``, ``urltable`` and the optional ``readtype`` /
        ``charset`` entries of ``conf``, then keeps handing batches of rows
        to ``crawlerDetail`` until the insert helper returns ``False``.
        """
        columnNames = [column['名称'] for column in conf['columns']]
        listtable = conf['urltable']
        type_p = conf['readtype'] if 'readtype' in conf.keys() else 'rg'
        charset = conf['charset'] if 'charset' in conf.keys() else "utf8"

        self.databaseInsertList.updateAllStatue(self.db_pool, table=listtable, statue=2)
        batch = self.databaseInsertList.readTop(db_pool=self.db_pool, table=listtable)
        while True:
            if batch is False:
                break
            for queued in batch:
                self.crawlerDetail(conf, listtable, type_p, charset, columnNames, queued)
            batch = self.databaseInsertList.readExsistTop(table=listtable, top=10)

    # Multi-process collection
    def runProcess(self, conf):
        """Process queued rows by starting one ``Process`` per row.

        Same configuration handling as ``run``; after starting the processes
        for a batch it sleeps three seconds before asking for the next batch.
        """
        columnNames = [column['名称'] for column in conf['columns']]
        listtable = conf['urltable']
        type_p = conf['readtype'] if 'readtype' in conf.keys() else 'rg'
        charset = conf['charset'] if 'charset' in conf.keys() else "utf8"

        self.databaseInsertList.updateAllStatue(self.db_pool, table=listtable, statue=2)
        batch = self.databaseInsertList.readTop(db_pool=self.db_pool, table=listtable, top=10)
        while True:
            if batch is False:
                break
            for queued in batch:
                worker = Process(
                    target=self.crawlerDetail,
                    name="crawlerDetail" + queued['主键'],
                    args=(conf, listtable, type_p, charset, columnNames, queued),
                )
                worker.start()
            time.sleep(3)
            batch = self.databaseInsertList.readExsistTop(table=listtable, top=10)

    # Crawl one row and store it
    def crawlerDetail(self, conf, listtable, type_p, charset, columnNames, row):
        """Crawl the detail page referenced by ``row`` and record the outcome.

        The row's ``statue`` is set to 2 before crawling and to 1 after the
        result has been inserted. A ``pymysql.err.DataError`` is recorded with
        the negated error code, any other exception with ``statue`` -100; in
        both cases the message has backslashes and single quotes escaped.
        """
        def escape(text):
            return text.replace("\\", "\\\\").replace("\'", "\\\'")

        key = row['主键']
        rule = Rule()
        self.databaseInsertList.updateStatue2(db_pool=self.db_pool, table=listtable, uuid=key, statue=2)
        try:
            result = rule.crawler_detail(conf=conf, url=row[conf['urlname']], type_p=type_p,
                                         charset=charset, row=row)
            self.databaseInsertList.insertDetail(result=result, table=conf['tablename'],
                                                 column_names=columnNames, db_pool=self.db_pool)
            self.databaseInsertList.updateStatue2(db_pool=self.db_pool, table=listtable,
                                                  uuid=row['主键'], statue=1)
        except pymysql.err.DataError as data_error:
            print(conf['urltable'], row['主键'], data_error)
            self.databaseInsertList.updateMessage(
                db_pool=self.db_pool,
                table=conf['urltable'],
                uuid=row['主键'],
                statue=-int(data_error.args[0]),
                message=escape(str(data_error.args[1])),
            )
        except Exception as error:
            self.databaseInsertList.updateMessage(
                db_pool=self.db_pool,
                table=conf['urltable'],
                uuid=row['主键'],
                statue=-100,
                message=escape(str(error)),
            )
Exemple #10
0
class PageList:
    """Crawl list pages for every row of a URL table and store the results.

    ``db_pool`` and ``databaseInsertList`` are class attributes created when
    the class is defined and shared by all instances.
    """

    db_pool = MyPymysqlPool("default")
    databaseInsertList = DatabaseInsertList()

    def runMulity(self, confs):
        """Call ``run`` for each configuration in ``confs``, in order."""
        for conf in confs:
            self.run(conf)

    @staticmethod
    def _is_missing_tracking_column(error):
        """Tell whether ``error`` names one of the tracking columns as its first argument."""
        return bool(error.args) and error.args[0] in ("更新时间", "current_url")

    def _add_tracking_columns(self, table):
        """Add the ``更新时间`` and ``current_url`` columns to ``table``.

        Failures are only reported; 1060 is MySQL's "duplicate column name",
        meaning the column is already there.
        """
        statements = (
            ("更新时间",
             " alter table `" + table + "` add column `更新时间` timestamp on update current_timestamp"),
            ("current_url",
             " alter table `" + table + "` add column `current_url` varchar(500)"),
        )
        for column, altersql in statements:
            try:
                self.db_pool.update(altersql)
            except Exception as alter_error:
                if alter_error.args and alter_error.args[0] == 1060:
                    print(table, column + " 字段已经存在!")
                else:
                    print(alter_error.args, "更新表字段")

    def run(self, conf):
        """Crawl every row of ``conf['urltable']`` in the calling process.

        Each row is crawled from its ``current_url`` when that is set,
        otherwise from the column named by ``conf['urlname']``.
        """
        dictable = conf['urltable']
        type_p = conf.get('readtype', 'rg')
        charset = conf.get('charset', "utf8")
        print(dictable)
        try:
            self.databaseInsertList.updateAllStatue(db_pool=self.db_pool, table=dictable, statue=2)
            dictList = self.databaseInsertList.readAll(db_pool=self.db_pool, table=dictable)
            if dictList is not False:
                # Write the data
                for row in dictList:
                    self.databaseInsertList.updateStatue2(db_pool=self.db_pool, table=dictable,
                                                          uuid=row['主键'], statue=2)
                    url = row[conf['urlname']]
                    if row['current_url'] is not None and row['current_url'] != '':
                        url = row['current_url']

                    # The parameter is ``row_p``; passing ``row=`` raised
                    # TypeError on every call.
                    self.crawlerNext(conf, url=url, uuid=row['主键'], type_p=type_p,
                                     charset=charset, row_p=row)
        except Exception as e:
            print(e.args, "runList")
            if self._is_missing_tracking_column(e):
                # Add both columns (as runProcess does); adding only
                # ``更新时间`` left a missing ``current_url`` unrepaired.
                self._add_tracking_columns(dictable)
            # NOTE(review): the retry is unconditional, as before; a
            # persistent error recurses until RecursionError. Confirm whether
            # it should be limited to the repaired-schema case.
            self.run(conf)

    def runProcess(self, conf):
        """Crawl rows of ``conf['urltable']`` by starting one ``Process`` per row.

        Batches of ``conf['top']`` rows (default 10) are started, then the
        method sleeps five seconds before requesting the next batch, until
        the insert helper returns ``False``.
        """
        dictable = conf['urltable']
        top = conf.get('top', 10)
        type_p = conf.get('readtype', 'rg')
        charset = conf.get('charset', "utf8")
        try:
            self.databaseInsertList.updateAllStatue(db_pool=self.db_pool, table=dictable, statue=2)
            dictList = self.databaseInsertList.readTop(db_pool=self.db_pool, table=dictable, top=top)
            while dictList is not False:
                # Write the data
                for row in dictList:
                    self.databaseInsertList.updateStatue2(db_pool=self.db_pool, table=dictable,
                                                          uuid=row['主键'], statue=2)
                    url = row[conf['urlname']]
                    p = Process(target=self.crawlerNext, name="crawlerNext" + row['主键'],
                                args=(conf, url, row['主键'], type_p, charset))
                    p.start()
                time.sleep(5)
                dictList = self.databaseInsertList.readExsistTop(table=dictable, top=top)
        except Exception as e:
            print(e)
            if self._is_missing_tracking_column(e):
                self._add_tracking_columns(dictable)
            # NOTE(review): unconditional retry kept from the original; see
            # the matching note in ``run``.
            self.runProcess(conf)

    def crawlerNext(self, conf, url='', uuid='', type_p='rg', charset='utf8', row_p=None):
        """Crawl ``url`` and its following pages for one URL-table row.

        Args:
            conf: Crawl configuration; reads ``columns``, ``tablename`` and
                ``urltable``.
            url: Page to crawl.
            uuid: ``主键`` of the URL-table row being processed.
            type_p: Read type handed to the crawler.
            charset: Character set handed to the crawler.
            row_p: Row handed to the crawler as ``row``; defaults to an
                empty dict.

        After a page with results, the next page is stored as the row's
        ``current_url`` and crawled; on the last page ``statue`` becomes 1.
        A page without results sets ``statue`` to -2. Errors are recorded on
        the row instead of being raised.
        """
        row_p = {} if row_p is None else row_p
        print(url, uuid, type_p, charset)
        try:
            rule = Rule()
            result, next_page = rule.crawler_list(url, conf, type_p, charset, row=row_p)
            print(next_page)
            if len(result) > 0:
                list_list = [column['名称'] for column in conf['columns']]
                self.databaseInsertList.insertList(result=result, table=conf['tablename'],
                                                   column_names=list_list, db_pool=self.db_pool)
                if next_page is not None and url != next_page:
                    self.updateCurrent(db_pool=self.db_pool, table=conf['urltable'], uuid=uuid,
                                       current=next_page)
                    self.db_pool.end("commit")
                    # Carry the row along so later pages are crawled with the
                    # same inputs as the first one.
                    self.crawlerNext(conf, url=next_page, uuid=uuid, type_p=type_p,
                                     charset=charset, row_p=row_p)
                else:
                    self.updateStatue(db_pool=self.db_pool, table=conf['urltable'], uuid=uuid, statue=1)
                    self.db_pool.end("commit")
            else:
                # updateStatue2 lives on the insert helper, not on this class.
                self.databaseInsertList.updateStatue2(db_pool=self.db_pool, table=conf['urltable'],
                                                      uuid=uuid, statue=-2)
        except Exception as e:
            print(e.args)
            code = e.args[0] if e.args else None
            if code == 1001:
                self.databaseInsertList.updateStatue2(db_pool=self.db_pool, table=conf['urltable'],
                                                      uuid=uuid, statue=-1)
                self.db_pool.end("commit")
            if code == 1054:
                # 1054 is MySQL's "unknown column": add the tracking columns
                # and retry with the same settings (the retry used to fall
                # back to the default read type and charset).
                self._add_tracking_columns(conf['urltable'])
                self.crawlerNext(conf, url, uuid, type_p, charset, row_p)
            else:
                try:
                    statue = -int(code)
                    message = str(e.args[1])
                except (TypeError, ValueError, IndexError):
                    # Not a (code, message) style error; record it with the
                    # generic -100 status rather than failing in the handler.
                    statue = -100
                    message = str(e)
                self.databaseInsertList.updateMessage(db_pool=self.db_pool, table=conf['urltable'],
                                                      uuid=uuid, statue=statue, message=message)

    def updateStatue(self, db_pool, table='', uuid='', statue=1):
        """Set ``statue`` and clear ``current_url`` for the row whose ``主键`` is ``uuid``."""
        # NOTE(review): values are interpolated into the SQL text; pass
        # trusted values only.
        sql = """ update %s set statue = %d,current_url=null where 主键='%s' """ % (table, statue, uuid)
        return db_pool.update(sql)

    def updateCurrent(self, db_pool, table='', uuid='', current=''):
        """Store ``current`` as the ``current_url`` of the row whose ``主键`` is ``uuid``."""
        sql = """ update %s set current_url='%s' where 主键='%s' """ % (table, current, uuid)
        return db_pool.update(sql)

    def updateError(self, db_pool, table='', uuid='', statue=-1, message=''):
        """Store ``statue`` and ``message`` on the row whose ``主键`` is ``uuid``.

        If the update fails, a ``message`` column is added to ``table`` and
        the update is attempted once more. ``message`` is placed inside
        single quotes verbatim, so callers must escape quotes beforehand.

        Returns:
            Whatever the pool's ``update`` returns for the successful attempt.
        """
        # The message must be quoted to form valid SQL; it used to be
        # interpolated bare.
        sql = """ update %s set statue = %d,message='%s' where 主键='%s' """ % (table, statue, message, uuid)
        try:
            return db_pool.update(sql)
        except Exception:
            upsql = " alter table %s add column `message` varchar(2000)" % table
            db_pool.update(upsql)
            # Retry the same statement; the old code called updateStatue with
            # an extra argument, which raised TypeError.
            return db_pool.update(sql)