class PageDict:
    """Crawl a paginated dictionary listing and store the rows of each page."""

    # Global database connection, created once when the class is defined.
    db_pool = MyPymysqlPool("default")
    databaseInsertList = DatabaseInsertList()

    def run(self, url, conf):
        """Crawl ``url`` and every following page, inserting the rows found.

        Pages are followed iteratively (instead of one recursive call per
        page) so a long listing cannot exhaust the interpreter's recursion
        limit.

        Args:
            url: Address of the first page to crawl.
            conf: Crawl configuration. Reads ``columns`` (the ``名称`` of each
                entry is used as a column name), ``tablename`` and the
                optional ``readtype`` (default ``'rg'``) and ``charset``
                (default ``"utf8"``).

        Returns:
            None. Crawling stops when a page reports no next page, reports
            itself as the next page, or cannot be crawled.
        """
        type_p = conf.get('readtype', 'rg')
        charset = conf.get('charset', "utf8")
        dic_list = [row['名称'] for row in conf['columns']]

        while True:
            # A fresh Rule per page, exactly as the per-page recursion did.
            rule = Rule()
            try:
                result, nextPage = rule.crawler_list(url, conf, type_p, charset)
            except Exception as e:
                # Nothing was fetched for this page, so there is nothing to
                # insert and no next page to follow: report and stop here
                # rather than falling through to unbound variables.
                print(e)
                return

            self.databaseInsertList.insertList(result=result,
                                               table=conf['tablename'],
                                               column_names=dic_list,
                                               db_pool=self.db_pool)
            if nextPage is None or url == nextPage:
                return
            url = nextPage
class SysCrawlerTaskInfo:
    """Query and update helper for the ``sys_crawler_task_info`` table."""

    def __init__(self):
        """Open the ``"default"`` connection pool used by every method."""
        self.dbpool = MyPymysqlPool("default")

    def findAll(self):
        """Return whatever the pool's ``getAll`` yields for all task rows."""
        sql = "select * from sys_crawler_task_info t"
        return self.dbpool.getAll(sql)

    def updateDelete(self, id, val):
        """Set ``delete_flag`` to ``val`` for the task whose uuid is ``id``.

        Args:
            id: uuid of the task row.
            val: Integer flag value (formatted with ``%d``).

        Returns:
            The result of the pool's ``update`` call.
        """
        # NOTE(review): the statement is assembled by string formatting, so
        # ``id`` must come from a trusted source; switch to bound parameters
        # if the pool's ``update`` supports them.
        sql = " update sys_crawler_task_info set delete_flag = %d where uuid='%s'" % (
            val, id)
        return self.dbpool.update(sql)

    def __del__(self):
        """Dispose of the pool, if one was ever attached to this object."""
        # ``dbpool`` is missing when ``__init__`` failed or was bypassed;
        # finalizers must not raise in that case.
        dbpool = getattr(self, "dbpool", None)
        if dbpool is not None:
            dbpool.dispose()


if __name__ == '__main__':
    pass
def run():
    """Hand every pending download address to ``insertdata``.

    Selects the rows of ``电影天堂_detail`` that have a non-empty ``下载地址``
    and no row with the same ``主键`` in ``addr_thund``, parses the stored
    address list and calls ``insertdata`` once per address.
    """
    dbconn = MyPymysqlPool("default")
    sql = "select * from 电影天堂_detail t where 下载地址 !='' and not exists(select 1 from addr_thund a where t.`主键` = a.`主键`)"
    try:
        # Other readers of this pool compare its result with ``False``, so a
        # falsy result is treated as "nothing to do".
        result = dbconn.getAll(sql) or ()
        for row in result:
            addrjson = row["下载地址"]
            # The column holds single-quoted text; swap the quotes so it
            # parses as JSON. ``json.loads`` no longer accepts ``encoding``
            # (removed in Python 3.9), and the input is already ``str``.
            jsonarr = json.loads(str(addrjson).replace("\'", "\""))
            for addrobj in jsonarr:
                addr = addrobj["地址"]
                if addr == "":
                    continue
                if isinstance(addr, list):
                    for add in addr:
                        insertdata(dbconn, row["主键"], add)
                else:
                    # NOTE(review): a list is stored element by element only;
                    # it is no longer passed a second time as a whole list.
                    insertdata(dbconn, row["主键"], addr)
    finally:
        # Release the connection even when parsing or inserting fails.
        dbconn.dispose()
def readExsistTop(self, table, top=10):
    """Return the next pending rows once fewer than ``top`` rows are running.

    Rows with ``statue=2`` are counted as running. While at least ``top`` of
    them exist, the method waits five seconds and checks again; otherwise it
    returns up to the remaining number of rows whose ``statue`` is null or 0,
    ordered by ``采集时间``.

    The wait is a loop (not one recursive call per retry) so a long wait
    cannot hit the recursion limit, and every pool opened here is disposed
    of before the method returns or sleeps.

    Args:
        table: Name of the table to read.
        top: Maximum number of rows allowed to run at once.

    Returns:
        The result of the pool's ``getAll`` for the pending rows.
    """
    pending_sql = """ select * from %s where statue is null or statue =0 order by 采集时间 limit 0,%d """
    while True:
        db_pool = MyPymysqlPool("default")
        try:
            running = db_pool.getAll("select * from %s where statue=2 " % table)
            if running is False:
                # Nothing is running: the whole quota is free.
                free_slots = top
            elif len(running) < top:
                free_slots = top - len(running)
            else:
                free_slots = None
            if free_slots is not None:
                return db_pool.getAll(pending_sql % (table, free_slots))
        finally:
            db_pool.dispose()
        # Quota exhausted: give running rows time to finish, then re-check.
        time.sleep(5)
# task_master.py # coding=utf-8 # 多进程分布式例子 # 服务器端 from multiprocessing.managers import BaseManager from multiprocessing import freeze_support # server启动报错,提示需要引用此包 import random, time, queue from common.Mysql_Utils import MyPymysqlPool # 发送任务的队列 task_queue = queue.Queue() # 接收结果的队列 result_queue = queue.Queue() dbpool = MyPymysqlPool('default') # 从BaseManager继承的QueueManager class QueueManager(BaseManager): pass # win7 64 貌似不支持callable下调用匿名函数lambda,这里封装一下 def return_task_queue(): global task_queue return task_queue def return_result_queue(): global result_queue
from common.Mysql_Utils import MyPymysqlPool

# Fill sys_dic_utf8_code with one row per code point from 1 up to 55295
# (0xD7FF), i.e. everything below the surrogate range that starts at 0xD800.
#
# Table definition, kept for reference (run once before this script):
#   create table sys_dic_utf8_code(code int primary key,info varchar(3),delete_flag int);
dbpool = MyPymysqlPool("default")
try:
    for i in range(1, 55296):
        print(i, chr(i))
        # Backslash and single quote are escaped by hand because the
        # character is embedded in a quoted SQL literal.
        sql = "insert into sys_dic_utf8_code(code,info,delete_flag) values(%d,'%s',0)" % (
            i, chr(i).replace("\\", "\\\\").replace("'", "\\'"))
        dbpool.insert(sql)
finally:
    # Release the connection even if an insert (or the console print) fails.
    dbpool.dispose()
def __init__(self):
    """Attach a pool opened with the ``"default"`` name as ``self.dbpool``."""
    pool = MyPymysqlPool("default")
    self.dbpool = pool
class PageDict():
    """Crawl a paginated dictionary listing and upsert its rows into MySQL."""

    db_pool = MyPymysqlPool("default")

    def runDict(self, url, conf):
        """Crawl ``url`` and every following page, storing each page's rows.

        Pages are followed in a loop rather than by recursion so that long
        listings cannot exhaust the recursion limit.

        Args:
            url: Address of the first page.
            conf: Crawl configuration; reads ``columns`` (the ``名称`` of each
                entry is a column name) and ``tablename``.
        """
        dic_list = [row['名称'] for row in conf['columns']]
        while True:
            rule = Rule()
            result, nextPage = rule.crawler_list(url, conf, type_p='rg')
            print(nextPage)
            # Store the page's data.
            self.insertList(result=result, table=conf['tablename'], column_names=dic_list)
            if nextPage is None or url == nextPage:
                return
            url = nextPage

    def insertList(self, result='', table='', column_names=None):
        """Insert every row of ``result`` into ``table``.

        A missing table (MySQL error 1146) is created on the fly and the
        insert retried; a duplicate primary key (error 1062) turns the insert
        into an update of the existing row. Any other database error is
        re-raised unchanged.

        Args:
            result: Iterable of row mappings keyed by column name.
            table: Target table name.
            column_names: Columns to write, in order. Defaults to no columns.
        """
        column_names = [] if column_names is None else column_names
        columns = ",".join('`' + column_name + '`' for column_name in column_names)

        for row in result:
            values = ",".join(self._quote(row[column_name]) for column_name in column_names)
            sql = "insert into `" + table + "` (" + columns + ") values(" + values + ")"
            print(sql)
            try:
                self.db_pool.insert(sql=sql)
                self.db_pool._conn.commit()
            except pymysql.err.ProgrammingError as pye:
                if pye.args[0] != 1146:
                    # Not "table doesn't exist": let the original error out.
                    raise
                self._create_table(table, column_names)
                self.db_pool.insert(sql)
                self.db_pool._conn.commit()
            except pymysql.err.IntegrityError as pye:
                if pye.args[0] != 1062:
                    # Not "duplicate entry": let the original error out.
                    raise
                self._update_row(table, column_names, row)
                print("主键重复", pye.args[1])

    @staticmethod
    def _quote(value):
        """Render ``value`` as a quoted SQL literal.

        Single quotes become ``’`` and backslashes are dropped, so the text
        cannot terminate the literal early.
        """
        return "'" + str(value).replace("\'", "’").replace("\\", "") + "'"

    def _create_table(self, table, column_names):
        """Create ``table`` and add one ``varchar(255)`` column per name."""
        createsql = """create table """ + table + """ (`采集时间` varchar(20),`主键` varchar(32) primary key) DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci """
        print(createsql)
        self.db_pool.update(createsql)
        for column_name in column_names:
            altersql = " alter table " + table + " add column `" + column_name + "` varchar(255) ;"
            try:
                self.db_pool.update(altersql)
            except Exception as e:
                # 1060: the column already exists (e.g. the two columns the
                # create statement itself defines); anything else is only
                # reported so the remaining columns are still attempted.
                if e.args and e.args[0] == 1060:
                    print(table, column_name, "字段已经存在!")
                else:
                    print(e.args, "更新表字段")

    def _update_row(self, table, column_names, row):
        """Overwrite the existing row that shares ``row``'s ``主键``."""
        assignments = ",".join(
            "`" + column_name + "` = " + self._quote(row[column_name])
            for column_name in column_names)
        updatesql = "update " + table + " set " + assignments
        updatesql += " where `主键` = '" + row['主键'] + "'"
        print(updatesql)
        self.db_pool.update(updatesql)
        self.db_pool._conn.commit()
class PageDetail:
    """Crawl the detail page of each queued list row and store the result."""

    db_pool = MyPymysqlPool("default")
    databaseInsertList = DatabaseInsertList()

    def run(self, conf):
        """Single-process crawl: handle queued rows one after another.

        Reads ``columns``, ``urltable`` and the optional ``readtype`` /
        ``charset`` entries of ``conf``, then keeps fetching batches of rows
        until the reader returns ``False``.
        """
        columnNames = [column['名称'] for column in conf['columns']]
        listtable = conf['urltable']
        type_p = conf['readtype'] if 'readtype' in conf.keys() else 'rg'
        charset = conf['charset'] if 'charset' in conf.keys() else "utf8"

        store = self.databaseInsertList
        store.updateAllStatue(self.db_pool, table=listtable, statue=2)
        batch = store.readTop(db_pool=self.db_pool, table=listtable)
        while batch is not False:
            for row in batch:
                self.crawlerDetail(conf, listtable, type_p, charset, columnNames, row)
            batch = store.readExsistTop(table=listtable, top=10)

    def runProcess(self, conf):
        """Multi-process crawl: start one ``Process`` per queued row.

        Same configuration handling as ``run``; each row is handed to
        ``crawlerDetail`` in its own process, with a three second pause after
        every start.
        """
        columnNames = [column['名称'] for column in conf['columns']]
        listtable = conf['urltable']
        type_p = conf['readtype'] if 'readtype' in conf.keys() else 'rg'
        charset = conf['charset'] if 'charset' in conf.keys() else "utf8"

        store = self.databaseInsertList
        store.updateAllStatue(self.db_pool, table=listtable, statue=2)
        batch = store.readTop(db_pool=self.db_pool, table=listtable, top=10)
        while batch is not False:
            for row in batch:
                worker_args = (conf, listtable, type_p, charset, columnNames, row)
                worker = Process(target=self.crawlerDetail,
                                 name="crawlerDetail" + row['主键'],
                                 args=worker_args)
                worker.start()
                time.sleep(3)
            batch = store.readExsistTop(table=listtable, top=10)

    def crawlerDetail(self, conf, listtable, type_p, charset, columnNames, row):
        """Crawl one row's detail page and write the outcome to the database.

        The row is marked ``statue=2`` before crawling and ``statue=1`` after
        a successful insert. A ``pymysql`` ``DataError`` is recorded with the
        negated error code; any other exception is recorded with ``-100``.
        In both cases the message has backslashes and single quotes escaped.
        """
        rule = Rule()
        store = self.databaseInsertList
        store.updateStatue2(db_pool=self.db_pool, table=listtable, uuid=row['主键'], statue=2)
        try:
            result = rule.crawler_detail(conf=conf,
                                         url=row[conf['urlname']],
                                         type_p=type_p,
                                         charset=charset,
                                         row=row)
            store.insertDetail(result=result,
                               table=conf['tablename'],
                               column_names=columnNames,
                               db_pool=self.db_pool)
            store.updateStatue2(db_pool=self.db_pool, table=listtable, uuid=row['主键'], statue=1)
        except pymysql.err.DataError as pye:
            print(conf['urltable'], row['主键'], pye)
            store.updateMessage(
                db_pool=self.db_pool,
                table=conf['urltable'],
                uuid=row['主键'],
                statue=-int(pye.args[0]),
                message=str(pye.args[1]).replace("\\", "\\\\").replace("\'", "\\\'"))
        except Exception as e:
            store.updateMessage(
                db_pool=self.db_pool,
                table=conf['urltable'],
                uuid=row['主键'],
                statue=-100,
                message=str(e).replace("\\", "\\\\").replace("\'", "\\\'"))
class PageList:
    """Crawl paginated list pages for every queued URL row and store the rows."""

    db_pool = MyPymysqlPool("default")
    databaseInsertList = DatabaseInsertList()

    def runMulity(self, confs):
        """Run ``run`` once for each configuration in ``confs``."""
        for conf in confs:
            self.run(conf)

    def run(self, conf):
        """Single-process crawl of every row in ``conf['urltable']``.

        Each row is marked ``statue=2`` and crawled from its ``current_url``
        when one is stored, otherwise from the column named by
        ``conf['urlname']``. When the failure names a missing ``更新时间`` or
        ``current_url`` column, both columns are added and the run is
        retried.
        """
        dictable = conf['urltable']
        type_p, charset = self._read_options(conf)
        print(dictable)
        try:
            self.databaseInsertList.updateAllStatue(db_pool=self.db_pool, table=dictable, statue=2)
            dictList = self.databaseInsertList.readAll(db_pool=self.db_pool, table=dictable)
            if dictList is not False:
                # Write the data.
                for row in dictList:
                    self.databaseInsertList.updateStatue2(db_pool=self.db_pool, table=dictable,
                                                          uuid=row['主键'], statue=2)
                    url = row[conf['urlname']]
                    if row['current_url'] is not None and row['current_url'] != '':
                        url = row['current_url']
                    # The keyword is ``row_p``; passing ``row=`` raised
                    # TypeError for every row.
                    self.crawlerNext(conf, url=url, uuid=row['主键'], type_p=type_p,
                                     charset=charset, row_p=row)
        except Exception as e:
            print(e.args, "runList")
            if self._is_missing_column(e):
                # Add both columns, as runProcess does; adding only 更新时间
                # cannot cure a missing current_url.
                self._ensure_url_columns(dictable)
                self.run(conf)

    def runProcess(self, conf):
        """Multi-process crawl: start one ``Process`` per queued row.

        Reads the optional ``top`` (default 10), ``readtype`` and ``charset``
        entries of ``conf``. Rows are fetched in batches until the reader
        returns ``False``; a five second pause follows every process start.
        A failure naming a missing ``更新时间`` / ``current_url`` column adds
        both columns and retries.
        """
        dictable = conf['urltable']
        top = conf.get('top', 10)
        type_p, charset = self._read_options(conf)
        try:
            self.databaseInsertList.updateAllStatue(db_pool=self.db_pool, table=dictable, statue=2)
            dictList = self.databaseInsertList.readTop(db_pool=self.db_pool, table=dictable, top=top)
            while dictList is not False:
                # Write the data.
                for row in dictList:
                    self.databaseInsertList.updateStatue2(db_pool=self.db_pool, table=dictable,
                                                          uuid=row['主键'], statue=2)
                    url = row[conf['urlname']]
                    p = Process(target=self.crawlerNext, name="crawlerNext" + row['主键'],
                                args=(conf, url, row['主键'], type_p, charset))
                    p.start()
                    time.sleep(5)
                dictList = self.databaseInsertList.readExsistTop(table=dictable, top=top)
        except Exception as e:
            print(e)
            if self._is_missing_column(e):
                self._ensure_url_columns(dictable)
                self.runProcess(conf)

    def crawlerNext(self, conf, url='', uuid='', type_p='rg', charset='utf8', row_p=None):
        """Crawl ``url`` and its following pages for the list row ``uuid``.

        After each page with results, the next page's address is saved as the
        row's ``current_url`` and the crawl continues there; the row is set to
        ``statue=1`` when no further page exists and to ``statue=-2`` when a
        page yields no results.

        On failure: error 1001 marks the row ``statue=-1``; error 1054 adds
        the ``更新时间`` / ``current_url`` columns and retries with the same
        arguments; every other error is recorded through ``updateMessage``.

        Args:
            conf: Crawl configuration (``columns``, ``tablename``,
                ``urltable``).
            url: Page to crawl.
            uuid: ``主键`` of the list row being processed.
            type_p: Read type handed to ``Rule.crawler_list``.
            charset: Page charset handed to ``Rule.crawler_list``.
            row_p: Source row handed to ``Rule.crawler_list``; an empty dict
                when omitted.
        """
        if row_p is None:
            # Fresh dict per call instead of one shared mutable default.
            row_p = {}
        print(url, uuid, type_p, charset)
        try:
            rule = Rule()
            result, next_page = rule.crawler_list(url, conf, type_p, charset, row=row_p)
            print(next_page)
            if len(result) > 0:
                list_list = [row['名称'] for row in conf['columns']]
                self.databaseInsertList.insertList(result=result, table=conf['tablename'],
                                                   column_names=list_list, db_pool=self.db_pool)
                if next_page is not None and url != next_page:
                    self.updateCurrent(db_pool=self.db_pool, table=conf['urltable'], uuid=uuid,
                                       current=next_page)
                    self.db_pool.end("commit")
                    # Keep the row context for the following pages too.
                    self.crawlerNext(conf, url=next_page, uuid=uuid, type_p=type_p,
                                     charset=charset, row_p=row_p)
                else:
                    self.updateStatue(db_pool=self.db_pool, table=conf['urltable'], uuid=uuid,
                                      statue=1)
                    self.db_pool.end("commit")
            else:
                # updateStatue2 lives on databaseInsertList, not on this class.
                self.databaseInsertList.updateStatue2(db_pool=self.db_pool,
                                                      table=conf['urltable'],
                                                      uuid=uuid, statue=-2)
        except Exception as e:
            print(e.args)
            code = e.args[0] if e.args else None
            if code == 1001:
                self.databaseInsertList.updateStatue2(db_pool=self.db_pool,
                                                      table=conf['urltable'],
                                                      uuid=uuid, statue=-1)
                self.db_pool.end("commit")
            if code == 1054:
                self._ensure_url_columns(conf['urltable'])
                # Retry with the caller's settings, not the defaults.
                self.crawlerNext(conf, url, uuid, type_p, charset, row_p)
            else:
                statue, message = self._describe_error(e)
                self.databaseInsertList.updateMessage(db_pool=self.db_pool,
                                                      table=conf['urltable'],
                                                      uuid=uuid, statue=statue, message=message)

    def updateStatue(self, db_pool, table='', uuid='', statue=1):
        """Set the row's ``statue`` and clear its ``current_url``."""
        sql = """ update %s set statue = %d,current_url=null where 主键='%s' """ % (table, statue, uuid)
        return db_pool.update(sql)

    def updateCurrent(self, db_pool, table='', uuid='', current=''):
        """Store ``current`` as the row's ``current_url``."""
        sql = """ update %s set current_url='%s' where 主键='%s' """ % (table, current, uuid)
        return db_pool.update(sql)

    def updateError(self, db_pool, table='', uuid='', statue=-1, message=''):
        """Store ``statue`` and ``message`` on the row.

        If the update fails, a ``message`` column is added to ``table`` and
        the same update is attempted once more.

        Returns:
            The result of the pool's ``update`` call.
        """
        # NOTE(review): ``message`` is inserted without surrounding quotes,
        # so callers presumably pass an already-quoted SQL value — confirm.
        sql = """ update %s set statue = %d,message=%s where 主键='%s' """ % (table, statue, message, uuid)
        try:
            return db_pool.update(sql)
        except Exception:
            upsql = " alter table %s add column `message` varchar(2000)" % table
            db_pool.update(upsql)
            # Re-run the statement itself; updateStatue accepts no message.
            return db_pool.update(sql)

    @staticmethod
    def _read_options(conf):
        """Return ``(readtype, charset)`` from ``conf`` with their defaults."""
        return conf.get('readtype', 'rg'), conf.get('charset', "utf8")

    @staticmethod
    def _is_missing_column(exc):
        """Tell whether ``exc`` names the ``更新时间`` or ``current_url`` column."""
        return bool(exc.args) and exc.args[0] in ("更新时间", 'current_url')

    @staticmethod
    def _describe_error(exc):
        """Return ``(statue, message)`` to record for ``exc``.

        Errors shaped like ``(code, text)`` map to ``(-code, text)``; anything
        else maps to ``(-100, str(exc))``. Backslashes and single quotes in
        the message are escaped, as PageDetail does before ``updateMessage``.
        """
        statue, message = -100, str(exc)
        if len(exc.args) >= 2:
            try:
                statue, message = -int(exc.args[0]), str(exc.args[1])
            except (TypeError, ValueError):
                pass
        return statue, message.replace("\\", "\\\\").replace("\'", "\\\'")

    def _add_column(self, table, column, definition):
        """Add ``column`` to ``table``; report, but do not raise, on failure."""
        altersql = " alter table `" + table + "` add column `" + column + "` " + definition
        try:
            self.db_pool.update(altersql)
        except Exception as e:
            # 1060: the column is already there.
            if e.args and e.args[0] == 1060:
                print(table, column + " 字段已经存在!")
            else:
                print(e.args, "更新表字段")

    def _ensure_url_columns(self, table):
        """Add the ``更新时间`` and ``current_url`` columns to ``table``."""
        self._add_column(table, "更新时间", "timestamp on update current_timestamp")
        self._add_column(table, "current_url", "varchar(500)")