def init():
    """Reset crawler state at startup.

    When the 'restart' flag is set, recreate the result and URL-buffer
    tables and reset every progress flag in the config file; afterwards
    (flag now '0') requeue URL rows stuck in the in-progress state.
    """
    def _restart_flag():
        # Always re-read from disk: the first branch rewrites the flag.
        return str(ReadConfig(file_name=ConfigName, section=SearchDBName, attr='restart'))

    if '1' in _restart_flag():
        CreatResultDBTable(db, Dbresult)
        CreatUrlBuffTable(db, DbDatabuff)
        time.sleep(0.1)
        # Reset all progress flags so the crawl starts from page 1.
        for attr, value in (('restart', 0), ('startpage', 1),
                            ('stopflag', 0), ('flag_get_all_url', 0)):
            WriteConfig(file_name=ConfigName, section=SearchDBName, attr=attr, value=value)
    # NOTE: re-read on purpose — the branch above resets 'restart' to 0,
    # so this cleanup also runs right after a restart.
    if '0' in _restart_flag():
        # Requeue URLs left in the "being processed" state (10).
        db.upda_sql("Update `%s` set `State`=0 where `State`=10" % DbDatabuff)
        time.sleep(1)
def run(self):
    """Parse-thread main loop.

    Pulls (url, html) pairs off the data queue and hands them to the
    crawler.  Keeps running while collector threads are alive or work
    remains; exits once parsing is finished and the stop flag is raised.
    """
    print('启动%d号解析线程' % self.number)
    while True:
        # Once no collector thread is alive and the data queue has
        # drained, stop accepting new work.
        collectors_alive = any(t.is_alive() for t in self.req_thread)
        if not collectors_alive and self.data_list.qsize() == 0:
            self.is_parse = False
        # Short-circuit kept from the original: the config file is only
        # consulted once parsing has been flagged off.
        if not self.is_parse and int(
                ReadConfig(file_name=ConfigName, section=SearchDBName, attr='stopflag')) != 0:
            break  # leave the infinite loop
        data = None
        try:
            # Block up to 3 s for a queued item; a timeout leaves data None.
            url, data = self.data_list.get(timeout=3)
        except Exception:
            pass
        # Only parse when we actually got data and the crawler is running.
        if data is not None and Wanfang.running:
            Wanfang.GetFurtherPaper(url, data)
    print('退出%d号解析线程' % self.number)
def GetAllUrl(self):
    """Collect article links from every result page.

    Walks from the start page saved in the config file to the last
    page, fetches each page's links, and stores them in the URL-buffer
    table via background writer threads.  Progress is persisted after
    every page so an interrupted run can resume.
    """
    total_record_num, self.MaxPage, index_url = self.GetMaxPage()  # last page number
    # Resume from the page recorded in the config file.
    self.StartPage = ReadConfig(file_name=ConfigName, section=SearchDBName, attr='startpage')
    started_at = time.time()
    WriteConfig(file_name=ConfigName, section=SearchDBName, attr='flag_get_all_url', value=0)
    for page in range(int(self.StartPage), self.MaxPage + 1):
        print("共有%s页,当前为%s页,获得文献链接的进度完成%.2f" % (self.MaxPage, page, (int(page) / int(self.MaxPage)) * 100))
        # Persist the next page first so a crash resumes past this page.
        WriteConfig(file_name=ConfigName, section=SearchDBName, attr='startpage', value=page + 1)
        page_urls = self.GetFurtherUrl(page, index_url)
        # Write to the DB off-thread so page fetching is not blocked.
        threading.Thread(target=self.WriteUrlIntoDB, args=(page_urls,)).start()
        time.sleep(0.5)
    # Signal that the URL-gathering phase is complete.
    WriteConfig(file_name=ConfigName, section=SearchDBName, attr='flag_get_all_url', value=1)
    print(time.time() - started_at)
def WanfangProcess():
    """Process entry point for the Wanfang crawler.

    Builds the DB pool and the crawler instance, runs state
    initialisation, then starts the crawl unless the stop flag is set.
    """
    global db, Wanfang
    # Needed for frozen Windows builds: Windows has no fork and spawns
    # child processes instead.
    multiprocessing.freeze_support()
    db = MysqlPool()
    Wanfang = WanFangCrawler(db=db)
    init()
    stop_flag = str(ReadConfig(file_name=ConfigName, section=SearchDBName, attr='stopflag'))
    if '0' in stop_flag:
        main()
def __init__(self, db, StartTime=None, EndTime=None, StartPage=None):
    """
    :param db: database pool/connection instance
    :param StartTime: start year; overrides the config value when given
    :param EndTime: end year; overrides the config value when given
    :param StartPage: first page to crawl; overrides the config value when given
    """
    self.db = db
    self.SearchName = SearchDBName  # Wanfang section name
    self.ConfigPath = ConfigName  # path of the config file
    self._Perpage = 50  # results shown per page
    self.running = False  # marks whether the crawler is running normally
    self.further_url = list()

    def _cfg(attr):
        # Single point for config reads from this crawler's section.
        return ReadConfig(file_name=ConfigName, section=self.SearchName, attr=attr)

    # BUGFIX: the original only initialised the instance when ALL three
    # optional arguments were None (`else: pass`), so passing any
    # argument left the object with no attributes set at all.  Now the
    # config supplies defaults and explicit arguments take precedence;
    # the default (all-None) call path behaves exactly as before.
    self.StartTime = _cfg('starttime') if StartTime is None else StartTime  # start year
    self.EndTime = _cfg('endtime') if EndTime is None else EndTime  # end year
    self.StartPage = _cfg('startpage') if StartPage is None else StartPage  # start page
    self.MaxPage = _cfg('maxpage')  # last page number
    self.title = _cfg('title')
    self.authors = _cfg('authors')
    self.keywords = _cfg('keywords')
    self.unit = _cfg('unit')

    # Assemble the search expression from whichever fields are non-empty.
    self.BaseKeyword = ""
    if RemoveSpecialCharacter(self.title) != "":
        self.BaseKeyword = self.BaseKeyword + " 标题:" + self.title
    if RemoveSpecialCharacter(self.authors) != "":
        self.BaseKeyword = self.BaseKeyword + " 作者:" + self.authors
    if RemoveSpecialCharacter(self.keywords) != "":
        self.BaseKeyword = self.BaseKeyword + " 关键词:" + self.keywords
    if RemoveSpecialCharacter(self.unit) != "":
        self.BaseKeyword = self.BaseKeyword + " 作者单位:" + self.unit
def run(self):
    """Collector-thread main loop.

    Takes URLs from the request queue, fetches each with a randomized
    delay, and pushes [url, response] pairs onto the data queue for the
    parser threads.  Runs while the queue has work or the stop flag is
    still 0.
    """
    print('启动采集线程%d号' % self.number)
    while True:
        # Short-circuit kept from the original: the config file is only
        # consulted when the request queue looks empty.
        if self.req_list.qsize() == 0 and int(
                ReadConfig(file_name=ConfigName, section=SearchDBName, attr='stopflag')) != 0:
            break
        # get() removes and returns the head of the queue (blocks when
        # the queue is empty).
        url = self.req_list.get()
        # Random 3.0–5.0 s pause so requests don't hit the server too fast.
        time.sleep(random.randint(30, 50) / 10)
        # Fetch the page and hand the response off to the parser threads.
        response = Wanfang.VisitHtml(url)
        self.data_list.put([url, response])
""" __info__ = { 'class1': 'Mysql', 'class2': 'MysqlPool', 'function1': 'CreateDB', 'function2': 'CreateTable', 'author': 'gs', 'time': '20190716' } import pymysql as MySQLdb from DBUtils.PooledDB import PooledDB from ConfigHelper import ReadConfig DBNAME = ReadConfig(file_name="Config.ini", section="DB", attr="dbname") DBHOST = ReadConfig(file_name="Config.ini", section="DB", attr="dbhost") DBUSER = ReadConfig(file_name="Config.ini", section="DB", attr="dbuser") DBPWD = ReadConfig(file_name="Config.ini", section="DB", attr="dbpwd") DBCHARSET = ReadConfig(file_name="Config.ini", section="DB", attr="dbcharset") DBPORT = ReadConfig(file_name="Config.ini", section="DB", attr="dbport") class Mysql: """ 通过普通方式操作MySQL数据库的类 """ # 注,python的self等于其它语言的this def __init__(self, log=None,
import time import random import re from ConfigHelper import WriteConfig, ReadConfig from MysqlHelper import MysqlPool from PublicMethod import RemoveSpecialCharacter, InitDict, CreatResultDBTable, CreatUrlBuffTable SearchDBName = 'Wanfang' ConfigName = 'Config.ini' concurrent = 3 # 采集线程数 conparse = 5 # 解析线程数 req_queue = queue.Queue() # 生成请求队列 data_queue = queue.Queue() # 生成数据队列 ,请求以后,响应内容放到数据队列里 ex_dbname = ReadConfig(file_name=ConfigName, section=SearchDBName, attr='ex_dbname') DbDatabuff = "databuff" + str(ex_dbname) Dbresult = "result" + str(ex_dbname) class Parse(threading.Thread): # 初始化属性 def __init__(self, number, data_list, req_thread): super(Parse, self).__init__() self.number = number # 线程编号 self.data_list = data_list # 数据队列 self.req_thread = req_thread # 请求队列,为了判断采集线程存活状态 self.is_parse = True # 判断是否从数据队列里提取数据 def run(self):