def init_main():
    # Startup bootstrap driven by the 'restart' flag in Config.ini.
    # When restart == 1: (re)create the result table and the URL-buffer table,
    # then reset every progress flag so the crawl starts from page 1.
    if '1' in str(
            Read_buff(file_buff="Config.ini", settion=SearchDBName, info='restart')):
        CreatResultDBTable(db, Dbresult)
        CreatUrlBuffTable(db, DbDatabuff)
        time.sleep(0.02)  # brief pause between table creation and config writes
        Write_buff(file_buff="Config.ini", settion=SearchDBName, info="restart", state=0)
        Write_buff(file_buff="Config.ini", settion=SearchDBName, info="startpage", state=1)
        Write_buff(file_buff="Config.ini", settion=SearchDBName, info="stopflag", state=0)
        Write_buff(file_buff="Config.ini", settion=SearchDBName, info="flag_get_all_url", state=0)
    # NOTE(review): 'restart' is deliberately re-read here — the branch above has
    # just written it to 0, so after a restart this cleanup branch runs as well.
    if '0' in str(
            Read_buff(file_buff="Config.ini", settion=SearchDBName, info='restart')):
        # Requeue rows left mid-crawl (State 10 appears to mean "in progress" —
        # TODO confirm against the rest of the project).
        db.upda_sql("Update `%s` set `State`=0 where `State`=10" % DbDatabuff)
        time.sleep(1)
def run(self): print('Cnki:启动%d号解析线程' % self.number) # 无限循环, while True: # 如何判断解析线程的结束条件 for t in self.req_thread: # 循环所有采集线程 if t.is_alive(): # 判断线程是否存活 break else: # 如果循环完毕,没有执行break语句,则进入else if self.data_list.qsize() == 0: # 判断数据队列是否为空 self.is_parse = False # 设置解析为False # 判断是否继续解析 if self.is_parse or '0' in (Read_buff(file_buff="Config.ini", settion=SearchDBName, info='stopflag')): # 解析 try: url, data = self.data_list.get(timeout=3) # 从数据队列里提取一个数据 except Exception as e: # 超时以后进入异常 data = None # 如果成功拿到数据,则调用解析方法 if data is not None: parse(url, data) # 调用解析方法 else: break # 结束while 无限循环 print('Cnki:退出%d号解析线程' % self.number)
def run(self): print('启动%d号解析线程' % self.number) # 无限循环, while True: # 如何判断解析线程的结束条件 for t in self.req_thread: # 循环所有采集线程 if t.is_alive(): # 判断线程是否存活 break else: # 如果循环完毕,没有执行break语句,则进入else if self.data_list.qsize() == 0: # 判断数据队列是否为空 self.is_parse = False # 设置解析为False # 判断是否继续解析 if self.is_parse or int( Read_buff(file_buff="Config.ini", settion="Wanfang", info='stopflag')) == 0: # 解析 try: url, data = self.data_list.get(timeout=3) # 从数据队列里提取一个数据 except Exception as e: # 超时以后进入异常 data = None # 如果成功拿到数据,则调用解析方法 if data is not None and Wanfang.running: Paper = Wanfang.GetFurtherPaper(url, data) else: break # 结束while 无限循环 print('退出%d号解析线程' % self.number)
def WriteAllUrlIntoDBMain(self): summarys, self.MaxPage = self.GetMaxPage() # 最大页数 self.StartPage = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='startpage') # 开始页数 t = time.time() Write_buff(file_buff="Config.ini", settion=SearchDBName, info="flag_get_all_url", state=0) for i in range(int(self.StartPage), self.MaxPage): print("Cnki:共有%s页,当前为%s页,获得文献链接的进度完成%.2f" % (self.MaxPage, i, (int(i) / int(self.MaxPage)) * 100)) Write_buff(file_buff="Config.ini", settion=SearchDBName, info="startpage", state=i + 1) keywordval = self.BaseKeyword page_url = 'http://search.cnki.com.cn/Search.aspx?q=%s&p=%s' % ( quote(keywordval), (i - 1) * 15) threading.Thread(target=self.WriteUrlIntoDB, args=(page_url, i)).start() time.sleep(1) Write_buff(file_buff="Config.ini", settion=SearchDBName, info="flag_get_all_url", state=1) print(time.time() - t) sys.exit(0)
def GetAllUrl(self): total_record_num, self.MaxPage, index_url = self.GetMaxPage() # 最大页数 self.StartPage = Read_buff(file_buff=self.SettingPath, settion=self.SearchName, info='startpage') # 开始页数 t = time.time() Write_buff(file_buff="Config.ini", settion="Wanfang", info="flag_get_all_url", state=0) for i in range(int(self.StartPage), self.MaxPage + 1): print("共有%s页,当前为%s页,获得文献链接的进度完成%.2f" % (self.MaxPage, i, (int(i) / int(self.MaxPage)) * 100)) Write_buff(file_buff="Config.ini", settion="Wanfang", info="startpage", state=i + 1) url_list = self.GetFurtherUrl(i, index_url) threading.Thread(target=self.WriteUrlIntoDB, args=(url_list, )).start() # self.further_url.extend(self.GetFurtherUrl(i, index_url)) time.sleep(0.5) Write_buff(file_buff="Config.ini", settion="Wanfang", info="flag_get_all_url", state=1) print(time.time() - t)
def ProcessMain():
    """Entry point for the Cqvip crawl process.

    Builds the shared DB handle and crawler object as module globals, runs the
    restart bootstrap, and starts the crawl unless the stop flag is raised.
    """
    global db, Cqvip
    db = HCJ_MySQL()
    Cqvip = Cqvip_Crawler(db=db)
    # Required when the program is frozen/packaged with multiprocessing.
    multiprocessing.freeze_support()
    init_main()
    stop_flag = Read_buff(file_buff="Config.ini", settion=SearchDBName, info='stopflag')
    if '0' in (stop_flag):
        main()
def __init__(self, db, Input=None, SearchMode=None, StartTime=None, EndTime=None,
             StartPage=None, SettingPath='./Config.ini'):
    # Cqvip crawler: loads search parameters from Config.ini (when no explicit
    # Input/SearchMode is given) and builds the query-string fragment used by
    # the search URL. StartTime/EndTime/StartPage parameters are currently
    # unused in this branch.
    self.db = db
    self.SearchName = SearchDBName  # NOTE(review): original comment said "万方" (Wanfang); this is the Cqvip crawler
    self.SettingPath = SettingPath  # config file path
    self._Perpage = 10  # results per page (NOTE(review): original comment said 20; value is 10 — confirm)
    self._ResultDbTable = 'CqvipResult'
    if Input is None and SearchMode is None:
        self.StartTime = Read_buff(file_buff=self.SettingPath, settion=self.SearchName,
                                   info='starttime')  # start year
        self.EndTime = Read_buff(file_buff=self.SettingPath, settion=self.SearchName,
                                 info='endtime')  # end year
        self.StartPage = Read_buff(file_buff=self.SettingPath, settion=self.SearchName,
                                   info='startpage')  # starting page
        self.title = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='title')
        self.authors = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='authors')
        self.keywords = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='keywords')
        self.unit = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='unit')
        # Assemble the URL query fragment; field codes match the module-level
        # `values` map (k=title/keyword, w=author, o=affiliation).
        self.BaseKeyword = ""
        if RemoveSpecialCharacter(self.title) != "":
            self.BaseKeyword = self.BaseKeyword + "&k=" + quote(self.title)
        if RemoveSpecialCharacter(self.authors) != "":
            self.BaseKeyword = self.BaseKeyword + "&w=" + quote(self.authors)
        if RemoveSpecialCharacter(self.keywords) != "":
            self.BaseKeyword = self.BaseKeyword + "&k=" + quote(self.keywords)
        if RemoveSpecialCharacter(self.unit) != "":
            self.BaseKeyword = self.BaseKeyword + "&o=" + quote(self.unit)
    else:
        # Todo: explicit Input/SearchMode handling not implemented
        pass
def __init__(self, db, Input=None, SearchMode=None, StartTime=None, EndTime=None,
             StartPage=None, SettingPath='./Config.ini'):
    # Wanfang crawler: loads search parameters from Config.ini (when no explicit
    # Input/SearchMode is given) and builds the field-prefixed keyword string
    # used by the Wanfang search syntax. StartTime/EndTime/StartPage parameters
    # are currently unused in this branch.
    self.db = db
    self.SearchName = 'Wanfang'  # Wanfang
    self.SettingPath = SettingPath  # config file path
    self._Perpage = 50  # results per page (50)
    self._ResultDbTable = 'WanFangResult'
    self.running = False  # marks whether the crawler is running normally
    self.further_url = list()
    if Input is None and SearchMode is None:
        self.StartTime = Read_buff(file_buff=self.SettingPath, settion=self.SearchName,
                                   info='starttime')  # start year
        self.EndTime = Read_buff(file_buff=self.SettingPath, settion=self.SearchName,
                                 info='endtime')  # end year
        self.StartPage = Read_buff(file_buff=self.SettingPath, settion=self.SearchName,
                                   info='startpage')  # starting page
        self.MaxPage = Read_buff(file_buff=self.SettingPath, settion=self.SearchName,
                                 info='maxpage')  # max page (original comment said "start page")
        self.title = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='title')
        self.authors = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='authors')
        self.keywords = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='keywords')
        # NOTE(review): stored as `publication` but read from the 'unit'
        # (affiliation) config key — confirm which is intended.
        self.publication = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='unit')
        # Assemble the Wanfang field-prefixed search expression.
        self.BaseKeyword = ""
        if RemoveSpecialCharacter(self.title) != "":
            self.BaseKeyword = self.BaseKeyword + " 标题:" + self.title
        if RemoveSpecialCharacter(self.authors) != "":
            self.BaseKeyword = self.BaseKeyword + " 作者:" + self.authors
        if RemoveSpecialCharacter(self.keywords) != "":
            self.BaseKeyword = self.BaseKeyword + " 关键词:" + self.keywords
        if RemoveSpecialCharacter(self.publication) != "":
            self.BaseKeyword = self.BaseKeyword + " 作者单位:" + self.publication
    else:
        # Todo: explicit Input/SearchMode handling not implemented
        pass
def ProcessMain():
    # Entry point for the Wanfang crawl process: set up the shared DB handle
    # and crawler as module globals, bootstrap, then run unless stopped.
    global db, Wanfang
    multiprocessing.freeze_support()  # required when packaged with multiprocessing
    db = HCJ_MySQL()
    Wanfang = WanFangCrawler(db=db)
    init_main()
    if '0' in str(
            Read_buff(file_buff="Config.ini", settion=SearchDBName, info='stopflag')):
        main()
def run(self):
    """Collector thread: pull URLs off the request queue, fetch each page,
    and push (url, response) pairs onto the data queue for the parsers."""
    # Announce thread start
    print('启动采集线程%d号' % self.number)
    while True:
        # Exit only when the request queue is empty AND the stop flag is no
        # longer '0' (short-circuit keeps the config read lazy, as before).
        if not (self.req_list.qsize() > 0) and '0' not in (
                Read_buff(file_buff="Config.ini", settion=SearchDBName, info='stopflag')):
            break
        next_url = self.req_list.get()
        # print('%d号线程采集:%s' % (self.number, url))
        # Random pause so requests are not fired too quickly.
        pause = random.randint(interval * 10, (interval + 2) * 10) / 10
        time.sleep(pause)
        # Fetch the page and queue the response for parsing.
        page = GetSoup(next_url)
        self.data_list.put([next_url, page])
def WriteAllUrlIntoDBMain(self): summarys, self.MaxPage = self.GetMaxPage() # 最大页数 self.StartPage = Read_buff(file_buff=self.SettingPath, settion=self.SearchName, info='startpage') # 开始页数 t = time.time() Write_buff(file_buff="Config.ini", settion=SearchDBName, info="flag_get_all_url", state=0) for i in range(int(self.StartPage), self.MaxPage): print("%s采集器,共有%s页,当前为%s页,获得文献链接的进度完成%.2f" % (SearchDBName,self.MaxPage, i, (int(i) / int(self.MaxPage)) * 100)) Write_buff(file_buff="Config.ini", settion=SearchDBName, info="startpage", state=i + 1) page_url = "http://www.cqvip.com/data/main/search.aspx?action=so&curpage=%s&perpage=20&%s" % ( str(i), self.BaseKeyword) threading.Thread(target=self.WriteUrlIntoDB, args=(page_url, i)).start() time.sleep(0.5) Write_buff(file_buff="Config.ini", settion=SearchDBName, info="flag_get_all_url", state=1) print(time.time() - t)
def __init__(self, db, Input=None, SearchMode=None, StartTime=None, EndTime=None,
             StartPage=None, SettingPath='./Config.ini'):
    # Cnki crawler: loads search parameters from Config.ini (when no explicit
    # Input/SearchMode is given) and builds the field-prefixed keyword string.
    # StartTime/EndTime/StartPage parameters are currently unused in this branch.
    self.db = db
    self.SettingPath = SettingPath  # config file path
    if Input is None and SearchMode is None:
        self.StartTime = Read_buff(file_buff=self.SettingPath, settion=SearchDBName,
                                   info='starttime')  # start year
        self.EndTime = Read_buff(file_buff=self.SettingPath, settion=SearchDBName,
                                 info='endtime')  # end year
        self.StartPage = Read_buff(file_buff=self.SettingPath, settion=SearchDBName,
                                   info='startpage')  # starting page
        self.MaxPage = Read_buff(file_buff=self.SettingPath, settion=SearchDBName,
                                 info='maxpage')  # max page (original comment said "start page")
        self.title = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='title')
        self.authors = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='authors')
        self.keywords = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='keywords')
        self.unit = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='unit')
        # Assemble the Cnki field-prefixed search expression.
        self.BaseKeyword = ""
        if RemoveSpecialCharacter(self.title) != "":
            self.BaseKeyword = self.BaseKeyword + " title:" + self.title
        if RemoveSpecialCharacter(self.authors) != "":
            self.BaseKeyword = self.BaseKeyword + " author:" + self.authors
        if RemoveSpecialCharacter(self.keywords) != "":
            self.BaseKeyword = self.BaseKeyword + " qw:" + self.keywords
        if RemoveSpecialCharacter(self.unit) != "":
            self.BaseKeyword = self.BaseKeyword + " 作者单位:" + self.unit
    else:
        # Todo: explicit Input/SearchMode handling not implemented
        pass
def ShowStatePro(db, SearchDBName, DbDatabuff, Dbresult):
    # Progress report: count buffered URLs for this source and how many have
    # been processed (State 20 = done, -5 = year out of range, -15 = bad link —
    # TODO confirm state codes against the rest of the project). When URL
    # collection is finished and every row is processed, raise the stop flag
    # and exit the process.
    # NOTE(review): SQL is built by string interpolation of internal table /
    # source names; fine for trusted values, not for external input.
    sql_count_all = "select count(*) from `%s` where `Source`='%s'" % (DbDatabuff, SearchDBName)
    num_all = int(db.do_sql_one(sql_count_all)[0])
    sql_count_done = "select count(*) from `%s` where `State`=20 and `Source`='%s'" % (DbDatabuff, SearchDBName)
    num_done = int(db.do_sql_one(sql_count_done)[0])
    sql_count_error = "select count(*) from `%s` where `State`=-15 and `Source`='%s'" % (DbDatabuff, SearchDBName)
    num_error = int(db.do_sql_one(sql_count_error)[0])
    num_error = num_error if num_error > 0 else 0  # clamp (count(*) is never negative; kept as-is)
    sql_count_done_not_in_year = "select count(*) from `%s` where `State`=-5 and `Source`='%s'" % (DbDatabuff, SearchDBName)
    num_done_not_in_year = int(db.do_sql_one(sql_count_done_not_in_year)[0])
    num_done_not_in_year = num_done_not_in_year if num_done_not_in_year > 0 else 0  # clamp, see above
    # "done" counts successes plus rows rejected for year or bad link
    num_done = num_done + num_done_not_in_year + num_error
    if num_all > 0:
        print(
            "%s采集器:#############################################目前有%s条数据,其中已处理的有%s,其中年份不符合的有%s,无效链接%s,处理完成度为%.2f,##############################" % (
                SearchDBName, num_all, num_done, num_done_not_in_year, num_error,
                (int(num_done) / int(num_all)) * 100))
    if '1' in str(Read_buff(file_buff="Config.ini", settion=SearchDBName,
                            info='flag_get_all_url')) and num_all == num_done:
        # Everything finished: signal all threads to stop, then exit.
        Write_buff(file_buff="Config.ini", settion=SearchDBName, info="stopflag", state=1)
        time.sleep(5)  # give worker threads time to observe the flag
        print("%s:爬取结束" % SearchDBName)
        sys.exit(0)
from HCJ_Buff_Control import Read_buff, Write_buff #构造不同条件的关键词搜索 from HCJ_DB_Helper import HCJ_MySQL SearchDBName = "Cnki" from PublicDef import * headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0' } concurrent = int( Read_buff(file_buff="./Config.ini", settion='Setting', info='Cnki_CollectNum').replace(' ', '')) conparse = int( Read_buff(file_buff="./Config.ini", settion='Setting', info='Cnki_parsenum').replace(' ', '')) interval = int( Read_buff(file_buff="./Config.ini", settion='Setting', info='Cnki_interval').replace(' ', '')) # 生成请求队列 req_list = queue.Queue() # 生成数据队列 ,请求以后,响应内容放到数据队列里 data_list = queue.Queue()
# Cqvip crawler module setup: search-field codes, thread counts and request
# interval from Config.ini, shared work queues, and the parser-thread class.
import re
from HCJ_Buff_Control import Read_buff, Write_buff  # build keyword searches for different conditions
from HCJ_DB_Helper import HCJ_MySQL
from PublicDef import *

SearchDBName = "Cqvip"
# Cqvip search-field URL parameter codes
values = {
    '1': 'k',  # title
    '2': 'w',  # author
    '3': 'k',  # keyword
    '4': 'o',  # affiliation
    '5': 'mn',  # journal name
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
# Number of collector threads
concurrent = int(Read_buff(file_buff="./Config.ini", settion='Setting', info='Cqvip_CollectNum').replace(' ', ''))
# Number of parser threads
conparse = int(Read_buff(file_buff="./Config.ini", settion='Setting', info='Cqvip_parsenum').replace(' ', ''))
# Base delay (seconds) between requests
interval = int(Read_buff(file_buff="./Config.ini", settion='Setting', info='Cqvip_interval').replace(' ', ''))
# Request queue
req_list = queue.Queue()
# Data queue: responses are appended here after each request, awaiting parsing
data_list = queue.Queue()


class Parse(threading.Thread):
    # Parser thread: consumes items from the data queue (class body continues
    # elsewhere in the file).

    # Initialise attributes
    def __init__(self, number, data_list, req_thread):
        super(Parse, self).__init__()
        self.number = number  # thread number
        self.data_list = data_list  # data queue
        self.req_thread = req_thread  # collector threads, kept to check their liveness
# encoding:utf-8 # name:mod_db.py ''' 使用方法:1.在主程序中先实例化DB Mysql数据库操作类。 2.使用方法:db=database() db.fetch_all("sql") ''' import time import pymysql as MySQLdb from DBUtils.PooledDB import PooledDB from HCJ_Buff_Control import Read_buff DBNAME = Read_buff(file_buff="Config.ini", settion="DB",info='DBNAME') DBHOST = Read_buff(file_buff="Config.ini", settion="DB",info='DBHOST') DBUSER = Read_buff(file_buff="Config.ini", settion="DB",info='DBUSER') DBPWD = Read_buff(file_buff="Config.ini", settion="DB",info='DBPWD') DBCHARSET =Read_buff(file_buff="Config.ini", settion="DB",info='DBCHARSET') DBPORT =Read_buff(file_buff="Config.ini", settion="DB",info='DBPORT') limit_count1 =Read_buff(file_buff="Config.ini", settion="DB",info='limit_count') class HCJ_MySQL: pool = None limit_count = int(limit_count1.replace(" ","")) # 最低预启动数据库连接数量 def __init__(self,log=None,dbname=None,dbhost=None): if dbname is None: self._dbname = DBNAME else: self._dbname = dbname