def init_main():
    """Initialise crawler state in Config.ini and the URL-buffer table.

    On a requested restart ('restart' == 1) the result/buffer tables are
    recreated and all progress counters are reset.  The flag is then
    re-read: once it reads back as 0 (including immediately after the
    reset above), rows left mid-processing (State 10) are released.
    """
    if '1' in str(Read_buff(file_buff="Config.ini", settion=SearchDBName, info='restart')):
        # Fresh run: rebuild both tables, then zero out every counter.
        CreatResultDBTable(db, Dbresult)
        CreatUrlBuffTable(db, DbDatabuff)
        time.sleep(0.02)
        for key, value in (("restart", 0), ("startpage", 1),
                           ("stopflag", 0), ("flag_get_all_url", 0)):
            Write_buff(file_buff="Config.ini", settion=SearchDBName, info=key, state=value)
    # Deliberately re-read: the branch above may have just flipped the flag to 0.
    if '0' in str(Read_buff(file_buff="Config.ini", settion=SearchDBName, info='restart')):
        # Resume: release rows that were stuck mid-processing.
        db.upda_sql("Update `%s` set `State`=0 where `State`=10" % DbDatabuff)
        time.sleep(1)
def GetAllUrl(self):
    """Walk every Wanfang result page, spawning one DB-writer thread per page.

    Progress ('startpage') is checkpointed to Config.ini after each page so
    an interrupted run can resume where it left off.
    """
    total_record_num, self.MaxPage, index_url = self.GetMaxPage()  # total page count
    # Resume point recorded by a previous run.
    self.StartPage = Read_buff(file_buff=self.SettingPath, settion=self.SearchName, info='startpage')
    started_at = time.time()
    Write_buff(file_buff="Config.ini", settion="Wanfang", info="flag_get_all_url", state=0)
    for page in range(int(self.StartPage), self.MaxPage + 1):
        print("共有%s页,当前为%s页,获得文献链接的进度完成%.2f" % (self.MaxPage, page, (int(page) / int(self.MaxPage)) * 100))
        # Checkpoint BEFORE fetching, so a crash mid-page skips it on resume.
        Write_buff(file_buff="Config.ini", settion="Wanfang", info="startpage", state=page + 1)
        url_list = self.GetFurtherUrl(page, index_url)
        threading.Thread(target=self.WriteUrlIntoDB, args=(url_list, )).start()
        time.sleep(0.5)  # throttle requests
    Write_buff(file_buff="Config.ini", settion="Wanfang", info="flag_get_all_url", state=1)
    print(time.time() - started_at)
def WriteAllUrlIntoDBMain(self):
    """Queue a writer thread for every CNKI search-result page, then exit.

    The process terminates via sys.exit(0) when all pages have been queued;
    this function is a worker entry point, not a library call.
    """
    summarys, self.MaxPage = self.GetMaxPage()  # total page count
    # Resume point recorded by a previous run.
    self.StartPage = Read_buff(file_buff=self.SettingPath, settion=SearchDBName, info='startpage')
    started_at = time.time()
    Write_buff(file_buff="Config.ini", settion=SearchDBName, info="flag_get_all_url", state=0)
    for page in range(int(self.StartPage), self.MaxPage):
        print("Cnki:共有%s页,当前为%s页,获得文献链接的进度完成%.2f" % (self.MaxPage, page, (int(page) / int(self.MaxPage)) * 100))
        # Checkpoint BEFORE fetching, so a crash mid-page skips it on resume.
        Write_buff(file_buff="Config.ini", settion=SearchDBName, info="startpage", state=page + 1)
        # CNKI paginates with an offset of 15 records per page.
        page_url = 'http://search.cnki.com.cn/Search.aspx?q=%s&p=%s' % (
            quote(self.BaseKeyword), (page - 1) * 15)
        threading.Thread(target=self.WriteUrlIntoDB, args=(page_url, page)).start()
        time.sleep(1)  # throttle requests
    Write_buff(file_buff="Config.ini", settion=SearchDBName, info="flag_get_all_url", state=1)
    print(time.time() - started_at)
    sys.exit(0)
def WriteAllUrlIntoDBMain(self):
    """Queue a writer thread for every cqvip search-result page.

    Progress ('startpage') is checkpointed to Config.ini after each page so
    an interrupted run can resume where it left off.
    """
    summarys, self.MaxPage = self.GetMaxPage()  # total page count
    # Resume point recorded by a previous run.
    self.StartPage = Read_buff(file_buff=self.SettingPath, settion=self.SearchName, info='startpage')
    started_at = time.time()
    Write_buff(file_buff="Config.ini", settion=SearchDBName, info="flag_get_all_url", state=0)
    for page in range(int(self.StartPage), self.MaxPage):
        print("%s采集器,共有%s页,当前为%s页,获得文献链接的进度完成%.2f" % (SearchDBName, self.MaxPage, page, (int(page) / int(self.MaxPage)) * 100))
        # Checkpoint BEFORE fetching, so a crash mid-page skips it on resume.
        Write_buff(file_buff="Config.ini", settion=SearchDBName, info="startpage", state=page + 1)
        page_url = "http://www.cqvip.com/data/main/search.aspx?action=so&curpage=%s&perpage=20&%s" % (
            str(page), self.BaseKeyword)
        threading.Thread(target=self.WriteUrlIntoDB, args=(page_url, page)).start()
        time.sleep(0.5)  # throttle requests
    Write_buff(file_buff="Config.ini", settion=SearchDBName, info="flag_get_all_url", state=1)
    print(time.time() - started_at)
def WriteInto(InputDic, SearchDBName):
    """Persist each search setting from InputDic into Config.ini.

    Raises KeyError if any expected field is missing from InputDic,
    exactly as the original one-call-per-field version did.
    """
    # Field order preserved from the original explicit call sequence.
    for field in ("restart", "title", "authors", "keywords",
                  "unit", "endtime", "starttime", "ex_dbname"):
        Write_buff(file_buff="Config.ini", settion=SearchDBName,
                   info=field, state=InputDic[field])
def GetMaxPage(self):
    """Fetch page 1 of the cqvip search and derive the total page count.

    Returns (record_count, max_page) and persists max_page to Config.ini.
    """
    index_url = "http://www.cqvip.com/data/main/search.aspx?action=so&curpage=1&perpage=%s&%s" % (
        str(self._Perpage), self.BaseKeyword)
    soup = GetSoup(url=index_url)
    first_paragraph = soup.select('p')[0].text
    # The record count is embedded in a JSON-ish blob on the second line:
    # ... "recordcount":N, ...
    summarys = int(first_paragraph.split('\r\n')[1]
                   .split('"recordcount":')[1]
                   .split(',')[0]
                   .strip())
    print("查询到共%s相关文献" % summarys)
    # Ceiling division: partially-filled last page still counts.
    self.MaxPage = Up_division_int(summarys, int(self._Perpage))
    Write_buff(file_buff="Config.ini", settion=SearchDBName, info="maxpage", state=self.MaxPage)
    return summarys, self.MaxPage
def GetMaxPage(self):
    """Query CNKI for the total hit count and derive the page count.

    Returns (summarys, self.MaxPage).  On any scraping failure the error
    is reported and (0, last-known-or-0 MaxPage) is returned instead of
    crashing.

    Fixes vs. original:
      * bare ``except:`` narrowed to ``except Exception`` (no longer
        swallows KeyboardInterrupt/SystemExit);
      * ``summarys`` is initialised up front — previously the error path
        hit ``UnboundLocalError`` on the return statement;
      * ``self.MaxPage`` is guaranteed to exist before returning.
    """
    summarys = 0  # safe default for the error path
    # quote() URL-encodes the (typically CJK) keyword.
    index_url = 'http://search.cnki.com.cn/Search.aspx?q=%s' % quote(self.BaseKeyword)
    try:
        print("GetMaxPage", index_url)
        soup = GetSoup(url=index_url)
        pagesum_text = soup.find('span', class_='page-sum').get_text()
        # pagesum_text embeds the count after a 7-char prefix, before a
        # trailing char — presumably "找到约 N 条结果" style; TODO confirm.
        summarys = math.ceil(int(pagesum_text[7:-1]))
        self.MaxPage = Up_division_int(summarys, int(15))  # 15 results per page
        Write_buff(file_buff="Config.ini", settion=SearchDBName, info="maxpage", state=self.MaxPage)
    except Exception:
        if not hasattr(self, 'MaxPage'):
            self.MaxPage = 0  # keep the return statement valid
        print(index_url, "获得最大出错")
    return summarys, self.MaxPage
def ShowStatePro(db, SearchDBName, DbDatabuff, Dbresult):
    """Report crawl progress for one source; exit once everything is done.

    Counts total / finished / invalid-link / out-of-year rows for this
    source in the buffer table and prints a progress line.  When the URL
    collection phase has finished (flag_get_all_url == 1) and every row is
    accounted for, sets 'stopflag' and terminates the process.
    Dbresult is accepted for signature compatibility but not used here.
    """
    def _count(sql):
        # Run a COUNT(*) query and coerce the single cell to int.
        return int(db.do_sql_one(sql)[0])

    num_all = _count("select count(*) from `%s` where `Source`='%s'" % (DbDatabuff, SearchDBName))
    num_done = _count("select count(*) from `%s` where `State`=20 and `Source`='%s'" % (DbDatabuff, SearchDBName))
    num_error = max(_count("select count(*) from `%s` where `State`=-15 and `Source`='%s'" % (DbDatabuff, SearchDBName)), 0)
    num_done_not_in_year = max(_count("select count(*) from `%s` where `State`=-5 and `Source`='%s'" % (DbDatabuff, SearchDBName)), 0)
    # Out-of-year and invalid-link rows both count as "processed".
    num_done = num_done + num_done_not_in_year + num_error
    if num_all > 0:
        print(
            "%s采集器:#############################################目前有%s条数据,其中已处理的有%s,其中年份不符合的有%s,无效链接%s,处理完成度为%.2f,##############################" % (
                SearchDBName, num_all, num_done, num_done_not_in_year, num_error, (int(num_done) / int(num_all)) * 100))
    if '1' in str(Read_buff(file_buff="Config.ini", settion=SearchDBName, info='flag_get_all_url')) and num_all == num_done:
        # URL collection finished and every buffered row processed: stop.
        Write_buff(file_buff="Config.ini", settion=SearchDBName, info="stopflag", state=1)
        time.sleep(5)
        print("%s:爬取结束" % SearchDBName)
        sys.exit(0)
def GetMaxPage(self):
    """Scrape the Wanfang result header for the hit count and page total.

    Returns (total_record_num, self.MaxPage, index_url).  MaxPage is
    capped at 100 because the site only exposes 5000 records (100 pages
    of 50).
    """
    total_record_num = 0
    index_url = self.GetBaseUrl()
    response = self.VisitHtml(url=index_url)
    if self.running:
        document = BeautifulSoup(response.text, "html.parser")  # parse result HTML
        sidebar = document.find('div', class_='left_sidebar_border')
        for node in sidebar:
            if '条结果' not in node:
                continue
            # First number in e.g. "找到 N 条结果" is the record count.
            matches = re.findall(r"\d+\.?\d*", node)
            if str.isdigit(matches[0]):
                total_record_num = int(matches[0])
            # NOTE(review): collapsed source is ambiguous on whether this
            # break was inside the isdigit() branch — reconstructed as
            # stopping at the first '条结果' node; confirm against VCS.
            break
        print("查询到共%s相关文献" % total_record_num)
        page_count = int(math.ceil(total_record_num / self._Perpage))  # total pages
        # Site caps results at 5000 records == 100 pages of 50.
        self.MaxPage = min(page_count, 100)
        Write_buff(file_buff="Config.ini", settion="Wanfang", info="maxpage", state=self.MaxPage)
        return total_record_num, self.MaxPage, index_url