예제 #1
0
def init_main():
    """Initialise crawler state from Config.ini.

    If the per-source 'restart' flag reads '1', recreate the result and
    URL-buffer tables and reset all bookkeeping flags. The flag is then
    re-read: once reset to 0 (or if it was '0' to begin with), rows left
    mid-crawl (State=10) are requeued to State=0.
    """
    if '1' in str(
            Read_buff(file_buff="Config.ini",
                      settion=SearchDBName,
                      info='restart')):
        CreatResultDBTable(db, Dbresult)
        CreatUrlBuffTable(db, DbDatabuff)
        time.sleep(0.02)
        # Reset every progress flag; a fresh crawl starts from page 1.
        for info, state in (("restart", 0),
                            ("startpage", 1),
                            ("stopflag", 0),
                            ("flag_get_all_url", 0)):
            Write_buff(file_buff="Config.ini",
                       settion=SearchDBName,
                       info=info,
                       state=state)
    # Deliberately re-read: after a restart the flag is now 0, so the
    # requeue below also runs (harmless on freshly created tables).
    if '0' in str(
            Read_buff(file_buff="Config.ini",
                      settion=SearchDBName,
                      info='restart')):
        # Requeue URLs that were claimed (State=10) but never finished.
        db.upda_sql("Update `%s` set `State`=0 where `State`=10" % DbDatabuff)
    time.sleep(1)
예제 #2
0
 def GetAllUrl(self):
     """Walk every Wanfang result page, pushing each page's links to the DB.

     Resumes from the persisted 'startpage' value and advances it after
     each page so an interrupted run can pick up where it stopped.
     """
     total_record_num, self.MaxPage, index_url = self.GetMaxPage()  # total pages
     # Page to resume from (persisted between runs).
     self.StartPage = Read_buff(file_buff=self.SettingPath,
                                settion=self.SearchName,
                                info='startpage')
     started = time.time()
     # Mark URL harvesting as in progress.
     Write_buff(file_buff="Config.ini",
                settion="Wanfang",
                info="flag_get_all_url",
                state=0)
     for page in range(int(self.StartPage), self.MaxPage + 1):
         progress = (int(page) / int(self.MaxPage)) * 100
         print("共有%s页,当前为%s页,获得文献链接的进度完成%.2f" %
               (self.MaxPage, page, progress))
         # Persist the next page first so a crash does not repeat this one.
         Write_buff(file_buff="Config.ini",
                    settion="Wanfang",
                    info="startpage",
                    state=page + 1)
         links = self.GetFurtherUrl(page, index_url)
         threading.Thread(target=self.WriteUrlIntoDB,
                          args=(links, )).start()
         time.sleep(0.5)
     # Mark URL harvesting as complete.
     Write_buff(file_buff="Config.ini",
                settion="Wanfang",
                info="flag_get_all_url",
                state=1)
     print(time.time() - started)
예제 #3
0
 def WriteAllUrlIntoDBMain(self):
     """Iterate CNKI result pages, handing each page URL to a writer thread.

     Resumes from the persisted 'startpage', updates it per page, and
     exits the process when all pages have been dispatched.
     """
     summarys, self.MaxPage = self.GetMaxPage()  # total page count
     # Resume position persisted in the settings file.
     self.StartPage = Read_buff(file_buff=self.SettingPath,
                                settion=SearchDBName,
                                info='startpage')
     began = time.time()
     # Flag the harvest as running.
     Write_buff(file_buff="Config.ini",
                settion=SearchDBName,
                info="flag_get_all_url",
                state=0)
     for page_no in range(int(self.StartPage), self.MaxPage):
         pct = (int(page_no) / int(self.MaxPage)) * 100
         print("Cnki:共有%s页,当前为%s页,获得文献链接的进度完成%.2f" %
               (self.MaxPage, page_no, pct))
         # Record the next page before working on this one.
         Write_buff(file_buff="Config.ini",
                    settion=SearchDBName,
                    info="startpage",
                    state=page_no + 1)
         # CNKI paginates via a record offset (&p=), 15 records per page.
         page_url = 'http://search.cnki.com.cn/Search.aspx?q=%s&p=%s' % (
             quote(self.BaseKeyword), (page_no - 1) * 15)
         threading.Thread(target=self.WriteUrlIntoDB,
                          args=(page_url, page_no)).start()
         time.sleep(1)
     # Flag the harvest as finished.
     Write_buff(file_buff="Config.ini",
                settion=SearchDBName,
                info="flag_get_all_url",
                state=1)
     print(time.time() - began)
     sys.exit(0)
예제 #4
0
 def WriteAllUrlIntoDBMain(self):
     """Walk the cqvip result pages, spawning a DB-writer thread per page."""
     summarys, self.MaxPage = self.GetMaxPage()  # total page count
     # Resume point persisted between runs.
     self.StartPage = Read_buff(file_buff=self.SettingPath,
                                settion=self.SearchName,
                                info='startpage')
     started = time.time()
     # Harvest in progress.
     Write_buff(file_buff="Config.ini",
                settion=SearchDBName,
                info="flag_get_all_url",
                state=0)
     for cur in range(int(self.StartPage), self.MaxPage):
         done_pct = (int(cur) / int(self.MaxPage)) * 100
         print("%s采集器,共有%s页,当前为%s页,获得文献链接的进度完成%.2f" % (SearchDBName,self.MaxPage, cur, done_pct))
         # Advance the resume point before dispatching this page.
         Write_buff(file_buff="Config.ini",
                    settion=SearchDBName,
                    info="startpage",
                    state=cur + 1)
         page_url = "http://www.cqvip.com/data/main/search.aspx?action=so&curpage=%s&perpage=20&%s" % (
             str(cur), self.BaseKeyword)
         threading.Thread(target=self.WriteUrlIntoDB,
                          args=(page_url, cur)).start()
         time.sleep(0.5)
     # Harvest finished.
     Write_buff(file_buff="Config.ini",
                settion=SearchDBName,
                info="flag_get_all_url",
                state=1)
     print(time.time() - started)
예제 #5
0
def WriteInto(InputDic, SearchDBName):
    """Persist user search settings from *InputDic* into Config.ini.

    Each recognised key is written verbatim to the *SearchDBName* section.
    A missing key raises KeyError, matching the original one-call-per-key
    behaviour.

    Args:
        InputDic: mapping holding at least the keys listed below.
        SearchDBName: section name in Config.ini to write into.
    """
    # Keys are written in the original fixed order.
    for info in ("restart", "title", "authors", "keywords",
                 "unit", "endtime", "starttime", "ex_dbname"):
        Write_buff(file_buff="Config.ini",
                   settion=SearchDBName,
                   info=info,
                   state=InputDic[info])
예제 #6
0
 def GetMaxPage(self):
     """Fetch the first cqvip result page and compute the total page count.

     Returns (record_count, self.MaxPage); also persists the page count
     to Config.ini for the resume logic.
     """
     index_url = "http://www.cqvip.com/data/main/search.aspx?action=so&curpage=1&perpage=%s&%s" % (
         str(self._Perpage), self.BaseKeyword)
     page_soup = GetSoup(url=index_url)
     intro = page_soup.select('p')[0].text
     # The record count is embedded as '"recordcount":N,' inside the first <p>.
     record_field = intro.split('\r\n')[1].split('"recordcount":')[1]
     record_count = int(record_field.split(',')[0].strip())
     print("查询到共%s相关文献" % record_count)
     # Ceiling division of records by page size.
     self.MaxPage = Up_division_int(record_count, int(self._Perpage))
     Write_buff(file_buff="Config.ini",
                settion=SearchDBName,
                info="maxpage",
                state=self.MaxPage)
     return record_count, self.MaxPage
예제 #7
0
    def GetMaxPage(self):
        """Query CNKI for the total number of matching records.

        Fetches the first search-result page, parses the record count out
        of the 'page-sum' span, derives the page count (15 per page) and
        persists it to Config.ini.

        Returns:
            (summarys, self.MaxPage): record count and page count;
            (0, self.MaxPage) if parsing fails. NOTE(review): if
            self.MaxPage was never set and parsing fails, the return
            still raises AttributeError, as in the original.
        """
        # Default so the return is well-defined on the failure path
        # (original raised NameError for an unbound `summarys`).
        summarys = 0
        index_url = 'http://search.cnki.com.cn/Search.aspx?q=%s' % quote(
            self.BaseKeyword)  # quote() percent-encodes the CJK keyword
        try:
            print("GetMaxPage", index_url)
            soup = GetSoup(url=index_url)
            pagesum_text = soup.find('span', class_='page-sum').get_text()
            # Slice [7:-1] strips a fixed prefix/suffix around the digits
            # (assumes the span text layout is stable — TODO confirm).
            summarys = math.ceil(int(pagesum_text[7:-1]))
            self.MaxPage = Up_division_int(summarys, int(15))
            Write_buff(file_buff="Config.ini",
                       settion=SearchDBName,
                       info="maxpage",
                       state=self.MaxPage)
        except Exception:
            # Was a bare `except:`; narrowed so Ctrl-C / SystemExit propagate.
            print(index_url, "获得最大出错")
        return summarys, self.MaxPage
def ShowStatePro(db,SearchDBName,DbDatabuff,Dbresult):
    """Print crawl progress for one source and exit once it is fully done."""

    def _count(state_clause):
        # COUNT(*) over the buffer table with an optional `State` filter
        # prepended to the `Source` filter.
        query = "select count(*) from `%s` where %s`Source`='%s'" % (
            DbDatabuff, state_clause, SearchDBName)
        return int(db.do_sql_one(query)[0])

    num_all = _count("")
    num_done = _count("`State`=20 and ")
    num_error = max(_count("`State`=-15 and "), 0)
    num_done_not_in_year = max(_count("`State`=-5 and "), 0)
    # "Done" counts processed rows plus out-of-year and dead-link rows.
    num_done = num_done + num_done_not_in_year + num_error
    if num_all > 0:
        print(
            "%s采集器:#############################################目前有%s条数据,其中已处理的有%s,其中年份不符合的有%s,无效链接%s,处理完成度为%.2f,##############################" % (
                SearchDBName,num_all, num_done, num_done_not_in_year,num_error, (int(num_done) / int(num_all)) * 100))
    harvest_done = '1' in str(Read_buff(file_buff="Config.ini", settion=SearchDBName, info='flag_get_all_url'))
    if harvest_done and num_all == num_done:
        # Every URL harvested and every buffered row processed: signal stop.
        Write_buff(file_buff="Config.ini", settion=SearchDBName, info="stopflag", state=1)
        time.sleep(5)
        print("%s:爬取结束"%SearchDBName)
        sys.exit(0)
예제 #9
0
 def GetMaxPage(self):
     """Fetch the Wanfang search index page and derive the page count.

     Parses the total record count out of the left sidebar, converts it
     to a page count (capped at 100 — the site exposes at most 5000
     records at 50 per page), persists it to Config.ini, and returns
     (total_record_num, self.MaxPage, index_url).

     NOTE(review): returns None implicitly when self.running is False —
     callers that unpack the 3-tuple would raise; confirm intended.
     """
     total_record_num = 0
     index_url = self.GetBaseUrl()
     response = self.VisitHtml(url=index_url)
     if self.running:
         html = BeautifulSoup(response.text, "html.parser")  # parse the fetched HTML
         # Sidebar block whose text contains the "...条结果" record count.
         # NOTE(review): find() returns None if the div is missing, and the
         # loop below would then raise TypeError — no guard here.
         total_record_text = html.find('div', class_='left_sidebar_border')
         for item in total_record_text:
             if '条结果' in item:
                 # First numeric token in the matching child is the count.
                 total_record_num = re.findall(r"\d+\.?\d*", item)
                 if str.isdigit(total_record_num[0]):
                     total_record_num = int(total_record_num[0])
                     break
         print("查询到共%s相关文献" % total_record_num)
         page_count = int(math.ceil(total_record_num /
                                    self._Perpage))  # total number of pages
         self.MaxPage = page_count
         if self.MaxPage > 100:  # site caps results at 5000 records: 100 pages of 50
             self.MaxPage = 100
         Write_buff(file_buff="Config.ini",
                    settion="Wanfang",
                    info="maxpage",
                    state=self.MaxPage)
         return total_record_num, self.MaxPage, index_url