Beispiel #1
0
def init_main():
    """Prepare Config.ini and the database before a crawl starts.

    If the ``restart`` entry of this crawler's section contains ``1``,
    ``CreatResultDBTable`` and ``CreatUrlBuffTable`` are called and the
    run-control entries (``restart``, ``startpage``, ``stopflag``,
    ``flag_get_all_url``) are rewritten.  The ``restart`` entry is then read a
    second time; if it contains ``0``, every row of the URL-buffer table whose
    ``State`` is 10 is set back to 0.  The function ends with a one-second
    pause.
    """
    config_file = "Config.ini"

    restart_flag = str(
        Read_buff(file_buff=config_file, settion=SearchDBName, info='restart'))
    if '1' in restart_flag:
        CreatResultDBTable(db, Dbresult)
        CreatUrlBuffTable(db, DbDatabuff)
        time.sleep(0.02)
        # Rewrite the run-control entries one after another.
        for entry, value in (("restart", 0),
                             ("startpage", 1),
                             ("stopflag", 0),
                             ("flag_get_all_url", 0)):
            Write_buff(file_buff=config_file,
                       settion=SearchDBName,
                       info=entry,
                       state=value)

    # Read the entry again: the branch above may just have rewritten it.
    restart_flag = str(
        Read_buff(file_buff=config_file, settion=SearchDBName, info='restart'))
    if '0' in restart_flag:
        db.upda_sql("Update `%s` set `State`=0 where `State`=10" % DbDatabuff)
    time.sleep(1)
Beispiel #2
0
    def run(self):
        """Take ``(url, data)`` pairs off the data queue and pass them to ``parse``.

        The loop keeps running while ``self.is_parse`` is true or the
        ``stopflag`` entry in Config.ini contains ``0``.  ``self.is_parse`` is
        cleared once no collector thread is alive and the data queue is empty.
        """
        print('Cnki:启动%d号解析线程' % self.number)
        while True:
            # With no collector alive and nothing queued there is no more
            # work coming, so parsing is switched off.
            collectors_alive = any(t.is_alive() for t in self.req_thread)
            if not collectors_alive and self.data_list.qsize() == 0:
                self.is_parse = False

            # The stop flag is only consulted when parsing has been switched off.
            keep_going = self.is_parse or '0' in (Read_buff(
                file_buff="Config.ini",
                settion=SearchDBName,
                info='stopflag'))
            if not keep_going:
                break  # leave the endless loop

            try:
                url, data = self.data_list.get(timeout=3)
            except Exception:
                # Typically the 3-second timeout expired; go round again.
                continue
            if data is not None:
                parse(url, data)

        print('Cnki:退出%d号解析线程' % self.number)
Beispiel #3
0
    def run(self):
        """Take ``(url, data)`` pairs off the data queue and process them.

        Each pair is handed to ``Wanfang.GetFurtherPaper`` as long as
        ``Wanfang.running`` is true.  The loop keeps running while
        ``self.is_parse`` is true or the ``stopflag`` entry of the ``Wanfang``
        section in Config.ini equals 0.  ``self.is_parse`` is cleared once no
        collector thread is alive and the data queue is empty.
        """
        print('启动%d号解析线程' % self.number)
        while True:
            # No live collector and an empty queue means no more work is coming.
            if not any(worker.is_alive() for worker in self.req_thread):
                if self.data_list.qsize() == 0:
                    self.is_parse = False

            # The stop flag is only read when parsing has been switched off.
            if not (self.is_parse or int(
                    Read_buff(file_buff="Config.ini",
                              settion="Wanfang",
                              info='stopflag')) == 0):
                break  # leave the endless loop

            try:
                url, data = self.data_list.get(timeout=3)
            except Exception:
                # Typically the 3-second timeout expired; go round again.
                continue

            if data is not None and Wanfang.running:
                Wanfang.GetFurtherPaper(url, data)

        print('退出%d号解析线程' % self.number)
Beispiel #4
0
 def WriteAllUrlIntoDBMain(self):
     """Iterate over the search result pages and start one writer thread per page.

     The page range starts at the ``startpage`` entry read from the settings
     file and stops before ``self.MaxPage``.  For every page the next page
     number is written to Config.ini before a thread running
     ``self.WriteUrlIntoDB`` is started.  ``flag_get_all_url`` is written as 0
     before the loop and as 1 after it.  The elapsed time is printed and the
     method ends by calling ``sys.exit(0)``.
     """
     _summaries, self.MaxPage = self.GetMaxPage()  # second value: page count
     self.StartPage = Read_buff(file_buff=self.SettingPath,
                                settion=SearchDBName,
                                info='startpage')  # first page to fetch
     started_at = time.time()
     Write_buff(file_buff="Config.ini",
                settion=SearchDBName,
                info="flag_get_all_url",
                state=0)
     # NOTE(review): the range stops at MaxPage - 1; confirm whether page
     # MaxPage itself should also be fetched.
     for page in range(int(self.StartPage), self.MaxPage):
         progress = (int(page) / int(self.MaxPage)) * 100
         print("Cnki:共有%s页,当前为%s页,获得文献链接的进度完成%.2f" %
               (self.MaxPage, page, progress))
         # Record the next page number before this page is handed to a thread.
         Write_buff(file_buff="Config.ini",
                    settion=SearchDBName,
                    info="startpage",
                    state=page + 1)
         offset = (page - 1) * 15
         page_url = 'http://search.cnki.com.cn/Search.aspx?q=%s&p=%s' % (
             quote(self.BaseKeyword), offset)
         writer = threading.Thread(target=self.WriteUrlIntoDB,
                                   args=(page_url, page))
         writer.start()
         time.sleep(1)
     Write_buff(file_buff="Config.ini",
                settion=SearchDBName,
                info="flag_get_all_url",
                state=1)
     print(time.time() - started_at)
     sys.exit(0)
Beispiel #5
0
 def GetAllUrl(self):
     """Iterate over the result pages and pass each page's URL list to a writer thread.

     Pages from the ``startpage`` entry up to and including ``self.MaxPage``
     are visited.  For every page the next page number is written to
     Config.ini, ``self.GetFurtherUrl`` is called, and its result is handed to
     a new thread running ``self.WriteUrlIntoDB``.  ``flag_get_all_url`` is
     written as 0 before the loop and as 1 after it; the elapsed time is
     printed at the end.
     """
     _total_records, self.MaxPage, index_url = self.GetMaxPage()
     self.StartPage = Read_buff(file_buff=self.SettingPath,
                                settion=self.SearchName,
                                info='startpage')  # first page to fetch
     started_at = time.time()
     Write_buff(file_buff="Config.ini",
                settion="Wanfang",
                info="flag_get_all_url",
                state=0)
     first_page = int(self.StartPage)
     for page in range(first_page, self.MaxPage + 1):
         progress = (int(page) / int(self.MaxPage)) * 100
         print("共有%s页,当前为%s页,获得文献链接的进度完成%.2f" %
               (self.MaxPage, page, progress))
         # Record the next page number before this page is fetched.
         Write_buff(file_buff="Config.ini",
                    settion="Wanfang",
                    info="startpage",
                    state=page + 1)
         url_list = self.GetFurtherUrl(page, index_url)
         writer = threading.Thread(target=self.WriteUrlIntoDB,
                                   args=(url_list, ))
         writer.start()
         time.sleep(0.5)
     Write_buff(file_buff="Config.ini",
                settion="Wanfang",
                info="flag_get_all_url",
                state=1)
     print(time.time() - started_at)
def ProcessMain():
    """Entry point of the Cqvip crawler process.

    Creates the module-level ``db`` and ``Cqvip`` objects, runs
    ``init_main()`` and then calls ``main()`` when the ``stopflag`` entry of
    this crawler's section in Config.ini contains ``0``.
    """
    global db, Cqvip
    # freeze_support() has to run before any other work: in a frozen
    # (packaged) Windows executable it is the point where a child process is
    # dispatched, so nothing expensive should be set up ahead of it.
    multiprocessing.freeze_support()
    db = HCJ_MySQL()
    Cqvip = Cqvip_Crawler(db=db)
    init_main()
    # str() keeps the membership test valid whatever type the flag comes back as.
    if '0' in str(Read_buff(file_buff="Config.ini", settion=SearchDBName, info='stopflag')):
        main()
    def __init__(self, db, Input=None, SearchMode=None, StartTime=None, EndTime=None, StartPage=None,
                 SettingPath='./Config.ini'):
        """Store the DB helper and load the search settings from the settings file.

        When both ``Input`` and ``SearchMode`` are ``None`` the year range,
        start page and search fields are read from ``SettingPath`` and
        ``self.BaseKeyword`` is assembled as a URL query fragment from the
        non-empty fields.  Any other combination is not implemented yet and
        leaves those attributes unset.  ``StartTime``, ``EndTime`` and
        ``StartPage`` are accepted but not used here.
        """
        self.db = db
        self.SearchName = SearchDBName  # config section used by this crawler
        self.SettingPath = SettingPath  # path of the settings file
        self._Perpage = 10  # per-page count
        self._ResultDbTable = 'CqvipResult'
        if Input is not None or SearchMode is not None:
            # Todo: explicit search arguments are not handled yet.
            return

        def setting(section, key):
            # One lookup in the settings file.
            return Read_buff(file_buff=self.SettingPath, settion=section, info=key)

        self.StartTime = setting(self.SearchName, 'starttime')  # first year
        self.EndTime = setting(self.SearchName, 'endtime')  # last year
        self.StartPage = setting(self.SearchName, 'startpage')  # first page
        self.title = setting(SearchDBName, 'title')
        self.authors = setting(SearchDBName, 'authors')
        self.keywords = setting(SearchDBName, 'keywords')
        self.unit = setting(SearchDBName, 'unit')

        # Append one "&<param>=<quoted value>" part per non-empty field.
        self.BaseKeyword = ""
        query_parts = (
            ("&k=", self.title),
            ("&w=", self.authors),
            ("&k=", self.keywords),
            ("&o=", self.unit),
        )
        for prefix, text in query_parts:
            if RemoveSpecialCharacter(text) != "":
                self.BaseKeyword = self.BaseKeyword + prefix + quote(text)
Beispiel #8
0
    def __init__(self,
                 db,
                 Input=None,
                 SearchMode=None,
                 StartTime=None,
                 EndTime=None,
                 StartPage=None,
                 SettingPath='./Config.ini'):
        """Store the DB helper and load the Wanfang search settings.

        When both ``Input`` and ``SearchMode`` are ``None`` the year range,
        start page, maximum page and search fields are read from
        ``SettingPath`` and ``self.BaseKeyword`` is assembled from the
        non-empty fields.  Any other combination is not implemented yet and
        leaves those attributes unset.  ``StartTime``, ``EndTime`` and
        ``StartPage`` are accepted but not used here.
        """
        self.db = db
        self.SearchName = 'Wanfang'  # config section for Wanfang
        self.SettingPath = SettingPath  # path of the settings file
        self._Perpage = 50  # 50 entries per page
        self._ResultDbTable = 'WanFangResult'
        self.running = False  # marks whether the program is running normally
        self.further_url = list()
        if Input is not None or SearchMode is not None:
            # Todo: explicit search arguments are not handled yet.
            return

        def setting(section, key):
            # One lookup in the settings file.
            return Read_buff(file_buff=self.SettingPath,
                             settion=section,
                             info=key)

        self.StartTime = setting(self.SearchName, 'starttime')  # first year
        self.EndTime = setting(self.SearchName, 'endtime')  # last year
        self.StartPage = setting(self.SearchName, 'startpage')  # first page
        self.MaxPage = setting(self.SearchName, 'maxpage')  # maximum page
        self.title = setting(SearchDBName, 'title')
        self.authors = setting(SearchDBName, 'authors')
        self.keywords = setting(SearchDBName, 'keywords')
        self.publication = setting(SearchDBName, 'unit')

        # Append one " <label>:<value>" part per non-empty field.
        self.BaseKeyword = ""
        labelled_fields = (
            (" 标题:", self.title),
            (" 作者:", self.authors),
            (" 关键词:", self.keywords),
            (" 作者单位:", self.publication),
        )
        for label, text in labelled_fields:
            if RemoveSpecialCharacter(text) != "":
                self.BaseKeyword = self.BaseKeyword + label + text
Beispiel #9
0
def ProcessMain():
    """Entry point of the Wanfang crawler process.

    Creates the module-level ``db`` and ``Wanfang`` objects, runs
    ``init_main()`` and then calls ``main()`` when the ``stopflag`` entry of
    this crawler's section in Config.ini contains ``0``.
    """
    global db, Wanfang
    multiprocessing.freeze_support()
    db = HCJ_MySQL()
    Wanfang = WanFangCrawler(db=db)
    init_main()

    stop_flag = str(
        Read_buff(file_buff="Config.ini",
                  settion=SearchDBName,
                  info='stopflag'))
    if '0' not in stop_flag:
        return
    main()
 def run(self):
     """Fetch URLs from the request queue and push the responses to the data queue.

     The loop runs while the request queue is non-empty or the ``stopflag``
     entry in Config.ini contains ``0``.  Each URL is fetched with ``GetSoup``
     after a randomised pause derived from ``interval``, and ``[url, response]``
     is appended to the data queue.

     The queue is read with a timeout so that an empty queue cannot block the
     thread forever: after each timeout the loop condition, and with it the
     stop flag, is evaluated again.
     """
     from queue import Empty  # local import: only this method needs it

     # Announce the thread start.
     print('启动采集线程%d号' % self.number)
     while self.req_list.qsize() > 0 or '0' in str(
             Read_buff(file_buff="Config.ini", settion=SearchDBName, info='stopflag')):
         try:
             # Same 3-second timeout as the parse threads use on the data queue.
             url = self.req_list.get(timeout=3)
         except Empty:
             continue
         # Random pause so that requests are not sent too quickly.
         time.sleep(random.randint(interval*10, (interval+2)*10)/10)
         # Fetch the page and queue the response for parsing.
         response = GetSoup(url)
         self.data_list.put([url, response])
 def WriteAllUrlIntoDBMain(self):
     """Iterate over the search result pages and start one writer thread per page.

     The page range starts at the ``startpage`` entry read from the settings
     file and stops before ``self.MaxPage``.  For every page the next page
     number is written to Config.ini before a thread running
     ``self.WriteUrlIntoDB`` is started.  ``flag_get_all_url`` is written as 0
     before the loop and as 1 after it; the elapsed time is printed at the end.
     """
     _summaries, self.MaxPage = self.GetMaxPage()  # second value: page count
     self.StartPage = Read_buff(file_buff=self.SettingPath, settion=self.SearchName, info='startpage')
     started_at = time.time()
     Write_buff(file_buff="Config.ini", settion=SearchDBName, info="flag_get_all_url", state=0)
     # NOTE(review): the range stops at MaxPage - 1; confirm whether page
     # MaxPage itself should also be fetched.
     for page in range(int(self.StartPage), self.MaxPage):
         progress = (int(page) / int(self.MaxPage)) * 100
         print("%s采集器,共有%s页,当前为%s页,获得文献链接的进度完成%.2f" % (SearchDBName,self.MaxPage, page, progress))
         # Record the next page number before this page is handed to a thread.
         Write_buff(file_buff="Config.ini", settion=SearchDBName, info="startpage", state=page + 1)
         # NOTE(review): perpage is fixed to 20 in the URL - confirm against
         # the per-page setting used elsewhere in the crawler.
         page_url = "http://www.cqvip.com/data/main/search.aspx?action=so&curpage=%s&perpage=20&%s" % (
             str(page), self.BaseKeyword)
         writer = threading.Thread(target=self.WriteUrlIntoDB, args=(page_url, page))
         writer.start()
         time.sleep(0.5)
     Write_buff(file_buff="Config.ini", settion=SearchDBName, info="flag_get_all_url", state=1)
     print(time.time() - started_at)
Beispiel #12
0
 def __init__(self,
              db,
              Input=None,
              SearchMode=None,
              StartTime=None,
              EndTime=None,
              StartPage=None,
              SettingPath='./Config.ini'):
     """Store the DB helper and load the search settings from the settings file.

     When both ``Input`` and ``SearchMode`` are ``None`` the year range,
     start page, maximum page and search fields are read from ``SettingPath``
     and ``self.BaseKeyword`` is assembled from the non-empty fields.  Any
     other combination is not implemented yet and leaves those attributes
     unset.  ``StartTime``, ``EndTime`` and ``StartPage`` are accepted but not
     used here.
     """
     self.db = db
     self.SettingPath = SettingPath  # path of the settings file
     if Input is not None or SearchMode is not None:
         # Todo: explicit search arguments are not handled yet.
         return

     def setting(key):
         # One lookup in this crawler's section of the settings file.
         return Read_buff(file_buff=self.SettingPath,
                          settion=SearchDBName,
                          info=key)

     self.StartTime = setting('starttime')  # first year
     self.EndTime = setting('endtime')  # last year
     self.StartPage = setting('startpage')  # first page
     self.MaxPage = setting('maxpage')  # maximum page
     self.title = setting('title')
     self.authors = setting('authors')
     self.keywords = setting('keywords')
     self.unit = setting('unit')

     # Append one " <label>:<value>" part per non-empty field.
     self.BaseKeyword = ""
     labelled_fields = (
         (" title:", self.title),
         (" author:", self.authors),
         (" qw:", self.keywords),
         (" 作者单位:", self.unit),
     )
     for label, text in labelled_fields:
         if RemoveSpecialCharacter(text) != "":
             self.BaseKeyword = self.BaseKeyword + label + text
def ShowStatePro(db,SearchDBName,DbDatabuff,Dbresult):
    """Print crawl progress for one source and stop the process when it is complete.

    Row counts for ``SearchDBName`` are read from the ``DbDatabuff`` table:
    all rows, rows in state 20, rows in state -15 and rows in state -5.  The
    last three are added up as the number of handled rows.  When the
    ``flag_get_all_url`` entry in Config.ini contains ``1`` and every row has
    been handled, ``stopflag`` is written as 1 and, after a five-second pause,
    ``sys.exit(0)`` is called.

    ``Dbresult`` is accepted but not used in this function.
    """
    def count_rows(sql_template):
        # Fill in table and source name, run the query, return the count as int.
        return int(db.do_sql_one(sql_template % (DbDatabuff, SearchDBName))[0])

    num_all = count_rows("select count(*) from `%s` where `Source`='%s'")
    finished = count_rows("select count(*) from `%s` where `State`=20 and `Source`='%s'")
    num_error = max(count_rows("select count(*) from `%s` where `State`=-15 and `Source`='%s'"), 0)
    not_in_year = max(count_rows("select count(*) from `%s` where `State`=-5 and `Source`='%s'"), 0)
    num_done = finished + not_in_year + num_error

    if num_all > 0:
        percent_done = (int(num_done) / int(num_all)) * 100
        print(
            "%s采集器:#############################################目前有%s条数据,其中已处理的有%s,其中年份不符合的有%s,无效链接%s,处理完成度为%.2f,##############################" % (
                SearchDBName,num_all, num_done, not_in_year,num_error, percent_done))

    all_urls_collected = '1' in str(
        Read_buff(file_buff="Config.ini", settion=SearchDBName, info='flag_get_all_url'))
    if all_urls_collected and num_all == num_done:
        # Everything is done: raise the stop flag and leave.
        Write_buff(file_buff="Config.ini", settion=SearchDBName, info="stopflag", state=1)
        time.sleep(5)
        print("%s:爬取结束"%SearchDBName)
        sys.exit(0)
Beispiel #14
0
# Module-level configuration for the Cnki crawler.
from HCJ_Buff_Control import Read_buff, Write_buff
# Build keyword searches for different search conditions
from HCJ_DB_Helper import HCJ_MySQL

# Name of this crawler's section in Config.ini.
SearchDBName = "Cnki"

from PublicDef import *

# HTTP request headers (a fixed desktop Firefox User-Agent).
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
}

# Integer settings from the [Setting] section; spaces are stripped before
# conversion.  Presumably the number of collector threads, the number of
# parse threads and the request interval - confirm against the callers.
concurrent = int(
    Read_buff(file_buff="./Config.ini",
              settion='Setting',
              info='Cnki_CollectNum').replace(' ', ''))
conparse = int(
    Read_buff(file_buff="./Config.ini",
              settion='Setting',
              info='Cnki_parsenum').replace(' ', ''))
interval = int(
    Read_buff(file_buff="./Config.ini",
              settion='Setting',
              info='Cnki_interval').replace(' ', ''))
# Create the request queue
req_list = queue.Queue()
# Create the data queue; response content is put here after each request
data_list = queue.Queue()

# Module-level configuration for the Cqvip crawler.
import re
from HCJ_Buff_Control import Read_buff, Write_buff
# Build keyword searches for different search conditions
from HCJ_DB_Helper import HCJ_MySQL
from PublicDef import *
# Name of this crawler's section in Config.ini.
SearchDBName="Cqvip"
# Maps a search-field number to a short query parameter name.
values = {
    '1': 'k',  # title
    '2': 'w',  # author
    '3': 'k',  # keywords
    '4': 'o',  # unit (affiliation)
    '5': 'mn',  # journal name
}

# HTTP request headers (a fixed desktop Firefox User-Agent).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
# Integer settings from the [Setting] section; spaces are stripped before
# conversion.  Presumably the number of collector threads, the number of
# parse threads and the request interval - confirm against the callers.
concurrent=int(Read_buff(file_buff="./Config.ini",settion='Setting',info='Cqvip_CollectNum').replace(' ',''))
conparse=int(Read_buff(file_buff="./Config.ini",settion='Setting',info='Cqvip_parsenum').replace(' ',''))
interval=int(Read_buff(file_buff="./Config.ini",settion='Setting',info='Cqvip_interval').replace(' ',''))
# Create the request queue
req_list = queue.Queue()
# Create the data queue; response content is put here after each request
data_list = queue.Queue()


class Parse(threading.Thread):
    """Parser thread that works on items taken from a shared data queue."""

    def __init__(self, number, data_list, req_thread):
        """Store the thread number, the data queue and the collector threads.

        Args:
            number: Number of this thread, used in log output.
            data_list: Queue holding the fetched data waiting to be parsed.
            req_thread: Collector threads; their liveness is checked to decide
                when parsing can stop.
        """
        super(Parse, self).__init__()
        self.number = number  # thread number
        self.data_list = data_list  # data queue
        self.req_thread = req_thread  # collector threads, checked for liveness
        # Parsing is enabled from the start; the run loops read this flag
        # before anything else has had a chance to assign it.
        self.is_parse = True
# encoding:utf-8
# name:mod_db.py
'''
使用方法:1.在主程序中先实例化DB Mysql数据库操作类。
      2.使用方法:db=database()  db.fetch_all("sql")
'''
import time
import pymysql as MySQLdb


from DBUtils.PooledDB import PooledDB
from HCJ_Buff_Control import Read_buff


def _read_db_setting(key):
    """Return the entry ``key`` from the ``DB`` section of Config.ini."""
    return Read_buff(file_buff="Config.ini", settion="DB", info=key)


# Connection settings, read once when the module is imported.
DBNAME = _read_db_setting('DBNAME')
DBHOST = _read_db_setting('DBHOST')
DBUSER = _read_db_setting('DBUSER')
DBPWD = _read_db_setting('DBPWD')
DBCHARSET = _read_db_setting('DBCHARSET')
DBPORT = _read_db_setting('DBPORT')
# Raw value of the connection-count setting, as returned by Read_buff.
limit_count1 = _read_db_setting('limit_count')


class HCJ_MySQL:
    pool = None
    limit_count = int(limit_count1.replace(" ",""))  # 最低预启动数据库连接数量
    def __init__(self,log=None,dbname=None,dbhost=None):
        if dbname is None:
            self._dbname = DBNAME
        else:
            self._dbname = dbname