Esempio n. 1
0
def init():
    """Prepare the tables and the config flags before a crawl starts.

    When the ``restart`` option contains ``'1'``, the result table and the
    URL buffer table are created and four options are reset in the config
    file. The ``restart`` option is then read a second time; when it contains
    ``'0'``, every row of the buffer table whose ``State`` is 10 is set back
    to 0. Presumably a reset done in the first step is therefore followed by
    the second step as well (assuming ReadConfig returns the value that was
    just written).
    """

    def restart_flag():
        # Read from the config file on every call, because the reset branch
        # below rewrites this very option.
        return str(
            ReadConfig(file_name=ConfigName,
                       section=SearchDBName,
                       attr='restart'))

    if '1' in restart_flag():
        CreatResultDBTable(db, Dbresult)
        CreatUrlBuffTable(db, DbDatabuff)
        time.sleep(0.1)
        # The options are written in exactly this order.
        for option, initial in (('restart', 0), ('startpage', 1),
                                ('stopflag', 0), ('flag_get_all_url', 0)):
            WriteConfig(file_name=ConfigName,
                        section=SearchDBName,
                        attr=option,
                        value=initial)

    if '0' in restart_flag():
        db.upda_sql("Update `%s` set `State`=0 where `State`=10" % DbDatabuff)
    time.sleep(1)
Esempio n. 2
0
    def run(self):
        """Take fetched pages from the data queue and hand them to the parser.

        The loop ends once the parse flag has been cleared (no fetch thread
        alive and the data queue empty) and the ``stopflag`` option read from
        the config file is non-zero.
        """
        print('启动%d号解析线程' % self.number)
        while True:
            # With no fetch thread left alive and nothing queued, there is
            # nothing more to parse: clear the parse flag.
            if not any(worker.is_alive() for worker in self.req_thread):
                if self.data_list.qsize() == 0:
                    self.is_parse = False

            # Stop only when the parse flag is cleared AND ``stopflag`` is
            # non-zero. The option is not read while the flag is still set.
            if not (self.is_parse or int(
                    ReadConfig(file_name=ConfigName,
                               section=SearchDBName,
                               attr='stopflag')) == 0):
                break

            try:
                # Each queue item is unpacked as a (url, data) pair.
                url, data = self.data_list.get(timeout=3)
            except Exception:
                # Nothing usable arrived within the 3 second timeout.
                data = None

            # Parse only when data was obtained and the crawler is running.
            if data is not None and Wanfang.running:
                Wanfang.GetFurtherPaper(url, data)

        print('退出%d号解析线程' % self.number)
Esempio n. 3
0
 def GetAllUrl(self):
     """Walk the result pages and store the paper links found on each one.

     Pages from the ``startpage`` option up to the maximum page reported by
     ``GetMaxPage`` are visited. For every page the links returned by
     ``GetFurtherUrl`` are passed to ``WriteUrlIntoDB`` on a new thread.
     The ``flag_get_all_url`` option is set to 0 before the loop and to 1
     after it; the elapsed time in seconds is printed at the end.
     """

     def set_option(option, new_value):
         # Write one option of this crawler's section to the config file.
         WriteConfig(file_name=ConfigName,
                     section=SearchDBName,
                     attr=option,
                     value=new_value)

     _, self.MaxPage, index_url = self.GetMaxPage()  # maximum page number
     self.StartPage = ReadConfig(file_name=ConfigName,
                                 section=SearchDBName,
                                 attr='startpage')  # first page to visit
     started_at = time.time()
     set_option('flag_get_all_url', 0)

     for page in range(int(self.StartPage), self.MaxPage + 1):
         percent = (page / int(self.MaxPage)) * 100
         print("共有%s页,当前为%s页,获得文献链接的进度完成%.2f" %
               (self.MaxPage, page, percent))
         # The next page number is stored before this page is fetched.
         set_option('startpage', page + 1)
         page_links = self.GetFurtherUrl(page, index_url)
         writer = threading.Thread(target=self.WriteUrlIntoDB,
                                   args=(page_links, ))
         writer.start()
         time.sleep(0.5)

     set_option('flag_get_all_url', 1)
     print(time.time() - started_at)
Esempio n. 4
0
def WanfangProcess():
    """Set up the shared database pool and crawler, then start the crawl.

    Rebinds the module-level ``db`` and ``Wanfang`` names, runs ``init()``
    and calls ``main()`` only when the ``stopflag`` option contains ``'0'``.
    """
    global db, Wanfang
    # Required when building for Windows: Windows has no fork and creates
    # processes with spawn by default, whereas Linux defaults to fork.
    multiprocessing.freeze_support()
    db = MysqlPool()
    Wanfang = WanFangCrawler(db=db)
    init()

    stop_flag = str(
        ReadConfig(file_name=ConfigName,
                   section=SearchDBName,
                   attr='stopflag'))
    if '0' not in stop_flag:
        return
    main()
Esempio n. 5
0
 def __init__(self, db, StartTime=None, EndTime=None, StartPage=None):
     """
     Build a crawler whose search settings come from the config file.

     Explicit arguments take precedence; any of ``StartTime``, ``EndTime``
     and ``StartPage`` left as None is read from the config file instead.
     The remaining settings are always read from the config file, so the
     instance is fully initialised whichever arguments are supplied.

     :param db: database instance
     :param StartTime: start year (config option ``starttime`` when None)
     :param EndTime: end year (config option ``endtime`` when None)
     :param StartPage: first page (config option ``startpage`` when None)
     """

     def option(attr):
         # Read one option from this crawler's section of the config file.
         return ReadConfig(file_name=ConfigName,
                           section=self.SearchName,
                           attr=attr)

     self.db = db
     self.SearchName = SearchDBName  # Wanfang
     self.ConfigPath = ConfigName  # config file location
     self._Perpage = 50  # 50 results per page
     self.running = False  # marks whether the program is running normally
     self.further_url = list()

     # Config reads happen in the same order as before when no argument is
     # given: starttime, endtime, startpage, maxpage, then the search fields.
     self.StartTime = option('starttime') if StartTime is None else StartTime
     self.EndTime = option('endtime') if EndTime is None else EndTime
     self.StartPage = option('startpage') if StartPage is None else StartPage
     self.MaxPage = option('maxpage')  # maximum page number
     self.title = option('title')
     self.authors = option('authors')
     self.keywords = option('keywords')
     self.unit = option('unit')

     # Build the query string from every search field that is not empty once
     # special characters are removed.
     self.BaseKeyword = ""
     for label, text in ((" 标题:", self.title),
                         (" 作者:", self.authors),
                         (" 关键词:", self.keywords),
                         (" 作者单位:", self.unit)):
         if RemoveSpecialCharacter(text) != "":
             self.BaseKeyword = self.BaseKeyword + label + text
Esempio n. 6
0
 def run(self):
     """Fetch queued URLs and put ``[url, response]`` on the data queue.

     The loop keeps going while the request queue is non-empty or the
     ``stopflag`` option read from the config file is 0. Taking a URL uses a
     3 second timeout so that an empty queue (or a URL taken by another
     thread after the size check) cannot block this thread forever; on
     timeout the loop condition, and with it ``stopflag``, is checked again.
     """
     # Local import so this method does not rely on how the module imports
     # the queue module.
     from queue import Empty

     # Announce that the thread has started.
     print('启动采集线程%d号' % self.number)
     while self.req_list.qsize() > 0 or int(
             ReadConfig(file_name=ConfigName,
                        section=SearchDBName,
                        attr='stopflag')) == 0:
         try:
             # Removes and returns the item at the head of the queue.
             url = self.req_list.get(timeout=3)
         except Empty:
             # Nothing to fetch right now: go back and re-check stopflag.
             continue
         # Random pause of 3.0-5.0 seconds to avoid requesting too quickly.
         time.sleep(random.randint(30, 50) / 10)
         # Issue the HTTP request and queue the response for parsing.
         response = Wanfang.VisitHtml(url)
         self.data_list.put([url, response])
Esempio n. 7
0
"""

# Module metadata: listed class and function names, author and date string.
__info__ = dict(
    class1='Mysql',
    class2='MysqlPool',
    function1='CreateDB',
    function2='CreateTable',
    author='gs',
    time='20190716',
)

import pymysql as MySQLdb
from DBUtils.PooledDB import PooledDB
from ConfigHelper import ReadConfig

# Database settings, each read from the "DB" section of Config.ini in the
# order the names are listed.
DBNAME, DBHOST, DBUSER, DBPWD, DBCHARSET, DBPORT = (
    ReadConfig(file_name="Config.ini", section="DB", attr=_option)
    for _option in ("dbname", "dbhost", "dbuser", "dbpwd", "dbcharset",
                    "dbport")
)


class Mysql:
    """
    通过普通方式操作MySQL数据库的类
    """

    # 注,python的self等于其它语言的this
    def __init__(self,
                 log=None,
Esempio n. 8
0
import time
import random
import re
from ConfigHelper import WriteConfig, ReadConfig
from MysqlHelper import MysqlPool
from PublicMethod import RemoveSpecialCharacter, InitDict, CreatResultDBTable, CreatUrlBuffTable

SearchDBName = 'Wanfang'  # config section used for this crawler's options
ConfigName = 'Config.ini'  # config file name
concurrent = 3  # number of fetch threads
conparse = 5  # number of parse threads

# Request queue, plus the data queue that receives the response content once
# a request has been made.
req_queue = queue.Queue()
data_queue = queue.Queue()

# Table names are a fixed prefix followed by the configured suffix.
ex_dbname = ReadConfig(file_name=ConfigName,
                       section=SearchDBName,
                       attr='ex_dbname')
DbDatabuff = "databuff%s" % (ex_dbname, )
Dbresult = "result%s" % (ex_dbname, )


class Parse(threading.Thread):
    # 初始化属性
    def __init__(self, number, data_list, req_thread):
        super(Parse, self).__init__()
        self.number = number  # 线程编号
        self.data_list = data_list  # 数据队列
        self.req_thread = req_thread  # 请求队列,为了判断采集线程存活状态
        self.is_parse = True  # 判断是否从数据队列里提取数据

    def run(self):