    def __init__(self, conn, db, webpages_table, urls_table, log_table,
                 start_url, source):
        rs = ReadSetting()  # read the search terms entered by the user
        rs.readargs(conn, db, webpages_table)

        self.allstr = rs.allwords
        self.classdict = rs.classdict

        self.eventid = rs.eventid
        self.source = source

        self.exist_urls = rs.exist_urls

        self.conn = conn
        self.cur = conn.cursor()
        self.db = db
        self.webpages_table = webpages_table
        self.urls_table = urls_table
        self.log_table = log_table

        self.start_urls = [start_url]
        self.allowed_domains = (urlparse(start_url).hostname, )

        self.rules = [
            Rule(LinkExtractor(), follow=True, callback="parse_auto")
        ]
        # Crawl rule: follow every URL; Requests pointing outside the allowed domains
        # are filtered out by the spider middlewares; each resulting response is
        # handed to parse_auto.
        # Every Request passes through the spider middlewares.

        super(AutoSpider, self).__init__()
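
The Rule/LinkExtractor line above is the standard Scrapy CrawlSpider pattern. As a point of reference, a minimal self-contained spider built the same way might look like this (the example.com URL and the trivial parse_auto body are placeholders, not part of this project):

from urllib.parse import urlparse

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class MinimalAutoSpider(CrawlSpider):
    name = "minimal_auto"
    start_urls = ["http://example.com/"]
    # restrict the crawl to the start URL's host, as the snippet above does
    allowed_domains = [urlparse(start_urls[0]).hostname]
    # follow every extracted link and hand each response to parse_auto
    rules = [Rule(LinkExtractor(), follow=True, callback="parse_auto")]

    def parse_auto(self, response):
        # placeholder callback: record the URL and page title of every followed page
        yield {"url": response.url, "title": response.css("title::text").get()}

The domain filtering the comments refer to is done by Scrapy's built-in offsite spider middleware, which drops requests whose host is not covered by allowed_domains.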
Example #2
    def __init__(self):
        rs = ReadSetting()  # read saving parameters from the settings file
        self.pagecount_max = rs.pagenumber()  # maximum number of pages to crawl
        self.itemcount_max = rs.itemnumber()  # maximum number of items to scrape
        self.pagecount = 0  # counter for pages crawled
        self.itemcount = 0  # counter for items scraped and downloaded
        self.page_seen = set()  # set of pages already crawled
        self.item_seen = set()  # set of items already scraped and downloaded
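
The code that actually checks these counters and seen-sets is not part of this snippet. Purely as an illustration, a hypothetical Scrapy downloader middleware could enforce such limits like this (the class name and the hard-coded limit are assumptions; the real project reads the limits through ReadSetting):

from scrapy.exceptions import IgnoreRequest


class PageLimitMiddlewareSketch(object):
    def __init__(self):
        self.pagecount_max = 100  # stand-in for rs.pagenumber()
        self.pagecount = 0
        self.page_seen = set()

    def process_request(self, request, spider):
        # drop duplicates and stop issuing requests once the page limit is reached
        if request.url in self.page_seen or self.pagecount >= self.pagecount_max:
            raise IgnoreRequest("page limit reached or URL already seen")
        self.page_seen.add(request.url)
        self.pagecount += 1
        return None  # let the request continue through the remaining middlewares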
Example #3
    def __init__(self):

        rs = ReadSetting()  # read all settings
        self.start_urls = rs.readurl()
        self.linkmatrix = LinkMatrix(rs.projectname())
        self.linkmatrix.setroot(self.start_urls)

        self.allowed_domains = rs.readalloweddomain()
        self.rules = [
            Rule(LinkExtractor(), follow=True, callback="parse_auto")
        ]
        # Crawl rule: follow every URL; Requests pointing outside the allowed domains
        # are filtered out by the spider middlewares; each resulting response is
        # handed to parse_auto.
        # Every Request passes through the spider middlewares.

        super(AutoSpider, self).__init__()
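
ReadSetting itself is not shown on this page. Based only on how it is called in these examples, a much-simplified hypothetical stand-in reading a JSON settings file could look like the following (the file name and keys are assumptions):

import json


class ReadSettingSketch(object):
    """Hypothetical stand-in for ReadSetting, inferred from its call sites above."""

    def __init__(self, path="setting.json"):
        with open(path) as f:
            self._cfg = json.load(f)

    def readurl(self):
        return self._cfg["start_urls"]        # e.g. ["http://example.com/"]

    def readalloweddomain(self):
        return self._cfg["allowed_domains"]   # e.g. ["example.com"]

    def projectname(self):
        return self._cfg["project_name"]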
Example #4
    def __init__(self, projectname):
        self.projectname = projectname
        self.roots = []

        rs = ReadSetting()
        self.allowed_domains = rs.readalloweddomain()

        # These three dicts merge in the duplicate_struct dict, so reverse-direction
        # links are also covered (i.e. both endpoints of a link are recorded even when
        # only one direction was actually logged during the crawl).
        # Full site structure: referer and url lie within the crawl scope (the allowed
        # domains) but do not necessarily match the download rules.
        self.entire_struct = dict()
        # Pages within the download scope: referer and url both match the download rules.
        self.forwardlinks = dict()
        # All outbound links: the referer matches the download rules, the url falls
        # outside the download scope (whether or not it is within the crawl scope).
        self.outlinks = dict()

        # For duplicate requests, record the corresponding referer -> url.
        self.duplicate_struct = dict()

        # These three dicts cover only one direction per link (i.e. the links
        # corresponding to duplicate Requests have already been filtered out).
        # Full site structure: referer and url lie within the crawl scope (the allowed
        # domains) but do not necessarily match the download rules.
        self.entire_struct_0 = dict()
        # Pages within the download scope: the referer does not necessarily match the
        # download rules, the url does.
        self.forwardlinks_0 = dict()
        # All outbound links: the referer does not necessarily match the download rules
        # (but is within the crawl scope), the url falls outside the crawl scope
        # (the allowed domains).
        self.outlinks_0 = dict()
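
LinkMatrix's recording methods are not included in this snippet, but each of these dicts evidently maps a referer to the set of URLs linked from it. A small standalone illustration of that structure (record_link is a hypothetical helper, not an actual LinkMatrix method):

entire_struct = dict()


def record_link(struct, referer, url):
    # map each referer to the set of URLs it links to
    struct.setdefault(referer, set()).add(url)


record_link(entire_struct, "http://example.com/", "http://example.com/a.html")
record_link(entire_struct, "http://example.com/", "http://example.com/b.html")
print(entire_struct)
# {'http://example.com/': {'http://example.com/a.html', 'http://example.com/b.html'}}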
Example #5
    def run(self):
        self.clean_tables()  # clear the tables to record this run's data

        rs = ReadSetting()
        rs.read_args()
        self.read_starturls()
        self.allwords = rs.allwords
        self.starttime = rs.starttime
        self.endtime = rs.endtime
        self.classify_dict = rs.classfy_dict

        hostname_list = []
        for url, webname in self.start_urls.items():
            hostname = urllib2.urlparse.urlparse(url).hostname
            if hostname.startswith('www.') and hostname != 'www.gov.cn':
                hostname = hostname[4:]
            hostname_list.append([webname, hostname])

        pool = Pool(self.num_crawler)
        pool.map(self.create_crawler, hostname_list)
        pool.close()
        pool.join()

        self.conn.close()
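
The loop above strips a leading 'www.' from every hostname except www.gov.cn. A quick standalone check with assumed sample URLs (the original code is Python 2 and reaches urlparse through urllib2; this sketch uses the Python 3 equivalent):

from urllib.parse import urlparse

for url in ["http://www.example.com/a", "http://www.gov.cn/news", "http://news.example.com/"]:
    hostname = urlparse(url).hostname
    if hostname.startswith('www.') and hostname != 'www.gov.cn':
        hostname = hostname[4:]
    print(hostname)
# example.com
# www.gov.cn
# news.example.com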
Example #6
    def __init__(self):
        rs = ReadSetting()  # read saving parameters from the settings file
        self.savename = rs.savingname()
        self.location = rs.savinglocation()
        self.saveingformat = rs.savingformat()

        if self.savename == 1:  # pick which method self.getpath will point to (used like a function pointer)
            self.getpath = self.getpath_1
        elif self.savename == 2:
            self.getpath = self.getpath_2
        elif self.savename == 3:
            self.getpath = self.getpath_3

        self.projectname = rs.projectname()

        try:
            os.mkdir(self.location)  # create the folder where downloaded content is saved (per the saving settings)
        except OSError as e:
            if e.errno != 17:  # 17 == EEXIST: the folder already exists, which is fine
                raise
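
On Python 3 the try/except around os.mkdir can be collapsed into a single call that tolerates an existing folder; a minimal equivalent (the folder name is a placeholder for self.location):

import os

location = "downloads"  # placeholder for self.location
os.makedirs(location, exist_ok=True)  # no error if the folder already exists (errno 17 / EEXIST)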
Example #7
    def __init__(self):

        rs = ReadSetting()  # read all settings
        self.start_urls = rs.readurl()
        self.linkmatrix = LinkMatrix(rs.projectname())
        self.linkmatrix.setroot(self.start_urls)

        self.allowed_domains = rs.readalloweddomain()
        self.allow, self.deny = rs.readurlmatch()

        self.regex_allow = re.compile('({0})'.format('|'.join(
            [re.escape(e) for e in self.allow])))  # build regular expressions from the match lists
        self.regex_deny = re.compile('({0})'.format('|'.join(
            [re.escape(e) for e in self.deny])))

        self.rules = [
            Rule(LinkExtractor(), follow=True, callback="parse_match")
        ]
        # Crawl rule: follow every URL; Requests pointing outside the allowed domains
        # are filtered out by the spider middlewares; each resulting response is
        # handed to parse_match.
        # Every Request passes through the spider middlewares.

        super(MatchSpider, self).__init__()
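
As a worked example of the regex construction above, with a small assumed allow list (the deny regex is built the same way):

import re

allow = ["news/", "article"]  # assumed sample values for rs.readurlmatch()
regex_allow = re.compile('({0})'.format('|'.join([re.escape(e) for e in allow])))

print(bool(regex_allow.search("http://example.com/news/2020.html")))  # True
print(bool(regex_allow.search("http://example.com/about.html")))      # False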
Example #8
# -*- coding: utf-8 -*-

# Scrapy settings for myproject project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

from readsetting import ReadSetting

rs = ReadSetting()

BOT_NAME = 'myproject'

SPIDER_MODULES = ['myproject.spiders']
NEWSPIDER_MODULE = 'myproject.spiders'

depth = rs.depth()
if rs.readrule() == "xpath" and depth != 0 and depth != 1:
    depth -= 1
DEPTH_LIMIT = depth  # limit the crawl depth

DOWNLOAD_TIMEOUT = rs.requesttime()  # downloader timeout (in seconds)

##CLOSESPIDER_PAGECOUNT = rs.pagenumber() # maximum number of responses to crawl before closing the spider

##CLOSESPIDER_ITEMCOUNT = rs.itemnumber() # maximum number of Items to scrape before closing the spider

CONCURRENT_REQUESTS = 50  # maximum number of concurrent requests performed by the Scrapy downloader
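
The depth adjustment above subtracts one level only when the crawl rule is "xpath" and the requested depth is neither 0 nor 1. A quick check over a few sample inputs ("regex" is only an assumed alternative rule value for illustration):

for rule, depth in [("xpath", 3), ("xpath", 1), ("xpath", 0), ("regex", 3)]:
    d = depth
    if rule == "xpath" and d != 0 and d != 1:
        d -= 1
    print(rule, depth, "->", d)
# xpath 3 -> 2
# xpath 1 -> 1
# xpath 0 -> 0
# regex 3 -> 3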