def __init__(self, conn, db, webpages_table, urls_table, log_table, start_url, source):
    rs = ReadSetting()  # read the search terms supplied by the user
    rs.readargs(conn, db, webpages_table)
    self.allstr = rs.allwords
    self.classdict = rs.classdict
    self.eventid = rs.eventid
    self.source = source
    self.exist_urls = rs.exist_urls
    self.conn = conn
    self.cur = conn.cursor()
    self.db = db
    self.webpages_table = webpages_table
    self.urls_table = urls_table
    self.log_table = log_table
    self.start_urls = [start_url]
    self.allowed_domains = (urlparse(start_url).hostname, )
    self.rules = [
        Rule(LinkExtractor(), follow=True, callback="parse_auto")
    ]
    # Crawl rules: follow every url; the spider middlewares filter out Requests whose
    # urls fall outside the allowed domains; each resulting response is passed to parse_auto.
    # Every Request goes through the spider middlewares.
    super(AutoSpider, self).__init__()
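# Hypothetical launch sketch for the spider above; CrawlerProcess.crawl forwards its
# keyword arguments to the spider's __init__. The sqlite3 connection, table names,
# start url and source below are placeholders, not values taken from the project.
import sqlite3
from scrapy.crawler import CrawlerProcess

conn = sqlite3.connect('crawler.db')  # stand-in for the project's real database handle
process = CrawlerProcess()
process.crawl(AutoSpider,
              conn=conn, db='crawler.db',
              webpages_table='webpages', urls_table='urls', log_table='log',
              start_url='http://example.com/', source='example')
process.start()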
def __init__(self):
    rs = ReadSetting()                    # read the saved parameters from the settings file
    self.pagecount_max = rs.pagenumber()  # maximum number of pages to crawl
    self.itemcount_max = rs.itemnumber()  # maximum number of items to scrape
    self.pagecount = 0                    # counter of pages crawled so far
    self.itemcount = 0                    # counter of items scraped/downloaded so far
    self.page_seen = set()                # pages already crawled
    self.item_seen = set()                # items already scraped/downloaded
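# A minimal, hypothetical sketch of how the counters and "seen" sets above could be
# used together; should_follow/should_store are illustrative names, not part of the
# project, and the real gating logic may differ.
def should_follow(self, url):
    """Allow crawling url only while the page budget is not exhausted."""
    if url in self.page_seen or self.pagecount >= self.pagecount_max:
        return False
    self.page_seen.add(url)
    self.pagecount += 1
    return True

def should_store(self, item_url):
    """Allow storing item_url only while the item budget is not exhausted."""
    if item_url in self.item_seen or self.itemcount >= self.itemcount_max:
        return False
    self.item_seen.add(item_url)
    self.itemcount += 1
    return True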
def __init__(self):
    rs = ReadSetting()  # read all settings
    self.start_urls = rs.readurl()
    self.linkmatrix = LinkMatrix(rs.projectname())
    self.linkmatrix.setroot(self.start_urls)
    self.allowed_domains = rs.readalloweddomain()
    self.rules = [
        Rule(LinkExtractor(), follow=True, callback="parse_auto")
    ]
    # Crawl rules: follow every url; the spider middlewares filter out Requests whose
    # urls fall outside the allowed domains; each resulting response is passed to parse_auto.
    # Every Request goes through the spider middlewares.
    super(AutoSpider, self).__init__()
def __init__(self, projectname):
    self.projectname = projectname
    self.roots = []
    rs = ReadSetting()
    self.allowed_domains = rs.readalloweddomain()

    # These three dicts are built by merging in the duplicate_struct dict, so that
    # reverse-direction links are also covered (i.e. links whose both endpoints were
    # seen, but for which only one direction had been recorded).
    self.entire_struct = dict()     # full site structure: referer and url are inside the crawl scope (allowed domains), not necessarily matching the download rules
    self.forwardlinks = dict()      # structure of the pages inside the download scope: referer and url both match the download rules
    self.outlinks = dict()          # all outbound links: referer matches the download rules, url is outside the download scope (whether or not it is inside the crawl scope)
    self.duplicate_struct = dict()  # for duplicate requests, record the corresponding referer -> url

    # These three dicts only cover single-direction links (i.e. the links whose
    # duplicate Requests have already been filtered out).
    self.entire_struct_0 = dict()   # full site structure: referer and url are inside the crawl scope (allowed domains), not necessarily matching the download rules
    self.forwardlinks_0 = dict()    # structure of the pages inside the download scope: referer need not match the download rules, url does
    self.outlinks_0 = dict()        # all outbound links: referer need not match the download rules (but is inside the crawl scope), url is outside the crawl scope (allowed domains)
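# Hypothetical sketch of the merge described in the comments above: fold the
# referer -> url pairs recorded for duplicate requests back into the
# single-direction structures so that both directions of each link are kept.
# merge_duplicates is an illustrative name; the sketch assumes each dict maps a
# referer to a set of urls, and the project's real merge presumably also covers
# forwardlinks and outlinks, not just entire_struct.
def merge_duplicates(self):
    for referer, urls in self.duplicate_struct.items():
        for url in urls:
            self.entire_struct.setdefault(referer, set()).add(url)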
def run(self):
    self.clean_tables()  # empty the tables, ready to record this run's data
    rs = ReadSetting()
    rs.read_args()
    self.read_starturls()
    self.allwords = rs.allwords
    self.starttime = rs.starttime
    self.endtime = rs.endtime
    self.classify_dict = rs.classfy_dict
    hostname_list = []
    for url, webname in self.start_urls.items():
        hostname = urllib2.urlparse.urlparse(url).hostname
        # drop a leading "www." prefix, except for www.gov.cn, which keeps it
        if hostname.startswith('www.') and hostname != 'www.gov.cn':
            hostname = hostname[4:]
        hostname_list.append([webname, hostname])
    pool = Pool(self.num_crawler)
    pool.map(self.create_crawler, hostname_list)
    pool.close()
    pool.join()
    self.conn.close()
def __init__(self):
    rs = ReadSetting()  # read the saving parameters from the settings file
    self.savename = rs.savingname()
    self.location = rs.savinglocation()
    self.saveingformat = rs.savingformat()
    # Bind self.getpath to the helper that matches the naming setting
    # (the attribute works like a function pointer).
    if self.savename == 1:
        self.getpath = self.getpath_1
    elif self.savename == 2:
        self.getpath = self.getpath_2
    elif self.savename == 3:
        self.getpath = self.getpath_3
    self.projectname = rs.projectname()
    try:
        os.mkdir(self.location)  # create the folder downloads are saved into (per the saving settings)
    except OSError as e:
        if e.errno == 17:  # errno 17 (EEXIST): the folder already exists
            pass
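# Purely illustrative sketch of one possible naming helper behind the dispatch
# above; the project's real getpath_1/2/3 strategies live elsewhere and may work
# differently. Assumes savingformat() returns a file-extension string, and
# requires `import hashlib` at module level (`os` is already used above).
def getpath_1(self, response):
    name = hashlib.md5(response.url.encode('utf-8')).hexdigest()  # file named after the URL hash (assumption)
    return os.path.join(self.location, name + '.' + self.saveingformat)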
def __init__(self):
    rs = ReadSetting()  # read all settings
    self.start_urls = rs.readurl()
    self.linkmatrix = LinkMatrix(rs.projectname())
    self.linkmatrix.setroot(self.start_urls)
    self.allowed_domains = rs.readalloweddomain()
    self.allow, self.deny = rs.readurlmatch()
    # Build the allow/deny regular expressions from the literal match strings.
    self.regex_allow = re.compile('({0})'.format('|'.join(
        [re.escape(e) for e in self.allow])))
    self.regex_deny = re.compile('({0})'.format('|'.join(
        [re.escape(e) for e in self.deny])))
    self.rules = [
        Rule(LinkExtractor(), follow=True, callback="parse_match")
    ]
    # Crawl rules: follow every url; the spider middlewares filter out Requests whose
    # urls fall outside the allowed domains; each resulting response is passed to parse_match.
    # Every Request goes through the spider middlewares.
    super(MatchSpider, self).__init__()
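# Hypothetical sketch of how the two patterns above might be applied when deciding
# whether a url matches the download rules; url_matches is an illustrative name and
# the project's parse_match may implement this differently. Note that an empty
# allow/deny list compiles to '()' above, which matches every url, hence the
# explicit emptiness checks here.
def url_matches(self, url):
    if self.deny and self.regex_deny.search(url):
        return False
    return bool(self.regex_allow.search(url)) if self.allow else True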
# -*- coding: utf-8 -*-

# Scrapy settings for the myproject project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

from readsetting import ReadSetting

rs = ReadSetting()

BOT_NAME = 'myproject'

SPIDER_MODULES = ['myproject.spiders']
NEWSPIDER_MODULE = 'myproject.spiders'

depth = rs.depth()
if rs.readrule() == "xpath" and depth != 0 and depth != 1:
    depth -= 1
DEPTH_LIMIT = depth  # limit the crawl depth

DOWNLOAD_TIMEOUT = rs.requesttime()  # downloader timeout (in seconds)

##CLOSESPIDER_PAGECOUNT = rs.pagenumber()  # maximum number of responses to crawl
##CLOSESPIDER_ITEMCOUNT = rs.itemnumber()  # maximum number of Items to scrape

CONCURRENT_REQUESTS = 50  # maximum number of concurrent requests performed by the Scrapy downloader