# Stdlib imports this constructor needs; Strategy, ThreadPool, BASEDIR,
# genFilename and Database are defined elsewhere in this repo.
from collections import deque
from threading import Lock

def __init__(self, args=None):
    if args is None:
        args = Strategy()  # avoid a mutable default argument shared across calls
    self.url = args.url
    self.max_depth = args.max_depth                  # maximum page depth to crawl
    self.max_count = args.max_count                  # maximum number of pages to crawl
    self.concurrency = args.concurrency              # number of threads
    self.timeout = args.timeout                      # request timeout
    self.cookies = args.cookies                      # cookies
    self.ssl_verify = args.ssl_verify                # SSL verification
    self.same_host = args.same_host                  # only follow links on the same host
    self.same_domain = args.same_domain              # only follow links on the same domain
    self.currentDepth = 1                            # current crawl depth, starting at 1
    self.keyword = args.keyword                      # keyword, decoded with the console's default encoding
    self.threadPool = ThreadPool(args.concurrency)   # thread pool with the given thread count
    self.visitedHrefs = set()                        # links already visited
    self.unvisitedHrefs = deque()                    # links waiting to be visited
    self.unvisitedHrefs.append(args.url)             # seed the queue with the start URL
    self.isCrawling = False                          # whether the crawler is running
    self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
    # print self.file
    # print 'args.url=\t', args.url
    self.database = Database(args.dbFile)            # database (FIXME: the author flags this line as problematic)
    self.lock = Lock()
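# --- usage sketch (illustrative, not from the source) ---------------------
# Assuming Strategy simply carries the attributes read in __init__ above,
# and that the enclosing class is named Crawler (the class name is not shown
# in this section), the crawler would be driven roughly like this:
#
#   strategy = Strategy()
#   strategy.url = 'http://www.example.com/'   # hypothetical values
#   strategy.max_depth = 2
#   strategy.dbFile = 'crawler.db'
#   crawler = Crawler(strategy)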
def __init__(self, filename='', url=None):
    super(CrawlerFile, self).__init__()
    self.file = filename
    self.url = url
    # a URL, when given, overrides filename: the cache path is derived from it
    if url:
        self.file = BASEDIR + '/cache/crawler/' + genFilename(url) + '.txt'
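# --- genFilename sketch (illustrative, not from the source) ---------------
# genFilename is defined elsewhere in the repo; both constructors above only
# need it to map a URL onto a filesystem-safe cache filename. A minimal
# stand-in under that assumption:

import re

def genFilename(url):
    # Replace every character that is unsafe in a filename; the real
    # implementation may hash or truncate the URL instead.
    return re.sub(r'[^0-9A-Za-z._-]', '_', url)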