Example #1
def parse(self, response):
    #print "queued %d" % len(self.crawler.engine.slot.scheduler)
    print "[*] open %s" % response.url
    item = AllCrawl2Item()
    item['inurl'] = response.url
    item['site'] = urlparse(response.url).netloc.lower()                # hostname, lowercased
    item['ext'] = splitext(urlparse(response.url).path)[1][1:].lower()  # extension without the leading dot
    item.update(parsers.get_content(response, item))                    # merge extracted content fields into the item
    return item
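All six excerpts come from the same s0i37/__all_crawl2 project and omit their surrounding module. The sketch below is a guess at that shared scaffolding, reconstructed only from the names the parse() methods use (Python 2 / Scrapy); the project's real definitions may differ.

# Hypothetical scaffolding assumed by every example below -- reconstructed
# from usage, not taken from the project itself.
import json
from os.path import split, splitext
from urlparse import urlparse   # Python 2 stdlib (urllib.parse in Python 3)

import colorama
import scrapy
from scrapy import Request

import parsers                  # project-local helper module (assumption)


class AllCrawl2Item(scrapy.Item):
    # Fields implied by the item['...'] assignments in the examples.
    inurl = scrapy.Field()      # full URL of the crawled resource
    site = scrapy.Field()       # hostname, or 'local' for filesystem crawls
    ext = scrapy.Field()        # file extension without the leading dot
    intext = scrapy.Field()     # extracted text content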
Example #2
def parse(self, response):
    #print "queued %d" % len(self.crawler.engine.slot.scheduler)
    print colorama.Fore.GREEN + "[+] open %s" % (response.url,) + colorama.Fore.RESET,  # trailing comma: stay on the same line
    items = AllCrawl2Item()
    items['inurl'] = response.url
    items['site'] = urlparse(response.url).netloc.lower()
    items['ext'] = splitext(urlparse(response.url).path)[1][1:].lower()
    for item in parsers.get_content(response.body, items):  # get_content yields one item per hit
        yield item
    print ''  # finish the line opened by the trailing-comma print
Example #3
def parse(self, response):
    #print "queued %d" % len(self.crawler.engine.slot.scheduler)
    print colorama.Fore.GREEN + "[+] open %s" % (response.url,) + colorama.Fore.RESET,
    item = AllCrawl2Item()
    item['inurl'] = response.url
    item['site'] = 'local'  # filesystem crawl: no hostname to record
    item['ext'] = splitext(urlparse(response.url).path)[1][1:].lower()
    item.update(parsers.get_content(response.body, item))
    print ''  # finish the line opened by the trailing-comma print
    return item
Example #4
File: imap.py Project: s0i37/__all_crawl2
def parse(self, response):
    print "[*] open %s" % response.url
    item = AllCrawl2Item()
    item['inurl'] = response.url
    item['site'] = urlparse(response.url).netloc.lower()
    item['ext'] = splitext(urlparse(response.url).path)[1][1:].lower()
    if not split(response.url)[1]:  # URL ends in '/': a mailbox listing, not a single message
        #print '[debug] +%s' % response.url
        item["intext"] = ''
        for message in json.loads(response.body):  # listing body is a JSON array of message URLs
            item["intext"] += message + ' '
            yield Request(message)
        yield item
    else:  # a single message
        #print '[debug] %s' % response.url
        yield parsers.get_content(response, item)
Example #5
File: ftp.py Project: s0i37/__all_crawl2
def parse(self, response):
    #print "queued %d" % len(self.crawler.engine.slot.scheduler)
    print "[*] open %s" % response.url
    if not split(response.url)[1]:  # URL ends in '/': a directory listing
        files = json.loads(response.body)  # listing body is a JSON array of {'filename', 'filetype'} entries
        for _file in files:
            print '[debug] %s' % (_file['filename'] + '/' if _file['filetype'] == 'd' else _file['filename'])
            if _file['filetype'] == 'd':  # subdirectory: recurse with a trailing slash
                yield Request(response.urljoin(_file['filename'] + '/'),
                              meta={'ftp_user': self.ftp_user, 'ftp_password': self.ftp_password})
            if _file['filetype'] == '-':  # regular file: fetch it
                yield Request(response.urljoin(_file['filename']),
                              meta={'ftp_user': self.ftp_user, 'ftp_password': self.ftp_password})
    else:  # a regular file
        item = AllCrawl2Item()
        item['inurl'] = response.url
        item['site'] = urlparse(response.url).netloc.lower()
        item['ext'] = splitext(urlparse(response.url).path)[1][1:].lower()
        yield parsers.get_content(response, item)
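A note on Example #5: the ftp_user and ftp_password meta keys are the ones Scrapy's FTP download handler reads for credentials, so the spider only needs to seed them on its first request and re-attach them on every follow-up, as parse() does. A minimal sketch of that seeding, with a made-up spider name, host, and credentials:

class FtpExampleSpider(scrapy.Spider):  # hypothetical wrapper, not project code
    name = 'ftp_example'
    ftp_user = 'anonymous'              # placeholder credentials
    ftp_password = 'guest@example.com'

    def start_requests(self):
        # Credentials travel in request.meta; Scrapy's FTP handler picks
        # them up when downloading each ftp:// URL.
        yield Request('ftp://ftp.example.com/',
                      meta={'ftp_user': self.ftp_user,
                            'ftp_password': self.ftp_password})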
Example #6
def parse(self, response):
    #print "queued %d" % len(self.crawler.engine.slot.scheduler)
    print colorama.Fore.GREEN + "[+] open %s" % (response.url,) + colorama.Fore.RESET,
    items = AllCrawl2Item()
    items['inurl'] = response.url
    items['site'] = urlparse(response.url).netloc.lower()
    items['ext'] = splitext(urlparse(response.url).path)[1][1:].lower()
    if not split(response.url)[1]:  # URL ends in '/': a directory listing
        for _file in json.loads(response.body):  # listing body is a JSON array of filenames
            yield Request(response.url + _file)
    else:  # a regular file
        for item in parsers.get_content(response.body, items):
            yield item
    print ''  # finish the line opened by the trailing-comma print
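Each parse() above is an ordinary Scrapy spider callback, so these spiders run like any other Scrapy crawl; with the hypothetical spider name from the sketch after Example #5, that would be:

scrapy crawl ftp_example -o items.json   # hypothetical name; -o writes the scraped items to a feed file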