Exemple #1
0
	def saveAllHrefsToFile(self,nonehtml=True):
		"""Write the visited and unvisited hrefs to the 'Hrefs' section.

		The hrefs in ``self.visitedHrefs`` are emitted first, followed by
		those in ``self.unvisitedHrefs``, each in its own iteration order.

		:param nonehtml: when true (the default), hrefs ending in ``.html``
			are left out of the saved list.

		Failures are reported on stdout instead of being raised, matching the
		other ``saveAll*`` methods. Only ``Exception`` subclasses are caught,
		so KeyboardInterrupt / SystemExit are no longer silently discarded.
		"""
		try:
			cf = CrawlerFile(url=self.url)
			hrefs = list(self.visitedHrefs) + list(self.unvisitedHrefs)
			contentlist = [
				href for href in hrefs
				if not (href.endswith('.html') and nonehtml)
			]
			# NOTE(review): coverfile=True presumably replaces the existing
			# file contents -- confirm against CrawlerFile.saveSection.
			cf.saveSection('Hrefs',contentlist,coverfile=True)
		except Exception as e:
			# Best-effort save: report the problem rather than hiding it.
			print('Exception:\t %s' % (e,))
Exemple #2
0
	def saveAllFileExtensions(self):
		"""Save the crawled file extensions to the 'FileExtensions' section.

		The extension list comes from ``self._getCrawlerFileExts()`` and is
		written through a ``CrawlerFile`` built for ``self.url``. Errors are
		printed to stdout and not propagated.
		"""
		try:
			extensions = self._getCrawlerFileExts()
			CrawlerFile(url=self.url).saveSection('FileExtensions',extensions)
		except Exception as e:
			# Written so it parses on Python 2.6+ and 3.x alike; the emitted
			# text is the same as the old ``print 'Exception:\t',e``.
			print('Exception:\t %s' % (e,))
Exemple #3
0
 def saveAllHrefsToFile(self, nonehtml=True):
     """Write the visited and unvisited hrefs to the 'Hrefs' section.

     Entries of ``self.visitedHrefs`` come first, then those of
     ``self.unvisitedHrefs``, each in its own iteration order.

     :param nonehtml: when true (the default), hrefs ending in ``.html``
         are excluded from the saved list.

     Errors are printed to stdout rather than raised, in line with the
     other ``saveAll*`` methods. Only ``Exception`` subclasses are caught,
     so KeyboardInterrupt / SystemExit are no longer silently swallowed.
     """
     try:
         cf = CrawlerFile(url=self.url)
         contentlist = []
         for source in (self.visitedHrefs, self.unvisitedHrefs):
             for href in source:
                 if href.endswith('.html') and nonehtml:
                     continue
                 contentlist.append(href)
         # NOTE(review): coverfile=True presumably replaces the existing
         # file contents -- confirm against CrawlerFile.saveSection.
         cf.saveSection('Hrefs', contentlist, coverfile=True)
     except Exception as e:
         # Best-effort save: surface the failure instead of hiding it.
         print('Exception:\t %s' % (e,))
Exemple #4
0
 def saveAllFileExtensions(self):
     """Save the crawled file extensions to the 'FileExtensions' section.

     Asks ``self._getCrawlerFileExts()`` for the extension list and stores
     it via a ``CrawlerFile`` created for ``self.url``. Any error is
     printed to stdout and not re-raised.
     """
     try:
         cf = CrawlerFile(url=self.url)
         cf.saveSection('FileExtensions', self._getCrawlerFileExts())
     except Exception as e:
         # Form that is valid on Python 2.6+ and 3.x; output text matches
         # the former ``print 'Exception:\t', e`` statement.
         print('Exception:\t %s' % (e,))
Exemple #5
0
	def _getCrawlerFileExts(self):
		"""Return the distinct, lower-cased file extensions of the stored hrefs.

		Reads the 'Hrefs' section of the ``CrawlerFile`` for ``self.url`` and,
		for each URL, takes the text from the last ``.`` of the final path
		segment onward (dot included). Extensions are returned in first-seen
		order. On any error the exception is printed and ``[]`` is returned.
		"""
		try:
			exts = []
			seen = set()  # fast duplicate check; ``exts`` keeps the order
			cf = CrawlerFile(url=self.url)
			for eachurl in cf.getSection('Hrefs'):
				# Stored lines may carry a trailing line break; drop it so it
				# cannot end up inside the extension.
				eachurl = eachurl.replace('\r','').replace('\n','')
				# Look only at the last path segment: a dot in a directory
				# name ('/dir.v2/file') must not be taken for an extension.
				filename = urlparse(eachurl).path.rsplit('/',1)[-1]
				pos = filename.rfind('.')
				if pos == -1:
					continue
				ext = filename[pos:].lower()
				if ext not in seen:
					seen.add(ext)
					exts.append(ext)
			return exts
		except Exception as e:
			print('Exception:\t %s' % (e,))
			return []
Exemple #6
0
 def _getCrawlerFileExts(self):
     """Collect the unique, lower-cased file extensions of the stored hrefs.

     Each URL in the 'Hrefs' section of the ``CrawlerFile`` for
     ``self.url`` contributes the part of its final path segment starting
     at the last ``.`` (dot included). The result preserves first-seen
     order. If anything fails, the exception is printed and ``[]`` is
     returned.
     """
     try:
         exts = []
         seen = set()  # fast duplicate check; ``exts`` keeps the order
         cf = CrawlerFile(url=self.url)
         urls = cf.getSection('Hrefs')
         for eachurl in urls:
             # A stored line may end with a line break; remove it so it
             # cannot leak into the extension.
             eachurl = eachurl.replace('\r', '').replace('\n', '')
             # Restrict the search to the last path segment so that a dot
             # in a directory name ('/dir.v2/file') is not misread.
             filename = urlparse(eachurl).path.rsplit('/', 1)[-1]
             pos = filename.rfind('.')
             if pos != -1:
                 ext = filename[pos:].lower()
                 if ext not in seen:
                     seen.add(ext)
                     exts.append(ext)
         return exts
     except Exception as e:
         print('Exception:\t %s' % (e,))
         return []
Exemple #7
0
	def saveAllPaths(self):
		"""Save the crawled directory paths to the 'Paths' section.

		The path list is produced by ``self._getCrawlerPaths(self.url)`` and
		written through a ``CrawlerFile`` built for ``self.url``. Errors are
		printed to stdout and not propagated.
		"""
		try:
			cf = CrawlerFile(url=self.url)
			contentlist = self._getCrawlerPaths(self.url)
			cf.saveSection('Paths',contentlist)
		except Exception as e:
			# Parses on Python 2.6+ and 3.x; output text is the same as the
			# old ``print 'Exception:\t',e`` statement.
			print('Exception:\t %s' % (e,))
Exemple #8
0
    def saveAllPaths(self):
        """Save the crawled directory paths to the 'Paths' section.

        Obtains the list from ``self._getCrawlerPaths(self.url)`` and stores
        it via a ``CrawlerFile`` created for ``self.url``. Any error is
        printed to stdout and not re-raised.
        """
        try:
            paths = self._getCrawlerPaths(self.url)
            CrawlerFile(url=self.url).saveSection('Paths', paths)
        except Exception as e:
            # Valid on Python 2.6+ and 3.x; emits the same text as the
            # former ``print 'Exception:\t', e`` statement.
            print('Exception:\t %s' % (e,))
Exemple #9
0
    def _getCrawlerPaths(self, url):
        '''Return the directory-level URLs seen on the same site as ``url``.

        Reads the 'Hrefs' section of the ``CrawlerFile`` for ``url``. For
        every stored href whose scheme and netloc equal those of ``url``,
        each prefix of its path that ends just before a ``/`` is emitted as
        ``scheme://netloc<prefix>``. A path with no ``.`` and no trailing
        ``/`` is treated as a directory, so the full path is emitted too.

        :param url: base URL; selects the crawler file and the site filter.
        :return: unique path URLs in first-seen order, none ending in ``/``.
            If an exception occurs it is printed and ``[url]`` is returned.
        '''
        try:
            paths = []
            seen = set()  # fast duplicate check; ``paths`` keeps the order
            baseulp = urlparse(url)

            cf = CrawlerFile(url=url)
            urls = cf.getSection('Hrefs')

            for eachline in urls:
                eachline = eachline.replace('\r', '').replace('\n', '')
                eachulp = urlparse(eachline)
                if (baseulp.scheme != eachulp.scheme
                        or baseulp.netloc != eachulp.netloc):
                    continue

                prefix = eachulp.scheme + '://' + eachulp.netloc
                fullpath = eachulp.path
                if fullpath.find('.') == -1 and not fullpath.endswith('/'):
                    fullpath += '/'

                pos = fullpath.find('/')
                while pos != -1:
                    tmppth = prefix + eachulp.path[:pos]
                    # Candidates ending in '/' (from '//' in the path or an
                    # empty netloc) are skipped, as before.
                    if not tmppth.endswith('/') and tmppth not in seen:
                        seen.add(tmppth)
                        paths.append(tmppth)
                    # Always step past the current slash. The old code hit
                    # ``continue`` without advancing ``pos`` and therefore
                    # looped forever on such candidates.
                    pos = fullpath.find('/', pos + 1)

            return paths
        except Exception as e:
            print('Exception:\t %s' % (e,))
            return [url]
Exemple #10
0
	def _getCrawlerPaths(self,url):
		'''Return the directory-level URLs seen on the same site as ``url``.

		Reads the 'Hrefs' section of the ``CrawlerFile`` for ``url``. For
		every stored href whose scheme and netloc equal those of ``url``,
		each prefix of its path that ends just before a ``/`` is emitted as
		``scheme://netloc<prefix>``. A path with no ``.`` and no trailing
		``/`` is treated as a directory, so the full path is emitted too.

		:param url: base URL; selects the crawler file and the site filter.
		:return: unique path URLs in first-seen order, none ending in ``/``.
			If an exception occurs it is printed and ``[url]`` is returned.
		'''
		try:
			paths = []
			seen = set()  # fast duplicate check; ``paths`` keeps the order
			baseulp = urlparse(url)

			cf = CrawlerFile(url=url)
			urls = cf.getSection('Hrefs')

			for eachline in urls:
				eachline = eachline.replace('\r','').replace('\n','')
				eachulp = urlparse(eachline)
				if baseulp.scheme != eachulp.scheme or baseulp.netloc != eachulp.netloc:
					continue

				prefix = eachulp.scheme + '://' + eachulp.netloc
				fullpath = eachulp.path
				if fullpath.find('.') == -1 and not fullpath.endswith('/'):
					fullpath += '/'

				pos = fullpath.find('/')
				while pos != -1:
					tmppth = prefix + eachulp.path[:pos]
					# Candidates ending in '/' (from '//' in the path or an
					# empty netloc) are skipped, as before.
					if not tmppth.endswith('/') and tmppth not in seen:
						seen.add(tmppth)
						paths.append(tmppth)
					# Always step past the current slash. The old code hit
					# ``continue`` without advancing ``pos`` and therefore
					# looped forever on such candidates.
					pos = fullpath.find('/',pos + 1)

			return paths
		except Exception as e:
			print('Exception:\t %s' % (e,))
			return [url]
Exemple #11
0
    def _getCrawlerPaths(self, url):
        """Return the directory-level URLs seen on the same site as ``url``.

        Reads the "Hrefs" section of the ``CrawlerFile`` for ``url``. For
        every stored href whose scheme and netloc equal those of ``url``,
        each prefix of its path that ends just before a ``/`` is emitted as
        ``scheme://netloc<prefix>``. A path with no ``.`` and no trailing
        ``/`` is treated as a directory, so the full path is emitted too.

        :param url: base URL; selects the crawler file and the site filter.
        :return: unique path URLs in first-seen order, none ending in ``/``.
            If an exception occurs it is printed and ``[url]`` is returned.
        """
        try:
            paths = []
            seen = set()  # fast duplicate check; ``paths`` keeps the order
            baseulp = urlparse(url)

            cf = CrawlerFile(url=url)
            urls = cf.getSection("Hrefs")

            for eachline in urls:
                eachline = eachline.replace("\r", "").replace("\n", "")
                eachulp = urlparse(eachline)
                same_site = (
                    baseulp.scheme == eachulp.scheme
                    and baseulp.netloc == eachulp.netloc
                )
                if not same_site:
                    continue

                prefix = eachulp.scheme + "://" + eachulp.netloc
                fullpath = eachulp.path
                if fullpath.find(".") == -1 and not fullpath.endswith("/"):
                    fullpath += "/"

                pos = fullpath.find("/")
                while pos != -1:
                    tmppth = prefix + eachulp.path[:pos]
                    # Candidates ending in "/" (from "//" in the path or an
                    # empty netloc) are skipped, as before.
                    if not tmppth.endswith("/") and tmppth not in seen:
                        seen.add(tmppth)
                        paths.append(tmppth)
                    # Always step past the current slash. The old code hit
                    # ``continue`` without advancing ``pos`` and therefore
                    # looped forever on such candidates.
                    pos = fullpath.find("/", pos + 1)

            return paths
        except Exception as e:
            print("Exception:\t %s" % (e,))
            return [url]