Ejemplo n.º 1
0
	def _getCrawlerFileExts(self):
		"""Collect the distinct file extensions referenced by the crawler file.

		Reads the 'Hrefs' section of the crawler file for ``self.url`` and
		returns the lower-cased extensions (including the leading dot, e.g.
		'.html') in first-seen order, without duplicates.

		Returns:
			list[str]: the extensions found, or [] if anything fails.
		"""
		try:
			exts = []
			cf = CrawlerFile(url=self.url)
			urls = cf.getSection('Hrefs')
			for eachurl in urls:
				path = urlparse(eachurl).path
				pos = path.rfind('.')
				# A path without a dot carries no extension to record.
				if pos != -1:
					ext = path[pos:].lower()
					if ext not in exts:
						exts.append(ext)
			return exts
		except Exception as e:
			# Best-effort: report the failure and return an empty result so
			# the caller can proceed without extension data.
			# (Fixed: 'except Exception, e' / print statement are Python 2
			# only and fail to parse on Python 3.)
			print('Exception:\t', e)
			return []
Ejemplo n.º 2
0
 def _getCrawlerFileExts(self):
     """Return the unique, lower-cased file extensions (with leading dot)
     found among the 'Hrefs' entries of the crawler file for ``self.url``.

     Order of first appearance is preserved; on any error an empty list
     is returned after printing the exception.
     """
     try:
         seen = []
         crawler_file = CrawlerFile(url=self.url)
         for href in crawler_file.getSection('Hrefs'):
             url_path = urlparse(href).path
             dot_at = url_path.rfind('.')
             if dot_at == -1:
                 # No dot anywhere in the path -> skip this href.
                 continue
             extension = url_path[dot_at:].lower()
             if extension not in seen:
                 seen.append(extension)
         return seen
     except Exception as e:
         # Fixed Python-2-only 'except Exception, e' / print-statement
         # syntax, which is a SyntaxError on Python 3.
         print('Exception:\t', e)
         return []
Ejemplo n.º 3
0
    def _getCrawlerPaths(self, url):
        """Derive the ancestor directory URLs of every same-host link found
        in the crawler file for *url*.

        For each href in the 'Hrefs' section that shares *url*'s scheme and
        netloc, every prefix of its path up to (but excluding) each '/' is
        turned into ``scheme://netloc/prefix`` and collected once, in
        first-seen order.

        Returns:
            list[str]: the collected path URLs, or ``[url]`` on any error.
        """
        try:
            paths = []
            baseulp = urlparse(url)

            cf = CrawlerFile(url=url)
            urls = cf.getSection('Hrefs')

            for eachline in urls:
                eachline = eachline.replace('\r', '').replace('\n', '')
                eachulp = urlparse(eachline)
                # Only consider links on the same scheme and host.
                if baseulp.scheme != eachulp.scheme or baseulp.netloc != eachulp.netloc:
                    continue
                fullpath = eachulp.path
                # Treat an extension-less path as a directory so its final
                # segment is also scanned as a prefix.
                if '.' not in fullpath and not fullpath.endswith('/'):
                    fullpath += '/'
                pos = 0
                while True:
                    pos = fullpath.find('/', pos)
                    if pos == -1:
                        break
                    tmppth = eachulp.scheme + '://' + eachulp.netloc + eachulp.path[:pos]
                    # BUG FIX: advance past this slash unconditionally.  The
                    # original only incremented pos after a successful append,
                    # so the trailing-slash 'continue' branch re-found the
                    # same '/' forever (infinite loop on paths with '//').
                    pos += 1
                    if tmppth.endswith('/'):
                        continue
                    if tmppth not in paths:
                        paths.append(tmppth)

            return paths
        except Exception as e:
            # Best-effort fallback: at least the original URL is usable.
            # (Also fixed Python-2-only except/print syntax.)
            print('Exception:\t', e)
            return [url]
Ejemplo n.º 4
0
	def _getCrawlerPaths(self, url):
		"""Build the list of ancestor directory URLs for all same-host links
		listed in the crawler file for *url*.

		Each qualifying href contributes ``scheme://netloc + path[:pos]``
		for every '/' position *pos* in its (directory-normalised) path;
		duplicates are dropped, first-seen order is kept.

		Returns:
			list[str]: collected path URLs, or ``[url]`` on any error.
		"""
		try:
			paths = []
			baseulp = urlparse(url)

			cf = CrawlerFile(url=url)
			urls = cf.getSection('Hrefs')

			for eachline in urls:
				# Strip CR/LF left over from the crawler file.
				eachline = eachline.replace('\r', '').replace('\n', '')
				eachulp = urlparse(eachline)
				if baseulp.scheme != eachulp.scheme or baseulp.netloc != eachulp.netloc:
					continue
				fullpath = eachulp.path
				# An extension-less path is treated as a directory so its
				# last segment is scanned too.
				if '.' not in fullpath and not fullpath.endswith('/'):
					fullpath += '/'
				pos = 0
				while True:
					pos = fullpath.find('/', pos)
					if pos == -1:
						break
					tmppth = eachulp.scheme + '://' + eachulp.netloc + eachulp.path[:pos]
					# BUG FIX: advance pos unconditionally; the original
					# skipped the increment on the trailing-slash branch and
					# spun forever on paths containing '//'.  (Also repaired
					# the corrupted mixed-indentation line and the
					# Python-2-only except/print syntax.)
					pos += 1
					if tmppth.endswith('/'):
						continue
					if tmppth not in paths:
						paths.append(tmppth)

			return paths
		except Exception as e:
			print('Exception:\t', e)
			return [url]
Ejemplo n.º 5
0
    def _getCrawlerPaths(self, url):
        """Collect ancestor directory URLs for every link in the crawler
        file that lives on the same scheme and host as *url*.

        Every '/' in each qualifying path yields one candidate
        ``scheme://netloc`` + path-prefix URL; candidates ending in '/'
        are skipped and duplicates are kept out, preserving first-seen
        order.

        Returns:
            list[str]: the candidate URLs, or ``[url]`` if anything fails.
        """
        try:
            paths = []
            baseulp = urlparse(url)

            cf = CrawlerFile(url=url)
            urls = cf.getSection("Hrefs")

            for eachline in urls:
                eachline = eachline.replace("\r", "").replace("\n", "")
                eachulp = urlparse(eachline)
                # Restrict to links on the same scheme and host.
                if baseulp.scheme != eachulp.scheme or baseulp.netloc != eachulp.netloc:
                    continue
                fullpath = eachulp.path
                # Normalise extension-less paths to directories so their
                # final segment is also considered.
                if "." not in fullpath and not fullpath.endswith("/"):
                    fullpath += "/"
                pos = 0
                while True:
                    pos = fullpath.find("/", pos)
                    if pos == -1:
                        break
                    tmppth = eachulp.scheme + "://" + eachulp.netloc + eachulp.path[:pos]
                    # BUG FIX: advance past the found slash before any
                    # 'continue'; the original looped forever on '//' because
                    # the trailing-slash branch never moved pos forward.
                    pos += 1
                    if tmppth.endswith("/"):
                        continue
                    if tmppth not in paths:
                        paths.append(tmppth)

            return paths
        except Exception as e:
            # Fixed Python-2-only 'except Exception, e' / print statement.
            print("Exception:\t", e)
            return [url]