def saveAllHrefsToFile(self, nonehtml=True):
    # Persist every crawled href (visited and unvisited) into the [Hrefs]
    # section of this url's CrawlerFile.  With nonehtml=True, hrefs ending
    # in '.html' are skipped.
    # NOTE: these helpers rely on CrawlerFile and urlparse (Python 2
    # urlparse module) being imported at module level.
    try:
        cf = CrawlerFile(url=self.url)
        contentlist = []
        hrefs = list(self.visitedHrefs) + list(self.unvisitedHrefs)
        for href in hrefs:
            if href.endswith('.html') and nonehtml:
                continue
            contentlist.append(href)
        cf.saveSection('Hrefs', contentlist, coverfile=True)
        # (An older version wrote the [Hrefs] section to self.file by hand;
        # CrawlerFile.saveSection replaces that.)
    except Exception, e:
        print 'Exception:\t', e
def saveAllFileExtensions(self):
    # Persist the distinct file extensions found among the crawled hrefs
    # into the [FileExtensions] section of the CrawlerFile.
    try:
        cf = CrawlerFile(url=self.url)
        contentlist = self._getCrawlerFileExts()
        cf.saveSection('FileExtensions', contentlist)
    except Exception, e:
        print 'Exception:\t', e
def _getCrawlerFileExts(self):
    # Collect the distinct, lower-cased file extensions (e.g. '.php', '.js')
    # that appear in the [Hrefs] section of the CrawlerFile.
    try:
        exts = []
        cf = CrawlerFile(url=self.url)
        urls = cf.getSection('Hrefs')
        for eachurl in urls:
            eachulp = urlparse(eachurl)
            pos = eachulp.path.rfind('.')
            if pos != -1:
                ext = eachulp.path[pos:].lower()
                if ext not in exts:
                    exts.append(ext)
        return exts
    except Exception, e:
        print 'Exception:\t', e
        return []
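# A minimal illustration of the extension helpers above, assuming a crawler
# instance whose CrawlerFile already holds a populated [Hrefs] section;
# the 'crawler' name and the sample hrefs are hypothetical.
#
#     exts = crawler._getCrawlerFileExts()
#     # hrefs such as /index.php, /js/app.js and /css/site.css
#     # would yield ['.php', '.js', '.css']
#     crawler.saveAllFileExtensions()   # writes them under [FileExtensions]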
def saveAllPaths(self):
    # Persist every directory-level path derived from the crawled hrefs
    # into the [Paths] section of the CrawlerFile.
    try:
        cf = CrawlerFile(url=self.url)
        contentlist = self._getCrawlerPaths(self.url)
        cf.saveSection('Paths', contentlist)
        # (An older version appended the [Paths] section to self.file by
        # hand; CrawlerFile.saveSection replaces that.)
    except Exception, e:
        print 'Exception:\t', e
def _getCrawlerPaths(self, url):
    '''Enumerate the parent paths of every crawled href that shares the
    scheme and netloc of url; e.g. http://example.com/a/b/c.html yields
    http://example.com, http://example.com/a and http://example.com/a/b.'''
    try:
        paths = []
        baseulp = urlparse(url)
        cf = CrawlerFile(url=url)
        urls = cf.getSection('Hrefs')
        for eachline in urls:
            eachline = eachline.replace('\r', '').replace('\n', '')
            eachulp = urlparse(eachline)
            # Only keep hrefs from the same scheme and host as url.
            if baseulp.scheme == eachulp.scheme and baseulp.netloc == eachulp.netloc:
                fullpath = eachulp.path
                # Treat an extension-less path as a directory so its last
                # segment is also enumerated.
                if '.' not in fullpath and not fullpath.endswith('/'):
                    fullpath += '/'
                pos = 0
                while True:
                    pos = fullpath.find('/', pos)
                    if pos == -1:
                        break
                    tmppth = eachulp.scheme + '://' + eachulp.netloc + eachulp.path[:pos]
                    # Advance past this '/' before any skip so the scan
                    # cannot stall on duplicate slashes in the path.
                    pos += 1
                    if tmppth.endswith('/'):
                        continue
                    if tmppth not in paths:
                        paths.append(tmppth)
        return paths
    except Exception, e:
        print 'Exception:\t', e
        return [url]
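# A minimal end-to-end sketch of how these persistence helpers are meant to
# be chained after a crawl.  The Crawler constructor, the craw() call and the
# example URL below are assumptions for illustration; only the three save
# methods come from this module.
#
#     crawler = Crawler(url='http://example.com/')
#     crawler.craw()                     # hypothetical: run the crawl itself
#     crawler.saveAllHrefsToFile()       # [Hrefs] section, .html links skipped
#     crawler.saveAllFileExtensions()    # [FileExtensions] section
#     crawler.saveAllPaths()             # [Paths] section, e.g. /a/b.html
#                                        # contributes http://example.com
#                                        # and http://example.com/a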