import spider
from pprint import pprint

if __name__ == '__main__':

    def show(label, value):
        '''Prints a step number followed by the pretty-printed result

        Arguments:
        label -- number identifying the step
        value -- object handed to pprint'''
        print(label)
        pprint(value)

    # Targets shared by most calls below
    FTP_ROOT = 'ftp://localhost/'
    WEB_ROOT = 'http://localhost/'

    # Module-level helper functions
    show(1, spider.ftpurls(FTP_ROOT))
    # This call is made without the trailing slash
    show(2, spider.ftppaths('ftp://localhost'))
    show(3, spider.weburls(WEB_ROOT))
    show(4, spider.weburls(WEB_ROOT, 200, 5, 3))
    spider.ftpmirror('e:\\ftp\\', 14, FTP_ROOT)
    show(5, spider.ftpspider(FTP_ROOT))
    show(6, spider.webpaths(WEB_ROOT))
    spider.webreport('e:\\web1.txt', WEB_ROOT)
    spider.webmirror('e:\\web\\', 18, WEB_ROOT)
    show(7, spider.webspider(WEB_ROOT))

    # Report writers, one output file each
    spider.urlreport('e:\\web2.txt', WEB_ROOT)
    spider.badurlreport('e:\\web3.txt', WEB_ROOT)
    spider.badhtmreport('e:\\web4.txt', WEB_ROOT)
    spider.redireport('e:\\web5.txt', WEB_ROOT)
    # This call is made without the trailing slash
    spider.outreport('e:\\web6.txt', 'http://localhost')
    spider.othereport('e:\\web7.txt', WEB_ROOT)

    # Same operations through a Spider instance
    a = spider.Spider(FTP_ROOT, 200, 16)
    a.ftppaths()
    show(1, a.paths)
    a.ftpurls()
def _mapflat(self, ip):
    '''Returns PSI map of a list of input paths

    The list of paths is obtained from one of three sources, chosen by
    inspecting ip: an FTP crawl when ip contains "ftp://", a web crawl when
    it contains "http://", otherwise a tar archive with a zip archive as
    the fallback when tarfile cannot read it.

    Arguments:
    ip -- input path (FTP URL, HTTP URL, or archive pathname)

    Returns the root collection with a node created for every path.
    If opening the archive raises IOError, an error message is printed and
    the process exits via sys.exit(0).'''

    def makepath(sp, kind='collection'):
        '''Fills a DOM hierarchy if a PSI node's parent isn't found

        Reads fp, source and root from the enclosing scope, so fp must be
        set for the current path before this is called.

        Arguments:
        sp -- split path to verify (shortened in place while missing
              parents are created)
        kind -- type of PSI node, "collection" or "resource"
                (default: "collection")'''

        def findparent(tn):
            '''Finds or creates parents for a PSI node

            Arguments:
            tn -- temporary node hierarchy'''
            # Shorten split path by one
            if len(sp) != 2:
                del sp[-1]
            # Get parent of current PSI node
            parent = getparent(sp, root, 1)
            # Attach to parent if current in root node
            if parent:
                parent.insertBefore(tn, parent.firstChild)
            # Otherwise create new parent and look for parent's parent
            else:
                temp = collection(sp[-2])
                temp.insertBefore(tn, temp.firstChild)
                findparent(temp)

        # Look up parent
        parent = getparent(sp, root, 1)
        # If parent found...
        if parent:
            # Create resource and attach to parent
            if kind == 'resource':
                walkfile(sp[-1], fp, parent, source)
            # Or create a collection if it doesn't exist
            elif not getparent(sp, root, 1, -1):
                collection(sp[-1], parent)
        # Otherwise make parent and look for its parent
        else:
            # Create parent
            temp = collection(sp[-2])
            # Create resource and attach to new parent
            if kind == 'resource':
                walkfile(sp[-1], fp, temp, source)
            # Or create a new collection if it doesn't exist
            elif not getparent(sp, root, 1, -1):
                collection(sp[-1], temp)
            if len(sp) != 2:
                findparent(temp)

    # Avoid lookups, clear source, and create root
    getparent, collection, source = self._getparent, self._collection, None
    walkfile, root = self._walkfile, self._collection(u'root')
    # Per-path extras that only some sources provide. None means "not
    # available"; this replaces the earlier reliance on NameError from
    # unbound locals, which also hid genuine NameErrors raised in makepath.
    fullpaths = None    # full paths parallel to flatlist (FTP and web only)
    tl = None           # tar members parallel to flatlist (tarballs only)
    # If FTP, process FTP
    if 'ftp://' in ip:
        # Import URL parser and FTP spider
        from spider import ftpspider
        import urlparse as up
        self._up = up
        # Get full paths, partial paths, and FTP session
        flatlist, fullpaths, self._session = ftpspider(ip, 10, 500)
    # If web, process web
    elif 'http://' in ip:
        # Import web spider and URL library
        from spider import httpspider
        import urllib as ulib
        self._ulib = ulib
        # Get full and partial paths
        flatlist, fullpaths = httpspider(ip, 10, 500)
    # Otherwise treat the input as an archive on disk
    else:
        # NOTE(review): self._path is presumably os.path -- confirm
        root = collection(self._path.split(ip)[1])
        # Import tarfile handler
        import tarfile
        # Try opening tarball
        try:
            # Maintain source for other possible extractions
            source = tarfile.open(ip)
            # Make path and file lists
            flatlist, tl = source.getnames(), source.getmembers()
        # NameError is kept in the handler to preserve the original
        # set of exceptions that trigger the zip fallback
        except (NameError, tarfile.ReadError):
            # Import zipfile handler
            from zipfile import ZipFile
            # Maintain source for other possible extractions
            source = ZipFile(ip)
            # Make path and file lists
            flatlist = source.namelist()
        # If no archive, terminate process
        except IOError:
            print('Error: Invalid pathname "%s"' % ip)
            import sys
            sys.exit(0)
    # Process paths; enumerate gives each entry its own position, so
    # duplicate names no longer resolve to the first occurrence
    for pp, path in enumerate(flatlist):
        # Split the path
        sp = path.split('/')
        # Remove empty strings
        if sp[0] == '':
            del sp[0]
        if sp[-1] == '':
            del sp[-1]
        # Insert root into each path
        sp.insert(0, self._getname(root))
        # Adjust full path if necessary (read by makepath via closure)
        if fullpaths is None:
            fp = path
        else:
            fp = fullpaths[pp]
        # If tar members are available, ask the member whether it is a
        # directory; otherwise a trailing '/' indicates a directory
        if tl is not None:
            isdir = tl[pp].isdir()
        else:
            isdir = path[-1] == '/'
        # Directories become collections, anything else a resource
        if isdir:
            makepath(sp)
        else:
            makepath(sp, 'resource')
    return root