Beispiel #1
0
import spider
from pprint import pprint

if __name__ == '__main__':
    # Demo driver: calls the spider module's helpers against a local
    # FTP and web server. Argument values are passed through unchanged;
    # their meaning is defined by the spider module.
    ftp_root = 'ftp://localhost/'
    web_root = 'http://localhost/'

    def show(label, value):
        '''Prints a numeric label, then pretty-prints a value

        Arguments:
        label -- number printed on its own line
        value -- object handed to pprint'''
        print(label)
        pprint(value)

    # FTP listings (the second call deliberately omits the trailing slash,
    # exactly as in the original demo)
    show(1, spider.ftpurls(ftp_root))
    show(2, spider.ftppaths('ftp://localhost'))

    # Web URL listings, with default and with explicit extra arguments
    show(3, spider.weburls(web_root))
    show(4, spider.weburls(web_root, 200, 5, 3))

    # FTP mirror call followed by the FTP spider
    spider.ftpmirror('e:\\ftp\\', 14, ftp_root)
    show(5, spider.ftpspider(ftp_root))

    # Web paths, report, mirror call and web spider
    show(6, spider.webpaths(web_root))
    spider.webreport('e:\\web1.txt', web_root)
    spider.webmirror('e:\\web\\', 18, web_root)
    show(7, spider.webspider(web_root))

    # Remaining report writers, one output file each
    spider.urlreport('e:\\web2.txt', web_root)
    spider.badurlreport('e:\\web3.txt', web_root)
    spider.badhtmreport('e:\\web4.txt', web_root)
    spider.redireport('e:\\web5.txt', web_root)
    spider.outreport('e:\\web6.txt', 'http://localhost')
    spider.othereport('e:\\web7.txt', web_root)

    # Same operations through a Spider instance
    crawler = spider.Spider(ftp_root, 200, 16)
    crawler.ftppaths()
    show(1, crawler.paths)
    crawler.ftpurls()
Beispiel #2
0
    def _mapflat(self, ip):
        '''Returns PSI map of a list of input paths

        Collects a flat list of '/'-separated paths from an FTP site, a
        web site, or a tar/zip archive and builds a node tree from them
        under a root collection.

        Arguments:
        ip -- input path: a string containing "ftp://" or "http://", or
              otherwise a path handed to tarfile.open() and, if that is
              not readable as a tarball, to ZipFile()

        Returns the root collection node.

        If opening the tarball raises IOError, an error message is printed
        and sys.exit(0) is called.'''

        def makepath(sp, kind='collection'):
            '''Fills a DOM hierarchy if a PSI node's parent isn't found

            getparent(sp, root, 1) is used as the parent lookup and
            getparent(sp, root, 1, -1) as the "does this node already
            exist" check (as described by the original comments; confirm
            against _getparent).

            Arguments:
            sp -- split path to verify (may be shortened in place)
            kind -- type of PSI node (default: "collection"); the value
                    "resource" creates a resource via walkfile instead'''

            def findparent(tn):
                '''Finds or creates parents for a PSI node

                Arguments:
                tn -- temporary node hierarchy'''
                # Shorten split path by one (mutates makepath's sp in place)
                if len(sp) != 2:
                    del sp[-1]
                # Get parent of current PSI node
                parent = getparent(sp, root, 1)
                if parent:
                    # Attach the temporary hierarchy as the first child
                    parent.insertBefore(tn, parent.firstChild)
                else:
                    # Otherwise create new parent and look for parent's parent
                    temp = collection(sp[-2])
                    temp.insertBefore(tn, temp.firstChild)
                    findparent(temp)

            # Look up parent
            parent = getparent(sp, root, 1)
            if parent:
                # Create resource and attach to parent
                if kind == 'resource':
                    walkfile(sp[-1], fp, parent, source)
                # Or create a collection if it doesn't exist
                elif not getparent(sp, root, 1, -1):
                    collection(sp[-1], parent)
            else:
                # Otherwise make parent and look for its parent
                temp = collection(sp[-2])
                # Create resource and attach to new parent
                if kind == 'resource':
                    walkfile(sp[-1], fp, temp, source)
                # Or create a new collection if it doesn't exist
                elif not getparent(sp, root, 1, -1):
                    collection(sp[-1], temp)
                # NOTE(review): when len(sp) == 2 the new parent is not
                # attached anywhere by this code -- confirm this is intended
                if len(sp) != 2:
                    findparent(temp)

        # Avoid lookups, clear source, and create root
        getparent, collection, source = self._getparent, self._collection, None
        walkfile, root = self._walkfile, self._collection(u'root')
        # Explicit "not available" markers: only the FTP/HTTP branches
        # provide full paths, and only a readable tarball provides members
        fullpaths = tl = None
        # If FTP, process FTP
        if 'ftp://' in ip:
            # Import URL parser and FTP spider
            from spider import ftpspider
            import urlparse as up
            self._up = up
            # Get full paths, partial paths, and FTP session
            flatlist, fullpaths, self._session = ftpspider(ip, 10, 500)
        elif 'http://' in ip:
            # Import web spider and URL library
            from spider import httpspider
            import urllib as ulib
            self._ulib = ulib
            # Get full and partial paths
            flatlist, fullpaths = httpspider(ip, 10, 500)
        else:
            # Name the root after the tail of the path; self._path is
            # presumably os.path -- confirm where it is assigned
            root = collection(self._path.split(ip)[1])
            # Try opening tarball
            try:
                # Import tarfile handler
                import tarfile
                # Maintain source for other possible extractions
                source = tarfile.open(ip)
                # Make path and file lists
                flatlist, tl = source.getnames(), source.getmembers()
            except (NameError, tarfile.ReadError):
                # Not a tarball: fall back to the zipfile handler.
                # NOTE(review): errors raised by ZipFile(ip) here are not
                # covered by the IOError clause below and will propagate.
                from zipfile import ZipFile
                # Maintain source for other possible extractions
                source = ZipFile(ip)
                # Make path and file lists
                flatlist = source.namelist()
            # If the path cannot be opened, terminate process
            except IOError:
                print('Error: Invalid pathname "%s"' % ip)
                import sys
                # NOTE(review): exits with status 0 even though this is an
                # error path; kept as-is for existing callers
                sys.exit(0)
        # Process paths; enumerate keeps each entry aligned with its own
        # position even when the same name occurs more than once
        for pp, path in enumerate(flatlist):
            sp = path.split('/')
            # Remove empty strings left by leading/trailing slashes
            if sp[0] == '':
                del sp[0]
            if sp[-1] == '':
                del sp[-1]
            # Insert root into each path
            sp.insert(0, self._getname(root))
            # Use the spider's full path when one exists (read by makepath
            # through the enclosing scope)
            if fullpaths is not None:
                fp = fullpaths[pp]
            else:
                fp = path
            # Tarball members know whether they are directories; for every
            # other source a trailing '/' indicates a directory
            if tl is not None:
                isdir = tl[pp].isdir()
            else:
                isdir = path.endswith('/')
            # Directories become collections, anything else a resource
            if isdir:
                makepath(sp)
            else:
                makepath(sp, 'resource')
        return root