def __init__(self, programpath):
    """
    Load the program XML and build every lookup table derived from it.

    :param programpath: location of the program XML, handed to getProgram()
    """
    # The logger must exist before getProgram() runs: its download-failure
    # path reports through self.log, and without it the real error would
    # be masked by an AttributeError.
    self.log = Logger('Program')

    self.DOM = None
    self.Collections = {}
    self.Groups = {}
    self.Products = {}
    self.Hierarchy = {}
    self.Bucket = None

    self.getProgram(programpath)

    # Populate everything
    self.getBucket()
    self.getProjectFile()
    self.parseCollections()
    self.parseGroups()
    self.parseProducts()
    self.parseTree(self.DOM.find('Hierarchy/*'))
def s3ProductWalker(bucket, patharr, currpath=None, currlevel=0):
    """
    Given a path array ending in a Product, snake through the S3 bucket
    recursively and log every product XML file that is found.

    :param bucket: bucket identifier handed to Transfer
    :param patharr: list of level dicts. Each has a 'type' of 'collection',
                    'group' or 'product'; 'group' and 'product' levels also
                    carry a 'folder'
    :param currpath: list of path segments walked so far. It is copied, never
                     modified in place
    :param currlevel: index into patharr of the level to process
    :return: None
    """
    # Nothing left to walk: bail out before creating any S3 client
    if currlevel >= len(patharr):
        return

    # Work on a private copy. The previous mutable default (currpath=[]) was
    # appended to, so state leaked from one top-level call into the next.
    path = [] if currpath is None else list(currpath)
    level = patharr[currlevel]

    # If it's a collection then we need to iterate over folders and recurse on each
    if level['type'] == 'collection':
        # list everything at this collection
        pref = "/".join(path) + "/" if len(path) > 0 else ""
        result = Transfer(bucket).list(pref, Delimiter='/')
        if 'CommonPrefixes' in result:
            for o in result.get('CommonPrefixes'):
                # Drop the trailing slash of the prefix before splitting it
                s3ProductWalker(bucket, patharr,
                                o.get('Prefix')[:-1].split('/'),
                                currlevel + 1)

    # If it's a group then no iteration necessary. Just append the path and recurse
    elif level['type'] == 'group':
        path.append(level['folder'])
        s3ProductWalker(bucket, patharr, path, currlevel + 1)

    # If it's a product then find the XML files in its folder and log them
    elif level['type'] == 'product':
        path.append(level['folder'])
        result = Transfer(bucket).list("/".join(path) + "/", Delimiter='/')
        if 'Contents' in result:
            log = Logger('ProductWalk')
            for c in result['Contents']:
                if os.path.splitext(c['Key'])[1] == '.xml':
                    log.info('Project: {0} (Modified: {1})'.format(
                        c['Key'], c['LastModified']))
    return
def download(self):
    """
    Download one file from S3.

    Fetches self.fullkey into self.abspath through self.s3, creating the
    destination folder first when it does not exist yet.

    :return: None
    :raises Exception: if the destination directory cannot be created
    """
    log = Logger('S3FileDownload')

    # Make a directory if that's needed. dirname() is '' when abspath is a
    # bare file name; there is nothing to create in that case.
    dirpath = os.path.dirname(self.abspath)
    if dirpath and not os.path.exists(dirpath):
        try:
            os.makedirs(dirpath)
        except Exception as e:
            # Someone else may have created the folder in the meantime;
            # only give up when it really is not there.
            if not os.path.isdir(dirpath):
                raise Exception(
                    "ERROR: Directory `{0}` could not be created: {1}".format(
                        dirpath, e))

    log.info("Downloading: {0} ==> ".format(self.fullkey))
    # This step prints straight to stdout and does not log
    self.s3.download(self.fullkey, self.abspath, size=self.s3size)
    # Emit a newline after the transfer output above
    print("")
    log.debug("Download Completed: {0}".format(self.abspath))
def menuwalk(program, nodes=None, currpath=None):
    """
    Walk through the program hierarchy interactively, asking the user to pick
    a folder at each level until a product is reached.

    :param program: object exposing .Hierarchy (root tree node) and .Bucket
    :param nodes: list of candidate tree nodes for the current level.
                  Defaults to the root of the program hierarchy
    :param currpath: list of folder names chosen so far. It is copied, never
                     modified in place
    :return: the list of chosen folder names once a product is reached, or
             None when the chosen node is neither a product nor has children
    """
    log = Logger('menuwalk')

    # Private copy. The previous mutable default (currpath=[]) was appended
    # to, so a second call started with the choices made during the first.
    currpath = [] if currpath is None else list(currpath)

    if nodes is None:
        nodes = [program.Hierarchy]

    name = nodes[0]['node']['name'] if len(nodes) == 1 else ""

    # Get the list at the current path
    pathstr = '/'.join(currpath) + '/' if len(currpath) > 0 else ""
    levellist = s3GetFolderList(program.Bucket, pathstr)
    querystr = "Collection Choice: {0}{1}".format(pathstr, name)

    choicename = querychoices(querystr, levellist, "Select:")
    currpath.append(choicename)

    # With several candidates the chosen folder decides which node we follow
    if len(nodes) > 1:
        node = getnodekeyval(nodes, 'folder', choicename)
    else:
        node = nodes[0]

    if node['type'] == 'product':
        pathstr = '/'.join(currpath) + '/' if len(currpath) > 0 else ""
        log.info("\nProduct Found: {0}".format(pathstr))
        return currpath

    # Now we've made our choice. We need to move on.
    elif 'children' in node and len(node['children']) > 0:
        return menuwalk(program, node['children'], currpath)
def s3GetFolderList(bucket, prefix):
    """
    List the names of the folders found directly under a prefix in a bucket.

    :param bucket: bucket identifier handed to Transfer
    :param prefix: key prefix to list under ("" or a string ending in "/")
    :return: list of folder names with the prefix and trailing slash removed
    """
    s3 = Transfer(bucket)
    results = []

    # list everything at this level
    response = s3.list(prefix, Delimiter='/')
    if 'CommonPrefixes' in response:
        for o in response.get('CommonPrefixes'):
            folder = o['Prefix']
            # Strip the prefix from the front only. str.replace() removed it
            # everywhere, which corrupted names that happen to contain it
            # (prefix "a/" turned "a/ba/" into "b").
            if folder.startswith(prefix):
                folder = folder[len(prefix):]
            results.append(folder.rstrip('/'))

    return results
def upload(self):
    """
    Send the local file at self.abspath to S3 under the key self.fullkey.

    :return: None
    """
    logger = Logger('S3FileUpload')
    source = self.abspath
    announcement = "Uploading: {0} ==> s3://{1}/{2}".format(
        source, self.bucket, self.fullkey)
    logger.info(announcement)

    # The transfer itself writes straight to stdout rather than to the log
    self.s3.upload(source, self.fullkey)
    # Emit a newline after the transfer output above
    print("")

    logger.debug("Upload Completed: {0}".format(self.abspath))
def localProductWalker(projroot, filedict, currentdir=""):
    """
    Recursively collect every file below projroot into filedict, logging the
    tree at debug level as it goes.

    :param projroot: root folder of the local project
    :param filedict: dict filled in place; maps each unix-style relative path
                     to {'src': <path of the file>}
    :param currentdir: folder (relative to projroot) currently being listed
    :return: None
    """
    log = Logger('localProdWalk')
    # The indent only depends on the folder being listed, not on the entry
    indent = len(currentdir) * ' ' + '/'

    for entry in os.listdir(os.path.join(projroot, currentdir)):
        # Paths are always stored with unix separators and only translated
        # back to windows separators where that is needed.
        relpath = os.path.join(currentdir, entry).replace('\\', '/')
        abspath = os.path.join(projroot, relpath).replace('\\', '/')

        if os.path.isfile(abspath):
            log.debug(indent + relpath)
            filedict[relpath] = {'src': abspath}
            continue

        if os.path.isdir(abspath):
            log.debug(indent + entry + '/')
            localProductWalker(projroot, filedict, relpath)
def __init__(self, projectRoot, projXMLFile):
    """
    Read the project XML found inside a local project folder.

    :param projectRoot: folder that holds the project
    :param projXMLFile: name of the project XML file inside projectRoot
    """
    self.log = Logger('Project')
    self.DOM = None

    xmlpath = path.join(projectRoot, projXMLFile)
    self.getProgramFromXML(xmlpath)

    self.LocalRoot = projectRoot
class Project():
    """
    A local project described by its project XML file.

    Attributes set by __init__:
      log       -- Logger used for progress output
      DOM       -- root element of the parsed project XML
      LocalRoot -- folder that holds the project
    """

    def __init__(self, projectRoot, projXMLFile):
        """
        :param projectRoot: folder that holds the project
        :param projXMLFile: name of the project XML file inside projectRoot
        """
        self.log = Logger('Project')
        self.DOM = None
        self.getProgramFromXML(path.join(projectRoot, projXMLFile))
        self.LocalRoot = projectRoot

    def getProgramFromXML(self, progXMLpath):
        """
        Parse the XML file and keep its root element in self.DOM.

        :param progXMLpath: path of the XML file to parse
        :raises AssertionError: if the file does not exist
        """
        assert path.isfile(
            progXMLpath), "ERROR: could not find file called: {}".format(
                progXMLpath)
        self.DOM = ET.parse(progXMLpath).getroot()

    def getPath(self, program):
        """
        Figure out what the repository path should be.

        :param program: object providing findprojpath(), testAllowedCollection()
                        and getAllowedLookup()
        :return: slash-separated remote path to the product, without a
                 leading slash
        :raises AssertionError: if <ProjectType> is missing or empty, or the
                                product is not found in the program
        """
        self.log.title('Getting remote path...')

        # First let's get the project type. A missing tag (or one without
        # text) must reach the assert below instead of dying with an
        # AttributeError on None.
        typeNode = self.DOM.find('./ProjectType')
        projType = ''
        if typeNode is not None and typeNode.text is not None:
            projType = typeNode.text.strip()
        assert projType, "ERROR: <ProjectType> not found in project XML."
        self.log.info("Project Type Detected: {0}".format(projType))

        # Now go get the product node from the program XML
        patharr = program.findprojpath(projType)
        assert patharr is not None, "ERROR: Product '{0}' not found anywhere in the program XML".format(
            projType)

        self.log.title("Building Path to Product: {0}".format(projType))

        segments = []
        for idx, level in enumerate(patharr):
            if level['type'] == 'collection':
                # The folder of a collection comes from the project metadata
                col = self.getcollection(level['name'])
                self.log.info("{0}/collection:{1} => {2}".format(
                    idx * ' ', level['name'], col))
                name = col
                if program.testAllowedCollection(level['id'], col):
                    name = program.getAllowedLookup(level['id'], col)
                segments.append(name)

            elif level['type'] == 'group':
                self.log.info("{0}/group:{1}".format(idx * ' ',
                                                      level['name']))
                segments.append(level['folder'])

            elif level['type'] == 'product':
                self.log.info("{0}/product:{1}".format(idx * ' ',
                                                        level['name']))
                segments.append(level['folder'])

        # Joining yields no leading slash, for consistency elsewhere
        extpath = '/'.join(segments)
        self.log.info("Final remote path to product: {0}".format(extpath))
        return extpath

    def getcollection(self, colname):
        """
        Try to pull the Collection out of the project file.

        :param colname: string with the Collection we're looking for
        :return: stripped text of the matching <Meta name='...'> tag
        :raises ValueError: if the tag is missing or has no text
        """
        try:
            val = self.DOM.find(
                "MetaData/Meta[@name='{0}']".format(colname)).text.strip()
        except AttributeError:
            # find() returned None, or the tag has no text
            raise ValueError(
                "ERROR: Could not find <Meta name='{0}'>########</Meta> tag in project XML"
                .format(colname))
        return val
def s3BuildOps(conf):
    """
    Compare a local folder with what's already in S3 and, given the
    direction requested, work out what has to happen to every file.

    :param conf: configuration dict. This function reads 'bucket',
                 'keyprefix', 'direction' and 'localroot'; the whole dict is
                 also handed to every S3Operation
    :return: dict mapping each relative file path to its S3Operation
    """
    s3 = Transfer(conf['bucket'])
    opstore = {}
    log = Logger("s3BuildOps")
    prefix = "{0}/".format(conf['keyprefix']).replace("//", "/")

    log.title('The following locations were found:')
    remotestr = 's3://{0}/{1}'.format(conf['bucket'], conf['keyprefix'])
    if conf['direction'] == S3Operation.Direction.UP:
        fromstr, tostr = conf['localroot'], remotestr
    else:
        fromstr, tostr = remotestr, conf['localroot']
    log.info('FROM: {0}'.format(fromstr))
    log.info('TO  : {0}'.format(tostr))

    log.title('The following operations are queued:')

    response = s3.list(prefix)

    # Get all the files we have locally
    files = {}
    if os.path.isdir(conf['localroot']):
        localProductWalker(conf['localroot'], files)

    # Fill in any files we find on the remote
    if 'Contents' in response:
        for result in response['Contents']:
            dstkey = result['Key']
            # Strip the prefix from the front only. str.replace() removed it
            # everywhere, corrupting keys that contain the prefix again
            # further down the path.
            if dstkey.startswith(prefix):
                dstkey = dstkey[len(prefix):]
            files.setdefault(dstkey, {})['dst'] = result

    for relname, fileobj in files.items():
        opstore[relname] = S3Operation(relname, fileobj, conf)

    if len(opstore) == 0:
        log.info("-- NO Operations Queued --")

    return opstore
class S3Operation:
    """
    A simple class for storing src/dst file information and the operation we
    need to perform on that file.
    """

    class FileOps:
        # Kind of an enumeration
        DELETE_REMOTE = "Delete Remote"
        DELETE_LOCAL = "Delete Local"
        UPLOAD = "Upload"
        DOWNLOAD = "Download"
        IGNORE = "Ignore"

    class Direction:
        # Kind of an enumeration
        UP = "up"
        DOWN = "down"

    class FileState:
        # Kind of an enumeration
        LOCALONLY = "Local-Only"
        REMOTEONLY = "Remote-Only"
        UPDATENEEDED = "Update Needed"
        SAME = "Files Match"

    def __init__(self, key, fileobj, conf):
        """
        :param key: The relative key/path of the file in question
        :param fileobj: the file object with 'src' and/or 'dst'
        :param conf: the configuration dictionary ('bucket', 'delete',
                     'force', 'localroot', 'direction' and 'keyprefix' are
                     read here)
        """
        self.log = Logger('S3Ops')
        self.s3 = Transfer(conf['bucket'])
        self.key = key

        # Set some sensible defaults
        self.filestate = self.FileState.SAME
        self.op = self.FileOps.IGNORE

        self.delete = conf['delete']
        self.force = conf['force']
        self.localroot = conf['localroot']
        self.bucket = conf['bucket']
        self.direction = conf['direction']
        self.keyprefix = conf['keyprefix']
        self.s3size = 0

        # And the final paths we use:
        self.abspath = self.getAbsLocalPath()
        self.fullkey = self.getS3Key()

        has_src = 'src' in fileobj
        has_dst = 'dst' in fileobj

        # The remote size (if it exists) helps us figure out percent done
        if has_dst:
            self.s3size = fileobj['dst']['Size']

        # Figure out what we have
        if has_src and has_dst:
            if s3issame(fileobj['src'], fileobj['dst']):
                self.filestate = self.FileState.SAME
            else:
                self.filestate = self.FileState.UPDATENEEDED
        elif has_src:
            self.filestate = self.FileState.LOCALONLY
        elif has_dst:
            self.filestate = self.FileState.REMOTEONLY

        self.op = self._resolve_op()
        self.log.info(str(self))

    def _resolve_op(self):
        """
        Pick the operation implied by the direction, the file state and the
        force/delete flags.

        :return: one of the FileOps values
        """
        state = self.filestate
        changed = state == self.FileState.UPDATENEEDED
        # A force only makes sense when both sides exist and match. The old
        # test ("self.FileState.SAME and self.force") was always true when
        # force was set, so forced runs tried to transfer files that do not
        # exist on the source side and never reached the delete branch.
        forced = state == self.FileState.SAME and self.force

        # The Upload Case
        if self.direction == self.Direction.UP:
            if state == self.FileState.LOCALONLY or changed or forced:
                return self.FileOps.UPLOAD
            # The remote is there but the local is not: clean up the remote.
            # This requires the delete flag be set.
            if state == self.FileState.REMOTEONLY and self.delete:
                return self.FileOps.DELETE_REMOTE

        # The Download Case
        elif self.direction == self.Direction.DOWN:
            if state == self.FileState.REMOTEONLY or changed or forced:
                return self.FileOps.DOWNLOAD
            # The local is there but the remote is not: clean up the local.
            # This requires the delete flag be set.
            if state == self.FileState.LOCALONLY and self.delete:
                return self.FileOps.DELETE_LOCAL

        return self.FileOps.IGNORE

    def getS3Key(self):
        """
        :return: the full S3 key: key prefix and relative key joined by '/'
        """
        # Not using path.join because can't be guaranteed a unix system
        return "{0}/{1}".format(self.keyprefix, self.key)

    def getAbsLocalPath(self):
        """
        :return: the local path of the file, built with the OS separator
        """
        return os.path.join(self.localroot, self.key)

    def execute(self):
        """
        Actually run the command to upload/download/delete the file
        :return: None
        """
        if self.op == self.FileOps.IGNORE:
            self.log.info(" [{0}] {1}: Nothing to do. Continuing.".format(
                self.op, self.key))

        elif self.op == self.FileOps.UPLOAD:
            self.upload()

        elif self.op == self.FileOps.DOWNLOAD:
            self.download()

        elif self.op == self.FileOps.DELETE_LOCAL:
            self.delete_local()

        elif self.op == self.FileOps.DELETE_REMOTE:
            self.delete_remote()

    def __repr__(self):
        """
        When we print this class as a string this is what we output
        """
        forcestr = "(FORCE)" if self.force else ""
        opstr = "{0:12s} ={2}=> {1:10s}".format(self.filestate, self.op,
                                                forcestr)
        return "./{1:60s} [ {0:21s} ]".format(opstr.strip(), self.key)

    def delete_remote(self):
        """
        Delete a remote file
        """
        self.log.info("Deleting: {0} ==> ".format(self.fullkey))
        # This step prints straight to stdout and does not log
        self.s3.delete(self.fullkey)
        self.log.debug("S3 Deletion Completed: {0}".format(self.fullkey))

    def delete_local(self):
        """
        Delete a local file, then remove any folders left empty above it
        """
        dirname = os.path.dirname(self.abspath)
        os.remove(self.abspath)
        self.log.info("Deleting Local file: {0} ==> ".format(self.abspath))
        # now walk backwards and clean up empty folders
        try:
            os.removedirs(dirname)
        except OSError:
            # removedirs stops at the first folder it cannot remove
            self.log.debug(
                'Folder cleanup stopped since there were still files: {0}'.
                format(dirname))
        else:
            self.log.debug('Cleaning up folders: {0}'.format(dirname))
        self.log.debug("Local Deletion Completed: {0}".format(self.abspath))

    def download(self):
        """
        Download one file: fetch self.fullkey into self.abspath, creating the
        destination folder first when it does not exist yet.

        :return: None
        :raises Exception: if the destination directory cannot be created
        """
        log = Logger('S3FileDownload')

        # Make a directory if that's needed. dirname() is '' when abspath is
        # a bare file name; there is nothing to create in that case.
        dirpath = os.path.dirname(self.abspath)
        if dirpath and not os.path.exists(dirpath):
            try:
                os.makedirs(dirpath)
            except Exception as e:
                # Someone else may have created the folder in the meantime;
                # only give up when it really is not there.
                if not os.path.isdir(dirpath):
                    raise Exception(
                        "ERROR: Directory `{0}` could not be created: {1}".
                        format(dirpath, e))

        log.info("Downloading: {0} ==> ".format(self.fullkey))
        # This step prints straight to stdout and does not log
        self.s3.download(self.fullkey, self.abspath, size=self.s3size)
        print("")
        log.debug("Download Completed: {0}".format(self.abspath))

    def upload(self):
        """
        Upload one file: send self.abspath to S3 under self.fullkey.

        :return: None
        """
        log = Logger('S3FileUpload')

        log.info("Uploading: {0} ==> s3://{1}/{2}".format(
            self.abspath, self.bucket, self.fullkey))
        # This step prints straight to stdout and does not log
        self.s3.upload(self.abspath, self.fullkey)
        print("")
        log.debug("Upload Completed: {0}".format(self.abspath))
def __init__(self, key, fileobj, conf):
    """
    Work out the state of one file and the operation it needs.

    :param key: The relative key/path of the file in question
    :param fileobj: the file object with 'src' and/or 'dst'
    :param conf: the configuration dictionary ('bucket', 'delete', 'force',
                 'localroot', 'direction' and 'keyprefix' are read here)
    """
    self.log = Logger('S3Ops')
    self.s3 = Transfer(conf['bucket'])
    self.key = key

    # Set some sensible defaults
    self.filestate = self.FileState.SAME
    self.op = self.FileOps.IGNORE

    self.delete = conf['delete']
    self.force = conf['force']
    self.localroot = conf['localroot']
    self.bucket = conf['bucket']
    self.direction = conf['direction']
    self.keyprefix = conf['keyprefix']
    self.s3size = 0

    # And the final paths we use:
    self.abspath = self.getAbsLocalPath()
    self.fullkey = self.getS3Key()

    has_src = 'src' in fileobj
    has_dst = 'dst' in fileobj

    # The remote size (if it exists) helps us figure out percent done
    if has_dst:
        self.s3size = fileobj['dst']['Size']

    # Figure out what we have
    if has_src and has_dst:
        if s3issame(fileobj['src'], fileobj['dst']):
            self.filestate = self.FileState.SAME
        else:
            self.filestate = self.FileState.UPDATENEEDED
    elif has_src:
        self.filestate = self.FileState.LOCALONLY
    elif has_dst:
        self.filestate = self.FileState.REMOTEONLY

    # The Upload Case
    # ------------------------------
    if self.direction == self.Direction.UP:
        # Two cases for uploading the file: New file or different file
        if self.filestate == self.FileState.LOCALONLY or self.filestate == self.FileState.UPDATENEEDED:
            self.op = self.FileOps.UPLOAD

        # If we've requested a force, do the upload anyway. The state has to
        # be compared explicitly: the bare constant is always truthy, which
        # made a forced run try to upload files that only exist remotely.
        elif self.filestate == self.FileState.SAME and self.force:
            self.op = self.FileOps.UPLOAD

        # If the remote is there but the local is not and we're uploading
        # then clean up the remote. This requires the delete flag be set.
        elif self.filestate == self.FileState.REMOTEONLY and self.delete:
            self.op = self.FileOps.DELETE_REMOTE

    # The Download Case
    # ------------------------------
    elif self.direction == self.Direction.DOWN:
        if self.filestate == self.FileState.REMOTEONLY or self.filestate == self.FileState.UPDATENEEDED:
            self.op = self.FileOps.DOWNLOAD

        # If we've requested a force, do the download anyway (same explicit
        # comparison as in the upload case).
        elif self.filestate == self.FileState.SAME and self.force:
            self.op = self.FileOps.DOWNLOAD

        # If the local is there but the remote is not and we're downloading
        # then clean up the local. This requires the delete flag be set.
        elif self.filestate == self.FileState.LOCALONLY and self.delete:
            self.op = self.FileOps.DELETE_LOCAL

    self.log.info(str(self))
class Program():
    """
    In-memory model of a program XML file.

    Attributes filled in by __init__:
      DOM         -- root element of the program XML
      Collections -- collection definitions keyed by id
      Groups      -- group definitions keyed by id
      Products    -- product definitions keyed by id
      Hierarchy   -- tree of {'type', 'node'[, 'children']} dicts
      Bucket      -- text of the 's3bucket' Meta tag
      ProjectFile -- text of the 'projectfile' Meta tag
    """

    def __init__(self, programpath):
        """
        :param programpath: local path or http(s) URL of the program XML
        """
        # The logger must exist before getProgram() runs: its failure path
        # reports through self.log.
        self.log = Logger('Program')

        self.DOM = None
        self.Collections = {}
        self.Groups = {}
        self.Products = {}
        self.Hierarchy = {}
        self.Bucket = None

        self.getProgram(programpath)

        # Populate everything
        self.getBucket()
        self.getProjectFile()
        self.parseCollections()
        self.parseGroups()
        self.parseProducts()
        self.parseTree(self.DOM.find('Hierarchy/*'))

    def parseCollections(self):
        """
        Pull all the collections out of the program XML
        :return: None
        """
        for col in self.DOM.findall('Definitions/Collections/Collection'):
            colid = col.attrib['id']
            allows = self.parseCollectionAllowed(col.findall('Allow'))
            self.Collections[colid] = {
                'id': colid,
                'type': 'collection',
                'name': col.attrib['name'],
                'allows': allows,
                # The first <Allow> decides the type; 'fixed' when there is none
                'allowtype': allows[0]['type'] if len(allows) > 0 else 'fixed'
            }

    def getProgram(self, progpath):
        """
        Either uses a local path or downloads an online version of the
        program XML and stores its root element in self.DOM

        :param progpath: local file path or http(s) URL
        :raises ValueError: if the URL cannot be downloaded or parsed
        """
        if re.match(r'^https?://', progpath) is not None:
            try:
                request = urllib2.Request(progpath)
                request.add_header('Pragma', 'no-cache')
                response = urllib2.build_opener().open(request)
                try:
                    data = response.read()
                finally:
                    # Close the connection even when the read fails
                    response.close()
                self.DOM = ET.fromstring(data)
            except Exception:
                err = "ERROR: Could not download <{0}>".format(progpath)
                self.log.error(err)
                raise ValueError(err)
        else:
            self.DOM = ET.parse(progpath).getroot()

    def parseCollectionAllowed(self, allowETs):
        """
        Turn <Allow> elements into plain dicts.

        :param allowETs: iterable of <Allow> elements
        :return: list of dicts, each with a 'type' of 'pattern' or 'fixed'
        """
        allows = []
        for allow in allowETs:
            if 'pattern' in allow.attrib:
                allows.append({
                    'type': 'pattern',
                    'pattern': allow.attrib['pattern'],
                })
            else:
                # Copy so the 'type' key is not written back into the DOM
                attrs = dict(allow.attrib)
                attrs['type'] = 'fixed'
                allows.append(attrs)
        return allows

    def testAllowedCollection(self, colName, desiredName):
        """
        Test if this is a valid collection to ask for

        :param colName: key of the collection in self.Collections
        :param desiredName: the collection value to validate
        :return: True when the value is allowed
        :raises AssertionError: if desiredName is empty or not allowed
        """
        collection = self.Collections[colName]
        if len(collection['allows']) == 0:
            return True

        assert len(
            desiredName
        ) > 0, "ERROR: Desired collection name for collection {0} is empty.".format(
            collection['name'])

        bGood = False
        for allow in collection['allows']:
            if allow['type'] == 'pattern':
                try:
                    if re.match(allow['pattern'], desiredName):
                        bGood = True
                except re.error as e:
                    self.log.error(
                        "Something went wrong with the allow RegEx in the Program XML file",
                        e)
            elif allow['name'] == desiredName:
                bGood = True
            elif 'aliases' in allow and desiredName in allow['aliases'].split(','):
                bGood = True

            # One match is enough
            if bGood:
                break

        assert bGood, "ERROR: Desired Collection: {0} did not pass the allowed values test for collection: {1}".format(
            desiredName, collection['name'])
        return bGood

    def getAllowedLookup(self, colName, desiredName):
        """
        Get the actual allowed name. Most of the time this is just what you
        pass in but in the case of non-pattern allows this will do a lookup

        :param colName: key of the collection in self.Collections
        :param desiredName: the collection value to look up
        :return: the 'folder' of the last fixed allow whose name matches,
                 otherwise desiredName unchanged
        """
        name = desiredName
        for allow in self.Collections[colName]['allows']:
            if allow['type'] == 'fixed' and allow['name'] == desiredName:
                name = allow['folder']
        return name

    def parseGroups(self):
        """
        Pull all the groups out of the program XML
        """
        for grp in self.DOM.findall('Definitions/Groups/Group'):
            self.Groups[grp.attrib['id']] = {
                'id': grp.attrib['id'],
                'type': 'group',
                'name': grp.attrib['name'],
                'folder': grp.attrib['folder']
            }

    def parseProducts(self):
        """
        Pull all the products out of the program XML
        """
        for prod in self.DOM.findall('Definitions/Products/Product'):
            self.Products[prod.attrib['id']] = {
                'id': prod.attrib['id'],
                'type': 'product',
                'name': prod.attrib['name'],
                'folder': prod.attrib['folder']
            }

    def parseTree(self, etNode, treeNode=None):
        """
        Recursively turn the <Hierarchy> XML into nested dicts.

        :param etNode: element to convert (Product, Group or Collection)
        :param treeNode: None for the root call; the root result is also
                         stored in self.Hierarchy
        :return: the dict built for etNode
        """
        obj = {}

        if etNode.tag == 'Product' and 'ref' in etNode.attrib:
            obj['type'] = 'product'
            obj['node'] = self.Products[etNode.attrib['ref']]

        elif etNode.tag in ['Group', 'Collection']:
            obj['children'] = []
            if etNode.tag == 'Group':
                obj['type'] = 'group'
                obj['node'] = self.Groups[etNode.attrib['ref']]
            else:
                obj['type'] = 'collection'
                obj['node'] = self.Collections[etNode.attrib['ref']]

            # Iterate the element directly: Element.getchildren() no longer
            # exists in Python 3.9+
            for child in list(etNode):
                obj['children'].append(self.parseTree(child, obj['children']))

        if treeNode is None:
            self.Hierarchy = obj

        return obj

    def getProjectFile(self):
        """
        Read the 'projectfile' Meta tag into self.ProjectFile

        :raises ValueError: if the tag is missing or has no text
        """
        try:
            self.ProjectFile = self.DOM.find(
                "MetaData/Meta[@name='projectfile']").text.strip()
        except AttributeError:
            msg = "ERROR: No <Meta Name='projectfile'>project.rs.xml</Meta> tag found in program XML"
            self.log.error(msg)
            raise ValueError(msg)
        self.log.info("Project File we're looking for: {0}".format(
            self.ProjectFile))

    def getBucket(self):
        """
        Read the 's3bucket' Meta tag into self.Bucket

        :raises ValueError: if the tag is missing or has no text
        """
        try:
            self.Bucket = self.DOM.find(
                "MetaData/Meta[@name='s3bucket']").text.strip()
        except AttributeError:
            msg = "ERROR: No <Meta Name='s3bucket'>riverscapes</Meta> tag found in program XML"
            self.log.error(msg)
            raise ValueError(msg)
        self.log.info("S3 Bucket Detected: {0}".format(self.Bucket))

    def getProdPath(self, prodName):
        """
        Look up the hierarchy path that leads to a product.

        :param prodName: name of the product (the project type)
        :return: list of definition dicts from the root down to the product
        :raises AssertionError: if prodName is empty or is not in the program
        """
        self.log.title('Getting remote path structure...')

        # First let's get the project type
        assert not _strnullorempty(
            prodName), "ERROR: <ProjectType> not found in project XML."
        self.log.info("Project Type Detected: {0}".format(prodName))

        # Now go get the product node from the program XML
        patharr = self.findprojpath(prodName)
        assert patharr is not None, "ERROR: Product '{0}' not found anywhere in the program XML".format(
            prodName)

        self.log.title("Building Path to Product: {0}".format(prodName))

        return patharr

    def findprojpath(self, prodname, node=None, path=None):
        """
        Find the path to the desired project

        :param prodname: product name to search for
        :param node: tree node to search from (defaults to self.Hierarchy)
        :param path: list of definition dicts above node. It is copied,
                     never modified in place
        :return: list of definition dicts down to the product, or None
        """
        if node is None:
            node = self.Hierarchy
        # Private copy: the old mutable default (path=[]) was appended to
        # when the root itself was the product, growing on every call.
        trail = [] if path is None else list(path)

        if node['type'] == 'product' and node['node']['name'] == prodname:
            trail.append(node['node'])
            return trail

        elif node['type'] in ['group', 'collection']:
            trail.append(node['node'])
            for child in node['children']:
                result = self.findprojpath(prodname, child, trail)
                if result is not None:
                    return result

        return None

    def progtos3path(self, progpath, level=0, currpath=None, paths=None):
        """
        A program path to a series of real S3 paths

        NOTE(review): the previous body could never produce a result (it
        indexed past the end of progpath, passed `paths` where `currpath`
        was expected and used the None returned by list.append()). This
        version implements what its comments described: one '/'-joined path
        per combination of choices. Confirm against the intended callers.

        :param progpath: list of levels; each level is either a single name
                         or a list/tuple of alternative names
        :param level: index of the level to expand
        :param currpath: names chosen so far (copied, never modified)
        :param paths: list the finished paths are appended to
        :return: list of '/'-joined paths
        """
        currpath = [] if currpath is None else list(currpath)
        if paths is None:
            paths = []

        # Are we at the end yet?
        if level >= len(progpath):
            if len(currpath) > 0:
                paths.append('/'.join(currpath))
            return paths

        choices = progpath[level]
        # One choice. Just move on:
        if not isinstance(choices, (list, tuple)):
            choices = [choices]

        for el in choices:
            self.progtos3path(progpath, level + 1, currpath + [el], paths)
        return paths