def GenerateColumnAttributesReportCMDLineArgs():
    """
    * Get command line arguments for GenerateColumnAttributesReport.py script.
    Positional arguments:
    * datapath: Path to an existing folder containing data. Occurrences of the
      R: drive root are replaced with the hard-coded UNC share path before
      the path is validated.
    * filedatereg: Regular expression string for file dates.
    * reportpath: Output path for report. Must name a file (contain a '.')
      that does not exist yet.
    Optional arguments:
    * --filenamereg: Regular expression for files in datapath. Only
      validated when it was actually supplied.
    Output:
    * Returns the parsed argparse namespace (datapath holds the rewritten path).
    Raises:
    * Exception listing every invalid argument, one per line.
    """
    parser = ArgumentParser()
    # Mandatory positional arguments:
    parser.add_argument("datapath",
                        type=str,
                        help="Path to folder containing data.")
    parser.add_argument("filedatereg",
                        type=str,
                        help="Regular expression string for file dates.")
    parser.add_argument("reportpath",
                        type=str,
                        help="Output path for report.")
    # Optional arguments:
    parser.add_argument(
        "--filenamereg",
        type=str,
        help="(Optional) Regular expression for files in datapath.")
    # parser.add_argument("--gjsonattr", action="store_true", help="(Optional) Put if want to generate skeleton file for DyETL with column attributes filled in.")
    ###############################
    # Check arg validity:
    ###############################
    args = parser.parse_args()
    errs = []
    args.datapath = args.datapath.replace('R:\\',
                                          '\\\\wanlink.us\\dfsroot\\APPS\\')
    # Ask the filesystem instead of searching for '.', because the UNC
    # replacement above ("wanlink.us") always contains a dot.
    if not os.path.exists(args.datapath):
        errs.append(' '.join(['(datapath)', args.datapath,
                              'does not exist.']))
    elif not os.path.isdir(args.datapath):
        errs.append('datapath must point to folder.')
    if '.' not in args.reportpath:
        errs.append('reportpath must point to a file.')
    elif os.path.exists(args.reportpath):
        errs.append(' '.join(
            ['(reportpath)', args.reportpath, 'already exists.']))
    if not IsRegex(args.filedatereg):
        errs.append(' '.join([
            '(filedatereg)', args.filedatereg,
            'is not a valid regular expression string.'
        ]))
    # argparse always defines the attribute (None when the flag is omitted),
    # so test the value rather than hasattr().
    if args.filenamereg is not None and not IsRegex(args.filenamereg):
        errs.append(' '.join([
            '(--filenamereg)', args.filenamereg,
            'is not a valid regular expression.'
        ]))
    if errs:
        raise Exception("\n".join(errs))
    return args
def GetAllFolderPaths(cls, headpath, folderreg):
    """
    * Collect the full paths of the folders directly inside headpath whose
      names match folderreg.
    Inputs:
    * headpath: String path to the directory whose entries are searched.
    * folderreg: Regex string or regex object matched (from the start of the
      name) against each folder name.
    Output:
    * List of full paths for the matching folders.
    Raises:
    * Exception listing every invalid input, one per line.
    """
    problems = []
    if not isinstance(headpath, str):
        problems.append('headpath must be a string.')
    elif not os.path.isdir(headpath):
        problems.append('headpath does not point to valid directory.')
    if isinstance(folderreg, str):
        # Strings are compiled once so the loop below can call .match().
        if IsRegex(folderreg):
            folderreg = re.compile(folderreg)
        else:
            problems.append('folderreg must be a valid regular expression.')
    elif not isinstance(folderreg, FileConverter.__reObj):
        problems.append('folderreg must be a string or regular expression.')
    if problems:
        raise Exception('\n'.join(problems))
    # Pair every entry name with its full path, then keep matching folders:
    candidates = ((name, os.path.join(headpath, name))
                  for name in os.listdir(headpath))
    return [
        full for name, full in candidates
        if os.path.isdir(full) and folderreg.match(name)
    ]
def GetAllFilePaths(cls, folderpath, filereg, subdirs=False):
    """
    * Get all full paths for files matching regular expression at folderpath.
    Inputs:
    * folderpath: String path to folder.
    * filereg: Regular expression string or object matched (from the start)
      against each file name. A falsy value (e.g. None) matches every file.
    Optional:
    * subdirs: Search all subdirectories in folder for files. When the same
      file name occurs more than once, the first one found by the top-down
      walk is kept, so a file in folderpath itself wins over deeper ones.
    Output:
    * Returns { FileName -> Path }.
    Raises:
    * Exception listing every invalid input, one per line.
    """
    errs = []
    if not isinstance(folderpath, str):
        errs.append('folderpath must be a string.')
    elif not os.path.exists(folderpath):
        # Checked before isdir so this message is actually reachable.
        errs.append('folderpath does not exist.')
    elif not os.path.isdir(folderpath):
        errs.append('folderpath must point to a folder.')
    if isinstance(filereg, str):
        if not IsRegex(filereg):
            errs.append(
                'filereg must be a valid regular expression string or object.')
        else:
            filereg = re.compile(filereg)
    if subdirs is not None and not isinstance(subdirs, bool):
        errs.append('subdirs must be a boolean.')
    if errs:
        raise Exception('\n'.join(errs))

    def matches(name):
        # No pattern supplied means every file qualifies.
        return not filereg or bool(filereg.match(name))

    filePaths = {}
    if not subdirs:
        for name in os.listdir(folderpath):
            fullpath = os.path.join(folderpath, name)
            if os.path.isfile(fullpath) and matches(name):
                filePaths[name] = fullpath
    else:
        # Traverse the whole tree below folderpath (top-down):
        for root, _subfolders, names in os.walk(folderpath):
            for name in names:
                if matches(name):
                    filePaths.setdefault(name, os.path.join(root, name))
    return filePaths
def GenerateFileTransferConfigJsonArgs():
    """
    * Load and validate the arguments for the GenerateFileTransferConfig
      script from ScriptArgs/GenerateFileTransferConfig.json.
    Output:
    * Returns the argument mapping: the ETLDashboardJsonArgs() result updated
      with the json file contents, with 'etlfilepaths' passed through
      FillEnvironmentVariables for 'PROD' and 'outputfolder' ending in a
      backslash.
    Raises:
    * Exception if the json file is missing or unreadable, a required
      argument is absent, or 'groupregex'/'outputfolder' is invalid.
    """
    path = 'ScriptArgs\\GenerateFileTransferConfig.json'
    if not os.path.exists(path):
        raise Exception('%s does not exist.' % path)
    try:
        args = ETLDashboardJsonArgs()
        args.update(LoadJsonFile(path))
    except Exception as ex:
        raise Exception('\n'.join(
            ['Failed to read %s' % path,
             'Reason: %s' % str(ex)]))
    absent = {'groupregex', 'outputfolder'} - set(args)
    if absent:
        raise Exception('The following required arguments are missing: %s' %
                        ','.join(absent))
    # Fill environment variables in 'etlfilepaths':
    args['etlfilepaths'] = FillEnvironmentVariables(args['etlfilepaths'],
                                                    args['config'], 'PROD')
    ############################
    # Required arguments:
    ############################
    problems = []
    # groupregex:
    groupregex = args['groupregex']
    if not isinstance(groupregex, str):
        problems.append('(groupregex) Must be a string.')
    elif not IsRegex(groupregex):
        problems.append('(groupregex) Invalid regular expression.')
    # outputfolder:
    outputfolder = args['outputfolder']
    if not isinstance(outputfolder, str):
        problems.append('(outputfolder) Must be a string.')
    elif not os.path.isdir(outputfolder):
        problems.append('(outputfolder) Does not point to valid directory.')
    elif not outputfolder.endswith('\\'):
        # Normalize so callers can append file names directly.
        args['outputfolder'] = outputfolder + '\\'
    if problems:
        raise Exception('\n'.join(problems))
    return args
def __Validate(ftsurl, etlpathsjson, chromedriverpath, groupregex):
    """
    * Check the constructor parameters and report every problem at once.
    Inputs:
    * ftsurl: Must be a string.
    * etlpathsjson: Must be a dictionary.
    * chromedriverpath: Must be a string ending in '.exe'.
    * groupregex: Must be a valid regular expression string or a regular
      expression object.
    Raises:
    * Exception listing every invalid parameter, one per line.
    """
    problems = []
    if not isinstance(ftsurl, str):
        problems.append('ftsurl must be a string.')
    if not isinstance(etlpathsjson, dict):
        problems.append('etlpathsjson must be a json dictionary.')
    if not isinstance(chromedriverpath, str):
        problems.append('chromedriverpath must be a string.')
    elif not chromedriverpath.endswith('.exe'):
        problems.append('chromedriverpath must point to an executable.')
    if isinstance(groupregex, str):
        # Strings additionally have to be a usable pattern.
        if not IsRegex(groupregex):
            problems.append(
                'groupregex must be a valid regular expression string.')
    elif not isinstance(groupregex, FileTransferServiceAggregator.__reType):
        problems.append(
            'groupregex must be a string or regular expression object.')
    if problems:
        raise Exception('\n'.join(problems))
def PostAllFiles(self,
                 etlname,
                 datafolder,
                 fileregex,
                 testmode="LOCAL",
                 waitseconds=200):
    """
    * Send all files located in datafolder matching fileregex to the ETL.
      In "LOCAL" mode: open DynamicETL.WebAPI, post the files, and run
      DynamicETL.Service to load data into local tables.
      In any other valid mode: drop the files to the target location to be
      picked up by the ETL.
    Inputs:
    * etlname: String etl name. Must be configured in serviceappsettings
      and in etlfilepaths.json.
    * datafolder: String folder containing all data files for ETL.
    * fileregex: Regular expression string or regex object to match files
      in datafolder.
    Optional:
    * testmode: String mode name (case insensitive); must be one of the
      class's valid modes. Used to fill environment variables in the
      settings and to choose between posting locally and dropping files.
    * waitseconds: Number of seconds to wait before closing
      DynamicETL.Service and WebAPI (LOCAL mode only). Must be positive.
    Output:
    * False if no matching files were found, True otherwise.
    Raises:
    * Exception listing every invalid input, one per line.
    """
    errs = []
    if not isinstance(etlname, str):
        errs.append('etlname must be a string.')
    else:
        if etlname not in self.__serviceappsettings['Etls']:
            errs.append(
                '%s is not configured in the DynamicETL.Service appsettings.json file.'
                % etlname)
        # Ensure etl is configured in etlfilepaths.json (case insensitive):
        wanted = etlname.lower()
        if not any(elem['subject'].lower() == wanted
                   for elem in self.__etlpaths['files']):
            errs.append('%s ETL is not configured in etlfilepaths.json' %
                        etlname)
    if not isinstance(datafolder, str):
        errs.append('datafolder must be a string.')
    elif not os.path.isdir(datafolder):
        errs.append('datafolder does not point to a valid folder.')
    if isinstance(fileregex, str):
        if not IsRegex(fileregex):
            errs.append('fileregex is not a valid regular expression string.')
        else:
            fileregex = re.compile(fileregex)
    elif not isinstance(fileregex, LocalLargeDataJobPoster.__reType):
        errs.append(
            'fileregex must be a regular expression string or regular expression object.'
        )
    if not isinstance(testmode, str):
        errs.append('testmode must be a string.')
    elif testmode.upper() not in LocalLargeDataJobPoster.__validModes:
        errs.append('testmode must be one of %s (case insensitive).' %
                    ', '.join(LocalLargeDataJobPoster.__validModes))
    if not isinstance(waitseconds, (float, int)):
        errs.append('waitseconds must be numeric.')
    elif waitseconds <= 0:
        errs.append('waitseconds must be positive.')
    if errs:
        raise Exception('\n'.join(errs))
    testmode = testmode.upper()
    files = self.__GetAllMatchingFiles(datafolder, fileregex)
    if len(files) == 0:
        # Exit immediately if no matching files were found:
        return False
    # Work on deep copies so the stored settings keep their placeholders:
    self.__modeserviceappsettings = FillEnvironmentVariables(
        copy.deepcopy(self.__serviceappsettings),
        copy.deepcopy(self.__config), testmode)
    self.__modeetlpaths = FillEnvironmentVariables(
        copy.deepcopy(self.__etlpaths), copy.deepcopy(self.__config),
        testmode)
    if testmode == "LOCAL":
        self.__webapiurl = self.__modeserviceappsettings["EtlJobsUrl"]
        # Post all matching files to WebAPI and run Service locally:
        self.__OpenWebAPI()
        self.__PostAllJobs(etlname, files)
        self.__OpenService()
        self.__CloseAllInstances(waitseconds)
        return True
    # Drop all matching files to target location to be picked up by ETL:
    self.__DropAllFiles(files, etlname)
    return True
def __CheckArgs(args):
    """
    * Check argument validity, throw exception if any are invalid.
    Inputs:
    * args: Mapping of argument name -> value. Must contain every name in
      the class's required-argument set. May be modified: 'outputfolder'
      gets a trailing backslash appended when it lacks one.
    Checked arguments:
    * etlname (string), data (path/sheets/delim), filedatereg ("Regex"),
      outputfolder (existing folder), filenamereg, and the optional
      allnull and convert arguments.
    Raises:
    * Exception naming the missing required arguments, or listing every
      invalid argument and missing subargument, one per line.
    """
    errs = []
    missing = Arguments.__reqArgs - set(args)
    if missing:
        raise Exception(' '.join([
            'The following required arguments are missing:',
            ', '.join(missing)
        ]))
    # Required sub-keys found missing below; kept in a list of its own
    # because 'missing' above is a set and has no append().
    missingSub = []
    #############################
    # Required Arguments:
    #############################
    # "etlname":
    if not isinstance(args['etlname'], str):
        errs.append('etlname must be a string.')
    # "data":
    if 'path' not in args['data']:
        missingSub.append('data::path')
    elif not os.path.exists(args['data']['path']):
        # Checked before isdir so this message is actually reachable.
        errs.append(' '.join(
            ['(data::path)', args['data']['path'], ' does not exist.']))
    elif not os.path.isdir(args['data']['path']):
        errs.append('data::path must point to folder.')
    if 'sheets' in args['data'] and not isinstance(args['data']['sheets'],
                                                   list):
        errs.append('data::sheets must be a list.')
    if 'delim' in args['data'] and not isinstance(args['data']['delim'], str):
        errs.append('data::delim must be a string.')
    # "filedatereg":
    if 'Regex' not in args['filedatereg']:
        errs.append('filedatereg requires "Regex" argument.')
    elif not IsRegex(args['filedatereg']['Regex']):
        errs.append(' '.join([
            '(filedatereg)', args['filedatereg']['Regex'],
            'Not a valid regular expression.'
        ]))
    # "outputfolder":
    if not os.path.isdir(args['outputfolder']):
        errs.append('(outputfolder) Folder does not exist.')
    elif not args['outputfolder'].endswith('\\'):
        args['outputfolder'] = args['outputfolder'] + '\\'
    # "filenamereg":
    if 'filenamereg' in args and not IsRegex(args['filenamereg']):
        errs.append(' '.join([
            '(filenamereg) ', args['filenamereg'],
            ' is not a valid regular expression.'
        ]))
    #####################
    # Optional:
    #####################
    # "allnull" arguments:
    if 'allnull' in args:
        allnull = args['allnull']
        # Non-strings are reported instead of crashing on .lower():
        if not isinstance(allnull, str) or allnull.lower() not in [
                'true', 'false'
        ]:
            errs.append('allnull must be "true"/"false".')
    # "convert":
    if 'convert' in args:
        convert = args['convert']
        if not ('convertpath' in convert and 'toextension' in convert):
            errs.append(
                'convert requires "toextension" and "convertpath" as attributes.'
            )
        elif '.' not in convert['toextension']:
            errs.append('%s is invalid conversion extension.' %
                        convert['toextension'])
        if 'convertpath' in convert and not os.path.exists(
                convert['convertpath']):
            errs.append('convertpath does not exist.')
    if missingSub:
        errs.append('The following required subarguments are missing: {%s}' %
                    ', '.join(missingSub))
    if errs:
        raise Exception("\n".join(errs))
def EvaluateDataJsonArgs():
    """
    * Get arguments used in EvaluateData.py script, read from
      ScriptArgs/EvaluateData.json.
    Required arguments:
    * data (with 'path'; optional 'sheets', 'delim', 'rowstart'),
      filenamereg, outputfolder, processname, recursive, tablename.
    Optional arguments:
    * allnull, filedatereg (with 'Regex').
    Output:
    * Returns the loaded arguments with these normalizations applied:
      'filenamereg' and 'filedatereg' are compiled regex objects
      ('filedatereg' is None when absent), 'recursive' and 'allnull' are
      booleans ('allnull' defaults to False), data::sheets and data::delim
      default to None when absent, 'outputfolder' ends with a backslash.
    Raises:
    * Exception if the json file is missing, a required argument is absent,
      or any argument is invalid (all problems listed, one per line).
    """
    req = {
        'data', 'filenamereg', 'outputfolder', 'processname', 'recursive',
        'tablename'
    }
    path = 'ScriptArgs\\EvaluateData.json'
    if not os.path.exists(path):
        raise Exception('%s is missing.' % path)
    # Context manager so the file handle is always released:
    with open(path, 'rb') as argsfile:
        args = json.load(argsfile)
    missing = req - set(args)
    if missing:
        raise Exception('The following required arguments are missing: %s' %
                        ','.join(missing))
    errs = []
    # Required sub-keys found missing below; a list of its own because
    # 'missing' above is a set and has no append().
    missingSub = []
    tf = {'true': True, 'false': False}

    def toBool(value):
        # Accept real JSON booleans as well as "true"/"false" strings;
        # returns None for anything else.
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            return tf.get(value.lower())
        return None

    #############################
    # Required Arguments:
    #############################
    # "processname":
    if not isinstance(args['processname'], str):
        errs.append('(processname) Must be a string.')
    # "data":
    data = args['data']
    if 'path' not in data:
        missingSub.append('data::path')
    elif not os.path.exists(data['path']):
        # Checked before isdir so this message is actually reachable.
        errs.append(' '.join(['(data::path)', data['path'],
                              ' does not exist.']))
    elif not os.path.isdir(data['path']):
        errs.append('data::path must point to folder.')
    if 'sheets' not in data:
        data['sheets'] = None
    elif not isinstance(data['sheets'], list):
        errs.append('data::sheets must be a list.')
    # Only default the delimiter when it is absent; a valid one is kept.
    if 'delim' not in data:
        data['delim'] = None
    elif not isinstance(data['delim'], str):
        errs.append('data::delim must be a string.')
    if 'rowstart' in data:
        if not isinstance(data['rowstart'], int):
            errs.append('data::rowstart must be an integer.')
        elif not data['rowstart'] >= 0:
            errs.append('data::rowstart must be nonnegative integer.')
    # "filenamereg":
    if not IsRegex(args['filenamereg']):
        errs.append(' '.join([
            '(filenamereg) ',
            str(args['filenamereg']), ' is not a valid regular expression.'
        ]))
    else:
        args['filenamereg'] = re.compile(args['filenamereg'])
    # "outputfolder":
    if not os.path.isdir(args['outputfolder']):
        errs.append('(outputfolder) Folder does not exist.')
    elif not args['outputfolder'].endswith('\\'):
        args['outputfolder'] = args['outputfolder'] + '\\'
    # "recursive":
    recursive = toBool(args['recursive'])
    if recursive is None:
        errs.append('(recursive) Must be true/false.')
    else:
        args['recursive'] = recursive
    #####################
    # Optional:
    #####################
    # "allnull" arguments:
    if 'allnull' in args:
        allnull = toBool(args['allnull'])
        if allnull is None:
            errs.append('allnull must be "true"/"false".')
        else:
            args['allnull'] = allnull
    else:
        args['allnull'] = False
    # "filedatereg":
    if 'filedatereg' in args:
        if 'Regex' not in args['filedatereg']:
            errs.append('filedatereg requires "Regex" argument.')
        elif not IsRegex(args['filedatereg']['Regex']):
            errs.append(' '.join([
                '(filedatereg)', args['filedatereg']['Regex'],
                'Not a valid regular expression.'
            ]))
        else:
            args['filedatereg'] = re.compile(args['filedatereg']['Regex'])
    else:
        # Pull all files in folder:
        args['filedatereg'] = None
    if missingSub:
        errs.append('The following required subarguments are missing: {%s}' %
                    ', '.join(missingSub))
    if errs:
        raise Exception("\n".join(errs))
    return args
def ConvertAllFilePaths(cls,
                        outfolder,
                        new_ext,
                        folderpath=None,
                        filereg=None,
                        paths=None,
                        subdirs=False):
    """
    * Convert all files listed in folderpath/paths to files with extension
      in outfolder. Conversion is a plain copy under the new name; a target
      that already exists is left untouched.
    Inputs:
    * outfolder: string output folder for converted files. Must exist.
    * new_ext: extension to convert files to (must contain a '.'). It
      replaces everything from the first '.' of the file name onwards;
      names without a '.' simply get the extension appended.
    Mutually Exclusive Inputs:
    * folderpath, filereg: put string folder containing files matching
      filereg regular expression.
    * paths: list of filepaths to convert.
    Optional Inputs:
    * subdirs: Search all subdirectories of folderpath for matching files.
    Outputs:
    * Dictionary mapping { convertedfilename -> path }.
    Raises:
    * Exception listing every invalid input, one per line.
    """
    errs = []
    if not isinstance(outfolder, str):
        errs.append('outfolder must be a string.')
    elif not os.path.exists(outfolder):
        # Checked before isdir so this message is actually reachable.
        errs.append('outfolder does not exist.')
    elif not os.path.isdir(outfolder):
        errs.append('outfolder must be point to a folder.')
    if not isinstance(new_ext, str):
        errs.append('extension must be a string.')
    elif '.' not in new_ext:
        errs.append('extension is invalid.')
    if folderpath is None and filereg is None and paths is None:
        errs.append('One of folderpath, filereg and paths must be passed.')
    elif not ((folderpath is not None and filereg is not None) ^
              (paths is not None)):
        errs.append(
            'folderpath AND filereg or (exclusive) paths must be passed.')
    elif folderpath is not None:
        if not isinstance(folderpath, str):
            errs.append('folderpath must be a string.')
        elif not os.path.exists(folderpath):
            errs.append('folderpath does not exist.')
        elif not os.path.isdir(folderpath):
            errs.append('folderpath must point to a folder.')
        if not IsRegex(filereg):
            errs.append('filereg must be a valid regular expression.')
    elif paths is not None:
        if not hasattr(paths, '__iter__'):
            errs.append('paths must be an iterable.')
    if errs:
        raise Exception('\n'.join(errs))
    # Resolve relative folders against the current working directory for
    # use with shutil:
    outfolder = os.path.abspath(outfolder)
    # Get all full file paths if not provided. folderpath is None when the
    # caller passed explicit paths, so it is only touched in folder mode.
    if folderpath is not None:
        folderpath = os.path.abspath(folderpath)
        paths = list(
            FileConverter.GetAllFilePaths(folderpath, filereg,
                                          subdirs).values())
    convertedpaths = {}
    for filepath in paths:
        filename = os.path.basename(filepath)
        # partition() keeps the whole name when it contains no '.'.
        newfilename = "%s%s" % (filename.partition('.')[0], new_ext)
        convertedpath = os.path.join(outfolder, newfilename)
        if not os.path.exists(convertedpath):
            shutil.copyfile(filepath, convertedpath)
        convertedpaths[newfilename] = convertedpath
    return convertedpaths