def tearDown(self):
    if not self.stage:
        return
    if not os.path.exists(self.stage):
        return
    f = Files({})
    if not f.is_dir_empty(self.stage):
        f.empty_dir(self.stage)
    os.rmdir(self.stage)

def tearDown(self):
    f = Files({})
    dirs = [self.stage, self.source, self.alt['source']['empty'], self.default['stage']]
    for i in dirs:
        if not i:
            continue
        if not os.path.exists(i):
            continue
        if not f.is_dir_empty(i):
            f.empty_dir(i)
        os.rmdir(i)

def validate_config(config, command):
    f = Files(config)
    if command == "auto":
        temp = f.db_load_config()
    else:
        temp = f.load_config()
    config = temp if temp else config
    config['begin'], config['end'] = check_dates(config)
    (config['site'], config['instrument'],
     config['facility'], config['datastream']) = check_sif_datastream(config)
    config['source'] = check_source(config)
    config['stage'] = check_stage(config)
    config['job'] = check_job(config)
    return config

def ask_for_dir(message, default=None, error=None, required=False, level=1):
    """ Ask user for a directory location """
    f = Files({})
    folder = input(message)
    if folder == '':
        if default is not None:
            folder = default
        elif required:
            if level < max_tries:
                folder = ask_for_dir(message, default=default, error=error,
                                     required=required, level=level + 1)
            else:
                exit(error)
        else:
            return
    folder = abspath(f.clean_path(folder))
    return folder

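# A minimal usage sketch for ask_for_dir (illustrative only; the prompt,
# default path, and error text are hypothetical, and `max_tries` is assumed
# to be a module-level retry limit defined elsewhere in this file):
#
#     stage = ask_for_dir('Specify a stage directory: ',
#                         default='/tmp/reproc_stage',
#                         error='A stage directory must be specified.',
#                         required=True)
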
def check_stage(config, level=1):
    f = Files(config)
    stage = config['stage']
    interactive = config['interactive']
    quiet = config['quiet']
    if stage is None:
        reproc_home = os.environ.get('REPROC_HOME')
        home = os.environ.get('HOME')
        if reproc_home is not None:
            stage = reproc_home
        elif home.split('/')[1] == 'data':
            stage = '{}/reprocessing/data'.format(home)
        else:
            stage = '/data/home/{}/reprocessing/data'.format(config['username'])
        if interactive:
            message = "Specify a stage directory: ({})".format(stage)
            error = 'A stage directory must be specified. Please try again.'
            default = stage
            stage = ask_for_dir(message, error=error, default=default, required=True)
    if not os.path.exists(stage):
        try:
            os.makedirs(stage)
        except OSError:
            if level < max_tries and not quiet:
                print('Unable to create {}'.format(stage))
                message = 'Please specify a new location: '
                error = 'A stage directory must be specified. Please try again.'
                config['stage'] = ask_for_dir(message, error=error, required=True)
                stage = check_stage(config, level + 1)
            else:
                exit('Unable to create {}. Please try again.'.format(stage))
    if not f.is_dir_writable(stage):
        if level < max_tries and not quiet:
            print('{} is not writable.'.format(stage))
            message = 'Please specify a new location: '
            error = 'A stage directory must be specified. Please try again.'
            config['stage'] = ask_for_dir(message, error=error, required=True)
            stage = check_stage(config, level + 1)
        else:
            exit('{} is not writable. Please try again.'.format(stage))
    return stage

def check_job(config):
    job = config['job']
    interactive = config['interactive']
    if interactive:
        message = 'Please specify a job name: '
        if job:
            message = '{0}({1})'.format(message, job)
        temp = input(message)
        if temp != '':
            job = temp
    if not job:
        import uuid
        uid = str(uuid.uuid1())
        uid = uid.split('-')
        job = '{0}{1}'.format(uid[0], uid[3])
    config['job'] = job
    f = Files(config)
    f.setup_job_dir()
    return job

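# A minimal sketch of the fallback job-name scheme used above (illustrative
# only; the UUID value shown is hypothetical):
#
#     >>> import uuid
#     >>> uid = str(uuid.uuid1()).split('-')
#     >>> uid
#     ['6fa459ea', 'ee8a', '3ca4', '894e', 'db77e160355e']
#     >>> '{0}{1}'.format(uid[0], uid[3])
#     '6fa459ea894e'
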
def main():
    if len(sys.argv) < 2:
        sys.argv.append("-h")
    if '-v' in sys.argv:
        print(apm.__version__)
        return

    # Retrieve arguments from user
    config = parse_args()
    command = config['command'].lower()

    # Check to see if this is a test
    if command == 'test':
        test_config = test.config()
        sys.argv = [sys.argv[0]]
        jprint(config, sort_keys=True, indent=4)
        unittest.main(buffer=True)
        # unittest.main()
        return

    # Not a test
    if command == 'info' or command == 'vapinfo':
        vap = VapMgr({})
        vap.vap_info()
        return

    # Validate user arguments (for the 'auto' command, validate_config loads
    # the config from the database instead of from file)
    temp = validate_config(config, command)
    config = temp if temp else config

    if command == 'check':
        jprint(config, sort_keys=True, indent=4)
        return

    # Save the config to file
    s = time.time()
    f = Files(config)
    f.save_config()
    f.load_filenames()
    files = f.files

    # Check to see if any files are not currently being tracked
    # or if any tracked files have been deleted
    print("Checking status of tracked files...", end="")
    sys.stdout.flush()
    json_file = '{0}/{1}/{1}.json'.format(config['stage'], config['job'])
    if os.path.exists(json_file) and config['ingest']:
        fp = open(json_file, 'r')
        files = json.loads(fp.read())
        fp.close()
        cwd = os.getcwd()
        os.chdir('{}/{}/collection'.format(config['stage'], config['job']))
        # Copy the keys so entries can be removed while iterating
        # (popping from a live dict view raises RuntimeError in Python 3)
        keys = list(files.keys())
        sites = set(os.listdir('.'))
        for site in keys:
            if site not in sites:
                files.pop(site)
                continue
            os.chdir(site)
            instruments = set(os.listdir('.'))
            ins_keys = list(files[site].keys())
            for ins in ins_keys:
                if ins not in instruments:
                    files[site].pop(ins)
                    continue
                os.chdir(ins)
                filelist = set(os.listdir('.'))
                for i in filelist:
                    if i not in files[site][ins] and not (os.path.isdir(i) and i == "other_files"):
                        exit("\nThe file {0}/{1}/{2} is currently untracked.\nPlease edit {3}.json to start tracking this file.\n".format(site, ins, i, config['job']))
                for i in files[site][ins]:
                    if i not in filelist:
                        files[site][ins][i]["deleted"] = True
                os.chdir('..')
            os.chdir('..')
        os.chdir(cwd)
    print("Done")
    # Done checking status of tracked files
    sys.stdout.flush()

    # Run the appropriate command
    if command == 'auto':
        print('Attempting to stage files for datastreams: {}'.format(config['datastream']))
        skip = False
        if not config['duplicates']:
            s = Stage(config, files)
            config, files = s.run()
            if config['exit']:
                exit()
        if config['duplicates']:
            skip = True
        if not skip and not config['vap']:
            r = Rename(config, files)
            config, files = r.run()
            if config['exit']:
                exit()
        exit()
    elif command == 'stage':
        print("*" * 50, "\n", json.dumps(config, indent=2), "*" * 50, "\n")
        skip = False
        if not config['duplicates']:
            s = Stage(config, files)
            config, files = s.run()
            if config['exit']:
                exit()
        if config['duplicates']:
            skip = True
        if not skip and not config['vap']:
            r = Rename(config, files)
            config, files = r.run()
            if config['exit']:
                exit()
    elif command == 'rename':
        # If rename is called explicitly, force rename even if config is set to false
        switch = not config['rename']
        if switch:
            config['rename'] = True
        if not config['vap']:
            r = Rename(config, files)
            config, files = r.run()
            if config['exit']:
                exit()
        if switch:
            config['rename'] = False
    elif command == 'process':
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files
        if has_coll:
            config = r.config
            files = r.files
        else:
            p = Process(config, files)
            config, files = p.run()
            if config['exit']:
                exit()
    elif command == 'review':
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files
        if has_coll:
            config = r.config
            files = r.files
        else:
            r = Review(config, files)
            config, files = r.run()
            if config['exit']:
                exit()
    elif command == 'remove':
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files
        if has_coll:
            config = r.config
            files = r.files
        else:
            r = Remove(config, files)
            config, files = r.run()
            if config['exit']:
                exit()
    elif command == 'archive':
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files
        if has_coll:
            config = r.config
            files = r.files
        else:
            a = Archive(config, files)
            config, files = a.run()
            if config['exit']:
                exit()
    elif command == 'cleanup':
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files
        if has_coll:
            config = r.config
            files = r.files
        else:
            c = Cleanup(config, files)
            config, files = c.run()
            if config['exit']:
                exit()
    elif command == 'prep':
        r = Rename(config, files)
        has_coll = r.check_for_collisions()
        files = r.files
        if has_coll:
            config = r.config
            files = r.files
        else:
            d = Demo(config, files)
            config, files = d.run()
            if config['exit']:
                exit()
    elif command == "notification":
        # Alka's module goes here
        print('Notify the user that data has changed.')
    else:
        sys.argv.append("-h")
        config = parse_args()

    f.config = config
    f.files = files
    f.save_config()
    f.save_filenames()

def parse_args():
    """ Setup argument parsing and parse the arguments """
    username = os.environ.get('USER')

    # Setup parser and groups
    parser = argparse.ArgumentParser(description='ARM Processing Manager')
    ui_flags = parser.add_mutually_exclusive_group()
    stage_type = parser.add_mutually_exclusive_group()

    # Setup positional arguments
    parser.add_argument('command', help='Which of the APM stages to run: stage, rename, process, review, remove, archive, cleanup')

    # Demo options
    parser.add_argument('--demo', help='Prep for different stages of a demo. Available options include: remove, archive, cleanup')

    # Date
    parser.add_argument('-b', '--begin', type=int, default=0, help='Format: YYYYMMDD - date to start processing data')
    parser.add_argument('-e', '--end', type=int, default=0, help='Format: YYYYMMDD - date to stop processing data')

    # SIF/Datastreams
    parser.add_argument('-s', '--site', help='The site the data is from')
    parser.add_argument('-i', '--instrument', help='The instrument used to collect the data')
    parser.add_argument('-f', '--facility', help='The facility where the instrument is located')
    parser.add_argument('-d', '--datastream', nargs='+', help='One or more datastream patterns. "%%" and "*" can be used as wildcards.')

    # Job
    parser.add_argument('-j', '--job', required=True, help='DQR # for the job')

    # Alias
    parser.add_argument('-a', '--alias', help='An alias for the Ingest to use to connect to the database. Default: apm')

    # Flow control flags
    parser.add_argument('--stage', help='Specify a staging directory')
    parser.add_argument('--source', help='Specify a source directory')
    parser.add_argument('--no-rename', action='store_false', help='Do not strip the ARM prefix from the files')
    parser.add_argument('--no-db-up', action='store_false', help='Do not update the config database')
    parser.add_argument('--no-compare', action='store_false', help='Do not compare the ingest output for re-archiving')

    # Other
    parser.add_argument('--ingest-flags', nargs='+', help='Flags for APM to pass to the INGEST, e.g. --ingest-flags F (do not include the leading "-"; APM adds it). Applies to all ingests when running for multiple datastreams.')

    # Ingest vs. VAP
    stage_type.add_argument('--ingest', action='store_true', help='Ingest vs. VAP (default)')
    stage_type.add_argument('--vap', action='store_true', help='VAP vs. Ingest')

    # UI Flags
    ui_flags.add_argument('-I', '--interactive', action='store_true', help='Prompt for various inputs')
    ui_flags.add_argument('-q', '--quiet', action='store_true', help='Suppress prompts and exit gracefully if unable to run')
    ui_flags.add_argument('-D', '--devel', action='store_true', help='Run APM in development mode')

    # Parse the args
    arguments = parser.parse_args()
    if not arguments.ingest and not arguments.vap:
        arguments.ingest = True

    args = {
        'command': arguments.command,
        'demo': arguments.demo,
        'begin': arguments.begin,
        'end': arguments.end,
        'site': arguments.site,
        'instrument': arguments.instrument,
        'facility': arguments.facility,
        'datastream': arguments.datastream,
        'duplicates': False,
        'job': arguments.job,
        'alias': arguments.alias,
        'stage': arguments.stage,
        'source': arguments.source,
        'rename': arguments.no_rename,
        'db_up': arguments.no_db_up,
        'compare': arguments.no_compare,
        'iflags': arguments.ingest_flags,
        'ingest': arguments.ingest,
        'vap': arguments.vap,
        'interactive': arguments.interactive,
        'quiet': arguments.quiet,
        'devel': arguments.devel,
        'username': username,
        'exit': False,
        "cleanup_status": {
            "review": {
                "status": True,
            },
            "remove": {
                "status": False,
                "deletion_list": False,
                "archive_list": False,
                "files_bundled": False,
            },
            "archive": {
                "status": False,
                "files_deleted": False,
                "move_files": False,
                "files_released": False,
            },
            "cleanup": {
                "status": False,
                "files_archived": False,
                "files_cleaned_up": False,
            },
        }
    }

    f = Files(args)
    if args['stage'] is not None:
        temp = f.clean_path(args['stage'])
        args['stage'] = abspath(temp)
    if args['source'] is not None:
        temp = f.clean_path(args['source'])
        args['source'] = abspath(temp)

    return args

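# Example invocation (illustrative only; it assumes the installed console
# entry point is named `apm`, and the job/site/instrument/facility/date
# values are hypothetical):
#
#     apm stage -j D170101.4 -s sgp -i mfrsr -f C1 -b 20170101 -e 20170131
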
def run(self):
    """ Run the archive portion of the cleanup phase """
    if not self.config['cleanup_status']['remove']['status']:
        print(self.config['cleanup_status']['remove']['status'])
        print('')
        print("Data files must be requested for deletion before the files can be archived.")
        self.config['exit'] = True
        return self.config, self.files

    # Setup vars
    stage = self.config['stage']
    job = self.config['job']

    ############################################################
    # Check to see if the current user is `dsmgr`
    ############################################################
    # Verify current user is authenticated to run this command
    if not self.authenticate():
        self.config['exit'] = True
        return self.config, self.files

    # Do this if the files have not yet been verified as deleted from the archive
    if not self.config['cleanup_status']['archive']['files_deleted']:
        print("Verifying all files have been deleted from the archive...", end="")

        ############################################################
        # Update the local archive database
        ############################################################
        # Setup the datastreams to update
        datastreams = []
        datastream_path = dir_pattern(3).format(stage, job, 'datastream')
        for site in os.listdir(datastream_path):
            path = dir_pattern().format(datastream_path, site)
            for folder in os.listdir(path):
                abs_folder = dir_pattern().format(path, folder)
                if os.path.isdir(abs_folder) and not os.path.islink(abs_folder):
                    datastreams.append(folder)

        # Update the local copy of the archive db
        if not DEVEL:
            update_archive(datastreams)

        ############################################################
        # Load the list of files to be removed from the archive
        ############################################################
        deleted_files = []
        deletion_file = dir_pattern(3).format(stage, job, "%s.deletion-list.txt" % job)
        if not os.path.exists(deletion_file):
            print("Failed")
            print("Deletion list does not exist. Please create it and try again.")
            self.config['exit'] = True
            return self.config, self.files

        fp = open(deletion_file, 'r')
        deletion_text = fp.readlines()
        fp.close()

        # Each line is expected to look like "<file_name>.v<version> <md5>"
        for line in deletion_text:
            if line.endswith("\r\n"):
                line = line[:-2]
            tar = {}
            parts, tar['md5'] = line.split(' ')
            parts = parts.split('.')
            tar['version'] = parts[-1][1:]
            tar['name'] = '.'.join(parts[:-1])
            deleted_files.append(tar)
            del tar, parts
        if 'line' in locals():
            del line

        ############################################################
        # Verify all files have been removed from the archive
        ############################################################
        # Get a list of files that are currently at the archive
        archive_files = {}
        db_file = '/apps/ds/conf/datainv/.db_connect'
        alias = 'inv_read'
        db = DB(self.config, db_file=db_file, alias=alias)

        # Store the query
        query = ("SELECT * FROM get_remote_files_by_tag('%s') "
                 "WHERE file_stamp >= %d AND file_stamp <= %d AND file_active = true "
                 "ORDER BY file_stamp, file_version;")
        # List the column names so the values can be mapped in a dictionary
        cols = ['file_tag', 'file_name', 'file_version', 'file_size', 'file_stored',
                'file_md5', 'file_stamp', 'file_checked', 'file_active']

        # Convert the start and end dates to a unix timestamp
        start = convert_date_to_timestamp(self.config['begin'])
        end = convert_date_to_timestamp(self.config['end'])

        # Query the database for each of the datastreams
        for k, v in enumerate(datastreams):
            args = (v, start, end)
            result = db.query(query % args, columns=cols)
            if len(result) > 0:
                archive_files[v] = result
            else:
                print("Failed")
                print("No results for %s" % v)

        # Store the list of what is currently in the archive and their versions to file
        current_archive = dir_pattern(3).format(stage, job, 'current_archive.json')
        fp = open(current_archive, 'w')
        fp.write(json.dumps(archive_files, indent=2, sort_keys=False, separators=(',', ': ')))
        fp.close()
        del fp

        if DEVEL:
            file_path = dir_pattern(3).format(stage, job, '%s.archive.json' % job)
            if os.path.exists(file_path):
                fp = open(file_path, 'r')
                archive_files = json.loads(fp.read())
                fp.close()
                del fp, file_path

        # Check to see if any of the "deleted_files" are in the list
        # If yes, quit; if no, proceed
        all_files_deleted = None
        if len(deleted_files) > 0:
            # Check the list of files from the archive to see if the current file has been deleted
            for f in deleted_files:
                process = '.'.join(f['name'].split('.')[0:2])
                name = f['name']
                if any(d['file_name'] == name for d in archive_files[process]):
                    all_files_deleted = False
                    print("Failed")
                    print("Not all files have been deleted from the archive.")
                    print("Please try again later.")
                    self.config['exit'] = True
                    return self.config, self.files
                else:
                    all_files_deleted = True
        else:
            all_files_deleted = True
        if 'f' in locals():
            del f
        if 'process' in locals():
            del process

        if all_files_deleted != True:
            print("Failed")
            print("Not all files have been removed from the archive.")
            print("Run this again once all files have been removed from the archive.")
            self.config['exit'] = True
            return self.config, self.files

        # Files have been deleted
        self.config['cleanup_status']['archive']['files_deleted'] = True
        print("Done")

    ############################################################
    # Move any files not being archived to subdirectories
    #
    # Processed files:
    #     This includes any processed files outside the
    #     date range specified
    # Raw/Tar files:
    #     This includes any files that do not need to be rearchived
    ############################################################
    if not self.config['cleanup_status']['archive']['move_files']:
        print("Moving files that should not be archived...", end="")
        cwd = os.getcwd()
        datastream = dir_pattern(3).format(stage, job, 'datastream')

        # Load the list of tar files that need to be archived
        os.chdir(dir_pattern().format(stage, job))
        fp = open('archive.json', 'r')
        contents = json.loads(fp.read())
        fp.close()

        tar_archive = {}
        for k, v in enumerate(contents):
            s = v['site']
            p = v['instrument']
            if s not in tar_archive:
                tar_archive[s] = {}
            if p not in tar_archive[s]:
                tar_archive[s][p] = []
            tar_archive[s][p].append(v['file_name'])
        if len(contents) > 0:
            del s, p, k, v

        os.chdir(datastream)
        sites = os.listdir(datastream)
        for i, s in enumerate(sites):
            os.chdir(s)
            processes = os.listdir('.')
            for j, p in enumerate(processes):
                no_archive = dir_pattern(4).format(datastream, s, p, 'no_archive')
                os.chdir(p)
                if p.split('.')[-1] == '00':
                    # This is a raw datastream.
                    # Get a list of non-tar files (directories excluded) and
                    # move them to a sub-directory.
                    rawfiles = [x for x in os.listdir('.')
                                if not x.endswith('tar') and not os.path.isdir(x)]
                    # Get a list of all tar files and move any that are not on
                    # the list of tar files to be archived to a sub-directory.
                    tarfiles = [x for x in glob("*.tar") if not os.path.isdir(x)]
                    for x in rawfiles:
                        if not os.path.exists(no_archive):
                            os.mkdir(no_archive)
                        elif not os.path.isdir(no_archive):
                            print("Failed")
                            print("There is a file called 'no_archive' in %s." % dir_pattern(3).format(datastream, s, p))
                            print("This file must be removed before proceeding.")
                            self.config['exit'] = True
                            return self.config, self.files
                        src = dir_pattern(4).format(datastream, s, p, x)
                        try:
                            os.rename(src, no_archive)
                        except OSError:
                            shutil.move(src, no_archive)
                    for x in tarfiles:
                        if not os.path.exists(no_archive):
                            os.mkdir(no_archive)
                        elif not os.path.isdir(no_archive):
                            print("Failed")
                            print("There is a file called 'no_archive' in %s." % dir_pattern(3).format(datastream, s, p))
                            print("This file must be removed before proceeding.")
                            self.config['exit'] = True
                            return self.config, self.files
                        if s not in tar_archive or p not in tar_archive[s] or x not in tar_archive[s][p]:
                            src = dir_pattern(4).format(datastream, s, p, x)
                            try:
                                os.rename(src, no_archive)
                            except OSError:
                                shutil.move(src, no_archive)
                else:
                    # For each processed datastream, get a list of all the files
                    # and move any that fall outside the specified date range
                    # to a sub-directory.
                    if not os.path.exists(no_archive):
                        os.mkdir(no_archive)
                    elif not os.path.isdir(no_archive):
                        print("Failed")
                        print("There is a file called 'no_archive' in %s." % dir_pattern(3).format(datastream, s, p))
                        print("This file must be removed before proceeding.")
                        self.config['exit'] = True
                        return self.config, self.files
                    # Don't include directories
                    files = [x for x in os.listdir('.') if not os.path.isdir(x)]
                    timeformat = "%Y%m%d"
                    begin = datetime.strptime(str(self.config['begin']), timeformat)
                    end = datetime.strptime(str(self.config['end']), timeformat)
                    for x in files:
                        date = x.split('.')[2]
                        filedate = datetime.strptime(date, timeformat)
                        if not (begin <= filedate <= end):
                            src = dir_pattern(4).format(datastream, s, p, x)
                            try:
                                os.rename(src, no_archive)
                            except OSError:
                                shutil.move(src, no_archive)
                os.chdir('..')
            os.chdir('..')
        os.chdir(cwd)
        print("Done")
        self.config['cleanup_status']['archive']['move_files'] = True

    ############################################################
    # Read environment variables
    ############################################################
    print("Updating environment variables...", end="")
    env_path = dir_pattern().format(stage, job)
    if not update_env(env_path):
        f = Files(self.config)
        shell = f.get_shell()
        if shell == "bash":
            ext = 'sh'
        else:
            ext = 'csh'
        print("Failed")
        exit("Error: Unable to locate env.%s." % ext)
    print("Done")  # Updating env vars

    ############################################################
    # Ensure `DBCONNECT_PATH` does not point to job `.db_connect` file
    ############################################################
    if 'DBCONNECT_PATH' in os.environ:
        del os.environ['DBCONNECT_PATH']
    # The command should be complete up to this point,
    # however I'm waiting on a response to verify the exact name
    # of this environment variable

    ############################################################
    # Run `release_data`
    ############################################################
    print("Running release_data...", end="")

    #############################################
    # Need to change this so it supports both
    # `sif` data and `datastream` data
    #############################################
    db = DB(self.config)
    data_paths = db.get_data_paths()

    commands = []
    for d in data_paths:
        output = d['output']
        (site, temp) = output.split('/')
        temp = temp.split('.')[0][3:]
        # Scan backwards for the last non-numeric character to split the
        # facility (e.g. 'C1') from the process name
        for i, e in reversed(list(enumerate(temp))):
            if not is_number(e):
                fac = i
                break
        else:
            print("Could not separate facility from %s" % temp)
            self.config['exit'] = True
            return self.config, self.files
        facility = temp[fac:]
        process = temp[:fac]
        command = ['release_data', '-s', site, '-f', facility, process]
        # Check to see if a plugin needs to modify the command
        command = self.manager.callPluginCommand('hook_release_data_command_alter', command)
        commands.append(command)

    # Code to run a shell command, copied from another part of APM;
    # it needs to be modified to work here.
    # Run the commands
    for command in commands:
        try:
            if not DEVEL:
                ps = Popen(command, stdout=PIPE, stderr=PIPE)
                ps.communicate()
                returncode = ps.returncode
                if returncode != 0:
                    print("Failed")
                    self.config['exit'] = True
                    return self.config, self.files
        except CalledProcessError as e:
            print("Failed")
            self.config['exit'] = True
            return self.config, self.files
        except Exception as e:
            raise

    print("Done")

    # Files have been released
    self.config['cleanup_status']['archive']['files_released'] = True

    # Archive is complete
    self.config['cleanup_status']['archive']['status'] = True

    return self.config, self.files
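
# A worked sketch of the facility-splitting step in run() above (illustrative
# only; the data path is hypothetical): given output = 'sgp/sgpmfrsrC1.b1',
# temp = temp.split('.')[0][3:] yields 'mfrsrC1'. Scanning the string in
# reverse, the first non-numeric character is 'C' at index 5, so
# facility = 'C1' and process = 'mfrsr', producing the command
# ['release_data', '-s', 'sgp', '-f', 'C1', 'mfrsr'].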