def __init__(self, config, archive_path, stage_path, files=None):
    """ Initialize with args.

    config       -- job configuration dict (begin/end/quiet/source/stage/job keys are read)
    archive_path -- path fragment under the source area holding the tar files
    stage_path   -- path fragment under <stage>/<job>/collection to unpack into
    files        -- optional mapping of known file names (a fresh dict if omitted)
    """
    # BUG FIX: `files` previously defaulted to a mutable `{}`, which is a
    # single shared object across every call; default to None instead.
    self.filenames = files if files is not None else {}
    self.config = config
    self.begin = self.config['begin']
    self.end = self.config['end']
    self.quiet = self.config['quiet']
    self.source = self.config['source']
    self.stage = self.config['stage']
    self.job = self.config['job']
    self.archive_path = dir_pattern().format(self.source, archive_path)
    self.stage_path = dir_pattern(4).format(self.stage, self.job, 'collection', stage_path)
    self.cwd = os.getcwd()
    # Last two path components (site/process) form the "local" relative path
    temp = self.stage_path.split('/')
    self.local = dir_pattern().format(temp[-2], temp[-1])
    # List of tar files
    self.files = None
    # duplicate file manipulation
    self.file_names = []
    self.st_files = []
    self.members = []
    self.duplicates = {}
    self.dups = None
def __init__(self, config):
    """ Initialize with config.

    Builds the job/ops paths and makes sure the ops directory exists and
    contains a symlink to the shared vapmgr.conf.
    """
    # An empty config dict means there is nothing to set up.
    if config == {}:
        return
    self.config = config
    stage_dir = self.config['stage']
    job_name = self.config['job']
    self.path = dir_pattern().format(stage_dir, job_name)
    self.job = dir_pattern().format(self.path, 'collection')
    self.ops = dir_pattern(3).format(self.path, 'conf', 'ops')
    self.file = dir_pattern().format(self.ops, 'vapmgr.conf')
    # Create the ops dir and the vapmgr.conf symlink on first use.
    if not os.path.exists(self.ops):
        os.mkdir(self.ops)
    if not os.path.exists(self.file):
        os.symlink('/data/conf/ops/vapmgr.conf', self.file)
def add_to_env(self):
    """ Append a vapmgr -setup command to the job's shell env file.

    Exits the program if the current shell is neither bash nor csh.
    """
    ext = get_shell()
    if ext == 'bash':
        ext = 'sh'
    elif ext == 'csh':
        ext = 'csh'
    else:
        exit(
            "Unable to determine shell. Please run again from Bash or CSH shell."
        )
    # NOTE(review): `db` appears unused below — confirm DB() has no
    # required side effect before removing it.
    db = DB(self.config)
    site = self.config['site']
    ins = self.config['instrument']
    fac = self.config['facility']
    output = '\n'
    # Construct the vapmgr command
    output += "/apps/ds/bin/vapmgr -setup "
    if site:
        output += "-r {}".format(site)
        if fac:
            output += ".{} ".format(fac)
        # NOTE(review): when `fac` is empty there is no space between the
        # site and instrument tokens — confirm against vapmgr usage.
    output += ins
    env = 'env.{}'.format(ext)
    env = dir_pattern().format(self.path, env)
    # Context manager guarantees the handle is closed even on write error.
    with open(env, 'a') as fp:
        fp.write(output)
    return
def __init__(self, tar, tar_file):
    """ Initialize a worker thread for one tar file.

    tar      -- the owning UnPack-style object (supplies config and stage_path)
    tar_file -- name of the tar file this thread will operate on
    """
    self.error = False
    self.result = None
    self.tar = tar
    self.file = tar_file
    self.config = self.tar.config
    # BUG FIX: stage_path is a path *string*; it must be split into
    # components before taking the last two (site/process), matching the
    # `stage_path.split('/')` usage elsewhere in this file. Indexing the
    # raw string returned its last two characters.
    temp = self.tar.stage_path.split('/')
    self.local = dir_pattern().format(temp[-2], temp[-1])
    threading.Thread.__init__(self)
def email_del_list(self, del_file):
    """ Email the job's deletion list as a text attachment.

    del_file -- file name of the deletion list inside the job directory
    """
    global DEVEL
    # Get the directory where the job and deletion file are stored
    job_dir = dir_pattern().format(self.config['stage'], self.config['job'])
    # Get the contents of the deletion list.
    # (Read via a joined path instead of the old chdir/read/chdir-back
    # dance; behavior is identical but the process cwd is never touched.)
    with open(os.path.join(job_dir, del_file), 'r') as f:
        attach_file = f.read()
    # Setup the email variables
    email_from = '*****@*****.**'
    email_to = ['*****@*****.**']
    # email_to = ['*****@*****.**']
    if DEVEL:
        email_to.append('*****@*****.**')
    text = [
        'The deletion list for %s is attached.' % self.config['job'],
        'Once the files have been deleted, run the following commands:',
        '',
        'cd %s' % job_dir,
        'apm archive -j %s' % self.config['job'],
        'Attachment:',
        del_file,
    ]
    text = '\n'.join(text)
    ############################################################
    # Create a message
    msg = MIMEMultipart()
    msg['From'] = email_from
    msg['To'] = ', '.join(email_to)
    msg['Subject'] = "APM: %s deletion list" % self.config['job']
    # Add the body of the message
    msgText = MIMEText(text)
    msg.attach(msgText)
    # Add the attachment
    attachment = MIMEText(attach_file)
    attachment.add_header('Content-Disposition', 'attachment', filename=del_file)
    msg.attach(attachment)
    # Send the message
    s = smtplib.SMTP('localhost')
    s.sendmail(email_from, email_to, msg.as_string())
    s.quit()
def run(self):
    """ Strip the ARM prefix from collection files and update bookkeeping.

    Walks <stage>/<job>/collection/<site>/<instrument>, renames each file
    via Files.rename_file(), and re-keys self.files entries under the new
    names. Returns (config, files).
    """
    manager = PluginManager()
    config = self.config
    f = Files(config)
    cwd = os.getcwd()
    stage = config['stage']
    collection = dir_pattern(3).format(stage, config['job'], 'collection')
    # Make sure files are supposed to be renamed
    if config['rename'] == False:
        return config, self.files
    # Verify there are no file collisions
    if self.check_for_collisions():
        return config, self.files
    # Strip the ARM prefix from all of the files
    print("\nStripping ARM prefix from files... ", end="")
    sys.stdout.flush()
    manager.callPluginCommand('hook_rename_preprocess', {'config': config})
    os.chdir(collection)
    sites = set(os.listdir('.'))
    for site in sites:
        os.chdir(site)
        instruments = set(os.listdir('.'))
        for ins in instruments:
            os.chdir(ins)
            files = set(os.listdir('.'))
            for i in files:
                new_name = f.rename_file(i)
                # rename_file() returns None when the file is untouched
                if new_name is not None and i != new_name:
                    # Re-key the bookkeeping entry under the new name
                    self.files[site][ins][new_name] = self.files[site][ins][i]
                    self.files[site][ins].pop(i)
                    self.files[site][ins][new_name]['current_name'] = new_name
                    self.files[site][ins][new_name]['stripped_name'] = new_name
            os.chdir('..')
        os.chdir('..')
    # BUG FIX: restore the original working directory — previously the
    # process was left inside the collection directory on return
    # (the sibling check_for_collisions() does restore it).
    os.chdir(cwd)
    manager.callPluginCommand('hook_renamed_files_alter', {'config': config})
    print("Done\n")
    sys.stdout.flush()
    return config, self.files
def get_rename_raw_process_list(self):
    """ Return the sorted list of process names supported by rename_raw.

    Runs `rename_raw -h` and parses the "SUPPORTED PROCESSES:" section of
    the help text. Returns False when the binary or the list cannot be
    found.
    """
    path = binpath
    binary = "rename_raw"
    command = dir_pattern().format(path, binary)
    if not os.path.exists(command):
        print("Cannot find Rename_raw")
        return False
    helptext = None
    try:
        ps = Popen([command, '-h'], stdout=PIPE, stderr=PIPE)
        (output, error) = ps.communicate()
        returncode = ps.returncode
        helptext = output
    except CalledProcessError as e:
        # BUG FIX: the old handler called .close() on `output`/`error`,
        # which are unbound if communicate() raised; just fall back to
        # the exception's captured output.
        helptext = e.output
    if helptext is None:
        print("Unable to get rename_raw help text")
        return False
    # Popen pipes yield bytes under Python 3; normalize before parsing.
    if isinstance(helptext, bytes):
        helptext = helptext.decode(errors='replace')
    helptext = helptext.split("\n\n")
    processes = None
    for k, v in enumerate(helptext):
        if v == "SUPPORTED PROCESSES:" and (k + 1) < len(helptext):
            processes = helptext[k + 1]
    if processes is None:
        print("Unable to get list of processes from rename_raw")
        return False
    processes = processes.split("\n")
    for k, v in enumerate(processes):
        v = re.sub(r'\s+', ' ', v).strip().split(' ')
        # A leading "*" marks an entry; the name is the next token.
        if v[0] == "*":
            processes[k] = v[1].lower()
        else:
            processes[k] = v[0].lower()
    # De-duplicate and sort for a stable result
    processes = list(set(processes))
    processes.sort()
    return processes
def get_tar_structure(self, path):
    """ Return the structure of the files contained within the tar files
    at the given location.

    The given path should be where a site directory is located.
    Example: datastream should be provided
             job/datastream/site/process/list_of_tar_files
    """
    global DEVEL
    structure = {}
    for site in os.listdir(path):
        structure[site] = {}
        # Only raw (*.00) process directories are scanned
        pattern = dir_pattern(3).format(path, site, '*.00')
        for process_path in glob(pattern):
            process = process_path.split('/')[-1]
            structure[site][process] = {}
            tar_names = os.listdir(process_path)
            if DEVEL:
                print("Retrieving file list for %s" % process)
                pbar = UI()
                pbar.progress(0)
                count = len(tar_names)
                done = 1
            for name in tar_names:
                entry = dir_pattern().format(process_path, name)
                # Skip sub-directories; expand each tar file's contents
                if not os.path.isdir(entry):
                    structure[site][process][name] = self.get_all_files_from_tar(name, process_path)
                if DEVEL:
                    pbar.progress(int((float(done) / float(count)) * 100))
                    done = done + 1
            if DEVEL:
                print("")
    return structure
def check_for_collisions(self):
    """ Check all unpacked files for file naming collisions.

    Marks missing files as deleted in self.files, then reports True if any
    duplicate-file collision is still unresolved (setting
    config['duplicates']), False otherwise.
    """
    print("Checking for file naming collisions...", end="")
    sys.stdout.flush()
    config = self.config
    f = Files(config, self.files)
    cwd = os.getcwd()
    collection = dir_pattern(3).format(config['stage'], config['job'], 'collection')
    os.chdir(collection)
    sites = os.listdir('.')
    for site in sites:
        os.chdir(site)
        instruments = set(os.listdir('.'))
        for ins in instruments:
            os.chdir(ins)
            files = set(os.listdir('.'))
            names = self.files[site][ins]
            # Mark files as deleted
            for k, v in names.items():
                if k not in files:
                    names[k]['deleted'] = True
            # Check for duplicates
            for k, v in names.items():
                if len(v['duplicate_files']) > 0 and v['deleted'] == False:
                    for i in v['duplicate_files']:
                        name = f.get_file_by_uuid(i)
                        if names[name]['uuid'] == i and names[name]['deleted'] == False:
                            config['duplicates'] = True
                            print("Fail")
                            print(
                                "Files with naming collisions still exist.\nPlease resolve these issues before continuing.\n"
                            )
                            # BUG FIX: restore the working directory before
                            # bailing out — previously only the success
                            # path chdir'd back to cwd.
                            os.chdir(cwd)
                            return True
            os.chdir('..')
        os.chdir('..')
    os.chdir(cwd)
    config['duplicates'] = False
    print("Done")
    sys.stdout.flush()
    return False
def bundle_raw_data(self, datastreams):
    """ Bundle the raw data in
    <job>/datastream/<site>/<site><instrument><facility>.00

    datastreams -- a datastream name or a list of them
                   (e.g. 'sgpmfrsrC1.00'); each is split into site,
                   instrument and facility for bundle_data.
    Returns True on success, False on any failure.
    """
    if isinstance(datastreams, str):
        datastreams = [datastreams]
    if not isinstance(datastreams, list):
        return False
    # Update env variables so bundle_data will tar the right files and put
    # the tar files in the right place
    p = Process(self.config, self.files)
    print("\nUpdating environment variables...", end="")
    if update_env(dir_pattern().format(self.config['stage'], self.config['job'])):
        print("Done")
    else:
        print("Failed")
        return False
    # Validate the bundle alias exists
    home = os.path.expanduser('~')
    db_file = dir_pattern().format(home, ".db_connect")
    p.setup_alias(db_file, 'bundle')
    # Run this process for each of the passed streams
    print("Bundling raw data...", end="")
    sys.stdout.flush()
    for stream in datastreams:
        # split the stream string to get the needed information:
        # scan backwards for the first non-digit to find where the
        # facility designator starts
        stream = stream.split('.')[0]
        for i, e in reversed(list(enumerate(stream))):
            if not is_number(e):
                fac = i
                break
        else:
            print("Failed: Could not separate facility from %s" % stream)
            return False
        s = stream[0:3]
        i = stream[3:fac]
        f = stream[fac:]
        # Build the command
        command = ['bundle_data', '-e', '-s', s, '-f', f, i]
        # Run the command
        try:
            ps = Popen(command, stdout=PIPE, stderr=PIPE)
            ps.communicate()
            returncode = ps.returncode
            if returncode != 0:
                print("Bad Return...", end="")
                print("Failed")
                return False
        except CalledProcessError as e:
            print("Called Process Error...", end="")
            print("Failed")
            return False
        # (removed a no-op `except Exception as e: raise e` clause —
        # re-raising unchanged is identical to not catching at all)
    print("Done")
    return True
def run(self):
    """ Run the ingest executable for this site/facility/instrument, then
    parse the ingest (and, if applicable, rename_raw) log files to build a
    mapping of old file name -> new file name.

    On success self.result holds {site: {sif: {old: new}}}; on failure
    self.result/self.error are set accordingly and the method returns early.
    """
    self.times['ingest']['start'] = datetime.now().replace(microsecond=0)
    self.command = []
    self.command.append(self.options['ingest'])
    self.command.append("-a")
    if not self.config['alias']:
        self.command.append('apm')
    else:
        self.command.append(self.config['alias'])
    # Add the site to the command
    self.command.append('-s')
    self.command.append(self.site)
    # Add the facility to the command
    self.command.append('-f')
    self.command.append(self.facility)
    # If multiple add -n and the instrument to the command
    if self.options['multiple']:
        self.command.append('-n')
        self.command.append(self.instrument)
    # Add the final option
    self.command.append('-R')
    # Add additional user specified flags.
    # BUG FIX: the original test was `type(self.flags == list)`, which
    # evaluates type(bool) — always truthy — so any non-None, non-list
    # value (e.g. a string) was iterated element by element.
    if self.flags is not None and isinstance(self.flags, list):
        for i in self.flags:
            self.command.append('-%s' % i)
    ps = Popen(self.command, stdout=PIPE, stderr=PIPE)
    (output, error) = ps.communicate()
    returncode = ps.returncode
    self.times['ingest']['end'] = datetime.now().replace(microsecond=0)
    if returncode != 0:
        self.error = "Error running ingest (%s)" % ' '.join(self.command)
        self.result = None
        return
    else:
        self.stdout = output
        self.stderr = error
    ##################################################
    # Parse the log file.
    ##################################################
    if not os.path.exists(self.logfile):
        self.error = "ERROR: Unable to find log file"
        return
    with open(self.logfile, 'r') as log:
        text = log.readlines()
    parse = False
    names = {}
    # Only parse log sections whose OPENED/CLOSED stamps fall inside this
    # ingest run's time window.
    for k, line in enumerate(text):
        if line.startswith('**** OPENED: '):
            timeformat = "**** OPENED: %Y-%m-%d %X\n"
            linedate = datetime.strptime(line, timeformat)
            if linedate >= self.times['ingest']['start'] and linedate <= self.times['ingest']['end']:
                parse = True
        elif parse and line.startswith('**** CLOSED: '):
            timeformat = "**** CLOSED: %Y-%m-%d %X\n"
            linedate = datetime.strptime(line, timeformat)
            if linedate >= self.times['ingest']['start'] and linedate <= self.times['ingest']['end']:
                parse = False
        elif parse and line.startswith("Renaming: "):
            # "Renaming: <old path>" is followed by " -> to: <new path>"
            old_path = line.replace('Renaming: ', '').replace("\n", '')
            new_path = text[k + 1].replace(' -> to: ', '').replace("\n", '')
            parts = old_path.split('/')
            site = parts[-3]
            sif = parts[-2]
            old_name = parts[-1]
            new_name = new_path.split('/')[-1]
            if site not in names:
                names[site] = {}
            if sif not in names[site]:
                names[site][sif] = {}
            names[site][sif][old_name] = new_name
    folder = dir_pattern(5).format(self.config['stage'], self.config['job'], 'collection', self.config['site'], '{}{}{}.00'.format(self.config['site'], self.config['instrument'], self.config['facility']))
    listdir = os.listdir(folder)
    if len(listdir) > 0:
        ##################################################
        # Run Rename Raw
        ##################################################
        if self.config['instrument'] in self.get_rename_raw_process_list():
            self.times['rename']['start'] = datetime.now().replace(microsecond=0)
            command = ['%s/rename_raw' % binpath, '-s', self.config['site'], '-f', self.config['facility'], self.config['instrument']]
            ps = Popen(command, stdout=PIPE, stderr=PIPE)
            (output, error) = ps.communicate()
            returncode = ps.returncode
            self.times['rename']['end'] = datetime.now().replace(microsecond=0)
            if returncode != 0:
                self.result = names
                self.error = error
                return
            ##################################################
            # Parse rename_raw log file
            ##################################################
            if not os.path.exists(self.renamelog):
                self.result = names
                self.error = "renamelog does not exist"
                return
            with open(self.renamelog, 'r') as lf:
                logs = lf.readlines()
            parse = False
            # Same windowing approach as above, against the rename run times
            for k, line in enumerate(logs):
                if line.startswith("****OPEN"):
                    i = k + 1
                    timeline = logs[i]
                    timeformat = "Time: %a %b %d %X %Y\n"
                    opentime = datetime.strptime(timeline, timeformat)
                    if opentime >= self.times['rename']['start'] and opentime <= self.times['rename']['end']:
                        parse = True
                elif parse and line.startswith("Close time: "):
                    i = k
                    timeformat = "Close time: %a %b %d %X %Y\n"
                    closetime = datetime.strptime(line, timeformat)
                    if closetime >= self.times['rename']['start'] and closetime <= self.times['rename']['end']:
                        parse = False
                elif parse and line.startswith("Renamed: "):
                    old_path = line.replace("Renamed: ", "").replace("\n", '').split(' (')[0]
                    new_path = logs[k + 1].replace(" -> ", '').replace("\n", '')
                    parts = old_path.split('/')
                    site = parts[-3]
                    sif = parts[-2]
                    old_name = parts[-1]
                    new_name = new_path.split('/')[-1]
                    if site not in names:
                        names[site] = {}
                    if sif not in names[site]:
                        names[site][sif] = {}
                    names[site][sif][old_name] = new_name
            ##################################################
            # Check for additional files
            ##################################################
            listdir = os.listdir(folder)
            if len(listdir) > 0:
                self.result = names
                self.error = "rename_raw did not move all the files in {}".format(folder)
                return
    self.result = names
    return
def handle_duplicate_files(self):
    """ Resolve files that were duplicated across tar files.

    For each original file `i` in self.duplicates, compare it against its
    copies sitting in `dup_N/` sub-directories of the stage path:
    byte-identical copies are removed; differing copies are kept side by
    side as versioned names (`<name>.v1`, `<name>.v2`, ...).  Returns the
    mapping of still-conflicting files (keyed by local path) or False when
    every duplicate was identical.
    """
    # Handle duplicates
    f = Files(self.config)
    dup_list = {}
    duplicates = {}
    # NOTE(review): `files` is assigned but never read in this method —
    # confirm it can be removed.
    files = self.file_names
    dups = self.duplicates
    if len(dups) > 0:
        for i, n in dups.items():
            # n is the list of duplicate copies of file `i`; copy j lives
            # in sub-directory dup_<j+1>
            for j, v in enumerate(n):
                folder = 'dup_{}'.format(j + 1)
                delete = False
                move = False
                # Identical copy -> safe to delete; different -> must keep
                if f.is_same_file(
                        dir_pattern().format(self.stage_path, i),
                        dir_pattern(3).format(self.stage_path, folder, v)):
                    delete = True
                    move = False
                else:
                    delete = False
                    move = True
                if delete:
                    os.remove(
                        dir_pattern(3).format(self.stage_path, folder, v))
                elif move:
                    # First conflicting copy: rename the original to .v1
                    if i not in dup_list:
                        name = '{}.v1'.format(i)
                        dup_list[i] = [name]
                        src = dir_pattern().format(self.stage_path, i)
                        dst = dir_pattern().format(self.stage_path, name)
                        # os.rename fails across filesystems; fall back to
                        # shutil.move
                        try:
                            os.rename(src, dst)
                        except OSError:
                            shutil.move(src, dst)
                    # Move this copy in as the next .vN version
                    num = len(dup_list[i]) + 1
                    name = '{}.v{}'.format(v, num)
                    dup_list[i].append(name)
                    src = dir_pattern(3).format(self.stage_path, folder, v)
                    dst = dir_pattern().format(self.stage_path, name)
                    try:
                        os.rename(src, dst)
                    except OSError:
                        shutil.move(src, dst)
    # Report every file that ended up with more than one version
    for i in dup_list:
        if len(dup_list[i]) > 1:
            key = dir_pattern().format(self.local, i)
            duplicates[key] = []
            for j in dup_list[i]:
                duplicates[key].append(dir_pattern().format(
                    self.local, j))
    self.dups = duplicates
    # Delete directory if now empty
    dupdirs = glob('{}/dup_*'.format(self.stage_path))
    for i in dupdirs:
        f.empty_dir(i)
        os.rmdir(i)
    return False if duplicates == {} else duplicates
def find_ingest_exec(self, process):
    """ Find the appropriate ingest executable.

    Scans the binaries directory for '*_ingest' executables, parses each
    one's help text for "VALID PROCESS NAMES", and returns
    (absolute_path, multiple_flag) for `process`, or (None, None) when no
    ingest supports it.  `multiple` indicates the ingest needs the -n
    option.
    """
    # Does the ingest run for multiple processes and require the -n option
    multiple = False
    cwd = os.getcwd()
    base_path = binpath
    os.chdir(base_path)
    executable = {}
    # Get a list of ingest executables
    ingest = glob.glob('*_ingest')
    skip = [
        # 'xsapr_ingest',
        # 'wacrspec_ingest',
        # 'wacr_ingest',
        # 'mwacrspec_ingest',
    ]
    # Loop over the executables and get the process names for each
    for i in ingest:
        multiple = False
        if (i not in skip
            ):  # Remove this line for productions, This suppresses an error on Copper
            # Get the help text
            help_text = ""
            try:
                ps = Popen([i, '-h'], stdout=PIPE, stderr=PIPE)
                (output, error) = ps.communicate()
                returncode = ps.returncode
                help_text = output
            except CalledProcessError as e:
                # NOTE(review): `output`/`error` are unbound if
                # communicate() raised — this handler looks dead in
                # practice (Popen does not raise CalledProcessError).
                help_text = ""
                status = e.returncode
                if DEVEL:
                    print("\nCALLED PROCESS ERROR: GET HELP TEXT\n")
            # Check for process names
            help_text = help_text.split("VALID PROCESS NAMES")
            # If process names exist
            if len(help_text) == 2:
                names = help_text[1].strip()
                names = names.split('\n')
                # Add each of the valid process names to the dict
                for n in names:
                    n = n.strip()
                    executable[n] = {
                        'executable': i,
                        'multiple': True
                    }
            elif len(help_text) < 2:
                name = i.split('_')
                executable[name[0]] = {
                    'executable': i,
                    'multiple': False
                }
    # BUG FIX: restore the caller's working directory — previously this
    # method left the process chdir'd into the binaries directory.
    os.chdir(cwd)
    if process in executable:
        return dir_pattern().format(base_path, executable[process]['executable']), executable[process]['multiple']
    else:
        return None, None
def run(self):
    """ Run the archive portion of the cleanup phase.

    Phases (each gated by a flag in config['cleanup_status']['archive']):
      1. verify every file on the deletion list is gone from the archive,
      2. move files that must not be (re)archived into no_archive/
         sub-directories,
      3. refresh env vars and run release_data for each data path.
    Returns (config, files); sets config['exit'] = True on any failure.
    """
    if not self.config['cleanup_status']['remove']['status']:
        print(self.config['cleanup_status']['remove']['status'])
        print('')
        print(
            "Data files must be requested for deletion before the files can be archived."
        )
        self.config['exit'] = True
        return self.config, self.files
    # Setup vars
    stage = self.config['stage']
    job = self.config['job']
    ############################################################
    # Check to see if the current user is `dsmgr`
    ############################################################
    # Verify current user is authenticated to run this command
    if not self.authenticate():
        self.config['exit'] = True
        return self.config, self.files
    # Do this if the files have not yet been verified as deleted from the archive
    if not self.config['cleanup_status']['archive']['files_deleted']:
        print("Verifying all files have been deleted from the archive...",
              end="")
        ############################################################
        # Update the local archive database
        ############################################################
        # Setup the datastreams to update (every real, non-symlink
        # directory under <stage>/<job>/datastream/<site>/)
        datastreams = []
        datastream_path = dir_pattern(3).format(stage, job, 'datastream')
        for site in os.listdir(datastream_path):
            path = dir_pattern().format(datastream_path, site)
            for folder in os.listdir(path):
                abs_folder = dir_pattern().format(path, folder)
                if os.path.isdir(
                        abs_folder) and not os.path.islink(abs_folder):
                    datastreams.append(folder)
        # Update the local copy of the archive db
        if not DEVEL:
            update_archive(datastreams)
        ############################################################
        # Load the list of files to be removed from the archive
        ############################################################
        deleted_files = []
        deletion_file = dir_pattern(3).format(stage, job,
                                              "%s.deletion-list.txt" % job)
        if not os.path.exists(deletion_file):
            print("Failed")
            print(
                "Deletion list does not exist. Please create it and try again."
            )
            self.config['exit'] = True
            return self.config, self.files
        fp = open(deletion_file, 'r')
        deletion_text = fp.readlines()
        fp.close()
        # Each line is "<name>.v<version> <md5>"
        for line in deletion_text:
            if line.endswith("\r\n"):
                line = line[:-2]
            tar = {}
            parts, tar['md5'] = line.split(' ')
            parts = parts.split('.')
            tar['version'] = parts[-1][1:]
            tar['name'] = '.'.join(parts[:-1])
            deleted_files.append(tar)
            del tar, parts
        if 'line' in locals():
            del line
        ############################################################
        # Verify all files have been removed from the archive
        ############################################################
        # Get a list of files that are currently at the archive
        archive_files = {}
        db_file = '/apps/ds/conf/datainv/.db_connect'
        alias = 'inv_read'
        db = DB(self.config, db_file=db_file, alias=alias)
        # Store the query
        query = "SELECT * FROM get_remote_files_by_tag('%s') WHERE file_stamp >= %d AND file_stamp <= %d AND file_active = true ORDER BY file_stamp, file_version;"
        # List the column names so the values can be mapped in a dictionary
        cols = [
            'file_tag', 'file_name', 'file_version', 'file_size',
            'file_stored', 'file_md5', 'file_stamp', 'file_checked',
            'file_active'
        ]
        # convert the start and end dates to a unix timestamp
        start = convert_date_to_timestamp(self.config['begin'])
        end = convert_date_to_timestamp(self.config['end'])
        # Query the database for each of the datastreams
        for k, v in enumerate(datastreams):
            args = (v, start, end)
            result = db.query(query % args, columns=cols)
            if len(result) > 0:
                archive_files[v] = result
            else:
                print("Failed")
                print("No results for %s" % v)
        # Store the list of what is currently in the archive and their versions to file
        current_archive = dir_pattern(3).format(stage, job,
                                                'current_archive.json')
        fp = open(current_archive, 'w')
        fp.write(
            json.dumps(archive_files,
                       indent=2,
                       sort_keys=False,
                       separators=(',', ': ')))
        fp.close()
        del fp
        # In DEVEL mode, substitute a canned archive listing if present
        if DEVEL:
            file_path = dir_pattern(3).format(stage, job,
                                              '%s.archive.json' % job)
            if os.path.exists(file_path):
                fp = open(file_path, 'r')
                archive_files = json.loads(fp.read())
                fp.close()
                del fp, file_path
        # Check to see if any of the "deleted_files" are in the list
        # If yes, quit
        # If no, proceed
        all_files_deleted = None
        if len(deleted_files) > 0:
            # Check the list of files from the archive to see if the current file has been deleted
            for f in deleted_files:
                # process tag is the first two dotted components of the name
                process = '.'.join(f['name'].split('.')[0:2])
                name = f['name']
                if any(d['file_name'] == name
                       for d in archive_files[process]):
                    all_files_deleted = False
                    print("Failed")
                    print(
                        "Not all files have been deleted from the archive."
                    )
                    print("Please try again later.")
                    self.config['exit'] = True
                    return self.config, self.files
                else:
                    all_files_deleted = True
        else:
            all_files_deleted = True
        if 'f' in locals():
            del f
        if 'process' in locals():
            del process
        if all_files_deleted != True:
            print("Failed")
            print("Not all files have been removed from the archive.")
            print(
                "Run this again once all files have been removed from the archive."
            )
            self.config['exit'] = True
            return self.config, self.files
        # Files have been deleted
        self.config['cleanup_status']['archive']['files_deleted'] = True
        print("Done")
    ############################################################
    # Move any files not being archived to subdirectories
    #
    # Processed files:
    #   This includes any processed files outside the
    #   date range specified
    # Raw/Tar files:
    #   This includes any files that do not need to be rearchived
    ############################################################
    if not self.config['cleanup_status']['archive']['move_files']:
        print("Moving files that should not be archived...", end="")
        cwd = os.getcwd()
        datastream = dir_pattern(3).format(stage, job, 'datastream')
        # Load the list of tar files that need to be archived
        os.chdir(dir_pattern().format(stage, job))
        fp = open('archive.json', 'r')
        contents = json.loads(fp.read())
        fp.close()
        # Index the to-archive tar names by site then instrument
        tar_archive = {}
        for k, v in enumerate(contents):
            s = v['site']
            p = v['instrument']
            if s not in tar_archive:
                tar_archive[s] = {}
            if p not in tar_archive[s]:
                tar_archive[s][p] = []
            tar_archive[s][p].append(v['file_name'])
        if len(contents) > 0:
            del s, p, k, v
        os.chdir(datastream)
        sites = os.listdir(datastream)
        for i, s in enumerate(sites):
            os.chdir(s)
            processes = os.listdir('.')
            for j, p in enumerate(processes):
                no_archive = dir_pattern(4).format(datastream, s, p,
                                                   'no_archive')
                os.chdir(p)
                if p.split('.')[-1] == '00':
                    # This is a raw datastream
                    # Don't include directories
                    # Get a list of non-tar files from the raw datastreams
                    # Move all of these files to a sub-directory
                    rawfiles = [
                        x for x in os.listdir('.') if not x.endswith('tar')
                        if not os.path.isdir(x)
                    ]
                    # Get a list of all tar files from the raw datastreams
                    # Retrieve the list of tar files that need to be archived
                    # Move all of the files not in the list to a sub-directory
                    tarfiles = [
                        x for x in glob("*.tar") if not os.path.isdir(x)
                    ]
                    for x in rawfiles:
                        if not os.path.exists(no_archive):
                            os.mkdir(no_archive)
                        elif not os.path.isdir(no_archive):
                            print("Failed")
                            # NOTE(review): this prints a literal '%s' —
                            # no format operand is supplied; confirm the
                            # intended path argument.
                            print(
                                "There is a file called 'no_archive' in %s."
                            )
                            print(
                                "This file must be removed before proceeding."
                            )
                            self.config['exit'] = True
                            return self.config, self.files
                        src = dir_pattern(4).format(datastream, s, p, x)
                        # os.rename fails across filesystems; fall back
                        try:
                            os.rename(src, no_archive)
                        except OSError:
                            shutil.move(src, no_archive)
                    for x in tarfiles:
                        if not os.path.exists(no_archive):
                            os.mkdir(no_archive)
                        elif not os.path.isdir(no_archive):
                            print("Failed")
                            print(
                                "There is a file called 'no_archive' in %s."
                            )
                            print(
                                "This file must be removed before proceeding."
                            )
                            self.config['exit'] = True
                            return self.config, self.files
                        # Keep only tars listed for archiving
                        if s not in tar_archive or p not in tar_archive[
                                s] or x not in tar_archive[s][p]:
                            src = dir_pattern(4).format(
                                datastream, s, p, x)
                            try:
                                os.rename(src, no_archive)
                            except OSError:
                                shutil.move(src, no_archive)
                else:
                    # For each processed datastream
                    # Get a list of all the files
                    # Move any files that fall outside the specified date range to a sub-directory
                    if not os.path.exists(no_archive):
                        os.mkdir(no_archive)
                    elif not os.path.isdir(no_archive):
                        print("Failed")
                        print("There is a file called 'no_archive' in %s.")
                        print(
                            "This file must be removed before proceeding.")
                        self.config['exit'] = True
                        return self.config, self.files
                    # Don't include directories
                    files = [
                        x for x in os.listdir('.') if not os.path.isdir(x)
                    ]
                    timeformat = "%Y%m%d"
                    begin = datetime.strptime(str(self.config['begin']),
                                              timeformat)
                    end = datetime.strptime(str(self.config['end']),
                                            timeformat)
                    for x in files:
                        # Third dotted component of the name is the date
                        date = x.split('.')[2]
                        filedate = datetime.strptime(date, timeformat)
                        if not (filedate >= begin and filedate <= end):
                            src = dir_pattern(4).format(
                                datastream, s, p, x)
                            try:
                                os.rename(src, no_archive)
                            except OSError:
                                shutil.move(src, no_archive)
                os.chdir('..')
            os.chdir('..')
        os.chdir(cwd)
        print("Done")
        self.config['cleanup_status']['archive']['move_files'] = True
    ############################################################
    # Read environment variables
    ############################################################
    print("Updating environment variables...", end="")
    env_path = dir_pattern().format(stage, job)
    if not update_env(env_path):
        f = Files(self.config)
        shell = f.get_shell()
        if shell == "bash":
            ext = 'sh'
        else:
            ext = 'csh'
        print("Failed")
        exit("Error: Unable to locate env.%s." % ext)
    print("Done")  # Updating Env Vars
    ############################################################
    # Ensure `DBCONNECT_PATH` does not point to job `.db_connect` file
    ############################################################
    if 'DBCONNECT_PATH' in os.environ:
        del os.environ['DBCONNECT_PATH']
    # The command should be complete up to this point,
    # however I'm waiting on a response to verify the exact name
    # of this environment variable
    ############################################################
    # Run `release_data`
    ############################################################
    print("Running release_data...", end="")
    #############################################
    # Need to change this so it supports both
    # `sif` data and `datastream` data
    #############################################
    db = DB(self.config)
    data_paths = db.get_data_paths()
    commands = []
    for d in data_paths:
        output = d['output']
        (site, temp) = output.split('/')
        # Strip the extension and the 3-char site prefix
        temp = temp.split('.')[0][3:]
        # Scan backwards for the first non-digit: facility starts there
        for i, e in reversed(list(enumerate(temp))):
            if not is_number(e):
                fac = i
                break
        else:
            print("Could not separate facility from %s" % temp)
            self.config['exit'] = True
            return self.config, self.files
        facility = temp[fac:]
        process = temp[:fac]
        command = ['release_data', '-s', site, '-f', facility, process]
        # Check to see if a plugin needs to modify the command
        command = self.manager.callPluginCommand(
            'hook_release_data_command_alter', command)
        commands.append(command)
    # code to run a shell command copied from other part of APM
    # Needs modified to work here
    # Run the command
    for command in commands:
        try:
            if not DEVEL:
                ps = Popen(command, stdout=PIPE, stderr=PIPE)
                ps.communicate()
                returncode = ps.returncode
                if returncode != 0:
                    print("Failed")
                    self.config['exit'] = True
                    return self.config, self.files
        except CalledProcessError as e:
            print("Failed")
            self.config['exit'] = True
            return self.config, self.files
        except Exception as e:
            raise e
    print("Done")
    # Files have been released
    self.config['cleanup_status']['archive']['files_released'] = True
    # Archive is complete
    self.config['cleanup_status']['archive']['status'] = True
    return self.config, self.files
def run(self):
    """Run the staging phase.

    For ingest jobs: verifies the collection directory is empty, locates and
    unpacks the matching tar bundles from the archive source into
    collection/ and file_comparison/, records per-file tracking metadata
    (uuid, original/current names, duplicates) in self.files, and copies the
    site/instrument config from /data/conf into the job's conf directory.
    For vap jobs: only saves the environment and registers vapmgr.

    Returns (config, files) — config may carry 'exit'/'duplicates' flags.
    """
    config = self.config
    manager = self.manager
    if config['ingest']:
        # If staging for Ingest
        # Make sure collection does not have any files that might get overwritten
        empty = self.check_collection_empty()
        if not empty:
            print(
                "\nFiles currently exist in your collection directory.\nPlease empty {}/{}/collection and try again.\n"
                .format(config['stage'], config['job']))
            config['exit'] = True
            return config, self.files
        # cd to the stage directory
        os.chdir(config['stage'])
        # Check to see if a plugin needs to modify the datastream
        temp = manager.callPluginCommand('hook_datastream_alter',
                                         {'config': config})
        config = temp if temp != None else config
        # Check to see if a plugin needs to modify the SIF data
        temp = manager.callPluginCommand('hook_sif_alter', {'config': config})
        config = temp if temp != None else config
        # Establish a database connection
        db = DB(config)
        # Get the data_paths
        data_paths = db.get_data_paths()
        # Check to see if a plugin needs to modify the data_paths
        temp = manager.callPluginCommand('hook_data_paths_alter', {
            'config': config,
            'data_paths': data_paths
        })
        data_paths = temp if temp != None else data_paths
        # For each instrument (data path pair: archive output / stage input)
        for k, v in enumerate(data_paths):
            archive_path = v['output']
            stage_path = v['input']
            # Set tar_path and check for plugin modifications
            tar_path = '{}/{}'.format(config['source'], archive_path)
            temp = manager.callPluginCommand('hook_tar_path_alter', {
                'config': config,
                'tar_path': tar_path
            })
            tar_path = temp if temp != None else tar_path
            if os.path.exists(tar_path):
                # Get a list of tar files that match specified dates
                tar = UnPack(config, archive_path, stage_path)
                tar_files = tar.get_tar_files()
                temp = manager.callPluginCommand('hook_tar_files_alter',
                                                 {'config': config})
                tar_files = temp if temp != None else tar_files
                if tar_files and len(tar_files) > 0:
                    # compare_path = '{}/{}/.compare/{}'.format(config['stage'], config['job'], stage_path)
                    compare_path = dir_pattern(5).format(
                        config['stage'], config['job'],
                        'file_comparison', 'raw', stage_path)
                    tar_backup = dir_pattern(5).format(
                        config['stage'], config['job'], 'file_comparison',
                        'tar', stage_path)
                    collection_path = '{}/{}/collection/{}'.format(
                        config['stage'], config['job'], stage_path)
                    # Make the above paths if they don't already exist
                    if not os.path.exists(compare_path):
                        os.makedirs(compare_path)
                    if not os.path.exists(tar_backup):
                        os.makedirs(tar_backup)
                    if not os.path.exists(collection_path):
                        os.makedirs(collection_path)
                    # Copy the tar files to the backup location
                    if not tar.copy_files(tar_files, tar_backup):
                        print("Unable to copy tar files")
                    # Unpack the tar files
                    tar.extract_tar_files(tar_files)
                    has_dups = tar.handle_duplicate_files()
                    if has_dups:
                        config['duplicates'] = True
                        # NOTE(review): 'duplicates' is never initialized in
                        # this method — presumably a module-level dict; verify,
                        # otherwise this raises NameError on first duplicate.
                        for i in has_dups:
                            duplicates[i] = has_dups[i]
                else:
                    temp = tar_path.split('/')
                    if not config['quiet']:
                        print(
                            '\nData not available for {} using the dates specified'
                            .format(temp[-1]))
            else:
                temp = tar_path.split('/')
                if not config['quiet']:
                    print('\nData for {} does not exist.'.format(temp[-1]))
            # Build the tracking structure self.files[site][process][file]
            site, process = stage_path.split('/')
            if self.files == None:
                self.files = {}
            if site not in self.files:
                self.files[site] = {}
            # NOTE: 'site' / 'process' are rebound below to the nested dicts,
            # shadowing the name strings split from stage_path.
            site = self.files[site]
            if process not in site:
                site[process] = {}
            process = site[process]
            if os.path.exists(
                    dir_pattern(4).format(self.config['stage'],
                                          self.config['job'], 'collection',
                                          stage_path)):
                files = os.listdir(
                    dir_pattern(4).format(self.config['stage'],
                                          self.config['job'], 'collection',
                                          stage_path))
                dup_uuid = {}
                for i in files:
                    original_name = i
                    temp = i.split('.')
                    # Names ending in '.v<N>' are unpacked duplicates; strip
                    # the version suffix to recover the original name.
                    if temp[-1][0] == 'v':
                        try:
                            int(temp[-1][1:])
                            original_name = '.'.join(temp[:-1])
                        except:
                            pass
                    process[i] = {
                        "uuid": str(uuid.uuid4()),
                        "current_name": i,
                        "original_name": original_name,
                        "stripped_name": None,
                        "processed_name": None,
                        "unpacked_name": i,
                        "duplicate_files": [],
                        "deleted": False,
                    }
                    if original_name != i:
                        dup_uuid[i] = process[i]['uuid']
                # Cross-link each duplicate with the uuids of its siblings
                for i in duplicates:
                    if i.startswith(data_paths[k]['input']):
                        for j in duplicates[i]:
                            # NOTE(review): this rebinds the loop-scope names
                            # 'site' and 'process' once more.
                            site, process, name = j.split('/')
                            for l in duplicates[i]:
                                temp = l.split('/')
                                if j != l:
                                    self.files[site][process][name][
                                        'duplicate_files'].append(
                                            dup_uuid[temp[2]])
        # Copy the config files from /data/conf to /<stage>/<job>/conf
        conf_path = "/data/conf/{0}/{0}{1}{2}".format(
            self.config['site'], self.config['instrument'],
            self.config['facility'])
        conf_dest = "{0}/{1}/conf/{2}".format(self.config['stage'],
                                              self.config['job'],
                                              self.config['site'])
        dest_folder = "{}{}{}".format(self.config['site'],
                                      self.config['instrument'],
                                      self.config['facility'])
        # Fall back to the <instrument><facility> layout when the
        # <site><instrument><facility> folder does not exist
        if not os.path.exists(conf_path):
            conf_path = "/data/conf/{0}/{1}{2}".format(
                self.config['site'], self.config['instrument'],
                self.config['facility'])
            conf_dest = "{0}/{1}/conf/{2}".format(self.config['stage'],
                                                  self.config['job'],
                                                  self.config['site'])
            dest_folder = "{}{}".format(self.config['instrument'],
                                        self.config['facility'])
        if os.path.exists(conf_path):
            if not os.path.exists(conf_dest):
                os.makedirs(conf_dest)
            # copytree requires the destination not to exist; remove an empty
            # leftover folder, but refuse to clobber a non-empty one
            if os.path.exists(dir_pattern().format(conf_dest, dest_folder)):
                try:
                    os.rmdir(dir_pattern().format(conf_dest, dest_folder))
                except OSError as e:
                    if e.errno == errno.ENOTEMPTY:
                        exit(
                            "Unable to copy config files to {}. Destination is not empty."
                            .format(dir_pattern().format(
                                conf_dest, dest_folder)))
                    else:
                        raise e
            shutil.copytree(conf_path,
                            dir_pattern().format(conf_dest, dest_folder))
        # Snapshot collection/ into file_comparison/raw for later comparison
        f = Files(self.config)
        src = dir_pattern(3).format(config['stage'], config['job'],
                                    'collection')
        # dst = dir_pattern(3).format(config['stage'], config['job'], '.compare')
        dst = dir_pattern(4).format(config['stage'], config['job'],
                                    'file_comparison', 'raw')
        if os.path.exists(dst):
            f.empty_dir(dst)
            os.rmdir(dst)
        shutil.copytree(src, dst)
        # Tell the user about any name collisions so they can resolve them
        if len(duplicates) > 0:
            print('')
            print(
                'The following files had naming collisions when unpacked.\nPlease verify the contents and keep only the appropriate file(s).'
            )
            print(
                'Please do not rename files, simply delete any unwanted files.'
            )
            for i in duplicates:
                print('')
                for j in duplicates[i]:
                    print(j)
            print('')
        f.save_env()
    elif config['vap']:
        # VAP jobs only need the environment saved and vapmgr registered
        f = Files(self.config)
        f.save_env()
        vap = VapMgr(self.config)
        vap.add_to_env()
    return config, self.files
def run(self):
    """Unpack this thread's tar file and record duplicate file names.

    Reads the shared tracking lists from the owning UnPack instance
    (self.tar.st_files / self.tar.file_names), sorts each tar member into
    the first "slot" that does not already contain it — members whose name
    collides in every slot start a new parallel slot, which is extracted
    into its own 'dup_N' directory — then extracts every slot and stores
    the resulting {first-seen name: [colliding names]} map on
    self.tar.duplicates.
    """
    # Setup Vars — shared state owned by the UnPack instance
    st_files = self.tar.st_files      # stripped (canonical) names per slot
    file_names = self.tar.file_names  # raw member names per slot
    # files = self.tar.members
    # One member list per existing slot, index-aligned with st_files
    files = []
    for i in range(len(st_files)):
        files.append([])
    # Open the tar file
    tar = tarfile.open(
        dir_pattern().format(self.tar.archive_path, self.file), 'r')
    try:
        # Get the content of the tar file and check for duplicate file names
        members = tar.getmembers()
        f = Files(self.config)
        # Iterate over each tar member
        for i, m in enumerate(members):
            # Make sure the parallel arrays are not 0 length
            if len(file_names) == 0:
                file_names.append([])
            if len(files) == 0:
                files.append([])
            if len(st_files) == 0:
                st_files.append([])
            # Iterate over each slot and add the member to the first slot
            # that doesn't already hold its (raw or stripped) name
            for k, v in enumerate(file_names):
                sf_names = st_files[k]
                sn = f.strip_name(m.name)
                if sn == None or sn == 'orig' or sn == 'bad':
                    sn = m.name
                if not (m.name in v or sn in sf_names):
                    file_names[k].append(m.name)
                    files[k].append(m)
                    st_files[k].append(sn)
                    break
            else:
                # Name collided in every existing slot: start a new slot
                file_names.append([m.name])
                files.append([m])
                st_files.append([sn])
        # Build the duplicates map: slot-0 name -> colliding names
        duplicates = {}
        stripped = st_files[0]
        full_names = file_names[0]
        for i in range(1, len(file_names)):
            for k, v in enumerate(file_names[i]):
                # BUGFIX: list.index raises ValueError (not IndexError), and
                # the original fell through with unbound myIndex/key after a
                # failed lookup (NameError / stale values). Skip unmatched
                # entries instead, keeping the diagnostic output.
                try:
                    myIndex = stripped.index(st_files[i][k])
                    key = full_names[myIndex]
                except (ValueError, IndexError):
                    print("\nOOPS\n")
                    print("\nI: {}\nK: {}".format(i, k))
                    continue
                if key not in duplicates:
                    duplicates[key] = []
                duplicates[key].append(v)
        # Extract all slots: slot 0 to the normal location, each collision
        # slot to its own dup_N directory
        for i in range(len(files)):
            path = 'dup_{}'.format(i) if i > 0 else ''
            tar.extractall(path=path, members=files[i])
    finally:
        # BUGFIX: close the tar handle even if extraction/parsing raises
        tar.close()
    self.tar.duplicates = duplicates
    return
def run(self):
    """Run the remove portion of the cleanup phase.

    Builds the <job>.deletion-list.txt of archive files to be deleted:
    compares ingested raw files against the originally-unpacked copies,
    re-bundles changed raw data, works out which archive tar bundles must
    be removed/added, writes archive.json, writes the deletion list (CRLF
    line endings) and emails it. Requires the review step to be complete.

    Returns (config, files); config['exit'] is set on failure.
    """
    self.start_time = datetime.now()
    if not self.config['cleanup_status']['review']['status']:
        print(
            "\nData must be reviewed before it can be removed from the archive."
        )
        self.config['exit'] = True
        return self.config, self.files
    stage = self.config['stage']
    job = self.config['job']
    del_file = '%s.deletion-list.txt' % job
    job_folder = dir_pattern().format(stage, job)
    exists = False
    replace = False
    # Check to see if deletion file exists
    if os.path.exists(dir_pattern().format(job_folder, del_file)):
        exists = True
        ui = UI()
        replace = ui.yn_choice(
            '%s already exists.\n Would you like to overwrite this file?' %
            del_file, 'n')
    if exists and not replace:
        return self.config, self.files
    # Either file doesn't exist or user has chosen to overwrite it
    # Create <job>.deletion-list.txt file
    # Reset statuses for this run
    for k in self.config['cleanup_status']['remove']:
        self.config['cleanup_status']['remove'][k] = False
    contents = []
    ##################################################
    # Get list of files from datastream folder
    ##################################################
    datastreams = []
    datastream_path = dir_pattern(3).format(stage, job, 'datastream')
    for site in os.listdir(datastream_path):
        path = dir_pattern().format(datastream_path, site)
        for folder in os.listdir(path):
            abs_folder = dir_pattern().format(path, folder)
            if os.path.isdir(
                    abs_folder) and not os.path.islink(abs_folder):
                datastreams.append(folder)
    # Processed files
    # NOTE(review): 'site' here is whatever the loop above left it as —
    # with multiple sites this lists the wrong directory; confirm intent.
    p_files = {}
    for k, v in enumerate(datastreams):
        if v not in p_files:
            p_files[v] = []
        p_files[v] = os.listdir(
            dir_pattern(3).format(datastream_path, site, v))
    ##################################################
    # Update the local copy of the archive db
    ##################################################
    # print("\nUpdating list of files stored at the archive..."
    # if not DEVEL:
    #     update_archive(datastreams)
    # print("Done"
    ##################################################
    # Get list of files from archive db
    ##################################################
    print("\nRetrieving list of relevant files stored at the archive...",
          end="")
    # Connect to the database
    archive_files = {}
    db_file = '/apps/ds/conf/datainv/.db_connect'
    alias = 'inv_read'
    if not os.path.exists(db_file):
        print(
            "\nUnable to connect to the archive database. Please try again later."
        )
        self.config['exit'] = True
        return self.config, self.files
    db = DB(self.config, db_file=db_file, alias=alias)
    # Store the query
    query = "SELECT * FROM get_remote_files_by_tag('%s') WHERE file_stamp >= %d AND file_stamp <= %d AND file_active = true ORDER BY file_stamp, file_version;"
    # List the column names so the values can be mapped in a dictionary
    cols = [
        'file_tag', 'file_name', 'file_version', 'file_size',
        'file_stored', 'file_md5', 'file_stamp', 'file_checked',
        'file_active'
    ]
    # convert the start and end dates to a unix timestamp
    start = convert_date_to_timestamp(self.config['begin'])
    end = convert_date_to_timestamp(self.config['end'])
    # Query the database for each of the datastreams
    for k, v in enumerate(datastreams):
        args = (v, start, end)
        result = db.query(query % args, columns=cols)
        if len(result) > 0:
            archive_files[v] = result
        else:
            print("\nNo results for %s" % v)
    # Unset loop variables
    if len(datastreams) > 0:
        del k, v, args, result
    print("Done")
    print("Map original tar bundle structure...", end="")
    self.maps['orig']['tar'] = self.get_tar_structure(
        dir_pattern(3).format(stage, job, "file_comparison/tar"))
    print("Done")
    if self.config['ingest']:
        # Add files to the list that should be removed from the archive
        print("\nGenerating list of files to remove from the archive...")
        sys.stdout.flush()
        ##################################################
        # Compare raw files to see if they changed
        ##################################################
        # Setup Variables for the following code to use
        # file_history: files as they traveled from tar file through the
        # ingest, mapped by their current name
        file_history = self.files
        # raw_streams: the datastreams that contain the raw files
        # (ex. sgpmfrsrC1.00)
        raw_streams = []
        # Setup the paths for the ingested and untarred raw files
        new_folder = dir_pattern(3).format(stage, job, 'datastream')
        old_folder = dir_pattern(3).format(stage, job,
                                           'file_comparison/raw')
        # raw_files: mapping of raw files in the <job>/datastream folder
        raw_files = {}
        # archive_tars: list of tar files at the archive, per stream
        archive_tars = {}
        # bundle_data: does the raw data in "datastream" need re-bundling
        bundle_data = False
        # Get a list of the sites in "datastream"
        for site in os.listdir(new_folder):
            raw_files[site] = {}
        # Establish a structure for the raw files in "datastream"
        # This structure follows the same pattern as 'file_history'
        for site in raw_files:
            for instrument in glob(
                    dir_pattern(3).format(new_folder, site, '*.00')):
                instrument = instrument.split('/')[-1]
                raw_files[site][instrument] = {}
                raw_streams.append(instrument)
                for f in os.listdir(
                        dir_pattern(3).format(new_folder, site,
                                              instrument)):
                    raw_files[site][instrument][f] = {}
        # Compare all of the existing files
        # By comparing existing files instead of files that were unpacked
        # we make sure to include all files and can check for files that
        # are not being tracked (this should never happen)
        c = Files(self.config)
        for i, s in raw_files.items():  # i = key, s = site
            for j, p in s.items():  # j = key, p = process/instrument
                pbar = UI()
                percent = 0
                pbar.progress(percent)
                count = len(p)
                l = 1
                for k, f in p.items():  # k = key, f = file
                    # Compare the file in 'datastream' with its
                    # counterpart in 'file_comparison/raw'
                    if k not in file_history[i][
                            j]:  # This if statement should never evaluate "True"
                        # File is not being tracked
                        # Raw files in datastream need to be rebundled
                        bundle_data = True
                        # Tar file with this raw file needs to be added to
                        # the archive. Make sure the site is in the dict
                        if i not in self.archive['add']['raw']:
                            self.archive['add']['raw'][i] = {j: {}}
                        # Make sure the process is in the dict
                        if j not in self.archive['add']['raw'][i]:
                            self.archive['add']['raw'][i][j] = {}
                        # Add the file to the dict
                        self.archive['add']['raw'][i][j][k] = {}
                        continue  # Go to the next iteration of the loop (file cannot be compared because there is no counterpart)
                    # Compare the ingested raw file with the unpacked raw file
                    file_path = dir_pattern(5).format(
                        stage, job, '%s', i, j)
                    file_1 = dir_pattern().format(file_path % 'datastream',
                                                  k)
                    file_2 = dir_pattern().format(
                        file_path % 'file_comparison/raw',
                        file_history[i][j][k]['original_name'])
                    if not c.is_same_file(file_1, file_2):
                        # The files are not the same. Raw files in
                        # datastream need to be rebundled
                        bundle_data = True
                        # Ensure self.archive['remove']['raw'] has the proper structure
                        if i not in self.archive['remove']['raw']:
                            self.archive['remove']['raw'][i] = {j: []}
                        if j not in self.archive['remove']['raw'][i]:
                            self.archive['remove']['raw'][i][j] = []
                        self.archive['remove']['raw'][i][j].append(k)
                        # Make self.archive['remove']['raw'][i][j] a unique list
                        self.archive['remove']['raw'][i][j] = list(
                            set(self.archive['remove']['raw'][i][j]))
                    percent = int((float(l) / float(count)) * 100)
                    pbar.progress(percent)
                    l = l + 1
                percent = int((float(l) / float(count)) * 100)
                pbar.progress(percent)
                print("")
                sys.stdout.flush()
        # Unset loop variables
        if len(raw_files) > 0:
            del i, j, k, s, p, f, c
        if bundle_data:
            # Fill self.maps['orig']['history'] and bundle the data
            for site in file_history:
                if site not in self.maps['orig']['history']:
                    self.maps['orig']['history'][site] = {}
                for process in file_history[site]:
                    if process not in self.maps['orig']['history'][site]:
                        self.maps['orig']['history'][site][process] = {}
                    for f, d in file_history[site][process].items():
                        if d['original_name'] not in self.maps['orig'][
                                'history'][site][process]:
                            self.maps['orig']['history'][site][process][
                                d['original_name']] = d
            # Find any orig/bad files and copy them over
            # (correcting names as necessary)
            other_files_path = dir_pattern(3).format(
                stage, job, 'file_comparison/raw/%s/%s/%s')
            for i, s in self.maps['orig']['history'].items():
                for j, p in s.items():
                    bad_files = glob(other_files_path % (i, j, '*.bad.*'))
                    orig_files = glob(other_files_path % (i, j, '*.orig.*'))
                    edit_files = glob(other_files_path % (i, j, '*.edit*.*'))
                    # if len(orig_files) > 0:
                    #     pbar = UI()
                    #     count = len(orig_files)
                    #     pbar.progress(0)
                    for k, of in enumerate(orig_files):
                        oFile = of.split('/')[-1]
                        if oFile in p:
                            key = oFile.replace('orig', 'raw')
                            if key in p:
                                filename = p[key]['current_name'].replace(
                                    '.raw.', '.orig.')
                                filename = dir_pattern(6).format(
                                    stage, job, 'datastream', i, j,
                                    filename)
                                shutil.copy(of, filename)
                    # NOTE(review): these del statements raise NameError
                    # when the preceding loop was empty or never set 'key'.
                    del k, of, oFile, key
                    # print(""
                    # sys.stdout.flush()
                    # if len(bad_files) > 0:
                    #     pbar = UI()
                    #     count = len(bad_files)
                    #     pbar.progress(0)
                    for k, bf in enumerate(bad_files):
                        bFile = bf.split('/')[-1]
                        if bFile in p:
                            key = bFile.replace('bad', 'raw')
                            if key in p:
                                filename = p[key]['current_name'].replace(
                                    '.raw.', '.bad.')
                            else:
                                filename = bFile
                            filename = dir_pattern(6).format(
                                stage, job, 'datastream', i, j, filename)
                            shutil.copy(bf, filename)
                        # # Update progress bar
                        # pbar.progress(int((float(k + 1) / float(count)) * 100))
                    del k, bf, bFile, key
                    # print(""
                    # sys.stdout.flush()
                    # if len(edit_files) > 0:
                    #     pbar = UI()
                    #     count = len(edit_files)
                    #     pbar.progress(0)
                    for k, ef in enumerate(edit_files):
                        eFile = ef.split('/')[-1]
                        temp = eFile.split('.')
                        edit = None
                        # NOTE(review): 't' is a string element, so
                        # temp[t] raises TypeError — presumably this was
                        # meant to be t.startswith('edit'); confirm.
                        for t in temp:
                            if temp[t].startswith('edit'):
                                edit = temp[t]
                                break
                        if eFile in p:
                            key = eFile.replace(edit, 'raw')
                            if key in p:
                                filename = p[key]['current_name'].replace(
                                    '.raw.', ".%s." % edit)
                                filename = dir_pattern(6).format(
                                    stage, job, 'datastream', i, j,
                                    filename)
                                shutil.copy(ef, filename)
                        # # Update progress bar
                        # pbar.progress(int((float(k + 1) / float(count)) * 100))
                    del k, ef, eFile, edit, t, key
                    # print(""
                    # sys.stdout.flush()
                del j, p
            del i, s
            # Create any needed orig files
            print("Create needed orig files...")
            sys.stdout.flush()
            for i, s in self.archive['remove']['raw'].items():
                for j, p in s.items():
                    path = dir_pattern(5).format(stage, job, "datastream",
                                                 i, j)
                    k = 0
                    count = len(p)
                    for f in p:
                        orig = f.replace('.raw.', '.orig.')
                        if not os.path.exists(dir_pattern().format(
                                path, orig)):
                            src = dir_pattern(6).format(
                                stage, job, "file_comparison/raw", i, j,
                                file_history[i][j][f]['unpacked_name'])
                            dst = dir_pattern().format(path, orig)
                            shutil.copy(src, dst)
                            # del src, dst
                        percent = int((float(k) / float(count)) * 100)
                        pbar.progress(percent)
                        k = k + 1
                    if percent < 100:
                        percent = int((float(k) / float(count)) * 100)
                        pbar.progress(percent)
                    print("")
            # Unset loop variables
            # del i, s, j, p, path, f, orig, src, dst
            print("Done")
            # Bundle the data
            self.bundle_raw_data(raw_streams)
            self.config['cleanup_status']['remove']['files_bundled'] = True
            print("Map new tar bundle structure...", end="")
            self.maps['new']['tar'] = self.get_tar_structure(
                dir_pattern(3).format(stage, job, "datastream"))
            print("Done")
        print("")
        print("Mapping raw structure from original tar files...", end="")
        self.maps['orig']['raw'] = self.map_raw_structure(
            self.maps['orig']['tar'])
        print("Done")
        print("Mapping raw structure from new tar files...", end="")
        self.maps['new']['raw'] = self.map_raw_structure(
            self.maps['new']['tar'])
        print("Done")
        ##################################################
        # Find all of the tar files that need
        # to be removed from the archive
        ##################################################
        print("")
        print(
            "Generating list of tar files to be removed from the archive..."
        )
        sys.stdout.flush()
        # Find all of the tar files that need to be removed from the archive
        for i, s in self.archive['remove']['raw'].items():
            percent = 0
            for j, p in s.items():
                pbar = UI()
                count = len(p)
                pbar.progress(percent)
                k = 1
                for raw_file in p:
                    tar_files = self.find_original_tar_bundle(
                        file_history[i][j][raw_file]['original_name'], i,
                        j)
                    for f in tar_files:
                        if f not in self.archive['remove']['tar']:
                            tar = {
                                'site': i,
                                'instrument': j,
                                'file_name': f
                            }
                            self.archive['remove']['tar'].append(tar)
                    percent = int((float(k) / float(count)) * 100)
                    pbar.progress(percent)
                    k = k + 1
                if percent == 99:
                    pbar.progress(100)
                print("")
                sys.stdout.flush()
        # Unset loop variables
        if len(self.archive['remove']['raw']) > 0:
            del i, s, j, p, raw_file, tar_files, f, tar
        print("Done")
        ##################################################
        # Find all of the tar files that need
        # to be added to the archive
        ##################################################
        print("")
        print(
            "Generating list of tar files to be added to the archive..."
        )
        pbar = UI()
        pbar.progress(0)
        count = len(self.archive['remove']['tar'])
        percent = 0
        i = 1
        # Find all of the tar files that need to be added to the archive
        for tar_file in self.archive['remove']['tar']:
            files = self.find_all_files_from_original_tar(
                tar_file['file_name'], tar_file['site'],
                tar_file['instrument'])
            for f in files:
                temp = f
                if not any(d['file_name'] == temp
                           for d in self.archive['add']['tar']):
                    tar = {
                        'site': tar_file['site'],
                        'instrument': tar_file['instrument'],
                        'file_name': f
                    }
                    self.archive['add']['tar'].append(tar)
            percent = int((float(i) / float(count)) * 100)
            pbar.progress(percent)
            i = i + 1
        if percent == 99:
            pbar.progress(100)
        print("")
        sys.stdout.flush()
        # Unset loop variables
        if len(self.archive['remove']['tar']) > 0:
            del tar_file, files, f
        for i, s in self.archive['add']['raw'].items():
            for j, p in s.items():
                pbar = UI()
                pbar.progress(0)
                percent = 0
                count = len(p)
                # NOTE(review): rebinding 'i' here clobbers the outer site
                # key, which is then passed to find_original_tar_bundle as
                # a counter — looks wrong; confirm intended variable.
                i = 1
                for raw_file, info in p.items():
                    tar_files = self.find_original_tar_bundle(
                        raw_file, i, j)
                    for f in tar_files:
                        temp = f
                        if not any(
                                d['file_name'] == temp
                                for d in self.archive['add']['tar']):
                            tar = {
                                'site': i,
                                'instrument': j,
                                'file_name': f
                            }
                            self.archive['add']['tar'].append(tar)
                    percent = int((float(i) / float(count)) * 100)
                    pbar.progress(percent)
                    i = i + 1
                if percent == 99:
                    pbar.progress(100)
                print("")
                sys.stdout.flush()
        # Unset loop variables
        if len(self.archive['add']['raw']) > 0:
            del i, s, j, p, raw_file, info, tar_files
            if 'f' in locals():
                del f
            if 'tar' in locals():
                del tar
        ##################################################
        # Update archive db for raw datastream
        ##################################################
        if not DEVEL:
            update_archive(raw_streams)
        # Get list of tar files from the archive
        for k, v in enumerate(raw_streams):
            stream = dir_pattern(5).format(stage, job,
                                           'file_comparison/tar', site, v)
            files = os.listdir(stream)
            files = "','".join(files)
            args = (v, files)
            query = "SELECT * FROM get_remote_files_by_tag('%s') WHERE file_active = true and file_name in ('%s')"
            result = db.query(query % args, columns=cols)
            if len(result) > 0:
                archive_tars[v] = result
            else:
                print("\nNo results for %s" % v)
        # Unset loop variables
        del k, v, args, result
        print("Done generating tar file list")
        # Find data on tar files in list and add it to 'contents'
        print("")
        print("Adding tar files to deletion list...", end="")
        for f in self.archive['remove']['tar']:
            files = archive_tars[f['instrument']]
            for k, v in enumerate(files):
                if v['file_name'] == f['file_name']:
                    index = k
                    break
            else:
                print("\nUnable to find %s in archive db" % f['file_name'])
                self.config['exit'] = True
                return self.config, self.files
            temp = f['file_name']
            if not any(d['filename'] == temp for d in contents):
                contents.append({
                    'datastream': f['instrument'],
                    'filename': f['file_name'],
                    'hash': files[index]['file_md5'],
                    'version': files[index]['file_version']
                })
        if len(self.archive['remove']['tar']) > 0:
            del f, files, k, v, index
            pass
        print("Done")
    # Set proper file names in deletion list
    print("Setting proper file names in deletion list...", end="")
    for k, v in archive_files.items():
        if k.split('.')[-1] != '00':
            for key, f in enumerate(v):
                if f['file_name'] not in p_files[k]:
                    temp = f['file_name']
                    pass
                    if not any(d['filename'] == temp for d in contents):
                        contents.append({
                            'datastream': k,
                            'filename': f['file_name'],
                            'hash': f['file_md5'],
                            'version': f['file_version']
                        })
    print("Done")
    # Store the list of files that need to be archived to file
    archive_json_file = dir_pattern(3).format(stage, job, 'archive.json')
    fp = open(archive_json_file, 'w')
    fp.write(
        json.dumps(self.archive['add']['tar'],
                   indent=2,
                   sort_keys=False,
                   separators=(',', ': ')))
    fp.close()
    del fp
    # Update the saved status
    self.config['cleanup_status']['remove']['archive_list'] = True
    ##################################################
    # Write the results to file
    # (Use '\r\n' for Windows line endings)
    ##################################################
    print("\nEmailing deletion list...", end="")
    sys.stdout.flush()
    file_contents = []
    contents = sorted(contents, key=self.get_sort_key)
    for line in contents:
        l = "%s.v%s %s" % (line['filename'], line['version'],
                           line['hash'])
        file_contents.append(l)
    fp = open(dir_pattern().format(job_folder, del_file), 'w')
    fp.write("\r\n".join(file_contents))
    fp.close()
    del fp
    # Update the saved status
    self.config['cleanup_status']['remove']['deletion_list'] = True
    # Send the deletion list to the appropriate place
    # (currently email, may be upload at a later time)
    self.email_del_list("%s.deletion-list.txt" % self.config['job'])
    # self.upload_del_list()
    print("Done")
    # Update the saved status
    self.config['cleanup_status']['remove']['status'] = True
    duration = datetime.now() - self.start_time
    print(duration)
    return self.config, self.files
def setup_alias(self, db_file, alias=None, level=1):
    """Make sure the proper DB alias exists in db_file; create the 'apm'
    alias (prompting for a password, up to 3 attempts) if it is missing.

    Args:
        db_file: path to the .db_connect credentials file.
        alias:   alias to look for; defaults to config['alias'] or 'apm'.
        level:   current attempt number (1-3) for password retries.

    Returns:
        True if the alias exists or was created, False otherwise.
    """
    if not alias:
        als = 'apm' if not self.config['alias'] else self.config['alias']
    else:
        als = alias
    ######################################################################
    # Remove this code when vapmgr accepts -a argument
    ######################################################################
    if self.config['vap']:
        als = 'vapmgr'
    ######################################################################
    with open(db_file, 'r') as fp:
        contents = fp.read()
    lines = contents.split('\n')
    for line in lines:
        words = line.split()
        # A credentials line has exactly 5 fields and is not a comment
        if len(words) == 5 and not words[0].startswith('#'):
            (alias, host, database, user, password) = words
            if alias == als:
                break
    else:
        # File doesn't have the specified alias
        if als == 'apm':
            # Doesn't have alias 'apm' — build it
            alias = 'apm'
            host = 'pgdb.dmf.arm.gov'
            database = 'dsdb_reproc'
            user = '******'
            # Ask user for password.
            # BUGFIX: prompt on every attempt — the original only prompted
            # when level == 1, so retry levels crashed on an unbound
            # 'password'.
            print("Alias '{}' does not exist. Please enter a password for user '{}'".format(alias, user))
            password = getpass.getpass()
            if not password:
                if level < 3:
                    # BUGFIX: typo "passowrd" and missing 'return' — the
                    # original fell through after the recursive retry.
                    print("Error: unable to create alias '{}' without password.\nPlease enter a password".format(alias))
                    return self.setup_alias(db_file, alias=alias, level=level + 1)
                print("Error: unable to create alias '{}' without password. Please try again.".format(alias))
                return False
            # Validate the provided password against a temporary file
            tmp_db_file = dir_pattern(3).format(self.config['stage'], self.config['job'], '.db_connect.tmp')
            with open(tmp_db_file, 'w') as fp:
                fp.write('{} {} {} {} {}'.format(alias, host, database, user, password))
            valid = self.validate_alias(alias, tmp_db_file)
            os.remove(tmp_db_file)
            if not valid:
                if level < 3:
                    print("Error: invalid password provided.\nPlease enter a password")
                    return self.setup_alias(db_file, alias=alias, level=level + 1)
                print("Error: invalid password provided, please try again")
                return False
            # Append the validated alias to the real credentials file
            db_creds = '{} {} {} {} {}\n'.format(alias, host, database, user, password)
            with open(db_file, 'a') as fp:
                fp.write(db_creds)
        else:
            # Alias is specified but is not 'apm'
            print("Unable to find alias '{}'. Please update .db_connect and try again.".format(als))
            return False
    return True
def run(self):
    """Run the cleanup portion of the cleanup phase.

    Refreshes the local archive inventory, verifies every processed and
    bundled file received a new version at the archive (by comparing
    against current_archive.json), then deletes the job's working files
    (datastream, collection, file_comparison, json/deletion lists).
    Requires the archive step to be complete.

    Returns (config, files); config['exit'] is set on failure.
    """
    if self.config['cleanup_status']['archive']['status'] != True:
        print("Data files must be archived before they can be cleaned up.")
        self.config['exit'] = True
        return self.config, self.files
    stage = self.config['stage']
    job = self.config['job']
    ################################################################################
    # Update local archive database
    ################################################################################
    if not self.config['cleanup_status']['cleanup']['files_archived']:
        print("Updating local copy of the archive...", end="")
        # Setup the datastreams to update
        datastreams = []
        datastream_path = dir_pattern(3).format(stage, job, 'datastream')
        for site in os.listdir(datastream_path):
            path = dir_pattern().format(datastream_path, site)
            for folder in os.listdir(path):
                abs_folder = dir_pattern().format(path, folder)
                if os.path.isdir(
                        abs_folder) and not os.path.islink(abs_folder):
                    datastreams.append(folder)
        # Update the local copy of the archive db
        if not DEVEL:
            update_archive(datastreams)
        print("Done")
        ################################################################################
        # Verify that all files to be added to the archive, were added
        ################################################################################
        print(
            "Verifying processed and bundled files have been archived...",
            end="")
        cwd = os.getcwd()
        archive_files = {}
        db_file = '/apps/ds/conf/datainv/.db_connect'
        alias = 'inv_read'
        if not os.path.exists(db_file):
            print("Failed")
            print(
                "Unable to connect to the archive database. Please try again later."
            )
            self.config['exit'] = True
            return self.config, self.files
        db = DB(self.config, db_file=db_file, alias=alias)
        # Store the query
        query = "SELECT * FROM get_remote_files_by_tag('%s') WHERE file_stamp >= %d AND file_stamp <= %d AND file_active = true ORDER BY file_stamp, file_version;"
        # List the column names so the values can be mapped in a dictionary
        cols = [
            'file_tag', 'file_name', 'file_version', 'file_size',
            'file_stored', 'file_md5', 'file_stamp', 'file_checked',
            'file_active'
        ]
        # convert the start and end dates to a unix timestamp
        start = convert_date_to_timestamp(self.config['begin'])
        end = convert_date_to_timestamp(self.config['end'])
        # Load the archive state captured before the upload
        archive_file = dir_pattern(3).format(stage, job,
                                             'current_archive.json')
        fp = open(archive_file, 'r')
        oArch = json.loads(fp.read())
        fp.close()
        del fp
        # Walk <job>/datastream/<site>/<folder> comparing versions:
        # every file present locally must have a strictly newer version
        # at the archive than the pre-upload snapshot recorded.
        os.chdir(datastream_path)
        for site in os.listdir('.'):
            path = dir_pattern().format(datastream_path, site)
            os.chdir(site)
            for folder in os.listdir('.'):
                os.chdir(folder)
                args = (folder, start, end)
                result = db.query(query % args, columns=cols)
                for f in os.listdir('.'):
                    if not os.path.isdir(dir_pattern().format(
                            os.getcwd(), f)):
                        try:
                            new_version = next(
                                d['file_version'] for d in result
                                if d['file_name'] == f)
                            old_version = next(
                                o['file_version'] for o in oArch[folder]
                                if o['file_name'] == f)
                            if not new_version > old_version:
                                print("Failed")
                                print(
                                    "Not all files have been successfully archived. Please try again later."
                                )
                                self.config['exit'] = True
                                return self.config, self.files
                        except StopIteration:
                            # File absent from one of the lists: nothing
                            # to compare for it
                            pass
                os.chdir('..')
            os.chdir('..')
        os.chdir(cwd)
        self.config['cleanup_status']['cleanup']['files_archived'] = True
        print("Done")
    ################################################################################
    # Remove all files from `<job>/datastream`
    ################################################################################
    if not self.config['cleanup_status']['cleanup']['files_cleaned_up']:
        print("Cleaning up project files...", end="")
        # Remove archive.json
        # Remove current_archive.json
        # Remove <job>.deletion-list.txt
        f = Files(self.config)
        path = dir_pattern().format(stage, job)
        delete = [
            "datastream",
            "collection",
            "file_comparison/raw",
            "file_comparison/tar",
            'archive.json',
            'current_archive.json',
            '%s.deletion-list.txt' % job,
        ]
        try:
            for i in delete:
                item = dir_pattern().format(path, i)
                if os.path.exists(item):
                    if os.path.isdir(item):
                        f.empty_dir(item)
                    elif os.path.isfile(item):
                        os.remove(item)
        except:
            # Best-effort cleanup: report and let the user retry manually
            print("Failed")
            print(
                "Unable to cleanup all files. Please try again, or cleanup project manually."
            )
            self.config['exit'] = True
            return self.config, self.files
        print("Done")
        self.config['cleanup_status']['cleanup']['files_cleaned_up'] = True
    self.config['cleanup_status']['cleanup']['status'] = True
    return self.config, self.files
def run(self):
    """ Run the process phase.

    Refreshes the job's shell environment and ensures a ~/.db_connect
    file plus DB alias exist, then branches on config:

    * ``config['ingest']`` — locates the Ingest executable for every
      process, lets plugins alter the process list, optionally updates
      the database, and runs each Ingest in a worker thread, renaming
      entries in ``self.files`` as results come back.
    * ``config['vap']`` — runs ``vapmgr -setup`` and then ``vapmgr``
      via subprocess, locates the resulting VAP/vapmgr log files, parses
      them, and prints a per-log results summary.

    Relies on module-level names not visible in this chunk: DEVEL,
    binpath, vappath, dir_pattern, update_env, get_shell, Ingest.

    Returns:
        tuple: (self.config, self.files) with updated state.
    """
    if DEVEL:
        pass
    home = os.path.expanduser('~')

    # Update env variables
    # update_env() presumably sources env.sh/env.csh for this job —
    # NOTE(review): confirm; only the failure path is visible here.
    print("\nUpdating environment vars...", end="")
    sys.stdout.flush()
    if not update_env(dir_pattern().format(self.config['stage'], self.config['job'])):
        shell = get_shell()
        if shell == "bash":
            ext = 'sh'
        else:
            ext = 'csh'
        print("Fail")
        exit("Error: Unable to locate env.%s." % ext)
    print("Done")
    # Updating Env Vars

    # Check for .db_connect file
    # An empty file is created if missing so setup_alias() has a file to
    # append the connection alias to.
    print("\nLocating .db_connect...", end="")
    sys.stdout.flush()
    db_file = dir_pattern().format(home, ".db_connect")
    if not os.path.exists(db_file):
        fp = open(db_file, 'w')
        fp.close()

    # Check for apm or user specified alias
    if not self.setup_alias(db_file):
        exit()
    print("Done")

    if self.config['ingest']:
        ##################################################
        # START INGEST PROCESSING
        ##################################################
        # Find the ingest executable for each process
        print("\nLocating ingest executable...", end="")
        sys.stdout.flush()
        processes = self.db.get_data_paths()
        for k, v in enumerate(processes):
            ingest, multiple = self.find_ingest_exec(v['proc'])
            if not ingest:
                print("Fail")
                exit("Unable to find Ingest executable for {}".format(v['proc']))
            else:
                processes[k]['ingest'] = ingest
                processes[k]['multiple'] = multiple

        # Add a plugin spot to update the ingest as necessary
        # Then create the IRT ingest plugin
        # Check to see if a plugin needs to modify the datastream
        temp = self.manager.callPluginCommand('hook_ingest_alter', {'processes': processes})
        processes = temp if temp != None else processes
        print("Done")
        print("")

        # NOTE(review): db_commands is never used in this method.
        db_commands = []

        # Update the database
        if self.config['db_up'] != False:
            print("\nUpdating the database...", end="")
            sys.stdout.flush()
            for process in processes:
                if not self.update_db(process):
                    print("Fail")
                    exit("ERROR: Unable to update database for {}".format(process['proc']))
            print("Done")

        print("\nExecuting ingest processes...", end="")
        sys.stdout.flush()

        # Execute an Ingest process
        # One worker thread per ingest "key" (executable basename prefix);
        # processes sharing a key run sequentially, distinct keys run in
        # parallel. The loop polls until every process is complete.
        threads = {}
        status = {}
        done = False
        while not done:
            done = True
            for k,v in enumerate(processes):
                # Make sure all needed keys exist
                if 'complete' not in v:
                    processes[k]['complete'] = False
                    v['complete'] = False
                # Key = executable basename up to the first underscore,
                # e.g. ".../foo_ingest" -> "foo".
                key = v['ingest'].split('/')[-1].split('_')[0]
                if (key not in threads or status[key] == True) and v['complete'] == False:
                    # Slot free (or previous thread for this key finished):
                    # start this process.
                    done = False
                    status[key] = False
                    threads[key] = Ingest(v, self.config, k)
                    if not threads[key]:
                        exit("ERROR: Ingest object not created")
                    threads[key].start()
                elif threads[key].is_alive():
                    # Still running; keep polling.
                    done = False
                elif (not threads[key].is_alive()) and (v['complete'] == False or status[key] == False):
                    # Thread finished: harvest its result and mark the
                    # process it was started for (threads[key].key) complete.
                    status[key] = True
                    processes[threads[key].key]['complete'] = True
                    processes[threads[key].key]['result'] = threads[key].result
                    result = processes[threads[key].key]['result']
                    # Notify the user if there was an error, that way they can correct
                    # the error and run again or run manually if needed
                    if threads[key].error != False:
                        print("There was an error: ", end="")
                        print(threads[key].error)
                    # Rename tracked files: result maps
                    # site -> instrument -> {old_name: new_name} — TODO confirm
                    # against Ingest.result's producer.
                    if self.files and result:
                        for i,site in result.items():
                            for j,sif in site.items():
                                # NOTE(review): this `k` shadows the outer
                                # enumerate(processes) index for the rest of
                                # this iteration.
                                for k,name in sif.items():
                                    if k in self.files[i][j]:
                                        self.files[i][j][name] = self.files[i][j][k]
                                        self.files[i][j].pop(k)
                                        self.files[i][j][name]['processed_name'] = name
                                        self.files[i][j][name]['current_name'] = name
                elif v['complete'] == True:
                    pass
                else:
                    # Defensive: no branch above matched.
                    print("OOPS forgot a status")
        print("Done")
        ##################################################
        # END INGEST PROCESSING
        ##################################################
    elif self.config['vap']:
        ##################################################
        # START VAP PROCESSING
        ##################################################
        print('Running vapmgr...', end="")
        sys.stdout.flush()
        starttime = None
        endtime = None
        # NOTE(review): is_success is never used in this method.
        is_success = None

        # Make sure vappath is in the path env variable
        # NOTE(review): appends `binpath` (module global) although the
        # commands below are invoked via `vappath` — confirm intent.
        syspath = os.environ.get('PATH')
        syspath = syspath.split(':')
        if binpath not in syspath:
            syspath.append(binpath)
        syspath = ':'.join(syspath)
        os.environ['PATH'] = syspath

        ###############################################
        # Remove this code for production
        ###############################################
        vaphome = binpath.split('/')[:-1]
        vaphome = '/'.join(vaphome)
        os.environ['VAP_HOME'] = vaphome
        ###############################################

        # Run vapmgr setup to create any needed aliases
        setup = [
            '%s/vapmgr' % vappath,
            '-setup',
            '-r',
            '%s.%s' % (self.config['site'], self.config['facility']),
            self.config['instrument']
        ]
        ps = Popen(setup, stdout=PIPE, stderr=PIPE)
        (output, error) = ps.communicate()
        returncode = ps.returncode
        if returncode != 0:
            print("ERROR: Unable to setup vapmgr")
            print("")
            print(error)
            exit()

        # Run vapmgr to process the vaps
        # start/end timestamps bracket the run so log parsing can be
        # restricted to entries from this invocation.
        starttime = datetime.now().replace(microsecond=0)
        command = [
            '%s/vapmgr' % vappath,
            '-r',
            '%s.%s' % (self.config['site'], self.config['facility']),
            '-start',
            str(self.config['begin']),
            '-end',
            str(self.config['end']),
            '-force',
            self.config['instrument']
        ]
        ps = Popen(command, stdout=PIPE, stderr=PIPE)
        (output, error) = ps.communicate()
        returncode = ps.returncode
        endtime = datetime.now().replace(microsecond=0)
        if returncode != 0:
            print("ERROR: Error running vapmgr")
            print("")
            print(error)
            exit()

        # vapmgr ran successfully
        # Find out what log files need parsed
        path = dir_pattern(5).format(self.config['stage'], self.config['job'], 'logs', self.config['site'], '%s_logs')
        proc_path = path % 'proc'
        instr_path = path % 'instr'
        vaplogs = []
        vapmgrlogs = []
        vapmgrqclogs = []
        # Log filenames embed the current year/month; match only this
        # month's files.
        year = str(starttime.year).zfill(4)
        month = str(starttime.month).zfill(2)
        regex_log_file_pattern = '%s.*%s.*%s\.%s%s00.000000.%s'
        proc = regex_log_file_pattern % (self.config['site'], self.config['instrument'], self.config['facility'], year, month, 'VAP')
        instr = regex_log_file_pattern % (self.config['site'], self.config['instrument'], self.config['facility'], year, month, 'vapmgrlog')
        instrqc = regex_log_file_pattern % (self.config['site'], self.config['instrument'], self.config['facility'], year, month, 'vapmgrqclog')

        # vap logs don't always exist. Need to check to make sure they do before trying to access them
        if os.path.exists(proc_path):
            vaplog_dirs = os.listdir(proc_path)
            for d in vaplog_dirs:
                tmp = os.listdir(dir_pattern().format(proc_path, d))
                for i in tmp:
                    if re.search(proc, i):
                        vaplogs.append(dir_pattern(3).format(proc_path, d, i))

        # vapmgr logs are mandatory; abort if their directory is missing.
        if not os.path.exists(instr_path):
            exit("Unable to find vapmgr log files")
        vapmgrlog_dirs = os.listdir(instr_path)
        for d in vapmgrlog_dirs:
            tmp = os.listdir(dir_pattern().format(instr_path, d))
            for i in tmp:
                if re.search(instr, i):
                    vapmgrlogs.append(dir_pattern(3).format(instr_path, d, i))
                elif re.search(instrqc, i):
                    vapmgrqclogs.append(dir_pattern(3).format(instr_path, d, i))

        logs = {}
        # Parse VAP log file
        # NOTE(review): logs['vap'] is reset to [] on every iteration, so
        # only entries from the LAST matching log file survive — the reset
        # likely belongs before the loop. Same pattern in the two loops
        # below.
        if len(vaplogs) > 0:
            for k,log in enumerate(vaplogs):
                temp = self.parse_vap_log(log, starttime, endtime)
                logs['vap'] = []
                for i in temp:
                    i['log_file'] = vaplogs[k]
                    logs['vap'].append(i)

        # Parse vapmgr log file
        if len(vapmgrlogs) > 0:
            for k,log in enumerate(vapmgrlogs):
                temp = self.parse_vapmgr_log(log, starttime, endtime)
                logs['vapmgr'] = []
                for i in temp:
                    i['log_file'] = vapmgrlogs[k]
                    logs['vapmgr'].append(i)

        # Parse vapmgrqclog
        if len(vapmgrqclogs) > 0:
            for k,log in enumerate(vapmgrqclogs):
                temp = self.parse_vapmgr_log(log, starttime, endtime, qc=True)
                logs['vapmgrqc'] = []
                for i in temp:
                    i['log_file'] = vapmgrqclogs[k]
                    logs['vapmgrqc'].append(i)
        print('Done')

        # Summarize parsed results for the user. VAP results take
        # precedence; vapmgr results print only when no VAP logs parsed.
        if 'vap' in logs and len(logs['vap']) > 0:
            print('')
            print("VAP Results")
            for k,log in enumerate(logs['vap']):
                print("Running: %s for %s..." % (log['process'], log['dates']), end="")
                if log['status']:
                    print(log['message'])
                else:
                    print("ERROR")
                    print("\tFor more information see the log entry starting on line %d of the following log file:\n\t %s" % (log['line_number'], log['log_file']))
        elif 'vapmgr' in logs and len(logs['vapmgr']) > 0:
            print('')
            print("VapMGR Results")
            for log in logs['vapmgr']:
                print(self.vapmgr_log_results(log, 'output'))

        # Quicklooks summary from the vapmgr logs.
        if 'vapmgr' in logs and len(logs['vapmgr']) > 0:
            print('')
            print("VapMGR Quicklooks Results")
            for log in logs['vapmgr']:
                print(self.vapmgr_log_results(log, 'quicklooks'))

        if 'vapmgrqc' in logs and len(logs['vapmgrqc']) > 0:
            print('')
            print("VapMGRQC Results")
            for log in logs['vapmgrqc']:
                print(self.vapmgr_log_results(log, 'output'))
        print('')
        ##################################################
        # END VAP PROCESSING
        ##################################################
    return self.config, self.files