def retrieve_job_info(job, component, strict=True):
    """Retrieves data stored in our HDFS Shred directory"""
    file_content = None
    if component == "master":
        # The master component always updates the state of the job in the master job list
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "jobs", job)
    elif 'worker' in component or 'data' in component:
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job, component)
    else:
        raise ValueError("Invalid option passed to function retrieve_job_info")
    try:
        with hdfs.read(file_path) as reader:
            # expecting all content written by this program to be serialised as json
            file_content = reader.read()
    except HdfsError as e:
        if strict:
            raise StandardError("HDFSCli couldn't read a file from path [{0}] with details: {1}"
                                .format(file_path, e))
        else:
            pass  # if not strict mode then we will return None
    if file_content:
        get_result = loads(file_content)
        log.debug("Retrieved content [{2}] from component file [{0}] at path [{1}]"
                  .format(component, file_path, get_result))
    else:
        get_result = file_content
    return get_result
def update_test_files():
    shred.log.info("Updating list of test data files")
    global test_files
    filelist_cmd = ["hdfs", "dfs", "-ls", ospathjoin(test_file_path, test_file_dir)]
    filelist_iter = shred.run_shell_command(filelist_cmd)
    for line in filelist_iter:
        splits = ssplit(line)
        if "{0}".format(ospathjoin(test_file_path, test_file_dir)) in splits[-1]:
            test_files.append(splits[-1])
    shred.log.info("Test files are now [{0}]".format(test_files))
def generate_test_data():
    # Generate test data
    shred.log.info("Out of test data, generating new Test Data files...")
    clean_dir_cmd = ["hdfs", "dfs", "-rmdir", ospathjoin(test_file_path, test_file_dir)]
    shred.run_shell_command(clean_dir_cmd)
    gen_test_data_cmd = ["/usr/hdp/current/hadoop-client/bin/hadoop", "jar",
                         glob("/usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-examples-*.jar")[0],
                         "teragen", test_file_size, ospathjoin(test_file_path, test_file_dir)]
    gen_test_data_iter = shred.run_shell_command(gen_test_data_cmd)
    for line in gen_test_data_iter:
        if "Bytes Written" in line:
            shred.log.info(line)
    # remove 0 size '_SUCCESS' file
    del_file_cmd = ["hdfs", "dfs", "-rm", ospathjoin(test_file_path, test_file_dir, "_SUCCESS")]
    do_nothing = shred.run_shell_command(del_file_cmd)
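# Note (an assumption about intent, based on the standard Hadoop examples jar rather than
# this project): teragen's first argument is a row count and each generated row is 100
# bytes, so a test_file_size of e.g. "10000" would produce roughly 1 MB of test data
# under ospathjoin(test_file_path, test_file_dir).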
def set_status(job_id, component, status):
    """Abstracts setting a given status for a given job, job subcomponent, and status message"""
    # determine file to be written
    if component == "master":
        # The master component always updates the state of the job in the master job list
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "jobs", job_id)
    elif component == "data":
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job_id, "status")
    else:
        # otherwise we update the file named for that component in the subdir for the job in the general store
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job_id, component, "status")
    log.debug("Setting status of component [{0}] at path [{1}] to [{2}]".format(component, file_path, status))
    if file_path is not None:
        hdfs.write(file_path, status, overwrite=True)
    else:
        raise ValueError("File Path to set job status not set.")
def init_program(passed_args):
    log.info("shred.py called with args [{0}]".format(passed_args))
    parsed_args = parse_user_args(passed_args)
    # TODO: Further configuration file validation tests
    log.debug("Checking for config parameters.")
    if not conf.VERSION:
        raise StandardError(
            "Version number in config.py not found, please check configuration file is available and try again."
        )
    # Test necessary connections
    ensure_hdfs()
    # Check directories etc. are setup
    hdfs.makedirs(ospathjoin(conf.HDFS_SHRED_PATH, "jobs"))
    hdfs.makedirs(ospathjoin(conf.HDFS_SHRED_PATH, "store"))
    # TODO: Further Application setup tests
    return parsed_args
def check_for_new_worker_jobs():
    """Checks for the existence of new worker jobs and returns a list of them if they exist"""
    worker_job_list = []
    # check if dir exists as worker may load before client is ever used
    job_path = ospathjoin(conf.HDFS_SHRED_PATH, "jobs")
    job_dir_exists = hdfs.content(job_path, strict=False)
    if job_dir_exists is not None:
        # if job dir exists, get listing and any files
        dirlist = hdfs.list(job_path, status=True)
        for item in dirlist:
            if item[1]['type'] == 'FILE':
                with hdfs.read(ospathjoin(job_path, item[0])) as reader:
                    job_status = reader.read()
                # if file contains the completion status for stage1, put it in worker list
                if job_status == "stage1complete":
                    worker_job_list.append(item[0])
    return worker_job_list
def persist_job_info(job, component, stage, info):
    """Writes data to our directory structure in the HDFS shred directory"""
    if component == "master":
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "jobs", job)
        content = dumps(stage + "-" + info)
    elif 'worker' in component or 'data' in component:
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job, component)
        if 'status' in component:
            content = dumps(stage + "-" + info)
        else:
            content = dumps(info)
    else:
        raise StandardError("Function persist_job_info was passed an unrecognised component name")
    if file_path is not None:
        try:
            hdfs.write(file_path, content, overwrite=True)
        except HdfsError as e:
            raise e
    else:
        raise ValueError()
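# Usage sketch, not part of the original module: shows how persist_job_info() and
# retrieve_job_info() are intended to round-trip a status through HDFS. It assumes the
# module-level conf, hdfs client and json helpers above are initialised; the job id,
# stage/status strings and worker id below are illustrative only.
def _example_job_info_round_trip():
    example_job = "5b1c3c3e-aaaa-4bbb-8ccc-1234567890ab"  # hypothetical UUID4
    # master statuses are stored as a JSON-encoded "stage-status" string
    persist_job_info(example_job, "master", "stage1", "success")
    assert retrieve_job_info(example_job, "master") == "stage1-success"
    # non-strict reads return None when a component file does not exist yet
    assert retrieve_job_info(example_job, "worker_10.0.0.1_status", strict=False) is None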
def prepare_blocklists(job_id):
    """Attempts to take leadership for job preparation and creates the block-file lists
    for each datanode worker"""
    # attempt to kazoo lease new guid node for sleep period minutes
    log.debug("Preparing Blocklists for job [{0}]".format(job_id))
    log.debug("Attempting to get lease as leader for job")
    lease = zk.NonBlockingLease(
        path=conf.ZOOKEEPER['PATH'] + job_id,
        duration=dttd(minutes=conf.WORKER_SLEEP),
        identifier="Worker [{0}] preparing blocklists for job [{1}]".format(get_worker_identity(), job_id)
    )
    # http://kazoo.readthedocs.io/en/latest/api/recipe/lease.html
    # if we did not get the lease, return pipped status
    if not lease:
        log.debug("Beaten to leasehold by another worker")
        return "pipped"
    else:
        log.debug("Got lease as leader on job, updating job status")
        # update job status to stage2prepareblocklist
        status = "stage2prepareBlocklist"
        component = "master"
        set_status(job_id, component, status)
        # get job target (returns a list)
        targets = get_target_by_jobid(job_id)
        log.debug("Got target file(s) [{0}] for job".format(targets))
        # get fsck data for targets
        blocklists = {}
        for target in targets:
            fsck_data = get_fsck_output(target)
            # parse fsck data for blocklists
            blocklists.update(parse_blocks_from_fsck(fsck_data))
        log.debug("Parsed FSCK output for target files: [{0}]".format(blocklists))
        # match fsck output to worker_ids
        # block IDs for workers are currently the IP of the datanode, which matches our worker_id in the utility
        # Therefore no current need to do a match between the fsck output and the local worker ID
        # write a per-DN file to hdfs job subdir for other workers to read
        target_workers = blocklists.keys()
        log.debug("Datanode list for these blockfiles is: [{0}]".format(target_workers))
        for this_worker in target_workers:
            this_worklist = {}
            for blockfile in blocklists[this_worker]:
                this_worklist[blockfile] = "new"
            workfile_content = dumps(this_worklist)
            file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job_id, this_worker)
            log.debug("Writing [{0}] to workfile [{1}] for Datanode [{2}]"
                      .format(workfile_content, file_path, this_worker))
            hdfs.write(file_path, workfile_content, overwrite=True)
        # update job status to stage2copyblocks
        log.debug("Completed leader tasks for blocklist preparation, updating status and returning from function")
        status = "stage2copyblocks"
        set_status(job_id, component, status)
        # TODO: Look for a method to explicitly release the lease when done
        # apparently there's no release lease command in this recipe, so it'll just time out?
        # return success status
        return "success"
def get_target_by_jobid(job_id):
    """Gets paths of target files ingested into this job's data store directory
    returns list of absolute paths to target files on HDFS"""
    hdfs_file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job_id, "data")
    log.debug("getting list of files at path [{0}]".format(hdfs_file_path))
    # hdfs.list returns a list of file names in a directory
    hdfs_file_list = hdfs.list(hdfs_file_path)
    out = []
    for file in hdfs_file_list:
        # TODO: Check if HDFS always uses / as path separator on Win or Linux etc.
        out.append(hdfs_file_path + '/' + file)
    return out
def setup_module():
    shred.log.info("Begin Setup")
    shred.log.info("Checking for existing test data")
    # Check if test data already exists
    test_data_exists_cmd = ["hdfs", "dfs", "-ls", ospathjoin(test_file_path, test_file_dir)]
    test_data_exists_iter = shred.run_shell_command(test_data_exists_cmd)
    try:
        out = next(test_data_exists_iter)
        if "No such file or directory" in out:
            test_data_exists_state = False
        else:
            test_data_exists_state = True
    except StopIteration:
        test_data_exists_state = False
    if test_data_exists_state is False:
        generate_test_data()
    update_test_files()
def teardown_module():
    shred.log.info("Begin Teardown...")
    # Remove test data
    if remove_test_files:
        shred.log.info("Removing Test Data")
        rmdir_cmd = ["hdfs", "dfs", "-rm", "-f", "-r", "-skipTrash", ospathjoin(test_file_path, test_file_dir)]
        rmdir_iter = shred.run_shell_command(rmdir_cmd)
        for line in rmdir_iter:
            shred.log.info(line)
    else:
        shred.log.info("Skipping removal of test data")
    if remove_test_zkdata:
        shred.log.info("Removing test ZK Data")
        zk_host = shred.conf.ZOOKEEPER['HOST'] + ':' + str(shred.conf.ZOOKEEPER['PORT'])
        zk = shred.connect_zk()
        zk.delete(path=shred.conf.ZOOKEEPER['PATH'], recursive=True)
    else:
        shred.log.info("Skipping removal of test ZK Data")
def get_jobs(stage):
    """Prepares a cleaned job list suitable for the stage requested from all active jobs
    returns list of job UUID4 strings"""
    worker_job_list = []
    target_status = []
    if stage == stage_2:
        target_status = [
            stage_1 + "-" + status_success,
            stage_2 + "-" + status_task_timeout
        ]
    elif stage in [stage_3, stage_4]:
        target_status = [
            stage_2 + "-" + status_success,
            stage_4 + "-" + status_task_timeout
        ]
    elif stage in [stage_5, stage_6]:
        target_status = [
            stage_4 + "-" + status_success,
            stage_6 + "-" + status_task_timeout
        ]
    # check if dir exists as worker may load before client is ever used
    job_path = ospathjoin(conf.HDFS_SHRED_PATH, "jobs")
    job_dir_exists = None
    try:
        # hdfscli strict=False returns None rather than an Error if Dir not found
        job_dir_exists = hdfs.content(job_path, strict=False)
    except AttributeError:
        log.error("HDFS Client not connected")
    if job_dir_exists is not None:
        # if job dir exists, get listing and any files
        dir_listing = hdfs.list(job_path, status=True)
        for item in dir_listing:
            if item[1]['type'] == 'FILE':
                # item[0] is the filename, which for master status' is the job ID as a string
                # we shall be OCD about things and validate it however.
                job_status = retrieve_job_info(item[0], "master")
                if job_status in target_status:
                    try:
                        job_id = UUID(item[0], version=4)
                        worker_job_list.append(str(job_id))
                    except ValueError:
                        pass
    return worker_job_list
def get_gif_filename(dir, label, args):
    argcopy = args.copy()
    filename = ""
    for s in settings:
        if args.has_key(s) == True:
            filename = filename + str(args[s])
    filename = filename + label
    for io in image_operator:
        if args.has_key(io) == True:
            filename = filename + str(args[io])
    filename = filename + ".gif"
    filename = filename.replace("#", "")
    filename = ospathjoin(dir, filename)
    return filename
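# Sketch only (the settings/image_operator lists below are hypothetical, not from the
# original source): get_gif_filename() concatenates the values of any recognised
# 'settings' keys, then the label, then any recognised 'image_operator' keys, strips '#'
# characters (e.g. from colour values) and appends '.gif'. With settings = ['pointsize']
# and image_operator = ['negate'], a call like
#
#   get_gif_filename("ic_20160101", "#label1", {'pointsize': 32, 'negate': ''})
#
# would return "ic_20160101/32label1.gif".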
def ingest_targets(job_id, target):
    """Moves file from initial location to shred worker folder on HDFS"""
    # Update status'
    status = "stage1ingest"
    component = "master"
    set_status(job_id, component, status)
    component = "data"
    set_status(job_id, component, status)
    # Move all files to the data directory
    path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job_id, 'data')
    # Using the HDFS module's rename function to move the target files
    log.debug("Moving target file [{0}] to shredder holding pen [{1}]".format(target, path))
    # We need to ensure the directory is created, or the rename command will dump the data into the file
    hdfs.makedirs(path)
    hdfs.rename(target, path)
    # update status
    status = "stage1ingestComplete"
    set_status(job_id, component, status)
    return job_id, status
def process_item(self, item, spider):
    '''save index doc file'''
    if 'indexdocSpider' != spider.name:
        return item
    if not item['docvalid']:
        return None
    realpath = ospathjoin(spider.savepath, spider.savename, item['docpath'])
    spider.loger.info("doc %s save path %s" % (item['docname'], realpath))
    if not os.path.exists(os.path.dirname(realpath)):
        os.makedirs(os.path.dirname(realpath))
    with open(realpath, 'wb+') as f:
        f.write(item['docdata'])
    return item
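# Usage note (assumption, not from the source): for Scrapy to invoke process_item(), the
# pipeline class containing it must be enabled in the project's settings.py via the
# standard ITEM_PIPELINES setting; the module/class path below is hypothetical.
#
#   ITEM_PIPELINES = {
#       'myproject.pipelines.IndexDocPipeline': 300,
#   }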
def store_table_fields(self, table_name: str, playlists_dir: str):
    response = self.cur.execute('select * from ' + table_name)
    fields_names = [description[0] + '\n' for description in response.description]
    with open(ospathjoin(playlists_dir, table_name + '.txt'), 'w+') as f:
        f.writelines(fields_names)
def get_sql_path_base(exec_path):
    return ospathjoin(exec_path, "janis/task.db")
################################################################################
# Stage 1: Load the database into Python
################################################################################
# For now I will take a database from my local computer
f = NamedTemporaryFile(suffix="BaseValidador", delete=False)
myHostname = "162.243.165.69"
myUsername = "******"
myPassword = "******"
filename = "BaseValidador.csv"
opts = pysftpCnOpts()
opts.hostkeys = None
with pysftpConnection(host=myHostname, username=myUsername, password=myPassword, cnopts=opts) as sftp:
    remoteFilePath = '/root/CMD/EOD_S20/bases/' + filename
    localFilePath = ospathjoin(f.name)
    sftp.get(remoteFilePath, localFilePath)
# sys.path.append(f.name)
# bd = pd.read_stata(ospathjoin(f.name), convert_categoricals=False)
# bd.to_csv(ospathjoin(f.name))
# bd = pd.read_csv(ospathjoin(f.name))
bd = pd.read_csv(ospathjoin(f.name))
bd["comentarios_validacion"] = "No hay comentarios de validación"
bd = bd.loc[bd.interview__key.notnull()]
bd.loc[:, 'act'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
bd2 = bd
def __init__(self, playlist_name: str, poi_csv_file: str):
    self.playlist_name = playlist_name
    self.playlist_path = ospathjoin(self.playlists_dir, str(playlist_name))
    self.csv_handler = CSVHandler(ospathjoin(self.playlist_path, poi_csv_file))
    self.sql_handler = SQLHandler()
from __future__ import print_function
import pytest
from os.path import join as ospathjoin
import os
from tracktime import tracktime
import datetime

TEST_TIMELOG = ospathjoin("tests", "test_timelog.txt")


@pytest.fixture
def activity():
    name = "admin"
    category = "work"
    time1 = datetime.datetime(2016, 6, 9, 6, 5, 35)
    time2 = datetime.datetime(2016, 6, 9, 11, 23, 2)
    return tracktime.Activity(time1, name, category, time2)


def erase_test_timelog():
    """ remove the TEST_TIMELOG (if it exists) """
    try:
        os.remove(TEST_TIMELOG)
    except OSError:
        pass
    return


def add_test_timelog_entries_directly():
    timelog = TEST_TIMELOG
    """ Create a finished activity """
def store_all_tables_names(self, playlists_dir: str):
    tables_names = self.get_all_tables_name()
    with open(ospathjoin(playlists_dir, 'tables_names.txt'), 'w+') as f:
        f.writelines(tables_names)
def create_image_file(self, labels, **kw):
    if not kw.has_key('overwrite'):
        kw['overwrite'] = False
    if not isinstance(labels, list):
        labels = [labels]
    self.outputfiles = []
    self.labels = labels
    self.outputdirname = "ic_" + now()
    cwd = getcwd()
    try:
        mkdir(ospathjoin(cwd, self.outputdirname))
    except OSError:
        # directory exists
        pass
    self.log.log(self, 3, "mkdir=" + self.outputdirname)
    for lbl in self.labels:
        cmd = ['convert', '-verbose']
        outputfilename = get_gif_filename(self.outputdirname, lbl, kw)
        # filename = lbl + "-" + "-".join(map(str, kw.values())) + ".gif"
        # outputfilename = ospathjoin(self.outputdirname, filename)
        if os_file_exists(outputfilename) == False or kw['overwrite'] == True:
            # settings go before the input label/file
            for s in settings:
                if kw.has_key(s):
                    cmd = cmd + ["-" + s, str(kw[s])]
            cmd.append(labelstr_get(lbl))
            # image operators come afterwards
            for im in image_operator:
                if kw.has_key(im):
                    cmd = cmd + ["-" + im, str(kw[im])]
            cmd.append(outputfilename)
            p = process_start(cmd)
            status = parse_convert_stdout(p, lbl)
            if status[0] == 0:
                self.log.log(self, 3, "created image=" + outputfilename, "status", " ".join(map(str, status)))
            else:
                self.log.log(self, 3, "failed", "status=", " ".join(map(str, status)))
                raise Exception("ImageCreate failure", status)
        else:
            self.log.log(self, 3, "reused image=" + outputfilename)
        self.outputfiles.append(outputfilename)
    return self.outputfiles
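# Sketch only (assumed behaviour, not verified against the original project, and assuming
# labelstr_get() yields an ImageMagick 'label:' style input): for keyword arguments such
# as pointsize=32, negate='' the loop above would assemble a convert command roughly like
#
#   convert -verbose -pointsize 32 <label input> -negate <output>.gif
#
# i.e. settings are applied before the input and image operators after it, which is the
# argument order ImageMagick expects.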
def run_stage(stage, params=None):
    """
    Main program logic
    As many stages share a lot of similar functionality, they are interleaved
    using the 'stage' parameter as a selector
    Stages should be able to run independently for testing or admin convenience
    """
    ensure_hdfs()
    if stage == stage_1:
        # Stage 1 returns a result and an ID for the job and has no job list
        target = params
        # TODO: Validate passed file target(s) further, for ex trailing slashes or actually a directory in arg parse
        job = str(uuid4())
        log.debug("Generated uuid4 [{0}] for job identification".format(job))
        persist_job_info(job, 'master', stage, status_init)
        persist_job_info(job, 'data_status', stage, status_init)
        holding_pen_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job, 'data')
        source_path, source_filename = ospathsplit(target)
        expected_target_real_path = ospathjoin(holding_pen_path, source_filename)
        try:
            # TODO: Update to handle list of input files instead of single file as a string
            target_details = hdfs.status(target)
            if target_details['type'] == u'FILE':
                # We need to ensure the directory is created, or the rename command will dump the data into the file
                hdfs.makedirs(holding_pen_path)
                # Using the HDFS module's rename function to move the target files to test permissions
                log.debug("Moving target file [{0}] to shredder holding pen [{1}]".format(target, holding_pen_path))
                # TODO: Do an are-you-sure, then return status_skip if they don't accept
                hdfs.rename(target, holding_pen_path)
                # TODO: Write more sanity checks for ingest process
                persist_job_info(job, "data_file_list", stage_1, expected_target_real_path)
                log.debug("Job [{0}] prepared, exiting with success".format(job))
                persist_job_info(job, 'master', stage, status_success)
                persist_job_info(job, 'data_status', stage, status_success)
                return status_success, job
            else:
                log.critical("Target is not valid, type returned was [{0}]".format(target_details['type']))
                persist_job_info(job, 'master', stage, status_fail)
                persist_job_info(job, 'data_status', stage, status_fail)
                return status_fail, job
        except HdfsError as e:
            persist_job_info(job, 'master', stage, status_fail)
            persist_job_info(job, 'data_status', stage, status_fail)
            log.critical("Ingestion failed for file [{0}] for job [{1}] with details: {2}"
                         .format(target, job, e))
            return status_fail, job
    elif stage in [stage_2, stage_3, stage_4, stage_5, stage_6]:
        # stages 2 - 6 operate from an active job list predicated by success of the last master stage
        worker = get_worker_identity()
        job_list = get_jobs(stage)
        log.info("Worker [{0}] found [{1}] jobs for stage [{2}]".format(worker, len(job_list), stage))
        if len(job_list) > 0:
            for job in job_list:
                if stage in [stage_2, stage_4, stage_6]:
                    # Leader Jobs for stages 2, 4, and 6
                    # We use the absence of a leader_result to control activity within leader tasks
                    leader_result = None
                    # Worker may not yet have status file initialised for s2 of job
                    worker_status = (retrieve_job_info(job, "worker_" + worker + "_status", strict=False))
                    # TODO: Move worker state validation to a separate function returning a t/f against worker/stage
                    if (
                        (worker_status is None and stage != stage_2) or
                        (worker_status is not None and worker_status not in [
                            stage_3 + "-" + status_success,
                            stage_3 + "-" + status_skip,
                            stage_4 + "-" + status_task_timeout,
                            stage_5 + "-" + status_success,
                            stage_5 + "-" + status_skip,
                            stage_6 + "-" + status_task_timeout,
                        ])):
                        log.critical(
                            "Worker [{0}] is in status [{1}] for job [{2}], which is not valid to be [{3}] leader."
                            .format(worker, worker_status, job, stage)
                        )
                        leader_result = status_fail
                    persist_job_info(job, 'master', stage, status_init)
                    persist_job_info(job, "worker_" + worker + "_status", stage, status_init)
                    ensure_zk()
                    lease_path = conf.ZOOKEEPER['PATH'] + job
                    lease = zk.NonBlockingLease(
                        path=lease_path,
                        duration=dttd(minutes=conf.LEADER_WAIT),
                        identifier="Worker [{0}] running stage [{1}]".format(worker, stage)
                    )
                    if not lease:
                        leader_result = status_skip
                    else:
                        while lease:
                            while leader_result is None:
                                if zk.state != KazooState.CONNECTED:
                                    log.critical("ZooKeeper disconnected from worker [{0}] during stage [{1}] of job"
                                                 "[{2}], expiring activity"
                                                 .format(worker, stage, job))
                                    leader_result = status_task_timeout
                                persist_job_info(job, "worker_" + worker + "_status", stage, status_is_leader)
                                if stage == stage_2:
                                    target = retrieve_job_info(job, "data_file_list")
                                    master_shard_dict = {}
                                    fsck_iter = run_shell_command(
                                        ["hdfs", "fsck", target, "-files", "-blocks", "-locations"]
                                    )
                                    master_shard_dict.update(parse_fsck_iter(fsck_iter))
                                    target_workers = master_shard_dict.keys()
                                    for this_worker in target_workers:
                                        worker_shard_dict = {}
                                        for shard_file in master_shard_dict[this_worker]:
                                            worker_shard_dict[shard_file] = status_no_init
                                        persist_job_info(
                                            job, "worker_" + worker + "_source_shard_dict", stage, worker_shard_dict
                                        )
                                    persist_job_info(job, "worker_list", stage, target_workers)
                                    leader_result = status_success
                                elif stage in [stage_4, stage_6]:
                                    worker_list = retrieve_job_info(job, "worker_list")
                                    wait = True
                                    while wait is True:
                                        # TODO: Do stuff to validate count and expected names of workers are all correct
                                        nodes_finished = True
                                        for node in worker_list:
                                            node_stage, node_status = (
                                                retrieve_job_info(job, "worker_" + node + "_status")).split("-")
                                            if (
                                                node_status == status_fail or                  # some node failed something
                                                stage == stage_4 and node_stage != stage_3 or  # bad stage combo
                                                stage == stage_6 and node_stage != stage_5     # stage combo breaker!
                                            ):
                                                # This should crash the outer while loop to fail this process
                                                leader_result = status_fail
                                            elif node_status not in [status_success, status_skip]:
                                                nodes_finished = False
                                        if nodes_finished is True:
                                            wait = False
                                        else:
                                            sleep(60 * conf.WORKER_WAIT)
                                    else:
                                        # We only stop 'wait'ing to start Stage 4/6 if all workers report success
                                        # before the leader lease times out
                                        persist_job_info(job, 'master', stage, status_init)
                                        if stage == stage_4:
                                            persist_job_info(job, 'data_status', stage, status_init)
                                            # TODO: Handle multiple files instead of a single file as string
                                            # TODO: Validate against fresh blocklist in case of changes?
                                            delete_target = retrieve_job_info(job, "data_file_list")
                                            delete_cmd_result = next(
                                                run_shell_command(['hdfs', 'dfs', '-rm', '-skipTrash', delete_target])
                                            )
                                            if "Deleted" in delete_cmd_result:
                                                persist_job_info(job, 'data_status', stage, status_success)
                                                leader_result = status_success
                                            else:
                                                log.critical(
                                                    "Deletion of file from HDFS returned bad result of [{0}], bailing"
                                                    .format(delete_cmd_result))
                                                persist_job_info(job, 'data_status', stage, status_fail)
                                                leader_result = status_fail
                                        elif stage == stage_6:
                                            # All workers have completed shredding, shut down job and clean up
                                            # TODO: Test that job completed as expected
                                            leader_result = status_success
                                else:
                                    raise StandardError("Bad stage passed to run_stage")
                            lease = False
                    if leader_result is None or leader_result == status_task_timeout:
                        log.warning(
                            "Worker [{0}] timed out on stage [{1}] leader task, "
                            "resetting status for another worker attempt"
                            .format(worker, stage))
                        persist_job_info(job, "worker_" + worker + "_status", stage, status_task_timeout)
                        persist_job_info(job, 'master', stage, status_task_timeout)
                    elif leader_result in [status_success, status_fail]:
                        # Cleanup lease
                        # TODO: Test if this breaks when the worker test says the worker is in a bad state
                        _ = zk.NonBlockingLease(
                            path=lease_path,
                            duration=dttd(seconds=1),
                            identifier="Worker [{0}] running stage [{1}]".format(worker, stage)
                        )
                        sleep(2)
                        persist_job_info(job, "worker_" + worker + "_status", stage, leader_result)
                        persist_job_info(job, 'master', stage, leader_result)
                    elif leader_result == status_skip:
                        persist_job_info(job, "worker_" + worker + "_status", stage, status_skip)
                    else:
                        raise StandardError("Bad leader_result returned from ZooKeeper wrapper")
                elif stage in [stage_3, stage_5]:
                    # Distributed worker jobs for stage 3 and 5
                    persist_job_info(job, "worker_" + worker + "_status", stage, status_init)
                    if stage == stage_3:
                        targets_dict = retrieve_job_info(job, "worker_" + worker + "_source_shard_dict", strict=False)
                        # allowing for restart of job where shard linking was partially completed.
                        linked_shard_dict = retrieve_job_info(
                            job, "worker_" + worker + "_linked_shard_dict", strict=False
                        )
                        if linked_shard_dict is None:
                            linked_shard_dict = {}
                    elif stage == stage_5:
                        targets_dict = retrieve_job_info(job, "worker_" + worker + "_linked_shard_dict", strict=False)
                    else:
                        raise StandardError("Bad code pathway")
                    if targets_dict is None:
                        log.debug("Worker [{0}] found no shard list for stage [{1}] in job [{2}]"
                                  .format(worker, stage, job))
                        persist_job_info(job, "worker_" + worker + "_status", stage, status_skip)
                    else:
                        for shard in targets_dict:
                            if targets_dict[shard] in [status_no_init, status_init]:
                                targets_dict[shard] = status_init
                                if stage == stage_3:
                                    shard_file_path = find_shard(shard)
                                    shard_file_mount = find_mount_point(shard_file_path)
                                    this_mount_shred_dir = ospathjoin(shard_file_mount, conf.LINUXFS_SHRED_PATH, job)
                                    linked_shard_path = ospathjoin(this_mount_shred_dir, shard)
                                    try:
                                        if not exists(this_mount_shred_dir):
                                            # apparently the exists_ok flag is only in Python2.7+
                                            makedirs(this_mount_shred_dir)
                                        link(shard_file_path, linked_shard_path)
                                        linked_shard_dict[linked_shard_path] = status_no_init
                                        targets_dict[shard] = status_success
                                    except OSError as e:
                                        log.critical("Failed to link shard file [{0}] at loc [{1}] to shred loc [{2}]"
                                                     .format(shard, shard_file_path, linked_shard_path))
                                        targets_dict[shard] = status_fail
                                elif stage == stage_5:
                                    # Shred returns 0 on success and a 'failed' message on error
                                    # run_shell_command handles this behavior for us
                                    # TODO: Insert final sanity check before shredding files
                                    shred_result = run_shell_command(
                                        ['shred', '-n', str(conf.SHRED_COUNT), '-z', '-u', shard],
                                        return_iter=False
                                    )
                                    if shred_result is not None:
                                        log.critical("Worker [{0}] failed to shred shard [{1}] with error: {2}"
                                                     .format(worker, shard, shred_result))
                                        targets_dict[shard] = status_fail
                                    else:
                                        targets_dict[shard] = status_success
                            elif targets_dict[shard] == status_success:
                                # Already done, therefore skip
                                pass
                            else:
                                raise StandardError(
                                    "Shard control for worker [{0}] on job [{1}] in unexpected state: [{2}]"
                                    .format(worker, job, dumps(targets_dict))
                                )
                        if stage == stage_3:
                            persist_job_info(job, "worker_" + worker + "_source_shard_dict", stage, targets_dict)
                            persist_job_info(job, "worker_" + worker + "_linked_shard_dict", stage, linked_shard_dict)
                        if stage == stage_5:
                            persist_job_info(job, "worker_" + worker + "_linked_shard_dict", stage, targets_dict)
                        # sanity test if task is completed successfully
                        target_status = []
                        for shard in targets_dict:
                            target_status.append(targets_dict[shard])
                        if len(set(target_status)) == 1 and status_success in set(target_status):
                            persist_job_info(job, "worker_" + worker + "_status", stage, status_success)
                        else:
                            persist_job_info(job, "worker_" + worker + "_status", stage, status_fail)
                else:
                    # Shouldn't be able to get here
                    raise StandardError("Bad stage definition passed to run_stage: {0}".format(stage))
            # Now all jobs for stage have run, check all jobs completed successfully before returning
            for job in job_list:
                if stage in [stage_2, stage_4, stage_6]:
                    component = "master"
                else:
                    # must be stage 3 or 5
                    component = "worker_" + worker + "_status"
                job_status = (retrieve_job_info(job, component)).split("-")[1]
                if job_status not in [status_success, status_skip]:
                    log.critical("Worker [{0}] failed or timed out one or more of [{1}] jobs for stage [{2}]"
                                 .format(worker, len(job_list), stage))
                    return status_fail
            log.info("Worker [{0}] found and processed [{1}] jobs for stage [{2}]"
                     .format(worker, len(job_list), stage))
            return status_success
        else:
            # No jobs found for this stage/worker
            return status_skip
    else:
        raise StandardError("Bad stage definition passed to run_stage: {0}".format(stage))
def __init__(self):
    conn = sqlite3.connect(ospathjoin(self.db_dir, 'mixxxdb.sqlite'))
    self.cur = conn.cursor()
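# Usage sketch (assumptions: this __init__ belongs to the SQLHandler class referenced in
# another snippet above, db_dir is a class attribute pointing at the Mixxx config
# directory, and the output directory and table name below are hypothetical):
#
#   handler = SQLHandler()
#   handler.store_all_tables_names("/tmp/playlists")           # writes tables_names.txt
#   handler.store_table_fields("library", "/tmp/playlists")    # writes library.txt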
def clear_test_jobs():
    shred.log.info("Removing test jobs")
    rmdir_cmd = ["hdfs", "dfs", "-rm", "-f", "-r", "-skipTrash", ospathjoin(shred.conf.HDFS_SHRED_PATH)]
    rmdir_iter = shred.run_shell_command(rmdir_cmd)
    for line in rmdir_iter:
        shred.log.info(line)
# For now I will take a database from my local computer
f = NamedTemporaryFile(suffix="BaseValidador", delete=False)
myHostname = "162.243.165.69"
myUsername = "******"
myPassword = "******"
filename = "BaseValidador.dta"
opts = pysftpCnOpts()
opts.hostkeys = None
with pysftpConnection(host=myHostname, username=myUsername, password=myPassword, cnopts=opts) as sftp:
    remoteFilePath = '/root/CMD/EOD_Call/bases/validador/' + filename
    localFilePath = ospathjoin(f.name)
    sftp.get(remoteFilePath, localFilePath)
sys.path.append(f.name)
vps = pd.read_stata(f, convert_categoricals=False)

##################################################################################
# Stage 4: Merge the VPS database into SQL
##################################################################################
#############
### VPS
#############
# We change the format of 'orden'
vps['orden'] = vps['orden'].astype('str').str[:-2].astype('float64')
def test_dot_relative(self):
    fn = "my.txt"
    self.assertEqual(ospathjoin(self.cwd, fn), fully_qualify_filename("./" + fn))
Description: Track time spent at various activities.
Author: Andrew Mattheisen
"""
from __future__ import print_function
import datetime
import argparse
from sys import argv
import fileinput
from os.path import expanduser
from os.path import join as ospathjoin

VERSION = "0.0"
TIMELOG = ospathjoin(expanduser("~"), "timelog.txt")
DATETIMEFORMAT = "%Y-%m-%dT%H:%M:%S"
DAYFORMAT = "%Y-%m-%d"
INPROGRESS = "none"
DEFAULT_CATEGORY = "general"
ACTIVITY_DAY_HEADER = """= TRACKTIME REPORT FOR {weekday:<10} {day} =
Start - End (Duration) | Activity@Category
---------------------------+------------------"""


# MODELS
class Activity():
    """ An Activity is a task (e.g. sweep floor) with a start datetime,
    category, and end datetime. """
    def __init__(
            self, starttime, name, category=DEFAULT_CATEGORY,