def process_file( enable_progress_report, bucket_name, prefix, k, src_filename, p_dst, enable_delete, propagate_error, write_done ):
    """Upload one local file to S3, then delete or move the source file.

    Parameters:
      enable_progress_report: when True, log an info line for each step
      bucket_name, prefix:    S3 destination coordinates (for the report ctx)
      k:                      boto S3 Key object, already bound to the target key
      src_filename:           local path of the file to upload
      p_dst:                  destination directory for the 'move' mode (ignored when deleting)
      enable_delete:          True => delete the source after upload; False => move it to p_dst
      propagate_error:        when True, failures are report()'ed downstream
      write_done:             when True, write a '.done' marker via do_write_done after upload

    Returns True when the upload succeeded (regardless of delete/move outcome),
    False when the upload itself failed.
    """
    uploaded = False
    ctx = {"src": src_filename, "key": k.name, "bucket": bucket_name, "prefix": prefix}

    # 1) Upload to S3
    try:
        k.set_contents_from_filename(src_filename)
        report(ctx, {"code": "ok", "kind": "upload"})
        if enable_progress_report:
            logging.info("progress: uploaded file %s" % src_filename)
        uploaded = True
        if write_done:
            do_write_done(src_filename)
    # FIX: was a bare 'except:' which also swallowed KeyboardInterrupt/SystemExit,
    # making the surrounding daemon loop impossible to interrupt during an upload.
    except Exception:
        if propagate_error:
            report(ctx, {"code": "error", "kind": "upload"})
        return False

    # 2a) Delete
    if enable_delete:
        code, msg = rm(src_filename)
        if not code.startswith("ok"):
            logging.debug("Error deleting: %s (%s)" % (src_filename, msg))
            # without error propagation there is nobody to tell: bail out early
            if not propagate_error:
                return True
        else:
            if enable_progress_report:
                logging.info("progress: deleted file %s" % src_filename)
        report(ctx, {"code": code, "kind": "delete"})

    # 2b) Move
    else:
        bname = os.path.basename(src_filename)
        dst_filename = os.path.join(p_dst, bname)
        code, msg = move(src_filename, dst_filename)
        if not code.startswith("ok"):
            ## 1 last chance... try recreating the dst directory for next run...
            mkdir_p(p_dst)
            logging.debug("Error moving: %s ==> %s (%s)" % (src_filename, dst_filename, msg))
            if not propagate_error:
                return True
        else:
            if enable_progress_report:
                logging.info("progress: moved file %s ==> %s" % (src_filename, dst_filename))
        report(ctx, {"code": code, "kind": "move", "dst": dst_filename})

    return uploaded
def run(source_path=None, move_path=None, check_path=None, batch_size=5, polling_interval=None, enable_delete=False, **_):
    """Daemon loop: stream root files of 'source_path' through process(), then
    move each processed file to 'move_path' OR delete it ('enable_delete').

    - 'check_path': optional gating path; processing only happens while it exists.
    - 'batch_size': max number of files handled per polling cycle.
    - Exits when the parent process disappears (re-parenting detected via getppid).
    Raises Exception on unusable source/move paths.
    """
    if check_path is not None:
        # check_transition() is presumably a coroutine/generator used via .send()
        # to detect up/down transitions of the check path -- TODO confirm
        ct=check_transition()

    # -d (delete) and -mp (move path) are mutually exclusive modes
    if enable_delete and move_path is not None:
        raise Exception("Options '-mp' and '-d' are mutually exclusive")

    code, rp=resolve_path(source_path)
    if not code.startswith("ok"):
        raise Exception("can't resolve source path '%s'" % source_path)
    source_path=rp

    if move_path is not None:
        code, rp=resolve_path(move_path)
        if not code.startswith("ok"):
            raise Exception("can't resolve 'move_path' '%s'" % move_path)
        move_path=rp
        logging.info("Creating (if necessary) 'move' path: %s" % move_path)
        code, msg=mkdir_p(move_path)
        if not code.startswith("ok"):
            raise Exception("Can't create move path '%s': %s" % (move_path, str(msg)))
        logging.info("Checking if 'move' directory is writable")
        code, msg=can_write(move_path)
        if not code.startswith("ok"):
            raise Exception("Can't write to 'move' directory")

    # files we could not write to (thus could not move/delete) -- skipped forever after
    to_skip=[]

    logging.info("Process pid: %s" % os.getpid())
    ppid=os.getppid()
    logging.info("Parent pid : %s" % ppid)
    logging.info("Starting loop...")
    while True:
        # a changed parent pid means our original parent died: shut down
        if os.getppid()!=ppid:
            logging.warning("Parent terminated... exiting")
            break

        if check_path is not None:
            try:
                exists=os.path.exists(check_path)
            except:
                exists=False
            # 'tr' marks a state transition; only log on transitions to avoid spam
            maybe_tr, _=ct.send(exists)
            if maybe_tr=="tr" and exists:
                logging.info("Check path: passed")
            if maybe_tr=="tr" and not exists:
                logging.info("Check path: failed - skipping")
        else:
            ## fake 'exists'
            exists=True

        if exists:
            code, files=get_root_files(source_path)
            if not code.startswith("ok"):
                logging.error("Can't get root files from %s" % source_path)
            else:
                ###############################################################
                files=files[:batch_size]
                try:
                    for src_file in files:
                        if src_file in to_skip:
                            continue
                        # pre-check writability: we must be able to move/delete afterwards
                        code, _=can_write(src_file)
                        if not code.startswith("ok"):
                            to_skip.append(src_file)
                            logging.error("Would not be able to move/delete source file '%s'... skipping streaming" % src_file)
                            continue
                        dst_file=None
                        if move_path is not None:
                            bn=os.path.basename(src_file)
                            dst_file=os.path.join(move_path, bn)
                        code, maybe_error=process(src_file, dst_file, enable_delete)
                        if not code.startswith("ok"):
                            to_skip.append(src_file)
                            logging.warning("Problem processing file '%s': %s" % (src_file, maybe_error))
                # BrokenPipe (project-defined exception, presumably) and ctrl-c
                # must bubble up; anything else is logged and the loop continues
                except BrokenPipe:
                    raise
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    logging.error("processing file '%s': %s" % (src_file, str(e)))
                ###############################################################

        logging.debug("...sleeping for %s seconds" % polling_interval)
        sleep(polling_interval)
def run(primary_path=None, compare_path=None, dest_path=None, status_filename=None, check_path=None, just_basename=None, topic_name=None, exts=None, wait_status=None, polling_interval=None, just_zppp=None, just_ppzp=None, just_com=None, **_):
    """Daemon loop: periodically compare the root files of 'primary_path' and
    'compare_path' (via maybe_process / maybe_process_ok) and emit the diff.

    - 'wait_status': when set, processing is gated by a status file named
      'status_filename' inside the primary path.
    - 'check_path': optional gating path; processing only happens while it exists.
    - 'just_zppp'/'just_ppzp'/'just_com': restrict output to one diff list.
    - Exits when the parent process disappears.
    """
    if check_path is not None:
        # coroutine used via .send() to detect check-path transitions -- TODO confirm
        ct=check_transition()

    if dest_path:
        code, dest_path=resolve_path(dest_path)
        if not code.startswith("ok"):
            raise Exception("can't destination path '%s'" % dest_path)
        logging.info("Creating (if necessary) destination path: %s" % dest_path)
        code, msg=mkdir_p(dest_path)
        if code!="ok":
            raise Exception("Can't create path: %s" % dest_path)

    code, primary_path=resolve_path(primary_path)
    if not code.startswith("ok"):
        raise Exception("can't resolve primary path '%s'" % primary_path)
    logging.info("Creating (if necessary) primary path: %s" % primary_path)
    mkdir_p(primary_path)

    code, compare_path=resolve_path(compare_path)
    if not code.startswith("ok"):
        raise Exception("can't resolve compare path '%s'" % compare_path)
    logging.info("Creating (if necessary) compare path: %s" % compare_path)
    mkdir_p(compare_path)

    if wait_status:
        status_path=os.path.join(primary_path, status_filename)
        logging.info("Using status file path: %s" % status_path)
    else:
        status_path=None

    ### context for logging etc.
    # pp_log/zp_log hold rate-limited up/down loggers (info on 'up', warning on 'down')
    ctx={ "just_zppp": just_zppp
         ,"just_ppzp": just_ppzp
         ,"just_com": just_com
         ,"just_list": just_zppp or just_ppzp or just_com
         ,"pp": primary_path
         ,"zp": compare_path
         ,"sp": status_path
         ,"pp_log": {"up": partial(ilog, primary_path), "down": partial(wlog, primary_path)}
         ,"zp_log": {"up": partial(ilog, compare_path), "down": partial(wlog, compare_path)}
         ,"topic_name": topic_name
         ,"exts": exts
         }
    # transition manager: consumed via tm.send() in maybe_process_ok
    ctx["tm"]=transition_manager(ctx)

    ppid=os.getppid()
    logging.info("Process pid: %s" % os.getpid())
    logging.info("Parent pid: %s" % ppid)
    logging.info("Starting loop...")
    while True:
        # a changed parent pid means our original parent died: shut down
        if os.getppid()!=ppid:
            logging.warning("Parent terminated... exiting")
            break

        if check_path is not None:
            try:
                exists=os.path.exists(check_path)
            except:
                exists=False
            # 'tr' marks a state transition; only log on transitions to avoid spam
            maybe_tr, _=ct.send(exists)
            if maybe_tr=="tr" and exists:
                logging.info("Check path: passed")
            if maybe_tr=="tr" and not exists:
                logging.info("Check path: failed - skipping")
        else:
            ## fake 'exists'
            exists=True

        if exists:
            # status file gates processing; missing file defaults to "ok"
            code, msg=check_if_ok(status_path, default="ok")
            maybe_process(ctx, code, msg, primary_path, compare_path, just_basename, dest_path)

        logging.debug("...sleeping for %s seconds" % polling_interval)
        sleep(polling_interval)
def maybe_process_ok(ctx, _ok, _, primary_path, compare_path, just_basename, dest_path):
    """Compute the set difference between the root files of 'primary_path' (pp)
    and 'compare_path' (zp) and emit it as a dict, either through doout()
    (when a 'just_*' restriction is active) or to stdout as JSON.

    '_ok' and '_' are the (ignored) status code/message from check_if_ok.
    When 'just_basename' is set, files are compared by basename without
    extension instead of full relative name.
    """
    if dest_path:
        # start each pass with a clean destination directory
        logging.debug("Emptying dest path: %s" % dest_path)
        rmdir(dest_path)
        code, _msg=mkdir_p(dest_path)
        if code!="ok":
            raise Exception("Can't create path: %s" % dest_path)

    ### log rate limiter helper -- need to update status in context 'ctx'
    doOnTransition(ctx, "status_file.contents", "down", True, None)

    codep, primary_files=get_root_files(primary_path, strip_dirname=True)
    codec, compare_files=get_root_files(compare_path, strip_dirname=True)

    ### output some log info on transitions
    tm=ctx["tm"]
    tm.send(("pp_log", codep=="ok"))
    tm.send(("zp_log", codec=="ok"))

    ### not much to do if either path isn't accessible...
    if not codep.startswith("ok") or not codec.startswith("ok"):
        return

    # optional extension whitelist (filtre presumably builds a predicate -- TODO confirm)
    exts=ctx["exts"]
    if exts is not None:
        primary_files=filter(filtre(exts), primary_files)
        compare_files=filter(filtre(exts), compare_files)

    def _mapper(path):
        # strip directory and extension: compare on bare basenames
        bn=os.path.basename(path)
        return os.path.splitext(bn)[0]

    if just_basename:
        pfiles=map(_mapper, primary_files)
        cfiles=map(_mapper, compare_files)
    else:
        pfiles=primary_files
        cfiles=compare_files

    try:
        setpf=set(pfiles)
        setcf=set(cfiles)
        common=setpf.intersection(setcf)
        # pp-zp: only in primary; zp-pp: only in compare; common: in both
        diff={ "pp": primary_path
              ,"zp": compare_path
              ,"pp-zp": list(setpf-setcf)
              ,"zp-pp": list(setcf-setpf)
              ,"common": list(common)
              }
        topic_name=ctx["topic_name"]
        if topic_name is not None:
            diff["topic"]=topic_name
        if ctx["just_list"]:
            doout(ctx, diff, dest_path)
        else:
            stdoutj(diff)
            stdoutf()
    except Exception, e:
        logging.error("Can't compute diff between paths: %s" % str(e))
def run(source_path=None, dest_path=None, check_path=None, batch_size=5, polling_interval=None, delete_fetch_error=False, **_):
    """Daemon loop: fetch root files of 'source_path' into 'dest_path' via
    process(), while also relaying stdin to stdout between polling cycles.

    - 'check_path': optional gating path; processing only happens while it exists.
    - 'batch_size': max number of files handled per polling cycle.
    - 'delete_fetch_error': passed through to process() -- presumably deletes
      the source on fetch error; TODO confirm against process().
    - Exits when the parent process disappears.
    """
    if check_path is not None:
        # coroutine used via .send() to detect check-path transitions -- TODO confirm
        ct=check_transition()

    logging.info("Creating (if necessary) destination path: %s" % dest_path)
    code, msg=mkdir_p(dest_path)
    if not code.startswith("ok"):
        raise Exception("Can't create destination path '%s': %s" % (dest_path, str(msg)))

    # files we could not write to (thus could not delete) -- skipped forever after
    to_skip=[]

    ppid=os.getppid()
    logging.info("Process pid: %s" % os.getpid())
    logging.info("Parent pid : %s" % ppid)
    logging.info("Starting loop...")
    while True:
        # a changed parent pid means our original parent died: shut down
        if os.getppid()!=ppid:
            logging.warning("Parent terminated... exiting")
            break

        if check_path is not None:
            try:
                exists=os.path.exists(check_path)
            except:
                exists=False
            # 'tr' marks a state transition; only log on transitions to avoid spam
            maybe_tr, _=ct.send(exists)
            if maybe_tr=="tr" and exists:
                logging.info("Check path: passed")
            if maybe_tr=="tr" and not exists:
                logging.info("Check path: failed - skipping")
        else:
            ## fake 'exists'
            exists=True

        if exists:
            code, files=get_root_files(source_path)
            if not code.startswith("ok"):
                logging.error("Can't get root files from %s" % source_path)
                # NOTE(review): 'continue' skips the stdin pass-through below for
                # this cycle, so the loop spins without its select() timeout
                continue
            ###############################################################
            files=files[:batch_size]
            try:
                for src_file in files:
                    if src_file in to_skip:
                        continue
                    # pre-check writability: we must be able to delete afterwards
                    code, _=can_write(src_file)
                    if not code.startswith("ok"):
                        to_skip.append(src_file)
                        logging.error("Would not be able to delete source file '%s'... skipping download" % src_file)
                        continue
                    process(src_file, dest_path, delete_fetch_error)
            # BrokenPipe (project-defined, presumably) must bubble up;
            # anything else is logged and the loop continues
            except BrokenPipe:
                raise
            except Exception, e:
                logging.error("processing file '%s': %s" % (src_file, str(e)))
            ###############################################################

        logging.debug("...waiting for %s seconds (max)" % polling_interval)
        ### Implement a "pass-through" for stdin --> stdout
        ### whilst also handling a maximum time-out
        start_time=time.time()
        while True:
            # wait up to polling_interval for stdin to become readable
            ir, _w, _e=select.select([sys.stdin], [], [], polling_interval)
            if len(ir):
                iline=sys.stdin.readline()
                sys.stdout.write(iline)
            elapsed_time = time.time() - start_time
            if elapsed_time > polling_interval:
                break
def run(bucket_name=None, path_source=None, path_move=None, delete_source=False, polling_interval=60, extd=None, extf=None, **_):
    """Daemon loop: upload each root *directory* of 'path_source' to the S3
    bucket 'bucket_name' via process_dir(), then delete ('delete_source') or
    move ('path_move') the processed sources.

    - 'extd'/'extf': directory/file extension filters passed to
      filter_dirs()/process_dir() -- exact semantics defined elsewhere.
    - Exits when the parent process disappears.
    Raises Exception on bad parameters; raises Warning if the source path
    disappears mid-loop.
    """
    # exactly one of 'delete source' / 'move path' must be chosen
    if not delete_source and path_move is None:
        raise Exception("Options 'delete source' or 'move path' is required")
    if delete_source and path_move is not None:
        raise Exception("Options 'delete source' and 'move path' are mutually exclusive")

    #if args.enable_debug:
    #    logger=logging.getLogger()
    #    logger.setLevel(logging.DEBUG)

    bucket_name=bucket_name.strip()
    path_source=path_source.strip()

    code, p_src=resolve_path(path_source)
    if not code.startswith("ok"):
        raise Exception("Invalid source path: %s" % path_source)
    mkdir_p(p_src)

    if path_move is not None:
        code, path_move=resolve_path(path_move)
        if not code.startswith("ok"):
            raise Exception("Invalid move path: %s" % path_move)
        code,_=mkdir_p(path_move)
        if not code.startswith("ok"):
            raise Exception("Can't create move path: %s" % path_move)

    try:
        conn = boto.connect_s3()
    except:
        ## not much we can do
        ## but actually no remote calls are made
        ## at this point so it should be highly improbable
        raise Exception("Can't 'connect' to S3")

    ###################### BUCKET
    logging.info("Getting/creating bucket (unlimited retries with backoff)")
    def _get_create_bucket():
        return conn.create_bucket(bucket_name)
    bucket=retry(_get_create_bucket)
    logging.info("Got bucket: %s" % bucket_name)
    #############################

    logging.debug("Starting loop...")
    ppid=os.getppid()
    logging.info("Process pid: %s" % os.getpid())
    logging.info("Parent pid: %s" % ppid)
    while True:
        # a changed parent pid means our original parent died: shut down
        if os.getppid()!=ppid:
            logging.warning("Parent terminated... exiting")
            break
        #################################################
        logging.debug("Start processing...")
        code, dirs=get_root_dirs(p_src)
        if not code.startswith("ok"):
            # NOTE(review): raising Warning aborts the whole loop -- presumably
            # intentional (source vanished => nothing left to watch)
            raise Warning("Source path disappeared: %s" % p_src)
        dirs=filter_dirs(extd, dirs)
        for _dir in dirs:
            process_dir(bucket, _dir, delete_source, extf, path_move)
        #####################################################
        logging.debug("...sleeping for %s seconds" % polling_interval)
        sleep(polling_interval)
def run( enable_simulate=False, bucket_name=None, path_source=None, path_moveto=None, path_check=None, num_files=5, enable_delete=False, propagate_error=False, prefix=None, polling_interval=None, only_ext=None, filename_input_full=False, filename_input_regex=None, key_output_format=None, enable_progress_report=False, write_done=False, **_ ): if key_output_format is not None: if filename_input_regex is None: raise Exception("-ifnr and -okf options work in tandem") if filename_input_regex is not None: if key_output_format is None: raise Exception("Input filename regex specified but no output S3 key format specified") logging.info("Compiling input filename regex...") try: ireg = re.compile(filename_input_regex.strip("'")) ofmt = key_output_format.strip("'") except: raise Exception("Can't compile input filename regex pattern") else: ireg = None ofmt = None # if args.enable_debug: # logger=logging.getLogger() # logger.setLevel(logging.DEBUG) bucket_name = bucket_name.strip() path_source = path_source.strip() try: prefix = prefix.strip() except: prefix = None try: path_moveto = path_moveto.strip() except: path_moveto = None if path_check is not None: code, path_check = resolve_path(path_check) if not code.startswith("ok"): logging.warning("path_check '%s' might be in error..." % path_check) ### VALIDATE PARAMETERS if not enable_delete and path_moveto is None: raise Exception("either -d or -m must be used") if enable_delete and path_moveto is not None: raise Exception("-d can't be used with -m") code, p_src = resolve_path(path_source) if not code.startswith("ok"): raise Exception("Invalid source path: %s" % path_source) if path_moveto is not None: code, p_dst = resolve_path(path_moveto) if not code.startswith("ok"): raise Exception("Invalid moveto path: %s" % path_moveto) else: p_dst = None ### wait for 'source' path to be available logging.info("Waiting for source path to be accessible... 
CTRL-c to stop") while True: if os.path.isdir(p_src): break sleep(1) logging.info("* Source path accessible") if path_moveto is not None: logging.info("Creating 'moveto' directory if required") code, _ = mkdir_p(p_dst) if not code.startswith("ok"): raise Exception("Can't create 'moveto' directory: %s" % p_dst) logging.info("* Created moveto directory") if not enable_simulate: try: conn = boto.connect_s3() except: ## not much we can do ## but actually no remote calls are made ## at this point so it should be highly improbable raise Exception("Can't 'connect' to S3") if not enable_simulate: ###################### BUCKET logging.info("Getting/creating bucket (unlimited retries with backoff)") def _get_create_bucket(): return conn.create_bucket(bucket_name) bucket = retry(_get_create_bucket) logging.info("Got bucket") ############################# if enable_simulate: logging.info("Begin simulation...") else: logging.debug("Starting loop...") ppid = os.getppid() logging.info("Process pid: %s" % os.getpid()) logging.info("Parent pid: %s" % ppid) while True: if os.getppid() != ppid: logging.warning("Parent terminated... exiting") break ################################################# _code, path_exists = safe_path_exists(path_check) if path_check is None or path_exists: try: logging.debug("Start processing...") count = 0 gen = gen_walk(p_src, max_files=num_files, only_ext=only_ext) for src_filename in gen: if enable_progress_report: logging.info("Processing file: %s" % src_filename) if write_done: if is_done_file(src_filename): continue try: s3key_name = gen_s3_key(ireg, ofmt, p_src, src_filename, prefix, filename_input_full) except Exception, e: raise Exception( "Error generating S3 key... check your command line parameters... 
use the 'simulate' facility: %s" % e ) if enable_simulate: simulate(src_filename, s3key_name, enable_delete, p_dst) else: k = S3Key(bucket) k.key = s3key_name was_uploaded = process_file( enable_progress_report, bucket_name, prefix, k, src_filename, p_dst, enable_delete, propagate_error, write_done, ) if was_uploaded: count = count + 1 except Exception, e: logging.error("Error processing files...(%s)" % str(e)) else: logging.info() if count > 0: logging.info("Progress> uploaded %s files" % count) ##################################################### logging.debug("...sleeping for %s seconds" % polling_interval) sleep(polling_interval)