Example #1
def process_file(
    enable_progress_report, bucket_name, prefix, k, src_filename, p_dst, enable_delete, propagate_error, write_done
):

    uploaded = False
    ctx = {"src": src_filename, "key": k.name, "bucket": bucket_name, "prefix": prefix}

    # 1) Upload to S3
    try:
        k.set_contents_from_filename(src_filename)
        report(ctx, {"code": "ok", "kind": "upload"})

        if enable_progress_report:
            logging.info("progress: uploaded file %s" % src_filename)
            uploaded = True

        if write_done:
            do_write_done(src_filename)

    except:
        if propagate_error:
            report(ctx, {"code": "error", "kind": "upload"})
        return False

    # 2a) Delete
    if enable_delete:
        code, msg = rm(src_filename)
        if not code.startswith("ok"):
            logging.debug("Error deleting: %s (%s)" % (src_filename, msg))
            if not propagate_error:
                return True
        else:
            if enable_progress_report:
                logging.info("progress: deleted file %s" % src_filename)

        report(ctx, {"code": code, "kind": "delete"})

    # 2b) Move
    else:
        bname = os.path.basename(src_filename)
        dst_filename = os.path.join(p_dst, bname)
        code, msg = move(src_filename, dst_filename)
        if not code.startswith("ok"):
            ## 1 last chance... try recreating the dst directory for next run...
            mkdir_p(p_dst)
            logging.debug("Error moving: %s ==> %s  (%s)" % (src_filename, dst_filename, msg))
            if not propagate_error:
                return True
        else:
            if enable_progress_report:
                logging.info("progress: moved file %s ==> %s" % (src_filename, dst_filename))

        report(ctx, {"code": code, "kind": "move", "dst": dst_filename})

    return uploaded
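
A minimal sketch of how process_file might be invoked, assuming the boto2 API (boto.connect_s3, boto.s3.key.Key); the bucket, key and path names below are purely illustrative, and the helpers referenced inside process_file (report, rm, move, mkdir_p, do_write_done) are expected to come from the surrounding module.

# Hypothetical driver for process_file -- illustrative names only.
import boto
from boto.s3.key import Key as S3Key

conn = boto.connect_s3()
bucket = conn.create_bucket("example-upload-bucket")   # assumed bucket name

k = S3Key(bucket)
k.key = "incoming/data.csv"                            # assumed S3 key name

ok = process_file(
    enable_progress_report=True,
    bucket_name="example-upload-bucket",
    prefix="incoming",
    k=k,
    src_filename="/tmp/outbox/data.csv",
    p_dst="/tmp/done",        # destination directory; only used when enable_delete is False
    enable_delete=False,
    propagate_error=True,
    write_done=False,
)
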
Example #2
def run(source_path=None, move_path=None, check_path=None, 
        batch_size=5, 
        polling_interval=None, enable_delete=False
        ,**_):

    if check_path is not None:
        ct=check_transition()

    if enable_delete and move_path is not None:
        raise Exception("Options '-mp' and '-d' are mutually exclusive")
        
    code, rp=resolve_path(source_path)
    if not code.startswith("ok"):
        raise Exception("can't resolve source path '%s'" % source_path)
    source_path=rp
    
    if move_path is not None:
        code, rp=resolve_path(move_path)
        if not code.startswith("ok"):
            raise Exception("can't resolve 'move_path' '%s'" % move_path)
        move_path=rp

        logging.info("Creating (if necessary) 'move' path: %s" % move_path)
        code, msg=mkdir_p(move_path)
        if not code.startswith("ok"):
            raise Exception("Can't create move path '%s': %s" % (move_path, str(msg)))
            
        logging.info("Checking if 'move' directory is writable")
        code, msg=can_write(move_path)
        if not code.startswith("ok"):
            raise Exception("Can't write to 'move' directory")
            
    to_skip=[]
    logging.info("Process pid: %s" % os.getpid())
    ppid=os.getppid()
    logging.info("Parent pid : %s" % ppid)
    logging.info("Starting loop...")
    while True:
        if os.getppid()!=ppid:
            logging.warning("Parent terminated... exiting")
            break
        
        if check_path is not None:
            try:
                exists=os.path.exists(check_path)
            except Exception:
                exists=False
            
            maybe_tr, _=ct.send(exists)
            if maybe_tr=="tr" and exists:
                logging.info("Check path: passed")
            if maybe_tr=="tr" and not exists:
                logging.info("Check path: failed - skipping")
        else:
            ## fake 'exists'
            exists=True

        if exists:        
            code, files=get_root_files(source_path)
            if not code.startswith("ok"):
                logging.error("Can't get root files from %s" % source_path)
            else:                
                ###############################################################
                files=files[:batch_size]
                try:
                    for src_file in files:
                        
                        if src_file in to_skip:
                            continue
                        
                        code, _=can_write(src_file)
                        if not code.startswith("ok"):
                            to_skip.append(src_file)
                            logging.error("Would not be able to move/delete source file '%s'... skipping streaming" % src_file)
                            continue
        
                        dst_file=None                
                        if move_path is not None:
                            bn=os.path.basename(src_file)
                            dst_file=os.path.join(move_path, bn)
                        
                        code, maybe_error=process(src_file, dst_file, enable_delete)
                        if not code.startswith("ok"):
                            to_skip.append(src_file)
                            logging.warning("Problem processing file '%s': %s" % (src_file, maybe_error))
                except BrokenPipe:
                    raise
                except KeyboardInterrupt:
                    raise
                except Exception as e:
                    logging.error("processing file '%s': %s" % (src_file, str(e)))
                ###############################################################            
        
        
        logging.debug("...sleeping for %s seconds" % polling_interval)
        sleep(polling_interval)
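
check_transition() is used above but not shown. A minimal sketch of a coroutine with the behaviour the loop relies on is given below: each .send(flag) is assumed to return ("tr", flag) when the boolean flips and ("ok", flag) otherwise. This is an assumption for illustration, not the original helper.

# Assumed stand-in for check_transition(); illustrative only.
def check_transition_sketch():
    def _gen():
        result = ("ok", None)
        previous = None
        while True:
            current = yield result          # receive the latest boolean via .send()
            result = ("tr" if current != previous else "ok", current)
            previous = current
    g = _gen()
    next(g)                                 # prime the coroutine before the first .send()
    return g

# ct = check_transition_sketch()
# ct.send(True)    # -> ("tr", True)   first observation counts as a transition
# ct.send(True)    # -> ("ok", True)
# ct.send(False)   # -> ("tr", False)
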
Example #3
def run(primary_path=None, compare_path=None, 
        dest_path=None,
        status_filename=None, check_path=None
        ,just_basename=None
        ,topic_name=None
        ,exts=None
        ,wait_status=None, polling_interval=None
        ,just_zppp=None, just_ppzp=None, just_com=None
        ,**_):

    if check_path is not None:
        ct=check_transition()

    if dest_path:
        code, dest_path=resolve_path(dest_path)
        if not code.startswith("ok"):
            raise Exception("can't destination path '%s'" % dest_path)
        
        logging.info("Creating (if necessary) destination path: %s" % dest_path)
        code, msg=mkdir_p(dest_path)
        if code!="ok":
            raise Exception("Can't create path: %s" % dest_path)

    code, primary_path=resolve_path(primary_path)
    if not code.startswith("ok"):
        raise Exception("can't resolve primary path '%s'" % primary_path)
    
    logging.info("Creating (if necessary) primary path: %s" % primary_path)
    mkdir_p(primary_path)
    
    code, compare_path=resolve_path(compare_path)
    if not code.startswith("ok"):
        raise Exception("can't resolve compare path '%s'" % compare_path)

    logging.info("Creating (if necessary) compare path: %s" % compare_path)
    mkdir_p(compare_path)
            
    if wait_status:
        status_path=os.path.join(primary_path, status_filename)
        logging.info("Using status file path: %s" % status_path)
    else: 
        status_path=None

    ### context for logging etc.
    ctx={
          "just_zppp": just_zppp
         ,"just_ppzp": just_ppzp
         ,"just_com":  just_com
         ,"just_list": just_zppp or just_ppzp or just_com
         
         ,"pp": primary_path
         ,"zp": compare_path
         ,"sp": status_path
         
         ,"pp_log" :{"up":    partial(ilog, primary_path)
                     ,"down":  partial(wlog, primary_path)
                     }
         ,"zp_log" :{"up":    partial(ilog, compare_path)
                     ,"down":  partial(wlog, compare_path)
                     }
         ,"topic_name": topic_name
         ,"exts": exts
         }

    ctx["tm"]=transition_manager(ctx)
    
    ppid=os.getppid()        
    logging.info("Process pid: %s" % os.getpid())
    logging.info("Parent  pid: %s" % ppid)
    logging.info("Starting loop...")
    while True:
        if os.getppid()!=ppid:
            logging.warning("Parent terminated... exiting")
            break
            
        if check_path is not None:
            try:
                exists=os.path.exists(check_path)
            except Exception:
                exists=False
            
            maybe_tr, _=ct.send(exists)
            if maybe_tr=="tr" and exists:
                logging.info("Check path: passed")
            if maybe_tr=="tr" and not exists:
                logging.info("Check path: failed - skipping")
        else:
            ## fake 'exists'
            exists=True

        if exists:            
            code, msg=check_if_ok(status_path, default="ok")
            maybe_process(ctx, code, msg, primary_path, compare_path, just_basename, dest_path)
        
        logging.debug("...sleeping for %s seconds" % polling_interval)
        sleep(polling_interval)
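
The pp_log/zp_log entries in the context above pre-bind the directory argument with functools.partial. Since ilog and wlog are not shown in these examples, the snippet below uses a hypothetical stand-in purely to illustrate the binding.

import logging
from functools import partial

def ilog_sketch(path, msg):
    # Hypothetical stand-in for the real 'ilog' helper (not shown above).
    logging.info("%s: %s" % (path, msg))

up = partial(ilog_sketch, "/data/primary")   # same shape as partial(ilog, primary_path)
up("directory is accessible")                # logs "/data/primary: directory is accessible"
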
Example #4
def maybe_process_ok(ctx, _ok, _, primary_path, compare_path, just_basename, dest_path):
    
    if dest_path:
        logging.debug("Emptying dest path: %s" % dest_path)
        rmdir(dest_path)
        
        code, _msg=mkdir_p(dest_path)
        if code!="ok":
            raise Exception("Can't create path: %s" % dest_path)
        
    ### log rate limiter helper -- need to update status in context 'ctx'
    doOnTransition(ctx, "status_file.contents", "down", True, None)
    
    codep, primary_files=get_root_files(primary_path, strip_dirname=True)
    codec, compare_files=get_root_files(compare_path, strip_dirname=True)
    
    ### output some log info on transitions
    tm=ctx["tm"]
    tm.send(("pp_log", codep=="ok"))
    tm.send(("zp_log", codec=="ok"))
    
    ### not much to do if either path isn't accessible...
    if not codep.startswith("ok") or not codec.startswith("ok"):
        return
    
    exts=ctx["exts"]
    if exts is not None:
        primary_files=filter(filtre(exts), primary_files)
        compare_files=filter(filtre(exts), compare_files)
    
    def _mapper(path):
        bn=os.path.basename(path)
        return os.path.splitext(bn)[0]
    
    if just_basename:
        pfiles=map(_mapper, primary_files)
        cfiles=map(_mapper, compare_files)
    else:
        pfiles=primary_files
        cfiles=compare_files
    
    try:
        setpf=set(pfiles)
        setcf=set(cfiles)
        common=setpf.intersection(setcf)
        
        diff={
               "pp":     primary_path
              ,"zp":     compare_path
              ,"pp-zp":  list(setpf-setcf)
              ,"zp-pp":  list(setcf-setpf)
              ,"common": list(common)
              }
        
        topic_name=ctx["topic_name"]
        
        if topic_name is not None:
            diff["topic"]=topic_name
        
        if ctx["just_list"]:
            doout(ctx, diff, dest_path)
        else:
            stdoutj(diff)
            stdoutf()
            
    except Exception as e:
        logging.error("Can't compute diff between paths: %s" % str(e))
Example #5
def run(source_path=None, dest_path=None, check_path=None, 
        batch_size=5, 
        polling_interval=None, delete_fetch_error=False
        ,**_):
    
    if check_path is not None:
        ct=check_transition()
    
    logging.info("Creating (if necessary) destination path: %s" % dest_path)
    code, msg=mkdir_p(dest_path)
    if not code.startswith("ok"):
        raise Exception("Can't create destination path '%s': %s" % (dest_path, str(msg)))
            
    to_skip=[]
    ppid=os.getppid()
    logging.info("Process pid: %s" % os.getpid())
    logging.info("Parent pid : %s" % ppid)
    logging.info("Starting loop...")
    while True:
        if os.getppid()!=ppid:
            logging.warning("Parent terminated... exiting")
            break
        
        if check_path is not None:
            try:
                exists=os.path.exists(check_path)
            except Exception:
                exists=False
            
            maybe_tr, _=ct.send(exists)
            if maybe_tr=="tr" and exists:
                logging.info("Check path: passed")
            if maybe_tr=="tr" and not exists:
                logging.info("Check path: failed - skipping")
        else:
            ## fake 'exists'
            exists=True

        if exists:        
            code, files=get_root_files(source_path)
            if not code.startswith("ok"):
                logging.error("Can't get root files from %s" % source_path)
                files=[]    ## nothing to process; fall through to the wait below instead of busy-looping
            
            ###############################################################
            files=files[:batch_size]
            try:
                for src_file in files:
                    
                    if src_file in to_skip:
                        continue
                    
                    code, _=can_write(src_file)
                    if not code.startswith("ok"):
                        to_skip.append(src_file)
                        logging.error("Would not be able to delete source file '%s'... skipping download" % src_file)
                        continue
                    
                    process(src_file, dest_path, delete_fetch_error)
            except BrokenPipe:
                raise
            except Exception as e:
                logging.error("processing file '%s': %s" % (src_file, str(e)))
            ###############################################################            
        
        
        logging.debug("...waiting for %s seconds (max)" % polling_interval)
        
        ### Implement a "pass-through" for stdin --> stdout
        ###  whilst also handling a maximum time-out
        start_time=time.time()
        while True:
            ir, _w, _e=select.select([sys.stdin], [], [], polling_interval)
            if len(ir):
                iline=sys.stdin.readline()
                sys.stdout.write(iline)
                
            elapsed_time = time.time() - start_time
            if elapsed_time > polling_interval:
                break
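
Note that the closing loop above deliberately replaces the plain sleep() used in the other examples: select.select() waits on stdin with polling_interval as its timeout, so any line arriving on stdin is forwarded to stdout immediately, while the outer loop still wakes up at least once per polling interval to look for new files.
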
Example #6
def run(bucket_name=None, 
        path_source=None, 
        path_move=None,
        delete_source=False,
        polling_interval=60,
        extd=None,
        extf=None
        ,**_):
        

    if not delete_source and path_move is None:
        raise Exception("Options 'delete source' or 'move path' is required")
    
    if delete_source and path_move is not None:
        raise Exception("Options 'delete source' and 'move path' are mutually exclusive")
    
    
    #if args.enable_debug:
    #    logger=logging.getLogger()
    #    logger.setLevel(logging.DEBUG)
    
    bucket_name=bucket_name.strip()
    path_source=path_source.strip()
    
    code, p_src=resolve_path(path_source)
    if not code.startswith("ok"):
        raise Exception("Invalid source path: %s" % path_source)

    mkdir_p(p_src)

    if path_move is not None:
        code, path_move=resolve_path(path_move)
        if not code.startswith("ok"):
            raise Exception("Invalid move path: %s" % path_move)
    
        code,_=mkdir_p(path_move)
        if not code.startswith("ok"):
            raise Exception("Can't create move path: %s" % path_move)
    
    
    try:
        conn = boto.connect_s3()
    except Exception:
        ## not much we can do
        ## but actually no remote calls are made
        ## at this point so it should be highly improbable
        raise Exception("Can't 'connect' to S3")
    
    ###################### BUCKET
    logging.info("Getting/creating bucket (unlimited retries with backoff)")
    def _get_create_bucket():
        return conn.create_bucket(bucket_name)
              
    bucket=retry(_get_create_bucket)
    logging.info("Got bucket: %s" % bucket_name)
    #############################

    logging.debug("Starting loop...")

    ppid=os.getppid()
    logging.info("Process pid: %s" % os.getpid())
    logging.info("Parent pid:  %s" % ppid)
    while True:
        if os.getppid()!=ppid:
            logging.warning("Parent terminated... exiting")
            break
        #################################################

        logging.debug("Start processing...")
        
        code, dirs=get_root_dirs(p_src)
        if not code.startswith("ok"):
            raise Warning("Source path disappeared: %s" % p_src)
        
        dirs=filter_dirs(extd, dirs)
        
        for _dir in dirs:
            process_dir(bucket, _dir, delete_source, extf, path_move)
        

        #####################################################
        logging.debug("...sleeping for %s seconds" % polling_interval)
        sleep(polling_interval)
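
The retry() helper used for bucket creation above is not shown. A minimal sketch of an "unlimited retries with backoff" wrapper matching that description could look like the following; it is an illustrative stand-in, not the original implementation.

import time
import logging

def retry_sketch(func, start_delay=1, max_delay=60):
    # Call func() until it succeeds, sleeping with capped exponential backoff
    # between attempts.  Stand-in for the 'retry' helper referenced above.
    delay = start_delay
    while True:
        try:
            return func()
        except Exception as e:
            logging.warning("retry: %s (sleeping %s seconds)" % (e, delay))
            time.sleep(delay)
            delay = min(delay * 2, max_delay)

# bucket = retry_sketch(_get_create_bucket)
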
Example #7
def run(
    enable_simulate=False,
    bucket_name=None,
    path_source=None,
    path_moveto=None,
    path_check=None,
    num_files=5,
    enable_delete=False,
    propagate_error=False,
    prefix=None,
    polling_interval=None,
    only_ext=None,
    filename_input_full=False,
    filename_input_regex=None,
    key_output_format=None,
    enable_progress_report=False,
    write_done=False,
    **_
):

    if key_output_format is not None:
        if filename_input_regex is None:
            raise Exception("-ifnr and -okf options work in tandem")

    if filename_input_regex is not None:

        if key_output_format is None:
            raise Exception("Input filename regex specified but no output S3 key format specified")

        logging.info("Compiling input filename regex...")
        try:
            ireg = re.compile(filename_input_regex.strip("'"))
            ofmt = key_output_format.strip("'")
        except Exception:
            raise Exception("Can't compile input filename regex pattern")
    else:
        ireg = None
        ofmt = None

    # if args.enable_debug:
    #    logger=logging.getLogger()
    #    logger.setLevel(logging.DEBUG)

    bucket_name = bucket_name.strip()
    path_source = path_source.strip()

    try:
        prefix = prefix.strip()
    except Exception:
        prefix = None

    try:
        path_moveto = path_moveto.strip()
    except Exception:
        path_moveto = None

    if path_check is not None:
        code, path_check = resolve_path(path_check)
        if not code.startswith("ok"):
            logging.warning("path_check '%s' might be in error..." % path_check)

    ### VALIDATE PARAMETERS
    if not enable_delete and path_moveto is None:
        raise Exception("either -d or -m must be used")

    if enable_delete and path_moveto is not None:
        raise Exception("-d can't be used with -m")

    code, p_src = resolve_path(path_source)
    if not code.startswith("ok"):
        raise Exception("Invalid source path: %s" % path_source)

    if path_moveto is not None:
        code, p_dst = resolve_path(path_moveto)
        if not code.startswith("ok"):
            raise Exception("Invalid moveto path: %s" % path_moveto)
    else:
        p_dst = None

    ### wait for 'source' path to be available
    logging.info("Waiting for source path to be accessible... CTRL-c to stop")
    while True:
        if os.path.isdir(p_src):
            break
        sleep(1)
    logging.info("* Source path accessible")

    if path_moveto is not None:
        logging.info("Creating 'moveto' directory if required")
        code, _ = mkdir_p(p_dst)
        if not code.startswith("ok"):
            raise Exception("Can't create 'moveto' directory: %s" % p_dst)
        logging.info("* Created moveto directory")

    if not enable_simulate:
        try:
            conn = boto.connect_s3()
        except Exception:
            ## not much we can do
            ## but actually no remote calls are made
            ## at this point so it should be highly improbable
            raise Exception("Can't 'connect' to S3")

    if not enable_simulate:
        ###################### BUCKET
        logging.info("Getting/creating bucket (unlimited retries with backoff)")

        def _get_create_bucket():
            return conn.create_bucket(bucket_name)

        bucket = retry(_get_create_bucket)
        logging.info("Got bucket")
        #############################

    if enable_simulate:
        logging.info("Begin simulation...")
    else:
        logging.debug("Starting loop...")

    ppid = os.getppid()
    logging.info("Process pid: %s" % os.getpid())
    logging.info("Parent pid:  %s" % ppid)
    while True:
        if os.getppid() != ppid:
            logging.warning("Parent terminated... exiting")
            break
        #################################################

        count = 0
        _code, path_exists = safe_path_exists(path_check)

        if path_check is None or path_exists:
            try:
                logging.debug("Start processing...")
                count = 0
                gen = gen_walk(p_src, max_files=num_files, only_ext=only_ext)

                for src_filename in gen:

                    if enable_progress_report:
                        logging.info("Processing file: %s" % src_filename)

                    if write_done:
                        if is_done_file(src_filename):
                            continue

                    try:
                        s3key_name = gen_s3_key(ireg, ofmt, p_src, src_filename, prefix, filename_input_full)
                    except Exception as e:
                        raise Exception(
                            "Error generating S3 key... check your command line parameters... use the 'simulate' facility: %s"
                            % e
                        )

                    if enable_simulate:
                        simulate(src_filename, s3key_name, enable_delete, p_dst)
                    else:
                        k = S3Key(bucket)
                        k.key = s3key_name
                        was_uploaded = process_file(
                            enable_progress_report,
                            bucket_name,
                            prefix,
                            k,
                            src_filename,
                            p_dst,
                            enable_delete,
                            propagate_error,
                            write_done,
                        )
                        if was_uploaded:
                            count = count + 1

            except Exception as e:
                logging.error("Error processing files...(%s)" % str(e))
        else:
            logging.info("Check path: failed - skipping")

        if count > 0:
            logging.info("Progress> uploaded %s files" % count)

        #####################################################
        logging.debug("...sleeping for %s seconds" % polling_interval)
        sleep(polling_interval)
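
safe_path_exists() is referenced above but not defined in these examples. Based on the (code, value) convention used throughout, a stand-in might look like this; it is an assumption, not the original helper.

import os

def safe_path_exists_sketch(path):
    # Assumed contract: returns ("ok", bool) and never raises, mirroring how
    # the result is consumed above.  Illustrative stand-in only.
    if path is None:
        return "ok", False
    try:
        return "ok", os.path.exists(path)
    except Exception:
        return "error", False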