Example #1
def retrieve_job_info(job, component, strict=True):
    """Retrieves data stored in our HDFS Shred directory"""
    file_content = None
    if component == "master":
        # The master component always updates the state of the job in the master job list
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "jobs", job)
    elif 'worker' in component or 'data' in component:
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job, component)
    else:
        raise ValueError("Invalid option passed to function get_hdfs_file")
    try:
        with hdfs.read(file_path) as reader:
            # expecting all content by this program to be serialised as json
            file_content = reader.read()
    except HdfsError as e:
        if strict:
            raise StandardError("HDFSCli couldn't read a file from path [{0}] with details: {1}"
                                .format(file_path, e))
        else:
            # if not in strict mode then we will return None
            pass
    if file_content:
        get_result = loads(file_content)
        log.debug("Retrieved content [{2}] from component file [{0}] at path [{1}]"
                  .format(component, file_path, get_result))
    else:
        get_result = file_content
    return get_result
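A minimal usage sketch for the function above, assuming it lives in the shred module alongside a configured conf and a connected hdfs client; the job ID and worker component name are placeholders.

# Hypothetical usage; the job ID and worker component name are placeholders.
job_id = "9f1c2d3e-aaaa-bbbb-cccc-000000000001"

# Strict mode raises if the master record cannot be read
master_record = retrieve_job_info(job_id, "master")

# Non-strict mode returns None for a worker status file that may not exist yet
worker_status = retrieve_job_info(job_id, "worker_10.0.0.1_status", strict=False)
if worker_status is None:
    print("worker has not written a status file for this job yet")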
Example #2
def update_test_files():
    shred.log.info("Updating list of test data files")
    global test_files
    filelist_cmd = ["hdfs", "dfs", "-ls", ospathjoin(test_file_path, test_file_dir)]
    filelist_iter = shred.run_shell_command(filelist_cmd)
    for line in filelist_iter:
        splits = ssplit(line)
        if "{0}".format(ospathjoin(test_file_path, test_file_dir)) in splits[-1]:
            test_files.append(splits[-1])
    shred.log.info("Test files are now [{0}]".format(test_files))
Example #3
def generate_test_data():
    # Generate test data
    shred.log.info("Out of test data, generating new Test Data files...")
    clean_dir_cmd = ["hdfs", "dfs", "-rmdir", ospathjoin(test_file_path, test_file_dir)]
    shred.run_shell_command(clean_dir_cmd)
    gen_test_data_cmd = ["/usr/hdp/current/hadoop-client/bin/hadoop", "jar",
                         glob("/usr/hdp/current/hadoop-mapreduce-client/hadoop-mapreduce-examples-*.jar")[0],
                         "teragen", test_file_size, ospathjoin(test_file_path, test_file_dir)]
    gen_test_data_iter = shred.run_shell_command(gen_test_data_cmd)
    for line in gen_test_data_iter:
        if "Bytes Written" in line:
            shred.log.info(line)
    # remove 0 size '_SUCCESS' file
    del_file_cmd = ["hdfs", "dfs", "-rm", ospathjoin(test_file_path, test_file_dir, "_SUCCESS")]
    do_nothing = shred.run_shell_command(del_file_cmd)
Example #4
def set_status(job_id, component, status):
    """Abstracts setting a given status for a given job, job subcomponent, and status message"""
    # determine file to be written
    if component == "master":
        # The master component always updates the state of the job in the master job list
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "jobs", job_id)
    elif component == "data":
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job_id, "status")
    else:
        # otherwise we update the file named for that component in the subdir for the job in the general store
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job_id, component, "status")
    log.debug("Setting status of component [{0}] at path [{1}] to [{2}]".format(component, file_path, status))
    if file_path is not None:
        hdfs.write(file_path, status, overwrite=True)
    else:
        raise ValueError("File Path to set job status not set.")
Example #5
def init_program(passed_args):
    log.info("shred.py called with args [{0}]".format(passed_args))
    parsed_args = parse_user_args(passed_args)
    # TODO: Further configuration file validation tests
    log.debug("Checking for config parameters.")
    if not conf.VERSION:
        raise StandardError(
            "Version number in config.py not found, please check configuration file is available and try again."
        )
    # Test necessary connections
    ensure_hdfs()
    # Check directories etc. are setup
    hdfs.makedirs(ospathjoin(conf.HDFS_SHRED_PATH, "jobs"))
    hdfs.makedirs(ospathjoin(conf.HDFS_SHRED_PATH, "store"))
    # TODO: Further Application setup tests
    return parsed_args
Example #6
def check_for_new_worker_jobs():
    """Checks for the existance of new worker jobs and returns a list of them if they exist"""
    worker_job_list = []
    # check if dir exists as worker my load before client is ever used
    job_path = ospathjoin(conf.HDFS_SHRED_PATH, "jobs")
    job_dir_exists = hdfs.content(job_path, strict=False)
    if job_dir_exists is not None:
        # if job dir exists, get listing and any files
        dirlist = hdfs.list(job_path, status=True)
        for item in dirlist:
            if item[1]['type'] == 'FILE':
                with hdfs.read(ospathjoin(job_path, item[0])) as reader:
                    job_status = reader.read()
                # if file contains the completion status for stage1, put it in worker list
                if job_status == "stage1complete":
                    worker_job_list.append(item[0])
    return worker_job_list
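A rough polling loop around the function above; the 60-second interval and the hand-off comment are illustrative assumptions, not part of the original utility.

from time import sleep

# Illustrative worker poll loop; assumes log and the helpers above are in scope.
while True:
    for job_id in check_for_new_worker_jobs():
        log.info("Found new worker job [{0}]".format(job_id))
        # hand the job off to the stage 2 blocklist preparation step here
    sleep(60)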
Example #7
def persist_job_info(job, component, stage, info):
    """Writes data to our directory structure in the HDFS shred directory"""
    if component == "master":
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "jobs", job)
        content = dumps(stage + "-" + info)
    elif 'worker' in component or 'data' in component:
        file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job, component)
        if 'status' in component:
            content = dumps(stage + "-" + info)
        else:
            content = dumps(info)
    else:
        raise StandardError("Function persist_job_info was passed an unrecognised component name")
    if file_path is not None:
        try:
            hdfs.write(file_path, content, overwrite=True)
        except HdfsError as e:
            raise e
    else:
        raise ValueError("File path for persisting job info was not set.")
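A hypothetical round trip pairing persist_job_info with retrieve_job_info from Example #1; the job ID is a placeholder, and the "<stage>-<status>" convention is taken from how these functions are used elsewhere in this listing.

# Placeholder job ID; assumes the hdfs client and conf from the examples above.
job_id = "9f1c2d3e-aaaa-bbbb-cccc-000000000001"
persist_job_info(job_id, "master", "stage2", "success")
assert retrieve_job_info(job_id, "master") == "stage2-success"

# Non-status worker components store an arbitrary JSON-serialisable payload
persist_job_info(job_id, "worker_10.0.0.1_source_shard_dict", "stage2", {"blk_1073741825": "new"})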
Example #8
def prepare_blocklists(job_id):
    """Attempts to take leadership for job preparation and creates the block-file lists for each datanode worker"""
    # attempt to kazoo lease new guid node for sleep period minutes
    log.debug("Preparing Blocklists for job [{0}]".format(job_id))
    log.debug("Attempting to get lease as leader for job")
    lease = zk.NonBlockingLease(
        path=conf.ZOOKEEPER['PATH'] + job_id,
        duration=dttd(minutes=conf.WORKER_SLEEP),
        identifier="Worker [{0}] preparing blocklists for job [{1}]".format(get_worker_identity(), job_id)
    )
    # http://kazoo.readthedocs.io/en/latest/api/recipe/lease.html
    # if we don't get the lease, return 'pipped' status
    if not lease:
        log.debug("Beaten to leasehold by another worker")
        return "pipped"
    else:
        log.debug("Got lease as leader on job, updating job status")
        # update job status to stage2prepareblocklist
        status = "stage2prepareBlocklist"
        component = "master"
        set_status(job_id, component, status)
        # get job target ( returns a list )
        targets = get_target_by_jobid(job_id)
        log.debug("Got target file(s) [{0}] for job".format(targets))
        # get fsck data for targets
        blocklists = {}
        for target in targets:
            fsck_data = get_fsck_output(target)
            # parse fsck data for blocklists
            blocklists.update(parse_blocks_from_fsck(fsck_data))
        log.debug("Parsed FSCK output for target files: [{0}]".format(blocklists))
        # match fsck output to worker_ids
        # block IDs for workers are currently the IP of the datanode, which matches our worker_id in the utility
        # Therefore no current need to do a match between the fsck output and the local worker ID
        # write a per-DN file to hdfs job subdir for other workers to read
        target_workers = blocklists.keys()
        log.debug("Datanode list for these blockfiles is: [{0}]".format(target_workers))
        for this_worker in target_workers:
            this_worklist = {}
            for blockfile in blocklists[this_worker]:
                this_worklist[blockfile] = "new"
            workfile_content = dumps(this_worklist)
            file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job_id, this_worker)
            log.debug("Writing [{0}] to workfile [{1}] for Datanode [{2}]"
                      .format(workfile_content, file_path, this_worker))
            hdfs.write(file_path, workfile_content, overwrite=True)
        # update job status to stage2copyblocks
        log.debug("Completed leader tasks for blocklist preparation, updating status and returning from function")
        status = "stage2copyblocks"
        set_status(job_id, component, status)
        # TODO: Look for a method to explicitly release the lease when done
        # apparently there's no release lease command in this recipe, so it'll just timeout?
        # return success status
        return "success"
Example #9
def get_target_by_jobid(job_id):
    """Gets paths of target files ingested into this jobs data store directory
    returns list of absolute paths to target files on HDFS"""
    hdfs_file_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job_id, "data")
    log.debug("getting list of files at path [{0}]".format(hdfs_file_path))
    # hdfs.list returns a list of file names in a directory
    hdfs_file_list = hdfs.list(hdfs_file_path)
    out = []
    for file in hdfs_file_list:
        # TODO: Check if HDFS always uses / as path separator on Win or Linux etc.
        out.append(hdfs_file_path + '/' + file)
    return out
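Regarding the TODO above: HDFS paths are '/'-separated regardless of the client OS, so joining them with posixpath avoids the question entirely. The variant below is a suggestion under that assumption, not part of the original utility.

from posixpath import join as hdfspathjoin  # always uses '/' as the separator

def get_target_by_jobid_portable(job_id):
    """Same listing logic as above, but joins HDFS paths with posixpath."""
    hdfs_file_path = hdfspathjoin(conf.HDFS_SHRED_PATH, "store", job_id, "data")
    return [hdfspathjoin(hdfs_file_path, name) for name in hdfs.list(hdfs_file_path)]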
Example #10
def setup_module():
    shred.log.info("Begin Setup")
    shred.log.info("Checking for existing test data")
    # Check if test data already exists
    test_data_exists_cmd = ["hdfs", "dfs", "-ls", ospathjoin(test_file_path, test_file_dir)]
    test_data_exists_iter = shred.run_shell_command(test_data_exists_cmd)
    try:
        out = next(test_data_exists_iter)
        if "No such file or directory" in out:
            test_data_exists_state = False
        else:
            test_data_exists_state = True
    except StopIteration:
        test_data_exists_state = False
    if test_data_exists_state is False:
        generate_test_data()
    update_test_files()
Example #11
def teardown_module():
    shred.log.info("Begin Teardown...")
    # Remove test data
    if remove_test_files:
        shred.log.info("Removing Test Data")
        rmdir_cmd = ["hdfs", "dfs", "-rm", "-f", "-r", "-skipTrash", ospathjoin(test_file_path, test_file_dir)]
        rmdir_iter = shred.run_shell_command(rmdir_cmd)
        for line in rmdir_iter:
            shred.log.info(line)
    else:
        shred.log.info("Skipping removal of test data")
    if remove_test_zkdata:
        shred.log.info("Removing test ZK Data")
        zk_host = shred.conf.ZOOKEEPER['HOST'] + ':' + str(shred.conf.ZOOKEEPER['PORT'])
        zk = shred.connect_zk()
        zk.delete(path=shred.conf.ZOOKEEPER['PATH'], recursive=True)
    else:
        shred.log.info("Skipping removal of test ZK Data")
Example #12
def get_jobs(stage):
    """Prepares a cleaned job list suitable for the stage requested from all active jobs
    returns list of job UUID4 strings"""
    worker_job_list = []
    target_status = []
    if stage == stage_2:
        target_status = [
            stage_1 + "-" + status_success,
            stage_2 + "-" + status_task_timeout
        ]
    elif stage in [stage_3, stage_4]:
        target_status = [
            stage_2 + "-" + status_success,
            stage_4 + "-" + status_task_timeout
        ]
    elif stage in [stage_5, stage_6]:
        target_status = [
            stage_4 + "-" + status_success,
            stage_6 + "-" + status_task_timeout
        ]
    # check if dir exists, as the worker may load before the client is ever used
    job_path = ospathjoin(conf.HDFS_SHRED_PATH, "jobs")
    job_dir_exists = None
    try:
        # hdfscli strict=False returns None rather than an Error if Dir not found
        job_dir_exists = hdfs.content(job_path, strict=False)
    except AttributeError:
        log.error("HDFS Client not connected")
    if job_dir_exists is not None:
        # if job dir exists, get listing and any files
        dir_listing = hdfs.list(job_path, status=True)
        for item in dir_listing:
            if item[1]['type'] == 'FILE':
                # item[0] is the filename, which for master status files is the job ID as a string
                # we validate it as a UUID4 anyway, to be safe.
                job_status = retrieve_job_info(item[0], "master")
                if job_status in target_status:
                    try:
                        job_id = UUID(item[0], version=4)
                        worker_job_list.append(str(job_id))
                    except ValueError:
                        pass
    return worker_job_list
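A hypothetical worker pass driving get_jobs; the stage constants and log object are assumed to be module-level names, as used throughout these examples.

# Illustrative only: poll for stage 2 work and report what was found.
stage_2_jobs = get_jobs(stage_2)
if not stage_2_jobs:
    log.info("No jobs ready for stage 2 on this pass")
for job_id in stage_2_jobs:
    log.info("Job [{0}] is ready for stage 2 blocklist preparation".format(job_id))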
Example #13
def get_gif_filename(dir, label, args):
    argcopy = args.copy()
    filename = ""
    for s in settings:
        if s in args:
            filename = filename + str(args[s])

    filename = filename + label

    for io in image_operator:
        if io in args:
            filename = filename + str(args[io])

    filename = filename + ".gif"
    filename = filename.replace("#", "")

    filename = ospathjoin(dir, filename)

    return filename
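A hypothetical call to get_gif_filename; the settings and image_operator lists it iterates over are module-level names not shown here, so the vocabularies below are assumptions for illustration only.

# Assumed vocabularies (not part of the original example)
settings = ["density", "background"]
image_operator = ["resize", "rotate"]

args = {"density": 300, "resize": "50%"}
# With these assumptions this evaluates to "out/300label50%.gif"
print(get_gif_filename("out", "label", args))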
Example #14
def ingest_targets(job_id, target):
    """Moves file from initial location to shred worker folder on HDFS"""
    # Update statuses
    status = "stage1ingest"
    component = "master"
    set_status(job_id, component, status)
    component = "data"
    set_status(job_id, component, status)
    # Move all files to the data directory
    path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job_id, 'data')
    # Using the HDFS module's rename function to move the target files
    log.debug("Moving target file [{0}] to shredder holding pen [{1}]".format(target, path))
    # We need to ensure the directory is created, or the rename command will dump the data into the file
    hdfs.makedirs(path)
    hdfs.rename(target, path)
    # update status
    status = "stage1ingestComplete"
    set_status(job_id, component, status)
    return job_id, status
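A hypothetical ingest call; the target path is a placeholder, and the call assumes the hdfs client has permission to move it into the shred store.

from uuid import uuid4

# Placeholder target file; set_status and hdfs come from the examples above.
job_id = str(uuid4())
job_id, status = ingest_targets(job_id, "/data/incoming/secret_report.csv")
assert status == "stage1ingestComplete"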
Example #15
    def process_item(self, item, spider):
        ''' save index doc file '''
        if 'indexdocSpider' != spider.name:
            return item
        
        if not item['docvalid']:
            return None
        
        realpath = ospathjoin(spider.savepath, spider.savename, item['docpath'])
        spider.loger.info("doc %s save path %s" %
                              (item['docname'], realpath))

        if not os.path.exists(os.path.dirname(realpath)):
            os.makedirs(os.path.dirname(realpath))
        
        with open(realpath, 'wb+') as f:
            f.write(item['docdata'])
                
        return item
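For context, a Scrapy item pipeline like the one above only runs if it is registered in the project's settings; the dotted path below is a hypothetical location for this class, so adjust it to the real module layout.

# settings.py (illustrative): the dotted path is a guess at where the class lives.
ITEM_PIPELINES = {
    "myproject.pipelines.IndexdocPipeline": 300,
}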
Example #16
 def store_table_fields(self, table_name: str, playlists_dir: str):
     response = self.cur.execute('select * from ' + table_name)
     fields_names = [description[0] + '\n' for description in response.description]
     with open(ospathjoin(playlists_dir, table_name + '.txt'), 'w+') as f:
         f.writelines(fields_names)
Example #17
 def get_sql_path_base(exec_path):
     return ospathjoin(exec_path, "janis/task.db")
Example #18
################################################################################
# Stage 1: Load the database into Python
################################################################################

# For now I will pull a database from my local machine
f = NamedTemporaryFile(suffix="BaseValidador", delete=False)

myHostname = "162.243.165.69"
myUsername = "******"
myPassword = "******"
filename = "BaseValidador.csv"
opts = pysftpCnOpts()
opts.hostkeys = None
with pysftpConnection(host=myHostname, username=myUsername, password=myPassword, cnopts = opts) as sftp:
    remoteFilePath = '/root/CMD/EOD_S20/bases/'+filename
    localFilePath = ospathjoin(f.name)
    sftp.get(remoteFilePath, localFilePath)

#sys.path.append(f.name)
#bd = pd.read_stata(ospathjoin(f.name), convert_categoricals=False)
#bd.to_csv(ospathjoin(f.name))
#bd = pd.read_csv(ospathjoin(f.name))

bd = pd.read_csv(ospathjoin(f.name))

bd["comentarios_validacion"] = "No hay comentarios de validación"

    
bd = bd.loc[bd.interview__key.notnull()]
bd.loc[:, 'act'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
bd2 = bd
Example #19
    def __init__(self, playlist_name: str, poi_csv_file: str):

        self.playlist_name = playlist_name
        self.playlist_path = ospathjoin(self.playlists_dir, str(playlist_name))
        self.csv_handler = CSVHandler(ospathjoin(self.playlist_path, poi_csv_file))
        self.sql_handler = SQLHandler()
Example #20
from __future__ import print_function
import pytest
from os.path import join as ospathjoin
import os
from tracktime import tracktime
import datetime

TEST_TIMELOG = ospathjoin("tests", "test_timelog.txt")


@pytest.fixture
def activity():
    name = "admin"
    category = "work"
    time1 = datetime.datetime(2016, 6, 9, 6, 5, 35)
    time2 = datetime.datetime(2016, 6, 9, 11, 23, 2)
    return tracktime.Activity(time1, name, category, time2)


def erase_test_timelog():
    """ remove the TEST_TIMELOG (if it exists) """
    try:
        os.remove(TEST_TIMELOG)
    except OSError:
        pass
    return


def add_test_timelog_entries_directly():
    timelog = TEST_TIMELOG
    """ Create a finished activity """
Example #22
 def store_all_tables_names(self, playlists_dir: str):
     tables_names = self.get_all_tables_name()
     with open(ospathjoin(playlists_dir, 'tables_names.txt'), 'w+') as f:
         f.writelines(tables_names)
Example #23
    def create_image_file(self,labels, **kw):

        if 'overwrite' not in kw:
            kw['overwrite'] = False
            
        if not isinstance(labels,list):
            labels = [labels]
        
        self.outputfiles = []
        self.labels = labels
        self.outputdirname = "ic_"+now()
        cwd = getcwd()
        
        try:
            mkdir(ospathjoin(cwd,self.outputdirname))
        except OSError: # directory exists
            pass
        
        self.log.log(self,3,"mkdir="+self.outputdirname)

        for lbl in self.labels:
            
            cmd = ['convert','-verbose']

            outputfilename = get_gif_filename(self.outputdirname,lbl,kw)
            #filename = lbl + "-" + "-".join(map(str,kw.values())) + ".gif"  
            #outputfilename = ospathjoin(self.outputdirname,filename)
            
            if os_file_exists(outputfilename) == False or kw['overwrite'] == True:
                
                # settings go before the input label/file
                for s in settings:
                    if s in kw:
                        cmd = cmd + ["-"+s,str(kw[s])]
        
                cmd.append(labelstr_get(lbl))  
                
                # image operators come afterwards
                for im in image_operator:
                    if im in kw:
                        cmd = cmd + ["-"+im,str(kw[im])]
                
                cmd.append(outputfilename)
    
                p = process_start(cmd)
                
                status = parse_convert_stdout(p,lbl)
                
                if status[0] == 0:
                    self.log.log(self,3,
                                 "created image="+outputfilename,
                                 "status"," ".join(map(str,status)))
                else:
                    self.log.log(self,3,"failed","status="," ".join(map(str,status)))
                    raise Exception("ImageCreate failure",status)
            else:
                self.log.log(self,3,"reused image="+outputfilename)
                
            self.outputfiles.append(outputfilename)
                
        return(self.outputfiles)
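A hypothetical call to create_image_file; "ic" stands in for an instance of the surrounding class, the keyword name is assumed to appear in the module's settings list, and ImageMagick's convert is assumed to be on the PATH.

# Illustrative usage only; returns the list of generated or reused .gif paths.
gif_files = ic.create_image_file(["hello", "world"], pointsize=48, overwrite=True)
for path in gif_files:
    print("image file ready: " + path)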
Example #24
def run_stage(stage, params=None):
    """
    Main program logic
    As many stages share a lot of similar functionality, they are interleaved using the 'stage' parameter as a selector
    Stages should be able to run independently for testing or admin convenience
    """
    ensure_hdfs()
    if stage == stage_1:
        # Stage 1 returns a result and an ID for the job and has no job list
        target = params
        # TODO: Validate passed file target(s) further in arg parse, e.g. trailing slashes or targets that are actually directories
        job = str(uuid4())
        log.debug("Generated uuid4 [{0}] for job identification".format(job))
        persist_job_info(job, 'master', stage, status_init)
        persist_job_info(job, 'data_status', stage, status_init)
        holding_pen_path = ospathjoin(conf.HDFS_SHRED_PATH, "store", job, 'data')
        source_path, source_filename = ospathsplit(target)
        expected_target_real_path = ospathjoin(holding_pen_path, source_filename)
        try:
            # TODO: Update to handle list of input files instead of single file as a string
            target_details = hdfs.status(target)
            if target_details['type'] == u'FILE':
                # We need to ensure the directory is created, or the rename command will dump the data into the file
                hdfs.makedirs(holding_pen_path)
                # Using the HDFS module's rename function to move the target files to test permissions
                log.debug("Moving target file [{0}] to shredder holding pen [{1}]".format(target, holding_pen_path))
                # TODO: Do an are-you-sure, then return status_skip if they don't accept
                hdfs.rename(target, holding_pen_path)
                # TODO: Write more sanity checks for ingest process
                persist_job_info(job, "data_file_list", stage_1, expected_target_real_path)
                log.debug("Job [{0}] prepared, exiting with success".format(job))
                persist_job_info(job, 'master', stage, status_success)
                persist_job_info(job, 'data_status', stage, status_success)
                return status_success, job
            else:
                log.critical("Target is not valid, type returned was [{0}]".format(target_details['type']))
                persist_job_info(job, 'master', stage, status_fail)
                persist_job_info(job, 'data_status', stage, status_fail)
                return status_fail, job
        except HdfsError as e:
            persist_job_info(job, 'master', stage, status_fail)
            persist_job_info(job, 'data_status', stage, status_fail)
            log.critical("Ingestion failed for file [{0}] for job [{1}] with details: {2}"
                         .format(target, job, e))
            return status_fail, job
    elif stage in [stage_2, stage_3, stage_4, stage_5, stage_6]:
        # stages 2 - 6 operate from an active job list predicated by success of the last master stage
        worker = get_worker_identity()
        job_list = get_jobs(stage)
        log.info("Worker [{0}] found [{1}] jobs for stage [{2}]".format(worker, len(job_list), stage))
        if len(job_list) > 0:
            for job in job_list:
                if stage in [stage_2, stage_4, stage_6]:
                    # Leader Jobs for stages 2, 4, and 6
                    # We use the absence of a leader_result to control activity within leader tasks
                    leader_result = None
                    # Worker may not yet have status file initialised for s2 of job
                    worker_status = (retrieve_job_info(job, "worker_" + worker + "_status", strict=False))
                    # TODO: Move worker state validation to a separate function returning true/false against worker/stage
                    if (
                        (worker_status is None and stage != stage_2) or
                        (worker_status is not None and worker_status not in [
                            stage_3 + "-" + status_success, stage_3 + "-" + status_skip, stage_4 + "-" + status_task_timeout,
                            stage_5 + "-" + status_success, stage_5 + "-" + status_skip, stage_6 + "-" + status_task_timeout,
                    ])):
                        log.critical(
                            "Worker [{0}] is in status [{1}] for job [{2}], which is not valid to be [{3}] leader."
                            .format(worker, worker_status, job, stage)
                        )
                        leader_result = status_fail
                    persist_job_info(job, 'master', stage, status_init)
                    persist_job_info(job, "worker_" + worker + "_status", stage, status_init)
                    ensure_zk()
                    lease_path = conf.ZOOKEEPER['PATH'] + job
                    lease = zk.NonBlockingLease(
                        path=lease_path,
                        duration=dttd(minutes=conf.LEADER_WAIT),
                        identifier="Worker [{0}] running stage [{1}]".format(worker, stage)
                    )
                    if not lease:
                        leader_result = status_skip
                    else:
                        while lease:
                            while leader_result is None:
                                if zk.state != KazooState.CONNECTED:
                                    log.critical("ZooKeeper disconnected from worker [{0}] during stage [{1}] of job"
                                                 "[{2}], expiring activity"
                                                 .format(worker, stage, job))
                                    leader_result = status_task_timeout
                                persist_job_info(job, "worker_" + worker + "_status", stage, status_is_leader)
                                if stage == stage_2:
                                    target = retrieve_job_info(job, "data_file_list")
                                    master_shard_dict = {}
                                    fsck_iter = run_shell_command(
                                        ["hdfs", "fsck", target, "-files", "-blocks", "-locations"]
                                    )
                                    master_shard_dict.update(parse_fsck_iter(fsck_iter))
                                    target_workers = master_shard_dict.keys()
                                    for this_worker in target_workers:
                                        worker_shard_dict = {}
                                        for shard_file in master_shard_dict[this_worker]:
                                            worker_shard_dict[shard_file] = status_no_init
                                        persist_job_info(
                                            job, "worker_" + worker + "_source_shard_dict", stage, worker_shard_dict
                                        )
                                    persist_job_info(job, "worker_list", stage, target_workers)
                                    leader_result = status_success
                                elif stage in [stage_4, stage_6]:
                                    worker_list = retrieve_job_info(job, "worker_list")
                                    wait = True
                                    while wait is True:
                                        # TODO: Do stuff to validate count and expected names of workers are all correct
                                        nodes_finished = True
                                        for node in worker_list:
                                            node_stage, node_status = (
                                                retrieve_job_info(job, "worker_" + node + "_status")).split("-")
                                            if (
                                                node_status == status_fail or  # some node failed something
                                                stage == stage_4 and node_stage != stage_3 or  # bad stage combo
                                                stage == stage_6 and node_stage != stage_5  # stage combo breaker!
                                            ):
                                                # This should crash the outer while loop to fail this process
                                                leader_result = status_fail
                                            elif node_status not in [status_success, status_skip]:
                                                nodes_finished = False
                                        if nodes_finished is True:
                                            wait = False
                                        else:
                                            sleep(60 * conf.WORKER_WAIT)
                                    else:
                                        # We only stop 'wait'ing to start Stage 4/6 if all workers report success
                                        # before the leader lease times out
                                        persist_job_info(job, 'master', stage, status_init)
                                        if stage == stage_4:
                                            persist_job_info(job, 'data_status', stage, status_init)
                                            # TODO: Handle multiple files instead of a single file as string
                                            # TODO: Validate against fresh blocklist in case of changes?
                                            delete_target = retrieve_job_info(job, "data_file_list")
                                            delete_cmd_result = next(
                                                run_shell_command(['hdfs', 'dfs', '-rm', '-skipTrash', delete_target])
                                            )
                                            if "Deleted" in delete_cmd_result:
                                                persist_job_info(job, 'data_status', stage, status_success)
                                                leader_result = status_success
                                            else:
                                                log.critical(
                                                    "Deletion of file from HDFS returned bad result of [{0}], bailing"
                                                    .format(delete_cmd_result))
                                                persist_job_info(job, 'data_status', stage, status_fail)
                                                leader_result = status_fail
                                        elif stage == stage_6:
                                            # All workers have completed shredding, shut down job and clean up
                                            # TODO: Test that job completed as expected
                                            leader_result = status_success
                                else:
                                    raise StandardError("Bad stage passed to run_stage")
                            lease = False
                    if leader_result is None or leader_result == status_task_timeout:
                        log.warning(
                            "Worker [{0}] timed out on stage [{1}] leader task, "
                            "resetting status for another worker attempt"
                            .format(worker, stage))
                        persist_job_info(job, "worker_" + worker + "_status", stage, status_task_timeout)
                        persist_job_info(job, 'master', stage, status_task_timeout)
                    elif leader_result in [status_success, status_fail]:
                        # Cleanup lease
                        # TODO: Test if this breaks when the worker test says the worker is in a bad state
                        _ = zk.NonBlockingLease(
                            path=lease_path,
                            duration=dttd(seconds=1),
                            identifier="Worker [{0}] running stage [{1}]".format(worker, stage)
                        )
                        sleep(2)
                        persist_job_info(job, "worker_" + worker + "_status", stage, leader_result)
                        persist_job_info(job, 'master', stage, leader_result)
                    elif leader_result == status_skip:
                        persist_job_info(job, "worker_" + worker + "_status", stage, status_skip)
                    else:
                        raise StandardError("Bad leader_result returned from ZooKeeper wrapper")
                elif stage in [stage_3, stage_5]:
                    # Distributed worker jobs for stage 3 and 5
                    persist_job_info(job, "worker_" + worker + "_status", stage, status_init)
                    if stage == stage_3:
                        targets_dict = retrieve_job_info(job, "worker_" + worker + "_source_shard_dict", strict=False)
                        # allowing for restart of job where shard linking was partially completed.
                        linked_shard_dict = retrieve_job_info(
                            job, "worker_" + worker + "_linked_shard_dict", strict=False
                        )
                        if linked_shard_dict is None:
                            linked_shard_dict = {}
                    elif stage == stage_5:
                        targets_dict = retrieve_job_info(job, "worker_" + worker + "_linked_shard_dict", strict=False)
                    else:
                        raise StandardError("Bad code pathway")
                    if targets_dict is None:
                        log.debug("Worker [{0}] found no shard list for stage [{1}] in job [{2}]"
                                  .format(worker, stage, job))
                        persist_job_info(job, "worker_" + worker + "_status", stage, status_skip)
                    else:
                        for shard in targets_dict:
                            if targets_dict[shard] in [status_no_init, status_init]:
                                targets_dict[shard] = status_init
                                if stage == stage_3:
                                    shard_file_path = find_shard(shard)
                                    shard_file_mount = find_mount_point(shard_file_path)
                                    this_mount_shred_dir = ospathjoin(shard_file_mount, conf.LINUXFS_SHRED_PATH, job)
                                    linked_shard_path = ospathjoin(this_mount_shred_dir, shard)
                                    try:
                                        if not exists(this_mount_shred_dir):
                                            # os.makedirs only gained an exist_ok flag in Python 3.2, so check first
                                            makedirs(this_mount_shred_dir)
                                        link(shard_file_path, linked_shard_path)
                                        linked_shard_dict[linked_shard_path] = status_no_init
                                        targets_dict[shard] = status_success
                                    except OSError as e:
                                        log.critical("Failed to link shard file [{0}] at loc [{1}] to shred loc [{2}]"
                                                     .format(shard, shard_file_path, linked_shard_path))
                                        targets_dict[shard] = status_fail
                                elif stage == stage_5:
                                    # Shred returns 0 on success and a 'failed' message on error
                                    # run_shell_command handles this behavior for us
                                    # TODO: Insert final sanity check before shredding files
                                    shred_result = run_shell_command(
                                        ['shred', '-n', str(conf.SHRED_COUNT), '-z', '-u', shard],
                                        return_iter=False
                                    )
                                    if shred_result is not None:
                                        log.critical("Worker [{0}] failed to shred shard [{1}] with error: {2}"
                                                     .format(worker, shard, shred_result))
                                        targets_dict[shard] = status_fail
                                    else:
                                        targets_dict[shard] = status_success
                            elif targets_dict[shard] == status_success:
                                # Already done, therefore skip
                                pass
                            else:
                                raise StandardError(
                                    "Shard control for worker [{0}] on job [{1}] in unexpected state: [{1}]"
                                    .format(worker, job, dumps(targets_dict))
                                )
                        if stage == stage_3:
                            persist_job_info(job, "worker_" + worker + "_source_shard_dict", stage, targets_dict)
                            persist_job_info(job, "worker_" + worker + "_linked_shard_dict", stage, linked_shard_dict)
                        if stage == stage_5:
                            persist_job_info(job, "worker_" + worker + "_linked_shard_dict", stage, targets_dict)
                        # sanity test if task is completed successfully
                        target_status = []
                        for shard in targets_dict:
                            target_status.append(targets_dict[shard])
                        if len(set(target_status)) == 1 and status_success in set(target_status):
                            persist_job_info(job, "worker_" + worker + "_status", stage, status_success)
                        else:
                            persist_job_info(job, "worker_" + worker + "_status", stage, status_fail)
                else:
                    # Shouldn't be able to get here
                    raise StandardError("Bad stage definition passed to run_stage: {0}".format(stage))
            # Now all jobs for stage have run, check all jobs completed successfully before returning
            for job in job_list:
                if stage in [stage_2, stage_4, stage_6]:
                    component = "master"
                else:
                    # must be stage 3 or 5
                    component = "worker_" + worker + "_status"
                job_status = (retrieve_job_info(job, component)).split("-")[1]
                if job_status not in [status_success, status_skip]:
                    log.critical("Worker [{0}] failed or timed out one or more of [{1}] jobs for stage [{2}]"
                                 .format(worker, len(job_list), stage))
                    return status_fail
                log.info("Worker [{0}] found and processed [{1}] jobs for stage [{2}]"
                         .format(worker, len(job_list), stage))
            return status_success
        else:
            # No jobs found for this stage/worker
            return status_skip
    else:
        raise StandardError("Bad stage definition passed to run_stage: {0}".format(stage))
Example #25
 def __init__(self):
     conn = sqlite3.connect(ospathjoin(self.db_dir, 'mixxxdb.sqlite'))
     self.cur = conn.cursor()
Example #26
def clear_test_jobs():
    shred.log.info("Removing test jobs")
    rmdir_cmd = ["hdfs", "dfs", "-rm", "-f", "-r", "-skipTrash", ospathjoin(shred.conf.HDFS_SHRED_PATH)]
    rmdir_iter = shred.run_shell_command(rmdir_cmd)
    for line in rmdir_iter:
        shred.log.info(line)
Example #27
# For now I will pull a database from my local machine
f = NamedTemporaryFile(suffix="BaseValidador", delete=False)

myHostname = "162.243.165.69"
myUsername = "******"
myPassword = "******"
filename = "BaseValidador.dta"
opts = pysftpCnOpts()
opts.hostkeys = None
with pysftpConnection(host=myHostname,
                      username=myUsername,
                      password=myPassword,
                      cnopts=opts) as sftp:
    remoteFilePath = '/root/CMD/EOD_Call/bases/validador/' + filename
    localFilePath = ospathjoin(f.name)
    sftp.get(remoteFilePath, localFilePath)

sys.path.append(f.name)
vps = pd.read_stata(f, convert_categoricals=False)

##################################################################################
# Stage 4: Merge the VPS database into SQL
##################################################################################

#############
### VPS
#############

# Change the format of the 'orden' column
vps['orden'] = vps['orden'].astype('str').str[:-2].astype('float64')
Example #28
 def test_dot_relative(self):
     fn = "my.txt"
     self.assertEqual(ospathjoin(self.cwd, fn),
                      fully_qualify_filename("./" + fn))
Example #29
Description: Track time spent at various activities.

Author: Andrew Mattheisen

"""
from __future__ import print_function
import datetime
import argparse
from sys import argv
import fileinput
from os.path import expanduser
from os.path import join as ospathjoin

VERSION = "0.0"
TIMELOG = ospathjoin(expanduser("~"), "timelog.txt")
DATETIMEFORMAT = "%Y-%m-%dT%H:%M:%S"
DAYFORMAT = "%Y-%m-%d"
INPROGRESS = "none"
DEFAULT_CATEGORY = "general"
ACTIVITY_DAY_HEADER = """= TRACKTIME REPORT FOR {weekday:<10} {day} =
 Start - End    (Duration) | Activity@Category
---------------------------+------------------"""


# MODELS
class Activity():
    """ An Activity is a task (e.g. sweep floor) with a start datetime,
    category, and end datetime. """
    def __init__(
      self, starttime, name, category=DEFAULT_CATEGORY,