Example #1
def _log(msg):
    dt_string = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    filename = os.path.basename(__file__)
    msg_string = '[%s]\t%s\t[%s]\t%s' % (
        util_get_platform(), dt_string, filename, msg)

    print(msg_string)

    with open('/home/o.koch/image-search/_%s.log' % filename, 'a') as fp:
        fp.write('%s\n' % msg_string)

    # log error messages (but ignore HDFS warnings)
    if msg.find('ERROR') != -1 and msg.find('WARN retry.RetryInvocationHandler') == -1:
        with open('/home/o.koch/image-search/_%s.log.error' % filename, 'a') as fp:
            fp.write('%s\n' % msg_string)
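
These examples rely on a shared helper, util_get_platform(), that is not shown here. A minimal sketch, assuming the platform name ('pa4' or 'am5') can be inferred from the host name (the real implementation may differ):

import socket

def util_get_platform():
    # Hypothetical sketch: infer the datacenter name ('pa4' or 'am5') from
    # the machine's host name.  The real helper is not shown in these
    # examples and may work differently.
    hostname = socket.gethostname().lower()
    if 'pa4' in hostname:
        return 'pa4'
    if 'am5' in hostname:
        return 'am5'
    return 'unknown'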
Example #2
def process_dir(input_dir):

    host = '[email protected]'
    local_dir = '/home/o.koch/image-search/output-per-partner'
    remote_success_file = '/opt/output-per-partner/.success'
    local_success_file = '/home/o.koch/image-search/_SUCCESS'
    timestamp_filename = '/home/o.koch/image-search/lastPartnerTimestamp_computed.txt'
    job_timestamp_filename = '/home/o.koch/image-search/.hathi.lastrun.timestamp'

    _log('Found following dir to process : %s' % input_dir)

    timestamp_aws = os.path.basename(input_dir)

    # skip directory if does not belong to your platform
    if util_get_platform() == 'pa4' and (int(timestamp_aws) % 2 == 0):
        _log('This is a directory for AM5.  Skipping...')
        return
    if util_get_platform() == 'am5' and (int(timestamp_aws) % 2 == 1):
        _log('This is a directory for PA4.  Skipping...')
        return

    hdfs_root = '/user/o.koch/cnn'
    hdfs_dir = '%s/%s' % (hdfs_root, timestamp_aws)

    # quit if directory already exists on HDFS
    if exists_hdfs(hdfs_dir):
        _log('Directory %s already exists on HDFS.  Skipping...' % hdfs_dir)
        return

    # create directory on hdfs
    cmd = 'hadoop fs -mkdir %s' % hdfs_dir
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)
        return

    # create local directory
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)

    # copy files to gateway
    cmd = 'scp -r %s:%s %s' % (host, input_dir, local_dir)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)
        return

    # load last timestamp for each partner
    last_partnersTimestamp = {}
    if os.path.isfile(timestamp_filename):
        with open(timestamp_filename, 'r') as fp:
            for line in fp:
                partnerid, timestamp = line.strip().split(' ')
                last_partnersTimestamp[int(partnerid)] = int(timestamp)

    # copy files to HDFS
    local_dir_output = os.path.join(local_dir, timestamp_aws)
    assert (os.path.isdir(local_dir_output))

    n_errors = 0

    for root, dirs, files in os.walk(local_dir_output):
        for filen in files:
            if not os.path.basename(filen).endswith('.gz'):
                continue
            filename = os.path.join(root, filen)

            # extract partner id from filename
            partnerid = int(re.findall('[0-9]+', filename)[-1])
            last_partnersTimestamp[partnerid] = int(timestamp_aws)

            hdfs_dir_full = '%s/%d' % (hdfs_dir, partnerid)

            # create subdir on HDFS
            cmd = 'hadoop fs -mkdir %s' % hdfs_dir_full
            cmd_out, cmd_err, rc = util_run_cmd(cmd)
            if rc != 0:
                _log('*** ERROR *** %s' % cmd_err)
                n_errors += 1
                break

            # transfer file to HDFS
            cmd = 'hadoop fs -put %s %s' % (filename, hdfs_dir_full)
            _log(cmd)
            cmd_out, cmd_err, rc = util_run_cmd(cmd)
            if rc != 0:
                _log('*** ERROR *** %s' % cmd_err)
                n_errors += 1
                break

    if n_errors > 0:
        _log('*** ERROR *** Encountered errors during copy. Exiting.')
        return

    _log('Done sending data to HDFS')

    # remove local dir to save space
    shutil.rmtree(local_dir_output)

    # write timestamps to file
    with open(timestamp_filename, 'w') as fp:
        for k, v in last_partnersTimestamp.items():
            fp.write('%d %d\n' % (k, v))

    _log('Done updating timestamps')

    # build json file
    json_file = os.path.join(local_dir, 'outputByPartner')
    total_size_bytes = util_timestamp_file_to_json(timestamp_filename,
                                                   json_file)

    _log('Done creating JSON file')

    # remove remote json file
    cmd = 'hadoop fs -rm %s/outputByPartner' % hdfs_root
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)

    # send new json file
    cmd = 'hadoop fs -put %s %s' % (json_file, hdfs_root)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)

    # send success file
    with open(local_success_file, 'w') as fp:
        fp.write('0')

    cmd = 'hadoop fs -put %s %s' % (local_success_file, hdfs_dir)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)

    sent_mbytes = round(1.0 * total_size_bytes / 1000000)

    # move AWS input directory to bin
    #bin_input_dir = '/opt/old.output'
    #_log ('Moving AWS dir %s to %s' % (input_dir, bin_input_dir))
    #move_aws_directory (host, input_dir, bin_input_dir)
    #_log ('Done.')

    # write job timestamp to file
    with open(job_timestamp_filename, 'w') as gp:
        gp.write('%d' % int(time.time()))

    _log('Summary : Sent %d MB to HDFS' % sent_mbytes)
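
The examples also depend on util_run_cmd(), exists_hdfs() and exists_file_hdfs(), which are defined elsewhere. Minimal sketches, assuming a plain subprocess wrapper and 'hadoop fs -test' for the existence checks (the real helpers are not shown and may differ):

import subprocess

def util_run_cmd(cmd):
    # Hypothetical sketch: run a shell command and return (stdout, stderr,
    # return code), matching the 3-tuple unpacked throughout the examples.
    p = subprocess.Popen(cmd, shell=True,
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    out, err = p.communicate()
    return out, err, p.returncode

def exists_hdfs(hdfs_dir):
    # Hypothetical sketch: 'hadoop fs -test -d' exits with 0 when the
    # directory exists on HDFS.
    _, _, rc = util_run_cmd('hadoop fs -test -d %s' % hdfs_dir)
    return rc == 0

def exists_file_hdfs(hdfs_path):
    # Hypothetical sketch: 'hadoop fs -test -e' exits with 0 when the
    # file exists on HDFS.
    _, _, rc = util_run_cmd('hadoop fs -test -e %s' % hdfs_path)
    return rc == 0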
Example #3
def check_partners():

    hdfs_input_json_filename = '/user/recocomputer/bestofs/imagedl/outputByPartner'
    hdfs_output_json_filename = '/user/o.koch/cnn/outputByPartner'
    input_json_filename = '/home/o.koch/image-search/.input.outputByPartner.argus'
    output_json_filename = '/home/o.koch/image-search/.output.outputByPartner.argus'
    timestamp_filename = '/home/o.koch/image-search/lastPartnerTimestamp.txt'
    hdfs_root = '/user/o.koch/cnn/'
    hathi_lastrun_filename = '/home/o.koch/image-search/.hathi.lastrun.timestamp'
    baloo_lastrun_filename = '/home/o.koch/image-search/.baloo.lastrun.timestamp'
    host = '[email protected]'

    content = ''
    good_to_go = True

    # count dirs to process
    n_dirs = count_aws_dirs_to_process(host, '/opt/input')
    if n_dirs is not None:
        _log('%d dirs left to process on AWS' % n_dirs)
        content += '%d dirs left to process on AWS\n' % n_dirs

    # compute job delays
    hathi_lastrun_timestamp = 0
    baloo_lastrun_timestamp = 0
    if os.path.isfile(hathi_lastrun_filename):
        with open(hathi_lastrun_filename, 'r') as gp:
            hathi_lastrun_timestamp = int(gp.read())
    if os.path.isfile(baloo_lastrun_filename):
        with open(baloo_lastrun_filename, 'r') as gp:
            baloo_lastrun_timestamp = int(gp.read())

    ref_timestamp = int(time.time())
    hathi_delay = ref_timestamp - hathi_lastrun_timestamp
    baloo_delay = ref_timestamp - baloo_lastrun_timestamp

    _log('Send-to-AWS   delay : %s' % delay_sec_to_string(baloo_delay))
    _log('Recv-from-AWS delay : %s' % delay_sec_to_string(hathi_delay))

    content += 'Send-to-AWS   delay : %s\n' % delay_sec_to_string(baloo_delay)
    content += 'Recv-from-AWS delay : %s\n' % delay_sec_to_string(hathi_delay)

    # remove local files
    if os.path.isfile(input_json_filename):
        os.remove(input_json_filename)
    if os.path.isfile(output_json_filename):
        os.remove(output_json_filename)

    # fetch input outputByPartner on HDFS
    cmd = 'hadoop fs -get %s %s' % (hdfs_input_json_filename,
                                    input_json_filename)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)
        content += '*** ERROR *** %s\n' % cmd_err
        good_to_go = False

    if util_file_size(input_json_filename) == 0:
        _log(
            '*** ERROR *** did not find outputByPartner at this location on HDFS : %s'
            % hdfs_input_json_filename)
        content += '*** ERROR *** did not find outputByPartner at this location on HDFS : %s\n' % hdfs_input_json_filename
        good_to_go = False

    # fetch output outputByPartner on HDFS
    cmd = 'hadoop fs -get %s %s' % (hdfs_output_json_filename,
                                    output_json_filename)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)
        content += '*** ERROR *** %s\n' % cmd_err
        good_to_go = False

    if util_file_size(output_json_filename) == 0:
        _log(
            '*** ERROR *** did not find outputByPartner at this location on HDFS : %s'
            % hdfs_output_json_filename)
        content += '*** ERROR *** did not find outputByPartner at this location on HDFS : %s\n' % hdfs_output_json_filename
        good_to_go = False

    if not good_to_go:
        return

    # load last timestamp for each partner
    last_partnersTimestamp = {}
    if os.path.isfile(timestamp_filename):
        with open(timestamp_filename, 'r') as fp:
            for line in fp:
                partnerid, timestamp = line.strip().split(' ')
                last_partnersTimestamp[int(partnerid)] = int(timestamp)

    # parse input data
    input_data = None
    with open(input_json_filename, 'r') as fp:
        json_data = fp.read()
        try:
            input_data = json.loads(json_data)
        except ValueError:
            _log('*** ERROR *** Failed to read JSON file %s.  Exiting.' %
                 input_json_filename)
            content += '*** ERROR *** Failed to read JSON file %s.  Exiting.\n' % input_json_filename
            good_to_go = False

    # parse output data
    output_data = None
    with open(output_json_filename, 'r') as fp:
        json_data = fp.read()
        try:
            output_data = json.loads(json_data)
        except ValueError:
            _log('*** ERROR *** Failed to read JSON file %s.  Exiting.' %
                 output_json_filename)
            content += '*** ERROR *** Failed to read JSON file %s.  Exiting.\n' % output_json_filename
            good_to_go = False

    if not good_to_go:
        return

    assert (input_data is not None)
    assert (output_data is not None)

    computed_size = 0
    remaining_size = 0
    nb_skipped_partners = 0
    nb_total_partners = 0

    # compute amount of data to process
    for item in input_data:
        partner_id = int(item)
        partner_timestamp = int(input_data[item]['jobTimeStamp'])
        nb_total_partners += 1

        if (partner_id in last_partnersTimestamp
                and last_partnersTimestamp[partner_id] == partner_timestamp):
            computed_size += input_data[item]['outputSize']
        else:
            remaining_size += input_data[item]['outputSize']
            nb_skipped_partners += 1

    computed_size_gb = 1.0 * computed_size / 1000000000
    remaining_size_gb = 1.0 * remaining_size / 1000000000

    _log('Computed size  : %4.1f GB' % computed_size_gb)
    _log('Remaining size : %4.1f GB' % remaining_size_gb)
    _log('# skipped partners : %d out of %d total' %
         (nb_skipped_partners, nb_total_partners))

    content += 'Computed size  : %4.1f GB\n' % computed_size_gb
    content += 'Remaining size : %4.1f GB\n' % remaining_size_gb
    content += '# skipped partners : %d out of %d total\n' % (
        nb_skipped_partners, nb_total_partners)

    # check for missing partners in output
    for item in input_data:
        if item not in output_data:
            _log('*** WARNING *** Partner %s missing in output.' % item)
            content += '*** WARNING *** Partner %s missing in output.\n' % item

    # check that all files exist on HDFS
    n_files = 0
    n_files_success = 0

    check_hdfs = True
    if check_hdfs:
        for item in input_data:
            if item not in output_data:
                continue

            # check that files exist on HDFS
            output_folder = output_data[item]['outputFolder']
            output_files = output_data[item]['files']
            for filename in output_files:
                hdfs_path = os.path.join(hdfs_root, output_folder, filename)
                n_files += 1
                print('checking %s' % hdfs_path)
                if not exists_file_hdfs(hdfs_path):
                    _log(
                        '*** ERROR *** File %s does not exist on HDFS but is listed in outputByPartner.'
                        % hdfs_path)
                    content += '*** ERROR *** File %s does not exist on HDFS but is listed in outputByPartner.\n' % hdfs_path
                else:
                    n_files_success += 1

        _log('%d/%d files checked successfully on HDFS.' %
             (n_files_success, n_files))
        content += '%d/%d files checked successfully on HDFS.\n' % (
            n_files_success, n_files)

    # alert?
    warning = False
    alert = False
    if n_files_success != n_files:
        alert = True
        content = '*** Warning! Some files seem to be missing on HDFS ***\n' + content

    if (baloo_delay > 12 * 3600) or (hathi_delay > 12 * 3600):
        alert = True
        content = '*** Warning! Some jobs are more than 12 hours old ***\n' + content

    elif (baloo_delay > 6 * 3600) or (hathi_delay > 6 * 3600):
        warning = True
        content = '*** Warning! Some jobs are more than 6 hours old ***\n' + content

    if alert:
        title = '[prod][%s][aws-tiger] Summary -- Alert' % util_get_platform()
    elif warning:
        title = '[prod][%s][aws-tiger] Summary -- Warning' % util_get_platform()
    else:
        title = '[prod][%s][aws-tiger] Summary -- OK' % util_get_platform()

    _log('Sending email with following title : %s' % title)

    # build email
    email_file = '/tmp/.email.argus.%d' % int(time.time())
    with open(email_file, 'w') as fp:
        fp.write(content)

    util_send_email('*****@*****.**', title, email_file,
                    1000)
    util_send_email('*****@*****.**', title, email_file, 1000)

    os.remove(email_file)
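
delay_sec_to_string() is assumed but not shown either; a minimal sketch that formats a delay in seconds as hours and minutes for the log lines and summary e-mail might be:

def delay_sec_to_string(delay_sec):
    # Hypothetical sketch: render a delay in seconds as 'Xh YYm'.
    # The real helper used by check_partners() is not shown here.
    hours = int(delay_sec) // 3600
    minutes = (int(delay_sec) % 3600) // 60
    return '%dh %02dm' % (hours, minutes)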
Example #4
def main():

    dry_run = True
    dry_run = False
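    # dry_run is a manual toggle: the second assignment overrides the first
    # so the commands below are actually executed.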

    # header for the log file
    _log('======================================================================')

    remote_root = '/opt/input'
    json_filename = '/home/o.koch/image-search/outputByPartner'
    timestamp_filename = '/home/o.koch/image-search/lastPartnerTimestamp.txt'
    gpu_ip = '54.73.223.224'
    root_dir = '/user/recocomputer/bestofs/imagedl/'
    local_dir = '/home/o.koch/image-search/input'
    job_timestamp_filename = '/home/o.koch/image-search/.baloo.lastrun.timestamp'

    # build timestamp
    # timestamp is odd on PA4 and even on AM5
    ref_time = int(time.time())
    if util_get_platform() == 'pa4':
        if ref_time % 2 == 0:
            ref_time += 1
    elif util_get_platform() == 'am5':
        if ref_time % 2 == 1:
            ref_time += 1
    else:
        _log('*** ERROR *** Unrecognized platform %s' % util_get_platform())
        assert False

    dt_string = '%d' % ref_time
    remote_dir = '%s/%s' % (remote_root, dt_string)

    # remove old JSON file
    if os.path.isfile(json_filename):
        os.remove(json_filename)

    # clean up local directory
    if not dry_run:
        if os.path.isdir(local_dir):
            _log('Removing %s...' % local_dir)
            shutil.rmtree(local_dir)
            _log('done.')
        os.makedirs(local_dir)

    # remove local file
    if os.path.isfile(json_filename):
        os.remove(json_filename)

    # fetch outputByPartner on HDFS
    # leave if file is not on HDFS
    cmd = 'hadoop fs -get %s/outputByPartner %s' % (root_dir, json_filename)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)
        return

    # parse JSON data
    with open(json_filename, 'r') as fp:
        json_data = fp.read()
        try:
            data = json.loads(json_data)
        except ValueError:
            _log('*** ERROR *** Failed to read JSON file %s.  File might be empty.  Exiting.' % json_filename)
            return

    # load last timestamp for each partner
    last_partnersTimestamp = {}
    if os.path.isfile(timestamp_filename):
        with open(timestamp_filename, 'r') as fp:
            for line in fp:
                partnerid, timestamp = line.strip().split(' ')
                last_partnersTimestamp[int(partnerid)] = int(timestamp)

    # sort partners by age
    partners_by_age = []
    for item in data:
        partner_id = int(item)
        partner_timestamp = int(data[item]['jobTimeStamp'])
        item_age = partner_timestamp
        if partner_id in last_partnersTimestamp:
            item_age -= last_partnersTimestamp[partner_id]
        else:
            print('partner %s not found' % item)
        partners_by_age.append((item, item_age))

    partners_by_age.sort(key=lambda x: x[1], reverse=True)

    n_transferred_files = 0
    n_transferred_partners = 0
    n_proc = 2
    n_files_limit = 200

    cmd_list = []

    # parse json file
    for item in [x[0] for x in partners_by_age]:
        partner_id = int(item)
        partner_timestamp = int(data[item]['jobTimeStamp'])

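        # hard-coded filter: only partner 13045 is processed in this run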
        if partner_id != 13045:
            continue

        # cap number of files to transfer
        if n_transferred_files > n_files_limit:
            _log('*** Reached file limit (%d files) ***  Partner ID %d and the following ones will be skipped.' %
                 (n_transferred_files, partner_id))
            break

        if partner_id in last_partnersTimestamp and last_partnersTimestamp[partner_id] >= partner_timestamp:
            # _log('Skipping partner %d.  No new data to process. Current timestamp : %d.  Last timestamp :%d.  Current - last = %d.' % (partner_id, \
            #        partner_timestamp, last_partnersTimestamp[partner_id], partner_timestamp - last_partnersTimestamp[partner_id]))
            continue

        last_partnersTimestamp[partner_id] = partner_timestamp
        n_transferred_partners += 1
        _log('Processing partner %d with timestamp %d' %
             (partner_id, partner_timestamp))

        # get file
        output_folder = data[item]['outputFolder']
        files = data[item]['files']
        for file in files:
            target = os.path.join(root_dir, output_folder, file)
            local_file = os.path.join(
                local_dir, '%d-%s.bin' % (partner_id, file))

            # copy from HDFS
            cmd_1 = 'hadoop fs -get %s %s' % (target, local_file)

            # send to AWS
            cmd_2 = 'scp %s ubuntu@%s:%s' % (local_file, gpu_ip, remote_dir)

            cmd_list.append((cmd_1, cmd_2, partner_id))

            n_transferred_files += 1

    # stop here if nothing to do
    if n_transferred_files == 0:
        _log('No files were planned for transfer.  Stopping here.')
        return

    # create remote dir on AWS (once, before any transfer)
    cmd = 'ssh ubuntu@%s \'mkdir %s\'' % (gpu_ip, remote_dir)
    _log(cmd)
    if not dry_run:
        cmd_out, cmd_err, rc = util_run_cmd(cmd)
        if rc != 0:
            _log('*** ERROR *** %s' % cmd_err)
            return

    # split commands among processes
    cmd_lists = [[] for _ in range(n_proc)]
    for c, cmd in enumerate(cmd_list):
        cmd_lists[c % n_proc].append(cmd)

    # run commands
    manager = multiprocessing.Manager()
    return_dict = manager.dict()

    jobs = []
    c = 0
    for cmd_list in cmd_lists:
        process = multiprocessing.Process(target=run_commands, args=[
                                          c, cmd_list, dry_run, return_dict])
        process.start()
        jobs.append(process)
        c += 1

    # wait for jobs to finish
    for job in jobs:
        job.join()

    # if any of the job failed, exit
    for k in return_dict.values():
        if k != 0:
            _log('*** ERROR *** One of the baloo children failed.  Exiting.')
            # remove local data
            assert (os.path.isdir(local_dir))
            _log('Removing %s...' % local_dir)
            shutil.rmtree(local_dir)
            _log('done.')
            return

    # write timestamps to file
    if not dry_run:
        with open(timestamp_filename, 'w') as fp:
            for k, v in last_partnersTimestamp.items():
                fp.write('%d %d\n' % (k, v))

    # create success file on AWS
    local_success_file = os.path.join(local_dir, '.success.baloo')
    with open(local_success_file, 'w') as gp:
        gp.write('%s' % dt_string)
    cmd = 'scp %s ubuntu@%s:%s' % (local_success_file, gpu_ip, remote_dir)
    _log(cmd)
    if not dry_run:
        cmd_out, cmd_err, rc = util_run_cmd(cmd)
        if rc != 0:
            _log('*** ERROR *** %s' % cmd_err)

    # remove local dir
    if not dry_run:
        assert (os.path.isdir(local_dir))
        _log('Removing %s...' % local_dir)
        shutil.rmtree(local_dir)
        _log('done.')

    # write job timestamp to file
    with open(job_timestamp_filename, 'w') as gp:
        gp.write('%d' % int(time.time()))

    _log('Summary : transferred %d files (%d partners) to AWS' %
         (n_transferred_files, n_transferred_partners))
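
Example #4 hands each chunk of commands to a run_commands() worker that is not shown. A minimal sketch consistent with how it is called above (each entry is a (hadoop get, scp, partner_id) tuple, and the worker reports its status in return_dict):

def run_commands(worker_id, cmd_list, dry_run, return_dict):
    # Hypothetical sketch of the multiprocessing worker used in main():
    # run each (hadoop get, scp) command pair in order and record a
    # non-zero status in return_dict if any command fails.
    for cmd_1, cmd_2, partner_id in cmd_list:
        for cmd in (cmd_1, cmd_2):
            _log(cmd)
            if dry_run:
                continue
            cmd_out, cmd_err, rc = util_run_cmd(cmd)
            if rc != 0:
                _log('*** ERROR *** %s' % cmd_err)
                return_dict[worker_id] = rc
                return
    return_dict[worker_id] = 0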