def _log(msg):
    """Print a log message and append it to the per-script log file; error messages also go to a .log.error file."""
    dt_string = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    filename = os.path.basename(__file__)
    msg_string = '[%s]\t%s\t[%s]\t%s' % (
        util_get_platform(), dt_string, filename, msg)
    print(msg_string)

    with open('/home/o.koch/image-search/_%s.log' % filename, 'a') as fp:
        fp.write('%s\n' % msg_string)

    # log error messages (but ignore HDFS warnings)
    if msg.find('ERROR') != -1 and msg.find('WARN retry.RetryInvocationHandler') == -1:
        with open('/home/o.koch/image-search/_%s.log.error' % filename, 'a') as fp:
            fp.write('%s\n' % msg_string)
def process_dir(input_dir):
    """Copy one AWS output directory to the gateway, push its per-partner files to HDFS, and update partner timestamps."""
    host = '[email protected]'
    local_dir = '/home/o.koch/image-search/output-per-partner'
    remote_success_file = '/opt/output-per-partner/.success'
    local_success_file = '/home/o.koch/image-search/_SUCCESS'
    timestamp_filename = '/home/o.koch/image-search/lastPartnerTimestamp_computed.txt'
    job_timestamp_filename = '/home/o.koch/image-search/.hathi.lastrun.timestamp'

    _log('Found following dir to process : %s' % input_dir)

    timestamp_aws = os.path.basename(input_dir)

    # skip directory if it does not belong to this platform
    if util_get_platform() == 'pa4' and (int(timestamp_aws) % 2 == 0):
        _log('This is a directory for AM5. Skipping...')
        return
    if util_get_platform() == 'am5' and (int(timestamp_aws) % 2 == 1):
        _log('This is a directory for PA4. Skipping...')
        return

    hdfs_root = '/user/o.koch/cnn'
    hdfs_dir = '%s/%s' % (hdfs_root, timestamp_aws)

    # quit if directory already exists on HDFS
    if exists_hdfs(hdfs_dir):
        _log('Directory %s already exists on HDFS. Skipping...' % hdfs_dir)
        return

    # create directory on HDFS
    cmd = 'hadoop fs -mkdir %s' % hdfs_dir
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)
        return

    # create local directory
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)

    # copy files to gateway
    cmd = 'scp -r %s:%s %s' % (host, input_dir, local_dir)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)
        return

    # load last timestamp for each partner
    last_partnersTimestamp = {}
    if os.path.isfile(timestamp_filename):
        with open(timestamp_filename, 'r') as fp:
            lines = fp.readlines()
        for l in lines:
            l = l.strip().split(' ')
            partnerid = int(l[0])
            timestamp = int(l[1])
            last_partnersTimestamp[partnerid] = timestamp

    # copy files to HDFS
    local_dir_output = os.path.join(local_dir, timestamp_aws)
    assert os.path.isdir(local_dir_output)

    n_errors = 0
    for root, dirs, files in os.walk(local_dir_output):
        for filen in files:
            if not os.path.basename(filen).endswith('.gz'):
                continue
            filename = os.path.join(root, filen)

            # extract partner id from filename
            partnerid = int(re.findall('[0-9]+', filename)[-1])
            last_partnersTimestamp[partnerid] = int(timestamp_aws)
            hdfs_dir_full = '%s/%d' % (hdfs_dir, partnerid)

            # create subdir on HDFS
            cmd = 'hadoop fs -mkdir %s' % hdfs_dir_full
            cmd_out, cmd_err, rc = util_run_cmd(cmd)
            if rc != 0:
                _log('*** ERROR *** %s' % cmd_err)
                n_errors += 1
                break

            # transfer file to HDFS
            cmd = 'hadoop fs -put %s %s' % (filename, hdfs_dir_full)
            _log(cmd)
            cmd_out, cmd_err, rc = util_run_cmd(cmd)
            if rc != 0:
                _log('*** ERROR *** %s' % cmd_err)
                n_errors += 1
                break

    if n_errors > 0:
        _log('*** ERROR *** Encountered errors during copy. Exiting.')
        return

    _log('Done sending data to HDFS')

    # remove local dir to save space
    shutil.rmtree(local_dir_output)

    # write timestamps to file
    with open(timestamp_filename, 'w') as fp:
        for k, v in last_partnersTimestamp.items():
            fp.write('%d %d\n' % (k, v))
    _log('Done updating timestamps')

    # build json file
    json_file = os.path.join(local_dir, 'outputByPartner')
    total_size_bytes = util_timestamp_file_to_json(timestamp_filename, json_file)
    _log('Done creating JSON file')

    # remove remote json file
    cmd = 'hadoop fs -rm %s/outputByPartner' % hdfs_root
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)

    # send new json file
    cmd = 'hadoop fs -put %s %s' % (json_file, hdfs_root)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)

    # send success file
    with open(local_success_file, 'w') as fp:
        fp.write('0')
    cmd = 'hadoop fs -put %s %s' % (local_success_file, hdfs_dir)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)

    sent_mbytes = round(1.0 * total_size_bytes / 1000000)

    # move AWS input directory to bin
    #bin_input_dir = '/opt/old.output'
    #_log('Moving AWS dir %s to %s' % (input_dir, bin_input_dir))
    #move_aws_directory(host, input_dir, bin_input_dir)
    #_log('Done.')

    # write job timestamp to file
    with open(job_timestamp_filename, 'w') as gp:
        gp.write('%d' % int(time.time()))

    _log('Summary : Sent %d MB to HDFS' % sent_mbytes)
def check_partners():
    """Compare the input and output outputByPartner manifests, verify listed files on HDFS, and send a summary email."""
    hdfs_input_json_filename = '/user/recocomputer/bestofs/imagedl/outputByPartner'
    hdfs_output_json_filename = '/user/o.koch/cnn/outputByPartner'
    input_json_filename = '/home/o.koch/image-search/.input.outputByPartner.argus'
    output_json_filename = '/home/o.koch/image-search/.output.outputByPartner.argus'
    timestamp_filename = '/home/o.koch/image-search/lastPartnerTimestamp.txt'
    hdfs_root = '/user/o.koch/cnn/'
    hathi_lastrun_filename = '/home/o.koch/image-search/.hathi.lastrun.timestamp'
    baloo_lastrun_filename = '/home/o.koch/image-search/.baloo.lastrun.timestamp'
    host = '[email protected]'

    content = ''
    good_to_go = True

    # count dirs to process
    n_dirs = count_aws_dirs_to_process(host, '/opt/input')
    if n_dirs is not None:
        _log('%d dirs left to process on AWS' % n_dirs)
        content += '%d dirs left to process on AWS\n' % n_dirs

    # compute job delays
    hathi_lastrun_timestamp = 0
    baloo_lastrun_timestamp = 0
    if os.path.isfile(hathi_lastrun_filename):
        with open(hathi_lastrun_filename, 'r') as gp:
            hathi_lastrun_timestamp = int(gp.read())
    if os.path.isfile(baloo_lastrun_filename):
        with open(baloo_lastrun_filename, 'r') as gp:
            baloo_lastrun_timestamp = int(gp.read())

    ref_timestamp = int(time.time())
    hathi_delay = ref_timestamp - hathi_lastrun_timestamp
    baloo_delay = ref_timestamp - baloo_lastrun_timestamp

    _log('Send-to-AWS delay : %s' % delay_sec_to_string(baloo_delay))
    _log('Recv-from-AWS delay : %s' % delay_sec_to_string(hathi_delay))
    content += 'Send-to-AWS delay : %s\n' % delay_sec_to_string(baloo_delay)
    content += 'Recv-from-AWS delay : %s\n' % delay_sec_to_string(hathi_delay)

    # remove local files
    if os.path.isfile(input_json_filename):
        os.remove(input_json_filename)
    if os.path.isfile(output_json_filename):
        os.remove(output_json_filename)

    # fetch input outputByPartner from HDFS
    cmd = 'hadoop fs -get %s %s' % (hdfs_input_json_filename, input_json_filename)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)
        content += '*** ERROR *** %s\n' % cmd_err
        good_to_go = False

    if util_file_size(input_json_filename) == 0:
        _log('*** ERROR *** did not find outputByPartner at this location on HDFS : %s' % hdfs_input_json_filename)
        content += '*** ERROR *** did not find outputByPartner at this location on HDFS : %s\n' % hdfs_input_json_filename
        good_to_go = False

    # fetch output outputByPartner from HDFS
    cmd = 'hadoop fs -get %s %s' % (hdfs_output_json_filename, output_json_filename)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)
        content += '*** ERROR *** %s\n' % cmd_err
        good_to_go = False

    if util_file_size(output_json_filename) == 0:
        _log('*** ERROR *** did not find outputByPartner at this location on HDFS : %s' % hdfs_output_json_filename)
        content += '*** ERROR *** did not find outputByPartner at this location on HDFS : %s\n' % hdfs_output_json_filename
        good_to_go = False

    if not good_to_go:
        return

    # load last timestamp for each partner
    last_partnersTimestamp = {}
    if os.path.isfile(timestamp_filename):
        with open(timestamp_filename, 'r') as fp:
            lines = fp.readlines()
        for l in lines:
            l = l.strip().split(' ')
            partnerid = int(l[0])
            timestamp = int(l[1])
            last_partnersTimestamp[partnerid] = timestamp

    # parse input data
    input_data = None
    with open(input_json_filename, 'r') as fp:
        json_data = fp.read()
    try:
        input_data = json.loads(json_data)
    except ValueError:
        _log('*** ERROR *** Failed to read JSON file %s. Exiting.' % input_json_filename)
        content += '*** ERROR *** Failed to read JSON file %s. Exiting.\n' % input_json_filename
        good_to_go = False

    # parse output data
    output_data = None
    with open(output_json_filename, 'r') as fp:
        json_data = fp.read()
    try:
        output_data = json.loads(json_data)
    except ValueError:
        _log('*** ERROR *** Failed to read JSON file %s. Exiting.' % output_json_filename)
        content += '*** ERROR *** Failed to read JSON file %s. Exiting.\n' % output_json_filename
        good_to_go = False

    if not good_to_go:
        return

    assert input_data is not None
    assert output_data is not None

    computed_size = 0
    remaining_size = 0
    nb_skipped_partners = 0
    nb_total_partners = 0

    # compute amount of data to process
    for item in input_data:
        partner_id = int(item)
        partner_timestamp = int(input_data[item]['jobTimeStamp'])
        nb_total_partners += 1
        if partner_id in last_partnersTimestamp and last_partnersTimestamp[partner_id] == partner_timestamp:
            computed_size += input_data[item]['outputSize']
        else:
            remaining_size += input_data[item]['outputSize']
            nb_skipped_partners += 1

    computed_size_gb = 1.0 * computed_size / 1000000000
    remaining_size_gb = 1.0 * remaining_size / 1000000000

    _log('Computed size : %4.1f GB' % computed_size_gb)
    _log('Remaining size : %4.1f GB' % remaining_size_gb)
    _log('# skipped partners : %d out of %d total' % (nb_skipped_partners, nb_total_partners))
    content += 'Computed size : %4.1f GB\n' % computed_size_gb
    content += 'Remaining size : %4.1f GB\n' % remaining_size_gb
    content += '# skipped partners : %d out of %d total\n' % (nb_skipped_partners, nb_total_partners)

    # check for missing partners in output
    for item in input_data:
        if item not in output_data:
            _log('*** WARNING *** Partner %s missing in output.' % item)
            content += '*** WARNING *** Partner %s missing in output.\n' % item

    # check that all files exist on HDFS
    n_files = 0
    n_files_success = 0
    check_hdfs = True
    if check_hdfs:
        for item in input_data:
            if item not in output_data:
                continue
            # check that files exist on HDFS
            output_folder = output_data[item]['outputFolder']
            output_files = output_data[item]['files']
            for filename in output_files:
                hdfs_path = os.path.join(hdfs_root, output_folder, filename)
                n_files += 1
                print('checking %s' % hdfs_path)
                if not exists_file_hdfs(hdfs_path):
                    _log('*** ERROR *** File %s does not exist on HDFS but is listed in outputByPartner.' % hdfs_path)
                    content += '*** ERROR *** File %s does not exist on HDFS but is listed in outputByPartner.\n' % hdfs_path
                else:
                    n_files_success += 1

    _log('%d/%d files checked successfully on HDFS.' % (n_files_success, n_files))
    content += '%d/%d files checked successfully on HDFS.\n' % (n_files_success, n_files)

    # alert?
    warning = False
    alert = False
    if n_files_success != n_files:
        alert = True
        content = '*** Warning! Some files seem to be missing on HDFS ***\n' + content
    if (baloo_delay > 12 * 3600) or (hathi_delay > 12 * 3600):
        alert = True
        content = '*** Warning! Some jobs are more than 12 hours old ***\n' + content
    elif (baloo_delay > 6 * 3600) or (hathi_delay > 6 * 3600):
        warning = True
        content = '*** Warning! Some jobs are more than 6 hours old ***\n' + content

    if alert:
        title = '[prod][%s][aws-tiger] Summary -- Alert' % util_get_platform()
    elif warning:
        title = '[prod][%s][aws-tiger] Summary -- Warning' % util_get_platform()
    else:
        title = '[prod][%s][aws-tiger] Summary -- OK' % util_get_platform()

    _log('Sending email with following title : %s' % title)

    # build email
    email_file = '/tmp/.email.argus.%d' % int(time.time())
    with open(email_file, 'w') as fp:
        fp.write(content)
    util_send_email('*****@*****.**', title, email_file, 1000)
    util_send_email('*****@*****.**', title, email_file, 1000)
    os.remove(email_file)
def main():
    """Select partners with new data in outputByPartner and ship their files from HDFS to the AWS GPU host."""
    dry_run = True
    dry_run = False

    # header for the log file
    _log('======================================================================')

    remote_root = '/opt/input'
    json_filename = '/home/o.koch/image-search/outputByPartner'
    timestamp_filename = '/home/o.koch/image-search/lastPartnerTimestamp.txt'
    gpu_ip = '54.73.223.224'
    root_dir = '/user/recocomputer/bestofs/imagedl/'
    local_dir = '/home/o.koch/image-search/input'
    job_timestamp_filename = '/home/o.koch/image-search/.baloo.lastrun.timestamp'

    # build timestamp
    # timestamp is odd on PA4 and even on AM5
    ref_time = int(time.time())
    if util_get_platform() == 'pa4':
        if ref_time % 2 == 0:
            ref_time += 1
    elif util_get_platform() == 'am5':
        if ref_time % 2 == 1:
            ref_time += 1
    else:
        _log('***ERROR*** Unrecognized platform %s' % util_get_platform())
        assert False

    dt_string = '%d' % ref_time
    remote_dir = '%s/%s' % (remote_root, dt_string)

    # remove old JSON file
    if os.path.isfile(json_filename):
        os.remove(json_filename)

    # clean up local directory
    if not dry_run:
        if os.path.isdir(local_dir):
            _log('Removing %s...' % local_dir)
            shutil.rmtree(local_dir)
            _log('done.')
        os.makedirs(local_dir)

    # remove local file
    if os.path.isfile(json_filename):
        os.remove(json_filename)

    # fetch outputByPartner on HDFS
    # leave if file is not on HDFS
    cmd = 'hadoop fs -get %s/outputByPartner %s' % (root_dir, json_filename)
    _log(cmd)
    cmd_out, cmd_err, rc = util_run_cmd(cmd)
    if rc != 0:
        _log('*** ERROR *** %s' % cmd_err)
        return

    # parse JSON data
    with open(json_filename, 'r') as fp:
        json_data = fp.read()
    try:
        data = json.loads(json_data)
    except ValueError:
        _log('*** ERROR *** Failed to read JSON file %s. File might be empty. Exiting.' % json_filename)
        return

    # load last timestamp for each partner
    last_partnersTimestamp = {}
    if os.path.isfile(timestamp_filename):
        with open(timestamp_filename, 'r') as fp:
            lines = fp.readlines()
        for l in lines:
            l = l.strip().split(' ')
            partnerid = int(l[0])
            timestamp = int(l[1])
            last_partnersTimestamp[partnerid] = timestamp

    # sort partners by age
    partners_by_age = []
    for item in data:
        partner_id = int(item)
        partner_timestamp = int(data[item]['jobTimeStamp'])
        item_age = partner_timestamp
        if partner_id in last_partnersTimestamp:
            item_age -= last_partnersTimestamp[partner_id]
        else:
            print('partner %s not found' % item)
        partners_by_age.append((item, item_age))
    partners_by_age.sort(key=lambda x: x[1], reverse=True)

    n_transferred_files = 0
    n_transferred_partners = 0
    n_proc = 2
    n_files_limit = 200
    cmd_list = []

    # parse json file
    for item in [x[0] for x in partners_by_age]:
        partner_id = int(item)
        partner_timestamp = int(data[item]['jobTimeStamp'])

        if partner_id != 13045:
            continue

        # cap number of files to transfer
        if n_transferred_files > n_files_limit:
            _log('*** Reached file limit (%d files) **** Partner ID %d and the following ones will be skipped.'
                 % (n_transferred_files, partner_id))
            break

        if partner_id in last_partnersTimestamp and last_partnersTimestamp[partner_id] >= partner_timestamp:
            # _log('Skipping partner %d. No new data to process. Current timestamp : %d. Last timestamp : %d. Current - last = %d.'
            #      % (partner_id, partner_timestamp, last_partnersTimestamp[partner_id],
            #         partner_timestamp - last_partnersTimestamp[partner_id]))
            continue

        last_partnersTimestamp[partner_id] = partner_timestamp
        n_transferred_partners += 1
        _log('Processing partner %d with timestamp %d' % (partner_id, partner_timestamp))

        # get file
        output_folder = data[item]['outputFolder']
        files = data[item]['files']

        for file in files:
            target = os.path.join(root_dir, output_folder, file)
            local_file = os.path.join(local_dir, '%d-%s.bin' % (partner_id, file))

            # copy from HDFS
            cmd_1 = 'hadoop fs -get %s %s' % (target, local_file)
            # send to AWS
            cmd_2 = 'scp %s ubuntu@%s:%s' % (local_file, gpu_ip, remote_dir)

            cmd_list.append((cmd_1, cmd_2, partner_id))
            n_transferred_files += 1

    # stop here if nothing to do
    if n_transferred_files == 0:
        _log('No files were planned for transfer. Stopping here.')
        return

    # create remote dir on AWS (for first file only)
    cmd = 'ssh ubuntu@%s \'mkdir %s\'' % (gpu_ip, remote_dir)
    _log(cmd)
    if not dry_run:
        cmd_out, cmd_err, rc = util_run_cmd(cmd)
        if rc != 0:
            _log('*** ERROR *** %s' % cmd_err)
            return

    # split commands among processes
    cmd_lists = [[] for c in range(n_proc)]
    for (cmd, c) in zip(cmd_list, range(len(cmd_list))):
        cmd_lists[c % n_proc].append(cmd)

    # run commands
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    jobs = []
    c = 0
    for cmd_list in cmd_lists:
        process = multiprocessing.Process(target=run_commands,
                                          args=[c, cmd_list, dry_run, return_dict])
        process.start()
        jobs.append(process)
        c += 1

    # wait for jobs to finish
    for job in jobs:
        job.join()

    # if any of the jobs failed, exit
    for k in return_dict.values():
        if k != 0:
            _log('*** ERROR *** One of the baloo children failed. Exiting.')
            # remove local data
            assert os.path.isdir(local_dir)
            _log('Removing %s...' % local_dir)
            shutil.rmtree(local_dir)
            _log('done.')
            return

    # write timestamps to file
    if not dry_run:
        with open(timestamp_filename, 'w') as fp:
            for k, v in last_partnersTimestamp.items():
                fp.write('%d %d\n' % (k, v))

    # create success file on AWS
    local_success_file = os.path.join(local_dir, '.success.baloo')
    with open(local_success_file, 'w') as gp:
        gp.write('%s' % dt_string)
    cmd = 'scp %s ubuntu@%s:%s' % (local_success_file, gpu_ip, remote_dir)
    _log(cmd)
    if not dry_run:
        cmd_out, cmd_err, rc = util_run_cmd(cmd)
        if rc != 0:
            _log('*** ERROR *** %s' % cmd_err)

    # remove local dir
    if not dry_run:
        assert os.path.isdir(local_dir)
        _log('Removing %s...' % local_dir)
        shutil.rmtree(local_dir)
        _log('done.')

    # write job timestamp to file
    with open(job_timestamp_filename, 'w') as gp:
        gp.write('%d' % int(time.time()))

    _log('Summary : transferred %d files (%d partners) to AWS'
         % (n_transferred_files, n_transferred_partners))
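# Entry point -- a minimal sketch, assuming this section ends the script and that
# main() is not already invoked elsewhere in the file (check_partners() and
# process_dir() appear to be driven by other callers or cron entries).
if __name__ == '__main__':
    main()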