def _getDuration(self, start, end):
    """Return the duration between start and end in fractional seconds (a float).

    start/end are converted via self._getDateTime(). If start is not before
    end, a warning is logged and 1.0 is returned instead, so callers can
    safely divide by the result.
    """
    startTime = self._getDateTime(start)
    endTime = self._getDateTime(end)
    if startTime >= endTime:
        # message matches the actual check (>=), consistent with the other
        # _getDuration variant in this file
        logger.warning("startTime is not before endTime")
        return 1.0  ##TODO To prevent divide by zero later
    return totalSeconds(endTime - startTime)
def _getDuration(self, start, end):
    """Return the duration between start and end in fractional seconds (a float).

    start/end are converted via self._getDateTime(). If start is not before
    end, a warning is logged and 1.0 is returned instead, so callers can
    safely divide by the result.
    """
    startTime = self._getDateTime(start)
    endTime = self._getDateTime(end)
    if startTime >= endTime:
        # message matches the actual check (>=), consistent with the other
        # _getDuration variant in this file
        logger.warning("startTime is not before endTime")
        return 1.0  ##TODO To prevent divide by zero later
    return totalSeconds(endTime - startTime)
def _resubmit_message_if_applicable(self, msg: EventMessage) -> bool:
    """Resubmit the given message to the back of the queue if it has not yet
    exhausted its retries. Returns True when the message was resubmitted,
    False otherwise."""
    if not msg or not msg.content:
        return False

    retry_count = msg.content.get('retry_count', 0)

    # mom can't handle more than 127 status updates...
    if retry_count >= min(MAX_NR_OF_RETRIES, 127):
        return False

    retry_count += 1

    try:
        # wait longer between each retry; other messages can be processed in between
        previous_attempt = msg.content.get('retry_timestamp', datetime.utcnow())
        seconds_since_attempt = totalSeconds(datetime.utcnow() - previous_attempt)
        time.sleep(max(0, 2 * retry_count - seconds_since_attempt))
    except Exception as e:
        # best effort: just continue
        logger.warning(e)

    # set/update retry values
    msg.content['retry_count'] = retry_count
    msg.content['retry_timestamp'] = datetime.utcnow()

    # resubmit
    resubmit_msg = EventMessage(subject=msg.subject, content=msg.content)
    resubmit_msg.ttl = 48 * 3600  # remove message from queue's when not picked up within 48 hours

    logger.info('resubmitting unhandled message to back of queue %s: %s %s',
                self.exchange, resubmit_msg.subject,
                str(resubmit_msg.content).replace('\n', ' '))
    self.send(resubmit_msg)
    return True
def _getDuration(self, start, end):
    """Returns number of fractional seconds as a float(!) (as totalSeconds())
    between start and end."""
    begin = parseDatetime(start)
    finish = parseDatetime(end)
    if begin >= finish:
        logger.warning("startTime is not before endTime")
        return 1.0  ##TODO To prevent divide by zero later
    return totalSeconds(finish - begin)
def executeQuery(self, query, qargs=None, fetch=FETCH_NONE):
    '''execute the query and reconnect upon OperationalError

    :param query: sql query string (may contain %s placeholders)
    :param qargs: tuple of query arguments, or None
    :param fetch: FETCH_NONE, FETCH_ONE or FETCH_ALL, selecting what to return
    :return: fetched row(s) depending on fetch, [] on integrity/programming
             errors, or None when fetching the results failed
    '''
    try:
        with self._connection.cursor(cursor_factory = psycopg2.extras.RealDictCursor) as cursor:
            start = datetime.utcnow()
            cursor.execute(query, qargs)
            if self._log_queries:
                elapsed = datetime.utcnow() - start
                elapsed_ms = 1000.0 * totalSeconds(elapsed)
                logger.info('executed query in %.1fms%s yielding %s rows: %s',
                            elapsed_ms,
                            ' (SLOW!)' if elapsed_ms > 250 else '',  # for easy log grep'ing
                            cursor.rowcount,
                            self._queryAsSingleLine(query, qargs))
            try:
                self._log_database_notifications()
                result = []
                if fetch == FETCH_ONE:
                    result = cursor.fetchone()
                if fetch == FETCH_ALL:
                    result = cursor.fetchall()
                if self.__auto_commit_selects and re.search('select', query, re.IGNORECASE):
                    # prevent dangling in idle transaction on server
                    self.commit()
                return result
            except Exception as e:
                # NOTE: on fetch errors we log and fall through, implicitly returning None
                logger.error("error while fetching result(s) for %s: %s",
                             self._queryAsSingleLine(query, qargs), e)
    except (psycopg2.OperationalError, AttributeError) as e:
        logger.error(str(e))
        while self.__connection_retries < 5:
            logger.info("(re)trying to connect to database")
            self.__connection_retries += 1
            self._connect()
            if self._connection:
                self.__connection_retries = 0
                return self.executeQuery(query, qargs, fetch)
            # quadratic backoff between reconnect attempts.
            # bugfix: original code slept for i*i with 'i' undefined -> NameError
            time.sleep(self.__connection_retries * self.__connection_retries)
    except (psycopg2.IntegrityError, psycopg2.ProgrammingError,
            psycopg2.InternalError, psycopg2.DataError) as e:
        logger.error("Rolling back query=\'%s\' due to error: \'%s\'" % (self._queryAsSingleLine(query, qargs), e))
        self.rollback()
        return []
    except Exception as e:
        logger.error(str(e))
        return []
def _do_execute_query(self, query, qargs=None, fetch=FETCH_NONE):
    '''execute the query and reconnect upon OperationalError'''
    query_log_line = self._queryAsSingleLine(query, qargs)

    try:
        self.connect_if_needed()

        # log
        logger.debug('executing query: %s', query_log_line)

        # execute, timing how long it takes
        t0 = datetime.utcnow()
        self._cursor.execute(query, qargs)
        duration_ms = 1000.0 * totalSeconds(datetime.utcnow() - t0)

        # log execution result
        logger.info('executed query in %.1fms%s yielding %s rows: %s',
                    duration_ms,
                    ' (SLOW!)' if duration_ms > 250 else '',  # for easy log grep'ing
                    self._cursor.rowcount,
                    query_log_line)

        # log any notifications from within the database itself
        self._log_database_notifications()

        self._commit_selects_if_needed(query)

        # fetch and return the requested results as plain dict(s)
        if fetch == FETCH_ONE:
            row = self._cursor.fetchone()
            return None if row is None else dict(row)
        if fetch == FETCH_ALL:
            return [dict(r) for r in self._cursor.fetchall() if r is not None]
        return []
    except psycopg2.OperationalError as oe:
        if self._is_recoverable_connection_error(oe):
            raise PostgresDBConnectionError("Could not execute query due to connection errors. '%s' error=%s" %
                                            (query_log_line,
                                             single_line_with_single_spaces(oe)))
        else:
            self._log_error_rollback_and_raise(oe, query_log_line)
    except Exception as e:
        self._log_error_rollback_and_raise(e, query_log_line)
def test_resource_usages_performance(self):
    """Stress/performance test for radb claim handling.

    Inserts tasks with varying numbers of claims (and claims-per-resource),
    cycles each task through its status lifecycle, and asserts that every
    insert/update/delete stays below a time threshold. Timing results are
    also written to a csv file for offline inspection.
    """
    ELAPSED_TRESHOLD = 2.0  #max allowed insert/update/delete time in seconds

    num_resources = self.radb.executeQuery('select count(id) from virtual_instrument.resource;', fetch=FETCH_ONE)['count']

    # make sure all resources have 1000 units available
    MAX_CAPACITY = 1000
    self.radb.executeQuery('update resource_monitoring.resource_capacity set (available, total) = (%s, %s);', (MAX_CAPACITY, MAX_CAPACITY))

    # pretend that we have an almost unlimited amount of storage space
    self.radb.executeQuery('update resource_monitoring.resource_capacity set (available, total) = (%s, %s) ' \
                           'where resource_id in (select id from virtual_instrument.resource_view where type_name = \'storage\');',
                           (1e9*MAX_CAPACITY,1e9*MAX_CAPACITY))

    # keep a list of storage-type resource(ids), so we can create long lasting claims for these.
    storage_resource_ids = set(r['id'] for r in self.radb.getResources(resource_types='storage'))

    now = datetime.utcnow()
    now -= timedelta(minutes=now.minute, seconds=now.second, microseconds=now.microsecond)  # round to full hour

    spec_ids = []

    filename = 'resource_usages_performance%s.csv' % (datetime.utcnow().strftime('%Y%m%dT%H%M%S'),)
    with open(filename, 'w') as file:
        file.write('#tasks, #claims, #claims_per_resource, #inserted_claims, elapsed_insert\n')
        counter = 0

        # it is not common to claim a single resource multiple times for the same task, but it can happen, so test for it.
        for preferred_num_claims_per_resource in [1, 2, 5, 10, 20, 50]:
            # let's test over a feasible range of #claims. A lofar observation usually has ~200 claims.
            for num_claims_to_insert in [1, 2, 5, 10, 20, 50, 100, 200, 500]:
                # clamp to what the available resources can actually hold
                num_claims_to_insert = min(num_claims_to_insert, preferred_num_claims_per_resource * num_resources)
                num_claims_per_resource = min(preferred_num_claims_per_resource, num_claims_to_insert)

                # factor 999 deliberately oversubscribes the resources; factor 1 fits exactly
                for oversubscription_factor in [1, 999]:
                    counter += 1
                    logger.info('*****************************************************************')
                    logger.info('starting task and claim scheduling: counter=%s num_claims_per_resource=%s num_claims_to_insert=%s oversubscription_factor=%s',
                                counter, num_claims_per_resource, num_claims_to_insert, oversubscription_factor)

                    # each task gets its own non-overlapping 1h time slot
                    result = self.radb.insertOrUpdateSpecificationAndTask(counter, counter, 'approved', 'observation',
                                                                          now + timedelta(hours=3 * counter),
                                                                          now + timedelta(hours=3 * counter + 1),
                                                                          'content', 'CEP4')
                    task_id = result['task_id']
                    task = self.radb.getTask(task_id)
                    spec_ids.append(task['specification_id'])

                    # NOTE(review): '/' yields float resource_id's under Python 3 —
                    # presumably '//' (integer division) was intended; confirm.
                    claims = [{'resource_id': q / num_claims_per_resource,
                               'starttime': task['starttime'],
                               'endtime': task['endtime'],
                               'status': 'tentative',
                               'claim_size': oversubscription_factor * MAX_CAPACITY / num_claims_per_resource}
                              for q in range(num_claims_to_insert)]

                    # extend claims on storage resources
                    for claim in claims:
                        if claim['resource_id'] in storage_resource_ids:
                            claim['endtime'] += timedelta(days=100)

                    start = datetime.utcnow()
                    self.radb.insertResourceClaims(task_id, claims, 'foo', 1, 1)
                    elapsed_insert = totalSeconds(datetime.utcnow() - start)

                    num_tasks = self.radb.executeQuery('select count(id) from resource_allocation.task;', fetch=FETCH_ONE)['count']
                    num_claims = self.radb.executeQuery('select count(id) from resource_allocation.resource_claim;', fetch=FETCH_ONE)['count']
                    has_storage_claims = len(self.radb.getResourceClaims(task_ids=task_id, resource_type='storage')) > 0

                    # enforce performance criterion: inserting claims should take less than ELAPSED_TRESHOLD sec
                    self.assertLess(elapsed_insert, ELAPSED_TRESHOLD,
                                    msg="insertResourceClaims took longer than allowed. (%ssec > %ssec) num_tasks=%s num_claims=%s num_claims_to_insert=%s num_claims_per_resource=%s" %
                                        (elapsed_insert, ELAPSED_TRESHOLD, num_tasks, num_claims, num_claims_to_insert, num_claims_per_resource))

                    if oversubscription_factor > 1:
                        # (deliberate) oversubscription of resources
                        # so, expect the claims and task to be in conflict
                        self.assertGreater(len(self.radb.getResourceClaims(task_ids=task_id, status='conflict')), 0)
                        self.assertEqual('conflict', self.radb.getTask(task_id)['status'])

                        # solve oversubscription
                        start = datetime.utcnow()
                        self.radb.updateResourceClaims(where_task_ids=task_id, claim_size=MAX_CAPACITY / num_claims_per_resource)
                        elapsed_status_update = totalSeconds(datetime.utcnow() - start)

                        # enforce performance criterion: updating claims should take less than ELAPSED_TRESHOLD sec
                        self.assertLess(elapsed_status_update, ELAPSED_TRESHOLD,
                                        msg="updateResourceClaims took longer than allowed. (%ssec > %ssec) num_tasks=%s num_claims=%s num_claims_to_insert=%s num_claims_per_resource=%s" %
                                            (elapsed_status_update, ELAPSED_TRESHOLD, num_tasks, num_claims, num_claims_to_insert, num_claims_per_resource))

                        # check if not oversubscribed anymore
                        self.assertEqual(0, len(self.radb.getResourceClaims(task_ids=task_id, status='conflict')))
                        self.assertEqual('approved', self.radb.getTask(task_id)['status'])

                    # no oversubscription (anymore), so expect all claims to be claimable...
                    start = datetime.utcnow()
                    self.radb.updateTaskAndResourceClaims(task_id=task_id, claim_status='claimed')
                    elapsed_status_update = totalSeconds(datetime.utcnow() - start)

                    # are they indeed claimed?
                    self.assertEqual(num_claims_to_insert,
                                     len(self.radb.getResourceClaims(task_ids=task_id, status='claimed')))

                    # enforce performance criterion: updating claims should take less than 2*ELAPSED_TRESHOLD sec (2* because we update both tasks and claims)
                    self.assertLess(elapsed_status_update, 2 * ELAPSED_TRESHOLD,
                                    msg="updateTaskAndResourceClaims took longer than allowed. (%ssec > %ssec) num_tasks=%s num_claims=%s num_claims_to_insert=%s num_claims_per_resource=%s" %
                                        (elapsed_status_update, ELAPSED_TRESHOLD, num_tasks, num_claims, num_claims_to_insert, num_claims_per_resource))

                    # ... and proceed with cycling through the task status
                    for task_status in ['prescheduled', 'scheduled', 'queued', 'active', 'completing', 'finished']:
                        # update the task status
                        start = datetime.utcnow()
                        self.radb.updateTaskAndResourceClaims(task_id=task_id, task_status=task_status)
                        elapsed_status_update = totalSeconds(datetime.utcnow() - start)

                        # enforce performance criterion: updating task status should take less than 2*ELAPSED_TRESHOLD sec (2* because we update both tasks and claims)
                        self.assertLess(elapsed_status_update, 2 * ELAPSED_TRESHOLD,
                                        msg="updateTaskAndResourceClaims took longer than allowed. (%ssec > %ssec) num_tasks=%s num_claims=%s num_claims_to_insert=%s num_claims_per_resource=%s task_status=%s" %
                                            (elapsed_status_update, ELAPSED_TRESHOLD, num_tasks, num_claims, num_claims_to_insert, num_claims_per_resource, task_status))

                        # check task status
                        self.assertEqual(task_status, self.radb.getTask(task_id)['status'])

                    # task should now be finished
                    self.assertEqual('finished', self.radb.getTask(task_id)['status'])

                    # and all non-long-lasting (storage) claims should be removed.
                    self.assertEqual(0, len(list(c for c in self.radb.getResourceClaims(task_ids=task_id)
                                                 if c['endtime'] <= task['endtime'])))

                    if has_storage_claims:
                        # and all long-lasting (storage) claims should still be there.
                        # (they are removed by the cleanupservice ending/removing the storage claims)
                        self.assertGreater(len(list(c for c in self.radb.getResourceClaims(task_ids=task_id)
                                                    if c['endtime'] > task['endtime'])), 0)

                    logger.info('TEST RESULT: radb now contains %d tasks, %d claims, insert of %d claims with %d claims per resource takes on average %.3fsec',
                                num_tasks, num_claims, num_claims_to_insert, num_claims_per_resource, elapsed_insert)
                    file.write('%d, %d, %d, %d, %.3f\n' % (num_tasks, num_claims, num_claims_per_resource, num_claims_to_insert, elapsed_insert))
                    file.flush()

        # teardown: delete every spec (cascading into tasks/claims), timing each delete
        logger.info('removing all test specs/tasks/claims from db')
        delete_elapsed_list = []

        file.write('\n\n#tasks, #claims, elapsed_delete\n')

        for spec_id in spec_ids:
            num_tasks = self.radb.executeQuery('select count(id) from resource_allocation.task;', fetch=FETCH_ONE)['count']
            num_claims = self.radb.executeQuery('select count(id) from resource_allocation.resource_claim;', fetch=FETCH_ONE)['count']

            start = datetime.utcnow()
            self.radb.deleteSpecification(spec_id)
            elapsed = totalSeconds(datetime.utcnow() - start)
            delete_elapsed_list.append(elapsed)

            # enforce performance criterion: (cascading) delete of spec should take less than ELAPSED_TRESHOLD sec
            self.assertLess(elapsed, ELAPSED_TRESHOLD)

            file.write('%d, %d, %.3f\n' % (num_tasks, num_claims, elapsed))
            file.flush()

        logger.info('average spec delete time: %.3f', sum(delete_elapsed_list) / float(len(delete_elapsed_list)))
        logger.info('Done. Results can be found in file: %s', filename)
def main():
    """Run a performance test: insert many file records for generated
    projects/observations into an empty test database, and write per-batch
    insert timings to db_perf.csv."""
    from optparse import OptionParser
    from lofar.common import dbcredentials

    # Check the invocation arguments
    arg_parser = OptionParser("%prog [options]",
                              description='execute a performance test by inserting many files on an empty test database.')
    (options, args) = arg_parser.parse_args()

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.INFO)

    with LTAStorageDbTestInstance() as test_db:
        base_date = datetime.utcnow()

        db = store.LTAStorageDb(test_db.dbcreds)

        # set up one site with a root- and projects-directory to hang files under
        db.insertSiteIfNotExists('sara', 'srm://srm.siteA.nl:8444')
        rootdir_id = db.insertRootDirectory('sara', '/pnfs/grid.siteA.nl/data/lofar/ops')
        projects_dir_id = db.insertSubDirectory('/pnfs/grid.siteA.nl/data/lofar/ops/projects', rootdir_id)

        total_num_files_inserted = 0

        with open('db_perf.csv', 'w') as results_file:
            for cycle_nr in range(1, 10):
                for project_nr in range(1, 10):
                    project_name = 'lc%d_%03d' % (cycle_nr, project_nr)
                    projectdir_id = db.insertSubDirectory('/pnfs/grid.siteA.nl/data/lofar/ops/projects/%s' % (project_name,),
                                                          projects_dir_id)

                    obs_base_id = cycle_nr * 100000 + project_nr * 1000
                    for obs_nr, obsId in enumerate(range(obs_base_id, obs_base_id + 20)):
                        obsName = 'L%s' % obsId
                        obsdir_id = db.insertSubDirectory('/pnfs/grid.siteA.nl/data/lofar/ops/projects/%s/%s' % (project_name, obsName),
                                                          projectdir_id)

                        # two subband files per observation, with deterministic sizes and dates
                        fileinfos = []
                        for sbNr in range(0, 2):
                            fileinfos.append(('%s_SB%3d' % (obsName, sbNr),
                                              1000 + sbNr + project_nr * cycle_nr,
                                              base_date + timedelta(days=10 * cycle_nr + project_nr,
                                                                    minutes=obs_nr,
                                                                    seconds=sbNr),
                                              obsdir_id))

                        # time the bulk insert and emit one csv line per batch
                        insert_start = datetime.utcnow()
                        file_ids = db.insertFileInfos(fileinfos)
                        total_num_files_inserted += len(file_ids)
                        elapsed = totalSeconds(datetime.utcnow() - insert_start)

                        line = '%s,%s' % (total_num_files_inserted, elapsed)
                        print(line)
                        results_file.write(line + '\n')
def enoughResourcesAvailable(self):
    """Return True when this host has enough spare capacity (bandwidth, cpu,
    load, memory, #processes, #parallel jobs) to start a new transfer job,
    False otherwise.

    If the checks themselves fail with an unexpected error, fall back to a
    simple heuristic based on the number of currently running jobs.
    """
    try:
        now = datetime.utcnow()
        bytes_sent = _getBytesSent()

        if bytes_sent >= self.__prev_bytes_sent:  # bytes_sent might wrap around zero
            # compute current outgoing speed in bits-per-second
            # (8 * bytes-delta / seconds-delta)
            speed = 8 * (bytes_sent - self.__prev_bytes_sent) / totalSeconds(now - self.__prev_bytes_sent_timestamp)

            # running average for used_bandwidth
            used_bandwidth = 0.5 * speed + 0.5 * self.__prev_used_bandwidth
            logger.debug("resources: current used_bandwidth = %s", humanreadablesize(used_bandwidth, 'bps'))

            # store for next iteration
            self.__prev_bytes_sent = bytes_sent
            self.__prev_bytes_sent_timestamp = now
            self.__prev_used_bandwidth = used_bandwidth

            # only start new jobs if we have some bandwith available
            # note that this is a 'soft' limit.
            # we cannot control the actual bandwith used by the running transfers
            # we can only not start new jobs if we already exceed the MAX_USED_BANDWITH_TO_START_NEW_JOBS
            if used_bandwidth > MAX_USED_BANDWITH_TO_START_NEW_JOBS:
                logger.warning('resources: not enough bandwith available to start new jobs, using %s, max %s' %
                               (humanreadablesize(used_bandwidth, 'bps'),
                                humanreadablesize(MAX_USED_BANDWITH_TO_START_NEW_JOBS, 'bps')))
                return False
        else:
            # wrapped around 0, just store for next iteration, do not compute anything
            self.__prev_bytes_sent = bytes_sent
            self.__prev_bytes_sent_timestamp = now

        # only start new jobs if we have some cpu time available
        idle_cpu_percentage = psutil.cpu_times_percent().idle
        logger.debug("resources: current idle_cpu_percentage = %s%%", idle_cpu_percentage)
        if idle_cpu_percentage < 5:
            logger.warning('resources: not enough cpu power available to start new jobs, cpu_idle %s%%' % idle_cpu_percentage)
            return False

        # only start new jobs if system load is not too high
        short_load_avg = os.getloadavg()[0]
        cpu_count = psutil.cpu_count()
        allowed_load = 1.5 * cpu_count
        logger.debug("resources: current short term load = %s #cpu's = %s allowed_load = %s",
                     short_load_avg, cpu_count, allowed_load)
        if short_load_avg > allowed_load:
            logger.warning('resources: system load too high (%s > %s), cannot start new jobs' %
                           (short_load_avg, allowed_load))
            return False

        # only allow 1 job at the time if swapping
        swap_memory_percentage = psutil.swap_memory().percent
        logger.debug("resources: current swap_memory_percentage = %s%%", swap_memory_percentage)
        if swap_memory_percentage > 5 and len(self.__running_jobs) > 0:
            logger.warning('resources: system swapping. not enough memory available to start new jobs')
            return False

        # only start new jobs if number of processes is not too high
        try:
            current_user = getpass.getuser()
            current_user_procs = [p for p in psutil.process_iter() if p.username() == current_user]
            current_num_user_procs = len(current_user_procs)
            allowed_num_user_procs = 64 * cpu_count

            logger.debug("resources: current num_user_procs = %s allowed_num_user_procs = %s",
                         current_num_user_procs, allowed_num_user_procs)

            if current_num_user_procs > allowed_num_user_procs:
                logger.warning('resources: number of processes by %s too high (%s > %s), cannot start new jobs' %
                               (current_user, current_num_user_procs, allowed_num_user_procs))
                return False
        except Exception as e:
            # process enumeration is best-effort; do not block new jobs on failure
            logger.exception(e)
            pass

        # limit total number of parallel transferring jobs to self.max_nr_of_parallel_jobs
        with self.__lock:
            # jobs without a 'pipeline' entry are still starting up
            starting_threads = [job_thread_dict['thread'] for job_thread_dict in list(self.__running_jobs.values())
                                if 'pipeline' not in job_thread_dict]
            pipelines = [job_thread_dict['pipeline'] for job_thread_dict in list(self.__running_jobs.values())
                         if 'pipeline' in job_thread_dict]

            initializing_pipelines = [pipeline for pipeline in pipelines
                                      if pipeline.status == IngestPipeline.STATUS_INITIALIZING]
            transferring_pipelines = [pipeline for pipeline in pipelines
                                      if pipeline.status == IngestPipeline.STATUS_TRANSFERRING]
            finalizing_pipelines = [pipeline for pipeline in pipelines
                                    if pipeline.status == IngestPipeline.STATUS_FINALIZING]

            num_busy_transfers = len(starting_threads) + len(initializing_pipelines) + len(transferring_pipelines)
            num_finalizing_transfers = len(finalizing_pipelines)

            logger.debug("resources: current num_busy_transfers = %s num_finalizing_transfers = %s max_nr_of_parallel_jobs = %s",
                         num_busy_transfers, num_finalizing_transfers, self.max_nr_of_parallel_jobs)

            if num_busy_transfers >= self.max_nr_of_parallel_jobs:
                logger.warning('resources: already running %d parallel jobs (#starting=%d, #transferring=%d) limiting the total number of transferring jobs to %d' %
                               (len(self.__running_jobs),
                                len(initializing_pipelines) + len(starting_threads),
                                len(transferring_pipelines),
                                self.max_nr_of_parallel_jobs))
                return False

            if num_finalizing_transfers >= 2 * self.max_nr_of_parallel_jobs:
                logger.warning('resources: already waiting for %d jobs to finish (updating status/SIP to MoM and LTA). not starting new jobs until some jobs finished...' %
                               (len(finalizing_pipelines),))
                return False
    except Exception as e:
        logger.exception("error while checking for available resources: %s", e)

        # fallback heuristic when the checks themselves failed:
        # allow one more job only if few jobs are running
        num_running_jobs = len(self.__running_jobs)
        if num_running_jobs <= 4:
            logger.info("running %d jobs, assuming we can run 1 more: ", num_running_jobs)
            return True
        else:
            logger.warning("already running %d jobs, assuming for safety we cannot run more jobs...",
                           num_running_jobs)
            return False

    # all checks passed
    return True
def transfer(self, force=False, dereference=False): starttime = datetime.utcnow() # for cleanup self.started_procs = {} self.fifos = [] try: if not self.source_exists(): if self.can_logon_to_source_host(): if self.source_mounted(): raise LtacpException( "ltacp %s: source path %s:%s does not exist" % (self.logId, self.src_host, self.src_path)) else: raise LtacpException( "ltacp %s: the disk of source path %s:%s does not seem to be mounted" % (self.logId, self.src_host, self.src_path)) else: raise LtacpException( "ltacp %s: cannot login to %s@%s" % (self.logId, self.src_user, self.src_host)) # determine if input is file input_is_file = self.is_soure_single_file() if not input_is_file: # make sure the file extension is .tar or .tar.gz missing_suffix = "" if self.gzip: if not (self.dst_surl.endswith(".tar.gz") or self.dst_surl.endswith(".tgz")): if self.dst_surl.endswith(".tar"): missing_suffix = ".gz" else: missing_suffix = ".tar.gz" else: if not self.dst_surl.endswith(".tar"): missing_suffix = ".tar" if missing_suffix: self.dst_surl += missing_suffix logger.info( "ltacp %s: appending missing suffix %s to surl: %s", self.logId, missing_suffix, self.dst_surl) dst_turl = convert_surl_to_turl(self.dst_surl) logger.info( 'ltacp %s: initiating transfer of %s:%s to surl=%s turl=%s' % (self.logId, self.src_host, self.src_path, self.dst_surl, dst_turl)) # get input datasize du_items = self.src_path if input_is_file or isinstance( self.src_path, str) else ' '.join(self.src_path) cmd_remote_du = self.ssh_cmd + [ 'du -b %s --max-depth=0 %s' % ("--dereference" if dereference else "", du_items) ] logger.info('ltacp %s: remote getting datasize. 
executing: %s' % (self.logId, ' '.join(cmd_remote_du))) p_remote_du = Popen(cmd_remote_du, stdout=PIPE, stderr=PIPE) self.started_procs[p_remote_du] = cmd_remote_du # block until du is finished output_remote_du = communicate_returning_strings(p_remote_du) del self.started_procs[p_remote_du] if p_remote_du.returncode != 0: raise LtacpException( 'ltacp %s: remote du failed: \nstdout: %s\nstderr: %s' % (self.logId, output_remote_du[0], output_remote_du[1])) # compute various parameters for progress logging if input_is_file: input_datasize = int(output_remote_du[0].split()[0]) else: input_datasize = sum([ int(line.strip().split()[0]) for line in output_remote_du[0].split('\n') if line.strip() ]) logger.info('ltacp %s: input datasize: %d bytes, %s' % (self.logId, input_datasize, humanreadablesize(input_datasize))) estimated_tar_size = 512 * ( input_datasize // 512 ) + 3 * 512 #512byte header, 2*512byte ending, 512byte modulo data logger.info('ltacp %s: estimated_tar_size: %d bytes, %s' % (self.logId, estimated_tar_size, humanreadablesize(estimated_tar_size))) #--- # Server part #--- # we'll randomize ports # to minimize initial collision, randomize based on path and time random.seed(hash(self.logId) ^ hash(time.time())) p_data_in, port_data = self._ncListen('data') self.local_data_fifo = '/tmp/ltacp_datapipe_%s_%s' % ( self.src_host, self.logId) logger.info('ltacp %s: creating data fifo: %s' % (self.logId, self.local_data_fifo)) if os.path.exists(self.local_data_fifo): os.remove(self.local_data_fifo) os.mkfifo(self.local_data_fifo) if not os.path.exists(self.local_data_fifo): raise LtacpException("ltacp %s: Could not create fifo: %s" % (self.logId, self.local_data_fifo)) # transfer incomming data stream via md5a32bc to compute md5, adler32 and byte_count # data is written to fifo, which is then later fed into globus-url-copy # on stdout we can monitor progress # set progress message step 0f 0.5% of estimated_tar_size cmd_md5a32bc = [ 'md5a32bc', '-p', str(min(1000000, 
estimated_tar_size // 200)), self.local_data_fifo ] logger.info( 'ltacp %s: processing data stream for md5, adler32 and byte_count. executing: %s' % ( self.logId, ' '.join(cmd_md5a32bc), )) p_md5a32bc = Popen(cmd_md5a32bc, stdin=p_data_in.stdout, stdout=PIPE, stderr=PIPE, universal_newlines=True) self.started_procs[p_md5a32bc] = cmd_md5a32bc # start copy fifo stream to globus-url-copy guc_options = [ '-cd', #create remote directories if missing '-p 4', #number of parallel ftp connections '-bs 131072', #buffer size '-b', # binary '-nodcau', # turn off data channel authentication for ftp transfers ] cmd_data_out = [ '/bin/bash', '-c', 'globus-url-copy %s file://%s %s' % (' '.join(guc_options), self.local_data_fifo, dst_turl) ] logger.info( 'ltacp %s: copying data stream into globus-url-copy. executing: %s' % (self.logId, ' '.join(cmd_data_out))) p_data_out = Popen(cmd_data_out, stdout=PIPE, stderr=PIPE) self.started_procs[p_data_out] = cmd_data_out # Check if receiver side is set up correctly # and all processes are still waiting for input from client finished_procs = dict( (p, cl) for (p, cl) in list(self.started_procs.items()) if p.poll() is not None) if len(finished_procs): msg = '' for p, cl in list(finished_procs.items()): o, e = communicate_returning_strings(p) msg += " process pid:%d exited prematurely with exit code %d. 
cmdline: %s\nstdout: %s\nstderr: %s\n" % ( p.pid, p.returncode, cl, o, e) raise LtacpException( "ltacp %s: %d local process(es) exited prematurely\n%s" % (self.logId, len(finished_procs), msg)) #--- # Client part #--- # start remote copy on src host: # 1) create fifo # 2) send tar stream of data/dir + tee to fifo for 3) # 3) simultaneously to 2), calculate checksum of fifo stream # 4) break fifo self.remote_data_fifo = '/tmp/ltacp_md5_pipe_%s' % (self.logId, ) #make sure there is no old remote fifo self._removeRemoteFifo() cmd_remote_mkfifo = self.ssh_cmd + [ 'mkfifo %s' % (self.remote_data_fifo, ) ] logger.info('ltacp %s: remote creating fifo. executing: %s' % (self.logId, ' '.join(cmd_remote_mkfifo))) p_remote_mkfifo = Popen(cmd_remote_mkfifo, stdout=PIPE, stderr=PIPE) self.started_procs[p_remote_mkfifo] = cmd_remote_mkfifo # block until fifo is created output_remote_mkfifo = communicate_returning_strings( p_remote_mkfifo) del self.started_procs[p_remote_mkfifo] if p_remote_mkfifo.returncode != 0: raise LtacpException( 'ltacp %s: remote fifo creation failed: \nstdout: %s\nstderr: %s' % (self.logId, output_remote_mkfifo[0], output_remote_mkfifo[1])) with open(os.devnull, 'r') as devnull: # start sending remote data, tee to fifo if input_is_file: cmd_remote_data = self.ssh_cmd + [ 'cat %s %s| tee %s | %s %s %s' % (self.src_path, '| gzip --stdout --fast ' if self.gzip else '', self.remote_data_fifo, self.remoteNetCatCmd, self.localIPAddress, port_data) ] else: if isinstance(self.src_path, str): #src_path is dir src_path_parent, src_path_child = os.path.split( self.src_path) else: #src_path is list if paths dirs = set([os.path.dirname(p) for p in self.src_path]) if len(dirs) > 1: raise LtacpException( 'ltacp %s: cannot combine multiple files from different directories in one tarbal' % self.logId) files = set( [os.path.basename(p) for p in self.src_path]) src_path_parent = list(dirs)[0] src_path_child = ' '.join(files) cmd_remote_data = self.ssh_cmd + [ 'cd %s && tar 
c %s -O %s %s| tee %s | %s %s %s' % (src_path_parent, src_path_child, '--dereference' if dereference else '', '| gzip --stdout --fast ' if self.gzip else '', self.remote_data_fifo, self.remoteNetCatCmd, self.localIPAddress, port_data) ] logger.info( 'ltacp %s: remote starting transfer. executing: %s' % (self.logId, ' '.join(cmd_remote_data))) p_remote_data = Popen(cmd_remote_data, stdin=devnull, stdout=PIPE, stderr=PIPE) self.started_procs[p_remote_data] = cmd_remote_data # start computation of checksum on remote fifo stream cmd_remote_checksum = self.ssh_cmd + [ 'md5sum %s' % (self.remote_data_fifo, ) ] logger.info( 'ltacp %s: remote starting computation of md5 checksum. executing: %s' % (self.logId, ' '.join(cmd_remote_checksum))) p_remote_checksum = Popen(cmd_remote_checksum, stdin=devnull, stdout=PIPE, stderr=PIPE) self.started_procs[p_remote_checksum] = cmd_remote_checksum # waiting for output, comparing checksums, etc. logger.info( 'ltacp %s: transfering... waiting for progress...' % self.logId) transfer_start_time = datetime.utcnow() prev_progress_time = datetime.utcnow() prev_bytes_transfered = 0 with PipeReader(p_md5a32bc.stdout, self.logId) as pipe_reader: # wait and poll for progress while all processes are runnning while len([ p for p in list(self.started_procs.keys()) if p.poll() is not None ]) == 0: try: current_progress_time = datetime.utcnow() elapsed_secs_since_prev = totalSeconds( current_progress_time - prev_progress_time) if elapsed_secs_since_prev > 900: raise LtacpException( 'ltacp %s: transfer stalled for 15min.' % (self.logId)) # read and process md5a32bc stdout lines to create progress messages lines = pipe_reader.readlines(1) nextline = lines[-1].strip() if lines else '' if len(nextline) > 0: try: logger.debug('ltacp %s: transfering... 
%s', self.logId, nextline) total_bytes_transfered = int( nextline.split()[0].strip()) percentage_done = ( 100.0 * float(total_bytes_transfered) ) / float(estimated_tar_size) elapsed_secs_since_start = totalSeconds( current_progress_time - transfer_start_time) if percentage_done > 0 and elapsed_secs_since_start > 0 and elapsed_secs_since_prev > 0: avg_speed = total_bytes_transfered / elapsed_secs_since_start current_bytes_transfered = total_bytes_transfered - prev_bytes_transfered current_speed = current_bytes_transfered / elapsed_secs_since_prev if elapsed_secs_since_prev > 120 or current_bytes_transfered > 0.333 * estimated_tar_size: prev_progress_time = current_progress_time prev_bytes_transfered = total_bytes_transfered percentage_to_go = 100.0 - percentage_done time_to_go = elapsed_secs_since_start * percentage_to_go / percentage_done try: if self.progress_callback: self.progress_callback( percentage_done= percentage_done, current_speed= current_speed, total_bytes_transfered= total_bytes_transfered) except Exception as e: logger.error(e) logger.info( 'ltacp %s: transfered %s %.1f%% in %s at avgSpeed=%s (%s) curSpeed=%s (%s) to_go=%s to %s' % (self.logId, humanreadablesize( total_bytes_transfered), percentage_done, timedelta(seconds=int( round( elapsed_secs_since_start ))), humanreadablesize( avg_speed, 'Bps'), humanreadablesize( avg_speed * 8, 'bps'), humanreadablesize( current_speed, 'Bps'), humanreadablesize( current_speed * 8, 'bps'), timedelta(seconds=int( round(time_to_go))), dst_turl)) except Exception as e: msg = 'ltacp %s: error while parsing md5a32bc loglines: error=%s. 
line=%s' % ( self.logId, e, nextline) logger.error(msg) self.cleanup() raise LtacpException(msg) time.sleep(0.05) except KeyboardInterrupt: self.cleanup() except LtacpException as e: logger.error('ltacp %s: %s' % (self.logId, str(e))) self.cleanup() raise except Exception as e: logger.error('ltacp %s: %s' % (self.logId, str(e))) def waitForSubprocess(proc, timeout=timedelta(seconds=60), proc_log_name='', loglevel=logging.DEBUG): logger.log( loglevel, 'ltacp %s: waiting at most %s for %s to finish...', self.logId, timeout, proc_log_name) start_wait = datetime.now() while datetime.now() - start_wait < timeout: if proc.poll() is not None: break time.sleep(1) if proc.poll() is None: raise LtacpException( 'ltacp %s: %s did not finish within %s.' % (self.logId, proc_log_name, timeout)) waitForSubprocess(p_data_out, timedelta(seconds=self.globus_timeout), 'globus-url-copy', logging.INFO) output_data_out = communicate_returning_strings(p_data_out) if p_data_out.returncode != 0: if 'file exist' in output_data_out[1].lower(): raise LtacpDestinationExistsException( 'ltacp %s: data transfer via globus-url-copy to LTA failed, file already exists. turl=%s.' % (self.logId, dst_turl)) raise LtacpException( 'ltacp %s: transfer via globus-url-copy to LTA failed. turl=%s error=%s' % (self.logId, dst_turl, output_data_out[0].strip() + output_data_out[1].strip())) logger.info( 'ltacp %s: data transfer via globus-url-copy to LTA complete.' % self.logId) waitForSubprocess(p_remote_data, timedelta(seconds=60), 'remote data transfer') output_remote_data = communicate_returning_strings( p_remote_data) if p_remote_data.returncode != 0: raise LtacpException( 'ltacp %s: Error in remote data transfer: %s' % (self.logId, output_remote_data[1])) logger.debug('ltacp %s: remote data transfer finished...' 
% self.logId) waitForSubprocess(p_remote_checksum, timedelta(seconds=60), 'remote md5 checksum computation') output_remote_checksum = communicate_returning_strings( p_remote_checksum) if p_remote_checksum.returncode != 0: raise LtacpException( 'ltacp %s: Error in remote md5 checksum computation: %s' % (self.logId, output_remote_checksum[1])) logger.debug( 'ltacp %s: remote md5 checksum computation finished.' % self.logId) waitForSubprocess( p_md5a32bc, timedelta(seconds=60), 'local computation of md5 adler32 and byte_count') output_md5a32bc_local = communicate_returning_strings( p_md5a32bc) if p_md5a32bc.returncode != 0: raise LtacpException( 'ltacp %s: Error while computing md5 adler32 and byte_count: %s' % (self.logId, output_md5a32bc_local[1])) logger.debug( 'ltacp %s: computed local md5 adler32 and byte_count.' % self.logId) # process remote md5 checksums try: md5_checksum_remote = output_remote_checksum[0].split( ' ')[0] except Exception as e: logger.error( 'ltacp %s: error while parsing remote md5: %s\n%s' % (self.logId, output_remote_checksum[0], output_remote_checksum[1])) raise # process local md5 adler32 and byte_count try: items = output_md5a32bc_local[1].splitlines()[-1].split( ' ') md5_checksum_local = items[0].strip() a32_checksum_local = items[1].strip().zfill(8) byte_count = int(items[2].strip()) except Exception as e: logger.error( 'ltacp %s: error while parsing md5 adler32 and byte_count outputs: %s' % (self.logId, output_md5a32bc_local[0])) raise # check transfered number of bytes if (byte_count == 0 and input_datasize > 0): raise LtacpException( 'ltacp %s: did not transfer any bytes of the expected %s. Something is wrong in the datastream setup.' 
% (self.logId, humanreadablesize(estimated_tar_size))) logger.info( 'ltacp %s: byte count of datastream is %d %s' % (self.logId, byte_count, humanreadablesize(byte_count))) # compare local and remote md5 if (md5_checksum_remote != md5_checksum_local): raise LtacpException( 'md5 checksum reported by client (%s) does not match local checksum of incoming data stream (%s)' % (self.logId, md5_checksum_remote, md5_checksum_local)) logger.info( 'ltacp %s: remote and local md5 checksums are equal: %s' % ( self.logId, md5_checksum_local, )) logger.info('ltacp %s: fetching adler32 checksum from LTA...' % self.logId) srm_ok, srm_file_size, srm_a32_checksum = get_srm_size_and_a32_checksum( self.dst_surl, 'ltacp %s:' % self.logId) if not srm_ok: raise LtacpException( 'ltacp %s: Could not get srm adler32 checksum for: %s' % (self.logId, self.dst_surl)) if srm_a32_checksum != a32_checksum_local: raise LtacpException( 'ltacp %s: adler32 checksum reported by srm (%s) does not match original data checksum (%s)' % (self.logId, srm_a32_checksum, a32_checksum_local)) logger.info('ltacp %s: adler32 checksums are equal: %s' % ( self.logId, a32_checksum_local, )) if int(srm_file_size) != int(byte_count): raise LtacpException( 'ltacp %s: file size reported by srm (%s) does not match datastream byte count (%s)' % (self.logId, srm_file_size, byte_count)) logger.info( 'ltacp %s: srm file size and datastream byte count are equal: %s bytes (%s)' % (self.logId, srm_file_size, humanreadablesize(srm_file_size))) logger.info('ltacp %s: transfer to LTA completed successfully.' 
% (self.logId)) except LtacpDestinationExistsException as e: logger.log(logging.WARN if force else logging.ERROR, str(e)) if force: self.cleanup() srmrm(self.dst_surl, 'ltacp %s ' % self.logId) return self.transfer(force=False) else: # re-raise the exception to the caller raise except Exception as e: # Something went wrong logger.error('ltacp %s: Error in transfer: %s' % (self.logId, str(e))) # re-raise the exception to the caller raise finally: # cleanup self.cleanup() total_time = round( 10 * totalSeconds(datetime.utcnow() - starttime)) / 10 logger.info( 'ltacp %s: successfully completed transfer of %s:%s to %s in %ssec for %s at avg speed of %s or %s', self.logId, self.src_host, self.src_path, self.dst_surl, total_time, humanreadablesize(int(srm_file_size)), humanreadablesize(int(srm_file_size) / total_time, 'Bps'), humanreadablesize(8 * int(srm_file_size) / total_time, 'bps')) return (md5_checksum_local, a32_checksum_local, str(byte_count))