def ingest(self): debug_print("Ingesting directory {}".format(self.directory)) debug_print("Ingesting the files \n{}".format(self.files)) is_lambda = self.context[c.KEY_LAMBDA_FUNCTION] is not None timeout = self.__calculate_aggregate_window_timeout(self.context[c.KEY_MAX_LAMBDA_TIME]) target_excretion_size = self.context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB] compression_ratio = self.context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO] sep = self.context[c.KEY_SEPERATOR_PARTITION] memory_trigger = self.context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] memory_used = mutil.get_memory_usage() main_filename, main_file_data, main_file_size_mb = self.__get_main_aggregate_file(self.directory, sep, target_excretion_size) main_file_data = self.__append(None, main_file_data) keys_ingested = [] for file in self.files: debug_print("\tProcessing file {}".format(file)) key_parts = KeyParts(file, sep) duration = datetime.datetime.utcnow() - key_parts.filename_timestamp if duration.total_seconds() < 300: debug_print("The file '{}' is {}s old. It is too new and will be processed later to allow for S3 propagation.".format(file, duration.total_seconds())) continue keys_ingested.append(file) data = self.__open(file, main_file_data) if data is None: continue size_in_megabytes = self.__size(file) main_file_data = self.__append(main_file_data, data) del data gc.collect() current_dataframe_size = sys.getsizeof(main_file_data) #break conditions #1. Memory limit exceeded #2. Time window exceeded #3. Target excretion size hit main_file_size_mb += size_in_megabytes memory_used = mutil.get_memory_usage() debug_print("\t\tSize on S3: {}MB Size of new dataset: {}bytes Estimated Compression Ratio: {} Memory Used: {}% Project Compression Size {}MB Target Excretion Size {}MB".format(size_in_megabytes, current_dataframe_size, compression_ratio, memory_used, main_file_size_mb, target_excretion_size)) if util.elapsed(self.context) > timeout or memory_used > memory_trigger or main_file_size_mb > target_excretion_size : print "Elapsed", util.elapsed(self.context), "Start:", self.starttime, "Timeout:", timeout, "Has timed out:", util.elapsed(self.context) > timeout, "Mem Used %:", memory_used, "Max Memory %:", memory_trigger break #only save the files if we have a reasonable amount of time remaining before the lambda timeout. debug_print("Time remaining: {}s".format(util.time_remaining(self.context))) debug_print("There were {} keys ingested. The keys ingested are: \n {}".format(len(keys_ingested), keys_ingested)) if len(keys_ingested)>0 and util.time_remaining(self.context) > c.SAVE_WINDOW_IN_SECONDS and not main_file_data.empty: main_file_data = self.__convert_to_submission_df(main_file_data) gc.collect() self.__excret(self.directory, main_filename, main_file_data, sep) self.__delete_keys(keys_ingested) elif util.time_remaining(self.context) <= c.SAVE_WINDOW_IN_SECONDS: print "Time has run out! We have less than {} seconds remaining before this lambda times out. Abandoning the S3 commit to avoid file corruption.".format(c.SAVE_WINDOW_IN_SECONDS) print "Aggregation window (Max Lambda Execution Time * {}): {} seconds".format(c.RATIO_OF_MAX_LAMBDA_TIME, timeout) print "S3 Save window: {} seconds".format(c.SAVE_WINDOW_IN_SECONDS) print "Lambda time remaining: {} seconds".format(util.time_remaining(self.context)) remaining_files = list(set(self.files) - set(keys_ingested)) if len(remaining_files) > 0: debug_print("Re-adding the {} paths to SQS to attempt again. The paths are \n{}".format(len(remaining_files), remaining_files)) self.__add_to_sqs(remaining_files) print "I've consumed everything I can in bucket '{}'".format(self.directory) return
def flush_and_delete(context, metric_sets): if len(metric_sets)>0: print "[{}]Time remaining {} seconds".format(context[c.KEY_REQUEST_ID], util.time_remaining(context)) print "[{}]Saving aggregated metrics to S3 bucket '{}'".format(context[c.KEY_REQUEST_ID], os.environ[c.RES_S3_STORAGE]) context[c.CW_ATTR_SAVE_DURATION] = save_duration = flush(context, metric_sets) #delete only messages that have succeeded across all partition saves #message payloads can span multiple partitions/schemas #only delete messages that succeeded on all partition/schema submissions #this could lead to duplication of data if only part of a message is processed successfully. #this is better than deleting the message entirely leaving missing data. context[c.KEY_THREAD_POOL].wait() print "[{}]Time remaining {} seconds".format(context[c.KEY_REQUEST_ID], util.time_remaining(context)) print "[{}]Deleting messages from SQS queue '{}'".format(context[c.KEY_REQUEST_ID],context[c.KEY_SQS].queue_url) delete_duration = context[c.KEY_SQS].delete_message_batch(context[c.KEY_SUCCEEDED_MSG_IDS]) write_cloudwatch_metrics(context, save_duration, delete_duration)
def __break_retry(self, backoff, current_attempt_number): time_remaining = util.time_remaining(self.__context) print "The lambda has {} seconds remaining. It started at {}.".format(time_remaining, datetime.datetime.fromtimestamp(self.__context[c.KEY_START_TIME]).strftime('%Y-%m-%d %H:%M:%S')) if time_remaining - backoff < context[c.CW_ATTR_SAVE_DURATION] or current_attempt_number >= self.__context[c.KEY_BACKOFF_MAX_TRYS] : print "Breaking the lock attempt as the lambda is nearing timeout or maximum retry limit is hit. The lambda has {} seconds remaining. There have been {} attempts made.".format(time_remaining,current_attempt_number) return True; return False
def save(context, metric_sets, partition, schema_hash): util.debug_print("\t{}:".format(partition)) for schema_hash, dict in metric_sets[partition].iteritems(): if util.time_remaining(context) <= ( context[c.CW_ATTR_DELETE_DURATION] + 20): break columns = dict[c.KEY_SET_COLUMNS] if len(dict[c.KEY_SET]) == 0: continue values = dict[c.KEY_SET].values() set = pd.DataFrame(values, columns=columns) util.debug_print("\t\t{}:".format(schema_hash)) path = create_file_path(partition, schema_hash, context[c.KEY_SEPERATOR_PARTITION]) util.debug_print("Writing to path '{}' with set:\n {}".format( path, set)) elapsed = 0 try: util.debug_print("Writing to partition '{}'".format(partition)) write(context[c.KEY_METRIC_BUCKET], path, set, context[c.KEY_SEPERATOR_PARTITION], schema.object_encoding(columns)) context[c.KEY_SUCCEEDED_MSG_IDS] += dict[c.KEY_MSG_IDS] except Exception as e: print "[{}]An error occured writing to path '{}'.\nSet: {}\nError: \n{}".format( context[c.KEY_REQUEST_ID], path, set, traceback.format_exc()) raise e finally: number_of_rows = len(values) if c.INFO_TOTAL_ROWS not in context[c.KEY_AGGREGATOR].info: context[c.KEY_AGGREGATOR].info[c.INFO_TOTAL_ROWS] = 0 context[c.KEY_AGGREGATOR].info[c.INFO_TOTAL_ROWS] += number_of_rows del set del columns
def __proto_http(self, scheme, address, port, timeout=None): url = scheme + "://" + address + ":" + str(port) + "/" request = urllib2.Request(url=url, headers=self._http_headers) result = None response = None t1 = time.time() try: # response is a urllib.addinfourl object response = self._proxy_url.open_url(request, timeout=timeout) except HTTPError as http_err: # HTTPError implies a response from an HTTP server # (but indicating something other than success), and # since it is also a urllib.addinfourl object, it can be # handled just like a normal response. response = http_err except Exception as err: logger.debug("%s failed: %s" % (url, err)) if response: try: status_code = response.getcode() logger.debug("%s returned HTTP status code %s" % (url, status_code)) result = True remaining = util.time_remaining(t1, timeout) self._process_response(response, scheme, address, port, remaining, self._args.download_limit) finally: response.close() return result
def save(context, metric_sets, partition, schema_hash): util.debug_print("\t{}:".format(partition)) paths = [] for schema_hash, dict in iteritems(metric_sets[partition]): if util.time_remaining(context) <= ( context[c.CW_ATTR_DELETE_DURATION] + 20): break columns = dict[c.KEY_SET_COLUMNS] if len(dict[c.KEY_SET]) == 0: continue values = dict[c.KEY_SET].values() set = pd.DataFrame(values, columns=columns) util.debug_print("\t\t{}:".format(schema_hash)) path = create_file_path(partition, schema_hash, context[c.KEY_SEPERATOR_PARTITION]) paths.append(path) util.debug_print("Writing to path '{}' with set:\n {}".format( path, set)) elapsed = 0 try: util.debug_print("Writing to partition '{}'".format(partition)) write(context[c.KEY_METRIC_BUCKET], path, set, context[c.KEY_SEPERATOR_PARTITION], schema.object_encoding(columns)) handoff_event_to_emitter(context, context[c.KEY_METRIC_BUCKET], path, set) context[c.KEY_SUCCEEDED_MSG_IDS] += dict[c.KEY_MSG_IDS] util.debug_print("Save complete to path '{}'".format(path)) except Exception as e: print( "[{}]An error occured writing to path '{}'.\nSet: {}\nError: \n{}" .format(context[c.KEY_REQUEST_ID], path, set, traceback.format_exc())) raise e finally: number_of_rows = len(values) if c.INFO_TOTAL_ROWS not in context[c.KEY_AGGREGATOR].info: context[c.KEY_AGGREGATOR].info[c.INFO_TOTAL_ROWS] = 0 context[c.KEY_AGGREGATOR].info[c.INFO_TOTAL_ROWS] += number_of_rows del set del values del columns gc.collect() try: util.debug_print("Adding amoeba message to SQS queue '{}'".format( context[c.KEY_SQS_AMOEBA].queue_url)) context[c.KEY_SQS_AMOEBA].send_generic_message( json.dumps({"paths": paths})) except Exception as e: print( "[{}]An error occured writing messages to the Amoeba SQS queue. \nError: \n{}" .format(context[c.KEY_REQUEST_ID], traceback.format_exc())) raise e
def time_remaining(self): remaining = None self.acquire() try: remaining = util.time_remaining(self._start_time, self._timeout) if remaining: logger.debug("Probe of %s will time out in %.2f seconds" % (self._hostname, remaining)) finally: self.release() return remaining
def process(context): print mutil.get_memory_object() write_initial_stats(context) process_bytes = mutil.get_process_memory_usage_bytes() if c.KEY_SQS_QUEUE_URL not in context or context[ c.KEY_SQS_QUEUE_URL] is None: context[c.KEY_SQS].set_queue_url(lowest_load_queue=False) #execute at least once messages_to_process = None inflight_messages = 0 elapsed = 0 metric_sets = dict({}) context[c.KEY_AGGREGATOR] = Aggregator(context, metric_sets) messages = [] last_queue_size_check = context[c.KEY_FREQUENCY_TO_CHECK_TO_SPAWN_ANOTHER] growth_rate = last_check = 0 last_message_count = None timeout = calculate_aggregate_window_timeout(context) value = datetime.datetime.fromtimestamp(context[c.KEY_START_TIME]) message_processing_time = 0 print "[{}]Using SQS queue URL '{}'".format(context[c.KEY_REQUEST_ID], context[c.KEY_SQS].queue_url) print "[{}]Started the consumer at {}. The aggregation window is {} seconds.".format( context[c.KEY_REQUEST_ID], value.strftime('%Y-%m-%d %H:%M:%S'), timeout) while elapsed < timeout: if elapsed > last_check: last_check = elapsed + context[c.KEY_FREQUENCY_TO_CHECK_SQS_STATE] response = context[c.KEY_SQS].get_queue_attributes() inflight_messages = int(response['Attributes'] ['ApproximateNumberOfMessagesNotVisible']) messages_to_process = int( response['Attributes']['ApproximateNumberOfMessages']) if last_message_count is None: last_message_count = messages_to_process else: growth_rate = last_message_count if last_message_count == 0 else float( messages_to_process - last_message_count) / last_message_count last_message_count = messages_to_process grow_if_threshold_hit( context, growth_rate, context[c.KEY_GROWTH_RATE_BEFORE_ADDING_LAMBDAS]) #if the queue is growing slowly and is above 30,000 messages launch a new consumer if elapsed > last_queue_size_check: last_queue_size_check = elapsed + context[ c.KEY_FREQUENCY_TO_CHECK_TO_SPAWN_ANOTHER] print "[{}]\nThere are approximately {} messages that require processing.\n" \ "There are {} in-flight messages.\n" \ "{} seconds have elapsed and there is {} seconds remaining before timeout.\n" \ "The queue growth rate is {}\n" \ "{} message(s) were processed.".format(context[c.KEY_REQUEST_ID], messages_to_process,inflight_messages,round(elapsed,2),util.time_remaining(context),growth_rate,len(messages)) if messages_to_process > context[ c. KEY_THRESHOLD_BEFORE_SPAWN_NEW_CONSUMER] and inflight_messages <= context[ c.KEY_MAX_INFLIGHT_MESSAGES]: print "The queue size is greater than {}. Launching another consumer.".format( context[c.KEY_THRESHOLD_BEFORE_SPAWN_NEW_CONSUMER]) add_consumer(context) if last_message_count == 0: print "[{}]No more messages to process.".format( context[c.KEY_REQUEST_ID]) break messages = context[c.KEY_SQS].read_queue() if len(messages) > 0: start = time.time() context[c.KEY_AGGREGATOR].append_default_metrics_and_partition( messages) message_processing_time = round( ((time.time() - start) + message_processing_time) / 2, 4) else: if len(metric_sets) > 1: print "[{}]No more messages to process.".format( context[c.KEY_REQUEST_ID]) break else: print "[{}]No metric sets to process. Exiting.".format( context[c.KEY_REQUEST_ID]) return #start throttling the message processing when the SQS inflight messages is at 80% (16,000) #one queue is only allowed to have 20,000 maximum messages being processed (in-flight) usage = mutil.get_memory_usage() if inflight_messages > 16000: print "[{}]Stopping aggregation. There are too many messages in flight. Currently there are {} messages in flight.".format( context[c.KEY_REQUEST_ID], inflight_messages) break if usage > context[c.KEY_MEMORY_FLUSH_TRIGGER]: print "[{}]Stopping aggregation. Memory safe level threshold exceeded. The lambda is currently at {}%.".format( context[c.KEY_REQUEST_ID], usage) break if util.elapsed(context) + message_processing_time > timeout: print "[{}]Stopping aggregation. The elapsed time and the projected message processing time exceeds the timeout window. Messages are taking {} seconds to process. There is {} seconds left before time out and {} seconds for aggregation.".format( context[c.KEY_REQUEST_ID], message_processing_time, util.time_remaining(context), timeout) break elapsed = util.elapsed(context) util.debug_print( "[{}]Lambda has completed the agreggation phase. Elapsed time was {} seconds and we have {} seconds remaining. There are {} in-flight messages and {} remaining messages to process." .format(context[c.KEY_REQUEST_ID], elapsed, util.time_remaining(context), inflight_messages, messages_to_process)) context[c.KEY_THREAD_POOL].wait() bytes_consumed = mutil.get_process_memory_usage_bytes() memory_usage = str(mutil.get_memory_usage()) print mutil.get_memory_object() tables = metric_sets[c.KEY_TABLES] del metric_sets[c.KEY_TABLES] flush_and_delete(context, metric_sets) context[c.KEY_THREAD_POOL].wait() update_glue_crawler_datastores(context, tables) print mutil.get_memory_object() print "[{}]Elapsed time {} seconds. ".format(context[c.KEY_REQUEST_ID], util.elapsed(context)) print "[{}]Message processing averaged {} seconds per message. ".format( context[c.KEY_REQUEST_ID], message_processing_time) print "[{}]The process consumed {} KB of memory.".format( context[c.KEY_REQUEST_ID], bytes_consumed / 1024) print '[{}]The memory utilization was at {}%.'.format( context[c.KEY_REQUEST_ID], memory_usage) print '[{}]The process used {} KB for converting messages to parquet format.'.format( context[c.KEY_REQUEST_ID], (bytes_consumed - process_bytes) / 1024) print "[{}]The save process took {} seconds.".format( context[c.KEY_REQUEST_ID], context[c.CW_ATTR_SAVE_DURATION]) print "[{}]Processed {} uncompressed bytes.".format( context[c.KEY_REQUEST_ID], context[c.KEY_AGGREGATOR].bytes_uncompressed) print "[{}]Processed {} metrics. ".format(context[c.KEY_REQUEST_ID], context[c.KEY_AGGREGATOR].rows) print "[{}]Processed {} messages. ".format( context[c.KEY_REQUEST_ID], context[c.KEY_AGGREGATOR].messages) print "[{}]Average metrics per minute {}. ".format( context[c.KEY_REQUEST_ID], round( context[c.KEY_AGGREGATOR].rows / util.elasped_time_in_min(context), 2)) print "[{}]Average messages per minute {}. ".format( context[c.KEY_REQUEST_ID], round( context[c.KEY_AGGREGATOR].messages / util.elasped_time_in_min(context), 2)) print "[{}]Average uncompressed bytes per minute {}. ".format( context[c.KEY_REQUEST_ID], round( context[c.KEY_AGGREGATOR].bytes_uncompressed / util.elasped_time_in_min(context), 2)) print "[{}]There are approximately {} messages that require processing.".format( context[c.KEY_REQUEST_ID], messages_to_process if messages_to_process else 0) print "[{}]There are {} in-flight messages.".format( context[c.KEY_REQUEST_ID], inflight_messages) print "[{}]There was {} seconds remaining before timeout. ".format( context[c.KEY_REQUEST_ID], util.time_remaining(context)) del tables del metric_sets gc.collect()
def ingest(event, lambdacontext): starttime = time.time() gc.collect() root = event.get("root", None) print "Initial memory size:", mutil.get_memory_object() print "Started amoeba with root {}".format(root) context = event.get("context", {}) context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr( lambdacontext, 'function_name') else None context[c.KEY_START_TIME] = starttime is_lambda = context[c.KEY_LAMBDA_FUNCTION] is not None bucket = os.environ[c.ENV_S3_STORAGE] crawler = Crawler(context, bucket) roots = crawler.crawl_from_relative(root) s3_fs = s3fs.S3FileSystem() s3 = S3(context, bucket) timeout = calculate_aggregate_window_timeout( context[c.KEY_MAX_LAMBDA_TIME]) target_excretion_size = context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB] compression_ratio = context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO] sep = context[c.KEY_SEPERATOR_PARTITION] memory_used = mutil.get_memory_usage() projected_compressed_file_size_in_mb = 0 print "Hunting for {} seconds in bucket '{}'".format(timeout, bucket) for path in roots: #The GLUE Crawler does not work well when a single key in S3 contains varying data schemas. files = roots[path] if len(files) == 1: continue debug_print("\t\tIngesting path {}".format(path)) df = {} keys_ingested = [] data = None for file in files: debug_print("\t\t\t{}".format(file)) key = "{}/{}".format(path, file) try: size_in_megabytes = s3.size_in_megabytes(key) except ClientError as e: if str(e.response['Error']['Code']) == '404': continue else: print "Error: ", e.response['Error']['Code'], key raise e if size_in_megabytes > target_excretion_size: debug_print( "Skipping file '{}'. It has reached the targetted file size" .format(key)) continue size_in_bytes = size_in_megabytes * 1024 * 1024 try: data = reader.read(s3_fs, bucket, key) keys_ingested.append(key) except ClientError as e: print e.response['Error']['Code'], "key=>", key #handle corrupt files, this can happen if a write did not finish correctly if e.message == "Seek before start of file": print "Deleting corrupt file %s", key s3.delete([key]) elif e.response['Error']['Code'] == 'NoSuchKey': print '{}: for key {}'.format(e.response['Error']['Code'], key) else: util.logger.error(e) continue for row in data.itertuples(index=True): row = row.__dict__ del row['Index'] key_parts = KeyParts(key, sep) uuid_key = "{}{}{}".format(row[c.PRIMARY_KEY], key_parts.event, row[c.TERTIARY_KEY]) df_size = len(row) debug_print( "\t\t\tSize on S3 in MB: {} Size as Dataframe: {} Ratio: {}" .format(size_in_megabytes, df_size, compression_ratio)) #a dictionary is the fastest way to create a unique set. if uuid_key in df: debug_print( "\t\t\tFound duplication in key '{}'. Keeping the first occurrence." .format(key)) else: df[uuid_key] = row current_dataframe_size = len(df) #break conditions #1. Memory limit exceeded #2. Time window exceeded #3. Target excretion size hit projected_compressed_file_size_in_mb = ( compression_ratio * current_dataframe_size) / 1048576.0 memory_used = mutil.get_memory_usage() debug_print( "\t\t\t{} seconds have elapsed. {} kilobytes of memory have been used. The projected compressed file size is {} MB. We are targetting an excretion file size of {} MB." .format(util.elapsed(context), memory_used / 1024, projected_compressed_file_size_in_mb, target_excretion_size)) if util.elapsed(context) > timeout or memory_used > context[ c. KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] or projected_compressed_file_size_in_mb > target_excretion_size: break if util.elapsed(context) > timeout or memory_used > context[ c. KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] or projected_compressed_file_size_in_mb > target_excretion_size: print "Elapsed", util.elapsed(context), "Start:", context[ c. KEY_START_TIME], "Timeout:", timeout, "Has timed out:", util.elapsed( context ) > timeout, "Mem Used:", memory_used, "Max Memory %:", context[ c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] break if len(keys_ingested) > 0 and util.time_remaining(context) > 45: values = df.values() #since the schema hash of the event columns are all the same we can infer the dict[0].keys has the same column headers as the rest columns = values[0].keys() set = pd.DataFrame(values, columns=columns) excret(s3, bucket, path, set, keys_ingested, sep, m_schema.object_encoding(columns)) del set elif util.time_remaining(context) <= 45: return del data del df del keys_ingested if util.elapsed( context) > timeout or mutil.get_process_memory_usage_bytes( ) >= c.ONE_GB_IN_BYTES: print "\tThe elapsed time threshold of {} seconds has been hit or the memory threshold of {} megabytes has been hit. Time: {}s, Memory: {}MB".format( timeout, c.ONE_GB_IN_BYTES / 1048576.0, util.elapsed(context), mutil.get_process_memory_usage_megabytes()) return print "I've consumed everything I can in bucket '{}'".format(bucket) return
def _process_response(self, infile, proto, address, port, timeout=None, dl_limit=sys.maxint): download_dir = self._args.download_dir try: util.mkdir_p(download_dir) except Exception as e: logger.error("Failed to create download directory %s: %s" %s (download_dir, err)) return None host = address[0:-hs_sfx_len] if address.endswith(hs_sfx) else address outfilename = "%s/%s_%s_%s" % (download_dir, host, port, proto) try: outfile = open(outfilename, 'w') except Exception as err: logger.error("Failed to open %s for output: %s" % (outfilename, err)) return None try: http_code = infile.getcode() headers = infile.info() except: http_code = 0 headers = {} if http_code > 0: http_code_name = 'Unknown' if httplib.responses.has_key(http_code): http_code_name = httplib.responses[http_code] outfile.write("%d %s\n" % (http_code, http_code_name)) for key in headers.keys(): outfile.write("%s: %s\n" % (key, headers[key])) outfile.write("\n") decompressor = None if headers.has_key('content-encoding'): encoding = headers['content-encoding'].lower() if encoding in ['gzip', 'deflate', 'zlib']: decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS) max_read = min(1024, dl_limit) total_len = 0 remaining = timeout rin = [infile] win = [] xin = [] try: try: while total_len < dl_limit: # Workaround for closed socket try: infile.fileno() except: break t1 = time.time() rout, wout, xout = select.select(rin, win, xin, remaining) if infile not in rout: logger.warn("Read timeout on %s:%d (proto %s)" % (address, port, proto)) break data = infile.read(max_read) data_len = len(data) if data_len <= 0: break if decompressor: ddata = decompressor.decompress(data) outfile.write(ddata) total_len += len(ddata) else: outfile.write(data) total_len += data_len remaining = util.time_remaining(t1, remaining) finally: outfile.close() except Exception as err: logger.error("Error while downloading to %s: %s" % (outfilename, err)) return None return total_len