def main(self):
    """Copy the trailing messages of an input shard into an output shard.

    Command line: ``--n=number_to_print input_shard output_shard``.  The
    usage text is printed and the process exits unless exactly two
    positional arguments remain after option parsing.
    """
    usage_template = "%s --n=number_to_print input_shard output_shard"
    flag_specs = [("n=", "The number of protocol buffers to save.", True)]

    parsed_flags, positional = parseCommandLine(flag_specs, command_line=usage_template)
    if len(positional) != 2:
        print(usage(sys.argv, usage_template, flag_specs))
        sys.exit()
    source_path, dest_path = positional[0], positional[1]

    # The total is needed up front to know where the tail begins.
    total = countProtosInFile(source_path)
    reader = ProtocolBufferFileReader(None, filename=source_path, return_byte_string_only=True)
    writer = ProtocolBufferFileWriter(filename=dest_path, messages_are_byte_strings=True)

    keep = int(parsed_flags["n"])
    # Zero-based index of the first message to copy; clamped at zero so that
    # asking for more messages than the file holds copies all of them.
    first_kept = max(total - keep, 0)

    for position, message in enumerate(reader):
        if position >= first_kept:
            writer.write(message)

    writer.close()
    reader.close()
def main(self):
    """Copy the leading messages of an input shard into an output shard.

    Command line: ``--n=number_to_print input_shard output_shard``.  The
    usage text is printed and the process exits unless exactly two
    positional arguments remain after option parsing.
    """
    usage_template = '%s --n=number_to_print input_shard output_shard'
    flag_specs = [('n=', 'The number of protocol buffers to save.', True)]

    parsed_flags, positional = parseCommandLine(flag_specs, command_line=usage_template)
    if len(positional) != 2:
        print(usage(sys.argv, usage_template, flag_specs))
        sys.exit()
    source_path, dest_path = positional[0], positional[1]

    reader = ProtocolBufferFileReader(None, filename=source_path, return_byte_string_only=True)
    writer = ProtocolBufferFileWriter(filename=dest_path, messages_are_byte_strings=True)

    limit = int(parsed_flags['n'])
    for position, message in enumerate(reader):
        # Stop as soon as the requested number of messages has been copied.
        if position >= limit:
            break
        writer.write(message)

    writer.close()
    reader.close()
def run(self):
    """Write a random subset of an input shard's records to an output shard.

    Command line: ``--num_records=n input_shard output_shard``.  The usage
    text is printed and the process exits unless exactly two positional
    arguments remain after option parsing.  Selected records are written in
    the order in which they appear in the input.
    """
    usage_template = '%s --num_records=n input_shard output_shard'
    flag_specs = [('num_records=', 'Number of records to select.', True)]

    parsed_flags, positional = parseCommandLine(flag_specs, command_line=usage_template)
    if len(positional) != 2:
        print(usage(sys.argv, usage_template, flag_specs))
        sys.exit()

    sample_size = int(parsed_flags['num_records'])
    source_path, dest_path = positional[0], positional[1]

    population = countProtosInFile(source_path)
    print('Selecting %d records from %d total records.' % (sample_size, population))

    # Draw the positions of the records to keep; a set gives cheap membership
    # tests while streaming through the input.
    random.seed()
    chosen_positions = set(random.sample(range(population), sample_size))

    reader = ProtocolBufferFileReader(None, filename=source_path, return_byte_string_only=True)
    writer = ProtocolBufferFileWriter(filename=dest_path, messages_are_byte_strings=True)

    position = 0
    for message in reader:
        if position in chosen_positions:
            writer.write(message)
        position += 1

    reader.close()
    writer.close()
def run(self):
    """Pass the input shards named on the command line to ``self.countFiles``.

    Command line: ``input_shard*``.  The usage text is printed and the
    process exits when no positional argument is given.
    """
    # This tool defines no flags; only positional arguments are used.
    flag_specs = []
    _unused_flags, shard_paths = parseCommandLine(flag_specs)

    if len(shard_paths) == 0:
        usage_template = '%s input_shard*'
        print(usage(sys.argv, usage_template, flag_specs))
        sys.exit()

    self.countFiles(shard_paths, verbose=True)
def run(self):
    """Hand every positional command-line argument to ``self.countFiles``.

    Command line: ``input_shard*``.  Without at least one positional
    argument the usage text is printed and the process exits.
    """
    no_flags = []  # the tool accepts positional arguments only
    parsed = parseCommandLine(no_flags)
    input_shards = parsed[1]

    if len(input_shards) < 1:
        print(usage(sys.argv, '%s input_shard*', no_flags))
        sys.exit()

    self.countFiles(input_shards, verbose=True)
def main(self):
    """Run ``self.processFile`` on the paths given by ``--input`` and ``--output``.

    Command line: ``--input --output``.  Any leftover positional argument is
    treated as a usage error: the usage text is printed and the process
    exits.
    """
    usage_template = '%s --input --output'
    flag_specs = [
        ['input=', 'The input file containing Communication protos.', True],
        ['output=', 'Where to save the processed data.', True],
    ]

    parsed_flags, leftovers = parseCommandLine(flag_specs, command_line=usage_template)
    if len(leftovers) > 0:
        print(usage(sys.argv, usage_template, flag_specs))
        sys.exit()

    source_path = parsed_flags['input']
    dest_path = parsed_flags['output']
    self.processFile(source_path, dest_path)
def run(self):
    """Invoke ``self.randomize`` on an input shard and an output shard.

    Command line: ``input_shard output_shard``.  The usage text is printed
    and the process exits unless exactly two positional arguments are given.
    """
    usage_template = '%s input_shard output_shard'
    flag_specs = []  # no flags; positional arguments only

    _unused_flags, positional = parseCommandLine(flag_specs, command_line=usage_template)

    # Wrap stdout in a UTF-8 writer before anything is printed, including
    # the usage text below.
    utf8_writer_factory = codecs.getwriter('utf8')
    sys.stdout = utf8_writer_factory(sys.stdout)

    if len(positional) != 2:
        print(usage(sys.argv, usage_template, flag_specs))
        sys.exit()

    source_path, dest_path = positional[0], positional[1]
    self.randomize(source_path, dest_path)
def run(self):
    """Call ``self.randomize`` with the two shard paths from the command line.

    Command line: ``input_shard output_shard``.  Any other number of
    positional arguments prints the usage text and exits.
    """
    no_flags = []
    usage_template = '%s input_shard output_shard'
    parsed = parseCommandLine(no_flags, command_line=usage_template)
    shard_paths = parsed[1]

    # All later output, the usage text included, goes through a UTF-8
    # encoding writer.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    if len(shard_paths) != 2:
        print(usage(sys.argv, usage_template, no_flags))
        sys.exit()

    self.randomize(shard_paths[0], shard_paths[1])
def run(self):
    """Split one input shard into several output shards, or only count it.

    Command line: ``--output_shard_prefix=shard_prefix input_shard``.
    Exactly one of ``--max_shard_size``, ``--max_records_per_shard`` and
    ``--count_records_only`` must be given; ``--output_shard_prefix`` is
    required unless only counting.  On a usage error a message is printed
    and the process exits.

    Output shards are named by ``self.createShardFilename(prefix, k)`` for
    k = 1, 2, ...  The first shard is always created, even for an empty
    input.  Later shards are opened only when there is a record to put in
    them, so an input ending exactly on a shard boundary leaves no empty
    trailing shard.  In count-only mode no file is written and no file
    count is reported.
    """
    # Specify options.
    options = [
        ('max_shard_size=', 'The maximum size of each shard in bytes.'),
        ('max_records_per_shard=', 'The maximum number of records in each shard.'),
        ('count_records_only', 'Counts the number of protobufs in the file and exits.'),
        ('output_shard_prefix=',
         'REQUIRED (unless count_records_only): Creates shards starting with this file prefix.'),
    ]

    # Start main method here.
    options_hash, remainder = parseCommandLine(options)
    if len(remainder) != 1:
        command_line = '%s --output_shard_prefix=shard_prefix input_shard'
        print(usage(sys.argv, command_line, options))
        sys.exit()

    num_options_specified = 0
    max_shard_size = None
    max_records_per_shard = None
    count_records_only = False

    if 'max_shard_size' in options_hash:
        max_shard_size = int(options_hash['max_shard_size'])
        num_options_specified += 1
    if 'max_records_per_shard' in options_hash:
        max_records_per_shard = int(options_hash['max_records_per_shard'])
        num_options_specified += 1
        print('Using %d records per shard.' % max_records_per_shard)
    if 'count_records_only' in options_hash:
        print('Only counting records.')
        count_records_only = True
        num_options_specified += 1

    if num_options_specified != 1:
        print('Only one of the following options must be specified:')
        print('\t max_shard_size, max_records_per_shard, count_records_only')
        sys.exit()

    if not count_records_only and 'output_shard_prefix' not in options_hash:
        print('output_shard_prefix is a required option.')
        sys.exit()

    output_shard_prefix = None
    if 'output_shard_prefix' in options_hash:
        output_shard_prefix = options_hash['output_shard_prefix']

    def open_shard(shard_number):
        # Create the writer for the shard_number-th output shard (1-based).
        return ProtocolBufferFileWriter(
            filename=self.createShardFilename(output_shard_prefix, shard_number),
            messages_are_byte_strings=True)

    input_file = remainder[0]
    reader = ProtocolBufferFileReader(None, filename=input_file, return_byte_string_only=True)

    num_messages_written = 0   # records in the shard currently being filled
    bytes_written = 0          # bytes in the shard currently being filled
    total_num_messages = 0     # records seen in the whole input
    num_files_written = 0      # output shards actually opened
    writer = None

    try:
        if not count_records_only:
            # The first shard is created eagerly so that output exists even
            # when the input holds no records.
            num_files_written += 1
            writer = open_shard(num_files_written)

        for message in reader:
            total_num_messages += 1
            if count_records_only:
                continue

            if writer is None:
                # The previous shard filled up; start the next one only now
                # that there is a record to put in it.
                num_files_written += 1
                writer = open_shard(num_files_written)

            writer.write(message)
            num_messages_written += 1
            bytes_written += len(message) + 4  # +4 for the message size prefix

            shard_is_full = (
                (max_shard_size is not None and bytes_written >= max_shard_size) or
                (max_records_per_shard is not None and
                 num_messages_written >= max_records_per_shard))
            if shard_is_full:
                writer.close()
                writer = None
                bytes_written = 0
                num_messages_written = 0
    finally:
        # Release both files even if reading or writing fails part-way.
        if writer is not None:
            writer.close()
        reader.close()

    print('Number of records in shard: %d' % total_num_messages)
    if num_files_written != 0:
        print('Number of files written: %d' % num_files_written)
def run(self):
    """Split one input shard into several output shards, or only count it.

    Command line: ``--output_shard_prefix=shard_prefix input_shard``.
    Exactly one of ``--max_shard_size``, ``--max_records_per_shard`` and
    ``--count_records_only`` must be given; ``--output_shard_prefix`` is
    required unless only counting.  On a usage error a message is printed
    and the process exits.

    Output shards are named by ``self.createShardFilename(prefix, k)`` for
    k = 1, 2, ...  The first shard is always created, even for an empty
    input.  Later shards are opened only when there is a record to put in
    them, so an input ending exactly on a shard boundary leaves no empty
    trailing shard.  In count-only mode no file is written and no file
    count is reported.
    """
    # Specify options.
    options = [
        ('max_shard_size=', 'The maximum size of each shard in bytes.'),
        ('max_records_per_shard=', 'The maximum number of records in each shard.'),
        ('count_records_only', 'Counts the number of protobufs in the file and exits.'),
        ('output_shard_prefix=',
         'REQUIRED (unless count_records_only): Creates shards starting with this file prefix.'),
    ]

    # Start main method here.
    options_hash, remainder = parseCommandLine(options)
    if len(remainder) != 1:
        command_line = '%s --output_shard_prefix=shard_prefix input_shard'
        print(usage(sys.argv, command_line, options))
        sys.exit()

    num_options_specified = 0
    max_shard_size = None
    max_records_per_shard = None
    count_records_only = False

    if 'max_shard_size' in options_hash:
        max_shard_size = int(options_hash['max_shard_size'])
        num_options_specified += 1
    if 'max_records_per_shard' in options_hash:
        max_records_per_shard = int(options_hash['max_records_per_shard'])
        num_options_specified += 1
        print('Using %d records per shard.' % max_records_per_shard)
    if 'count_records_only' in options_hash:
        print('Only counting records.')
        count_records_only = True
        num_options_specified += 1

    if num_options_specified != 1:
        print('Only one of the following options must be specified:')
        print('\t max_shard_size, max_records_per_shard, count_records_only')
        sys.exit()

    if not count_records_only and 'output_shard_prefix' not in options_hash:
        print('output_shard_prefix is a required option.')
        sys.exit()

    output_shard_prefix = None
    if 'output_shard_prefix' in options_hash:
        output_shard_prefix = options_hash['output_shard_prefix']

    def open_shard(shard_number):
        # Create the writer for the shard_number-th output shard (1-based).
        return ProtocolBufferFileWriter(
            filename=self.createShardFilename(output_shard_prefix, shard_number),
            messages_are_byte_strings=True)

    input_file = remainder[0]
    reader = ProtocolBufferFileReader(None, filename=input_file, return_byte_string_only=True)

    num_messages_written = 0   # records in the shard currently being filled
    bytes_written = 0          # bytes in the shard currently being filled
    total_num_messages = 0     # records seen in the whole input
    num_files_written = 0      # output shards actually opened
    writer = None

    try:
        if not count_records_only:
            # The first shard is created eagerly so that output exists even
            # when the input holds no records.
            num_files_written += 1
            writer = open_shard(num_files_written)

        for message in reader:
            total_num_messages += 1
            if count_records_only:
                continue

            if writer is None:
                # The previous shard filled up; start the next one only now
                # that there is a record to put in it.
                num_files_written += 1
                writer = open_shard(num_files_written)

            writer.write(message)
            num_messages_written += 1
            bytes_written += len(message) + 4  # +4 for the message size prefix

            shard_is_full = (
                (max_shard_size is not None and bytes_written >= max_shard_size) or
                (max_records_per_shard is not None and
                 num_messages_written >= max_records_per_shard))
            if shard_is_full:
                writer.close()
                writer = None
                bytes_written = 0
                num_messages_written = 0
    finally:
        # Release both files even if reading or writing fails part-way.
        if writer is not None:
            writer.close()
        reader.close()

    print('Number of records in shard: %d' % total_num_messages)
    if num_files_written != 0:
        print('Number of files written: %d' % num_files_written)