def main(self):
    """Copy the last ``n`` messages of an input shard to an output shard.

    Command line: ``--n=number_to_print input_shard output_shard``.
    If the input holds fewer than ``n`` messages, every message is copied.
    Prints the usage text and exits when the two positional arguments are
    not supplied.

    Raises:
        ValueError: if the value of ``--n`` is not an integer.
    """
    # Specify options.
    options = [("n=", "The number of protocol buffers to save.", True)]

    # Start main method here.
    command_line = "%s --n=number_to_print input_shard output_shard"
    options_hash, remainder = parseCommandLine(options, command_line=command_line)
    if len(remainder) != 2:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(usage(sys.argv, command_line, options))
        sys.exit()

    input_shard, output_shard = remainder

    # Validate --n before any file is opened, so a malformed value cannot
    # leave a freshly created output shard behind.
    num_messages_to_print = int(options_hash["n"])

    num_in_file = countProtosInFile(input_shard)
    # Index of the first message to keep; clamp so that asking for more
    # messages than exist copies the whole shard.
    first_message_to_print = max(num_in_file - num_messages_to_print, 0)

    reader = ProtocolBufferFileReader(None, filename=input_shard,
                                      return_byte_string_only=True)
    try:
        writer = ProtocolBufferFileWriter(filename=output_shard,
                                          messages_are_byte_strings=True)
        try:
            for index, message in enumerate(reader):
                if index >= first_message_to_print:
                    writer.write(message)
        finally:
            # Close even when the copy fails part-way through.
            writer.close()
    finally:
        reader.close()
def run(self):
    """Copy a uniformly random subset of records to an output shard.

    Command line: ``--num_records=n input_shard output_shard``.
    ``n`` distinct record positions are drawn at random and the records at
    those positions are written in their original order. Prints the usage
    text and exits when the two positional arguments are not supplied.

    Raises:
        ValueError: if ``--num_records`` is not an integer or is negative.
    """
    # Specify options.
    options = [
        ('num_records=', 'Number of records to select.', True),
    ]

    # Start main method here.
    command_line = '%s --num_records=n input_shard output_shard'
    options_hash, remainder = parseCommandLine(options, command_line=command_line)
    if len(remainder) != 2:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(usage(sys.argv, command_line, options))
        sys.exit()

    num_records = int(options_hash['num_records'])
    input_file, output_file = remainder

    total_records = countProtosInFile(input_file)
    # random.sample raises ValueError when asked for more items than the
    # population holds; cap the request so an oversized value selects
    # every record instead of crashing.
    num_records = min(num_records, total_records)
    print('Selecting %d records from %d total records.' % (num_records, total_records))

    random.seed()
    # Randomly select some records to use.
    records_to_use = set(random.sample(range(total_records), num_records))

    reader = ProtocolBufferFileReader(None, filename=input_file,
                                      return_byte_string_only=True)
    try:
        writer = ProtocolBufferFileWriter(filename=output_file,
                                          messages_are_byte_strings=True)
        try:
            for ii, message in enumerate(reader):
                if ii in records_to_use:
                    writer.write(message)
        finally:
            # Close even when the copy fails part-way through.
            writer.close()
    finally:
        reader.close()
def countFiles(self, files, verbose=False):
    """Return the total number of records across ``files``.

    Args:
        files: Iterable of file names; each one is passed to
            ``countProtosInFile``. Any iterable works, including
            generators, because the number of files is counted while
            iterating rather than taken from ``len(files)``.
        verbose: When true, print one tab-separated "name, count" line per
            file and, if more than one file was counted, a final
            ``Total:`` line.

    Returns:
        The sum of the per-file counts (0 when ``files`` is empty).
    """
    total_records = 0
    num_files = 0
    # ``filename`` rather than ``file`` to avoid shadowing the builtin.
    for filename in files:
        num_records = countProtosInFile(filename)
        if verbose:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('%s\t%d' % (filename, num_records))
        total_records += num_records
        num_files += 1

    if num_files > 1 and verbose:
        print('Total: %d' % (total_records))
    return total_records
def main(self):
    """Write the trailing ``n`` protocol buffers of a shard to a new shard.

    Command line: ``--n=number_to_print input_shard output_shard``.
    When ``n`` is at least the number of messages in the input, the whole
    input is copied. Prints the usage text and exits when the two
    positional arguments are not supplied.

    Raises:
        ValueError: if the value of ``--n`` is not an integer.
    """
    # Specify options.
    options = [
        ('n=', 'The number of protocol buffers to save.', True),
    ]

    # Start main method here.
    command_line = '%s --n=number_to_print input_shard output_shard'
    options_hash, remainder = parseCommandLine(options, command_line=command_line)
    if len(remainder) != 2:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(usage(sys.argv, command_line, options))
        sys.exit()

    input_shard = remainder[0]
    output_shard = remainder[1]

    # Parse --n up front: a malformed value must fail before the output
    # shard is created.
    num_messages_to_print = int(options_hash['n'])

    num_in_file = countProtosInFile(input_shard)
    # Messages at or after this index are kept; never below zero so that an
    # oversized n keeps everything.
    first_message_to_print = max(num_in_file - num_messages_to_print, 0)

    reader = ProtocolBufferFileReader(None, filename=input_shard,
                                      return_byte_string_only=True)
    try:
        writer = ProtocolBufferFileWriter(filename=output_shard,
                                          messages_are_byte_strings=True)
        try:
            for num_messages, message in enumerate(reader):
                if num_messages >= first_message_to_print:
                    writer.write(message)
        finally:
            # Release the writer even if reading or writing raised.
            writer.close()
    finally:
        reader.close()