def main(self):
    """Copy the last ``--n`` messages of an input shard to an output shard.

    Command line: ``--n=number_to_print input_shard output_shard``.
    Exactly two positional arguments are required; otherwise the usage
    text is printed and the process exits with a non-zero status.

    When ``--n`` is larger than the count returned by
    ``countProtosInFile`` for the input shard, every message is copied.
    """
    from contextlib import closing

    # Specify options.
    options = [("n=", "The number of protocol buffers to save.", True)]

    # Start main method here.
    command_line = "%s --n=number_to_print input_shard output_shard"
    options_hash, remainder = parseCommandLine(options, command_line=command_line)
    if len(remainder) != 2:
        # Single-argument parenthesized print emits the same text as the
        # print statement did.
        print(usage(sys.argv, command_line, options))
        # A wrong argument count is an error, so do not report success.
        sys.exit(1)

    input_shard, output_shard = remainder

    # Parse the count before the reader and writer are constructed so a
    # malformed value fails before either of them exists.
    num_messages_to_print = int(options_hash["n"])

    # Index of the first message that belongs to the tail; clamped at 0 so
    # asking for more messages than exist copies the whole shard.
    num_in_file = countProtosInFile(input_shard)
    first_message_to_print = max(0, num_in_file - num_messages_to_print)

    # closing() guarantees close() on both objects even when iteration or a
    # write raises. Nesting closes the writer first, then the reader, which
    # is the order the explicit close() calls used.
    with closing(ProtocolBufferFileReader(
            None, filename=input_shard,
            return_byte_string_only=True)) as reader:
        with closing(ProtocolBufferFileWriter(
                filename=output_shard,
                messages_are_byte_strings=True)) as writer:
            for index, message in enumerate(reader):
                if index >= first_message_to_print:
                    writer.write(message)
def main(self):
    """Copy the first ``--n`` messages of an input shard to an output shard.

    Command line: ``--n=number_to_print input_shard output_shard``.
    Exactly two positional arguments are required; otherwise the usage
    text is printed and the process exits with a non-zero status.

    When the input shard holds fewer than ``--n`` messages, all of them
    are copied.
    """
    from contextlib import closing

    # Specify options.
    options = [
        ('n=', 'The number of protocol buffers to save.', True),
    ]

    # Start main method here.
    command_line = '%s --n=number_to_print input_shard output_shard'
    options_hash, remainder = parseCommandLine(options, command_line=command_line)
    if len(remainder) != 2:
        # Single-argument parenthesized print emits the same text as the
        # print statement did.
        print(usage(sys.argv, command_line, options))
        # A wrong argument count is an error, so do not report success.
        sys.exit(1)

    input_shard, output_shard = remainder

    # Parse the count before the reader and writer are constructed so a
    # malformed value fails before either of them exists.
    num_messages_to_print = int(options_hash['n'])

    # closing() guarantees close() on both objects even when iteration or a
    # write raises. Nesting closes the writer first, then the reader, which
    # is the order the explicit close() calls used.
    with closing(ProtocolBufferFileReader(
            None, filename=input_shard,
            return_byte_string_only=True)) as reader:
        with closing(ProtocolBufferFileWriter(
                filename=output_shard,
                messages_are_byte_strings=True)) as writer:
            for index, message in enumerate(reader):
                # Stop as soon as the requested number has been written.
                if index >= num_messages_to_print:
                    break
                writer.write(message)
def main(self):
    """Copy the first ``--n`` messages of an input shard to an output shard.

    Command line: ``--n=number_to_print input_shard output_shard``.
    Exactly two positional arguments are required; otherwise the usage
    text is printed and the process exits with a non-zero status.

    When the input shard holds fewer than ``--n`` messages, all of them
    are copied.
    """
    from contextlib import closing

    # Specify options.
    options = [
        ('n=', 'The number of protocol buffers to save.', True),
    ]

    # Start main method here.
    command_line = '%s --n=number_to_print input_shard output_shard'
    options_hash, remainder = parseCommandLine(options, command_line=command_line)
    if len(remainder) != 2:
        # Single-argument parenthesized print emits the same text as the
        # print statement did.
        print(usage(sys.argv, command_line, options))
        # A wrong argument count is an error, so do not report success.
        sys.exit(1)

    input_shard, output_shard = remainder

    # Parse the count before the reader and writer are constructed so a
    # malformed value fails before either of them exists.
    num_messages_to_print = int(options_hash['n'])

    # closing() guarantees close() on both objects even when iteration or a
    # write raises. Nesting closes the writer first, then the reader, which
    # is the order the explicit close() calls used.
    with closing(ProtocolBufferFileReader(
            None, filename=input_shard,
            return_byte_string_only=True)) as reader:
        with closing(ProtocolBufferFileWriter(
                filename=output_shard,
                messages_are_byte_strings=True)) as writer:
            for index, message in enumerate(reader):
                # Stop as soon as the requested number has been written.
                if index >= num_messages_to_print:
                    break
                writer.write(message)
def run(self):
    """Write a random sample of an input shard's records to an output shard.

    Command line: ``--num_records=n input_shard output_shard``.
    Exactly two positional arguments are required; otherwise the usage
    text is printed and the process exits with a non-zero status.

    Record indices are drawn without replacement via ``random.sample``;
    the chosen records are written in the order they occur in the input.
    A request for more records than ``countProtosInFile`` reports (or for
    a negative number) is clamped to the valid range and a note is
    written to stderr.
    """
    # Specify options.
    options = [
        ('num_records=', 'Number of records to select.', True),
    ]

    # Start main method here.
    command_line = '%s --num_records=n input_shard output_shard'
    options_hash, remainder = parseCommandLine(options, command_line=command_line)
    if len(remainder) != 2:
        # Single-argument parenthesized print emits the same text as the
        # print statement did.
        print(usage(sys.argv, command_line, options))
        # A wrong argument count is an error, so do not report success.
        sys.exit(1)

    num_records = int(options_hash['num_records'])
    input_file, output_file = remainder

    total_records = countProtosInFile(input_file)
    print('Selecting %d records from %d total records.' % (num_records, total_records))

    # random.sample raises ValueError for a sample larger than the
    # population or a negative size; clamp instead of crashing.
    sample_size = max(0, min(num_records, total_records))
    if sample_size != num_records:
        sys.stderr.write(
            'Requested %d records; selecting %d instead.\n'
            % (num_records, sample_size))

    random.seed()

    # Randomly select some records to use.
    records_to_use = set(random.sample(range(total_records), sample_size))

    reader = ProtocolBufferFileReader(
        None, filename=input_file, return_byte_string_only=True)
    writer = None
    try:
        writer = ProtocolBufferFileWriter(
            filename=output_file, messages_are_byte_strings=True)
        for index, message in enumerate(reader):
            if index in records_to_use:
                writer.write(message)
    finally:
        # Always release both objects, keeping the reader-then-writer
        # close order; the inner finally ensures the writer is closed even
        # if closing the reader raises.
        try:
            reader.close()
        finally:
            if writer is not None:
                writer.close()
def randomize(self, input_file, output_file):
    """Write the messages of ``input_file`` to ``output_file`` in shuffled order.

    Every message is read into memory first, the list is shuffled with
    ``random.shuffle``, and only then is the writer opened.

    Args:
        input_file: Filename handed to ``ProtocolBufferFileReader``.
        output_file: Filename handed to ``ProtocolBufferFileWriter``.
    """
    reader = ProtocolBufferFileReader(
        None, filename=input_file, return_byte_string_only=True)
    try:
        # Named `messages` rather than `buffer`, which is a builtin in
        # Python 2.
        messages = [message for message in reader]
    finally:
        # Close the reader even if iteration raises.
        reader.close()

    random.shuffle(messages)

    # The writer is created only after the reader has been closed, as
    # before. NOTE(review): presumably this is what lets input_file and
    # output_file be the same path; confirm against callers.
    writer = ProtocolBufferFileWriter(
        filename=output_file, messages_are_byte_strings=True)
    try:
        for message in messages:
            writer.write(message)
    finally:
        # Close the writer even if a write raises.
        writer.close()
def randomize(self, input_file, output_file):
    """Write the messages of ``input_file`` to ``output_file`` in shuffled order.

    Every message is read into memory first, the list is shuffled with
    ``random.shuffle``, and only then is the writer opened.

    Args:
        input_file: Filename handed to ``ProtocolBufferFileReader``.
        output_file: Filename handed to ``ProtocolBufferFileWriter``.
    """
    reader = ProtocolBufferFileReader(
        None, filename=input_file, return_byte_string_only=True)
    try:
        # Named `messages` rather than `buffer`, which is a builtin in
        # Python 2.
        messages = [message for message in reader]
    finally:
        # Close the reader even if iteration raises.
        reader.close()

    random.shuffle(messages)

    # The writer is created only after the reader has been closed, as
    # before. NOTE(review): presumably this is what lets input_file and
    # output_file be the same path; confirm against callers.
    writer = ProtocolBufferFileWriter(
        filename=output_file, messages_are_byte_strings=True)
    try:
        for message in messages:
            writer.write(message)
    finally:
        # Close the writer even if a write raises.
        writer.close()
def run(self):
    """Write a random sample of an input shard's records to an output shard.

    Command line: ``--num_records=n input_shard output_shard``.
    Exactly two positional arguments are required; otherwise the usage
    text is printed and the process exits with a non-zero status.

    Record indices are drawn without replacement via ``random.sample``;
    the chosen records are written in the order they occur in the input.
    A request for more records than ``countProtosInFile`` reports (or for
    a negative number) is clamped to the valid range and a note is
    written to stderr.
    """
    # Specify options.
    options = [
        ('num_records=', 'Number of records to select.', True),
    ]

    # Start main method here.
    command_line = '%s --num_records=n input_shard output_shard'
    options_hash, remainder = parseCommandLine(options, command_line=command_line)
    if len(remainder) != 2:
        # Single-argument parenthesized print emits the same text as the
        # print statement did.
        print(usage(sys.argv, command_line, options))
        # A wrong argument count is an error, so do not report success.
        sys.exit(1)

    num_records = int(options_hash['num_records'])
    input_file, output_file = remainder

    total_records = countProtosInFile(input_file)
    print('Selecting %d records from %d total records.' % (num_records, total_records))

    # random.sample raises ValueError for a sample larger than the
    # population or a negative size; clamp instead of crashing.
    sample_size = max(0, min(num_records, total_records))
    if sample_size != num_records:
        sys.stderr.write(
            'Requested %d records; selecting %d instead.\n'
            % (num_records, sample_size))

    random.seed()

    # Randomly select some records to use.
    records_to_use = set(random.sample(range(total_records), sample_size))

    reader = ProtocolBufferFileReader(
        None, filename=input_file, return_byte_string_only=True)
    writer = None
    try:
        writer = ProtocolBufferFileWriter(
            filename=output_file, messages_are_byte_strings=True)
        for index, message in enumerate(reader):
            if index in records_to_use:
                writer.write(message)
    finally:
        # Always release both objects, keeping the reader-then-writer
        # close order; the inner finally ensures the writer is closed even
        # if closing the reader raises.
        try:
            reader.close()
        finally:
            if writer is not None:
                writer.close()
def processFile(self, input, output):
    """Run ``processCommunication`` on each message of ``input`` and copy it to ``output``.

    Each message read from ``input`` is passed to
    ``self.processCommunication`` and then written to ``output``. A
    running document count is shown on stdout every 100 messages, and
    three summary lines are printed once both files are closed.

    Args:
        input: Filename handed to ``ProtocolBufferFileReader``.
        output: Filename handed to ``ProtocolBufferFileWriter``.
    """
    num_docs = 0
    num_entities = 0
    num_entities_set = 0

    reader = ProtocolBufferFileReader(Communication, filename=input)
    writer = None
    try:
        writer = ProtocolBufferFileWriter(filename=output)
        for msg in reader:
            # processCommunication returns a pair of counts for this
            # message. NOTE(review): presumably it also updates msg in
            # place before it is written below; confirm.
            new_num_entities, new_num_entities_set = self.processCommunication(msg)
            num_entities_set += new_num_entities_set
            num_entities += new_num_entities
            num_docs += 1
            writer.write(msg)

            if num_docs % 100 == 0:
                # '\r' returns to the start of the line so the next
                # progress value overwrites this one.
                sys.stdout.write(str(num_docs) + '\r')
                sys.stdout.flush()
    finally:
        # Always release both objects, keeping the reader-then-writer
        # close order; the inner finally ensures the writer is closed even
        # if closing the reader raises.
        try:
            reader.close()
        finally:
            if writer is not None:
                writer.close()

    # Single-argument parenthesized print emits the same text as the
    # print statement did.
    print('Processed %d communications.' % num_docs)
    print('Processed %d entities.' % num_entities)
    print('Assigned %d canonical names.' % num_entities_set)