Exemple #1
0
    def main(self):
        """Copy the last ``n`` messages of an input shard to an output shard.

        Command line: ``--n=number_to_print input_shard output_shard``.
        Prints the usage text and exits when the two positional arguments
        are not both present. When ``n`` is at least the number of messages
        in the input shard, every message is copied.

        Raises:
            ValueError: if the value given for ``--n`` is not an integer.
        """
        # Specify options.
        options = [("n=", "The number of protocol buffers to save.", True)]
        # Start main method here.
        command_line = "%s --n=number_to_print input_shard output_shard"
        options_hash, remainder = parseCommandLine(options, command_line=command_line)

        if len(remainder) != 2:
            print(usage(sys.argv, command_line, options))
            sys.exit()

        input_shard, output_shard = remainder

        # Parse the count before constructing the reader and writer, so a
        # malformed --n value fails before either of them exists.
        num_messages_to_print = int(options_hash["n"])

        # Index of the first message to keep; clamped so that asking for more
        # messages than the shard holds copies the whole shard.
        num_in_file = countProtosInFile(input_shard)
        first_message_to_print = max(0, num_in_file - num_messages_to_print)

        reader = ProtocolBufferFileReader(None, filename=input_shard, return_byte_string_only=True)
        try:
            writer = ProtocolBufferFileWriter(filename=output_shard, messages_are_byte_strings=True)
            try:
                for index, message in enumerate(reader):
                    if index >= first_message_to_print:
                        writer.write(message)
            finally:
                # Close even when reading or writing raised part-way through.
                writer.close()
        finally:
            reader.close()
	def run(self):
		"""Copy a uniformly random subset of records between two shards.

		Command line: ``--num_records=n input_shard output_shard``.
		Prints the usage text and exits when the two positional arguments
		are not both present. Selected records are written in the order in
		which they appear in the input.

		Raises:
			ValueError: if ``--num_records`` is not an integer, or (from
				``random.sample``) if it is negative or larger than the
				number of records in the input.
		"""
		# Specify options.
		options = [
					('num_records=', 'Number of records to select.', True),
					]
		# Start main method here.

		command_line = '%s --num_records=n input_shard output_shard'
		options_hash, remainder = parseCommandLine(options, command_line=command_line)

		if len(remainder) != 2:
			print(usage(sys.argv, command_line, options))
			sys.exit()

		num_records = int(options_hash['num_records'])
		input_file, output_file = remainder

		total_records = countProtosInFile(input_file)

		print('Selecting %d records from %d total records.' % (num_records, total_records))
		random.seed()
		# Randomly select the indices of the records to keep; a set gives
		# constant-time membership tests while streaming the input.
		records_to_use = set(random.sample(range(total_records), num_records))

		reader = ProtocolBufferFileReader(None, filename=input_file, return_byte_string_only=True)
		try:
			writer = ProtocolBufferFileWriter(filename=output_file, messages_are_byte_strings=True)
			try:
				for ii, message in enumerate(reader):
					if ii in records_to_use:
						writer.write(message)
			finally:
				# Close even when reading or writing raised part-way through.
				writer.close()
		finally:
			reader.close()
    def countFiles(self, files, verbose=False):
        """Return the total number of records across ``files``.

        Args:
            files: Iterable of file names, each passed to
                ``countProtosInFile``. Any iterable is accepted, including
                generators, because the files are counted while iterating
                rather than with ``len()``.
            verbose: When true, print one tab-separated "name, count" line
                per file, followed by a ``Total`` line if more than one
                file was given.

        Returns:
            The sum of the per-file record counts (0 for no files).
        """
        total_records = 0
        num_files = 0
        for filename in files:
            num_records = countProtosInFile(filename)
            if verbose:
                print('%s\t%d' % (filename, num_records))
            total_records += num_records
            num_files += 1

        # A grand total is only informative when several files were listed.
        if verbose and num_files > 1:
            print('Total: %d' % (total_records))
        return total_records
	def countFiles(self, files, verbose=False):
		"""Return the total number of records across ``files``.

		Args:
			files: Iterable of file names, each passed to
				``countProtosInFile``. Any iterable is accepted, including
				generators, because the files are counted while iterating
				rather than with ``len()``.
			verbose: When true, print one tab-separated "name, count" line
				per file, followed by a ``Total`` line if more than one
				file was given.

		Returns:
			The sum of the per-file record counts (0 for no files).
		"""
		total_records = 0
		num_files = 0
		for filename in files:
			num_records = countProtosInFile(filename)
			if verbose:
				print('%s\t%d' % (filename, num_records))
			total_records += num_records
			num_files += 1

		# A grand total is only informative when several files were listed.
		if verbose and num_files > 1:
			print('Total: %d' % (total_records))
		return total_records
    def run(self):
        """Copy a uniformly random subset of records between two shards.

        Command line: ``--num_records=n input_shard output_shard``.
        Prints the usage text and exits when the two positional arguments
        are not both present. Selected records are written in the order in
        which they appear in the input.

        Raises:
            ValueError: if ``--num_records`` is not an integer, or (from
                ``random.sample``) if it is negative or larger than the
                number of records in the input.
        """
        # Specify options.
        options = [
            ('num_records=', 'Number of records to select.', True),
        ]
        # Start main method here.

        command_line = '%s --num_records=n input_shard output_shard'
        options_hash, remainder = parseCommandLine(options,
                                                   command_line=command_line)

        if len(remainder) != 2:
            print(usage(sys.argv, command_line, options))
            sys.exit()

        num_records = int(options_hash['num_records'])
        input_file, output_file = remainder

        total_records = countProtosInFile(input_file)

        print('Selecting %d records from %d total records.' % (num_records,
                                                               total_records))
        random.seed()
        # Randomly select the indices of the records to keep; a set gives
        # constant-time membership tests while streaming the input.
        records_to_use = set(random.sample(range(total_records), num_records))

        reader = ProtocolBufferFileReader(None,
                                          filename=input_file,
                                          return_byte_string_only=True)
        try:
            writer = ProtocolBufferFileWriter(filename=output_file,
                                              messages_are_byte_strings=True)
            try:
                for ii, message in enumerate(reader):
                    if ii in records_to_use:
                        writer.write(message)
            finally:
                # Close even when reading or writing raised part-way through.
                writer.close()
        finally:
            reader.close()
Exemple #6
0
    def main(self):
        """Copy the last ``n`` messages of an input shard to an output shard.

        Command line: ``--n=number_to_print input_shard output_shard``.
        Prints the usage text and exits when the two positional arguments
        are not both present. When ``n`` is at least the number of messages
        in the input shard, every message is copied.

        Raises:
            ValueError: if the value given for ``--n`` is not an integer.
        """
        # Specify options.
        options = [
            ('n=', 'The number of protocol buffers to save.', True),
        ]
        # Start main method here.
        command_line = '%s --n=number_to_print input_shard output_shard'
        options_hash, remainder = parseCommandLine(options,
                                                   command_line=command_line)

        if len(remainder) != 2:
            print(usage(sys.argv, command_line, options))
            sys.exit()

        input_shard, output_shard = remainder

        # Parse the count before constructing the reader and writer, so a
        # malformed --n value fails before either of them exists.
        num_messages_to_print = int(options_hash['n'])

        # Index of the first message to keep; clamped so that asking for more
        # messages than the shard holds copies the whole shard.
        num_in_file = countProtosInFile(input_shard)
        first_message_to_print = max(0, num_in_file - num_messages_to_print)

        reader = ProtocolBufferFileReader(None,
                                          filename=input_shard,
                                          return_byte_string_only=True)
        try:
            writer = ProtocolBufferFileWriter(filename=output_shard,
                                              messages_are_byte_strings=True)
            try:
                for index, message in enumerate(reader):
                    if index >= first_message_to_print:
                        writer.write(message)
            finally:
                # Close even when reading or writing raised part-way through.
                writer.close()
        finally:
            reader.close()