Example #1
0
    def main(self):
        """Copy the last ``n`` records of an input shard to an output shard.

        Command line: ``--n=number_to_print input_shard output_shard``.

        The input is first counted with ``countProtosInFile`` and then read
        again from the start; records whose position is at or beyond
        ``count - n`` are written.  When ``n`` is larger than the number of
        records, every record is copied.

        Prints the usage text and exits when exactly two positional arguments
        are not supplied.  Raises ``ValueError`` when ``--n`` is not an integer.
        """
        # Specify options.
        # NOTE(review): the trailing True presumably marks the option as
        # required -- confirm against parseCommandLine.
        options = [("n=", "The number of protocol buffers to save.", True)]
        command_line = "%s --n=number_to_print input_shard output_shard"
        options_hash, remainder = parseCommandLine(options, command_line=command_line)

        if len(remainder) != 2:
            print(usage(sys.argv, command_line, options))
            sys.exit()

        input_shard, output_shard = remainder

        # Parse the count before the reader/writer are constructed so that a
        # malformed --n fails without an output writer ever being opened.
        num_messages_to_print = int(options_hash["n"])

        num_in_file = countProtosInFile(input_shard)
        # Position of the first record to keep, clamped to the start of the file.
        first_message_to_print = max(num_in_file - num_messages_to_print, 0)

        reader = ProtocolBufferFileReader(None, filename=input_shard, return_byte_string_only=True)
        try:
            writer = ProtocolBufferFileWriter(filename=output_shard, messages_are_byte_strings=True)
            try:
                for position, message in enumerate(reader):
                    if position >= first_message_to_print:
                        writer.write(message)
            finally:
                # Close even when reading or writing fails part-way through.
                writer.close()
        finally:
            reader.close()
Example #2
0
	def main(self):
		"""Copy the first ``n`` records of an input shard to an output shard.

		Command line: ``--n=number_to_print input_shard output_shard``.

		When the input holds fewer than ``n`` records, all of them are copied.
		A value of ``n`` that is zero or negative copies nothing.

		Prints the usage text and exits when exactly two positional arguments
		are not supplied.  Raises ``ValueError`` when ``--n`` is not an integer.
		"""
		# Specify options.
		# NOTE(review): the trailing True presumably marks the option as
		# required -- confirm against parseCommandLine.
		options = [
			('n=', 'The number of protocol buffers to save.', True),
		]
		command_line = '%s --n=number_to_print input_shard output_shard'
		options_hash, remainder = parseCommandLine(options, command_line=command_line)

		if len(remainder) != 2:
			print(usage(sys.argv, command_line, options))
			sys.exit()

		input_shard = remainder[0]
		output_shard = remainder[1]

		# Parse the count before the reader/writer are constructed so that a
		# malformed --n fails without an output writer ever being opened.
		num_messages_to_print = int(options_hash['n'])

		reader = ProtocolBufferFileReader(None, filename=input_shard, return_byte_string_only=True)
		try:
			writer = ProtocolBufferFileWriter(filename=output_shard, messages_are_byte_strings=True)
			try:
				num_messages = 0
				# Skip the loop entirely for n <= 0, and stop right after the
				# n-th write so no extra record is pulled from the reader.
				if num_messages_to_print > 0:
					for message in reader:
						writer.write(message)
						num_messages += 1
						if num_messages >= num_messages_to_print:
							break
			finally:
				# Close even when reading or writing fails part-way through.
				writer.close()
		finally:
			reader.close()
	def run(self):
		"""Write a random subset of an input shard's records to an output shard.

		Command line: ``--num_records=n input_shard output_shard``.

		Record positions are drawn without replacement via ``random.sample``;
		the chosen records are written in the order they appear in the input.
		Asking for more records than the input holds selects all of them
		(``random.sample`` would otherwise raise ``ValueError``).

		Prints the usage text and exits when exactly two positional arguments
		are not supplied.  Exits with an error message when ``num_records`` is
		negative.  Raises ``ValueError`` when ``--num_records`` is not an
		integer.
		"""
		# Specify options.
		# NOTE(review): the trailing True presumably marks the option as
		# required -- confirm against parseCommandLine.
		options = [
			('num_records=', 'Number of records to select.', True),
		]
		command_line = '%s --num_records=n input_shard output_shard'
		options_hash, remainder = parseCommandLine(options, command_line=command_line)

		if len(remainder) != 2:
			print(usage(sys.argv, command_line, options))
			sys.exit()

		num_records = int(options_hash['num_records'])
		if num_records < 0:
			# A string argument makes sys.exit print it to stderr and exit
			# with a failure status.
			sys.exit('num_records must not be negative.')

		input_file = remainder[0]
		output_file = remainder[1]

		total_records = countProtosInFile(input_file)
		if num_records > total_records:
			# random.sample rejects a sample larger than the population.
			print('Only %d records are available; selecting all of them.' % total_records)
			num_records = total_records

		print('Selecting %d records from %d total records.' % (num_records, total_records))
		random.seed()
		# Randomly select the positions of the records to keep.
		records_to_use = set(random.sample(range(total_records), num_records))

		reader = ProtocolBufferFileReader(None, filename=input_file, return_byte_string_only=True)
		try:
			writer = ProtocolBufferFileWriter(filename=output_file, messages_are_byte_strings=True)
			try:
				for ii, message in enumerate(reader):
					if ii in records_to_use:
						writer.write(message)
			finally:
				# Close even when reading or writing fails part-way through.
				writer.close()
		finally:
			reader.close()
Example #4
0
	def randomize(self, input_file, output_file):
		"""Write the records of ``input_file`` to ``output_file`` in shuffled order.

		Every record is read into a list first, the reader is closed, the
		list is shuffled in place with ``random.shuffle``, and the records
		are then written out one by one.
		"""
		source = ProtocolBufferFileReader(None, filename=input_file, return_byte_string_only=True)
		# The whole input is held in memory so it can be shuffled in place.
		records = [record for record in source]
		source.close()

		random.shuffle(records)

		sink = ProtocolBufferFileWriter(filename=output_file, messages_are_byte_strings=True)
		for record in records:
			sink.write(record)
		sink.close()
Example #5
0
    def main(self):
        """Copy the first ``n`` records of an input shard to an output shard.

        Command line: ``--n=number_to_print input_shard output_shard``.

        When the input holds fewer than ``n`` records, all of them are copied.
        A value of ``n`` that is zero or negative copies nothing.

        Prints the usage text and exits when exactly two positional arguments
        are not supplied.  Raises ``ValueError`` when ``--n`` is not an integer.
        """
        from itertools import islice

        # Specify options.
        # NOTE(review): the trailing True presumably marks the option as
        # required -- confirm against parseCommandLine.
        options = [
            ('n=', 'The number of protocol buffers to save.', True),
        ]
        command_line = '%s --n=number_to_print input_shard output_shard'
        options_hash, remainder = parseCommandLine(options,
                                                   command_line=command_line)

        if len(remainder) != 2:
            print(usage(sys.argv, command_line, options))
            sys.exit()

        input_shard = remainder[0]
        output_shard = remainder[1]

        # Parse the count before the reader/writer are constructed so that a
        # malformed --n fails without an output writer ever being opened.
        num_messages_to_print = int(options_hash['n'])

        reader = ProtocolBufferFileReader(None,
                                          filename=input_shard,
                                          return_byte_string_only=True)
        try:
            writer = ProtocolBufferFileWriter(filename=output_shard,
                                              messages_are_byte_strings=True)
            try:
                # islice stops after n items without pulling an extra record
                # from the reader.  It rejects negative stops, so clamp to 0
                # to keep "negative n copies nothing".
                limit = max(num_messages_to_print, 0)
                for message in islice(reader, limit):
                    writer.write(message)
            finally:
                # Close even when reading or writing fails part-way through.
                writer.close()
        finally:
            reader.close()
Example #6
0
    def randomize(self, input_file, output_file):
        """Rewrite ``input_file`` into ``output_file`` with its records shuffled.

        All records are collected into a list, the reader is closed, the list
        is shuffled in place with ``random.shuffle``, and the shuffled records
        are written to a freshly opened writer.
        """
        shard_in = ProtocolBufferFileReader(None,
                                            filename=input_file,
                                            return_byte_string_only=True)
        # Collect everything up front; shuffling needs the full list.
        shuffled = [item for item in shard_in]
        shard_in.close()

        random.shuffle(shuffled)

        shard_out = ProtocolBufferFileWriter(filename=output_file,
                                             messages_are_byte_strings=True)
        for item in shuffled:
            shard_out.write(item)
        shard_out.close()
    def run(self):
        """Write a random subset of an input shard's records to an output shard.

        Command line: ``--num_records=n input_shard output_shard``.

        Record positions are drawn without replacement via ``random.sample``;
        the chosen records are written in the order they appear in the input.
        Asking for more records than the input holds selects all of them
        (``random.sample`` would otherwise raise ``ValueError``).

        Prints the usage text and exits when exactly two positional arguments
        are not supplied.  Exits with an error message when ``num_records`` is
        negative.  Raises ``ValueError`` when ``--num_records`` is not an
        integer.
        """
        # Specify options.
        # NOTE(review): the trailing True presumably marks the option as
        # required -- confirm against parseCommandLine.
        options = [
            ('num_records=', 'Number of records to select.', True),
        ]
        command_line = '%s --num_records=n input_shard output_shard'
        options_hash, remainder = parseCommandLine(options,
                                                   command_line=command_line)

        if len(remainder) != 2:
            print(usage(sys.argv, command_line, options))
            sys.exit()

        num_records = int(options_hash['num_records'])
        if num_records < 0:
            # A string argument makes sys.exit print it to stderr and exit
            # with a failure status.
            sys.exit('num_records must not be negative.')

        input_file = remainder[0]
        output_file = remainder[1]

        total_records = countProtosInFile(input_file)
        if num_records > total_records:
            # random.sample rejects a sample larger than the population.
            print('Only %d records are available; selecting all of them.' %
                  total_records)
            num_records = total_records

        print('Selecting %d records from %d total records.' % (num_records,
                                                               total_records))
        random.seed()
        # Randomly select the positions of the records to keep.
        records_to_use = set(random.sample(range(total_records), num_records))

        reader = ProtocolBufferFileReader(None,
                                          filename=input_file,
                                          return_byte_string_only=True)
        try:
            writer = ProtocolBufferFileWriter(filename=output_file,
                                              messages_are_byte_strings=True)
            try:
                for ii, message in enumerate(reader):
                    if ii in records_to_use:
                        writer.write(message)
            finally:
                # Close even when reading or writing fails part-way through.
                writer.close()
        finally:
            reader.close()
Example #8
0
	def processFile(self, input, output):
		"""Run ``processCommunication`` over every record of ``input`` and copy it to ``output``.

		Each record read from ``input`` is handed to
		``self.processCommunication``, which must return a pair of counts
		(entities, entities set); the record is then written to ``output``.
		A running record count is shown on stdout every 100 records and three
		summary lines are printed after both files are closed.
		"""
		# NOTE(review): Communication is presumably the message class the
		# reader decodes into -- confirm against ProtocolBufferFileReader.
		reader = ProtocolBufferFileReader(Communication, filename=input)
		writer = ProtocolBufferFileWriter(filename=output)

		doc_count = 0
		entity_count = 0
		named_count = 0
		for communication in reader:
			entities_seen, entities_named = self.processCommunication(communication)
			entity_count += entities_seen
			named_count += entities_named
			doc_count += 1

			# Written after processing, so the record is copied in whatever
			# state processCommunication left it.
			writer.write(communication)

			if not doc_count % 100:
				# Carriage return keeps the progress counter on one line.
				sys.stdout.write('%d\r' % doc_count)
				sys.stdout.flush()

		reader.close()
		writer.close()

		print('Processed %d communications.' % doc_count)
		print('Processed %d entities.' % entity_count)
		print('Assigned %d canonical names.' % named_count)
	def run(self):
		"""Split one protocol buffer file into shards, or only count its records.

		Command line: ``--output_shard_prefix=shard_prefix input_shard`` plus
		exactly one of ``--max_shard_size``, ``--max_records_per_shard`` or
		``--count_records_only``.

		In splitting mode records are written to shard files named by
		``self.createShardFilename(prefix, index)`` with ``index`` starting at
		1.  A shard is closed once its byte count (record length plus 4 per
		record) or its record count reaches the configured limit.  The next
		shard is opened only when another record arrives, so no empty trailing
		shard is produced when the input ends exactly on a shard boundary.
		The first shard is still opened up front, so an empty input yields one
		empty shard.

		Prints a message and exits when the positional argument is missing,
		when the number of mode options given is not exactly one, or when
		``output_shard_prefix`` is missing in splitting mode.
		"""
		# Specify options.
		options = [
			('max_shard_size=', 'The maximum size of each shard in bytes.'),
			('max_records_per_shard=', 'The maximum number of records in each shard.'),
			('count_records_only', 'Counts the number of protobufs in the file and exits.'),
			('output_shard_prefix=', 'REQUIRED (unless count_records_only): Creates shards starting with this file prefix.'),
		]
		options_hash, remainder = parseCommandLine(options)

		if len(remainder) != 1:
			command_line = '%s --output_shard_prefix=shard_prefix input_shard'
			print(usage(sys.argv, command_line, options))
			sys.exit()

		num_options_specified = 0

		max_shard_size = None
		max_records_per_shard = None
		count_records_only = None
		if 'max_shard_size' in options_hash:
			max_shard_size = int(options_hash['max_shard_size'])
			num_options_specified += 1
		if 'max_records_per_shard' in options_hash:
			max_records_per_shard = int(options_hash['max_records_per_shard'])
			num_options_specified += 1
			print('Using %d records per shard.' % (max_records_per_shard))
		if 'count_records_only' in options_hash:
			print('Only counting records.')
			count_records_only = True
			num_options_specified += 1

		if num_options_specified != 1:
			print('Only one of the following options must be specified:')
			print('\t max_shard_size, max_records_per_shard, count_records_only')
			sys.exit()

		if not count_records_only and 'output_shard_prefix' not in options_hash:
			print('output_shard_prefix is a required option.')
			sys.exit()

		output_shard_prefix = None
		if 'output_shard_prefix' in options_hash:
			output_shard_prefix = options_hash['output_shard_prefix']

		input_file = remainder[0]

		reader = ProtocolBufferFileReader(None, filename=input_file, return_byte_string_only=True)

		num_messages_written = 0  # records in the current shard
		bytes_written = 0  # bytes in the current shard
		total_num_messages = 0
		num_files_written = 0  # shards opened so far

		writer = None
		try:
			if not count_records_only:
				# The first shard is opened eagerly so an empty input still
				# produces one (empty) shard.
				num_files_written += 1
				writer = ProtocolBufferFileWriter(filename=self.createShardFilename(output_shard_prefix, num_files_written), messages_are_byte_strings=True)

			for message in reader:
				total_num_messages += 1
				if count_records_only:
					continue

				if writer is None:
					# The previous shard filled up; start the next one only
					# now that there is a record to put in it.
					num_files_written += 1
					writer = ProtocolBufferFileWriter(filename=self.createShardFilename(output_shard_prefix, num_files_written), messages_are_byte_strings=True)

				writer.write(message)
				num_messages_written += 1
				bytes_written += len(message) + 4  # +4 for the message size prefix

				shard_is_full = (
					(max_shard_size is not None and bytes_written >= max_shard_size) or
					(max_records_per_shard is not None and num_messages_written >= max_records_per_shard)
				)
				if shard_is_full:
					writer.close()
					writer = None
					bytes_written = 0
					num_messages_written = 0
		finally:
			# Release both files even when reading or writing fails.
			try:
				if writer is not None:
					writer.close()
			finally:
				reader.close()

		print('Number of records in shard: %d' % total_num_messages)
		if num_files_written != 0:
			print('Number of files written: %d' % num_files_written)
Example #10
0
    def run(self):
        """Split one protocol buffer file into shards, or only count its records.

        Command line: ``--output_shard_prefix=shard_prefix input_shard`` plus
        exactly one of ``--max_shard_size``, ``--max_records_per_shard`` or
        ``--count_records_only``.

        In splitting mode records are written to shard files named by
        ``self.createShardFilename(prefix, index)`` with ``index`` starting at
        1.  A shard is closed once its byte count (record length plus 4 per
        record) or its record count reaches the configured limit.  The next
        shard is opened only when another record arrives, so no empty trailing
        shard is produced when the input ends exactly on a shard boundary.
        The first shard is still opened up front, so an empty input yields one
        empty shard.

        Prints a message and exits when the positional argument is missing,
        when the number of mode options given is not exactly one, or when
        ``output_shard_prefix`` is missing in splitting mode.
        """
        # Specify options.
        options = [
            ('max_shard_size=', 'The maximum size of each shard in bytes.'),
            ('max_records_per_shard=',
             'The maximum number of records in each shard.'),
            ('count_records_only',
             'Counts the number of protobufs in the file and exits.'),
            ('output_shard_prefix=',
             'REQUIRED (unless count_records_only): Creates shards starting with this file prefix.'
             ),
        ]
        options_hash, remainder = parseCommandLine(options)

        if len(remainder) != 1:
            command_line = '%s --output_shard_prefix=shard_prefix input_shard'
            print(usage(sys.argv, command_line, options))
            sys.exit()

        num_options_specified = 0

        max_shard_size = None
        max_records_per_shard = None
        count_records_only = None
        if 'max_shard_size' in options_hash:
            max_shard_size = int(options_hash['max_shard_size'])
            num_options_specified += 1
        if 'max_records_per_shard' in options_hash:
            max_records_per_shard = int(options_hash['max_records_per_shard'])
            num_options_specified += 1
            print('Using %d records per shard.' % (max_records_per_shard))
        if 'count_records_only' in options_hash:
            print('Only counting records.')
            count_records_only = True
            num_options_specified += 1

        if num_options_specified != 1:
            print('Only one of the following options must be specified:')
            print('\t max_shard_size, max_records_per_shard, count_records_only')
            sys.exit()

        if not count_records_only and 'output_shard_prefix' not in options_hash:
            print('output_shard_prefix is a required option.')
            sys.exit()

        output_shard_prefix = None
        if 'output_shard_prefix' in options_hash:
            output_shard_prefix = options_hash['output_shard_prefix']

        input_file = remainder[0]

        reader = ProtocolBufferFileReader(None,
                                          filename=input_file,
                                          return_byte_string_only=True)

        num_messages_written = 0  # records in the current shard
        bytes_written = 0  # bytes in the current shard
        total_num_messages = 0
        num_files_written = 0  # shards opened so far

        writer = None
        try:
            if not count_records_only:
                # The first shard is opened eagerly so an empty input still
                # produces one (empty) shard.
                num_files_written += 1
                writer = ProtocolBufferFileWriter(
                    filename=self.createShardFilename(output_shard_prefix,
                                                      num_files_written),
                    messages_are_byte_strings=True)

            for message in reader:
                total_num_messages += 1
                if count_records_only:
                    continue

                if writer is None:
                    # The previous shard filled up; start the next one only
                    # now that there is a record to put in it.
                    num_files_written += 1
                    writer = ProtocolBufferFileWriter(
                        filename=self.createShardFilename(output_shard_prefix,
                                                          num_files_written),
                        messages_are_byte_strings=True)

                writer.write(message)
                num_messages_written += 1
                bytes_written += len(message) + 4  # +4 for the message size prefix

                size_limit_hit = (max_shard_size is not None and
                                  bytes_written >= max_shard_size)
                count_limit_hit = (max_records_per_shard is not None and
                                   num_messages_written >= max_records_per_shard)
                if size_limit_hit or count_limit_hit:
                    writer.close()
                    writer = None
                    bytes_written = 0
                    num_messages_written = 0
        finally:
            # Release both files even when reading or writing fails.
            try:
                if writer is not None:
                    writer.close()
            finally:
                reader.close()

        print('Number of records in shard: %d' % total_num_messages)
        if num_files_written != 0:
            print('Number of files written: %d' % num_files_written)