Beispiel #1
0
    def main(self):
        """Copy the last ``n`` messages of an input shard to an output shard.

        Command line: ``--n=number_to_print input_shard output_shard``.  When
        the number of positional arguments is not exactly two, the usage text
        is printed and the process exits via ``sys.exit()``.

        The input is visited twice: once by ``countProtosInFile`` (assumed to
        return the number of messages in the file -- confirm against its
        definition) and once to copy the trailing messages.  If ``n`` is at
        least that count, every message is copied.  The reader and writer are
        closed even when reading or writing raises.
        """
        # Specify options.
        options = [("n=", "The number of protocol buffers to save.", True)]
        command_line = "%s --n=number_to_print input_shard output_shard"
        options_hash, remainder = parseCommandLine(options, command_line=command_line)

        if len(remainder) != 2:
            print(usage(sys.argv, command_line, options))
            sys.exit()

        input_shard, output_shard = remainder

        # Parse the count first so a malformed --n fails before the reader or
        # writer is constructed.
        num_messages_to_print = int(options_hash["n"])

        # Index of the first message to keep; clamped at zero so that an
        # oversized n simply copies the whole input.
        num_in_file = countProtosInFile(input_shard)
        first_message_to_print = max(num_in_file - num_messages_to_print, 0)

        reader = ProtocolBufferFileReader(None, filename=input_shard, return_byte_string_only=True)
        try:
            writer = ProtocolBufferFileWriter(filename=output_shard, messages_are_byte_strings=True)
            try:
                for index, message in enumerate(reader):
                    if index >= first_message_to_print:
                        writer.write(message)
            finally:
                writer.close()
        finally:
            reader.close()
Beispiel #2
0
	def main(self):
		"""Copy the first ``n`` messages of an input shard to an output shard.

		Command line: ``--n=number_to_print input_shard output_shard``.  When
		the number of positional arguments is not exactly two, the usage text
		is printed and the process exits via ``sys.exit()``.

		A non-positive ``n`` copies nothing; an ``n`` larger than the input
		copies all of it.  The reader and writer are closed even when reading
		or writing raises.
		"""
		# Function-scope import keeps this method self-contained.
		from itertools import islice

		# Specify options.
		options = [
			('n=', 'The number of protocol buffers to save.', True),
		]
		command_line = '%s --n=number_to_print input_shard output_shard'
		options_hash, remainder = parseCommandLine(options, command_line=command_line)

		if len(remainder) != 2:
			print(usage(sys.argv, command_line, options))
			sys.exit()

		input_shard, output_shard = remainder

		# Parse the count first so a malformed --n fails before the reader or
		# writer is constructed.  islice rejects a negative stop, so clamp at 0.
		num_messages_to_print = max(int(options_hash['n']), 0)

		reader = ProtocolBufferFileReader(None, filename=input_shard, return_byte_string_only=True)
		try:
			writer = ProtocolBufferFileWriter(filename=output_shard, messages_are_byte_strings=True)
			try:
				# islice stops after n items without pulling an extra message
				# from the reader just to discover the limit was reached.
				for message in islice(reader, num_messages_to_print):
					writer.write(message)
			finally:
				writer.close()
		finally:
			reader.close()
Beispiel #3
0
    def main(self):
        """Copy the first ``n`` messages of an input shard to an output shard.

        Command line: ``--n=number_to_print input_shard output_shard``.  When
        the number of positional arguments is not exactly two, the usage text
        is printed and the process exits via ``sys.exit()``.

        A non-positive ``n`` copies nothing; an ``n`` larger than the input
        copies all of it.  The reader and writer are closed even when reading
        or writing raises.
        """
        # Function-scope import keeps this method self-contained.
        from itertools import islice

        # Specify options.
        options = [
            ('n=', 'The number of protocol buffers to save.', True),
        ]
        command_line = '%s --n=number_to_print input_shard output_shard'
        options_hash, remainder = parseCommandLine(options,
                                                   command_line=command_line)

        if len(remainder) != 2:
            print(usage(sys.argv, command_line, options))
            sys.exit()

        input_shard, output_shard = remainder

        # Parse the count first so a malformed --n fails before the reader or
        # writer is constructed.  islice rejects a negative stop, so clamp at 0.
        num_messages_to_print = max(int(options_hash['n']), 0)

        reader = ProtocolBufferFileReader(None,
                                          filename=input_shard,
                                          return_byte_string_only=True)
        try:
            writer = ProtocolBufferFileWriter(filename=output_shard,
                                              messages_are_byte_strings=True)
            try:
                # islice stops after n items without pulling an extra message
                # from the reader just to discover the limit was reached.
                for message in islice(reader, num_messages_to_print):
                    writer.write(message)
            finally:
                writer.close()
        finally:
            reader.close()
	def run(self):
		"""Write a random subset of an input file's records to an output file.

		Command line: ``--num_records=n input_shard output_shard``.  When the
		number of positional arguments is not exactly two, the usage text is
		printed and the process exits via ``sys.exit()``.

		Record positions are drawn without replacement by ``random.sample``
		and the chosen records are written in their original input order.
		A request for more records than the file holds is reduced to the
		whole file instead of failing inside ``random.sample``; a negative
		request still raises ``ValueError`` there.  ``countProtosInFile`` is
		assumed to return the number of records -- confirm against its
		definition.
		"""
		# Specify options.
		options = [
			('num_records=', 'Number of records to select.', True),
		]
		command_line = '%s --num_records=n input_shard output_shard'
		options_hash, remainder = parseCommandLine(options, command_line=command_line)

		if len(remainder) != 2:
			print(usage(sys.argv, command_line, options))
			sys.exit()

		num_records = int(options_hash['num_records'])
		input_file, output_file = remainder

		total_records = countProtosInFile(input_file)

		# random.sample raises ValueError when asked for more items than the
		# population holds, so cap the request at the file size.
		if num_records > total_records:
			num_records = total_records

		print('Selecting %d records from %d total records.' % (num_records, total_records))
		random.seed()
		# Randomly select the positions of the records to keep.
		records_to_use = set(random.sample(range(total_records), num_records))

		reader = ProtocolBufferFileReader(None, filename=input_file, return_byte_string_only=True)
		try:
			writer = ProtocolBufferFileWriter(filename=output_file, messages_are_byte_strings=True)
			try:
				for position, message in enumerate(reader):
					if position in records_to_use:
						writer.write(message)
			finally:
				writer.close()
		finally:
			# Runs even if the writer could not be created or a write failed.
			reader.close()
	def randomize(self, input_file, output_file):
		"""Write the messages of ``input_file`` to ``output_file`` in shuffled order.

		Every message is read into memory before any is written, so the
		whole input must fit in memory.  The order is produced by
		``random.shuffle``.

		Args:
			input_file: filename passed to ``ProtocolBufferFileReader``.
			output_file: filename passed to ``ProtocolBufferFileWriter``.
		"""
		reader = ProtocolBufferFileReader(None, filename=input_file, return_byte_string_only=True)
		try:
			# Named ``messages`` rather than ``buffer`` to avoid shadowing the
			# Python 2 builtin of that name.
			messages = list(reader)
		finally:
			reader.close()

		random.shuffle(messages)

		# The writer is only created once the input has been fully read.
		writer = ProtocolBufferFileWriter(filename=output_file, messages_are_byte_strings=True)
		try:
			for message in messages:
				writer.write(message)
		finally:
			writer.close()
    def randomize(self, input_file, output_file):
        """Write the messages of ``input_file`` to ``output_file`` in shuffled order.

        Every message is read into memory before any is written, so the
        whole input must fit in memory.  The order is produced by
        ``random.shuffle``.

        Args:
            input_file: filename passed to ``ProtocolBufferFileReader``.
            output_file: filename passed to ``ProtocolBufferFileWriter``.
        """
        reader = ProtocolBufferFileReader(None,
                                          filename=input_file,
                                          return_byte_string_only=True)
        try:
            # Named ``messages`` rather than ``buffer`` to avoid shadowing the
            # Python 2 builtin of that name.
            messages = list(reader)
        finally:
            reader.close()

        random.shuffle(messages)

        # The writer is only created once the input has been fully read.
        writer = ProtocolBufferFileWriter(filename=output_file,
                                          messages_are_byte_strings=True)
        try:
            for message in messages:
                writer.write(message)
        finally:
            writer.close()
    def run(self):
        """Write a random subset of an input file's records to an output file.

        Command line: ``--num_records=n input_shard output_shard``.  When the
        number of positional arguments is not exactly two, the usage text is
        printed and the process exits via ``sys.exit()``.

        Record positions are drawn without replacement by ``random.sample``
        and the chosen records are written in their original input order.
        A request for more records than the file holds is reduced to the
        whole file instead of failing inside ``random.sample``; a negative
        request still raises ``ValueError`` there.  ``countProtosInFile`` is
        assumed to return the number of records -- confirm against its
        definition.
        """
        # Specify options.
        options = [
            ('num_records=', 'Number of records to select.', True),
        ]
        command_line = '%s --num_records=n input_shard output_shard'
        options_hash, remainder = parseCommandLine(options,
                                                   command_line=command_line)

        if len(remainder) != 2:
            print(usage(sys.argv, command_line, options))
            sys.exit()

        num_records = int(options_hash['num_records'])
        input_file, output_file = remainder

        total_records = countProtosInFile(input_file)

        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the request at the file size.
        if num_records > total_records:
            num_records = total_records

        print('Selecting %d records from %d total records.' % (num_records,
                                                               total_records))
        random.seed()
        # Randomly select the positions of the records to keep.
        records_to_use = set(random.sample(range(total_records), num_records))

        reader = ProtocolBufferFileReader(None,
                                          filename=input_file,
                                          return_byte_string_only=True)
        try:
            writer = ProtocolBufferFileWriter(filename=output_file,
                                              messages_are_byte_strings=True)
            try:
                for position, message in enumerate(reader):
                    if position in records_to_use:
                        writer.write(message)
            finally:
                writer.close()
        finally:
            # Runs even if the writer could not be created or a write failed.
            reader.close()
Beispiel #8
0
	def processFile(self, input, output):
		"""Process every message in ``input`` and copy the messages to ``output``.

		Each message is handed to ``self.processCommunication``, which returns
		a pair of counts, and is then written to the output.  The message is
		written after processing, so any in-place changes made by
		``processCommunication`` would be saved -- presumably that is the
		intent; confirm against that method.  A running document count is
		echoed to stdout every 100 messages, and three summary lines are
		printed at the end.

		Args:
			input: filename passed to ``ProtocolBufferFileReader``.
			output: filename passed to ``ProtocolBufferFileWriter``.
		"""
		num_docs = 0
		num_entities = 0
		num_entities_set = 0

		reader = ProtocolBufferFileReader(Communication, filename=input)
		try:
			writer = ProtocolBufferFileWriter(filename=output)
			try:
				for msg in reader:
					new_num_entities, new_num_entities_set = self.processCommunication(msg)
					num_entities += new_num_entities
					num_entities_set += new_num_entities_set
					num_docs += 1

					writer.write(msg)

					if num_docs % 100 == 0:
						# The carriage return keeps the counter on one console line.
						sys.stdout.write(str(num_docs) + '\r')
						sys.stdout.flush()
			finally:
				writer.close()
		finally:
			# Runs even if processing or writing raised part-way through.
			reader.close()

		print('Processed %d communications.' % num_docs)
		print('Processed %d entities.' % num_entities)
		print('Assigned %d canonical names.' % num_entities_set)