def writeData(writer, count=1000):
    """Append `count` (LongWritable, LongWritable) pairs to a SequenceFile writer.

    Keys descend from `count` down to 1 while values ascend 0..count-1, so
    key order is the reverse of insertion order. Each append is echoed to
    stdout together with the writer's current byte offset.

    Args:
        writer: an open hadoop.io SequenceFile writer created with
            LongWritable key/value classes. NOTE(review): inferred from
            usage here — confirm against the caller.
        count: number of records to append. Defaults to 1000, the value
            that was hard-coded in the original implementation.
    """
    key = LongWritable()
    value = LongWritable()
    for i in range(count):
        key.set(count - i)
        value.set(i)
        # getLength() reports the writer's byte offset before this append.
        print('[%d] %s %s' % (writer.getLength(), key.toString(), value.toString()))
        writer.append(key, value)
def writeData(writer):
    """Append 10 (LongWritable, LongWritable) pairs to a SequenceFile writer.

    Keys run 1000 down to 991 while values run 0..9; each append is echoed
    to stdout with the writer's current byte offset.

    Ported from Python 2 to Python 3 (`xrange` -> `range`, print statement
    -> print function); the logic is unchanged. NOTE(review): this
    re-declares `writeData` already defined earlier in the file with a
    1000-record loop — whichever definition comes last wins.

    Args:
        writer: an open hadoop.io SequenceFile writer created with
            LongWritable key/value classes. NOTE(review): inferred from
            usage here — confirm against the caller.
    """
    key = LongWritable()
    value = LongWritable()
    for i in range(10):
        key.set(1000 - i)
        value.set(i)
        # getLength() reports the writer's byte offset before this append.
        print('[%d] %s %s' % (writer.getLength(), key.toString(), value.toString()))
        writer.append(key, value)
def write_text_data(writer):
    """Append 1000 (LongWritable, Text) pairs to a SequenceFile writer.

    Keys descend 1000..1 while values are the strings 'taro 0' .. 'taro 999';
    each append is echoed to stdout with the writer's current byte offset.

    Ported from Python 2 to Python 3 (`xrange` -> `range`, print statement
    -> print function); the logic and all emitted strings are unchanged.

    Args:
        writer: an open hadoop.io SequenceFile writer created with a
            LongWritable key class and Text value class. NOTE(review):
            inferred from usage here — confirm against the caller.
    """
    key = LongWritable()
    value = Text()
    for i in range(1000):
        key.set(1000 - i)
        value.set('taro {}'.format(i))
        # getLength() reports the writer's byte offset before this append.
        print('[%d] %s %s' % (writer.getLength(), key.toString(), value.toString()))
        writer.append(key, value)
#!/usr/bin/env python
"""Convert comma-delimited reddit post rows on stdin into a SequenceFile.

Column 0 of each row is the post id (stored as a LongWritable key) and
column 5 is the post text (stored as a Text value, 'N/A' when empty).
"""
import sys

from hadoop.io import LongWritable, SequenceFile, Text

# FIX(review): the original built the output name as 'reddit_posts.seq' % _id,
# which referenced _id before any row was read and had no format placeholder
# (a guaranteed NameError/TypeError). A plain literal filename is used instead.
# FIX(review): the value column is free text, so the value class must be Text —
# the original declared and used LongWritable, and LongWritable.set() cannot
# accept a string.
writer = SequenceFile.createWriter('reddit_posts.seq', LongWritable, Text)

# Read the input rows from stdin.
for line in sys.stdin:
    # Guard each row so an improperly formatted line does not blow up the
    # program (the original comment promised a try/except that was missing).
    try:
        # Remove leading/trailing whitespace; rows are comma delimited and
        # contain only the columns described in assignment 4 part 1.
        fields = line.strip().split(",")
        _id, _text = fields[0], fields[5] or 'N/A'
        key = LongWritable()
        key.set(int(_id))
        value = Text()
        value.set(_text)
        writer.append(key, value)
    except (IndexError, ValueError):
        # Skip rows with missing columns or a non-numeric id.
        continue

writer.close()
# NOTE(review): the original ended with a second, truncated copy of the
# shebang/import/stdin loop (a `for` with only a comment as its body — a
# syntax error); that dead duplicate has been removed.