class StdIOInput(AbstractInput):
    """Input that reads from a stream object, ``sys.stdin`` unless told otherwise."""

    # Class-level placeholder; __init__ always replaces it on the instance.
    io = None

    def __init__(self, io=sys.stdin):
        """Remember the stream to read from and attach a PassingFormatter.

        :param io: stream handed to the formatter by ``read``. The default
            is the ``sys.stdin`` object that exists when this class is
            defined.
        """
        self.io = io
        self.formatter = PassingFormatter()

    def read(self):
        """Return the result of passing the stream through the formatter."""
        stream = self.io
        formatter = self.formatter
        return formatter.format(stream)
class MemInput(AbstractInput):
    """Input backed by an in-memory list of rows."""

    # Class-level placeholder; __init__ always sets a per-instance list.
    data = None

    def __init__(self, data=None):
        """Wrap ``data`` and attach a PassingFormatter.

        :param data: list of rows, stored by reference (not copied). When
            omitted, a fresh empty list is created for this instance. A
            literal ``[]`` default would be shared by every instance built
            without arguments, so rows appended by one ``load_from_file``
            call would show up in the next.
        """
        self.data = [] if data is None else data
        self.formatter = PassingFormatter()

    @staticmethod
    def load_from_file(filename):
        """Return a new MemInput holding every line of ``filename``.

        Lines are stored exactly as the file iterator yields them. The
        elapsed load time is printed to standard output.
        """
        start_time = datetime.datetime.now()
        itself = MemInput()
        # ``with`` guarantees the handle is closed even if reading fails.
        with open(filename) as f:
            itself.data.extend(f)
        # A single parenthesised argument prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        print("Load file %s in %s"
              % (filename, datetime.datetime.now() - start_time))
        return itself

    def to_file(self):
        """Write the rows to a new file under /var/tmp and return its path.

        Each row is serialised with ``json.dumps``; the serialised rows are
        written back to back.
        """
        # NOTE(review): no separator is written between rows, so the file
        # cannot be split back into rows line by line - confirm intended.
        filename = '/var/tmp/%s' % str(uuid.uuid1())
        with open(filename, mode='w') as f:
            # Explicit loop: the writes are side effects, and a lazy map()
            # would never perform them.
            for item in self.data:
                f.write(json.dumps(item))
        return filename

    def read(self):
        """Return the formatter's view of an iterator over the rows."""
        return self.formatter.format(iter(self.data))

    def sample(self, size=100):
        """Return the first ``size`` rows (fewer if not that many exist).

        The slice end is ``size`` itself: slicing to ``size - 1`` drops one
        row, and for ``size=0`` would return everything but the last row.
        """
        return self.data[:size]

    def close(self):
        """Do nothing; there is no underlying resource to release."""
        pass

    def get_estimated_size(self):
        """Return a ``(row_count, input)`` tuple.

        ``row_count`` is ``len(self.data)``. ``input`` is a new MemInput
        over the first 999 rows when there are more than 1000 rows, and
        this instance itself otherwise.
        """
        total = len(self.data)
        if total > 1000:
            subset = MemInput(self.data[:999])
        else:
            subset = self
        return total, subset

    def count(self, engine=None, debug=False, options=None):
        """Return the number of rows held in memory.

        ``engine``, ``debug`` and ``options`` are accepted but not used.
        """
        return len(self.data)
class FileInput(AbstractInput):
    """Input backed by a file on disk, identified by its path."""

    # Handle opened by read(); stays None until read() is called.
    file = None
    # Path of the backing file; set in __init__.
    filename = None

    def __init__(self, filename):
        """Remember the path of the backing file and attach a PassingFormatter."""
        self.filename = filename
        self.formatter = PassingFormatter()

    def to_file(self):
        """Return the path of the backing file."""
        return self.filename

    def read(self):
        """Open the file and return the formatter's view of the open handle.

        The handle is kept in ``self.file`` so that ``close`` can release it.
        """
        self.file = open(self.filename)
        return self.formatter.format(self.file)

    def close(self):
        """Close the handle opened by ``read``, if there is one."""
        if self.file is not None:
            self.file.close()

    def as_output(self):
        """Return a FileOutput constructed with this input's path."""
        return FileOutput(self.filename)

    def get_estimated_size(self, sample_size=1000):
        """Estimate the number of lines in the file.

        Reads up to ``sample_size`` lines, then scales the number of lines
        actually read by the ratio of the file size (``os.stat``) to the
        total length of those lines.

        :param sample_size: maximum number of lines to read for the estimate.
        :return: estimated line count as an int; 0 when no content was read.
        """
        file_size = os.stat(self.filename).st_size
        count = 0
        line_size = 0
        # ``with`` closes the handle even if iteration raises.
        with open(self.filename) as f:
            for line in f:
                count += 1
                line_size += len(line)
                if count >= sample_size:
                    break
        if line_size == 0:
            # Empty file: nothing to scale, and dividing would raise.
            return 0
        # Scale by the lines actually read, not by sample_size; otherwise a
        # file shorter than the sample would always report ~sample_size.
        return int(count * float(file_size) / float(line_size))

    def sample(self, size=100):
        """Return the formatter's view of the first ``size`` lines.

        Uses its own short-lived handle, so a handle opened by ``read`` and
        stored in ``self.file`` is left untouched.
        """
        rows = []
        with open(self.filename) as f:
            for index, row in enumerate(f):
                # Stop before appending, so exactly ``size`` rows are kept.
                if index >= size:
                    break
                rows.append(row)
        return self.formatter.format(rows)

    def count(self, engine=None, debug=False, options=None):
        """Run a ``Count`` job over this input and return its result.

        The job writes to a MemOutput; the value returned is
        ``out.data[0][1][0]``.
        """
        if options is None:
            # Fresh dict per call instead of one shared mutable default.
            options = {}
        out = MemOutput()
        Count().run(self, out, engine, debug, options)
        return out.data[0][1][0]

    def compute(self, mapred, engine=None, debug=False, options=None):
        """Run ``mapred`` over this input and return the collected data.

        The job writes to a MemOutput whose ``data`` attribute is returned.
        """
        if options is None:
            options = {}
        out = MemOutput()
        mapred.run(self, out, engine, debug, options)
        return out.data

    def map_reduce(self, mapred, output, engine=None, debug=False,
                   options=None):
        """Run ``mapred`` over this input, writing into ``output``."""
        if options is None:
            options = {}
        mapred.run(self, output, engine, debug, options)

    def filter(self, filter_function, output=None, engine=None, debug=False,
               options=None):
        """Run a ``Filter`` job with ``filter_function`` and return the result.

        :param filter_function: passed to the job via ``set_function``.
        :param output: destination for the job. When omitted, the output
            from ``as_output`` is used with its filename replaced by a new
            path under /var/tmp, so the job does not write to this input's
            own file.
        :return: ``output.as_input()``.
        """
        # Local import: works whether the module imported ``uuid`` itself
        # or only ``uuid1`` from it.
        import uuid

        if options is None:
            options = {}
        if output is None:
            output = self.as_output()
            output.filename = '/var/tmp/%s' % uuid.uuid1()
        mapred = Filter()
        mapred.set_function(filter_function)
        mapred.run(self, output, engine, debug, options)
        return output.as_input()