Example #1
0
 def map(self, records, task):
     """
     Map stage: runs the incoming records through the pre-mapper chain.

     records is iterated as (key, value) pairs and task must provide
     collect(key, value).  With no pre-mappers and no secondary sort the
     records are collected unchanged; otherwise a FunctionChain is built on
     the first call, cached in self.mapFunctionChain and run over records.
     """
     # Nothing to apply: hand every record straight to the task.
     if len(self.preMappers) == 0 and not self.secondsort:
         for key, record in records:
             task.collect(key, record)
         return

     if self.mapFunctionChain is None:
         # NOTE(review): the functions defined below close over the `task`
         # of this first call and are reused afterwards; this presumes that
         # later calls receive the same task object -- confirm.

         # Head of the chain: optionally JSON-decode every incoming value.
         if self.inputJson:
             def mapperfn(_, records):
                 for key, value in records:
                     yield key, happy.json.decode(value)
         else:
             def mapperfn(_, records):
                 return records

         # Tail of the chain: decide how results are written to the task.
         writeRawText = self.reducetasks == 0 and not self.outputJson
         if writeRawText:
             # no reduce tasks and no json output: emit values untouched
             def collector(k, v):
                 task.collect(k, v)
         elif self.secondsort:
             # a single TextInt instance is reused for every emitted record
             textint = TextInt()
             def collector(k, v):
                 wellFormed = len(v) == 2 and isinstance(v[0], int)
                 if not wellFormed:
                     raise Exception("Invalid value " + str(v) + " for a secondary sort, (<int>, <obj>) tuple required")
                 # the output key carries both the string key and v[0]
                 textint.setString(k)
                 textint.setInt(v[0])
                 task.collect(textint, happy.json.encode(v))
         else:
             # json output
             def collector(k, v):
                 task.collect(k, happy.json.encode(v))

         stages = [mapperfn] + self.preMappers + [collector]
         self.mapFunctionChain = FunctionChain(stages, self._recordError)

     # do the work:
     self.mapFunctionChain.callChain(None, records)
Example #2
0
 def reduce(self, key, values, task):
     """
     Reduce stage: runs one key and its values through the reducer chain.

     Without a reducer every value is collected unchanged under key via
     task.collect(key, value).  Otherwise a FunctionChain made of the
     reducer, the post-mappers and an output collector is built on the
     first call, cached in self.reduceFunctionChain, and invoked with the
     key and the JSON-decoded values.
     """
     # No reducer configured: identity reduce.
     if self.reducer is None:
         for item in values:
             task.collect(key, item)
         return

     if self.reduceFunctionChain is None:
         # NOTE(review): collector closes over the `task` of this first
         # call and is reused for later keys; this presumes the same task
         # object is passed every time -- confirm.
         if self.outputJson:
             # json output
             def collector(k, v):
                 task.collect(k, happy.json.encode(v))
         else:
             # emitting raw text
             def collector(k, v):
                 task.collect(k, v)
         stages = [self.reducer] + self.postMappers + [collector]
         self.reduceFunctionChain = FunctionChain(stages, self._recordError)

     # With a secondary sort only the string part of the key is passed on.
     if self.secondsort:
         key = key.getString()

     # do the work:
     decodedValues = self._jsonReduceIterator(values)
     self.reduceFunctionChain.callChain(key, decodedValues)
Example #3
0
class PipeJob(happy.HappyJob):
    """
    The job that executes a series of pipes.

    The job is configured from a spec object.  Its map stage runs the
    records through the spec's pre-mapper functions and its reduce stage
    runs them through the reducer followed by the post-mapper functions.
    Both function chains are built on first use and cached on the instance.
    """
    def __init__(self, spec):
        """
        Copies the job configuration from spec.

        The attributes read from spec are: id, inputpaths, inputformat,
        inputJson, outputpath, outputformat, compressoutput,
        compressiontype, jobargs, outputJson, preMappers, reducer,
        postMappers, workpath and secondsort.
        """
        happy.HappyJob.__init__(self)
        self.id = spec.id
        self.inputpaths = spec.inputpaths
        self.inputformat = spec.inputformat
        self.inputJson = spec.inputJson
        self.outputpath = spec.outputpath
        self.outputformat = spec.outputformat
        self.compressoutput = spec.compressoutput
        # only override the compression type when the spec provides one
        if spec.compressiontype is not None:
            self.compressiontype = spec.compressiontype
        self.jobargs = spec.jobargs
        self.outputJson = spec.outputJson
        # the function lists are copied so the spec's lists are not shared
        self.preMappers = spec.preMappers[:]
        self.reducer = spec.reducer
        # without a reducer the job is configured with zero reduce tasks
        if self.reducer is None:
            self.reducetasks = 0
        self.postMappers = spec.postMappers[:]
        self.errorpath = spec.workpath + "/errors"
        # error collectors, keyed by operation (filled in by _recordError)
        self.errorcollectors = {}
        # build a job name:
        prenames = [f.__name__ for f in self.preMappers]
        if self.reducer is not None:
            reducername = [self.reducer.__name__]
        else:
            reducername = []
        postnames = [f.__name__ for f in self.postMappers]
        self.jobname = _scriptname + " " + str(spec.id) + " " + "-".join(prenames + reducername + postnames)
        # config second sort:
        self.secondsort = spec.secondsort
        if self.secondsort:
            # Work on a private copy: writing these settings straight into
            # spec.jobargs would leak them back into the spec and into
            # anything else that shares that dictionary.
            self.jobargs = dict(self.jobargs)
            self.jobargs["mapred.output.value.groupfn.class"] = "com.freebase.happy.util.TextInt$TextComparator"
            self.jobargs["mapred.partitioner.class"] = "com.freebase.happy.util.TextInt$TextPartitioner"
            self.mapoutputkey = "com.freebase.happy.util.TextInt"
        # init function chains:
        self.mapFunctionChain = None
        self.reduceFunctionChain = None

    def mapconfig(self):
        """
        Marks the current stage as "map"; _recordError uses self.jobstage
        when naming the error output.
        """
        self.jobstage = "map"

    def map(self, records, task):
        """
        Map stage: runs the incoming records through the pre-mapper chain.

        records is iterated as (key, value) pairs and task must provide
        collect(key, value).  With no pre-mappers and no secondary sort the
        records are collected unchanged; otherwise a FunctionChain is built
        on the first call, cached in self.mapFunctionChain and run over
        records.
        """
        if len(self.preMappers) == 0 and not self.secondsort:
            # nothing to apply: hand every record straight to the task
            for key, record in records:
                task.collect(key, record)
        else:
            if self.mapFunctionChain is None:
                # NOTE(review): the functions defined below close over the
                # `task` of this first call and are reused afterwards; this
                # presumes later calls receive the same task -- confirm.

                # set up mapper input fn:
                if self.inputJson:
                    def mapperfn(_, records):
                        for key, value in records:
                            yield key, happy.json.decode(value)
                else:
                    def mapperfn(_, records):
                        return records
                # emitting raw text:
                if self.reducetasks == 0 and not self.outputJson:
                    def collector(k, v):
                        task.collect(k, v)
                # secondary sort:
                elif self.secondsort:
                    # a single TextInt instance is reused for every record
                    textint = TextInt()
                    def collector(k, v):
                        if len(v) != 2 or not isinstance(v[0], int):
                            raise Exception("Invalid value " + str(v) + " for a secondary sort, (<int>, <obj>) tuple required")
                        # the output key carries the string key and v[0]
                        textint.setString(k)
                        textint.setInt(v[0])
                        task.collect(textint, happy.json.encode(v))
                # json output:
                else:
                    def collector(k, v):
                        task.collect(k, happy.json.encode(v))
                self.mapFunctionChain = FunctionChain([mapperfn] + self.preMappers + [collector], self._recordError)
            # do the work:
            self.mapFunctionChain.callChain(None, records)

    def reduceconfig(self):
        """
        Marks the current stage as "reduce"; _recordError uses self.jobstage
        when naming the error output.
        """
        self.jobstage = "reduce"

    def reduce(self, key, values, task):
        """
        Reduce stage: runs one key and its values through the reducer chain.

        Without a reducer every value is collected unchanged under key.
        Otherwise a FunctionChain made of the reducer, the post-mappers and
        an output collector is built on the first call, cached in
        self.reduceFunctionChain, and invoked with the key and the
        JSON-decoded values.
        """
        if self.reducer is None:
            # no reducer configured: identity reduce
            for value in values:
                task.collect(key, value)
        else:
            if self.reduceFunctionChain is None:
                # NOTE(review): collector closes over the `task` of this
                # first call and is reused for later keys; this presumes the
                # same task object is passed every time -- confirm.

                # emitting raw text:
                if not self.outputJson:
                    def collector(k, v):
                        task.collect(k, v)
                # json output:
                else:
                    def collector(k, v):
                        task.collect(k, happy.json.encode(v))
                self.reduceFunctionChain = FunctionChain([self.reducer] + self.postMappers + [collector], self._recordError)
            # second sort key: only the string part of the key is passed on
            if self.secondsort:
                key = key.getString()
            # do the work:
            self.reduceFunctionChain.callChain(key, self._jsonReduceIterator(values))

    def _recordError(self, key, value, message, operation):
        """
        Records an error to the collector kept for operation in
        self.errorcollectors and increments the "happy.cloud.dataerrors"
        entry in happy.results.

        The collector for an operation is created on its first error; its
        path is built from self.errorpath, the job id, self.jobstage (set by
        mapconfig() / reduceconfig()) and the operation name.
        """
        errorcollector = self.errorcollectors.get(operation)
        if errorcollector is None:
            # first error for this operation: create and cache its collector
            errorcollector = happy.dfs.createPartitionedCollector(
                self.errorpath + "/job-" + str(self.id) + "-" + self.jobstage + "-" + operation, type="text")
            self.errorcollectors[operation] = errorcollector
        errorcollector.collect(key, happy.json.encode({"key":key, "value":value, "operation": operation, "error": message}))
        # bump the error counter, starting at 1 when it is not set yet
        currentErrors = happy.results.get("happy.cloud.dataerrors")
        if currentErrors is None:
            currentErrors = 1
        else:
            currentErrors += 1
        happy.results["happy.cloud.dataerrors"] = currentErrors

    def _jsonReduceIterator(self, records):
        """
        Yields each record of records after decoding it with
        happy.json.decode; records are decoded one at a time as consumed.
        """
        for encodedRecord in records:
            yield happy.json.decode(encodedRecord)

    def run(self):
        """
        Deletes self.outputpath and then runs the job via HappyJob.run,
        returning its result.
        """
        happy.dfs.delete(self.outputpath)
        return happy.HappyJob.run(self)