def run(self, force=False, workingDir=None):
    """
    Runs the entire job chain (i.e. DAG) that contains this node.
    """
    logger.debug("Calling HappyJobNode.run(), workingDir=%s" % workingDir)
    self.linkNodes(workingDir)
    if force:
        self.deleteOutFiles(onlytmp=False)
    # stack = self.sources()
    stack = self.sort()
    logger.info("Stack order is: %s" % ", ".join([str(x._id) for x in stack]))
    # Only nodes whose parents have completed (or been skipped) may fire.
    ok_children = self.sources()
    while stack:
        node = stack.pop(0)
        putChildren = False
        if node not in ok_children:
            logger.warn("Branch terminated: node %s not in ok_children list %s." % (node, ok_children))
            continue
        pre = node.precheck()
        if node.force:
            # A forced node always re-runs: delete its old output first.
            logger.info("FORCING %s [%s --> %s] (delete %s first)" % (node, node.inputpaths, node.outputpath, node.outputpath))
            dfs.delete(node.outputpath)
            node.fire()
        elif pre == 'ready':
            logger.info("Running %s [%s --> %s]" % (node, node.inputpaths, node.outputpath))
            node.fire()
        else:
            # Output already exists; skip the job but still release its children.
            logger.info("Skipping job %s: already done" % node)
            putChildren = True
            self.status = 'skip'
        post = node.postcheck()
        if post == 'done':
            logger.info("Job %s completed successfully." % node)
            putChildren = True
        elif post == 'fail':
            logger.info("Job %s failed. Not adding children." % node)
        if putChildren:
            if node.isSink():
                logger.info("Job %s is a sink, no children." % node)
            else:
                newChildren = [child for child in node.children() if child not in ok_children]
                logger.info("Placing children %s of job %s on stack." % (newChildren, node))
                ok_children.extend(newChildren)
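# Usage sketch (illustrative, not from the original source): given a chain of
# HappyJobNode instances wired together elsewhere, calling run() on any node
# walks the whole DAG in topological order, firing only jobs whose precheck()
# reports 'ready'. The job classes and the addChild() wiring helper below are
# hypothetical placeholders.
#
#   extract = HappyJobNode(job=ExtractJob())    # source node, no parents
#   score = HappyJobNode(job=ScoreJob())        # consumes extract's output
#   extract.addChild(score)                     # assumed wiring call
#   extract.run(force=True, workingDir="/tmp/flow")  # force deletes old output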
def _auto_name(self, job):
    """
    Generates a unique name for this node, if one was not provided.
    """
    root = job.__class__.__name__
    nodes = list(self.lastNode.nodes())
    matches = [node.name for node in nodes if node.name.startswith(root)]
    logger.debug("Node names: %s" % nodes)
    if len(matches) == 0:
        return root + '_1'
    try:
        # Take the numeric suffix of each existing "<ClassName>_<n>" name
        # and use one past the highest as the next suffix.
        iter_str = [name.split('_')[-1] for name in matches]
        logger.debug("Node iter_str: %s" % iter_str)
        iters = [int(i) for i in iter_str]
        logger.debug("Node iters: %s" % iters)
        max_iter = max(iters) + 1
        logger.debug("max_iter: %s" % max_iter)
        return root + '_' + str(max_iter)
    except ValueError:
        logger.warn("Could not determine iteration: %s" % matches)
        return root + '_1'
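# Naming sketch (illustrative): _auto_name() yields "<ClassName>_<n>", where n
# is one past the highest numeric suffix already in use for that class, and
# the first instance gets "_1". WordCountJob below is a hypothetical class
# name; the arithmetic mirrors the try-block above.
#
#   matches = ["WordCountJob_1", "WordCountJob_2"]
#   iters = [int(name.split('_')[-1]) for name in matches]
#   assert max(iters) + 1 == 3    # next name: WordCountJob_3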
def smap(self, records, task):
    """
    First, we pivot both files on their join keys, and set the joinorder
    for the secondary sort such that all joinkey1 records come first.
    """
    inpath = "/" + task.getInputPath() + "/"
    logger.warn("INPATH: %s" % inpath)
    for key, json in records:
        record = happy.json.decode(json)
        if inpath.find(self.file1) >= 0 and self.key1 in record:
            newrec = self._modkeys(record, self.keymod1)
            record['__infile__'] = task.getInputPath()
            newrec['__joinorder__'] = 1
            k1 = record[self.key1]
            # Iterable join keys are flattened into a single string key.
            if happy.flow.isIterable(k1):
                k1 = ':|:'.join(k1)
            task.collect(k1, 1, happy.json.encode(newrec))
        if inpath.find(self.file2) >= 0 and self.key2 in record:
            newrec = self._modkeys(record, self.keymod2)
            newrec['__joinorder__'] = 2
            k2 = record[self.key2]
            if happy.flow.isIterable(k2):
                k2 = ':|:'.join(k2)
            task.collect(k2, 2, happy.json.encode(newrec))
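# Join sketch (illustrative): a record {"id": "a1", "x": 1} read from file1
# with key1 == "id" is emitted under key "a1" with sort order 1 and tagged
# __joinorder__ = 1; the matching file2 record is emitted under the same key
# with sort order 2, so the secondary sort hands the reducer all file1 records
# for a key before any file2 records. Iterable join keys collapse to a single
# string, e.g.:
#
#   assert ':|:'.join(["a", "b"]) == 'a:|:b'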