def testTopoSort(self):
    """ Topological sort of the DAG """
    logger.info("In TestFlow.testTopoSort ...")
    logger.debug("Setting up DAG ...")
    a = HappyJobNode()
    b = HappyJobNode()
    c = HappyJobNode()
    d = HappyJobNode()
    e = HappyJobNode()
    f = HappyJobNode()
    g = HappyJobNode()
    h = HappyJobNode()
    i = HappyJobNode()
    j = HappyJobNode()
    a.addChild(b)
    a.addChild(c)
    b.addChild(d)
    b.addChild(e)
    c.addChild(f)
    c.addChild(g)
    d.addChild(h)
    g.addChild(i)
    i.addChild(j)
    e.addChild(j)
    b.addChild(i)
    a.addChild(j)
    logger.debug("Testing topological sort ...")
    sort = a.sort()
    self.assertEqual(sort, [a, b, c, d, e, f, g, h, i, j])
    logger.info("DONE.")
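def _kahnSort(self, nodes):
    """ Reference-only sketch (not a test): one standard way to produce a
    topological order like the one asserted above is Kahn's algorithm.
    This is an illustrative assumption written against the same
    parents()/children() API -- not necessarily how HappyJobNode.sort()
    is actually implemented. """
    # Count unprocessed parents for every node.
    indegree = dict((n, len(n.parents())) for n in nodes)
    # Start from the nodes with no parents (the sources).
    queue = [n for n in nodes if indegree[n] == 0]
    order = []
    while queue:
        n = queue.pop(0)
        order.append(n)
        # A child becomes ready once all of its parents are ordered.
        for child in n.children():
            indegree[child] -= 1
            if indegree[child] == 0:
                queue.append(child)
    return order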
def testDictize(self):
    logger.info("In TestFlow.testDictize ...")
    p1 = HappyJobNode(name="P1", inputpaths=['inP1'], outputpath='outP1',
                      status='statusP1', job='NullJob()')
    c1 = HappyJobNode(name="C1", inputpaths=['inC1'], outputpath='outC1',
                      status='statusC1', job='NullJob()')
    c2 = HappyJobNode(name="C2", inputpaths=['inC2'], outputpath='outC2',
                      status='statusC2', job='NullJob()')
    p1.addChild(c1)
    p1.addChild(c2)
    # Round-trip the DAG through its dict representation.
    data = p1.dictize()
    d1 = HappyJobNode.dedictize(data)
    self.assertEqual(p1.name, d1.name)
    self.assertEqual(p1.inputpaths, d1.inputpaths)
    self.assertEqual(p1.outputpath, d1.outputpath)
    self.assertEqual(p1.job.__class__, d1.job.__class__)
    for n in (0, 1):
        self.assertEqual(p1.children()[n].name, d1.children()[n].name)
        self.assertEqual(p1.children()[n].inputpaths, d1.children()[n].inputpaths)
        self.assertEqual(p1.children()[n].outputpath, d1.children()[n].outputpath)
        self.assertEqual(p1.children()[n].job.__class__, d1.children()[n].job.__class__)
    logger.info("DONE.")
def _testJoin(self):
    logger.info("In TestFlow.testJoin() ...")
    names = HappyJobNode(name='get_names',
                         job=FilterExact(filterkey='propname',
                                         filtervalues=['/type/object/name', '/common/topic/alias'],
                                         keyfield='a:guid',
                                         mapfields={'value': 'name'}),
                         inputpaths=['/data/graph/latest/crawl'])
    types = HappyJobNode(name='get_types',
                         job=FilterExact(filterkey='propname',
                                         filtervalues=['/type/object/type'],
                                         keyfield='b:guid',
                                         mapfields={'target': 'type'}),
                         inputpaths=['/data/graph/latest/crawl'])
    join = HappyJobNode(name='join_name_types',
                        job=InnerJoin(joinkeys=['a:guid', 'b:guid'], outkey='guid'))
    people = HappyJobNode(name='filter_people',
                          job=FilterExact(filterkey='type',
                                          filtervalues=['/people/person'],
                                          keyfield='guid',
                                          mapfields={'type': 'type', 'name': 'name'}))
    agg = HappyJobNode(name='invert_names',
                       job=AggregateJson(aggkey='name', aggfunc='agg.list("guid")'),
                       outputpath='namelist')
    names.addChild(join)
    types.addChild(join)
    join.addChild(people)
    people.addChild(agg)
    names.run(force=True)
def testCreateWithParent(self):
    """ Test instantiation with a parent specified """
    logger.info("In TestFlow.testCreateWithParent ...")
    p = HappyJobNode()
    c = HappyJobNode(parents=p)
    self.assertEqual(c.parents(), [p])
    self.assertEqual(p.children(), [c])
    logger.info("DONE.")
def _testHappyRun(self):
    logger.info("In TestFlow.testHappyRun() ...")
    h = IdentityJob()
    h.inputpaths = "small"
    h.outputpath = "crap"
    dfs.delete('crap')
    h.run()
    dfs.delete('crap')
def _testFilter(self):
    logger.info("In TestFlow.testFilter() ...")
    node = HappyJobNode(name='filter_graph',
                        job=FilterJson(filterkey='propname',
                                       filtervalues=['/type/object/type'],
                                       returnkeys=['target', 'creator']),
                        inputpaths=['/data/graph/latest/crawl'],
                        outputpath='typecount')
    node.run(force=True)
def _testEmptyRun(self):
    logger.info("In TestFlow.testEmptyRun() ...")
    # Diamond-shaped DAG keyed by node id: P1 fans out to C1/C2/C3,
    # which all feed into G1.
    spec = {1: {'children': [2, 3, 5],
                'kwargs': {'job': 'NullJob()', 'name': 'P1', 'inputpaths': ['small']}},
            2: {'children': [4], 'kwargs': {'job': 'NullJob()', 'name': 'C1'}},
            3: {'children': [4], 'kwargs': {'job': 'NullJob()', 'name': 'C2'}},
            5: {'children': [4], 'kwargs': {'job': 'NullJob()', 'name': 'C3'}},
            4: {'children': [],
                'kwargs': {'job': 'NullJob()', 'name': 'G1', 'outputpath': 'crap'}}}
    dag = HappyJobNode.dedictize(spec)
    dag.run(force=True)
def _testFilterLambda(self):
    logger.info("In TestFlow.testFilterLambda() ...")
    node = HappyJobNode(name='filter_graph_lambda',
                        job=FilterLambda(filters=["lambda x: x.get('propname', None) in ['/type/object/name', '/common/topic/alias']",
                                                  "lambda y: type(y.get('value', ' '))==str and y.get('value', ' ').startswith('c')"],
                                         returnkeys=['value', '__keys__']),
                        inputpaths=['/data/graph/latest/crawl'],
                        outputpath='cnames')
    node.run(force=True)
def testCreateSingle(self):
    """ Test that we can create a HappyJobNode """
    logger.info("In TestFlow.testCreateSingle ...")
    node = HappyJobNode(name="name", inputpaths=['in'], outputpath='out',
                        job="NullJob()")
    self.assert_(node is not None)
    self.assertEqual(node.name, 'name')
    self.assertEqual(node.inputpaths, ['in'])
    self.assertEqual(node.outputpath, 'out')
    logger.info("DONE.")
def deleteOutFiles(self, onlytmp=True):
    """ Deletes all files listed as outputs in the Flow. """
    self.linkNodes()
    for node in self.sort():
        path = node.outputpath
        if (not onlytmp) or path.startswith('tmp.'):
            logger.info("Deleting output file '%s'" % path)
            dfs.delete(path)
def _testDagRun(self):
    logger.info("In TestFlow.testDagRun() ...")
    p1 = HappyJobNode(name="P1", job=IdentityJob(), inputpaths=['small'])
    c1 = HappyJobNode(name="C1", job=IdentityJob())
    c2 = HappyJobNode(name="C2", job=IdentityJob())
    g1 = HappyJobNode(name="G1", job=IdentityJob(), outputpath='crap')
    p1.addChild(c1)
    p1.addChild(c2)
    c1.addChild(g1)
    c2.addChild(g1)
    p1.run(force=True)
def testMultiParent(self):
    """ Set up with many parents """
    logger.info("In TestFlow.testMultiParent ...")
    p1 = HappyJobNode()
    p2 = HappyJobNode()
    c1 = HappyJobNode()
    p1.addChild(c1)
    c1.addParent(p2)
    self.assertEqual(c1.parents(), [p1, p2])
    self.assertEqual(p1.children(), [c1])
    self.assertEqual(p2.children(), [c1])
    logger.info("DONE.")
def _testGraphNames(self):
    logger.info("In TestFlow.testGraphNames() ...")
    names = HappyJobNode(name='filter_graph_names',
                         job=FilterExact(filterkey='propname',
                                         filtervalues=['/type/object/name', '/common/topic/alias'],
                                         keyfield='guid',
                                         mapfields={'value': 'name'}),
                         inputpaths=['/data/graph/latest/crawl'])
    agg = HappyJobNode(name='invert_names',
                       job=AggregateJson(aggkey='value', aggfunc='agg.list("guid")'),
                       outputpath='namelist')
    names.addChild(agg)
    names.run(force=True)
def _testCountTypes(self):
    logger.info("In TestFlow.testCountTypes() ...")
    filter_node = HappyJobNode(name='filter_graph',
                               job=FilterExact(filterkey='propname',
                                               filtervalues=['/type/object/type'],
                                               returnkeys=['target']),
                               inputpaths=['/data/graph/latest/crawl'])
    agg = HappyJobNode(name='agg_types',
                       job=AggregateJson(aggkey='target', aggfunc='agg.count()'),
                       outputpath='typecount')
    filter_node.addChild(agg)
    filter_node.run(force=True)
def testMultiChild(self):
    """ Set up with many children """
    logger.info("In TestFlow.testMultiChild ...")
    p1 = HappyJobNode(name="P1")
    c1 = HappyJobNode(name="C1")
    c2 = HappyJobNode(name="C2")
    p1.addChild(c1)
    p1.addChild(c2)
    self.assertEqual(c1.parents(), [p1])
    self.assertEqual(c2.parents(), [p1])
    self.assertEqual(p1.children(), [c1, c2])
    logger.info("DONE.")
def _testCreateSingleLegacy(self):
    """ Test that we can create a HappyJobNode using the legacy
    inFiles/outFiles keywords. Disabled: it duplicated the name of
    testCreateSingle (so one of the two never ran) and exercises the
    older attribute names. """
    logger.info("In TestFlow.testCreateSingleLegacy ...")
    node = HappyJobNode(name="name", inFiles=['in'], outFiles=['out'],
                        status='status', job='job')
    self.assert_(node is not None)
    self.assertEqual(node.name, 'name')
    self.assertEqual(node.inFiles, ['in'])
    self.assertEqual(node.outFiles, ['out'])
    self.assertEqual(node.status, 'status')
    self.assertEqual(node.job, 'job')
    logger.info("DONE.")
def testDAG(self):
    """ Set up with many relationships """
    logger.info("In TestFlow.testDAG ...")
    logger.debug("Setting up DAG ...")
    a = HappyJobNode()
    b = HappyJobNode()
    c = HappyJobNode()
    d = HappyJobNode()
    e = HappyJobNode()
    f = HappyJobNode()
    g = HappyJobNode()
    h = HappyJobNode()
    i = HappyJobNode()
    a.addChild(b)
    a.addChild(c)
    b.addChild(c)
    d.addChild(f)
    e.addChild(f)
    c.addChild(g)
    f.addChild(g)
    h.addChild(g)
    g.addChild(i)
    logger.debug("Testing parent/child relationships ...")
    self.assertEqual(a.parents(), [])
    self.assertEqual(a.children(), [b, c])
    self.assertEqual(c.parents(), [a, b])
    self.assertEqual(f.parents(), [d, e])
    self.assertEqual(g.parents(), [c, f, h])
    self.assertEqual(g.children(), [i])
    logger.debug("Testing node retrieval ...")
    nodes0 = set([a, b, c, d, e, f, g, h, i])
    nodes1 = a.nodes()
    nodes2 = e.nodes()
    self.assertEqual(nodes0, nodes1)
    self.assertEqual(nodes0, nodes2)
    logger.debug("Testing sinks and sources ...")
    sinks = a.sinks()
    self.assertEqual(sinks, [i])
    sources = a.sources()
    self.assertEqual(sources, [a, d, e, h])
    logger.debug("Testing isAncestorOf() and isDecendentOf() ...")
    self.assert_(a.isAncestorOf(b))
    self.assert_(a.isAncestorOf(g))
    self.assert_(a.isAncestorOf(i))
    self.assert_(not a.isAncestorOf(d))
    self.assert_(i.isDecendentOf(g))
    self.assert_(i.isDecendentOf(a))
    self.assert_(i.isDecendentOf(b))
    self.assert_(i.isDecendentOf(e))
    self.assert_(not f.isDecendentOf(a))
    logger.info("DONE.")
def testBidirectional(self):
    """ Test that parent / child links are bidirectional """
    logger.info("In TestFlow.testBidirectional ...")
    p1 = HappyJobNode()
    c1 = HappyJobNode()
    p1.addChild(c1)
    self.assertEqual(c1.parents(), [p1])
    self.assertEqual(p1.children(), [c1])
    p2 = HappyJobNode()
    c2 = HappyJobNode()
    c2.addParent(p2)
    self.assertEqual(c2.parents(), [p2])
    self.assertEqual(p2.children(), [c2])
    logger.info("DONE.")
def _json_impl(agg, record):
    # Merge each field of the record into the running aggregate,
    # collecting values into lists keyed by field name.
    if not agg:
        agg = {}
    for (k, v) in record.items():
        logger.info("k: " + str(k) + ", v: " + str(v))
        if k in agg:
            if happy.flow.isIterable(v):
                agg[k].extend(v)
            else:
                agg[k].append(v)
        else:
            if happy.flow.isIterable(v):
                agg[k] = v
            else:
                agg[k] = [v]
    return agg
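# A minimal illustration of _json_impl's merge behavior (assumed inputs,
# derived only from the logic above):
#
#   agg = _json_impl(None, {'guid': 'g1', 'name': 'Alice'})
#   # agg == {'guid': ['g1'], 'name': ['Alice']}
#   agg = _json_impl(agg, {'guid': 'g2'})
#   # agg == {'guid': ['g1', 'g2'], 'name': ['Alice']}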
def testCircle(self):
    """ Test that edges which would close a cycle are rejected """
    logger.info("In TestFlow.testCircle ...")
    p1 = HappyJobNode(name="P1")
    c1 = HappyJobNode(name="C1")
    c2 = HappyJobNode(name="C2")
    g1 = HappyJobNode(name="G1")
    p1.addChild(c1)
    p1.addChild(c2)
    c1.addChild(g1)
    self.assertRaises(CycleException, c1.addChild, p1)
    self.assertRaises(CycleException, p1.addParent, c1)
    self.assertRaises(CycleException, c2.addChild, p1)
    self.assertRaises(CycleException, p1.addParent, c2)
    self.assertRaises(CycleException, g1.addChild, p1)
    self.assertRaises(CycleException, p1.addParent, g1)
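# One plausible way addChild()/addParent() can detect the cycles asserted
# above (a sketch under assumptions, not necessarily HappyJobNode's actual
# implementation): refuse any edge that points back at an ancestor.
#
#   def addChild(self, child):
#       if child is self or self.isDecendentOf(child):
#           raise CycleException("%s -> %s would close a cycle" % (self, child))
#       ...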
def linkNodes(self, workingDir=None):
    """ Assures that every parent/child pair has a matching file in their
    inputpaths / outputpath lists. Creates files if necessary.
    @param workingDir: the directory to create temp files in.
    """
    if workingDir:
        logger.info("Linking nodes, using workingDir = %s" % (workingDir))
        if dfs.exists(workingDir):
            fs = dfs.fileStatus(workingDir)
            if not fs.isDir():
                raise FlowException("%s is a file, not a directory." % (workingDir))
        else:
            logger.info("Creating working directory %s." % (workingDir))
            # dfs.mkdir(workingDir)
    stack = self.sources()
    for source in stack:
        if (not source.inputpaths) or len(source.inputpaths) < 1:
            raise FlowException("Source node %s has no inputpaths defined." % source)
    while stack:
        node = stack.pop(0)
        if node.outputpath:
            logger.trace("linkNodes(): %s has an outputpath '%s'. Using it." % (node, node.outputpath))
            filename = node.outputpath
        else:
            filename = "tmp.%s" % (node.name)
            if workingDir:
                filename = "%s/%s" % (workingDir, filename)
            logger.trace("linkNodes(): Created temp outfile '%s' for %s." % (filename, node))
            node.outputpath = filename
        for child in node.children():
            # Link parent to child unless the child already lists this
            # output among its inputs. (The original test was
            # set(node.outputpath) & set(child.inputpaths), which compares
            # the *characters* of the path and so never matched.)
            if (not child.inputpaths) or (node.outputpath not in castList(child.inputpaths)):
                logger.debug("linkNodes(): Linked %s and %s with file '%s'." % (node, child, filename))
                child.inputpaths = castList(child.inputpaths) + [filename]
            stack.append(child)
        logger.debug("%s has inputs %s and outputs %s" % (node, node.inputpaths, node.outputpath))
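# Illustration of the linking behavior above (hypothetical node names):
#
#   p = HappyJobNode(name='P1', job='NullJob()', inputpaths=['small'])
#   c = HappyJobNode(name='C1', job='NullJob()')
#   p.addChild(c)
#   p.linkNodes()
#   # now p.outputpath == 'tmp.P1' (or '<workingDir>/tmp.P1') and that
#   # same path appears in c.inputpaths, so the child reads exactly what
#   # its parent writes.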
def testFlow2(self):
    logger.info("In TestFlow.testFlow2() ...")
    test_flow = Flow2(inputpaths=['/data/graph/latest/crawl'], outputpath='namelist')
    (names, types) = test_flow.split()
    names.chain(HappyJobNode(name='get_names',
                             job=FilterExact(filterkey='propname',
                                             filtervalues=['/type/object/name', '/common/topic/alias'],
                                             keyfield='a:guid',
                                             mapfields={'value': 'name'})))
    types.chain(HappyJobNode(name='get_types',
                             job=FilterExact(filterkey='propname',
                                             filtervalues=['/type/object/type'],
                                             keyfield='b:guid',
                                             mapfields={'target': 'type'})))
    names.chain(HappyJobNode(name='join_name_types',
                             job=InnerJoin(joinkeys=['a:guid', 'b:guid'], outkey='guid'),
                             force=True),
                join=types)
    names.chain(HappyJobNode(name='filter_people',
                             job=FilterExact(filterkey='type',
                                             filtervalues=['/people/person'],
                                             keyfield='guid',
                                             mapfields={'type': 'type', 'name': 'name'})))
    names.chain(HappyJobNode(name='invert_names',
                             job=AggregateJson(aggkey='name', aggfunc='agg.list("guid")')))
    logger.debug("DAG: \n%s\n" % names.startNode.dictize())
    names.run(force=False)
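# The chain/join calls above build this DAG:
#
#   /data/graph/latest/crawl --> get_names --\
#                                             +--> join_name_types --> filter_people --> invert_names --> namelist
#   /data/graph/latest/crawl --> get_types --/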
def _testSingleRun(self):
    logger.info("In TestFlow.testSingleRun() ...")
    node = HappyJobNode(name="P1", job=IdentityJob(), inputpaths=['small'],
                        outputpath='crap')
    node.run(force=True)
def _testFlowRun(self):
    logger.info("In TestFlow.testFlowRun() ...")
    f = Flow(IdentityJob(), inputpaths=['small'])
    f1 = f.chain(IdentityJob())
    f2 = f.chain(IdentityJob()).chain(IdentityJob(), join=f1, outputpath='crap')
    f2.run(force=True)
def run(self):
    logger.info("NullJob %s fired." % self.name)
    # Write a marker file so downstream prechecks see an output.
    w = dfs.write(self.outputpath)
    w.write("NullJob() output -- for testing only.")
    w.close()
def testNull(self):
    """ Test that the unittest harness is working """
    logger.info("In TestFlow.testNull ...")
    self.assertEqual(1, 1)
    logger.info("DONE.")
def run(self, force=False, workingDir=None):
    """ Runs the entire job chain (ie DAG) that contains this node. """
    logger.debug("Calling HappyJobNode.run(), workingDir=%s" % workingDir)
    self.linkNodes(workingDir)
    if force:
        self.deleteOutFiles(onlytmp=False)
    # stack = self.sources()
    stack = self.sort()
    logger.info("Stack order is: %s" % ", ".join([str(x._id) for x in stack]))
    ok_children = self.sources()
    while stack:
        node = stack.pop(0)
        putChildren = False
        if node not in ok_children:
            logger.warn("Branch terminated: node %s not in ok_children list %s." % (node, ok_children))
            continue
        pre = node.precheck()
        if node.force:
            logger.info("FORCING %s [%s --> %s] (delete %s first)" %
                        (node, node.inputpaths, node.outputpath, node.outputpath))
            dfs.delete(node.outputpath)
            node.fire()
        elif pre == 'ready':
            logger.info("Running %s [%s --> %s]" % (node, node.inputpaths, node.outputpath))
            node.fire()
        else:
            logger.info("Skipping job %s: already done" % node)
            putChildren = True
            # Mark the node itself as skipped (the original assigned to
            # self.status, which tagged the root node instead).
            node.status = 'skip'
        post = node.postcheck()
        if post == 'done':
            logger.info("Job %s completed successfully." % node)
            putChildren = True
        elif post == 'fail':
            logger.info("Job %s failed. Not adding children." % node)
        if putChildren:
            if node.isSink():
                logger.info("Job %s is a sink, no children." % node)
            else:
                newChildren = [child for child in node.children() if child not in ok_children]
                logger.info("Placing children %s of job %s on stack." % (newChildren, node))
                ok_children.extend(newChildren)
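# Typical invocation (the workingDir value is illustrative; run(force=True)
# appears throughout the tests above):
#
#   dag = HappyJobNode.dedictize(spec)
#   dag.run(force=True, workingDir='tmp')
#
# linkNodes() wires the intermediate files first; force then deletes stale
# outputs, and the topological stack order guarantees that every parent
# fires before its children are considered.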
def fire(self, *args, **kwargs):
    logger.info("NullNode %s fired." % self.name)