def generate_dag(optimal_indvidual, stage_name, num_nodes):
    """Build the DAG of one stage from its encoded connection bits.

    Nodes are named ``<stage_name>_1`` ... ``<stage_name>_<num_nodes>``.
    ``optimal_indvidual`` is cut into consecutive groups of sizes
    1, 2, ..., ``num_nodes - 1``.  The k-th group (1-based) belongs to node
    ``k + 1``; a truthy entry at position j (1-based) of that group adds the
    edge ``node_j -> node_(k+1)``.  Nodes left with neither predecessors nor
    downstream nodes are removed from the graph.

    Args:
        optimal_indvidual: Flat sequence of connection flags that ``np.split``
            accepts.  Presumably a 0/1 array of length
            ``num_nodes * (num_nodes - 1) / 2`` -- confirm against the caller.
        stage_name: String prefix used for every node name.
        num_nodes: Number of candidate nodes in the stage.

    Returns:
        A ``(dag, nodes)`` tuple: the populated ``DAG`` and a NumPy string
        array holding the names of the nodes that remain in it.
    """
    # ``np.str`` was only an alias of the builtin ``str`` and no longer exists
    # in current NumPy releases; the builtin yields the same unicode dtype.
    nodes = np.array(
        [stage_name + "_" + str(n) for n in range(1, num_nodes + 1)],
        dtype=str,
    )

    # Initialize the directed acyclic graph and register every node.
    dag = DAG()
    for name in nodes:
        dag.add_node(name)

    # cumsum(range(num_nodes - 1)) gives the split points 0, 1, 3, 6, ...;
    # the leading (empty) chunk produced by the split point 0 is dropped.
    groups = np.split(optimal_indvidual, np.cumsum(range(num_nodes - 1)))[1:]
    for target, bits in enumerate(groups, start=2):
        for source, bit in enumerate(bits, start=1):
            if bit:
                dag.add_edge(stage_name + "_" + str(source),
                             stage_name + "_" + str(target))

    # Drop nodes that are not connected to any other node.
    keep = np.ones(len(nodes), dtype=bool)
    for idx, name in enumerate(nodes):
        if len(dag.predecessors(name)) == 0 and len(dag.downstream(name)) == 0:
            dag.delete_node(name)
            keep[idx] = False
    nodes = nodes[keep]

    return dag, nodes
def generate_dag(optimal_indvidual, stage_name, num_nodes):
    """Build the DAG of one stage from its encoded connection bits.

    ``optimal_indvidual`` is the binary connection string of this stage.
    Nodes are named ``<stage_name>_1``, ``<stage_name>_2``, ... (for a stage
    called ``s1``: ``s1_1``, ``s1_2``, ...).

    Args:
        optimal_indvidual: Flat sequence of connection flags that ``np.split``
            accepts.  Presumably a 0/1 array of length
            ``num_nodes * (num_nodes - 1) / 2`` -- confirm against the caller.
        stage_name: String prefix used for every node name.
        num_nodes: Number of candidate nodes in the stage.

    Returns:
        A ``(dag, nodes)`` tuple: the populated ``DAG`` and a NumPy string
        array holding the names of the nodes that remain in it.
    """
    # Create the node names for the graph.  ``np.str`` was only an alias of
    # the builtin ``str`` and no longer exists in current NumPy releases; the
    # builtin yields the same unicode dtype.
    nodes = np.array(
        [stage_name + "_" + str(n) for n in range(1, num_nodes + 1)],
        dtype=str,
    )

    # Initialize the directed acyclic graph (DAG) and add all nodes to it.
    dag = DAG()
    for name in nodes:
        dag.add_node(name)

    # Split the best individual found via GA to identify vertex connections.
    # cumsum is the running sum: cumsum([0, 1, 2]) returns [0, 1, 3].  With
    # 4 nodes the bit string has length 6 and is cut into
    # s[:0], s[0:1], s[1:3], s[3:6], i.e. one chunk of incoming-connection
    # bits per node.  The first chunk (s[:0]) is empty because the first node
    # has nothing before it to connect to, so it is discarded.
    groups = np.split(optimal_indvidual, np.cumsum(range(num_nodes - 1)))[1:]

    # Walk the connection bits of every node; a truthy bit adds an edge from
    # the earlier node (source) to the node owning the chunk (target).
    for target, bits in enumerate(groups, start=2):
        for source, bit in enumerate(bits, start=1):
            if bit:
                dag.add_edge(stage_name + "_" + str(source),
                             stage_name + "_" + str(target))

    # Delete isolated nodes, i.e. those not connected to any other node.
    keep = np.ones(len(nodes), dtype=bool)
    for idx, name in enumerate(nodes):
        if len(dag.predecessors(name)) == 0 and len(dag.downstream(name)) == 0:
            dag.delete_node(name)
            keep[idx] = False
    nodes = nodes[keep]

    return dag, nodes
class Pipeline:
    """A pipeline of stages described by a YAML file.

    The description must contain a ``pipeline`` mapping with the keys
    ``owner``, ``basename``, ``version``, ``stages`` and ``images``, plus one
    top-level mapping per stage name holding a ``depends-on`` list.
    """

    def __init__(self, input_file):
        """Load the description, the stage classes and the dependency graph.

        ``input_file`` is either a path or an object with a ``read`` method.
        """
        self.cfg = self._read(input_file)
        self.info = self.cfg['pipeline']
        self.owner = self.info['owner']
        self.basename = self.info['basename']
        self.version = self.info['version']
        self.dag = DAG()
        self.stages = {}

        # First pass: load every stage and register it as a graph node.
        for name in self.info['stages']:
            self.stages[name] = self.load_stage(name)
            self.dag.add_node(name)

        # Second pass: add one edge parent -> stage per declared dependency.
        for name in self.info['stages']:
            stage_info = self.cfg[name]
            for parent in stage_info['depends-on']:
                self.dag.add_edge(parent, name)

    def _make(self, command):
        """Run the shell *command* inside each configured images directory."""
        for dirname in self.info['images']:
            os.system("cd {}; {}".format(dirname, command))

    def build(self):
        """Run ``make`` in every images directory."""
        self._make("make")

    def push(self):
        """Run ``make push`` in every images directory."""
        self._make("make push")

    def pull(self):
        """Run ``make pull`` in every images directory."""
        self._make("make pull")

    def load_stage(self, name):
        """Return the ``Stage`` attribute of the ``run.py`` of stage *name*.

        The first images directory containing a sub-directory *name* with both
        a ``Dockerfile`` and a ``run.py`` is used.

        Raises:
            PipelineError: if no images directory contains such a stage.
        """
        for dirname in self.info['images']:
            dirpath = os.path.join(dirname, name)
            dockerfile_path = os.path.join(dirpath, "Dockerfile")
            run_path = os.path.join(dirpath, "run.py")
            if (os.path.isdir(dirpath)
                    and os.path.isfile(dockerfile_path)
                    and os.path.isfile(run_path)):
                path = run_path
                break
        else:
            # The loop finished without ``break``: nothing matched.
            raise PipelineError(
                "No Stage called {} was found - needs to be in one of the "
                "images directories and contain Dockerfile, run.py".format(name))

        # We want to load a module based on a path. The python people keep
        # changing how to do this in obscure ways. This one is deprecated but
        # works back in python 3.4 which is what centos 7 can provide.
        loader = importlib.machinery.SourceFileLoader(name, path)
        module = loader.load_module()
        return module.Stage

    def input_tags(self):
        """Return the set of input tags required by the pipeline and not generated inside it."""
        pipeline_inputs = set()

        # Find all the inputs expected by the pipeline.
        for stage in self.stages.values():
            pipeline_inputs.update(stage.inputs.keys())

        # Remove any inputs that are generated by some stage of the pipeline.
        for stage in self.stages.values():
            pipeline_inputs.difference_update(stage.outputs.keys())

        return pipeline_inputs

    def output_filenames(self):
        """Return the set of all output filenames reported by the stages."""
        outputs = set()
        for stage in self.stages.values():
            # Sets do not support ``+=``; ``update`` merges any iterable.
            outputs.update(stage.get_output_filenames())
        return outputs

    def _read(self, input_file):
        """Read a YAML file representing a pipeline.

        ``input_file`` may be an open file-like object (left open for the
        caller) or a path (opened and closed here).
        """
        # NOTE: ``yaml.load`` without a Loader is rejected by current PyYAML
        # and could construct arbitrary objects on older releases;
        # ``safe_load`` parses plain YAML documents on every version.
        if hasattr(input_file, 'read'):
            return yaml.safe_load(input_file)
        with open(input_file) as stream:
            return yaml.safe_load(stream)

    def image_name(self, name):
        """Return the expected image name for a given stage based on information in the pipeline file."""
        return '{}/{}-{}:{}'.format(self.owner, self.basename, name, self.version)

    def sequence(self):
        """Return an acceptable serial ordering of ``(name, stage)`` pairs for the pipeline elements."""
        order = self.dag.topological_sort()
        return [(name, self.stages[name]) for name in order]

    def dependencies(self, name):
        """Return the predecessors of stage *name* in the dependency graph."""
        return self.dag.predecessors(name)