Example #1
0
    def run(self, graph, ctx):
        """Splice data type transformation tasks into the graph.

        Two kinds of rewrites are performed:

        * Every edge flagged with ``needs_transform`` is replaced by a new
          ``transform`` task wired to the edge's original destination port.
        * Every source input whose argument has a ``csv`` file extension but
          whose port type id is not ``csv`` gets a new ``transform`` task
          connected to that input. The new task is registered as a graph
          source and the original source is unset as one.

        Args:
            graph: Task graph to rewrite in place. Its ``tasks`` and
                ``sources`` mappings are read, and ``add_task``,
                ``unset_source`` and ``set_source`` are called on it.
            ctx: Pass context, forwarded unchanged to ``Pass.run``.

        Returns:
            ``PassResult.CONTINUE``.
        """
        Pass.run(self, graph, ctx)
        new_tasks = []
        new_sources = []
        deleted_sources = []

        # Run edge transformations
        for _tid, task in graph.tasks.items():
            # Walk the live edge list by position rather than with
            # enumerate(): edges are deleted from the list while we walk it,
            # and deleting under a for-loop iterator silently skips the edge
            # that follows each removed one. Only advance the position when
            # nothing was deleted.
            index = 0
            while index < len(task.edges):
                edge = task.edges[index]
                if not edge.needs_transform:
                    index += 1
                    continue

                # We need to do a data type transformation, so splice the
                # transformation in between the original tasks
                intype = edge.source.type
                outtype = edge.dest.type

                # [FIXME] Code debt - Currently we have two overloaded ways
                # of indexing in to the task.outputs dictionary. One with
                # indices and one with names in the case of named outputs.
                # But there is currently no way of distinguishing between
                # these two at the moment. I currently just assume it is
                # indexed addressing case here since we don't currently
                # support named outputs. This needs to change once we add
                # support for named outputs.
                tasklet = Tasklet(edge.source.task_ref,
                                  str(edge.source.index))
                args = [tasklet, intype, outtype]

                sig = inspect.signature(transform)
                new_task = Task(gen_runner(transform, sig), transform, sig,
                                args, {})
                new_task.is_transform = True
                # We know this generated task only has one output
                outport = new_task.outputs['0']

                old_dest_port = edge.dest

                # Remove the old edge since we should have generated a new
                # edge from the original source to this newly generated task
                # during the call to Task constructor. The next edge slides
                # into this position, so the position is not advanced.
                del task.edges[index]

                # Make the original destination port of the edge to be the
                # destination port of the outward edge of the new task
                new_task.edges.append(Edge(outport, old_dest_port))
                # Collect newly generated tasks
                new_tasks.append(new_task)

        # Run source input transformations
        for _tid, source in graph.sources.items():
            for name, inport in source.inputs.items():
                # Get the actual argument value passed to this source
                arg = source._args[name]

                # Only arguments that look like csv files are staged
                ext = get_file_extention(arg)
                if ext != 'csv':
                    continue

                # Nothing to do when the port already expects this type
                intype = inport.type.id
                if intype == ext:
                    continue

                # NOTE(review): here intype is the port's type id and outtype
                # is the type of the file extension, whereas the edge case
                # above passes (source type, dest type) objects. This looks
                # reversed/inconsistent - confirm against transform().
                outtype = get_type(ext)
                args = [arg, intype, outtype]

                sig = inspect.signature(transform)
                # Generate a new task for transforming the input to the type
                # the original source was expecting
                staging_task = Task(gen_runner(transform, sig), transform,
                                    sig, args, {})
                staging_task.is_transform = True

                # We know this generated task only has one output
                outport = staging_task.outputs['0']

                # Make the configuration of the original task's input to be
                # non immediate since now it accepts the output from newly
                # generated staging task at runtime
                inport.flip_is_immediate()

                # Connect the out port of the new task to the in port of the
                # old source
                staging_task.edges.append(Edge(outport, inport))

                # Collect the new task as a source
                new_sources.append(staging_task)
                # Collect sources which are made not sources anymore. A
                # source with several csv inputs gets here once per input;
                # record it once so that it is unset only once.
                if not any(seen is source for seen in deleted_sources):
                    deleted_sources.append(source)
                # Collect newly generated tasks
                new_tasks.append(staging_task)

        # Add the newly generated tasks to the graph
        for task in new_tasks:
            graph.add_task(task)

        # Mark removed sources as not sources
        for source in deleted_sources:
            graph.unset_source(source)

        # Add newly generated sources
        for source in new_sources:
            graph.set_source(source)

        return PassResult.CONTINUE