Example #1
def execute(pcs: Process,
            url: str,
            parameters: Optional[str] = None,
            **kwargs) -> TaskResult:
    """
    Imports a file from an online resource given its url.

    Args:
        pcs (Process)

    Parameters:
        url (str): Public accessible resource
        parameters (str): GET parameters to append to url
    """
    filename = Path(urlparse(url).path).name

    try:
        filepath = pcs.storage.get_file(url)
    except (NotValidScheme, FileNotFoundError):
        pcs.warn("No valid scheme was provided")
        filepath = Path(pcs.storage.local_dir, filename)
        if parameters:
            url = url + parameters
        urllib.request.urlretrieve(url, filepath)

    # send to remote storage
    dfs_dir = pcs.storage.put_file(filepath)

    # send to downstream
    output_temp_file = TempFile(resource=dfs_dir)
    pcs.to_downstream(output_temp_file)

    return TaskResult(files=[dfs_dir])
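
Note that parameters is appended verbatim, so the caller is expected to supply its own separator. A hypothetical invocation illustrating that assumption (URL and query string invented for this sketch):

# the task would fetch https://example.org/data/points.csv?format=raw
result = execute(
    pcs,
    url="https://example.org/data/points.csv",
    parameters="?format=raw",
)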
Example #2
def execute(pcs: Process):
    """
    Reads stream of 3D-Points with cartesian coordinates.
    """
    for key, p in pcs.poll_from_upstream():
        if key == "Points":
            pcs.info([f"Got point ({p['x']},{p['y']},{p['z']})"])
Example #3
    def setUp(self) -> None:
        self.process = Process(
            name="test-task-1",
            module="",
            params={},
            parent="test-workflow-1",
            inputs={"point": "test-task-0.Point"},
        )
Example #4
def execute(pcs: Process, **kwargs):
    """
    Reads a TSV file.
    """
    inputs = pcs.get_from_upstream()

    # read inputs
    input_file: Dict[str, Any] = inputs["TabularDataset"][0]  # a SimpleTabularDataset record
    input_file_delimiter = input_file["delimiter"]
    input_file_resource = input_file["resource"]

    filepath = pcs.storage.get_file(input_file_resource)

    with open(filepath, "r") as reader:
        for row in csv.reader(reader, delimiter=input_file_delimiter):
            pcs.info(row)
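
Example #7 below shows the producing side of this exchange; after deserialization, the record consumed here is a plain dict. A hedged sketch of its shape (resource path invented for illustration):

# shape of inputs["TabularDataset"][0] as read above
input_file = {
    "resource": "hdfs://test-workflow-1/out.tsv",  # hypothetical remote path
    "delimiter": "\t",
    "file_format": ".tsv",
}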
Example #5
def execute(pcs: Process, **kwargs):
    """
    Sends a global interruption signal.
    """
    signal = SignalMessage(data=SignalType.INTE)
    serialized_message = serialize(signal.dict(), pcs.MESSAGE_SCHEMA)

    producer = pcs._producer()
    producer.send(topic=pcs.parent,
                  value=serialized_message,
                  key=bytes(pcs.parent, "utf-8"),
                  **kwargs)
    producer.close()
Example #6
class ProcessWithTwoInputsOneUpstreamTestCase(unittest.TestCase):
    def setUp(self) -> None:
        self.process = Process(
            name="test-task-1",
            module="",
            params={},
            parent="test-workflow-2",
            inputs={
                "point_a": "test-task-0.PointA",
                "point_b": "test-task-0.PointB"
            },
        )

    @mock.patch("drama.process.Process._consumer")
    def test_should_get_all_messages_from_upstream(self, consumer):
        def poll(**kwargs):
            _TopicPartition = collections.namedtuple("TopicPartition", [])
            _ConsumerRecord = collections.namedtuple("ConsumerRecord",
                                                     ["key", "value"])

            point_record_a = _ConsumerRecord(
                key=b"test-task-0",
                value=b'\nBLOCK$test-task-0.PointA\x04\x02\x04\x08AVRO\xda\x03{"namespace": "drama.examples.publisher.DemoSinglePublisherPoint", "name": "PointA", "type": "record", "fields": [{"name": "x", "type": "int"}, {"name": "y", "type": "int"}]}',
            )

            point_record_b = _ConsumerRecord(
                key=b"test-task-0",
                value=b'\nBLOCK$test-task-0.PointB\x04\x06\x08\x08AVRO\xda\x03{"namespace": "drama.examples.publisher.DemoSinglePublisherPoint", "name": "PointB", "type": "record", "fields": [{"name": "x", "type": "int"}, {"name": "y", "type": "int"}]}',
            )

            stop_record = _ConsumerRecord(
                key=b"test-task-0",
                value=b"\x0cSIGNAL\x12undefined\x18POISSON_PILL\x12undefined\x12undefined",
            )

            return {
                _TopicPartition: [point_record_a, point_record_b, stop_record]
            }

        mocked_consumer = MagicMock()
        mocked_consumer.poll = poll

        consumer.return_value = mocked_consumer  # mock self._consumer() to avoid a real Kafka consumer

        records = self.process.get_from_upstream()
        self.assertTrue(len(records.keys()) == 2)
        self.assertEqual([{"x": 1, "y": 2}], records["point_a"])
        self.assertEqual([{"x": 3, "y": 4}], records["point_b"])

    def tearDown(self) -> None:
        self.process.storage.remove_local_dir()
Example #7
def execute(pcs: Process,
            url: str,
            delimiter: str = "\t",
            comment: str = "#") -> TaskResult:
    """
    Imports a tab-separated values file from an online resource given its url.

    Args:
        pcs (Process)

    Parameters:
        url (str): Public accessible resource
        delimiter (str): Line column delimiter. Defaults to "\t".
        comment (str). Character representing starting comment. Defaults to "#".
    """
    filename = Path(urlparse(url).path).name

    try:
        filepath = pcs.storage.get_file(url)
    except (NotValidScheme, FileNotFoundError):
        pcs.warn("No valid scheme was provided")
        filepath = Path(pcs.storage.local_dir, filename)
        urllib.request.urlretrieve(url, filepath)

    out_tsv = Path(pcs.storage.local_dir, "out.tsv")

    def validate(csvfile):
        """
        Strip comment text from the input file.
        """
        for row in csvfile:
            raw = row.split(comment)[0].strip()
            if raw:
                yield raw

    with open(filepath, "r") as infile, open(out_tsv, "w",
                                             newline="") as outfile:
        reader = csv.reader(validate(infile), delimiter=delimiter)
        writer = csv.writer(outfile,
                            delimiter=delimiter,
                            lineterminator=os.linesep)

        for row in reader:
            writer.writerow(row)

    pcs.info(f"Out file: {outfile}")

    # send to remote storage
    dfs_dir = pcs.storage.put_file(out_tsv)

    # send to downstream
    output_simple_tabular_dataset = SimpleTabularDataset(resource=dfs_dir,
                                                         delimiter=delimiter,
                                                         file_format=".tsv")
    pcs.to_downstream(output_simple_tabular_dataset)

    return TaskResult(files=[dfs_dir])
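
The validate generator does the real work of this importer: it drops comment text before the csv reader ever sees a line. A minimal self-contained sketch of the same idea, with invented sample data:

import csv
import io

def strip_comments(lines, comment="#"):
    # yield only the non-empty text left of the comment marker
    for line in lines:
        stripped = line.split(comment)[0].strip()
        if stripped:
            yield stripped

raw = "a\tb\t# trailing comment\n# full comment line\n1\t2\n"
rows = list(csv.reader(strip_comments(io.StringIO(raw)), delimiter="\t"))
assert rows == [["a", "b"], ["1", "2"]]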
Example #8
def execute(pcs: Process, x: int, y: int):
    """
    Parameters:
        x (int): x-cartesian coordinate
        y (int): y-cartesian coordinate
    """
    pcs.info([f"Generating point ({x},{y},?)"])

    # send to downstream
    for i in range(10):
        z = random.randint(0, 10)
        point = Point(x, y, z)

        pcs.info([f"Sending {i}"])
        pcs.to_downstream(point)
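
The Point type itself is not shown in these excerpts. Judging from the positional Point(x, y, z) call above, a minimal stand-in could be a dataclass; this is an assumption for illustration, not the library's actual definition:

from dataclasses import dataclass

@dataclass
class Point:
    # hypothetical record standing in for the drama data type
    x: int
    y: int
    z: int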
Example #9
def process_task(task_request: dict):
    """
    Main `drama` actor.
    Executes an arbitrary function defined by a task and updates its state.
    """
    message = CurrentMessage.get_current_message()
    task_id = message.message_id

    # required attributes
    task_name = task_request["name"]
    task_module = task_request["module"]
    task_parent = task_request["parent"]  # workflow id

    # optional attributes
    task_params = task_request["params"]
    task_inputs = task_request["inputs"]

    # task options
    task_opts = task_request["options"]
    force_interruption = task_opts["on_fail_force_interruption"]
    remove_local_dir = task_opts["on_fail_remove_local_dir"]

    # configure data file storage
    storage = get_available_storage()
    # bucket folder is shared across tasks in a workflow
    dfs = storage(bucket_name=task_parent, folder_name=task_name)

    # create process
    task_process = Process(
        name=task_name,
        module=task_module,
        parent=task_parent,
        params=task_params,
        inputs=task_inputs,
        storage=dfs,
    )

    task_process.debug(f"Running task {task_id} with name {task_name}")

    try:
        # import `execute` function from module
        task_process.debug(f"Importing function from {task_module}")
        func = get_process_func(task_module)

        # set process status to `running`
        process_running(message)

        # execute imported function
        data = func(**task_params, pcs=task_process)
        if not data:
            data = TaskResult()
        elif not isinstance(data, TaskResult):
            data = TaskResult(message=str(data))
    except ImportError:
        task_process.error(traceback.format_exc())
        task_process.close(force_interruption=force_interruption)
        raise ImportError(
            f"Module {task_module} from task {task_id} is not available")
    except StopIteration:
        task_process.error(traceback.format_exc())
        task_process.close(force_interruption=force_interruption)
        raise StopIteration("Could not get data from upstream")
    except Exception:
        task_process.error("Unexpected unknown exception was raised by actor:")
        task_process.error(traceback.format_exc())
        task_process.close(force_interruption=force_interruption,
                           remove_local_dir=remove_local_dir)
        raise

    remote_logging_file = task_process.close()
    task_process.info(f"Task {task_id} successfully executed")

    data.log = remote_logging_file

    # result of this function *must be* JSON-encodable
    data_as_json = data.json()

    return data_as_json
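
Reconstructed from the keys read above, a hedged sketch of the task_request payload this actor expects (module path and parameter values are invented, reusing names from the test fixtures):

task_request = {
    "name": "test-task-1",
    "module": "drama.examples.publisher.DemoSinglePublisherPoint",  # hypothetical module path
    "parent": "test-workflow-1",  # workflow id
    "params": {"x": 1, "y": 2},  # forwarded as func(**task_params, pcs=task_process)
    "inputs": {"point": "test-task-0.Point"},
    "options": {
        "on_fail_force_interruption": True,
        "on_fail_remove_local_dir": True,
    },
}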
Example #10
class ProcessWithOneInputOneUpstreamTestCase(unittest.TestCase):
    def setUp(self) -> None:
        self.process = Process(
            name="test-task-1",
            module="",
            params={},
            parent="test-workflow-1",
            inputs={"point": "test-task-0.Point"},
        )

    @mock.patch("drama.process.Process._producer")
    def test_should_send_point_to_downstream(self, producer):
        producer.return_value = MagicMock()  # mock self._producer() to avoid a real Kafka producer

        point = Point(1, 2)

        message = self.process.to_downstream(data=point)

        self.assertEqual(MessageType.BLOCK, message.type)
        self.assertEqual("test-task-1.Point", message.key)
        self.assertEqual(b"\x02\x04", message.data)
        self.assertEqual(
            '{"namespace": "tests.test_process", "name": "Point", "type": "record", "fields": [{"name": "x", "type": "int"}, {"name": "y", "type": "int"}]}',
            message.schem,
        )
        self.assertEqual(Servo.AVRO, message.servo)

    @mock.patch("drama.process.Process._consumer")
    def test_should_poll_messages_from_upstream(self, consumer):
        def poll(**kwargs):
            _TopicPartition = collections.namedtuple("TopicPartition", [])
            _ConsumerRecord = collections.namedtuple("ConsumerRecord",
                                                     ["key", "value"])

            point_record = _ConsumerRecord(
                key=b"test-task-0",
                value=b'\nBLOCK"test-task-0.Point\x04\x02\x04\x08AVRO\xd8\x03{"namespace": "drama.examples.publisher.DemoSinglePublisherPoint", "name": "Point", "type": "record", "fields": [{"name": "x", "type": "int"}, {"name": "y", "type": "int"}]}',
            )

            stop_record = _ConsumerRecord(
                key=b"test-task-0",
                value=b"\x0cSIGNAL\x12undefined\x18POISSON_PILL\x12undefined\x12undefined",
            )

            return {_TopicPartition: [point_record, stop_record]}

        mocked_consumer = MagicMock()
        mocked_consumer.poll = poll

        consumer.return_value = mocked_consumer  # mock self._consumer() to avoid a real Kafka consumer

        for key, msg in self.process.poll_from_upstream():
            self.assertEqual("point", key)
            self.assertEqual({"x": 1, "y": 2}, msg)

    @mock.patch("drama.process.Process._consumer")
    def test_should_get_messages_from_upstream(self, consumer):
        def poll(**kwargs):
            _TopicPartition = collections.namedtuple("TopicPartition", [])
            _ConsumerRecord = collections.namedtuple("ConsumerRecord",
                                                     ["key", "value"])

            point_record = _ConsumerRecord(
                key=b"test-task-0",
                value=b'\nBLOCK"test-task-0.Point\x04\x02\x04\x08AVRO\xd8\x03{"namespace": "drama.examples.publisher.DemoSinglePublisherPoint", "name": "Point", "type": "record", "fields": [{"name": "x", "type": "int"}, {"name": "y", "type": "int"}]}',
            )

            stop_record = _ConsumerRecord(
                key=b"test-task-0",
                value=b"\x0cSIGNAL\x12undefined\x18POISSON_PILL\x12undefined\x12undefined",
            )

            return {_TopicPartition: [point_record, stop_record]}

        mocked_consumer = MagicMock()
        mocked_consumer.poll = poll

        consumer.return_value = mocked_consumer  # mock self._consumer() to avoid a real Kafka consumer

        records = self.process.get_from_upstream()
        self.assertTrue(len(records.keys()) == 1)
        self.assertEqual([{"x": 1, "y": 2}], records["point"])

    @mock.patch("drama.process.Process._consumer")
    def test_should_raise_exception_if_some_inputs_are_missing(self, consumer):
        def poll(**kwargs):
            _TopicPartition = collections.namedtuple("TopicPartition", [])
            _ConsumerRecord = collections.namedtuple("ConsumerRecord",
                                                     ["key", "value"])

            stop_record = _ConsumerRecord(
                key=b"test-task-0",
                value=b"\x0cSIGNAL\x12undefined\x18POISSON_PILL\x12undefined\x12undefined",
            )

            return {_TopicPartition: [stop_record]}

        mocked_consumer = MagicMock()
        mocked_consumer.poll = poll

        consumer.return_value = mocked_consumer  # mock self._consumer() to avoid a real Kafka consumer

        with self.assertRaises(Exception):
            self.process.get_from_upstream()

    def tearDown(self) -> None:
        self.process.storage.remove_local_dir()