Beispiel #1
0
    def _delete_transform_from_data_view(
        cls,
        transform: Transform,
        updated_transforms: TransformList,
        updated_labels: LabelSequence,
        data_view: DataView,
    ) -> Tuple[TransformList, LabelSequence]:
        """Remove ``transform`` and its dependants from the given containers.

        Transforms are processed first-in, first-out. Whenever the transform
        being removed is an ``EnrichmentTransform``, its output labels are
        removed by name and the transforms that ``data_view.transform_tree``
        reports as its children are queued for removal as well.

        Args:
            transform: The first transform to remove.
            updated_transforms: Transforms to start from; re-wrapped in a new
                ``TransformList`` before anything is removed (presumably a
                copy -- confirm against the ``TransformList`` constructor).
            updated_labels: Labels to start from; re-wrapped in a new
                ``LabelSequence`` in the same way.
            data_view: Supplies the id used for logging and the transform
                tree used to look up child transforms.

        Returns:
            The ``(transforms, labels)`` pair after all removals.
        """
        log.info(f"Removing transform from {data_view.id}")
        tree = data_view.transform_tree

        remaining_transforms = TransformList(updated_transforms)
        remaining_labels = LabelSequence(updated_labels)

        # FIFO queue of transforms that still have to be removed.
        pending: Deque[Transform] = deque([transform])
        while pending:
            log.info(f"about to pop {pending[0]}")
            current = pending.popleft()

            if isinstance(current, EnrichmentTransform):
                # Drop every label this enrichment produced ...
                for label_name in current.output_labels:
                    log.info(f"removing by name {label_name}")
                    remaining_labels.remove_by_name(label_name)

                # ... and schedule the transforms registered as its children.
                children = tree.get_children_of_transform(current)
                pending.extend(children)

            log.info(f"removing transform: {current.serialize()}")
            remaining_transforms.remove(current)

        return remaining_transforms, remaining_labels
Beispiel #2
0
    def _add_transform_to_data_view(
        cls,
        transform: Transform,
        updated_transforms: TransformList,
        updated_labels: LabelSequence,
        data_view: DataView,
    ) -> Tuple[TransformList, LabelSequence]:
        """Append ``transform`` and register the labels it produces.

        Args:
            transform: The transform to add.
            updated_transforms: Transforms to start from; re-wrapped in a new
                ``TransformList`` before the append (presumably a copy --
                confirm against the ``TransformList`` constructor).
            updated_labels: Labels to start from; re-wrapped in a new
                ``LabelSequence`` in the same way.
            data_view: Only its ``id`` is read, for logging.

        Returns:
            The ``(transforms, labels)`` pair including the new transform and,
            for an ``EnrichmentTransform``, one ``Label`` per output label
            name added through ``LabelSequence.extendleft``.
        """
        log.info(f"Adding transform to {data_view.id}")
        new_transforms = TransformList(updated_transforms)
        new_labels = LabelSequence(updated_labels)

        new_transforms.append(transform)

        # Only enrichment transforms contribute labels.
        if not isinstance(transform, EnrichmentTransform):
            return new_transforms, new_labels

        produced_labels = [Label(name) for name in transform.output_labels]
        new_labels.extendleft(produced_labels)
        return new_transforms, new_labels
def test_set_comparison():
    """Sets built from TransformLists compare equal only for equal members."""
    exact = ExactMatch("aaa", "bbb")
    exact_twin = ExactMatch("aaa", "bbb")
    has_text = HasText("aaa", "bbb")
    matches_none = DoesNotMatchAny("aaa", ["bbb", "ccc"])

    transform_lists = [
        TransformList([exact, has_text]),
        TransformList([exact_twin, has_text]),
        TransformList([exact, matches_none]),
        TransformList([exact]),
        TransformList([]),
    ]
    sets = [set(members) for members in transform_lists]

    # Index pairs (i < j) whose sets must compare equal; every set also has to
    # equal itself, and all remaining pairs must differ.
    equal_pairs = {(0, 1)}

    for i, left in enumerate(sets):
        for j, right in enumerate(sets[i:], start=i):
            if i == j or (i, j) in equal_pairs:
                assert left == right
            else:
                assert left != right
def test_set_comparison():
    """Equal transforms collapse to one set member; different ones do not."""
    transforms = TransformList([
        ExactMatch("aaa", "bbb"),
        ExactMatch("aaa", "bbb"),
        HasText("aaa", "bbb"),
        DoesNotMatchAny("aaa", ["bbb", "ccc"]),
    ])
    exact = transforms[0]
    exact_twin = transforms[1]
    has_text = transforms[2]
    matches_none = transforms[3]

    # Identical membership.
    assert {exact} == {exact}
    assert {exact, has_text} == {exact, has_text}

    # Two separately constructed but equal transforms are interchangeable.
    assert {exact, has_text} == {exact_twin, has_text}
    assert {exact, exact_twin} == {exact}

    # Different transform types (or arguments) keep the sets apart.
    assert {exact, has_text} != {exact, matches_none}
    assert {exact} != {has_text}
    assert {exact} != {matches_none}
Beispiel #5
0
    def deserialize(cls, d: Dict) -> DataView:
        """Build a ``DataView`` from its serialized dictionary form.

        Args:
            d: Mapping that holds the ids under ``cls.KEY_ID``,
                ``cls.KEY_PARENT_ID``, ``cls.KEY_DATASET_ID`` and
                ``cls.KEY_USER_ID``, the serialized labels under
                ``cls.KEY_COLUMN_LABELS`` and the serialized transforms under
                ``cls.KEY_TRANSFORMS``.

        Returns:
            A ``DataView`` constructed from the deserialized values.

        Raises:
            KeyError: If ``d`` is a dict and one of the keys above is missing.
        """
        # NOTE: the parameter is annotated with a bare ``Dict`` because
        # ``Dict[str]`` is not a valid generic (``typing.Dict`` takes a key
        # type and a value type) and fails once the annotation is evaluated.
        data_view_id = DataViewId(d[cls.KEY_ID])
        parent_data_view_id = DataViewId(d[cls.KEY_PARENT_ID])
        dataset_id = DatasetId(d[cls.KEY_DATASET_ID])
        user_id = UserId(d[cls.KEY_USER_ID])
        labels = LabelSequence.deserialize(d[cls.KEY_COLUMN_LABELS])
        transforms = TransformList.deserialize(d[cls.KEY_TRANSFORMS])

        return DataView(
            data_view_id=data_view_id,
            parent_data_view_id=parent_data_view_id,
            dataset_id=dataset_id,
            user_id=user_id,
            labels=labels,
            transforms=transforms,
        )
Beispiel #6
0
    def __init__(
        self,
        data_view_id: DataViewId,
        parent_data_view_id: DataViewId,
        dataset_id: DatasetId,
        user_id: UserId,
        labels: Optional[LabelSequence] = None,
        transforms: Optional[TransformList] = None,
    ):
        """Store the identifiers, transforms and labels of a data view.

        Args:
            data_view_id: Stored as ``self.id``.
            parent_data_view_id: Stored as ``self.parent_id``.
            dataset_id: Stored as ``self.dataset_id``.
            user_id: Stored as ``self.user_id``.
            labels: Labels of the view; any falsy value (``None`` or an empty
                sequence) is replaced by a new ``LabelSequence``.
            transforms: Transforms of the view; any falsy value is replaced by
                a new ``TransformList``.
        """
        self.id = data_view_id
        self.parent_id = parent_data_view_id
        self.dataset_id = dataset_id
        self.user_id = user_id

        if transforms:
            self.transforms = transforms
        else:
            self.transforms = TransformList()

        if labels:
            self._labels = labels
        else:
            self._labels = LabelSequence()

        # Starts out empty; nothing in this constructor fills it.
        self._label_by_name: Dict[str, Label] = {}
Beispiel #7
0
    def transform_data_view(
        self,
        data_view_id: DataViewId,
        add_transforms: Optional[List[Transform]] = None,
        del_transforms: Optional[List[Transform]] = None,
    ) -> DataView:
        """Derive a data view by removing and adding transforms.

        Deletions are applied first, then additions. If a data view with the
        same dataset and resulting transforms is already indexed, that one is
        returned; otherwise a new data view is created with ``data_view_id``
        as its parent.

        Args:
            data_view_id: Id of the data view to start from.
            add_transforms: Transforms to add; ``None`` means none.
            del_transforms: Transforms to remove; ``None`` means none.

        Returns:
            The already-indexed or newly created ``DataView``.

        Raises:
            ValueError: If no data view exists for ``data_view_id``.
        """
        data_view = self.by_id(data_view_id)
        if data_view is None:
            raise ValueError(f"Could not find DataView for id {data_view_id}")

        updated_transforms = TransformList(data_view.transforms)
        updated_labels = LabelSequence(data_view.labels)

        # Each pass feeds the previous result back in, so every change builds
        # on the ones applied before it.
        for transforms, apply_change in [
            (del_transforms or [], self._delete_transform_from_data_view),
            (add_transforms or [], self._add_transform_to_data_view),
        ]:
            for transform in transforms:
                updated_transforms, updated_labels = apply_change(
                    transform,
                    updated_transforms,
                    updated_labels,
                    data_view,
                )

        # see if this DataView already exists
        serialization = self._serialize_for_cache(
            data_view.dataset_id,
            updated_transforms,
        )

        existing_id = self._data_view_id_by_serialization.get(serialization)
        # Compare against None explicitly: a valid but falsy id (such as 0)
        # must still count as a cache hit instead of creating a duplicate.
        if existing_id is not None:
            log.info(f"using cached DataView {existing_id}")
            return self.by_id(existing_id)

        log.info("saving new DataView")
        return self.create(
            parent=data_view_id,
            user=data_view.user_id,
            dataset=data_view.dataset_id,
            labels=updated_labels,
            transforms=updated_transforms,
        )
    def from_dict(cls, d: Dict) -> Query:
        """Build a ``Query`` from its dictionary representation.

        Every entry found under ``cls.KEY_TRANSFORMS`` is turned into a
        transform: the class is looked up through
        ``transform_manager.transform_by_name`` using the value stored under
        ``cls.KEY_CLASS_NAME`` and instantiated with the keyword arguments
        stored under ``cls.KEY_ARGS`` (none if the key is absent). Entries
        without a class name are logged as errors and skipped.

        Args:
            d: Dictionary form of the query; a missing ``cls.KEY_TRANSFORMS``
                key yields a query without transforms.

        Returns:
            A ``Query`` holding the transforms in their original order.
        """
        transforms: TransformList = TransformList()

        # Named distinctly from the ``d`` parameter so the input dictionary is
        # not shadowed while iterating.
        for transform_dict in d.get(cls.KEY_TRANSFORMS, []):
            class_name = transform_dict.get(cls.KEY_CLASS_NAME, None)
            args = transform_dict.get(cls.KEY_ARGS, {})

            if class_name is None:
                log.error(
                    "Transform has no class name - skipping: {}".format(
                        transform_dict))
                continue

            transform_cls = transform_manager.transform_by_name(class_name)
            transforms.append(transform_cls(**args))

        return Query(transforms=transforms)
Beispiel #9
0
    def create(
        self,
        parent: Optional[Union[DataView, DataViewId]],
        user: Union[User, UserId],
        dataset: Union[Dataset, DatasetId],
        labels: LabelSequence,
        transforms: Optional[TransformList] = None,
    ) -> DataView:
        """Create, index and save a new ``DataView``.

        ``parent``, ``dataset`` and ``user`` may each be given either as an
        object exposing an ``id`` attribute or as the id itself; anything
        without an ``id`` attribute is used as-is.

        Args:
            parent: Parent data view or its id (may be ``None``).
            user: Owning user or its id.
            dataset: Underlying dataset or its id.
            labels: Labels of the new data view.
            transforms: Transforms of the new data view; a falsy value is
                replaced by a new ``TransformList``.

        Returns:
            The newly created ``DataView``.
        """
        log.debug("DataViewHandler.create")

        # Accept either an object carrying ``.id`` or a bare id.
        parent_id = getattr(parent, "id", parent)
        dataset_id = getattr(dataset, "id", dataset)
        user_id = getattr(user, "id", user)

        transforms = transforms or TransformList()

        new_view = DataView(
            data_view_id=DataViewId(self._next_id),
            parent_data_view_id=parent_id,
            dataset_id=dataset_id,
            user_id=user_id,
            labels=labels,
            transforms=transforms,
        )

        self._data_views.append(new_view)
        self._index_data_view(new_view)

        log.info("saving new DataView: %s", new_view.id)
        self.save()

        return new_view
Beispiel #10
0
 def _serialize_for_cache(cls, dataset_id: DatasetId,
                          transforms: TransformList) -> str:
     """Return the JSON string used as cache key for a dataset/transform pair.

     The key is a JSON array of ``dataset_id`` followed by the result of
     ``transforms.serialize()``; a falsy ``transforms`` (``None`` or empty)
     contributes an empty list instead.
     """
     if transforms:
         transform_payload = transforms.serialize()
     else:
         transform_payload = []
     cache_key_parts = [dataset_id, transform_payload]
     return json.dumps(cache_key_parts)