Ejemplo n.º 1
0
    def write_labels(self,
                     labels: Union[List[Label], List[dict]],
                     index: Optional[str] = None):
        """Write annotation labels to the label index with one bulk request.

        :param labels: Label objects or dictionaries. Dictionaries are converted
                       with ``Label.from_dict`` before being written.
        :param index: Name of the index to write to. ``self.label_index`` is used
                      when no index is given. A missing index is created through
                      ``self._create_label_index`` first.
        """
        target_index = index or self.label_index
        if target_index and not self.client.indices.exists(index=target_index):
            self._create_label_index(target_index)

        # Normalise the input so that everything below works on Label objects.
        normalized = [
            item if not isinstance(item, dict) else Label.from_dict(item)
            for item in labels
        ]

        # One bulk action per label. Existing entries are only overwritten
        # ("index") when update_existing_documents is set, otherwise "create".
        actions = [
            {
                "_op_type": "index" if self.update_existing_documents else "create",
                "_index": target_index,
                **label.to_dict(),
            }
            for label in normalized
        ]  # type: List[Dict[str, Any]]

        bulk(self.client,
             actions,
             request_timeout=300,
             refresh=self.refresh_type)
Ejemplo n.º 2
0
    def write_labels(self,
                     labels: Union[List[dict], List[Label]],
                     index: Optional[str] = None):
        """Store annotation labels in the in-memory index, keyed by ``Label.id``.

        A warning listing the affected ids is logged when
        ``self._get_duplicate_labels`` reports labels for this index. Labels
        without ``created_at`` get the current local time, and a missing
        ``updated_at`` is set to ``created_at``.

        :param labels: Label objects or dictionaries (converted via ``Label.from_dict``).
        :param index: Name of the index to write to; defaults to ``self.label_index``.
        """
        index = index or self.label_index
        label_objects = [
            item if not isinstance(item, dict) else Label.from_dict(item)
            for item in labels
        ]

        duplicate_ids: list = [
            duplicate.id
            for duplicate in self._get_duplicate_labels(label_objects, index=index)
        ]
        if duplicate_ids:
            problematic = ','.join(duplicate_ids)
            logger.warning(
                f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
                f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
                f" the answer annotation and not the question."
                f" Problematic ids: {problematic}")

        timestamp_format = "%Y-%m-%d %H:%M:%S"
        for current in label_objects:
            # Fill in timestamps that the caller did not provide.
            if not current.created_at:
                current.created_at = time.strftime(timestamp_format)
            if not current.updated_at:
                current.updated_at = current.created_at
            # Assigning by id replaces any entry already stored under that id.
            self.indexes[index][current.id] = current
Ejemplo n.º 3
0
 def get_all_labels(
         self,
         index: Optional[str] = None,
         filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
     """Return the labels stored in an index as ``Label`` objects.

     :param index: Name of the index to read from; defaults to ``self.label_index``.
     :param filters: Forwarded unchanged to ``self.get_all_documents_in_index``.
     :return: One ``Label`` per hit, built from the hit's ``_source`` entry.
     """
     hits = self.get_all_documents_in_index(index=index or self.label_index,
                                            filters=filters)
     return [Label.from_dict(hit["_source"]) for hit in hits]
Ejemplo n.º 4
0
    def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
        """Store annotation labels in the in-memory index under fresh UUID keys.

        :param labels: Label objects or dictionaries (converted via ``Label.from_dict``).
        :param index: Name of the index to write to; defaults to ``self.label_index``.
        """
        target_index = index or self.label_index

        # Convert everything up front so a bad dictionary fails before anything is stored.
        converted = []
        for item in labels:
            converted.append(Label.from_dict(item) if isinstance(item, dict) else item)

        for entry in converted:
            # Every stored label gets a newly generated key, independent of the label itself.
            key = str(uuid4())
            self.indexes[target_index][key] = entry
Ejemplo n.º 5
0
 def get_all_labels(
     self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None, batch_size: int = 10_000
 ) -> List[Label]:
     """
     Fetch all labels of an index and return them as ``Label`` objects.

     :param index: Name of the index to read from; defaults to ``self.label_index``.
     :param filters: Forwarded unchanged to ``self._get_all_documents_in_index``.
     :param batch_size: Forwarded unchanged to ``self._get_all_documents_in_index``.
     :return: One ``Label`` per hit, built from the hit's ``_source`` entry.
     """
     target_index = index or self.label_index
     documents = self._get_all_documents_in_index(index=target_index, filters=filters, batch_size=batch_size)
     # Collect every hit first, then convert them.
     hits = list(documents)
     return [Label.from_dict(hit["_source"]) for hit in hits]
Ejemplo n.º 6
0
    def write_labels(self,
                     labels: Union[List[Label], List[dict]],
                     index: Optional[str] = None,
                     batch_size: int = 10_000):
        """Write annotation labels into document store.

        :param labels: A list of Python dictionaries or a list of Haystack Label objects.
        :param index: Name of the index to write to. ``self.label_index`` is used when
                      omitted; a missing index is created via ``self._create_label_index``.
        :param batch_size: Number of labels that are passed to Elasticsearch's bulk function at a time.
        """
        index = index or self.label_index
        if index and not self.client.indices.exists(index=index):
            self._create_label_index(index)

        def _send(actions):
            # Single place for the bulk call so both flush points use the same settings.
            bulk(self.client,
                 actions,
                 request_timeout=300,
                 refresh=self.refresh_type)

        pending = []  # type: List[Dict[str, Any]]
        for item in labels:
            # Make sure we comply to Label class format
            label = Label.from_dict(item) if isinstance(item, dict) else item

            # Fill in timestamps that the caller did not provide.
            if not label.created_at:
                label.created_at = time.strftime("%Y-%m-%d %H:%M:%S")
            if not label.updated_at:
                label.updated_at = label.created_at

            action = {
                "_op_type": "index" if self.update_existing_documents else "create",
                "_index": index,
                **label.to_dict(),
            }  # type: Dict[str, Any]

            # Elasticsearch expects the identifier under "_id" rather than "id".
            if label.id is not None:
                action["_id"] = str(action.pop("id"))

            pending.append(action)

            # Flush as soon as a full batch has accumulated.
            if len(pending) % batch_size == 0:
                _send(pending)
                pending = []

        # Write whatever is left over from the last, partially filled batch.
        if pending:
            _send(pending)
Ejemplo n.º 7
0
    def write_labels(self, labels: Union[List[dict], List[Label]], index: Optional[str] = None):
        """Store annotation labels in the in-memory index under fresh UUID keys.

        Labels without ``created_at`` get the current local time, and a missing
        ``updated_at`` is set to ``created_at``.

        :param labels: Label objects or dictionaries (converted via ``Label.from_dict``).
        :param index: Name of the index to write to; defaults to ``self.label_index``.
        """
        target_index = index or self.label_index

        # Convert everything up front so a bad dictionary fails before anything is stored.
        converted = []
        for item in labels:
            converted.append(Label.from_dict(item) if isinstance(item, dict) else item)

        timestamp_format = "%Y-%m-%d %H:%M:%S"
        for entry in converted:
            # Every stored label gets a newly generated key, independent of the label itself.
            key = str(uuid4())
            if not entry.created_at:
                entry.created_at = time.strftime(timestamp_format)
            if not entry.updated_at:
                entry.updated_at = entry.created_at
            self.indexes[target_index][key] = entry
Ejemplo n.º 8
0
    def write_labels(self, labels, index=None):
        """Add annotation labels to the session as ``LabelORM`` rows and commit once.

        :param labels: Label objects or dictionaries (converted via ``Label.from_dict``).
        :param index: Index name stored on every row; defaults to ``self.label_index``.
        """
        label_objects = [
            item if not isinstance(item, dict) else Label.from_dict(item)
            for item in labels
        ]
        index = index or self.label_index

        # Attributes copied one-to-one from the Label onto the ORM row.
        copied_fields = (
            "document_id",
            "no_answer",
            "origin",
            "question",
            "is_correct_answer",
            "is_correct_document",
            "answer",
            "offset_start_in_doc",
            "model_id",
        )
        for label in label_objects:
            row_values = {field: getattr(label, field) for field in copied_fields}
            row = LabelORM(**row_values, index=index)
            self.session.add(row)

        # A single commit covers all rows added above.
        self.session.commit()
Ejemplo n.º 9
0
    def write_labels(self, labels, index=None):
        """Write annotation labels into document store.

        Labels whose id is reported by ``self._get_duplicate_labels`` are passed to
        ``session.merge``; all other labels are passed to ``session.add``. A warning
        listing the duplicate ids is logged, and a single commit is issued at the end.

        :param labels: Label objects or dictionaries (converted via ``Label.from_dict``).
        :param index: Index name stored on every row; defaults to ``self.label_index``.
        """

        labels = [
            Label.from_dict(l) if isinstance(l, dict) else l for l in labels
        ]
        index = index or self.label_index

        duplicate_ids: list = [
            label.id
            for label in self._get_duplicate_labels(labels, index=index)
        ]
        if duplicate_ids:
            logger.warning(
                f"Duplicate Label IDs: Inserting a Label whose id already exists in this document store."
                f" This will overwrite the old Label. Please make sure Label.id is a unique identifier of"
                f" the answer annotation and not the question."
                f" Problematic ids: {','.join(duplicate_ids)}")

        # Membership is checked once per label below; a set keeps each check
        # constant-time instead of scanning the duplicate list every time.
        duplicate_id_lookup = set(duplicate_ids)

        # TODO: Use batch_size
        for label in labels:
            label_orm = LabelORM(
                id=label.id,
                document_id=label.document_id,
                no_answer=label.no_answer,
                origin=label.origin,
                question=label.question,
                is_correct_answer=label.is_correct_answer,
                is_correct_document=label.is_correct_document,
                answer=label.answer,
                offset_start_in_doc=label.offset_start_in_doc,
                model_id=label.model_id,
                index=index,
            )
            if label.id in duplicate_id_lookup:
                # The id already exists in the store: merge instead of inserting a new row.
                self.session.merge(label_orm)
            else:
                self.session.add(label_orm)
        self.session.commit()