Beispiel #1
0
    def process_entry(self, entry):
        """
        Parse one JSON entry into a Document and return its JSON form(s).

        The parsed document is tagged with ``self.dataset``. If
        ``self.max_tokens_per_doc`` is positive, the document is split with
        that value as the limit and every split is serialized; otherwise the
        document is serialized as a single-element list.
        """
        document = Document.from_json(entry)
        document.dataset = self.dataset

        # A non-positive limit means "do not split".
        pieces = (
            document.split(self.max_tokens_per_doc)
            if self.max_tokens_per_doc > 0
            else [document]
        )

        serialized = []
        for piece in pieces:
            serialized.append(piece.to_json())
        return serialized
Beispiel #2
0
    def text_to_instance(self, doc_text: Dict[str, Any]):
        """
        Build an Instance from a document given in its dict (JSON) form.

        The dict is parsed into a Document, the sentence-level fields are
        produced by ``self._process_sentence_fields``, and the parsed Document
        itself is attached under the ``"metadata"`` key.
        """
        document = Document.from_json(doc_text)

        instance_fields = self._process_sentence_fields(document)
        instance_fields.update({"metadata": MetadataField(document)})

        return Instance(instance_fields)
Beispiel #3
0
    def text_to_instance(self, doc_text: Dict[str, Any]):
        """
        Build an Instance from a document given in its dict (JSON) form.

        The dict is parsed into a Document and validated before the
        sentence-level fields are produced by
        ``self._process_sentence_fields``; the parsed Document is attached
        under the ``"metadata"`` key.

        Raises:
            ValueError: if the document has no sentences, or if any sentence
                has fewer than two tokens.
        """
        doc = Document.from_json(doc_text)

        sent_lengths = [len(x) for x in doc.sentences]

        # Guard the empty case explicitly: min() on an empty list would raise
        # an opaque "min() arg is an empty sequence" ValueError instead.
        if not sent_lengths:
            raise ValueError(f"Document {doc.doc_key} has no sentences.")

        # Make sure there are no single-token sentences; these break things.
        if min(sent_lengths) < 2:
            msg = (
                f"Document {doc.doc_key} has a sentence with a single token or no tokens. "
                "Please merge with another sentence or remove.")
            raise ValueError(msg)

        fields = self._process_sentence_fields(doc)
        fields["metadata"] = MetadataField(doc)

        return Instance(fields)
Beispiel #4
0
    def text_to_instance(self, doc_text: Dict[str, Any]):
        """
        Build an Instance from a document given in its dict (JSON) form.

        The dict is parsed into a Document. Suspicious documents (no
        sentences, or a sentence with fewer than two tokens) only trigger a
        warning; processing continues regardless. The sentence-level fields
        come from ``self._process_sentence_fields`` and the parsed Document is
        attached under the ``"metadata"`` key.
        """
        # Parse the dict into a Document object.
        doc = Document.from_json(doc_text)

        sent_lengths = [len(x) for x in doc.sentences]
        if not sent_lengths:
            # min() on an empty list would raise ValueError; this check is
            # meant to warn, not abort, so handle the empty case separately.
            warnings.warn(
                f"Document {doc.doc_key} has no sentences. "
                "This may break the modeling code.")
        elif min(sent_lengths) < 2:
            # Single-token (or empty) sentences are known to cause trouble
            # downstream, so flag them.
            msg = (
                f"Document {doc.doc_key} has a sentence with a single token or no tokens. "
                "This may break the modeling code.")
            warnings.warn(msg)

        fields = self._process_sentence_fields(doc)
        fields["metadata"] = MetadataField(doc)

        return Instance(fields)