def process_entry(self, entry):
    """Deserialize one JSON entry into a Document, tag it with this reader's
    dataset name, optionally split it into token-bounded chunks, and return
    the chunk(s) re-serialized as JSON dicts.

    A non-positive ``self.max_tokens_per_doc`` disables splitting and the
    document is returned whole (as a one-element list).
    """
    document = Document.from_json(entry)
    document.dataset = self.dataset
    # Split only when a positive token budget is configured.
    limit = self.max_tokens_per_doc
    pieces = document.split(limit) if limit > 0 else [document]
    return [piece.to_json() for piece in pieces]
def text_to_instance(self, doc_text: Dict[str, Any]):
    """
    Convert a Document object into an instance.
    """
    # Deserialize, build the per-sentence fields, and attach the whole
    # document as metadata so downstream code can recover it.
    document = Document.from_json(doc_text)
    instance_fields = self._process_sentence_fields(document)
    instance_fields["metadata"] = MetadataField(document)
    return Instance(instance_fields)
def text_to_instance(self, doc_text: Dict[str, Any]):
    """
    Convert a Document object into an instance.

    Raises
    ------
    ValueError
        If the document has no sentences, or any sentence has fewer than
        two tokens; such documents break the downstream modeling code.
    """
    doc = Document.from_json(doc_text)
    # Make sure there are no single-token sentences; these break things.
    # Guard the empty-document case explicitly: ``min([])`` would otherwise
    # raise a bare, uninformative ValueError.
    sent_lengths = [len(x) for x in doc.sentences]
    if not sent_lengths or min(sent_lengths) < 2:
        # Fixed typo in the original message: "Documnt" -> "Document".
        msg = (
            f"Document {doc.doc_key} has a sentence with a single token or no tokens. "
            "Please merge with another sentence or remove.")
        raise ValueError(msg)
    fields = self._process_sentence_fields(doc)
    fields["metadata"] = MetadataField(doc)
    return Instance(fields)
def text_to_instance(self, doc_text: Dict[str, Any]):
    """
    Convert a JSON-style document dict into an ``Instance``.

    Unlike a strict reader, a sentence with fewer than two tokens only
    triggers a warning; processing continues regardless.
    """
    # Deserialize the incoming dict into a Document object.
    doc = Document.from_json(doc_text)
    # Very short sentences may break the modeling code — warn, don't fail.
    shortest = min(len(sentence) for sentence in doc.sentences)
    if shortest < 2:
        warnings.warn(
            f"Document {doc.doc_key} has a sentence with a single token or no tokens. "
            "This may break the modeling code.")
    fields = self._process_sentence_fields(doc)
    fields["metadata"] = MetadataField(doc)
    return Instance(fields)