def remove_punctuation(reader, inst, punctuation=PUNCTUATION):
    """Build a new instance from ``inst`` with punctuation tokens removed.

    Every token of ``inst.fields['sentence']`` whose ``.text`` is found in
    ``punctuation`` is dropped together with the label at the same position
    in ``inst.fields['labels'].labels``.

    Args:
        reader: Object exposing ``token_indexers``; these are used to build
            the new ``TextField``.
        inst: Source instance with ``'sentence'`` and ``'labels'`` fields.
        punctuation: Container of token texts to drop.

    Returns:
        A new ``Instance`` with ``"sentence"`` and ``'labels'`` fields. If
        ``inst`` has an ``index`` attribute it is copied to the result.
    """
    kept = [
        (word, label)
        for word, label in zip(
            inst.fields['sentence'],
            inst.fields['labels'].labels,
        )
        if word.text not in punctuation
    ]
    # Split the pairs explicitly instead of unpacking ``zip(*kept)``: when
    # every token is filtered out, ``zip(*[])`` yields nothing and a
    # two-name unpack would raise ValueError.
    words = [word for word, _ in kept]
    labels = [label for _, label in kept]

    sentence = TextField(words, reader.token_indexers)
    label_field = SequenceLabelField(labels=labels, sequence_field=sentence)
    inst_out = Instance({"sentence": sentence, 'labels': label_field})
    if hasattr(inst, 'index'):
        inst_out.index = inst.index
    return inst_out
def text_to_instance(sentence: List[str],
                     tags: List[str] = None,
                     idx: str = None,
                     token_indexers=None) -> Instance:
    """Wrap a list of word strings (and optional tags) in an ``Instance``.

    Args:
        sentence: Word strings; each one is wrapped in a ``Token``.
        tags: Optional per-word labels. A ``"labels"`` field is only added
            when this is truthy.
        idx: Value stored on the returned instance as ``index``.
        token_indexers: Indexers for the ``TextField``; when falsy, a single
            ``SingleIdTokenIndexer`` under the key ``"tokens"`` is used.

    Returns:
        The constructed ``Instance`` with a ``"sentence"`` field and, if tags
        were given, a ``"labels"`` field.
    """
    if not token_indexers:
        token_indexers = {"tokens": SingleIdTokenIndexer()}

    text_field = TextField(list(map(Token, sentence)), token_indexers)
    fields = {"sentence": text_field}
    if tags:
        fields["labels"] = SequenceLabelField(
            labels=tags, sequence_field=text_field
        )

    instance = Instance(fields)
    instance.index = idx
    return instance
def text_to_instance(  # type: ignore
    self,
    tokens: List[Token],
    tags: List[str] = None,
    idx: str = None,
) -> Instance:
    """
    Build an ``Instance`` from already-tokenized input.

    The input must be `pre-tokenized` because this class has no tokenizer
    of its own. The instance carries a ``"tokens"`` text field, a
    ``"metadata"`` field holding the token texts under ``"words"``, and a
    ``"tags"`` field when ``tags`` is not ``None``. ``idx`` is stored on the
    instance as ``index``.
    """
    text_field = TextField(tokens, self._token_indexers)
    words = [token.text for token in tokens]
    fields: Dict[str, Field] = {
        "tokens": text_field,
        "metadata": MetadataField({"words": words}),
    }
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, text_field)

    instance = Instance(fields)
    instance.index = idx
    return instance