def extract(self, pack: DataPack, instance: Annotation) -> Feature:
    r"""Build the character-level feature of one instance.

    Each entry of the configured ``entry_type`` found within ``instance``
    contributes one character sequence. When a vocabulary is set, every
    character is converted with ``element2repr``; otherwise the raw
    characters of the entry text are kept.

    Args:
        pack (DataPack): The datapack that contains the current instance.
        instance (Annotation): The instance from which the character
            sequences are extracted.

    Returns:
        Feature: a feature with ``dim`` 2 holding one character sequence
        per entry, plus padding metadata and the vocabulary.
    """
    char_seqs = [
        [self.element2repr(ch) for ch in token.text]
        if self.vocab
        else list(token.text)
        for token in pack.get(self.config.entry_type, instance)
    ]
    # -1 mirrors the "nothing seen yet" sentinel used when there are no
    # entries, so an empty instance never triggers clipping.
    longest = max((len(seq) for seq in char_seqs), default=-1)

    # Clip only when a limit is configured and at least one sequence
    # actually exceeds it.
    limit = getattr(self.config, "max_char_length", None)
    if limit is not None and limit < longest:
        char_seqs = [seq[:limit] for seq in char_seqs]

    return Feature(
        data=char_seqs,
        metadata={
            "need_pad": self.config.need_pad,
            "pad_value": self.get_pad_value(),
            "dim": 2,
            "dtype": int if self.vocab else str,
        },
        vocab=self.vocab,
    )
def extract(self, pack: DataPack, instance: Annotation) -> Feature:
    r"""Build the sequence tagging feature of one instance.

    The tag pairs come from ``bio_tagging``. When a vocabulary is set,
    each pair is converted with ``element2repr``; otherwise the pairs
    are returned as they are.

    Args:
        pack (DataPack): The datapack that contains the current instance.
        instance (Annotation): The instance from which the tag sequence
            is extracted.

    Returns (Feature):
        a feature with ``dim`` 1 that contains the extracted tag
        sequence, its pad value and the vocabulary.
    """
    tagged_pairs: List[Tuple[Optional[str], str]] = bio_tagging(
        pack,
        instance,
        self.config.tagging_unit,
        self.config.entry_type,
        self.config.attribute,
    )

    # Without a vocabulary the original pairs are passed through untouched.
    sequence: List = tagged_pairs
    if self.vocab:
        mapped: List[Union[int, List[int]]] = [
            self.element2repr(pair) for pair in tagged_pairs
        ]
        sequence = mapped

    meta_data = {
        "pad_value": self.get_pad_value(),
        "dim": 1,
        "dtype": int if self.vocab else tuple,
    }
    return Feature(data=sequence, metadata=meta_data, vocab=self.vocab)
def extract(self, pack: DataPack, instance: Annotation) -> Feature:
    r"""Collect one attribute from every entry of one instance.

    For example, the text of the tokens in one sentence. The attribute
    named by ``config.attribute`` is read through ``get_attribute`` and,
    when a vocabulary is set, converted with ``element2repr``.

    Args:
        pack (DataPack): The datapack that contains the current instance.
        instance (Annotation): The instance from which the attributes
            are extracted.

    Returns:
        Feature: a feature with ``dim`` 1 that contains the extracted
        values, padding metadata and the vocabulary.
    """
    attribute_name = self.config.attribute
    raw_values = (
        self.get_attribute(entry, attribute_name)
        for entry in pack.get(self.config.entry_type, instance)
    )
    values = [
        self.element2repr(raw) if self.vocab else raw for raw in raw_values
    ]

    return Feature(
        data=values,
        metadata={
            "need_pad": self.config.need_pad,
            "pad_value": self.get_pad_value(),
            "dim": 1,
            "dtype": int if self.vocab else Any,
        },
        vocab=self.vocab,
    )
def extract(self, pack: DataPack, context: Optional[Annotation] = None) -> Feature:
    """Extract one attribute from each entry of the configured type.

    The entry type comes from the extractor config ``entry_type`` and the
    attribute name from ``attribute``.

    Args:
        pack (DataPack): The datapack that contains the current instance.
        context (Annotation): The context is an Annotation entry where
            features will be extracted within its range. If None, then
            the whole data pack will be used as the context. Default is
            None.

    Returns:
        A feature with ``dim`` 1 holding the attribute values found in
        the context. When a vocabulary is set, each value is converted
        with ``element2repr``; otherwise the raw values are kept.
    """
    attribute_name = self.config.attribute
    raw_values = (
        self._get_attribute(entry, attribute_name)
        for entry in pack.get(self.config.entry_type, context)
    )
    values = [
        self.element2repr(raw) if self.vocab else raw for raw in raw_values
    ]

    return Feature(
        data=values,
        metadata={
            "need_pad": self.config.need_pad,
            "pad_value": self.get_pad_value(),
            "dim": 1,
            "dtype": int if self.vocab else Any,
        },
        vocab=self.vocab,
    )
def extract(self, pack: DataPack, instance: Annotation) -> Feature:
    r"""Build the subword feature of one instance.

    Subwords that are not the first segment get a ``##`` prefix before
    conversion with ``element2repr``. The converted sequence is wrapped
    with the representations of ``[CLS]`` and ``[SEP]``.

    Args:
        pack (DataPack): The datapack that contains the current instance.
        instance (Annotation): The instance from which the subwords are
            extracted.

    Returns:
        Feature: a feature with ``dim`` 1 that contains the converted
        subword sequence, padding metadata and the vocabulary.
    """
    body = [
        self.element2repr(
            piece.text if piece.is_first_segment else "##" + piece.text
        )
        for piece in pack.get(self._entry_type, instance)
    ]
    # The boundary markers are looked up after the subwords.
    cls_repr = self.element2repr("[CLS]")
    sep_repr = self.element2repr("[SEP]")

    return Feature(
        data=[cls_repr, *body, sep_repr],
        metadata={
            "need_pad": self.vocab.use_pad,  # type: ignore
            "pad_value": self.get_pad_value(),
            "dim": 1,
            "dtype": int,
        },
        vocab=self.vocab,
    )
def extract(self, pack: DataPack, instance: Annotation) -> Feature:
    r"""Build the sequence tagging feature of one instance.

    The tag pairs come from ``bio_tagging``. Each pair is converted with
    ``element2repr`` when a vocabulary is set, and kept unchanged
    otherwise.

    Args:
        pack (DataPack): The datapack that contains the current instance.
        instance (Annotation): The instance from which the tag sequence
            is extracted.

    Returns:
        Feature: a feature with ``dim`` 1 that contains the extracted
        tag sequence, its pad value and the vocabulary.
    """
    tagged_pairs: List[Tuple[Optional[str], str]] = bio_tagging(
        pack,
        instance,
        self.config.tagging_unit,
        self.config.entry_type,
        self.config.attribute,
    )
    sequence = [
        self.element2repr(pair) if self.vocab else pair
        for pair in tagged_pairs
    ]

    return Feature(
        data=sequence,
        metadata={
            "pad_value": self.get_pad_value(),
            "dim": 1,
            "dtype": int if self.vocab else tuple,
        },
        vocab=self.vocab,
    )
def extract(self, pack: DataPack, context: Optional[Annotation] = None) -> Feature:
    r"""Build the sequence tagging feature within a context.

    The tag pairs come from ``bio_tagging``. When a vocabulary is set,
    each pair is converted with ``element2repr`` and, if
    ``config.is_bert`` is true, the sequence is wrapped with one pad
    value on each side.

    Args:
        pack (DataPack): The datapack that contains the current instance.
        context (Annotation): The context is an Annotation entry where
            features will be extracted within its range. If None, then
            the whole data pack will be used as the context. Default is
            None.

    Returns (Feature):
        a feature with ``dim`` 1 that contains the extracted tag
        sequence together with its padding metadata and the vocabulary.
    """
    tagged_pairs: List[Tuple[Optional[str], str]] = bio_tagging(
        pack,
        self.config.tagging_unit,
        self.config.entry_type,
        self.config.attribute,
        context,
    )
    pad_value = self.get_pad_value()

    if not self.vocab:
        # No vocabulary: hand back the original pairs and take the
        # padding flag from the config.
        sequence: List = tagged_pairs
        need_pad = self.config.need_pad
    else:
        mapped: List[Union[int, List[int]]] = [
            self.element2repr(pair) for pair in tagged_pairs
        ]
        # With is_bert set, one pad value is placed before and after the
        # mapped tags.
        sequence = (
            [pad_value, *mapped, pad_value] if self.config.is_bert else mapped
        )
        need_pad = self.vocab.use_pad

    return Feature(
        data=sequence,
        metadata={
            "need_pad": need_pad,
            "pad_value": pad_value,
            "dim": 1,
            "dtype": int if self.vocab else tuple,
        },
        vocab=self.vocab,
    )
def extract(self, pack: DataPack, context: Optional[Annotation] = None) -> Feature:
    """Extract link data as features from the context.

    For every link of the configured ``entry_type``, the attribute named
    by ``config.attribute`` becomes one data element, converted with
    ``element2repr`` when a vocabulary is set. The link's parent and
    child are each passed to ``get_index`` together with the annotations
    of type ``config.index_annotation``, and the results are stored in
    the metadata.

    Args:
        pack (DataPack): The input data pack that contains the features.
        context (Annotation): The context is an Annotation entry where
            features will be extracted within its range. If None, then
            the whole data pack will be used as the context. Default is
            None.

    Returns:
        A feature with ``dim`` 1 whose data is the list of link
        attributes and whose metadata carries ``parent_unit_span`` and
        ``child_unit_span``, one element per link.
    """
    unit_annotations: List[Annotation] = list(
        pack.get(self.config.index_annotation, context)
    )

    # First pass: read every link's endpoints and attribute.
    endpoints = []
    attributes = []
    link: Link
    for link in pack.get(self.config.entry_type, context):
        endpoints.append((link.get_parent(), link.get_child()))  # type: ignore
        attr_value = getattr(link, self.config.attribute)
        attributes.append(
            self.element2repr(attr_value) if self.vocab else attr_value
        )

    # Second pass: look up each endpoint against the index annotations.
    parent_unit_span = []
    child_unit_span = []
    for parent, child in endpoints:
        parent_unit_span.append(get_index(pack, unit_annotations, parent))
        child_unit_span.append(get_index(pack, unit_annotations, child))

    return Feature(
        data=attributes,
        metadata={
            "parent_unit_span": parent_unit_span,
            "child_unit_span": child_unit_span,
            "pad_value": self.get_pad_value(),
            "dim": 1,
            "dtype": int if self.vocab else str,
        },
        vocab=self.vocab,
    )
def extract(
    self, pack: DataPack, context: Optional[Annotation] = None
) -> Feature:
    r"""Build the subword feature within a context.

    Entries of type ``config.subword_class`` are read from the context.
    Subwords that are not the first segment get a ``##`` prefix before
    conversion with ``element2repr``. The converted sequence is wrapped
    with the representations of ``[CLS]`` and ``[SEP]``.

    Args:
        pack (DataPack): The datapack that contains the current instance.
        context (Annotation): The context is an Annotation entry where
            features will be extracted within its range. If None, then
            the whole data pack will be used as the context. Default is
            None.

    Returns:
        Feature: a feature with ``dim`` 1 that contains the converted
        subword sequence, padding metadata and the vocabulary.
    """
    body = [
        self.element2repr(
            piece.text  # type: ignore
            if piece.is_first_segment  # type: ignore
            else "##" + piece.text  # type: ignore
        )
        for piece in pack.get(self.config.subword_class, context)
    ]
    # The boundary markers are looked up after the subwords.
    cls_repr = self.element2repr("[CLS]")
    sep_repr = self.element2repr("[SEP]")

    return Feature(
        data=[cls_repr, *body, sep_repr],
        metadata={
            "need_pad": self.vocab.use_pad,  # type: ignore
            "pad_value": self.get_pad_value(),
            "dim": 1,
            "dtype": int,
        },
        vocab=self.vocab,
    )