Example #1
    def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
        data_pack: DataPack = DataPack()

        sent_begin: int = 0
        doc_text: str = ""

        with open(file_path, encoding="utf8") as doc:
            for para in doc:
                para = self.preprocess_reviews(para)
                sents = para.split("\n")
                for sent in sents:
                    if len(sent) > 0:
                        sent = sent.strip()
                        doc_text += sent + " "
                        doc_offset = sent_begin + len(sent) + 1
                        # Add sentences.
                        Sentence(data_pack, sent_begin, doc_offset - 1)
                        sent_begin = doc_offset

        pos_dir: str = os.path.basename(os.path.dirname(file_path))
        movie_file: str = os.path.basename(file_path)
        title: List = movie_file.split('_')
        doc_id: str = pos_dir + title[0]
        score: float = float(title[1].split('.')[0])
        score /= 10.0

        data_pack.pack_name = doc_id
        data_pack.set_text(doc_text)

        # Add documents.
        document: Document = Document(data_pack, 0, len(doc_text))
        document.sentiment = {doc_id: score}

        yield data_pack
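The offset arithmetic above assumes every sentence is followed by exactly one space in `doc_text`. A minimal standalone sketch (made-up sentences, no Forte involved) of how the spans line up:

    doc_text = ""
    sent_begin = 0
    spans = []
    for sent in ["Great movie.", "Would watch again."]:
        doc_text += sent + " "
        doc_offset = sent_begin + len(sent) + 1
        spans.append((sent_begin, doc_offset - 1))  # span of this sentence
        sent_begin = doc_offset
    # spans == [(0, 12), (13, 31)]; doc_text[13:31] == "Would watch again."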
Example #2
def get_index(pack: DataPack, index_entries: List[Annotation],
              context_entry: Annotation):
    founds = []
    for i, entry in enumerate(index_entries):
        if pack.covers(context_entry, entry):
            founds.append(i)
    return [founds[0], founds[-1] + 1]
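Note that `get_index` assumes `context_entry` covers at least one entry and that the covered entries are contiguous in `index_entries`; it returns a half-open `[first, last + 1)` range suitable for slicing. A hypothetical usage (variable names are illustrative, not from the library):

    # tokens: all Token annotations of the pack, in document order.
    # sentence: one Sentence annotation used as the context.
    start, stop = get_index(pack, tokens, sentence)
    tokens_in_sentence = tokens[start:stop]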
Example #3
    def pack(self, data_pack: DataPack, output_dict: Optional[Dict] = None):
        r"""Add corresponding fields to data_pack"""
        if output_dict is None:
            return

        for i in range(len(output_dict["RelationLink"]["parent.tid"])):
            for j in range(len(output_dict["RelationLink"]["parent.tid"][i])):
                link = RelationLink(data_pack)
                link.rel_type = output_dict["RelationLink"]["rel_type"][i][j]
                parent: EntityMention = data_pack.get_entry(  # type: ignore
                    output_dict["RelationLink"]["parent.tid"][i][j])
                link.set_parent(parent)
                child: EntityMention = data_pack.get_entry(  # type: ignore
                    output_dict["RelationLink"]["child.tid"][i][j])
                link.set_child(child)
                data_pack.add_or_get_entry(link)
Example #4
    def extract(self, pack: DataPack, instance: Annotation) -> Feature:
        r"""Extract the subword feature of one instance.

        Args:
            pack (DataPack): The data pack that contains the current
                instance.
            instance (Annotation): The instance from which the
                extractor will extract the feature.

        Returns:
            Feature: a feature that contains the extracted data.
        """
        data = []
        for subword in pack.get(self._entry_type, instance):
            text = subword.text
            if not subword.is_first_segment:
                text = "##" + text
            data.append(self.element2repr(text))

        data = ([self.element2repr("[CLS]")] + data +
                [self.element2repr("[SEP]")])

        meta_data = {
            "need_pad": self.vocab.use_pad,  # type: ignore
            "pad_value": self.get_pad_value(),
            "dim": 1,
            "dtype": int,
        }

        return Feature(data=data, metadata=meta_data, vocab=self.vocab)
Example #5
    def extract(self, pack: DataPack, instance: Annotation) -> Feature:
        r"""Extract the character feature of one instance.

        Args:
            pack (DataPack): The data pack that contains the current
                instance.
            instance (Annotation): The instance from which the
                extractor will extract the feature.

        Returns:
            Feature: a feature that contains the extracted data.
        """
        data = []
        max_char_length = -1

        for word in pack.get(self._entry_type, instance):
            if self.vocab:
                data.append([self.element2repr(char) for char in word.text])
            else:
                data.append(list(word.text))
            max_char_length = max(max_char_length, len(data[-1]))

        if (hasattr(self.config, "max_char_length")
                and self.config.max_char_length is not None
                and self.config.max_char_length < max_char_length):
            data = [token[:self.config.max_char_length] for token in data]

        meta_data = {
            "need_pad": self.config.need_pad,
            "pad_value": self.get_pad_value(),
            "dim": 2,
            "dtype": int if self.vocab else str,
        }
        return Feature(data=data, metadata=meta_data, vocab=self.vocab)
Example #6
    def update_vocab(self,
                     pack: DataPack,
                     context: Optional[Annotation] = None):
        r"""Get all attributes of one instance and add them into the vocabulary.

        Args:
            pack (DataPack): The data pack input to extract vocabulary.
            context (Annotation): The context is an Annotation entry where
                features will be extracted within its range. If None, then the
                whole data pack will be used as the context. Default is None.
        """
        entry: Entry
        for entry in pack.get(self.config.entry_type, context):
            # The following pylint skip due to a bug:
            # https://github.com/PyCQA/pylint/issues/3507
            # Hashable is not recognized as a type.
            # pylint: disable=isinstance-second-argument-not-valid-type
            element = self._get_attribute(entry, self.config.attribute)
            if not isinstance(element, Hashable):
                raise AttributeError(
                    "Only hashable elements can be "
                    "added into the vocabulary. Consider setting "
                    "vocab_method to raw and do not call update_vocab "
                    "if you only need the raw attribute values without "
                    "converting them into indices.")
            self.add(element)
Example #7
    def _get_data_batch(
            self, data_pack: DataPack, context_type: Type[Annotation],
            requests: Optional[Dict[Type[Entry], Union[Dict, List]]] = None,
            offset: int = 0) -> Iterable[Tuple[Dict, int]]:
        r"""Try to get batches from a dataset  with ``batch_size``, but will
        yield an incomplete batch if the data_pack is exhausted.

        Returns:
            An iterator of tuples ``(batch, cnt)``, ``batch`` is a dict
            containing the required annotations and context, and ``cnt`` is
            the number of instances in the batch.
        """
        instances: List[Dict] = []
        current_size = sum(self.current_batch_sources)

        for data in data_pack.get_data(context_type, requests, offset):
            instances.append(data)
            if len(instances) == self.batch_size - current_size:
                batch = batch_instances(instances)
                self.batch_is_full = True
                yield (batch, len(instances))
                instances = []
                self.batch_is_full = False

        # Flush the remaining data.
        if len(instances) > 0:
            batch = batch_instances(instances)
            yield (batch, len(instances))
Example #8
    def _process(self, data_pack: DataPack):
        """
        Process the data pack to collect vocabulary information.

        Args:
            data_pack: The NER data pack used to build the vocabulary.
        """
        for instance in data_pack.get_data(context_type=Sentence,
                                           request={
                                               Token: ["chunk", "pos", "ner"]
                                           }):
            for token in instance["Token"]["text"]:
                for char in token:
                    self.char_cnt[char] += 1
                word = self.normalize_func(token)
                self.word_cnt[word] += 1

            for pos in instance["Token"]["pos"]:
                self.pos_cnt[pos] += 1
            for chunk in instance["Token"]["chunk"]:
                self.chunk_cnt[chunk] += 1
            for ner in instance["Token"]["ner"]:
                self.ner_cnt[ner] += 1
Example #9
    def _parse_pack(self, doc_info: Tuple[str, str]) -> Iterator[DataPack]:
        r"""Takes the `doc_info` returned by the `_collect` method and returns a
        `data_pack` that either contains entry of the type `Query`, or contains
        an entry of the type Document.

        Args:
            doc_info: document info to be populated in the data_pack.

        Returns: query or document data_pack.
        """
        data_pack: DataPack = DataPack()

        doc_id, doc_text = doc_info
        data_pack.pack_name = doc_id

        data_pack.set_text(doc_text)

        # Add a document annotation covering the full text.
        Document(data_pack, 0, len(doc_text))

        yield data_pack
Example #10
 def _process(self, input_pack: DataPack):
     token_entries = list(
         input_pack.get(entry_type=Token, components=self.token_component))
     token_texts = [token.text for token in token_entries]
     taggings = pos_tag(token_texts)
     for token, tag in zip(token_entries, taggings):
         token.pos = tag[1]
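`pos_tag` here is NLTK's part-of-speech tagger, which returns a list of `(word, tag)` pairs; that is why the loop keeps `tag[1]`. A quick illustration (output shown is typical for the default English tagger):

    from nltk import pos_tag
    pos_tag(["The", "cat", "sleeps"])
    # e.g. [('The', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]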
Example #11
 def _process(self, input_pack: DataPack):
     entries = list(input_pack.get_entries_by_type(NewType))
     if len(entries) == 0:
         NewType(pack=input_pack, value="[PACK]")
     else:
         entry = entries[0]  # type: ignore
         entry.value += "[PACK]"
Example #12
    def extract(self, pack: DataPack, instance: Annotation) -> Feature:
        r"""Extract attributes of one instance.
        For example, the text of tokens in one sentence.

        Args:
            pack (DataPack): The data pack that contains the current
                instance.
            instance (Annotation): The instance from which the
                extractor will extract the feature.

        Returns:
            Feature: a feature that contains the extracted data.
        """
        data = []
        for entry in pack.get(self.config.entry_type, instance):
            value = self.get_attribute(entry, self.config.attribute)
            rep = self.element2repr(value) if self.vocab else value
            data.append(rep)

        meta_data = {
            "need_pad": self.config.need_pad,
            "pad_value": self.get_pad_value(),
            "dim": 1,
            "dtype": int if self.vocab else Any
        }

        return Feature(data=data, metadata=meta_data, vocab=self.vocab)
Example #13
    def add_to_pack(
        self,
        pack: DataPack,
        instance: Annotation,
        prediction: Iterable[Union[int, Any]],
    ):
        r"""Add the prediction for attribute to the
        instance. If the prediction is an iterable object, we assume
        each of the element in prediction will correspond to one entry.
        If the prediction is only one element, then we assume there will
        only be one entry in the instance.

        Extending this class will need to handle the specific prediction data
        types. The default implementation assume the data type is Integer.

        Args:
            pack (DataPack): The data pack that contains the current
                instance.
            instance (Annotation): The instance to which the
                extractor adds the prediction.
            prediction (Iterable[Union[int, Any]]): This is the output
                of the model, which contains the index for attributes
                of one instance.
        """
        instance_entry = list(pack.get(self._entry_type, instance))

        # The following pylint skip due to a bug:
        # https://github.com/PyCQA/pylint/issues/3507
        # Iterable is not recognized as a type.
        # pylint: disable=isinstance-second-argument-not-valid-type
        if not isinstance(prediction, Iterable):
            prediction = [prediction]
        values = [self.id2element(int(x)) for x in prediction]
        for entry, value in zip(instance_entry, values):
            self._set_attribute(entry, self.config.attribute, value)
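To make the index-to-value mapping concrete, here is a minimal sketch with a made-up mapping standing in for `id2element` (nothing here comes from the library):

    from collections.abc import Iterable

    id2pos = {0: "DT", 3: "NN", 7: "JJ"}  # hypothetical index-to-label table
    prediction = [3, 0, 7]
    if not isinstance(prediction, Iterable):
        prediction = [prediction]          # a bare scalar means one entry
    values = [id2pos[int(x)] for x in prediction]
    # values == ['NN', 'DT', 'JJ'], written onto the entries in order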
Example #14
    def _process_pred_annotations(
            self,
            pack: DataPack,
            labels: List[str],
            word_begin: int,
            word_end: int,
            current_pred_arg: List[Optional[Tuple[int, str]]],
            verbal_pred_args: List[List[Tuple[PredicateArgument, str]]],
    ) -> None:

        for label_index, label in enumerate(labels):

            arg_type = label.strip("()*")
            if arg_type == "V":
                continue

            if "(" in label:
                # Entering into a span
                current_pred_arg[label_index] = (word_begin, arg_type)
            if ")" in label:
                # Exiting a span
                if current_pred_arg[label_index] is None:
                    raise ValueError(
                        "current_pred_arg is None when a right bracket "
                        "is encountered.")

                arg_begin = current_pred_arg[label_index][0]  # type: ignore
                arg_type = current_pred_arg[label_index][1]  # type: ignore

                pred_arg = PredicateArgument(pack, arg_begin, word_end)
                pred_arg = pack.add_or_get_entry(pred_arg)

                verbal_pred_args[label_index].append((pred_arg, arg_type))
                current_pred_arg[label_index] = None
Example #15
 def pack(self, data_pack: DataPack, output_dict: Optional[Dict] = None):
     entries = list(data_pack.get_entries_of(NewType))
     if len(entries) == 0:
         NewType(pack=data_pack, value="[BATCH]")
     else:
         entry = entries[0]  # type: ignore
         entry.value += "[BATCH]"
Example #16
    def _parse_pack(self, sent_lines) -> Iterator[DataPack]:
        data_pack: DataPack = DataPack()
        sent_bias: int = 0
        batch_text: str = "\n".join(
            [sent_text for _, sent_text, _ in sent_lines]
        )
        data_pack.set_text(batch_text)

        for i, sent_line in enumerate(sent_lines):
            sent_id: str = sent_line[0]
            sent_text: str = sent_line[1].strip()
            parent_pointer_list: List[int] = sent_line[2]
            # Name the data_pack with the first sentence id.
            if i == 0:
                data_pack.pack_name = sent_id
            # Add sentence to data_pack.
            Sentence(data_pack, sent_bias, sent_bias + len(sent_text))
            self._parse_parent_pointer_list(
                data_pack,
                sent_bias,
                sent_text,
                parent_pointer_list
            )

            sent_bias += len(sent_text) + 1

        yield data_pack
Example #17
    def _parse_pack(
        self, collection: Tuple[str,
                                Dict[str,
                                     List[state_type]]]) -> Iterator[DataPack]:
        resource_name, info_box_data = collection

        if resource_name in self.redirects:
            resource_name = self.redirects[resource_name]

        if resource_name in self.pack_index:
            print_progress(f'Add infobox to resource: [{resource_name}]')

            pack_path = os.path.join(self.pack_dir,
                                     self.pack_index[resource_name])

            if os.path.exists(pack_path):
                with open(pack_path) as pack_file:
                    pack = DataPack.deserialize(pack_file.read())

                    add_info_boxes(pack, info_box_data['literals'])
                    add_info_boxes(pack, info_box_data['objects'])
                    add_property(pack, info_box_data['properties'])
                    yield pack
        else:
            print_notice(f"Resource {resource_name} is not in the raw packs.")
            self.logger.warning("Resource %s is not in the raw packs.",
                                resource_name)
Example #18
    def _parse_pack(
            self, doc_data: Dict[str, str]
    ) -> Iterator[DataPack]:
        pack = DataPack()
        doc_name: str = doc_data['doc_name']
        if doc_name in self.__redirects:
            doc_name = self.__redirects[doc_name]

        full_text: str = doc_data['text']

        pack.set_text(full_text)
        page = WikiPage(pack, 0, len(full_text))
        page.page_id = doc_data['oldid']
        page.page_name = doc_name
        pack.pack_name = doc_name
        yield pack
Example #19
    def extract(self,
                pack: DataPack,
                context: Optional[Annotation] = None) -> Feature:
        """Extract the attribute of an entry of the configured entry type.
        The entry type is passed in from via extractor config `entry_type`.

        Args:
            pack (DataPack): The datapack that contains the current instance.
            context (Annotation): The context is an Annotation entry where
                features will be extracted within its range. If None, then the
                whole data pack will be used as the context. Default is None.

        Returns: Features (attributes) for the instances within the provided
            context; they will be converted to representations based on
            the vocabulary configuration.
        """
        data = []

        instance: Annotation
        for instance in pack.get(self.config.entry_type, context):
            value = self._get_attribute(instance, self.config.attribute)
            rep = self.element2repr(value) if self.vocab else value
            data.append(rep)

        meta_data = {
            "need_pad": self.config.need_pad,
            "pad_value": self.get_pad_value(),
            "dim": 1,
            "dtype": int if self.vocab else Any,
        }
        return Feature(data=data, metadata=meta_data, vocab=self.vocab)
Example #20
    def _parse_pack(self, data_source: str) -> Iterator[DataPack]:
        r"""Takes a string which could be either a filepath or html_content and
        converts into a DataPack.

        Args:
            data_source: str that contains text of a document or a filepath

        Returns: DataPack containing Document.
        """
        pack = DataPack()

        # Check if data_source is a filepath
        if self.init_with_fileloc:
            with open(data_source, "r", encoding="utf8",
                      errors='ignore') as file:
                text = file.read()
        # else, must be a string with actual data
        else:
            text = data_source

        self.set_text(pack, text)
        # Note that pack.text can be different from the text passed in, due to
        # the text_replace_operation
        Document(pack, 0, len(pack.text))

        yield pack
Example #21
    def _parse_pack(self, data_source: str) -> Iterator[DataPack]:
        r"""Takes a raw string and converts into a :class:`DataPack`.

        Args:
            data_source: str that contains text of a document.

        Returns: :class:`DataPack` containing Document.
        """
        pack = DataPack()

        document = Document(pack, 0, len(data_source))
        pack.add_or_get_entry(document)

        self.set_text(pack, data_source)

        yield pack
Example #22
    def add_to_pack(
        self,
        pack: DataPack,
        predictions: List[int],
        context: Optional[Annotation] = None,
    ):
        r"""Add the prediction results to data pack. The predictions are

        We make following assumptions for prediction.

            1. If we encounter "I" while its tag is different from the previous
               tag, we will consider this "I" as a "B" and start a new tag here.
            2. We will truncate the prediction it according to the number of
               entry. If the prediction contains `<PAD>` element, this should
               remove them.

        Args:
            pack (DataPack): The datapack that contains the current instance.
            predictions (List[int]):
                This is the output of the model, which contains the indices
                for the attributes of one instance.
            context (Annotation): The context is an Annotation entry where
                features will be extracted within its range. If None, then the
                whole data pack will be used as the context. Default is None.
        """
        instance_tagging_unit: List[Annotation] = list(
            pack.get(self._tagging_unit, context))

        if self.config.is_bert:
            predictions = predictions[1:-1]

        predictions = predictions[:len(instance_tagging_unit)]
        if isinstance(predictions, Tensor):
            predictions = predictions.cpu().numpy()

        tags = [self.id2element(x) for x in predictions]
        tag_start = None
        tag_end = None
        tag_type = None
        for entry, tag in zip(instance_tagging_unit, tags):
            if (tag[1] == "O" or tag[1] == "B"
                    or (tag[1] == "I" and tag[0] != tag_type)):
                if tag_type:
                    entity_mention = self._entry_type(pack, tag_start, tag_end)
                    setattr(entity_mention, self._attribute, tag_type)
                tag_start = entry.begin
                tag_end = entry.end
                tag_type = tag[0]
            else:
                tag_end = entry.end

        # Handle the final tag. Compare against None so that a span
        # beginning at offset 0 is not dropped.
        if tag_type and tag_start is not None and tag_end is not None:
            entity_mention = self._entry_type(
                pack,
                tag_start,
                tag_end  # type: ignore
            )
            setattr(entity_mention, self._attribute, tag_type)
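The decoding rule is easier to see on plain data. A standalone sketch of the same BIO logic over (type, marker) pairs, with all values made up:

    tags = [("PER", "B"), ("PER", "I"), ("O", "O"), ("LOC", "I")]
    spans, start, end, kind = [], None, None, None
    for i, (t, bio) in enumerate(tags):
        if bio in ("O", "B") or (bio == "I" and t != kind):
            if kind and kind != "O":
                spans.append((start, end, kind))
            start, end, kind = i, i, t
        else:
            end = i
    if kind and kind != "O":
        spans.append((start, end, kind))
    # spans == [(0, 1, 'PER'), (3, 3, 'LOC')]; the lone "I" opened a new span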
Example #23
 def _process(self, input_pack: DataPack):
     subword_tokenizer = self.tokenizer.wordpiece_tokenizer
     for token in input_pack.get(Token):
         subwords = subword_tokenizer.tokenize_with_span(token.text)
         for subword, start, end in subwords:
             subword_token = Subword(input_pack, token.begin + start,
                                     token.begin + end)
             subword_token.is_first_segment = not subword.startswith("##")
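WordPiece marks continuation pieces with a leading "##", which is what `is_first_segment` records; the piece offsets are relative to the token, hence the `token.begin + start` shift. A hypothetical illustration with made-up values:

    token_begin = 10                              # token "playing" at offset 10
    subwords = [("play", 0, 4), ("##ing", 4, 7)]  # (piece, start, end)
    for subword, start, end in subwords:
        span = (token_begin + start, token_begin + end)
        first = not subword.startswith("##")
        # "play" -> span (10, 14), first=True; "##ing" -> span (14, 17), first=False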
Example #24
 def _process(self, input_pack: DataPack):
     for sentence in input_pack.get(Sentence):
         token_entries = list(
             input_pack.get(entry_type=Token,
                            range_annotation=sentence,
                            component=self.token_component))
         token_texts = [token.text for token in token_entries]
         token_pos = [
             penn2morphy(token.pos)  # type: ignore
             for token in token_entries
         ]
         lemmas = [
             self.lemmatizer.lemmatize(token_texts[i], token_pos[i])
             for i in range(len(token_texts))
         ]
         for token, lemma in zip(token_entries, lemmas):
             token.set_fields(lemma=lemma)
Example #25
    def pack(self, data_pack: DataPack,
             inputs: Dict[str, List[Prediction]]) -> None:
        batch_predictions = inputs["predictions"]
        for predictions in batch_predictions:
            for pred_span, arg_result in predictions:

                pred = data_pack.add_entry(
                    PredicateMention(data_pack, pred_span.begin,
                                     pred_span.end))

                for arg_span, label in arg_result:
                    arg = data_pack.add_or_get_entry(
                        PredicateArgument(data_pack, arg_span.begin,
                                          arg_span.end))
                    link = PredicateLink(data_pack, pred, arg)
                    link.set_fields(arg_type=label)
                    data_pack.add_or_get_entry(link)
Example #26
    def _parse_pack(self, table: str) -> Iterator[DataPack]:
        p: DataPack = DataPack(pack_name="table_" + table.split("|")[0])
        p.set_text(table)

        # Create the table.
        UtteranceContext(p, 0, len(table))

        yield p
Example #27
    def _parse_pack(self, file_path: str) -> Iterator[MultiPack]:

        m_pack: MultiPack = MultiPack()

        input_pack_name = self.config.input_pack_name
        output_pack_name = self.config.output_pack_name

        text = ""
        offset = 0
        with open(file_path, "r", encoding="utf8") as doc:

            input_pack = DataPack(doc_id=file_path)

            for line in doc:
                line = line.strip()

                if len(line) == 0:
                    continue

                # add sentence
                sent = Sentence(input_pack, offset, offset + len(line))
                input_pack.add_entry(sent)
                text += line + '\n'
                offset = offset + len(line) + 1

            input_pack.set_text(text, replace_func=self.text_replace_operation)

            output_pack = DataPack()

            m_pack.update_pack({
                input_pack_name: input_pack,
                output_pack_name: output_pack
            })

            yield m_pack
Example #28
    def _process(self, input_pack: DataPack):
        # handle existing entries
        self._process_existing_entries(input_pack)

        batch_size: int = self.configs["infer_batch_size"]
        batches: Iterator[Iterator[Sentence]]
        # Need a copy of the one-pass iterators to support a second loop
        # over them. Alternatives such as `itertools.tee` or materializing
        # with `list` would require extra storage, conflicting with the point
        # of using iterators in the first place. `more_itertools.ichunked`
        # uses `itertools.tee` under the hood, but our usage (reading the
        # iterators in order) does not cause memory issues.
        batches_copy: Iterator[Iterator[Sentence]]
        if batch_size <= 0:
            batches = iter([input_pack.get(Sentence)])
            batches_copy = iter([input_pack.get(Sentence)])
        else:
            batches = more_itertools.ichunked(input_pack.get(Sentence),
                                              batch_size)
            batches_copy = more_itertools.ichunked(input_pack.get(Sentence),
                                                   batch_size)
        for sentences, sentences_copy in zip(batches, batches_copy):
            inputs: List[Dict[str, str]] = [{
                "sentence": s.text
            } for s in sentences]
            results: Dict[str, List[Dict[str, Any]]] = {
                k: p.predict_batch_json(inputs)
                for k, p in self.predictor.items()
            }
            for i, sent in enumerate(sentences_copy):
                result: Dict[str, List[str]] = {}
                for key in self.predictor:
                    if key == "srl":
                        result.update(
                            parse_allennlp_srl_results(
                                results[key][i]["verbs"]))
                    else:
                        result.update(results[key][i])
                if "tokenize" in self.configs.processors:
                    # creating new tokens and dependencies
                    tokens = self._create_tokens(input_pack, sent, result)
                    if "depparse" in self.configs.processors:
                        self._create_dependencies(input_pack, tokens, result)
                    if "srl" in self.configs.processors:
                        self._create_srl(input_pack, tokens, result)
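The chunking above relies on `more_itertools.ichunked`, which lazily splits an iterable into sub-iterables of at most `batch_size` items each. A minimal standalone demonstration:

    import more_itertools

    for chunk in more_itertools.ichunked(range(7), 3):
        print(list(chunk))
    # [0, 1, 2]
    # [3, 4, 5]
    # [6]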
Example #29
    def _process(self, input_pack: DataPack):
        serialized_datapack: str = input_pack.serialize()

        self.documents.append(
            (str(input_pack.pack_id), input_pack.text, serialized_datapack))

        if len(self.documents) == self.config.batch_size:
            self._bulk_process()
            self.documents = []
Example #30
def create_utterance(input_pack: DataPack, text: str, speaker: str):
    """
    Create an utterance in the data pack. This is composed of three steps:
     1. Append the utterance text to the data pack.
     2. Create a :class:`~ft.onto.base_ontology.Utterance` entry on the text.
     3. Set the speaker of the utterance to the provided `speaker`.

    Args:
        input_pack: The data pack to add utterance into.
        text: The text of the utterance.
        speaker: The speaker name to be associated with the utterance.

    """
    input_pack.set_text(input_pack.text + '\n' + text)

    u = Utterance(input_pack,
                  len(input_pack.text) - len(text), len(input_pack.text))
    u.speaker = speaker
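A minimal usage sketch (the speaker labels are made up): each call appends one line to the pack's text and covers it with an `Utterance` annotation carrying the speaker.

    pack = DataPack()
    create_utterance(pack, "Hello, how can I help?", "ai")
    create_utterance(pack, "What's the weather today?", "user")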