Example #1
class SentenceTaggerPredictor(Predictor):
    """
    Wrapper for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the :class:`~allennlp.models.crf_tagger.CrfTagger` model
    and also
    the :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = JustSpacesWordSplitter()

    @overrides
    def _json_to_instance(self,
                          json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        instance = self._dataset_reader.text_to_instance(tokens)

        return_dict: JsonDict = {"words": [token.text for token in tokens]}

        return instance, return_dict
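For reference, a minimal standalone sketch of what JustSpacesWordSplitter itself does (assuming the AllenNLP 0.x import path): split_words splits the input on whitespace and wraps each piece in a Token.

from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter

splitter = JustSpacesWordSplitter()
tokens = splitter.split_words("The quick brown fox .")
# Each whitespace-separated piece becomes one Token object.
print([token.text for token in tokens])  # ['The', 'quick', 'brown', 'fox', '.']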
Example #2
class PairsDatasetReader(DatasetReader):
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__()
        self._tokenizer = JustSpacesWordSplitter()
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in data_file.readlines():
                line_json = json.loads(line)
                if not line_json:
                    continue

                query_paper = line_json["query_paper"]
                candidate_paper = line_json["candidate_paper"]
                relevance = line_json["relevance"]

                instance = self.text_to_instance(
                    query_paper=query_paper,
                    candidate_paper=candidate_paper,
                    relevance=relevance)
                if instance is not None:
                    yield instance

    @overrides
    def text_to_instance(self,
                         query_paper: str,
                         candidate_paper: str,
                         relevance: str = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        query_tokens = self._tokenizer.split_words(query_paper)
        fields['query_paper'] = TextField(query_tokens, self._token_indexers)

        candidate_tokens = self._tokenizer.split_words(candidate_paper)
        fields['candidate_paper'] = TextField(candidate_tokens,
                                              self._token_indexers)

        if relevance is not None:
            fields['label'] = LabelField(relevance)

        return Instance(fields)
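The reader above expects one JSON object per line with "query_paper", "candidate_paper", and "relevance" keys. A hedged sketch of how it might be driven; the file name and field values are illustrative, not taken from the original source.

# Hypothetical input line (JSON Lines format):
#   {"query_paper": "deep residual learning", "candidate_paper": "identity mappings", "relevance": "1"}
from allennlp.common.util import ensure_list

reader = PairsDatasetReader()
instances = ensure_list(reader.read("pairs.jsonl"))  # illustrative file name
print([t.text for t in instances[0].fields["query_paper"].tokens])
print(instances[0].fields["label"].label)  # the relevance string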
Example #3
class LegalDatasetReader(DatasetReader):
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy)
        self.token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._word_splitter = JustSpacesWordSplitter()

    @overrides
    def text_to_instance(self,
                         graf_tokens: List[Token],
                         labels: List[str] = None) -> Instance:
        graf_field = TextField(graf_tokens, self.token_indexers)

        metadata = MetadataField({"graf_words": graf_tokens})

        fields = {"graf": graf_field, "metadata": metadata}

        if labels is not None:
            label_field = MultiLabelField(labels)
            fields["label"] = label_field

        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        """
        This is a file that has been created by json2lines.
        :param file_path:
        :return:
        """

        counts = {"pos": 0, "neg": 0}

        with open(file_path) as f:
            lines = f.readlines()

        for line in lines:
            graf_str, label_str = line.strip().split("\t")
            if "unmatched" == label_str:
                counts["neg"] += 1
            else:
                counts["pos"] += 1

            yield self.text_to_instance(
                self._word_splitter.split_words(graf_str),
                label_str.split(","))

        print(counts)
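_read above expects tab-separated lines: a space-tokenized paragraph, a tab, and a comma-separated label list. A hedged sketch with an illustrative line, calling text_to_instance directly; the labels are made up for the example.

# Illustrative json2lines-style line: "<space-tokenized paragraph>\t<comma-separated labels>"
line = "the parties agree to arbitrate all disputes\tarbitration,governing_law"

reader = LegalDatasetReader()
graf_str, label_str = line.strip().split("\t")
instance = reader.text_to_instance(
    reader._word_splitter.split_words(graf_str),
    label_str.split(","))
print(instance.fields["label"].labels)  # MultiLabelField keeps the list of labels
print(instance.fields["metadata"].metadata["graf_words"][:3])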
Example #4
    def _read(self, file_path: str):
        file_path = cached_path(file_path)
        tokenizer = JustSpacesWordSplitter()

        with open(file_path, 'r') as f:
            while True:
                headline = f.readline()
                if not headline:
                    break
                parts = headline.strip().split('\t')
                step_count = int(parts[2])

                sent_list = []  # sentences of each paragraph
                sent_anno_list = []  # list of sentence annotations (P C F)
                word_pos_list = []  # list of word pos, -2 -1 0 1 2, to indicate the position of participants
                part_mask_list = []  # list of participant mask, 0 0 1 1 0 0, to obtain the part embeddings
                before_category_status_list = []  # list of bef category annotations   0-known, 1-unknown, 2-null
                before_category_mask_list = []  # list of bef category masks, 0-unknown   1-known

                before_loc_start_list = []  # list of start positions of bef loc
                before_loc_end_list = []  # list of end positions of bef loc
                after_category_status_list = []  # list of aft category annotations   0-known, 1-unknown, 2-null
                after_category_mask_list = []  # list of aft category masks, 0-unknown   1-known
                after_loc_start_list = []  # list of start positions of aft loc
                after_loc_end_list = []  # list of end positions of aft loc

                for i in range(step_count):
                    paraline = f.readline()
                    paraline = paraline.lower()
                    words = tokenizer.split_words(paraline.strip())

                    sent_anno_line = f.readline()
                    sent_annos = tokenizer.split_words(sent_anno_line.strip())

                    anno_line = f.readline()
                    anno_parts = anno_line.split('\t')
                    part_start = int(anno_parts[1])
                    part_end = int(anno_parts[2])

                    participant_mask = []

                    before_loc_start = int(anno_parts[4])
                    before_loc_end = int(anno_parts[5])
                    after_loc_start = int(anno_parts[8])
                    after_loc_end = int(anno_parts[9])

                    word_pos_line = ""
                    for m in range(0, part_start):
                        pos = m - part_start
                        word_pos_line += " " + str(pos)
                        participant_mask.append(0)
                    for m in range(part_start, part_end):
                        pos = 0
                        word_pos_line += " " + str(pos)
                        participant_mask.append(1)
                    for m in range(part_end, len(words)):
                        pos = m - part_end
                        word_pos_line += " " + str(pos)
                        participant_mask.append(0)

                    input_length = len(words)
                    word_pos = tokenizer.split_words(word_pos_line.strip())

                    assert input_length == len(participant_mask)

                    # 0: known    1: unk     2: null
                    before_category_status = 0
                    after_category_status = 0

                    # 1: known     0: category
                    before_category_mask = 1
                    after_category_mask = 1

                    category_index = 0
                    # -2 -- null,   -1 -- unk
                    if before_loc_start == -2 and before_loc_end == -2:
                        before_category_status = 2
                        before_category_mask = 0
                        before_loc_start = category_index
                        before_loc_end = category_index
                    elif before_loc_start == -1 and before_loc_end == -1:
                        before_category_status = 1
                        before_category_mask = 0
                        before_loc_start = category_index
                        before_loc_end = category_index
                    if after_loc_start == -2 and after_loc_end == -2:
                        after_category_status = 2
                        after_category_mask = 0
                        after_loc_start = category_index
                        after_loc_end = category_index
                    elif after_loc_start == -1 and after_loc_end == -1:
                        after_category_status = 1
                        after_category_mask = 0
                        after_loc_start = category_index
                        after_loc_end = category_index

                    sent_list.append(words)
                    sent_anno_list.append(sent_annos)
                    word_pos_list.append(word_pos)
                    part_mask_list.append(participant_mask)
                    before_category_status_list.append(before_category_status)
                    before_category_mask_list.append(before_category_mask)
                    before_loc_start_list.append(before_loc_start)
                    before_loc_end_list.append(before_loc_end)
                    after_category_status_list.append(after_category_status)
                    after_category_mask_list.append(after_category_mask)
                    after_loc_start_list.append(after_loc_start)
                    after_loc_end_list.append(after_loc_end)
                yield self.text_to_instance([sent_list, sent_anno_list, word_pos_list, part_mask_list,
                                             before_category_status_list, before_category_mask_list,
                                             before_loc_start_list, before_loc_end_list, after_category_status_list,
                                             after_category_mask_list, after_loc_start_list, after_loc_end_list])
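The relative-position and participant-mask construction is the core of the reader above. A standalone sketch of what those three loops compute for a single hypothetical step (the sentence and span boundaries are illustrative):

words = ["the", "roots", "absorb", "water", "from", "the", "soil"]
part_start, part_end = 3, 4                # participant span: words[3:4] == ["water"]

word_pos, participant_mask = [], []
for m in range(0, part_start):             # words before the participant get negative offsets
    word_pos.append(m - part_start)
    participant_mask.append(0)
for m in range(part_start, part_end):      # participant words get offset 0 and mask 1
    word_pos.append(0)
    participant_mask.append(1)
for m in range(part_end, len(words)):      # words after the participant get non-negative offsets
    word_pos.append(m - part_end)
    participant_mask.append(0)

print(word_pos)          # [-3, -2, -1, 0, 0, 1, 2]
print(participant_mask)  # [0, 0, 0, 1, 0, 0, 0]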
Example #5
class ChatDatasetReader(DatasetReader):
    """
    How to Design a Personal Data Reader.
    ---------- ---------- ---------- ----------
    Override Function Needed:
        ( __init__: init tokenizer and token indexer )
        _read: read data set file and get data instances list
        text_to_instance: turn a text field to instance
    ---------- ---------- ---------- ----------
    Usage:
        1. Get reader class instance: reader = ChatDatasetReader()
        2. Get data instances: ensure_list(reader.read('dataset.json'))
        3. get data content: What's the structure of a single instance ?
            instance.fields: text and label field of an instance
            instance.fields['key']: information of 'key' field
            For Text Field:
                instance.fields['key'].sentence: 'key' field's sentence
                instance.fields['key'].sentence[i].text: get a 'key'-field-token's text content
            For Label Field:
                instance.fields['key'].label: 'key' field's label
    ---------- ---------- ---------- ----------
    Multiplex:
        Change _read and text_to_instance functions to multiplex the reader.
    """
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        # self._tokenizer = tokenizer or WordTokenizer()
        self._tokenizer = JustSpacesWordSplitter()
        self._token_indexers = token_indexers or {
            "sentence": SingleIdTokenIndexer()
        }

    @overrides
    # Implements DatasetReader's _read method.
    def _read(self, file_path):
        # cached_path accepts either a local path or a URL; a URL is downloaded and cached before reading.
        with open(cached_path(file_path), "r", encoding='utf-8') as data_file:
            json_data = json.loads(data_file.read())
            for line in json_data:
                sentence = line['sentence']
                label = line['label']
                yield self.text_to_instance(
                    sentence,
                    label)  # _read is a generator: it yields one Instance per example

    @overrides
    def text_to_instance(self,
                         sentence: str,
                         label: int = None) -> Instance:  # type: ignore
        # There can be more than one text or label field.
        # tokenized_sentence = self._tokenizer.tokenize(sentence)  # step 1: text to token list
        tokenized_sentence = self._tokenizer.split_words(sentence)
        sentence_field = TextField(
            tokenized_sentence,
            self._token_indexers)  # step 2: token to sentence field
        # tokenized_example = self._tokenizer.tokenize(example)
        # example_field = TextField(tokenized_example, self._token_indexers)
        fields = {
            'sentence': sentence_field,
            # 'example': example_field,
        }
        if label is not None:  # when used from a predictor, the label may be None
            fields['label'] = LabelField(
                str(label))  # LabelField expects a string value
        return Instance(fields)
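A minimal usage sketch matching the docstring above; the file name is illustrative, and the file is assumed to be a JSON array of {"sentence": ..., "label": ...} objects.

from allennlp.common.util import ensure_list

reader = ChatDatasetReader()
instances = ensure_list(reader.read("dataset.json"))  # illustrative file name
first = instances[0]
print([token.text for token in first.fields["sentence"].tokens])
print(first.fields["label"].label)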
Example #6
class DatasetReader(DatasetReader):
    """
    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.  If this is ``True``, training will start sooner, but will
        take longer per batch.  This also allows training with datasets that are too large to fit
        in memory.
    tokenizer : ``Tokenizer``, optional
        Tokenizer to use to split the sentence into words or other kinds of tokens.
        Defaults to ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations. Defaults to ``{"tokens":
        SingleIdTokenIndexer()}``.
    """
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._word_splitter = JustSpacesWordSplitter()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }

    @overrides
    def _read(self, file_path):
        text_id = 0
        with open(cached_path(file_path), "r", encoding="utf8") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)
            for line in data_file:
                line = line.strip("\n")
                if not line:
                    continue
                segments = line.split('\t')
                if len(segments) == 2:
                    label = segments[0]
                    sentence = segments[1]
                else:
                    continue
                text_id += 1
                yield self.text_to_instance(sentence, label, text_id)

    @overrides
    def text_to_instance(self, text: str, label: str,
                         text_id: int) -> Instance:  # type: ignore
        """
        Parameters
        ----------
        text : ``str``, required.
            The text to classify.
        label : ``str``, required.
            The label for this text.
        text_id : ``int``, required.
            A running id for the text.
        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence or phrase.
            text_id : ``LabelField``
                The running id of the sentence or phrase, stored with ``skip_indexing=True``.
            label : ``LabelField``
                The label of the sentence or phrase.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        # tokens = self._tokenizer.tokenize(text)
        text = " ".join(list(text))
        tokens = self._word_splitter.split_words(text)
        fields['tokens'] = TextField(tokens, self._token_indexers)
        fields["text_id"] = LabelField(text_id, skip_indexing=True)
        if label is not None:
            fields['label'] = LabelField(label)
        return Instance(fields)
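A short sketch of the character-level trick used in text_to_instance above: joining the characters with spaces lets JustSpacesWordSplitter produce one Token per character (import path assumed to be AllenNLP 0.x).

from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter

splitter = JustSpacesWordSplitter()
text = "深度学习"
spaced = " ".join(list(text))            # "深 度 学 习"
tokens = splitter.split_words(spaced)
print([token.text for token in tokens])  # ['深', '度', '学', '习']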
Example #7
class TruecaserPredictor(Predictor):
    """
    This is basically a copy of the SentenceTagger from allennlp. It is
    modified to dump output in a more sensible manner.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = JustSpacesWordSplitter()
        self.model = model

    def predict(self, sent):
        js = {"sentence": sent}
        return self.predict_instance(self._json_to_instance(js))

    @overrides
    def predict_instance(self, sent: Instance) -> JsonDict:
        output = super().predict_instance(sent)
        #output["chars"] = sent["tokens"]
        output["words"] = sent["tokens"].tokens
        return output

    @overrides
    def predict_batch_instance(self, sents: List[Instance]) -> List[JsonDict]:
        outputs = super().predict_batch_instance(sents)
        for i, sent in enumerate(sents):
            #output["chars"] = sent["tokens"]
            outputs[i]["words"] = sent["tokens"].tokens
        return outputs

    def load_line(self, line: str) -> JsonDict:
        """
        This will usually be overridden with use_dataset_reader = True on the command line.
        :param line:
        :return:
        """
        return {"sentence": line}

    def dump_line(self, outputs: JsonDict) -> str:
        newd = {}
        tags = outputs["tags"]
        chars = outputs["words"]

        # all chars are lower case by default.
        out = []
        for token, t in zip(chars, tags):
            c = token.text
            if t == "U":
                c = c.upper()
            out.append(c)

        return "".join(out) + "\n"

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokenized_sent = " ".join(
            map(str, self._tokenizer.split_words(sentence)))
        chars = [Token(c) for c in tokenized_sent.lower()]
        return self._dataset_reader.text_to_instance(chars)
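A small sketch of how dump_line recases the output: each character Token whose predicted tag is "U" is upper-cased, everything else is kept as-is. The tag sequence below is illustrative, not real model output.

from allennlp.data.tokenizers import Token

chars = [Token(c) for c in "hello world"]
tags = ["U", "O", "O", "O", "O", "O", "U", "O", "O", "O", "O"]

out = []
for token, tag in zip(chars, tags):
    c = token.text
    if tag == "U":     # "U" marks a character that should be upper-cased
        c = c.upper()
    out.append(c)
print("".join(out))    # "Hello World"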
Example #8
    def predict_json(self,
                     inputs: JsonDict,
                     cuda_device: int = -1) -> JsonDict:
        instance_text = inputs["instance"]
        step_list = instance_text.split("####")
        tokenizer = JustSpacesWordSplitter()

        headline = step_list[0]

        parts = headline.strip().split('\t')
        para_id = parts[0]

        paragraph = ""

        sent_list = []  # sentences of each paragraph
        sent_anno_list = []  # list of sentence annotations (P C F)
        word_pos_list = []  # list of word pos, -2 -1 0 1 2, to indicate the position of participants
        part_mask_list = []  # list of participant mask, 0 0 1 1 0 0, to obtain the part embeddings
        before_category_status_list = []  # list of bef category annotations   0-known, 1-unknown, 2-null
        before_category_mask_list = []  # list of bef category masks, 0-unknown   1-known
        before_loc_start_list = []  # list of start positions of bef loc
        before_loc_end_list = []  # list of end positions of bef loc
        after_category_status_list = []  # list of aft category annotations   0-known, 1-unknown, 2-null
        after_category_mask_list = []  # list of aft category masks, 0-unknown   1-known
        after_loc_start_list = []  # list of start positions of aft loc
        after_loc_end_list = []  # list of end positions of aft loc

        i = 1
        while i < len(step_list):
            para_line = step_list[i]
            para_line = para_line.lower()
            words = tokenizer.split_words(para_line.strip())

            paragraph = para_line

            i += 1
            sent_anno_line = step_list[i]
            sent_annos = tokenizer.split_words(sent_anno_line.strip())

            i += 1
            anno_line = step_list[i]
            anno_parts = anno_line.split('\t')
            part_start = int(anno_parts[1])
            part_end = int(anno_parts[2])

            i += 1

            participant_mask = []

            before_loc_start = int(anno_parts[4])
            before_loc_end = int(anno_parts[5])
            after_loc_start = int(anno_parts[8])
            after_loc_end = int(anno_parts[9])

            if before_loc_start == -3:
                before_loc_start = -2
                before_loc_end = -2
            if after_loc_start == -3:
                after_loc_start = -2
                after_loc_end = -2

            word_pos_line = ""
            for m in range(0, part_start):
                pos = m - part_start
                word_pos_line += " " + str(pos)
                participant_mask.append(0)
            for m in range(part_start, part_end):
                pos = 0
                word_pos_line += " " + str(pos)
                participant_mask.append(1)
            for m in range(part_end, len(words)):
                pos = m - part_end
                word_pos_line += " " + str(pos)
                participant_mask.append(0)

            input_length = len(words)
            word_pos = tokenizer.split_words(word_pos_line.strip())

            assert input_length == len(participant_mask)

            # 0: known    1: unk     2: null
            before_category_status = 0
            after_category_status = 0

            # 1: known     0: category
            before_category_mask = 1
            after_category_mask = 1

            category_index = 0
            # -2 -- null,   -1 -- unk
            if before_loc_start == -2 and before_loc_end == -2:
                before_category_status = 2
                before_category_mask = 0
                before_loc_start = category_index
                before_loc_end = category_index
            elif before_loc_start == -1 and before_loc_end == -1:
                before_category_status = 1
                before_category_mask = 0
                before_loc_start = category_index
                before_loc_end = category_index
            if after_loc_start == -2 and after_loc_end == -2:
                after_category_status = 2
                after_category_mask = 0
                after_loc_start = category_index
                after_loc_end = category_index
            elif after_loc_start == -1 and after_loc_end == -1:
                after_category_status = 1
                after_category_mask = 0
                after_loc_start = category_index
                after_loc_end = category_index

            sent_list.append(words)
            sent_anno_list.append(sent_annos)
            word_pos_list.append(word_pos)
            part_mask_list.append(participant_mask)
            before_category_status_list.append(before_category_status)
            before_category_mask_list.append(before_category_mask)
            before_loc_start_list.append(before_loc_start)
            before_loc_end_list.append(before_loc_end)
            after_category_status_list.append(after_category_status)
            after_category_mask_list.append(after_category_mask)
            after_loc_start_list.append(after_loc_start)
            after_loc_end_list.append(after_loc_end)
        instance = self._dataset_reader.text_to_instance([
            sent_list, sent_anno_list, word_pos_list, part_mask_list,
            before_category_status_list, before_category_mask_list,
            before_loc_start_list, before_loc_end_list,
            after_category_status_list, after_category_mask_list,
            after_loc_start_list, after_loc_end_list
        ])

        outputs = self._model.forward_on_instance(instance=instance)

        predictions = {}
        predictions["paraid"] = para_id
        predictions["entity"] = parts[1]
        predictions["paragraph"] = paragraph
        predictions["best_span"] = str(outputs["best_span"].numpy())
        predictions["true_span"] = str(outputs["true_span"].numpy())

        return predictions
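For reference, a hedged sketch of the input shape this predict_json expects: a single "####"-joined string whose first chunk is the headline, followed by repeated (paragraph, sentence-annotation, annotation) triples. All values below are illustrative.

headline = "1234\twater\t1"                 # para_id, entity, step count
paragraph = "water moves into the roots"
sent_annos = "C"                            # per-step sentence annotation (P C F)
anno_line = "\t".join([
    "x", "0", "1",                          # fields 1-2: participant span start/end
    "x", "3", "4",                          # fields 4-5: before-location span start/end
    "x", "x", "-2", "-2",                   # fields 8-9: after-location span (-2 -2 means null)
])
inputs = {"instance": "####".join([headline, paragraph, sent_annos, anno_line])}
# predictions = predictor.predict_json(inputs)  # `predictor` would be a loaded instance of this class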