Example No. 1
class TestSentenceSplitter(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = [
            "This is the first sentence.", "This is the second sentence!"
        ]
        assert tokens == expected_tokens

    def test_dep_parse_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = [
            "This is the first sentence.", "This is the second sentence!"
        ]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [
            self.rule_based_splitter.split_sentences(doc) for doc in text
        ]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(
                    batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [
            self.dep_parse_splitter.split_sentences(doc) for doc in text
        ]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(
                    batch_doc, separate_doc):
                assert batch_sentence == separate_sentence
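For orientation, a minimal usage sketch of the splitter these tests exercise; it assumes SpacySentenceSplitter is importable from allennlp.data.tokenizers.sentence_splitter, as in the AllenNLP versions these examples target:

from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter

# rule_based=True uses spaCy's sentencizer component; rule_based=False relies on
# the dependency parse, which is slower but can catch harder sentence boundaries.
splitter = SpacySentenceSplitter(rule_based=True)
print(splitter.split_sentences("This is the first sentence. This is the second sentence!"))

# batch_split_sentences pipes several documents through spaCy at once and is
# expected to produce the same sentences as splitting each document separately.
print(splitter.batch_split_sentences([
    "This is a sentence. This is a second sentence.",
    "This isn't a sentence. This is a second sentence!",
]))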
Example No. 2
class TestSentenceSplitter(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!",
                           "Here's the '3rd' sentence - yes, it is.", "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    @pytest.mark.skipif(spacy.__version__ < "2.1", reason="this model changed from 2.0 to 2.1")
    def test_dep_parse_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!",
                           "Here's the '3rd' sentence - yes, it is.", "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [self.rule_based_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [self.dep_parse_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence
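Example No. 3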
    def test_read_from_file_reuters_corpus_and_segments_sentences_properly(
            self, lazy, max_sequence_length):
        reader = MultiLabelTextClassificationJsonReader(
            lazy=lazy,
            segment_sentences=True,
            max_sequence_length=max_sequence_length)
        reuters_path = Path(
            "tests/fixtures") / "data" / "reuters-21578" / "train.jsonl"
        instances = reader.read(reuters_path)
        instances = ensure_list(instances)

        splitter = SpacySentenceSplitter()
        spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False,
                                          False)

        text1 = (
            "U.K. GROWING IMPATIENT WITH JAPAN - THATCHER Prime Minister Margaret Thatcher said the"
            " U.K. Was growing more impatient with Japanese trade barriers and warned that it would"
            " soon have new powers against countries not offering reciprocal access to their"
            " markets.")
        instance1 = {"text": text1, "labels": ["acq", "trade"]}
        text2 = (
            "CANADA OIL EXPORTS RISE 20 PCT IN 1986 Canadian oil exports rose 20 pct in 1986 over"
            " the previous year to 33.96 mln cubic meters, while oil imports soared 25.2 pct to"
            " 20.58 mln cubic meters, Statistics Canada said. Production, meanwhile, was unchanged"
            " from the previous year at 91.09 mln cubic feet.")
        instance2 = {"text": text2, "labels": ["nat-gas", "crude"]}
        text3 = (
            "COFFEE, SUGAR AND COCOA EXCHANGE NAMES CHAIRMAN The New York Coffee, Sugar and Cocoa"
            " Exchange (CSCE) elected former first vice chairman Gerald Clancy to a two-year term"
            " as chairman of the board of managers, replacing previous chairman Howard Katz. Katz,"
            " chairman since 1985, will remain a board member.")
        instance3 = {"text": text3, "labels": ["sugar", "cocoa", "coffee"]}

        for instance in [instance1, instance2, instance3]:
            sentences = splitter.split_sentences(instance["text"])
            tokenized_sentences: List[List[str]] = []
            for sentence in sentences:
                tokens = [token.text for token in spacy_tokenizer(sentence)]
                if max_sequence_length:
                    tokens = tokens[:max_sequence_length]
                tokenized_sentences.append(tokens)
            instance["tokens"] = tokenized_sentences

        assert len(instances) == 3
        fields = instances[0].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance1["tokens"]
        assert fields["labels"].labels == instance1["labels"]
        fields = instances[1].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance2["tokens"]
        assert fields["labels"].labels == instance2["labels"]
        fields = instances[2].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance3["tokens"]
        assert fields["labels"].labels == instance3["labels"]
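A hedged sketch of the reader invocation the test above verifies; MultiLabelTextClassificationJsonReader and ensure_list are assumed to come from the project under test, and max_sequence_length=100 is illustrative:

reader = MultiLabelTextClassificationJsonReader(segment_sentences=True,
                                                max_sequence_length=100)
instances = ensure_list(reader.read("tests/fixtures/data/reuters-21578/train.jsonl"))
# Each instance holds a ListField of per-sentence TextFields plus a multi-label field.
print(len(instances), list(instances[0].fields.keys()))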
Example No. 4
class TestSentenceSplitter(AllenNlpTestCase):
    def setUp(self):
        super(TestSentenceSplitter, self).setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!",
                           "Here's the '3rd' sentence - yes, it is.", "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    def test_dep_parse_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!",
                           "Here's the '3rd' sentence -", "yes, it is.", "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [self.rule_based_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [self.dep_parse_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence
Example No. 5
class TWTCDatasetReader(DatasetReader):
    """
    Reads a JSON-lines file from the TWTC dataset.
    Expected format for each input line: {"text": "text", "label": "int"}
    The output of ``read`` is a list of ``Instance`` s with the fields:
        tokens: ``ListField[TextField]``
        label: ``LabelField``
    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.  If this is ``True``, training will start sooner, but will
        take longer per batch.  This also allows training with datasets that are too large to fit
        in memory.
    tokenizer : ``Tokenizer``, optional
        Tokenizer to use to split the report text into words or other kinds of tokens.
        Defaults to ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations. Defaults to ``{"tokens":
        SingleIdTokenIndexer()}``.
    """
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sentence_splitter = SpacySentenceSplitter()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }

        self.cache_data(os.path.expanduser('~/.allennlp/cache/datasets'))

    @overrides
    def _read(self, file_path):
        file_path = cached_path(file_path)
        data = pd.read_json(file_path, lines=True,
                            orient='records')[['text', 'label']].values
        for text, label in data:
            assert isinstance(label, int)
            inst = self.text_to_instance(text, str(label))
            yield inst

    @overrides
    def text_to_instance(self, document: str, label: str = None) -> Instance:
        sentences: List[str] = self._sentence_splitter.split_sentences(
            document)
        tokenized_sents: List[List[Token]] = [
            self._tokenizer.tokenize(sent) for sent in sentences
        ]

        fields = {
            'tokens':
            ListField(
                [TextField(s, self._token_indexers) for s in tokenized_sents])
        }
        if label:
            fields['label'] = LabelField(int(label), skip_indexing=True)
        return Instance(fields)
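A hedged sketch of feeding the reader above a single record; the record is illustrative and mirrors the 'text' and 'label' columns that _read selects:

# Hypothetical JSON-lines record; the label must be an integer, as _read asserts.
record = {"text": "This is the report text. It has two sentences.", "label": 1}
reader = TWTCDatasetReader()
instance = reader.text_to_instance(record["text"], str(record["label"]))
print(instance.fields["tokens"], instance.fields["label"])

Example No. 6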
    def test_read_from_file_ag_news_corpus_and_segments_sentences_properly(
            self, lazy: bool, label_name: str,
            max_sequence_length: Optional[int]):
        reader = TextSentimentReader(lazy=lazy,
                                     segment_sentences=True,
                                     label_name=label_name,
                                     max_sequence_length=max_sequence_length)
        ag_path = Path(DATA_DIR, 'ag_news_corpus_original.jsonl')
        if label_name == 'text_sentiment':
            ag_path = Path(DATA_DIR, 'ag_news_corpus.jsonl')
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        splitter = SpacySentenceSplitter()
        spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False,
                                          False)

        text1 = ("Memphis Rout Still Stings for No. 14 Louisville; Coach "
                 "Petrino Vows to Have Team Better Prepared. NASHVILLE, "
                 "Tenn. Nov 3, 2004 - Louisville #39;s 30-point loss "
                 "at home to Memphis last season is still a painful memory "
                 "for the Cardinals.")
        instance1 = {"text": text1, "label": "2"}
        text2 = ("AP - Eli Manning has replaced Kurt Warner as the New York"
                 " Giants' starting quarterback.")
        instance2 = {"text": text2, "label": "2"}
        text3 = ("A conference dedicated to online journalism explores the "
                 "effect blogs have on news reporting. Some say they draw "
                 "attention to under-reported stories. Others struggle to "
                 "establish the credibility enjoyed by professionals.")
        instance3 = {"text": text3, "label": "4"}

        for instance in [instance1, instance2, instance3]:
            sentences = splitter.split_sentences(instance['text'])
            tokenized_sentences: List[List[str]] = []
            for sentence in sentences:
                tokens = [token.text for token in spacy_tokenizer(sentence)]
                if max_sequence_length:
                    tokens = tokens[:max_sequence_length]
                tokenized_sentences.append(tokens)
            instance["tokens"] = tokenized_sentences

        assert len(instances) == 3
        fields = instances[0].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
Example No. 7
class DoGDatasetReader(DatasetReader):
    def __init__(self,
                 lazy: bool = True,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 split_sentence_in_doc: bool = False):
        super().__init__(lazy)
        self.tokenizer = tokenizer or WordTokenizer()
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        if split_sentence_in_doc:
            self.sentence_splitter = SpacySentenceSplitter()
        else:
            self.sentence_splitter = None

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(os.path.join(os.path.split(file_path)[0], 'documents.json'), 'r') as doc_file:
            doc_json = json.load(doc_file)
        doc_field_dict = self.get_doc_field_dict(doc_json)

        with open(file_path, 'r') as data_file:
            for line in data_file:
                line = line.strip()
                dialog_json = json.loads(line)
                yield self.text_to_instance(dialog_json['dialogue'],
                                            doc_field_dict[dialog_json['docId']],
                                            dialog_json['whoSawDoc'])

    @overrides
    def text_to_instance(self, dialogs: List[str], doc_field: Field, who_saw_doc: int):
        tokenized_dialogs = [self.tokenizer.tokenize(dialog) for dialog in dialogs]
        for tokenized_dialog in tokenized_dialogs:
            tokenized_dialog.insert(0, Token(START_SYMBOL))
            tokenized_dialog.append(Token(END_SYMBOL))
        dialogue_field = ListField([TextField(tokenized_dialog, self.token_indexers)
                                   for tokenized_dialog in tokenized_dialogs])
        # who_saw_doc_field = MetadataField(who_saw_doc)
        # return Instance({'dialogue': dialogue_field, 'document': doc_field, 'who_saw_doc': who_saw_doc_field})
        return Instance({'dialogue': dialogue_field, 'document': doc_field})

    def get_doc_field_dict(self, doc_json: Dict) -> Dict[int, Field]:
        doc_field_dict = {}
        for idx, doc in doc_json.items():
            if self.sentence_splitter is not None:
                doc_sentence_list: List[str] = []
                for i in ('0', '1', '2', '3'):
                    doc_sentence_list.extend(self.sentence_splitter.split_sentences(doc[i]))
                tokenized_doc_sentence_list = [self.tokenizer.tokenize(doc_sequence) for doc_sequence in doc_sentence_list]
                doc_field = ListField([TextField(tokenized_doc_sentence, self.token_indexers)
                                       for tokenized_doc_sentence in tokenized_doc_sentence_list])
            else:
                doc_sequence = ' '.join(doc[i] for i in ('0', '1', '2', '3'))
                tokenized_doc = self.tokenizer.tokenize(doc_sequence)
                doc_field = TextField(tokenized_doc, self.token_indexers)

            doc_field_dict[int(idx)] = doc_field

        return doc_field_dict
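For orientation, a hedged sketch of the documents.json shape that get_doc_field_dict above assumes: each document id maps to four sections keyed "0" through "3" (contents are illustrative):

doc_json = {
    "12": {
        "0": "Opening section of the document.",
        "1": "Second section.",
        "2": "Third section.",
        "3": "Closing section.",
    },
}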
Example No. 8
def entity_extraction_wikihop(args):
    predictor_conll = AllenNER(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")
    predictor_onto_note = \
        AllenNER("https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.12.21.tar.gz")
    sentence_splitter = SpacySentenceSplitter(rule_based=True)
    with open(args.path, 'r') as f:
        data = json.load(f)
    for d in tqdm(data):
        golden_ners = []
        passage = []
        question = d['query'].strip().replace("\n", "")
        question_entity = " ".join(question.split()[1:])
        question = " ".join(question.split("_"))
        for para in d['supports']:
            sentences = sentence_splitter.split_sentences(para)
            para_ners = []
            outputs_conll = predictor_conll.predict_batch_raw(sentences)
            outputs_onto_note = predictor_onto_note.predict_batch_raw(sentences)
            for out1, out2 in zip(outputs_conll, outputs_onto_note):
                entities1 = entity_extraction_(out1['words'], out1['tags'])
                entities2 = entity_extraction_(out2['words'], out2['tags'])
                entities = set(entities1).union(set(entities2))
                # print(entities)
                para_ners.append(list(entities))
            golden_ners.append(para_ners)
            passage.append(sentences)
            # parsing_info.append([title, outputs_conll])
        # print(question)
        # print(question_entity)
        # input()
        d['supports'] = passage
        d['question_entities'] = [question_entity]
        d['ners'] = golden_ners
        d['query'] = question
        # input()
    with open(args.output, 'w') as f:
        json.dump(data, f)
Example No. 9
        json.dump(named_entities_frequency_table, f)


instances = create_nabert_reader(
    data_path='../../data/drop_dataset/drop_dataset_train.json')
ner_tagger = fine_grained_named_entity_recognition_with_elmo_peters_2018()
sentences_splitter = SpacySentenceSplitter()
named_entities = defaultdict(list)

with torch.no_grad():
    for instance_idx, instance in enumerate(instances):
        original_question = instance.fields['metadata'].metadata[
            'original_question']
        original_passage = instance.fields['metadata'].metadata[
            'original_passage']

        aggregate_named_entities(original_question, named_entities)

        # NER tagger is more accurate when single sentences are fed as input
        passage_sentences = sentences_splitter.split_sentences(
            original_passage)
        for passage_sentence in passage_sentences:
            aggregate_named_entities(passage_sentence, named_entities)

        if instance_idx % 501 == 500:
            dump_frequency_table(named_entities, 'ner_frequencies_latest.json')

dump_frequency_table(named_entities, 'ner_frequencies.json')

print('Done.')
Example No. 10
class TestSentenceSplitter(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = [
            "This is the first sentence.", "This is the second sentence!"
        ]
        assert tokens == expected_tokens

    def test_dep_parse_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = [
            "This is the first sentence.", "This is the second sentence!"
        ]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [
            self.rule_based_splitter.split_sentences(doc) for doc in text
        ]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(
                    batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [
            self.dep_parse_splitter.split_sentences(doc) for doc in text
        ]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(
                    batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_to_params(self):
        params = self.dep_parse_splitter.to_params()
        assert isinstance(params, Params)
        assert params.params == {
            "type": "spacy",
            "language": self.dep_parse_splitter._language,
            "rule_based": self.dep_parse_splitter._rule_based,
        }
Example No. 11
class TextClassificationJsonReader(DatasetReader):
    """
    Reads tokens and their labels from a labeled text classification dataset.

    The output of `read` is a list of `Instance` s with the fields:
        tokens : `TextField` and
        label : `LabelField`

    Registered as a `DatasetReader` with name "text_classification_json".

    [0]: https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf

    # Parameters

    token_indexers : `Dict[str, TokenIndexer]`, optional (default=`{"tokens": SingleIdTokenIndexer()}`)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : `Tokenizer`, optional (default = `SpacyTokenizer()`)
        Tokenizer to use to split the input text into words or other kinds of tokens.
    segment_sentences : `bool`, optional (default = `False`)
        If True, we will first segment the text into sentences using SpaCy and then tokenize words.
        Necessary for some models that require pre-segmentation of sentences, like [the Hierarchical
        Attention Network][0].
    max_sequence_length : `int`, optional (default = `None`)
        If specified, will truncate tokens to specified maximum length.
    skip_label_indexing : `bool`, optional (default = `False`)
        Whether or not to skip label indexing. You might want to skip label indexing if your
        labels are numbers, so the dataset reader doesn't re-number them starting from 0.
    text_key: `str`, optional (default=`"text"`)
        The key name of the source field in the JSON data file.
    label_key: `str`, optional (default=`"label"`)
        The key name of the target field in the JSON data file.
    """
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        text_key: str = "text",
        label_key: str = "label",
        **kwargs,
    ) -> None:
        super().__init__(manual_distributed_sharding=True,
                         manual_multiprocess_sharding=True,
                         **kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._text_key = text_key
        self._label_key = label_key
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in self.shard_iterable(data_file.readlines()):
                if not line:
                    continue
                items = json.loads(line)
                text = items[self._text_key]
                label = items.get(self._label_key)
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError:
                            raise ValueError(
                                "Labels must be integers if skip_label_indexing is True."
                            )
                    else:
                        label = str(label)
                yield self.text_to_instance(text=text, label=label)

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length
        """
        if len(tokens) > self._max_sequence_length:
            tokens = tokens[:self._max_sequence_length]
        return tokens

    @overrides
    def text_to_instance(
            self,
            text: str,
            label: Union[str, int] = None) -> Instance:  # type: ignore
        """
        # Parameters

        text : `str`, required.
            The text to classify
        label : `str`, optional, (default = `None`).
            The label for this text.

        # Returns

        An `Instance` containing the following fields:
            - tokens (`TextField`) :
              The tokens in the sentence or phrase.
            - label (`LabelField`) :
              The label of the sentence or phrase.
        """

        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens)
        if label is not None:
            fields["label"] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        if self._segment_sentences:
            for text_field in instance.fields["tokens"]:  # type: ignore
                text_field._token_indexers = self._token_indexers
        else:
            instance.fields[
                "tokens"]._token_indexers = self._token_indexers  # type: ignore
Example No. 12
class ExampleLoader(object):
    def __init__(self):
        self.label_list = None
        self.sentence_splitter = SpacySentenceSplitter()

    def get_loss_weights(self, train_examples):
        # Calculate loss weights as the inverse of label occurrence.
        loss_weights = {}
        for label in self.label_list:
            loss_weights[label] = 0

        for ex in train_examples:
            loss_weights[ex.str_label] += 1

        num_examples = len(train_examples)
        for key in loss_weights:
            loss_weights[key] = num_examples / loss_weights[key]

        weights_list = [
            float("%3.f" % loss_weights[key]) for key in loader.label_list
        ]

        return weights_list

    def get_text_from_element(self, node):
        if node.nodeType == node.TEXT_NODE:
            if node.data.isspace():
                return ""
            else:
                return node.data.replace("\n", " ")
        else:
            text = ""
            for child in node.childNodes:
                text += " " + self.get_text_from_element(child) + " "
            return text

    def process_node(self, node, events, times, full_text):
        if node.nodeName == "EVENT":
            eid = node.attributes['eid'].value
            cls = node.attributes['class'].value

            event = Event(eid=eid,
                          cls=cls,
                          sentence=None,
                          pos_in_sentence=None)
            event.idx_in_doc = len(full_text)
            events[eid] = event
            return event

        if node.nodeName == "TIMEX3":
            tid = node.attributes['tid'].value
            type = node.attributes['type'].value
            time = TimeX3(tid=tid, sentence=None, pos_in_sentence=None)
            time.idx_in_doc = len(full_text)
            times[tid] = time
            return time

    def get_instances(self, instance_elts, event_instances, events,
                      input_file):
        for instance in instance_elts:
            eiid = instance.attributes["eiid"].value
            eventID = instance.attributes["eventID"].value
            tense = instance.attributes["tense"].value
            aspect = instance.attributes["aspect"].value
            polarity = instance.attributes["polarity"].value
            pos = instance.attributes["pos"].value

            if eventID not in events:
                print(eventID, input_file)
                continue

            event = events[eventID]
            sentence = event.sentence
            pos_in_sentence = event.pos_in_sentence

            instance = EventInstance(eiid, event, tense, aspect, polarity, pos,
                                     sentence, pos_in_sentence)
            event_instances[eiid] = instance

    def parse_node(self, root, events, times, full_text):
        #         print(full_text)
        for node in root.childNodes:
            if node.nodeType == node.TEXT_NODE and not node.data.isspace():
                text = re.sub(r"\n+", " ", node.data)
                text = re.sub(r"_", "", node.data)
                text = re.sub(r"&UR;", "", node.data)
                text = re.sub(r"&LR;", "", node.data)
                split_space = text.split()
                full_text += split_space
            elif node.nodeName == "TEXT":
                self.parse_node(node, events, times, full_text)
            else:
                el = self.process_node(node, events, times, full_text)
                text = self.get_text_from_element(node)
                if el:
                    el.text = text.strip()
                full_text += text.split()

    def get_full_text_to_sentences(self, full_text, sentences):
        split_sentences = [s.split() for s in sentences]

        def next_position(split_sentences, sent_num, sent_idx):
            cur_sent = split_sentences[sent_num]
            if sent_idx < len(cur_sent) - 1:
                sent_idx += 1
            else:
                sent_idx = 0
                sent_num += 1
                if sent_num < len(split_sentences):
                    cur_sent = split_sentences[sent_num]
            return sent_num, sent_idx

        split_sentences = [s.split() for s in sentences]

        full_text_to_sentences = []

        sent_num = 0
        sent_idx = 0
        for i, tok in enumerate(full_text):
            sent_tok = split_sentences[sent_num][sent_idx]
            #             print(tok, sent_tok)
            assert tok.startswith(
                sent_tok), str(i) + " " + tok + " " + sent_tok + "\n" + str(
                    split_sentences[sent_num])
            full_text_to_sentences.append(tuple([sent_num, sent_idx]))

            while len(tok) > len(sent_tok):
                tok = tok[len(sent_tok):]
                sent_num, sent_idx = next_position(split_sentences, sent_num,
                                                   sent_idx)
                sent_tok = split_sentences[sent_num][sent_idx]
                #                 print("WHILE", tok, sent_tok)
                assert tok.startswith(sent_tok), str(
                    i) + " " + tok + " " + sent_tok + "\n" + str(
                        split_sentences[sent_num])

                # print(tok)

            sent_num, sent_idx = next_position(split_sentences, sent_num,
                                               sent_idx)

        return full_text_to_sentences

    def convert_doc_idx_to_sentences(self, sentences, full_text_to_sentences,
                                     its):
        for key, obj in its.items():
            idx = obj.idx_in_doc
            sentence, pos_in_sentence = full_text_to_sentences[idx]
            #             print(idx, sentence, pos_in_sentence)
            text = sentences[sentence].split()[pos_in_sentence]
            assert text == obj.text.split()[0], text + " " + obj.text
            obj.sentence = sentence
            obj.pos_in_sentence = pos_in_sentence

    def read_file(self, input_file):
        """
        Parameters
        ----------
        input_file: str, path to input file

        Returns
        -------
        TimeMLFile containing sentences, events, eventInstances, times, and tlinks.
        """
        doc = dom.parse(input_file)
        root = doc.childNodes[0]

        events = {}
        times = {}
        full_text = []
        self.parse_node(root, events, times, full_text)
        #         print(full_text)

        sentences = self.sentence_splitter.split_sentences(" ".join(full_text))

        full_text_to_sentences = self.get_full_text_to_sentences(
            full_text, sentences)

        self.convert_doc_idx_to_sentences(sentences, full_text_to_sentences,
                                          events)
        self.convert_doc_idx_to_sentences(sentences, full_text_to_sentences,
                                          times)

        event_instances = {}
        instanceElts = root.getElementsByTagName("MAKEINSTANCE")
        self.get_instances(instanceElts, event_instances, events, input_file)

        tlinks = []
        tlinkElts = root.getElementsByTagName("TLINK")
        for tlinkElt in tlinkElts:
            if tlinkElt.hasAttribute("relatedToEventInstance") and \
              tlinkElt.hasAttribute("eventInstanceID"):
                lid = tlinkElt.attributes["lid"].value
                relType = tlinkElt.attributes["relType"].value
                eiid = tlinkElt.attributes["eventInstanceID"].value
                relatedToEventInstance = tlinkElt.attributes[
                    "relatedToEventInstance"].value

                if eiid not in event_instances or relatedToEventInstance not in event_instances:
                    continue

                tlink = Tlink(lid, relType, event_instances[eiid],
                              event_instances[relatedToEventInstance])
                tlinks.append(tlink)

            if tlinkElt.hasAttribute("eventInstanceID") and \
              tlinkElt.hasAttribute("relatedToTime"):
                lid = tlinkElt.attributes["lid"].value
                relType = tlinkElt.attributes["relType"].value
                eiid = tlinkElt.attributes["eventInstanceID"].value
                relatedToTime = tlinkElt.attributes["relatedToTime"].value

                if eiid not in event_instances or relatedToTime not in times:
                    continue
                tlink = Tlink(lid, relType, event_instances[eiid],
                              times[relatedToTime])
                tlinks.append(tlink)

            if tlinkElt.hasAttribute("timeID") and \
              tlinkElt.hasAttribute("relatedToEventInstance"):
                lid = tlinkElt.attributes["lid"].value
                relType = tlinkElt.attributes["relType"].value
                tid = tlinkElt.attributes["timeID"].value
                eiid = tlinkElt.attributes["relatedToEventInstance"].value

                if tid not in times or eiid not in event_instances:
                    continue
                tlink = Tlink(lid, relType, times[tid], event_instances[eiid])
                tlinks.append(tlink)

            if tlinkElt.hasAttribute("timeID") and \
              tlinkElt.hasAttribute("relatedToTime"):
                lid = tlinkElt.attributes["lid"].value
                relType = tlinkElt.attributes["relType"].value
                tid = tlinkElt.attributes["timeID"].value
                relatedToTime = tlinkElt.attributes["relatedToTime"].value

                if tid not in times or relatedToTime not in times:
                    continue
                tlink = Tlink(lid, relType, times[tid], times[relatedToTime])
                tlinks.append(tlink)

        return TimeMLFile(sentences, events, event_instances, times, tlinks,
                          input_file)

    def read_examples(self, input_file):
        file_data = self.read_file(input_file)

        examples = []

        for tlink in file_data.tlinks:
            #print(tlink.lid, tlink.relType, tlink.e1, tlink.e2)
            sent1 = tlink.e1.sentence
            sent2 = tlink.e2.sentence
            #print(sent1, sent2)

            example = None
            if sent1 >= len(file_data.sentences) or sent2 >= len(
                    file_data.sentences):
                continue

            if sent1 == sent2:
                text = file_data.sentences[sent1]
                example = TimeMLExample(text, tlink.e1.pos_in_sentence,
                                        tlink.e2.pos_in_sentence,
                                        tlink.relType)
            elif sent1 < sent2:
                sents = file_data.sentences[sent1:sent2 + 1]
                text = " [SEP] ".join(sents)

                e1_pos = tlink.e1.pos_in_sentence
                e2_pos = sum([len(s.split()) + 1
                              for s in sents[:-1]]) + tlink.e2.pos_in_sentence

                example = TimeMLExample(text, e1_pos, e2_pos, tlink.relType)
            elif sent1 > sent2:
                sents = file_data.sentences[sent2:sent1 + 1]
                text = " [SEP] ".join(sents)

                e1_pos = sum([len(s.split()) + 1
                              for s in sents[:-1]]) + tlink.e1.pos_in_sentence
                e2_pos = tlink.e2.pos_in_sentence

                example = TimeMLExample(text, e1_pos, e2_pos, tlink.relType)

            if example:
                examples.append(example)
            #print(example)
        return examples

    def antithetics(self, all_examples):
        new_exs = []

        for ex in all_examples:
            new_ex = None
            if ex.str_label == "AFTER":
                new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "BEFORE")
                new_ex.int_label = self.label_list.index("BEFORE")
                new_exs.append(new_ex)

            if ex.str_label == "BEFORE":
                new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "AFTER")
                new_ex.int_label = self.label_list.index("AFTER")
                new_exs.append(new_ex)

            if ex.str_label == "DURING":
                new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "DURING")
                new_ex.int_label = self.label_list.index("DURING")
                new_exs.append(new_ex)

            if new_ex is not None:
                new_ex.sentences = ex.sentences
                new_ex.e1_sentence_num = ex.e2_sentence_num
                new_ex.e1_sentence_pos = ex.e2_sentence_pos
                new_ex.e2_sentence_num = ex.e1_sentence_num
                new_ex.e2_sentence_pos = ex.e1_sentence_pos

        all_examples.extend(new_exs)

    def assign_num_labels(self, all_examples):
        if not self.label_list:
            labels = set()
            for ex in all_examples:
                labels.add(ex.str_label)
            labels = list(labels)
            labels.sort()
            print(labels)
            print(len(labels))
            self.label_list = labels

        for ex in all_examples:
            ex.int_label = self.label_list.index(ex.str_label)

    def read_examples_from_directory(self, dir_path):
        #os.chdir(dir_path)
        examples_list = []
        for file in glob.glob(dir_path + "*.tml"):
            #file_path = dir_path + file
            examples = self.read_examples(file)
            examples_list.append(examples)

        all_examples = list(itertools.chain.from_iterable(examples_list))
        #antithetics(all_examples)
        print(len(all_examples))
        self.assign_num_labels(all_examples)
        return all_examples

    def read_example_files(self, dir_path):
        all_files = glob.glob(dir_path + "*.tml")
        train_files = all_files[:-4]
        dev_files = all_files[-4:]

        train_examples_list = []
        for file in train_files:
            examples = self.read_examples(file)
            train_examples_list.append(examples)
        train = list(itertools.chain.from_iterable(train_examples_list))

        dev_examples_list = []
        for file in dev_files:
            examples = self.read_examples(file)
            dev_examples_list.append(examples)
        dev = list(itertools.chain.from_iterable(dev_examples_list))
        self.assign_num_labels(train + dev)

        return train, dev

    def read_dense_examples(self, td_path, extra=False, window_size=None):
        class DenseExample(object):
            def __init__(self, file_name, e1, e2, label):
                self.file_name = file_name
                self.e1 = e1
                self.e2 = e2
                self.label = self.parse_label(label)

            def parse_label(self, label):
                labels = {
                    "a": "AFTER",
                    "b": "BEFORE",
                    "i": "INCLUDES",
                    "ii": "IS_INCLUDED",
                    "s": "SIMULTANEOUS",
                    "v": "VAGUE"
                }
                return labels[label]

        DEV_DOCS = {
            "APW19980227.0487", "CNN19980223.1130.0960", "NYT19980212.0019",
            "PRI19980216.2000.0170", "ed980111.1130.0089"
        }

        TEST_DOCS = {
            "APW19980227.0489", "APW19980227.0494", "APW19980308.0201",
            "APW19980418.0210", "CNN19980126.1600.1104",
            "CNN19980213.2130.0155", "NYT19980402.0453",
            "PRI19980115.2000.0186", "PRI19980306.2000.1675"
        }

        files_to_exs = {}

        f = open(td_path, "r")

        for line in f.readlines():
            split = line.split()
            ex = DenseExample(split[0], split[1], split[2], split[3])

            if ex.file_name not in files_to_exs:
                files_to_exs[ex.file_name] = [ex]
            else:
                files_to_exs[ex.file_name].append(ex)

        files = set(files_to_exs.keys())
        train_files = files - DEV_DOCS - TEST_DOCS
        dev_files = DEV_DOCS

        train_examples = []
        for file_name in train_files:
            file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \
                    if extra \
                    else self.read_file(FILE_DIR + "/" + file_name + ".tml")

            for ex in files_to_exs[file_name]:
                e1 = file.get_element(ex.e1)
                e2 = file.get_element(ex.e2)

                if e1 is None or e2 is None:
                    #print("oops", file_name, ex.e1, ex.e2)
                    continue

                example = file.get_example(e1, e2, ex.label, window_size)

                if not example:
                    print("o no")
                else:
                    train_examples.append(example)

        self.assign_num_labels(train_examples)

        dev_examples = []
        for file_name in dev_files:
            file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \
                    if extra \
                    else self.read_file(FILE_DIR + "/" + file_name + ".tml")

            for ex in files_to_exs[file_name]:
                e1 = file.get_element(ex.e1)
                e2 = file.get_element(ex.e2)

                if e1 is None or e2 is None:
                    #print("oops", file_name, ex.e1, ex.e2)
                    continue

                example = file.get_example(e1, e2, ex.label, window_size)

                if not example:
                    print("o no")
                else:
                    dev_examples.append(example)

        self.assign_num_labels(dev_examples)
        return train_examples, dev_examples

    def read_dense_test_examples(self, td_path, extra=False, window_size=None):
        class DenseExample(object):
            def __init__(self, file_name, e1, e2, label):
                self.file_name = file_name
                self.e1 = e1
                self.e2 = e2
                self.label = self.parse_label(label)

            def parse_label(self, label):
                labels = {
                    "a": "AFTER",
                    "b": "BEFORE",
                    "i": "INCLUDES",
                    "ii": "IS_INCLUDED",
                    "s": "SIMULTANEOUS",
                    "v": "VAGUE"
                }
                return labels[label]

        TEST_DOCS = {
            "APW19980227.0489", "APW19980227.0494", "APW19980308.0201",
            "APW19980418.0210", "CNN19980126.1600.1104",
            "CNN19980213.2130.0155", "NYT19980402.0453",
            "PRI19980115.2000.0186", "PRI19980306.2000.1675"
        }

        files_to_exs = {}

        f = open(td_path, "r")

        for line in f.readlines():
            split = line.split()
            ex = DenseExample(split[0], split[1], split[2], split[3])

            if ex.file_name not in files_to_exs:
                files_to_exs[ex.file_name] = [ex]
            else:
                files_to_exs[ex.file_name].append(ex)

        test_examples = []
        for file_name in TEST_DOCS:
            file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \
                    if extra \
                    else self.read_file(FILE_DIR + "/" + file_name + ".tml")

            for ex in files_to_exs[file_name]:
                e1 = file.get_element(ex.e1)
                e2 = file.get_element(ex.e2)

                if e1 is None or e2 is None:
                    #print("oops", file_name, ex.e1, ex.e2)
                    continue

                example = file.get_example(e1, e2, ex.label, window_size)

                if not example:
                    print("o no")
                else:
                    test_examples.append(example)

        self.assign_num_labels(test_examples)
        return test_examples

    def read_tempeval3_examples(self):
        return None, None
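A hedged sketch of driving the loader above over a directory of TimeML files; the directory path is illustrative, and read_examples_from_directory globs "*.tml" beneath it as shown:

loader = ExampleLoader()
examples = loader.read_examples_from_directory("data/timebank/")
print(len(examples), loader.label_list)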
Example No. 13
class IssueReaderSiamese(DatasetReader):
    """
    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.  If this is ``True``, training will start sooner, but will
        take longer per batch.  This also allows training with datasets that are too large to fit
        in memory.
    tokenizer : ``Tokenizer``, optional
        Tokenizer to use to split the sentence into words or other kinds of tokens.
        Defaults to ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations. Defaults to ``{"tokens":
        SingleIdTokenIndexer()}``.
    """
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 segment_sentences: bool = True,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            word_splitter=SpacyWordSplitter(pos_tags=True),
            word_stemmer=PorterStemmer())
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if segment_sentences:
            self._segment_sentences = SpacySentenceSplitter()
        self._class_cnt = defaultdict(int)

    def read_dataset(self, file_path):
        features = []
        others = []
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)
            for line in data_file:
                if not line or len(line) == 0:
                    continue
                line = json.loads(line)
                if "id" not in line.keys():
                    d_id = ""
                else:
                    d_id = line['id']
                report = split_issue_template(line['body'])
                report = self._segment_sentences.split_sentences(report)
                cmts = line['comments']
                comments = []
                for comment in cmts:
                    user_name = comment['user']
                    comment = replace_tokens(comment['body'])
                    if len(comment) == 0:
                        continue
                    comments.append((user_name, comment))
                dialog = report + comments
                if len(dialog) == 0:
                    continue
                labels = line['label']
                if len(labels) == 0:
                    label = None
                else:
                    label = "feature" if "feature" in labels or "type: feature" in labels else "other"
                if "feature" == label:
                    features.append((d_id, dialog, label))
                else:
                    others.append((d_id, dialog, label))
        return features, others

    @overrides
    def _read(self, file_path):
        features, others = self.read_dataset(file_path)
        all_data = features + others
        random.shuffle(all_data)
        same_num = 0
        diff_num = 0
        if "unlabel" in file_path:
            logger.info("Begin predict------")
            features, others = self.read_dataset(
                "frmodel/data/{}_target_train.txt")
            for sample in features + others:
                yield self.text_to_instance((sample, sample), is_gold=True)
            for sample in all_data:
                yield self.text_to_instance((sample, sample))
            logger.info(f"Predict sample num is {len(all_data)}")
        else:
            logger.info("Begin training-------")
            iter_num = 1
            if "test" in file_path:
                features, others = self.read_dataset(
                    re.sub("test", "train", file_path))
                iter_num = 1
            for _ in range(iter_num):
                # plain balance data
                if "train" in file_path:
                    for k in range(len(others) - len(features)):
                        all_data.append(random.choice(features))
                for sample in all_data:
                    positive = random.choice(features)
                    negative = random.choice(others)
                    yield self.text_to_instance((sample, positive))
                    yield self.text_to_instance((sample, negative))
                    same_num += 1
                    diff_num += 1
            logger.info(
                f"Dataset Count: Same : {same_num} / Diff : {diff_num}")

    @overrides
    def text_to_instance(self, p, is_gold=False) -> Instance:  # type: ignore
        fields: Dict[str, Field] = {}
        ins1, ins2 = p
        dialog = ListField([
            TextField([word for word in self._tokenizer.tokenize(line[1])],
                      self._token_indexers) for line in ins1[1]
        ])
        fields['dialog1'] = dialog
        fields["pos_tags1"] = ListField([
            SequenceLabelField(
                [word.tag_ for word in self._tokenizer.tokenize(line[1])],
                tokens,
                label_namespace="pos")
            for line, tokens in zip(ins1[1], dialog)
        ])
        if ins1[-1] is not None and ins2[-1] is not None:
            if ins1[-1] == ins2[-1]:
                fields['label'] = LabelField("same")
            else:
                fields['label'] = LabelField("diff")
            fields['label_tags'] = LabelField("@".join([ins1[-1], ins2[-1]]),
                                              label_namespace="label_tags")
        if ins1[-1] is not None:
            fields['label'] = LabelField(ins1[-1])
        fields['metadata'] = MetadataField({
            "is_gold": is_gold,
            "pair_instance": p
        })

        return Instance(fields)
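
A minimal sketch of the JSON-lines record shape this reader appears to expect, inferred from the keys it accesses above ("id", "body", "comments" with "user"/"body", and "label"); the file name and values below are hypothetical.

import json

# One issue report per line: "body" is split into sentences, each comment adds a
# (user, text) turn, and the label list is mapped to "feature" vs "other".
record = {
    "id": "issue-123",                       # optional; d_id defaults to "" when missing
    "body": "Please add a dark mode.\nIt would help at night.",
    "comments": [
        {"user": "alice", "body": "Agreed, this would be useful."},
        {"user": "bob", "body": "Any update on this?"},
    ],
    "label": ["type: feature"],              # [] yields label=None; "feature"/"type: feature" -> "feature"
}

with open("issues.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")
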
Ejemplo n.º 14
0
class TextCatReader(DatasetReader):
    """
    Reads tokens and their labels from a labeled text classification dataset.
    Expects a "tokens" field and a "category" field in JSON format.

    The output of ``read`` is a list of ``Instance`` s with the fields:
        tokens: ``TextField`` and
        label: ``LabelField``

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default = ``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : ``Tokenizer``, optional (default = ``WordTokenizer()``)
        Tokenizer to use to split the input text into words or other kinds of tokens.
    unrestricted_tokenizer : ``Tokenizer``, optional (default = ``None``)
        A second tokenizer; if provided, its output is stored in an additional
        ``filtered_tokens`` field alongside ``tokens``.
    segment_sentences : ``bool``, optional (default = ``False``)
        If True, we will first segment the text into sentences using SpaCy and then tokenize words.
        Necessary for some models that require pre-segmentation of sentences,
        like the Hierarchical Attention Network.
    sequence_length : ``int``, optional (default = ``None``)
        If specified, will truncate tokens to the specified maximum length.
    ignore_labels : ``bool``, optional (default = ``False``)
        If True, labels are ignored when reading data; useful for semi-supervised text classification.
    skip_label_indexing : ``bool``, optional (default = ``False``)
        Whether or not to skip label indexing. You might want to skip label indexing if your
        labels are numbers, so the dataset reader doesn't re-number them starting from 0.
    sample : ``int``, optional (default = ``None``)
        If specified, reservoir-sample this many lines from the input file instead of
        reading every line.
    unlabeled_data_path : ``str``, optional (default = ``None``)
        If provided, lines from this file are also read and their instances are marked
        as unlabeled in the ``metadata`` field.
    lazy : ``bool``, optional, (default = ``False``)
        Whether or not instances can be read lazily.
    """
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tokenizer: Tokenizer = None,
                 unrestricted_tokenizer: Tokenizer = None,
                 segment_sentences: bool = False,
                 sequence_length: int = None,
                 ignore_labels: bool = False,
                 skip_label_indexing: bool = False,
                 sample: int = None,
                 unlabeled_data_path: str = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._unrestricted_tokenizer = unrestricted_tokenizer
        self._sample = sample
        self._segment_sentences = segment_sentences
        self._sequence_length = sequence_length
        self._ignore_labels = ignore_labels
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
        self._unlabeled_data_path = unlabeled_data_path
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    def _reservoir_sampling(self, file_):
        """
        Reservoir sampling for reading random lines from a file without loading
        the entire file into memory. The sample size is taken from ``self._sample``.

        See here for an explanation of the algorithm:
        https://stackoverflow.com/questions/35680236/select-100-random-lines-from-a-file-with-a-1-million-which-cant-be-read-into-me

        Parameters
        ----------
        file_ : an open file handle (or any iterator over lines) to sample from

        Returns
        -------
        result : ``List[str]`` - the sampled lines, in shuffled order
        """
        file_iterator = iter(file_)

        try:
            result = [next(file_iterator) for _ in range(self._sample)]

        except StopIteration:
            raise ValueError("Sample larger than population")

        for index, item in enumerate(file_iterator, start=self._sample):
            # np.random.randint excludes the upper bound, so use index + 1 to
            # draw uniformly from [0, index] as the algorithm requires.
            sample_index = np.random.randint(0, index + 1)
            if sample_index < self._sample:
                result[sample_index] = item

        np.random.shuffle(result)

        return result

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            if self._sample is not None:
                lines = [(item, False)
                         for item in self._reservoir_sampling(data_file)]
            else:
                lines = [(item, True) for item in data_file.readlines()]

        if self._unlabeled_data_path:
            with open(cached_path(self._unlabeled_data_path)) as data_file:
                lines += [(item, False) for item in data_file.readlines()]

        for line, is_labeled in lines:
            items = json.loads(line)
            text = items["tokens"]
            label = str(items['category'])
            instance = self.text_to_instance(text=text,
                                             label=label,
                                             is_labeled=is_labeled)
            if instance is not None:
                yield instance

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length
        """
        if len(tokens) > self._sequence_length:
            tokens = tokens[:self._sequence_length]
        return tokens

    @overrides
    def text_to_instance(self,
                         text: str,
                         label: str = None,
                         is_labeled: bool = False) -> Instance:  # type: ignore
        """
        Parameters
        ----------
        text : ``str``, required.
            The text to classify.
        label : ``str``, optional (default = ``None``)
            The label for this text.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence or phrase.
            label : ``LabelField``
                The label of the sentence or phrase.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields['tokens'] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._sequence_length is not None:
                tokens = self._truncate(tokens)

            fields['tokens'] = TextField(tokens, self._token_indexers)

            if self._unrestricted_tokenizer:
                unrestricted_tokens = self._unrestricted_tokenizer.tokenize(
                    text)
                if self._sequence_length is not None:
                    unrestricted_tokens = self._truncate(unrestricted_tokens)
                fields['filtered_tokens'] = TextField(unrestricted_tokens,
                                                      self._token_indexers)

        # TODO: Document 'default' unsupervised label as pre-condition.
        if label is not None:
            fields['label'] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        fields['metadata'] = MetadataField({"is_labeled": is_labeled})

        return Instance(fields)
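
A minimal usage sketch for the ``TextCatReader`` above, assuming the AllenNLP 0.x-style API used in this listing; the file name, records, and parameter values are hypothetical. Setting ``sample`` routes reading through the reservoir-sampling helper instead of loading every line.

import json

# Hypothetical input: one JSON object per line with the "tokens" and "category"
# keys that _read looks up.
with open("textcat_train.jsonl", "w") as f:
    f.write(json.dumps({"tokens": "A great movie . I loved it .", "category": "pos"}) + "\n")
    f.write(json.dumps({"tokens": "Dull and far too long .", "category": "neg"}) + "\n")

reader = TextCatReader(
    segment_sentences=True,   # each document becomes a ListField of sentence TextFields
    sequence_length=400,      # truncate each sentence to at most 400 tokens
    sample=2,                 # reservoir-sample 2 lines rather than reading the whole file
)
instances = reader.read("textcat_train.jsonl")
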
Ejemplo n.º 15
0
def main(args):
    print(f"Arguments: {args}")

    attribute_to_use = args["attribute_to_use"]

    word_tokenizer = WordTokenizer()
    sentence_splitter = SpacySentenceSplitter()

    buckets = []

    bucket_strings = [i.split(':') for i in args['buckets']]

    for lower, upper in bucket_strings:
        buckets.append(
            BucketTuple(lower_bound=float(lower), upper_bound=float(upper)))

    buckets_map = OrderedDict(enumerate(buckets))

    story_buckets_map = defaultdict(lambda: defaultdict(list))
    story_text_map = defaultdict(lambda: defaultdict(list))

    with jsonlines.open(args["source_json"], mode='r') as reader:
        for json_obj in reader:

            story_id = json_obj["metadata"]["story_id"]

            source_text = json_obj["metadata"]["source_text"]
            target_text = json_obj["metadata"]["target_text"]
            source_len = len(word_tokenizer.tokenize(source_text))
            target_len = len(word_tokenizer.tokenize(target_text))

            # Reconstruct the complete text of the story for completeness.
            if json_obj["metadata"]["absolute_position"] == 1:
                story_text_map[story_id]["text"].extend(
                    sentence_splitter.split_sentences(source_text))
            story_text_map[story_id]["text"].append(source_text)

            attribute = float(json_obj[attribute_to_use])
            # TODO: Restrict to in length.
            for i, bucket in buckets_map.items():
                if bucket.lower_bound <= attribute < bucket.upper_bound:

                    if (source_len < args["min_word_length"]
                            or target_len < args["min_word_length"]):
                        continue

                    story_buckets_map[story_id][i].append(json_obj)

    with jsonlines.open(args["target_json"], mode='w') as writer:
        for story_id, buckets in story_buckets_map.items():
            # If at least one from each of the buckets is in the story then randomly select one.
            if all(len(buckets[b]) > 0 for b in buckets_map):
                selection = []
                for i, contexts in buckets.items():
                    selected = random.choice(contexts)
                    selected["bucket"] = i
                    selection.append(selected)

                random.shuffle(selection)
                task_map = {
                    "story_id": story_id,
                    "all_story_text": story_text_map[story_id],
                    "selection": selection
                }
                print(task_map)
                writer.write(task_map)
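
A hedged sketch of how ``main`` above might be invoked; the argument keys mirror the lookups in the function, while the values, file names, and the assumption that ``BucketTuple`` is a namedtuple with ``lower_bound``/``upper_bound`` fields come from reading the code, not from the original project.

# Each line of source_json is expected to carry json_obj["metadata"] with
# "story_id", "source_text", "target_text" and "absolute_position", plus the
# numeric field named by "attribute_to_use".
args = {
    "attribute_to_use": "suspense",
    "buckets": ["0.0:0.25", "0.25:0.5", "0.5:0.75", "0.75:1.01"],  # "lower:upper" pairs
    "min_word_length": 5,          # skip pairs whose source or target is shorter than this
    "source_json": "predictions.jsonl",
    "target_json": "annotation_tasks.jsonl",
}
main(args)
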
Ejemplo n.º 16
0
class ICCDatasetReader(DatasetReader):
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        # Note: the `tokenizer` argument is not used; DummyTokenizer assumes the
        # tokens arrive already split and passes them through unchanged.
        self._tokenizer = DummyTokenizer()
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path)) as data_file:
            for line in data_file:
                try:
                    text, label = line.strip().split("\t")
                except ValueError:
                    # skip lines that do not contain exactly one tab-separated text/label pair
                    continue
                instance = self.text_to_instance(text=text, label=label)
                if instance is not None:
                    yield instance

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length
        """
        if len(tokens) > self._max_sequence_length:
            tokens = tokens[:self._max_sequence_length]
        return tokens

    @overrides
    def text_to_instance(self,
                         text: str,
                         label: Union[str, int] = None) -> Instance:
        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens, self._token_indexers)
        if label is not None:
            fields["label"] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)
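
A minimal usage sketch for the ``ICCDatasetReader`` above; the file name and rows are hypothetical, and ``DummyTokenizer`` is assumed (per the constructor comment) to pass pre-split tokens through unchanged.

# Hypothetical tab-separated input matching the "text\tlabel" split in _read.
with open("icc_train.tsv", "w") as f:
    f.write("this movie was great\tpositive\n")
    f.write("what a waste of time\tnegative\n")

reader = ICCDatasetReader(max_sequence_length=256)
instances = reader.read("icc_train.tsv")
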
Ejemplo n.º 17
0
class TextClassificationJsonReader(DatasetReader):
    """
    Reads tokens and their labels from a labeled text classification dataset.
    Expects a "text" field and a "label" field in JSON format.

    The output of ``read`` is a list of ``Instance`` s with the fields:
        tokens : ``TextField`` and
        label : ``LabelField``

    # Parameters

    token_indexers : ``Dict[str, TokenIndexer]``, optional (default = ``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : ``Tokenizer``, optional (default = ``SpacyTokenizer()``)
        Tokenizer to use to split the input text into words or other kinds of tokens.
    segment_sentences : ``bool``, optional (default = ``False``)
        If True, we will first segment the text into sentences using SpaCy and then tokenize words.
        Necessary for some models that require pre-segmentation of sentences, like the Hierarchical
        Attention Network (https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf).
    max_sequence_length : ``int``, optional (default = ``None``)
        If specified, will truncate tokens to specified maximum length.
    skip_label_indexing : ``bool``, optional (default = ``False``)
        Whether or not to skip label indexing. You might want to skip label indexing if your
        labels are numbers, so the dataset reader doesn't re-number them starting from 0.
    lazy : ``bool``, optional, (default = ``False``)
        Whether or not instances can be read lazily.
    """
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in data_file.readlines():
                if not line:
                    continue
                items = json.loads(line)
                text = items["text"]
                label = items.get("label", None)
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError:
                            raise ValueError(
                                "Labels must be integers if skip_label_indexing is True."
                            )
                    else:
                        label = str(label)
                instance = self.text_to_instance(text=text, label=label)
                if instance is not None:
                    yield instance

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length
        """
        if len(tokens) > self._max_sequence_length:
            tokens = tokens[:self._max_sequence_length]
        return tokens

    @overrides
    def text_to_instance(
            self,
            text: str,
            label: Union[str, int] = None) -> Instance:  # type: ignore
        """
        # Parameters

        text : ``str``, required.
            The text to classify.
        label : ``str``, optional (default = ``None``)
            The label for this text.

        # Returns

        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence or phrase.
            label : ``LabelField``
                The label of the sentence or phrase.
        """

        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens, self._token_indexers)
        if label is not None:
            fields["label"] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)
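
A minimal usage sketch for the ``TextClassificationJsonReader`` above; the file name and example records are hypothetical.

import json

# Hypothetical JSON-lines input with the "text" and "label" keys read in _read.
with open("reviews.jsonl", "w") as f:
    f.write(json.dumps({"text": "Terrific plot. Great acting.", "label": "pos"}) + "\n")
    f.write(json.dumps({"text": "I walked out halfway through.", "label": "neg"}) + "\n")

reader = TextClassificationJsonReader(
    segment_sentences=True,     # sentences become a ListField of TextFields (HAN-style models)
    max_sequence_length=100,    # truncate each sentence to at most 100 tokens
)
instances = reader.read("reviews.jsonl")
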
Ejemplo n.º 18
0
class SquadReader(DatasetReader):
    """
    Reads a JSON-formatted SQuAD file and returns a ``Dataset`` where the ``Instances`` have four
    fields: ``question``, a ``TextField``, ``passage``, another ``TextField``, and ``span_start``
    and ``span_end``, both ``IndexFields`` into the ``passage`` ``TextField``.  We also add a
    ``MetadataField`` that stores the instance's ID, the original passage text, gold answer strings,
    and token offsets into the original passage, accessible as ``metadata['id']``,
    ``metadata['original_passage']``, ``metadata['answer_texts']`` and
    ``metadata['token_offsets']``.  This is so that we can more easily use the official SQuAD
    evaluation script to get metrics.
    We also support limiting the maximum length for both passage and question. However, some gold
    answer spans may exceed the maximum passage length, which would cause errors when making instances.
    We simply skip these spans to avoid errors. If all of the gold answer spans of an example are
    skipped, we skip the example during training. During validation or testing, since we cannot skip
    examples, we use the last token as a pseudo gold answer span instead. The computed loss will not
    be accurate as a result, but this does not affect answer evaluation, because we keep all of the
    original gold answer texts.

    Parameters
    ----------
    tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
        We use this ``Tokenizer`` for both the question and the passage.  See :class:`Tokenizer`.
        Default is ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        We similarly use this for both the question and the passage.  See :class:`TokenIndexer`.
        Default is ``{"tokens": SingleIdTokenIndexer()}``.
    lazy : ``bool``, optional (default=False)
        If this is true, ``instances()`` will return an object whose ``__iter__`` method
        reloads the dataset each time it's called. Otherwise, ``instances()`` returns a list.
    passage_length_limit : ``int``, optional (default=None)
        if specified, we will cut the passage if the length of passage exceeds this limit.
    question_length_limit : ``int``, optional (default=None)
        if specified, we will cut the question if the length of the question exceeds this limit.
    skip_invalid_examples: ``bool``, optional (default=False)
        if this is true, we will skip those invalid examples.
    """
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False,
                 passage_length_limit: int = None,
                 question_length_limit: int = None,
                 skip_invalid_examples: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sentence_splitter = SpacySentenceSplitter()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.passage_length_limit = passage_length_limit
        self.question_length_limit = question_length_limit
        self.skip_invalid_examples = skip_invalid_examples

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
        logger.info("Reading the dataset")
        total = 0.0
        find = 0.0
        for article in dataset:
            for paragraph_json in article['paragraphs']:
                paragraph = paragraph_json["context"]
                sentences = self._sentence_splitter.split_sentences(paragraph)

                for question_answer in paragraph_json['qas']:
                    question_text = question_answer["question"].strip().replace("\n", "")
                    answer_texts = [answer['text'] for answer in question_answer['answers']]
                    concat_article = ""
                    sent_labels = []
                    sent_starts = []
                    sent_ends = []
                    passage_offsets = []
                    passage_tokens = []
                    for sent in sentences:
                        tokenized_sent = self._tokenizer.tokenize(sent)
                        tokenized_sent = [Token(text=tk.text, idx=tk.idx) for tk in tokenized_sent]
                        sent_offset = [(tk.idx + len(concat_article),
                                        tk.idx + len(tk.text) + len(concat_article)) for tk in tokenized_sent]
                        passage_offsets.extend(sent_offset)
                        concat_article += sent
                        passage_tokens.extend(tokenized_sent)

                        if sent_offset:
                            sent_start = sent_offset[0][0]
                            sent_end = sent_offset[-1][1]
                            sent_starts.append(sent_start)
                            sent_ends.append(sent_end)

                            ans_appears = False
                            for ans in answer_texts:
                                if ans in sent:
                                    ans_appears = True

                            if ans_appears:
                                sent_labels.append(1)
                                find += 1
                            else:
                                sent_labels.append(0)
                        total += 1

                    instance = self.text_to_instance(question_text,
                                                     concat_article,
                                                     zip(sent_starts, sent_ends),
                                                     sent_labels,
                                                     answer_texts,
                                                     passage_tokens,
                                                     passage_offsets)
                    if instance is not None:
                        yield instance
        print("percentage:", float(find) / float(total))

    @overrides
    def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         char_spans_sent: List[Tuple[int, int]] = None,
                         sent_labels: List[int] = None,
                         answer_texts: List[str] = None,
                         passage_tokens: List[Token] = None,
                         passage_offsets: List[Tuple] = None) -> Instance:

        token_spans_sent: List[Tuple[int, int]] = []

        for char_span_sent_start, char_span_sent_end in char_spans_sent:
            (span_start_sent, span_end_sent), error = util.char_span_to_token_span(passage_offsets,
                                                                                   (char_span_sent_start,
                                                                                    char_span_sent_end))
            token_spans_sent.append((span_start_sent, span_end_sent))

        tokenized_ques = self._tokenizer.tokenize(question_text)
        tokenized_ques = [Token(text=tk.text, idx=tk.idx) for tk in tokenized_ques]

        return make_reading_comprehension_instance(tokenized_ques,
                                                   passage_tokens,
                                                   self._token_indexers,
                                                   passage_text,
                                                   token_spans_sent,
                                                   sent_labels,
                                                   answer_texts,
                                                   passage_offsets)
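
A minimal usage sketch for the ``SquadReader`` above, assuming the older AllenNLP API it is written against; the file name and content are hypothetical, and ``make_reading_comprehension_instance`` and ``util.char_span_to_token_span`` are helpers from the surrounding project.

import json

# Hypothetical SQuAD-style file with the nesting that _read walks:
# data -> paragraphs -> {context, qas -> {question, answers -> {text, answer_start}}}.
squad_like = {
    "data": [{
        "title": "Example",
        "paragraphs": [{
            "context": "The Amazon is the largest rainforest. It spans nine countries.",
            "qas": [{
                "id": "q1",
                "question": "How many countries does the Amazon span?",
                "answers": [{"text": "nine countries", "answer_start": 47}],
            }],
        }],
    }]
}
with open("squad_sample.json", "w") as f:
    json.dump(squad_like, f)

reader = SquadReader(passage_length_limit=400, question_length_limit=50)
instances = reader.read("squad_sample.json")
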