Example #1
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              tokenizer: Tokenizer = None,
              unrestricted_tokenizer: Tokenizer = None,
              segment_sentences: bool = False,
              sequence_length: int = None,
              ignore_labels: bool = False,
              skip_label_indexing: bool = False,
              sample: int = None,
              unlabeled_data_path: str = None,
              lazy: bool = False) -> None:
     super().__init__(lazy=lazy)
     self._tokenizer = tokenizer or WordTokenizer()
     self._unrestricted_tokenizer = unrestricted_tokenizer
     self._sample = sample
     self._segment_sentences = segment_sentences
     self._sequence_length = sequence_length
     self._ignore_labels = ignore_labels
     self._skip_label_indexing = skip_label_indexing
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
     self._unlabeled_data_path = unlabeled_data_path
     if self._segment_sentences:
         self._sentence_segmenter = SpacySentenceSplitter()
    def test_read_from_file_reuters_corpus_and_segments_sentences_properly(
            self, lazy, max_sequence_length):
        reader = MultiLabelTextClassificationJsonReader(
            lazy=lazy,
            segment_sentences=True,
            max_sequence_length=max_sequence_length)
        reuters_path = Path(
            "tests/fixtures") / "data" / "reuters-21578" / "train.jsonl"
        instances = reader.read(reuters_path)
        instances = ensure_list(instances)

        splitter = SpacySentenceSplitter()
        spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False,
                                          False)

        text1 = (
            "U.K. GROWING IMPATIENT WITH JAPAN - THATCHER Prime Minister Margaret Thatcher said the"
            " U.K. Was growing more impatient with Japanese trade barriers and warned that it would"
            " soon have new powers against countries not offering reciprocal access to their"
            " markets.")
        instance1 = {"text": text1, "labels": ["acq", "trade"]}
        text2 = (
            "CANADA OIL EXPORTS RISE 20 PCT IN 1986 Canadian oil exports rose 20 pct in 1986 over"
            " the previous year to 33.96 mln cubic meters, while oil imports soared 25.2 pct to"
            " 20.58 mln cubic meters, Statistics Canada said. Production, meanwhile, was unchanged"
            " from the previous year at 91.09 mln cubic feet.")
        instance2 = {"text": text2, "labels": ["nat-gas", "crude"]}
        text3 = (
            "COFFEE, SUGAR AND COCOA EXCHANGE NAMES CHAIRMAN The New York Coffee, Sugar and Cocoa"
            " Exchange (CSCE) elected former first vice chairman Gerald Clancy to a two-year term"
            " as chairman of the board of managers, replacing previous chairman Howard Katz. Katz,"
            " chairman since 1985, will remain a board member.")
        instance3 = {"text": text3, "labels": ["sugar", "cocoa", "coffee"]}

        for instance in [instance1, instance2, instance3]:
            sentences = splitter.split_sentences(instance["text"])
            tokenized_sentences: List[List[str]] = []
            for sentence in sentences:
                tokens = [token.text for token in spacy_tokenizer(sentence)]
                if max_sequence_length:
                    tokens = tokens[:max_sequence_length]
                tokenized_sentences.append(tokens)
            instance["tokens"] = tokenized_sentences

        assert len(instances) == 3
        fields = instances[0].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance1["tokens"]
        assert fields["labels"].labels == instance1["labels"]
        fields = instances[1].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance2["tokens"]
        assert fields["labels"].labels == instance2["labels"]
        fields = instances[2].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance3["tokens"]
        assert fields["labels"].labels == instance3["labels"]
Example #3
 def __init__(
     self,
     token_indexers: Dict[str, TokenIndexer] = None,
     tokenizer: Tokenizer = None,
     segment_sentences: bool = False,
     max_sequence_length: int = None,
     skip_label_indexing: bool = False,
     text_key: str = "text",
     label_key: str = "label",
     **kwargs,
 ) -> None:
     super().__init__(manual_distributed_sharding=True,
                      manual_multiprocess_sharding=True,
                      **kwargs)
     self._tokenizer = tokenizer or SpacyTokenizer()
     self._segment_sentences = segment_sentences
     self._max_sequence_length = max_sequence_length
     self._skip_label_indexing = skip_label_indexing
     self._token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._text_key = text_key
     self._label_key = label_key
     if self._segment_sentences:
         self._sentence_segmenter = SpacySentenceSplitter()
    def test_read_from_file_ag_news_corpus_and_segments_sentences_properly(
            self, lazy: bool, label_name: str,
            max_sequence_length: Optional[int]):
        reader = TextSentimentReader(lazy=lazy,
                                     segment_sentences=True,
                                     label_name=label_name,
                                     max_sequence_length=max_sequence_length)
        ag_path = Path(DATA_DIR, 'ag_news_corpus_original.jsonl')
        if label_name == 'text_sentiment':
            ag_path = Path(DATA_DIR, 'ag_news_corpus.jsonl')
        instances = reader.read(ag_path)
        instances = ensure_list(instances)

        splitter = SpacySentenceSplitter()
        spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False,
                                          False)

        text1 = ("Memphis Rout Still Stings for No. 14 Louisville; Coach "
                 "Petrino Vows to Have Team Better Prepared. NASHVILLE, "
                 "Tenn. Nov 3, 2004 - Louisville #39;s 30-point loss "
                 "at home to Memphis last season is still a painful memory "
                 "for the Cardinals.")
        instance1 = {"text": text1, "label": "2"}
        text2 = ("AP - Eli Manning has replaced Kurt Warner as the New York"
                 " Giants' starting quarterback.")
        instance2 = {"text": text2, "label": "2"}
        text3 = ("A conference dedicated to online journalism explores the "
                 "effect blogs have on news reporting. Some say they draw "
                 "attention to under-reported stories. Others struggle to "
                 "establish the credibility enjoyed by professionals.")
        instance3 = {"text": text3, "label": "4"}

        for instance in [instance1, instance2, instance3]:
            sentences = splitter.split_sentences(instance['text'])
            tokenized_sentences: List[List[str]] = []
            for sentence in sentences:
                tokens = [token.text for token in spacy_tokenizer(sentence)]
                if max_sequence_length:
                    tokens = tokens[:max_sequence_length]
                tokenized_sentences.append(tokens)
            instance["tokens"] = tokenized_sentences

        assert len(instances) == 3
        fields = instances[0].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance1["tokens"]
        assert fields["label"].label == instance1["label"]
        fields = instances[1].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance2["tokens"]
        assert fields["label"].label == instance2["label"]
        fields = instances[2].fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == instance3["tokens"]
        assert fields["label"].label == instance3["label"]
Example #5
 def __init__(self,
              lazy: bool = True,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              split_sentence_in_doc: bool = False):
     super().__init__(lazy)
     self.tokenizer = tokenizer or WordTokenizer()
     self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
     if split_sentence_in_doc:
         self.sentence_splitter = SpacySentenceSplitter()
     else:
         self.sentence_splitter = None
Example #6
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sentence_splitter = SpacySentenceSplitter()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }

        self.cache_data(os.path.expanduser('~/.allennlp/cache/datasets'))
Example #7
 def __init__(self,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              lazy: bool = False,
              passage_length_limit: int = None,
              question_length_limit: int = None,
              skip_invalid_examples: bool = False) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or WordTokenizer()
     self._sentence_splitter = SpacySentenceSplitter()
     self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
     self.passage_length_limit = passage_length_limit
     self.question_length_limit = question_length_limit
     self.skip_invalid_examples = skip_invalid_examples
Example #8
 def __init__(self,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              segment_sentences: bool = True,
              token_indexers: Dict[str, TokenIndexer] = None) -> None:
     super().__init__(lazy)
     self._tokenizer = tokenizer or WordTokenizer(
         word_splitter=SpacyWordSplitter(pos_tags=True),
         word_stemmer=PorterStemmer())
     self._token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     if segment_sentences:
         self._segment_sentences = SpacySentenceSplitter()
     self._class_cnt = defaultdict(int)
Example #9
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tokenizer: Tokenizer = None,
                 max_sequence_length: int = None,
                 ignore_labels: bool = False,
                 sample: int = None,
                 skip_label_indexing: bool = False,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy,
                         token_indexers=token_indexers,
                         tokenizer=tokenizer,
                         max_sequence_length=max_sequence_length,
                         skip_label_indexing=skip_label_indexing)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sample = sample
        self._max_sequence_length = max_sequence_length
        self._ignore_labels = ignore_labels
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

        self.label_order = [
            'External', 'Cardio', 'Cancer', 'Stroke', 'TB/AIDS', 'Other NCD',
            'Other Comm', 'Pneumonia', 'Renal', 'Maternal', 'Diabetes', 'Liver'
        ]
Example #10
 def __init__(
     self,
     token_indexers: Dict[str, TokenIndexer] = None,
     tokenizer: Tokenizer = None,
     segment_sentences: bool = False,
     max_sequence_length: int = None,
     skip_label_indexing: bool = False,
     lazy: bool = False,
 ) -> None:
     super().__init__(lazy=lazy)
     self._tokenizer = tokenizer or SpacyTokenizer()
     self._segment_sentences = segment_sentences
     self._max_sequence_length = max_sequence_length
     self._skip_label_indexing = skip_label_indexing
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
     if self._segment_sentences:
         self._sentence_segmenter = SpacySentenceSplitter()
Example #11
class DoGDatasetReader(DatasetReader):
    def __init__(self,
                 lazy: bool = True,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 split_sentence_in_doc: bool = False):
        super().__init__(lazy)
        self.tokenizer = tokenizer or WordTokenizer()
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        if split_sentence_in_doc:
            self.sentence_splitter = SpacySentenceSplitter()
        else:
            self.sentence_splitter = None

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(os.path.join(os.path.split(file_path)[0], 'documents.json'), 'r') as doc_file:
            doc_json = json.load(doc_file)
        doc_field_dict = self.get_doc_field_dict(doc_json)

        with open(file_path, 'r') as data_file:
            for line in data_file:
                line = line.strip()
                dialog_json = json.loads(line)
                yield self.text_to_instance(dialog_json['dialogue'],
                                            doc_field_dict[dialog_json['docId']],
                                            dialog_json['whoSawDoc'])

    @overrides
    def text_to_instance(self, dialogs: List[str], doc_field: Field, who_saw_doc: int):
        tokenized_dialogs = [self.tokenizer.tokenize(dialog) for dialog in dialogs]
        for tokenized_dialog in tokenized_dialogs:
            tokenized_dialog.insert(0, Token(START_SYMBOL))
            tokenized_dialog.append(Token(END_SYMBOL))
        dialogue_field = ListField([TextField(tokenized_dialog, self.token_indexers)
                                   for tokenized_dialog in tokenized_dialogs])
        # who_saw_doc_field = MetadataField(who_saw_doc)
        # return Instance({'dialogue': dialogue_field, 'document': doc_field, 'who_saw_doc': who_saw_doc_field})
        return Instance({'dialogue': dialogue_field, 'document': doc_field})

    def get_doc_field_dict(self, doc_json: Dict) -> Dict[int, Field]:
        doc_field_dict = {}
        for idx, doc in doc_json.items():
            if self.sentence_splitter is not None:
                doc_sentence_list: List[str] = []
                for i in ('0', '1', '2', '3'):
                    doc_sentence_list.extend(self.sentence_splitter.split_sentences(doc[i]))
                tokenized_doc_sentence_list = [self.tokenizer.tokenize(doc_sequence) for doc_sequence in doc_sentence_list]
                doc_field = ListField([TextField(tokenized_doc_sentence, self.token_indexers)
                                       for tokenized_doc_sentence in tokenized_doc_sentence_list])
            else:
                doc_sequence = ' '.join(doc[i] for i in ('0', '1', '2', '3'))
                tokenized_doc = self.tokenizer.tokenize(doc_sequence)
                doc_field = TextField(tokenized_doc, self.token_indexers)

            doc_field_dict[int(idx)] = doc_field

        return doc_field_dict
Example #12
class TWTCDatasetReader(DatasetReader):
    """
    Reads a JSON file from the TWTC dataset.
    Expected format for each input line: {"report": "text", "label": "int"}
    The output of ``read`` is a list of ``Instance`` s with the fields:
        text: ``TextField``
        label: ``LabelField``
    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.  If this is ``True``, training will start sooner, but will
        take longer per batch.  This also allows training with datasets that are too large to fit
        in memory.
    tokenizer : ``Tokenizer``, optional
        Tokenizer to use to split the report text into words or other kinds of tokens.
        Defaults to ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations. Defaults to ``{"tokens":
        SingleIdTokenIndexer()}``.
    """
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sentence_splitter = SpacySentenceSplitter()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }

        self.cache_data(os.path.expanduser('~/.allennlp/cache/datasets'))

    @overrides
    def _read(self, file_path):
        file_path = cached_path(file_path)
        data = pd.read_json(file_path, lines=True,
                            orient='records')[['text', 'label']].values
        for text, label in data:
            assert isinstance(label, int)
            inst = self.text_to_instance(text, str(label))
            yield inst

    @overrides
    def text_to_instance(self, document: str, label: str = None) -> Instance:
        sentences: List[str] = self._sentence_splitter.split_sentences(
            document)
        tokenized_sents = [self._tokenizer.tokenize(sent)
                           for sent in sentences]

        fields = {
            'tokens':
            ListField(
                [TextField(s, self._token_indexers) for s in tokenized_sents])
        }
        if label:
            fields['label'] = LabelField(int(label), skip_indexing=True)
        return Instance(fields)
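The docstring above spells out the expected JSONL input, so a minimal usage sketch may help. It is only an illustration: it assumes the TWTCDatasetReader class (and its AllenNLP imports) from this example, plus a hypothetical reports.jsonl file whose lines match what _read consumes.

# Hypothetical usage of the TWTCDatasetReader defined above; "reports.jsonl"
# is an assumed file whose lines look like {"text": "...", "label": 3}.
reader = TWTCDatasetReader()                     # non-lazy by default
instances = reader.read("reports.jsonl")         # results cached under ~/.allennlp/cache/datasets

for instance in instances:
    sentence_fields = instance.fields["tokens"]  # ListField with one TextField per sentence
    label_field = instance.fields["label"]       # LabelField built with skip_indexing=True
    print(label_field.label, len(sentence_fields.field_list))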
Example #13
def entity_extraction_wikihop(args):
    predictor_conll = AllenNER(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")
    predictor_onto_note = \
        AllenNER("https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.12.21.tar.gz")
    sentence_splitter = SpacySentenceSplitter(rule_based=True)
    with open(args.path, 'r') as f:
        data = json.load(f)
    for d in tqdm(data):
        golden_ners = []
        passage = []
        question = d['query'].strip().replace("\n", "")
        question_entity = " ".join(question.split()[1:])
        question = " ".join(question.split("_"))
        for para in d['supports']:
            sentences = sentence_splitter.split_sentences(para)
            para_ners = []
            outputs_conll = predictor_conll.predict_batch_raw(sentences)
            outputs_onto_note = predictor_onto_note.predict_batch_raw(sentences)
            for out1, out2 in zip(outputs_conll, outputs_onto_note):
                entities1 = entity_extraction_(out1['words'], out1['tags'])
                entities2 = entity_extraction_(out2['words'], out2['tags'])
                entities = set(entities1).union(set(entities2))
                # print(entities)
                para_ners.append(list(entities))
            golden_ners.append(para_ners)
            passage.append(sentences)
            # parsing_info.append([title, outputs_conll])
        # print(question)
        # print(question_entity)
        # input()
        d['supports'] = passage
        d['question_entities'] = [question_entity]
        d['ners'] = golden_ners
        d['query'] = question
        # input()
    with open(args.output, 'w') as f:
        json.dump(data, f)
Example #14
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False,
                 max_pieces: int = 512,
                 max_count: int = 10,
                 max_spans: int = 10,
                 max_numbers_expression: int = 2,
                 answer_type: List[str] = None,
                 use_validated: bool = True,
                 wordpiece_numbers: bool = True,
                 number_tokenizer: Tokenizer = None,
                 sentence_tokenizer: Tokenizer = None,
                 custom_word_to_num: bool = True,
                 exp_search: str = 'add_sub',
                 max_depth: int = 3,
                 extra_numbers: List[float] = []):
        super(BertDropReader, self).__init__(lazy)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers
        self.max_pieces = max_pieces
        self.max_count = max_count
        self.max_spans = max_spans
        self.max_numbers_expression = max_numbers_expression
        self.answer_type = answer_type
        self.use_validated = use_validated
        self.wordpiece_numbers = wordpiece_numbers
        self.number_tokenizer = number_tokenizer or WordTokenizer()
        self.sentence_tokenizer = sentence_tokenizer or SpacySentenceSplitter()

        self.exp_search = exp_search
        self.max_depth = max_depth
        self.extra_numbers = extra_numbers
        self.op_dict = {'+': operator.add, '-': operator.sub, '*': operator.mul, '/': operator.truediv}
        self.operations = list(enumerate(self.op_dict.keys()))
        self.templates = [lambda x,y,z: (x + y) * z,
                          lambda x,y,z: (x - y) * z,
                          lambda x,y,z: (x + y) / z,
                          lambda x,y,z: (x - y) / z,
                          lambda x,y,z: x * y / z]
        self.template_strings = ['(%s + %s) * %s',
                                 '(%s - %s) * %s',
                                 '(%s + %s) / %s',
                                 '(%s - %s) / %s',
                                 '%s * %s / %s',]
        if custom_word_to_num:
            self.word_to_num = get_number_from_word
        else:
            self.word_to_num = DropReader.convert_word_to_number
Example #15
class TestSentenceSplitter(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = [
            "This is the first sentence.", "This is the second sentence!"
        ]
        assert tokens == expected_tokens

    def test_dep_parse_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = [
            "This is the first sentence.", "This is the second sentence!"
        ]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [
            self.rule_based_splitter.split_sentences(doc) for doc in text
        ]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(
                    batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [
            self.dep_parse_splitter.split_sentences(doc) for doc in text
        ]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(
                    batch_doc, separate_doc):
                assert batch_sentence == separate_sentence
Example #16
class TestSentenceSplitter(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!",
                           "Here's the '3rd' sentence - yes, it is.", "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    @pytest.mark.skipif(spacy.__version__ < "2.1", reason="this model changed from 2.0 to 2.1")
    def test_dep_parse_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!",
                           "Here's the '3rd' sentence - yes, it is.", "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [self.rule_based_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [self.dep_parse_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence
Example #17
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              tokenizer: Tokenizer = None,
              max_sequence_length: int = None,
              sample: int = None,
              skip_label_indexing: bool = False,
              lazy: bool = False) -> None:
     super().__init__(lazy=lazy,
                      token_indexers=token_indexers,
                      tokenizer=tokenizer,
                      max_sequence_length=max_sequence_length,
                      skip_label_indexing=skip_label_indexing)
     self._tokenizer = tokenizer or WordTokenizer()
     self._sample = sample
     self._max_sequence_length = max_sequence_length
     self._skip_label_indexing = skip_label_indexing
     self._token_indexers = token_indexers or {
         'tokens': SingleIdTokenIndexer()
     }
     if self._segment_sentences:
         self._sentence_segmenter = SpacySentenceSplitter()
Example #18
class TestSentenceSplitter(AllenNlpTestCase):
    def setUp(self):
        super(TestSentenceSplitter, self).setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!",
                           "Here's the '3rd' sentence - yes, it is.", "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    def test_dep_parse_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.", "This is the second sentence!",
                           "Here's the '3rd' sentence -", "yes, it is.", "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [self.rule_based_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [self.dep_parse_splitter.split_sentences(doc) for doc in text]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence
Example #19
class ExampleLoader(object):
    def __init__(self):
        self.label_list = None
        self.sentence_splitter = SpacySentenceSplitter()

    def get_loss_weights(self, train_examples):
        # Calculate loss weights as the inverse of label occurrence.
        loss_weights = {}
        for label in self.label_list:
            loss_weights[label] = 0

        for ex in train_examples:
            loss_weights[ex.str_label] += 1

        num_examples = len(train_examples)
        for key in loss_weights:
            loss_weights[key] = num_examples / loss_weights[key]

        weights_list = [
            float("%.3f" % loss_weights[key]) for key in self.label_list
        ]

        return weights_list

    def get_text_from_element(self, node):
        if node.nodeType == node.TEXT_NODE:
            if node.data.isspace():
                return ""
            else:
                return node.data.replace("\n", " ")
        else:
            text = ""
            for child in node.childNodes:
                text += " " + self.get_text_from_element(child) + " "
            return text

    def process_node(self, node, events, times, full_text):
        if node.nodeName == "EVENT":
            eid = node.attributes['eid'].value
            cls = node.attributes['class'].value

            event = Event(eid=eid,
                          cls=cls,
                          sentence=None,
                          pos_in_sentence=None)
            event.idx_in_doc = len(full_text)
            events[eid] = event
            return event

        if node.nodeName == "TIMEX3":
            tid = node.attributes['tid'].value
            type = node.attributes['type'].value
            time = TimeX3(tid=tid, sentence=None, pos_in_sentence=None)
            time.idx_in_doc = len(full_text)
            times[tid] = time
            return time

    def get_instances(self, instance_elts, event_instances, events,
                      input_file):
        for instance in instance_elts:
            eiid = instance.attributes["eiid"].value
            eventID = instance.attributes["eventID"].value
            tense = instance.attributes["tense"].value
            aspect = instance.attributes["aspect"].value
            polarity = instance.attributes["polarity"].value
            pos = instance.attributes["pos"].value

            if eventID not in events:
                print(eventID, input_file)
                continue

            event = events[eventID]
            sentence = event.sentence
            pos_in_sentence = event.pos_in_sentence

            instance = EventInstance(eiid, event, tense, aspect, polarity, pos,
                                     sentence, pos_in_sentence)
            event_instances[eiid] = instance

    def parse_node(self, root, events, times, full_text):
        #         print(full_text)
        for node in root.childNodes:
            if node.nodeType == node.TEXT_NODE and not node.data.isspace():
                text = re.sub(r"\n+", " ", node.data)
                text = re.sub(r"_", "", node.data)
                text = re.sub(r"&UR;", "", node.data)
                text = re.sub(r"&LR;", "", node.data)
                split_space = text.split()
                full_text += split_space
            elif node.nodeName == "TEXT":
                self.parse_node(node, events, times, full_text)
            else:
                el = self.process_node(node, events, times, full_text)
                text = self.get_text_from_element(node)
                if el:
                    el.text = text.strip()
                full_text += text.split()

    def get_full_text_to_sentences(self, full_text, sentences):
        split_sentences = [s.split() for s in sentences]

        def next_position(split_sentences, sent_num, sent_idx):
            cur_sent = split_sentences[sent_num]
            if sent_idx < len(cur_sent) - 1:
                sent_idx += 1
            else:
                sent_idx = 0
                sent_num += 1
                if sent_num < len(split_sentences):
                    cur_sent = split_sentences[sent_num]
            return sent_num, sent_idx

        full_text_to_sentences = []

        sent_num = 0
        sent_idx = 0
        for i, tok in enumerate(full_text):
            sent_tok = split_sentences[sent_num][sent_idx]
            #             print(tok, sent_tok)
            assert tok.startswith(
                sent_tok), str(i) + " " + tok + " " + sent_tok + "\n" + str(
                    split_sentences[sent_num])
            full_text_to_sentences.append(tuple([sent_num, sent_idx]))

            while len(tok) > len(sent_tok):
                tok = tok[len(sent_tok):]
                sent_num, sent_idx = next_position(split_sentences, sent_num,
                                                   sent_idx)
                sent_tok = split_sentences[sent_num][sent_idx]
                #                 print("WHILE", tok, sent_tok)
                assert tok.startswith(sent_tok), str(
                    i) + " " + tok + " " + sent_tok + "\n" + str(
                        split_sentences[sent_num])
                # print(tok)

            sent_num, sent_idx = next_position(split_sentences, sent_num,
                                               sent_idx)

        return full_text_to_sentences

    def convert_doc_idx_to_sentences(self, sentences, full_text_to_sentences,
                                     its):
        for key, obj in its.items():
            idx = obj.idx_in_doc
            sentence, pos_in_sentence = full_text_to_sentences[idx]
            #             print(idx, sentence, pos_in_sentence)
            text = sentences[sentence].split()[pos_in_sentence]
            assert text == obj.text.split()[0], text + " " + obj.text
            obj.sentence = sentence
            obj.pos_in_sentence = pos_in_sentence

    def read_file(self, input_file):
        """
        Parameters
        ----------
        input_file: str, path to input file

        Returns
        -------
        TimeMLFile containing sentences, events, eventInstances, times, and tlinks.
        """
        doc = dom.parse(input_file)
        root = doc.childNodes[0]

        events = {}
        times = {}
        full_text = []
        self.parse_node(root, events, times, full_text)
        #         print(full_text)

        sentences = self.sentence_splitter.split_sentences(" ".join(full_text))

        full_text_to_sentences = self.get_full_text_to_sentences(
            full_text, sentences)

        self.convert_doc_idx_to_sentences(sentences, full_text_to_sentences,
                                          events)
        self.convert_doc_idx_to_sentences(sentences, full_text_to_sentences,
                                          times)

        event_instances = {}
        instanceElts = root.getElementsByTagName("MAKEINSTANCE")
        self.get_instances(instanceElts, event_instances, events, input_file)

        tlinks = []
        tlinkElts = root.getElementsByTagName("TLINK")
        for tlinkElt in tlinkElts:
            if tlinkElt.hasAttribute("relatedToEventInstance") and \
              tlinkElt.hasAttribute("eventInstanceID"):
                lid = tlinkElt.attributes["lid"].value
                relType = tlinkElt.attributes["relType"].value
                eiid = tlinkElt.attributes["eventInstanceID"].value
                relatedToEventInstance = tlinkElt.attributes[
                    "relatedToEventInstance"].value

                if eiid not in event_instances or relatedToEventInstance not in event_instances:
                    continue

                tlink = Tlink(lid, relType, event_instances[eiid],
                              event_instances[relatedToEventInstance])
                tlinks.append(tlink)

            if tlinkElt.hasAttribute("eventInstanceID") and \
              tlinkElt.hasAttribute("relatedToTime"):
                lid = tlinkElt.attributes["lid"].value
                relType = tlinkElt.attributes["relType"].value
                eiid = tlinkElt.attributes["eventInstanceID"].value
                relatedToTime = tlinkElt.attributes["relatedToTime"].value

                if eiid not in event_instances or relatedToTime not in times:
                    continue
                tlink = Tlink(lid, relType, event_instances[eiid],
                              times[relatedToTime])
                tlinks.append(tlink)

            if tlinkElt.hasAttribute("timeID") and \
              tlinkElt.hasAttribute("relatedToEventInstance"):
                lid = tlinkElt.attributes["lid"].value
                relType = tlinkElt.attributes["relType"].value
                tid = tlinkElt.attributes["timeID"].value
                eiid = tlinkElt.attributes["relatedToEventInstance"].value

                if tid not in times or eiid not in event_instances:
                    continue
                tlink = Tlink(lid, relType, times[tid], event_instances[eiid])
                tlinks.append(tlink)

            if tlinkElt.hasAttribute("timeID") and \
              tlinkElt.hasAttribute("relatedToTime"):
                lid = tlinkElt.attributes["lid"].value
                relType = tlinkElt.attributes["relType"].value
                tid = tlinkElt.attributes["timeID"].value
                relatedToTime = tlinkElt.attributes["relatedToTime"].value

                if tid not in times or relatedToTime not in times:
                    continue
                tlink = Tlink(lid, relType, times[tid], times[relatedToTime])
                tlinks.append(tlink)

        return TimeMLFile(sentences, events, event_instances, times, tlinks,
                          input_file)

    def read_examples(self, input_file):
        file_data = self.read_file(input_file)

        examples = []

        for tlink in file_data.tlinks:
            #print(tlink.lid, tlink.relType, tlink.e1, tlink.e2)
            sent1 = tlink.e1.sentence
            sent2 = tlink.e2.sentence
            #print(sent1, sent2)

            example = None
            if sent1 >= len(file_data.sentences) or sent2 >= len(
                    file_data.sentences):
                continue

            if sent1 == sent2:
                text = file_data.sentences[sent1]
                example = TimeMLExample(text, tlink.e1.pos_in_sentence,
                                        tlink.e2.pos_in_sentence,
                                        tlink.relType)
            elif sent1 < sent2:
                sents = file_data.sentences[sent1:sent2 + 1]
                text = " [SEP] ".join(sents)

                e1_pos = tlink.e1.pos_in_sentence
                e2_pos = sum([len(s.split()) + 1
                              for s in sents[:-1]]) + tlink.e2.pos_in_sentence

                example = TimeMLExample(text, e1_pos, e2_pos, tlink.relType)
            elif sent1 > sent2:
                sents = file_data.sentences[sent2:sent1 + 1]
                text = " [SEP] ".join(sents)

                e1_pos = sum([len(s.split()) + 1
                              for s in sents[:-1]]) + tlink.e1.pos_in_sentence
                e2_pos = tlink.e2.pos_in_sentence

                example = TimeMLExample(text, e1_pos, e2_pos, tlink.relType)

            if example:
                examples.append(example)
            #print(example)
        return examples

    def antithetics(self, all_examples):
        new_exs = []

        for ex in all_examples:
            new_ex = None
            if ex.str_label == "AFTER":
                new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "BEFORE")
                new_ex.int_label = self.label_list.index("BEFORE")
                new_exs.append(new_ex)

            if ex.str_label == "BEFORE":
                new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "AFTER")
                new_ex.int_label = self.label_list.index("AFTER")
                new_exs.append(new_ex)

            if ex.str_label == "DURING":
                new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "DURING")
                new_ex.int_label = self.label_list.index("DURING")
                new_exs.append(new_ex)

            if new_ex is not None:
                new_ex.sentences = ex.sentences
                new_ex.e1_sentence_num = ex.e2_sentence_num
                new_ex.e1_sentence_pos = ex.e2_sentence_pos
                new_ex.e2_sentence_num = ex.e1_sentence_num
                new_ex.e2_sentence_pos = ex.e1_sentence_pos

        all_examples.extend(new_exs)

    def assign_num_labels(self, all_examples):
        if not self.label_list:
            labels = set()
            for ex in all_examples:
                labels.add(ex.str_label)
            labels = list(labels)
            labels.sort()
            print(labels)
            print(len(labels))
            self.label_list = labels

        for ex in all_examples:
            ex.int_label = self.label_list.index(ex.str_label)

    def read_examples_from_directory(self, dir_path):
        #os.chdir(dir_path)
        examples_list = []
        for file in glob.glob(dir_path + "*.tml"):
            #file_path = dir_path + file
            examples = self.read_examples(file)
            examples_list.append(examples)

        all_examples = list(itertools.chain.from_iterable(examples_list))
        #antithetics(all_examples)
        print(len(all_examples))
        self.assign_num_labels(all_examples)
        return all_examples

    def read_example_files(self, dir_path):
        all_files = glob.glob(dir_path + "*.tml")
        train_files = all_files[:-4]
        dev_files = all_files[-4:]

        train_examples_list = []
        for file in train_files:
            examples = self.read_examples(file)
            train_examples_list.append(examples)
        train = list(itertools.chain.from_iterable(train_examples_list))

        dev_examples_list = []
        for file in dev_files:
            examples = self.read_examples(file)
            dev_examples_list.append(examples)
        dev = list(itertools.chain.from_iterable(dev_examples_list))
        self.assign_num_labels(train + dev)

        return train, dev

    def read_dense_examples(self, td_path, extra=False, window_size=None):
        class DenseExample(object):
            def __init__(self, file_name, e1, e2, label):
                self.file_name = file_name
                self.e1 = e1
                self.e2 = e2
                self.label = self.parse_label(label)

            def parse_label(self, label):
                labels = {
                    "a": "AFTER",
                    "b": "BEFORE",
                    "i": "INCLUDES",
                    "ii": "IS_INCLUDED",
                    "s": "SIMULTANEOUS",
                    "v": "VAGUE"
                }
                return labels[label]

        DEV_DOCS = {
            "APW19980227.0487", "CNN19980223.1130.0960", "NYT19980212.0019",
            "PRI19980216.2000.0170", "ed980111.1130.0089"
        }

        TEST_DOCS = {
            "APW19980227.0489", "APW19980227.0494", "APW19980308.0201",
            "APW19980418.0210", "CNN19980126.1600.1104",
            "CNN19980213.2130.0155", "NYT19980402.0453",
            "PRI19980115.2000.0186", "PRI19980306.2000.1675"
        }

        files_to_exs = {}

        f = open(td_path, "r")

        for line in f.readlines():
            split = line.split()
            ex = DenseExample(split[0], split[1], split[2], split[3])

            if ex.file_name not in files_to_exs:
                files_to_exs[ex.file_name] = [ex]
            else:
                files_to_exs[ex.file_name].append(ex)

        files = set(files_to_exs.keys())
        train_files = files - DEV_DOCS - TEST_DOCS
        dev_files = DEV_DOCS

        train_examples = []
        for file_name in train_files:
            file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \
                    if extra \
                    else self.read_file(FILE_DIR + "/" + file_name + ".tml")

            for ex in files_to_exs[file_name]:
                e1 = file.get_element(ex.e1)
                e2 = file.get_element(ex.e2)

                if e1 is None or e2 is None:
                    #print("oops", file_name, ex.e1, ex.e2)
                    continue

                example = file.get_example(e1, e2, ex.label, window_size)

                if not example:
                    print("o no")
                else:
                    train_examples.append(example)

        self.assign_num_labels(train_examples)

        dev_examples = []
        for file_name in dev_files:
            file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \
                    if extra \
                    else self.read_file(FILE_DIR + "/" + file_name + ".tml")

            for ex in files_to_exs[file_name]:
                e1 = file.get_element(ex.e1)
                e2 = file.get_element(ex.e2)

                if e1 is None or e2 is None:
                    #print("oops", file_name, ex.e1, ex.e2)
                    continue

                example = file.get_example(e1, e2, ex.label, window_size)

                if not example:
                    print("o no")
                else:
                    dev_examples.append(example)

        self.assign_num_labels(dev_examples)
        return train_examples, dev_examples

    def read_dense_test_examples(self, td_path, extra=False, window_size=None):
        class DenseExample(object):
            def __init__(self, file_name, e1, e2, label):
                self.file_name = file_name
                self.e1 = e1
                self.e2 = e2
                self.label = self.parse_label(label)

            def parse_label(self, label):
                labels = {
                    "a": "AFTER",
                    "b": "BEFORE",
                    "i": "INCLUDES",
                    "ii": "IS_INCLUDED",
                    "s": "SIMULTANEOUS",
                    "v": "VAGUE"
                }
                return labels[label]

        TEST_DOCS = {
            "APW19980227.0489", "APW19980227.0494", "APW19980308.0201",
            "APW19980418.0210", "CNN19980126.1600.1104",
            "CNN19980213.2130.0155", "NYT19980402.0453",
            "PRI19980115.2000.0186", "PRI19980306.2000.1675"
        }

        files_to_exs = {}

        f = open(td_path, "r")

        for line in f.readlines():
            split = line.split()
            ex = DenseExample(split[0], split[1], split[2], split[3])

            if ex.file_name not in files_to_exs:
                files_to_exs[ex.file_name] = [ex]
            else:
                files_to_exs[ex.file_name].append(ex)

        test_examples = []
        for file_name in TEST_DOCS:
            file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \
                    if extra \
                    else self.read_file(FILE_DIR + "/" + file_name + ".tml")

            for ex in files_to_exs[file_name]:
                e1 = file.get_element(ex.e1)
                e2 = file.get_element(ex.e2)

                if e1 is None or e2 is None:
                    #print("oops", file_name, ex.e1, ex.e2)
                    continue

                example = file.get_example(e1, e2, ex.label, window_size)

                if not example:
                    print("o no")
                else:
                    test_examples.append(example)

        self.assign_num_labels(test_examples)
        return test_examples

    def read_tempeval3_examples(self):
        return None, None
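Given the read_file docstring and the directory-reading helpers above, a short hypothetical driver may clarify the intended call pattern; the TimeBank directory path is an assumption, and the trailing slash matters because the loader globs dir_path + "*.tml".

# Hypothetical driver for the ExampleLoader above; "data/timebank/" is a
# placeholder path (keep the trailing slash: the code globs dir_path + "*.tml").
loader = ExampleLoader()
examples = loader.read_examples_from_directory("data/timebank/")
print("label set:", loader.label_list)
if examples:
    first = examples[0]
    print(first.str_label, first.text[:80])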
Example #20
 def setUp(self):
     super(TestSentenceSplitter, self).setUp()
     self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
     self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)
Example #21
    for named_entity_category, words in aggregated_named_entities.items():
        word_counts = Counter(words)
        total_sum = sum(word_counts.values())
        word_frequencies = [(word, float(frequency) / total_sum)
                            for word, frequency in word_counts.items()]
        named_entities_frequency_table[
            named_entity_category] = word_frequencies

    with open(filename, 'w') as f:
        json.dump(named_entities_frequency_table, f)


instances = create_nabert_reader(
    data_path='../../data/drop_dataset/drop_dataset_train.json')
ner_tagger = fine_grained_named_entity_recognition_with_elmo_peters_2018()
sentences_splitter = SpacySentenceSplitter()
named_entities = defaultdict(list)

with torch.no_grad():
    for instance_idx, instance in enumerate(instances):
        original_question = instance.fields['metadata'].metadata[
            'original_question']
        original_passage = instance.fields['metadata'].metadata[
            'original_passage']

        aggregate_named_entities(original_question, named_entities)

        # NER tagger is more accurate when single sentences are fed as input
        passage_sentences = sentences_splitter.split_sentences(
            original_passage)
        for passage_sentence in passage_sentences:
Example #22
class TestSentenceSplitter(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def test_rule_based_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = [
            "This is the first sentence.", "This is the second sentence!"
        ]
        assert tokens == expected_tokens

    def test_dep_parse_splitter_passes_through_correctly(self):
        text = "This is the first sentence. This is the second sentence! "
        tokens = self.dep_parse_splitter.split_sentences(text)
        expected_tokens = [
            "This is the first sentence.", "This is the second sentence!"
        ]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.rule_based_splitter.batch_split_sentences(text)
        separately_split = [
            self.rule_based_splitter.split_sentences(doc) for doc in text
        ]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(
                    batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_batch_dep_parse_sentence_splitting(self):
        text = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        batch_split = self.dep_parse_splitter.batch_split_sentences(text)
        separately_split = [
            self.dep_parse_splitter.split_sentences(doc) for doc in text
        ]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(
                    batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_to_params(self):
        params = self.dep_parse_splitter.to_params()
        assert isinstance(params, Params)
        assert params.params == {
            "type": "spacy",
            "language": self.dep_parse_splitter._language,
            "rule_based": self.dep_parse_splitter._rule_based,
        }
Example #23
        number = word_to_num(word)
    except ValueError:
        try:
            number = int(word)
        except ValueError:
            try:
                number = float(word)
            except ValueError:
                number = None
    return number


tokenizer = BertDropTokenizer(pretrained_model="bert-base-uncased")
number_tokenizer = WordTokenizer()
words_splitter = WordTokenizer()
sentences_splitter = SpacySentenceSplitter()
ner_tagger = fine_grained_named_entity_recognition_with_elmo_peters_2018()
pos_tagger = span_based_constituency_parsing_with_elmo_joshi_2018()


def extract_letters_frequency(passage, sentence_idx=None):
    """
    :param passage:
    :param sentence_idx: None for whole passage, else per sentence (index 0)..
    :return:
    """
    if sentence_idx is None:
        return dict(filter(lambda k: k[0].isalpha(), Counter(passage).items()))
    else:
        sentences = extract_sentences(passage)
        sen = sentences[sentence_idx]
Example No. 24
0
async def create_dataset_db(dataset_path: str, db_discriminator: str, file_path: str, use_existing_database=True,
                            sentence_splitter: SentenceSplitter = SpacySentenceSplitter(),
                            should_save_sentiment: bool = True,
                            ner_model: Union[str, bool] = True,
                            coreference_model: Union[str, bool] = True,
                            batch_size: int = 100,
                            max_workers: int = 16,
                            marked_sentences=False,
                            cuda_device: Union[List[int], int] = None) -> str:
    file_name = os.path.basename(file_path)
    database_file = f"{dataset_path}/{file_name}_{db_discriminator}.db"
    dataset_db = f"sqlite:///{database_file}"
    logging.info(f"Cached dataset path: {dataset_db}")

    # Create dir
    try:
        os.makedirs(dataset_path)
    except OSError:
        pass

    # Remove database if it shouldn't be reused.
    if not use_existing_database:
        try:
            os.remove(database_file)
        except OSError:
            pass

    if not Path(database_file).is_file():

        loop = asyncio.get_event_loop()

        with dataset.connect(dataset_db, engine_kwargs=engine_kwargs) as db:

            # Create the main tables and columns that need indexing.
            story_table = db.create_table('story')
            story_table.create_column('story_num', db.types.integer)
            story_table.create_index(['story_num'])

            sentence_table = db.create_table('sentence')
            sentence_table.create_column('story_id', db.types.bigint)
            sentence_table.create_column('sentence_num', db.types.integer)
            sentence_table.create_column('sentence_len', db.types.integer)
            sentence_table.create_column('start_span', db.types.integer)
            sentence_table.create_column('end_span', db.types.integer)
            # Indices are created up front because creating them later causes other processes
            # to fail while a large index build is locking the database.
            sentence_table.create_index(['story_id'])
            sentence_table.create_index(['start_span'])
            sentence_table.create_index(['end_span'])

            db.query("PRAGMA journal_mode=WAL;")

        create_story_tasks = []
        with ProcessPoolExecutor(max_workers=max_workers) as executor:

            async for lines, story_nums in chunk_stories_from_file(file_path, batch_size=batch_size):

                story_ids = [db['story'].insert(dict(story_num=story_num))
                             for story_num in story_nums]

                if marked_sentences:
                    sentence_splitter = MarkerSentenceSplitter()

                create_story_tasks.append(
                    loop.run_in_executor(executor, ProcessStory(sentence_splitter),
                                         lines, story_ids))

            for i, t in enumerate(asyncio.as_completed(create_story_tasks)):
                story_ids, sentences_to_save, story_metrics = await t

                db["sentence"].insert_many(sentences_to_save)

                for m in story_metrics:
                    db["story"].update(m, ['id'])

                print(f"Batch {i} - stories text saved: {story_ids}")

            logger.info(f"Saved stories to db with ids: {story_ids}")

            await save_language_features(batch_size, dataset_db, executor, loop)

            if should_save_sentiment:
                await save_sentiment(batch_size, dataset_db, executor, loop)

        if ner_model:
            await save_ner(ner_model, batch_size, dataset_db, cuda_device=cuda_device)

        if coreference_model:
            await save_coreferences(coreference_model, dataset_db, cuda_device=cuda_device)

    return dataset_db
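A minimal invocation sketch for the coroutine above; the paths and flags are placeholders, and it assumes the module's other helpers (engine_kwargs, chunk_stories_from_file, ProcessStory, the save_* coroutines) are importable alongside create_dataset_db:

import asyncio

# Hypothetical call: caches the stories in ./cache/train.txt_train.db and skips
# the sentiment, NER, and coreference passes.
db_url = asyncio.get_event_loop().run_until_complete(
    create_dataset_db(dataset_path="./cache",
                      db_discriminator="train",
                      file_path="./stories/train.txt",
                      should_save_sentiment=False,
                      ner_model=False,
                      coreference_model=False))
print(db_url)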
Example No. 25
0
class TextCatReader(DatasetReader):
    """
    Reads tokens and their labels from a labeled text classification dataset.
    Expects a "tokens" field and a "category" field in JSON format.

    The output of ``read`` is a list of ``Instance`` s with the fields:
        tokens: ``TextField`` and
        label: ``LabelField``

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default = ``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : ``Tokenizer``, optional (default = ``WordTokenizer()``)
        Tokenizer to use to split the input text into words or other kinds of tokens.
    segment_sentences: ``bool``, optional (default = ``False``)
        If True, we will first segment the text into sentences using SpaCy and then tokenize words.
        Necessary for some models that require pre-segmentation of sentences,
        like the Hierarchical Attention Network.
    sequence_length: ``int``, optional (default = ``None``)
        If specified, will truncate tokens to specified maximum length.
    ignore_labels: ``bool``, optional (default = ``False``)
        If True, labels are ignored when reading data; useful for semi-supervised text classification.
    skip_label_indexing: ``bool``, optional (default = ``False``)
        Whether or not to skip label indexing. You might want to skip label indexing if your
        labels are numbers, so the dataset reader doesn't re-number them starting from 0.
    lazy : ``bool``, optional, (default = ``False``)
        Whether or not instances can be read lazily.
    """
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tokenizer: Tokenizer = None,
                 unrestricted_tokenizer: Tokenizer = None,
                 segment_sentences: bool = False,
                 sequence_length: int = None,
                 ignore_labels: bool = False,
                 skip_label_indexing: bool = False,
                 sample: int = None,
                 unlabeled_data_path: str = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._unrestricted_tokenizer = unrestricted_tokenizer
        self._sample = sample
        self._segment_sentences = segment_sentences
        self._sequence_length = sequence_length
        self._ignore_labels = ignore_labels
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
        self._unlabeled_data_path = unlabeled_data_path
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    def _reservoir_sampling(self, file_):
        """
        Reservoir sampling for reading random lines from a file without loading
        the entire file into memory.

        See here for an explanation of the algorithm:
        https://stackoverflow.com/questions/35680236/select-100-random-lines-from-a-file-with-a-1-million-which-cant-be-read-into-me

        Parameters
        ----------
        file_ : iterable of `str`
            An open file handle (or any iterable of lines) to sample from; the
            sample size is taken from ``self._sample``.

        Returns
        -------
        result : `List[str]` - sampled lines of the file
        """
        file_iterator = iter(file_)

        try:
            result = [next(file_iterator) for _ in range(self._sample)]

        except StopIteration:
            raise ValueError("Sample larger than population")

        for index, item in enumerate(file_iterator, start=self._sample):
            sample_index = np.random.randint(0, index + 1)
            if sample_index < self._sample:
                result[sample_index] = item

        np.random.shuffle(result)

        return result
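    # With the replacement step above, every one of the n lines seen so far stays in
    # ``result`` with probability ``self._sample / n`` (reservoir sampling, Algorithm R).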

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            if self._sample is not None:
                lines = [(item, False)
                         for item in self._reservoir_sampling(data_file)]
            else:
                lines = [(item, True) for item in data_file.readlines()]

        if self._unlabeled_data_path:
            with open(cached_path(self._unlabeled_data_path)) as data_file:
                lines += [(item, False) for item in data_file.readlines()]

        for line, is_labeled in lines:
            items = json.loads(line)
            text = items["tokens"]
            label = str(items['category'])
            instance = self.text_to_instance(text=text,
                                             label=label,
                                             is_labeled=is_labeled)
            if instance is not None:
                yield instance

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length
        """
        if len(tokens) > self._sequence_length:
            tokens = tokens[:self._sequence_length]
        return tokens

    @overrides
    def text_to_instance(self,
                         text: str,
                         label: str = None,
                         is_labeled: bool = False) -> Instance:  # type: ignore
        """
        Parameters
        ----------
        text : ``str``, required.
            The text to classify.
        label : ``str``, optional (default = ``None``).
            The label for this text.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence or phrase.
            label : ``LabelField``
                The label of the sentence or phrase.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields['tokens'] = ListField(sentences)
        else:

            tokens = self._tokenizer.tokenize(text)
            if self._sequence_length is not None:
                tokens = self._truncate(tokens)

            fields['tokens'] = TextField(tokens, self._token_indexers)

            if self._unrestricted_tokenizer:
                unrestricted_tokens = self._unrestricted_tokenizer.tokenize(
                    text)
                if self._sequence_length is not None:
                    unrestricted_tokens = self._truncate(unrestricted_tokens)
                fields['filtered_tokens'] = TextField(unrestricted_tokens,
                                                      self._token_indexers)

        # TODO: Document 'default' unsupervised label as pre-condition.
        if label is not None:
            fields['label'] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        fields['metadata'] = MetadataField({"is_labeled": is_labeled})

        return Instance(fields)
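A usage sketch for the reader above; the JSON-lines path and field values are invented, but each line is expected to carry "tokens" and "category" keys:

# Hypothetical input line: {"tokens": "some document text ...", "category": "sports"}
reader = TextCatReader(segment_sentences=True, sequence_length=100)
for instance in reader.read("data/train.jsonl"):  # placeholder path
    print(instance.fields["tokens"], instance.fields["label"])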
Example No. 26
0
class ICCDatasetReader(DatasetReader):
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = DummyTokenizer()  # input is assumed pre-tokenized; tokens pass through unchanged
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path)) as data_file:
            for line in data_file:
                try:
                    text, label = line.strip().split("\t")
                except ValueError:
                    continue
                instance = self.text_to_instance(text=text, label=label)
                if instance is not None:
                    yield instance

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length
        """
        if len(tokens) > self._max_sequence_length:
            tokens = tokens[:self._max_sequence_length]
        return tokens

    @overrides
    def text_to_instance(self,
                         text: str,
                         label: Union[str, int] = None) -> Instance:
        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens, self._token_indexers)
        if label is not None:
            fields["label"] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)
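A sketch of how this reader might be driven; the tab-separated file is a placeholder and DummyTokenizer is assumed to come from the surrounding project:

# Each line of the (hypothetical) file: "<already tokenized text>\t<label>"
reader = ICCDatasetReader(max_sequence_length=256)
for instance in reader.read("data/icc_train.tsv"):  # placeholder path
    print(instance.fields["tokens"], instance.fields["label"])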
Example No. 27
0
 def setup_method(self):
     super().setup_method()
     self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
     self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)
Example No. 28
0
class IssueReaderSiamese(DatasetReader):
    """
    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.  If this is ``True``, training will start sooner, but will
        take longer per batch.  This also allows training with datasets that are too large to fit
        in memory.
    tokenizer : ``Tokenizer``, optional
        Tokenizer to use to split the sentence into words or other kinds of tokens.
        Defaults to ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations. Defaults to ``{"tokens":
        SingleIdTokenIndexer()}``.
    """
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 segment_sentences: bool = True,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            word_splitter=SpacyWordSplitter(pos_tags=True),
            word_stemmer=PorterStemmer())
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if segment_sentences:
            self._segment_sentences = SpacySentenceSplitter()
        self._class_cnt = defaultdict(int)

    def read_dataset(self, file_path):
        features = []
        others = []
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s",
                        file_path)
            for line in data_file:
                if not line:
                    continue
                line = json.loads(line)
                if "id" not in line.keys():
                    d_id = ""
                else:
                    d_id = line['id']
                report = split_issue_template(line['body'])
                report = self._segment_sentences.split_sentences(report)
                cmts = line['comments']
                comments = []
                for comment in cmts:
                    user_name = comment['user']
                    comment = replace_tokens(comment['body'])
                    if len(comment) == 0:
                        continue
                    comments.append((user_name, comment))
                dialog = report + comments
                if len(dialog) == 0:
                    continue
                labels = line['label']
                if len(labels) == 0:
                    label = None
                else:
                    label = "feature" if "feature" in labels or "type: feature" in labels else "other"
                if "feature" == label:
                    features.append((d_id, dialog, label))
                else:
                    others.append((d_id, dialog, label))
        return features, others

    @overrides
    def _read(self, file_path):
        features, others = self.read_dataset(file_path)
        all_data = features + others
        random.shuffle(all_data)
        same_num = 0
        diff_num = 0
        if "unlabel" in file_path:
            logger.info("Begin predict------")
            features, others = self.read_dataset(
                "frmodel/data/{}_target_train.txt")
            for sample in features + others:
                yield self.text_to_instance((sample, sample), is_gold=True)
            for sample in all_data:
                yield self.text_to_instance((sample, sample))
            logger.info(f"Predict sample num is {len(all_data)}")
        else:
            logger.info("Begin training-------")
            iter_num = 1
            if "test" in file_path:
                features, others = self.read_dataset(
                    re.sub("test", "train", file_path))
                iter_num = 1
            for _ in range(iter_num):
                # plain balance data
                if "train" in file_path:
                    for k in range(len(others) - len(features)):
                        all_data.append(random.choice(features))
                for sample in all_data:
                    positive = random.choice(features)
                    negative = random.choice(others)
                    yield self.text_to_instance((sample, positive))
                    yield self.text_to_instance((sample, negative))
                    same_num += 1
                    diff_num += 1
            logger.info(
                f"Dataset Count: Same : {same_num} / Diff : {diff_num}")

    @overrides
    def text_to_instance(self, p, is_gold=False) -> Instance:  # type: ignore
        fields: Dict[str, Field] = {}
        ins1, ins2 = p
        dialog = ListField([
            TextField([word for word in self._tokenizer.tokenize(line[1])],
                      self._token_indexers) for line in ins1[1]
        ])
        fields['dialog1'] = dialog
        fields["pos_tags1"] = ListField([
            SequenceLabelField(
                [word.tag_ for word in self._tokenizer.tokenize(line[1])],
                tokens,
                label_namespace="pos")
            for line, tokens in zip(ins1[1], dialog)
        ])
        if ins1[-1] is not None and ins2[-1] is not None:
            if ins1[-1] == ins2[-1]:
                fields['label'] = LabelField("same")
            else:
                fields['label'] = LabelField("diff")
            fields['label_tags'] = LabelField("@".join([ins1[-1], ins2[-1]]),
                                              label_namespace="label_tags")
        fields['label'] = LabelField(ins1[-1])
        fields['metadata'] = MetadataField({
            "is_gold": is_gold,
            "pair_instance": p
        })

        return Instance(fields)
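For orientation, the shape of one input line as implied by read_dataset above; every value here is invented:

example_line = {
    "id": "repo/issues/42",
    "body": "Describe the bug ...",  # passed through split_issue_template, then sentence-split
    "comments": [{"user": "alice", "body": "Same problem here."}],
    "label": ["type: feature"],
}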
Example No. 29
0
 def setUp(self):
     super(TestSentenceSplitter, self).setUp()
     self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
     self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)
Example No. 30
0
 def __init__(self):
     self.label_list = None
     self.sentence_splitter = SpacySentenceSplitter()
Example No. 31
0
class TextClassificationJsonReader(DatasetReader):
    """
    Reads tokens and their labels from a labeled text classification dataset.
    Expects a "text" field and a "label" field in JSON format.

    The output of ``read`` is a list of ``Instance`` s with the fields:
        tokens : ``TextField`` and
        label : ``LabelField``

    # Parameters

    token_indexers : ``Dict[str, TokenIndexer]``, optional (default = ``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : ``Tokenizer``, optional (default = ``SpacyTokenizer()``)
        Tokenizer to use to split the input text into words or other kinds of tokens.
    segment_sentences : ``bool``, optional (default = ``False``)
        If True, we will first segment the text into sentences using SpaCy and then tokenize words.
        Necessary for some models that require pre-segmentation of sentences, like the Hierarchical
        Attention Network (https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf).
    max_sequence_length : ``int``, optional (default = ``None``)
        If specified, will truncate tokens to specified maximum length.
    skip_label_indexing : ``bool``, optional (default = ``False``)
        Whether or not to skip label indexing. You might want to skip label indexing if your
        labels are numbers, so the dataset reader doesn't re-number them starting from 0.
    lazy : ``bool``, optional, (default = ``False``)
        Whether or not instances can be read lazily.
    """
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in data_file.readlines():
                if not line:
                    continue
                items = json.loads(line)
                text = items["text"]
                label = items.get("label", None)
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError:
                            raise ValueError(
                                "Labels must be integers if skip_label_indexing is True."
                            )
                    else:
                        label = str(label)
                instance = self.text_to_instance(text=text, label=label)
                if instance is not None:
                    yield instance

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length
        """
        if len(tokens) > self._max_sequence_length:
            tokens = tokens[:self._max_sequence_length]
        return tokens

    @overrides
    def text_to_instance(
            self,
            text: str,
            label: Union[str, int] = None) -> Instance:  # type: ignore
        """
        # Parameters

        text : ``str``, required.
            The text to classify.
        label : ``str``, optional (default = ``None``).
            The label for this text.

        # Returns

        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence or phrase.
            label : ``LabelField``
                The label of the sentence or phrase.
        """

        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens, self._token_indexers)
        if label is not None:
            fields["label"] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)
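A brief usage sketch of this (pre-2.0 AllenNLP style) reader; the file path and line contents are placeholders:

# Hypothetical input line: {"text": "A gripping, well acted film.", "label": "pos"}
reader = TextClassificationJsonReader(segment_sentences=False, max_sequence_length=400)
instances = list(reader.read("data/reviews_train.jsonl"))  # placeholder path
print(instances[0].fields["tokens"], instances[0].fields["label"])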
Example No. 32
0
class TextClassificationJsonReader(DatasetReader):
    """
    Reads tokens and their labels from a labeled text classification dataset.

    The output of `read` is a list of `Instance` s with the fields:
        tokens : `TextField` and
        label : `LabelField`

    Registered as a `DatasetReader` with name "text_classification_json".

    [0]: https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf

    # Parameters

    token_indexers : `Dict[str, TokenIndexer]`, optional (default = `{"tokens": SingleIdTokenIndexer()}`)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : `Tokenizer`, optional (default = `SpacyTokenizer()`)
        Tokenizer to use to split the input text into words or other kinds of tokens.
    segment_sentences : `bool`, optional (default = `False`)
        If True, we will first segment the text into sentences using SpaCy and then tokenize words.
        Necessary for some models that require pre-segmentation of sentences, like [the Hierarchical
        Attention Network][0].
    max_sequence_length : `int`, optional (default = `None`)
        If specified, will truncate tokens to specified maximum length.
    skip_label_indexing : `bool`, optional (default = `False`)
        Whether or not to skip label indexing. You might want to skip label indexing if your
        labels are numbers, so the dataset reader doesn't re-number them starting from 0.
    text_key: `str`, optional (default=`"text"`)
        The key name of the source field in the JSON data file.
    label_key: `str`, optional (default=`"label"`)
        The key name of the target field in the JSON data file.
    """
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        text_key: str = "text",
        label_key: str = "label",
        **kwargs,
    ) -> None:
        super().__init__(manual_distributed_sharding=True,
                         manual_multiprocess_sharding=True,
                         **kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._text_key = text_key
        self._label_key = label_key
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in self.shard_iterable(data_file.readlines()):
                if not line:
                    continue
                items = json.loads(line)
                text = items[self._text_key]
                label = items.get(self._label_key)
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError:
                            raise ValueError(
                                "Labels must be integers if skip_label_indexing is True."
                            )
                    else:
                        label = str(label)
                yield self.text_to_instance(text=text, label=label)

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length
        """
        if len(tokens) > self._max_sequence_length:
            tokens = tokens[:self._max_sequence_length]
        return tokens

    @overrides
    def text_to_instance(
            self,
            text: str,
            label: Union[str, int] = None) -> Instance:  # type: ignore
        """
        # Parameters

        text : `str`, required.
            The text to classify.
        label : `str`, optional, (default = `None`).
            The label for this text.

        # Returns

        An `Instance` containing the following fields:
            - tokens (`TextField`) :
              The tokens in the sentence or phrase.
            - label (`LabelField`) :
              The label of the sentence or phrase.
        """

        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens)
        if label is not None:
            fields["label"] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        if self._segment_sentences:
            for text_field in instance.fields["tokens"]:  # type: ignore
                text_field._token_indexers = self._token_indexers
        else:
            instance.fields[
                "tokens"]._token_indexers = self._token_indexers  # type: ignore