Example #1
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.tokenizer_space = WhitespaceTokenizer()
        self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                              pos_tags=True,
                                              split_on_spaces=True)
        self.token_indexers = {
            'elmo_tokens': ELMoTokenCharactersIndexer(),
            'token_characters': TokenCharactersIndexer(
                namespace='character_vocab', min_padding_length=6),
            'pos_tags': SingleIdTokenIndexer(
                namespace='pos_tag_vocab', feature_name='tag_'),
            'ner_tags': SingleIdTokenIndexer(
                namespace='ner_tag_vocab', feature_name='ent_type_'),
        }

        self.slot_indexers = {
            'elmo_tokens': ELMoTokenCharactersIndexer(),
            'token_characters': TokenCharactersIndexer(
                namespace='character_vocab', min_padding_length=6),
        }
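Each indexer in a dict like the one above produces its own view of the same token sequence at indexing time. A minimal sketch of how such a dict typically ends up on a TextField (the field name and sentence are invented here, and only two of the indexers above are shown):

# Hedged sketch, not part of the original reader: attach several indexers to one TextField.
from allennlp.data import Instance
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import SpacyTokenizer

tokenizer = SpacyTokenizer(pos_tags=True)
token_indexers = {
    'tokens': SingleIdTokenIndexer(),
    'token_characters': TokenCharactersIndexer(
        namespace='character_vocab', min_padding_length=6),
}
tokens = tokenizer.tokenize("book a flight to Boston")
# At index time, every indexer in the dict is applied to the same tokens.
instance = Instance({'utterance': TextField(tokens, token_indexers)})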
 def __init__(self, field_name: str) -> None:
     super().__init__()
     self.field_name = field_name
     self.tokenizer = SpacyTokenizer()
     self.token_indexers: Dict[str, TokenIndexer] = {
         "tokens": SingleIdTokenIndexer()
     }
    def test_never_lowercase(self):
        # Our default tokenizer doesn't handle lowercasing.
        tokenizer = SpacyTokenizer()

        #            2 15 10 11  6
        sentence = "the laziest fox"

        tokens = tokenizer.tokenize(sentence)
        tokens.append(Token("[PAD]"))  # appended manually because the tokenizer would split "[PAD]" into several tokens

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path), do_lowercase=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # PAD should get recognized and not lowercased      # [PAD]
        assert indexed_tokens["input_ids"] == [16, 2, 15, 10, 11, 6, 0, 17]

        # Unless we manually override never_lowercase with an empty tuple
        token_indexer = PretrainedBertIndexer(
            str(vocab_path), do_lowercase=True, never_lowercase=()
        )
        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # now PAD should get lowercased and be UNK          # [UNK]
        assert indexed_tokens["input_ids"] == [16, 2, 15, 10, 11, 6, 1, 17]
Example #4
class DSLSharedTaskDataset(DatasetReader):
    def __init__(self):
        super(DSLSharedTaskDataset, self).__init__(lazy=False)
        self.tokenizer = SpacyTokenizer()
        self.token_indexers = {'tokens': SingleIdTokenIndexer()}

    def _read(self, text_path: str) -> Iterable[Instance]:
        with open(text_path, "r") as text_data:
            text_data = text_data.read().splitlines()
            for line in text_data:
                try:
                    text, label = line.strip().split('\t')
                except ValueError:
                    # Skip lines that don't contain exactly one tab-separated pair.
                    print(line)
                    continue
                text_field = TextField(self.tokenizer.tokenize(text),
                                       self.token_indexers)
                label_field = LabelField(label)
                fields = {'text': text_field, 'label': label_field}
                yield Instance(fields)

    def text_to_instance(self, text: str, label: str = None) -> Instance:
        tokens = self.tokenizer.tokenize(text)
        text_field = TextField(tokens, self.token_indexers)
        fields = {'text': text_field}
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)
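Assuming the input is a file with one text<TAB>label pair per line (the file name below is only a placeholder), the reader above can be exercised like this:

# Hedged usage sketch; 'dsl_train.tsv' is an invented path, not from the original code.
reader = DSLSharedTaskDataset()
for instance in reader.read('dsl_train.tsv'):
    print(instance.fields['text'], instance.fields['label'])

# At prediction time, unlabeled text goes through text_to_instance directly.
unlabeled_instance = reader.text_to_instance('this is an example sentence')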
    def setup_method(self):
        self.tokenizer = SpacyTokenizer(pos_tags=True)
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        table_file = self.FIXTURES_ROOT / "data" / "wikitables" / "tables" / "341.tagged"
        self.graph = TableQuestionContext.read_from_file(
            table_file, self.utterance).get_table_knowledge_graph()
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name",
                                                            namespace="tokens")
        self.in_index = self.vocab.add_token_to_namespace("in",
                                                          namespace="tokens")
        self.english_index = self.vocab.add_token_to_namespace(
            "english", namespace="tokens")
        self.location_index = self.vocab.add_token_to_namespace(
            "location", namespace="tokens")
        self.mersin_index = self.vocab.add_token_to_namespace(
            "mersin", namespace="tokens")

        self.oov_index = self.vocab.get_token_index("random OOV string",
                                                    namespace="tokens")
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance,
                                         self.token_indexers, self.tokenizer)

        super().setup_method()
Example #6
 def test_char_span_to_token_span_handles_hard_cases(self):
     # An earlier version of the code had a hard time when the answer was the last token in the
     # passage.  This tests that case, on the instance that used to fail.
     tokenizer = SpacyTokenizer()
     passage = (
         "Beyonc\u00e9 is believed to have first started a relationship with Jay Z "
         'after a collaboration on "\'03 Bonnie & Clyde", which appeared on his seventh '
         "album The Blueprint 2: The Gift & The Curse (2002). Beyonc\u00e9 appeared as Jay "
         "Z's girlfriend in the music video for the song, which would further fuel "
         "speculation of their relationship. On April 4, 2008, Beyonc\u00e9 and Jay Z were "
         "married without publicity. As of April 2014, the couple have sold a combined 300 "
         "million records together. The couple are known for their private relationship, "
         "although they have appeared to become more relaxed in recent years. Beyonc\u00e9 "
         'suffered a miscarriage in 2010 or 2011, describing it as "the saddest thing" '
         "she had ever endured. She returned to the studio and wrote music in order to cope "
         "with the loss. In April 2011, Beyonc\u00e9 and Jay Z traveled to Paris in order "
         "to shoot the album cover for her 4, and unexpectedly became pregnant in Paris."
     )
     start = 912
     end = 912 + len("Paris.")
     tokens = tokenizer.tokenize(passage)
     offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
     token_span = util.char_span_to_token_span(offsets, (start, end))[0]
     assert token_span == (184, 185)
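A smaller round trip of the same helper, assuming util here is the same reading-comprehension utilities module used in the test above (its char_span_to_token_span returns a (token_span, error) pair with inclusive token indices):

tokenizer = SpacyTokenizer()
text = "The capital of France is Paris."
tokens = tokenizer.tokenize(text)
offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
char_start = text.index("Paris")
char_span = (char_start, char_start + len("Paris"))
# "Paris" is the sixth token here, so the expected inclusive span is (5, 5).
token_span, _ = util.char_span_to_token_span(offsets, char_span)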
Example #7
 def __init__(self,
              model: Model,
              dataset_reader: DatasetReader,
              language: str = "en_core_web_sm") -> None:
     super().__init__(model, dataset_reader)
     self._language = language
     self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)
Example #8
def read(fn: str) -> Iterable[List[Extraction]]:
    tokenizer = SpacyTokenizer(pos_tags=True)
    prev_sent: List[Extraction] = []

    with open(fn) as fin:
        for line in tqdm(fin):
            data = line.strip().split("\t")
            confidence = data[0]
            if not all(data[2:5]):
                # Make sure that all required elements are present
                continue
            arg1, rel, args2 = (parse_element(e) for e in data[2:5])

            # Exactly one subject and one relation
            # and at least one object
            if len(rel) == 1 and len(arg1) == 1 and len(args2) >= 1:
                sent = data[5]
                cur_ex = Extraction(
                    sent=sent,
                    toks=tokenizer.tokenize(sent),
                    arg1=arg1[0],
                    rel=rel[0],
                    args2=args2,
                    confidence=confidence,
                )

                # Decide whether to append or yield
                if not prev_sent or prev_sent[0].sent == sent:
                    prev_sent.append(cur_ex)
                else:
                    yield prev_sent
                    prev_sent = [cur_ex]
    if prev_sent:
        # Yield last element
        yield prev_sent
def read_dataset(file_path):
    with open(file_path) as dataset_file:
        tokenizer = SpacyTokenizer()
        dataset_json = json.load(dataset_file)
        dialogs = []
        for dialog in dataset_json:
            dialog_idx = dialog["dialogue_idx"]
            dialog = dialog['dialogue']
            dialog_context = None
            for turn_i, turn in enumerate(dialog):
                sys_utt = turn['system_transcript']
                user_utt = turn['transcript']
                tokenized_sys_utt = tokenizer.tokenize(sys_utt)
                if turn_i != 0:
                    tokenized_sys_utt = [Token(text="<S>", lemma_="<S>")] + tokenized_sys_utt
                tokenized_user_utt = tokenizer.tokenize(user_utt)
                if turn_i != len(dialog) - 1:
                    tokenized_user_utt = tokenized_user_utt + [Token(text="</S>", lemma_="</S>")]
                if dialog_context is None:
                    dialog_context = tokenized_sys_utt + tokenized_user_utt
                else:
                    dialog_context += tokenized_sys_utt + tokenized_user_utt
            dialog_context = [t.text for t in dialog_context]
            dialogs.append((dialog_idx, [dialog_context]))
    return dialogs
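The JSON layout the function expects can be read off the key accesses above; a hedged sketch that writes a minimal one-turn dialog (all values invented) to a temporary file and reads it back:

import json
import tempfile

# Minimal dialog matching the keys read_dataset looks up; the content is made up.
sample = [{
    "dialogue_idx": 0,
    "dialogue": [
        {"system_transcript": "", "transcript": "i need a cheap restaurant"},
    ],
}]
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as handle:
    json.dump(sample, handle)
dialogs = read_dataset(handle.name)
# -> [(0, [["i", "need", "a", "cheap", "restaurant"]])]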
def search(
    tables_directory: str,
    data: JsonDict,
    output_path: str,
    max_path_length: int,
    max_num_logical_forms: int,
    use_agenda: bool,
    output_separate_files: bool,
    conservative_agenda: bool,
) -> None:
    print(f"Starting search with {len(data)} instances", file=sys.stderr)
    language_logger = logging.getLogger("allennlp.semparse.domain_languages.wikitables_language")
    language_logger.setLevel(logging.ERROR)
    tokenizer = SpacyTokenizer()
    if output_separate_files and not os.path.exists(output_path):
        os.makedirs(output_path)
    if not output_separate_files:
        output_file_pointer = open(output_path, "w")
    for instance_data in data:
        utterance = instance_data["question"]
        question_id = instance_data["id"]
        if utterance.startswith('"') and utterance.endswith('"'):
            utterance = utterance[1:-1]
        # For example: csv/200-csv/47.csv -> tagged/200-tagged/47.tagged
        table_file = instance_data["table_filename"].replace("csv", "tagged")
        target_list = instance_data["target_values"]
        tokenized_question = tokenizer.tokenize(utterance)
        table_file = f"{tables_directory}/{table_file}"
        context = TableQuestionContext.read_from_file(table_file, tokenized_question)
        world = WikiTablesLanguage(context)
        walker = ActionSpaceWalker(world, max_path_length=max_path_length)
        correct_logical_forms = []
        if use_agenda:
            agenda = world.get_agenda(conservative=conservative_agenda)
            allow_partial_match = not conservative_agenda
            all_logical_forms = walker.get_logical_forms_with_agenda(
                agenda=agenda, max_num_logical_forms=10000, allow_partial_match=allow_partial_match
            )
        else:
            all_logical_forms = walker.get_all_logical_forms(max_num_logical_forms=10000)
        for logical_form in all_logical_forms:
            if world.evaluate_logical_form(logical_form, target_list):
                correct_logical_forms.append(logical_form)
        if output_separate_files and correct_logical_forms:
            with gzip.open(f"{output_path}/{question_id}.gz", "wt") as output_file_pointer:
                for logical_form in correct_logical_forms:
                    print(logical_form, file=output_file_pointer)
        elif not output_separate_files:
            print(f"{question_id} {utterance}", file=output_file_pointer)
            if use_agenda:
                print(f"Agenda: {agenda}", file=output_file_pointer)
            if not correct_logical_forms:
                print("NO LOGICAL FORMS FOUND!", file=output_file_pointer)
            for logical_form in correct_logical_forms[:max_num_logical_forms]:
                print(logical_form, file=output_file_pointer)
            print(file=output_file_pointer)
    if not output_separate_files:
        output_file_pointer.close()
Example #11
 def test_passes_through_correctly(self):
     tokenizer = SpacyTokenizer()
     sentence = "this (sentence) has 'crazy' \"punctuation\"."
     tokens = [t.text for t in tokenizer.tokenize(sentence)]
     expected_tokens = [
         "this", "(", "sentence", ")", "has", "'", "crazy", "'", "\"",
         "punctuation", "\"", "."
     ]
     self.assertSequenceEqual(tokens, expected_tokens)
Example #12
 def test_crashes_with_empty_feature_value_and_no_default(self):
     tokenizer = SpacyTokenizer(parse=True)
     tokens = tokenizer.tokenize("This is a sentence.")
     tokens = [t for t in tokens] + [Token("</S>")]
     vocab = Vocabulary()
     vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
     vocab.add_token_to_namespace("NONE", namespace="dep_labels")
     indexer = SingleIdTokenIndexer(namespace="dep_labels",
                                    feature_name="dep_")
     with pytest.raises(ValueError):
         indexer.tokens_to_indices([tokens[-1]], vocab)
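If the installed AllenNLP version exposes the default_value argument on SingleIdTokenIndexer (an assumption here), supplying a fallback label is the usual way to avoid this crash for tokens that lack the feature:

# Hedged sketch: fall back to "NONE" when a token has no dep_ attribute,
# assuming this AllenNLP version accepts the default_value argument.
vocab = Vocabulary()
vocab.add_token_to_namespace("NONE", namespace="dep_labels")
indexer = SingleIdTokenIndexer(namespace="dep_labels",
                               feature_name="dep_",
                               default_value="NONE")
indices = indexer.tokens_to_indices([Token("</S>")], vocab)  # no longer raises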
Example #13
def test_profile():
    data_path = "https://storage.googleapis.com/tyoyo/jwtd/v1.0/dev.tsv"
    dataset_reader = Seq2SeqDatasetReader(
        source_tokenizer=SpacyTokenizer(language="ja_core_news_sm"),
        target_tokenizer=SpacyTokenizer(language="ja_core_news_sm"),
        source_max_tokens=64,
        target_max_tokens=64,
        start_symbol="STARTSYMBOL",
        end_symbol="ENDSYMBOL",
    )
    dataset = dataset_reader.read(data_path)
Example #14
 def test_empty_list_can_be_tensorized(self):
     tokenizer = SpacyTokenizer()
     tokens = tokenizer.tokenize("Foo")
     text_field = TextField(tokens, self.word_indexer)
     list_field = ListField([text_field.empty_field()])
     fields = {
         "list": list_field,
         "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
     }
     instance = Instance(fields)
     instance.index_fields(self.vocab)
     instance.as_tensor_dict()
Example #15
    def test_no_namespace_means_no_counting(self):
        tokenizer = SpacyTokenizer(parse=True)
        tokens = tokenizer.tokenize("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = SingleIdTokenIndexer(namespace=None, feature_name="text_id")

        def fail():
            assert False

        counter = defaultdict(fail)
        for token in tokens:
            indexer.count_vocab_items(token, counter)
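For contrast, a hedged sketch of what happens when a namespace is supplied: count_vocab_items then does write into the counter (the sentence mirrors the one in the test above).

# Hedged contrast sketch: with a namespace, counting happens.
tokens = SpacyTokenizer().tokenize("This is a sentence.")
counter = defaultdict(lambda: defaultdict(int))
indexer = SingleIdTokenIndexer(namespace="tokens")
for token in tokens:
    indexer.count_vocab_items(token, counter)
assert counter["tokens"]["sentence"] == 1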
Example #16
    def setUp(self):
        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("this", "words")
        self.vocab.add_token_to_namespace("is", "words")
        self.vocab.add_token_to_namespace("a", "words")
        self.vocab.add_token_to_namespace("sentence", "words")
        self.vocab.add_token_to_namespace("s", "characters")
        self.vocab.add_token_to_namespace("e", "characters")
        self.vocab.add_token_to_namespace("n", "characters")
        self.vocab.add_token_to_namespace("t", "characters")
        self.vocab.add_token_to_namespace("c", "characters")
        for label in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]:
            self.vocab.add_token_to_namespace(label, "labels")

        self.word_indexer = {"words": SingleIdTokenIndexer("words")}
        self.words_and_characters_indexers = {
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
        }
        self.field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence"]],
            self.word_indexer)
        self.field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence"]],
            self.word_indexer)
        self.field3 = TextField(
            [Token(t) for t in ["this", "is", "another", "sentence"]],
            self.word_indexer)

        self.empty_text_field = self.field1.empty_field()
        self.index_field = IndexField(1, self.field1)
        self.empty_index_field = self.index_field.empty_field()
        self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
        self.empty_sequence_label_field = self.sequence_label_field.empty_field()

        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        empty_list_field = ListField([text_field.empty_field()])
        empty_fields = {"list_tensor": empty_list_field}
        self.empty_instance = Instance(empty_fields)

        non_empty_list_field = ListField([text_field])
        non_empty_fields = {"list_tensor": non_empty_list_field}
        self.non_empty_instance = Instance(non_empty_fields)

        super().setUp()
Example #17
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer],
        max_sequence_length: int = None,
        keep_prob: float = 1.0,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._max_sequence_length = max_sequence_length
        self._token_indexers = token_indexers
        self._tokenizer = SpacyTokenizer()

        self._keep_prob = keep_prob
        self._bert = "bert" in token_indexers
Example #18
    def test_enumerate_spans_enumerates_all_spans(self):
        tokenizer = SpacyTokenizer(pos_tags=True)
        sentence = tokenizer.tokenize("This is a sentence.")

        spans = span_utils.enumerate_spans(sentence)
        assert spans == [
            (0, 0),
            (0, 1),
            (0, 2),
            (0, 3),
            (0, 4),
            (1, 1),
            (1, 2),
            (1, 3),
            (1, 4),
            (2, 2),
            (2, 3),
            (2, 4),
            (3, 3),
            (3, 4),
            (4, 4),
        ]

        spans = span_utils.enumerate_spans(sentence,
                                           max_span_width=3,
                                           min_span_width=2)
        assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (2, 4),
                         (3, 4)]

        spans = span_utils.enumerate_spans(sentence,
                                           max_span_width=3,
                                           min_span_width=2,
                                           offset=20)
        assert spans == [(20, 21), (20, 22), (21, 22), (21, 23), (22, 23),
                         (22, 24), (23, 24)]

        def no_prefixed_punctuation(tokens: List[Token]):
            # Only include spans which don't start or end with punctuation.
            return tokens[0].pos_ != "PUNCT" and tokens[-1].pos_ != "PUNCT"

        spans = span_utils.enumerate_spans(
            sentence,
            max_span_width=3,
            min_span_width=2,
            filter_function=no_prefixed_punctuation)

        # No longer includes (2, 4) or (3, 4) as these include punctuation
        # as their last element.
        assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]
Example #19
 def test_to_params(self):
     tokenizer = SpacyTokenizer()
     params = tokenizer.to_params()
     assert isinstance(params, Params)
     assert params.params == {
         "type": "spacy",
         "language": tokenizer._language,
         "pos_tags": tokenizer._pos_tags,
         "parse": tokenizer._parse,
         "ner": tokenizer._ner,
         "keep_spacy_tokens": tokenizer._keep_spacy_tokens,
         "split_on_spaces": tokenizer._split_on_spaces,
         "start_tokens": tokenizer._start_tokens,
         "end_tokens": tokenizer._end_tokens,
     }
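Because to_params returns an ordinary Params object, it can be fed straight back through from_params; a hedged sketch of that round trip, assuming the usual Registrable machinery resolves the "spacy" type:

# Hedged round-trip sketch: rebuild the tokenizer from its own serialized params.
from allennlp.data.tokenizers import Tokenizer

tokenizer = SpacyTokenizer()
rebuilt = Tokenizer.from_params(tokenizer.to_params())
assert isinstance(rebuilt, SpacyTokenizer)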
Example #20
 def __init__(
     self,
     tokenizer: Tokenizer = None,
     token_indexers: Dict[str, TokenIndexer] = None,
     passage_length_limit: int = None,
     question_length_limit: int = None,
     skip_when_all_empty: List[str] = None,
     instance_format: str = "drop",
     relaxed_span_match_for_finding_labels: bool = True,
     **kwargs,
 ) -> None:
     super().__init__(**kwargs)
     self._tokenizer = tokenizer or SpacyTokenizer()
     self._token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self.passage_length_limit = passage_length_limit
     self.question_length_limit = question_length_limit
     self.skip_when_all_empty = skip_when_all_empty if skip_when_all_empty is not None else []
     for item in self.skip_when_all_empty:
         assert item in [
             "passage_span",
             "question_span",
             "addition_subtraction",
             "counting",
         ], f"Unsupported skip type: {item}"
     self.instance_format = instance_format
     self.relaxed_span_match_for_finding_labels = relaxed_span_match_for_finding_labels
Example #21
 def __init__(
     self,
     target_namespace: str,
     source_tokenizer: Tokenizer = None,
     target_tokenizer: Tokenizer = None,
     source_token_indexers: Dict[str, TokenIndexer] = None,
     **kwargs,
 ) -> None:
     super().__init__(**kwargs)
     self._target_namespace = target_namespace
     self._source_tokenizer = source_tokenizer or SpacyTokenizer()
     self._target_tokenizer = target_tokenizer or self._source_tokenizer
     self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
     self._target_token_indexers: Dict[str, TokenIndexer] = {
         "tokens": SingleIdTokenIndexer(namespace=self._target_namespace)
     }
     if (
         isinstance(self._target_tokenizer, PretrainedTransformerTokenizer)
         and self._target_tokenizer._add_special_tokens
     ):
         warnings.warn(
             "'add_special_tokens' is True for target_tokenizer, which is a PretrainedTransformerTokenizer. "
             "This means special tokens, such as '[CLS]' and '[SEP]', will probably end up in "
             "your model's predicted target sequences. "
             "If this is not what you intended, make sure to specify 'add_special_tokens: False' for "
             "your target_tokenizer.",
             UserWarning,
         )
Example #22
 def __init__(
     self,
     source_tokenizer: Tokenizer = None,
     target_tokenizer: Tokenizer = None,
     source_token_indexers: Dict[str, TokenIndexer] = None,
     target_token_indexers: Dict[str, TokenIndexer] = None,
     source_add_start_token: bool = True,
     source_add_end_token: bool = True,
     delimiter: str = "\t",
     source_max_tokens: Optional[int] = None,
     target_max_tokens: Optional[int] = None,
     **kwargs,
 ) -> None:
     super().__init__(**kwargs)
     self._source_tokenizer = source_tokenizer or SpacyTokenizer()
     self._target_tokenizer = target_tokenizer or self._source_tokenizer
     self._source_token_indexers = source_token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._target_token_indexers = target_token_indexers or self._source_token_indexers
     self._source_add_start_token = source_add_start_token
     self._source_add_end_token = source_add_end_token
     self._delimiter = delimiter
     self._source_max_tokens = source_max_tokens
     self._target_max_tokens = target_max_tokens
     self._source_max_exceeded = 0
     self._target_max_exceeded = 0
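A hedged sketch of driving a reader with this constructor from a tiny tab-separated file (the file name and contents are invented, and the class is assumed to be the same Seq2SeqDatasetReader used in Example #13 above):

# Hedged sketch: two source<TAB>target pairs in a throwaway file.
with open("tiny_seq2seq.tsv", "w") as handle:
    handle.write("see the cat\tsee the cat\n")
    handle.write("read the book\tread the book\n")

reader = Seq2SeqDatasetReader(source_max_tokens=16, target_max_tokens=16)
instances = list(reader.read("tiny_seq2seq.tsv"))  # one Instance per line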
Example #23
 def __init__(
     self,
     token_indexers: Dict[str, TokenIndexer] = None,
     tokenizer: Tokenizer = None,
     segment_sentences: bool = False,
     max_sequence_length: int = None,
     skip_label_indexing: bool = False,
     text_key: str = "text",
     label_key: str = "label",
     **kwargs,
 ) -> None:
     super().__init__(manual_distributed_sharding=True,
                      manual_multiprocess_sharding=True,
                      **kwargs)
     self._tokenizer = tokenizer or SpacyTokenizer()
     self._segment_sentences = segment_sentences
     self._max_sequence_length = max_sequence_length
     self._skip_label_indexing = skip_label_indexing
     self._token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._text_key = text_key
     self._label_key = label_key
     if self._segment_sentences:
         self._sentence_segmenter = SpacySentenceSplitter()
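When segment_sentences is true, the reader delegates to SpacySentenceSplitter; a hedged sketch of what that splitter does on its own (the example text is invented):

# Hedged sketch of the sentence splitter used when segment_sentences=True.
from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter

splitter = SpacySentenceSplitter()
sentences = splitter.split_sentences("The movie was dull. The ending saved it.")
# Expect roughly: ["The movie was dull.", "The ending saved it."]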
Example #24
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 hyperbolic_phrase_indexers: Dict[str, TokenIndexer] = None,
                 max_sequence_length: int = None,
                 start_tokens: List[str] = None,
                 end_tokens: List[str] = None,
                 rare_frequency: int = 10) -> None:
        super().__init__()
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace='euclidean')
        }
        self._hyperbolic_phrase_indexers = hyperbolic_phrase_indexers or {
            "tokens": SingleIdTokenIndexer(namespace='hyperbolic')
        }

        if max_sequence_length is not None:
            self._max_sequence_length: Union[
                float, Optional[int]] = max_sequence_length
        else:
            self._max_sequence_length = math.inf

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]

        self._rare_frequency = rare_frequency

        logger.info("Creating SimpleLanguageModelingDatasetReader")
        logger.info("max_sequence_length=%s", max_sequence_length)
    def __init__(self,
                 pretrained_model: str = None,
                 tokenizer: Optional[Tokenizer] = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_pieces: int = 512,
                 add_prefix: bool = False,
                 combine_input_fields: bool = True,
                 sample: int = -1) -> None:
        super().__init__()

        if pretrained_model is not None:
            self._tokenizer = PretrainedTransformerTokenizer(
                pretrained_model, max_length=max_pieces)
            token_indexer = PretrainedTransformerIndexer(pretrained_model)
            self._token_indexers = {'tokens': token_indexer}
        else:
            self._tokenizer = tokenizer or SpacyTokenizer()
            self._token_indexers = token_indexers or {
                "tokens": SingleIdTokenIndexer()
            }

        self._sample = sample
        self._add_prefix = add_prefix
        self._combine_input_fields = combine_input_fields
        self._debug_prints = -1
    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        passage_length_limit: int = None,
        question_length_limit: int = None,
        skip_impossible_questions: bool = False,
        no_answer_token: Optional[str] = None,
        **kwargs,
    ) -> None:
        if "skip_invalid_examples" in kwargs:
            import warnings

            warnings.warn(
                "'skip_invalid_examples' is deprecated, please use 'skip_impossible_questions' instead",
                DeprecationWarning,
            )
            skip_impossible_questions = kwargs.pop("skip_invalid_examples")

        super().__init__(manual_distributed_sharding=True,
                         manual_multiprocess_sharding=True,
                         **kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self.passage_length_limit = passage_length_limit
        self.question_length_limit = question_length_limit
        self.skip_impossible_questions = skip_impossible_questions
        self.no_answer_token = no_answer_token
 def __init__(
     self,
     lazy: bool = False,
     tables_directory: str = None,
     offline_logical_forms_directory: str = None,
     max_offline_logical_forms: int = 10,
     keep_if_no_logical_forms: bool = False,
     tokenizer: Tokenizer = None,
     question_token_indexers: Dict[str, TokenIndexer] = None,
     table_token_indexers: Dict[str, TokenIndexer] = None,
     use_table_for_vocab: bool = False,
     max_table_tokens: int = None,
     output_agendas: bool = False,
 ) -> None:
     super().__init__(lazy=lazy)
     self._tables_directory = tables_directory
     self._offline_logical_forms_directory = offline_logical_forms_directory
     self._max_offline_logical_forms = max_offline_logical_forms
     self._keep_if_no_logical_forms = keep_if_no_logical_forms
     self._tokenizer = tokenizer or SpacyTokenizer(pos_tags=True)
     self._question_token_indexers = question_token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._table_token_indexers = table_token_indexers or self._question_token_indexers
     self._use_table_for_vocab = use_table_for_vocab
     self._max_table_tokens = max_table_tokens
     self._output_agendas = output_agendas
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 frontend_reader: str = None,
                 frontend_args: Dict[str, Any] = {},
                 lazy: bool = False,
                 concatenate_instances: str = None,
                 concatenate_frontend_reader: str = None,
                 concatenate_frontend_args: Dict[str, Any] = None,
                 sentence1_name: str = "hypothesis",
                 sentence2_name: str = "premise",
                 **kwargs) -> None:
        super().__init__(lazy, **kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer(lowercase_tokens=True)
        }
        self._frontend = FrontEndReader.by_name(frontend_reader)(
            self, **frontend_args)
        self._concatenate_instances = concatenate_instances

        if self._concatenate_instances is not None and concatenate_frontend_reader is not None:
            self._concatenate_frontend = FrontEndReader.by_name(
                concatenate_frontend_reader)(self, **concatenate_frontend_args)

        self._sentence1_name = sentence1_name
        self._sentence2_name = sentence2_name
    def __init__(
        self,
        tokens_per_instance: int = None,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        lazy: bool = False,
    ) -> None:
        # Warn here so imports of unrelated models don't fail our tests.
        warnings.warn(
            "LanguageModelingReader is deprecated and not used by any core AllenNLP "
            "models. You almost certainly want to use "
            "SimpleLanguageModelingDatasetReader. It will be removed after 2020/01/04 "
            "in the version 1.0.0 release or later.",
            DeprecationWarning,
        )
        super().__init__(lazy)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._tokens_per_instance = tokens_per_instance

        # No matter how you want to represent the input, we'll always represent the output as a
        # single token id.  This code lets you learn a language model that concatenates word
        # embeddings with character-level encoders, in order to predict the word token that comes
        # next.
        self._output_indexer: Dict[str, TokenIndexer] = None
        for name, indexer in self._token_indexers.items():
            if isinstance(indexer, SingleIdTokenIndexer):
                self._output_indexer = {name: indexer}
                break
        else:
            self._output_indexer = {"tokens": SingleIdTokenIndexer()}
    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        max_sequence_length: int = None,
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if max_sequence_length is not None:
            self._max_sequence_length: Union[
                float, Optional[int]] = max_sequence_length
        else:
            self._max_sequence_length = math.inf

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]

        logger.info("Creating SimpleLanguageModelingDatasetReader")
        logger.info("max_sequence_length=%s", max_sequence_length)