Example #1
def test_profile():
    data_path = "https://storage.googleapis.com/tyoyo/jwtd/v1.0/dev.tsv"
    dataset_reader = Seq2SeqDatasetReader(
        source_tokenizer=SpacyTokenizer(language="ja_core_news_sm"),
        target_tokenizer=SpacyTokenizer(language="ja_core_news_sm"),
        source_max_tokens=64,
        target_max_tokens=64,
        start_symbol="STARTSYMBOL",
        end_symbol="ENDSYMBOL",
    )
    dataset = dataset_reader.read(data_path)
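
A small follow-up sketch, reusing dataset from the test above. The "source_tokens" / "target_tokens" field names are those of AllenNLP's standard Seq2SeqDatasetReader and may differ in other readers; read() may return a list or a lazy generator depending on the AllenNLP version.

# Sketch only: reuses `dataset` from the test above.
for i, instance in enumerate(dataset):
    print(instance.fields["source_tokens"].tokens)
    print(instance.fields["target_tokens"].tokens)
    if i >= 2:
        break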
Example #2
    def test_keep_spacy_tokens(self):
        word_tokenizer = SpacyTokenizer()
        sentence = "This should be an allennlp Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, Token) for token in tokens)

        word_tokenizer = SpacyTokenizer(keep_spacy_tokens=True)
        sentence = "This should be a spacy Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, spacy.tokens.Token) for token in tokens)
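
For quick experiments outside a test class, a minimal sketch like the following shows the attributes SpacyTokenizer fills in on each allennlp Token (it assumes the en_core_web_sm spaCy model is installed):

from allennlp.data.tokenizers import SpacyTokenizer

tokenizer = SpacyTokenizer(language="en_core_web_sm", pos_tags=True)
for token in tokenizer.tokenize("AllenNLP wraps spaCy tokens."):
    # Each Token carries the text, its character offset, and (with pos_tags=True) POS info.
    print(token.text, token.idx, token.pos_)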
Example #3
 def __init__(
     self,
     target_namespace: str,
     source_tokenizer: Tokenizer = None,
     target_tokenizer: Tokenizer = None,
     source_token_indexers: Dict[str, TokenIndexer] = None,
     **kwargs,
 ) -> None:
     super().__init__(**kwargs)
     self._target_namespace = target_namespace
     self._source_tokenizer = source_tokenizer or SpacyTokenizer()
     self._target_tokenizer = target_tokenizer or self._source_tokenizer
     self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
     self._target_token_indexers: Dict[str, TokenIndexer] = {
         "tokens": SingleIdTokenIndexer(namespace=self._target_namespace)
     }
     if (
         isinstance(self._target_tokenizer, PretrainedTransformerTokenizer)
         and self._target_tokenizer._add_special_tokens
     ):
         warnings.warn(
             "'add_special_tokens' is True for target_tokenizer, which is a PretrainedTransformerTokenizer. "
             "This means special tokens, such as '[CLS]' and '[SEP]', will probably end up in "
             "your model's predicted target sequences. "
             "If this is not what you intended, make sure to specify 'add_special_tokens: False' for "
             "your target_tokenizer.",
             UserWarning,
         )
Example #4
 def __init__(self,
              model: Model,
              dataset_reader: DatasetReader,
              language: str = "en_core_web_sm") -> None:
     super().__init__(model, dataset_reader)
     self._language = language
     self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)
Example #5
    def __init__(
        self,
        tokens_per_instance: int = None,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._tokens_per_instance = tokens_per_instance

        # No matter how you want to represent the input, we'll always represent the output as a
        # single token id.  This code lets you learn a language model that concatenates word
        # embeddings with character-level encoders, in order to predict the word token that comes
        # next.
        self._output_indexer: Dict[str, TokenIndexer] = None
        for name, indexer in self._token_indexers.items():
            if isinstance(indexer, SingleIdTokenIndexer):
                self._output_indexer = {name: indexer}
                break
        else:
            self._output_indexer = {"tokens": SingleIdTokenIndexer()}
Example #6
    def test_never_lowercase(self):
        # Our default tokenizer doesn't handle lowercasing.
        tokenizer = SpacyTokenizer()

        #            2 15 10 11  6
        sentence = "the laziest fox"

        tokens = tokenizer.tokenize(sentence)
        tokens.append(Token("[PAD]"))  # have to do this b/c tokenizer splits it in three

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path), do_lowercase=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # PAD should get recognized and not lowercased      # [PAD]
        assert indexed_tokens["input_ids"] == [16, 2, 15, 10, 11, 6, 0, 17]

        # Unless we manually override the never_lowercase set
        token_indexer = PretrainedBertIndexer(
            str(vocab_path), do_lowercase=True, never_lowercase=()
        )
        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

        # now PAD should get lowercased and be UNK          # [UNK]
        assert indexed_tokens["input_ids"] == [16, 2, 15, 10, 11, 6, 1, 17]
Example #7
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              tokenizer: Tokenizer = None,
              lazy: bool = False) -> None:
     super().__init__(lazy=lazy)
     self._tokenizer = tokenizer or SpacyTokenizer()
     self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #8
    def setup_method(self):
        self.tokenizer = SpacyTokenizer(pos_tags=True)
        self.utterance = self.tokenizer.tokenize("where is mersin?")
        self.token_indexers = {"tokens": SingleIdTokenIndexer("tokens")}

        table_file = self.FIXTURES_ROOT / "data" / "wikitables" / "tables" / "341.tagged"
        self.graph = TableQuestionContext.read_from_file(
            table_file, self.utterance).get_table_knowledge_graph()
        self.vocab = Vocabulary()
        self.name_index = self.vocab.add_token_to_namespace("name",
                                                            namespace="tokens")
        self.in_index = self.vocab.add_token_to_namespace("in",
                                                          namespace="tokens")
        self.english_index = self.vocab.add_token_to_namespace(
            "english", namespace="tokens")
        self.location_index = self.vocab.add_token_to_namespace(
            "location", namespace="tokens")
        self.mersin_index = self.vocab.add_token_to_namespace(
            "mersin", namespace="tokens")

        self.oov_index = self.vocab.get_token_index("random OOV string",
                                                    namespace="tokens")
        self.edirne_index = self.oov_index
        self.field = KnowledgeGraphField(self.graph, self.utterance,
                                         self.token_indexers, self.tokenizer)

        super().setup_method()
Example #9
    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        max_sequence_length: int = None,
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if max_sequence_length is not None:
            self._max_sequence_length: Union[
                float, Optional[int]] = max_sequence_length
        else:
            self._max_sequence_length = math.inf

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]

        logger.info("Creating SimpleLanguageModelingDatasetReader")
        logger.info("max_sequence_length=%s", max_sequence_length)
Example #10
 def __init__(self, field_name: str) -> None:
     super().__init__()
     self.field_name = field_name
     self.tokenizer = SpacyTokenizer()
     self.token_indexers: Dict[str, TokenIndexer] = {
         "tokens": SingleIdTokenIndexer()
     }
Example #11
 def __init__(
     self,
     tokenizer: Tokenizer = None,
     token_indexers: Dict[str, TokenIndexer] = None,
     passage_length_limit: int = None,
     question_length_limit: int = None,
     skip_when_all_empty: List[str] = None,
     instance_format: str = "drop",
     relaxed_span_match_for_finding_labels: bool = True,
     **kwargs,
 ) -> None:
     super().__init__(**kwargs)
     self._tokenizer = tokenizer or SpacyTokenizer()
     self._token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self.passage_length_limit = passage_length_limit
     self.question_length_limit = question_length_limit
     self.skip_when_all_empty = skip_when_all_empty if skip_when_all_empty is not None else []
     for item in self.skip_when_all_empty:
         assert item in [
             "passage_span",
             "question_span",
             "addition_subtraction",
             "counting",
         ], f"Unsupported skip type: {item}"
     self.instance_format = instance_format
     self.relaxed_span_match_for_finding_labels = relaxed_span_match_for_finding_labels
Example #12
    def __init__(
        self,
        tokens_per_instance: int = None,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        lazy: bool = False,
    ) -> None:
        # Warn here so imports of unrelated models don't fail our tests.
        warnings.warn(
            "LanguageModelingReader is deprecated and not used by any core AllenNLP "
            "models. You almost certainly want to use "
            "SimpleLanguageModelingDatasetReader. It will be removed after 2020/01/04 "
            "in the version 1.0.0 release or later.",
            DeprecationWarning,
        )
        super().__init__(lazy)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._tokens_per_instance = tokens_per_instance

        # No matter how you want to represent the input, we'll always represent the output as a
        # single token id.  This code lets you learn a language model that concatenates word
        # embeddings with character-level encoders, in order to predict the word token that comes
        # next.
        self._output_indexer: Dict[str, TokenIndexer] = None
        for name, indexer in self._token_indexers.items():
            if isinstance(indexer, SingleIdTokenIndexer):
                self._output_indexer = {name: indexer}
                break
        else:
            self._output_indexer = {"tokens": SingleIdTokenIndexer()}
Example #13
    def __init__(self,
                 pretrained_model: str = None,
                 tokenizer: Optional[Tokenizer] = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_pieces: int = 512,
                 add_prefix: bool = False,
                 combine_input_fields: bool = True,
                 sample: int = -1) -> None:
        super().__init__()

        if pretrained_model is not None:
            self._tokenizer = PretrainedTransformerTokenizer(
                pretrained_model, max_length=max_pieces)
            token_indexer = PretrainedTransformerIndexer(pretrained_model)
            self._token_indexers = {'tokens': token_indexer}
        else:
            self._tokenizer = tokenizer or SpacyTokenizer()
            self._token_indexers = token_indexers or {
                "tokens": SingleIdTokenIndexer()
            }

        self._sample = sample
        self._add_prefix = add_prefix
        self._combine_input_fields = combine_input_fields
        self._debug_prints = -1
Example #14
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 hyperbolic_phrase_indexers: Dict[str, TokenIndexer] = None,
                 max_sequence_length: int = None,
                 start_tokens: List[str] = None,
                 end_tokens: List[str] = None,
                 rare_frequency: int = 10) -> None:
        super().__init__()
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace='euclidean')
        }
        self._hyperbolic_phrase_indexers = hyperbolic_phrase_indexers or {
            "tokens": SingleIdTokenIndexer(namespace='hyperbolic')
        }

        if max_sequence_length is not None:
            self._max_sequence_length: Union[
                float, Optional[int]] = max_sequence_length
        else:
            self._max_sequence_length = math.inf

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]

        self._rare_frequency = rare_frequency

        logger.info("Creating SimpleLanguageModelingDatasetReader")
        logger.info("max_sequence_length=%s", max_sequence_length)
Example #15
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.tokenizer_space = WhitespaceTokenizer()
        self.tokenizer_spacy = SpacyTokenizer(language="en_core_web_md",
                                              pos_tags=True,
                                              split_on_spaces=True)
        self.token_indexers = {
            'elmo_tokens':
            ELMoTokenCharactersIndexer(),
            'token_characters':
            TokenCharactersIndexer(namespace='character_vocab',
                                   min_padding_length=6),
            'pos_tags':
            SingleIdTokenIndexer(namespace='pos_tag_vocab',
                                 feature_name='tag_'),
            'ner_tags':
            SingleIdTokenIndexer(namespace='ner_tag_vocab',
                                 feature_name='ent_type_')
        }

        self.slot_indexers = {
            'elmo_tokens':
            ELMoTokenCharactersIndexer(),
            'token_characters':
            TokenCharactersIndexer(namespace='character_vocab',
                                   min_padding_length=6)
        }
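
A self-contained sketch of how tokens from a POS-tagging SpacyTokenizer can be combined with several indexers in one TextField, mirroring the namespaces used above; the sentence is made up for illustration:

from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import SpacyTokenizer

tokenizer = SpacyTokenizer(pos_tags=True)
tokens = tokenizer.tokenize("find flights from Boston to Denver")
field = TextField(
    tokens,
    {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(namespace="character_vocab",
                                                   min_padding_length=6),
        "pos_tags": SingleIdTokenIndexer(namespace="pos_tag_vocab",
                                         feature_name="tag_"),
    },
)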
Example #16
    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        passage_length_limit: int = None,
        question_length_limit: int = None,
        skip_impossible_questions: bool = False,
        no_answer_token: Optional[str] = None,
        **kwargs,
    ) -> None:
        if "skip_invalid_examples" in kwargs:
            import warnings

            warnings.warn(
                "'skip_invalid_examples' is deprecated, please use 'skip_impossible_questions' instead",
                DeprecationWarning,
            )
            skip_impossible_questions = kwargs.pop("skip_invalid_examples")

        super().__init__(manual_distributed_sharding=True,
                         manual_multiprocess_sharding=True,
                         **kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self.passage_length_limit = passage_length_limit
        self.question_length_limit = question_length_limit
        self.skip_impossible_questions = skip_impossible_questions
        self.no_answer_token = no_answer_token
Example #17
def read(fn: str) -> Iterable[List[Extraction]]:
    tokenizer = SpacyTokenizer(pos_tags=True)
    prev_sent: List[Extraction] = []

    with open(fn) as fin:
        for line in tqdm(fin):
            data = line.strip().split("\t")
            confidence = data[0]
            if not all(data[2:5]):
                # Make sure that all required elements are present
                continue
            arg1, rel, args2 = (parse_element(e) for e in data[2:5])

            # Exactly one subject and one relation
            # and at least one object
            if len(rel) == 1 and len(arg1) == 1 and len(args2) >= 1:
                sent = data[5]
                cur_ex = Extraction(
                    sent=sent,
                    toks=tokenizer.tokenize(sent),
                    arg1=arg1[0],
                    rel=rel[0],
                    args2=args2,
                    confidence=confidence,
                )

                # Decide whether to append or yield
                if not prev_sent or prev_sent[0].sent == sent:
                    prev_sent.append(cur_ex)
                else:
                    yield prev_sent
                    prev_sent = [cur_ex]
    if prev_sent:
        # Yield last element
        yield prev_sent
Example #18
    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 frontend_reader: str = None,
                 frontend_args: Dict[str, Any] = None,
                 lazy: bool = False,
                 concatenate_instances: str = None,
                 concatenate_frontend_reader: str = None,
                 concatenate_frontend_args: Dict[str, Any] = None,
                 sentence1_name: str = "hypothesis",
                 sentence2_name: str = "premise",
                 **kwargs) -> None:
        super().__init__(lazy, **kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer(lowercase_tokens=True)
        }
        self._frontend = FrontEndReader.by_name(frontend_reader)(
            self, **(frontend_args or {}))
        self._concatenate_instances = concatenate_instances

        if self._concatenate_instances is not None and concatenate_frontend_reader is not None:
            self._concatenate_frontend = FrontEndReader.by_name(
                concatenate_frontend_reader)(self, **concatenate_frontend_args)

        self._sentence1_name = sentence1_name
        self._sentence2_name = sentence2_name
Example #19
 def __init__(
     self,
     lazy: bool = False,
     tables_directory: str = None,
     offline_logical_forms_directory: str = None,
     max_offline_logical_forms: int = 10,
     keep_if_no_logical_forms: bool = False,
     tokenizer: Tokenizer = None,
     question_token_indexers: Dict[str, TokenIndexer] = None,
     table_token_indexers: Dict[str, TokenIndexer] = None,
     use_table_for_vocab: bool = False,
     max_table_tokens: int = None,
     output_agendas: bool = False,
 ) -> None:
     super().__init__(lazy=lazy)
     self._tables_directory = tables_directory
     self._offline_logical_forms_directory = offline_logical_forms_directory
     self._max_offline_logical_forms = max_offline_logical_forms
     self._keep_if_no_logical_forms = keep_if_no_logical_forms
     self._tokenizer = tokenizer or SpacyTokenizer(pos_tags=True)
     self._question_token_indexers = question_token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._table_token_indexers = table_token_indexers or self._question_token_indexers
     self._use_table_for_vocab = use_table_for_vocab
     self._max_table_tokens = max_table_tokens
     self._output_agendas = output_agendas
Example #20
 def __init__(
     self,
     token_indexers: Dict[str, TokenIndexer] = None,
     tokenizer: Tokenizer = None,
     segment_sentences: bool = False,
     max_sequence_length: int = None,
     skip_label_indexing: bool = False,
     text_key: str = "text",
     label_key: str = "label",
     **kwargs,
 ) -> None:
     super().__init__(manual_distributed_sharding=True,
                      manual_multiprocess_sharding=True,
                      **kwargs)
     self._tokenizer = tokenizer or SpacyTokenizer()
     self._segment_sentences = segment_sentences
     self._max_sequence_length = max_sequence_length
     self._skip_label_indexing = skip_label_indexing
     self._token_indexers = token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._text_key = text_key
     self._label_key = label_key
     if self._segment_sentences:
         self._sentence_segmenter = SpacySentenceSplitter()
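
The text_key and label_key arguments above describe a JSON-lines layout; a hypothetical two-line file in that format (file name and labels made up for illustration) could be written like this:

import json

with open("toy_classification.jsonl", "w") as f:
    f.write(json.dumps({"text": "a great movie", "label": "pos"}) + "\n")
    f.write(json.dumps({"text": "a dull movie", "label": "neg"}) + "\n")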
Example #21
 def __init__(
     self,
     source_tokenizer: Tokenizer = None,
     target_tokenizer: Tokenizer = None,
     source_token_indexers: Dict[str, TokenIndexer] = None,
     target_token_indexers: Dict[str, TokenIndexer] = None,
     source_add_start_token: bool = True,
     source_add_end_token: bool = True,
     delimiter: str = "\t",
     source_max_tokens: Optional[int] = None,
     target_max_tokens: Optional[int] = None,
     **kwargs,
 ) -> None:
     super().__init__(**kwargs)
     self._source_tokenizer = source_tokenizer or SpacyTokenizer()
     self._target_tokenizer = target_tokenizer or self._source_tokenizer
     self._source_token_indexers = source_token_indexers or {
         "tokens": SingleIdTokenIndexer()
     }
     self._target_token_indexers = target_token_indexers or self._source_token_indexers
     self._source_add_start_token = source_add_start_token
     self._source_add_end_token = source_add_end_token
     self._delimiter = delimiter
     self._source_max_tokens = source_max_tokens
     self._target_max_tokens = target_max_tokens
     self._source_max_exceeded = 0
     self._target_max_exceeded = 0
Example #22
 def test_char_span_to_token_span_handles_hard_cases(self):
     # An earlier version of the code had a hard time when the answer was the last token in the
     # passage.  This tests that case, on the instance that used to fail.
     tokenizer = SpacyTokenizer()
     passage = (
         "Beyonc\u00e9 is believed to have first started a relationship with Jay Z "
         +
         'after a collaboration on "\'03 Bonnie & Clyde", which appeared on his seventh '
         +
         "album The Blueprint 2: The Gift & The Curse (2002). Beyonc\u00e9 appeared as Jay "
         +
         "Z's girlfriend in the music video for the song, which would further fuel "
         +
         "speculation of their relationship. On April 4, 2008, Beyonc\u00e9 and Jay Z were "
         +
         "married without publicity. As of April 2014, the couple have sold a combined 300 "
         +
         "million records together. The couple are known for their private relationship, "
         +
         "although they have appeared to become more relaxed in recent years. Beyonc\u00e9 "
         +
         'suffered a miscarriage in 2010 or 2011, describing it as "the saddest thing" '
         +
         "she had ever endured. She returned to the studio and wrote music in order to cope "
         +
         "with the loss. In April 2011, Beyonc\u00e9 and Jay Z traveled to Paris in order "
         +
         "to shoot the album cover for her 4, and unexpectedly became pregnant in Paris."
     )
     start = 912
     end = 912 + len("Paris.")
     tokens = tokenizer.tokenize(passage)
     offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
     token_span = util.char_span_to_token_span(offsets, (start, end))[0]
     assert token_span == (184, 185)
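
The offsets list in the test above works because SpacyTokenizer keeps each token's character offset; a small self-contained check (assuming the default spaCy English model is installed):

from allennlp.data.tokenizers import SpacyTokenizer

tokenizer = SpacyTokenizer()
passage = "Beyonce and Jay Z traveled to Paris."
tokens = tokenizer.tokenize(passage)
offsets = [(t.idx, t.idx + len(t.text)) for t in tokens]
# Every (start, end) pair slices the token's text back out of the original passage.
assert all(passage[start:end] == t.text for (start, end), t in zip(offsets, tokens))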
Example #23
def read_dataset(file_path):
    with open(file_path) as dataset_file:
        tokenizer = SpacyTokenizer()
        dataset_json = json.load(dataset_file)
        dialogs = []
        for dialog in dataset_json:
            dialog_idx = dialog["dialogue_idx"]
            dialog = dialog['dialogue']
            dialog_context = None
            for turn_i, turn in enumerate(dialog):
                sys_utt = turn['system_transcript']
                user_utt = turn['transcript']
                tokenized_sys_utt = tokenizer.tokenize(sys_utt)
                if turn_i != 0:
                    tokenized_sys_utt = [Token(text="<S>", lemma_="<S>")
                                         ] + tokenized_sys_utt
                tokenized_user_utt = tokenizer.tokenize(user_utt)
                if turn_i != len(dialog) - 1:
                    tokenized_user_utt = tokenized_user_utt + [
                        Token(text="</S>", lemma_="</S>")
                    ]
                if dialog_context is None:
                    dialog_context = tokenized_sys_utt + tokenized_user_utt
                else:
                    dialog_context += tokenized_sys_utt + tokenized_user_utt
            dialog_context = [t.text for t in dialog_context]
            dialogs.append((dialog_idx, [dialog_context]))
    return dialogs
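
The <S> and </S> markers above rely on the fact that allennlp Tokens can be constructed directly; a minimal sketch:

from allennlp.data.tokenizers import SpacyTokenizer, Token

tokenizer = SpacyTokenizer()
# Splice a hand-built marker Token in front of a tokenized utterance.
turn = [Token(text="<S>", lemma_="<S>")] + tokenizer.tokenize("book a table for two")
print([t.text for t in turn])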
Example #24
    def __init__(
        self,
        source_tokenizer: Tokenizer = None,
        target_tokenizer: Tokenizer = None,
        source_token_indexers: Dict[str, TokenIndexer] = None,
        target_token_indexers: Dict[str, TokenIndexer] = None,
        source_add_start_token: bool = True,
        source_add_end_token: bool = True,
        target_add_start_token: bool = True,
        target_add_end_token: bool = True,
        src_start_symbol: str = START_SYMBOL,
        src_end_symbol: str = END_SYMBOL,
        tgt_start_symbol: str = START_SYMBOL,
        tgt_end_symbol: str = END_SYMBOL,
        delimiter: str = "\t",
        source_max_tokens: Optional[int] = None,
        target_max_tokens: Optional[int] = None,
        quoting: int = csv.QUOTE_MINIMAL,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._source_tokenizer = source_tokenizer or SpacyTokenizer()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._target_token_indexers = target_token_indexers or self._source_token_indexers

        self._source_add_start_token = source_add_start_token
        self._source_add_end_token = source_add_end_token
        self._target_add_start_token = target_add_start_token
        self._target_add_end_token = target_add_end_token
        self._src_start_token: Optional[Token] = None
        self._src_end_token: Optional[Token] = None
        self._tgt_start_token: Optional[Token] = None
        self._tgt_end_token: Optional[Token] = None
        if (source_add_start_token or source_add_end_token
                or target_add_start_token or target_add_end_token):
            try:
                self._src_start_token, self._src_end_token = self._source_tokenizer.tokenize(
                    src_start_symbol + " " + src_end_symbol)
            except ValueError:
                raise ValueError(
                    f"Bad start or end symbol ({'start_symbol', 'end_symbol'}) "
                    f"for tokenizer {self._source_tokenizer}")
            try:
                self._tgt_start_token, self._tgt_end_token = self._target_tokenizer.tokenize(
                    tgt_start_symbol + " " + tgt_end_symbol)
            except ValueError:
                raise ValueError(
                    f"Bad start or end symbol ({'start_symbol', 'end_symbol'}) "
                    f"for tokenizer {self._target_tokenizer}")

        self._delimiter = delimiter
        self._source_max_tokens = source_max_tokens
        self._target_max_tokens = target_max_tokens
        self._source_max_exceeded = 0
        self._target_max_exceeded = 0
        self.quoting = quoting
Example #25
    def __init__(
        self,
        source_tokenizer: Tokenizer = None,
        target_tokenizer: Tokenizer = None,
        source_token_indexers: Dict[str, TokenIndexer] = None,
        target_token_indexers: Dict[str, TokenIndexer] = None,
        source_add_start_token: bool = True,
        source_add_end_token: bool = True,
        target_add_start_token: bool = True,
        target_add_end_token: bool = True,
        start_symbol: str = START_SYMBOL,
        end_symbol: str = END_SYMBOL,
        delimiter: str = "\t",
        source_max_tokens: Optional[int] = None,
        target_max_tokens: Optional[int] = None,
        quoting: int = csv.QUOTE_MINIMAL,
        **kwargs,
    ) -> None:
        super().__init__(manual_distributed_sharding=True,
                         manual_multiprocess_sharding=True,
                         **kwargs)
        self._source_tokenizer = source_tokenizer or SpacyTokenizer()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._target_token_indexers = target_token_indexers or self._source_token_indexers

        self._source_add_start_token = source_add_start_token
        self._source_add_end_token = source_add_end_token
        self._target_add_start_token = target_add_start_token
        self._target_add_end_token = target_add_end_token
        self._start_token: Optional[Token] = None
        self._end_token: Optional[Token] = None
        if (source_add_start_token or source_add_end_token
                or target_add_start_token or target_add_end_token):
            # Check that the tokenizer correctly appends the start and end tokens to
            # the sequence without splitting them.
            tokens = self._source_tokenizer.tokenize(start_symbol + " " +
                                                     end_symbol)
            err_msg = (
                f"Bad start or end symbol ('{start_symbol}', '{end_symbol}') "
                f"for tokenizer {self._source_tokenizer}")
            try:
                start_token, end_token = tokens[0], tokens[-1]
            except IndexError:
                raise ValueError(err_msg)
            if start_token.text != start_symbol or end_token.text != end_symbol:
                raise ValueError(err_msg)

            self._start_token = start_token
            self._end_token = end_token

        self._delimiter = delimiter
        self._source_max_tokens = source_max_tokens
        self._target_max_tokens = target_max_tokens
        self._source_max_exceeded = 0
        self._target_max_exceeded = 0
        self.quoting = quoting
Example #26
    def test_squad_with_unwordpieceable_passage(self):

        tokenizer = SpacyTokenizer()

        token_indexer = PretrainedBertIndexer("bert-base-uncased")

        passage1 = (
            "There were four major HDTV systems tested by SMPTE in the late 1970s, "
            "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:"
        )
        question1 = "Who released A Study of High Definition Television Systems?"

        passage2 = (
            "Broca, being what today would be called a neurosurgeon, "
            "had taken an interest in the pathology of speech. He wanted "
            "to localize the difference between man and the other animals, "
            "which appeared to reside in speech. He discovered the speech "
            "center of the human brain, today called Broca's area after him. "
            "His interest was mainly in Biological anthropology, but a German "
            "philosopher specializing in psychology, Theodor Waitz, took up the "
            "theme of general and social anthropology in his six-volume work, "
            "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
            """soon translated as "The Anthropology of Primitive Peoples". """
            "The last two volumes were published posthumously.")
        question2 = "What did Broca discover in the human brain?"

        from allennlp.data.dataset_readers.reading_comprehension.util import (
            make_reading_comprehension_instance, )

        instance1 = make_reading_comprehension_instance(
            tokenizer.tokenize(question1),
            tokenizer.tokenize(passage1),
            {"bert": token_indexer},
            passage1,
        )

        instance2 = make_reading_comprehension_instance(
            tokenizer.tokenize(question2),
            tokenizer.tokenize(passage2),
            {"bert": token_indexer},
            passage2,
        )

        vocab = Vocabulary()

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        qtokens = tensor_dict["question"]
        ptokens = tensor_dict["passage"]

        config = BertConfig(len(token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
        _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
Example #27
def search(
    tables_directory: str,
    data: JsonDict,
    output_path: str,
    max_path_length: int,
    max_num_logical_forms: int,
    use_agenda: bool,
    output_separate_files: bool,
    conservative_agenda: bool,
) -> None:
    print(f"Starting search with {len(data)} instances", file=sys.stderr)
    language_logger = logging.getLogger("allennlp.semparse.domain_languages.wikitables_language")
    language_logger.setLevel(logging.ERROR)
    tokenizer = SpacyTokenizer()
    if output_separate_files and not os.path.exists(output_path):
        os.makedirs(output_path)
    if not output_separate_files:
        output_file_pointer = open(output_path, "w")
    for instance_data in data:
        utterance = instance_data["question"]
        question_id = instance_data["id"]
        if utterance.startswith('"') and utterance.endswith('"'):
            utterance = utterance[1:-1]
        # For example: csv/200-csv/47.csv -> tagged/200-tagged/47.tagged
        table_file = instance_data["table_filename"].replace("csv", "tagged")
        target_list = instance_data["target_values"]
        tokenized_question = tokenizer.tokenize(utterance)
        table_file = f"{tables_directory}/{table_file}"
        context = TableQuestionContext.read_from_file(table_file, tokenized_question)
        world = WikiTablesLanguage(context)
        walker = ActionSpaceWalker(world, max_path_length=max_path_length)
        correct_logical_forms = []
        if use_agenda:
            agenda = world.get_agenda(conservative=conservative_agenda)
            allow_partial_match = not conservative_agenda
            all_logical_forms = walker.get_logical_forms_with_agenda(
                agenda=agenda, max_num_logical_forms=10000, allow_partial_match=allow_partial_match
            )
        else:
            all_logical_forms = walker.get_all_logical_forms(max_num_logical_forms=10000)
        for logical_form in all_logical_forms:
            if world.evaluate_logical_form(logical_form, target_list):
                correct_logical_forms.append(logical_form)
        if output_separate_files and correct_logical_forms:
            with gzip.open(f"{output_path}/{question_id}.gz", "wt") as output_file_pointer:
                for logical_form in correct_logical_forms:
                    print(logical_form, file=output_file_pointer)
        elif not output_separate_files:
            print(f"{question_id} {utterance}", file=output_file_pointer)
            if use_agenda:
                print(f"Agenda: {agenda}", file=output_file_pointer)
            if not correct_logical_forms:
                print("NO LOGICAL FORMS FOUND!", file=output_file_pointer)
            for logical_form in correct_logical_forms[:max_num_logical_forms]:
                print(logical_form, file=output_file_pointer)
            print(file=output_file_pointer)
    if not output_separate_files:
        output_file_pointer.close()
Example #28
def get_spacy_tokenizer(special_cases_file='special-cases-spacy.yaml', verbose=False):
    import yaml
    from allennlp.data.tokenizers import SpacyTokenizer

    # Load tokenizer special cases (e.g. abbreviations) from the YAML file
    # passed in, rather than a hard-coded path.
    with open(special_cases_file) as fh:
        special_cases = yaml.safe_load(fh)

    tokenizer_obj = SpacyTokenizer(language='en_core_web_sm', pos_tags=True)
    for case in special_cases:
        if verbose:
            print(*case)
        # Register each special case directly on the underlying spaCy tokenizer.
        tokenizer_obj.spacy.tokenizer.add_special_case(*case)

    return tokenizer_obj.tokenize


# deprecated; see annotation_json2septext
# earlier: annotated_text
# def annotate_tokens(txt, anns,
#                   start = 'start_offset',
#                   end = 'end_offset',
#                   label='label',
#                   sep='#%#',
#                   punctuation = ''',.:;()#''"'''):
#     anns = sorted(anns, key = lambda x: x[start])
#     prev_end = 0
#     html_ = ''
#     for ann in anns:
#         prev_end
#         label_ = ann[label]
#         if (ann[start]-prev_end)>0:
#             piece = txt[prev_end:ann[start]].strip()
#             html_+= ' '  + label_words(piece, 'separator', sep).strip()
#         piece = txt[ann[start]:ann[end]]
#         piece = normalize_punctuation(piece, separators=punctuation)
#         html_+= ' '+ label_words(piece, label_, sep)
#         prev_end = ann[end]
#     return html_.strip()

# The enclosing signature of this fragment is missing from the original source;
# `has_overlapping_annotations` is a hypothetical name added here so the body is
# runnable. (The original also carried commented-out alternative handling for
# 'punctuation' labels, omitted here.)
def has_overlapping_annotations(seq_anns):
    seq_anns = sorted(seq_anns, key=lambda x: x['start'])
    prev_ann = dict(start=-1, end=-1, label='')
    for ann in seq_anns:
        # Two annotations overlap if one starts before the previous one ends.
        if prev_ann['end'] > ann['start']:
            return True
        prev_ann = ann
    return False
Example #29
 def __init__(
     self,
     lazy: bool = False,
     token_indexers: Dict[str, TokenIndexer] = None,
     tokenizer: Tokenizer = None,
 ) -> None:
     super().__init__(lazy)
     self._token_indexers = token_indexers or {}
     self.tokenizer = tokenizer or SpacyTokenizer()
Example #30
 def test_passes_through_correctly(self):
     tokenizer = SpacyTokenizer()
     sentence = "this (sentence) has 'crazy' \"punctuation\"."
     tokens = [t.text for t in tokenizer.tokenize(sentence)]
     expected_tokens = [
         "this", "(", "sentence", ")", "has", "'", "crazy", "'", "\"",
         "punctuation", "\"", "."
     ]
     self.assertSequenceEqual(tokens, expected_tokens)
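
A self-contained sketch tying these examples together with core AllenNLP APIs only: tokenize, wrap the tokens in a TextField, build a Vocabulary, and index the field (assumes the default spaCy English model is installed):

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import SpacyTokenizer

tokenizer = SpacyTokenizer()
tokens = tokenizer.tokenize("this (sentence) has 'crazy' \"punctuation\".")
field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
instance = Instance({"tokens": field})
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)  # after this, the field holds token ids rather than just text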