Example #1
0
 def from_params(cls, params: Params) -> 'CrfSrlReader':
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     max_span_width = params.pop("max_span_width")
     params.assert_empty(cls.__name__)
     return CrfSrlReader(token_indexers=token_indexers,
                         max_span_width=max_span_width)
Example #2
0
 def from_params(cls, params: Params) -> "ConllCorefReader":
     token_indexers = TokenIndexer.dict_from_params(
         params.pop("token_indexers", {}))
     max_span_width = params.pop_int("max_span_width")
     params.assert_empty(cls.__name__)
     return cls(token_indexers=token_indexers,
                max_span_width=max_span_width)
Example #3
0
    def from_params(cls, params: Params) -> 'ProParaDatasetReader':
        token_indexers = TokenIndexer.dict_from_params(
            params.pop("token_indexers", {}))
        multiple_annotations = params.pop_bool("multiple_annotations", False)

        return ProParaDatasetReader(token_indexers=token_indexers,
                                    multiple_annotations=multiple_annotations)
Example #4
0
    def from_params(cls, params: Params) -> 'GardDatasetReader':
        tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
        token_indexers = TokenIndexer.dict_from_params(
            params.pop('token_indexers', {}))
        params.assert_empty(cls.__name__)

        return cls(tokenizer=tokenizer, token_indexers=token_indexers)
Example #5
0
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 entity_indexer: TokenIndexer = TokenIndexer.from_params(
                     Params(INDEXER_DEFAULT)),
                 granularity: str = "sentence",
                 mention_generator: MentionGenerator = None,
                 should_remap_span_indices: bool = True,
                 entity_disambiguation_only: bool = False,
                 extra_candidate_generators: Dict[str,
                                                  MentionGenerator] = None):

        lazy = False
        super().__init__(lazy)
        self.token_indexers = token_indexers or {
            "token": SingleIdTokenIndexer("token")
        }
        self.entity_indexer = {"ids": entity_indexer}
        self.separator = {"*NL*"}
        if granularity == "sentence":
            self.separator.add(".")

        if granularity not in {"sentence", "paragraph"}:
            raise ConfigurationError(
                "Valid arguments for granularity are 'sentence' or 'paragraph'."
            )

        self.entity_disambiguation_only = entity_disambiguation_only
        self.mention_generator = mention_generator or WikiCandidateMentionGenerator()
        self.should_remap_span_indices = should_remap_span_indices

        self.extra_candidate_generators = extra_candidate_generators
Example #6
0
 def from_params(cls, params: Params) -> 'SimpleSrlReader':
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     word_tag_delimiter = params.pop("word_tag_delimiter",
                                     _DEFAULT_WORD_TAG_DELIMITER)
     params.assert_empty(cls.__name__)
     return SimpleSrlReader(token_indexers=token_indexers,
                            word_tag_delimiter=word_tag_delimiter)
Example #7
0
 def from_params(cls, params: Params) -> 'PennTreeBankConstituencySpanDatasetReader':
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     use_pos_tags = params.pop('use_pos_tags', True)
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return PennTreeBankConstituencySpanDatasetReader(token_indexers=token_indexers,
                                                      use_pos_tags=use_pos_tags,
                                                      lazy=lazy)
Example #8
0
 def from_params(cls, params: Params) -> 'Conll2003DatasetReader':
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     tag_label = params.pop('tag_label', None)
     feature_labels = params.pop('feature_labels', ())
     params.assert_empty(cls.__name__)
     return Conll2003DatasetReader(token_indexers=token_indexers,
                                   tag_label=tag_label,
                                   feature_labels=feature_labels)
Example #9
0
 def from_params(cls, params: Params) -> 'SequenceTaggingDatasetReader':
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     word_tag_delimiter = params.pop("word_tag_delimiter", DEFAULT_WORD_TAG_DELIMITER)
     token_delimiter = params.pop("token_delimiter", None)
     params.assert_empty(cls.__name__)
     return SequenceTaggingDatasetReader(token_indexers=token_indexers,
                                         word_tag_delimiter=word_tag_delimiter,
                                         token_delimiter=token_delimiter)
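For context, a reader's from_params like the one above is normally driven by a Params object built from a JSON config and dispatched through the registry. A minimal usage sketch, assuming the standard "sequence_tagging" registration name for SequenceTaggingDatasetReader; the config values are illustrative:

    from allennlp.common import Params
    from allennlp.data.dataset_readers import DatasetReader

    # Registrable dispatch pops "type", looks up the registered reader class,
    # and delegates to that class's from_params with the remaining keys.
    reader = DatasetReader.from_params(Params({
        "type": "sequence_tagging",
        "token_indexers": {"tokens": {"type": "single_id"}},
        "word_tag_delimiter": "###",
    }))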
Example #10
0
 def from_params(cls,
                 params: Params) -> 'UniversalDependenciesDatasetReader':
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return UniversalDependenciesDatasetReader(
         token_indexers=token_indexers, lazy=lazy)
Example #11
0
 def from_params(cls, params: Params) -> 'SnliReader':
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return SnliReader(tokenizer=tokenizer,
                       token_indexers=token_indexers,
                       lazy=lazy)
Example #12
0
 def from_params(cls, params: Params) -> 'PennTreeBankConstituencySpanDatasetReader':
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     use_pos_tags = params.pop('use_pos_tags', True)
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return PennTreeBankConstituencySpanDatasetReader(token_indexers=token_indexers,
                                                      use_pos_tags=use_pos_tags,
                                                      lazy=lazy)
Example #13
0
 def from_params(cls, params: Params) -> 'SrlReader':
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     domain_identifier = params.pop("domain_identifier", None)
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return SrlReader(token_indexers=token_indexers,
                      domain_identifier=domain_identifier,
                      lazy=lazy)
Example #14
0
 def from_params(cls, params: Params) -> 'WikiTablesDatasetReader':
     lazy = params.pop('lazy', False)
     tables_directory = params.pop('tables_directory', None)
     dpd_output_directory = params.pop('dpd_output_directory', None)
     max_dpd_logical_forms = params.pop_int('max_dpd_logical_forms', 10)
     sort_dpd_logical_forms = params.pop_bool('sort_dpd_logical_forms',
                                              True)
     max_dpd_tries = params.pop_int('max_dpd_tries', 20)
     keep_if_no_dpd = params.pop_bool('keep_if_no_dpd', False)
     default_tokenizer_params = {
         'word_splitter': {
             'type': 'spacy',
             'pos_tags': True
         }
     }
     tokenizer = Tokenizer.from_params(
         params.pop('tokenizer', default_tokenizer_params))
     question_token_indexers = TokenIndexer.dict_from_params(
         params.pop('question_token_indexers', {}))
     table_token_indexers = TokenIndexer.dict_from_params(
         params.pop('table_token_indexers', {}))
     use_table_for_vocab = params.pop_bool('use_table_for_vocab', False)
     linking_feature_extractors = params.pop('linking_feature_extractors',
                                             None)
     include_table_metadata = params.pop_bool('include_table_metadata',
                                              False)
     max_table_tokens = params.pop_int('max_table_tokens', None)
     output_agendas = params.pop_bool('output_agendas', False)
     params.assert_empty(cls.__name__)
     return WikiTablesDatasetReader(
         lazy=lazy,
         tables_directory=tables_directory,
         dpd_output_directory=dpd_output_directory,
         max_dpd_logical_forms=max_dpd_logical_forms,
         sort_dpd_logical_forms=sort_dpd_logical_forms,
         max_dpd_tries=max_dpd_tries,
         keep_if_no_dpd=keep_if_no_dpd,
         tokenizer=tokenizer,
         question_token_indexers=question_token_indexers,
         table_token_indexers=table_token_indexers,
         use_table_for_vocab=use_table_for_vocab,
         linking_feature_extractors=linking_feature_extractors,
         include_table_metadata=include_table_metadata,
         max_table_tokens=max_table_tokens,
         output_agendas=output_agendas)
Example #15
0
 def from_params(cls, params: Params) -> 'SpookyAuthorsDatasetReader':
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     cnn_paper_dataset = params.pop("cnn_paper_dataset", False)
     params.assert_empty(cls.__name__)
     return cls(tokenizer=tokenizer,
                token_indexers=token_indexers,
                cnn_paper_dataset=cnn_paper_dataset)
Example #16
0
 def from_params(cls, params: Params) -> 'NlvrDatasetReader':
     lazy = params.pop('lazy', False)
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     sentence_token_indexers = TokenIndexer.dict_from_params(
         params.pop('sentence_token_indexers', {}))
     terminal_indexers = TokenIndexer.dict_from_params(
         params.pop('terminal_indexers', {}))
     nonterminal_indexers = TokenIndexer.dict_from_params(
         params.pop('nonterminal_indexers', {}))
     output_agendas = params.pop("output_agendas", True)
     params.assert_empty(cls.__name__)
     return NlvrDatasetReader(
         lazy=lazy,
         tokenizer=tokenizer,
         sentence_token_indexers=sentence_token_indexers,
         terminal_indexers=terminal_indexers,
         nonterminal_indexers=nonterminal_indexers,
         output_agendas=output_agendas)
Example #17
0
 def from_params(
         cls, params: Params) -> 'SrlwithConstituencySpanOntonotesReader':
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     domain_identifier = params.pop("domain_identifier", None)
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return SrlwithConstituencySpanOntonotesReader(
         token_indexers=token_indexers,
         domain_identifier=domain_identifier,
         lazy=lazy)
Example #18
0
 def from_params(cls, params: Params) -> 'SwagReader':
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     use_only_gold_examples = params.pop('use_only_gold_examples', False)
     params.assert_empty(cls.__name__)
     return cls(tokenizer=tokenizer,
                token_indexers=token_indexers,
                use_only_gold_examples=use_only_gold_examples)
Example #19
0
 def from_params(cls, params: Params) -> 'SemanticScholarDatasetReader':
     lazy = params.pop('lazy', False)
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     params.assert_empty(cls.__name__)
     return cls(lazy=lazy,
                tokenizer=tokenizer,
                token_indexers=token_indexers)
Example #20
0
 def from_params(cls, params: Params) -> 'Seq2SeqDatasetReader':
     source_tokenizer_type = params.pop('source_tokenizer', None)
     source_tokenizer = None if source_tokenizer_type is None else Tokenizer.from_params(source_tokenizer_type)
     target_tokenizer_type = params.pop('target_tokenizer', None)
     target_tokenizer = None if target_tokenizer_type is None else Tokenizer.from_params(target_tokenizer_type)
     source_indexers_type = params.pop('source_token_indexers', None)
     if source_indexers_type is None:
         source_token_indexers = None
     else:
         source_token_indexers = TokenIndexer.dict_from_params(source_indexers_type)
     target_indexers_type = params.pop('target_token_indexers', None)
     if target_indexers_type is None:
         target_token_indexers = None
     else:
         target_token_indexers = TokenIndexer.dict_from_params(target_indexers_type)
     params.assert_empty(cls.__name__)
     return Seq2SeqDatasetReader(source_tokenizer, target_tokenizer,
                                 source_token_indexers, target_token_indexers)
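The reader above treats every tokenizer and indexer sub-config as optional: a key missing from params comes back as None, and the constructor falls back to its own defaults. A minimal sketch of a config exercising that path, assuming the "word" tokenizer registration name; the values are illustrative:

    # Only the source tokenizer is configured; target_tokenizer and both
    # indexer dicts stay None, so the constructor's defaults apply.
    seq2seq_reader = Seq2SeqDatasetReader.from_params(Params({
        "source_tokenizer": {"type": "word"},
    }))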
Example #21
0
 def from_params(cls, params: Params) -> "BioMedReader":
     token_indexers = TokenIndexer.dict_from_params(
         params.pop("token_indexers", {}))
     max_span_width = params.pop_int("max_span_width")
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return cls(token_indexers=token_indexers,
                max_span_width=max_span_width,
                lazy=lazy)
Example #22
0
 def from_params(cls, params: Params) -> 'PnetOntoDatasetReader':
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     tag_label = params.pop('tag_label', None)
     feature_labels = params.pop('feature_labels', ())
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return PnetOntoDatasetReader(token_indexers=token_indexers,
                                  tag_label=tag_label,
                                  feature_labels=feature_labels,
                                  lazy=lazy)
Example #23
0
 def from_params(cls, params: Params) -> 'SequenceTaggingDatasetReader':
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     word_tag_delimiter = params.pop("word_tag_delimiter", DEFAULT_WORD_TAG_DELIMITER)
     token_delimiter = params.pop("token_delimiter", None)
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return SequenceTaggingDatasetReader(token_indexers=token_indexers,
                                         word_tag_delimiter=word_tag_delimiter,
                                         token_delimiter=token_delimiter,
                                         lazy=lazy)
Example #24
0
 def from_params(cls, params: Params) -> 'Conll2003DatasetReader':
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     tag_label = params.pop('tag_label', None)
     feature_labels = params.pop('feature_labels', ())
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return Conll2003DatasetReader(token_indexers=token_indexers,
                                   tag_label=tag_label,
                                   feature_labels=feature_labels,
                                   lazy=lazy)
Example #25
0
 def from_params(cls, params: Params) -> 'JsonlClassificationReader':
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     input = params.pop('input', None)
     gold_label = params.pop('gold_label', None)
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     params.assert_empty(cls.__name__)
     return JsonlClassificationReader(tokenizer=tokenizer,
                                      token_indexers=token_indexers,
                                      input=input,
                                      gold_label=gold_label)
Example #26
0
 def from_params(cls, params):
     dataset_type = params.pop("type")  # unused, but popping it keeps assert_empty below from failing on the "type" key
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return cls(tokenizer=tokenizer,
                token_indexers=token_indexers,
                lazy=lazy)
Example #27
0
 def from_params(cls, params: Params) -> 'TriviaQaReader':
     base_tarball_path = params.pop('base_tarball_path')
     unfiltered_tarball_path = params.pop('unfiltered_tarball_path', None)
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     params.assert_empty(cls.__name__)
     return cls(base_tarball_path=base_tarball_path,
                unfiltered_tarball_path=unfiltered_tarball_path,
                tokenizer=tokenizer,
                token_indexers=token_indexers)
Example #28
0
 def from_params(cls, params: Params) -> 'DialogueContextDatasetReader':
     lazy = params.pop('lazy', False)
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     shuffle_examples = params.pop('shuffle_examples', False)
     params.assert_empty(cls.__name__)
     return cls(lazy=lazy,
                shuffle_examples=shuffle_examples,
                tokenizer=tokenizer,
                token_indexers=token_indexers)
Example #29
0
 def from_params(cls, params: Params) -> 'BiaoWenMingXiDatasetReader':
     lazy = params.pop('lazy', False)
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     sheet_name = params.pop('sheet_name', 'Sheet1')
     params.assert_empty(cls.__name__)
     return cls(lazy=lazy,
                tokenizer=tokenizer,
                token_indexers=token_indexers,
                sheet_name=sheet_name)
Example #30
0
    def from_params(cls, params: Params) -> 'FEVERSentenceReader':
        claim_tokenizer = Tokenizer.from_params(params.pop('claim_tokenizer', {}))
        wiki_tokenizer = Tokenizer.from_params(params.pop('wiki_tokenizer', {}))

        token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
        db = FeverDocDB(params.pop("db_path", "data/fever/fever.db"))
        params.assert_empty(cls.__name__)
        return FEVERSentenceReader(db=db,
                                   claim_tokenizer=claim_tokenizer,
                                   wiki_tokenizer=wiki_tokenizer,
                                   token_indexers=token_indexers)
Example #31
0
 def from_params(cls, params: Params) -> 'EntailmentTupleReader':
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     max_tuples = params.pop('max_tuples', 30)
     max_tokens = params.pop('max_tokens', 200)
     params.assert_empty(cls.__name__)
     return EntailmentTupleReader(max_tokens=max_tokens,
                                  max_tuples=max_tuples,
                                  tokenizer=tokenizer,
                                  token_indexers=token_indexers)
Example #32
0
 def from_params(cls, params):
     token_indexers_params = params.pop('token_indexers', Params({}))
     token_indexers = TokenIndexer.dict_from_params(token_indexers_params)
     sentence_field_name = params.pop('sentence_field_name', 'sentence')
     tags_field_name = params.pop('tags_field_name', 'tags')
     tag_namespace = params.pop('tag_namespace', 'tags')
     params.assert_empty(cls.__name__)
     return cls(token_indexers=token_indexers,
                sentence_field_name=sentence_field_name,
                tags_field_name=tags_field_name,
                tag_namespace=tag_namespace)
Example #33
0
 def from_params(cls, params: Params) -> 'ToxicReader':
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(
         params.pop('token_indexers', {}))
     max_length = params.pop('max_length', None)
     fill_in_empty_labels = params.pop_bool('fill_in_empty_labels', False)
     params.assert_empty(cls.__name__)
     return cls(max_length=max_length,
                fill_in_empty_labels=fill_in_empty_labels,
                tokenizer=tokenizer,
                token_indexers=token_indexers)
Example #34
0
 def from_params(cls, params: Params) -> 'StanfordSentimentTreeBankDatasetReader':
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     use_subtrees = params.pop('use_subtrees', False)
     granularity = params.pop_choice('granularity', ["5-class", "3-class", "2-class"], True)
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return StanfordSentimentTreeBankDatasetReader(
             token_indexers=token_indexers,
             use_subtrees=use_subtrees,
             granularity=granularity,
             lazy=lazy)
Example #35
0
 def from_params(cls, params: Params) -> 'BabiDatasetReader':
     """
     Constructs the dataset reader described by ``params``.
     """
     token_indexers_type = params.pop('token_indexers', None)
     if token_indexers_type is None:
         token_indexers = None
     else:
         token_indexers = TokenIndexer.dict_from_params(token_indexers_type)
     params.assert_empty(cls.__name__)
     return BabiDatasetReader(token_indexers)
Example #36
0
 def from_params(cls, params: Params) -> 'Seq2SeqDatasetReader':
     source_tokenizer_type = params.pop('source_tokenizer', None)
     source_tokenizer = None if source_tokenizer_type is None else Tokenizer.from_params(source_tokenizer_type)
     target_tokenizer_type = params.pop('target_tokenizer', None)
     target_tokenizer = None if target_tokenizer_type is None else Tokenizer.from_params(target_tokenizer_type)
     source_indexers_type = params.pop('source_token_indexers', None)
     source_add_start_token = params.pop_bool('source_add_start_token', True)
     if source_indexers_type is None:
         source_token_indexers = None
     else:
         source_token_indexers = TokenIndexer.dict_from_params(source_indexers_type)
     target_indexers_type = params.pop('target_token_indexers', None)
     if target_indexers_type is None:
         target_token_indexers = None
     else:
         target_token_indexers = TokenIndexer.dict_from_params(target_indexers_type)
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return Seq2SeqDatasetReader(source_tokenizer=source_tokenizer,
                                 target_tokenizer=target_tokenizer,
                                 source_token_indexers=source_token_indexers,
                                 target_token_indexers=target_token_indexers,
                                 source_add_start_token=source_add_start_token,
                                 lazy=lazy)
Example #37
0
 def from_params(cls, params: Params) -> "WinobiasReader":
     token_indexers = TokenIndexer.dict_from_params(params.pop("token_indexers", {}))
     max_span_width = params.pop_int("max_span_width")
     lazy = params.pop('lazy', False)
     params.assert_empty(cls.__name__)
     return cls(token_indexers=token_indexers, max_span_width=max_span_width, lazy=lazy)
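Taken together, these examples share one skeleton: pop each constructor argument off params (nested objects through their own from_params or dict_from_params, plain values through the typed pop/pop_int/pop_bool helpers), call assert_empty so unrecognized keys fail loudly, and forward everything to the constructor. A generic sketch of that skeleton for a hypothetical MyDatasetReader; the class and its arguments are illustrative, not taken from the examples above:

 def from_params(cls, params: Params) -> 'MyDatasetReader':
     # Nested objects are built from their own sub-dictionaries of params.
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
     # Plain values come off with the typed helpers.
     lazy = params.pop_bool('lazy', False)
     # Any key left over at this point is a config mistake; fail loudly.
     params.assert_empty(cls.__name__)
     return cls(tokenizer=tokenizer, token_indexers=token_indexers, lazy=lazy)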