def from_params(cls, params: Params) -> 'CrfSrlReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    max_span_width = params.pop("max_span_width")
    params.assert_empty(cls.__name__)
    return CrfSrlReader(token_indexers=token_indexers,
                        max_span_width=max_span_width)
def from_params(cls, params: Params) -> "ConllCorefReader": token_indexers = TokenIndexer.dict_from_params( params.pop("token_indexers", {})) max_span_width = params.pop_int("max_span_width") params.assert_empty(cls.__name__) return cls(token_indexers=token_indexers, max_span_width=max_span_width)
def from_params(cls, params: Params) -> 'ProParaDatasetReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop("token_indexers", {}))
    multiple_annotations = params.pop_bool("multiple_annotations", False)
    return ProParaDatasetReader(token_indexers=token_indexers,
                                multiple_annotations=multiple_annotations)
def from_params(cls, params: Params) -> 'GardDatasetReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer, token_indexers=token_indexers)
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             entity_indexer: TokenIndexer = TokenIndexer.from_params(Params(INDEXER_DEFAULT)),
             granularity: str = "sentence",
             mention_generator: MentionGenerator = None,
             should_remap_span_indices: bool = True,
             entity_disambiguation_only: bool = False,
             extra_candidate_generators: Dict[str, MentionGenerator] = None):
    lazy = False
    super().__init__(lazy)
    self.token_indexers = token_indexers or {"token": SingleIdTokenIndexer("token")}
    self.entity_indexer = {"ids": entity_indexer}
    self.separator = {"*NL*"}
    if granularity == "sentence":
        self.separator.add(".")
    if granularity not in {"sentence", "paragraph"}:
        raise ConfigurationError(
            "Valid arguments for granularity are 'sentence' or 'paragraph'.")
    self.entity_disambiguation_only = entity_disambiguation_only
    self.mention_generator = mention_generator or WikiCandidateMentionGenerator()
    self.should_remap_span_indices = should_remap_span_indices
    self.extra_candidate_generators = extra_candidate_generators
def from_params(cls, params: Params) -> 'SimpleSrlReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    word_tag_delimiter = params.pop("word_tag_delimiter", _DEFAULT_WORD_TAG_DELIMITER)
    params.assert_empty(cls.__name__)
    return SimpleSrlReader(token_indexers=token_indexers,
                           word_tag_delimiter=word_tag_delimiter)
def from_params(cls, params: Params) -> 'PennTreeBankConstituencySpanDatasetReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    use_pos_tags = params.pop('use_pos_tags', True)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return PennTreeBankConstituencySpanDatasetReader(token_indexers=token_indexers,
                                                     use_pos_tags=use_pos_tags,
                                                     lazy=lazy)
def from_params(cls, params: Params) -> 'Conll2003DatasetReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    tag_label = params.pop('tag_label', None)
    feature_labels = params.pop('feature_labels', ())
    params.assert_empty(cls.__name__)
    return Conll2003DatasetReader(token_indexers=token_indexers,
                                  tag_label=tag_label,
                                  feature_labels=feature_labels)
def from_params(cls, params: Params) -> 'SequenceTaggingDatasetReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    word_tag_delimiter = params.pop("word_tag_delimiter", DEFAULT_WORD_TAG_DELIMITER)
    token_delimiter = params.pop("token_delimiter", None)
    params.assert_empty(cls.__name__)
    return SequenceTaggingDatasetReader(token_indexers=token_indexers,
                                        word_tag_delimiter=word_tag_delimiter,
                                        token_delimiter=token_delimiter)
def from_params(cls, params: Params) -> 'UniversalDependenciesDatasetReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return UniversalDependenciesDatasetReader(token_indexers=token_indexers, lazy=lazy)
def from_params(cls, params: Params) -> 'SnliReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return SnliReader(tokenizer=tokenizer,
                      token_indexers=token_indexers,
                      lazy=lazy)
def from_params(cls, params: Params) -> 'SrlReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    domain_identifier = params.pop("domain_identifier", None)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return SrlReader(token_indexers=token_indexers,
                     domain_identifier=domain_identifier,
                     lazy=lazy)
def from_params(cls, params: Params) -> 'WikiTablesDatasetReader':
    lazy = params.pop('lazy', False)
    tables_directory = params.pop('tables_directory', None)
    dpd_output_directory = params.pop('dpd_output_directory', None)
    max_dpd_logical_forms = params.pop_int('max_dpd_logical_forms', 10)
    sort_dpd_logical_forms = params.pop_bool('sort_dpd_logical_forms', True)
    max_dpd_tries = params.pop_int('max_dpd_tries', 20)
    keep_if_no_dpd = params.pop_bool('keep_if_no_dpd', False)
    default_tokenizer_params = {'word_splitter': {'type': 'spacy', 'pos_tags': True}}
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', default_tokenizer_params))
    question_token_indexers = TokenIndexer.dict_from_params(params.pop('question_token_indexers', {}))
    table_token_indexers = TokenIndexer.dict_from_params(params.pop('table_token_indexers', {}))
    use_table_for_vocab = params.pop_bool('use_table_for_vocab', False)
    linking_feature_extractors = params.pop('linking_feature_extractors', None)
    include_table_metadata = params.pop_bool('include_table_metadata', False)
    max_table_tokens = params.pop_int('max_table_tokens', None)
    output_agendas = params.pop_bool('output_agendas', False)
    params.assert_empty(cls.__name__)
    return WikiTablesDatasetReader(lazy=lazy,
                                   tables_directory=tables_directory,
                                   dpd_output_directory=dpd_output_directory,
                                   max_dpd_logical_forms=max_dpd_logical_forms,
                                   sort_dpd_logical_forms=sort_dpd_logical_forms,
                                   max_dpd_tries=max_dpd_tries,
                                   keep_if_no_dpd=keep_if_no_dpd,
                                   tokenizer=tokenizer,
                                   question_token_indexers=question_token_indexers,
                                   table_token_indexers=table_token_indexers,
                                   use_table_for_vocab=use_table_for_vocab,
                                   linking_feature_extractors=linking_feature_extractors,
                                   include_table_metadata=include_table_metadata,
                                   max_table_tokens=max_table_tokens,
                                   output_agendas=output_agendas)
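# --- Illustrative config (hypothetical values) ---------------------------------
# A sketch of a Params blob that the WikiTablesDatasetReader method above would
# consume; every key corresponds to a pop()/pop_int()/pop_bool() call there, and
# the tokenizer entry mirrors the default_tokenizer_params fallback. The paths
# are placeholders, not real locations.
from allennlp.common import Params

example_wikitables_params = Params({
    "tables_directory": "/path/to/tables",            # hypothetical path
    "dpd_output_directory": "/path/to/dpd_output",    # hypothetical path
    "max_dpd_logical_forms": 10,
    "sort_dpd_logical_forms": True,
    "keep_if_no_dpd": False,
    "tokenizer": {"word_splitter": {"type": "spacy", "pos_tags": True}},
    "question_token_indexers": {"tokens": {"type": "single_id"}},
})
# reader = WikiTablesDatasetReader.from_params(example_wikitables_params)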
def from_params(cls, params: Params) -> 'SpookyAuthorsDatasetReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    cnn_paper_dataset = params.pop("cnn_paper_dataset", False)
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer,
               token_indexers=token_indexers,
               cnn_paper_dataset=cnn_paper_dataset)
def from_params(cls, params: Params) -> 'NlvrDatasetReader':
    lazy = params.pop('lazy', False)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    sentence_token_indexers = TokenIndexer.dict_from_params(params.pop('sentence_token_indexers', {}))
    terminal_indexers = TokenIndexer.dict_from_params(params.pop('terminal_indexers', {}))
    nonterminal_indexers = TokenIndexer.dict_from_params(params.pop('nonterminal_indexers', {}))
    output_agendas = params.pop("output_agendas", True)
    params.assert_empty(cls.__name__)
    return NlvrDatasetReader(lazy=lazy,
                             tokenizer=tokenizer,
                             sentence_token_indexers=sentence_token_indexers,
                             terminal_indexers=terminal_indexers,
                             nonterminal_indexers=nonterminal_indexers,
                             output_agendas=output_agendas)
def from_params(cls, params: Params) -> 'SrlwithConstituencySpanOntonotesReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    # 'domain_identifier' is popped (so assert_empty passes) but is not forwarded
    # to the constructor below.
    domain_identifier = params.pop("domain_identifier", None)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return SrlwithConstituencySpanOntonotesReader(token_indexers=token_indexers,
                                                  lazy=lazy)
def from_params(cls, params: Params) -> 'SwagReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    use_only_gold_examples = params.pop('use_only_gold_examples', False)
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer,
               token_indexers=token_indexers,
               use_only_gold_examples=use_only_gold_examples)
def from_params(cls, params: Params) -> 'SemanticScholarDatasetReader':
    lazy = params.pop('lazy', False)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return cls(lazy=lazy, tokenizer=tokenizer, token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'Seq2SeqDatasetReader':
    source_tokenizer_type = params.pop('source_tokenizer', None)
    source_tokenizer = None if source_tokenizer_type is None else Tokenizer.from_params(source_tokenizer_type)
    target_tokenizer_type = params.pop('target_tokenizer', None)
    target_tokenizer = None if target_tokenizer_type is None else Tokenizer.from_params(target_tokenizer_type)
    source_indexers_type = params.pop('source_token_indexers', None)
    if source_indexers_type is None:
        source_token_indexers = None
    else:
        source_token_indexers = TokenIndexer.dict_from_params(source_indexers_type)
    target_indexers_type = params.pop('target_token_indexers', None)
    if target_indexers_type is None:
        target_token_indexers = None
    else:
        target_token_indexers = TokenIndexer.dict_from_params(target_indexers_type)
    params.assert_empty(cls.__name__)
    return Seq2SeqDatasetReader(source_tokenizer, target_tokenizer,
                                source_token_indexers, target_token_indexers)
def from_params(cls, params: Params) -> "BioMedReader": token_indexers = TokenIndexer.dict_from_params( params.pop("token_indexers", {})) max_span_width = params.pop_int("max_span_width") lazy = params.pop('lazy', False) params.assert_empty(cls.__name__) return cls(token_indexers=token_indexers, max_span_width=max_span_width, lazy=lazy)
def from_params(cls, params: Params) -> 'PnetOntoDatasetReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    tag_label = params.pop('tag_label', None)
    feature_labels = params.pop('feature_labels', ())
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return PnetOntoDatasetReader(token_indexers=token_indexers,
                                 tag_label=tag_label,
                                 feature_labels=feature_labels,
                                 lazy=lazy)
def from_params(cls, params: Params) -> 'SequenceTaggingDatasetReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    word_tag_delimiter = params.pop("word_tag_delimiter", DEFAULT_WORD_TAG_DELIMITER)
    token_delimiter = params.pop("token_delimiter", None)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return SequenceTaggingDatasetReader(token_indexers=token_indexers,
                                        word_tag_delimiter=word_tag_delimiter,
                                        token_delimiter=token_delimiter,
                                        lazy=lazy)
def from_params(cls, params: Params) -> 'Conll2003DatasetReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    tag_label = params.pop('tag_label', None)
    feature_labels = params.pop('feature_labels', ())
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return Conll2003DatasetReader(token_indexers=token_indexers,
                                  tag_label=tag_label,
                                  feature_labels=feature_labels,
                                  lazy=lazy)
def from_params(cls, params: Params) -> 'JsonlClassificationReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    input = params.pop('input', None)
    gold_label = params.pop('gold_label', None)
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return JsonlClassificationReader(tokenizer=tokenizer,
                                     token_indexers=token_indexers,
                                     input=input,
                                     gold_label=gold_label)
def from_params(cls, params): dataset_type = params.pop("type") tokenizer = Tokenizer.from_params(params.pop('tokenizer', {})) token_indexers = TokenIndexer.dict_from_params( params.pop('token_indexers', {})) lazy = params.pop('lazy', False) params.assert_empty(cls.__name__) return cls(tokenizer=tokenizer, token_indexers=token_indexers, lazy=lazy)
def from_params(cls, params: Params) -> 'TriviaQaReader':
    base_tarball_path = params.pop('base_tarball_path')
    unfiltered_tarball_path = params.pop('unfiltered_tarball_path', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return cls(base_tarball_path=base_tarball_path,
               unfiltered_tarball_path=unfiltered_tarball_path,
               tokenizer=tokenizer,
               token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'DialogueContextDatasetReader':
    lazy = params.pop('lazy', False)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    shuffle_examples = params.pop('shuffle_examples', False)
    params.assert_empty(cls.__name__)
    return cls(lazy=lazy,
               shuffle_examples=shuffle_examples,
               tokenizer=tokenizer,
               token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'BiaoWenMingXiDatasetReader':
    lazy = params.pop('lazy', False)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    sheet_name = params.pop('sheet_name', 'Sheet1')
    params.assert_empty(cls.__name__)
    return cls(lazy=lazy,
               tokenizer=tokenizer,
               token_indexers=token_indexers,
               sheet_name=sheet_name)
def from_params(cls, params: Params) -> 'FEVERSentenceReader':
    claim_tokenizer = Tokenizer.from_params(params.pop('claim_tokenizer', {}))
    wiki_tokenizer = Tokenizer.from_params(params.pop('wiki_tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    db = FeverDocDB(params.pop("db_path", "data/fever/fever.db"))
    params.assert_empty(cls.__name__)
    return FEVERSentenceReader(db=db,
                               claim_tokenizer=claim_tokenizer,
                               wiki_tokenizer=wiki_tokenizer,
                               token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'EntailmentTupleReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    max_tuples = params.pop('max_tuples', 30)
    max_tokens = params.pop('max_tokens', 200)
    params.assert_empty(cls.__name__)
    return EntailmentTupleReader(max_tokens=max_tokens,
                                 max_tuples=max_tuples,
                                 tokenizer=tokenizer,
                                 token_indexers=token_indexers)
def from_params(cls, params):
    token_indexers_params = params.pop('token_indexers', Params({}))
    token_indexers = TokenIndexer.dict_from_params(token_indexers_params)
    sentence_field_name = params.pop('sentence_field_name', 'sentence')
    tags_field_name = params.pop('tags_field_name', 'tags')
    tag_namespace = params.pop('tag_namespace', 'tags')
    params.assert_empty(cls.__name__)
    return cls(token_indexers=token_indexers,
               sentence_field_name=sentence_field_name,
               tags_field_name=tags_field_name,
               tag_namespace=tag_namespace)
def from_params(cls, params: Params) -> 'ToxicReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    max_length = params.pop('max_length', None)
    fill_in_empty_labels = params.pop_bool('fill_in_empty_labels', False)
    params.assert_empty(cls.__name__)
    return cls(max_length=max_length,
               fill_in_empty_labels=fill_in_empty_labels,
               tokenizer=tokenizer,
               token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'StanfordSentimentTreeBankDatasetReader':
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    use_subtrees = params.pop('use_subtrees', False)
    granularity = params.pop_choice('granularity', ["5-class", "3-class", "2-class"], True)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return StanfordSentimentTreeBankDatasetReader(token_indexers=token_indexers,
                                                  use_subtrees=use_subtrees,
                                                  granularity=granularity,
                                                  lazy=lazy)
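# --- Illustrative config --------------------------------------------------------
# Sketch of a config for the sentiment reader above, assuming the AllenNLP 0.x
# ``Params`` API: pop_choice is called with its third argument set to True, so
# omitting "granularity" falls back to the first listed choice ("5-class"),
# while a value outside the listed choices raises a ConfigurationError.
from allennlp.common import Params

example_sst_params = Params({
    "use_subtrees": True,
    "granularity": "3-class",
})
# reader = StanfordSentimentTreeBankDatasetReader.from_params(example_sst_params)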
def from_params(cls, params: Params) -> 'BabiDatasetReader':
    """
    Constructs the dataset reader described by ``params``.
    """
    token_indexers_type = params.pop('token_indexers', None)
    if token_indexers_type is None:
        token_indexers = None
    else:
        token_indexers = TokenIndexer.dict_from_params(token_indexers_type)
    params.assert_empty(cls.__name__)
    return BabiDatasetReader(token_indexers)
def from_params(cls, params: Params) -> 'Seq2SeqDatasetReader':
    source_tokenizer_type = params.pop('source_tokenizer', None)
    source_tokenizer = None if source_tokenizer_type is None else Tokenizer.from_params(source_tokenizer_type)
    target_tokenizer_type = params.pop('target_tokenizer', None)
    target_tokenizer = None if target_tokenizer_type is None else Tokenizer.from_params(target_tokenizer_type)
    source_indexers_type = params.pop('source_token_indexers', None)
    source_add_start_token = params.pop_bool('source_add_start_token', True)
    if source_indexers_type is None:
        source_token_indexers = None
    else:
        source_token_indexers = TokenIndexer.dict_from_params(source_indexers_type)
    target_indexers_type = params.pop('target_token_indexers', None)
    if target_indexers_type is None:
        target_token_indexers = None
    else:
        target_token_indexers = TokenIndexer.dict_from_params(target_indexers_type)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return Seq2SeqDatasetReader(source_tokenizer=source_tokenizer,
                                target_tokenizer=target_tokenizer,
                                source_token_indexers=source_token_indexers,
                                target_token_indexers=target_token_indexers,
                                source_add_start_token=source_add_start_token,
                                lazy=lazy)
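# --- Illustrative config --------------------------------------------------------
# Sketch of the optional-tokenizer/indexer pattern in the Seq2SeqDatasetReader
# method above (AllenNLP 0.x ``Params`` API assumed): keys that are omitted simply
# leave the corresponding argument as None, so the reader falls back to its own
# defaults.
from allennlp.common import Params

example_seq2seq_params = Params({
    "source_token_indexers": {"tokens": {"type": "single_id"}},
    # "target_token_indexers" omitted -> target_token_indexers stays None
    "source_add_start_token": True,
    "lazy": False,
})
# reader = Seq2SeqDatasetReader.from_params(example_seq2seq_params)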
def from_params(cls, params: Params) -> "WinobiasReader": token_indexers = TokenIndexer.dict_from_params(params.pop("token_indexers", {})) max_span_width = params.pop_int("max_span_width") lazy = params.pop('lazy', False) params.assert_empty(cls.__name__) return cls(token_indexers=token_indexers, max_span_width=max_span_width, lazy=lazy)