def from_params(cls, params: Params) -> 'Seq2SeqDatasetReader':
    source_tokenizer_type = params.pop('source_tokenizer', None)
    source_tokenizer = None if source_tokenizer_type is None else Tokenizer.from_params(source_tokenizer_type)
    target_tokenizer_type = params.pop('target_tokenizer', None)
    target_tokenizer = None if target_tokenizer_type is None else Tokenizer.from_params(target_tokenizer_type)
    source_indexers_type = params.pop('source_token_indexers', None)
    source_add_start_token = params.pop('source_add_start_token', True)
    if source_indexers_type is None:
        source_token_indexers = None
    else:
        source_token_indexers = TokenIndexer.dict_from_params(source_indexers_type)
    target_indexers_type = params.pop('target_token_indexers', None)
    if target_indexers_type is None:
        target_token_indexers = None
    else:
        target_token_indexers = TokenIndexer.dict_from_params(target_indexers_type)
    params.assert_empty(cls.__name__)
    return Seq2SeqDatasetReader(source_tokenizer, target_tokenizer,
                                source_token_indexers, target_token_indexers,
                                source_add_start_token)
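# A hedged usage sketch (not part of the reader above): the keys below mirror
# exactly what Seq2SeqDatasetReader.from_params() pops, the 'word'/'single_id'
# type names assume AllenNLP's standard registered tokenizer and token indexer,
# and from_params is assumed to be bound on the class as a @classmethod.
from allennlp.common import Params

seq2seq_config = Params({
    'source_tokenizer': {'type': 'word'},
    'target_tokenizer': {'type': 'word'},
    'source_token_indexers': {'tokens': {'type': 'single_id'}},
    'target_token_indexers': {'tokens': {'type': 'single_id'}},
    'source_add_start_token': True,
})
# Every key above is consumed by a pop(), so assert_empty(cls.__name__) passes.
reader = Seq2SeqDatasetReader.from_params(seq2seq_config)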
def from_params(cls, params: Params) -> 'ScitailGraphDatasetReader':
    lazy = params.pop('lazy', False)
    tokenizer_params = params.pop('tokenizer', None)
    entities_tokenizer_params = params.pop('entities_tokenizer', None)
    max_length = params.pop("max_length", None)
    if not tokenizer_params and not entities_tokenizer_params:
        raise ConfigurationError(
            "Please specify at least one of tokenizer and entities_tokenizer")
    tokenizer = Tokenizer.from_params(tokenizer_params) if tokenizer_params else None
    entities_tokenizer = Tokenizer.from_params(entities_tokenizer_params) if entities_tokenizer_params else None
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    entities_indexers = TokenIndexer.dict_from_params(params.pop('entities_indexers', {}))
    params.assert_empty(cls.__name__)
    return cls(lazy=lazy,
               tokenizer=tokenizer,
               token_indexers=token_indexers,
               max_length=max_length,
               entities_tokenizer=entities_tokenizer,
               entities_indexers=entities_indexers)
def from_params(cls, params: Params) -> 'SpanAeDatasetReader':
    source_tokenizer_type = params.pop('source_tokenizer', None)
    source_tokenizer = None if source_tokenizer_type is None else Tokenizer.from_params(source_tokenizer_type)
    target_tokenizer_type = params.pop('target_tokenizer', None)
    target_tokenizer = None if target_tokenizer_type is None else Tokenizer.from_params(target_tokenizer_type)
    source_indexers_type = params.pop('source_token_indexers', None)
    source_add_start_token = params.pop_bool('source_add_start_token', True)
    if source_indexers_type is None:
        source_token_indexers = None
    else:
        source_token_indexers = TokenIndexer.dict_from_params(source_indexers_type)
    target_indexers_type = params.pop('target_token_indexers', None)
    if target_indexers_type is None:
        target_token_indexers = None
    else:
        target_token_indexers = TokenIndexer.dict_from_params(target_indexers_type)
    lazy = params.pop('lazy', False)
    max_span_width = params.pop('max_span_width', 1)
    params.assert_empty(cls.__name__)
    return SpanAeDatasetReader(source_tokenizer, target_tokenizer,
                               source_token_indexers, target_token_indexers,
                               source_add_start_token, lazy, max_span_width)
def from_params(cls, params: Params) -> 'NLPCC3DatasetReader':
    source_tokenizer_type = params.pop('source_tokenizer', None)
    source_tokenizer = None if source_tokenizer_type is None else Tokenizer.from_params(source_tokenizer_type)
    target_tokenizer_type = params.pop('target_tokenizer', None)
    target_tokenizer = None if target_tokenizer_type is None else Tokenizer.from_params(target_tokenizer_type)
    source_indexers_type = params.pop('source_token_indexers', None)
    source_add_start_token = params.pop_bool('source_add_start_token', True)
    if source_indexers_type is None:
        source_token_indexers = None
    else:
        source_token_indexers = TokenIndexer.dict_from_params(source_indexers_type)
    target_indexers_type = params.pop('target_token_indexers', None)
    if target_indexers_type is None:
        target_token_indexers = None
    else:
        target_token_indexers = TokenIndexer.dict_from_params(target_indexers_type)
    lazy = params.pop('lazy', False)
    make_vocab = params.pop_bool('make_vocab', False)
    max_encoding_steps = params.pop('max_encoding_steps', 1000)
    params.assert_empty(cls.__name__)
    return NLPCC3DatasetReader(source_tokenizer, target_tokenizer,
                               source_token_indexers, target_token_indexers,
                               source_add_start_token, lazy, make_vocab,
                               max_encoding_steps)
def from_params(cls, params: Params) -> 'FEVERSentenceReader':
    claim_tokenizer = Tokenizer.from_params(params.pop('claim_tokenizer', {}))
    wiki_tokenizer = Tokenizer.from_params(params.pop('wiki_tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    db = FeverDocDB(params.pop("db_path", "data/fever/fever.db"))
    params.assert_empty(cls.__name__)
    return FEVERSentenceReader(db=db,
                               claim_tokenizer=claim_tokenizer,
                               wiki_tokenizer=wiki_tokenizer,
                               token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'ArcBidafReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer, token_indexers=token_indexers, lazy=lazy)
def from_params(cls, params: Params) -> 'SnliSrlReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return SnliSrlReader(tokenizer=tokenizer, token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'WikiTablesDatasetReader':
    lazy = params.pop('lazy', False)
    tables_directory = params.pop('tables_directory', None)
    dpd_output_directory = params.pop('dpd_output_directory', None)
    max_dpd_logical_forms = params.pop_int('max_dpd_logical_forms', 10)
    sort_dpd_logical_forms = params.pop_bool('sort_dpd_logical_forms', True)
    max_dpd_tries = params.pop_int('max_dpd_tries', 20)
    keep_if_no_dpd = params.pop_bool('keep_if_no_dpd', False)
    default_tokenizer_params = {'word_splitter': {'type': 'spacy', 'pos_tags': True}}
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', default_tokenizer_params))
    question_token_indexers = TokenIndexer.dict_from_params(params.pop('question_token_indexers', {}))
    table_token_indexers = TokenIndexer.dict_from_params(params.pop('table_token_indexers', {}))
    use_table_for_vocab = params.pop_bool('use_table_for_vocab', False)
    linking_feature_extractors = params.pop('linking_feature_extractors', None)
    include_table_metadata = params.pop_bool('include_table_metadata', False)
    max_table_tokens = params.pop_int('max_table_tokens', None)
    output_agendas = params.pop_bool('output_agendas', False)
    params.assert_empty(cls.__name__)
    return WikiTablesDatasetReader(lazy=lazy,
                                   tables_directory=tables_directory,
                                   dpd_output_directory=dpd_output_directory,
                                   max_dpd_logical_forms=max_dpd_logical_forms,
                                   sort_dpd_logical_forms=sort_dpd_logical_forms,
                                   max_dpd_tries=max_dpd_tries,
                                   keep_if_no_dpd=keep_if_no_dpd,
                                   tokenizer=tokenizer,
                                   question_token_indexers=question_token_indexers,
                                   table_token_indexers=table_token_indexers,
                                   use_table_for_vocab=use_table_for_vocab,
                                   linking_feature_extractors=linking_feature_extractors,
                                   include_table_metadata=include_table_metadata,
                                   max_table_tokens=max_table_tokens,
                                   output_agendas=output_agendas)
def from_params(cls, params: Params) -> 'SnliReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return SnliReader(tokenizer=tokenizer, token_indexers=token_indexers, lazy=lazy)
def from_params(cls, params: Params) -> 'SpookyAuthorsDatasetReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    cnn_paper_dataset = params.pop("cnn_paper_dataset", False)
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer,
               token_indexers=token_indexers,
               cnn_paper_dataset=cnn_paper_dataset)
def from_params(cls, params: Params) -> 'SemanticScholarDatasetReader':
    lazy = params.pop('lazy', False)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return cls(lazy=lazy, tokenizer=tokenizer, token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'SwagReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    use_only_gold_examples = params.pop('use_only_gold_examples', False)
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer,
               token_indexers=token_indexers,
               use_only_gold_examples=use_only_gold_examples)
def from_params(cls, params):
    # Discard the registered "type" key so that assert_empty() passes below.
    dataset_type = params.pop("type")
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer, token_indexers=token_indexers, lazy=lazy)
def from_params(cls, params: Params) -> 'JsonlClassificationReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    input = params.pop('input', None)
    gold_label = params.pop('gold_label', None)
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return JsonlClassificationReader(tokenizer=tokenizer,
                                     token_indexers=token_indexers,
                                     input=input,
                                     gold_label=gold_label)
def from_params(cls, params: Params) -> 'SquadSentenceSelectionReader':
    negative_sentence_selection = params.pop('negative_sentence_selection', 'paragraph')
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return SquadSentenceSelectionReader(negative_sentence_selection=negative_sentence_selection,
                                        tokenizer=tokenizer,
                                        token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'EntailmentTupleReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    max_tuples = params.pop('max_tuples', 30)
    max_tokens = params.pop('max_tokens', 200)
    params.assert_empty(cls.__name__)
    return EntailmentTupleReader(max_tokens=max_tokens,
                                 max_tuples=max_tuples,
                                 tokenizer=tokenizer,
                                 token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'DialogueContextDatasetReader':
    lazy = params.pop('lazy', False)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    shuffle_examples = params.pop('shuffle_examples', False)
    params.assert_empty(cls.__name__)
    return cls(lazy=lazy,
               shuffle_examples=shuffle_examples,
               tokenizer=tokenizer,
               token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'TriviaQaReader':
    base_tarball_path = params.pop('base_tarball_path')
    unfiltered_tarball_path = params.pop('unfiltered_tarball_path', None)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return cls(base_tarball_path=base_tarball_path,
               unfiltered_tarball_path=unfiltered_tarball_path,
               tokenizer=tokenizer,
               token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'BiaoWenMingXiDatasetReader':
    lazy = params.pop('lazy', False)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    sheet_name = params.pop('sheet_name', 'Sheet1')
    params.assert_empty(cls.__name__)
    return cls(lazy=lazy,
               tokenizer=tokenizer,
               token_indexers=token_indexers,
               sheet_name=sheet_name)
def from_params(cls, params: Params) -> 'Squad2Reader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    lazy = params.pop('lazy', False)
    maxRows = params.pop('maxRows', -1)
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer,
               token_indexers=token_indexers,
               lazy=lazy,
               maxRows=maxRows)
def from_params(cls, params: Params) -> 'ToxicReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    max_length = params.pop('max_length', None)
    fill_in_empty_labels = params.pop_bool('fill_in_empty_labels', False)
    params.assert_empty(cls.__name__)
    return cls(max_length=max_length,
               fill_in_empty_labels=fill_in_empty_labels,
               tokenizer=tokenizer,
               token_indexers=token_indexers)
def from_params(cls, params: Params) -> 'OntologyMatchingDatasetReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    name_token_indexers = TokenIndexer.dict_from_params(params.pop('name_token_indexers', {}))
    token_only_indexer = TokenIndexer.dict_from_params(params.pop('token_only_indexer', {}))
    params.assert_empty(cls.__name__)
    return OntologyMatchingDatasetReader(tokenizer=tokenizer,
                                         name_token_indexers=name_token_indexers,
                                         token_only_indexer=token_only_indexer)
def from_params(cls, params: Params) -> 'Seq2MultiSeqDatasetReader':
    source_tokenizer_type = params.pop('source_tokenizer', None)
    source_tokenizer = None if source_tokenizer_type is None else Tokenizer.from_params(source_tokenizer_type)
    target_tokenizer_type = params.pop('target_tokenizer', None)
    target_tokenizer = None if target_tokenizer_type is None else Tokenizer.from_params(target_tokenizer_type)
    source_indexers_type = params.pop('source_token_indexers', None)
    source_add_start_token = params.pop_bool('source_add_start_token', True)
    if source_indexers_type is None:
        source_token_indexers = None
    else:
        source_token_indexers = TokenIndexer.dict_from_params(source_indexers_type)
    upos_indexers_type = params.pop('upos_token_indexers', None)
    if upos_indexers_type is None:
        upos_token_indexers = None
    else:
        upos_token_indexers = TokenIndexer.dict_from_params(upos_indexers_type)
    ner_indexers_type = params.pop('ner_token_indexers', None)
    if ner_indexers_type is None:
        ner_token_indexers = None
    else:
        ner_token_indexers = TokenIndexer.dict_from_params(ner_indexers_type)
    chunk_indexers_type = params.pop('chunk_token_indexers', None)
    if chunk_indexers_type is None:
        chunk_token_indexers = None
    else:
        chunk_token_indexers = TokenIndexer.dict_from_params(chunk_indexers_type)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return Seq2MultiSeqDatasetReader(source_tokenizer, target_tokenizer,
                                     source_token_indexers, upos_token_indexers,
                                     ner_token_indexers, chunk_token_indexers,
                                     source_add_start_token, lazy)
def from_params(cls, params: Params) -> 'SeqTask2SeqDatasetReader':
    source_tokenizer_type = params.pop('source_tokenizer', None)
    source_tokenizer = None if source_tokenizer_type is None else Tokenizer.from_params(source_tokenizer_type)
    target_tokenizer_type = params.pop('target_tokenizer', None)
    target_tokenizer = None if target_tokenizer_type is None else Tokenizer.from_params(target_tokenizer_type)
    task_indexers_type = params.pop('task_token_indexers', None)
    domain_indexers_type = params.pop('domain_token_indexers', None)
    source_indexers_type = params.pop('source_token_indexers', None)
    source_add_start_token = params.pop_bool('source_add_start_token', True)
    if task_indexers_type is None:
        task_token_indexers = None
    else:
        task_token_indexers = TokenIndexer.dict_from_params(task_indexers_type)
    if domain_indexers_type is None:
        domain_token_indexers = None
    else:
        domain_token_indexers = TokenIndexer.dict_from_params(domain_indexers_type)
    if source_indexers_type is None:
        source_token_indexers = None
    else:
        source_token_indexers = TokenIndexer.dict_from_params(source_indexers_type)
    target_indexers_type = params.pop('target_token_indexers', None)
    if target_indexers_type is None:
        target_token_indexers = None
    else:
        target_token_indexers = TokenIndexer.dict_from_params(target_indexers_type)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return SeqTask2SeqDatasetReader(source_tokenizer, target_tokenizer,
                                    task_token_indexers, domain_token_indexers,
                                    source_token_indexers, target_token_indexers,
                                    source_add_start_token, lazy)
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             sample: int = -1,
             use_only_gold_examples: bool = False) -> None:
    super().__init__(lazy=False)
    # Tokenizer is an abstract base class, so fall back to a concrete
    # WordTokenizer when no tokenizer is supplied.
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self.use_only_gold_examples = use_only_gold_examples
    self._sample = sample
def from_params(cls, params: Params) -> 'FEVERReader':
    claim_tokenizer = Tokenizer.from_params(params.pop('claim_tokenizer', {}))
    wiki_tokenizer = Tokenizer.from_params(params.pop('wiki_tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    sentence_level = params.pop("sentence_level", False)
    is_snopes = params.pop("is_snopes", False)
    if is_snopes:
        db = SnopesDocDB(params.pop("db_path", "dataset/snopes.pages.json"))
    else:
        db = FeverDocDB(params.pop("db_path", "data/fever.db"))
    params.assert_empty(cls.__name__)
    return FEVERReader(db=db,
                       sentence_level=sentence_level,
                       claim_tokenizer=claim_tokenizer,
                       wiki_tokenizer=wiki_tokenizer,
                       token_indexers=token_indexers)
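# A hedged configuration sketch for FEVERReader.from_params() (assumed values,
# not taken from the source): "is_snopes" selects which document database class
# is opened, and "db_path" falls back to "data/fever.db" for FEVER or
# "dataset/snopes.pages.json" for Snopes when it is omitted.
from allennlp.common import Params

fever_config = Params({
    'claim_tokenizer': {'type': 'word'},
    'wiki_tokenizer': {'type': 'word'},
    'token_indexers': {'tokens': {'type': 'single_id'}},
    'sentence_level': True,
    'is_snopes': False,            # use FeverDocDB rather than SnopesDocDB
    'db_path': 'data/fever.db',
})
fever_reader = FEVERReader.from_params(fever_config)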
def from_params(cls, params: Params) -> 'MsMarcoReaderTest':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    span_file_path = params.pop('span_file_path', None)
    extraction_model_path = params.pop('extraction_model_path', None)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return cls(tokenizer=tokenizer,
               token_indexers=token_indexers,
               span_file_path=span_file_path,
               extraction_model_path=extraction_model_path,
               lazy=lazy)
def from_params(cls, params: Params) -> 'AclarcDatasetReader':
    lazy = params.pop('lazy', False)
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    use_lexicon_features = params.pop_bool("use_lexicon_features", False)
    use_sparse_lexicon_features = params.pop_bool("use_sparse_lexicon_features", False)
    with_elmo = params.pop_bool("with_elmo", False)
    params.assert_empty(cls.__name__)
    return cls(lazy=lazy,
               tokenizer=tokenizer,
               use_lexicon_features=use_lexicon_features,
               use_sparse_lexicon_features=use_sparse_lexicon_features,
               with_elmo=with_elmo)
def string_to_fields(string: str,
                     tokenizer: Tokenizer,
                     token_indexers: Dict[str, TokenIndexer],
                     golden_tokenizer: Tokenizer,
                     golden_token_indexers: Dict[str, TokenIndexer]):
    # Tokenizer/indexers for the golden (reference) sequence are passed in
    # explicitly alongside the main tokenizer/indexers.
    tokenized_string = tokenizer.tokenize(string)
    tokenized_string.insert(0, Token(END_SYMBOL))
    field = TextField(tokenized_string, token_indexers)
    # TODO: always use a single-id token indexer and the default/BPE tokenizer
    # here, because BERT/ELMo will be passed for the main string.
    tokenized_golden_string = golden_tokenizer.tokenize(string)
    # Append EOS at the end so the loss can be computed over it.
    tokenized_golden_string.append(Token(END_SYMBOL))
    field_golden = TextField(tokenized_golden_string, golden_token_indexers)
    return field, field_golden
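# A hedged usage sketch for string_to_fields(): WordTokenizer and
# SingleIdTokenIndexer are stand-ins here; real call sites may pass a
# BPE- or BERT-based tokenizer/indexer pair for the main field instead.
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer

tokenizer = WordTokenizer()
indexers = {'tokens': SingleIdTokenIndexer()}
field, field_golden = string_to_fields("the quick brown fox",
                                       tokenizer, indexers,
                                       golden_tokenizer=tokenizer,
                                       golden_token_indexers=indexers)
# field starts with END_SYMBOL; field_golden ends with END_SYMBOL for the loss.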
def from_params(cls, params: Params) -> 'SkipPhraseDatasetReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    pivot_phrase_token_indexers = TokenIndexer.dict_from_params(
        params.pop('pivot_phrase_token_indexers', {}))
    window_size = params.pop('window_size', 5)
    pivot_ngram_degree = params.pop('pivot_ngram_degree', 1)
    lazy = params.pop('lazy', False)
    # Check that there are no unprocessed parameters left.
    params.assert_empty(cls.__name__)
    return cls(window_size=window_size,
               pivot_ngram_degree=pivot_ngram_degree,
               tokenizer=tokenizer,
               pivot_phrase_token_indexers=pivot_phrase_token_indexers,
               lazy=lazy)
def from_params(cls, params: Params) -> 'Seq2SeqDatasetReader':
    source_tokenizer_type = params.pop('source_tokenizer', None)
    source_tokenizer = None if source_tokenizer_type is None else Tokenizer.from_params(source_tokenizer_type)
    target_tokenizer_type = params.pop('target_tokenizer', None)
    target_tokenizer = None if target_tokenizer_type is None else Tokenizer.from_params(target_tokenizer_type)
    source_indexers_type = params.pop('source_token_indexers', None)
    source_add_start_token = params.pop_bool('source_add_start_token', True)
    if source_indexers_type is None:
        source_token_indexers = None
    else:
        source_token_indexers = TokenIndexer.dict_from_params(source_indexers_type)
    target_indexers_type = params.pop('target_token_indexers', None)
    if target_indexers_type is None:
        target_token_indexers = None
    else:
        target_token_indexers = TokenIndexer.dict_from_params(target_indexers_type)
    lazy = params.pop('lazy', False)
    params.assert_empty(cls.__name__)
    return Seq2SeqDatasetReader(source_tokenizer=source_tokenizer,
                                target_tokenizer=target_tokenizer,
                                source_token_indexers=source_token_indexers,
                                target_token_indexers=target_token_indexers,
                                source_add_start_token=source_add_start_token,
                                lazy=lazy)
def from_params(cls, params: Params) -> 'CsvClassificationReader':
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    input = params.pop('pos_input', None)
    gold_label = params.pop('pos_gold_label', None)
    skip_header = params.pop('skip_header', None)
    delimiter = params.pop('delimiter', None)
    token_indexers = TokenIndexer.dict_from_params(params.pop('token_indexers', {}))
    params.assert_empty(cls.__name__)
    return CsvClassificationReader(tokenizer=tokenizer,
                                   token_indexers=token_indexers,
                                   skip_header=skip_header,
                                   delimiter=delimiter,
                                   input=input,
                                   gold_label=gold_label)