def from_config(
    cls,
    config: Config,
    feature_config: ModelInputConfig,
    target_config: TargetConfig,
    **kwargs,
):
    """Factory method to construct an instance of
    ContextualIntentSlotModelDataHandler from the module's config, model
    input config and target config.

    Args:
        config (Config): Configuration object specifying all the parameters
            of ContextualIntentSlotModelDataHandler.
        feature_config (ModelInputConfig): Configuration object specifying
            model input.
        target_config (TargetConfig): Configuration object specifying target.

    Returns:
        type: An instance of ContextualIntentSlotModelDataHandler.
    """
    # Map each model input name to the field class that will parse it.
    input_field_classes = {
        ModelInput.TEXT: TextFeatureField,
        ModelInput.DICT: DictFeatureField,
        ModelInput.CHAR: CharFeatureField,
        ModelInput.CONTEXTUAL_TOKEN_EMBEDDING: ContextualTokenEmbeddingField,
        ModelInput.SEQ: SeqFeatureField,
        ModelInput.DENSE: FloatVectorField,
    }
    features: Dict[str, Field] = create_fields(feature_config, input_field_classes)

    # Document-level and word-level label fields.
    labels: Dict[str, Field] = create_label_fields(
        target_config,
        {
            DocLabelConfig._name: DocLabelField,
            WordLabelConfig._name: WordLabelField,
        },
    )

    # Per-example auxiliary columns carried through the pipeline unchanged.
    extra_fields: Dict[str, Field] = {
        ExtraField.DOC_WEIGHT: FloatField(),
        ExtraField.WORD_WEIGHT: FloatField(),
        ExtraField.RAW_WORD_LABEL: RawField(),
        ExtraField.TOKEN_RANGE: RawField(),
        ExtraField.UTTERANCE: RawField(),
    }

    # Forward every remaining config attribute to __init__ as keyword args.
    kwargs.update(config.items())
    return cls(
        raw_columns=config.columns_to_read,
        labels=labels,
        features=features,
        extra_fields=extra_fields,
        **kwargs,
    )
def from_config(cls, config: Config, feature_config: FeatureConfig, *args, **kwargs):
    """Factory method building the parser data handler from config objects.

    Creates a text field with a special-UNK vocabulary, optional dict and
    contextual-embedding features, and a single action field that serves as
    both a feature (fed to the parser's forward pass) and the label.
    """
    word_cfg = feature_config.word_feat
    text_field = TextFeatureFieldWithSpecialUnk(
        pretrained_embeddings_path=word_cfg.pretrained_embeddings_path,
        embed_dim=word_cfg.embed_dim,
        embedding_init_strategy=word_cfg.embedding_init_strategy,
        vocab_file=word_cfg.vocab_file,
        vocab_size=word_cfg.vocab_size,
        vocab_from_train_data=word_cfg.vocab_from_train_data,
        vocab_from_all_data=word_cfg.vocab_from_all_data,
        min_freq=word_cfg.min_freq,
        pad_token=None,
    )
    features: Dict[str, Field] = {DatasetFieldName.TEXT_FIELD: text_field}

    if feature_config.dict_feat and feature_config.dict_feat.embed_dim > 0:
        features[DatasetFieldName.DICT_FIELD] = DictFeatureField()

    # Expose actions as a feature so they can be passed to RNNGParser's
    # forward method during training time; the same field instance is
    # reused below as the label field.
    action_field = ActionField()
    features[ACTION_FEATURE_FIELD] = action_field

    if feature_config.contextual_token_embedding:
        features[DatasetFieldName.CONTEXTUAL_TOKEN_EMBEDDING] = (
            ContextualTokenEmbeddingField(
                embed_dim=feature_config.contextual_token_embedding.embed_dim
            )
        )

    extra_fields: Dict[str, Field] = {
        DatasetFieldName.TOKENS: RawField(),
        "text": RawField(),
    }

    return cls(
        raw_columns=config.columns_to_read,
        features=features,
        labels={ACTION_LABEL_FIELD: action_field},
        extra_fields=extra_fields,
        train_path=config.train_path,
        eval_path=config.eval_path,
        test_path=config.test_path,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size,
        test_batch_size=config.test_batch_size,
        shuffle=config.shuffle,
        sort_within_batch=config.sort_within_batch,
        column_mapping=config.column_mapping,
        **kwargs,
    )
def from_config(
    cls,
    config: Config,
    model_input_config: ModelInputConfig,
    target_config: TargetConfig,
    **kwargs,
):
    """
    Factory method to construct an instance of this data handler from the
    module's config object, model input config and target config.

    Args:
        config (Config): Configuration object specifying all the parameters
            of the data handler.
        model_input_config (ModelInputConfig): Configuration object
            specifying all the parameters of the model input.
        target_config (TargetConfig): Configuration object specifying all
            the parameters of the target.

    Returns:
        type: An instance of `KDDocClassificationDataHandler`.
    """
    # Map each model input name to the field class that will parse it.
    model_input_fields: Dict[str, Field] = create_fields(
        model_input_config,
        {
            ModelInput.WORD_FEAT: TextFeatureField,
            ModelInput.DICT_FEAT: DictFeatureField,
            ModelInput.CHAR_FEAT: CharFeatureField,
            ModelInput.PRETRAINED_MODEL_EMBEDDING: PretrainedModelEmbeddingField,
        },
    )
    target_fields: Dict[str, Field] = create_label_fields(
        target_config, {DocLabelConfig._name: DocLabelField}
    )
    extra_fields: Dict[str, Field] = {ExtraField.RAW_TEXT: RawField()}

    # Soft (knowledge-distillation) targets: when `target_prob` is set, the
    # teacher's probabilities and logits become additional target fields and
    # the hard label is carried along as a raw extra field.
    # Fix: the original code tested `target_config.target_prob` twice in two
    # consecutive, identical `if` statements; a single branch is equivalent.
    if target_config.target_prob:
        target_fields[Target.TARGET_PROB_FIELD] = RawField()
        target_fields[Target.TARGET_LOGITS_FIELD] = RawField()
        extra_fields[Target.TARGET_LABEL_FIELD] = RawField()

    # Forward every remaining config attribute to __init__ as keyword args.
    kwargs.update(config.items())
    return cls(
        raw_columns=config.columns_to_read,
        labels=target_fields,
        features=model_input_fields,
        extra_fields=extra_fields,
        **kwargs,
    )
def from_config(
    cls,
    config: Config,
    feature_config: FeatureConfig,
    label_configs: Union[DocLabelConfig, WordLabelConfig, List[TargetConfigBase]],
    **kwargs,
):
    """Factory method building the joint doc/word data handler from config
    objects: input features, doc- and word-level labels, and raw extras."""
    # Map each dataset field name to the field class that will parse it.
    feature_field_classes = {
        DatasetFieldName.TEXT_FIELD: TextFeatureField,
        DatasetFieldName.DICT_FIELD: DictFeatureField,
        DatasetFieldName.CHAR_FIELD: CharFeatureField,
        DatasetFieldName.DENSE_FIELD: FloatVectorField,
        DatasetFieldName.PRETRAINED_MODEL_EMBEDDING: PretrainedModelEmbeddingField,
    }
    features: Dict[str, Field] = create_fields(feature_config, feature_field_classes)

    # Label fields.
    labels: Dict[str, Field] = create_label_fields(
        label_configs,
        {
            DocLabelConfig._name: DocLabelField,
            WordLabelConfig._name: WordLabelField,
        },
    )

    extra_fields: Dict[str, Field] = {
        DatasetFieldName.DOC_WEIGHT_FIELD: FloatField(),
        DatasetFieldName.WORD_WEIGHT_FIELD: FloatField(),
        DatasetFieldName.TOKEN_RANGE: RawField(),
        DatasetFieldName.UTTERANCE_FIELD: RawField(),
    }
    # Raw word labels are only carried through when word tagging is enabled.
    if WordLabelConfig._name in labels:
        extra_fields[DatasetFieldName.RAW_WORD_LABEL] = RawField()

    # Forward every remaining config attribute to __init__ as keyword args.
    kwargs.update(config.items())
    return cls(
        raw_columns=config.columns_to_read,
        labels=labels,
        features=features,
        extra_fields=extra_fields,
        **kwargs,
    )
def from_config(cls, config: Config, feature_config: FeatureConfig,
                label_config: DocLabelConfig, **kwargs):
    """Factory method building a sequence-input doc classification data
    handler: one sequence feature field, one doc label, raw utterance."""
    word_cfg = feature_config.word_feat
    seq_field = SeqFeatureField(
        pretrained_embeddings_path=word_cfg.pretrained_embeddings_path,
        embed_dim=word_cfg.embed_dim,
        embedding_init_strategy=word_cfg.embedding_init_strategy,
        vocab_file=word_cfg.vocab_file,
        vocab_size=word_cfg.vocab_size,
        vocab_from_train_data=word_cfg.vocab_from_train_data,
    )
    features: Dict[str, Field] = {DatasetFieldName.TEXT_FIELD: seq_field}
    labels: Dict[str, Field] = {DocLabelConfig._name: DocLabelField()}
    extra_fields: Dict[str, Field] = {
        DatasetFieldName.UTTERANCE_FIELD: RawField()
    }
    return cls(
        raw_columns=config.columns_to_read,
        labels=labels,
        features=features,
        extra_fields=extra_fields,
        shuffle=config.shuffle,
        train_path=config.train_path,
        eval_path=config.eval_path,
        test_path=config.test_path,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size,
        test_batch_size=config.test_batch_size,
        **kwargs,
    )
def from_config(
    cls,
    config: Config,
    feature_config: ModelInputConfig,
    target_config: TargetConfig,
    **kwargs,
):
    """Factory method building a paired-text data handler in which both
    texts share a single processing field."""
    features: Dict[str, Field] = create_fields(
        feature_config,
        {
            ModelInput.TEXT1: TextFeatureField,
            ModelInput.TEXT2: TextFeatureField,
        },
    )
    assert len(features) == 2
    # Both sides of the pair must be processed identically (same vocab,
    # same numericalization), so the second text reuses the first's field.
    features[ModelInput.TEXT2] = features[ModelInput.TEXT1]

    labels: Dict[str, Field] = create_label_fields(
        target_config, {DocLabelConfig._name: DocLabelField}
    )
    extra_fields: Dict[str, Field] = {
        ExtraField.UTTERANCE_PAIR: RawField()
    }

    # Forward every remaining config attribute to __init__ as keyword args.
    kwargs.update(config.items())
    return cls(
        raw_columns=config.columns_to_read,
        labels=labels,
        features=features,
        extra_fields=extra_fields,
        **kwargs,
    )
def from_config(cls, config: Config, feature_config: ModelInputConfig,
                target_config: ModelOutputConfig,
                text_embedder_config: EmbedderInterface.Config, **kwargs):
    """Factory method building the dialogue data handler: a shared text
    embedder feeds BPE fields for both input sequences and targets, and the
    dialogue-specific settings are attached to the constructed instance."""
    text_embedder: EmbedderInterface = EmbedderInterface.from_config(
        text_embedder_config
    )

    features: Dict[str, Field] = {ModelInput.SEQ: BPEField(text_embedder)}
    assert len(features)

    targets: Dict[str, Field] = {
        ModelOutputConfig._name: BPEField(
            text_embedder, is_target=True, all_responses=config.all_responses
        ),
    }

    extra_fields = {
        RAW_TEXT: RawField(),
        ModelInput.DLG_LEN: RawField(),
        ModelInput.DLG_ID: RawField(),
        ModelInput.DOMAIN_ID: RawField(),
        ModelInput.TASK_ID: RawField(),
    }

    # Forward every remaining config attribute to __init__ as keyword args.
    kwargs.update(config.items())
    handler = cls(
        raw_columns=[],  # ignored in our read function
        features=features,
        labels=targets,
        extra_fields=extra_fields,
        **kwargs,
    )

    # Dialogue-specific settings the base constructor does not accept are
    # stashed on the instance after construction.
    handler.max_turns = config.max_turns
    handler.text_embedder_cfg = text_embedder_config
    handler.all_responses = config.all_responses
    handler.preproc_chunksize = config.preproc_chunksize
    handler.train_domains = config.train_domains
    handler.eval_domains = config.eval_domains
    handler.featurized_cache_dir = config.featurized_cache_dir
    handler.test_domains = config.test_domains
    handler.text_embedder = text_embedder
    handler.seed = config.seed
    return handler
def from_config(cls, config: Config, feature_config: FeatureConfig, *args, **kwargs):
    """
    Factory method to construct an instance of `LanguageModelDataHandler`
    from the module's config object and feature config object.

    For language modeling the only input is a collection of utterances; no
    label fields are created here because the handler derives labels itself
    (the input at time step t+1 becomes the label for the input at step t).

    Args:
        config (LanguageModelDataHandler.Config): Configuration object
            specifying all the parameters of `LanguageModelDataHandler`.
        feature_config (FeatureConfig): Configuration object specifying all
            the parameters of all input features.

    Returns:
        type: An instance of `LanguageModelDataHandler`.
    """
    word_cfg = feature_config.word_feat
    text_field = TextFeatureField(
        # Optionally bracket each utterance with BOS/EOS markers.
        eos_token=VocabMeta.EOS_TOKEN if config.append_eos else None,
        init_token=VocabMeta.INIT_TOKEN if config.append_bos else None,
        pretrained_embeddings_path=word_cfg.pretrained_embeddings_path,
        embed_dim=word_cfg.embed_dim,
        embedding_init_strategy=word_cfg.embedding_init_strategy,
        vocab_file=word_cfg.vocab_file,
        vocab_size=word_cfg.vocab_size,
        vocab_from_train_data=word_cfg.vocab_from_train_data,
    )
    features: Dict[str, Field] = {DatasetFieldName.TEXT_FIELD: text_field}
    labels: Dict[str, Field] = {}
    extra_fields: Dict[str, Field] = {
        DatasetFieldName.UTTERANCE_FIELD: RawField()
    }
    return cls(
        raw_columns=config.columns_to_read,
        features=features,
        labels=labels,
        extra_fields=extra_fields,
        train_path=config.train_path,
        eval_path=config.eval_path,
        test_path=config.test_path,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size,
        test_batch_size=config.test_batch_size,
        **kwargs,
    )
def __init__(
    self,
    raw_columns: List[str],
    labels: Dict[str, Field],
    features: Dict[str, Field],
    featurizer: Featurizer,
    extra_fields: Dict[str, Field] = None,
    text_feature_name: str = DatasetFieldName.TEXT_FIELD,
    shuffle: bool = True,
    sort_within_batch: bool = True,
    train_path: str = "train.tsv",
    eval_path: str = "eval.tsv",
    test_path: str = "test.tsv",
    train_batch_size: int = 128,
    eval_batch_size: int = 128,
    test_batch_size: int = 128,
    max_seq_len: int = -1,
    pass_index: bool = True,
    column_mapping: Dict[str, str] = None,
    **kwargs,
) -> None:
    """Initialize the shared data-handler state: field dictionaries, the
    featurizer, metadata containers, and data-loading parameters."""
    # Field dictionaries; falsy arguments normalize to empty containers.
    self.raw_columns: List[str] = raw_columns if raw_columns else []
    self.labels: Dict[str, Field] = labels if labels else {}
    self.features: Dict[str, Field] = features if features else {}
    self.extra_fields: Dict[str, Field] = extra_fields if extra_fields else {}
    self.featurizer = featurizer
    if pass_index:
        # Track each example's original position through batching.
        self.extra_fields[BatchContext.INDEX] = RawField()
    self.text_feature_name: str = text_feature_name

    # Metadata and cache containers.
    self.metadata_cls: Type = CommonMetadata
    self.metadata: CommonMetadata = CommonMetadata()
    self._data_cache: MutableMapping[str, Any] = {}

    # Batching / iteration behavior.
    self.shuffle = shuffle
    self.sort_within_batch = sort_within_batch
    self.num_workers = multiprocessing.cpu_count()
    self.max_seq_len = max_seq_len

    # Dataset locations and batch sizes.
    self.train_path = train_path
    self.eval_path = eval_path
    self.test_path = test_path
    self.train_batch_size = train_batch_size
    self.eval_batch_size = eval_batch_size
    self.test_batch_size = test_batch_size
    self.column_mapping = column_mapping

    log_class_usage(__class__)
def from_config(cls, config: Config, feature_config: FeatureConfig,
                label_config: DocLabelConfig, **kwargs):
    """Factory method building a doc classification data handler with a
    sequence word feature and an optional dense feature vector."""
    word_cfg = feature_config.word_feat
    dense_cfg = feature_config.dense_feat

    seq_field = SeqFeatureField(
        pretrained_embeddings_path=word_cfg.pretrained_embeddings_path,
        embed_dim=word_cfg.embed_dim,
        embedding_init_strategy=word_cfg.embedding_init_strategy,
        vocab_file=word_cfg.vocab_file,
        vocab_size=word_cfg.vocab_size,
        vocab_from_train_data=word_cfg.vocab_from_train_data,
    )
    features: Dict[str, Field] = {ModelInput.WORD_FEAT: seq_field}
    if dense_cfg:
        # Dense features are only wired in when configured.
        features[ModelInput.DENSE_FEAT] = FloatVectorField(
            dim=dense_cfg.dim,
            dim_error_check=dense_cfg.dim_error_check,
        )

    labels: Dict[str, Field] = {DocLabelConfig._name: DocLabelField()}
    extra_fields: Dict[str, Field] = {
        DatasetFieldName.UTTERANCE_FIELD: RawField()
    }
    return cls(
        raw_columns=config.columns_to_read,
        labels=labels,
        features=features,
        extra_fields=extra_fields,
        shuffle=config.shuffle,
        train_path=config.train_path,
        eval_path=config.eval_path,
        test_path=config.test_path,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size,
        test_batch_size=config.test_batch_size,
        **kwargs,
    )