def fit(self, trn_data, dev_data, save_dir, batch_size, embed: Embedding,
        mention_feedforward: FeedForward, antecedent_feedforward: FeedForward,
        feature_size: int, max_span_width: int, spans_per_word: float, max_antecedents: int,
        lr=1e-3, transformer_lr=1e-5, adam_epsilon=1e-6, weight_decay=0.01, warmup_steps=0.1,
        epochs=150, grad_norm=None, coarse_to_fine: bool = False, inference_order: int = 1,
        lexical_dropout: float = 0.2, context_layer: LSTMContextualEncoder = None,
        devices=None, logger=None, seed=None, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def fit(self, trn_data, dev_data, save_dir, transformer=None, lr=5e-5, transformer_lr=None,
        adam_epsilon=1e-8, weight_decay=0, warmup_steps=0.1, batch_size=32,
        gradient_accumulation=1, grad_norm=5.0, transformer_grad_norm=None,
        average_subwords=False, scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
        word_dropout=None, hidden_dropout=None, max_sequence_length=None,
        ret_raw_hidden_states=False, batch_max_tokens=None, epochs=3, logger=None,
        devices: Union[float, int, List[int]] = None, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def fit(self, trn_data, dev_data, save_dir, transformer, max_seq_length=256,
        transformer_dropout=.33, d_positional=None, n_mlp_arc=500, n_mlp_rel=100,
        mlp_dropout=.33, optimizer='adamw', learning_rate=5e-5,
        learning_rate_transformer=None, weight_decay_rate=0, epsilon=1e-8, clipnorm=None,
        fp16=False, warmup_steps_ratio=0, arc_loss='binary_crossentropy',
        rel_loss='sparse_categorical_crossentropy', metrics=('UF', 'LF'), batch_size=3000,
        samples_per_batch=150, max_samples_per_batch=None, epochs=100, tree=False,
        punct=False, token_mapping=None, enhanced_only=False, run_eagerly=False, logger=None,
        verbose=True, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def fit(self, trn_data, dev_data, save_dir, embed, context_layer, batch_size=40,
        batch_max_tokens=700, lexical_dropout=0.5, dropout=0.2, span_width_feature_size=20,
        ffnn_size=150, ffnn_depth=2, argument_ratio=0.8, predicate_ratio=0.4,
        max_arg_width=30, mlp_label_size=100, enforce_srl_constraint=False,
        use_gold_predicates=False, doc_level_offset=True, use_biaffine=False, lr=1e-3,
        transformer_lr=1e-5, adam_epsilon=1e-6, weight_decay=0.01, warmup_steps=0.1,
        grad_norm=5.0, gradient_accumulation=1, loss_reduction='sum', devices=None,
        logger=None, seed=None, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def fit(self, trn_data, dev_data, save_dir, encoder, lr=5e-5, transformer_lr=None,
        adam_epsilon=1e-8, weight_decay=0, warmup_steps=0.1, grad_norm=1.0, n_mlp_span=500,
        n_mlp_label=100, mlp_dropout=.33, batch_size=None, batch_max_tokens=5000,
        gradient_accumulation=1, epochs=30, patience=0.5, mbr=True, sampler_builder=None,
        delete=('', ':', '``', "''", '.', '?', '!', '-NONE-', 'TOP', ',', 'S1'),
        equal=(('ADVP', 'PRT'),), no_subcategory=True, eval_trn=True, transform=None,
        devices=None, logger=None, seed=None, **kwargs):
    if isinstance(equal, tuple):
        equal = dict(equal)
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
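
# A minimal usage sketch for the constituency `fit` above. The component name
# `CRFConstituencyParser`, the data paths and the embedding choice are assumptions;
# note that `equal` may be passed as a tuple of label pairs, which the method
# normalizes to a dict before delegating to the parent `fit`.
con = CRFConstituencyParser()
con.fit(trn_data='data/ptb/trn.txt', dev_data='data/ptb/dev.txt', save_dir='ckpt/con',
        encoder=ContextualWordEmbedding('token', 'bert-base-cased'),
        equal=(('ADVP', 'PRT'),), mbr=True, epochs=30)
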
def fit(self, trn_data, dev_data, save_dir, embed, encoder=None, lr=1e-3,
        transformer_lr=1e-4, adam_epsilon=1e-8, warmup_steps=0.1, weight_decay=0, crf=False,
        n_mlp_rel=300, mlp_dropout=0.2, batch_size=32, gradient_accumulation=1, grad_norm=1,
        loss_reduction='mean', epochs=30, delimiter=None, doc_level_offset=True,
        eval_trn=False, logger=None, devices: Union[float, int, List[int]] = None, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def fit(self, trn_data, dev_data, save_dir, epochs=5, append_after_sentence=None,
        eos_chars=None, eos_char_min_freq=200, eos_char_is_punct=True, char_min_freq=None,
        window_size=5, batch_size=32, lr=0.001, grad_norm=None, loss_reduction='sum',
        embedding_size=128, rnn_type: str = 'LSTM', rnn_size=256, rnn_layers=1,
        rnn_bidirectional=False, dropout=0.2, devices=None, logger=None, seed=None, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def fit(self, encoder: Embedding, tasks: Dict[str, Task], save_dir, epochs, patience=0.5,
        lr=1e-3, encoder_lr=5e-5, adam_epsilon=1e-8, weight_decay=0.0, warmup_steps=0.1,
        gradient_accumulation=1, grad_norm=5.0, encoder_grad_norm=None,
        decoder_grad_norm=None, tau: float = 0.8, transform=None,
        # prune: Callable = None,
        eval_trn=True, prefetch=None, tasks_need_custom_eval=None, _device_placeholder=False,
        cache=False, devices=None, logger=None, seed=None, **kwargs):
    # Each task carries its own datasets; these placeholders satisfy the parent signature.
    trn_data, dev_data, batch_size = 'trn', 'dev', None
    task_names = list(tasks.keys())
    return super().fit(**merge_locals_kwargs(locals(), kwargs,
                                             excludes=('self', 'kwargs', '__class__', 'tasks')),
                       **tasks)
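
# A sketch of wiring the multi-task `fit` above. The component and task class names
# are assumptions that mirror the task signatures documented later in this section;
# each task carries its own trn/dev paths, so only the shared encoder, the task dict
# and training meta-parameters are passed here.
mtl = MultiTaskLearning()
mtl.fit(encoder=ContextualWordEmbedding('token', 'bert-base-cased'),
        tasks={'tok': TaggingTokenization(trn='tok/trn.tsv', dev='tok/dev.tsv'),
               'dep': BiaffineDependencyParsing(trn='dep/trn.conllx', dev='dep/dev.conllx')},
        save_dir='ckpt/mtl', epochs=30)
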
def fit(self, trn_data, dev_data, save_dir, transformer=None, mask_prob=0.15,
        projection=None, average_subwords=False, transformer_hidden_dropout=None,
        layer_dropout=0, mix_embedding: int = None, embed_dropout=.33, n_mlp_arc=500,
        n_mlp_rel=100, mlp_dropout=.33, lr=2e-3, transformer_lr=5e-5, mu=.9, nu=.9,
        epsilon=1e-12, clip=5.0, decay=.75, decay_steps=5000, patience=100, sampler='kmeans',
        n_buckets=32, batch_max_tokens=5000, batch_size=None, epochs=50000, tree=False,
        punct=False, logger=None, verbose=True, max_sequence_length=512,
        devices: Union[float, int, List[int]] = None, transform=None, **kwargs):
    return TorchComponent.fit(self, **merge_locals_kwargs(locals(), kwargs))
def fit(self, trn_data, dev_data, save_dir, transformer, average_subwords=False,
        word_dropout: float = 0.2, hidden_dropout=None, layer_dropout=0, scalar_mix=None,
        mix_embedding: int = 0, grad_norm=5.0, transformer_grad_norm=None, lr=5e-5,
        transformer_lr=None, transformer_layers=None, gradient_accumulation=1,
        adam_epsilon=1e-6, weight_decay=0, warmup_steps=0.1, secondary_encoder=None,
        crf=False, reduction='sum', batch_size=32, sampler_builder: SamplerBuilder = None,
        epochs=3, patience=5, token_key=None, max_seq_len=None, sent_delimiter=None,
        char_level=False, hard_constraint=False, transform=None, logger=None,
        devices: Union[float, int, List[int]] = None, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def fit(self, trn_data, dev_data, save_dir, n_embed=100, pretrained_embed=None,
        embed_dropout=.33, n_lstm_hidden=400, n_lstm_layers=3, lstm_dropout=.33,
        n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, optimizer='adam', lr=2e-3, mu=.9,
        nu=.9, epsilon=1e-12, clip=5.0, decay=.75, decay_steps=5000, patience=100,
        arc_loss='sparse_categorical_crossentropy',
        rel_loss='sparse_categorical_crossentropy', metrics=('UAS', 'LAS'), n_buckets=32,
        batch_size=5000, epochs=50000, early_stopping_patience=100, tree=False, punct=False,
        min_freq=2, run_eagerly=False, logger=None, verbose=True, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def distill(self, teacher: str, trn_data, dev_data, save_dir, batch_size=None, epochs=None,
            kd_criterion='kd_ce_loss', temperature_scheduler='flsw', devices=None,
            logger=None, seed=None, **kwargs):
    devices = devices or cuda_devices()
    if isinstance(kd_criterion, str):
        kd_criterion = KnowledgeDistillationLoss(kd_criterion)
    if isinstance(temperature_scheduler, str):
        temperature_scheduler = TemperatureScheduler.from_name(temperature_scheduler)
    teacher = self.build_teacher(teacher, devices=devices)
    # The student inherits the teacher's vocabularies and config so their outputs align.
    self.vocabs = teacher.vocabs
    config = copy(teacher.config)
    batch_size = batch_size or config.get('batch_size', None)
    epochs = epochs or config.get('epochs', None)
    config.update(kwargs)
    return super().fit(**merge_locals_kwargs(locals(), config,
                                             excludes=('self', 'kwargs', '__class__',
                                                       'config')))
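
# A distillation sketch for the `distill` above (hypothetical student class and paths).
# `teacher` is a path to a trained model; its vocabs and config seed the student, and
# the string identifiers are resolved to a KD loss and a temperature scheduler.
student = TransformerTagger()
student.distill(teacher='ckpt/teacher', trn_data='data/trn.tsv', dev_data='data/dev.tsv',
                save_dir='ckpt/student', kd_criterion='kd_ce_loss',
                temperature_scheduler='flsw')
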
def __init__(self, trn: str = None, dev: str = None, tst: str = None,
             sampler_builder: SamplerBuilder = None, dependencies: str = None,
             scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=2e-3, separate_optimizer=False, punct=False, tree=False,
             apply_constraint=True, n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33,
             pad_rel=None, joint=True, mu=.9, nu=.9, epsilon=1e-12, cls_is_bos=True,
             **kwargs) -> None:
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
def fit(self, trn_data: Any, dev_data: Any, save_dir: str,
        word_embed: Union[str, int, dict] = 200, ngram_embed: Union[str, int, dict] = 50,
        embedding_trainable=True, window_size=4, kernel_size=3,
        filters=(200, 200, 200, 200, 200), dropout_embed=0.2, dropout_hidden=0.2,
        weight_norm=True, loss: Union[tf.keras.losses.Loss, str] = None,
        optimizer: Union[str, tf.keras.optimizers.Optimizer] = 'adam', metrics='accuracy',
        batch_size=100, epochs=100, logger=None, verbose=True, **kwargs):
    assert kwargs.get('run_eagerly', True), 'NgramConvTaggingModel can only run eagerly'
    kwargs['run_eagerly'] = True
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def fit(self, trn_data, dev_data, save_dir, text_a_key=None, text_b_key=None,
        label_key=None, transformer=None, max_seq_length=512, truncate_long_sequences=True,
        # hidden_dropout_prob=0.0,
        lr=5e-5, transformer_lr=None, adam_epsilon=1e-6, weight_decay=0, warmup_steps=0.1,
        batch_size=32, batch_max_tokens=None, epochs=3, logger=None,
        # transform=None,
        devices: Union[float, int, List[int]] = None, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def fit(self, trn_data, dev_data, save_dir, transformer: ContextualWordEmbedding,
        sampler_builder=None, mix_embedding: int = 13, layer_dropout: float = 0.1,
        n_mlp_arc=768, n_mlp_rel=256, mlp_dropout=.33, lr=1e-3, transformer_lr=2.5e-5,
        patience=0.1, batch_size=32, epochs=30, gradient_accumulation=1, adam_epsilon=1e-8,
        weight_decay=0, warmup_steps=0.1, grad_norm=1.0, tree=False, proj=False, punct=False,
        logger=None, verbose=True, devices: Union[float, int, List[int]] = None, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def fit(self, trn_data, dev_data, save_dir, transformer, sent_a_col, sent_b_col,
        similarity_col, delimiter='auto', batch_size=32, max_seq_len=128, epochs=3, lr=1e-3,
        transformer_lr=5e-5, adam_epsilon=1e-8, weight_decay=0.0, warmup_steps=0.1,
        gradient_accumulation=1, grad_norm=1.0, sampler_builder=None, devices=None,
        logger=None, seed=None, finetune: Union[bool, str] = False, eval_trn=True,
        _device_placeholder=False, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def fit(self, trn_data, dev_data, save_dir, batch_size=50, epochs=100, embed=100,
        rnn_input=None, rnn_hidden=256, drop=0.5, lr=0.001, patience=10, crf=True,
        optimizer='adam', token_key='token', tagging_scheme=None,
        anneal_factor: float = 0.5, delimiter=None, anneal_patience=2, devices=None,
        token_delimiter=None, logger=None, verbose=True, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def __init__(self, trn: str = None, dev: str = None, tst: str = None,
             sampler_builder: SamplerBuilder = None, dependencies: str = None,
             scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=2e-3, separate_optimizer=False, punct=False, tree=True, pad_rel=None,
             apply_constraint=False, single_root=True, no_zero_head=None, n_mlp_arc=500,
             n_mlp_rel=100, mlp_dropout=.33, mu=.9, nu=.9, epsilon=1e-12, decay=.75,
             decay_steps=5000, cls_is_bos=True, use_pos=False, **kwargs) -> None:
    r"""Implementation of "Stanford's Graph-based Neural Dependency Parser at the CoNLL 2017
    Shared Task" (:cite:`dozat2017stanford`).

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without
            any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        punct: ``True`` to include punctuations in evaluation.
        tree: ``True`` to enforce tree constraint.
        pad_rel: Padding token for relations.
        apply_constraint: Enforce constraints (see following parameters).
        single_root: Force single root.
        no_zero_head: Enforce that every token has at least one head.
        n_mlp_arc: Number of features for arc representation.
        n_mlp_rel: Number of features for rel representation.
        mlp_dropout: Dropout applied to MLPs.
        mu: First coefficient used for computing running averages of gradient and its
            square in Adam.
        nu: Second coefficient used for computing running averages of gradient and its
            square in Adam.
        epsilon: Term added to the denominator to improve numerical stability.
        decay: Decay rate for the exponential lr scheduler.
        decay_steps: Decay every ``decay_steps`` steps.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        use_pos: Use the part-of-speech feature.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
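
# Construction sketch for the graph-based parsing task above (the class name
# `GraphDependencyParsing` is a placeholder and the paths are hypothetical). The task
# is meant to be plugged into the multi-task `fit` shown earlier in this section.
sdp = GraphDependencyParsing(trn='sdp/trn.conllu', dev='sdp/dev.conllu',
                             apply_constraint=True, single_root=True, no_zero_head=True)
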
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True,
             n_buckets=32, min_freq=2, use_pos=True, **kwargs) -> None:
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.form_vocab: VocabTF = None
    if use_pos:
        self.cpos_vocab: VocabTF = None
    self.rel_vocab: VocabTF = None
    self.puncts: tf.Tensor = None
def __init__(self, trn: str = None, dev: str = None, tst: str = None,
             sampler_builder: SamplerBuilder = None, dependencies: str = None,
             scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=1e-3, separate_optimizer=False, cls_is_bos=False, sep_is_eos=False,
             max_seq_len=None, sent_delimiter=None, char_level=False, hard_constraint=False,
             crf=False, token_key='token',
             dict_tags: Union[DictInterface, Dict[Union[str, Sequence[str]],
                                                  Union[str, Sequence[str]]]] = None,
             **kwargs) -> None:
    """A simple tagger using a linear layer with an optional CRF
    (:cite:`lafferty2001conditional`) layer for any tagging tasks including PoS tagging
    and many others. It also features a custom dictionary ``dict_tags`` which performs
    longest-prefix-matching and replaces matched tokens with the given tags.

    .. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to
        understanding what a dictionary can and cannot do. The tutorial in
        `this book <http://nlp.hankcs.com/book.php>`_ can be very helpful.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without
            any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        sep_is_eos: ``True`` to treat the last token as ``EOS``.
        max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones
            if possible.
        sent_delimiter: Delimiter between sentences, like period or comma, which indicates
            a long sentence can be split here.
        char_level: Whether the sequence length is measured at char level, which is never
            the case for lemmatization.
        hard_constraint: Whether to enforce hard length constraint on sentences. If there
            is no ``sent_delimiter`` in a sentence, it will be split at a token anyway.
        crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
        token_key: The key to tokens in dataset. This should always be set to ``token``
            in MTL.
        dict_tags: A custom dictionary to override predicted tags by performing
            longest-prefix-matching.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
    self.dict_tags = dict_tags
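
# Sketch of overriding tags with a user dictionary via longest-prefix-matching. The
# task class name `TransformerTagging` and the paths are assumptions; per the type of
# `dict_tags`, keys may be single tokens or token sequences, mapped to a tag or a
# sequence of tags.
pos = TransformerTagging(trn='pos/trn.tsv', dev='pos/dev.tsv', crf=True,
                         dict_tags={'HanLP': 'NR', ('New', 'York'): ('NR', 'NR')})
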
def __init__(self, trn: str = None, dev: str = None, tst: str = None,
             sampler_builder: SamplerBuilder = None, dependencies: str = None,
             scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=1e-3, separate_optimizer=False, lexical_dropout=0.5, dropout=0.2,
             span_width_feature_size=20, ffnn_size=150, ffnn_depth=2, argument_ratio=0.8,
             predicate_ratio=0.4, max_arg_width=30, mlp_label_size=100,
             enforce_srl_constraint=False, use_gold_predicates=False, doc_level_offset=True,
             use_biaffine=False, loss_reduction='mean', with_argument=' ',
             **kwargs) -> None:
    r"""An implementation of "Jointly Predicting Predicates and Arguments in Neural
    Semantic Role Labeling" (:cite:`he-etal-2018-jointly`). It generates candidate triples
    of (predicate, arg_start, arg_end) and ranks them.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without
            any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        lexical_dropout: Dropout applied to hidden states of encoder.
        dropout: Dropout used for other layers except the encoder.
        span_width_feature_size: Span width feature size.
        ffnn_size: Feedforward size.
        ffnn_depth: Number of layers of feedforward MLPs.
        argument_ratio: Ratio of candidate arguments over number of tokens.
        predicate_ratio: Ratio of candidate predicates over number of tokens.
        max_arg_width: Maximum argument width.
        mlp_label_size: Feature size for label representation.
        enforce_srl_constraint: Enforce SRL constraints (number of core ARGs etc.).
        use_gold_predicates: Use gold predicates instead of predicting them.
        doc_level_offset: ``True`` to indicate the offsets in ``jsonlines`` are of
            document level.
        use_biaffine: ``True`` to use biaffine (:cite:`dozat:17a`) instead of a linear
            layer for label prediction.
        loss_reduction: The loss reduction used in aggregating losses.
        with_argument: The delimiter used to join the tokens of an argument for outputs.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
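
# Construction sketch for the span-ranking SRL task above (class name and paths are
# assumptions). The two ratios bound how many candidate arguments and predicates are
# kept per token before ranking.
srl = SpanRankingSemanticRoleLabeling(trn='srl/trn.jsonlines', dev='srl/dev.jsonlines',
                                      argument_ratio=0.8, predicate_ratio=0.4,
                                      max_arg_width=30)
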
def __init__(self, trn: str = None, dev: str = None, tst: str = None,
             sampler_builder: SamplerBuilder = None, dependencies: str = None,
             scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=2e-3, separate_optimizer=False, cls_is_bos=True, sep_is_eos=False,
             punct=False, tree=False, proj=False, n_mlp_arc=500, n_mlp_rel=100,
             mlp_dropout=.33, mu=.9, nu=.9, epsilon=1e-12, decay=.75, decay_steps=5000,
             use_pos=False, max_seq_len=None, **kwargs) -> None:
    """Biaffine dependency parsing (:cite:`dozat:17a`).

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without
            any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        sep_is_eos: ``True`` to treat the last token as ``EOS``.
        punct: ``True`` to include punctuations in evaluation.
        tree: ``True`` to enforce tree constraint.
        proj: ``True`` for projective parsing.
        n_mlp_arc: Number of features for arc representation.
        n_mlp_rel: Number of features for rel representation.
        mlp_dropout: Dropout applied to MLPs.
        mu: First coefficient used for computing running averages of gradient and its
            square in Adam.
        nu: Second coefficient used for computing running averages of gradient and its
            square in Adam.
        epsilon: Term added to the denominator to improve numerical stability.
        decay: Decay rate for the exponential lr scheduler.
        decay_steps: Decay every ``decay_steps`` steps.
        use_pos: Use the part-of-speech feature.
        max_seq_len: Prune samples longer than this length.
        **kwargs: Not used.
    """
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
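
# Construction sketch for the biaffine dependency task above (class name and paths are
# assumptions), with tree and projectivity constraints enabled at decoding time.
dep = BiaffineDependencyParsing(trn='dep/trn.conllx', dev='dep/dev.conllx',
                                tree=True, proj=True, punct=False)
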
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=True,
             n_buckets=32, min_freq=0, max_seq_length=256, use_pos=False, mask_p=None,
             graph=False, topk=None, **kwargs) -> None:
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.tokenizer: PreTrainedTokenizer = None
    self.transformer_config: PretrainedConfig = None
    if graph:
        self.orphan_relation = ROOT
def __init__(self, trn: str = None, dev: str = None, tst: str = None,
             sampler_builder: SamplerBuilder = None, dependencies: str = None,
             scalar_mix: ScalarMixWithDropoutBuilder = None, use_raw_hidden_states=False,
             lr=1e-3, separate_optimizer=False, cls_is_bos=True, sep_is_eos=True,
             delimiter=None, max_seq_len=None, sent_delimiter=None, char_level=False,
             hard_constraint=False, transform=None, tagging_scheme='BMES', crf=False,
             token_key='token',
             dict_force: Union[DictInterface, Dict[str, Any], Set[str]] = None,
             dict_combine: Union[DictInterface, Dict[str, Any], Set[str]] = None,
             **kwargs) -> None:
    """Tokenization which casts a chunking problem into a tagging problem. This task has
    to create batches of tokens containing both [CLS] and [SEP] since it's usually the
    first task and later tasks might need them.

    Args:
        trn: Path to training set.
        dev: Path to dev set.
        tst: Path to test set.
        sampler_builder: A builder which builds a sampler.
        dependencies: Its dependencies on other tasks.
        scalar_mix: A builder which builds a `ScalarMixWithDropout` object.
        use_raw_hidden_states: Whether to use raw hidden states from transformer without
            any pooling.
        lr: Learning rate for this task.
        separate_optimizer: Use customized separate optimizer for this task.
        cls_is_bos: ``True`` to treat the first token as ``BOS``.
        sep_is_eos: ``True`` to treat the last token as ``EOS``.
        delimiter: Delimiter used to split a line in the corpus.
        max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones
            if possible.
        sent_delimiter: Delimiter between sentences, like period or comma, which indicates
            a long sentence can be split here.
        char_level: Whether the sequence length is measured at char level.
        hard_constraint: Whether to enforce hard length constraint on sentences. If there
            is no ``sent_delimiter`` in a sentence, it will be split at a token anyway.
        transform: An optional transform to be applied to samples. Usually a character
            normalization transform is passed in.
        tagging_scheme: Either ``BMES`` or ``BI``.
        crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
        token_key: The key to tokens in dataset. This should always be set to ``token``
            in MTL.
        dict_force: A custom dictionary that forces matched phrases to be segmented as
            given via longest-prefix-matching, overriding model predictions.
        dict_combine: A custom dictionary used to combine model-predicted tokens into
            dictionary phrases after decoding.
        **kwargs: Not used.
    """
    # Keep the dictionaries out of the saved config.
    super().__init__(**merge_locals_kwargs(locals(), kwargs, excludes=(
        'self', 'kwargs', '__class__', 'dict_force', 'dict_combine')))
    self.transform = transform
    self.vocabs = VocabDict()
    self.dict_force = dict_force
    self.dict_combine = dict_combine
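
# Sketch of the two dictionaries in action (class name and paths are assumptions).
# `dict_force` forces matched spans to be output as single tokens, overriding the
# model, while `dict_combine` merges model-predicted tokens into dictionary phrases.
tok = TaggingTokenization(trn='tok/trn.tsv', dev='tok/dev.tsv',
                          dict_force={'自然语言处理'},
                          dict_combine={'New York'})
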
def fit(self, trn_data, dev_data, save_dir, transformer, average_subwords=False,
        word_dropout: float = 0.2, hidden_dropout=None, layer_dropout=0, scalar_mix=None,
        grad_norm=5.0, transformer_grad_norm=None, lr=5e-5, transformer_lr=None,
        transformer_layers=None, gradient_accumulation=1, adam_epsilon=1e-8, weight_decay=0,
        warmup_steps=0.1, crf=False, reduction='sum', batch_size=32,
        sampler_builder: SamplerBuilder = None, epochs=30, patience=5, token_key=None,
        tagging_scheme='BMES', delimiter=None, max_seq_len=None, sent_delimiter=None,
        char_level=False, hard_constraint=False, transform=None, logger=None,
        devices: Union[float, int, List[int]] = None, **kwargs):
    """
    Args:
        trn_data: Training set.
        dev_data: Development set.
        save_dir: The directory to save trained component.
        transformer: An identifier of a pre-trained transformer.
        average_subwords: ``True`` to average subword representations.
        word_dropout: Dropout rate to randomly replace a subword with MASK.
        hidden_dropout: Dropout rate applied to hidden states.
        layer_dropout: Randomly zero out hidden states of a transformer layer.
        scalar_mix: Layer attention.
        grad_norm: Gradient norm for clipping.
        transformer_grad_norm: Gradient norm for clipping transformer gradient.
        lr: Learning rate for decoder.
        transformer_lr: Learning rate for encoder.
        transformer_layers: The number of bottom layers to use.
        gradient_accumulation: Number of batches per update.
        adam_epsilon: The epsilon to use in Adam.
        weight_decay: The weight decay to use.
        warmup_steps: The number of warmup steps.
        crf: ``True`` to enable CRF (:cite:`lafferty2001conditional`).
        reduction: The loss reduction used in aggregating losses.
        batch_size: The number of samples in a batch.
        sampler_builder: The builder to build a sampler, which overrides ``batch_size``.
        epochs: The number of epochs to train.
        patience: The number of patience epochs before early stopping.
        token_key: The key to tokens in dataset.
        tagging_scheme: Either ``BMES`` or ``BI``.
        delimiter: Delimiter between tokens used to split a line in the corpus.
        max_seq_len: Sentences longer than ``max_seq_len`` will be split into shorter ones
            if possible.
        sent_delimiter: Delimiter between sentences, like period or comma, which indicates
            a long sentence can be split here.
        char_level: Whether the sequence length is measured at char level.
        hard_constraint: Whether to enforce hard length constraint on sentences. If there
            is no ``sent_delimiter`` in a sentence, it will be split at a token anyway.
        transform: An optional transform to be applied to samples. Usually a character
            normalization transform is passed in.
        devices: Devices this component will live on.
        logger: Any :class:`logging.Logger` instance.
        seed: Random seed to reproduce this training.
        **kwargs: Not used.

    Returns:
        Best metrics on dev set.
    """
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
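
# A minimal call of the `fit` above (component name, corpus paths and transformer
# choice are assumptions), training a BMES tokenizer with CRF decoding:
tokenizer = TransformerTaggingTokenizer()
tokenizer.fit(trn_data='data/tok/trn.txt', dev_data='data/tok/dev.txt',
              save_dir='ckpt/tok', transformer='bert-base-chinese',
              tagging_scheme='BMES', crf=True, epochs=30)
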
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, use_char=False,
             **kwargs) -> None:
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.word_vocab: Optional[VocabTF] = None
    self.tag_vocab: Optional[VocabTF] = None
    self.char_vocab: Optional[VocabTF] = None
def fit(self, trn_data, dev_data, save_dir, feat=None, n_embed=100, pretrained_embed=None,
        transformer=None, average_subwords=False, word_dropout: float = 0.2,
        transformer_hidden_dropout=None, layer_dropout=0, mix_embedding: int = None,
        embed_dropout=.33, n_lstm_hidden=400, n_lstm_layers=3, hidden_dropout=.33,
        n_mlp_arc=500, n_mlp_rel=100, mlp_dropout=.33, arc_dropout=None, rel_dropout=None,
        arc_loss_interpolation=0.4, lr=2e-3, transformer_lr=5e-5, mu=.9, nu=.9,
        epsilon=1e-12, clip=5.0, decay=.75, decay_steps=5000, weight_decay=0,
        warmup_steps=0.1, separate_optimizer=True, patience=100, batch_size=None,
        sampler_builder=None, lowercase=False, epochs=50000, apply_constraint=False,
        single_root=None, no_zero_head=None, punct=False, min_freq=2, logger=None,
        verbose=True, unk=UNK, pad_rel=None, max_sequence_length=512,
        gradient_accumulation=1, devices: Union[float, int, List[int]] = None,
        transform=None, **kwargs):
    return super().fit(**merge_locals_kwargs(locals(), kwargs))
def __init__(self, config: SerializableDict = None, map_x=True, map_y=True, lower=False,
             **kwargs) -> None:
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.token_vocab = VocabTF()
    self.pos_vocab = VocabTF(pad_token=None, unk_token=None)
    self.ner_vocab = VocabTF(pad_token=None)
    self.deprel_vocab = VocabTF(pad_token=None, unk_token=None)
    self.rel_vocab = VocabTF(pad_token=None, unk_token=None)
def distill(self, teacher: str, trn_data, dev_data, save_dir, transformer: str,
            batch_size=None, temperature_scheduler='flsw', epochs=None, devices=None,
            logger=None, seed=None, **kwargs):
    return super().distill(**merge_locals_kwargs(locals(), kwargs))