Example 1
 def build_model(self,
                 transformer: ContextualWordEmbedding,
                 n_mlp_arc,
                 n_mlp_rel,
                 mlp_dropout,
                 mix_embedding,
                 layer_dropout,
                 training=True,
                 **kwargs) -> torch.nn.Module:
     # Scalar mixing on the embedding and on the model must be enabled (or disabled) together.
     assert bool(transformer.scalar_mix) == bool(mix_embedding), \
         'transformer.scalar_mix must be set if and only if mix_embedding is non-zero.'
     # noinspection PyTypeChecker
     return UniversalDependenciesModel(
         transformer.module(training=training), n_mlp_arc, n_mlp_rel,
         mlp_dropout, len(self.vocabs.rel), len(self.vocabs.lemma),
         len(self.vocabs.pos), len(self.vocabs.feat), mix_embedding,
         layer_dropout)
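For context, `mix_embedding` enables a learned scalar mix over the transformer's hidden layers instead of taking only the top layer, which is why it must agree with `transformer.scalar_mix` above. A minimal sketch of such a mix (illustrative only, not HanLP's actual implementation; the class name and shapes are assumptions):

 import torch

 class ScalarMixSketch(torch.nn.Module):
     """Learned softmax-weighted average of per-layer hidden states."""

     def __init__(self, n_layers: int):
         super().__init__()
         self.weights = torch.nn.Parameter(torch.zeros(n_layers))  # one weight per layer
         self.gamma = torch.nn.Parameter(torch.ones(1))  # global scale

     def forward(self, layers):
         # layers: list of [batch, seq, hidden] tensors, one per transformer layer
         norm = torch.softmax(self.weights, dim=0)
         return self.gamma * sum(w * h for w, h in zip(norm, layers))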
Example 2
 def build_dataloader(self,
                      data,
                      batch_size,
                      shuffle=False,
                      device=None,
                      logger: logging.Logger = None,
                      sampler_builder=None,
                      gradient_accumulation=1,
                      transformer: ContextualWordEmbedding = None,
                      **kwargs) -> DataLoader:
     # Per-sample transforms: lemma rules, a BOS token, vocab lookup,
     # transformer subword encoding, and token-length bookkeeping.
     transform = [
         generate_lemma_rule, append_bos, self.vocabs,
         transformer.transform(),
         FieldLength('token')
     ]
     if not self.config.punct:
         transform.append(PunctuationMask('token', 'punct_mask'))
     dataset = self.build_dataset(data, transform)
     if self.vocabs.mutable:
         # Vocabularies are built on the first (training) pass only.
         # noinspection PyTypeChecker
         self.build_vocabs(dataset, logger)
     # Sample batches by subword sequence length so padding is minimized.
     lens = [len(x['token_input_ids']) for x in dataset]
     if sampler_builder:
         sampler = sampler_builder.build(lens, shuffle,
                                         gradient_accumulation)
     else:
         sampler = SortingSamplerBuilder(batch_size).build(
             lens, shuffle, gradient_accumulation)
     return PadSequenceDataLoader(
         dataset,
         batch_size,
         shuffle,
         device=device,
         batch_sampler=sampler,
         pad={'arc': 0},
     )
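The default `SortingSamplerBuilder` groups samples of similar length into the same batch to minimize padding, and `PadSequenceDataLoader` then pads each field to the batch maximum (here pinning the pad value of `arc` to 0). A minimal sketch of the length-sorting idea (illustrative only; the real builder also handles token budgets and gradient accumulation):

 import random

 def sorted_batches(lengths, batch_size, shuffle=False):
     """Yield index batches of similar-length samples to reduce padding waste."""
     order = sorted(range(len(lengths)), key=lambda i: lengths[i])
     batches = [order[i:i + batch_size] for i in range(0, len(order), batch_size)]
     if shuffle:
         # Shuffle the order of batches, not their contents,
         # so lengths stay similar within each batch.
         random.shuffle(batches)
     return batches

 # sorted_batches([5, 12, 7, 3, 9, 11], batch_size=2) -> [[3, 0], [2, 4], [5, 1]]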
Example 3
    # ... earlier entries of the `tasks` dict (truncated in this excerpt) ...
    CRFConstituencyParsing(
        CTB8_BRACKET_LINE_NOEC_TRAIN,
        CTB8_BRACKET_LINE_NOEC_DEV,
        CTB8_BRACKET_LINE_NOEC_TEST,
        SortingSamplerBuilder(batch_size=32),
        lr=1e-3,
        dependencies='tok',  # this task consumes the tokenization task's output
    )
}
mtl = MultiTaskLearning()
save_dir = 'data/model/mtl/open_tok_pos_ner_srl_dep_sdp_con_electra_small'
cprint(f'Model will be saved in [cyan]{save_dir}[/cyan]')
mtl.fit(
    ContextualWordEmbedding('token',
                            "hfl/chinese-electra-180g-small-discriminator",
                            average_subwords=True,
                            max_sequence_length=512,
                            word_dropout=.1),
    tasks,
    save_dir,
    30,  # epochs
    lr=1e-3,  # learning rate for the task-specific decoders
    encoder_lr=5e-5,  # smaller learning rate for the pretrained encoder
    grad_norm=1,
    gradient_accumulation=1,
    eval_trn=False,
)
cprint(f'Model saved in [cyan]{save_dir}[/cyan]')
mtl.evaluate(save_dir)
mtl.load(save_dir)
mtl('华纳音乐旗下的新垣结衣在12月21日于日本武道馆举办歌手出道活动').pretty_print()
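Besides `pretty_print()`, `mtl(...)` returns a `Document`, a `dict` subclass keyed by task name, so individual annotations can be read directly. A brief usage sketch (the exact keys depend on the names used in the `tasks` dict and are assumptions here):

 doc = mtl('华纳音乐旗下的新垣结衣在12月21日于日本武道馆举办歌手出道活动')
 print(doc['tok'])  # tokens, assuming the tokenization task is registered as 'tok'
 print(doc['ner'])  # named entities, assuming the NER task is registered as 'ner'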