def build_embeddings(opt, text_field, for_encoder=True):
    """
    Args:
        opt: the options from the current environment.
        text_field (TextMultiField): word and feats field.
        for_encoder (bool): whether to build Embeddings for the encoder
            or the decoder.
    """
    emb_dim = opt.src_word_vec_size if for_encoder else opt.tgt_word_vec_size

    if opt.model_type == "vec" and for_encoder:
        return VecEmbedding(
            opt.feat_vec_size,
            emb_dim,
            position_encoding=opt.position_encoding,
            dropout=(opt.dropout[0] if type(opt.dropout) is list
                     else opt.dropout),
        )

    pad_indices = [f.vocab.stoi[f.pad_token] for _, f in text_field]
    word_padding_idx, feat_pad_indices = pad_indices[0], pad_indices[1:]

    num_embs = [len(f.vocab) for _, f in text_field]
    num_word_embeddings, num_feat_embeddings = num_embs[0], num_embs[1:]

    fix_word_vecs = opt.fix_word_vecs_enc if for_encoder \
        else opt.fix_word_vecs_dec

    # A seg_token_id of None means segment embeddings are disabled.
    if opt.segment_embedding and for_encoder:
        seg_token_id = opt.seg_token_id
    else:
        seg_token_id = None

    # wei 20200723
    if opt.flat_layers > 0 and for_encoder:
        flat_layer_flag = opt.flat_layers
    else:
        flat_layer_flag = -1
    # end wei

    emb = Embeddings(
        word_vec_size=emb_dim,
        position_encoding=opt.position_encoding,
        seg_token_id=seg_token_id,
        # wei 20200723
        flat_layer_flag=flat_layer_flag,
        # end wei
        feat_merge=opt.feat_merge,
        feat_vec_exponent=opt.feat_vec_exponent,
        feat_vec_size=opt.feat_vec_size,
        dropout=opt.dropout[0] if type(opt.dropout) is list else opt.dropout,
        word_padding_idx=word_padding_idx,
        feat_padding_idx=feat_pad_indices,
        word_vocab_size=num_word_embeddings,
        feat_vocab_sizes=num_feat_embeddings,
        sparse=opt.optim == "sparseadam",
        fix_word_vecs=fix_word_vecs)
    return emb
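# A minimal usage sketch for the variant above, assuming this fork's
# Embeddings accepts the seg_token_id/flat_layer_flag keywords it is called
# with. The `opt` values and the field/vocab stand-ins below are illustrative
# inventions; in OpenNMT-py, `opt` comes from the argument parser and
# `text_field` is a torchtext TextMultiField iterated as (name, field) pairs,
# word field first.
from argparse import Namespace


class _FakeVocab:
    def __init__(self, itos):
        self.itos = itos
        self.stoi = {tok: i for i, tok in enumerate(itos)}

    def __len__(self):
        return len(self.itos)


class _FakeField:
    """Mimics the only interface build_embeddings touches:
    .pad_token and .vocab (with .stoi and len())."""
    def __init__(self, tokens, pad_token="<blank>"):
        self.pad_token = pad_token
        self.vocab = _FakeVocab([pad_token] + tokens)


text_field = [("src", _FakeField(["hello", "world"]))]
opt = Namespace(
    src_word_vec_size=512, tgt_word_vec_size=512, model_type="text",
    position_encoding=True, segment_embedding=False, seg_token_id=None,
    flat_layers=0,  # <= 0 disables the flat-layer flag
    feat_merge="concat", feat_vec_exponent=0.7, feat_vec_size=-1,
    dropout=[0.3],  # a list here exercises the opt.dropout[0] branch
    fix_word_vecs_enc=False, fix_word_vecs_dec=False, optim="adam")

enc_emb = build_embeddings(opt, text_field, for_encoder=True)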
def build_embeddings(opt, text_field, for_encoder=True, aux_field=None):
    """
    Args:
        opt: the options from the current environment.
        text_field (TextMultiField): word and feats field.
        for_encoder (bool): whether to build Embeddings for the encoder
            or the decoder.
        aux_field (TextMultiField): auxiliary field providing the second
            vocabulary when ``opt.crosslingual`` is set.
    """
    emb_dim = opt.src_word_vec_size if for_encoder else opt.tgt_word_vec_size

    if opt.model_type == "vec" and for_encoder:
        return VecEmbedding(
            opt.feat_vec_size,
            emb_dim,
            position_encoding=opt.position_encoding,
            dropout=(opt.dropout[0] if type(opt.dropout) is list
                     else opt.dropout),
        )

    pad_indices = [f.vocab.stoi[f.pad_token] for _, f in text_field]
    word_padding_idx, feat_pad_indices = pad_indices[0], pad_indices[1:]

    def get_num_embs(field):
        num_embs = [len(f.vocab) for _, f in field]
        num_word_embeddings, num_feat_embeddings = num_embs[0], num_embs[1:]
        return num_word_embeddings, num_feat_embeddings

    num_word_embeddings, num_feat_embeddings = get_num_embs(text_field)

    fix_word_vecs = opt.fix_word_vecs_enc if for_encoder \
        else opt.fix_word_vecs_dec

    if opt.crosslingual:
        cls = XEmbeddings
        word_vec_size = [emb_dim, emb_dim]
        aux_num_word_embeddings, _ = get_num_embs(aux_field)
        word_vocab_size = [num_word_embeddings, aux_num_word_embeddings]
    else:
        cls = Embeddings
        word_vec_size = emb_dim
        word_vocab_size = num_word_embeddings

    emb = cls(
        word_vec_size,
        word_vocab_size,
        word_padding_idx,
        position_encoding=opt.position_encoding,
        feat_merge=opt.feat_merge,
        feat_vec_exponent=opt.feat_vec_exponent,
        feat_vec_size=opt.feat_vec_size,
        dropout=opt.dropout[0] if type(opt.dropout) is list else opt.dropout,
        feat_padding_idx=feat_pad_indices,
        feat_vocab_sizes=num_feat_embeddings,
        sparse=opt.optim == "sparseadam",
        fix_word_vecs=fix_word_vecs)
    return emb
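# A minimal sketch of the crosslingual path above. XEmbeddings is
# fork-specific; it is assumed (not verified) to mirror Embeddings while
# taking paired sizes positionally. _FakeField is the illustrative helper
# defined after the first variant; all option values here are invented.
from argparse import Namespace

text_field = [("src", _FakeField(["hello", "world"]))]
aux_field = [("src_aux", _FakeField(["bonjour", "monde"]))]
opt = Namespace(
    src_word_vec_size=512, tgt_word_vec_size=512, model_type="text",
    crosslingual=True, position_encoding=True,
    feat_merge="concat", feat_vec_exponent=0.7, feat_vec_size=-1,
    dropout=0.3, fix_word_vecs_enc=False, fix_word_vecs_dec=False,
    optim="adam")

# With crosslingual=True the first three positional arguments resolve to
# word_vec_size=[512, 512], word_vocab_size=[3, 3], word_padding_idx=0.
enc_emb = build_embeddings(opt, text_field, for_encoder=True,
                           aux_field=aux_field)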
def build_embeddings(opt, text_field, for_encoder=True):
    """
    Args:
        opt: the options from the current environment.
        text_field (TextMultiField): word and feats field.
        for_encoder (bool): whether to build Embeddings for the encoder
            or the decoder.
    """
    if opt.is_bert:
        token_fields_vocab = text_field.base_field.vocab
        vocab_size = len(token_fields_vocab)
        emb_dim = opt.word_vec_size
        return BertEmbeddings(
            vocab_size,
            emb_dim,
            dropout=(opt.dropout[0] if type(opt.dropout) is list
                     else opt.dropout))

    emb_dim = opt.src_word_vec_size if for_encoder else opt.tgt_word_vec_size

    if opt.model_type == "vec" and for_encoder:
        return VecEmbedding(
            opt.feat_vec_size,
            emb_dim,
            position_encoding=opt.position_encoding,
            dropout=(opt.dropout[0] if type(opt.dropout) is list
                     else opt.dropout),
        )

    pad_indices = [f.vocab.stoi[f.pad_token] for _, f in text_field]
    word_padding_idx, feat_pad_indices = pad_indices[0], pad_indices[1:]

    num_embs = [len(f.vocab) for _, f in text_field]
    num_word_embeddings, num_feat_embeddings = num_embs[0], num_embs[1:]

    fix_word_vecs = opt.fix_word_vecs_enc if for_encoder \
        else opt.fix_word_vecs_dec

    emb = Embeddings(
        word_vec_size=emb_dim,
        position_encoding=opt.position_encoding,
        feat_merge=opt.feat_merge,
        feat_vec_exponent=opt.feat_vec_exponent,
        feat_vec_size=opt.feat_vec_size,
        dropout=opt.dropout[0] if type(opt.dropout) is list else opt.dropout,
        word_padding_idx=word_padding_idx,
        feat_padding_idx=feat_pad_indices,
        word_vocab_size=num_word_embeddings,
        feat_vocab_sizes=num_feat_embeddings,
        sparse=opt.optim == "sparseadam",
        fix_word_vecs=fix_word_vecs)
    return emb
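# A minimal sketch of the BERT path above. BertEmbeddings is fork-specific
# and assumed to take (vocab_size, emb_dim, dropout=...). The branch returns
# early, reading only opt.is_bert, opt.word_vec_size, opt.dropout and the
# base word field's vocab; feature fields and for_encoder are ignored.
# _FakeField is the illustrative helper defined after the first variant.
from argparse import Namespace


class _FakeMultiField:
    """Stand-in exposing just .base_field, all the BERT branch uses."""
    def __init__(self, base_field):
        self.base_field = base_field


opt = Namespace(is_bert=True, word_vec_size=768, dropout=0.1)
text_field = _FakeMultiField(_FakeField(["hello", "world"]))
bert_emb = build_embeddings(opt, text_field)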
def build_embeddings(opt, text_field, for_encoder=True):
    """
    Args:
        opt: the options from the current environment.
        text_field (TextMultiField): word and feats field.
        for_encoder (bool): whether to build Embeddings for the encoder
            or the decoder.
    """
    emb_dim = opt.src_word_vec_size if for_encoder else opt.tgt_word_vec_size

    if opt.model_type == "vec" and for_encoder:
        return VecEmbedding(
            opt.feat_vec_size,
            emb_dim,
            position_encoding=opt.position_encoding,
            dropout=(opt.dropout[0] if type(opt.dropout) is list
                     else opt.dropout),
        )

    pad_indices = [f.vocab.stoi[f.pad_token] for _, f in text_field]
    word_padding_idx, feat_pad_indices = pad_indices[0], pad_indices[1:]

    num_embs = [len(f.vocab) for _, f in text_field]
    num_word_embeddings, num_feat_embeddings = num_embs[0], num_embs[1:]

    fix_word_vecs = opt.fix_word_vecs_enc if for_encoder \
        else opt.fix_word_vecs_dec

    # Continuous-output decoders tie the embeddings to the generator and
    # need the dimensionality of the pretrained target vectors.
    conmt = False
    out_vec_size = None
    if ("continuous" in opt.generator_function) and not for_encoder:
        out_vec_size = text_field.base_field.vocab.vectors.size(1)
        conmt = True

    emb = Embeddings(
        word_vec_size=emb_dim,
        position_encoding=opt.position_encoding,
        feat_merge=opt.feat_merge,
        feat_vec_exponent=opt.feat_vec_exponent,
        feat_vec_size=opt.feat_vec_size,
        dropout=opt.dropout[0] if type(opt.dropout) is list else opt.dropout,
        word_padding_idx=word_padding_idx,
        feat_padding_idx=feat_pad_indices,
        word_vocab_size=num_word_embeddings,
        feat_vocab_sizes=num_feat_embeddings,
        sparse=opt.optim == "sparseadam",
        fix_word_vecs=fix_word_vecs,
        tie_embeddings=opt.share_decoder_embeddings and conmt,
        out_vec_size=out_vec_size)
    return emb
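# A minimal sketch of the continuous-output path above. It assumes this
# fork's Embeddings accepts tie_embeddings/out_vec_size, and that
# text_field.base_field.vocab.vectors already holds pretrained target
# embeddings (random here purely for illustration). _FakeField is the
# illustrative helper defined after the first variant.
import torch
from argparse import Namespace


class _FakeTextMultiField:
    """Stand-in supporting both uses in this variant: iteration over
    (name, field) pairs and .base_field access."""
    def __init__(self, fields):
        self._fields = fields
        self.base_field = fields[0][1]

    def __iter__(self):
        return iter(self._fields)


tgt_word_field = _FakeField(["hello", "world"])
tgt_word_field.vocab.vectors = torch.randn(len(tgt_word_field.vocab), 300)
tgt_field = _FakeTextMultiField([("tgt", tgt_word_field)])

opt = Namespace(
    src_word_vec_size=300, tgt_word_vec_size=300, model_type="text",
    generator_function="continuous-linear",  # any value containing "continuous"
    position_encoding=True, feat_merge="concat", feat_vec_exponent=0.7,
    feat_vec_size=-1, dropout=0.3, fix_word_vecs_enc=False,
    fix_word_vecs_dec=False, optim="adam", share_decoder_embeddings=True)

dec_emb = build_embeddings(opt, tgt_field, for_encoder=False)
# out_vec_size resolves to 300 and tie_embeddings to True here.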