    def test_same_vector3(self):
        # Verify lower=True: cased variants should share the lowercased word's vector
        word_lst = ["The", "the"]
        no_create_word_lst = ['of', 'Of', 'With', 'with']
        vocab = Vocabulary().add_word_lst(word_lst)
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=True)
        words = torch.LongTensor(
            [[vocab.to_index(word) for word in word_lst + no_create_word_lst]])
        words = embed(words)

        lowered_word_lst = [word.lower() for word in word_lst]
        lowered_no_create_word_lst = [
            word.lower() for word in no_create_word_lst
        ]
        lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
        lowered_vocab.add_word_lst(lowered_no_create_word_lst,
                                   no_create_entry=True)
        lowered_embed = StaticEmbedding(lowered_vocab,
                                        model_dir_or_name='en-glove-6B-100d',
                                        lower=False)
        lowered_words = torch.LongTensor([[
            lowered_vocab.to_index(word)
            for word in lowered_word_lst + lowered_no_create_word_lst
        ]])
        lowered_words = lowered_embed(lowered_words)

        all_words = word_lst + no_create_word_lst

        for idx, (word_i, word_j) in enumerate(zip(words[0],
                                                   lowered_words[0])):
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
Example #2
def get_data():
    data_bundle = ChineseNERLoader().process('MSRA-NER/', bigrams=True)
    char_embed = StaticEmbedding(data_bundle.vocabs['chars'],
                                 model_dir_or_name='cn-char')
    bigram_embed = StaticEmbedding(data_bundle.vocabs['bigrams'],
                                   model_dir_or_name='cn-bigram')
    return data_bundle, char_embed, bigram_embed
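# Hypothetical usage of get_data() (assumes the 'MSRA-NER/' data directory is
# available locally):
#
#     data_bundle, char_embed, bigram_embed = get_data()
#     print(data_bundle)                                    # dataset/vocab summary
#     print(char_embed.embed_size, bigram_embed.embed_size)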
    def test_same_vector4(self):
        # Verify lower=True in combination with min_freq
        word_lst = ["The", "the", "the", "The", "a", "A"]
        no_create_word_lst = ['of', 'Of', "Of", "of", 'With', 'with']
        all_words = word_lst[:-2] + no_create_word_lst[:-2]
        vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=True)
        words = torch.LongTensor([[vocab.to_index(word)
                                   for word in all_words]])
        words = embed(words)

        lowered_word_lst = [word.lower() for word in word_lst]
        lowered_no_create_word_lst = [
            word.lower() for word in no_create_word_lst
        ]
        lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst)
        lowered_vocab.add_word_lst(lowered_no_create_word_lst,
                                   no_create_entry=True)
        lowered_embed = StaticEmbedding(lowered_vocab,
                                        model_dir_or_name='en-glove-6B-100d',
                                        lower=False)
        lowered_words = torch.LongTensor(
            [[lowered_vocab.to_index(word.lower()) for word in all_words]])
        lowered_words = lowered_embed(lowered_words)

        for idx in range(len(all_words)):
            word_i, word_j = words[0, idx], lowered_words[0, idx]
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size)
    def test_same_vector5(self):
        # Check that vectors match whether min_freq is applied in StaticEmbedding or in the Vocabulary
        word_lst = ["they", "the", "they", "the", 'he', 'he', "a", "A"]
        no_create_word_lst = ['of', "of", "she", "she", 'With', 'with']
        all_words = word_lst[:-2] + no_create_word_lst[:-2]
        vocab = Vocabulary().add_word_lst(word_lst)
        vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        embed = StaticEmbedding(vocab,
                                model_dir_or_name='en-glove-6B-100d',
                                lower=False,
                                min_freq=2)
        words = torch.LongTensor([[vocab.to_index(word)
                                   for word in all_words]])
        words = embed(words)

        min_freq_vocab = Vocabulary(min_freq=2).add_word_lst(word_lst)
        min_freq_vocab.add_word_lst(no_create_word_lst, no_create_entry=True)
        min_freq_embed = StaticEmbedding(min_freq_vocab,
                                         model_dir_or_name='en-glove-6B-100d',
                                         lower=False)
        min_freq_words = torch.LongTensor(
            [[min_freq_vocab.to_index(word.lower()) for word in all_words]])
        min_freq_words = min_freq_embed(min_freq_words)

        for idx in range(len(all_words)):
            word_i, word_j = words[0, idx], min_freq_words[0, idx]
            with self.subTest(idx=idx, word=all_words[idx]):
                assert torch.sum(word_i == word_j).eq(
                    min_freq_embed.embed_size)
Example #5
def load_data():

    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    dict_save_path = os.path.join("data/{}/data.pth".format(dataset))
    context_dict, context_word2id, context_id2word = get_neighbor_for_vocab(
        data_bundle.get_vocab('chars').word2idx, glove_path, dict_save_path)

    train_feature_data, dev_feature_data, test_feature_data = build_instances(
        "data/{}".format(dataset), context_num, context_dict)

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)

    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)

    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)

    return data_bundle, embed, bi_embed, train_feature_data, dev_feature_data, test_feature_data, context_word2id, context_id2word
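# A minimal, self-contained illustration (not from the example above) of what
# StackEmbedding does here: it concatenates its component embeddings along the
# feature dimension, so embed_size is the sum of the parts. Random-init
# StaticEmbeddings stand in for the pretrained files.
def _stack_embedding_sketch():
    vocab = Vocabulary().add_word_lst("a tiny stack embedding check".split())
    uni = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50)
    extra = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)
    stacked = StackEmbedding([uni, extra], dropout=0, word_dropout=0.02)
    assert stacked.embed_size == 50 + 30
    assert stacked(torch.LongTensor([[2, 3, 4]])).size(-1) == stacked.embed_size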
Example #6
def prepare_data():
    data_bundle = CWSShiftRelayPipe(dataset_name=data_name, L=L).process_from_file()
    # Pretrained character and bigram embeddings
    char_embed = StaticEmbedding(data_bundle.get_vocab('chars'), dropout=0.5, word_dropout=0.01,
                                 model_dir_or_name='~/exps/CWS/pretrain/vectors/1grams_t3_m50_corpus.txt')
    bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'), dropout=0.5, min_freq=3, word_dropout=0.01,
                                   model_dir_or_name='~/exps/CWS/pretrain/vectors/2grams_t3_m50_corpus.txt')

    return data_bundle, char_embed, bigram_embed
Example #7
def get_data():
    data_bundle = WeiboNERLoader().load()
    data_bundle = ChineseNERPipe(encoding_type='bioes',
                                 bigram=True).process(data_bundle)
    char_embed = StaticEmbedding(data_bundle.get_vocab(C.CHAR_INPUT),
                                 model_dir_or_name='cn-fasttext')
    bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'),
                                   embedding_dim=100,
                                   min_freq=3)
    return data_bundle, char_embed, bigram_embed
Example #8
def load_data():
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        "dev": 'data/{}/dev.txt'.format(dataset),
        "test": 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 1
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature = generate_knowledge_api(
        os.path.join("data", dataset), "all", args.feature_level)

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)

    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=False)

    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)

    return data_bundle, embed, bi_embed, train_feature_data, dev_feature_data, test_feature_data, feature2count, feature2id, id2feature
def load_ner_data():
    paths = {
        'train': 'data/{}/train.txt'.format(dataset),
        'dev': 'data/{}/dev.txt'.format(dataset),
        'test': 'data/{}/test.txt'.format(dataset)
    }
    min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)

    # train_list = data_bundle.get_dataset('train')['raw_chars']

    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=2,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)

    tencent_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='data/tencent_unigram.txt',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bert_embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'),
                               model_dir_or_name=args.bert_model,
                               layers='-1',
                               pool_method=args.pool_method,
                               word_dropout=0,
                               dropout=0.5,
                               include_cls_sep=False,
                               pooled_cls=True,
                               requires_grad=False,
                               auto_truncate=True)

    # embed = StackEmbedding([tencent_embed, bert_embed], dropout=0, word_dropout=0.02)
    embed = StackEmbedding([embed, tencent_embed, bert_embed],
                           dropout=0,
                           word_dropout=0.02)
    return data_bundle, embed, bi_embed
Example #10
def load_data():
    # Adjust the data paths below for your setup
    if dataset == 'ontonotes':
        paths = {
            'train': '../data/OntoNote4NER/train.char.bmes',
            "dev": '../data/OntoNote4NER/dev.char.bmes',
            "test": '../data/OntoNote4NER/test.char.bmes'
        }
        min_freq = 2
    elif dataset == 'weibo':
        paths = {
            'train': '../data/WeiboNER/train.all.bmes',
            'dev': '../data/WeiboNER/dev.all.bmes',
            'test': '../data/WeiboNER/test.all.bmes'
        }
        min_freq = 1
    elif dataset == 'resume':
        paths = {
            'train': '../data/ResumeNER/train.char.bmes',
            'dev': '../data/ResumeNER/dev.char.bmes',
            'test': '../data/ResumeNER/test.char.bmes'
        }
        min_freq = 1
    elif dataset == 'msra':
        paths = {
            'train': '../data/MSRANER/train_dev.char.bmes',
            'dev': '../data/MSRANER/test.char.bmes',
            'test': '../data/MSRANER/test.char.bmes'
        }
        min_freq = 2
    data_bundle = CNNERPipe(
        bigrams=True, encoding_type=encoding_type).process_from_file(paths)
    embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        model_dir_or_name='../data/gigaword_chn.all.a2b.uni.ite50.vec',
        min_freq=1,
        only_norm_found_vector=normalize_embed,
        word_dropout=0.01,
        dropout=0.3)

    bi_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        model_dir_or_name='../data/gigaword_chn.all.a2b.bi.ite50.vec',
        word_dropout=0.02,
        dropout=0.3,
        min_freq=min_freq,
        only_norm_found_vector=normalize_embed,
        only_train_min_freq=True)

    return data_bundle, embed, bi_embed
Example #11
def load_data():
    # paths = {'test': "../data/conll2003/test.txt",
    #          'train': "../data/conll2003/train.txt",
    #          'dev': "../data/conll2003/dev.txt"}
    paths = {'test': args.test, 'train': args.train, 'dev': args.dev}
    data = Conll2003NERPipe(
        encoding_type=encoding_type).process_from_file(paths)
    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'),
                                      embed_size=30,
                                      char_emb_size=30,
                                      filter_nums=[30],
                                      kernel_sizes=[3],
                                      word_dropout=0,
                                      dropout=0.3,
                                      pool_method='max',
                                      include_word_start_end=False,
                                      min_char_freq=2)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'),
                                       embed_size=30,
                                       char_emb_size=30,
                                       word_dropout=0,
                                       dropout=0.3,
                                       hidden_size=100,
                                       pool_method='max',
                                       activation='relu',
                                       min_char_freq=2,
                                       bidirectional=True,
                                       requires_grad=True,
                                       include_word_start_end=False)
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='en-glove-6b-100d',
                                 requires_grad=True,
                                 lower=True,
                                 word_dropout=0,
                                 dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    if char_embed is not None:
        embed = StackEmbedding([word_embed, char_embed],
                               dropout=0,
                               word_dropout=0.02)
    else:
        word_embed.word_drop = 0.02
        embed = word_embed

    data.rename_field('words', 'chars')
    return data, embed
Example #12
    def test_case(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        vocab.add_word_lst("Another test !".split())
        embed = StaticEmbedding(vocab, embedding_dim=10)

        encoder_output = torch.randn(2, 3, 10)
        src_seq_len = torch.LongTensor([3, 2])
        encoder_mask = seq_len_to_mask(src_seq_len)

        for flag in [True, False]:
            with self.subTest(bind_decoder_input_output_embed=flag):
                decoder = TransformerSeq2SeqDecoder(
                    embed=embed,
                    pos_embed=None,
                    d_model=10,
                    num_layers=2,
                    n_head=5,
                    dim_ff=20,
                    dropout=0.1,
                    bind_decoder_input_output_embed=flag)
                state = decoder.init_state(encoder_output, encoder_mask)
                output = decoder(tokens=torch.randint(0,
                                                      len(vocab),
                                                      size=(2, 4)),
                                 state=state)
                self.assertEqual(output.size(), (2, 4, len(vocab)))
Example #13
def load_conllized_ontonote_pkl(path,embedding_path=None):

    data_bundle = pickle.load(open(path,'rb'))
    train_set = data_bundle.datasets['train']
    dev_set = data_bundle.datasets['dev']
    test_set = data_bundle.datasets['test']

    train_set.rename_field('pos','posid')
    train_set.rename_field('ner','nerid')
    train_set.rename_field('chunk','chunkid')

    dev_set.rename_field('pos','posid')
    dev_set.rename_field('ner','nerid')
    dev_set.rename_field('chunk','chunkid')

    test_set.rename_field('pos','posid')
    test_set.rename_field('ner','nerid')
    test_set.rename_field('chunk','chunkid')


    word_vocab = data_bundle.vocabs['words']
    pos_vocab = data_bundle.vocabs['pos']
    ner_vocab = data_bundle.vocabs['ner']
    chunk_vocab = data_bundle.vocabs['chunk']


    if embedding_path is not None:

        embed = StaticEmbedding(vocab=word_vocab, model_dir_or_name=embedding_path,
                                word_dropout=0.01, dropout=0.5, lower=True)

        return (train_set,dev_set,test_set),\
               (word_vocab,pos_vocab,ner_vocab,chunk_vocab),embed
    else:
        return (train_set, dev_set, test_set), (word_vocab,ner_vocab)
Example #14
    def test_case(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        vocab.add_word_lst("Another test !".split())
        embed = StaticEmbedding(vocab,
                                model_dir_or_name=None,
                                embedding_dim=10)

        encoder_output = torch.randn(2, 3, 10)
        tgt_words_idx = torch.LongTensor([[1, 2, 3, 4], [2, 3, 0, 0]])
        src_seq_len = torch.LongTensor([3, 2])
        encoder_mask = seq_len_to_mask(src_seq_len)

        for flag in [True, False]:
            for attention in [True, False]:
                with self.subTest(bind_decoder_input_output_embed=flag,
                                  attention=attention):
                    decoder = LSTMSeq2SeqDecoder(
                        embed=embed,
                        num_layers=2,
                        hidden_size=10,
                        dropout=0.3,
                        bind_decoder_input_output_embed=flag,
                        attention=attention)
                    state = decoder.init_state(encoder_output, encoder_mask)
                    output = decoder(tgt_words_idx, state)
                    self.assertEqual(tuple(output.size()), (2, 4, len(vocab)))
Example #15
    def test_search(self):
        """Semantic search. TypeError: expected dimension <= 2 array or matrix
        """
        print('{} test_search {}'.format('-' * 15, '-' * 15))
        texts = ['温都尔站', '东乌广厦', '国电四郎', '阿尔善站', '朱日和基']
        # Vectorize the texts
        vocab = Vocabulary()
        for text in texts:
            vocab.add_word_lst(list(text))
        print(len(vocab))
        embed = StaticEmbedding(
            vocab, model_dir_or_name='./data/cn_char_fastnlp_100d.txt')
        texts_to_id = [[vocab.to_index(word) for word in list(text)]
                       for text in texts]
        words = torch.LongTensor(texts_to_id)  # convert the texts to indices
        features_vec = embed(words)
        print(features_vec.shape)
        # Build the search index (`ci` is assumed to be pysparnn, i.e.
        # `import pysparnn.cluster_index as ci`)
        cp = ci.MultiClusterIndex(features_vec.detach().numpy(), texts)
        search_texts = ['朱日和站', '温都尔站', '国电站']
        for text in search_texts:
            texts_to_id = [[vocab.to_index(word) for word in list(text)]]
            words = torch.LongTensor(texts_to_id)  # convert the text to indices
            features_vec = embed(words)
            search_features_vec = features_vec.detach().numpy()
            search_result = cp.search(search_features_vec,
                                      k=2,
                                      k_clusters=2,
                                      return_distance=True)
            print('text:{}'.format(text))
            print('search_result:{}'.format(search_result))
Example #16
    def __init__(self,
                 vocab,
                 hidden_size,
                 num_layers,
                 n_class_per_task,
                 dropout=0.5):
        super().__init__()
        # word_embed = nn.Embedding(len(vocab), 50)
        word_embed = StaticEmbedding(vocab=vocab,
                                     embedding_dim=50,
                                     word_dropout=0,
                                     dropout=dropout,
                                     lower=True)
        self.word_embed = word_embed
        emb_dim = self.word_embed.embedding_dim
        self.lstm = fastNLP.modules.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )
        self.out = nn.ModuleList(
            [nn.Linear(hidden_size * 2, i) for i in n_class_per_task])
        self.dropout = nn.Dropout(dropout)
        self.loss = nn.CrossEntropyLoss()

        for name, param in self.named_parameters():
            if "out" in name:
                if param.data.dim() > 1:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.constant_(param, 0)
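    # A hedged sketch (not part of the original snippet) of how a forward pass
    # for this module could be wired, assuming `words` of shape (batch, seq_len):
    #
    #     def forward(self, words):
    #         x = self.word_embed(words)             # (batch, seq, 50)
    #         x, _ = self.lstm(x)                    # (batch, seq, 2 * hidden_size)
    #         x = self.dropout(x)
    #         return [head(x) for head in self.out]  # one logit tensor per task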
Example #17
def load_conllized_ontonote_NER(path, embedding_path=None):
    from fastNLP.io.pipe.conll import OntoNotesNERPipe
    ontoNotesNERPipe = OntoNotesNERPipe(lower=True, target_pad_val=-100)
    bundle_NER = ontoNotesNERPipe.process_from_file(path)

    train_set_NER = bundle_NER.datasets['train']
    dev_set_NER = bundle_NER.datasets['dev']
    test_set_NER = bundle_NER.datasets['test']

    train_set_NER.add_seq_len('words', 'seq_len')
    dev_set_NER.add_seq_len('words', 'seq_len')
    test_set_NER.add_seq_len('words', 'seq_len')

    NER_vocab = bundle_NER.get_vocab('target')
    word_vocab = bundle_NER.get_vocab('words')

    if embedding_path is not None:

        embed = StaticEmbedding(vocab=word_vocab,
                                model_dir_or_name=embedding_path,
                                word_dropout=0.01,
                                dropout=0.5,
                                lower=True)

        # pretrained_embedding = load_word_emb(embedding_path, 300, word_vocab)
        return (train_set_NER,dev_set_NER,test_set_NER),\
               (word_vocab,NER_vocab),embed
    else:
        return (train_set_NER, dev_set_NER, test_set_NER), (NER_vocab,
                                                            word_vocab)
Example #18
    def __init__(self, vocab, d_model):
        super().__init__()
        self.emb = StaticEmbedding(vocab,
                                   model_dir_or_name="en-glove-840b-300d")
        self.emb_ln = nn.Linear(300, d_model)

        self.reset_params()
Example #19
    def test_same_vector(self):
        vocab = Vocabulary().add_word_lst(["The", "the", "THE"])
        embed = StaticEmbedding(vocab, model_dir_or_name=None,
                                embedding_dim=5, lower=True)
        words = torch.LongTensor(
            [[vocab.to_index(word) for word in ["The", "the", "THE"]]])
        words = embed(words)
        embed_0 = words[0, 0]
        for i in range(1, words.size(1)):
            assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))

    def test_case(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        embed = StaticEmbedding(vocab, embedding_dim=5)
        encoder = TransformerSeq2SeqEncoder(embed, num_layers=2, d_model=10,
                                            n_head=2)
        words_idx = torch.LongTensor([0, 1, 2]).unsqueeze(0)
        seq_len = torch.LongTensor([3])
        encoder_output, encoder_mask = encoder(words_idx, seq_len)
        self.assertEqual(encoder_output.size(), (1, 3, 10))
Example #21
def get_data():
    data_bundle = CWSPipe(dataset_name=dataname, bigrams=True,
                          trigrams=False).process_from_file()
    char_embed = StaticEmbedding(
        data_bundle.get_vocab('chars'),
        dropout=0.33,
        word_dropout=0.01,
        model_dir_or_name='~/exps/CWS/pretrain/vectors/1grams_t3_m50_corpus.txt'
    )
    bigram_embed = StaticEmbedding(
        data_bundle.get_vocab('bigrams'),
        dropout=0.33,
        min_freq=3,
        word_dropout=0.01,
        model_dir_or_name='~/exps/CWS/pretrain/vectors/2grams_t3_m50_corpus.txt'
    )
    return data_bundle, char_embed, bigram_embed
    def test_norm1(self):
        # Test that only vectors found in the pretrained file are normalized
        vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile'])
        embed = StaticEmbedding(
            vocab,
            model_dir_or_name='test/data_for_tests/embedding/small_static_embedding/'
                              'glove.6B.50d_test.txt',
            only_norm_found_vector=True)
        self.assertEqual(
            round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1)
        self.assertNotEqual(torch.norm(embed(torch.LongTensor([[4]]))).item(),
                            1)
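        # Note, inferred from the assertions above: with
        # only_norm_found_vector=True, words present in the pretrained file
        # ('the', index 2 here) get unit-norm vectors, while words missing from
        # it ('notinfile', index 4) keep their random initialization, so their
        # norm is generally not 1.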
Example #23
def load_conll_2003_mine(path, embedding_path=None, pad_val=-100):
    with open(path, 'rb') as f:
        data_pkl = pickle.load(f)
    # print(data_pkl)
    train_set = data_pkl[0]['train']
    dev_set = data_pkl[0]['dev']
    test_set = data_pkl[0]['test']

    train_set.set_pad_val('posid', pad_val)
    train_set.set_pad_val('nerid', pad_val)
    train_set.set_pad_val('chunkid', pad_val)

    dev_set.set_pad_val('posid', pad_val)
    dev_set.set_pad_val('nerid', pad_val)
    dev_set.set_pad_val('chunkid', pad_val)

    test_set.set_pad_val('posid', pad_val)
    test_set.set_pad_val('nerid', pad_val)
    test_set.set_pad_val('chunkid', pad_val)

    if train_set.has_field('task_id'):

        train_set.delete_field('task_id')

    if dev_set.has_field('task_id'):
        dev_set.delete_field('task_id')

    if test_set.has_field('task_id'):
        test_set.delete_field('task_id')

    if train_set.has_field('words_idx'):
        train_set.rename_field('words_idx', 'words')

    if dev_set.has_field('words_idx'):
        dev_set.rename_field('words_idx', 'words')

    if test_set.has_field('words_idx'):
        test_set.rename_field('words_idx', 'words')

    word_vocab = data_pkl[1]['words']
    pos_vocab = data_pkl[1]['pos']
    ner_vocab = data_pkl[1]['ner']
    chunk_vocab = data_pkl[1]['chunk']

    if embedding_path is not None:
        embed = StaticEmbedding(vocab=word_vocab,
                                model_dir_or_name=embedding_path,
                                word_dropout=0.01,
                                dropout=0.5,
                                lower=True)
        return (train_set, dev_set, test_set), (word_vocab, pos_vocab,
                                                ner_vocab, chunk_vocab), embed
    else:
        return (train_set, dev_set, test_set), (word_vocab, pos_vocab,
                                                ner_vocab, chunk_vocab)
    def test_dropword(self):
        # Test that word dropout can be applied without errors
        vocab = Vocabulary().add_word_lst([chr(i) for i in range(1, 200)])
        embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=10,
                                dropout=0.1, word_dropout=0.4)
        for i in range(10):
            length = torch.randint(1, 50, (1,)).item()
            batch = torch.randint(1, 4, (1,)).item()
            words = torch.randint(1, 200, (batch, length)).long()
            embed(words)
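        # A small follow-up sketch (assumption: dropout and word_dropout are
        # only applied in training mode), checking that eval() makes the
        # embedding deterministic for a fixed input:
        #
        #     embed.eval()
        #     sample = torch.randint(1, 200, (2, 7)).long()
        #     assert torch.equal(embed(sample), embed(sample))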
Example #25
def load_data():
    print('loading data')
    data = OntoNotesNERPipe(encoding_type=encoding_type).process_from_file(
        paths=get_path('workdir/datasets/ontonotes-v4'))
    print('loading embedding')
    word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
                                 model_dir_or_name='en-glove-840b-300',
                                 requires_grad=True)
    return data, [word_embed]
    def test_case(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        embed = StaticEmbedding(vocab, embedding_dim=5)
        encoder = LSTMSeq2SeqEncoder(embed, hidden_size=5, num_layers=1)
        words_idx = torch.LongTensor([0, 1, 2]).unsqueeze(0)
        seq_len = torch.LongTensor([3])

        encoder_output, encoder_mask = encoder(words_idx, seq_len)
        self.assertEqual(encoder_mask.size(), (1, 3))
def load_data():
    if dataset == 'vlsp2016':
        paths = {'test': "./data_2/test.txt",
                 'train': "./data_2/train.txt",
                 'dev': "./data_2/dev.txt"}
        data = VLSP2016NERPipe(encoding_type=encoding_type).process_from_file(paths)
        # data.get_vocab('words').clear()
        vocab = []
        with open("vocab.txt", 'r') as files:
            for word in files:
                vocab.append(word.replace("\n", ""))
        data.get_vocab('words').add_word_lst(vocab)
    char_embed = None
    if char_type == 'cnn':
        char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30, filter_nums=[30],
                                      kernel_sizes=[3], word_dropout=0, dropout=0.3, pool_method='max',
                                      include_word_start_end=False, min_char_freq=2)
    elif char_type in ['adatrans', 'naive']:
        char_embed = TransformerCharEmbed(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30, word_dropout=0,
                 dropout=0.3, pool_method='max', activation='relu',
                 min_char_freq=2, requires_grad=True, include_word_start_end=False,
                 char_attn_type=char_type, char_n_head=3, char_dim_ffn=60, char_scale=char_type=='naive',
                 char_dropout=0.15, char_after_norm=True)
    elif char_type == 'bilstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30, word_dropout=0,
                 dropout=0.3, hidden_size=100, pool_method='max', activation='relu',
                 min_char_freq=2, bidirectional=True, requires_grad=True, include_word_start_end=False)
    elif char_type == 'lstm':
        char_embed = LSTMCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30, word_dropout=0,
                 dropout=0.3, hidden_size=100, pool_method='max', activation='relu',
                 min_char_freq=2, bidirectional=False, requires_grad=True, include_word_start_end=False)
    word_embed = StaticEmbedding(vocab=data.get_vocab('words'),
                                 model_dir_or_name='word2vec',
                                 requires_grad=True, lower=True, word_dropout=0, dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    if char_embed is not None:
        embed = StackEmbedding([word_embed, char_embed], dropout=0, word_dropout=0.02)
    else:
        word_embed.word_drop = 0.02
        embed = word_embed
    # print(data.get_dataset('train'))
    data__ = data.get_vocab('words')
    data.rename_field('words', 'chars')
    return data, embed, data__
Example #28
def load_conllized_ontonote_NER_POS(path, embedding_path=None):
    from fastNLP.io.pipe.conll import OntoNotesNERPipe
    ontoNotesNERPipe = OntoNotesNERPipe(lower=True)
    bundle_NER = ontoNotesNERPipe.process_from_file(path)

    train_set_NER = bundle_NER.datasets['train']
    dev_set_NER = bundle_NER.datasets['dev']
    test_set_NER = bundle_NER.datasets['test']

    NER_vocab = bundle_NER.get_vocab('target')
    word_vocab = bundle_NER.get_vocab('words')

    (train_set_POS, dev_set_POS,
     test_set_POS), (_, POS_vocab) = load_conllized_ontonote_POS(path)
    POS_vocab = POS_vocab['POS']

    train_set_NER.add_field('pos', train_set_POS['POS'], is_target=True)
    dev_set_NER.add_field('pos', dev_set_POS['POS'], is_target=True)
    test_set_NER.add_field('pos', test_set_POS['POS'], is_target=True)

    if train_set_NER.has_field('target'):
        train_set_NER.rename_field('target', 'ner')

    if dev_set_NER.has_field('target'):
        dev_set_NER.rename_field('target', 'ner')

    if test_set_NER.has_field('target'):
        test_set_NER.rename_field('target', 'ner')

    if train_set_NER.has_field('pos'):
        train_set_NER.rename_field('pos', 'posid')
    if dev_set_NER.has_field('pos'):
        dev_set_NER.rename_field('pos', 'posid')
    if test_set_NER.has_field('pos'):
        test_set_NER.rename_field('pos', 'posid')

    if train_set_NER.has_field('ner'):
        train_set_NER.rename_field('ner', 'nerid')
    if dev_set_NER.has_field('ner'):
        dev_set_NER.rename_field('ner', 'nerid')
    if test_set_NER.has_field('ner'):
        test_set_NER.rename_field('ner', 'nerid')

    if embedding_path is not None:

        embed = StaticEmbedding(vocab=word_vocab,
                                model_dir_or_name=embedding_path,
                                word_dropout=0.01,
                                dropout=0.5,
                                lower=True)

        return (train_set_NER,dev_set_NER,test_set_NER),\
               (word_vocab,POS_vocab,NER_vocab),embed
    else:
        return (train_set_NER, dev_set_NER, test_set_NER), (NER_vocab,
                                                            word_vocab)
def prepare_env():
    vocab = Vocabulary().add_word_lst("This is a test .".split())
    vocab.add_word_lst("Another test !".split())
    embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)

    encoder_output = torch.randn(2, 3, 10)
    src_seq_len = torch.LongTensor([3, 2])
    encoder_mask = seq_len_to_mask(src_seq_len)

    return embed, encoder_output, encoder_mask
Example #30
    def __init__(self,
                 vocab,
                 hidden_size,
                 num_layers,
                 n_class_per_task,
                 dropout=0.5,
                 crf=False):
        super().__init__()
        # logger.info(n_class_per_task)
        word_embed = StaticEmbedding(
            vocab=vocab,
            model_dir_or_name="en-glove-6b-100d",
            word_dropout=0.01,
            dropout=dropout,
            lower=True,
        )
        char_embed = CNNCharEmbedding(vocab=vocab,
                                      embed_size=30,
                                      char_emb_size=30,
                                      filter_nums=[30],
                                      kernel_sizes=[3],
                                      word_dropout=0,
                                      dropout=dropout,
                                      include_word_start_end=False)
        self.embedding = word_embed
        self.char = char_embed
        self.lstm = fastNLP.modules.LSTM(
            input_size=self.embedding.embedding_dim + self.char.embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )

        self.out = nn.ModuleList()
        for i, n_class in enumerate(n_class_per_task):
            self.out.append(nn.Linear(hidden_size * 2, n_class))

        self.dropout = nn.Dropout(dropout, inplace=True)
        if crf:
            self.crf = nn.ModuleList([
                fastNLP.modules.ConditionalRandomField(n_class)
                for n_class in n_class_per_task
            ])
        else:
            self.crf = None
            self.criterion = nn.CrossEntropyLoss()

        for name, param in self.named_parameters():
            if "out" in name:
                if param.data.dim() > 1:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.constant_(param, 0)
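    # A hedged sketch (not part of the original snippet) of how the embeddings
    # could be combined in a forward pass, assuming `words` of shape
    # (batch, seq_len); whether the CRF heads or plain cross-entropy get used
    # would depend on `self.crf`:
    #
    #     def forward(self, words):
    #         x = torch.cat([self.embedding(words), self.char(words)], dim=-1)
    #         x, _ = self.lstm(x)                    # (batch, seq, 2 * hidden_size)
    #         x = self.dropout(x)
    #         return [head(x) for head in self.out]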