Example #1
 def __init__(self,
              gpu=-1,
              check_for_lowercase=True,
              embeddings_dim=0,
              verbose=True,
              options_file='',
              weights_file='',
              num_layers_=2,
              dropout_=0.1):
     SeqIndexerBaseEmbeddings.__init__(
         self,
         gpu=gpu,
         check_for_lowercase=check_for_lowercase,
         zero_digits=True,
         pad='<pad>',
         unk='<unk>',
         load_embeddings=True,
         embeddings_dim=embeddings_dim,
         verbose=verbose,
         isElmo=True)
     print("create seq indexer elmo")
     self.no_context_base = True
     self.elmo = True
     self.options_fn = options_file
     self.weights_fn = weights_file
     self.emb = Elmo(options_file,
                     weights_file,
                     num_layers_,
                     dropout=dropout_)
     self.embeddings_dim = self.emb.get_output_dim()
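A minimal usage sketch for the constructor above, assuming the class is exposed as SeqIndexerElmo and that local copies of the AllenNLP ELMo options/weights files exist; the class name and both file names are placeholders, only the keyword arguments mirror the code shown.

# Hypothetical usage; SeqIndexerElmo (the class defined above) and both file
# paths are placeholders for a concrete setup.
options_file = "elmo_options.json"   # assumed local ELMo options file
weights_file = "elmo_weights.hdf5"   # assumed local ELMo weights file

indexer = SeqIndexerElmo(gpu=-1,
                         options_file=options_file,
                         weights_file=weights_file,
                         num_layers_=2,
                         dropout_=0.1)
print(indexer.embeddings_dim)        # value of Elmo.get_output_dim() for these weights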
Example #2
 def __init__(self,
              args,
              gpu=-1,
              check_for_lowercase=True,
              embeddings_dim=0,
              verbose=True,
              unique_words_list=None):
     SeqIndexerBaseEmbeddings.__init__(
         self,
         gpu=gpu,
         check_for_lowercase=check_for_lowercase,
         zero_digits=True,
         pad='<pad>',
         unk='<unk>',
         load_embeddings=True,
         embeddings_dim=embeddings_dim,
         verbose=verbose)
     self.original_words_num = 0
     self.lowercase_words_num = 0
     self.zero_digits_replaced_num = 0
     self.zero_digits_replaced_lowercase_num = 0
     self.capitalize_word_num = 0
     self.uppercase_word_num = 0
     self.unique_words_list = unique_words_list
     self.args = args
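A hedged sketch of how an indexer like this is typically wired up: construct it with the dataset's unique word list, then load matching vectors with the loader shown in Examples #6 and #8. The class name SeqIndexerWord, the args=None shortcut, and the embeddings file name are all assumptions.

# Hypothetical wiring; SeqIndexerWord, args=None and the GloVe file name are placeholders.
unique_words_list = ['The', 'cat', 'sat', '2019']   # normally collected from the training data
indexer = SeqIndexerWord(args=None,
                         gpu=-1,
                         embeddings_dim=100,
                         unique_words_list=unique_words_list)
indexer.load_items_from_embeddings_file_and_unique_words_list(
    emb_fn='glove.6B.100d.txt',      # text file with one "word v1 ... v100" entry per line
    emb_delimiter=' ',
    emb_load_all=False,
    unique_words_list=unique_words_list)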
Example #3
    def __init__(
        self,
        gpu=-1,
        check_for_lowercase=True,
        embeddings_dim=0,
        verbose=True,
        path_to_pretrained="xlnet-base-cased",
        model_frozen=True,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
    ):
        SeqIndexerBaseEmbeddings.__init__(
            self,
            gpu=gpu,
            check_for_lowercase=check_for_lowercase,
            zero_digits=True,
            bos_token=bos_token,
            eos_token=eos_token,
            pad=pad_token,
            unk=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            mask_token=mask_token,
            load_embeddings=True,
            embeddings_dim=embeddings_dim,
            verbose=verbose,
            isBert=False,
            isXlNet=True)

        print("create seq indexer Transformers from Model {}".format(
            path_to_pretrained))

        self.xlnet = True

        self.path_to_pretrained = path_to_pretrained
        self.tokenizer = XLNetTokenizer.from_pretrained(path_to_pretrained)
        self.config = XLNetConfig.from_pretrained(path_to_pretrained)
        self.emb = XLNetModel.from_pretrained(path_to_pretrained)
        self.frozen = model_frozen
        for param in self.emb.parameters():
            param.requires_grad = False
        for elem in [
                self.emb.word_embedding, self.emb.layer, self.emb.dropout
        ]:
            for param in elem.parameters():
                param.requires_grad = False

        if not self.frozen:
            for param in self.emb.pooler.parameters():
                param.requires_grad = True
        self.emb.eval()
        print("XLNet model loaded successfully")
Example #4
 def __init__(self, gpu):
     SeqIndexerBaseEmbeddings.__init__(self,
                                       gpu=gpu,
                                       check_for_lowercase=False,
                                       zero_digits=False,
                                       pad='<pad>',
                                       unk='<unk>',
                                       load_embeddings=False,
                                       embeddings_dim=0,
                                       verbose=True)
Example #5
    def __init__(
            self,
            gpu=-1,
            check_for_lowercase=True,
            embeddings_dim=0,
            verbose=True,
            path_to_pretrained="/home/vika/targer/pretrained/uncased_L-12_H-768_A-12/",
            bert_type='bert-base-uncased',
            model_frozen=True):
        SeqIndexerBaseEmbeddings.__init__(
            self,
            gpu=gpu,
            check_for_lowercase=check_for_lowercase,
            zero_digits=True,
            pad='<pad>',
            unk='<unk>',
            load_embeddings=True,
            embeddings_dim=embeddings_dim,
            verbose=verbose,
            isBert=True)

        print("create seq indexer BERT")

        self.bert = True
        self.path_to_pretrained = path_to_pretrained
        #self.tokenizer = tokenizer_custom_bert.FullTokenizer(path_to_pretrained + 'vocab.txt')
        self.tokenizer = tokenizer_custom_bert.BertTokenizer.from_pretrained(
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"
        )
        self.emb = BertModel.from_pretrained(path_to_pretrained)
        self.frozen = model_frozen
        for param in self.emb.parameters():
            param.requires_grad = False
        for elem in [
                self.emb.embeddings.word_embeddings,
                self.emb.embeddings.position_embeddings,
                self.emb.embeddings.token_type_embeddings,
                self.emb.embeddings.LayerNorm
        ]:
            for param in elem.parameters():
                param.requires_grad = False

        ## Freeze / unfreeze layers of the loaded pre-trained BERT model. Currently only the pooler layer is unfrozen; encoder layers etc. can be unfrozen in the same way.
        if not self.frozen:
            # print("loaded BERT model will be trained")
            # Example: unfreeze selected encoder layers instead of (or in addition to) the pooler:
            # for i in [0]:
            #     for param in self.emb.encoder.layer[i].parameters():
            #         param.requires_grad = True
            for param in self.emb.pooler.parameters():
                param.requires_grad = True
        self.emb.eval()
        print("BERT model loaded successfully")
Example #6
    def load_items_from_embeddings_file_and_unique_words_list(
            self, vocab_emb_fn, emb_fn, emb_delimiter, emb_load_all,
            unique_words_list):
        # Get the full list of available case-sensitive words from text file with pretrained embeddings
        if os.path.exists(vocab_emb_fn):
            print('load pre-trained word embedding...')
            fread = open(vocab_emb_fn, 'rb')
            unique_words, emb_vecs = pickle.load(fread)
            for unique_word, emb_vec in zip(unique_words, emb_vecs):
                self.add_word_emb_vec(unique_word, emb_vec)

        else:
            embeddings_words_list = [
                emb_word for emb_word, _ in SeqIndexerBaseEmbeddings.
                load_embeddings_from_file(emb_fn, emb_delimiter, verbose=True)
            ]
            # Create reverse mapping word from the embeddings file -> list of unique words from the dataset
            emb_word_dict2unique_word_list = dict()
            out_of_vocabulary_words_list = list()
            for unique_word in unique_words_list:
                emb_word = self.get_embeddings_word(unique_word,
                                                    embeddings_words_list)
                if emb_word is None:
                    out_of_vocabulary_words_list.append(unique_word)
                else:
                    if emb_word not in emb_word_dict2unique_word_list:
                        emb_word_dict2unique_word_list[emb_word] = [
                            unique_word
                        ]
                    else:
                        emb_word_dict2unique_word_list[emb_word].append(
                            unique_word)
            # Add pretrained embeddings for unique_words
            unique_words = []
            emb_vecs = []
            for emb_word, emb_vec in SeqIndexerBaseEmbeddings.load_embeddings_from_file(
                    emb_fn, emb_delimiter, verbose=True):
                if emb_word in emb_word_dict2unique_word_list:
                    for unique_word in emb_word_dict2unique_word_list[
                            emb_word]:
                        self.add_word_emb_vec(unique_word, emb_vec)
                        unique_words.append(unique_word)
                        emb_vecs.append(emb_vec)
            fwrite = open(vocab_emb_fn, 'wb')
            pickle.dump([unique_words, emb_vecs], fwrite)
            fwrite.close()
            del unique_words
            del emb_vecs

            if self.verbose:
                print(
                    '\nload_items_from_embeddings_file_and_unique_words_list:'
                )
                print('    First 50 OOV words:')
                for i, oov_word in enumerate(out_of_vocabulary_words_list):
                    print('        out_of_vocabulary_words_list[%d] = %s' %
                          (i, oov_word))
                    if i >= 49:
                        break
                print(' -- len(out_of_vocabulary_words_list) = %d' %
                      len(out_of_vocabulary_words_list))
                print(' -- original_words_num = %d' % self.original_words_num)
                print(' -- lowercase_words_num = %d' %
                      self.lowercase_words_num)
                print(' -- zero_digits_replaced_num = %d' %
                      self.zero_digits_replaced_num)
                print(' -- zero_digits_replaced_lowercase_num = %d' %
                      self.zero_digits_replaced_lowercase_num)
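The method above computes the (unique_words, emb_vecs) pair once, pickles it to vocab_emb_fn, and reuses the pickle on later runs, avoiding a second scan of the embeddings file. A stripped-down sketch of that caching pattern, with the expensive work abstracted as a build_fn callable (names are placeholders):

# Minimal illustration of the pickle-cache pattern used above; names are placeholders.
import os
import pickle

def load_or_build_vocab(vocab_emb_fn, build_fn):
    if os.path.exists(vocab_emb_fn):              # fast path: reuse the cached pair
        with open(vocab_emb_fn, 'rb') as fread:
            unique_words, emb_vecs = pickle.load(fread)
    else:                                         # slow path: build once, then cache
        unique_words, emb_vecs = build_fn()
        with open(vocab_emb_fn, 'wb') as fwrite:
            pickle.dump([unique_words, emb_vecs], fwrite)
    return unique_words, emb_vecs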
Example #7
 def get_char_tensor(self, curr_char_seq, word_len):
     return SeqIndexerBaseEmbeddings.items2tensor(
         self, curr_char_seq, align='center',
         word_len=word_len)  # curr_seq_len x word_len
Example #8
 def load_items_from_embeddings_file_and_unique_words_list(
         self, emb_fn, emb_delimiter, emb_load_all, unique_words_list):
     # Get the full list of available case-sensitive words from text file with pretrained embeddings
     embeddings_words_list = [
         emb_word for emb_word, _ in SeqIndexerBaseEmbeddings.
         load_embeddings_from_file(emb_fn, emb_delimiter, verbose=True)
     ]
     # Create reverse mapping word from the embeddings file -> list of unique words from the dataset
     emb_word_dict2unique_word_list = dict()
     out_of_vocabulary_words_list = list()
     for unique_word in unique_words_list:
         emb_word = self.get_embeddings_word(unique_word,
                                             embeddings_words_list)
         if emb_word is None:
             out_of_vocabulary_words_list.append(unique_word)
         else:
             if emb_word not in emb_word_dict2unique_word_list:
                 emb_word_dict2unique_word_list[emb_word] = [unique_word]
             else:
                 emb_word_dict2unique_word_list[emb_word].append(
                     unique_word)
     # Add pretrained embeddings for unique_words
     for emb_word, emb_vec in SeqIndexerBaseEmbeddings.load_embeddings_from_file(
             emb_fn, emb_delimiter, verbose=True):
         if emb_word in emb_word_dict2unique_word_list:
             for unique_word in emb_word_dict2unique_word_list[emb_word]:
                 self.add_word_emb_vec(unique_word, emb_vec)
     if self.verbose:
         print(
              '\nload_items_from_embeddings_file_and_unique_words_list:'
         )
         print('    First 50 OOV words:')
         for i, oov_word in enumerate(out_of_vocabulary_words_list):
             print('        out_of_vocabulary_words_list[%d] = %s' %
                   (i, oov_word))
              if i >= 49:
                 break
         print(' -- len(out_of_vocabulary_words_list) = %d' %
               len(out_of_vocabulary_words_list))
         print(' -- original_words_num = %d' % self.original_words_num)
         print(' -- lowercase_words_num = %d' % self.lowercase_words_num)
         print(' -- zero_digits_replaced_num = %d' %
               self.zero_digits_replaced_num)
         print(' -- zero_digits_replaced_lowercase_num = %d' %
               self.zero_digits_replaced_lowercase_num)
     # Load all embeddings
     if emb_load_all:
         loaded_words_list = self.get_items_list()
         load_all_words_num_before = len(loaded_words_list)
         load_all_words_lower_num = 0
         load_all_words_upper_num = 0
         load_all_words_capitalize_num = 0
         for emb_word, emb_vec in SeqIndexerBaseEmbeddings.load_embeddings_from_file(
                 emb_fn, emb_delimiter, verbose=True):
             if emb_word in loaded_words_list:
                 continue
              if emb_word.lower() not in loaded_words_list and \
                      emb_word.lower() not in embeddings_words_list:
                  self.add_word_emb_vec(emb_word.lower(), emb_vec)
                  load_all_words_lower_num += 1
              if emb_word.upper() not in loaded_words_list and \
                      emb_word.upper() not in embeddings_words_list:
                  self.add_word_emb_vec(emb_word.upper(), emb_vec)
                  load_all_words_upper_num += 1
              if emb_word.capitalize() not in loaded_words_list and \
                      emb_word.capitalize() not in embeddings_words_list:
                  self.add_word_emb_vec(emb_word.capitalize(), emb_vec)
                  load_all_words_capitalize_num += 1
             self.add_item(emb_word)
             self.add_emb_vector(emb_vec)
         load_all_words_num_after = len(self.get_items_list())
         if self.verbose:
             print(' ++ load_all_words_num_before = %d ' %
                   load_all_words_num_before)
             print(' ++ load_all_words_lower_num = %d ' %
                   load_all_words_lower_num)
             print(' ++ load_all_words_num_after = %d ' %
                   load_all_words_num_after)
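The counters printed above are driven by get_embeddings_word, which is not shown in this example. The sketch below illustrates the conventional fallback order those counter names suggest (exact match, lowercase, digits replaced by zeros, then both); it is an illustration, not the project's actual implementation.

# Plausible sketch of the matching fallbacks behind the counters above;
# not the project's actual get_embeddings_word.
import re

def get_embeddings_word_sketch(word, embeddings_words_set):
    if word in embeddings_words_set:              # counted as original_words_num
        return word
    if word.lower() in embeddings_words_set:      # counted as lowercase_words_num
        return word.lower()
    zeros = re.sub(r'\d', '0', word)
    if zeros in embeddings_words_set:             # counted as zero_digits_replaced_num
        return zeros
    if zeros.lower() in embeddings_words_set:     # counted as zero_digits_replaced_lowercase_num
        return zeros.lower()
    return None                                   # out-of-vocabulary word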