def __init__(self, gpu=-1, check_for_lowercase=True, embeddings_dim=0, verbose=True,
             options_file='', weights_file='', num_layers_=2, dropout_=0.1):
    SeqIndexerBaseEmbeddings.__init__(self, gpu=gpu, check_for_lowercase=check_for_lowercase,
                                      zero_digits=True, pad='<pad>', unk='<unk>',
                                      load_embeddings=True, embeddings_dim=embeddings_dim,
                                      verbose=verbose, isElmo=True)
    print("create seq indexer elmo")
    self.no_context_base = True
    self.elmo = True
    self.options_fn = options_file
    self.weights_fn = weights_file
    self.emb = Elmo(options_file, weights_file, num_layers_, dropout=dropout_)
    # get_output_dim() is a method; call it to obtain the ELMo embedding size.
    self.embeddings_dim = self.emb.get_output_dim()
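# A minimal usage sketch (not part of the original code). The class name SeqIndexerElmo and the
# file paths below are assumptions for illustration; Elmo is allennlp.modules.elmo.Elmo, which
# expects an options JSON file and a weights HDF5 file from a pretrained ELMo release.
#
# indexer = SeqIndexerElmo(gpu=0,
#                          options_file="pretrained/elmo_options.json",   # hypothetical path
#                          weights_file="pretrained/elmo_weights.hdf5",   # hypothetical path
#                          num_layers_=2, dropout_=0.1)
# print(indexer.embeddings_dim)  # output dimension reported by the loaded ELMo model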
def __init__(self, args, gpu=-1, check_for_lowercase=True, embeddings_dim=0, verbose=True,
             unique_words_list=None):
    SeqIndexerBaseEmbeddings.__init__(self, gpu=gpu, check_for_lowercase=check_for_lowercase,
                                      zero_digits=True, pad='<pad>', unk='<unk>',
                                      load_embeddings=True, embeddings_dim=embeddings_dim,
                                      verbose=verbose)
    self.original_words_num = 0
    self.lowercase_words_num = 0
    self.zero_digits_replaced_num = 0
    self.zero_digits_replaced_lowercase_num = 0
    self.capitalize_word_num = 0
    self.uppercase_word_num = 0
    self.unique_words_list = unique_words_list
    self.args = args
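# A hypothetical instantiation sketch (the class name SeqIndexerWord is an assumption). The
# counters initialized above track how dataset words were matched against the embeddings
# vocabulary during loading (exact match, lowercased, digits zeroed, etc.).
#
# word_seq_indexer = SeqIndexerWord(args=args, gpu=args.gpu, check_for_lowercase=True,
#                                   embeddings_dim=100, verbose=True,
#                                   unique_words_list=["John", "lives", "in", "2019"])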
def __init__(self, gpu=-1, check_for_lowercase=True, embeddings_dim=0, verbose=True,
             path_to_pretrained="xlnet-base-cased", model_frozen=True,
             bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
             pad_token="<pad>", cls_token="<cls>", mask_token="<mask>"):
    SeqIndexerBaseEmbeddings.__init__(self, gpu=gpu, check_for_lowercase=check_for_lowercase,
                                      zero_digits=True, bos_token=bos_token, eos_token=eos_token,
                                      pad=pad_token, unk=unk_token, sep_token=sep_token,
                                      cls_token=cls_token, mask_token=mask_token,
                                      load_embeddings=True, embeddings_dim=embeddings_dim,
                                      verbose=verbose, isBert=False, isXlNet=True)
    print("create seq indexer Transformers from Model {}".format(path_to_pretrained))
    self.xlnet = True
    self.path_to_pretrained = path_to_pretrained
    self.tokenizer = XLNetTokenizer.from_pretrained(path_to_pretrained)
    self.config = XLNetConfig.from_pretrained(path_to_pretrained)
    self.emb = XLNetModel.from_pretrained(path_to_pretrained)
    self.frozen = model_frozen
    # Freeze all pretrained XLNet parameters by default.
    for param in self.emb.parameters():
        param.requires_grad = False
    for elem in [self.emb.word_embedding, self.emb.layer, self.emb.dropout]:
        for param in elem.parameters():
            param.requires_grad = False
    # If the model is not frozen, only the pooler layer is made trainable.
    if not self.frozen:
        for param in self.emb.pooler.parameters():
            param.requires_grad = True
    self.emb.eval()
    print("XLNet model loaded successfully")
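# A minimal sketch of how this constructor is expected to behave (the class name
# SeqIndexerBaseXlNet is an assumption). XLNetTokenizer/XLNetConfig/XLNetModel come from the
# transformers (or pytorch_transformers) package; with model_frozen=True every parameter of the
# loaded encoder stays frozen, so only layers added on top of the embeddings are trained.
#
# indexer = SeqIndexerBaseXlNet(gpu=0, path_to_pretrained="xlnet-base-cased", model_frozen=True)
# assert all(not p.requires_grad for p in indexer.emb.parameters())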
def __init__(self, gpu):
    SeqIndexerBaseEmbeddings.__init__(self, gpu=gpu, check_for_lowercase=False, zero_digits=False,
                                      pad='<pad>', unk='<unk>', load_embeddings=False,
                                      embeddings_dim=0, verbose=True)
def __init__(self, gpu=-1, check_for_lowercase=True, embeddings_dim=0, verbose=True,
             path_to_pretrained="/home/vika/targer/pretrained/uncased_L-12_H-768_A-12/",
             bert_type='bert-base-uncased', model_frozen=True):
    SeqIndexerBaseEmbeddings.__init__(self, gpu=gpu, check_for_lowercase=check_for_lowercase,
                                      zero_digits=True, pad='<pad>', unk='<unk>',
                                      load_embeddings=True, embeddings_dim=embeddings_dim,
                                      verbose=verbose, isBert=True)
    print("create seq indexer BERT")
    self.bert = True
    self.path_to_pretrained = path_to_pretrained
    # self.tokenizer = tokenizer_custom_bert.FullTokenizer(path_to_pretrained + 'vocab.txt')
    self.tokenizer = tokenizer_custom_bert.BertTokenizer.from_pretrained(
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt")
    self.emb = BertModel.from_pretrained(path_to_pretrained)
    self.frozen = model_frozen
    # Freeze all pretrained BERT parameters by default.
    for param in self.emb.parameters():
        param.requires_grad = False
    for elem in [self.emb.embeddings.word_embeddings,
                 self.emb.embeddings.position_embeddings,
                 self.emb.embeddings.token_type_embeddings,
                 self.emb.embeddings.LayerNorm]:
        for param in elem.parameters():
            param.requires_grad = False
    # Freeze/unfreeze layers of the loaded pretrained BERT model. Currently only the pooler layer
    # is unfrozen; encoder layers can be unfrozen here as well if needed.
    if not self.frozen:
        # print("loaded BERT model will be trained")
        # for i in [0]:
        #     for param in self.emb.encoder.layer[i].parameters():
        #         param.requires_grad = True
        for param in self.emb.pooler.parameters():
            param.requires_grad = True
    self.emb.eval()
    print("BERT model loaded successfully")
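# A usage sketch under assumptions: the class name SeqIndexerBaseBert is hypothetical, and
# path_to_pretrained must point at a directory holding a BERT checkpoint compatible with
# BertModel.from_pretrained (e.g. an uncased_L-12_H-768_A-12 export). With model_frozen=False
# only the pooler parameters become trainable.
#
# indexer = SeqIndexerBaseBert(gpu=0,
#                              path_to_pretrained="pretrained/uncased_L-12_H-768_A-12/",  # hypothetical path
#                              model_frozen=False)
# trainable = [n for n, p in indexer.emb.named_parameters() if p.requires_grad]
# print(trainable)  # only pooler.* entries are expected here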
def load_items_from_embeddings_file_and_unique_words_list(self, vocab_emb_fn, emb_fn,
                                                          emb_delimiter, emb_load_all,
                                                          unique_words_list):
    if os.path.exists(vocab_emb_fn):
        # A cached vocabulary/embedding pair exists; load it instead of re-parsing the embeddings file.
        print('load pre-trained word embedding...')
        with open(vocab_emb_fn, 'rb') as fread:
            unique_words, emb_vecs = pickle.load(fread)
        for unique_word, emb_vec in zip(unique_words, emb_vecs):
            self.add_word_emb_vec(unique_word, emb_vec)
    else:
        # Get the full list of available case-sensitive words from the text file with pretrained embeddings.
        embeddings_words_list = [
            emb_word for emb_word, _ in SeqIndexerBaseEmbeddings.load_embeddings_from_file(
                emb_fn, emb_delimiter, verbose=True)
        ]
        # Create a reverse mapping: word from the embeddings file -> list of unique words from the dataset.
        emb_word_dict2unique_word_list = dict()
        out_of_vocabulary_words_list = list()
        for unique_word in unique_words_list:
            emb_word = self.get_embeddings_word(unique_word, embeddings_words_list)
            if emb_word is None:
                out_of_vocabulary_words_list.append(unique_word)
            else:
                if emb_word not in emb_word_dict2unique_word_list:
                    emb_word_dict2unique_word_list[emb_word] = [unique_word]
                else:
                    emb_word_dict2unique_word_list[emb_word].append(unique_word)
        # Add pretrained embeddings for unique_words and cache them to vocab_emb_fn for reuse.
        unique_words = []
        emb_vecs = []
        for emb_word, emb_vec in SeqIndexerBaseEmbeddings.load_embeddings_from_file(
                emb_fn, emb_delimiter, verbose=True):
            if emb_word in emb_word_dict2unique_word_list:
                for unique_word in emb_word_dict2unique_word_list[emb_word]:
                    self.add_word_emb_vec(unique_word, emb_vec)
                    unique_words.append(unique_word)
                    emb_vecs.append(emb_vec)
        with open(vocab_emb_fn, 'wb') as fwrite:
            pickle.dump([unique_words, emb_vecs], fwrite)
        del unique_words
        del emb_vecs
        if self.verbose:
            print('\nload_vocabulary_from_embeddings_file_and_unique_words_list:')
            print(' First 50 OOV words:')
            for i, oov_word in enumerate(out_of_vocabulary_words_list):
                print(' out_of_vocabulary_words_list[%d] = %s' % (i, oov_word))
                if i >= 49:
                    break
            print(' -- len(out_of_vocabulary_words_list) = %d' % len(out_of_vocabulary_words_list))
            print(' -- original_words_num = %d' % self.original_words_num)
            print(' -- lowercase_words_num = %d' % self.lowercase_words_num)
            print(' -- zero_digits_replaced_num = %d' % self.zero_digits_replaced_num)
            print(' -- zero_digits_replaced_lowercase_num = %d' % self.zero_digits_replaced_lowercase_num)
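# Call sketch (file names are hypothetical): on the first run the embeddings text file is scanned
# and the matched (word, vector) pairs are pickled to vocab_emb_fn; later runs load the pickle
# directly, which avoids re-reading the large embeddings file.
#
# word_seq_indexer.load_items_from_embeddings_file_and_unique_words_list(
#     vocab_emb_fn="data/glove_vocab_cache.pkl",
#     emb_fn="embeddings/glove.6B.100d.txt",
#     emb_delimiter=' ',
#     emb_load_all=False,
#     unique_words_list=unique_words_list)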
def get_char_tensor(self, curr_char_seq, word_len):
    # Returns a tensor of shape curr_seq_len x word_len.
    return SeqIndexerBaseEmbeddings.items2tensor(self, curr_char_seq, align='center',
                                                 word_len=word_len)
def load_items_from_embeddings_file_and_unique_words_list(self, emb_fn, emb_delimiter,
                                                          emb_load_all, unique_words_list):
    # Get the full list of available case-sensitive words from the text file with pretrained embeddings.
    embeddings_words_list = [
        emb_word for emb_word, _ in SeqIndexerBaseEmbeddings.load_embeddings_from_file(
            emb_fn, emb_delimiter, verbose=True)
    ]
    # Create a reverse mapping: word from the embeddings file -> list of unique words from the dataset.
    emb_word_dict2unique_word_list = dict()
    out_of_vocabulary_words_list = list()
    for unique_word in unique_words_list:
        emb_word = self.get_embeddings_word(unique_word, embeddings_words_list)
        if emb_word is None:
            out_of_vocabulary_words_list.append(unique_word)
        else:
            if emb_word not in emb_word_dict2unique_word_list:
                emb_word_dict2unique_word_list[emb_word] = [unique_word]
            else:
                emb_word_dict2unique_word_list[emb_word].append(unique_word)
    # Add pretrained embeddings for unique_words.
    for emb_word, emb_vec in SeqIndexerBaseEmbeddings.load_embeddings_from_file(
            emb_fn, emb_delimiter, verbose=True):
        if emb_word in emb_word_dict2unique_word_list:
            for unique_word in emb_word_dict2unique_word_list[emb_word]:
                self.add_word_emb_vec(unique_word, emb_vec)
    if self.verbose:
        print('\nload_vocabulary_from_embeddings_file_and_unique_words_list:')
        print(' First 50 OOV words:')
        for i, oov_word in enumerate(out_of_vocabulary_words_list):
            print(' out_of_vocabulary_words_list[%d] = %s' % (i, oov_word))
            if i >= 49:
                break
        print(' -- len(out_of_vocabulary_words_list) = %d' % len(out_of_vocabulary_words_list))
        print(' -- original_words_num = %d' % self.original_words_num)
        print(' -- lowercase_words_num = %d' % self.lowercase_words_num)
        print(' -- zero_digits_replaced_num = %d' % self.zero_digits_replaced_num)
        print(' -- zero_digits_replaced_lowercase_num = %d' % self.zero_digits_replaced_lowercase_num)
    # Optionally load all remaining embeddings from the file (not only words seen in the dataset).
    if emb_load_all:
        loaded_words_list = self.get_items_list()
        load_all_words_num_before = len(loaded_words_list)
        load_all_words_lower_num = 0
        load_all_words_upper_num = 0
        load_all_words_capitalize_num = 0
        for emb_word, emb_vec in SeqIndexerBaseEmbeddings.load_embeddings_from_file(
                emb_fn, emb_delimiter, verbose=True):
            if emb_word in loaded_words_list:
                continue
            if emb_word.lower() not in loaded_words_list and \
                    emb_word.lower() not in embeddings_words_list:
                self.add_word_emb_vec(emb_word.lower(), emb_vec)
                load_all_words_lower_num += 1
            if emb_word.upper() not in loaded_words_list and \
                    emb_word.upper() not in embeddings_words_list:
                self.add_word_emb_vec(emb_word.upper(), emb_vec)
                load_all_words_upper_num += 1
            if emb_word.capitalize() not in loaded_words_list and \
                    emb_word.capitalize() not in embeddings_words_list:
                self.add_word_emb_vec(emb_word.capitalize(), emb_vec)
                load_all_words_capitalize_num += 1
            self.add_item(emb_word)
            self.add_emb_vector(emb_vec)
        load_all_words_num_after = len(self.get_items_list())
        if self.verbose:
            print(' ++ load_all_words_num_before = %d ' % load_all_words_num_before)
            print(' ++ load_all_words_lower_num = %d ' % load_all_words_lower_num)
            print(' ++ load_all_words_num_after = %d ' % load_all_words_num_after)
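# Call sketch (paths are hypothetical): with emb_load_all=True, every word from the embeddings
# file is added to the indexer (plus lower/upper/capitalized variants that are not already
# covered), rather than only the words that occur in the dataset.
#
# word_seq_indexer.load_items_from_embeddings_file_and_unique_words_list(
#     emb_fn="embeddings/glove.6B.100d.txt",
#     emb_delimiter=' ',
#     emb_load_all=True,
#     unique_words_list=unique_words_list)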