def __init__(
    self,
    vocab,
    options_file=DEFAULT_OPTIONS_FILE,
    weight_file=DEFAULT_WEIGHT_FILE,
    do_layer_norm=False,
    dropout=0.5,
    trainable=False,
    project_dim=None,
):
    super(ELMoEmbedding, self).__init__(vocab)
    data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)
    option_path = data_handler.read(options_file, return_path=True)
    weight_path = data_handler.read(weight_file, return_path=True)

    self.elmo = Elmo(option_path, weight_path, 1, requires_grad=trainable, dropout=dropout)

    self.project_dim = project_dim
    self.project_linear = None
    if project_dim:
        self.project_linear = nn.Linear(self.elmo.get_output_dim(), project_dim)
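
# --- Usage sketch (not part of the original source) ---
# A minimal, hedged example of constructing ELMoEmbedding. Assumptions: `vocab`
# is an already-built claf Vocab instance, and the default ELMo option/weight
# files can be resolved through DataHandler's pretrained-vector cache.
elmo_embedding = ELMoEmbedding(
    vocab,
    dropout=0.5,
    trainable=False,   # keep the pretrained ELMo weights frozen
    project_dim=256,   # adds nn.Linear(self.elmo.get_output_dim(), 256) on top
)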
class BPETokenizer(Tokenizer):
    """
    BPE (Byte-Pair Encoding) Tokenizer

    text -> ...

    * Args:
        name: tokenizer name [roberta]
    """

    def __init__(self, name, config={}):
        super(BPETokenizer, self).__init__(name, f"bpe-{name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config
        self.bpe_tokenizer = None

    """ Tokenizers """

    def _roberta(self, text, unit="text"):
        """
        ex)
        """
        if self.bpe_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            merges_path = self.data_handler.read(self.config["merges_path"], return_path=True)
            del self.config["vocab_path"]
            del self.config["merges_path"]

            self.bpe_tokenizer = RobertaTokenizer(vocab_path, merges_path, **self.config)
        return self.bpe_tokenizer._tokenize(text)
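
# --- Usage sketch (not part of the original source) ---
# Hedged example, assuming GPT-2/RoBERTa-style BPE files ("roberta-vocab.json"
# and "roberta-merges.txt" are hypothetical names) that DataHandler can resolve,
# and that the base Tokenizer dispatches the name "roberta" to _roberta.
bpe_tokenizer = BPETokenizer(
    "roberta",
    config={"vocab_path": "roberta-vocab.json", "merges_path": "roberta-merges.txt"},
)
tokens = bpe_tokenizer._roberta("Hello World")
# e.g. ['Hello', 'ĠWorld'] with HuggingFace's RobertaTokenizer ('Ġ' marks a preceding space)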
def __init__(
    self, word_embedding, pretrained_path=None, requires_grad=False, residual_embeddings=False
):
    """Initialize an MTLSTM (CoVe encoder).

    Arguments:
        word_embedding: module used to embed the input word ids into 300-dim vectors
        pretrained_path (str): path to the pretrained CoVe LSTM state dict
        requires_grad (bool): If True, fine-tune the CoVe LSTM weights
        residual_embeddings (bool): If True, concatenate the input embeddings
            with MTLSTM outputs during forward
    """
    super(MTLSTM, self).__init__()
    self.word_embedding = word_embedding
    self.rnn = nn.LSTM(300, 300, num_layers=2, bidirectional=True, batch_first=True)

    data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)
    cove_weight_path = data_handler.read(pretrained_path, return_path=True)
    if torch.cuda.is_available():
        checkpoint = torch.load(cove_weight_path)
    else:
        checkpoint = torch.load(cove_weight_path, map_location="cpu")
    self.rnn.load_state_dict(checkpoint)

    self.residual_embeddings = residual_embeddings
    self.requires_grad = requires_grad
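
# --- Usage sketch (not part of the original source) ---
# Hedged example of constructing the CoVe encoder. Assumptions: `word_embedding`
# produces 300-dim vectors (the LSTM above expects input size 300), and
# "cove_lstm_weights.pth" is a hypothetical name for the pretrained state dict.
cove = MTLSTM(
    word_embedding,
    pretrained_path="cove_lstm_weights.pth",
    requires_grad=False,       # keep the CoVe LSTM frozen
    residual_embeddings=True,  # concatenate input embeddings with CoVe outputs in forward
)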
def build_with_pretrained_file(self, token_counter):
    data_handler = DataHandler(CachePath.VOCAB)
    vocab_texts = data_handler.read(self.pretrained_path)

    if self.pretrained_path.endswith(".txt"):
        predefine_vocab = vocab_texts.split("\n")
    elif self.pretrained_path.endswith(".json"):
        vocab_texts = json.loads(vocab_texts)  # {token: id}
        predefine_vocab = [
            item[0] for item in sorted(vocab_texts.items(), key=lambda x: x[1])
        ]
    else:
        raise ValueError("Unsupported vocab file extension: expected .txt or .json")

    self.build(token_counter, predefine_vocab=predefine_vocab)
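
# --- Illustration (not part of the original source) ---
# The two file formats handled above: a .txt vocab with one token per line, or a
# .json vocab mapping token -> id, whose tokens are ordered by id before building.
vocab_json = {"[PAD]": 0, "[UNK]": 1, "the": 2, "of": 3}  # hypothetical .json content
predefine_vocab = [token for token, _ in sorted(vocab_json.items(), key=lambda x: x[1])]
assert predefine_vocab == ["[PAD]", "[UNK]", "the", "of"]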
class SubwordTokenizer(Tokenizer):
    """
    Subword Tokenizer

    text -> [word tokens] -> [[sub word tokens], ...]

    * Args:
        name: tokenizer name [wordpiece]
    """

    def __init__(self, name, word_tokenizer, config={}):
        super(SubwordTokenizer, self).__init__(
            name, f"subword-{name}+{word_tokenizer.cache_name}"
        )
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = config
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None

    """ Tokenizers """

    def _wordpiece(self, text, unit="text"):
        """
        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
        """
        if self.subword_tokenizer is None:
            vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(
                vocab, unk_token=self.config.get("unk_token", "[UNK]")
            )

        tokens = []
        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)
        return tokens
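
# --- Usage sketch (not part of the original source) ---
# Hedged example, assuming `word_tokenizer` is an already-constructed word-level
# Tokenizer and "bert-base-vocab.txt" is a hypothetical WordPiece vocab file
# resolvable by the DataHandler cache.
subword_tokenizer = SubwordTokenizer(
    "wordpiece",
    word_tokenizer,
    config={"vocab_path": "bert-base-vocab.txt", "unk_token": "[UNK]"},
)
tokens = subword_tokenizer._wordpiece("Hello World")
# per the docstring example: ['He', '##llo', 'Wo', '##rld']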
class MRCEnsemble(Machine):
    """
    Machine Reading Comprehension Ensemble

    * Args:
        config: machine_config
    """

    def __init__(self, config):
        super(MRCEnsemble, self).__init__(config)
        self.data_handler = DataHandler(CachePath.MACHINE / "mrc_ensemble")
        self.load()

    @overrides
    def load(self):
        mrc_config = self.config.reading_comprehension

        # Model 1 - BERT-Kor
        self.rc_experiment1 = self.make_module(mrc_config.model_1)
        print("BERT-Kor ready ..! \n")

        # # Model 2 - BERT-Multilingual
        # self.rc_experiment2 = self.make_module(mrc_config.model_2)
        # print("BERT-Multilingual ready ..! \n")

        # # Model 3 - DocQA
        # self.rc_experiment3 = self.make_module(mrc_config.model_3)
        # print("DocQA ready ..! \n")

        # # Model 4 - DrQA
        # self.rc_experiment4 = self.make_module(mrc_config.model_4)
        # print("DrQA ready ..! \n")

        print("All ready ..! \n")

    def evaluate(self, file_path, output_path):
        # KorQuAD dataset...

        # def get_answer_after_clustering(predictions):
        #     categories = {}
        #     for l1 in predictions:
        #         l1_text = l1["text"]
        #         l1_text_normalized = normalize_answer(l1_text)
        #         categories[l1_text] = {
        #             "items": [],
        #             "score": 0
        #         }
        #         for l2 in predictions:
        #             l2_text = l2["text"]
        #             l2_text_normalized = normalize_answer(l2_text)
        #             if l1_text_normalized in l2_text_normalized:
        #                 categories[l1_text]["items"].append(l2)
        #                 categories[l1_text]["score"] += l2["score"]
        #
        #     # # count items then score * 1.n
        #     # for k, v in categories.items():
        #     #     ratio = 1 + (len(v["items"]) / 10)
        #     #     v["score"] *= ratio
        #
        #     highest_category = [categories[c] for c in sorted(categories, key=lambda x: categories[x]["score"], reverse=True)][0]
        #     answer_text = sorted(highest_category["items"], key=lambda x: x["score"], reverse=True)[0]["text"]
        #     return answer_text

        # def get_answer_after_clustering_marginal(predictions):
        #     categories = {}
        #     for l1 in predictions:
        #         l1_text = l1["text"]
        #         l1_text_normalized = normalize_answer(l1_text)
        #         categories[l1_text] = {
        #             "items": [],
        #             "score": 0
        #         }
        #         for l2 in predictions:
        #             l2_text = l2["text"]
        #             l2_text_normalized = normalize_answer(l2_text)
        #             if l1_text_normalized in l2_text_normalized:
        #                 categories[l1_text]["items"].append(l2)
        #                 categories[l1_text]["score"] *= l2["score"]
        #             else:
        #                 categories[l1_text]["score"] *= 0.01  # Default value
        #
        #     # count items then score * 1.n
        #     for k, v in categories.items():
        #         ratio = 1 + (len(v["items"]) / 10)
        #         v["score"] *= ratio
        #
        #     highest_category = [categories[c] for c in sorted(categories, key=lambda x: categories[x]["score"], reverse=True)][0]
        #     answer_text = sorted(highest_category["items"], key=lambda x: x["score"], reverse=True)[0]["text"]
        #     return answer_text

        # def post_processing(text):
        #     # detach josa
        #     # josas = ['은', '는', '이', '가', '을', '를', '과', '와', '이다', '다', '으로', '로', '의', '에']
        #     josas = ["는", "를", "이다", "으로", "에", "이라고", "라고", "와의", "인데"]
        #     for josa in josas:
        #         if text.endswith(josa):
        #             text = text[:-len(josa)]
        #             break
        #
        #     # temperature
        #     if text.endswith("°"):
        #         text += "C"
        #
        #     # etc
        #     special_cases = ["(", ",", "였", "."]
        #     for s in special_cases:
        #         if text.endswith(s):
        #             text = text[:-len(s)]
        #     return text

        def _clean_text(text):
            # https://github.com/allenai/document-qa/blob/2f9fa6878b60ed8a8a31bcf03f802cde292fe48b/docqa/data_processing/text_utils.py#L124
            # be consistent with quotes, and replace \u2014 and \u2212 which I have seen being mapped to UNK
            # by glove word vecs
            return (
                text.replace("''", '"')
                .replace("``", '"')
                .replace("\u2212", "-")
                .replace("\u2014", "\u2013")
            )

        predictions = {}
        topk_predictions = {}

        print("Read input_data...")
        data = self.data_handler.read(file_path)
        squad = json.loads(data)
        if "data" in squad:
            squad = squad["data"]

        wrong_count = 0

        print("Start predict 1-examples...")
        for article in tqdm(squad):
            for paragraph in article["paragraphs"]:
                context = paragraph["context"]
                for qa in paragraph["qas"]:
                    question = qa["question"]
                    id_ = qa["id"]

                    # Marginal probabilities...
                    # prediction = self.get_predict_with_marginal(context, question)
                    prediction = self.get_predict(context, question)
                    # print("prediction count:", len(prediction))
                    topk_predictions[id_] = prediction
                    predictions[id_] = prediction[0]["text"]

                    # answer_texts = [q["text"] for q in qa["answers"]]

                    # # 1. Highest value
                    # sorted_prediction = sorted(prediction, key=lambda x: x["score"], reverse=True)
                    # prediction_text = sorted_prediction[0]["text"]

                    # 2. Cluster by text
                    # prediction_text = get_answer_after_clustering_marginal(prediction)
                    # prediction_text = post_processing(prediction_text)
                    # predictions[id_] = prediction_text

                    # if prediction_text not in answer_texts:
                    #     pred_f1_score = metric_max_over_ground_truths(f1_score, prediction_text, answer_texts)
                    #     if pred_f1_score <= 0.5:
                    #         sorted_prediction = sorted(prediction, key=lambda x: x["score"], reverse=True)
                    #         print("predict:", json.dumps(sorted_prediction[:5], indent=4, ensure_ascii=False))
                    #         print("predict_text:", prediction_text)
                    #         print("answers:", qa["answers"], "f1:", pred_f1_score)
                    #         print("-" * 50)
                    #         wrong_count += 1

                    # is_answer = False
                    # for pred in prediction:
                    #     if pred["text"] in answer_texts:
                    #         predictions[id_] = pred["text"]
                    #         is_answer = True
                    #         break

                    # if not is_answer:
                    #     prediction_text = sorted(prediction, key=lambda x: x["score"], reverse=True)[0]["text"]
                    #     predictions[id_] = prediction_text
                    #     print("predict:", prediction)
                    #     print("predict_text:", prediction_text)
                    #     print("answers:", qa["answers"])
                    #     print("-" * 50)
                    #     wrong_count += 1

        print("total_count:", len(predictions), "wrong_count:", wrong_count)
        print("Completed...!")

        with open(output_path, "w") as out_file:
            out_file.write(json.dumps(topk_predictions, indent=4) + "\n")

        # Evaluate
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json
            if "data" in dataset:
                dataset = dataset["data"]

        # with open(output_path) as prediction_file:
        #     predictions = json.load(prediction_file)

        # module-level SQuAD-style evaluate() function, not this method
        results = evaluate(dataset, predictions)
        print(json.dumps(results))

    def get_predict(self, context, question):
        raw_feature = {"context": context, "question": question}
        # print(raw_feature)

        # Approach 1. Max Prob
        models = [
            (self.rc_experiment1, 0.94),
            # (self.rc_experiment2, 0.90)
            # (self.rc_experiment3, 0.85),
            # (self.rc_experiment4, 0.84),
        ]
        # models = [self.rc_experiment3, self.rc_experiment4]

        model = models[0][0]
        return sorted(model.predict(raw_feature), key=lambda x: x["score"], reverse=True)
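
# --- Illustration (not part of the original source) ---
# Hedged sketch of the data shapes the ensemble works with: get_predict() returns
# a score-sorted list of answer candidates, and evaluate() keeps the top text per
# question id before handing the dict to the SQuAD-style evaluate() function.
prediction = [                                   # hypothetical get_predict() output
    {"text": "1989", "score": 0.91},
    {"text": "in 1989", "score": 0.42},
]
predictions = {"qa-001": prediction[0]["text"]}  # id -> best answer text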
def build_with_pretrained_file(self, token_counter):
    data_handler = DataHandler(CachePath.VOCAB)
    vocab_texts = data_handler.read(self.pretrained_path)
    predefine_vocab = vocab_texts.split("\n")

    self.build(token_counter, predefine_vocab=predefine_vocab)
class WordEmbedding(TokenEmbedding):
    """
    Word Embedding
    Default Token Embedding

    * Args:
        vocab: Vocab (claf.tokens.vocab)

    * Kwargs:
        dropout: dropout probability
        embed_dim: embedding dimension
        padding_idx: If given, pads the output with the embedding vector at padding_idx
            (initialized to zeros) whenever it encounters the index.
        max_norm: If given, will renormalize the embedding vectors to have a norm lesser
            than this before extracting. Note: this will modify weight in-place.
        norm_type: The p of the p-norm to compute for the max_norm option. Default 2.
        scale_grad_by_freq: if given, this will scale gradients by the inverse of
            frequency of the words in the mini-batch. Default False.
        sparse: if True, gradient w.r.t. weight will be a sparse tensor.
            See Notes under torch.nn.Embedding for more details regarding sparse gradients.
        pretrained_path: pretrained vector path (eg. GloVe)
        trainable: finetune or fixed
    """

    def __init__(
        self,
        vocab,
        dropout=0.2,
        embed_dim=100,
        padding_idx=None,
        max_norm=None,
        norm_type=2,
        scale_grad_by_freq=False,
        sparse=False,
        pretrained_path=None,
        trainable=True,
    ):
        super(WordEmbedding, self).__init__(vocab)
        self.data_handler = DataHandler(cache_path=CachePath.PRETRAINED_VECTOR)
        self.embed_dim = embed_dim

        if dropout and dropout > 0:
            self.dropout = nn.Dropout(p=dropout)
        else:
            self.dropout = lambda x: x

        if pretrained_path:
            weight = self._read_pretrained_file(pretrained_path)
            self.weight = torch.nn.Parameter(weight, requires_grad=trainable)
        else:
            self.weight = self._init_weight(trainable=trainable)

        # nn.functional.embedding optional parameters:
        # (padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
        # check - https://pytorch.org/docs/master/nn.html#torch.nn.functional.embedding
        self.padding_idx = padding_idx
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        self.sparse = sparse

    def _init_weight(self, trainable=True):
        weight = torch.FloatTensor(self.get_vocab_size(), self.embed_dim)
        weight = torch.nn.Parameter(weight, requires_grad=trainable)
        torch.nn.init.xavier_uniform_(weight)
        return weight

    @overrides
    def forward(self, words):
        input_size = words.size()
        if len(input_size) > 2:
            words = words.view(-1, input_size[-1])

        embedded_words = F.embedding(
            words,
            self.weight,
            padding_idx=self.padding_idx,
            max_norm=self.max_norm,
            norm_type=self.norm_type,
            scale_grad_by_freq=self.scale_grad_by_freq,
            sparse=self.sparse,
        )

        if len(input_size) > 2:
            embedded_size = list(input_size) + [embedded_words.size(-1)]
            embedded_words = embedded_words.view(*embedded_size)
        return self.dropout(embedded_words)

    def _read_pretrained_file(self, file_path):
        words_to_keep = set(self.vocab.get_all_tokens())
        vocab_size = self.get_vocab_size()
        embeddings = {}

        # First we read the embeddings from the file, only keeping vectors for the words we need.
        logger.info("Reading embeddings from file")
        file_path = self.data_handler.read(file_path, return_path=True)
        with open(file_path, "rb") as embeddings_file:
            for line in embeddings_file:
                fields = line.decode("utf-8").rstrip().split(" ")
                if len(fields) - 1 != self.embed_dim:
                    logger.info(
                        f"Found line with wrong number of dimensions (expected {self.embed_dim}, was {len(fields)}): {line}"
                    )
                    continue
                word = fields[0]
                if word in words_to_keep:
                    vector = np.asarray(fields[1:], dtype="float32")
                    embeddings[word] = vector

        if not embeddings:
            raise ValueError(
                "No embeddings of correct dimension found. Check the input dimension value."
            )

        all_embeddings = np.asarray(list(embeddings.values()))
        embeddings_mean = float(np.mean(all_embeddings))
        embeddings_std = float(np.std(all_embeddings))

        # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
        # then filling in the word vectors we just read.
        logger.info("Initializing pre-trained embedding layer")
        embedding_matrix = torch.FloatTensor(vocab_size, self.embed_dim).normal_(
            embeddings_mean, embeddings_std
        )

        match_count = 0
        for i in range(0, vocab_size):
            word = self.vocab.get_token(i)
            if word in embeddings:
                embedding_matrix[i] = torch.FloatTensor(embeddings[word])
                match_count += 1
            else:
                # f"Word {word} was not found in the embedding file. Initializing randomly."
                pass

        logger.info(f"Match embedding vocab size: {match_count}. [{match_count}/{vocab_size}]")
        return embedding_matrix

    @overrides
    def get_output_dim(self):
        return self.embed_dim
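
# --- Usage sketch (not part of the original source) ---
# Hedged example. Assumptions: `vocab` is a built claf Vocab, and the pretrained
# file follows the GloVe text format that _read_pretrained_file expects: one token
# followed by embed_dim space-separated floats per line.
word_embedding = WordEmbedding(
    vocab,
    embed_dim=100,
    pretrained_path="glove.6B.100d.txt",  # hypothetical path resolvable by DataHandler
    trainable=False,                      # keep the pretrained vectors fixed
)
# word_ids: LongTensor of shape (batch, seq_len) holding vocab indices
# word_embedding(word_ids) -> FloatTensor of shape (batch, seq_len, 100)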