import os

import numpy as np
from gensim.models import FastText

from utils.utils import JamoProcessor


def get_embeddings(idx2word, config):
    """Initialize an embedding matrix and overwrite rows with pre-trained FastText vectors when available."""
    embedding = np.random.uniform(-1 / 16, 1 / 16, [config.vocab_size, config.embed_dim])
    if config.pretrained_embed_dir:
        processor = JamoProcessor()
        ft = FastText.load(config.pretrained_embed_dir)
        num_oov = 0
        for i, vocab in enumerate(idx2word):
            try:
                embedding[i, :] = ft.wv[processor.word_to_jamo(vocab)]
            except KeyError:
                num_oov += 1
        print("Pre-trained embedding loaded. Number of OOV : {} / {}".format(num_oov, len(idx2word)))
    else:
        print("No pre-trained embedding found, initializing with random distribution")
    return embedding
def get_embeddings(vocab_list_dir, pretrained_embed_dir, vocab_size, embed_dim):
    """Variant that takes explicit file paths instead of a config object."""
    embedding = np.random.uniform(-1 / 16, 1 / 16, [vocab_size, embed_dim])
    if os.path.isfile(pretrained_embed_dir) and os.path.isfile(vocab_list_dir):
        with open(vocab_list_dir, "r", encoding="utf-8") as f:
            vocab_list = [word.strip() for word in f if len(word.strip()) > 0]
        processor = JamoProcessor()
        ft = FastText.load(pretrained_embed_dir)
        num_oov = 0
        for i, vocab in enumerate(vocab_list):
            try:
                embedding[i, :] = ft.wv[processor.word_to_jamo(vocab)]
            except KeyError:
                num_oov += 1
        print("Pre-trained embedding loaded. Number of OOV : {} / {}".format(num_oov, len(vocab_list)))
    else:
        print("No pre-trained embedding found, initializing with random distribution")
    return embedding
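# Usage sketch (not from the original source): how the initialized matrix could be wired
# into a TF1-style embedding variable, matching the tf.Session-based code used elsewhere
# in this project. The paths and sizes below are placeholders, not the project's real values.
import tensorflow as tf

embedding_matrix = get_embeddings(
    vocab_list_dir="vocab_list.txt",             # hypothetical path
    pretrained_embed_dir="fasttext_jamo.model",  # hypothetical path
    vocab_size=100000,
    embed_dim=256,
)

with tf.variable_scope("embedding"):
    embedding_table = tf.get_variable(
        "embedding_table",
        initializer=tf.constant(embedding_matrix, dtype=tf.float32),
        trainable=True,
    )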
def _build_vocab(self):
    """Count tokens over the preprocessed corpus and keep the most frequent ones, reserving UNK/SOS/EOS."""
    count = Counter()
    processor = JamoProcessor()
    self.fasttext = FastText.load(self.pretrained_embed_dir)
    fname = os.listdir(self.base_dir)[0]
    with open(os.path.join(self.base_dir, fname), "r") as f:
        for line in f:
            corpus_id, query, reply = line.strip().split("\t")
            count.update(self.tokenizer.tokenize(query))
            count.update(self.tokenizer.tokenize(reply))
    idx2word = [self.UNK_TOKEN, self.SOS_TOKEN, self.EOS_TOKEN] + \
        sorted([word for word, _ in count.most_common(self.vocab_size - 3)])
    word2idx = {word: idx for idx, word in enumerate(idx2word)}
    return word2idx, idx2word
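# Illustration (not in the original repo): once word2idx is built, a tokenized sentence
# can be mapped to indices with a fallback to the UNK id for unseen tokens. The tokenizer
# argument and the "<UNK>" default string are assumptions about the surrounding code.
def sentence_to_ids(sentence, tokenizer, word2idx, unk_token="<UNK>"):
    unk_id = word2idx[unk_token]
    return [word2idx.get(token, unk_id) for token in tokenizer.tokenize(sentence)]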
import editdistance

from utils.utils import JamoProcessor


class FeatureExtractor:
    def __init__(self):
        self.tfidf_vectorizer = None
        self.jamo_processor = JamoProcessor()

    def tokens_diff(self, a, b):
        """Token overlap: shared whitespace tokens divided by the longer sentence's token count."""
        a_tokens = a.split(" ")
        b_tokens = b.split(" ")
        return len(set(a_tokens) & set(b_tokens)) / max(len(a_tokens), len(b_tokens))

    def edit_distance(self, a, b):
        """Edit distance over jamo-decomposed strings, ignoring the '_' filler character."""
        a_jamos = self.jamo_processor.word_to_jamo(a).replace("_", "")
        b_jamos = self.jamo_processor.word_to_jamo(b).replace("_", "")
        return editdistance.eval(a_jamos, b_jamos)

    def extract_features(self, A, B):
        extracted_features = list()
        for a, b in zip(A, B):
            ls = [self.tokens_diff(a, b), self.edit_distance(a, b)]
            extracted_features.append(ls)
        return extracted_features
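# Usage sketch (not part of the source); the sentence pairs below are made-up examples.
extractor = FeatureExtractor()
sentences_a = ["안녕하세요", "오늘 날씨 어때"]
sentences_b = ["안녕하세요 반가워요", "오늘 날씨 좋네"]
features = extractor.extract_features(sentences_a, sentences_b)
# -> one [token_overlap, jamo_edit_distance] pair per sentence pair
print(features)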
import pickle

import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity


# Extended FeatureExtractor: adds matching-model scores, sentence-vector cosine similarity,
# and TF-IDF similarities on top of the jamo edit-distance features.
class FeatureExtractor:
    def __init__(self):
        self.infer_model, self.infer_sess = self._load_pretrained_model()
        self.tfidf_char_vectorizer = pickle.load(open("../dump/tfidf_char_vectorizer.pkl", "rb"))
        self.tfidf_word_vectorizer = pickle.load(open("../dump/tfidf_word_vectorizer_big.pkl", "rb"))
        self.processor = JamoProcessor()
        self.tokenizer = SentencePieceTokenizer(config)

    def _load_pretrained_model(self):
        """Restore the best checkpoint of the trained matching model into its own graph and session."""
        base_dir = "/media/scatter/scatterdisk/reply_matching_model/runs/delstm_1024_nsrandom4_lr1e-3/"
        config_dir = base_dir + "config.json"
        best_model_dir = base_dir + "best_loss/best_loss.ckpt"

        model_config = load_config(config_dir)
        model_config.add_echo = False
        preprocessor = DynamicPreprocessor(model_config)
        preprocessor.build_preprocessor()

        infer_config = load_config(config_dir)
        setattr(infer_config, "tokenizer", "SentencePieceTokenizer")
        setattr(infer_config, "soynlp_scores",
                "/media/scatter/scatterdisk/tokenizer/soynlp_scores.sol.100M.txt")
        infer_preprocessor = DynamicPreprocessor(infer_config)
        infer_preprocessor.build_preprocessor()

        graph = tf.Graph()
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        with graph.as_default():
            Model = get_model(model_config.model)
            data = DataGenerator(preprocessor, model_config)
            infer_model = Model(data, model_config)
            infer_sess = tf.Session(config=tf_config, graph=graph)
            infer_sess.run(tf.global_variables_initializer())
            infer_sess.run(tf.local_variables_initializer())
            infer_model.load(infer_sess, model_dir=best_model_dir)
        self.infer_preprocessor = infer_preprocessor
        return infer_model, infer_sess

    def _batch_infer(self, batch_A, batch_B):
        """Score each pair in both directions (A->B and B->A) and compute cosine similarity
        between the encoded sentence vectors."""
        indexed_A, A_lengths = zip(*[self.infer_preprocessor.preprocess(a) for a in batch_A])
        indexed_B, B_lengths = zip(*[self.infer_preprocessor.preprocess(b) for b in batch_B])

        feed_dict = {
            self.infer_model.input_queries: indexed_A,
            self.infer_model.input_replies: indexed_B,
            self.infer_model.queries_lengths: A_lengths,
            self.infer_model.replies_lengths: B_lengths,
            self.infer_model.dropout_keep_prob: 1,
        }
        A_sentence_vectors, AB_probs = self.infer_sess.run(
            [self.infer_model.encoding_queries, self.infer_model.positive_probs],
            feed_dict=feed_dict)

        feed_dict = {
            self.infer_model.input_queries: indexed_B,
            self.infer_model.input_replies: indexed_A,
            self.infer_model.queries_lengths: B_lengths,
            self.infer_model.replies_lengths: A_lengths,
            self.infer_model.dropout_keep_prob: 1,
        }
        B_sentence_vectors, BA_probs = self.infer_sess.run(
            [self.infer_model.encoding_queries, self.infer_model.positive_probs],
            feed_dict=feed_dict)

        semantic_sim = [cosine_similarity([a_vector], [b_vector])[0][0]
                        for a_vector, b_vector in zip(list(A_sentence_vectors),
                                                      list(B_sentence_vectors))]
        return [p[0] for p in AB_probs], [p[0] for p in BA_probs], semantic_sim

    def extract_features(self, sentences_A, sentences_B):
        def get_semantic_sim(A, B, batch_size=512):
            length = len(A)
            num_batches = (length - 1) // batch_size + 1
            result = {"ab_probs": list(), "ba_probs": list(), "semantic_sim": list()}
            for batch_num in range(num_batches):
                start = batch_num * batch_size
                end = min((batch_num + 1) * batch_size, length)
                ab_probs, ba_probs, semantic_sim = self._batch_infer(A[start:end], B[start:end])
                result["ab_probs"] += list(ab_probs)
                result["ba_probs"] += list(ba_probs)
                result["semantic_sim"] += semantic_sim
            return result

        def get_word_tfidf_sim(A, B):
            word_sim = list()
            for a, b in zip(A, B):
                word_sim.append(cosine_similarity(self.tfidf_word_vectorizer.transform([a]),
                                                  self.tfidf_word_vectorizer.transform([b]))[0][0])
            return {"tfidf_word_sim": word_sim}

        def get_char_tfidf_sim(A, B):
            char_sim = list()
            for a, b in zip(A, B):
                char_sim.append(cosine_similarity(self.tfidf_char_vectorizer.transform([a]),
                                                  self.tfidf_char_vectorizer.transform([b]))[0][0])
            return {"tfidf_char_sim": char_sim}

        def get_edit_distance(A, B):
            edit_distance = list()
            substring_ratio = list()
            for a, b in zip(A, B):
                a_jamos = self.processor.word_to_jamo(a).replace("_", "")
                b_jamos = self.processor.word_to_jamo(b).replace("_", "")
                # proper_edit_distance and substring are helper functions defined elsewhere in this module
                edit_distance.append(proper_edit_distance(a_jamos, b_jamos))
                substring_ratio.append(substring(a_jamos, b_jamos))
            return {"edit_distance": edit_distance, "substring_ratio": substring_ratio}

        extracted_features = dict()
        extracted_features.update(get_semantic_sim(sentences_A, sentences_B, batch_size=512))
        extracted_features.update(get_word_tfidf_sim(sentences_A, sentences_B))
        extracted_features.update(get_char_tfidf_sim(sentences_A, sentences_B))
        extracted_features.update(get_edit_distance(sentences_A, sentences_B))
        return extracted_features
import sys
from collections import namedtuple

sys.path.append("/home/angrypark/korean-text-matching-tf")

from data_loader import DataGenerator
from trainer import MatchingModelTrainer
from preprocessor import DynamicPreprocessor
from utils.dirs import create_dirs
from utils.logger import SummaryWriter
from utils.config import load_config, save_config
from models.base import get_model
from utils.utils import JamoProcessor
from text.tokenizers import SentencePieceTokenizer

Config = namedtuple("config", ["sent_piece_model"])
config = Config("/media/scatter/scatterdisk/tokenizer/sent_piece.100K.model")
processor = JamoProcessor()
tokenizer = SentencePieceTokenizer(config)


def my_word_tokenizer(raw, pos=["Noun", "Alpha", "Verb", "Number"], stopword=[]):
    """SentencePiece tokens as-is; pos and stopword are unused and kept only for interface compatibility."""
    return [word for word in tokenizer.tokenize(raw)]


def my_char_tokenizer(raw, pos=["Noun", "Alpha", "Verb", "Number"], stopword=[]):
    """Jamo-decomposed SentencePiece tokens, for character-level TF-IDF."""
    return [processor.word_to_jamo(word) for word in tokenizer.tokenize(raw)]
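# Sketch (an assumption, not code from the repo): how the pickled vectorizers loaded by
# FeatureExtractor could be produced with these tokenizers. The corpus is a placeholder;
# only the output filenames come from the original code.
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["안녕하세요", "오늘 날씨 어때", "반갑습니다"]  # placeholder corpus

tfidf_word_vectorizer = TfidfVectorizer(tokenizer=my_word_tokenizer)
tfidf_word_vectorizer.fit(corpus)

tfidf_char_vectorizer = TfidfVectorizer(tokenizer=my_char_tokenizer)
tfidf_char_vectorizer.fit(corpus)

with open("../dump/tfidf_word_vectorizer_big.pkl", "wb") as f:
    pickle.dump(tfidf_word_vectorizer, f)
with open("../dump/tfidf_char_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_char_vectorizer, f)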