def __init__(self, vocab_size, embedding_size, vocab: 'Word2VecVocab', neg_sample=20, padding_idx=0, neg_weight=True):
    """
    :param vocab_size: number of words in the vocabulary
    :param embedding_size: dimension of the word vectors
    :param vocab: Word2VecVocab instance (provides idx2freq for the negative-sampling weights)
    :param neg_sample: number of negative samples (5~20 for small datasets, 2~5 for large datasets)
    :param padding_idx: index of the padding token (kept as a zero vector)
    :param neg_weight: if True, draw negatives proportionally to unigram frequency ** 0.75
    """
    super(SGNSModel, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.ivectors = torch.nn.Embedding(self.vocab_size, self.embedding_size, padding_idx=padding_idx)
    self.ovectors = torch.nn.Embedding(self.vocab_size, self.embedding_size, padding_idx=padding_idx)
    # row 0 (padding/UNK) stays zero; the remaining rows are initialized uniformly in [-0.5/dim, 0.5/dim]
    self.ivectors.weight = torch.nn.Parameter(
        torch.cat([
            torch.zeros(1, self.embedding_size),
            torch.FloatTensor(self.vocab_size - 1, self.embedding_size).uniform_(
                -0.5 / self.embedding_size, 0.5 / self.embedding_size)
        ]))
    self.ovectors.weight = torch.nn.Parameter(
        torch.cat([
            torch.zeros(1, self.embedding_size),
            torch.FloatTensor(self.vocab_size - 1, self.embedding_size).uniform_(
                -0.5 / self.embedding_size, 0.5 / self.embedding_size)
        ]))
    self.ivectors.weight.requires_grad = True
    self.ovectors.weight.requires_grad = True
    self.vocab_size = len(vocab)  # use the actual vocab length from here on
    self.neg_sample = neg_sample
    if (neg_weight and neg_sample > 0) and (vocab is not None and vocab.idx2freq is not None):
        # negative-sampling distribution: unigram frequency raised to the 3/4 power, then normalized
        self.ns_weights = numpy.power(vocab.idx2freq, 0.75)
        self.ns_weights = torch.FloatTensor(self.ns_weights / self.ns_weights.sum())
    else:
        self.ns_weights = None
    log.info(f'SGNSModel(vocab_size: {NumUtil.comma_str(self.vocab_size)}, embedding_size: {embedding_size}, neg_sample: {self.neg_sample})')
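# Usage sketch (not part of the original source): build the model from a previously saved
# vocab and draw negatives from the unigram^0.75 table. The vocab file path is a placeholder;
# Word2VecVocab and SGNSModel are assumed importable from this repo.
import torch

vocab = Word2VecVocab.load('data/vocab.pkl')  # hypothetical path
model = SGNSModel(vocab_size=len(vocab), embedding_size=300, vocab=vocab,
                  neg_sample=20, padding_idx=0, neg_weight=True)
# negatives for one center word, sampled with replacement from ns_weights
negatives = torch.multinomial(model.ns_weights, num_samples=model.neg_sample, replacement=True)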
def build(cls, text_file: str, vocab_size=int(1e5), token=TOKEN, min_count=2, data_dir=WORD2VEC_DATA_DIR) -> 'Word2VecVocab':
    log.info(f"building vocab... {text_file}")
    if data_dir is None:
        data_dir = os.path.dirname(text_file)
    filepath = cls.get_filepath(data_dir, text_file, vocab_size)
    log.info(filepath)

    total_lines = FileUtil.count_lines(text_file)
    word2cnt = {}
    if text_file.endswith('.gz') or text_file.endswith('.zip'):
        f = gzip.open(text_file, 'rt', encoding='utf-8')  # text mode, so lines are str not bytes
    else:
        f = codecs.open(text_file, 'r', encoding='utf-8')
    with f:
        for no, line in enumerate(f):
            if no % 10000 == 0:
                log.info(f"{os.path.basename(text_file)} {no/total_lines*100:.1f}% read.")
            line = line.strip()
            if len(line) == 0:
                continue
            for word in line.split():
                word2cnt[word] = word2cnt.get(word, 0) + 1

    # drop rare words
    for word, cnt in word2cnt.copy().items():
        if cnt < min_count:
            del word2cnt[word]

    log.info(f'total unique words: {NumUtil.comma_str(len(word2cnt) + 1)}')  # +1 for the UNK token
    idx2word = sorted(word2cnt, key=word2cnt.get, reverse=True)
    idx2word = [cls.UNK_CHAR] + idx2word[:vocab_size - 1]
    word2cnt[cls.UNK_CHAR] = 1
    idx2freq = numpy.array([word2cnt[word] for word in idx2word])
    idx2freq = idx2freq / idx2freq.sum()

    vocab = Word2VecVocab(token=token, min_count=min_count, idx2word=idx2word, idx2freq=idx2freq)
    vocab.save(filepath=filepath)
    log.info(f"build vocab OK. {filepath}")
    return vocab
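# Usage sketch (not in the original source): build a vocab from a raw sentence file.
# The text-file and data-dir paths are placeholders.
vocab = Word2VecVocab.build(text_file='data/wiki.sentences.txt.gz',  # hypothetical path
                            vocab_size=int(1e5), token='word', min_count=2,
                            data_dir='./data')
print(len(vocab.idx2word), vocab.idx2word[:5])  # UNK_CHAR first, then the most frequent words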
def build(cls, text_file: str, vocab: Word2VecVocab, window=5, side='both', data_dir=None) -> 'Word2VecCorpus':
    log.info(f"build corpus... {text_file}")
    if data_dir is None:
        data_dir = os.path.dirname(text_file)
    filepath = cls.get_filepath(data_dir=data_dir, vocab=vocab, window=window, side=side)
    if os.path.exists(filepath):
        log.info(f"corpus file exists. load {filepath}")
        return Word2VecCorpus.load(filepath)

    total_lines = FileUtil.count_lines(text_file)
    word2idx = {word: idx for idx, word in enumerate(vocab.idx2word)}
    data = []
    if text_file.endswith('.gz') or text_file.endswith('.zip'):
        f = gzip.open(text_file, 'rt', encoding='utf-8')  # text mode, so lines are str not bytes
    else:
        f = codecs.open(text_file, 'r', encoding='utf-8')
    with f:
        for no, line in enumerate(f):
            if no % 100000 == 0:
                log.info(f"{os.path.basename(text_file)} {no/total_lines*100:.1f}% read.")
            line = line.strip()
            if len(line) == 0:
                continue
            # replace out-of-vocabulary words with the UNK token
            sent = [word if word in word2idx else Word2VecVocab.UNK_CHAR for word in line.split()]
            for i in range(len(sent)):
                iword, owords = cls.skipgram(sent, i, window=window, side=side)
                data.append((word2idx[iword], [word2idx[oword] for oword in owords]))

    corpus = Word2VecCorpus(data=data, vocab=vocab, window=window, side=side)
    corpus.save(filepath=filepath)
    log.info(f"build corpus OK. {filepath}")
    return corpus
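# Sketch (assumption, not the repo's code): cls.skipgram() is defined elsewhere in this repo.
# For side='both' it is assumed to return the center word plus a fixed-size context window,
# padded with UNK_CHAR at sentence boundaries, roughly like this:
def skipgram_both(sent, i, window=5):
    left = sent[max(0, i - window):i]
    right = sent[i + 1:i + 1 + window]
    # pad so every sample has exactly 2 * window context words
    owords = ([Word2VecVocab.UNK_CHAR] * (window - len(left)) + left
              + right + [Word2VecVocab.UNK_CHAR] * (window - len(right)))
    return sent[i], owords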
def train(self, iterations: int, batch: int, embedding: Word2VecEmbedding, args: argparse.Namespace) -> str:
    batches_in_epoch = int(numpy.ceil(len(self.dataloader.dataset) / batch))
    total_batches = batches_in_epoch * iterations
    nth_total_batch = 0
    log.info(f'batches_in_epoch: {batches_in_epoch}')
    log.info(f'total_batches: {total_batches}')

    watch = WatchUtil(auto_stop=False)
    watch.start()
    best_loss = float("inf")
    first_epoch, last_epoch = self.epoch + 1, self.epoch + iterations + 1
    last_embedding_file = None

    log.info(Word2VecEmbedding.get_filenpath(args))
    for self.epoch in range(first_epoch, last_epoch):
        log.info(f"[e{self.epoch:2d}] {self}")
        loss_list = []
        for nth, (iword, owords) in enumerate(self.dataloader, 1):
            try:
                loss = self.sgns(iword, owords)
            except RuntimeError:
                loss_list = [float('-inf')]
                break

            self.optim.zero_grad()
            loss.backward()
            self.optim.step()
            # if nth_batch == 1 and self.scheduler is not None and self.epoch >= self.decay_start_epoch:  # TODO: TEST
            #     self.scheduler.step()

            if self.learning_decay != 0:
                PytorchUtil.set_learning_rate(self.optim, self.epoch, gamma=self.learning_decay, base_lr=self.init_lr,
                                              min_lr=1e-10, decay_start=2, decay_interval=3)

            lr = PytorchUtil.get_learning_rate(self.optim)
            _, negatives = owords.size()
            real_loss = loss.data[0] / float(negatives)  # average loss per negative sample

            loss_list.append(real_loss)

            nth_total_batch += 1
            progressed = nth_total_batch / total_batches
            seconds_per_batch = float(watch.elapsed()) / float(nth_total_batch)
            remain_batches = total_batches - nth_total_batch
            remain_secs = int(seconds_per_batch * remain_batches)

            if nth == 1 or nth == batches_in_epoch or nth % 1000 == 0:
                log.info(f"[e{self.epoch:2d}][b{nth:5d}/{batches_in_epoch:5d}][{progressed*100:.1f}% remain: {DateUtil.secs_to_string(remain_secs)}][window: {self.window}][lr: {lr:.0e}] loss: {real_loss:.7f}")

        total_loss = numpy.mean(loss_list)
        log.info(f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss: {total_loss:.7f}, best_loss: {best_loss:.7f}")
        if total_loss > best_loss or total_loss == float('inf') or total_loss == float('-inf'):  # worse loss than before, or diverged
            log.info('')
            log.info(f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss > best_loss BREAK")
            log.info('')
            break
        else:
            if total_loss < best_loss:
                best_loss = total_loss
            log.info(f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save()...")
            args.epoch = self.epoch
            last_embedding_file = embedding.save(idx2vec=self.embedding, filepath=Word2VecEmbedding.get_filenpath(args))
            log.info(f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save() OK. {os.path.basename(embedding.filepath)}")
    return last_embedding_file
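# Sketch (assumption, not the repo's code): PytorchUtil.set_learning_rate() is assumed to apply
# a stepped exponential decay with a floor, roughly equivalent to:
def set_learning_rate(optim, epoch, gamma, base_lr, min_lr=1e-10, decay_start=2, decay_interval=3):
    if epoch < decay_start:
        lr = base_lr
    else:
        steps = (epoch - decay_start) // decay_interval + 1
        lr = max(base_lr * (gamma ** steps), min_lr)
    for param_group in optim.param_groups:  # torch.optim optimizers expose param_groups
        param_group['lr'] = lr
    return lr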
parser.add_argument('--subsample', default=Word2VecEmbedding.SUBSAMPLE, type=float, help="subsample threshold (default: 1e-5)")
parser.add_argument('--learning_rate', default=Word2VecEmbedding.LEARNING_RATE, type=float, help="learning rate for AdamOptimizer")
parser.add_argument('--learning_decay', default=Word2VecEmbedding.LEARNING_DECAY, type=float, help="exponential decay gamma (default: 0.0 = no decay)")
args = parser.parse_args()
log.info(args)

watch = WatchUtil(auto_stop=True)
try:
    log.info(f'load {args.corpus_file} ...')
    watch.start()
    corpus = Word2VecCorpus.load(filepath=args.corpus_file)
    log.info(f'load {args.corpus_file} OK. (elapsed: {watch.elapsed_string()})')
    log.info(corpus.vocab)

    if len(corpus.vocab) > 1e5:  # fall back to CPU to avoid running out of 11GB GPU memory
        args.device_no = None
    log.info('')
parser.add_argument('--vocab_size', default=Word2VecVocab.MAX_VOCAB, type=int, help="maximum vocabulary size (default: 1e5)")
parser.add_argument('--token', default=Word2VecVocab.TOKEN, choices=['word', 'morph', 'character', 'jaso'],
                    help="token unit: word, morph, character or jaso (default: 'word')")
parser.add_argument('--min_count', default=Word2VecVocab.MIN_COUNT, type=int)
args = parser.parse_args()

try:
    if not os.path.exists(args.text_file):
        log.error(f'text file does not exist. {args.text_file}')
        exit(-1)

    vocab = Word2VecVocab.build(text_file=args.text_file, vocab_size=args.vocab_size, token=args.token,
                                min_count=args.min_count, data_dir=args.data_dir)
    log.info(f'vocab: {vocab.filepath} {NumUtil.comma_str(len(vocab))}')
    log.info(f'vocab.idx2word: {vocab.idx2word[:10]}')
    log.info(f'vocab.idx2freq: {vocab.idx2freq[:10]}')
except:
    log.error(traceback.format_exc())
@property
def data2text(self):
    """Yield each (center word, context words) pair as readable text instead of indices."""
    for iword, owords in self.data:
        yield self.vocab.idx2word[iword], [self.vocab.idx2word[o] for o in owords]


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--text_file', default=WIKIPEDIA_SENTENCE_FILE, type=str, help="corpus file path")
    parser.add_argument('--data_dir', default=WORD2VEC_DATA_DIR, type=str, help="data directory path (default: './data')")
    parser.add_argument('--vocab_file', default=Word2VecVocab.DEFAULT_FILE, type=str)
    parser.add_argument('--window', default=Word2VecCorpus.WINDOW, type=int, help="window size")
    parser.add_argument('--side', default=Word2VecCorpus.SIDE, type=str, choices=['both', 'front', 'back'],
                        help="take context words from the front, the back, or both sides (default: both)")
    args = parser.parse_args()

    try:
        log.info(f'vocab_file {args.vocab_file}')
        if not os.path.exists(args.vocab_file):
            log.error(f'vocab file does not exist. {args.vocab_file}')
            exit(-1)

        vocab = Word2VecVocab.load(args.vocab_file)
        log.info(vocab)
        for args.window in [args.window]:  # [1, 2, 3, 4, 5]:
            for args.side in [args.side]:  # ['both', 'front', 'back']:
                log.info(f'window: {args.window} side: {args.side}')
                corpus = Word2VecCorpus.build(text_file=args.text_file, vocab=vocab, window=args.window,
                                              side=args.side, data_dir=args.data_dir)
                log.info(f'corpus: {corpus.filepath} {NumUtil.comma_str(len(corpus))}')
    except:
        log.error(traceback.format_exc())
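# Usage sketch (not in the original source): inspect a built corpus in readable form via the
# data2text property. The corpus file path is a placeholder.
corpus = Word2VecCorpus.load('data/corpus.word.window_5.both.pkl')  # hypothetical path
for i, (iword, owords) in enumerate(corpus.data2text):
    print(iword, owords)  # a center word followed by its context words
    if i >= 4:
        break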
def word2vec_tensorboard(embedding_file_list, top_n=int(1e5), output_dir=TENSORBOARD_LOG_DIR):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    for filename in os.listdir(output_dir):
        os.remove(os.path.join(output_dir, filename))  # remove old tensorboard files

    config = projector.ProjectorConfig()
    embedding_list = []
    for embedding_file in embedding_file_list:
        if not os.path.exists(embedding_file):
            log.info(f'{embedding_file} does not exist. skipped.')
            continue

        embedding = Word2VecEmbedding.load(embedding_file)
        name = os.path.basename(embedding_file.replace('+', ''))
        while name.startswith('_'):
            name = name[1:]

        idx2vec = embedding.idx2vec
        idx2word, idx2freq = embedding.idx2word, embedding.idx2freq
        if top_n > 0:
            name += f'.top_n_{top_n}'
            idx2vec, idx2word, idx2freq = idx2vec[:top_n], embedding.idx2word[:top_n], embedding.idx2freq[:top_n]

        embedding_var = tf.Variable(idx2vec, name=name)
        embedding_list.append(embedding_var)
        proj_embedding = config.embeddings.add()  # avoid shadowing the loaded Word2VecEmbedding
        proj_embedding.tensor_name = embedding_var.name
        proj_embedding.metadata_path = os.path.join(output_dir, f'{name}.tsv')

        log.info('')
        log.info(f'{embedding_file} loaded.')
        log.info(f'embedding_var.name: {embedding_var.name} shape: {embedding_var.shape}')
        log.info(f'embedding.metadata_path: {proj_embedding.metadata_path}')
        with open(proj_embedding.metadata_path, 'wt') as out_f:
            out_f.write('spell\tfreq\n')
            for spell, freq in zip(idx2word, idx2freq):
                out_f.write(f'{spell}\t{freq:.7f}\n')

    summary_writer = tf.summary.FileWriter(output_dir)
    projector.visualize_embeddings(summary_writer, config)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(var_list=embedding_list)
        checkpoint_file = os.path.join(output_dir, f'{name}.ckpt')  # uses the name of the last embedding processed
        saver.save(sess, checkpoint_file, global_step=None)
        log.info(f'checkpoint_file: {checkpoint_file}')

    # change absolute paths to relative paths so the log directory can be moved
    for filename in ['checkpoint', 'projector_config.pbtxt']:
        filepath = os.path.join(output_dir, filename)
        lines = []
        with open(filepath, 'rt') as f:
            for line in f.readlines():
                lines.append(line.replace(output_dir, '.'))
        os.remove(filepath)
        with open(filepath, 'wt') as f:
            for line in lines:
                f.write(line)
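# Usage sketch (not in the original source): export one embedding file for the TensorBoard
# projector, then point TensorBoard at the log directory. The embedding path is a placeholder.
word2vec_tensorboard(embedding_file_list=['data/ko.wikipedia.sentences.word.embedding'],  # hypothetical path
                     top_n=int(1e5), output_dir=TENSORBOARD_LOG_DIR)
# afterwards, run e.g. `tensorboard --logdir <TENSORBOARD_LOG_DIR>` and open the Projector tab.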