def collect_characters(sentences_file: str, characters_file: str, max_test: int = 0):
    """
    Read a sentences file and extract its unique characters (syllables).
    These are later used to build corpus-based one-hot vectors.
    :param sentences_file: *.sentences file path
    :param characters_file: *.characters file path
    :param max_test: 0=run all
    :return:
    """
    total = FileUtil.count_lines(sentences_file, gzip_format=True)
    log.info('total: %s' % NumUtil.comma_str(total))

    char_set = set()
    with gzip.open(sentences_file, 'rt') as f:
        for i, sentence in enumerate(f, start=1):
            if i % 10000 == 0:
                log.info('%s %.1f%% done.' % (os.path.basename(characters_file), i / total * 100))
            char_set.update(sentence)  # add every character in the sentence
            if 0 < max_test <= i:
                break

    char_list = sorted(char_set)
    if max_test == 0:  # 0=full run; only the full character set is persisted
        with open(characters_file, 'w') as f:
            for c in char_list:
                f.write(c)
                f.write('\n')
        log.info('written to %s OK.' % characters_file)
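# Usage sketch (paths are hypothetical; assumes the repo's log/FileUtil/NumUtil
# helpers are importable). The input must be gzipped, since collect_characters
# opens it with gzip.open. Note that nothing is written when max_test > 0.
#
# collect_characters('data/ko.wikipedia.org.sentences.gz',
#                    'data/ko.wikipedia.org.characters')                  # full run
# collect_characters('data/ko.wikipedia.org.sentences.gz',
#                    'data/ko.wikipedia.org.characters', max_test=1000)   # smoke test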
def build(cls, text_file: str, vocab: Word2VecVocab, window=5, side='both', data_dir=None) -> 'Word2VecCorpus':
    log.info(f"build corpus... {text_file}")
    if data_dir is None:
        data_dir = os.path.dirname(text_file)

    filepath = cls.get_filepath(data_dir=data_dir, vocab=vocab, window=window, side=side)
    if os.path.exists(filepath):
        log.info(f"corpus file exists. load {filepath}")
        return Word2VecCorpus.load(filepath)

    total_lines = FileUtil.count_lines(text_file)
    word2idx = {word: idx for idx, word in enumerate(vocab.idx2word)}

    data = []
    if text_file.endswith('.gz') or text_file.endswith('.zip'):
        f = gzip.open(text_file, 'rt', encoding='utf-8')  # text mode so lines are str, not bytes
    else:
        f = codecs.open(text_file, 'r', encoding='utf-8')

    with f:
        for no, line in enumerate(f):
            if no % 100000 == 0:
                log.info(f"{os.path.basename(text_file)} {no / total_lines * 100:.1f}% read.")

            line = line.strip()
            if len(line) == 0:
                continue

            # map each token to itself if in the vocab, else to the UNK marker
            sent = [word if word in word2idx else Word2VecVocab.UNK_CHAR for word in line.split()]

            for i in range(len(sent)):
                iword, owords = cls.skipgram(sent, i, window=window, side=side)
                data.append((word2idx[iword], [word2idx[oword] for oword in owords]))

    corpus = Word2VecCorpus(data=data, vocab=vocab, window=window, side=side)
    corpus.save(filepath=filepath)
    log.info(f"build corpus OK. {filepath}")
    return corpus
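# The cls.skipgram helper is not shown in this excerpt. A minimal sketch of the
# standard skip-gram sample extraction it is assumed to perform: pair the center
# word at position i with the context words inside `window`, padding with the
# UNK marker at sentence boundaries so every sample has a fixed width.
def skipgram_sketch(sentence, i, window=5, side='both'):
    UNK = '<UNK>'  # stand-in for Word2VecVocab.UNK_CHAR
    iword = sentence[i]
    left = sentence[max(0, i - window):i]
    right = sentence[i + 1:i + 1 + window]
    if side == 'left':
        owords = [UNK] * (window - len(left)) + left
    elif side == 'right':
        owords = right + [UNK] * (window - len(right))
    else:  # 'both'
        owords = [UNK] * (window - len(left)) + left + right + [UNK] * (window - len(right))
    return iword, owords

# skipgram_sketch(['a', 'b', 'c'], 0, window=2)
# -> ('a', ['<UNK>', '<UNK>', 'b', 'c'])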
def __log_path_from_path(source_file, sub_log_dir='logs'):
    """
    Build a log file path next to the given source file.
    :param source_file: path of the calling source file (usually __file__)
    :param sub_log_dir: subdirectory that holds the log file
    e.g. LogUtil.get_logger(__file__)                   ./a.py -> ./logs/a.log
    e.g. LogUtil.get_logger(__file__, sub_log_dir='xx') ./a.py -> ./xx/a.log
    """
    _dir = os.path.join(os.path.dirname(source_file), sub_log_dir)
    _basename = os.path.basename(source_file)
    if len(sys.argv) > 1:  # append command-line args so runs with different args log to different files
        _basename = '%s.%s' % (_basename, FileUtil.to_filename('.'.join(sys.argv[1:])))
    log_path = os.path.join(_dir, _basename) + '.log'
    return log_path
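# Expected outputs (hypothetical invocations; the exact argv suffix depends on
# FileUtil.to_filename, assumed here to leave 'train.100' unchanged):
#   python a.py            -> ./logs/a.py.log
#   python a.py train 100  -> ./logs/a.py.train.100.log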
def build(cls, text_file: str, vocab_size=int(1e5), token=TOKEN, min_count=2,
          data_dir=WORD2VEC_DATA_DIR) -> 'Word2VecVocab':
    log.info(f"building vocab... {text_file}")
    if data_dir is None:
        data_dir = os.path.dirname(text_file)

    filepath = cls.get_filepath(data_dir, text_file, vocab_size)
    log.info(filepath)

    total_lines = FileUtil.count_lines(text_file)
    word2cnt = {}
    if text_file.endswith('.gz') or text_file.endswith('.zip'):
        f = gzip.open(text_file, 'rt', encoding='utf-8')  # text mode so tokens are str, not bytes
    else:
        f = codecs.open(text_file, 'r', encoding='utf-8')

    with f:
        for no, line in enumerate(f):
            if no % 10000 == 0:
                log.info(f"{os.path.basename(text_file)} {no / total_lines * 100:.1f}% read.")

            line = line.strip()
            if len(line) == 0:
                continue

            for word in line.split():
                word2cnt[word] = word2cnt.get(word, 0) + 1

    # drop rare words (iterate over a copy so we can delete while looping)
    for word, cnt in word2cnt.copy().items():
        if cnt < min_count:
            del word2cnt[word]

    log.info(f'total unique words: {NumUtil.comma_str(len(word2cnt) + 1)}')  # +1 for the UNK token

    idx2word = sorted(word2cnt, key=word2cnt.get, reverse=True)  # most frequent first
    idx2word = [cls.UNK_CHAR] + idx2word[:vocab_size - 1]        # reserve index 0 for UNK
    word2cnt[cls.UNK_CHAR] = 1

    idx2freq = numpy.array([word2cnt[word] for word in idx2word])
    idx2freq = idx2freq / idx2freq.sum()  # normalize counts to relative frequencies

    vocab = Word2VecVocab(token=token, min_count=min_count, idx2word=idx2word, idx2freq=idx2freq)
    vocab.save(filepath=filepath)
    log.info(f"build vocab OK. {filepath}")
    return vocab
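# End-to-end sketch chaining the two build steps above (the corpus path is
# hypothetical; any UTF-8 text file with one sentence per line works):
#
# vocab = Word2VecVocab.build('data/ko.wikipedia.org.txt', vocab_size=int(1e5), min_count=2)
# corpus = Word2VecCorpus.build('data/ko.wikipedia.org.txt', vocab=vocab, window=5, side='both')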
if len(sys.argv) == 4:  # n_train, left_gram, right_gram passed on the command line
    n_train = int(sys.argv[1])
    left_gram = int(sys.argv[2])
    right_gram = int(sys.argv[3])
else:
    n_train, left_gram, right_gram = 100, 2, 2
    # n_train = int('1,000,000'.replace(',', ''))  # 1M sentences (training takes ~17 hours)

ngram = left_gram + right_gram
n_valid, n_test = 100, 100
log.info('n_train: %s' % NumUtil.comma_str(n_train))
log.info('n_valid: %s' % NumUtil.comma_str(n_valid))
log.info('n_test: %s' % NumUtil.comma_str(n_test))
log.info('left_gram: %s, right_gram: %s' % (left_gram, right_gram))
log.info('ngram: %s' % ngram)

total_sentences = FileUtil.count_lines(KO_WIKIPEDIA_ORG_SENTENCES_FILE)
model_file = os.path.join(
    KO_WIKIPEDIA_ORG_WORD_SPACING_MODEL_DIR,
    'word_spacing_model.sentences=%s.left_gram=%s.right_gram=%s/model' % (n_train, left_gram, right_gram))
log.info('model_file: %s' % model_file)

batch_size = 500  # mini-batch size
log.info('batch_size: %s' % batch_size)
total_epoch = 100  # min(100, 1000000 // n_train)  # 1 ~ 100

features_vector = CharOneHotVector(DataFileUtil.read_list(characters_file))
labels_vector = CharOneHotVector([0, 1])  # 0=no space, 1=space
n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # binary task -> single sigmoid output
n_hidden1 = 100
learning_rate = 0.001  # min(0.1, 0.001 * total_epoch)  # 0.1 ~ 0.001
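# How n_features = len(features_vector) * ngram comes about: each character in
# the (left_gram + right_gram)-wide window around a spacing boundary gets its
# own one-hot block, and the blocks are concatenated. A self-contained sketch
# (the tiny character inventory and the function name are illustrative, not the
# repo's CharOneHotVector API):
import numpy as np

_chars = ['가', '나', '다', '라']  # illustrative; the real inventory has ~17,380 characters
_char2idx = {c: i for i, c in enumerate(_chars)}

def encode_window_sketch(window_chars):
    """One one-hot block per character -> len(_chars) * ngram features."""
    vec = np.zeros(len(_chars) * len(window_chars))
    for pos, c in enumerate(window_chars):
        vec[pos * len(_chars) + _char2idx[c]] = 1.0
    return vec

# boundary between '나' and '다' in '가나다라' with left_gram=2, right_gram=2:
# encode_window_sketch(['가', '나', '다', '라']).shape -> (16,) == len(_chars) * 4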
class Base64Util:
    def __init__(self, s):
        self.s = s

    @staticmethod
    def encodes(s):
        if isinstance(s, str):
            s = s.encode('utf-8')  # base64 operates on bytes; encode text as UTF-8 first
        return base64.b64encode(s)

    @staticmethod
    def decodes(s):
        return base64.b64decode(s)  # returns bytes; callers decode to str if needed

    def encode(self):
        self.s = Base64Util.encodes(self.s)
        return self.s

    def decode(self):
        self.s = Base64Util.decodes(self.s)
        return self.s


if __name__ == '__main__':
    in_file_path = base_util.real_path("input/Penguins.jpg")
    out_data_file_path = base_util.real_path("output/Penguins.jpg.base64.txt")
    out_file_path = base_util.real_path("output/Penguins.jpg")
    if os.path.exists(out_file_path):
        os.remove(out_file_path)

    # binary round trip: file -> base64 text -> file
    out_data = Base64Util.encodes(FileUtil.reads(in_file_path, is_binary=True))
    FileUtil.writes(out_data, out_data_file_path)
    out_file = Base64Util.decodes(out_data)
    FileUtil.writes(out_file, out_file_path)

    # text round trip
    _in = '박'
    out = Base64Util('박').encode()
    print('Base64Util.encode(%s) -> %s' % (_in, out))
    print('Base64Util.decode(%s) -> %s' % (out, Base64Util(out).decode()))
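# Since base64 operates on bytes, the str handling in encodes() matters for the
# '박' example above. A stdlib-only check of the text round trip:
import base64

_encoded = base64.b64encode('박'.encode('utf-8'))      # b'67CV'
_decoded = base64.b64decode(_encoded).decode('utf-8')  # '박'
assert _decoded == '박'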