Example 1
    def collect_characters(sentences_file: str,
                           characters_file: str,
                           max_test: int = 0):
        """
        Read the sentences file and extract the set of unique characters (syllables).
        These are used later to build one-hot vectors over the corpus.
        :param sentences_file: *.sentences file path
        :param characters_file: *.characters file path
        :param max_test: 0 = process all lines
        :return:
        """
        total = FileUtil.count_lines(sentences_file, gzip_format=True)
        log.info('total: %s' % NumUtil.comma_str(total))

        char_set = set()
        with gzip.open(sentences_file, 'rt') as f:
            for i, sentence in enumerate(f, start=1):
                if i % 10000 == 0:
                    log.info(
                        '%s %.1f%% written.' %
                        (os.path.basename(characters_file), i / total * 100))
                char_set.update(sentence)
                if 0 < max_test <= i:
                    break

        char_list = sorted(char_set)
        if max_test == 0:  # 0=full
            with open(characters_file, 'w') as f:
                for c in char_list:
                    f.write(c)
                    f.write('\n')
                log.info('written to %s OK.' % characters_file)
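
A minimal usage sketch (the file names are placeholders, and collect_characters is assumed to be exposed as a staticmethod on its enclosing class, which is not shown in this excerpt):

    # Hypothetical call: scan every sentence (max_test=0) and write each unique
    # character on its own line in the characters file.
    collect_characters(sentences_file='ko.wikipedia.org.sentences.gz',
                       characters_file='ko.wikipedia.org.characters',
                       max_test=0)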
Example 2
    def build(cls,
              text_file: str,
              vocab: Word2VecVocab,
              window=5,
              side='both',
              data_dir=None) -> 'Word2VecCorpus':
        log.info(f"build corpus... {text_file}")
        if data_dir is None:
            data_dir = os.path.dirname(text_file)
        filepath = cls.get_filepath(data_dir=data_dir,
                                    vocab=vocab,
                                    window=window,
                                    side=side)

        if os.path.exists(filepath):
            log.info(f"corpus file exists. load {filepath}")
            return Word2VecCorpus.load(filepath)

        total_lines = FileUtil.count_lines(text_file)
        word2idx = {word: idx for idx, word in enumerate(vocab.idx2word)}
        data = []
        if text_file.endswith('.gz') or text_file.endswith('zip'):
            f = gzip.open(text_file, 'rt', encoding='utf-8')  # text mode so lines are str
        else:
            f = codecs.open(text_file, 'r', encoding='utf-8')
        with f:
            for no, line in enumerate(f):
                if no % 100000 == 0:
                    log.info(
                        f"{os.path.basename(text_file)} {no/total_lines*100:.1f}% readed."
                    )
                line = line.strip()
                if len(line) == 0:
                    continue
                sent = []
                for word in line.split():
                    if word in word2idx:
                        sent.append(word)
                    else:
                        sent.append(Word2VecVocab.UNK_CHAR)
                for i in range(len(sent)):
                    iword, owords = cls.skipgram(sent,
                                                 i,
                                                 window=window,
                                                 side=side)
                    data.append((word2idx[iword],
                                 [word2idx[oword] for oword in owords]))

        corpus = Word2VecCorpus(data=data,
                                vocab=vocab,
                                window=window,
                                side=side)
        corpus.save(filepath=filepath)
        log.info(f"build corpus OK. {filepath}")
        return corpus
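
The skipgram helper used above is not part of this excerpt. A minimal sketch of what it could look like, assuming side='both' means up to `window` words on each side of the center word (the exact 'left'/'right' semantics and any boundary padding in the original are assumptions here):

    @classmethod
    def skipgram(cls, sent, i, window=5, side='both'):
        # Return the center word and its surrounding context words.
        iword = sent[i]
        left = sent[max(0, i - window):i] if side in ('both', 'left') else []
        right = sent[i + 1:i + 1 + window] if side in ('both', 'right') else []
        return iword, left + right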
Example 3
 def __log_path_from_path(source_file, sub_log_dir='logs'):
     """
     :param source_file:
     :param sub_log_dir:
     e.g. LogUtil.get_logger(__file__)                   ./a.py -> ./logs/a.log
     e.g. LogUtil.get_logger(__file__, sub_log_dir='xx') ./a.py -> ./xx/a.log
     """
     _dir = os.path.join(os.path.dirname(source_file), sub_log_dir)
     _basename = os.path.basename(source_file)
     if len(sys.argv) > 1:
         _basename = '%s.%s' % (
             _basename, FileUtil.to_filename('.'.join(sys.argv[1:])))
     log_path = os.path.join(_dir, _basename) + '.log'
     return log_path
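
A worked example of the mapping, following the code as written (when the script is started with extra command-line arguments, they are sanitized with FileUtil.to_filename and appended to the basename first):

    # __log_path_from_path('/home/user/project/a.py')
    #   -> '/home/user/project/logs/a.py.log'
    # __log_path_from_path('/home/user/project/a.py', sub_log_dir='xx')
    #   -> '/home/user/project/xx/a.py.log'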
Example 4
    def build(cls,
              text_file: str,
              vocab_size=int(1e5),
              token=TOKEN,
              min_count=2,
              data_dir=WORD2VEC_DATA_DIR) -> 'Word2VecVocab':
        log.info(f"building vocab... {text_file}")
        if data_dir is None:
            data_dir = os.path.dirname(text_file)
        filepath = cls.get_filepath(data_dir, text_file, vocab_size)
        log.info(filepath)

        total_lines = FileUtil.count_lines(text_file)
        word2cnt = {}
        if text_file.endswith('.gz') or text_file.endswith('zip'):
            f = gzip.open(text_file, 'rt', encoding='utf-8')  # text mode so lines are str
        else:
            f = codecs.open(text_file, 'r', encoding='utf-8')
        with f:
            for no, line in enumerate(f):
                if no % 10000 == 0:
                    log.info(
                        f"{os.path.basename(text_file)} {no/total_lines*100:.1f}% readed."
                    )
                line = line.strip()
                if len(line) == 0:
                    continue
                sent = line.split()
                for word in sent:
                    word2cnt[word] = word2cnt.get(word, 0) + 1

        word2cnt = {word: cnt for word, cnt in word2cnt.items() if cnt >= min_count}

        log.info(f'total unique words: {NumUtil.comma_str(len(word2cnt) + 1)}')
        idx2word = sorted(word2cnt, key=word2cnt.get, reverse=True)
        idx2word = [cls.UNK_CHAR] + idx2word[:vocab_size - 1]
        word2cnt[cls.UNK_CHAR] = 1
        idx2freq = numpy.array([word2cnt[word] for word in idx2word])
        idx2freq = idx2freq / idx2freq.sum()

        vocab = Word2VecVocab(token=token,
                              min_count=min_count,
                              idx2word=idx2word,
                              idx2freq=idx2freq)
        vocab.save(filepath=filepath)
        log.info(f"build vocab OK. {filepath}")
        return vocab
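
A minimal end-to-end sketch tying this into Example 2 (the corpus file name is a placeholder, and TOKEN / WORD2VEC_DATA_DIR are module-level defaults not shown in this excerpt):

    # Hypothetical driver: build the vocabulary first, then the skip-gram corpus on top of it.
    vocab = Word2VecVocab.build(text_file='corpus.txt.gz', vocab_size=int(1e5), min_count=2)
    corpus = Word2VecCorpus.build(text_file='corpus.txt.gz', vocab=vocab, window=5, side='both')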
Example 5
        if len(sys.argv) == 4:  # n_train, left_gram, right_gram passed on the command line
            n_train = int(sys.argv[1])
            left_gram = int(sys.argv[2])
            right_gram = int(sys.argv[3])
        else:
            n_train, left_gram, right_gram = 100, 2, 2
            # n_train = int('1,000,000'.replace(',', ''))  # 1M data (training took about 17 hours)

        ngram = left_gram + right_gram
        n_valid, n_test = 100, 100
        log.info('n_train: %s' % NumUtil.comma_str(n_train))
        log.info('n_valid: %s' % NumUtil.comma_str(n_valid))
        log.info('n_test: %s' % NumUtil.comma_str(n_test))
        log.info('left_gram: %s, right_gram: %s' % (left_gram, right_gram))
        log.info('ngram: %s' % ngram)

        total_sentences = FileUtil.count_lines(KO_WIKIPEDIA_ORG_SENTENCES_FILE)
        model_file = os.path.join(KO_WIKIPEDIA_ORG_WORD_SPACING_MODEL_DIR,
                                  'word_spacing_model.sentences=%s.left_gram=%s.right_gram=%s/model' % (
                                      n_train, left_gram, right_gram))  # .%s' % max_sentences
        log.info('model_file: %s' % model_file)

        batch_size = 500  # mini batch size
        log.info('batch_size: %s' % batch_size)

        total_epoch = 100  # min(100, 1000000 // n_train)  # 1 ~ 100
        features_vector = CharOneHotVector(DataFileUtil.read_list(characters_file))
        labels_vector = CharOneHotVector([0, 1])  # 0 = no space (joined), 1 = space
        n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
        n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # 2 classes use a single output unit (len=1); one-hot only for 3+ classes
        n_hidden1 = 100
        learning_rate = 0.001  # min(0.1, 0.001 * total_epoch)  # 0.1 ~ 0.001
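
To make the feature arithmetic concrete: each of the ngram (= left_gram + right_gram = 4) context characters becomes a one-hot vector over the character set, and those vectors are concatenated, which is where the "17,380 * 4" in the comment above comes from (17,380 being the size of the character list read from characters_file):

    n_chars = 17380               # len(features_vector), per the comment above
    ngram = 2 + 2                 # left_gram + right_gram
    n_features = n_chars * ngram  # 69,520 input features per training example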
Example 6
        return base64.b64encode(s)

    @staticmethod
    def decodes(s):
        return base64.b64decode(s)

    def encode(self):
        self.s = Base64Util.encodes(self.s)
        return self.s

    def decode(self):
        self.s = Base64Util.decodes(self.s)
        return self.s


if __name__ == '__main__':
    in_file_path = base_util.real_path("input/Penguins.jpg")
    out_data_file_path = base_util.real_path("output/Penguins.jpg.base64.txt")
    out_file_path = base_util.real_path("output/Penguins.jpg")
    if os.path.exists(out_file_path):
        os.remove(out_file_path)
    out_data = Base64Util.encodes(FileUtil.reads(in_file_path, is_binary=True))
    FileUtil.writes(out_data, out_data_file_path)
    out_file = Base64Util.decodes(out_data)
    FileUtil.writes(out_file, out_file_path)

    _in = '박'
    out = Base64Util(_in).encode()
    print('Base64Util.encode(%s) -> %s' % (_in, out))
    print('Base64Util.decode(%s) -> %s' % (out, Base64Util(out).decode()))
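
For reference, the same round trip with the standard library alone, assuming UTF-8 when converting between str and bytes (base64.b64encode/b64decode operate on bytes, which is what Base64Util passes through):

    import base64

    raw = '박'.encode('utf-8')                            # str -> bytes
    encoded = base64.b64encode(raw)                       # b'67CV'
    decoded = base64.b64decode(encoded).decode('utf-8')   # bytes -> '박'
    assert decoded == '박'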