Example #1
    def check(self, filepath, lineno):
        """
        Only records cases where a memory leak has occurred two or more times.
        Therefore, the test code must run the same command at least twice.
        :param filepath:
        :param lineno:
        :return:
        """
        if self.enable:
            lineno = int(lineno)
            garbage_len = self.gabage_len()
            leaks_count = garbage_len - self.last_garbage_len
            self.last_garbage_len = garbage_len

            # print('total_bytes:', self.memory.total_memory())
            if leaks_count > 0:  # detect memory leak
                increased_bytes = self.memory.increased_bytes()
                print('increased_bytes:', increased_bytes)
                self.total_leaks_count += leaks_count
                self.total_increased_bytes += increased_bytes
                if self.show_lines:
                    line = 'leaks: %s bytes(%s)\n' % (NumUtil.comma_str(
                        increased_bytes), NumUtil.comma_str(leaks_count))
                    for i in range(lineno - 1, lineno):
                        # print(linecache.getline(filepath, i).strip())
                        line += '\tincreased:%s bytes\t%s:%s\t%s\n' % (
                            NumUtil.comma_str(increased_bytes), filepath, i,
                            linecache.getline(filepath, i).strip())
                    self.total_lines.append(line)
                return increased_bytes
        return 0
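
A hypothetical usage sketch for the checker above. The wrapper function, the use of inspect to obtain the current line number, and the idea that check() and summary() (see Example #4) belong to the same object are assumptions; only those two methods are taken from the examples. Per the docstring, the same operation is executed twice so that a repeated leak can be recorded.

import inspect

def exercise_leak_checker(checker, operation):
    # checker is assumed to be an instance of the (unnamed) class that owns check();
    # run the same operation twice so that a repeated leak can be detected.
    for _ in range(2):
        operation()
        lineno = inspect.currentframe().f_lineno
        checker.check(__file__, lineno)
    print(checker.summary())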
Example #2
    def collect_characters(sentences_file: str,
                           characters_file: str,
                           max_test: int = 0):
        """
        Reads a sentences file and extracts the unique characters (syllables).
        These are later used when building one-hot vectors from the corpus.
        :param sentences_file: *.sentences file path 
        :param characters_file: *.characters file path
        :param max_test: 0=run all 
        :return: 
        """
        total = FileUtil.count_lines(sentences_file, gzip_format=True)
        log.info('total: %s' % NumUtil.comma_str(total))

        char_set = set()
        with gzip.open(sentences_file, 'rt') as f:
            for i, sentence in enumerate(f, 1):
                if i % 10000 == 0:
                    log.info(
                        '%s %.1f%% written.' %
                        (os.path.basename(characters_file), i / total * 100))
                _char_set = set(sentence)
                char_set.update(_char_set)
                if 0 < max_test <= i:
                    break

        char_list = list(char_set)
        char_list.sort()
        if max_test == 0:  # 0=full
            with open(characters_file, 'w') as f:
                for c in char_list:
                    f.write(c)
                    f.write('\n')
                log.info('written to %s OK.' % characters_file)
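
A minimal standalone sketch of the same idea using only the standard library, without the project's FileUtil, NumUtil, and log helpers; the function name and utf8 encoding are assumptions.

import gzip

def collect_characters_sketch(sentences_gz, characters_out):
    # Collect the unique characters of a gzipped sentences file (one sentence per line)
    # and write them out, sorted, one character per line.
    char_set = set()
    with gzip.open(sentences_gz, 'rt', encoding='utf8') as f:
        for sentence in f:
            char_set.update(sentence.rstrip('\n'))
    with open(characters_out, 'w', encoding='utf8') as out:
        for c in sorted(char_set):
            out.write(c + '\n')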
Example #3
    def dump_urls(mongo_url,
                  db_name,
                  collection_name,
                  urls_file,
                  mongo_query=None,
                  limit=0):
        if mongo_query is None:
            mongo_query = {}

        corpus_mongo = MongodbUtil(mongo_url,
                                   db_name=db_name,
                                   collection_name=collection_name)
        total = corpus_mongo.count()
        log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total)))

        output_dir = os.path.dirname(urls_file)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with open(urls_file, 'wt') as out_f:
            for i, row in enumerate(corpus_mongo.find(mongo_query,
                                                      limit=limit)):
                if i % 1000 == 0:
                    log.info('%s %.1f%% written.' %
                             (os.path.basename(urls_file), i / total * 100))
                out_f.write(row['url'])
                out_f.write('\n')
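
MongodbUtil is the project's own wrapper and is not shown here. Assuming it sits on top of pymongo, a standalone sketch of the same dump might look like this.

from pymongo import MongoClient

def dump_urls_sketch(mongo_url, db_name, collection_name, urls_file, mongo_query=None, limit=0):
    # Write the 'url' field of every matching document to a text file, one per line.
    collection = MongoClient(mongo_url)[db_name][collection_name]
    cursor = collection.find(mongo_query or {})
    if limit:
        cursor = cursor.limit(limit)
    with open(urls_file, 'wt') as out_f:
        for row in cursor:
            out_f.write(row['url'] + '\n')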
Example #4
    def summary(self):
        print('total_bytes:', self.memory.total_memory())
        if self.enable:
            if self.show_lines:
                summary = '[leak summary]\ntotal bytes: %s, total lines: %s, increased: %s bytes\n' % (
                    NumUtil.comma_str(self.memory.total_memory()), len(self.total_lines),
                    NumUtil.comma_str(self.total_increased_bytes))
                return '\n'.join(self.total_lines) + summary
            else:
                summary = '[leak summary]\ntotal bytes: %s, total increased: %s bytes\n' % (
                    NumUtil.comma_str(self.memory.total_memory()),
                    NumUtil.comma_str(self.total_increased_bytes))
                return summary
        else:
            return ''
Example #5
    def dump_corpus(mongo_url, db_name, collection_name, sentences_file, mongo_query=None, limit=None):
        """
        Reads documents from MongoDB and saves them sentence by sentence. (Sentences that contain only one word, or no Hangul at all, are not extracted.)
        :param mongo_url: mongodb://~~~
        :param db_name: database name of mongodb
        :param collection_name: collection name of mongodb
        :param sentences_file: *.sentence file
        :param mongo_query: default={}
        :param limit:
        :return:
        """
        if mongo_query is None:
            mongo_query = {}

        corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
        total = corpus_mongo.count()
        log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total)))

        output_dir = os.path.dirname(sentences_file)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with gzip.open(sentences_file, 'wt') as out_f:
            for i, row in enumerate(corpus_mongo.find(mongo_query, limit=limit)):
                # print('url:', row['url'])
                if i % 1000 == 0:
                    print('%.1f%% written.' % (i / total * 100))
                for c in row['content']:
                    for s in HangulUtil.text2sentences(c['sentences']):
                        if HangulUtil.has_hangul(s):
                            out_f.write(s)
                            out_f.write('\n')
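
HangulUtil.text2sentences and HangulUtil.has_hangul are project helpers that are not shown in these examples. A rough stand-in based on the Hangul syllable block, with an assumed (simplistic) sentence-splitting rule:

import re

_HANGUL = re.compile('[가-힣]')

def has_hangul(text):
    # True if the text contains at least one Hangul syllable.
    return _HANGUL.search(text) is not None

def text2sentences(text):
    # Very rough split on ., ! or ? followed by whitespace.
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]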
Example #6
    def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, left_gram, right_gram, model_file, features_vector, labels_vector, n_hidden1=100,
                 learning_rate=0.01, early_stop_cost=0.001):
        ngram = left_gram + right_gram
        n_features = len(features_vector) * ngram  # number of features = 17,380 * 4
        n_classes = len(labels_vector) if len(labels_vector) >= 3 else 1  # number of classes = 2 but len=1

        log.info('load characters list...')
        log.info('load characters list OK. len: %s\n' % NumUtil.comma_str(len(features_vector)))
        watch = WatchUtil()

        train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                  'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.train.gz' % (n_train, left_gram, right_gram))
        valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                  'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.valid.gz' % (n_valid, left_gram, right_gram))
        test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'word_spacing',
                                 'ko.wikipedia.org.dataset.sentences=%s.left=%d.right=%d.test.gz' % (n_test, left_gram, right_gram))
        if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
            dataset_dir = os.path.dirname(train_file)
            if not os.path.exists(dataset_dir):
                os.makedirs(dataset_dir)

            watch.start('create dataset')
            log.info('create dataset...')

            data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                          ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                          ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))

            for name, data_file, total, dataset_file, to_one_hot_vector in data_files:
                check_interval = 10000
                log.info('check_interval: %s' % check_interval)
                log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))

                features, labels = [], []
                with gzip.open(data_file, 'rt', encoding='utf8') as f:
                    for i, line in enumerate(f, 1):
                        if total < i:
                            break

                        if i % check_interval == 0:
                            time.sleep(0.01)  # prevent cpu overload
                            percent = i / total * 100
                            log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                        _f, _l = WordSpacing.sentence2features_labels(line.strip(), left_gram=left_gram, right_gram=right_gram)
                        features.extend(_f)
                        labels.extend(_l)

                dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
                log.info('dataset save... %s' % dataset_file)
                dataset.save(dataset_file, gzip_format=True, verbose=True)
                log.info('dataset save OK. %s' % dataset_file)
                log.info('dataset: %s' % dataset)

            log.info('create dataset OK.')
            log.info('')
            watch.stop('create dataset')

        watch.start('dataset load')
        log.info('dataset load...')
        train = DataSet.load(train_file, gzip_format=True, verbose=True)

        if n_train >= int('100,000'.replace(',', '')):
            valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
        else:
            valid = DataSet.load(train_file, gzip_format=True, verbose=True)
        log.info('valid.convert_to_one_hot_vector()...')
        valid = valid.convert_to_one_hot_vector(verbose=True)
        log.info('valid.convert_to_one_hot_vector() OK.')

        log.info('train dataset: %s' % train)
        log.info('valid dataset: %s' % valid)
        log.info('dataset load OK.')
        log.info('')
        watch.stop('dataset load')

        graph = WordSpacing.build_FFNN(n_features, n_classes, n_hidden1, learning_rate, watch)

        train_step, X, Y, cost, predicted, accuracy = graph['train_step'], graph['X'], graph['Y'], graph['cost'], graph['predicted'], graph['accuracy']

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            check_interval = 10  # max(1, min(1000, n_train // 10))
            nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

            log.info('learn...')
            log.info('total: %s' % NumUtil.comma_str(train.size))
            watch.start('learn')
            valid_cost = sys.float_info.max
            for epoch in range(1, total_epoch + 1):
                if valid_cost < early_stop_cost:
                    break
                for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size), 1):
                    if valid_cost < early_stop_cost:
                        log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                        break
                    nth_train += 1
                    nth_input += features_batch.shape[0]
                    sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch})

                    # if step % check_interval == 1:
                    percent = nth_input / total_input * 100
                    valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels})
                    log.info('[epoch=%s][%.1f%%] %s cost: %.4f' % (epoch, percent, valid.name, valid_cost))
            watch.stop('learn')
            log.info('learn OK.\n')

            log.info('model save... %s' % model_file)
            watch.start('model save...')
            model_dir = os.path.dirname(model_file)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            saver = tf.train.Saver()
            saver.save(sess, model_file)
            watch.stop('model save...')
            log.info('model save OK. %s' % model_file)

        log.info('\n')
        log.info('batch_size: %s' % batch_size)
        log.info(watch.summary())
        log.info('\n')
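
WordSpacing.sentence2features_labels is not shown in this example. Below is a simplified sketch of the left/right character n-gram labeling it appears to perform; the exact feature and label encoding is an assumption.

def sentence2features_labels_sketch(sentence, left_gram=2, right_gram=2):
    # For each boundary between characters (spaces removed), the feature is the
    # surrounding left_gram + right_gram characters, and the label is whether
    # the original sentence has a space at that boundary.
    text = sentence.replace(' ', '')
    spaced = set()  # indices (in text) of characters followed by a space
    idx = -1
    for ch in sentence:
        if ch == ' ':
            spaced.add(idx)
        else:
            idx += 1
    features, labels = [], []
    for i in range(left_gram - 1, len(text) - right_gram):
        left = text[i - left_gram + 1: i + 1]
        right = text[i + 1: i + 1 + right_gram]
        features.append(left + right)
        labels.append(1 if i in spaced else 0)
    return features, labels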
Example #7
    log.info('test_sentences_file: %s' % test_sentences_file)

    characters_file = KO_WIKIPEDIA_ORG_CHARACTERS_FILE
    log.info('characters_file: %s' % characters_file)
    try:
        if len(sys.argv) == 4:
            n_train = int(sys.argv[1])
            left_gram = int(sys.argv[2])
            right_gram = int(sys.argv[3])
        else:
            n_train, left_gram, right_gram = 100, 2, 2
            # n_train = int('1,000,000'.replace(',', ''))  # 1M sentences (training takes about 17 hours)

        ngram = left_gram + right_gram
        n_valid, n_test = 100, 100
        log.info('n_train: %s' % NumUtil.comma_str(n_train))
        log.info('n_valid: %s' % NumUtil.comma_str(n_valid))
        log.info('n_test: %s' % NumUtil.comma_str(n_test))
        log.info('left_gram: %s, right_gram: %s' % (left_gram, right_gram))
        log.info('ngram: %s' % ngram)

        total_sentences = FileUtil.count_lines(KO_WIKIPEDIA_ORG_SENTENCES_FILE)
        model_file = os.path.join(KO_WIKIPEDIA_ORG_WORD_SPACING_MODEL_DIR,
                                  'word_spacing_model.sentences=%s.left_gram=%s.right_gram=%s/model' % (
                                      n_train, left_gram, right_gram))  # .%s' % max_sentences
        log.info('model_file: %s' % model_file)

        batch_size = 500  # mini batch size
        log.info('batch_size: %s' % batch_size)

        total_epoch = 100  # min(100, 1000000 // n_train)  # 1 ~ 100
Example #8
    def dump_corpus(mongo_url, db_name, collection_name, sentences_file, characters_file, info_file, urls_file,
                    train_sentences_file, valid_sentences_file, test_sentences_file,
                    mongo_query=None, limit=None):
        """
        Reads documents from MongoDB and saves them sentence by sentence. (Sentences that contain only one word, or no Hangul at all, are not extracted.)
        :param characters_file:
        :param urls_file:
        :param info_file:
        :param mongo_url: mongodb://~~~
        :param db_name: database name of mongodb
        :param collection_name: collection name of mongodb
        :param sentences_file: *.sentence file
        :param train_sentences_file:
        :param valid_sentences_file:
        :param test_sentences_file:
        :param mongo_query: default={}
        :param limit:
        :return:
        """
        if mongo_query is None:
            mongo_query = {}

        corpus_mongo = MongodbUtil(mongo_url, db_name=db_name, collection_name=collection_name)
        total_docs = corpus_mongo.count()
        log.info('%s total: %s' % (corpus_mongo, NumUtil.comma_str(total_docs)))

        output_dir = os.path.dirname(sentences_file)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with gzip.open(sentences_file, 'wt') as out_f, \
                gzip.open(train_sentences_file, 'wt') as train_f, \
                gzip.open(valid_sentences_file, 'wt') as valid_f, \
                gzip.open(test_sentences_file, 'wt') as test_f, \
                open(info_file, 'wt') as info_f, \
                open(urls_file, 'wt') as urls_f:

            char_set = set()
            n_docs = n_total = n_train = n_valid = n_test = 0
            if limit:
                cursor = corpus_mongo.find(mongo_query, limit=limit)
            else:
                cursor = corpus_mongo.find(mongo_query)

            for i, row in enumerate(cursor, 1):
                if i % 1000 == 0:
                    log.info('%s %.1f%% written.' % (os.path.basename(sentences_file), i / total_docs * 100))

                sentences = []
                for c in row['content']:
                    sentences.extend(HangulUtil.text2sentences(c['sentences'], remove_only_one_word=True, has_hangul=True))
                # sentences = HangulUtil.text2sentences(row['content'], remove_only_one_word=True, has_hangul=True)

                log.debug('url: %s, len: %s' % (row['url'], len(sentences)))
                if len(sentences) == 0:
                    # log.error(row['content'])
                    continue

                urls_f.write(row['url'])
                urls_f.write('\n')
                n_docs += 1

                for s in sentences:
                    _char_set = set([c for c in s])
                    char_set.update(_char_set)

                    n_total += 1
                    out_f.write(s)
                    out_f.write('\n')

                if len(sentences) >= 10:  # can split
                    test_len = valid_len = len(sentences) // 10
                    # log.info('train: %s, test: %s, valid: %s' % (len(sentences) - test_len - valid_len, test_len, valid_len))
                    for s in sentences[:test_len]:
                        n_test += 1
                        test_f.write(s)
                        test_f.write('\n')
                    for s in sentences[test_len:test_len + valid_len]:
                        n_valid += 1
                        valid_f.write(s)
                        valid_f.write('\n')
                    for s in sentences[test_len + valid_len:]:
                        n_train += 1
                        train_f.write(s)
                        train_f.write('\n')
                else:  # can't split
                    for s in sentences:
                        n_train += 1
                        train_f.write(s)
                        train_f.write('\n')

            char_list = list(char_set)
            char_list.sort()
            log.info('writing to %s...' % characters_file)
            with open(characters_file, 'w') as f:
                for c in char_list:
                    f.write(c)
                    f.write('\n')
            log.info('written to %s OK.' % characters_file)

            log.info('total docs: %s', NumUtil.comma_str(total_docs))
            log.info('total docs: %s (has hangul sentence)', NumUtil.comma_str(n_docs))
            log.info('total sentences: %s (has hangul sentence)', NumUtil.comma_str(n_total))
            log.info('train: %s', NumUtil.comma_str(n_train))
            log.info('valid: %s', NumUtil.comma_str(n_valid))
            log.info('test: %s', NumUtil.comma_str(n_test))
            log.info('total characters: %s', NumUtil.comma_str(len(char_list)))

            info_f.write('total docs: %s\n' % NumUtil.comma_str(total_docs))
            info_f.write('total docs: %s (has hangul sentence)\n' % NumUtil.comma_str(n_docs))
            info_f.write('total sentences: %s (has hangul sentence)\n' % NumUtil.comma_str(n_total))
            info_f.write('train: %s\n' % NumUtil.comma_str(n_train))
            info_f.write('valid: %s\n' % NumUtil.comma_str(n_valid))
            info_f.write('test: %s\n' % NumUtil.comma_str(n_test))
            info_f.write('total characters: %s\n' % NumUtil.comma_str(len(char_list)))
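
The per-document split above sends the first 10% of a document's sentences to the test set, the next 10% to validation, and the rest to training; documents with fewer than 10 sentences go entirely to training. As a standalone sketch of that rule:

def split_sentences(sentences):
    # Returns (train, valid, test) using the same per-document 10%/10%/80% rule.
    if len(sentences) < 10:  # too few sentences to split
        return list(sentences), [], []
    n = len(sentences) // 10
    return sentences[2 * n:], sentences[n:2 * n], sentences[:n]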
Example #9
    def learning(cls, total_epoch, n_train, n_valid, n_test, batch_size, window_size, noise_rate, model_file, features_vector, labels_vector,
                 n_hidden1,
                 learning_rate,
                 dropout_keep_rate, early_stop_cost=0.001):
        n_features = len(features_vector) * window_size  # number of features = 17,382 * 10

        log.info('load characters list...')
        log.info('load characters list OK. len: %s' % NumUtil.comma_str(len(features_vector)))
        watch = WatchUtil()

        train_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                                  'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.train.gz' % (n_train, window_size))
        valid_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                                  'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.valid.gz' % (n_valid, window_size))
        test_file = os.path.join(KO_WIKIPEDIA_ORG_DIR, 'datasets', 'spelling_error_correction',
                                 'ko.wikipedia.org.dataset.sentences=%s.window_size=%d.test.gz' % (n_test, window_size))

        log.info('train_file: %s' % train_file)
        log.info('valid_file: %s' % valid_file)
        log.info('test_file: %s' % test_file)
        if not os.path.exists(train_file) or not os.path.exists(valid_file) or not os.path.exists(test_file):
            dataset_dir = os.path.dirname(train_file)
            if not os.path.exists(dataset_dir):
                os.makedirs(dataset_dir)

            watch.start('create dataset')  # FIXME: out of memory (1M sentences)
            log.info('create dataset...')

            data_files = (('train', KO_WIKIPEDIA_ORG_TRAIN_SENTENCES_FILE, n_train, train_file, False),
                          ('valid', KO_WIKIPEDIA_ORG_VALID_SENTENCES_FILE, n_valid, valid_file, False),
                          ('test', KO_WIKIPEDIA_ORG_TEST_SENTENCES_FILE, n_test, test_file, False))

            for (name, data_file, total, dataset_file, to_one_hot_vector) in data_files:
                check_interval = 10000
                log.info('check_interval: %s' % check_interval)
                log.info('%s %s total: %s' % (name, os.path.basename(data_file), NumUtil.comma_str(total)))
                log.info('noise_rate: %s' % noise_rate)

                features, labels = [], []
                with gzip.open(data_file, 'rt') as f:
                    for i, line in enumerate(f, 1):
                        if total < i:
                            break

                        if i % check_interval == 0:
                            time.sleep(0.01)  # prevent cpu overload
                            percent = i / total * 100
                            log.info('create dataset... %.1f%% read. data len: %s. %s' % (percent, NumUtil.comma_str(len(features)), data_file))

                        sentence = line.strip()
                        for start in range(0, len(sentence) - window_size + 1):  # generate noise (a blank) at each character position
                            chars = sentence[start: start + window_size]
                            for idx in range(len(chars)):
                                noised_chars = StringUtil.replace_with_index(chars, ' ', idx)
                                features.append(noised_chars)
                                labels.append(chars)
                                log.debug('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))

                # log.info('noise_sampling: %s' % noise_sampling)
                #         for nth_sample in range(noise_sampling): # generate noise at the jamo (initial/medial/final) level
                #             for start in range(0, len(sentence) - window_size + 1):
                #                 chars = sentence[start: start + window_size]
                #                 noised_chars = SpellingErrorCorrection.encode_noise(chars, noise_rate=noise_rate, noise_with_blank=True)
                #                 if chars == noised_chars:
                #                     continue
                #                 if i % check_interval == 0 and nth_sample == 0:
                #                     log.info('create dataset... %s "%s" -> "%s"' % (name, noised_chars, chars))
                #                 features.append(noised_chars)
                #                 labels.append(chars)

                # print('dataset features:', features)
                # print('dataset labels:', labels)
                dataset = DataSet(features=features, labels=labels, features_vector=features_vector, labels_vector=labels_vector, name=name)
                log.info('dataset save... %s' % dataset_file)
                dataset.save(dataset_file, gzip_format=True, verbose=True)
                log.info('dataset save OK. %s' % dataset_file)
                log.info('dataset: %s' % dataset)

            log.info('create dataset OK.')
            log.info('')
            watch.stop('create dataset')

        watch.start('dataset load')
        log.info('dataset load...')
        train = DataSet.load(train_file, gzip_format=True, verbose=True)

        if n_train >= int('100,000'.replace(',', '')):
            valid = DataSet.load(valid_file, gzip_format=True, verbose=True)
        else:
            valid = DataSet.load(train_file, gzip_format=True, verbose=True)
        log.info('valid.convert_to_one_hot_vector()...')
        valid = valid.convert_to_one_hot_vector(verbose=True)
        log.info('valid.convert_to_one_hot_vector() OK.')

        log.info('train dataset: %s' % train)
        log.info('valid dataset: %s' % valid)
        log.info('dataset load OK.')
        log.info('')
        watch.stop('dataset load')

        X, Y, dropout_keep_prob, train_step, cost, y_hat, accuracy = SpellingErrorCorrection.build_DAE(n_features, window_size, noise_rate, n_hidden1,
                                                                                                       learning_rate, watch)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            check_interval = max(1, min(1000, n_train // 10))
            nth_train, nth_input, total_input = 0, 0, total_epoch * train.size

            log.info('')
            log.info('learn...')
            log.info('total_epoch: %s' % total_epoch)
            log.info('train.size (total features): %s' % NumUtil.comma_str(train.size))
            log.info('check_interval: %s' % check_interval)
            log.info('batch_size: %s' % batch_size)
            log.info('total_input: %s (total_epoch * train.size)' % total_input)
            log.info('')
            watch.start('learn')
            valid_cost = sys.float_info.max
            for epoch in range(1, total_epoch + 1):
                if valid_cost < early_stop_cost:
                    log.info('valid_cost: %s, early_stop_cost: %s, early stopped.' % (valid_cost, early_stop_cost))
                    break
                for step, (features_batch, labels_batch) in enumerate(train.next_batch(batch_size=batch_size, to_one_hot_vector=True), 1):
                    if valid_cost < early_stop_cost:
                        break

                    nth_train += 1
                    nth_input += features_batch.shape[0]
                    sess.run(train_step, feed_dict={X: features_batch, Y: labels_batch, dropout_keep_prob: dropout_keep_rate})

                    # if nth_train % check_interval == 1:
                    percent = nth_input / total_input * 100
                    valid_cost = sess.run(cost, feed_dict={X: valid.features, Y: valid.labels, dropout_keep_prob: 1.0})
                    log.info('[epoch=%s][%.1f%%] %s cost: %.8f' % (epoch, percent, valid.name, valid_cost))

            watch.stop('learn')
            log.info('learn OK.')
            log.info('')

            log.info('model save... %s' % model_file)
            watch.start('model save...')
            model_dir = os.path.dirname(model_file)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            saver = tf.train.Saver()
            saver.save(sess, model_file)
            watch.stop('model save...')
            log.info('model save OK. %s' % model_file)

        log.info('')
        log.info('total_epoch: %s' % total_epoch)
        log.info('batch_size: %s' % batch_size)
        log.info('total_input: %s (total_epoch * train.size)' % total_input)
        log.info('')
        log.info(watch.summary())
        log.info('')
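
StringUtil.replace_with_index is a project helper that is not shown here. The windowed blank-noise generation used to build the denoising dataset can be sketched in plain Python as follows; the helper's exact signature is an assumption.

def replace_with_index(text, ch, idx):
    # Return text with the character at position idx replaced by ch.
    return text[:idx] + ch + text[idx + 1:]

def noised_pairs(sentence, window_size):
    # Yield (noised_window, original_window) pairs: each character of each
    # sliding window is replaced, one position at a time, with a blank.
    for start in range(0, len(sentence) - window_size + 1):
        chars = sentence[start:start + window_size]
        for idx in range(len(chars)):
            yield replace_with_index(chars, ' ', idx), chars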
Example #10
        n_test = min(100, n_train // 10)

        if noise_rate is None or window_size is None:  # default
            window_size = 6  # 2 ~ 10 # number of characters per feature (same as the label)
            noise_rate = max(0.1, 1 / window_size)  # 0.0 ~ 1.0 # noise_rate = noised characters / total characters (at least one character per window gets noised)

        dropout_keep_rate = 1.0  # 0.0 ~ 1.0 # with one-hot vector input, dropout prevents learning
        # total_epoch = max(10, 100 // window_size)  # 10 ~ 100 # the larger window_size is, the smaller total_epoch can be
        total_epoch = min(100, 1000000 // n_train)  # 1 ~ 100
        batch_size = min(100, 10 * window_size)  # 1 ~ 100 # with one-hot vector input, a small batch_size learns better; too large a batch_size does not learn at all

        n_hidden1 = min(1000, 10 * window_size)  # 10 ~ 1000
        learning_rate = 1 / total_epoch  # 0.01  # 0.1 ~ 0.001  # the larger total_epoch is, the smaller learning_rate can be
        early_stop_cost = 0.0001
        log.info('')
        log.info('n_train (sentences): %s' % NumUtil.comma_str(n_train))
        log.info('n_valid (sentences): %s' % NumUtil.comma_str(n_valid))
        log.info('n_test (sentences): %s' % NumUtil.comma_str(n_test))
        log.info('')
        log.info('window_size: %s' % window_size)
        log.info('noise_rate: %s' % noise_rate)
        log.info('dropout_keep_rate: %s' % dropout_keep_rate)
        log.info('')
        log.info('n_hidden1: %s' % n_hidden1)
        log.info('learning_rate: %s' % learning_rate)
        log.info('')
        log.info('total_epoch: %s' % total_epoch)
        log.info('batch_size: %s' % batch_size)
        log.info('early_stop_cost: %s' % early_stop_cost)
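
As a quick check of the settings above: with window_size = 6 and, say, n_train = 100 sentences (an assumed value), the derived hyperparameters work out as follows.

window_size, n_train = 6, 100
noise_rate = max(0.1, 1 / window_size)      # 0.1666...
batch_size = min(100, 10 * window_size)     # 60
n_hidden1 = min(1000, 10 * window_size)     # 60
total_epoch = min(100, 1000000 // n_train)  # 100
learning_rate = 1 / total_epoch             # 0.01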
Example #11
                                            if _valid_cost < min_valid_cost:
                                                min_valid_cost = _valid_cost
                                                min_valid_epoch = epoch
                                            log.info('[epoch: %s, nth_batch: %s] train cost: %.8f, valid cost: %.8f' % (
                                                epoch, nth_batch, _train_cost, _valid_cost))
                                            if min_valid_epoch == epoch:  # save the latest best model
                                                saver.save(sess, model_file)

                                    if save_model_each_epochs:
                                        saver.save(sess, model_file, global_step=epoch)

                                log.info('')
                                log.info(
                                    '"%s" train: min_valid_cost: %.8f, min_valid_epoch: %s,  %.2f secs (batch_size: %s,  total_input_data: %s, total_epochs: %s, total_train_time: %s secs)' % (
                                        model_name, min_valid_cost, min_valid_epoch, watch.elapsed(),
                                        batch_size, NumUtil.comma_str(batch_size * nth_batch), epoch, total_train_time))
                                log.info('')
                            except:
                                log.info(traceback.format_exc())
                            finally:
                                coordinator.request_stop()
                                coordinator.join(threads)  # Wait for threads to finish.
                        else:  # testing
                            x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary = create_graph(model_name, scope_name, verbose=False)
                            test_x_batch, test_y_batch = input_pipeline([test_file], batch_size=n_test, delim='\t', splits=3)

                            log.info('')
                            log.info('model loaded... %s' % model_file)
                            saver = tf.train.Saver(max_to_keep=None)
                            saver.restore(sess, model_file)
                            log.info('model loaded OK. %s' % model_file)
Example #12
                                            if min_valid_epoch == epoch:  # save the latest best model
                                                saver.save(sess, model_file)

                                    if save_model_each_epochs:
                                        saver.save(sess,
                                                   model_file,
                                                   global_step=epoch)

                                log.info('')
                                log.info(
                                    '"%s" train: min_valid_cost: %.8f, min_valid_epoch: %s,  %.2f secs (batch_size: %s,  total_input_data: %s, total_epochs: %s, total_train_time: %s secs)'
                                    %
                                    (model_name,
                                     min_valid_cost, min_valid_epoch,
                                     watch.elapsed(), batch_size,
                                     NumUtil.comma_str(batch_size * nth_batch),
                                     epoch, total_train_time))
                                log.info('')
                            except:
                                log.info(traceback.format_exc())
                            finally:
                                coordinator.request_stop()
                                coordinator.join(
                                    threads)  # Wait for threads to finish.
                        else:  # testing
                            x, y, learning_rate, use_first_pipeline, W1, b1, y_hat, cost, train_step, summary = create_graph(
                                model_name,
                                scope_name,
                                first_pipeline=test_pipeline,
                                second_pipeline=test_pipeline,
                                verbose=False)
                            log.info('[epoch: %s] rsme (train/valid): %.1f / %.1f model saved' % (epoch, train_rsme, valid_rsme))
                        else:
                            log.info('[epoch: %s] rsme (train/valid): %.1f / %.1f' % (epoch, train_rsme, valid_rsme))
                        if valid_rsme < early_stop_cost or valid_rsme > max_cost or math.isnan(valid_rsme):
                            running = False
                            break
                watch.stop('train')

                if model_file_saved and os.path.exists(model_file + '.index'):
                    restored = saver.restore(sess, model_file)

                    log.info('')
                    log.info('--------TEST----------')
                    watch.start('test')
                    test_rsme, _y_hat = sess.run([rsme, y_hat], feed_dict={x: x_test, y: y_test})

                    log.info('%s rsme (test): %.1f (epoch best/total: %s/%s), activation: %s, n_hiddens: %s, learning_rate: %s, weights_initializer: %s' % (
                        func.__name__, test_rsme, NumUtil.comma_str(best_epoch), NumUtil.comma_str(epoch), activation.__name__,
                        n_hiddens, learning_rate, weights_initializer.__name__))

                    # _y_hat = np.round(_y_hat)
                    for i in range(min(5, _y_hat.shape[0])):
                        log.info('%s\t->\t%.1f\t(label: %d)' % (x_test[i], _y_hat[i], y_test[i]))
                    watch.stop('test')
                    log.info('--------TEST----------')
            log.info(watch.summary())
        except:
            traceback.print_exc()

    log.info('OK.')
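
The checkpoint and early-stop bookkeeping used in Examples #11 and #12 can be summarized framework-free. The helper below is purely illustrative and mirrors the min_valid_cost / early_stop_cost / max_cost logic above.

import math

def track_best_and_stop(valid_costs, early_stop_cost=0.001, max_cost=1e6):
    # valid_costs: iterable of per-epoch validation costs.
    # Returns (best_cost, best_epoch); a real training loop would save the model
    # whenever a new best epoch is found, as the snippets above do.
    min_valid_cost, min_valid_epoch = float('inf'), 0
    for epoch, valid_cost in enumerate(valid_costs, 1):
        if valid_cost < min_valid_cost:
            min_valid_cost, min_valid_epoch = valid_cost, epoch
        if valid_cost < early_stop_cost or valid_cost > max_cost or math.isnan(valid_cost):
            break
    return min_valid_cost, min_valid_epoch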
Example #14
    def learning(cls,
                 sentences_file,
                 batch_size,
                 left_gram,
                 right_gram,
                 model_file,
                 features_vector,
                 labels_vector,
                 n_hidden1=100,
                 max_sentences=0,
                 learning_rate=0.01,
                 layers=2):
        ngram = left_gram + right_gram
        n_features = len(
            features_vector) * ngram  # number of features = 17,380 * 4
        n_classes = len(labels_vector) if len(
            labels_vector) >= 3 else 1  # number of classes = 2 but len=1

        log.info('load characters list...')
        log.info('load characters list OK. len: %s\n' %
                 NumUtil.comma_str(len(features_vector)))
        watch = WatchUtil()

        train_file = os.path.join(
            KO_WIKIPEDIA_ORG_DATA_DIR, 'datasets',
            'ko.wikipedia.org.dataset.sentences=%d.left=%d.right=%d.train.gz' %
            (max_sentences, left_gram, right_gram))
        validation_file = train_file.replace('.train.', '.validation.')
        test_file = train_file.replace('.train.', '.test.')
        if not os.path.exists(train_file) or not os.path.exists(
                validation_file) or not os.path.exists(test_file):
            watch.start('create dataset')
            log.info('create dataset...')
            features, labels = [], []
            check_interval = min(10000, math.ceil(max_sentences))
            log.info('total: %s' % NumUtil.comma_str(max_sentences))

            with gzip.open(sentences_file, 'rt') as f:
                for i, line in enumerate(f, 1):
                    if max_sentences < i:
                        break

                    if i % check_interval == 0:
                        log.info(
                            'create dataset... %.1f%% read. data len: %s' %
                            (i / max_sentences * 100,
                             NumUtil.comma_str(len(features))))

                    _f, _l = WordSpacing.sentence2features_labels(
                        line.strip(),
                        left_gram=left_gram,
                        right_gram=right_gram)
                    features.extend(_f)
                    labels.extend(_l)

            dataset = DataSet(features=features,
                              labels=labels,
                              features_vector=features_vector,
                              labels_vector=labels_vector,
                              name='all')
            log.info('dataset: %s' % dataset)
            log.info('create dataset OK.\n')
            watch.stop('create dataset')

            watch.start('dataset save')
            log.info('split to train, test, validation...')
            datasets = DataSets.to_datasets(dataset,
                                            test_rate=0.1,
                                            valid_rate=0.1,
                                            test_max=10000,
                                            valid_max=1000,
                                            shuffle=True)
            train, test, validation = datasets.train, datasets.test, datasets.validation
            log.info(train)
            log.info(test)
            log.info(validation)
            # log.info('%s %s' % (test.features[0], test.labels[0]))
            log.info('split to train, test, validation OK.\n')

            log.info('dataset save... %s' % train_file)
            train.save(train_file, verbose=True)  # save as text
            log.info('dataset save OK.\n')

            log.info('dataset save... %s' % validation_file)
            validation = validation.convert_to_one_hot_vector(
                verbose=True)  # save as vector
            validation.save(validation_file, verbose=True)
            log.info('dataset save OK.\n')

            log.info('dataset save... %s' % test_file)
            test = test.convert_to_one_hot_vector(verbose=True)
            test.save(test_file, verbose=True)  # save as vector
            log.info('dataset save OK.\n')
            watch.stop('dataset save')
        else:
            watch.start('dataset load')
            log.info('dataset load...')
            train = DataSet.load(train_file, verbose=True)
            validation = DataSet.load(validation_file, verbose=True)
            test = DataSet.load(test_file, verbose=True)
            log.info(train)
            log.info(validation)
            log.info(test)
            log.info('dataset load OK.\n')
            watch.stop('dataset load')

        log.info('check samples...')
        for i, (features_batch, labels_batch) in enumerate(
                train.next_batch(batch_size=5, to_one_hot_vector=True), 1):
            if i > 2:
                break
            for a, b in zip(features_batch, labels_batch):
                feature, label = a, b
                _feature = feature.reshape((ngram, len(features_vector)))
                chars = ''.join(features_vector.to_values(_feature))
                has_space = np.argmax(label)
                log.info('[%s] %s -> %s, %s (len=%s) %s (len=%s)' %
                         (i, chars, has_space, feature, len(feature), label,
                          len(label)))
        log.info('check samples OK.\n')

        graph = WordSpacing.build_FFNN(n_features,
                                       n_classes,
                                       n_hidden1,
                                       learning_rate,
                                       watch,
                                       layers=layers)

        train_step, X, Y, cost, hypothesis, predicted, accuracy = graph[
            'train_step'], graph['X'], graph['Y'], graph['cost'], graph[
                'hypothesis'], graph['predicted'], graph['accuracy']

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            n_input = 0
            log.info('total: %s' % NumUtil.comma_str(train.size))
            log.info('learn...')
            watch.start('learn')
            for step, (features_batch, labels_batch) in enumerate(
                    train.next_batch(batch_size=batch_size), 1):
                n_input += batch_size
                sess.run(train_step,
                         feed_dict={
                             X: features_batch,
                             Y: labels_batch
                         })
                log.info(
                    '[%s][%.1f%%] validation cost: %.4f' %
                    (NumUtil.comma_str(n_input), n_input / train.size * 100,
                     sess.run(cost,
                              feed_dict={
                                  X: validation.features,
                                  Y: validation.labels
                              })))
            watch.stop('learn')
            log.info('learn OK.\n')

            log.info('evaluate...')
            watch.start('evaluate...')
            _hypothesis, _correct, _accuracy = sess.run(
                [hypothesis, predicted, accuracy],
                feed_dict={
                    X: test.features,
                    Y: test.labels
                })  # Accuracy report
            watch.stop('evaluate...')
            log.info('evaluate OK.')

            log.info('model save... %s' % model_file)
            watch.start('model save...')
            model_dir = os.path.dirname(model_file)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
            saver = tf.train.Saver()
            saver.save(sess, model_file)
            watch.stop('model save...')
            log.info('model save OK. %s' % model_file)

        log.info('\n')
        log.info(watch.summary())
        # log.info('hypothesis: %s %s' % (_hypothesis.shape, _hypothesis))
        # log.info('correct: %s %s' % (_correct.shape, _correct))
        log.info('accuracy: %s %s' % (_accuracy.shape, _accuracy))
        log.info('\n')
Example #15
            if max_test_sentences < max_sentences:  # learned sentences are fewer than the full sentences
                for i, line in enumerate(f, 1):
                    if i <= max_sentences:  # skip learned sentences
                        if i % 100000 == 0:
                            log.info('skip %d th learned sentence.' % i)
                        continue
                    if len(sentences) >= max_test_sentences:  # enough new sentences read
                        break

                    s = line.strip()
                    # sentence must have at least one space and more than ngram characters
                    if s.count(' ') > 0 and len(s.replace(' ', '')) > ngram:
                        sentences.append(s)
        log.info('len(sentences): %s' % NumUtil.comma_str(len(sentences)))
        watch.stop('read sentences')

        watch.start('run tensorflow')
        accuracies, sims = [], []
        with tf.Session() as sess:
            graph = WordSpacing.build_FFNN(n_features,
                                           n_classes,
                                           n_hidden1,
                                           learning_rate,
                                           layers=layers)
            X, Y, predicted, accuracy = graph['X'], graph['Y'], graph[
                'predicted'], graph['accuracy']

            saver = tf.train.Saver()
            try: