Example #1
import logging
import time

import numpy as np

# Dataset, Model and Vocabulary are project-local modules; their import
# lines were omitted in the original extract.
class Architecture(object):
    def __init__(self,
                 sess,
                 params,
                 trained_model=False,
                 prepare_train_set=True):
        start = time.time()
        self.sess = sess
        self.params = params
        self.train_set = Dataset(self.params,
                                 "data/" + self.params.get("corpus_name") +
                                 "/train",
                                 None,
                                 only_eval=False)

        if prepare_train_set:
            self.train_set.prepare_data(self.params.get("min_count"))

        self.model = Model(sess, self.params, self.train_set.vocab_size())

        if trained_model:
            self.model.saver.restore(sess, trained_model)

        print("Модель подготовлена за " + str(int(time.time() - start)) +
              " секунд.")

    def evaluate(self,
                 files,
                 max_langs_per_file,
                 allowed_langs,
                 output_file,
                 threshold,
                 eval_lines=False,
                 eval_blocks=False,
                 smoothing=0,
                 unknown=None,
                 separator=",",
                 code_swaps=None):
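        """Detect languages for each entry of `files` and write one
        "output_name<separator>language" line per accepted guess to
        `output_file`; writes `unknown` when no allowed language is found."""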
        langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=np.int64)
        for allowed in self.train_set.get_tagging_classes():
            langs_mask[allowed] = 1

        for l in allowed_langs:
            # try to find the language id directly
            lang_id = self.train_set.trg_vocab.get_id(l)
            if lang_id == Vocabulary.Vocab.UNK_ID:
                print("UNSUPPORTED LANGUAGE IN MODEL: " + l)
            else:
                langs_mask[lang_id] = 1

        datafile = Dataset(self.params,
                           None,
                           "data/" + self.params.get("corpus_name") + "/train",
                           only_eval=True,
                           use_eol=eval_lines)

        if smoothing > 0:
            print("USING SMOOTHING OF {0}".format(smoothing))

        with open(output_file, encoding='utf-8', mode='w', buffering=1) as bal:
            for filename in files:
                # each entry of files has the form [folder, output_name, optional_encoding]
                if len(filename) > 2:
                    datafile.restart(filename[0] + filename[1], filename[2])
                else:
                    datafile.restart(filename[0] + filename[1])

                guesses = np.zeros(self.train_set.vocab_size()[1], np.int64)
                row = np.zeros(self.train_set.vocab_size()[1], np.int64)
                row_length = 0
                total = 0
                smooth = []
                while not datafile.is_finished():
                    dev_batch_xs, dev_batch_ys, lengths = datafile.get_batch()
                    outs = self.model.eval(self.sess,
                                           dev_batch_xs,
                                           lengths,
                                           langs_mask=langs_mask)
                    for j in range(len(outs[0])):
                        block_guesses = np.zeros(
                            self.train_set.vocab_size()[1], np.int64)
                        for i in range(len(outs)):
                            if dev_batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                                break
                            total += 1
                            if eval_lines:
                                if dev_batch_xs[i][j] == datafile.trg_vocab.EOL_ID:
                                    # end of line: credit the whole line to its
                                    # majority language
                                    guesses[np.argmax(row)] += row_length
                                    row = np.zeros(
                                        self.train_set.vocab_size()[1],
                                        np.int64)
                                    row_length = 0
                                else:
                                    row[outs[i][j]] += 1
                                    row_length += 1
                            elif eval_blocks:
                                block_guesses[outs[i][j]] += 1
                            elif smoothing > 0:
                                smooth.append(outs[i][j])
                            else:
                                guesses[outs[i][j]] += 1

                        if eval_blocks:
                            # attribute the whole block to its majority language
                            guesses[np.argmax(block_guesses)] += i

                if smoothing > 0:
                    i = 0
                    while i < len(smooth):
                        if (i + smoothing < len(smooth)
                                and smooth[i] == smooth[i + smoothing]):
                            # if the first and last of the window match, the
                            # tokens in between are assumed to match as well
                            guesses[smooth[i]] += smoothing
                            i += smoothing
                        else:
                            guesses[smooth[i]] += 1
                            i += 1

                langs = 0
                seznam = ""  # human-readable summary: "lang percent; ..."
                for best in np.argsort(-guesses):
                    if guesses[best] == 0 or langs == max_langs_per_file:
                        break
                    guess_name = datafile.get_target_name(best, "iso2")
                    percent = 100 * guesses[best] / total
                    if guess_name in allowed_langs:
                        if code_swaps is not None and guess_name in code_swaps:
                            guess_name = code_swaps[guess_name]
                        # always output at least one language
                        if langs > 0 and percent < threshold:
                            break
                        seznam += "{0} {1:.0f}; ".format(guess_name, percent)
                        bal.write(filename[1] + separator + guess_name + "\n")
                        langs += 1
                    else:
                        print(filename[1] + ", not allowed lang: " +
                              guess_name)
                if langs == 0 and unknown is not None:
                    # no language was output
                    bal.write(filename[1] + separator + unknown + "\n")

    def evaluate_dataset(self, source, allowed_languages=None):
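        """Measure accuracy on a file of "label text" lines by comparing each
        gold label with the top guess from evaluate_string."""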

        correct_all = 0
        total_all = 0
        with open(source, mode='r') as src:
            for l in src:
                if total_all % 1000 == 0:
                    print("processed lines ", total_all)
                entry = l.strip().split(' ', 1)
                if allowed_languages is not None:
                    guess = self.evaluate_string(entry[1],
                                                 languages=allowed_languages)
                else:
                    guess = self.evaluate_string(entry[1])
                total_all += 1
                if entry[0] == guess[0]:
                    correct_all += 1

        print("Accuracy all: {0} ({1}/{2})".format(correct_all / total_all,
                                                   correct_all, total_all))

    def evaluate_string(self, text, print_per_character=False, languages=None):
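        """Classify a single string and return [language_name, confidence]."""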
        if languages is not None:
            langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=np.int64)

            for l in languages:
                # try to find the language id directly
                lang_id = self.train_set.trg_vocab.get_id(l)
                if lang_id == Vocabulary.Vocab.UNK_ID:
                    print("UNSUPPORTED LANGUAGE IN MODEL: " + l)
                else:
                    langs_mask[lang_id] = 1
        datafile = Dataset(self.params,
                           None,
                           "data/" + self.params.get("corpus_name") + "/train",
                           text_to_eval=text)

        guesses = np.zeros(self.train_set.vocab_size()[1], np.int64)
        total = 0
        orig = ""
        classif = ""
        while not datafile.is_finished():
            dev_batch_xs, _, lengths = datafile.get_batch()

            if languages is not None:
                outs = self.model.eval(self.sess,
                                       dev_batch_xs,
                                       lengths,
                                       langs_mask=langs_mask)
            else:
                outs = self.model.eval(self.sess, dev_batch_xs, lengths)
            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    maxim = outs[i][j]

                    if dev_batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break
                    guesses[maxim] += 1

                    total += 1
        best = np.argmax(guesses)
        if print_per_character:
            print(orig)
            print(classif)
        accur = 0
        if total > 0:
            accur = float(guesses[best]) / float(total)

        # return (instead of only printing) so evaluate_dataset can use the guess
        return [datafile.get_target_name(best, type='name'), accur]

    def training(self, eval_fn=None):
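        """Run the training loop until a stop condition fires; `eval_fn`,
        when given, replaces the dev-set accuracy check at each checkpoint."""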
        self.train_set.skip_n_lines(self.params.params["trained_lines"])

        dev = Dataset(self.params,
                      "data/" + self.params.get("corpus_name") + "/dev",
                      "data/" + self.params.get("corpus_name") + "/train")
        dev.prepare_data(self.params.get("min_count"))
        start = time.time()  # for counting the time
        cycle_time = time.time()
        logging.info("Training process begun.")
        stop = False
        loss_per_epoch = []
        accuracy_per_epoch = []

        # Keep training until reach max iterations
        while not stop:
            self.params.params["step"] += 1
            batch_xs, batch_ys, lengths = self.train_set.get_batch()
            l, _ = self.model.run(self.sess, batch_xs, batch_ys, lengths,
                                  self.params.get("dropout"))
            loss_per_epoch.append(l)

            stop = self.check_stopfile("STOP_IMMEDIATELY")

            if time.strftime("%H") == self.params.get("time_stop"):
                stop = True

            if self.params.params["step"] % self.params.get(
                    "steps_per_checkpoint") == 0 or stop:
                c_time = time.time()
                corr = [0, 0]

                while not dev.is_finished() and eval_fn is None:
                    dev_batch_xs, dev_batch_ys, lengths = dev.get_batch()

                    dropout = 1
                    _, out = self.model.run(self.sess, dev_batch_xs,
                                            dev_batch_ys, lengths, dropout)
                    corr = np.sum([corr, out], axis=0)

                if eval_fn is not None:
                    logging.info("Not testing on dev but on a custom eval function.")
                    result = eval_fn()
                else:
                    # restart development data
                    dev.restart()
                    result = (corr[0] / corr[1]) * 100
                    accuracy_per_epoch.append(corr[0] / corr[1])

                self.params.params[
                    "trained_lines"] = self.train_set.get_trained_lines()

                self.model.save(self.sess, self.params.params["step"], result)

                print(
                    "Iter {0}, Total correctness: {1} % {2}, time per step: {3} s, total time: {4} min, {5}"
                    .format(
                        self.params.params["step"] *
                        self.params.get("batch_size"), result, corr,
                        (c_time - cycle_time) /
                        self.params.get("steps_per_checkpoint"),
                        int((time.time() - start) / 60),
                        time.strftime("%H:%M:%S")))
                cycle_time = time.time()

                stop = stop or self.check_stopfile(
                    "STOP_MODEL")  # if it is already True, do not change it

                if self.params.params["step"] >= self.params.get("max_iters"):
                    stop = True

            # check if the file was not finished and if it was, start over
            if self.train_set.is_finished():
                avg_loss = np.mean(loss_per_epoch)
                avg_test_accuracy = np.mean(accuracy_per_epoch)

                summ = self.sess.run(self.model.performance_summaries,
                                     feed_dict={
                                         self.model.tf_loss_ph:
                                         avg_loss,
                                         self.model.tf_accuracy_ph:
                                         avg_test_accuracy
                                     })
                self.model.sum_writer.add_summary(summ,
                                                  self.params.get('epochs'))

                loss_per_epoch.clear()
                accuracy_per_epoch.clear()

                self.params.params["epochs"] += 1
                logging.info(
                    "Generator read the training file completely; starting over.")
                self.train_set.restart()

        print("Training finished in " + str(int(time.time() - start)) + " s")

    def check_stopfile(self, filename):
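        """Return True if `filename` lists this corpus name; the matching
        line is removed from the file so the command fires only once."""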
        stop = False
        with open(filename, mode="r") as stp:
            for line in stp:
                if line.strip() == self.params.params["corpus_name"]:
                    logging.info("Stopping training on command from stopfile.")

                    stop = True
                    break

        if stop:
            # remove the command from the file
            with open(filename, "r") as f:
                lines = f.readlines()

            with open(filename, "w") as f:
                for line in lines:
                    if line.strip() != self.params.params["corpus_name"]:
                        f.write(line)

        return stop
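
A minimal usage sketch, assuming a TensorFlow 1.x session and the project's Parameters loader (shown in Example #2); the parameter file name and example text are placeholders:

import tensorflow as tf

with tf.Session() as sess:
    params = Parameters('PARAMS')  # the project's parameter loader (see Example #2)
    arch = Architecture(sess, params)
    arch.training()
    print(arch.evaluate_string("quelle est la langue de ce texte ?"))
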
Example #2
import logging
import os
import time

import numpy as np

# Dataset, Model and Parameters are project-local modules; their import
# lines were omitted in the original extract.
class NNHelper(object):
    def __init__(self,
                 sess,
                 trained_model=None,
                 params=None,
                 prepare_train_set=True):
        start = time.time()

        self.session = sess
        self.params = Parameters('PARAMS')

        if trained_model:
            self.params.load_params(trained_model)
            logging.info('Loading model {0}'.format(trained_model))
        else:
            self.params = params

        self.train_set = Dataset(self.params,
                                 os.path.join('data',
                                              self.params.get('corpus_name'),
                                              'train'),
                                 only_eval=False)
        self.langs = {}

        with open(
                os.path.join('data', self.params.get('corpus_name'), 'labels'),
                'r') as f:
            for line in f.readlines():
                split = line.strip().split(' ', 1)
                self.langs[split[0]] = split[1]

        if prepare_train_set:
            self.train_set.prepare_data(self.params.get('min_count'))

        self.model = Model(self.session, self.params,
                           self.train_set.vocab_size())

        if trained_model:
            self.model.saver.restore(
                self.session,
                os.path.join('models', self.params.get('corpus_name'),
                             trained_model))

        print('Model prepared in {0} seconds'.format(
            str(int(time.time() - start))))

    def detect_langs(self, text, count):
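        """Return up to `count` {language_name: confidence} guesses for `text`."""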
        datafile = Dataset(self.params,
                           None,
                           os.path.join('data', self.params.get('corpus_name'),
                                        'train'),
                           text_to_eval=text)

        guesses = np.zeros(self.train_set.vocab_size()[1], np.int64)
        total = 0
        while not datafile.is_finished():
            batch_xs, _, lengths = datafile.get_batch()

            outs = self.model.eval(self.session, batch_xs, lengths)

            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    pred = outs[i][j]

                    if batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break

                    guesses[pred] += 1

                    total += 1

        result = {}

        for i in range(count):
            if all(item == 0 for item in guesses):
                break

            best = np.argmax(guesses)

            acc = 0
            if total > 0:
                acc = float(guesses[best]) / float(total)

            lang = self.langs[datafile.get_target_name(best, type='orig')]
            guesses[best] = 0

            result[lang] = acc

        return result

    def detect_lang(self, text):
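        """Return (language_name, confidence) for the best guess on `text`."""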
        datafile = Dataset(self.params,
                           None,
                           os.path.join('data', self.params.get('corpus_name'),
                                        'train'),
                           text_to_eval=text)

        guesses = np.zeros(self.train_set.vocab_size()[1], np.int64)
        total = 0
        while not datafile.is_finished():
            batch_xs, _, lengths = datafile.get_batch()

            outs = self.model.eval(self.session, batch_xs, lengths)

            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    pred = outs[i][j]

                    if batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break

                    guesses[pred] += 1

                    total += 1
        best = np.argmax(guesses)
        acc = 0
        if total > 0:
            acc = float(guesses[best]) / float(total)

        return self.langs[datafile.get_target_name(best, type='orig')], acc

    def test(self, dataset):
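        """Evaluate the model on data/<dataset>/test and return [correct, total]."""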
        datafile = Dataset(
            self.params, os.path.join('data', dataset, 'test'),
            os.path.join('data', self.params.get('corpus_name'), 'train'))
        datafile.prepare_data(self.params.get('min_count'))
        start = time.time()

        logging.info(
            'Testing started. Test dataset: {0}.'.format(dataset))
        corr = [0, 0]
        while not datafile.is_finished():
            batch_xs, batch_ys, lengths = datafile.get_batch()

            dropout = 1
            _, out = self.model.run(self.session, batch_xs, batch_ys, lengths,
                                    dropout)
            corr = np.sum([corr, out], axis=0)

        logging.info('Testing finished in {0} seconds'.format(
            str(int(time.time() - start))))

        return corr

    def train(self):
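        """Training loop: checkpoints every `steps_per_checkpoint` steps and
        stops on `max_iters`, a stop-file command, or the `time_stop` hour."""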
        self.train_set.skip_n_lines(self.params.get('trained_lines'))

        dev = Dataset(
            self.params,
            os.path.join('data', self.params.get('corpus_name'), 'dev'),
            os.path.join('data', self.params.get('corpus_name'), 'train'))
        dev.prepare_data(self.params.get('min_count'))
        start = time.time()
        cycle_time = time.time()
        logging.info('Training process started')
        stop = False
        loss_per_epoch = []
        accuracy_per_epoch = []

        while not stop:
            self.params.params['step'] += 1
            batch_xs, batch_ys, lengths = self.train_set.get_batch()
            l, _ = self.model.run(self.session, batch_xs, batch_ys, lengths,
                                  self.params.get('dropout'))
            loss_per_epoch.append(l)

            stop = self.check_stopfile('STOP_IMMEDIATELY')

            if time.strftime('%H') == self.params.get('time_stop'):
                stop = True

            if self.params.get('step') % self.params.get(
                    'steps_per_checkpoint') == 0 or stop:
                c_time = time.time()
                corr = [0, 0]

                while not dev.is_finished():
                    dev_batch_xs, dev_batch_ys, lengths = dev.get_batch()

                    dropout = 1
                    _, out = self.model.run(self.session, dev_batch_xs,
                                            dev_batch_ys, lengths, dropout)
                    corr = np.sum([corr, out], axis=0)

                result = (corr[0] / corr[1]) * 100
                accuracy_per_epoch.append(float(corr[0]) / float(corr[1]))

                self.params.params[
                    'trained_lines'] = self.train_set.get_trained_lines()
                self.model.save(self.session, self.params.get('step'), result)

                print('''Iteration: {0},
                Accuracy: {1}% {2},
                Time per step: {3} seconds
                Training time: {4} minutes
                Time: {5}'''.format(
                    self.params.get('step') * self.params.get('batch_size'),
                    result, corr, (c_time - cycle_time) /
                    self.params.get('steps_per_checkpoint'),
                    int((time.time() - start) / 60),
                    time.strftime('%H:%M:%S')))

                cycle_time = time.time()

                stop = stop or self.check_stopfile('STOP_MODEL')
                if self.params.get('step') >= self.params.get('max_iters'):
                    stop = True

            if self.train_set.is_finished():
                avg_loss = np.mean(loss_per_epoch)
                avg_test_accuracy = np.mean(accuracy_per_epoch)

                summ = self.session.run(self.model.performance_summaries,
                                        feed_dict={
                                            self.model.tf_loss_ph:
                                            avg_loss,
                                            self.model.tf_accuracy_ph:
                                            avg_test_accuracy
                                        })
                self.model.sum_writer.add_summary(summ,
                                                  self.params.get('epochs'))

                loss_per_epoch.clear()
                accuracy_per_epoch.clear()

                self.params.params["epochs"] += 1
                logging.info("Эпоха {0} начата.".format(
                    self.params.get('epochs')))
                self.train_set.restart()

        print("Обучение закончено за " + str(int(time.time() - start)) +
              " секунд")

    def check_stopfile(self, filename):
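        """Return True if `filename` lists this corpus name; the matching
        line is removed from the file so the command fires only once."""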
        stop = False
        with open(filename, mode="r") as stp:
            for line in stp:
                if line.strip() == self.params.params["corpus_name"]:
                    logging.info("Stopping training on command from stopfile.")

                    stop = True
                    break

        if stop:
            # remove the command from the file
            with open(filename, "r") as f:
                lines = f.readlines()

            with open(filename, "w") as f:
                for line in lines:
                    if line.strip() != self.params.params["corpus_name"]:
                        f.write(line)

        return stop
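
A minimal usage sketch, assuming a TensorFlow 1.x session; the checkpoint name passed as trained_model is a placeholder:

import tensorflow as tf

with tf.Session() as sess:
    helper = NNHelper(sess, trained_model='model-checkpoint')
    print(helper.detect_lang('this text is clearly English'))
    print(helper.detect_langs('un texte qui mélange two languages', 2))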