import logging
import os
import time

import numpy as np

# Project-local dependencies. The original import block is not part of this
# section, so these module paths are assumptions.
from dataset import Dataset
from model import Model
from parameters import Parameters
import Vocabulary


class Architecture(object):
    def __init__(self, sess, params, trained_model=False, prepare_train_set=True):
        start = time.time()
        self.sess = sess
        self.params = params
        self.train_set = Dataset(self.params,
                                 "data/" + self.params.get("corpus_name") + "/train",
                                 None, only_eval=False)
        if prepare_train_set:
            self.train_set.prepare_data(self.params.get("min_count"))
        self.model = Model(sess, self.params, self.train_set.vocab_size())
        if trained_model:
            self.model.saver.restore(sess, trained_model)
        print("Model prepared in " + str(int(time.time() - start)) + " seconds.")

    def evaluate(self, files, max_langs_per_file, allowed_langs, output_file,
                 threshold, eval_lines=False, eval_blocks=False, smoothing=0,
                 unknown=None, separator=",", code_swaps=None):
        # np.int was removed in NumPy 1.24; plain int is equivalent here
        langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=int)
        for allowed in self.train_set.get_tagging_classes():
            langs_mask[allowed] = 1
        for l in allowed_langs:
            # try to find the language in the original vocabulary
            lang_id = self.train_set.trg_vocab.get_id(l)
            if lang_id == Vocabulary.Vocab.UNK_ID:
                print("UNSUPPORTED LANGUAGE IN MODEL: " + l)
            else:
                langs_mask[lang_id] = 1
        datafile = Dataset(self.params, None,
                           "data/" + self.params.get("corpus_name") + "/train",
                           only_eval=True, use_eol=eval_lines)
        if smoothing > 0:
            print("USING SMOOTHING OF {0}".format(smoothing))
        with open(output_file, encoding='utf-8', mode='w', buffering=1) as bal:
            for filename in files:
                # files has the structure [folder, output_name, possible_encoding]
                if len(filename) > 2:
                    datafile.restart(filename[0] + filename[1], filename[2])
                else:
                    datafile.restart(filename[0] + filename[1])
                guesses = np.zeros(self.train_set.vocab_size()[1], int)
                row = np.zeros(self.train_set.vocab_size()[1], int)
                row_length = 0
                total = 0
                smooth = []
                while not datafile.is_finished():
                    dev_batch_xs, dev_batch_ys, lengths = datafile.get_batch()
                    outs = self.model.eval(self.sess, dev_batch_xs, lengths,
                                           langs_mask=langs_mask)
                    for j in range(len(outs[0])):
                        block_guesses = np.zeros(self.train_set.vocab_size()[1], int)
                        for i in range(len(outs)):
                            if dev_batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                                break
                            total += 1
                            if eval_lines:
                                if dev_batch_xs[i][j] == datafile.trg_vocab.EOL_ID:
                                    # end of line: credit the whole line to its majority language
                                    guesses[np.argmax(row)] += row_length
                                    row = np.zeros(self.train_set.vocab_size()[1], int)
                                    row_length = 0
                                else:
                                    row[outs[i][j]] += 1
                                    row_length += 1
                            elif eval_blocks:
                                block_guesses[outs[i][j]] += 1
                            elif smoothing > 0:
                                smooth.append(outs[i][j])
                            else:
                                guesses[outs[i][j]] += 1
                        if eval_blocks:
                            # i is the number of non-padding characters in this block
                            guesses[np.argmax(block_guesses)] += i
                if smoothing > 0:
                    # a for-loop index cannot be advanced from inside the loop in
                    # Python, so a while loop is needed for the skip to take effect
                    i = 0
                    while i < len(smooth):
                        if i + smoothing < len(smooth) and smooth[i] == smooth[i + smoothing]:
                            # if the first and the last are the same,
                            # the ones in between should be too
                            guesses[smooth[i]] += smoothing
                            i += smoothing
                        else:
                            guesses[smooth[i]] += 1
                            i += 1
                langs = 0
                last_count = 1
                seznam = ""
                for best in np.argsort(-guesses):
                    if guesses[best] == 0 or langs == max_langs_per_file:
                        break
                    guess_name = datafile.get_target_name(best, "iso2")
                    percent = 100 * guesses[best] / total
                    if guess_name in allowed_langs:
                        if code_swaps is not None and guess_name in code_swaps:
                            guess_name = code_swaps[guess_name]
                        # print at least one language
                        # alternative relative criterion:
                        # if langs > 0 and 100 * guesses[best] / last_count < threshold:
                        #     break
                        if langs > 0 and percent < threshold:
                            break
                        seznam += "{0} {1:.0f}; ".format(guess_name, percent)
                        bal.write(filename[1] + separator + guess_name + "\n")
                        langs += 1
                        last_count = guesses[best]
                    else:
                        print(filename[1] + ", not allowed lang: " + guess_name)
                if langs == 0 and unknown is not None:  # no language was output
                    bal.write(filename[1] + separator + unknown + "\n")

    def evaluate_dataset(self, source, allowed_languages=None):
        correct_all = 0
        total_all = 0
        with open(source, mode='r') as src:
            for l in src:
                if total_all % 1000 == 0:
                    print("processed lines ", total_all)
                entry = l.strip().split(' ', 1)
                if allowed_languages is not None:
                    guess = self.evaluate_string(entry[1], languages=allowed_languages)
                else:
                    guess = self.evaluate_string(entry[1])
                total_all += 1
                if entry[0] == guess[0]:
                    correct_all += 1
        print("Accuracy all: {0} ({1}/{2})".format(
            correct_all / total_all, correct_all, total_all))

    def evaluate_string(self, text, print_per_character=False, languages=None):
        langs_mask = None
        if languages is not None:
            langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=int)
            for l in languages:
                # try to find the language in the original vocabulary
                lang_id = self.train_set.trg_vocab.get_id(l)
                if lang_id == Vocabulary.Vocab.UNK_ID:
                    print("UNSUPPORTED LANGUAGE IN MODEL: " + l)
                else:
                    langs_mask[lang_id] = 1
        datafile = Dataset(self.params, None,
                           "data/" + self.params.get("corpus_name") + "/train",
                           text_to_eval=text)
        guesses = np.zeros(self.train_set.vocab_size()[1], int)
        total = 0
        orig = ""
        classif = ""
        while not datafile.is_finished():
            dev_batch_xs, _, lengths = datafile.get_batch()
            if languages is not None:
                outs = self.model.eval(self.sess, dev_batch_xs, lengths,
                                       langs_mask=langs_mask)
            else:
                outs = self.model.eval(self.sess, dev_batch_xs, lengths)
            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    maxim = outs[i][j]
                    if dev_batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break
                    guesses[maxim] += 1
                    total += 1
        best = np.argmax(guesses)
        if print_per_character:
            print(orig)
            print(classif)
        accur = 0
        if total > 0:
            accur = float(guesses[best]) / float(total)
        result = [datafile.get_target_name(best, type='name'), accur]
        print(result)
        # return the guess so that evaluate_dataset can compare it to the gold label
        return result

    def training(self, eval=None):
        self.train_set.skip_n_lines(self.params.params["trained_lines"])
        dev = Dataset(self.params,
                      "data/" + self.params.get("corpus_name") + "/dev",
                      "data/" + self.params.get("corpus_name") + "/train")
        dev.prepare_data(self.params.get("min_count"))
        start = time.time()  # for measuring the total training time
        cycle_time = time.time()
        logging.info("Training process begun.")
        stop = False
        loss_per_epoch = []
        accuracy_per_epoch = []
        # keep training until the maximum number of iterations is reached
        while not stop:
            self.params.params["step"] += 1
            batch_xs, batch_ys, lengths = self.train_set.get_batch()
            l, _ = self.model.run(self.sess, batch_xs, batch_ys, lengths,
                                  self.params.get("dropout"))
            loss_per_epoch.append(l)
            stop = self.check_stopfile("STOP_IMMEDIATELY")
            if time.strftime("%H") == self.params.get("time_stop"):
                stop = True
            if self.params.params["step"] % self.params.get("steps_per_checkpoint") == 0 or stop:
                c_time = time.time()
                corr = [0, 0]
                while not dev.is_finished() and eval is None:
                    dev_batch_xs, dev_batch_ys, lengths = dev.get_batch()
                    dropout = 1
                    _, out = self.model.run(self.sess, dev_batch_xs, dev_batch_ys,
                                            lengths, dropout)
                    corr = np.sum([corr, out], axis=0)
                if eval is not None:
                    logging.info("Not testing on dev but on a special function.")
                    result = eval()
                else:
                    # restart the development data
                    dev.restart()
                    result = (corr[0] / corr[1]) * 100
                    accuracy_per_epoch.append(corr[0] / corr[1])
                self.params.params["trained_lines"] = self.train_set.get_trained_lines()
                self.model.save(self.sess, self.params.params["step"], result)
                print("Iter {0}, total correctness: {1} % {2}, time per step: {3} s, "
                      "total time: {4} min, {5}".format(
                          self.params.params["step"] * self.params.get("batch_size"),
                          result, corr,
                          (c_time - cycle_time) / self.params.get("steps_per_checkpoint"),
                          int((time.time() - start) / 60),
                          time.strftime("%H:%M:%S")))
                cycle_time = time.time()
                # if stop is already True, do not change it
                stop = stop or self.check_stopfile("STOP_MODEL")
                if self.params.params["step"] >= self.params.get("max_iters"):
                    stop = True
            # check whether the training file was finished and, if so, start over
            if self.train_set.is_finished():
                avg_loss = np.mean(loss_per_epoch)
                avg_test_accuracy = np.mean(accuracy_per_epoch)
                summ = self.sess.run(self.model.performance_summaries,
                                     feed_dict={
                                         self.model.tf_loss_ph: avg_loss,
                                         self.model.tf_accuracy_ph: avg_test_accuracy
                                     })
                self.model.sum_writer.add_summary(summ, self.params.get('epochs'))
                loss_per_epoch.clear()
                accuracy_per_epoch.clear()
                self.params.params["epochs"] += 1
                logging.info("Generator read the training file completely and starts over.")
                self.train_set.restart()
        print("Training finished in " + str(int(time.time() - start)) + " s")

    def check_stopfile(self, filename):
        stop = False
        with open(filename, mode="r") as stp:
            for line in stp:
                if line.strip() == self.params.params["corpus_name"]:
                    logging.info("Stopping training on command from stopfile.")
                    stop = True
                    break
        if stop:
            # remove the command from the file
            with open(filename, "r") as f:
                lines = f.readlines()
            with open(filename, "w") as f:
                for line in lines:
                    if line.strip() != self.params.params["corpus_name"]:
                        f.write(line)
        return stop
class NNHelper(object):
    def __init__(self, sess, trained_model=None, params=None, prepare_train_set=True):
        start = time.time()
        self.session = sess
        self.params = Parameters('PARAMS')
        if trained_model:
            self.params.load_params(trained_model)
            logging.info('Loading model {0}'.format(trained_model))
        else:
            self.params = params
        self.train_set = Dataset(self.params,
                                 os.path.join('data', self.params.get('corpus_name'), 'train'),
                                 only_eval=False)
        # mapping from label codes to human-readable language names
        self.langs = {}
        with open(os.path.join('data', self.params.get('corpus_name'), 'labels'), 'r') as f:
            for line in f.readlines():
                split = line.strip().split(' ', 1)
                self.langs[split[0]] = split[1]
        if prepare_train_set:
            self.train_set.prepare_data(self.params.get('min_count'))
        self.model = Model(self.session, self.params, self.train_set.vocab_size())
        if trained_model:
            self.model.saver.restore(
                self.session,
                os.path.join('models', self.params.get('corpus_name'), trained_model))
        print('Model prepared in {0} seconds'.format(str(int(time.time() - start))))

    def detect_langs(self, text, count):
        datafile = Dataset(self.params, None,
                           os.path.join('data', self.params.get('corpus_name'), 'train'),
                           text_to_eval=text)
        guesses = np.zeros(self.train_set.vocab_size()[1], int)
        total = 0
        while not datafile.is_finished():
            batch_xs, _, lengths = datafile.get_batch()
            outs = self.model.eval(self.session, batch_xs, lengths)
            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    pred = outs[i][j]
                    if batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break
                    guesses[pred] += 1
                    total += 1
        result = {}
        for i in range(count):
            if all(item == 0 for item in guesses):
                break
            best = np.argmax(guesses)
            acc = 0
            if total > 0:
                acc = float(guesses[best]) / float(total)
            lang = self.langs[datafile.get_target_name(best, type='orig')]
            # zero out the winner so the next argmax finds the runner-up
            guesses[best] = 0
            result[lang] = acc
        return result

    def detect_lang(self, text):
        datafile = Dataset(self.params, None,
                           os.path.join('data', self.params.get('corpus_name'), 'train'),
                           text_to_eval=text)
        guesses = np.zeros(self.train_set.vocab_size()[1], int)
        total = 0
        while not datafile.is_finished():
            batch_xs, _, lengths = datafile.get_batch()
            outs = self.model.eval(self.session, batch_xs, lengths)
            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    pred = outs[i][j]
                    if batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break
                    guesses[pred] += 1
                    total += 1
        best = np.argmax(guesses)
        acc = 0
        if total > 0:
            acc = float(guesses[best]) / float(total)
        return self.langs[datafile.get_target_name(best, type='orig')], acc

    def test(self, dataset):
        datafile = Dataset(self.params,
                           os.path.join('data', dataset, 'test'),
                           os.path.join('data', self.params.get('corpus_name'), 'train'))
        datafile.prepare_data(self.params.get('min_count'))
        start = time.time()
        logging.info('Testing started. Test dataset: {0}.'.format(dataset))
        corr = [0, 0]
        while not datafile.is_finished():
            batch_xs, batch_ys, lengths = datafile.get_batch()
            dropout = 1
            _, out = self.model.run(self.session, batch_xs, batch_ys, lengths, dropout)
            corr = np.sum([corr, out], axis=0)
        logging.info('Testing finished in {0} seconds'.format(str(int(time.time() - start))))
        return corr

    def train(self):
        self.train_set.skip_n_lines(self.params.get('trained_lines'))
        dev = Dataset(self.params,
                      os.path.join('data', self.params.get('corpus_name'), 'dev'),
                      os.path.join('data', self.params.get('corpus_name'), 'train'))
        dev.prepare_data(self.params.get('min_count'))
        start = time.time()
        cycle_time = time.time()
        logging.info('Training process started')
        stop = False
        loss_per_epoch = []
        accuracy_per_epoch = []
        while not stop:
            self.params.params['step'] += 1
            batch_xs, batch_ys, lengths = self.train_set.get_batch()
            l, _ = self.model.run(self.session, batch_xs, batch_ys, lengths,
                                  self.params.get('dropout'))
            loss_per_epoch.append(l)
            stop = self.check_stopfile('STOP_IMMEDIATELY')
            if time.strftime('%H') == self.params.get('time_stop'):
                stop = True
            if self.params.get('step') % self.params.get('steps_per_checkpoint') == 0 or stop:
                c_time = time.time()
                corr = [0, 0]
                while not dev.is_finished():
                    dev_batch_xs, dev_batch_ys, lengths = dev.get_batch()
                    dropout = 1
                    _, out = self.model.run(self.session, dev_batch_xs, dev_batch_ys,
                                            lengths, dropout)
                    corr = np.sum([corr, out], axis=0)
                # restart the development data so it can be reused at the next checkpoint
                dev.restart()
                result = (corr[0] / corr[1]) * 100
                accuracy_per_epoch.append(float(corr[0]) / float(corr[1]))
                self.params.params['trained_lines'] = self.train_set.get_trained_lines()
                self.model.save(self.session, self.params.get('step'), result)
                print('Iteration: {0}, accuracy: {1}% {2}, time per step: {3} s, '
                      'training time: {4} min, time: {5}'.format(
                          self.params.get('step') * self.params.get('batch_size'),
                          result, corr,
                          (c_time - cycle_time) / self.params.get('steps_per_checkpoint'),
                          int((time.time() - start) / 60),
                          time.strftime('%H:%M:%S')))
                cycle_time = time.time()
                stop = stop or self.check_stopfile('STOP_MODEL')
                if self.params.get('step') >= self.params.get('max_iters'):
                    stop = True
            if self.train_set.is_finished():
                avg_loss = np.mean(loss_per_epoch)
                avg_test_accuracy = np.mean(accuracy_per_epoch)
                summ = self.session.run(self.model.performance_summaries,
                                        feed_dict={
                                            self.model.tf_loss_ph: avg_loss,
                                            self.model.tf_accuracy_ph: avg_test_accuracy
                                        })
                self.model.sum_writer.add_summary(summ, self.params.get('epochs'))
                loss_per_epoch.clear()
                accuracy_per_epoch.clear()
                self.params.params["epochs"] += 1
                logging.info("Epoch {0} started.".format(self.params.get('epochs')))
                self.train_set.restart()
        print("Training finished in " + str(int(time.time() - start)) + " seconds")

    def check_stopfile(self, filename):
        stop = False
        with open(filename, mode="r") as stp:
            for line in stp:
                if line.strip() == self.params.params["corpus_name"]:
                    logging.info("Stopping training on command from stopfile.")
                    stop = True
                    break
        if stop:
            # remove the command from the file
            with open(filename, "r") as f:
                lines = f.readlines()
            with open(filename, "w") as f:
                for line in lines:
                    if line.strip() != self.params.params["corpus_name"]:
                        f.write(line)
        return stop
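
# Usage sketch (not from the original code): the same flow through NNHelper, which
# loads its own Parameters from the 'PARAMS' file when a trained model name is given.
# The model name and the input text are hypothetical placeholders.
def _demo_nnhelper(trained_model='model.ckpt'):
    import tensorflow as tf
    with tf.Session() as sess:
        helper = NNHelper(sess, trained_model=trained_model)
        # single best guess: (human-readable language name, per-character confidence)
        lang, confidence = helper.detect_lang("Ceci n'est pas une pipe.")
        print(lang, confidence)
        # top-3 guesses as a {language: confidence} dictionary
        print(helper.detect_langs("Ceci n'est pas une pipe.", count=3))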