def train_in_ids_lm(train_data, vocab_path, out_dir):
    """Convert a raw corpus into word-id lines for the language model.

    Each line of ``train_data`` is tokenized (with simple punctuation
    splitting), mapped to ids via DataUtility, and written to
    ``vocab_path/out_dir`` as "ids#ids".

    Args:
        train_data: path of the raw text corpus, one sentence per line.
        vocab_path: directory holding the vocab files; created if missing.
        out_dir: output file name inside vocab_path.
    """
    # makedirs also creates intermediate directories; exist_ok avoids the
    # race between the existence check and the creation (os.mkdir did not).
    os.makedirs(vocab_path, exist_ok=True)
    vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
    vocab_file_out = os.path.join(vocab_path, "vocab_out")
    data_ut = DataUtility(vocab_file_in_words=vocab_file_in_words,
                          vocab_file_out=vocab_file_out)
    with codecs.open(train_data, "r") as f, \
            codecs.open(os.path.join(vocab_path, out_dir), "w") as f1:
        # Iterate lazily instead of f.readlines() so the whole corpus is
        # never held in memory at once.
        for line in f:
            words = line.strip()
            # Separate punctuation so it tokenizes as its own word.
            words = words.replace('.', ' .')
            words = words.replace(',', ' ,')
            words = words.replace("'", "' ")
            words = words.replace('"', '" ')
            words = words.split()
            words_ids = data_ut.words2ids(words)
            words_ids = ' '.join(str(id) for id in words_ids)
            f1.write(words_ids + '#' + words_ids + '\n')
def __init__(self, model_path, config_name):
    """Load a sparse, fine-tuned frozen TF graph and its vocabularies.

    Args:
        model_path: directory holding the vocab files, the config file and
            the frozen graph 'sparse_graph-finetune-<config_name>.pb'.
        config_name: config file name inside model_path; also used to
            derive the frozen-graph file name.
    """
    vocab_file_in_words = os.path.join(model_path, "vocab_in_words")
    vocab_file_in_letters = os.path.join(model_path, "vocab_in_letters")
    vocab_file_out = os.path.join(model_path, "vocab_out")
    config_file = os.path.join(model_path, config_name)
    config = Config()
    config.get_config(config_file)
    self._data_utility = DataUtility(
        vocab_file_in_words=vocab_file_in_words,
        vocab_file_in_letters=vocab_file_in_letters,
        vocab_file_out=vocab_file_out,
        max_sentence_length=config.num_steps)
    self.sparsity = config.sparsity
    # Tensor names gain the "import/" prefix added by tf.import_graph_def;
    # the ":0"/":1" suffix selects the op's output index.
    prefix = "import/"
    self.top_k_name = prefix + "Online/Model/top_k:0"
    self.state_in_name = prefix + "Online/Model/state:0"
    self.input_name = prefix + "Online/Model/batched_input_word_ids:0"
    # ":1" selects the indices output of the top_k op.
    self.top_k_prediction_name = prefix + "Online/Model/top_k_prediction:1"
    self.output_name = prefix + "Online/Model/probabilities:0"
    self.state_out_name = prefix + "Online/Model/state_out:0"
    saved_model_path = os.path.join(
        model_path, 'sparse_graph-finetune-' + config_name + '.pb')
    # Deserialize the frozen GraphDef and import it into the default graph.
    with open(saved_model_path, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def)
    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.per_process_gpu_memory_fraction = config.gpu_fraction
    self._sess = tf.Session(config=gpu_config)
def __init__(self, graph_file, vocab_path, full_vocab, config_name,
             use_phrase=False):
    """Load vocabularies/config and import a frozen TF graph into a session.

    Args:
        graph_file: path to the frozen GraphDef (.pb) file.
        vocab_path: directory holding the vocab files and the config file.
        full_vocab: path of the full input-word vocabulary file.
        config_name: name of the config file inside vocab_path.
        use_phrase: whether phrase-prediction outputs should be used.
    """
    vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
    vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
    vocab_file_out = os.path.join(vocab_path, "vocab_out")
    vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")
    self.use_phrase = use_phrase
    self._config = Config()
    self._config.get_config(vocab_path, config_name)
    self._data_utility = DataUtility(
        vocab_file_in_words=vocab_file_in_words,
        vocab_file_in_letters=vocab_file_in_letters,
        vocab_file_out=vocab_file_out,
        vocab_file_phrase=vocab_file_phrase,
        full_vocab_file_in_words=full_vocab)
    print(
        "in words vocabulary size = %d\nout words vocabulary size = %d\nin letters vocabulary size = %d"
        "\nphrase vocabulary size = %d" %
        (self._config.vocab_size_in, self._config.vocab_size_out,
         self._config.vocab_size_letter, self._config.vocab_size_phrase))
    # Tensor names gain the "import/" prefix added by tf.import_graph_def;
    # ":0"/":1" select the op's output index.
    prefix = "import/"
    self.lm_state_in_name = prefix + "Online/WordModel/state:0"
    self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
    self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"
    # NOTE(review): the phrase tensor names below contain a space before
    # the output index (": 1"), unlike every other name here — confirm
    # they actually resolve in the target graph.
    self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction: 1"
    self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities: 0"
    self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction: 1"
    self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities: 0"
    self.phrase_logits = prefix + "Online/WordModel/logits_phrase: 0"
    self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
    self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
    self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
    self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
    self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
    self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
    self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
    self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"
    # Deserialize the frozen graph and import it into the default graph.
    with open(graph_file, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def)
    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
    self._sess = tf.Session(config=gpu_config)
def render_PUT_advanced(self, request, response):
    """Handle an advanced CoAP PUT: parse the payload and dispatch it to
    the handler registered for the first two URI segments.

    Returns (self, response) on success, (self, None) when the handler key
    requires no response processing, and None (implicitly) when no handler
    matches or the payload is empty — preserving the original contract.
    """
    request_text = str(request.payload)
    uri = request.uri_path
    # Split once and reuse; the handler key is the first two URI segments.
    paths = uri.split('/')
    first_part_uri = paths[0]
    second_part_uri = paths[1]
    local_uri = first_part_uri + '/' + second_part_uri
    key = self.get_data_handlers_key_byuri(local_uri)
    # assert (isinstance(response, Response))
    if key is not None and request_text is not None and "" != request_text:
        listener = self._handler.get(key)
        if listener is not None:
            fmt = request.content_type
            from data_utility import DataUtility
            parser = DataUtility().get_parser(fmt)
            if parser is None:
                # No parser for this media type: wrap the raw text.
                from ..model.resource_data_general import ResourceDataGeneral
                data = ResourceDataGeneral(request_text)
            else:
                data = parser.parse(request_text)
                data.set_format(fmt)
            # ResourceDataOCF/LMW2M/GENERAL
            # BUG FIX: the original guarded paths[3] with len(paths) > 2,
            # which raises IndexError for a three-segment URI; index 3
            # requires len(paths) > 3.
            device_id = paths[3] if len(paths) > 3 else ""
            resource_uri = ""
            if len(paths[4:]) > 0:
                resource_uri = "/".join(paths[4:])
            if not resource_uri.startswith("/"):
                resource_uri = "/" + resource_uri
            """
            resource_uri = paths[3] if len(paths) > 3 else ""
            if fmt == MediaTypeFormat.APPLICATION_JSON:
                resource_uri += "/" + paths[4] if len(paths) > 4 else ""
            if fmt == MediaTypeFormat.TEXT_PLAIN:
                resource_uri += ("/" + paths[4]) if len(paths) > 4 else ""
                resource_uri += ("/" + paths[5]) if len(paths) > 5 else ""
            """
            rt = listener(device_id, resource_uri, data)  # listener is a function
            if key.process:
                # Translate the listener's boolean result into a CoAP code.
                if rt is True:
                    response.code = defines.Codes.CHANGED.number
                else:
                    response.code = defines.Codes.FORBIDDEN.number
                response.content_type = fmt
                response.payload = data.to_json()
                return self, response
            else:
                return self, None  # response changed
    else:
        pass
def __init__(self, model_path, model_name, config_name, full_vocab_path=None):
    """Restore a trained PTB-style language model from a checkpoint.

    Args:
        model_path: directory holding vocab files, the checkpoint and config.
        model_name: checkpoint file name inside model_path.
        config_name: config file name inside model_path.
        full_vocab_path: optional full input-word vocabulary file.
    """
    vocab_file_in_words = os.path.join(model_path, "vocab_in_words")
    vocab_file_in_letters = os.path.join(model_path, "vocab_in_letters")
    vocab_file_out = os.path.join(model_path, "vocab_out")
    model_file = os.path.join(model_path, model_name)
    config_file = os.path.join(model_path, config_name)
    self._config = Config()
    self._config.get_config(config_file)
    self._data_utility = DataUtility(
        vocab_file_in_words=vocab_file_in_words,
        vocab_file_in_letters=vocab_file_in_letters,
        vocab_file_out=vocab_file_out,
        max_sentence_length=self._config.num_steps,
        full_vocab_file_in_words=full_vocab_path)
    # Inference feeds one word at a time, so force batch/time dims to 1.
    self._config.batch_size = 1
    self._config.num_steps = 1
    with tf.Graph().as_default():
        with tf.variable_scope("Model"):
            self._language_model_test = PTBModel(is_training=False,
                                                 config=self._config,
                                                 bucket=1)
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)
        with self._sess.as_default():
            # Do not restore sparse weights from pretrain phase
            restore_variables = dict()
            for v in tf.trainable_variables():
                if v.name.startswith("Model/Softmax/softmax_sp_trainable_weights") \
                        or v.name.startswith("Model/Embedding/embedding_sp_trainable_weights"):
                    continue
                print("restore:", v.name)
                restore_variables[v.name] = v
            saver = tf.train.Saver(restore_variables)
            saver.restore(self._sess, model_file)
        # Tensors fetched at every inference step.
        self._fetches = {
            "topk": self._language_model_test._top_k_prediction,
            "probability": self._language_model_test._probabilities,
            "final_state": self._language_model_test.final_state
        }
def main():
    """Create an AVHRR FCDR template, fill a few sample values, write to disk."""
    writer = FCDRWriter()

    # Template for the sensor name in FULL format with product height 128;
    # the scan-width is set automatically.
    dataset = writer.createTemplateFull("AVHRR", 128)

    # Mandatory CF global attributes — writing fails if any are missing.
    # CF version and FIDUCEO license are set automatically.
    global_attributes = {
        "institution": "Brockmann Consult GmbH",
        "title": "FIDUCEO test dataset",
        "source": "arbitray stuff",
        "history": "none",
        "references": "CDR_FCDR sensor reference documentation",
        "comment": "just to show how things are intended to be used",
    }
    for attribute_name, attribute_value in global_attributes.items():
        dataset.attrs[attribute_name] = attribute_value

    # Variables start out filled with "_FillValue"; writing only part of
    # the array is completely OK.
    time_variable = dataset.variables["Time"]
    time_variable.data[44] = 0.456
    time_variable.data[45] = 0.457

    azimuth = dataset.variables["relative_azimuth_angle"]
    azimuth.data[3, 0] = 0.567
    azimuth.data[3, 1] = 0.568

    # Guard against over/underflow of the scaled representation.
    DataUtility.check_scaling_ranges(azimuth)

    # Build a standardized file name from sensor, platform and time range.
    start = datetime.datetime(2006, 8, 23, 14, 24, 52)
    end = datetime.datetime(2006, 8, 23, 15, 25, 53)
    file_name = writer.create_file_name_FCDR_full("AVHRR", "NOAA12", start,
                                                  end, "01.2")

    # Dump to disk: netcdf4, medium compression, overwrite existing file.
    writer.write(dataset, "D:\\Satellite\\DELETE\\" + file_name, overwrite=True)
def train_in_ids_letters(train_data, vocab_path, emoji_data):
    """Convert the training corpus to per-word letter-id sequences.

    The first tab field of each input line holds '#'-separated words.
    Every word is expanded to space-separated letters and mapped to letter
    ids; emojis (and the leading sentence-start slot) are encoded as '1'.
    Output lines, words joined by '#', go to
    vocab_path/train_in_ids_letters.

    Args:
        train_data: corpus file; each line is "<words>#...<TAB>...".
        vocab_path: directory with vocab files; created if missing.
        emoji_data: file whose first tab field on each line is an emoji.
    """
    # makedirs + exist_ok: handles nested paths and avoids the
    # check-then-create race that os.mkdir had.
    os.makedirs(vocab_path, exist_ok=True)
    vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
    vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
    vocab_file_out = os.path.join(vocab_path, "vocab_out")
    data_ut = DataUtility(vocab_file_in_words=vocab_file_in_words,
                          vocab_file_in_letters=vocab_file_in_letters,
                          vocab_file_out=vocab_file_out)
    # A set gives O(1) membership tests in the per-word loop below
    # (the original list made each test O(n)).
    emojis = set()
    with codecs.open(emoji_data, "r") as f:
        for line in f:
            emoji, _ = line.strip().split('\t')
            emojis.add(emoji)
    with codecs.open(train_data, "r") as f, \
            codecs.open(os.path.join(vocab_path, "train_in_ids_letters"),
                        "w") as f1:
        # Iterate lazily instead of f.readlines().
        for line in f:
            letters, _ = line.strip().split('\t')
            # e.g. ['where', 'so', 'you', 'want', 'me', 'tk', ...]
            words = letters.split('#')
            letters_ids = ['1']  # leading sentence-start marker
            for word in words:
                if word in emojis:
                    letters_ids.append('1')
                    continue
                # 'where' -> 'w h e r e' -> e.g. [1, 25, 10, 7, 20, 7];
                # ' '.join(word) replaces the original char-append loop.
                letter_ids = data_ut.letters2ids(' '.join(word))
                # -> '1 25 10 7 20 7'
                letters_ids.append(' '.join(str(id) for id in letter_ids))
            f1.write('#'.join(letters_ids) + '\n')
def get_config(self, vocab_path, config_filename=None):
    """Populate vocabulary sizes from the vocab files and, optionally,
    hyper-parameters from a config file.

    The config file holds one "name value" pair per line; lines starting
    with '#' are comments.  Unknown parameter names are silently ignored,
    matching the behavior of the original if/elif chain.

    Args:
        vocab_path: directory holding the four vocab files.
        config_filename: optional path of the "name value" config file.
    """
    vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
    vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
    vocab_file_out = os.path.join(vocab_path, "vocab_out")
    vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")
    print("the data file path is:", vocab_path)
    self.data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words,
                                    vocab_file_in_letters=vocab_file_in_letters,
                                    vocab_file_out=vocab_file_out,
                                    vocab_file_phrase=vocab_file_phrase)
    self.vocab_size_letter = self.data_utility.in_letters_count
    self.vocab_size_in = self.data_utility.in_words_count
    self.vocab_size_out = self.data_utility.out_words_count
    self.vocab_size_phrase = self.data_utility.phrase_count
    if config_filename is None:
        return
    # Parameter name -> type converter; replaces a 16-branch elif chain.
    param_types = {
        "init_scale": float,
        "learning_rate": float,
        "max_grad_norm": float,
        "num_layers": int,
        "num_steps": int,
        "max_word_length": int,
        "word_embedding_size": int,
        "letter_embedding_size": int,
        "word_hidden_size": int,
        "letter_hidden_size": int,
        "max_epoch": int,
        "max_max_epoch": int,
        "keep_prob": float,
        "lr_decay": float,
        "batch_size": int,
        "gpu_fraction": float,
    }
    with open(config_filename) as f:
        for line in f:
            if line.startswith('#'):
                continue
            param, value = line.split()
            convert = param_types.get(param)
            if convert is not None:
                setattr(self, param, convert(value))
def __init__(self, config,
             vocab_file_in_words="resource/vocab/vocab_in_words",
             vocab_file_in_letters="resource/vocab/vocab_in_letters",
             vocab_file_out="resource/vocab/vocab_out",
             corpus_file_in_words="resource/train_data/train_in_ids_words",
             corpus_file_in_letters="resource/train_data/train_in_ids_letters",
             corpus_file_out="resource/train_data/train_out_ids"):
    """Load the id corpora and distribute every sample into size buckets.

    Use bucketing to reduce padding: each sample goes into the smallest
    bucket that fits words+letters; over-length samples are truncated to
    the largest bucket (letters kept in full, trailing words kept).
    """
    self.PAD_ID = 0
    self.Buckets = config.buckets
    self.data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words,
                                    vocab_file_in_letters=vocab_file_in_letters,
                                    vocab_file_out=vocab_file_out,
                                    max_sentence_length=0)
    corpus_in_words = self.load_corpus(corpus_file_in_words)
    corpus_in_letters = self.load_corpus(corpus_file_in_letters)
    corpus_out = self.load_corpus(corpus_file_out)
    # all_data is divided into the different buckets.
    self.all_data = [[] for _ in self.Buckets]
    for i in range(len(corpus_in_words)):
        in_words_array = corpus_in_words[i].strip().split()
        in_letters_array = corpus_in_letters[i].strip().split()
        if len(in_letters_array) + len(in_words_array) == 0:
            continue
        if len(in_letters_array) <= self.Buckets[-1]:
            # Place the sample into the smallest bucket it fits.
            for bucketid, bucketlength in enumerate(self.Buckets):
                if len(in_letters_array) + len(in_words_array) <= bucketlength:
                    # Pad up to the bucket length.
                    in_data = in_words_array + in_letters_array + \
                        [self.PAD_ID] * (bucketlength - len(in_words_array)
                                         - len(in_letters_array))
                    words_num = len(in_words_array)
                    letters_num = len(in_letters_array)
                    out_data = corpus_out[i].strip()
                    data = Data(in_data=in_data, words_num=words_num,
                                letters_num=letters_num, out_data=out_data)
                    self.all_data[bucketid].append(data)
                    break
        if len(in_letters_array) + len(in_words_array) > self.Buckets[-1]:
            # Too long for the biggest bucket: keep all letters and only as
            # many trailing words as still fit.
            if len(in_letters_array) < self.Buckets[-1]:
                in_data = in_words_array[-(self.Buckets[-1] -
                                           len(in_letters_array)):] + in_letters_array
            else:
                in_data = in_letters_array
            words_num = self.Buckets[-1] - len(in_letters_array)
            letters_num = len(in_letters_array)
            out_data = corpus_out[i].strip()
            data = Data(in_data=in_data, words_num=words_num,
                        letters_num=letters_num, out_data=out_data)
            self.all_data[self.Buckets.index(self.Buckets[-1])].append(data)
            # BUG FIX: the original had a bare `break` here, which aborted
            # the whole corpus loop at the first over-length sample and
            # silently dropped every remaining sample.
    self.train_bucket_sizes = [len(self.all_data[b])
                               for b in range(len(self.Buckets))]
    print("bucket size = " + str(self.train_bucket_sizes))
    self.num_samples = float(sum(self.train_bucket_sizes))
    # Cumulative bucket-size fractions, used for weighted bucket sampling.
    self.train_buckets_scale = [
        sum(self.train_bucket_sizes[:i + 1]) / self.num_samples
        for i in range(len(self.train_bucket_sizes))]
    print("bucket_scale = " + str(self.train_buckets_scale))
    print("samples num = " + str(self.num_samples))
    self.current_batch_index = [0 for i in range(len(self.Buckets))]
    self.tmp_bucket_sizes = [len(self.all_data[b])
                             for b in range(len(self.Buckets))]
    self.tmp_bucket_scale = [
        sum(self.train_bucket_sizes[:i + 1]) / self.num_samples
        for i in range(len(self.train_bucket_sizes))]
def train_in_ids_lm(train_data, vocab_path):
    """Convert a pre-tokenized corpus into word-id lines for the LM.

    The second tab field of each line in ``train_data`` holds
    '#'-separated words; they are mapped to ids and written as "ids#ids"
    to vocab_path/train_in_ids_lm.

    Args:
        train_data: corpus file; each line is "...<TAB><words>#...".
        vocab_path: directory with the vocab files; created if missing.
    """
    # makedirs + exist_ok: nested-path safe and race-free (os.mkdir wasn't).
    os.makedirs(vocab_path, exist_ok=True)
    vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
    vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
    vocab_file_out = os.path.join(vocab_path, "vocab_out")
    data_ut = DataUtility(vocab_file_in_words=vocab_file_in_words,
                          vocab_file_in_letters=vocab_file_in_letters,
                          vocab_file_out=vocab_file_out)
    with codecs.open(train_data, "r") as f, \
            codecs.open(os.path.join(vocab_path, "train_in_ids_lm"),
                        "w") as f1:
        # Iterate lazily instead of f.readlines() to keep memory flat.
        for line in f:
            _, words = line.strip().split('\t')
            words = words.split('#')
            words_ids = data_ut.words2ids(words)
            words_ids = ' '.join(str(id) for id in words_ids)
            f1.write(words_ids + '#' + words_ids + '\n')
def __init__(self, config, is_train=True,
             vocab_path="../lang-8_process/user_data/",
             data_path="../lang-8_process/user_data/"):
    """Build bucketed train/dev samples from the id corpora.

    Use bucketing to reduce padding.  Each LM corpus line carries
    '#'-separated fields: lemma words, words, and (optionally) lemma
    indices; the letters corpus carries the per-word letter ids.
    """
    word_vocab = os.path.join(vocab_path, "vocab_in_words")
    letter_vocab = os.path.join(vocab_path, "vocab_in_letters")
    out_vocab = os.path.join(vocab_path, "vocab_out")
    phase = "train" if is_train else "dev"
    letters_corpus_path = os.path.join(data_path, phase + "_in_ids_letters")
    lm_corpus_path = os.path.join(data_path, phase + "_in_ids_lm")
    self.PAD_ID = 0
    self.Buckets = config.buckets
    self.num_steps = config.num_steps
    self.data_utility = DataUtility(vocab_file_in_words=word_vocab,
                                    vocab_file_in_letters=letter_vocab,
                                    vocab_file_out=out_vocab)
    self.all_data = [[] for _ in self.Buckets]
    corpus_in_words_lm = self.load_corpus(lm_corpus_path)
    corpus_in_letters = self.load_corpus(letters_corpus_path)
    assert len(corpus_in_words_lm) == len(corpus_in_letters)
    for lm_line, letters_line in zip(corpus_in_words_lm, corpus_in_letters):
        fields = lm_line.strip().split("#")
        lemma_words = fields[0].split()
        words = fields[1].split()
        # The optional third field carries the lemma indices.
        lemma_index = fields[2].split() if len(fields) == 3 else [0]
        letters = [token.split()
                   for token in letters_line.strip().split("#")]
        self.gen_data(words, lemma_words, lemma_index, letters)
    self.train_bucket_sizes = [len(bucket) for bucket in self.all_data]
    print("bucket size = " + str(self.train_bucket_sizes))
    self.num_samples = float(sum(self.train_bucket_sizes))
    # Cumulative bucket fractions used for weighted bucket sampling.
    self.train_buckets_scale = [
        sum(self.train_bucket_sizes[:i + 1]) / self.num_samples
        for i in range(len(self.train_bucket_sizes))]
    print("bucket_scale = " + str(self.train_buckets_scale))
    print("samples num = " + str(self.num_samples))
    self.current_batch_index = [0 for _ in self.Buckets]
    self.tmp_bucket_sizes = [len(bucket) for bucket in self.all_data]
    self.tmp_bucket_scale = [
        sum(self.train_bucket_sizes[:i + 1]) / self.num_samples
        for i in range(len(self.train_bucket_sizes))]
def __init__(self,
             vocab_file_in_words="resource/vocab/vocab_in_words",
             vocab_file_in_letters="resource/vocab/vocab_in_letters",
             vocab_file_out="resource/vocab/vocab_out",
             corpus_file_in_words="resource/train_data/train_in_ids_words",
             corpus_file_in_letters="resource/train_data/train_in_ids_letters",
             corpus_file_out="resource/train_data/train_out_ids",
             max_sentence_length=30):
    """Load the id corpora into fixed-length padded samples.

    Use bucketing to reduce padding: every sample is padded or truncated
    to exactly max_sentence_length; samples whose letter part alone
    exceeds that length are skipped.
    """
    self.PAD_ID = 0
    self.data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words,
                                    vocab_file_in_letters=vocab_file_in_letters,
                                    vocab_file_out=vocab_file_out,
                                    max_sentence_length=max_sentence_length)
    corpus_in_words = self.load_corpus(corpus_file_in_words)
    corpus_in_letters = self.load_corpus(corpus_file_in_letters)
    corpus_out = self.load_corpus(corpus_file_out)
    self.all_data = []
    for i, words_line in enumerate(corpus_in_words):
        in_words_array = words_line.strip().split()
        in_letters_array = corpus_in_letters[i].strip().split()
        if len(in_letters_array) > max_sentence_length:
            continue  # letters alone do not fit: skip the sample
        total_length = len(in_words_array) + len(in_letters_array)
        if total_length <= max_sentence_length:
            # Everything fits: pad up to the fixed length.
            in_data = (in_words_array + in_letters_array +
                       [self.PAD_ID] * (max_sentence_length - total_length))
            words_num = len(in_words_array)
            letters_num = len(in_letters_array)
        else:
            # Keep all letters, and only the trailing words that still fit.
            if len(in_letters_array) < max_sentence_length:
                keep = max_sentence_length - len(in_letters_array)
                in_data = in_words_array[-keep:] + in_letters_array
            else:
                in_data = in_letters_array
            words_num = max_sentence_length - len(in_letters_array)
            letters_num = len(in_letters_array)
        out_data = corpus_out[i].strip()
        self.all_data.append(Data(in_data=in_data,
                                  words_num=words_num,
                                  letters_num=letters_num,
                                  out_data=out_data))
    self.num_samples = len(self.all_data)
    print("samples num = " + str(self.num_samples))
    self.current_batch_index = 0
    self.max_sentence_length = max_sentence_length
class InputEngineRnn:
    """Next-word prediction engine backed by a frozen TensorFlow RNN graph.

    The imported graph holds a word-level language model under
    "Online/WordModel" and a letter-level key-correction model under
    "Online/LetterModel".  Only the word-level path is currently active;
    the letter-model and phrase-model paths are kept as commented-out code.
    """

    def __init__(self, graph_file, vocab_path, config_name):
        """Load vocabularies/config and import the frozen graph into a session.

        Args:
            graph_file: path to the frozen GraphDef (.pb) file.
            vocab_path: directory holding the vocab files and the config file.
            config_name: name of the config file inside vocab_path.
        """
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")
        self._config = Config()
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            vocab_file_phrase=vocab_file_phrase)
        print(
            "in words vocabulary size = %d\nout words vocabulary size = %d\nin letters vocabulary size = %d"
            "\nphrase vocabulary size = %d" %
            (self._config.vocab_size_in, self._config.vocab_size_out,
             self._config.vocab_size_letter, self._config.vocab_size_phrase))
        # Tensor names gain the "import/" prefix from tf.import_graph_def;
        # ":0"/":1" select the op's output index.
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"
        self.lm_output_top_k_name = prefix + "Online/WordModel/top_k_prediction:1"
        self.lm_output_top_k_probability = prefix + "Online/WordModel/probabilities:0"
        self.lm_top_k_name = prefix + "Online/WordModel/top_k:0"
        # NOTE(review): the phrase tensor names below contain a space before
        # the output index (": 1"), unlike every other name here — confirm
        # they actually resolve in the target graph.
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction: 1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities: 0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction: 1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities: 0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase: 0"
        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"
        # Deserialize the frozen graph and import it into the default graph.
        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        """Return the top-k next-word candidates for a sentence prefix.

        Feeds the word-level LM one word id at a time, threading the LSTM
        state between steps.  Returns a list of
        {'word': ..., 'probability': ...} dicts (empty when no prediction);
        '<unk>' predictions are replaced by '<' + letters of the last word + '>'.
        """
        # Leftover globals from the commented-out letter/phrase path below.
        global probabilities, top_k_predictions, probability_topk, probability_p_topk, phrase_p_top_k
        # word_letters is the last (possibly partial) word of the sentence.
        inputs, inputs_key, word_letters = self._data_utility.sentence2ids(
            sentence)
        # print(inputs)
        # print(inputs_key)
        # Zero initial LSTM states; shape assumed to be
        # [num_layers, 2 (c,h), batch=1, hidden] — matches the np.zeros call,
        # TODO confirm against the graph definition.
        lm_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        words_out = list()
        phrase_logits = None
        if len(inputs) > 0:
            # Loop over every word of the input sentence.
            for i in range(len(inputs)):
                # The extra list nesting supplies the batch dimension,
                # even though the batch size is 1.
                feed_values = {
                    self.lm_input_name: [[inputs[i]]],
                    self.lm_top_k_name: k
                }
                if i > 0:
                    feed_values[self.lm_state_in_name] = lm_state_out
                # lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run([self.lm_state_out_name,
                #                                                                              self.phrase_p_name,
                #                                                                              self.phrase_p_probability,
                #                                                                              self.phrase_logits],
                #                                                                             feed_dict=feed_values)
                lm_state_out, lm_prob, lm_top_k = self._sess.run(
                    [
                        self.lm_state_out_name,
                        self.lm_output_top_k_probability,
                        self.lm_output_top_k_name
                    ],
                    feed_dict=feed_values)
                # phrase_p_top_k = [id for id in phrase_p_top_k[0]]  # [0] is the first batch entry; the output has a leading batch dim, but at inference the batch size is 1
                # probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]  # the corresponding normalized probabilities
                lm_top_k = [id for id in lm_top_k[0]]
                lm_probability_topk = [lm_prob[0][id] for id in lm_top_k]
                words_out = self._data_utility.ids2outwords(lm_top_k)
        # Commented-out letter-model / phrase-rescoring path:
        # for i in range(len(inputs_key)):  # loop over the letters of the last word
        #     feed_values = {self.kc_input_name: [[inputs_key[i]]],
        #                    self.kc_top_k_name: k}
        #     if i == 0 and len(inputs) > 0:
        #         feed_values[self.kc_lm_state_in_name] = lm_state_out
        #     else:
        #         feed_values[self.kc_state_in_name] = kc_state_out
        #     probabilities, top_k_predictions, kc_state_out = self._sess.run([self.kc_output_name, self.kc_top_k_prediction_name,
        #                                                                      self.kc_state_out_name], feed_dict=feed_values)
        #     probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]  # top-k of the softmax-normalized probabilities (probabilities[0])
        #     words_out = self._data_utility.ids2outwords(top_k_predictions[0])  # top-k ids converted to words
        #     if i == 0 and len(inputs) > 0:
        #         top_word = words_out[0]  # the most probable word
        #         top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)  # the most probable phrase starting with the top word, plus its probability; a 2-tuple
        #         if top_phrase[0] is not None:
        #             is_phrase_p, phrase_p = self.calculate_phrase_p(top_phrase, probability_p_topk, phrase_p_top_k)
        #             words_out, probability_topk = self.final_words_out(words_out, top_phrase, phrase_p, probability_topk)  # replace the word whose probability the phrase beats with the phrase and its probability
        # lm_probability_topk is only bound when inputs was non-empty,
        # which the len(words_out) > 0 guard implies.
        return [{
            'word': word,
            'probability': float(probability)
        } if word != '<unk>' else {
            'word': '<' + word_letters + '>',
            'probability': float(probability)
        } for word, probability in zip(words_out, lm_probability_topk)
                ] if len(words_out) > 0 else []

    def predict_data(self, sentence, k):
        """Run the word-level LM over a whole data line, collecting the
        top-k candidate words (and probabilities) once per letter position
        of every word.

        Returns (words_line, letters_line, out_str_list, probability_topk_list).
        """
        global probabilities, top_k_predictions, probability_topk, probability_p_topk, phrase_p_top_k
        sentence = sentence.rstrip()
        # Split an input line into: words part, letters part, word ids,
        # letter ids, number of words, and per-word letter counts.
        words_line, letters_line, words_ids, letters_ids, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)
        print('!!!!!', words_ids)
        print('!!!!!', letters_ids)
        out_str_list = []
        probability_topk_list = []
        # print(words_ids)
        # print(letters_ids)
        lm_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        # Loop over each word.
        for i in range(len(words_ids)):
            words_out = []
            probs_out = []
            feed_values = {
                self.lm_input_name: [[words_ids[i]]],
                self.lm_top_k_name: k
            }
            if i > 0:
                feed_values[self.lm_state_in_name] = lm_state_out
            # lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
            #     [self.lm_state_out_name, self.phrase_p_name, self.phrase_p_probability,
            #      self.phrase_logits], feed_dict=feed_values)
            # phrase_p_top_k = [id for id in phrase_p_top_k[0]]
            # probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]
            lm_state_out, lm_prob, lm_top_k = self._sess.run(
                [
                    self.lm_state_out_name,
                    self.lm_output_top_k_probability,
                    self.lm_output_top_k_name
                ],
                feed_dict=feed_values)
            lm_top_k = [id for id in lm_top_k[0]]
            lm_probability_topk = [lm_prob[0][id] for id in lm_top_k]
            words = self._data_utility.ids2outwords(lm_top_k)
            if i == len(letters_ids):
                break
            # Loop over each letter inside this word; the same word-level
            # prediction is recorded once per letter position.
            for j in range(len(letters_ids[i])):
                # feed_values = {self.kc_input_name: [[letters_ids[i][j]]],
                #                self.kc_top_k_name: k, self.key_length: [1]}
                #
                # if j == 0 and len(words_ids) > 0:  # the first letter starts from the language-model state; later letters continue from the previous letter's state
                #     feed_values[self.kc_lm_state_in_name] = lm_state_out
                # else:
                #     feed_values[self.kc_state_in_name] = kc_state_out
                # probabilities, top_k_predictions, kc_state_out = self._sess.run([self.kc_output_name, self.kc_top_k_prediction_name,
                #                                                                  self.kc_state_out_name], feed_dict=feed_values)
                # probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
                # words = self._data_utility.ids2outwords(top_k_predictions[0])
                #
                # if j == 0 and i > 0:
                #     top_word = words[0]
                #     top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)
                #     if top_phrase[0] is not None:
                #         is_phrase_p, phrase_p = self.calculate_phrase_p(top_phrase, probability_p_topk, phrase_p_top_k)
                #         words, probability_topk = self.final_words_out(words, top_phrase, phrase_p, probability_topk)
                words_out.append(words)
                probs_out.append(lm_probability_topk)
            out_str = words_out if i > 0 else [['', '', '']] + words_out[1:]
            out_str_list.append(out_str)
            probability_topk_list.append(probs_out)
        return words_line, letters_line, out_str_list, probability_topk_list

    def calculate_phrase_p(self, top_phrase, probability_p_topk,
                           phrase_p_top_k):
        """Combine the phrase score with the is-a-phrase probability."""
        is_phrase_p = probability_p_topk[phrase_p_top_k.index(1)]
        # Scales the phrase weight down: the phrase weight is multiplied by
        # the probability that the candidate really is a phrase.
        phrase_p = is_phrase_p * top_phrase[1]
        return is_phrase_p, phrase_p

    def final_words_out(self, words, top_phrase, phrase_p, probability_topk):
        """Insert the phrase into the top-k lists at the first position
        whose probability it beats (mutates both lists in place)."""
        for i in range(len(probability_topk)):
            if phrase_p >= probability_topk[i]:
                probability_topk[i] = phrase_p
                words[i] = top_phrase[0]
                break
        return words, probability_topk

    def result_print(self, out_string, out_prob):
        """Format parallel word/probability lists as 'word:prob|word:prob|...'."""
        string = ""
        for (word, prob) in zip(out_string, out_prob):
            prob = str(prob) if word != "" else "0.0"
            string = string + word + ":" + prob + "|"
        string = string[:-1]
        return string

    def predict_file(self, test_file_in, test_file_out, k):
        """Run predict_data on every line of test_file_in, writing the
        formatted '|#|'-separated predictions to test_file_out and echoing
        them (plus timing) to stdout."""
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        jj = 0
        for sentence in testfilein:
            print(jj)
            jj += 1
            sentence = sentence.rstrip()
            result = self.predict_data(sentence, k)
            if result is not None:
                words_line, letters_line, out_words_list, out_prob_list = result
                for i in range(len(out_words_list)):
                    print("\t".join(words_line[:i]) + "|#|" +
                          letters_line[i] + "|#|" +
                          "\t".join(words_line[i:]) + "|#|" + '\t'.join([
                              self.result_print(out_words, out_prob)
                              for (out_words, out_prob) in zip(
                                  out_words_list[i], out_prob_list[i])
                          ]) + "\n")
                    testfileout.write(
                        "\t".join(words_line[:i]) + "|#|" + letters_line[i] +
                        "|#|" + "\t".join(words_line[i:]) + "|#|" +
                        '\t'.join([
                            self.result_print(out_words, out_prob)
                            for (out_words, out_prob
                                 ) in zip(out_words_list[i], out_prob_list[i])
                        ]) + "\n")
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()
class InputEngineSparse(object):
    """Next-word prediction engine backed by a frozen, sparse fine-tuned TF graph.

    Loads vocabularies and a serialized ``sparse_graph-finetune-<config>.pb``
    graph from ``model_path`` and serves top-k word predictions through a
    persistent ``tf.Session``.
    """

    def __init__(self, model_path, config_name):
        """Load vocabularies, config and the frozen graph; open a session.

        Args:
            model_path: directory containing vocab files and the .pb graph.
            config_name: config file name inside ``model_path``; also part of
                the .pb file name.
        """
        vocab_file_in_words = os.path.join(model_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(model_path, "vocab_in_letters")
        vocab_file_out = os.path.join(model_path, "vocab_out")
        config_file = os.path.join(model_path, config_name)
        config = Config()
        config.get_config(config_file)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            max_sentence_length=config.num_steps)
        self.sparsity = config.sparsity
        # Tensor names inside the imported graph (note the "import/" prefix
        # added by tf.import_graph_def).
        prefix = "import/"
        self.top_k_name = prefix + "Online/Model/top_k:0"
        self.state_in_name = prefix + "Online/Model/state:0"
        self.input_name = prefix + "Online/Model/batched_input_word_ids:0"
        self.top_k_prediction_name = prefix + "Online/Model/top_k_prediction:1"
        self.output_name = prefix + "Online/Model/probabilities:0"
        self.state_out_name = prefix + "Online/Model/state_out:0"
        saved_model_path = os.path.join(
            model_path, 'sparse_graph-finetune-' + config_name + '.pb')
        with open(saved_model_path, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        """Feed a sentence (str) word by word and return the top-k predictions.

        Returns a list of ``{'word': ..., 'probability': ...}`` dicts; ``<unk>``
        is replaced by the raw letters of the last (partial) word.
        """
        # NOTE(review): kept from the original — these module-level globals
        # appear to be a fallback for empty input; confirm before removing.
        global probabilities, top_k_predictions
        sentence_ids, word_letters = self._data_utility.sentence2ids(sentence)
        state_out = None
        for i in range(len(sentence_ids)):
            feed_values = {
                self.input_name: [[sentence_ids[i]]],
                self.top_k_name: k
            }
            if i > 0:
                # Thread the recurrent state from the previous step.
                feed_values[self.state_in_name] = state_out
            # probabilities has shape (batch_size * time_step) x vocab_size;
            # for inference batch_size = num_steps = 1, i.e. 1 x vocab_size.
            probabilities, top_k_predictions, state_out = self._sess.run(
                [
                    self.output_name, self.top_k_prediction_name,
                    self.state_out_name
                ],
                feed_dict=feed_values)
        probability_topk = [
            probabilities[0][idx] for idx in top_k_predictions[0]
        ]
        words_out = self._data_utility.ids2outwords(top_k_predictions[0])
        return [{
            'word': word,
            'probability': float(probability)
        } if word != '<unk>' else {
            'word': '<' + word_letters + '>',
            'probability': float(probability)
        } for word, probability in zip(words_out, probability_topk)
                ] if len(words_out) > 0 else []

    def predict_data(self, sentence, k=3):
        """Run top-3 prediction over every position of an annotated line.

        Returns a stringified list of the top-3 word lists for the positions
        to be completed, or None when the line cannot be converted to ids.
        ``k`` defaults to 3 to keep the historical behavior.
        """
        sentence = sentence.rstrip()
        inputs, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)
        if inputs is None:  # fixed: was `== None`
            return None
        words_out = []
        state_out = None
        for i in range(len(inputs)):
            feed_values = {self.input_name: [[inputs[i]]], self.top_k_name: k}
            if i > 0:
                feed_values[self.state_in_name] = state_out
            probabilities, top_k_predictions, state_out = self._sess.run(
                [
                    self.output_name, self.top_k_prediction_name,
                    self.state_out_name
                ],
                feed_dict=feed_values)
            words = self._data_utility.ids2outwords(top_k_predictions[0])
            words_out.append(words)
        # With no context words, prepend an empty top-3 slot for position 0.
        out_str = str(
            words_out[words_num - 1:words_num + letters_num] if words_num > 0
            else [['', '', '']] + words_out[0:letters_num])
        return out_str

    def predict_file(self, test_file_in, test_file_out):
        """Predict every line of ``test_file_in``; write results and timing."""
        t1 = time.time()
        # `with` guarantees both handles close even if predict_data raises.
        with open(test_file_in, "r") as testfilein, \
                open(test_file_out, 'w') as testfileout:
            for sentence in testfilein:
                sentence = sentence.rstrip()
                out_str = self.predict_data(sentence)
                if out_str:
                    print(sentence + " |#| " + out_str)
                    testfileout.write(sentence + " |#| " + out_str + "\n")
                else:
                    print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)

    def _format_topk_line(self, words, probabilities):
        """Format one top-k slot as ``w1 # p1 | w2 # p2 | ...``.

        Preserves the historical ``[3:-1]`` trim, which drops the leading
        ``" | "`` AND the final character of the last probability — kept
        byte-for-byte for downstream-format compatibility.
        """
        out_str_line = ''
        for word, probability in zip(words, probabilities):
            out_str_line = out_str_line + " | " + word + ' # ' + \
                '{:.8f}'.format(probability)
        return out_str_line[3:-1]

    def predict_data_probability(self, sentence, k=3):
        """Like predict_data but also formats each word's probability.

        Returns a " || "-separated string of per-position top-k entries, or
        None when the line cannot be converted to ids.
        """
        sentence = sentence.rstrip()
        inputs, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)
        if inputs is None:  # fixed: was `== None`
            return None
        words_out = []
        probability_out = []
        state_out = None
        for i in range(len(inputs)):
            feed_values = {self.input_name: [[inputs[i]]], self.top_k_name: k}
            if i > 0:
                feed_values[self.state_in_name] = state_out
            probabilities, top_k_predictions, state_out = self._sess.run(
                [
                    self.output_name, self.top_k_prediction_name,
                    self.state_out_name
                ],
                feed_dict=feed_values)
            top3 = top_k_predictions[0]
            probability_top3 = [probabilities[0][idx] for idx in top3]
            words = self._data_utility.ids2outwords(top3)
            words_out.append(words)
            probability_out.append(probability_top3)
        out_str = ''
        if words_num > 0:
            words_out_use = words_out[words_num - 1:words_num + letters_num]
            probability_out_use = probability_out[words_num - 1:words_num +
                                                  letters_num]
            for words, probs in zip(words_out_use, probability_out_use):
                out_str = out_str + " || " + self._format_topk_line(
                    words, probs)
            # Historical trim: drops the leading " || " and the last char.
            out_str = out_str[4:-1]
        else:
            words_out_use = words_out[0:letters_num]
            probability_out_use = probability_out[0:letters_num]
            for words, probs in zip(words_out_use, probability_out_use):
                out_str = out_str + " || " + self._format_topk_line(
                    words, probs)
            # NOTE(review): unlike the branch above, the original did NOT trim
            # here; preserved as-is for output compatibility.
        return out_str

    def predict_file_probability(self, test_file_in, test_file_out):
        """Predict every line with probabilities; write results and timing."""
        t1 = time.time()
        with open(test_file_in, "r") as testfilein, \
                open(test_file_out, 'w') as testfileout:
            for sentence in testfilein:
                sentence = sentence.rstrip()
                out_str = self.predict_data_probability(sentence)
                if out_str:
                    print(sentence + " |#| " + out_str)
                    testfileout.write(sentence + " |#| " + out_str + "\n")
                else:
                    print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
nb_train_samples = 49700 nb_validation_samples = 2000 epochs = 10 batch_size = 32 # Note: Must be less than or equal to the nb_validation_samples size. img_width, img_height = 26, 99 if K.image_data_format() == 'channels_first': input_shape = (1, img_width, img_height) else: input_shape = (img_width, img_height, 1) m = models.Models() #model = m.get_cifar_model(input_shape, 10) #model = m.get_cifar_model_2(input_shape, 10) model = m.get_covn2d_six_layer_model(input_shape, len(training_categories) + 1) du = DataUtility(bucket_id='kaggle_voice_data', root_folder='/') X, Y = du.load_data_local('../../data/npz', training_categories, other_categories) #X, Y = du.du.load_local_binary_data('../../data/npz', target) x_train, y_train, x_test, y_test = train_test_split(X, Y, test_size=0.33, random_state=42) # x_train -> Training data to feed the net # x_test -> Training data for evaluation # y_train -> VALIDATION data for net input # y_test -> Expected Validation output #
class InputEngineRnn:
    """Two-stage RNN input engine: a word-level LM provides context state,
    a letter-level model completes the current word, optionally re-ranked
    by a phrase model (``use_phrase``).
    """

    def __init__(self, graph_file, vocab_path, config_name, use_phrase=False):
        """Load vocabularies/config and import the frozen graph.

        Args:
            graph_file: path to the serialized .pb GraphDef.
            vocab_path: directory holding vocab_in_words / vocab_in_letters /
                vocab_out / vocab_phrase and the config file.
            config_name: config file name inside ``vocab_path``.
            use_phrase: when True, also run the phrase head and re-rank.
        """
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")
        self.use_phrase = use_phrase
        self._config = Config()
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words,
                                         vocab_file_in_letters=vocab_file_in_letters,
                                         vocab_file_out=vocab_file_out,
                                         vocab_file_phrase=vocab_file_phrase)
        print("in words vocabulary size = %d\nout words vocabulary size = %d\nin letters vocabulary size = %d"
              "\nphrase vocabulary size = %d" % (
                  self._config.vocab_size_in, self._config.vocab_size_out, self._config.vocab_size_letter,
                  self._config.vocab_size_phrase))
        # Tensor names inside the imported graph ("import/" prefix comes from
        # tf.import_graph_def).
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"
        # NOTE(review): the phrase tensor names contain a space before the
        # output index (": 1"); int(" 1") parses, but confirm these names
        # match the exported graph exactly.
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction: 1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities: 0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction: 1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities: 0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase: 0"
        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"
        # Cap on how many lines predict_file will process.
        self.max_test_line = 10000
        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        """Predict the current word's top-k completions for one sentence.

        Phase I feeds the context words through the word LM; Phase II feeds
        the current word's letters through the letter model, seeding it with
        the LM state. Returns ``{'word', 'probability'}`` dicts; ``<unk>`` is
        replaced by the raw letters typed so far.
        """
        # NOTE(review): module-level globals kept from the original —
        # presumably a fallback when the input is empty; confirm before removal.
        global probabilities, top_k_predictions, probability_topk, probability_p_topk, phrase_p_top_k
        inputs, inputs_key, word_letters = self._data_utility.sentence2ids(sentence)
        # Zero LSTM states: (num_layers, c/h, batch=1, hidden_size).
        lm_state_out = np.zeros([self._config.num_layers, 2, 1, self._config.word_hidden_size], dtype=np.float32)
        kc_state_out = np.zeros([self._config.num_layers, 2, 1, self._config.letter_hidden_size], dtype=np.float32)
        words_out = list()
        phrase_logits = None
        # Phase I: read contexts.
        if len(inputs) > 0:
            for i in range(len(inputs)):
                feed_values = {self.lm_input_name: [[inputs[i]]]}
                if i > 0:
                    # Use previous language model's final state as its initial state.
                    feed_values[self.lm_state_in_name] = lm_state_out
                if self.use_phrase:
                    lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
                        [self.lm_state_out_name, self.phrase_p_name, self.phrase_p_probability,
                         self.phrase_logits], feed_dict=feed_values)
                    phrase_p_top_k = [id for id in phrase_p_top_k[0]]
                    probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]
                else:
                    lm_state_out = self._sess.run([self.lm_state_out_name], feed_dict=feed_values)[0]
        # Phase II: read letters, predict by feeding the letters one-by-one.
        for i in range(len(inputs_key)):
            feed_values = {self.kc_input_name: [[inputs_key[i]]], self.kc_top_k_name: k}
            if i == 0 and len(inputs) > 0:
                # Seed the letter model with the LM's final state for the first letter.
                feed_values[self.kc_lm_state_in_name] = lm_state_out
            else:
                # Subsequent letters continue from the letter model's own state.
                feed_values[self.kc_state_in_name] = kc_state_out
            probabilities, top_k_predictions, kc_state_out = self._sess.run(
                [self.kc_output_name, self.kc_top_k_prediction_name,
                 self.kc_state_out_name], feed_dict=feed_values)
            probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
            words_out = self._data_utility.ids2outwords(top_k_predictions[0])
            # Predict phrase: only at the first letter, when context exists.
            if self.use_phrase:
                if i == 0 and len(inputs) > 0:
                    top_word = words_out[0]
                    top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)
                    if top_phrase[0] is not None:
                        is_phrase_p, phrase_p = self.calculate_phrase_p(top_phrase, probability_p_topk, phrase_p_top_k)
                        words_out, probability_topk = self.final_words_out(words_out, top_phrase, phrase_p, probability_topk)
        return [{'word': word, 'probability': float(probability)} if word != '<unk>'
                else {'word': '<' + word_letters + '>', 'probability': float(probability)}
                for word, probability in zip(words_out, probability_topk)] if len(words_out) > 0 else []

    def predict_data(self, sentence, k):
        """Predict top-k completions for every position of an annotated line.

        Returns (words_line, letters_line, out_str_list, probability_topk_list)
        or None when the line cannot be converted to ids.
        """
        global probabilities, top_k_predictions, probability_topk, probability_p_topk, phrase_p_top_k
        sentence = sentence.rstrip()
        res = self._data_utility.data2ids_line(sentence)
        if res is None:
            return None
        words_line, letters_line, words_ids, letters_ids, words_num, letters_num = res
        out_str_list = []
        probability_topk_list = []
        phrase_logits = None
        lm_state_out = np.zeros([self._config.num_layers, 2, 1, self._config.word_hidden_size], dtype=np.float32)
        kc_state_out = np.zeros([self._config.num_layers, 2, 1, self._config.letter_hidden_size], dtype=np.float32)
        for i in range(len(words_ids)):
            words_out = []
            probs_out = []
            # Phase I: read contexts.
            feed_values = {self.lm_input_name: [[words_ids[i]]]}
            if i > 0:
                # Use previous language model's final state as its initial state.
                feed_values[self.lm_state_in_name] = lm_state_out
            if self.use_phrase:
                lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
                    [self.lm_state_out_name, self.phrase_p_name, self.phrase_p_probability,
                     self.phrase_logits], feed_dict=feed_values)
                phrase_p_top_k = [id for id in phrase_p_top_k[0]]
                probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]
            else:
                lm_state_out = self._sess.run([self.lm_state_out_name], feed_dict=feed_values)[0]
            if i == len(letters_ids):
                break
            # Phase II: read letters, predict by feeding the letters one-by-one.
            for j in range(len(letters_ids[i])):
                feed_values = {self.kc_input_name: [[letters_ids[i][j]]],
                               self.kc_top_k_name: k, self.key_length: [1]}
                if j == 0 and len(words_ids) > 0:
                    # First letter: seed the letter model from the LM state.
                    feed_values[self.kc_lm_state_in_name] = lm_state_out
                else:
                    # Later letters: continue from the letter model's own state.
                    feed_values[self.kc_state_in_name] = kc_state_out
                probabilities, top_k_predictions, kc_state_out = self._sess.run(
                    [self.kc_output_name, self.kc_top_k_prediction_name,
                     self.kc_state_out_name], feed_dict=feed_values)
                probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
                words = self._data_utility.ids2outwords(top_k_predictions[0])
                # Predict phrase
                if self.use_phrase:
                    if j == 0 and i > 0:
                        top_word = words[0]
                        top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)
                        if top_phrase[0] is not None:
                            is_phrase_p, phrase_p = self.calculate_phrase_p(top_phrase, probability_p_topk, phrase_p_top_k)
                            words, probability_topk = self.final_words_out(words, top_phrase, phrase_p, probability_topk)
                words_out.append(words)
                probs_out.append(probability_topk)
            # With no context words, prepend an empty top-k slot at position 0.
            out_str = words_out if i > 0 else [['', '', '']] + words_out[1:]
            out_str_list.append(out_str)
            probability_topk_list.append(probs_out)
        return words_line, letters_line, out_str_list, probability_topk_list

    def calculate_phrase_p(self, top_phrase, probability_p_topk,
                           phrase_p_top_k):
        """Weight the phrase score by the probability that a phrase applies.

        Raises ValueError if id 1 ("is a phrase") is absent from
        ``phrase_p_top_k``.
        """
        is_phrase_p = probability_p_topk[phrase_p_top_k.index(1)]
        phrase_p = is_phrase_p * top_phrase[1]
        return is_phrase_p, phrase_p

    def final_words_out(self, words, top_phrase, phrase_p, probability_topk):
        """Splice the phrase into the top-k list at the first slot it beats."""
        for i in range(len(probability_topk)):
            if phrase_p >= probability_topk[i]:
                probability_topk[i] = phrase_p
                words[i] = top_phrase[0]
                break
        return words, probability_topk

    def result_print(self, out_string, out_prob):
        """Format parallel word/probability lists as "w:p|w:p|..."."""
        string = ""
        for (word, prob) in zip(out_string, out_prob):
            # Empty words get a fixed 0.0 probability in the output.
            prob = str(prob) if word != "" else "0.0"
            string = string + word + ":" + prob + "|"
        string = string[:-1]
        return string

    def predict_file(self, test_file_in, test_file_out, k):
        """Predict up to ``max_test_line`` lines of a file; print timing."""
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        line_count = 0
        for sentence in testfilein:
            line_count += 1
            if line_count > self.max_test_line:
                break
            sentence = sentence.rstrip()
            result = self.predict_data(sentence, k)
            if result is not None:
                words_line, letters_line, out_words_list, out_prob_list = result
                for i in range(len(out_words_list)):
                    print("\t".join(words_line[:i]) + "|#|" + " ".join(letters_line[i])
                          + "|#|" + "\t".join(words_line[i:]) + "|#|"
                          + '\t'.join([self.result_print(out_words, out_prob)
                                       for (out_words, out_prob) in zip(out_words_list[i], out_prob_list[i])])
                          + "\n")
                    testfileout.write("\t".join(words_line[:i]) + "|#|" + " ".join(letters_line[i])
                                      + "|#|" + "\t".join(words_line[i:]) + "|#|"
                                      + '\t'.join([self.result_print(out_words, out_prob)
                                                   for (out_words, out_prob) in zip(out_words_list[i], out_prob_list[i])])
                                      + "\n")
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()
class InputEngineRnn:
    """Two-stage RNN input engine: a word-level LM encodes the context, a
    letter-level model completes the current word, and a phrase head may
    replace a candidate with a multi-word phrase.
    """

    def __init__(self, graph_file, vocab_path, config_name):
        """Load vocabularies/config and import the frozen graph.

        Args:
            graph_file: path to the serialized .pb GraphDef.
            vocab_path: directory with vocab_in_words / vocab_in_letters /
                vocab_out / vocab_phrase and the config file.
            config_name: config file name inside ``vocab_path``.
        """
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")
        self._config = Config()
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            vocab_file_phrase=vocab_file_phrase)
        print(
            "in words vocabulary size = %d\nout words vocabulary size = %d\nin letters vocabulary size = %d"
            "\nphrase vocabulary size = %d" %
            (self._config.vocab_size_in, self._config.vocab_size_out,
             self._config.vocab_size_letter, self._config.vocab_size_phrase))
        # Tensor names inside the imported graph ("import/" prefix is added
        # by tf.import_graph_def).
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"
        # NOTE(review): these phrase tensor names contain a space before the
        # output index (": 1"); int(" 1") parses, but confirm they match the
        # exported graph exactly.
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction: 1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities: 0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction: 1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities: 0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase: 0"
        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"
        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        """Predict the current word's top-k completions for one sentence.

        Feeds context words through the word LM (also collecting the phrase
        head's outputs), then feeds the current word's letters through the
        letter model seeded with the LM state; the phrase candidate may
        replace a top-k entry at the first letter.
        """
        # NOTE(review): module-level globals kept from the original —
        # presumably a fallback when the input is empty; confirm before removal.
        global probabilities, top_k_predictions, probability_topk, probability_p_topk, phrase_p_top_k
        inputs, inputs_key, word_letters = self._data_utility.sentence2ids(
            sentence)
        # Zero LSTM states: (num_layers, c/h, batch=1, hidden_size).
        lm_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        words_out = list()
        phrase_logits = None
        # Phase I: feed the context words through the word LM.
        if len(inputs) > 0:
            for i in range(len(inputs)):
                feed_values = {self.lm_input_name: [[inputs[i]]]}
                if i > 0:
                    # Thread the LM state from the previous word.
                    feed_values[self.lm_state_in_name] = lm_state_out
                lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
                    [
                        self.lm_state_out_name, self.phrase_p_name,
                        self.phrase_p_probability, self.phrase_logits
                    ],
                    feed_dict=feed_values)
                phrase_p_top_k = [pid for pid in phrase_p_top_k[0]]
                probability_p_topk = [
                    phrase_p_prob[0][pid] for pid in phrase_p_top_k
                ]
        # Phase II: feed the letters of the current word one by one.
        for i in range(len(inputs_key)):
            feed_values = {
                self.kc_input_name: [[inputs_key[i]]],
                self.kc_top_k_name: k
            }
            if i == 0 and len(inputs) > 0:
                # First letter: seed the letter model from the LM state.
                feed_values[self.kc_lm_state_in_name] = lm_state_out
            else:
                # Later letters: continue from the letter model's own state.
                feed_values[self.kc_state_in_name] = kc_state_out
            probabilities, top_k_predictions, kc_state_out = self._sess.run(
                [
                    self.kc_output_name, self.kc_top_k_prediction_name,
                    self.kc_state_out_name
                ],
                feed_dict=feed_values)
            probability_topk = [
                probabilities[0][idx] for idx in top_k_predictions[0]
            ]
            words_out = self._data_utility.ids2outwords(top_k_predictions[0])
            # Phrase re-ranking only applies at the first letter with context.
            if i == 0 and len(inputs) > 0:
                top_word = words_out[0]
                top_phrase = self._data_utility.get_top_phrase(
                    phrase_logits, top_word)
                if top_phrase[0] is not None:
                    is_phrase_p, phrase_p = self.calculate_phrase_p(
                        top_phrase, probability_p_topk, phrase_p_top_k)
                    words_out, probability_topk = self.final_words_out(
                        words_out, top_phrase, phrase_p, probability_topk)
        return [{
            'word': word,
            'probability': float(probability)
        } if word != '<unk>' else {
            'word': '<' + word_letters + '>',
            'probability': float(probability)
        } for word, probability in zip(words_out, probability_topk)
                ] if len(words_out) > 0 else []

    def calculate_phrase_p(self, top_phrase, probability_p_topk,
                           phrase_p_top_k):
        """Weight the phrase score by the probability that a phrase applies.

        Raises ValueError if id 1 ("is a phrase") is absent from
        ``phrase_p_top_k``.
        """
        is_phrase_p = probability_p_topk[phrase_p_top_k.index(1)]
        phrase_p = is_phrase_p * top_phrase[1]
        return is_phrase_p, phrase_p

    def final_words_out(self, words, top_phrase, phrase_p, probability_topk):
        """Splice the phrase into the top-k list at the first slot it beats.

        Mutates and returns ``words`` and ``probability_topk``.
        """
        for i in range(len(probability_topk)):
            if phrase_p >= probability_topk[i]:
                probability_topk[i] = phrase_p
                words[i] = top_phrase[0]
                break
        return words, probability_topk

    def result_print(self, out_string, out_prob):
        """Format parallel word/probability lists as "w:p|w:p|..."."""
        string = ""
        for (word, prob) in zip(out_string, out_prob):
            # Empty words get a fixed 0.0 probability in the output.
            prob = str(prob) if word != "" else "0.0"
            string = string + word + ":" + prob + "|"
        string = string[:-1]
        return string

    def predict_data(self, sentence, k):
        """Predict the current word of one annotated line; return str(list).

        ``data2ids_line`` yields the context-word ids, the key-code ids of the
        word being typed, the context word count and the letter count.
        """
        sentence = sentence.rstrip()
        inputs, inputs_key, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)
        words_out = []
        lm_state = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        # Phase I: encode the context words into the LM state.
        if len(inputs) > 0:
            for i in range(len(inputs)):
                feed_values = {self.lm_input_name: [[inputs[i]]]}
                if i > 0:
                    feed_values[self.lm_state_in_name] = lm_state
                result = self._sess.run([self.lm_state_out_name],
                                        feed_dict=feed_values)
                lm_state = result[0]
        # Phase II: feed the key codes one by one, collecting top-k words.
        for i in range(len(inputs_key)):
            feed_values = {
                self.kc_input_name: [[inputs_key[i]]],
                self.kc_top_k_name: k
            }
            if i > 0 or len(inputs) == 0:
                feed_values[self.kc_state_in_name] = kc_state
            else:
                # First key with context: seed from the LM state.
                feed_values[self.kc_lm_state_in_name] = lm_state
            probabilities, top_k_predictions, kc_state = self._sess.run(
                [
                    self.kc_output_name, self.kc_top_k_prediction_name,
                    self.kc_state_out_name
                ],
                feed_dict=feed_values)
            probability_topk = [
                probabilities[0][idx] for idx in top_k_predictions[0]
            ]
            words = self._data_utility.ids2outwords(top_k_predictions[0])
            words_out.append(words)
        # With no context words, replace position 0 with an empty top-3 slot.
        out_str = str(words_out if words_num > 0 else [['', '', '']] +
                      words_out[1:])
        return out_str

    def predict_file(self, test_file_in, test_file_out, k):
        """Predict every (lower-cased) line of a file; print timing."""
        t1 = time.time()
        topk = k
        # `with` guarantees both handles close even if predict_data raises.
        with open(test_file_in, "r") as testfilein, \
                open(test_file_out, 'w') as testfileout:
            for sentence in testfilein:
                sentence = sentence.rstrip()
                sentence_in = sentence.lower()
                out_str = self.predict_data(sentence_in, topk)
                if out_str:
                    print(sentence + " | " + out_str)
                    testfileout.write(sentence + " | " + out_str + "\n")
                else:
                    print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
def main(_):
    """Train / fine-tune / evaluate the PTB-style RNN model, driven by FLAGS.mode.

    Modes:
      * "pretrain"    — (optionally resume and) train the dense model, saving
                        checkpoints under <save_path>/pretrain, exporting the
                        graph each epoch, then evaluating on the dev set.
      * "learn_basis" — restore the pretrained model, learn the sparse
                        embedding and softmax bases, then exit.
      * "finetune"    — restore the pretrained model, fine-tune with sparse
                        weights, saving under <save_path>/finetune-<config>
                        and exporting each epoch, then evaluate on the dev set.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")
    # Per-run log file, named after mode and config.
    logfile = open(FLAGS.mode + '-' + FLAGS.model_config + '.log', 'w')
    # logfile = sys.stdout
    if not os.path.isdir(FLAGS.save_path):
        os.mkdir(FLAGS.save_path)
    if not os.path.isdir(FLAGS.graph_save_path):
        os.mkdir(FLAGS.graph_save_path)
    config = Config()
    config.get_config(FLAGS.model_config)
    # Same hyper-parameters, but batch_size = num_steps = 1 for the
    # single-step "Online" model intended for on-device inference.
    test_config = Config()
    test_config.get_config(FLAGS.model_config)
    test_config.batch_size = 1
    test_config.num_steps = 1
    # Vocabulary files and pre-converted id corpora.  Note the "Test" feeder
    # below reuses the dev files — there is no separate test corpus here.
    vocab_file_in_words = os.path.join(FLAGS.vocab_path, "vocab_in_words")
    vocab_file_in_letters = os.path.join(FLAGS.vocab_path, "vocab_in_letters")
    vocab_file_out = os.path.join(FLAGS.vocab_path, "vocab_out")
    train_file_in_words = os.path.join(FLAGS.data_path, "train_in_ids_words")
    train_file_in_letters = os.path.join(FLAGS.data_path, "train_in_ids_letters")
    train_file_out = os.path.join(FLAGS.data_path, "train_out_ids")
    dev_file_in_words = os.path.join(FLAGS.data_path, "dev_in_ids_words")
    dev_file_in_letters = os.path.join(FLAGS.data_path, "dev_in_ids_letters")
    dev_file_out = os.path.join(FLAGS.data_path, "dev_out_ids")
    data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words,
                               vocab_file_in_letters=vocab_file_in_letters,
                               vocab_file_out=vocab_file_out,
                               max_sentence_length=config.num_steps)
    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = config.gpu_fraction
        with tf.Session(config=gpu_config) as session:
            # Four views of the same weights (shared variable_scope "Model"):
            # training, validation, test, and a batch-1 online model.
            with tf.name_scope("Train"):
                train_feeder = DataFeederContext(
                    vocab_file_in_words=vocab_file_in_words,
                    vocab_file_in_letters=vocab_file_in_letters,
                    vocab_file_out=vocab_file_out,
                    corpus_file_in_words=train_file_in_words,
                    corpus_file_in_letters=train_file_in_letters,
                    corpus_file_out=train_file_out,
                    max_sentence_length=config.num_steps)
                with tf.variable_scope("Model", reuse=None,
                                       initializer=initializer):
                    mtrain = PTBModel(is_training=True, config=config)
                tf.summary.scalar("Training Loss", mtrain.cost)
                tf.summary.scalar("Learning Rate", mtrain.lr)
            with tf.name_scope("Valid"):
                valid_feeder = DataFeederContext(
                    vocab_file_in_words=vocab_file_in_words,
                    vocab_file_in_letters=vocab_file_in_letters,
                    vocab_file_out=vocab_file_out,
                    corpus_file_in_words=dev_file_in_words,
                    corpus_file_in_letters=dev_file_in_letters,
                    corpus_file_out=dev_file_out,
                    max_sentence_length=config.num_steps)
                with tf.variable_scope("Model", reuse=True,
                                       initializer=initializer):
                    mvalid = PTBModel(is_training=False, config=config)
                tf.summary.scalar("Validation Loss", mvalid.cost)
            # Evaluate on test data
            with tf.name_scope("Test"):
                test_feeder = DataFeederContext(
                    vocab_file_in_words=vocab_file_in_words,
                    vocab_file_in_letters=vocab_file_in_letters,
                    vocab_file_out=vocab_file_out,
                    corpus_file_in_words=dev_file_in_words,
                    corpus_file_in_letters=dev_file_in_letters,
                    corpus_file_out=dev_file_out,
                    max_sentence_length=config.num_steps)
                with tf.variable_scope("Model", reuse=True,
                                       initializer=initializer):
                    mtest = PTBModel(is_training=False, config=config)
            # Model to be saved and exported
            # Note: it's beneficial to distinguish between test model and save model,
            # because when evaluating on test set, a large batch size is more GPU-friendly and faster.
            # But when running on cellphone, it can accept a batch size of 1 only, this is why monline exists.
            with tf.name_scope("Online"):
                with tf.variable_scope("Model", reuse=True,
                                       initializer=initializer):
                    monline = PTBModel(is_training=False, config=test_config)
            # Do not restore sparse weights from pretrain phase
            restore_variables = dict()
            for v in tf.trainable_variables():
                if v.name.startswith("Model/Softmax/softmax_sp_trainable_weights") \
                        or v.name.startswith("Model/Embedding/embedding_sp_trainable_weights"):
                    continue
                print("store:", v.name)
                restore_variables[v.name] = v
            sv = tf.train.Saver(restore_variables)
            if not FLAGS.model_name.endswith(".ckpt"):
                FLAGS.model_name += ".ckpt"
            session.run(tf.global_variables_initializer())
            if FLAGS.mode == "pretrain":
                # restore previously trained model
                check_point_dir = os.path.join(FLAGS.save_path, "pretrain")
                ckpt = tf.train.get_checkpoint_state(check_point_dir)
                if ckpt and tf.train.checkpoint_exists(
                        ckpt.model_checkpoint_path):
                    print("Reading model parameters from %s" %
                          ckpt.model_checkpoint_path)
                    sv.restore(session, ckpt.model_checkpoint_path)
                else:
                    print("Created model with fresh parameters.")
                # laptop_discount shortens the run (fewer epochs) for local/dev machines.
                for i in range(config.max_max_epoch // FLAGS.laptop_discount):
                    # Exponential LR decay once past max_epoch warm epochs.
                    lr_decay = config.lr_decay**max(i + 1 - config.max_epoch,
                                                    0)
                    mtrain.assign_lr(session, config.learning_rate * lr_decay)
                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    print("Epoch: %d Learning rate: %.3f" %
                          (i + 1, session.run(mtrain.lr)),
                          file=logfile)
                    train_perplexity = run_epoch(session,
                                                 mtrain,
                                                 eval_op=mtrain.train_op,
                                                 data_feeder=train_feeder,
                                                 verbose=True)
                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    print("Epoch: %d Train Perplexity: %.3f" %
                          (i + 1, train_perplexity),
                          file=logfile)
                    logfile.flush()
                    valid_perplexity = run_epoch(session,
                                                 mvalid,
                                                 data_feeder=valid_feeder)
                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    print("Epoch: %d Valid Perplexity: %.3f" %
                          (i + 1, valid_perplexity),
                          file=logfile)
                    logfile.flush()
                    print("save path:", FLAGS.save_path)
                    # Save model if FLAGS.mode == "pretrain" or "finetune"
                    if FLAGS.save_path:
                        print("Saving model to %s." % FLAGS.save_path,
                              file=logfile)
                        step = mtrain.get_global_step(session)
                        pretrain_save_path = os.path.join(
                            FLAGS.save_path, "pretrain")
                        if not os.path.isdir(pretrain_save_path):
                            os.mkdir(pretrain_save_path)
                        model_save_path = os.path.join(pretrain_save_path,
                                                       FLAGS.model_name)
                        sv.save(session, model_save_path, global_step=step)
                    print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                          "] Begin exporting graph!")
                    export_graph(session)  # Export dense graph
                    print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                          "] Finish exporting graph!")
                # Evaluate on test data for {"pretrain", "finetune",} phase
                # NOTE(review): the graph is exported a second time here and the
                # message says "Begin exporting graph!" again (never matched by a
                # "Finish" line) — looks like a copy/paste slip; confirm whether
                # the duplicate export is intentional as a final export.
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Begin exporting graph!")
                export_graph(session)  # Export dense graph
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Begin test epoch!")
                sys.stdout.flush()
                print("=" * 30 + FLAGS.mode + "=" * 30, file=logfile)
                test_perplexity = run_evaluate_epoch(
                    session,
                    mtest,
                    logfile,
                    word_dict=data_utility.id2token_out,
                    data_feeder=test_feeder)
                print("Test Perplexity: %.3f" % test_perplexity, file=logfile)
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Finish test epoch!")
                print("Test Perplexity: %.3f" % test_perplexity)  # print to stdout
                logfile.close()
            elif FLAGS.mode == "learn_basis":
                # Restore the dense pretrained model, then learn the sparse
                # bases for embedding and softmax; no training epochs here.
                sv.restore(
                    session,
                    tf.train.latest_checkpoint(
                        os.path.join(FLAGS.save_path, "pretrain")))
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Begin learning embedding basis!")
                learn_sparse_embedding(session, mtrain, verbose=True)
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Finish learning embedding basis!")
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Begin learning softmax basis!")
                learn_sparse_softmax(session, mtrain, verbose=True)
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Finish learning softmax basis!")
                sys.exit(0)
            elif FLAGS.mode == "finetune":
                # Restore pre-trained model
                sv.restore(
                    session,
                    tf.train.latest_checkpoint(
                        os.path.join(FLAGS.save_path, "pretrain")))
                for i in range(config.finetune_epoch // FLAGS.laptop_discount):
                    # Step-wise decay: LR drops every max_epoch fine-tune epochs.
                    lr_decay = config.lr_decay**(i // config.max_epoch)
                    mtrain.assign_lr(session,
                                     config.finetune_learning_rate * lr_decay)
                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    print("Epoch: %d Learning rate: %.3f" %
                          (i + 1, session.run(mtrain.lr)),
                          file=logfile)
                    train_perplexity = run_epoch(session,
                                                 mtrain,
                                                 eval_op=mtrain.train_op,
                                                 data_feeder=train_feeder,
                                                 verbose=True)
                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    print("Epoch: %d Train Perplexity: %.3f" %
                          (i + 1, train_perplexity),
                          file=logfile)
                    logfile.flush()
                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    valid_perplexity = run_epoch(session,
                                                 mvalid,
                                                 data_feeder=valid_feeder)
                    print("Epoch: %d Valid Perplexity: %.3f" %
                          (i + 1, valid_perplexity),
                          file=logfile)
                    logfile.flush()
                    # Save model if FLAGS.mode == "pretrain" or "finetune"
                    if FLAGS.save_path:
                        print("Saving model to %s." % FLAGS.save_path,
                              file=logfile)
                        step = mtrain.get_global_step(session)
                        finetune_save_path = os.path.join(
                            FLAGS.save_path, "finetune-" + FLAGS.model_config)
                        if not os.path.isdir(finetune_save_path):
                            os.mkdir(finetune_save_path)
                        model_save_path = os.path.join(finetune_save_path,
                                                       FLAGS.model_name)
                        sv.save(session, model_save_path, global_step=step)
                    # Export sparse graph at every iteration
                    print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                          "] Begin exporting graph!")
                    export_graph(session)  # Export dense graph
                    print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                          "] Finish exporting graph!")
                # Evaluate on test data for {"pretrain", "finetune",} phase
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Begin test epoch!")
                sys.stdout.flush()
                print("=" * 30 + FLAGS.mode + "=" * 30, file=logfile)
                test_perplexity = run_evaluate_epoch(
                    session,
                    mtest,
                    logfile,
                    word_dict=data_utility.id2token_out,
                    data_feeder=test_feeder)
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Finish test epoch!")
                print("Test Perplexity: %.3f" % test_perplexity, file=logfile)
                print("Test Perplexity: %.3f" % test_perplexity)  # print to stdout
                logfile.close()
def train(self, target): start_time = time() img_width, img_height = 26, 99 epochs = 20 batch_size = 32 tb_callback = CB.TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=1, write_graph=True, write_grads=True, write_images=True, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None) m = models.Models() print('Training with target "{0}".'.format(target)) du = DataUtility(bucket_id='kaggle_voice_data', root_folder='/') if K.image_data_format() == 'channels_first': input_shape = (1, img_width, img_height) else: input_shape = (img_width, img_height, 1) model = m.get_covn2d_six_layer_model(input_shape, 1) X, Y = du.load_local_binary_data('../../data/npz', target) # X, Y = du.load_cloud_binary_data(target) x_train, y_train, x_test, y_test = train_test_split(X, Y, test_size=0.1, random_state=42) # x_train -> Training data to feed the net # x_test -> Training data for evaluation # y_train -> VALIDATION data for net input # y_test -> Expected Validation output # # Train the network with x_train and x_test # Evaluate the network with y_train and y_test # x_test = np_utils.to_categorical(x_test, 2) # y_test = np_utils.to_categorical(y_test, 2) new_x_train = np.expand_dims(x_train, axis=3) new_y_train = np.expand_dims(y_train, axis=3) # datagen = ImageDataGenerator( # featurewise_std_normalization=True, # rotation_range=0, # height_shift_range=0.2, # horizontal_flip=False # ) # Fit the data generator to the test data for featurewise_std. 
#datagen.fit(new_x_train) # x_train = x_train[0:nb_train_samples] # x_test = x_test[0:nb_train_samples] # y_train = y_train[0:nb_validation_samples] # y_test = y_test[0:nb_validation_samples] #model.fit_generator(datagen.flow(new_x_train, x_test, batch_size=batch_size), # steps_per_epoch=len(x_train) / batch_size, epochs=epochs, validation_data=(new_y_train, y_test)) history = model.fit(x=new_x_train, y=x_test, validation_data=(new_y_train, y_test), batch_size=batch_size, epochs=epochs, verbose=0, callbacks=[tb_callback]) stop_time = time() print("Total training time: {0} seconds.".format( int(stop_time - start_time))) # model.save("./local_big_training") du.save_multi_model(self.save_dir, '{0}'.format(target), model) print("Model saved as {0}.h5".format(target)) return {"name": target, "accuracy": history.history['acc']}
class InputEngineRnn:
    """RNN language-model inference engine.

    Restores a trained PTBModel checkpoint (minus the sparse embedding and
    softmax weights, which are not produced during pretraining) and serves
    per-step top-k next-word predictions by threading the LSTM state through
    one ``session.run`` call per input token.
    """

    def __init__(self, model_path, model_name, config_name,
                 full_vocab_path=None):
        """Build the inference graph and restore the checkpoint.

        Args:
            model_path: directory holding vocab files, config and checkpoint.
            model_name: checkpoint file name inside *model_path*.
            config_name: config file name inside *model_path*.
            full_vocab_path: optional full input-word vocabulary file.
        """
        vocab_file_in_words = os.path.join(model_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(model_path, "vocab_in_letters")
        vocab_file_out = os.path.join(model_path, "vocab_out")
        model_file = os.path.join(model_path, model_name)
        config_file = os.path.join(model_path, config_name)
        self._config = Config()
        self._config.get_config(config_file)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            max_sentence_length=self._config.num_steps,
            full_vocab_file_in_words=full_vocab_path)
        # Online decoding consumes one token at a time.
        self._config.batch_size = 1
        self._config.num_steps = 1
        with tf.Graph().as_default():
            with tf.variable_scope("Model"):
                self._language_model_test = PTBModel(is_training=False,
                                                     config=self._config,
                                                     bucket=1)
            gpu_config = tf.ConfigProto()
            gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
            self._sess = tf.Session(config=gpu_config)
            with self._sess.as_default():
                # Do not restore sparse weights from pretrain phase
                restore_variables = dict()
                for v in tf.trainable_variables():
                    if v.name.startswith("Model/Softmax/softmax_sp_trainable_weights") \
                            or v.name.startswith("Model/Embedding/embedding_sp_trainable_weights"):
                        continue
                    print("restore:", v.name)
                    restore_variables[v.name] = v
                saver = tf.train.Saver(restore_variables)
                saver.restore(self._sess, model_file)
            # Tensors fetched at every decode step.
            self._fetches = {
                "topk": self._language_model_test._top_k_prediction,
                "probability": self._language_model_test._probabilities,
                "final_state": self._language_model_test.final_state
            }

    def _run_step(self, state, input_id, k):
        """Run one decode step for *input_id* with recurrent *state*.

        Returns the fetched dict {"topk", "probability", "final_state"}.
        (Extracted from four duplicated feed-dict blocks.)
        """
        return self._sess.run(
            self._fetches,
            feed_dict={
                self._language_model_test.initial_state: state,
                self._language_model_test.input_data: [[input_id]],
                self._language_model_test.target_data: [[0]],
                self._language_model_test.output_masks: [[0.0]],
                self._language_model_test.top_k: k
            })

    def predict(self, sentence, k):
        """Return top-k next-word candidates after feeding *sentence*.

        Each candidate is {'word': ..., 'probability': float}; an '<unk>'
        prediction is replaced by '<' + the typed letters + '>'.
        """
        state = self._sess.run(self._language_model_test.initial_state)
        inputs, word_letters = self._data_utility.sentence2ids(sentence)
        if not inputs:
            # Robustness fix: previously an empty id list crashed with a
            # NameError on the post-loop fetches; return "no candidates".
            return []
        for input_id in inputs:
            vals = self._run_step(state, input_id, k)
            state = vals["final_state"]
        # Use the fetches of the final step only.
        topk = vals["topk"][0]
        probability = vals["probability"][0]
        probability_topk = [probability[wid] for wid in topk]  # renamed: was `id` (shadowed builtin)
        words_out = self._data_utility.ids2outwords(topk)
        return [{
            'word': word,
            'probability': float(probability)
        } if word != '<unk>' else {
            'word': '<' + word_letters + '>',
            'probability': float(probability)
        } for word, probability in zip(words_out, probability_topk)
                ] if len(words_out) > 0 else []

    def predict_data(self, sentence):
        """Return str(...) of per-step top-3 word lists for one data line.

        Returns None when the line cannot be converted to ids.
        """
        sentence = sentence.rstrip()
        state = self._sess.run(self._language_model_test.initial_state)
        inputs, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)
        if inputs is None:  # fixed: was `== None`
            return None
        words_out = []
        for input_id in inputs:
            vals = self._run_step(state, input_id, 3)
            state = vals["final_state"]
            top3 = vals["topk"][0]
            words_out.append(self._data_utility.ids2outwords(top3))
        # With context words, report predictions around the typed word;
        # otherwise prepend an empty triple for the missing first prediction.
        out_str = str(words_out[words_num - 1:words_num +
                                letters_num] if words_num > 0 else
                      [['', '', '']] + words_out[0:letters_num])
        return out_str

    def predict_file(self, test_file_in, test_file_out):
        """Run predict_data over every line of *test_file_in*.

        Writes "<input> |#| <predictions>" per line to *test_file_out* and
        prints the total elapsed seconds.  (Now uses context managers so the
        files are closed even if a prediction raises.)
        """
        with open(test_file_in, "r") as testfilein, \
                open(test_file_out, 'w') as testfileout:
            t1 = time.time()
            for sentence in testfilein:
                sentence = sentence.rstrip()
                out_str = self.predict_data(sentence)
                if (out_str):
                    print(sentence + " |#| " + out_str)
                    testfileout.write(sentence + " |#| " + out_str + "\n")
                else:
                    print("predict error : " + sentence)
            t2 = time.time()
            print(t2 - t1)

    def predict_data_probability(self, sentence):
        """Like predict_data, but formats each candidate with its probability.

        Output format: candidates joined by " | " within a step and steps
        joined by " || ".  Returns None when the line cannot be converted.
        """
        sentence = sentence.rstrip()
        state = self._sess.run(self._language_model_test.initial_state)
        inputs, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)
        if inputs is None:  # fixed: was `== None`
            return None
        words_out = []
        probability_out = []
        for input_id in inputs:
            vals = self._run_step(state, input_id, 3)
            state = vals["final_state"]
            top3 = vals["topk"][0]
            probability = vals["probability"][0]
            probability_top3 = [probability[wid] for wid in top3]  # renamed: was `id`
            words_out.append(self._data_utility.ids2outwords(top3))
            probability_out.append(probability_top3)
        # NOTE(review): the slicing below trims the leading separator AND the
        # last character of each segment ([3:-1] / [4:-1]), dropping the final
        # digit of the last probability, and the words_num == 0 branch never
        # applies the final [4:-1] trim.  Preserved byte-for-byte because the
        # downstream consumer's expected format is unknown — confirm intent.
        out_str = ''
        if words_num > 0:
            words_out_use = words_out[words_num - 1:words_num + letters_num]
            probability_out_use = probability_out[words_num - 1:words_num +
                                                  letters_num]
            for words, probabilities in zip(words_out_use,
                                            probability_out_use):
                out_str_line = ''
                for word, probability in zip(words, probabilities):
                    out_str_line = out_str_line + " | " + word + ' # ' + '{:.8f}'.format(
                        probability)
                out_str_line = out_str_line[3:-1]
                out_str = out_str + " || " + out_str_line
            out_str = out_str[4:-1]
        else:
            words_out_use = words_out[0:letters_num]
            probability_out_use = probability_out[0:letters_num]
            for words, probabilities in zip(words_out_use,
                                            probability_out_use):
                out_str_line = ''
                for word, probability in zip(words, probabilities):
                    out_str_line = out_str_line + " | " + word + ' # ' + '{:.8f}'.format(
                        probability)
                out_str_line = out_str_line[3:-1]
                out_str = out_str + " || " + out_str_line
        return out_str

    def predict_file_probability(self, test_file_in, test_file_out):
        """Run predict_data_probability over every line of *test_file_in*."""
        with open(test_file_in, "r") as testfilein, \
                open(test_file_out, 'w') as testfileout:
            t1 = time.time()
            for sentence in testfilein:
                sentence = sentence.rstrip()
                out_str = self.predict_data_probability(sentence)
                if (out_str):
                    print(sentence + " |#| " + out_str)
                    testfileout.write(sentence + " |#| " + out_str + "\n")
                else:
                    print("predict error : " + sentence)
            t2 = time.time()
            print(t2 - t1)

    def save_model(self, out_path):
        """Serialize the session's GraphDef to <out_path>/graph_rnn.pb (binary)."""
        tf.train.write_graph(self._sess.graph_def, out_path, "graph_rnn.pb",
                             False)