def main(unused_argv):
    """Build a subword vocabulary from the flag-selected input and store it.

    Exactly one of ``--corpus_filepattern`` / ``--vocab_filepattern`` has to
    be set. Token counts come from the matching ``tokenizer`` helper, are fed
    into a fresh ``SubwordTextEncoder`` and the result is written to
    ``FLAGS.output_filename``.

    Args:
        unused_argv: Ignored.

    Raises:
        ValueError: If both, or neither, of the two pattern flags are set.
    """
    corpus_pattern = FLAGS.corpus_filepattern
    vocab_pattern = FLAGS.vocab_filepattern

    # Reject the two invalid flag combinations before doing any work.
    if corpus_pattern and vocab_pattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )
    if not (corpus_pattern or vocab_pattern):
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    if corpus_pattern:
        counts = tokenizer.corpus_token_counts(
            corpus_pattern,
            FLAGS.corpus_max_lines,
            FLAGS.do_lower,
            split_on_newlines=FLAGS.split_on_newlines)
    else:
        counts = tokenizer.vocab_token_counts(
            vocab_pattern, FLAGS.do_lower, FLAGS.corpus_max_lines)

    subword_encoder = text_encoder.SubwordTextEncoder()
    subword_encoder.build_from_token_counts(
        counts, FLAGS.min_count, FLAGS.num_iterations)
    subword_encoder.store_to_file(
        FLAGS.output_filename, add_single_quotes=False)
def index(model_dir, rawfile, encodeIndexFile, batchsize=10000):
    """Restore a trained model from ``model_dir`` and run ``createIndexFile``.

    The process exits with status -1 when the model folder, its
    ``vocabulary.txt`` or a checkpoint cannot be found.

    Args:
        model_dir: Folder holding ``vocabulary.txt``, the model configs and
            the checkpoint.
        rawfile: Forwarded unchanged to ``createIndexFile``.
        encodeIndexFile: Forwarded unchanged to ``createIndexFile``.
        batchsize: Forwarded unchanged to ``createIndexFile``.
    """
    if not os.path.exists(model_dir):
        print('Error! Model folder does not exist!! : %s' % model_dir)
        exit(-1)

    vocab_path = os.path.join(model_dir, 'vocabulary.txt')
    if not os.path.exists(vocab_path):
        print(
            'Error!! Could not find vocabulary file for encoder in folder :%s'
            % model_dir)
        exit(-1)

    encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)
    print("Loaded vocab size is: %d" % encoder.vocab_size)

    session_config = tf.ConfigProto(
        log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=session_config) as sess:
        # Build the model from its stored configuration.
        modelConfigs = data_utils.load_model_configs(model_dir)
        model = sse_model.SSEModel(modelConfigs)

        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if not checkpoint:
            print(
                'Error!!!Could not load any model from specified folder: %s'
                % model_dir)
            exit(-1)
        else:
            print("Reading model parameters from %s" %
                  checkpoint.model_checkpoint_path)
            model.saver.restore(sess, checkpoint.model_checkpoint_path)

        # Model is ready: produce the index file.
        max_len = int(modelConfigs['max_seq_length'])
        createIndexFile(model, encoder, rawfile, max_len, encodeIndexFile,
                        sess, batchsize)
def index(model_dir, rawfile, encodeIndexFile, batchsize=10000):
    """Restore a trained model from ``model_dir`` and run ``createIndexFile``.

    The model is constructed from individually converted entries of the
    stored model configuration. The process exits with status -1 when the
    model folder, its ``vocabulary.txt`` or a checkpoint cannot be found.

    Args:
        model_dir: Folder holding ``vocabulary.txt``, the model configs and
            the checkpoint.
        rawfile: Forwarded unchanged to ``createIndexFile``.
        encodeIndexFile: Forwarded unchanged to ``createIndexFile``.
        batchsize: Forwarded unchanged to ``createIndexFile``.
    """
    if not os.path.exists(model_dir):
        print('Error! Model folder does not exist!! : %s' % model_dir)
        exit(-1)

    vocab_path = os.path.join(model_dir, 'vocabulary.txt')
    if not os.path.exists(vocab_path):
        print('Error!! Could not find vocabulary file for encoder in folder :%s'
              % model_dir)
        exit(-1)

    encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)
    print("Loaded vocab size is: %d" % encoder.vocab_size)

    # Positional constructor arguments, in order, with the conversion applied
    # to each stored config value.
    positional_spec = (
        ('max_seq_length', int),
        ('max_gradient_norm', float),
        ('vocabsize', int),
        ('embedding_size', int),
        ('encoding_size', int),
        ('src_cell_size', int),
        ('tgt_cell_size', int),
        ('num_layers', int),
        ('learning_rate', float),
        ('learning_rate_decay_factor', float),
        ('targetSpaceSize', int),
    )

    session_config = tf.ConfigProto(
        log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=session_config) as sess:
        modelConfigs = data_utils.load_model_configs(model_dir)
        positional_args = [
            convert(modelConfigs[key]) for key, convert in positional_spec
        ]
        model = sse_model.SSEModel(
            *positional_args,
            network_mode=modelConfigs['network_mode'],
            forward_only=True,
            TOP_N=int(modelConfigs['TOP_N']))

        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if not checkpoint:
            print('Error!!!Could not load any model from specified folder: %s'
                  % model_dir)
            exit(-1)
        else:
            print("Reading model parameters from %s" %
                  checkpoint.model_checkpoint_path)
            model.saver.restore(sess, checkpoint.model_checkpoint_path)

        # Model is ready: produce the index file.
        max_len = int(modelConfigs['max_seq_length'])
        createIndexFile(model, encoder, rawfile, max_len, encodeIndexFile,
                        sess, batchsize)
def main(unused_argv):
    """Build a subword vocabulary from the flag-selected input and store it.

    Sets the TensorFlow logging verbosity from ``FLAGS.log_level``, gathers
    token counts from either a corpus or a vocab file pattern, builds a
    ``SubwordTextEncoder`` from them and writes it to
    ``FLAGS.output_filename``.

    Args:
        unused_argv: Ignored.

    Raises:
        ValueError: If ``FLAGS.log_level`` is not one of DEBUG/INFO/ERROR, or
            if both, or neither, of the two pattern flags are set.
    """
    allowed_levels = ['DEBUG', 'INFO', 'ERROR']
    if FLAGS.log_level not in allowed_levels:
        raise ValueError('Set verbosity among "DEBUG", "INFO", "ERROR"')
    tf.logging.set_verbosity(FLAGS.log_level)

    corpus_pattern = FLAGS.corpus_filepattern
    vocab_pattern = FLAGS.vocab_filepattern

    # Reject the two invalid flag combinations before doing any work.
    if corpus_pattern and vocab_pattern:
        raise ValueError(
            'Must only provide one of --corpus_filepattern or --vocab_filepattern'
        )
    if not (corpus_pattern or vocab_pattern):
        raise ValueError(
            'Must provide one of --corpus_filepattern or --vocab_filepattern')

    if corpus_pattern:
        counts = tokenizer.corpus_token_counts(
            corpus_pattern,
            FLAGS.corpus_max_lines,
            split_on_newlines=FLAGS.split_on_newlines,
            additional_chars=FLAGS.additional_chars)
    else:
        counts = tokenizer.vocab_token_counts(vocab_pattern,
                                              FLAGS.corpus_max_lines)

    subword_encoder = text_encoder.SubwordTextEncoder()
    subword_encoder.build_from_token_counts(
        counts,
        FLAGS.min_count,
        FLAGS.num_iterations,
        max_subtoken_length=FLAGS.max_subtoken_length,
        backward=FLAGS.backward)
    subword_encoder.store_to_file(
        FLAGS.output_filename, add_single_quotes=False)
def load_encodedTargetSpace(processed_data_dir):
    """Load the subword encoder and the encoded target space from disk.

    Reads ``vocabulary.txt`` and ``encoded.FullTargetSpace`` from
    ``processed_data_dir``. Each line of the latter must hold exactly three
    tab-separated fields: target id, target name, and a comma-separated list
    of integer token ids.

    :param processed_data_dir: folder containing both files.
    :return: tuple ``(encoder, encodedTgtSpace, tgtID_Name_Map)`` where
        ``encodedTgtSpace`` maps target id -> list of int token ids and
        ``tgtID_Name_Map`` maps target id -> target name.
    :raises ValueError: if either file is missing, or a line does not split
        into exactly three fields / contains a non-integer token id.
    """
    vocabFile = processed_data_dir + '/vocabulary.txt'
    if not gfile.Exists(vocabFile):
        raise ValueError(
            "Error!! Could not found vaculary file in model folder.")
    encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
    print("Loaded vocab size is: %d" % encoder.vocab_size)

    tgtEncodeFile = os.path.join(processed_data_dir, "encoded.FullTargetSpace")
    if not gfile.Exists(tgtEncodeFile):
        raise ValueError(
            "Error! could not found encoded.FullTargetSpace in model folder.")

    encodedTgtSpace = {}
    tgtID_Name_Map = {}
    print("Loading full target space index ...")
    # Use a context manager so the handle is closed even if a line is
    # malformed and parsing raises.
    with codecs.open(tgtEncodeFile, 'r', 'utf-8') as target_file:
        for line in target_file:
            tgtId, tgtName, tgtEncoding = line.strip().split('\t')
            tgtID_Name_Map[tgtId] = tgtName
            encodedTgtSpace[tgtId] = [int(i) for i in tgtEncoding.split(',')]
    return encoder, encodedTgtSpace, tgtID_Name_Map
def __init__(self, work_dir, rawdata_dir, rawvocabsize, max_seq_length):
    """Populate the instance from a cached JSON dump or from raw data.

    If ``<work_dir>/compressed`` exists, every key of that JSON file becomes
    an attribute and the encoder is loaded from (or built into)
    ``<work_dir>/vocabulary.txt``. Otherwise the data is produced by
    ``data_utils.prepare_raw_data`` and all non-callable attributes except
    ``encoder`` are dumped to ``<work_dir>/compressed``.

    Args:
        work_dir: Working folder holding the cache, vocabulary and corpus.
        rawdata_dir: Raw data location, used only when no cache exists.
        rawvocabsize: Target vocabulary size when a vocabulary is built.
        max_seq_length: Sequence length limit, used only when no cache
            exists (the cached value is used otherwise).
    """

    def _print_summary():
        # Console summary shared by both branches.
        print('-')
        print('Vocab size:', self.vocab_size, 'unique words')
        print('-')
        print('Max allowed sequence length:', self.max_seq_length)
        print('-')

    json_path = work_dir + '/compressed'

    if not os.path.exists(json_path):
        print('generating data from data path: %s' % rawdata_dir)
        prepared = data_utils.prepare_raw_data(
            rawdata_dir, work_dir, rawvocabsize, max_seq_length)
        (encoder, trainCorpus, evalCorpus, encodedFullTargetSpace,
         tgtIdNameMap) = prepared

        # Assignment order is kept stable because it determines the key
        # order of the JSON dump below.
        self.encoder = encoder
        self.rawTrainPosCorpus = trainCorpus
        self.rawEvalCorpus = evalCorpus
        self.max_seq_length = max_seq_length
        self.encodedFullTargetSpace = encodedFullTargetSpace
        self.tgtIdNameMap = tgtIdNameMap
        self.vocab_size = encoder.vocab_size
        self.fullSetTargetIds = list(encodedFullTargetSpace.keys())
        self.rawnegSetLen = len(self.fullSetTargetIds)
        _print_summary()

        # Everything except the encoder and callables goes into the cache.
        snapshot = {
            name: attr
            for name, attr in self.__dict__.items()
            if not name.startswith("__") and name != 'encoder'
            and not callable(attr) and type(attr) is not staticmethod
        }
        with open(json_path, 'w') as fout:
            json.dump(snapshot, fout)
        print('Processed data dumped')
    else:
        # Restore the previously dumped attributes.
        print('loading saved json data from %s' % json_path)
        with open(json_path, 'r') as fin:
            cached = json.load(fin)
        for attr_name, attr_value in cached.items():
            setattr(self, attr_name, attr_value)

        # The encoder is not part of the dump; rebuild it from the
        # vocabulary file, creating that file first if necessary.
        vocabFile = work_dir + '/vocabulary.txt'
        if not os.path.exists(vocabFile):
            print(
                "No supplied vocabulary file found. Build new vocabulary based on training data ...."
            )
            token_counts = tokenizer.corpus_token_counts(
                work_dir + '/*.Corpus', 2000000, split_on_newlines=True)
            encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
                rawvocabsize, token_counts, 2, 1000)
            encoder.store_to_file(vocabFile)
            print("New vocabulary constructed.")
        else:
            print("Loading supplied vocabluary file: %s" % vocabFile)
            encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
            print("Total vocab size is: %d" % encoder.vocab_size)

        self.encoder = encoder
        self.max_seq_length = int(self.max_seq_length)
        self.vocab_size = encoder.vocab_size
        _print_summary()
def __init__(self, *args, **kwargs):
    """Initialise the Flask app: load encoder, target index and model.

    Configuration is read from the environment: ``MODEL_TYPE`` (default
    ``"classification"``) selects the folder ``models-<MODEL_TYPE>`` and
    ``INDEX_FILE`` (default ``"targetEncodingIndex.tsv"``) names the index
    file inside it. The process exits with status -1 when the model folder,
    the index file, ``vocabulary.txt`` or a checkpoint is missing.

    Index lines must contain three tab-separated fields (target id, target
    sequence, comma-separated floats); other lines are reported and skipped.

    Args:
        *args: Forwarded to the parent class constructor.
        **kwargs: Forwarded to the parent class constructor.
    """
    super(FlaskApp, self).__init__(*args, **kwargs)
    self.model = 'Do my initialization work here, loading model and index ....'
    self.model_type = os.environ.get("MODEL_TYPE", "classification")
    self.model_dir = "models-" + self.model_type
    self.indexFile = os.environ.get("INDEX_FILE", "targetEncodingIndex.tsv")
    print("In app class: Received flask appconfig is: " +
          os.environ.get('MODEL_TYPE', 'Default_classification'))

    if not os.path.exists(self.model_dir):
        print('Model folder %s does not exist!!' % self.model_dir)
        exit(-1)
    index_path = os.path.join(self.model_dir, self.indexFile)
    if not os.path.exists(index_path):
        print('Index File does not exist!!')
        exit(-1)

    # Encoder vocabulary.
    vocab_path = os.path.join(self.model_dir, 'vocabulary.txt')
    if not os.path.exists(vocab_path):
        print('Error!! Could not find vocabulary file for encoder in model folder.')
        exit(-1)
    self.encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)

    # Full target index. The file is opened in a context manager so the
    # handle is released even when a row fails to parse.
    self.targetEncodings = []
    self.targetIDs = []
    self.targetIDNameMap = {}
    with codecs.open(index_path, 'r', 'utf-8') as index_file:
        for line in index_file:
            info = line.strip().split('\t')
            if len(info) != 3:
                print('Error in targetIndexFile! %s' % line)
                continue
            tgtid, tgtseq, tgtEncoding = info
            self.targetIDs.append(tgtid)
            self.targetEncodings.append(
                [float(f) for f in tgtEncoding.strip().split(',')])
            self.targetIDNameMap[tgtid] = tgtseq
    self.targetEncodings = np.array(self.targetEncodings)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    self.sess = tf.Session(config=cfg)

    # Build the model from its stored configuration and restore weights.
    self.modelConfigs = data_utils.load_model_configs(self.model_dir)
    self.model = sse_model.SSEModel(
        int(self.modelConfigs['max_seq_length']),
        float(self.modelConfigs['max_gradient_norm']),
        int(self.modelConfigs['vocabsize']),
        int(self.modelConfigs['embedding_size']),
        int(self.modelConfigs['encoding_size']),
        int(self.modelConfigs['src_cell_size']),
        int(self.modelConfigs['tgt_cell_size']),
        int(self.modelConfigs['num_layers']),
        float(self.modelConfigs['learning_rate']),
        float(self.modelConfigs['learning_rate_decay_factor']),
        int(self.modelConfigs['targetSpaceSize']),
        network_mode=self.modelConfigs['network_mode'],
        forward_only=True,
        TOP_N=int(self.modelConfigs['TOP_N']))
    ckpt = tf.train.get_checkpoint_state(self.model_dir)
    if ckpt:
        print("loading model from %s" % ckpt.model_checkpoint_path)
        self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
        print('Error!!!Could not load any model from specified folder: %s' %
              self.model_dir)
        exit(-1)
def prepare_raw_data(raw_data_dir, processed_data_dir, vocabulary_size,
                     neg_samples, max_seq_length):
    """Prepare SSE training/evaluation data, the encoder and the target space.

    Steps: unpack the data set via ``get_data_set``, load or build
    ``vocabulary.txt``, encode every entry of ``targetIDs`` into
    ``encoded.FullTargetSpace`` (EOS-terminated and padded to
    ``max_seq_length``), then build the evaluation and training corpora with
    ``gen_postive_corpus``.

    Targets whose encoding is longer than ``max_seq_length - 1`` tokens are
    reported on stdout and skipped.

    :param raw_data_dir: location of the raw data set.
    :param processed_data_dir: folder that receives/holds the processed files.
    :param vocabulary_size: target size used when a vocabulary must be built.
    :param neg_samples: not used by this function.
    :param max_seq_length: fixed length of every encoded target sequence.
    :return: tuple ``(encoder, trainCorpus, evalCorpus,
        encodedFullTargetSpace, tgtIdNameMap)``.
    """
    # unzip corpus to the specified processed directory.
    get_data_set(raw_data_dir, processed_data_dir)

    # generate vocab file if not available, otherwise use the supplied one
    vocabFile = processed_data_dir + '/vocabulary.txt'
    if gfile.Exists(vocabFile):
        print("Loading supplied vocabluary file: %s" % vocabFile)
        encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
        print("Total vocab size is: %d" % encoder.vocab_size)
    else:
        print("No supplied vocabulary file found. Build new vocabulary based on training data ....")
        token_counts = tokenizer.corpus_token_counts(
            processed_data_dir + '/*.Corpus', 1000000, split_on_newlines=True)
        encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
            vocabulary_size, token_counts, 2, 1000)
        encoder.store_to_file(vocabFile)
        print("New vocabulary constructed.")

    # create encoded TargetSpace data
    encodedFullTargetSpace = {}
    tgtIdNameMap = {}
    encoded_path = os.path.join(processed_data_dir, "encoded.FullTargetSpace")
    target_ids_path = os.path.join(processed_data_dir, "targetIDs")
    # Both handles are managed so they are closed even if a row is malformed.
    with codecs.open(encoded_path, 'w', 'utf-8') as encodedFullTargetFile, \
            codecs.open(target_ids_path, 'r', 'utf-8') as target_ids_file:
        for line in target_ids_file:
            tgtSeq, tgt_id = line.strip().split('\t')
            token_ids = encoder.encode(tgtSeq.lower())
            seqlen = len(token_ids)
            # One slot is reserved for the EOS marker.
            if seqlen > max_seq_length - 1:
                print(
                    'Error Detected!!! \n Target:\n %s \n Its seq length is:%d, which is longer than MAX_SEQ_LENTH of %d. Try to increase limit!!!!'
                    % (tgtSeq, seqlen, max_seq_length))
                continue
            padding = [text_encoder.PAD_ID] * (max_seq_length - seqlen - 1)
            token_ids = token_ids + [text_encoder.EOS_ID] + padding
            encodedFullTargetSpace[tgt_id] = token_ids
            tgtIdNameMap[tgt_id] = tgtSeq
            encodedFullTargetFile.write(
                tgt_id + '\t' + tgtSeq.strip() + '\t' +
                ','.join([str(i) for i in token_ids]) + '\n')

    # create positive evaluation corpus: (source_tokens, verifiedTgtIds)
    evalCorpus = gen_postive_corpus(
        os.path.join(processed_data_dir, "EvalPairs"), encodedFullTargetSpace,
        encoder, max_seq_length)

    # create positive training corpus: (source_tokens, verifiedTgtIds)
    trainCorpus = gen_postive_corpus(
        os.path.join(processed_data_dir, "TrainPairs"),
        encodedFullTargetSpace, encoder, max_seq_length)
    return encoder, trainCorpus, evalCorpus, encodedFullTargetSpace, tgtIdNameMap
def prepare_raw_data(raw_data_dir, processed_data_dir, vocabulary_size,
                     task_type, max_seq_length):
    """Get SSE training/evaluation data, the encoder and the target space.

    Args:
      raw_data_dir: directory that contains the raw zipped dataset.
      processed_data_dir: directory in which the processed data sets are
        stored.
      vocabulary_size: size of the vocabulary to create if no
        ``vocabulary.txt`` is found in ``processed_data_dir``; otherwise the
        supplied vocabulary file is used.
      task_type: selects the corpus loader (case-insensitive, surrounding
        whitespace ignored). Accepted values: ``classification``,
        ``ranking``, ``crosslingual``, ``qna``.
      max_seq_length: max number of tokens of a single source/target sequence.

    Returns:
      A tuple of 5 elements, as produced by the task-specific corpus loader:
        (1) the SubwordTextEncoder,
        (2) the training corpus,
        (3) the evaluation (dev) corpus,
        (4) the encoded target space,
        (5) the target id -> name map.

    Raises:
      ValueError: if ``task_type`` is not one of the accepted values. The
        check runs before any data is extracted or any vocabulary is built.
    """
    task = task_type.lower().strip()
    # Fail fast: validate before the expensive extraction / vocab building.
    if task not in ("classification", "ranking", "crosslingual", "qna"):
        raise ValueError(
            "Unsupported task_type. Please use one of: "
            "classification, ranking, crosslingual, qna")

    # extract corpus to the specified processed directory.
    get_data_set(raw_data_dir, processed_data_dir)

    # generate vocab file if not available, otherwise use the supplied one
    vocabFile = processed_data_dir + '/vocabulary.txt'
    if gfile.Exists(vocabFile):
        print("Loading supplied vocabluary file: %s" % vocabFile)
        encoder = text_encoder.SubwordTextEncoder(filename=vocabFile)
        print("Total vocab size is: %d" % encoder.vocab_size)
    else:
        print("No supplied vocabulary file found. Build new vocabulary based on training data ....")
        token_counts = tokenizer.corpus_token_counts(
            processed_data_dir + '/*.Corpus', 1000000, split_on_newlines=True)
        encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
            vocabulary_size, token_counts, 2, 1000)
        encoder.store_to_file(vocabFile)
        print("New vocabulary constructed.")

    # create training corpus and evaluation corpus per task_type
    if task == "classification":
        corpus_loader = get_classification_corpus
    elif task == "qna":
        corpus_loader = get_questionAnswer_corpus
    else:
        # "ranking" and "crosslingual" share the search corpus loader.
        corpus_loader = get_search_corpus
    train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap = corpus_loader(
        processed_data_dir, encoder, max_seq_length)

    return encoder, train_corpus, dev_corpus, encodedTgtSpace, tgtIdNameMap
def get_or_generate_vocab_inner(
    data_dir,
    vocab_filename,
    vocab_size,
    generator,
    max_subtoken_length=None,
    reserved_tokens=None,
):
    """Return an existing subword vocabulary or build one from a generator.

    Args:
      data_dir: Base directory for data and vocab files. If falsy, the vocab
        is neither looked up on disk nor saved.
      vocab_filename: Filename of the vocab file, relative to `data_dir`. If
        falsy, the vocab is neither looked up on disk nor saved.
      vocab_size: Target size of the vocabulary built by SubwordTextEncoder.
      generator: A generator that produces tokens from the vocabulary.
      max_subtoken_length: Optional integer. Set this to a finite value to
        avoid quadratic costs during vocab building.
      reserved_tokens: List of reserved tokens.
        `text_encoder.RESERVED_TOKENS` should be a prefix of
        `reserved_tokens`. If `None`, defaults to `RESERVED_TOKENS`.

    Returns:
      A SubwordTextEncoder vocabulary object.
    """
    # A storage path exists only when both pieces were supplied.
    vocab_filepath = None
    if data_dir and vocab_filename:
        vocab_filepath = os.path.join(data_dir, vocab_filename)

    # Reuse a previously stored vocabulary when there is one.
    if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        return text_encoder.SubwordTextEncoder(vocab_filepath)

    tf.logging.info("Generating vocab file: %s", vocab_filepath)
    build_options = {
        "max_subtoken_length": max_subtoken_length,
        "reserved_tokens": reserved_tokens,
    }
    vocab = text_encoder.SubwordTextEncoder.build_from_generator(
        generator, vocab_size, **build_options)

    if vocab_filepath:
        tf.gfile.MakeDirs(data_dir)
        vocab.store_to_file(vocab_filepath)

    return vocab
def __init__(self, *args, **kwargs):
    """Initialise the Flask app: logging, encoder, target index and model.

    Configuration is read from the environment: ``MODEL_TYPE`` (default
    ``"classification"``) selects the folder ``models-<MODEL_TYPE>`` and
    ``INDEX_FILE`` (default ``"targetEncodingIndex.tsv"``) names the index
    file inside it. The root logger is set to DEBUG and gets a stdout
    handler plus a rotating file handler at ``./logs/WebServerLog.txt``.
    The process exits with status -1 when the model folder, the index file,
    ``vocabulary.txt`` or a checkpoint is missing.

    Index lines must contain three tab-separated fields (target id, target
    sequence, comma-separated floats); other lines are logged and skipped.

    Args:
        *args: Forwarded to the parent class constructor.
        **kwargs: Forwarded to the parent class constructor.
    """
    super(FlaskApp, self).__init__(*args, **kwargs)
    self.model = 'Do my initialization work here, loading model and index ....'
    self.model_type = os.environ.get("MODEL_TYPE", "classification")
    self.model_dir = "models-" + self.model_type
    self.indexFile = os.environ.get("INDEX_FILE", "targetEncodingIndex.tsv")

    # exist_ok already makes this a no-op when the folder is present.
    os.makedirs("./logs", exist_ok=True)

    log = logging.getLogger('')
    log.setLevel(logging.DEBUG)
    log_format = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt='%m/%d/%Y %I:%M:%S %p')
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(log_format)
    log.addHandler(ch)
    fh = handlers.RotatingFileHandler(
        './logs/WebServerLog.txt', maxBytes=(1048576 * 20), backupCount=7)
    fh.setFormatter(log_format)
    log.addHandler(fh)

    logging.info("In app class: Received flask appconfig is: " +
                 os.environ.get('MODEL_TYPE', 'Default_classification'))

    if not os.path.exists(self.model_dir):
        logging.error('Model folder %s does not exist!!' % self.model_dir)
        exit(-1)
    index_path = os.path.join(self.model_dir, self.indexFile)
    if not os.path.exists(index_path):
        logging.error('Index File does not exist!!')
        exit(-1)

    # Encoder vocabulary.
    vocab_path = os.path.join(self.model_dir, 'vocabulary.txt')
    if not os.path.exists(vocab_path):
        logging.error(
            'Error!! Could not find vocabulary file for encoder in model folder.'
        )
        exit(-1)
    self.encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)

    # Full target index. The file is opened in a context manager so the
    # handle is released even when a row fails to parse.
    self.targetEncodings = []
    self.targetIDs = []
    self.targetIDNameMap = {}
    with codecs.open(index_path, 'r', 'utf-8') as index_file:
        for line in index_file:
            info = line.strip().split('\t')
            if len(info) != 3:
                logging.info('Error in targetIndexFile! %s' % line)
                continue
            tgtid, tgtseq, tgtEncoding = info
            self.targetIDs.append(tgtid)
            self.targetEncodings.append(
                [float(f) for f in tgtEncoding.strip().split(',')])
            self.targetIDNameMap[tgtid] = tgtseq
    self.targetEncodings = np.array(self.targetEncodings)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    self.sess = tf.Session(config=cfg)

    # Build the model from its stored configuration and restore weights.
    self.modelConfigs = data_utils.load_model_configs(self.model_dir)
    self.model = sse_model.SSEModel(self.modelConfigs)
    ckpt = tf.train.get_checkpoint_state(self.model_dir)
    if ckpt:
        logging.info("loading model from %s" % ckpt.model_checkpoint_path)
        self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
        logging.error(
            'Error!!!Could not load any model from specified folder: %s' %
            self.model_dir)
        exit(-1)
def demo(nbest):
    """Interactive demo: read sentences from stdin and print the top matches.

    Loads ``vocabulary.txt``, the target index file ``FLAGS.indexFile`` and
    the model checkpoint from ``FLAGS.model_dir``. Each input line is
    encoded, left-padded/truncated to the model's ``max_seq_length``, run
    through the model, and scored against every target encoding with a dot
    product. The loop ends on EOF or when the user types ``exit``.

    The process exits with status -1 when the model folder, vocabulary,
    index file or checkpoint is missing.

    Args:
        nbest: Maximum number of results printed per query. Fewer are printed
            when the index holds fewer targets.
    """
    if not os.path.exists(FLAGS.model_dir):
        print('Model folder does not exist!!')
        sys.exit(-1)

    vocab_path = os.path.join(FLAGS.model_dir, 'vocabulary.txt')
    if not os.path.exists(vocab_path):
        print(
            'Error!! Could not find vocabulary file for encoder in model folder.'
        )
        sys.exit(-1)
    encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)

    index_path = os.path.join(FLAGS.model_dir, FLAGS.indexFile)
    if not os.path.exists(index_path):
        print('Index file does not exist!!!')
        sys.exit(-1)

    # Load the full target index.
    targetEncodings = []
    targetIDs = []
    targetIDNameMap = {}
    # codecs.open appends 'b' to the mode itself, so a mode containing 't'
    # ('rt' -> 'rtb') is rejected by Python 3's open(); plain 'r' is correct.
    with codecs.open(index_path, 'r', 'utf-8') as index_file:
        for line in index_file:
            info = line.strip().split('\t')
            if len(info) != 3:
                print('Error in targetIndexFile! %s' % line)
                continue
            tgtid, tgtseq, tgtEncoding = info
            targetIDs.append(tgtid)
            targetEncodings.append(
                [float(f) for f in tgtEncoding.strip().split(',')])
            targetIDNameMap[tgtid] = tgtseq
    targetEncodings = np.array(targetEncodings)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # TODO: improve here later
        # Build the model from its stored configuration and restore weights.
        modelConfigs = data_utils.load_model_configs(FLAGS.model_dir)
        model = sse_model.SSEModel(modelConfigs)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print(
                'Error!!!Could not load any model from specified folder: %s' %
                FLAGS.model_dir)
            sys.exit(-1)

        # Loop-invariant: the sequence length does not change between queries.
        max_seq_length = int(modelConfigs['max_seq_length'])

        # Decode from standard input.
        sys.stdout.write(
            "\n\nPlease type some keywords to get related task results.\nType 'exit' to quit demo.\n > "
        )
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence and sentence.strip().lower() != 'exit':
            # Get token-ids for the input sentence.
            source_tokens = encoder.encode(tf.compat.as_str(sentence).lower())
            srclen = len(source_tokens)
            if srclen > max_seq_length - 2:
                print(
                    'Input sentence too long, max allowed is %d. Try to increase limit!!!!'
                    % (max_seq_length))
                # Truncate, keeping one leading PAD and the trailing EOS.
                source_tokens = ([text_encoder.PAD_ID] +
                                 source_tokens[:max_seq_length - 2] +
                                 [text_encoder.EOS_ID])
            else:
                # Left-pad so the sequence ends with EOS at max_seq_length.
                source_tokens = ([text_encoder.PAD_ID] *
                                 (max_seq_length - srclen - 1) +
                                 source_tokens + [text_encoder.EOS_ID])

            feed_dict = model.get_source_encoding_feed_dict(
                np.array([source_tokens]))
            model.set_forward_only(True)
            sourceEncodings = sess.run([model.src_seq_embedding],
                                       feed_dict=feed_dict)
            sourceEncodings = np.vstack(sourceEncodings)
            distances = np.dot(sourceEncodings, targetEncodings.T)
            rankedScore, rankedIdx = data_utils.getSortedResults(distances)
            top_confs = rankedScore[0][:nbest]
            top_tgtIDs = [targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
            top_tgtNames = [targetIDNameMap[tgt_id] for tgt_id in top_tgtIDs]

            print('Top %s Prediction results are:\n' % nbest)
            # zip() stops at the shortest list, so an index with fewer than
            # nbest targets no longer raises IndexError.
            results = zip(top_tgtIDs, top_confs, top_tgtNames)
            for rank, (tgt_id, conf, tgt_name) in enumerate(results, 1):
                print('top%d:  %s , %f , %s ' % (rank, tgt_id, conf, tgt_name))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
# Translation direction picked by the user in the UI.
state.direction_choice = st.selectbox('Direction', directions)


@st.cache(allow_output_mutation=True)
def init(direction_choice):
    """Return ``(resource, greeting)`` for the given translation direction.

    The result depends only on ``direction_choice`` (not on the global
    ``state``), so the cached value is determined by the function argument.

    Args:
        direction_choice: Direction label; ``"English to Vietnamese"``
            selects the ``envi_pure_tall9`` resource, anything else selects
            ``vien_pure_tall9``.

    Returns:
        A 2-tuple of whatever ``get_resource`` returns and the greeting text.
    """
    if direction_choice == "English to Vietnamese":
        return (
            get_resource('envi_pure_tall9'),
            'Welcome to the best ever translation project for Vietnamese !')
    return (get_resource('vien_pure_tall9'),
            'Chào mừng bạn đến với dự án dịch tiếng Việt tốt nhất !')


state.encoder = text_encoder.SubwordTextEncoder(vocab_file)
# Keep the raw vocabulary lines alongside the encoder.
with open(vocab_file, 'r') as f:
    state.vocab = f.read().split('\n')

# The caller unpacks the resource into (model, model_path).
(state.model, state.model_path), state.prompt = init(state.direction_choice)

# Reset the per-direction UI flags when the user switched direction
# (but not on the very first run, when no previous choice exists).
if state.direction_choice != state.prev_choice and state.prev_choice is not None:
    state.like = False
    state.submit = False
    state.first_time = True
state.prev_choice = state.direction_choice

write_ui()