def index(model_dir, rawfile, encodeIndexFile, batchsize=10000):
    """Encode every entry of *rawfile* with the trained SSE model and write
    the resulting target-encoding index to *encodeIndexFile*.

    Args:
        model_dir: folder containing the trained checkpoint, model configs
            and 'vocabulary.txt'.
        rawfile: path of the raw target-sequence file to index.
        encodeIndexFile: output path for the generated encoding index.
        batchsize: number of entries encoded per session run (default 10000).

    Terminates the process with status -1 when the model folder, the
    vocabulary file or a checkpoint cannot be found.
    """
    if not os.path.exists(model_dir):
        print('Error! Model folder does not exist!! : %s' % model_dir)
        # sys.exit instead of the site-module `exit` helper, which is not
        # guaranteed to be available in non-interactive runs.
        sys.exit(-1)
    if not os.path.exists(os.path.join(model_dir, 'vocabulary.txt')):
        print('Error!! Could not find vocabulary file for encoder in folder :%s'
              % model_dir)
        sys.exit(-1)
    # Sub-word encoder rebuilt from the vocabulary shipped with the model.
    encoder = text_encoder.SubwordTextEncoder(
        filename=os.path.join(model_dir, 'vocabulary.txt'))
    print("Loaded vocab size is: %d" % encoder.vocab_size)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # load model configs and restore the latest checkpoint
        modelConfigs = data_utils.load_model_configs(model_dir)
        model = sse_model.SSEModel(modelConfigs)
        ckpt = tf.train.get_checkpoint_state(model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('Error!!!Could not load any model from specified folder: %s'
                  % model_dir)
            sys.exit(-1)
        # start to indexing
        createIndexFile(model, encoder, rawfile,
                        int(modelConfigs['max_seq_length']), encodeIndexFile,
                        sess, batchsize)
def index(model_dir, rawfile, encodeIndexFile, batchsize=10000):
    """Encode every entry of *rawfile* with the trained SSE model and write
    the resulting target-encoding index to *encodeIndexFile*.

    model_dir: folder holding the checkpoint, model configs and 'vocabulary.txt'.
    rawfile: raw target-sequence file to be indexed.
    encodeIndexFile: output path of the generated index.
    batchsize: entries encoded per batch (default 10000).

    Exits the process with status -1 when the model folder, the vocabulary
    file or a checkpoint is missing.
    """
    if not os.path.exists( model_dir ):
        print('Error! Model folder does not exist!! : %s' % model_dir)
        exit(-1)
    if not os.path.exists( os.path.join(model_dir, 'vocabulary.txt' ) ):
        print('Error!! Could not find vocabulary file for encoder in folder :%s' % model_dir)
        exit(-1)
    # sub-word encoder rebuilt from the vocabulary shipped with the model
    encoder = text_encoder.SubwordTextEncoder(filename=os.path.join(model_dir, 'vocabulary.txt' ))
    print("Loaded vocab size is: %d" % encoder.vocab_size)
    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        #load model
        modelConfigs = data_utils.load_model_configs(model_dir)
        # NOTE(review): positional constructor — the argument order below must
        # match sse_model.SSEModel.__init__ exactly; confirm against sse_model.
        model = sse_model.SSEModel( int(modelConfigs['max_seq_length']),
                                    float(modelConfigs['max_gradient_norm']),
                                    int(modelConfigs['vocabsize']),
                                    int(modelConfigs['embedding_size']),
                                    int(modelConfigs['encoding_size']),
                                    int(modelConfigs['src_cell_size']),
                                    int(modelConfigs['tgt_cell_size']),
                                    int(modelConfigs['num_layers']),
                                    float(modelConfigs['learning_rate']),
                                    float(modelConfigs['learning_rate_decay_factor']),
                                    int(modelConfigs['targetSpaceSize']),
                                    network_mode=modelConfigs['network_mode'],
                                    forward_only=True,
                                    TOP_N=int(modelConfigs['TOP_N']) )
        ckpt = tf.train.get_checkpoint_state(model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('Error!!!Could not load any model from specified folder: %s' % model_dir)
            exit(-1)
        # start to indexing
        createIndexFile(model, encoder, rawfile, int(modelConfigs['max_seq_length']), encodeIndexFile, sess, batchsize)
def __init__(self, *args, **kwargs):
    """Flask application that loads the SSE model, the sub-word encoder and
    the pre-computed target-encoding index at startup.

    Reads MODEL_TYPE and INDEX_FILE from the environment to locate the model
    folder and the index file. Terminates the process with status -1 when
    the model folder, index file, vocabulary file or checkpoint is missing.
    """
    super(FlaskApp, self).__init__(*args, **kwargs)
    # placeholder; replaced with the real SSEModel instance below
    self.model = 'Do my initialization work here, loading model and index ....'
    self.model_type = os.environ.get("MODEL_TYPE", "classification")
    self.model_dir = "models-" + self.model_type
    self.indexFile = os.environ.get("INDEX_FILE", "targetEncodingIndex.tsv")
    print("In app class: Received flask appconfig is: " +
          os.environ.get('MODEL_TYPE', 'Default_classification'))
    if not os.path.exists(self.model_dir):
        print('Model folder %s does not exist!!' % self.model_dir)
        # sys.exit instead of the site-module `exit` helper
        sys.exit(-1)
    if not os.path.exists(os.path.join(self.model_dir, self.indexFile)):
        print('Index File does not exist!!')
        sys.exit(-1)
    # load full set targetSeqID data
    if not os.path.exists(os.path.join(self.model_dir, 'vocabulary.txt')):
        print('Error!! Could not find vocabulary file for encoder in model folder.')
        sys.exit(-1)
    self.encoder = text_encoder.SubwordTextEncoder(
        filename=os.path.join(self.model_dir, 'vocabulary.txt'))
    # load full set target Index data; each line is "id<TAB>sequence<TAB>csv-encoding"
    self.targetEncodings = []
    self.targetIDs = []
    self.targetIDNameMap = {}
    idx = 0
    # Fix: stream the index file inside a context manager instead of the
    # original codecs.open(...).readlines(), which never closed the handle
    # and materialized the whole file in memory.
    with codecs.open(os.path.join(self.model_dir, self.indexFile),
                     'r', 'utf-8') as index_fh:
        for line in index_fh:
            info = line.strip().split('\t')
            if len(info) != 3:
                print('Error in targetIndexFile! %s' % line)
                continue
            tgtid, tgtseq, tgtEncoding = info[0], info[1], info[2]
            self.targetIDs.append(tgtid)
            self.targetEncodings.append(
                [float(f) for f in tgtEncoding.strip().split(',')])
            self.targetIDNameMap[tgtid] = tgtseq
            idx += 1
    self.targetEncodings = np.array(self.targetEncodings)
    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    self.sess = tf.Session(config=cfg)
    # load model
    self.modelConfigs = data_utils.load_model_configs(self.model_dir)
    # NOTE(review): positional constructor — argument order must match
    # sse_model.SSEModel.__init__; confirm against sse_model.
    self.model = sse_model.SSEModel(
        int(self.modelConfigs['max_seq_length']),
        float(self.modelConfigs['max_gradient_norm']),
        int(self.modelConfigs['vocabsize']),
        int(self.modelConfigs['embedding_size']),
        int(self.modelConfigs['encoding_size']),
        int(self.modelConfigs['src_cell_size']),
        int(self.modelConfigs['tgt_cell_size']),
        int(self.modelConfigs['num_layers']),
        float(self.modelConfigs['learning_rate']),
        float(self.modelConfigs['learning_rate_decay_factor']),
        int(self.modelConfigs['targetSpaceSize']),
        network_mode=self.modelConfigs['network_mode'],
        forward_only=True,
        TOP_N=int(self.modelConfigs['TOP_N']))
    ckpt = tf.train.get_checkpoint_state(self.model_dir)
    if ckpt:
        print("loading model from %s" % ckpt.model_checkpoint_path)
        self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
        print('Error!!!Could not load any model from specified folder: %s'
              % self.model_dir)
        sys.exit(-1)
def __init__(self, *args, **kwargs):
    """Flask application that configures rotating-file + stdout logging and
    loads the SSE model, sub-word encoder and target-encoding index at
    startup.

    Reads MODEL_TYPE and INDEX_FILE from the environment to locate the model
    folder and the index file. Terminates the process with status -1 when
    the model folder, index file, vocabulary file or checkpoint is missing.
    """
    super(FlaskApp, self).__init__(*args, **kwargs)
    # placeholder; replaced with the real SSEModel instance below
    self.model = 'Do my initialization work here, loading model and index ....'
    self.model_type = os.environ.get("MODEL_TYPE", "classification")
    self.model_dir = "models-" + self.model_type
    self.indexFile = os.environ.get("INDEX_FILE", "targetEncodingIndex.tsv")
    if not os.path.exists("./logs"):
        os.makedirs("./logs", exist_ok=True)
    # root logger: everything to stdout plus a 20MB x 7 rotating file
    log = logging.getLogger('')
    log.setLevel(logging.DEBUG)
    # renamed from `format`, which shadowed the builtin
    log_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt='%m/%d/%Y %I:%M:%S %p')
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(log_formatter)
    log.addHandler(ch)
    fh = handlers.RotatingFileHandler('./logs/WebServerLog.txt',
                                      maxBytes=(1048576 * 20), backupCount=7)
    fh.setFormatter(log_formatter)
    log.addHandler(fh)
    logging.info("In app class: Received flask appconfig is: " +
                 os.environ.get('MODEL_TYPE', 'Default_classification'))
    if not os.path.exists(self.model_dir):
        logging.error('Model folder %s does not exist!!' % self.model_dir)
        # sys.exit instead of the site-module `exit` helper
        sys.exit(-1)
    if not os.path.exists(os.path.join(self.model_dir, self.indexFile)):
        logging.error('Index File does not exist!!')
        sys.exit(-1)
    # load full set targetSeqID data
    if not os.path.exists(os.path.join(self.model_dir, 'vocabulary.txt')):
        logging.error(
            'Error!! Could not find vocabulary file for encoder in model folder.')
        sys.exit(-1)
    self.encoder = text_encoder.SubwordTextEncoder(
        filename=os.path.join(self.model_dir, 'vocabulary.txt'))
    # load full set target Index data; each line is "id<TAB>sequence<TAB>csv-encoding"
    self.targetEncodings = []
    self.targetIDs = []
    self.targetIDNameMap = {}
    idx = 0
    # Fix: stream the index file inside a context manager instead of the
    # original codecs.open(...).readlines(), which never closed the handle
    # and materialized the whole file in memory.
    with codecs.open(os.path.join(self.model_dir, self.indexFile),
                     'r', 'utf-8') as index_fh:
        for line in index_fh:
            info = line.strip().split('\t')
            if len(info) != 3:
                logging.info('Error in targetIndexFile! %s' % line)
                continue
            tgtid, tgtseq, tgtEncoding = info[0], info[1], info[2]
            self.targetIDs.append(tgtid)
            self.targetEncodings.append(
                [float(f) for f in tgtEncoding.strip().split(',')])
            self.targetIDNameMap[tgtid] = tgtseq
            idx += 1
    self.targetEncodings = np.array(self.targetEncodings)
    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    self.sess = tf.Session(config=cfg)
    # load model
    self.modelConfigs = data_utils.load_model_configs(self.model_dir)
    self.model = sse_model.SSEModel(self.modelConfigs)
    ckpt = tf.train.get_checkpoint_state(self.model_dir)
    if ckpt:
        logging.info("loading model from %s" % ckpt.model_checkpoint_path)
        self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
        logging.error('Error!!!Could not load any model from specified folder: %s'
                      % self.model_dir)
        sys.exit(-1)
def demo(nbest):
    """Interactive demo: read keyword queries from stdin, encode each with
    the SSE model and print the *nbest* closest targets from the
    pre-computed encoding index.

    Args:
        nbest: number of top-scoring targets to display per query.

    Terminates with status -1 when the model folder, vocabulary, index file
    or checkpoint is missing. Type 'exit' at the prompt to quit.
    """
    if not os.path.exists(FLAGS.model_dir):
        print('Model folder does not exist!!')
        # sys.exit instead of the site-module `exit` helper
        sys.exit(-1)
    if not os.path.exists(os.path.join(FLAGS.model_dir, 'vocabulary.txt')):
        print('Error!! Could not find vocabulary file for encoder in model folder.')
        sys.exit(-1)
    encoder = text_encoder.SubwordTextEncoder(
        filename=os.path.join(FLAGS.model_dir, 'vocabulary.txt'))
    if not os.path.exists(os.path.join(FLAGS.model_dir, FLAGS.indexFile)):
        print('Index file does not exist!!!')
        sys.exit(-1)
    # load full set target Index data; each line is "id<TAB>sequence<TAB>csv-encoding"
    targetEncodings = []
    targetIDs = []
    idLabelMap = {}
    targetIDNameMap = {}
    idx = 0
    # Fix: stream the index file inside a context manager instead of the
    # original codecs.open(...).readlines(), which never closed the handle.
    with codecs.open(os.path.join(FLAGS.model_dir, FLAGS.indexFile),
                     'rt', 'utf-8') as index_fh:
        for line in index_fh:
            info = line.strip().split('\t')
            if len(info) != 3:
                print('Error in targetIndexFile! %s' % line)
                continue
            tgtid, tgtseq, tgtEncoding = info[0], info[1], info[2]
            targetIDs.append(tgtid)
            targetEncodings.append(
                [float(f) for f in tgtEncoding.strip().split(',')])
            idLabelMap[tgtid] = idx
            targetIDNameMap[tgtid] = tgtseq
            idx += 1
    targetEncodings = np.array(targetEncodings)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # TODO: improve here later
        # load model
        modelConfigs = data_utils.load_model_configs(FLAGS.model_dir)
        model = sse_model.SSEModel(modelConfigs)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('Error!!!Could not load any model from specified folder: %s'
                  % FLAGS.model_dir)
            sys.exit(-1)

        # Decode from standard input.
        sys.stdout.write(
            "\n\nPlease type some keywords to get related task results.\nType 'exit' to quit demo.\n > "
        )
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence and sentence.strip().lower() != 'exit':
            # Get token-ids for the input sentence.
            source_tokens = encoder.encode(tf.compat.as_str(sentence).lower())
            srclen = len(source_tokens)
            max_seq_length = int(modelConfigs['max_seq_length'])
            if srclen > max_seq_length - 2:
                print('Input sentence too long, max allowed is %d. Try to increase limit!!!!'
                      % (max_seq_length))
                # truncate, then frame with a leading PAD and trailing EOS
                source_tokens = [text_encoder.PAD_ID] + \
                    source_tokens[:max_seq_length - 2] + [text_encoder.EOS_ID]
            else:
                # left-pad to fixed length, terminate with EOS
                source_tokens = [text_encoder.PAD_ID] * (max_seq_length - srclen - 1) + \
                    source_tokens + [text_encoder.EOS_ID]
            feed_dict = model.get_source_encoding_feed_dict(
                np.array([source_tokens]))
            model.set_forward_only(True)
            sourceEncodings = sess.run([model.src_seq_embedding],
                                       feed_dict=feed_dict)
            #sourceEncodings = sess.run([model.norm_src_seq_embedding], feed_dict=feed_dict)
            sourceEncodings = np.vstack(sourceEncodings)
            # similarity = dot product against every target encoding
            distances = np.dot(sourceEncodings, targetEncodings.T)
            rankedScore, rankedIdx = data_utils.getSortedResults(distances)
            top_confs = rankedScore[0][:nbest]
            top_tgtIDs = [targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
            # renamed loop variable from `id`, which shadowed the builtin
            top_tgtNames = [targetIDNameMap[tgt_id] for tgt_id in top_tgtIDs]
            print('Top %s Prediction results are:\n' % nbest)
            for idx in range(nbest):
                print('top%d: %s , %f , %s '
                      % (idx + 1, top_tgtIDs[idx], top_confs[idx], top_tgtNames[idx]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def demo():
    """Interactive demo: read keyword queries from stdin, run the SSE model
    against the pre-encoded full target space and print the top-5 predicted
    targets with confidences.

    Terminates with status -1 when the model folder, the encoded target
    space file or a checkpoint is missing. Type 'exit' at the prompt to quit.
    """
    if not os.path.exists(FLAGS.model_dir):
        print('Model folder does not exist!!')
        # sys.exit instead of the site-module `exit` helper
        sys.exit(-1)
    encodedFullTargetSpace_path = os.path.join(FLAGS.model_dir,
                                               "encoded.FullTargetSpace")
    if not os.path.exists(encodedFullTargetSpace_path):
        print('Encoded full target space file not exist. Please ReTrain the model to get it!!')
        sys.exit(-1)
    # load full set targetSeqID data
    encoder, encodedTgtSpace, tgtID_Name_Map = data_utils.load_encodedTargetSpace(
        FLAGS.model_dir)
    fullTgtIdList = encodedTgtSpace.keys()
    # label index -> target id, for mapping predicted labels back to ids
    tgtLabel_IDMap = {idx: tgtid for (idx, tgtid) in enumerate(fullTgtIdList)}
    tgtInput_batches = [encodedTgtSpace[tgtid] for tgtid in fullTgtIdList]
    # effective length of each target sequence = position of first PAD + 1
    tgtLen_batches = [
        encodedTgtSpace[tgtid].index(text_encoder.PAD_ID) + 1
        for tgtid in fullTgtIdList
    ]

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # TODO: improve here later
        # load model
        modelConfigs = data_utils.load_model_configs(FLAGS.model_dir)
        # NOTE(review): positional constructor — argument order must match
        # sse_model.SSEModel.__init__; confirm against sse_model.
        model = sse_model.SSEModel(
            int(modelConfigs['max_seq_length']),
            float(modelConfigs['max_gradient_norm']),
            int(modelConfigs['vocabsize']),
            int(modelConfigs['embedding_size']),
            int(modelConfigs['encoding_size']),
            int(modelConfigs['src_cell_size']),
            int(modelConfigs['tgt_cell_size']),
            int(modelConfigs['num_layers']),
            float(modelConfigs['learning_rate']),
            float(modelConfigs['learning_rate_decay_factor']),
            int(modelConfigs['targetSpaceSize']),
            network_mode=modelConfigs['network_mode'],
            forward_only=True,
            TOP_N=int(modelConfigs['TOP_N']))
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('Error!!!Could not load any model from specified folder: %s'
                  % FLAGS.model_dir)
            sys.exit(-1)

        # Decode from standard input.
        sys.stdout.write(
            "\n\nPlease type some keywords to get related task results.\nType 'exit' to quit demo.\n > "
        )
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence and sentence.strip().lower() != 'exit':
            # Get token-ids for the input sentence.
            source_tokens = encoder.encode(tf.compat.as_str(sentence).lower())
            srclen = len(source_tokens)
            if srclen > int(modelConfigs['max_seq_length']) - 1:
                print('Max number of supported keywords is %d \n Please try againt!!!!'
                      % (int(modelConfigs['max_seq_length'])))
                continue
            # terminate with EOS, then right-pad to the fixed sequence length
            source_tokens = source_tokens + [text_encoder.EOS_ID] + \
                [text_encoder.PAD_ID] * (int(modelConfigs['max_seq_length']) - srclen - 1)
            print("")
            # renamed from `dict`, which shadowed the builtin
            feed_dict = model.get_predict_feed_dict(np.array([source_tokens]),
                                                    tgtInput_batches,
                                                    np.array([srclen]),
                                                    tgtLen_batches)
            pred_conf, pred_labels = sess.run(
                [model.predicted_tgts_score, model.predicted_labels],
                feed_dict=feed_dict)
            pred_labels = np.vstack(pred_labels)
            pred_conf = np.vstack(pred_conf)
            top5_confs = pred_conf[0][:5]
            top5_tgtIDs = [tgtLabel_IDMap[lbl] for lbl in pred_labels[0][:5]]
            top5_tgtNames = [tgtID_Name_Map[tgt_id] for tgt_id in top5_tgtIDs]
            print('Top 5 Prediction results are:\n')
            for idx in range(5):
                print('top%d: %s , %f , %s '
                      % (idx + 1, top5_tgtIDs[idx], top5_confs[idx], top5_tgtNames[idx]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()