def eval(self, top_n=(1, 3, 10)):
    """Compute top-n retrieval accuracies on the eval set.

    Encodes the eval source sequences batch by batch, ranks every target
    encoding by dot-product similarity, and compares the ranked targets
    against the stored eval labels.

    Args:
        top_n: iterable of cutoff values n to report accuracy at.

    Returns:
        List of mean top-n accuracies, one entry per value in top_n.
    """
    self.model.set_forward_only(True)
    batch_size = 600
    num_batches = math.ceil(len(self.srcSeq_batch) / batch_size)

    # The ranking of targets does not depend on n, so encode and rank each
    # batch exactly once instead of redoing the whole forward pass for
    # every cutoff in top_n (the original re-encoded per cutoff).
    ranked_per_batch = []
    for batch_id in range(num_batches):
        feed_dict = self.model.get_source_encoding_feed_dict(
            self.srcSeq_batch[batch_id * batch_size:(batch_id + 1) * batch_size])
        source_encodings = self.session.run(
            [self.model.src_seq_embedding], feed_dict=feed_dict)
        source_encodings = np.vstack(source_encodings)
        # Similarity of each source encoding against every target encoding.
        distances = np.dot(source_encodings, self.targetEncodings.T)
        _, ranked_idx = data_utils.getSortedResults(distances)
        ranked_per_batch.append(ranked_idx)

    acc = []
    for n in top_n:
        batch_acc = [
            data_utils.computeTopK_TightVersion_accuracy(
                n,
                self.eval_Labels[batch_id * batch_size:(batch_id + 1) * batch_size],
                ranked_idx)
            for batch_id, ranked_idx in enumerate(ranked_per_batch)
        ]
        acc.append(np.mean(batch_acc))
    return acc
def relevanceRanking():
    """Flask handler for the search-ranking task.

    GET parameters, e.g. /api/search?query=red nike shoes&nbest=10:
        query: search keywords to rank listings against.
        nbest: number of results to return (default 10).

    Returns a JSON payload with the query and the top-nbest listings.
    """
    keywords = request.args.get('query')
    nbest = int(request.args.get('nbest', 10))

    # Tokenize the query and fit it to the model's fixed sequence length.
    source_tokens = app.encoder.encode(tf.compat.as_str(keywords).lower())
    src_len = len(source_tokens)
    max_seq_length = int(app.modelConfigs['max_seq_length'])
    # '>=' (original used '>'): with src_len == max_seq_length the pad
    # branch produced a sequence of length max_seq_length + 1
    # (PAD_ID * -1 is empty, and EOS was still appended).
    if src_len >= max_seq_length:
        source_tokens = source_tokens[:max_seq_length]
    else:
        source_tokens = (source_tokens + [text_encoder.EOS_ID] +
                         [text_encoder.PAD_ID] * (max_seq_length - src_len - 1))
    # Cap the fed sequence length at max_seq_length; the original passed
    # src_len + 1 even when the sequence had been truncated.
    seq_len = min(src_len + 1, max_seq_length)

    # Encode the query and score it against the precomputed target index.
    # (feed_dict renamed from 'dict', which shadowed the builtin.)
    feed_dict = app.model.get_source_encoding_feed_dict(
        np.array([source_tokens]), np.array([seq_len]))
    sourceEncodings = app.sess.run([app.model.src_seq_embedding],
                                   feed_dict=feed_dict)
    sourceEncodings = np.vstack(sourceEncodings)
    distances = np.dot(sourceEncodings, app.targetEncodings.T)
    rankedScore, rankedIdx = data_utils.getSortedResults(distances)

    top_confs = rankedScore[0][:nbest]
    top_tgtIDs = [app.targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
    top_tgtNames = [app.targetIDNameMap[tgt_id] for tgt_id in top_tgtIDs]

    topResults = []
    for idx in range(nbest):
        print('top%d: %s , %f , %s ' %
              (idx + 1, top_tgtIDs[idx], top_confs[idx], top_tgtNames[idx]))
        topResults.append({
            'ListingId': top_tgtIDs[idx],
            'ListingTitle': top_tgtNames[idx],
            'rankingScore': float(top_confs[idx]),
        })
    return jsonify({'SearchQuery': keywords,
                    'SearchRankingResults': topResults})
def classification():
    """Flask handler for the classification task.

    GET parameters, e.g. /api/classify?keywords=hello kitty sunglasses&nbest=8:
        keywords: text to classify.
        nbest: number of categories to return (default 8).

    Returns a JSON payload with the keywords and the top-nbest categories.
    """
    keywords = request.args.get('keywords')
    nbest = int(request.args.get('nbest', 8))

    # Tokenize the input and fit it to the model's fixed sequence length.
    source_tokens = app.encoder.encode(tf.compat.as_str(keywords).lower())
    src_len = len(source_tokens)
    max_seq_length = int(app.modelConfigs['max_seq_length'])
    # '>=' (original used '>'): with src_len == max_seq_length the pad
    # branch produced a sequence of length max_seq_length + 1
    # (PAD_ID * -1 is empty, and EOS was still appended).
    if src_len >= max_seq_length:
        source_tokens = source_tokens[:max_seq_length]
    else:
        source_tokens = (source_tokens + [text_encoder.EOS_ID] +
                         [text_encoder.PAD_ID] * (max_seq_length - src_len - 1))
    # Cap the fed sequence length at max_seq_length; the original passed
    # src_len + 1 even when the sequence had been truncated.
    seq_len = min(src_len + 1, max_seq_length)

    # Encode with the *normalized* source embedding and score against the
    # precomputed target index. (feed_dict renamed from 'dict', which
    # shadowed the builtin.)
    feed_dict = app.model.get_source_encoding_feed_dict(
        np.array([source_tokens]), np.array([seq_len]))
    sourceEncodings = app.sess.run([app.model.norm_src_seq_embedding],
                                   feed_dict=feed_dict)
    sourceEncodings = np.vstack(sourceEncodings)
    distances = np.dot(sourceEncodings, app.targetEncodings.T)
    rankedScore, rankedIdx = data_utils.getSortedResults(distances)

    top_confs = rankedScore[0][:nbest]
    top_tgtIDs = [app.targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
    top_tgtNames = [app.targetIDNameMap[tgt_id] for tgt_id in top_tgtIDs]

    topResults = []
    for idx in range(nbest):
        print('top%d: %s , %f , %s ' %
              (idx + 1, top_tgtIDs[idx], top_confs[idx], top_tgtNames[idx]))
        topResults.append({
            'targetCategoryId': top_tgtIDs[idx],
            'targetCategoryName': top_tgtNames[idx],
            'confidenceScore': float(top_confs[idx]),
        })
    # NOTE(review): 'ReqeustKeywords' is a typo for 'RequestKeywords', but it
    # is part of the public JSON contract — fixing it would break clients,
    # so it is kept as-is and flagged here.
    return jsonify({'ReqeustKeywords': keywords,
                    'ClassificationResults': topResults})
def crosslingualSearch():
    """Flask handler for the cross-lingual search task.

    GET parameters, e.g. /api/crosslingual?query=nike运动鞋&nbest=10:
        query: search keywords (any language the encoder covers).
        nbest: number of documents to return (default 10).

    Returns a JSON payload with the query and the top-nbest documents.
    """
    keywords = request.args.get('query')
    nbest = int(request.args.get('nbest', 10))

    # Tokenize the query, then left-pad (or truncate) to the model's fixed
    # sequence length, always terminating with EOS.
    source_tokens = app.encoder.encode(tf.compat.as_str(keywords).lower())
    srclen = len(source_tokens)
    max_seq_length = int(app.modelConfigs['max_seq_length'])
    if srclen > max_seq_length - 2:
        logging.info(
            'Input sentence too long, max allowed is %d. Try to increase limit!!!!'
            % (max_seq_length))
        source_tokens = ([text_encoder.PAD_ID] +
                         source_tokens[:max_seq_length - 2] +
                         [text_encoder.EOS_ID])
    else:
        source_tokens = ([text_encoder.PAD_ID] * (max_seq_length - srclen - 1) +
                         source_tokens + [text_encoder.EOS_ID])

    # Encode the query and score it against the precomputed target index.
    # (feed_dict renamed from 'dict', which shadowed the builtin.)
    feed_dict = app.model.get_source_encoding_feed_dict(
        np.array([source_tokens]))
    sourceEncodings = app.sess.run([app.model.src_seq_embedding],
                                   feed_dict=feed_dict)
    sourceEncodings = np.vstack(sourceEncodings)
    distances = np.dot(sourceEncodings, app.targetEncodings.T)
    rankedScore, rankedIdx = data_utils.getSortedResults(distances)

    top_confs = rankedScore[0][:nbest]
    top_tgtIDs = [app.targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
    top_tgtNames = [app.targetIDNameMap[tgt_id] for tgt_id in top_tgtIDs]

    topResults = []
    for idx in range(nbest):
        logging.info(
            'top%d: %s , %f , %s '
            % (idx + 1, top_tgtIDs[idx], top_confs[idx], top_tgtNames[idx]))
        topResults.append({
            'documentId': top_tgtIDs[idx],
            'documentTitle': top_tgtNames[idx],
            'confScore': float(top_confs[idx]),
        })
    return jsonify({
        'CrossLingualQuery': keywords,
        'SearchResults': topResults
    })
def questionAnswering():
    """Flask handler for the question-answering task.

    GET parameters, e.g. /api/qna?question=how does secure pay work&nbest=5:
        question: the user's question text.
        nbest: number of answers to return (default 5).

    Returns a JSON payload with the question and the top-nbest answers.
    """
    keywords = request.args.get('question')
    nbest = int(request.args.get('nbest', 5))

    # Tokenize the question, then left-pad (or truncate) to the model's
    # fixed sequence length, always terminating with EOS.
    source_tokens = app.encoder.encode(tf.compat.as_str(keywords).lower())
    srclen = len(source_tokens)
    max_seq_length = int(app.modelConfigs['max_seq_length'])
    if srclen > max_seq_length - 2:
        print(
            'Input sentence too long, max allowed is %d. Try to increase limit!!!!'
            % (max_seq_length))
        source_tokens = ([text_encoder.PAD_ID] +
                         source_tokens[:max_seq_length - 2] +
                         [text_encoder.EOS_ID])
    else:
        source_tokens = ([text_encoder.PAD_ID] * (max_seq_length - srclen - 1) +
                         source_tokens + [text_encoder.EOS_ID])

    # Encode the question and score it against the precomputed answer index.
    # (feed_dict renamed from 'dict', which shadowed the builtin.)
    feed_dict = app.model.get_source_encoding_feed_dict(
        np.array([source_tokens]))
    sourceEncodings = app.sess.run([app.model.src_seq_embedding],
                                   feed_dict=feed_dict)
    sourceEncodings = np.vstack(sourceEncodings)
    distances = np.dot(sourceEncodings, app.targetEncodings.T)
    rankedScore, rankedIdx = data_utils.getSortedResults(distances)

    top_confs = rankedScore[0][:nbest]
    top_tgtIDs = [app.targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
    top_tgtNames = [app.targetIDNameMap[tgt_id] for tgt_id in top_tgtIDs]

    topResults = []
    for idx in range(nbest):
        print('top%d: %s , %f , %s '
              % (idx + 1, top_tgtIDs[idx], top_confs[idx], top_tgtNames[idx]))
        topResults.append({
            'answerDocId': top_tgtIDs[idx],
            'answerContent': top_tgtNames[idx],
            'confidenceScore': float(top_confs[idx]),
        })
    return jsonify({'Question': keywords, 'Answers': topResults})
def demo(nbest):
    """Interactive console demo: read keywords from stdin and print the
    top-nbest matches from the precomputed target index.

    Expects FLAGS.model_dir to contain 'vocabulary.txt', FLAGS.indexFile
    and a trained checkpoint; exits with -1 when anything is missing.

    Args:
        nbest: number of top predictions to print per query.
    """
    if not os.path.exists(FLAGS.model_dir):
        print('Model folder does not exist!!')
        exit(-1)
    if not os.path.exists(os.path.join(FLAGS.model_dir, 'vocabulary.txt')):
        print(
            'Error!! Could not find vocabulary file for encoder in model folder.'
        )
        exit(-1)
    encoder = text_encoder.SubwordTextEncoder(
        filename=os.path.join(FLAGS.model_dir, 'vocabulary.txt'))
    if not os.path.exists(os.path.join(FLAGS.model_dir, FLAGS.indexFile)):
        print('Index file does not exist!!!')
        exit(-1)

    # Load the full-set target index: one "<id>\t<name>\t<csv encoding>"
    # record per line. Use a context manager so the file handle is closed
    # (the original codecs.open(...).readlines() leaked it).
    targetEncodings = []
    targetIDs = []
    idLabelMap = {}
    targetIDNameMap = {}
    idx = 0
    with codecs.open(os.path.join(FLAGS.model_dir, FLAGS.indexFile),
                     'rt', 'utf-8') as index_file:
        for line in index_file:
            info = line.strip().split('\t')
            if len(info) != 3:
                print('Error in targetIndexFile! %s' % line)
                continue
            tgtid, tgtseq, tgtEncoding = info[0], info[1], info[2]
            targetIDs.append(tgtid)
            targetEncodings.append(
                [float(f) for f in tgtEncoding.strip().split(',')])
            idLabelMap[tgtid] = idx
            targetIDNameMap[tgtid] = tgtseq
            idx += 1
    targetEncodings = np.array(targetEncodings)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # Load model configs and restore the latest checkpoint.
        modelConfigs = data_utils.load_model_configs(FLAGS.model_dir)
        model = sse_model.SSEModel(modelConfigs)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print(
                'Error!!!Could not load any model from specified folder: %s'
                % FLAGS.model_dir)
            exit(-1)

        # Loop-invariant setup, hoisted out of the query loop (the original
        # re-set forward-only mode and re-parsed the config every query).
        model.set_forward_only(True)
        max_seq_length = int(modelConfigs['max_seq_length'])

        # Decode from standard input until the user types 'exit'.
        sys.stdout.write(
            "\n\nPlease type some keywords to get related task results.\nType 'exit' to quit demo.\n > "
        )
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence and sentence.strip().lower() != 'exit':
            # Tokenize, then left-pad (or truncate) to the fixed sequence
            # length, always terminating with EOS.
            source_tokens = encoder.encode(tf.compat.as_str(sentence).lower())
            srclen = len(source_tokens)
            if srclen > max_seq_length - 2:
                print(
                    'Input sentence too long, max allowed is %d. Try to increase limit!!!!'
                    % (max_seq_length))
                source_tokens = ([text_encoder.PAD_ID] +
                                 source_tokens[:max_seq_length - 2] +
                                 [text_encoder.EOS_ID])
            else:
                source_tokens = (
                    [text_encoder.PAD_ID] * (max_seq_length - srclen - 1) +
                    source_tokens + [text_encoder.EOS_ID])

            # Encode the query and rank all indexed targets by similarity.
            feed_dict = model.get_source_encoding_feed_dict(
                np.array([source_tokens]))
            sourceEncodings = sess.run([model.src_seq_embedding],
                                       feed_dict=feed_dict)
            sourceEncodings = np.vstack(sourceEncodings)
            distances = np.dot(sourceEncodings, targetEncodings.T)
            rankedScore, rankedIdx = data_utils.getSortedResults(distances)
            top_confs = rankedScore[0][:nbest]
            top_tgtIDs = [targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
            top_tgtNames = [targetIDNameMap[tgt_id] for tgt_id in top_tgtIDs]

            print('Top %s Prediction results are:\n' % nbest)
            for rank in range(nbest):
                print('top%d: %s , %f , %s ' %
                      (rank + 1, top_tgtIDs[rank], top_confs[rank],
                       top_tgtNames[rank]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()