def eval(self, top_n=(1, 3, 10)):
    """Evaluate top-n retrieval accuracy on the eval set.

    Encodes every source-sequence batch once, ranks the precomputed
    target encodings by dot-product similarity, and scores the ranking
    against ``self.eval_Labels`` for each cutoff in ``top_n``.

    Args:
        top_n: iterable of cutoffs ``n`` to report top-n accuracy for.

    Returns:
        List of mean top-n accuracies, one per entry in ``top_n``.
    """
    self.model.set_forward_only(True)
    batch_size = 600
    num_batches = math.ceil(len(self.srcSeq_batch) / batch_size)

    # Hoisted out of the top_n loop: the ranking does not depend on n,
    # so encode + rank each batch exactly once instead of once per n
    # (the original re-ran the session for every cutoff).
    ranked_indices = []
    for b in range(num_batches):
        feed_dict = self.model.get_source_encoding_feed_dict(
            self.srcSeq_batch[b * batch_size:(b + 1) * batch_size])
        encodings = self.session.run(
            [self.model.src_seq_embedding], feed_dict=feed_dict)
        encodings = np.vstack(encodings)
        distances = np.dot(encodings, self.targetEncodings.T)
        _, ranked_idx = data_utils.getSortedResults(distances)
        ranked_indices.append(ranked_idx)

    acc = []
    for n in top_n:
        batch_acc = [
            data_utils.computeTopK_TightVersion_accuracy(
                n,
                self.eval_Labels[b * batch_size:(b + 1) * batch_size],
                ranked_indices[b])
            for b in range(num_batches)
        ]
        acc.append(np.mean(batch_acc))
    return acc
# Beispiel #2
def relevanceRanking():
    """Flask handler for the search-ranking task.

    Parses GET parameters, e.g.: /api/search?query=red nike shoes&nbest=10
    Encodes the query with the subword encoder, pads/truncates it to the
    model's max_seq_length, scores it against all precomputed target
    encodings, and returns the top-nbest listings as JSON.
    """
    keywords = request.args.get('query')
    if 'nbest' in request.args:
        nbest = int(request.args.get('nbest'))
    else:
        nbest = 10

    # Get token-ids for the input sentence.
    source_tokens = app.encoder.encode(tf.compat.as_str(keywords).lower())
    src_len = len(source_tokens)
    max_seq_length = int(app.modelConfigs['max_seq_length'])
    # '>=' (was '>'): with src_len == max_seq_length the padding branch
    # produced max_seq_length + 1 tokens (tokens + EOS, PAD count of -1).
    if src_len >= max_seq_length:
        source_tokens = source_tokens[:max_seq_length]
    else:
        source_tokens = (source_tokens + [text_encoder.EOS_ID] +
                         [text_encoder.PAD_ID] *
                         (max_seq_length - src_len - 1))
    # Cap the reported length: the original passed src_len + 1 even when
    # the sequence had been truncated to max_seq_length tokens.
    seq_len = min(src_len + 1, max_seq_length)
    # Renamed from 'dict' — do not shadow the builtin.
    feed_dict = app.model.get_source_encoding_feed_dict(
        np.array([source_tokens]), np.array([seq_len]))
    sourceEncodings = app.sess.run([app.model.src_seq_embedding],
                                   feed_dict=feed_dict)
    sourceEncodings = np.vstack(sourceEncodings)
    distances = np.dot(sourceEncodings, app.targetEncodings.T)
    rankedScore, rankedIdx = data_utils.getSortedResults(distances)
    top_confs = rankedScore[0][:nbest]
    top_tgtIDs = [app.targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
    # 'tid' instead of 'id' — do not shadow the builtin.
    top_tgtNames = [app.targetIDNameMap[tid] for tid in top_tgtIDs]

    topResults = []
    for idx in range(nbest):
        print('top%d:  %s , %f ,  %s ' %
              (idx + 1, top_tgtIDs[idx], top_confs[idx], top_tgtNames[idx]))
        topResults.append({
            'ListingId': top_tgtIDs[idx],
            'ListingTitle': top_tgtNames[idx],
            'rankingScore': float(top_confs[idx]),
        })
    return jsonify({'SearchQuery': keywords,
                    'SearchRankingResults': topResults})
# Beispiel #3
def classification():
    """Flask handler for the classification task.

    Parses GET parameters, e.g.: /api/classify?keywords=hello kitty sunglasses&nbest=8
    Encodes the keywords, pads/truncates them to the model's
    max_seq_length, scores the normalized query encoding against all
    precomputed target-category encodings, and returns the top-nbest
    categories as JSON.
    """
    keywords = request.args.get('keywords')
    if 'nbest' in request.args:
        nbest = int(request.args.get('nbest'))
    else:
        nbest = 8

    # Get token-ids for the input sentence.
    source_tokens = app.encoder.encode(tf.compat.as_str(keywords).lower())
    src_len = len(source_tokens)
    max_seq_length = int(app.modelConfigs['max_seq_length'])
    # '>=' (was '>'): with src_len == max_seq_length the padding branch
    # produced max_seq_length + 1 tokens (tokens + EOS, PAD count of -1).
    if src_len >= max_seq_length:
        source_tokens = source_tokens[:max_seq_length]
    else:
        source_tokens = (source_tokens + [text_encoder.EOS_ID] +
                         [text_encoder.PAD_ID] *
                         (max_seq_length - src_len - 1))
    # Cap the reported length: the original passed src_len + 1 even when
    # the sequence had been truncated to max_seq_length tokens.
    seq_len = min(src_len + 1, max_seq_length)
    # Renamed from 'dict' — do not shadow the builtin.
    feed_dict = app.model.get_source_encoding_feed_dict(
        np.array([source_tokens]), np.array([seq_len]))
    # This handler uses the normalized embedding (unlike relevanceRanking,
    # which uses the raw src_seq_embedding).
    sourceEncodings = app.sess.run([app.model.norm_src_seq_embedding],
                                   feed_dict=feed_dict)
    sourceEncodings = np.vstack(sourceEncodings)
    distances = np.dot(sourceEncodings, app.targetEncodings.T)
    rankedScore, rankedIdx = data_utils.getSortedResults(distances)
    top_confs = rankedScore[0][:nbest]
    top_tgtIDs = [app.targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
    # 'tid' instead of 'id' — do not shadow the builtin.
    top_tgtNames = [app.targetIDNameMap[tid] for tid in top_tgtIDs]

    topResults = []
    for idx in range(nbest):
        print('top%d:  %s , %f ,  %s ' %
              (idx + 1, top_tgtIDs[idx], top_confs[idx], top_tgtNames[idx]))
        topResults.append({
            'targetCategoryId': top_tgtIDs[idx],
            'targetCategoryName': top_tgtNames[idx],
            'confidenceScore': float(top_confs[idx]),
        })
    # NOTE(review): 'ReqeustKeywords' is a typo but part of the public API
    # response schema; left unchanged so existing clients keep working.
    return jsonify({'ReqeustKeywords': keywords,
                    'ClassificationResults': topResults})
def crosslingualSearch():
    """Flask handler for the cross-lingual search task.

    Parses GET parameters, e.g.: /api/crosslingual?query=nike运动鞋&nbest=10
    Encodes the query, left-pads it with PAD tokens and appends EOS so the
    sequence is exactly max_seq_length long, scores it against all
    precomputed target-document encodings, and returns the top-nbest
    documents as JSON.
    """
    keywords = request.args.get('query')
    if 'nbest' in request.args:
        nbest = int(request.args.get('nbest'))
    else:
        nbest = 10

    # Get token-ids for the input sentence.
    source_tokens = app.encoder.encode(tf.compat.as_str(keywords).lower())

    srclen = len(source_tokens)
    max_seq_length = int(app.modelConfigs['max_seq_length'])
    # Both branches yield exactly max_seq_length tokens:
    #   long  : 1 PAD + (max-2) tokens + EOS
    #   short : (max - srclen - 1) PADs + tokens + EOS
    if srclen > max_seq_length - 2:
        # NOTE(review): the message reports max_seq_length, but the real
        # token limit here is max_seq_length - 2 — confirm before relying
        # on the logged number.
        logging.info(
            'Input sentence too long, max allowed is %d. Try to increase limit!!!!'
            % (max_seq_length))
        source_tokens = ([text_encoder.PAD_ID] +
                         source_tokens[:max_seq_length - 2] +
                         [text_encoder.EOS_ID])
    else:
        source_tokens = ([text_encoder.PAD_ID] *
                         (max_seq_length - srclen - 1) +
                         source_tokens + [text_encoder.EOS_ID])

    # Renamed from 'dict' — do not shadow the builtin.
    feed_dict = app.model.get_source_encoding_feed_dict(
        np.array([source_tokens]))
    sourceEncodings = app.sess.run([app.model.src_seq_embedding],
                                   feed_dict=feed_dict)
    sourceEncodings = np.vstack(sourceEncodings)
    distances = np.dot(sourceEncodings, app.targetEncodings.T)
    rankedScore, rankedIdx = data_utils.getSortedResults(distances)
    top_confs = rankedScore[0][:nbest]
    top_tgtIDs = [app.targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
    # 'tid' instead of 'id' — do not shadow the builtin.
    top_tgtNames = [app.targetIDNameMap[tid] for tid in top_tgtIDs]

    topResults = []
    for idx in range(nbest):
        logging.info(
            'top%d:  %s , %f ,  %s ' %
            (idx + 1, top_tgtIDs[idx], top_confs[idx], top_tgtNames[idx]))
        topResults.append({
            'documentId': top_tgtIDs[idx],
            'documentTitle': top_tgtNames[idx],
            'confScore': float(top_confs[idx]),
        })
    return jsonify({
        'CrossLingualQuery': keywords,
        'SearchResults': topResults
    })
# Beispiel #5
def questionAnswering():
    """Flask handler for the QnA task.

    Parses GET parameters, e.g.: /api/qna?question=how does secure pay work&nbest=5
    Encodes the question, left-pads it with PAD tokens and appends EOS so
    the sequence is exactly max_seq_length long, scores it against all
    precomputed answer-document encodings, and returns the top-nbest
    answers as JSON.
    """
    keywords = request.args.get('question')
    if 'nbest' in request.args:
        nbest = int(request.args.get('nbest'))
    else:
        nbest = 5

    # Get token-ids for the input sentence.
    source_tokens = app.encoder.encode(tf.compat.as_str(keywords).lower())
    srclen = len(source_tokens)
    max_seq_length = int(app.modelConfigs['max_seq_length'])
    # Both branches yield exactly max_seq_length tokens:
    #   long  : 1 PAD + (max-2) tokens + EOS
    #   short : (max - srclen - 1) PADs + tokens + EOS
    if srclen > max_seq_length - 2:
        # NOTE(review): the message reports max_seq_length, but the real
        # token limit here is max_seq_length - 2 — confirm before relying
        # on the logged number.
        print(
            'Input sentence too long, max allowed is %d. Try to increase limit!!!!'
            % (max_seq_length))
        source_tokens = ([text_encoder.PAD_ID] +
                         source_tokens[:max_seq_length - 2] +
                         [text_encoder.EOS_ID])
    else:
        source_tokens = ([text_encoder.PAD_ID] *
                         (max_seq_length - srclen - 1) +
                         source_tokens + [text_encoder.EOS_ID])

    # Renamed from 'dict' — do not shadow the builtin.
    feed_dict = app.model.get_source_encoding_feed_dict(
        np.array([source_tokens]))
    sourceEncodings = app.sess.run([app.model.src_seq_embedding],
                                   feed_dict=feed_dict)
    sourceEncodings = np.vstack(sourceEncodings)
    distances = np.dot(sourceEncodings, app.targetEncodings.T)
    rankedScore, rankedIdx = data_utils.getSortedResults(distances)
    top_confs = rankedScore[0][:nbest]
    top_tgtIDs = [app.targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
    # 'tid' instead of 'id' — do not shadow the builtin.
    top_tgtNames = [app.targetIDNameMap[tid] for tid in top_tgtIDs]

    topResults = []
    for idx in range(nbest):
        print('top%d:  %s , %f ,  %s ' %
              (idx + 1, top_tgtIDs[idx], top_confs[idx], top_tgtNames[idx]))
        topResults.append({
            'answerDocId': top_tgtIDs[idx],
            'answerContent': top_tgtNames[idx],
            'confidenceScore': float(top_confs[idx]),
        })
    return jsonify({'Question': keywords, 'Answers': topResults})
# Beispiel #6
def demo(nbest):
    """Interactive console demo.

    Loads the subword encoder, the precomputed target index, and the
    latest model checkpoint from FLAGS.model_dir, then reads sentences
    from stdin and prints the top-nbest nearest targets for each until
    the user types 'exit'.

    Args:
        nbest: number of top-ranked targets to print per query.
    """
    if not os.path.exists(FLAGS.model_dir):
        print('Model folder does not exist!!')
        sys.exit(-1)

    vocab_path = os.path.join(FLAGS.model_dir, 'vocabulary.txt')
    if not os.path.exists(vocab_path):
        print(
            'Error!! Could not find vocabulary file for encoder in model folder.'
        )
        sys.exit(-1)
    encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)

    index_path = os.path.join(FLAGS.model_dir, FLAGS.indexFile)
    if not os.path.exists(index_path):
        print('Index file does not exist!!!')
        sys.exit(-1)

    # Load the full-set target index: one "<id>\t<name>\t<csv-encoding>"
    # per line. Streamed with a context manager so the handle is closed
    # (the original leaked an unclosed codecs.open().readlines()).
    targetEncodings = []
    targetIDs = []
    targetIDNameMap = {}
    with codecs.open(index_path, 'rt', 'utf-8') as index_file:
        for line in index_file:
            info = line.strip().split('\t')
            if len(info) != 3:
                print('Error in targetIndexFile! %s' % line)
                continue
            tgtid, tgtseq, tgtEncoding = info
            targetIDs.append(tgtid)
            targetEncodings.append(
                [float(f) for f in tgtEncoding.strip().split(',')])
            targetIDNameMap[tgtid] = tgtseq
    targetEncodings = np.array(targetEncodings)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # TODO: improve here later
        # Load model configs and restore the latest checkpoint.
        modelConfigs = data_utils.load_model_configs(FLAGS.model_dir)
        model = sse_model.SSEModel(modelConfigs)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print(
                'Error!!!Could not load any model from specified folder: %s' %
                FLAGS.model_dir)
            sys.exit(-1)

        # Inference only; hoisted out of the REPL loop (the original
        # re-set it for every sentence).
        model.set_forward_only(True)
        max_seq_length = int(modelConfigs['max_seq_length'])

        # Decode from standard input.
        sys.stdout.write(
            "\n\nPlease type some keywords to get related task results.\nType 'exit' to quit demo.\n > "
        )
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence and sentence.strip().lower() != 'exit':
            # Token-ids for the input sentence, left-padded with PAD and
            # terminated by EOS so the sequence is exactly max_seq_length.
            source_tokens = encoder.encode(tf.compat.as_str(sentence).lower())
            srclen = len(source_tokens)
            if srclen > max_seq_length - 2:
                print(
                    'Input sentence too long, max allowed is %d. Try to increase limit!!!!'
                    % (max_seq_length))
                source_tokens = ([text_encoder.PAD_ID] +
                                 source_tokens[:max_seq_length - 2] +
                                 [text_encoder.EOS_ID])
            else:
                source_tokens = ([text_encoder.PAD_ID] *
                                 (max_seq_length - srclen - 1) +
                                 source_tokens + [text_encoder.EOS_ID])

            feed_dict = model.get_source_encoding_feed_dict(
                np.array([source_tokens]))
            sourceEncodings = sess.run([model.src_seq_embedding],
                                       feed_dict=feed_dict)
            sourceEncodings = np.vstack(sourceEncodings)
            distances = np.dot(sourceEncodings, targetEncodings.T)
            rankedScore, rankedIdx = data_utils.getSortedResults(distances)
            top_confs = rankedScore[0][:nbest]
            top_tgtIDs = [targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
            # 'tid' instead of 'id' — do not shadow the builtin.
            top_tgtNames = [targetIDNameMap[tid] for tid in top_tgtIDs]

            print('Top %s Prediction results are:\n' % nbest)
            for idx in range(nbest):
                print('top%d:  %s , %f ,  %s ' %
                      (idx + 1, top_tgtIDs[idx], top_confs[idx],
                       top_tgtNames[idx]))
            print("> ", end="")

            sys.stdout.flush()
            sentence = sys.stdin.readline()