Example #1
def predict():

    predict_base_path = "../data/review_relation/predict/content_relation.csv"

    food_reviews = []
    liren_reviews = []
    jiudian_reviews = []
    yule_reviews = []
    gouwu_reviews = []

    with tf.gfile.GFile(predict_base_path, 'r') as reader:
        for line in reader:
            fields = line.strip().split('\t')
            # Route each review by its category field:
            # 美食 = food, 丽人 = beauty, 酒店 = hotel, 休娱 = leisure/entertainment,
            # everything else goes to 购物 (shopping).
            if fields[2] == "美食":
                food_reviews.append(line)
            elif fields[2] == "丽人":
                liren_reviews.append(line)
            elif fields[2] == "酒店":
                jiudian_reviews.append(line)
            elif fields[2] == "休娱":
                yule_reviews.append(line)
            else:
                gouwu_reviews.append(line)

    words_path = "../data/review_relation/bert_words.csv"
    word_index = load_vocab_ids(words_path, sep='\t')

    model_base_path = "../data/review_relation/version2/"
    predict_result_base_path = "../data/review_relation/predict_result/"
    model_predict(
        os.path.join(model_base_path, 'food2.h5'), food_reviews,
        os.path.join(predict_result_base_path, 'food_predict_result2.csv'),
        word_index)
    model_predict(
        os.path.join(model_base_path, 'jiudian.h5'), jiudian_reviews,
        os.path.join(predict_result_base_path, 'jiudian_predict_result2.csv'),
        word_index)
    model_predict(
        os.path.join(model_base_path, 'liren.h5'), liren_reviews,
        os.path.join(predict_result_base_path, 'liren_predict_result2.csv'),
        word_index)
    model_predict(
        os.path.join(model_base_path, 'yule.h5'), yule_reviews,
        os.path.join(predict_result_base_path, 'yule_predict_result2.csv'),
        word_index)
    model_predict(
        os.path.join(model_base_path, 'gouwu.h5'), gouwu_reviews,
        os.path.join(predict_result_base_path, 'gouwu_predict_result2.csv'),
        word_index)
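
load_vocab_ids is used throughout these examples but is not shown in the listing. A minimal sketch of what it might look like, assuming the vocabulary file holds one token per line, optionally followed by an id column, and that the helper returns a dict mapping token to integer id (all assumptions):

import tensorflow as tf


def load_vocab_ids(path, sep='\t'):
    # Hypothetical sketch, not the original helper: read one token per line
    # and map it to the id in the second column, or to the line number when
    # no id column is present.
    word_index = {}
    with tf.gfile.GFile(path, 'r') as reader:
        for i, line in enumerate(reader):
            fields = line.strip().split(sep)
            word_index[fields[0]] = int(fields[1]) if len(fields) > 1 else i
    return word_index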
Example #2
def predict():
    word_index = load_vocab_ids(FLAGS.word_path)

    data = pd.read_csv(FLAGS.predict_path, header=None)
    # shopid = data.values[:, 0]
    # textid = data.values[:, 1]
    texts = data.values[:, 0]

    text_id = [text_to_sequence(text.decode('utf-8'), word_index) for text in texts]

    text_id = tf.keras.preprocessing.sequence.pad_sequences(text_id, value=word_index['pad'], padding='post',
                                                            maxlen=15)
    print(text_id[:10])
    label = np.zeros((len(texts), 2)).astype(np.int64)
    # a = serialize_example(text_id[0], label[0])
    # print tf.train.Example.FromString(a)



    # iterator = dataset.make_one_shot_iterator()
    # next = iterator.get_next()
    # with tf.Session() as sess:
    #     while True:
    #         try:
    #             x, y = sess.run(next)
    #             print
    #         except tf.errors.OutOfRangeError:
    #             break
    #
    config = BaseConfig.from_json_file(FLAGS.model_config_path).to_dict()
    model = BilstmModel(config)
    model = model.create_model()
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.001), loss='binary_crossentropy',
                  metrics=['accuracy'])
    model = tf.keras.estimator.model_to_estimator(keras_model=model)

    # tf.estimator.Estimator.predict()
    inference = model.predict(
        input_fn=lambda: predict_input_fn(text_id, label),
        checkpoint_path=FLAGS.model_path)

    result = [0 if ele['dense'][0] > 0.5 else 1 for ele in inference]
    with tf.gfile.GFile("/tmp/1.csv", 'w') as writer:
        for i in xrange(len(result)):
            writer.write("%s\t%d\n" % (texts[i], result[i]))
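
predict_input_fn is also external to the snippet. A minimal sketch, assuming the Keras input layer of BilstmModel is named 'input_1' (a guess, not taken from the source); model_to_estimator expects the input_fn to yield feature dicts keyed by the model's input names:

import tensorflow as tf


def predict_input_fn(text_id, label, batch_size=100):
    # Hypothetical sketch: wrap the padded id matrix and dummy labels in a
    # batched tf.data.Dataset; the feature key 'input_1' is an assumption.
    dataset = tf.data.Dataset.from_tensor_slices(({'input_1': text_id}, label))
    return dataset.batch(batch_size)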
Example #3
def predict(text):
    words_path = '../data/review_relation/bert_words.csv'

    words = load_vocab_ids(words_path, sep='\t')
    wordIds = text_to_ids(text, words)
    signature_key = "xiaoxiang"
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True

    with tf.Session(graph=tf.Graph(), config=sess_config) as sess:
        meta_graph_def = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], "/tmp/saved_model/1")
        signature = meta_graph_def.signature_def
        text = signature[signature_key].inputs["text"].name
        outputs = signature[signature_key].outputs["label"].name
        text = sess.graph.get_tensor_by_name(text)
        outputs = sess.graph.get_tensor_by_name(outputs)

        print(sess.run([outputs], feed_dict={text: [wordIds]}))
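
text_to_ids is not part of the listing either. A sketch of one plausible implementation, assuming character-level ids, an '[UNK]' fallback, and padding with a 'pad' token to a fixed length (all assumptions):

def text_to_ids(text, word_index, max_len=50):
    # Hypothetical sketch: map each character to its id, truncate, and pad
    # to max_len; token granularity and the '[UNK]'/'pad' names are guesses.
    ids = [word_index.get(ch, word_index.get('[UNK]', 0)) for ch in text]
    ids = ids[:max_len]
    return ids + [word_index.get('pad', 0)] * (max_len - len(ids))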
Example #4
def predict2():
    predict_base_path = "../data/review_relation/predict/content_relation.csv"
    reviews = []
    contentids = []
    reviewlist = []
    with tf.gfile.GFile(predict_base_path, 'r') as reader:
        for line in reader:
            fields = line.strip().split("\t")
            if len(fields) != 2:
                continue
            contentids.append(fields[0])
            reviewlist.append(fields[1])
            reviews.append(line.strip())

    words_path = "../data/review_relation/bert_words.csv"
    word_index = load_vocab_ids(words_path, sep='\t')
    model_base_path = "../data/review_relation/version2/"
    predict_result_base_path = "../data/review_relation/content_pic_relation/content_relation_res.csv"
    food_label = model_predict2(os.path.join(model_base_path, 'food2.h5'),
                                reviews, word_index)
    jiudian_label = model_predict2(os.path.join(model_base_path, 'jiudian.h5'),
                                   reviews, word_index)
    liren_label = model_predict2(os.path.join(model_base_path, 'liren.h5'),
                                 reviews, word_index)
    yule_label = model_predict2(os.path.join(model_base_path, 'yule.h5'),
                                reviews, word_index)
    gouwu_label = model_predict2(os.path.join(model_base_path, 'gouwu.h5'),
                                 reviews, word_index)

    with tf.gfile.GFile(predict_result_base_path, 'w') as writer:
        writer.write(
            "contentid\tfood\tjiudian\tliren\tyule\tgouwu\tcontentbody\n")
        for i in xrange(len(contentids)):
            writer.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                         (contentids[i], str(food_label[i]),
                          str(jiudian_label[i]), str(liren_label[i]),
                          str(yule_label[i]), str(gouwu_label[i]), reviews[i]))
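
model_predict2 is the per-category scorer used above but is not shown. A minimal sketch, assuming each review line is "contentid<TAB>text", that text_to_ids (sketched earlier) produces fixed-length ids, and that each .h5 model outputs a two-way softmax:

import numpy as np
import tensorflow as tf


def model_predict2(model_path, reviews, word_index, max_len=50):
    # Hypothetical sketch: load one category model and return an argmax
    # label per review; the field layout and output shape are assumptions.
    texts = [line.split('\t')[1] for line in reviews]
    data = np.array([text_to_ids(t, word_index, max_len) for t in texts])
    model = tf.keras.models.load_model(model_path)
    return np.argmax(model.predict(data, batch_size=128), axis=1)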
def input_fn(path, vocab, config, batch_size):
    dataset = tf.data.TextLineDataset(path)

    dataset = dataset.map(lambda line: parse_line(line, vocab, config),
                          num_parallel_calls=4)

    return dataset.batch(batch_size=batch_size)


if __name__ == "__main__":
    config = BaseConfig.from_json_file(
        "../data/dish/dish_similarity/config.json").to_dict()
    path = "/tmp/valid.csv"
    word_path = "../data/dish/dish_similarity/words.csv"
    word_index = load_vocab_ids(word_path)
    signature_key = 'predict_label'
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True

    with tf.Session(graph=tf.Graph(), config=sess_config) as sess:
        dataset = input_fn(path, word_index, config, 10)
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        meta_graph_def = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING],
            "../data/dish/dish_similarity/saved_model/4")
        signature = meta_graph_def.signature_def
        text_a = signature[signature_key].inputs["text_a"].name
        text_b = signature[signature_key].inputs["text_b"].name
Example #6
        padding='pre',
        maxlen=max_len)
    tag_list = tf.keras.preprocessing.sequence.pad_sequences(
        tag_list,
        value=word_index.get('[PAD]'.encode('utf-8')),
        padding='pre',
        maxlen=max_len)

    return feature_list, tag_list


if __name__ == '__main__':

    # Process the training data
    words_path = "/Users/lionel/Desktop/data/review_relation/bert_words.csv"
    word_index = load_vocab_ids(words_path, sep='\t')
    path = '/Users/lionel/Downloads/review_dish.csv'

    config = {
        'batch_size': 100,
        'max_length': 100,
        'vocab_size': 21128,
        'embedding_size': 100,
        'units': 100,
        'num_tags': 3
    }

    feature_list, tag_list = process_data(path, word_index,
                                          config['max_length'])
    # index = feature_list.shape[0] // config['batch_size'] * config['batch_size']
    #
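
The snippet ends before the padded arrays are consumed. One possible continuation, sketched here purely as an assumption, mirrors the commented-out index computation above: drop the ragged tail so every batch is full, then build a batched dataset.

import tensorflow as tf


def make_dataset(feature_list, tag_list, batch_size):
    # Hypothetical continuation: keep only full batches, then shuffle and
    # batch the padded feature/tag arrays for training.
    index = feature_list.shape[0] // batch_size * batch_size
    dataset = tf.data.Dataset.from_tensor_slices(
        (feature_list[:index], tag_list[:index]))
    return dataset.shuffle(buffer_size=1000).batch(batch_size)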
Example #7
def train():
    train_path = '../train.csv'
    valid_path = '../valid.csv'

    words_path = '../data/review_relation/bert_words.csv'

    words = load_vocab_ids(words_path, sep='\t')

    train_text = []
    train_label = []

    with tf.gfile.GFile(train_path, 'r') as reader:
        for line in reader:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                continue
            train_text.append(fields[0])
            train_label.append(int(fields[1]))

    valid_text = []
    valid_label = []

    with tf.gfile.GFile(valid_path, 'r') as reader:
        for line in reader:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                continue
            valid_text.append(fields[0])
            valid_label.append(int(fields[1]))

    train_data = [text_to_ids(ele, words) for ele in train_text]
    train_label = tf.keras.utils.to_categorical(train_label, 2)

    valid_data = [text_to_ids(ele, words) for ele in valid_text]
    valid_label = tf.keras.utils.to_categorical(valid_label, 2)

    import numpy as np
    train_data = np.array(train_data)
    train_label = np.array(train_label)
    valid_data = np.array(valid_data)
    valid_label = np.array(valid_label)

    print(train_data.shape)

    config = {
        'max_sequence_length': 50,
        'vocab_size': 25000,
        'embedding_size': 200,
        'hidden_size': 100,
        'drop_out': 0.2,
        'num_classes': 2,
        'epoch': 10,
        'batch_size': 100,
        'model_path': '/tmp/100'
    }

    model = LstmModel(config).create_model()
    model.summary()

    model.compile(loss=tf.keras.losses.categorical_crossentropy,
                  optimizer=tf.keras.optimizers.Adam(lr=0.01),
                  metrics=['accuracy'])

    model.fit(x=train_data,
              y=train_label,
              epochs=config['epoch'],
              batch_size=config['batch_size'],
              validation_data=(valid_data, valid_label),
              verbose=1)
    model.save(config['model_path'])
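
LstmModel is defined elsewhere. A sketch of what a model with these config keys could look like, offered only as an assumption (Embedding, LSTM, Dropout, then a softmax Dense layer); the real class may differ:

import tensorflow as tf


class LstmModel(object):
    # Hypothetical sketch built only from the config keys visible above;
    # the actual class is not part of this listing.
    def __init__(self, config):
        self.config = config

    def create_model(self):
        inputs = tf.keras.layers.Input(
            shape=(self.config['max_sequence_length'],), dtype='int32')
        x = tf.keras.layers.Embedding(
            self.config['vocab_size'], self.config['embedding_size'])(inputs)
        x = tf.keras.layers.LSTM(self.config['hidden_size'])(x)
        x = tf.keras.layers.Dropout(self.config['drop_out'])(x)
        outputs = tf.keras.layers.Dense(
            self.config['num_classes'], activation='softmax')(x)
        return tf.keras.Model(inputs=inputs, outputs=outputs)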
Example #8
def train(config):
    """
    Train the model.
    :param config: hyperparameter configuration
    :return: prediction results
    """
    word_index = load_vocab_ids(os.path.join(FLAGS.data_dir, 'words.csv'))
    if not tf.gfile.Exists(os.path.join(FLAGS.data_dir, 'model')):
        tf.gfile.MkDir(os.path.join(FLAGS.data_dir, 'model'))
    model_path = os.path.join(FLAGS.data_dir, 'model/%s' % FLAGS.model_name)

    train_examples = None
    valid_examples = None
    test_examples = None

    if FLAGS.do_train and FLAGS.do_valid:
        if FLAGS.input_format == 1:
            train_examples = OneInputDataProcessor().get_train_examples(
                FLAGS.data_dir)
            valid_examples = OneInputDataProcessor().get_valid_examples(
                FLAGS.data_dir)

        if FLAGS.input_format == 2:
            train_examples = TwoInputDataProcessor().get_train_examples(
                FLAGS.data_dir)
            valid_examples = TwoInputDataProcessor().get_valid_examples(
                FLAGS.data_dir)

        train_text_a, train_text_b, train_label_ids = features_labels_digitalize(
            train_examples, word_index, config['max_sequence_length'])
        valid_text_a, valid_text_b, valid_label_ids = features_labels_digitalize(
            valid_examples, word_index, config['max_sequence_length'])

        model = None
        if FLAGS.input_format == 2:
            model = text_similarity_model.BilstmModel(config,
                                                      merge_mode='multiply')
        if FLAGS.input_format == 1:
            model = text_classification_model.BilstmModel(config)
        model = model.create_model()

        model.compile(
            optimizer=tf.keras.optimizers.Adam(lr=config['learning_rate']),
            loss='binary_crossentropy',
            metrics=['accuracy'])

        start = time.time()
        if train_text_b is not None:
            model.fit(x=[train_text_a, train_text_b],
                      y=train_label_ids,
                      epochs=config['epoch'],
                      batch_size=config['batch_size'],
                      validation_data=([valid_text_a,
                                        valid_text_b], valid_label_ids),
                      callbacks=[keras.callbacks.EarlyStopping(patience=2)])
        else:
            model.fit(x=train_text_a,
                      y=train_label_ids,
                      epochs=config['epoch'],
                      batch_size=config['batch_size'],
                      validation_data=(valid_text_a, valid_label_ids),
                      callbacks=[keras.callbacks.EarlyStopping(patience=2)])

        end = time.time()
        tf.logging.info("Train time is %ds", end - start)

        model.save(model_path, overwrite=True)

        if FLAGS.do_test:
            if FLAGS.input_format == 1:
                test_examples = OneInputDataProcessor().get_test_examples(
                    FLAGS.data_dir)
            if FLAGS.input_format == 2:
                test_examples = TwoInputDataProcessor().get_test_examples(
                    FLAGS.data_dir)
            test_text_a, test_text_b, test_label_ids = features_labels_digitalize(
                test_examples, word_index, config['max_sequence_length'])
            model = keras.models.load_model(model_path)
            if FLAGS.do_statistic:
                if test_text_b is not None:
                    result = model.predict([test_text_a, test_text_b])
                else:
                    result = model.predict(test_text_a)
                print(metrics.classification_report(
                    np.argmax(test_label_ids, axis=1),
                    np.argmax(result, axis=1)))
            else:
                count = len(test_text_a)
                n = count // config['batch_size'] + 1
                with tf.gfile.GFile(
                        os.path.join(FLAGS.data_dir, 'predict.csv'), 'w') as f:
                    for i in xrange(n):
                        if (i + 1) * config['batch_size'] >= count:
                            x_a = test_text_a[i * config['batch_size']:]
                            x_b = test_text_b[i * config['batch_size']:]
                        else:
                            x_a = test_text_a[i *
                                              config['batch_size']:(i + 1) *
                                              config['batch_size']]
                            x_b = test_text_b[i *
                                              config['batch_size']:(i + 1) *
                                              config['batch_size']]

                        predictions = model.predict_on_batch([x_a, x_b])
                        result = np.argmax(predictions, axis=1)

                        for j in xrange(len(result)):
                            index = i * config['batch_size'] + j
                            f.write('%s\t%s\t%s\n' %
                                    (test_examples[index].text_a,
                                     test_examples[index].text_b, result[j]))

        if FLAGS.do_export:
            model = keras.models.load_model(model_path)
            features = dict()
            x = model.input
            y = model.output
            args = FLAGS.params
            if isinstance(x, list):
                for i in range(len(x)):
                    features[args[i]] = x[i]
            else:
                features[args[0]] = x

            labels = dict()
            if isinstance(y, list):
                for i in range(len(y)):
                    labels[args[len(features) + i]] = y[i]
            else:
                labels[args[len(features)]] = y

            sess = tf.keras.backend.get_session()

            prediction_signature = tf.saved_model.signature_def_utils.predict_signature_def(
                inputs=features, outputs=labels)

            valid_prediction_signature = tf.saved_model.signature_def_utils.is_valid_signature(
                prediction_signature)
            if not valid_prediction_signature:
                raise ValueError("Error: Prediction signature not valid!")

            saved_model_path = os.path.join(
                FLAGS.data_dir, 'saved_model/%d' % (FLAGS.saved_model_version))
            if tf.gfile.Exists(saved_model_path):
                tf.gfile.DeleteRecursively(saved_model_path)
            builder = tf.saved_model.builder.SavedModelBuilder(
                saved_model_path)
            legacy_init_op = tf.group(tf.tables_initializer(),
                                      name='legacy_init_op')
            builder.add_meta_graph_and_variables(
                sess, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={FLAGS.signature_name: prediction_signature},
                legacy_init_op=legacy_init_op)
            builder.save()
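
features_labels_digitalize is the remaining external helper. A minimal sketch, assuming each example carries text_a, an optional text_b and an integer label, that text_to_sequence maps a text to token ids, and that the pad token is named 'pad' (all assumptions, not taken from the source):

import tensorflow as tf


def features_labels_digitalize(examples, word_index, max_sequence_length):
    # Hypothetical sketch: digitize text_a (and text_b when present) into
    # padded id matrices and one-hot encode the binary labels.
    def digitize(texts):
        ids = [text_to_sequence(t, word_index) for t in texts]
        return tf.keras.preprocessing.sequence.pad_sequences(
            ids, value=word_index.get('pad', 0), padding='post',
            maxlen=max_sequence_length)

    text_a = digitize([ex.text_a for ex in examples])
    text_b = None
    if examples and examples[0].text_b is not None:
        text_b = digitize([ex.text_b for ex in examples])
    labels = tf.keras.utils.to_categorical(
        [int(ex.label) for ex in examples], 2)
    return text_a, text_b, labels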