def get_content(record):
    """Parse one UTF-8, tab-separated record into padded id sequences.

    The record must have exactly 4 tab-separated fields; fields 1 and 3 are
    the two text columns. Each text is split into characters, truncated or
    right-padded with the 'pad' token to config['max_sequence_length'], and
    mapped to ids via text_to_sequence.

    Returns:
        [fields[0], ids_a, fields[2], ids_b, fields[1], fields[3]] — the two
        original text fields are kept at the end for reference.

    Raises:
        ValueError: if the record does not have exactly 4 fields.
    """
    fields = record.decode('utf-8').strip().split("\t")
    if len(fields) != 4:
        raise ValueError("invalid record %s" % record)

    def _fit(chars):
        # Truncate to max length, or right-pad with the 'pad' token —
        # replaces the previously duplicated truncate/pad code paths.
        max_len = config['max_sequence_length']
        if len(chars) > max_len:
            return chars[:max_len]
        return chars + ['pad'] * (max_len - len(chars))

    text_a = _fit(list(fields[1]))
    text_b = _fit(list(fields[3]))
    return [
        fields[0],
        text_to_sequence(text_a, vocab), fields[2],
        text_to_sequence(text_b, vocab), fields[1], fields[3]
    ]
Example #2
0
def model_predict(model_path, texts, predict_result_path, word_index):
    """Score tab-separated review lines with a saved Keras model and write a TSV report.

    Each input line must be "reviewid<TAB>review<TAB>shoptype"; malformed
    lines are skipped. Results go to predict_result_path with one header row.
    """
    review_ids = []
    shop_types = []
    review_texts = []
    for raw in texts:
        parts = raw.strip().split('\t')
        if len(parts) != 3:
            continue  # skip lines that do not have exactly 3 columns
        review_ids.append(parts[0])
        review_texts.append(parts[1])
        shop_types.append(parts[2])

    # Encode and post-pad every review to a fixed length of 500 tokens.
    encoded = [text_to_sequence(text, word_index) for text in review_texts]
    encoded = tf.keras.preprocessing.sequence.pad_sequences(
        encoded, value=word_index['[PAD]'], padding='post', maxlen=500)

    model = tf.keras.models.load_model(model_path)
    scores = model.predict(encoded)
    labels = np.argmax(scores, axis=1)

    with tf.gfile.GFile(predict_result_path, 'w') as writer:
        writer.write("reviewid\tshoptype\tlabel\tresult\treview\n")
        for rid, shop, lab, score, review in zip(
                review_ids, shop_types, labels, scores, review_texts):
            writer.write("%s\t%s\t%s\t%s\t%s\n" %
                         (rid, shop, str(lab), str(score), review))
Example #3
0
def model_predict2(model_path, texts, word_index):
    """Predict class labels for "reviewid<TAB>review" lines.

    Malformed lines (not exactly 2 columns) are skipped. Returns the argmax
    label per remaining review as a numpy array.
    """
    ids, bodies = [], []
    for raw in texts:
        cols = raw.strip().split('\t')
        if len(cols) != 2:
            continue  # drop malformed lines
        ids.append(cols[0])
        bodies.append(cols[1])

    # Encode then post-pad to a fixed 500-token window.
    sequences = tf.keras.preprocessing.sequence.pad_sequences(
        [text_to_sequence(body, word_index) for body in bodies],
        value=word_index['[PAD]'], padding='post', maxlen=500)

    model = tf.keras.models.load_model(model_path)
    return np.argmax(model.predict(sequences), axis=1)
Example #4
0
def predict():
    """Run the BiLSTM estimator over FLAGS.predict_path and dump results to /tmp/1.csv.

    Fixes: the original used the Python-2-only `print` statement and `xrange`,
    which fail to parse/run on Python 3; both are replaced with forms valid on
    either version. Large blocks of commented-out dead code were removed.

    NOTE(review): `text.decode('utf-8')` assumes the CSV column yields byte
    strings (Python 2 pandas behavior) — confirm under Python 3, where str
    has no .decode.
    """
    word_index = load_vocab_ids(FLAGS.word_path)

    data = pd.read_csv(FLAGS.predict_path, header=None)
    texts = data.values[:, 0]

    text_id = [text_to_sequence(text.decode('utf-8'), word_index) for text in texts]
    # Post-pad every sequence to a fixed length of 15 using the 'pad' id.
    text_id = tf.keras.preprocessing.sequence.pad_sequences(
        text_id, value=word_index['pad'], padding='post', maxlen=15)
    print(text_id[:10])

    # Dummy labels: predict_input_fn expects (features, labels) pairs even
    # though labels are unused at inference time.
    label = np.zeros((len(texts), 2)).astype(np.int64)

    config = BaseConfig.from_json_file(FLAGS.model_config_path).to_dict()
    keras_model = BilstmModel(config).create_model()
    keras_model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
                        loss='binary_crossentropy', metrics=['accuracy'])
    estimator = tf.keras.estimator.model_to_estimator(keras_model=keras_model)

    inference = estimator.predict(
        input_fn=lambda: predict_input_fn(text_id, label),
        checkpoint_path=FLAGS.model_path)

    # NOTE(review): emitting 0 when the 'dense' score exceeds 0.5 looks
    # inverted relative to the usual convention — confirm intent before reuse.
    result = [0 if ele['dense'][0] > 0.5 else 1 for ele in inference]
    with tf.gfile.GFile("/tmp/1.csv", 'w') as writer:
        for i in range(len(result)):
            writer.write("%s\t%d\n" % (texts[i], result[i]))
Example #5
0
def text_to_ids(text, words):
    """Encode *text* via the `words` vocabulary into a fixed-length (50) id vector.

    Shorter sequences are post-padded with the '[PAD]' id; longer ones are
    truncated by pad_sequences.
    """
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        [text_to_sequence(text, words)],
        value=words['[PAD]'],
        padding='post',
        maxlen=50)
    return padded[0]