def get_content(record):
    """Parse one UTF-8, tab-separated record into model-ready fields.

    The record must have exactly 4 tab-separated fields; fields[1] and
    fields[3] are treated as character sequences, truncated/padded to
    config['max_sequence_length'], then mapped to ids via the module-level
    vocab.

    Args:
        record: a bytes object holding a UTF-8 line with 4 tab-separated
            fields.

    Returns:
        [fields[0], ids_a, fields[2], ids_b, raw_text_a, raw_text_b]

    Raises:
        ValueError: if the record does not split into exactly 4 fields.
    """
    fields = record.decode('utf-8').strip().split("\t")
    if len(fields) != 4:
        raise ValueError("invalid record %s" % record)
    max_len = config['max_sequence_length']
    # Character-level tokenization; both texts share the same length policy.
    text_a = _fit_to_length(list(fields[1]), max_len)
    text_b = _fit_to_length(list(fields[3]), max_len)
    return [
        fields[0], text_to_sequence(text_a, vocab),
        fields[2], text_to_sequence(text_b, vocab),
        fields[1], fields[3]
    ]


def _fit_to_length(tokens, max_len):
    # Truncate to max_len, or right-pad with the literal token 'pad'
    # (this must match the padding token present in the vocab).
    if len(tokens) > max_len:
        return tokens[:max_len]
    return tokens + ['pad'] * (max_len - len(tokens))
def model_predict(model_path, texts, predict_result_path, word_index):
    """Score tab-separated review lines with a saved Keras model and write
    a TSV report to predict_result_path.

    Each input line must split into exactly 3 tab-separated fields
    (reviewid, review, shoptype); malformed lines are skipped. Sequences
    are right-padded/truncated to 500 ids before prediction.
    """
    rows = [line.strip().split('\t') for line in texts]
    rows = [fields for fields in rows if len(fields) == 3]  # drop malformed lines
    reviewids = [fields[0] for fields in rows]
    reviews = [fields[1] for fields in rows]
    shoptypes = [fields[2] for fields in rows]

    sequences = [text_to_sequence(review, word_index) for review in reviews]
    review_ids = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, value=word_index['[PAD]'], padding='post', maxlen=500)

    model = tf.keras.models.load_model(model_path)
    result = model.predict(review_ids)
    label = np.argmax(result, axis=1)  # predicted class per review

    with tf.gfile.GFile(predict_result_path, 'w') as writer:
        writer.write("reviewid\tshoptype\tlabel\tresult\treview\n")
        for rid, stype, lab, res, rev in zip(reviewids, shoptypes, label,
                                             result, reviews):
            writer.write("%s\t%s\t%s\t%s\t%s\n" %
                         (rid, stype, str(lab), str(res), rev))
def model_predict2(model_path, texts, word_index):
    """Score tab-separated lines (reviewid, review) with a saved Keras
    model and return the argmax label per line.

    Lines that do not split into exactly 2 tab-separated fields are
    skipped. Sequences are right-padded/truncated to 500 ids.
    """
    rows = [line.strip().split('\t') for line in texts]
    rows = [fields for fields in rows if len(fields) == 2]  # drop malformed lines
    reviewids = [fields[0] for fields in rows]
    reviews = [fields[1] for fields in rows]

    sequences = [text_to_sequence(review, word_index) for review in reviews]
    review_ids = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, value=word_index['[PAD]'], padding='post', maxlen=500)

    model = tf.keras.models.load_model(model_path)
    result = model.predict(review_ids)
    return np.argmax(result, axis=1)
def predict():
    """Batch-predict labels for texts in FLAGS.predict_path using a
    BiLSTM Keras model wrapped as a tf.Estimator, writing results to
    /tmp/1.csv as "text<TAB>label" lines.

    NOTE(review): output path "/tmp/1.csv" is hard-coded — consider a FLAG.
    """
    word_index = load_vocab_ids(FLAGS.word_path)
    # header=None: the CSV has no header row; column 0 holds the raw text.
    data = pd.read_csv(FLAGS.predict_path, header=None)
    # shopid = data.values[:, 0]
    # textid = data.values[:, 1]
    texts = data.values[:, 0]
    # NOTE(review): assumes cell values are bytes needing UTF-8 decode — confirm
    # against how the CSV is produced.
    text_id = [text_to_sequence(text.decode('utf-8'), word_index) for text in texts]
    # Right-pad/truncate every sequence to 15 ids using the 'pad' token id.
    text_id = tf.keras.preprocessing.sequence.pad_sequences(text_id, value=word_index['pad'], padding='post', maxlen=15)
    print text_id[:10]
    # Dummy labels: predict_input_fn presumably requires a label tensor even
    # at inference time — values are never used.
    label = np.zeros((len(texts), 2)).astype(np.int64)
    # a = serialize_example(text_id[0], label[0])
    # print tf.train.Example.FromString(a)
    # iterator = dataset.make_one_shot_iterator()
    # next = iterator.get_next()
    # with tf.Session() as sess:
    #     while True:
    #         try:
    #             x, y = sess.run(next)
    #             print
    #         except tf.errors.OutOfRangeError:
    #             break
    #
    config = BaseConfig.from_json_file(FLAGS.model_config_path).to_dict()
    # Rebuild the Keras model, then wrap it as an Estimator so prediction can
    # restore weights from FLAGS.model_path checkpoints.
    model = BilstmModel(config)
    model = model.create_model()
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    model = tf.keras.estimator.model_to_estimator(keras_model=model)
    # tf.estimator.Estimator.predict()
    inference = model.predict(input_fn=lambda: predict_input_fn(text_id, label), checkpoint_path=FLAGS.model_path)
    # NOTE(review): mapping 0 when ele['dense'][0] > 0.5 looks inverted for a
    # sigmoid output — verify the intended label convention against training.
    result = [0 if ele['dense'][0] > 0.5 else 1 for ele in inference]
    with tf.gfile.GFile("/tmp/1.csv", 'w') as writer:
        for i in xrange(len(result)):
            writer.write("%s\t%d\n" % (texts[i], result[i]))
def text_to_ids(text, words):
    """Map text to a fixed-length id vector.

    The sequence is right-padded with the '[PAD]' id (or truncated) to
    exactly 50 entries.
    """
    sequence = text_to_sequence(text, words)
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        [sequence], value=words['[PAD]'], padding='post', maxlen=50)
    return padded[0]