def predict2(s):
    s = data_helpers.clean_str(s)
    s = s.split(" ")
    s = s + ["<PAD/>"] * (56 - len(s))
    for i, w in enumerate(s):
        if w not in vocabulary:
            s[i] = "<PAD/>"
    s = np.array([vocabulary[word] for word in s])
    s = [s]
    return sess.run(predictions, {input_x: s, dropout_keep_prob: 1.0})
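A minimal usage sketch for the function above; it assumes the surrounding script has already restored the graph and that `sess`, `predictions`, `input_x`, `dropout_keep_prob`, `vocabulary`, and `data_helpers` are in scope, with sequences padded to length 56 at training time:

# Hypothetical call; every name used inside predict2 must already be defined by the script.
result = predict2("the room was clean and the staff were friendly")
print(result)  # e.g. [1] for the positive class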
scores_op = graph.get_operation_by_name("output/scores").outputs[0]
predictions_op = graph.get_operation_by_name("output/predictions").outputs[0]

print("\nModel loaded\n")
print("Enter a hotel review as prompted; the system will classify it as positive or negative\n")
print("Notes:")
print("  1. The system can handle at most {:d} Chinese words; anything beyond that is ignored".format(max_num_words))
print("  2. Type exit() to quit\n")

while True:
    # Get user input
    input_raw = input("\nPlease enter your hotel review:\n")
    if input_raw == "exit()":
        break

    # Tokenize, clean, and transform
    input_cut = " ".join(jieba.cut(input_raw))
    input_cleaned = data_helpers.clean_str(input_cut)
    input_transformed = np.array(list(vocab_processor.transform([input_cleaned])))

    # Predict
    scores, predictions = sess.run((scores_op, predictions_op), {input_x: input_transformed})
    score_pos = scores[0][1]
    score_neg = scores[0][0]
    predictions_readable = "positive" if predictions[0] == 1 else "negative"
    print("\nPositive score: {:.2f}".format(score_pos))
    print("Negative score: {:.2f}".format(score_neg))
    print("Prediction: {}\n".format(predictions_readable))
def main(dataset, title, body):
    # Data Preparation
    # ==================================================
    path = os.path.join('transformed_data', dataset)
    # body = sys.argv[1]
    text = data_helpers.preprocess(title, body)
    x_text = [data_helpers.clean_str(text)]

    # Restore vocab file
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(
        os.path.join(path, 'vocab'))
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    # print(x)
    # print(x.shape)

    tags_df = pd.read_csv(os.path.join('community-data', dataset, '50', 'tags_df.csv'),
                          encoding='utf8', index_col=0)
    tag_list = tags_df['TagName'].tolist()

    # Training
    # ==================================================
    with tf.Graph().as_default():
        session_conf = tf.compat.v1.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement,
            intra_op_parallelism_threads=3,
            inter_op_parallelism_threads=3)
        sess = tf.compat.v1.Session(config=session_conf)
        with sess.as_default():
            rcnn = RCNN(num_classes=len(tag_list),
                        vocab_size=len(vocab_processor.vocabulary_),
                        embedding_size=FLAGS.embedding_dim,
                        hidden_units=100,
                        context_size=50,
                        max_sequence_length=x.shape[1])
                        # l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.compat.v1.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(rcnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint directory.
            checkpoint_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", dataset))
            saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())

            # Loading checkpoint
            save_path = os.path.join(checkpoint_dir, "model")
            saver.restore(sess, save_path)

            # predict
            sequence_length = [len(sample) for sample in x]
            feed_dict = {
                rcnn.X: x,
                rcnn.sequence_length: sequence_length,
                # rcnn.max_sequence_length: max_sequence_length,
                rcnn.dropout_keep_prob: 1.0
            }
            prediction = sess.run([rcnn.predictions], feed_dict)[0][0]
            # print(prediction)
            idx = prediction.argsort()[-5:][::-1]
            # print(idx)
            tags = [tag_list[i] for i in idx]
            print("\n||| Text |||\n")
            print(x_text[0])
            print("\n||| Predicted tags |||\n")
            print(tags)
def predict_Problem(loaded_model, vocabulary, sentence):
    print("sentence before", sentence)
    sentence = sentence.split(" ")
    # print(json.dumps(vocabulary, indent=4))
    sentence = map(lambda x: data_helpers.clean_str(x), sentence)
    print("sentence after", str(sentence))
    sequence_length = 49

    # Check whether each word is in the dictionary; if so, append its index to the list
    startT = timeit.default_timer()
    pred = []
    for word in sentence:
        print("Predicting", word)
        if not word:
            continue
        x = vocabulary.get(word)
        if x != None:
            pred.append(x)
        else:
            # Check whether w2v can find a similar word and whether that word is in the dictionary
            print('Word {} is not in dict'.format(word))
            # w2vList = w2v.wv.most_similar(word)
            try:
                w2vList = w2v.most_similar(word)
            except Exception:
                pred.append(0)
                continue
            for w in w2vList:
                w = w[0]
                print(w)
                print('similarity: {} in dict: {}'.format(w, vocabulary.get(w)))
                if vocabulary.get(w) != None:
                    pred.append(vocabulary.get(w))
                    break
    stopT = timeit.default_timer()
    print('Prepare duration: {}'.format(stopT - startT), "Result:\n", pred)
    # Insert <PAD/> if the word is not in the dict?
    # else:
    #     pred.append(0)
    # print(str(word) + " " + str(x))
    # print("test1")
    # print(json.dumps(pred, indent=4))

    # Pad the list just built with 0 --> <PAD/> up to sequence_length
    empty_spaces = sequence_length - len(pred)
    for x in range(0, empty_spaces):
        pred.append(0)
    # print("test2")
    # print(json.dumps(pred, indent=4))

    # Convert the list to a NumPy array
    pred = np.array(pred)
    pred = pred[None, :]
    # print("list to NumPy", json.dumps(pred, indent=4))
    pred.T
    # print("test3")
    # print(pred)
    # print(json.dumps(pred, indent=4))
    print("Pred before: ")
    startT = timeit.default_timer()
    prediction = loaded_model.predict(pred, verbose=1)
    stopT = timeit.default_timer()
    print('Prediction duration: {}'.format(stopT - startT))
    # print("Pred#: ")
    # print(prediction)
    return [prediction, sequence_length, pred]
def classify(text):
    # Load data. Load your own data here
    sentenses = [
        text,
    ]

    # Map data into vocabulary
    vocabulary = data_helpers.get_vocabulary()
    print "vocab:"
    print len(vocabulary)
    x_raw = [data_helpers.clean_str(sentense) for sentense in sentenses]
    print x_raw
    sentenses_digital = []
    for sentense in x_raw:
        sentense_digital = np.zeros(data_helpers.max_len)
        for i, cn_char in enumerate(sentense):
            if cn_char in vocabulary:
                sentense_digital[i] = vocabulary[cn_char]
        sentenses_digital.append(sentense_digital)
    x_test = np.array(sentenses_digital)
    print x_test

    # Evaluation
    # ==================================================
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {
                    input_x: x_test_batch,
                    dropout_keep_prob: 1.0
                })
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

    labels_list = data_helpers.get_labels_list()
    print all_predictions
    print labels_list
    all_pred_classifies = [
        labels_list[int(prediction)] for prediction in all_predictions
    ]
    print all_pred_classifies
    return all_pred_classifies[0]
saver = tf.train.Saver()
saver.restore(sess, checkpoint_file)

while(True):
    # read note from xml file
    xml = et.parse('topics2016.xml')
    root = xml.getroot()
    children = root.getchildren()
    for child in children:
        note = child[0].text
        input = note
        # remove stop words
        pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
        input = pattern.sub('', input)
        tokenzied_input = []
        wl = data_helpers.clean_str(input)
        count = 0
        for ite in wl:
            if count < FLAGS.note_words_limit:
                if ite in vocabulary:
                    tokenzied_input.append(vocabulary[ite])
                else:
                    tokenzied_input.append(0)
                count += 1
            else:
                break
        print tokenzied_input
        diff = FLAGS.note_words_limit - len(tokenzied_input)
        if(diff > 0):
            for i in range(diff):
                tokenzied_input.append(0)
def test_phrase(self, sess, lrp_manager, test_data, answer, vocab_processor, split_no, k):
    saver = tf.train.Saver(tf.global_variables())
    #model_path = "C:\\work\\Code\\PythonControversy\\src\\LRP\\runs\\1516167942\\checkpoints\\model-500"
    if split_no == 0:
        #model_path = PATH_SPLIT_0_CLEAN_DATA
        #model_path = get_model_path(1517013309, 150)
        model_path = get_model_path(1517198610, 700)
    elif split_no == 1:
        #model_path = PATH_SPLIT1
        #model_path = get_model_path(1517013721, 230)
        model_path = get_model_path(1517204236, 800)
    elif split_no == 2:
        #model_path = PATH_UNKNOWN
        #model_path = get_model_path(1517029400, 200)
        model_path = get_model_path(1517194730, 300)
    model_path = get_model_path(split_no, 700)
    saver.restore(sess, model_path)

    def transform(text):
        return list(vocab_processor.transform(text))

    links, list_test_text = zip(*test_data)
    list_test_text = list(
        [data_helpers.clean_str(x) for x in list_test_text])
    x_test = np.array(transform(list_test_text))
    y = np.array([[0, 1]] * len(list_test_text))
    list_test_text = list(vocab_processor.reverse(x_test))

    def get_precision(x, y):
        feed_dict = {
            lrp_manager.cnn.input_x: x,
            lrp_manager.cnn.input_y: y,
            lrp_manager.cnn.dropout_keep_prob: 1.0
        }
        [accuracy] = sess.run([
            lrp_manager.cnn.accuracy,
        ], feed_dict)
        return accuracy

    accuracy = get_precision(x_test, y)
    #print("Prediction accuracy : ", end="")
    #print(accuracy)

    def collect_correct_indice(sess, cnn, x, y):
        feed_dict = {
            cnn.input_x: x,
            cnn.input_y: y,
            cnn.dropout_keep_prob: 1.0
        }
        [prediction] = sess.run([
            cnn.predictions,
        ], feed_dict)
        indice = []
        y_idx = np.argmax(y, axis=1)
        for i, _ in enumerate(x):
            if prediction[i] == y_idx[i]:
                indice.append(i)
        return indice

    correct_indice = collect_correct_indice(sess, lrp_manager.cnn, x_test, y)
    feed_dict = {
        lrp_manager.cnn.input_x: x_test,
        lrp_manager.cnn.input_y: y,
        lrp_manager.cnn.dropout_keep_prob: 1.0
    }
    candidate_phrase = data_helpers.load_phrase(split_no)
    candidate_phrase = set(candidate_phrase)
    assert ("taxes" in candidate_phrase)
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    rt_pickle = "r_t{}.pickle".format(split_no)
    #if os.path.exists(rt_pickle):
    #    r_t = pickle.load(open(rt_pickle, "rb"))
    #else:
    r_t = lrp_manager.run(feed_dict)
    pickle.dump(r_t, open(rt_pickle, "wb"))
    count = FailCounter()
    rand_count = FailCounter()
    middle_scores = []
    for i, batch in enumerate(r_t):
        f_wrong = False
        if i not in correct_indice:
            f_wrong = True
            #continue
        if answer[i] is None:
            #print("No answer in text")
            continue
        #phrase_len = len(answer[i].split(" "))
        answer_str = " ".join([
            stemmer.stem(token) for token in answer[i].lower().split(" ")
        ])
        print("Correct: " + answer[i] + "({})".format(answer_str))
        text_tokens = list_test_text[i].split(" ")
        pos_tags = nltk.pos_tag(text_tokens)
        candidates = []

        def window_sum(window):
            bonus = max(window) * (len(window) - 1)
            return (sum(window) + bonus * 0.4) / len(window)

        phrase_len = len(answer[i].split(" "))
        for raw_index, value in np.ndenumerate(batch):
            for phrase_len in range(1, 3):
                index = raw_index[0]
                if index + phrase_len >= batch.shape[0]:
                    break
                assert (batch[index] == value)
                window = batch[index:index + phrase_len]
                is_noun = 'NN' in pos_tags[index + phrase_len - 1][1]
                s = sum(window)
                text = " ".join(text_tokens[index:index + phrase_len])
                #if text in candidate_phrase and is_noun:
                #if is_noun:
                candidates.append((s, index, index + phrase_len, window))
        candidates.sort(key=lambda x: x[0], reverse=True)
        ranking_dict = collections.Counter()
        info_dict = dict()
        middle_score = []
        for value, begin, end, window in candidates:
            #end = begin + phrase_len
            sys_answer = " ".join(
                [stemmer.stem(t) for t in text_tokens[begin:end]])
            ranking_dict[sys_answer] += value
            info_dict[sys_answer] = (begin, end, window)
            #sys_answer = " ".join([stemmer.stem(t) for t in text.split(" ")])
            middle_score.append((value, sys_answer))
        middle_scores.append(middle_score)
        match = False

        def get_text(begin, end):
            st = begin
            if begin < 0:
                st = 0
            return " ".join(text_tokens[st:end])

        #for (value, begin) in candidates[:k]:
        for key, value in ranking_dict.most_common(k):
            begin, end, window = info_dict[key]
            sys_answer = " ".join(
                [stemmer.stem(t) for t in text_tokens[begin:end]])
            sys_answer_no_stem = " ".join(text_tokens[begin:end])
            print("{}-{} /{}: {}[{}]{}".format(begin, end, value,
                                               get_text(begin - 3, begin),
                                               sys_answer_no_stem,
                                               get_text(end, end + 3)))
            #for idx in range(end - begin):
            #    print("{0:.2f} ".format(window[idx]))
            if sys_answer == answer_str:
                match = True
        if match:
            count.suc()
        else:
            count.fail()
        #print("")
        max_i = np.argmax(batch, axis=0)
        #print("Max : {} at {} ({})".format(batch[max_i], max_i, text_tokens[max_i]))
    pickle.dump(middle_scores, open("middle.score{}.pickle".format(split_no), "wb"))
    #print("Precision : {}".format(count.precision()))
    #print("Precision[Random] : {}".format(rand_count.precision()))
    return count.precision()
def train_and_phrase(self, sess, lrp_manager, test_data, answer, vocab_processor):
    split2_frame = "C:\work\Code\PythonControversy\src\LRP\\runs\\1516655846\checkpoints\model-{}"

    def transform(text):
        return list(vocab_processor.transform(text))

    links, list_test_text = zip(*test_data)
    list_test_text = list(
        [data_helpers.clean_str(x) for x in list_test_text])
    x_test = np.array(transform(list_test_text))
    y = np.array([[0, 1]] * len(list_test_text))
    list_test_text = list(vocab_processor.reverse(x_test))

    def get_precision(x, y):
        feed_dict = {
            lrp_manager.cnn.input_x: x,
            lrp_manager.cnn.input_y: y,
            lrp_manager.cnn.dropout_keep_prob: 1.0
        }
        [accuracy] = sess.run([
            lrp_manager.cnn.accuracy,
        ], feed_dict)
        return accuracy

    summary = []
    for progress in range(10, 150, 10):
        path = split2_frame.format(progress)
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(sess, path)
        feed_dict = {
            lrp_manager.cnn.input_x: x_test,
            lrp_manager.cnn.input_y: y,
            lrp_manager.cnn.dropout_keep_prob: 1.0
        }
        accuracy = get_precision(x_test, y)
        print("y/n Accuracy : ", end="")
        print(accuracy)

        def collect_correct_indice(sess, cnn, x, y):
            feed_dict = {
                cnn.input_x: x,
                cnn.input_y: y,
                cnn.dropout_keep_prob: 1.0
            }
            [prediction] = sess.run([
                cnn.predictions,
            ], feed_dict)
            indice = []
            y_idx = np.argmax(y, axis=1)
            for i, _ in enumerate(x):
                if prediction[i] == y_idx[i]:
                    indice.append(i)
            return indice

        correct_indice = collect_correct_indice(sess, lrp_manager.cnn, x_test, y)
        r_t = lrp_manager.run(feed_dict)
        count = FailCounter()
        k = 10  # candidates to select
        for i, batch in enumerate(r_t):
            if i not in correct_indice:
                continue
            if answer[i] is None:
                continue
            phrase_len = len(answer[i].split(" "))
            candidates = []
            for raw_index, value in np.ndenumerate(batch):
                index = raw_index[0]
                if index + phrase_len >= batch.shape[0]:
                    break
                assert (batch[index] == value)
                window = batch[index:index + phrase_len]
                s = sum(window)
                candidates.append((s, index))
            candidates.sort(key=lambda x: x[0], reverse=True)
            answer_tokens = set(answer[i].lower().split(" "))
            text_tokens = list_test_text[i].split(" ")
            match = False
            for (value, last_index) in candidates[:k]:
                end = last_index + 1
                begin = end - phrase_len
                sys_answer = text_tokens[begin:end]
                if set(sys_answer) == answer_tokens:
                    match = True
            if match:
                count.suc()
            else:
                count.fail()
        print("Precision : {}".format(count.precision()))
        summary.append((progress, accuracy, count.precision()))
    for progress, accuracy, precision in summary:
        print("{}\t{}\t{}\t".format(progress, accuracy, precision))
# only take the first 400 words to avoid 0s in the embedding
cut_review = 400

#---------------------- CALCULATE FREQUENT WORDS ------------------------------------#
train_pos_path = './data/aclImdb/train/pos/'
train_neg_path = './data/aclImdb/train/neg/'
test_pos_path = './data/aclImdb/test/pos/'
test_neg_path = './data/aclImdb/test/neg/'

train_positive_examples = load_data(train_pos_path)
train_negative_examples = load_data(train_neg_path)

# Clean reviews
train_reviews = train_positive_examples + train_negative_examples
train_reviews = [clean_str(sentence, cut_review) for sentence in train_reviews]

# Build the list of frequent words
word_list = [word for line in train_reviews for word in line.split()]
frequent_words_count = Counter(word_list).most_common(
    10000)  # generates a list of (word, word_count) tuples and keeps only the 10000 most common ones
frequent_words = [item[0] for item in frequent_words_count]
very_frequent_words_count = Counter(word_list).most_common(5000)
very_frequent_words = [item[0] for item in very_frequent_words_count]

#--------------------------------- update files with '<oov>' for other infrequent words ------------#
def replacebyoov(review):
    sequence = review.split()
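The body of `replacebyoov` is cut off above. Purely as a hedged sketch, not the original code, an out-of-vocabulary replacement along the lines the section comment describes might look like this, assuming `frequent_words` serves as the whitelist:

# Hypothetical completion, not the original implementation.
frequent_set = set(frequent_words)  # set membership is O(1); list membership is O(n)

def replace_by_oov(review, whitelist=frequent_set):
    """Replace every word that is not in the whitelist with the '<oov>' token."""
    sequence = review.split()
    return " ".join(word if word in whitelist else "<oov>" for word in sequence)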
print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") # CHANGE THIS: Load data. Load your own data here if FLAGS.eval_train: x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.training_data_file, FLAGS.class_index_file, True) test_sample_index = -1 * int( FLAGS.test_sample_percentage * float(len(y_test))) x_raw, y_test = x_raw[test_sample_index:], y_test[test_sample_index:] else: x_raw = [ data_helpers.clean_str( "Auto Repair;Oil Change Stations;Transmission Repair") ] y_test = [[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]] y_test[0][10] = 1 # Map data into vocabulary vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path) x_test = np.array(list(vocab_processor.transform(x_raw)))
    id = 1
    f = open("cat")
    cat = set([s.strip() for s in list(f.readlines())])
    for i in cat:
        catmap[i] = id
        id = id + 1
    return catmap

tree = ET.ElementTree(file="test.xml")
root = tree.getroot()
cnn = open("cnn", "a")
lstm = open("lstm", "a")
cat = open("cat", "a")
for vespaadd in root:
    document = vespaadd.find("document")
    if (document != None):
        subject = document.find("subject")
        content = document.find("content")
        maincat = document.find("maincat")
        if (subject == None):
            continue
        if (content == None):
            content = subject
        if (maincat == None):
            continue
        write_to_file(cnn, data_helpers.clean_str(subject.text))
        write_to_file(lstm, data_helpers.clean_str(content.text))
        write_to_file(cat, data_helpers.clean_str(maincat.text))
cnn.close()
lstm.close()
cat.close()
    x_raw = x_raw[-1000:]
    y_test = y_test[-1000:]
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = [
        '亲爱的CFer,您获得了英雄级道具。还有全新英雄级道具在等你来拿,立即登录游戏领取吧!',
        '第一个build错误的解决方法能再说一下吗,我还是不懂怎么解决',
        '请联系张经理获取最新资讯',
        '威@信+約ō泡加v:',
        '朋友)您好,上期开的(27)猴,您肿了吗?+美女徽:86472186 勉費领明天的消息'
    ]
    y_test = [1, 0, 1, 1, 1]

# Preprocess our own data
x_raw_cleaned = [
    data_helpers.clean_str(data_helpers.seperate_line(line)) for line in x_raw
]
# print(x_raw_cleaned)

# Map the data to vocabulary indices
vocab_path = os.path.join(FLAGS.checkpoint_dir, '..', 'vocab')
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw_cleaned)))

print('\nEvaluating...\n')

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
def process_sentences(sentences):
    return [clean_str(s.strip()).split() for s in sentences]
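A small usage sketch for the helper above; the exact tokens depend on the particular `clean_str` implementation, but with the widely used variant that lowercases and splits off punctuation the output would look roughly like this:

# Hypothetical example; the shown output assumes the common lowercasing clean_str.
tokens = process_sentences(["  It's a great movie!  "])
# e.g. [["it", "'s", "a", "great", "movie", "!"]]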
# print("{}={}".format(attr.upper(), value)) # print("") # load essay import pymongo from bson.objectid import ObjectId essayId = sys.argv[1] conn = pymongo.MongoClient('localhost') db = conn.get_database('mongodb_tutorial') essayCollection = db.get_collection('essays') result = essayCollection.find({"_id": ObjectId(essayId)})[0] inputEssay = result.get('paragraph', 'Hi~') inputOpinion = result.get('opinion', 'Hi~') nameAuthor = result.get('author', 'customer') x = dh.clean_str(inputEssay) vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab.npy") vocabulary = np.load(vocab_path, allow_pickle=True).item() x_test = dh.testpreprocess(inputEssay, vocabulary) # print("\nEvaluating\n") checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf) with sess.as_default(): # Load the saved meta graph and restore variables
tf.flags.DEFINE_string("runs_dir", "./runs", "Directory with trained models") tf.flags.DEFINE_string("input_text", "", "Input text to be classified") tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of character embedding (default: 128)") tf.flags.DEFINE_string("embeddings_file", "./misc/embeddings/pt/NILC-Embeddings/skip_s300.txt", "Word embeddings file (Gensim/word2vec only).") tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value).encode('utf-8').decode(sys.stdout.encoding)) print("") x_raw = [data_helpers.clean_str(FLAGS.input_text)] emb = [] embeddings_file = FLAGS.embeddings_file if embeddings_file != "": print("Loading embeddings file... ({:s})".format(embeddings_file)) try: pt_embeddings = gensim.models.KeyedVectors.load_word2vec_format(FLAGS.embeddings_file, unicode_errors="ignore") except: print("Error opening embeddings file ({:s})".format(embeddings_file)) sys.exit() # Prediction # ==================================================
def test(model_path):
    # Delete all flags before declaring
    del_all_flags(tf.flags.FLAGS)
    tf.flags.DEFINE_string(
        "test_data_file",
        "./../data/testing-dataset/patient_conversations-test.txt",
        "Data source for the test data.")
    tf.flags.DEFINE_string("checkpoint_dir", model_path,
                           "Checkpoint directory from training run")
    tf.flags.DEFINE_boolean("eval_train", True, "Evaluate on all training data")
    tf.flags.DEFINE_integer("batch_size", 2, "Batch Size (default: 64)")
    tf.flags.DEFINE_boolean("allow_soft_placement", True,
                            "Allow device soft device placement")
    tf.flags.DEFINE_boolean("log_device_placement", False,
                            "Log placement of ops on devices")
    FLAGS = tf.flags.FLAGS

    # Print all flags after declaring
    print_all_flags(FLAGS)

    if FLAGS.eval_train:
        # Load data from files
        test_examples = list(
            open(FLAGS.test_data_file, "r", encoding="utf8").readlines())
        test_examples = [s.strip() for s in test_examples]
        # Split by words
        x_raw = [clean_str(sent) for sent in test_examples]
    else:
        x_raw = [
            "I think I am suffering from cold and flu",
            "I am really loving this problem"
        ]

    # Map data into vocabulary
    vocab_path = os.path.join(FLAGS.checkpoint_dir, "./../../", "vocab")
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(
        vocab_path)
    x_test = np.array(list(vocab_processor.transform(x_raw)))

    print("\nEvaluating...\n")

    # Evaluation
    # ==================================================
    checkpoint_file = tf.train.latest_checkpoint(
        os.path.join(FLAGS.checkpoint_dir, "./../"))
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

            # Collect the predictions here
            patient_tag = ["Patient_Tag"]
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {
                    input_x: x_test_batch,
                    dropout_keep_prob: 1.0
                })
                patient_tag = np.concatenate([patient_tag, batch_predictions])

    patient_index = ["Index"]
    patient_index = np.concatenate(
        [patient_index, np.arange(1, len(x_raw) + 1, 1)])
    conversation_data = ["Conversation_Data"]
    conversation_data = np.concatenate([conversation_data, np.array(x_raw)])

    # Save the evaluation to a csv
    predictions = np.column_stack((patient_index, patient_tag))
    predictions_description = np.column_stack((conversation_data, patient_tag))
    submission_file = os.path.join("./../data/submission/", "prediction.csv")
    predictions_description_file = os.path.join("./../data/submission/",
                                                "prediction-description.csv")
    out_path = os.path.abspath(submission_file)
    print("Saving evaluation to {0}".format(out_path))
    with open(out_path, 'w+', encoding="utf8", newline='') as f:
        csv.writer(f).writerows(predictions)
    out_path = os.path.abspath(predictions_description_file)
    print("Saving prediction descriptions to {0}".format(out_path))
    with open(out_path, 'w+', encoding="utf8", newline='') as f:
        csv.writer(f).writerows(predictions_description)
Spotify has signed a new licensing deal with Warner Music Group, paving the way for the music streaming service to go public.

Warner was the last of the three big record labels to agree to renewed terms to make its catalogue available to Spotify's 140 million users. However, Spotify has been forced to agree to some limitations to get the labels to sign. Artists and labels have in the past complained about minuscule revenue from steaming sites when compared to downloads or physical sales. But with the deals with Sony, Universal and now Warner in place, Spotify is expected to float on the New York Stock Exchange as early as this year.

"Our partnership with Warner Music Group will help grow the new music economy where millions of artists can instantly connect with fans, and millions of fans can instantly connect with artists,” Spotify’s chief content officer Stefan Blom told the BBC.

'Inventive ways'

Posting on Instagram, Warner Music chief digital officer Ole Obermann said: "It's taken us a while to get here, but it’s been worth it, as we've arrived at a balanced set of future-focused deal terms.

"Together with Spotify, we've found inventive ways to reinforce the value of music, create additional benefits for artists, and excite their fans all over the world. Even with the current pace of growth, there’s still so much potential for music subscription to reach new audiences and territories.”

The “inventive ways” were not outlined, but if Warner Music’s deal is similar to those agreed with Sony Music and Universal, it is likely to include a clause allowing the labels to hold back certain songs from Spotify’s non-paying users for a limited period of time. Such a move would increase the royalties for artists who are unhappy at the lower earnings generated by streaming services.
"""
]
# y_test = [0,0,0,0,1]

x_raw[0] = data_helpers.clean_str(x_raw[0])

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
def sentence(self, sess, lrp_manager, test_data, answer, vocab_processor, split_no):
    saver = tf.train.Saver(tf.global_variables())
    #model_path = "C:\\work\\Code\\PythonControversy\\src\\LRP\\runs\\1516167942\\checkpoints\\model-500"
    model_path = self.get_model_path_by_split(split_no)
    saver.restore(sess, model_path)

    def transform(text):
        return list(vocab_processor.transform(text))

    links, list_text_tokens = zip(*test_data)
    raw_text_tokens = deepcopy(list_text_tokens)
    list_test_text = list(
        [data_helpers.clean_str(x) for x in list_text_tokens])
    x_test = np.array(transform(list_test_text))
    y = np.array([[0, 1]] * len(list_test_text))
    rev_test_text = list(vocab_processor.reverse(x_test))
    feed_dict = {
        lrp_manager.cnn.input_x: x_test,
        lrp_manager.cnn.input_y: y,
        lrp_manager.cnn.dropout_keep_prob: 1.0
    }
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    rt_pickle = "r_t{}.pickle".format(split_no)
    if os.path.exists(rt_pickle):
        r_t = pickle.load(open(rt_pickle, "rb"))
    else:
        r_t = lrp_manager.run(feed_dict)
        pickle.dump(r_t, open(rt_pickle, "wb"))
    candidate_phrase = data_helpers.load_phrase(split_no)
    candidate_phrase = set(candidate_phrase)
    for i, batch in enumerate(r_t):
        low_text = raw_text_tokens[i].lower()
        print("\\section{" + answer[i] + "}")
        #print("-----raw text : " + raw_text_tokens[i][:200])

        def has_dot_before(cursor):
            try:
                for j in range(1, 3):
                    idx = cursor - j
                    if idx < 0:
                        break
                    if raw_text_tokens[i][idx] == '.':
                        return True
            except IndexError:
                print(cursor)
                raise
            return False

        text_tokens = rev_test_text[i].split(" ")
        cursor = 0
        line_no = 0
        sentence_rel = 0
        max_idx = 0
        max_rel = 0
        last_idx = 0
        sentences = []
        for raw_index, value in np.ndenumerate(batch):
            index = raw_index[0]
            next_idx = low_text.find(text_tokens[index], cursor)
            #print(next_idx, end="")
            sentence_rel = sentence_rel + value
            if next_idx > len(raw_text_tokens[i]):
                break
            if has_dot_before(next_idx):
                av_score = sentence_rel / (next_idx - last_idx)
                if av_score > max_rel:
                    max_rel = av_score
                    max_idx = line_no
                    #print("Max {}".format(line_no))
                line_no = line_no + 1
                sentence_rel = 0
                sentence = raw_text_tokens[i][last_idx:next_idx]
                if av_score > 0.01:
                    print("\hl{" + sentence.strip() + "}")
                else:
                    print(sentence.strip())
                last_idx = next_idx
                #print("Score :{0:.2f}".format(av_score))
            #print(text_tokens[index], end=" ")
            if next_idx > 0:
                cursor = next_idx
print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") datasets = None # CHANGE THIS: Load data. Load your own data here dataset_name = cfg["datasets"]["default"] if FLAGS.eval_train: x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) y_test = np.argmax(y_test, axis=1) print("Total number of test examples: {}".format(len(y_test))) else: if FLAGS.x_raw!="": x_raw=[data_helpers.clean_str(FLAGS.x_raw)] y_test=[FLAGS.y_test_special] #if dataset_name == "mrpolarity": # datasets = {"target_names": ['positive_examples', 'negative_examples']} # x_raw = ["a masterpiece four years in the making", "everything is off."] # y_test = [1, 0] #else: # datasets = {"target_names": ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']} # x_raw = ["The number of reported cases of gonorrhea in Colorado increased", # "I am in the market for a 24-bit graphics card for a PC"] # y_test = [2, 1] import re def clean(text): text = re.sub(r'\([^)]*\)', '', text)
FLAGS.batch_size
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.iteritems()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()

sent = 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .'
sent_arr = [vocabulary[w] for w in data_helpers.clean_str(sent).split()]
print sent_arr

# Evaluate
# ==================================================
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x.shape[1],
            num_classes=2,
            vocab_size=len(vocabulary),
#     end = min(j + 4, ln)
#     lst = [' '.join(input_list[j:i]) for i in range(j + 1, end + 1)]
#     for str in lst:
#         if str not in sentence_list:
#             sentence_list.append(str)
#     return sentence_list

x_text, Y = load_data_and_y_labels(
    "data/rt-polaritydata/ADE-positive-org.txt",
    "data/rt-polaritydata/ADE-negative-org.txt")

f = open("ADR/ADR_string.txt", "r")  # open the ADR strings file
ades = f.read()

# d = "Rivaroxaban 2/2 lower back pain. Not very PC but am crippled by this drug. Taking more paracetamols. Must ring for 'phone consultation."
d = "19.32 day 20 Rivaroxaban diary. Still residual aches and pains; only had 4 paracetamol today."
clean = clean_str(d)
# corrected = SpellChecker(clean)
# print("corrected {}".format(corrected))
# terms = twokenize.tokenizeRawTweetText(clean)
# pos_tags = nltk.pos_tag(terms, 'universal')
# print pos_tags
# word_tokens = nltk.word_tokenize(clean_str(d.lower()))
# min_len = min(32, len(pos_tags))
# for i in range(0, min_len):
#     list = find_words(word_tokens)
sentence_ade = []
result = re.search(r'\b{0}\b'.format('tendon'), ades)
with sess.as_default():
    # Load the saved meta graph and restore variables
    saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
    saver.restore(sess, checkpoint_file)

    # Get the placeholders from the graph by name
    input_x = graph.get_operation_by_name("input_x").outputs[0]
    # input_y = graph.get_operation_by_name("input_y").outputs[0]
    dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

    # Tensors we want to evaluate
    predictions = graph.get_operation_by_name("output/predictions").outputs[0]

    # Generate batches for one epoch
    # batches = data_helpers.batch_iter(x_test, FLAGS.batch_size, 1, shuffle=False)

    # ##################################################################################################
    s = raw_input("type the sentence:")
    s = data_helpers.clean_str(s)
    s = s.split(" ")
    s = s + ["<PAD/>"] * (56 - len(s))
    for i, w in enumerate(s):
        if w not in vocabulary:
            s[i] = "<PAD/>"
    s = np.array([vocabulary[word] for word in s])
    s = [s]
    print "Prediction: ", sess.run(predictions, {input_x: s, dropout_keep_prob: 1.0})
    # ##################################################################################################
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

testfile = "./data/encoding3/v2/encoding3v2_2L_multiple_labels_nonetype_regression_afterpreprocess_haveduplicate_input_train.txt"
trainfile = "./data/encoding3/v2/encoding3v2_2L_multiple_labels_nonetype_regression_afterpreprocess_haveduplicate_input_train.txt"

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test, dic_coding, dic_coding_inv = data_helpers.load_data_and_labels(
        testfile)
    x_pred, y_pred, dic_coding_pred, dic_coding_inv_pred = data_helpers.load_data_and_labels(
        trainfile)
    y_test = np.argmax(y_test, axis=1)
else:
    x_before = ["Nn2n0n1Nn2n0n4Nn2n1n0Nn2n1n0Pn3n0n1Pn3n0n4Pn3n1n0Pn3n1n0"]
    x_raw = [data_helpers.clean_str(sent) for sent in x_before]
    y_test = [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    y_test = np.argmax(y_test, axis=1)

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():