Example #1
        def predict2(s):
            s = data_helpers.clean_str(s)
            s = s.split(" ")
            s = s + ["<PAD/>"] * (56 - len(s))
            for i, w in enumerate(s):
                if w not in vocabulary:
                    s[i] = "<PAD/>"
            s = np.array([vocabulary[word] for word in s])
            s = [s]
            return sess.run(predictions, {input_x: s, dropout_keep_prob: 1.0})
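        # Illustrative usage: predict2("a masterpiece four years in the making") would return the predicted class index as a length-1 array.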
        scores_op = graph.get_operation_by_name("output/scores").outputs[0]
        predictions_op = graph.get_operation_by_name("output/predictions").outputs[0]

        print("\n模型加载完毕\n")
        print("请根据提示输入对酒店的评价,系统根据评价进行分类(好评/差评)\n")
        print("提示:")
        print("  1. 当前系统只能处理最多{:d}个汉语单词,超出部分将被系统忽略".format(max_num_words))
        print("  2. 输入exit()可退出\n")

        while True:
            # Read input
            input_raw = input("\nPlease enter your hotel review:\n")

            if input_raw == "exit()":
                break

            # Tokenize + clean + transform
            input_cut = " ".join(jieba.cut(input_raw))
            input_cleaned = data_helpers.clean_str(input_cut)
            input_transformed = np.array(list(vocab_processor.transform([input_cleaned])))

            # Predict
            scores, predictions = sess.run((scores_op, predictions_op), {input_x: input_transformed})
            score_pos = scores[0][1]
            score_neg = scores[0][0]
            predictions_readable = "好评" if predictions[0] == 1 else "差评"

            print("\n正分类得分:{:.2f}".format(score_pos))
            print("负分类得分:{:.2f}".format(score_neg))
            print("预测结果:{}\n".format(predictions_readable))
Example #3
def main(dataset, title, body):
    # Data Preparation
    # ==================================================

    path = os.path.join('transformed_data', dataset)

    # body = sys.argv[1]
    text = data_helpers.preprocess(title, body)
    x_text = [data_helpers.clean_str(text)]

    # Restore vocab file
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(
        os.path.join(path, 'vocab'))

    # Use transform() here; fit_transform() would re-fit the restored vocabulary
    x = np.array(list(vocab_processor.transform(x_text)))

    # print(x)
    # print(x.shape)

    tags_df = pd.read_csv(os.path.join('community-data', dataset, '50',
                                       'tags_df.csv'),
                          encoding='utf8',
                          index_col=0)
    tag_list = tags_df['TagName'].tolist()

    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.compat.v1.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement,
            intra_op_parallelism_threads=3,
            inter_op_parallelism_threads=3)
        sess = tf.compat.v1.Session(config=session_conf)
        with sess.as_default():
            rcnn = RCNN(num_classes=len(tag_list),
                        vocab_size=len(vocab_processor.vocabulary_),
                        embedding_size=FLAGS.embedding_dim,
                        hidden_units=100,
                        context_size=50,
                        max_sequence_length=x.shape[1])
            # l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.compat.v1.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(rcnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Checkpoint directory.
            checkpoint_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", dataset))
            saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())

            # Loading checkpoint
            save_path = os.path.join(checkpoint_dir, "model")
            saver.restore(sess, save_path)

            # predict
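            # Every row of x is padded to the same length (x.shape[1]), so len(sample) is that padded length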
            sequence_length = [len(sample) for sample in x]
            feed_dict = {
                rcnn.X: x,
                rcnn.sequence_length: sequence_length,
                # rcnn.max_sequence_length: max_sequence_length,
                rcnn.dropout_keep_prob: 1.0
            }
            prediction = sess.run([rcnn.predictions], feed_dict)[0][0]

            # print(prediction)
            idx = prediction.argsort()[-5:][::-1]
            # print(idx)
            tags = [tag_list[i] for i in idx]

    print("\n||| Text |||\n")
    print(x_text[0])
    print("\n||| Predicted tags |||\n")
    print(tags)
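
A minimal, hypothetical invocation of main(); the dataset name, title, and body below are placeholders and assume the transformed_data/<dataset> vocab and the runs/<dataset> checkpoint used above already exist.

if __name__ == "__main__":
    # Placeholder arguments for illustration only
    main("stackoverflow",
         "How do I parse JSON in Python?",
         "I have a string with JSON content and want to turn it into a dict.")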
Example #4
def predict_Problem(loaded_model, vocabulary, sentence):
    print("sentence Before", sentence)
    sentence = sentence.split(" ")
    # print(json.dumps(vocabulary , indent=4))
    sentence = [data_helpers.clean_str(x) for x in sentence]  # use a list so it can be printed and iterated
    print("sentence after", str(sentence))
    sequence_length = 49
    # Check whether each word is in the dictionary; if so, append its index in the dict to a list

    startT = timeit.default_timer()
    pred = []
    for word in sentence:
        print("Predecting", word)
        if not word:
            continue
        x = vocabulary.get(word)
        if x is not None:
            pred.append(x)
        else:  # Check whether w2v has a similar word and whether that word is in the dictionary
            print('Word {} is not in dict'.format(word))
            # w2vList=w2v.wv.most_similar(word)
            try:
                w2vList = w2v.most_similar(word)
            except Exception:
                pred.append(0)
                continue
            for w in w2vList:
                w = w[0]
                print(w)
                print('similarity: {} in dict: {}'.format(
                    w, vocabulary.get(w)))
                if vocabulary.get(w) is not None:
                    pred.append(vocabulary.get(w))
                    break
    stopT = timeit.default_timer()
    print('Prepare duration: {}'.format(stopT - startT), "Result:\n", pred)
    # Insert <PAD/> if the word is not in the dict?
    #	else:
    #		pred.append(0)
    #		print(str(word) + " " + str(x))
    # print("test1")
    # print(json.dumps(pred, indent=4))
    # Pad the list just built with 0 --> <PAD/> up to sequence_length

    empty_spaces = sequence_length - len(pred)
    for x in range(0, empty_spaces):
        pred.append(0)

    # print("test2")
    # print(json.dumps(pred, indent=4))
    # Convert the list to a NumPy array and add a batch dimension: shape (1, sequence_length)
    pred = np.array(pred)
    pred = pred[None, :]
    # print("Liste zu Numpy", json.dumps(pred, indent=4))
    pred.T  # no-op: the transposed array is never assigned

    # print("test3")
    # print(pred)
    # print(json.dumps(pred, indent=4))

    print("Pred before: ")
    startT = timeit.default_timer()
    prediction = loaded_model.predict(pred, verbose=1)
    stopT = timeit.default_timer()
    print('Prediction duration: {}'.format(stopT - startT))
    # print("Pred#: ")
    # print(prediction)
    return [prediction, sequence_length, pred]
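
A hypothetical call of predict_Problem(); the model and vocabulary file names below are placeholders, assuming a saved Keras model and a word-to-index dict.

import json
from tensorflow import keras

loaded_model = keras.models.load_model("problem_classifier.h5")  # placeholder path
with open("vocabulary.json") as f:                               # placeholder path
    vocabulary = json.load(f)
prediction, seq_len, encoded = predict_Problem(loaded_model, vocabulary,
                                               "printer stops responding after the update")
print(prediction)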
Example #5
def classify(text):
    # Load data. Load your own data here
    sentenses = [
        text,
    ]
    # Map data into vocabulary
    vocabulary = data_helpers.get_vocabulary()
    print "vocab:"
    print len(vocabulary)
    x_raw = [data_helpers.clean_str(sentense) for sentense in sentenses]
    print x_raw
    sentenses_digital = []
    for sentense in x_raw:
        sentense_digital = np.zeros(data_helpers.max_len)
        for i, cn_char in enumerate(sentense):
            if cn_char in vocabulary:
                sentense_digital[i] = vocabulary[cn_char]
        sentenses_digital.append(sentense_digital)
    x_test = np.array(sentenses_digital)

    print(x_test)

    # Evaluation
    # ==================================================
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(x_test),
                                              FLAGS.batch_size,
                                              1,
                                              shuffle=False)

            # Collect the predictions here
            all_predictions = []

            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {
                    input_x: x_test_batch,
                    dropout_keep_prob: 1.0
                })
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

    labels_list = data_helpers.get_labels_list()
    print(all_predictions)
    print(labels_list)
    all_pred_classifies = [
        labels_list[int(prediction)] for prediction in all_predictions
    ]
    print(all_pred_classifies)
    return all_pred_classifies[0]
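
A one-line usage sketch for classify(); the input sentence is an arbitrary example and assumes FLAGS.checkpoint_dir points at a trained run.

label = classify("今天北京的天气怎么样")  # "What is the weather like in Beijing today" (illustrative input)
print(label)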
Example #6
        saver = tf.train.Saver()
        saver.restore(sess, checkpoint_file)
        while(True):
            # read note from xml file
            xml = et.parse('topics2016.xml')
            root = xml.getroot()
            children = list(root)  # getchildren() is deprecated in ElementTree
            for child in children:
                note = child[0].text
                input = note
                # remove stop words
                pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
                input = pattern.sub('', input)

                tokenzied_input = []
                wl = data_helpers.clean_str(input)
                count = 0
                for ite in wl:
                    if count < FLAGS.note_words_limit:
                        if ite in vocabulary:
                            tokenzied_input.append(vocabulary[ite])
                        else:
                            tokenzied_input.append(0)
                        count += 1
                    else:
                        break
                print(tokenzied_input)
                diff = FLAGS.note_words_limit - len(tokenzied_input)
                if diff > 0:
                    for i in range(diff):
                        tokenzied_input.append(0)
Example #7
    def test_phrase(self, sess, lrp_manager, test_data, answer,
                    vocab_processor, split_no, k):
        saver = tf.train.Saver(tf.global_variables())
        #model_path = "C:\\work\\Code\\PythonControversy\\src\\LRP\\runs\\1516167942\\checkpoints\\model-500"
        if split_no == 0:
            #model_path = PATH_SPLIT_0_CLEAN_DATA
            #model_path = get_model_path(1517013309,150)
            model_path = get_model_path(1517198610, 700)
        elif split_no == 1:
            #model_path = PATH_SPLIT1
            #model_path = get_model_path(1517013721, 230)
            model_path = get_model_path(1517204236, 800)
        elif split_no == 2:
            #model_path = PATH_UNKNOWN
            #model_path = get_model_path(1517029400, 200)
            model_path = get_model_path(1517194730, 300)

        model_path = get_model_path(split_no, 700)  # overrides the split-specific paths chosen above
        saver.restore(sess, model_path)

        def transform(text):
            return list(vocab_processor.transform(text))

        links, list_test_text = zip(*test_data)
        list_test_text = list(
            [data_helpers.clean_str(x) for x in list_test_text])
        x_test = np.array(transform(list_test_text))
        y = np.array([[0, 1]] * len(list_test_text))
        list_test_text = list(vocab_processor.reverse(x_test))

        def get_precision(x, y):
            feed_dict = {
                lrp_manager.cnn.input_x: x,
                lrp_manager.cnn.input_y: y,
                lrp_manager.cnn.dropout_keep_prob: 1.0
            }
            [accuracy] = sess.run([
                lrp_manager.cnn.accuracy,
            ], feed_dict)
            return accuracy

        accuracy = get_precision(x_test, y)

        #print("Prediction accuarcy : ", end="")
        #print(accuracy)

        def collect_correct_indice(sess, cnn, x, y):
            feed_dict = {
                cnn.input_x: x,
                cnn.input_y: y,
                cnn.dropout_keep_prob: 1.0
            }

            [prediction] = sess.run([
                cnn.predictions,
            ], feed_dict)

            indice = []
            y_idx = np.argmax(y, axis=1)
            for i, _ in enumerate(x):
                if prediction[i] == y_idx[i]:
                    indice.append(i)
            return indice

        correct_indice = collect_correct_indice(sess, lrp_manager.cnn, x_test,
                                                y)

        feed_dict = {
            lrp_manager.cnn.input_x: x_test,
            lrp_manager.cnn.input_y: y,
            lrp_manager.cnn.dropout_keep_prob: 1.0
        }

        candidate_phrase = data_helpers.load_phrase(split_no)
        candidate_phrase = set(candidate_phrase)

        assert ("taxes" in candidate_phrase)

        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        rt_pickle = "r_t{}.pickle".format(split_no)
        #if os.path.exists(rt_pickle):
        #    r_t = pickle.load(open(rt_pickle, "rb"))
        #else:
        r_t = lrp_manager.run(feed_dict)
        pickle.dump(r_t, open(rt_pickle, "wb"))
        count = FailCounter()
        rand_count = FailCounter()
        middle_scores = []
        for i, batch in enumerate(r_t):
            f_wrong = False
            if i not in correct_indice:
                f_wrong = True
                #continue
            if answer[i] is None:
                #print("No answer in text")
                continue
            #phrase_len = len(answer[i].split(" "))

            answer_str = " ".join([
                stemmer.stem(token) for token in answer[i].lower().split(" ")
            ])
            print("Correct: " + answer[i] + "({})".format(answer_str))
            text_tokens = list_test_text[i].split(" ")
            pos_tags = nltk.pos_tag(text_tokens)
            candidates = []

            def window_sum(window):
                bonus = max(window) * (len(window) - 1)
                return (sum(window) + bonus * 0.4) / len(window)

            phrase_len = len(answer[i].split(" "))
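            # Slide windows of one or two tokens over the relevance scores and keep
            # (score sum, begin, end, window) for every candidate span.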
            for raw_index, value in np.ndenumerate(batch):
                for phrase_len in range(1, 3):
                    index = raw_index[0]
                    if index + phrase_len >= batch.shape[0]:
                        break
                    assert (batch[index] == value)
                    window = batch[index:index + phrase_len]
                    is_noun = 'NN' in pos_tags[index + phrase_len - 1][1]
                    s = sum(window)
                    text = " ".join(text_tokens[index:index + phrase_len])
                    #if text in candidate_phrase and is_noun:
                    #if is_noun:
                    candidates.append((s, index, index + phrase_len, window))
            candidates.sort(key=lambda x: x[0], reverse=True)
            ranking_dict = collections.Counter()
            info_dict = dict()

            middle_score = []
            for value, begin, end, window in candidates:
                #end = begin + phrase_len
                sys_answer = " ".join(
                    [stemmer.stem(t) for t in text_tokens[begin:end]])
                ranking_dict[sys_answer] += value
                info_dict[sys_answer] = (begin, end, window)
                #sys_answer = " ".join([stemmer.stem(t) for t in text.split(" ")])
                middle_score.append((value, sys_answer))
            middle_scores.append(middle_score)
            match = False

            def get_text(begin, end):
                st = begin
                if begin < 0:
                    st = 0
                return " ".join(text_tokens[st:end])

            #for (value, begin) in candidates[:k]:
            for key, value in ranking_dict.most_common(k):
                begin, end, window = info_dict[key]
                sys_answer = " ".join(
                    [stemmer.stem(t) for t in text_tokens[begin:end]])
                sys_answer_no_stem = " ".join(text_tokens[begin:end])
                print("{}-{} /{}: {}[{}]{}".format(begin, end, value,
                                                   get_text(begin - 3, begin),
                                                   sys_answer_no_stem,
                                                   get_text(end, end + 3)))
                #for idx in range(end-begin):
                #    print("{0:.2f} ".format(window[idx]))
                if sys_answer == answer_str:
                    match = True

            if match:
                count.suc()
            else:
                count.fail()
            #print("")

            max_i = np.argmax(batch, axis=0)
            #print("Max : {} at {} ({})".format(batch[max_i], max_i, text_tokens[max_i]))

        pickle.dump(middle_scores,
                    open("middle.score{}.pickle".format(split_no), "wb"))
        #print("Precision : {}".format(count.precision()))
        #print("Precision[Random] : {}".format(rand_count.precision()))
        return count.precision()
Example #8
    def train_and_phrase(self, sess, lrp_manager, test_data, answer,
                         vocab_processor):
        split2_frame = "C:\work\Code\PythonControversy\src\LRP\\runs\\1516655846\checkpoints\model-{}"

        def transform(text):
            return list(vocab_processor.transform(text))

        links, list_test_text = zip(*test_data)
        list_test_text = list(
            [data_helpers.clean_str(x) for x in list_test_text])
        x_test = np.array(transform(list_test_text))
        y = np.array([[0, 1]] * len(list_test_text))
        list_test_text = list(vocab_processor.reverse(x_test))

        def get_precision(x, y):
            feed_dict = {
                lrp_manager.cnn.input_x: x,
                lrp_manager.cnn.input_y: y,
                lrp_manager.cnn.dropout_keep_prob: 1.0
            }
            [accuracy] = sess.run([
                lrp_manager.cnn.accuracy,
            ], feed_dict)
            return accuracy

        summary = []
        for progress in range(10, 150, 10):
            path = split2_frame.format(progress)
            saver = tf.train.Saver(tf.global_variables())
            saver.restore(sess, path)

            feed_dict = {
                lrp_manager.cnn.input_x: x_test,
                lrp_manager.cnn.input_y: y,
                lrp_manager.cnn.dropout_keep_prob: 1.0
            }
            accuracy = get_precision(x_test, y)
            print("y/n Accuracy : ", end="")
            print(accuracy)

            def collect_correct_indice(sess, cnn, x, y):
                feed_dict = {
                    cnn.input_x: x,
                    cnn.input_y: y,
                    cnn.dropout_keep_prob: 1.0
                }

                [prediction] = sess.run([
                    cnn.predictions,
                ], feed_dict)

                indice = []
                y_idx = np.argmax(y, axis=1)
                for i, _ in enumerate(x):
                    if prediction[i] == y_idx[i]:
                        indice.append(i)
                return indice

            correct_indice = collect_correct_indice(sess, lrp_manager.cnn,
                                                    x_test, y)

            r_t = lrp_manager.run(feed_dict)
            count = FailCounter()
            k = 10  # number of candidates to select
            for i, batch in enumerate(r_t):
                if i not in correct_indice:
                    continue
                if answer[i] is None:
                    continue
                phrase_len = len(answer[i].split(" "))
                candidates = []
                for raw_index, value in np.ndenumerate(batch):
                    index = raw_index[0]
                    if index + phrase_len >= batch.shape[0]:
                        break
                    assert (batch[index] == value)
                    window = batch[index:index + phrase_len]
                    s = sum(window)
                    candidates.append((s, index))
                candidates.sort(key=lambda x: x[0], reverse=True)
                answer_tokens = set(answer[i].lower().split(" "))
                text_tokens = list_test_text[i].split(" ")
                match = False
                for (value, last_index) in candidates[:k]:
                    end = last_index + 1
                    begin = end - phrase_len
                    sys_answer = text_tokens[begin:end]
                    if set(sys_answer) == answer_tokens:
                        match = True
                if match:
                    count.suc()
                else:
                    count.fail()
            print("Precision : {}".format(count.precision()))
            summary.append((progress, accuracy, count.precision()))

        for progress, accuracy, precision in summary:
            print("{}\t{}\t{}\t".format(progress, accuracy, precision))
Example #9
# Only take the first 400 words to avoid 0s in the embedding
cut_review = 400

#---------------------- CALCULATE FREQUENT WORDS------------------------------------#
train_pos_path = './data/aclImdb/train/pos/'
train_neg_path = './data/aclImdb/train/neg/'
test_pos_path = './data/aclImdb/test/pos/'
test_neg_path = './data/aclImdb/test/neg/'

train_positive_examples = load_data(train_pos_path)
train_negative_examples = load_data(train_neg_path)

# Clean review
train_reviews = train_positive_examples + train_negative_examples
train_reviews = [clean_str(sentence, cut_review) for sentence in train_reviews]

# Build the list of frequent words
word_list = [word for line in train_reviews for word in line.split()]
frequent_words_count = Counter(word_list).most_common(
    10000
)  # generates list of tuples (word, word_count) and only keeps the 10000 common ones
frequent_words = [item[0] for item in frequent_words_count]
very_frequent_words_count = Counter(word_list).most_common(5000)
very_frequent_words = [item[0] for item in very_frequent_words_count]

#--------------------------------- update files with '<oov>' for other infrequent words ------------#


def replacebyoov(review):
    sequence = review.split()
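
The body of replacebyoov is cut off in this listing; a minimal sketch of the idea it introduces, replacing words outside frequent_words with the '<oov>' token, might look like the illustrative helper below (not the original implementation).

def replace_by_oov_sketch(review, frequent_words):
    # Keep frequent words, replace everything else with the '<oov>' token
    frequent = set(frequent_words)  # set membership is much faster than scanning a list
    return " ".join(word if word in frequent else "<oov>" for word in review.split())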
Example #10
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.training_data_file,
                                                      FLAGS.class_index_file,
                                                      True)
    test_sample_index = -1 * int(
        FLAGS.test_sample_percentage * float(len(y_test)))
    x_raw, y_test = x_raw[test_sample_index:], y_test[test_sample_index:]
else:
    x_raw = [
        data_helpers.clean_str(
            "Auto Repair;Oil Change Stations;Transmission Repair")
    ]
    y_test = [[0] * 100]
    y_test[0][10] = 1

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))
Example #11
	id = 1
	f = open("cat")
	cat = set([s.strip() for s in f.readlines()])
	for i in cat:
		catmap[i] = id
		id = id + 1
	return catmap

tree = ET.ElementTree(file="test.xml")
root = tree.getroot()
cnn = open("cnn", "a")
lstm = open("lstm", "a")
cat = open("cat", "a")
for vespaadd in root:
	document = vespaadd.find("document")
	if document is not None:
		subject = document.find("subject")
		content = document.find("content")
		maincat = document.find("maincat")
		if subject is None:
			continue
		if content is None:
			content = subject
		if maincat is None:
			continue
		write_to_file(cnn, data_helpers.clean_str(subject.text))
		write_to_file(lstm, data_helpers.clean_str(content.text))
		write_to_file(cat, data_helpers.clean_str(maincat.text))
cnn.close()
lstm.close()
cat.close()
Example #12
    x_raw = x_raw[-1000:]
    y_test = y_test[-1000:]

    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = [
        '亲爱的CFer,您获得了英雄级道具。还有全新英雄级道具在等你来拿,立即登录游戏领取吧!',
        '第一个build错误的解决方法能再说一下吗,我还是不懂怎么解决', '请联系张经理获取最新资讯', '威@信+約ō泡加v:',
        '朋友)您好,上期开的(27)猴,您肿了吗?+美女徽:86472186 勉費领明天的消息'
    ]
    y_test = [1, 0, 1, 1, 1]

# Process our own data
x_raw_cleaned = [
    data_helpers.clean_str(data_helpers.seperate_line(line)) for line in x_raw
]
# print(x_raw_cleaned)

# Map the data to vocabulary indices
vocab_path = os.path.join(FLAGS.checkpoint_dir, '..', 'vocab')
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw_cleaned)))

print('\nEvaluating...\n')

# Evaluation
# ==================================================

checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
Example #13
def process_sentences(sentences):
    return [clean_str(s.strip()).split() for s in sentences]
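
A small usage sketch for process_sentences(); the sentences are made up and the exact tokens depend on what clean_str strips.

sentences = ["This movie was great!", "  Terrible plot, wooden acting.  "]
print(process_sentences(sentences))
# e.g. [['this', 'movie', 'was', 'great', '!'], ['terrible', 'plot', ',', 'wooden', 'acting']]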
Example #14
#     print("{}={}".format(attr.upper(), value))
# print("")

# load essay
import pymongo
from bson.objectid import ObjectId
essayId = sys.argv[1]
conn = pymongo.MongoClient('localhost')
db = conn.get_database('mongodb_tutorial')
essayCollection = db.get_collection('essays')
result = essayCollection.find({"_id": ObjectId(essayId)})[0]
inputEssay = result.get('paragraph', 'Hi~')
inputOpinion = result.get('opinion', 'Hi~')
nameAuthor = result.get('author', 'customer')

x = dh.clean_str(inputEssay)
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab.npy")
vocabulary = np.load(vocab_path, allow_pickle=True).item()
x_test = dh.testpreprocess(inputEssay, vocabulary)

# print("\nEvaluating\n")

checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
Example #15
tf.flags.DEFINE_string("runs_dir", "./runs", "Directory with trained models")
tf.flags.DEFINE_string("input_text", "", "Input text to be classified")
tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("embeddings_file", "./misc/embeddings/pt/NILC-Embeddings/skip_s300.txt", "Word embeddings file (Gensim/word2vec only).")

tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value).encode('utf-8').decode(sys.stdout.encoding))
print("")

x_raw = [data_helpers.clean_str(FLAGS.input_text)]

emb = []
embeddings_file = FLAGS.embeddings_file
if embeddings_file != "":
    print("Loading embeddings file... ({:s})".format(embeddings_file))
    try:
        pt_embeddings = gensim.models.KeyedVectors.load_word2vec_format(FLAGS.embeddings_file, unicode_errors="ignore")
    except Exception:
        print("Error opening embeddings file ({:s})".format(embeddings_file))
        sys.exit()


# Prediction
# ==================================================
Example #16
        while (True):
            # read note from xml file
            xml = et.parse('topics2016.xml')
            root = xml.getroot()
            children = list(root)  # getchildren() is deprecated in ElementTree
            for child in children:
                note = child[0].text
                input = note
                # remove stop words
                pattern = re.compile(r'\b(' +
                                     r'|'.join(stopwords.words('english')) +
                                     r')\b\s*')
                input = pattern.sub('', input)

                tokenzied_input = []
                wl = data_helpers.clean_str(input)
                count = 0
                for ite in wl:
                    if count < FLAGS.note_words_limit:
                        if ite in vocabulary:
                            tokenzied_input.append(vocabulary[ite])
                        else:
                            tokenzied_input.append(0)
                        count += 1
                    else:
                        break
                print(tokenzied_input)
                diff = FLAGS.note_words_limit - len(tokenzied_input)
                if diff > 0:
                    for i in range(diff):
                        tokenzied_input.append(0)
def test(model_path):
    # Delete all flags before declaring
    del_all_flags(tf.flags.FLAGS)

    tf.flags.DEFINE_string(
        "test_data_file",
        "./../data/testing-dataset/patient_conversations-test.txt",
        "Data source for the test data.")
    tf.flags.DEFINE_string("checkpoint_dir", model_path,
                           "Checkpoint directory from training run")
    tf.flags.DEFINE_boolean("eval_train", True,
                            "Evaluate on all training data")
    tf.flags.DEFINE_integer("batch_size", 2, "Batch Size (default: 64)")
    tf.flags.DEFINE_boolean("allow_soft_placement", True,
                            "Allow device soft device placement")
    tf.flags.DEFINE_boolean("log_device_placement", False,
                            "Log placement of ops on devices")
    FLAGS = tf.flags.FLAGS

    # Print all flags after declaring
    print_all_flags(FLAGS)

    if FLAGS.eval_train:
        # Load data from files
        test_examples = list(
            open(FLAGS.test_data_file, "r", encoding="utf8").readlines())
        test_examples = [s.strip() for s in test_examples]
        # Split by words
        x_raw = [clean_str(sent) for sent in test_examples]
    else:
        x_raw = [
            "I think I am suffering from cold and flu",
            "I am really loving this problem"
        ]

    # Map data into vocabulary
    vocab_path = os.path.join(FLAGS.checkpoint_dir, "./../../", "vocab")
    vocab_processor = learn.preprocessing.VocabularyProcessor.restore(
        vocab_path)
    x_test = np.array(list(vocab_processor.transform(x_raw)))
    print("\nEvaluating...\n")

    # Evaluation
    # ==================================================
    checkpoint_file = tf.train.latest_checkpoint(
        os.path.join(FLAGS.checkpoint_dir, "./../"))
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = batch_iter(list(x_test),
                                 FLAGS.batch_size,
                                 1,
                                 shuffle=False)

            # Collect the predictions here
            patient_tag = ["Patient_Tag"]

            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {
                    input_x: x_test_batch,
                    dropout_keep_prob: 1.0
                })
                patient_tag = np.concatenate([patient_tag, batch_predictions])

    patient_index = ["Index"]
    patient_index = np.concatenate(
        [patient_index, np.arange(1,
                                  len(x_raw) + 1, 1)])

    conversation_data = ["Conversation_Data"]
    conversation_data = np.concatenate([conversation_data, np.array(x_raw)])

    # Save the evaluation to a csv
    predictions = np.column_stack((patient_index, patient_tag))
    predictions_description = np.column_stack((conversation_data, patient_tag))

    submission_file = os.path.join("./../data/submission/", "prediction.csv")
    predictions_description_file = os.path.join("./../data/submission/",
                                                "prediction-description.csv")

    out_path = os.path.abspath(submission_file)
    print("Saving evaluation to {0}".format(out_path))
    with open(out_path, 'w+', encoding="utf8", newline='') as f:
        csv.writer(f).writerows(predictions)

    out_path = os.path.abspath(predictions_description_file)
    print("Saving prediction descriptions to {0}".format(out_path))
    with open(out_path, 'w+', encoding="utf8", newline='') as f:
        csv.writer(f).writerows(predictions_description)
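
A hypothetical call of test(); the path below is a placeholder for a real training run, and the function resolves the vocab file and latest checkpoint relative to it.

if __name__ == "__main__":
    test(model_path="./../runs/1521234567/checkpoints/model")  # placeholder run directory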
Example #18
Spotify has signed a new licensing deal with Warner Music Group, paving the way for the music streaming service to go public.
Warner was the last of the three big record labels to agree to renewed terms to make its catalogue available to Spotify's 140 million users.
However, Spotify has been forced to agree to some limitations to get the labels to sign.
Artists and labels have in the past complained about minuscule revenue from steaming sites when compared to downloads or physical sales.
But with the deals with Sony, Universal and now Warner in place, Spotify is expected to float on the New York Stock Exchange as early as this year.
"Our partnership with Warner Music Group will help grow the new music economy where millions of artists can instantly connect with fans, and millions of fans can instantly connect with artists,” Spotify’s chief content officer Stefan Blom told the BBC.
'Inventive ways'
Posting on Instagram, Warner Music chief digital officer Ole Obermann said: "It's taken us a while to get here, but it’s been worth it, as we've arrived at a balanced set of future-focused deal terms.
"Together with Spotify, we've found inventive ways to reinforce the value of music, create additional benefits for artists, and excite their fans all over the world. Even with the current pace of growth, there’s still so much potential for music subscription to reach new audiences and territories.”
The “inventive ways” were not outlined, but if Warner Music’s deal is similar to those agreed with Sony Music and Universal, it is likely to include a clause allowing the labels to hold back certain songs from Spotify’s non-paying users for a limited period of time. Such a move would increase the royalties for artists who are unhappy at the lower earnings generated by streaming services.

"""
]
# y_test = [0,0,0,0,1]

x_raw[0] = data_helpers.clean_str(x_raw[0])
# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
Example #19
    def sentence(self, sess, lrp_manager, test_data, answer, vocab_processor,
                 split_no):
        saver = tf.train.Saver(tf.global_variables())
        #model_path = "C:\\work\\Code\\PythonControversy\\src\\LRP\\runs\\1516167942\\checkpoints\\model-500"
        model_path = self.get_model_path_by_split(split_no)
        saver.restore(sess, model_path)

        def transform(text):
            return list(vocab_processor.transform(text))

        links, list_text_tokens = zip(*test_data)
        raw_text_tokens = deepcopy(list_text_tokens)
        list_test_text = list(
            [data_helpers.clean_str(x) for x in list_text_tokens])
        x_test = np.array(transform(list_test_text))
        y = np.array([[0, 1]] * len(list_test_text))
        rev_test_text = list(vocab_processor.reverse(x_test))

        feed_dict = {
            lrp_manager.cnn.input_x: x_test,
            lrp_manager.cnn.input_y: y,
            lrp_manager.cnn.dropout_keep_prob: 1.0
        }
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        rt_pickle = "r_t{}.pickle".format(split_no)
        if os.path.exists(rt_pickle):
            r_t = pickle.load(open(rt_pickle, "rb"))
        else:
            r_t = lrp_manager.run(feed_dict)
        pickle.dump(r_t, open(rt_pickle, "wb"))

        candidate_phrase = data_helpers.load_phrase(split_no)
        candidate_phrase = set(candidate_phrase)
        for i, batch in enumerate(r_t):
            low_text = raw_text_tokens[i].lower()
            print("\\section{" + answer[i] + "}")

            #print("-----raw text : "+ raw_text_tokens[i][:200])
            def has_dot_before(cursor):
                try:
                    for j in range(1, 3):
                        idx = cursor - j
                        if idx < 0:
                            break
                        if raw_text_tokens[i][idx] == '.':
                            return True
                except IndexError:
                    print(cursor)
                    raise

                return False

            text_tokens = rev_test_text[i].split(" ")
            cursor = 0
            line_no = 0
            sentence_rel = 0
            max_idx = 0
            max_rel = 0
            last_idx = 0
            sentences = []
            for raw_index, value in np.ndenumerate(batch):
                index = raw_index[0]
                next_idx = low_text.find(text_tokens[index], cursor)
                #print(next_idx, end="")
                sentence_rel = sentence_rel + value
                if next_idx > len(raw_text_tokens[i]):
                    break
                if has_dot_before(next_idx):
                    av_score = sentence_rel / (next_idx - last_idx)
                    if av_score > max_rel:
                        max_rel = av_score
                        max_idx = line_no
                        #print("Max {}".format(line_no))
                    line_no = line_no + 1
                    sentence_rel = 0
                    sentence = raw_text_tokens[i][last_idx:next_idx]
                    if av_score > 0.01:
                        print("\hl{" + sentence.strip() + "}")
                    else:
                        print(sentence.strip())
                    last_idx = next_idx
                    #print("Score :{0:.2f}".format(av_score))
                #print(text_tokens[index], end=" ")
                if next_idx > 0:
                    cursor = next_idx
Example #20
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

datasets = None

# CHANGE THIS: Load data. Load your own data here
dataset_name = cfg["datasets"]["default"]
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
    y_test = np.argmax(y_test, axis=1)
    print("Total number of test examples: {}".format(len(y_test)))
else:
    if FLAGS.x_raw != "":
        x_raw = [data_helpers.clean_str(FLAGS.x_raw)]
        y_test = [FLAGS.y_test_special]
    # if dataset_name == "mrpolarity":
    #     datasets = {"target_names": ['positive_examples', 'negative_examples']}
    #     x_raw = ["a masterpiece four years in the making", "everything is off."]
    #     y_test = [1, 0]
    # else:
    #     datasets = {"target_names": ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']}
    #     x_raw = ["The number of reported cases of gonorrhea in Colorado increased",
    #              "I am in the market for a 24-bit graphics card for a PC"]
    #     y_test = [2, 1]


import re
def clean(text):
    text = re.sub(r'\([^)]*\)', '', text)
Example #21
FLAGS.batch_size  # accessing a flag forces tf.flags to parse the command line
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()

sent = 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .'
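# Note: any word in the sentence that is missing from the vocabulary would raise a KeyError here.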
sent_arr = [vocabulary[w] for w in data_helpers.clean_str(sent).split()]
print(sent_arr)
# Evaluate
# ==================================================


with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x.shape[1],
            num_classes=2,
            vocab_size=len(vocabulary),
Example #22
#         end = min(j + 4, ln)
#         lst= [' '.join(input_list[j:i]) for i in range(j+1,end+1)]
#         for str in lst:
#             if str not in sentence_list:
#                 sentence_list.append(str)
#     return sentence_list
x_text, Y = load_data_and_y_labels(
    "data/rt-polaritydata/ADE-positive-org.txt",
    "data/rt-polaritydata/ADE-negative-org.txt")

f = open("ADR/ADR_string.txt", "r")  #opens file with name of "test.txt"
ades = f.read()

# d="Rivaroxaban 2/2 lower back pain. Not very PC but am crippled by this drug. Taking more paracetamols. Must ring for 'phone consultation."
d = "19.32 day 20 Rivaroxaban diary. Still residual aches and pains; only had 4 paracetamol today."
clean = clean_str(d)
# corrected=SpellChecker(clean)
# print("corrected {}".format(corrected))
# terms = twokenize.tokenizeRawTweetText(clean)

# pos_tags = nltk.pos_tag(terms, 'universal')

# print pos_tags
# word_tokens = nltk.word_tokenize(clean_str(d.lower()))

# min_len = min(32, len(pos_tags))
# for i in range(0,min_len ):
#     list=find_words(word_tokens)

sentence_ade = []
result = re.search(r'\b{0}\b'.format('tendon'), ades)
Example #23
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        # input_y = graph.get_operation_by_name("input_y").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

        # Generate batches for one epoch
        # batches = data_helpers.batch_iter(x_test, FLAGS.batch_size, 1, shuffle=False)

        # ##################################################################################################
        s = input("type the sentence:")
        s = data_helpers.clean_str(s)
        s = s.split(" ")
        s = s + ["<PAD/>"] * (56 - len(s))
        for i, w in enumerate(s):
            if w not in vocabulary:
                s[i] = "<PAD/>"
        s = np.array([vocabulary[word] for word in s])
        s = [s]

        print("Prediction: ", sess.run(predictions, {input_x: s, dropout_keep_prob: 1.0}))
        # ##################################################################################################
Example #24
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

testfile = "./data/encoding3/v2/encoding3v2_2L_multiple_labels_nonetype_regression_afterpreprocess_haveduplicate_input_train.txt"
trainfile = "./data/encoding3/v2/encoding3v2_2L_multiple_labels_nonetype_regression_afterpreprocess_haveduplicate_input_train.txt"
# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test, dic_coding, dic_coding_inv = data_helpers.load_data_and_labels(
        testfile)
    x_pred, y_pred, dic_coding_pred, dic_coding_inv_pred = data_helpers.load_data_and_labels(
        trainfile)
    y_test = np.argmax(y_test, axis=1)
else:
    x_before = ["Nn2n0n1Nn2n0n4Nn2n1n0Nn2n1n0Pn3n0n1Pn3n0n4Pn3n1n0Pn3n1n0"]
    x_raw = [data_helpers.clean_str(sent) for sent in x_before]
    y_test = [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    y_test = np.argmax(y_test, axis=1)

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():