def load_test_data(self, test_file, max_length, vocabulary=None, config=None):
        """
        Load and preprocess test data, returning index matrices for a model.

        Args:
            test_file: path to the test file, read via util.read_txt.
            max_length: sentences are truncated to at most this many tokens
                and padded up to it.
            vocabulary: path to a pickled word->index dict (util.read_pickle).
            config: optional path to a label-mapping file. When given, each
                line is expected as "<label> <> <text>"; when None, each line
                is plain unlabeled text.

        Returns:
            Tuple (x, x_text_len, contents, labels, y_text):
            x — np.array of word indices, shape (n_lines, max_length);
            x_text_len — np.array of pre-padding (truncated) lengths;
            contents — the raw lines as read;
            labels — raw label strings (empty list when config is None);
            y_text — np.array of label ids, or None when config is None.
        """
        contents = util.read_txt(test_file)
        # Materialize once; the original `[line for line in contents]`
        # was a needless copy-comprehension.
        lines = list(contents)
        labels = []
        x_text = []
        x_text_len = []
        y_text = None
        if config is None:
            # Unlabeled input: each line is plain text.
            for line in lines:
                tokens = line.split()[:max_length]
                x_text.append(tokens)
                x_text_len.append(len(tokens))
        else:
            y = []
            label_dict = util.read_txt_to_dict(config)
            # Labeled input: "<label> <> <text>" per line.
            for line in lines:
                parts = line.split(' <> ')
                tokens = parts[1].split()[:max_length]
                x_text.append(tokens)
                x_text_len.append(len(tokens))
                labels.append(parts[0])
                y.append(label_dict[parts[0].strip()])
            y_text = np.array(y)

        sentences_padded = util.pad_sentences(x_text, max_length)
        vocabulary = util.read_pickle(vocabulary)
        # Out-of-vocabulary words map to index 0.
        x = np.array([[vocabulary.get(word, 0) for word in sentence]
                      for sentence in sentences_padded])
        x_text_len = np.array(x_text_len)

        return x, x_text_len, contents, labels, y_text
# Example #2
# 0
 def get_voc_idx(self, ques, rela):
     """
     Pad question/relation sentences and map words to vocabulary indices.

     Args:
         ques: sequence of question sentence groups; each element is passed
             to util.pad_sentences with self.max_sent_len.
         rela: sequence of lists of relation sentence groups (one nesting
             level deeper than ques).

     Returns:
         Tuple (question_indices, relation_indices) mirroring the input
         nesting, with every word replaced by its id in self.word_dict
         (the "unk" id for out-of-vocabulary words).

     Side effects:
         Stores the padded inputs on self.ques_pad / self.rela_pad as lists.
     """
     # Pad to the configured maximum sentence length.
     pad = lambda sents: util.pad_sentences(sents, self.max_sent_len)
     # Materialize as lists: in Python 3, map() returns a lazy single-use
     # iterator, so the original `self.ques_pad = map(pad, ques)` would be
     # consumed after one pass.
     self.ques_pad = [pad(q) for q in ques]
     self.rela_pad = [[pad(r) for r in rel] for rel in rela]
     # dict.has_key was removed in Python 3; dict.get with a default is the
     # equivalent membership-or-fallback lookup.
     unk = self.word_dict["unk"]
     to_idx = lambda words: [self.word_dict.get(word, unk) for word in words]
     return ([to_idx(group) for group in self.ques_pad],
             [[to_idx(group) for group in groups] for groups in self.rela_pad])
 def load_data(self, train_file, config, max_length, vocabulary=None):
     """
     Load and preprocess training data.

     Returns the list [x, x_len, y, vocabulary, vocabulary_inv, n_class]:
     input index vectors, sentence lengths, labels, the vocabulary maps
     (vocabulary_inv is None when a vocabulary was supplied), and the
     number of classes.
     """
     sentences, sen_lens, labels, n_class = self.load_data_and_labels(
         train_file, config, max_length)
     padded = util.pad_sentences(sentences, max_length)
     # Build a vocabulary from the training data unless one was supplied.
     vocabulary_inv = None
     if vocabulary is None:
         vocabulary, vocabulary_inv = self.build_vocab(padded)
     x, y = self.build_input_data(padded, labels, vocabulary)
     return [x, np.array(sen_lens), y, vocabulary, vocabulary_inv, n_class]
#x_text = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&     C's apply 08452810075over18's", "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."]

# Sample messages to classify (one ham, one spam).
x_text = [
    "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.",
    "SMS SERVICES. for your inclusive text credits, pls goto www.comuk.net login= 3qxj9 unsubscribe wit     h STOP, no extra charge. help 08702840625.COMUK. 220-CM2 9AE"
]

#labels, sentences = get_data_and_labels(lines)

# Character-level features, clipped/padded to the training sequence length.
lines_chars_level_features = generate_char_level_features(
    x_text, params['max_chars_features'])
lines_chars_level_features = np.array(lines_chars_level_features)

seq_len = params['max_chars_features']

x = pad_sentences(lines_chars_level_features,
                  max_sequence_length=seq_len,
                  is_max_sequence_length_modifiable=False)
x = text_to_sequence(x, vocabulary)

print("Generate predictions")
predictions = model.predict(x)
# zip() pairs each text with its predicted spam probability; the original
# manual `count` counter loop was an un-Pythonic index idiom.
for text, prob in zip(x_text, predictions):
    print("Text is: \t", text)
    if prob > 0.5:
        print("predicted spam with spam prob ", prob)
    else:
        print("predicted ham with spam prob ", prob)
# Example #5
# 0
# Character-level features; widen max_chars_features to the longest sample
# so nothing gets truncated during padding below.
lines_chars_level_features = generate_char_level_features(
    sentences, params['max_chars_features'])
params['max_chars_features'] = max(
    len(feats) for feats in lines_chars_level_features)

lines_chars_level_features = np.array(lines_chars_level_features)

# Build vocabulary
print("Build the vocabulary")
vocabulary = build_vocab(lines_chars_level_features, max_vocab_size=10000)
#print(vocabulary)

# Pad sentence
print("Padding sentences...")
x_text = pad_sentences(lines_chars_level_features,
                       max_sequence_length=params['max_chars_features'])

seq_len = len(x_text[0])
print("The sequence length is: ", seq_len)

# Map every character of every sentence to its vocabulary index.
x = text_to_sequence(x_text, vocabulary)

# Shuffle inputs and labels with one shared permutation so pairs stay aligned.
#np.random.seed(1) #same shuffling each time
permutation = np.random.permutation(np.arange(len(labels)))
x = x[permutation]
labels = labels[permutation]
"""
## Build CNN model
"""