import numpy as np


def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Load the dataset and its labels.
    :param positive_data_file: file of positive examples
    :param negative_data_file: file of negative examples
    :return: x_text: [batch_size]
             y: [batch_size, 2]
    """
    # Load the data
    with open(positive_data_file, encoding='UTF-8') as f:
        positive_examples = f.readlines()
    positive_examples = [s.strip() for s in positive_examples]
    with open(negative_data_file, encoding='UTF-8') as f:
        negative_examples = f.readlines()
    negative_examples = [s.strip() for s in negative_examples]
    # Merge the two sets of examples
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    # Generate the labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], axis=0)
    return [x_text, y]
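# --- Usage sketch (not from the original code) ---
# A minimal check of load_data_and_labels. The rt-polarity file names below are
# placeholders, and clean_str is assumed to be defined in the same module.
def _demo_load_data():
    x_text, y = load_data_and_labels('rt-polarity.pos', 'rt-polarity.neg')
    assert len(x_text) == y.shape[0]  # one [neg, pos] one-hot row per sentence
    print('examples:', len(x_text), 'label shape:', y.shape)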
def sample():
    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)

    # Tokenize the test sentence and pad it to the longest training sentence
    test_sentence = "It was the best movie I have ever seen."
    test_sentence = get_tokens(clean_str(test_sentence))
    test_sentence, seq_len = data_to_token_ids([test_sentence], vocab_dict)
    test_sentence = test_sentence[0]
    test_sentence = test_sentence + ([PAD_ID] * (max(len(sentence)
                                     for sentence in X) - len(test_sentence)))
    test_sentence = np.array(test_sentence).reshape([1, -1])
    FLAGS.max_sequence_length = len(test_sentence[0])

    with tf.Session() as sess:
        model = create_model(sess, FLAGS)
        probability = model.step(sess, batch_X=test_sentence,
                                 batch_seq_lens=np.array(seq_len),
                                 forward_only=True, sampling=True)
        print(probability)
        print(np.argmax(probability))
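# --- Padding sketch (not from the original code) ---
# sample() pads a single test sentence up to the longest training sentence with
# PAD_ID. The helper below applies the same idea to a whole batch; PAD_ID is
# assumed to be the integer id reserved for padding in the vocabulary.
def _pad_batch(token_id_lists, pad_id=PAD_ID):
    max_len = max(len(ids) for ids in token_id_lists)
    return [ids + [pad_id] * (max_len - len(ids)) for ids in token_id_lists]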
def _build_vocab(input_file, output_dir):
    """
    Build a vocab from the corpus.

    Args:
      input_file: The processed corpus. Each line contains three fields:
        speaker id, utterance, and emotion label, separated by tabs.
      output_dir: Directory to which the vocab file is written.

    Returns:
      An ordered dict mapping each word to its id.
    """
    word_cnt = collections.Counter()
    with tf.io.gfile.GFile(input_file, mode='r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            _, u, _ = line.split('\t')
            u = data_utils.clean_str(u)
            word_cnt.update(u.split())

    sorted_items = word_cnt.most_common()
    vocab = collections.OrderedDict()
    vocab[special_words.PAD] = special_words.PAD_ID
    vocab[special_words.UNK] = special_words.UNK_ID
    for i, item in enumerate(sorted_items):
        vocab[item[0]] = i + 2  # 0: PAD, 1: UNK
    logging.info('Created vocab with %d words.', len(vocab))

    vocab_file = os.path.join(output_dir, 'vocab.txt')
    with tf.io.gfile.GFile(vocab_file, mode='w') as f:
        f.write('\n'.join(vocab.keys()))
    logging.info('Wrote vocab file to %s', vocab_file)
    return vocab
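# --- Companion sketch (not from the original code) ---
# _build_vocab writes one word per line, so a word's id is its line index. The
# reader below reverses that, assuming the same vocab.txt layout; _load_vocab
# is a hypothetical helper, not part of the original module.
def _load_vocab(vocab_file):
    vocab = collections.OrderedDict()
    with tf.io.gfile.GFile(vocab_file, mode='r') as f:
        for i, line in enumerate(f):
            vocab[line.rstrip('\n')] = i
    return vocab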
def remap_string(encoding, id_to_word):
    return [id_to_word[code] for code in encoding]


if __name__ == '__main__':
    newsgroups_train = fetch_20newsgroups(subset='train')
    train_targets = newsgroups_train.target
    target_names = newsgroups_train.target_names
    train_data = newsgroups_train.data
    newsgroups_test = fetch_20newsgroups(subset='test')
    test_data = newsgroups_test.data
    test_targets = newsgroups_test.target

    ## Preprocess the data
    clean_train_data = [clean_str(x) for x in train_data]
    max_seq_len = max([len(x.split(" ")) for x in clean_train_data])
    # TODO(malzantot): The line below does not work; it flattens the data into
    # an array of characters.
    # all_words = list(itertools.chain(*clean_train_data))
    all_words = []
    for doc in train_data:
        all_words.extend(doc.split(" "))
    counter = collections.Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    # Top words
    # print('Most frequent words:\n', count_pairs[0:10])
    print(count_pairs[2])
    words, cnts = list(zip(*count_pairs))
    # pdb.set_trace()
    freq_threshold = 20  # used to drop rare words
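    # --- Thresholding sketch (not from the original code) ---
    # freq_threshold is described above as "used to drop rare words"; one way
    # to apply it is shown below. The word_to_id layout (0 reserved for an
    # unknown token) is an assumption, not taken from the original script.
    kept_words = [w for w, c in count_pairs if c >= freq_threshold]
    word_to_id = {w: i + 1 for i, w in enumerate(kept_words)}  # 0: <unk>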
def _utterance_to_ids(utter, vocab):
    """Helper for converting an utterance (string) to a list of ids."""
    utter = data_utils.clean_str(utter)
    utter = utter.split()
    ids = [vocab.get(w, special_words.UNK_ID) for w in utter]
    return ids
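# --- Usage sketch (not from the original code) ---
# A tiny demonstration of _utterance_to_ids with a hand-built vocab; the ids
# follow the _build_vocab convention above (0: PAD, 1: UNK), and any token that
# clean_str produces but the vocab lacks maps to special_words.UNK_ID.
def _demo_utterance_to_ids():
    vocab = {'i': 2, 'love': 3, 'this': 4, 'movie': 5}
    print(_utterance_to_ids('I love this movie !', vocab))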