Example #1
def load_data_and_labels(positive_data_file, negative_data_file):
    """
    加载数据集和标签
    :param positive_data_file: 正例文件
    :param negative_data_file: 负例文件
    :return: x_text: [batch_size] y: [batch_size, 2]
    """
    # Load the data
    with open(positive_data_file, encoding='UTF-8') as f:
        positive_examples = f.readlines()
        positive_examples = [s.strip() for s in positive_examples]
    with open(negative_data_file, encoding='UTF-8') as f:
        negative_examples = f.readlines()
        negative_examples = [s.strip() for s in negative_examples]

    # Combine the positive and negative examples
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]

    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], axis=0)

    return [x_text, y]
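
A minimal usage sketch for the loader above (the file names and the 90/10 split are illustrative assumptions, not part of the original function; numpy is assumed to be imported as np):

x_text, y = load_data_and_labels('rt-polarity.pos', 'rt-polarity.neg')

# Shuffle sentences and labels together, then hold out a small dev split.
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = [x_text[i] for i in shuffle_indices]
y_shuffled = y[shuffle_indices]

dev_size = int(0.1 * len(y))
x_train, x_dev = x_shuffled[:-dev_size], x_shuffled[-dev_size:]
y_train, y_dev = y_shuffled[:-dev_size], y_shuffled[-dev_size:]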
Example #2
def sample():

    X, y = load_data_and_labels()
    vocab_list, vocab_dict, rev_vocab_dict = create_vocabulary(
        X, FLAGS.en_vocab_size)
    X, seq_lens = data_to_token_ids(X, vocab_dict)

    test_sentence = "It was the best movie I have ever seen."
    test_sentence = get_tokens(clean_str(test_sentence))
    test_sentence, seq_len = data_to_token_ids([test_sentence], vocab_dict)
    test_sentence = test_sentence[0]
    test_sentence = test_sentence + ([PAD_ID] * (
        max(len(sentence) for sentence in X) - len(test_sentence)))
    test_sentence = np.array(test_sentence).reshape([1, -1])
    FLAGS.max_sequence_length = len(test_sentence[0])

    with tf.Session() as sess:
        model = create_model(sess, FLAGS)

        probability = model.step(sess,
                                 batch_X=test_sentence,
                                 batch_seq_lens=np.array(seq_len),
                                 forward_only=True,
                                 sampling=True)

        print(probability)
        print(np.argmax(probability))
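
A stand-alone sketch of the padding step used in sample() above, written separately so the logic is easier to follow; PAD_ID and the tokenized corpus X are carried over from the snippet and are assumptions here.

def pad_to_corpus_max_len(token_ids, corpus, pad_id):
    """Right-pad a list of token ids to the longest sentence in corpus."""
    max_len = max(len(sentence) for sentence in corpus)
    return token_ids + [pad_id] * (max_len - len(token_ids))

# Equivalent to the in-line padding above:
# test_sentence = pad_to_corpus_max_len(test_sentence, X, PAD_ID)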
Example #3
def _build_vocab(input_file, output_dir):
  """ Build a vocab according to the corpus.

  Args:
    input_file: The processed corpus. Each line contains three tab-separated
      fields: speaker id, utterance, and emotion label.
    output_dir: Directory in which the vocab file is written.

  Returns:
    An ordered dict mapping each word to its id.
  """
  word_cnt = collections.Counter()

  with tf.io.gfile.GFile(input_file, mode='r') as f:
    for line in f:
      line = line.strip()
      if not line:
        continue
      _, u, _ = line.split('\t')
      u = data_utils.clean_str(u)
      word_cnt.update(u.split())

  sorted_items = word_cnt.most_common()

  vocab = collections.OrderedDict()
  vocab[special_words.PAD] = special_words.PAD_ID
  vocab[special_words.UNK] = special_words.UNK_ID

  for i, item in enumerate(sorted_items):
    vocab[item[0]] = i + 2  # 0: PAD, 1: UNK
  
  logging.info('Create vocab with %d words.', len(vocab))

  vocab_file = os.path.join(output_dir, 'vocab.txt')
  with tf.io.gfile.GFile(vocab_file, mode='w') as f:
    f.write('\n'.join(vocab.keys()))

  logging.info('Wrote vocab file to %s', vocab_file)

  return vocab
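
A hedged companion sketch that reads vocab.txt back into the same word-to-id mapping; it only assumes the layout produced above (one token per line, with the zero-based line number as the id).

def _load_vocab(vocab_file):
  """Reads a vocab file written by _build_vocab back into an OrderedDict."""
  vocab = collections.OrderedDict()
  with tf.io.gfile.GFile(vocab_file, mode='r') as f:
    for i, line in enumerate(f):
      vocab[line.rstrip('\n')] = i
  return vocab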
Example #4
def remap_string(encoding, id_to_word):
    return [id_to_word[code] for code in encoding]

if __name__ == '__main__':
    newsgroups_train = fetch_20newsgroups(subset='train')
    train_targets = newsgroups_train.target
    target_names = newsgroups_train.target_names
    train_data = newsgroups_train.data

    newsgroups_test = fetch_20newsgroups(subset='test')
    test_data = newsgroups_test.data
    test_targets = newsgroups_test.target

    ## Preprocess the data
    clean_train_data = [clean_str(x) for x in train_data]
    max_seq_len = max([len(x.split(" ")) for x in clean_train_data])
    # TODO(malzantot): The line below does not work; it flattens the data into an array of chars.
    # all_words = list(itertools.chain(*clean_train_data))
    all_words = []
    for doc in clean_train_data:  # count words in the cleaned training data
        all_words.extend(doc.split(" "))
    counter = collections.Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    # TOP words
    # print('Most frequent words:\n', count_pairs[0:10])
    print(count_pairs[2])
    words, cnts = list(zip(*count_pairs))
    #pdb.set_trace()

    freq_threshold = 20  # used to drop rare words
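
One possible continuation of the preprocessing above, shown only as an illustrative sketch: keep words whose count meets freq_threshold and map everything else to an unknown token (the '<unk>' name and its id are assumptions, not from the original snippet).

    # Build a word-to-id mapping for sufficiently frequent words.
    vocab = {'<unk>': 0}
    for word, cnt in count_pairs:
        if cnt >= freq_threshold:
            vocab[word] = len(vocab)

    # Encode each cleaned document, falling back to '<unk>' for rare words.
    train_ids = [[vocab.get(w, vocab['<unk>']) for w in doc.split(" ")]
                 for doc in clean_train_data]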
Example #5
def _utterance_to_ids(utter, vocab):
  """Helper for converting an utterance (string) to a list of ids."""
  utter = data_utils.clean_str(utter)
  utter = utter.split()
  ids = [vocab.get(w, special_words.UNK_ID) for w in utter]
  return ids
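
A minimal usage sketch with a toy vocabulary; the 0/1 ids mirror the PAD_ID/UNK_ID convention from Example #3 and are assumptions here, and the exact output depends on how data_utils.clean_str tokenizes the text.

toy_vocab = {'<pad>': 0, '<unk>': 1, 'it': 2, 'was': 3, 'a': 4, 'great': 5, 'movie': 6}
ids = _utterance_to_ids("It was a truly great movie!", toy_vocab)
# 'truly' is not in toy_vocab, so it falls back to special_words.UNK_ID.
print(ids)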