Beispiel #1
0
# Load the token -> index mapping stored as JSON.
with open('/content/auth_id/tokenToIndex', 'r') as f:
    try:
        wordToIndex = json.load(f)
    # json.load raises ValueError when the file is empty (or not valid
    # JSON); start from an empty mapping in that case.
    except ValueError:
        wordToIndex = {}


# Alternative loaders, currently disabled:
#auth_sent_num = fdt.file2auth_sent_num(data_path)  # read in the training data
#auth_sentbundle_num = fdt.file2auth_sentbundle_num(data_path, 3)[1:1000]

auth_news_num = fdt.file2auth_news_num(data_path)

# Shuffle the sample order with a random permutation of the indices.
ind = np.arange(len(auth_news_num))
np.random.shuffle(ind)
index = ind
raw_data = [auth_news_num[i] for i in index]

# Convert the shuffled samples with the vocabulary, then group them into
# bundles of `batch_size`.
# NOTE(review): exact output layout depends on the rmb helpers -- confirm there.
batch_list = rmb.process_word2num_noglove(raw_data, wordToIndex, max_sent_num, max_sent_length)

batch_list_bundle = rmb.pack_batch_list(batch_list, batch_size)

# The context manager guarantees the file is closed even if pickling fails.
# HIGHEST_PROTOCOL is what the former literal -1 selected.
with open('/content/auth_id/data_sentence_index.pkl', 'wb') as output:
    pickle.dump(batch_list_bundle, output, pickle.HIGHEST_PROTOCOL)

# Parenthesised form prints the same text on Python 2 and parses on Python 3.
print("Success!")

#print batch_list
Beispiel #2
0
    try:
        wordToIndex = json.load(f)
    # if the file is empty the ValueError will be thrown
    except ValueError:
        wordToIndex = {}

#auth_sent_num = fdt.file2auth_sent_num(data_path)  # read in the training data
#auth_sentbundle_num = fdt.file2auth_sentbundle_num(data_path, 3)[1:1000]

# Read 1000 sample pairs from the data file.
pair_list = fdt.file2pair(data_path, sample_num=1000)

# Shuffle the pair order with a random permutation of the indices.
ind = np.arange(len(pair_list))
np.random.shuffle(ind)
index = ind
raw_data = [pair_list[i] for i in index]

# Convert every entry independently using the vocabulary.
# NOTE(review): exact output layout depends on the rmb helpers -- confirm there.
batch_list = [
    rmb.process_word2num_noglove(article, wordToIndex,
                                 max_sent_num, max_sent_length)
    for article in raw_data
]

pair_list_bundle = rmb.pack_pair_list(batch_list, batch_size)

# The context manager guarantees the file is closed even if pickling fails.
# HIGHEST_PROTOCOL is what the former literal -1 selected.
with open('../../data/batch_data/C50/data_sentence_pair.pkl', 'wb') as output:
    pickle.dump(pair_list_bundle, output, pickle.HIGHEST_PROTOCOL)

# Parenthesised form prints the same text on Python 2 and parses on Python 3.
print("Success!")

#print batch_list