import json
import pickle

import numpy as np

# fdt and rmb are project-local helper modules (file-to-data tools and
# minibatch utilities); data_path, max_sent_num, max_sent_length and
# batch_size are assumed to be defined earlier in the script.

# Load the token-to-index vocabulary built in the previous step.
with open('/content/auth_id/tokenToIndex', 'r') as f:
    try:
        wordToIndex = json.load(f)
    except ValueError:  # thrown if the file is empty
        wordToIndex = {}

# Read in the training data.
#auth_sent_num = fdt.file2auth_sent_num(data_path)
#auth_sentbundle_num = fdt.file2auth_sentbundle_num(data_path, 3)[1:1000]
auth_news_num = fdt.file2auth_news_num(data_path)

# Shuffle the articles before batching.
index = np.arange(len(auth_news_num))
np.random.shuffle(index)
raw_data = [auth_news_num[i] for i in index]

# Map words to vocabulary indices and pack the result into batches.
batch_list = rmb.process_word2num_noglove(raw_data, wordToIndex,
                                          max_sent_num, max_sent_length)
batch_list_bundle = rmb.pack_batch_list(batch_list, batch_size)

with open('/content/auth_id/data_sentence_index.pkl', 'wb') as output:
    pickle.dump(batch_list_bundle, output, -1)
print("Success!")
#print(batch_list)
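# A minimal sketch of how the pickled bundle could be read back later,
# e.g. by the training script; only pickle.load is assumed here, and the
# variable names are illustrative, not part of the original code.
with open('/content/auth_id/data_sentence_index.pkl', 'rb') as pkl_file:
    loaded_bundle = pickle.load(pkl_file)
# Each element of loaded_bundle is presumably one minibatch of batch_size
# articles, ready to be fed to the model.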
# Variant for sentence-pair data: the same preprocessing, but each element
# is a pair of articles and batching is done with pack_pair_list.
# The original source dropped the enclosing `with open(...)` line; the same
# vocabulary file as above is assumed here.
with open('/content/auth_id/tokenToIndex', 'r') as f:
    try:
        wordToIndex = json.load(f)
    except ValueError:  # thrown if the file is empty
        wordToIndex = {}

# Read in the training data.
#auth_sent_num = fdt.file2auth_sent_num(data_path)
#auth_sentbundle_num = fdt.file2auth_sentbundle_num(data_path, 3)[1:1000]
pair_list = fdt.file2pair(data_path, sample_num=1000)

# Shuffle the pairs before batching.
index = np.arange(len(pair_list))
np.random.shuffle(index)
raw_data = [pair_list[i] for i in index]

# Convert each pair to vocabulary indices, then bundle into batches.
batch_list = []
for article_ind in range(len(raw_data)):
    batch_list.append(
        rmb.process_word2num_noglove(raw_data[article_ind], wordToIndex,
                                     max_sent_num, max_sent_length))
pair_list_bundle = rmb.pack_pair_list(batch_list, batch_size)

with open('../../data/batch_data/C50/data_sentence_pair.pkl', 'wb') as output:
    pickle.dump(pair_list_bundle, output, -1)
print("Success!")
#print(batch_list)
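# For reference, a hypothetical stand-in for the rmb.pack_* helpers,
# assuming they group consecutive items into chunks of batch_size; the
# real implementations may differ (e.g. padding, or dropping a short
# final batch).
def pack_into_batches(items, batch_size):
    """Group a flat list into consecutive batches of at most batch_size."""
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

# Example: pack_into_batches(list(range(5)), 2) -> [[0, 1], [2, 3], [4]]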