def build_keras_input():
    """Return ``(data, W)``, loading both from ``./tmp`` when already cached.

    ``data`` is the tuple ``(idx_data, valence, arousal)`` and ``W`` is the
    first value returned by ``build_embedding_matrix``.  When both pickle
    files exist they are loaded and returned directly; otherwise the pair is
    rebuilt from the CVAT csv and the word vectors, written to the two pickle
    files, and returned.
    """
    data_path = './tmp/indexed_data.p'
    weight_path = './tmp/Weight.p'

    # Fast path: the cache is only used when *both* files are present.
    cache_ready = os.path.isfile(data_path) and os.path.isfile(weight_path)
    if cache_ready:
        cached_data = load_pickle(data_path)
        cached_weights = load_pickle(weight_path)
        print('Load OK.')
        return cached_data, cached_weights

    # Slow path: read the corpus csv and collect its vocabulary.
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')
    vocab = get_vocab(texts)

    # Word vectors are loaded with the 'zh' option from a local text file
    # (described as word2vec vectors by the original comment).
    embeddings = load_embeddings(
        'zh',
        '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt',
    )
    embeddings = add_unknown_words(embeddings, vocab)
    W, word_idx_map = build_embedding_matrix(embeddings, vocab)

    # Convert the texts with the word -> index map, then bundle the labels.
    indexed_texts = make_idx_data(texts, word_idx_map)
    data = (indexed_texts, valence, arousal)

    # Persist both results so the next call takes the fast path.
    dump_picle(data, data_path)
    dump_picle(W, weight_path)
    return data, W
def build_keras_input():
    """Build, or reload from disk, the inputs handed to the Keras model.

    Returns:
        A ``(data, W)`` tuple, where ``data`` is
        ``(idx_data, valence, arousal)`` and ``W`` is the first value
        returned by ``build_embedding_matrix``.

    Two pickle files under ``./tmp`` act as a cache: if both are present
    their contents are returned as-is, otherwise everything is recomputed
    and both files are (re)written before returning.
    """
    cache_data, cache_w = './tmp/indexed_data.p', './tmp/Weight.p'

    # Both files must exist for the cached pair to be trusted.
    if all(os.path.isfile(path) for path in (cache_data, cache_w)):
        loaded = load_pickle(cache_data)
        weights = load_pickle(cache_w)
        print('Load OK.')
        return loaded, weights

    # No usable cache: start from the raw corpus csv.
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')
    vocabulary = get_vocab(texts)

    # Vectors come from a local text file, loaded with the 'zh' option.
    vector_file = '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt'
    vectors = add_unknown_words(load_embeddings('zh', vector_file), vocabulary)

    W, word_to_index = build_embedding_matrix(vectors, vocabulary)
    data = (make_idx_data(texts, word_to_index), valence, arousal)

    # Write the cache files in the same order the original did.
    dump_picle(data, cache_data)
    dump_picle(W, cache_w)
    return data, W
# NOTE(review): this line is a fragment whose line breaks have been lost. The `def`
# header that supplies `data` and `title` is not visible here, and as laid out only
# `plt.figure import matplotlib.mlab as mlab` precedes the first `#`.
# Intended statements, in order: take the mean and standard deviation of `data`; draw
# a 20-bin histogram; overlay `mlab.normpdf(bins, mu, sigma)` as a dashed line; set the
# axis labels and a title ending in the mu/sigma values; widen the left margin; enable
# the grid; show the figure. Under the `__main__` guard: load texts/valence/arousal with
# `load_CVAT_2` and pass valence and arousal to `draw_scatter`.
# NOTE(review): `plt.figure` is referenced but never called, so it has no effect.
# NOTE(review): `normed=` on `plt.hist` and `mlab.normpdf` are, per the Matplotlib 3.1
# API-change notes, no longer available -- confirm against the installed version
# (`density=True` and `scipy.stats.norm.pdf` are the documented replacements).
plt.figure import matplotlib.mlab as mlab # example data mu = np.mean(data) # mean of distribution sigma = np.std((data)) # standard deviation of distribution num_bins = 20 # the histogram of the data n, bins, patches = plt.hist(data, num_bins, normed=1, facecolor='#78A5A3') # add a 'best fit' line y = mlab.normpdf(bins, mu, sigma) plt.plot(bins, y, '--', color="#CE5A57") plt.xlabel('Absolute Error') plt.ylabel('Frequency') plt.title(title + r'$\mu=%.3f$, $\sigma=%.3f$' % (mu, sigma)) # Tweak spacing to prevent clipping of ylabel plt.subplots_adjust(left=0.15) plt.grid(True) plt.show() # draw_hist(np.array([2,3,2,3,1,5,4,3,2,3,2,3,2,1,4,2,3,5,2]),'few') # exit() if __name__ == '__main__': from load_data import load_CVAT_2 texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.0).csv') draw_scatter(valence, arousal, 'Valence', 'Arousal')
# NOTE(review): this line is a fragment whose line breaks have been lost. It begins
# with `#`, so as laid out the whole line is comment text, and the `def` header that
# supplies `data` and `title` is not visible here.
# Intended statements, in order: take the mean and standard deviation of `data`; draw
# a 20-bin histogram; overlay `mlab.normpdf(bins, mu, sigma)` as a dashed line; set the
# axis labels and a title ending in the mu/sigma values; widen the left margin; enable
# the grid; show the figure. Under the `__main__` guard: load texts/valence/arousal with
# `load_CVAT_2` from './resources/corpus 2009 sigma 1.5.csv' (the CVAT2.0 sigma=1.0 call
# is commented out) and pass valence and arousal to `draw_scatter`.
# NOTE(review): `normed=` on `plt.hist` and `mlab.normpdf` are, per the Matplotlib 3.1
# API-change notes, no longer available -- confirm against the installed version
# (`density=True` and `scipy.stats.norm.pdf` are the documented replacements).
# example data mu = np.mean(data) # mean of distribution sigma = np.std((data)) # standard deviation of distribution num_bins = 20 # the histogram of the data n, bins, patches = plt.hist(data, num_bins, normed=1, facecolor='#78A5A3') # add a 'best fit' line y = mlab.normpdf(bins, mu, sigma) plt.plot(bins, y, '--', color="#CE5A57") plt.xlabel('Absolute Error') plt.ylabel('Frequency') plt.title(title + r'$\mu=%.3f$, $\sigma=%.3f$' % (mu, sigma)) # Tweak spacing to prevent clipping of ylabel plt.subplots_adjust(left=0.15) plt.grid(True) plt.show() # draw_hist(np.array([2,3,2,3,1,5,4,3,2,3,2,3,2,1,4,2,3,5,2]),'few') # exit() if __name__ == '__main__': from load_data import load_CVAT_2 # texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.0).csv') texts, valence, arousal = load_CVAT_2('./resources/corpus 2009 sigma 1.5.csv') draw_scatter(valence, arousal, 'Valence', 'Arousal')
# NOTE(review): this line is a fragment whose line breaks have been lost. It begins
# with `#`, so as laid out the whole line is comment text, and the `def` header that
# supplies `data` and `title` is not visible here.
# Intended statements, in order: take the mean and standard deviation of `data`; draw
# a 20-bin histogram; overlay `mlab.normpdf(bins, mu, sigma)` as a dashed line; set the
# axis labels and a title ending in the mu/sigma values; widen the left margin; enable
# the grid; show the figure. Under the `__main__` guard: load texts/valence/arousal with
# `load_CVAT_2` from "./resources/CVAT2.0(sigma=1.0).csv" and pass valence and arousal
# to `draw_scatter`.
# NOTE(review): `normed=` on `plt.hist` and `mlab.normpdf` are, per the Matplotlib 3.1
# API-change notes, no longer available -- confirm against the installed version
# (`density=True` and `scipy.stats.norm.pdf` are the documented replacements).
# example data mu = np.mean(data) # mean of distribution sigma = np.std((data)) # standard deviation of distribution num_bins = 20 # the histogram of the data n, bins, patches = plt.hist(data, num_bins, normed=1, facecolor="#78A5A3") # add a 'best fit' line y = mlab.normpdf(bins, mu, sigma) plt.plot(bins, y, "--", color="#CE5A57") plt.xlabel("Absolute Error") plt.ylabel("Frequency") plt.title(title + r"$\mu=%.3f$, $\sigma=%.3f$" % (mu, sigma)) # Tweak spacing to prevent clipping of ylabel plt.subplots_adjust(left=0.15) plt.grid(True) plt.show() # draw_hist(np.array([2,3,2,3,1,5,4,3,2,3,2,3,2,1,4,2,3,5,2]),'few') # exit() if __name__ == "__main__": from load_data import load_CVAT_2 texts, valence, arousal = load_CVAT_2("./resources/CVAT2.0(sigma=1.0).csv") draw_scatter(valence, arousal, "Valence", "Arousal")
# Script: load the CVAT texts, run each one through CKIP_tokenizer.segsentence,
# join the returned pieces with single spaces, and pickle the resulting list.
from load_data import load_CVAT_2

filename = './resources/CVAT (utf-8).csv'
texts, valence, arousal = load_CVAT_2(filename, categorical="all")
len_text = []  # never filled in this script; kept as a module-level name

from CKIP_tokenizer import segsentence

out = []
for position, sentence in enumerate(texts):
    # Print the running index so progress is visible while segmenting.
    print(position)
    pieces = segsentence(sentence)
    out.append(" ".join(pieces))

from save_data import dump_picle

# Persist the space-joined texts, then report completion.
dump_picle(out, "tokenized_texts_(newest3.31).p")
print("The tokenized text is saved.")
# NOTE(review): this line is a fragment whose line breaks have been lost. It begins
# with `#`, so as laid out the whole line is comment text, and the `def` header that
# supplies `data` and `title` is not visible here.
# Intended statements, in order: take the mean and standard deviation of `data`; draw
# a 20-bin histogram; overlay `mlab.normpdf(bins, mu, sigma)` as a dashed line; set the
# axis labels and a title ending in the mu/sigma values; widen the left margin; enable
# the grid; show the figure. Under the `__main__` guard: load texts/valence/arousal with
# `load_CVAT_2` from './resources/corpus 2009 sigma 1.5.csv' (the CVAT2.0 sigma=1.0 call
# is commented out) and pass valence and arousal to `draw_scatter`.
# NOTE(review): `normed=` on `plt.hist` and `mlab.normpdf` are, per the Matplotlib 3.1
# API-change notes, no longer available -- confirm against the installed version
# (`density=True` and `scipy.stats.norm.pdf` are the documented replacements).
# example data mu = np.mean(data) # mean of distribution sigma = np.std((data)) # standard deviation of distribution num_bins = 20 # the histogram of the data n, bins, patches = plt.hist(data, num_bins, normed=1, facecolor='#78A5A3') # add a 'best fit' line y = mlab.normpdf(bins, mu, sigma) plt.plot(bins, y, '--', color="#CE5A57") plt.xlabel('Absolute Error') plt.ylabel('Frequency') plt.title(title + r'$\mu=%.3f$, $\sigma=%.3f$' % (mu, sigma)) # Tweak spacing to prevent clipping of ylabel plt.subplots_adjust(left=0.15) plt.grid(True) plt.show() # draw_hist(np.array([2,3,2,3,1,5,4,3,2,3,2,3,2,1,4,2,3,5,2]),'few') # exit() if __name__ == '__main__': from load_data import load_CVAT_2 # texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.0).csv') texts, valence, arousal = load_CVAT_2( './resources/corpus 2009 sigma 1.5.csv') draw_scatter(valence, arousal, 'Valence', 'Arousal')