def build_keras_input():
    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')

    vocab = get_vocab(texts)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    word_vecs = load_embeddings(
        'zh',
        '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt')

    # load glove vectors
    # word_vecs = load_embeddings(arg='glove')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    idx_data = make_idx_data(texts, word_idx_map)

    data = (idx_data, valence, arousal)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
def visual_pos_neg_vecs(amended_pos_path='./tmp/amended_pos.p', amended_neg_path='./tmp/amended_neg.p'):
    amended_pos = load_pickle(amended_pos_path)
    amended_neg = load_pickle(amended_neg_path)
    nb_pos, nb_neg = len(amended_pos), len(amended_neg)
    print('There are %s positive words, and %s negative words.' % (nb_pos, nb_neg))

    num = 500
    vecs = [v for v in list(amended_pos.values())[:num]] + [v for v in list(amended_neg.values())[:num]]
    vecs = np.array(vecs)
    print('The shape of vecs is: %s rows * %s columns.' % vecs.shape)

    reduced_vecs = t_sne(vecs)
    print('The shape of reduced vecs is: %s rows * %s columns.' % reduced_vecs.shape)

    for i, vec in enumerate(vecs):
        if i < num:  # positive words
            color = 'r'
        else:  # negative words
            color = 'b'
        plt.plot(reduced_vecs[i, 0], reduced_vecs[i, 1], marker='o', color=color, markersize=8)
    plt.show()
def build_data():
    positive_data = load_pickle('./tmp/amended_pos.p')
    negative_data = load_pickle('./tmp/amended_neg.p')
    X, Y = [], []
    for pos in positive_data.keys():
        X.append(positive_data[pos])
        Y.append(1)
    for neg in negative_data.keys():
        X.append(negative_data[neg])
        Y.append(0)
    return np.array(X), np.array(Y)
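# Usage sketch (illustrative only): build_data() yields word vectors X with binary polarity
# labels Y, so any standard classifier can be fit on them as a quick sanity check. The helper
# below is an assumption added for the example, not part of the original pipeline; it reuses
# sklearn's linear_model and cross_validation, which this project imports elsewhere.
def _example_polarity_classifier():
    from sklearn import cross_validation, linear_model
    X, Y = build_data()
    X = X.reshape(len(X), -1)  # flatten in case vectors were stored as (1, dim) rows
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=0)
    clf = linear_model.LogisticRegression()
    clf.fit(X_train, Y_train)
    print('Held-out polarity accuracy: %.3f' % clf.score(X_test, Y_test))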
def build_keras_input(texts, scores, test, new=True):
    dims = 300

    # texts, scores are dict type with keys: train, dev, devtest.
    keys = ["train", "dev", "devtest"]
    train, train_scores = texts[keys[0]], scores[keys[0]]
    dev, dev_scores = texts[keys[1]], scores[keys[1]]
    devtest, devtest_scores = texts[keys[2]], scores[keys[2]]

    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'
    test_filename = './tmp/test_data.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w) and not new:
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        test_data = load_pickle(test_filename)
        print('Use existing data. Load OK.')
        return (data, W, test_data)

    print("Construct new data.")
    # load data from pickle
    vocab = get_vocab(train)

    # using word2vec vectors
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('D:/Word_Embeddings/glove.840B.300d.txt.w2v')
    word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/glove.840B.300d.txt.w2v')
    # word_vecs = load_embeddings('/home/hs/Data/Word_Embeddings/word2vec_twitter_model/word2vec_twitter_model.bin',
    #                             binary=True)

    word_vecs = add_unknown_words(word_vecs, vocab, k=dims)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab, k=dims)

    idx_data_train = make_idx_data(train, word_idx_map)
    idx_data_dev = make_idx_data(dev, word_idx_map)
    idx_data_devtest = make_idx_data(devtest, word_idx_map)
    idx_data_test = make_idx_data(test[2], word_idx_map)

    data = (idx_data_train, idx_data_dev, idx_data_devtest, train_scores, dev_scores, devtest_scores)
    test_data = (test[0], test[1], idx_data_test)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    dump_picle(test_data, test_filename)
    print("Saved: data and W are saved into: %s, and %s." % (filename_data, filename_w))

    return (data, W, test_data)
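# Usage sketch (illustrative): the tuples returned above are positional, so callers are
# expected to unpack them in the same order they were packed. The variable names below are
# assumptions added for readability, not identifiers from the original code.
# data, W, test_data = build_keras_input(texts, scores, test, new=True)
# idx_train, idx_dev, idx_devtest, train_scores, dev_scores, devtest_scores = data
# ids, topics, idx_test = test_data  # mirrors test_data = (test[0], test[1], idx_data_test)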
def cnn_Chinese(text=None):
    ########################### file_path ##############################
    embedding_matrix = './data/tmp/embedding_matrix_CVAT.p'
    word_idx_map = './data/tmp/word_idx_map_CVAT.p'
    cnn_model_weights_Valence = './data/tmp/CVAT_cnn_model_weights_Valence.hdf5'
    cnn_model_weights_Arousal = './data/tmp/CVAT_cnn_model_weights_Arousal.hdf5'
    #####################################################################
    request_text = text

    W = load_pickle(embedding_matrix)
    # print(len(W[1]))

    if request_text is None:
        request_text = '中文斷詞前言自然語言處理的其中一個重要環節就是中文斷詞的'
    # request_text = clean_str(request_text)
    # print(request_text)
    request_text = list(jieba.cut(request_text))

    word_idx_map = load_pickle(word_idx_map)
    idx_request_text = get_idx_from_sent(request_text, word_idx_map)
    print(idx_request_text)  # type: list
    max_len = len(idx_request_text)
    idx_request_text = np.array(idx_request_text).reshape((1, max_len))
    print(idx_request_text.shape)

    def cnn_model():
        N_fm = 400  # number of filters
        kernel_size = 8
        conv_input_height, conv_input_width = max_len, len(W[1])
        model = Sequential()
        model.add(Embedding(input_dim=W.shape[0], output_dim=W.shape[1], weights=[W],
                            W_constraint=unitnorm()))
        model.add(Reshape(dims=(1, conv_input_height, conv_input_width)))
        model.add(Convolution2D(nb_filter=N_fm, nb_row=kernel_size, nb_col=conv_input_width,
                                border_mode='valid', W_regularizer=l2(0.0001)))
        model.add(Activation("relu"))
        model.add(MaxPooling2D(pool_size=(conv_input_height - kernel_size + 1, 1), ignore_border=True))
        model.add(Flatten())
        model.add(Dropout(0.5))
        model.add(Dense(1))
        model.add(Activation('linear'))
        model.compile(loss='mse', optimizer='adagrad')
        return model

    model = cnn_model()
    model.load_weights(cnn_model_weights_Valence)
    valence = model.predict(idx_request_text)
    model.load_weights(cnn_model_weights_Arousal)
    arousal = model.predict(idx_request_text)
    return [valence[0], arousal[0]]
def build_amended_vectors(arg='word2vec'):
    # no prefix for word2vec pickles, 'GloVe_' prefix for GloVe pickles
    prefix = '' if arg == 'word2vec' else 'GloVe_'
    pos_vectors = load_pickle('./tmp/' + prefix + 'common_positive_words.p')
    neg_vectors = load_pickle('./tmp/' + prefix + 'common_negative_words.p')
    size = len(pos_vectors[list(pos_vectors.keys())[0]])
    print('The dimension of word vectors: %s.' % size)
    for k in pos_vectors:
        pos_vectors[k] = np.array(pos_vectors[k]).reshape((1, size))
    for k in neg_vectors:
        neg_vectors[k] = np.array(neg_vectors[k]).reshape((1, size))
    amended_pos, amended_neg = amend(pos_vectors, neg_vectors)
    dump_picle(amended_pos, './tmp/amended_' + prefix + 'pos.p')
    dump_picle(amended_neg, './tmp/amended_' + prefix + 'neg.p')
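# Usage sketch (illustrative): the same routine handles both embedding types; it reads the
# common_*_words pickles written earlier and writes the amended vectors back to ./tmp/.
# build_amended_vectors('word2vec')  # -> ./tmp/amended_pos.p, ./tmp/amended_neg.p
# build_amended_vectors('GloVe')     # -> ./tmp/amended_GloVe_pos.p, ./tmp/amended_GloVe_neg.p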
def cnn(text=None):
    request_text = text

    # Test
    [idx_data, ratings] = load_pickle('./data/corpus/vader/vader_processed_data_tweets.p')
    # print(idx_data[2])
    # print(ratings[2])

    W = load_pickle('./data/corpus/vader/embedding_matrix_tweets.p')
    # print(len(W[1]))

    if request_text is None:
        request_text = 'why you are not happy'
    request_text = clean_str(request_text)
    # print(request_text)

    word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_tweets.p')
    idx_request_text = get_idx_from_sent(request_text, word_idx_map)
    # print(idx_request_text)  # type: list
    max_len = len(idx_request_text)
    idx_request_text = np.array(idx_request_text).reshape((1, max_len))
    # print(idx_request_text.shape)

    def cnn_model():
        N_fm = 100  # number of filters
        kernel_size = 5
        conv_input_height, conv_input_width = max_len, len(W[1])
        model = Sequential()
        model.add(Embedding(input_dim=W.shape[0], output_dim=W.shape[1], weights=[W],
                            W_constraint=unitnorm()))
        model.add(Reshape(dims=(1, conv_input_height, conv_input_width)))
        model.add(Convolution2D(nb_filter=N_fm, nb_row=kernel_size, nb_col=conv_input_width,
                                border_mode='valid', W_regularizer=l2(0.0001)))
        model.add(Activation("relu"))
        model.add(MaxPooling2D(pool_size=(conv_input_height - kernel_size + 1, 1), ignore_border=True))
        model.add(Flatten())
        model.add(Dropout(0.5))
        model.add(Dense(1))
        model.add(Activation('linear'))
        sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True)  # defined but unused; adagrad is used below
        model.compile(loss='mse', optimizer='adagrad')
        return model

    model = cnn_model()
    model.load_weights('./data/corpus/vader/cnn_model_weights.hdf5')
    predict_value = model.predict(idx_request_text)
    return [predict_value[0], 5.0]
def result_analysis(filename):
    (param_grid, param_fitness) = load_pickle(filename)
    grid = ParameterGrid(param_grid)
    N = 10  # top-n
    top_n_ind = np.argsort(param_fitness)[::-1][:N]  # indices of the top-n fitness values
    for i in top_n_ind:
        print('Parameter setting: %s, acc: %s' % (str(list(grid)[i]), param_fitness[i]))
def build_amended_anew_vectors(words):
    filename = './tmp/retrofitted_anew_vectors_word2vec.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\word2vec_out_vec_file.txt")
    amended_pos = load_pickle('./tmp/retrofitted_word2vec_pos.p')
    amended_neg = load_pickle('./tmp/retrofitted_word2vec_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_anew_vectors(words):
    filename = './tmp/amended_anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    amended_pos = load_pickle('./tmp/amended_GloVe_pos.p')
    amended_neg = load_pickle('./tmp/amended_GloVe_neg.p')
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_amended_anew_vectors(words):
    filename = "./tmp/amended_anew_vectors.p"
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings("google_news", "/home/hs/Data/Word_Embeddings/google_news.bin")
    amended_pos = load_pickle("./tmp/amended_pos.p")
    amended_neg = load_pickle("./tmp/amended_neg.p")
    vecs = []
    for w in words:
        vec = amended_neg.get(w)
        if vec is None:
            vec = amended_pos.get(w)
        if vec is None:
            vec = model[w]
        vecs.append(vec)
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_keras_input_amended():
    filename_data, filename_w = './tmp/amended_indexed_data.p', './tmp/amended_Weight.p'

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)

    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    # word_vecs = load_embeddings('glove')

    # load amended word vectors
    word_vecs = load_embeddings('amended_word2vec')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
def build_ori_anew_vectors(words):
    filename = "./tmp/anew_vectors.p"
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings("google_news", "/home/hs/Data/Word_Embeddings/google_news.bin")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
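# Usage sketch (illustrative): both helpers above cache their result as a pickle in ./tmp/,
# so a second call only reloads the file. load_extend_anew() is reused from elsewhere in this
# project to supply the ANEW word list; everything else here is an assumption for the example.
# from load_data import load_extend_anew
#
# words, _, _ = load_extend_anew()
# ori_vecs = build_ori_anew_vectors(words)        # computed once, then cached
# amended_vecs = build_amended_anew_vectors(words)
# print(ori_vecs.shape, amended_vecs.shape)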
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\glove.6B.300d.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings('google_news', 'D:\Word_Embeddings\English\GoogleNews-vectors-negative300.bin')
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
def build_ori_anew_vectors(words):
    filename = './tmp/anew_vectors_retrofitted_glove.p'
    if os.path.isfile(filename):
        return load_pickle(filename)
    model = load_embeddings(arg='zh_tw', filename="D:\Word_Embeddings\English\glove.6B\GloVe_out_vec_file.txt")
    vecs = []
    for w in words:
        vecs.append(model[w])
    vecs = np.array(vecs)
    dump_picle(vecs, filename)
    return vecs
    # (continuation of the SVM classifier helper; its header precedes this excerpt)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('SVM classifier training complete, saved predict labels to pickle')
    return


def logit(train_data, train_labels, test):
    log_state('Use logistic regression classifier')
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('MaxEnt classifier training complete, saved predict labels to pickle')
    return


def kNN(train_data, train_labels, test):
    log_state('Use kNN classifier')
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(train_data, train_labels)
    predict_labels = clf.predict(test)
    dump_picle(predict_labels, './data/predict_labels/predict_labels.p')
    logger.info('kNN classifier training complete, saved predict labels to pickle')
    return


if __name__ == "__main__":
    train_data = load_pickle('./data/transformed_data/transformed_train.p')
    test = load_pickle('./data/transformed_data/transformed_test.p')
    _, train_labels = load_train_data()
    mNB(train_data, train_labels, test)
    # (continuation of the vocabulary-building helper: count word frequencies over the corpus)
    for sent in corpus:
        for word in sent:
            vocab[word] += 1
    print(len(vocab))
    return vocab


########################################## config ########################################
vec_dim = 400
###########################################################################################
corpus = load_corpus(get_file_path('cn_corpus'))
# print(corpus[:2])
# vocab = get_vocab(corpus)
# dump_picle(vocab, get_file_path('CVAT_Vocab'))
# print('OK')

vocab = load_pickle(get_file_path('CVAT_Vocab'))
# for i in vocab:
#     print(i)
# print(len(vocab))

# W, word_idx_map = build_embedding_matrix(load_embeddings('zh_tw'), vocab, k=400)
# dump_picle(word_idx_map, get_file_path('word_idx_map_CVAT'))
# print('dump word_idx_map successful')
# dump_picle(W, '/home/hs/Data/embedding_matrix_CVAT.p')
# print('OK')

word_idx_map = load_pickle(get_file_path('word_idx_map_CVAT'))
mark = load_mark(get_file_path('mark'))
valence, arousal = gold_valence_arousal(corpus, mark)
idx_data = make_idx_data(corpus, word_idx_map, max_len=200, kernel_size=5)
    # (continuation of the IMDB data-building helper)
    data = np.concatenate((pos_idx_data, neg_idx_data), axis=0)
    print(data.shape)
    return data, pos_length, neg_length


if __name__ == '__main__':
    ########################################## config ########################################
    file_dir = 'E:/研究/Data/IMDB/aclImdb/train/' if os.name == 'nt' else '/home/hs/Data/imdb/aclImdb/train/'
    vec_dim = 300
    ###########################################################################################

    # get vocab and save to pickle
    vocab = get_vocab(file_dir)
    dump_picle(vocab, './data/tmp/vocab.p')
    print('OK')
    a = load_pickle('./data/tmp/vocab.p')
    for i in a:
        print(i)
    print(len(a))
    exit()
    # end

    # make word index map
    vocab = load_pickle('./data/tmp/vocab.p')
    W, word_idx_map = build_embedding_matrix(load_embeddings('google_news'), vocab, k=300)
    dump_picle(word_idx_map, get_file_path('word_idx_map'))
    print('dump word_idx_map successful')
    dump_picle(W, '/home/hs/Data/embedding_matrix.p')
    print('OK')
    exit()
    # make word index map end
__author__ = 'NLP-PC'
from load_data import load_pickle
import nltk
from load_data import load_extend_anew

words, _, _ = load_extend_anew()
feature_names = load_pickle('./data/features/feature_names.p')
print(feature_names)

english_stemmer = nltk.stem.SnowballStemmer('english')
stemmed_dict = [english_stemmer.stem(w) for w in words]
print(len(stemmed_dict))

overlapping_words = (set(feature_names) & set(stemmed_dict))
print(len(overlapping_words))
print(english_stemmer.stem(''))

features = load_pickle('./data/transformed_data/transformed_train.p')
print(features[1, 249])
print(type(features))

d = 'We are very nice goes I am nicely'
sent = list(d.split())
print(sent)
stemmed_sent = [english_stemmer.stem(w) for w in sent]
print(stemmed_sent)
def get_randomized_speed_profiles(experiment, a, sessions, filterpath):
    avgSpeeds_allsess = []
    trialTypes_allsess = []

    # Select Session
    for s in sessions:

        # Load Valid Trial Filter (manually sorted)
        validFilename = filterpath + r'\valid_a' + str(a) + '_s' + str(s) + '.pickle'
        valid_trials = load_data.load_pickle(validFilename)
        numTrials = np.size(valid_trials)

        # There is some misalignment for sessions on the last 4 animals: their session 0 is the others' session 1
        if a >= 10:
            s = s - 1

        # Set Trial Labels
        labelfilter1 = {'state': 'stable'}
        labelfilter2 = {'state': 'unstable'}
        labelFilters = [labelfilter1, labelfilter2]

        # Look at all Trajectories
        trajectories = experiment[a][s].trajectories
        times = experiment[a][s].time
        slices = experiment[a][s].slices
        labels = experiment[a][s].labels
        steps = experiment[a][s].steps
        speeds = experiment[a][s].speeds

        print str.format('a:s {0}:{1} {2} {3}', a, s, numTrials, len(slices))

        # Set Valid Trials (no exploration or tracking errors)
        crossings = valid_trials

        # Set Binning and Range
        avgSpeeds = np.zeros((numTrials, numBins))
        trialTypes = np.zeros((numTrials, 1))
        for t in range(0, numTrials):
            # label_indices = np.array(pt.get_labeled_indices(labels, labelFilters[l]))
            c = crossings[t]

            # Load X Trajectories and flip all of 'Left'
            trialX = trajectories[slices[c], 0]
            if utils.is_dict_subset({'direction': 'left'}, labels[c]):
                # Align on 2 important rails: the center of rail 3 is 550 and the center of
                # rail 4 is 737, so the first encounter is at 550 going "right" and, when
                # flipped, at 1280 - 737 = 543 going "left". To correct for the shift,
                # subtract from 1273 and align the left and right trials.
                trialX = np.abs(trialX - 1273)

            # Load Y Trajectories
            trialY = trajectories[slices[c], 1]

            # Load and Parse Times
            trialTstrings = times[slices[c]]
            trialT = np.array([dateutil.parser.parse(timeString) for timeString in trialTstrings])

            # Measure Progression Speed
            diffX = np.diff(trialX)
            diffT = time_diff(trialT) / 1000000  # Time interval in seconds
            speedX = np.concatenate((np.zeros(1), diffX / diffT))

            # Find enter/exit and crop trials
            indR = np.where(trialX > 1200)
            indL = np.where(trialX < 150)
            if (np.size(indR) > 0) and (np.size(indL) > 0):
                exitInd = indR[0][0] + 1
                enterInd = indL[0][-1]
                trialX = trialX[enterInd:exitInd]
                trialY = trialY[enterInd:exitInd]
                speedX = speedX[enterInd:exitInd]

            # Bin (progression - X) Speed Profiles (from position 200 to 1200)
            for b in range(0, numBins):
                bins = np.where((trialX >= (200 + (b * binSize))) & (trialX < (200 + (b * binSize) + binSize)))
                if np.size(bins) > 0:
                    avgSpeeds[t, b] = np.mean(speedX[bins])
                else:
                    avgSpeeds[t, b] = np.NaN

            # Correct for starting speed -- first third of assay
            baseSpeed = stats.nanmean(avgSpeeds[t, 0:14])
            avgSpeeds[t, :] = avgSpeeds[t, :] / baseSpeed

            # Get Labels
            label = labels[c]
            if utils.is_dict_subset({'state': 'stable'}, label):
                trialTypes[t] = 0
            else:
                trialTypes[t] = 1

        # Pool All Average Speeds/TrialTypes Across Sessions
        avgSpeeds_allsess.append(avgSpeeds)
        trialTypes_allsess.append(trialTypes)

    avgSpeeds = np.concatenate(avgSpeeds_allsess)
    trialTypes = np.concatenate(trialTypes_allsess)
    return avgSpeeds, trialTypes
    # (continuation of analysis_result(predict, true, figure))
    f1 = f1_score(true, predict, average="binary")
    precision_binary, recall_binary, fbeta_score_binary, _ = precision_recall_fscore_support(
        true, predict, average="binary")
    accuracy = accuracy_score(true, predict)
    print("Accuracy: %.3f\nMacro-F score: %.3f" % (accuracy, f1))
    print("Precision: %.3f\nRecall: %.3f\nF score: %.3f" %
          (precision_binary, recall_binary, fbeta_score_binary))
    log_performance(accuracy, f1, precision_binary, recall_binary, len(true))

    if figure == False:
        return

    # plot the scores as a bar chart
    n_groups = 5
    values = (accuracy, f1, precision_binary, recall_binary, fbeta_score_binary)
    fig, ax = plt.subplots()
    index = np.arange(n_groups)
    bar_width = 0.35
    rects1 = plt.bar(index + bar_width / 2, values, bar_width, alpha=0.6, color="b")
    plt.xlabel("Result")
    plt.ylabel("Scores")
    plt.title("Experiment analysis")
    plt.xticks(index + bar_width, ("Accuracy", "F", "Precision", "Recall", "F"))
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    predict = load_pickle("./data/predict_labels/predict_labels.p")
    _, true_labels = load_test_data()
    analysis_result(predict, true_labels)
import process_trajectories as proctraj

plt.close('all')

# Set Base Path
base_path = r'C:/Users/gonca_000/Documents/Insync/[email protected]'
# base_path = r'C:\kampff\Insync\[email protected]'
# base_path = r'D:\kampff\Insync\[email protected]'
# base_path = r'C:\kampff\Insync'

# Set Figure Directory
saveDirectory = r'C:\Users\gonca_000\Desktop\All Trajectories'

# Load and process the 'pickled' Trajectories (Week 3 only)
if not 'experiment' in locals():
    experiment = load_data.load_pickle(base_path + r'\protocols\shuttling\data\trajectories_week3.pickle')

# Set Animal
# animals = [1, 4, 5, 12, 13, 2, 3, 10, 11, 8, 9, 6, 7]
# names = ['Ca', 'Lb', 'Cb', 'Lc', 'Cc', 'Ld', 'Cd', 'Le', 'Ce', 'Lf', 'Cf', 'Lg', 'Cg']

# Set Trial Validation Directory
validationpath = base_path + r'\protocols\shuttling\ARK\MC Lesion-Sham Analysis\Figures\Figure 3\Valid Trials'

############# Figure 3b - Example Profiles ######################
animals = [4, 5]
names = ['Lb', 'Cb']
sessions = [1, 2, 3]
profiles = [figutils.get_randomized_speed_profiles(experiment, a, sessions, validationpath)
            for a in animals]
[figutils.plot_randomized_speed_profiles(avgSpeeds, trialTypes)
 for avgSpeeds, trialTypes in profiles]
def keras_nn_input(word_vectors_model, amending):
    if word_vectors_model == 'word2vec':
        if amending == True:
            filename_data, filename_w = './tmp/amended_w2v_indexed_data.p', './tmp/amended_w2v_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/w2v_indexed_data.p', './tmp/w2v_Weight.p'
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            filename_data, filename_w = './tmp/amended_GloVe_indexed_data.p', './tmp/amended_GloVe_Weight.p'
        elif amending == False:
            filename_data, filename_w = './tmp/GloVe_indexed_data.p', './tmp/GloVe_Weight.p'
        else:
            raise Exception('Wrong!')
    else:
        raise Exception('Wrong parameter!')

    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK, parameters: word_vectors_model = %s, amending = %s' % (word_vectors_model, amending))
        return (data, W)

    # load data from pickle
    (x_train, y_train_valence, y_train_labels,
     x_test, y_test_valence, y_test_labels,
     x_valid, y_valid_valence, y_valid_labels,
     x_train_polarity, y_train_polarity,
     x_test_polarity, y_test_polarity,
     x_valid_polarity, y_valid_polarity) = load_sst(path='./resources/stanfordSentimentTreebank/')

    vocab = get_vocab(x_train)

    if word_vectors_model == 'word2vec':
        if amending == True:
            word_vecs = load_embeddings('amended_word2vec')
        elif amending == False:
            word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
        else:
            raise Exception('Wrong!')
    elif word_vectors_model == 'GloVe':
        if amending == True:
            word_vecs = load_embeddings('amended_glove')
        elif amending == False:
            word_vecs = load_embeddings('glove')
        else:
            raise Exception('Wrong!')
    else:
        raise Exception('Wrong parameter!')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    x_train_idx_data = make_idx_data(x_train, word_idx_map)
    x_test_idx_data = make_idx_data(x_test, word_idx_map)
    x_valid_idx_data = make_idx_data(x_valid, word_idx_map)
    x_train_polarity_idx_data = make_idx_data(x_train_polarity, word_idx_map)
    x_test_polarity_idx_data = make_idx_data(x_test_polarity, word_idx_map)
    x_valid_polarity_idx_data = make_idx_data(x_valid_polarity, word_idx_map)

    data = (x_train_idx_data, y_train_valence, y_train_labels,
            x_test_idx_data, y_test_valence, y_test_labels,
            x_valid_idx_data, y_valid_valence, y_valid_labels,
            x_train_polarity_idx_data, y_train_polarity,
            x_test_polarity_idx_data, y_test_polarity,
            x_valid_polarity_idx_data, y_valid_polarity)

    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    print('Load OK, parameters: word_vectors_model = %s, amending = %s' % (word_vectors_model, amending))
    return (data, W)
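# Usage sketch (illustrative): the four configurations the branches above accept.
# data, W = keras_nn_input('word2vec', amending=False)
# data, W = keras_nn_input('word2vec', amending=True)
# data, W = keras_nn_input('GloVe', amending=False)
# data, W = keras_nn_input('GloVe', amending=True)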
def convert(source_file):
    # re-save a pickle with protocol 2 so that Python 2.7 can read it
    s = load_pickle(source_file)
    dump_picle(s, str(source_file)[:-2] + '_v2.7.p', protocol=2)
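# Usage sketch (illustrative): re-save an existing pickle so older Python can load it.
# The path below is an assumption for the example only.
# convert('./tmp/Weight.p')  # writes './tmp/Weight_v2.7.p'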
import os
import pandas
import itertools
import load_data
import numpy as np
import video_player
import subprocess
import process_trajectories
import plot_utilities as pltutils
import matplotlib.pyplot as plt

if not 'data' in locals():
    # data = load_data.load_pickle(r'G:/Homework/trajectories.pickle')
    data = load_data.load_pickle(
        r'C:/Users/gonca_000/Documents/Insync/[email protected]/protocols/shuttling/data/trajectories_week1.pickle')
    process_trajectories.rebase_video_path(data, 'D:')

width_pixel_to_cm = 50.0 / 1280.0
frames_per_second = 120.0
crop = [100, 1100]

traj = data[0][4].trajectories
slices = process_trajectories.clump_trajectories(traj, crop)
trajectory_interval = [len(traj[x, 0]) / frames_per_second for x in slices[1:]]
progression_speed = [np.diff(traj[x, 0]) * width_pixel_to_cm * frames_per_second for x in slices[1:]]
average_speed = [np.mean(s) for s in progression_speed]
def load_dataset(pathlist, sessionslice=slice(None)):
    return [load_data.load_pickle(path)[sessionslice] for path in pathlist]
__author__ = 'NLP-PC'
from load_data import load_pickle
from file_name import get_file_path
from evaluate import evaluate

# evaluate the CNN predictions for each of the five news-article result pickles
for suffix in ['', '1', '2', '3', '4']:
    (Y_test, predict) = load_pickle('./data/corpus/vader/cnn_movie_news_articles%s.p' % suffix)
    print(Y_test)
    print(predict)
    evaluate(Y_test, predict, 'news_articles')
def analysis_preprocess():
    preprocessed = load_pickle('./data/acc/labeled_data.p')
    for id, i in enumerate(preprocessed):
        print('| %s | %s |' % (id, i))
def train(self, config):
    ##### Train DCGAN #####
    global_step = tf.Variable(0, name='global_step', trainable=False)
    global_step1 = tf.Variable(0, name='global_step1', trainable=False)

    g_optim = tf.train.AdamOptimizer(config.learning_rate, beta1=config.beta1) \
        .minimize(self.gen_loss, global_step=global_step, var_list=self.g_vars)

    if self.dis_loss:
        d_optim = tf.train.AdamOptimizer(config.learning_rate, beta1=config.beta1) \
            .minimize(self.d_loss, global_step=global_step1, var_list=self.d_vars)

    tf.initialize_all_variables().run()

    start_time = time.time()

    if self.load(self.checkpoint_dir):
        print(" [*] Load SUCCESS")
    else:
        print(" [!] Load failed...")

    # load training and validation dataset paths
    dataset = load_pickle()
    train_input = dataset['train_input']
    train_gt = dataset['train_gt']
    val_input = dataset['val_input']
    val_gt = dataset['val_gt']

    S = range(len(train_input))
    shuffle(S)
    SS = range(len(train_input[0]))
    shuffle(SS)
    list_val = [11, 16, 21, 22, 33, 36, 38, 53, 59, 92]

    if self.use_queue:
        # create threads to fill the input queue
        coord = tf.train.Coordinator()
        num_thread = 1
        for i in range(num_thread):
            t = threading.Thread(target=self.load_and_enqueue,
                                 args=(coord, train_input, train_gt, S, SS, i, num_thread))
            t.start()

    if self.use_queue:
        for epoch in xrange(config.epoch):
            # shuffle = np.random.permutation(range(len(data)))
            batch_idxs = min(len(train_input), config.train_size) / config.batch_size

            sum_L = 0.0
            sum_g = 0.0
            sum_ang = 0.0
            sum_low = 0.0
            sum_high = 0.0

            if epoch == 0:
                train_log = open(os.path.join("logs", 'train_%s.log' % config.dataset), 'w')
                val_log = open(os.path.join("logs", 'val_%s.log' % config.dataset), 'w')
            else:
                train_log = open(os.path.join("logs", 'train_%s.log' % config.dataset), 'a')
                val_log = open(os.path.join("logs", 'val_%s.log' % config.dataset), 'w')

            for idx in xrange(0, batch_idxs):
                start_time = time.time()
                if self.dis_loss:
                    _, d_loss_real, d_loss_fake = self.sess.run(
                        [d_optim, self.d_loss_real, self.d_loss_fake],
                        feed_dict={self.keep_prob: self.dropout})
                _, g_loss, ang_loss, L_loss, low_loss, high_loss = self.sess.run(
                    [g_optim, self.g_loss, self.ang_loss, self.L_loss, self.low_loss, self.high_loss],
                    feed_dict={self.keep_prob: self.dropout})

                print("Epoch: [%2d] [%4d/%4d] time: %4.4f g_loss: %.6f L_loss: %.4f ang_loss: %.6f low_loss: %.6f high_loss: %.6f"
                      % (epoch, idx, batch_idxs, time.time() - start_time, g_loss, L_loss, ang_loss, low_loss, high_loss))

                sum_L += L_loss
                sum_g += g_loss
                sum_ang += ang_loss
                sum_low += low_loss
                sum_high += high_loss

            train_log.write('epoch %06d mean_g %.6f mean_L %.6f mean_ang %.6f mean_low %.6f mean_high %.6f\n'
                            % (epoch, sum_g / batch_idxs, sum_L / batch_idxs, sum_ang / batch_idxs,
                               sum_low / batch_idxs, sum_high / batch_idxs))
            train_log.close()
            self.save(config.checkpoint_dir, global_step)

            """
            ####### Validation #########
            for idx2 in xrange(0, len(list_val)):
                for tilt in range(1, 10):
                    print("Epoch: [%2d] [%4d/%4d] " % (epoch, idx2, len(list_val)))
                    img = '/research2/IR_normal_small/save%03d/%d' % (list_val[idx2], tilt)
                    light = random.randint(1, 12)
                    input_ = scipy.misc.imread(img + '/%d3.bmp' % light).astype(float)
                    input_ = scipy.misc.imresize(input_, [600, 800])
                    input_ = input_ / 127.5 - 1.0
                    input_ = np.reshape(input_, [1, 600, 800, 1])
                    gt_ = scipy.misc.imread(img + '/12_Normal.bmp').astype(float)
                    gt_ = gt_ / 127.5 - 1.0
                    sample = self.sess.run([self.sample], feed_dict={self.ir_test: input_})
                    L1_loss = tf.reduce_mean(tf.square(tf.sub(sample, gt_)))
                    sum_L1 += L1_loss
            val_log.write('epoch %06d mean_L1 %.6f \n' % (epoch, sum_L1 / (len(range(1, 10)) * len(list_val))))
            val_log.close()
            """
    else:
        for epoch in xrange(config.epoch):
            # load training and validation dataset paths
            shuffle_ = np.random.permutation(range(len(data)))
            batch_idxs = min(len(data), config.train_size) / config.batch_size

            for idx in xrange(0, batch_idxs):
                start_time = time.time()
                batch_files = shuffle_[idx * config.batch_size:(idx + 1) * config.batch_size]
                batches = [get_image(datalist[batch_file], labellist[batch_file], self.image_size,
                                     np.random.randint(64, 224 - 64), np.random.randint(64, 224 - 64),
                                     is_crop=self.is_crop)
                           for batch_file in batch_files]
                batches = np.array(batches).astype(np.float32)
                batch_images = np.reshape(batches[:, :, :, 0], [config.batch_size, 64, 64, 1])
                batchlabel_images = np.reshape(batches[:, :, :, 1:], [config.batch_size, 64, 64, 3])
                # mask_mean = batch_mask * self.mean_nir
                # batch_images = batch_images - mask_mean

                # Update Normal D network
                # (note: summary_str is expected to be fetched in this run if it is written below)
                _ = self.sess.run([d_optim],
                                  feed_dict={self.ir_images: batch_images, self.normal_images: batchlabel_images})
                self.writer.add_summary(summary_str, global_step.eval())

                # Update NIR G network
                _, g_loss, L1_loss = self.sess.run([g_optim, self.g_loss, self.L1_loss],
                                                   feed_dict={self.ir_images: batch_images,
                                                              self.normal_images: batchlabel_images})

                print("Epoch: [%2d] [%4d/%4d] time: %4.4f g_loss: %.6f L1_loss: %.4f"
                      % (epoch, idx, batch_idxs, time.time() - start_time, g_loss, L1_loss))

            self.save(config.checkpoint_dir, global_step)
from load_data import load_pickle


def build_devtest_submit(gold, predict):
    out = []
    with open(gold) as gf:
        for line in gf:
            out.append(line)

    cut_offs = [0.2, 0.4, 0.6, 0.8]
    with open('./resources/predict_cnn_lstm.txt', 'w') as o:
        for i, line in enumerate(out):
            score = predict[i]
            if score > cut_offs[3]:
                s = 2
            elif score > cut_offs[2]:
                s = 1
            elif score > cut_offs[1]:
                s = 0
            elif score > cut_offs[0]:
                s = -1
            else:
                s = -2
            print(line[:-1], i, predict[i], s)
            o.write('\t'.join(line.split('\t')[0:2]) + '\t' + str(s) + '\n')


(_, predict) = load_pickle("./tmp/submit_cnn_lstm.p")
gold = './resources/devtest_gold_FILE.tsv'
build_devtest_submit(gold, predict)
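# The same cut-off mapping also appears in the submission script further below, so it could
# be factored into a small shared helper. This is only a sketch of that refactoring (the
# function name is an assumption), not code used by the original scripts.
def score_to_rating(score, cut_offs=(0.2, 0.4, 0.6, 0.8)):
    """Map a continuous score in [0, 1] to a five-point rating in {-2, -1, 0, 1, 2}."""
    for rating, cut in zip((2, 1, 0, -1), reversed(cut_offs)):
        if score > cut:
            return rating
    return -2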
        # (continuation of the nested VA_mean helper: average the lexicon valence of one word)
        for line in lexicon:
            if word == line:
                count = count + 1
                sum_valence = sum_valence + lexicon[line]
        return 5 if count == 0 else sum_valence / count

    for i, text in enumerate(corpus):
        V = VA_mean(text)
        valence_pred.append(V)
        valence_true.append(mark[i])
    print(valence_true[:200])
    print(valence_pred[:200])
    evaluate(valence_true, valence_pred, 'valence')


idfs = load_pickle('./data/vocab_idf.p')


def tfidf(t, d, D):
    d = d.split()
    tf = float(d.count(t)) / sum(d.count(w) for w in set(d))
    # idf = sp.log(float(len(D)) / (len([doc.split() for doc in D if t in doc.split()])))
    return tf * idfs[t]


def tf(t, d):
    d = d.split()
    tf = float(d.count(t)) / float(len(d))
    return tf
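# Quick illustration of the two weighting helpers above (assuming './data/vocab_idf.p'
# contains an idf entry for 'movie'); the sentence is made up for the example.
# doc = 'the movie was a good movie'
# print(tf('movie', doc))           # 2 / 6 = 0.333...
# print(tfidf('movie', doc, None))  # term frequency scaled by the precomputed idf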
import csv
import time

from load_data import load_pickle

(test, submit_predict) = load_pickle("./tmp/submit_cnn_valid.p")
ids, topics, texts = test
print(len(submit_predict))
exit()

ratings = []
cut_offs = [0.2, 0.4, 0.6, 0.8]
# cut_offs = [0.125, 0.375, 0.625, 0.875]
for score in submit_predict:
    if score > cut_offs[3]:
        s = 2
    elif score > cut_offs[2]:
        s = 1
    elif score > cut_offs[1]:
        s = 0
    elif score > cut_offs[0]:
        s = -1
    else:
        s = -2
    ratings.append(s)

for t in texts:
    print(t)

timestr = time.strftime("%Y%m%d-%H%M%S")
path = "./tmp/submit" + str(timestr) + ".csv"
    # (continuation of the regression evaluation helper)
    print(list(predict)[:100])
    print(Y_test[:100])
    evaluate(list(predict), np.array(Y_test),
             'linear regression ' + 'Explained variance score: %.2f' % regr.score(X_test, Y_test))


def cv(data, target):
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(data, target, test_size=0.2,
                                                                         random_state=10)
    linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='ordinary_least_squares')
    linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Ridge_Regression')
    linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='Bayesian_Regression')
    linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='SVR')
    linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='KNN_Reg')


def simple_evaluate(model):
    print('Vocabulary covered by the word vectors (first 200 entries):')
    print(list(model.vocab.keys())[:200])
    print('A peek at what a document vector looks like:')
    print(model.docvecs[0])
    print(model.docvecs['L_SENT_4'])
    print('most_similar: ')
    print(model.most_similar('awesome'))
    print(model.most_similar('bad'))


if __name__ == "__main__":
    run_build_docvecs()  # only needed the first time, to build and cache the doc vectors
    X, Y = load_pickle('./data/acc/twitter_docvecs.p')
    Y = np.array(Y) + np.ones(len(Y), dtype=float) * 5
    cv(X, Y)
__author__ = 'nobody'
from load_data import load_pickle
from file_name import get_file_path
from evaluate import evaluate

(Y_test, predict) = load_pickle('./data/corpus/vader/cnn_result.p')
print(Y_test)
print(predict)
evaluate(Y_test, predict, 'Result of CNN')
__author__ = 'NLP-PC'
from load_data import load_pickle
from evaluate import evaluate
import random
from regression import linear_regression, linear_regression_multivariant
from sklearn import cross_validation

mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos, ratings = load_pickle('./data/vader_out.p')

# size = 720
# slice_idx = random.sample(range(len(ratings)), size)  # randomly pick `size` elements from the list as a sample
# mean_ratings, tf_means, tfidf_means, geos, tf_geos, tfidf_geos, ratings = \
#     mean_ratings[slice_idx], tf_means[slice_idx], tfidf_means[slice_idx], geos[slice_idx], \
#     tf_geos[slice_idx], tfidf_geos[slice_idx], ratings[slice_idx]

evaluate(ratings, mean_ratings, 'mean_ratings')
evaluate(ratings, tf_means, 'tf_means')
evaluate(ratings, tfidf_means, 'tfidf_means')
evaluate(ratings, geos, 'geos')
evaluate(ratings, tf_geos, 'tf_geos')
evaluate(ratings, tfidf_geos, 'tfidf_geos')

################################################ Regression Methods ##########################################
# X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(mean_ratings, ratings, test_size=0.2,
#                                                                      random_state=0)
# linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
# X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(tf_means, ratings, test_size=0.2, random_state=0)
# linear_regression(X_train, X_test, Y_train, Y_test, plot=False)
        # (continuation of the nested VA_mean helper: average the lexicon valence of one word)
        for line in lexicon:
            if word == line:
                count = count + 1
                sum_valence = sum_valence + lexicon[line]
        return 5 if count == 0 else sum_valence / count

    for i, text in enumerate(corpus):
        V = VA_mean(text)
        valence_mean.append(V)
        valence_true.append(mark[i])
    return valence_mean, valence_true


from load_data import load_pickle

idfs = load_pickle('./data/vocab_idf.p')


def tfidf(t, d):
    d = d.split()
    tf = float(d.count(t)) / sum(d.count(w) for w in set(d))
    # idf = sp.log(float(len(D)) / (len([doc.split() for doc in D if t in doc.split()])))
    return tf * idfs[t]


def tf(t, d):
    d = d.split()
    tf = float(d.count(t)) / float(len(d))
    return tf