def load_data(file_dir):
    """Accumulate word frequencies from every file in *file_dir*.

    Each file is read as UTF-8, its lines joined into one string, cleaned
    via ``clean_str``, and every resulting token counted into the
    module-level ``vocab`` mapping (presumably a ``defaultdict(int)`` or
    ``Counter`` — TODO confirm at the definition site).

    :param file_dir: directory whose files are tokenized; every entry
        returned by ``os.listdir`` is opened as a text file.
    """
    for file_name in os.listdir(file_dir):
        # Fix: use a context manager so each file handle is closed
        # deterministically (the original relied on garbage collection).
        with codecs.open(os.path.join(file_dir, file_name), 'r', 'utf-8') as f:
            text = ' '.join(f.readlines())
        for word in clean_str(text).split():
            vocab[word] += 1
def load_data(file_dir):
    """Read every file in *file_dir* and convert the corpus to index data.

    Each file is read as UTF-8, its lines joined into one string, and
    cleaned with ``clean_str``; the cleaned corpus is then mapped through
    the module-level ``word_idx_map`` by ``make_idx_data`` with a fixed
    ``max_len=200`` and ``kernel_size=5``.

    :param file_dir: directory whose files form the corpus.
    :returns: tuple ``(idx_data, length)`` where ``idx_data`` is whatever
        ``make_idx_data`` produces and ``length`` is the number of files.
    """
    file_names = os.listdir(file_dir)
    data = []
    for file_name in file_names:
        # Fix: close each file handle promptly via a context manager
        # (the original left handles open until garbage collection).
        with codecs.open(os.path.join(file_dir, file_name), 'r', 'utf-8') as f:
            text = ' '.join(f.readlines())
        data.append(clean_str(text))
    idx_data = make_idx_data(data, word_idx_map, max_len=200, kernel_size=5)
    return idx_data, len(file_names)
def cnn(text=None):
    """Predict a sentiment score for *text* with a pre-trained Keras CNN.

    Falls back to the sentence ``'why you are not happy'`` when *text* is
    None. The input is cleaned, mapped to vocabulary indices, reshaped into
    a single-sample batch, and fed through a CNN whose weights are loaded
    from disk.

    :param text: raw sentence to score, or None for the default sample.
    :returns: ``[predict_value[0], 5.0]`` — the model's prediction for the
        single sample plus the constant 5.0 (presumably the rating-scale
        maximum — TODO confirm against the caller).
    """
    request_text = text
    # NOTE(review): idx_data/ratings are loaded but never used below; the
    # read is kept so a missing pickle still fails exactly as before.
    [idx_data, ratings] = load_pickle('./data/corpus/vader/vader_processed_data_tweets.p')
    # Pre-trained embedding matrix: one row per vocabulary index.
    W = load_pickle('./data/corpus/vader/embedding_matrix_tweets.p')
    if request_text is None:
        request_text = 'why you are not happy'
    request_text = clean_str(request_text)
    word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_tweets.p')
    idx_request_text = get_idx_from_sent(request_text, word_idx_map)  # list of indices
    max_len = len(idx_request_text)
    # Shape (1, max_len): a single-sample batch for model.predict.
    idx_request_text = np.array(idx_request_text).reshape((1, max_len))

    def cnn_model():
        """Rebuild the architecture the saved weights were trained with.

        Uses the Keras 1.x API (nb_filter/border_mode/W_regularizer);
        the layer arguments must match the checkpoint exactly.
        """
        N_fm = 100        # number of convolution filters
        kernel_size = 5   # convolution window height (rows of words)
        conv_input_height, conv_input_width = max_len, len(W[1])
        model = Sequential()
        model.add(Embedding(input_dim=W.shape[0], output_dim=W.shape[1],
                            weights=[W], W_constraint=unitnorm()))
        model.add(Reshape(dims=(1, conv_input_height, conv_input_width)))
        model.add(Convolution2D(nb_filter=N_fm, nb_row=kernel_size,
                                nb_col=conv_input_width, border_mode='valid',
                                W_regularizer=l2(0.0001)))
        model.add(Activation("relu"))
        # Max-pool over the full remaining height -> one value per filter.
        model.add(MaxPooling2D(pool_size=(conv_input_height - kernel_size + 1, 1),
                               ignore_border=True))
        model.add(Flatten())
        model.add(Dropout(0.5))
        model.add(Dense(1))
        model.add(Activation('linear'))
        # Fix: the original constructed an SGD optimizer that was never
        # used (compilation uses 'adagrad'); the dead construction is gone.
        model.compile(loss='mse', optimizer='adagrad')
        return model

    model = cnn_model()
    model.load_weights('./data/corpus/vader/cnn_model_weights.hdf5')
    predict_value = model.predict(idx_request_text)
    return [predict_value[0], 5.0]
def load_data(file_dir):
    """Read every file in *file_dir* and convert the corpus to index data.

    Each file is read as UTF-8, joined into one string, cleaned with
    ``clean_str``, and the whole cleaned corpus is converted through the
    module-level ``word_idx_map`` by ``make_idx_data`` (``max_len=200``,
    ``kernel_size=5``).

    :param file_dir: directory whose files form the corpus.
    :returns: tuple ``(idx_data, length)`` — the ``make_idx_data`` output
        and the number of files read.
    """
    file_names = os.listdir(file_dir)
    data = []
    for file_name in file_names:
        # Fix: context manager closes each file deterministically
        # (the original leaked handles until garbage collection).
        with codecs.open(os.path.join(file_dir, file_name), 'r', 'utf-8') as f:
            text = ' '.join(f.readlines())
        data.append(clean_str(text))
    idx_data = make_idx_data(data, word_idx_map, max_len=200, kernel_size=5)
    return idx_data, len(file_names)
def screen_data(corpus, ratings, words):
    """Filter *corpus* down to sentences sharing vocabulary with *words*.

    Every sentence is cleaned with ``clean_str``; a sentence (and its
    parallel rating) is kept when its token set intersects *words* in at
    least one word. Progress is printed every 500 sentences.

    :param corpus: iterable of raw sentences.
    :param ratings: sequence of ratings, index-aligned with *corpus*.
    :param words: vocabulary to screen against.
    :returns: ``(kept_sentences, kept_ratings)`` — cleaned sentences that
        matched, with their ratings.
    """
    kept_texts, kept_ratings = [], []
    total = len(corpus)
    vocab_set = set(words)  # hoisted: identical results, built once
    for idx, raw in enumerate(corpus):
        cleaned = clean_str(raw)
        overlap = list(set(cleaned.split()).intersection(vocab_set))
        if len(overlap) >= 1:
            kept_texts.append(cleaned)
            kept_ratings.append(ratings[idx])
        if idx % 500 == 0:
            print(cleaned)
            print('the %i/%i is processing: %s' % (idx, total, str(overlap)))
    print('size of corpus is %i' % len(kept_texts))
    return kept_texts, kept_ratings
def process(corpus):
    """Return a new list with ``clean_str`` applied to every sentence."""
    return list(map(clean_str, corpus))
def load_data(file_dir):
    """Count word frequencies from every file in *file_dir*.

    Reads each file as UTF-8, joins its lines, cleans the text with
    ``clean_str``, and increments the module-level ``vocab`` mapping for
    every token (presumably a ``defaultdict(int)``/``Counter`` — TODO
    confirm at its definition).

    :param file_dir: directory whose files are tokenized.
    """
    for file_name in os.listdir(file_dir):
        # Fix: close each file handle deterministically with a context
        # manager instead of leaking it to the garbage collector.
        with codecs.open(os.path.join(file_dir, file_name), 'r', 'utf-8') as f:
            text = ' '.join(f.readlines())
        for word in clean_str(text).split():
            vocab[word] += 1