import codecs
import os
from collections import defaultdict

# clean_str() is a text-normalisation helper defined elsewhere in the project.
vocab = defaultdict(int)  # word-frequency map (referenced but undefined in the original)


def load_data(file_dir):
    # Count word frequencies across every file under file_dir.
    for file_name in os.listdir(file_dir):
        text = ' '.join(
            codecs.open(os.path.join(file_dir, file_name), 'r',
                        'utf-8').readlines())
        for word in clean_str(text).split():
            vocab[word] += 1
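A minimal usage sketch; the corpus path and the top-10 inspection are illustrative, not from the original project:

load_data('./data/corpus/train')  # hypothetical directory of text files
top_10 = sorted(vocab.items(), key=lambda kv: kv[1], reverse=True)[:10]
print(top_10)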
Example 2
def load_data(file_dir):
    # Read and clean every file, then convert the texts to padded index
    # sequences via the project's make_idx_data() / word_idx_map helpers.
    file_names = os.listdir(file_dir)
    data = []
    length = len(file_names)
    for file_name in file_names:
        text = ' '.join(
            codecs.open(os.path.join(file_dir, file_name), 'r',
                        'utf-8').readlines())
        data.append(clean_str(text))
    idx_data = make_idx_data(data, word_idx_map, max_len=200, kernel_size=5)
    return idx_data, length
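Hedged usage, assuming word_idx_map and make_idx_data are defined as in the rest of the project (the path is hypothetical):

idx_data, n_files = load_data('./data/corpus/test')
# idx_data is the padded word-index representation (max_len=200) that a
# downstream Embedding layer consumes; n_files is just the document count.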
Example 3
import numpy as np

# This snippet targets a legacy Keras (0.x-style) API: Convolution2D with
# nb_filter/nb_row/nb_col, border_mode, W_regularizer, W_constraint, and
# Reshape(dims=...). load_pickle, clean_str and get_idx_from_sent are
# helpers defined elsewhere in the project.
from keras.constraints import unitnorm
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers.core import Activation, Dense, Dropout, Flatten, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.optimizers import SGD
from keras.regularizers import l2


def cnn(text=None):
    # Score the sentiment of `text` with a pretrained convolutional model.
    request_text = text
    # Test data, loaded only for the commented-out sanity checks below.
    [idx_data, ratings] = load_pickle('./data/corpus/vader/vader_processed_data_tweets.p')
    # print(idx_data[2])
    # print(ratings[2])

    # Pretrained word-embedding matrix, rows indexed by word_idx_map.
    W = load_pickle('./data/corpus/vader/embedding_matrix_tweets.p')
    # print(len(W[1]))
    if request_text is None:
        request_text = 'why you are not happy'  # default demo input
    request_text = clean_str(request_text)
    # print(request_text)
    word_idx_map = load_pickle('./data/corpus/vader/word_idx_map_tweets.p')

    idx_request_text = get_idx_from_sent(request_text, word_idx_map)
    # print(idx_request_text)  # type: list
    max_len = len(idx_request_text)
    # Reshape into a single-sample batch of shape (1, max_len).
    idx_request_text = np.array(idx_request_text).reshape((1, max_len))

    def cnn_model():
        # Embedding -> one 2-D convolution spanning the full embedding
        # width -> max-pool over all window positions -> dense regression.
        N_fm = 100  # number of filters
        kernel_size = 5
        # The input "image": one channel, max_len rows, embedding-dim columns.
        conv_input_height, conv_input_width = max_len, len(W[1])

        model = Sequential()
        model.add(Embedding(input_dim=W.shape[0], output_dim=W.shape[1], weights=[W], W_constraint=unitnorm()))
        model.add(Reshape(dims=(1, conv_input_height, conv_input_width)))
        model.add(Convolution2D(nb_filter=N_fm,
                                nb_row=kernel_size,
                                nb_col=conv_input_width,
                                border_mode='valid',
                                W_regularizer=l2(0.0001)))
        model.add(Activation("relu"))
        model.add(MaxPooling2D(pool_size=(conv_input_height - kernel_size + 1, 1), ignore_border=True))
        model.add(Flatten())
        model.add(Dropout(0.5))
        model.add(Dense(1))
        model.add(Activation('linear'))
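        # Shape walk-through for one sentence of max_len words:
        #   Embedding -> (max_len, embed_dim)
        #   Reshape   -> (1, max_len, embed_dim): a one-channel "image"
        #   Conv2D    -> (N_fm, max_len - kernel_size + 1, 1), since each
        #                filter spans the full embedding width
        #   MaxPool   -> (N_fm, 1, 1): strongest response per filter
        #   Flatten   -> (N_fm,), then Dense(1) emits the scalar score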
        # Note: this SGD optimiser is built but never used; the original
        # code compiles the model with adagrad instead.
        sgd = SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='mse', optimizer='adagrad')
        return model

    model = cnn_model()
    model.load_weights('./data/corpus/vader/cnn_model_weights.hdf5')
    predict_value = model.predict(idx_request_text)

    return [predict_value[0], 5.0]
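Hedged usage example; it assumes the pickled data, embedding matrix and weight files exist at the hardcoded paths above:

if __name__ == '__main__':
    score, upper_bound = cnn('why you are not happy')
    print(score, upper_bound)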
def screen_data(corpus, ratings, words):
    # Keep only the sentences that share at least one word with `words`,
    # together with their aligned ratings.
    processed_texts, processed_ratings = [], []
    size = len(corpus)
    for i, sentence in enumerate(corpus):
        sentence = clean_str(sentence)
        same_words = list(set(sentence.split()).intersection(set(words)))
        if len(same_words) >= 1:
            processed_texts.append(sentence)
            processed_ratings.append(ratings[i])
        if i % 500 == 0:
            print(sentence)
            print('processing %i/%i, matched words: %s' % (i, size, str(same_words)))
    print('size of screened corpus is %i' % len(processed_texts))
    return processed_texts, processed_ratings
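A small illustrative call; the inputs are made up, and the exact output depends on the project's clean_str normalisation:

texts, kept_ratings = screen_data(
    corpus=['I feel great today', 'lorem ipsum dolor'],
    ratings=[4.5, 2.0],
    words=['great', 'happy', 'sad'])
# Only the first sentence shares a word with the lexicon, so texts should
# hold its cleaned form and kept_ratings should equal [4.5].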
def process(corpus):
    return [clean_str(sent) for sent in corpus]