def get_vocabulary_list(reviews):
    """Return per-word frequencies across English reviews, sorted descending.

    Args:
        reviews: iterable of raw review strings.

    Returns:
        List of (word, count) tuples sorted by count, most frequent first.
    """
    freq_map = {}
    total = 0
    for review in reviews:
        tokens = preprocess(review)
        # preprocess returns -1 when the review is not in English
        if tokens == -1:
            continue
        total += 1
        # NOTE(review): removed a leftover debug print that dumped `total`
        # once per review; it flooded stdout for large datasets.
        for token in tokens:
            # dict.get avoids a separate membership check per token
            freq_map[token] = freq_map.get(token, 0) + 1
    print('Number of English reviews:', total)
    # Sort the word/count pairs by their count, highest first
    sorted_freq_map = sorted(freq_map.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_freq_map
def unigram(self, review, ID, label, total, num_of_reviews):
    """ Extracts the features of a review """
    # Progress indicator, rewritten in place on the same console line
    percent = total * 100 // num_of_reviews
    print(total, '----', percent, '%', end='\r')
    tokens = preprocess(review)
    # Non-English reviews (preprocess returns -1) are skipped entirely
    if tokens == -1:
        return 0
    indices = self.get_word_indices(tokens)
    vector = self.get_feature_vector(indices)
    # Persist the feature vector and its label under the same id so they
    # can be joined back together at training time
    r.table('X_' + self.model).insert({'id': ID, 'data': vector}).run(connection)
    r.table('y_' + self.model).insert({'id': ID, 'data': label}).run(connection)
    return 1
def word2vec_word_embedding(min_count, rows, n_components):
    """Return t-SNE dimension-reduced word-embedding data as JSON.

    Args:
        min_count: minimum word frequency for Word2Vec vocabulary inclusion.
        rows: number of words to include in the returned coordinates.
        n_components: target dimensionality for t-SNE (2 or 3).

    Returns:
        Flask JSON response with 'X'/'y' (and 'z' when n_components == 3)
        coordinate lists plus the embedding vocabulary under 'words'.
    """
    data = {
        'X': [],
        'y': [],
        'z': [],
        'words': []
    }
    # The actual label will be used as class labels
    reviews = load_files(dir_path + '/../../data/reviews/not_corrected').data
    # Skip non-English reviews: preprocess returns -1 for them, and passing
    # -1 into Word2Vec (which expects lists of tokens) would break training.
    preprocessed_reviews = [
        tokens
        for tokens in (preprocess(review.decode('utf-8')) for review in reviews)
        if tokens != -1
    ]
    model = gensim.models.Word2Vec(preprocessed_reviews, min_count=min_count, size=400)
    # Stack every word vector into one matrix for t-SNE
    X = np.array([model.wv[word] for word in model.wv.index2word])
    X_reduced = TSNE(n_components=n_components, init='pca', random_state=0).fit_transform(X)
    data['X'] = X_reduced[:rows, 0].tolist()
    data['y'] = X_reduced[:rows, 1].tolist()
    data['words'] = model.wv.index2word
    if n_components == 3:
        data['z'] = X_reduced[:rows, 2].tolist()
    return jsonify(data)
def get_all_course_review_words_overall(course_slug):
    """ Retrieves all the preprocessed words """
    data = {}
    cursor = r.table('reviews').filter({
        'id': course_slug
    }).run(connection)
    reviews = []
    for document in cursor:
        reviews.extend(document['data'])
    # Nothing stored for this course: return an empty payload
    if not reviews:
        return jsonify(data)
    words = []
    for review in reviews:
        tokens = preprocess(review)
        # preprocess returns -1 for non-English reviews; skip those
        if tokens != -1:
            words.extend(tokens)
    data['word_mapping'] = count_by(words)
    return jsonify(data)
def get_wordcloud():
    """ Retrieves all the words and its corresponding occurences
    on the whole dataset (English words only) """
    data = {}
    cursor = r.table('combined_reviews_with_labels').limit(10000).run(connection)
    reviews = list(cursor)
    # One token bucket per sentiment label, plus a combined bucket
    label_buckets = {
        5: [],  # very positive
        4: [],  # positive
        3: [],  # neutral
        2: [],  # negative
        1: [],  # very negative
    }
    overall_words = []
    for review in reviews:
        tokens = preprocess(review['data'])
        # preprocess returns -1 for non-English reviews; skip those
        if tokens == -1:
            continue
        overall_words.extend(tokens)
        bucket = label_buckets.get(review['label'])
        if bucket is not None:
            bucket.extend(tokens)
    data['overall'] = count_by(overall_words)
    data['very_positive'] = count_by(label_buckets[5])
    data['positive'] = count_by(label_buckets[4])
    data['neutral'] = count_by(label_buckets[3])
    data['negative'] = count_by(label_buckets[2])
    data['very_negative'] = count_by(label_buckets[1])
    return jsonify(data)
def get_all_course_review_words_neutral(course_slug):
    """ Retrieves all the preprocessed neutral words """
    data = {}
    cursor = r.table('reviews').filter({
        'id': course_slug
    }).run(connection)
    reviews = []
    for document in cursor:
        reviews.extend(document['data'])
    # Nothing stored for this course: return an empty payload
    if not reviews:
        return jsonify(data)
    overall_words = []
    for review in reviews:
        tokens = preprocess(review)
        # preprocess returns -1 for non-English reviews; skip those
        if tokens == -1:
            continue
        # Keep only words TextBlob scores as neutral
        # (zero subjectivity or zero polarity)
        for word in tokens:
            sentiment = TextBlob(word).sentiment
            if sentiment.subjectivity == 0 or sentiment.polarity == 0:
                overall_words.append(word)
    data['word_mapping'] = count_by(overall_words)
    return jsonify(data)