# Vocabulary known to the vectorizer, one entry per input feature.
names = count_vectorizer.get_feature_names()

# select.get_support() yields a boolean mask with one element per input
# feature; an element is True iff that feature was selected for retention.
# Indexing the vocabulary with the mask leaves only the selected words.
selected_words = np.asarray(names)[select.get_support()]
# Debug aid: print(', '.join(selected_words))

# ---- Average the w2v representations of the top 1000 words ----
# Presumably the dimensionality of the loaded vectors (cf. the "300features"
# prefix of the model file name) -- TODO confirm against the trained model.
features_count = 300
model = gensim.models.Word2Vec.load("300features_40minwords_10context")

# Tokenise each training review, restricted to the selected vocabulary.
train_reviews = [
    review_to_wordlist(text, vocabulary=selected_words)
    for text in train["review"]
]
trainDataVecs = get_avg_feature_vecs(train_reviews, model, features_count)

print("Creating average feature vecs for test reviews")
# Same preprocessing for the test split.
test_reviews = [
    review_to_wordlist(text, vocabulary=selected_words)
    for text in test["review"]
]
testDataVecs = get_avg_feature_vecs(test_reviews, model, features_count)

# ---- Make a prediction ----
# NOTE: `model` is deliberately rebound here; from this point on it refers to
# the regressor, not the Word2Vec model loaded above.
model = LinearRegression()
def prepare_rewiews(reviews, *, remove_stopwords=True):
    """Convert every raw review in *reviews* with ``review_to_wordlist``.

    The (misspelled) function name is kept as-is so existing callers keep
    working.

    Args:
        reviews: Iterable of raw reviews; each item is passed unchanged as
            the first argument of ``review_to_wordlist``.
        remove_stopwords: Forwarded to ``review_to_wordlist``. Defaults to
            ``True``, which is the value this function always used before
            the flag became configurable.

    Returns:
        A new list holding one ``review_to_wordlist`` result per input
        review, in input order (presumably a list of tokens each -- confirm
        against ``review_to_wordlist``). Empty input yields an empty list.
    """
    return [
        review_to_wordlist(raw_review, remove_stopwords=remove_stopwords)
        for raw_review in reviews
    ]