class TopicModeler:
    """Fit a topic model (NMF, LSA or LDA) on tf-idf features of text documents.

    Typical use: ``extract_features`` -> one of the ``train_*_model`` methods
    -> ``gather_topic_words`` / ``print_topics``. ``build_test_matrix`` and
    ``predict_top_n_topics`` score new documents against the fitted model.
    """

    # Seed handed to ``random.seed`` in ``pipeline`` so sampling is repeatable.
    seed = 1532525625823

    # Fitted state; each stays None until the corresponding method has run.
    count_vect = None  # CountVectorizer fitted by extract_features()
    tfidf_transformer = None  # TfidfTransformer fitted by extract_features()
    vocab = None  # feature names, indexed by doc-term matrix column
    model = None  # decomposition model set by a train_*_model() method

    # Number of topics used when a train_*_model() caller does not pass one.
    default_num_topics = 60

    def extract_features(self, documents : [str]):
        """
        Builds tfidf matrix to be used when making the topic modelers

        Fits ``self.count_vect`` and ``self.tfidf_transformer`` on
        ``documents`` and stores the vocabulary in ``self.vocab``.

        Returns the document-term matrix holding tfidf values.
        """
        # A list (not a set) is passed because recent scikit-learn releases
        # validate ``stop_words`` and only accept 'english', a list or None.
        self.count_vect = CountVectorizer(
            stop_words=list(stopwords.words('english')))
        train_counts = self.count_vect.fit_transform(documents)

        self.tfidf_transformer = TfidfTransformer()
        train_tfidf = self.tfidf_transformer.fit_transform(train_counts)

        # get_feature_names() was replaced by get_feature_names_out();
        # prefer the new name and fall back for older installations.
        get_names = getattr(self.count_vect, "get_feature_names_out", None)
        if get_names is None:
            get_names = self.count_vect.get_feature_names
        self.vocab = list(get_names())

        return train_tfidf

    def train_NMF_model(self, dtm : "doc-term tfidf matrix", num_topics = default_num_topics):
        """Fit an NMF model with ``num_topics`` components on ``dtm`` and keep it in ``self.model``."""
        self.model = decomposition.NMF(n_components=num_topics, random_state=1)
        self.model.fit(dtm)

    def train_LSA_model(self, dtm : "doc-term tfidf matrix", num_topics = default_num_topics):
        """Fit a TruncatedSVD (LSA) model with ``num_topics`` components on ``dtm`` and keep it in ``self.model``."""
        self.model = decomposition.TruncatedSVD(n_components=num_topics, random_state=1)
        self.model.fit(dtm)

    def train_LDA_model(self, dtm : "doc-term tfidf matrix", num_topics = default_num_topics):
        """Fit an LDA model with ``num_topics`` components on ``dtm`` and keep it in ``self.model``."""
        # ``n_topics`` was renamed to ``n_components`` in scikit-learn and the
        # old keyword has since been removed.
        self.model = decomposition.LatentDirichletAllocation(n_components=num_topics, random_state=1)
        self.model.fit(dtm)

    def build_test_matrix(self, documents : [str]):
        """
        Transforms new documents with the already-fitted vectorizer and
        tfidf transformer and returns the resulting tfidf matrix.

        Raises RuntimeError if extract_features() has not been called yet.
        """
        if self.count_vect is None or self.tfidf_transformer is None:
            raise RuntimeError(
                "Call extract_features() before build_test_matrix(): the "
                "CountVectorizer and TfidfTransformer need to be built "
                "before use in build_test_matrix().")

        test_counts = self.count_vect.transform(documents)
        return self.tfidf_transformer.transform(test_counts)

    def predict_top_n_topics(self, busi_dtm, n : "num topics to return"):
        """
        Returns the indices of the ``n`` strongest topics for ``busi_dtm``.

        The documents in ``busi_dtm`` are projected with
        ``self.model.transform``; each topic's weights are summed over all
        documents and the topic indices are returned as a list of ints,
        strongest first. Fewer than ``n`` indices are returned when the model
        has fewer topics; a non-positive ``n`` yields an empty list.

        Raises RuntimeError if no model has been trained yet.
        """
        if self.model is None:
            raise RuntimeError(
                "Call one of the train_X_model() methods before "
                "predict_top_n_topics().")

        topic_matrix = np.asarray(self.model.transform(busi_dtm))
        if topic_matrix.ndim == 1:
            # A single document may come back as a flat vector.
            topic_matrix = topic_matrix.reshape(1, -1)

        topic_scores = topic_matrix.sum(axis=0)
        how_many = max(int(n), 0)  # a negative slice bound would drop topics from the end
        ranked = np.argsort(topic_scores)[::-1][:how_many]
        return [int(idx) for idx in ranked]

    def gather_topic_words(self, num_top_words = 20) -> [[str]]:
        """
        Returns a list of words associated with each topic.

        For every row of ``self.model.components_`` the ``num_top_words``
        highest-weighted vocabulary entries are collected, strongest first.
        If no trained model (or no vocabulary) is available, a hint is
        printed and an empty list is returned.
        """
        components = getattr(self.model, "components_", None)
        if components is None or self.vocab is None:
            print("Error. Try calling one of the train_X_model() methods before calling gather_topic_words().")
            return []

        topic_words = []
        for topic in components:
            word_idx = np.argsort(topic)[::-1][0:num_top_words]
            topic_words.append([self.vocab[i] for i in word_idx])
        return topic_words

    def print_topics(self, model, topic_words : [[str]], number_of_words : int):
        """
        Neatly prints out topics and the words associated with them

        ``model`` is accepted for interface compatibility and is not used.
        At most ``number_of_words`` words are printed per topic.
        """
        print()
        for topic_idx, words in enumerate(topic_words):
            print("Topic {}: {}".format(topic_idx, ' '.join(words[:number_of_words])))
        print()

    def pipeline(self, name_of_target_business : str, number_of_documents : int, num_top_topics : int = 5):
        """
        Trains an NMF model on a random sample of the pickled reviews and
        returns the top topics for the reviews of one business.

        ``number_of_documents`` reviews are sampled from
        "pickles/list-of-reviews.p"; ``num_top_topics`` is forwarded to
        predict_top_n_topics(), whose result is returned.
        """
        # for consistent testing
        random.seed(self.seed)

        # NOTE: pickle.load can execute arbitrary code; only load review
        # files that come from a trusted source.
        with open("pickles/list-of-reviews.p", "rb") as review_file:
            raw_data = pickle.load(review_file)
        documents = random.sample(raw_data, number_of_documents)

        dtm = self.extract_features(documents)
        self.train_NMF_model(dtm)

        target_reviews = get_reviews_for_business_name(name_of_target_business)
        dtm_test = self.build_test_matrix(target_reviews)
        return self.predict_top_n_topics(dtm_test, num_top_topics)
Example #2
0
    train_data_features = vectorizer.fit_transform(clean_train_reviews)

    #numpy arrays are easy to work with,
    np.asarray(train_data_features)

    #Training a random forest

    forest = RandomForestClassifier(n_estimators=100)

    #fit the forest to the training set, using the bag of words as features
    #and the sentiment labels as the response variable

    forest = forest.fit(train_data_features, train['sentiment'])

    clean_test_reviews = []

    print('cleaning and parsing the test set movie reviews')
    for i in range(0, len(test['review'])):
        clean_test_reviews.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(test["review"][i], True)))

    test_data_features = vectorizer.tranform(clean_test_reviews)
    np.asarray(test_data_features)

    print("predicting test labels")
    result = forest.predict(test_data_features)

    #copy the result to a pandas dataframe
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
Example #3
0
class TopicModeler:
    """Fit a topic model (NMF, LSA or LDA) on tf-idf features of text documents.

    Typical use: ``extract_features`` -> one of the ``train_*_model`` methods
    -> ``gather_topic_words`` / ``print_topics``. ``build_test_matrix`` and
    ``predict_top_n_topics`` score new documents against the fitted model.
    """

    # Seed handed to ``random.seed`` in ``pipeline`` so sampling is repeatable.
    seed = 1532525625823

    # Fitted state; each stays None until the corresponding method has run.
    count_vect = None  # CountVectorizer fitted by extract_features()
    tfidf_transformer = None  # TfidfTransformer fitted by extract_features()
    vocab = None  # feature names, indexed by doc-term matrix column
    model = None  # decomposition model set by a train_*_model() method

    # Number of topics used when a train_*_model() caller does not pass one.
    default_num_topics = 60

    def extract_features(self, documents: [str]):
        """
        Builds tfidf matrix to be used when making the topic modelers

        Fits ``self.count_vect`` and ``self.tfidf_transformer`` on
        ``documents`` and stores the vocabulary in ``self.vocab``.

        Returns the document-term matrix holding tfidf values.
        """
        # A list (not a set) is passed because recent scikit-learn releases
        # validate ``stop_words`` and only accept 'english', a list or None.
        self.count_vect = CountVectorizer(
            stop_words=list(stopwords.words('english')))
        train_counts = self.count_vect.fit_transform(documents)

        self.tfidf_transformer = TfidfTransformer()
        train_tfidf = self.tfidf_transformer.fit_transform(train_counts)

        # get_feature_names() was replaced by get_feature_names_out();
        # prefer the new name and fall back for older installations.
        get_names = getattr(self.count_vect, "get_feature_names_out", None)
        if get_names is None:
            get_names = self.count_vect.get_feature_names
        self.vocab = list(get_names())

        return train_tfidf

    def train_NMF_model(self,
                        dtm: "doc-term tfidf matrix",
                        num_topics=default_num_topics):
        """Fit an NMF model with ``num_topics`` components on ``dtm`` and keep it in ``self.model``."""
        self.model = decomposition.NMF(n_components=num_topics, random_state=1)
        self.model.fit(dtm)

    def train_LSA_model(self,
                        dtm: "doc-term tfidf matrix",
                        num_topics=default_num_topics):
        """Fit a TruncatedSVD (LSA) model with ``num_topics`` components on ``dtm`` and keep it in ``self.model``."""
        self.model = decomposition.TruncatedSVD(n_components=num_topics,
                                                random_state=1)
        self.model.fit(dtm)

    def train_LDA_model(self,
                        dtm: "doc-term tfidf matrix",
                        num_topics=default_num_topics):
        """Fit an LDA model with ``num_topics`` components on ``dtm`` and keep it in ``self.model``."""
        # ``n_topics`` was renamed to ``n_components`` in scikit-learn and the
        # old keyword has since been removed.
        self.model = decomposition.LatentDirichletAllocation(
            n_components=num_topics, random_state=1)
        self.model.fit(dtm)

    def build_test_matrix(self, documents: [str]):
        """
        Transforms new documents with the already-fitted vectorizer and
        tfidf transformer and returns the resulting tfidf matrix.

        Raises RuntimeError if extract_features() has not been called yet.
        """
        if self.count_vect is None or self.tfidf_transformer is None:
            raise RuntimeError(
                "Call extract_features() before build_test_matrix(): the "
                "CountVectorizer and TfidfTransformer need to be built "
                "before use in build_test_matrix().")

        test_counts = self.count_vect.transform(documents)
        return self.tfidf_transformer.transform(test_counts)

    def predict_top_n_topics(self, busi_dtm, n: "num topics to return"):
        """
        Returns the indices of the ``n`` strongest topics for ``busi_dtm``.

        The documents in ``busi_dtm`` are projected with
        ``self.model.transform``; each topic's weights are summed over all
        documents and the topic indices are returned as a list of ints,
        strongest first. Fewer than ``n`` indices are returned when the model
        has fewer topics; a non-positive ``n`` yields an empty list.

        Raises RuntimeError if no model has been trained yet.
        """
        if self.model is None:
            raise RuntimeError(
                "Call one of the train_X_model() methods before "
                "predict_top_n_topics().")

        topic_matrix = np.asarray(self.model.transform(busi_dtm))
        if topic_matrix.ndim == 1:
            # A single document may come back as a flat vector.
            topic_matrix = topic_matrix.reshape(1, -1)

        topic_scores = topic_matrix.sum(axis=0)
        how_many = max(int(n), 0)  # a negative slice bound would drop topics from the end
        ranked = np.argsort(topic_scores)[::-1][:how_many]
        return [int(idx) for idx in ranked]

    def gather_topic_words(self, num_top_words=20) -> [[str]]:
        """
        Returns a list of words associated with each topic.

        For every row of ``self.model.components_`` the ``num_top_words``
        highest-weighted vocabulary entries are collected, strongest first.
        If no trained model (or no vocabulary) is available, a hint is
        printed and an empty list is returned.
        """
        components = getattr(self.model, "components_", None)
        if components is None or self.vocab is None:
            print(
                "Error. Try calling one of the train_X_model() methods before calling gather_topic_words()."
            )
            return []

        topic_words = []
        for topic in components:
            word_idx = np.argsort(topic)[::-1][0:num_top_words]
            topic_words.append([self.vocab[i] for i in word_idx])
        return topic_words

    def print_topics(self, model, topic_words: [[str]], number_of_words: int):
        """
        Neatly prints out topics and the words associated with them

        ``model`` is accepted for interface compatibility and is not used.
        At most ``number_of_words`` words are printed per topic.
        """
        print()
        for topic_idx, words in enumerate(topic_words):
            print("Topic {}: {}".format(
                topic_idx, ' '.join(words[:number_of_words])))
        print()

    def pipeline(self,
                 name_of_target_business: str,
                 number_of_documents: int,
                 num_top_topics: int = 5):
        """
        Trains an NMF model on a random sample of the pickled reviews and
        returns the top topics for the reviews of one business.

        ``number_of_documents`` reviews are sampled from
        "pickles/list-of-reviews.p"; ``num_top_topics`` is forwarded to
        predict_top_n_topics(), whose result is returned.
        """
        # for consistent testing
        random.seed(self.seed)

        # NOTE: pickle.load can execute arbitrary code; only load review
        # files that come from a trusted source.
        with open("pickles/list-of-reviews.p", "rb") as review_file:
            raw_data = pickle.load(review_file)
        documents = random.sample(raw_data, number_of_documents)

        dtm = self.extract_features(documents)
        self.train_NMF_model(dtm)

        target_reviews = get_reviews_for_business_name(name_of_target_business)
        dtm_test = self.build_test_matrix(target_reviews)
        return self.predict_top_n_topics(dtm_test, num_top_topics)