Example no. 1
0
def batch_test(cv_id, sentiment, limit, ignore, rts, num_topics, tfidf, blind,
               div_number):
    """Run a cross-validation pass over every pundit in the ground truths.

    When ``blind`` is False the corpus and topic scores are built once and a
    fresh copy of the score table is handed to each test user; otherwise the
    corpus and scores are rebuilt from scratch for every user.  Prints the
    elapsed wall-clock time and the collected results, then returns them.

    Returns:
        list: one ``get_nolan_scores`` result per pundit.
    """
    from time import time

    started = time()
    all_results = []
    model = LDA(cv_id=cv_id)

    if blind is False:
        # Build the corpus and master score table a single time, then reuse
        # a copy of the scores for each held-out user.
        corpus, df = model.get_corpus(rts=rts, ignore=ignore,
                                      sentiment=sentiment, limit=limit,
                                      tfidf=tfidf)
        master_scores = model.getScores(corpus, df, num_topics=num_topics)
        for test_user in ground_truths_pundits.keys():
            all_results.append(
                model.get_nolan_scores(div_number=div_number,
                                       testuser=test_user,
                                       scores=master_scores.copy()))
    else:
        # Blind mode: recompute the corpus and scores for every user.
        for test_user in ground_truths_pundits.keys():
            corpus, df = model.get_corpus(rts=rts, ignore=ignore,
                                          sentiment=sentiment, limit=limit,
                                          tfidf=tfidf)
            per_user_scores = model.getScores(corpus, df,
                                              num_topics=num_topics)
            all_results.append(
                model.get_nolan_scores(testuser=test_user,
                                       scores=per_user_scores,
                                       div_number=div_number))

    print("Cross validation took this long: {}".format(time() - started))
    print(all_results)
    return all_results
    def __init__(self,
                 tfidf=True,
                 ignore=tuple(),
                 limit=0,
                 sentiment=None,
                 rts=True):
        """Store corpus-building options and split users into train/test sets.

        Politicians form the training population and pundits the held-out
        test population; any handle in ``ignore`` is dropped from both.
        """
        self.rts = rts
        self.sentiment = sentiment
        self.limit = limit
        self.exclude = ignore
        self.ignore = ignore

        # Split the ground-truth tables into training (politicians) and
        # test (pundits) user lists, skipping excluded handles.
        self.training_users = [
            handle for handle in ground_truths_politicians
            if handle not in ignore
        ]
        self.test_users = [
            handle for handle in ground_truths_pundits
            if handle not in ignore
        ]

        # Label matrices aligned row-for-row with the user lists above.
        self.ytrain = np.array(
            [list(ground_truths_politicians[h]) for h in self.training_users])
        self.ytest = np.array(
            [list(ground_truths_pundits[h]) for h in self.test_users])

        # Feature extraction: TF-IDF weighting by default, raw counts otherwise.
        self.vectorizer = TfidfVectorizer() if tfidf else CountVectorizer()
Example no. 3
0
    def __init__(
        self,
        testuser=None,
        tfidf=True,
        ignore=tuple(),
        limit=0,
        sentiment=None,
        rts=True,
        n_comps=75,
    ):
        """Store model options and build train/test user splits.

        With ``testuser=None`` the split is politicians (train) versus
        pundits (test).  Otherwise a leave-one-out split is built: the named
        user becomes the sole test case and is excluded from training.
        """
        self.rts = rts
        self.sentiment = sentiment
        self.limit = limit
        self.ignore = ignore
        self.n_comps = n_comps

        if testuser is None:
            # Standard split: train on politicians, test on pundits.
            self.training_users = [
                h for h in ground_truths_politicians if h not in ignore
            ]
            self.test_users = [
                h for h in ground_truths_pundits if h not in ignore
            ]
            self.ytest = np.array(
                [list(ground_truths_pundits[h]) for h in self.test_users]
            )
        else:
            # Leave-one-out: hold *testuser* out and train on everyone else.
            excluded = ignore + (testuser,)  # assumes *ignore* is a tuple
            self.test_users = [testuser]
            self.training_users = [
                h for h in ground_truths_politicians if h not in excluded
            ]
            # The held-out user may live in either ground-truth table; try
            # the politicians first, fall back to the pundits.
            try:
                self.ytest = np.array(
                    [list(ground_truths_politicians[h])
                     for h in self.test_users]
                )
            except KeyError:
                self.ytest = np.array(
                    [list(ground_truths_pundits[h]) for h in self.test_users]
                )

        # Training labels come from the politicians table in both splits.
        self.ytrain = np.array(
            [list(ground_truths_politicians[h]) for h in self.training_users]
        )

        # Feature extraction: TF-IDF weighting by default, raw counts otherwise.
        self.vectorizer = TfidfVectorizer() if tfidf else CountVectorizer()
Example no. 4
0
    def preprocess(self, text):
        """Clean a tweet and return its stemmed, lemmatized tokens.

        Strips URLs, @-mentions, '#' and '&' characters, then drops
        stopwords, tokens of length <= 3, a manual noise list, and every
        known pundit/politician handle.  The survivors are lemmatized and
        stemmed.

        Args:
            text (str): raw tweet text.

        Returns:
            list[str]: normalized tokens suitable for topic modelling.
        """
        dirty_text = [r"http\S+", r"@\S+", r"#", r"&"]

        # Manually curated noisy words as they appear in topic composition
        # results.  (The original listed "hear" and "need" twice; duplicates
        # are redundant for membership testing and have been removed.)
        ignore = [
            "thing", "called", "say", "good", "said", "go", "today", "day",
            "want", "need", "hear", "get", "like", "come", "week", "think",
            "thought", "month", "tonight", "year", "know", "talk", "RT",
        ]

        # Normalize the ignore words the same way the tokens are normalized
        # below so the membership test compares like with like.  A set gives
        # O(1) lookups inside the token loop instead of the original list's
        # O(n) scans.
        processed_ignores = {
            self.stemmer.stem(self.lemmatizer.lemmatize(word))
            for word in (ignore + list(ground_truths_pundits.keys()) +
                         list(ground_truths_politicians.keys()))
        }

        # Remove URLs, mentions and stray markup before tokenizing.
        for pattern in dirty_text:
            text = sub(pattern, "", text)

        return [
            self.stemmer.stem(self.lemmatizer.lemmatize(token))
            for token in simple_preprocess(text)
            if (token not in STOPWORDS and len(token) > 3
                and token not in processed_ignores)
        ]
Example no. 5
0
                                    (id,
                                     author_handle,
                                     tweet,
                                     sentiment,
                                     pos_score, 
                                     neg_score, 
                                     neu_score, 
                                     compound_score) VALUES 
                                     (?, ?, ?, ?, ?, ?, ?, ?);""",
                    tweet_attrs,
                )

            except sqlite3.IntegrityError:
                pass

        self.conn.commit()


if __name__ == "__main__":
    # NOTE(review): training_list is built but never used in this visible
    # span — presumably leftover from an earlier version; kept as-is in case
    # later code in the file relies on it.
    training_list = list(ground_truths_politicians.keys()) + list(
        ground_truths_pundits.keys())

    searcher = TwitterHandler()

    # Fetch up to 200 recent tweets per politician and persist each batch.
    for politician in ground_truths_politicians.keys():
        fetched_tweets, profile = searcher.gettweets(query=politician,
                                                     count=200)
        searcher.exportresults(fetched_tweets, profile, politician)
        print("finished {}".format(politician))

    searcher.conn.close()