def get_tweets_by_user_id_time_restricted(
        self, user_id: str, from_date: datetime = None) -> List[Tweet]:
    """
    Return a list of tweets with user_id that matches the given user_id and
    that were created at or after a cutoff time

    @param user_id the id of the user to retrieve tweets from
    @param from_date the earliest created_at value to include; when omitted
        the cutoff is twelve months before the current local time

    @return a list of tweets by the given user created at or after the cutoff
    """
    if from_date is None:
        # Default window: the trailing twelve months.
        from_date = datetime.today() + relativedelta(months=-12)

    tweet_docs = self.collection.find({
        "$and": [
            {"user_id": bson.int64.Int64(user_id)},
            {"created_at": {"$gte": from_date}},
        ]
    })

    return [Tweet.fromDict(doc) for doc in tweet_docs]
def get_retweets_of_user_by_user_id(self, user_id: str) -> List[Tweet]:
    """
    Return a list of tweets whose retweet_user_id matches the given user_id

    @param user_id the id to match against each tweet's retweet_user_id

    @return a list of the matching tweets
    """
    query = {"retweet_user_id": bson.int64.Int64(user_id)}
    return [Tweet.fromDict(doc) for doc in self.collection.find(query)]
def get_tweet_by_id(self, id: str) -> Tweet:
    """
    Look up a single tweet by its id

    @param id the id of the tweet to get

    @return the Tweet built from the matching document, or None when no
        document has the given id
    """
    match = self.collection.find_one({"id": bson.int64.Int64(id)})
    if match is None:
        return None
    return Tweet.fromDict(match)
def get_tweets_by_user_id(self, user_id, num_tweets=0):
    """
    Fetch tweets from a user's timeline through the Twitter API

    Iterates over user_timeline with count=200, exclude_replies=True and a
    fixed since_id, converting each returned status into a Tweet. If a
    TweepError is raised, it is logged and the tweets collected up to that
    point are returned.

    @param user_id the id of the user whose timeline is fetched
    @param num_tweets accepted for interface compatibility; not currently
        used by this method

    @return a list of the Tweets that were fetched
    """
    collected = []
    try:
        timeline = Cursor(
            self.twitter_api.user_timeline,
            user_id=user_id,
            count=200,
            since_id='1277627227954458624',
            exclude_replies=True,
        )
        # Append one at a time so a mid-stream error keeps earlier results.
        for status in timeline.items():
            collected.append(Tweet.fromTweepyJSON(status._json))
    except TweepError as err:
        log.error(err)
    return collected
def get_tweets_by_user_id(self, user_id: str) -> List[Tweet]:
    """
    Collect every stored tweet whose user_id equals the given user_id

    @param user_id the id of the user to retrieve tweets from

    @return a list of tweets by the given user
    """
    query = {"user_id": bson.int64.Int64(user_id)}
    return [Tweet.fromDict(doc) for doc in self.collection.find(query)]
def get_retweets_of_user_by_user_id_time_restricted(
        self, user_id: str, from_date: datetime = None) -> List[Tweet]:
    """
    Return a list of tweets whose retweet_user_id matches the given user_id
    and that were created at or after a cutoff time

    @param user_id the id to match against each tweet's retweet_user_id
    @param from_date the earliest created_at value to include; when omitted
        the cutoff is 2020-06-30, the value this method has always used

    @return a list of the matching tweets created at or after the cutoff
    """
    if from_date is None:
        # Historical fixed cutoff, kept as the default so existing callers
        # get the same results as before.
        from_date = datetime(2020, 6, 30)

    retweet_docs = self.collection.find({
        "$and": [
            {"retweet_user_id": bson.int64.Int64(user_id)},
            {"created_at": {"$gte": from_date}},
        ]
    })

    return [Tweet.fromDict(doc) for doc in retweet_docs]
def _process_tweet_text(self, tweet: Tweet) -> list:
    """
    Processes the text of a given tweet into a list of stemmed words

    The text is lower-cased, stripped of https links, standalone numbers and
    every character that is not a word character, whitespace, '@' or '#'.
    Words containing '#' or '@' are dropped, the remaining words are stemmed
    with the English Snowball stemmer, and English stop words (plus 'amp')
    are removed.

    @param tweet the raw, unprocessed tweet

    @return the list of processed words, in their original order (a plain
        list, not a ProcessedTweet object)
    """
    text = tweet.get_text().lower()

    # Filter links, numbers, and emojis/punctuation
    text = re.sub(r"\bhttps:\S*\b", "", text)
    text = re.sub(r"\b\d*\b", "", text)
    text = re.sub(r"[^\w\s@#]", "", text)

    # Drop hashtags and usernames
    words = [
        word for word in text.split()
        if '#' not in word and '@' not in word
    ]

    # Run stemming: it's important to run this first before stop words for
    # cases such as that's (which has become "thats" by this point)
    stemmer = nltk.stem.SnowballStemmer('english')
    stems = [stemmer.stem(word) for word in words]

    # Remove stop words in a single pass using set membership
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.add('amp')

    return [stem for stem in stems if stem not in stopwords]
def on_status(self, data):
    """
    Convert an incoming status into a Tweet and store it

    @param data the status object; its _json attribute is passed to
        Tweet.fromTweepyJSON
    """
    status_json = data._json
    parsed = Tweet.fromTweepyJSON(status_json)
    self.raw_tweet_setter.store_tweet(parsed)