Python MongoHandler.store_to_collection Exemples

Langage de programmation: Python

Espace de noms/Paquet: DataCollection.mongo

Class/Type: MongoHandler

Méthode/Fonction: store_to_collection

Exemples sur hotexamples.com: 2

Python MongoHandler.store_to_collection - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de DataCollection.mongo.MongoHandler.store_to_collection extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

retrieve_from_collection(6)

MongoHandler(5)

store_to_collection(2)

get_with_id(1)

Méthodes fréquemment utilisées

retrieve_from_collection (6)

MongoHandler (5)

store_to_collection (2)

get_with_id (1)

Exemple #1

0

Afficher le fichier

Fichier : twitter_data.py Projet : stevejpapad/WebMiningAUTh

class TweetMiner(object):
    """Fetch tweets through Tweepy and store flattened copies via MongoHandler.

    Tweets are filtered (no retweets, no promotional terms, at least five
    words), flattened into a plain dict by ``preprocess_tweet`` and written
    to the ``twitter_new`` collection. ``get_user_tweets`` builds a richer
    per-user record and writes it to ``twitter_profiles_1K``.
    """

    # Tweepy API client; assigned in __init__.
    api = None
    # MongoHandler used for every read and write; assigned in __init__.
    connection = None

    # Substrings that disqualify a tweet from collection.
    _BLOCKED_TERMS = ("promo", "giveaway")
    # Minimum number of whitespace-separated tokens a tweet must contain.
    _MIN_WORDS = 5
    # Timestamp layout used by the "created_at" fields read below.
    _TWITTER_TIME_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'
    # NOTE(review): the pause interval was redacted in the source listing;
    # 100 is a placeholder -- confirm against the original project.
    _PAUSE_EVERY_N_USERS = 100

    def __init__(self):
        """Authenticate against Twitter and open the Mongo connection."""
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        self.api = API(auth,
                       wait_on_rate_limit=True,
                       wait_on_rate_limit_notify=True)
        self.connection = MongoHandler()

    @classmethod
    def _is_relevant_text(cls, text):
        """Return True when *text* passes the collection filter.

        A tweet is kept when it is not a retweet (does not start with
        ``"RT @"``), contains none of ``_BLOCKED_TERMS`` and has at least
        ``_MIN_WORDS`` whitespace-separated tokens.

        The previous inline check was ``("promo" or "giveaway") not in
        text``, which evaluates to ``"promo" not in text`` and therefore
        never filtered "giveaway".
        """
        if text.startswith("RT @"):
            return False
        if any(term in text for term in cls._BLOCKED_TERMS):
            return False
        return len(text.split()) >= cls._MIN_WORDS

    # Retrieve tweets with a given tweet id
    def get_tweets_with_id(self):
        """Re-process stored tweets that are not yet in ``twitter_new``.

        Reads the ``twitter`` and ``twitter_new`` collections, selects the
        ids present only in the former whose text passes
        ``_is_relevant_text``, and runs ``preprocess_tweet`` (which stores
        the result) on every document returned for each id. Prints the
        number of processed and failed tweets.
        """
        old_posts = self.connection.retrieve_from_collection("twitter")
        new_posts = self.connection.retrieve_from_collection("twitter_new")
        # A set keeps the membership test below O(1) per row.
        known_ids = {row["_id"] for row in new_posts}
        ids_list = [
            row["_id"] for row in old_posts
            if row["_id"] not in known_ids
            and self._is_relevant_text(row["full_text"])
        ]
        print("Starting...")
        count0 = 0
        count1 = 0
        for tweet_id in ids_list:
            try:
                tweets = self.connection.get_with_id("twitter",
                                                     {"_id": tweet_id})
                for tweet in tweets:
                    self.preprocess_tweet(tweet)
                    count1 += 1
            except TweepError:
                # Kept from the original, which once fetched the tweet
                # from the API at this point.
                count0 += 1
        print("--------------------------------")
        print(f"Number of found: {count1}")
        print("--------------------------------")
        print(f"Number of not found: {count0}")

    # Preprocess tweet text
    def preprocess_tweet(self, tweet):
        """Flatten a raw tweet dict, store it in ``twitter_new`` and return it.

        *tweet* is indexed like a Twitter status JSON object (``id``,
        ``created_at``, ``full_text``, ``entities``, ``user`` ...). The
        returned dict holds the selected fields plus ``user_months``, the
        number of calendar months between the account's creation and now.
        """
        user = tweet["user"]
        entities = tweet["entities"]

        tweet_dict = dict()
        tweet_dict["_id"] = tweet["id"]
        tweet_dict["created_at"] = time.strftime(
            '%Y-%m-%d',
            time.strptime(tweet["created_at"], self._TWITTER_TIME_FORMAT))
        tweet_dict["text"] = preprocess_text(tweet["full_text"])
        tweet_dict["hashtags"] = [
            hashtag["text"] for hashtag in entities["hashtags"]
        ]
        tweet_dict["mentions"] = [
            mention["name"] for mention in entities["user_mentions"]
        ]
        tweet_dict["urls"] = [url["url"] for url in entities["urls"]]
        tweet_dict["user_id"] = user["id"]
        tweet_dict["user_name"] = user["name"]
        tweet_dict["user_screen_name"] = user["screen_name"]
        tweet_dict["user_location"] = user["location"]
        tweet_dict["user_followers"] = user["followers_count"]
        tweet_dict["user_friends"] = user["friends_count"]
        tweet_dict["user_listed"] = user["listed_count"]
        tweet_dict["user_favourites"] = user["favourites_count"]

        # Account age in whole calendar months (day of month is ignored).
        joined = time.strptime(user["created_at"], self._TWITTER_TIME_FORMAT)
        end_date = datetime.datetime.now()
        tweet_dict["user_months"] = (
            (end_date.year - joined.tm_year) * 12
            + (end_date.month - joined.tm_mon))

        tweet_dict["user_statuses"] = user["statuses_count"]
        tweet_dict["user_verified"] = int(user["verified"])
        tweet_dict["retweets"] = tweet["retweet_count"]
        tweet_dict["favorites"] = tweet["favorite_count"]
        tweet_dict["is_quoted"] = tweet["is_quote_status"]
        self.connection.store_to_collection(tweet_dict, "twitter_new")
        return tweet_dict

    # Retrieve new tweets
    def get_new_tweets(self):
        """Search for new English tweets and store those passing the filter."""
        count = 0
        for tweet in Cursor(self.api.search,
                            q="@#ClimateChange",
                            lang="en",
                            tweet_mode="extended").items():
            if self._is_relevant_text(tweet._json["full_text"]):
                count += 1
                self.preprocess_tweet(tweet._json)
        print("--------------------------------")
        print(f"Number of found: {count}")

    # Get tweets from a particular user
    def get_user_tweets(self):
        """Collect keyword-matching English tweets for a slice of users.

        For every user name in ``profiling.get_user_names()[489:500]`` the
        matching statuses are turned into records (text, sentiment, emotion
        scores, subjectivity) and stored in ``twitter_profiles_1K``.
        Duplicate keys are reported and skipped.

        Returns ``re_list``, which is never appended to and is therefore
        always an empty list.
        """
        re_list = []
        users = profiling.get_user_names()
        count_users = 0
        for user in users[489:500]:  # 363
            try:
                # NOTE(review): the source listing is redacted between
                # 'print("User: "' and '"en", tweet_mode="extended")'. The
                # next four statements are a minimal reconstruction that
                # defines the names used further down -- confirm against
                # the original project.
                print("User: ", user)
                user_tweets = []
                count_tweets = 0
                statuses = self.api.user_timeline(screen_name=user,
                                                  lang="en",
                                                  tweet_mode="extended")
                for status in statuses:
                    if not (any(keyword in status.full_text
                                for keyword in lexicons.keywords)
                            and len(status.full_text.split()) >= 5
                            and detect(status.full_text) == 'en'):
                        continue
                    status_dict = dict()
                    status_dict["_id"] = status.id
                    status_dict["user_name"] = status.author.screen_name
                    status_dict["location"] = status.author.location
                    status_dict["description"] = preprocess_text(
                        status.author.description)
                    status_dict['date'] = (
                        f"{status.created_at.year}-"
                        f"{status.created_at.month}-"
                        f"{status.created_at.day}")
                    # Drop a leading "RT @user:" marker before cleaning.
                    clean_text = preprocess_text(
                        re.sub(r'^RT\s@\w+:', r'', status.full_text))
                    status_dict["text"] = clean_text
                    status_dict["sentiment"] = round(
                        sentiment_analyzer_scores(
                            status.full_text)['compound'], 3)
                    (anger, anticipation, disgust, fear, joy, _negative,
                     _positive, sadness, surprise,
                     trust) = get_emotions(clean_text)
                    status_dict["anger"] = anger
                    status_dict["anticipation"] = anticipation
                    status_dict["disgust"] = disgust
                    status_dict["fear"] = fear
                    status_dict["joy"] = joy
                    status_dict["sadness"] = sadness
                    status_dict["surprise"] = surprise
                    status_dict["trust"] = trust
                    subj = TextBlob(status.full_text).sentiment
                    status_dict["subjectivity"] = round(subj[1], 3)
                    user_tweets.append(status_dict)

                for status_dict in user_tweets:
                    try:
                        # new_twitter_profiles for training data
                        self.connection.store_to_collection(
                            status_dict, "twitter_profiles_1K")
                        count_tweets += 1
                    except pymongo.errors.DuplicateKeyError:
                        print("exception")
                        continue

                # NOTE(review): the source listing is also redacted between
                # '" relevant tweets by the user: "' and '"test sleep!")'.
                # The counter increment and the pause condition below are
                # reconstructed -- confirm against the original project.
                print("Found ", count_tweets,
                      " relevant tweets by the user: ", user)
                count_users += 1
                if count_users % self._PAUSE_EVERY_N_USERS == 0:
                    print("test sleep!")
                    time.sleep(300)
                    print("test sleep ended!!!")
                if count_users > 1001:
                    print("break!")
                    break
            except tweepy.error.TweepError:
                print("Locked profile!")
                continue
            except langdetect.lang_detect_exception.LangDetectException:
                continue
        return re_list

Exemple #2

0

Afficher le fichier

class InstaMiner(object):
    """Collect Instagram posts for a hashtag and store them via MongoHandler.

    ``get_new_posts`` writes a trimmed copy of every post to the
    ``instagram`` collection; ``preprocess_posts`` reads that collection
    back and writes cleaned English posts to ``instagram_new``.
    """

    # Instaloader instance; assigned in __init__.
    loader = None
    # MongoHandler used for every read and write; assigned in __init__.
    connection = None

    # Output key -> path inside the raw post node, in storage order.
    _NODE_FIELDS = (
        ("caption", ("edge_media_to_caption", "edges", 0, "node", "text")),
        ("location", ("location",)),
        ("shortcode", ("shortcode",)),
        ("timestamp", ("taken_at_timestamp",)),
        ("liked_by", ("edge_liked_by", "count")),
        ("user_id", ("owner", "id")),
        ("username", ("owner", "username")),
        ("is_verified", ("owner", "is_verified")),
        ("is_private", ("owner", "is_private")),
    )

    def __init__(self):
        """Create the metadata-only Instaloader and open the Mongo connection."""
        # The loader must be stored on the instance: it was previously bound
        # to a local name only, leaving self.loader as None so that
        # get_new_posts failed on its first attribute access.
        self.loader = Instaloader(download_pictures=False,
                                  download_video_thumbnails=False,
                                  download_videos=False,
                                  compress_json=False,
                                  sleep=True)
        self.connection = MongoHandler()

    @staticmethod
    def _dig(node, *path):
        """Follow *path* (keys or indexes) into *node*; return None if absent.

        Only the lookup failures a nested dict/list can produce (missing
        key, index out of range, subscripting a non-container such as None)
        are treated as "absent"; any other error propagates.
        """
        current = node
        for step in path:
            try:
                current = current[step]
            except (KeyError, IndexError, TypeError):
                return None
        return current

    # Retrieve new posts from Instagram
    def get_new_posts(self):
        """Store a trimmed record of every 'climatechange' hashtag post.

        Each record holds the post id as ``_id`` plus the fields listed in
        ``_NODE_FIELDS``; a field that is missing from the raw node is
        stored as ``None``. The raw node is printed as JSON before storing.
        """
        for post in self.loader.get_hashtag_posts('climatechange'):
            node = post._node
            # Keeping only necessary k-v
            new_post = dict()
            new_post["_id"] = node.pop("id")
            print(json.dumps(node, indent=4, sort_keys=True))
            for key, path in self._NODE_FIELDS:
                new_post[key] = self._dig(node, *path)
            self.connection.store_to_collection(new_post, "instagram")

    # Preprocesses instagram posts
    def preprocess_posts(self):
        """Clean stored posts and write the English ones to ``instagram_new``.

        Posts without a caption, with fewer than five words, or not
        detected as English are skipped. Any error while processing a
        single post is reported by printing ``1`` and the post is skipped,
        so one bad document does not abort the run.
        """
        posts = self.connection.retrieve_from_collection("instagram")
        count = 0
        for post in posts:
            caption = post["caption"]
            if not caption:
                continue
            try:
                if len(caption.split()) >= 5 and detect(caption) == 'en':
                    new_post = dict()
                    new_post["_id"] = int(post["_id"])
                    new_post["hashtags"] = get_hashtags(caption)
                    new_post["mentions"] = get_mentions(caption)
                    new_post["caption"] = preprocess_text(caption)
                    new_post["shortcode"] = post["shortcode"]
                    new_post["user_id"] = post["user_id"]
                    new_post["likes"] = post["liked_by"]
                    new_post["created_at"] = datetime.datetime.fromtimestamp(
                        post["timestamp"]).strftime("%Y-%m-%d")
                    self.connection.store_to_collection(
                        new_post, "instagram_new")
                    count += 1
            except Exception:
                # Best-effort per post, but no longer swallows
                # KeyboardInterrupt/SystemExit as the bare except did.
                print(1)
        print("--------------------------------")
        print(f"Number of found: {count}")