import json
import time
from copy import deepcopy
from datetime import datetime

from utils.Storage import Storage


def file_to_mongodb(args):
    operations = []
    storage = Storage(
        mongo_connection_string="mongodb://*****:*****@35.189.89.82:27017/dax_gcp")

    with open(args.file1, "r") as f1:
        records = 0
        for line in f1:
            data = json.loads(line)

            # Rename the raw file columns to the warehouse column names
            try:
                data["NEWS_TITLE_NewsDim"] = data.pop("news_title")
                data["NEWS_DATE_NewsDim"] = datetime.strptime(
                    data.pop("news_date"), '%Y-%m-%d %H:%M:%S')
                data["NEWS_ARTICLE_TXT_NewsDim"] = data.pop("news_article_txt")
                data["NEWS_SOURCE_NewsDim"] = data.pop("news_source")
                data["NEWS_PUBLICATION_NewsDim"] = data.pop("news_publication")
                data["categorised_tag"] = data.pop("news_topics")
                if data["constituent"] == "BMW":
                    data["constituent"] = "bmw"
            except Exception as e:
                print(e)
                continue

            operations.append(data)
            records += 1

            # Flush to MongoDB in batches of 1000 documents
            if len(operations) == 1000:
                print("Saving {} records".format(records))
                storage.save_to_mongodb(operations, "dax_gcp", "all_news")
                operations = []

    # Save any remaining documents from the final partial batch
    if len(operations) > 0:
        storage.save_to_mongodb(operations, "dax_gcp", "all_news")
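# For reference, each line of args.file1 is expected to be one standalone JSON
# object using the raw field names popped above. A hypothetical example line
# (values invented for illustration):
#
#   {"news_title": "BMW raises guidance", "news_date": "2017-11-01 09:30:00",
#    "news_article_txt": "...", "news_source": "reuters",
#    "news_publication": "Reuters", "news_topics": ["earnings"],
#    "constituent": "BMW"}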
def main(args):
    # Mapping of constituent display names to BusinessInsider identifiers
    all_constituents_dict_bi = {
        'Allianz': 'Allianz',
        'Adidas': 'Adidas',
        'BASF': 'BASF',
        'Bayer': 'Bayer',
        'Beiersdorf': 'Beiersdorf',
        'BMW': 'BMW',
        'Commerzbank': 'Commerzbank',
        'Continental': 'Continental',
        'Daimler': 'Daimler',
        'Deutsche Bank': 'Deutsche_Bank',
        'Deutsche Börse': 'Deutsche_Boerse',
        'Deutsche Post': 'Deutsche_Post',
        'Deutsche Telekom': 'Deutsche_Telekom',
        'EON': 'EON',
        'Fresenius Medical Care': 'Fresenius_Medical_Care',
        'Fresenius': 'Fresenius',
        'HeidelbergCement': 'HeidelbergCement',
        'Infineon': 'Infineon',
        'Linde': 'Linde_6',
        'Lufthansa': 'Lufthansa',
        'Merck': 'Merck',
        'Münchener Rückversicherungs-Gesellschaft': 'Munich_Re',
        'ProSiebenSat1 Media': 'ProSiebenSat1_Media',
        'RWE': 'RWE',
        'Siemens': 'Siemens',
        'Thyssenkrupp': 'thyssenkrupp',
        'Volkswagen (VW) vz': 'Volkswagen_vz',
        'Vonovia': 'Vonovia'
    }

    bi_analyst_table = analyst_businessinsider(all_constituents_dict_bi)
    ws_analyst_table = analyst_wallstreet()
    combined_analyst_table = combined_analyst(ws_analyst_table, bi_analyst_table)

    # Convert the DataFrames to lists of dicts for MongoDB insertion
    combined_analyst_json = json.loads(
        combined_analyst_table.to_json(orient='records'))
    bi_analyst_json = json.loads(bi_analyst_table.to_json(orient='records'))

    storage = Storage()

    # Save the results: combined -> analyst_opinions, bi -> analyst_opinions_all
    storage.save_to_mongodb(connection_string=args.param_connection_string,
                            database=args.database,
                            collection=args.collection_selected,
                            data=combined_analyst_json)
    storage.save_to_mongodb(connection_string=args.param_connection_string,
                            database=args.database,
                            collection=args.collection_all,
                            data=bi_analyst_json)
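# The DataFrame -> JSON -> list-of-dicts round trip above is what turns the
# analyst tables into MongoDB-insertable documents. A small sketch with
# invented columns (the real column names come from analyst_businessinsider
# and analyst_wallstreet, which are defined elsewhere):
#
#   import pandas as pd
#   df = pd.DataFrame([{"constituent": "BMW", "rating": "buy"}])
#   docs = json.loads(df.to_json(orient='records'))
#   # docs == [{"constituent": "BMW", "rating": "buy"}]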
def get_tweets(args):
    from utils import logging_utils as logging_utils
    from utils.TwitterDownloader import TwitterDownloader
    from utils import twitter_analytics_helpers as tap
    from utils.TaggingUtils import TaggingUtils as TU

    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = [
        "LANGUAGE", "TWEETS_PER_QUERY", "MAX_TWEETS", "CONNECTION_STRING",
        "DATABASE_NAME", "COLLECTION_NAME", "LOGGING", "EMAIL_USERNAME",
        "EMAIL_PASSWORD", "TWITTER_API_KEY", "TWITTER_API_SECRET",
        "BUCKET_NAME", "DESTINATION_TABLE", "LOGGING_TABLE"
    ]

    parameters = tap.get_parameters(args.param_connection_string, param_table,
                                    parameters_list)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & \
                             (x["STATUS"] == 'active')
    common_parameters = tap.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    languages = parameters["LANGUAGE"].split(",")

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])
    tagger = TU()

    downloader = TwitterDownloader(parameters["TWITTER_API_KEY"],
                                   parameters["TWITTER_API_SECRET"])
    downloader.load_api()

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"])

    fields_to_keep = [
        "text", "favorite_count", "source", "retweeted", "entities",
        "id_str", "retweet_count", "favorited", "user", "lang", "created_at",
        "place", "constituent_name", "constituent_id", "search_term", "id",
        "sentiment_score", "entity_tags", "relevance", "constituent"
    ]

    for language in languages:
        for constituent_id, constituent_name in all_constituents:
            search_query = get_search_string(constituent_id,
                                             args.param_connection_string,
                                             "PARAM_TWITTER_KEYWORDS",
                                             "PARAM_TWITTER_EXCLUSIONS")

            # Get max id of stored tweets, so we only fetch tweets with a higher id
            q = "SELECT MAX(id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' " \
                "AND lang = '{}';".format(common_parameters["BQ_DATASET"],
                                          parameters["DESTINATION_TABLE"],
                                          constituent_id, language)

            try:
                sinceId = int(
                    storage.get_bigquery_data(q,
                                              iterator_flag=False)[0]["max_id"])
            except Exception as e:
                print(e)
                sinceId = None

            max_id = -1
            tweetCount = 0

            print("Downloading max {0} tweets for {1} in {2} on {3}".format(
                parameters["MAX_TWEETS"], constituent_name, language,
                str(datetime.now())))

            while tweetCount < parameters["MAX_TWEETS"]:
                tweets_unmodified = []
                tweets_modified = []
                tweets_mongo = []

                try:
                    tweets, tmp_tweet_count, max_id = downloader.download(
                        constituent_name, search_query, language,
                        parameters["TWEETS_PER_QUERY"], sinceId, max_id)
                except Exception as e:
                    print(e)
                    continue

                if not tweets:
                    break

                print("Downloaded {} tweets".format(tmp_tweet_count))
                tweetCount += tmp_tweet_count

                # Add fields for both unmodified and modified tweets
                for tweet in tweets:
                    tweet._json['source'] = "Twitter"
                    tweet._json['constituent_name'] = constituent_name
                    tweet._json['constituent_id'] = constituent_id
                    tweet._json['search_term'] = search_query
                    tweet._json["constituent"] = tap.get_old_constituent_name(
                        constituent_id)

                    # Remove bad fields
                    clean_tweet = tap.scrub(tweet._json)

                    # Unmodified copy: the raw tweet plus a converted date
                    t_unmodified = deepcopy(clean_tweet)
                    t_unmodified["date"] = tap.convert_timestamp(
                        t_unmodified["created_at"])
                    tweets_unmodified.append(t_unmodified)

                    # Add additional fields: sentiment, entity tags, relevance
                    clean_tweet["sentiment_score"] = tap.get_nltk_sentiment(
                        str(clean_tweet["text"]))
                    tagged_text = tagger.get_spacy_entities(
                        str(clean_tweet["text"]))
                    clean_tweet["entity_tags"] = tap.get_spacey_tags(tagged_text)
                    clean_tweet["relevance"] = -1

                    # MongoDB copy: parse created_at into a native datetime
                    t_mongo = deepcopy(clean_tweet)
                    t_mongo['date'] = datetime.strptime(
                        t_mongo['created_at'], '%a %b %d %H:%M:%S %z %Y')
                    tweets_mongo.append(t_mongo)

                    # Modified copy: keep only the whitelisted fields
                    tagged_tweet = dict((k, clean_tweet[k])
                                        for k in fields_to_keep
                                        if k in clean_tweet)
                    tagged_tweet['date'] = tap.convert_timestamp(
                        clean_tweet["created_at"])
                    tweets_modified.append(tagged_tweet)

                # Send to PubSub topic
                #ps_utils.publish("igenie-project", "tweets-unmodified", tweets_unmodified)
                #ps_utils.publish("igenie-project", "tweets", tweets_modified)

                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        '{}_unmodified'.format(parameters["DESTINATION_TABLE"]),
                        tweets_unmodified)
                except Exception as e:
                    print(e)
                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        parameters["DESTINATION_TABLE"], tweets_modified)
                except Exception as e:
                    print(e)
                try:
                    storage.save_to_mongodb(tweets_mongo, "dax_gcp",
                                            parameters["DESTINATION_TABLE"])
                except Exception as e:
                    print(e)

                time.sleep(1)

            print("Saved {} tweets for {} in {}".format(
                tweetCount, constituent_name, language))

            if parameters["LOGGING"]:
                doc = [{
                    "date": time.strftime('%Y-%m-%d %H:%M:%S',
                                          datetime.now().timetuple()),
                    "constituent_name": constituent_name,
                    "constituent_id": constituent_id,
                    "downloaded_tweets": tweetCount,
                    "language": language
                }]
                logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

    return "Downloaded tweets"
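# A minimal sketch of a command-line entry point tying the three jobs above
# together. The subcommand names and the defaults shown here are assumptions
# for illustration; only the argument attributes each function actually reads
# are exposed.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="DAX news / analyst / tweet collection jobs")
    subparsers = parser.add_subparsers(dest="command", required=True)

    news_parser = subparsers.add_parser(
        "load_news", help="load a JSON-lines news file into MongoDB")
    news_parser.add_argument("file1",
                             help="newline-delimited JSON file of news records")

    analyst_parser = subparsers.add_parser(
        "analysts", help="collect and store analyst opinions")
    analyst_parser.add_argument("--param_connection_string", required=True)
    analyst_parser.add_argument("--database", default="dax_gcp")
    analyst_parser.add_argument("--collection_selected",
                                default="analyst_opinions")
    analyst_parser.add_argument("--collection_all",
                                default="analyst_opinions_all")

    tweets_parser = subparsers.add_parser(
        "tweets", help="download and store tweets per constituent")
    tweets_parser.add_argument("--param_connection_string", required=True)
    tweets_parser.add_argument("--google_key_path", required=True)
    tweets_parser.add_argument("--environment", default="production")

    cli_args = parser.parse_args()
    if cli_args.command == "load_news":
        file_to_mongodb(cli_args)
    elif cli_args.command == "analysts":
        main(cli_args)
    elif cli_args.command == "tweets":
        get_tweets(cli_args)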