def main(args):
    client = MongoClient(args.connection_string)
    db = client["dax_gcp"]
    collection = db["tweets"]
    sia = SIA()
    tagger = TU()

    operations = []
    records = 0
    start_time = time.time()

    # Only touch documents that do not have a "constituent" field yet.
    for doc in collection.find({"constituent": {"$exists": False}}, no_cursor_timeout=True):
        #sentiment_score = get_nltk_sentiment(doc["text"], sia)
        new_values = {}
        '''
        new_values["nltk_sentiment_numeric"] = sentiment_score
        new_values["sentiment_score"] = sentiment_score

        if "relevance" not in doc:
            new_values["relevance"] = -1

        new_values['date'] = datetime.strptime(doc['created_at'], '%a %b %d %H:%M:%S %z %Y')

        # TO DO
        tagged_text = tagger.get_spacy_entities(doc["text"])
        new_values["entity_tags"] = tap.get_spacey_tags(tagged_text)
        '''

        new_values["constituent"] = tap.get_old_constituent_name(doc["constituent_id"])

        operations.append(UpdateOne({"_id": doc["_id"]}, {"$set": new_values}))

        # Send the updates in batches of 2000
        if len(operations) == 2000:
            print("Performing bulk write")
            collection.bulk_write(operations, ordered=False)
            operations = []
            records += 2000
            print("Write done. Saved {} records".format(records))

    # Flush any remaining updates
    if len(operations) > 0:
        collection.bulk_write(operations, ordered=False)
        records += len(operations)

    print("--- %s seconds ---" % (time.time() - start_time))
    print("Processed {} records".format(records))
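# The batching pattern used in main() (accumulate UpdateOne operations, flush every N,
# then flush the remainder) recurs in several functions below. The helper here is a
# minimal, self-contained sketch of that pattern for reference only: its name is
# hypothetical, nothing in this module calls it, and it assumes the caller builds the
# pymongo operations (UpdateOne/InsertOne) itself.
def _bulk_write_in_batches(collection, operations_iter, batch_size=2000):
    """Apply already-built pymongo write operations in unordered batches."""
    batch = []
    written = 0
    for op in operations_iter:
        batch.append(op)
        if len(batch) == batch_size:
            collection.bulk_write(batch, ordered=False)
            written += len(batch)
            batch = []
    # Flush whatever is left over after the last full batch.
    if batch:
        collection.bulk_write(batch, ordered=False)
        written += len(batch)
    return written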
def get_tweets(args):
    from utils import logging_utils as logging_utils
    from utils.TwitterDownloader import TwitterDownloader
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tap
    from utils.TaggingUtils import TaggingUtils as TU

    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = ["LANGUAGE", "TWEETS_PER_QUERY", "MAX_TWEETS", "CONNECTION_STRING",
                       "DATABASE_NAME", "COLLECTION_NAME", "LOGGING", "EMAIL_USERNAME",
                       "EMAIL_PASSWORD", "TWITTER_API_KEY", "TWITTER_API_SECRET",
                       "BUCKET_NAME", "DESTINATION_TABLE", "LOGGING_TABLE"]
    parameters = tap.get_parameters(args.param_connection_string, param_table, parameters_list)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = tap.get_parameters(args.param_connection_string, common_table,
                                           common_list, common_where)

    languages = parameters["LANGUAGE"].split(",")

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])
    tagger = TU()

    downloader = TwitterDownloader(parameters["TWITTER_API_KEY"], parameters["TWITTER_API_SECRET"])
    downloader.load_api()

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"])

    fields_to_keep = ["text", "favorite_count", "source", "retweeted", "entities", "id_str",
                      "retweet_count", "favorited", "user", "lang", "created_at", "place",
                      "constituent_name", "constituent_id", "search_term", "id",
                      "sentiment_score", "entity_tags", "relevance", "constituent"]

    for language in languages:
        for constituent_id, constituent_name in all_constituents:
            search_query = get_search_string(constituent_id, args.param_connection_string,
                                             "PARAM_TWITTER_KEYWORDS", "PARAM_TWITTER_EXCLUSIONS")

            # Get max id of stored tweets so we only extract tweets with a higher id
            q = "SELECT MAX(id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' " \
                "AND lang = '{}';".format(common_parameters["BQ_DATASET"],
                                          parameters["DESTINATION_TABLE"],
                                          constituent_id, language)

            try:
                sinceId = int(storage.get_bigquery_data(q, iterator_flag=False)[0]["max_id"])
            except Exception as e:
                print(e)
                sinceId = None

            max_id = -1
            tweetCount = 0

            print("Downloading max {0} tweets for {1} in {2} on {3}".format(
                parameters["MAX_TWEETS"], constituent_name, language, str(datetime.now())))

            while tweetCount < parameters["MAX_TWEETS"]:
                tweets_unmodified = []
                tweets_modified = []
                tweets_mongo = []

                try:
                    tweets, tmp_tweet_count, max_id = downloader.download(
                        constituent_name, search_query, language,
                        parameters["TWEETS_PER_QUERY"], sinceId, max_id)
                except Exception as e:
                    continue

                if not tweets:
                    break
                else:
                    print("Downloaded {} tweets".format(tmp_tweet_count))

                tweetCount += tmp_tweet_count

                # Add fields for both unmodified and modified tweets
                for tweet in tweets:
                    tweet._json['source'] = "Twitter"
                    tweet._json['constituent_name'] = constituent_name
                    tweet._json['constituent_id'] = constituent_id
                    tweet._json['search_term'] = search_query
                    tweet._json["constituent"] = tap.get_old_constituent_name(constituent_id)

                    # Removing bad fields
                    clean_tweet = tap.scrub(tweet._json)

                    # Separate the tweets that go to one topic or the other
                    # unmodified
                    t_unmodified = deepcopy(clean_tweet)
                    t_unmodified["date"] = tap.convert_timestamp(t_unmodified["created_at"])
                    tweets_unmodified.append(t_unmodified)

                    # Add additional fields
                    clean_tweet["sentiment_score"] = tap.get_nltk_sentiment(str(clean_tweet["text"]))
                    tagged_text = tagger.get_spacy_entities(str(clean_tweet["text"]))
                    clean_tweet["entity_tags"] = tap.get_spacey_tags(tagged_text)
                    clean_tweet["relevance"] = -1

                    # mongo
                    t_mongo = deepcopy(clean_tweet)
                    t_mongo['date'] = datetime.strptime(t_mongo['created_at'],
                                                        '%a %b %d %H:%M:%S %z %Y')
                    tweets_mongo.append(t_mongo)

                    # modified
                    tagged_tweet = dict((k, clean_tweet[k]) for k in fields_to_keep
                                        if k in clean_tweet)
                    tagged_tweet['date'] = tap.convert_timestamp(clean_tweet["created_at"])
                    tweets_modified.append(tagged_tweet)

                # send to PubSub topic
                #ps_utils.publish("igenie-project", "tweets-unmodified", tweets_unmodified)
                #ps_utils.publish("igenie-project", "tweets", tweets_modified)

                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        '{}_unmodified'.format(parameters["DESTINATION_TABLE"]),
                        tweets_unmodified)
                except Exception as e:
                    print(e)

                try:
                    storage.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                                 parameters["DESTINATION_TABLE"],
                                                 tweets_modified)
                except Exception as e:
                    print(e)

                try:
                    storage.save_to_mongodb(tweets_mongo, "dax_gcp",
                                            parameters["DESTINATION_TABLE"])
                except Exception as e:
                    print(e)

                time.sleep(1)

            print("Saved {} tweets for {} in {}".format(tweetCount, constituent_name, language))

            if parameters["LOGGING"]:
                doc = [{"date": time.strftime('%Y-%m-%d %H:%M:%S',
                                              datetime.now().date().timetuple()),
                        "constituent_name": constituent_name,
                        "constituent_id": constituent_id,
                        "downloaded_tweets": tweetCount,
                        "language": language}]
                logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

    return "Downloaded tweets"
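# get_tweets() relies on TwitterDownloader.download() paging backwards with max_id while
# honouring sinceId as a lower bound. The sketch below shows that windowing pattern in
# isolation, using a hypothetical fetch_page(query, since_id, max_id) callable; it is
# illustrative only and is not used by the pipeline above.
def _paged_download(fetch_page, query, since_id=None, max_tweets=1000):
    collected = []
    max_id = None  # upper bound for the next page (None means "newest available")
    while len(collected) < max_tweets:
        page = fetch_page(query, since_id=since_id, max_id=max_id)
        if not page:
            break
        collected.extend(page)
        # The next page must be strictly older than the oldest tweet seen so far.
        max_id = min(t["id"] for t in page) - 1
    return collected[:max_tweets]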
def get_daily_orbis_news(args):
    if __name__ != "__main__":
        from utils.Storage import Storage
        from utils.Storage import MongoEncoder
        from utils.SOAPUtils import SOAPUtils
        from utils import twitter_analytics_helpers as tah
        from utils.TaggingUtils import TaggingUtils as TU
        from utils import logging_utils as logging_utils

    # get constituents
    soap = SOAPUtils()
    storage = Storage(args.google_key_path)
    tagger = TU()
    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"]
    table = "MASTER_CONSTITUENTS"
    constituents = storage.get_sql_data(sql_connection_string=args.param_connection_string,
                                        sql_table_name=table,
                                        sql_column_list=columns)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "BVD_USERNAME", "BVD_PASSWORD", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'Orbis'
    parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string, common_table,
                                           common_list, common_where)

    for constituent_id, constituent_name, bvdid in constituents:
        # get the highest news id already stored for the constituent
        query = """
        SELECT max(news_id) as max_id FROM `{}.{}`
        WHERE constituent_id = '{}' and news_origin = "Orbis"
        """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage.get_bigquery_data(query=query, iterator_flag=False)
            max_id = result[0]["max_id"]
        except Exception as e:
            max_id = None
            continue

        start = 0
        max_count = 20

        print("Constituent: {},{}".format(constituent_name, bvdid))

        stop = False

        try:
            token = soap.get_token(parameters["BVD_USERNAME"], parameters["BVD_PASSWORD"], 'orbis')
            selection_token, selection_count = soap.find_by_bvd_id(token, bvdid, 'orbis')
        except Exception as e:
            print(e)
            return

        while not stop:
            try:
                query = "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, " \
                        "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title," \
                        "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, " \
                        "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, " \
                        "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics," \
                        "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country," \
                        "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region," \
                        "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language," \
                        "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source," \
                        "LINE BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication," \
                        "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id FROM RemoteAccess.A".format(start, max_count)

                get_data_result = soap.get_data(token, selection_token, selection_count,
                                                query, 'orbis', timeout=None)
            except Exception as e:
                print(str(e))
                if token:
                    soap.close_connection(token, 'orbis')
                    token = None
                break

            result = ET.fromstring(get_data_result)
            csv_result = result[0][0][0].text
            TESTDATA = StringIO(csv_result)

            try:
                df = pd.read_csv(TESTDATA, sep=",", parse_dates=["news_date"])
            except Exception as e:
                print(e)
                break

            if df.shape[0] == 0:
                print("No records in df")
                break

            initial_shape = df.shape[0]

            # Keep only news items newer than the ones already in BigQuery
            df = df.astype({"news_id": int}, copy=False, errors='ignore')
            df = df.loc[df["news_id"] > max_id]

            if df.shape[0] == 0:
                print("No new data")
                break

            if df.shape[0] != initial_shape:
                stop = True

            # Make news_title column a string
            df = df.astype({"news_title": str}, copy=False, errors='ignore')

            # Remove duplicate rows (same title)
            df.drop_duplicates(["news_title"], inplace=True)

            # Get sentiment score
            df["score"] = df.apply(lambda row: tah.get_nltk_sentiment(str(row["news_article_txt"])), axis=1)

            # get sentiment word
            df["sentiment"] = df.apply(lambda row: tah.get_sentiment_word(row["score"]), axis=1)

            # add constituent name, id and old name
            df["constituent_id"] = constituent_id
            df["constituent_name"] = constituent_name
            old_constituent_name = tah.get_old_constituent_name(constituent_id)
            df["constituent"] = old_constituent_name

            # add URL
            df["url"] = None

            # add show
            df["show"] = True

            # get entity tags
            entity_tags = []
            for text in df["news_title"]:
                tags = tah.get_spacey_tags(tagger.get_spacy_entities(str(text)))
                entity_tags.append(tags)

            fields = ["news_date", "news_title", "news_article_txt", "news_companies", "news_source",
                      "news_publication", "news_topics", "news_country", "news_region", "news_language",
                      "news_id", "score", "sentiment", "constituent_id", "constituent_name",
                      "constituent", "url", "show"]

            # Get needed fields
            df_bigquery = df[fields]
            bigquery_data = json.loads(df_bigquery.to_json(orient="records", date_format="iso"))

            # set entity_tags and normalise field types per record
            for i in range(0, len(bigquery_data)):
                bigquery_data[i]["entity_tags"] = entity_tags[i]

                # set news_date
                if "news_date" in bigquery_data[i] and bigquery_data[i]["news_date"]:
                    date = bigquery_data[i]["news_date"]
                    ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    bigquery_data[i]["news_date"] = ts
                else:
                    bigquery_data[i]["news_date"] = None

                if bigquery_data[i]["news_title"]:
                    bigquery_data[i]["news_title"] = str(bigquery_data[i]["news_title"])

                if bigquery_data[i]["news_article_txt"]:
                    bigquery_data[i]["news_article_txt"] = str(bigquery_data[i]["news_article_txt"])

                if bigquery_data[i]["news_companies"]:
                    try:
                        bigquery_data[i]["news_companies"] = [
                            x.strip() for x in bigquery_data[i]["news_companies"].split(";")]
                    except Exception as e:
                        bigquery_data[i]["news_companies"] = []
                else:
                    bigquery_data[i]["news_companies"] = []

                if bigquery_data[i]["news_topics"]:
                    try:
                        if bigquery_data[i]["news_topics"].isdigit():
                            bigquery_data[i]["news_topics"] = []
                        elif bigquery_data[i]["news_topics"] == 'None':
                            bigquery_data[i]["news_topics"] = []
                        else:
                            bigquery_data[i]["news_topics"] = [
                                x.strip() for x in bigquery_data[i]["news_topics"].split(";")]
                    except Exception as e:
                        bigquery_data[i]["news_topics"] = []
                else:
                    bigquery_data[i]["news_topics"] = []

                if bigquery_data[i]["news_country"]:
                    try:
                        bigquery_data[i]["news_country"] = [
                            x.strip() for x in bigquery_data[i]["news_country"].split(";")]
                    except Exception as e:
                        bigquery_data[i]["news_country"] = []
                else:
                    bigquery_data[i]["news_country"] = []

                if bigquery_data[i]["news_region"]:
                    try:
                        bigquery_data[i]["news_region"] = [
                            x.strip() for x in bigquery_data[i]["news_region"].split(";")]
                    except Exception as e:
                        bigquery_data[i]["news_region"] = []
                else:
                    bigquery_data[i]["news_region"] = []

                if bigquery_data[i]["news_language"]:
                    bigquery_data[i]["news_language"] = str(bigquery_data[i]["news_language"])

                if bigquery_data[i]["news_source"]:
                    bigquery_data[i]["news_source"] = str(bigquery_data[i]["news_source"])

                if bigquery_data[i]["news_publication"]:
                    bigquery_data[i]["news_publication"] = str(bigquery_data[i]["news_publication"])

                if bigquery_data[i]["news_id"]:
                    try:
                        bigquery_data[i]["news_id"] = int(bigquery_data[i]["news_id"])
                    except Exception as e:
                        bigquery_data[i]["news_id"] = str(bigquery_data[i]["news_id"])

                if bigquery_data[i]["sentiment"]:
                    bigquery_data[i]["sentiment"] = str(bigquery_data[i]["sentiment"])

                if bigquery_data[i]["constituent_id"]:
                    bigquery_data[i]["constituent_id"] = str(bigquery_data[i]["constituent_id"])

                if bigquery_data[i]["constituent_name"]:
                    bigquery_data[i]["constituent_name"] = str(bigquery_data[i]["constituent_name"])

                if bigquery_data[i]["constituent"]:
                    bigquery_data[i]["constituent"] = str(bigquery_data[i]["constituent"])

            try:
                storage.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                             parameters["DESTINATION_TABLE"], bigquery_data)
            except Exception as e:
                print(e)

            start = start + 20
            print("Records saved: {}".format(df.shape[0]))

            log = [{"date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                    "constituent_name": constituent_name,
                    "constituent_id": constituent_id,
                    "downloaded_news": df.shape[0],
                    "source": "Orbis"}]

            if parameters["LOGGING"] and bigquery_data:
                logging_utils.logging(log, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

        if token:
            soap.close_connection(token, 'orbis')
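# The Orbis response handling above (XML envelope -> embedded CSV -> DataFrame) is easy
# to lose track of inline. Below is a minimal, illustrative sketch of just that parsing
# step, assuming the CSV sits in the first nested text node as in get_daily_orbis_news();
# the function name is hypothetical and nothing in this module calls it.
def _orbis_payload_to_df(get_data_result):
    import xml.etree.ElementTree as ET
    from io import StringIO
    import pandas as pd

    root = ET.fromstring(get_data_result)
    csv_text = root[0][0][0].text  # CSV block nested inside the SOAP/XML envelope
    return pd.read_csv(StringIO(csv_text), sep=",", parse_dates=["news_date"])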
def get_historical_orbis_news(user, pwd, database, google_key_path, param_connection_string):
    # get parameters
    soap = SOAPUtils()
    storage = Storage(google_key_path)
    tagger = TU()
    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"]
    table = "MASTER_CONSTITUENTS"
    constituents = storage.get_sql_data(sql_connection_string=param_connection_string,
                                        sql_table_name=table,
                                        sql_column_list=columns)[2:]

    to_skip = ["BAYERISCHE MOTOREN WERKE AG", "DEUTSCHE BANK AG", "SIEMENS AG", "SAP SE",
               "VOLKSWAGEN AG"]

    for constituent_id, constituent_name, bvdid in constituents:
        limit = get_number_of_news_items(constituent_name)

        if constituent_name in to_skip:
            print("Skipping")
            continue

        records = 0
        start = 0
        max_count = 20
        filename = "bq_news_{}.json".format(constituent_id)

        print("Constituent: {},{}".format(constituent_name, bvdid))
        failed = 0

        if constituent_name == "BASF SE":
            start = 4280

        try:
            token = soap.get_token(user, pwd, database)
            selection_token, selection_count = soap.find_by_bvd_id(token, bvdid, database)
        except Exception as e:
            print(e)
            return

        with open(filename, "a") as f:
            while limit > records:
                try:
                    query = "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, " \
                            "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title," \
                            "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, " \
                            "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, " \
                            "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics," \
                            "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country," \
                            "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region," \
                            "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language," \
                            "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source," \
                            "LINE BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication," \
                            "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id FROM RemoteAccess.A".format(start, max_count)

                    get_data_result = soap.get_data(token, selection_token, selection_count,
                                                    query, database, timeout=None)
                except Exception as e:
                    print(str(e))
                    continue
                finally:
                    pass
                    #if token:
                    #    soap.close_connection(token, database)

                result = ET.fromstring(get_data_result)
                csv_result = result[0][0][0].text
                TESTDATA = StringIO(csv_result)

                try:
                    df = pd.read_csv(TESTDATA, sep=",", parse_dates=["news_date"])
                except Exception as e:
                    print(e)
                    continue

                if df.shape[0] == 0:
                    print("No records in df")
                    continue

                # Make news_title column a string
                df = df.astype({"news_title": str}, copy=False, errors='ignore')

                # Remove duplicate rows (same title)
                df.drop_duplicates(["news_title"], inplace=True)

                # Get sentiment score
                df["score"] = df.apply(
                    lambda row: tah.get_nltk_sentiment(str(row["news_article_txt"])), axis=1)
                #df["score"] = df.apply(lambda row: sia.polarity_scores(str(row["news_article_txt"]))['compound'], axis=1)

                # get sentiment word
                df["sentiment"] = df.apply(lambda row: tah.get_sentiment_word(row["score"]), axis=1)

                # add constituent name, id and old name
                df["constituent_id"] = constituent_id
                df["constituent_name"] = constituent_name
                old_constituent_name = tah.get_old_constituent_name(constituent_id)
                df["constituent"] = old_constituent_name

                # add URL
                df["url"] = None

                # add show
                df["show"] = True

                # get entity tags
                entity_tags = []
                for text in df["news_title"]:
                    tags = tah.get_spacey_tags(tagger.get_spacy_entities(str(text)))
                    entity_tags.append(tags)

                fields = ["news_date", "news_title", "news_article_txt", "news_companies",
                          "news_source", "news_publication", "news_topics", "news_country",
                          "news_region", "news_language", "news_id", "score", "sentiment",
                          "constituent_id", "constituent_name", "constituent", "url", "show"]

                '''
                # Save to MongoDB
                filter = ["NEWS_DATE_NewsDim", "NEWS_TITLE_NewsDim", "NEWS_ARTICLE_TXT_NewsDim",
                          "NEWS_SOURCE_NewsDim", "NEWS_PUBLICATION_NewsDim", "categorised_tag",
                          "score", "sentiment", "constituent_id", "constituent_name",
                          "constituent", "url", "show"]

                df_mongo = df[fields]
                df_mongo.columns = filter

                mongo_data = json.loads(df_mongo.to_json(orient="records", date_format="iso"))

                # set entity_tag field
                for i in range(0, len(mongo_data)):
                    mongo_data[i]["entity_tags"] = entity_tags[i]
                    date = mongo_data[i]["NEWS_DATE_NewsDim"]
                    #ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    mongo_data[i]["NEWS_DATE_NewsDim"] = ts

                #storage.save_to_mongodb(mongo_data, "dax_gcp", "all_news")
                '''

                # Get needed fields
                df_bigquery = df[fields]
                bigquery_data = json.loads(df_bigquery.to_json(orient="records", date_format="iso"))

                # set entity_tags, normalise field types and write each record to the local file
                for i in range(0, len(bigquery_data)):
                    bigquery_data[i]["entity_tags"] = entity_tags[i]

                    # set news_date
                    if "news_date" in bigquery_data[i] and bigquery_data[i]["news_date"]:
                        date = bigquery_data[i]["news_date"]
                        ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                        ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                        bigquery_data[i]["news_date"] = ts
                    else:
                        bigquery_data[i]["news_date"] = None

                    if bigquery_data[i]["news_title"]:
                        bigquery_data[i]["news_title"] = str(bigquery_data[i]["news_title"])

                    if bigquery_data[i]["news_article_txt"]:
                        bigquery_data[i]["news_article_txt"] = str(bigquery_data[i]["news_article_txt"])

                    if bigquery_data[i]["news_companies"]:
                        try:
                            bigquery_data[i]["news_companies"] = [
                                x.strip() for x in bigquery_data[i]["news_companies"].split(";")]
                        except Exception as e:
                            bigquery_data[i]["news_companies"] = []
                    else:
                        bigquery_data[i]["news_companies"] = []

                    if bigquery_data[i]["news_topics"]:
                        try:
                            if bigquery_data[i]["news_topics"].isdigit():
                                bigquery_data[i]["news_topics"] = []
                            elif bigquery_data[i]["news_topics"] == 'None':
                                bigquery_data[i]["news_topics"] = []
                            else:
                                bigquery_data[i]["news_topics"] = [
                                    x.strip() for x in bigquery_data[i]["news_topics"].split(";")]
                        except Exception as e:
                            bigquery_data[i]["news_topics"] = []
                    else:
                        bigquery_data[i]["news_topics"] = []

                    if bigquery_data[i]["news_country"]:
                        try:
                            bigquery_data[i]["news_country"] = [
                                x.strip() for x in bigquery_data[i]["news_country"].split(";")]
                        except Exception as e:
                            bigquery_data[i]["news_country"] = []
                    else:
                        bigquery_data[i]["news_country"] = []

                    if bigquery_data[i]["news_region"]:
                        try:
                            bigquery_data[i]["news_region"] = [
                                x.strip() for x in bigquery_data[i]["news_region"].split(";")]
                        except Exception as e:
                            bigquery_data[i]["news_region"] = []
                    else:
                        bigquery_data[i]["news_region"] = []

                    if bigquery_data[i]["news_language"]:
                        bigquery_data[i]["news_language"] = str(bigquery_data[i]["news_language"])

                    if bigquery_data[i]["news_source"]:
                        bigquery_data[i]["news_source"] = str(bigquery_data[i]["news_source"])

                    if bigquery_data[i]["news_publication"]:
                        bigquery_data[i]["news_publication"] = str(bigquery_data[i]["news_publication"])

                    if bigquery_data[i]["news_id"]:
                        try:
                            bigquery_data[i]["news_id"] = int(bigquery_data[i]["news_id"])
                        except Exception as e:
                            bigquery_data[i]["news_id"] = str(bigquery_data[i]["news_id"])

                    if bigquery_data[i]["sentiment"]:
                        bigquery_data[i]["sentiment"] = str(bigquery_data[i]["sentiment"])

                    if bigquery_data[i]["constituent_id"]:
                        bigquery_data[i]["constituent_id"] = str(bigquery_data[i]["constituent_id"])

                    if bigquery_data[i]["constituent_name"]:
                        bigquery_data[i]["constituent_name"] = str(bigquery_data[i]["constituent_name"])

                    if bigquery_data[i]["constituent"]:
                        bigquery_data[i]["constituent"] = str(bigquery_data[i]["constituent"])

                    f.write(json.dumps(bigquery_data[i], cls=MongoEncoder) + '\n')

                # storage.insert_bigquery_data("pecten_dataset", "news", bigquery_data)
                start = start + 20
                #end = start + 10
                records += 20
                print("Records saved: {}".format(records))

        if token:
            soap.close_connection(token, database)
def get_stocktwits(args):
    from utils import logging_utils as logging_utils
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah
    from utils.TaggingUtils import TaggingUtils as TU

    # get constituents
    storage = Storage(args.google_key_path)
    tagger = TU()
    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"]
    table = "PARAM_STOCKTWITS_KEYS"
    all_constituents = storage.get_sql_data(sql_connection_string=args.param_connection_string,
                                            sql_table_name=table,
                                            sql_column_list=columns)

    # Get parameters
    param_table = "PARAM_STOCKTWITS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string, common_table,
                                           common_list, common_where)

    for constituent_id, constituent_name, url_key in all_constituents:
        # Get the last id already stored for this constituent
        query = """
        SELECT max(id) as max_id FROM `{}.{}`
        WHERE constituent_id = '{}' and source = 'StockTwits'
        """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage.get_bigquery_data(query=query, iterator_flag=False)
            max_id = result[0]["max_id"]
        except Exception as e:
            max_id = None
            continue

        print("Getting StockTwits for {}:{}".format(constituent_name, url_key))

        response = requests.get(
            "https://api.stocktwits.com/api/2/streams/symbol/{}.json".format(url_key))

        to_insert = []

        if response.status_code == 200:
            for item in response.json()["messages"]:
                # Skip messages that are already stored
                if max_id:
                    if int(item['id']) < max_id:
                        continue

                doc = tah.create_tweet_skelleton()
                doc['text'] = item['body']
                doc['created_at'] = item['created_at']
                doc['date'] = item['created_at']
                doc['id'] = item['id']
                doc['id_str'] = str(item['id'])

                if "symbols" in item:
                    for symbol in item['symbols']:
                        doc['entities']['symbols'].append({'indices': [],
                                                           'text': symbol['symbol']})

                if 'entities' in item:
                    if 'sentiment' in item['entities'] and item['entities']['sentiment']:
                        if 'basic' in item['entities']['sentiment']:
                            doc['user']['description'] = item['entities']['sentiment']['basic']

                doc['source'] = 'StockTwits'
                doc['constituent_name'] = constituent_name
                doc['constituent_id'] = constituent_id
                doc['search_term'] = ''
                doc["constituent"] = tah.get_old_constituent_name(constituent_id)
                doc['relevance'] = 1
                doc["sentiment_score"] = tah.get_nltk_sentiment(str(doc["text"]))
                tagged_text = tagger.get_spacy_entities(str(doc["text"]))
                doc["entity_tags"] = tah.get_spacey_tags(tagged_text)

                to_insert.append(doc)

            try:
                storage.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                             parameters["DESTINATION_TABLE"], to_insert)
            except Exception as e:
                print(e)

            if parameters["LOGGING"]:
                doc = [{"date": time.strftime('%Y-%m-%d %H:%M:%S',
                                              datetime.now().date().timetuple()),
                        "constituent_name": constituent_name,
                        "constituent_id": constituent_id,
                        "downloaded_tweets": len(to_insert),
                        "language": 'StockTwits'}]
                logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)
        else:
            print(response.text)
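# get_stocktwits() reads the public StockTwits symbol-stream endpoint. A minimal
# standalone sketch of that request, handy for checking a URL_KEY by hand, is shown
# below; the helper name is hypothetical and the pipeline above does not use it.
def _fetch_stocktwits_messages(url_key):
    import requests

    url = "https://api.stocktwits.com/api/2/streams/symbol/{}.json".format(url_key)
    response = requests.get(url)
    if response.status_code != 200:
        raise RuntimeError("StockTwits request failed: {}".format(response.text))
    # Messages arrive newest-first; the caller decides how to filter by id.
    return response.json()["messages"]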
def get_bloomberg_news(args):
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah
    from utils.TaggingUtils import TaggingUtils as TU
    from utils import logging_utils as logging_utils

    tagger = TU()
    storage_client = Storage(google_key_path=args.google_key_path)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'Bloomberg'
    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string, common_table,
                                           common_list, common_where)

    # get constituents
    all_constituents = storage_client.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="PARAM_NEWS_BLOOMBERG_KEYS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY", "PAGES"])

    # Define random user agent object
    #ua = UserAgent()

    # Base url for search queries
    base = 'https://bloomberg.com/search?query={}&sort=time:desc&page={}'

    # Iterate over companies
    for constituent_id, constituent_name, url_key, pages in all_constituents:
        # Get date of the latest Bloomberg article stored for this constituent
        query = """
        SELECT max(news_date) as last_date FROM `{}.{}`
        WHERE constituent_id = '{}' AND news_origin = 'Bloomberg'
        """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage_client.get_bigquery_data(query=query, iterator_flag=False)
            last_date_bq = result[0]["last_date"]
        except Exception as e:
            last_date_bq = None

        print('Scraping for {} up to {}.'.format(constituent_name, str(last_date_bq)))

        # iterate through the result pages and save articles to the database
        in_target = True
        i = 1
        while in_target:
            print('Scraping for page', i)

            # format url with company name and current page number and create a bs4 object
            # from the response
            url = base.format(url_key.replace('.', '_').replace(' ', '_'), i)
            i += 1
            main_soup = BeautifulSoup(requests.get(url).content, 'html.parser')

            # check if there are any results on the page and finish the loop if none
            try:
                if 'No results for' in main_soup.find('div', {'class': 'content-no-results'}).text:
                    print('No more articles...')
                    break
            except AttributeError:
                pass

            # iterate through article elements on the page
            results = main_soup.findAll('div', {'class': 'search-result'})

            times = [x.find('time', {'class': 'published-at'}).text.strip() for x in results]
            times = [datetime.strptime(x, '%b %d, %Y') for x in times]

            print("Looping times")
            # Loop variable renamed from "time" so it does not shadow the time module.
            for published_at in times:
                if last_date_bq:
                    if published_at < datetime.strptime(last_date_bq.strftime("%Y-%m-%d"),
                                                        "%Y-%m-%d"):
                        print('Target date reached')
                        in_target = False

            to_insert = []
            print("Looping articles...")
            for art in results:
                art_type = art.find('article')['class']

                # check if the result is an article
                if 'type-article' in art_type:
                    url = art.find('h1').find('a')['href']
                    d = get_article(url, None)
                    if d is None:
                        continue

                    if last_date_bq:
                        if datetime.strptime(d["news_date"], "%Y-%m-%d %H:%M:%S") < \
                                datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"):
                            continue

                    # set extra fields:
                    # score
                    if d["news_article_txt"]:
                        d['score'] = tah.get_nltk_sentiment(str(d["news_article_txt"]))

                        # sentiment
                        d['sentiment'] = tah.get_sentiment_word(d["score"])

                        # constituent fields
                        d["constituent_id"] = constituent_id
                        d["constituent_name"] = constituent_name
                        d["constituent"] = tah.get_old_constituent_name(constituent_id)

                        # url
                        d["url"] = url

                        # show
                        d["show"] = True

                        # entity_tags
                        d["entity_tags"] = tah.get_spacey_tags(
                            tagger.get_spacy_entities(str(d["news_title"])))

                        to_insert.append(d)

            if to_insert:
                print("Inserting records to BQ")
                try:
                    storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                                        parameters["DESTINATION_TABLE"],
                                                        to_insert)
                except Exception as e:
                    print(e)

                if parameters["LOGGING"]:
                    doc = [{"date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                            "constituent_name": constituent_name,
                            "constituent_id": constituent_id,
                            "downloaded_news": len(to_insert),
                            "source": "Bloomberg"}]
                    logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                          parameters["LOGGING_TABLE"], storage_client)
            else:
                break
def get_reuters_news(args, driver):
    from utils import logging_utils as logging_utils
    from utils.TaggingUtils import TaggingUtils as TU
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tah

    tagger = TU()
    storage_client = Storage(google_key_path=args.google_key_path)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'Reuters'
    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = tah.get_parameters(args.param_connection_string, common_table,
                                           common_list, common_where)

    # get constituents
    all_constituents = storage_client.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="PARAM_NEWS_REUTERS_KEYS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"])

    for constituent_id, constituent_name, url_key in all_constituents:
        to_insert = []

        # Get date of the latest Reuters article stored for this constituent
        query = """
        SELECT max(news_date) as last_date FROM `{}.{}`
        WHERE constituent_id = '{}' AND news_origin = 'Reuters'
        """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id)

        try:
            try:
                result = storage_client.get_bigquery_data(query=query, iterator_flag=False)
                last_date_bq = result[0]["last_date"]
            except Exception as e:
                last_date_bq = None

            company_name = url_key
            print('Scraping articles for {}'.format(company_name))

            query_url = 'https://reuters.com/search/news?blob={}&sortBy=date&dateRange=all'.format(company_name)

            # Find last article with date specified
            print(query_url)
            driver.get(query_url)
            sleep(2)

            skip = False
            if last_date_bq:
                try:
                    last_date = driver.find_elements_by_xpath(
                        '//*[@id="content"]/section[2]/div/div[1]/div[4]/div/div[3]/div/div/h5')[0].get_attribute('innerHTML')
                except:
                    last_date = BeautifulSoup(driver.page_source).findAll(
                        'h5', {'class': 'search-result-timestamp'})[0].text.strip()

                last_date = ' '.join(last_date.split(' ')[:-1])
                month = [last_date.split(' ')[0][:3]]
                last_date = ' '.join(month + last_date.split(' ')[1:]).strip().replace(',', '')
                last_date = datetime.strptime(last_date, '%b %d %Y %I:%M%p')

                if last_date <= datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"):
                    print("No newer articles")
                    skip = True

            if skip:
                continue

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            search = soup.find('div', {'class': 'search-result-list news-search'}).findAll(
                'div', {'class': 'search-result-indiv'})

            print("Getting articles")
            for article in search:
                title_elem = article.find('h3', {'class': 'search-result-title'})
                title = title_elem.text.strip()
                url = "https://reuters.com" + title_elem.find('a')['href']

                date_published = article.find('h5', {'class': 'search-result-timestamp'}).text
                date_published = ' '.join(date_published.split(' ')[:-1])
                month = [date_published.split(' ')[0][:3]]
                date_published = ' '.join(month + date_published.split(' ')[1:]).strip().replace(',', '')
                date_published_dtype = datetime.strptime(date_published, '%b %d %Y %I:%M%p')
                date_published = date_published_dtype.strftime("%Y-%m-%d %H:%M:%S")

                if last_date_bq:
                    if date_published_dtype < datetime.strptime(last_date_bq.strftime("%Y-%m-%d"),
                                                                "%Y-%m-%d"):
                        print("Skipping article")
                        continue

                try:
                    soup = BeautifulSoup(
                        requests.get(url, headers={'User-Agent': random.choice(AGENTS)}).content,
                        'html.parser')
                    article_body = soup.find('div', {'class': 'ArticleBody_body_2ECha'}).text.strip()
                    source = soup.find('p', {'class': 'Attribution_content_27_rw'}).text.strip()
                except AttributeError:
                    source = None
                    continue

                d = {'news_title': title,
                     'news_date': date_published,
                     'news_source': source,
                     'news_origin': "Reuters",
                     "news_article_txt": article_body}

                # set extra fields:
                # score
                if d["news_article_txt"]:
                    d['score'] = tah.get_nltk_sentiment(str(d["news_article_txt"]))

                    # sentiment
                    d['sentiment'] = tah.get_sentiment_word(d["score"])

                    # constituent fields
                    d["constituent_id"] = constituent_id
                    d["constituent_name"] = constituent_name
                    d["constituent"] = tah.get_old_constituent_name(constituent_id)

                    # url
                    d["url"] = url

                    # show
                    d["show"] = True

                    # entity_tags
                    d["entity_tags"] = tah.get_spacey_tags(
                        tagger.get_spacy_entities(str(d["news_title"])))

                    to_insert.append(d)
                    print('Article scraped...')
                #break

            # Save data to BigQuery
            if to_insert:
                print("Inserting records to BQ")
                try:
                    storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                                        parameters["DESTINATION_TABLE"],
                                                        to_insert)
                except Exception as e:
                    print(e)

                if parameters["LOGGING"]:
                    doc = [{"date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                            "constituent_name": constituent_name,
                            "constituent_id": constituent_id,
                            "downloaded_news": len(to_insert),
                            "source": "Reuters"}]
                    logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                          parameters["LOGGING_TABLE"], storage_client)
        except Exception as e:
            print(e)
            continue
def update_from_bigquery_split(args):
    # load data
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID"]
    table = "MASTER_CONSTITUENTS"
    constituents = storage.get_sql_data(sql_connection_string=args.param_connection_string,
                                        sql_table_name=table,
                                        sql_column_list=columns)

    #constituents = [("BMWDE8170003036", )]
    start_time = time.time()

    for item in constituents:
        constituent_id = item[0]
        print("Loading {}".format(constituent_id))

        try:
            query = "SELECT text, id,favorite_count, source, retweeted,entities," \
                    "id_str,retweet_count,favorited,user,lang,created_at,place," \
                    "constituent_name,constituent_id,search_term, relevance " \
                    "FROM `pecten_dataset.tweets_unmodified` " \
                    "WHERE constituent_id = '{}' ".format(constituent_id)

            tweets = storage.get_bigquery_data(query)

            operations = []
            records = 0

            for tweet in tweets:
                row = {}
                row["text"] = tweet["text"]
                row["id"] = tweet["id"]
                row["favorite_count"] = tweet["favorite_count"]
                row["source"] = tweet["source"]
                row["retweeted"] = tweet["retweeted"]
                row["entities"] = tweet["entities"]
                row["id_str"] = tweet["id_str"]
                row["retweet_count"] = tweet["retweet_count"]
                row["favorited"] = tweet["favorited"]
                row["user"] = tweet["user"]
                row["lang"] = tweet["lang"]
                row["created_at"] = tweet["created_at"]
                row["place"] = tweet["place"]
                row["constituent_name"] = tweet["constituent_name"]
                row["constituent_id"] = tweet["constituent_id"]
                row["search_term"] = tweet["search_term"]
                row["relevance"] = tweet["relevance"]

                # Additional fields
                if isinstance(tweet["created_at"], str):
                    date = tweet["created_at"]
                    ts = time.strptime(date, "%a %b %d %H:%M:%S %z %Y")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    row['date'] = ts

                # sentiment score
                row["sentiment_score"] = get_nltk_sentiment(tweet["text"])

                # TO DO
                tagged_text = tagger.get_spacy_entities(tweet["text"])
                row["entity_tags"] = get_spacey_tags(tagged_text)

                operations.append(row)
                # print(row)

                if len(operations) == 1000:
                    result = storage.insert_bigquery_data('pecten_dataset', 'tweets', operations)
                    records += 1000
                    print("Performed bulk write of {} records".format(records))
                    if not result:
                        print("Records not inserted")
                    operations = []

            if len(operations) > 0:
                result = storage.insert_bigquery_data('pecten_dataset', 'tweets', operations)
                records += len(operations)
                if not result:
                    print("Records not inserted")
        except Exception as e:
            print(e)

    print("--- %s seconds ---" % (time.time() - start_time))
def update_from_bigquery_file(args):
    # load data
    storage = Storage(args.google_key_path)
    tagger = TU()

    query = "SELECT text, id,favorite_count, source, retweeted,entities," \
            "id_str,retweet_count,favorited,user,lang,created_at,place," \
            "constituent_name,constituent_id,search_term, relevance " \
            "FROM `pecten_dataset.tweets_unmodified`"

    tweets = storage.get_bigquery_data(query, iterator_flag=False)
    file_name = "tweets.json"

    start_time = time.time()
    records = 0
    total = 0

    print("Writing local file")
    with open(file_name, "w") as f:
        for tweet in tweets:
            row = {}
            row["text"] = tweet["text"]
            row["id"] = tweet["id"]
            row["favorite_count"] = tweet["favorite_count"]
            row["source"] = tweet["source"]
            row["retweeted"] = tweet["retweeted"]
            row["entities"] = tweet["entities"]
            row["id_str"] = tweet["id_str"]
            row["retweet_count"] = tweet["retweet_count"]
            row["favorited"] = tweet["favorited"]
            row["user"] = tweet["user"]
            row["lang"] = tweet["lang"]
            row["created_at"] = tweet["created_at"]
            row["place"] = tweet["place"]
            row["constituent_name"] = tweet["constituent_name"]
            row["constituent_id"] = tweet["constituent_id"]
            row["search_term"] = tweet["search_term"]
            row["relevance"] = tweet["relevance"]

            # Additional fields
            if isinstance(tweet["created_at"], str):
                row['date'] = convert_timestamp(tweet["created_at"])

            # sentiment score
            row["sentiment_score"] = get_nltk_sentiment(tweet["text"])

            # TO DO
            tagged_text = tagger.get_spacy_entities(tweet["text"])
            row["entity_tags"] = get_spacey_tags(tagged_text)

            f.write(json.dumps(row, cls=MongoEncoder) + '\n')
            records += 1
            total += 1

            if records == 2000:
                print("Written {} records".format(total))
                records = 0

    print("--- %s seconds ---" % (time.time() - start_time))
    print("Processed {} records".format(total))
def update_from_bigquery(args):
    # load data
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID"]
    table = "MASTER_CONSTITUENTS"
    constituents = storage.get_sql_data(sql_connection_string=args.param_connection_string,
                                        sql_table_name=table,
                                        sql_column_list=columns)

    start_time = time.time()

    for item in constituents:
        constituent_id = item[0]
        print("Loading {}".format(constituent_id))

        while True:
            try:
                # get max id already inserted in the target table
                q = "SELECT max(id) as max_id " \
                    "FROM `pecten_dataset.tweets` " \
                    "WHERE constituent_id = '{}'".format(constituent_id)

                max_id = storage.get_bigquery_data(q, iterator_flag=False)[0]["max_id"]

                if not max_id:
                    query = "SELECT text, id,favorite_count, source, retweeted,entities," \
                            "id_str,retweet_count,favorited,user,lang,created_at,place," \
                            "constituent_name,constituent_id,search_term, relevance " \
                            "FROM `pecten_dataset.tweets_unmodified` " \
                            "WHERE constituent_id = '{}' " \
                            "ORDER BY id ASC".format(constituent_id)
                else:
                    query = "SELECT text, id,favorite_count, source, retweeted,entities," \
                            "id_str,retweet_count,favorited,user,lang,created_at,place," \
                            "constituent_name,constituent_id,search_term, relevance " \
                            "FROM `pecten_dataset.tweets_unmodified` " \
                            "WHERE id > {} AND constituent_id = '{}' " \
                            "ORDER BY id ASC".format(max_id, constituent_id)

                tweets = storage.get_bigquery_data(query)

                if tweets.total_rows == 0:
                    print("Finished for {}".format(constituent_id))
                    break

                operations = []
                records = 0

                for tweet in tweets:
                    row = {}
                    row["text"] = tweet["text"]
                    row["id"] = tweet["id"]
                    row["favorite_count"] = tweet["favorite_count"]
                    row["source"] = tweet["source"]
                    row["retweeted"] = tweet["retweeted"]
                    row["entities"] = tweet["entities"]
                    row["id_str"] = tweet["id_str"]
                    row["retweet_count"] = tweet["retweet_count"]
                    row["favorited"] = tweet["favorited"]
                    row["user"] = tweet["user"]
                    row["lang"] = tweet["lang"]
                    row["created_at"] = tweet["created_at"]
                    row["place"] = tweet["place"]
                    row["constituent_name"] = tweet["constituent_name"]
                    row["constituent_id"] = tweet["constituent_id"]
                    row["search_term"] = tweet["search_term"]
                    row["relevance"] = tweet["relevance"]

                    # Additional fields
                    if isinstance(tweet["created_at"], str):
                        row['date'] = convert_timestamp(tweet["created_at"])

                    # sentiment score
                    row["sentiment_score"] = get_nltk_sentiment(tweet["text"])

                    # TO DO
                    tagged_text = tagger.get_spacy_entities(tweet["text"])
                    row["entity_tags"] = get_spacey_tags(tagged_text)

                    operations.append(row)
                    # print(row)

                    if len(operations) == 1000:
                        result = storage.insert_bigquery_data('pecten_dataset', 'tweets', operations)
                        records += 1000
                        print("Performed bulk write of {} records".format(records))
                        if not result:
                            print("Records not inserted")
                        operations = []

                if len(operations) > 0:
                    result = storage.insert_bigquery_data('pecten_dataset', 'tweets', operations)
                    records += len(operations)
                    if not result:
                        print("Records not inserted")
            except Exception as e:
                print(e)

    print("--- %s seconds ---" % (time.time() - start_time))
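# None of the collectors above define an entry point in this section. The sketch below is
# a hypothetical argparse wrapper showing how the shared arguments used throughout
# (google_key_path, param_connection_string, environment, connection_string) could be
# supplied from the command line. It is not called anywhere; wire it into the project's
# actual CLI (or an existing __main__ block elsewhere in the module) as appropriate.
def _example_cli():
    import argparse

    parser = argparse.ArgumentParser(description="Run one of the data collectors (illustrative).")
    parser.add_argument("task", choices=["tweets", "stocktwits", "orbis", "bloomberg"],
                        help="Which collector to run (subset shown for illustration).")
    parser.add_argument("--google_key_path", required=True,
                        help="Path to the Google Cloud service account key.")
    parser.add_argument("--param_connection_string", required=True,
                        help="SQL connection string for the PARAM_* tables.")
    parser.add_argument("--environment", required=True,
                        help="Environment name matched against PARAM_READ_DATE.")
    parser.add_argument("--connection_string",
                        help="MongoDB connection string (only used by main()).")
    args = parser.parse_args()

    if args.task == "tweets":
        get_tweets(args)
    elif args.task == "stocktwits":
        get_stocktwits(args)
    elif args.task == "orbis":
        get_daily_orbis_news(args)
    elif args.task == "bloomberg":
        get_bloomberg_news(args)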