def get_country_data_bq(from_date, to_date, google_key_path): print("country_data") storage_client = Storage(google_key_path=google_key_path) query2 = """ UPDATE `pecten_dataset.country_data` SET status = 'inactive' where status = 'active'; """ try: storage_client.get_bigquery_data(query2, iterator_flag=True) except Exception as e: print(e) return columns = [ "constituent", "avg_sentiment", "count", "country_name", "constituent_name", "constituent_id", "date_of_analysis", 'status' ] query = """ SELECT constituent, AVG(sentiment_score) as avg_sentiment, count(place.country_code) as count, place.country_code as country_name, constituent_name, constituent_id, CURRENT_TIMESTAMP() as date_of_analysis, 'active' as status FROM `pecten_dataset.tweets` WHERE date between TIMESTAMP('{}') and TIMESTAMP('{}') GROUP BY constituent_id, constituent, country_name, constituent_name HAVING country_name IS NOT NULL; """.format(from_date, to_date) try: result = storage_client.get_bigquery_data(query, iterator_flag=True) except Exception as e: print(e) return None to_insert = [] for item in result: to_insert.append( dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S') ) if isinstance(item[k], datetime) else (k, item[k]) for k in columns)) try: storage_client.insert_bigquery_data('pecten_dataset', 'country_data', to_insert) except Exception as e: print(e)
def get_twitter_sentiment_popularity(from_date, to_date, google_key_path): print("twitter_sentiment_popularity") columns = ["count", "constituent", "avg_sentiment_all", "constituent_name", "constituent_id", "date"] query = """ SELECT constituent, avg(sentiment_score) as avg_sentiment_all, constituent_name, constituent_id, date, count(text) as count FROM `igenie-project.pecten_dataset.tweets` WHERE date between TIMESTAMP ('{}') and TIMESTAMP ('{}') GROUP BY constituent, constituent_name, constituent_id, date """.format(from_date, to_date) storage_client = Storage(google_key_path='igenie-project-key.json') result = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] for item in result: to_insert.append(dict((k,item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k],datetime) else (k,item[k]) for k in columns)) try: storage_client.insert_bigquery_data('pecten_dataset', 'twitter_sentiment_popularity_copy', to_insert) except Exception as e: print(e)
def get_news_analytics_topic_articles(from_date, to_date, google_key_path): print("news_analytics_topic_articles") columns = [ "constituent_name", "constituent_id", "sentiment", "News_Date_NewsDim", "constituent", "News_source_NewsDim", "To_Date", "Score", "Categorised_tag", "News_Title_NewsDim", "Date", "From_Date", "NEWS_ARTICLE_TXT_NewsDim" ] query = """ SELECT constituent_name, constituent_id, sentiment, news_date as News_Date_NewsDim, Constituent, news_source as News_source_News_Dim, Score, news_topics as Categorised_tag, news_title as news_Title_NewsDim, news_date, news_article_txt as NEWS_ARTICLE_TXT_NewsDim FROM `pecten_dataset.all_news_bkp` WHERE news_date between TIMESTAMP ('{}') and TIMESTAMP ('{}') """.format(from_date, to_date) storage_client = Storage(google_key_path='igenie-project-key.json') result = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] for item in result: to_insert.append( dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S') ) if isinstance(item[k], datetime) else (k, item[k]) for k in columns)) try: storage_client.insert_bigquery_data( 'pecten_dataset', 'news_analytics_topic_articles_copy', to_insert) except Exception as e: print(e)
def get_news_all(from_date, to_date, google_key_path): print("news_all") columns = [ "NEWS_DATE_NewsDim", "score", "NEWS_PUBLICATION_NewsDim", "categorised_tag", "constituent_id", "NEWS_ARTICLE_TXT_NewsDim", "sentiment", "news_Title_NewsDim", "entity_tags", "entity_tags.FACILITY", "entity_tags.QUANTITY", "entity_tags.EVENT", "entity_tags.PERSON", "entity_tags.DATE", "entity_tags.TIME", "entity_tags.CARDINAL", "entity_tags.PRODUCT", "count" ] query = """ SELECT news_date as news_date_NewsDim, score, news_publication as NEWS_PUBLICATION_NewsDim, news_topics as categorised_tag, constituent_id, news_article_txt as NEWS_ARTICLE_TXT_NewsDim, sentiment, news_title as NEWS_TITLE_NewsDim, entity_tags, entity_tags.FACILITY, entity_tags.QUANTITY, entity_tags.EVENT, entity_tags.PERSON, entity_tags.DATE, entity_tags.TIME, entity_tags.PERSON, entity_tags.DATE, entity_tags.TIME, entity_tags.CARDINAL, entity_tags.PRODUCT, entity_tags.LOC, entity_tags.WORK_OF_ART, entity_tags.LAW, entity_tags.GPE, entity_tags.PERCENT, entity_tags.FAC, entity_tags.ORDINAL, entity_tags.ORG, entity_tags.NORP, entity_tags.LANGUAGE, entity_tags.MONEY, constituent_name, count(constituent_name) as count, url, news_language, news_id, news_country, news_companies, news_region, constituent FROM `pecten_dataset.all_news_bkp` WHERE news_date between TIMESTAMP ('{}') and TIMESTAMP ('{}') """.format(from_date, to_date) storage_client = Storage(google_key_path='/igenie-project-key.json') result = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] for item in result: to_insert.append( dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S') ) if isinstance(item[k], datetime) else (k, item[k]) for k in columns)) try: storage_client.insert_bigquery_data('pecten_dataset', 'all_news', to_insert) except Exception as e: print(e)
def get_news_analytics_topic_articles_bq(from_date, to_date, google_key_path): print("news_analytics_topic_articles") columns = [] query = """ SELECT constituent_name, constituent_id, sentiment, News_Date_NewsDim, Constituent, News_source_News_Dim, Score, news_topic as Categorised_tag, news_Title_NewsDim, Date, NEWS_ARTICLE_TXT_NewsDim FROM [igenie-project:pecten_dataset.news] where date between TIMESTAMP ('{}') and TIMESTAMP ('{}') """.format(from_date, to_date) #` #UPDATE twitter_analytics_latest_price_tweets SET From_date = TIMESTAMP() where date between TIMESTAMP() and TIMESTAMP() #UPDATE twitter_analytics_latest_price_tweets SET To_date = TIMESTAMP() where date between TIMESTAMP() and TIMESTAMP() storage_client = Storage(google_key_path=google_key_path) result = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] for item in result: to_insert.append( dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S') ) if isinstance(item[k], datetime) else (k, item[k]) for k in columns)) try: storage_client.insert_bigquery_data('pecten_dataset', '', to_insert) except Exception as e: print(e)
def get_news_analytics_daily_sentiment_bq(from_date, to_date, google_key_path): print("news_analytics_daily_sentiment") columns = [ "constituent_id", "avg_sentiment", "constituent_name", "date", "constituent" ] query = """ SELECT constituent_id, AVG(score) as avg_sentiment, constituent_name, news_date as date, constituent FROM `pecten_dataset.all_news` WHERE news_date between TIMESTAMP ('{}') and TIMESTAMP ('{}') GROUP BY constituent_id, constituent_name, date, constituent """.format(from_date, to_date) storage_client = Storage(google_key_path='igenie-project-key.json') result = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] for item in result: to_insert.append( dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S') ) if isinstance(item[k], datetime) else (k, item[k]) for k in columns)) try: storage_client.insert_bigquery_data( 'pecten_dataset', 'news_analytics_daily_sentiment_copy', to_insert) except Exception as e: print(e)
def get_twitter_top_organisations(from_date, to_date, google_key_path): print("twitter_top_organisations") columns = ["count", "constituent", "trend", "date_of_analysis", "status", "constituent_name", "constituent_id"] query = """ SELECT a.constituent, b.entity_tags as trend, COUNT(b.entity_tags) AS count, a.constituent_name, a.constituent_id, CASE WHEN date > '2017-11-01 00:00:00' THEN 'active' ELSE 'inactive' END AS status, CASE WHEN date > '2017-12-01 00:00:00' THEN '2017-12-01 00:00:00 UTC' END AS date_of_analysis FROM `pecten_dataset.tweets` a, (SELECT x.id_str, entity_tags FROM `pecten_dataset.tweets` AS x, UNNEST(entity_tags.ORG) AS entity_tags) b WHERE a.id_str= b.id_str AND a.date BETWEEN TIMESTAMP ('{}') AND TIMESTAMP ('{}') GROUP BY a.constituent, a.constituent_name, a.constituent_id, date, b.entity_tags; """.format(from_date, to_date) storage_client = Storage(google_key_path='igenie-project-key.json') result = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] for item in result: to_insert.append(dict((k,item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k],datetime) else (k,item[k]) for k in columns)) try: storage_client.insert_bigquery_data('pecten_dataset', 'twitter_top_organisations_copy', to_insert) except Exception as e: print(e)
def get_news_tags_bq(from_date, to_date, google_key_path): print("news_tags") columns = [ "Date", "constituent", "Tags", "Count", "constituent_name", "constituent_id" ] query = """ SELECT a.news_date AS Date, a.constituent, b.news_topics as Tags, COUNT(b.news_topics) AS Count, a.constituent_name, a.constituent_id FROM `pecten_dataset.all_news_bkp` a, (SELECT x.news_id, news_topics FROM `pecten_dataset.all_news_bkp` AS x, UNNEST(news_topics) AS news_topics) b WHERE a.news_id = b.news_id AND a.news_date BETWEEN TIMESTAMP ('{}') AND TIMESTAMP ('{}') GROUP BY Date, a.constituent, b.news_topics, a.constituent_name, a.constituent_id; """.format(from_date, to_date) storage_client = Storage(google_key_path='igenie-project-key.json') result = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] for item in result: to_insert.append( dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S') ) if isinstance(item[k], datetime) else (k, item[k]) for k in columns)) try: storage_client.insert_bigquery_data('pecten_dataset', 'news_tag_copy', to_insert) except Exception as e: print(e)
def get_twitter_analytics_latest_price_tweets(from_date, to_date, google_key_path): print("twitter_sentiment_popularity") columns = [ "tweet_date", "constituent_name", "from_date", "date", "text", "entity_tags", "entity_tags.MONEY", "sentiment_score", "constituent", "constituent_id", "to_date" ] query = """ SELECT date as tweet_date, constituent_name, text, entity_tags.MONEY, sentiment_score, constituent, constituent_id, CASE WHEN date > '2017-12-01 00:00:00' THEN '2017-12-09 00:00:00 UTC' END AS date, CASE WHEN date > '2017-12-01 00:00:00' THEN '2017-12-01 00:00:00 UTC' END AS from_date, CASE WHEN date > '2017-12-01 00:00:00' THEN '2017-12-11 00:00:00 UTC' END AS to_date FROM `igenie-project.pecten_dataset.tweets` WHERE text LIKE '%rating%'and text LIKE '%€%' and date between TIMESTAMP ('{}') and TIMESTAMP ('{}') """.format(from_date, to_date) storage_client = Storage(google_key_path='igenie-project-key.json') result = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] for item in result: to_insert.append( dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S') ) if isinstance(item[k], datetime) else (k, item[k]) for k in columns)) try: storage_client.insert_bigquery_data( 'pecten_dataset', 'twitter_analytics_latest_price_tweets_copy', to_insert) except Exception as e: print(e)
def get_twitter_sentiment_count_daily(from_date, to_date, google_key_path): print("twitter_sentiment_count_daily") columns = [ "constituent_name", "sentiment_score", "line", "date", "count", "constituent_id", "constituent" ] query = """ SELECT constituent_name, DATE(date) as date, count(date) as count, constituent_id, constituent, ( CASE WHEN sentiment_score > 0.25 THEN 'positive' WHEN sentiment_score < -0.25 THEN 'negative' ELSE 'neutral' END) AS line FROM `pecten_dataset.tweets` WHERE date between TIMESTAMP ('{}') and TIMESTAMP ('{}') GROUP BY constituent_name, constituent, constituent_id, date, line ORDER BY date """.format(from_date, to_date) storage_client = Storage(google_key_path='igenie-project-key.json') result = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] for item in result: to_insert.append( dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S') ) if isinstance(item[k], datetime) else (k, item[k]) for k in columns)) try: storage_client.insert_bigquery_data( 'pecten_dataset', 'twitter_sentiment_count_daily_copy', to_insert) except Exception as e: print(e)
def main(args): if __name__ != "__main__": sys.path.insert(0, args.python_path) from utils.Storage import Storage from utils import twitter_analytics_helpers as tah driver = webdriver.PhantomJS() # Get parameters param_table = "PARAM_HISTORICAL_COLLECTION" parameters_list = ["LOGGING", "DESTINATION_TABLE"] parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) # Get constituents storage = Storage(google_key_path=args.google_key_path) query = """ SELECT a.CONSTITUENT_ID, a.CONSTITUENT_NAME, b.URL_KEY FROM MASTER_CONSTITUENTS a, PARAM_FINANCIAL_URL_KEYS b WHERE a.CONSTITUENT_ID = b.CONSTITUENT_ID """ all_constituents = storage.get_sql_data_text_query( args.param_connection_string, query) # Get dates query = """ SELECT max(date) as last_date FROM `{}.{}` """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"]) try: result = storage.get_bigquery_data(query=query, iterator_flag=False) except Exception as e: print(e) return from_date = result[0]["last_date"] + timedelta(days=1) ts = from_date.strftime("%d.%m.%Y") from_date_parts = ts.split(".") if from_date_parts[0][0] == "0": from_date_parts[0] = from_date_parts[0][1:] if from_date_parts[1][0] == "0": from_date_parts[1] = from_date_parts[1][1:] from_date = ".".join(from_date_parts) to_date = datetime.now().strftime("%d.%m.%Y") to_date_parts = to_date.split(".") if to_date_parts[0][0] == "0": to_date_parts[0] = to_date_parts[0][1:] if to_date_parts[1][0] == "0": to_date_parts[1] = to_date_parts[1][1:] to_date = ".".join(to_date_parts) if from_date == to_date: return dax_url = 'http://en.boerse-frankfurt.de/index/pricehistory/DAX/{}_{}#History'.format( from_date, to_date) constituent_base_url = 'http://en.boerse-frankfurt.de/stock/pricehistory/' constituent_date_url = '-share/FSE/{}_{}#Price_History'.format( from_date, to_date) if args.all: extract_historical_data(dax_url, driver, storage, common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent='DAX') for constituent_id, constituent_name, url_key in all_constituents: print("Extracting data for {} from {} to {}".format( constituent_name, from_date, to_date)) extract_historical_data(urljoin(constituent_base_url, url_key + constituent_date_url), driver, storage, common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent=(constituent_id, constituent_name)) time.sleep(10) else: if args.constituent == 'DAX': extract_historical_data(dax_url, driver, storage, common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent='DAX') else: for constituent_id, constituent_name, url_key in all_constituents: if constituent_id == args.constituent: print("Extracting data for {} from {} to {}".format( constituent_name, from_date, to_date)) constituent_url = urljoin(constituent_base_url, url_key + constituent_date_url) extract_historical_data(constituent_url, driver, storage, common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent=(constituent_id, constituent_name)) driver.quit()
def update_from_bigquery_split(args): #load data storage = Storage(args.google_key_path) tagger = TU() columns = ["CONSTITUENT_ID"] table = "MASTER_CONSTITUENTS" constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name=table, sql_column_list=columns) #constituents = [("BMWDE8170003036", )] start_time = time.time() for item in constituents: constituent_id = item[0] print("Loading {}".format(constituent_id)) try: query = "SELECT text, id,favorite_count, source, retweeted,entities," \ "id_str,retweet_count,favorited,user,lang,created_at,place," \ "constituent_name,constituent_id,search_term, relevance " \ "FROM `pecten_dataset.tweets_unmodified` " \ "WHERE constituent_id = '{}' ".format(constituent_id) tweets = storage.get_bigquery_data(query) operations = [] records = 0 for tweet in tweets: row = {} row["text"] = tweet["text"] row["id"] = tweet["id"] row["favorite_count"] = tweet["favorite_count"] row["source"] = tweet["source"] row["retweeted"] = tweet["retweeted"] row["entities"] = tweet["entities"] row["id_str"] = tweet["id_str"] row["retweet_count"] = tweet["retweet_count"] row["favorited"] = tweet["favorited"] row["user"] = tweet["user"] row["lang"] = tweet["lang"] row["created_at"] = tweet["created_at"] row["place"] = tweet["place"] row["constituent_name"] = tweet["constituent_name"] row["constituent_id"] = tweet["constituent_id"] row["search_term"] = tweet["search_term"] row["relevance"] = tweet["relevance"] # Additional fields if isinstance(tweet["created_at"], str): date = tweet["created_at"] ts = time.strptime(date, "%a %b %d %H:%M:%S %z %Y") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) row['date'] = ts # sentiment score row["sentiment_score"] = get_nltk_sentiment(tweet["text"]) # TO DO tagged_text = tagger.get_spacy_entities(tweet["text"]) row["entity_tags"] = get_spacey_tags(tagged_text) operations.append(row) # print(row) if len(operations) == 1000: result = None result = storage.insert_bigquery_data( 'pecten_dataset', 'tweets', operations) records += 1000 print("Performed bulk write of {} records".format(records)) if not result: print("Records not inserted") operations = [] if len(operations) > 0: result = None result = storage.insert_bigquery_data('pecten_dataset', 'tweets', operations) records += 1000 if not result: print("Records not inserted") except Exception as e: print(e) print("--- %s seconds ---" % (time.time() - start_time))
def get_rss_feed(args): from utils import logging_utils as logging_utils from utils import twitter_analytics_helpers as tah from utils.Storage import Storage storage_client = Storage(google_key_path=args.google_key_path) # Get parameters param_table = "PARAM_NEWS_COLLECTION" parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"] where = lambda x: x["SOURCE"] == 'RSS' parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) # get constituents all_constituents = storage_client.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name="MASTER_CONSTITUENTS", sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "SYMBOL"]) for constituent_id, constituent_name, symbol in all_constituents: # get last news date for the constituent query = """ SELECT max(news_date) as max_date FROM `{}.{}` WHERE constituent_id = '{}' and news_origin = "Yahoo Finance RSS" """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id) try: result = storage_client.get_bigquery_data(query=query, iterator_flag=False) max_date = result[0]["max_date"] except Exception as e: max_date = None continue print("Getting RSS Feed for {}".format(constituent_name)) d = feedparser.parse( 'http://finance.yahoo.com/rss/headline?s={}'.format(symbol + ".DE")) to_insert = [] for post in d.entries: #print(post) date = datetime(post.published_parsed[0], post.published_parsed[1], post.published_parsed[2], post.published_parsed[3], post.published_parsed[4]) if max_date: max_date = str(max_date)[:19] max_date = datetime.strptime(max_date, '%Y-%m-%d %H:%M:%S') if date < max_date: continue doc = { "url": post.link, "news_date": date.strftime('%Y-%m-%d %H:%M:%S'), "news_language": post.summary_detail["language"], "news_title": post.title, "news_article_txt": post.summary, "news_origin": "Yahoo Finance RSS", "show": True } # Get sentiment score doc["score"] = tah.get_nltk_sentiment(str(doc["news_article_txt"])) # get sentiment word doc["sentiment"] = tah.get_sentiment_word(doc["score"]) # add constituent name, id and old name doc["constituent_id"] = constituent_id doc["constituent_name"] = constituent_name old_constituent_name = tah.get_old_constituent_name(constituent_id) doc["constituent"] = old_constituent_name to_insert.append(doc) try: print("Inserting into BQ") storage_client.insert_bigquery_data( common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], to_insert) except Exception as e: print(e) log = [{ "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_news": len(to_insert), "source": "Yahoo Finance RSS" }] if parameters["LOGGING"] and to_insert: logging_utils.logging(log, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage_client)
def write_to_file(args): from utils.Storage import Storage, MongoEncoder # Get tweets query = """ SELECT * FROM pecten_dataset.tweets WHERE lang = 'de' """ storage_client = Storage(args.google_key_path) dataset = 'pecten_dataset' if args.environment != 'production': dataset += "_" + args.environment it = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] total = 0 ids = set() with open("tr_tweets.json", "r") as fo: for line in fo: ids.add(json.loads(line)['id']) with open("tr_tweets.json", "a") as f: for tweet in it: if tweet.id in ids: continue start_time = time.time() data = dict((k, tweet[k].strftime('%Y-%m-%d %H:%M:%S') ) if isinstance(tweet[k], datetime) else (k, tweet[k]) for k in list(tweet._xxx_field_to_index.keys())) response = None try: url = "http://{}:5674/translate".format(args.ip_address) payload = {'q': tweet['text']} print("Making request") response = requests.get(url, params=payload, timeout=11) response = json.loads(response.text) except Exception as e: print(e) continue translated_text = None if response: try: translated_text = response['data']['translations'][0][ 'translatedText'] except Exception as e: print(e) continue if translated_text: data['text_original_language'] = data['text'] data['text'] = translated_text f.write(json.dumps(data, cls=MongoEncoder) + '\n') print("--- {} seconds ---".format(time.time() - start_time)) total += 1 if total % 1000 == 0: print("Saved {} records".format(total))
def main(args): from utils.Storage import Storage #Get tweets query = """ SELECT * FROM pecten_dataset.tweets_backup WHERE lang = 'de' and id NOT IN( SELECT distinct(id) FROM pecten_dataset.tweets WHERE text_original_language IS NOT NULL) """ storage_client = Storage(args.google_key_path) dataset = 'pecten_dataset' if args.environment != 'production': dataset += "_" + args.environment it = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] total = 0 for tweet in it: data = dict((k, tweet[k].strftime('%Y-%m-%d %H:%M:%S') ) if isinstance(tweet[k], datetime) else (k, tweet[k]) for k in list(tweet._xxx_field_to_index.keys())) response = None try: url = "http://{}:5674/translate".format(args.ip_address) payload = {'q': tweet['text']} response = requests.get(url, params=payload, timeout=11) response = json.loads(response.text) except Exception as e: print(e) continue translated_text = None if response: try: translated_text = response['data']['translations'][0][ 'translatedText'] except Exception as e: print(e) continue if translated_text: data['text_original_language'] = data['text'] data['text'] = translated_text to_insert.append(data) if len(to_insert) == 500: print("Inserting to BQ production") try: result = storage_client.insert_bigquery_data( "pecten_dataset", "tweets", to_insert) if result: print("Data inserted") else: print("Data not inserted") except Exception as e: print(e) print("Inserting to BQ test") try: result = storage_client.insert_bigquery_data( "pecten_dataset_test", "tweets", to_insert) if result: print("Data inserted") else: print("Data not inserted") except Exception as e: print(e) to_insert = []
def get_reuters_news(args, driver): from utils import logging_utils as logging_utils from utils.TaggingUtils import TaggingUtils as TU from utils.Storage import Storage from utils import twitter_analytics_helpers as tah tagger = TU() storage_client = Storage(google_key_path=args.google_key_path) # Get parameters param_table = "PARAM_NEWS_COLLECTION" parameters_list = ["LOGGING","DESTINATION_TABLE","LOGGING_TABLE"] where = lambda x: x["SOURCE"] == 'Reuters' parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) # get constituents all_constituents = storage_client.get_sql_data(sql_connection_string=args.param_connection_string, sql_table_name="PARAM_NEWS_REUTERS_KEYS", sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"]) for constituent_id, constituent_name, url_key in all_constituents: to_insert = [] # Get date of latest news article for this constituent for Reuters query = """ SELECT max(news_date) as last_date FROM `{}.{}` WHERE constituent_id = '{}' AND news_origin = 'Reuters' """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"],constituent_id) try: try: result = storage_client.get_bigquery_data(query=query, iterator_flag=False) last_date_bq = result[0]["last_date"] except Exception as e: last_date_bq = None company_name = url_key print('Scraping articles for {}'.format(company_name)) query_url = 'https://reuters.com/search/news?blob={}&sortBy=date&dateRange=all'.format(company_name) # Find last article with date specified print(query_url) driver.get(query_url) sleep(2) skip = False if last_date_bq: try: last_date = \ driver.find_elements_by_xpath('//*[@id="content"]/section[2]/div/div[1]/div[4]/div/div[3]/div/div/h5')[ 0].get_attribute('innerHTML') except: last_date = BeautifulSoup(driver.page_source).findAll('h5', {'class': 'search-result-timestamp'})[ 0].text.strip() last_date = ' '.join(last_date.split(' ')[:-1]) month = [last_date.split(' ')[0][:3]] last_date = ' '.join(month + last_date.split(' ')[1:]).strip().replace(',', '') last_date = datetime.strptime(last_date, '%b %d %Y %I:%M%p') if last_date <= datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"): print("No newer articles") skip = True if skip: continue soup = BeautifulSoup(driver.page_source, 'html.parser') search = soup.find('div', {'class': 'search-result-list news-search'}).findAll('div', {'class': 'search-result-indiv'}) print("Getting articles") for article in search: title_elem = article.find('h3', {'class': 'search-result-title'}) title = title_elem.text.strip() url = "https://reuters.com" + title_elem.find('a')['href'] date_published = article.find('h5', {'class': 'search-result-timestamp'}).text date_published = ' '.join(date_published.split(' ')[:-1]) month = [date_published.split(' ')[0][:3]] date_published = ' '.join(month + date_published.split(' ')[1:]).strip().replace(',', '') date_published_dtype = datetime.strptime(date_published, '%b %d %Y %I:%M%p') date_published = date_published_dtype.strftime("%Y-%m-%d %H:%M:%S") if last_date_bq: if date_published_dtype < datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"): print("Skipping article") continue try: soup = BeautifulSoup( requests.get(url, headers={'User-Agent': random.choice(AGENTS)}).content, 'html.parser') article_body = soup.find('div', {'class': 'ArticleBody_body_2ECha'}).text.strip() source = soup.find('p', {'class': 'Attribution_content_27_rw'}).text.strip() except AttributeError: source = None continue d = {'news_title': title, 'news_date': date_published, 'news_source': source, 'news_origin':"Reuters", "news_article_txt": article_body} # set extra fields: # score if d["news_article_txt"]: d['score'] = tah.get_nltk_sentiment(str(d["news_article_txt"])) # sentiment d['sentiment'] = tah.get_sentiment_word(d["score"]) # constituent fields d["constituent_id"] = constituent_id d["constituent_name"] = constituent_name d["constituent"] = tah.get_old_constituent_name(constituent_id) # url d["url"] = url # show d["show"] = True # entity_tags d["entity_tags"] = tah.get_spacey_tags(tagger.get_spacy_entities(str(d["news_title"]))) to_insert.append(d) print('Article scraped...') #break # Save data to json if to_insert: print("Inserting records to BQ") try: storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], to_insert) except Exception as e: print(e) if parameters["LOGGING"]: doc = [{"date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_news": len(to_insert), "source": "Reuters"}] logging_utils.logging(doc, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage_client) except Exception as e: print(e) continue
def update_from_bigquery_file(args): # load data storage = Storage(args.google_key_path) tagger = TU() query = "SELECT text, id,favorite_count, source, retweeted,entities," \ "id_str,retweet_count,favorited,user,lang,created_at,place," \ "constituent_name,constituent_id,search_term, relevance " \ "FROM `pecten_dataset.tweets_unmodified`" tweets = storage.get_bigquery_data(query, iterator_flag=False) file_name = "tweets.json" start_time = time.time() operations = [] records = 0 total = 0 print("Writing local file") with open(file_name, "w") as f: for tweet in tweets: row = {} row["text"] = tweet["text"] row["id"] = tweet["id"] row["favorite_count"] = tweet["favorite_count"] row["source"] = tweet["source"] row["retweeted"] = tweet["retweeted"] row["entities"] = tweet["entities"] row["id_str"] = tweet["id_str"] row["retweet_count"] = tweet["retweet_count"] row["favorited"] = tweet["favorited"] row["user"] = tweet["user"] row["lang"] = tweet["lang"] row["created_at"] = tweet["created_at"] row["place"] = tweet["place"] row["constituent_name"] = tweet["constituent_name"] row["constituent_id"] = tweet["constituent_id"] row["search_term"] = tweet["search_term"] row["relevance"] = tweet["relevance"] # Additional fields if isinstance(tweet["created_at"], str): row['date'] = convert_timestamp(tweet["created_at"]) # sentiment score row["sentiment_score"] = get_nltk_sentiment(tweet["text"]) # TO DO tagged_text = tagger.get_spacy_entities(tweet["text"]) row["entity_tags"] = get_spacey_tags(tagged_text) f.write(json.dumps(row, cls=MongoEncoder) + '\n') records += 1 total += 1 if records == 2000: print("Written {} records".format(total)) records = 0 print("--- %s seconds ---" % (time.time() - start_time)) print("Processed {} records".format(records))
def main(args): from utils.Storage import Storage #Get tweets query = """ SELECT * FROM pecten_dataset.tweets_backup WHERE lang = 'de' and id NOT IN( SELECT distinct(id) FROM pecten_dataset.tweets WHERE text_original_language IS NOT NULL) """ storage_client = Storage(args.google_key_path) dataset = 'pecten_dataset' if args.environment != 'production': dataset += "_" + args.environment it = storage_client.get_bigquery_data(query,iterator_flag=True) to_insert = [] total = 0 for tweet in it: data = dict((k,tweet[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(tweet[k],datetime) else (k,tweet[k]) for k in list(tweet._xxx_field_to_index.keys())) response = None try: url = "http://{}:5674/translate".format(args.ip_address) payload = {'q': tweet['text']} response = requests.get(url, params=payload,timeout=11) response = json.loads(response.text) except Exception as e: print(e) continue translated_text = None if response: try: translated_text = response['data']['translations'][0]['translatedText'] except Exception as e: print(e) continue if translated_text: data['text_original_language'] = data['text'] data['text'] = translated_text to_insert.append(data) if len(to_insert) == 500: print("Inserting to BQ production") try: result = storage_client.insert_bigquery_data("pecten_dataset", "tweets", to_insert) if result: print("Data inserted") else: print("Data not inserted") except Exception as e: print(e) print("Inserting to BQ test") try: result = storage_client.insert_bigquery_data("pecten_dataset_test", "tweets", to_insert) if result: print("Data inserted") else: print("Data not inserted") except Exception as e: print(e) to_insert = []
def get_daily_orbis_news(args): if __name__ != "__main__": from utils.Storage import Storage from utils.Storage import MongoEncoder from utils.SOAPUtils import SOAPUtils from utils import twitter_analytics_helpers as tah from utils.TaggingUtils import TaggingUtils as TU from utils.Storage import Storage from utils.Storage import MongoEncoder from utils.SOAPUtils import SOAPUtils from utils import logging_utils as logging_utils # get constituents soap = SOAPUtils() storage = Storage(args.google_key_path) tagger = TU() columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"] table = "MASTER_CONSTITUENTS" constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name=table, sql_column_list=columns) # Get parameters param_table = "PARAM_NEWS_COLLECTION" parameters_list = [ "LOGGING", "BVD_USERNAME", "BVD_PASSWORD", "DESTINATION_TABLE", "LOGGING_TABLE" ] where = lambda x: x["SOURCE"] == 'Orbis' parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) for constituent_id, constituent_name, bvdid in constituents: #get last news date for the constituent query = """ SELECT max(news_id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' and news_origin = "Orbis" """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id) try: result = storage.get_bigquery_data(query=query, iterator_flag=False) max_id = result[0]["max_id"] except Exception as e: max_id = None continue start = 0 max_count = 20 print("Constituent: {},{}".format(constituent_name, bvdid)) stop = False try: token = soap.get_token(parameters["BVD_USERNAME"], parameters["BVD_PASSWORD"], 'orbis') selection_token, selection_count = soap.find_by_bvd_id( token, bvdid, 'orbis') except Exception as e: print(e) return while not stop: try: query = "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, " \ "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title," \ "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, " \ "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, " \ "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics," \ "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country," \ "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region," \ "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language," \ "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source," \ "LINE BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication," \ "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id FROM RemoteAccess.A".format( start, max_count) get_data_result = soap.get_data(token, selection_token, selection_count, query, 'orbis', timeout=None) except Exception as e: print(str(e)) if token: soap.close_connection(token, 'orbis') token = None break result = ET.fromstring(get_data_result) csv_result = result[0][0][0].text TESTDATA = StringIO(csv_result) try: df = pd.read_csv(TESTDATA, sep=",", parse_dates=["news_date"]) except Exception as e: print(e) break if df.shape[0] == 0: print("No records in df") break initial_shape = df.shape[0] df.astype({"news_id": int}, copy=False, errors='ignore') df = df.loc[df["news_id"] > max_id] if df.shape[0] == 0: print("No new data") break if df.shape[0] != initial_shape: stop = True # Make news_title column a string df.astype({"news_title": str}, copy=False, errors='ignore') # Remove duplicate columns df.drop_duplicates(["news_title"], inplace=True) # Get sentiment score df["score"] = df.apply(lambda row: tah.get_nltk_sentiment( str(row["news_article_txt"])), axis=1) # get sentiment word df["sentiment"] = df.apply( lambda row: tah.get_sentiment_word(row["score"]), axis=1) # add constituent name, id and old name df["constituent_id"] = constituent_id df["constituent_name"] = constituent_name old_constituent_name = tah.get_old_constituent_name(constituent_id) df["constituent"] = old_constituent_name # add URL df["url"] = None # add show df["show"] = True # get entity tags entity_tags = [] for text in df["news_title"]: tags = tah.get_spacey_tags(tagger.get_spacy_entities( str(text))) entity_tags.append(tags) fields = [ "news_date", "news_title", "news_article_txt", "news_companies", "news_source", "news_publication", "news_topics", "news_country", "news_region", "news_language", "news_id", "score", "sentiment", "constituent_id", "constituent_name", "constituent", "url", "show" ] # Get needed fields df_bigquery = df[fields] bigquery_data = json.loads( df_bigquery.to_json(orient="records", date_format="iso")) # set entity_tag field i = 0 for i in range(0, len(bigquery_data)): bigquery_data[i]["entity_tags"] = entity_tags[i] # set news_date if "news_date" in bigquery_data[i] and bigquery_data[i][ "news_date"]: date = bigquery_data[i]["news_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) bigquery_data[i]["news_date"] = ts else: bigquery_data[i]["news_date"] = None if bigquery_data[i]["news_title"]: bigquery_data[i]["news_title"] = str( bigquery_data[i]["news_title"]) if bigquery_data[i]["news_article_txt"]: bigquery_data[i]["news_article_txt"] = str( bigquery_data[i]["news_article_txt"]) if bigquery_data[i]["news_companies"]: try: bigquery_data[i]["news_companies"] = [ i.strip() for i in bigquery_data[i] ["news_companies"].split(";") ] except Exception as e: bigquery_data[i]["news_companies"] = [] else: bigquery_data[i]["news_companies"] = [] if bigquery_data[i]["news_topics"]: try: if bigquery_data[i]["news_topics"].isdigit(): bigquery_data[i]["news_topics"] = [] elif bigquery_data[i]["news_topics"] == 'None': bigquery_data[i]["news_topics"] = [] else: bigquery_data[i]["news_topics"] = [ i.strip() for i in bigquery_data[i] ["news_topics"].split(";") ] except Exception as e: bigquery_data[i]["news_topics"] = [] else: bigquery_data[i]["news_topics"] = [] if bigquery_data[i]["news_country"]: try: bigquery_data[i]["news_country"] = [ i.strip() for i in bigquery_data[i] ["news_country"].split(";") ] except Exception as e: bigquery_data[i]["news_country"] = [] else: bigquery_data[i]["news_country"] = [] if bigquery_data[i]["news_region"]: try: bigquery_data[i]["news_region"] = [ i.strip() for i in bigquery_data[i]["news_region"].split(";") ] except Exception as e: bigquery_data[i]["news_region"] = [] else: bigquery_data[i]["news_region"] = [] if bigquery_data[i]["news_language"]: bigquery_data[i]["news_language"] = str( bigquery_data[i]["news_language"]) if bigquery_data[i]["news_source"]: bigquery_data[i]["news_source"] = str( bigquery_data[i]["news_source"]) if bigquery_data[i]["news_publication"]: bigquery_data[i]["news_publication"] = str( bigquery_data[i]["news_publication"]) if bigquery_data[i]["news_id"]: try: bigquery_data[i]["news_id"] = int( bigquery_data[i]["news_id"]) except Exception as e: bigquery_data[i]["news_id"] = str( bigquery_data[i]["news_id"]) if bigquery_data[i]["sentiment"]: bigquery_data[i]["sentiment"] = str( bigquery_data[i]["sentiment"]) if bigquery_data[i]["constituent_id"]: bigquery_data[i]["constituent_id"] = str( bigquery_data[i]["constituent_id"]) if bigquery_data[i]["constituent_name"]: bigquery_data[i]["constituent_name"] = str( bigquery_data[i]["constituent_name"]) if bigquery_data[i]["constituent"]: bigquery_data[i]["constituent"] = str( bigquery_data[i]["constituent"]) try: storage.insert_bigquery_data(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], bigquery_data) except Exception as e: print(e) start = start + 20 print("Records saved: {}".format(df.shape[0])) log = [{ "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_news": df.shape[0], "source": "Orbis" }] if parameters["LOGGING"] and bigquery_data: logging_utils.logging(log, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage) if token: soap.close_connection(token, 'orbis')
def get_bloomberg_news(args): from utils.Storage import Storage from utils import twitter_analytics_helpers as tah from utils.TaggingUtils import TaggingUtils as TU from utils import logging_utils as logging_utils tagger = TU() storage_client = Storage(google_key_path=args.google_key_path) #Get parameters param_table = "PARAM_NEWS_COLLECTION" parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"] where = lambda x: x["SOURCE"] == 'Bloomberg' parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) #get constituents all_constituents = storage_client.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name="PARAM_NEWS_BLOOMBERG_KEYS", sql_column_list=[ "CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY", "PAGES" ]) # Define random user agent object #ua = UserAgent() # Base url for search queries base = 'https://bloomberg.com/search?query={}&sort=time:desc&page={}' # Iterate over companies for constituent_id, constituent_name, url_key, pages in all_constituents: # Get date of latest news article for this constituent for Bloomberg query = """ SELECT max(news_date) as last_date FROM `{}.{}` WHERE constituent_id = '{}' AND news_origin = 'Bloomberg' """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id) try: result = storage_client.get_bigquery_data(query=query, iterator_flag=False) last_date_bq = result[0]["last_date"] except Exception as e: last_date_bq = None print('Scraping for {} up to {}.'.format(constituent_name, str(last_date_bq))) # iterate through specified page range and save articles to database in_target = True i = 1 while in_target: print('Scraping for page', i) # format url with company name and current page number and create bs4 object from the response url = base.format(url_key.replace('.', '_').replace(' ', '_'), i) i += 1 main_soup = BeautifulSoup(requests.get(url).content, 'html.parser') # check if there are any results on the page and finish the loop if none try: if 'No results for' in main_soup.find( 'div', { 'class': 'content-no-results' }).text: print('No more articles...') break except AttributeError: pass # iterate through article elelements on the page results = main_soup.findAll('div', {'class': 'search-result'}) times = [ x.find('time', { 'class': 'published-at' }).text.strip() for x in results ] times = [datetime.strptime(x, '%b %d, %Y') for x in times] print("Looping times") for time in times: if last_date_bq: if time < datetime.strptime( last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"): print('Target date reached') in_target = False to_insert = [] print("Looping articles...") for art in results: art_type = art.find('article')['class'] # check if the result is an article if 'type-article' in art_type: url = art.find('h1').find('a')['href'] d = get_article(url, None) if d is None: continue if last_date_bq: if datetime.strptime(d["news_date"], "%Y-%m-%d %H:%M:%S") < \ datetime.strptime(last_date_bq.strftime("%Y-%m-%d"), "%Y-%m-%d"): continue #set extra fields: #score if d["news_article_txt"]: d['score'] = tah.get_nltk_sentiment( str(d["news_article_txt"])) # sentiment d['sentiment'] = tah.get_sentiment_word(d["score"]) # constituent fields d["constituent_id"] = constituent_id d["constituent_name"] = constituent_name d["constituent"] = tah.get_old_constituent_name( constituent_id) #url d["url"] = url #show d["show"] = True # entity_tags d["entity_tags"] = tah.get_spacey_tags( tagger.get_spacy_entities(str(d["news_title"]))) to_insert.append(d) if to_insert: print("Inserting records to BQ") try: storage_client.insert_bigquery_data( common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], to_insert) except Exception as e: print(e) if parameters["LOGGING"]: doc = [{ "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_news": len(to_insert), "source": "Bloomberg" }] logging_utils.logging(doc, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage_client) else: break
def get_daily_zephyr_ma_deals(args): from utils import logging_utils as logging_utils from utils import twitter_analytics_helpers as tah from utils.Storage import Storage from utils.Storage import MongoEncoder from utils.SOAPUtils import SOAPUtils zephyr_query = """ DEFINE P1 AS [Parameters.Currency=SESSION;], P2 AS [Parameters.RepeatingDimension=NrOfTargets], P3 AS [Parameters.RepeatingDimension=NrOfBidders], P5 AS [Parameters.RepeatingDimension=NrOfVendors]; SELECT LINE RECORD_ID as record_id, LINE DEAL_OVERVIEW.DEAL_HEADLINE AS deal_headline, LINE DEAL_OVERVIEW.TGNAME USING P2 AS target, LINE DEAL_OVERVIEW.BDNAME USING P3 AS acquiror, LINE DEAL_OVERVIEW.VDNAME USING P5 AS vendor, LINE DEAL_OVERVIEW.DEALTYPE AS deal_type, LINE DEAL_OVERVIEW.DEAL_STATUS AS deal_status, LINE DEAL_STRUCTURE_AND_DATES.COMPLETION_DATE AS completion_date, LINE DEAL_STRUCTURE_AND_DATES.RUMOUR_DATE AS rumour_date, LINE DEAL_STRUCTURE_AND_DATES.ANNOUNCE_DATE AS announced_date, LINE DEAL_STRUCTURE_AND_DATES.EXPECTED_COMPLETION_DATE AS expected_completion_date, LINE DEAL_STRUCTURE_AND_DATES.ASSUMED_COMPLETION_DATE AS assumed_completion_date, LINE DEAL_STRUCTURE_AND_DATES.POSTPONED_DATE AS postponed_date, LINE DEAL_STRUCTURE_AND_DATES.WITHDRAWN_DATE AS withdrawn_date, LINE DEAL_OVERVIEW.ALLDLVALUE USING P1 AS deal_value FROM RemoteAccess.U ORDERBY 1 DESCENDING """ # Get parameters param_table = "PARAM_NEWS_COLLECTION" parameters_list = ['BVD_USERNAME', 'BVD_PASSWORD', "LOGGING","DESTINATION_TABLE","LOGGING_TABLE"] where = lambda x: x["SOURCE"] == 'Zephyr' parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list, where) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) # Get constituents soap = SOAPUtils() storage = Storage(args.google_key_path) columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "STRATEGY"] table = "PARAM_NEWS_ZEPHYR_STRATEGIES" constituents = storage.get_sql_data(sql_connection_string=args.param_connection_string, sql_table_name=table, sql_column_list=columns) fields = ["record_id", "deal_headline", "target", "acquiror", "vendor", "deal_type", "deal_status", "completion_date", "deal_value", "rumour_date", "announced_date", "expected_completion_date", "assumed_completion_date", "postponed_date", "withdrawn_date"] for constituent_id, constituent_name, strategy in constituents: # get last deal query = """ SELECT max(record_id) as max_id FROM `{}.{}` WHERE constituent_id = '{}'; """.format(common_parameters["BQ_DATASET"],parameters["DESTINATION_TABLE"],constituent_id) try: result = storage.get_bigquery_data(query=query, iterator_flag=False) max_id = result[0]["max_id"] print("Getting M&A deals for {}".format(constituent_name)) token = soap.get_token(parameters['BVD_USERNAME'], parameters['BVD_PASSWORD'], 'zephyr') selection_token, selection_count = soap.find_with_strategy(token, strategy, "zephyr") get_data_result = soap.get_data(token, selection_token, selection_count, zephyr_query, 'zephyr', timeout=None, number_of_records=100) result = ET.fromstring(get_data_result) csv_result = result[0][0][0].text TESTDATA = StringIO(csv_result) df = pd.read_csv(TESTDATA, sep=",", parse_dates=["completion_date", "rumour_date", "announced_date", "expected_completion_date", "assumed_completion_date", "postponed_date", "withdrawn_date"]) df.astype({"record_id": int}, copy=False, errors='ignore') print("Retrieved {} items".format(df.shape[0])) df = df.loc[df["record_id"] > max_id] print("New records {} items".format(df.shape[0])) if df.shape[0] == 0: if token: soap.close_connection(token, 'zephyr') continue df = df[fields] # add constituent name, id and old name df["constituent_id"] = constituent_id df["constituent_name"] = constituent_name old_constituent_name = tah.get_old_constituent_name(constituent_id) df["constituent"] = old_constituent_name data = json.loads(df.to_json(orient="records", date_format="iso")) print(len(data)) for item in data: if "completion_date" in item and item["completion_date"]: date = item["completion_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["completion_date"] = ts if "rumour_date" in item and item["rumour_date"]: date = item["rumour_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["rumour_date"] = ts if "announced_date" in item and item["announced_date"]: date = item["announced_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["announced_date"] = ts if "expected_completion_dat" in item and item["expected_completion_dat"]: date = item["expected_completion_dat"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["expected_completion_date"] = ts if "assumed_completion_date" in item and item["assumed_completion_date"]: date = item["assumed_completion_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["assumed_completion_datee"] = ts if "postponed_date" in item and item["postponed_date"]: date = item["postponed_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["postponed_date"] = ts if "withdrawn_date" in item and item["withdrawn_date"]: date = item["withdrawn_date"] ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ") ts = time.strftime('%Y-%m-%d %H:%M:%S', ts) item["withdrawn_date"] = ts if data: print("Inserting records to BQ") try: #open("ma_deals.json", 'w').write("\n".join(json.dumps(e, cls=MongoEncoder) for e in data)) storage.insert_bigquery_data(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], data) except Exception as e: print(e) soap.close_connection(token, 'zephyr') if parameters["LOGGING"]: doc = [{"date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_deals": len(data), "source": "Zephyr"}] logging_utils.logging(doc,common_parameters["BQ_DATASET"],parameters["LOGGING_TABLE"],storage) except Exception as e: print(e) if token: soap.close_connection(token, 'zephyr') continue if token: soap.close_connection(token, 'zephyr')
def write_to_file(args): from utils.Storage import Storage,MongoEncoder # Get tweets query = """ SELECT * FROM pecten_dataset.tweets WHERE lang = 'de' """ storage_client = Storage(args.google_key_path) dataset = 'pecten_dataset' if args.environment != 'production': dataset += "_" + args.environment it = storage_client.get_bigquery_data(query, iterator_flag=True) to_insert = [] total = 0 ids = set() with open("tr_tweets.json", "r") as fo: for line in fo: ids.add(json.loads(line)['id']) with open("tr_tweets.json", "a") as f: for tweet in it: if tweet.id in ids: continue start_time = time.time() data = dict((k, tweet[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(tweet[k], datetime) else (k, tweet[k]) for k in list(tweet._xxx_field_to_index.keys())) response = None try: url = "http://{}:5674/translate".format(args.ip_address) payload = {'q': tweet['text']} print("Making request") response = requests.get(url, params=payload,timeout=11) response = json.loads(response.text) except Exception as e: print(e) continue translated_text = None if response: try: translated_text = response['data']['translations'][0]['translatedText'] except Exception as e: print(e) continue if translated_text: data['text_original_language'] = data['text'] data['text'] = translated_text f.write(json.dumps(data, cls=MongoEncoder) + '\n') print("--- {} seconds ---".format(time.time() - start_time)) total += 1 if total % 1000 == 0: print("Saved {} records".format(total))
def get_tweets(args): from utils import logging_utils as logging_utils from utils.TwitterDownloader import TwitterDownloader from utils.Storage import Storage from utils import twitter_analytics_helpers as tap from utils.TaggingUtils import TaggingUtils as TU param_table = "PARAM_TWITTER_COLLECTION" parameters_list = [ "LANGUAGE", "TWEETS_PER_QUERY", "MAX_TWEETS", "CONNECTION_STRING", "DATABASE_NAME", "COLLECTION_NAME", "LOGGING", "EMAIL_USERNAME", "EMAIL_PASSWORD", "TWITTER_API_KEY", "TWITTER_API_SECRET", "BUCKET_NAME", "DESTINATION_TABLE", "LOGGING_TABLE" ] parameters = tap.get_parameters(args.param_connection_string, param_table, parameters_list) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tap.get_parameters(args.param_connection_string, common_table, common_list, common_where) languages = parameters["LANGUAGE"].split(",") storage = Storage(google_key_path=args.google_key_path, mongo_connection_string=parameters["CONNECTION_STRING"]) tagger = TU() downloader = TwitterDownloader(parameters["TWITTER_API_KEY"], parameters["TWITTER_API_SECRET"]) downloader.load_api() all_constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name="MASTER_CONSTITUENTS", sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"]) fields_to_keep = [ "text", "favorite_count", "source", "retweeted", "entities", "id_str", "retweet_count", "favorited", "user", "lang", "created_at", "place", "constituent_name", "constituent_id", "search_term", "id", "sentiment_score", "entity_tags", "relevance", "constituent" ] for language in languages: for constituent_id, constituent_name in all_constituents: search_query = get_search_string(constituent_id, args.param_connection_string, "PARAM_TWITTER_KEYWORDS", "PARAM_TWITTER_EXCLUSIONS") #Get max id of all tweets to extract tweets with id highe than that q = "SELECT MAX(id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' " \ "AND lang = '{}';".format(common_parameters["BQ_DATASET"],parameters["DESTINATION_TABLE"], constituent_id,language) try: sinceId = int( storage.get_bigquery_data( q, iterator_flag=False)[0]["max_id"]) except Exception as e: print(e) sinceId = None max_id = -1 tweetCount = 0 print("Downloading max {0} tweets for {1} in {2} on {3}".format( parameters["MAX_TWEETS"], constituent_name, language, str(datetime.now()))) while tweetCount < parameters["MAX_TWEETS"]: tweets_unmodified = [] tweets_modified = [] tweets_mongo = [] try: tweets, tmp_tweet_count, max_id = downloader.download( constituent_name, search_query, language, parameters["TWEETS_PER_QUERY"], sinceId, max_id) except Exception as e: continue if not tweets: break else: print("Downloaded {} tweets".format(tmp_tweet_count)) tweetCount += tmp_tweet_count #Add fields for both unmodified and modified tweets for tweet in tweets: tweet._json['source'] = "Twitter" tweet._json['constituent_name'] = constituent_name tweet._json['constituent_id'] = constituent_id tweet._json['search_term'] = search_query tweet._json["constituent"] = tap.get_old_constituent_name( constituent_id) #Removing bad fields clean_tweet = tap.scrub(tweet._json) # Separate the tweets that go to one topic or the other #unmodified t_unmodified = deepcopy(clean_tweet) t_unmodified["date"] = tap.convert_timestamp( t_unmodified["created_at"]) tweets_unmodified.append(t_unmodified) #Add additional fields clean_tweet["sentiment_score"] = tap.get_nltk_sentiment( str(clean_tweet["text"])) tagged_text = tagger.get_spacy_entities( str(clean_tweet["text"])) clean_tweet["entity_tags"] = tap.get_spacey_tags( tagged_text) clean_tweet["relevance"] = -1 #mongo t_mongo = deepcopy(clean_tweet) t_mongo['date'] = datetime.strptime( t_mongo['created_at'], '%a %b %d %H:%M:%S %z %Y') tweets_mongo.append(t_mongo) #modified tagged_tweet = dict((k, clean_tweet[k]) for k in fields_to_keep if k in clean_tweet) tagged_tweet['date'] = tap.convert_timestamp( clean_tweet["created_at"]) tweets_modified.append(tagged_tweet) #send to PubSub topic #ps_utils.publish("igenie-project", "tweets-unmodified", tweets_unmodified) #ps_utils.publish("igenie-project", "tweets", tweets_modified) try: storage.insert_bigquery_data( common_parameters["BQ_DATASET"], '{}_unmodified'.format( parameters["DESTINATION_TABLE"]), tweets_unmodified) except Exception as e: print(e) try: storage.insert_bigquery_data( common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], tweets_modified) except Exception as e: print(e) try: storage.save_to_mongodb(tweets_mongo, "dax_gcp", parameters["DESTINATION_TABLE"]) pass except Exception as e: print(e) time.sleep(1) print("Saved {} tweets for in {}".format(tweetCount, constituent_name, language)) if parameters["LOGGING"]: doc = [{ "date": time.strftime('%Y-%m-%d %H:%M:%S', datetime.now().date().timetuple()), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_tweets": tweetCount, "language": language }] logging_utils.logging(doc, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage) return "Downloaded tweets"
def get_stocktwits(args): from utils import logging_utils as logging_utils from utils.Storage import Storage from utils import twitter_analytics_helpers as tah from utils.TaggingUtils import TaggingUtils as TU # get constituents storage = Storage(args.google_key_path) tagger = TU() columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"] table = "PARAM_STOCKTWITS_KEYS" all_constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name=table, sql_column_list=columns) # Get parameters param_table = "PARAM_STOCKTWITS_COLLECTION" parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"] parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list) # Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) for constituent_id, constituent_name, url_key in all_constituents: #Get the last id query = """ SELECT max(id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' and source = 'StockTwits' """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id) try: result = storage.get_bigquery_data(query=query, iterator_flag=False) max_id = result[0]["max_id"] except Exception as e: max_id = None continue print("Gettins stocktwits for {}:{}".format(constituent_name, url_key)) response = requests.get( "https://api.stocktwits.com/api/2/streams/symbol/{}.json".format( url_key)) to_insert = [] if response.status_code == 200: for item in response.json()["messages"]: if max_id: if int(item['id']) < max_id: continue doc = tah.create_tweet_skelleton() doc['text'] = item['body'] doc['created_at'] = item['created_at'] doc['date'] = item['created_at'] doc['id'] = item['id'] doc['id_str'] = str(item['id']) if "symbols" in item: for symbol in item['symbols']: doc['entities']['symbols'].append({ 'indices': [], 'text': symbol['symbol'] }) if 'entities' in item: if 'sentiment' in item['entities'] and item['entities'][ 'sentiment']: if 'basic' in item['entities']['sentiment']: doc['user']['description'] = item['entities'][ 'sentiment']['basic'] doc['source'] = 'StockTwits' doc['constituent_name'] = constituent_name doc['constituent_id'] = constituent_id doc['search_term'] = '' doc["constituent"] = tah.get_old_constituent_name( constituent_id) doc['relevance'] = 1 doc["sentiment_score"] = tah.get_nltk_sentiment( str(doc["text"])) tagged_text = tagger.get_spacy_entities(str(doc["text"])) doc["entity_tags"] = tah.get_spacey_tags(tagged_text) to_insert.append(doc) try: storage.insert_bigquery_data(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], to_insert) except Exception as e: print(e) if parameters["LOGGING"]: doc = [{ "date": time.strftime('%Y-%m-%d %H:%M:%S', datetime.now().date().timetuple()), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_tweets": len(to_insert), "language": 'StockTwits' }] logging_utils.logging(doc, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage) else: print(response.text)
def extract_ticker_data(args): if __name__ != "__main__": from utils import logging_utils as logging_utils from utils import twitter_analytics_helpers as tah from utils.Storage import Storage #ua = UserAgent() base_url = "http://charts.finanzen.net/ChartData.ashx?request=" #Get parameters param_table = "PARAM_TICKER_COLLECTION" parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"] parameters = tah.get_parameters(args.param_connection_string, param_table, parameters_list) #Get dataset name common_table = "PARAM_READ_DATE" common_list = ["BQ_DATASET"] common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[ "STATUS"] == 'active') common_parameters = tah.get_parameters(args.param_connection_string, common_table, common_list, common_where) # Get constituents storage = Storage(args.google_key_path) columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "URL_KEY"] table = "PARAM_TICKER_KEYS" all_constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name=table, sql_column_list=columns) i = 0 for constituent_id, constituent_name, url_key in all_constituents: if i == 10 or i == 29: time.sleep(900) # get last deal query = """ SELECT max(datetime) as max_date FROM `{}.{}` WHERE constituent_id = '{}'; """.format(common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], constituent_id) try: result = storage.get_bigquery_data(query=query, iterator_flag=False) except Exception as e: print(e) return max_date = result[0]["max_date"] print("Getting ticker data for {}".format(constituent_name)) url = base_url + url_key + '6000' list_of_ticks = [] try: response = requests.get(url) if response.status_code == requests.codes.ok: data = response.text.splitlines() for item in data[1:-2]: fields = item.split(';') if max_date and max_date.date() == datetime.strptime( fields[0], "%Y-%m-%d-%H-%M-%S-%f").date(): print("Last date in ticker_data is same date as today") break document = { 'constituent_name': constituent_name, 'constituent_id': constituent_id, 'date': fields[0][0:10], 'time': fields[0][11:19].replace('-', ':'), 'datetime': datetime.strptime(fields[0], "%Y-%m-%d-%H-%M-%S-%f").strftime( '%Y-%m-%d %H:%M:%S'), 'open': float(fields[1].replace(',', '')), 'high': float(fields[2].replace(',', '')), 'low': float(fields[3].replace(',', '')), 'close': float(fields[4].replace(',', '')), 'volume': float(fields[5].replace(',', '')) } list_of_ticks.append(document) if list_of_ticks: try: print("Inserting into BQ") storage.insert_bigquery_data( common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], list_of_ticks) # pprint(list_of_ticks) except Exception as e: print(e) if parameters["LOGGING"]: doc = [{ "date": datetime.now().date().strftime( '%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_ticks": len(list_of_ticks) }] logging_utils.logging(doc, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage) i += 1 else: print("Error making request, code {}. Retrying...".format( response.status_code)) time.sleep(20) response = requests.get(url) if response.status_code == requests.codes.ok: data = response.text.splitlines() for item in data[1:-2]: fields = item.split(';') document = { 'constituent_name': constituent_name, 'constituent_id': constituent_id, 'date': fields[0][0:10], 'time': fields[0][11:19].replace('-', ':'), 'datetime': datetime.strptime(fields[0], "%Y-%m-%d-%H-%M-%S-%f").strftime( '%Y-%m-%d %H:%M:%S'), 'open': float(fields[1].replace(',', '')), 'high': float(fields[2].replace(',', '')), 'low': float(fields[3].replace(',', '')), 'close': float(fields[4].replace(',', '')), 'volume': float(fields[5].replace(',', '')) } list_of_ticks.append(document) if list_of_ticks: try: print("Inserting into BQ") storage.insert_bigquery_data( common_parameters["BQ_DATASET"], parameters["DESTINATION_TABLE"], list_of_ticks) except Exception as e: print(e) if parameters["LOGGING"]: doc = [{ "date": datetime.now().date().strftime( '%Y-%m-%d %H:%M:%S'), "constituent_name": constituent_name, "constituent_id": constituent_id, "downloaded_ticks": len(list_of_ticks) }] logging_utils.logging( doc, common_parameters["BQ_DATASET"], parameters["LOGGING_TABLE"], storage) i += 1 else: return "Error making request, code {}. Skipping.".format( response.status_code) except Exception as ex: return str(ex) finally: time.sleep(5)
def update_from_bigquery(args): #load data storage = Storage(args.google_key_path) tagger = TU() columns = ["CONSTITUENT_ID"] table = "MASTER_CONSTITUENTS" constituents = storage.get_sql_data( sql_connection_string=args.param_connection_string, sql_table_name=table, sql_column_list=columns) start_time = time.time() for item in constituents: constituent_id = item[0] print("Loading {}".format(constituent_id)) while True: try: # get max id inserted from target table q = "SELECT max(id) as max_id " \ "FROM `pecten_dataset.tweets` " \ "WHERE constituent_id = '{}'".format(constituent_id) max_id = storage.get_bigquery_data( q, iterator_flag=False)[0]["max_id"] if not max_id: query = "SELECT text, id,favorite_count, source, retweeted,entities," \ "id_str,retweet_count,favorited,user,lang,created_at,place," \ "constituent_name,constituent_id,search_term, relevance " \ "FROM `pecten_dataset.tweets_unmodified` " \ "WHERE constituent_id = '{}' " \ "ORDER BY id ASC".format(constituent_id) else: query = "SELECT text, id,favorite_count, source, retweeted,entities," \ "id_str,retweet_count,favorited,user,lang,created_at,place," \ "constituent_name,constituent_id,search_term, relevance " \ "FROM `pecten_dataset.tweets_unmodified` " \ "WHERE id > {} AND constituent_id = '{}' " \ "ORDER BY id ASC".format(max_id, constituent_id) tweets = storage.get_bigquery_data(query) if tweet.total_rows == 0: print("Finished for {}".format(constituent_id)) break operations = [] records = 0 for tweet in tweets: row = {} row["text"] = tweet["text"] row["id"] = tweet["id"] row["favorite_count"] = tweet["favorite_count"] row["source"] = tweet["source"] row["retweeted"] = tweet["retweeted"] row["entities"] = tweet["entities"] row["id_str"] = tweet["id_str"] row["retweet_count"] = tweet["retweet_count"] row["favorited"] = tweet["favorited"] row["user"] = tweet["user"] row["lang"] = tweet["lang"] row["created_at"] = tweet["created_at"] row["place"] = tweet["place"] row["constituent_name"] = tweet["constituent_name"] row["constituent_id"] = tweet["constituent_id"] row["search_term"] = tweet["search_term"] row["relevance"] = tweet["relevance"] # Additional fields if isinstance(tweet["created_at"], str): row['date'] = convert_timestamp(tweet["created_at"]) # sentiment score row["sentiment_score"] = get_nltk_sentiment(tweet["text"]) # TO DO tagged_text = tagger.get_spacy_entities(tweet["text"]) row["entity_tags"] = get_spacey_tags(tagged_text) operations.append(row) # print(row) if len(operations) == 1000: result = storage.insert_bigquery_data( 'pecten_dataset', 'tweets', operations) records += 1000 print("Performed bulk write of {} records".format( records)) if not result: print("Records not inserted") operations = [] if len(operations) > 0: result = storage.insert_bigquery_data( 'pecten_dataset', 'tweets', operations) records += 1000 if not result: print("Records not inserted") except Exception as e: print(e) print("--- %s seconds ---" % (time.time() - start_time))