def get_twitter_sentiment_popularity(from_date, to_date, google_key_path):
    """Aggregate tweet count and average sentiment per constituent/date and
    copy the rows into the twitter_sentiment_popularity_copy table.

    :param from_date: inclusive lower bound, a string accepted by TIMESTAMP().
    :param to_date: inclusive upper bound, a string accepted by TIMESTAMP().
    :param google_key_path: path to the GCP service-account key file.
    """
    print("twitter_sentiment_popularity")

    # Row keys expected from the query below; must match the SELECT aliases.
    columns = ["count", "constituent", "avg_sentiment_all", "constituent_name", "constituent_id", "date"]

    query = """
    SELECT constituent, avg(sentiment_score) as avg_sentiment_all, constituent_name, constituent_id, date, count(text) as count 
    FROM `igenie-project.pecten_dataset.tweets`
    WHERE date between TIMESTAMP ('{}') and TIMESTAMP ('{}') 
    GROUP BY constituent, constituent_name, constituent_id, date
    """.format(from_date, to_date)

    # Bug fix: use the key path supplied by the caller instead of a
    # hard-coded local file name that ignored the parameter.
    storage_client = Storage(google_key_path=google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime) else
                              (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'twitter_sentiment_popularity_copy', to_insert)
    except Exception as e:
        print(e)
Beispiel #2
0
def get_news_all(from_date, to_date, google_key_path):
    """Copy news rows for the given date window from all_news_bkp into the
    all_news BigQuery table.

    :param from_date: inclusive lower bound, a string accepted by TIMESTAMP().
    :param to_date: inclusive upper bound, a string accepted by TIMESTAMP().
    :param google_key_path: path to the GCP service-account key file.
    """
    print("news_all")

    # NOTE(review): some keys here do not match the aliases the query below
    # produces (e.g. "NEWS_DATE_NewsDim" vs alias "news_date_NewsDim", and
    # count(...) without a GROUP BY) -- verify the row keys returned by
    # get_bigquery_data before relying on this list.
    columns = [
        "NEWS_DATE_NewsDim", "score", "NEWS_PUBLICATION_NewsDim",
        "categorised_tag", "constituent_id", "NEWS_ARTICLE_TXT_NewsDim",
        "sentiment", "news_Title_NewsDim", "entity_tags",
        "entity_tags.FACILITY", "entity_tags.QUANTITY", "entity_tags.EVENT",
        "entity_tags.PERSON", "entity_tags.DATE", "entity_tags.TIME",
        "entity_tags.CARDINAL", "entity_tags.PRODUCT", "count"
    ]

    query = """
    SELECT news_date as news_date_NewsDim, score, news_publication as NEWS_PUBLICATION_NewsDim, news_topics as categorised_tag, constituent_id, news_article_txt as NEWS_ARTICLE_TXT_NewsDim, sentiment, news_title as NEWS_TITLE_NewsDim, entity_tags, entity_tags.FACILITY, entity_tags.QUANTITY, entity_tags.EVENT, entity_tags.PERSON, entity_tags.DATE, entity_tags.TIME, entity_tags.PERSON, entity_tags.DATE, entity_tags.TIME, entity_tags.CARDINAL, entity_tags.PRODUCT, entity_tags.LOC, entity_tags.WORK_OF_ART, entity_tags.LAW, entity_tags.GPE, entity_tags.PERCENT, entity_tags.FAC, entity_tags.ORDINAL, entity_tags.ORG, entity_tags.NORP, entity_tags.LANGUAGE, entity_tags.MONEY, constituent_name, count(constituent_name) as count, url, news_language, news_id, news_country, news_companies, news_region, constituent
    FROM `pecten_dataset.all_news_bkp` 
    WHERE news_date between TIMESTAMP ('{}') and TIMESTAMP ('{}')
    
    """.format(from_date, to_date)

    # Bug fix: use the key path supplied by the caller instead of a
    # hard-coded '/igenie-project-key.json'.
    storage_client = Storage(google_key_path=google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(
            dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')
                  ) if isinstance(item[k], datetime) else (k, item[k])
                 for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'all_news',
                                            to_insert)
    except Exception as e:
        print(e)
Beispiel #3
0
def other_tables(args):
    """Export each listed Mongo collection to a newline-delimited JSON file
    and upload it to Cloud Storage, removing the local copy on success.

    :param args: namespace providing param_connection_string,
        google_key_path, mongo_collections (comma separated) and bucket.
    """
    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = ["CONNECTION_STRING"]

    parameters = get_parameters(args.param_connection_string, param_table, parameters_list)

    storage = Storage(google_key_path=args.google_key_path, mongo_connection_string=parameters["CONNECTION_STRING"])

    client = MongoClient(parameters["CONNECTION_STRING"])
    db = client["dax_gcp"]

    for collection_name in args.mongo_collections.split(","):
        collection = db[collection_name]
        # Exclude Mongo's _id so the dump is plain JSON-serializable.
        cursor = collection.find({}, {"_id": 0})
        data = list(cursor)
        file_name = "{}.json".format(collection_name)

        # Bug fix: use a context manager so the handle is closed (the
        # original leaked the file object returned by open()).
        with open(file_name, 'w') as f:
            f.write("\n".join(json.dumps(e, cls=MongoEncoder) for e in data))

        if os.path.isfile(file_name):
            if storage.upload_to_cloud_storage(args.google_key_path, args.bucket, file_name, file_name):
                print("File uploaded to Cloud Storage")
                os.remove(file_name)
            else:
                print("File not uploaded to Cloud storage.")
        else:
            print("File does not exist in the local filesystem.")
def get_news_analytics_topic_articles_bq(from_date, to_date, google_key_path):
    """Fetch topic-article rows for the date window from the legacy news
    table and insert them into news_analytics_topic_articles.

    :param from_date: inclusive lower bound, a string accepted by TIMESTAMP().
    :param to_date: inclusive upper bound, a string accepted by TIMESTAMP().
    :param google_key_path: path to the GCP service-account key file.
    """
    print("news_analytics_topic_articles")

    # Bug fix: the original left this list empty, so every fetched row was
    # serialized to an empty dict. Keys mirror the SELECT list below.
    columns = [
        "constituent_name", "constituent_id", "sentiment", "News_Date_NewsDim",
        "Constituent", "News_source_News_Dim", "Score", "Categorised_tag",
        "news_Title_NewsDim", "Date", "NEWS_ARTICLE_TXT_NewsDim"
    ]

    query = """
    SELECT constituent_name, constituent_id, sentiment, News_Date_NewsDim, Constituent,
    News_source_News_Dim, Score, news_topic as Categorised_tag, news_Title_NewsDim, Date, NEWS_ARTICLE_TXT_NewsDim
    FROM [igenie-project:pecten_dataset.news]
    where date between TIMESTAMP ('{}') and TIMESTAMP ('{}')
    """.format(from_date, to_date)

    storage_client = Storage(google_key_path=google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(
            dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')
                  ) if isinstance(item[k], datetime) else (k, item[k])
                 for k in columns))

    try:
        # Bug fix: the original passed '' as the destination table name.
        storage_client.insert_bigquery_data('pecten_dataset',
                                            'news_analytics_topic_articles',
                                            to_insert)
    except Exception as e:
        print(e)
def get_search_string(constituent_id, connection_string, table_keywords,
                      table_exclusions):
    """Build a Twitter search query for one constituent: quoted keywords
    joined with OR, followed by minus-prefixed exclusion terms.

    :param constituent_id: id used to filter both parameter tables.
    :param connection_string: SQL connection string.
    :param table_keywords: table providing the KEYWORD column.
    :param table_exclusions: table providing the EXCLUSIONS column.
    :return: the combined search string.
    """
    if __name__ != "__main__":
        from utils.TwitterDownloader import TwitterDownloader
        from utils.Storage import Storage
        from utils import twitter_analytics_helpers as tap
        from utils.TaggingUtils import TaggingUtils as TU

    storage = Storage()

    # Only active rows belonging to this constituent.
    where = lambda x: and_((x["ACTIVE_STATE"] == 1),
                           (x["CONSTITUENT_ID"] == constituent_id))

    keywords = storage.get_sql_data(sql_connection_string=connection_string,
                                    sql_table_name=table_keywords,
                                    sql_column_list=["KEYWORD"],
                                    sql_where=where)

    keywords_string = " OR ".join('"' + key[0] + '"' for key in keywords)

    exclusions = storage.get_sql_data(sql_connection_string=connection_string,
                                      sql_table_name=table_exclusions,
                                      sql_column_list=["EXCLUSIONS"],
                                      sql_where=where)

    exclusions_string = " ".join("-" + key[0] for key in exclusions)

    # Bug fix: the original concatenated the two parts with no separator,
    # gluing the last quoted keyword directly onto the first "-exclusion".
    return " ".join(part for part in (keywords_string, exclusions_string) if part)
def get_news_analytics_topic_articles(from_date, to_date, google_key_path):
    """Fetch topic-article rows for the date window from all_news_bkp and
    insert them into news_analytics_topic_articles_copy.

    :param from_date: inclusive lower bound, a string accepted by TIMESTAMP().
    :param to_date: inclusive upper bound, a string accepted by TIMESTAMP().
    :param google_key_path: path to the GCP service-account key file.
    """
    print("news_analytics_topic_articles")

    # Bug fix: the original list contained keys the query never produces
    # ("To_Date", "From_Date", "Date", "News_source_NewsDim"), so item[k]
    # raised KeyError. These keys mirror the SELECT aliases below.
    columns = [
        "constituent_name", "constituent_id", "sentiment", "News_Date_NewsDim",
        "Constituent", "News_source_News_Dim", "Score", "Categorised_tag",
        "news_Title_NewsDim", "news_date", "NEWS_ARTICLE_TXT_NewsDim"
    ]

    query = """
    SELECT constituent_name, constituent_id, sentiment, news_date as News_Date_NewsDim, Constituent, news_source as News_source_News_Dim, Score, news_topics as Categorised_tag, news_title as news_Title_NewsDim, news_date, news_article_txt as NEWS_ARTICLE_TXT_NewsDim 
    FROM `pecten_dataset.all_news_bkp`
    WHERE news_date between TIMESTAMP ('{}') and TIMESTAMP ('{}')
    """.format(from_date, to_date)

    # Bug fix: use the key path supplied by the caller instead of a
    # hard-coded local file name.
    storage_client = Storage(google_key_path=google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(
            dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')
                  ) if isinstance(item[k], datetime) else (k, item[k])
                 for k in columns))

    try:
        storage_client.insert_bigquery_data(
            'pecten_dataset', 'news_analytics_topic_articles_copy', to_insert)
    except Exception as e:
        print(e)
Beispiel #7
0
def get_news_analytics_daily_sentiment_bq(from_date, to_date, google_key_path):
    """Aggregate average news sentiment per constituent/date and insert the
    rows into news_analytics_daily_sentiment_copy.

    :param from_date: inclusive lower bound, a string accepted by TIMESTAMP().
    :param to_date: inclusive upper bound, a string accepted by TIMESTAMP().
    :param google_key_path: path to the GCP service-account key file.
    """
    print("news_analytics_daily_sentiment")

    # Row keys expected from the query below; must match the SELECT aliases.
    columns = [
        "constituent_id", "avg_sentiment", "constituent_name", "date",
        "constituent"
    ]

    query = """
    SELECT constituent_id, AVG(score) as avg_sentiment, constituent_name, news_date as date, constituent
    FROM `pecten_dataset.all_news`
    WHERE news_date between TIMESTAMP ('{}') and TIMESTAMP ('{}')
    GROUP BY constituent_id, constituent_name, date, constituent
    """.format(from_date, to_date)

    # Bug fix: use the key path supplied by the caller instead of a
    # hard-coded local file name.
    storage_client = Storage(google_key_path=google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(
            dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')
                  ) if isinstance(item[k], datetime) else (k, item[k])
                 for k in columns))

    try:
        storage_client.insert_bigquery_data(
            'pecten_dataset', 'news_analytics_daily_sentiment_copy', to_insert)
    except Exception as e:
        print(e)
Beispiel #8
0
def main(arguments):
    """Fetch the EMAIL_USERNAME column from the configured SQL table and
    print the result."""
    store = Storage()
    rows = store.get_sql_data(
        sql_connection_string=arguments.connection_string,
        sql_table_name=arguments.table_name,
        sql_column_list=["EMAIL_USERNAME"])

    print(rows)
def update_result(table_store, args):
    """Flip every 'active' row in the given BigQuery table to 'inactive'.

    :param table_store: fully qualified BigQuery table name.
    :param args: namespace providing service_key_path.
    """
    # Bug fix: the original constructed the Storage client twice in a row.
    storage = Storage(google_key_path=args.service_key_path)
    query = 'UPDATE `' + table_store + '` SET Status = "inactive" WHERE Status = "active"'

    try:
        storage.get_bigquery_data(query)
    except Exception as e:
        print(e)
Beispiel #10
0
def get_parameters(connection_string, table, column_list):
    """Read the first row of `column_list` from `table` and return it as a
    dict mapping column name -> value.

    :param connection_string: SQL connection string.
    :param table: parameter table name.
    :param column_list: column names to fetch, in order.
    :return: dict of column name to the value from the first row.
    """
    storage = Storage()

    data = storage.get_sql_data(connection_string, table, column_list)[0]
    # Pair each requested column with its positional value from the row.
    return dict(zip(column_list, data))
Beispiel #11
0
def get_parameters(connection_string, table, column_list):
    """Read the first row of `column_list` from `table` and return it as a
    dict mapping column name -> value.

    :param connection_string: SQL connection string.
    :param table: parameter table name.
    :param column_list: column names to fetch, in order.
    :return: dict of column name to the value from the first row.
    """
    storage = Storage()

    data = storage.get_sql_data(connection_string, table, column_list)[0]
    # Pair each requested column with its positional value from the row.
    return dict(zip(column_list, data))
Beispiel #12
0
def get_twitter_top_organisations(from_date, to_date, google_key_path):
    """Count ORG entity tags per constituent in the date window and insert
    the trend rows into twitter_top_organisations_copy.

    :param from_date: inclusive lower bound, a string accepted by TIMESTAMP().
    :param to_date: inclusive upper bound, a string accepted by TIMESTAMP().
    :param google_key_path: path to the GCP service-account key file.
    """
    print("twitter_top_organisations")

    # Row keys expected from the query below; must match the SELECT aliases.
    columns = ["count", "constituent", "trend", "date_of_analysis", "status", "constituent_name", "constituent_id"]

    query = """
    SELECT
  a.constituent,
  b.entity_tags as trend,
  COUNT(b.entity_tags) AS count,
  a.constituent_name,
  a.constituent_id,
  CASE
        WHEN date > '2017-11-01 00:00:00' THEN 'active'
        ELSE 'inactive'
    END AS status,
  CASE
        WHEN date > '2017-12-01 00:00:00' THEN '2017-12-01 00:00:00 UTC'
    END AS date_of_analysis  
FROM
  `pecten_dataset.tweets` a,
(SELECT
  x.id_str,
  entity_tags
FROM
  `pecten_dataset.tweets` AS x,
  UNNEST(entity_tags.ORG) AS entity_tags) b
WHERE a.id_str= b.id_str AND 
a.date BETWEEN TIMESTAMP ('{}')
  AND TIMESTAMP ('{}')
GROUP BY
  a.constituent,
  a.constituent_name,
  a.constituent_id,
  date,
  b.entity_tags;
   
    """.format(from_date, to_date)

    # Bug fix: use the key path supplied by the caller instead of a
    # hard-coded local file name.
    storage_client = Storage(google_key_path=google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k], datetime) else
                              (k, item[k]) for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'twitter_top_organisations_copy', to_insert)
    except Exception as e:
        print(e)
Beispiel #13
0
def get_news_tags_bq(from_date, to_date, google_key_path):
    """Count news topic tags per constituent/date in the window and insert
    the rows into news_tag_copy.

    :param from_date: inclusive lower bound, a string accepted by TIMESTAMP().
    :param to_date: inclusive upper bound, a string accepted by TIMESTAMP().
    :param google_key_path: path to the GCP service-account key file.
    """
    print("news_tags")

    # Row keys expected from the query below; must match the SELECT aliases.
    columns = [
        "Date", "constituent", "Tags", "Count", "constituent_name",
        "constituent_id"
    ]

    query = """
    SELECT
  a.news_date AS Date,
  a.constituent,
  b.news_topics as Tags,
  COUNT(b.news_topics) AS Count,
  a.constituent_name,
  a.constituent_id
FROM
  `pecten_dataset.all_news_bkp` a,
(SELECT
  x.news_id,
  news_topics
FROM
  `pecten_dataset.all_news_bkp` AS x,
  UNNEST(news_topics) AS news_topics) b
WHERE a.news_id = b.news_id AND 
a.news_date BETWEEN TIMESTAMP ('{}')
  AND TIMESTAMP ('{}')
GROUP BY
  Date,
  a.constituent,
  b.news_topics,
  a.constituent_name,
  a.constituent_id;
    """.format(from_date, to_date)

    # Bug fix: use the key path supplied by the caller instead of a
    # hard-coded local file name.
    storage_client = Storage(google_key_path=google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(
            dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')
                  ) if isinstance(item[k], datetime) else (k, item[k])
                 for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'news_tag_copy',
                                            to_insert)
    except Exception as e:
        print(e)
def get_news_analytics_daily_sentiment_bq(args):
    """Aggregate average news sentiment per constituent/date over the
    FROM_DATE..TO_DATE window configured in PARAM_READ_DATE, and insert the
    rows into the news_analytics_daily_sentiment table of the configured
    dataset.

    :param args: namespace providing param_connection_string, environment
        and google_key_path.
    """
    # Read the dataset name and date window for the active environment.
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET", "FROM_DATE", "TO_DATE"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string, common_table, common_list, common_where)

    print("news_analytics_daily_sentiment")

    # Row keys expected from the query below; must match the SELECT aliases.
    columns = ["constituent_id", "avg_sentiment", "constituent_name", "date", "constituent", "To_date", "From_date"]

    # {0}=dataset, {1}=from date, {2}=to date; the window bounds are also
    # materialized into each row as From_date/To_date.
    query = """
    SELECT constituent_id, AVG(score) as avg_sentiment, constituent_name, news_date as date, constituent, TIMESTAMP('{1}') as From_date, TIMESTAMP('{2}') as To_date
    FROM `{0}.all_news`
    WHERE news_date between TIMESTAMP ('{1}') and TIMESTAMP ('{2}')
    GROUP BY constituent_id, constituent_name, date, constituent
    """.format(common_parameters["BQ_DATASET"],common_parameters["FROM_DATE"].strftime("%Y-%m-%d"),
               common_parameters["TO_DATE"].strftime("%Y-%m-%d"))

    storage_client = Storage.Storage(args.google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(dict((k,item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k],datetime) else
                   (k,item[k]) for k in columns))

    try:
        print("Inserting into BQ")
        storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                            'news_analytics_daily_sentiment', to_insert)
    except Exception as e:
        print(e)
def get_twitter_analytics_latest_price_tweets(from_date, to_date,
                                              google_key_path):
    """Select price/rating-related tweets (text contains 'rating' and a euro
    sign) in the date window and insert them into
    twitter_analytics_latest_price_tweets_copy.

    :param from_date: inclusive lower bound, a string accepted by TIMESTAMP().
    :param to_date: inclusive upper bound, a string accepted by TIMESTAMP().
    :param google_key_path: path to the GCP service-account key file.
    """
    # Bug fix: the original printed the name of a different function.
    print("twitter_analytics_latest_price_tweets")

    # Bug fix: the original listed "entity_tags" and "entity_tags.MONEY",
    # but selecting entity_tags.MONEY yields a row key of "MONEY", so
    # item[k] raised KeyError (same convention as get_influencer_price_tweets).
    columns = [
        "tweet_date", "constituent_name", "from_date", "date", "text",
        "MONEY", "sentiment_score", "constituent", "constituent_id", "to_date"
    ]

    query = """
    SELECT date as tweet_date, constituent_name, text, entity_tags.MONEY, sentiment_score, constituent, constituent_id, 
CASE
        WHEN date > '2017-12-01 00:00:00' THEN '2017-12-09 00:00:00 UTC'
    END AS date,
CASE
        WHEN date > '2017-12-01 00:00:00' THEN '2017-12-01 00:00:00 UTC'
    END AS from_date,
CASE
        WHEN date > '2017-12-01 00:00:00' THEN '2017-12-11 00:00:00 UTC'
    END AS to_date    
FROM `igenie-project.pecten_dataset.tweets` 
WHERE text LIKE '%rating%'and text LIKE '%€%' and date between TIMESTAMP ('{}') and TIMESTAMP ('{}')

    """.format(from_date, to_date)

    # Bug fix: use the key path supplied by the caller instead of a
    # hard-coded local file name.
    storage_client = Storage(google_key_path=google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(
            dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')
                  ) if isinstance(item[k], datetime) else (k, item[k])
                 for k in columns))

    try:
        storage_client.insert_bigquery_data(
            'pecten_dataset', 'twitter_analytics_latest_price_tweets_copy',
            to_insert)
    except Exception as e:
        print(e)
Beispiel #16
0
def file_to_mongodb(args):
    """Load newline-delimited JSON news records from args.file1, rename the
    legacy flat fields onto the *_NewsDim schema, and bulk-insert them into
    the dax_gcp.all_news Mongo collection in batches of 1000.
    """
    batch = []
    storage = Storage(
        mongo_connection_string=
        "mongodb://*****:*****@35.189.89.82:27017/dax_gcp")
    with open(args.file1, "r") as source:
        saved = 0
        for raw_line in source:
            record = json.loads(raw_line)
            # Rename to the target schema; any record missing one of the
            # expected fields is reported and skipped.
            try:
                record["NEWS_TITLE_NewsDim"] = record.pop("news_title")
                record["NEWS_DATE_NewsDim"] = datetime.strptime(
                    record.pop("news_date"), '%Y-%m-%d %H:%M:%S')
                record["NEWS_ARTICLE_TXT_NewsDim"] = record.pop("news_article_txt")
                record["NEWS_SOURCE_NewsDim"] = record.pop("news_source")
                record["NEWS_PUBLICATION_NewsDim"] = record.pop("news_publication")
                record["categorised_tag"] = record.pop("news_topics")
                if record["constituent"] == "BMW":
                    record["constituent"] = "bmw"
            except Exception as e:
                print(e)
                continue

            batch.append(record)
            saved += 1

            # Flush a full batch to Mongo.
            if len(batch) == 1000:
                print("Saving {} records".format(saved))
                storage.save_to_mongodb(batch, "dax_gcp", "all_news")
                batch = []

        # Flush whatever is left over after the last full batch.
        if batch:
            storage.save_to_mongodb(batch, "dax_gcp", "all_news")
def get_twitter_sentiment_count_daily(from_date, to_date, google_key_path):
    """Count tweets per constituent/day bucketed into positive/negative/
    neutral sentiment lines and insert the rows into
    twitter_sentiment_count_daily_copy.

    :param from_date: inclusive lower bound, a string accepted by TIMESTAMP().
    :param to_date: inclusive upper bound, a string accepted by TIMESTAMP().
    :param google_key_path: path to the GCP service-account key file.
    """
    print("twitter_sentiment_count_daily")

    # Bug fix: the original list included "sentiment_score", which the
    # query below never returns (it is folded into the CASE alias "line"),
    # so item[k] raised KeyError.
    columns = [
        "constituent_name", "line", "date", "count",
        "constituent_id", "constituent"
    ]

    query = """
    SELECT constituent_name, DATE(date) as date, count(date) as count, constituent_id, constituent, 
    (
    CASE 
        WHEN sentiment_score > 0.25 THEN 'positive'
        WHEN sentiment_score < -0.25 THEN 'negative'
        ELSE 'neutral'
    END) AS line
    FROM `pecten_dataset.tweets` 
    WHERE date between TIMESTAMP ('{}') and TIMESTAMP ('{}') 
    GROUP BY constituent_name, constituent, constituent_id, date, line
    ORDER BY date
    """.format(from_date, to_date)

    # Bug fix: use the key path supplied by the caller instead of a
    # hard-coded local file name.
    storage_client = Storage(google_key_path=google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(
            dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')
                  ) if isinstance(item[k], datetime) else (k, item[k])
                 for k in columns))

    try:
        storage_client.insert_bigquery_data(
            'pecten_dataset', 'twitter_sentiment_count_daily_copy', to_insert)
    except Exception as e:
        print(e)
def get_twitter_sentiment_count_daily(args):
    """Count tweets per constituent/day bucketed into positive/negative/
    neutral sentiment lines over the configured FROM_DATE..TO_DATE window,
    and insert the rows into the twitter_sentiment_count_daily table of the
    configured dataset.

    :param args: namespace providing param_connection_string, environment
        and google_key_path.
    """
    # Read the dataset name and date window for the active environment.
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET", "FROM_DATE", "TO_DATE"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string,
                                       common_table, common_list, common_where)

    print("twitter_sentiment_count_daily")

    # Row keys expected from the query below; must match the SELECT aliases.
    columns = [
        "constituent_name", "line", "date", "count", "constituent_id",
        "constituent", "From_date", "To_date"
    ]

    # {0}=dataset, {1}=from date, {2}=to date; the window bounds are also
    # materialized into each row as From_date/To_date.
    query = """
    SELECT constituent_name, DATE(date) as date, count(date) as count, constituent_id, constituent, TIMESTAMP('{1}') as From_date, TIMESTAMP('{2}') as To_date,
    (
    CASE 
        WHEN sentiment_score > 0.25 THEN 'positive'
        WHEN sentiment_score < -0.25 THEN 'negative'
        ELSE 'neutral'
    END) AS line
    FROM `{0}.tweets` 
    WHERE date between TIMESTAMP ('{1}') and TIMESTAMP ('{2}') 
    GROUP BY constituent_name, constituent, constituent_id, date, line
    ORDER BY date
    """.format(common_parameters["BQ_DATASET"],
               common_parameters["FROM_DATE"].strftime("%Y-%m-%d %H:%M:%S"),
               common_parameters["TO_DATE"].strftime("%Y-%m-%d %H:%M:%S"))

    storage_client = Storage.Storage(args.google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify date values (DATE(date) yields date objects) for insertion.
    # NOTE(review): this checks `date`, not `datetime` -- assumes
    # `from datetime import date` at module level; confirm the import.
    for item in result:
        to_insert.append(
            dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')
                  ) if isinstance(item[k], date) else (k, item[k])
                 for k in columns))

    try:
        print("Inserting into BQ")
        storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                            'twitter_sentiment_count_daily',
                                            to_insert)
    except Exception as e:
        print(e)
Beispiel #19
0
def main(arguments):
    """Scrape analyst opinions from Business Insider and Wall Street sources,
    combine them, and store both the combined and the full BI tables in Mongo.

    :param arguments: namespace providing param_connection_string, database,
        collection_selected and collection_all.
    """
    # Maps the constituent's display name to the identifier used by the
    # Business Insider scraper.
    all_constituents_dict_bi = {
        'Allianz': 'Allianz',
        'Adidas': 'Adidas',
        'BASF': 'BASF',
        'Bayer': 'Bayer',
        'Beiersdorf': 'Beiersdorf',
        'BMW': 'BMW',
        'Commerzbank': 'Commerzbank',
        'Continental': 'Continental',
        'Daimler': 'Daimler',
        'Deutsche Bank': 'Deutsche_Bank',
        'Deutsche Börse': 'Deutsche_Boerse',
        'Deutsche Post': 'Deutsche_Post',
        'Deutsche Telekom': 'Deutsche_Telekom',
        'EON': 'EON',
        'Fresenius Medical Care': 'Fresenius_Medical_Care',
        'Fresenius': 'Fresenius',
        'HeidelbergCement': 'HeidelbergCement',
        'Infineon': 'Infineon',
        'Linde': 'Linde_6',
        'Lufthansa': 'Lufthansa',
        'Merck': 'Merck',
        'Münchener Rückversicherungs-Gesellschaft': 'Munich_Re',
        'ProSiebenSat1 Media': 'ProSiebenSat1_Media',
        'RWE': 'RWE',
        'Siemens': 'Siemens',
        'Thyssenkrupp': 'thyssenkrupp',
        'Volkswagen (VW) vz': 'Volkswagen_vz',
        'Vonovia': 'Vonovia'
    }

    bi_analyst_table = analyst_businessinsider(all_constituents_dict_bi)
    ws_analyst_table = analyst_wallstreet()
    combined_analyst_table = combined_analyst(ws_analyst_table,
                                              bi_analyst_table)

    import json
    combined_analyst_json = json.loads(
        combined_analyst_table.to_json(orient='records'))
    bi_analyst_json = json.loads(bi_analyst_table.to_json(orient='records'))

    from utils.Storage import Storage
    storage = Storage()

    # Bug fix: the original referenced an undefined name `args` here; the
    # parameter is called `arguments`.
    # Save the results: combined -> analyst_opinions, BI -> analyst_opinions_all.
    storage.save_to_mongodb(connection_string=arguments.param_connection_string,
                            database=arguments.database,
                            collection=arguments.collection_selected,
                            data=combined_analyst_json)
    storage.save_to_mongodb(connection_string=arguments.param_connection_string,
                            database=arguments.database,
                            collection=arguments.collection_all,
                            data=bi_analyst_json)
Beispiel #20
0
def get_bigquery(args):
    """Read Daimler's historical closing prices from BigQuery and stash the
    resulting DataFrame on args.query1 (columns: ds, y)."""
    from utils.Storage import Storage
    storage_client = Storage(args.google_key_path)
    q1 = """SELECT
  date AS ds,
  closing_price AS y
FROM
`pecten_dataset.stockprice_historical_collection`
where 
constituent_name = 'DAIMLER AG'
order by 
date"""

    frame = pd.read_gbq(q1, project_id='igenie-project', dialect='standard')
    args.query1 = frame
def get_country_data_bq(from_date, to_date, google_key_path):
    """Refresh the country_data table: first mark all existing rows
    'inactive', then insert per-country tweet counts and average sentiment
    for the date window as new 'active' rows.

    :param from_date: inclusive lower bound, a string accepted by TIMESTAMP().
    :param to_date: inclusive upper bound, a string accepted by TIMESTAMP().
    :param google_key_path: path to the GCP service-account key file.
    """
    print("country_data")
    storage_client = Storage(google_key_path=google_key_path)

    # Deactivate the previous analysis before inserting the new one; abort
    # the whole refresh if this fails so we never end up with two active sets.
    query2 = """
    UPDATE `pecten_dataset.country_data` SET status = 'inactive' where status = 'active';
    """

    try:
        storage_client.get_bigquery_data(query2, iterator_flag=True)
    except Exception as e:
        print(e)
        return

    # Row keys expected from the query below; must match the SELECT aliases.
    columns = [
        "constituent", "avg_sentiment", "count", "country_name",
        "constituent_name", "constituent_id", "date_of_analysis", 'status'
    ]

    query = """
    SELECT constituent, AVG(sentiment_score) as avg_sentiment, count(place.country_code) as count,
    place.country_code as country_name, constituent_name, constituent_id, CURRENT_TIMESTAMP() as date_of_analysis,
    'active' as status
    FROM `pecten_dataset.tweets`
    WHERE date between TIMESTAMP('{}') and TIMESTAMP('{}')
    GROUP BY constituent_id, constituent, country_name, constituent_name
    HAVING country_name IS NOT NULL;
    """.format(from_date, to_date)

    try:
        result = storage_client.get_bigquery_data(query, iterator_flag=True)
    except Exception as e:
        print(e)
        return None

    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(
            dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')
                  ) if isinstance(item[k], datetime) else (k, item[k])
                 for k in columns))

    try:
        storage_client.insert_bigquery_data('pecten_dataset', 'country_data',
                                            to_insert)
    except Exception as e:
        print(e)
def get_twitter_sentiment_popularity(args):
    """Compute the top-10 constituent/date buckets by tweet count, with
    average sentiment, over the configured FROM_DATE..TO_DATE window, and
    insert them into the twitter_sentiment_popularity table.

    :param args: namespace providing param_connection_string, environment
        and google_key_path.
    """
    # Read the dataset name and date window for the active environment.
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET", "FROM_DATE", "TO_DATE"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string,
                                       common_table, common_list, common_where)

    print("twitter_sentiment_popularity")

    # Row keys expected from the query below; must match the SELECT aliases.
    columns = [
        "count", "constituent", "avg_sentiment_all", "constituent_name",
        "constituent_id", "date", "from_date", "to_date"
    ]

    # Bug fix: the upper BETWEEN bound was a bare string ('{2}') instead of
    # TIMESTAMP('{2}'), unlike every sibling query in this module.
    query = """
    SELECT constituent_name, constituent_id, constituent, count(text) as count, avg(sentiment_score) as avg_sentiment_all, TIMESTAMP('{1}') as from_date, TIMESTAMP('{2}') as to_date,
CASE
        WHEN date > '2017-11-17 00:00:00' THEN '2017-12-13 00:00:00 UTC'
    END AS date
FROM `{0}.tweets` 
WHERE date between TIMESTAMP('{1}') and TIMESTAMP('{2}')
group by date, constituent_name, constituent_id, constituent order by count DESC LIMIT 10
    """.format(common_parameters["BQ_DATASET"],
               common_parameters["FROM_DATE"].strftime("%Y-%m-%d %H:%M:%S"),
               common_parameters["TO_DATE"].strftime("%Y-%m-%d %H:%M:%S"))

    storage_client = Storage.Storage(args.google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify date/datetime values for insertion (date also matches
    # datetime instances, since datetime subclasses date).
    for item in result:
        to_insert.append(
            dict((k, item[k].strftime('%Y-%m-%d %H:%M:%S')
                  ) if isinstance(item[k], date) else (k, item[k])
                 for k in columns))

    try:
        print("Inserting into BQ")
        storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                            'twitter_sentiment_popularity',
                                            to_insert)
    except Exception as e:
        print(e)
Beispiel #23
0
def get_influencer_price_tweets(args):
    """Select price/rating tweets (text contains 'rating' and a euro sign)
    from accounts with more than 1000 followers, over the configured
    FROM_DATE..TO_DATE window, and insert them into the
    influencer_price_tweets table of the configured dataset.

    :param args: namespace providing param_connection_string, environment
        and google_key_path.
    """
    # Read the dataset name and date window for the active environment.
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET", "FROM_DATE", "TO_DATE"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string, common_table, common_list, common_where)

    print("influencer_price_tweets")

    # Row keys expected from the query below ("MONEY" comes from selecting
    # entity_tags.MONEY); must match the SELECT aliases.
    columns = ["tweet_date", "constituent_name", "from_date", "date", "text", "MONEY", "sentiment_score", "constituent", "constituent_id", "to_date"]

    # {0}=dataset, {1}=from date, {2}=to date; the window bounds are also
    # materialized into each row as from_date/to_date.
    query = """
    SELECT date as tweet_date, constituent_name, text, entity_tags.MONEY, sentiment_score, constituent, constituent_id, TIMESTAMP('{1}') as from_date, TIMESTAMP('{2}') as to_date,  
CASE
        WHEN date > '2017-12-01 00:00:00' THEN '2017-12-13 00:00:00 UTC'
    END AS date
FROM `{0}.tweets`
WHERE text LIKE '%rating%'and text LIKE '%€%' and date between TIMESTAMP ('{1}') and TIMESTAMP ('{2}') and user.followers_count > 1000

    """.format(common_parameters["BQ_DATASET"],common_parameters["FROM_DATE"].strftime("%Y-%m-%d %H:%M:%S"),
               common_parameters["TO_DATE"].strftime("%Y-%m-%d %H:%M:%S"))

    storage_client = Storage.Storage(args.google_key_path)

    result = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = []

    # Stringify datetime values so the rows can be re-inserted into BigQuery.
    for item in result:
        to_insert.append(dict((k,item[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(item[k],datetime) else
                   (k,item[k]) for k in columns))

    try:
        print("Inserting into BQ")
        storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                            'influencer_price_tweets', to_insert)
    except Exception as e:
        print(e)
Beispiel #24
0
def get_twitter_analytics_top_orgs_bq(args, from_date, to_date):
    """For each constituent, count ORG entity tags in its tweets between
    from_date and to_date via a Mongo aggregation, filter out noisy tags,
    and insert the surviving rows into twitter_analytics_top_orgs.

    :param args: namespace providing param_connection_string and
        google_key_path.
    :param from_date: inclusive lower bound for the tweet date.
    :param to_date: inclusive upper bound for the tweet date.
    """
    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = ["CONNECTION_STRING"]

    parameters = get_parameters(args.param_connection_string, param_table,
                                parameters_list)

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])

    # All constituents to analyse, from the SQL master table.
    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"])

    client = MongoClient(parameters["CONNECTION_STRING"])
    db = client["dax_gcp"]
    collection = db["tweets"]
    twitter_analytics_top_orgs = db["twitter_analytics_top_orgs"]

    for constituent_id, constituent_name in all_constituents:
        print(constituent_name)
        # Pipeline: match this constituent's tweets in the window, unwind
        # the ORG entity tags, count tag occurrences, sort by count
        # descending, and shape the output rows.
        pipeline = [{
            "$match": {
                "constituent_id": constituent_id,
                "date": {
                    "$gte": from_date,
                    "$lte": to_date
                }
            }
        }, {
            "$project": {
                "constituent_id": "$constituent_id",
                "constituent_name": "$constituent_name",
                "org_tags": "$entity_tags.ORG",
                "constituent": "$constituent"
            }
        }, {
            "$unwind": "$org_tags"
        }, {
            "$group": {
                "_id": {
                    "constituent_id": "$constituent_id",
                    "org_tags": "$org_tags",
                    "constituent_name": "$constituent_name"
                },
                "count": {
                    "$sum": 1
                }
            }
        }, {
            "$sort": SON([("count", -1)])
        }, {
            "$project": {
                "_id": 0,
                "constituent_id": "$_id.constituent_id",
                "constituent_name": "$_id.constituent_name",
                "org_tags": "$_id.org_tags",
                "count": "$count",
                "from_date": from_date,
                "to_date": to_date,
                "date": datetime.now()
            }
        }]

        results = list(collection.aggregate(pipeline))
        to_return = []

        # Drop noisy tags: the constituent's own name (either direction of
        # substring match), anything containing " ag" (German company
        # suffix -- NOTE(review): this matches the substring anywhere in
        # the tag, not just as a suffix), tags longer than three words,
        # low counts, and two known junk values.
        for r in results:
            if r["constituent_name"].lower() in r["org_tags"].lower() or \
               r["org_tags"].lower() in r["constituent_name"].lower() or \
               " AG".lower() in r["org_tags"].lower() or \
               len(r["org_tags"].split(" ")) > 3 or \
               r["count"] < 10 or \
               r["org_tags"].lower() == "Trends".lower() or \
               r["org_tags"].lower() == "Share".lower():
                continue
            else:
                to_return.append(r)

        # Prefer the filtered rows; if filtering removed everything, fall
        # back to inserting the unfiltered aggregation results.
        if to_return:
            twitter_analytics_top_orgs.insert_many(to_return)
        else:
            if results:
                twitter_analytics_top_orgs.insert_many(results)
def get_tweets(args):
    """Download, enrich and persist tweets for every constituent/language pair.

    For each (language, constituent) combination this function:
      1. Builds the Twitter search query from the keyword/exclusion tables.
      2. Looks up the highest tweet id already stored in BigQuery so only
         newer tweets are requested (``sinceId``).
      3. Pages through the Twitter API until ``MAX_TWEETS`` is reached,
         enriching each tweet with an NLTK sentiment score, spaCy entity
         tags and constituent metadata.
      4. Persists three variants per page: the unmodified tweets and the
         trimmed/tagged tweets to BigQuery, plus a datetime-typed copy to
         MongoDB.
      5. Optionally writes a per-constituent summary row to the logging table.

    Args:
        args: argparse-style namespace providing ``param_connection_string``,
            ``google_key_path`` and ``environment``.

    Returns:
        str: the literal ``"Downloaded tweets"`` once all constituents are
        processed.
    """
    from utils import logging_utils as logging_utils
    from utils.TwitterDownloader import TwitterDownloader
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tap
    from utils.TaggingUtils import TaggingUtils as TU

    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = [
        "LANGUAGE", "TWEETS_PER_QUERY", "MAX_TWEETS", "CONNECTION_STRING",
        "DATABASE_NAME", "COLLECTION_NAME", "LOGGING", "EMAIL_USERNAME",
        "EMAIL_PASSWORD", "TWITTER_API_KEY", "TWITTER_API_SECRET",
        "BUCKET_NAME", "DESTINATION_TABLE", "LOGGING_TABLE"
    ]

    parameters = tap.get_parameters(args.param_connection_string, param_table,
                                    parameters_list)

    # Resolve the BigQuery dataset name for the active environment.
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')

    common_parameters = tap.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    languages = parameters["LANGUAGE"].split(",")

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])
    tagger = TU()

    downloader = TwitterDownloader(parameters["TWITTER_API_KEY"],
                                   parameters["TWITTER_API_SECRET"])
    downloader.load_api()

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"])

    # Whitelist of fields that make it into the "modified" (tagged) table.
    fields_to_keep = [
        "text", "favorite_count", "source", "retweeted", "entities", "id_str",
        "retweet_count", "favorited", "user", "lang", "created_at", "place",
        "constituent_name", "constituent_id", "search_term", "id",
        "sentiment_score", "entity_tags", "relevance", "constituent"
    ]

    for language in languages:
        for constituent_id, constituent_name in all_constituents:
            search_query = get_search_string(constituent_id,
                                             args.param_connection_string,
                                             "PARAM_TWITTER_KEYWORDS",
                                             "PARAM_TWITTER_EXCLUSIONS")

            # Get max id of stored tweets so we only fetch newer ones.
            q = "SELECT MAX(id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' " \
                "AND lang = '{}';".format(common_parameters["BQ_DATASET"],parameters["DESTINATION_TABLE"],
                                        constituent_id,language)
            try:
                sinceId = int(
                    storage.get_bigquery_data(
                        q, iterator_flag=False)[0]["max_id"])
            except Exception as e:
                # No prior tweets (or query failure): fetch without a floor.
                print(e)
                sinceId = None

            max_id = -1
            tweetCount = 0

            print("Downloading max {0} tweets for {1} in {2} on {3}".format(
                parameters["MAX_TWEETS"], constituent_name, language,
                str(datetime.now())))
            while tweetCount < parameters["MAX_TWEETS"]:
                tweets_unmodified = []
                tweets_modified = []
                tweets_mongo = []

                try:
                    tweets, tmp_tweet_count, max_id = downloader.download(
                        constituent_name, search_query, language,
                        parameters["TWEETS_PER_QUERY"], sinceId, max_id)
                except Exception as e:
                    # Log instead of swallowing silently, then retry the page.
                    print(e)
                    continue

                if not tweets:
                    break
                else:
                    print("Downloaded {} tweets".format(tmp_tweet_count))

                tweetCount += tmp_tweet_count

                # Add fields for both unmodified and modified tweets.
                for tweet in tweets:
                    tweet._json['source'] = "Twitter"
                    tweet._json['constituent_name'] = constituent_name
                    tweet._json['constituent_id'] = constituent_id
                    tweet._json['search_term'] = search_query
                    tweet._json["constituent"] = tap.get_old_constituent_name(
                        constituent_id)

                    # Strip fields BigQuery/Mongo cannot ingest.
                    clean_tweet = tap.scrub(tweet._json)

                    # Separate the tweets that go to one topic or the other.

                    # Unmodified copy (only a normalized date is added).
                    t_unmodified = deepcopy(clean_tweet)
                    t_unmodified["date"] = tap.convert_timestamp(
                        t_unmodified["created_at"])
                    tweets_unmodified.append(t_unmodified)

                    # Enrichment fields shared by the Mongo/modified copies.
                    clean_tweet["sentiment_score"] = tap.get_nltk_sentiment(
                        str(clean_tweet["text"]))
                    tagged_text = tagger.get_spacy_entities(
                        str(clean_tweet["text"]))
                    clean_tweet["entity_tags"] = tap.get_spacey_tags(
                        tagged_text)
                    clean_tweet["relevance"] = -1

                    # Mongo copy keeps a real datetime object.
                    t_mongo = deepcopy(clean_tweet)
                    t_mongo['date'] = datetime.strptime(
                        t_mongo['created_at'], '%a %b %d %H:%M:%S %z %Y')
                    tweets_mongo.append(t_mongo)

                    # Modified copy is trimmed to the whitelisted fields.
                    tagged_tweet = dict((k, clean_tweet[k])
                                        for k in fields_to_keep
                                        if k in clean_tweet)
                    tagged_tweet['date'] = tap.convert_timestamp(
                        clean_tweet["created_at"])
                    tweets_modified.append(tagged_tweet)

                # Send to PubSub topic (disabled).
                #ps_utils.publish("igenie-project", "tweets-unmodified", tweets_unmodified)
                #ps_utils.publish("igenie-project", "tweets", tweets_modified)
                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        '{}_unmodified'.format(
                            parameters["DESTINATION_TABLE"]),
                        tweets_unmodified)
                except Exception as e:
                    print(e)
                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        parameters["DESTINATION_TABLE"], tweets_modified)
                except Exception as e:
                    print(e)
                try:
                    # NOTE(review): Mongo database name is hard-coded here
                    # while other names come from parameters — confirm intended.
                    storage.save_to_mongodb(tweets_mongo, "dax_gcp",
                                            parameters["DESTINATION_TABLE"])
                except Exception as e:
                    print(e)

                time.sleep(1)

            # Original format string had two placeholders for three args,
            # printing "for in" and dropping the language — fixed.
            print("Saved {} tweets for {} in {}".format(tweetCount,
                                                        constituent_name,
                                                        language))

            if parameters["LOGGING"]:
                doc = [{
                    "date":
                    # NOTE(review): built from .date().timetuple(), so the
                    # logged timestamp is always midnight — confirm intended.
                    time.strftime('%Y-%m-%d %H:%M:%S',
                                  datetime.now().date().timetuple()),
                    "constituent_name":
                    constituent_name,
                    "constituent_id":
                    constituent_id,
                    "downloaded_tweets":
                    tweetCount,
                    "language":
                    language
                }]
                logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

    return "Downloaded tweets"
Beispiel #26
0
def get_daily_orbis_news(args):
    """Fetch new Orbis (BvD) news items per constituent into BigQuery.

    For every constituent this:
      1. Reads the highest stored ``news_id`` from BigQuery and keeps only
         newer items.
      2. Opens a SOAP session against Orbis and pages through the news
         dimension 20 records at a time.
      3. Enriches each page (NLTK sentiment score, sentiment word, spaCy
         entity tags, constituent metadata), normalises field types and
         inserts the rows into the destination BigQuery table.
      4. Optionally writes a summary row per page to the logging table.

    Args:
        args: argparse-style namespace providing ``google_key_path``,
            ``param_connection_string`` and ``environment``.
    """
    if __name__ != "__main__":
        # Deferred imports so the module can also run as a standalone script.
        # (The original repeated three of these imports; deduplicated.)
        from utils.Storage import Storage
        from utils.Storage import MongoEncoder
        from utils.SOAPUtils import SOAPUtils
        from utils import twitter_analytics_helpers as tah
        from utils.TaggingUtils import TaggingUtils as TU
        from utils import logging_utils as logging_utils

    # get constituents
    soap = SOAPUtils()
    storage = Storage(args.google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"]
    table = "MASTER_CONSTITUENTS"

    constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = [
        "LOGGING", "BVD_USERNAME", "BVD_PASSWORD", "DESTINATION_TABLE",
        "LOGGING_TABLE"
    ]
    where = lambda x: x["SOURCE"] == 'Orbis'

    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Resolve the BigQuery dataset name for the active environment.
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')

    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    for constituent_id, constituent_name, bvdid in constituents:
        # Get the newest stored news id for the constituent.
        query = """
            SELECT max(news_id) as max_id FROM `{}.{}`
            WHERE constituent_id = '{}' and news_origin = "Orbis"
            """.format(common_parameters["BQ_DATASET"],
                       parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage.get_bigquery_data(query=query,
                                               iterator_flag=False)
            max_id = result[0]["max_id"]
        except Exception as e:
            # Query failed: skip this constituent rather than risk
            # re-ingesting or crashing on an unknown floor.
            print(e)
            continue

        if max_id is None:
            # No rows stored yet: take everything. Without this guard the
            # `news_id > max_id` comparison below raises a TypeError.
            max_id = 0

        start = 0
        max_count = 20
        print("Constituent: {},{}".format(constituent_name, bvdid))

        stop = False

        try:
            token = soap.get_token(parameters["BVD_USERNAME"],
                                   parameters["BVD_PASSWORD"], 'orbis')
            selection_token, selection_count = soap.find_by_bvd_id(
                token, bvdid, 'orbis')
        except Exception as e:
            # Cannot open the Orbis session at all: abort the whole run.
            print(e)
            return

        while not stop:
            try:
                query = "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, " \
                        "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title," \
                        "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, " \
                        "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, " \
                        "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics," \
                        "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country," \
                        "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region," \
                        "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language," \
                        "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source," \
                        "LINE BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication," \
                        "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id FROM RemoteAccess.A".format(
                    start, max_count)

                get_data_result = soap.get_data(token,
                                                selection_token,
                                                selection_count,
                                                query,
                                                'orbis',
                                                timeout=None)
            except Exception as e:
                print(str(e))
                if token:
                    soap.close_connection(token, 'orbis')
                    token = None
                break

            # The CSV payload sits in the first nested element of the reply.
            result = ET.fromstring(get_data_result)
            csv_result = result[0][0][0].text

            TESTDATA = StringIO(csv_result)
            try:
                df = pd.read_csv(TESTDATA, sep=",", parse_dates=["news_date"])
            except Exception as e:
                print(e)
                break

            if df.shape[0] == 0:
                print("No records in df")
                break

            initial_shape = df.shape[0]

            # astype returns a new frame; the original discarded the result,
            # so news_id could stay a string and break the comparison below.
            df = df.astype({"news_id": int}, copy=False, errors='ignore')
            df = df.loc[df["news_id"] > max_id]

            if df.shape[0] == 0:
                print("No new data")
                break

            # A partially-filtered page means we reached already-stored ids:
            # process this page, then stop paging.
            if df.shape[0] != initial_shape:
                stop = True

            # Make news_title column a string (assign back — see note above).
            df = df.astype({"news_title": str}, copy=False, errors='ignore')

            # Remove duplicate rows by title.
            df.drop_duplicates(["news_title"], inplace=True)

            # Get sentiment score
            df["score"] = df.apply(lambda row: tah.get_nltk_sentiment(
                str(row["news_article_txt"])),
                                   axis=1)

            # get sentiment word
            df["sentiment"] = df.apply(
                lambda row: tah.get_sentiment_word(row["score"]), axis=1)

            # add constituent name, id and old name
            df["constituent_id"] = constituent_id
            df["constituent_name"] = constituent_name
            old_constituent_name = tah.get_old_constituent_name(constituent_id)
            df["constituent"] = old_constituent_name

            # add URL
            df["url"] = None

            # add show
            df["show"] = True

            # get entity tags (aligned with df rows after deduplication)
            entity_tags = []
            for text in df["news_title"]:
                tags = tah.get_spacey_tags(tagger.get_spacy_entities(
                    str(text)))
                entity_tags.append(tags)

            fields = [
                "news_date", "news_title", "news_article_txt",
                "news_companies", "news_source", "news_publication",
                "news_topics", "news_country", "news_region", "news_language",
                "news_id", "score", "sentiment", "constituent_id",
                "constituent_name", "constituent", "url", "show"
            ]

            # Get needed fields
            df_bigquery = df[fields]
            bigquery_data = json.loads(
                df_bigquery.to_json(orient="records", date_format="iso"))

            # Normalise each record in place for BigQuery ingestion.
            for i in range(0, len(bigquery_data)):
                bigquery_data[i]["entity_tags"] = entity_tags[i]

                # set news_date (ISO from to_json -> BigQuery timestamp format)
                if "news_date" in bigquery_data[i] and bigquery_data[i][
                        "news_date"]:
                    date = bigquery_data[i]["news_date"]
                    ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                    ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                    bigquery_data[i]["news_date"] = ts
                else:
                    bigquery_data[i]["news_date"] = None

                if bigquery_data[i]["news_title"]:
                    bigquery_data[i]["news_title"] = str(
                        bigquery_data[i]["news_title"])

                if bigquery_data[i]["news_article_txt"]:
                    bigquery_data[i]["news_article_txt"] = str(
                        bigquery_data[i]["news_article_txt"])

                # Semicolon-separated fields become lists of trimmed strings.
                if bigquery_data[i]["news_companies"]:
                    try:
                        bigquery_data[i]["news_companies"] = [
                            v.strip() for v in bigquery_data[i]
                            ["news_companies"].split(";")
                        ]
                    except Exception as e:
                        bigquery_data[i]["news_companies"] = []
                else:
                    bigquery_data[i]["news_companies"] = []

                if bigquery_data[i]["news_topics"]:
                    try:
                        if bigquery_data[i]["news_topics"].isdigit():
                            bigquery_data[i]["news_topics"] = []
                        elif bigquery_data[i]["news_topics"] == 'None':
                            bigquery_data[i]["news_topics"] = []
                        else:
                            bigquery_data[i]["news_topics"] = [
                                v.strip() for v in bigquery_data[i]
                                ["news_topics"].split(";")
                            ]
                    except Exception as e:
                        bigquery_data[i]["news_topics"] = []
                else:
                    bigquery_data[i]["news_topics"] = []

                if bigquery_data[i]["news_country"]:
                    try:
                        bigquery_data[i]["news_country"] = [
                            v.strip() for v in bigquery_data[i]
                            ["news_country"].split(";")
                        ]
                    except Exception as e:
                        bigquery_data[i]["news_country"] = []
                else:
                    bigquery_data[i]["news_country"] = []

                if bigquery_data[i]["news_region"]:
                    try:
                        bigquery_data[i]["news_region"] = [
                            v.strip()
                            for v in bigquery_data[i]["news_region"].split(";")
                        ]
                    except Exception as e:
                        bigquery_data[i]["news_region"] = []
                else:
                    bigquery_data[i]["news_region"] = []

                if bigquery_data[i]["news_language"]:
                    bigquery_data[i]["news_language"] = str(
                        bigquery_data[i]["news_language"])

                if bigquery_data[i]["news_source"]:
                    bigquery_data[i]["news_source"] = str(
                        bigquery_data[i]["news_source"])

                if bigquery_data[i]["news_publication"]:
                    bigquery_data[i]["news_publication"] = str(
                        bigquery_data[i]["news_publication"])

                if bigquery_data[i]["news_id"]:
                    try:
                        bigquery_data[i]["news_id"] = int(
                            bigquery_data[i]["news_id"])
                    except Exception as e:
                        bigquery_data[i]["news_id"] = str(
                            bigquery_data[i]["news_id"])

                if bigquery_data[i]["sentiment"]:
                    bigquery_data[i]["sentiment"] = str(
                        bigquery_data[i]["sentiment"])

                if bigquery_data[i]["constituent_id"]:
                    bigquery_data[i]["constituent_id"] = str(
                        bigquery_data[i]["constituent_id"])

                if bigquery_data[i]["constituent_name"]:
                    bigquery_data[i]["constituent_name"] = str(
                        bigquery_data[i]["constituent_name"])

                if bigquery_data[i]["constituent"]:
                    bigquery_data[i]["constituent"] = str(
                        bigquery_data[i]["constituent"])

            try:
                storage.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                             parameters["DESTINATION_TABLE"],
                                             bigquery_data)
            except Exception as e:
                print(e)

            # Advance the paging offset by one page.
            start = start + 20

            print("Records saved: {}".format(df.shape[0]))

            log = [{
                "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
                "constituent_name": constituent_name,
                "constituent_id": constituent_id,
                "downloaded_news": df.shape[0],
                "source": "Orbis"
            }]

            if parameters["LOGGING"] and bigquery_data:
                logging_utils.logging(log, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

        if token:
            soap.close_connection(token, 'orbis')
Beispiel #27
0
def get_historical_orbis_news(user, pwd, database, google_key_path,
                              param_connection_string):
    """Backfill historical Orbis (BvD) news per constituent into JSON files.

    Pages through the Orbis news dimension 20 records at a time for every
    constituent (minus a hard-coded skip list), enriches each page (NLTK
    sentiment, sentiment word, spaCy entity tags, constituent metadata),
    normalises field types, and appends one JSON document per article to
    ``bq_news_<constituent_id>.json`` for later BigQuery loading.

    Args:
        user: BvD/Orbis username.
        pwd: BvD/Orbis password.
        database: Orbis database identifier passed to the SOAP helpers.
        google_key_path: path to the Google service-account key.
        param_connection_string: SQL connection string for parameter tables.
    """
    # get parameters
    soap = SOAPUtils()
    storage = Storage(google_key_path)
    tagger = TU()

    columns = ["CONSTITUENT_ID", "CONSTITUENT_NAME", "BVDID"]
    table = "MASTER_CONSTITUENTS"

    # NOTE(review): [2:] skips the first two constituents — presumably
    # already backfilled; confirm before rerunning from scratch.
    constituents = storage.get_sql_data(
        sql_connection_string=param_connection_string,
        sql_table_name=table,
        sql_column_list=columns)[2:]

    to_skip = [
        "BAYERISCHE MOTOREN WERKE AG", "DEUTSCHE BANK AG", "SIEMENS AG",
        "SAP SE", "VOLKSWAGEN AG"
    ]

    for constituent_id, constituent_name, bvdid in constituents:
        limit = get_number_of_news_items(constituent_name)
        if constituent_name in to_skip:
            print("Skipping")
            continue

        records = 0
        start = 0
        max_count = 20
        filename = "bq_news_{}.json".format(constituent_id)
        print("Constituent: {},{}".format(constituent_name, bvdid))
        # Consecutive-failure counter (declared but never used in the
        # original, which could retry the same page forever).
        failed = 0

        # NOTE(review): resume offset for a previously interrupted run.
        if constituent_name == "BASF SE":
            start = 4280

        try:
            token = soap.get_token(user, pwd, database)
            selection_token, selection_count = soap.find_by_bvd_id(
                token, bvdid, database)
        except Exception as e:
            print(e)
            return

        with open(filename, "a") as f:
            while limit > records:
                try:
                    query = "SELECT LINE BVDNEWS.NEWS_DATE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_date, " \
                            "LINE BVDNEWS.NEWS_TITLE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_title," \
                            "LINE BVDNEWS.NEWS_ARTICLE_TXT USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_article_txt, " \
                            "LINE BVDNEWS.NEWS_COMPANIES USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_companies, " \
                            "LINE BVDNEWS.NEWS_TOPICS USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_topics," \
                            "LINE BVDNEWS.NEWS_COUNTRY USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_country," \
                            "LINE BVDNEWS.NEWS_REGION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_region," \
                            "LINE BVDNEWS.NEWS_LANGUAGE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_language," \
                            "LINE BVDNEWS.NEWS_SOURCE USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_source," \
                            "LINE BVDNEWS.NEWS_PUBLICATION USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_publication," \
                            "LINE BVDNEWS.NEWS_ID USING [Parameters.RepeatingDimension=NewsDim;Parameters.RepeatingOffset={0};Parameters.RepeatingMaxCount={1}] AS news_id FROM RemoteAccess.A".format(
                        start, max_count)

                    get_data_result = soap.get_data(token,
                                                    selection_token,
                                                    selection_count,
                                                    query,
                                                    database,
                                                    timeout=None)
                except Exception as e:
                    print(str(e))
                    # Bail out after repeated failures on the same offset
                    # instead of retrying forever.
                    failed += 1
                    if failed >= 5:
                        break
                    continue

                # The CSV payload sits in the first nested element of the reply.
                result = ET.fromstring(get_data_result)
                csv_result = result[0][0][0].text

                TESTDATA = StringIO(csv_result)
                try:
                    df = pd.read_csv(TESTDATA,
                                     sep=",",
                                     parse_dates=["news_date"])
                except Exception as e:
                    print(e)
                    failed += 1
                    if failed >= 5:
                        break
                    continue

                if df.shape[0] == 0:
                    print("No records in df")
                    failed += 1
                    if failed >= 5:
                        break
                    continue

                # Reset the failure counter after a successful page.
                failed = 0

                # Make news_title column a string. astype returns a new
                # frame; the original discarded the result, losing the cast.
                df = df.astype({"news_title": str}, copy=False, errors='ignore')

                # Remove duplicate rows by title.
                df.drop_duplicates(["news_title"], inplace=True)

                # Get sentiment score
                df["score"] = df.apply(lambda row: tah.get_nltk_sentiment(
                    str(row["news_article_txt"])),
                                       axis=1)

                # get sentiment word
                df["sentiment"] = df.apply(
                    lambda row: tah.get_sentiment_word(row["score"]), axis=1)

                # add constituent name, id and old name
                df["constituent_id"] = constituent_id
                df["constituent_name"] = constituent_name
                old_constituent_name = tah.get_old_constituent_name(
                    constituent_id)
                df["constituent"] = old_constituent_name

                # add URL
                df["url"] = None

                # add show
                df["show"] = True

                # get entity tags (aligned with df rows after deduplication)
                entity_tags = []
                for text in df["news_title"]:
                    tags = tah.get_spacey_tags(
                        tagger.get_spacy_entities(str(text)))
                    entity_tags.append(tags)

                fields = [
                    "news_date", "news_title", "news_article_txt",
                    "news_companies", "news_source", "news_publication",
                    "news_topics", "news_country", "news_region",
                    "news_language", "news_id", "score", "sentiment",
                    "constituent_id", "constituent_name", "constituent", "url",
                    "show"
                ]

                # Get needed fields
                df_bigquery = df[fields]
                bigquery_data = json.loads(
                    df_bigquery.to_json(orient="records", date_format="iso"))

                # Normalise each record in place and append it to the file.
                for i in range(0, len(bigquery_data)):
                    bigquery_data[i]["entity_tags"] = entity_tags[i]

                    # set news_date (ISO from to_json -> BigQuery format)
                    if "news_date" in bigquery_data[i] and bigquery_data[i][
                            "news_date"]:
                        date = bigquery_data[i]["news_date"]
                        ts = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
                        ts = time.strftime('%Y-%m-%d %H:%M:%S', ts)
                        bigquery_data[i]["news_date"] = ts
                    else:
                        bigquery_data[i]["news_date"] = None

                    if bigquery_data[i]["news_title"]:
                        bigquery_data[i]["news_title"] = str(
                            bigquery_data[i]["news_title"])

                    if bigquery_data[i]["news_article_txt"]:
                        bigquery_data[i]["news_article_txt"] = str(
                            bigquery_data[i]["news_article_txt"])

                    # Semicolon-separated fields become lists of trimmed strings.
                    if bigquery_data[i]["news_companies"]:
                        try:
                            bigquery_data[i]["news_companies"] = [
                                v.strip() for v in bigquery_data[i]
                                ["news_companies"].split(";")
                            ]
                        except Exception as e:
                            bigquery_data[i]["news_companies"] = []
                    else:
                        bigquery_data[i]["news_companies"] = []

                    if bigquery_data[i]["news_topics"]:
                        try:
                            if bigquery_data[i]["news_topics"].isdigit():
                                bigquery_data[i]["news_topics"] = []
                            elif bigquery_data[i]["news_topics"] == 'None':
                                bigquery_data[i]["news_topics"] = []
                            else:
                                bigquery_data[i]["news_topics"] = [
                                    v.strip() for v in bigquery_data[i]
                                    ["news_topics"].split(";")
                                ]
                        except Exception as e:
                            bigquery_data[i]["news_topics"] = []
                    else:
                        bigquery_data[i]["news_topics"] = []

                    if bigquery_data[i]["news_country"]:
                        try:
                            bigquery_data[i]["news_country"] = [
                                v.strip() for v in bigquery_data[i]
                                ["news_country"].split(";")
                            ]
                        except Exception as e:
                            bigquery_data[i]["news_country"] = []
                    else:
                        bigquery_data[i]["news_country"] = []

                    if bigquery_data[i]["news_region"]:
                        try:
                            bigquery_data[i]["news_region"] = [
                                v.strip() for v in bigquery_data[i]
                                ["news_region"].split(";")
                            ]
                        except Exception as e:
                            bigquery_data[i]["news_region"] = []
                    else:
                        bigquery_data[i]["news_region"] = []

                    if bigquery_data[i]["news_language"]:
                        bigquery_data[i]["news_language"] = str(
                            bigquery_data[i]["news_language"])

                    if bigquery_data[i]["news_source"]:
                        bigquery_data[i]["news_source"] = str(
                            bigquery_data[i]["news_source"])

                    if bigquery_data[i]["news_publication"]:
                        bigquery_data[i]["news_publication"] = str(
                            bigquery_data[i]["news_publication"])

                    if bigquery_data[i]["news_id"]:
                        try:
                            bigquery_data[i]["news_id"] = int(
                                bigquery_data[i]["news_id"])
                        except Exception as e:
                            bigquery_data[i]["news_id"] = str(
                                bigquery_data[i]["news_id"])

                    if bigquery_data[i]["sentiment"]:
                        bigquery_data[i]["sentiment"] = str(
                            bigquery_data[i]["sentiment"])

                    if bigquery_data[i]["constituent_id"]:
                        bigquery_data[i]["constituent_id"] = str(
                            bigquery_data[i]["constituent_id"])

                    if bigquery_data[i]["constituent_name"]:
                        bigquery_data[i]["constituent_name"] = str(
                            bigquery_data[i]["constituent_name"])

                    if bigquery_data[i]["constituent"]:
                        bigquery_data[i]["constituent"] = str(
                            bigquery_data[i]["constituent"])

                    # One JSON document per line (newline-delimited JSON).
                    f.write(
                        json.dumps(bigquery_data[i], cls=MongoEncoder) + '\n')

                # storage.insert_bigquery_data("pecten_dataset", "news", bigquery_data)

                # Advance the paging offset; ``records`` tracks requested
                # (not deduplicated) rows against ``limit``.
                start = start + 20
                records += 20
                print("Records saved: {}".format(records))

        if token:
            soap.close_connection(token, database)
Beispiel #28
0
# Projection shared by every tweet query below.
_PRICE_TWEET_FIELDS = {
    "text": 1,
    "constituent_id": 1,
    "constituent_name": 1,
    "constituent": 1,
    "entity_tags.MONEY": 1,
    "sentiment_score": 1,
    "date": 1,
    "_id": 0
}


def _find_tweets(collection, criteria, limit=None):
    """Return projected tweet documents matching *criteria*, newest first.

    :param collection: pymongo collection to query.
    :param criteria: Mongo filter document.
    :param limit: optional maximum number of documents to return.
    """
    cursor = collection.find(criteria, _PRICE_TWEET_FIELDS)
    if limit is not None:
        cursor = cursor.limit(limit)
    return list(cursor.sort([("date", -1)]))


def _contains_number(text):
    """True if any space-separated token of *text* parses as a float."""
    for token in text.split(" "):
        try:
            float(token)
            return True
        except (TypeError, ValueError):
            pass
    return False


def _filter_price_tweets(results, symbol, from_date, to_date):
    """Keep tweets that mention *symbol* as a standalone token, carry a
    MONEY entity tag and contain a numeric token.

    Kept tweets are mutated in place: URLs are stripped from the text,
    the collection date moves to ``tweet_date`` and the query window is
    stamped onto the document.
    """
    price_tweets = []
    for item in results:
        if symbol not in item["text"].split(" "):
            continue
        if not item["entity_tags"]["MONEY"]:
            continue
        # Remove currency markers so price figures parse as plain floats.
        normalized = item["text"].replace("$", "").replace("€", "").replace(
            "EUR", " ")
        if not _contains_number(normalized):
            continue
        item["text"] = " ".join(
            t for t in item["text"].split(" ") if "http" not in t)
        item["tweet_date"] = item["date"]
        item["date"] = datetime.now()
        item["from_date"] = from_date
        item["to_date"] = to_date
        price_tweets.append(item)
    return price_tweets


def _dedupe_records(results):
    """Drop duplicate texts and round-trip through pandas JSON so dates
    become ISO strings (matching the storage format used downstream)."""
    df = pd.DataFrame(results)
    df.drop_duplicates("text", inplace=True)
    return json.loads(df.to_json(orient="records", date_format="iso"))


def _dedupe_with_window(results, from_date, to_date):
    """Deduplicate *results* and stamp the query window onto each record."""
    records = _dedupe_records(results)
    for record in records:
        record["from_date"] = from_date
        record["to_date"] = to_date
    return records


def _no_tweets_placeholder(constituent_id, constituent_name, from_date,
                           to_date):
    """Single synthetic record used when no tweets match at all."""
    return [{
        # NOTE(review): 'constituent' mirrors the display name here (not a
        # legacy alias) -- this matches the original behaviour.
        'constituent': constituent_name,
        'constituent_id': constituent_id,
        'constituent_name': constituent_name,
        'date': datetime.now(),
        'entity_tags': {
            'MONEY': ['']
        },
        'from_date': from_date,
        'sentiment_score': 0.0,
        'text': 'No new tweets.',
        'to_date': to_date,
        'tweet_date': None
    }]


def get_twitter_analytics_latest_price_tweets(args, from_date, to_date):
    """Build the ``twitter_analytics_latest_price_tweets`` Mongo collection.

    For every constituent, select price-mentioning tweets in
    [from_date, to_date]; fall back to at most 5 recent tweets from
    accounts with >= 200 followers, then to any 5 recent tweets, and
    finally to a placeholder record so every constituent gets an entry.

    :param args: namespace with ``param_connection_string`` and
        ``google_key_path``.
    :param from_date: window start (datetime, inclusive).
    :param to_date: window end (datetime, inclusive).
    """
    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = ["CONNECTION_STRING"]

    parameters = get_parameters(args.param_connection_string, param_table,
                                parameters_list)

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "SYMBOL"])

    client = MongoClient(parameters["CONNECTION_STRING"])
    db = client["dax_gcp"]
    collection = db["tweets"]
    output_collection = db["twitter_analytics_latest_price_tweets"]

    date_window = {"$gte": from_date, "$lte": to_date}

    for constituent_id, constituent_name, symbol in all_constituents:
        symbol = '$' + symbol
        print(symbol)
        final = []

        # Primary query: in-window tweets that tag at least one cashtag.
        results = _find_tweets(
            collection, {
                "date": date_window,
                "constituent_id": constituent_id,
                "entities.symbols": {
                    "$exists": True,
                    "$ne": []
                }
            })

        # Fallback: in-window tweets from reasonably-followed accounts.
        fallback_criteria = {
            "date": date_window,
            "constituent_id": constituent_id,
            "user.followers_count": {
                "$gte": 200
            }
        }

        if results:
            print("A")
            price_tweets = _filter_price_tweets(results, symbol, from_date,
                                                to_date)
            if price_tweets:
                final = _dedupe_records(price_tweets)
                # Dates went through pandas' ISO serialisation; restore them.
                for record in final:
                    record["from_date"] = datetime.strptime(
                        record["from_date"], "%Y-%m-%dT%H:%M:%S.%fZ")
                    record["to_date"] = datetime.strptime(
                        record["to_date"], "%Y-%m-%dT%H:%M:%S.%fZ")
            else:
                print("A2")
                results = _find_tweets(collection, fallback_criteria, limit=5)
                if results:
                    final = _dedupe_with_window(results, from_date, to_date)
                else:
                    print("B2")
                    final = _no_tweets_placeholder(constituent_id,
                                                   constituent_name,
                                                   from_date, to_date)
        else:
            print("B")
            results = _find_tweets(collection, fallback_criteria, limit=5)
            if results:
                final = _dedupe_with_window(results, from_date, to_date)
            else:
                print("C")
                # Last resort: ignore the date window entirely.
                results = _find_tweets(collection,
                                       {"constituent_id": constituent_id},
                                       limit=5)
                if results:
                    final = _dedupe_with_window(results, from_date, to_date)
                else:
                    print("D")
                    final = _no_tweets_placeholder(constituent_id,
                                                   constituent_name,
                                                   from_date, to_date)

        if final:
            output_collection.insert_many(final)
        time.sleep(1)
Beispiel #29
0
def get_news_tags_bq(args):
    """Count news topics per constituent and day in BigQuery and write
    the aggregate into the ``news_tag`` table of the active dataset."""
    # Resolve the active dataset and reporting window for this environment.
    active_env = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string,
                                       "PARAM_READ_DATE",
                                       ["BQ_DATASET", "FROM_DATE", "TO_DATE"],
                                       active_env)

    print("news_tags")

    columns = ["Date", "constituent", "Tags", "Count", "constituent_name",
               "constituent_id", "From_date", "To_date"]

    query = """
    SELECT
  a.news_date AS Date,
  a.constituent,
  b.news_topics as Tags,
  COUNT(b.news_topics) AS Count,
  a.constituent_name,
  a.constituent_id,
  TIMESTAMP('{1}') as From_date, TIMESTAMP('{2}') as To_date
FROM
  `{0}.all_news` a,
(SELECT
  x.news_id,
  news_topics
FROM
  `{0}.all_news` AS x,
  UNNEST(news_topics) AS news_topics) b
WHERE a.news_id = b.news_id AND 
a.news_date BETWEEN TIMESTAMP ('{1}')
  AND TIMESTAMP ('{2}')
GROUP BY
  Date,
  a.constituent,
  b.news_topics,
  a.constituent_name,
  a.constituent_id;
    """.format(common_parameters["BQ_DATASET"],
               common_parameters["FROM_DATE"].strftime("%Y-%m-%d"),
               common_parameters["TO_DATE"].strftime("%Y-%m-%d"))

    storage_client = Storage.Storage(args.google_key_path)
    rows = storage_client.get_bigquery_data(query, iterator_flag=True)

    def _as_plain(value):
        # Timestamps are serialised to strings before insertion.
        if isinstance(value, datetime):
            return value.strftime('%Y-%m-%d %H:%M:%S')
        return value

    to_insert = [{name: _as_plain(row[name]) for name in columns}
                 for row in rows]

    try:
        storage_client.insert_bigquery_data(common_parameters["BQ_DATASET"],
                                            'news_tag', to_insert)
    except Exception as e:
        print(e)
def _strip_leading_zeros(date_str):
    """Turn a zero-padded 'DD.MM.YYYY' string into the unpadded
    'D.M.YYYY' form used by the boerse-frankfurt.de URL scheme."""
    day, month, year = date_str.split(".")
    return ".".join([day.lstrip("0"), month.lstrip("0"), year])


def main(args):
    """Scrape historical price data from boerse-frankfurt.de for the DAX
    index and/or its constituents and store it via extract_historical_data.

    The collection window runs from the day after the last stored date up
    to today; nothing is done when the window is empty. The PhantomJS
    browser is only created once there is work to do and is always
    released (the original leaked it on early returns and exceptions).
    """
    if __name__ != "__main__":
        sys.path.insert(0, args.python_path)
        from utils.Storage import Storage
        from utils import twitter_analytics_helpers as tah

    # Get parameters
    param_table = "PARAM_HISTORICAL_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE"]

    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')

    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    # Get constituents with their financial-page URL keys.
    storage = Storage(google_key_path=args.google_key_path)

    query = """
    SELECT a.CONSTITUENT_ID, a.CONSTITUENT_NAME, b.URL_KEY
    FROM MASTER_CONSTITUENTS a, PARAM_FINANCIAL_URL_KEYS b
    WHERE a.CONSTITUENT_ID = b.CONSTITUENT_ID
    """

    all_constituents = storage.get_sql_data_text_query(
        args.param_connection_string, query)

    # Collection window: day after the last stored date, through today.
    query = """
    SELECT max(date) as last_date FROM `{}.{}`
    """.format(common_parameters["BQ_DATASET"],
               parameters["DESTINATION_TABLE"])

    try:
        result = storage.get_bigquery_data(query=query, iterator_flag=False)
    except Exception as e:
        print(e)
        return

    from_date = _strip_leading_zeros(
        (result[0]["last_date"] + timedelta(days=1)).strftime("%d.%m.%Y"))
    to_date = _strip_leading_zeros(datetime.now().strftime("%d.%m.%Y"))

    if from_date == to_date:
        return

    dax_url = 'http://en.boerse-frankfurt.de/index/pricehistory/DAX/{}_{}#History'.format(
        from_date, to_date)
    constituent_base_url = 'http://en.boerse-frankfurt.de/stock/pricehistory/'
    constituent_date_url = '-share/FSE/{}_{}#Price_History'.format(
        from_date, to_date)

    driver = webdriver.PhantomJS()
    try:
        if args.all:
            extract_historical_data(dax_url,
                                    driver,
                                    storage,
                                    common_parameters["BQ_DATASET"],
                                    parameters["DESTINATION_TABLE"],
                                    constituent='DAX')
            for constituent_id, constituent_name, url_key in all_constituents:
                print("Extracting data for {} from {} to {}".format(
                    constituent_name, from_date, to_date))
                extract_historical_data(urljoin(
                    constituent_base_url, url_key + constituent_date_url),
                                        driver,
                                        storage,
                                        common_parameters["BQ_DATASET"],
                                        parameters["DESTINATION_TABLE"],
                                        constituent=(constituent_id,
                                                     constituent_name))
                time.sleep(10)
        elif args.constituent == 'DAX':
            extract_historical_data(dax_url,
                                    driver,
                                    storage,
                                    common_parameters["BQ_DATASET"],
                                    parameters["DESTINATION_TABLE"],
                                    constituent='DAX')
        else:
            for constituent_id, constituent_name, url_key in all_constituents:
                if constituent_id != args.constituent:
                    continue
                print("Extracting data for {} from {} to {}".format(
                    constituent_name, from_date, to_date))
                constituent_url = urljoin(constituent_base_url,
                                          url_key + constituent_date_url)
                extract_historical_data(constituent_url,
                                        driver,
                                        storage,
                                        common_parameters["BQ_DATASET"],
                                        parameters["DESTINATION_TABLE"],
                                        constituent=(constituent_id,
                                                     constituent_name))
    finally:
        # Fix: release the PhantomJS process even when extraction raises.
        driver.quit()
def get_country_data(args):
    """Aggregate tweet counts and average sentiment per country of origin
    and load the result into the ``country_data`` BigQuery table.

    Follows the PECTEN-9 safety protocol: back up the target table,
    validate rows against its schema, check the window is not already
    loaded, insert, verify the insert, and finally drop the backup.
    """
    active_env = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')
    common_parameters = get_parameters(args.param_connection_string,
                                       "PARAM_READ_DATE",
                                       ["BQ_DATASET", "FROM_DATE", "TO_DATE"],
                                       active_env)

    print("country_data")

    dataset = common_parameters["BQ_DATASET"]

    # Feature PECTEN-9: keep a restorable snapshot while we mutate the table.
    backup_table_name = backup_table(args.google_key_path, dataset, "country_data")

    columns = ["count", "avg_sentiment", "constituent_name", "country_name",
               "constituent", "constituent_id", "from_date", "to_date"]

    window_start = common_parameters["FROM_DATE"].strftime("%Y-%m-%d %H:%M:%S")
    window_end = common_parameters["TO_DATE"].strftime("%Y-%m-%d %H:%M:%S")

    query = """
    SELECT constituent, AVG(sentiment_score) as avg_sentiment, count(place.country_code) as count, place.country_code as country_name, constituent_name, constituent_id, TIMESTAMP('{1}') as from_date, TIMESTAMP('{2}') as to_date
    FROM `{0}.tweets`
    WHERE date BETWEEN TIMESTAMP ('{1}') and TIMESTAMP ('{2}') and place.country_code is not null
    GROUP BY constituent_id, constituent, country_name, constituent_name     
    """.format(dataset, window_start, window_end)

    storage_client = Storage.Storage(args.google_key_path)

    rows = storage_client.get_bigquery_data(query, iterator_flag=True)
    to_insert = [
        {key: (row[key].strftime('%Y-%m-%d %H:%M:%S')
               if isinstance(row[key], datetime) else row[key])
         for key in columns}
        for row in rows
    ]

    # Feature PECTEN-9: abort (and discard the backup) on schema mismatch.
    try:
        validate_data(args.google_key_path, to_insert, dataset, 'country_data')
    except AssertionError as e:
        drop_backup_table(args.google_key_path, dataset, backup_table_name)
        e.args += ("Schema of results does not match table schema.",)
        raise

    # Feature PECTEN-9: refuse to double-load the same window.
    try:
        before_insert(args.google_key_path, dataset, "country_data",
                      window_start, window_end, storage_client)
    except AssertionError as e:
        drop_backup_table(args.google_key_path, dataset, backup_table_name)
        e.args += ("Data already exists",)
        raise

    try:
        print("Inserting into BQ")
        storage_client.insert_bigquery_data(dataset, 'country_data', to_insert)
    except Exception as e:
        print(e)
        raise

    # Feature PECTEN-9: confirm rows landed; the backup is dropped either way.
    try:
        after_insert(args.google_key_path, dataset, "country_data",
                     window_start, window_end)
    except AssertionError as e:
        e.args += ("No data was inserted.",)
        raise
    finally:
        drop_backup_table(args.google_key_path, dataset, backup_table_name)
Beispiel #32
0
def write_to_file(args):
    """Translate German tweets fetched from BigQuery via a local
    translation service and append them, one JSON object per line, to
    ``tr_tweets.json``.

    Tweets whose ids are already present in the output file are skipped,
    so the job can be resumed after interruption.
    """
    from utils.Storage import Storage, MongoEncoder

    # Source tweets: all German-language tweets.
    # NOTE(review): this query hard-codes `pecten_dataset`; the original
    # also computed an environment-suffixed dataset name but never used it.
    query = """
        SELECT *
        FROM pecten_dataset.tweets
        WHERE lang = 'de'
        """

    storage_client = Storage(args.google_key_path)

    it = storage_client.get_bigquery_data(query, iterator_flag=True)

    total = 0

    # Resume support: collect ids already written to the output file.
    ids = set()
    try:
        with open("tr_tweets.json", "r") as fo:
            for line in fo:
                ids.add(json.loads(line)['id'])
    except FileNotFoundError:
        pass  # Fix: first run -- no output file yet, nothing to skip.

    with open("tr_tweets.json", "a") as f:
        for tweet in it:
            if tweet.id in ids:
                continue

            start_time = time.time()
            # Serialise the row, stringifying datetimes for JSON output.
            data = dict((k, tweet[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(tweet[k], datetime) else
                        (k, tweet[k]) for k in list(tweet._xxx_field_to_index.keys()))

            response = None

            try:
                url = "http://{}:5674/translate".format(args.ip_address)
                payload = {'q': tweet['text']}
                print("Making request")
                response = requests.get(url, params=payload, timeout=11)
                response = json.loads(response.text)
            except Exception as e:
                print(e)
                continue

            translated_text = None
            if response:
                try:
                    translated_text = response['data']['translations'][0]['translatedText']
                except Exception as e:
                    print(e)
                    continue

            if translated_text:
                # Keep the original text alongside the translation.
                data['text_original_language'] = data['text']
                data['text'] = translated_text

            f.write(json.dumps(data, cls=MongoEncoder) + '\n')
            print("--- {} seconds ---".format(time.time() - start_time))
            total += 1

            if total % 1000 == 0:
                print("Saved {} records".format(total))
Beispiel #33
0
def get_rss_feed(args):
    """Pull Yahoo Finance RSS headlines for every constituent, score their
    sentiment and append the new items to the news destination table.

    Only posts newer than the constituent's latest stored RSS item are
    inserted; a per-constituent count is logged when LOGGING is enabled.
    """
    from utils import logging_utils as logging_utils
    from utils import twitter_analytics_helpers as tah
    from utils.Storage import Storage

    storage_client = Storage(google_key_path=args.google_key_path)

    # Get parameters
    param_table = "PARAM_NEWS_COLLECTION"
    parameters_list = ["LOGGING", "DESTINATION_TABLE", "LOGGING_TABLE"]
    where = lambda x: x["SOURCE"] == 'RSS'

    parameters = tah.get_parameters(args.param_connection_string, param_table,
                                    parameters_list, where)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x[
        "STATUS"] == 'active')

    common_parameters = tah.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    # get constituents
    all_constituents = storage_client.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME", "SYMBOL"])

    for constituent_id, constituent_name, symbol in all_constituents:
        # Last stored RSS news date for this constituent.
        query = """
                    SELECT max(news_date) as max_date FROM `{}.{}`
                    WHERE constituent_id = '{}' and news_origin = "Yahoo Finance RSS"
                    """.format(common_parameters["BQ_DATASET"],
                               parameters["DESTINATION_TABLE"], constituent_id)

        try:
            result = storage_client.get_bigquery_data(query=query,
                                                      iterator_flag=False)
            max_date = result[0]["max_date"]
        except Exception:
            # NOTE(review): a failed lookup skips the constituent entirely
            # (same as the original behaviour); the dead `max_date = None`
            # assignment that preceded this `continue` has been removed.
            continue

        # Fix: normalise the stored timestamp once, outside the post loop
        # (the original re-stringified and re-parsed it on every post).
        if max_date:
            max_date = datetime.strptime(str(max_date)[:19],
                                         '%Y-%m-%d %H:%M:%S')

        print("Getting RSS Feed for {}".format(constituent_name))
        d = feedparser.parse(
            'http://finance.yahoo.com/rss/headline?s={}'.format(symbol +
                                                                ".DE"))

        to_insert = []

        for post in d.entries:
            date = datetime(post.published_parsed[0], post.published_parsed[1],
                            post.published_parsed[2], post.published_parsed[3],
                            post.published_parsed[4])
            # Skip posts we already stored.
            if max_date and date < max_date:
                continue

            doc = {
                "url": post.link,
                "news_date": date.strftime('%Y-%m-%d %H:%M:%S'),
                "news_language": post.summary_detail["language"],
                "news_title": post.title,
                "news_article_txt": post.summary,
                "news_origin": "Yahoo Finance RSS",
                "show": True
            }

            # Sentiment score and its word label.
            doc["score"] = tah.get_nltk_sentiment(str(doc["news_article_txt"]))
            doc["sentiment"] = tah.get_sentiment_word(doc["score"])

            # Constituent identifiers (current and legacy names).
            doc["constituent_id"] = constituent_id
            doc["constituent_name"] = constituent_name
            doc["constituent"] = tah.get_old_constituent_name(constituent_id)

            to_insert.append(doc)

        try:
            print("Inserting into BQ")
            storage_client.insert_bigquery_data(
                common_parameters["BQ_DATASET"],
                parameters["DESTINATION_TABLE"], to_insert)
        except Exception as e:
            print(e)

        log = [{
            # NOTE(review): .date() zeroes the time-of-day, so this always
            # logs midnight; presumably datetime.now().strftime(...) was
            # intended -- confirm before changing.
            "date": datetime.now().date().strftime('%Y-%m-%d %H:%M:%S'),
            "constituent_name": constituent_name,
            "constituent_id": constituent_id,
            "downloaded_news": len(to_insert),
            "source": "Yahoo Finance RSS"
        }]

        if parameters["LOGGING"] and to_insert:
            logging_utils.logging(log, common_parameters["BQ_DATASET"],
                                  parameters["LOGGING_TABLE"], storage_client)
Beispiel #34
0
def main(args):
    from utils.Storage import Storage
    #Get tweets
    query = """
    SELECT *
    FROM pecten_dataset.tweets_backup
    WHERE lang = 'de' and id NOT IN(
        SELECT distinct(id)
        FROM pecten_dataset.tweets
        WHERE text_original_language IS NOT NULL)
    """

    storage_client = Storage(args.google_key_path)
    dataset = 'pecten_dataset'

    if args.environment != 'production':
        dataset += "_" + args.environment

    it = storage_client.get_bigquery_data(query,iterator_flag=True)
    to_insert = []

    total = 0

    for tweet in it:
        data = dict((k,tweet[k].strftime('%Y-%m-%d %H:%M:%S')) if isinstance(tweet[k],datetime) else
                   (k,tweet[k]) for k in list(tweet._xxx_field_to_index.keys()))

        response = None

        try:
            url = "http://{}:5674/translate".format(args.ip_address)
            payload = {'q': tweet['text']}
            response = requests.get(url, params=payload,timeout=11)
            response = json.loads(response.text)
        except Exception as e:
            print(e)
            continue

        translated_text = None
        if response:
            try:
                translated_text = response['data']['translations'][0]['translatedText']
            except Exception as e:
                print(e)
                continue

        if translated_text:
            data['text_original_language'] = data['text']
            data['text'] = translated_text

        to_insert.append(data)

        if len(to_insert) == 500:
            print("Inserting to BQ production")
            try:
                result = storage_client.insert_bigquery_data("pecten_dataset", "tweets", to_insert)
                if result:
                    print("Data inserted")
                else:
                    print("Data not inserted")
            except Exception as e:
                print(e)

            print("Inserting to BQ test")
            try:
                result = storage_client.insert_bigquery_data("pecten_dataset_test", "tweets", to_insert)
                if result:
                    print("Data inserted")
                else:
                    print("Data not inserted")
            except Exception as e:
                print(e)

            to_insert = []