Example #1
import json
from datetime import datetime

from utils.Storage import Storage


def file_to_mongodb(args):
    operations = []
    storage = Storage(
        mongo_connection_string="mongodb://*****:*****@35.189.89.82:27017/dax_gcp")
    with open(args.file1, "r") as f1:
        records = 0
        for line in f1:
            data = json.loads(line)
            #change column names
            try:
                data["NEWS_TITLE_NewsDim"] = data.pop("news_title")
                data["NEWS_DATE_NewsDim"] = datetime.strptime(
                    data.pop("news_date"), '%Y-%m-%d %H:%M:%S')
                data["NEWS_ARTICLE_TXT_NewsDim"] = data.pop("news_article_txt")
                data["NEWS_SOURCE_NewsDim"] = data.pop("news_source")
                data["NEWS_PUBLICATION_NewsDim"] = data.pop("news_publication")
                data["categorised_tag"] = data.pop("news_topics")
                if data["constituent"] == "BMW":
                    data["constituent"] = "bmw"
            except Exception as e:
                print(e)
                continue

            operations.append(data)
            records += 1

            if len(operations) == 1000:
                print("Saving {} records".format(records))
                storage.save_to_mongodb(operations, "dax_gcp", "all_news")
                operations = []

        if len(operations) > 0:
            storage.save_to_mongodb(operations, "dax_gcp", "all_news")
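For context, a minimal sketch of how this function could be invoked: the only attribute the body reads from args is file1, so a small argparse wrapper is enough. The wrapper below is an assumption, not part of the original example.

# Hypothetical entry point; only args.file1 is required by file_to_mongodb().
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Load a JSON-lines news file into MongoDB")
    parser.add_argument("file1", help="Path to the JSON-lines file to import")
    file_to_mongodb(parser.parse_args())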
Example #2
def main(args):
    # args carry the connection string, database and collection names used for storage.
    # Map each constituent's display name to its BusinessInsider identifier.
    all_constituents_dict_bi = {
        'Allianz': 'Allianz',
        'Adidas': 'Adidas',
        'BASF': 'BASF',
        'Bayer': 'Bayer',
        'Beiersdorf': 'Beiersdorf',
        'BMW': 'BMW',
        'Commerzbank': 'Commerzbank',
        'Continental': 'Continental',
        'Daimler': 'Daimler',
        'Deutsche Bank': 'Deutsche_Bank',
        'Deutsche Börse': 'Deutsche_Boerse',
        'Deutsche Post': 'Deutsche_Post',
        'Deutsche Telekom': 'Deutsche_Telekom',
        'EON': 'EON',
        'Fresenius Medical Care': 'Fresenius_Medical_Care',
        'Fresenius': 'Fresenius',
        'HeidelbergCement': 'HeidelbergCement',
        'Infineon': 'Infineon',
        'Linde': 'Linde_6',
        'Lufthansa': 'Lufthansa',
        'Merck': 'Merck',
        'Münchener Rückversicherungs-Gesellschaft': 'Munich_Re',
        'ProSiebenSat1 Media': 'ProSiebenSat1_Media',
        'RWE': 'RWE',
        'Siemens': 'Siemens',
        'Thyssenkrupp': 'thyssenkrupp',
        'Volkswagen (VW) vz': 'Volkswagen_vz',
        'Vonovia': 'Vonovia'
    }

    bi_analyst_table = analyst_businessinsider(all_constituents_dict_bi)
    ws_analyst_table = analyst_wallstreet()
    combined_analyst_table = combined_analyst(ws_analyst_table,
                                              bi_analyst_table)

    import json
    combined_analyst_json = json.loads(
        combined_analyst_table.to_json(orient='records'))
    bi_analyst_json = json.loads(bi_analyst_table.to_json(orient='records'))

    from utils.Storage import Storage
    storage = Storage()

    #save the result: combined - analyst_opinions, bi - analyst_opinions_all
    storage.save_to_mongodb(connection_string=args.param_connection_string,
                            database=args.database,
                            collection=args.collection_selected,
                            data=combined_analyst_json)
    storage.save_to_mongodb(connection_string=args.param_connection_string,
                            database=args.database,
                            collection=args.collection_all,
                            data=bi_analyst_json)
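As a rough usage sketch, the body only reads param_connection_string, database, collection_selected and collection_all from args, so a wrapper along these lines would drive it. The argparse setup below is assumed, not taken from the original script.

# Hypothetical entry point; argument names mirror the attributes main() reads from args.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Collect and store analyst opinions")
    parser.add_argument("param_connection_string", help="MongoDB connection string")
    parser.add_argument("database", help="Target MongoDB database")
    parser.add_argument("collection_selected", help="Collection for the combined analyst table")
    parser.add_argument("collection_all", help="Collection for the full BusinessInsider table")
    main(parser.parse_args())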
Example #3
def get_tweets(args):
    import time
    from copy import deepcopy
    from datetime import datetime

    from utils import logging_utils as logging_utils
    from utils.TwitterDownloader import TwitterDownloader
    from utils.Storage import Storage
    from utils import twitter_analytics_helpers as tap
    from utils.TaggingUtils import TaggingUtils as TU

    param_table = "PARAM_TWITTER_COLLECTION"
    parameters_list = [
        "LANGUAGE", "TWEETS_PER_QUERY", "MAX_TWEETS", "CONNECTION_STRING",
        "DATABASE_NAME", "COLLECTION_NAME", "LOGGING", "EMAIL_USERNAME",
        "EMAIL_PASSWORD", "TWITTER_API_KEY", "TWITTER_API_SECRET",
        "BUCKET_NAME", "DESTINATION_TABLE", "LOGGING_TABLE"
    ]

    parameters = tap.get_parameters(args.param_connection_string, param_table,
                                    parameters_list)

    # Get dataset name
    common_table = "PARAM_READ_DATE"
    common_list = ["BQ_DATASET"]
    common_where = lambda x: (x["ENVIRONMENT"] == args.environment) & (x["STATUS"] == 'active')

    common_parameters = tap.get_parameters(args.param_connection_string,
                                           common_table, common_list,
                                           common_where)

    languages = parameters["LANGUAGE"].split(",")

    storage = Storage(google_key_path=args.google_key_path,
                      mongo_connection_string=parameters["CONNECTION_STRING"])
    tagger = TU()

    downloader = TwitterDownloader(parameters["TWITTER_API_KEY"],
                                   parameters["TWITTER_API_SECRET"])
    downloader.load_api()

    all_constituents = storage.get_sql_data(
        sql_connection_string=args.param_connection_string,
        sql_table_name="MASTER_CONSTITUENTS",
        sql_column_list=["CONSTITUENT_ID", "CONSTITUENT_NAME"])

    fields_to_keep = [
        "text", "favorite_count", "source", "retweeted", "entities", "id_str",
        "retweet_count", "favorited", "user", "lang", "created_at", "place",
        "constituent_name", "constituent_id", "search_term", "id",
        "sentiment_score", "entity_tags", "relevance", "constituent"
    ]

    for language in languages:
        for constituent_id, constituent_name in all_constituents:
            search_query = get_search_string(constituent_id,
                                             args.param_connection_string,
                                             "PARAM_TWITTER_KEYWORDS",
                                             "PARAM_TWITTER_EXCLUSIONS")

            # Get the max id of the tweets already stored so we only extract tweets with a higher id
            q = "SELECT MAX(id) as max_id FROM `{}.{}` WHERE constituent_id = '{}' " \
                "AND lang = '{}';".format(common_parameters["BQ_DATASET"],parameters["DESTINATION_TABLE"],
                                        constituent_id,language)
            try:
                sinceId = int(
                    storage.get_bigquery_data(
                        q, iterator_flag=False)[0]["max_id"])
            except Exception as e:
                print(e)
                sinceId = None

            max_id = -1
            tweetCount = 0

            print("Downloading max {0} tweets for {1} in {2} on {3}".format(
                parameters["MAX_TWEETS"], constituent_name, language,
                str(datetime.now())))
            while tweetCount < parameters["MAX_TWEETS"]:
                tweets_unmodified = []
                tweets_modified = []
                tweets_mongo = []

                try:
                    tweets, tmp_tweet_count, max_id = downloader.download(
                        constituent_name, search_query, language,
                        parameters["TWEETS_PER_QUERY"], sinceId, max_id)
                except Exception as e:
                    continue

                if not tweets:
                    break
                else:
                    print("Downloaded {} tweets".format(tmp_tweet_count))

                tweetCount += tmp_tweet_count

                #Add fields for both unmodified and modified tweets
                for tweet in tweets:
                    tweet._json['source'] = "Twitter"
                    tweet._json['constituent_name'] = constituent_name
                    tweet._json['constituent_id'] = constituent_id
                    tweet._json['search_term'] = search_query
                    tweet._json["constituent"] = tap.get_old_constituent_name(
                        constituent_id)

                    #Removing bad fields
                    clean_tweet = tap.scrub(tweet._json)

                    # Separate the tweets that go to one topic or the other

                    #unmodified
                    t_unmodified = deepcopy(clean_tweet)
                    t_unmodified["date"] = tap.convert_timestamp(
                        t_unmodified["created_at"])
                    tweets_unmodified.append(t_unmodified)

                    #Add additional fields
                    clean_tweet["sentiment_score"] = tap.get_nltk_sentiment(
                        str(clean_tweet["text"]))
                    tagged_text = tagger.get_spacy_entities(
                        str(clean_tweet["text"]))
                    clean_tweet["entity_tags"] = tap.get_spacey_tags(
                        tagged_text)
                    clean_tweet["relevance"] = -1

                    #mongo
                    t_mongo = deepcopy(clean_tweet)
                    t_mongo['date'] = datetime.strptime(
                        t_mongo['created_at'], '%a %b %d %H:%M:%S %z %Y')
                    tweets_mongo.append(t_mongo)

                    #modified
                    tagged_tweet = dict((k, clean_tweet[k])
                                        for k in fields_to_keep
                                        if k in clean_tweet)
                    tagged_tweet['date'] = tap.convert_timestamp(
                        clean_tweet["created_at"])
                    tweets_modified.append(tagged_tweet)

                #send to PubSub topic
                #ps_utils.publish("igenie-project", "tweets-unmodified", tweets_unmodified)
                #ps_utils.publish("igenie-project", "tweets", tweets_modified)
                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        '{}_unmodified'.format(
                            parameters["DESTINATION_TABLE"]),
                        tweets_unmodified)
                except Exception as e:
                    print(e)
                try:
                    storage.insert_bigquery_data(
                        common_parameters["BQ_DATASET"],
                        parameters["DESTINATION_TABLE"], tweets_modified)
                except Exception as e:
                    print(e)
                try:
                    storage.save_to_mongodb(tweets_mongo, "dax_gcp",
                                            parameters["DESTINATION_TABLE"])
                except Exception as e:
                    print(e)

                time.sleep(1)

            print("Saved {} tweets for in {}".format(tweetCount,
                                                     constituent_name,
                                                     language))

            if parameters["LOGGING"]:
                doc = [{
                    "date":
                    time.strftime('%Y-%m-%d %H:%M:%S',
                                  datetime.now().date().timetuple()),
                    "constituent_name":
                    constituent_name,
                    "constituent_id":
                    constituent_id,
                    "downloaded_tweets":
                    tweetCount,
                    "language":
                    language
                }]
                logging_utils.logging(doc, common_parameters["BQ_DATASET"],
                                      parameters["LOGGING_TABLE"], storage)

    return "Downloaded tweets"