Ejemplo n.º 1
0
def ingestMongoEvents():
    """Ingest the events stored in the configured MongoDB database into MySQL.

    The MongoDB address and database name are read from
    ``APP_CONFIG['MongoDB-Event-Database']`` and a MySQL connection is
    obtained from ``databaseConnection()``. Every collection whose name
    matches a known event source has each of its documents passed to that
    source's ingest function together with the MySQL cursor and connection.
    Collections with any other name are ignored.

    Ingest is best-effort: an exception raised while ingesting is reported on
    stdout and the function returns normally. The MySQL cursor, the MySQL
    connection and the MongoDB client are closed before returning, whether
    or not the ingest succeeded.
    """
    # Retrieve MongoDB info from config file
    mongoSettings = APP_CONFIG['MongoDB-Event-Database']
    mongoConnection = mongoSettings['address']
    databaseName = mongoSettings['name']

    # Retrieve the MySQL database connection and cursor
    connection = databaseConnection()
    cursor = connection.cursor()

    # Connect to MongoDB and reference the event database
    myclient = pymongo.MongoClient(mongoConnection)
    database = myclient[databaseName]

    try:
        # Collection name -> ingest function for that event source.
        # Built inside the try block so that a missing ingest module is
        # reported as a failed ingest, like any other error.
        ingestHandlers = {
            "Cambia-Lens": cambiaLens.cambiaLensIngest,
            "Crossref": crossref.crossrefIngest,
            "Datacite": datacite.dataciteIngest,
            "F1000": f1000.F1000Ingest,
            "Hypothesis": hypothesis.hypothesisIngest,
            "Newsfeed": newsfeed.newsfeedIngest,
            "Reddit": reddit.redditIngest,
            "Reddit-Links": redditLinks.redditLinksIngest,
            "StackExchange": stackExchange.stackExchangeIngest,
            "Twitter": twitter.twitterIngest,
            "Web": web.webIngest,
            "Wikipedia": wikipedia.wikipediaIngest,
            "WordPressDotCom": wordpress.wordpressIngest,
        }

        for coll in database.list_collection_names():
            ingest = ingestHandlers.get(coll)
            if ingest is None:
                # Not an event-source collection we know how to ingest
                continue

            print(coll + " Ingest!")

            # For all events in the collection, iterate through them and ingest them
            for uniqueEvent in database[coll].find({}):
                ingest(uniqueEvent, cursor, connection)

    except Exception as error:
        # Exception (not a bare except) so Ctrl-C and SystemExit still propagate
        print("Ingest failed!")
        print("The error was " + str(error))

    finally:
        # Release the database resources on success as well as on failure
        cursor.close()
        connection.close()
        myclient.close()
Ejemplo n.º 2
0
def main():
    """Re-ingest the events stored in ``paperbuzzeventdata.event_data_json``.

    Looks up the highest ``id`` in the table, then fetches the rows one by
    one from id 1 up to and including that id. Each row's ``json`` column is
    decoded into a dictionary and, depending on its ``source_id`` value,
    handed to the matching ingest function together with the cursor and
    connection. Ids with no row, rows with an empty ``json`` column and
    events whose ``source_id`` is not recognised are skipped.

    If an ingest function raises, the error is logged and the process exits
    with status 1. The cursor and connection are closed in every case. On
    success the elapsed time since the module-level ``start_time`` is
    printed.
    """
    connection = mysql.connector.connect(user=str(mysql_username), password=str(
        mysql_password), host='127.0.0.1', database='crossrefeventdatamain')

    # Returns dictionary values instead of tuples
    cursor = connection.cursor(dictionary=True)

    try:
        # source_id value -> (label printed for progress, ingest function)
        ingestHandlers = {
            "cambia-lens": ('CambiaLens', cambiaLens.cambiaLensIngest),
            "crossref": ('Crossref', crossref.crossrefIngest),
            "datacite": ('Datacite', datacite.dataciteIngest),
            "f1000": ('F1000', f1000.F1000Ingest),
            "hypothesis": ('Hypothesis', hypothesis.hypothesisIngest),
            "newsfeed": ('Newsfeed', newsfeed.newsfeedIngest),
            "reddit": ('Reddit', reddit.redditIngest),
            "reddit-links": ('RedditLinks', redditLinks.redditLinksIngest),
            "stackexchange": ('Stackexchange', stackExchange.stackExchangeIngest),
            "twitter": ('Twitter', twitter.twitterIngest),
            "web": ('Web', web.webIngest),
            "wikipedia": ('Wikipedia', wikipedia.wikipediaIngest),
            "wordpressdotcom": ('Wordpress', wordpress.wordpressIngest),
        }

        # Query that grabs ID of the last row (highest ID) of the table
        SQLMaxFindAI = "Select MAX(id) as id FROM paperbuzzeventdata.event_data_json;"
        cursor.execute(SQLMaxFindAI)
        maxAIResults = cursor.fetchone()
        # MAX(id) is NULL for an empty table; treat that as "nothing to do"
        maxAI = (maxAIResults["id"] if maxAIResults else None) or 0

        # Loop through the whole table, including the row with the highest id
        for AI in range(1, maxAI + 1):
            cursor.execute(
                "select json from paperbuzzeventdata.event_data_json where id = %s",
                (AI,))
            eventRow = cursor.fetchone()
            if eventRow is None:
                # Ids are not necessarily contiguous; skip the gap
                continue

            rawEvent = eventRow.get("json")
            if rawEvent is None:
                continue
            # The column may be returned as bytes; decode before parsing
            if isinstance(rawEvent, (bytes, bytearray)):
                rawEvent = rawEvent.decode('utf-8')
            eventDict = json.loads(rawEvent)

            # The source_id field identifies which online platform the event comes from
            sourceId = eventDict.get("source_id")
            handler = ingestHandlers.get(sourceId) if isinstance(sourceId, str) else None
            if handler is None:
                continue

            label, ingest = handler
            try:
                print(label)
                ingest(eventDict, cursor, connection)
            except Exception as e:
                logging.error(
                    "Failed Ingest. Failed on event_data_json row id %s", AI)
                logging.error("The error was %s", e)
                # Non-zero status so callers can tell the run failed
                sys.exit(1)
    finally:
        cursor.close()
        connection.close()

    print("--- %s seconds ---" % (time.time() - start_time))
Ejemplo n.º 3
0
def main():
    """Ingest every event found in the JSON files under ``dataDirectory``.

    Walks ``dataDirectory`` recursively, visiting directories and files in
    sorted order so repeated runs process files in the same sequence. Each
    file is parsed as JSON and the list under ``message`` -> ``events`` is
    read. Every event is handed, together with the cursor and connection, to
    the ingest function matching its ``source_id``; events with an
    unrecognised or missing ``source_id`` are skipped. A file without a
    ``message``/``events`` entry is logged as a warning and skipped.

    If an ingest function raises, the file name and error are logged and the
    process exits with status 1. The cursor and connection are closed in
    every case. On success the elapsed time since the module-level
    ``start_time`` is printed.
    """
    connection = mysql.connector.connect(user=str(mysql_username),
                                         password=str(mysql_password),
                                         host='127.0.0.1',
                                         database='crossrefeventdatamain')
    cursor = connection.cursor()

    try:
        # source_id value -> (label printed for progress, ingest function)
        ingestHandlers = {
            "cambia-lens": ('cambia', cambiaLens.cambiaLensIngest),
            "crossref": ('crossref', crossref.crossrefIngest),
            "datacite": ('datacite', datacite.dataciteIngest),
            "f1000": ('F1000', f1000.F1000Ingest),
            "hypothesis": ('hypothesis', hypothesis.hypothesisIngest),
            "newsfeed": ('newsfeed', newsfeed.newsfeedIngest),
            "reddit": ('reddit', reddit.redditIngest),
            "reddit-links": ('redditlinks', redditLinks.redditLinksIngest),
            "stackexchange": ('stackexchange', stackExchange.stackExchangeIngest),
            "twitter": ('twitter', twitter.twitterIngest),
            "web": ('web', web.webIngest),
            "wikipedia": ('wikipedia', wikipedia.wikipediaIngest),
            "wordpressdotcom": ('wordpress.com', wordpress.wordpressIngest),
        }

        files = []
        for (path, dirnames, filenames) in os.walk(dataDirectory):
            # Sorting dirnames in place fixes the order os.walk descends in,
            # so the overall file order is deterministic across runs.
            dirnames.sort()
            files.extend(os.path.join(path, name) for name in sorted(filenames))

        # For each file in the directory, load its contents and extract the events list
        for filePath in files:
            with open(filePath, encoding="utf-8") as json_file:
                data = json.load(json_file)  # Dict

            events = (data.get("message") or {}).get("events")  # LIST of dicts
            if events is None:
                logging.warning("No events found in file %s", filePath)
                continue

            for uniqueEvent in events:  # each uniqueEvent is a dict
                # The source_id is a distinct field that represents each online platform
                sourceId = uniqueEvent.get("source_id")
                handler = ingestHandlers.get(sourceId) if isinstance(sourceId, str) else None
                if handler is None:
                    continue

                label, ingest = handler
                try:
                    print(label)
                    ingest(uniqueEvent, cursor, connection)
                except Exception as e:
                    # filePath is the file to restart from when rerunning
                    logging.error("Failed Ingest. Failed on file %s", filePath)
                    logging.error("The error was %s", e)
                    # Non-zero status so callers can tell the run failed
                    sys.exit(1)
    finally:
        cursor.close()
        connection.close()

    print("--- %s seconds ---" % (time.time() - start_time))