Code Example #1
def formCSVFromGeoInMessages(outputDir, portInput):
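    # Build a CSV that, for every token in the TokenToCoordinates_Combined3Div
    # collection, counts how often it appeared with a negative (Americas),
    # zero (Africa/Europe) or positive (Asia/Australia) longitude label,
    # and assigns the token the region with the highest ratio.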
    from MongoDBInterface import getMongoClient
    client = getMongoClient(portInput)
    db_name = "Temp_Analysis"
    db = client[db_name]

    collectionName = "TokenToCoordinates_Combined3Div"
    collectionToRead = db[collectionName]
    query = {}
    tweetCursor = collectionToRead.find(query, no_cursor_timeout=True)

    rows = [[
        'id', "label", "Americas", "Africa_Europe", "Asia_Australia",
        "ratioAmericas", "ratioAfrica_Europe", "ratioAsia_Australia",
        "ratioMax", "total"
    ]]
    import numpy as np
    for userInfo in tweetCursor:
        values = userInfo['coordinates']
        l1 = 0
        l2 = 0
        l3 = 0
        for value in values:
            if value < 0:
                l1 += 1
            elif value == 0:
                l2 += 1
            else:
                l3 += 1

        Americas = l1
        Africa_Europe = l2
        Asia_Australia = l3
        ratioAmericas = float(Americas) / float(len(values))
        ratioAfrica_Europe = float(Africa_Europe) / float(len(values))
        ratioAsia_Australia = float(Asia_Australia) / float(len(values))
        ratioMax = np.max(
            [ratioAmericas, ratioAfrica_Europe, ratioAsia_Australia])
        label = None
        if ratioAmericas == ratioMax:
            label = "Americas"
        elif ratioAfrica_Europe == ratioMax:
            label = "Africa_Europe"
        else:
            label = "Asia_Australia"
        #token = userInfo['_id'].encode("utf-8")
        #if token.startswith('@') or token.startswith('#'):
        rows.append([
            userInfo['_id'], label, Americas, Africa_Europe,
            Asia_Australia, ratioAmericas, ratioAfrica_Europe,
            ratioAsia_Australia, ratioMax,
            len(values)
        ])

    fileToStoreTo = outputDir + "combineDBsCoordinateGroundTruthDiv3.csv"
    from Step2ProcessTableOfTweets import writeRowsToCSV
    writeRowsToCSV(rows, fileToStoreTo)
Code Example #2
def performRawCollection(db_name, portIn):
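    # Stream a live sample of public tweets, rotating the target collection once
    # a day. The stream listener (defined elsewhere in this module) presumably
    # reads the module-level globals db and collectionName, so reassigning
    # collectionName below redirects writes to a new "DayN" collection without
    # restarting the stream.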
    port = portIn
    global db
    global collectionName
    from tweepy import Stream
    from TwitterAPI import getAPI
    twitterAPI1 = getAPI()
    twitterStream = Stream(auth=twitterAPI1.auth, listener=listener())
    
    from MongoDBInterface import getMongoClient
    client = getMongoClient(port)
    db = client[db_name]
    collectionName = "Day1"
    
    print("Start day 1. " + "Writing to:" + db_name + "," + str(collectionName))

    twitterStream.sample(is_async=True)  # run sampling in a background thread (tweepy 3.x keyword; "async" is reserved in Python 3.7+)
    import time
    dayCount = 2
    while True:
        time.sleep(86400)
        print("Finished collection for day:" + str(dayCount-1))
        print("Starting day " + str(dayCount))
        collectionName = "Day"+str(dayCount)
        print("Start day " + str(dayCount) + ". Writing to:" + db_name + "," + str(collectionName))
        dayCount += 1
Code Example #3
def mainProcessFollowers(api,
                         db_name,
                         collectionName,
                         screenNames,
                         influencerScreenNamesLowerCaseToProper,
                         portInput,
                         maxFollowersToCollect=1000,
                         filePathToStoreToInput=None):
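    # Collect up to maxFollowersToCollect follower IDs for each influencer
    # screen name not yet present in the given collection, optionally also
    # persisting them to the pickle path in filePathToStoreToInput.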
    from datetime import datetime
    print("Starting Collecting Followers with Follower Sample size = " +
          str(maxFollowersToCollect) + " " + str(datetime.now()))

    from MongoDBInterface import getMongoClient
    client = getMongoClient(portInput)
    db = client[db_name]
    collectionToWrite = db[collectionName]

    from MongoDBInterface import getUsersUnwrittenToMongoDB
    usersUnwritten = getUsersUnwrittenToMongoDB(
        collectionToWrite, screenNames, "_id",
        influencerScreenNamesLowerCaseToProper)
    print("Users unwritten " + str(len(usersUnwritten)))

    for screenName in usersUnwritten:
        if filePathToStoreToInput is not None:
            getFollowerIDs(api, screenName, maxFollowersToCollect,
                           collectionToWrite, filePathToStoreToInput)
        else:
            getFollowerIDs(api, screenName, maxFollowersToCollect,
                           collectionToWrite)

    print("Finished Collecting Followers " + str(datetime.now()))
Code Example #4
def getHoursAccordingToFollowerOrder(db_name, outputDirFollower, portInput=None):
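    # Return the account-creation hours (and full creation dates) of an
    # influencer's followers, in the order the followers were collected: the
    # ordered follower IDs come from the pickle written during follower
    # collection, the creation timestamps from the followerInfo collection.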
    global port
    if portInput is not None:
        port = portInput
        
    from MongoDBInterface import getMongoClient
    client = getMongoClient(port)
    db = client[db_name]
    collectionToRead = db["followerInfo"]
    
    hoursInOrder = []
    datesInOrder = []
    dbnames = client.list_database_names()
    if db_name in dbnames:
        db = client[db_name]
        if "influencerOverWhichFollowerInfoCollected" in db.list_collection_names():
            userInfoPath = outputDirFollower+str(db_name.lower())+".pickle"
            import pickle
            with open(userInfoPath, "rb") as fp:
                followers = pickle.load(fp)

            idToDate = {}
            query = {}
            tweetCursor = collectionToRead.find(query, no_cursor_timeout=True)
            for userInfo in tweetCursor:
                idToDate[str(userInfo["id_str"])] = userInfo["created_at"]
        
            hoursInOrder = []
            datesInOrder = []
            for follower in followers:
                if follower in idToDate:
                    hoursInOrder.append(idToDate[follower].hour)
                    datesInOrder.append(idToDate[follower])
            
            print(str(len(hoursInOrder)) + " account creation times in order")
            
    if len(hoursInOrder) == 0:
        print("Error could not load any hours")
        import sys
        sys.exit()
    return hoursInOrder, datesInOrder
Code Example #5
def mainProcessScreenNames(api, db_name, collectionName, screenNames, portInput, writeDescription=False):
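    # Fetch profile information for every screen name not yet stored in the
    # given collection and write it to MongoDB.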
    from datetime import datetime
    print("Starting Collecting User Info " + str(datetime.now()) + " into collection " + collectionName)
     
    from MongoDBInterface import getMongoClient
    if portInput is not None:
        client = getMongoClient(portInput)
    else:
        client = getMongoClient()
    db = client[db_name]
    collectionToWrite = db[collectionName]

    from MongoDBInterface import getUsersUnwrittenToMongoDB
    usersUnwritten = list(getUsersUnwrittenToMongoDB(collectionToWrite, screenNames, "screenName", {}))
    print(str(len(usersUnwritten)) + " users unwritten for part 1")
    
    getFollowerInfoFromListOfScreenNames(api, usersUnwritten, collectionToWrite, writeDescription)

    print("Finished Collecting User Info " + str(datetime.now()) + " into collection:" + collectionName)
Code Example #6
def formCommunities(screenNames, outputDir, minFriend, maxFriend, maxFollower,
                    portInput):
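    # Build a "community" of ordinary users: load the pickled follower lists of
    # all influencers, keep the users who follow at least two of them, then keep
    # only those whose follower/friend counts fall inside the given thresholds.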
    screenNameToFollowers = {}
    for screenName in screenNames:
        userInfoPath = outputDir + str(screenName.lower()) + ".pickle"
        import pickle
        with open(userInfoPath, "rb") as fp:
            followers = pickle.load(fp)
            followersSTR = set([])
            for follower in followers:
                try:
                    followersSTR.add(str(follower))
                except:
                    print("Error loading follower")

            followers = followersSTR
            screenNameToFollowers[screenName] = followers

    #iterate over possible pairs
    followersOfInterest = set([])
    for i in range(0, len(screenNames), 1):
        for j in range(i + 1, len(screenNames), 1):
            mutualFollowers = screenNameToFollowers[
                screenNames[i]].intersection(
                    screenNameToFollowers[screenNames[j]])
            followersOfInterest = followersOfInterest.union(mutualFollowers)

    print(
        str(len(followersOfInterest)) +
        " followers of interest by iterating over every pair of influencers")
    from MongoDBInterface import getMongoClient
    client = getMongoClient(portInput)
    finalCommunity = set([])
    for db_name in screenNames:
        db = client[db_name]
        collection = db["followerInfo"]

        tweetCursor = collection.find({}, no_cursor_timeout=True)

        fieldsOfInterest = ['followers_count', 'friends_count']
        for userInfo in tweetCursor:
            if str(userInfo['id_str']) in followersOfInterest:
                if (userInfo['followers_count'] <= maxFollower
                        and userInfo['friends_count'] <= maxFriend
                        and userInfo['friends_count'] >= minFriend):
                    finalCommunity.add(str(userInfo['id_str']))

    print(
        str(len(finalCommunity)) +
        " final community after applying thresholds")

    return finalCommunity
Code Example #7
def mainProcessFriends(api, db_name, collectionName, ids, portInput):
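    # Collect friend (followee) IDs for every user ID not yet present in the
    # given collection.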
    from datetime import datetime
    print("Starting Collecting Friends")

    from MongoDBInterface import getMongoClient
    client = getMongoClient(portInput)
    db = client[db_name]
    collectionToWrite = db[collectionName]

    from MongoDBInterface import getUsersUnwrittenToMongoDB
    usersUnwritten = getUsersUnwrittenToMongoDB(collectionToWrite, ids, "_id",
                                                {})
    print("Users unwritten " + str(len(usersUnwritten)))

    for strID in usersUnwritten:
        getFriendsIDs(api, strID, collectionToWrite)

    print("Finished Collecting Friends " + str(datetime.now()))
Code Example #8
def writeTopNVectors(communityToCommunityFollows, dictionary, weightedVectors,
                     N, portInput, outputDir):
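    # For each community label, take the N friend IDs with the highest weights
    # in weightedVectors, look up their profiles in that community's friendInfo
    # collection, and write the result to "<label>TopNusingTFIDF.csv".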
    for label in weightedVectors:
        weightsTemp = weightedVectors[label]
        communityWeights = {}
        for indexToWeightTuple in weightsTemp:
            communityWeights[indexToWeightTuple[0]] = indexToWeightTuple[1]

        topNIndexes, topNToCount = getTopNFromDict(communityWeights, N)
        topNNames = []
        for index in topNIndexes:
            topNNames.append(dictionary[index])

        db_name = label
        collectionName = "friendInfo"
        print(label)
        print(topNNames)
        from MongoDBInterface import getMongoClient
        client = getMongoClient(portInput)
        db = client[db_name]
        collectionToRead = db[collectionName]

        tweetCursor = collectionToRead.find({}, no_cursor_timeout=True)

        fieldsOfInterest = [
            'screenName', 'followers_count', 'location', 'name', 'created_at',
            'description'
        ]
        nameToRow = {}
        for userInfo in tweetCursor:
            if str(userInfo["id_str"]) in topNNames:
                row = [
                    communityToCommunityFollows[label][str(userInfo["id_str"])]
                ]
                for field in fieldsOfInterest:
                    row.append(userInfo[field])
                nameToRow[str(userInfo["id_str"])] = row

        rows = [['FollowsByCommunity'] + fieldsOfInterest]
        for name in topNNames:
            rows.append(nameToRow[name])

        from Main import writeRowsToCSV
        writeRowsToCSV(rows, outputDir + label + "TopNusingTFIDF.csv")
Code Example #9
def generateFrequencyOfTokens(db_name, collectionName, outputDir, port):
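    # Count how often each token appears in the tweetOriginal field of the given
    # collection, caching the resulting dictionary as a pickle so repeated calls
    # reuse the cached counts.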
    from MongoDBInterface import getMongoClient
    client = getMongoClient(port)
    db = client[db_name]

    import os
    if not os.path.isdir(outputDir):
        os.mkdir(outputDir)

    fileToStoreTo = outputDir + "tokenToFrequency_" + str(db_name) + "_" + str(
        collectionName) + ".pickle"
    if not os.path.isfile(fileToStoreTo):
        collectionToRead = db[collectionName]
        query = {}
        fields = {'tweetOriginal': 1}
        tweetCursor = collectionToRead.find(query,
                                            fields,
                                            no_cursor_timeout=True)

        tokenToFrequency = {}
        for userInfo in tweetCursor:
            tokens = processString(userInfo["tweetOriginal"])
            for token in tokens:
                if not token in tokenToFrequency:
                    tokenToFrequency[token] = 0
                tokenToFrequency[token] += 1

        import pickle
        with open(fileToStoreTo, "wb") as fp:
            pickle.dump(tokenToFrequency, fp)
    else:
        import pickle
        with open(fileToStoreTo, "rb") as fp:
            tokenToFrequency = pickle.load(fp)

    print("loaded " + str(len(tokenToFrequency)) + " tokens.")

    return tokenToFrequency
Code Example #10
def getTopNCommunityFollowsInCSV(db_name, collectionName, portInput,
                                 usersCommunityFollows, N, outputDir):
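    # Write a CSV of the N accounts most frequently followed by the community,
    # joining the follow counts in usersCommunityFollows with profile fields
    # from the given collection.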
    from operator import itemgetter
    res = sorted(usersCommunityFollows.items(),
                 key=itemgetter(1),
                 reverse=True)[:N]
    topN = []
    for pair in res:
        topN.append(pair[0])
    print(topN)
    print(res)

    from MongoDBInterface import getMongoClient
    client = getMongoClient(portInput)
    db = client[db_name]
    collectionToRead = db[collectionName]

    tweetCursor = collectionToRead.find({}, no_cursor_timeout=True)

    fieldsOfInterest = [
        'screenName', 'followers_count', 'location', 'name', 'created_at',
        'description'
    ]
    nameToRow = {}
    topNSet = set(topN)
    for userInfo in tweetCursor:
        if str(userInfo["id_str"]) in topNSet:
            row = [usersCommunityFollows[str(userInfo["id_str"])]]
            for field in fieldsOfInterest:
                row.append(userInfo[field])
            nameToRow[str(userInfo["id_str"])] = row

    rows = [['FollowsByCommunity'] + fieldsOfInterest]
    for name in topN:
        rows.append(nameToRow[name])

    from Main import writeRowsToCSV
    writeRowsToCSV(rows,
                   outputDir + db_name + "TopNMostFrequentlyFollowed.csv")
Code Example #11
def mainProcessIDs(api, db_name, collectionName, ids, collectionNameWritesPerformed, portInput, writeDescription=False):
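    # Fetch profile information for a list of user IDs, skipping the prefix of
    # ids already written on a previous run, then record in
    # collectionNameWritesPerformed that this database has been processed.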
    from datetime import datetime
    print("Starting Collecting User Info " + str(datetime.now()))
                        
    from MongoDBInterface import getMongoClient
    client = getMongoClient(portInput)
    db = client[db_name]
    collectionToWrite = db[collectionName]

    tweetCursor = collectionToWrite.find({}, no_cursor_timeout=True)
    userAlreadyWritten = set([])
    for userInfo in tweetCursor:
        userAlreadyWritten.add(str(userInfo["id_str"]))
    if len(userAlreadyWritten) > 0:
        print(str(len(userAlreadyWritten)) + " userAlreadyWritten")
        count = 0
        for uid in ids[::-1]:  # walk backwards until a previously written id is found
            if uid in userAlreadyWritten:
                break
            count += 1
        # keep only the trailing unwritten ids; len(ids) - count is used because
        # ids[-count:] would return the whole list when count == 0
        ids = ids[len(ids) - count:]
        print(str(len(ids)) + " ids left to process")
    
    getFollowerInfoFromListOfIDs(api, ids, collectionToWrite, writeDescription)
    
    infoToWrite = []
    d = {}
    d["_id"] = db_name
    infoToWrite.append(d)

    try:
        db[collectionNameWritesPerformed].insert_many(infoToWrite, ordered=False)
    except:
        print("Error when doing bulk write")

    print("Finished Collecting User Info " + str(datetime.now()))
Code Example #12
def setupDBUsingSingleUser(twitterAPI1, screenName, maxFollowersToCollectInput, followersDir, portInput, reprocess=False, writeFollowerOnly=False):
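    # End-to-end setup for a single influencer: collect the influencer's own
    # profile (influencerInfo), sample up to maxFollowersToCollectInput of their
    # followers (influencerFollowerSample plus a pickle of follower IDs) and,
    # unless writeFollowerOnly is set, collect profile metadata for each sampled
    # follower (followerInfo). Skipped entirely if the marker collection
    # influencerOverWhichFollowerInfoCollected already exists.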
    outputDirFollower = 'CollectFollowers/'
    if followersDir != None:
        outputDirFollower = followersDir
    import os
    if not os.path.isdir(outputDirFollower):
        os.mkdir(outputDirFollower)
    
    from MongoDBInterface import getMongoClient
    client = getMongoClient(portInput)
    dbnames = client.list_database_names()
    collectionNeedBePerformed = True
    if screenName in dbnames:
        db = client[screenName]
        if "influencerOverWhichFollowerInfoCollected" in db.list_collection_names():
            collectionNeedBePerformed = False
            
    if collectionNeedBePerformed:
        print("working on " + screenName)
        db_name = screenName
        screenNamesToQuery = [screenName]
        collectionName1 = "influencerInfo"
        collectionName2 = "influencerFollowerSample"
        collectionName3 = "followerInfo"
        collectionName3b = "influencerOverWhichFollowerInfoCollected"
    
        '''step1: collect influencer info'''
        from CollectUserInfo import mainProcessScreenNames
        db = client[db_name]
        mainProcessScreenNames(twitterAPI1, db_name, collectionName1, screenNamesToQuery, portInput)
        
        '''step2: collect a sample of followers for each influencer'''
        db = client[db_name]
        collectionToRead = db[collectionName1]
    
        from MongoDBInterface import getUsersWrittenToMongoDB
        influencerScreenNames = list(getUsersWrittenToMongoDB(collectionToRead, "screenName"))
        if len(influencerScreenNames) > 0:
            '''the screennames from Twitter may have a bit different capitalization, use the format as displayed by Twitter'''
            '''vs. original screenNames used in query'''
            influencerScreenNamesLowerCaseToProper = {}
            for influencerScreenName in influencerScreenNames:
                influencerScreenNamesLowerCaseToProper[influencerScreenName.lower()] = influencerScreenName
            screenNamesToQueryProper = []
            for nameToQuery in screenNamesToQuery:
                if nameToQuery.lower() in influencerScreenNamesLowerCaseToProper:
                    screenNamesToQueryProper.append(influencerScreenNamesLowerCaseToProper[nameToQuery.lower()])
            print("working with ", len(screenNamesToQueryProper), " out of ", len(screenNamesToQuery), " original screenNames")
            
            from CollectFollowers import mainProcessFollowers
            if (not os.path.isfile(outputDirFollower+str(screenName.lower())+".pickle")) or reprocess:
                mainProcessFollowers(twitterAPI1, db_name, collectionName2, screenNamesToQueryProper, influencerScreenNamesLowerCaseToProper, portInput, maxFollowersToCollect=maxFollowersToCollectInput, filePathToStoreToInput=outputDirFollower+str(screenName.lower())+".pickle")
            
            if not writeFollowerOnly:
                '''step3: collect profile metadata on each follower'''
                from CollectUserInfo import mainProcessIDs
                userInfoPath = outputDirFollower+str(screenName.lower())+".pickle"
                import pickle
                with open(userInfoPath, "rb") as fp:
                    followers = pickle.load(fp)
                    followersSTR = []
                    for follower in followers:
                        try:
                            followersSTR.append(str(follower))
                        except:
                            print("Error loading follower")

                    followers = followersSTR
                    print(str(len(followers)) + " followers loaded")
                    mainProcessIDs(twitterAPI1, db_name, collectionName3, followers, collectionName3b, portInput)    
Code Example #13
def processTimeDistributions(minFreq,
                             port,
                             tokenToFrequencyGlobal,
                             outputDir,
                             atUser=False):
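    # For every token above the frequency threshold, normalise its 24-hour
    # posting-time distribution and derive summary features (hours of least and
    # most activity, standard deviation, and a parabola-fit sleep-time estimate),
    # writing the rows to the TokenTimeFeaturesProcessed collection.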
    tokenOfInterest = set([])
    for token in tokenToFrequencyGlobal:
        if tokenToFrequencyGlobal[token] >= minFreq:
            tokenOfInterest.add(token)

    db_name = "Temp_Analysis"
    collectionName = "TimeDist_Combined"
    if atUser:
        collectionName = "TimeDist_Combined_AtUser"

    from MongoDBInterface import getMongoClient
    client = getMongoClient(port)
    db = client[db_name]
    collectionToRead = db[collectionName]
    query = {}
    tweetCursor = collectionToRead.find(query, no_cursor_timeout=True)

    collectionNameToWriteTo = "TokenTimeFeaturesProcessed"
    if atUser:
        collectionNameToWriteTo = "TokenTimeFeaturesProcessedUser"

    db = client["Temp_Analysis"]
    collectionName = collectionNameToWriteTo
    db[collectionName].drop()

    timeOfDay = [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
        '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23'
    ]
    header = timeOfDay + [
        'indexOfMin', "indexOfMax", "stdOfTimeDist", 'rSquare', "power",
        "predUTC", 'totalRecords', 'id'
    ]
    import numpy as np
    infoToWrite = []
    for userInfo in tweetCursor:
        timeDist = userInfo

        idToken = timeDist.pop('_id')
        if idToken in tokenOfInterest:
            if np.min(list(timeDist.values())) > 0:
                values = []
                total = np.sum(list(timeDist.values()))
                for t in timeOfDay:
                    if t in timeDist:
                        values.append(float(timeDist[t]) / float(total))
                    else:
                        values.append(0)

                stdOfTimeDist = np.std(values)

                resultsTemp = processTimeDist(values, 5, 33)

                indexOfMax = values.index(np.max(values))
                indexOfMin = values.index(np.min(values))

                PSTModified = None
                rSquare = None
                power = None
                if resultsTemp['twoPointTestPass']:
                    if resultsTemp['parabola']:
                        value = resultsTemp['predictedSleepTime']
                        if value >= 14:
                            value = value - 24

                        value = -value + 4
                        PSTModified = value
                        rSquare = resultsTemp['rSquare']
                        power = resultsTemp["power"]
                row = values + [
                    indexOfMin, indexOfMax, stdOfTimeDist, rSquare, power,
                    PSTModified, total, idToken
                ]
                d = dict(zip(header, row))
                d["_id"] = idToken
                infoToWrite.append(d)

                if len(infoToWrite) > 1000:
                    try:
                        db[collectionName].insert_many(infoToWrite,
                                                       ordered=False)
                        infoToWrite = []
                        print("Performed write")
                    except:
                        print("Error when doing bulk write")

    # flush any rows left over after the cursor is exhausted
    if len(infoToWrite) > 0:
        try:
            db[collectionName].insert_many(infoToWrite, ordered=False)
            infoToWrite = []
            print("Performed write")
        except:
            print("Error when doing bulk write")
Code Example #14
def setupTempTableWithTimeDistributionforEachToken(db_nameAndCollection,
                                                   minFreq,
                                                   port,
                                                   tokenToFrequencyGlobal,
                                                   atUser=False):
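    # Build, for every token above the frequency threshold, a 24-bin histogram of
    # the hours at which tweets containing it were posted, and bulk-write the
    # histograms to the TimeDist_Combined collection (one document per token).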
    print("creating time distribution for each token")

    field = 'tHour'
    if atUser:
        field = 'tHour2'
    collectionNameToWriteTo = "TimeDist_Combined"
    if atUser:
        collectionNameToWriteTo = "TimeDist_Combined_AtUser"

    tokenOfInterest = set([])
    for token in tokenToFrequencyGlobal:
        if tokenToFrequencyGlobal[token] >= minFreq:
            tokenOfInterest.add(token)
    print(str(len(tokenOfInterest)) + " tokenOfInterest")

    tokenToTimeDist = {}
    blankTimeDist = {}
    for hour in range(0, 24, 1):
        blankTimeDist[str(hour)] = 0
    import copy
    usersProcessed = set([])
    for pair in db_nameAndCollection:
        db_name = pair[0]
        collectionName = pair[1]

        from MongoDBInterface import getMongoClient
        client = getMongoClient(port)
        db = client[db_name]

        collectionToRead = db[collectionName]
        query = {}
        fields = {'tweetOriginal': 1, field: 1}
        if atUser:
            fields = {'tweetOriginal': 1, field: 1, 'username': 1}
        tweetCursor = collectionToRead.find(query,
                                            fields,
                                            no_cursor_timeout=True)

        if atUser:
            for userInfo in tweetCursor:
                if not userInfo["username"] in usersProcessed:
                    usersProcessed.add(userInfo["username"])
                    tokens = processString(userInfo["tweetOriginal"])
                    for token in tokens:
                        if token in tokenOfInterest:
                            if not token in tokenToTimeDist:
                                tokenToTimeDist[token] = copy.copy(
                                    blankTimeDist)
                            tokenToTimeDist[token][str(userInfo[field])] += 1
        else:
            for userInfo in tweetCursor:
                tokens = processString(userInfo["tweetOriginal"])
                for token in tokens:
                    if token in tokenOfInterest:
                        if not token in tokenToTimeDist:
                            tokenToTimeDist[token] = copy.copy(blankTimeDist)
                        tokenToTimeDist[token][str(userInfo[field])] += 1

    db = client["Temp_Analysis"]
    collectionName = collectionNameToWriteTo
    db[collectionName].drop()

    infoToWrite = []
    for token in tokenToTimeDist:
        d = tokenToTimeDist[token]
        d["_id"] = token
        infoToWrite.append(d)

        if len(infoToWrite) > 1000:
            try:
                db[collectionName].insert_many(infoToWrite, ordered=False)
                infoToWrite = []
                print("Performed write")
            except:
                print("Error when doing bulk write")

    if len(infoToWrite) > 0:
        try:
            db[collectionName].insert_many(infoToWrite, ordered=False)
            infoToWrite = []
            print("Performed write")
        except:
            print("Error when doing bulk write")

    print("finished creating time distribution for each token")
Code Example #15
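    # Fragment of the driver script: make sure the output directories exist, get
    # a Twitter API handle, and optionally (step0) seed candidate influencers via
    # Google search queries for several cities.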
    import os
    if not os.path.isdir(outputDir):
        os.mkdir(outputDir)
    followersDir = "collectFollowers/"
    if not os.path.isdir(followersDir):
        os.mkdir(followersDir)

    from TwitterAPI import getAPI
    twitterAPI1 = getAPI()
    port = 27020
    step0 = False
    if step0:
        print("applying google search")
        db_name = "TempInfluencersFromGoogleSearch"
        from MongoDBInterface import getMongoClient
        client = getMongoClient(port)
        from CollectUserInfo import mainProcessScreenNames
        db = client[db_name]

        queries1 = [
            "Minsk Belarus Twitter", "Moscow Russia Twitter",
            "Moskva Russia Twitter"
        ]
        queries2 = ["Buffalo NY Twitter", "Syracuse NY Twitter"]
        queries = queries1 + queries2
        import time
        for query in queries:
            potentialInfluencerToWebHit = googleSearch(query)
            print(potentialInfluencerToWebHit)

            collectionName = query.replace(" ", "")
Code Example #16
def tokenToCoordinates3Div(db_nameAndCollection, portInput):
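    # Assign each geotagged tweet to one of three longitude bands (label -1 for
    # longitude <= -25, 0 for <= 65, 1 otherwise; roughly Americas,
    # Africa/Europe, Asia/Australia), collect the labels per token, and
    # bulk-write them to Temp_Analysis.TokenToCoordinates_Combined3Div.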
    from Step2ProcessTableOfTweets import processString
    from MongoDBInterface import getMongoClient

    errorCounts = 0
    tokenToCoordinate = {}
    for pair in db_nameAndCollection:
        db_name = pair[0]
        collectionName = pair[1]

        client = getMongoClient(portInput)
        db = client[db_name]

        collectionToRead = db[collectionName]
        query = {}
        fields = {'tweetOriginal': 1, 'coordinatesPoint': 1, 'place': 1}
        tweetCursor = collectionToRead.find(query,
                                            fields,
                                            no_cursor_timeout=True)

        for userInfo in tweetCursor:
            label = None
            if 'coordinatesPoint' in userInfo:
                long = userInfo['coordinatesPoint']['coordinates'][0]
                if long <= -25:
                    label = -1
                elif long <= 65:
                    label = 0
                else:
                    label = 1
            elif 'place' in userInfo:
                longs = []
                for coordinatePair in userInfo['place']['bounding_box'][
                        'coordinates'][0]:
                    longs.append(coordinatePair[0])

                l1 = 0
                l2 = 0
                l3 = 0
                for long in longs:
                    if long <= -25:
                        l1 += 1
                    elif long <= 65:
                        l2 += 1
                    else:
                        l3 += 1
                if l1 == 4:
                    label = -1
                elif l2 == 4:
                    label = 0
                elif l3 == 4:
                    label = 1
                else:
                    print(str(errorCounts) + " " + str(userInfo['place']))
                    errorCounts += 1

            if label is not None:
                tokens = processString(userInfo["tweetOriginal"])
                for token in tokens:
                    if not token in tokenToCoordinate:
                        tokenToCoordinate[token] = []
                    tokenToCoordinate[token].append(label)

    db_name = "Temp_Analysis"
    db = client[db_name]
    collectionName = "TokenToCoordinates_Combined3Div"
    db[collectionName].drop()

    infoToWrite = []
    for token in tokenToCoordinate:
        d = {}
        d["_id"] = token
        d["coordinates"] = tokenToCoordinate[token]
        infoToWrite.append(d)

        if len(infoToWrite) > 1000:
            try:
                db[collectionName].insert_many(infoToWrite, ordered=False)
                infoToWrite = []
                print("Performed write")
            except:
                print("Error when doing bulk write")

    if len(infoToWrite) > 0:
        try:
            db[collectionName].insert_many(infoToWrite, ordered=False)
            infoToWrite = []
            print("Performed write")
        except:
            print("Error when doing bulk write")