Example #1
def recCursor(lastTimeStamp, counter=0):
    '''not an effective method to query across multiple collections'''
    for collName in collectionList:
        logger.debug('collName = %s & time = %s', collName, lastTimeStamp)
        docLists = mongoInt.retrieveCollection(
            collName, lastTimeStamp, globalS.dictDb['MONGODB_COUNT_LIMIT'])
        if globalS.dictDb['APP_DEBUG']:

            def insertQueryData(twit, *argv):
                logger.debug('Query details %s', collName)
                twit.update({'queryDetails123': collName})

            map(lambda twit: insertQueryData(twit, ID), docLists)
        if len(docLists):
            logger.info(
                'fetched %s docs from collection:%s appending to tweets',
                len(docLists), collName)
            # log the doc IDs so any duplicate feeds can be spotted
            map(lambda twit: logger.debug('doc ID is %s', twit['id']),
                docLists)
            tweets.extend(docLists)
    if len(tweets) < 10 and counter < 10000:
        # widen the time window and recurse; the counter caps the depth
        lastTimeStamp = int(lastTimeStamp) - globalS.dictDb['DELTA_FEEDS_TIME']
        logger.info('Docs are not available, so recursing with %s',
                    lastTimeStamp)
        return recCursor(lastTimeStamp, counter + 1)
    logger.info('collectively returned %s docs from collections %s',
                len(tweets), collectionList)
    return 1
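The docstring above flags this pattern as inefficient, and the recursion can run deep: under CPython the default recursion limit (1000) would trip long before the 10000 counter does. A minimal iterative sketch of the same widening-window scan, assuming the mongoInt.retrieveCollection and globalS.dictDb interfaces shown in these examples (scanCollections itself is a hypothetical helper, not part of the original code):

def scanCollections(collectionList, lastTimeStamp, wanted=10, maxSteps=10000):
    '''Hypothetical iterative variant: widen the time window step by step
    instead of recursing, so long scans cannot exhaust the stack.'''
    tweets = []
    for _ in range(maxSteps):
        for collName in collectionList:
            tweets.extend(mongoInt.retrieveCollection(
                collName, lastTimeStamp,
                globalS.dictDb['MONGODB_COUNT_LIMIT']))
        if len(tweets) >= wanted:
            break
        # nothing (or too little) found: look further back in time
        lastTimeStamp = int(lastTimeStamp) - globalS.dictDb['DELTA_FEEDS_TIME']
    return tweets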
Example #2
def fetchInterestFeeds(self, ID):
    '''Fetch all the neo4j interest nodes, returning name & city; using those
    tags, look for a mongoDB collection, and if none exists search Twitter &
    Instagram and store the output in mongoDB in a collection mapped to the
    interest node.'''
    recordList = neo4jInt.getInterestNode(graphDB, ID)
    geoDict = {}
    tweets = []
    # parse the recordList and frame the hashtags here
    for record in recordList:
        geoDict = {}  # reset the geo dictionary for each record
        if record[0]['lat'] is not None:
            geoDict.update({'lat': record[0]['lat']})
            geoDict.update({'lng': record[0]['lng']})
            geoDict.update({'distance': '.5'})  # default radius = 500m
        logger.info('recordList output of neo4j:%s', record[0]['name'])
        if record[0]['city'] is not None:
            Q = record[0]['name'] + ' ' + record[0]['city']
        else:
            Q = record[0]['name']
        ID = record[0]['id']
        logger.debug('fetchInterestFeeds ID:%s Q=%s geo coordinates =%s',
                     ID, Q, geoDict)
        if mongoInt.checkCollExists(ID) > 1:
            tweets.extend(mongoInt.retrieveCollection(ID))
        else:
            tweets.extend(self.retrieveTweets(ID, Q, geoDict))
            tweets.extend(self.retrieveMediaBasedTags(ID, Q, geoDict))
    #sparkInt.Parallelized(tweets)
    #feedJson=sparkInt.wowFieldTrueOrFalse(tweets)
    return tweets
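The docstring describes a cache-or-fetch flow: serve docs from the MongoDB collection mapped to the interest node when it already exists, otherwise query the social APIs. A stripped-down sketch of just that decision, assuming the mongoInt API used above (cachedFeeds and fetchFresh are hypothetical names):

def cachedFeeds(ID, fetchFresh):
    '''Hypothetical sketch: return cached docs when the collection for this
    interest node already holds data, otherwise fetch fresh ones.'''
    if mongoInt.checkCollExists(ID) > 1:
        return mongoInt.retrieveCollection(ID)
    return fetchFresh(ID)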
Example #3
def recCursor(lastTimeStamp):
    # ID, mongoInt and globalS come from the enclosing scope
    logger.debug('collName = %s & time = %s', ID, lastTimeStamp)
    docList = mongoInt.retrieveCollection(ID, lastTimeStamp,
                                          globalS.dictDb['MONGODB_COUNT_LIMIT'])
    if len(docList) < 2:
        # widen the time window and retry; note there is no depth cap, so
        # this recurses forever if the collection never yields two docs
        lastTimeStamp = int(lastTimeStamp) - globalS.dictDb['DELTA_FEEDS_TIME']
        logger.info('Docs are not available, so recursing with %s', lastTimeStamp)
        return recCursor(lastTimeStamp)
    return docList
Example #4
def recCursor(lastTimeStamp):
    # collectionList and tweets come from the enclosing scope
    for collName in collectionList:
        logger.debug('collName = %s & time = %s', collName, lastTimeStamp)
        tweets.extend(mongoInt.retrieveCollection(collName, lastTimeStamp,
                                                  globalS.dictDb['MONGODB_COUNT_LIMIT']))
    if len(tweets) < 2:
        lastTimeStamp = int(lastTimeStamp) - globalS.dictDb['DELTA_FEEDS_TIME']
        logger.info('Docs are not available, so recursing with %s', lastTimeStamp)
        return recCursor(lastTimeStamp)
    logger.info('collectively returned %s docs from multiple collections', len(tweets))
    return
Example #5
def retrieveCollection(self, ID, lastTimeStamp, count):
    '''for displayFeeds debugging'''
    tweets = []
    #docs = mongoInt.retrieveCollection(ID,lastTimeStamp,count)
    #tweets.extend(docs) if docs>0 else 0
    tweets.extend(mongoInt.retrieveCollection(ID, lastTimeStamp, count))
    if globalS.dictDb['APP_DEBUG']:
        logger.debug('APP_DEBUG is true so setting the queryDetails:ID field')

        def insertQueryData(twit, ID):
            twit.update({'queryDetails': ID})
            #return twit

        map(lambda twit: insertQueryData(twit, ID), tweets)
    return tweets
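These snippets call map() purely for its side effects (the insertQueryData pass above). That works under Python 2, where map is eager and returns a list, but under Python 3 map is lazy and the update never runs unless the iterator is consumed. A self-contained demonstration of the pitfall and the portable fix:

docs = [{'id': 1}, {'id': 2}]

def tag(doc):
    doc.update({'queryDetails': 'demo'})

map(tag, docs)    # Python 2: tags both docs; Python 3: builds a lazy iterator, tags nothing
for doc in docs:  # portable replacement: a plain loop always runs the side effect
    tag(doc)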
Example #6
def recCursor(lastTimeStamp):
    '''not an effective method to query across multiple collections'''
    for collName in collectionList:
        logger.debug('collName = %s & time = %s', collName, lastTimeStamp)
        docLists = mongoInt.retrieveCollection(collName, lastTimeStamp,
                                               globalS.dictDb['MONGODB_COUNT_LIMIT'])
        if globalS.dictDb['APP_DEBUG']:
            def insertQueryData(twit, *argv):
                twit.update({'queryDetails': argv})
            map(lambda twit: insertQueryData(twit, ID), docLists)
        if len(docLists):
            logger.info('fetched %s docs from collection:%s appending to tweets',
                        len(docLists), collName)
            # log the doc IDs so any duplicate feeds can be spotted
            map(lambda twit: logger.debug('doc ID is %s', twit['id']), docLists)
            tweets.extend(docLists)
    if len(tweets) < 10:
        # widen the time window and retry; unlike Example #1 there is no
        # counter capping the recursion depth
        lastTimeStamp = int(lastTimeStamp) - globalS.dictDb['DELTA_FEEDS_TIME']
        logger.info('Docs are not available, so recursing with %s', lastTimeStamp)
        return recCursor(lastTimeStamp)
    logger.info('collectively returned %s docs from collections %s',
                len(tweets), collectionList)
    return 1
Example #7
def fetchInterestFeeds(self, ID, lastTimeStamp):
    '''Fetch all the neo4j interest nodes, returning name & city; using those
    tags, look for a mongoDB collection, and if none exists search Twitter &
    Instagram and store the output in mongoDB in a collection mapped to the
    interest node.'''
    recordList = neo4jInt.getInterestNode(graphDB, ID)
    geoDict = {}
    tweets = []
    jobsArgs = []
    # parse the recordList and frame the hashtags here
    for record in recordList:
        geoDict = {}  # reset the geo dictionary for each record

        if record[0]['lat'] is not None:
            geoDict.update({'lat': record[0]['lat']})
            geoDict.update({'lng': record[0]['lng']})
            geoDict.update({'distance': '.5'})  # default radius = 500m
        logger.info('recordList output of neo4j:%s', record[0]['name'])

        if record[0]['city'] is not None:
            Q = record[0]['name'] + ' ' + record[0]['city']
        else:
            Q = record[0]['name']

        ID = record[0]['id']
        logger.debug('fetchInterestFeeds ID:%s Q=%s geo coordinates =%s',
                     ID, Q, geoDict)

        if mongoInt.checkCollExists(ID) > 1:
            #docs = mongoInt.retrieveCollection(ID,lastTimeStamp)
            #tweets.extend(docs) if len(docs) else 0
            tweets.extend(
                mongoInt.retrieveCollection(
                    ID, lastTimeStamp,
                    globalS.dictDb['MONGODB_COUNT_LIMIT']))
        else:
            #tweets.extend(self.retrieveTweets(ID,Q,geoDict))
            #tweets.extend(self.retrieveMediaBasedTags(ID,Q,geoDict))
            jobsArgs.append([ID, Q, geoDict])
            #with Pool(processes=4) as pool:
            #    pool.map()
            #jobs = []
            #job.append(Process(target=self.retrieveTweets, args=(ID,Q,geoDict)))
            #job.append(Process(target=self.retrieveMediaBasedTags, args=(ID,Q,geoDict)))
            #feeds = self.retrieveTweets(ID,Q,geoDict)
            #tweets.extend(feeds) if len(feeds) else 0
            #medias = self.retrieveMediaBasedTags(ID,Q,geoDict)
            #tweets.extend(medias) if len(medias) else 0
    ## auxiliary function to make it work

    if len(jobsArgs):
        logger.warning('Collection is empty, invoking fetch workers:%s',
                       jobsArgs)

        def retrieveMedias_helper(args):
            tweets.extend(self.retrieveMediaBasedTags(*args)[:20])

        def retrieveTweets_helper(args):
            tweets.extend(self.retrieveTweets(*args)[:20])

        #pool = Pool(2)
        #tweets.extend(pool.map(retrieveTweets_helper,jobsArgs))
        #tweets.extend(pool.map(retrieveMedias_helper,jobsArgs))
        map(retrieveTweets_helper, jobsArgs)
        map(retrieveMedias_helper, jobsArgs)
        #pool.close()
        #pool.join()
        logger.debug('fetch workers have returned %s feeds', len(tweets))
        #tweets = tweets[:20]
    if globalS.dictDb['APP_DEBUG']:

        def insertQueryData(twit, *argv):
            twit.update({'queryDetails': argv})
            #return twit

        # note: this tags every doc with the last record's ID/Q/geoDict
        map(lambda twit: insertQueryData(twit, ID, Q, geoDict), tweets)
    #sparkInt.Parallelized(tweets)
    #feedJson=sparkInt.wowFieldTrueOrFalse(tweets)
    return tweets
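The commented-out Pool code sketches the intended parallel fetch, but multiprocessing.Pool would not work with these helpers: each worker runs in a separate process, so tweets.extend there mutates a copy the parent never sees. A thread pool that returns results instead sidesteps that. A minimal sketch of what the two map calls inside fetchInterestFeeds could become, assuming retrieveTweets and retrieveMediaBasedTags return lists as above (fetchBoth is a hypothetical helper):

from multiprocessing.pool import ThreadPool

def fetchBoth(args):
    '''Hypothetical worker: fetch both sources for one (ID, Q, geoDict)
    job and return the combined list instead of mutating shared state.'''
    ID, Q, geoDict = args
    return (self.retrieveTweets(ID, Q, geoDict)[:20] +
            self.retrieveMediaBasedTags(ID, Q, geoDict)[:20])

pool = ThreadPool(2)
for feeds in pool.map(fetchBoth, jobsArgs):
    tweets.extend(feeds)
pool.close()
pool.join()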