Esempio n. 1
0
    def retrieveTweets(self, ID, Q, geoCode):
        '''Fetch new tweets for a query, merge them with the stored unique
        tweets, run similarity detection and classify the result.

        Steps, in order:
          1. read the last ``since_id`` recorded for ``ID`` and ask
             ``twitterInt`` for tweets matching ``Q``/``geoCode`` after it;
          2. record how many tweets came back via
             ``mongoInt.collectionFeedFrequency``;
          3. replace every retweet by the original tweet it wraps and add a
             numeric ``created_time`` (epoch seconds, UTC) to each tweet;
          4. prepend the tweets already stored as unique for ``ID``, run
             ``self.topicModelLSI`` on the combined list and, unless it
             returns ``0``, hand the result to ``self.updateRatio``.

        :param ID: identifier passed to the ``mongoInt`` helpers, to
            ``self.updateRatio`` and to ``self.runClassifier``.
        :param Q: query forwarded to Twitter and to the topic model.
        :param geoCode: geo filter forwarded to ``twitterInt.retrieveTweets``.
        :return: ``[]`` when no tweets were fetched, otherwise whatever
            ``self.runClassifier(ID)`` returns.
        '''
        since_id = mongoInt.retrieveSinceID(ID)
        logger.debug('retrieve tweets')
        logger.debug(since_id)
        twits = twitterInt.retrieveTweets(Q, geoCode, since_id)

        mongoInt.collectionFeedFrequency(len(twits), ID)

        def removeRetweets(tweet):
            # A retweet carries the original tweet under 'retweeted_status';
            # keep only the original and flag it.
            if 'retweeted_status' in tweet:
                tweet = tweet['retweeted_status']
                tweet['alreadyRetweeted'] = True
                logger.debug('retweet replaced by original tweet: %s',
                             tweet['text'])
            return tweet

        # Explicit list/loop instead of map(): map() used for side effects is
        # lazy on Python 3 and would silently do nothing there.
        twits = [removeRetweets(tweet) for tweet in twits]
        for tweet in twits:
            # 'created_at' is parsed with a literal +0000 offset, so timegm()
            # (which interprets the struct_time as UTC) is the right inverse.
            tweet['created_time'] = timegm(
                time.strptime(tweet['created_at'],
                              "%a %b %d %H:%M:%S +0000 %Y"))

        logger.info('tweets fetched for %s are %s', ID, len(twits))
        if not twits:
            return []

        uniqueTweetsFromDB = mongoInt.retrieveParentIdTrue(ID)
        # These are routine counts, not failures, so they are logged at
        # debug level rather than error level.
        logger.debug('existing uniqueTweetsFromDB :%s',
                     len(uniqueTweetsFromDB))
        logger.debug('total uniqe feeds %s', uniqueTweetsFromDB)
        uniqueTweetsFromDB.extend(twits)
        logger.debug('total combined tweets :%s', len(uniqueTweetsFromDB))

        similarTweet = self.topicModelLSI(uniqueTweetsFromDB, Q)
        # topicModelLSI signals "nothing to update" with 0.
        if similarTweet != 0:
            self.updateRatio(ID, similarTweet, uniqueTweetsFromDB, Q)
        return self.runClassifier(ID)
Esempio n. 2
0
    def retrieveTweets(self,ID,Q,geoCode):
        '''Fetch new tweets for a query, merge them with the stored unique
        tweets and run similarity detection on the combined list.

        Steps, in order:
          1. read the last ``since_id`` recorded for ``ID`` and ask
             ``twitterInt`` for tweets matching ``Q``/``geoCode`` after it;
          2. record how many tweets came back via
             ``mongoInt.collectionFeedFrequency``;
          3. replace every retweet by the original tweet it wraps and add a
             numeric ``created_time`` (epoch seconds, UTC) to each tweet;
          4. prepend the tweets already stored as unique for ``ID``, run
             ``self.topicModelLSI`` on the combined list and, unless it
             returns ``0``, hand the result to ``self.updateRatio``.

        :param ID: identifier passed to the ``mongoInt`` helpers and to
            ``self.updateRatio``.
        :param Q: query forwarded to Twitter and to the topic model.
        :param geoCode: geo filter forwarded to ``twitterInt.retrieveTweets``.
        :return: the size of the combined (stored + new) tweet list, or
            ``None`` when no tweets were fetched.
        '''
        since_id = mongoInt.retrieveSinceID(ID)
        logger.debug('retrieve tweets')
        logger.debug(since_id)
        twits = twitterInt.retrieveTweets(Q,geoCode, since_id)

        mongoInt.collectionFeedFrequency(len(twits), ID)

        def removeRetweets(tweet):
            # A retweet carries the original tweet under 'retweeted_status';
            # keep only the original and flag it.
            if 'retweeted_status' in tweet:
                tweet = tweet['retweeted_status']
                tweet['alreadyRetweeted'] = True
                logger.debug('retweet replaced by original tweet: %s',
                             tweet['text'])
            return tweet

        # Explicit list/loop instead of map(): map() used for side effects is
        # lazy on Python 3 and would silently do nothing there.
        twits = [removeRetweets(tweet) for tweet in twits]
        for tweet in twits:
            # 'created_at' is parsed with a literal +0000 offset, so timegm()
            # (which interprets the struct_time as UTC) is the right inverse.
            tweet['created_time'] = timegm(
                time.strptime(tweet['created_at'],
                              "%a %b %d %H:%M:%S +0000 %Y"))

        logger.info('tweets fetched are chellaaa %s',len(twits))
        if not twits:
            # Unchanged contract: the empty case yields None, not 0.
            return None

        uniqueTweetsFromDB = mongoInt.retrieveParentIdTrue(ID)
        logger.debug('existing uniqueTweetsFromDB :%s',len(uniqueTweetsFromDB))
        uniqueTweetsFromDB.extend(twits)
        logger.debug('total combined tweets :%s',len(uniqueTweetsFromDB))

        similarTweet = self.topicModelLSI(uniqueTweetsFromDB, Q)
        # topicModelLSI signals "nothing to update" with 0.
        if similarTweet != 0:
            self.updateRatio(ID,similarTweet,uniqueTweetsFromDB, Q)
        return len(uniqueTweetsFromDB)
Esempio n. 3
0
    def similarTopicRemoval(self,collName,similarTweet,twits, Q):
        ''' Tag freshly fetched tweets as unique or similar and store them.

        ``similarTweet`` is read as three parallel sequences:
        ``similarTweet[0]`` child ids, ``similarTweet[1]`` ratios and
        ``similarTweet[2]`` parent ids.

        First pass over ``similarTweet``:
          * child id == parent id  -> the tweet is its own parent; it is
            marked ``parentId = 1`` and collected as unique;
          * otherwise, ratio <= 0.999999 -> the tweet is marked with
            ``parentId = <parent id>`` and ``ratio`` and collected as similar.

        The unique tweets are then merged with the unique tweets already
        stored for ``collName`` (one tweet per id, stored copy first), the
        topic model is run again on that merged list, and the tweets from
        ``twits`` that are still their own parent are inserted together with
        the similar ones via ``self.insertFeedData``.

        Tweet ids are compared by value. The previous implementation used
        ``is``, which only matched when both sides happened to be the very
        same object.

        :param collName: collection name passed to the ``mongoInt`` lookup
            and to ``self.insertFeedData``.
        :param similarTweet: result of the topic model for ``twits``
            (see layout above).
        :param twits: list of tweet dicts; each must have an ``'id'`` key.
            The dicts are updated in place.
        :param Q: query forwarded to ``self.topicModelLSI``.
        :return: number of tweets from ``twits`` classified as unique in the
            first pass.
        '''
        logger.debug('entering')

        # Index the new tweets by id so that each row is a dictionary lookup
        # instead of a scan over all tweets. When the same id occurs more
        # than once, the first tweet wins and the others are treated as
        # duplicates.
        twitsById = {}
        for twit in twits:
            twitsById.setdefault(twit['id'], twit)

        uniqueTweetsFromTwitter = []
        similarTweetsFromTwitter = []
        uniqueIds = set()
        similarIds = set()
        for childID,parentID,ratio in zip(similarTweet[0],similarTweet[2],similarTweet[1]):
            if childID == parentID:
                twit = twitsById.get(parentID)
                if twit is not None:
                    twit.update({'parentId' : 1})
                    # Collect each tweet once even if several rows name it.
                    if parentID not in uniqueIds:
                        uniqueIds.add(parentID)
                        uniqueTweetsFromTwitter.append(twit)
            elif ratio <= 0.999999:
                twit = twitsById.get(childID)
                if twit is not None:
                    twit.update({'parentId' : parentID, 'ratio' : float(ratio)})
                    if childID not in similarIds:
                        similarIds.add(childID)
                        similarTweetsFromTwitter.append(twit)

        uniqueTweetsFromDB = mongoInt.retrieveParentIdTrue(collName)
        logger.debug('existing uniqueTweetsFromDB :%s',len(uniqueTweetsFromDB))
        uniqueTweetsFromDB.extend(uniqueTweetsFromTwitter)

        # Drop tweets that are both stored and fetched again: keep the first
        # occurrence of every id (stored tweets come first in the list).
        allUniqueTweetsIDList = [ele['id'] for ele in uniqueTweetsFromDB]
        logger.debug('Duplication allUniqueTweetsIDList contains len : %s value :%s',len(allUniqueTweetsIDList),allUniqueTweetsIDList)
        seenIds = set()
        allUniqueTweets = []
        for ele in uniqueTweetsFromDB:
            if ele['id'] not in seenIds:
                seenIds.add(ele['id'])
                allUniqueTweets.append(ele)
        logger.debug('unique allUniqueTweetsIDList contains len : %s value :%s',len(seenIds), seenIds)
        logger.debug('allUniqueTweets lenght is %s',len(allUniqueTweets))

        # NOTE(review): retrieveTweets treats a 0 return from topicModelLSI
        # as "no result"; that case is not handled here and would raise on
        # the subscripting below - confirm whether it can occur.
        similarTweet = self.topicModelLSI(allUniqueTweets, Q)

        # Second pass: only tweets that remain their own parent after the
        # comparison against the stored tweets are kept as unique.
        uniqueTweetsFromTwitter_1 = []
        confirmedIds = set()
        for childID,parentID,_ratio in zip(similarTweet[0],similarTweet[2],similarTweet[1]):
            if childID != parentID:
                continue
            twit = twitsById.get(parentID)
            if twit is None:
                continue
            twit.update({'parentId' : 1})
            if parentID not in confirmedIds:
                confirmedIds.add(parentID)
                uniqueTweetsFromTwitter_1.append(twit)

        for ele in uniqueTweetsFromTwitter_1:
            logger.debug('current tweets : text: %s, ID: %s',ele['text'],ele['id'])

        logger.info('Before revomal %s twits after similar Topic removal: %s',len(twits),len(uniqueTweetsFromTwitter_1))
        uniqueTweetsFromTwitter_1.extend(similarTweetsFromTwitter)
        self.insertFeedData(collName,uniqueTweetsFromTwitter_1)
        # NOTE(review): the count returned is from the first pass, while the
        # tweets inserted come from the second pass - confirm this is intended.
        return len(uniqueTweetsFromTwitter) #simple returning the count of unique tweets
Esempio n. 4
0
    def similarTopicRemoval(self,collName,similarTweet,twits, Q):
        ''' Tag freshly fetched tweets as unique or similar and store them.

        ``similarTweet`` is read as three parallel sequences:
        ``similarTweet[0]`` child ids, ``similarTweet[1]`` ratios and
        ``similarTweet[2]`` parent ids.

        First pass over ``similarTweet``:
          * child id == parent id  -> the tweet is its own parent; it is
            marked ``parentId = 1`` and collected as unique;
          * otherwise, ratio <= 0.999999 -> the tweet is marked with
            ``parentId = <parent id>`` and ``ratio`` and collected as similar.

        The unique tweets are then merged with the unique tweets already
        stored for ``collName`` (one tweet per id, stored copy first), the
        topic model is run again on that merged list, and the tweets from
        ``twits`` that are still their own parent are inserted together with
        the similar ones via ``self.insertFeedData``.

        Tweet ids are compared by value. The previous implementation used
        ``is``, which only matched when both sides happened to be the very
        same object.

        :param collName: collection name passed to the ``mongoInt`` lookup
            and to ``self.insertFeedData``.
        :param similarTweet: result of the topic model for ``twits``
            (see layout above).
        :param twits: list of tweet dicts; each must have an ``'id'`` key.
            The dicts are updated in place.
        :param Q: query forwarded to ``self.topicModelLSI``.
        :return: number of tweets from ``twits`` classified as unique in the
            first pass.
        '''
        logger.debug('entering')

        # Index the new tweets by id so that each row is a dictionary lookup
        # instead of a scan over all tweets. When the same id occurs more
        # than once, the first tweet wins and the others are treated as
        # duplicates.
        twitsById = {}
        for twit in twits:
            twitsById.setdefault(twit['id'], twit)

        uniqueTweetsFromTwitter = []
        similarTweetsFromTwitter = []
        uniqueIds = set()
        similarIds = set()
        for childID,parentID,ratio in zip(similarTweet[0],similarTweet[2],similarTweet[1]):
            if childID == parentID:
                twit = twitsById.get(parentID)
                if twit is not None:
                    twit.update({'parentId' : 1})
                    # Collect each tweet once even if several rows name it.
                    if parentID not in uniqueIds:
                        uniqueIds.add(parentID)
                        uniqueTweetsFromTwitter.append(twit)
            elif ratio <= 0.999999:
                twit = twitsById.get(childID)
                if twit is not None:
                    twit.update({'parentId' : parentID, 'ratio' : float(ratio)})
                    if childID not in similarIds:
                        similarIds.add(childID)
                        similarTweetsFromTwitter.append(twit)

        uniqueTweetsFromDB = mongoInt.retrieveParentIdTrue(collName)
        logger.debug('existing uniqueTweetsFromDB :%s',len(uniqueTweetsFromDB))
        uniqueTweetsFromDB.extend(uniqueTweetsFromTwitter)

        # Drop tweets that are both stored and fetched again: keep the first
        # occurrence of every id (stored tweets come first in the list).
        allUniqueTweetsIDList = [ele['id'] for ele in uniqueTweetsFromDB]
        logger.debug('Duplication allUniqueTweetsIDList contains len : %s value :%s',len(allUniqueTweetsIDList),allUniqueTweetsIDList)
        seenIds = set()
        allUniqueTweets = []
        for ele in uniqueTweetsFromDB:
            if ele['id'] not in seenIds:
                seenIds.add(ele['id'])
                allUniqueTweets.append(ele)
        logger.debug('unique allUniqueTweetsIDList contains len : %s value :%s',len(seenIds), seenIds)
        logger.debug('allUniqueTweets lenght is %s',len(allUniqueTweets))

        # NOTE(review): retrieveTweets treats a 0 return from topicModelLSI
        # as "no result"; that case is not handled here and would raise on
        # the subscripting below - confirm whether it can occur.
        similarTweet = self.topicModelLSI(allUniqueTweets, Q)

        # Second pass: only tweets that remain their own parent after the
        # comparison against the stored tweets are kept as unique.
        uniqueTweetsFromTwitter_1 = []
        confirmedIds = set()
        for childID,parentID,_ratio in zip(similarTweet[0],similarTweet[2],similarTweet[1]):
            if childID != parentID:
                continue
            twit = twitsById.get(parentID)
            if twit is None:
                continue
            twit.update({'parentId' : 1})
            if parentID not in confirmedIds:
                confirmedIds.add(parentID)
                uniqueTweetsFromTwitter_1.append(twit)

        for ele in uniqueTweetsFromTwitter_1:
            logger.debug('current tweets : text: %s, ID: %s',ele['text'],ele['id'])

        logger.info('Before revomal %s twits after similar Topic removal: %s',len(twits),len(uniqueTweetsFromTwitter_1))
        uniqueTweetsFromTwitter_1.extend(similarTweetsFromTwitter)
        self.insertFeedData(collName,uniqueTweetsFromTwitter_1)
        # NOTE(review): the count returned is from the first pass, while the
        # tweets inserted come from the second pass - confirm this is intended.
        return len(uniqueTweetsFromTwitter) #simple returning the count of unique tweets