Example #1
0
 def start(self, **kwargs):
     """Run the summarization loop forever, pausing 20 seconds between passes.

     Keyword arguments read from ``kwargs``:
         campaign: optional campaign object. When truthy, only the active
             campaign whose ``getId()`` equals its ``getId()`` is processed.
         regenerate: when truthy, the first pass clears each processed
             campaign's summarization and restarts from the hour of the
             first tweet (ascending ``x_created_at``), or from the current
             hour if the campaign has no tweets. Subsequent passes always
             resume from ``getLastSummarizedDate``.

     Each campaign is summarized in steps of at most one day, with hourly
     intervals, up to ``getCurrentSummarizationEnd()``. Never returns.
     """
     target_campaign = kwargs.get('campaign')
     rebuild = kwargs.get('regenerate', False)
     one_day = timedelta(days=1)
     while True:
         end = self.getCurrentSummarizationEnd()
         for account in MongoManager.getActiveAccounts(max_age=timedelta(hours=1)):
             for campaign in account.getActiveCampaigns():
                 # The index is ensured for every active campaign, including
                 # the ones skipped by the campaign filter just below.
                 MongoManager.ensureIndex('summarized_tweets_%s' % campaign.getId(), [("start", 1)])
                 if target_campaign and target_campaign.getId() != campaign.getId():
                     continue
                 if rebuild:
                     self.clearSummarization(campaign)
                     first_tweet = MongoManager.findTweets('tweets_%s' % campaign.getId(), sort=("x_created_at", 1), limit=1)
                     if first_tweet.count():
                         anchor = first_tweet[0]['x_created_at']
                     else:
                         anchor = datetime.now()
                     window_start = anchor.replace(minute=0, second=0, microsecond=0)
                 else:
                     window_start = self.getLastSummarizedDate(campaign)
                 # Advance one day at a time; the last window is clipped to ``end``.
                 while window_start < end:
                     window_end = window_start + one_day
                     self.summarize(campaign, window_start, min(end, window_end), timedelta(hours=1), None)
                     window_start = window_end
         pprint("sleeping 20 seconds")
         # Regeneration is a one-shot request: only the first pass honours it.
         rebuild = False
         time.sleep(20)
Example #2
0
 def getLastSummarizedDate(self, campaign):
     """Return the datetime from which summarizing ``campaign`` should resume.

     Lookup order:
       1. the ``'end'`` field of the summary document with the greatest
          ``'start'`` in ``summarized_tweets_<campaign id>``;
       2. otherwise the ``'x_created_at'`` of the first tweet (ascending
          ``x_created_at``) in ``tweets_<campaign id>``, truncated to the hour;
       3. otherwise ``datetime.now()`` truncated to the hour.
     """
     summaries = MongoManager.find('summarized_tweets_%s' % campaign.getId(), sort=("start", -1), limit=1)
     if summaries.count():
         return summaries[0]['end']

     # Nothing summarized yet: fall back to the first tweet, then to "now".
     tweets = MongoManager.findTweets('tweets_%s' % campaign.getId(), sort=("x_created_at", 1), limit=1)
     if tweets.count():
         reference = tweets[0]['x_created_at']
     else:
         reference = datetime.now()
     return reference.replace(minute=0, second=0, microsecond=0)
Example #3
0
 def getLastSummarizedDate(self, campaign):
     """Find the point in time where summarization of ``campaign`` continues.

     Returns the ``'end'`` of the most recent summary (greatest ``'start'``)
     when one exists. Without summaries, returns the ``'x_created_at'`` of
     the first tweet in ascending ``x_created_at`` order, cut down to the
     hour; without tweets, ``datetime.now()`` cut down to the hour.
     """
     summary_collection = 'summarized_tweets_%s' % campaign.getId()
     latest = MongoManager.find(summary_collection,
                                sort=("start", -1),
                                limit=1)
     if latest.count():
         return latest[0]['end']

     tweet_collection = 'tweets_%s' % campaign.getId()
     earliest = MongoManager.findTweets(tweet_collection,
                                        sort=("x_created_at", 1),
                                        limit=1)
     # Both fallbacks are truncated to the start of their hour.
     moment = earliest[0]['x_created_at'] if earliest.count() else datetime.now()
     return moment.replace(minute=0, second=0, microsecond=0)
Example #4
0
 def start(self, **kwargs):
     """Summarize active campaigns in an endless loop (20 second pause per pass).

     Keyword arguments read from ``kwargs``:
         campaign: optional campaign object; when truthy, campaigns whose
             ``getId()`` differs from its ``getId()`` are skipped.
         regenerate: when truthy, the first pass clears each processed
             campaign's summarization and starts over from the hour of its
             first tweet (ascending ``x_created_at``), or the current hour
             when there are no tweets. The flag is reset after that pass.

     Work is done in windows of at most one day with one-hour intervals,
     up to ``getCurrentSummarizationEnd()``. This method does not return.
     """
     wanted = kwargs.get('campaign')
     regenerate = kwargs.get('regenerate', False)
     day = timedelta(days=1)
     while True:
         end = self.getCurrentSummarizationEnd()
         accounts = MongoManager.getActiveAccounts(max_age=timedelta(hours=1))
         for account in accounts:
             for campaign in account.getActiveCampaigns():
                 # Ensured for every active campaign, before the filter below.
                 MongoManager.ensureIndex(
                     'summarized_tweets_%s' % campaign.getId(),
                     [("start", 1)])
                 if wanted and wanted.getId() != campaign.getId():
                     continue
                 if regenerate:
                     self.clearSummarization(campaign)
                     found = MongoManager.findTweets(
                         'tweets_%s' % campaign.getId(),
                         sort=("x_created_at", 1),
                         limit=1)
                     if found.count():
                         origin = found[0]['x_created_at']
                     else:
                         origin = datetime.now()
                     since = origin.replace(minute=0, second=0, microsecond=0)
                 else:
                     since = self.getLastSummarizedDate(campaign)
                 # One window per day; the final window stops at ``end``.
                 while since < end:
                     until = since + day
                     self.summarize(campaign, since, min(end, until),
                                    timedelta(hours=1), None)
                     since = until
         pprint("sleeping 20 seconds")
         # Only the first pass regenerates; later passes are incremental.
         regenerate = False
         time.sleep(20)
Example #5
0
    def calculateSummarizedIntervals(self, campaign, start, end, interval, tweetlist=None):
        """Bucket a campaign's tweets into consecutive intervals and tally them.

        Buckets are ``[d, d + interval)`` for ``d = start, start + interval,
        ...`` while ``d < end``. For each bucket the code counts total tweets,
        tweets/retweets/favorites of the campaign's own follow accounts,
        mentions of those accounts, sentiments, brands, products, topics,
        genders and (synonym-normalised) words.

        Args:
            campaign: campaign object (``getId``, ``getName`` and
                ``getOwnFollowAccounts`` are used).
            start: lower bound of the first bucket.
            end: buckets are created while their start is below this value.
            interval: bucket width added to ``start``; it must move ``d``
                forward, otherwise bucket creation never terminates.
            tweetlist: optional iterable of tweet objects. When ``None`` the
                tweets of ``tweets_<campaign id>`` with ``x_created_at`` in
                ``[start, end]`` and no ``retweeted_status`` field are loaded
                through ``MongoManager.findTweets``.

        NOTE(review): the visible body ends after the tallying loop without
        returning or storing the buckets - confirm nothing was cut off.
        """
        pprint("summarizing tweets for campaign %s between %s and %s" % (campaign.getName(), start, end))
        synonyms = self.getTrendWordsSynonyms(campaign)
        trend_stop_words_set = self.getTrendStopWords(campaign)
        # NOTE(review): not referenced in the visible body - confirm it is still needed.
        collection_name = 'summarized_tweets_%s' % campaign.getId()
        if tweetlist is None:
            tweetlist = MongoManager.findTweets(
                "tweets_%s" % campaign.getId(),
                filters={"retweeted_status": {"$exists": False},
                         "x_created_at": {"$gte": start, "$lte": end}})
        own_fa = campaign.getOwnFollowAccounts()

        def account_counters():
            # Fresh {'total': 0, 'accounts': {account: 0, ...}} counter.
            return SumDict({'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})

        timerange = []
        d = start
        while d < end:
            data = SumDict({'start': d, 'end': d + interval})
            data['stats'] = SumDict()
            data['stats']['total_tweets'] = 0
            data['stats']['own_tweets'] = account_counters()
            data['stats']['own_tweets']['retweets'] = account_counters()
            data['stats']['own_tweets']['favorites'] = account_counters()
            data['stats']['mentions'] = account_counters()
            for section in ('sentiment', 'brand', 'product', 'topic', 'gender', 'words'):
                data[section] = SumDict()
            timerange.append(data)
            d = d + interval

        for t in tweetlist:
            created = t.getCreatedDate()
            for interv in timerange:
                if created < interv['start'] or created >= interv['end']:
                    continue

                interv['stats']['total_tweets'] += 1

                username = t.getUsername()
                if username in own_fa:
                    own = interv['stats']['own_tweets']
                    retweets = t.getRetweetsCount()
                    favorites = t.getFavoritesCount()
                    own['total'] += 1
                    own['accounts'][username] += 1
                    own['retweets']['total'] += retweets
                    own['retweets']['accounts'][username] += retweets
                    own['favorites']['total'] += favorites
                    # Fix: the per-account favorites counter used to be fed
                    # with the retweet count.
                    own['favorites']['accounts'][username] += favorites

                # Every mentioned own account counts once per tweet,
                # regardless of the mention count reported for it.
                for account, _count in t.getFollowAccountsMentionCount().items():
                    if account in own_fa:
                        interv['stats']['mentions']['total'] += 1
                        interv['stats']['mentions']['accounts'][account] += 1

                sentiment = t.getSentiment()
                if sentiment:
                    if sentiment not in interv['sentiment']:
                        interv['sentiment'][sentiment] = {"total": 0}
                    interv['sentiment'][sentiment]['total'] += 1

                pms = t.getExtractedInfo()
                if pms:
                    # Only the first extracted brand/product match is counted.
                    pm = pms[0]
                    brand = pm['brand']
                    interv['brand'][brand] = interv['brand'].get(brand, 0) + 1
                    if pm['product']:
                        p = brand + "/" + pm['product']
                        interv['product'][p] = interv['product'].get(p, 0) + 1

                for topic in (t.getExtractedTopics() or []):
                    try:
                        interv['topic'][topic['topic_name']]['total'] += 1
                    except KeyError:
                        interv['topic'][topic['topic_name']] = {'total': 1}

                for word in self.getWordsList(t.getText()):
                    # NOTE(review): the stop-word test runs before lower-casing;
                    # confirm getWordsList already yields lower-case words.
                    if word in trend_stop_words_set:
                        continue
                    word = word.lower()
                    nword = synonyms.get(word, word)
                    # Fix: words were accumulated into ``data`` (the last
                    # bucket created above) instead of the tweet's bucket.
                    interv['words'][nword] = interv['words'].get(nword, 0) + 1

                gender = t.getGender()
                try:
                    interv['gender'][gender]['total'] += 1
                except KeyError:
                    interv['gender'][gender] = {'total': 1}

                # Buckets do not overlap, so no later bucket can match.
                break
Example #6
0
    def extractGender(cls, name):
        """Guess a gender code ("M", "F" or "U") from a display name.

        ``_``, ``-`` and ``.`` are turned into spaces and every other
        character that is neither a word character nor a space is removed.
        The result is split on whitespace; words longer than one character
        are lower-cased and looked up in ``cls.getNamesDatabase``. The first
        word weighs 100 and each following word one less, so earlier words
        dominate.

        Args:
            name: display name to classify; empty or ``None`` yields "U".

        Returns:
            "U" when no word maps to "M" or "F", "M" when the male score is
            strictly higher, otherwise "F" (ties included).
        """
        if not name:
            return "U"
        # re.UNICODE keeps accented letters inside \w so they survive cleaning.
        separators = re.compile(u'[_\\-\\.]', re.UNICODE)
        unwanted = re.compile(u'[^\\w ]+', re.UNICODE)
        nname = unwanted.sub(u'', separators.sub(u' ', name))
        # Fix: split the cleaned name; the raw ``name`` was split before, so
        # "romina_pablo" or "pablo.romina" were never matched.
        words = [w.lower() for w in nname.split() if len(w) > 1]
        # presumably max_age bounds the age of a cached database (5 minutes) - confirm
        names = cls.getNamesDatabase(max_age=timedelta(seconds=300))
        k = 100
        M = 0
        F = 0
        for w in words:
            g = names.get(w, "U")
            if g == "M":
                M += k
            elif g == "F":
                F += k
            # The weight drops for every word, recognised or not.
            k -= 1
        if M + F == 0:
            return "U"
        if M > F:
            return "M"
        return "F"



if __name__ == "__main__":
    print GenderClassifier.getNamesDatabase()
    tweets = MongoManager.findTweets("tweets_g1", limit=40)
    for t in tweets:
        g = GenderClassifier.extractGender(t.getDisplayName())
        print t.getDisplayName(), g

    for n in ("pablo romina XX", "romina pablo"):
        g = GenderClassifier.extractGender(n)
        print n, g
Example #7
0
    def extractGender(cls, name):
        """Classify a display name as "M", "F" or "U" (unknown).

        The name is cleaned first: ``_``, ``-`` and ``.`` become spaces and
        any remaining character that is not a word character or a space is
        dropped. Each whitespace-separated word longer than one character is
        lower-cased and looked up in ``cls.getNamesDatabase``. Words are
        weighted 100, 99, 98, ... by position, so the leading words decide.

        Args:
            name: display name to classify; empty or ``None`` yields "U".

        Returns:
            "U" if no word has an "M"/"F" entry, "M" if the male score is
            strictly greater than the female score, "F" otherwise.
        """
        if not name:
            return "U"
        # With re.UNICODE, \w also covers accented letters, which must be kept.
        separators = re.compile(u'[_\\-\\.]', re.UNICODE)
        unwanted = re.compile(u'[^\\w ]+', re.UNICODE)
        nname = unwanted.sub(u'', separators.sub(u' ', name))
        # Fix: the words come from the cleaned name now; previously the raw
        # ``name`` was split and the cleaning above had no effect.
        words = [w.lower() for w in nname.split() if len(w) > 1]
        # presumably max_age bounds the age of a cached database (5 minutes) - confirm
        names = cls.getNamesDatabase(
            max_age=timedelta(seconds=300))
        k = 100
        M = 0
        F = 0
        for w in words:
            g = names.get(w, "U")
            if g == "M":
                M += k
            elif g == "F":
                F += k
            # Position weight decreases even when the word is unknown.
            k -= 1
        if M + F == 0:
            return "U"
        if M > F:
            return "M"
        return "F"


if __name__ == "__main__":
    print GenderClassifier.getNamesDatabase()
    tweets = MongoManager.findTweets("tweets_g1", limit=40)
    for t in tweets:
        g = GenderClassifier.extractGender(t.getDisplayName())
        print t.getDisplayName(), g

    for n in ("pablo romina XX", "romina pablo"):
        g = GenderClassifier.extractGender(n)
        print n, g
Example #8
0
    def calculateSummarizedIntervals(self,
                                     campaign,
                                     start,
                                     end,
                                     interval,
                                     tweetlist=None):
        """Split a period into intervals and tally the campaign's tweets per interval.

        Intervals are ``[d, d + interval)`` starting at ``start`` and created
        while ``d < end``. Per interval the code counts: total tweets; tweets,
        retweets and favorites of the campaign's own follow accounts;
        mentions of those accounts; sentiments; brands; products; topics;
        genders; and words after synonym replacement.

        Args:
            campaign: campaign object (``getId``, ``getName`` and
                ``getOwnFollowAccounts`` are used).
            start: start of the first interval.
            end: intervals are created while their start is below this value.
            interval: step added to ``start``; it must advance ``d``,
                otherwise interval creation loops forever.
            tweetlist: optional iterable of tweet objects. If ``None``, tweets
                of ``tweets_<campaign id>`` without a ``retweeted_status``
                field and with ``x_created_at`` in ``[start, end]`` are
                fetched via ``MongoManager.findTweets``.

        NOTE(review): the visible body neither returns nor stores the
        intervals after tallying - confirm the snippet is complete.
        """
        pprint("summarizing tweets for campaign %s between %s and %s" %
               (campaign.getName(), start, end))
        synonyms = self.getTrendWordsSynonyms(campaign)
        trend_stop_words_set = self.getTrendStopWords(campaign)
        # NOTE(review): not referenced in the visible body - confirm it is still needed.
        collection_name = 'summarized_tweets_%s' % campaign.getId()
        if tweetlist is None:
            tweetlist = MongoManager.findTweets(
                "tweets_%s" % campaign.getId(),
                filters={
                    "retweeted_status": {"$exists": False},
                    "x_created_at": {"$gte": start, "$lte": end},
                })
        own_fa = campaign.getOwnFollowAccounts()

        def new_counter():
            # Fresh {'total': 0, 'accounts': {account: 0, ...}} counter.
            return SumDict({
                'total': 0,
                'accounts': SumDict([(a, 0) for a in own_fa]),
            })

        timerange = []
        d = start
        while d < end:
            data = SumDict({'start': d, 'end': d + interval})
            data['stats'] = SumDict()
            data['stats']['total_tweets'] = 0
            data['stats']['own_tweets'] = new_counter()
            data['stats']['own_tweets']['retweets'] = new_counter()
            data['stats']['own_tweets']['favorites'] = new_counter()
            data['stats']['mentions'] = new_counter()
            for section in ('sentiment', 'brand', 'product', 'topic',
                            'gender', 'words'):
                data[section] = SumDict()
            timerange.append(data)
            d = d + interval

        for t in tweetlist:
            created = t.getCreatedDate()
            for interv in timerange:
                if created < interv['start'] or created >= interv['end']:
                    continue

                interv['stats']['total_tweets'] += 1

                username = t.getUsername()
                if username in own_fa:
                    own = interv['stats']['own_tweets']
                    retweets = t.getRetweetsCount()
                    favorites = t.getFavoritesCount()
                    own['total'] += 1
                    own['accounts'][username] += 1
                    own['retweets']['total'] += retweets
                    own['retweets']['accounts'][username] += retweets
                    own['favorites']['total'] += favorites
                    # Fix: per-account favorites were incremented by the
                    # retweet count instead of the favorites count.
                    own['favorites']['accounts'][username] += favorites

                # One count per mentioned own account and tweet; the mention
                # count reported for the account is not used.
                for account, _count in t.getFollowAccountsMentionCount().items():
                    if account in own_fa:
                        interv['stats']['mentions']['total'] += 1
                        interv['stats']['mentions']['accounts'][account] += 1

                sentiment = t.getSentiment()
                if sentiment:
                    if sentiment not in interv['sentiment']:
                        interv['sentiment'][sentiment] = {"total": 0}
                    interv['sentiment'][sentiment]['total'] += 1

                pms = t.getExtractedInfo()
                if pms:
                    # Only the first extracted brand/product match is counted.
                    pm = pms[0]
                    brand = pm['brand']
                    interv['brand'][brand] = interv['brand'].get(brand, 0) + 1
                    if pm['product']:
                        p = brand + "/" + pm['product']
                        interv['product'][p] = interv['product'].get(p, 0) + 1

                for topic in (t.getExtractedTopics() or []):
                    try:
                        interv['topic'][topic['topic_name']]['total'] += 1
                    except KeyError:
                        interv['topic'][topic['topic_name']] = {'total': 1}

                for word in self.getWordsList(t.getText()):
                    # NOTE(review): stop words are checked before
                    # lower-casing; confirm getWordsList yields lower case.
                    if word in trend_stop_words_set:
                        continue
                    word = word.lower()
                    nword = synonyms.get(word, word)
                    # Fix: words went into ``data`` (the last interval built
                    # above) rather than the interval the tweet belongs to.
                    interv['words'][nword] = interv['words'].get(nword, 0) + 1

                gender = t.getGender()
                try:
                    interv['gender'][gender]['total'] += 1
                except KeyError:
                    interv['gender'][gender] = {'total': 1}

                # Intervals are disjoint, so stop after the matching one.
                break