def start(self, **kwargs):
    """Run the summarization loop forever, pausing 20 seconds between passes.

    Keyword arguments:
        campaign: optional campaign object. When truthy, only the active
            campaign whose id equals ``campaign.getId()`` is summarized
            (the summarized-tweets index is still ensured for every
            active campaign).
        regenerate: when true, the first pass clears each campaign's
            summarization and restarts from the hour of the first tweet
            sorted ascending by ``x_created_at`` (or from the current
            hour when the campaign has no tweets). The flag is reset
            after the first pass, so later passes resume from
            ``getLastSummarizedDate``.

    Each campaign is summarized in chunks of at most one day, with
    one-hour intervals, up to ``getCurrentSummarizationEnd()``.
    This method never returns.
    """
    target_campaign = kwargs.get('campaign', None)
    rebuild = kwargs.get('regenerate', False)
    day_step = timedelta(days=1)
    hour_step = timedelta(hours=1)

    while True:
        end = self.getCurrentSummarizationEnd()
        active_accounts = MongoManager.getActiveAccounts(max_age=timedelta(hours=1))
        for account in active_accounts:
            for campaign in account.getActiveCampaigns():
                campaign_id = campaign.getId()
                MongoManager.ensureIndex('summarized_tweets_%s' % campaign_id, [("start", 1)])

                # Skip every campaign except the requested one, if any.
                wanted = (not target_campaign) or target_campaign.getId() == campaign_id
                if not wanted:
                    continue

                if rebuild:
                    self.clearSummarization(campaign)
                    first_tweet = MongoManager.findTweets('tweets_%s' % campaign_id,
                                                          sort=("x_created_at", 1),
                                                          limit=1)
                    if first_tweet.count():
                        origin = first_tweet[0]['x_created_at']
                    else:
                        origin = datetime.now()
                    cursor_date = origin.replace(minute=0, second=0, microsecond=0)
                else:
                    cursor_date = self.getLastSummarizedDate(campaign)

                # Advance one day at a time; the last chunk is clipped to `end`.
                while cursor_date < end:
                    chunk_end = min(end, cursor_date + day_step)
                    self.summarize(campaign, cursor_date, chunk_end, hour_step, None)
                    cursor_date = cursor_date + day_step

        pprint("sleeping 20 seconds")
        rebuild = False  # full regeneration only happens on the first pass
        time.sleep(20)
def getLastSummarizedDate(self, campaign):
    """Return the datetime from which summarization of `campaign` resumes.

    Resolution order:
      1. the ``'end'`` field of the summarized document that sorts first
         by ``start`` descending, when the summarized collection is
         non-empty;
      2. otherwise the ``'x_created_at'`` of the first tweet sorted
         ascending, truncated to the hour;
      3. otherwise ``datetime.now()`` truncated to the hour.
    """
    campaign_id = campaign.getId()

    summaries = MongoManager.find('summarized_tweets_%s' % campaign_id,
                                  sort=("start", -1),
                                  limit=1)
    if summaries.count():
        return summaries[0]['end']

    # Nothing summarized yet: fall back to the first tweet, then to "now".
    tweets = MongoManager.findTweets('tweets_%s' % campaign_id,
                                     sort=("x_created_at", 1),
                                     limit=1)
    if tweets.count():
        base = tweets[0]['x_created_at']
    else:
        base = datetime.now()
    return base.replace(minute=0, second=0, microsecond=0)
def start(self, **kwargs):
    """Run the summarization loop forever, pausing 20 seconds between passes.

    Keyword arguments:
        campaign: optional campaign object. When truthy, only the active
            campaign whose id equals ``campaign.getId()`` is summarized
            (the summarized-tweets index is still ensured for every
            active campaign).
        regenerate: when true, the first pass clears each campaign's
            summarization and restarts from the hour of the first tweet
            sorted ascending by ``x_created_at`` (or from the current
            hour when the campaign has no tweets). The flag is reset
            after the first pass, so later passes resume from
            ``getLastSummarizedDate``.

    Each campaign is summarized in chunks of at most one day, with
    one-hour intervals, up to ``getCurrentSummarizationEnd()``.
    This method never returns.
    """
    target_campaign = kwargs.get('campaign', None)
    rebuild = kwargs.get('regenerate', False)
    day_step = timedelta(days=1)
    hour_step = timedelta(hours=1)

    while True:
        end = self.getCurrentSummarizationEnd()
        active_accounts = MongoManager.getActiveAccounts(max_age=timedelta(hours=1))
        for account in active_accounts:
            for campaign in account.getActiveCampaigns():
                campaign_id = campaign.getId()
                MongoManager.ensureIndex('summarized_tweets_%s' % campaign_id, [("start", 1)])

                # Skip every campaign except the requested one, if any.
                wanted = (not target_campaign) or target_campaign.getId() == campaign_id
                if not wanted:
                    continue

                if rebuild:
                    self.clearSummarization(campaign)
                    first_tweet = MongoManager.findTweets('tweets_%s' % campaign_id,
                                                          sort=("x_created_at", 1),
                                                          limit=1)
                    if first_tweet.count():
                        origin = first_tweet[0]['x_created_at']
                    else:
                        origin = datetime.now()
                    cursor_date = origin.replace(minute=0, second=0, microsecond=0)
                else:
                    cursor_date = self.getLastSummarizedDate(campaign)

                # Advance one day at a time; the last chunk is clipped to `end`.
                while cursor_date < end:
                    chunk_end = min(end, cursor_date + day_step)
                    self.summarize(campaign, cursor_date, chunk_end, hour_step, None)
                    cursor_date = cursor_date + day_step

        pprint("sleeping 20 seconds")
        rebuild = False  # full regeneration only happens on the first pass
        time.sleep(20)
def calculateSummarizedIntervals(self, campaign, start, end, interval, tweetlist=None):
    """Bucket a campaign's tweets into consecutive windows and tally stats.

    One SumDict is built per window ``[d, d + interval)`` for ``d``
    stepping from ``start`` while ``d < end``. Each tweet is then assigned
    to the window containing its created date, and that window's counters
    are updated: total tweets, own-account tweets / retweets / favorites,
    mentions of own accounts, sentiment, brand, product, topic, trend
    words and gender.

    Args:
        campaign: campaign object; its name, id, own follow accounts,
            trend-word synonyms and trend stop words are read.
        start: start of the first window (inclusive).
        end: windows are created while their start is ``< end``.
        interval: window length; added to ``start`` to step the windows.
            Must be positive, otherwise the window loop never ends.
        tweetlist: optional iterable of tweet objects. When None, tweets
            without a ``retweeted_status`` field and with ``x_created_at``
            in ``[start, end]`` are loaded through MongoManager.

    NOTE(review): the visible code neither returns nor stores the windows
    it builds, and ``collection_name`` is never used; confirm whether the
    tail of this function (e.g. a ``return timerange``) is missing.
    """
    pprint("summarizing tweets for campaign %s between %s and %s" % (campaign.getName(), start, end))
    synonyms = self.getTrendWordsSynonyms(campaign)
    trend_stop_words_set = self.getTrendStopWords(campaign)
    # NOTE(review): not used anywhere in the visible part of this function.
    collection_name = 'summarized_tweets_%s' % campaign.getId()
    if tweetlist is None:
        tweetlist = MongoManager.findTweets(
            "tweets_%s" % campaign.getId(),
            filters={"retweeted_status": {"$exists": False},
                     "x_created_at": {"$gte": start, "$lte": end}})
    own_fa = campaign.getOwnFollowAccounts()

    # Build the empty windows, each with zeroed per-account counters.
    timerange = []
    d = start
    while d < end:
        data = SumDict({'start': d, 'end': d + interval})
        data['stats'] = SumDict()
        data['stats']['total_tweets'] = 0
        data['stats']['own_tweets'] = SumDict({'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['stats']['own_tweets']['retweets'] = SumDict({'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['stats']['own_tweets']['favorites'] = SumDict({'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['stats']['mentions'] = SumDict({'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['sentiment'] = SumDict()
        data['brand'] = SumDict()
        data['product'] = SumDict()
        data['topic'] = SumDict()
        data['gender'] = SumDict()
        data['words'] = SumDict()
        timerange.append(data)
        d = d + interval

    for t in tweetlist:
        created = t.getCreatedDate()
        for interv in timerange:
            if not (created >= interv['start'] and created < interv['end']):
                continue

            stats = interv['stats']
            stats['total_tweets'] += 1

            username = t.getUsername()
            if username in own_fa:
                own = stats['own_tweets']
                own['total'] += 1
                own['accounts'][username] += 1
                own['retweets']['total'] += t.getRetweetsCount()
                own['retweets']['accounts'][username] += t.getRetweetsCount()
                own['favorites']['total'] += t.getFavoritesCount()
                # Fix: the per-account favorites counter used to be fed
                # with the retweet count.
                own['favorites']['accounts'][username] += t.getFavoritesCount()

            # Each mentioned own account counts once per tweet; the
            # per-tweet mention count itself is not used.
            for k, _mention_count in t.getFollowAccountsMentionCount().items():
                if k in own_fa:
                    stats['mentions']['total'] += 1
                    stats['mentions']['accounts'][k] += 1

            sentiment = t.getSentiment()
            if sentiment:
                if sentiment not in interv['sentiment']:
                    interv['sentiment'][sentiment] = {"total": 0}
                interv['sentiment'][sentiment]['total'] += 1

            pms = t.getExtractedInfo()
            if pms:
                # Only the first extracted brand/product entry is counted.
                pm = pms[0]
                try:
                    interv['brand'][pm['brand']] += 1
                except KeyError:
                    interv['brand'][pm['brand']] = 1
                if pm['product']:
                    p = pm['brand'] + "/" + pm['product']
                    try:
                        interv['product'][p] += 1
                    except KeyError:
                        interv['product'][p] = 1

            topics = t.getExtractedTopics()
            if topics is None:
                topics = []
            for k in topics:
                try:
                    interv['topic'][k['topic_name']]['total'] += 1
                except KeyError:
                    interv['topic'][k['topic_name']] = {'total': 1}

            for word in self.getWordsList(t.getText()):
                # The stop-word test is applied to the word as returned by
                # getWordsList, before lower-casing (unchanged behavior).
                if word in trend_stop_words_set:
                    continue
                word = word.lower()
                nword = synonyms.get(word, word)
                # Fix: count the word in the tweet's own window; the old
                # code wrote to `data`, i.e. the last window created above.
                interv['words'][nword] = interv['words'].get(nword, 0) + 1

            gender = t.getGender()
            try:
                interv['gender'][gender]['total'] += 1
            except KeyError:
                interv['gender'][gender] = {'total': 1}

            # Windows are disjoint, so a tweet belongs to at most one.
            break
def extractGender(cls, name):
    """Guess a gender code from a display name.

    The name is normalized (``_``, ``-`` and ``.`` become spaces; every
    other character that is neither a word character nor a space is
    dropped) and split on whitespace. Each word longer than one character
    is lower-cased and looked up in ``cls.getNamesDatabase``. A word
    mapped to ``"M"`` or ``"F"`` scores for that gender with a weight that
    starts at 100 and drops by one per word, so earlier words outweigh
    later ones.

    Args:
        name: the display name to classify.

    Returns:
        ``"M"`` or ``"F"`` for the higher score (``"F"`` on a non-zero
        tie), or ``"U"`` when no word is recognized.
    """
    # The patterns keep the same text as before. They are compiled (instead
    # of passing flags= to re.sub) so that re.UNICODE can be applied: without
    # it, \w would strip accented letters from unicode names.
    separators = re.compile(u'[_\\-\\.]')
    non_word = re.compile(u'[^\\w ]+', re.UNICODE)
    nname = separators.sub(u' ', name)
    nname = non_word.sub(u'', nname)
    # Fix: split the normalized name; the old code split the raw `name`
    # and silently discarded the normalization above.
    words = [w.lower() for w in nname.split() if len(w) > 1]
    names = cls.getNamesDatabase(max_age=timedelta(seconds=300))  # 5 minutes
    k = 100
    M = 0
    F = 0
    for w in words:
        g = names.get(w, "U")
        if g == "M":
            M += k
        elif g == "F":
            F += k
        k -= 1  # every word, recognized or not, lowers the next weight
    if M + F == 0:
        return "U"
    if M > F:
        return "M"
    return "F"


if __name__ == "__main__":
    # Manual smoke test: dump the names database, then classify the display
    # names of up to 40 tweets from "tweets_g1" and two fixed samples.
    print(GenderClassifier.getNamesDatabase())
    tweets = MongoManager.findTweets("tweets_g1", limit=40)
    for t in tweets:
        g = GenderClassifier.extractGender(t.getDisplayName())
        print("%s %s" % (t.getDisplayName(), g))
    for n in ("pablo romina XX", "romina pablo"):
        g = GenderClassifier.extractGender(n)
        print("%s %s" % (n, g))
def extractGender(cls, name):
    """Guess a gender code from a display name.

    The name is normalized (``_``, ``-`` and ``.`` become spaces; every
    other character that is neither a word character nor a space is
    dropped) and split on whitespace. Each word longer than one character
    is lower-cased and looked up in ``cls.getNamesDatabase``. A word
    mapped to ``"M"`` or ``"F"`` scores for that gender with a weight that
    starts at 100 and drops by one per word, so earlier words outweigh
    later ones.

    Args:
        name: the display name to classify.

    Returns:
        ``"M"`` or ``"F"`` for the higher score (``"F"`` on a non-zero
        tie), or ``"U"`` when no word is recognized.
    """
    # The patterns keep the same text as before. They are compiled (instead
    # of passing flags= to re.sub) so that re.UNICODE can be applied: without
    # it, \w would strip accented letters from unicode names.
    separators = re.compile(u'[_\\-\\.]')
    non_word = re.compile(u'[^\\w ]+', re.UNICODE)
    nname = separators.sub(u' ', name)
    nname = non_word.sub(u'', nname)
    # Fix: split the normalized name; the old code split the raw `name`
    # and silently discarded the normalization above.
    words = [w.lower() for w in nname.split() if len(w) > 1]
    names = cls.getNamesDatabase(max_age=timedelta(seconds=300))  # 5 minutes
    k = 100
    M = 0
    F = 0
    for w in words:
        g = names.get(w, "U")
        if g == "M":
            M += k
        elif g == "F":
            F += k
        k -= 1  # every word, recognized or not, lowers the next weight
    if M + F == 0:
        return "U"
    if M > F:
        return "M"
    return "F"


if __name__ == "__main__":
    # Manual smoke test: dump the names database, then classify the display
    # names of up to 40 tweets from "tweets_g1" and two fixed samples.
    print(GenderClassifier.getNamesDatabase())
    tweets = MongoManager.findTweets("tweets_g1", limit=40)
    for t in tweets:
        g = GenderClassifier.extractGender(t.getDisplayName())
        print("%s %s" % (t.getDisplayName(), g))
    for n in ("pablo romina XX", "romina pablo"):
        g = GenderClassifier.extractGender(n)
        print("%s %s" % (n, g))
def calculateSummarizedIntervals(self, campaign, start, end, interval, tweetlist=None):
    """Bucket a campaign's tweets into consecutive windows and tally stats.

    One SumDict is built per window ``[d, d + interval)`` for ``d``
    stepping from ``start`` while ``d < end``. Each tweet is then assigned
    to the window containing its created date, and that window's counters
    are updated: total tweets, own-account tweets / retweets / favorites,
    mentions of own accounts, sentiment, brand, product, topic, trend
    words and gender.

    Args:
        campaign: campaign object; its name, id, own follow accounts,
            trend-word synonyms and trend stop words are read.
        start: start of the first window (inclusive).
        end: windows are created while their start is ``< end``.
        interval: window length; added to ``start`` to step the windows.
            Must be positive, otherwise the window loop never ends.
        tweetlist: optional iterable of tweet objects. When None, tweets
            without a ``retweeted_status`` field and with ``x_created_at``
            in ``[start, end]`` are loaded through MongoManager.

    NOTE(review): the visible code neither returns nor stores the windows
    it builds, and ``collection_name`` is never used; confirm whether the
    tail of this function (e.g. a ``return timerange``) is missing.
    """
    pprint("summarizing tweets for campaign %s between %s and %s" % (campaign.getName(), start, end))
    synonyms = self.getTrendWordsSynonyms(campaign)
    trend_stop_words_set = self.getTrendStopWords(campaign)
    # NOTE(review): not used anywhere in the visible part of this function.
    collection_name = 'summarized_tweets_%s' % campaign.getId()
    if tweetlist is None:
        tweetlist = MongoManager.findTweets(
            "tweets_%s" % campaign.getId(),
            filters={"retweeted_status": {"$exists": False},
                     "x_created_at": {"$gte": start, "$lte": end}})
    own_fa = campaign.getOwnFollowAccounts()

    # Build the empty windows, each with zeroed per-account counters.
    timerange = []
    d = start
    while d < end:
        data = SumDict({'start': d, 'end': d + interval})
        data['stats'] = SumDict()
        data['stats']['total_tweets'] = 0
        data['stats']['own_tweets'] = SumDict({'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['stats']['own_tweets']['retweets'] = SumDict({'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['stats']['own_tweets']['favorites'] = SumDict({'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['stats']['mentions'] = SumDict({'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['sentiment'] = SumDict()
        data['brand'] = SumDict()
        data['product'] = SumDict()
        data['topic'] = SumDict()
        data['gender'] = SumDict()
        data['words'] = SumDict()
        timerange.append(data)
        d = d + interval

    for t in tweetlist:
        created = t.getCreatedDate()
        for interv in timerange:
            if not (created >= interv['start'] and created < interv['end']):
                continue

            stats = interv['stats']
            stats['total_tweets'] += 1

            username = t.getUsername()
            if username in own_fa:
                own = stats['own_tweets']
                own['total'] += 1
                own['accounts'][username] += 1
                own['retweets']['total'] += t.getRetweetsCount()
                own['retweets']['accounts'][username] += t.getRetweetsCount()
                own['favorites']['total'] += t.getFavoritesCount()
                # Fix: the per-account favorites counter used to be fed
                # with the retweet count.
                own['favorites']['accounts'][username] += t.getFavoritesCount()

            # Each mentioned own account counts once per tweet; the
            # per-tweet mention count itself is not used.
            for k, _mention_count in t.getFollowAccountsMentionCount().items():
                if k in own_fa:
                    stats['mentions']['total'] += 1
                    stats['mentions']['accounts'][k] += 1

            sentiment = t.getSentiment()
            if sentiment:
                if sentiment not in interv['sentiment']:
                    interv['sentiment'][sentiment] = {"total": 0}
                interv['sentiment'][sentiment]['total'] += 1

            pms = t.getExtractedInfo()
            if pms:
                # Only the first extracted brand/product entry is counted.
                pm = pms[0]
                try:
                    interv['brand'][pm['brand']] += 1
                except KeyError:
                    interv['brand'][pm['brand']] = 1
                if pm['product']:
                    p = pm['brand'] + "/" + pm['product']
                    try:
                        interv['product'][p] += 1
                    except KeyError:
                        interv['product'][p] = 1

            topics = t.getExtractedTopics()
            if topics is None:
                topics = []
            for k in topics:
                try:
                    interv['topic'][k['topic_name']]['total'] += 1
                except KeyError:
                    interv['topic'][k['topic_name']] = {'total': 1}

            for word in self.getWordsList(t.getText()):
                # The stop-word test is applied to the word as returned by
                # getWordsList, before lower-casing (unchanged behavior).
                if word in trend_stop_words_set:
                    continue
                word = word.lower()
                nword = synonyms.get(word, word)
                # Fix: count the word in the tweet's own window; the old
                # code wrote to `data`, i.e. the last window created above.
                interv['words'][nword] = interv['words'].get(nword, 0) + 1

            gender = t.getGender()
            try:
                interv['gender'][gender]['total'] += 1
            except KeyError:
                interv['gender'][gender] = {'total': 1}

            # Windows are disjoint, so a tweet belongs to at most one.
            break