def start(self, **kwargs):
    only_campaign = kwargs.get('campaign', None)
    regenerate_all = kwargs.get('regenerate', False)
    while True:
        end = self.getCurrentSummarizationEnd()
        for account in MongoManager.getActiveAccounts(max_age=timedelta(hours=1)):
            for campaign in account.getActiveCampaigns():
                MongoManager.ensureIndex('summarized_tweets_%s' % campaign.getId(),
                                         [("start", 1)])
                if only_campaign and only_campaign.getId() != campaign.getId():
                    continue
                if regenerate_all:
                    self.clearSummarization(campaign)
                    collection_name = 'tweets_%s' % campaign.getId()
                    res = MongoManager.findTweets(collection_name,
                                                  sort=("x_created_at", 1),
                                                  limit=1)
                    if res.count():
                        # restart from the hour of the oldest stored tweet
                        lsd = res[0]['x_created_at'].replace(minute=0, second=0,
                                                             microsecond=0)
                    else:
                        lsd = datetime.now().replace(minute=0, second=0,
                                                     microsecond=0)
                else:
                    lsd = self.getLastSummarizedDate(campaign)
                # summarize in one-day chunks with one-hour buckets until caught up
                while lsd < end:
                    self.summarize(campaign, lsd,
                                   min(end, lsd + timedelta(days=1)),
                                   timedelta(hours=1), None)
                    lsd = lsd + timedelta(days=1)
        pprint("sleeping 20 seconds")
        regenerate_all = False
        time.sleep(20)
def processItem(self, entry):
    campaigns = entry['campaigns']
    del entry['campaigns']
    for campaign in campaigns:
        collection_name = "fb_posts_%s" % campaign.getId()
        #pprint("saving entry to campaign %s" % campaign.getName())
        MongoManager.saveDocument(collection_name, entry)
def processItem(self, tweet):
    #accs = MongoManager.getActiveAccounts(max_age=timedelta(seconds=10))  # is this needed? commented out for now
    follow_accounts = MongoManager.getFollowAccountsbyCampaign(
        max_age=timedelta(seconds=10))
    bcs = ClassifierManager.getBrandClassifiers()  # this should also be cached in ClassifierManager
    tcs = None
    pms = self.getBrandClassifiersByCampaign(tweet, bcs, follow_accounts)
    # TODO: also keep tweets that do not match any brand but come from a user
    # the brand follows
    for cid, pmlist in pms.items():
        if tcs is None:
            tcs = ClassifierManager.getTopicClassifiers()
        tms = self.getTopicClassifiers(tweet, cid, tcs)
        tweet.setExtractedTopics(tms)
        tweet.setExtractedInfo(pmlist)
        tweet.setGender(GenderClassifier.extractGender(tweet.getDisplayName()))
        tweet.resetFollowAccountsMentionCount()
        user_mentions = tweet.getUserMentions()
        for fa in follow_accounts:
            if fa in user_mentions:
                for fainfo in follow_accounts[fa]:
                    if fainfo['cid'] == cid:
                        tweet.setFollowAccountsMentionCount(fa, 1)
        MongoManager.saveDocument("tweets_%s" % cid, tweet.getDictionary())
    # return nothing so tweets do not pile up in the stage's output list and
    # exhaust memory
    return None
def summarize(self, campaign, start, end, interval, tweetlist=None):
    collection_name = 'summarized_tweets_%s' % campaign.getId()
    timerange = self.calculateSummarizedIntervals(campaign, start, end,
                                                  interval, tweetlist)
    for interv in timerange:
        # look up an existing summary for this interval so saving below
        # overwrites it instead of inserting a duplicate
        res = MongoManager.findOne(collection_name,
                                   filters={'start': interv['start'],
                                            'end': interv['end']})
        if res:
            interv['_id'] = res['_id']
        MongoManager.saveDocument(collection_name, interv)
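# Usage sketch (hypothetical `summarizer` and `campaign` objects, relying only
# on the API shown above): because each interval reuses the stored _id, running
# the same window twice overwrites the previous summary instead of duplicating it.
window_start = datetime(2015, 3, 1)
window_end = datetime(2015, 3, 2)
summarizer.summarize(campaign, window_start, window_end, timedelta(hours=1))
summarizer.summarize(campaign, window_start, window_end, timedelta(hours=1))  # safe to re-run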
def run(self):
    year, month = self.findFirstMonth()
    if not year or not month:
        return
    d = datetime(year, month, 1)
    while not self.finish_flag and d <= datetime.now():
        feed = self.getFeed(self.url + "/%s/%s/%s/feed" % (d.year, d.month, d.day))
        for entry in feed.entries:
            if self.finish_flag:
                break
            if entry.slash_comments > 0:
                comments_feed = self.getFeed(entry.wfw_commentrss)
                if comments_feed:
                    for comment_entry in comments_feed.entries:
                        fe = FeedEntry.fromFeedParserEntry(comments_feed.feed.link,
                                                           comment_entry)
                        fe.account = self.account
                        fe.campaign = self.campaign
                        self.queue.put(fe)
        d = d + timedelta(days=1)
        if d.day == 1:
            # the month changed: skip ahead month by month until one has posts
            while not self.finish_flag and d <= datetime.now():
                dummy_feed = self.getFeed(self.url + "/%s/%s/feed" % (d.year, d.month))
                if dummy_feed.entries:
                    break
                d = (d + timedelta(days=32)).replace(day=1)  # advance one month
    if d > datetime.now():
        # reached the present: mark this forum as fully history-fetched
        acc = MongoManager.getAccount(id=self.account.getId())
        camp = acc.getCampaign(id=self.campaign.getId())
        camp.addHistoryFetchedForum(self.url)
        MongoManager.saveCampaign(acc, camp)
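# The "(d + timedelta(days=32)).replace(day=1)" idiom above always lands on the
# first day of the following month regardless of month length, since adding 32
# days to day 1 of any month falls on day 2-5 of the next one:
from datetime import datetime, timedelta
d = datetime(2015, 1, 1)
print (d + timedelta(days=32)).replace(day=1)  # 2015-02-01 00:00:00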
def getListOfUsers():
    mng = MongoManager()
    dbResult = mng.get("tests")
    # header name must not include the trailing colon
    response.set_header("Content-Type", "application/json")
    return json.dumps(dbResult)
def getLastSummarizedDate(self, campaign):
    collection_name = 'summarized_tweets_%s' % campaign.getId()
    res = MongoManager.find(collection_name, sort=("start", -1), limit=1)
    if res.count():
        return res[0]['end']
    # no summaries yet: fall back to the hour of the oldest stored tweet
    collection_name = 'tweets_%s' % campaign.getId()
    res = MongoManager.findTweets(collection_name, sort=("x_created_at", 1),
                                  limit=1)
    if res.count():
        return res[0]['x_created_at'].replace(minute=0, second=0, microsecond=0)
    return datetime.now().replace(minute=0, second=0, microsecond=0)
def processItem(self, item):
    polls_ht = MongoManager.getPollsByHashtag(max_age=timedelta(seconds=10))
    tweet = Tweet.createFromUnknownSource(item)
    for ht in tweet.getHashtags():
        if ht in polls_ht:
            for poll in polls_ht[ht]:
                #pprint("saving tweet for poll %s" % poll.getName())
                MongoManager.saveDocument("polls_" + poll.getId(),
                                          tweet.getDictionary())
    return tweet
def getSummarizedData(self, campaign, start, end):
    collection_name = 'summarized_tweets_%s' % campaign.getId()
    res = MongoManager.find(collection_name,
                            filters={'start': {"$gte": start, "$lte": end},
                                     'end': {"$lte": end}},
                            sort=('start', 1))
    timerange = list(res)
    # if the stored summaries stop short of `end`, compute the missing tail
    # on the fly
    if timerange and timerange[-1]['end'] < end:
        d = self.calculateSummarizedIntervals(campaign, timerange[-1]['end'],
                                              end, end - timerange[-1]['end'])
        timerange.extend(d)
    return timerange
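# Usage sketch (hypothetical `summarizer` and `campaign` objects): fetch the
# precomputed hourly buckets for one day and print the tweet volume per bucket.
for interv in summarizer.getSummarizedData(campaign,
                                           datetime(2015, 3, 1),
                                           datetime(2015, 3, 2)):
    print interv['start'], interv['stats']['total_tweets']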
def generateBrandClassifier(self, br):
    bc = BrandClassifier()
    bc.account_id = br.account_id
    bc.account_name = br.account_name
    bc.campaign_id = br.campaign_id
    bc.campaign_name = br.campaign_name
    bc.score_threshold = br.score_threshold
    bc.name = {br.name: br.synonyms}
    bc.brand_confidence_clues = self.genClassifierClues(br.keywords)
    for kws in br.keyword_sets:
        if kws.getId():
            bc.brand_confidence_clues.append(
                (kws.getValue(),) +
                tuple(MongoManager.getKeywordset(id=kws.getId()).getKeywords()))
    if br.rules:
        bc.brand_regexps = [(re.compile(self.getBrandRegexpFromRule(br, rule),
                                        re.I | re.U), rule)
                            for rule in br.rules]
    pr_number = 0
    for pr in br.children:
        bc.product_list.append(pr.name)
        bc.products[pr.name] = pr.synonyms
        bc.product_regexps[pr.name] = []
        for rule in pr.rules:
            bc.product_regexps[pr.name].append(
                (re.compile(self.getProductRegexpFromRule(br, pr, pr_number, rule),
                            re.I | re.U), rule))
        if pr.use_brand_id_rules:
            # brand rules containing a [P] placeholder also identify products
            for rule in br.rules:
                if rule.find("[P]") >= 0:
                    bc.product_regexps[pr.name].append(
                        (re.compile(self.getProductRegexpFromRule(br, pr,
                                                                  pr_number, rule),
                                    re.I | re.U), rule))
        pr_number += 1
        bc.product_confidence_clues[pr.name] = self.genClassifierClues(pr.keywords)
    return bc
def getAllFeedURLs(self):
    res = []
    accs = MongoManager.getActiveAccounts()
    for acc in accs:
        for camp in acc.getActiveCampaigns():
            for url in camp.getForums():
                res.append((acc, camp, url))
    return res
def generateTopicClassifier(self, topicdoc):
    tc = TopicClassifier()
    tc.topic_name = topicdoc.getName()
    tc.topic_id = str(topicdoc.getId())
    tc.topic_confidence_clues = self.genClassifierClues(topicdoc.getKeywords())
    for kws in topicdoc.getKeywordsets():
        tc.topic_confidence_clues.append(
            (kws.getValue(),) +
            tuple(MongoManager.getKeywordset(id=kws.getId()).getKeywords()))
    return tc
def getNamesDatabase(cls, **kwargs):
    max_age = kwargs.get('max_age', timedelta(seconds=0))
    if (not max_age or not cls.cached_names_database or
            datetime.now() - cls.cached_names_database['fetch_time'] > max_age):
        namesdb = MongoManager.find("gender_names")
        res = {}
        for name in namesdb:
            res[name["name"].lower()] = name["gender"]
        cls.cached_names_database = {'data': res, 'fetch_time': datetime.now()}
    return cls.cached_names_database['data']
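# Minimal, self-contained sketch of the class-level TTL cache pattern used by
# getNamesDatabase and similar lookups (illustrative names, not part of the
# original code): refetch only when the cached copy is older than max_age.
from datetime import datetime, timedelta

class CachedLookup(object):
    _cache = None  # {'data': ..., 'fetch_time': datetime}

    @classmethod
    def get(cls, fetch, max_age=timedelta(seconds=0)):
        stale = (cls._cache is None or
                 datetime.now() - cls._cache['fetch_time'] > max_age)
        if not max_age or stale:
            cls._cache = {'data': fetch(), 'fetch_time': datetime.now()}
        return cls._cache['data']

# e.g. names = CachedLookup.get(lambda: {"ana": "F"}, max_age=timedelta(seconds=300))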
def generateGnipRulesFromMongo(self):
    accounts = MongoManager.getActiveAccounts()
    rules = []
    for acc in accounts:
        for camp in acc.getActiveCampaigns():
            for fp in camp.getFacebookFanpages():
                #rules.append({"value": fp, "tag": "%s/%s/%s" % (acc.getName(), camp.getName(), fp)})
                rules.append({"value": fp, "tag": None})
    return rules
def getAllHistoryFeedURLs(self):
    res = []
    accs = MongoManager.getActiveAccounts()
    for acc in accs:
        for camp in acc.getActiveCampaigns():
            hff = camp.getHistoryFetchedForums()
            for url in camp.getForums():
                if url not in hff:
                    res.append((acc, camp, url))
    return res
def getBrandClassifiers(cls):
    # TODO: this should be cached with a max_age as well
    o = cls()
    accounts = MongoManager.getActiveAccounts(max_age=timedelta(seconds=10))
    rules = []
    for acc in accounts:
        rules.extend(o.getAccountRules(acc))
    res = []
    for r in rules:
        res.append(o.generateBrandClassifier(r))
    return res
def getFanpageToCampaignsDict(cls):
    if (not cls.fanpage_to_campaigns_max_age or
            not cls.cached_fanpage_to_campaigns or
            (datetime.now() - cls.cached_fanpage_to_campaigns['fetch_time'] >
             cls.fanpage_to_campaigns_max_age)):
        print "refetching fanpage-to-campaigns dict"
        accounts = MongoManager.getActiveAccounts()
        data = {}
        for acc in accounts:
            for camp in acc.getActiveCampaigns():
                for fp in camp.getFacebookFanpages():
                    if fp not in data:
                        data[fp] = []
                    data[fp].append(camp)
        cls.cached_fanpage_to_campaigns = {'data': data,
                                           'fetch_time': datetime.now()}
    return cls.cached_fanpage_to_campaigns['data']
def getGlobalTrendStopWords(cls, language, **kwargs):
    max_age = kwargs.get('max_age', timedelta(seconds=0))
    if (not max_age or not cls.global_trend_stop_words.get(language, None) or
            (datetime.now() - cls.global_trend_stop_words[language]['fetch_time']
             > max_age)):
        cls.global_trend_stop_words[language] = {
            'data': set(MongoManager.getGlobalTrendStopWords(language)['words']),
            'fetch_time': datetime.now()
        }
    return cls.global_trend_stop_words[language]['data']
def getTopicClassifiers(cls):
    # TODO: should be cached with a max_age; returns a dict of topics per campaign
    o = cls()
    res = {}
    accounts = MongoManager.getActiveAccounts(max_age=timedelta(seconds=10))
    for acc in accounts:
        for campaign in acc.getActiveCampaigns():
            topics = campaign.getTopics()
            if not topics:
                continue
            res[campaign.getId()] = {}
            for topic in topics:
                #topic['_id'] = topic.getId()  # is this needed?
                res[campaign.getId()][topic.getId()] = o.generateTopicClassifier(topic)
    return res
def processItem(self, feed):
    bcs = ClassifierManager.getCampaignBrandClassifiers(
        feed.account, feed.campaign)  # this should also be cached in ClassifierManager
    tcs = None
    pms = self.getBrandClassifiersByCampaign(feed.getText(), bcs)
    # TODO: also keep entries that do not match any brand but come from a user
    # the brand follows
    for cid, pmlist in pms.items():
        if tcs is None:
            tcs = ClassifierManager.getCampaignTopicClassifiers(feed.campaign)
        tms = self.getTopicClassifiers(feed.getText(), cid, tcs)
        feed.setExtractedTopics(tms)
        feed.setExtractedInfo(pmlist)
        if not self.APPLY_BRAND_FILTERS or feed.getExtractedInfo():
            mongores = MongoManager.saveDocument("feeds_%s" % feed.campaign.getId(),
                                                 feed.getDictionary())
            #print "mongo result: ", mongores
    # return nothing so feeds do not pile up in the stage's output list and
    # exhaust memory
    return None
from mongo import MongoManager

mongo_mgr = MongoManager("mongodb://*****:*****@192.168.1.14:27017/stock")
l = mongo_mgr.get_collection_names('stock')
for collection_name in l:
    if "DailyInfo_" in collection_name:
        print(collection_name)
        #mongo_mgr.drop_collection('stock', collection_name)
def clearAllSummarizedData(self, campaign_id):
    # note: this duplicates clearSummarization
    collection_name = 'summarized_tweets_%s' % campaign_id
    MongoManager.remove(collection_name, filters={})
def calculateSummarizedIntervals(self, campaign, start, end, interval,
                                 tweetlist=None):
    pprint("summarizing tweets for campaign %s between %s and %s" %
           (campaign.getName(), start, end))
    synonyms = self.getTrendWordsSynonyms(campaign)
    trend_stop_words_set = self.getTrendStopWords(campaign)
    if tweetlist is None:
        # aggregate original tweets only; retweets are excluded
        tweetlist = MongoManager.findTweets(
            "tweets_%s" % campaign.getId(),
            filters={"retweeted_status": {"$exists": False},
                     "x_created_at": {"$gte": start, "$lte": end}})
    own_fa = campaign.getOwnFollowAccounts()

    # build one empty bucket per interval
    timerange = []
    d = start
    while d < end:
        data = SumDict({'start': d, 'end': d + interval})
        data['stats'] = SumDict()
        data['stats']['total_tweets'] = 0
        data['stats']['own_tweets'] = SumDict(
            {'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['stats']['own_tweets']['retweets'] = SumDict(
            {'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['stats']['own_tweets']['favorites'] = SumDict(
            {'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['stats']['mentions'] = SumDict(
            {'total': 0, 'accounts': SumDict([(a, 0) for a in own_fa])})
        data['sentiment'] = SumDict()
        data['brand'] = SumDict()
        data['product'] = SumDict()
        data['topic'] = SumDict()
        data['gender'] = SumDict()
        data['words'] = SumDict()
        timerange.append(data)
        d = d + interval

    # drop each tweet into its bucket and update the counters
    for t in tweetlist:
        for interv in timerange:
            if interv['start'] <= t.getCreatedDate() < interv['end']:
                interv['stats']['total_tweets'] += 1
                if t.getUsername() in own_fa:
                    interv['stats']['own_tweets']['total'] += 1
                    interv['stats']['own_tweets']['accounts'][t.getUsername()] += 1
                    interv['stats']['own_tweets']['retweets']['total'] += t.getRetweetsCount()
                    interv['stats']['own_tweets']['retweets']['accounts'][t.getUsername()] += t.getRetweetsCount()
                    interv['stats']['own_tweets']['favorites']['total'] += t.getFavoritesCount()
                    # count favorites here, not retweets
                    interv['stats']['own_tweets']['favorites']['accounts'][t.getUsername()] += t.getFavoritesCount()
                for k, v in t.getFollowAccountsMentionCount().items():
                    if k in own_fa:
                        interv['stats']['mentions']['total'] += 1
                        interv['stats']['mentions']['accounts'][k] += 1
                if t.getSentiment():
                    if t.getSentiment() not in interv['sentiment']:
                        interv['sentiment'][t.getSentiment()] = {"total": 0}
                    interv['sentiment'][t.getSentiment()]['total'] += 1
                pms = t.getExtractedInfo()
                if pms:
                    pm = pms[0]
                    try:
                        interv['brand'][pm['brand']] += 1
                    except KeyError:
                        interv['brand'][pm['brand']] = 1
                    if pm['product']:
                        p = pm['brand'] + "/" + pm['product']
                        try:
                            interv['product'][p] += 1
                        except KeyError:
                            interv['product'][p] = 1
                topics = t.getExtractedTopics() or []
                for k in topics:
                    try:
                        interv['topic'][k['topic_name']]['total'] += 1
                    except KeyError:
                        interv['topic'][k['topic_name']] = {'total': 1}
                for word in self.getWordsList(t.getText()):
                    if word in trend_stop_words_set:
                        continue
                    word = word.lower()
                    nword = synonyms.get(word, word)
                    # accumulate word counts in the matching interval
                    interv['words'][nword] = interv['words'].get(nword, 0) + 1
                gender = t.getGender()
                try:
                    interv['gender'][gender]['total'] += 1
                except KeyError:
                    interv['gender'][gender] = {'total': 1}
    return timerange
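# Shape of one summarized interval bucket (illustrative values; the field
# names follow the SumDict construction above):
# {'start': datetime(...), 'end': datetime(...),
#  'stats': {'total_tweets': 42,
#            'own_tweets': {'total': 3, 'accounts': {...},
#                           'retweets': {...}, 'favorites': {...}},
#            'mentions': {'total': 5, 'accounts': {...}}},
#  'sentiment': {'positive': {'total': 10}},
#  'brand': {'Acme': 7}, 'product': {'Acme/Widget': 3},
#  'topic': {'pricing': {'total': 2}}, 'gender': {'M': {'total': 5}},
#  'words': {'promo': 12}}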
def extractGender(cls, name):
    # normalize separators, then strip everything but word chars and spaces
    nname = re.sub(ur'[_\-\.]', u' ', name)
    nname = re.sub(ur'[^\w ]+', u'', nname)
    words = [w.lower() for w in nname.split() if len(w) > 1]
    names = cls.getNamesDatabase(max_age=timedelta(seconds=300))  # 5 minutes
    # earlier words weigh more, so the first given name dominates
    k = 100
    M = 0
    F = 0
    for w in words:
        g = names.get(w, "U")
        if g == "M":
            M += k
        elif g == "F":
            F += k
        k -= 1
    if M + F == 0:
        return "U"
    if M > F:
        return "M"
    return "F"


if __name__ == "__main__":
    print GenderClassifier.getNamesDatabase()
    tweets = MongoManager.findTweets("tweets_g1", limit=40)
    for t in tweets:
        g = GenderClassifier.extractGender(t.getDisplayName())
        print t.getDisplayName(), g
    for n in ("pablo romina XX", "romina pablo"):
        g = GenderClassifier.extractGender(n)
        print n, g
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--regenerate', action="store_true", default=False)
    parser.add_argument('--account', default=None)
    parser.add_argument('--list', action="store_true", default=False)
    parser.add_argument('--start', default=None)
    parser.add_argument('--end', default=None)
    parser.add_argument('--clear', action="store_true", default=False)
    args, known = parser.parse_known_args()
    campaign = None
    if args.account:
        account = MongoManager.getAccount(name=args.account)
        if not account:
            pprint("Account %s not found" % args.account)
            exit(1)
        campaign = account.getActiveCampaigns()[0]
    summarizer = Summarizer()
    if not args.list and not args.clear:
        summarizer.start(campaign=campaign, regenerate=args.regenerate)
    elif args.clear and campaign:
        summarizer.clearSummarization(campaign)
    elif args.list and campaign and args.start and args.end:
        print args
        start = datetime.strptime(args.start, "%Y-%m-%dT%H")
        end = datetime.strptime(args.end, "%Y-%m-%dT%H")
        records = summarizer.getSummarizedData(campaign, start, end)
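# Example invocations (hypothetical script name; dates use the %Y-%m-%dT%H
# format the parser above expects):
#   python summarizer.py                      # run the endless summarization loop
#   python summarizer.py --account acme --regenerate
#   python summarizer.py --account acme --clear
#   python summarizer.py --account acme --list --start 2015-03-01T00 --end 2015-03-02T00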
from mongo import MongoManager

doc = {"nome": "Ruben2", "email": "*****@*****.**"}

print "insert item 1"
mng = MongoManager()
IDResult = mng.add("tests", doc)
print IDResult
print doc

# add() stores the generated _id back into doc; drop it to insert a fresh copy
del doc["_id"]
print "insert item 2"
IDResult = mng.add("tests", doc)
print IDResult

print "find item 2"
filterDoc = {"_id": IDResult}
resultDoc = mng.get("tests", filterDoc)
print resultDoc

print "find all items"
resultDoc = mng.get("tests")
print resultDoc
import sys
import os
import csv
from datetime import timedelta, date, datetime

import pandas as pd

try:
    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    import global_func
    import define
    from mongo import MongoManager
    from define import DB_KEY as DB_KEY
except ImportError:
    import src.global_func
    import src.define
    from src.tools.mongo import MongoManager
    from src.define import DB_KEY as DB_KEY

mongo_mgr = MongoManager("mongodb://*****:*****@192.168.1.14:27017/stock")
LOG_ENABLE = True


def normalize_file(market_type: str, file_path: str):
    with open(file_path, "r+", encoding='utf8') as f:
        text = f.read()
    # keep only rows that look like quote records (15-17 quoted fields)
    # or the header row containing the security-code column
    text_arr = [i.translate({ord(' '): None, ord('='): None}).rstrip(',')
                for i in text.split('\n')
                if (len(i.split('",')) >= 15 and len(i.split('",')) <= 17)
                or "代號" in i]
    if market_type == define.MarketType.TPEX:
        if len(text_arr) > 0:
            if "代號" in text_arr[0]:
                del text_arr[0]
        if len(text_arr) > 0:
            length = len(text_arr[0].split('",')) if text_arr else 0
            if "證券代號" not in text_arr[0]:
                if length == 15:
def generateGnipRulesFromMongo(self):
    accounts = MongoManager.getActiveAccounts()
    rules = []
    for acc in accounts:
        for camp in acc.getActiveCampaigns():
            for brand in camp.getBrands():
                fa = sorted(brand.getFollowAccounts())
                if fa:
                    rules.append({
                        "value": " OR ".join(fa),
                        "tag": "%s/%s/%s/follow accounts - mention" %
                               (acc.getName(), camp.getName(), brand.getName())})
                    clean_user_names = [x.replace("@", "") for x in fa]
                    rules.append({
                        "value": " OR ".join(["from:%s" % x for x in clean_user_names]),
                        "tag": "%s/%s/%s/follow accounts - from" %
                               (acc.getName(), camp.getName(), brand.getName())})
                    rules.append({
                        "value": " OR ".join(["to:%s" % x for x in clean_user_names]),
                        "tag": "%s/%s/%s/follow accounts - to" %
                               (acc.getName(), camp.getName(), brand.getName())})
                # brand rules
                for brule in brand.getIdentificationRules():
                    brule = brule.replace("[m]", "[M]").replace("[p]", "[P]")
                    for bsearch_keyword in brand.getSearchKeywords():
                        brand_replaced_rule = '"' + brule.replace("[M]", bsearch_keyword) + '"'
                        if brule.upper().find("[P]") >= 0:
                            for product in brand.getProducts():
                                if product.isUsingBrandIdRules():
                                    for psearch_keyword in product.getSearchKeywords():
                                        product_replaced_rule = brand_replaced_rule.replace("[P]", psearch_keyword)
                                        rules.append({
                                            "value": product_replaced_rule,
                                            "tag": "%s/%s/%s/%s: %s" %
                                                   (acc.getName(), camp.getName(),
                                                    brand.getName(), product.getName(),
                                                    brule)})
                        else:
                            rules.append({
                                "value": brand_replaced_rule,
                                "tag": "%s/%s/%s: %s" %
                                       (acc.getName(), camp.getName(),
                                        brand.getName(), brule)})
                # product rules
                for product in brand.getProducts():
                    for prule in product.getIdentificationRules():
                        prule = prule.replace("[m]", "[M]").replace("[p]", "[P]")
                        for bsearch_keyword in brand.getSearchKeywords():
                            brand_replaced_rule = '"' + prule.replace("[M]", bsearch_keyword) + '"'
                            for psearch_keyword in product.getSearchKeywords():
                                product_replaced_rule = brand_replaced_rule.replace("[P]", psearch_keyword)
                                rules.append({
                                    "value": product_replaced_rule,
                                    "tag": "%s/%s/%s/%s: %s" %
                                           (acc.getName(), camp.getName(),
                                            brand.getName(), product.getName(),
                                            prule)})
        for poll in acc.getActivePolls():
            rules.append({
                "value": " OR ".join(sorted(poll.getSearchHashtags())),
                "tag": "%s/poll %s" % (acc.getName(), poll.getName())})
    return rules
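# Illustrative expansion of the [M]/[P] placeholder convention used above
# (hypothetical keywords): a rule "[M] launches [P]" with brand keyword "acme"
# and product keyword "widget" becomes one quoted Gnip rule.
rule = "[M] launches [P]"
expanded = '"' + rule.replace("[M]", "acme").replace("[P]", "widget") + '"'
print expanded  # "acme launches widget"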
def clearSummarization(self, campaign):
    MongoManager.remove('summarized_tweets_%s' % campaign.getId())
def getUser(user):
    doc = {"_id": user}
    mng = MongoManager()
    dbResult = mng.get("tests", doc)
    response.set_header("Content-Type", "application/json")
    return json.dumps(dbResult)
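# A minimal routing sketch, assuming getListOfUsers and getUser are Bottle
# views: only response.set_header appears in the original, so the bottle
# import and the route paths are assumptions.
from bottle import route, run

route('/users')(getListOfUsers)
route('/users/<user>')(getUser)

if __name__ == '__main__':
    run(host='localhost', port=8080)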