# Expected imports for the methods below (an assumption reconstructed from
# usage: these read as @inlineCallbacks-style methods of a Twisted protocol
# class, with returnD taken to alias defer.returnValue; helpers such as
# unescape_html, grab_extra_meta, clean_url, clean_redir_urls, uniq_rt_hash,
# deferredSleep, sortdesc, next_page, parse_date, re_tweet_url and config
# come from the bot's own modules).
import time
import pymongo
from datetime import datetime, timedelta
from operator import itemgetter
from twisted.internet import defer, reactor
from twisted.internet.threads import deferToThreadPool
returnD = defer.returnValue


def process_twitter_feed(self, listtweets, feedtype, query=None, pagecount=0):
    if not listtweets:
        returnD(False)
    if query:
        # Search results arrive as a dict with 'search_metadata' and 'statuses'.
        if not isinstance(listtweets, dict):
            returnD(False)
        nexturl = ""
        if 'max_id_str' in listtweets['search_metadata']:
            nexturl = listtweets['search_metadata']['max_id_str']
        elif 'next_results' in listtweets['search_metadata']:
            # Keep only the max_id captured from the next_results query string.
            nexturl = self.re_max_id.sub(r'\1', listtweets['search_metadata']['next_results'])
        res = {'nexturl': nexturl}
        listtweets = listtweets['statuses']
    elif not isinstance(listtweets, list):
        returnD(False)
    feed = []
    for tweet in listtweets:
        if not isinstance(tweet, dict):
            continue
        tw = {'created_at': tweet['created_at'],
              'title': unescape_html(tweet['text']),
              'link': tweet['url']}
        tw = grab_extra_meta(tweet, tw)
        feed.append(tw)
    if query:
        res['tweets'] = feed
        processed = yield self.process_tweets(res, 'search', query=query, pagecount=pagecount)
    else:
        processed = yield self.process_tweets(feed, 'my%s' % feedtype)
    returnD(processed)

def process_twitter_feed(self, listtweets, feedtype, query=None, pagecount=0):
    if not listtweets:
        returnD(False)
    if query:
        if not isinstance(listtweets, dict):
            returnD(False)
        nexturl = ""
        if 'max_id_str' in listtweets['search_metadata']:
            nexturl = listtweets['search_metadata']['max_id_str']
        elif 'next_results' in listtweets['search_metadata']:
            nexturl = self.re_max_id.sub(r'\1', listtweets['search_metadata']['next_results'])
        res = {'nexturl': nexturl}
        listtweets = listtweets['statuses']
    elif not isinstance(listtweets, list):
        returnD(False)
    feed = []
    for tweet in listtweets:
        if not isinstance(tweet, dict):
            continue
        if 'entities' in tweet:
            # Pre-resolve shortened URLs from media and url entities into the
            # shared cache so later display does not have to follow redirects.
            entities = []
            for entitype in ['media', 'urls']:
                if entitype in tweet['entities']:
                    entities += tweet['entities'][entitype]
            for entity in entities:
                try:
                    if ('expanded_url' in entity and 'url' in entity
                            and entity['expanded_url']
                            and entity['url'] not in self.fact.cache_urls
                            and len(entity['expanded_url']) < 250):
                        cleaned, self.fact.cache_urls = clean_url(entity['expanded_url'].encode('utf-8'), entity['url'].encode('utf-8'), self.fact.cache_urls)
                        _, self.fact.cache_urls = yield clean_redir_urls(cleaned.decode('utf-8'), self.fact.cache_urls)
                except Exception as e:
                    self.log(e, error=True)
        # Rebuild the canonical "RT @user: text" form for native retweets.
        if "retweeted_status" in tweet and tweet['retweeted_status']['id_str'] != tweet['id_str']:
            text = "RT @%s: %s" % (tweet['retweeted_status']['user']['screen_name'], tweet['retweeted_status']['text'])
        else:
            text = tweet['text']
        tw = {'created_at': tweet['created_at'],
              'title': unescape_html(text),
              'link': "https://twitter.com/%s/status/%s" % (tweet['user']['screen_name'], tweet['id_str'])}
        tw = grab_extra_meta(tweet, tw)
        feed.append(tw)
    if query:
        res['tweets'] = feed
        processed = yield self.process_tweets(res, 'search', query=query, pagecount=pagecount)
    else:
        processed = yield self.process_tweets(feed, 'my%s' % feedtype)
    returnD(processed)
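
# The entity loop above works through two project helpers whose contract is
# only visible from the call sites: clean_url() maps a t.co short URL to its
# expanded form while recording the pair in self.fact.cache_urls, and
# clean_redir_urls() then resolves any remaining redirects. A minimal sketch
# of that contract, assuming the cache is a plain {short_url: resolved_url}
# dict; the name and body here are illustrative, not the bot's actual code:
def clean_url_sketch(expanded_url, short_url, cache_urls):
    # Normalize the expanded URL (the real helper likely also strips tracking
    # junk) and remember which short URL it came from.
    cleaned = expanded_url.strip()
    cache_urls[short_url] = cleaned
    return cleaned, cache_urls

# Mirroring the call in the loop above:
# cleaned, cache = clean_url_sketch("https://example.com/article", "https://t.co/abc123", {})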

def process_twitter_feed(self, listtweets, feedtype, query=None, pagecount=0):
    if not listtweets:
        return None
    if query:
        if not isinstance(listtweets, dict):
            return None
        nexturl = ""
        if 'next_results' in listtweets['search_metadata']:
            nexturl = self.re_max_id.sub(r'\1', listtweets['search_metadata']['next_results'])
        res = {'nexturl': nexturl}
        listtweets = listtweets['statuses']
    feed = []
    for tweet in listtweets:
        if "retweeted_status" in tweet and tweet['retweeted_status']['id_str'] != tweet['id_str']:
            text = "RT @%s: %s" % (tweet['retweeted_status']['user']['screen_name'], tweet['retweeted_status']['text'])
        else:
            text = tweet['text']
        tw = {'created_at': tweet['created_at'],
              'title': unescape_html(text),
              'link': "http://twitter.com/%s/statuses/%s" % (tweet['user']['screen_name'], tweet['id_str'])}
        tw = grab_extra_meta(tweet, tw)
        feed.append(tw)
    if query:
        res['tweets'] = feed
        return self.process_tweets(res, 'search', query=query, pagecount=pagecount)
    return self.process_tweets(feed, 'my%s' % feedtype)
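
# All three variants above reduce the API's 'next_results' query string
# (e.g. "?max_id=502839572&q=foo") to the bare max_id via
# self.re_max_id.sub(r'\1', ...). A pattern consistent with those calls --
# an assumption, since the real attribute is defined elsewhere in the class:
import re

re_max_id = re.compile(r'^.*[?&]max_id=(\d+).*$')

# re_max_id.sub(r'\1', "?max_id=502839572&q=foo") -> "502839572"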

def process_tweets(self, feed, source, query=None, pagecount=0):
    # Handle tweets from icerocket's or topsy's fake RSS feeds.
    nexturl = ""
    try:
        elements = feed.entries
    except:
        # Handle tweets from the Twitter API.
        if isinstance(feed, list) and len(feed):
            elements = feed
        elif isinstance(feed, dict) and "nexturl" in feed:
            nexturl = feed["nexturl"]
            elements = feed["tweets"]
        else:
            returnD(False)
    if query:
        source = "%s https://api.twitter.com/api/1.1/search/tweets.json?q=%s" % (source, query)
    ids = []
    hashs = []
    tweets = []
    fresh = True
    for i in elements:
        try:
            # RSS entries carry a pre-parsed timestamp (shifted from US/Eastern).
            time_tweet = time.mktime(i.get('published_parsed', '')) - 4*60*60
        except:
            if i.get('created_at', '') == "now":
                time_tweet = time.time()
            else:
                # API entries use Twitter's date format (shifted from UTC).
                time_tweet = time.mktime(time.strptime(i.get('created_at', ''), '%a %b %d %H:%M:%S +0000 %Y')) + 2*60*60
        date = datetime.fromtimestamp(time_tweet)
        if datetime.today() - date > timedelta(hours=config.BACK_HOURS):
            fresh = False
            break
        tweet, self.fact.cache_urls = yield clean_redir_urls(i.get('title', '').replace('\n', ' '), self.fact.cache_urls)
        link = i.get('link', '')
        res = re_tweet_url.search(link)
        if res:
            user = res.group(1)
            tid = long(res.group(2))
            ids.append(tid)
            tw = {'_id': "%s:%s" % (self.fact.channel, tid), 'channel': self.fact.channel,
                  'id': tid, 'user': user.lower(), 'screenname': user, 'message': tweet,
                  'uniq_rt_hash': uniq_rt_hash(tweet), 'link': link, 'date': date,
                  'timestamp': datetime.today(), 'source': source}
            tw = grab_extra_meta(i, tw)
            tweets.append(tw)
    # Delay displaying to avoid duplicates from the stream
    if source != "mystream" and not self.fact.tweets_search_page:
        yield deferredSleep()
    existings = yield self.fact.db['tweets'].find({'channel': self.fact.channel, 'id': {'$in': ids}}, fields=['_id'], filter=sortdesc('id'))
    existing = [t['_id'] for t in existings]
    news = [t for t in tweets if t['_id'] not in existing]
    if not news:
        returnD(False)
    good = []
    news.sort(key=itemgetter('id'))
    # When most of a fresh page is new, schedule crawling one page further back.
    if fresh and not source.startswith("my") and len(news) > len(elements) / 2:
        if query and nexturl and pagecount < 3 * self.fact.back_pages_limit:
            deferToThreadPool(reactor, self.threadpool, reactor.callLater, 15, self.start_twitter_search, [query], max_id=nexturl, pagecount=pagecount + 1)
        elif not query and nexturl and "p=%d" % (self.fact.back_pages_limit + 1) not in nexturl and "page=%s" % (2 * self.fact.back_pages_limit) not in nexturl:
            deferToThreadPool(reactor, self.threadpool, reactor.callLater, 41, self.start, nexturl)
        elif not query and not nexturl and int(source[-1:]) <= self.fact.back_pages_limit:
            deferToThreadPool(reactor, self.threadpool, reactor.callLater, 41, self.start, next_page(source))
    if self.fact.displayRT:
        good = news
    else:
        # Collect the distinct retweet hashes for the $in query.
        hashs = list(set(t['uniq_rt_hash'] for t in news))
        existings = yield self.fact.db['tweets'].find({'channel': self.fact.channel, 'uniq_rt_hash': {'$in': hashs}}, fields=['uniq_rt_hash'], filter=sortdesc('id'))
        existing = [t['uniq_rt_hash'] for t in existings]
        for t in news:
            # Keep own tweets, first occurrences, and mentions of the bot's user.
            if self.fact.twuser == t['user'] or t['uniq_rt_hash'] not in existing or (self.fact.displayMyRT and "@%s" % self.fact.twuser in t['message'].lower()):
                existing.append(t['uniq_rt_hash'])
                good.append(t)
    if config.DEBUG:
        nb_rts_str = ""
        nb_rts = len(news) - len(good)
        if nb_rts:
            nb_rts_str = " (%s RTs filtered)" % nb_rts
        self.log("Displaying %s tweets%s" % (len(good), nb_rts_str), hint=True)
    if self.fact.status != "closed":
        for t in good:
            msg = "%s: %s" % (t['screenname'].encode('utf-8'), self.format_tweet(t))
            self.fact.ircclient._send_message(msg, self.fact.channel)
    for t in news:
        yield self.fact.db['tweets'].save(t, safe=True)
    returnD(True)
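
# The variant above runs under Twisted's @inlineCallbacks style and leans on
# three small helpers. A minimal sketch under the assumption that returnD
# aliases defer.returnValue, deferredSleep pauses the generator without
# blocking the reactor, and sortdesc builds a txmongo descending-sort filter
# (illustrative, not the bot's actual code):
from twisted.internet import defer, reactor
from txmongo import filter as mongo_filter

returnD = defer.returnValue

def deferredSleep(seconds=5):
    # A Deferred that fires after `seconds`; yielding it lets the reactor
    # keep servicing other events in the meantime.
    d = defer.Deferred()
    reactor.callLater(seconds, d.callback, None)
    return d

def sortdesc(field):
    # Descending sort on `field` for txmongo's find(filter=...) keyword.
    return mongo_filter.sort(mongo_filter.DESCENDING(field))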

def process_tweets(self, feed, source, query=None, pagecount=0):
    # Handle tweets from icerocket's or topsy's fake RSS feeds.
    nexturl = ""
    try:
        elements = feed.entries
    except:
        # Handle tweets from the Twitter API.
        if isinstance(feed, list) and len(feed):
            elements = feed
        elif isinstance(feed, dict) and "nexturl" in feed:
            nexturl = feed["nexturl"]
            elements = feed["tweets"]
        else:
            returnD(False)
    if query:
        source = "%s https://api.twitter.com/api/1.1/search/tweets.json?q=%s" % (source, query)
    ids = []
    hashs = []
    tweets = []
    fresh = True
    for i in elements:
        try:
            # RSS entries carry a pre-parsed timestamp (shifted from US/Eastern).
            date = datetime.fromtimestamp(time.mktime(i.get('published_parsed', '')) - 4*60*60)
        except:
            if i.get('created_at', '') == "now":
                date = datetime.now()
            else:
                #date = datetime.strptime(i.get('created_at', ''), '%a %b %d %H:%M:%S +0000 %Y') + timedelta(hours=2)
                date = parse_date(i.get('created_at', ''))
        if datetime.today() - date > timedelta(hours=config.BACK_HOURS):
            fresh = False
            break
        tweet, self.fact.cache_urls = yield clean_redir_urls(i.get('title', '').replace('\n', ' '), self.fact.cache_urls)
        link = i.get('link', '')
        res = re_tweet_url.search(link)
        if res:
            user = res.group(1)
            tid = long(res.group(2))
            ids.append(tid)
            tw = {'_id': "%s:%s" % (self.fact.channel, tid), 'channel': self.fact.channel,
                  'id': tid, 'user': user.lower(), 'screenname': user, 'message': tweet,
                  'uniq_rt_hash': uniq_rt_hash(tweet), 'link': link, 'date': date,
                  'timestamp': datetime.today(), 'source': source}
            tw = grab_extra_meta(i, tw)
            tweets.append(tw)
    # Delay displaying to avoid duplicates from the stream
    if source != "mystream" and not self.fact.tweets_search_page:
        yield deferredSleep()
    existings = yield self.fact.db['tweets'].find({'channel': self.fact.channel, 'id': {'$in': ids}}, fields=['_id'], filter=sortdesc('id'))
    existing = [t['_id'] for t in existings]
    news = [t for t in tweets if t['_id'] not in existing]
    if not news:
        returnD(False)
    good = []
    news.sort(key=itemgetter('id'))
    # When most of a fresh page is new, schedule crawling one page further back.
    if fresh and not source.startswith("my") and len(news) > len(elements) / 2:
        if query and nexturl and pagecount < 3 * self.fact.back_pages_limit:
            deferToThreadPool(reactor, self.threadpool, reactor.callLater, 15, self.start_twitter_search, [query], max_id=nexturl, pagecount=pagecount + 1)
        elif not query and nexturl and "p=%d" % (self.fact.back_pages_limit + 1) not in nexturl and "page=%s" % (2 * self.fact.back_pages_limit) not in nexturl:
            deferToThreadPool(reactor, self.threadpool, reactor.callLater, 41, self.start_web, nexturl)
        elif not query and not nexturl and int(source[-1:]) <= self.fact.back_pages_limit:
            deferToThreadPool(reactor, self.threadpool, reactor.callLater, 41, self.start_web, next_page(source))
    if self.fact.displayRT:
        good = news
    else:
        # Collect the distinct retweet hashes for the $in query.
        hashs = list(set(t['uniq_rt_hash'] for t in news))
        existings = yield self.fact.db['tweets'].find({'channel': self.fact.channel, 'uniq_rt_hash': {'$in': hashs}}, fields=['uniq_rt_hash'], filter=sortdesc('id'))
        existing = [t['uniq_rt_hash'] for t in existings]
        for t in news:
            # Keep own tweets, first occurrences, and mentions of the bot's user.
            if self.fact.twuser == t['user'] or t['uniq_rt_hash'] not in existing or (self.fact.displayMyRT and "@%s" % self.fact.twuser in t['message'].lower()):
                existing.append(t['uniq_rt_hash'])
                good.append(t)
    if config.DEBUG:
        nb_rts_str = ""
        nb_rts = len(news) - len(good)
        if nb_rts:
            nb_rts_str = " (%s RTs filtered)" % nb_rts
        self.log("Displaying %s tweets%s" % (len(good), nb_rts_str), hint=True)
    if self.fact.status != "closed":
        for t in good:
            msg = "%s: %s" % (t['screenname'].encode('utf-8'), self.format_tweet(t))
            self.fact.ircclient._send_message(msg, self.fact.channel)
    for t in news:
        yield self.fact.db['tweets'].save(t, safe=True)
    returnD(True)

def process_tweets(self, feed, source, query=None, pagecount=0):
    # Handle tweets from icerocket's or topsy's fake RSS feeds.
    nexturl = ""
    try:
        elements = feed.entries
    except:
        # Handle tweets from the Twitter API.
        if isinstance(feed, list) and len(feed):
            elements = feed
        elif isinstance(feed, dict) and "nexturl" in feed:
            nexturl = feed["nexturl"]
            elements = feed["tweets"]
        else:
            defer.returnValue(None)
    if query:
        source = "%s https://api.twitter.com/api/1.1/search/tweets.json?q=%s" % (source, query)
    ids = []
    hashs = []
    tweets = []
    fresh = True
    for i in elements:
        try:
            time_tweet = time.mktime(i.get('published_parsed', '')) - 4*60*60
        except:
            if i.get('created_at', '') == "now":
                time_tweet = time.time()
            else:
                time_tweet = time.mktime(time.strptime(i.get('created_at', ''), '%a %b %d %H:%M:%S +0000 %Y')) + 2*60*60
        date = datetime.fromtimestamp(time_tweet)
        if datetime.today() - date > timedelta(hours=config.BACK_HOURS):
            fresh = False
            break
        tweet, self.fact.cache_urls = yield clean_redir_urls(i.get('title', '').replace('\n', ' '), self.fact.cache_urls, pool=self.threadpool)
        # No-op as written; presumably a character substitution whose target
        # character was mangled in transcription.
        tweet = tweet.replace('~', '~')
        link = i.get('link', '')
        res = re_tweet_url.search(link)
        if res:
            user = res.group(1)
            tid = long(res.group(2))
            ids.append(tid)
            tw = {'_id': "%s:%s" % (self.fact.channel, tid), 'channel': self.fact.channel,
                  'id': tid, 'user': user.lower(), 'screenname': user, 'message': tweet,
                  'uniq_rt_hash': uniq_rt_hash(tweet), 'link': link, 'date': date,
                  'timestamp': datetime.today(), 'source': source}
            tw = grab_extra_meta(i, tw)
            tweets.append(tw)
    existing = [t['_id'] for t in self.db['tweets'].find({'channel': self.fact.channel, 'id': {'$in': ids}}, fields=['_id'], sort=[('id', pymongo.DESCENDING)])]
    news = [t for t in tweets if t['_id'] not in existing]
    if news:
        good = 0
        news.reverse()
        # When most of a fresh page is new, crawl one page further back.
        if fresh and not source.startswith("my") and len(news) > len(elements) / 2:
            if query and nexturl and pagecount < self.fact.back_pages_limit:
                yield self.start_twitter_search([query], max_id=nexturl, pagecount=pagecount + 1)
            elif not query and nexturl and "p=%d" % (self.fact.back_pages_limit + 1) not in nexturl and "page=%s" % (2 * self.fact.back_pages_limit) not in nexturl:
                reactor.callFromThread(reactor.callLater, 41, self.start, nexturl)
            elif not query and not nexturl and int(source[-1:]) <= self.fact.back_pages_limit:
                reactor.callFromThread(reactor.callLater, 41, self.start, next_page(source))
        if not self.fact.displayRT:
            # Collect the distinct retweet hashes for the $in query.
            hashs = list(set(t['uniq_rt_hash'] for t in news))
            existing = [t['uniq_rt_hash'] for t in self.db['tweets'].find({'channel': self.fact.channel, 'uniq_rt_hash': {'$in': hashs}}, fields=['uniq_rt_hash'], sort=[('id', pymongo.DESCENDING)])]
            for t in news:
                if t['uniq_rt_hash'] not in existing:
                    existing.append(t['uniq_rt_hash'])
                    self.displayTweet(t)
                    good += 1
        else:
            for t in news:
                self.displayTweet(t)
            good = len(news)  # everything was displayed, so nothing counts as a filtered RT
        if config.DEBUG:
            nb_rts_str = ""
            nb_rts = len(news) - good
            if nb_rts:
                nb_rts_str = " (%s RTs filtered)" % nb_rts
            self.log("Displaying %s tweets%s" % (good, nb_rts_str), self.fact.database, hint=True)
        self.db['tweets'].insert(news, continue_on_error=True, safe=True)
    defer.returnValue(None)
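
# The process_tweets variants assume a few more helpers that can be sketched
# from their call sites: parse_date (second variant) converts Twitter's
# created_at strings, re_tweet_url captures the screen name and status id
# from a permalink, and next_page bumps the trailing page number of a source
# URL. These are reconstructions, not the bot's actual definitions:
import re
import time
from datetime import datetime

re_tweet_url = re.compile(r'twitter\.com/([^/]+)/statuse?s?/(\d+)', re.I)

def parse_date(created_at):
    # 'Sat Sep 10 22:23:38 +0000 2011' -> naive datetime in server local time.
    return datetime.fromtimestamp(time.mktime(time.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')))

def next_page(source):
    # Sources end with a page number ("...page=2"); increment it.
    return re.sub(r'(\d+)$', lambda m: str(int(m.group(1)) + 1), source)

# re_tweet_url.search("https://twitter.com/user/status/123").groups() -> ('user', '123')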