def pull_deleted(db, api, twitterapi, uid, nort=False):
    """Hand the ids of tweets flagged as deleted to ``add100`` in batches.

    Every document in ``db.tweets`` with ``deleted: True`` is visited
    (restricted to ``user.id == uid`` when ``uid`` is truthy).  Ids are
    passed to ``add100`` one hundred at a time, followed by one final call
    for any remainder.

    Args:
        db: database handle exposing a ``tweets`` collection.
        api: forwarded unchanged to ``add100``.
        twitterapi: forwarded unchanged to ``add100``.
        uid: user id to restrict the scan to; a falsy value scans all users.
        nort: when true, tweets that carry a ``retweeted_status`` key are
            left out of the batches.
    """
    query = {'deleted': True}
    if uid:
        query['user.id'] = uid
    cursor = db.tweets.find(query)

    if verbose():
        progress = Bar("Processing:", max=cursor.count(),
                       suffix='%(index)d/%(max)d - %(eta_td)s')
        cursor = progress.iter(cursor)

    batch = []
    for doc in cursor:
        # The id is read before the retweet filter, as in the original.
        tweet_id = doc['id']
        if nort and 'retweeted_status' in doc:
            continue
        batch.append(tweet_id)
        if len(batch) == 100:
            add100(db, api, twitterapi, batch)
            batch = []

    # Flush whatever did not fill a complete batch.
    if batch:
        add100(db, api, twitterapi, batch)
def pull_favorited(db, api, twitterapi):
    """Queue favorited tweets that are not stored yet for download.

    Walks every document in ``db.favorites`` whose ``pulled`` field is unset.
    If a tweet with the same id already exists in ``db.tweets`` the favorite
    is marked ``pulled: True``.  Otherwise its id is collected and handed to
    ``add100`` in batches of 100, with a final call for the remainder.

    Args:
        db: database handle exposing ``favorites`` and ``tweets`` collections.
        api: forwarded unchanged to ``add100``.
        twitterapi: forwarded unchanged to ``add100``.
    """
    favs = db.favorites.find({'pulled': None}).batch_size(100)
    idlist = []
    if verbose():
        favs = Bar("Processing:", max=favs.count(),
                   suffix='%(index)d/%(max)d - %(eta_td)s').iter(favs)
    for f in favs:
        twid = f['tweet_id']
        if db.tweets.find_one({'id': twid}) is not None:
            # Tweet is already stored: just flag the favorite as handled.
            db.favorites.update(f, {'$set': {'pulled': True}})
            continue
        idlist.append(twid)
        if verbose():
            # print() with one pre-formatted argument emits the same text
            # ("  <id>") on Python 2 and 3; the old print statement was a
            # SyntaxError on Python 3.
            print(u"  {}".format(twid))
        if len(idlist) == 100:
            add100(db, api, twitterapi, idlist)
            idlist = []
    # Flush the last, partially filled batch.
    if idlist:
        add100(db, api, twitterapi, idlist)
def pull_quoted(db, api, twitterapi):
    """Resolve quoted tweets, from the local store or via ``add100``.

    Visits tweets that have a positive ``quoted_status_id`` and no
    ``quote_pulled`` flag.  When the quoted tweet is already in
    ``db.tweets`` it is embedded as ``quoted_status`` (if that field is
    absent) and the quoting tweet is flagged ``quote_pulled``.  Otherwise
    the quoted id is collected, without duplicates inside the current
    batch, and handed to ``add100`` once 100 ids have accumulated, with a
    final call for the remainder.

    Args:
        db: database handle exposing a ``tweets`` collection.
        api: forwarded unchanged to ``add100``.
        twitterapi: forwarded unchanged to ``add100``.
    """
    selector = {'quoted_status_id': {'$gt': 0}, 'quote_pulled': None}
    fields = {'quoted_status_id': 1, 'quoted_status': 1, 'id': 1}
    cursor = db.tweets.find(selector, fields)
    if verbose():
        progress = Bar("Processing:", max=cursor.count(),
                       suffix='%(index)d/%(max)d - %(eta_td)s')
        cursor = progress.iter(cursor)

    pending = []
    for quoting in cursor:
        quoted_id = quoting['quoted_status_id']

        if quoted_id is None:
            # Defensive branch: the selector above should already exclude
            # documents without a usable quoted id.
            db.tweets.update(quoting, {'$unset': {'quoted_status_id': 1}})
            print("point 1: this should never? be reached, i think")
            continue

        stored = db.tweets.find_one({'id': quoted_id})
        if stored:
            if 'quoted_status' not in quoting:
                # Drop the storage key before embedding the document.
                del stored['_id']
                db.tweets.update_one(quoting,
                                     {'$set': {'quoted_status': stored}})
                if verbose():
                    print(u"filled in tweet {} into {}".format(
                        quoted_id, quoting['id']))
            db.tweets.update(quoting, {'$set': {'quote_pulled': True}})
            continue

        if quoted_id not in pending:
            pending.append(quoted_id)
            if verbose():
                print(" ", quoted_id)
        if len(pending) >= 100:
            add100(db, api, twitterapi, pending)
            pending = []

    # Flush the ids that did not fill a complete batch.
    if pending:
        add100(db, api, twitterapi, pending)
criteria = {} if options.before: criteria['$lte'] = dateutil.parser.parse(options.before) if options.after: criteria['$gt'] = dateutil.parser.parse(options.after) #edges = db.favorites.find({}, {'user_id':1, 'tweet_id':1}).sort('user_id', 1).batch_size(10) if options.before or options.after: tweets = db.tweets.find({'created_at': criteria}, { 'id': 1, 'user.id': 1 }) if verbose: tweets = Bar("Loading:", max=tweets.count(), suffix='%(index)d/%(max)d - %(eta_td)s').iter(tweets) tweets = list(tweets) edgecnt = scan_by_tweets(db, tweets) save_edgelist(db, edgecnt, options.filename, weight=True) elif options.user: uid = int(options.user) if options.ids else lookup_user( db, uname=options.user).get('id', -1) tweets = db.tweets.find({'user.id': uid}, {'id': 1, 'user.id': 1}) tweets = list(tweets) edgecnt = scan_by_tweets(db, tweets) if options.dot: save_dot(db, edgecnt, options.filename, weight=True) else: save_edgelist(db, edgecnt, options.filename, weight=True) else:
# Authenticate against Twitter with the credentials kept in ``config``.
auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_secret)
auth.set_access_token(config.access_token, config.access_token_secret)
api = tweepy.API(auth)

for user in args:
    # Each positional argument is a numeric id (with options.ids) or a
    # name.  int() replaces long(), which does not exist on Python 3.
    uid = int(user) if options.ids else None
    uname = None if options.ids else user
    u = lookup_user(db, uid, uname)
    if u is None:
        print("{} {} not found".format(uid, uname))
        # Without this, the u['id'] lookups below raised TypeError for an
        # unknown user.
        continue

    if options.scan:
        # Re-check, 100 ids at a time, the user's tweets that are not yet
        # flagged as deleted.  The list add100 returns is reported as the
        # deleted ones.  NOTE(review): twitterapi is not defined in this
        # excerpt -- confirm it is set earlier in the file.
        tweets = db.tweets.find(
            {'user.id': u['id'], 'deleted': None}).sort('created_at', 1)
        idlist = []
        for t in tweets:
            idlist.append(t['id'])
            if len(idlist) == 100:
                idlist = add100(db, api, twitterapi, idlist)
                print(u'found {} deleted'.format(len(idlist)))
                idlist = []
        # Only flush a non-empty remainder, matching the guarded final
        # add100 call used by the pull_* helpers.
        if idlist:
            idlist = add100(db, api, twitterapi, idlist)
            print(u'found {} deleted'.format(len(idlist)))

    # List this user's tweets flagged as deleted, oldest first.
    tweets = db.tweets.find(
        {'deleted': True, 'user.id': u['id']}).sort('created_at', 1)
    if verbose():
        tweets = Bar("Processing:", max=tweets.count(),
                     suffix='%(index)d/%(max)d - %(eta_td)s').iter(tweets)
    for t in tweets:
        if options.nort and 'retweeted_status' in t:
            continue
        print(u'{} {} {}: {}'.format(
            t.get('id', '-'),
            t.get('created_at', None),
            u['screen_name_lower'],
            t.get('text', '<not found>')).encode('utf-8'))
if __name__ == '__main__':
    # Command line: only a verbosity switch is defined here.
    parser = optparse.OptionParser()
    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False,
                      help='List names of tracked users')
    (options, args) = parser.parse_args()
    verbose(options.verbose)
    db, api = init_state()

    # Iterate over every followed user, with a progress bar when verbose.
    users = db.following.find().batch_size(100)
    if verbose():
        users = Bar("Processing:", max=users.count(),
                    suffix='%(index)d/%(max)d - %(eta_td)s').iter(users)
    for u in users:
        uid = u['id']
        us = lookup_user(db, uid)
        cdata = db.crawlerdata.find_one({'id': uid})
        # Today's UTC date at midnight, as a datetime.
        d = datetime.utcnow().date()
        d = datetime(d.year, d.month, d.day)
        #if us.get('deleted', False):
        #    print "User marked deleted. Skip."
        #    continue
        # Skip users whose profile was downloaded within the last 30 days.
        # The epoch default is written without leading zeros because
        # literals such as 01 are a SyntaxError on Python 3.  A missing
        # crawlerdata record (find_one returned None) is treated like a
        # record with no download date instead of raising AttributeError.
        last_download = (cdata or {}).get('downloaded_profile_date',
                                          datetime(1970, 1, 1, 0, 0, 0))
        if last_download > (d - timedelta(days=30)):
            #if verbose(): print "Picture already downloaded. Skip."
            continue
if verbose(): cursor = Bar('Loading:', max=db.tweets.count(), suffix = '%(index)d/%(max)d - %(eta_td)s').iter(cursor) for t in cursor: out_urls = [] if 'urls' not in t or t['urls'] is None: continue for url in t['urls']: try: out_url = deshorten_url(db, url) except pymongo.errors.WriteError: continue if out_url: out_urls.append(out_url) db.tweets.update_one({'id': t['id']}, {'$set': {'deshorten': True}}) #if len(out_urls): #db.tweets.update_one({'id': t['id']}, {'$set': {'urls': out_urls}}) else: if options.user: cursor = db.users.find({'id': u['id'], 'url': {'$ne': None}}).batch_size(10) else: cursor = db.users.find({'url': {'$ne': None}}).batch_size(10) cnt = cursor.count() if verbose(): cursor = Bar('Loading:', max=cnt, suffix = '%(index)d/%(max)d - %(eta_td)s').iter(cursor) print u'Found {}'.format(cnt) for u in cursor: url = u['url'] out_url = deshorten_url(db, url) #if out_url: #db.users.update_one(u, {'$set': {'url': out_url}})
# Local aliases for the shared twkit caches.
names = twkit.utils.cache['names']
ids = twkit.utils.cache['ids']
ignored = twkit.utils.cache['ign']
dead = twkit.utils.cache['dead']
suspended = twkit.utils.cache['susp']
protected = twkit.utils.cache['prot']
greek = twkit.utils.cache['gr']

seen = Counter()
unseen = Counter()

# Retweets whose original is in the configured language; only the fields
# needed below are fetched.
cursor = db.tweets.find(
    {'retweeted_status.lang': config.lang},
    {'user': 1, 'retweeted_status': 1},
)
if verbose():
    cursor = Bar(
        "Adding:",
        max=cursor.count(),
        suffix='%(index)d/%(max)d - %(eta_td)s ',
    ).iter(cursor)

for tweet in cursor:
    whoid = tweet["user"]["id"]
    # Skip retweeters found in any exclusion cache, checked in the
    # original order: dead, ignored, protected, suspended.
    if any(whoid in group
           for group in (dead, ignored, protected, suspended)):
        continue
    # Try the cache first and fall back to a database lookup.
    u = names.get(whoid)
    if u is None:
        u = lookup_user(db, uid=whoid)
    if u is None:
        unseen[whoid] += 1
        continue
    # With options.user set, keep only that user's retweets.
    if options.user and user != u['screen_name_lower']:
        continue
action="store", type="int", dest="stopafter", default=None, help="Scan the given number of users") (options, args) = parser.parse_args() verbose(options.verbose) db, api = init_state() if options.suspended: userlist = db.suspended.find() else: userlist = db.protected.find() if options.stopafter: current = 0 if verbose(): userlist = Bar("Loading:", max=userlist.count(), suffix='%(index)d/%(max)d - %(eta_td)s').iter(userlist) for user in userlist: uid = long(user['id']) if not options.suspended and is_protected(db, uid): continue if options.suspended and is_suspended(db, uid): continue follow_user(db, api, uid=uid, wait=True, refollow=True) if options.stopafter: current += 1 if current == options.stopafter: break
dest='users', default=False, help='Also output user id.') (options, args) = parser.parse_args() db, _ = init_state(use_cache=False, ignore_api=True) verbose(options.verbose) criteria = defaultdict(lambda: {}) if options.after: criteria['event_start'].update( {'$gte': dateutil.parser.parse(options.after)}) if options.before: criteria['event_start'].update( {'$lte': dateutil.parser.parse(options.before)}) botsfound = db.botsperweek.find(dict(criteria)) if verbose(): botsfound = Bar( "Loading:", max=botsfound.count(), suffix='%(index)d/%(max)d - %(eta_td)s').iter(botsfound) for v in botsfound: for tid in v['tweet_ids']: tw = db.tweets.find_one({'id': tid}) if options.users: print(u'{} {}'.format(tw['user']['id'], tw['source']).encode('utf-8')) else: print(u'{}'.format(tw['source']).encode('utf-8'))
# Command line: a verbosity switch plus two mutually exclusive ways of
# choosing which users to include (--vectorized is checked first).
parser = optparse.OptionParser()
parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, help='List names of tracked users')
parser.add_option('--vectorized', action='store_true', dest='vectorized', default=False, help='List only vectorized users.')
parser.add_option('--greek', action='store_true', dest='greek', default=False, help='List only greek users.')
(options, args) = parser.parse_args()
db,api = init_state(use_cache=False)
# Parallel lists filled below, one entry per user.
twittercounts = []
crawlercounts = []
if options.vectorized:
    # Both counts come straight from the stored user vectors.
    vectors = db.uservectors.find({}, {'tweet_count': 1, 'seen_total': 1})
    # Verbosity is read from options.verbose directly in this script.
    if options.verbose:
        vectors = Bar("Processing:", max=vectors.count(), suffix = '%(index)d/%(max)d - %(eta_td)s').iter(vectors)
    for v in vectors:
        twittercounts.append(v['tweet_count'])
        crawlercounts.append(v['seen_total'])
elif options.greek:
    # One document per cursor batch, since each one triggers an aggregation.
    greeks = db.greeks.find().batch_size(1)
    if options.verbose:
        greeks = Bar("Processing:", max=greeks.count(), suffix = '%(index)d/%(max)d - %(eta_td)s').iter(greeks)
    for g in greeks:
        # Count this user's stored tweets; the remaining aggregate()
        # arguments lie beyond this excerpt.
        cursor = db.tweets.aggregate([
            { '$match': { 'user.id' : g['id'] } },
            { '$group': { '_id': '$user.id', 'count': {'$sum': 1} } }],
#!/usr/bin/python3
# -*- coding: utf-8 -*-

###########################################
# (c) 2016-2020 Polyvios Pratikakis
# [email protected]
###########################################

"""Print, for every id found in db.users, how many documents carry it."""

from collections import Counter
from progress.bar import Bar
from twkit.utils import *

if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option(
        "-v", "--verbose",
        action="store_true",
        dest="verbose",
        default=False,
        help="make noise.",
    )
    (options, args) = parser.parse_args()
    verbose(options.verbose)
    db, api = init_state(True)

    # Only the id field is needed; the progress bar is always shown.
    id_docs = db.users.find({}, {'id': 1})
    progress = Bar("Processing:", max=id_docs.count(),
                   suffix='%(index)d/%(max)d - %(eta_td)s')
    tally = Counter(doc['id'] for doc in progress.iter(id_docs))

    # One "<id> : <occurrences>" line per distinct id, in ascending order.
    for user_id in sorted(tally):
        print(u'{} : {}'.format(user_id, tally[user_id]))