def load_watchlist():
    """Reload the watchlist file into the global `watchlist`.

    Each non-empty line is a username (or a '#keyword'); a leading ';'
    marks the entry as disabled but still tracked.  New (never-fetched)
    accounts are queued in `new_accounts`, per-user directories are
    created on demand, and `json_loads()` / per-user json reloads only
    run when the list content actually changed (md5 of the joined list).
    """
    global watchlist, wl_hash, has_keywords
    wl = []
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(args.watchlist, 'r') as f:
        for line in f:
            line = line.rstrip().lower()
            if line.startswith(';'):
                # disabled entry: strip the marker, remember it as disabled
                username = line[1:]
                disabled_users[username] = True
            else:
                username = line
            if not username:
                # blank line or a lone ';' — username[0] below would IndexError
                continue
            if username[0] == '#' and not has_keywords:
                has_keywords = True
            if username[0] != '#' and not os.path.exists(paths.get_user_json(username)):
                new_accounts.append(username)
            if not os.path.exists(paths.get_user(username)):
                retry_makedirs(paths.get_user(username))
            wl.append(username)
    # .encode() keeps md5() valid on python3 and is a no-op change on python2
    newhash = hashlib.md5(''.join(wl).encode('utf-8')).hexdigest()
    if newhash != wl_hash:
        print('reloading watchlist')
        wl_hash = newhash
        watchlist = wl
        json_loads()
        # keyword ('#') entries need every known user's json in memory
        if has_keywords and os.path.exists('users'):
            for file in os.listdir('users'):
                d = os.path.join('users', file)
                if os.path.isdir(d):
                    load_user_json(file)
def load_watchlist():
    """Reload the watchlist file into the global `watchlist`.

    Each non-empty line is a username; a leading ';' marks the entry as
    disabled but still tracked.  Accounts without an existing json file
    are queued in `new_accounts`, per-user directories are created on
    demand, and `json_loads()` only runs when the list content actually
    changed (md5 of the joined list).
    """
    global watchlist, wl_hash
    wl = []
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(args.watchlist, 'r') as f:
        for line in f:
            line = line.rstrip()
            if line.startswith(';'):
                # disabled entry: strip the marker, remember it as disabled
                username = line[1:]
                disabled_users[username] = True
            else:
                username = line
            if not username:
                # skip blank lines / a lone ';' instead of registering '' as a user
                continue
            if not os.path.exists(paths.get_user_json(username)):
                new_accounts.append(username)
            if not os.path.exists(paths.get_user(username)):
                retry_makedirs(paths.get_user(username))
            wl.append(username)
    # .encode() keeps md5() valid on python3 and is a no-op change on python2
    newhash = hashlib.md5(''.join(wl).encode('utf-8')).hexdigest()
    if newhash != wl_hash:
        print('reloading watchlist')
        wl_hash = newhash
        watchlist = wl
        json_loads()
def scrape(user, http, host):
    """Fetch new tweets for `user`, store them, and mirror media.

    Returns the (possibly re-established) `http` connection and `host`
    so the caller can reuse them for the next account.  New accounts get
    a bounded fetch (`args.count`); known accounts fetch everything newer
    than what we have (`count=-1` plus the more-tweets callback).
    """
    global nitters
    if user in new_accounts:
        count = args.count
        checkfn = None
        new_accounts.remove(user)
    else:
        checkfn = fetch_more_tweets_callback
        count = -1
    elapsed_time = time.time()
    insert_pos = 0
    sys.stdout.write('\r[%s] scraping %s... ' % (get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), user))
    sys.stdout.flush()
    twats, nitters, host, http = get_twats(user, proxies=args.proxy, count=count,
                                           http=http, checkfn=checkfn,
                                           nitters=nitters, host=host)
    new = False
    for t in twats:
        if not in_twatlist(user, t):
            new = True
            if args.unshorten:
                t = unshorten_urls(t, proxies=args.proxy, shorteners=shorteners)
            add_twatlist(user, t, insert_pos)
            insert_pos += 1
            if 'quote_tweet' in t:
                # BUG FIX: original wrote t[quote_tweet] (bare name) which
                # raised NameError whenever a quote tweet was present.
                quoted_user = t['quote_tweet']['user']
                if not os.path.isdir(paths.get_user(quoted_user)):
                    retry_makedirs(paths.get_user(quoted_user))
                fetch_profile_picture(quoted_user, args.proxy, twhttp=nitter_rshttp, nitters=nitters)
            if 'user' in t:
                if not os.path.isdir(paths.get_user(t['user'])):
                    retry_makedirs(paths.get_user(t['user']))
                fetch_profile_picture(t['user'], args.proxy, twhttp=nitter_rshttp, nitters=nitters)
            if args.mirror:
                mirror_twat(t, args=args)
            sys.stdout.write('\r[%s] scraping %s... +%d ' % (get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), user, insert_pos))
            sys.stdout.flush()
    if new:
        write_user_tweets(user)
    elapsed_time = (time.time() - elapsed_time)
    sys.stdout.write('done (%s)\n' % get_timestamp("%H:%M:%S", elapsed_time))
    sys.stdout.flush()
    return http, host
print('youtube-dl not found, videos won\'t be downloaded (path: %s)' % args.ytdl) args.mirror = args.mirror.replace('v','') if args.mirror_size > 0: args.mirror_size = args.mirror_size * 1024*1024 shorteners = {} if args.unshorten: with open('shorteners.txt', 'r') as f: for i in f.readlines(): i = i.strip() if len(i): shorteners[i] = True if args.dir: if not os.path.exists(args.dir): retry_makedirs(args.dir) for d in site_dirs: if not os.path.exists(args.dir + d): os.symlink(os.getcwd() + d, args.dir + d) os.chdir(args.dir) args.proxy = [RocksockProxyFromURL(args.proxy)] if args.proxy else None nitter_rshttp = None host = None load_watchlist() ## resume/retry mirroring process mirroring_done = threading.Event() if args.resume and args.mirror:
def scrape(item, http, host, search, user_agent):
    """Fetch new posts for `item` (a username, '#keyword', or mastodon
    'user@host@instance' handle), store them, and mirror media.

    Returns the (possibly re-established) `http` connection and `host`.
    Hashtag items collect tweets from many users, so insertion positions
    are tracked per user.
    """
    global nitters
    global mastodon_rshttp
    item = item.lower()
    if item in new_accounts:
        count = args.count
        checkfn = None
        new_accounts.remove(item)
    else:
        checkfn = fetch_more_tweets_callback
        count = args.count if item[0] == '#' else -1
    # BUG FIX: `page` was only assigned on the twitter branch; the status
    # line below referenced it and raised NameError for mastodon items.
    page = 0
    if item.count('@') < 2:
        platform = 'twitter'
        # NOTE(review): this branch passes the bare globals
        # blacklist/whitelist while the mastodon branch uses
        # args.blacklist/args.whitelist — confirm which is intended.
        twats, nitters, host, http, page = get_twats(item, proxies=args.proxy, count=count,
                                                     http=http, checkfn=checkfn,
                                                     nitters=nitters, host=host, search=search,
                                                     user_agent=user_agent,
                                                     blacklist=blacklist, whitelist=whitelist)
    else:
        platform = 'mastodon'
        twats, http = get_toots(item, proxies=args.proxy, count=count, http=http,
                                checkfn=checkfn, user_agent=user_agent,
                                blacklist=args.blacklist, whitelist=args.whitelist)
        # cache the connection per instance for later profile fetches
        mastodon_rshttp[host] = http
    insert_pos = dict()
    new = False
    user = None if item[0] == '#' else item
    insert_pos_total = 0
    elapsed_time = time.time()
    for t in twats:
        if item[0] == '#':
            # hashtag feeds mix users: file each twat under its author
            user = t['user'].lower()
        # BUG FIX: seed the per-user position unconditionally; the original
        # only did so for hashtags, so plain accounts hit a KeyError below.
        if user not in insert_pos:
            insert_pos[user] = 0
        if not in_twatlist(user, t):
            new = True
            if args.unshorten:
                t = unshorten_urls(t, proxies=args.proxy, shorteners=shorteners)
            add_twatlist(user, t, insert_pos[user])
            insert_pos[user] += 1
            insert_pos_total += 1
            if 'quote_tweet' in t:
                # BUG FIX: original wrote t[quote_tweet] (bare name) which
                # raised NameError whenever a quote tweet was present.
                quoted_user = t['quote_tweet']['user']
                if '@' in quoted_user:
                    _, foo, bar = quoted_user.split('@')
                    http = None if bar not in mastodon_rshttp else mastodon_rshttp[bar]
                if not os.path.isdir(paths.get_user(quoted_user)):
                    retry_makedirs(paths.get_user(quoted_user))
                fetch_profile_picture(quoted_user, args.proxy, twhttp=http,
                                      nitters=nitters, platform=platform)
            if 'user' in t:
                if '@' in t['user']:
                    _, foo, bar = t['user'].split('@')
                    http = None if bar not in mastodon_rshttp else mastodon_rshttp[bar]
                if not os.path.isdir(paths.get_user(t['user'])):
                    retry_makedirs(paths.get_user(t['user']))
                fetch_profile_picture(t['user'], args.proxy, twhttp=http,
                                      nitters=nitters, platform=platform)
            if args.mirror:
                mirror_twat(t, args=args)
            sys.stdout.write('\r[%s] %s: extracting from %d page(s): +%d twat(s)' %
                             (misc.get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time),
                              item, page, insert_pos_total))
            sys.stdout.flush()
    if new:
        if item[0] == '#':
            # a hashtag run touched many users' lists: flush each of them
            for u in insert_pos.keys():
                write_user_tweets(u)
        else:
            write_user_tweets(item)
    elapsed_time = (time.time() - elapsed_time)
    sys.stdout.write('done (%s)\n' % misc.get_timestamp("%H:%M:%S", elapsed_time))
    sys.stdout.flush()
    return http, host
def mirror_twat(twat, args=None):
    """Mirror a twat's attachments locally according to args.mirror flags:
    'c' = cards, 'f' = expanded links, 'v' = videos (via youtube-dl),
    'i' = images, 'e' = emojis.  Files land under data/ with per-user
    symlinks.  `args` must carry mirror/proxy/ytdl settings — despite the
    default, None is not actually usable here.
    """
    if 'owner' in twat:
        user = twat['owner'].lower()
    else:
        user = twat['user'].lower()
    if not os.path.isdir('data'):
        retry_makedirs('data')
    ## soupify user's text once; reused for links and emojis below
    soup = soupify(twat["text"])
    ## try to automatically mirror links posted by the user,
    ## if it matches the extension list.
    if 'c' in args.mirror and 'curl' in twat:
        url = twat['curl']
        # XXX: card:// is an unsupported nitter feature: it displays fine
        # when loading from twitter in a regular browser (probably converted
        # by some js code).  TODO: check if nitter handles card:// stuff.
        # BUG FIX: the original looped over the scheme list and used
        # 'continue', which only advanced that inner loop and never skipped
        # the mirroring below — card:// URLs were mirrored anyway.
        unsupported_schemes = ('card://',)
        if not url.startswith(unsupported_schemes):
            url_components = _split_url(url)
            url_components['filename'] = 'card.html'  # % twat['id']
            _mirror_file(url_components, user, twat['id'], args)
    if 'f' in args.mirror:
        for a in soup.body.find_all('a'):
            if 'data-expanded-url' in a.attrs:
                url_components = _split_url(a.attrs['data-expanded-url'])
                if 'filename' in url_components:
                    _mirror_file(url_components, user, twat['id'], args, content_type=True)
    ## mirror videos
    if 'v' in args.mirror and 'video' in twat:
        tid = str(twat['id'])
        url = 'https://twitter.com/%s/status/%s' % (twat['user'], tid)
        outname = paths.get_user(twat['user']) + '/%s.mp4' % tid
        if not os.path.exists('data/%s.mp4' % tid):
            # SECURITY NOTE(review): url/username come from scraped content
            # and are interpolated into a shell command — consider
            # subprocess.run([...], shell=False) to avoid shell injection.
            if args.proxy:
                os.system('%s --proxy %s -o data/%s.mp4 %s > /dev/null 2>&1' %
                          (args.ytdl, args.rawproxy, tid, url))
            else:
                os.system('%s -o data/%s.mp4 %s > /dev/null 2>&1' % (args.ytdl, tid, url))
        if not os.path.exists('%s' % outname) and os.path.exists('data/%s.mp4' % tid):
            # relative symlink from the user dir into the shared data/ store
            os.symlink('../../data/%s.mp4' % tid, outname)
    ## mirror posted pictures
    if 'images' in twat and 'i' in args.mirror:
        # direct iteration instead of the original xrange index loop
        for i in twat['images']:
            if '?format=' in i:
                # normalize 'foo?format=jpg&...' to 'foo.jpg'
                i = i.split('&')[0]
                fmt = i.split('=')[1]
                i = '%s.%s' % (i.split('?')[0], fmt)
            url_components = _split_url(i)
            if 'filename' in url_components:
                _mirror_file(url_components, user, twat['id'], args)
    ## deal with emojis
    if 'e' in args.mirror:
        for img in soup.body.find_all('img'):
            if 'class' in img.attrs and 'Emoji' in img.attrs['class']:
                src = img.attrs['src']
                # py2/py3-safe replacement for isinstance(src, unicode)
                if not isinstance(src, bytes):
                    src = src.encode('utf-8')
                split = src.split('/')
                host = split[2]
                emodir = '/'.join(split[3:len(split) - 1])
                filename = split[-1]
                uri = '%s/%s' % (emodir, filename)
                if not os.path.isdir(emodir):
                    retry_makedirs(emodir)
                if not os.path.exists('%s/%s' % (emodir, filename)):
                    http = RsHttp(host=host, port=443, timeout=30, ssl=True,
                                  keep_alive=True, follow_redirects=True,
                                  auto_set_cookies=True, proxies=args.proxy,
                                  user_agent="curl/7.60.0")
                    while not http.connect():
                        # FIXME : what should happen on connect error ?
                        pass
                    hdr, res = http.get('/%s' % uri)
                    if not isinstance(res, bytes):
                        res = res.encode('utf-8')
                    retry_write('%s/%s' % (emodir, filename), res)