def load_watchlist():
    """Re-read the watchlist file and refresh the module-level watch state.

    Every non-empty line of ``args.watchlist`` is one entry, compared
    lower-cased. A leading ``;`` marks the entry as disabled (it is recorded
    in ``disabled_users`` but still kept in the watchlist), a leading ``#``
    marks a keyword instead of an account.

    Side effects:
      * ``disabled_users`` gains the disabled entries (entries are only ever
        added here, never removed).
      * accounts without a user json are queued in ``new_accounts`` and get
        their user directory created.
      * when the set of entries changed since the last call, ``watchlist``
        and ``wl_hash`` are replaced and ``json_loads()`` is called.
      * when keywords are in use, every directory below ``users`` is handed
        to ``load_user_json``.
    """
    global watchlist, wl_hash, has_keywords
    wl = []
    ## 'with' guarantees the handle is closed, even if a path helper raises
    with open(args.watchlist, 'r') as fp:
        for line in fp:
            entry = line.rstrip().lower()
            disabled = entry.startswith(';')
            username = entry[1:] if disabled else entry
            ## blank lines (or a lone ';') carry no name; indexing
            ## username[0] below would raise IndexError on them
            if not username:
                continue
            if disabled:
                disabled_users[username] = True
            if username[0] == '#':
                has_keywords = True
            elif not os.path.exists(paths.get_user_json(username)):
                ## the watchlist is reloaded repeatedly; do not queue the
                ## same account again while its json is still missing
                if username not in new_accounts:
                    new_accounts.append(username)
                if not os.path.exists(paths.get_user(username)):
                    retry_makedirs(paths.get_user(username))
            wl.append(username)
    ## join with a separator so that ['ab', 'c'] and ['a', 'bc'] do not
    ## produce the same digest
    newhash = hashlib.md5('\n'.join(wl)).hexdigest()
    if newhash != wl_hash:
        print('reloading watchlist')
        wl_hash = newhash
        watchlist = wl
        json_loads()
    if has_keywords and os.path.exists('users'):
        for name in os.listdir('users'):
            if os.path.isdir(os.path.join('users', name)):
                load_user_json(name)
def load_watchlist():
    """Re-read the watchlist file and refresh the module-level watch state.

    Every non-empty line of ``args.watchlist`` is one account name. A
    leading ``;`` marks the account as disabled: it is recorded in
    ``disabled_users`` but still kept in the watchlist.

    Side effects:
      * ``disabled_users`` gains the disabled accounts (entries are only
        ever added here, never removed).
      * accounts without a user json are queued in ``new_accounts`` and get
        their user directory created.
      * when the set of entries changed since the last call, ``watchlist``
        and ``wl_hash`` are replaced and ``json_loads()`` is called.
    """
    global watchlist, wl_hash
    wl = []
    ## 'with' guarantees the handle is closed, even if a path helper raises
    with open(args.watchlist, 'r') as fp:
        for line in fp:
            entry = line.rstrip()
            disabled = entry.startswith(';')
            username = entry[1:] if disabled else entry
            ## a blank line (or a lone ';') is not an account; without this
            ## guard an empty name would be registered and scraped
            if not username:
                continue
            if disabled:
                disabled_users[username] = True
            if not os.path.exists(paths.get_user_json(username)):
                ## the watchlist is reloaded repeatedly; do not queue the
                ## same account again while its json is still missing
                if username not in new_accounts:
                    new_accounts.append(username)
                if not os.path.exists(paths.get_user(username)):
                    retry_makedirs(paths.get_user(username))
            wl.append(username)
    ## join with a separator so that ['ab', 'c'] and ['a', 'bc'] do not
    ## produce the same digest
    newhash = hashlib.md5('\n'.join(wl)).hexdigest()
    if newhash != wl_hash:
        print('reloading watchlist')
        wl_hash = newhash
        watchlist = wl
        json_loads()
def scrape(user, http, host):
    """Fetch the tweets of ``user`` and store the ones not seen before.

    Accounts listed in ``new_accounts`` are fetched with ``args.count`` and
    no stop callback (and are removed from that list); known accounts are
    fetched with ``count=-1`` and ``fetch_more_tweets_callback``.

    For every tweet that is not yet in the user's twatlist the tweet is
    optionally unshortened, inserted at the running insert position, the
    profile pictures of the quoted user and of the tweet's user are fetched,
    and the tweet is mirrored when ``args.mirror`` is set. The user's tweets
    are written out once at the end if anything new was found.

    :param user: account name to scrape
    :param http: connection object handed through to ``get_twats``
    :param host: host handed through to ``get_twats``
    :return: the ``(http, host)`` pair as returned by ``get_twats``
    """
    global nitters
    if user in new_accounts:
        count = args.count
        checkfn = None
        new_accounts.remove(user)
    else:
        checkfn = fetch_more_tweets_callback
        count = -1

    ## holds the start time first, the elapsed duration at the very end
    elapsed_time = time.time()
    insert_pos = 0
    sys.stdout.write('\r[%s] scraping %s... ' % (get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), user))
    sys.stdout.flush()

    twats, nitters, host, http = get_twats(user, proxies=args.proxy, count=count, http=http,
                                           checkfn=checkfn, nitters=nitters, host=host)

    new = False
    for t in twats:
        if in_twatlist(user, t):
            continue
        new = True
        if args.unshorten:
            t = unshorten_urls(t, proxies=args.proxy, shorteners=shorteners)
        add_twatlist(user, t, insert_pos)
        insert_pos += 1

        ## quoted user first, then the tweet's own user (original order).
        ## The key is the string 'quote_tweet'; the bare name used before
        ## raised NameError on every quoting tweet.
        picture_owners = []
        if 'quote_tweet' in t:
            picture_owners.append(t['quote_tweet']['user'])
        if 'user' in t:
            picture_owners.append(t['user'])
        for owner in picture_owners:
            if not os.path.isdir(paths.get_user(owner)):
                retry_makedirs(paths.get_user(owner))
            fetch_profile_picture(owner, args.proxy, twhttp=nitter_rshttp, nitters=nitters)

        if args.mirror:
            mirror_twat(t, args=args)
        sys.stdout.write('\r[%s] scraping %s... +%d ' % (get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), user, insert_pos))
        sys.stdout.flush()

    if new:
        write_user_tweets(user)
    elapsed_time = (time.time() - elapsed_time)
    sys.stdout.write('done (%s)\n' % get_timestamp("%H:%M:%S", elapsed_time))
    sys.stdout.flush()
    return http, host
def htmlize_twat(twat, variables, quoted=False):
    """Render one twat dict as an HTML ``twat-container`` fragment.

    The fragment consists of the profile picture(s), a title line (owner
    link, retweet note, optional icon bar, time), the text, an optional
    card iframe, the images and - recursively - a quoted twat.

    NOTE: ``twat['text']`` is rewritten in place (URL replacement and, with
    ``args.nohtml``, tag stripping), so the caller's dict is modified.

    :param twat: dict with at least 'owner', 'user', 'id', 'text', 'time';
                 optional keys read here: 'rid', 'curl', 'images', 'video',
                 'quote'
    :param variables: passed through to ``build_iconbar`` and the recursion
    :param quoted: True when rendering a quoted twat; passed to
                   ``build_iconbar``
    :return: the HTML fragment as a string
    """
    tw = '<div class="twat-container">'
    tweet_pic = None
    retweet_pic = None

    if 'rid' not in twat:
        ## plain tweet: only the owner's picture is shown
        retweet_str = ""
        if paths.has_profile_pic(twat['owner']):
            tweet_pic = paths.get_profile_pic(twat['owner'])
    else:
        ## retweet: picture of the original author plus the retweeter's
        if paths.has_profile_pic(twat['user']):
            tweet_pic = paths.get_profile_pic(twat['user'])
        else:
            tweet_pic = ""
        if paths.has_profile_pic(twat['owner']):
            retweet_pic = paths.get_profile_pic(twat['owner'])
        retweet_str = " (RT %s<a target='_blank' href='https://twitter.com/%s/status/%s'>%s</a>)" % \
            (user_at_link(twat['user']), twat['user'], twat['id'], twat['user'])

    if tweet_pic:
        tw += '<div class="profile_picture"><img width="100%%" height="100%%" src="%s"></div>' % tweet_pic
    if retweet_pic:
        tw += '<div class="profile_picture_retweet"><img width="100%%" height="100%%" src="%s"></div>' % retweet_pic

    user_str = user_at_link(twat["owner"])
    user_str += "<a target='_blank' href='https://twitter.com/%s/status/%s'>%s</a>%s" % \
        (twat["owner"], get_effective_twat_id(twat), twat["owner"], retweet_str)

    tw += '\n<div class="twat-title">'
    ## add icon bar
    if args.iconbar:
        tw += build_iconbar(twat, variables, quoted)
    ## a time of 0 is what the quoted pseudo twat below carries
    time_str = 'unknown' if twat["time"] == 0 else format_time(twat["time"])
    tw += '%s - %s' % (user_str, time_str)
    tw += '\n</div>\n'

    ## replace urls in twats
    twat['text'] = replace_url_in_twat(twat, args=args)
    ## strip html ?
    if args.nohtml:
        twat['text'] = strip_tags(twat['text'])
    tw += '<p class="twat-text">%s</p>\n' % (replace_twat_text(twat['text']))

    if 'curl' in twat and args.iframe > 0:
        user = twat['user'].lower()
        ifu = paths.get_user(user) + '/%s-%s' % (twat['id'], "card.html")
        ## args.mirror may be unset when mirroring is off; guard the
        ## membership test so it cannot raise TypeError on None
        cards_mirrored = bool(args.mirror) and 'c' in args.mirror
        if not cards_mirrored or not file_exists(ifu):
            ifu = twat['curl']
        tw += '<span class="twat-iframe"><iframe src="%s"></iframe></span>\n' % ifu

    if 'images' in twat:
        tw += '<p class="twat-image">'
        ## several images share the row; integer percent, minus one for
        ## spacing (explicit floor division keeps the Python 2 result)
        if len(twat['images']) > 1:
            wdth = (100 // len(twat['images'])) - 1
        else:
            wdth = 100

        for i in twat['images']:
            if args.images <= 0:
                ## images disabled: emit a plain link
                tw += '<a href="%s">%s</a>' % (i, i)
                continue

            ## prefer the mirrored copy, fall back to the remote url
            img_path = paths.get_user(twat['user']) + "/%s-%s" % (twat['id'], i.split('/')[-1])
            if not file_exists(img_path):
                img_path = i

            span_or_div = "span"
            img_class = "img"
            div_class = ""
            if args.upstream_img:
                href = i
                title = "view remote image"
            elif 'video' in twat or 'ext_tw_video_thumb' in i:
                mp4_path = paths.get_user(twat['user']) + '/%s.mp4' % str(twat['id'])
                if os.path.exists(mp4_path):
                    href = mp4_path
                    title = "view local video"
                else:
                    ## format instead of '+' so a non-string id cannot
                    ## raise TypeError (mp4_path above already uses str())
                    href = "https://twitter.com/i/status/%s" % twat['id']
                    title = "view remote video"
                img_class = ""
                div_class = "video-thumbnail"
                span_or_div = "div"
            else:
                href = img_path
                title = "view local image"
            tw += '<a href="%s" title="%s"><%s class="%s"><img class="%s" src="%s" width="%d%%"></%s></a>' % (
                href, title, span_or_div, div_class, img_class, img_path, wdth, span_or_div)
        tw += '</p>\n'

    if 'quote' in twat:
        ## render the quoted tweet through the same code path
        pseudo_twat = {
            'user': twat['quote']['user'],
            'owner': twat['quote']['user'],
            'id': twat['quote']['id'],
            'text': twat['quote']['text'],
            'time': 0
        }
        tw += htmlize_twat(pseudo_twat, variables, quoted=True)

    tw += '</div>\n'
    return tw
def scrape(item, http, host, search, user_agent):
    """Fetch the posts for one watchlist ``item`` and store the new ones.

    ``item`` is lower-cased first. Items containing fewer than two ``@`` are
    fetched through ``get_twats`` (platform 'twitter'), all others through
    ``get_toots`` (platform 'mastodon'), whose connection is remembered in
    ``mastodon_rshttp[host]``. A leading ``#`` marks a keyword item: its
    results are stored under each post's own (lower-cased) user instead of
    under ``item``.

    For every post not yet in the twatlist the post is optionally
    unshortened, inserted, the profile pictures of the quoted user and of
    the post's user are fetched, and the post is mirrored when
    ``args.mirror`` is set. Affected users are written out at the end.

    :param item: account, ``@user@instance`` or ``#keyword``
    :param http: connection object handed to the fetch function
    :param host: host handed to ``get_twats`` / key for ``mastodon_rshttp``
    :param search: passed through to ``get_twats``
    :param user_agent: passed through to the fetch function
    :return: ``(http, host)`` as returned by the fetch function
    """
    global nitters
    global mastodon_rshttp

    item = item.lower()
    if item in new_accounts:
        count = args.count
        checkfn = None
        new_accounts.remove(item)
    else:
        checkfn = fetch_more_tweets_callback
        count = args.count if item[0] == '#' else -1

    if item.count('@') < 2:
        platform = 'twitter'
        twats, nitters, host, http, page = get_twats(item, proxies=args.proxy, count=count, http=http,
                                                     checkfn=checkfn, nitters=nitters, host=host,
                                                     search=search, user_agent=user_agent,
                                                     blacklist=blacklist, whitelist=whitelist)
    else:
        platform = 'mastodon'
        twats, http = get_toots(item, proxies=args.proxy, count=count, http=http, checkfn=checkfn,
                                user_agent=user_agent, blacklist=args.blacklist,
                                whitelist=args.whitelist)
        mastodon_rshttp[host] = http
        ## get_toots returns no page count; without a value the progress
        ## line below raised NameError. 1 is a placeholder for display only.
        page = 1

    def _picture_http(username):
        ## connection used to fetch a profile picture: the remembered
        ## connection of the user's instance for '...@instance' names (None
        ## if there is none), otherwise the connection of this scrape
        if '@' in username:
            instance = username.rsplit('@', 1)[-1]
            return mastodon_rshttp[instance] if instance in mastodon_rshttp else None
        return http

    insert_pos = dict()
    new = False
    user = None if item[0] == '#' else item
    insert_pos_total = 0
    elapsed_time = time.time()

    for t in twats:
        if item[0] == '#':
            user = t['user'].lower()
        if user not in insert_pos:
            insert_pos[user] = 0
        if in_twatlist(user, t):
            continue

        new = True
        if args.unshorten:
            t = unshorten_urls(t, proxies=args.proxy, shorteners=shorteners)
        add_twatlist(user, t, insert_pos[user])
        insert_pos[user] += 1
        insert_pos_total += 1

        ## quoted user first, then the post's own user (original order).
        ## The key is the string 'quote_tweet'; the bare name used before
        ## raised NameError. The per-picture connection is kept in its own
        ## variable so the 'http' returned to the caller is not clobbered.
        picture_owners = []
        if 'quote_tweet' in t:
            picture_owners.append(t['quote_tweet']['user'])
        if 'user' in t:
            picture_owners.append(t['user'])
        for owner in picture_owners:
            if not os.path.isdir(paths.get_user(owner)):
                retry_makedirs(paths.get_user(owner))
            fetch_profile_picture(owner, args.proxy, twhttp=_picture_http(owner),
                                  nitters=nitters, platform=platform)

        if args.mirror:
            mirror_twat(t, args=args)
        sys.stdout.write(
            '\r[%s] %s: extracting from %d page(s): +%d twat(s)' %
            (misc.get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), item, page, insert_pos_total))
        sys.stdout.flush()

    if new:
        if item[0] == '#':
            ## keyword results are spread over several users
            for user in insert_pos.keys():
                write_user_tweets(user)
        else:
            write_user_tweets(item)

    elapsed_time = (time.time() - elapsed_time)
    sys.stdout.write('done (%s)\n' % misc.get_timestamp("%H:%M:%S", elapsed_time))
    sys.stdout.flush()
    return http, host
def _mirror_file(url_components, user, tid, args=None, content_type=None, force=False):
    """Download one remote file and link it into the user's directory.

    The payload is stored once as ``data/<hash>.<ext>`` and symlinked as
    ``<user dir>/<tid>-<filename>``. Nothing happens when that link target
    already exists, unless ``force`` is set.

    With ``content_type`` set, a HEAD request is made first and the file is
    skipped when: ``args.mirror_size`` is set and the Content-Length is
    missing, unparseable or larger than it; no Content-Type is sent; the
    type matches none of the ``args.ext`` filters (if any); or the type is
    html.

    :param url_components: dict with 'host', 'ssl', 'port', 'uri', 'filename'
    :param user: user whose directory receives the symlink
    :param tid: id used as prefix of the link name
    :param args: parsed options; ``proxy`` is always read, ``ext`` and
                 ``mirror_size`` only with ``content_type``
    :param content_type: when true, inspect the HEAD response before fetching
    :param force: mirror again even if the link target already exists
    :return: always None
    """
    outname = paths.get_user(user) + '/%s-%s' % (tid, url_components['filename'])
    if not force and os.path.exists(outname):
        return

    def _new_connection():
        ## every request below uses the same connection parameters
        return RsHttp(url_components['host'], ssl=url_components['ssl'],
                      port=url_components['port'], keep_alive=True,
                      follow_redirects=True, auto_set_cookies=True,
                      proxies=args.proxy, user_agent="curl/7.60.0")

    def _header_values(headers, name):
        ## second ':'-separated field of every header line starting with name
        prefix = name.lower() + ':'
        return [str(line.split(':')[1]).strip()
                for line in headers.split('\n')
                if line.lower().startswith(prefix)]

    http = _new_connection()
    ## do nothing if we cannot connect
    if not http.connect():
        return None

    ext = url_components['filename'].split('.')[-1]

    if content_type:
        filtre = str(args.ext).split(',') if args.ext else []
        hdr = http.head(url_components['uri'])

        ## max mirror size
        if args.mirror_size:
            sizes = _header_values(hdr, 'content-length')
            if not sizes:
                return
            try:
                size = int(sizes[0])
            except ValueError:
                ## the value comes from the remote server; treat garbage
                ## like a missing header instead of aborting the run
                return
            if size > args.mirror_size:
                return

        value = _header_values(hdr, 'content-type')
        ## server does not provide Content-Type info
        if not value:
            return
        ## content type contains ';' (usually when html)
        mime = value[0].split(';')[0] if ';' in value[0] else value[0]
        value = mime.split('/')

        ## when filtering extensions (--ext)
        ## if unset, everything is mirrored
        if filtre:
            ## values don't match anything
            if len(value) < 2 or (value[0] not in filtre and value[1] not in filtre):
                return

        # XXX : mirror html files
        ## we actually don't save html files
        ## what about making automated saves
        ## through the wayback machine ?
        if 'html' in value:
            return

        ## previous http object cannot be re-used
        http = _new_connection()
        ## do nothing if we cannot connect
        if not http.connect():
            return

    extras = []
    if url_components['filename'] == 'card.html' and 'twitter.com' in url_components['host']:
        extras.append("Referer: https://twitter.com/")

    hdr, res = http.get(url_components['uri'], extras=extras)
    if res == '' and hdr != "":
        # print http error code when things go wrong
        print("%s%s : %s" % (url_components['host'], url_components['uri'], hdr.split('\n')[0]))
        return

    res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
    filehash = _hash(res_bytes)
    out_fn = 'data/%s.%s' % (filehash, ext)
    ## content addressed: identical payloads share one file in data/
    if not os.path.exists(out_fn):
        retry_write(out_fn, res_bytes)

    ## lexists also sees a dangling link left by an earlier run
    if os.path.lexists(outname):
        os.unlink(outname)
    os.symlink('../../data/%s.%s' % (filehash, ext), outname)
def _ytdl_fetch(args, url, dest):
    """Run the configured downloader (``args.ytdl``) to save ``url`` as ``dest``."""
    import shlex
    import subprocess

    ## argument list instead of a shell string: url and dest are built from
    ## scraped values and must never be interpreted by a shell
    cmd = shlex.split(args.ytdl)
    if args.proxy:
        cmd += ['--proxy', args.rawproxy]
    cmd += ['-o', dest, url]
    try:
        with open(os.devnull, 'w') as devnull:
            subprocess.call(cmd, stdout=devnull, stderr=devnull)
    except OSError:
        ## downloader missing or not executable: the os.system() call used
        ## before failed silently as well, keep mirroring best-effort
        pass


def mirror_twat(twat, args=None):
    """Mirror the resources referenced by one twat, as selected by ``args.mirror``.

    ``args.mirror`` is a string of flags:
      * ``c`` - the card url (``twat['curl']``), saved as ``card.html``
      * ``f`` - links in the text carrying a ``data-expanded-url`` attribute
      * ``v`` - the video, fetched with the external downloader
      * ``i`` - the posted images
      * ``e`` - emoji images referenced in the text

    Files are stored for ``twat['owner']`` if present, else ``twat['user']``
    (lower-cased); the video link uses ``twat['user']`` as is.

    :param twat: twat dict; 'text', 'id' and 'user' are read, 'owner',
                 'curl', 'video' and 'images' are optional
    :param args: parsed options (``mirror``, ``proxy``, ``rawproxy``, ``ytdl``)
    """
    if 'owner' in twat:
        user = twat['owner'].lower()
    else:
        user = twat['user'].lower()

    if not os.path.isdir('data'):
        retry_makedirs('data')

    ## soupify user's text
    soup = soupify(twat["text"])

    ## try to automatically mirror links posted by the user,
    ## if it matches the extension list.
    if 'c' in args.mirror and 'curl' in twat:
        url = twat['curl']
        # XXX: unsupported nitter feature
        # this displays fine when loading from twitter in a regular browser,
        # which is probably converted using some js code
        # TODO: check if nitter handles card:// stuff..
        unsupported_schemes = ('card://',)
        ## the former 'continue' inside a loop over the schemes only
        ## advanced that loop, so unsupported urls were mirrored anyway
        if not url.startswith(unsupported_schemes):
            url_components = _split_url(url)
            url_components['filename'] = 'card.html'
            _mirror_file(url_components, user, twat['id'], args)

    if 'f' in args.mirror:
        for a in soup.body.find_all('a'):
            if 'data-expanded-url' in a.attrs:
                url_components = _split_url(a.attrs['data-expanded-url'])
                if 'filename' in url_components:
                    _mirror_file(url_components, user, twat['id'], args, content_type=True)

    ## mirror videos
    if 'v' in args.mirror and 'video' in twat:
        tid = str(twat['id'])
        url = 'https://twitter.com/%s/status/%s' % (twat['user'], tid)
        outname = paths.get_user(twat['user']) + '/%s.mp4' % tid
        datafile = 'data/%s.mp4' % tid
        if not os.path.exists(datafile):
            _ytdl_fetch(args, url, datafile)
        ## link only if the download actually produced a file
        if not os.path.exists(outname) and os.path.exists(datafile):
            os.symlink('../../data/%s.mp4' % tid, outname)

    ## mirror posted pictures
    if 'images' in twat and 'i' in args.mirror:
        for i in twat['images']:
            if '?format=' in i:
                ## '<base>?format=<fmt>&...' becomes '<base>.<fmt>'
                i = i.split('&')[0]
                fmt = i.split('=')[1]
                i = '%s.%s' % (i.split('?')[0], fmt)
            url_components = _split_url(i)
            if 'filename' in url_components:
                _mirror_file(url_components, user, twat['id'], args)

    ## deal with emojis
    if 'e' in args.mirror:
        for img in soup.body.find_all('img'):
            if 'class' in img.attrs and 'Emoji' in img.attrs['class']:
                src = img.attrs['src']
                src = src.encode('utf-8') if isinstance(src, unicode) else src
                ## 'scheme://host/dir.../file': split[2] is the host, the
                ## remaining path is reused as the local directory layout
                split = src.split('/')
                host = split[2]
                emodir = '/'.join(split[3:-1])
                filename = split[-1]
                uri = '%s/%s' % (emodir, filename)

                if not os.path.isdir(emodir):
                    retry_makedirs(emodir)

                if not os.path.exists('%s/%s' % (emodir, filename)):
                    http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True,
                                  follow_redirects=True, auto_set_cookies=True,
                                  proxies=args.proxy, user_agent="curl/7.60.0")
                    while not http.connect():
                        # FIXME : what should happen on connect error ?
                        # NOTE(review): this retries forever while the host
                        # is unreachable
                        pass
                    hdr, res = http.get('/%s' % uri)
                    res = res.encode('utf-8') if isinstance(res, unicode) else res
                    retry_write('%s/%s' % (emodir, filename), res)