def fetch_profile_picture(user, proxies, res=None, twhttp=None, nitters={}):
    pic_path = paths.get_profile_pic(user)
    if os.path.isfile(pic_path):
        return

    if not res:
        while not twhttp:
            twhttp, host, nitters = nitter_connect(nitters, proxies)
            ## no available instance: the picture will be fetched another time
            if not twhttp:
                return
        hdr, res = twhttp.get("/%s" % user)

    soup = soupify(res)
    for meta in soup.find_all('meta', attrs={'property': 'og:image'}):
        content = meta.get('content')
        ## relative URLs are resolved against a nitter instance
        pic_url = content if '://' in content else \
            'https://%s%s' % (get_nitter_instance(nitters, False), content)
        url_components = _split_url(pic_url)
        http = RsHttp(host=url_components['host'], port=url_components['port'],
                      timeout=15, ssl=url_components['ssl'], keep_alive=True,
                      follow_redirects=True, auto_set_cookies=True,
                      proxies=proxies, user_agent="curl/7.60.0")
        ## if the connection fails, the profile picture
        ## will be fetched another time
        if not http.connect():
            return
        hdr, res = http.get(url_components['uri'])
        if res == '' and hdr != "":
            print('error fetching profile picture: %s' % url_components)
        else:
            res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
            retry_write(pic_path, res_bytes)
        ## only the first og:image meta tag is used
        return
    return
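## The functions in this module consume the dict returned by _split_url()
## through the keys 'host', 'port', 'ssl', 'uri' and 'filename'. As a rough
## sketch (inferred from those accesses, not the module's actual helper,
## which apparently omits 'filename' for some URLs, since callers test for
## it), _split_url() behaves approximately like this:
def _split_url_sketch(url):
    ## local import keeps the sketch self-contained (Python 2 stdlib)
    from urlparse import urlparse
    p = urlparse(url)
    ssl = p.scheme == 'https'
    return {
        'host': p.hostname,
        'port': p.port or (443 if ssl else 80),
        'ssl': ssl,
        'uri': p.path + ('?%s' % p.query if p.query else ''),
        'filename': p.path.split('/')[-1],
    }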
def _mirror_file(url_components, user, tid, args=None, content_type=None, force=False):
    outname = paths.get_user(user) + '/%s-%s' % (tid, url_components['filename'])
    if not force and os.path.exists(outname):
        return

    http = RsHttp(url_components['host'], ssl=url_components['ssl'],
                  port=url_components['port'], keep_alive=True,
                  follow_redirects=True, auto_set_cookies=True,
                  proxies=args.proxy, user_agent="curl/7.60.0")
    ## do nothing if we cannot connect
    if not http.connect():
        return

    ext = url_components['filename'].split('.')[-1]

    if content_type:
        if args.ext:
            filtre = str(args.ext).split(',')
        else:
            filtre = []

        hdr = http.head(url_components['uri'])

        ## enforce the maximum mirror size (--mirror-size)
        if args.mirror_size:
            ## extract the value of the Content-Length: header
            value = [
                str(i.split(':')[1]).strip() for i in hdr.split('\n')
                if i.lower().startswith('content-length:')
            ]
            if not len(value) or int(value[0]) > args.mirror_size:
                return

        ## extract the value of the Content-Type: header
        value = [
            str(i.split(':')[1]).strip() for i in hdr.split('\n')
            if i.lower().startswith('content-type:')
        ]
        ## server does not provide Content-Type info
        if not len(value):
            return
        ## content type may carry parameters after ';' (usually when html)
        elif ';' in value[0]:
            value[0] = value[0].split(';')[0]
        value = value[0].split('/')

        ## when filtering content types (--ext);
        ## if unset, everything is mirrored
        if len(filtre):
            ## neither half of the content type matches the filter list
            if len(value) < 2 or (not value[0] in filtre and not value[1] in filtre):
                return

        ## XXX: html files are not saved for now;
        ## what about automatically saving them
        ## through the wayback machine?
        if 'html' in value:
            return

        ## the previous http object cannot be re-used after a HEAD request
        http = RsHttp(url_components['host'], ssl=url_components['ssl'],
                      port=url_components['port'], keep_alive=True,
                      follow_redirects=True, auto_set_cookies=True,
                      proxies=args.proxy, user_agent="curl/7.60.0")
        ## do nothing if we cannot connect
        if not http.connect():
            return

    extras = []
    if url_components['filename'] == 'card.html' and 'twitter.com' in url_components['host']:
        extras.append("Referer: https://twitter.com/")

    hdr, res = http.get(url_components['uri'], extras=extras)
    if res == '' and hdr != "":
        ## print the http error code when things go wrong
        print "%s%s : %s" % (url_components['host'], url_components['uri'], hdr.split('\n')[0])
        return

    res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
    filehash = _hash(res_bytes)
    out_fn = 'data/%s.%s' % (filehash, ext)
    if not os.path.exists(out_fn):
        retry_write(out_fn, res_bytes)

    ## store the file once under data/ and symlink it from the user directory
    if os.path.lexists(outname):
        os.unlink(outname)
    os.symlink('../../data/%s.%s' % (filehash, ext), outname)
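## Example (sketch): driving _mirror_file() by hand. The _Args class below is
## hypothetical; it only carries the attributes _mirror_file() actually reads
## (proxy, ext, mirror_size). Note that --ext values are matched against the
## two halves of the Content-Type header (e.g. 'image' and 'jpeg'), not the
## file extension, and the user directory is assumed to already exist.
def _mirror_file_example():
    class _Args(object):
        proxy = None            # no proxy
        ext = 'image,video'     # keep anything with an image/* or video/* type
        mirror_size = 0         # no size limit
    comp = _split_url('https://pbs.twimg.com/media/example.jpg')
    _mirror_file(comp, 'someuser', '1234567890', args=_Args(), content_type=True)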
def mirror_twat(twat, args=None):
    if 'owner' in twat:
        user = twat['owner'].lower()
    else:
        user = twat['user'].lower()

    if not os.path.isdir('data'):
        retry_makedirs('data')

    ## soupify the twat's text
    soup = soupify(twat["text"])

    ## try to automatically mirror cards posted by the user
    if 'c' in args.mirror and 'curl' in twat:
        url = twat['curl']
        ## XXX: unsupported nitter feature
        ## this displays fine when loading from twitter in a regular browser,
        ## which probably converts it using some js code
        ## TODO: check if nitter handles card:// stuff..
        unsupported_schemes = ['card://']
        if not any(url.startswith(s) for s in unsupported_schemes):
            url_components = _split_url(url)
            url_components['filename'] = 'card.html'
            _mirror_file(url_components, user, twat['id'], args)

    ## mirror links posted by the user, if they match the extension list
    if 'f' in args.mirror:
        for a in soup.body.find_all('a'):
            if 'data-expanded-url' in a.attrs:
                url_components = _split_url(a.attrs['data-expanded-url'])
                if 'filename' in url_components:
                    _mirror_file(url_components, user, twat['id'], args,
                                 content_type=True)

    ## mirror videos
    if 'v' in args.mirror and 'video' in twat:
        tid = str(twat['id'])
        url = 'https://twitter.com/%s/status/%s' % (twat['user'], tid)
        outname = paths.get_user(twat['user']) + '/%s.mp4' % tid
        if not os.path.exists('data/%s.mp4' % tid):
            if args.proxy:
                os.system('%s --proxy %s -o data/%s.mp4 %s > /dev/null 2>&1'
                          % (args.ytdl, args.rawproxy, tid, url))
            else:
                os.system('%s -o data/%s.mp4 %s > /dev/null 2>&1'
                          % (args.ytdl, tid, url))
        if not os.path.exists(outname) and os.path.exists('data/%s.mp4' % tid):
            os.symlink('../../data/%s.mp4' % tid, outname)

    ## mirror posted pictures
    if 'images' in twat and 'i' in args.mirror:
        for i in twat['images']:
            if '?format=' in i:
                ## rewrite 'pic?format=jpg&name=small' into 'pic.jpg'
                i = i.split('&')[0]
                fmt = i.split('=')[1]
                i = '%s.%s' % (i.split('?')[0], fmt)
            url_components = _split_url(i)
            if 'filename' in url_components:
                _mirror_file(url_components, user, twat['id'], args)

    ## deal with emojis
    if 'e' in args.mirror:
        for img in soup.body.find_all('img'):
            if 'class' in img.attrs and 'Emoji' in img.attrs['class']:
                src = img.attrs['src']
                src = src.encode('utf-8') if isinstance(src, unicode) else src
                split = src.split('/')
                host = split[2]
                emodir = '/'.join(split[3:len(split) - 1])
                filename = split[-1]
                uri = '%s/%s' % (emodir, filename)
                if not os.path.isdir(emodir):
                    retry_makedirs(emodir)
                if not os.path.exists('%s/%s' % (emodir, filename)):
                    http = RsHttp(host=host, port=443, timeout=30, ssl=True,
                                  keep_alive=True, follow_redirects=True,
                                  auto_set_cookies=True, proxies=args.proxy,
                                  user_agent="curl/7.60.0")
                    while not http.connect():
                        ## FIXME: what should happen on connect error?
                        pass
                    hdr, res = http.get('/%s' % uri)
                    res = res.encode('utf-8') if isinstance(res, unicode) else res
                    retry_write('%s/%s' % (emodir, filename), res)
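## Example usage (sketch): the twat dict shape and args attributes below are
## inferred from the accesses in mirror_twat() and are illustrative, not a
## spec. 'mirror' selects what gets mirrored: c=cards, f=links, v=videos,
## i=images, e=emojis.
if __name__ == '__main__':
    class _Args(object):
        mirror = 'ive'          # images, videos, emojis
        proxy = None
        rawproxy = None
        ext = None              # no content-type filter
        mirror_size = 0         # no size limit
        ytdl = 'youtube-dl'     # assumed video downloader binary
    sample_twat = {
        'user': 'someuser',
        'id': 1234567890,
        'text': '<html><body><p>hello world</p></body></html>',
        'images': [],
    }
    mirror_twat(sample_twat, args=_Args())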