def extract_twats(html, user, twats, timestamp, checkfn, nitters):
    def find_div_end(html):
        level = 0
        for i in xrange(len(html)):
            if html[i] == '<' and html[i + 1] == 'd' and html[i + 2] == 'i' and html[i + 3] == 'v':
                level += 1
            if html[i] == '<' and html[i + 1] == '/' and html[i + 2] == 'd' and html[i + 3] == 'i' and html[i + 4] == 'v':
                level -= 1
            if level == 0:
                return i + len('</div>')

    regex = re.compile(r'<div.*class.*[" ]timeline.item[" ]')
    nfetched = 0
    cursor = [a.get('href') for a in soupify(html).body.find_all('a') if a.get('href').startswith('?cursor=')]
    while 1:
        match = regex.search(html)
        if not match:
            return twats, cursor
        html = html[match.start():]
        div_end = find_div_end(html)
        slice = html[:div_end]
        html = html[div_end:]
        #twats = extract_twat(soupify(slice), twats, timestamp)
        twats = extract_twat(soupify(html), twats, timestamp, nitters)
        nfetched += 1
        # if the first two (the very first could be pinned) tweets are already known,
        # do not waste cpu processing more html
        if nfetched == 2 and checkfn and not checkfn(user, twats):
            return twats, cursor
def replace_url_in_twat(twat, args=None): user = twat['user'].lower() soup = soupify(twat["text"]) # linked files for a in soup.body.find_all('a'): ## replace /search?q= links if a.attrs['href'].startswith('/search'): twat['text'] = twat['text'].replace('/search?q=', '/index.html?search=') ## @username : replace when local elif 'title' in a.attrs: username = a.attrs['href'].split('/')[1] at_link = user_at_link(username.lower()) if username.find('@') == -1: rebuild = '<b>%s<a href="https://%s/%s">%s</a></b>' % ( at_link, random.choice(args.instances), username, username) else: _, u, h = username.split('@') rebuild = '<b>%s<a href="https://%s/@%s">%s</a></b>' % ( at_link, h, u, username) # this fails when nonascii chars are present in a['title'] # XXX: would be nice to remove that 'title' attr, which would solve the issue try: twat['text'] = twat['text'].replace(str(a), rebuild) except Exception as e: print('replace_url_in_twats: %s' % e) pass return twat['text']
def get_twats_mobile(user, proxies=None):
    host = 'mobile.twitter.com'
    http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True,
                  follow_redirects=True, auto_set_cookies=True, proxies=proxies,
                  user_agent="curl/7.60.0")
    # http.debugreq = True
    while not http.connect():
        # FIXME: what should happen on connect error?
        pass
    hdr, res = http.get("/" + user)
    twats = []
    soup = soupify(res)
    tweet_id = 0
    tweet_user = None
    tweet_time = None
    tweet_text = None
    for tbl in soup.body.find_all('table'):  # , attrs={'class':'tweet '}):
        if not "class" in tbl.attrs:
            continue
        if not "tweet" in repr(tbl.attrs["class"]):
            continue
        for td in tbl.find_all('td'):
            cls = td.attrs["class"][0]
            #print "." + repr(cls) + "."
            if cls == "user-info":
                tweet_user = td.find('div', attrs={'class': 'username'}).text.strip()
            elif cls == 'timestamp':
                a = td.find('a')
                tweet_time = a.text
                tweet_id = a.attrs["href"].rstrip("?p=p")
            elif cls == 'tweet-content':
                tweet_text = td.find('div', attrs={'class': 'tweet-text'}).text.strip()
        if tweet_user != None and tweet_id:
            twats.append({'id': tweet_id, 'user': tweet_user, 'time': tweet_time, 'text': tweet_text})
    return twats
def unshorten_urls(twat, proxies=None, shorteners={}):
    soup = soupify(twat["text"])
    for a in soup.body.find_all('a'):
        href = a.attrs['href']
        comp = _split_url(href)
        if comp['host'] in shorteners:
            try:
                twat['text'] = twat['text'].decode('utf8').replace(href, _get_real_location(href, proxies=proxies))
            except:
                pass
    return twat
def extract_twats(html, item, twats, timestamp, checkfn, nitters, blacklist, whitelist):
    def find_div_end(html):
        level = 0
        for i in xrange(len(html)):
            if html[i] == '<' and html[i + 1] == 'd' and html[i + 2] == 'i' and html[i + 3] == 'v':
                level += 1
            if html[i] == '<' and html[i + 1] == '/' and html[i + 2] == 'd' and html[i + 3] == 'i' and html[i + 4] == 'v':
                level -= 1
            if level == 0:
                return i + len('</div>')

    regex = re.compile(r'<div.*class.*[" ]timeline.item[" ]')
    nfetched = 0
    _as = '\n'.join([rs for rs in rsparse.find_all_tags(html, 'a')])
    cursor = [a.get('href') for a in soupify(_as).body.find_all('a') if a.get('href').find('cursor=') != -1]
    while 1:
        match = regex.search(html)
        if not match:
            return twats, cursor
        html = html[match.start():]
        div_end = find_div_end(html)
        slice = html[:div_end]
        html = html[div_end:]
        twats = extract_twat(soupify(html), twats, timestamp, nitters, blacklist, whitelist)
        nfetched += 1
        # if the first two (the very first could be pinned) tweets are already known,
        # do not waste cpu processing more html
        if nfetched == 2 and checkfn and not checkfn(item, twats):
            return twats, cursor
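# Illustrative sketch only (not part of the original source): one way a caller
# could page through a nitter timeline with the cursor list returned by
# extract_twats(). The helper name, the '/<user>' + '?cursor=...' page layout
# and the max_pages limit are assumptions; nitter_connect() and its
# (twhttp, host, nitters) return value are taken from fetch_profile_picture()
# below.
def fetch_timeline_sketch(user, nitters, proxies, timestamp, checkfn=None, max_pages=5):
    twats = []
    twhttp, host, nitters = nitter_connect(nitters, proxies)
    if not twhttp:
        # no instance available right now, try again later
        return twats
    path = '/%s' % user
    for _ in range(max_pages):
        hdr, res = twhttp.get(path)
        twats, cursor = extract_twats(res, user, twats, timestamp,
                                      checkfn, nitters, [], [])
        if not cursor:
            break
        # cursor entries look like '?cursor=...' and are relative to the user page
        path = '/%s%s' % (user, cursor[0])
    return twats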
def fetch_profile_picture(user, proxies, res=None, twhttp=None, nitters={}):
    pic_path = paths.get_profile_pic(user)
    if os.path.isfile(pic_path):
        return
    if not res:
        while not twhttp:
            twhttp, host, nitters = nitter_connect(nitters, proxies)
            # no avail. instance, pic will be scraped another time
            if not twhttp:
                return
        hdr, res = twhttp.get("/%s" % user)
    soup = soupify(res)
    for meta in soup.find_all('meta', attrs={'property': 'og:image'}):
        pic_url = meta.get('content') if '://' in meta.get('content') \
            else 'https://%s%s' % (get_nitter_instance(nitters, False), meta.get('content'))
        url_components = _split_url(pic_url)
        http = RsHttp(host=url_components['host'], port=url_components['port'], timeout=15,
                      ssl=url_components['ssl'], keep_alive=True, follow_redirects=True,
                      auto_set_cookies=True, proxies=proxies, user_agent="curl/7.60.0")
        # if connection fails, the profile picture will be fetched another time
        if not http.connect():
            return
        hdr, res = http.get(url_components['uri'])
        if res == '' and hdr != "":
            print('error fetching profile picture: %s' % url_components)
        else:
            res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
            retry_write(pic_path, res_bytes)
        return
    return
def get_twat_timestamp(twat_id):
    host = 'twitter.com'
    http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True,
                  follow_redirects=True, auto_set_cookies=True, user_agent="curl/7.60.0")
    while not http.connect():
        # FIXME: what should happen on connect error?
        pass
    hdr, res = http.get(twat_id)
    soup = soupify(res)
    for small in soup.body.find_all('small', attrs={'class': 'time'}):
        if small.find('a').attrs["href"] == twat_id:
            for span in small.find_all('span'):
                # only spans carrying a data-time attribute hold the timestamp
                if 'data-time' in span.attrs:
                    return int(span.attrs['data-time'])
    return 0
def replace_url_in_twat(twat, args=None): user = twat['user'].lower() soup = soupify(twat["text"]) # linked files for a in soup.body.find_all('a'): ## replace /search?q= links if a.attrs['href'].startswith('/search'): twat['text'] = twat['text'].replace('/search?q=', '/index.html?search=') ## @username : replace when local elif 'title' in a.attrs: username = a.attrs['href'].split('/')[1] at_link = user_at_link(username) rebuild = '<b>%s<a href="https://twitter.com/%s">%s</a></b>' % (at_link, username, username) # this fails when nonascii chars are present in a['title'] # XXX: would be nice to remove that 'title' attr, which would solve the issue try: twat['text'] = twat['text'].replace(str(a), rebuild) except: pass return twat['text']
def mirror_twat(twat, args=None):
    if 'owner' in twat:
        user = twat['owner'].lower()
    else:
        user = twat['user'].lower()

    if not os.path.isdir('data'):
        retry_makedirs('data')

    ## soupify user's text
    soup = soupify(twat["text"])

    ## try to automatically mirror links posted by the user,
    ## if it matches the extension list.
    if 'c' in args.mirror and 'curl' in twat:
        url = twat['curl']
        # XXX: unsupported nitter feature
        # this displays fine when loading from twitter in a regular browser,
        # which is probably converted using some js code
        # TODO: check if nitter handles card:// stuff..
        unsuported_shemes = ['card://']
        for _us in unsuported_shemes:
            if url.startswith(_us):
                continue
            url_components = _split_url(url)
            url_components['filename'] = 'card.html'  # % twat['id']
            _mirror_file(url_components, user, twat['id'], args)

    if 'f' in args.mirror:
        for a in soup.body.find_all('a'):
            if 'data-expanded-url' in a.attrs:
                url_components = _split_url(a.attrs['data-expanded-url'])
                if 'filename' in url_components:
                    _mirror_file(url_components, user, twat['id'], args, content_type=True)

    ## mirror videos
    if 'v' in args.mirror and 'video' in twat:
        tid = str(twat['id'])
        url = 'https://twitter.com/%s/status/%s' % (twat['user'], tid)
        outname = paths.get_user(twat['user']) + '/%s.mp4' % tid
        if not os.path.exists('data/%s.mp4' % tid):
            if args.proxy:
                os.system('%s --proxy %s -o data/%s.mp4 %s > /dev/null 2>&1' % (args.ytdl, args.rawproxy, tid, url))
            else:
                os.system('%s -o data/%s.mp4 %s > /dev/null 2>&1' % (args.ytdl, tid, url))
        if not os.path.exists('%s' % outname) and os.path.exists('data/%s.mp4' % tid):
            os.symlink('../../data/%s.mp4' % tid, outname)

    ## mirror posted pictures
    if 'images' in twat and 'i' in args.mirror:
        for x in xrange(0, len(twat['images'])):
            i = twat['images'][x]
            if '?format=' in i:
                i = i.split('&')[0]
                fmt = i.split('=')[1]
                i = '%s.%s' % (i.split('?')[0], fmt)
            url_components = _split_url(i)
            if 'filename' in url_components:
                _mirror_file(url_components, user, twat['id'], args)

    ## deal with emojis
    if 'e' in args.mirror:
        for img in soup.body.find_all('img'):
            if 'class' in img.attrs and 'Emoji' in img.attrs['class']:
                src = img.attrs['src']
                src = src.encode('utf-8') if isinstance(src, unicode) else src
                split = src.split('/')
                host = split[2]
                emodir = '/'.join(split[3:len(split) - 1])
                filename = split[-1]
                uri = '%s/%s' % (emodir, filename)
                if not os.path.isdir(emodir):
                    retry_makedirs(emodir)
                if not os.path.exists('%s/%s' % (emodir, filename)):
                    http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True,
                                  follow_redirects=True, auto_set_cookies=True,
                                  proxies=args.proxy, user_agent="curl/7.60.0")
                    while not http.connect():
                        # FIXME: what should happen on connect error?
                        pass
                    hdr, res = http.get('/%s' % uri)
                    res = res.encode('utf-8') if isinstance(res, unicode) else res
                    retry_write('%s/%s' % (emodir, filename), res)
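# Illustrative sketch only (not part of the original source): the same ytdl
# invocation used in mirror_twat() above, but via subprocess instead of
# os.system(), which avoids shell quoting of the URL. It reuses the args.ytdl,
# args.proxy and args.rawproxy fields referenced above; the helper name itself
# is hypothetical.
def download_video_sketch(url, tid, args):
    import subprocess
    cmd = [args.ytdl, '-o', 'data/%s.mp4' % tid, url]
    if args.proxy:
        cmd = [args.ytdl, '--proxy', args.rawproxy, '-o', 'data/%s.mp4' % tid, url]
    # silence ytdl output, mirroring the '> /dev/null 2>&1' redirection above
    with open(os.devnull, 'w') as devnull:
        return subprocess.call(cmd, stdout=devnull, stderr=devnull)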
def extract_toots(html, item, toots, timestamp, checkfn, ignore={}):
    cursor = [a.get('href') for a in soupify(html).body.find_all('a') if a.get('href').find('?max_id=') != -1]
    cursor = cursor[0] if len(cursor) else None
    quote_toot = None
    images = []
    toot = dict()
    elements = [div for div in soupify(html).body.find_all('div')
                if ('class' in div.attrs and 'status-public' in div.attrs['class'])]
    for element in elements:
        video = None
        card = None
        images = list()
        toot_text = None
        toot_boosted = False
        pinned = False
        toot_author = None
        toot_time = None
        for span in element.find_all('span'):
            if span.get_text() == 'Pinned post':
                pinned = True
                break
        infodiv = element.find('div', attrs={'class': 'status__info'})
        if infodiv is None:
            continue  # should not happen
        toot_id = infodiv.find('a', attrs={'class': 'status__relative-time'}).get('href').split('/')[4]
        # XXX some toot_id are in format dead-beef-0123
        # also, usernames could appear?
        # convert purely numeric ids to int; ids like 'dead-beef-0123' stay strings
        toot_id = int(toot_id) if toot_id.isdigit() else toot_id
        toot_time = time_to_timegm(infodiv.find('data', attrs={'class': 'dt-published'}).get('value'))
        toot_author = infodiv.find('a', attrs={'class': 'status__display-name'}).get('href').split('/')[3].lower()
        toot_displayname = infodiv.find('strong', attrs={'class': 'display-name__html'}).get_text()
        toot_account = infodiv.find('span', attrs={'class': 'display-name__account'}).contents[0].strip()
        if toot_account in ignore:
            continue
        # FIXME: toot_text has weird formatting upon scraping, but displays fine
        # once twatbot is restarted... needs investigating.
        toot_text = str(element.find('div', attrs={'class': 'e-content'}))
        toot_text = toot_text.encode('utf-8') if isinstance(toot_text, unicode) else toot_text
        #toot_avatar = infodiv.find('img', attrs={'class':'account__avatar'}).get('src')
        card = element.find('div', attrs={'data-component': 'Card'})
        if card:
            card = extract_props(card)
        video = element.find('div', attrs={'data-component': 'Video'})
        if video:
            video = extract_props(video)
            for v in video['media']:
                images.append(v['preview_url'])
        gallery = element.find('div', attrs={'data-component': 'MediaGallery'})
        if gallery:
            gallery = extract_props(gallery)
            images.append(gallery['media'][0]['url'])
        toot = {
            'owner': toot_account,
            'fetched': int(time.time()),
            'time': toot_time,
            'id': toot_id,
            'user': toot_account,
            'displayname': toot_displayname,
            'account': toot_account,
            'text': toot_text,
        }
        if item != toot_account:
            toot['rid'] = toot_id
        if pinned:
            toot['pinned'] = 1
        if len(images):
            toot['images'] = images
        if video:
            toot['video'] = 1
        if card:
            toot['curl'] = card['card']['url']
            toot['ctitle'] = card['card']['title']
            toot['cdesc'] = card['card']['description']
        toots.append(toot)
        # print(toot)
    return toots, cursor