Example #1
def load_watchlist():
    global watchlist, wl_hash, has_keywords
    wl = []
    for x in open(args.watchlist, 'r').readlines():
        x = x.rstrip().lower()
        if not x: continue  # skip empty lines
        if x.startswith(';'):
            username = x[1:]
            disabled_users[username] = True
        else:
            username = x
            if username[0] == '#' and not has_keywords:
                has_keywords = True
        if not username[0] == '#' and not os.path.exists(
                paths.get_user_json(username)):
            new_accounts.append(username)
            if not os.path.exists(paths.get_user(username)):
                retry_makedirs(paths.get_user(username))
        wl.append(username)
    newhash = hashlib.md5(''.join(wl)).hexdigest()
    if newhash != wl_hash:
        print('reloading watchlist')
        wl_hash = newhash
        watchlist = wl
        json_loads()

    if has_keywords and os.path.exists('users'):
        for file in os.listdir('users'):
            d = os.path.join('users', file)
            if os.path.isdir(d): load_user_json(file)
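For reference, load_watchlist() expects args.watchlist to point at a plain-text file with one entry per line: a leading ';' disables an account and a leading '#' marks a keyword to track. A minimal file might look like this (the entries are hypothetical):

some_account
;some_disabled_account
#some_keyword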
Example #2
def load_watchlist():
	global watchlist, wl_hash
	wl = []
	for x in open(args.watchlist, 'r').readlines():
		x = x.rstrip()
		if not x: continue  # skip empty lines
		if x.startswith(';'):
			username = x[1:]
			disabled_users[username] = True
		else:
			username = x
		if not os.path.exists(paths.get_user_json(username)):
			new_accounts.append(username)
			if not os.path.exists(paths.get_user(username)):
				retry_makedirs(paths.get_user(username))
		wl.append(username)
	newhash = hashlib.md5(''.join(wl)).hexdigest()
	if newhash != wl_hash:
		print('reloading watchlist')
		wl_hash = newhash
		watchlist = wl
		json_loads()
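The md5 over the joined entries is only a cheap change detector: if the digest differs from the previous one, the watchlist is reloaded. A standalone illustration of the idea, written to run on both Python 2 and 3 (the entries are hypothetical):

import hashlib

old_hash = hashlib.md5(''.join(['alice', 'bob']).encode()).hexdigest()
new_hash = hashlib.md5(''.join(['alice', 'bob', 'carol']).encode()).hexdigest()
print(old_hash != new_hash)  # True, so the watchlist would be reloaded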
Example #3
def scrape(user, http, host):
	global nitters

	if user in new_accounts:
		count = args.count
		checkfn = None
		new_accounts.remove(user)
	else:
		checkfn = fetch_more_tweets_callback
		count = -1

	elapsed_time = time.time()
	insert_pos = 0
	sys.stdout.write('\r[%s] scraping %s... ' % (get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), user))
	sys.stdout.flush()

	twats, nitters, host, http = get_twats(user, proxies=args.proxy, count=count, http=http, checkfn=checkfn, nitters=nitters, host=host)

	new = False
	for t in twats:
		if not in_twatlist(user, t):
			new = True
			if args.unshorten: t = unshorten_urls(t, proxies=args.proxy, shorteners=shorteners)
			add_twatlist(user, t, insert_pos)
			insert_pos += 1
			if 'quote_tweet' in t:
				if not os.path.isdir(paths.get_user(t['quote_tweet']['user'])): retry_makedirs(paths.get_user(t['quote_tweet']['user']))
				fetch_profile_picture(t['quote_tweet']['user'], args.proxy, twhttp=nitter_rshttp, nitters=nitters)
			if 'user' in t:
				if not os.path.isdir(paths.get_user(t['user'])): retry_makedirs(paths.get_user(t['user']))
				fetch_profile_picture(t['user'], args.proxy, twhttp=nitter_rshttp, nitters=nitters)
			if args.mirror: mirror_twat(t, args=args)
			sys.stdout.write('\r[%s] scraping %s... +%d ' % (get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), user, insert_pos))
			sys.stdout.flush()

	if new: write_user_tweets(user)
	elapsed_time = (time.time() - elapsed_time)
	sys.stdout.write('done (%s)\n' % get_timestamp("%H:%M:%S", elapsed_time))
	sys.stdout.flush()
	return http, host
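A minimal sketch of how scrape() might be driven, assuming the load_watchlist() and globals shown above; the loop structure and the pause between passes are assumptions, not the project's actual main loop:

# hypothetical driver loop: reload the watchlist, scrape each enabled entry,
# and reuse the http handle and nitter host returned by the previous call
import time

http, host = None, None
while True:
    load_watchlist()
    for user in watchlist:
        if user in disabled_users:
            continue
        http, host = scrape(user, http, host)
    time.sleep(60)  # arbitrary pause between passes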
Example #4
			print('youtube-dl not found, videos won\'t be downloaded (path: %s)' % args.ytdl)
			args.mirror = args.mirror.replace('v','')

	if args.mirror_size > 0:
		args.mirror_size = args.mirror_size * 1024*1024

	shorteners = {}
	if args.unshorten:
		with open('shorteners.txt', 'r') as f:
			for i in f.readlines():
				i = i.strip()
				if len(i): shorteners[i] = True
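	# Note: shorteners.txt appears to hold one URL-shortener hostname per line;
	# hypothetical example contents:
	#   bit.ly
	#   tinyurl.com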

	if args.dir:
		if not os.path.exists(args.dir):
			retry_makedirs(args.dir)
		for d in site_dirs:
			if not os.path.exists(args.dir + d):
				os.symlink(os.getcwd() + d, args.dir + d)
		os.chdir(args.dir)

	args.proxy = [RocksockProxyFromURL(args.proxy)] if args.proxy else None

	nitter_rshttp = None
	host = None

	load_watchlist()

	## resume/retry mirroring process
	mirroring_done = threading.Event()
	if args.resume and args.mirror:
Example #5
def scrape(item, http, host, search, user_agent):
    global nitters
    global mastodon_rshttp
    item = item.lower()

    if item in new_accounts:
        count = args.count
        checkfn = None
        new_accounts.remove(item)
    else:
        checkfn = fetch_more_tweets_callback
        count = args.count if item[0] == '#' else -1

    page = 1  # default; get_toots() below does not return a page count
    if item.count('@') < 2:
        platform = 'twitter'
        twats, nitters, host, http, page = get_twats(item,
                                                     proxies=args.proxy,
                                                     count=count,
                                                     http=http,
                                                     checkfn=checkfn,
                                                     nitters=nitters,
                                                     host=host,
                                                     search=search,
                                                     user_agent=user_agent,
                                                     blacklist=blacklist,
                                                     whitelist=whitelist)
    else:
        platform = 'mastodon'
        twats, http = get_toots(item,
                                proxies=args.proxy,
                                count=count,
                                http=http,
                                checkfn=checkfn,
                                user_agent=user_agent,
                                blacklist=args.blacklist,
                                whitelist=args.whitelist)
        mastodon_rshttp[host] = http

    insert_pos = dict()
    new = False
    user = None if item[0] == '#' else item
    insert_pos_total = 0
    elapsed_time = time.time()
    for t in twats:
        if item[0] == '#': user = t['user'].lower()
        if not user in insert_pos: insert_pos[user] = 0

        if not in_twatlist(user, t):
            new = True
            if args.unshorten:
                t = unshorten_urls(t,
                                   proxies=args.proxy,
                                   shorteners=shorteners)
            add_twatlist(user, t, insert_pos[user])
            insert_pos[user] += 1
            insert_pos_total += 1
            if 'quote_tweet' in t:
                if '@' in t['quote_tweet']['user']:
                    _, foo, bar = t['quote_tweet']['user'].split('@')
                    http = mastodon_rshttp.get(bar)

                if not os.path.isdir(paths.get_user(t['quote_tweet']['user'])):
                    retry_makedirs(paths.get_user(t['quote_tweet']['user']))
                fetch_profile_picture(t['quote_tweet']['user'],
                                      args.proxy,
                                      twhttp=http,
                                      nitters=nitters,
                                      platform=platform)
            if 'user' in t:
                if '@' in t['user']:
                    _, foo, bar = t['user'].split('@')
                    http = mastodon_rshttp.get(bar)

                if not os.path.isdir(paths.get_user(t['user'])):
                    retry_makedirs(paths.get_user(t['user']))
                fetch_profile_picture(t['user'],
                                      args.proxy,
                                      twhttp=http,
                                      nitters=nitters,
                                      platform=platform)
            if args.mirror: mirror_twat(t, args=args)
            sys.stdout.write(
                '\r[%s] %s: extracting from %d page(s): +%d twat(s)' %
                (misc.get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), item,
                 page, insert_pos_total))
            sys.stdout.flush()

    if new:
        if item[0] == '#':
            for user in insert_pos.keys():
                write_user_tweets(user)
        else:
            write_user_tweets(item)
    elapsed_time = (time.time() - elapsed_time)
    sys.stdout.write('done (%s)\n' %
                     misc.get_timestamp("%H:%M:%S", elapsed_time))
    sys.stdout.flush()
    return http, host
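The item.count('@') check above is what separates plain Twitter handles and '#' keywords from Mastodon-style user@instance addresses. A small standalone illustration of that dispatch (the sample items are hypothetical):

# platform dispatch as used in scrape() above; sample items are made up
for item in ('somebody', '#keyword', '@somebody@example.social'):
    platform = 'twitter' if item.count('@') < 2 else 'mastodon'
    print('%s -> %s' % (item, platform))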
Example #6
def mirror_twat(twat, args=None):

    if 'owner' in twat:
        user = twat['owner'].lower()
    else:
        user = twat['user'].lower()

    if not os.path.isdir('data'): retry_makedirs('data')

    ## soupify user's text
    soup = soupify(twat["text"])

    ## try to automatically mirror links posted by the user,
    ## if it matches the extension list.

    if 'c' in args.mirror and 'curl' in twat:
        url = twat['curl']
        # XXX: unsupported nitter feature
        # cards display fine when loaded from twitter in a regular browser,
        # where they are probably rendered by some js code
        # TODO: check if nitter handles card:// stuff..
        unsupported_schemes = ['card://']
        if not any(url.startswith(scheme) for scheme in unsupported_schemes):
            url_components = _split_url(url)
            url_components['filename'] = 'card.html'
            _mirror_file(url_components, user, twat['id'], args)

    if 'f' in args.mirror:
        for a in soup.body.find_all('a'):
            if 'data-expanded-url' in a.attrs:
                url_components = _split_url(a.attrs['data-expanded-url'])

                if 'filename' in url_components:
                    _mirror_file(url_components,
                                 user,
                                 twat['id'],
                                 args,
                                 content_type=True)

    ## mirror videos
    if 'v' in args.mirror and 'video' in twat:
        tid = str(twat['id'])
        url = 'https://twitter.com/%s/status/%s' % (twat['user'], tid)
        outname = paths.get_user(twat['user']) + '/%s.mp4' % tid
        if not os.path.exists('data/%s.mp4' % tid):
            if args.proxy:
                os.system('%s --proxy %s -o data/%s.mp4 %s > /dev/null 2>&1' %
                          (args.ytdl, args.rawproxy, tid, url))
            else:
                os.system('%s -o data/%s.mp4 %s > /dev/null 2>&1' %
                          (args.ytdl, tid, url))
        if not os.path.exists('%s' % outname) and os.path.exists(
                'data/%s.mp4' % tid):
            os.symlink('../../data/%s.mp4' % tid, outname)

    ## mirror posted pictures
    if 'images' in twat and 'i' in args.mirror:

        for i in twat['images']:

            if '?format=' in i:
                i = i.split('&')[0]
                fmt = i.split('=')[1]
                i = '%s.%s' % (i.split('?')[0], fmt)

            url_components = _split_url(i)
            if 'filename' in url_components:
                _mirror_file(url_components, user, twat['id'], args)

    ## deal with emojis
    if 'e' in args.mirror:
        for img in soup.body.find_all('img'):
            if 'class' in img.attrs and 'Emoji' in img.attrs['class']:
                src = img.attrs['src']
                src = src.encode('utf-8') if isinstance(src, unicode) else src

                split = src.split('/')
                host = split[2]
                emodir = '/'.join(split[3:len(split) - 1])
                filename = split[-1]
                uri = '%s/%s' % (emodir, filename)

                if not os.path.isdir(emodir):
                    retry_makedirs(emodir)

                if not os.path.exists('%s/%s' % (emodir, filename)):
                    http = RsHttp(host=host,
                                  port=443,
                                  timeout=30,
                                  ssl=True,
                                  keep_alive=True,
                                  follow_redirects=True,
                                  auto_set_cookies=True,
                                  proxies=args.proxy,
                                  user_agent="curl/7.60.0")
                    while not http.connect():
                        # FIXME : what should happen on connect error ?
                        pass
                    hdr, res = http.get('/%s' % uri)
                    res = res.encode('utf-8') if isinstance(res,
                                                            unicode) else res
                    retry_write('%s/%s' % (emodir, filename), res)
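For reference, the single-letter flags that mirror_twat() looks for in args.mirror correspond to the branches above. Summarized as a lookup table for readability (this dict is not part of the original module):

# summary of the mirror flags handled by mirror_twat()
MIRROR_FLAGS = {
    'c': "card URLs (twat['curl'])",
    'f': "files linked via data-expanded-url attributes",
    'v': "videos, fetched with youtube-dl (args.ytdl)",
    'i': "posted images",
    'e': "emoji images",
}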