Example #1
def fetch_profile_picture(user, proxies, res=None, twhttp=None, nitters={}):
    pic_path = paths.get_profile_pic(user)
    if os.path.isfile(pic_path): return

    if not res:
        while not twhttp:
            twhttp, host, nitters = nitter_connect(nitters, proxies)
            # no available instance; the picture will be fetched another time
            if not twhttp: return

        hdr, res = twhttp.get("/%s" % user)

    soup = soupify(res)
    for meta in soup.find_all('meta', attrs={'property': 'og:image'}):
        content = meta.get('content')
        # relative urls are resolved against a working nitter instance
        pic_url = content if '://' in content else 'https://%s%s' % (
            get_nitter_instance(nitters, False), content)
        url_components = _split_url(pic_url)
        http = RsHttp(host=url_components['host'],
                      port=url_components['port'],
                      timeout=15,
                      ssl=url_components['ssl'],
                      keep_alive=True,
                      follow_redirects=True,
                      auto_set_cookies=True,
                      proxies=proxies,
                      user_agent="curl/7.60.0")

        # if connection fails, the profile picture
        # will be fetched another time
        if not http.connect(): return

        hdr, res = http.get(url_components['uri'])
        if res == '' and hdr != "":
            print('error fetching profile picture: %s' % url_components)
        else:
            res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
            retry_write(pic_path, res_bytes)
        return

    return
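
All three examples rely on a _split_url() helper that is not shown here. Judging only from the keys the examples read from its result ('host', 'port', 'ssl', 'uri', 'filename'), a minimal sketch of what such a helper might look like, built on the standard urlparse module, could be:

def _split_url(url):
    # minimal sketch, not the project's actual implementation
    try:
        from urllib.parse import urlparse  # python 3
    except ImportError:
        from urlparse import urlparse      # python 2
    parsed = urlparse(url)
    ssl = parsed.scheme == 'https'
    components = {
        'host': parsed.hostname,
        'port': parsed.port or (443 if ssl else 80),
        'ssl': ssl,
        'uri': parsed.path + ('?%s' % parsed.query if parsed.query else ''),
    }
    # the callers check for 'filename' before using it, so only set it
    # when the path actually ends in a file name
    filename = parsed.path.split('/')[-1]
    if filename:
        components['filename'] = filename
    return components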
Example #2
def _mirror_file(url_components,
                 user,
                 tid,
                 args=None,
                 content_type=None,
                 force=False):
    outname = paths.get_user(user) + '/%s-%s' % (tid,
                                                 url_components['filename'])
    if not force and os.path.exists(outname):
        return

    http = RsHttp(url_components['host'],
                  ssl=url_components['ssl'],
                  port=url_components['port'],
                  keep_alive=True,
                  follow_redirects=True,
                  auto_set_cookies=True,
                  proxies=args.proxy,
                  user_agent="curl/7.60.0")

    ## do nothing if we cannot connect
    if not http.connect(): return None

    ext = url_components['filename'].split('.')[-1]

    if content_type:

        if args.ext: filtre = str(args.ext).split(',')
        else: filtre = []

        hdr = http.head(url_components['uri'])

        ## max mirror size
        if args.mirror_size:
            # extract second part of the Content-Length: line
            value = [
                str(i.split(':')[1]).strip() for i in hdr.split('\n')
                if i.lower().startswith('content-length:')
            ]
            if not len(value) or int(value[0]) > args.mirror_size: return

        # extract second part of the Content-Type: line
        value = [
            str(i.split(':')[1]).strip() for i in hdr.split('\n')
            if i.lower().startswith('content-type:')
        ]

        ## server does not provide Content-Type info
        if not len(value):
            return
        # content type contains ';' (usually when html)
        elif ';' in value[0]:
            value[0] = value[0].split(';')[0]
        value = value[0].split('/')

        ## when filtering extensions (--ext)
        ## if unset, everything is mirrored
        if len(filtre):
            ## values don't match anything
            if len(value) < 2 or (not value[0] in filtre
                                  and not value[1] in filtre):
                return

        # XXX : mirror html files
        ## we don't actually save html files;
        ## what about making an automated save
        ## through the wayback machine ?
        if 'html' in value: return

        ## previous http object cannot be re-used
        http = RsHttp(url_components['host'],
                      ssl=url_components['ssl'],
                      port=url_components['port'],
                      keep_alive=True,
                      follow_redirects=True,
                      auto_set_cookies=True,
                      proxies=args.proxy,
                      user_agent="curl/7.60.0")

        ## do nothing if we cannot connect
        if not http.connect(): return

    extras = []
    if (url_components['filename'] == 'card.html'
            and 'twitter.com' in url_components['host']):
        extras.append("Referer: https://twitter.com/")

    hdr, res = http.get(url_components['uri'], extras=extras)
    if res == '' and hdr != "":
        # print http error code when things go wrong
        print('%s%s : %s' % (url_components['host'], url_components['uri'],
                             hdr.split('\n')[0]))
        return

    res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
    filehash = _hash(res_bytes)
    out_fn = 'data/%s.%s' % (filehash, ext)
    if not os.path.exists(out_fn):
        retry_write(out_fn, res_bytes)

    if os.path.lexists(outname): os.unlink(outname)
    os.symlink('../../data/%s.%s' % (filehash, ext), outname)
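
The Content-Length / Content-Type checks in _mirror_file() parse the raw header block returned by RsHttp.head() line by line. A small standalone illustration of that parsing pattern, using a hand-written sample header (the values below are made up for the demonstration):

sample_hdr = ('HTTP/1.1 200 OK\n'
              'Content-Type: image/jpeg; charset=binary\n'
              'Content-Length: 34567\n')

# same comprehension as above: keep the value of the first matching
# header line, matched case-insensitively
value = [
    str(i.split(':')[1]).strip() for i in sample_hdr.split('\n')
    if i.lower().startswith('content-type:')
]
if value and ';' in value[0]:
    value[0] = value[0].split(';')[0]
print(value[0].split('/'))  # -> ['image', 'jpeg']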
Example #3
def mirror_twat(twat, args=None):

    if 'owner' in twat:
        user = twat['owner'].lower()
    else:
        user = twat['user'].lower()

    if not os.path.isdir('data'): retry_makedirs('data')

    ## soupify user's text
    soup = soupify(twat["text"])

    ## try to automatically mirror links posted by the user,
    ## if it matches the extension list.

    if 'c' in args.mirror and 'curl' in twat:
        url = twat['curl']
        # XXX: unsupported nitter feature
        # card:// urls display fine when loading from twitter in a regular
        # browser, where they are probably converted by some js code
        # TODO: check if nitter handles card:// stuff..
        unsupported_schemes = ['card://']
        if not any(url.startswith(s) for s in unsupported_schemes):
            url_components = _split_url(url)
            url_components['filename'] = 'card.html'  #% twat['id']
            _mirror_file(url_components, user, twat['id'], args)

    if 'f' in args.mirror:
        for a in soup.body.find_all('a'):
            if 'data-expanded-url' in a.attrs:
                url_components = _split_url(a.attrs['data-expanded-url'])

                if 'filename' in url_components:
                    _mirror_file(url_components,
                                 user,
                                 twat['id'],
                                 args,
                                 content_type=True)

    ## mirror videos
    if 'v' in args.mirror and 'video' in twat:
        tid = str(twat['id'])
        url = 'https://twitter.com/%s/status/%s' % (twat['user'], tid)
        outname = paths.get_user(twat['user']) + '/%s.mp4' % tid
        if not os.path.exists('data/%s.mp4' % tid):
            if args.proxy:
                os.system('%s --proxy %s -o data/%s.mp4 %s > /dev/null 2>&1' %
                          (args.ytdl, args.rawproxy, tid, url))
            else:
                os.system('%s -o data/%s.mp4 %s > /dev/null 2>&1' %
                          (args.ytdl, tid, url))
        if not os.path.exists('%s' % outname) and os.path.exists(
                'data/%s.mp4' % tid):
            os.symlink('../../data/%s.mp4' % tid, outname)

    ## mirror posted pictures
    if 'images' in twat and 'i' in args.mirror:

        for i in twat['images']:

            # rewrite '...?format=jpg&name=...' urls into plain '....jpg'
            if '?format=' in i:
                i = i.split('&')[0]
                fmt = i.split('=')[1]
                i = '%s.%s' % (i.split('?')[0], fmt)

            url_components = _split_url(i)
            if 'filename' in url_components:
                _mirror_file(url_components, user, twat['id'], args)

    ## deal with emojis
    if 'e' in args.mirror:
        for img in soup.body.find_all('img'):
            if 'class' in img.attrs and 'Emoji' in img.attrs['class']:
                src = img.attrs['src']
                src = src.encode('utf-8') if isinstance(src, unicode) else src

                split = src.split('/')
                host = split[2]
                emodir = '/'.join(split[3:len(split) - 1])
                filename = split[-1]
                uri = '%s/%s' % (emodir, filename)

                if not os.path.isdir(emodir):
                    retry_makedirs(emodir)

                if not os.path.exists('%s/%s' % (emodir, filename)):
                    http = RsHttp(host=host,
                                  port=443,
                                  timeout=30,
                                  ssl=True,
                                  keep_alive=True,
                                  follow_redirects=True,
                                  auto_set_cookies=True,
                                  proxies=args.proxy,
                                  user_agent="curl/7.60.0")
                    while not http.connect():
                        # FIXME : what should happen on connect error ?
                        pass
                    hdr, res = http.get('/%s' % uri)
                    res = res.encode('utf-8') if isinstance(res, unicode) else res
                    retry_write('%s/%s' % (emodir, filename), res)
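
mirror_twat() expects an args object carrying the command-line options referenced above (mirror, proxy, rawproxy, ext, mirror_size, ytdl) and a twat dict with at least 'user', 'id' and 'text'. A rough usage sketch follows; every value below is an assumption made for illustration, not the scraper's real defaults:

from argparse import Namespace

args = Namespace(
    mirror='fiev',      # assumed flags: files, images, emojis, videos
    proxy=None,         # handed to RsHttp as-is
    rawproxy=None,      # proxy string passed to the video downloader
    ext=None,           # optional comma-separated extension filter
    mirror_size=None,   # optional maximum size (bytes) for mirrored files
    ytdl='youtube-dl',  # external video download command
)

twat = {
    'user': 'SomeAccount',
    'id': 1234567890,
    'text': '<p><a data-expanded-url="https://example.com/pic.jpg">pic</a></p>',
    'images': ['https://example.com/media/abcd?format=jpg&name=small'],
}

mirror_twat(twat, args=args)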