Example #1
def extract_twats(html, user, twats, timestamp, checkfn, nitters):
    def find_div_end(html):
        level = 0
        for i in xrange(len(html)):
            if html.startswith('<div', i):
                level += 1
            if html.startswith('</div', i):
                level -= 1
            if level == 0:
                return i + len('</div>')

    regex = re.compile(r'<div.*class.*[" ]timeline.item[" ]')
    nfetched = 0
    cursor = [
        a.get('href') for a in soupify(html).body.find_all('a')
        if a.get('href').startswith('?cursor=')
    ]
    while 1:
        match = regex.search(html)
        if not match:
            return twats, cursor
        html = html[match.start():]
        div_end = find_div_end(html)
        slice = html[:div_end]
        html = html[div_end:]
        #twats = extract_twat(soupify(slice), twats, timestamp)
        twats = extract_twat(soupify(html), twats, timestamp, nitters)
        nfetched += 1
        # if the first two (the very first could be pinned) tweets are already known
        # do not waste cpu processing more html
        if nfetched == 2 and checkfn and not checkfn(user, twats):
            return twats, cursor
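The nested find_div_end helper above tracks <div> / </div> nesting depth to locate where the matched timeline item ends. A minimal, self-contained sketch of the same depth-counting idea (the sample markup below is made up for illustration, not taken from the scraper):

def find_div_end(html):
    # return the index just past the </div> that closes the first <div>
    level = 0
    for i in range(len(html)):
        if html.startswith('<div', i):
            level += 1
        elif html.startswith('</div', i):
            level -= 1
            if level == 0:
                return i + len('</div>')
    return None

sample = '<div class="timeline-item"><div>inner</div></div><div>next</div>'
print(sample[:find_div_end(sample)])  # -> <div class="timeline-item"><div>inner</div></div>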
Example #2
def replace_url_in_twat(twat, args=None):

    user = twat['user'].lower()

    soup = soupify(twat["text"])

    # linked files
    for a in soup.body.find_all('a'):
        ## replace /search?q= links
        if a.attrs['href'].startswith('/search'):
            twat['text'] = twat['text'].replace('/search?q=',
                                                '/index.html?search=')

        ## @username : replace when local
        elif 'title' in a.attrs:
            username = a.attrs['href'].split('/')[1]
            at_link = user_at_link(username.lower())
            if username.find('@') == -1:
                rebuild = '<b>%s<a href="https://%s/%s">%s</a></b>' % (
                    at_link, random.choice(args.instances), username, username)
            else:
                _, u, h = username.split('@')
                rebuild = '<b>%s<a href="https://%s/@%s">%s</a></b>' % (
                    at_link, h, u, username)
            # this fails when nonascii chars are present in a['title']
            # XXX: would be nice to remove that 'title' attr, which would solve the issue
            try:
                twat['text'] = twat['text'].replace(str(a), rebuild)
            except Exception as e:
                print('replace_url_in_twat: %s' % e)
                pass

    return twat['text']
Example #3
def get_twats_mobile(user, proxies=None):
    host = 'mobile.twitter.com'
    http = RsHttp(host=host,
                  port=443,
                  timeout=30,
                  ssl=True,
                  keep_alive=True,
                  follow_redirects=True,
                  auto_set_cookies=True,
                  proxies=proxies,
                  user_agent="curl/7.60.0")
    #	http.debugreq = True
    while not http.connect():
        # FIXME : what should happen on connect error ?
        pass
    hdr, res = http.get("/" + user)

    twats = []

    soup = soupify(res)
    tweet_id = 0
    tweet_user = None
    tweet_time = None
    tweet_text = None

    for tbl in soup.body.find_all('table'):  # , attrs={'class':'tweet  '}):
        if not "class" in tbl.attrs: continue
        if not "tweet" in repr(tbl.attrs["class"]): continue
        for td in tbl.find_all('td'):
            cls = td.attrs["class"][0]
            #print "." + repr(cls) + "."
            if cls == "user-info":
                tweet_user = td.find('div', attrs={
                    'class': 'username'
                }).text.strip()
            elif cls == 'timestamp':
                a = td.find('a')
                tweet_time = a.text
                tweet_id = a.attrs["href"].rstrip("?p=p")
            elif cls == 'tweet-content':
                tweet_text = td.find('div', attrs={
                    'class': 'tweet-text'
                }).text.strip()
        if tweet_user is not None and tweet_id:
            twats.append({
                'id': tweet_id,
                'user': tweet_user,
                'time': tweet_time,
                'text': tweet_text
            })

    return twats
Example #4
def unshorten_urls(twat, proxies=None, shorteners={}):
    soup = soupify(twat["text"])
    for a in soup.body.find_all('a'):
        href = a.attrs['href']
        comp = _split_url(href)
        if comp['host'] in shorteners:
            try:
                twat['text'] = twat['text'].decode('utf8').replace(
                    href, _get_real_location(href, proxies=proxies))
            except:
                pass

    return twat
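unshorten_urls relies on the project's _get_real_location helper to resolve a shortener link to its final destination. A minimal stand-in built on the standard library, in case the intent is unclear (an illustrative sketch, not the project's implementation):

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen         # Python 2

def get_real_location(url, timeout=15):
    # urlopen follows HTTP redirects by default; geturl() returns the final URL
    return urlopen(url, timeout=timeout).geturl()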
Example #5
def extract_twats(html, item, twats, timestamp, checkfn, nitters, blacklist,
                  whitelist):
    def find_div_end(html):
        level = 0
        for i in xrange(len(html)):
            if html.startswith('<div', i):
                level += 1
            if html.startswith('</div', i):
                level -= 1
            if level == 0:
                return i + len('</div>')

    regex = re.compile(r'<div.*class.*[" ]timeline.item[" ]')
    nfetched = 0
    _as = '\n'.join([rs for rs in rsparse.find_all_tags(html, 'a')])
    cursor = [
        a.get('href') for a in soupify(_as).body.find_all('a')
        if a.get('href').find('cursor=') != -1
    ]
    while 1:
        match = regex.search(html)
        if not match:
            return twats, cursor
        html = html[match.start():]
        div_end = find_div_end(html)
        slice = html[:div_end]
        html = html[div_end:]
        twats = extract_twat(soupify(html), twats, timestamp, nitters,
                             blacklist, whitelist)
        nfetched += 1
        # if the first two (the very first could be pinned) tweets are already known
        # do not waste cpu processing more html
        if nfetched == 2 and checkfn and not checkfn(item, twats):
            return twats, cursor
Example #6
def fetch_profile_picture(user, proxies, res=None, twhttp=None, nitters={}):
    pic_path = paths.get_profile_pic(user)
    if os.path.isfile(pic_path): return

    if not res:
        while not twhttp:
            twhttp, host, nitters = nitter_connect(nitters, proxies)
            # no avail. instance, pic will be scraped another time
            if not twhttp: return

        hdr, res = twhttp.get("/%s" % user)

    soup = soupify(res)
    for meta in soup.find_all('meta', attrs={'property': 'og:image'}):
        content = meta.get('content')
        pic_url = content if '://' in content else 'https://%s%s' % (
            get_nitter_instance(nitters, False), content)
        url_components = _split_url(pic_url)
        http = RsHttp(host=url_components['host'],
                      port=url_components['port'],
                      timeout=15,
                      ssl=url_components['ssl'],
                      keep_alive=True,
                      follow_redirects=True,
                      auto_set_cookies=True,
                      proxies=proxies,
                      user_agent="curl/7.60.0")

        # if connection fails, the profile picture
        # will be fetched another time
        if not http.connect(): return

        hdr, res = http.get(url_components['uri'])
        if res == '' and hdr != "":
            print('error fetching profile picture: %s' % url_components)
        else:
            res_bytes = res.encode('utf-8') if isinstance(res,
                                                          unicode) else res
            retry_write(pic_path, res_bytes)
        return

    return
Example #7
def get_twat_timestamp(twat_id):
    host = 'twitter.com'
    http = RsHttp(host=host,
                  port=443,
                  timeout=30,
                  ssl=True,
                  keep_alive=True,
                  follow_redirects=True,
                  auto_set_cookies=True,
                  user_agent="curl/7.60.0")
    while not http.connect():
        # FIXME : what should happen on connect error ?
        pass
    hdr, res = http.get(twat_id)
    soup = soupify(res)
    for small in soup.body.find_all('small', attrs={'class': 'time'}):
        if small.find('a').attrs["href"] == twat_id:
            for span in small.find_all('span'):
                if 'data-time' in span.attrs:
                    return int(span.attrs['data-time'])
    return 0
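The function returns the raw epoch value taken from the data-time attribute, or 0 when no matching span is found. A small usage sketch (the status path below is made up):

import time

ts = get_twat_timestamp('/someuser/status/1234567890')  # hypothetical status path
if ts:
    print(time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime(ts)))
else:
    print('timestamp not found')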
Example #8
def replace_url_in_twat(twat, args=None):

	user = twat['user'].lower()

	soup = soupify(twat["text"])

	# linked files
	for a in soup.body.find_all('a'):
		## replace /search?q= links
		if a.attrs['href'].startswith('/search'):
			twat['text'] = twat['text'].replace('/search?q=', '/index.html?search=')

		## @username : replace when local
		elif 'title' in a.attrs:
			username = a.attrs['href'].split('/')[1]
			at_link = user_at_link(username)
			rebuild = '<b>%s<a href="https://twitter.com/%s">%s</a></b>' % (at_link, username, username)
			# this fails when nonascii chars are present in a['title']
			# XXX: would be nice to remove that 'title' attr, which would solve the issue
			try: twat['text'] = twat['text'].replace(str(a), rebuild)
			except: pass

	return twat['text']
Example #9
def mirror_twat(twat, args=None):

    if 'owner' in twat:
        user = twat['owner'].lower()
    else:
        user = twat['user'].lower()

    if not os.path.isdir('data'): retry_makedirs('data')

    ## soupify user's text
    soup = soupify(twat["text"])

    ## try to automatically mirror links posted by the user,
    ## if it matches the extension list.

    if 'c' in args.mirror and 'curl' in twat:
        url = twat['curl']
        # XXX: unsupported nitter feature
        # this displays fine when loading from twitter in a regular browser,
        # which is probably converted using some js code
        # TODO: check if nitter handles card:// stuff..
        unsupported_schemes = ['card://']
        for _us in unsupported_schemes:
            if url.startswith(_us): continue
            url_components = _split_url(url)
            url_components['filename'] = 'card.html'  #% twat['id']
            _mirror_file(url_components, user, twat['id'], args)

    if 'f' in args.mirror:
        for a in soup.body.find_all('a'):
            if 'data-expanded-url' in a.attrs:
                url_components = _split_url(a.attrs['data-expanded-url'])

                if 'filename' in url_components:
                    _mirror_file(url_components,
                                 user,
                                 twat['id'],
                                 args,
                                 content_type=True)

    ## mirror videos
    if 'v' in args.mirror and 'video' in twat:
        tid = str(twat['id'])
        url = 'https://twitter.com/%s/status/%s' % (twat['user'], tid)
        outname = paths.get_user(twat['user']) + '/%s.mp4' % tid
        if not os.path.exists('data/%s.mp4' % tid):
            if args.proxy:
                os.system('%s --proxy %s -o data/%s.mp4 %s > /dev/null 2>&1' %
                          (args.ytdl, args.rawproxy, tid, url))
            else:
                os.system('%s -o data/%s.mp4 %s > /dev/null 2>&1' %
                          (args.ytdl, tid, url))
        if not os.path.exists('%s' % outname) and os.path.exists(
                'data/%s.mp4' % tid):
            os.symlink('../../data/%s.mp4' % tid, outname)

    ## mirror posted pictures
    if 'images' in twat and 'i' in args.mirror:

        for i in twat['images']:

            if '?format=' in i:
                i = i.split('&')[0]
                fmt = i.split('=')[1]
                i = '%s.%s' % (i.split('?')[0], fmt)

            url_components = _split_url(i)
            if 'filename' in url_components:
                _mirror_file(url_components, user, twat['id'], args)

    ## deal with emojis
    if 'e' in args.mirror:
        for img in soup.body.find_all('img'):
            if 'class' in img.attrs and 'Emoji' in img.attrs['class']:
                src = img.attrs['src']
                src = src.encode('utf-8') if isinstance(src, unicode) else src

                split = src.split('/')
                host = split[2]
                emodir = '/'.join(split[3:len(split) - 1])
                filename = split[-1]
                uri = '%s/%s' % (emodir, filename)

                if not os.path.isdir(emodir):
                    retry_makedirs(emodir)

                if not os.path.exists('%s/%s' % (emodir, filename)):
                    http = RsHttp(host=host,
                                  port=443,
                                  timeout=30,
                                  ssl=True,
                                  keep_alive=True,
                                  follow_redirects=True,
                                  auto_set_cookies=True,
                                  proxies=args.proxy,
                                  user_agent="curl/7.60.0")
                    while not http.connect():
                        # FIXME : what should happen on connect error ?
                        pass
                    hdr, res = http.get('/%s' % uri)
                    res = res.encode('utf-8') if isinstance(res,
                                                            unicode) else res
                    retry_write('%s/%s' % (emodir, filename), res)
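When mirroring pictures, URLs of the form .../media/<id>?format=jpg&name=small are rewritten to a plain filename before being handed to _split_url. A standalone walk-through of that rewrite (the URL below is made up for illustration):

i = 'https://pbs.twimg.com/media/AbCdEf?format=jpg&name=small'  # hypothetical image URL
if '?format=' in i:
    i = i.split('&')[0]                   # 'https://pbs.twimg.com/media/AbCdEf?format=jpg'
    fmt = i.split('=')[1]                 # 'jpg'
    i = '%s.%s' % (i.split('?')[0], fmt)  # 'https://pbs.twimg.com/media/AbCdEf.jpg'
print(i)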
Example #10
def extract_toots(html, item, toots, timestamp, checkfn, ignore={}):
    cursor = [
        a.get('href') for a in soupify(html).body.find_all('a')
        if a.get('href').find('?max_id=') != -1
    ]
    cursor = cursor[0] if len(cursor) else None
    quote_toot = None
    images = []
    toot = dict()

    elements = [
        div for div in soupify(html).body.find_all('div')
        if ('class' in div.attrs and 'status-public' in div.attrs['class'])
    ]

    for element in elements:
        video = None
        card = None
        images = list()
        toot_text = None
        toot_boosted = False
        pinned = False
        toot_author = None
        toot_time = None

        for span in element.find_all('span'):
            if span.get_text() == 'Pinned post':
                pinned = True
                break

        infodiv = element.find('div', attrs={'class': 'status__info'})
        if infodiv is None: continue  # should not happen
        toot_id = infodiv.find('a', attrs={
            'class': 'status__relative-time'
        }).get('href').split('/')[4]
        # XXX some toot_id are in format dead-beef-0123
        # also, usernames could appear ?
        toot_id = int(toot_id) if toot_id.isdigit() else toot_id
        toot_time = time_to_timegm(
            infodiv.find('data', attrs={
                'class': 'dt-published'
            }).get('value'))
        toot_author = infodiv.find('a',
                                   attrs={
                                       'class': 'status__display-name'
                                   }).get('href').split('/')[3].lower()
        toot_displayname = infodiv.find('strong',
                                        attrs={
                                            'class': 'display-name__html'
                                        }).get_text()
        toot_account = infodiv.find('span',
                                    attrs={
                                        'class': 'display-name__account'
                                    }).contents[0].strip()
        if toot_account in ignore: continue
        # FIXME: toot_text has weird formatting upon scraping, but displays fine
        # once twatbot is restarted... this needs investigating.
        toot_text = str(element.find('div', attrs={'class': 'e-content'}))
        toot_text = toot_text.encode('utf-8') if isinstance(
            toot_text, unicode) else toot_text
        #toot_avatar = infodiv.find('img', attrs={'class':'account__avatar'}).get('src')

        card = element.find('div', attrs={'data-component': 'Card'})
        if card:
            card = extract_props(card)

        video = element.find('div', attrs={'data-component': 'Video'})
        if video:
            video = extract_props(video)
            for v in video['media']:
                images.append(v['preview_url'])

        gallery = element.find('div', attrs={'data-component': 'MediaGallery'})
        if gallery:
            gallery = extract_props(gallery)
            images.append(gallery['media'][0]['url'])

        toot = {
            'owner': toot_account,
            'fetched': int(time.time()),
            'time': toot_time,
            'id': toot_id,
            'user': toot_account,
            'displayname': toot_displayname,
            'account': toot_account,
            'text': toot_text,
        }

        if item != toot_account: toot['rid'] = toot_id
        if pinned: toot['pinned'] = 1
        if len(images): toot['images'] = images
        if video: toot['video'] = 1

        if card:
            toot['curl'] = card['card']['url']
            toot['ctitle'] = card['card']['title']
            toot['cdesc'] = card['card']['description']

        toots.append(toot)


        # print(toot)

    return toots, cursor