Example #1
def get_as2(url):
    """Tries to fetch the given URL as ActivityStreams 2.

    Uses HTTP content negotiation via the Accept header and checks the
    response's Content-Type. If the URL is HTML and has a rel-alternate link
    with an AS2 content type, fetches that URL and returns its response.

    Args:
        url: string

    Returns:
        :class:`requests.Response`

    Raises:
        :class:`requests.HTTPError`, :class:`werkzeug.exceptions.HTTPException`

        If we raise a werkzeug HTTPException, it will have an additional
        requests_response attribute with the last requests.Response we received.
    """
    def _error(resp):
        msg = "Couldn't fetch %s as ActivityStreams 2" % url
        logging.warning(msg)
        err = BadGateway(msg)
        err.requests_response = resp
        raise err

    resp = requests_get(url, headers=CONNEG_HEADERS_AS2_HTML)
    if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
        return resp

    parsed = util.parse_html(resp)
    as2 = parsed.find('link',
                      rel=('alternate', 'self'),
                      type=(CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD))
    if not (as2 and as2.get('href')):
        _error(resp)

    resp = requests_get(urllib.parse.urljoin(resp.url, as2['href']),
                        headers=CONNEG_HEADERS_AS2)
    if content_type(resp) in (CONTENT_TYPE_AS2, CONTENT_TYPE_AS2_LD):
        return resp

    _error(resp)
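
A minimal caller sketch for get_as2 above, assuming it and the module's imports are in scope; the fetch_remote_object name and its JSON handling are illustrative, not part of the original.

import logging

import requests
from werkzeug.exceptions import HTTPException

def fetch_remote_object(url):
    """Hypothetical caller: fetches a remote post as AS2, logging failures."""
    try:
        resp = get_as2(url)  # defined above
    except (requests.HTTPError, HTTPException) as e:
        # get_as2 attaches the last requests.Response it saw to the exception
        last = getattr(e, 'requests_response', None)
        logging.warning("Couldn't fetch %s as AS2, last status: %s", url,
                        last.status_code if last else 'unknown')
        return None
    return resp.json()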
Example #2
    def _try_salmon(self, resp):
        """
        Args:
          resp: Response
        """
        # fetch target HTML page, extract Atom rel-alternate link
        target = resp.target()
        if not self.target_resp:
            self.target_resp = common.requests_get(target)

        parsed = util.parse_html(self.target_resp)
        atom_link = parsed.find('link',
                                rel='alternate',
                                type=common.CONTENT_TYPE_ATOM)
        if not atom_link or not atom_link.get('href'):
            self.error('Target post %s has no Atom link' % resp.target(),
                       status=400)

        # fetch Atom target post, extract and inject id into source object
        base_url = ''
        base = parsed.find('base')
        if base and base.get('href'):
            base_url = base['href']
        atom_url = urllib.parse.urljoin(
            resp.target(), urllib.parse.urljoin(base_url, atom_link['href']))

        feed = common.requests_get(atom_url).text
        parsed = feedparser.parse(feed)
        logging.info('Parsed: %s', json_dumps(parsed, indent=2))
        entry = parsed.entries[0]
        target_id = entry.id
        in_reply_to = self.source_obj.get('inReplyTo')
        source_obj_obj = self.source_obj.get('object')
        if in_reply_to:
            for elem in in_reply_to:
                if elem.get('url') == target:
                    elem['id'] = target_id
        elif isinstance(source_obj_obj, dict):
            source_obj_obj['id'] = target_id

        # Mastodon (and maybe others?) require a rel-mentioned link to the
        # original post's author to make it show up as a reply:
        #   app/services/process_interaction_service.rb
        # ...so add them as a tag, which atom renders as a rel-mention link.
        authors = entry.get('authors')
        if authors:
            url = authors[0].get('href')
            if url:
                self.source_obj.setdefault('tags', []).append({'url': url})

        # extract and discover salmon endpoint
        logging.info('Discovering Salmon endpoint in %s', atom_url)
        endpoint = django_salmon.discover_salmon_endpoint(feed)

        if not endpoint:
            # try webfinger
            parsed = urllib.parse.urlparse(resp.target())
            # TODO: test missing email
            author = entry.get('author_detail', {})
            email = author.get('email') or '@'.join(
                (author.get('name', ''), parsed.netloc))
            try:
                # TODO: always https?
                profile = common.requests_get(
                    '%s://%s/.well-known/webfinger?resource=acct:%s' %
                    (parsed.scheme, parsed.netloc, email),
                    verify=False)
                endpoint = django_salmon.get_salmon_replies_link(
                    profile.json())
            except requests.HTTPError:
                pass

        if not endpoint:
            self.error('No salmon endpoint found!', status=400)
        logging.info('Discovered Salmon endpoint %s', endpoint)

        # construct reply Atom object
        self.source_url = resp.source()
        activity = self.source_obj
        if self.source_obj.get('verb') not in source.VERBS_WITH_OBJECT:
            activity = {'object': self.source_obj}
        entry = atom.activity_to_atom(activity, xml_base=self.source_url)
        logging.info('Converted %s to Atom:\n%s', self.source_url, entry)

        # sign reply and wrap in magic envelope
        domain = urllib.parse.urlparse(self.source_url).netloc
        key = MagicKey.get_or_create(domain)
        logging.info('Using key for %s: %s', domain, key)
        magic_envelope = magicsigs.magic_envelope(entry,
                                                  common.CONTENT_TYPE_ATOM,
                                                  key).decode()

        logging.info('Sending Salmon slap to %s', endpoint)
        common.requests_post(
            endpoint,
            data=common.XML_UTF8 + magic_envelope,
            headers={'Content-Type': common.CONTENT_TYPE_MAGIC_ENVELOPE})
        return True
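
The webfinger fallback above builds a .well-known/webfinger URL from the target's host and the Atom author details. A standalone sketch of just that step, with a hypothetical build_webfinger_url name:

import urllib.parse

def build_webfinger_url(target_url, author_email=None, author_name=''):
    """Builds <scheme>://<host>/.well-known/webfinger?resource=acct:<user>@<host>."""
    parsed = urllib.parse.urlparse(target_url)
    email = author_email or '@'.join((author_name, parsed.netloc))
    return '%s://%s/.well-known/webfinger?resource=acct:%s' % (
        parsed.scheme, parsed.netloc, email)

# build_webfinger_url('https://example.com/@alice/123', author_name='alice')
# => 'https://example.com/.well-known/webfinger?resource=acct:alice@example.com'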
Example #3
    def scraped_to_activities(self,
                              html,
                              cookie=None,
                              count=None,
                              fetch_extras=False):
        """Converts scraped Instagram HTML to ActivityStreams activities.

    The input HTML may be from:

    * a user's feed, eg https://www.instagram.com/ while logged in
    * a user's profile, eg https://www.instagram.com/snarfed/
    * a photo or video, eg https://www.instagram.com/p/BBWCSrfFZAk/

    Args:
      html: unicode string
      cookie: string, optional sessionid cookie to be used for subsequent HTTP
        fetches, if necessary.
      count: integer, number of activities to return, None for all
      fetch_extras: whether to make extra HTTP fetches to get likes, etc.

    Returns:
      tuple, ([ActivityStreams activities], ActivityStreams viewer actor)
    """
        cookie = cookie or self.cookie
        # extract JSON data blob
        # (can also get just this JSON by adding ?__a=1 to any IG URL.)
        matches = HTML_DATA_RE.findall(html)
        if not matches:
            # Instagram sometimes returns 200 with incomplete HTML. often it stops at
            # the end of one of the <style> tags inside <head>. not sure why.
            logging.warning('JSON script tag not found!')
            return [], None

        # find media
        medias = []
        profile_user = None
        viewer_user = None

        for match in matches:
            data = util.trim_nulls(json_loads(match[1]))
            entry_data = data.get('entry_data', {})

            # home page ie news feed
            for page in entry_data.get('FeedPage', []):
                edges = page.get('graphql', {}).get('user', {})\
                            .get('edge_web_feed_timeline', {}).get('edges', [])
                medias.extend(
                    e.get('node') for e in edges
                    if e.get('node', {}).get('__typename') not in (
                        'GraphSuggestedUserFeedUnit', ))

            if 'user' in data:
                edges = data['user'].get('edge_web_feed_timeline',
                                         {}).get('edges', [])
                medias.extend(e.get('node') for e in edges)

            # user profiles
            for page in entry_data.get('ProfilePage', []):
                profile_user = page.get('graphql', {}).get('user', {})
                medias.extend(edge['node'] for edge in profile_user.get(
                    'edge_owner_to_timeline_media', {}).get('edges', [])
                              if edge.get('node'))

            if not viewer_user:
                viewer_user = data.get('config', {}).get('viewer')

            # individual photo/video permalinks
            for page in [data] + entry_data.get('PostPage', []):
                media = page.get('graphql', {}).get('shortcode_media')
                if media:
                    medias.append(media)

        if not medias:
            # As of 2018-02-15, embedded JSON in logged in https://www.instagram.com/
            # no longer has any useful data. Need to do a second header link fetch.
            soup = util.parse_html(html)
            link = soup.find('link', href=HTML_PRELOAD_RE)
            if link:
                url = urllib.parse.urljoin(HTML_BASE_URL, link['href'])
                data = self._scrape_json(url, cookie=cookie)
                edges = data.get('data', {}).get('user', {})\
                            .get('edge_web_feed_timeline', {}).get('edges', [])
                medias = [e.get('node') for e in edges]

        if count:
            medias = medias[:count]

        activities = []
        for media in util.trim_nulls(medias):
            activity = self._json_media_node_to_activity(media,
                                                         user=profile_user)

            # likes
            shortcode = media.get('code') or media.get('shortcode')
            likes = media.get('edge_media_preview_like') or {}
            if (shortcode and fetch_extras and likes.get('count')
                    and not likes.get('edges')):
                # extra GraphQL fetch to get likes, as of 8/2018
                likes_json = self._scrape_json(HTML_LIKES_URL % shortcode,
                                               cookie=cookie)
                self.merge_scraped_reactions(likes_json, activity)

            activities.append(util.trim_nulls(activity))

        actor = None
        user = self._json_user_to_user(viewer_user or profile_user)
        if user:
            actor = self.user_to_actor(user)

        return activities, actor
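
The extraction above relies on HTML_DATA_RE from the surrounding module. A simplified, self-contained stand-in, shown only to illustrate the shape of the embedded window._sharedData blob (the regex and names here are assumptions, not the module's actual pattern):

import json
import re

# hypothetical stand-in for HTML_DATA_RE
SHARED_DATA_RE = re.compile(
    r'window\._sharedData\s*=\s*({.+?})\s*;\s*</script>', re.DOTALL)

def extract_shared_data(html):
    """Returns the embedded JSON dict, or None if the blob isn't present."""
    match = SHARED_DATA_RE.search(html)
    return json.loads(match.group(1)) if match else None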
Example #4
    def _content_for_create(self,
                            obj,
                            ignore_formatting=False,
                            prefer_name=False,
                            strip_first_video_tag=False,
                            strip_quotations=False):
        """Returns content text for :meth:`create()` and :meth:`preview_create()`.

    Returns summary if available, then content, then displayName.

    If using content, renders the HTML content to text using html2text so
    that whitespace is formatted like in the browser.

    Args:
      obj: dict, ActivityStreams object
      ignore_formatting: boolean, whether to use content text as is, instead of
        converting its HTML to plain text styling (newlines, etc.)
      prefer_name: boolean, whether to prefer displayName to content
      strip_first_video_tag: if true, removes the first <video> tag. useful when
        it will be uploaded and attached to the post natively in the silo.
      strip_quotations: if true, removes .u-quotation-of tags. useful when
        creating quote tweets.

    Returns:
      string, possibly empty
    """
        summary = obj.get('summary', '').strip()
        name = obj.get('displayName', '').strip()
        content = obj.get('content', '').strip()

        # note that str() on a BeautifulSoup object preserves HTML and
        # whitespace, even after modifying the DOM, which is important for
        # formatting.
        #
        # The catch is that it adds a '<html><head></head><body>' header and
        # '</body></html>' footer. ah well. harmless.
        soup = util.parse_html(content)
        if strip_first_video_tag:
            video = soup.video or soup.find(class_='u-video')
            if video:
                video.extract()
                content = str(soup)

        if strip_quotations:
            quotations = soup.find_all(class_='u-quotation-of')
            if quotations:
                for q in quotations:
                    q.extract()
                content = str(soup)

        # compare to content with HTML tags stripped
        if summary == soup.get_text('').strip():
            # summary and content are the same; prefer content so that we can use its
            # HTML formatting.
            summary = None

        # sniff whether content is HTML or plain text. use html.parser instead of
        # the default html5lib since html.parser is stricter and expects actual
        # HTML tags.
        # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#differences-between-parsers
        is_html = (bool(BeautifulSoup(content, 'html.parser').find())
                   or HTML_ENTITY_RE.search(content))
        if is_html and not ignore_formatting:
            content = html_to_text(content, **self.HTML2TEXT_OPTIONS)
        elif not is_html and ignore_formatting:
            content = re.sub(r'\s+', ' ', content)

        return summary or ((name or content) if prefer_name else
                           (content or name)) or ''
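
The HTML-vs-plain-text sniff above can be pulled out into a small helper for illustration. HTML_ENTITY_RE is defined elsewhere in the module, so the pattern below is only an assumed stand-in:

import re

from bs4 import BeautifulSoup

HTML_ENTITY_RE = re.compile(r'&#?\w+;')  # assumed stand-in

def looks_like_html(content):
    """True if content contains an actual HTML tag or entity."""
    # html.parser is stricter than html5lib, so plain text won't grow tags
    return bool(BeautifulSoup(content, 'html.parser').find()
                or HTML_ENTITY_RE.search(content))

# looks_like_html('one two three')      => False
# looks_like_html('one <em>two</em>')   => True
# looks_like_html('fish &amp; chips')   => True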
Example #5
def from_activities(activities,
                    actor=None,
                    title=None,
                    feed_url=None,
                    home_page_url=None,
                    hfeed=None):
    """Converts ActivityStreams activities to an RSS 2.0 feed.

  Args:
    activities: sequence of ActivityStreams activity dicts
    actor: ActivityStreams actor dict, the author of the feed
    title: string, the feed title
    feed_url: string, the URL for this RSS feed
    home_page_url: string, the home page URL
    hfeed: dict, parsed mf2 h-feed, if available

  Returns:
    unicode string with RSS 2.0 XML
  """
    try:
        iter(activities)
    except TypeError:
        raise TypeError('activities must be iterable')

    if isinstance(activities, (dict, str)):
        raise TypeError('activities may not be a dict or string')

    assert feed_url
    fg = FeedGenerator()
    fg.id(feed_url)
    fg.link(href=feed_url, rel='self')
    if home_page_url:
        fg.link(href=home_page_url, rel='alternate')
    # TODO: parse language from lang attribute:
    # https://github.com/microformats/mf2py/issues/150
    fg.language('en')
    fg.generator('granary', uri='https://granary.io/')

    hfeed = hfeed or {}
    actor = actor or {}
    image = (util.get_url(hfeed.get('properties', {}), 'photo')
             or util.get_url(actor, 'image'))
    if image:
        fg.image(image)

    props = hfeed.get('properties') or {}
    content = microformats2.get_text(util.get_first(props, 'content', ''))
    summary = util.get_first(props, 'summary', '')
    desc = content or summary or '-'
    fg.description(desc)  # required
    fg.title(title or util.ellipsize(desc))  # required

    latest = None
    feed_has_enclosure = False
    for activity in activities:
        obj = activity.get('object') or activity
        if obj.get('objectType') == 'person':
            continue

        item = fg.add_entry()
        url = obj.get('url')
        id = obj.get('id') or url
        item.id(id)
        item.link(href=url)
        item.guid(url, permalink=True)

        # title (required)
        title = (obj.get('title') or obj.get('displayName')
                 or util.ellipsize(obj.get('content', '-')))
        # strip HTML tags
        title = util.parse_html(title).get_text('').strip()
        item.title(title)

        content = microformats2.render_content(obj,
                                               include_location=True,
                                               render_attachments=True,
                                               render_image=True)
        if not content:
            content = obj.get('summary')
        if content:
            item.content(content, type='CDATA')

        categories = [
            {'term': t['displayName']} for t in obj.get('tags', [])
            if t.get('displayName')
            and t.get('verb') not in ('like', 'react', 'share')
            and t.get('objectType') not in ('article', 'person', 'mention')
        ]
        item.category(categories)

        author = obj.get('author', {})
        author = {
            'name': author.get('displayName') or author.get('username'),
            'uri': author.get('url'),
            'email': author.get('email') or '-',
        }
        item.author(author)

        published = obj.get('published') or obj.get('updated')
        if published and isinstance(published, str):
            try:
                dt = mf2util.parse_datetime(published)
                if not isinstance(dt, datetime):
                    dt = datetime.combine(dt, time.min)
                if not dt.tzinfo:
                    dt = dt.replace(tzinfo=util.UTC)
                item.published(dt)
                if not latest or dt > latest:
                    latest = dt
            except ValueError:  # bad datetime string
                pass

        item_has_enclosure = False
        for att in obj.get('attachments', []):
            stream = util.get_first(att, 'stream') or att
            if not stream:
                continue

            url = stream.get('url') or ''
            mime = mimetypes.guess_type(url)[0] or ''
            if (att.get('objectType') in ENCLOSURE_TYPES
                    or (mime and mime.split('/')[0] in ENCLOSURE_TYPES)):
                if item_has_enclosure:
                    logging.warning(
                        'Item %s already has an RSS enclosure, skipping additional enclosure %s',
                        id, url)
                    continue

                item_has_enclosure = feed_has_enclosure = True
                item.enclosure(url=url,
                               type=mime,
                               length=str(stream.get('size', '')))
                item.load_extension('podcast')
                duration = stream.get('duration')
                if duration:
                    item.podcast.itunes_duration(duration)

    if feed_has_enclosure:
        fg.load_extension('podcast')
        fg.podcast.itunes_author(
            actor.get('displayName') or actor.get('username'))
        if summary:
            fg.podcast.itunes_summary(summary)
        fg.podcast.itunes_explicit('no')
        fg.podcast.itunes_block(False)
        name = author.get('name')
        if name:
            fg.podcast.itunes_author(name)
        if image:
            fg.podcast.itunes_image(image)
        fg.podcast.itunes_category(categories)

    if latest:
        fg.lastBuildDate(latest)

    return fg.rss_str(pretty=True).decode('utf-8')
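
The published-date handling above guards against mf2util.parse_datetime returning either a plain date or a naive datetime. A standalone sketch of that normalization, substituting timezone.utc for util.UTC:

from datetime import datetime, time, timezone

import mf2util

def to_aware_datetime(published):
    """Parses an mf2 dt-published string into a timezone-aware datetime, or None."""
    try:
        dt = mf2util.parse_datetime(published)
    except ValueError:  # bad datetime string
        return None
    if dt is None:
        return None
    if not isinstance(dt, datetime):  # plain date
        dt = datetime.combine(dt, time.min)
    if not dt.tzinfo:                 # naive datetime
        dt = dt.replace(tzinfo=timezone.utc)
    return dt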
Example #6
def _prepare_activity(a, reader=True):
  """Preprocesses an activity to prepare it to be rendered as Atom.

  Modifies a in place.

  Args:
    a: ActivityStreams 1 activity dict
    reader: boolean, whether the output will be rendered in a feed reader.
      Currently just includes location if True, not otherwise.
  """
  act_type = source.object_type(a)
  obj = util.get_first(a, 'object', default={})
  primary = obj if (not act_type or act_type == 'post') else a

  # Render content as HTML; escape &s
  obj['rendered_content'] = _encode_ampersands(microformats2.render_content(
    primary, include_location=reader, render_attachments=True,
    # Readers often obey CSS white-space: pre strictly and don't even line wrap,
    # so don't use it.
    # https://forum.newsblur.com/t/android-cant-read-line-pre-formatted-lines/6116
    white_space_pre=False))

  # Make sure every activity has the title field, since Atom <entry> requires
  # the title element.
  if not a.get('title'):
    a['title'] = util.ellipsize(_encode_ampersands(
      a.get('displayName') or a.get('content') or obj.get('title') or
      obj.get('displayName') or obj.get('content') or 'Untitled'))

  # strip HTML tags. the Atom spec says title is plain text:
  # http://atomenabled.org/developers/syndication/#requiredEntryElements
  a['title'] = xml.sax.saxutils.escape(util.parse_html(a['title']).get_text(''))

  children = []
  image_urls_seen = set()
  image_atts = []

  # normalize actors
  for elem in a, obj:
    _prepare_actor(elem.get('actor'))

  # normalize attachments, render attached notes/articles
  attachments = a.get('attachments') or obj.get('attachments') or []
  for att in attachments:
    att['stream'] = util.get_first(att, 'stream')
    type = att.get('objectType')

    if type == 'image':
      att['image'] = util.get_first(att, 'image')
      image_atts.append(att['image'])
      continue

    image_urls_seen |= set(util.get_urls(att, 'image'))
    if type in ('note', 'article'):
      html = microformats2.render_content(
        att, include_location=reader, render_attachments=True,
        white_space_pre=False)
      author = att.get('author')
      if author:
        name = microformats2.maybe_linked_name(
          microformats2.object_to_json(author).get('properties') or {})
        html = '%s: %s' % (name.strip(), html)
      children.append(html)

  # render image(s) that we haven't already seen
  for image in image_atts + util.get_list(obj, 'image'):
    if not image:
      continue
    url = image.get('url')
    parsed = urllib.parse.urlparse(url)
    rest = urllib.parse.urlunparse(('', '') + parsed[2:])
    img_src_re = re.compile(r"""src *= *['"] *((https?:)?//%s)?%s *['"]""" %
                            (re.escape(parsed.netloc),
                             _encode_ampersands(re.escape(rest))))
    if (url and url not in image_urls_seen and
        not img_src_re.search(obj['rendered_content'])):
      children.append(microformats2.img(url))
      image_urls_seen.add(url)

  obj['rendered_children'] = [_encode_ampersands(child) for child in children]

  # make sure published and updated are strict RFC 3339 timestamps
  for prop in 'published', 'updated':
    val = obj.get(prop)
    if val:
      obj[prop] = util.maybe_iso8601_to_rfc3339(val)
      # Atom timestamps are even stricter than RFC 3339: they can't be naive ie
      # time zone unaware. They must have either an offset or the Z suffix.
      # https://www.feedvalidator.org/docs/error/InvalidRFC3339Date.html
      if not util.TIMEZONE_OFFSET_RE.search(obj[prop]):
        obj[prop] += 'Z'
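
The strict Atom timestamp rule at the end above (an offset or the Z suffix is required) can be sketched on its own. TIMEZONE_OFFSET_RE below is an assumed stand-in for util.TIMEZONE_OFFSET_RE:

import re

TIMEZONE_OFFSET_RE = re.compile(r'[+-]\d{2}:?\d{2}$|Z$')  # assumed stand-in

def ensure_atom_timestamp(val):
  """Appends Z to an RFC 3339 timestamp that has neither an offset nor Z."""
  return val if TIMEZONE_OFFSET_RE.search(val) else val + 'Z'

# ensure_atom_timestamp('2020-01-02T03:04:05')        => '2020-01-02T03:04:05Z'
# ensure_atom_timestamp('2020-01-02T03:04:05+01:00')  => unchanged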
Example #7
    def template_vars(self, domain, url=None):
        """Generates WebFinger data for a domain.

        Args:
          domain: string
          url: string, optional URL to search for a representative h-card

        Returns:
          dict, WebFinger data
        """
        assert domain

        if domain.split('.')[-1] in NON_TLDS:
            self.error("%s doesn't look like a domain" % domain, status=404)

        # find representative h-card. try url, then url's home page, then domain
        urls = ['http://%s/' % domain]
        if url:
            urls = [url, urllib.parse.urljoin(url, '/')] + urls

        for candidate in urls:
            resp = common.requests_get(candidate)
            parsed = util.parse_html(resp)
            mf2 = util.parse_mf2(parsed, url=resp.url)
            # logging.debug('Parsed mf2 for %s: %s', resp.url, json_dumps(mf2, indent=2))
            hcard = mf2util.representative_hcard(mf2, resp.url)
            if hcard:
                logging.info('Representative h-card: %s',
                             json_dumps(hcard, indent=2))
                break
        else:
            self.error("""\
Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on %s"""
                       % resp.url)

        logging.info('Generating WebFinger data for %s', domain)
        key = models.MagicKey.get_or_create(domain)
        props = hcard.get('properties', {})
        urls = util.dedupe_urls(props.get('url', []) + [resp.url])
        canonical_url = urls[0]

        acct = '%s@%s' % (domain, domain)
        for url in urls:
            if url.startswith('acct:'):
                urluser, urldomain = util.parse_acct_uri(url)
                if urldomain == domain:
                    acct = '%s@%s' % (urluser, domain)
                    logging.info('Found custom username: acct:%s', acct)
                    break

        # discover atom feed, if any
        atom = parsed.find('link',
                           rel='alternate',
                           type=common.CONTENT_TYPE_ATOM)
        if atom and atom.get('href'):
            atom = urllib.parse.urljoin(resp.url, atom['href'])
        else:
            atom = 'https://granary.io/url?' + urllib.parse.urlencode(
                {
                    'input': 'html',
                    'output': 'atom',
                    'url': resp.url,
                    'hub': resp.url,
                })

        # discover PuSH hub, if any; otherwise fall back to the default
        hub = 'https://bridgy-fed.superfeedr.com/'
        for link in resp.headers.get('Link', '').split(','):
            match = common.LINK_HEADER_RE.match(link)
            if match and match.group(2) == 'hub':
                hub = match.group(1)
                break

        # generate webfinger content
        data = util.trim_nulls({
            'subject': 'acct:' + acct,
            'aliases': urls,
            'magic_keys': [{'value': key.href()}],
            'links': sum(([{
                'rel': 'http://webfinger.net/rel/profile-page',
                'type': 'text/html',
                'href': url,
            }] for url in urls if url.startswith('http')), []) + [{
                'rel': 'http://webfinger.net/rel/avatar',
                'href': url,
            } for url in props.get('photo', [])] + [
                {
                    'rel': 'canonical_uri',
                    'type': 'text/html',
                    'href': canonical_url,
                },

                # ActivityPub
                {
                    'rel': 'self',
                    'type': common.CONTENT_TYPE_AS2,
                    # WARNING: in python 2 sometimes request.host_url lost port,
                    # http://localhost:8080 would become just http://localhost. no
                    # clue how or why. pay attention here if that happens again.
                    'href': '%s/%s' % (self.request.host_url, domain),
                },
                {
                    'rel': 'inbox',
                    'type': common.CONTENT_TYPE_AS2,
                    'href': '%s/%s/inbox' % (self.request.host_url, domain),
                },

                # OStatus
                {
                    'rel': 'http://schemas.google.com/g/2010#updates-from',
                    'type': common.CONTENT_TYPE_ATOM,
                    'href': atom,
                },
                {
                    'rel': 'hub',
                    'href': hub,
                },
                {
                    'rel': 'magic-public-key',
                    'href': key.href(),
                },
                {
                    'rel': 'salmon',
                    'href': '%s/%s/salmon' % (self.request.host_url, domain),
                }
            ]
        })
        logging.info('Returning WebFinger data: %s', json_dumps(data,
                                                                indent=2))
        return data
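
The custom-username discovery in the loop above can be shown as a standalone sketch; parse_acct_uri here is a simplified stand-in for util.parse_acct_uri:

def parse_acct_uri(uri):
    """Splits 'acct:user@example.com' into ('user', 'example.com')."""
    user, _, acct_domain = uri[len('acct:'):].partition('@')
    return user, acct_domain

def find_acct(urls, domain):
    """Returns user@domain from the first matching acct: URL, else domain@domain."""
    for url in urls:
        if url.startswith('acct:'):
            user, acct_domain = parse_acct_uri(url)
            if acct_domain == domain:
                return '%s@%s' % (user, domain)
    return '%s@%s' % (domain, domain)

# find_acct(['https://example.com/', 'acct:alice@example.com'], 'example.com')
# => 'alice@example.com'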