Example #1
    def test_no_accept_header(self):
        self.assertEqual({}, util.request_headers(url='http://foo/bar'))
        self.assertEqual({},
                         util.request_headers(source=Twitter(id='not-rhiaro')))

        self.expect_requests_get('http://foo/bar', '')
        self.mox.ReplayAll()
        util.requests_get('http://foo/bar')
Example #2
  def test_no_accept_header(self):
    self.assertEquals(util.REQUEST_HEADERS,
                      util.request_headers(url='http://foo/bar'))
    self.assertEquals(util.REQUEST_HEADERS,
                      util.request_headers(source=Twitter(id='not-rhiaro')))

    self.expect_requests_get('http://foo/bar', '', headers=util.REQUEST_HEADERS)
    self.mox.ReplayAll()
    util.requests_get('http://foo/bar')
Example #3
  def test_rhiaro_accept_header(self):
    """Only send Accept header to rhiaro.co.uk right now.
    https://github.com/snarfed/bridgy/issues/713
    """
    self.assertEquals(util.REQUEST_HEADERS_CONNEG,
                      util.request_headers(url='http://rhiaro.co.uk/'))
    self.assertEquals(util.REQUEST_HEADERS_CONNEG,
                      util.request_headers(source=Twitter(id='rhiaro')))

    self.expect_requests_get('http://rhiaro.co.uk/', '',
                             headers=util.REQUEST_HEADERS_CONNEG)
    self.mox.ReplayAll()
    util.requests_get('http://rhiaro.co.uk/')
Example #4
  def test_rhiaro_accept_header(self):
    """Only send Accept header to rhiaro.co.uk right now.
    https://github.com/snarfed/bridgy/issues/713
    """
    self.assertEqual(util.REQUEST_HEADERS_CONNEG,
                      util.request_headers(url='http://rhiaro.co.uk/'))
    self.assertEqual(util.REQUEST_HEADERS_CONNEG,
                      util.request_headers(source=Twitter(id='rhiaro')))

    self.expect_requests_get('http://rhiaro.co.uk/', '',
                             headers=util.REQUEST_HEADERS_CONNEG)
    self.mox.ReplayAll()
    util.requests_get('http://rhiaro.co.uk/')
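Examples #1-#4 describe a header-selection helper: content-negotiation Accept headers go only to rhiaro.co.uk (or a source with id 'rhiaro'), and the default headers go everywhere else (the older test in Example #1 expected empty headers instead). A minimal self-contained sketch of that behavior, not Bridgy's actual util.request_headers; the header values and the source_id parameter are illustrative assumptions:

REQUEST_HEADERS = {'User-Agent': 'Bridgy (https://brid.gy/about)'}  # assumed value
REQUEST_HEADERS_CONNEG = dict(REQUEST_HEADERS,
                              Accept='text/html, application/mf2+json')  # assumed value

def request_headers_sketch(url=None, source_id=None):
    """Picks request headers for a target URL or source id (sketch only)."""
    # Only rhiaro.co.uk gets the conneg Accept header; see bridgy issue #713.
    if (url and 'rhiaro.co.uk' in url) or source_id == 'rhiaro':
        return REQUEST_HEADERS_CONNEG
    return REQUEST_HEADERS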
Example #5
 def test_blocklist_localhost_when_deployed(self):
   self.mox.StubOutWithMock(util, 'LOCAL')
   util.LOCAL = False
   for bad in 'http://localhost:8080/', 'http://127.0.0.1/':
     resp = util.requests_get(bad)
     self.assertEqual(util.HTTP_REQUEST_REFUSED_STATUS_CODE, resp.status_code)
     self.assertEqual('Sorry, Bridgy has blocklisted this URL.', resp.text)
Example #6
    def test_requests_get_content_length_not_int(self):
        self.expect_requests_get("http://foo/bar", "xyz", response_headers={"Content-Length": "foo"})
        self.mox.ReplayAll()

        resp = util.requests_get("http://foo/bar")
        self.assertEquals(200, resp.status_code)
        self.assertEquals("xyz", resp.content)
Example #7
def query_live_status(uid=None):
    if uid is None:
        return
    uid = str(uid)
    query_url = 'https://api.bilibili.com/x/space/acc/info?mid={}&my_ts={}'.format(
        uid, int(time.time()))
    response = util.requests_get(query_url, 'query live status')
    if util.check_response_is_ok(response):
        result = json.loads(str(response.content, 'utf-8'))
        if result['code'] != 0:
            logger.error(
                '[query live status] response data code error: {code}'.format(code=result['code']))
        else:
            name = result['data']['name']
            live_status = result['data']['live_room']['liveStatus']

            if LIVING_STATUS_DICT.get(uid, None) is None:
                LIVING_STATUS_DICT[uid] = live_status
                logger.info('[query live status] [{uname}] initialized'.format(uname=name))
                return

            if LIVING_STATUS_DICT.get(uid, None) != live_status:
                LIVING_STATUS_DICT[uid] = live_status

                room_id = result['data']['live_room']['roomid']
                room_title = result['data']['live_room']['title']
                room_cover_url = result['data']['live_room']['cover']

                if live_status == 1:
                    logger.info('[query live status] [{name}] went live, preparing push: {room_title}'.format(
                        name=name, room_title=room_title))
                    push.push_for_bili_live(name, room_id, room_title,
                                            room_cover_url)
Example #8
  def resolve_profile_url(url, resolve=True):
    """Resolves a profile URL to be added to a source.

    Args:
      url: string
      resolve: boolean, whether to make HTTP requests to follow redirects, etc.

    Returns: string, resolved URL, or None
    """
    final, _, ok = util.get_webmention_target(url, resolve=resolve)
    if not ok:
      return None

    final = final.lower()
    if util.schemeless(final).startswith(util.schemeless(url.lower())):
      # redirected to a deeper path. use the original higher level URL. #652
      final = url

    # If final has a path segment check if root has a matching rel=me.
    match = re.match(r'^(https?://[^/]+)/.+', final)
    if match and resolve:
      root = match.group(1)
      try:
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      except requests.RequestException:
        logging.warning("Couldn't fetch %s, preserving path in %s",
                        root, final, exc_info=True)

    return final
Example #9
  def fetch_mf2(self, url):
    """Fetches a URL and extracts its mf2 data.

    Side effects: sets self.entity.html on success, calls self.error() on
    errors.

    Args:
      url: string

    Returns:
      (requests.Response, mf2 data dict) on success, None on failure
    """
    try:
      fetched = util.requests_get(url)
      fetched.raise_for_status()
    except BaseException as e:
      util.interpret_http_exception(e)  # log exception
      return self.error('Could not fetch source URL %s' % url)

    if self.entity:
      self.entity.html = fetched.text

    # .text is decoded unicode string, .content is raw bytes. if the HTTP
    # headers didn't specify a charset, pass raw bytes to BeautifulSoup so it
    # can look for a <meta> tag with a charset and decode.
    text = (fetched.text if 'charset' in fetched.headers.get('content-type', '')
            else fetched.content)
    doc = BeautifulSoup(text)

    # special case tumblr's markup: div#content > div.post > div.copy
    # convert to mf2.
    contents = doc.find_all(id='content')
    if contents:
      post = contents[0].find_next(class_='post')
      if post:
        post['class'] = 'h-entry'
        copy = post.find_next(class_='copy')
        if copy:
          copy['class'] = 'e-content'
        photo = post.find_next(class_='photo-wrapper')
        if photo:
          img = photo.find_next('img')
          if img:
            img['class'] = 'u-photo'
        doc = unicode(post)

    # parse microformats, convert to ActivityStreams
    data = parser.Parser(doc=doc, url=fetched.url).to_dict()
    logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))
    items = data.get('items', [])
    if not items or not items[0]:
      return self.error('No microformats2 data found in ' + fetched.url,
                        data=data, html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

    return fetched, data
Example #10
  def resolve_profile_url(url, resolve=True):
    """Resolves a profile URL to be added to a source.

    Args:
      url: string
      resolve: boolean, whether to make HTTP requests to follow redirects, etc.

    Returns: string, resolved URL, or None
    """
    final, _, ok = util.get_webmention_target(url, resolve=resolve)
    if not ok:
      return None

    final = final.lower()
    if util.schemeless(final).startswith(util.schemeless(url.lower())):
      # redirected to a deeper path. use the original higher level URL. #652
      final = url

    # If final has a path segment check if root has a matching rel=me.
    match = re.match(r'^(https?://[^/]+)/.+', final)
    if match and resolve:
      root = match.group(1)
      try:
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      except requests.RequestException:
        logging.warning("Couldn't fetch %s, preserving path in %s",
                        root, final, exc_info=True)

    return final
Example #11
    def create_comment(self, post_url, author_name, author_url, content):
        """Creates a new comment in the source silo.

    Must be implemented by subclasses.

    Args:
      post_url: string
      author_name: string
      author_url: string
      content: string

    Returns:
      JSON response dict with 'id' and other fields
    """
        if not self.disqus_shortname:
            resp = util.requests_get(post_url)
            resp.raise_for_status()
            self.discover_disqus_shortname(resp.text)
            if not self.disqus_shortname:
                raise exc.HTTPBadRequest(
                    "Your Bridgy account isn't fully set up yet: " "we haven't found your Disqus account."
                )

        # strip slug, query and fragment from post url
        parsed = urlparse.urlparse(post_url)
        path = parsed.path.split("/")
        try:
            tumblr_post_id = int(path[-1])
        except ValueError:
            path.pop(-1)
        post_url = urlparse.urlunparse(parsed[:2] + ("/".join(path), "", "", ""))

        # get the disqus thread id. details on thread queries:
        # http://stackoverflow.com/questions/4549282/disqus-api-adding-comment
        # https://disqus.com/api/docs/threads/details/
        resp = self.disqus_call(
            util.requests_get,
            DISQUS_API_THREAD_DETAILS_URL,
            {
                "forum": self.disqus_shortname,
                # ident:[tumblr_post_id] should work, but doesn't :/
                "thread": "link:%s" % post_url,
            },
        )
        thread_id = resp["id"]

        # create the comment
        message = u'<a href="%s">%s</a>: %s' % (author_url, author_name, content)
        resp = self.disqus_call(
            util.requests_post,
            DISQUS_API_CREATE_POST_URL,
            {
                "thread": thread_id,
                "message": message.encode("utf-8"),
                # only allowed when authed as moderator/owner
                # 'state': 'approved',
            },
        )
        return resp
Example #12
    def create_comment(self, post_url, author_name, author_url, content):
        """Creates a new comment in the source silo.

    Must be implemented by subclasses.

    Args:
      post_url: string
      author_name: string
      author_url: string
      content: string

    Returns:
      JSON response dict with 'id' and other fields
    """
        if not self.disqus_shortname:
            resp = util.requests_get(post_url)
            resp.raise_for_status()
            self.discover_disqus_shortname(resp.text)
            if not self.disqus_shortname:
                raise exc.HTTPBadRequest(
                    "Your Bridgy account isn't fully set up yet: "
                    "we haven't found your Disqus account.")

        # strip slug, query and fragment from post url
        parsed = urlparse.urlparse(post_url)
        path = parsed.path.split('/')
        try:
            tumblr_post_id = int(path[-1])
        except ValueError:
            path.pop(-1)
        post_url = urlparse.urlunparse(parsed[:2] +
                                       ('/'.join(path), '', '', ''))

        # get the disqus thread id. details on thread queries:
        # http://stackoverflow.com/questions/4549282/disqus-api-adding-comment
        # https://disqus.com/api/docs/threads/details/
        resp = self.disqus_call(
            util.requests_get,
            DISQUS_API_THREAD_DETAILS_URL,
            {
                'forum': self.disqus_shortname,
                # ident:[tumblr_post_id] should work, but doesn't :/
                'thread': 'link:%s' % post_url,
            })
        thread_id = resp['id']

        # create the comment
        message = u'<a href="%s">%s</a>: %s' % (author_url, author_name,
                                                content)
        resp = self.disqus_call(
            util.requests_post,
            DISQUS_API_CREATE_POST_URL,
            {
                'thread': thread_id,
                'message': message.encode('utf-8'),
                # only allowed when authed as moderator/owner
                # 'state': 'approved',
            })
        return resp
Example #13
  def test_requests_get_content_length_not_int(self):
    self.expect_requests_get('http://foo/bar', 'xyz',
                             response_headers={'Content-Length': 'foo'})
    self.mox.ReplayAll()

    resp = util.requests_get('http://foo/bar')
    self.assertEquals(200, resp.status_code)
    self.assertEquals('xyz', resp.content)
Example #14
  def test_requests_get_content_length_not_int(self):
    self.expect_requests_get('http://foo/bar', 'xyz',
                             response_headers={'Content-Length': 'foo'})
    self.mox.ReplayAll()

    resp = util.requests_get('http://foo/bar')
    self.assertEqual(200, resp.status_code)
    self.assertEqual('xyz', resp.text)
Example #15
  def test_requests_get_too_big(self):
    self.expect_requests_get(
      'http://foo/bar', '',
      response_headers={'Content-Length': str(util.MAX_HTTP_RESPONSE_SIZE + 1)})
    self.mox.ReplayAll()

    resp = util.requests_get('http://foo/bar')
    self.assertEquals(util.HTTP_REQUEST_REFUSED_STATUS_CODE, resp.status_code)
    self.assertIn(' larger than our limit ', resp.content)
Example #16
  def test_requests_get_too_big(self):
    self.expect_requests_get(
      'http://foo/bar', '',
      response_headers={'Content-Length': str(util.MAX_HTTP_RESPONSE_SIZE + 1)})
    self.mox.ReplayAll()

    resp = util.requests_get('http://foo/bar')
    self.assertEqual(util.HTTP_RESPONSE_TOO_BIG_STATUS_CODE, resp.status_code)
    self.assertIn(' larger than our limit ', resp.text)
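Examples #5, #6, and #13-#16 exercise guard rails in the requests_get wrapper: blocklisted URLs are refused with a canned response, a non-integer Content-Length header is tolerated, and a response that advertises a size over the limit is refused with its own status code. A rough, self-contained sketch of those checks follows; the constants, status codes, and _fake_response helper are assumptions for illustration, not Bridgy's implementation:

import requests

MAX_HTTP_RESPONSE_SIZE = 2 * 1024 * 1024    # assumed cap, for illustration
HTTP_REQUEST_REFUSED_STATUS_CODE = 599      # assumed sentinel codes, not the
HTTP_RESPONSE_TOO_BIG_STATUS_CODE = 599     # project's real values

def _fake_response(status_code, text):
    """Builds an in-memory requests.Response without touching the network."""
    resp = requests.Response()
    resp.status_code = status_code
    resp._content = text.encode('utf-8')
    return resp

def requests_get_sketch(url, blocklist=frozenset(), **kwargs):
    """GETs url, refusing blocklisted URLs and oversized responses."""
    if url in blocklist:
        return _fake_response(HTTP_REQUEST_REFUSED_STATUS_CODE,
                              'Sorry, this URL is blocklisted.')
    resp = requests.get(url, **kwargs)
    try:
        length = int(resp.headers.get('Content-Length', 0))
    except ValueError:
        length = 0  # non-integer Content-Length is ignored, as in Example #14
    if length > MAX_HTTP_RESPONSE_SIZE:
        return _fake_response(HTTP_RESPONSE_TOO_BIG_STATUS_CODE,
                              'Response is larger than our limit of %d bytes.'
                              % MAX_HTTP_RESPONSE_SIZE)
    return resp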
Example #17
 def _get_wechat_access_token(self):
     access_token = None
     url = 'https://qyapi.weixin.qq.com/cgi-bin/gettoken?corpid={corpid}&corpsecret={corpsecret}'.format(
         corpid=self.wechat_corp_id, corpsecret=self.wechat_corp_secret)
     response = util.requests_get(url, 'push_wechat_get_access_token')
     if util.check_response_is_ok(response):
         result = json.loads(str(response.content, 'utf-8'))
         access_token = result['access_token']
     return access_token
Example #18
def query_live_status(room_id=None):
    if room_id is None:
        return
    query_url = 'https://webcast.amemv.com/webcast/reflow/{}?my_ts={}'.format(
        room_id, int(time.time()))
    headers = get_headers_for_live()
    response = util.requests_get(query_url,
                                 'query live status',
                                 headers=headers,
                                 use_proxy=True)
    if util.check_response_is_ok(response):
        html_text = response.text
        soup = BeautifulSoup(html_text, "html.parser")
        result = None
        scripts = soup.findAll('script')
        for script in scripts:
            script_string = script.string
            if script_string is None:
                continue
            if 'window.__INIT_PROPS__ = ' in script_string:
                result_str = script.string.replace('window.__INIT_PROPS__ = ',
                                                   '')
                try:
                    result = json.loads(result_str).get(
                        '/webcast/reflow/:id', None)
                except TypeError:
                    logger.error('[query live status] JSON parse error, room_id: {}'.format(room_id))
                    return None
                break
        if result is None:
            logger.error('[query live status] response data is empty, room_id: {}'.format(room_id))
        else:
            if result.get('room', None) is None:
                logger.error(
                    '[query live status] room in response data is empty, room_id: {}'.format(room_id))
                return
            name = result['room']['owner']['nickname']
            live_status = result['room']['status']

            if LIVING_STATUS_DICT.get(room_id, None) is None:
                LIVING_STATUS_DICT[room_id] = live_status
                logger.info('[query live status] [{uname}] initialized'.format(uname=name))
                return

            if LIVING_STATUS_DICT.get(room_id, None) != live_status:
                LIVING_STATUS_DICT[room_id] = live_status

                room_title = result['room']['title']
                room_cover_url = result['room']['cover']['url_list'][0]
                room_stream_url = result['room']['stream_url']['hls_pull_url']

                if live_status == 2:
                    logger.info('[query live status] [{name}] went live, preparing push: {room_title}'.format(
                        name=name, room_title=room_title))
                    push.push_for_douyin_live(name, room_stream_url,
                                              room_title, room_cover_url)
Example #19
    def _urls_and_domains(self, auth_entity, user_url):
        """Returns this user's valid (not webmention-blacklisted) URLs and domains.

    Converts the auth entity's user_json to an ActivityStreams actor and uses
    its 'urls' and 'url' fields. May be overridden by subclasses.

    Args:
      auth_entity: :class:`oauth_dropins.models.BaseAuth`
      user_url: string, optional URL passed in when authorizing

    Returns:
      ([string url, ...], [string domain, ...])
    """
        actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
        logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

        candidates = util.trim_nulls(
            util.uniquify([user_url] + microformats2.object_urls(actor)))

        if len(candidates) > MAX_AUTHOR_URLS:
            logging.info(
                'Too many profile links! Only resolving the first %s: %s',
                MAX_AUTHOR_URLS, candidates)

        urls = []
        for i, url in enumerate(candidates):
            final, domain, ok = util.get_webmention_target(
                url, resolve=i < MAX_AUTHOR_URLS)
            if ok:
                final = final.lower()
                if util.schemeless(final).startswith(
                        util.schemeless(url.lower())):
                    # redirected to a deeper path. use the original higher level URL. #652
                    final = url
                # If final has a path segment check if root has a matching rel=me.
                match = re.match(r'^(https?://[^/]+)/.+', final)
                if match and i < MAX_AUTHOR_URLS:
                    root = match.group(1)
                    resp = util.requests_get(root)
                    resp.raise_for_status()
                    data = util.mf2py_parse(resp.text, root)
                    me_urls = data.get('rels', {}).get('me', [])
                    if final in me_urls:
                        final = root
                urls.append(final)

        urls = util.dedupe_urls(urls)  # normalizes domains to lower case
        domains = [util.domain_from_link(url) for url in urls]
        return urls, domains
Example #20
  def _urls_and_domains(self, auth_entity, user_url):
    """Returns this user's valid (not webmention-blacklisted) URLs and domains.

    Converts the auth entity's user_json to an ActivityStreams actor and uses
    its 'urls' and 'url' fields. May be overridden by subclasses.

    Args:
      auth_entity: :class:`oauth_dropins.models.BaseAuth`
      user_url: string, optional URL passed in when authorizing

    Returns:
      ([string url, ...], [string domain, ...])
    """
    actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
    logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

    candidates = util.trim_nulls(util.uniquify(
        [user_url] + microformats2.object_urls(actor)))

    if len(candidates) > MAX_AUTHOR_URLS:
      logging.info('Too many profile links! Only resolving the first %s: %s',
                   MAX_AUTHOR_URLS, candidates)

    urls = []
    for i, url in enumerate(candidates):
      final, domain, ok = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
      if ok:
        final = final.lower()
        if util.schemeless(final).startswith(util.schemeless(url.lower())):
          # redirected to a deeper path. use the original higher level URL. #652
          final = url
        # If final has a path segment check if root has a matching rel=me.
        match = re.match(r'^(https?://[^/]+)/.+', final)
        if match and i < MAX_AUTHOR_URLS:
          root = match.group(1)
          resp = util.requests_get(root)
          resp.raise_for_status()
          data = util.mf2py_parse(resp.text, root)
          me_urls = data.get('rels', {}).get('me', [])
          if final in me_urls:
            final = root
        urls.append(final)

    urls = util.dedupe_urls(urls)  # normalizes domains to lower case
    domains = [util.domain_from_link(url) for url in urls]
    return urls, domains
Example #21
    def expand_target_urls(self, activity):
        """Expand the inReplyTo or object fields of an ActivityStreams object
    by fetching the original and looking for rel=syndication URLs.

    This method modifies the dict in place.

    Args:
      activity: an ActivityStreams dict of the activity being published
    """
        for field in ('inReplyTo', 'object'):
            # microformats2.json_to_object de-dupes, no need to do it here
            objs = activity.get(field)
            if not objs:
                continue

            if isinstance(objs, dict):
                objs = [objs]

            augmented = list(objs)
            for obj in objs:
                url = obj.get('url')
                if not url:
                    continue

                # get_webmention_target weeds out silos and non-HTML targets
                # that we wouldn't want to download and parse
                url, _, ok = util.get_webmention_target(url)
                if not ok:
                    continue

                # fetch_mf2 raises a fuss if it can't fetch a mf2 document;
                # easier to just grab this ourselves than add a bunch of
                # special-cases to that method
                logging.debug('expand_target_urls fetching field=%s, url=%s',
                              field, url)
                try:
                    resp = util.requests_get(url)
                    resp.raise_for_status()
                    data = util.mf2py_parse(resp.text, url)
                except AssertionError:
                    raise  # for unit tests
                except BaseException:
                    # it's not a big deal if we can't fetch an in-reply-to url
                    logging.warning(
                        'expand_target_urls could not fetch field=%s, url=%s',
                        field,
                        url,
                        exc_info=True)
                    continue

                synd_urls = data.get('rels', {}).get('syndication', [])

                # look for syndication urls in the first h-entry
                queue = collections.deque(data.get('items', []))
                while queue:
                    item = queue.popleft()
                    item_types = set(item.get('type', []))
                    if 'h-feed' in item_types and 'h-entry' not in item_types:
                        queue.extend(item.get('children', []))
                        continue

                    # these can be urls or h-cites
                    synd_urls += microformats2.get_string_urls(
                        item.get('properties', {}).get('syndication', []))

                logging.debug(
                    'expand_target_urls found rel=syndication for url=%s: %r',
                    url, synd_urls)
                augmented += [{'url': u} for u in synd_urls]

            activity[field] = augmented
Example #22
  def expand_target_urls(self, activity):
    """Expand the inReplyTo or object fields of an ActivityStreams object
    by fetching the original and looking for rel=syndication URLs.

    This method modifies the dict in place.

    Args:
      activity: an ActivityStreams dict of the activity being published
    """
    for field in ('inReplyTo', 'object'):
      # microformats2.json_to_object de-dupes, no need to do it here
      objs = activity.get(field)
      if not objs:
        continue

      if isinstance(objs, dict):
        objs = [objs]

      augmented = list(objs)
      for obj in objs:
        url = obj.get('url')
        if not url:
          continue

        # get_webmention_target weeds out silos and non-HTML targets
        # that we wouldn't want to download and parse
        url, _, ok = util.get_webmention_target(url)
        if not ok:
          continue

        # fetch_mf2 raises a fuss if it can't fetch a mf2 document;
        # easier to just grab this ourselves than add a bunch of
        # special-cases to that method
        logging.debug('expand_target_urls fetching field=%s, url=%s', field, url)
        try:
          resp = util.requests_get(url)
          resp.raise_for_status()
          data = mf2py.Parser(url=url, doc=resp.text).to_dict()
        except AssertionError:
          raise  # for unit tests
        except BaseException:
          # it's not a big deal if we can't fetch an in-reply-to url
          logging.warning('expand_target_urls could not fetch field=%s, url=%s',
                          field, url, exc_info=True)
          continue

        synd_urls = data.get('rels', {}).get('syndication', [])

        # look for syndication urls in the first h-entry
        queue = collections.deque(data.get('items', []))
        while queue:
          item = queue.popleft()
          item_types = set(item.get('type', []))
          if 'h-feed' in item_types and 'h-entry' not in item_types:
            queue.extend(item.get('children', []))
            continue

          # these can be urls or h-cites
          synd_urls += microformats2.get_string_urls(
            item.get('properties', {}).get('syndication', []))

        logging.debug('expand_target_urls found rel=syndication for url=%s: %r', url, synd_urls)
        augmented += [{'url': u} for u in synd_urls]

      activity[field] = augmented
Example #23
 def test_requests_get_url_blacklist(self):
   resp = util.requests_get(next(iter(util.URL_BLACKLIST)))
   self.assertEquals(util.HTTP_REQUEST_REFUSED_STATUS_CODE, resp.status_code)
   self.assertEquals('Sorry, Bridgy has blacklisted this URL.', resp.content)
Example #24
    def fetch_mf2(self, url, id=None, require_mf2=True, raise_errors=False):
        """Fetches a URL and extracts its mf2 data.

    Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
    on errors.

    Args:
      url: string
      id: string, optional id of specific element to extract and parse. defaults
        to the whole page.
      require_mf2: boolean, whether to return error if no mf2 are found
      raise_errors: boolean, whether to let error exceptions propagate up or
        handle them

    Returns:
      (:class:`requests.Response`, mf2 data dict) on success, None on failure
    """
        try:
            resp = util.requests_get(url)
            resp.raise_for_status()
        except werkzeug.exceptions.HTTPException:
            # raised by us, probably via self.error()
            raise
        except BaseException as e:
            if raise_errors:
                raise
            util.interpret_http_exception(e)  # log exception
            self.error(f'Could not fetch source URL {url}')

        if self.entity:
            self.entity.html = resp.text

        # parse microformats
        soup = util.parse_html(resp)
        mf2 = util.parse_mf2(soup, url=resp.url, id=id)
        if id and not mf2:
            self.error(f'Got fragment {id} but no element found with that id.')

        # special case tumblr's markup: div#content > div.post > div.copy
        # convert to mf2 and re-parse
        if not mf2.get('items'):
            contents = soup.find_all(id='content')
            if contents:
                post = contents[0].find_next(class_='post')
                if post:
                    post['class'] = 'h-entry'
                    copy = post.find_next(class_='copy')
                    if copy:
                        copy['class'] = 'e-content'
                    photo = post.find_next(class_='photo-wrapper')
                    if photo:
                        img = photo.find_next('img')
                        if img:
                            img['class'] = 'u-photo'
                    # TODO: i should be able to pass post or contents[0] to mf2py instead
                    # here, but it returns no items. mf2py bug?
                    doc = str(post)
                    mf2 = util.parse_mf2(doc, resp.url)

        logger.debug(f'Parsed microformats2: {json_dumps(mf2, indent=2)}')
        items = mf2.get('items', [])
        if require_mf2 and (not items or not items[0]):
            self.error('No microformats2 data found in ' + resp.url,
                       data=mf2,
                       html=f"""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="{resp.url}">{util.pretty_link(resp.url)}</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""")

        return resp, mf2
Example #25
 def test_requests_get_url_blacklist(self):
     resp = util.requests_get(next(iter(util.URL_BLACKLIST)))
     self.assertEquals(util.HTTP_REQUEST_REFUSED_STATUS_CODE,
                       resp.status_code)
     self.assertEquals('Sorry, Bridgy has blacklisted this URL.',
                       resp.content)
Example #26
def query_dynamic(uid=None):
    if uid is None:
        return
    uid = str(uid)
    query_url = 'http://api.vc.bilibili.com/dynamic_svr/v1/dynamic_svr/space_history' \
                '?host_uid={uid}&offset_dynamic_id=0&need_top=0&platform=web&my_ts={my_ts}'.format(uid=uid, my_ts=int(time.time()))
    headers = get_headers(uid)
    response = util.requests_get(query_url,
                                 'query dynamic status',
                                 headers=headers,
                                 use_proxy=True)
    if util.check_response_is_ok(response):
        result = json.loads(str(response.content, 'utf-8'))
        if result['code'] != 0:
            logger.error(
                '[query dynamic status] response data code error: {code}'.format(code=result['code']))
        else:
            data = result['data']
            if len(data['cards']) == 0:
                logger.info('[query dynamic status] [{uid}] dynamic list is empty'.format(uid=uid))
                return

            item = data['cards'][0]
            dynamic_id = item['desc']['dynamic_id']
            try:
                uname = item['desc']['user_profile']['info']['uname']
            except KeyError:
                logger.error('[query dynamic status] [{uid}] could not get uname'.format(uid=uid))
                return

            if DYNAMIC_DICT.get(uid, None) is None:
                DYNAMIC_DICT[uid] = deque(maxlen=LEN_OF_DEQUE)
                cards = data['cards']
                for index in range(LEN_OF_DEQUE):
                    if index < len(cards):
                        DYNAMIC_DICT[uid].appendleft(
                            cards[index]['desc']['dynamic_id'])
                logger.info('[query dynamic status] [{uname}] dynamics initialized: {queue}'.format(
                    uname=uname, queue=DYNAMIC_DICT[uid]))
                return

            if dynamic_id not in DYNAMIC_DICT[uid]:
                previous_dynamic_id = DYNAMIC_DICT[uid].pop()
                DYNAMIC_DICT[uid].append(previous_dynamic_id)
                logger.info('[query dynamic status] [{}] previous dynamic id [{}], current dynamic id [{}]'.format(
                    uname, previous_dynamic_id, dynamic_id))
                DYNAMIC_DICT[uid].append(dynamic_id)
                logger.info(DYNAMIC_DICT[uid])

                dynamic_type = item['desc']['type']
                if dynamic_type not in [2, 4, 8, 64]:
                    logger.info(
                        '[query dynamic status] [{uname}] dynamic updated, but its type is not in the push list'.format(
                            uname=uname))
                    return

                timestamp = item['desc']['timestamp']
                dynamic_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime(timestamp))
                card_str = item['card']
                card = json.loads(card_str)

                content = None
                pic_url = None
                if dynamic_type == 1:
                    # repost dynamic
                    content = card['item']['content']
                elif dynamic_type == 2:
                    # picture-and-text dynamic
                    content = card['item']['description']
                    pic_url = card['item']['pictures'][0]['img_src']
                elif dynamic_type == 4:
                    # text dynamic
                    content = card['item']['content']
                elif dynamic_type == 8:
                    # video submission dynamic
                    content = card['title']
                    pic_url = card['pic']
                elif dynamic_type == 64:
                    # article (column) dynamic
                    content = card['title']
                    pic_url = card['image_urls'][0]
                logger.info('[query dynamic status] [{uname}] dynamic updated, preparing push: {content}'.format(
                    uname=uname, content=content[:30]))
                push.push_for_bili_dynamic(uname, dynamic_id, content, pic_url,
                                           dynamic_type, dynamic_time)
Example #27
    def fetch_mf2(self, url, require_mf2=True, raise_errors=False):
        """Fetches a URL and extracts its mf2 data.

    Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
    on errors.

    Args:
      url: string
      require_mf2: boolean, whether to return error if no mf2 are found
      raise_errors: boolean, whether to let error exceptions propagate up or
        handle them

    Returns:
      (:class:`requests.Response`, mf2 data dict) on success, None on failure
    """
        try:
            fetched = util.requests_get(url)
            fetched.raise_for_status()
        except BaseException as e:
            if raise_errors:
                raise
            util.interpret_http_exception(e)  # log exception
            return self.error('Could not fetch source URL %s' % url)

        if self.entity:
            self.entity.html = fetched.text

        # .text is decoded unicode string, .content is raw bytes. if the HTTP
        # headers didn't specify a charset, pass raw bytes to BeautifulSoup so it
        # can look for a <meta> tag with a charset and decode.
        text = (fetched.text if 'charset' in fetched.headers.get(
            'content-type', '') else fetched.content)
        doc = util.beautifulsoup_parse(text)

        # parse microformats
        data = util.mf2py_parse(doc, fetched.url)

        # special case tumblr's markup: div#content > div.post > div.copy
        # convert to mf2 and re-parse
        if not data.get('items'):
            contents = doc.find_all(id='content')
            if contents:
                post = contents[0].find_next(class_='post')
                if post:
                    post['class'] = 'h-entry'
                    copy = post.find_next(class_='copy')
                    if copy:
                        copy['class'] = 'e-content'
                    photo = post.find_next(class_='photo-wrapper')
                    if photo:
                        img = photo.find_next('img')
                        if img:
                            img['class'] = 'u-photo'
                    doc = unicode(post)
                    data = util.mf2py_parse(doc, fetched.url)

        logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))
        items = data.get('items', [])
        if require_mf2 and (not items or not items[0]):
            return self.error('No microformats2 data found in ' + fetched.url,
                              data=data,
                              html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

        return fetched, data
Example #28
def query_dynamic(uid=None, sec_uid=None):
    if uid is None or sec_uid is None:
        return
    signature = sign.get_signature()
    query_url = 'http://www.iesdouyin.com/web/api/v2/aweme/post?sec_uid={}&count=21&max_cursor=0&aid=1128&_signature={}'.format(
        sec_uid, signature)
    headers = get_headers(uid, sec_uid)
    response = util.requests_get(query_url,
                                 'query dynamic status',
                                 headers=headers,
                                 use_proxy=True)
    if util.check_response_is_ok(response):
        result = json.loads(str(response.content, 'utf-8'))
        if result['status_code'] != 0:
            logger.error('[query dynamic status] response data code error: {code}'.format(
                code=result['status_code']))
        else:
            aweme_list = result['aweme_list']
            if len(aweme_list) == 0:
                logger.info(
                    '[query dynamic status] [{sec_uid}] dynamic list is empty'.format(sec_uid=sec_uid))
                return

            aweme = aweme_list[0]
            aweme_id = aweme['aweme_id']
            uid = aweme['author']['uid']
            nickname = aweme['author']['nickname']

            if DYNAMIC_DICT.get(uid, None) is None:
                DYNAMIC_DICT[uid] = deque(maxlen=LEN_OF_DEQUE)
                for index in range(LEN_OF_DEQUE):
                    if index < len(aweme_list):
                        DYNAMIC_DICT[uid].appendleft(
                            aweme_list[index]['aweme_id'])
                logger.info('[query dynamic status] [{nickname}] dynamics initialized: {queue}'.format(
                    nickname=nickname, queue=DYNAMIC_DICT[uid]))
                return

            if aweme_id not in DYNAMIC_DICT[uid]:
                previous_aweme_id = DYNAMIC_DICT[uid].pop()
                DYNAMIC_DICT[uid].append(previous_aweme_id)
                logger.info('[query dynamic status] [{}] previous dynamic id [{}], current dynamic id [{}]'.format(
                    nickname, previous_aweme_id, aweme_id))
                DYNAMIC_DICT[uid].append(aweme_id)
                logger.info(DYNAMIC_DICT[uid])

                aweme_type = aweme['aweme_type']
                if aweme_type not in [4]:
                    logger.info(
                        '[query dynamic status] [{nickname}] dynamic updated, but its type is not in the push list'.format(
                            nickname=nickname))
                    return

                content = None
                pic_url = None
                video_url = None
                if aweme_type == 4:
                    content = aweme['desc']
                    pic_url = aweme['video']['origin_cover']['url_list'][0]
                    video_url_list = aweme['video']['play_addr']['url_list']
                    for temp in video_url_list:
                        if 'ixigua.com' in temp or 'api.amemv.com' in temp:
                            continue
                        if 'aweme.snssdk.com' in temp or 'douyinvod.com' in temp:
                            video_url = temp
                            break
                logger.info('[query dynamic status] [{nickname}] dynamic updated, preparing push: {content}'.format(
                    nickname=nickname, content=content[:30]))
                push.push_for_douyin_dynamic(nickname, aweme_id, content,
                                             pic_url, video_url)
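The two dynamic-polling functions above (Examples #26 and #28) share the same dedupe pattern: remember the most recent item ids per user in a bounded deque, and only push when an id has not been seen before. A small self-contained sketch of just that pattern; the names and deque length are illustrative, not taken from either project:

from collections import deque

LEN_OF_DEQUE = 10   # assumed history size
SEEN_IDS = {}       # uid -> deque of recently seen item ids

def is_new_item(uid, item_id):
    """Records item_id for uid and reports whether it was unseen before."""
    seen = SEEN_IDS.setdefault(uid, deque(maxlen=LEN_OF_DEQUE))
    if item_id in seen:
        return False
    seen.append(item_id)
    return True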
Example #29
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of models.Source
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new models.SyndicatedPost
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  # TODO skip sites we know don't have microformats2 markup
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = BeautifulSoup(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.warning('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed')
                        + author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if not feed_type:
      # type is not specified, use this to confirm that it's text/html
      feed_url, _, feed_type_ok = util.get_webmention_target(feed_url)
    else:
      feed_type_ok = feed_type == 'text/html'

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_type_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.warning('Could not fetch h-feed url %s.', feed_url,
                      exc_info=True)

  permalink_to_entry = {}
  for child in feeditems:
    if 'h-entry' in child['type']:
      # TODO maybe limit to first ~30 entries? (do that here rather than,
      # below because we want the *first* n entries)
      for permalink in child['properties'].get('url', []):
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = _process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    now = util.now_fn()
    logging.debug('updating source last_syndication_url %s', now)
    source.updates['last_syndication_url'] = now

  return results
Example #30
def process_entry(source,
                  permalink,
                  feed_entry,
                  refetch,
                  preexisting,
                  store_blanks=True):
    """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: list of previously discovered :class:`models.SyndicatedPost`\ s
      for this permalink
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logging.debug(
                'previously found relationship(s) for original %s: %s',
                permalink, synds)

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    if usynd:
        logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
    results = _process_syndication_urls(
        source, permalink,
        set(url for url in usynd if isinstance(url, basestring)), preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url or not feed_entry:
        # fetch the full permalink page if we think it might have more details
        parsed = None
        try:
            logging.debug('fetching post permalink %s', permalink)
            if type_ok:
                resp = util.requests_get(permalink)
                resp.raise_for_status()
                parsed = util.mf2py_parse(resp.text, permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logging.info('Could not fetch permalink %s',
                         permalink,
                         exc_info=True)
            success = False

        if parsed:
            syndication_urls = set()
            relsynd = parsed.get('rels').get('syndication', [])
            if relsynd:
                logging.debug('rel-syndication links: %s', relsynd)
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, basestring))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in parsed['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logging.debug('u-syndication links: %s', usynd)
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, basestring))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = itertools.chain(*results.values())
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logging.info('deleting relationship that disappeared: %s',
                             syndpost)
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logging.debug('no syndication links from %s to current source %s.',
                      permalink, source.label())
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logging.debug(
                'saving empty relationship so that %s will not be '
                'searched again', permalink)
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.iteritems():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logging.debug('discovered relationships %s', new_results)
    return new_results
Example #31
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    try:
        logging.debug('fetching author url %s', author_url)
        author_resp = util.requests_get(author_url)
        # TODO for error codes that indicate a temporary error, should we make
        # a certain number of retries before giving up forever?
        author_resp.raise_for_status()
        author_dom = util.beautifulsoup_parse(author_resp.text)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logging.info('Could not fetch author url %s',
                     author_url,
                     exc_info=True)
        return {}

    feeditems = _find_feed_items(author_url, author_dom)

    # look for all other feed urls using rel='feed', type='text/html'
    feed_urls = set()
    for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                          author_dom.find_all('a', rel='feed')):
        feed_url = rel_feed_node.get('href')
        if not feed_url:
            continue

        feed_url = urlparse.urljoin(author_url, feed_url)
        feed_type = rel_feed_node.get('type')
        if feed_type and feed_type != 'text/html':
            feed_ok = False
        else:
            # double check that it's text/html, not too big, etc
            feed_url, _, feed_ok = util.get_webmention_target(feed_url)

        if feed_url == author_url:
            logging.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logging.debug('skipping feed of type %s', feed_type)
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logging.debug("fetching author's rel-feed %s", feed_url)
            feed_resp = util.requests_get(feed_url)
            feed_resp.raise_for_status()
            logging.debug("author's rel-feed fetched successfully %s",
                          feed_url)
            feeditems = _merge_hfeeds(
                feeditems, _find_feed_items(feed_url, feed_resp.text))

            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logging.info(
                        'rel-feed found new domain %s! adding to source',
                        domain)
                    domains.append(domain)

        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logging.info('Could not fetch h-feed url %s.',
                         feed_url,
                         exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        return props.get('updated') or props.get('published')

    feeditems.sort(key=updated_or_published, reverse=True)

    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logging.debug('ignoring h-entry with no u-url!')
            for permalink in permalinks:
                if isinstance(permalink, basestring):
                    permalink_to_entry[permalink] = child
                else:
                    logging.warn('unexpected non-string "url" property: %s',
                                 permalink)

        max = (MAX_PERMALINK_FETCHES_BETA
               if source.is_beta_user() else MAX_PERMALINK_FETCHES)
        if len(permalink_to_entry) >= max:
            logging.info('Hit cap of %d permalinks. Stopping.', max)
            break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(SyndicatedPost.original.IN(
            permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
                             ancestor=source.key)
        for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.iteritems():
        logging.debug('processing permalink: %s', permalink)
        new_results = process_entry(source,
                                    permalink,
                                    entry,
                                    refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.iteritems():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results
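The batched datastore lookup above follows a general chunking idiom: slice the permalink list into groups of MAX_ALLOWABLE_QUERIES (currently 30, per the comment) and lazily chain each group's results. A minimal standalone sketch of that idiom follows; fetch_batch is a hypothetical stand-in for the SyndicatedPost ancestor query, not a function from the original codebase.

# Sketch only: chunk a list of keys and lazily chain the per-chunk results.
# fetch_batch is hypothetical; above it is SyndicatedPost.query(
#     SyndicatedPost.original.IN(chunk), ancestor=source.key).
import itertools

MAX_ALLOWABLE_QUERIES = 30  # per the comment above

def chunked_results(keys, fetch_batch):
    return itertools.chain.from_iterable(
        fetch_batch(keys[i:i + MAX_ALLOWABLE_QUERIES])
        for i in xrange(0, len(keys), MAX_ALLOWABLE_QUERIES))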
Example #32
0
def _process_entry(source, permalink, feed_entry, refetch, preexisting,
                   store_blanks=True):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the
  DB if successful.

  Args:
    source: a subclass of models.Source
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if not refetch:
      return {}
    synds = [s.syndication for s in preexisting if s.syndication]
    if synds:
      logging.debug('previously found relationship(s) for original %s: %s',
                    permalink, synds)

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  permalink, _, type_ok = util.get_webmention_target(permalink)
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  if usynd:
    logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(source, permalink, set(
    url for url in usynd if isinstance(url, basestring)), preexisting)
  success = True

  # fetch the full permalink page, which often has more detailed information
  if not results:
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      if type_ok:
        resp = util.requests_get(permalink)
        resp.raise_for_status()
        parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
    except AssertionError:
      raise  # for unit tests
    except BaseException:
      # TODO limit the number of allowed failures
      logging.warning('Could not fetch permalink %s', permalink, exc_info=True)
      success = False

    if parsed:
      syndication_urls = set()
      relsynd = parsed.get('rels').get('syndication', [])
      if relsynd:
        logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in parsed['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        if usynd:
          logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(
        source, permalink, syndication_urls, preexisting)

  # detect and delete SyndicatedPosts that were removed from the site
  if success:
    # materialize the chain so the membership checks below can run repeatedly
    result_syndposts = list(itertools.chain(*results.values()))
    for syndpost in list(preexisting):
      if syndpost.syndication and syndpost not in result_syndposts:
        logging.info('deleting relationship that disappeared: %s', syndpost)
        syndpost.key.delete()
        preexisting.remove(syndpost)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    results = {}
    if store_blanks and not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  # only return results that are not in the preexisting list
  new_results = {}
  for syndurl, syndposts_for_url in results.iteritems():
    for syndpost in syndposts_for_url:
      if syndpost not in preexisting:
        new_results.setdefault(syndurl, []).append(syndpost)

  if new_results:
    logging.debug('discovered relationships %s', new_results)
  return new_results
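A usage sketch for _process_entry as defined above. Everything here except the function itself is a made-up placeholder: source is assumed to be an existing models.Source subclass instance (not constructed here), and the permalink, h-entry dict, and syndication URL are illustrative values only.

# Illustrative call only; source, the permalink, and the h-entry dict below
# are hypothetical placeholders, not values from the original codebase.
feed_entry = {
  'type': ['h-entry'],
  'properties': {
    'url': ['http://author.example/post/1'],
    'syndication': ['https://twitter.com/author/status/123'],
  },
}
new = _process_entry(source, 'http://author.example/post/1', feed_entry,
                     refetch=False, preexisting=[], store_blanks=True)
# new maps each newly discovered syndication url to the SyndicatedPosts created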
Example #33
0
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = util.beautifulsoup_parse(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.info('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed')
                        + author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if feed_type and feed_type != 'text/html':
      feed_ok = False
    else:
      # double check that it's text/html, not too big, etc
      feed_url, _, feed_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published')

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logging.debug('ignoring h-entry with no u-url!')
      for permalink in permalinks:
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warning('unexpected non-string "url" property: %s', permalink)

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logging.info('Hit cap of %d permalinks. Stopping.', max)
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
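A usage sketch for _process_author as defined above, with the same caveats: source is assumed to be a saved models.Source subclass instance, and the author URL is a placeholder. Per the docstring, refetch=True re-examines permalinks that already have SyndicatedPost records.

# Illustrative only: discover syndication relationships for one author page.
# source is assumed to exist already; the URL is a placeholder.
results = _process_author(source, 'http://author.example/', refetch=True,
                          store_blanks=False)
for syndication_url, syndposts in results.iteritems():
  logging.info('%s -> %d new SyndicatedPost(s)', syndication_url, len(syndposts))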