Example #1
def get_hashid_and_urlid(url):
    """
        Retrieves hash ID ('Hashes' table) and URL ID
        ('ImageURLs' table) for an image at a given URL.
        Populates 'Hashes' and 'ImageURLs' if needed.
        The third element of the returned tuple is True if the image
        had to be downloaded.
    """
    existing = db.select('id, hashid', 'ImageURLs',
                         'url LIKE "%s"' % clean_url(url))
    if existing:
        urlid = existing[0][0]
        hashid = existing[0][1]
        return hashid, urlid, False

    # Download image
    if url.startswith('//'):
        url = 'http:%s' % url
    logger.debug('Downloading %s ...' % url)
    try:
        image_buffer = web.download(url)
    except Exception as e:
        logger.debug('Failed')
        raise Exception('Unable to download image at %s: %s' % (url, e))

    # Get image hash
    try:
        logger.debug('Hashing ...')
        image = image_from_buffer(image_buffer)
        (width, height) = image.size
        image_hash = str(avhash(image))
    except Exception:
        logger.debug('Failed')
        raise
    logger.debug('Indexing ... ')

    # Insert image hash into Hashes table
    hashid = db.insert('Hashes', (None, image_hash))
    if hashid == -1:
        # Already exists, need to lookup existing hash
        hashids = db.select('id', 'Hashes', 'hash = "%s"' % (image_hash, ))
        if not hashids:
            raise Exception('unable to insert hash or find existing hash')
        hashid = hashids[0][0]

    # Image attributes
    filesize = len(image_buffer)
    url = clean_url(url)
    urlid = db.insert('ImageURLs',
                      (None, url, hashid, width, height, filesize))
    create_thumb(image, urlid)
    logger.debug('Done')
    return hashid, urlid, True
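
The avhash() helper these snippets call isn't shown anywhere in this listing. For context, here is a minimal sketch of a conventional 8x8 average perceptual hash, assuming a PIL/Pillow Image as input; treat it as an illustration of the technique, not the project's actual implementation:

def avhash(image, hash_size=8):
    """Hypothetical average hash: one bit per pixel of a tiny grayscale thumbnail."""
    # Downscale and convert to grayscale so the hash reflects coarse
    # luminance structure rather than resolution or color.
    small = image.resize((hash_size, hash_size)).convert('L')
    pixels = list(small.getdata())
    average = sum(pixels) / len(pixels)
    # Set one bit per pixel: 1 if the pixel is brighter than the mean.
    bits = 0
    for pixel in pixels:
        bits = (bits << 1) | (1 if pixel > average else 0)
    return bits

Since str(avhash(image)) is stored and compared with exact equality, only images whose average hashes are bit-identical deduplicate; near-duplicates differing by a few bits would need a Hamming-distance comparison instead.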
Example #2
def get_hashid(url, timeout=10):
    """ 
        Retrieves hash ID ('Hashes' table) for image.
        Returns -1 if the image's hash was not found in the table.
        Does not modify DB! (read only)
    """
    cleaned_url = clean_url(url)
    existing = db.select('hashid', 'ImageURLs', 'url LIKE "%s"' % cleaned_url)
    if existing:
        return existing[0][0], False

    # Download image
    image_buffer = web.download(url, timeout=timeout)
    if not image_buffer:
        raise Exception('unable to download image at %s' % url)

    # Get image hash
    try:
        image = image_from_buffer(image_buffer)
        image_hash = str(avhash(image))
    except Exception:
        # Failed to hash the image
        raise Exception("Could not identify image")

    hashids = db.select('id', 'Hashes', 'hash = "%s"' % image_hash)
    if not hashids:
        return -1, True
    return hashids[0][0], True
Example #3
def search_cache(url):
    """
        Prints list of images inside of an album
        The images are stored in the database, so 404'd albums
        can be retrieved via this method (sometimes)
    """
    try:
        url = clean_url(url)
    except Exception as e:
        return Response(json.dumps({"error": str(e)}),
                        mimetype="application/json")
    images = []
    query_text = ('id IN (SELECT urlid FROM Images WHERE albumid IN '
                  '(SELECT DISTINCT id FROM albums WHERE url LIKE "%s"))'
                  % (url,))
    image_tuples = db.select('id, url', 'ImageURLs', query_text)
    for (urlid, imageurl) in image_tuples:
        image = {
            'thumb': path.join(thumb_path(urlid), '%d.jpg' % urlid),
            'url': imageurl
        }
        images.append(image)

    return Response(json.dumps({
        'url': 'cache:%s' % url,
        'images': images
    }),
                    mimetype="application/json")
Example #4
def insert_url(conn, url_string):
    '''
    Inserts a url into the database, unless it already exists.
    Always returns the corresponding url_id in the database,
        regardless of whether it already existed or was just inserted.
    '''
    url_string = util.clean_url(url_string)

    cursor = conn.cursor()
    # borrowed much of this query from a stackoverflow post
    # http://stackoverflow.com/questions/18192570/insert-if-not-exists-else-return-id-in-postgresql
    q = '''
        WITH s AS (
            SELECT id
            FROM urls
            WHERE url=%s
        ), i AS (
            INSERT INTO urls (url)
            SELECT %s WHERE NOT EXISTS
                (SELECT 1 FROM s)
            RETURNING id
        )
        SELECT id FROM i
        UNION ALL
        SELECT id FROM s;
        '''
    cursor.execute(q, (url_string, url_string))
    url_id = cursor.fetchone()[0]
    conn.commit()
    return url_id
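
Because the CTE both looks up and conditionally inserts in a single statement, repeated calls with the same (cleaned) URL are idempotent. A hedged usage sketch, assuming a standard psycopg2 connection and the urls table from the query above:

import psycopg2

conn = psycopg2.connect('dbname=example')  # connection details are an assumption
first_id = insert_url(conn, 'http://example.com/a?utm_source=feed')
# The second call hits the SELECT arm of the CTE and returns the
# existing id instead of inserting a duplicate row.
assert first_id == insert_url(conn, 'http://example.com/a?utm_source=feed')

Note that this select-or-insert pattern can still race under concurrent writers (two sessions may both observe that no row exists); a UNIQUE constraint on urls.url plus INSERT ... ON CONFLICT is the stricter alternative on PostgreSQL 9.5+.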
Example #5
def search_album(url):
    """
        Prints list of images inside of an album
        The images are stored in the database, so 404'd albums
        can be retrieved via this method (sometimes)
    """
    try:
        url = clean_url(url)
    except Exception as e:
        return Response(json.dumps({"error": str(e)}), mimetype="application/json")
    images = []
    image_tuples = db.get_images_from_album_url(album_url=url)

    for (urlid, imageurl, width, height) in image_tuples:
        image = {
            "thumb": path.join(thumb_path(urlid), '%d.jpg' % urlid),
            "url": imageurl,
            "width": width,
            "height": height,
        }
        images.append(image)

    return Response(json.dumps({
        'url': 'cache:%s' % url,
        'images': images
    }), mimetype="application/json")
Example #6
def parse_url(url, postid=0, commentid=0):
    """ Gets image hash(es) from URL, populates database """

    if is_direct_link(url):
        parse_image(url, postid, commentid)
        return True

    if not should_parse_link(url):
        return

    image_urls = get_image_urls(url)
    url = clean_url(url)

    # We assume that any url that yields more than 1 image is an album
    albumid = 0
    if len(image_urls) > 1:
        albumid = get_or_create_album(url)

    if len(image_urls) > 10:
        logger.debug("Using multithreading to download large album")
        pool = ThreadPool(processes=10)
        pool.starmap(func=parse_image,
                     iterable=zip(image_urls, repeat(postid),
                                  repeat(commentid), repeat(albumid)))
        pool.close()
    else:
        for image_url in image_urls:
            parse_image(image_url, postid, commentid, albumid)
    return True
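
For readers unfamiliar with the starmap idiom above, zip() with itertools.repeat() pairs each image URL with the same constant arguments; a minimal illustration:

from itertools import repeat

# Each URL is paired with the same postid/commentid/albumid. zip() stops
# at the shortest iterable, so the infinite repeat() streams are harmless.
print(list(zip(['a.jpg', 'b.jpg'], repeat(1), repeat(2), repeat(0))))
# -> [('a.jpg', 1, 2, 0), ('b.jpg', 1, 2, 0)]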
Example #7
def handle_feed(feed, source):
    """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
  for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
    logging.info('Source: %s %s', source.label(), source.key.string_id())
    logging.info('Raw feed: %s', feed)

    if source.status != 'enabled':
        logging.info('Dropping because source is %s', source.status)
        return
    elif 'webmention' not in source.features:
        logging.info("Dropping because source doesn't have webmention feature")
        return

    for item in json.loads(feed).get('items', []):
        url = item.get('permalinkUrl') or item.get('id')
        if not url:
            logging.error('Dropping feed item without permalinkUrl or id!')
            continue

        # extract links from content, discarding self links.
        #
        # i don't use get_webmention_target[s]() here because they follow redirects
        # and fetch link contents, and this handler should be small and fast and try
        # to return a response to superfeedr successfully.
        #
        # TODO: extract_links currently has a bug that makes it drop trailing
        # slashes. ugh. fix that.
        content = item.get('content') or item.get('summary', '')
        links = [
            util.clean_url(util.unwrap_t_umblr_com(l))
            for l in util.extract_links(content)
            if util.domain_from_link(l) not in source.domains
        ]

        logging.info('Found links: %s', links)
        if len(url) > _MAX_KEYPART_BYTES:
            logging.warning(
                'Blog post URL is too long (over 500 chars)! Giving up.')
            bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES],
                                 source=source.key,
                                 feed_item=item,
                                 failed=links)
        else:
            bp = models.BlogPost(id=url,
                                 source=source.key,
                                 feed_item=item,
                                 unsent=links)

        bp.get_or_save()
Example #8
    def get_video_from_url(self, url):
        with self.get_conn() as conn:
            res = conn.query(
                "SELECT v.id from videourls "
                "INNER JOIN videos v on v.id = videourls.videoid "
                "WHERE clean_url = %s", (clean_url(url), ))

        return None if not res else res[0][0]
Example #9
    def get_image_from_url(self, url):
        with self.get_conn() as conn:
            res = conn.query(
                "SELECT i.id from imageurls "
                "INNER JOIN images i on i.id = imageurls.imageid "
                "WHERE clean_url = %s", (clean_url(url), ))

        return None if not res else res[0][0]
Example #10
def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
  for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  if source.status != 'enabled':
    logging.info('Dropping because source is %s', source.status)
    return
  elif 'webmention' not in source.features:
    logging.info("Dropping because source doesn't have webmention feature")
    return

  for item in json.loads(feed).get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow redirects
    # and fetch link contents, and this handler should be small and fast and try
    # to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [util.clean_url(util.unwrap_t_umblr_com(l))
             for l in util.extract_links(content)
             if util.domain_from_link(l) not in source.domains]

    unique = []
    for link in util.dedupe_urls(links):
      if len(link) <= _MAX_STRING_LENGTH:
        unique.append(link)
      else:
        logging.info('Giving up on link over %s chars! %s', _MAX_STRING_LENGTH, link)

    logging.info('Found links: %s', unique)
    if len(url) > _MAX_KEYPART_BYTES:
      logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
      bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                           feed_item=item, failed=unique)
    else:
      bp = models.BlogPost(id=url, source=source.key, feed_item=item, unsent=unique)

    bp.get_or_save()
Example #11
  def test_clean_url(self):
    for unchanged in '', 'http://foo', 'http://foo#bar', 'http://foo?x=y&z=w':
      self.assertEquals(unchanged, util.clean_url(unchanged))

    for bad in None, 'http://foo]', 3.14, ['http://foo']:
      self.assertIsNone(util.clean_url(bad))

    self.assertEquals(
      'http://foo',
      util.clean_url('http://foo?utm_source=x&utm_campaign=y'
                     '&source=rss----12b80d28f892---4'))
    self.assertEquals(
      'http://foo?a=b&c=d',
      util.clean_url('http://foo?a=b&utm_source=x&c=d'
                     '&source=rss----12b80d28f892---4'))
    self.assertEquals(
      'http://foo?source=not-rss',
      util.clean_url('http://foo?&source=not-rss'))
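
These tests pin down clean_url()'s contract: utm_* tracking parameters and Medium-style source=rss----... values are stripped, all other query parameters and fragments are preserved, and non-URL input yields None. A minimal sketch that satisfies the cases above, offered as an assumption about the real helper rather than a copy of it:

import urllib.parse

def clean_url(url):
    """Hypothetical reimplementation driven by the test cases above."""
    if not isinstance(url, str):
        return None  # covers None, numbers, lists, ...
    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        return None  # e.g. 'http://foo]' raises "Invalid IPv6 URL"
    params = [
        (name, value)
        for name, value in urllib.parse.parse_qsl(parsed.query,
                                                  keep_blank_values=True)
        # drop utm_* tracking params and Medium-style RSS source tags
        if not (name.startswith('utm_') or
                (name == 'source' and value.startswith('rss----')))
    ]
    return urllib.parse.urlunparse(
        parsed._replace(query=urllib.parse.urlencode(params)))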
Example #12
# alter table imageurls add clean_url TEXT;
# drop index imageurls_url_index;
# create index imageurls_clean_url_index on imageurls (clean_url);

from DB import DB
from common import DBFILE
from util import clean_url

db = DB(DBFILE)
with db.get_conn() as conn:
    update_map = list()
    for row in conn.query("SELECT id, url FROM ir.public.imageurls"):
        update_map.append((row[0], clean_url(row[1])))

print("Updating %s imageurls" % len(update_map))
input("Continue?")

with db.get_conn() as conn:
    for i, update in enumerate(update_map):
        conn.exec("UPDATE imageurls SET clean_url = %s WHERE id=%s",
                  (update[1], update[0]))
        print("%08d/%08d" % (i, len(update_map)))
Example #13
    def post(self, source_short_name):
        logging.info('Params: %s', self.request.params.items())
        # strip fragments from source and target url
        self.source_url = urlparse.urldefrag(
            util.get_required_param(self, 'source'))[0]
        self.target_url = urlparse.urldefrag(
            util.get_required_param(self, 'target'))[0]

        # follow target url through any redirects, strip utm_* query params
        resp = util.follow_redirects(self.target_url)
        redirected_target_urls = [r.url for r in resp.history]
        self.target_url = util.clean_url(resp.url)

        # parse and validate target URL
        domain = util.domain_from_link(self.target_url)
        if not domain:
            return self.error('Could not parse target URL %s' %
                              self.target_url)

        # look up source by domain
        source_cls = models.sources[source_short_name]
        domain = domain.lower()
        self.source = (source_cls.query().filter(
            source_cls.domains == domain).filter(
                source_cls.features == 'webmention').filter(
                    source_cls.status == 'enabled').get())
        if not self.source:
            return self.error(
                'Could not find %s account for %s. Is it registered with Bridgy?'
                % (source_cls.GR_CLASS.NAME, domain))

        if urlparse.urlparse(self.target_url).path in ('', '/'):
            return self.error(
                'Home page webmentions are not currently supported.')

        # create BlogWebmention entity
        id = u'%s %s' % (self.source_url, self.target_url)
        self.entity = BlogWebmention.get_or_insert(
            id,
            source=self.source.key,
            redirected_target_urls=redirected_target_urls)
        if self.entity.status == 'complete':
            # TODO: response message saying update isn't supported
            self.response.write(self.entity.published)
            return
        logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

        # fetch source page
        resp = self.fetch_mf2(self.source_url)
        if not resp:
            return
        self.fetched, data = resp

        item = self.find_mention_item(data)
        if not item:
            return self.error(
                'Could not find target URL %s in source page %s' %
                (self.target_url, self.fetched.url),
                data=data,
                log_exception=False)

        # default author to target domain
        author_name = domain
        author_url = 'http://%s/' % domain

        # extract author name and URL from h-card, if any
        props = item['properties']
        author = first_value(props, 'author')
        if author:
            if isinstance(author, basestring):
                author_name = author
            else:
                author_props = author.get('properties', {})
                author_name = first_value(author_props, 'name')
                author_url = first_value(author_props, 'url')

        # if present, u-url overrides source url
        u_url = first_value(props, 'url')
        if u_url:
            self.entity.u_url = u_url

        # generate content
        # find_mention_item() guaranteed this is here
        content = props['content'][0]
        text = (content.get('html') or content.get('value')).strip()
        source_url = self.entity.source_url()
        text += ' <br /> <a href="%s">via %s</a>' % (
            source_url, util.domain_from_link(source_url))

        # write comment
        try:
            self.entity.published = self.source.create_comment(
                self.target_url, author_name, author_url, text)
        except Exception as e:
            code, body = util.interpret_http_exception(e)
            msg = 'Error: %s %s; %s' % (code, e, body)
            if code == '401':
                logging.warning('Disabling source due to: %s' % e,
                                exc_info=True)
                self.source.status = 'disabled'
                self.source.put()
                return self.error(msg,
                                  status=code,
                                  mail=self.source.is_beta_user())
            elif code == '404':
                # post is gone
                return self.error(msg, status=code, mail=False)
            elif util.is_connection_failure(e) or (code
                                                   and int(code) // 100 == 5):
                return self.error(msg,
                                  status=util.ERROR_HTTP_RETURN_CODE,
                                  mail=False)
            elif code or body:
                return self.error(msg, status=code, mail=True)
            else:
                raise

        # write results to datastore
        self.entity.status = 'complete'
        self.entity.put()
        self.response.write(json.dumps(self.entity.published))
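
first_value() isn't defined in these excerpts. In microformats2 JSON every property value is a list, so a plausible guess at the helper (an assumption, not the project's code) is simply:

def first_value(props, name):
    # mf2 JSON stores each property as a list; return its first entry, if any.
    values = props.get(name)
    return values[0] if values else None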
Example #14
    def post(self, source_short_name):
        logging.info('Params: %s', list(self.request.params.items()))
        # strip fragments from source and target url
        self.source_url = urllib.parse.urldefrag(
            util.get_required_param(self, 'source'))[0]
        self.target_url = urllib.parse.urldefrag(
            util.get_required_param(self, 'target'))[0]

        # follow target url through any redirects, strip utm_* query params
        resp = util.follow_redirects(self.target_url)
        redirected_target_urls = [r.url for r in resp.history]
        self.target_url = util.clean_url(resp.url)

        # parse and validate target URL
        domain = util.domain_from_link(self.target_url)
        if not domain:
            return self.error('Could not parse target URL %s' %
                              self.target_url)

        # look up source by domain
        source_cls = models.sources[source_short_name]
        domain = domain.lower()
        self.source = (source_cls.query().filter(
            source_cls.domains == domain).filter(
                source_cls.features == 'webmention').filter(
                    source_cls.status == 'enabled').get())
        if not self.source:
            # check for a rel-canonical link. Blogger uses these when it serves a post
            # from multiple domains, e.g. country TLDs like epeus.blogspot.co.uk vs
            # epeus.blogspot.com.
            # https://github.com/snarfed/bridgy/issues/805
            mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
            if not mf2:
                # fetch_mf2() already wrote the error response
                return
            domains = util.dedupe_urls(
                util.domain_from_link(url)
                for url in mf2[1]['rels'].get('canonical', []))
            if domains:
                self.source = (source_cls.query().filter(
                    source_cls.domains.IN(domains)).filter(
                        source_cls.features == 'webmention').filter(
                            source_cls.status == 'enabled').get())

        if not self.source:
            return self.error(
                'Could not find %s account for %s. Is it registered with Bridgy?'
                % (source_cls.GR_CLASS.NAME, domain))

        # check that the target URL path is supported
        target_path = urllib.parse.urlparse(self.target_url).path
        if target_path in ('', '/'):
            return self.error(
                'Home page webmentions are not currently supported.',
                status=202)
        for pattern in self.source.PATH_BLOCKLIST:
            if pattern.match(target_path):
                return self.error(
                    '%s webmentions are not supported for URL path: %s' %
                    (self.source.GR_CLASS.NAME, target_path),
                    status=202)

        # create BlogWebmention entity
        id = '%s %s' % (self.source_url, self.target_url)
        self.entity = BlogWebmention.get_or_insert(
            id,
            source=self.source.key,
            redirected_target_urls=redirected_target_urls)
        if self.entity.status == 'complete':
            # TODO: response message saying update isn't supported
            self.response.write(self.entity.published)
            return
        logging.debug("BlogWebmention entity: '%s'",
                      self.entity.key.urlsafe().decode())

        # fetch source page
        fetched = self.fetch_mf2(self.source_url)
        if not fetched:
            return
        resp, mf2 = fetched

        item = self.find_mention_item(mf2.get('items', []))
        if not item:
            return self.error(
                'Could not find target URL %s in source page %s' %
                (self.target_url, resp.url),
                data=mf2,
                log_exception=False)

        # default author to target domain
        author_name = domain
        author_url = 'http://%s/' % domain

        # extract author name and URL from h-card, if any
        props = item['properties']
        author = first_value(props, 'author')
        if author:
            if isinstance(author, str):
                author_name = author
            else:
                author_props = author.get('properties', {})
                author_name = first_value(author_props, 'name')
                author_url = first_value(author_props, 'url')

        # if present, u-url overrides source url
        u_url = first_value(props, 'url')
        if u_url:
            self.entity.u_url = u_url

        # generate content
        # find_mention_item() guaranteed this is here
        content = props['content'][0]
        text = (content.get('html') or content.get('value')).strip()
        source_url = self.entity.source_url()
        text += ' <br /> <a href="%s">via %s</a>' % (
            source_url, util.domain_from_link(source_url))

        # write comment
        try:
            self.entity.published = self.source.create_comment(
                self.target_url, author_name, author_url, text)
        except Exception as e:
            code, body = util.interpret_http_exception(e)
            msg = 'Error: %s %s; %s' % (code, e, body)
            if code == '401':
                logging.warning('Disabling source due to: %s' % e,
                                stack_info=True)
                self.source.status = 'disabled'
                self.source.put()
                return self.error(msg,
                                  status=code,
                                  report=self.source.is_beta_user())
            elif code == '404':
                # post is gone
                return self.error(msg, status=code, report=False)
            elif util.is_connection_failure(e) or (code
                                                   and int(code) // 100 == 5):
                return self.error(msg,
                                  status=util.ERROR_HTTP_RETURN_CODE,
                                  report=False)
            elif code or body:
                return self.error(msg, status=code, report=True)
            else:
                raise

        # write results to datastore
        self.entity.status = 'complete'
        self.entity.put()
        self.response.write(json_dumps(self.entity.published))
Example #15
  def post(self, source_short_name):
    logging.info('Params: %s', self.request.params.items())
    # strip fragments from source and target url
    self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
    self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

    # follow target url through any redirects, strip utm_* query params
    resp = util.follow_redirects(self.target_url)
    redirected_target_urls = [r.url for r in resp.history]
    self.target_url = util.clean_url(resp.url)

    # parse and validate target URL
    domain = util.domain_from_link(self.target_url)
    if not domain:
      return self.error('Could not parse target URL %s' % self.target_url)

    # look up source by domain
    source_cls = models.sources[source_short_name]
    domain = domain.lower()
    self.source = (source_cls.query()
                   .filter(source_cls.domains == domain)
                   .filter(source_cls.features == 'webmention')
                   .filter(source_cls.status == 'enabled')
                   .get())
    if not self.source:
      return self.error(
        'Could not find %s account for %s. Is it registered with Bridgy?' %
        (source_cls.GR_CLASS.NAME, domain))

    if urlparse.urlparse(self.target_url).path in ('', '/'):
      return self.error('Home page webmentions are not currently supported.')

    # create BlogWebmention entity
    id = u'%s %s' % (self.source_url, self.target_url)
    self.entity = BlogWebmention.get_or_insert(
      id, source=self.source.key, redirected_target_urls=redirected_target_urls)
    if self.entity.status == 'complete':
      # TODO: response message saying update isn't supported
      self.response.write(self.entity.published)
      return
    logging.debug('BlogWebmention entity: %s', self.entity.key.urlsafe())

    # fetch source page
    resp = self.fetch_mf2(self.source_url)
    if not resp:
      return
    self.fetched, data = resp

    item = self.find_mention_item(data)
    if not item:
      return self.error('Could not find target URL %s in source page %s' %
                        (self.target_url, self.fetched.url),
                        data=data, log_exception=False)

    # default author to target domain
    author_name = domain
    author_url = 'http://%s/' % domain

    # extract author name and URL from h-card, if any
    props = item['properties']
    author = first_value(props, 'author')
    if author:
      if isinstance(author, basestring):
        author_name = author
      else:
        author_props = author.get('properties', {})
        author_name = first_value(author_props, 'name')
        author_url = first_value(author_props, 'url')

    # if present, u-url overrides source url
    u_url = first_value(props, 'url')
    if u_url:
      self.entity.u_url = u_url

    # generate content
    content = props['content'][0]  # find_mention_item() guaranteed this is here
    text = (content.get('html') or content.get('value')).strip()
    source_url = self.entity.source_url()
    text += ' <br /> <a href="%s">via %s</a>' % (
      source_url, util.domain_from_link(source_url))

    # write comment
    try:
      self.entity.published = self.source.create_comment(
        self.target_url, author_name, author_url, text)
    except Exception, e:
      code, body = util.interpret_http_exception(e)
      msg = 'Error: %s %s; %s' % (code, e, body)
      if code == '401':
        logging.warning('Disabling source!')
        self.source.status = 'disabled'
        self.source.put()
        return self.error(msg, status=code, mail=False)
      elif code == '404':
        # post is gone
        return self.error(msg, status=code, mail=False)
      elif code or body:
        return self.error(msg, status=code, mail=True)
      else:
        raise
Example #16
    def insert_imageurl(self, url, imageid, albumid, postid, commentid):
        with self.get_conn() as conn:
            conn.exec(
                "INSERT INTO imageurls (url, clean_url, imageid, albumid, postid, commentid) "
                "VALUES (%s,%s,%s,%s,%s,%s)",
                (url, clean_url(url), imageid, albumid, postid, commentid))
Example #17
    def insert_videourl(self, url, video_id, postid, commentid):
        with self.get_conn() as conn:
            conn.exec(
                "INSERT INTO videourls (url, clean_url, videoid, postid, commentid) "
                "VALUES (%s,%s,%s,%s,%s)",
                (url, clean_url(url), video_id, postid, commentid))
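
Taken together, these inserts and the joins in get_image_from_url() / get_video_from_url() imply roughly the following shape for the two URL tables, written as commented SQL in the style of the migration script in Example #12 above; the column types and constraints are assumptions:

# create table imageurls (id serial primary key, url text, clean_url text,
#                         imageid integer references images(id),
#                         albumid integer, postid integer, commentid integer);
# create table videourls (id serial primary key, url text, clean_url text,
#                         videoid integer references videos(id),
#                         postid integer, commentid integer);
# create index imageurls_clean_url_index on imageurls (clean_url);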
Example #18
def handle_feed(feed, source):
    """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
  for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
    logger.info(f'Source: {source.label()} {source.key_id()}')
    logger.info(f'Raw feed: {feed}')

    if not feed:
        return

    if source.status != 'enabled':
        logger.info(f'Dropping because source is {source.status}')
        return
    elif 'webmention' not in source.features:
        logger.info("Dropping because source doesn't have webmention feature")
        return

    for item in feed.get('items', []):
        url = item.get('permalinkUrl') or item.get('id')
        if not url:
            logger.error('Dropping feed item without permalinkUrl or id!')
            continue

        # extract links from content, discarding self links.
        #
        # i don't use get_webmention_target[s]() here because they follow redirects
        # and fetch link contents, and this handler should be small and fast and try
        # to return a response to superfeedr successfully.
        #
        # TODO: extract_links currently has a bug that makes it drop trailing
        # slashes. ugh. fix that.
        content = item.get('content') or item.get('summary', '')
        links = [
            util.clean_url(util.unwrap_t_umblr_com(url))
            for url in util.extract_links(content)
            if util.domain_from_link(url) not in source.domains
        ]

        unique = []
        for link in util.dedupe_urls(links):
            if len(link) <= _MAX_STRING_LENGTH:
                unique.append(link)
            else:
                logger.info(
                    f'Giving up on link over {_MAX_STRING_LENGTH} chars! {link}'
                )
            if len(unique) >= MAX_BLOGPOST_LINKS:
                logger.info(f'Stopping at {MAX_BLOGPOST_LINKS} links! Skipping the rest.')
                break

        logger.info(f'Found links: {unique}')
        if len(url) > _MAX_KEYPART_BYTES:
            logger.warning(
                'Blog post URL is too long (over 500 chars)! Giving up.')
            bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES],
                                 source=source.key,
                                 feed_item=item,
                                 failed=unique)
        else:
            bp = models.BlogPost(id=url,
                                 source=source.key,
                                 feed_item=item,
                                 unsent=unique)

        bp.get_or_save()
Example #19
  def dispatch_request(self, site):
    logger.info(f'Params: {list(request.values.items())}')
    # strip fragments from source and target url
    self.source_url = urllib.parse.urldefrag(request.form['source'])[0]
    self.target_url = urllib.parse.urldefrag(request.form['target'])[0]

    # follow target url through any redirects, strip utm_* query params
    resp = util.follow_redirects(self.target_url)
    redirected_target_urls = [r.url for r in resp.history]
    self.target_url = util.clean_url(resp.url)

    # parse and validate target URL
    domain = util.domain_from_link(self.target_url)
    if not domain:
      self.error(f'Could not parse target URL {self.target_url}')

    # look up source by domain
    source_cls = models.sources[site]
    domain = domain.lower()
    self.source = (source_cls.query()
                   .filter(source_cls.domains == domain)
                   .filter(source_cls.features == 'webmention')
                   .filter(source_cls.status == 'enabled')
                   .get())
    if not self.source:
      # check for a rel-canonical link. Blogger uses these when it serves a post
      # from multiple domains, e.g. country TLDs like epeus.blogspot.co.uk vs
      # epeus.blogspot.com.
      # https://github.com/snarfed/bridgy/issues/805
      mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
      if not mf2:
        # fetch_mf2() already wrote the error response
        return
      domains = util.dedupe_urls(
        util.domain_from_link(url)
        for url in mf2[1]['rels'].get('canonical', []))
      if domains:
        self.source = (source_cls.query()
                       .filter(source_cls.domains.IN(domains))
                       .filter(source_cls.features == 'webmention')
                       .filter(source_cls.status == 'enabled')
                       .get())

    if not self.source:
      self.error(
        f'Could not find {source_cls.GR_CLASS.NAME} account for {domain}. Is it registered with Bridgy?')

    # check that the target URL path is supported
    target_path = urllib.parse.urlparse(self.target_url).path
    if target_path in ('', '/'):
      msg = 'Home page webmentions are not currently supported.'
      logger.info(msg)
      return {'error': msg}, 202
    for pattern in self.source.PATH_BLOCKLIST:
      if pattern.match(target_path):
        msg = f'{self.source.GR_CLASS.NAME} webmentions are not supported for URL path: {target_path}'
        logger.info(msg)
        return {'error': msg}, 202

    # create BlogWebmention entity
    id = f'{self.source_url} {self.target_url}'
    self.entity = BlogWebmention.get_or_insert(
      id, source=self.source.key, redirected_target_urls=redirected_target_urls)
    if self.entity.status == 'complete':
      # TODO: response message saying update isn't supported
      return self.entity.published
    logger.debug(f'BlogWebmention entity: {self.entity.key.urlsafe().decode()}')

    # fetch source page
    fetched = self.fetch_mf2(self.source_url)
    if not fetched:
      return
    resp, mf2 = fetched

    item = self.find_mention_item(mf2.get('items', []))
    if not item:
      self.error(f'Could not find target URL {self.target_url} in source page {resp.url}', data=mf2, log_exception=False)

    # default author to target domain
    author_name = domain
    author_url = f'http://{domain}/'

    # extract author name and URL from h-card, if any
    props = item['properties']
    author = get_first(props, 'author')
    if author:
      if isinstance(author, str):
        author_name = author
      else:
        author_props = author.get('properties', {})
        author_name = get_first(author_props, 'name')
        author_url = get_first(author_props, 'url')

    # if present, u-url overrides source url
    u_url = get_first(props, 'url')
    if u_url:
      self.entity.u_url = u_url

    # generate content
    content = props['content'][0]  # find_mention_item() guaranteed this is here
    text = (content.get('html') or content.get('value')).strip()
    source_url = self.entity.source_url()
    text += f' <br /> <a href="{source_url}">via {util.domain_from_link(source_url)}</a>'

    # write comment
    try:
      self.entity.published = self.source.create_comment(
        self.target_url, author_name, author_url, text)
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      msg = f'Error: {code}: {e}; {body}'
      if code == '401':
        logger.warning(f'Disabling source due to: {e}', exc_info=True)
        self.source.status = 'disabled'
        self.source.put()
        self.error(msg, status=code, report=self.source.is_beta_user())
      elif code == '404':
        # post is gone
        self.error(msg, status=code, report=False)
      elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
        self.error(msg, status=502, report=False)
      elif code or body:
        self.error(msg, status=code, report=True)
      else:
        raise

    # write results to datastore
    self.entity.status = 'complete'
    self.entity.put()

    return self.entity.published
Example #20
  def post(self, source_short_name):
    logging.info('Params: %s', self.request.params.items())
    # strip fragments from source and target url
    self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
    self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

    # follow target url through any redirects, strip utm_* query params
    resp = util.follow_redirects(self.target_url)
    redirected_target_urls = [r.url for r in resp.history]
    self.target_url = util.clean_url(resp.url)

    # parse and validate target URL
    domain = util.domain_from_link(self.target_url)
    if not domain:
      return self.error('Could not parse target URL %s' % self.target_url)

    # look up source by domain
    source_cls = models.sources[source_short_name]
    domain = domain.lower()
    self.source = (source_cls.query()
                   .filter(source_cls.domains == domain)
                   .filter(source_cls.features == 'webmention')
                   .filter(source_cls.status == 'enabled')
                   .get())
    if not self.source:
      # check for a rel-canonical link. Blogger uses these when it serves a post
      # from multiple domains, e.g. country TLDs like epeus.blogspot.co.uk vs
      # epeus.blogspot.com.
      # https://github.com/snarfed/bridgy/issues/805
      mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
      if not mf2:
        # fetch_mf2() already wrote the error response
        return
      domains = util.dedupe_urls(
        util.domain_from_link(url)
        for url in mf2[1].get('rels', {}).get('canonical', []))
      if domains:
        self.source = (source_cls.query()
                       .filter(source_cls.domains.IN(domains))
                       .filter(source_cls.features == 'webmention')
                       .filter(source_cls.status == 'enabled')
                       .get())

    if not self.source:
      return self.error(
        'Could not find %s account for %s. Is it registered with Bridgy?' %
        (source_cls.GR_CLASS.NAME, domain))

    # check that the target URL path is supported
    target_path = urlparse.urlparse(self.target_url).path
    if target_path in ('', '/'):
      return self.error('Home page webmentions are not currently supported.',
                        status=202)
    for pattern in self.source.PATH_BLACKLIST:
      if pattern.match(target_path):
        return self.error('%s webmentions are not supported for URL path: %s' %
                          (self.source.GR_CLASS.NAME, target_path), status=202)

    # create BlogWebmention entity
    id = '%s %s' % (self.source_url, self.target_url)
    self.entity = BlogWebmention.get_or_insert(
      id, source=self.source.key, redirected_target_urls=redirected_target_urls)
    if self.entity.status == 'complete':
      # TODO: response message saying update isn't supported
      self.response.write(self.entity.published)
      return
    logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

    # fetch source page
    resp = self.fetch_mf2(self.source_url)
    if not resp:
      return
    self.fetched, data = resp

    item = self.find_mention_item(data.get('items', []))
    if not item:
      return self.error('Could not find target URL %s in source page %s' %
                        (self.target_url, self.fetched.url),
                        data=data, log_exception=False)

    # default author to target domain
    author_name = domain
    author_url = 'http://%s/' % domain

    # extract author name and URL from h-card, if any
    props = item['properties']
    author = first_value(props, 'author')
    if author:
      if isinstance(author, basestring):
        author_name = author
      else:
        author_props = author.get('properties', {})
        author_name = first_value(author_props, 'name')
        author_url = first_value(author_props, 'url')

    # if present, u-url overrides source url
    u_url = first_value(props, 'url')
    if u_url:
      self.entity.u_url = u_url

    # generate content
    content = props['content'][0]  # find_mention_item() guaranteed this is here
    text = (content.get('html') or content.get('value')).strip()
    source_url = self.entity.source_url()
    text += ' <br /> <a href="%s">via %s</a>' % (
      source_url, util.domain_from_link(source_url))

    # write comment
    try:
      self.entity.published = self.source.create_comment(
        self.target_url, author_name, author_url, text)
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      msg = 'Error: %s %s; %s' % (code, e, body)
      if code == '401':
        logging.warning('Disabling source due to: %s' % e, exc_info=True)
        self.source.status = 'disabled'
        self.source.put()
        return self.error(msg, status=code, mail=self.source.is_beta_user())
      elif code == '404':
        # post is gone
        return self.error(msg, status=code, mail=False)
      elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
        return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, mail=False)
      elif code or body:
        return self.error(msg, status=code, mail=True)
      else:
        raise

    # write results to datastore
    self.entity.status = 'complete'
    self.entity.put()
    self.response.write(json.dumps(self.entity.published))