def get_entry_title(entry):
    if 'title' in entry:
        return html.strip_html(entry.title)
    return 'Untitled'
def get_entry_title(entry):
    if 'title' in entry:
        return truncate(html.strip_html(entry.title), MAX_TITLE_LENGTH)
    return 'Untitled'
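The truncate helper and MAX_TITLE_LENGTH are used above but never shown. A minimal sketch, assuming truncate simply clips over-long titles; the limit value and the ellipsis handling are guesses, not the project's actual code:

MAX_TITLE_LENGTH = 255  # hypothetical limit, the real value is not shown here

def truncate(value, max_length):
    # Clip the string to max_length characters, marking the cut with an ellipsis
    if len(value) <= max_length:
        return value
    return value[:max_length - 1] + u'\u2026'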
def fetch_feed(feed, add_entries=False):

    def post_fetch(status, error=False):
        if status:
            feed.last_status = status
        if error:
            feed.error_count = feed.error_count + 1

        error_threshold = config.getint('fetcher', 'error_threshold')
        if error_threshold and (feed.error_count > error_threshold):
            feed.is_enabled = False
            feed.last_status = ProblematicFeedError.code
            log.warn("%s has too many errors, disabled" % netloc)

        feed.save()

    log.debug("fetching %s" % feed.self_link)

    schema, netloc, path, params, query, fragment = urlparse.urlparse(feed.self_link)

    now = datetime.utcnow()
    request_headers = {'User-Agent': user_agent}

    interval = config.getint('fetcher', 'min_interval')

    # Check freshness
    for fieldname in ['last_checked_on', 'last_updated_on']:
        value = getattr(feed, fieldname)
        if not value:
            continue
        # No datetime.timedelta since we need to deal with large seconds values
        delta = datetime_as_epoch(now) - datetime_as_epoch(value)
        if delta < interval:
            log.debug("%s for %s is below min_interval, skipped" % (fieldname, netloc))
            return

    # Conditional GET headers
    if feed.etag and feed.last_updated_on:
        request_headers['If-None-Match'] = feed.etag
        request_headers['If-Modified-Since'] = format_http_datetime(feed.last_updated_on)

    timeout = config.getint('fetcher', 'timeout')

    try:
        response = requests.get(feed.self_link, timeout=timeout, headers=request_headers)
    except (IOError, RequestException):
        # Interpret as 'Service Unavailable'
        post_fetch(503, error=True)
        log.warn("a network error occurred while fetching %s, skipped" % netloc)
        return

    feed.last_checked_on = now

    if response.history and response.history[0].status_code == 301:  # Moved permanently
        self_link = response.url
        try:
            Feed.get(self_link=self_link)
        except Feed.DoesNotExist:
            feed.self_link = self_link
            log.info("%s has changed its location, updated to %s" % (netloc, self_link))
        else:
            feed.is_enabled = False
            log.warn("new %s location %s is duplicated, disabled" % (netloc, self_link))
            post_fetch(DuplicatedFeedError.code)
            return

    if response.status_code == 304:  # Not modified
        log.debug("%s hasn't been modified, skipped" % netloc)
        post_fetch(response.status_code)
        return
    elif response.status_code == 410:  # Gone
        log.warn("%s is gone, disabled" % netloc)
        feed.is_enabled = False
        post_fetch(response.status_code)
        return
    elif response.status_code not in (200, ):  # No good
        log.warn("%s replied with status %d, aborted" % (netloc, response.status_code))
        post_fetch(response.status_code, error=True)
        return

    soup = feedparser.parse(response.text)

    # Got parsing error? Log error but do not increment the error counter
    if hasattr(soup, 'bozo') and soup.bozo:
        log.info("%s caused a parser error (%s), tried to parse it anyway" % (netloc, soup.bozo_exception))
        post_fetch(response.status_code, error=False)

    feed.etag = response.headers.get('ETag', None)

    if 'link' in soup.feed:
        feed.alternate_link = soup.feed.link

    # Reset value only if not set before
    if ('title' in soup.feed) and not feed.title:
        feed.title = html.strip_html(soup.feed.title)

    feed.last_updated_on = get_feed_timestamp(soup.feed, now)

    post_fetch(response.status_code)

    if not add_entries:
        return

    for entry in soup.entries:

        link = get_entry_link(entry)
        guid = get_entry_id(entry, default=link)

        if not guid:
            log.warn('could not find guid for entry from %s, skipped' % netloc)
            continue

        title = get_entry_title(entry)
        timestamp = get_entry_timestamp(entry, default=now)
        author = get_entry_author(entry, soup.feed)

        # Skip ancient feed items
        max_history = config.getint('fetcher', 'max_history')
        if max_history and ((now - timestamp).days > max_history):
            log.debug("entry %s from %s is over max_history, skipped" % (guid, netloc))
            continue

        try:
            # If entry is already in database with same id, then skip it
            Entry.get(guid=guid)
            log.debug("duplicated entry %s, skipped" % guid)
            continue
        except Entry.DoesNotExist:
            pass

        mime_type, content = get_entry_content(entry)

        if blacklist and ('html' in mime_type):
            content = html.scrub_html(content, blacklist)

        d = {
            'guid': guid,
            'feed': feed,
            'title': title,
            'author': author,
            'content': content,
            'link': link,
            'last_updated_on': timestamp,
        }

        # Save to database
        Entry.create(**d)

        log.debug(u"added entry %s from %s" % (guid, netloc))
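The freshness check and the conditional GET above rely on two small time helpers, datetime_as_epoch and format_http_datetime, whose bodies are not shown. A sketch of what they presumably do; the names come from the code above, the implementations are assumptions:

import calendar

def datetime_as_epoch(value):
    # Turn a naive UTC datetime into integer seconds since the Unix epoch,
    # so the freshness delta can be compared against min_interval directly
    return int(calendar.timegm(value.utctimetuple()))

def format_http_datetime(value):
    # Format a UTC datetime as an RFC 1123 date for the If-Modified-Since header,
    # e.g. "Sun, 06 Nov 1994 08:49:37 GMT" (assumes an English locale)
    return value.strftime('%a, %d %b %Y %H:%M:%S GMT')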
def fetch_feed(feed, add_entries=False):

    def post_fetch(status, error=False):
        if status:
            feed.last_status = status
        if error:
            feed.error_count = feed.error_count + 1

        error_threshold = config.getint('fetcher', 'error_threshold')
        if error_threshold and (feed.error_count > error_threshold):
            feed.is_enabled = False
            feed.last_status = status  # Save status code for posterity
            log.warn("%s has too many errors, disabled" % netloc)

        feed.save()

    if hasattr(feed, 'subscriptions') and not feed.subscriptions:
        log.debug("feed %s has no subscribers, skipped" % feed.self_link)
        return

    log.debug("fetching %s" % feed.self_link)

    schema, netloc, path, params, query, fragment = urlparse.urlparse(feed.self_link)

    now = datetime.utcnow()
    request_headers = {'User-Agent': user_agent}

    interval = config.getint('fetcher', 'min_interval')

    # Check freshness
    for fieldname in ['last_checked_on', 'last_updated_on']:
        value = getattr(feed, fieldname)
        if not value:
            continue
        # No datetime.timedelta since we need to deal with large seconds values
        delta = datetime_as_epoch(now) - datetime_as_epoch(value)
        if delta < interval:
            log.debug("%s for %s is below min_interval, skipped" % (fieldname, netloc))
            return

    # Conditional GET headers
    if feed.etag and feed.last_updated_on:
        request_headers['If-None-Match'] = feed.etag
        request_headers['If-Modified-Since'] = format_http_datetime(feed.last_updated_on)

    timeout = config.getint('fetcher', 'timeout')

    try:
        response = requests.get(feed.self_link, timeout=timeout, headers=request_headers)
    except (IOError, RequestException):
        # Interpret as 'Service Unavailable'
        #@@FIXME: catch ContentDecodingError?
        post_fetch(503, error=True)
        log.warn("a network error occurred while fetching %s, skipped" % netloc)
        return

    feed.last_checked_on = now

    if response.history and response.history[0].status_code == 301:  # Moved permanently
        self_link = response.url
        try:
            Feed.get(self_link=self_link)
        except Feed.DoesNotExist:
            feed.self_link = self_link
            log.info("%s has changed its location, updated to %s" % (netloc, self_link))
        else:
            feed.is_enabled = False
            log.warn("new %s location %s is duplicated, disabled" % (netloc, self_link))
            post_fetch(DuplicatedFeedError.code)
            return

    if response.status_code == 304:  # Not modified
        log.debug("%s hasn't been modified, skipped" % netloc)
        post_fetch(response.status_code)
        return
    elif response.status_code == 410:  # Gone
        log.warn("%s is gone, disabled" % netloc)
        feed.is_enabled = False
        post_fetch(response.status_code)
        return
    elif response.status_code not in (200, ):  # No good
        log.warn("%s replied with status %d, aborted" % (netloc, response.status_code))
        post_fetch(response.status_code, error=True)
        return

    soup = feedparser.parse(response.text)

    # Got parsing error? Log error but do not increment the error counter
    if hasattr(soup, 'bozo') and soup.bozo:
        log.info("%s caused a parser error (%s), tried to parse it anyway" % (netloc, soup.bozo_exception))
        post_fetch(response.status_code, error=False)

    feed.etag = response.headers.get('ETag', None)

    if 'link' in soup.feed:
        feed.alternate_link = soup.feed.link

    # Reset value only if not set before
    if ('title' in soup.feed) and not feed.title:
        feed.title = html.strip_html(soup.feed.title)

    feed.last_updated_on = get_feed_timestamp(soup.feed, now)

    post_fetch(response.status_code)

    if not add_entries:
        return

    for entry in soup.entries:

        link = get_entry_link(entry)
        guid = get_entry_id(entry, default=link)

        if not guid:
            log.warn('could not find guid for entry from %s, skipped' % netloc)
            continue

        title = get_entry_title(entry)
        timestamp = get_entry_timestamp(entry, default=now)
        author = get_entry_author(entry, soup.feed)

        # Skip ancient feed items
        max_history = config.getint('fetcher', 'max_history')
        if max_history and ((now - timestamp).days > max_history):
            log.debug("entry %s from %s is over max_history, skipped" % (guid, netloc))
            continue

        try:
            # If entry is already in database with same id, then skip it
            Entry.get(guid=guid)
            log.debug("duplicated entry %s, skipped" % guid)
            continue
        except Entry.DoesNotExist:
            pass

        mime_type, content = get_entry_content(entry)

        if blacklist and ('html' in mime_type):
            content = html.scrub_html(content, blacklist)

        d = {
            'guid': guid,
            'feed': feed,
            'title': title,
            'author': author,
            'content': content,
            'link': link,
            'last_updated_on': timestamp,
        }

        # Save to database
        Entry.create(**d)

        log.debug(u"added entry %s from %s" % (guid, netloc))
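For context, fetch_feed() is presumably driven by a scheduler that walks every enabled feed. A sketch assuming a peewee-style Feed model; only Feed.get() and Feed.DoesNotExist appear above, so the select() query below is a guess:

def fetch_all_feeds(add_entries=True):
    # Fetch every feed that has not been disabled by the error-threshold logic
    for feed in Feed.select().where(Feed.is_enabled == True):
        fetch_feed(feed, add_entries=add_entries)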
def fetch_feed(feed, add_entries=False):

    def synthesize_entry(reason):
        title, content = u'This feed has been disabled', render_template(os.path.join(template_dir, '_entry_feed_disabled.html'), {'reason': reason})
        return add_synthesized_entry(feed, title, content)

    def post_fetch(status, error=False):
        if status:
            feed.last_status = status
        if error:
            feed.error_count = feed.error_count + 1

        error_threshold = config.getint('fetcher', 'error_threshold')
        if error_threshold and (feed.error_count > error_threshold):
            feed.is_enabled = False
            feed.last_status = status  # Save status code for posterity
            logger.warn("%s has too many errors, disabled" % netloc)
            synthesize_entry('Feed has accumulated too many errors (last was %s).' % status_title(status))

        feed.save()

    logger.debug("fetching %s" % feed.self_link)

    schema, netloc, path, params, query, fragment = urlparse.urlparse(feed.self_link)

    now = datetime.utcnow()
    interval = config.getint('fetcher', 'min_interval')

    # Check freshness
    for fieldname in ['last_checked_on', 'last_updated_on']:
        value = getattr(feed, fieldname)
        if not value:
            continue
        # No datetime.timedelta since we need to deal with large seconds values
        delta = datetime_as_epoch(now) - datetime_as_epoch(value)
        if delta < interval:
            logger.debug("%s for %s is below min_interval, skipped" % (fieldname, netloc))
            return

    response = fetch_url(feed.self_link, etag=feed.etag, modified_since=feed.last_updated_on)
    if not response:
        # Record as "503 Service unavailable"
        post_fetch(503, error=True)
        logger.warn("a network error occurred while fetching %s" % netloc)
        return

    feed.last_checked_on = now

    if response.history and response.history[0].status_code == 301:  # Moved permanently
        self_link = response.url
        try:
            Feed.get(self_link=self_link)
        except Feed.DoesNotExist:
            feed.self_link = self_link
            logger.info("%s has changed its location, updated to %s" % (netloc, self_link))
        else:
            feed.is_enabled = False
            logger.warn("new %s location %s is duplicated, disabled" % (netloc, self_link))
            synthesize_entry('Feed has a duplicated web address.')
            post_fetch(DuplicatedFeedError.code)
            return

    if response.status_code == 304:  # Not modified
        logger.debug("%s hasn't been modified, skipped" % netloc)
        post_fetch(response.status_code)
        return
    elif response.status_code == 410:  # Gone
        feed.is_enabled = False
        logger.warn("%s is gone, disabled" % netloc)
        synthesize_entry('Feed has been removed from the origin server.')
        post_fetch(response.status_code)
        return
    elif response.status_code not in POSITIVE_STATUS_CODES:  # No good
        logger.warn("%s replied with status %d, aborted" % (netloc, response.status_code))
        post_fetch(response.status_code, error=True)
        return

    soup = feedparser.parse(response.text)

    # Got parsing error? Log error but do not increment the error counter
    if hasattr(soup, 'bozo') and soup.bozo:
        logger.info("%s caused a parser error (%s), tried to parse it anyway" % (netloc, soup.bozo_exception))
        post_fetch(response.status_code)

    feed.etag = response.headers.get('ETag', None)

    if 'link' in soup.feed:
        feed.alternate_link = soup.feed.link

    # Reset value only if not set before
    if ('title' in soup.feed) and not feed.title:
        feed.title = html.strip_html(soup.feed.title)

    feed.last_updated_on = get_feed_timestamp(soup.feed, now)

    post_fetch(response.status_code)

    if not add_entries:
        return

    for parsed_entry in soup.entries:

        link = get_entry_link(parsed_entry)
        guid = get_entry_id(parsed_entry, default=link)

        if not guid:
            logger.warn('could not find guid for entry from %s, skipped' % netloc)
            continue

        title = get_entry_title(parsed_entry)
        mime_type, content = get_entry_content(parsed_entry)
        timestamp = get_entry_timestamp(parsed_entry, default=now)
        author = get_entry_author(parsed_entry, soup.feed)

        # Skip ancient feed items
        max_history = config.getint('fetcher', 'max_history')
        if max_history and ((now - timestamp).days > max_history):
            logger.debug("entry %s from %s is over max_history, skipped" % (guid, netloc))
            continue

        try:
            # If entry is already in database with same id, then skip it
            Entry.get(guid=guid)
            logger.debug("duplicated entry %s, skipped" % guid)
            continue
        except Entry.DoesNotExist:
            pass

        entry = Entry(
            guid=guid,
            feed=feed,
            title=title,
            author=author,
            content=content,  #@@TODO: add mime_type too
            link=link,
            last_updated_on=timestamp
        )

        trigger_event('entry_parsed', entry, parsed_entry)

        entry.save()

        logger.debug(u"added entry %s from %s" % (guid, netloc))
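This revision replaces the inline requests.get() call with a fetch_url() wrapper whose body is not shown. A sketch reconstructed from the inline logic of the earlier versions; the signature matches the calls above and it reuses the format_http_datetime helper sketched earlier, while the User-Agent placeholder and the exact error handling are assumptions:

import requests
from requests.exceptions import RequestException

def fetch_url(url, timeout=10, etag=None, modified_since=None):
    # Placeholder UA string; the real code uses a module-level user_agent value
    request_headers = {'User-Agent': 'FeedFetcher/1.0'}
    # Conditional GET headers, as in the earlier inline version
    if etag and modified_since:
        request_headers['If-None-Match'] = etag
        request_headers['If-Modified-Since'] = format_http_datetime(modified_since)
    try:
        return requests.get(url, timeout=timeout, headers=request_headers)
    except (IOError, RequestException):
        # Callers treat a falsy result as a network error ("503 Service unavailable")
        return None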
def get_entry_title(entry, default):
    if 'title' in entry:
        return truncate(html.strip_html(entry.title), MAX_TITLE_LENGTH)
    return default
def fetch_feed(feed, add_entries=False):

    def synthesize_entry(reason):
        title, content = u'This feed has been disabled', render_template(os.path.join(template_dir, '_entry_feed_disabled.html'), {'reason': reason})
        return add_synthesized_entry(feed, title, 'text/html', content)

    def post_fetch(status, error=False):
        if status:
            feed.last_status = status
        if error:
            feed.error_count = feed.error_count + 1

        error_threshold = config.getint('fetcher', 'error_threshold')
        if error_threshold and (feed.error_count > error_threshold):
            feed.is_enabled = False
            feed.last_status = status  # Save status code for posterity
            logger.warn(u"%s has too many errors, disabled" % netloc)
            synthesize_entry('Feed has accumulated too many errors (last was %s).' % status_title(status))

        feed.save()

    max_history = config.getint('fetcher', 'max_history')
    interval = config.getint('fetcher', 'min_interval')
    timeout = config.getint('fetcher', 'timeout')

    logger.debug(u"fetching %s" % feed.self_link)

    schema, netloc, path, params, query, fragment = urlparse.urlparse(feed.self_link)

    now = datetime.utcnow()

    # Check freshness
    for fieldname in ['last_checked_on', 'last_updated_on']:
        value = getattr(feed, fieldname)
        if not value:
            continue
        # No datetime.timedelta since we need to deal with large seconds values
        delta = datetime_as_epoch(now) - datetime_as_epoch(value)
        if delta < interval:
            logger.debug(u"%s for %s is below min_interval, skipped" % (fieldname, netloc))
            return

    response = fetch_url(feed.self_link, timeout=timeout, etag=feed.etag, modified_since=feed.last_updated_on)
    if not response:
        # Record as "503 Service unavailable"
        post_fetch(503, error=True)
        logger.warn(u"a network error occurred while fetching %s" % netloc)
        return

    feed.last_checked_on = now

    if response.history and response.history[0].status_code == 301:  # Moved permanently
        self_link = response.url
        try:
            Feed.get(self_link=self_link)
        except Feed.DoesNotExist:
            feed.self_link = self_link
            logger.info(u"%s has changed its location, updated to %s" % (netloc, self_link))
        else:
            feed.is_enabled = False
            logger.warn(u"new %s location %s is duplicated, disabled" % (netloc, self_link))
            synthesize_entry('Feed has a duplicated web address.')
            post_fetch(DuplicatedFeedError.code, error=True)
            return

    if response.status_code == 304:  # Not modified
        logger.debug(u"%s hasn't been modified, skipped" % netloc)
        post_fetch(response.status_code)
        return
    elif response.status_code == 410:  # Gone
        feed.is_enabled = False
        logger.warn(u"%s is gone, disabled" % netloc)
        synthesize_entry('Feed has been removed from the origin server.')
        post_fetch(response.status_code, error=True)
        return
    elif response.status_code not in POSITIVE_STATUS_CODES:  # No good
        logger.warn(u"%s replied with status %d, aborted" % (netloc, response.status_code))
        post_fetch(response.status_code, error=True)
        return

    soup = feedparser.parse(response.text)

    # Got parsing error? Log error but do not increment the error counter
    if hasattr(soup, 'bozo') and soup.bozo:
        logger.info(u"%s caused a parser error (%s), tried to parse it anyway" % (netloc, soup.bozo_exception))
        post_fetch(response.status_code)

    feed.etag = response.headers.get('ETag', None)

    if 'link' in soup.feed:
        feed.alternate_link = soup.feed.link

    # Reset value only if not set before
    if ('title' in soup.feed) and not feed.title:
        feed.title = html.strip_html(soup.feed.title)

    feed.last_updated_on = get_feed_timestamp(soup.feed, now)

    if not feed.icon or not feed.icon_last_updated_on or (now - feed.icon_last_updated_on).days > FETCH_ICONS_DELTA:
        # Prefer alternate_link if available since self_link could
        # point to Feed Burner or similar services
        feed.icon = favicon.fetch(feed.alternate_link or feed.self_link)
        feed.icon_last_updated_on = now
        logger.debug(u"saved favicon %s..." % (feed.icon[:70]))

    post_fetch(response.status_code)

    if not add_entries:
        return

    for parsed_entry in soup.entries:

        link = get_entry_link(parsed_entry)
        guid = get_entry_id(parsed_entry, default=link)

        if not guid:
            logger.warn(u'could not find guid for entry from %s, skipped' % netloc)
            continue

        author = get_entry_author(parsed_entry, soup.feed)
        title = get_entry_title(parsed_entry, default='Untitled')
        content_type, content = get_entry_content(parsed_entry, default=('text/plain', ''))
        timestamp = get_entry_timestamp(parsed_entry, default=now)

        # Skip ancient feed items
        if max_history and ((now - timestamp).days > max_history):
            logger.debug(u"entry %s from %s is over max_history, skipped" % (guid, netloc))
            continue

        try:
            # If entry is already in database with same id, then skip it
            Entry.get(guid=guid)
            logger.debug(u"duplicated entry %s, skipped" % guid)
            continue
        except Entry.DoesNotExist:
            pass

        entry = Entry(
            guid=guid,
            feed=feed,
            title=title,
            author=author,
            content=content,
            content_type=content_type,
            link=link,
            last_updated_on=timestamp
        )

        trigger_event('entry_parsed', entry, parsed_entry)

        entry.save()

        logger.debug(u"added entry %s from %s" % (guid, netloc))
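add_synthesized_entry(), which synthesize_entry() uses to tell the reader why a feed was disabled, is also not shown. A rough sketch of what it might do, based only on the call add_synthesized_entry(feed, title, 'text/html', content) above and the Entry fields used in this revision; the guid scheme and placeholder field values are guesses:

from datetime import datetime
from uuid import uuid4

def add_synthesized_entry(feed, title, content_type, content):
    # Attach a self-generated entry to the feed so the reason for disabling it
    # shows up in the reading list like any other item
    entry = Entry(
        guid=uuid4().urn,          # synthetic, unique id (assumed scheme)
        feed=feed,
        title=title,
        author='',                 # no real author for a synthesized entry
        content=content,
        content_type=content_type,
        link='',                   # no meaningful permalink either
        last_updated_on=datetime.utcnow()
    )
    entry.save()
    return entry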