def get_feed(cls, feed_url, return_latest_date=False, include_later_than=None, logger=None, username=None, password=None):
    """
    Retrieve the feed entries as a list of flattened dictionaries.

    Arguments:
    feed_url -- The URL of the feed to retrieve (as a string)
    return_latest_date -- If true, also return the date of the latest entry as the second item of a tuple
    include_later_than -- Only include entries whose date is strictly later than this date (a time struct)
    logger -- Optional logger for debug/warning output; may be None
    username -- The username to use when authenticating
    password -- The password to use when authenticating
    """

    auth_handler = None

    # Get an authentication handler if needed
    if username is not None and password is not None:
        auth_handler = cls.get_auth_handler(feed_url, username, password)

    # Parse the feed; use an opener with the auth handler when credentials were supplied
    if auth_handler is not None:
        opener = urllib2.build_opener(auth_handler)
        feed = opener.open(feed_url)
        d = feedparser.parse(feed)
    else:
        d = feedparser.parse(feed_url)

    entries = []
    latest_date = None

    # Stop if we didn't get a result
    if d is None or not hasattr(d, 'entries'):
        # Bug fix: guard the logger like every other call site in this method does;
        # previously a failed parse with logger=None raised AttributeError here.
        if logger is not None:
            logger.warn("No entries returned from the feed, url=\"%s\"", feed_url)
    else:
        for entry in d.entries:

            # Get the updated or published date
            entry_date = cls.get_updated_date(entry)

            # Perform the operations that are based on the date
            if entry_date is not None:

                # If this is the latest one, then save it
                if latest_date is None or entry_date > latest_date:
                    latest_date = entry_date

                # If the item is earlier than the date we are to include, then skip it
                if include_later_than is not None and entry_date <= include_later_than:
                    if logger is not None:
                        logger.debug("Skipping entry with date=%r, since it is not later than latest_date=%r, title=\"%s\"",
                                     time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                     time.strftime('%Y-%m-%dT%H:%M:%SZ', include_later_than),
                                     entry.title)
                    continue
                elif logger is not None and include_later_than is not None:
                    # Bug fix: this message previously said "since its not later than", which is
                    # the opposite of the condition — entries reach this branch because they ARE later.
                    logger.debug("Including entry with date=%r, since it is later than latest_date=%r, title=\"%s\"",
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', include_later_than),
                                 entry.title)
                elif logger is not None and include_later_than is None:
                    logger.debug("Including entry with date=%r, since it is later than latest_date=%r, title=\"%s\"",
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                 "none",
                                 entry.title)

            # Entries with no parseable date are still included
            entries.append(cls.flatten(entry))

    # Return the latest date if requested
    if return_latest_date:
        return entries, latest_date
    else:
        return entries
def get_realm_and_auth_type(cls, feed_url, username, password):
    """
    Determine the authentication realm and scheme required by the feed URL.

    Performs a request and inspects the response; returns a (realm, auth_type)
    tuple where auth_type is "Digest" or "Basic". Both items are None when no
    authentication is required or the response could not be interpreted.

    Arguments:
    feed_url -- The URL of the feed to probe
    username -- Unused here; kept for interface compatibility with callers
    password -- Unused here; kept for interface compatibility with callers
    """

    # Perform request and get the realm and whether or not the view uses HTTP digest or basic authentication
    d = feedparser.parse(feed_url)

    # Make sure we got a result
    if d is None or not hasattr(d, 'status'):
        return None, None

    # If the status isn't 401, then authentication isn't required
    if d.status != 401:
        return None, None

    # Make sure that an auth header exists
    if 'www-authenticate' not in d.headers:
        return None, None

    auth_header = d.headers['www-authenticate']

    # Get the realm and whether it is using basic or digest authentication
    http_auth_re = re.compile("((Digest)|(Basic))( realm=[\"]?([^\"]*)[\"]?)?")
    match = http_auth_re.search(auth_header)

    # Bug fix: a malformed or unrecognized WWW-Authenticate header makes search()
    # return None; previously match.groups() then raised AttributeError.
    if match is None:
        return None, None

    auth_type = match.groups()[0]
    auth_realm = match.groups()[4]

    return auth_realm, auth_type
def get_feed(cls, feed_url, return_latest_date=False, include_later_than=None, logger=None, username=None, password=None):
    """
    Retrieve the feed entries as a list of flattened dictionaries.

    Arguments:
    feed_url -- The URL of the feed to retrieve (as a string)
    return_latest_date -- If true, also return the date of the latest entry as the second item of a tuple
    include_later_than -- Only include entries whose date is strictly later than this date
    logger -- Optional logger for debug output; may be None
    username -- The username to use when authenticating
    password -- The password to use when authenticating
    """

    auth_handler = None

    # Get an authentication handler if needed
    if username is not None and password is not None:
        auth_handler = cls.get_auth_handler(feed_url, username, password)

    # Parse the feed; use an opener with the auth handler when credentials were supplied
    if auth_handler is not None:
        opener = urllib2.build_opener(auth_handler)
        feed = opener.open(feed_url)
        d = feedparser.parse(feed)
    else:
        d = feedparser.parse(feed_url)

    entries = []
    latest_date = None

    # Bug fix: previously iterated d.entries unconditionally, which raised
    # AttributeError when the parse failed outright and returned no entries.
    if d is not None and hasattr(d, 'entries'):
        for entry in d.entries:

            # Get the updated or published date
            entry_date = cls.get_updated_date(entry)

            # Perform the operations that are based on the date
            if entry_date is not None:

                # If this is the latest one, then save it
                if latest_date is None or entry_date > latest_date:
                    latest_date = entry_date

                # If the item is earlier than the date we are to include, then skip it
                if include_later_than is not None and entry_date <= include_later_than:
                    if logger is not None:
                        logger.debug("Skipping entry with date=%r, since it is not later than latest_date=%r", entry_date, include_later_than)
                    continue

            # Entries with no parseable date are still included
            entries.append(cls.flatten(entry))

    # Return the latest date if requested
    if return_latest_date:
        return entries, latest_date
    else:
        return entries
def get_realm_and_auth_type(cls, feed_url, username, password):
    """
    Determine the authentication realm and scheme required by the feed URL.

    Performs a request and inspects the response; returns a (realm, auth_type)
    tuple where auth_type is "Digest" or "Basic". Both items are None when no
    authentication is required or the response could not be interpreted.

    Arguments:
    feed_url -- The URL of the feed to probe
    username -- Unused here; kept for interface compatibility with callers
    password -- Unused here; kept for interface compatibility with callers
    """

    # Perform request and get the realm and whether or not the view uses HTTP digest or basic authentication
    d = feedparser.parse(feed_url)

    # Bug fix: make sure we got a usable result; previously d.status raised
    # AttributeError when the parse produced no HTTP status at all.
    if d is None or not hasattr(d, 'status'):
        return None, None

    # If the status isn't 401, then authentication isn't required
    if d.status != 401:
        return None, None

    # Bug fix: a 401 without a WWW-Authenticate header previously raised KeyError
    if 'www-authenticate' not in d.headers:
        return None, None

    auth_header = d.headers['www-authenticate']

    # Get the realm and whether it is using basic or digest authentication
    http_auth_re = re.compile("((Digest)|(Basic))( realm=[\"]?([^\"]*)[\"]?)?")
    match = http_auth_re.search(auth_header)

    # Bug fix: an unrecognized header makes search() return None; previously
    # match.groups() then raised AttributeError.
    if match is None:
        return None, None

    auth_type = match.groups()[0]
    auth_realm = match.groups()[4]

    return auth_realm, auth_type
def get_feed(cls, feed_url, return_latest_date=False, include_later_than=None, logger=None, username=None, password=None, clean_html=True):
    """
    Get the feed results as a dictionary.

    Arguments:
    feed_url -- The URL of the feed to retrieve (as a string)
    return_latest_date -- Return the date of the latest feed entry as the second item in a tuple
    include_later_than -- Only return the feeds that are after this date
    logger -- The logger to log the data to; may be None
    username -- The username to use when authenticating
    password -- The password to use when authenticating
    clean_html -- If true, HTML will be converted to something human readable
    """

    auth_handler = None

    # Get an authentication handler if needed
    if username is not None and password is not None:
        auth_handler = cls.get_auth_handler(feed_url, username, password)

    # Parse the feed; use an opener with the auth handler when credentials were supplied
    if auth_handler is not None:
        opener = urllib2.build_opener(auth_handler)
        feed = opener.open(feed_url)
        d = feedparser.parse(feed)
    else:
        d = feedparser.parse(feed_url)

    entries = []
    latest_date = None

    # Stop if we didn't get a result
    if d is None or not hasattr(d, 'entries'):
        # Bug fix: guard the logger like the debug call sites below do; previously
        # a failed parse with logger=None raised AttributeError here.
        if logger is not None:
            logger.warn("No entries returned from the feed, url=\"%s\"", feed_url)
    else:
        for entry in d.entries:

            # Get the updated or published date
            entry_date = cls.get_updated_date(entry)

            # Perform the operations that are based on the date
            if entry_date is not None:

                # If this is the latest one, then save it
                if latest_date is None or entry_date > latest_date:
                    latest_date = entry_date

                # If the item is earlier than the date we are to include, then skip it
                if include_later_than is not None and entry_date <= include_later_than:
                    if logger is not None:
                        logger.debug("Skipping entry with date=%r, since it is not later than latest_date=%r, title=\"%s\"",
                                     time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                     time.strftime('%Y-%m-%dT%H:%M:%SZ', include_later_than),
                                     entry.title)
                    continue
                elif logger is not None and include_later_than is not None:
                    # Bug fix: this message previously said "since its not later than", which is
                    # the opposite of the condition — entries reach this branch because they ARE later.
                    logger.debug("Including entry with date=%r, since it is later than latest_date=%r, title=\"%s\"",
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', include_later_than),
                                 entry.title)
                elif logger is not None and include_later_than is None:
                    logger.debug("Including entry with date=%r, since it is later than latest_date=%r, title=\"%s\"",
                                 time.strftime('%Y-%m-%dT%H:%M:%SZ', entry_date),
                                 "none",
                                 entry.title)

            # Clean up the HTML if requested; each field is converted best-effort
            # and left untouched when conversion fails
            if clean_html:

                # Clean up the content
                try:
                    if entry.get('content', None) and entry['content'][0].get('type', 'text/html') == 'text/html' and entry['content'][0].get('value', None):
                        entry['content'][0]['value'] = html2text.html2text(entry['content'][0]['value'])
                # Bug fix: narrowed from a bare except (which also swallowed
                # SystemExit/KeyboardInterrupt) and guard the optional logger
                except Exception:
                    if logger is not None:
                        logger.warn("Unable to convert the HTML content, field=%s", "value")

                # Clean up the summary
                try:
                    if entry.get('summary', None):
                        entry['summary'] = html2text.html2text(entry['summary'])
                except Exception:
                    if logger is not None:
                        logger.warn("Unable to convert the HTML content, field=%s", "summary")

                # Clean up the summary_detail
                try:
                    if entry.get('summary_detail', None) and entry['summary_detail'].get('type', 'text/html') == 'text/html' and entry['summary_detail'].get('value', None):
                        entry['summary_detail']['value'] = html2text.html2text(entry['summary_detail']['value'])
                except Exception:
                    if logger is not None:
                        logger.warn("Unable to convert the HTML content, field=%s", "summary_detail")

            # Entries with no parseable date are still included
            entries.append(cls.flatten(entry, sort=True))

    # Return the latest date if requested
    if return_latest_date:
        return entries, latest_date
    else:
        return entries