Example #1
def parseDate(date):
  """
  Parse a TextMate date (YYYY-MM-DD HH:MM:SS, no time zone)
  """
  m = _textmate_date_re.match(date)
  if not m:
    return time.mktime(feedparser._parse_date(date))
  isodate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % {
      'year': m.group(1), 'month': m.group(2), 'day': m.group(3),
      'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),
      'zonediff': '+00:00',
  }
  return time.mktime(feedparser._parse_date(isodate))
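For context: feedparser._parse_date() is a private helper that returns a 9-field time tuple expressed in UTC, or None when the string cannot be parsed. A minimal sketch of calling it directly; the import fallback assumes the feedparser 6.x layout, where the helper moved into the feedparser.datetimes subpackage:

import calendar
import time

try:
    from feedparser import _parse_date            # feedparser 4.x / 5.x
except ImportError:
    from feedparser.datetimes import _parse_date  # assumed feedparser 6.x location

parsed = _parse_date("Thu, 04 Aug 2005 17:02:29 GMT")
if parsed is not None:
    print(calendar.timegm(parsed))  # epoch seconds, treating the tuple as UTC
    print(time.mktime(parsed))      # epoch seconds, treating the tuple as local time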
Example #2
    def from_xml(self, x):
        AtomItem.from_xml(self, x)
        updated = unicode(x.updated)
        published = unicode(x.published)
        updated_parsed = feedparser._parse_date(updated)
        published_parsed = feedparser._parse_date(published)

        activity_type = unicode(x.category.attrib.get('label'))

        verb = unicode(
            getattr(x, '{%s}verb' % constants.NS.ActivityStreams, None))
        object_tags = list(
            getattr(x, '{%s}object' % constants.NS.ActivityStreams, None))

        objects = []
        for object_tag in object_tags:
            if object_tag is not None:
                object = ActivityObject()
                object.populate(object_tag, InputType.XML)
                objects.append(object)
                source = object.source
            else:
                object = None
                source = None

        # context things

        # TODO: get an example and parse out all the values from a location
        # so we don't pass around lxml objects
        #location = getattr(x, '{%s}location', None)
        mood = getattr(x, '{%s}mood' % constants.NS.ActivityContext, None)
        if mood is None:
            mood_text = mood_icon = None
        else:
            mood_text = unicode(mood)
            mood_icon = unicode(mood.attrib.get('icon'))

        object.mood_text = mood_text
        object.mood_icon = mood_icon

        self.__dict__.update(
            verb=verb,
            source=source,
            objects=[],
            updated=updated,
            published=published,
            updated_parsed=int(calendar.timegm(updated_parsed)) + TIMEZONE_FIX,
            published_parsed=int(calendar.timegm(published_parsed)) +
            TIMEZONE_FIX,
            activity_type=activity_type,
        )
Example #3
    def from_xml(self, x):
        AtomItem.from_xml(self, x)
        updated = unicode(x.updated)
        published = unicode(x.published)
        updated_parsed = feedparser._parse_date(updated)
        published_parsed = feedparser._parse_date(published)

        activity_type = unicode(x.category.attrib.get("label"))

        verb = unicode(getattr(x, "{%s}verb" % constants.NS.ActivityStreams, None))
        object_tags = list(getattr(x, "{%s}object" % constants.NS.ActivityStreams, None))

        objects = []
        for object_tag in object_tags:
            if object_tag is not None:
                object = ActivityObject()
                object.populate(object_tag, InputType.XML)
                objects.append(object)
                source = object.source
            else:
                object = None
                source = None

        # context things

        # TODO: get an example and parse out all the values from a location
        # so we don't pass around lxml objects
        # location = getattr(x, '{%s}location', None)
        mood = getattr(x, "{%s}mood" % constants.NS.ActivityContext, None)
        if mood is None:
            mood_text = mood_icon = None
        else:
            mood_text = unicode(mood)
            mood_icon = unicode(mood.attrib.get("icon"))

        object.mood_text = mood_text
        object.mood_icon = mood_icon

        self.__dict__.update(
            verb=verb,
            source=source,
            objects=[],
            updated=updated,
            published=published,
            updated_parsed=int(calendar.timegm(updated_parsed)) + TIMEZONE_FIX,
            published_parsed=int(calendar.timegm(published_parsed)) + TIMEZONE_FIX,
            activity_type=activity_type,
        )
Example #4
 def parse_date(self, node, feed, ns=''):
     value = unicoder(node.text)
     feed['updated'] = value
     date = feedparser._parse_date(value)
     if self.unix_timestamp and date:
         date = time.mktime(date)
     feed['updated_parsed'] = date
Example #5
    def _filter_entry(self, entry):
        keys = entry.keys()
        for field in ('date_parsed', 'updated_parsed', 'published_parsed'):
            if field in keys:
                del entry[field]

        for field in ('date', 'updated', 'published'):
            if field in keys:
                parsed = _parse_date(entry[field])
                if parsed is None:
                    del entry[field]
                else:
                    entry[field] = datetime(*parsed[:6])

        if 'date' not in keys and 'published' in keys:
            entry['date'] = entry['published']

        entry['links'] = [l for l in [link.get('href') for link in entry['links']]
                          if l is not None]
        if 'summary' not in entry:
            if 'content' in entry:
                entry['summary'] = entry['content'][0]['value']
        if self.name is not None:
            entry['title'] = u'[%s] %s' % (self.name, entry['title'])

        entry['root_link'] = self.url
        return entry
Example #6
def str2datetime(string):
    import feedparser
    from datetime import datetime
    try:
        return datetime(*(feedparser._parse_date(string)[:6]))
    except Exception as e:
        logging.debug("Failed to convert %s into datetime" % string)
Example #7
    def get(self, slug):
        logging.debug('PipesHandler.get')
        self.response.headers["Content-Type"] = "text/xml"
        if slug[0] == '/':
            slug = slug[1:]
        if slug:
            logging.debug('slug is ' + slug)
            #req_path = self.request.path
            feed_xml_memcache_key = PIPE_OUTPUT_FEED_XML_MEMCACHE_KEY + '_' + slug
            feed_xml = memcache.get(feed_xml_memcache_key)
            updated_time_str = self.request.get('updated_time')
            updated_time = None
            if updated_time_str:
                updated_time = datetime.datetime.fromtimestamp(
                    time.mktime(feedparser._parse_date(updated_time_str)))

            if not feed_xml:
                logging.debug('feed_xml not found in memcache, query entries')
                pipe = get_pipe(slug)
                if pipe:
                    try:
                        feed_xml = self.get_feed_xml(pipe, updated_time)

                        memcache.add(feed_xml_memcache_key, feed_xml,
                                     60 * 30)  # cache for 30 minutes
                    except Exception, e:
                        logging.exception(e)
                        return self.response.set_status(500)
                else:
                    return self.response.set_status(404)
            if feed_xml:
                logging.debug('the len(feed_xml) is %d', len(feed_xml))
                self.response.out.write(feed_xml)
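Note that _parse_date() returns a time tuple expressed in UTC, while time.mktime() interprets its argument as local time. A small sketch of the UTC-preserving variant of the updated_time conversion used above (the date string is illustrative):

import calendar
import datetime

import feedparser

tup = feedparser._parse_date('Wed, 18 Aug 2010 06:06:11 GMT')
updated_time = datetime.datetime.utcfromtimestamp(calendar.timegm(tup))
print(updated_time)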
Example #8
  def get(self, slug):
    logging.debug('PipesHandler.get')
    self.response.headers["Content-Type"] = "text/xml"
    if slug[0] == '/':
      slug = slug[1:]
    if slug:
      logging.debug('slug is ' + slug)
      #req_path = self.request.path
      feed_xml_memcache_key = PIPE_OUTPUT_FEED_XML_MEMCACHE_KEY + '_' + slug
      feed_xml = memcache.get(feed_xml_memcache_key)
      updated_time_str = self.request.get('updated_time')
      updated_time = None
      if updated_time_str:
        updated_time = datetime.datetime.fromtimestamp(time.mktime(feedparser._parse_date(updated_time_str)))

      if not feed_xml:
        logging.debug('feed_xml not found in memcache, query entries')
        pipe = get_pipe(slug)
        if pipe:
          try:
            feed_xml = self.get_feed_xml(pipe, updated_time)

            memcache.add(feed_xml_memcache_key, feed_xml, 60 * 30)  # cache for 30 minutes
          except Exception, e:
            logging.exception(e)
            return self.response.set_status(500)
        else:
          return self.response.set_status(404)
      if feed_xml:
        logging.debug('the len(feed_xml) is %d', len(feed_xml))
        self.response.out.write(feed_xml)
Example #9
def get_pubdate(entry):
    """Try to determine the real pubDate of a feedparser entry

    This basically takes the updated_parsed value, but also uses some more
    advanced techniques to work around various issues with ugly feeds.

    "published" now also takes precedence over "updated" (with updated used as
    a fallback if published is not set/available). RSS' "pubDate" element is
    "updated", and will only be used if published_parsed is not available.
    """

    pubdate = entry.get('published_parsed', None)

    if pubdate is None:
        pubdate = entry.get('updated_parsed', None)

    if pubdate is None:
        # See http://code.google.com/p/feedparser/issues/detail?id=327
        updated = entry.get('published', entry.get('updated', None))
        if updated is not None:
            # FIXME: This is kludgy. We should write our own date handler
            # and register it with feedparser.registerDateHandler() and/or
            # wait for feedparser to add support for this bogus date format.
            pubdate = feedparser._parse_date(updated.replace(',', ''))

    if pubdate is None:
        # Cannot determine pubdate - party like it's 1970!
        return 0

    return mktime_tz(pubdate + (0,))
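A sketch of how get_pubdate() might be driven from a parsed feed; the feed URL is purely illustrative, and mktime_tz is assumed to come from email.utils in the standard library:

import feedparser
from email.utils import mktime_tz  # assumed import for the helper above

d = feedparser.parse('https://example.com/feed.xml')  # hypothetical feed URL
for entry in d.entries:
    print(entry.get('title', ''), get_pubdate(entry))  # Unix timestamp, or 0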
Example #10
 def _check_date(self, func, dtstring, dttuple):
     try:
         tup = func(dtstring)
     except (OverflowError, ValueError):
         tup = None
     self.assertEqual(tup, dttuple)
     self.assertEqual(tup, feedparser._parse_date(dtstring))
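The helper above is presumably called with one of feedparser's registered date handlers and the expected time tuple. An illustrative, self-contained check in the same spirit (the date string and epoch value are just an example):

import calendar

import feedparser

tup = feedparser._parse_date('Thu, 04 Aug 2005 17:02:29 GMT')
assert calendar.timegm(tup) == 1123174949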
Example #11
    def _compute_rss_stats(self, rsslink, fr):
        if fr is not None and not (
                "application/xml" in fr.headers["content-type"]
                or "text/xml" in fr.headers["content-type"]
                or "application/rss+xml" in fr.headers["content-type"]):
            return (None, None)

        try:
            rss = self.wrapper.pq(fr.webpage)
        except (lxml.etree.XMLSyntaxError, lxml.etree.ParserError):
            return (rsslink, 0)

        # Now let's get more recent and oldest item dates in stream
        first = last = None
        count = 0
        for entry in rss("item").items():
            count += 1
            date = feedparser._parse_date(entry("pubDate").text())
            if date is not None:
                publication = time.mktime(date)
                if first is None or first < publication:
                    first = publication
                if last is None or last > publication:
                    last = publication

        # Compute ratio items per week
        if first is not None and last is not None:
            timedelta = first - last
            if timedelta > 0:
                weekratio = count / (timedelta / (7 * 24 * 60 * 60))

                return (rsslink, weekratio)

        return (rsslink, 0)
Example #12
 def _check_date(self, func, dtstring, dttuple):
     try:
         tup = func(dtstring)
     except (OverflowError, ValueError):
         tup = None
     self.assertEqual(tup, dttuple)
     self.assertEqual(tup, feedparser._parse_date(dtstring))
Example #13
File: models.py Project: kr/feedie
def parse_http_datetime(s):
  try:
    if INT_PATTERN.match(s):
      return int(time.time()) + int(s)
    return int(calendar.timegm(feedparser._parse_date(s)))
  except:
    return 0
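A rough usage sketch; INT_PATTERN is not shown in the snippet and is assumed here to match bare integers (e.g. a Retry-After value given in seconds):

import re

INT_PATTERN = re.compile(r'^\d+$')  # assumption, not the project's definition

print(parse_http_datetime('120'))                            # now + 120 seconds
print(parse_http_datetime('Wed, 18 Aug 2010 06:06:11 GMT'))  # absolute HTTP date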
Example #14
 def parse_date(self, node, feed, ns=''):
     value = unicoder(node.text)
     feed['updated'] = value
     date = feedparser._parse_date(value)
     if self.unix_timestamp and date:
         date = time.mktime(date)
     feed['updated_parsed'] = date
Example #15
    def _patch_feedparser(self, 
        path='quotes_app.services.feedparser.parse', image=True):
        
        feed_dict = {
            'title':self.expected_feed_title,
            'description':self.expected_feed_description, 
            'link':self.expected_feed_homepage,
            'tags':self.expected_feed_tags
        }
        
        if image == True:
            feed_dict['image'] = MicroMock(
                url=self.expected_feed_image_url
            )
        
        mock_feedparser_results = MicroMock(
            feed=MicroMock(**feed_dict),
            entries=[MicroMock(
                title="Why is yoda so old?",
                publication_date="Thu, 04 Aug 2005 17:02:29 -0400",
                description="Lets find out why yoda won't die quickly.",
                link = "http://starwars.fke/ep/40",
                guid = "http://starwars.fke/ep/40",
                published_parsed=feedparser._parse_date("Thu, 04 Aug 2005 17:02:29 -0400")
            )]
        )

        patcher = patch(path, return_value=mock_feedparser_results)
        self.parse_spy = patcher.start()
        self.addCleanup(patcher.stop)
Example #16
def get_pubdate(entry):
    """Try to determine the real pubDate of a feedparser entry

    This basically takes the updated_parsed value, but also uses some more
    advanced techniques to work around various issues with ugly feeds.

    "published" now also takes precedence over "updated" (with updated used as
    a fallback if published is not set/available). RSS' "pubDate" element is
    "updated", and will only be used if published_parsed is not available.
    """

    pubdate = entry.get('published_parsed', None)

    if pubdate is None:
        pubdate = entry.get('updated_parsed', None)

    if pubdate is None:
        # See http://code.google.com/p/feedparser/issues/detail?id=327
        updated = entry.get('published', entry.get('updated', None))
        if updated is not None:
            # FIXME: This is kludgy. We should write our own date handler
            # and register it with feedparser.registerDateHandler() and/or
            # wait for feedparser to add support for this bogus date format.
            pubdate = feedparser._parse_date(updated.replace(',', ''))

    if pubdate is None:
        # Cannot determine pubdate - party like it's 1970!
        return 0

    return mktime_tz(pubdate + (0,))
Example #17
 def makeSolrDate(self, datestr):
     """
     Solr is very particular about the date format it can handle
     """
     d = feedparser._parse_date(datestr)
     date = datetime.datetime(d.tm_year, d.tm_mon, d.tm_mday, d.tm_hour, d.tm_min, d.tm_sec)
     return date.isoformat()+'Z'
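Since the parsed value is a struct_time, the field-by-field construction above can also be written by slicing the first six fields. A compact equivalent sketch (not the project's code):

import datetime

import feedparser

def make_solr_date(datestr):
    d = feedparser._parse_date(datestr)
    return datetime.datetime(*d[:6]).isoformat() + 'Z'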
Example #18
 def date(self):
     #TODO use http://labix.org/python-dateutil instead of feedparser
     d = self._get("DC.date", "date")
     if d:
         #TODO ??? timezone
         timetuple = feedparser._parse_date(d)
         return datetime.datetime(*timetuple[:6])
     return None 
Example #19
 def makeSolrDate(self, datestr):
     """
     Solr is very particular about the date format it can handle
     """
     d = feedparser._parse_date(datestr)
     date = datetime.datetime(d.tm_year, d.tm_mon, d.tm_mday, d.tm_hour,
                              d.tm_min, d.tm_sec)
     return date.isoformat() + 'Z'
Example #20
 def _end_updated(self):
     value = self.pop('updated')
     parsed_value = feedparser._parse_date(value)
     overwrite = ('youtube.com' not in self.baseuri)
     try:
         self._save('updated_parsed', parsed_value, overwrite=overwrite)
     except TypeError, te:
         logger.warn('Your feedparser version is too old: %s', te)
Example #21
 def _end_updated(self):
     value = self.pop("updated")
     parsed_value = feedparser._parse_date(value)
     overwrite = "youtube.com" not in self.baseuri
     try:
         self._save("updated_parsed", parsed_value, overwrite=overwrite)
     except TypeError, te:
         logger.warn("Your feedparser version is too old: %s", te)
Example #22
 def _end_updated(self):
     value = self.pop('updated')
     parsed_value = feedparser._parse_date(value)
     overwrite = ('youtube.com' not in self.baseuri)
     try:
         self._save('updated_parsed', parsed_value, overwrite=overwrite)
     except TypeError, te:
         logger.warn('Your feedparser version is too old: %s', te)
Example #23
def parseDate(date):
  """
  Parse a TextMate date (YYYY-MM-DD HH:MM:SS, no time zone, assume it's always localtime)
  """
  m = _textmate_date_re.match(date)
  if not m:
    return time.mktime(feedparser._parse_date(date))
  return time.mktime(time.localtime(calendar.timegm(time.gmtime(time.mktime(time.strptime(date,"%Y-%m-%d %H:%M:%S"))))))
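Aside from rare DST edge cases, the nested localtime/timegm/gmtime/mktime round-trip above is an identity transform on the epoch value, so the non-fallback branch is essentially equivalent to this shorter sketch:

import time

def parse_textmate_date(date):
    # interpret "YYYY-MM-DD HH:MM:SS" as local time and return epoch seconds
    return time.mktime(time.strptime(date, "%Y-%m-%d %H:%M:%S"))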
Example #24
    def testdate(self):
        dates0 = '2010/07/07 14:40:24 +0800'
        #print feedparser._parse_date(dates0)
        #dates1 = '2010-07-07T14:49:24 +0800'
        #print feedparser._parse_date(dates1)
        date = feedparser._parse_date(dates0)

        t = datetime.datetime.fromtimestamp(time.mktime(date))
        print t.strftime("%Y-%m-%dT%H:%M:%SZ"), time.asctime(date)
Example #25
    def testdate(self):
        dates0 = "2010/07/07 14:40:24 +0800"
        # print feedparser._parse_date(dates0)
        # dates1 = '2010-07-07T14:49:24 +0800'
        # print feedparser._parse_date(dates1)
        date = feedparser._parse_date(dates0)

        t = datetime.datetime.fromtimestamp(time.mktime(date))
        print t.strftime("%Y-%m-%dT%H:%M:%SZ"), time.asctime(date)
Example #26
 def fetch_channel(self, channel):
     etag = channel.http_etag
     modified = feedparser._parse_date(channel.http_last_modified)
     # If we have a username or password, rebuild the url with them included
     # Note: using a HTTPBasicAuthHandler would be pain because we need to
     # know the realm. It can be done, but I think this method works, too
     url = channel.authenticate_url(channel.url)
     for handler in self.custom_handlers:
         custom_feed = handler.handle_url(url)
         if custom_feed is not None:
             return feedcore.Result(feedcore.CUSTOM_FEED, custom_feed)
     return self.fetch(url, etag, modified)
Example #27
File: Utils.py Project: oier/Yaki
def parseDate(date):
    """
  Parse a TextMate date (YYYY-MM-DD HH:MM:SS, no time zone, assume it's always localtime)
  """
    m = _textmate_date_re.match(date)
    if not m:
        return time.mktime(feedparser._parse_date(date))
    return time.mktime(
        time.localtime(
            calendar.timegm(
                time.gmtime(
                    time.mktime(time.strptime(date, "%Y-%m-%d %H:%M:%S"))))))
Example #28
def parse_date(date):
    """Parse a TextMate date (YYYY-MM-DD HH:MM:SS, no time zone, assume it's always localtime)"""

    m = _textmate_date_re.match(date)
    try:
        from feedparser import _parse_date
        if not m:
            return time.mktime(_parse_date(date))
    except:
        pass

    return time.mktime(time.localtime(calendar.timegm(time.gmtime(time.mktime(time.strptime(date,
                       '%Y-%m-%d %H:%M:%S'))))))
Example #29
def get_tornado_warnings(feed):
    """Get a list of the current tornado warnings in effect"""

    state_list = ['AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC',
                  'FM', 'FL', 'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS',
                  'KY', 'LA', 'ME', 'MH', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO',
                  'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'MP',
                  'OH', 'OK', 'OR', 'PW', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN',
                  'TX', 'UT', 'VT', 'VA', 'VI', 'WA', 'WV', 'WI', 'WY']

    print_debug('Parsing entries')

    # Trim down the list of alerts to actual tornado warnings
    tornado_alerts = [ x for x in feed.entries if
                      re.search('tornado', x['cap_event'], re.IGNORECASE) and
                      x['cap_status'] == 'Actual' ]
    for entry in tornado_alerts:
        affected_counties = entry['cap_areadesc'].split('; ')
        if not affected_counties:
            print_debug('Counties list empty:\n' + entry)
            debug_mail('Counties empty', entry)

        affected_state = re.search('\?x=(..)', entry['id']).group(1)
        if affected_state not in state_list:
            print_debug('State not found:\n' + entry)
            debug_mail('State not found', entry)
            continue
        if entry.cap_event == 'Tornado Watch':
            alert_type='watch'
        elif entry.cap_event == 'Tornado Warning':
            alert_type='warning'
        else:
            alert_type='Unknown'

        starttime = time.mktime(feedparser._parse_date(entry['cap_effective'])) - time.timezone
        endtime = time.mktime(feedparser._parse_date(entry['cap_expires'])) - time.timezone
        for affected_county in affected_counties:
            yield affected_county, affected_state, starttime, endtime, alert_type
Example #30
 def _start_newznab_attr(self, attrsD):
     context = self._getContext()
     # Add the dict
     if "newznab" not in context:
         context["newznab"] = {}
     # Don't crash when it fails
     try:
         # Add keys
         context["newznab"][attrsD["name"]] = attrsD["value"]
         # Try to get date-object
         if attrsD["name"] == "usenetdate":
             context["newznab"][attrsD["name"] + "_parsed"] = feedparser._parse_date(attrsD["value"])
     except KeyError:
         pass
Example #31
    def run(self, saved_state):
        feed = feedparser.parse(self.url)

        artifacts = []
        for item in list(reversed(feed['items'])):
            # Only new items.
            published_parsed = item.get('published_parsed') or item.get(
                'updated_parsed')
            if published_parsed and published_parsed <= feedparser._parse_date(
                    saved_state or '0001-01-01'):
                continue

            try:
                soup = bs4.BeautifulSoup(item['content'][0]['value'],
                                         'html.parser')
            except KeyError:
                try:
                    soup = bs4.BeautifulSoup(item['summary'], 'html.parser')
                except KeyError:
                    # Can't find any feed content, just skip this entry.
                    continue

            # do some preprocessing to remove common obfuscation methods
            [x.unwrap() for x in soup.find_all('strong')]
            [x.unwrap() for x in soup.find_all('b')]
            [x.unwrap() for x in soup.find_all('em')]
            [x.unwrap() for x in soup.find_all('i')]
            soup = bs4.BeautifulSoup(soup.decode(), 'html.parser')

            text = ''
            if self.feed_type == 'afterioc':
                text = soup.get_text(separator=' ').split(AFTERIOC)[-1]
                artifacts += self.process_element(text,
                                                  item.get('link') or self.url,
                                                  include_nonobfuscated=True)
            elif self.feed_type == 'clean':
                text = soup.get_text(separator=' ')
                artifacts += self.process_element(text,
                                                  item.get('link') or self.url,
                                                  include_nonobfuscated=True)
            else:
                # Default: self.feed_type == 'messy'.
                text = soup.get_text(separator=' ')
                artifacts += self.process_element(text,
                                                  item.get('link') or self.url)

            saved_state = item.get('published') or item.get('updated')

        return saved_state, artifacts
Example #32
def parse_by_url(url):
    try:
        objs = []
        data = feedparser.parse(url)
        for item in data['entries']:
            objs.append({
                'title': strip_tags(item['title']),
                'published': feedparser._parse_date(item['published']),
                'link': item['link']
            })
            objs.sort(key=lambda x: x['published'])
            objs = objs[::-1]
    except:  # everything can happened here
        objs = []
    return objs
Example #33
def to_datetime(t, tzinfo=None):
    if not t:
        return None
    if isinstance(t, str):
        t = datetime.datetime(*feedparser._parse_date(t)[:6], tzinfo=UTC)
    tz = tzinfo or LocalTimezone()
    if isinstance(t, (tuple, time.struct_time)):
        t = datetime.datetime(*t[:6], tzinfo=tz)
    if isinstance(t, (int, float)):
        t = datetime.datetime.fromtimestamp(t, tz=tz)
    if not isinstance(t, datetime.datetime):
        raise ValueError(repr(t))
    if not t.tzinfo:
        t = datetime.datetime(*t.timetuple()[:6], tzinfo=tz)
    return t
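An illustrative sketch of the input shapes to_datetime() accepts; UTC and LocalTimezone are defined elsewhere in the source module, so stand-ins are assumed here:

import datetime
import time

UTC = datetime.timezone.utc                                          # assumed stand-in
LocalTimezone = lambda: datetime.datetime.now().astimezone().tzinfo  # assumed stand-in

print(to_datetime('Thu, 04 Aug 2005 17:02:29 GMT'))  # date string, parsed via feedparser
print(to_datetime(time.gmtime()))                    # 9-tuple / struct_time
print(to_datetime(1123174949))                       # Unix timestamp
print(to_datetime(None))                             # falsy input -> None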
Example #34
def to_datetime(t, tzinfo=None):
    if not t:
        return None
    if isinstance(t, str):
        t = datetime.datetime(*feedparser._parse_date(t)[:6], tzinfo=UTC)
    tz = tzinfo or LocalTimezone()
    if isinstance(t, (tuple, time.struct_time)):
        t = datetime.datetime(*t[:6], tzinfo=tz)
    if isinstance(t, (int, float)):
        t = datetime.datetime.fromtimestamp(t, tz=tz)
    if not isinstance(t, datetime.datetime):
        raise ValueError(repr(t))
    if not t.tzinfo:
        t = datetime.datetime(*t.timetuple()[:6], tzinfo=tz)
    return t
Example #35
def parse_date(date):
    """Parse a TextMate date (YYYY-MM-DD HH:MM:SS, no time zone, assume it's always localtime)"""

    m = _textmate_date_re.match(date)
    try:
        from feedparser import _parse_date
        if not m:
            return time.mktime(_parse_date(date))
    except:
        pass

    return time.mktime(
        time.localtime(
            calendar.timegm(
                time.gmtime(
                    time.mktime(time.strptime(date, '%Y-%m-%d %H:%M:%S'))))))
Example #36
    def __init__(self, data):
        self.userid = data.author.id
        self.text = data.body
        self.commentId = data.commentId
        if getattr(data, "postedDate", None) is not None:
            self.postedDate = data.postedDate
            self.postedDate_parsed = int(calendar.timegm(feedparser._parse_date(self.postedDate)))
        else:
            self.postedDate_parsed = data.postedDate_parsed

        try:
            int(self.userid)
        except ValueError:
            pass
        else:
            self.userid = "myspace.com.person.%s" % self.userid
Example #37
    def __init__(self, data):
        self.userid = data.author.id
        self.text = data.body
        self.commentId = data.commentId
        if getattr(data, 'postedDate', None) is not None:
            self.postedDate = data.postedDate
            self.postedDate_parsed = int(
                calendar.timegm(feedparser._parse_date(self.postedDate)))
        else:
            self.postedDate_parsed = data.postedDate_parsed

        try:
            int(self.userid)
        except ValueError:
            pass
        else:
            self.userid = 'myspace.com.person.%s' % self.userid
Example #38
 def _new_with_headers_expires(self, url, stale_feed):
     """
     >>> from datetime import datetime, timedelta
     >>> now = datetime.utcnow()
     >>> stale = {'headers': {'expires':'Wed, 18 Aug 2010 06:06:11 GMT'}}
     >>> url = 'http://feedparser.org/docs/examples/atom10.xml'
     >>> res = _new_with_headers_expires(url, stale)
     >>> res.status
     200
     >>> stale = {'headers': { 'expires':datetime.isoformat(now + timedelta(minutes=30))}}
     >>> res2 = _new_with_headers_expires(url, stale)
     >>> res == res2
     False
     """
     exp = fp._parse_date(stale_feed['headers']['expires'])
     if time.time() > exp:
         return retrieve_feed(url)
Example #39
def relatize(value):
    """
    
    Returns the relative time of each request.  Another feature stolen from
    github.

    How it works:
    
        get the date from value - use _parse_date from feed parser
        get current utc time.
        compare current utc time and output relative time
        
    """
    
    date_struct = _parse_date(value)[0:6]
    the_date = datetime(*date_struct)
    
    now = datetime.utcnow()
    if time.daylight:
        now = now + timedelta(hours=1)

    time_difference = now - the_date
    if time_difference.days < 0:
        return 'sometime in the near future' # just in case the time screws up
    
    if time_difference.days > 365:
        return 'about %d years ago' % (time_difference.days / 365)
    elif time_difference.days > 60:
        return 'about %d months ago' % (time_difference.days / 30)
    elif time_difference.days > 30:
        return 'about a month ago'
    elif time_difference.days > 1:
        return 'about %d days ago' % time_difference.days
    elif time_difference.days > 0:
        return 'about a day ago'
    elif time_difference.seconds > 7200:
        return 'about %d hours ago' % (time_difference.seconds / 3600)
    elif time_difference.seconds > 3600:
        return 'about an hour ago'
    elif time_difference.seconds > 120:
        return 'about %d minutes ago' % (time_difference.seconds / 60)
    elif time_difference.seconds > 60:
        return 'about a minute ago'
    elif time_difference.seconds < 60 or time_difference.days < 1:
        return 'just now'
Example #40
    def __real_check_now_cb(self):
        rss_file = os.path.join(os.environ["HOME"], ".movistar_desktop/", "rss.xml")
        
        if not os.path.exists(rss_file) :
            return True
        
        d = feedparser.parse(rss_file)

        os.system("rm %s" % rss_file)
        
        if (len(d['entries']) < 1):
            print _(u"No entries in RSS")
            return True
        
        # Check date with saved feed
        new_feed_date = md5.new(d.entries[0].date).hexdigest()
        saved_feed_date = md5.new(self.conf.get_updater_feed_date()).hexdigest()
        release_date_parsed = feedparser._parse_date(self.conf.get_release_date())
        # Debug
        print "RSS-----------------"
        print d.entries[0]
        print "--------------------"
        print "new_feed_date %s" % new_feed_date
        print "saved_feed_date %s" % saved_feed_date
        print "fecha: %s" % d.entries[0].updated_parsed
        print "release: %s" % release_date_parsed
        # End debug
        
        if ((new_feed_date != saved_feed_date) and (release_date_parsed < d.entries[0].updated_parsed)):
            self.uw_dialog.set_title(d.entries[0].title)
            self.uw_label.set_text(d.entries[0].description)

            self.uw_dialog.show_all()
            result = self.uw_dialog.run()
            if (result == gtk.RESPONSE_OK):
                os.system("gnome-open %s" % d.entries[0].link)
                self.conf.set_updater_feed_date(d.entries[0].date)
                self.conf.save_conf()
            elif (result == gtk.RESPONSE_NO):
                self.conf.set_updater_feed_date(d.entries[0].date)
                self.conf.save_conf()

            self.uw_dialog.hide()

        return True
Example #41
def rdfaparse(content):
    resources = []
    triples = rdfascrape.rdfascrape(content)
    for count, (s, p, o, dt) in enumerate(triples):
        obj = {}
        obj['label'] = '_' + str(count)
        obj['id'] = '_' + str(count)
        pred = p.split('/')[-1].split('#')[-1]
        if pred == u'dc:date' or dt in [u'xsd:date', u'xs:date', u'http://www.w3.org/2001/XMLSchema#date']:
            # feedparser's internal date parser robustly handles different
            # time formats and returns a 9-tuple
            import feedparser
            normalizeddate = feedparser._parse_date(o)
            obj[pred] = time.strftime("%Y-%m-%dT%H:%M:%S", normalizeddate)
            obj[pred + u'localized'] = time.strftime("%a, %d %b %Y %H:%M:%S", normalizeddate)
        else:
            obj[pred] = o
        resources.append(obj)
    return resources
Example #42
def discovery():
    '''
    Sample query:
        curl "http://localhost:8880/osci.jove.discovery"
    '''
    doc = '''<?xml version="1.0" encoding="UTF-8"?>
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/" xmlns:osci="%(oscibase)s/content/jove/datamodel#">
  <ShortName>JoVE</ShortName>
  <LongName>JoVE OSCI adapter</LongName>
  <Description>JoVE</Description>
  <Contact>%(admin)s</Contact>
  <Url type="application/atom+xml" rel="results" template="%(oscibase)s/content/jove?search={searchTerms}"/>
  <Url type="application/atom+xml" rel="http://purl.zepheira.com/osci/content/model#id" template="%(oscibase)s/content/jove?id={searchTerms}"/>
  <Attribution>© 2009 Zepheira, LLC</Attribution>
  <osci:metadata-profile href="%(oscibase)s/content/jove/metadata-profile"/>
</OpenSearchDescription>
'''%{'admin': ADMIN_EMAIL, 'oscibase': OSCI_BASE}
    #Check XML
    amara.parse(doc)
    return doc

# --- %< ---

    entries = []

    for it in islice(feed.rss.channel.item, 0, 3):
        entry = {}
        print >> sys.stderr, "processing", unicode(it.link)
        entry['id'] = unicode(it.link)
        entry['label'] = entry['id']
        entry['title'] = unicode(it.title)
        desc = unicode(it.description)
        entry['description'] = desc[:desc.find(u'<div class=\"feedflare\">')]
        #print htmlparse(str(it.description)) #Above hack will do for now ;)
        entry['link'] = unicode(it.origLink)
        entry['pubDate'] = time.strftime("%Y-%m-%dT%H:%M:%S", feedparser._parse_date(str(it.pubDate)))
        entry['categories'] = [ unicode(c).strip() for c in it.category ]
        entry['snapshot'] = MOCKUP_IMAGES[unicode(it.link)]
        entry['icon'] = MOCKUP_ICON
        entry = get_data_from_page(entry, str(it.origLink))
        entries.append(entry)

    print simplejson.dumps({'items': entries}, indent=4)
Example #43
    def from_json(self, js):
        # hooray for utc
        updated = self.updated = self.published = getattr(
            js, 'moodLastUpdated', getattr(js, 'moodStatusLastUpdated', 0))
        if self.updated == 0:
            self.updated_parsed = self.published_parsed = updated
        else:
            try:
                self.updated_parsed = self.published_parsed = int(updated)
            except (ValueError, TypeError):
                updated_parsed = feedparser._parse_date(updated)
                self.updated_parsed = self.published_parsed = int(
                    calendar.timegm(updated_parsed))

        user = getattr(js, 'user', None)
        if user is None:
            self.author_id = js.userId
        else:
            self.author_id = user.userId

        log.info_s("status json: %r", js)
        self.id = js.statusId
        moodimage_url = getattr(js, 'moodPictureUrl',
                                getattr(js, 'moodimageurl', None))
        if moodimage_url == self.SPACER_URL:
            moodimage_url = None
        #self.icon_url = user.image
        #self.icon_url = moodimage_url
        self.icon_url = None

        self.contents = [(u'xhtml', js.status)]
        self.body = js.status
        self.mood_text = getattr(js, 'moodName', getattr(js, 'mood', None))
        self.mood_icon = moodimage_url
        self._numComments = 0
        try:
            self._numComments = int(
                getattr(js, 'numComments', None)
                or getattr(js, '_numComments', None))
        except (AttributeError, ValueError, TypeError):
            self._numComments = None

        self.comments = map(MyspaceComment.from_json, js.get('comments', []))
Example #44
def str2datetime(string: str) -> datetime:
    """
    >>> str2datetime("01.01.1990")
    datetime.datetime(1990, 1, 1, 0, 0)

    >>> str2datetime("25 AUG 2012")
    datetime.datetime(2012, 8, 25, 0, 0)

    >>> str2datetime("18 APR 1973")
    datetime.datetime(1973, 4, 18, 0, 0)

    >>> str2datetime("1968")
    datetime.datetime(1968, 1, 1, 0, 0)
    """

    try:
        return datetime(*(feedparser._parse_date(string)[:6]))
    except:
        logging.error("failed to parse %s as date" % repr(string))
        raise
Example #45
def get_datetime(unparsed_date):
    """ string to datetime """
    parsed_date = feedparser._parse_date(unparsed_date)
    if not parsed_date:
        return datetime.datetime.min
    if isinstance(parsed_date, dict):
        return datetime.datetime(parsed_date['year'],
                                 parsed_date['month'],
                                 parsed_date['day'],
                                 parsed_date['hour'],
                                 parsed_date['min'],
                                 parsed_date['sec'],
                                 tzinfo=None)
    else:
        return datetime.datetime(parsed_date[0],
                                 parsed_date[1],
                                 parsed_date[2],
                                 parsed_date[3],
                                 parsed_date[4],
                                 parsed_date[5],
                                 tzinfo=None)
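A minimal usage sketch of get_datetime() (the inputs are illustrative):

print(get_datetime('Thu, 04 Aug 2005 17:02:29 GMT'))  # -> datetime.datetime(2005, 8, 4, 17, 2, 29)
print(get_datetime('not a date at all'))              # -> datetime.datetime.min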
Example #46
def get_metadata(url):
    """Get file download metadata

    Returns a (size, type, date, name) tuple from the given download
    URL. Will use the network connection to determine the
    metadata via the HTTP header fields.
    """
    track_fp = util.urlopen(url)
    headers = track_fp.info()
    filesize = headers['content-length'] or '0'
    filetype = headers['content-type'] or 'application/octet-stream'

    if 'last-modified' in headers:
        parsed_date = feedparser._parse_date(headers['last-modified'])
        filedate = time.mktime(parsed_date)
    else:
        filedate = None

    filename = os.path.basename(os.path.dirname(url))
    track_fp.close()
    return filesize, filetype, filedate, filename
Example #47
def GetTorrents(feed_list):
  torrents = []
  for feed in feed_list:
    last_sync = feedparser._parse_date(feed['last_sync'])
    feedparser_dict = feedparser.parse(feed['link'])
    for entry in feedparser_dict.entries:
      # Torrent links are stored as a link element or as an enclosure
      if entry.published_parsed > last_sync:
        if '.torrent' in entry.link:
          torrents.append({"link": entry.link,
                           "date": entry.published,
                           "date_parsed": entry.published_parsed})
        elif (len(entry.enclosures) and
                 entry.enclosures[0]['type'] == 'application/x-bittorrent'):
          torrents.append({"link": entry.enclosures[0]['href'],
                           "date": entry.published,
                           "date_parsed": entry.published_parsed})
    # Get highest date of this feed, update json, and return only torrents
  UpdateSyncDates(torrents)
  torrents = [torrent["link"] for torrent in torrents]
  return torrents
Example #48
def get_metadata(url):
    """Get file download metadata

    Returns a (size, type, date, name) tuple from the given download
    URL. Will use the network connection to determine the
    metadata via the HTTP header fields.
    """
    track_fp = util.urlopen(url)
    headers = track_fp.info()
    filesize = headers['content-length'] or '0'
    filetype = headers['content-type'] or 'application/octet-stream'

    if 'last-modified' in headers:
        parsed_date = feedparser._parse_date(headers['last-modified'])
        filedate = time.mktime(parsed_date)
    else:
        filedate = None

    filename = os.path.basename(os.path.dirname(url))
    track_fp.close()
    return filesize, filetype, filedate, filename
Example #49
    def from_json(self, js):
        # hooray for utc
        updated = self.updated = self.published = getattr(
            js, "moodLastUpdated", getattr(js, "moodStatusLastUpdated", 0)
        )
        if self.updated == 0:
            self.updated_parsed = self.published_parsed = updated
        else:
            try:
                self.updated_parsed = self.published_parsed = int(updated)
            except (ValueError, TypeError):
                updated_parsed = feedparser._parse_date(updated)
                self.updated_parsed = self.published_parsed = int(calendar.timegm(updated_parsed))

        user = getattr(js, "user", None)
        if user is None:
            self.author_id = js.userId
        else:
            self.author_id = user.userId

        log.info_s("status json: %r", js)
        self.id = js.statusId
        moodimage_url = getattr(js, "moodPictureUrl", getattr(js, "moodimageurl", None))
        if moodimage_url == self.SPACER_URL:
            moodimage_url = None
        # self.icon_url = user.image
        # self.icon_url = moodimage_url
        self.icon_url = None

        self.contents = [(u"xhtml", js.status)]
        self.body = js.status
        self.mood_text = getattr(js, "moodName", getattr(js, "mood", None))
        self.mood_icon = moodimage_url
        self._numComments = 0
        try:
            self._numComments = int(getattr(js, "numComments", None) or getattr(js, "_numComments", None))
        except (AttributeError, ValueError, TypeError):
            self._numComments = None

        self.comments = map(MyspaceComment.from_json, js.get("comments", []))
Example #50
def get_datetime(unparsed_date):
	""" string to datetime """
	parsed_date = feedparser._parse_date(unparsed_date)
	if not parsed_date:
		return datetime.datetime.min
	if isinstance(parsed_date, dict):
		return datetime.datetime(
			parsed_date['year'],
			parsed_date['month'],
			parsed_date['day'],
			parsed_date['hour'],
			parsed_date['min'],
			parsed_date['sec'],
			tzinfo=None)
	else:
		return datetime.datetime(
			parsed_date[0],
			parsed_date[1],
			parsed_date[2],
			parsed_date[3],
			parsed_date[4],
			parsed_date[5],
			tzinfo=None)
Example #51
 def test_None(self):
     self.assertTrue(feedparser._parse_date(None) is None)
Example #52
    def get(self, save, **kwargs):
        '''Document collected via {} feed reader'''.format(self.doctype)

        # This RSS-scraper is a generic fallback option in case we do not have
        # any specific one. Therefore, only use the following generic values
        # if we do not have any more specific info already
        if 'rss_url' in kwargs:
            RSS_URL = kwargs['rss_url']
        else:
            try:
                RSS_URL = self.rss_url
            except:
                RSS_URL = 'N/A'

        assert RSS_URL != 'N/A', 'You need to specify the feed URL. Example: rss_url="http://www.nu.nl/rss"'

        if type(RSS_URL) is str:
            RSS_URL = [RSS_URL]

        for thisurl in RSS_URL:
            rss_body = self.get_page_body(thisurl)
            d = feedparser.parse(rss_body)
            for post in d.entries:
                try:
                    _id = post.id
                except:
                    _id = post.link
                if _id == None:
                    _id = post.link
                link = re.sub("/$", "", self.getlink(post.link))

                # By now, we have retrieved the RSS feed. We now have to determine for the item that
                # we are currently processing (post in d.entries), whether we want to follow its
                # link and actually get the full text and process it. If we already have it,
                # we do not need to (therefore check_exists). But also, if we do not want to
                # work with the database backend (as indicated by save=False), we probably also
                # do not want to look something up in the database. We therefore also retrieve it in
                # that case.
                if save == False or check_exists(_id)[0] == False:
                    try:
                        req = urllib2.Request(
                            link, headers={'User-Agent': "Wget/1.9"})
                        htmlsource = urllib2.urlopen(req).read().decode(
                            encoding="utf-8", errors="ignore")
                    except:
                        htmlsource = None
                        logger.info(
                            'Could not open link - will not retrieve full article, but will give it another try with different User Agent'
                        )
                    # Some (few) scrapers seem to block certain user agents. Therefore, if code above did
                    # not succeed, try fetching the article pretending to use Firefox on Windows
                    if not htmlsource or htmlsource == "":
                        try:
                            req = urllib2.Request(
                                link,
                                headers={
                                    'User-Agent':
                                    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
                                })
                            htmlsource = urllib2.urlopen(req).read().decode(
                                encoding="utf-8", errors="ignore")
                        except:
                            htmlsource = None
                            logger.info(
                                'Could not open link - will not retrieve full article'
                            )

                    try:
                        teaser = re.sub(r"\n|\r|\t", " ", post.description)
                    except:
                        teaser = ""
                    try:
                        datum = datetime.datetime(
                            *feedparser._parse_date(post.published)[:6])
                    except:
                        try:
                            # alternative date format as used by nos.nl
                            datum = datetime.datetime(*feedparser._parse_date(
                                post.published[5:16])[:6])
                        except:
                            #print("Couldn't parse publishing date")
                            datum = None
                    doc = {
                        "_id": _id,
                        "title_rss": post.title,
                        "teaser_rss": teaser,
                        "publication_date": datum,
                        "htmlsource": htmlsource,
                        "feedurl": thisurl,
                        "url": re.sub("/$", "", post.link)
                    }
                    if htmlsource is not None:
                        # TODO: CHECK IF PARSEHTML returns None, if so, raise custom exception
                        parsed = self.parsehtml(doc['htmlsource'])
                        if parsed is None or parsed == {}:
                            try:
                                raise UnparsableException
                            except UnparsableException:
                                pass
                        else:
                            doc.update(parsed)
                    parsedurl = self.parseurl(link)
                    doc.update(parsedurl)
                    docnoemptykeys = {
                        k: v
                        for k, v in doc.items() if v or v == False
                    }
                    yield docnoemptykeys
Example #53
def update():
    VALID_EXT = debrid.VALID_EXT
    rsslist = rssList()
    sourceList = []
    if len(rsslist) > 0: control.infoDialog('Checking RSS Lists...')
    for x in rsslist:
        u = x['rss']
        timeNow = datetime.datetime.utcnow()
        timeOffset = int(x['offset'])
        timeOffset = (
            timeNow -
            datetime.timedelta(days=int(timeOffset))).strftime('%Y%m%d')

        html = requests.get(u).content
        r = BeautifulSoup(html, "html.parser")

        soup = r.find_all('channel')[0]
        soup = soup.find_all('item')

        for item in soup:
            try:
                title = item.find_all('title')[0].getText().strip()
                link = item.find_all('link')[0].getText().strip()

                checkDB = rssDB(mode='check', link=link, title=title)

                if checkDB == True:
                    print("[REALIZER RSS MANAGER] TORRENT ALREADY ADDED: %s" %
                          title)
                    raise Exception()

                try:
                    date = item.find_all('pubdate')[0].getText().strip()
                except:
                    date = item.find_all('pubDate')[0].getText().strip()

                dateString = feedparser._parse_date(date)
                dt = datetime.datetime.fromtimestamp(mktime(dateString))
                pubDate = dt.strftime('%Y%m%d')
                strDate = dt.strftime('%Y-%m-%d')
                if int(pubDate) >= int(timeOffset):
                    r = debrid.realdebrid().addtorrent(link)
                    id = r['id']
                    select = debrid.realdebrid().torrentInfo(id)

                    status = str(select['status'])
                    print("[REALIZER RSS MANAGER] REALDEBRID STATUS", status)
                    if cleantitle.get(
                            status
                    ) != 'waitingfilesselection' and cleantitle.get(
                            status) != 'downloaded':
                        debrid.realdebrid().delete(id, type='torrents')
                        raise Exception()

                    files = select['files']
                    filesIDs = [
                        i['id'] for i in files
                        if i['path'].split('.')[-1].lower() in VALID_EXT
                    ]
                    if len(filesIDs) < 1 or filesIDs == []:
                        debrid.realdebrid().delete(id, type='torrents')
                        raise Exception()
                    r = debrid.realdebrid().selectTorrentList(id, filesIDs)
                    source = {
                        'title': title,
                        'link': link,
                        'id': id,
                        'date': str(strDate)
                    }
                    sourceList.append(source)
            except:
                pass
    control.infoDialog('RSS Lists check completed')
    rssDB(data=sourceList)
Example #54
 def t(dateString):
     t = feedparser._parse_date(dateString)
     return datetimefromparsed(t).isoformat() if t else None
Example #55
def parse_rfc3339(s: str) -> datetime.datetime:
    # pylint: disable=protected-access
    struct = feedparser._parse_date(s)
    return ts2dt(int(timegm(struct)))
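parse_rfc3339() relies on two names imported elsewhere in its module; a hedged guess at what they look like, so the snippet can be read in isolation (this ts2dt is an assumption, not the project's definition):

import datetime
from calendar import timegm

def ts2dt(ts: int) -> datetime.datetime:
    # assumed helper: Unix timestamp -> timezone-aware UTC datetime
    return datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc)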
Example #56
#Daily Beast
dBeast = []

sourceList = [nytimes, washPo, dBeast]

feeds = []

for source in sourceList:
    for feed in source:
        feeds.append(feedparser.parse(feed))

#The set of all articles in Newspaper format.
entries = []

#runs through each feed, taking each item / article, and adding its nArticle to the entries object.
for feed in feeds:
    for item in feed["items"]:

        if "published" in item:
            pubDate = item["published"]
        elif "pubDate" in item:
            pubDate = item["pubDate"]
        else:
            pubDate = "??"
        print calendar.timegm(time.gmtime()) - calendar.timegm(
            feedparser._parse_date(pubDate))
        entries.append(nArticle(item["link"]))
#From here there seems to be a set of entries containing a ton of Articles using the URLs from the RSS feed.

pdfkit.from_url(entries[1].url, 'out.pdf')
Example #57
    def getLink(self):
        """Reads the HTML page and extracts the link, title and body."""

        if not self.children.intersection(self.attrs):
            return      # mandatory child element missing

        self.loadCache()
        try:
            f = feedparser._open_resource(self.uri, self.etag, self.modified,
                USER_AGENT, None, [], {}
            )
            html = f.read()
        except Exception as e:
            sys.stderr.write('Getting page %s: %s\n' % (self.uri, e))
            return

        if getattr(f, 'status', None) == 304 or not html:
            # not modified or empty page
            return

        # save HTTP headers
        if hasattr(f, 'info'):
            info = f.info()
            etag = info.getheader('ETag')
            modified = info.getheader('Last-Modified')
            if modified:
                modified = feedparser._parse_date(modified)
            self.saveCache(etag, modified)

            # if the page is compressed, decompress it
            ce = info.getheader('Content-Encoding', '')
            if ce == 'gzip':
                try:
                    import gzip
                    import StringIO
                    html = gzip.GzipFile(fileobj=StringIO.StringIO(html)).read()
                except Exception as e:
                    sys.stderr.write('Unzipping page %s: %s\n' % (self.uri, e))
                    return
            elif ce == 'deflate':
                try:
                    import zlib
                    html = zlib.decompress(html, -zlib.MAX_WBITS)
                except Exception as e:
                    sys.stderr.write('Inflating page %s: %s\n' % (self.uri, e))
                    return

        # resolve relative URIs
        html = feedparser._resolveRelativeURIs(html, self.uri, self.encoding, 'text/html')

        if hasattr(f, 'headers'):
            charsets = [c for c in feedparser._getCharacterEncoding(f.headers, html) if c]
        else:
            charsets = [self.encoding]
        for charset in charsets:
            try:
                html = html.decode(charset)
                break
            except UnicodeDecodeError:
                pass
            except LookupError:
                pass

        if 'regex' in self.attrs:
            self.match_regex(html)
        else:
            self.match_xpath(html)
Example #58
                data = ''
        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
            try:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
            except Exception, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''

    # save HTTP headers
    if hasattr(f, 'info'):
        info = f.info()
        result['etag'] = info.getheader('ETag')
        last_modified = info.getheader('Last-Modified')
        if last_modified:
            result['modified'] = _parse_date(last_modified)
    if hasattr(f, 'url'):
        result['href'] = f.url
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'headers'):
        result['headers'] = f.headers.dict
    if hasattr(f, 'close'):
        f.close()

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
Example #59
            data = feedparser.parse(feed_xml)
            entries = get_entries(data, pipe)
            logging.debug('get %d entries', len(entries))
            key_names = []
            for e in entries:
                key_names.append(e['key_name'])
            entries = filter_entries(entries, pipe)
            logging.debug('get %d entries', len(entries))
            for e in entries:
                logging.debug(' e.key().id_or_name() = %s e.link = %s ',
                              str(e.key().id_or_name()), str(e.link))

        elif type == 'time':
            oldest_update_time = datetime.datetime.fromtimestamp(
                time.mktime(
                    feedparser._parse_date('2010/07/15 07:29:25 +0800')))
            logging.debug('the oldest_update_time is %s',
                          str(oldest_update_time))
        elif type == 'query':
            feed_xml = self.get_xml(pipe)
            logging.debug('the len(feed_xml) is %d', len(feed_xml))
            data = feedparser.parse(feed_xml)
            entries = get_entries(data, pipe)
            logging.debug('start filter %d entries', len(entries))
            oldest_update_time = get_oldest_update_time(entries)
            logging.debug('the oldest_update_time is %s',
                          str(oldest_update_time))
            db_entries = model.FeedEntry.gql(
                "WHERE pipe = :1 AND updated_time >=:2 ORDER BY updated_time DESC",
                pipe, oldest_update_time).fetch(200)
            logging.debug('query finished. get %d entries', len(db_entries))