Example #1
0
    def __new__(cls, arg, uri=None, encoding=None, resolver=None, sourcetype=0):
        """
        Create an input source which can be passed to Amara APIs.

        arg - a string, Unicode object (only if you really know what you're
              doing), file-like object (stream), file path, URI or
              urllib2.Request.  You can also pass an InputSource object, in
              which case that same object is returned as-is (the uri override
              is not applied to it)
        uri - optional override URI.  When given (and non-empty) the base URI
              for the IS is set to this value, whatever kind of arg was
              passed.  Otherwise the base URI is derived from arg (request
              URL, the URI itself, or the file path converted to a URI), or
              a UUID URN is generated for streams and XML strings
        encoding - handed unchanged to InputSource.__new__
        resolver - object whose resolve() method opens the stream for a URI
              or request; defaults to amara.lib.irihelpers.DEFAULT_RESOLVER
        sourcetype - when equal to XMLSTRING, arg is treated as XML text
              without consulting the isxml() heuristic

        Returns an input source which can be passed to Amara APIs.

        Raises ValueError if arg is an empty string, or if it does not look
        like XML and is too long to be tried as a file path or URI.
        """
        #These imports are tucked in here because amara.lib.iri is an expensive import
        from amara.lib.iri import is_absolute, os_path_to_uri
        from amara.lib.irihelpers import DEFAULT_RESOLVER
        resolver = resolver or DEFAULT_RESOLVER

        #An existing input source is handed straight back to the caller
        if isinstance(arg, InputSource):
            return arg

        #Compare against a single str only; comparing against (u'', '') can
        #trigger "UnicodeWarning: Unicode equal comparison failed to convert
        #both arguments to Unicode"
        if arg == '':
            #FIXME L10N
            raise ValueError("Cannot parse an empty string as XML")

        if isinstance(arg, urllib2.Request):
            #The request's own URL is only the default base; an explicit
            #uri override from the caller wins
            uri = uri or arg.get_full_url()
            stream = resolver.resolve(arg)
        elif hasattr(arg, 'read'):
            #File-like object: create a dummy URI to use as base if none given
            uri = uri or uuid4().urn
            stream = arg
        #XXX: Should we at this point refuse to proceed unless it's a basestring?
        elif sourcetype == XMLSTRING or isxml(arg):
            #See this article about XML detection heuristics
            #http://www.xml.com/pub/a/2007/02/28/what-does-xml-smell-like.html
            uri = uri or uuid4().urn
            stream = StringIO(arg)
        elif is_absolute(arg) and not os.path.isfile(arg):
            #Absolute URI which is not also an existing local file: always
            #fetch from arg itself, but let the caller override the base URI
            stream = resolver.resolve(arg)
            uri = uri or arg
        #If the arg is beyond a certain length, don't even try it as a URI
        elif len(arg) < MAX_URI_LENGTH_FOR_HEURISTIC:
            #Treat as an OS file path; the stream comes from the path's URI
            #even when the caller overrides the base URI
            location = os_path_to_uri(arg)
            stream = resolver.resolve(location)
            uri = uri or location
        else:
            #FIXME L10N
            raise ValueError("Does not appear to be well-formed XML")

        #We might add the ability to load zips, gzips & bzip2s
        #http://docs.python.org/lib/module-zlib.html
        #http://docs.python.org/lib/module-gzip.html
        #http://docs.python.org/lib/module-bz2.html
        #http://docs.python.org/lib/zipfile-objects.html

        #InputSource.__new__ is in C: expat/input_source.c:inputsource_new
        return InputSource.__new__(cls, stream, uri, encoding)
Example #2
0
    def __new__(cls,
                arg,
                uri=None,
                encoding=None,
                resolver=None,
                sourcetype=0):
        """
        Create an input source which can be passed to Amara APIs.

        arg - a string, Unicode object (only if you really know what you're
              doing), file-like object (stream), file path, URI or
              urllib2.Request.  You can also pass an InputSource object, in
              which case that same object is returned as-is (the uri override
              is not applied to it)
        uri - optional override URI.  When given (and non-empty) the base URI
              for the IS is set to this value, whatever kind of arg was
              passed.  Otherwise the base URI is derived from arg (request
              URL, the URI itself, or the file path converted to a URI), or
              a UUID URN is generated for streams and XML strings
        encoding - handed unchanged to InputSource.__new__
        resolver - object whose resolve() method opens the stream for a URI
              or request; defaults to amara.lib.irihelpers.DEFAULT_RESOLVER
        sourcetype - when equal to XMLSTRING, arg is treated as XML text
              without consulting the isxml() heuristic

        Returns an input source which can be passed to Amara APIs.

        Raises ValueError if arg is an empty string, or if it does not look
        like XML and is too long to be tried as a file path or URI.
        """
        #These imports are tucked in here because amara.lib.iri is an expensive import
        from amara.lib.iri import is_absolute, os_path_to_uri
        from amara.lib.irihelpers import DEFAULT_RESOLVER
        resolver = resolver or DEFAULT_RESOLVER

        #An existing input source is handed straight back to the caller
        if isinstance(arg, InputSource):
            return arg

        #Compare against a single str only; comparing against (u'', '') can
        #trigger "UnicodeWarning: Unicode equal comparison failed to convert
        #both arguments to Unicode"
        if arg == '':
            #FIXME L10N
            raise ValueError("Cannot parse an empty string as XML")

        if isinstance(arg, urllib2.Request):
            #The request's own URL is only the default base; an explicit
            #uri override from the caller wins
            uri = uri or arg.get_full_url()
            stream = resolver.resolve(arg)
        elif hasattr(arg, 'read'):
            #File-like object: create a dummy URI to use as base if none given
            uri = uri or uuid4().urn
            stream = arg
        #XXX: Should we at this point refuse to proceed unless it's a basestring?
        elif sourcetype == XMLSTRING or isxml(arg):
            #See this article about XML detection heuristics
            #http://www.xml.com/pub/a/2007/02/28/what-does-xml-smell-like.html
            uri = uri or uuid4().urn
            stream = StringIO(arg)
        elif is_absolute(arg) and not os.path.isfile(arg):
            #Absolute URI which is not also an existing local file: always
            #fetch from arg itself, but let the caller override the base URI
            stream = resolver.resolve(arg)
            uri = uri or arg
        #If the arg is beyond a certain length, don't even try it as a URI
        elif len(arg) < MAX_URI_LENGTH_FOR_HEURISTIC:
            #Treat as an OS file path; the stream comes from the path's URI
            #even when the caller overrides the base URI
            location = os_path_to_uri(arg)
            stream = resolver.resolve(location)
            uri = uri or location
        else:
            #FIXME L10N
            raise ValueError("Does not appear to be well-formed XML")

        #We might add the ability to load zips, gzips & bzip2s
        #http://docs.python.org/lib/module-zlib.html
        #http://docs.python.org/lib/module-gzip.html
        #http://docs.python.org/lib/module-bz2.html
        #http://docs.python.org/lib/zipfile-objects.html

        #InputSource.__new__ is in C: expat/input_source.c:inputsource_new
        return InputSource.__new__(cls, stream, uri, encoding)
         raise
     except Exception, e:
         pass
 if imt in EXCEL_IMTS:
     source = speadsheet.read(body)
     dataprofile = {}
     try:
         data = ss_data or [ row for row in source.rows() ]
     except (KeyboardInterrupt, SystemExit):
         raise
     except Exception, e:
         raise
         #print >> sys.stderr, e
         #print >> sys.stderr, 'Spreadsheet processing failure.  No data to return.'
     imt = EXCEL_IMTS[0]
 elif isxml(body):
     if MODS_NAMESPACE in body:
         try:
             data, diag_info = mods2json(body, diagnostics)
             imt = 'application/x-mods+xml'
         except amara.ReaderError:
             raise ValueError('Unable to process content')
     else:
         try:
             data = atomparse(body)
             logger.debug("ATOM: " + repr(data))
         except ValueError:
             data = webfeed(body)
             imt = 'application/rss+xml'
             if data is None:
                 raise ValueError('Unable to process content')