Esempio n. 1
0
def xmliter(obj, nodename):
    """Return a iterator of Selector's over all nodes of a XML document,
       given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    """
    nodename_patt = re.escape(nodename)

    HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename_patt, re.S)
    HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename_patt, re.S)
    text = _body_or_str(obj)

    header_start = re.search(HEADER_START_RE, text)
    header_start = header_start.group(1).strip() if header_start else ''
    header_end = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end[1]:].strip() if header_end else ''

    r = re.compile(r'<%(np)s[\s>].*?</%(np)s>' % {'np': nodename_patt},
                   re.DOTALL)
    for match in r.finditer(text):
        nodetext = header_start + match.group() + header_end
        yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]
Esempio n. 2
0
def xmliter(obj, nodename):
    """Return a iterator of Selector's over all nodes of a XML document,
       given tha name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    """
    HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename, re.S)
    HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename, re.S)
    text = _body_or_str(obj)

    header_start = re.search(HEADER_START_RE, text)
    header_start = header_start.group(1).strip() if header_start else ''
    header_end = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end[1]:].strip() if header_end else ''

    r = re.compile(r"<%s[\s>].*?</%s>" % (nodename, nodename), re.DOTALL)
    for match in r.finditer(text):
        nodetext = header_start + match.group() + header_end
        yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]
def xmliter(obj, nodename):
    """Return a iterator of Selector's over all nodes of a XML document,
       given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    """
    nodename_patt = re.escape(nodename)

    DOCUMENT_HEADER_RE = re.compile(r'<\?xml[^>]+>\s*', re.S)
    HEADER_END_RE = re.compile(fr'<\s*/{nodename_patt}\s*>', re.S)
    END_TAG_RE = re.compile(r'<\s*/([^\s>]+)\s*>', re.S)
    NAMESPACE_RE = re.compile(r'((xmlns[:A-Za-z]*)=[^>\s]+)', re.S)
    text = _body_or_str(obj)

    document_header = re.search(DOCUMENT_HEADER_RE, text)
    document_header = document_header.group().strip(
    ) if document_header else ''
    header_end_idx = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end_idx[1]:].strip() if header_end_idx else ''
    namespaces = {}
    if header_end:
        for tagname in reversed(re.findall(END_TAG_RE, header_end)):
            tag = re.search(fr'<\s*{tagname}.*?xmlns[:=][^>]*>',
                            text[:header_end_idx[1]], re.S)
            if tag:
                namespaces.update(
                    reversed(x) for x in re.findall(NAMESPACE_RE, tag.group()))

    r = re.compile(fr'<{nodename_patt}[\s>].*?</{nodename_patt}>', re.DOTALL)
    for match in r.finditer(text):
        nodetext = (document_header + match.group().replace(
            nodename, f'{nodename} {" ".join(namespaces.values())}', 1) +
                    header_end)
        yield Selector(text=nodetext, type='xml')
Esempio n. 4
0
def xmliter(obj, nodename):
    """Return a iterator of Selector's over all nodes of a XML document,
       given tha name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    """
    nodename_patt = re.escape(nodename)

    HEADER_START_RE = re.compile(r"^(.*?)<\s*%s(?:\s|>)" % nodename_patt, re.S)
    HEADER_END_RE = re.compile(r"<\s*/%s\s*>" % nodename_patt, re.S)
    text = _body_or_str(obj)

    header_start = re.search(HEADER_START_RE, text)
    header_start = header_start.group(1).strip() if header_start else ""
    header_end = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end[1] :].strip() if header_end else ""

    r = re.compile(r"<{0}[\s>].*?</{0}>".format(nodename_patt), re.DOTALL)
    for match in r.finditer(text):
        nodetext = header_start + match.group() + header_end
        yield Selector(text=nodetext, type="xml").xpath("//" + nodename)[0]