Code Example #1
    def source_fragment(self):
        value = self.context.body
        xpath = self.request.get('xpath', None)
        namespaces = {}
        if xpath is None or xpath == '/':
            return value.raw_encoded
        elif xpath.startswith('/cnx:'):
            namespaces = {'cnx': 'http://cnx.rice.edu/cnxml'}

        tree = etree.fromstring(value.raw_encoded)
        result = ''
        for node in tree.xpath(xpath, namespaces=namespaces):
            result += etree.tostring(node)

        # if we don't strip namespaces from the fragment, eip loses the
        # plot completely
        result = etree.fromstring(result)
        for prefix, ns in result.nsmap.items():
            if prefix is None:
                prefix = 'nil'
            etree.strip_attributes(result, '{%s}%s' % (ns, prefix))
        etree.cleanup_namespaces(result)
        result = etree.tostring(result)
        result = result.replace('xmlns="http://cnx.rice.edu/cnxml" ', '')

        return result
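A note on the namespace dance above: lxml does not treat xmlns declarations as ordinary attributes, so strip_attributes cannot remove them directly; the usual recipe is to strip the namespaced attributes (addressed in Clark notation) and then let cleanup_namespaces prune declarations that nothing references any more. A minimal sketch:

from lxml import etree

frag = etree.fromstring('<m xmlns:x="urn:x" x:a="1"/>')
etree.strip_attributes(frag, '{urn:x}a')  # Clark notation: {namespace-uri}localname
etree.cleanup_namespaces(frag)            # prunes the now-unused xmlns:x declaration
print(etree.tostring(frag))               # b'<m/>'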
Code Example #2
File: insertfrequency.py Project: necrop/gel_build
def insert_frequency(in_dir, out_dir, freq_dir):
    """
    Find frequency values in the frequency_build data and insert
    them into the GEL data.
    """
    iterator = FileIterator(in_dir=in_dir, out_dir=out_dir, verbosity='low')
    frequency_finder = FrequencyMemo(freq_dir)

    for filecontent in iterator.iterate():
        for entry in filecontent.entries:
            for wordclass_set in entry.wordclass_sets():
                etree.strip_attributes(wordclass_set.node, 'size')

                tables = {}
                for type in wordclass_set.types():
                    frequencies = frequency_finder.find_frequencies(type.id)
                    if frequencies:
                        tables[type.id] = FrequencyTable(data=frequencies)
                    else:
                        tables[type.id] = None

                for type in wordclass_set.types():
                    if tables[type.id]:
                        type.node.append(tables[type.id].to_xml())

                non_null_tables = [table for table in tables.values() if table]
                if non_null_tables:
                    wcs_table = sum_frequency_tables(non_null_tables)
                    wordclass_set.node.append(wcs_table.to_xml())
Code Example #3
File: entrycomponent.py Project: necrop/pylib3.2
    def strip_attributes(self, attributes):
        """
        Remove list of attributes from the node.

        (Wrapper for etree.strip_attributes())
        """
        etree.strip_attributes(self.node, attributes)
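For reference, etree.strip_attributes(element_or_tree, *names) deletes the named attributes from the element and from every descendant in one pass, which is what makes a thin wrapper like this worthwhile. A minimal sketch:

from lxml import etree

root = etree.fromstring('<root a="1"><child a="2" b="3"/></root>')
etree.strip_attributes(root, 'a')  # removes 'a' on the root and on all descendants
print(etree.tostring(root))        # b'<root><child b="3"/></root>'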
Code Example #4
    def xml(self):
        """Filtered and stripped serialized document."""

        if not hasattr(self, "_xml"):
            try:
                xml = etree.tostring(self.doc.resolved, encoding="utf-8")
                parser = etree.XMLParser(remove_blank_text=True)
                root = etree.fromstring(xml, parser)
                first = True
                for node in root.findall("SummaryMetaData/MainTopics"):
                    if first:
                        first = False
                    else:
                        parent = node.getparent()
                        parent.remove(node)
                for node in root.xpath(self.CHANGES):
                    parent = node.getparent()
                    parent.remove(node)
                etree.strip_elements(root, with_tail=False, *self.STRIP)
                etree.strip_attributes(root, "PdqKey")
                opts = dict(pretty_print=True, encoding="unicode")
                self._xml = etree.tostring(root, **opts)
            except:
                logger.exception("failure processing XML")
                bail("failure processing XML")
        return self._xml
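The with_tail=False flag in the strip_elements call above is significant: by default strip_elements also deletes the tail text that follows each removed element, while with_tail=False preserves it. A minimal sketch:

from lxml import etree

root = etree.fromstring('<p>keep <b>drop</b> tail</p>')
etree.strip_elements(root, 'b', with_tail=False)  # the <b> subtree goes, ' tail' stays
print(etree.tostring(root))                       # b'<p>keep  tail</p>'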
Code Example #5
    def clean_html(self, html):
        if not isinstance(html, unicode):
            raise ValueError('We only support cleaning unicode HTML fragments')

        # We wrap the content up in an extra div tag (otherwise lxml does weird things to it, like adding in <p> tags)
        divnode = fromstring(u'<div>' + html + u'</div>')
        self(divnode)

        # Strip all class attributes
        etree.strip_attributes(divnode, 'class')

        for style in divnode.xpath("//@style"):
            parent = style.getparent()
            try:
                cssStyle = cssutils.parseStyle(style)
            except Exception, e:
                logging.info("Style %s failed to parse with error %s." %
                             (style, e))
                parent.attrib.pop('style', None)
                continue

            # Set the line separator so that the style gets serialized
            cssutils.ser.prefs.lineSeparator = ''
            # Only allow valid style properties
            cssutils.ser.prefs.validOnly = True

            new_style = cssStyle.cssText
            if not new_style.strip():
                parent.attrib.pop('style', None)
            else:
                parent.attrib['style'] = new_style
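Note that the two cssutils serializer preferences set inside the loop are global, so they affect every later cssText serialization as well. A minimal sketch of the filtering effect, assuming cssutils is installed (the property names are illustrative only):

import cssutils

cssutils.ser.prefs.lineSeparator = ''  # serialize declarations on a single line
cssutils.ser.prefs.validOnly = True    # drop declarations cssutils considers invalid

style = cssutils.parseStyle('color: red; bogus-prop: 1')
print(style.cssText)                   # only the valid declaration survives: 'color: red'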
Code Example #6
 def handle_unit_node(self, node):
     original_id = node.get("id")
     filename = node.get("id").split("__%04__")[1]
     node.set("id", original_id.split("__%04__").pop())
     for k in [i for i in node.keys() if i != "id"]:
         etree.strip_attributes(node, k)
     self.create_file_element(node, filename)
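Since strip_attributes accepts any number of attribute names, the per-key loop above can also be collapsed into one traversal; a self-contained sketch of the equivalent form (the element here is illustrative):

from lxml import etree

node = etree.fromstring('<unit id="u1" state="old" rev="3"/>')
extras = [k for k in node.keys() if k != "id"]  # every attribute except "id"
if extras:
    etree.strip_attributes(node, *extras)       # one pass instead of one per key
print(etree.tostring(node))                     # b'<unit id="u1"/>'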
Code Example #7
File: extractor.py Project: 88037199/chopper
    def rel_to_abs(self, base_url):
        """
        Convert relative links in the HTML content to absolute links.
        """
        # Delete target attributes
        strip_attributes(self.tree, "target")

        # Absolute links
        self.tree.rewrite_links(
            lambda link: urljoin(base_url, link) if not link.startswith(self.rel_to_abs_excluded_prefixes) else link
        )

        # Extra attributes
        onclick_elements = self.tree.xpath("//*[@onclick]")

        for element in onclick_elements:
            # Replace attribute with absolute URL
            element.set(
                "onclick",
                self.javascript_open_re.sub(
                    lambda match: "%s%s%s"
                    % (match.group("opening"), urljoin(base_url, match.group("url")), match.group("ending")),
                    element.get("onclick"),
                ),
            )
Code Example #8
File: gp_basics.py Project: djw4u/GenPic
 def fit_svg_circles(self):
     elem_circles = self.root.findall('circle')
     for e in elem_circles:
         e.attrib['r'] = "{:0.2f}".format(max(float(e.attrib['height']),
                                              float(e.attrib['width'])) / 2)
         e.attrib['cx'] = "{:0.2f}".format(float(e.attrib['x']) + float(e.attrib['r']))
         e.attrib['cy'] = "{:0.2f}".format(float(e.attrib['y']) + float(e.attrib['r']))
         etree.strip_attributes(e, 'x', 'y', 'width', 'height', 'transform')
Code Example #9
File: dao.py Project: hbast/dao
def reduce_block(node):
    logger.debug("reduce_block() with node: %s %s", node.tag, node.attrib)
    count_all_children = len(node.findall('./', namespaces))
    count_fob_children = len(node.findall('./fo:block', namespaces))
    count_fom_children = len(node.findall('./fo:marker', namespaces))
    logger.debug("amount of children:     %s", count_all_children)
    logger.debug("amount of fob children: %s", count_fob_children)
    logger.debug("node contains any text: %s",
                 "None" if node.text is None else "yes")

    if (count_all_children == 1 and count_fob_children == 1 and node.text is None) or \
            (count_all_children == 2 and count_fob_children == 1 and count_fom_children == 1 and node.text is None):
        logger.debug("merge possible:         yes")

        child = node.find('fo:block', namespaces)
        marker = node.find('fo:marker', namespaces)
        parent = node.getparent()
        # contains the position of the current node in the parent tree
        pos = parent.index(node)

        # merging attributes
        # child attributes overwrite parent attributes
        merged_attrib = merge_two_dicts(dict(node.attrib), dict(child.attrib))
        logger.debug("child attribs:          %s %s", len(child.attrib),
                     child.attrib)
        logger.debug("node attribs:           %s %s", len(node.attrib),
                     node.attrib)
        logger.debug("merged attribs          %s %s", len(merged_attrib),
                     merged_attrib)

        # merging
        # 1) move child to the position of node in parent tree
        # 2) replace child's attributes with the merged attribs
        # 3) remove node from parent tree
        parent.insert(pos, child)
        etree.strip_attributes(child, '*')
        for k, v in merged_attrib.iteritems():
            child.set(k, v)
        parent.remove(node)

        # if there is a fo:marker block, it also has to be moved (in front of the fo:block)
        if marker is not None:
            parent.insert(pos, marker)

        # because the node order has changed (node was replaced by child),
        # the child must be analyzed as well, so call reduce_block on the child at its new position
        reduce_block(parent[pos])

    else:
        logger.debug("merge possible:         no")
        logger.debug("children: ")
        for child in node.findall('./fo:block', namespaces):
            logger.debug("%s %s", child.tag, child.attrib)

        for child in node.findall('./fo:block', namespaces):
            reduce_block(child)

    return
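The strip_attributes(child, '*') call above relies on wildcard matching: attribute names may contain wildcards as in Element.iter, and '*' clears every attribute. A minimal sketch:

from lxml import etree

el = etree.fromstring('<block a="1" b="2" c="3"/>')
etree.strip_attributes(el, '*')  # wildcard: delete every attribute in the subtree
print(etree.tostring(el))        # b'<block/>'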
Code Example #10
File: asdf_write.py Project: mgeier/ssr-utils
 def Write(self, fxml):
     etree.strip_attributes(
         self.root, '{http://codespeak.net/lxml/objectify/pytype}pytype')
     objectify.deannotate(self.root, xsi_nil=True)
     etree.cleanup_namespaces(self.root)
     print >> fxml, (etree.tostring(self.root,
                                    pretty_print=True,
                                    xml_declaration=True,
                                    encoding='utf-8'))
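This is the standard recipe for serializing an objectify tree as plain XML: strip the py:pytype annotations, drop the xsi:nil markers, then prune the namespace declarations that are no longer used. objectify.deannotate can also handle the attribute stripping on its own; a sketch of the one-call variant:

from lxml import etree, objectify

root = objectify.Element('root')
root.value = 42                  # objectify annotates this child with py:pytype
objectify.deannotate(root, pytype=True, xsi=True, xsi_nil=True)
etree.cleanup_namespaces(root)
print(etree.tostring(root))      # b'<root><value>42</value></root>'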
Code Example #11
File: gp_basics.py Project: djw4u/GenPic
 def fit_svg_polygons(self):
     elem_polygons = self.root.findall('polygon')
     for e in elem_polygons:
         e.attrib['width'] = e.attrib['height']
          t = create_triangle(float(e.attrib['x']), float(e.attrib['y']),
                              max(float(e.attrib['height']), float(e.attrib['width'])))
          e.attrib['points'] = "{:0.2f},{:0.2f} {:0.2f},{:0.2f} {:0.2f},{:0.2f}".format(*t[:-2])
         etree.strip_attributes(e, 'x', 'y', 'width', 'height')
Code Example #12
File: waaswrapper.py Project: chozelinek/sacoco
 def remove_namespaces(self, tree):
     for elem in tree.getiterator():
         if not hasattr(elem.tag, 'find'): continue  # (1)
         i = elem.tag.find('}')
         if i >= 0:
             elem.tag = elem.tag[i+1:]
     objectify.deannotate(tree, cleanup_namespaces=True)
     etree.strip_attributes(tree, '{}id'.format('{'+self.xml+'}'))
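The tag-rewriting loop above keeps only the local part of each tag name, i.e. everything after the '}' of Clark notation. The same idea written with etree.QName; a minimal sketch:

from lxml import etree

tree = etree.fromstring('<r xmlns="urn:x"><a/></r>')
for elem in tree.iter():
    if isinstance(elem.tag, str):  # skip comments and PIs, whose tag is not a string
        elem.tag = etree.QName(elem).localname
etree.cleanup_namespaces(tree)     # drop the now-unused default namespace
print(etree.tostring(tree))        # b'<r><a/></r>'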
Code Example #13
def addpoint(Loc_data, Route_data):
    new_element = objectify.Element('CoursePoint')
    new_element.Name = Route_data[0][0:10]
    new_element.Time = Loc_data[0]
    new_element.Position = ''
    new_element.Position.LatitudeDegrees = Loc_data[1]
    new_element.Position.LongitudeDegrees = Loc_data[2]
    new_element.PointType = Route_data[1]
    new_element.Notes = Route_data[3]
    etree.strip_attributes(new_element, '{http://codespeak.net/lxml/objectify/pytype}pytype')
    etree.strip_attributes(new_element, '{http://www.w3.org/2001/XMLSchema-instance}nil')
    etree.cleanup_namespaces(new_element)
    return new_element
Code Example #14
File: booktype.py Project: SoldierGamma/EWR
    def html_before_write(self, book, chapter):
        from lxml import etree

        try:
            from urlparse import urlparse, urljoin
        except ImportError:
            from urllib.parse import urlparse, urljoin

        try:
            tree = parse_html_string(chapter.content)
        except:
            return

        root = tree.getroottree()

        if len(root.find('body')) != 0:
            body = tree.find('body')

            # should also be aware to handle
            # ../chapter/
            # ../chapter/#reference
            # ../chapter#reference

            for _link in body.xpath('//a'):
                # This is just temporary for the footnotes
                if _link.get('href', '').find('InsertNoteID') != -1:
                    _ln = _link.get('href', '')
                    i = _ln.find('#')
                    _link.set('href', _ln[i:])

                    continue

                _u = urlparse(_link.get('href', ''))

                # Let us care only for internal links at the moment
                if _u.scheme == '':
                    if _u.path != '':
                        _link.set('href', '%s.xhtml' % _u.path)

                    if _u.fragment != '':
                        _link.set(
                            'href',
                            urljoin(_link.get('href'), '#%s' % _u.fragment))

                    if _link.get('name') is not None:
                        _link.set('id', _link.get('name'))
                        etree.strip_attributes(_link, 'name')

        chapter.content = etree.tostring(tree,
                                         pretty_print=True,
                                         encoding='utf-8')
Code Example #15
    def from_xml(self, content):
        tree = etree.fromstring(content)
        NSMAP = ['codeList', 'codeListValue', 'schemaLocation']  # attribute names to strip, not a namespace map as the name suggests

        etree.strip_attributes(tree, *NSMAP)
        root = tree
        etree.cleanup_namespaces(root)

        namespaces = {
            'gvq': '',
            'gco': '',
            'gmd19157': '',
            'updated19115': '',
            'gmd': '',
            'gml': '',
            'xsi': '',
            'xmlns': '',
        }

        #Some fields are xml text fields. The fields must be unique inside the model.
        # Store those elements temporarily.
        listOfTextFields = getXMLTextFields()
        fieldDictionary = dict()
        namespaceDict, originalNamespace = getNamespaceInfo()

        for field in listOfTextFields:
            foundElements = root.findall(".//" + namespaceDict[field] + field,
                                         namespaces=originalNamespace)
            valueList = list()
            for element in foundElements:
                valueList.append(''.join(
                    [etree.tostring(child) for child in element]))
            fieldDictionary[field] = valueList

        xmlcontent = etree.tostring(root)
        jsondata = xmltodict.parse(xmlcontent, namespaces=namespaces)

        #Replace all textFields with the xml text.
        for field in listOfTextFields:
            i = 0
            replaceDictValueWithStringElement(jsondata, field, i,
                                              fieldDictionary)

        if 'GVQ_FeedbackCollection' in jsondata:
            item = jsondata['GVQ_FeedbackCollection']['item']
        elif 'item' in jsondata:
            item = jsondata['item']
        else:
            item = jsondata
        self.from_json(json.dumps(item))
        return item
Code Example #16
File: morphset.py Project: necrop/pylib3.2
    def to_node(self, serialized=False):
        node = etree.Element('morphSet', pos=self.baseclass)
        for unit in self.morphunits:
            child = unit.to_node()
            node.append(child)
        etree.strip_attributes(node, 'variantType')
        node.set('variantType', self.variant_type)
        node.set('score', str(self.score))
        node.set('sort', self.lexical_sort())

        if serialized:
            return etree.tostring(node)
        else:
            return node
Code Example #17
    def clean_html(self, html):
        if not isinstance(html, str):
            raise ValueError('We only support cleaning unicode HTML fragments')

        # We wrap the content up in an extra div tag (otherwise lxml does weird things to it, like adding in <p> tags)
        divnode = fromstring('<div>' + html + '</div>')
        self(divnode)

        # Strip all class attributes
        etree.strip_attributes(divnode, 'class')

        # Drop all xml:lang and lang attributes, and handle the
        # stripping of any bad css styles
        # Also drop id and class attributes - these are not useful in RichTextEditor
        for node in divnode.xpath("//*"):
            for key, value in node.attrib.items():
                if key.lower() in ('xml:lang', 'lang', 'id', 'class'):
                    node.attrib.pop(key, None)
                elif 'style' == key.lower():
                    try:
                        cssStyle = cssutils.parseStyle(value)
                    except Exception as e:
                        logging.info(
                            "Style %s failed to parse with error %s." %
                            (value, e))
                        node.attrib.pop(key, None)
                        continue

                    # Set the line separator so that the style gets serialized
                    cssutils.ser.prefs.lineSeparator = ''
                    # Only allow valid style properties
                    cssutils.ser.prefs.validOnly = True

                    new_style = cssStyle.cssText
                    if not new_style.strip():
                        node.attrib.pop(key, None)
                    else:
                        node.attrib[key] = new_style
            # Drop all empty span tags
            if node.tag == 'span' and not list(node.keys()):
                node.drop_tag()

        #Now unwrap the divnode (i.e. just serialize the children of our extra div node)
        cleaned = saxutils.escape(divnode.text) if divnode.text else ''

        for n in divnode:
            cleaned += tostring(n, encoding='unicode', method='xml')
        return cleaned
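drop_tag(), used for the empty spans above, is lxml.html's single-element counterpart of strip_tags: it removes the element itself while keeping its children, text, and tail in place. A minimal sketch:

from lxml.html import fromstring, tostring

div = fromstring('<div>a <span>b</span> c</div>')
div.find('span').drop_tag()  # unwrap the span in place
print(tostring(div))         # b'<div>a b c</div>'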
Code Example #18
    def from_xml(self, content):
        tree = etree.fromstring(content)
        NSMAP = ['codeList', 'codeListValue', 'schemaLocation']  # attribute names to strip, not a namespace map as the name suggests

        etree.strip_attributes(tree, *NSMAP)
        root = tree
        etree.cleanup_namespaces(root)

        namespaces = {
             'gvq'         :'',
             'gco'         :'',
             'gmd19157'    :'',
             'updated19115':'',
             'gmd'         :'',
             'gml'         :'',
             'xsi'         :'',
             'xmlns'       :'',
             }

        #Some fields are xml text fields. The fields must be unique inside the model.
        # Store those elements temporarily.
        listOfTextFields = getXMLTextFields()
        fieldDictionary = dict()
        namespaceDict, originalNamespace =  getNamespaceInfo()

        for field in listOfTextFields:
            foundElements = root.findall(".//"+namespaceDict[field]+field, namespaces=originalNamespace)
            valueList = list()
            for element in foundElements:
                valueList.append(''.join([etree.tostring(child) for child in element]))
            fieldDictionary[field] = valueList

        xmlcontent = etree.tostring(root)
        jsondata = xmltodict.parse(xmlcontent, namespaces=namespaces)

        #Replace all textFields with the xml text.
        for field in listOfTextFields:
            i = 0
            replaceDictValueWithStringElement(jsondata, field, i, fieldDictionary)

        if 'GVQ_FeedbackCollection' in jsondata:
            item = jsondata['GVQ_FeedbackCollection']['item']
        elif 'item' in jsondata:
            item = jsondata['item']
        else:
            item = jsondata
        self.from_json(json.dumps(item))
        return item
Code Example #19
File: wikiextractor.py Project: chozelinek/sacoco
 def create_xml(self,tei,revision_id):
     """Create a simplified XML file only containing the text to be processed with WebLicht."""
     for elem in tei.getiterator():
         if not hasattr(elem.tag, 'find'): continue  # (1)
         i = elem.tag.find('}')
         if i >= 0:
             elem.tag = elem.tag[i+1:]
     objectify.deannotate(tei, cleanup_namespaces=True)
     etree.strip_attributes(tei, '{}id'.format('{'+self.xml+'}'))
     content = tei.xpath('./text/body/div')[0]
      text = etree.Element('text', id='wiki_' + revision_id)
     text.append(content)
     outpath = os.path.join(self.xmldir,'wiki_'+revision_id+'.xml')
     tree = etree.ElementTree(text)
      tree.write(outpath, encoding='utf-8', pretty_print=True, xml_declaration=True)
Code Example #20
File: booktype.py Project: 171230839/ebooklib
    def html_before_write(self, book, chapter):
        from lxml import etree

        try:
            from urlparse import urlparse, urljoin
        except ImportError:
            from urllib.parse import urlparse, urljoin

        try:
            tree = parse_html_string(chapter.content)
        except:
            return

        root = tree.getroottree()

        if len(root.find('body')) != 0:
            body = tree.find('body')

            # should also be aware to handle
            # ../chapter/
            # ../chapter/#reference
            # ../chapter#reference

            for _link in body.xpath('//a'):
                # This is just temporary for the footnotes
                if _link.get('href', '').find('InsertNoteID') != -1:
                    _ln = _link.get('href', '')
                    i = _ln.find('#')
                    _link.set('href', _ln[i:])

                    continue

                _u = urlparse(_link.get('href', ''))

                # Let us care only for internal links at the moment
                if _u.scheme == '':
                    if _u.path != '':
                        _link.set('href', '%s.xhtml' % _u.path)
                    
                    if _u.fragment != '':
                        _link.set('href', urljoin(_link.get('href'), '#%s' % _u.fragment))

                    if _link.get('name') is not None:
                        _link.set('id', _link.get('name'))
                        etree.strip_attributes(_link, 'name')
                    
        chapter.content = etree.tostring(tree, pretty_print=True, encoding='utf-8')
Code Example #21
def query_xpath_x(cnt_str,
                  cc_xpath,
                  removeTags=None,
                  removeAtts=None,
                  fixNode=' ',
                  rstmode='html'):
    rs, msg = query_xpath(cnt_str, cc_xpath, fixNode)
    if msg or len(rs) == 0:
        return rs, msg

    for i in range(len(rs)):
        if isinstance(rs[i], etree._Element):
            if removeTags:
                etree.strip_tags(rs[i], removeTags)
            if removeAtts:
                etree.strip_attributes(rs[i], removeAtts)
            rs[i] = etree.tostring(rs[i], encoding='unicode', method=rstmode)

    return rs, msg
Code Example #22
File: kdb4.py Project: ifooth/libkeepass
    def protect(self):
        """
        Find all elements with a 'Protected=False' attribute and replace the
        text with a protected value in the XML element tree. If there was a
        'ProtectedValue' attribute, it is deleted and the 'Protected' attribute
        is set to 'True'. The 'ProtectPassword' element in the 'Meta' section is
        also set to 'True'.

        This does not just restore the previous protected value, but reencrypts
        all text values of elements with 'Protected=False'. So you could use
        this after modifying a password, adding a completely new entry or
        deleting entry history items.
        """
        self._reset_salsa()
        self.obj_root.Meta.MemoryProtection.ProtectPassword._setText('True')
        for elem in self.obj_root.iterfind('.//Value[@Protected="False"]'):
            etree.strip_attributes(elem, 'ProtectedValue')
            elem.set('Protected', 'True')
            elem._setText(self._protect(elem.text))
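The tree manipulation at the core of protect() is independent of libkeepass: iterfind with an attribute predicate selects the elements, strip_attributes deletes one attribute, and set() rewrites another. A standalone sketch of just that part:

from lxml import etree

root = etree.fromstring(
    '<Group><Value Protected="False" ProtectedValue="old">secret</Value></Group>')
for elem in root.iterfind('.//Value[@Protected="False"]'):
    etree.strip_attributes(elem, 'ProtectedValue')
    elem.set('Protected', 'True')
print(etree.tostring(root))
# b'<Group><Value Protected="True">secret</Value></Group>'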
Code Example #23
File: kdb4.py Project: shij10/libkeepass
 def protect(self):
     """
     Find all elements with a 'Protected=False' attribute and replace the
     text with a protected value in the XML element tree. If there was a
     'ProtectedValue' attribute, it is deleted and the 'Protected' attribute
     is set to 'True'. The 'ProtectPassword' element in the 'Meta' section is
     also set to 'True'.
     
     This does not just restore the previous protected value, but reencrypts
     all text values of elements with 'Protected=False'. So you could use
     this after modifying a password, adding a completely new entry or
     deleting entry history items.
     """
     self._reset_salsa()
     self.obj_root.Meta.MemoryProtection.ProtectPassword._setText('True')
     for elem in self.obj_root.iterfind('.//Value[@Protected="False"]'):
         etree.strip_attributes(elem, 'ProtectedValue')
         elem.set('Protected', 'True')
         elem._setText(self._protect(elem.text))
Code Example #24
File: fengwen.py Project: rdavison/fengwen
def processReuters(html):
    # initialize return
    article_info = dict()

    article_info['title'] = html.find('.//meta[@property=\'og:title\']').get('content')

    text_elem = html.find('.//span[@id=\'articleText\']')

    try:
        article_info['author'] = etree.tostring(text_elem.find('.//p[@class=\'byline\']')).replace('\n','')
    except:
        article_info['author'] = "By Unknown"

    etree.strip_elements(text_elem, 'div')
    etree.strip_tags(text_elem, 'span')
    for elem in text_elem.findall('.//p'):
        etree.strip_attributes(elem)
    article_info['story'] = ''.join([etree.tostring(child) for child in text_elem.iterchildren()]).replace('\n','')

    return article_info
Code Example #25
def htm2x(f, type, lang):
    # delete cell spanning, since pandoc does not support it
    tree = html.parse(f,
                      parser=html.HTMLParser(encoding='utf-8', compact=True))
    for table in tree.xpath(".//table"):
        firstrow = table.xpath(".//tr")[0]
        for cell in firstrow.getchildren():
            try:
                for i in range(1, int(cell.attrib["colspan"])):
                    firstrow.append(etree.XML("<td></td>"))
            except KeyError:
                pass
    etree.strip_attributes(tree, "class", "style", "colspan", "rowspan")
    with open(f, 'wb') as file:
        file.write(html.tostring(tree, pretty_print=True, encoding='utf-8'))

    subprocess.run([
        cfg['pandoc']['location'], '-o', 'tmp/print.' + type, f,
        '--reference-doc=./static/mat/templates/' + lang + '_msword.' + type
    ])
Code Example #26
File: extractor.py Project: Live-Lyrics/chopper
    def rel_to_abs(self, base_url):
        """
        Convert relative links in the HTML content to absolute links.
        """
        # Delete target attributes
        strip_attributes(self.tree, 'target')

        # Absolute links
        self.tree.rewrite_links(
            lambda link: urljoin(base_url, link)
            if not link.startswith(self.rel_to_abs_excluded_prefixes) else link)

        # Extra attributes
        onclick_elements = self.tree.xpath('//*[@onclick]')

        for element in onclick_elements:
            # Replace attribute with absolute URL
            element.set('onclick', self.javascript_open_re.sub(
                lambda match: '%s%s%s' % (match.group('opening'),
                        urljoin(base_url, match.group('url')),
                        match.group('ending')),
                element.get('onclick')))
Code Example #27
File: CleanXHTML.py Project: dave42/j5basic
    def clean_html(self, html):
        if not isinstance(html, unicode):
            raise ValueError('We only support cleaning unicode HTML fragments')

        # We wrap the content up in an extra div tag (otherwise lxml does weird things to it, like adding in <p> tags)
        divnode = fromstring(u'<div>' + html + u'</div>')
        self(divnode)

        # Strip all class attributes
        etree.strip_attributes(divnode, 'class')

        # Drop all xml:lang and lang attributes, and handle the
        # stripping of any bad css styles
        # Also drop id and class attributes - these are not useful in RichTextEditor
        for node in divnode.xpath("//*"):
            for key, value in node.attrib.iteritems():
                if key.lower() in ('xml:lang', 'lang','id','class'):
                    node.attrib.pop(key, None)
                elif 'style' == key.lower():
                    try:
                        cssStyle = cssutils.parseStyle(value)
                    except Exception, e:
                        logging.info("Style %s failed to parse with error %s." % (value, e))
                        node.attrib.pop(key, None)
                        continue

                    # Set the line separator so that the style gets serialized
                    cssutils.ser.prefs.lineSeparator = ''
                    # Only allow valid style properties
                    cssutils.ser.prefs.validOnly = True

                    new_style = cssStyle.cssText
                    if not new_style.strip():
                        node.attrib.pop(key, None)
                    else:
                        node.attrib[key] = new_style
            # Drop all empty span tags
            if node.tag == 'span' and not node.keys():
                node.drop_tag()
Code Example #28
def xml_parser(xml_file):
    """解析xml文件"""
    with open(xml_file, 'r') as file_object:
        xml_content = file_object.read().encode('utf-8')
        doc = etree.XML(xml_content)
        # the entire call (mixed channel)
        global total_duration
        total_duration = doc.xpath(r'//instance/@duration')[0]
        total_call = doc.xpath(r'//channel[@no="mix"]//item')
        input_dict, output_dict = {}, {}
        for tc in total_call:
            if tc.xpath(r'@duration'):  # tags with a duration attribute are silent calls
                etree.strip_attributes(tc, 'duration')
                tc.set('energy', '')
                tc.set('speed', '')
            get_attr = tc.xpath('attribute::*')  # get all attribute values of the tag
            template = ('start', 'end', 'energy', 'speed')
            tmp_dict = dict(zip(template, get_attr))
            input_dict[int(tmp_dict['start'])] = tmp_dict  # key by start time

        sorted_list = [tl[1] for tl in sorted(input_dict.items(), key=lambda item: item[0])]  # sort the calls by start time
        for call_num in range(len(sorted_list)):
            output_dict[call_num+1] = sorted_list[call_num]

        # agent calls
        agent_time = doc.xpath(r'//channel[@no="n0"]//time/text()')[0].split(' ') if doc.xpath(r'//channel[@no="n0"]//time/text()') else []
        agent_text = doc.xpath(r'//channel[@no="n0"]//text/text()')[0].split(' ') if doc.xpath(r'//channel[@no="n0"]//text/text()') else []
        agent_call = call_generator(agent_time, agent_text, '0')
        # customer calls
        cust_time = doc.xpath(r'//channel[@no="n1"]//time/text()')[0].split(' ') if doc.xpath(r'//channel[@no="n1"]//time/text()') else []
        cust_text = doc.xpath(r'//channel[@no="n1"]//text/text()')[0].split(' ') if doc.xpath(r'//channel[@no="n1"]//text/text()') else []
        cust_call = call_generator(cust_time, cust_text, '1')
        # merge the agent and customer calls
        agent_call.update(cust_call)
        call_content = {key: value for key, value in sorted(agent_call.items(), key=lambda item: item[0])}
        result = call_match(output_dict, call_content)

        return result
Code Example #29
def parse_content(href):
    """ open file and replace ../img with img and src to data_src for iframes """

    myparser = etree.HTMLParser(encoding="utf-8")
    with open(href, 'r') as file:
        htmltext = file.read()

    tree = etree.HTML(htmltext, parser=myparser)
    # = html.fromstring(filein)

    # removing "Retour au cours" links
    try:
        links = tree.xpath('//a[contains(@href, "COURSEVIEWBYID")]')
        print (" ----- found links %s" % str(links))
        for l in links:
            l.getparent().remove(l)
    except:
        pass
    # Adapt img links to direct path to img instead of ../img
    try:
        imgs = tree.xpath('//img')  # we get a list of elements
        for img in imgs:
            new_src = img.get('src').replace('../img', 'img')
            img.set('src', new_src)
    except Exception as e:
        pass
    # For all iframes, rename 'src' attribute to 'data-src'
    try:
        iframes = tree.xpath('//iframe')
        for iframe in iframes:
            iframe.attrib['data-src'] = iframe.attrib['src']
            etree.strip_attributes(iframe, 'src')
    except Exception as e:
        pass

    return html.tostring(tree, encoding='utf-8').decode('utf-8')
Code Example #30
def parse_content(href):
    """ open file and replace ../img with img and src to data_src for iframes """

    myparser = etree.HTMLParser(encoding="utf-8")
    with open(href, 'r') as file:
        htmltext = file.read()

    tree = etree.HTML(htmltext, parser=myparser)
    # = html.fromstring(filein)

    # removing "Retour au cours" links
    try:
        links = tree.xpath('//a[contains(@href, "COURSEVIEWBYID")]')
        print(" ----- found links %s" % str(links))
        for l in links:
            l.getparent().remove(l)
    except:
        pass
    # Adapt img links to direct path to img instead of ../img
    try:
        imgs = tree.xpath('//img')  # we get a list of elements
        for img in imgs:
            new_src = img.get('src').replace('../img', 'img')
            img.set('src', new_src)
    except Exception as e:
        pass
    # For all iframes, rename 'src' attribute to 'data-src'
    try:
        iframes = tree.xpath('//iframe')
        for iframe in iframes:
            iframe.attrib['data-src'] = iframe.attrib['src']
            etree.strip_attributes(iframe, 'src')
    except Exception as e:
        pass

    return html.tostring(tree, encoding='utf-8').decode('utf-8')
Code Example #31
File: tools.py Project: vled12/tkp_auto
def htm2x(f, type, lang, location):
    # delete cell spanning, since pandoc does not support it
    tree = html.parse(f,
                      parser=html.HTMLParser(encoding='utf-8', compact=True))
    for table in tree.xpath(".//table"):
        if table.xpath(".//tr"):
            firstrow = table.xpath(".//tr")[0]
            for cell in firstrow.getchildren():
                try:
                    for i in range(1, int(cell.attrib["colspan"])):
                        firstrow.append(etree.XML("<td></td>"))
                except KeyError:
                    pass
    etree.strip_attributes(tree, "class", "style", "colspan", "rowspan")
    with open(f, 'wb') as file:
        file.write(html.tostring(tree, pretty_print=True, encoding='utf-8'))

    pypandoc.convert_file(source_file="tmp/print.html",
                          to='docx',
                          outputfile='tmp/print.docx',
                          extra_args=[
                              "--reference-doc",
                              "static/mat/templates/" + lang + "_msword.docx"
                          ])
Code Example #32
File: bloomberg.py Project: rdavison/fengwen
def bloombergStoryParser(url):
    from lxml import etree
    parsed_data = dict()
    # webpage = open('bloomberg/sample_article1.html').read()
    webpage = requestHTML(url)
    html = etree.HTML(webpage)
    # header - title/author/date
    header = html.find('.//div[@id=\'story_head\']')
    header_title = header.find('.//div[@id=\'disqus_title\']/h1').text
    header_byline = header.find('.//cite[@class=\'byline\']').text
    header_date = header.find('.//span[@class=\'datestamp\']/noscript').text
    parsed_data['header_title'] = header_title
    parsed_data['header_author'] = header_byline
    parsed_data['header_date'] = header_date
    # story - content
    content = html.find('.//div[@id=\'story_content\']')
    etree.strip_elements(content, 'div', 'script', 'ul')
    etree.strip_attributes(content)
    for h2_tag in content.findall('.//h2'):
        h2_tag.tag = 'h3'
    content_story = etree.tostring(content)
    parsed_data['story_content'] = content_story
    #parsed_data['story_content'] = '~~~Story Content~~~'
    return parsed_data
Code Example #33
 def fix_scripts(self):
     for script in self.element.xpath('.//Script'):
         script.text = etree.CDATA(script.text)
     for text in self.element.xpath('.//Text'):
         content = None
         if text.get('Content') not in ['', None]:
             content = text.get('Content')
         else:
             if str(text.text).isspace():
                 content = ''
             else:
                 content = text.text
         text.text = etree.CDATA(content)
         etree.strip_attributes(text, 'Content')
     for text_box in self.element.xpath('.//TextBox'):
         if text_box.get('Content') not in ['', None]:
             content = text_box.get('Content')
         else:
             if str(text_box.text).isspace():
                 content = ''
             else:
                 content = text_box.text
         text_box.text = etree.CDATA(content)
         etree.strip_attributes(text_box, 'Content')
Code Example #34
File: html_element_proxy.py Project: mouday/lxmlHtml
 def strip_attributes(self, *attribute_names):
     etree.strip_attributes(self.root, *attribute_names)
Code Example #35
File: epub.py Project: tjguk/epub
def convert_to_xhtml(filepath, dirpath):
    #
    # By default the etree parser will open the file
    # in binary mode, leaving the windows-style \r\n linefeeds
    # intact. The end result is #&13 turds all over the XHTML.
    #
    if filepath.startswith("http://"):
        f = urllib.urlopen(filepath)
    else:
        f = open(filepath, "rb")
    try:
        text = f.read()
    finally:
        f.close()
    text = text.replace("\r\n", "\n")
    html = etree.parse(StringIO.StringIO(text), etree.HTMLParser())
    #
    # Strip tags which won't be needed
    #
    for br in html.iter("br"):
        previous = br.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or "") + " "
        else:
            br.tail = " " + (br.tail or "")

    #
    # A little bit dangerous but... the header block on papalencycicals.net uses
    # divs whose generated ids all start with "DMSMenu"
    #
    for div in list(html.iter("div")):
        id = div.get("id")
        if id and id.startswith("DMSMenu"):
            div.getparent().remove(div)

    etree.strip_tags(html, "span", "script", "center", "style", "br", "a", "table", "img", "tr", "td", "font", "div")
    #
    # Remove tag-level styles and alignments
    #
    etree.strip_attributes(html, "class", "style", "align")
    #
    # Remove paragraphs which are completely empty
    #
    for p in html.iter("p"):
        if not list(p) and not (p.text or "").strip() and not (p.tail or "").strip():
            p.getparent().remove(p)

    html_body = html.find("body")
    html_body.attrib.clear()

    xml = etree.XML(XHTML_DOCUMENT)
    xml_body = xml.find("{%s}body" % xml.nsmap[None])
    xml_body.getparent().replace(xml_body, html_body)

    base, ext = os.path.splitext(os.path.basename(filepath))
    output_filepath = os.path.join(dirpath, "%s.xhtml" % base)
    with open(output_filepath, "wb") as f:
        f.write(
            etree.tostring(
                xml,
                pretty_print=True,
                xml_declaration=True,
                encoding="utf-8"
            )
        )
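Note the division of labor above: strip_tags removes the tags themselves but merges their text and children into the parent, whereas strip_elements (and the manual getparent().remove() calls) delete whole subtrees. A minimal sketch of the difference:

from lxml import etree

root = etree.fromstring('<p>a <span>b</span> c</p>')
etree.strip_tags(root, 'span')   # tag gone, its text and tail merged into the parent
print(etree.tostring(root))      # b'<p>a b c</p>'

root = etree.fromstring('<p>a <span>b</span> c</p>')
etree.strip_elements(root, 'span', with_tail=False)
print(etree.tostring(root))      # b'<p>a  c</p>' -- the whole subtree is gone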
Code Example #36
File: gcparse.py Project: ninetythirty/gcparse
def parse_old_conversations(mbox_file, xml_dir, addresses):
    # Until about 2013-05-01 Google used XMPP-like XML for chat
    print('Parsing old-style XML chats... ', file=sys.stdout)
    sys.stdout.flush()
    mbox = mailbox.mbox(mbox_file)
    num_messages = 0
    num_malformed = 0 # no XML
    num_groupchats = 0

    for message in mbox:
        num_messages += 1
        thread_id = message['X-GM-THRID']
        payload = message.get_payload(i=0)
        transfer_encoding = payload['Content-Transfer-Encoding']
        content_type = payload.get_content_type()
        if content_type != 'text/xml':
            # These are either empty messages or email messages that Google
            # confused while attempting to blur the distinction between chat
            # and email. In the latter case, we only care about chat messages
            # here, but the message should be correctly parsed as email
            num_malformed += 1
        else:
            # Message contains good XML, clean it up
            payload_cleaned = clean_xml_payload(payload)
            tree = etree.fromstring(payload_cleaned)
            if tree.xpath('//cli:message[@type="groupchat"]', namespaces={'cli': 'jabber:client'}):
                # Skip group chats
                num_groupchats += 1
                continue
            # Python's ElementTree fails miserably here, use lxml
            xpaths = []
            # Google elements
            signature_xpath = tree.xpath('//met:google-mail-signature', namespaces={'met': 'google:metadata'})
            xpaths.append(signature_xpath)
            delay_xpath = tree.xpath('//jxd:x', namespaces={'jxd': 'jabber:x:delay'})
            xpaths.append(delay_xpath)
            nosave_xpath = tree.xpath('//nos:x', namespaces={'nos': 'google:nosave'})
            xpaths.append(nosave_xpath)
            record_xpath = tree.xpath('//arc:record', namespaces={'arc': 'http://jabber.org/protocol/archive'})
            xpaths.append(record_xpath)
            xhtml_xpath = tree.xpath('//xht:html', namespaces={'xht': 'http://www.w3.org/1999/xhtml'})
            xpaths.append(xhtml_xpath)
            xhtmlim_xpath = tree.xpath('//xim:html', namespaces={'xim': 'http://jabber.org/protocol/xhtml-im'})
            xpaths.append(xhtmlim_xpath)
            gap_xpath = tree.xpath('//con:gap', namespaces={'con': 'google:archive:conversation'})
            xpaths.append(gap_xpath)
            # Jabber "composing" element (someone was typing)
            composing_xpath = tree.xpath('//eve:x', namespaces={'eve': 'jabber:x:event'})
            # Remove junk elements (there are a lot of them)
            xpaths.append(composing_xpath)
            for x in xpaths:
                for dud in x:
                    dud.getparent().remove(dud)
            # Strip Google and experimental(?) AIM attributes
            etree.strip_attributes(tree,
                                   'iconset',
                                   '{google:internal}cid',
                                   '{google:internal}sequence-no',
                                   '{google:internal}time-stamp',
                                   '{google:internal}interop-stanza',
                                   '{google:internal}dual-delivery',
                                   '{google:internal}interop-disable-legacy-archiver',
                                   '{google:aim}new-session',
                                   )
            # Clean up namespaces
            remove_namespace(tree, '{jabber:client}')
            remove_namespace(tree, '{google:archive:conversation}')
            remove_namespace(tree, '{google:timestamp}')
            etree.cleanup_namespaces(tree)
            # Remove /resource from message 'from' and 'to' attributes
            for m in tree.xpath('//message'):
                from_field = m.attrib['from'].split('/')[0]
                to_field = m.attrib['to'].split('/')[0]
                m.attrib['from'] = from_field
                m.attrib['to'] = to_field
                # Record addresses for name map
                addresses[from_field] += 1
                addresses[to_field] += 1

            # Write out data
            f = open('{0}/{1}.conv'.format(xml_dir, thread_id), 'a') # append
            # Format manually b/c lxml's pretty print makes a TON of mistakes
            prev_m_as_string = ''
            for m in tree.xpath('//message'):
                m_as_string = etree.tostring(m)
                # Gotcha: When xpath() is used on an Element, if the XPath
                # expression is relative it's evaluated against the element.
                # If the expression is absolute it's evaluated against the tree
                to_field = m.attrib['to']
                from_field = m.attrib['from']
                body = m.xpath('./body')
                # In the case of sequential messages with identical timestamps,
                # we have to rely on line order in the mbox to order messages
                time_ms = m.xpath('./time')[0].attrib['ms']
                if m_as_string != prev_m_as_string and len(body) != 0:
                    # Don't print duplicate messages (sometimes the entire
                    # message including timestamp is repeated), don't print
                    # empty messages
                    print('  <message to="{0}" from="{1}">'.format(to_field, from_field), file=f)
                    print('    <body>{0}</body>'.format(html.escape(body[0].text)), file=f)
                    print('    <time ms="{0}"/>'.format(time_ms), file=f)
                    print('  </message>', file=f)
                prev_m_as_string = m_as_string
            f.close()

    num_parsed = num_messages - num_malformed - num_groupchats
    if num_malformed:
        print('    Malformed: {0}'.format(num_malformed), file=sys.stdout)
    if num_groupchats:
        print('    Group chats: {0} (unsupported)'.format(num_groupchats), file=sys.stdout)
    print('    Messages parsed: {0}'.format(num_parsed), file=sys.stdout)
    print('DONE', file=sys.stdout)
    return num_parsed
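As the Google-internal names above show, strip_attributes addresses namespaced attributes by their Clark-notation name, '{namespace-uri}localname'; bare names like 'iconset' match attributes that carry no namespace. A minimal sketch:

from lxml import etree

msg = etree.fromstring(
    '<message xmlns:int="google:internal" int:cid="42" iconset="classic"/>')
etree.strip_attributes(msg, '{google:internal}cid', 'iconset')
print(etree.tostring(msg))  # b'<message xmlns:int="google:internal"/>'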
Code Example #37
File: serializers.py Project: phlax/moz_pootle_fs
 def strip_attributes(self, xml, path, attrs):
     for node in xml.xpath(path):
         for k in [i for i in node.keys() if i in attrs]:
             etree.strip_attributes(node, k)
     return xml
Code Example #38
def sch2svg(sch_fp, svg_fp):
    """ converts Kicad .sch (eeschema) to svg leveraging binaries from plotkicadsch
    requires the .lib to be in the same folder
    Parameters:
    ----------
    sch_fp: str
        full path to the Kicad schematics file
    svg_fp: str
        full path to the svg output
    Return:
    -------
    svg_fp: str
        full path to the svg file created
    Usage:
    ------
    N/A
    """
    lib_fp = sch_fp.replace(".sch", "-cache.lib")
    # regenerate only if the .sch is newer than the existing svg:
    sch_is_younger = True
    if exists(svg_fp):
        if getmtime(sch_fp) < getmtime(svg_fp):
            sch_is_younger = False

    if sch_is_younger:
        logger.info(f"calling plotkicadsch on {sch_fp}")
        res = call([plotkicadsch_fp, "-f", sch_fp, "-l", lib_fp])
        try:
            assert res == 0
        except:
            logger.error(
                f"failed to convert sch {sch_fp} lib_fp: {lib_fp} to svg")
            raise

    if sch_is_younger:
        with open(sch_fp.replace(".sch", ".svg"), 'r') as fp:
            fxml = fp.read()
        mxml = etree.fromstring(fxml)

        # REMOVE the canvas lines
        for element in mxml.xpath(".//*[not(node())]"):
            if (element.tag.find("rect")) > 0:
                element.getparent().remove(element)

        #REMOVE the text from the canvas
        for element in mxml.xpath('//*[contains(text(),"Page:")]'):
            element.getparent().remove(element)

        #text in RED and GREEN changed back to black
        for el in mxml.xpath("//svg:text[@fill='#FF0000']",\
            namespaces={'svg':'http://www.w3.org/2000/svg'}):
            el.attrib["fill"] = "#000000"
        for element in mxml.xpath('//svg:text[@fill="#00FF00"]',\
            namespaces={'svg':'http://www.w3.org/2000/svg'}):
            element.attrib["fill"] = "#000000"

        #polyline in dark red to black
        for element in mxml.xpath('//svg:polyline[@stroke="#800000"]',\
            namespaces={'svg':'http://www.w3.org/2000/svg'}):
            element.attrib["stroke"] = "#000000"

        x_min, x_max, y_min, y_max = get_svg_viewbox(mxml)

        for el in mxml.xpath("//*[@viewBox]"):
            width = (x_max - x_min) * 1.2
            height = (y_max - y_min) * 1.2
            el.attrib['viewBox'] = "%s %s %s %s" % (
                x_min - width * 0.1, y_min - height * 0.1, width, height)
            etree.strip_attributes(el, 'width')
            etree.strip_attributes(el, 'height')

        cleaned = etree.tostring(mxml, pretty_print=True, encoding="unicode")
        with open(svg_fp, 'w') as fp:
            fp.write(cleaned)
    return (svg_fp)
Code Example #39
File: easy_xml.py Project: Venkata16924B/tools-1
    def remove_attr(self, attribute: str):
        """
		Remove an attribute from this node.
		"""

        etree.strip_attributes(self.lxml_element, attribute)
Code Example #40
File: gcparse.py Project: ninetythirty/gcparse
def parse_old_conversations(mbox_file, xml_dir, addresses):
    # Until about 2013-05-01 Google used XMPP-like XML for chat
    print('Parsing old-style XML chats... ', file=sys.stdout)
    sys.stdout.flush()
    mbox = mailbox.mbox(mbox_file)
    num_messages = 0
    num_malformed = 0  # no XML
    num_groupchats = 0

    for message in mbox:
        num_messages += 1
        thread_id = message['X-GM-THRID']
        payload = message.get_payload(i=0)
        transfer_encoding = payload['Content-Transfer-Encoding']
        content_type = payload.get_content_type()
        if content_type != 'text/xml':
            # These are either empty messages or email messages that Google
            # confused while attempting to blur the distinction between chat
            # and email. In the latter case, we only care about chat messages
            # here, but the message should be correctly parsed as email
            num_malformed += 1
        else:
            # Message contains good XML, clean it up
            payload_cleaned = clean_xml_payload(payload)
            tree = etree.fromstring(payload_cleaned)
            if tree.xpath('//cli:message[@type="groupchat"]',
                          namespaces={'cli': 'jabber:client'}):
                # Skip group chats
                num_groupchats += 1
                continue
            # Python's ElementTree fails miserably here, use lxml
            xpaths = []
            # Google elements
            signature_xpath = tree.xpath('//met:google-mail-signature',
                                         namespaces={'met': 'google:metadata'})
            xpaths.append(signature_xpath)
            delay_xpath = tree.xpath('//jxd:x',
                                     namespaces={'jxd': 'jabber:x:delay'})
            xpaths.append(delay_xpath)
            nosave_xpath = tree.xpath('//nos:x',
                                      namespaces={'nos': 'google:nosave'})
            xpaths.append(nosave_xpath)
            record_xpath = tree.xpath(
                '//arc:record',
                namespaces={'arc': 'http://jabber.org/protocol/archive'})
            xpaths.append(record_xpath)
            xhtml_xpath = tree.xpath(
                '//xht:html',
                namespaces={'xht': 'http://www.w3.org/1999/xhtml'})
            xpaths.append(xhtml_xpath)
            xhtmlim_xpath = tree.xpath(
                '//xim:html',
                namespaces={'xim': 'http://jabber.org/protocol/xhtml-im'})
            xpaths.append(xhtmlim_xpath)
            gap_xpath = tree.xpath(
                '//con:gap', namespaces={'con': 'google:archive:conversation'})
            xpaths.append(gap_xpath)
            # Jabber "composing" element (someone was typing)
            composing_xpath = tree.xpath('//eve:x',
                                         namespaces={'eve': 'jabber:x:event'})
            # Remove junk elements (there are a lot of them)
            xpaths.append(composing_xpath)
            for x in xpaths:
                for dud in x:
                    dud.getparent().remove(dud)
            # Strip Google and experimental(?) AIM attributes
            etree.strip_attributes(
                tree,
                'iconset',
                '{google:internal}cid',
                '{google:internal}sequence-no',
                '{google:internal}time-stamp',
                '{google:internal}interop-stanza',
                '{google:internal}dual-delivery',
                '{google:internal}interop-disable-legacy-archiver',
                '{google:aim}new-session',
            )
            # Clean up namespaces
            remove_namespace(tree, '{jabber:client}')
            remove_namespace(tree, '{google:archive:conversation}')
            remove_namespace(tree, '{google:timestamp}')
            etree.cleanup_namespaces(tree)
            # Remove /resource from message 'from' and 'to' attributes
            for m in tree.xpath('//message'):
                from_field = m.attrib['from'].split('/')[0]
                to_field = m.attrib['to'].split('/')[0]
                m.attrib['from'] = from_field
                m.attrib['to'] = to_field
                # Record addresses for name map
                addresses[from_field] += 1
                addresses[to_field] += 1

            # Write out data
            f = open('{0}/{1}.conv'.format(xml_dir, thread_id), 'a')  # append
            # Format manually b/c lxml's pretty print makes a TON of mistakes
            prev_m_as_string = ''
            for m in tree.xpath('//message'):
                m_as_string = etree.tostring(m)
                # Gotcha: When xpath() is used on an Element, if the XPath
                # expression is relative it's evaluated against the element.
                # If the expression is absolute it's evaluated against the tree
                to_field = m.attrib['to']
                from_field = m.attrib['from']
                body = m.xpath('./body')
                # In the case of sequential messages with identical timestamps,
                # we have to rely on line order in the mbox to order messages
                time_ms = m.xpath('./time')[0].attrib['ms']
                if m_as_string != prev_m_as_string and len(body) != 0:
                    # Don't print duplicate messages (sometimes the entire
                    # message including timestamp is repeated), don't print
                    # empty messages
                    print('  <message to="{0}" from="{1}">'.format(
                        to_field, from_field),
                          file=f)
                    print('    <body>{0}</body>'.format(
                        html.escape(body[0].text)),
                          file=f)
                    print('    <time ms="{0}"/>'.format(time_ms), file=f)
                    print('  </message>', file=f)
                prev_m_as_string = m_as_string
            f.close()

    num_parsed = num_messages - num_malformed - num_groupchats
    if num_malformed:
        print('    Malformed: {0}'.format(num_malformed), file=sys.stdout)
    if num_groupchats:
        print('    Group chats: {0} (unsupported)'.format(num_groupchats),
              file=sys.stdout)
    print('    Messages parsed: {0}'.format(num_parsed), file=sys.stdout)
    print('DONE', file=sys.stdout)
    return num_parsed
Code Example #41
    def __call__(self, doc):
        """
        Cleans the document.
        """
        if hasattr(doc, "getroot"):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter("image"):
            el.tag = "img"
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add("script")
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only and self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter(etree.Element):
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith("on"):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link, resolve_base_href=False)
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            if not self.inline_style:
                for el in _find_styled_elements(doc):
                    old = el.get("style")
                    new = _css_javascript_re.sub("", old)
                    new = _css_import_re.sub("", new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib["style"]
                    elif new != old:
                        el.set("style", new)
            if not self.style:
                for el in list(doc.iter("style")):
                    if el.get("type", "").lower().strip() == "text/javascript":
                        el.drop_tree()
                        continue
                    old = el.text or ""
                    new = _css_javascript_re.sub("", old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub("", old)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = "/* deleted */"
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add("style")
        if self.inline_style:
            etree.strip_attributes(doc, "style")
        if self.links:
            kill_tags.add("link")
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter("link")):
                if "stylesheet" in el.get("rel", "").lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add("meta")
        if self.page_structure:
            remove_tags.update(("head", "html", "title"))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter("param")):
                found_parent = False
                parent = el.getparent()
                while parent is not None and parent.tag not in ("applet", "object"):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(("applet",))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(("iframe", "embed", "layer", "object", "param"))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add("form")
            kill_tags.update(("button", "input", "select", "textarea"))
        if self.annoying_tags:
            remove_tags.update(("blink", "marquee"))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = "div"
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != "html":
                el.tag = "div"
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags"
                )
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    el = bad.pop(0)
                    el.tag = "div"
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get("rel")
                    if rel:
                        if "nofollow" in rel and " nofollow " in (" %s " % rel):
                            continue
                        rel = "%s nofollow" % rel
                    else:
                        rel = "nofollow"
                    el.set("rel", rel)
Code example #42
File: textclass_xml.py Project: abrennr/drlutils
def build(do_id, c_id):
    """Create a DLXS-appropriate xml file for the item by using data from
    the associated mods, mets, and ocr files.

    After a collection-generic XML is created, it's passed to the
    handle_collection_extras method for any additional collection-specific
    transformation.  Finally, the XML is written to a directory for that
    collection."""
    item = drlutils.django.utils.get_item(do_id)
    mods_file = drlutils.django.utils.get_mods_path(item)
    if not mods_file:
        return do_id + ' - no mods file found'
    mets_file = drlutils.django.utils.get_mets_path(item)
    if not mets_file:
        return do_id + ' - no mets file found'
    try:
        # mods is transformed into the header
        mods = etree.parse(open(mods_file, 'r'))
        mods_stylesheet = etree.parse(open(drlutils.config.MODS2DLXS_XSL, 'r'))
        transform_mods = etree.XSLT(mods_stylesheet)
        dlxs = transform_mods(mods)
        # mets is transformed into the body
        mets = etree.parse(open(mets_file, 'r'))
        mets_stylesheet = etree.parse(open(drlutils.config.METS2DLXS_XSL, 'r'))
        transform_mets = etree.XSLT(mets_stylesheet)
        body = transform_mets(mets)
        # the body is appended to the header
        dlxs.getroot().append(body.getroot())
        if has_ocr(do_id):
            try:
                ocr_file = drlutils.django.utils.get_ocr_zip_path(item)
            except:
                return do_id + ' - no ocr file found'
            # the ocr is unzipped and added to the pages
            ocr_zip = zipfile.ZipFile(ocr_file, 'r')
            for ocr in ocr_zip.namelist():
                oimage = re.sub('.txt', '.tif', os.path.basename(ocr))
                xpath_for_p_element = '//P[@REF=\'' + oimage + '\']'
                page_content = " ".join(ocr_zip.open(ocr).readlines())
                filtered_text = ' '
                if page_content:
                    filtered_text = filter_ocr(page_content) or ' '
                try:
                    p = dlxs.xpath(xpath_for_p_element)[0]
                    p.text = filtered_text
                except Exception as e:
                    print('OCR exception: %s - %s - %s' % (str(e), do_id, xpath_for_p_element))
        # remove the REF attribute of the P elements, which was our hook for the OCR
        for p_tag in dlxs.xpath('//P'):
            etree.strip_attributes(p_tag, 'REF')
        # add digital object identifier
        #idno = dlxs.xpath('//IDNO[@TYPE=\'uls-drl\']')[0]
        idno = dlxs.xpath('//IDNO')[0]
        idno.text = item.do_id
        # add digital publication date
        date = dlxs.xpath('/DLPSTEXTCLASS/HEADER/FILEDESC/PUBLICATIONSTMT/DATE')[0]
        try:
            pub = item.online_pub_date.year
            date.text = str(pub)
        except:
            new_pub_date = str(datetime.date.today().year)
            date.text = new_pub_date
        # add availability statement
        availability = dlxs.xpath('/DLPSTEXTCLASS/HEADER/FILEDESC/PUBLICATIONSTMT/AVAILABILITY/P')[0]
        availability.text = get_availability_statement(item)
        # add extent (number) of digital files
        pages = dlxs.xpath('//PB')
        page_count = str(len(pages))
        extent = dlxs.xpath('/DLPSTEXTCLASS/HEADER/FILEDESC/EXTENT')[0]
        extent.text = page_count + ' digitized page images'
        title_main = dlxs.xpath('//TITLE[@TYPE=\'245\']')[0]
        div1_head = dlxs.xpath('//DIV1/HEAD')[0]
        div1_head.text = title_main.text
        # add tei encoding level
        divs = dlxs.xpath('//DIV2')
        if len(divs) > 1:
            level = 2
        else:
            level = 1
        encodingdecl = dlxs.xpath('//EDITORIALDECL')[0]
        encodingdecl.set("N", str(level))
        # handle collection-specific extra processing
        dlxs = handle_collection_extras(item.do_id, c_id, dlxs)
        outdir = os.path.join(drlutils.config.DLXS_XML_PATH, c_id)
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        outfile = item.do_id + '.xml'
        outpath = os.path.join(outdir, outfile)
        etree.ElementTree(dlxs.getroot()).write(outpath, pretty_print=True)
        return None
    except Exception as e:
        return 'problem with %s: %s, %s' % (do_id, str(e), traceback.print_tb(sys.exc_info()[2]))
Code example #43
File: asdf_write.py Project: bbcrd/ssr_utils
 def Write(self, fxml):       
     etree.strip_attributes(self.root, '{http://codespeak.net/lxml/objectify/pytype}pytype')
     objectify.deannotate(self.root, xsi_nil=True)
     etree.cleanup_namespaces(self.root)
     print >>fxml, (etree.tostring(self.root, pretty_print=True, xml_declaration=True, encoding='utf-8'))
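A self-contained sketch of the same cleanup on a toy objectify tree; the sample XML is invented, and objectify.annotate is called here only to produce the pytype annotations that Write strips:

from lxml import etree, objectify

root = objectify.fromstring('<config><port>8080</port></config>')
objectify.annotate(root)  # adds py:pytype="int" style annotations
etree.strip_attributes(root, '{http://codespeak.net/lxml/objectify/pytype}pytype')
objectify.deannotate(root, xsi_nil=True)
etree.cleanup_namespaces(root)
print(etree.tostring(root, pretty_print=True, xml_declaration=True,
                     encoding='utf-8').decode('utf-8'))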
Code example #45
def main():
    input_dir = Path(os.getenv("INPUT_RAWPATH", "content/GENERATED/"))
    output_dir = Path(os.getenv("INPUT_OUTPUTPATH", "content/api/"))

    if not input_dir.exists():
        print("Exiting because there are no files to process...")
        print(f"{input_dir.resolve()} does not exist!")
        sys.exit(0)

    print(f"Processing content of {input_dir.resolve()}...")
    print(f"Outputting results to {output_dir.resolve()}...")

    for f in input_dir.glob("*_api.xml"):
        print(f"Processing {str(f)}...")

        tree = etree.parse(str(f),
                           parser=etree.XMLParser(recover=True,
                                                  remove_comments=True))
        root = tree.getroot()

        remove_attrs = [
            "noemph",
            "{*}space",
            "add_permalink",
            "is_multiline",
            "noindex",
        ]
        etree.strip_attributes(
            root,
            *remove_attrs,
        )

        body = root.find("section")
        contents = CodeFile(body)
        contents.parse()
        # frontmatter, parsed = parse_file(f)

        output_xml = input_dir / f"{f.stem}-processed.xml"

        with output_xml.open("w") as fp:
            doc = E.document()

            if contents.domain == CodeFile.DOMAIN_PY:
                doc.set("api-lang", "python")
                doc.set("title", "Python API Documentation")
            elif contents.domain == CodeFile.DOMAIN_CPP:
                doc.set("api-lang", "cpp")
                doc.set("title", "C++ API Documentation")
            elif contents.domain == CodeFile.DOMAIN_C:
                doc.set("api-lang", "c")
                doc.set("title", "C API Documentation")

            title_element = E.document_title(doc.get("title"))
            doc.append(title_element)

            children = root.find("./section").getchildren()
            doc.extend(children)
            etree.indent(doc, space="    ", level=1)
            doc_text = etree.tostring(doc, encoding="unicode")
            fp.write(doc_text)
            fp.write("\n")

        print()

    htmlify(input_dir, output_dir)

    sys.exit(0)
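Note that etree.strip_attributes accepts fully qualified {namespace}name attribute names and, since lxml 3.1, {*} namespace wildcards, which is what the remove_attrs list above relies on. A minimal demonstration with invented sample XML:

from lxml import etree

root = etree.fromstring('<doc xml:space="preserve" noindex="1"><p>text</p></doc>')
etree.strip_attributes(root, '{*}space', 'noindex')  # the wildcard matches xml:space
print(etree.tostring(root).decode())  # <doc><p>text</p></doc>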
Code example #46
    ID = origUrl.text.replace(baseUrl, "")
    slug.text = ID
    items.append(slug)

# --- Do the processing on original xml ---
for items in root.findall(nodePath, di.nsDict):
    # --- Convert all the fields
    for keys in di.qdcFieldsDict:
        nestedFormat(items, keys)

# --- Strip out the nodes leftover from shuffling things around in the hierarchy ---
for leftovers in root.findall('item/metadata'):
    strip = leftovers.getparent()
    strip.remove(leftovers)

# --- Clean up extraneous attributes on nodes ---
etree.strip_attributes(root, "set", "batchsize", "id", "timestamp", "total")
etree.cleanup_namespaces(root)

# --- Create new xml file ---
filename = xmlFile
(prefix, sep, suffix) = filename.rpartition('.')

new_filename = prefix + '_batch.xml'
tree.write(new_filename,
           pretty_print=True,
           encoding="UTF-8",
           xml_declaration=True)

print('\nYour new file, {0}, has been created.'.format(new_filename))
Code example #47
        for h in hs:
            rawtext = lxml.html.tostring(h, method="text", encoding="utf8")
            if rawtext is not None:
                raw_h = ET.Element("head")
                raw_h.text = h.text
                h.addnext(raw_h)
                h.getparent().remove(h)
        for c in cs:
            if c.getparent() is not None:
                c.getparent().remove(c)
        for note in notes:
            if note.getparent() is not None:
                note.getparent().remove(note)

        for div in divs:
            ET.strip_attributes(div, 'rend')
            if 'type' in div.attrib:
                if div.attrib['type'] == 'section':
                    div.attrib['type'] = 'chapter'
                if "n" in div.attrib:
                    div.attrib['title'] = div.attrib['n']
                    del div.attrib['n']

    # Rebuild the header

        fileDesc = ET.Element('fileDesc')
        titleStmt = ET.Element('titleStmt')
        title = ET.Element('title')

        if len(tree.xpath('.//title/text()')) > 0:
            title.text = tree.xpath('.//title/text()')[0]
Code example #48
File: serializers.py Project: phlax/moz_pootle_fs
 def strip_attributes(self, xml, path, attrs):
     for node in xml.xpath(path):
         for k in [i for i in node.keys() if i in attrs]:
             etree.strip_attributes(node, k)
     return xml
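The same helper as a standalone function, runnable outside the serializer class; the sample document and attribute names are invented:

from lxml import etree

def strip_attributes_at(xml, path, attrs):
    # Strip only the listed attributes, and only on nodes matched by the XPath.
    for node in xml.xpath(path):
        for k in [i for i in node.keys() if i in attrs]:
            etree.strip_attributes(node, k)
    return xml

doc = etree.fromstring('<file original="a.po"><unit id="1" state="new"/></file>')
strip_attributes_at(doc, '//unit', ('state',))
print(etree.tostring(doc).decode())  # <file original="a.po"><unit id="1"/></file>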
Code example #50
    def __call__(self, doc):
        """
        Cleans the document.
        """
        try:
            getroot = doc.getroot
        except AttributeError:
            pass  # Element instance
        else:
            doc = getroot()  # ElementTree instance, instead of an element
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(self.safe_attrs)
            for el in doc.iter(etree.Element):
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not (self.safe_attrs_only
                    and self.safe_attrs == defs.safe_attrs):
                # safe_attrs handles events attributes itself
                for el in doc.iter(etree.Element):
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            if not self.inline_style:
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
            if not self.style:
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments:
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
        if self.inline_style:
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                found_parent = False
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet',
                                                                'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet', ))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse()  # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags"
                )
            allow_tags = set(defs.tags)
        if allow_tags:
            # make sure we do not remove comments/PIs if users want them (which is rare enough)
            if not self.comments:
                allow_tags.add(etree.Comment)
            if not self.processing_instructions:
                allow_tags.add(etree.ProcessingInstruction)

            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get('rel')
                    if rel:
                        if ('nofollow' in rel
                                and ' nofollow ' in (' %s ' % rel)):
                            continue
                        rel = '%s nofollow' % rel
                    else:
                        rel = 'nofollow'
                    el.set('rel', rel)
Code example #51
File: lybshared.py Project: inpos/lybrary
def html(self, catid, url, htfile, md=None, sub=False, ins=False):
    htfile = lybtools.htfile_tounicode(htfile)
    body = etree.ElementTree(etree.HTML(htfile).xpath('//body')[0])
    sessdata = cherrypy.session
    strip_tags = ["script", "link"]
    etree.strip_elements(body, *strip_tags, with_tail=False)
    chg_tags = ["body", "a", "form", "input", "noscript"]
    etree.strip_tags(body, *chg_tags)
    etree.strip_tags(body, etree.Comment)
    #safe_tags = ['img']
    for elem in body.xpath('//*'):
        if elem.tag == 'body': elem.tag = 'old-body'
        attr = elem.attrib
        if elem.tag in chg_tags:
            etree.strip_attributes(elem, *attr)
        if "class" in attr:
            etree.strip_attributes(elem, "class")
        if "id" in attr:
            etree.strip_attributes(elem, "id")
        if "onclick" in attr:
            etree.strip_attributes(elem, "onclick")
        if "style" in attr:
            attr['style'] = re.sub(r'url\(.+\)', 'url()', attr['style'])
        #if elem.tag not in safe_tags and (elem.text is None or elem.text.strip() == '') and elem.getchildren() == []:
        #    elem.getparent().remove(elem)
        #    continue
        if "src" in attr:
            m = re.search(r'data:(\S+);base64,(.+)', attr['src'])
            if not m:
                srcurl = urljoin(url, attr['src'])
                srcobjquery = urlsplit(srcurl)[3]
                srcqdict = parse_qs(srcobjquery)
                if 'lybsrcobj' in list(srcqdict.keys()):
                    ohash = srcqdict['lybsrcobj'][0]
                    srcquerydata = {'lybsrcobj': ohash}
                    srcquery = urlencode(srcquerydata)
#                    if ins:
                    page = '/getobj?'
#                    else:
#                        page = '/edit/tmpstore?'
#                    if ohash not in sessdata:
#                        if md:
#                            cat = Cat(int(catid))
#                            doco = cat[int(md)]
#                            sessdata[ohash] = doco[ohash]
                    elem.set('src', page + srcquery)
                    continue
                try:
                    srcu = URLOpener().open(srcurl)
                except:
                    continue
                if srcu.code >= 400:
                    continue
                srcdata = srcu.read()
                cont_type = srcu.headers['Content-Type']
                srcftype = cont_type and lybtools.ctype(srcu.headers['Content-Type']) or 'none'
            else:
                srcdata = decodestring(m.group(2).encode('utf-8'))
                srcftype = m.group(1)
            srchashname = hashlib.sha1(srcdata).hexdigest()
            if srcftype == 'text/html':
                if elem.tag == 'img': continue
                srcdata = self.html(catid, srcu.url, srcdata, sub = True)
            if srchashname not in sessdata:
                sessdata[srchashname] = {'body': srcdata, 'type': srcftype}
            srcquerydata = {'lybsrcobj': srchashname}
            srcquery = urlencode(srcquerydata)
            if ins:
                page = '/getobj?'
            else:
                page = '/edit/tmpstore?'
            elem.set('src', page + srcquery)
    etree.strip_tags(body, 'old-body')
    ht_ml = etree.tounicode(body, method='html', pretty_print = True)
    if not sub and not ins:
        return self.ne(catid, url=url, html=ht_ml, md=md)
    else:
        return ht_ml
Code example #52
File: compare_xml.py Project: evgenyorlov1/PPTX
def normalizer(e):
    etree.strip_attributes(e, 'id')
    return e
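A sketch of how such a normalizer can be used to compare two documents while ignoring volatile id attributes (sample XML invented):

from lxml import etree

a = etree.fromstring('<sld><p id="1">hi</p></sld>')
b = etree.fromstring('<sld><p id="2">hi</p></sld>')
# strip_attributes descends into the tree, so one call normalizes everything
print(etree.tostring(normalizer(a)) == etree.tostring(normalizer(b)))  # True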
Code example #53
File: openstreetmap.py Project: pytrainer/pytrainer
    def make_gpx_private(self, gpx_file=None):
        '''
        wipes out private data from gpx files
        converts laps to waypoints
        '''
        logging.debug(">>")
        if gpx_file is None:
            return None
        
        filen = os.path.basename(gpx_file)
        tmpdir = self.pytrainer_main.profile.tmpdir
        anon_gpx_file = "%s/%s" % (tmpdir, filen)

        # get saved private area polygon
        pP = loads(self.options['privPolygon'])
        pP = pP['geometry']['coordinates'][0]
        # converts polygon's 2D matrix into a vector of just the lats or lons
        vector = lambda lonLat: [pP[i][lonLat] for i in range(len(pP))] # 0:lon, 1:lat
        # try reading private area's bounds, stored as [lon,lat]
        NE_LAT = max([pP[i][1] for i in range(len(pP))])
        NE_LON = max([pP[i][0] for i in range(len(pP))])
        SW_LAT = min([pP[i][1] for i in range(len(pP))])
        SW_LON = min([pP[i][0] for i in range(len(pP))])
        logging.info("Anonymizing Area: NE:%f,%f -> SW: %f,%f" % (NE_LON, NE_LAT, SW_LON, SW_LAT))
            
        # Config parameters, not used yet
        FILTER_BOX = True
        ERASE_TIME = True
        LAP_TO_WAYPOINT = True

        tree = etree.parse(gpx_file)
        _xmlns = tree.getroot().nsmap[None]
        _trkpt_path = '{%s}trk/{%s}trkseg/{%s}trkpt' % (_xmlns, _xmlns, _xmlns)
        # namespace of gpx files
        NS = dict(ns='http://www.topografix.com/GPX/1/1')

        myroot = tree.getroot()
        gpxdataNS = string.Template(\
            ".//{http://www.cluetrust.com/XML/GPXDATA/1/0}$tag")
        lapTag = gpxdataNS.substitute(tag="lap")
        endPointTag = gpxdataNS.substitute(tag="endPoint")
        triggerTag = gpxdataNS.substitute(tag="trigger")
        laps = tree.findall(lapTag)

        mygpx = tree.find('gpx')

        for lap in laps:
            trigger = lap.find(triggerTag)
            #  Watch out for manually triggered laps
            if trigger.text == 'manual':
                endPoint = lap.find(endPointTag)
                lat = endPoint.get("lat")
                lon = endPoint.get("lon")
                # Create waypt if not in home box
                try:
                    if not ((SW_LAT < float(lat) < NE_LAT) and (SW_LON < float(lon) < NE_LON)):
                        etree.SubElement(myroot, 'wpt', attrib={'lat': lat, 'lon': lon})
                except:
                    pass
        etree.strip_attributes(myroot, 'creator')
                    
        # Wipe out home box
        for trkpt in tree.findall(_trkpt_path):
            lat = float(trkpt.attrib['lat'])
            lon = float(trkpt.attrib['lon'])
            if (lat < NE_LAT) & (lon < NE_LON) & (lat > SW_LAT) & (lon > SW_LON):
                par = trkpt.getparent()
                par.remove(trkpt)

        time = tree.xpath('//ns:trkpt/ns:time', namespaces=NS)
        for i in time:
            i.text = '1970-01-01T00:00:00+00:00'
            # osm regards <time> as mandatory. gnaa.

        ext = tree.xpath('//ns:gpx/ns:extensions', namespaces=NS)
        for i in ext:
            par = i.getparent()
            par.remove(i)
        meta = tree.xpath('//ns:gpx/ns:metadata', namespaces=NS)
        for i in meta:
            par = i.getparent()
            par.remove(i)
        ele = tree.xpath('//ns:trkpt/ns:ele', namespaces=NS)
        for i in ele:
            par = i.getparent()
            par.remove(i)

        # test schema on cleaned xml-tree
        # gpx.xsd from http://www.topografix.com/gpx.asp

        #xmlschema = etree.XMLSchema(etree.parse('gpx.xsd'))
        #xmlschema.validate(tree)

        # write new gpx file
        tree.write(anon_gpx_file, pretty_print=False, xml_declaration=True, encoding='UTF-8')
        logging.debug("<<")
        return anon_gpx_file
Code example #54
File: clean.py Project: RaphaelKimmig/lxml
    def __call__(self, doc):
        """
        Cleans the document.
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        xhtml_to_html(doc)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)

        kill_tags = set(self.kill_tags or ())
        remove_tags = set(self.remove_tags or ())
        allow_tags = set(self.allow_tags or ())

        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.iter():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs handles events attributes itself
                for el in doc.iter():
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If we're deleting style then we don't have to remove JS links
                # from styles, otherwise...
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                found_parent = False
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse() # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    el.set('rel', 'nofollow')
Code example #55
File: fix_dvdauthor.py Project: lu-zero/dvdauthor
# drop titlemap

for t in root.findall('.//titlemap'):
    t.getparent().remove(t)

# drop audio

for a in root.findall('.//audio'):
    a.getparent().remove(a)

# drop subpicture attributes

for s in root.findall('.//subpicture'):
    present = s.get('present')
    et.strip_attributes(s, 'present', 'id')
    if present == 'no':
        s.getparent().remove(s)

for s in root.findall('.//pgc'):
    et.strip_attributes(s, 'next', 'prev')

# fix buttons

for b in root.findall('.//buttons'):
    vod = b.getparent()
    for button in reversed(b.getchildren()):
        vod.addnext(button)
    vod.remove(b)