Example #1
    def _make_parser(self):
        # For some reason MAL returns an XML file with HTML exclusive
        # entities like á, so we have to create a custom XMLParser
        # to convert these entities correctly.
        parser = ET.XMLParser()
        parser.parser.UseForeignDTD(True)

        entities = dict()
        entities["nbsp"] =     u'\u00A0'
        entities["iexcl"] =    u'\u00A1'
        entities["cent"] =     u'\u00A2'
        entities["pound"] =    u'\u00A3'
        entities["curren"] =   u'\u00A4'
        entities["yen"] =      u'\u00A5'
        entities["brvbar"] =   u'\u00A6'
        entities["sect"] =     u'\u00A7'
        entities["uml"] =      u'\u00A8'
        entities["copy"] =     u'\u00A9'
        entities["ordf"] =     u'\u00AA'
        entities["laquo"] =    u'\u00AB'
        entities["not"] =      u'\u00AC'
        entities["shy"] =      u'\u00AD'
        entities["reg"] =      u'\u00AE'
        entities["macr"] =     u'\u00AF'
        entities["deg"] =      u'\u00B0'
        entities["plusmn"] =   u'\u00B1'
        entities["sup2"] =     u'\u00B2'
        entities["sup3"] =     u'\u00B3'
        entities["acute"] =    u'\u00B4'
        entities["micro"] =    u'\u00B5'
        entities["para"] =     u'\u00B6'
        entities["middot"] =   u'\u00B7'
        entities["cedil"] =    u'\u00B8'
        entities["sup1"] =     u'\u00B9'
        entities["ordm"] =     u'\u00BA'
        entities["raquo"] =    u'\u00BB'
        entities["frac14"] =   u'\u00BC'
        entities["frac12"] =   u'\u00BD'
        entities["frac34"] =   u'\u00BE'
        entities["iquest"] =   u'\u00BF'
        entities["Agrave"] =   u'\u00C0'
        entities["Aacute"] =   u'\u00C1'
        entities["Acirc"] =    u'\u00C2'
        entities["Atilde"] =   u'\u00C3'
        entities["Auml"] =     u'\u00C4'
        entities["Aring"] =    u'\u00C5'
        entities["AElig"] =    u'\u00C6'
        entities["Ccedil"] =   u'\u00C7'
        entities["Egrave"] =   u'\u00C8'
        entities["Eacute"] =   u'\u00C9'
        entities["Ecirc"] =    u'\u00CA'
        entities["Euml"] =     u'\u00CB'
        entities["Igrave"] =   u'\u00CC'
        entities["Iacute"] =   u'\u00CD'
        entities["Icirc"] =    u'\u00CE'
        entities["Iuml"] =     u'\u00CF'
        entities["ETH"] =      u'\u00D0'
        entities["Ntilde"] =   u'\u00D1'
        entities["Ograve"] =   u'\u00D2'
        entities["Oacute"] =   u'\u00D3'
        entities["Ocirc"] =    u'\u00D4'
        entities["Otilde"] =   u'\u00D5'
        entities["Ouml"] =     u'\u00D6'
        entities["times"] =    u'\u00D7'
        entities["Oslash"] =   u'\u00D8'
        entities["Ugrave"] =   u'\u00D9'
        entities["Uacute"] =   u'\u00DA'
        entities["Ucirc"] =    u'\u00DB'
        entities["Uuml"] =     u'\u00DC'
        entities["Yacute"] =   u'\u00DD'
        entities["THORN"] =    u'\u00DE'
        entities["szlig"] =    u'\u00DF'
        entities["agrave"] =   u'\u00E0'
        entities["aacute"] =   u'\u00E1'
        entities["acirc"] =    u'\u00E2'
        entities["atilde"] =   u'\u00E3'
        entities["auml"] =     u'\u00E4'
        entities["aring"] =    u'\u00E5'
        entities["aelig"] =    u'\u00E6'
        entities["ccedil"] =   u'\u00E7'
        entities["egrave"] =   u'\u00E8'
        entities["eacute"] =   u'\u00E9'
        entities["ecirc"] =    u'\u00EA'
        entities["euml"] =     u'\u00EB'
        entities["igrave"] =   u'\u00EC'
        entities["iacute"] =   u'\u00ED'
        entities["icirc"] =    u'\u00EE'
        entities["iuml"] =     u'\u00EF'
        entities["eth"] =      u'\u00F0'
        entities["ntilde"] =   u'\u00F1'
        entities["ograve"] =   u'\u00F2'
        entities["oacute"] =   u'\u00F3'
        entities["ocirc"] =    u'\u00F4'
        entities["otilde"] =   u'\u00F5'
        entities["ouml"] =     u'\u00F6'
        entities["divide"] =   u'\u00F7'
        entities["oslash"] =   u'\u00F8'
        entities["ugrave"] =   u'\u00F9'
        entities["uacute"] =   u'\u00FA'
        entities["ucirc"] =    u'\u00FB'
        entities["uuml"] =     u'\u00FC'
        entities["yacute"] =   u'\u00FD'
        entities["thorn"] =    u'\u00FE'
        entities["yuml"] =     u'\u00FF'
        entities["fnof"] =     u'\u0192'
        entities["Alpha"] =    u'\u0391'
        entities["Beta"] =     u'\u0392'
        entities["Gamma"] =    u'\u0393'
        entities["Delta"] =    u'\u0394'
        entities["Epsilon"] =  u'\u0395'
        entities["Zeta"] =     u'\u0396'
        entities["Eta"] =      u'\u0397'
        entities["Theta"] =    u'\u0398'
        entities["Iota"] =     u'\u0399'
        entities["Kappa"] =    u'\u039A'
        entities["Lambda"] =   u'\u039B'
        entities["Mu"] =       u'\u039C'
        entities["Nu"] =       u'\u039D'
        entities["Xi"] =       u'\u039E'
        entities["Omicron"] =  u'\u039F'
        entities["Pi"] =       u'\u03A0'
        entities["Rho"] =      u'\u03A1'
        entities["Sigma"] =    u'\u03A3'
        entities["Tau"] =      u'\u03A4'
        entities["Upsilon"] =  u'\u03A5'
        entities["Phi"] =      u'\u03A6'
        entities["Chi"] =      u'\u03A7'
        entities["Psi"] =      u'\u03A8'
        entities["Omega"] =    u'\u03A9'
        entities["alpha"] =    u'\u03B1'
        entities["beta"] =     u'\u03B2'
        entities["gamma"] =    u'\u03B3'
        entities["delta"] =    u'\u03B4'
        entities["epsilon"] =  u'\u03B5'
        entities["zeta"] =     u'\u03B6'
        entities["eta"] =      u'\u03B7'
        entities["theta"] =    u'\u03B8'
        entities["iota"] =     u'\u03B9'
        entities["kappa"] =    u'\u03BA'
        entities["lambda"] =   u'\u03BB'
        entities["mu"] =       u'\u03BC'
        entities["nu"] =       u'\u03BD'
        entities["xi"] =       u'\u03BE'
        entities["omicron"] =  u'\u03BF'
        entities["pi"] =       u'\u03C0'
        entities["rho"] =      u'\u03C1'
        entities["sigmaf"] =   u'\u03C2'
        entities["sigma"] =    u'\u03C3'
        entities["tau"] =      u'\u03C4'
        entities["upsilon"] =  u'\u03C5'
        entities["phi"] =      u'\u03C6'
        entities["chi"] =      u'\u03C7'
        entities["psi"] =      u'\u03C8'
        entities["omega"] =    u'\u03C9'
        entities["thetasym"] = u'\u03D1'
        entities["upsih"] =    u'\u03D2'
        entities["piv"] =      u'\u03D6'
        entities["bull"] =     u'\u2022'
        entities["hellip"] =   u'\u2026'
        entities["prime"] =    u'\u2032'
        entities["Prime"] =    u'\u2033'
        entities["oline"] =    u'\u203E'
        entities["frasl"] =    u'\u2044'
        entities["weierp"] =   u'\u2118'
        entities["image"] =    u'\u2111'
        entities["real"] =     u'\u211C'
        entities["trade"] =    u'\u2122'
        entities["alefsym"] =  u'\u2135'
        entities["larr"] =     u'\u2190'
        entities["uarr"] =     u'\u2191'
        entities["rarr"] =     u'\u2192'
        entities["darr"] =     u'\u2193'
        entities["harr"] =     u'\u2194'
        entities["crarr"] =    u'\u21B5'
        entities["lArr"] =     u'\u21D0'
        entities["uArr"] =     u'\u21D1'
        entities["rArr"] =     u'\u21D2'
        entities["dArr"] =     u'\u21D3'
        entities["hArr"] =     u'\u21D4'
        entities["forall"] =   u'\u2200'
        entities["part"] =     u'\u2202'
        entities["exist"] =    u'\u2203'
        entities["empty"] =    u'\u2205'
        entities["nabla"] =    u'\u2207'
        entities["isin"] =     u'\u2208'
        entities["notin"] =    u'\u2209'
        entities["ni"] =       u'\u220B'
        entities["prod"] =     u'\u220F'
        entities["sum"] =      u'\u2211'
        entities["minus"] =    u'\u2212'
        entities["lowast"] =   u'\u2217'
        entities["radic"] =    u'\u221A'
        entities["prop"] =     u'\u221D'
        entities["infin"] =    u'\u221E'
        entities["ang"] =      u'\u2220'
        entities["and"] =      u'\u2227'
        entities["or"] =       u'\u2228'
        entities["cap"] =      u'\u2229'
        entities["cup"] =      u'\u222A'
        entities["int"] =      u'\u222B'
        entities["there4"] =   u'\u2234'
        entities["sim"] =      u'\u223C'
        entities["cong"] =     u'\u2245'
        entities["asymp"] =    u'\u2248'
        entities["ne"] =       u'\u2260'
        entities["equiv"] =    u'\u2261'
        entities["le"] =       u'\u2264'
        entities["ge"] =       u'\u2265'
        entities["sub"] =      u'\u2282'
        entities["sup"] =      u'\u2283'
        entities["nsub"] =     u'\u2284'
        entities["sube"] =     u'\u2286'
        entities["supe"] =     u'\u2287'
        entities["oplus"] =    u'\u2295'
        entities["otimes"] =   u'\u2297'
        entities["perp"] =     u'\u22A5'
        entities["sdot"] =     u'\u22C5'
        entities["lceil"] =    u'\u2308'
        entities["rceil"] =    u'\u2309'
        entities["lfloor"] =   u'\u230A'
        entities["rfloor"] =   u'\u230B'
        entities["lang"] =     u'\u2329'
        entities["rang"] =     u'\u232A'
        entities["loz"] =      u'\u25CA'
        entities["spades"] =   u'\u2660'
        entities["clubs"] =    u'\u2663'
        entities["hearts"] =   u'\u2665'
        entities["diams"] =    u'\u2666'
        entities["quot"] =     u'\"'
        entities["amp"] =      u'&'
        entities["lt"] =       u'<'
        entities["gt"] =       u'>'
        entities["OElig"] =    u'\u0152'
        entities["oelig"] =    u'\u0153'
        entities["Scaron"] =   u'\u0160'
        entities["scaron"] =   u'\u0161'
        entities["Yuml"] =     u'\u0178'
        entities["circ"] =     u'\u02C6'
        entities["tilde"] =    u'\u02DC'
        entities["ensp"] =     u'\u2002'
        entities["emsp"] =     u'\u2003'
        entities["thinsp"] =   u'\u2009'
        entities["zwnj"] =     u'\u200C'
        entities["zwj"] =      u'\u200D'
        entities["lrm"] =      u'\u200E'
        entities["rlm"] =      u'\u200F'
        entities["ndash"] =    u'\u2013'
        entities["mdash"] =    u'\u2014'
        entities["lsquo"] =    u'\u2018'
        entities["rsquo"] =    u'\u2019'
        entities["sbquo"] =    u'\u201A'
        entities["ldquo"] =    u'\u201C'
        entities["rdquo"] =    u'\u201D'
        entities["bdquo"] =    u'\u201E'
        entities["dagger"] =   u'\u2020'
        entities["Dagger"] =   u'\u2021'
        entities["permil"] =   u'\u2030'
        entities["lsaquo"] =   u'\u2039'
        entities["rsaquo"] =   u'\u203A'
        entities["euro"] =     u'\u20AC'
        parser.entity.update(entities)
        
        return parser
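
A side note on the long table above: on Python 3 an equivalent mapping can be built from the standard library instead of being spelled out entity by entity. A minimal sketch, assuming the same ET import as the surrounding code (and note that the parser.parser expat handle used above is only exposed by the pure-Python XMLParser, not the C-accelerated one):

from html.entities import name2codepoint

entities = {name: chr(codepoint) for name, codepoint in name2codepoint.items()}
parser.entity.update(entities)
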
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import dump


class CommentedTreeBuilder(ET.TreeBuilder):
    def __init__(self, *args, **kwargs):
        super(CommentedTreeBuilder, self).__init__(*args, **kwargs)

    def comment(self, data):
        self.start(ET.Comment, {})
        self.data(data)
        self.end(ET.Comment)


if __name__ == "__main__":
    print('---------------------------------------------------------')
    print('Before: ElementTree original parser dump()')
    tree = ET.parse('sample.xml')
    root = tree.getroot()
    dump(root)  # comments are not read
    # save the XML tree to a file
    tree.write('original.xml', xml_declaration=True)  # comments are not saved
    print('---------------------------------------------------------')
    print('After: Commented parser dump')
    # to read the comments as well, a parser must be specified
    tree = ET.parse('sample.xml', parser=ET.XMLParser(target=CommentedTreeBuilder()))
    root = tree.getroot()
    dump(root)  # the comments were read as well

    # save what was read back out
    # save the XML tree to a file; the Korean text gets garbled
    tree.write('commented_parser.xml')
    # save as utf8 so the Korean comments are not garbled
    tree.write('commented_parser2.xml', encoding='utf8', xml_declaration=True)
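
On Python 3.8 and later the stock TreeBuilder can keep comments by itself, so the CommentedTreeBuilder subclass is only needed on older versions. A hedged equivalent:

parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=True))
tree = ET.parse('sample.xml', parser=parser)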
def postprocess_translations(reduce_diff_hacks=False):
    print('Checking and postprocessing...')

    if reduce_diff_hacks:
        global _orig_escape_cdata
        _orig_escape_cdata = ET._escape_cdata
        ET._escape_cdata = escape_cdata

    for (filename, filepath) in all_ts_files():
        os.rename(filepath, filepath + '.orig')

    have_errors = False
    for (filename, filepath) in all_ts_files('.orig'):
        # pre-fixups to cope with transifex output
        parser = ET.XMLParser(
            encoding='utf-8'
        )  # the encoding must be overridden: only 'utf-8' is understood, not 'utf8'
        with open(filepath + '.orig', 'rb') as f:
            data = f.read()
        # remove control characters; this must be done over the entire file otherwise the XML parser will fail
        data = remove_invalid_characters(data)
        tree = ET.parse(io.BytesIO(data), parser=parser)

        # iterate over all messages in file
        root = tree.getroot()
        for context in root.findall('context'):
            for message in context.findall('message'):
                numerus = message.get('numerus') == 'yes'
                source = message.find('source').text
                translation_node = message.find('translation')
                # pick all numerusforms
                if numerus:
                    translations = [
                        i.text for i in translation_node.findall('numerusform')
                    ]
                else:
                    translations = [translation_node.text]

                for translation in translations:
                    if translation is None:
                        continue
                    errors = []
                    valid = check_format_specifiers(
                        source, translation, errors,
                        numerus) and not contains_erexcoin_addr(
                            translation, errors)

                    for error in errors:
                        print('%s: %s' % (filename, error))

                    if not valid:  # set type to unfinished and clear string if invalid
                        translation_node.clear()
                        translation_node.set('type', 'unfinished')
                        have_errors = True

                # Remove location tags
                for location in message.findall('location'):
                    message.remove(location)

                # Remove entire message if it is an unfinished translation
                if translation_node.get('type') == 'unfinished':
                    context.remove(message)

        # check if document is (virtually) empty, and remove it if so
        num_messages = 0
        for context in root.findall('context'):
            for message in context.findall('message'):
                num_messages += 1
        if num_messages < MIN_NUM_MESSAGES:
            print('Removing %s, as it contains only %i messages' %
                  (filepath, num_messages))
            continue

        # write fixed-up tree
        # if diff reduction requested, replace some XML to 'sanitize' to qt formatting
        if reduce_diff_hacks:
            out = io.BytesIO()
            tree.write(out, encoding='utf-8')
            out = out.getvalue()
            out = out.replace(b' />', b'/>')
            with open(filepath, 'wb') as f:
                f.write(out)
        else:
            tree.write(filepath, encoding='utf-8')
    return have_errors
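
The escape_cdata override installed above is not shown in this snippet; in similar translation-postprocessing scripts it is typically a thin wrapper around ElementTree's internal escaper that additionally escapes quotes, e.g. (a hedged reconstruction, not necessarily the original):

def escape_cdata(text):
    text = _orig_escape_cdata(text)
    text = text.replace("'", '&apos;')
    text = text.replace('"', '&quot;')
    return text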
def process(xml_file_path, out_path, extract_classes):
    if not os.path.exists(xml_file_path):
        print("skip '%s'" % (xml_file_path))
        return
    else:
        print("process '%s'" % (xml_file_path))

    utf8_parser = ET.XMLParser(encoding='utf-8')
    tree = ET.parse(xml_file_path, parser=utf8_parser)
    root = tree.getroot()

    clear_objs = []

    ## remove objects whose class is not in 'extract_classes'
    for anno_id, obj in enumerate(root.iter('object')):
        name = obj.find('name').text
        if name not in extract_classes:
            clear_objs.append(obj)
    for obj in clear_objs:
        root.remove(obj)

    objs = root.findall('object')

    # Option 1: if no objects remain, return directly; no XML file is written at all
    ## no objects left, return directly
    if len(objs) < 1:
        return
    # Option 2: comment out the two lines above; an XML file is then written but with
    # no objects at all. Do this if your framework supports negative-sample training.

    # Option 3: if no objects remain, generate a 'background' object in the top-left
    # corner, with top-left corner (0, 0) and bottom-right corner (10, 10), because
    # some frameworks do not support negative-sample (i.e. zero-object) training, so
    # we add one manually
    ### clw note: no objects left, write a bbox like (0, 0, 10, 10) for background.
    if len(objs) < 1:
        element = ET.Element('object')
        # create the child elements
        oneName = ET.Element('name')
        oneName.text = 'background'  # the child element's value
        onePose = ET.Element('pose')
        onePose.text = 'Unspecified'
        oneTruncated = ET.Element('truncated')
        oneTruncated.text = '1'
        oneDifficult = ET.Element('difficult')
        oneDifficult.text = '0'
        oneBndbox = ET.Element('bndbox')
        xmin = ET.Element('xmin')
        ymin = ET.Element('ymin')
        xmax = ET.Element('xmax')
        ymax = ET.Element('ymax')

        xmin.text = str(0)
        ymin.text = str(0)
        xmax.text = str(10)
        ymax.text = str(10)

        oneBndbox.append(xmin)
        oneBndbox.append(ymin)
        oneBndbox.append(xmax)
        oneBndbox.append(ymax)

        element.append(oneName)
        element.append(onePose)
        element.append(oneTruncated)
        element.append(oneDifficult)
        element.append(oneBndbox)
        root.append(element)


##########################################################################
    xml_file_path = xml_file_path.replace('\\', '/')  # clw added: for windows; the normalized path is used below
    tree.write(os.path.join(out_path,
                            xml_file_path.split("/")[-1]),
               encoding="utf-8")

    jpg_path = xml_file_path[:-3] + "jpg"
    png_path = xml_file_path[:-3] + "png"

    if os.path.exists(jpg_path):
        shutil.copy(jpg_path, out_path)
    elif os.path.exists(png_path):
        shutil.copy(png_path, out_path)
    pass
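
A hypothetical usage sketch for process() above (the paths and class names are assumptions):

# keep only 'car' and 'truck' boxes; the matching image is copied next to the new XML
process('Annotations/0001.xml', 'filtered/', ['car', 'truck'])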
Example #5
def element_from_string(text):
    xml_parser = ElementTree.XMLParser()
    xml_parser._fixtext = lambda text: text
    xml_parser.feed(text)
    return xml_parser.close()
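
A hedged usage note: the _fixtext hook patched here is a private detail of the old pure-Python XMLParser (Python 2 era), so this trick assumes that implementation:

root = element_from_string('<a>hello</a>')
print(root.tag)  # 'a'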
Example #6
 def open_file(self):
     with open(self.filename, "r") as station_file:
         parser = ET.XMLParser(encoding='utf-8')
         self.tree = ET.parse(station_file, parser)
Example #7
def compat_etree_fromstring(text):
    return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder()))
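
Here _TreeBuilder is presumably a TreeBuilder subclass from youtube-dl's compat layer (an assumption, since its definition is not shown); assuming that context, usage is simply:

doc = compat_etree_fromstring('<root a="b"/>')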
from pathlib import Path
from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torchvision as tv
import xml.etree.ElementTree as ET
import numpy as np
import json

root_dir = Path('data/AIC20_ReID')
dataset = {x: root_dir / f'image_{x}' for x in ['train', 'test']}
track_txts = {x: root_dir / f'{x}_track.txt' for x in dataset.keys()}
train_xml_path = 'train_label.xml'

xml_data = ET.parse(str(root_dir / train_xml_path),
                    parser=ET.XMLParser(encoding='iso-8859-5')).getroot()[0]
labels = dict()
for x in xml_data:
    x = x.attrib
    labels[x['imageName']] = (x['vehicleID'], x['cameraID'])

lines = open(track_txts['train']).readlines()
tracks = [x.strip().split() for x in lines if len(x.strip()) != 0]

vehs = dict()
for i, track in enumerate(tracks):
    veh_id, cam_id = zip(*[labels[img_id] for img_id in track])
    veh_id = veh_id[0]
    cam_id = cam_id[0]

    vehs.setdefault(veh_id, dict())
Example #9
 def __init__(self):
     self.parser = etree.XMLParser()
def myFilter(svgFilename, filename):

    #print "im myfilter",svgFilename
    import xml.etree.ElementTree as ET  #import ElementTree
    parser = ET.XMLParser(encoding="utf-8")
    tree = ET.parse(svgFilename, parser)
    remove_namespace(tree, u'http://www.w3.org/2000/svg'
                     )  #call remove_namespace for cleaning the element-tags
    svgRoot = tree.getroot()
    # print "tree", tree

    parent_map = dict(
        (c, p) for p in tree.getiterator()
        for c in p)  #make a dictionary mapping the whole file:
    #it stores every child together with its parent

    coppers = set(
    )  #make a set (a set is like a list, but without duplicate entries)
    for child in parent_map.values():
        removed = False
        # print "child", child
        id = child.attrib.get("id")
        if id and id.startswith(
                'copper'
        ):  #check the child id to decide whether we need to clean or not
            coppers.add(child)

        paths = list()

        for copper in coppers:
            paths.extend(
                copper.findall("path")
            )  #collect the path elements that live inside a parent called "copper"

            #print "paths", paths
            for path in paths:
                #print "path", path
                #print "parent_map", parent_map
                id = path.attrib.get("id")

                if id is None:
                    try:
                        #print "id", id
                        parent_map[path].remove(
                            path)  #remove the child from its parent
                        removed = True

                    except:
                        continue

                else:
                    #print "id", id
                    removed = False

            if removed:  #write the new file in a new output directory
                svgRoot.set("xmlns:svg", "http://www.w3.org/2000/svg")
                svgRoot.set("xmlns", "http://www.w3.org/2000/svg")
                #print "root", root, "tree",tree, "filename", svgFilename
                outFilename = outputDir + filename
                #print "outputdir", outputDir, "outFilename", outFilename

                tree.write(outFilename)
                with open(outFilename, "r+") as f:
                    old = f.read()
                    f.seek(0)
                    f.write(
                        '<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n'
                        +
                        '<!-- Created with Fritzing (http://www.fritzing.org/) -->'
                        + '\n' + old)
                    f.close()
                print "finised:", filename
Example #11
    def __init__(self, filename):  # noqa: C901
        from xml.etree import ElementTree as ET

        parser = ET.XMLParser()
        try:
            tree = ET.parse(str(filename), parser)
            root = tree.getroot()
        except ET.ParseError:
            root = _parse_raw_binary(str(filename))

        if root.tag != "VTKFile":
            raise ReadError()
        if root.attrib["type"] != "UnstructuredGrid":
            raise ReadError()
        if root.attrib["version"] not in ["0.1", "1.0"]:
            raise ReadError(
                "Unknown VTU file version '{}'.".format(root.attrib["version"])
            )

        # fix empty NumberOfComponents attributes as produced by Firedrake
        for da_tag in root.findall(".//DataArray[@NumberOfComponents='']"):
            da_tag.attrib.pop("NumberOfComponents")

        if "compressor" in root.attrib:
            assert root.attrib["compressor"] in [
                "vtkLZMADataCompressor",
                "vtkZLibDataCompressor",
            ]
            self.compression = root.attrib["compressor"]
        else:
            self.compression = None

        self.header_type = (
            root.attrib["header_type"] if "header_type" in root.attrib else "UInt32"
        )

        try:
            self.byte_order = root.attrib["byte_order"]
            if self.byte_order not in ["LittleEndian", "BigEndian"]:
                raise ReadError(f"Unknown byte order '{self.byte_order}'.")
        except KeyError:
            self.byte_order = None

        grid, self.appended_data = get_grid(root)

        pieces = []
        field_data = {}
        for c in grid:
            if c.tag == "Piece":
                pieces.append(c)
            elif c.tag == "FieldData":
                # TODO test field data
                for data_array in c:
                    field_data[data_array.attrib["Name"]] = self.read_data(data_array)
            else:
                raise ReadError(f"Unknown grid subtag '{c.tag}'.")

        if not pieces:
            raise ReadError("No Piece found.")

        points = []
        cells = []
        point_data = []
        cell_data_raw = []

        for piece in pieces:
            piece_cells = {}
            piece_point_data = {}
            piece_cell_data_raw = {}

            num_points = int(piece.attrib["NumberOfPoints"])
            num_cells = int(piece.attrib["NumberOfCells"])

            for child in piece:
                if child.tag == "Points":
                    data_arrays = list(child)
                    if len(data_arrays) != 1:
                        raise ReadError()
                    data_array = data_arrays[0]

                    if data_array.tag != "DataArray":
                        raise ReadError()

                    pts = self.read_data(data_array)

                    num_components = int(data_array.attrib["NumberOfComponents"])
                    points.append(pts.reshape(num_points, num_components))

                elif child.tag == "Cells":
                    for data_array in child:
                        if data_array.tag != "DataArray":
                            raise ReadError()
                        piece_cells[data_array.attrib["Name"]] = self.read_data(
                            data_array
                        )

                    if len(piece_cells["offsets"]) != num_cells:
                        raise ReadError()
                    if len(piece_cells["types"]) != num_cells:
                        raise ReadError()

                    cells.append(piece_cells)

                elif child.tag == "PointData":
                    for c in child:
                        if c.tag != "DataArray":
                            raise ReadError()
                        piece_point_data[c.attrib["Name"]] = self.read_data(c)

                    point_data.append(piece_point_data)

                elif child.tag == "CellData":
                    for c in child:
                        if c.tag != "DataArray":
                            raise ReadError()
                        piece_cell_data_raw[c.attrib["Name"]] = self.read_data(c)

                    cell_data_raw.append(piece_cell_data_raw)
                else:
                    raise ReadError(f"Unknown tag '{child.tag}'.")

        if not cell_data_raw:
            cell_data_raw = [{}] * len(cells)

        if len(cell_data_raw) != len(cells):
            raise ReadError()

        point_offsets = numpy.cumsum([0] + [pts.shape[0] for pts in points][:-1])

        # Now merge across pieces
        if not points:
            raise ReadError()
        self.points = numpy.concatenate(points)

        if point_data:
            self.point_data = {
                key: numpy.concatenate([pd[key] for pd in point_data])
                for key in point_data[0]
            }
        else:
            self.point_data = None

        self.cells, self.cell_data = _organize_cells(
            point_offsets, cells, cell_data_raw
        )
        self.field_data = field_data
    productname=productname.split('_')
    productname=''.join(productname)
    return productname

def splitjoincaption(caption,productname):
    newcaption=caption.split(' ')[2:]
    productname=productname.split('_')
    productname=' '.join(productname)
    newcaption.insert(0,productname)
    newcaption=' '.join(newcaption)
    return newcaption
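
A worked example of splitjoincaption under these definitions: the first two words of the caption are dropped and the product name, with underscores turned into spaces, is prepended (the argument values are made up):

splitjoincaption('Old Name Great coverage', 'New_Product')
# -> 'New Product Great coverage'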
    

for entry in os.scandir(directory):
    for file in os.scandir(entry):
        parser=ET.XMLParser(encoding='utf-8')
        root=ET.parse(file.path,parser=parser)
        root.getroot()[0].attrib['versionDate']=effective_date
        manuscriptid = root.getroot()[0].attrib['manuscriptID']
        versionid = root.getroot()[0].attrib['versionID']
        caption = root.getroot()[0].attrib['caption']
        
        updatedmanuscriptid=splitjoin(manuscriptid, productname)
        updatedlob=splitjoinlob(productname)
        updatedcaption=splitjoincaption(caption,productname)
        updatedversionid=splitjoin(versionid,productname)
        #print(updatedmanuscriptid, updatedversionid)
        
        root.getroot()[0].attrib['lob']=updatedlob
        root.getroot()[0].attrib['caption']=updatedcaption
        root.getroot()[0].attrib['manuscriptID']=updatedmanuscriptid
 def get_exteranl_ole_link_type(self, unzip_dir, office_type=""):
     # Precondition
     if office_type == 'xl':
         return False
     ret = False
     r_id = ""
     flag_ole_link = False
     flag_external = False
     for (root, _, files) in os.walk(unzip_dir):
         for filename in files:
             # dir search and find .xml
             _, ext = os.path.splitext(filename)
             file_path = os.path.join(root, filename)
             try:
                 if ext == ".xml":  # e.g. document.xml
                     with open(file_path,
                               'r',
                               encoding='utf-8',
                               errors='ignore') as f:
                         xml_txt = f.read()
                     xp = xml_parser.XmlParser()
                     utf8_parser = etree.XMLParser(encoding='utf-8')
                     ooxml = etree.fromstring(xml_txt, parser=utf8_parser)
                     for elem in ooxml.iter():
                         o_oleobject = elem.find(_name('{{{o}}}OLEObject'))
                         if o_oleobject is not None:  # If it has OLE object
                             xp.parse_o_oleobject(o_oleobject)
                             if xp.oleobject_attrib[
                                     'Type'] == "Link" and xp.oleobject_attrib[
                                         'child'][
                                             'o_LinkType'] == "EnhancedMetaFile":
                                 r_id = xp.oleobject_attrib['r_id']
                                 flag_ole_link = True
                             elif xp.oleobject_attrib[
                                     'Type'] == "Link" and xp.oleobject_attrib[
                                         'child']['o_LinkType'] == "Picture":
                                 if r"\f 0" in xp.oleobject_attrib['child'][
                                         'o_FieldCodes']:
                                     r_id = xp.oleobject_attrib['r_id']
                                     flag_ole_link = True
                 if ext == '.rels':  # e.g. document.xml.rels
                     if filename not in self.external_rels.keys():
                         with open(file_path,
                                   'r',
                                   encoding='utf-8',
                                   errors='ignore') as f:
                             xml_txt = f.read().encode("utf-8")
                         xp = xml_parser.XmlParser()
                         xp.parse_relationship(xml_txt)
                         self.external_rels[filename] = xp.relationships
                     for relationship in self.external_rels[filename]:
                         if relationship['id'] == r_id and relationship[
                                 'target_mode'] == "External":
                             flag_external = True
                 if flag_ole_link and flag_external:
                     ret = True
                     break
             except etree.ParseError as parse_err:
                 logging.warning(parse_err)
                 logging.warning(
                     "Error path: {file_path}".format(file_path=file_path))
                 ret = False
     return ret
Example #14
def procesXmlFiles(createprinter=False):
    '''
	ip_address = models.GenericIPAddressField(validators = [validate_ipv46_address])
	mac_address = models.CharField(max_length=200)
	model = models.CharField(max_length=200)
	serial_number = models.CharField(max_length=200)
	host_name = models.CharField(max_length=200)
	toner_level = models.CharField(max_length=200)
	pages_printed = models.IntegerField(blank = True)
	status = models.CharField(max_length=200)
	date = models.DateTimeField()
	'''
    parser = ET.XMLParser(encoding="ISO-8859-1")
    django.db.connection.close()
    xmls = MailsToProcess.objects.filter(done=False)
    for xml in xmls:
        try:
            tree = ET.parse(xml.xml_path, parser=parser)
            root = tree.getroot()
            for m in root:  #printers
                obj = PrinterReport()
                for attr in m:
                    if attr[0].text == 'DeviceIpAddress':
                        if attr[1].text:
                            print attr[1].text
                            obj.ip_address = attr[1].text
                    elif attr[0].text == 'DeviceMacAddress':
                        if attr[1].text.strip():
                            print attr[1].text
                            obj.mac_address = attr[1].text
                    elif attr[0].text == 'DeviceHostName':
                        if attr[1].text:
                            print attr[1].text
                            obj.host_name = attr[1].text
                    elif attr[0].text == 'DeviceModelName':
                        if attr[1].text:
                            print attr[1].text
                            obj.model = attr[1].text
                    elif attr[0].text == 'DeviceSerialNumber':
                        if attr[1].text.strip():
                            print attr[1].text
                            obj.serial_number = attr[1].text

                    elif attr[0].text == 'deviceAggregateTonerLevels':
                        if attr[1].text:
                            print attr[1].text
                            obj.toner_level = attr[1].text
                    elif attr[0].text == 'deviceAggregateStatus':
                        if attr[1].text:
                            print attr[1].text
                            obj.status = attr[1].text
                    elif attr[
                            0].text == 'totalUsagePagesPrinted':  #totalUsagePagesPrinted
                        if attr[1].text:
                            print attr[1].text
                            obj.pages_printed = attr[1].text
                            try:
                                int(obj.pages_printed)
                                obj.is_valid = True
                            except ValueError:
                                obj.is_valid = False
                obj.date = datetime.now()
                try:
                    if obj.mac_address and obj.mac_address.strip() != '':
                        if Printer.objects.filter(
                                mac_address=obj.mac_address).count() == 1:
                            p = Printer.objects.get(
                                mac_address=obj.mac_address)
                        elif Printer.objects.filter(
                                mac_address=obj.mac_address
                        ).count(
                        ) > 1 and obj.serial_number and obj.serial_number.strip(
                        ) != '':
                            p = Printer.objects.get(
                                mac_address=obj.mac_address,
                                serial_number=obj.serial_number)
                        else:
                            p = None
                    else:
                        if obj.serial_number and obj.serial_number.strip(
                        ) != '':
                            if Printer.objects.filter(
                                    serial_number=obj.serial_number).count(
                                    ) == 1:
                                p = Printer.objects.get(
                                    serial_number=obj.serial_number)

                except ObjectDoesNotExist:
                    p = None

                except Exception:
                    p = None

                if p:
                    obj.printerOwner_id = p.id
                    obj.save()
                    xml.done = True
                    xml.save()
                    p.last_report_id = obj.id
                    p.save()
        except Exception:
            print 'XML Invalid: %s' % xml.xml_path
            '''elif createprinter:

				idPrinter = createPrinters(obj)
				if idPrinter:
					obj.printer_id = idPrinter
					obj.save()
					xml.done = True
					xml.save()'''

    return True
def load_pan_data(xmls_directory,
                  truth_path,
                  write_to_txt_files=False,
                  txts_destination_directory=None):
    """Load PAN data

    This function loads the PAN dataset and the truth, parses the XML and returns:
    Merged tweets of the authors, the truth, Author IDs, and the original length of the tweets.
    It also writes the tweets to TXT files (optional).

    Args:
        xmls_directory: The directory where the XML files of the dataset reside.
        truth_path: The path of the truth file.
        write_to_txt_files: (boolean) If True, the XML files will also be written as TXT files after being parsed.
        txts_destination_directory: The TXT files will be written to this directory.

    Returns:
        merged_tweets_of_authors: List. Each item is all of the tweets of an author, merged into one string.
            Refer to the list of replacements in the remarks.
        truths: List of truths for authors.
        author_ids: List of Author IDs.
        original_tweet_lengths: List of original tweet lengths.

    Raises:
        RuntimeError: If a non-XML file exists inside the *xmls_directory*

    Remarks:
        - Since *xml_filenames* is sorted in ascending order, all the returned lists will also be in the same order
        (sorted in ascending order of the Author IDs).
        - List of replacements:
            Line feed		<LineFeed>
            End of Tweet	<EndOfTweet>
    """
    ''' 
    *os.listdir* returns a list containing the name of all files and folders in the given directory.
    Normally, the list is created in ascending order. However, the Python documentation states,
    “the list is in arbitrary order”.
    To ensure consistency and avoid errors in syncing the order of the items among
    different lists (e.g., *author_ids*, *truths*), we sort the list by calling *sorted*.
    *sorted()* returns a new sorted list (in ascending lexicographical order) of all the items in an iterable.
    '''
    xml_filenames = sorted(os.listdir(xmls_directory))

    # Store the Author IDs in a list
    # The Author IDs list will have the same order as the XML filenames list.
    author_ids = []  # Create an empty list
    for xml_filename in xml_filenames:
        author_ids.append(xml_filename[:-4])

    # Skip loading truth if path input is None. Else, load the truth from the file.
    if truth_path is None:
        logger.info("*truth_path* is None => Skipped loading the truth")
        truths = None
        # This scenario will happen when loading the test dataset for **TIRA** evaluation, where the truth of the test
        # set is not provided.
    else:
        truths = load_truth(truth_path, author_ids)

    if write_to_txt_files:
        logger.info("The parsed XMLs will also be written to TXT files.")
        # Create the directory if it does not exist.
        os.makedirs(txts_destination_directory, exist_ok=True)

    # Initialize the lists.
    # The lists will have the same order as the XML filenames list (refer to: “Iterate over XML Files”)
    original_tweet_lengths = []  # Create an empty list
    # ↳ Every row will represent an author, every column will represent a tweet.
    merged_tweets_of_authors = []  # Create an empty list
    # ↳ Each cell will contain all 100 tweets of an author, merged.

    # Iterate over XML files
    for author_index, xml_filename in enumerate(xml_filenames):
        # Make sure only XML files go through
        if not fnmatch.fnmatch(xml_filename, '*.xml'):
            logger.error(
                "Encountered a non-XML file inside the directory: %s >>> The program will now exit.",
                xml_filename)
            raise RuntimeError(
                'Encountered a non-XML file inside the directory: %s' %
                xml_filename)
            # ↳ This is printf-style String Formatting.

        # Read the XML file and parse it into a tree
        # Parser is explicitly defined to ensure UTF-8 encoding.
        tree = ElementTree.parse(
            os.path.join(xmls_directory, xml_filename),
            parser=ElementTree.XMLParser(encoding="utf-8"))
        root = tree.getroot()
        '''
        root is the root element of the parsed tree
        root[0], ..., root[m-1] are the children of root—elements one level below the root.
        root[0][0], ..., root[0][n-1] are the children of root[0].
        and so on.
        
        Each element has a tag, a dictionary of attributes, and sometimes some text:
            root[i][j].tag, root[i][j].attrib, root[i][j].text
        '''

        # Add an empty new row to the list. Each row represents an author.
        original_tweet_lengths.append([])

        # Initialize the list. Note that this list resets in every author (XML file) loop.
        tweets_of_this_author = []  # Create an empty list

        # Iterate over the tweets within this parsed XML file:
        # Record the tweet length, replace line feeds, and append the tweet to a list
        for child in root[0]:
            # Element.text accesses the element's text content,
            # which is saved with the following format in the XML files: <![CDATA[some text]]>
            tweet = child.text
            original_tweet_lengths[author_index].append(len(tweet))

            # Replace line feed (LF = \n) with “ <LineFeed> ”
            # Note: There were no carriage return (CR = \r) characters in any of the 3,000 XML files.
            tweet = tweet.replace('\n', " <LineFeed> ")

            # Create a list of the tweets of this author, to write to a text file and merge, after the loop terminates.
            '''
            Google Python Style Guide: Avoid using the + and += operators to accumulate a string within a loop.
            Since strings are immutable, this creates unnecessary temporary objects and results in quadratic rather
            than linear running time.
            Avoid: merged_tweets_of_authors[author_index] += tweet + " <EndOfTweet> "
            Instead, append each substring to a list and ''.join the list after the loop terminates.
            '''
            tweets_of_this_author.append(tweet)

        # Write the tweets of this author to a TXT file
        # Note that in these tweets, the line feed characters are replaced with a tag.
        if write_to_txt_files:
            # Create a TXT file with the Author ID as the filename (same as the XML files) in the write mode
            with open(os.path.join(txts_destination_directory,
                                   author_ids[author_index] + ".txt"),
                      'w',
                      encoding="utf-8") as txt_output_file:
                txt_output_file.write('\n'.join(tweets_of_this_author))
                # ↳ '\n'.join adds a newline character between every two strings,
                # so there won't be any extra line feeds on the last line of the file.

        # Concatenate the tweets of this author, and append it to the main list
        merged_tweets_of_this_author = " <EndOfTweet> ".join(
            tweets_of_this_author) + " <EndOfTweet>"
        # ↳ " <EndOfTweet> ".join adds the tag between every two strings, so we need to add another tag to the end.
        merged_tweets_of_authors.append(merged_tweets_of_this_author)

    logger.info("@ %.2f seconds: Finished loading the dataset",
                time.process_time())

    return merged_tweets_of_authors, truths, author_ids, original_tweet_lengths
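
A hypothetical invocation of load_pan_data (the paths are assumptions):

merged_tweets, truths, author_ids, tweet_lengths = load_pan_data(
    'data/pan18/xmls', 'data/pan18/truth.txt',
    write_to_txt_files=True, txts_destination_directory='data/pan18/txts')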
Example #16
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET

from pyrevit import script

__context__ = 'zero-doc'

utf8xml = script.get_bundle_file('utf8.xml')
utf16xml = script.get_bundle_file('utf16.xml')

for xmlfile in [utf8xml, utf16xml]:
    print('Testing: {}'.format(xmlfile))
    c = ET.parse(xmlfile)
    print(c)

    xmlp = ET.XMLParser(encoding="utf-16")
    f = ET.parse(xmlfile, parser=xmlp)
    print(xmlp, f)
def open_file_xml(file_name):
    parser = ET.XMLParser(encoding='utf-8')
    tree = ET.parse(file_name, parser)
    root = tree.getroot()
    items = root.findall('channel/item')
    return items
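
For example, pointing it at the same kind of RSS feed file that the snippet in Example #20 below parses (the filename is an assumption):

items = open_file_xml('newsafr.xml')
print(len(items))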
Example #18
class Parser:
    __parser = ElementTree.XMLParser(encoding='utf-8')
    __tree = None
    __root = None

    relations = {}
    ways = {}
    nodes = {}

    def __init__(self, file_name):
        self.__tree = ElementTree.parse(file_name, self.__parser)
        self.__root = self.__tree.getroot()
        self.__load_to_memory()
        self.__process_tags()

    def __load_to_memory(self):
        for element in self.__root.iter():
            if element.tag == 'relation':
                relation = elements.Relation(element)
                self.relations[relation.relation_id] = relation
            elif element.tag == 'node':
                node = elements.Node(element)
                self.nodes[node.node_id] = node
            elif element.tag == 'way':
                way = elements.Way(element)
                self.ways[way.way_id] = way

    def __extract_country(self, source, source_type):
        if source.is_representing_country():
            if source_type == 'way':
                return Country.extract_from_way(source=source,
                                                nodes=self.nodes)
            elif source_type == 'relation':
                return Country.extract_from_relation(source=source,
                                                     nodes=self.nodes,
                                                     ways=self.ways)
        raise ValueError(
            'Source {0} is not representing country!'.format(source))

    def __process_source(self, source, source_type):
        country = self.__extract_country(source, source_type)
        self.__write_country_to_file(country=country)
        hooks.process_country(country=country)

    def __write_country_to_file(self, country):
        path = 'output/'
        if not os.path.exists(path):
            os.mkdir(path)
        output = open(path + country.iso2.lower() + '.json', 'w')
        output.write(country.to_json())
        output.close()

        if country.tags['is_in:continent'] == 'Europe':
            geojsonpath = 'data/'
            if not os.path.exists(geojsonpath):
                os.mkdir(geojsonpath)
            output = open(geojsonpath + country.iso2.lower() + '.geo.json',
                          'w')
            output.write(country.to_geojson())
            output.close()

    def __process_tags(self):
        for relation in self.relations.values():
            if relation.is_representing_country():
                self.__process_source(source=relation, source_type='relation')
        for way in self.ways.values():
            if way.is_representing_country():
                self.__process_source(source=way, source_type='way')
Example #19
# from lxml import etree as ElementTree
import xml.etree.ElementTree as ElementTree
import htmlentitydefs
import csv
import operator
import re

from config import *

# import gzip

# parser = ElementTree.XMLParser(attribute_defaults=True, load_dtd=True)
parser = ElementTree.XMLParser()

# Match ordinary page numbers (as in 10-17).
pageCounterNormal = re.compile('(\d+)-(\d+)')
# Match page number in the form volume:page (as in 12:140-12:150).
pageCounterColon = re.compile('[0-9]+:([1-9][0-9]*)-[0-9]+:([1-9][0-9]*)')

def startpage(input):
    if (input is None):
        return 0
    pageCounterMatcher1 = pageCounterNormal.match(input)
    pageCounterMatcher2 = pageCounterColon.match(input)
    start = 0

    if (not (pageCounterMatcher1 is None)):
        start = int(pageCounterMatcher1.group(1))
    else:
        if (not (pageCounterMatcher2 is None)):
            start = int(pageCounterMatcher2.group(1))
    return start
Example #20
import xml.etree.ElementTree as ET

parser = ET.XMLParser(encoding = 'UTF-8')
tree = ET.parse('newsafr.xml', parser)
root = tree.getroot()

news_list = root.findall('channel/item')

all_news = ''
for i,news in enumerate(news_list):
  discript = news.find("description").text
  all_news += discript
list_all_word = all_news.split(' ')
new_list = []
for element in list_all_word:
  if len(element) > 6:
    new_list.append(element)
  else:
    continue
common_words = sorted(set(new_list), key = new_list.count, reverse = True)

print(common_words[:10])
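
The ranking above calls new_list.count once per unique word, which is quadratic in the input size; collections.Counter produces essentially the same top 10 (up to tie ordering) in linear time:

from collections import Counter

print([word for word, _ in Counter(new_list).most_common(10)])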
Example #21
    def _process(
        manifest_file_subpath: str,
        processed_manifests: Optional[Dict[str, ET.ElementTree]] = None
    ) -> Dict[str, ET.ElementTree]:
        """Process a manifest root file and keep an acumulator for recursion on
        ``<include>`` tags."""

        debug(
            line("""Processing manifest file {!r}""").format(
                manifest_file_subpath))

        if not processed_manifests:
            processed_manifests = {}

        if (os.path.normpath(manifest_file_subpath)
                in processed_manifests.keys()):
            raise RepoSourceTreeManifestParsingError(
                line(
                    """An already processed manifest file ({!r}) is again candidate
                to snapshot. Is there an include loop in the manifest file
                structure?""").format(manifest_file_subpath))

        try:
            parser = ET.XMLParser(target=CommentedTreeBuilder())
            xmldoc = ET.parse(os.path.join(internal_manifest_repo_path,
                                           manifest_file_subpath),
                              parser=parser)
        except Exception as exc:
            raise RepoSourceTreeManifestParsingError(
                line(
                    """Failure when parsing {!r} manifest file. Exception raised
                was: {!r}""").format(manifest_file_subpath, exc))

        for proj in xmldoc.findall("./project"):
            try:
                proj_repo_subpath = proj.attrib["path"]
            except KeyError:
                raise UnexpectedRepoSourceTreeStructure(
                    line(
                        """A project in manifest ({!r}) does not have a path."""
                    ).format(manifest_file_subpath))
            try:
                # according to repo manifest documentation, groups can be
                # separated with commas or spaces
                proj_group_set = set(proj.attrib["groups"].replace(
                    ",", " ").split())
            except KeyError:
                proj_group_set = set()

            # skip this project if the project group set does not intersect
            # with the groups to snapshot set (if provided) or if it does
            # intersect with the groups NOT to snapshot set (if provided as
            # well):
            if snapshot_groups and not set(snapshot_groups) & proj_group_set:
                debug(
                    line(
                        """Skipping project {!r} because it does not belong to any
                    group candidate for snapshot provided as argument.""").
                    format(proj_repo_subpath))
                continue
            if (no_snapshot_groups
                    and set(no_snapshot_groups) & proj_group_set):
                debug(
                    line(
                        """Skipping project {!r} because it belongs to a repo group
                    which shall not be snapshotted.""").format(
                            proj_repo_subpath))
                continue

            try:
                proj_git_repo = git.Repo(
                    os.path.join(root_path, proj_repo_subpath))
            except:
                raise UnexpectedRepoSourceTreeStructure(
                    line(
                        """Repo project {!r} is not a git repository anymore."""
                    ).format(proj_repo_subpath))

            if proj_git_repo.is_dirty():
                raise ProjectInRepoSourceTreeInUncleanState(
                    line("""Repo project {!r} is in a dirty state.""").format(
                        proj_repo_subpath))

            if not proj_git_repo.head.is_valid():
                raise ProjectInRepoSourceTreeInUncleanState(
                    line("""Repo project {!r} has not a valid git HEAD.""").
                    format(proj_repo_subpath))

            if use_branches:
                try:
                    proj_revision = proj_git_repo.head.reference.name
                except:
                    debug(
                        line("""Project {!r} cannot use git symbolic reference
                        because it is in a detached HEAD state.""").format(
                            proj_repo_subpath))
                    proj_revision = proj_git_repo.head.commit.hexsha
            else:
                proj_revision = proj_git_repo.head.commit.hexsha

            # Check that the returned revision is effectively pointed by a Git
            # reference or any ancestor of a Git reference. This is required in
            # order to avoid using a Git revision that can be garbage-collected
            # by Git.
            # Important note: we do not check that the reference pointing on
            # the current HEAD is effectively valid on at least one Git remote.
            # We consider that it is up to the user to push that Git reference
            # to the suitable authoritative Git repository for future usage.
            for ref in proj_git_repo.references:
                if (proj_git_repo.head.commit == ref.commit
                        or proj_git_repo.head.commit in ref.commit.parents):
                    break
            else:
                raise ProjectInRepoSourceTreeInUncleanState(
                    line(
                        """No Git symbolic reference is pointing to the current
                    HEAD commit of the project {!r} or any of its eventual
                    successors.""").format(proj_repo_subpath))

            # change the revision attribute in the XML project tag:
            proj.attrib["revision"] = proj_revision

            debug("Project {!r} will be snapshotted to revision {!r}.".format(
                proj_repo_subpath, proj_revision))

        # manifest is processed: all the projects have been snapshotted to a
        # revision
        processed_manifests.update(
            {os.path.normpath(manifest_file_subpath): xmldoc})

        # iterate over the other included manifests
        for manifest in xmldoc.findall("./include"):
            manifest_subpath = manifest.attrib["name"]
            processed_manifests = _process(
                manifest_file_subpath=manifest_subpath,
                processed_manifests=processed_manifests)

        return processed_manifests
def parseCmpLib(filePath):
    def buildReqDataTable(elementTree, tagName, dataTable):
        for reqEle in elementTree.iter(tagName):
            refID = reqEle.get("id")
            row = {"refID": refID}
            for item in reqEle:
                row[item.tag] = item.text

            dataTable.insertRow(row)

    def buildGroupTable(elementTree, groupTable):  #import component folders
        topSetting = ""
        #get the info of the top group (it's not the top-level folder!)
        for topGroupEle in elementTree.findall("./TopGroup"):
            topSetting = topGroupEle
            refID = topGroupEle.get("id")
            guid = getTextFromChildEle(topGroupEle, "GUID")
            itemNaming = getTextFromChildEle(topGroupEle, "ItemNamingScheme")

            groupTable.insertRow({
                "HRID": "Top Group#",
                "refID": refID,
                "GUID": guid,
                "Path": "",
                "ParentGroup": "",
                "ItemNamingScheme": itemNaming
            })

        #traverse the path, get all folders
        for tGroupEle in topSetting.findall(".//TGroup"):
            refID = tGroupEle.get("id")
            guid = ""
            guid = getTextFromChildEle(tGroupEle, "GUID")
            hrid = getTextFromChildEle(tGroupEle, "HRID")
            path = getTextFromChildEle(tGroupEle, "Path")
            for parentGroupEle in tGroupEle.findall("./ParentGroup"):
                parentGroup = parentGroupEle.get("href").replace("#", "")

            groupTable.insertRow({
                "HRID": hrid,
                "refID": refID,
                "GUID": guid,
                "Path": path,
                "ParentGroup": parentGroup,
                "ItemNamingScheme": ""
            })

    def buildCmpTableSet(elementTree, cmpTable, paramMatchTable,
                         modelMatchTable):
        cmpID = 0
        for cmpEle in elementTree.findall(".//TComponentDefinition"):
            guid = getTextFromChildEle(cmpEle, "GUID")
            hrid = getTextFromChildEle(cmpEle, "HRID")
            cmpType = getTextFromChildEle(cmpEle, "ComponentTypes")
            itemHRID = getTextFromChildEle(cmpEle, "ItemHRID")
            revGUID = getTextFromChildEle(cmpEle, "RevisionGUID")
            namingScheme = getTextFromChildEle(cmpEle, "ItemNamingScheme")
            parentGroup = ""  # guard against a missing ParentGroup child
            for parentGroupEle in cmpEle.findall("./ParentGroup"):
                parentGroup = parentGroupEle.get("href").replace("#", "")

            cmpTable.insertRow({
                "HRID": hrid,
                "GUID": guid,
                "ParentGroup": parentGroup,
                "ComponentTypes": cmpType,
                "ItemHRID": itemHRID,
                "RevisionGUID": revGUID,
                "ItemNamingScheme": namingScheme,
                "refID": cmpID
            })

            for tParamEle in cmpEle.findall(".//TParameter"):
                paramValue = getTextFromChildEle(tParamEle, "Value")
                realValue = getTextFromChildEle(tParamEle, "RealValue")
                reqParam = ""  # guard against a missing RequiredParameter
                for reqParamEle in tParamEle.iter("RequiredParameter"):
                    reqParam = reqParamEle.get("href").replace("#", "")

                paramMatchTable.insertRow({
                    "RequiredParameter": reqParam,
                    "Value": paramValue,
                    "RealValue": realValue,
                    "Component": cmpID
                })

            for modelEle in cmpEle.findall(".//TModelChoice"):
                reqModel = modelLink = ""  # guards against missing children
                for reqModelEle in modelEle.iter("RequiredModel"):
                    reqModel = reqModelEle.get("href").replace("#", "")
                for modelLinkEle in modelEle.iter("ModelLink"):
                    modelLink = modelLinkEle.get("href").replace("#", "")

                modelMatchTable.insertRow({
                    "RequiredModel": reqModel,
                    "ModelLink": modelLink,
                    "Component": cmpID
                })

            cmpID += 1

    tables = componentLibrary()
    parser = ET.XMLParser(encoding="utf-8")
    eleCmplib = ET.parse(filePath, parser=parser)

    # import all required model and parameter tables
    buildReqDataTable(eleCmplib, "TRequiredParameter",
                      tables.dataTables["RequiredParameters"])
    buildReqDataTable(eleCmplib, "TRequiredModel",
                      tables.dataTables["RequiredModels"])
    buildReqDataTable(eleCmplib, "TModelLink", tables.dataTables["ModelLinks"])

    # import component data, model choices and parameter links
    buildGroupTable(eleCmplib, tables.dataTables["Group"])

    # buildCmpTableSet fills its tables in place and returns nothing
    buildCmpTableSet(
        eleCmplib, tables.dataTables["ComponentDefinitions"],
        tables.dataTables["ParameterLinks"], tables.dataTables["ModelChoices"])

    # import basic CmpLib info
    lifeCycleGUID = getTextFromChildEle(eleCmplib, "LifeCycleDefinitionGUID")
    revNamingGUID = getTextFromChildEle(eleCmplib, "RevisionNamingSchemeGUID")
    vaultGUID = getTextFromChildEle(eleCmplib, "VaultGUID")
    vaultName = getTextFromChildEle(eleCmplib, "VaultName")
    tempVaultGUID = getTextFromChildEle(eleCmplib, "TemplateVaultGUID")
    tempRevGUID = getTextFromChildEle(eleCmplib, "TemplateRevisionGUID")

    tables.dataTables["BasicInfo"].insertRow({
        "LifeCycleDefinitionGUID":
        lifeCycleGUID,
        "RevisionNamingSchemeGUID":
        revNamingGUID,
        "VaultGUID":
        vaultGUID,
        "VaultName":
        vaultName,
        "TemplateVaultGUID":
        tempVaultGUID,
        "TemplateRevisionGUID":
        tempRevGUID
    })

    addComponentComment(tables.dataTables)

    for aTable in tables.dataTables:
        tables.dataTables[aTable].removeBadChars()
    return tables
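
A possible call site; the .CmpLib path is hypothetical, and componentLibrary, getTextFromChildEle and addComponentComment are assumed to come from the snippet's surrounding module:

tables = parseCmpLib("MyParts.CmpLib")  # hypothetical component-library file
for name in tables.dataTables:
    print(name)
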
    def __init__(self, file_folder, file_name):
        self.file_path = os.path.join(file_folder, file_name)
        self.file_name = file_name
        self.tree = Et.parse(self.file_path,
                             parser=Et.XMLParser(encoding=ENCODING))
        self.root = self.tree.getroot()
Beispiel #24
0
def get_data_from_xml(filename, code):
    parser = etree.XMLParser(encoding=code)  # e.g. 'iso8859_5' or 'koi8_r'

    tree = etree.parse(filename, parser=parser)
    return tree
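
Usage is a one-liner; the file name and encoding below are placeholders:

tree = get_data_from_xml("report.xml", "koi8_r")
print(tree.getroot().tag)
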
Beispiel #25
0
import xml.etree.ElementTree as ET

parser = ET.XMLParser(encoding="utf-8")
tree = ET.parse("newsafr.xml", parser)
root = tree.getroot()

news_list = root.findall("channel/item")

full_set = set()
for news in news_list:
    # add each item's description once; the original built one concatenated
    # string and re-added it to the set on every pass, which was redundant
    full_set.add(news.find("description").text)

def count_word(full_set):
    word_value = {}
    full_set = ', '.join(full_set)
    full_set = full_set.split(" ")

    for elements in full_set:
        if len(elements) > 6:
            if elements in word_value:
                word_value[elements] += 1
            else:
                word_value[elements] = 1
    return word_value

def sorted_w(word_value):
    sorted_words = sorted(word_value.items(), key=lambda x: x[1], reverse=True)
    return sorted_words
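
The two helpers are defined but never invoked above; a small driver that prints the ten most frequent long words (the cut-off of ten is an arbitrary choice):

word_value = count_word(full_set)
for word, count in sorted_w(word_value)[:10]:
    print(word, count)
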
def operation():
    Label(roottk, text="processing",
          font=('helvetica', 12, 'bold')).place(x=230, y=360)

    directory = input_directory.get()
    userinput = newfile_input.get()
    effective_date = input_effective_date.get()
    productname = userinput

    obj = os.scandir(directory)

    updatedname = []
    newname = ''
    samplename = []
    oldname = []
    oldpath = []

    for entry in obj:
        for x in os.scandir(entry.path):
            oldname.append(x.path)
            oldpath.append(entry.path)
            # drop the first two '_' tokens and prepend the user-supplied name
            name = x.name.split('_')
            updatedname = name[2:]
            newname = userinput + '_'
            arraylength = len(updatedname)
            for z in range(0, arraylength):
                if z == arraylength - 1:
                    newname = newname + updatedname[z]
                else:
                    newname = newname + updatedname[z] + '_'
            samplename.append(newname)
            newname = ''

    if oldname and oldpath and samplename:
        print('code is executing')
    else:
        print('code is going to fail')

    #print(samplename)
    #print(len(oldname))
    #print(len(oldpath))
    #print(len(samplename))

    for x in range(0, len(samplename)):
        os.rename(oldname[x], os.path.join(oldpath[x], samplename[x]))

    print('done')

    print('processing...')  # brief artificial delay before editing the files
    time.sleep(5)

    def splitjoin(sampleid, productname):
        newmanuscript = sampleid.split('_')[2:]
        newmanuscript.insert(0, productname)
        newmanuscript = '_'.join(newmanuscript)
        return newmanuscript

    def splitjoinlob(productname):
        productname = productname.split('_')
        productname = ''.join(productname)
        return productname

    def splitjoincaption(caption, productname):
        captionarr = []
        newcaption = caption.split(' ')
        for x in newcaption:
            if x != '':
                captionarr.append(x)
        updatedcaption = captionarr[2:]
        productname = productname.split('_')
        for y in range(0, len(productname)):
            updatedcaption.insert(y, productname[y])
        updatedcaption = ' '.join(updatedcaption)
        return updatedcaption

    for entry in os.scandir(directory):
        for file in os.scandir(entry):
            parser = ET.XMLParser(encoding='utf-8')
            root = ET.parse(file.path, parser=parser)
            root.getroot()[0].attrib['versionDate'] = effective_date
            manuscriptid = root.getroot()[0].attrib['manuscriptID']
            versionid = root.getroot()[0].attrib['versionID']
            caption = root.getroot()[0].attrib['caption']

            updatedmanuscriptid = splitjoin(manuscriptid, productname)
            updatedlob = splitjoinlob(productname)
            updatedcaption = splitjoincaption(caption, productname)
            updatedversionid = updatedmanuscriptid
            #print(updatedmanuscriptid, updatedversionid)

            #root.getroot()[0].attrib['lob']=updatedlob
            root.getroot()[0].attrib['caption'] = updatedcaption
            root.getroot()[0].attrib['manuscriptID'] = updatedmanuscriptid
            root.getroot()[0].attrib['versionID'] = updatedversionid
            #print(updatedlob,updatedcaption)

            myroot = root.getroot()[0]
            for x in myroot:
                if x.tag == 'keys':
                    for y in x:
                        if y.attrib['name'] == 'lob':
                            y.attrib['value'] = updatedlob
                        if y.attrib['name'] == 'effectiveDateNew':
                            y.attrib['value'] = effective_date
                            #print(y.attrib['value'])
                        if y.attrib['name'] == 'effectiveDateRenewal':
                            y.attrib['value'] = effective_date

            root.write(file.path)
            #Label1=Label(roottk,text="processing...",font=('helvetica', 12, 'bold')).place(x=230,y=360)
            print('wait...')

    obj.close()
    print('done')

    Label(roottk, text="Executed", font=('helvetica', 12, 'bold')).place(x=230,
                                                                         y=390)
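
Stripped of the Tkinter plumbing, the renaming rule used above is: drop the first two underscore-separated tokens of the file name and prepend the new product name. A standalone sketch with a made-up file name:

def rename_sample(old_name, product_name):
    # keep everything after the first two '_' tokens, prefix the product name
    return '_'.join([product_name] + old_name.split('_')[2:])

print(rename_sample('OLD_V1_Policy_Form.xml', 'NEWPROD'))
# NEWPROD_Policy_Form.xml
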
def compilation_textpair(traitement, racine):
    parser = ET.XMLParser(encoding='utf-8')
    with open(racine + "/" + traitement.chemin_projet + "/resultat_final.xml") as fichier_xml:
        fichier = fichier_xml.read()
    treeResults = ET.fromstring(fichier, parser=parser)
    fichier_non_pertinentes = open("phrases_rejetees_2.csv", "w")
    phrases_rejetes = []

    #XML_final_TextPair = "<body>"
    divs = treeResults.findall(".//div")
    for div in divs:
        print(div.attrib['id'])
        resultats_phrases = {}
        xr_phrases = {}
        XML_final_TextPair = "<div id=\"" + div.attrib['id'] + "\">"
        segs = div.findall(".//seg")
        for seg in segs:
            print(seg.attrib)
            #XML_final_TextPair += "<seg id=\"" + seg.attrib['id'] + "\" corresp=\"" + seg.attrib['corresp'] + "\">"
            XML_final_TextPair += "<seg id=\"" + seg.attrib['id']+"\">"
            phrases = seg.findall(".//s")
            phrasesTarget = ""
            phrasesSource = ""
            for phrase in phrases:
                phrasesSource += phrase.attrib['id']+' '
                print(phrase.attrib['id'])
                print(ET.tostring(phrase))
                if phrase.attrib['id'] not in resultats_phrases.keys():
                    resultats_phrases[phrase.attrib['id']] = []
                if phrase.attrib['id'] not in xr_phrases.keys():
                    xr_phrases[phrase.attrib['id']] = phrase
                w_xr = phrase.findall(".//w[xr]")
                for w in w_xr:
                    xrs = w.findall(".//xr")
                    for xr in xrs:
                        if xr.attrib["corresp"].split("_")[0] not in phrasesTarget:
                            phrasesTarget += xr.attrib["corresp"].split("_")[0] + ' '
                        if xr.attrib["corresp"].split("_")[0] not in resultats_phrases[phrase.attrib['id']]:
                            resultats_phrases[phrase.attrib['id']].append(xr.attrib["corresp"].split("_")[0])
                        if phrase.attrib['id'] in xr_phrases.keys():
                            phrase_a_ajouter = xr_phrases[phrase.attrib['id']]
                            w_source = phrase_a_ajouter.find(".//w[@id=\"" + w.attrib['id'] + "\"]")
                            xrs_source = w_source.findall(".//xr")
                            indice = presence_xr(xr.attrib['corresp'], xrs_source)
                            if indice == 0:
                                w_source.insert(0, xr)
                                xr_phrases[phrase.attrib['id']] = phrase_a_ajouter
                print(ET.tostring(xr_phrases[phrase.attrib['id']]))

            interGrp = seg.find(".//interpGrp")
            phrasesSource = phrasesSource[:-1]
            interpSource = ET.Element("interp", attrib={"type": "phrasesSource", "corresp": phrasesSource})
            interpTarget = ET.Element("interp", attrib={"type": "phrasesTarget", "corresp": phrasesTarget})
            interGrp.insert(0, interpTarget)
            interGrp.insert(0, interpSource)
            XML_final_TextPair += ET.tostring(interGrp, encoding='utf8').decode()
            XML_final_TextPair += "</seg>"

        for id_phrase, resultats in resultats_phrases.items():
            phrase = xr_phrases[id_phrase]
            identifiants_target = ''

            xrs = phrase.findall(".//xr")
            interpGrps = []
            for it in resultats:
                print(it)
                score = 0
                for xr in xrs:
                    if it in xr.attrib['corresp']:
                        score += float(xr.attrib['cert'])
                if score > 0:
                    identifiants_target += it + ' '
                    ratio_score = calcul_ratio_score(score, phrase, treeResults.find(".//s[@id=\"" + it + "\"]"))
                    interpGrp_s = ET.Element("interpGrp", attrib={"corresp": it})
                    interpScore = ET.Element("interp", attrib={"type": "score"})
                    interpScore.text = str(round(score, 2))
                    interpRatioScore = ET.Element("interp", attrib={"type": "ratioScore"})
                    interpRatioScore.text = str(ratio_score)
                    interpGrp_s.insert(0, interpRatioScore)
                    interpGrp_s.insert(0, interpScore)
                    interpGrps.append(interpGrp_s)
                else:
                    fichier_non_pertinentes.write(id_phrase + "***" + it + "***" + "\n")
            identifiants_target = identifiants_target[:-1]
            phrase.set("corresp", identifiants_target)
            for ip in interpGrps:
                phrase.insert(0, ip)
            print(ET.tostring(phrase, encoding='utf8').decode())
            XML_final_TextPair += ET.tostring(phrase, encoding='utf8').decode()

        XML_final_TextPair += "</div>"

        XML_final_TextPair = re.sub(r"<\?xml version='1\.0' encoding='utf8'\?>", r"", XML_final_TextPair)
        #XML_final_TextPair = "<?xml version='1.0' encoding='utf8'?>" + XML_final_TextPair
        # print(XML_final)
        fichier_resultats = open(racine + "/" + traitement.chemin_projet + "/dossier_textes/"+div.attrib['id']+".xml", "w")
        fichier_resultats.write(XML_final_TextPair)
        fichier_resultats.close()

    fichier_non_pertinentes.close()
    os.system("cat " + racine + "/" + traitement.chemin_projet + "/dossier_textes/*.xml > " + racine + "/" + traitement.chemin_projet + "/dossier_textes/resultats_compiles.xml")
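
presence_xr and calcul_ratio_score are defined elsewhere in the source module. Plausible stand-ins inferred from the call sites, so the function can at least be exercised; these are assumptions, not the original implementations:

def presence_xr(corresp, xrs):
    # assumed behaviour: count the <xr> elements already carrying this corresp
    return sum(1 for xr in xrs if xr.attrib.get('corresp') == corresp)

def calcul_ratio_score(score, phrase_source, phrase_cible):
    # assumed behaviour: normalise the score by the source sentence word count
    nb_mots = len(phrase_source.findall('.//w'))
    return round(score / nb_mots, 2) if nb_mots else 0.0
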
from xml.etree import ElementTree
import os
import re
import csv

file_name = "movies.xml"
full_file = os.path.abspath(os.path.join('data', file_name))

try:
    parser = ElementTree.XMLParser(encoding="utf-8")
    dom = ElementTree.parse(full_file, parser=parser)

    tree = dom.getroot()

except ElementTree.ParseError as error:
    print("Error while parsing xml file: {}".format(error))


def parse_xml(tree):
    """Parses the xml file, reads a particular set of data,
    stores it in a dictionary and appends that to a list.
    Also updates the xml file (creates a new element under a particular tag).
    """

    try:
        movi_list = list()
        movie_csv = list()
        for child in tree.findall("genre"):
            if child.attrib['category'] == "Action":
                main_dict = dict()
                for movies in child.findall('./decade/'):
    def _parse_test_results(self,
                            result_filename,
                            test_results=None,
                            failing_test=None):
        """Handles result files with one or more test results.

        @param result_filename: log file to parse.
        @param test_results: parsed results will be appended to it.
        @param failing_test: tests considered failed will be appended to it.
        @return: dictionary of parsed test results.
        """
        xml = ''
        xml_start = False
        xml_complete = False
        xml_bad = False
        result = 'ParseTestResultFail'

        if test_results is None:
            test_results = {}

        if not os.path.isfile(result_filename):
            logging.error('Did not find file %s', result_filename)
            return test_results

        with open(result_filename) as result_file:
            for line in result_file:
                # If the test terminates early, the XML will be incomplete
                # and should not be parsed.
                if line.startswith('#terminateTestCaseResult'):
                    result = line.strip().split()[1]
                    xml_bad = True
                # Will only see #endTestCaseResult if the test does not
                # terminate early.
                elif line.startswith('#endTestCaseResult'):
                    xml_complete = True
                elif xml_start:
                    xml += line
                elif line.startswith('#beginTestCaseResult'):
                    # If we see another begin before an end then something is
                    # wrong.
                    if xml_start:
                        xml_bad = True
                    else:
                        xml_start = True
                        test_case = line.split(' ')[1]

                if xml_complete or xml_bad:
                    if xml_complete:
                        myparser = et.XMLParser(encoding='ISO-8859-1')
                        root = et.fromstring(xml, parser=myparser)
                        test_case = root.attrib['CasePath']
                        result = root.find('Result').get('StatusCode').strip()
                        xml_complete = False
                    test_results[result] = test_results.get(result, 0) + 1
                    if (result.lower() not in self.TEST_RESULT_FILTER
                            and failing_test is not None):
                        failing_test.append(test_case)
                    xml_bad = False
                    xml_start = False
                    result = 'ParseTestResultFail'
                    xml = ''

        return test_results
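
The begin/end marker handling reduces to a few lines when isolated. A self-contained illustration on a synthetic log; the log content and its single test case are made up:

import xml.etree.ElementTree as et

log = '''#beginTestCaseResult dEQP-GLES2.info.version
<TestCaseResult CasePath="dEQP-GLES2.info.version">
  <Result StatusCode="Pass"/>
</TestCaseResult>
#endTestCaseResult'''

xml_lines, inside = [], False
for line in log.splitlines():
    if line.startswith('#beginTestCaseResult'):
        inside = True
    elif line.startswith('#endTestCaseResult'):
        root = et.fromstring('\n'.join(xml_lines),
                             parser=et.XMLParser(encoding='ISO-8859-1'))
        print(root.attrib['CasePath'], root.find('Result').get('StatusCode'))
        inside, xml_lines = False, []
    elif inside:
        xml_lines.append(line)
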
Beispiel #30
0
    def _xml_to_obj(cls, serialized_str, encoding="iso-8859-2"):
        parser = ET.XMLParser(encoding=encoding)
        element = ET.fromstring(serialized_str, parser=parser)
        return cls._xml_ele_to_obj(cls._remove_xml_namespaces(element))