def _make_parser(self): # For some reason MAL returns an XML file with HTML exclusive # entities like á, so we have to create a custom XMLParser # to convert these entities correctly. parser = ET.XMLParser() parser.parser.UseForeignDTD(True) entities = dict() entities["nbsp"] = u'\u00A0' entities["iexcl"] = u'\u00A1' entities["cent"] = u'\u00A2' entities["pound"] = u'\u00A3' entities["curren"] = u'\u00A4' entities["yen"] = u'\u00A5' entities["brvbar"] = u'\u00A6' entities["sect"] = u'\u00A7' entities["uml"] = u'\u00A8' entities["copy"] = u'\u00A9' entities["ordf"] = u'\u00AA' entities["laquo"] = u'\u00AB' entities["not"] = u'\u00AC' entities["shy"] = u'\u00AD' entities["reg"] = u'\u00AE' entities["macr"] = u'\u00AF' entities["deg"] = u'\u00B0' entities["plusmn"] = u'\u00B1' entities["sup2"] = u'\u00B2' entities["sup3"] = u'\u00B3' entities["acute"] = u'\u00B4' entities["micro"] = u'\u00B5' entities["para"] = u'\u00B6' entities["middot"] = u'\u00B7' entities["cedil"] = u'\u00B8' entities["sup1"] = u'\u00B9' entities["ordm"] = u'\u00BA' entities["raquo"] = u'\u00BB' entities["frac14"] = u'\u00BC' entities["frac12"] = u'\u00BD' entities["frac34"] = u'\u00BE' entities["iquest"] = u'\u00BF' entities["Agrave"] = u'\u00C0' entities["Aacute"] = u'\u00C1' entities["Acirc"] = u'\u00C2' entities["Atilde"] = u'\u00C3' entities["Auml"] = u'\u00C4' entities["Aring"] = u'\u00C5' entities["AElig"] = u'\u00C6' entities["Ccedil"] = u'\u00C7' entities["Egrave"] = u'\u00C8' entities["Eacute"] = u'\u00C9' entities["Ecirc"] = u'\u00CA' entities["Euml"] = u'\u00CB' entities["Igrave"] = u'\u00CC' entities["Iacute"] = u'\u00CD' entities["Icirc"] = u'\u00CE' entities["Iuml"] = u'\u00CF' entities["ETH"] = u'\u00D0' entities["Ntilde"] = u'\u00D1' entities["Ograve"] = u'\u00D2' entities["Oacute"] = u'\u00D3' entities["Ocirc"] = u'\u00D4' entities["Otilde"] = u'\u00D5' entities["Ouml"] = u'\u00D6' entities["times"] = u'\u00D7' entities["Oslash"] = u'\u00D8' entities["Ugrave"] = u'\u00D9' 
entities["Uacute"] = u'\u00DA' entities["Ucirc"] = u'\u00DB' entities["Uuml"] = u'\u00DC' entities["Yacute"] = u'\u00DD' entities["THORN"] = u'\u00DE' entities["szlig"] = u'\u00DF' entities["agrave"] = u'\u00E0' entities["aacute"] = u'\u00E1' entities["acirc"] = u'\u00E2' entities["atilde"] = u'\u00E3' entities["auml"] = u'\u00E4' entities["aring"] = u'\u00E5' entities["aelig"] = u'\u00E6' entities["ccedil"] = u'\u00E7' entities["egrave"] = u'\u00E8' entities["eacute"] = u'\u00E9' entities["ecirc"] = u'\u00EA' entities["euml"] = u'\u00EB' entities["igrave"] = u'\u00EC' entities["iacute"] = u'\u00ED' entities["icirc"] = u'\u00EE' entities["iuml"] = u'\u00EF' entities["eth"] = u'\u00F0' entities["ntilde"] = u'\u00F1' entities["ograve"] = u'\u00F2' entities["oacute"] = u'\u00F3' entities["ocirc"] = u'\u00F4' entities["otilde"] = u'\u00F5' entities["ouml"] = u'\u00F6' entities["divide"] = u'\u00F7' entities["oslash"] = u'\u00F8' entities["ugrave"] = u'\u00F9' entities["uacute"] = u'\u00FA' entities["ucirc"] = u'\u00FB' entities["uuml"] = u'\u00FC' entities["yacute"] = u'\u00FD' entities["thorn"] = u'\u00FE' entities["yuml"] = u'\u00FF' entities["fnof"] = u'\u0192' entities["Alpha"] = u'\u0391' entities["Beta"] = u'\u0392' entities["Gamma"] = u'\u0393' entities["Delta"] = u'\u0394' entities["Epsilon"] = u'\u0395' entities["Zeta"] = u'\u0396' entities["Eta"] = u'\u0397' entities["Theta"] = u'\u0398' entities["Iota"] = u'\u0399' entities["Kappa"] = u'\u039A' entities["Lambda"] = u'\u039B' entities["Mu"] = u'\u039C' entities["Nu"] = u'\u039D' entities["Xi"] = u'\u039E' entities["Omicron"] = u'\u039F' entities["Pi"] = u'\u03A0' entities["Rho"] = u'\u03A1' entities["Sigma"] = u'\u03A3' entities["Tau"] = u'\u03A4' entities["Upsilon"] = u'\u03A5' entities["Phi"] = u'\u03A6' entities["Chi"] = u'\u03A7' entities["Psi"] = u'\u03A8' entities["Omega"] = u'\u03A9' entities["alpha"] = u'\u03B1' entities["beta"] = u'\u03B2' entities["gamma"] = u'\u03B3' entities["delta"] = u'\u03B4' 
entities["epsilon"] = u'\u03B5' entities["zeta"] = u'\u03B6' entities["eta"] = u'\u03B7' entities["theta"] = u'\u03B8' entities["iota"] = u'\u03B9' entities["kappa"] = u'\u03BA' entities["lambda"] = u'\u03BB' entities["mu"] = u'\u03BC' entities["nu"] = u'\u03BD' entities["xi"] = u'\u03BE' entities["omicron"] = u'\u03BF' entities["pi"] = u'\u03C0' entities["rho"] = u'\u03C1' entities["sigmaf"] = u'\u03C2' entities["sigma"] = u'\u03C3' entities["tau"] = u'\u03C4' entities["upsilon"] = u'\u03C5' entities["phi"] = u'\u03C6' entities["chi"] = u'\u03C7' entities["psi"] = u'\u03C8' entities["omega"] = u'\u03C9' entities["thetasym"] = u'\u03D1' entities["upsih"] = u'\u03D2' entities["piv"] = u'\u03D6' entities["bull"] = u'\u2022' entities["hellip"] = u'\u2026' entities["prime"] = u'\u2032' entities["Prime"] = u'\u2033' entities["oline"] = u'\u203E' entities["frasl"] = u'\u2044' entities["weierp"] = u'\u2118' entities["image"] = u'\u2111' entities["real"] = u'\u211C' entities["trade"] = u'\u2122' entities["alefsym"] = u'\u2135' entities["larr"] = u'\u2190' entities["uarr"] = u'\u2191' entities["rarr"] = u'\u2192' entities["darr"] = u'\u2193' entities["harr"] = u'\u2194' entities["crarr"] = u'\u21B5' entities["lArr"] = u'\u21D0' entities["uArr"] = u'\u21D1' entities["rArr"] = u'\u21D2' entities["dArr"] = u'\u21D3' entities["hArr"] = u'\u21D4' entities["forall"] = u'\u2200' entities["part"] = u'\u2202' entities["exist"] = u'\u2203' entities["empty"] = u'\u2205' entities["nabla"] = u'\u2207' entities["isin"] = u'\u2208' entities["notin"] = u'\u2209' entities["ni"] = u'\u220B' entities["prod"] = u'\u220F' entities["sum"] = u'\u2211' entities["minus"] = u'\u2212' entities["lowast"] = u'\u2217' entities["radic"] = u'\u221A' entities["prop"] = u'\u221D' entities["infin"] = u'\u221E' entities["ang"] = u'\u2220' entities["and"] = u'\u2227' entities["or"] = u'\u2228' entities["cap"] = u'\u2229' entities["cup"] = u'\u222A' entities["int"] = u'\u222B' entities["there4"] = u'\u2234' 
entities["sim"] = u'\u223C' entities["cong"] = u'\u2245' entities["asymp"] = u'\u2248' entities["ne"] = u'\u2260' entities["equiv"] = u'\u2261' entities["le"] = u'\u2264' entities["ge"] = u'\u2265' entities["sub"] = u'\u2282' entities["sup"] = u'\u2283' entities["nsub"] = u'\u2284' entities["sube"] = u'\u2286' entities["supe"] = u'\u2287' entities["oplus"] = u'\u2295' entities["otimes"] = u'\u2297' entities["perp"] = u'\u22A5' entities["sdot"] = u'\u22C5' entities["lceil"] = u'\u2308' entities["rceil"] = u'\u2309' entities["lfloor"] = u'\u230A' entities["rfloor"] = u'\u230B' entities["lang"] = u'\u2329' entities["rang"] = u'\u232A' entities["loz"] = u'\u25CA' entities["spades"] = u'\u2660' entities["clubs"] = u'\u2663' entities["hearts"] = u'\u2665' entities["diams"] = u'\u2666' entities["quot"] = u'\"' entities["amp"] = u'&' entities["lt"] = u'<' entities["gt"] = u'>' entities["OElig"] = u'\u0152' entities["oelig"] = u'\u0153' entities["Scaron"] = u'\u0160' entities["scaron"] = u'\u0161' entities["Yuml"] = u'\u0178' entities["circ"] = u'\u02C6' entities["tilde"] = u'\u02DC' entities["ensp"] = u'\u2002' entities["emsp"] = u'\u2003' entities["thinsp"] = u'\u2009' entities["zwnj"] = u'\u200C' entities["zwj"] = u'\u200D' entities["lrm"] = u'\u200E' entities["rlm"] = u'\u200F' entities["ndash"] = u'\u2013' entities["mdash"] = u'\u2014' entities["lsquo"] = u'\u2018' entities["rsquo"] = u'\u2019' entities["sbquo"] = u'\u201A' entities["ldquo"] = u'\u201C' entities["rdquo"] = u'\u201D' entities["bdquo"] = u'\u201E' entities["dagger"] = u'\u2020' entities["Dagger"] = u'\u2021' entities["permil"] = u'\u2030' entities["lsaquo"] = u'\u2039' entities["rsaquo"] = u'\u203A' entities["euro"] = u'\u20AC' parser.entity.update(entities) return parser
class CommentedTreeBuilder(ET.TreeBuilder):
    """A TreeBuilder that keeps XML comments as nodes in the parsed tree."""

    def __init__(self, *args, **kwargs):
        super(CommentedTreeBuilder, self).__init__(*args, **kwargs)

    def comment(self, text):
        # Store the comment as a regular node whose tag is ET.Comment.
        self.start(ET.Comment, {})
        self.data(text)
        self.end(ET.Comment)


if __name__ == "__main__":
    print('---------------------------------------------------------')
    print('Before: ElementTree ogirinal parser() dump()')
    plain_tree = ET.parse('sample.xml')
    plain_root = plain_tree.getroot()
    # The default parser silently drops comments.
    dump(plain_root)
    # Writing the tree back out: comments are not saved either.
    plain_tree.write('original.xml', xml_declaration=True)

    print('---------------------------------------------------------')
    print('After:Commented parser dump')
    # A custom parser target is required for comments to survive parsing.
    commented_tree = ET.parse(
        'sample.xml', parser=ET.XMLParser(target=CommentedTreeBuilder()))
    commented_root = commented_tree.getroot()
    dump(commented_root)  # comments are present this time
    # Re-save what was read; without an explicit encoding the Korean
    # comments may come out mangled.
    commented_tree.write('commented_parser.xml')
    # Save as UTF-8 so the non-ASCII comments are preserved intact.
    commented_tree.write('commented_parser2.xml', encoding='utf8',
                         xml_declaration=True)
def postprocess_translations(reduce_diff_hacks=False):
    """Validate and clean up all Qt ``.ts`` translation files.

    Each ``.ts`` file is renamed to ``.ts.orig``, re-parsed with control
    characters stripped, has invalid/unfinished translations and
    ``location`` tags removed, is dropped if nearly empty, and is finally
    written back to its original path.

    Args:
        reduce_diff_hacks: If True, patch ElementTree's CDATA escaping and
            post-process the serialized XML so it matches Qt's own
            formatting more closely (smaller diffs against Qt output).

    Returns:
        True if any translation failed validation, else False.
    """
    print('Checking and postprocessing...')

    if reduce_diff_hacks:
        # Monkey-patch ElementTree's CDATA escaping; the original is kept
        # in a global so it could be restored later.
        global _orig_escape_cdata
        _orig_escape_cdata = ET._escape_cdata
        ET._escape_cdata = escape_cdata

    # Move every .ts file aside so the cleaned version can be written to
    # the original path below.
    for (filename, filepath) in all_ts_files():
        os.rename(filepath, filepath + '.orig')

    have_errors = False
    for (filename, filepath) in all_ts_files('.orig'):
        # pre-fixups to cope with transifex output
        parser = ET.XMLParser(encoding='utf-8')  # need to override encoding because 'utf8' is not understood only 'utf-8'
        with open(filepath + '.orig', 'rb') as f:
            data = f.read()
        # remove control characters; this must be done over the entire file
        # otherwise the XML parser will fail
        data = remove_invalid_characters(data)
        tree = ET.parse(io.BytesIO(data), parser=parser)

        # iterate over all messages in file
        root = tree.getroot()
        for context in root.findall('context'):
            for message in context.findall('message'):
                numerus = message.get('numerus') == 'yes'
                source = message.find('source').text
                translation_node = message.find('translation')
                # pick all numerusforms
                if numerus:
                    translations = [
                        i.text for i in translation_node.findall('numerusform')
                    ]
                else:
                    translations = [translation_node.text]

                for translation in translations:
                    if translation is None:
                        continue
                    errors = []
                    valid = check_format_specifiers(
                        source, translation, errors, numerus) and not contains_erexcoin_addr(
                            translation, errors)

                    for error in errors:
                        print('%s: %s' % (filename, error))

                    if not valid:
                        # set type to unfinished and clear string if invalid
                        translation_node.clear()
                        translation_node.set('type', 'unfinished')
                        have_errors = True

                # Remove location tags
                for location in message.findall('location'):
                    message.remove(location)

                # Remove entire message if it is an unfinished translation
                if translation_node.get('type') == 'unfinished':
                    context.remove(message)

        # check if document is (virtually) empty, and remove it if so
        num_messages = 0
        for context in root.findall('context'):
            for message in context.findall('message'):
                num_messages += 1
        if num_messages < MIN_NUM_MESSAGES:
            # NOTE(review): this only reports and skips writing; the .orig
            # file is left behind rather than deleted — confirm intent.
            print('Removing %s, as it contains only %i messages' %
                  (filepath, num_messages))
            continue

        # write fixed-up tree
        # if diff reduction requested, replace some XML to 'sanitize' to qt
        # formatting
        if reduce_diff_hacks:
            out = io.BytesIO()
            tree.write(out, encoding='utf-8')
            out = out.getvalue()
            out = out.replace(b' />', b'/>')
            with open(filepath, 'wb') as f:
                f.write(out)
        else:
            tree.write(filepath, encoding='utf-8')
    return have_errors
def process(xml_file_path, out_path, extract_classes):
    """Filter a Pascal-VOC annotation file down to *extract_classes*.

    Reads the XML at *xml_file_path*, removes every ``<object>`` whose
    ``<name>`` is not in *extract_classes*, and writes the filtered
    annotation (plus the matching .jpg/.png image, if one exists next to
    the XML) into *out_path*.  If no objects remain, nothing is written.

    Args:
        xml_file_path: Path to the source VOC annotation XML.
        out_path: Existing directory receiving the filtered XML and image.
        extract_classes: Container of class names to keep.
    """
    if not os.path.exists(xml_file_path):
        print("skip '%s'" % (xml_file_path))
        return
    else:
        print("process '%s'" % (xml_file_path))

    utf8_parser = ET.XMLParser(encoding='utf-8')
    tree = ET.parse(xml_file_path, parser=utf8_parser)
    root = tree.getroot()

    # Collect the objects whose class is unwanted, then remove them
    # (collecting first avoids mutating while iterating).
    clear_objs = []
    for obj in root.iter('object'):
        name = obj.find('name').text
        if name not in extract_classes:
            clear_objs.append(obj)
    for obj in clear_objs:
        root.remove(obj)

    objs = root.findall('object')

    # Option 1 (active): no objects left -> return, so no XML is written.
    if len(objs) < 1:
        return
    # Option 2: disable the return above to emit object-free XMLs, for
    # frameworks that support negative-sample (0-object) training.
    # Option 3: for frameworks that cannot train on 0-object samples,
    # synthesize a 'background' box at the top-left corner (0,0)-(10,10).
    # NOTE: unreachable while Option 1's early return is active.
    if len(objs) < 1:
        element = ET.Element('object')
        oneName = ET.Element('name')
        oneName.text = 'background'
        onePose = ET.Element('pose')
        onePose.text = 'Unspecified'
        oneTruncated = ET.Element('truncated')
        oneTruncated.text = '1'
        oneDifficult = ET.Element('difficult')
        oneDifficult.text = '0'
        oneBndbox = ET.Element('bndbox')
        xmin = ET.Element('xmin')
        ymin = ET.Element('ymin')
        xmax = ET.Element('xmax')
        ymax = ET.Element('ymax')
        xmin.text = str(0)
        ymin.text = str(0)
        xmax.text = str(10)
        ymax.text = str(10)
        oneBndbox.append(xmin)
        oneBndbox.append(ymin)
        oneBndbox.append(xmax)
        oneBndbox.append(ymax)
        element.append(oneName)
        element.append(onePose)
        element.append(oneTruncated)
        element.append(oneDifficult)
        element.append(oneBndbox)
        root.append(element)

    # BUGFIX: the normalized path used to be assigned to a typo'd variable
    # ('ml_file_path') and never used, so Windows backslash paths were not
    # split correctly when deriving the output filename below.
    xml_file_path = xml_file_path.replace('\\', '/')
    tree.write(os.path.join(out_path, xml_file_path.split("/")[-1]),
               encoding="utf-8")

    # Copy the sibling image (same basename) next to the filtered XML.
    jpg_path = xml_file_path[:-3] + "jpg"
    png_path = xml_file_path[:-3] + "png"
    if os.path.exists(jpg_path):
        shutil.copy(jpg_path, out_path)
    elif os.path.exists(png_path):
        shutil.copy(png_path, out_path)
def element_from_string(text): xml_parser = ElementTree.XMLParser() xml_parser._fixtext = lambda text: text xml_parser.feed(text) return xml_parser.close()
def open_file(self):
    """Parse ``self.filename`` as UTF-8 XML and store the tree on ``self.tree``.

    BUGFIX: the file used to be opened in text mode (``"r"``), which decodes
    with the locale's default encoding — breaking on non-UTF-8 locales and
    making the parser's declared utf-8 encoding irrelevant.  Opening in
    binary mode lets the XMLParser perform the decoding itself.
    """
    with open(self.filename, "rb") as station_file:
        parser = ET.XMLParser(encoding='utf-8')
        self.tree = ET.parse(station_file, parser)
def compat_etree_fromstring(text):
    """Parse *text* into an element tree using the compat tree builder."""
    compat_parser = etree.XMLParser(target=_TreeBuilder())
    return etree.XML(text, parser=compat_parser)
# Exploration of the AIC20 vehicle re-identification dataset: load the
# per-image labels from the training XML and group images into tracks.
from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torchvision as tv
import xml.etree.ElementTree as ET
import numpy as np
import json

# NOTE(review): `Path` is used below but not imported in this chunk —
# presumably `from pathlib import Path` appears earlier in the file.
root_dir = Path('data/AIC20_ReID')
# Image folders and the per-split track list files of the dataset.
dataset = {x: root_dir / f'image_{x}' for x in ['train', 'test']}
track_txts = {x: root_dir / f'{x}_track.txt' for x in dataset.keys()}
train_xml_path = 'train_label.xml'
# Parse the training label XML (declared iso-8859-5 encoding) and take its
# first child, which holds one element per labelled image.
xml_data = ET.parse(
    str(root_dir / train_xml_path),
    parser=ET.XMLParser(encoding='iso-8859-5')).getroot()[0]
# imageName -> (vehicleID, cameraID)
labels = dict()
for x in xml_data:
    x = x.attrib
    labels[x['imageName']] = (x['vehicleID'], x['cameraID'])
# Each non-empty line of the track file lists the image names of one track.
lines = open(track_txts['train']).readlines()
tracks = [x.strip().split() for x in lines if len(x.strip()) != 0]
vehs = dict()
for i, track in enumerate(tracks):
    # All images of a track share one vehicle/camera id; take the first.
    veh_id, cam_id = zip(*[labels[img_id] for img_id in track])
    veh_id = veh_id[0]
    cam_id = cam_id[0]
    # Ensure an entry per vehicle id (populated further below / elsewhere).
    vehs.setdefault(veh_id, dict())
def __init__(self):
    """Create a fresh XML parser instance for this object."""
    # NOTE(review): `etree` is imported elsewhere in this module —
    # presumably lxml.etree or xml.etree.ElementTree; confirm which.
    self.parser = etree.XMLParser()
def myFilter(svgFilename, filename):
    # Clean a Fritzing SVG: strip the SVG namespace, collect elements whose
    # id starts with 'copper', remove their id-less <path> children, and (if
    # anything was removed) write the result under the global `outputDir`
    # with an XML declaration and a Fritzing banner prepended.
    # NOTE: Python 2 code (print statements, tree.getiterator()).
    #print "im myfilter",svgFilename
    import xml.etree.ElementTree as ET  #import ElementTree
    parser = ET.XMLParser(encoding="utf-8")
    tree = ET.parse(svgFilename, parser)
    # remove_namespace (defined elsewhere) strips the namespace prefix from
    # the element tags so plain tag names can be matched below.
    remove_namespace(tree, u'http://www.w3.org/2000/svg')
    svgRoot = tree.getroot()
    # print "tree", tree
    # Map every child element to its parent (ElementTree has no parent
    # pointers), covering the whole document.
    parent_map = dict((c, p) for p in tree.getiterator() for c in p)
    # A set avoids duplicate entries for the 'copper*' elements.
    coppers = set()
    for child in parent_map.values():
        removed = False
        # print "child", child
        id = child.attrib.get("id")
        if id and id.startswith('copper'):
            # Collect the elements whose id marks them as copper layers.
            coppers.add(child)
    # Gather every <path> that lives inside a 'copper*' element.
    paths = list()
    for copper in coppers:
        paths.extend(copper.findall("path"))
    #print "paths", paths
    for path in paths:
        #print "path", path
        #print "parent_map", parent_map
        id = path.attrib.get("id")
        if id is None:
            try:
                #print "id", id
                # Remove the id-less path from its parent.
                parent_map[path].remove(path)
                removed = True
            except:
                continue
                #print "f**k", path, path.tag, parent_map.get(path)
        else:
            #print "id", id
            removed = False
    # NOTE(review): `removed` only reflects the *last* path processed (and
    # is also reset inside the first loop), so whether the file is written
    # depends on iteration order — looks like a latent bug; confirm intent.
    if removed:
        # Write the cleaned file into the output directory.
        svgRoot.set("xmlns:svg", "http://www.w3.org/2000/svg")
        svgRoot.set("xmlns", "http://www.w3.org/2000/svg")
        #print "root", root, "tree",tree, "filename", svgFilename
        outFilename = outputDir + filename
        #print "outputdir", outputDir, "outFilename", outFilename
        tree.write(outFilename)
        # Re-open and prepend the XML declaration and Fritzing banner.
        with open(outFilename, "r+") as f:
            old = f.read()
            f.seek(0)
            f.write(
                '<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n' +
                '<!-- Created with Fritzing (http://www.fritzing.org/) -->' +
                '\n' + old)
            f.close()
    print "finised:", filename
def __init__(self, filename):  # noqa: C901
    """Read a VTU (VTK XML UnstructuredGrid) file and populate reader state.

    Parses *filename* (falling back to a raw-binary re-parse when the XML
    is malformed), validates the header, then merges points, cells, point
    data and cell data across all <Piece> elements into:
    ``self.points``, ``self.cells``, ``self.point_data``, ``self.cell_data``,
    ``self.field_data``, plus ``self.compression`` / ``self.header_type`` /
    ``self.byte_order`` / ``self.appended_data`` parsing metadata.

    Raises:
        ReadError: on any structural violation of the VTU format.
    """
    from xml.etree import ElementTree as ET

    parser = ET.XMLParser()
    try:
        tree = ET.parse(str(filename), parser)
        root = tree.getroot()
    except ET.ParseError:
        # Some writers emit raw binary that trips the XML parser; fall
        # back to the dedicated raw-binary reader (defined elsewhere).
        root = _parse_raw_binary(str(filename))

    if root.tag != "VTKFile":
        raise ReadError()
    if root.attrib["type"] != "UnstructuredGrid":
        raise ReadError()
    if root.attrib["version"] not in ["0.1", "1.0"]:
        raise ReadError(
            "Unknown VTU file version '{}'.".format(root.attrib["version"])
        )

    # fix empty NumberOfComponents attributes as produced by Firedrake
    for da_tag in root.findall(".//DataArray[@NumberOfComponents='']"):
        da_tag.attrib.pop("NumberOfComponents")

    if "compressor" in root.attrib:
        # Only LZMA and zlib compression are supported.
        assert root.attrib["compressor"] in [
            "vtkLZMADataCompressor",
            "vtkZLibDataCompressor",
        ]
        self.compression = root.attrib["compressor"]
    else:
        self.compression = None

    # Header integer width defaults to UInt32 when unspecified.
    self.header_type = (
        root.attrib["header_type"] if "header_type" in root.attrib else "UInt32"
    )

    try:
        self.byte_order = root.attrib["byte_order"]
        if self.byte_order not in ["LittleEndian", "BigEndian"]:
            raise ReadError(f"Unknown byte order '{self.byte_order}'.")
    except KeyError:
        self.byte_order = None

    grid, self.appended_data = get_grid(root)

    pieces = []
    field_data = {}
    for c in grid:
        if c.tag == "Piece":
            pieces.append(c)
        elif c.tag == "FieldData":
            # TODO test field data
            for data_array in c:
                field_data[data_array.attrib["Name"]] = self.read_data(data_array)
        else:
            raise ReadError(f"Unknown grid subtag '{c.tag}'.")

    if not pieces:
        raise ReadError("No Piece found.")

    points = []
    cells = []
    point_data = []
    cell_data_raw = []

    for piece in pieces:
        piece_cells = {}
        piece_point_data = {}
        piece_cell_data_raw = {}

        num_points = int(piece.attrib["NumberOfPoints"])
        num_cells = int(piece.attrib["NumberOfCells"])

        for child in piece:
            if child.tag == "Points":
                # Exactly one DataArray holds the coordinates.
                data_arrays = list(child)
                if len(data_arrays) != 1:
                    raise ReadError()
                data_array = data_arrays[0]
                if data_array.tag != "DataArray":
                    raise ReadError()
                pts = self.read_data(data_array)
                num_components = int(data_array.attrib["NumberOfComponents"])
                points.append(pts.reshape(num_points, num_components))
            elif child.tag == "Cells":
                # connectivity / offsets / types arrays, keyed by Name.
                for data_array in child:
                    if data_array.tag != "DataArray":
                        raise ReadError()
                    piece_cells[data_array.attrib["Name"]] = self.read_data(
                        data_array
                    )
                if len(piece_cells["offsets"]) != num_cells:
                    raise ReadError()
                if len(piece_cells["types"]) != num_cells:
                    raise ReadError()
                cells.append(piece_cells)
            elif child.tag == "PointData":
                for c in child:
                    if c.tag != "DataArray":
                        raise ReadError()
                    piece_point_data[c.attrib["Name"]] = self.read_data(c)
                point_data.append(piece_point_data)
            elif child.tag == "CellData":
                for c in child:
                    if c.tag != "DataArray":
                        raise ReadError()
                    piece_cell_data_raw[c.attrib["Name"]] = self.read_data(c)
                cell_data_raw.append(piece_cell_data_raw)
            else:
                raise ReadError(f"Unknown tag '{child.tag}'.")

    if not cell_data_raw:
        cell_data_raw = [{}] * len(cells)

    if len(cell_data_raw) != len(cells):
        raise ReadError()

    # Offsets shift each piece's cell connectivity into the merged
    # point-index space.
    point_offsets = numpy.cumsum([0] + [pts.shape[0] for pts in points][:-1])

    # Now merge across pieces
    if not points:
        raise ReadError()
    self.points = numpy.concatenate(points)

    if point_data:
        self.point_data = {
            key: numpy.concatenate([pd[key] for pd in point_data])
            for key in point_data[0]
        }
    else:
        self.point_data = None

    self.cells, self.cell_data = _organize_cells(
        point_offsets, cells, cell_data_raw
    )
    self.field_data = field_data
productname=productname.split('_') productname=''.join(productname) return productname def splitjoincaption(caption,productname): newcaption=caption.split(' ')[2:] productname=productname.split('_') productname=' '.join(productname) newcaption.insert(0,productname) newcaption=' '.join(newcaption) return newcaption for entry in os.scandir(directory): for file in os.scandir(entry): parser=ET.XMLParser(encoding='utf-8') root=ET.parse(file.path,parser=parser) root.getroot()[0].attrib['versionDate']=effective_date manuscriptid = root.getroot()[0].attrib['manuscriptID'] versionid = root.getroot()[0].attrib['versionID'] caption = root.getroot()[0].attrib['caption'] updatedmanuscriptid=splitjoin(manuscriptid, productname) updatedlob=splitjoinlob(productname) updatedcaption=splitjoincaption(caption,productname) updatedversionid=splitjoin(versionid,productname) #print(updatedmanuscriptid, updatedversionid) root.getroot()[0].attrib['lob']=updatedlob root.getroot()[0].attrib['caption']=updatedcaption root.getroot()[0].attrib['manuscriptID']=updatedmanuscriptid
def get_exteranl_ole_link_type(self, unzip_dir, office_type=""):
    """Return True if the unzipped OOXML package links an OLE object externally.

    Walks *unzip_dir* looking at each part: ``.xml`` parts are scanned for
    ``OLEObject`` elements of Type "Link" (EnhancedMetaFile, or Picture with
    an ``\\f 0`` field code), and ``.rels`` parts are checked for a
    relationship with the matching id whose TargetMode is "External".
    Both conditions together signal an external OLE link.

    Args:
        unzip_dir: Root of the already-extracted OOXML package.
        office_type: Document family; 'xl' (Excel) is excluded up front.

    Returns:
        bool: True when an externally-targeted OLE link is found.
    """
    # Precondition: Excel packages are not inspected.
    if office_type == 'xl':
        return False
    ret = False
    r_id = ""
    flag_ole_link = False
    flag_external = False
    for (root, _, files) in os.walk(unzip_dir):
        for filename in files:
            # dir search and find .xml
            _, ext = os.path.splitext(filename)
            file_path = os.path.join(root, filename)
            try:
                if ext == ".xml":  # e.g. document.xml
                    with open(file_path, 'r', encoding='utf-8',
                              errors='ignore') as f:
                        xml_txt = f.read()
                    # xml_parser.XmlParser is a project helper wrapping the
                    # OOXML-specific attribute extraction.
                    xp = xml_parser.XmlParser()
                    utf8_parser = etree.XMLParser(encoding='utf-8')
                    ooxml = etree.fromstring(xml_txt, parser=utf8_parser)
                    for elem in ooxml.iter():
                        # _name expands the 'o' namespace placeholder.
                        o_oleobject = elem.find(_name('{{{o}}}OLEObject'))
                        if o_oleobject is not None:
                            # If it has OLE object
                            xp.parse_o_oleobject(o_oleobject)
                            if xp.oleobject_attrib[
                                    'Type'] == "Link" and xp.oleobject_attrib[
                                        'child'][
                                            'o_LinkType'] == "EnhancedMetaFile":
                                # Linked EMF object: remember its r:id.
                                r_id = xp.oleobject_attrib['r_id']
                                flag_ole_link = True
                            elif xp.oleobject_attrib[
                                    'Type'] == "Link" and xp.oleobject_attrib[
                                        'child']['o_LinkType'] == "Picture":
                                # Linked picture: only with an '\f 0' field
                                # code does it count.
                                if r"\f 0" in xp.oleobject_attrib['child'][
                                        'o_FieldCodes']:
                                    r_id = xp.oleobject_attrib['r_id']
                                    flag_ole_link = True
                if ext == '.rels':  # e.g. document.xml.rels
                    # Cache parsed relationships per .rels filename.
                    if filename not in self.external_rels.keys():
                        with open(file_path, 'r', encoding='utf-8',
                                  errors='ignore') as f:
                            xml_txt = f.read().encode("utf-8")
                        xp = xml_parser.XmlParser()
                        xp.parse_relationship(xml_txt)
                        self.external_rels[filename] = xp.relationships
                    for relationship in self.external_rels[filename]:
                        if relationship['id'] == r_id and relationship[
                                'target_mode'] == "External":
                            flag_external = True
                if flag_ole_link and flag_external:
                    # Both signals present: external OLE link confirmed.
                    # NOTE(review): this break only leaves the inner file
                    # loop; the directory walk continues — confirm intent.
                    ret = True
                    break
            except etree.ParseError as parse_err:
                logging.warning(parse_err)
                logging.warning(
                    "Error path: {file_path}".format(file_path=file_path))
                ret = False
    return ret
def procesXmlFiles(createprinter=False):
    '''Process all pending printer-report XMLs queued in MailsToProcess.

    For every undone MailsToProcess row, parse its XML (ISO-8859-1) and
    build a PrinterReport per device entry, match it to an existing
    Printer by MAC address (then serial number), and persist both the
    report and the printer's last_report reference.

    NOTE: Python 2 code (print statements).  Relevant PrinterReport model
    fields for reference:

    ip_address = models.GenericIPAddressField(validators = [validate_ipv46_address])
    mac_address = models.CharField(max_length=200)
    model = models.CharField(max_length=200)
    serial_number = models.CharField(max_length=200)
    host_name = models.CharField(max_length=200)
    toner_level = models.CharField(max_length=200)
    pages_printed = models.IntegerField(blank = True)
    status = models.CharField(max_length=200)
    date = models.DateTimeField()
    '''
    parser = ET.XMLParser(encoding="ISO-8859-1")
    # Drop the (possibly stale) DB connection so Django reconnects.
    django.db.connection.close()
    xmls = MailsToProcess.objects.filter(done=False)
    for xml in xmls:
        try:
            tree = ET.parse(xml.xml_path, parser=parser)
            root = tree.getroot()
            for m in root:  #printers
                obj = PrinterReport()
                # Each attr is a (name, value) element pair describing one
                # device property.
                for attr in m:
                    if attr[0].text == 'DeviceIpAddress':
                        if attr[1].text:
                            print attr[1].text
                            obj.ip_address = attr[1].text
                    elif attr[0].text == 'DeviceMacAddress':
                        if attr[1].text.strip():
                            print attr[1].text
                            obj.mac_address = attr[1].text
                    elif attr[0].text == 'DeviceHostName':
                        if attr[1].text:
                            print attr[1].text
                            obj.host_name = attr[1].text
                    elif attr[0].text == 'DeviceModelName':
                        if attr[1].text:
                            print attr[1].text
                            obj.model = attr[1].text
                    elif attr[0].text == 'DeviceSerialNumber':
                        if attr[1].text.strip():
                            print attr[1].text
                            obj.serial_number = attr[1].text
                    elif attr[0].text == 'deviceAggregateTonerLevels':
                        if attr[1].text:
                            print attr[1].text
                            obj.toner_level = attr[1].text
                    elif attr[0].text == 'deviceAggregateStatus':
                        if attr[1].text:
                            print attr[1].text
                            obj.status = attr[1].text
                    elif attr[
                            0].text == 'totalUsagePagesPrinted':  #totalUsagePagesPrinted
                        if attr[1].text:
                            print attr[1].text
                            obj.pages_printed = attr[1].text
                # A report is valid only if the page counter is numeric.
                try:
                    int(obj.pages_printed)
                    obj.is_valid = True
                except ValueError:
                    obj.is_valid = False
                obj.date = datetime.now()
                # Match the report to a known Printer: by MAC when unique,
                # by MAC+serial when the MAC is duplicated, else by serial.
                try:
                    if obj.mac_address and obj.mac_address.strip() != '':
                        if Printer.objects.filter(
                                mac_address=obj.mac_address).count() == 1:
                            p = Printer.objects.get(
                                mac_address=obj.mac_address)
                        elif Printer.objects.filter(
                                mac_address=obj.mac_address
                        ).count(
                        ) > 1 and obj.serial_number and obj.serial_number.strip(
                        ) != '':
                            p = Printer.objects.get(
                                mac_address=obj.mac_address,
                                serial_number=obj.serial_number)
                        else:
                            p = None
                    else:
                        # NOTE(review): if the serial lookup below does not
                        # match, `p` stays unbound and the `if p:` check
                        # raises NameError, caught by the outer except —
                        # looks unintended; confirm.
                        if obj.serial_number and obj.serial_number.strip(
                        ) != '':
                            if Printer.objects.filter(
                                    serial_number=obj.serial_number).count(
                                    ) == 1:
                                p = Printer.objects.get(
                                    serial_number=obj.serial_number)
                except ObjectDoesNotExist:
                    p = None
                except Exception:
                    p = None
                if p:
                    # Persist the report and update the printer's pointer
                    # to its most recent report.
                    obj.printerOwner_id = p.id
                    obj.save()
                    xml.done = True
                    xml.save()
                    p.last_report_id = obj.id
                    p.save()
        except Exception:
            print 'XML Invalid: %s' % xml.xml_path
        # Disabled legacy branch kept for reference (dead string literal):
        '''elif createprinter:
            idPrinter = createPrinters(obj)
            if idPrinter:
                obj.printer_id = idPrinter
                obj.save()
                xml.done = True
                xml.save()'''
    return True
def load_pan_data(xmls_directory, truth_path, write_to_txt_files=False,
                  txts_destination_directory=None):
    """Load PAN data

    This function loads the PAN dataset and the truth, parses the XML and
    returns: Merged tweets of the authors, the truth, Author IDs, and the
    original length of the tweets. It also writes the tweets to TXT files
    (optional).

    Args:
        xmls_directory: The directory where the XML files of the dataset
            reside.
        truth_path: The path of the truth file.
        write_to_txt_files: (boolean) If True, the XML files will also be
            written as TXT files after being parsed.
        txts_destination_directory: The TXT files will be written to this
            directory.

    Returns:
        merged_tweets_of_authors: List. Each item is all of the tweets of
            an author, merged into one string. Refer to the list of
            replacements in the remarks.
        truths: List of truths for authors.
        author_ids: List of Author IDs.
        original_tweet_lengths: List of original tweet lengths.

    Raises:
        RuntimeError: If a non-XML file exists inside the *xmls_directory*

    Remarks:
        - Since *xml_filenames* is sorted in ascending order, all the
          returned lists will also be in the same order (sorted in
          ascending order of the Author IDs).
        - List of replacements:
            Line feed    -> <LineFeed>
            End of Tweet -> <EndOfTweet>
    """
    '''
    *os.listdir* returns a list containing the name of all files and
    folders in the given directory. Normally, the list is created in
    ascending order. However, the Python documentation states, “the list
    is in arbitrary order”.
    To ensure consistency and avoid errors in syncing the order of the
    items among different lists (e.g., *author_ids*, *truths*), we sort
    the list by calling *sorted*.
    *sorted()* returns a new sorted list (in ascending lexicographical
    order) of all the items in an iterable.
    '''
    xml_filenames = sorted(os.listdir(xmls_directory))

    # Store the Author IDs in a list
    # The Author IDs list will have the same order as the XML filenames list.
    author_ids = []  # Create an empty list
    for xml_filename in xml_filenames:
        # Strip the '.xml' extension to get the Author ID.
        author_ids.append(xml_filename[:-4])

    # Skip loading truth if path input is None. Else, load the truth from
    # the file.
    if truth_path is None:
        logger.info("*truth_path* is None => Skipped loading the truth")
        truths = None
        # This scenario will happen when loading the test dataset for
        # **TIRA** evaluation, where the truth of the test set is not
        # provided.
    else:
        truths = load_truth(truth_path, author_ids)

    if write_to_txt_files:
        logger.info("The parsed XMLs will also be written to TXT files.")
        # Create the directory if it does not exist.
        os.makedirs(txts_destination_directory, exist_ok=True)

    # Initialize the lists.
    # The lists will have the same order as the XML filenames list
    # (refer to: “Iterate over XML Files”)
    original_tweet_lengths = []  # Create an empty list
    # ↳ Every row will represent an author, every column will represent a
    # tweet.
    merged_tweets_of_authors = []  # Create an empty list
    # ↳ Each cell will contain all 100 tweets of an author, merged.

    # Iterate over XML files
    for author_index, xml_filename in enumerate(xml_filenames):
        # Make sure only XML files go through
        if not fnmatch.fnmatch(xml_filename, '*.xml'):
            logger.error(
                "Encountered a non-XML file inside the directory: %s >>> "
                "The program will now exit.", xml_filename)
            raise RuntimeError(
                'Encountered a non-XML file inside the directory: %s' %
                xml_filename)
            # ↳ This is printf-style String Formatting.

        # Read the XML file and parse it into a tree
        # Parser is explicitly defined to ensure UTF-8 encoding.
        tree = ElementTree.parse(
            os.path.join(xmls_directory, xml_filename),
            parser=ElementTree.XMLParser(encoding="utf-8"))
        root = tree.getroot()
        '''
        root is the root element of the parsed tree
        root[0], ..., root[m-1] are the children of root—elements one
        level below the root.
        root[0][0], ..., root[0][n-1] are the children of root[0].
        and so on.

        Each element has a tag, a dictionary of attributes, and sometimes
        some text:
            root[i][j].tag, ”.attrib, ”.text
        '''

        # Add an empty new row to the list. Each row represents an author.
        original_tweet_lengths.append([])

        # Initialize the list. Note that this list resets in every author
        # (XML file) loop.
        tweets_of_this_author = []  # Create an empty list

        # Iterate over the tweets within this parsed XML file:
        # Record the tweet length, replace line feeds, and append the
        # tweet to a list
        for child in root[0]:
            # Element.text accesses the element's text content,
            # which is saved with the following format in the XML files:
            # <![CDATA[some text]]>
            tweet = child.text
            original_tweet_lengths[author_index].append(len(tweet))
            # Replace line feed (LF = \n) with “ <LineFeed> ”
            # Note: There were no carriage return (CR = \r) characters in
            # any of the 3,000 XML files.
            tweet = tweet.replace('\n', " <LineFeed> ")
            # Create a list of the tweets of this author, to write to a
            # text file and merge, after the loop terminates.
            '''
            Google Python Style Guide: Avoid using the + and += operators
            to accumulate a string within a loop. Since strings are
            immutable, this creates unnecessary temporary objects and
            results in quadratic rather than linear running time.
            Avoid:
                merged_tweets_of_authors[author_index] += tweet + " <EndOfTweet> "
            Instead, append each substring to a list and ''.join the list
            after the loop terminates.
            '''
            tweets_of_this_author.append(tweet)

        # Write the tweets of this author to a TXT file
        # Note that in these tweets, the line feed characters are
        # replaced with a tag.
        if write_to_txt_files:
            # Create a TXT file with the Author ID as the filename (same
            # as the XML files) in the write mode
            with open(os.path.join(txts_destination_directory,
                                   author_ids[author_index] + ".txt"),
                      'w', encoding="utf-8") as txt_output_file:
                txt_output_file.write('\n'.join(tweets_of_this_author))
                # ↳ '\n'.join adds a newline character between every two
                # strings, so there won't be any extra line feeds on the
                # last line of the file.

        # Concatenate the tweets of this author, and append it to the
        # main list
        merged_tweets_of_this_author = " <EndOfTweet> ".join(
            tweets_of_this_author) + " <EndOfTweet>"
        # ↳ " <EndOfTweet> ".join adds the tag between every two strings,
        # so we need to add another tag to the end.
        merged_tweets_of_authors.append(merged_tweets_of_this_author)

    logger.info("@ %.2f seconds: Finished loading the dataset",
                time.process_time())

    return merged_tweets_of_authors, truths, author_ids, original_tweet_lengths
# -*- coding: utf-8 -*- import xml.etree.ElementTree as ET from pyrevit import script __context__ = 'zero-doc' utf8xml = script.get_bundle_file('utf8.xml') utf16xml = script.get_bundle_file('utf16.xml') for xmlfile in [utf8xml, utf16xml]: print('Testing: {}'.format(xmlfile)) c = ET.parse(xmlfile) print(c) xmlp = ET.XMLParser(encoding="utf-16") f = ET.parse(xmlfile, parser=xmlp) print(xmlp, f)
def open_file_xml(file_name):
    """Parse *file_name* as UTF-8 XML and return every <item> element under <channel>."""
    xml_parser = ET.XMLParser(encoding='utf-8')
    document = ET.parse(file_name, xml_parser)
    return document.getroot().findall('channel/item')
class Parser:
    """Parses an OSM-style XML file and extracts country data from its
    relations and ways, writing each country out as JSON (and GeoJSON for
    European countries) and notifying the hooks module."""

    def __init__(self, file_name):
        # BUG FIX: the accumulator dicts and the XMLParser are now created
        # per instance.  As class attributes, the dicts were shared by every
        # Parser instance, and a single expat-backed XMLParser cannot be
        # reused for a second parse.
        self.relations = {}
        self.ways = {}
        self.nodes = {}
        self.__tree = ElementTree.parse(
            file_name, ElementTree.XMLParser(encoding='utf-8'))
        self.__root = self.__tree.getroot()
        self.__load_to_memory()
        self.__process_tags()

    def __load_to_memory(self):
        """Index every relation/node/way element of the document by its id."""
        for element in self.__root.iter():
            if element.tag == 'relation':
                relation = elements.Relation(element)
                self.relations[relation.relation_id] = relation
            elif element.tag == 'node':
                node = elements.Node(element)
                self.nodes[node.node_id] = node
            elif element.tag == 'way':
                way = elements.Way(element)
                self.ways[way.way_id] = way

    def __extract_country(self, source, source_type):
        """Build a Country from a way or relation.

        Raises ValueError when the source does not represent a country or
        the source_type is unknown.
        """
        if source.is_representing_country():
            if source_type == 'way':
                return Country.extract_from_way(source=source,
                                                nodes=self.nodes)
            elif source_type == 'relation':
                return Country.extract_from_relation(source=source,
                                                     nodes=self.nodes,
                                                     ways=self.ways)
        raise ValueError(
            'Source {0} is not representing country!'.format(source))

    def __process_source(self, source, source_type):
        """Extract one country, persist it, and run the processing hook."""
        country = self.__extract_country(source, source_type)
        self.__write_country_to_file(country=country)
        hooks.process_country(country=country)

    def __write_country_to_file(self, country):
        """Write the country as JSON; European countries also get GeoJSON."""
        path = 'output/'
        if not os.path.exists(path):
            os.mkdir(path)
        # Context managers guarantee the handles are closed even if a write
        # fails (the original leaked them on error).
        with open(path + country.iso2.lower() + '.json', 'w') as output:
            output.write(country.to_json())
        # .get avoids a KeyError for countries without an 'is_in:continent'
        # tag; such countries simply get no GeoJSON file.
        if country.tags.get('is_in:continent') == 'Europe':
            geojsonpath = 'data/'
            if not os.path.exists(geojsonpath):
                os.mkdir(geojsonpath)
            with open(geojsonpath + country.iso2.lower() + '.geo.json',
                      'w') as output:
                output.write(country.to_geojson())

    def __process_tags(self):
        """Process every relation and way that represents a country."""
        for relation in self.relations.values():
            if relation.is_representing_country():
                self.__process_source(source=relation,
                                      source_type='relation')
        for way in self.ways.values():
            if way.is_representing_country():
                self.__process_source(source=way, source_type='way')
# from lxml import etree as ElementTree import xml.etree.ElementTree as ElementTree import htmlentitydefs import csv import operator import re from config import * # import gzip # parser = ElementTree.XMLParser(attribute_defaults=True, load_dtd=True) parser = ElementTree.XMLParser() # Match ordinary page numbers (as in 10-17). pageCounterNormal = re.compile('(\d+)-(\d+)') # Match page number in the form volume:page (as in 12:140-12:150). pageCounterColon = re.compile('[0-9]+:([1-9][0-9]*)-[0-9]+:([1-9][0-9]*)') def startpage(input): if (input is None): return 0 pageCounterMatcher1 = pageCounterNormal.match(input) pageCounterMatcher2 = pageCounterColon.match(input) start = 0 if (not (pageCounterMatcher1 is None)): start = int(pageCounterMatcher1.group(1)) else: if (not (pageCounterMatcher2 is None)): start = int(pageCounterMatcher2.group(1))
import xml.etree.ElementTree as ET
from collections import Counter

# Parse the news feed with an explicit UTF-8 parser.
parser = ET.XMLParser(encoding='UTF-8')
tree = ET.parse('newsafr.xml', parser)
root = tree.getroot()
news_list = root.findall('channel/item')

# ''.join avoids the quadratic cost of += string accumulation in a loop.
all_news = ''.join(news.find("description").text for news in news_list)
list_all_word = all_news.split(' ')

# Keep only the "long" words: strictly more than 6 characters.
new_list = [element for element in list_all_word if len(element) > 6]

# BUG FIX (performance): the original sorted(set(new_list),
# key=new_list.count) rescans the whole list for every distinct word
# (O(n^2)); Counter counts everything in a single pass and most_common
# returns the ten most frequent words directly.
common_words = [word for word, _ in Counter(new_list).most_common(10)]
print(common_words)
def _process(
        manifest_file_subpath: str,
        processed_manifests: Optional[Dict[str, ET.ElementTree]] = None
) -> Dict[str, ET.ElementTree]:
    """Process a manifest root file, recursing on ``<include>`` tags.

    ``processed_manifests`` is the accumulator mapping normalized manifest
    sub-paths to their processed XML documents; it is also the return value.

    Raises RepoSourceTreeManifestParsingError on parse failures or include
    loops, UnexpectedRepoSourceTreeStructure for malformed projects, and
    ProjectInRepoSourceTreeInUncleanState for dirty/invalid git projects.
    """
    debug(
        line("""Processing manifest file {!r}""").format(
            manifest_file_subpath))
    if not processed_manifests:
        processed_manifests = {}
    # Seeing the same (normalized) manifest twice means an include loop.
    if os.path.normpath(manifest_file_subpath) in processed_manifests:
        raise RepoSourceTreeManifestParsingError(
            line(
                """An already processed manifest file ({!r}) is again
                candidate to snapshot. Is there an include loop in the
                manifest file structure?""").format(manifest_file_subpath))
    try:
        # CommentedTreeBuilder keeps XML comments in the parsed document.
        parser = ET.XMLParser(target=CommentedTreeBuilder())
        xmldoc = ET.parse(os.path.join(internal_manifest_repo_path,
                                       manifest_file_subpath),
                          parser=parser)
    except Exception as exc:
        raise RepoSourceTreeManifestParsingError(
            line(
                """Failure when parsing {!r} manifest file. Exception
                raised was: {!r}""").format(manifest_file_subpath, exc))
    for proj in xmldoc.findall("./project"):
        try:
            proj_repo_subpath = proj.attrib["path"]
        except KeyError:
            raise UnexpectedRepoSourceTreeStructure(
                line(
                    """A project in manifest ({!r}) does not have a path."""
                ).format(manifest_file_subpath))
        try:
            # according to repo manifest documentation, groups can be
            # separated with commas or spaces
            proj_group_set = set(proj.attrib["groups"].replace(
                ",", " ").split())
        except KeyError:
            proj_group_set = set()
        # skip this project if the project group set does not intersect
        # with the groups to snapshot set (if provided) or if it does
        # intersect with the groups NOT to snapshot set (if provided as
        # well):
        if snapshot_groups and not set(snapshot_groups) & proj_group_set:
            debug(
                line(
                    """Skipping project {!r} because it does not belong to
                    any group candidate for snapshot provided as
                    argument.""").format(proj_repo_subpath))
            continue
        if (no_snapshot_groups
                and set(no_snapshot_groups) & proj_group_set):
            debug(
                line(
                    """Skipping project {!r} because it belongs to a repo
                    group which shall not be snapshotted.""").format(
                        proj_repo_subpath))
            continue
        try:
            proj_git_repo = git.Repo(
                os.path.join(root_path, proj_repo_subpath))
        # BUG FIX: was a bare ``except:`` which also swallowed
        # SystemExit/KeyboardInterrupt.
        except Exception:
            raise UnexpectedRepoSourceTreeStructure(
                line(
                    """Repo project {!r} is not a git repository anymore."""
                ).format(proj_repo_subpath))
        if proj_git_repo.is_dirty():
            raise ProjectInRepoSourceTreeInUncleanState(
                line("""Repo project {!r} is in a dirty state.""").format(
                    proj_repo_subpath))
        if not proj_git_repo.head.is_valid():
            raise ProjectInRepoSourceTreeInUncleanState(
                line("""Repo project {!r} has not a valid git HEAD.""").
                format(proj_repo_subpath))
        if use_branches:
            try:
                proj_revision = proj_git_repo.head.reference.name
            # BUG FIX: was a bare ``except:`` -- a detached HEAD raises here
            # and only that failure should trigger the hexsha fallback.
            except Exception:
                debug(
                    line("""Project {!r} cannot use git symbolic reference
                    because it is in a detached HEAD state.""").format(
                        proj_repo_subpath))
                proj_revision = proj_git_repo.head.commit.hexsha
        else:
            proj_revision = proj_git_repo.head.commit.hexsha
        # Check that the returned revision is effectively pointed by a Git
        # reference or any ancestor of a Git reference. This is required in
        # order to avoid using a Git revision that can be garbage-collected
        # by Git.
        # Important note: we do not check that the reference pointing on
        # the current HEAD is effectively valid on at least one Git remote.
        # We consider that it is up to the user to push that Git reference
        # to the suitable authoritative Git repository for future usage.
        for ref in proj_git_repo.references:
            if (proj_git_repo.head.commit == ref.commit
                    or proj_git_repo.head.commit in ref.commit.parents):
                break
        else:
            raise ProjectInRepoSourceTreeInUncleanState(
                line(
                    """No Git symbolic reference is pointing to the current
                    HEAD commit of the project {!r} or any of its eventual
                    sucessors.""").format(proj_repo_subpath))
        # change the revision attribute in the XML project tag:
        proj.attrib["revision"] = proj_revision
        debug("Project {!r} will be snapshotted to revision {!r}.".format(
            proj_repo_subpath, proj_revision))
    # manifest is processed: all the projects have been snapshotted to a
    # revision
    processed_manifests.update(
        {os.path.normpath(manifest_file_subpath): xmldoc})
    # iterate with others manifests included
    for manifest in xmldoc.findall("./include"):
        manifest_subpath = manifest.attrib["name"]
        processed_manifests = _process(
            manifest_file_subpath=manifest_subpath,
            processed_manifests=processed_manifests)
    return processed_manifests
def parseCmpLib(filePath):
    """Parse a component-library XML file into a componentLibrary table set.

    Builds the required-parameter/model/link tables, the folder (group)
    table, the component tables and the basic library info, then sanitizes
    every table and returns the populated componentLibrary instance.
    """

    def buildReqDataTable(elementTree, tagName, dataTable):
        # Each <tagName> element becomes one row: its id plus one column per
        # child element (column name = child tag, value = child text).
        for reqEle in elementTree.iter(tagName):
            refID = reqEle.get("id")
            row = {"refID": refID}
            for item in reqEle:
                row[item.tag] = item.text
            dataTable.insertRow(row)

    def buildGroupTable(elementTree, groupTable):
        # import component folders
        topSetting = ""
        # get the info of top group (it's not the top level folder!)
        for topGroupEle in elementTree.findall("./TopGroup"):
            topSetting = topGroupEle
            refID = topGroupEle.get("id")
            guid = getTextFromChildEle(topGroupEle, "GUID")
            itemNaming = getTextFromChildEle(topGroupEle,
                                             "ItemNamingScheme")
            groupTable.insertRow({
                "HRID": "Top Group#",
                "refID": refID,
                "GUID": guid,
                "Path": "",
                "ParentGroup": "",
                "ItemNamingScheme": itemNaming
            })
        # traverse the path, get all folders
        # NOTE(review): if the document has no <TopGroup>, topSetting stays a
        # string and .findall raises AttributeError -- confirm TopGroup is
        # guaranteed to exist.
        for tGroupEle in topSetting.findall(".//TGroup"):
            refID = tGroupEle.get("id")
            guid = getTextFromChildEle(tGroupEle, "GUID")
            hrid = getTextFromChildEle(tGroupEle, "HRID")
            path = getTextFromChildEle(tGroupEle, "Path")
            for parentGroupEle in tGroupEle.findall("./ParentGroup"):
                parentGroup = parentGroupEle.get("href").replace("#", "")
            groupTable.insertRow({
                "HRID": hrid,
                "refID": refID,
                "GUID": guid,
                "Path": path,
                "ParentGroup": parentGroup,
                "ItemNamingScheme": ""
            })

    def buildCmpTableSet(elementTree, cmpTable, paramMatchTable,
                         modelMatchTable):
        # cmpID is a synthetic running id linking a component row to its
        # parameter and model rows.
        cmpID = 0
        for cmpEle in elementTree.findall(".//TComponentDefinition"):
            guid = getTextFromChildEle(cmpEle, "GUID")
            hrid = getTextFromChildEle(cmpEle, "HRID")
            cmpType = getTextFromChildEle(cmpEle, "ComponentTypes")
            itemHRID = getTextFromChildEle(cmpEle, "ItemHRID")
            revGUID = getTextFromChildEle(cmpEle, "RevisionGUID")
            namingScheme = getTextFromChildEle(cmpEle, "ItemNamingScheme")
            for parentGroupEle in cmpEle.findall("./ParentGroup"):
                parentGroup = parentGroupEle.get("href").replace("#", "")
            cmpTable.insertRow({
                "HRID": hrid,
                "GUID": guid,
                "ParentGroup": parentGroup,
                "ComponentTypes": cmpType,
                "ItemHRID": itemHRID,
                "RevisionGUID": revGUID,
                "ItemNamingScheme": namingScheme,
                "refID": cmpID
            })
            # parameter links of this component
            for tParamEle in cmpEle.findall(".//TParameter"):
                paramValue = getTextFromChildEle(tParamEle, "Value")
                realValue = getTextFromChildEle(tParamEle, "RealValue")
                for reqParamEle in tParamEle.iter("RequiredParameter"):
                    reqParam = reqParamEle.get("href").replace("#", "")
                    paramMatchTable.insertRow({
                        "RequiredParameter": reqParam,
                        "Value": paramValue,
                        "RealValue": realValue,
                        "Component": cmpID
                    })
            # model choices of this component
            for modelEle in cmpEle.findall(".//TModelChoice"):
                for reqModelEle in modelEle.iter("RequiredModel"):
                    reqModel = reqModelEle.get("href").replace("#", "")
                for modelLinkEle in modelEle.iter("ModelLink"):
                    modelLink = modelLinkEle.get("href").replace("#", "")
                    modelMatchTable.insertRow({
                        "RequiredModel": reqModel,
                        "ModelLink": modelLink,
                        "Component": cmpID
                    })
            cmpID += 1

    tables = componentLibrary()
    parser = ET.XMLParser(encoding="utf-8")
    eleCmplib = ET.parse(filePath, parser=parser)
    # import all required models and parameters tables
    buildReqDataTable(eleCmplib, "TRequiredParameter",
                      tables.dataTables["RequiredParameters"])
    buildReqDataTable(eleCmplib, "TRequiredModel",
                      tables.dataTables["RequiredModels"])
    buildReqDataTable(eleCmplib, "TModelLink",
                      tables.dataTables["ModelLinks"])
    # import components data, model choices, parameter links
    buildGroupTable(eleCmplib, tables.dataTables["Group"])
    # buildCmpTableSet fills the tables in place and returns None; the
    # original's unused ``componentTables = ...`` assignment was dropped.
    buildCmpTableSet(eleCmplib, tables.dataTables["ComponentDefinitions"],
                     tables.dataTables["ParameterLinks"],
                     tables.dataTables["ModelChoices"])
    # import basic cmplib info
    lifeCycleGUID = getTextFromChildEle(eleCmplib, "LifeCycleDefinitionGUID")
    revNamingGUID = getTextFromChildEle(eleCmplib,
                                        "RevisionNamingSchemeGUID")
    vaultGUID = getTextFromChildEle(eleCmplib, "VaultGUID")
    vaultName = getTextFromChildEle(eleCmplib, "VaultName")
    tempVaultGUID = getTextFromChildEle(eleCmplib, "TemplateVaultGUID")
    tempRevGUID = getTextFromChildEle(eleCmplib, "TemplateRevisionGUID")
    tables.dataTables["BasicInfo"].insertRow({
        "LifeCycleDefinitionGUID": lifeCycleGUID,
        "RevisionNamingSchemeGUID": revNamingGUID,
        "VaultGUID": vaultGUID,
        "VaultName": vaultName,
        "TemplateVaultGUID": tempVaultGUID,
        "TemplateRevisionGUID": tempRevGUID
    })
    addComponentComment(tables.dataTables)
    for aTable in tables.dataTables:
        # BUG FIX: the original looked up ``removeBadChars`` without calling
        # it, which is a no-op -- confirm removeBadChars is a method, not a
        # property.
        tables.dataTables[aTable].removeBadChars()
    return tables
def __init__(self, file_folder, file_name):
    """Load the XML document at file_folder/file_name, keeping its tree and root."""
    self.file_name = file_name
    self.file_path = os.path.join(file_folder, file_name)
    xml_parser = Et.XMLParser(encoding=ENCODING)
    self.tree = Et.parse(self.file_path, parser=xml_parser)
    self.root = self.tree.getroot()
def get_data_from_xml(filename, code):
    """Parse *filename* using the character encoding *code* and return the tree."""
    # typical encodings here: 'iso8859_5', 'koi8_r'
    xml_parser = etree.XMLParser(encoding=code)
    return etree.parse(filename, parser=xml_parser)
import xml.etree.ElementTree as ET

parser = ET.XMLParser(encoding="utf-8")
tree = ET.parse("newsafr.xml", parser)
root = tree.getroot()
news_list = root.findall("channel/item")

# BUG FIX: ''.join avoids quadratic += accumulation, and the original second
# loop added the same fully-concatenated string to the set once per news item
# -- a set can only ever hold it once, so build the one-element set directly.
descript = ''.join(news.find("description").text for news in news_list)
full_set = {descript}


def count_word(full_set):
    """Count occurrences of words longer than 6 characters.

    *full_set* is an iterable of strings; they are joined with ', ' and then
    split on single spaces. Returns a dict mapping word -> count.
    """
    word_value = {}
    for elements in ', '.join(full_set).split(" "):
        if len(elements) > 6:
            # dict.get collapses the original if/else increment into one line.
            word_value[elements] = word_value.get(elements, 0) + 1
    return word_value


def sorted_w(word_value):
    """Return (word, count) pairs sorted by count, most frequent first."""
    return sorted(word_value.items(), key=lambda x: x[1], reverse=True)
def operation():
    """Rename manuscript files under the chosen directory, then rewrite their XML metadata.

    Reads the target directory, the new product name and the effective date
    from the Tk entry widgets (input_directory, newfile_input,
    input_effective_date), then:
      1. renames every file in each sub-directory so its name starts with
         the product name, keeping everything after the second '_', and
      2. re-parses each renamed XML file and updates version/caption/key
         attributes in place.

    NOTE(review): assumes exactly one level of sub-directories under the
    chosen directory and '_'-separated file names -- confirm against the
    real data layout.  Path join uses a literal backslash, so this is
    Windows-only as written.
    """
    Label(roottk, text="processing", font=('helvetica', 12, 'bold')).place(x=230, y=360)
    # Values typed by the user in the GUI.
    directory = input_directory.get()
    userinput = newfile_input.get()
    effective_date = input_effective_date.get()
    productname = userinput
    obj = os.scandir(directory)
    updatedname = []
    newname = ''
    samplename = []   # new file names, parallel to oldname/oldpath
    oldname = []      # full path of each original file
    oldpath = []      # directory containing each original file
    # Walk one level of sub-directories and compute the new name for every
    # file: "<userinput>_" + everything after the second '_' of the old name.
    for entry in obj:
        for x in os.scandir(entry.path):
            oldname.append(x.path)
            oldpath.append(entry.path)
            name = x.name.split('_')
            updatedname = name[2:]
            newname = userinput + '_'
            arraylength = len(updatedname)
            for z in range(0, arraylength):
                if z == arraylength - 1:
                    # last fragment: no trailing underscore
                    newname = newname + updatedname[z]
                else:
                    newname = newname + updatedname[z] + '_'
            samplename.append(newname)
            newname = ''
    # Sanity check: the three parallel lists must be non-empty.
    if len(oldname) and len(oldpath) and len(samplename):
        print('code is executing')
    else:
        print('code is going to fail')
    #print(samplename)
    #print(len(oldname))
    #print(len(oldpath))
    #print(len(samplename))
    # Perform the renames (Windows path separator).
    for x in range(0, len(samplename)):
        os.rename(oldname[x], oldpath[x] + '\\' + samplename[x])
    print('done')
    print('processing...')
    #setup some delay
    time.sleep(5)

    def splitjoin(sampleid, productname):
        # Replace the first two '_'-separated fragments of an id with the
        # product name.
        newmanuscript = sampleid.split('_')[2:]
        newmanuscript.insert(0, productname)
        newmanuscript = '_'.join(newmanuscript)
        return newmanuscript

    def splitjoinlob(productname):
        # Drop every underscore from the product name.
        productname = productname.split('_')
        productname = ''.join(productname)
        return productname

    def splitjoincaption(caption, productname):
        # Replace the first two words of the caption with the '_'-separated
        # fragments of the product name (whitespace-normalized).
        captionarr = []
        newcaption = caption.split(' ')
        for x in newcaption:
            if x != '':
                captionarr.append(x)
        updatedcaption = captionarr[2:]
        productname = productname.split('_')
        for y in range(0, len(productname)):
            updatedcaption.insert(y, productname[y])
        updatedcaption = ' '.join(updatedcaption)
        return updatedcaption

    # Second pass over the (renamed) files: update the XML attributes of the
    # first child of each document root, then write the file back in place.
    for entry in os.scandir(directory):
        for file in os.scandir(entry):
            parser = ET.XMLParser(encoding='utf-8')
            root = ET.parse(file.path, parser=parser)
            root.getroot()[0].attrib['versionDate'] = effective_date
            manuscriptid = root.getroot()[0].attrib['manuscriptID']
            # NOTE(review): versionid is read but never used afterwards.
            versionid = root.getroot()[0].attrib['versionID']
            caption = root.getroot()[0].attrib['caption']
            updatedmanuscriptid = splitjoin(manuscriptid, productname)
            updatedlob = splitjoinlob(productname)
            updatedcaption = splitjoincaption(caption, productname)
            # versionID is kept identical to the new manuscriptID.
            updatedversionid = updatedmanuscriptid
            #print(updatedmanuscriptid, updatedversionid)
            #root.getroot()[0].attrib['lob']=updatedlob
            root.getroot()[0].attrib['caption'] = updatedcaption
            root.getroot()[0].attrib['manuscriptID'] = updatedmanuscriptid
            root.getroot()[0].attrib['versionID'] = updatedversionid
            #print(updatedlob,updatedcaption)
            myroot = root.getroot()[0]
            # Update the named <keys> entries with the new lob and dates.
            for x in myroot:
                if x.tag == 'keys':
                    for y in x:
                        if y.attrib['name'] == 'lob':
                            y.attrib['value'] = updatedlob
                        if y.attrib['name'] == 'effectiveDateNew':
                            y.attrib['value'] = effective_date
                            #print(y.attrib['value'])
                        if y.attrib['name'] == 'effectiveDateRenewal':
                            y.attrib['value'] = effective_date
            root.write(file.path)
            #Label1=Label(roottk,text="processing...",font=('helvetica', 12, 'bold')).place(x=230,y=360)
            print('wait...')
    obj.close()
    print('done')
    Label(roottk, text="Executed", font=('helvetica', 12, 'bold')).place(x=230, y=390)
def compilation_textpair(traitement, racine):
    """Compile per-div TextPair XML files from resultat_final.xml.

    Reads <racine>/<traitement.chemin_projet>/resultat_final.xml, cross-links
    source and target sentences through their <xr> elements, writes one
    "<div id>.xml" file per div into dossier_textes/ and finally
    concatenates them into resultats_compiles.xml.  Rejected (zero-score)
    pairs are logged to phrases_rejetees_2.csv.
    """
    parser = ET.XMLParser(encoding='utf-8')
    # Context manager so the input handle is not leaked (the original never
    # closed it).
    with open(racine + "/" + traitement.chemin_projet +
              "/resultat_final.xml") as entree:
        fichier = entree.read()
    treeResults = ET.fromstring(fichier, parser=parser)
    fichier_non_pertinentes = open("phrases_rejetees_2.csv", "w")
    #XML_final_TextPair = "<body>"
    divs = treeResults.findall(".//div")
    for div in divs:
        print(div.attrib['id'])
        resultats_phrases = {}
        xr_phrases = {}
        XML_final_TextPair = "<div id=\"" + div.attrib['id'] + "\">"
        segs = div.findall(".//seg")
        for seg in segs:
            print(seg.attrib)
            #XML_final_TextPair += "<seg id=\"" + seg.attrib['id'] + "\" corresp=\"" + seg.attrib['corresp'] + "\">"
            XML_final_TextPair += "<seg id=\"" + seg.attrib['id'] + "\">"
            phrases = seg.findall(".//s")
            phrasesTarget = ""
            phrasesSource = ""
            for phrase in phrases:
                phrasesSource += phrase.attrib['id'] + ' '
                print(phrase.attrib['id'])
                print(ET.tostring(phrase))
                if phrase.attrib['id'] not in resultats_phrases:
                    resultats_phrases[phrase.attrib['id']] = []
                if phrase.attrib['id'] not in xr_phrases:
                    xr_phrases[phrase.attrib['id']] = phrase
                # every <w> that carries at least one <xr> cross-reference
                w_xr = phrase.findall(".//w[xr]")
                for w in w_xr:
                    xrs = w.findall(".//xr")
                    for xr in xrs:
                        # target sentence id = part before the first '_'
                        if xr.attrib["corresp"].split("_")[0] not in phrasesTarget:
                            phrasesTarget += xr.attrib["corresp"].split("_")[0] + ' '
                        if xr.attrib["corresp"].split("_")[0] not in resultats_phrases[phrase.attrib['id']]:
                            resultats_phrases[phrase.attrib['id']].append(
                                xr.attrib["corresp"].split("_")[0])
                        if phrase.attrib['id'] in xr_phrases:
                            phrase_a_ajouter = xr_phrases[phrase.attrib['id']]
                            w_source = phrase_a_ajouter.find(
                                ".//w[@id=\"" + w.attrib['id'] + "\"]")
                            xrs_source = w_source.findall(".//xr")
                            # only add the xr if it is not already present
                            indice = presence_xr(xr.attrib['corresp'], xrs_source)
                            if indice == 0:
                                w_source.insert(0, xr)
                                xr_phrases[phrase.attrib['id']] = phrase_a_ajouter
                                print(ET.tostring(xr_phrases[phrase.attrib['id']]))
            interGrp = seg.find(".//interpGrp")
            phrasesSource = phrasesSource[:-1]  # drop trailing space
            interpSource = ET.Element(
                "interp",
                attrib={"type": "phrasesSource", "corresp": phrasesSource})
            interpTarget = ET.Element(
                "interp",
                attrib={"type": "phrasesTarget", "corresp": phrasesTarget})
            interGrp.insert(0, interpTarget)
            interGrp.insert(0, interpSource)
            XML_final_TextPair += ET.tostring(interGrp, encoding='utf8').decode()
            XML_final_TextPair += "</seg>"
        for id_phrase, resultats in resultats_phrases.items():
            phrase = xr_phrases[id_phrase]
            identifiants_target = ''
            xrs = phrase.findall(".//xr")
            interpGrps = []
            for it in resultats:
                print(it)
                # cumulative certainty of every xr pointing at this target
                score = 0
                for xr in xrs:
                    if it in xr.attrib['corresp']:
                        score += float(xr.attrib['cert'])
                if score > 0:
                    identifiants_target += it + ' '
                    ratio_score = calcul_ratio_score(
                        score, phrase,
                        treeResults.find(".//s[@id=\"" + it + "\"]"))
                    interpGrp_s = ET.Element("interpGrp", attrib={"corresp": it})
                    interpScore = ET.Element("interp", attrib={"type": "score"})
                    interpScore.text = str(round(score, 2))
                    interpRatioScore = ET.Element(
                        "interp", attrib={"type": "ratioScore"})
                    interpRatioScore.text = str(ratio_score)
                    interpGrp_s.insert(0, interpRatioScore)
                    interpGrp_s.insert(0, interpScore)
                    interpGrps.append(interpGrp_s)
                else:
                    # BUG FIX: the original wrote ``..."***"++""`` -- the
                    # stray unary ``+`` applied to a str raises TypeError at
                    # runtime.  A newline-terminated record is the evident
                    # intent here -- TODO confirm the expected CSV format.
                    fichier_non_pertinentes.write(
                        id_phrase + "***" + it + "***" + "\n")
            identifiants_target = identifiants_target[:-1]
            phrase.set("corresp", identifiants_target)
            for ip in interpGrps:
                phrase.insert(0, ip)
            print(ET.tostring(phrase, encoding='utf8').decode())
            XML_final_TextPair += ET.tostring(phrase, encoding='utf8').decode()
        XML_final_TextPair += "</div>"
        # strip the XML declarations ET.tostring embedded mid-document
        XML_final_TextPair = re.sub(
            r"<\?xml version='1\.0' encoding='utf8'\?>", r"",
            XML_final_TextPair)
        #XML_final_TextPair = "<?xml version='1.0' encoding='utf8'?>" + XML_final_TextPair
        # print(XML_final)
        with open(racine + "/" + traitement.chemin_projet +
                  "/dossier_textes/" + div.attrib['id'] + ".xml",
                  "w") as fichier_resultats:
            fichier_resultats.write(XML_final_TextPair)
    # BUG FIX: the rejected-pairs file was never closed in the original.
    fichier_non_pertinentes.close()
    # NOTE(review): shell command built from raw paths -- spaces or shell
    # metacharacters in racine/chemin_projet will break or inject; consider
    # concatenating the files in Python or using subprocess with a list.
    os.system("cat " + racine + "/" + traitement.chemin_projet +
              "/dossier_textes/*.xml >" + racine + "/" +
              traitement.chemin_projet +
              "/dossier_textes/resultats_compiles.xml")
from xml.etree import ElementTree import os import re import csv file_name = "movies.xml" full_file = os.path.abspath(os.path.join('data', file_name)) try: parser = ElementTree.XMLParser(encoding="utf-8") dom = ElementTree.parse(full_file, parser=parser) tree = dom.getroot() except ElementTree.ParseError as Error: print("Error while parsing xml file {}".format(Error)) def parse_xml(tree): """parses the xml file reads a perticular set of data and store the data in dictionary and then append it to list. Also updates the xml files(creates new element under a perticular tag) """ try: movi_list = list() movie_csv = list() for child in tree.findall("genre"): if child.attrib['category'] == "Action": main_dict = dict() for movies in child.findall('./decade/'):
def _parse_test_results(self, result_filename, test_results=None, failing_test=None): """Handles result files with one or more test results. @param result_filename: log file to parse. @param test_results: Result parsed will be appended to it. @param failing_test: Tests considered failed will append to it. @return: dictionary of parsed test results. """ xml = '' xml_start = False xml_complete = False xml_bad = False result = 'ParseTestResultFail' if test_results is None: test_results = {} if not os.path.isfile(result_filename): logging.error('Did not find file %s', result_filename) return test_results with open(result_filename) as result_file: for line in result_file.readlines(): # If the test terminates early, the XML will be incomplete # and should not be parsed. if line.startswith('#terminateTestCaseResult'): result = line.strip().split()[1] xml_bad = True # Will only see #endTestCaseResult if the test does not # terminate early. elif line.startswith('#endTestCaseResult'): xml_complete = True elif xml_start: xml += line elif line.startswith('#beginTestCaseResult'): # If we see another begin before an end then something is # wrong. if xml_start: xml_bad = True else: xml_start = True test_case = line.split(' ')[1] if xml_complete or xml_bad: if xml_complete: myparser = et.XMLParser(encoding='ISO-8859-1') root = et.fromstring(xml, parser=myparser) test_case = root.attrib['CasePath'] result = root.find('Result').get('StatusCode').strip() xml_complete = False test_results[result] = test_results.get(result, 0) + 1 if (result.lower() not in self.TEST_RESULT_FILTER and failing_test != None): failing_test.append(test_case) xml_bad = False xml_start = False result = 'ParseTestResultFail' xml = '' return test_results
def _xml_to_obj(cls, serialized_str, encoding="iso-8859-2"): parser = ET.XMLParser(encoding=encoding) element = ET.fromstring(serialized_str, parser=parser) return cls._xml_ele_to_obj(cls._remove_xml_namespaces(element))