def parse_xml_plugins_list(jenkins_base_url, chunk_size, auth):
    """Stream the Jenkins plugin-manager XML and print one "name:version" per plugin.

    :param jenkins_base_url: base URL of the Jenkins instance
    :param chunk_size: download chunk size in KiB
    :param auth: requests-compatible auth object
    :return: 0 on success, 1 when the response is not parsable XML
    """
    chunk_size = chunk_size * 1024
    xml_pull_parser = ET.XMLPullParser()
    plugins = []
    # Fix: initialize so a malformed listing (a version tag appearing before
    # any shortName tag) cannot raise NameError below.
    plugin_line = ""
    with requests.get("{jenkins_base_url}/{path}".format(
            jenkins_base_url=jenkins_base_url,
            path=JENKINS_PLUGIN_MANAGER_PATH),
            auth=auth, stream=True) as jenkins_response:
        # parse the xml plugins list a chunk at a time, a very large set of
        # plugins may be installed
        for chunk in jenkins_response.iter_content(chunk_size):
            if chunk:
                xml_pull_parser.feed(chunk)
                try:
                    for event, element in xml_pull_parser.read_events():
                        # Rely on the fact that a plugin version is always
                        # encountered after its short name in <plugin> tag
                        # children.
                        if TAGS["SHORT_NAME"] == element.tag:
                            plugin_line = element.text
                        elif TAGS["VERSION"] == element.tag:
                            plugins.append(plugin_line + ":" + element.text)
                            plugin_line = ""
                except ET.ParseError as parse_err:
                    print(
                        "Jenkins response is not in parsable XML format, "
                        "check your access rights to the instance: {parse_err}"
                        .format(parse_err=parse_err))
                    return 1
    for plugin in sorted(plugins):
        print(plugin)
    return 0
def init_parser(self):
    """Reset all XML parsing state for a new connection.

    Must be called before each connection so no stale parser state leaks
    across sessions.
    """
    self.xml_root = None
    self.xml_depth = 0
    self.parser = ET.XMLPullParser(("start", "end"))
def parseBytes(bString):
    """Parse a raw CSTA message (8-byte header + XML body) into a CstaMessage.

    :param bString: raw bytes of the message
    :return: a CstaMessage built from the header, parsed tree and body
    :raises: re-raises decode and XML parse failures after printing context
    """
    # The XML declaration names the body's encoding, e.g. encoding="UTF-8".
    xml_encoding = re.search(b"encoding=[\'\"](\S*)[\'\"].* ?\?\>",
                             bString).group(1)
    encoding = xml_encoding.decode(encoding=ENCODING)
    header = bString[:8]
    try:
        body = bString[8:].strip().decode(encoding)
    except (UnicodeDecodeError, LookupError):
        # Fix: was a bare `except:` (would also trap KeyboardInterrupt etc.
        # before re-raising); narrowed to decoding/unknown-codec failures.
        print("Cannot decode CSTA message with ", encoding)
        print(bString)
        raise
    try:
        root = ET.fromstring(body)
    except ET.ParseError:
        # Fix: narrowed from a bare `except:` to the parser's error type.
        print("Cannot parse CSTA message string:")
        print(header)
        print(body)
        raise
    tree = ET.ElementTree(root)
    # Collect any namespace declarations through 'start-ns' pull events.
    XMLParser = ET.XMLPullParser(events=['start-ns'])
    XMLParser.feed(body)
    ns = [e[1] for e in XMLParser.read_events()]
    if ns:
        cstamessage = CstaMessage(header, tree, body, encoding=encoding, ns=ns)
    else:
        cstamessage = CstaMessage(header, tree, body, encoding=encoding)
    return cstamessage
def read(source_file_name):
    """Read an OSM-style XML file and build way/relation geometry.

    :param source_file_name: path to the XML file
    :return: (ways, relations) as dict value views
    """
    nodes = {}
    ways = {}
    relations = {}
    way = None
    relation = None
    element = None
    parser = ET.XMLPullParser(['start'])
    # Fix: context manager guarantees the file is closed even if feed() raises.
    with open(source_file_name, "r", encoding='utf8') as source_file:
        parser.feed(source_file.read())
    for event, elem in parser.read_events():
        if elem.tag == 'node':
            nodes[elem.get('id')] = [float(elem.get('lon')),
                                     float(elem.get('lat'))]
        elif elem.tag == 'way':
            element = way = ways[elem.get('id')] = Way()
        elif elem.tag == 'relation':
            element = relation = relations[elem.get('id')] = Relation()
        elif elem.tag == 'tag':
            # 'tag' children attach to the most recently seen way/relation.
            element.tags.append([elem.get('k'), elem.get('v')])
        elif elem.tag == 'nd':
            way.coords.append(nodes[elem.get('ref')])
        elif elem.tag == 'member':
            if elem.get('type') == 'way':
                coords = ways[elem.get('ref')].coords
                if elem.get('role') == 'outer':
                    relation.outer.append(coords)
                elif elem.get('role') == 'inner':
                    relation.inner.append(coords)
    return ways.values(), relations.values()
def _verify_all_tags_closed(xml_text: str) -> Optional[str]:
    """
    Verify that all the tags were properly closed in the XML given as text.

    Return error if any.
    """
    parser = ET.XMLPullParser(["start", "end"])
    parser.feed(xml_text.encode("utf-8"))
    open_tags = []  # type: List[ET.Element]
    try:
        for event, element in parser.read_events():
            if event == "start":
                open_tags.append(element)
            elif event == "end":
                if not open_tags:
                    return (f"Unexpected closing tag "
                            f"{rasaeco.et.to_str(element)} and no open tags")
                if open_tags[-1].tag != element.tag:
                    return (f"Unexpected closing tag "
                            f"{rasaeco.et.to_str(element)} as the last opened "
                            f"tag was: {rasaeco.et.to_str(open_tags[-1])}")
                open_tags.pop()
            else:
                raise AssertionError(f"Unhandled event: {event}")
    except ET.ParseError as exception:
        lineno, _ = exception.position
        line = xml_text.splitlines()[lineno - 1]
        if exception.msg.startswith("mismatched tag:"):
            return (
                f"{exception.msg}; the line was: {line!r}, "
                f"the open tag(s) up to that point: "
                f"{list(map(rasaeco.et.to_str, open_tags))}. "
                f"Did you maybe forget to close the tag "
                f"{rasaeco.et.to_str(open_tags[-1])}? "
                f"See also https://github.com/mristin/rasaeco#known-issues "
                f"in case you have missing or too many new lines.")
        return f"{exception.msg}; the line was: {line!r}"
    return None
def load_errortypes_xml(file):
    """Feed decoded lines from *file* into a pull parser until a blank line.

    :param file: binary file-like object read line by line
    :return: the XMLPullParser with the fed content (events not yet consumed)
    """
    parser = ET.XMLPullParser(['start', 'end'])
    while True:
        raw = file.readline().decode()
        # The blank line is still fed before we stop, matching reader state.
        parser.feed(raw)
        if raw.strip():
            continue
        return parser
def __init__(self):
    """Set up streaming gzip decompression plus an incremental XML parser."""
    # Package state must persist across `feed()` calls, since a
    # package element may straddle a chunk boundary.
    self._package = {}
    # MAX_WBITS + 16 selects the gzip wrapper for zlib.
    self.decompressor = zlib.decompressobj(wbits=zlib.MAX_WBITS + 16)
    self.xml_parser = ElementTree.XMLPullParser(['end'])
    # ElementTree mangles the tags thus: '{xml_namespace}tag_name'
    self.tag_re = re.compile(
        '({[^}]+}|)(location|size|checksum|package|time)$')
def valgrind_output_xml(pipe_r):
    ''' Extract valgrind output XML from a pipe file descriptor '''
    # closefd=False: the caller keeps ownership of the descriptor.
    with os.fdopen(pipe_r, 'r', closefd=False) as fout:
        parser = XmlElementTree.XMLPullParser()
        for line in fout:
            parser.feed(line)
            for _event, element in parser.read_events():
                if element.tag == 'valgrindoutput':
                    return element
def pullParse(data):
    """Parse *data* with a pull parser, printing each start/end event.

    Elements whose text is only whitespace/newlines are reported as NONE;
    elements with no text at all only get their event line printed.

    :param data: XML string to parse
    :return: None (output goes to stdout)
    """
    parser = ET.XMLPullParser(['start', 'end'])
    parser.feed(data)
    for event, elem in parser.read_events():
        print('Event: ', event)
        # Fix: identity comparison with None (`is not None`), not `!= None`.
        if elem.text is not None:
            text = elem.text.replace('\n', '').replace(' ', '')
            if text != '':
                print('Tag:', elem.tag, ", text:", elem.text)
            else:
                print('Tag:', elem.tag, ", text: NONE")
def clearParser(self):
    """Replace the XML parser with a fresh one and drain its event queue.

    :return: success for test purpose
    """
    self.parser = ETree.XMLPullParser(['start', 'end'])
    self.parser.feed('<root>')
    # Discard the queued events so callers start from a clean slate.
    for _ in self.parser.read_events():
        pass
    return True
def read_tin_xml_ET(fn):
    """
    Parse a TIN from a LandXML file.
    Returns two arrays:
    P: points [:,3] doubles
    F: faces [:,3] integers

    This version uses a proper XML parser, for robust but slow parsing.
    Only handles a single TIN per file
    """
    # Both arrays start with one sentinel row and are doubled on demand below.
    Ps = np.nan * np.zeros((1, 3), np.float64)
    Fs = -1 * np.ones((1, 3), np.int32)
    Fcount = 0
    parser = ET.XMLPullParser(['start', 'end'])
    tag_types = {}  # first occurrence of each tag, used to log tags once
    blksize = 10000
    # Upper bound on blocks, so utils.progress can show a progress bar.
    n_blks = 1 + int(os.stat(fn).st_size / blksize)
    with open(fn, 'rt') as fp:
        for _ in utils.progress(range(n_blks)):
            buff = fp.read(blksize)
            if len(buff) == 0:
                break
            parser.feed(buff)
            for event, elem in parser.read_events():
                # Log each tag type the first time it is seen.
                if elem.tag not in tag_types:
                    print(elem.tag, 'text=', elem.text)
                    tag_types[elem.tag] = elem
                # NOTE(review): start events may have incomplete text; only
                # elements with text are processed below.
                if elem.text is None:
                    continue
                if elem.tag == "{http://www.landxml.org/schema/LandXML-1.2}P":
                    # Point element: id attribute plus whitespace-separated coords.
                    pid = int(elem.attrib['id'])
                    P = [float(s) for s in elem.text.split()]
                    # Appears that these are written lat/long order, but I prefer
                    # to keep everything x/y
                    P[0], P[1] = P[1], P[0]
                    # Grow the point array geometrically until pid fits.
                    while len(Ps) < pid + 1:
                        Ps = np.concatenate([Ps, np.nan * Ps], axis=0)
                    Ps[pid] = P
                elif elem.tag == "{http://www.landxml.org/schema/LandXML-1.2}F":
                    # Face element: whitespace-separated point ids.
                    F = [int(s) for s in elem.text.split()]
                    fid = Fcount
                    # Grow the face array geometrically until fid fits.
                    while fid + 1 > len(Fs):
                        Fs = np.concatenate([Fs, -1 * Fs], axis=0)
                    Fs[fid] = F
                    Fcount += 1
    return Ps, Fs
def parse_send(line, parsers):
    """Send:<time>:<id>:<xml>

    :param line:
    :param parsers:
    """
    fields = line.split(":")
    timestamp = float(fields[1])
    identifier = fields[2][1:-1]
    payload = ":".join(fields[3:])
    # A new XML document (or an unseen id) gets a fresh parser.
    if identifier not in parsers or "<?xml" in payload:
        parsers[identifier] = ET.XMLPullParser(["start", "end"])
    parsed = parse_xml(payload, parsers[identifier], False)
    return identifier, [(timestamp, item) for item in parsed]
def getpages(bz2data):
    """Decompress a bz2 chunk of wiki XML and yield (page_id, text_stream)
    for pages in the main namespace (ns == 0).

    The yielded stream is only valid until the next iteration step.
    """
    xml = bz2.decompress(bz2data).decode("utf-8")
    parser = ET.XMLPullParser()
    # Wrap the page fragments in a synthetic root element.
    parser.feed("<pages>")
    parser.feed(xml)
    namespace = 0
    page_id = 0
    for _event, element in parser.read_events():
        tag = element.tag
        if tag == "ns":
            namespace = int(element.text)
            page_id = 0
        elif tag == "id" and page_id == 0:
            # Only the first <id> after <ns> is the page id.
            page_id = int(element.text)
        elif tag == "text" and namespace == 0:
            with io.StringIO(element.text) as stream:
                yield page_id, stream
def parse_recv(lines, parsers):
    """Recv:<time>:<id>:<xml>

    :param lines:
    :param parsers:
    """
    fields = lines[0].split(":")
    timestamp = float(fields[1])
    identifier = fields[2][1:-1]
    head = ":".join(fields[3:])
    # A new XML document (or an unseen id) gets a fresh parser; only the
    # first line is checked for the XML declaration.
    if identifier not in parsers or "<?xml" in head:
        parsers[identifier] = ET.XMLPullParser(["start", "end"])
    parsed = parse_xml(head + "".join(lines[1:]), parsers[identifier], True)
    return identifier, [(timestamp, item) for item in parsed]
def parse_book(book_path):
    """Extract (title, author, year) metadata from an FB2 e-book.

    Plain .fb2 files are parsed incrementally; .zip/.gz archives are read
    line by line up to </title-info> only.

    :param book_path: path to a .fb2, .zip or .gz book file
    :return: ((title, author, year), book_path)
    """
    if book_path.endswith('fb2'):
        for event, elem in ET.iterparse(book_path, events=('end', 'end-ns')):
            # Fix: compare against None explicitly. `if elem` used deprecated
            # Element truthiness, which is False for childless elements and
            # also had to guard against the None payload of 'end-ns' events.
            if elem is not None and elem.tag.endswith('title-info'):
                root = elem
                break
    else:
        if book_path.endswith('.zip'):
            book_file = zipfile.ZipFile(book_path)
            book_file = book_file.open(book_file.namelist()[0])
        elif book_path.endswith('.gz'):
            book_file = gzip.open(book_path, 'r')
        text = []
        # Fix: `while book_file.readable()` looped forever when the file
        # ended without a closing </title-info>; readable() stays True at
        # EOF. Stop on the empty read instead.
        while True:
            line = book_file.readline().decode('utf-8')
            if not line:
                break
            text.append(line)
            if '</title-info>' in line:
                break
        text = ''.join(text)
        parser = ET.XMLPullParser(['start'])
        parser.feed(text)
        for event, elem in parser.read_events():
            if elem is not None and elem.tag.endswith('title-info'):
                root = elem

    def find_tags(elem, title, author, year):
        # Recursively fill in the first matching title/author/date found.
        tag = elem.tag.lower()
        if not title and 'book-title' in tag:
            title = elem.text
            return title, author, year
        if not author and 'author' in tag:
            author = ' '.join(filter(lambda x: x, [elem[0].text, elem[1].text]))
            return title, author, year
        if not year and 'date' in tag:
            year = elem.attrib['value'] if 'value' in elem.attrib else elem.text
            return title, author, year
        for child in elem:
            if not all([title, author, year]):
                title, author, year = find_tags(child, title, author, year)
        return title, author, year

    title, author, year = find_tags(root, None, None, None)
    year = dateparser.parse(year).year if year else None
    return (title, author, year), book_path
def find_all_tags(fp, tags, progress_callback=None):
    """Stream-parse *fp* and yield (tag, element) for each completed element
    whose tag is in *tags*, clearing the root between matches to bound memory.

    :param fp: file-like object read in 1 MiB chunks
    :param tags: collection of tag names to report
    :param progress_callback: optional callable invoked with each chunk length
    """
    parser = ET.XMLPullParser(("start", "end"))
    root = None
    while True:
        chunk = fp.read(1024 * 1024)
        if not chunk:
            break
        parser.feed(chunk)
        for event, element in parser.read_events():
            if event == "start" and root is None:
                # Remember the document root so it can be pruned below.
                root = element
            elif event == "end" and element.tag in tags:
                yield element.tag, element
                # Drop already-processed subtrees to keep memory flat.
                root.clear()
        if progress_callback is not None:
            progress_callback(len(chunk))
def parse(self, spec: str) -> dict:
    """
    Parses the given spec in a non-blocking manner

    :param spec: path to the XML spec file
    :return: a dictionary of id -> operations
    """
    results = {}
    parser = ET.XMLPullParser(['end'])
    with open(spec, 'rb') as file:
        for chunk in self.read_spec(file):
            for data in chunk:
                parser.feed(data)
                for event, element in parser.read_events():
                    # Only known operation tags carrying an id are recorded.
                    if element.tag in self.operations and 'id' in element.attrib:
                        node_xml = ET.tostring(element, encoding='unicode')
                        results[element.attrib['id']] = self.process_node(node_xml)
    return results
def _get_bible_name(filename, hint_line=10):
    """Check whether it is a XML file with following xml tag and attributes.

    <XMLBIBLE biblename="King James 2000" type="x-bible">
    return biblename if it is a valid bible.
    """
    # Only the first hint_line lines are inspected; the root tag must
    # appear within them.
    with open(filename, "r", encoding="utf-8") as f:
        head = "".join(f.readline() for _ in range(hint_line))
    parser = ET.XMLPullParser(["start", "end"])
    parser.feed(head)
    for event, elem in parser.read_events():
        if event == "start" and elem.tag == "XMLBIBLE" and "biblename" in elem.attrib:
            return elem.attrib["biblename"]
    return None
def xml_parse_repodata(repodata_path, element_tag, repodata_type):
    """Parse a (possibly compressed) repodata XML file into a Metadata object.

    :param repodata_path: path to the repodata file
    :param element_tag: fully-qualified tag of the per-item elements
    :param repodata_type: one of "primary", "filelists", "other", "repomd"
    :return: Metadata populated with one parsed entry per matching element
    :raises ValueError: if repodata_type is not a recognized value
    """
    file_extension = os.path.splitext(repodata_path)[1]
    iterator = decompression_iter(repodata_path, file_extension[1:])
    if repodata_type == "primary":
        parse_pkg_elem = parse_primary_pkg_elem
    elif repodata_type == "filelists":
        parse_pkg_elem = parse_filelists_pkg_elem
    elif repodata_type == "other":
        parse_pkg_elem = parse_other_pkg_elem
    elif repodata_type == "repomd":
        parse_pkg_elem = parse_repomd_item_elem
    else:
        # Fix: an unknown type previously left parse_pkg_elem unbound and
        # produced a confusing NameError on the first matching element.
        raise ValueError(f"unknown repodata_type: {repodata_type!r}")
    parser = ET.XMLPullParser(['end'])
    metadata_obj = Metadata(repodata_path)
    for xml_data in iterator:
        parser.feed(xml_data)
        for event, element in parser.read_events():
            if event == "end" and element.tag == element_tag:
                pp = parse_pkg_elem(element)
                metadata_obj.append(pp.checksum, pp)
    return metadata_obj
def depth(elem, level):
    """Track the deepest element nesting of *elem* in the global maxdepth.

    :param elem: element whose serialized subtree is re-parsed for depth
    :param level: starting depth; maxdepth is reset to this value first
    """
    global maxdepth
    maxdepth = level
    parser = etree.XMLPullParser(['start', 'end', 'start-ns', 'end-ns'])
    parser.feed(etree.tostring(elem))
    current = level
    for event, node in parser.read_events():
        if event == 'start':
            current += 1
            # Namespace events fall through both branches untouched.
            maxdepth = max(maxdepth, current)
        elif event == 'end':
            current -= 1
    return
def sent2iob(sent, format="c", tag_list=None, unk_expand=False, bert=False):
    """Convert an inline-XML-tagged sentence into (token, IOB-label) pairs.

    :param sent: sentence with entity spans marked as XML tags
    :param format: "c" for character tokens, "w" for MeCab word tokens,
        otherwise a callable that tokenizes the plain string itself
    :param tag_list: if given, only these tag names count as entity labels
    :param unk_expand: with a callable `format`, expects it to return
        (tokens, lengths); also strips spaces from the input
    :param bert: strips spaces from the input (same effect as unk_expand)
    :return: list of (token, label) pairs with B-/I-/O prefixes
    """
    if unk_expand or bert:
        sent = sent.replace(' ', '')
    # Wrap in a synthetic root so the fragment is well-formed XML.
    text = '<sent>' + sent + '</sent>'
    parser = ET.XMLPullParser(['start', 'end'])
    parser.feed(text)
    ne_type = "O"
    ne_prefix = ""
    res = ""  # reconstructed plain text without the tags
    label = []  # one label per character of `res`
    tag_set = set()  # currently open entity tags (must never exceed one)
    print(sent)
    for event, elem in parser.read_events():
        isuse = tag_list is None or (tag_list is not None and elem.tag in tag_list)
        if event == "start":
            # Runtime message kept verbatim: "tags are nested" (Japanese).
            assert len(tag_set) < 2, "タグが入れ子になっています\n{}".format(sent)
            word = elem.text if elem.text is not None else ""
            res += word
            if elem.tag != "sent" and isuse:
                tag_set.add(elem.tag)
                # Text directly inside an entity tag gets that tag as label.
                label += [elem.tag] * len(word)
            else:
                label += ["O"] * len(word)
        if event == "end":
            if elem.tag != "sent" and isuse:
                tag_set.remove(elem.tag)
            # Tail text after a closing tag is always outside the entity.
            word = elem.tail if elem.tail is not None else ""
            res += word
            label += ["O"] * len(word)
    if format == "c":
        # Character tokens: every token has length 1.
        res = list(res)
        nums = [len(r) for r in res]
    elif format == "w":
        # Word tokens via MeCab; drop the trailing newline and empty token.
        mecab = MeCab.Tagger('-Owakati')
        res = mecab.parse(res)[:-1].split(' ')[:-1]
        nums = [len(r) for r in res]
    else:
        # `format` is a callable tokenizer here (parameter shadows the builtin).
        if unk_expand:
            res, nums = format(res)
        else:
            res = format(res)
            nums = [1 for r in res]
    cnt = 0  # cursor into the per-character label list
    output = []
    prev = "O"
    post = ""
    for token, n in zip(res, nums):
        if len(label) <= cnt:
            # Tokens past the labelled span default to O.
            output.append((token, "O"))
            break
        # Runtime message kept verbatim: "token and labels disagree" (Japanese).
        assert len(set(
            label[cnt:cnt + n])) == 1, "形態素とラベルが食い違っています\n{2}\n{0} : {1}".format(
                token, label[cnt:cnt + len(token)], res)
        pre_token = ""
        # B- starts a new entity span, I- continues the previous one.
        if label[cnt] != "O" and (prev == "O" or prev != label[cnt]):
            pre_token = "B-"
        elif label[cnt] != "O" and prev == label[cnt]:
            pre_token = "I-"
        prev = label[cnt]
        output.append((token, pre_token + label[cnt]))
        cnt += n
    return output
# NOTE(review): `actor`, `root` and `xml` are defined earlier in this script,
# outside the visible excerpt.
# Fully-qualified ("Clark") notation for namespaced lookups.
name = actor.find('{http://people.example.com}name')
print(name.text)
for char in actor.findall('{http://characters.example.com}character'):
    print(' |-->', char.text)
# Prefix shorthand: map local prefixes to namespace URIs for find/findall.
ns = {'real_person': 'http://people.example.com',
      'role': 'http://characters.example.com'}
for actor in root.findall('real_person:actor', ns):
    name = actor.find('real_person:name', ns)
    print(name.text)
    for char in actor.findall('role:character', ns):
        print(' |-->', char.text)
print("-------------------- XMLPullParser")
# Incremental parsing: events become available as data is fed.
parser = ET.XMLPullParser(['start', 'end'])
parser.feed('<mytag>sometext')
print(list(parser.read_events()))
parser.feed(' more text</mytag>')
for event, elem in parser.read_events():
    print(event)
    print(elem.tag, 'text=', elem.text)
print("----------- XPath")
import xml.etree.ElementTree as ET
root = ET.fromstring(xml)
# Top-level elements
print(root.findall("."))
def collect_data(FILE='map.xml', csv_output=True, max_size=1000000):
    '''
    This function collects all required data out of an OpenStreetMap-like XML-file **FILE**. (correct path to it)
    and returns the following:
    - **bounds** dictionary, where keys are 'maxlat', 'maxlon', 'minlat' and 'minlon'
    - **cameras** dictionary in form {..., node_id: [node_lat, node_lon], ... } if node_id refers to a camera
    - **street_nodes** in form {..., node_id: [node_lat, node_lon], ... }} if node_id refers to a street defining node
    - **streets** dictionary in form {..., street_id: [type, is_oneway, postal_code, name, node1, node2, ..], ... }
    - **postal_areas** dictionary in form {..., postal_code: [way1, way2, way3, ..], ... }
    - **area_lats** dictionary in form {..., postal_code: [node1_lat, node2_lat, ..], ... }
    - **area_lons** dictionary in form {..., postal_code: [node1_lon, node2_lon, ..], ... }
    It parses the XML-code incrementally in character bunches of length **max_size**.
    If csv output is wanted, it creates a new directory csv_%time with including files bounds.csv, cameras.csv,
    street_nodes.csv, streets.csv, areas.csv, area_lats.csv, area_lons.csv.
    '''
    # init return Data-Types
    bounds = {}
    cameras = {}
    street_nodes = {}
    streets = {}
    postal_areas = {}
    area_lats = {}
    area_lons = {}
    # init other Data containers
    nodes = {
    }  # collect all elements with tag 'node' here, dict of form {..., node_id: [node_lat, node_lon], ...}
    ways = {
    }  # collect all elements with tag 'way' here, dict of form {..., way_id: [node1, node2, ..], ...}
    street_nodes_set = set(
    )  # collect all node_ids that define any street in a set
    area_ways_set = set(
    )  # collect all way_ids that define any postal area border in a set
    # get current time
    time_now = time.strftime('%d.%m.%Y_%H.%M.%S')
    # init parser
    parser = et.XMLPullParser(['start', 'end'])
    try:
        if csv_output:
            # set paths of the output files
            head, tail = os.path.split(FILE)
            out_dir = os.path.join(head, 'csv_%s' % time_now)
            print('outdir:\t %s' % out_dir)
            if not os.path.isdir(out_dir):
                os.mkdir(out_dir)
            csv_files = map(lambda x: os.path.join(out_dir, x), [
                'bounds.csv', 'cameras.csv', 'street_nodes.csv', 'streets.csv',
                'areas.csv', 'area_nodes.csv', 'area_lats.csv', 'area_lons.csv'
            ])
        else:
            # NOTE(review): csv_files is None here, but the loop below
            # iterates it unconditionally — TypeError when csv_output=False.
            csv_files = None
        with open(FILE, 'r') as read_file:
            for csv_file in csv_files:
                # open files to write
                head, tail = os.path.split(csv_file)
                if tail == 'bounds.csv':
                    bound_file = open(csv_file, 'w')
                if tail == 'cameras.csv':
                    camera_file = open(csv_file, 'w')
                if tail == 'street_nodes.csv':
                    street_node_file = open(csv_file, 'w')
                if tail == 'streets.csv':
                    street_file = open(csv_file, 'w')
                if tail == 'areas.csv':
                    area_file = open(csv_file, 'w')
                if tail == 'area_nodes.csv':
                    area_node_file = open(csv_file, 'w')
                if tail == 'area_lats.csv':
                    area_lat_file = open(csv_file, 'w')
                if tail == 'area_lons.csv':
                    area_lon_file = open(csv_file, 'w')
            if csv_output:
                # init csv files to write
                bounds_csv = csv.writer(bound_file, delimiter=',')
                camera_csv = csv.writer(camera_file, delimiter=',')
                street_node_csv = csv.writer(street_node_file, delimiter=',')
                street_csv = csv.writer(street_file, delimiter=',')
                area_csv = csv.writer(area_file, delimiter=',')
                area_node_csv = csv.writer(area_node_file, delimiter=',')
                area_lat_csv = csv.writer(area_lat_file, delimiter=',')
                area_lon_csv = csv.writer(area_lon_file, delimiter=',')
            # init parsing variables
            root = None
            relevancy_level = 0
            elements_to_delete = []
            case_dict = None
            # Start with parsing...
            rep = 0
            while True:
                test_func = None
                rep += 1
                line = read_file.read(max_size)
                if not line:
                    break
                # feed the parser
                parser.feed(line)
                # iterate through all parsed elements
                for event, elem in parser.read_events():
                    if root is None:
                        root = elem
                    # get bounds of the given map excerpt
                    if elem.tag == 'bounds':
                        if event == "end":
                            output = []
                            for key in [
                                    'minlat', 'minlon', 'maxlat', 'maxlon'
                            ]:
                                bounds[key] = elem.attrib[key]
                                test_func = None
                                output.append(elem.attrib[key])
                            if csv_output:
                                bounds_csv.writerow(list(output))
                                bound_file.close()
                    # process all node tags with _get_camera
                    if elem.tag == 'node':
                        test_func = _get_camera
                        case_dict = cameras
                    # process all way tags with _get_street
                    if elem.tag == 'way':
                        test_func = _get_street
                        case_dict = streets
                    # process all relation tags with _get_relation
                    if elem.tag == 'relation':
                        test_func = _get_relation
                        case_dict = postal_areas
                    if not test_func:
                        continue
                    if event == "start":
                        if test_func(elem):
                            # in case of an opening tag:
                            # increase level if elem consists of required data (otherwise/ on level 0, elem would deleted before having read all its children)
                            relevancy_level += 1
                    if event == "end":
                        # in case of an ending tag:
                        # have processed complete element, so add it to list of elements to delete
                        elements_to_delete.append(elem)
                        if test_func(elem):
                            # decrease level if elem consists of required data (finally only on level 0, i. e. when all its child tags are read, it will be freed to delete)
                            relevancy_level -= 1
                            # get output of test_func
                            output = test_func(elem)
                            # in case of a node tag: get node coords and if it refers to a camera add it to the cameras dict
                            if case_dict == cameras:
                                is_cam = output[0]
                                id = output[1]
                                coordinates = output[2:]
                                nodes[id] = coordinates
                                if is_cam:
                                    assert (id not in cameras)
                                    cameras[id] = coordinates
                                    if csv_output:
                                        camera_csv.writerow(list(output[1:]))
                            # in case of a way tag:
                            elif case_dict == streets:
                                is_highway = output[0]
                                id = output[1]
                                info = output[2:6]
                                way_nodes = output[6:]
                                assert (id not in streets)
                                # only if way refers to a street add it to the streets dict
                                if is_highway:
                                    streets[id] = info + way_nodes
                                    # collect all nodes which are defining a street in the street_nodes set
                                    street_nodes_set.update(set(way_nodes))
                                    if csv_output:
                                        street_csv.writerow(list(output[1:]))
                                # otherwise save only its nodes in ways dict
                                ways[id] = way_nodes
                            # in case of a relation tag: get the bounding ways (ids) of an postal area and save it to the postal_areas dict
                            elif case_dict == postal_areas:
                                postal_code = output[0]
                                postal_ways = output[1:]
                                assert (postal_code not in postal_areas)
                                postal_areas[postal_code] = postal_ways
                                # collect all ways which are defining a postal area in the area_ways set
                                area_ways_set.update(set(postal_ways))
                                if csv_output:
                                    area_csv.writerow(list(output))
                        # delete elements only when we parsed them completely (including all its children)
                        if relevancy_level == 0:
                            for elem in elements_to_delete:
                                elem.clear()
                                if elem is not root:
                                    root.clear()
                            elements_to_delete.clear()
            # save only coordinates of those nodes that are part of a street (and optional save it to file)
            street_nodes = {
                node_id: nodes[node_id]
                for node_id in street_nodes_set
            }
            street_nodes_set.clear()
            if csv_output:
                for (node_id, coords) in street_nodes.items():
                    street_node_csv.writerow([node_id] + list(coords))
            # save only coordinates of those nodes that are defining a postal area boundary (and optional save it to file)
            area_ways = {
                way_id: ways[way_id]
                for way_id in area_ways_set if way_id in ways
            }
            area_ways_set.clear()
            # get all nodes that define some postal area in a set
            area_nodes_set = set()
            for node_list in area_ways.values():
                area_nodes_set.update(node_list)
            area_ways.clear()
            # collect coordinates to all area nodes in dict
            # NOTE(review): area_nodes is referenced in the return statement;
            # if this point is never reached it would be unbound — verify.
            area_nodes = {
                node_id: nodes[node_id]
                for node_id in area_nodes_set if node_id in nodes
            }
            area_nodes_set.clear()
            if csv_output:
                for (node_id, coords) in area_nodes.items():
                    area_node_csv.writerow([node_id] + list(coords))
            # for each postal_code area save all its defining node coords in two dicts (separately for all its lat resp. lon coordinates)
            # form will be e. g. area_lats = { ..., postal_code1: [node1_lat, node2_lat, ..], ...}
            for postal_code in postal_areas.keys():
                print('\nNew Postal code: %s' % postal_code)
                postal_node_lats = []
                postal_node_lons = []
                for way_id in postal_areas[postal_code]:
                    if way_id in ways:
                        way_lats = []
                        way_lons = []
                        for node_id in ways[way_id]:
                            if node_id in nodes:
                                node_lat, node_lon = nodes[node_id]
                                way_lats.append(np.float_(node_lat))
                                way_lons.append(np.float_(node_lon))
                            else:
                                print('\t\tNode %s NOT in ways[%s].' %
                                      (node_id, way_id))
                        postal_node_lats = list(postal_node_lats) + list(way_lats)
                        postal_node_lons = list(postal_node_lons) + list(way_lons)
                area_lats[postal_code] = postal_node_lats
                area_lons[postal_code] = postal_node_lons
                if csv_output:
                    area_lat_csv.writerow([postal_code] +
                                          list(area_lats[postal_code]))
                    area_lon_csv.writerow([postal_code] +
                                          list(area_lons[postal_code]))
        # close all opened files
        if csv_output:
            camera_file.close()
            street_node_file.close()
            street_file.close()
            area_file.close()
            area_node_file.close()
            area_lat_file.close()
            area_lon_file.close()
        # return required data
        return bounds, cameras, street_nodes, streets, postal_areas, area_nodes, area_lats, area_lons
    except MemoryError:
        print('Out of Memory.')
def reset_pullParser():
    """Discard any existing global pull parser and install a fresh one."""
    global pullParser, first
    pullParser = ET.XMLPullParser(["start", "end"])
    first = True
def extract_glade(fileobj, keywords, comment_tags, options):
    """Extracts translatable strings from Glade files or GtkBuilder UI XML.

    :param fileobj: the file-like object to extract from, iterable by lines
    :param keywords: a list of translation keywords to extract, with the
        same names and meanings as C/Python i18n function names.
    :param comment_tags: a list of translator tags to search for and
        include in the results. This is ignored.
    :param options: a dictionary of additional options (optional)
    :return: An iterator over ``(lineno, funcname, message, comments)``
        tuples whose interpretation depends on ``funcname``.
    :rtype: iterator

    Properties must be marked translatable="yes". Context and comments
    attributes are respected. "gettext"/"pgettext" (or the "_"/"C_"
    shorthand aliases) must be listed in ``keywords`` for contextless or
    context-bearing strings to be yielded.

    See also:
    * babel.messages.extract.extract()
    * http://babel.pocoo.org/en/latest/messages.html#writing-extraction-methods
    * https://www.gnu.org/software/gettext/manual/html_node/PO-Files.html
    """
    parser = etree.XMLPullParser(["end"])
    want_pgettext = ("pgettext" in keywords) or ("C_" in keywords)
    want_gettext = "gettext" in keywords
    truthy_values = [s.casefold() for s in ["yes", "true", "1", "y", "t"]]
    for line_idx, line_data in enumerate(fileobj):
        parser.feed(line_data)
        for event, elem in parser.read_events():
            assert event == "end"
            # Only properties explicitly marked translatable are extracted.
            if elem.attrib.get("translatable", "no").casefold() not in truthy_values:
                continue
            comments = []
            if "comments" in elem.attrib:
                comments.append(elem.attrib["comments"])
            # The function name tells Babel how to interpret `messages`:
            # pgettext gets [context, text]; gettext gets the bare text.
            if want_pgettext and "context" in elem.attrib:
                yield (line_idx + 1, "pgettext",
                       [elem.attrib["context"], elem.text], comments)
            elif want_gettext and "context" not in elem.attrib:
                yield (line_idx + 1, "gettext", elem.text, comments)
def __init__(self, *, target: BuilderBase = None):
    """Initialize, forwarding *target* to the base class and creating a parser.

    :param target: builder forwarded to the superclass (keyword-only;
        presumably the tree builder receiving parse events — TODO confirm)
    """
    super().__init__(target=target)
    # Subscribe to both start and end events.
    self.parser = ElementTree.XMLPullParser(["start", "end"])