def fulltext_search(self, query, rows=None, start=None):
    """Does an advanced search on fulltext:blah.

    You get back a pair (x, y) where x is the total # of hits
    and y is a list of identifiers like ["foo", "bar", etc.]
    """
    query = self._prefix_query('fulltext', query)
    result_list = self.raw_search(query, rows=rows, start=start)
    e = ElementTree()
    try:
        e.parse(StringIO(result_list))
    except SyntaxError as e:
        raise SolrError(e)

    total_nbr_text = e.find('info/range_info/total_nbr').text
    # total_nbr_text = e.find('result').get('numFound')  # for raw xml
    total_nbr = int(total_nbr_text) if total_nbr_text else 0

    out = []
    for r in e.getiterator('hit'):
        for d in r.find('metadata'):
            for x in list(d.getiterator()):
                if x.tag == "identifier":
                    xid = six.text_type(x.text).encode('utf-8')
                    if xid.startswith('OCA/'):
                        xid = xid[4:]
                    elif xid.endswith('.txt'):
                        xid = xid.split('/')[-1].split('_')[0]
                    elif xid.endswith('_ZZ'):
                        xid = xid[:-3]
                    out.append(xid)
                    break
    return (total_nbr, out)
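Note that getiterator(), used throughout these snippets, was deprecated for years and removed in Python 3.9; iter() is the drop-in replacement. Below is a minimal, self-contained sketch of the same hit/identifier walk, run against an invented sample response rather than real Solr output:

from io import StringIO
from xml.etree.ElementTree import ElementTree

# Invented sample response; element names mirror the snippet above.
sample = """<response>
  <info><range_info><total_nbr>2</total_nbr></range_info></info>
  <hit><metadata><record><identifier>OCA/foo</identifier></record></metadata></hit>
  <hit><metadata><record><identifier>bar_ZZ</identifier></record></metadata></hit>
</response>"""

tree = ElementTree()
tree.parse(StringIO(sample))
total = int(tree.find('info/range_info/total_nbr').text)

ids = []
for hit in tree.iter('hit'):              # iter() replaces the removed getiterator()
    for record in hit.find('metadata'):
        for elem in record.iter():
            if elem.tag == 'identifier':
                ids.append(elem.text)
                break

print(total, ids)  # 2 ['OCA/foo', 'bar_ZZ']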
def add_from_file(self, filename):
    '''parses xml file and stores wanted details'''
    Gtk.Builder.add_from_file(self, filename)

    # extract data for the extra interfaces
    tree = ElementTree()
    tree.parse(filename)

    ele_widgets = tree.getiterator("object")
    for ele_widget in ele_widgets:
        name = ele_widget.attrib['id']
        widget = self.get_object(name)

        # populate indexes - a dictionary of widgets
        self.widgets[name] = widget
        # populate a reversed dictionary
        self._reverse_widget_dict[widget] = name

        # populate connections list
        ele_signals = ele_widget.findall("signal")
        connections = [(name,
                        ele_signal.attrib['name'],
                        ele_signal.attrib['handler'])
                       for ele_signal in ele_signals]
        if connections:
            self.connections.extend(connections)

    ele_signals = tree.getiterator("signal")
    for ele_signal in ele_signals:
        self.glade_handler_dict.update(
            {ele_signal.attrib["handler"]: None})
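The GtkBuilder .ui structure this method walks can be inspected without GTK at all. A small sketch over an invented in-memory document, showing the id and signal data the loop above collects (iter() standing in for the removed getiterator()):

from io import StringIO
from xml.etree.ElementTree import ElementTree

# Invented .ui fragment in the standard GtkBuilder layout.
ui = """<interface>
  <object class="GtkWindow" id="main_window">
    <signal name="destroy" handler="on_quit"/>
    <child>
      <object class="GtkButton" id="ok_button">
        <signal name="clicked" handler="on_ok_clicked"/>
      </object>
    </child>
  </object>
</interface>"""

tree = ElementTree()
tree.parse(StringIO(ui))
for obj in tree.iter('object'):
    signals = [(s.attrib['name'], s.attrib['handler']) for s in obj.findall('signal')]
    print(obj.attrib['id'], signals)
# main_window [('destroy', 'on_quit')]
# ok_button [('clicked', 'on_ok_clicked')]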
class LastParser(object):
    RSS_URL = "http://ws.audioscrobbler.com/2.0/user/{0}/recenttracks.rss"

    def __init__(self, user):
        self.tree = ElementTree()
        self.tree.parse(urllib2.urlopen(self.RSS_URL.format(user)))

    def get_songs(self, count=10):
        l = []
        for item in self.tree.getiterator("item"):
            d = {}
            for e in item:
                d[e.tag] = e.text
            l.append(d)
        return l[:count]

    def get_song(self):
        return self.get_songs(1)[0]

    def get_titles(self, count=10):
        l = [title.text for title in self.tree.getiterator("title")]
        return l[1:count + 1]  # removing rss title

    def get_title(self):
        return self.get_titles(1)[0]
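Hypothetical usage of LastParser; the Last.fm recent-tracks RSS endpoint above may no longer be served, and the username is a placeholder, so this only illustrates the interface:

parser = LastParser('some_username')  # placeholder account name
latest = parser.get_song()            # dict mapping each child tag of the newest <item> to its text
print(latest.get('title'), latest.get('link'))
print(parser.get_titles(5))           # up to five recent track titles, feed title skipped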
def pagetext_search(self, locator, query, rows=None, start=None):
    """Does an advanced search on "pagetext:blah locator:identifier", where
    identifier is one of the id's from fulltext search.
    You get back a list of page numbers like [21, 25, 39].
    """

    def extract(page_id):
        """TODO: DjVu format is deprecated. Is this function still even used?

        A page id is something like 'adventsuburbanit00butlrich_0065.djvu',
        which this function extracts as a locator and a leaf number
        ('adventsuburbanit00butlrich', 65).
        """
        g = re.search(r'(.*)_(\d{4})\.djvu$', page_id)
        a, b = g.group(1, 2)
        return a, int(b)

    # try using qf= parameter here and see if it gives a speedup. @@
    # pdb.set_trace()
    query = self._prefix_query('pagetext', query)
    page_hits = self.raw_search(query,
                                fq='locator:' + locator,
                                rows=rows,
                                start=start)
    XML = ElementTree()
    try:
        XML.parse(StringIO(page_hits))
    except SyntaxError as e:
        raise SolrError(e)
    page_ids = list(e.text for e in XML.getiterator('identifier'))
    return [extract(x)[1] for x in page_ids]
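A quick standalone check of the extract() regex against the docstring's own example page id:

import re

page_id = 'adventsuburbanit00butlrich_0065.djvu'
m = re.search(r'(.*)_(\d{4})\.djvu$', page_id)
print(m.group(1), int(m.group(2)))   # adventsuburbanit00butlrich 65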
def __init__(self, name, file, parent=None):
    AWindow.__init__(self, name, file, parent)

    self.builder = Gtk.Builder()
    self.builder.add_from_file(self.file)
    self.item = self.builder.get_object(self.name)
    if self.parent is not None:
        self.item.set_transient_for(self.parent.item)

    self.types = {}
    for cls in self.classes:
        if hasattr(cls, 'type'):
            self.types[cls.type] = cls

    tree = ElementTree()
    tree.parse(self.file)
    ele_widgets = tree.getiterator("object")
    for ele_widget in ele_widgets:
        name = ele_widget.attrib['id']
        widget = self.builder.get_object(name)
        type = widget.__class__.__name__
        if type in self.types:
            self.widgets[name] = self.types[type](widget)
        else:
            self.other_widgets[name] = widget

    self.item.connect('delete-event', self.emit_closed)
def normalizeXMLData(data):
    # Read in XML
    try:
        tree = ElementTree(file=StringIO.StringIO(data))
    except Exception:
        raise ValueError("Could not parse XML data")

    # Apply filters
    for filter in filters:
        for node in tree.getiterator(filter):
            node.clear()
    return tostring(tree.getroot())
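A self-contained sketch of the same idea: every element whose tag is listed in filters is cleared before re-serialising. The filters list and the sample document here are invented, and iter() stands in for the removed getiterator():

from io import StringIO
from xml.etree.ElementTree import ElementTree, tostring

filters = ['secret']   # assumed filter list for illustration

def normalize(data):
    try:
        tree = ElementTree(file=StringIO(data))
    except Exception:
        raise ValueError("Could not parse XML data")
    for tag in filters:
        for node in tree.iter(tag):   # clear() drops attributes, text and children
            node.clear()
    return tostring(tree.getroot())

print(normalize('<doc><secret token="x">hide me</secret><keep>ok</keep></doc>'))
# b'<doc><secret /><keep>ok</keep></doc>'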
def __init__(self, filename, window):
    self.widgets = {}
    self.builder = Gtk.Builder()
    self.builder.add_from_file(filename)
    self.ShowWindow(window)

    tree = ElementTree()
    tree.parse(filename)
    ele_widgets = tree.getiterator("object")
    for ele_widget in ele_widgets:
        name = ele_widget.attrib['id']
        widget = self.builder.get_object(name)
        self.widgets[name] = widget
def __init__(self, result_xml):
    et = ElementTree()
    try:
        w = result_xml.encode('utf-8')

        def tx(a):
            return (type(a), len(a))

        et.parse(StringIO(w))
    except SyntaxError as e:
        ptb = traceback.extract_stack()
        raise SolrError(e, result_xml, traceback.format_list(ptb))

    range_info = et.find('info').find('range_info')

    def gn(tagname):
        return int(range_info.findtext(tagname))

    self.total_results = gn('total_nbr')
    self.begin = gn('begin')
    self.end = gn('end')
    self.results_this_page = gn('contained_in_this_set')

    self.result_list = list(str(a.text)
                            for a in et.getiterator('identifier'))
def install(language, directory=config.default_dict_path,
            repos=config.default_repository, use_description=True):
    '''
    Download and install a dictionary file.

    language: a string of the form 'll_CC'. Example: 'en_US' for English, USA
    directory: the installation directory. Defaults to the value given in
        config.py. After installation this is the package root of 'hyphen'.
    repos: the url of the dictionary repository. (Default: as declared in
        config.py; after installation of PyHyphen this is LibreOffice's
        GIT repository.)
    '''

    # Download the dictionaries.xcu file from the LibreOffice repository if needed
    if use_description:
        # first try full language name; it won't work in all cases...
        language_ext_name = language
        descr_url = repos + language_ext_name + '/dictionaries.xcu'
        try:
            descr_file = urlopen(descr_url)
        except URLError:
            # OK. So try with the country code.
            language_ext_name = language[:2]
            descr_url = repos + language_ext_name + '/dictionaries.xcu'
            try:
                descr_file = urlopen(descr_url)
            except URLError:
                descr_file = None

    # Parse the xml file if it is present, and extract the data.
    if use_description and descr_file:
        descr_tree = ElementTree(file=descr_file)

        # Flag to catch the case that the xcu file
        # does not refer to a hyphenation dict
        found_dict = False

        # Find the nodes containing meta data of hyphenation dictionaries.
        # Iterate over all nodes.
        for node in descr_tree.getiterator('node'):
            # Check if node relates to a hyphenation dict.
            # We assume this is the case if an attribute value
            # contains the substring 'hyph'
            node_values = [i[1] for i in node.items()]
            iter_values = [i for i in node_values if ('hyph' in i.lower())]

            # Install all available hyphenation dictionaries
            for v in iter_values:
                # Found a hyphenation dict! So extract the data and
                # construct the local record
                found_dict = True
                for property in node.getchildren():
                    prop_values = [j[1] for j in property.items()]
                    for pv in prop_values:
                        if pv.lower() == 'locations':
                            # Its only child's text is a list of strings of the
                            # form %origin%<filename>. For simplicity, we only
                            # use the first filename in the list.
                            raw_dict_fn = property.getchildren()[0].text.split()[0]
                            dict_fn = raw_dict_fn[9:]  # strip the prefix '%origin%'
                            dict_url = ''.join((repos, language_ext_name, '/', dict_fn))
                            break  # skip any other values of this property
                        elif pv.lower() == 'locales':
                            # Its only child's text is a list of locales.
                            dict_locales = property.getchildren()[0].text.replace('-', '_').split()
                            break  # skip any other values of this property

                # Install the dictionary file
                dict_str = urlopen(dict_url).read()
                filepath = directory + '/' + dict_fn
                with open(filepath, 'wb') as dict_file:
                    dict_file.write(dict_str)

                # Save the metadata.
                # Generate a record for each locale, overwrite any existing ones
                new_dict = hyphen.DictInfo(dict_locales, filepath, url=dict_url)
                for l in dict_locales:
                    hyphen.dict_info[l] = new_dict

        # Catch the case that there is no hyphenation dict
        # for this language:
        if not found_dict:
            raise IOError('Cannot find hyphenation dictionary for language ' +
                          language + '.')

    # handle the case that there is no xml metadata
    else:
        # Download the dictionary guessing its URL
        dict_fn = ''.join(('hyph_dict_', language, '.dic'))
        dict_url = ''.join((repos, dict_fn))
        dict_str = urlopen(dict_url).read()
        filepath = directory + '/' + dict_fn
        with open(filepath, 'wb') as dict_file:
            dict_file.write(dict_str)

        # Store the metadata
        new_dict = hyphen.DictInfo([language], filepath)  # the URL is thus set to None.
        hyphen.dict_info[language] = new_dict

    # Save the modified metadata
    save_dict_info()
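Hypothetical call, relying on the defaults from config.py; network access and a writable dictionary directory are assumed:

install('en_US')                    # fetch and register the US-English hyphenation dictionary
print('en_US' in hyphen.dict_info)  # True once the metadata record has been stored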
import sys
from collections import namedtuple
from xml.etree.cElementTree import ElementTree
from xml.etree.cElementTree import ParseError

Contact = namedtuple('ContactRecord', 'first last age email')

try:
    tree = ElementTree().parse('results.xml')
except ParseError as e:
    print('Parse error: {err}'.format(err=e))
    sys.exit(42)

contacts = []
for contact in tree.getiterator('contact'):
    try:
        first = contact.find('.//first').text
        last = contact.find('.//last').text
        age = contact.find('./name').get('age')
        email = contact.find('.//email').text
        contacts.append(Contact(first, last, age, email))
    except AttributeError as e:
        print('Element error: {err}'.format(err=e))

print(contacts)
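The results.xml layout these XPath lookups imply (first/last nested under name, age as an attribute of name, email as a sibling) can be checked with an invented in-memory document; the sample content below is illustrative only:

from io import StringIO
from xml.etree.ElementTree import ElementTree

sample = """<contacts>
  <contact>
    <name age="42">
      <first>Ada</first>
      <last>Lovelace</last>
    </name>
    <email>ada@example.com</email>
  </contact>
</contacts>"""

root = ElementTree().parse(StringIO(sample))
for contact in root.iter('contact'):          # iter() replaces getiterator()
    print(contact.find('.//first').text,
          contact.find('.//last').text,
          contact.find('./name').get('age'),
          contact.find('.//email').text)
# Ada Lovelace 42 ada@example.com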