def test_DictCharWidget_renders_fieldset_with_label_and_field_names(self):
    """Check the input names, labels and values of the rendered fieldset.

    The widget is created with ``skip_check=True`` and rendered with two
    values; the markup is then parsed and queried with XPath.
    """
    names = [factory.make_string() for _ in range(2)]
    initials = []
    labels = [factory.make_string() for _ in range(2)]
    values = [factory.make_string() for _ in range(2)]
    sub_widgets = [
        widgets.TextInput,
        widgets.TextInput,
        widgets.CheckboxInput,
    ]
    widget = DictCharWidget(
        sub_widgets, names, initials, labels, skip_check=True
    )
    name = factory.make_string()

    # Wrap the rendered output in a single root element before parsing.
    markup = "<root>" + widget.render(name, values) + "</root>"
    rendered = fromstring(markup)

    observed = [
        XPath("fieldset/input/@name")(rendered),
        XPath("fieldset/label/text()")(rendered),
        XPath("fieldset/input/@value")(rendered),
    ]
    expected = [
        ["%s_%s" % (name, widget_name) for widget_name in names],
        labels,
        values,
    ]
    self.assertEqual(expected, observed)
def download_frm_scirp(doc_list):
    """Re-download faulty SCIRP documents and extract their contents.

    For every ``(file_name, doc_id)`` pair the XML is requested from
    ``https://www.scirp.org/xml/<doc_id>.xml``. On an HTTP 200 response the
    ``<sup>`` and ``<b>`` tags are removed from the payload, the result is
    handed to ``get_data_from_xml`` and the cleaned XML is written to
    ``file_name``. Processing is best-effort: a document that cannot be
    fetched or processed is logged and skipped.

    Args:
        doc_list: Sized collection of ``(file_name, doc_id)`` pairs.

    Returns:
        list: The ``get_data_from_xml`` result of every document that was
        parsed successfully, in input order.
    """
    title_query = XPath('.//article-title')
    abstract_query = XPath('.//abstract/p')
    body_query = XPath('.//body/sec')
    strip_elements_list = ['xref', 'title', 'b', 'sup', 'table-wrap']
    logger.info("Will retry processing for {} scirp faulty files".format(
        len(doc_list)))

    result_dict_list = []
    for file_name, doc_id in doc_list:
        logger.debug('Processing : {}'.format(file_name))
        urlfile = "https://www.scirp.org/xml/{}.xml".format(str(doc_id))
        try:
            response = requests.get(urlfile)
            if response.status_code != 200:
                # Only this case really means "there is no substitute file".
                logger.info(
                    "No substitute XML exists for {}".format(file_name))
                continue

            xml_text = re.sub(
                '<sup>|</sup>|<b>|</b>', '',
                response.content.decode('utf-8')).encode('utf-8')
            result_dict = get_data_from_xml(
                xml_text=xml_text,
                title_query=title_query,
                abstract_query=abstract_query,
                body_query=body_query,
                # The keyword spelling is dictated by get_data_from_xml.
                strip_elemnts_list=strip_elements_list,
                strip_tags_list=[],
                namespaces=None)
            result_dict_list.append(result_dict)
            with open(file_name, 'wb') as xml_file:
                xml_file.write(xml_text)
        except Exception:
            # Keep going with the remaining documents, but record the real
            # cause instead of reporting every failure as a missing file.
            logger.warning(
                "Could not process substitute XML for {}".format(file_name),
                exc_info=True)
    return result_dict_list
def area_codes():
    """Run the structure search and list the links found in the result grid.

    Returns:
        list: One dict per result link, with the keys ``"href"`` (the part
        of the link after the last ``"../"``) and ``"title"`` (the first
        text node of the link).
    """
    # The page has to be fetched first: its form data is sent back with the
    # POST below.
    landing_page = pm.get_page(home_url)
    form = pm.get_formDataCuric(landing_page)
    search_fields = {
        'ctl00$ctl00$countdowntimer2$hdnCountdowntime': '0',
        'ctl00$ctl00$ddlAcademicYear': searchYear,
        'ctl00$ctl00$cphContent$ContentPlaceHolder1$ddlStructureLevel': '2',
        'ctl00$ctl00$cphContent$ContentPlaceHolder1$txtStructureCode': '',
        'ctl00$ctl00$cphContent$ContentPlaceHolder1$txtStructureTitle': '',
        'ctl00$ctl00$cphContent$ContentPlaceHolder1$btnSearch': 'Search',
        'hiddenInputToUpdateATBuffer_CommonToolkitScripts': '1',
    }
    form.update(search_fields)

    # TODO: the search result should rarely change; check for a stored copy
    # before sending the request.
    results_page = pm.post_page(home_url, form)

    link_xpath = XPath(
        '//*[@id="ctl00_ctl00_cphContent_ContentPlaceHolder1_gvStructureSearch"]//tr/td/a')
    href_xpath = XPath('@href')
    title_xpath = XPath('text()')

    return [
        {
            "href": href_xpath(link)[0].split("../")[-1],
            "title": title_xpath(link)[0],
        }
        for link in link_xpath(results_page)
    ]
class AccountLifeInsurance(IsinMixin, Page):
    """Page exposing the investments table of a life-insurance account."""

    _re_isin = re.compile(r'isin=(\w+)')
    _tr_list = XPath(
        '//div[@id="content-gauche"]//table[@class="list"]/tbody/tr')
    _td_list = XPath('./td')
    _link = XPath('./td[1]/a/@href')

    def get_investment(self):
        """Yield one ``Investment`` per row of the investments table.

        Boursorama table cells
        ----------------------
        0. Fonds
        1. Date de valeur
        2. Valeur de part
        3. Nombre de parts
        4. Contre valeur
        5. Prix revient
        6. +/- value en €*
        7. +/- value en %*
        """
        # Investment model, for reference:
        #   label       Label of stocks
        #   code        Identifier of the stock (ISIN code)
        #   description Short description of the stock
        #   quantity    Quantity of stocks
        #   unitprice   Buy price of one stock
        #   unitvalue   Current value of one stock
        #   valuation   Total current valuation of the Investment
        #   diff        Difference between the buy cost and the current
        #               valuation
        for row in self._tr_list(self.document):
            cells = [el_to_string(cell) for cell in self._td_list(row)]
            link = unicode(self._link(row)[0])

            inv = Investment()
            isin = self.get_isin(link)
            if isin:
                inv.id = isin
                inv.code = isin

            inv.label = cells[0]
            inv.quantity = Decimal(cells[3])
            inv.valuation = Decimal(cells[4])
            inv.unitprice = Decimal(cells[5])
            inv.unitvalue = Decimal(cells[2])
            inv.diff = Decimal(cells[6])

            # Only quotation pages are kept as a detail link.
            if '/cours.phtml' in link:
                inv._detail_url = link
            else:
                inv._detail_url = None
            yield inv

    def get_valuation_diff(self, account):
        """Store the "Total des +/- values" cell on ``account.valuation_diff``."""
        # NOTE(review): ``xpath`` presumably returns a list of elements here,
        # which ``Decimal`` is unlikely to accept as-is; confirm whether the
        # text of the first match was intended.
        total_cells = self.document.xpath(
            '//td[contains(text(), "Total des +/- values **")]/following-sibling::*[1]'
        )
        account.valuation_diff = Decimal(total_cells)
def generate(self, nsmap):
    """Create the ORM attributes for this property and attach them.

    Nothing is generated unless ``self.used`` is true. Depending on
    ``self.range`` the property becomes an enum reference (name column plus
    relationship), a relationship (delegated to ``generate_relationship``)
    or a plain column whose type follows ``self.mapped_datatype``. In the
    enum and plain-column cases ``self.key`` and ``self.xpath`` are set.

    Args:
        nsmap: Namespace mapping passed to the compiled XPath queries.
    """
    attrs = OrderedDict()
    dt = self.mapped_datatype
    if not self.used:
        return

    if isinstance(self.range, CIMEnum):
        var, query_base = self.name_query()
        name_column = Column(String(120), ForeignKey(CIMEnumValue.name),
                             name=f"{var}_name")
        attrs[f"{var}_name"] = name_column
        attrs[var] = relationship(CIMEnumValue, foreign_keys=name_column)
        self.key = f"{var}_name"
        self.xpath = XPath(query_base + "/@rdf:resource", namespaces=nsmap)
    elif self.range:
        self.generate_relationship(nsmap)
    else:
        var, query_base = self.name_query()
        log.debug(f"Generating property for {var} on {self.name}")
        self.key = var
        self.xpath = XPath(query_base + "/text()", namespaces=nsmap)

        if not dt:
            # Fallback to parsing as String(50)
            column_type = String(50)
        elif dt == "String":
            column_type = String(50)
        elif dt in ("Float", "Decimal"):
            column_type = Float
        elif dt == "Integer":
            column_type = Integer
        elif dt == "Boolean":
            column_type = Boolean
        else:
            # Any other datatype is stored as a shorter string.
            column_type = String(30)
        attrs[var] = Column(column_type, name=f"{var}")

    for attr_name, attr_value in attrs.items():
        setattr(self.cls.class_, attr_name, attr_value)
def __init__(self):
    """Compile the two namespaced XPath queries used on the catalog."""
    namespaces = {'c': 'https://toolkit.cit-ec.uni-bielefeld.de/CITKat'}

    # Takes the XPath variables $version and $filename_wo_version.
    # TODO: needs better escaping of all special uri-chars
    other_versions_query = (
        "/c:catalog/child::node()[not(@version = $version) and "
        "c:filename = concat($filename_wo_version, '-', translate(@version, '#', '_'))]"
    )
    self.xpath_has_other_versions = XPath(
        other_versions_query, namespaces=namespaces)
    self.xpath_get_version = XPath(
        '/c:catalog/child::node()/@version', namespaces=namespaces)
def students_more_details(pmstudentid):
    """Collect the next-of-kin, notes and ULN fields of a student.

    Args:
        pmstudentid: Student id appended to the "further details" URL.

    Returns:
        dict: Keys ``'Nok Mobile'``, ``'Nok'``, ``'Notes'``, ``'Nok Email'``
        and ``'ULN'``; every value is the XPath result passed through
        ``check_list`` (``'Nok'`` and ``'Notes'`` additionally have leading
        and trailing CR/LF characters stripped).
    """
    page = pm.get_page(
        '/ILP/Information/furtherdetails.aspx?pmstudentid=' + pmstudentid)

    def lookup(expression):
        # Evaluate one query on the page and normalise it via check_list.
        return check_list(XPath(expression)(page))

    details = {}
    details['Nok Mobile'] = lookup(
        '//*[@id="Content_Content_txtNextOfKinMobileValue"]/text()')
    details['Nok'] = lookup(
        '//*[@id="Content_Content_txtNextOfKin_Spellchecker_cccTextArea"]/text()'
    ).strip('\r\n')
    details['Notes'] = lookup(
        '//*[@id="Content_Content_txtNotes_Spellchecker_cccTextArea"]/text()'
    ).strip('\r\n')
    details['Nok Email'] = lookup(
        '//*[@id="Content_Content_txtNextOfKinEmail"]/@value')
    details['ULN'] = lookup('//*[@id="Content_Content_txtULN"]/text()')
    return details
def test_class_attributes(self):
    """The shelter's XPath attributes compile and its start URL is absolute."""
    # Compiling is enough to detect a syntactically invalid expression.
    try:
        XPath(self.shelter.animal_url)
        XPath(self.shelter.next_url)
    except XPathSyntaxError as e:
        self.fail(e.msg)

    start = urlparse(self.shelter.start_url)
    self.assertIn(start.scheme, ('http', 'https'))
    self.assertTrue(start.netloc)
def _generateXPathMap(cls):
    """Add the ``category`` and ``stereotype`` queries to ``cls.XPathMap``.

    The parent implementation runs first; on a name clash the entries
    defined here override those already in the map.
    """
    super()._generateXPathMap()
    own_queries = {
        "category": XPath(r"cims:belongsToCategory/@rdf:resource",
                          namespaces=cls.nsmap),
        "stereotype": XPath(r"cims:stereotype/text()",
                            namespaces=cls.nsmap),
    }
    cls.XPathMap = (
        {**cls.XPathMap, **own_queries} if cls.XPathMap else own_queries
    )
def _generateXPathMap(cls):
    """Add the ``domain`` and ``multiplicity`` queries to ``cls.XPathMap``.

    The parent implementation runs first; on a name clash the entries
    defined here override those already in the map.
    """
    super()._generateXPathMap()
    own_queries = {
        "domain": XPath(r"rdfs:domain/@rdf:resource",
                        namespaces=cls.nsmap),
        "multiplicity": XPath(r"cims:multiplicity/@rdf:resource",
                              namespaces=cls.nsmap),
    }
    cls.XPathMap = (
        {**cls.XPathMap, **own_queries} if cls.XPathMap else own_queries
    )
class InvestmentDetail(IsinMixin, Page):
    """Page from which the ISIN code and description of an investment are read."""

    # Raw string: ``\w`` is not a valid escape in a normal string literal.
    _re_isin = re.compile(r'(\w+)')
    # <h2> elements whose class list contains the token "fv-isin".
    _isin = XPath('//h2[@class and contains(concat(" ", normalize-space(@class), " "), " fv-isin ")]')
    _description = XPath('//p[@class="taj"] | //div[@class="taj"]')

    def get_investment_detail(self, inv):
        """Fill ``inv.id``, ``inv.code`` and ``inv.description`` from the page.

        Args:
            inv: Investment object updated in place.

        Raises:
            IndexError: If the page has no ISIN heading or no description
                element.
        """
        subtitle = el_to_string(self._isin(self.document)[0])
        inv.id = inv.code = self.get_isin(subtitle)
        inv.description = el_to_string(self._description(self.document)[0]).strip()
def determine_sentence_type(xml_tree):
    """Classify ``xml_tree`` by the first XPath spec that matches it.

    The specs are tried in a fixed order (refused, upheld, without cause,
    predictive text); ``SentenceType.irrelevant`` is returned when none of
    them yields a result.
    """
    ordered_checks = (
        (refused_xpath_spec, SentenceType.refused),
        (upheld_xpath_spec, SentenceType.upheld),
        (without_cause_xpath_spec, SentenceType.without_cause),
        (predictive_text_xpath_spec, SentenceType.predictive_text),
    )
    for spec, sentence_type in ordered_checks:
        if len(XPath(spec)(xml_tree)) > 0:
            return sentence_type
    return SentenceType.irrelevant
def check_recommended(self, ds):
    """Return the recommended checks as ``(name, compiled XPath)`` pairs.

    Every query is compiled with ``self.ns`` as its namespace mapping.
    ``ds`` is not used here.
    """
    recommended_paths = [
        ('sensor_descriptions',
         "/sml:SensorML/sml:member/sml:System/sml:components/sml:ComponentList/sml:component/sml:System/gml:description"),
        ('sensor_ids',
         "/sml:SensorML/sml:member/sml:System/sml:components/sml:ComponentList/sml:component/sml:System/@gml:id"),
        ('sensor_names',
         "/sml:SensorML/sml:member/sml:System/sml:components/sml:ComponentList/sml:component/@name"),
        ('data_format_template_version',
         "/sml:SensorML/sml:capabilities/swe:SimpleDataRecord/swe:field[@name='ioosTemplateVersion']/swe:Text/swe:value"),
        ('variable_names',
         "/sml:SensorML/sml:member/sml:System/sml:components/sml:ComponentList/sml:component/sml:System/sml:outputs/sml:OutputList/sml:output/swe:Quantity/@definition"),
        ('variable_units',
         "/sml:SensorML/sml:member/sml:System/sml:components/sml:ComponentList/sml:component/sml:System/sml:outputs/sml:OutputList/sml:output/swe:Quantity/swe:uom/@code"),
        ('network_id',
         "/sml:SensorML/sml:member/sml:System/sml:capabilities[@name='networkProcedures']/swe:SimpleDataRecord/gml:metaDataProperty/@xlink:href"),
        ('operator_sector',
         "/sml:SensorML/sml:member/sml:System/sml:classification/sml:ClassifierList/sml:classifier[@name='operatorSector']/sml:Term/sml:value"),
    ]
    return [
        (check_name, XPath(path, namespaces=self.ns))
        for check_name, path in recommended_paths
    ]
def student_info(pmstudentid):
    """Read the contact details shown on a student's details page.

    Args:
        pmstudentid: Student id appended to the details URL.

    Returns:
        dict: Keys ``'dob'``, ``'address'``, ``'telephone'``, ``'mobile'``
        and ``'email'``. Every value goes through ``check_list`` except
        ``'address'``, which is the raw XPath result.
    """
    page = pm.get_page(
        "/ilp/information/details.aspx?pmstudentid=" + pmstudentid)
    id_prefix = '//*[@id="ctl00_ctl00_cphContent_ContentPlaceHolder1_'

    def text_of(control):
        # Text nodes of the element whose id ends with ``control``.
        return XPath(id_prefix + control + '"]/text()')(page)

    info = {}
    info['dob'] = check_list(text_of('txtDOB'))
    info['address'] = text_of('txtAddress')
    info['telephone'] = check_list(text_of('txtTelephone'))
    info['mobile'] = check_list(text_of('txtMobile'))
    info['email'] = check_list(text_of('lnkEmail'))
    return info
def course_codes(url):
    """List the student groups on the page at ``'/' + url``.

    Rows whose third cell has a text node different from ``"0"`` are kept.

    Returns:
        list: One dict per kept row with the keys ``"href"`` (link target
        after the last ``"../"``), ``"students"`` (first text node of the
        third cell) and ``"title"`` (first text node of the first cell).
    """
    page = pm.get_page('/' + url)
    rows_xpath = XPath(
        '//*[@id="ctl00_ctl00_cphContent_ContentPlaceHolder1_gvStudentGroups"]//tr[td[3]//text()!="0"]')
    href_xpath = XPath('td[1]/a/@href')
    title_xpath = XPath('td[1]//text()')
    students_xpath = XPath('td[3]//text()')

    return [
        {
            "href": href_xpath(row)[0].split("../")[-1],
            "students": students_xpath(row)[0],
            "title": title_xpath(row)[0],
        }
        for row in rows_xpath(page)
    ]
def _generateXPathMap(cls):
    """Add the ``isFixed`` query to ``cls.XPathMap``.

    The parent implementation runs first; on a name clash the entry defined
    here overrides the one already in the map.
    """
    super()._generateXPathMap()
    own_queries = {
        "isFixed": XPath(r"cims:isFixed/@rdfs:Literal", namespaces=cls.nsmap),
    }
    cls.XPathMap = (
        {**cls.XPathMap, **own_queries} if cls.XPathMap else own_queries
    )
def generate_relationship(self, nsmap=None):
    """Create the relationship attributes for this property.

    For a to-many remote side an association table is generated and used
    as ``secondary``; otherwise a ``<var>_id`` foreign-key column is created
    and used for the relationship. A backref is added when the property has
    an inverse. The attributes are set on ``self.cls.class_``.

    Args:
        nsmap: Namespace mapping for the compiled resource XPath.

    Returns:
        dict: Always an empty dict.
    """
    # NOTE(review): the source was flattened; ``self.key`` is assumed to
    # belong to the to-one branch and ``self.xpath`` to both branches.
    # Confirm against the original layout.
    var, query_base = self.name_query()
    attrs = {}
    Map = {}
    log.debug(f"Generating relationship for {var} on {self.name}")

    def backref_options():
        # Keyword arguments carrying the backref name, if there is an inverse.
        if not self.inverse:
            return {}
        if self.namespace == "cim":
            return {"backref": self.inverse.label}
        return {"backref": self.namespace + "_" + self.inverse.label}

    if self.many_remote:
        options = backref_options()
        association = self.generate_association_table()
        if options:
            # The table is only remembered when an inverse exists.
            self.association_table = association
        attrs[var] = relationship(self.range.label, secondary=association,
                                  **options)
    else:
        fk_column = Column(String(50), ForeignKey(f"{self.range.label}.id"),
                           name=f"{var}_id")
        attrs[f"{var}_id"] = fk_column
        options = backref_options()
        attrs[var] = relationship(self.range.label, foreign_keys=fk_column,
                                  **options)
        self.key = f"{var}_id"

    self.xpath = XPath(query_base + "/@rdf:resource", namespaces=nsmap)

    target = self.cls.class_
    for attr_name, attr_value in attrs.items():
        setattr(target, attr_name, attr_value)
    return Map
def opt_filter(self, my_filter):
    """Register ``my_filter`` after checking that it compiles as XPath.

    Raises:
        usage.UsageError: If the expression has a syntax error.
    """
    try:
        # Compiling is only used to weed out invalid filters.
        XPath(my_filter)
    except XPathSyntaxError:
        raise usage.UsageError("Invalid XPath expression: %s" % my_filter)
    else:
        self['filters'].append(my_filter)
def parse_scan_results(self, data):
    """Return how many ``<span>`` elements have a text node containing 'FAILED'.

    Args:
        data: HTML markup of the scan report.
    """
    document = lxml.html.fromstring(data)
    find_failed = XPath("//span[contains(text(), 'FAILED')]")
    return len(find_failed(document))
def _generateXPathMap(cls):
    """Add the ``type`` query to ``cls.XPathMap``.

    The parent implementation runs first; on a name clash the entry defined
    here overrides the one already in the map.
    """
    super()._generateXPathMap()
    own_queries = {
        "type": XPath(r"rdf:type/@rdf:resource", namespaces=cls.nsmap),
    }
    cls.XPathMap = (
        {**cls.XPathMap, **own_queries} if cls.XPathMap else own_queries
    )
def valid_xpath(self, to_validate=''):
    """
    Tell whether an XPath expression compiles.

    Notes:
        When "to_validate" is empty, the calling object's own locator is
        checked instead.

    Args:
        to_validate (str): Expression to check in place of the calling
            object's own locator.

    Returns:
        bool: True if the expression compiles (and is non-empty), False on
            a syntax error.
    """
    try:
        expression = to_validate or self.locator
        return bool(XPath(expression).path)
    except XPathSyntaxError:
        return False
def select(self, xpath=None, pyquery=None):
    """Run either an XPath or a pyquery selection on this node.

    Exactly one of ``xpath`` and ``pyquery`` must be given. Compiled XPath
    objects are kept in ``XPATH_CACHE``; the time spent is added to
    ``GLOBAL_STATE['selector_time']``.

    Returns:
        The matches wrapped by ``self.wrap_list``.

    Raises:
        Exception: If both options are ``None`` or both are set.
    """
    started = time.time()
    if xpath is None:
        if pyquery is None:
            raise Exception('Both xpath and pyquery option are None')
        matches = self.pyquery_node().find(pyquery)
        val = self.wrap_list(matches, 'pyquery', pyquery)
        query_exp = pyquery
    else:
        if pyquery is not None:
            raise Exception('Both xpath and pyquery option are not None')
        if xpath not in XPATH_CACHE:
            XPATH_CACHE[xpath] = XPath(xpath)
        compiled = XPATH_CACHE[xpath]
        val = self.wrap_list(compiled(self.node), 'xpath', xpath)
        query_exp = xpath

    total = time.time() - started
    if DEBUG_LOGGING:
        logger.debug(u'Performed query [%s], elements: %d, time: %.05f sec' %
                     (query_exp, len(val), total))
    GLOBAL_STATE['selector_time'] += total
    return val
def extract_uitspraak(ruling_tree):
    """Extract the text of the ruling ("uitspraak") part of a document.

    Three XPath queries are tried in order of precedence; the text nodes of
    the first query that matches anything are joined with newlines and
    passed through ``clean`` and ``filter_out_wijzers``.

    Previously all three queries were evaluated but only the first result
    was ever used, so the other two could never contribute.

    Args:
        ruling_tree: Parsed ruling document the queries are run against.

    Returns:
        The cleaned text of the first matching query. If no query matches,
        the empty text is cleaned and returned, exactly as before.
    """
    from lxml.etree import XPath

    # Ordered from most to least preferred.
    xpath_strs = [
        "/open-rechtspraak/rvr:uitspraak/rvr:section//rvr:*/text()[contains(., 'DE UITSPRAAK')]/"
        "ancestor::rvr:section/descendant-or-self::text() | "
        "/open-rechtspraak/rvr:uitspraak/rvr:section//rvr:*/text()[contains(., 'DE UITSPRAAK')]/"
        "ancestor::rvr:section/following-sibling::rvr:section/descendant-or-self::text()",
        '/open-rechtspraak/rvr:uitspraak/rvr:section[@role="beslissing"][last()]/descendant-or-self::text()',
        "/open-rechtspraak/rvr:uitspraak/rvr:section/rvr:title/text()[contains(., 'eslissing')][last()]//"
        "ancestor::rvr:section/descendant-or-self::text()"
    ]

    text = ''
    for xpath_str in xpath_strs:
        xpath = XPath(xpath_str, namespaces=NAMESPACE_PREFIX_MAP)
        text = '\n'.join(xpath(ruling_tree))
        if text:
            # Later queries are fallbacks only.
            break
    return filter_out_wijzers(clean(text))
def extract_standpunt_adv(ruling_tree):
    """Extract the text following the defence's position ("standpunt").

    The XPath queries are tried in order; the text nodes of the first query
    that matches anything are joined with newlines and cleaned.

    Previously the function collected one entry per query (always two) and
    then required exactly one entry, so it always ended in ``assert False``
    and could never return text.

    Args:
        ruling_tree: Parsed ruling document the queries are run against.

    Returns:
        The cleaned text of the first matching query, or ``None`` when no
        query matches.
    """
    from lxml.etree import XPath

    xpath_strs = [
        "/open-rechtspraak/rvr:uitspraak/rvr:section//rvr:parablock/rvr:para/"
        "descendant-or-self::*[contains(text(), 'standpunt van de verdediging')]/"
        "ancestor::rvr:para/following-sibling::rvr:para/text()",
        "/open-rechtspraak/rvr:uitspraak/rvr:section//rvr:paragroup/"
        "descendant-or-self::*[contains(text(), 'standpunt van de Verdediging') "  # TODO: unneeded
        "or contains(text(), 'standpunt van de verdediging')"
        "or contains(text(), 'standpunt van verdediging')]/"  # TODO: unneeded
        "parent::*/descendant::rvr:parablock/descendant::*/text()"
    ]
    # Disabled third candidate, kept for reference:
    #   "/open-rechtspraak/rvr:uitspraak/rvr:section//rvr:para/"
    #   "rvr:emphasis[text()='Het standpunt van de verdediging']/"
    #   "ancestor::rvr:para/following-sibling::rvr:*/text()"

    for xpath_str in xpath_strs:
        xpath = XPath(xpath_str, namespaces=NAMESPACE_PREFIX_MAP)
        text = '\n'.join(xpath(ruling_tree))
        if text:
            return clean(text)
    return None
def xpath_results(node, xpath_expr):
    """Evaluate ``xpath_expr`` on ``node`` using ``REGEXPNAMESPACE``.

    Args:
        node: Node the compiled expression is called with.
        xpath_expr: XPath expression; nothing is evaluated when it is empty
            or ``None``.

    Returns:
        The result of the XPath evaluation, or ``None`` when ``xpath_expr``
        is falsy.

    Raises:
        Exception: Any failure while compiling or evaluating is re-raised
            as ``Exception((original_error, xpath_expr))`` so the offending
            expression is part of the error.
    """
    if not xpath_expr:
        return None
    try:
        finder = XPath(xpath_expr, namespaces=REGEXPNAMESPACE)
        return finder(node)
    # ``as`` replaces the Python-2-only ``except Exception, ex`` form and is
    # accepted by Python 2.6+ as well as Python 3.
    except Exception as ex:
        raise Exception((ex, xpath_expr))
def get_xpath(xpath_spec):
    """Return a compiled XPath, compiling and caching strings on demand.

    There is no thread lock. Worst case scenario, the same string is
    compiled more than one time.

    Args:
        * xpath_spec (str|lxml.etree.XPath): XPath as a str or lxml.etree.XPath

    Returns:
        * result (lxml.etree.XPath): the compiled expression; an XPath
          object passed in is returned unchanged.

    Raises:
        * TypeError: Raise when xpath_spec is neither a str nor a lxml.etree.XPath
        * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
    """
    if isinstance(xpath_spec, str):
        compiled = xpath_cache.get(xpath_spec, None)
        if compiled is not None:
            return compiled
        try:
            compiled = XPath(xpath_spec)
        except XPathSyntaxError as e:
            raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
        xpath_cache[xpath_spec] = compiled
        return compiled

    if not isinstance(xpath_spec, XPath):
        raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')
    return xpath_spec
def accept_expr(self):
    """Validate the entered expression and store the dialog's settings.

    The expression is compiled as a CSS selector or as XPath, depending on
    which radio button is checked. On success the compiled selector is put
    into ``self.state['select']`` and the dialog is closed; an empty or
    invalid expression shows a warning and gives the focus back to the
    input field.

    Raises:
        NotImplementedError: If the selection method is neither CSS nor
            XPath.
    """
    method = 'csssel' if self.rb1.isChecked() else 'xpath'
    expr = self.le_expr.text().strip()
    if not expr:
        QMessageBox.warning(self, "警告", '表达式不可为空', QMessageBox.Cancel)
        self.le_expr.setFocus()
    elif method == 'csssel':
        try:
            self.state['select'] = CSSSelector(expr)
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are not reported as a bad expression.
        except Exception:
            QMessageBox.warning(self, "警告", '错误的 CSS选择器 表达式',
                                QMessageBox.Cancel)
            self.le_expr.setFocus()
        else:
            self.close()
    elif method == 'xpath':
        try:
            self.state['select'] = XPath(expr)
        except Exception:
            QMessageBox.warning(self, "警告", '错误的 XPath 表达式',
                                QMessageBox.Cancel)
            self.le_expr.setFocus()
        else:
            self.close()
    else:
        # Report the unsupported method, not the expression.
        raise NotImplementedError('unsupported method %r' % method)

    # NOTE(review): these settings are stored even after a warning was
    # shown above; confirm that this is intended.
    self.state['numfmt'] = self.le_numfmt.text()
    self.state['only_modify_text'] = self.cb1.isChecked()
    self.state['unique_strategy'] = (
        'inepub' if self.cb2.isChecked() else 'inhtml')
def evaluateXPath(path, element):
    """Evaluate the XPath ``path`` on ``element``.

    ``xml.xpath`` is used when it can be imported; otherwise the element is
    serialised with ``toPrettyXML`` and evaluated through lxml.

    Returns:
        A boolean result is returned as is. Otherwise a list: with
        ``xml.xpath`` attribute nodes are replaced by their values, with
        lxml strings are kept and elements are re-parsed into DOM document
        elements. A single non-iterable result is wrapped in a list.
    """
    try:
        import xml.dom
        from xml.xpath import Evaluate

        result = Evaluate(path, element)
        if hasattr(result, '__iter__'):
            # Replace attribute nodes by their values, in place.
            for index, item in enumerate(result):
                if (isinstance(item, xml.dom.Node)
                        and item.nodeType == xml.dom.Node.ATTRIBUTE_NODE):
                    result[index] = item.value
            return result
        if isinstance(result, bool):
            return result
        return [result]
    except ImportError:
        # Implementation for etree
        from lxml.etree import XPath, fromstring, tounicode

        serialised = toPrettyXML(element)
        matches = XPath(path).evaluate(fromstring(serialised))
        if not hasattr(matches, '__iter__'):
            return matches if isinstance(matches, bool) else [matches]
        # The result is either a list of strings or a list of elements.
        return [
            match if isinstance(match, basestring)
            else parseXMLString(tounicode(match)).documentElement
            for match in matches
        ]
def selector_converter(selector):
    r'''
    Create a selector out of a string or number.

    If the input is a string, a CSS selector is created, falling back to an
    XPath selector when the string is not valid CSS. If the input is a
    number or a tuple/list of numbers, the selector is turned into a slice
    selector. An existing CSSSelector is returned unchanged, and a falsy
    input (None, '', 0, an empty list) yields None.

    CSS query grouping, where (\w+ > \w+) ~ \w+ would select the sibling of
    the parent element in the parenthesized part, is currently disabled.

    Raises an Exception when the value cannot be turned into a selector.
    '''
    # Disabled grouping support, kept for reference:
    #   parent_sibling = r'\((.*)\s*>\s*(.*)\s*\)\s*~\s*(.*)'
    #   if re.match(parent_sibling, selector):
    #       parent, child, sibling = re.match(parent_sibling, selector).groups()
    #       selector = lambda x: [el.getparent().getnext()
    #                             for el in css(parent+'>'+child)(x)]
    #       return selector
    if not selector:
        return None

    # bool is a subclass of int but was never treated as an index.
    if isinstance(selector, int) and not isinstance(selector, bool):
        return SliceSelector((selector, ))
    if isinstance(selector, (list, tuple)):
        return SliceSelector(selector)
    if isinstance(selector, lxml.cssselect.CSSSelector):
        return selector

    try:
        return lxml.cssselect.CSSSelector(selector)
    except lxml.cssselect.SelectorSyntaxError:
        # Not valid CSS: interpret the string as XPath instead.
        return XPath(selector)
    except Exception:
        raise Exception('This value for a selector was not understood', selector)
def compile_selector(self, expr, default_type):
    """
    Compile one selector string.

    Returns ``(selector_type, selector_object, expression_string,
    attributes)``: ``selector_type`` is a string (``"elements"``,
    ``"children"``, etc), ``selector_object`` is a callable that returns
    elements, ``expression_string`` is the expression exactly as passed in
    and ``attributes`` is the attribute list for the
    ``attributes(attr1, attr2):`` form.

    The part after the prefix is compiled as XPath when it starts with
    ``/`` and as a CSS selector otherwise.

    Raises ``DeliveranceSyntaxError`` for a type that is incompatible with
    the selector's major type or for a bad CSS selector.
    """
    selector_type, attributes, rest_expr = self.parse_prefix(
        expr, default_type=default_type)
    if not self.types_compatible(selector_type, self.major_type):
        raise DeliveranceSyntaxError(
            "Expression %s in selector %r uses the type %r, but this is not "
            "compatible with the type %r already declared earlier in the selector"
            % (expr, self, selector_type, self.major_type))

    if rest_expr.startswith('/'):
        return (selector_type, XPath(rest_expr), expr, attributes)

    try:
        css_selector = CSSSelector(rest_expr)
    except AssertionError as e:
        raise DeliveranceSyntaxError('Bad CSS selector: "%s" (%s)' % (expr, e))
    return (selector_type, css_selector, expr, attributes)