def parse(self, el):
    """Fill ``self.env`` from the HTML popup fragment embedded in ``el.text``.

    The fragment is the ``html`` named group of ``self.extract`` matched
    against ``el.text``; it is unescaped and parsed with ``lxml.html``.

    Keys written to ``self.env``:

    * ``name``, ``object``, ``id`` -- always, from the popup title spans
      and the diagram image URL;
    * ``datetime``, ``levelvalue``, ``flowvalue``, ``forecast`` -- only
      when the matching table row ("Datum", "Wasserstand", "Durchfluss",
      "Tendenz") is present;
    * ``alarm`` -- always ``None`` for now (see TODO below).

    :param el: element whose ``text`` holds the escaped HTML payload.
    """
    raw = self.extract.match(el.text).group("html")
    # Undo the escaping of quotes and slashes, and drop escaped newlines.
    raw = raw.replace('\\"', '"').replace('\\n', '').replace('\\/', '/')
    parsed = lxml.html.fromstring(raw)

    self.env['name'] = CleanText('.//span[@class="popUpTitleBold"]')(parsed)
    self.env['object'] = CleanText('.//span[@class="popUpTitleNormal"]')(parsed).strip(' /')
    url = Attr('.//div[@class="popUpMsDiagramm"]/img', 'src')(parsed)
    # The identifier is the second "_"-separated chunk of the image URL.
    self.env['id'] = url.split('_')[1]

    for tr in parsed.xpath('.//tr'):
        td = tr.xpath('.//td')
        if not td:
            continue
        # lxml reports ``.text`` as None when a cell has no leading text
        # (e.g. it starts with a child element); treat that as an empty
        # label instead of crashing on ``"..." in None``.
        label = td[0].text or ''
        if len(td) == 1 and "Datum" in label:
            parts = label.split()[1:3]
            self.env['datetime'] = "%s %s" % (parts[0], parts[1])
        elif len(td) == 2:
            if "Wasserstand" in label:
                self.env['levelvalue'] = td[1].text.split()[0]
            elif "Durchfluss" in label:
                self.env['flowvalue'] = td[1].text.split()[0]
            elif "Tendenz" in label:
                try:
                    # Keep only the file name of the trend image.
                    self.env['forecast'] = Attr('img', 'src')(td[1]).split("/")[-1]
                except ParseError:
                    self.env['forecast'] = None
    # TODO
    self.env['alarm'] = None
def parse(self, el):
    """Extract gauge data from the popup HTML carried by ``el.text``.

    ``self.extract`` is matched against ``el.text`` and its ``html`` group
    is unescaped, parsed with ``lxml.html`` and scanned. Results are stored
    in ``self.env`` under ``name``, ``object``, ``id`` and, when the
    corresponding table rows exist, ``datetime``, ``levelvalue``,
    ``flowvalue`` and ``forecast``. ``alarm`` is always set to ``None``.

    :param el: element whose ``text`` contains the escaped HTML fragment.
    """
    raw = self.extract.match(el.text).group("html")
    # Remove the backslash escaping around quotes/slashes and the escaped
    # newlines before handing the markup to lxml.
    raw = raw.replace('\\"', '"').replace('\\n', '').replace('\\/', '/')
    parsed = lxml.html.fromstring(raw)

    self.env['name'] = CleanText('.//span[@class="popUpTitleBold"]')(parsed)
    self.env['object'] = CleanText('.//span[@class="popUpTitleNormal"]')(parsed).strip(' /')
    url = Attr('.//div[@class="popUpMsDiagramm"]/img', 'src')(parsed)
    self.env['id'] = url.split('_')[1]

    for tr in parsed.xpath('.//tr'):
        cells = tr.xpath('.//td')
        if len(cells) not in (1, 2):
            # Only one- and two-cell rows carry data we read.
            continue

        # ``.text`` is None for a cell without leading text; fall back to
        # an empty label so the membership tests below cannot raise.
        label = cells[0].text or ''

        if len(cells) == 1:
            if "Datum" in label:
                stamp = label.split()[1:3]
                self.env['datetime'] = "%s %s" % (stamp[0], stamp[1])
            continue

        if "Wasserstand" in label:
            self.env['levelvalue'] = cells[1].text.split()[0]
        elif "Durchfluss" in label:
            self.env['flowvalue'] = cells[1].text.split()[0]
        elif "Tendenz" in label:
            try:
                # Last path component of the trend image URL.
                self.env['forecast'] = Attr('img', 'src')(cells[1]).split("/")[-1]
            except ParseError:
                self.env['forecast'] = None
    # TODO
    self.env['alarm'] = None
def get_history_jid(self):
    """Return the second ``:``-separated part of the history span id.

    The id is read from the first ``<span>`` whose ``id`` starts with
    ``index:j_id``. Returns ``None`` when ``self.is_asv`` is true.
    """
    if not self.is_asv:
        element_id = Attr('//span[starts-with(@id, "index:j_id")]', 'id')(self.doc)
        return element_id.split(':')[1]

    # Assurance Vie (life insurance): this kind of account is not supported.
    return None
def get_history_jid(self):
    """Return the second ``:``-separated part of the history span id.

    Returns ``None`` when the document contains more than one
    ``<span id="index:panelASV">``; otherwise the id is read from the
    first ``<span>`` whose ``id`` starts with ``index:j_id``.
    """
    asv_panels = self.doc.xpath('//span[@id="index:panelASV"]')
    if len(asv_panels) > 1:
        # Assurance Vie (life insurance): this kind of account is not supported.
        return None

    element_id = Attr('//span[starts-with(@id, "index:j_id")]', 'id')(self.doc)
    return element_id.split(':')[1]
def obj_DPE(self):
    """Return the ``ENERGY_CLASS`` member named by the energy-rank CSS class.

    Reads the ``class`` attribute of the ``energy-rank`` div inside the
    ``energy-box`` div, takes the first token starting with
    ``energy-rank-``, strips that prefix and upper-cases the rest, then
    looks the result up as an attribute of ``ENERGY_CLASS``.

    Returns ``NotAvailable`` when the element is missing, when no such
    token exists, or when ``ENERGY_CLASS`` has no attribute of that name.
    """
    prefix = "energy-rank-"
    css_classes = Attr(
        '//div[has-class("energy-box")]//div[has-class("energy-rank")]',
        'class',
        default=""
    )(self)

    # ``next`` with a default instead of ``[...][0]``: an element carrying
    # only the bare "energy-rank" class must not raise IndexError.
    rank = next(
        (
            token.replace(prefix, "").upper()
            for token in css_classes.split()
            if token.startswith(prefix)
        ),
        ""
    )
    return getattr(ENERGY_CLASS, rank, NotAvailable)
def obj_details(self):
    """Return a details dict holding the ``GES`` rank letter.

    Reads the ``class`` attribute of the ``rank`` div inside the
    ``energy-box`` div, takes the first token starting with ``rank-``,
    strips that prefix and upper-cases the remainder.

    ``GES`` is ``NotAvailable`` when the element is missing, when its
    class attribute is empty, or when it has no ``rank-*`` token.
    """
    prefix = "rank-"
    css_classes = Attr(
        '//div[has-class("energy-box")]//div[has-class("rank")]',
        'class',
        default=None
    )(self)

    ges = NotAvailable
    if css_classes:
        # ``next`` with a default instead of ``[...][0]``: a div that only
        # has the bare "rank" class must not raise IndexError.
        ges = next(
            (
                token.replace(prefix, "").upper()
                for token in css_classes.split()
                if token.startswith(prefix)
            ),
            NotAvailable
        )

    return {
        "GES": ges
    }
def obj_level(self):
    """Map the CSS classes of the current line's div to a level constant.

    The div is looked up by ``self.env['line']`` inside the element with
    class ``lignes``. The first matching class in the priority order below
    decides the result; ``None`` is returned when none of them is present.
    """
    selector = u'//*[@class="lignes"]//div[@id="%s"]' % self.env[u'line']
    css_classes = Attr(selector, attr='class')(self).split()

    # Ordered from most to least severe, "_trav" variant first in each pair.
    priorities = (
        (u"perturb_critique_trav", CRITICAL_AND_WORK),
        (u"perturb_critique", CRITICAL),
        (u"perturb_alerte_trav", ALERT_AND_WORK),
        (u"perturb_alerte", ALERT),
        (u"perturb_normal_trav", NORMAL_AND_WORK),
        (u"perturb_normal", NORMAL),
    )
    for css_class, level in priorities:
        if css_class in css_classes:
            return level
    return None
def obj_level(self):
    """Return the level constant matching the line div's CSS classes.

    The ``class`` attribute of the div whose id is ``self.env['line']``
    (under the ``lignes`` element) is split into tokens. The first entry
    of the ordered table below whose class name is among those tokens
    gives the result; if none matches, ``None`` is returned.
    """
    tokens = Attr(
        u'//*[@class="lignes"]//div[@id="%s"]' % self.env[u'line'],
        attr='class'
    )(self).split()

    # Checked in this exact order: critical, alert, normal, each with its
    # "_trav" variant first.
    ordered_levels = [
        (u"perturb_critique_trav", CRITICAL_AND_WORK),
        (u"perturb_critique", CRITICAL),
        (u"perturb_alerte_trav", ALERT_AND_WORK),
        (u"perturb_alerte", ALERT),
        (u"perturb_normal_trav", NORMAL_AND_WORK),
        (u"perturb_normal", NORMAL),
    ]
    return next((level for name, level in ordered_levels if name in tokens), None)
def parse(self, el):
    """Collect unit value, valuation date and fund code into ``self.env``.

    Keys written: ``unitvalue``, ``vdate``, ``code`` and ``code_type``.

    The unit value and date come from the ``<span>`` elements of the
    ``label`` table cell. The code is looked up by submitting the
    operation form for the fund and following the resulting page or URL;
    when no code can be determined, ``code`` and ``code_type`` are set to
    ``NotAvailable``.

    :param el: current element (not read directly by this method).
    :raises AssertionError: when no fund URL is found and the response
        carries no ``//redirect/@url`` either.
    """
    # Trying to find vdate and unitvalue
    unitvalue, vdate = None, None
    for span in TableCell('label')(self)[0].xpath('.//span'):
        if unitvalue is None:
            unitvalue = Regexp(CleanText('.'), r'^([\d,]+)$', default=None)(span)
        if vdate is None:
            # Dates sitting in a maturity block are not valuation dates.
            parent_text = CleanText('./parent::div')(span)
            if any(x in parent_text for x in [u"échéance", "Maturity"]):
                vdate = None
            else:
                vdate = Regexp(CleanText('.'), r'^([\d\/]+)$', default=None)(span)
    self.env['unitvalue'] = MyDecimal().filter(unitvalue) if unitvalue else NotAvailable
    self.env['vdate'] = Date(dayfirst=True).filter(vdate) if vdate else NotAvailable

    page = None
    link_id = Attr(u'.//a[contains(@title, "détail du fonds")]', 'id', default=None)(self)
    inv_id = Attr('.//a[contains(@id, "linkpdf")]', 'id', default=None)(self)

    if link_id and inv_id:
        form = self.page.get_form('//div[@id="operation"]//form')
        form['idFonds'] = inv_id.split('-', 1)[-1]
        form['org.richfaces.ajax.component'] = form[link_id] = link_id
        page = self.page.browser.open(form['javax.faces.encodedURL'], data=dict(form)).page

        if "hsbc.fr" in self.page.browser.BASEURL:
            # special space for HSBC
            m = re.search(r'fundid=(\w+).+SH=(\w+)', CleanText('//complete', default="")(page.doc))
            if m:
                # had to put full url to skip redirections.
                page = page.browser.open(
                    'https://www.assetmanagement.hsbc.com/feedRequest?feed_data=gfcFundData&cod=FR&client=FCPE&fId=%s&SH=%s&lId=fr' % m.groups()
                ).page
        elif "consulteroperations" not in self.page.browser.url:
            # not on history
            url = Regexp(CleanText('//complete'), r"openUrlFichesFonds\('(.*?)'\)", default=NotAvailable)(page.doc)

            if url is NotAvailable:
                # redirection to a useless graphplot page with url like
                # /portal/salarie-sg/fichefonds?idFonds=XXX&source=/portal/salarie-sg/monepargne/mesavoirs
                assert CleanText('//redirect/@url')(page.doc)
                self.env['code'] = NotAvailable
                self.env['code_type'] = NotAvailable
                return

            if url.startswith('http://docfinder.is.bnpparibas-ip.com/'):
                # pdf... http://docfinder.is.bnpparibas-ip.com/api/files/040d05b3-1776-4991-aa49-f0cd8717dab8/1536
                self.env['code'] = NotAvailable
                self.env['code_type'] = NotAvailable
                return

            # Some URLs embed the code directly; no need to open them.
            match = re.match(r'http://www.cpr-am.fr/fr/fonds_detail.php\?isin=([A-Z0-9]+)', url)
            match = match or re.match(r'http://www.cpr-am.fr/particuliers/product/view/([A-Z0-9]+)', url)
            if match:
                self.env['code'] = match.group(1)
                self.env['code_type'] = Investment.CODE_TYPE_ISIN
                return

            page = self.page.browser.open(url).page

    try:
        self.env['code'] = page.get_code()
        self.env['code_type'] = page.CODE_TYPE
    # Handle page is None and page has not get_code method
    except AttributeError:
        self.env['code'] = NotAvailable
        self.env['code_type'] = NotAvailable
def get_history_jid(self):
    """Return the second ``:``-separated part of the history element id.

    The id is read from the first element (any tag) whose ``id`` attribute
    starts with ``index:j_id``.
    """
    element_id = Attr('//*[starts-with(@id, "index:j_id")]', 'id')(self.doc)
    return element_id.split(':')[1]
def parse(self, el):
    """Collect unit value, valuation date and fund code into ``self.env``.

    Keys written: ``unitvalue``, ``vdate``, ``code`` and ``code_type``.

    The unit value and date come from the ``<span>`` elements of the
    ``label`` table cell. The code is looked up by submitting the
    operation form for the fund and following the resulting page or URL;
    when no code can be determined, ``code`` and ``code_type`` are set to
    ``NotAvailable``.

    :param el: current element (not read directly by this method).
    :raises AssertionError: when no fund URL is found and the response has
        neither a ``//redirect/@url`` nor an ``afficherGraphique`` script.
    """
    # Trying to find vdate and unitvalue
    unitvalue, vdate = None, None
    for span in TableCell('label')(self)[0].xpath('.//span'):
        if unitvalue is None:
            unitvalue = Regexp(CleanText('.'), r'^([\d,]+)$', default=None)(span)
        if vdate is None:
            # Dates sitting in a maturity block are not valuation dates.
            parent_text = CleanText('./parent::div')(span)
            if any(x in parent_text for x in [u"échéance", "Maturity"]):
                vdate = None
            else:
                vdate = Regexp(CleanText('.'), r'^([\d\/]+)$', default=None)(span)
    self.env['unitvalue'] = MyDecimal().filter(unitvalue) if unitvalue else NotAvailable
    self.env['vdate'] = Date(dayfirst=True).filter(vdate) if vdate else NotAvailable

    page = None
    link_id = Attr(u'.//a[contains(@title, "détail du fonds")]', 'id', default=None)(self)
    inv_id = Attr('.//a[contains(@id, "linkpdf")]', 'id', default=None)(self)

    if link_id and inv_id:
        form = self.page.get_form('//div[@id="operation"]//form')
        form['idFonds'] = inv_id.split('-', 1)[-1]
        form['org.richfaces.ajax.component'] = form[link_id] = link_id
        page = self.page.browser.open(form['javax.faces.encodedURL'], data=dict(form)).page

        if "hsbc.fr" in self.page.browser.BASEURL:
            # special space for HSBC
            m = re.search(r'fundid=(\w+).+SH=(\w+)', CleanText('//complete', default="")(page.doc))
            if m:
                # had to put full url to skip redirections.
                page = page.browser.open(
                    'https://www.assetmanagement.hsbc.com/feedRequest?feed_data=gfcFundData&cod=FR&client=FCPE&fId=%s&SH=%s&lId=fr' % m.groups()
                ).page
        elif "consulteroperations" not in self.page.browser.url:
            # not on history
            # The flag alternation is grouped: an ungrouped ``true|false``
            # splits the whole pattern in two, and the ``false`` half has
            # no capture group, so the URL could never be read for
            # ``openUrlFichesFonds('...',false)``.
            url = Regexp(
                CleanText('//complete'),
                r"openUrlFichesFonds\('(.*?)',(?:true|false)",
                default=NotAvailable
            )(page.doc)

            if url is NotAvailable:
                # redirection to a useless graphplot page with url like
                # /portal/salarie-sg/fichefonds?idFonds=XXX&source=/portal/salarie-sg/monepargne/mesavoirs
                # or on bnp, look for plot display function in a script
                assert CleanText('//redirect/@url')(page.doc) or CleanText('//script[contains(text(), "afficherGraphique")]')(page.doc)
                self.env['code'] = NotAvailable
                self.env['code_type'] = NotAvailable
                return

            useless_urls = (
                # pdf... http://docfinder.is.bnpparibas-ip.com/api/files/040d05b3-1776-4991-aa49-f0cd8717dab8/1536
                'http://docfinder.is.bnpparibas-ip.com/',
                # Redirection to a useless page with url like "https://epargne-salariale.axa-im.fr/fr/"
                'https://epargne-salariale.axa-im.fr/fr/',
            )
            # ``startswith`` accepts a tuple of prefixes.
            if url.startswith(useless_urls):
                self.env['code'] = NotAvailable
                self.env['code_type'] = NotAvailable
                return

            # Some URLs embed the code directly; no need to open them.
            match = re.match(r'http://www.cpr-am.fr/fr/fonds_detail.php\?isin=([A-Z0-9]+)', url)
            match = match or re.match(r'http://www.cpr-am.fr/particuliers/product/view/([A-Z0-9]+)', url)
            if match:
                self.env['code'] = match.group(1)
                self.env['code_type'] = Investment.CODE_TYPE_ISIN
                return

            if url.startswith('http://fr.swisslife-am.com/fr/'):
                # These cookies are set before opening swisslife-am URLs.
                self.page.browser.session.cookies.set('location', 'fr')
                self.page.browser.session.cookies.set('prof', 'undefined')

            page = self.page.browser.open(url).page

    try:
        self.env['code'] = page.get_code()
        self.env['code_type'] = page.CODE_TYPE
    # Handle page is None and page has not get_code method
    except AttributeError:
        self.env['code'] = NotAvailable
        self.env['code_type'] = NotAvailable
def parse(self, el):
    """Collect valuation data, fund code and asset category into ``self.env``.

    Keys written: ``unitvalue``, ``vdate``, ``_link``, ``asset_category``,
    ``code`` and ``code_type``.

    The unit value and date come from the ``<span>`` elements of the
    ``label`` table cell. The code is taken, in order of preference, from
    the investment URL itself or from the page that URL leads to when that
    page is a ``CodePage``. ``_link`` keeps the URL for the hosts listed
    below; it stays ``None`` otherwise.

    :param el: current element (not read directly by this method).
    """
    # Trying to find vdate and unitvalue
    unitvalue, vdate = None, None
    for span in TableCell('label')(self)[0].xpath('.//span'):
        if unitvalue is None:
            unitvalue = Regexp(CleanText('.'), r'^([\d,]+)$', default=None)(span)
        if vdate is None:
            # Dates sitting in a maturity block are not valuation dates.
            parent_text = CleanText('./parent::div')(span)
            if any(x in parent_text for x in ["échéance", "Maturity"]):
                vdate = None
            else:
                vdate = Regexp(CleanText('.'), r'^([\d\/]+)$', default=None)(span)
    self.env['unitvalue'] = MyDecimal().filter(unitvalue) if unitvalue else NotAvailable
    self.env['vdate'] = Date(dayfirst=True).filter(vdate) if vdate else NotAvailable
    self.env['_link'] = None
    self.env['asset_category'] = NotAvailable

    page = None
    link_id = Attr(u'.//a[contains(@title, "détail du fonds")]', 'id', default=None)(self)
    inv_id = Attr('.//a[contains(@id, "linkpdf")]', 'id', default=None)(self)

    if link_id and inv_id:
        form = self.page.get_form('//div[@id="operation"]//form')
        form['idFonds'] = inv_id.split('-', 1)[-1]
        form['org.richfaces.ajax.component'] = form[link_id] = link_id
        page = self.page.browser.open(form['javax.faces.encodedURL'], data=dict(form)).page

        if 'hsbc.fr' in self.page.browser.BASEURL:
            # Special space for HSBC, does not contain any information related to performances.
            m = re.search(r'fundid=(\w+).+SH=(\w+)', CleanText('//complete', default='')(page.doc))
            if m:
                # had to put full url to skip redirections.
                page = page.browser.open(
                    'https://www.assetmanagement.hsbc.com/feedRequest?feed_data=gfcFundData&cod=FR&client=FCPE&fId=%s&SH=%s&lId=fr' % m.groups()
                ).page
        elif not self.page.browser.history.is_here():
            url = page.get_invest_url()

            if empty(url):
                self.env['code'] = NotAvailable
                self.env['code_type'] = NotAvailable
                return

            # URLs used in browser.py to access investments performance history:
            if url.startswith('https://optimisermon.epargne-retraite-entreprises'):
                # This URL can be used to access the BNP Wealth API to fetch investment performance and ISIN code
                self.env['_link'] = url
                self.env['code'] = NotAvailable
                self.env['code_type'] = NotAvailable
                return
            elif url.startswith((
                'http://sggestion-ede.com/product',
                'https://www.lyxorfunds.com/part',
                'https://www.societegeneralegestion.fr',
                'http://www.etoile-gestion.com/productsheet',
            )):
                self.env['_link'] = url

            # Try to fetch ISIN code from URL with re.match
            match = re.match(r'http://www.cpr-am.fr/fr/fonds_detail.php\?isin=([A-Z0-9]+)', url)
            match = match or re.match(r'http://www.cpr-am.fr/particuliers/product/view/([A-Z0-9]+)', url)
            if match:
                self.env['code'] = match.group(1)
                if is_isin_valid(match.group(1)):
                    self.env['code_type'] = Investment.CODE_TYPE_ISIN
                else:
                    self.env['code_type'] = Investment.CODE_TYPE_AMF
                return

            # Try to fetch ISIN code from URL with re.search
            m = re.search(r'&ISIN=([^&]+)', url)
            m = m or re.search(r'&isin=([^&]+)', url)
            m = m or re.search(r'&codeIsin=([^&]+)', url)
            m = m or re.search(r'lyxorfunds\.com/part/([^/]+)', url)
            if m:
                self.env['code'] = m.group(1)
                if is_isin_valid(m.group(1)):
                    self.env['code_type'] = Investment.CODE_TYPE_ISIN
                else:
                    self.env['code_type'] = Investment.CODE_TYPE_AMF
                return

            useless_urls = (
                # pdf... http://docfinder.is.bnpparibas-ip.com/api/files/040d05b3-1776-4991-aa49-f0cd8717dab8/1536
                'http://docfinder.is.bnpparibas-ip.com/',
                # The AXA website displays performance graphs but everything is calculated using JS scripts.
                # There is an API but it only contains risk data and performances per year, not 1-3-5 years.
                'https://epargne-salariale.axa-im.fr/fr/',
                # Redirection to the Rothschild Gestion website, which doesn't exist anymore...
                'https://www.rothschildgestion.com',
                # URL to the Morningstar website does not contain any useful information
                'http://doc.morningstar.com',
            )
            # ``startswith`` accepts a tuple of prefixes.
            if url.startswith(useless_urls):
                self.env['code'] = NotAvailable
                self.env['code_type'] = NotAvailable
                return

            if url.startswith('http://fr.swisslife-am.com/fr/'):
                # These cookies are set before opening swisslife-am URLs.
                self.page.browser.session.cookies.set('location', 'fr')
                self.page.browser.session.cookies.set('prof', 'undefined')

            try:
                page = self.page.browser.open(url).page
            except HTTPNotFound:
                # Some pages lead to a 404 so we must avoid unnecessary crash
                self.logger.warning('URL %s was not found, investment details will be skipped.', url)
                # Do not leave ``page`` bound to the earlier form response:
                # the details are skipped, so nothing must be read from it.
                page = None

    if isinstance(page, CodePage):
        self.env['code'] = page.get_code()
        self.env['code_type'] = page.CODE_TYPE
        self.env['asset_category'] = page.get_asset_category()
    else:
        # The page is not handled and does not have a get_code method.
        self.env['code'] = NotAvailable
        self.env['code_type'] = NotAvailable
        self.env['asset_category'] = NotAvailable