Ejemplo n.º 1
0
 def get_session_storage(self):
     """Extract the arguments of the ``sessionStorage.setItem(...)`` call found in the page scripts.

     Returns a ``(key, value)`` tuple: ``key`` is the first argument with its
     surrounding single quotes removed, ``value`` is the second argument
     decoded as JSON.
     """
     call_args = Regexp(
         CleanText('//script[@type="text/javascript"]'),
         r'sessionStorage.setItem\((.*)\)'
     )(self.doc)
     # Split on the first comma only, so commas inside the second argument are kept.
     # Whitespace is stripped *before* the quotes: otherwise a space following
     # the separating comma shields the opening quote of the value and the
     # JSON decoding below fails.
     key, value = (
         part.strip().strip("'").strip()
         for part in call_args.split(",", 1)
     )
     return key, json.loads(value)
Ejemplo n.º 2
0
    def obj_type(self):
        """Deduce the account type by matching regex patterns against page text.

        The account label is tried first.  If no pattern matches it, the text
        captured from the preceding ``avoirs`` block is tried.  Returns
        ``Account.TYPE_UNKNOWN`` when neither matches.
        """
        # Regex pattern -> account type; the optional letters cover plural forms.
        patterns = {
            'comptes? bancaires?': Account.TYPE_CHECKING,
            'livrets?': Account.TYPE_SAVINGS,
            'epargnes? logement': Account.TYPE_SAVINGS,
            "autres produits d'epargne": Account.TYPE_SAVINGS,
            'comptes? titres? et pea': Account.TYPE_MARKET,
            'compte-titres': Account.TYPE_MARKET,
            'assurances? vie et retraite': Account.TYPE_LIFE_INSURANCE,
            u'prêt': Account.TYPE_LOAN,
            u'crédits?': Account.TYPE_LOAN,
            'plan d\'epargne en actions': Account.TYPE_PEA,
        }

        def lookup(text):
            # Return the type of the first pattern found in the lowercased text, else None.
            lowered = text.lower()
            for pattern, account_type in patterns.items():
                if re.findall(pattern, lowered):
                    return account_type
            return None

        # First attempt: the account label.
        matched = lookup(Field('label')(self))
        if matched is not None:
            return matched

        # Second attempt: the text following the leading number in the "avoirs" block.
        section_text = Regexp(
            CleanText('../../preceding-sibling::div[@class="avoirs"][1]/span[1]'),
            r'(\d+) (.*)',
            '\\2',
        )(self)
        matched = lookup(section_text)
        if matched is not None:
            return matched

        return Account.TYPE_UNKNOWN
Ejemplo n.º 3
0
 def filter(self, el):
     """Build a ``timedelta`` from the duration text that follows the ``|`` in ``el[0]``.

     Two notations are handled: ``"NN'"`` (a number followed by an
     apostrophe, read as minutes) and ``"H:M"`` (colon-separated, read as
     hours and minutes).
     """
     duration = Regexp(CleanText('.'), r'.+\|(.+)')(el[0])
     if duration[-1:] == "'":
         t = [0, int(duration[:-1])]
     else:
         # Build a real list: on Python 3 map() returns a lazy iterator that
         # cannot be indexed, so the t[0]/t[1] access below would raise.
         t = [int(part) for part in duration.split(':')]
     return timedelta(hours=t[0], minutes=t[1])
Ejemplo n.º 4
0
 def get_cards(self):
     """Return the card operation links found in the ``onclick`` attribute of table rows.

     A link is kept when it starts with ``/outil/UWCB/UWCBEncours`` and
     contains ``listeOperations``.
     """
     cards = []
     # iter() replaces getiterator(), which is deprecated.
     for tr in self.doc.iter('tr'):
         # The link is the quoted part of the onclick handler; rows without one give None.
         link = Regexp(CleanText('./@onclick'), "'(.*)'", default=None)(tr)
         if link is None:
             continue
         if link.startswith('/outil/UWCB/UWCBEncours') and 'listeOperations' in link:
             cards.append(link)
     return cards
Ejemplo n.º 5
0
 def obj_id(self):
     """Return the identifier taken from the item's link, or a hash-based fallback.

     When the ``href`` does not yield a non-empty capture, the identifier is
     ``vid_`` followed by the MD5 hex digest of the element text (non-ASCII
     characters dropped).
     """
     from_link = Regexp(CleanText('./a/@href'), '//www.france.tv/(.*)/', default=None)(self)
     if from_link:
         return from_link

     text = CleanText('.')(self)
     digest = hashlib.md5(text.encode('ascii', 'ignore')).hexdigest()
     return u'vid_%s' % digest
Ejemplo n.º 6
0
            def obj_rdate(self):
                """Return the real date of the transaction.

                An already set ``rdate`` is kept.  Otherwise a date is looked
                for in the raw label (``DD/MM/YY`` or six digits, surrounded by
                spaces); when none is found the ``date`` field is returned.
                """
                if self.obj.rdate:
                    # Transaction.Raw may have already set it
                    return self.obj.rdate

                # Raw string: "\d" is an invalid escape sequence in a plain literal.
                s = Regexp(Field('raw'), r' (\d{2}/\d{2}/\d{2}) | (?!NUM) (\d{6}) ', default=NotAvailable)(self)
                if not s:
                    return Field('date')(self)
                s = s.replace('/', '')
                # Sometimes the user enters an invalid date 16/17/19 for example,
                # hence the NotAvailable default on the date parser.
                return Date(dayfirst=True, default=NotAvailable).filter('%s-%s-%s' % (s[:2], s[2:4], s[4:]))
Ejemplo n.º 7
0
    def on_load(self):
        """Follow the redirection served when the website flags us as a robot.

        Does nothing when ``head/meta`` has no ``name`` attribute.  Otherwise
        the browser is sent to the iframe source (``ROBOTS``) or to the URL
        found in the hex-encoded javascript (``robots``).
        """
        # The website may have identified us as a robot; if that happens the
        # login form won't be available.
        try:
            attr = Attr('head/meta', 'name')(self.doc)
        except AttributeNotFound:
            # The website has identified us as a human ;)
            return

        # Sometimes "robots" is uppercase and there is an iframe,
        # sometimes it is lowercase and there is an encoded javascript.
        if attr == 'ROBOTS':
            self.browser.location(Attr('//iframe', 'src')(self.doc))
        elif attr == 'robots':
            import binascii  # local import: only this branch needs it

            hexa_code = Regexp(CleanText('head/script[contains(text(), "function")]'), r'var b="(.*?)"')(self.doc)
            # str.decode("hex") only exists on Python 2; unhexlify() works on
            # both major versions.  The bytes are decoded to text so that the
            # str pattern below can be applied to them.
            code = binascii.unhexlify(hexa_code).decode('utf-8', 'replace')
            url = re.search(r'xhr.open\("GET","(.*?)"', code).group(1)
            self.browser.location(url)
Ejemplo n.º 8
0
        def parse(self, el):
            """Fill ``self.env`` with the phone numbers, name and agency read from the page.

            Sets ``phone`` and ``name`` in every case; ``mobile`` only when the
            advisor number starts with 06 or 07; ``agency`` only when an
            advisor name was found.
            """
            # We have two kinds of page and sometimes we don't have any advisor.
            agency_phone = (
                CleanText('//span/a[contains(@href, "rendezVous")]', replace=[(' ', '')], default=NotAvailable)(self)
                or CleanText('//div[has-class("lbp-numero")]/span', replace=[(' ', '')], default=NotAvailable)(self)
            )
            # Raw string: "\d" is an invalid escape sequence in a plain literal.
            advisor_phone = Regexp(
                CleanText('//div[h3[contains(text(), "conseil")]]//span[2]', replace=[(' ', '')], default=""),
                r'(\d+)',
                default="",
            )(self)
            if advisor_phone.startswith(("06", "07")):
                # The advisor number goes to "mobile" and the agency number to "phone".
                self.env['phone'] = agency_phone
                self.env['mobile'] = advisor_phone
            else:
                self.env['phone'] = advisor_phone or agency_phone

            agency = CleanText('//div[h3[contains(text(), "Bureau")]]/div[not(@class)][1]')(self) or NotAvailable
            name = (
                CleanText('//div[h3[contains(text(), "conseil")]]//span[1]', default=None)(self)
                or CleanText('//div[@class="lbp-font-accueil"]/div[2]/div[1]/span[1]', default=None)(self)
            )
            if name:
                self.env['name'] = name
                self.env['agency'] = agency
            else:
                # No advisor name: the agency text is used as the name.
                self.env['name'] = agency
Ejemplo n.º 9
0
    class get_video(ItemElement):
        """Fill a ``RmllVideo`` from the meta tags and scripts of a video page."""

        klass = RmllVideo

        # Identifier: the last path component of the og:url permalink.
        obj_id = (
            CleanHTML('/html/head/meta[@property="og:url"]/@content')
            & CleanText()
            & Regexp(pattern=r'.*/permalink/(.+)/$')
        )
        obj_title = Format(
            u'%s',
            CleanHTML('/html/head/meta[@name="DC.title"]/@content') & CleanText(),
        )
        obj_description = Format(
            u'%s',
            CleanHTML('/html/head/meta[@property="og:description"]/@content') & CleanText(),
        )

        def obj_thumbnail(self):
            """Return a ``Thumbnail`` built from the og:image URL, or None without one."""
            image_url = NormalizeThumbnail(
                CleanText('/html/head/meta[@property="og:image"]/@content')
            )(self)
            if not image_url:
                return None
            thumbnail = Thumbnail(image_url)
            thumbnail.url = thumbnail.id
            return thumbnail

        # Duration: read from the "media_duration" entry of the inline head script.
        obj_duration = (
            CleanText('/html/head/script[not(@src)]')
            & Regexp(pattern=r'media_duration: ([^,.]+),?.*,', default='')
            & Duration(default=NotAvailable)
        )

        def obj_url(self):
            """Return the first sharing link ending in ``mp4`` or ``webm``, else None."""
            hrefs = XPath(
                '//div[@id="tab_sharing_content"]/div/div/div[@class="paragraph"]/div[@class="share"]/a[@target="_blank"]/@href'
            )(self)
            for href in hrefs:
                extension = str(href).rsplit('.', 1)[-1]
                self.logger.debug("Link:%s Ext:%s", href, extension)
                if extension in ('mp4', 'webm'):
                    return unicode(href)
            return None
Ejemplo n.º 10
0
    def iter_pocket(self, label):
        """Yield a ``Pocket`` for each row of the "Liste des échéances" table matching ``label``.

        Rows with at most two cells are skipped.  Rows with fewer than six
        cells reuse the availability date and condition of the last row that
        had six or more.  A row is yielded only when the code between
        parentheses at the end of its label cell is contained in the first
        word of ``label``.
        """
        date_available, condition = 0, 0
        for tr in self.doc.xpath(
                u'//table[@summary="Liste des échéances"]/tbody/tr'):
            tds = tr.findall('td')

            if len(tds) <= 2:
                continue

            # Only build the object once the row is known to be usable.
            pocket = Pocket()
            i = 0

            if len(tds) < 6:
                pocket.availability_date = date_available
                pocket.condition = condition
            else:
                # These rows carry the date/condition in an extra leading cell,
                # shifting the other columns by one.
                i = 1
                # Raw string: "\d" and "\/" are invalid escapes in a plain literal.
                pocket.availability_date = Date(Regexp(CleanText(tds[0]),
                                                       r'([\d\/]+)',
                                                       default=NotAvailable),
                                                default=NotAvailable)(tr)
                date_available = pocket.availability_date

                if pocket.availability_date is not NotAvailable:
                    pocket.condition = Pocket.CONDITION_DATE
                else:
                    # No date in the cell: map its first word through CONDITIONS.
                    pocket.condition = self.CONDITIONS.get(
                        CleanText(tds[0])(tr).lower().split()[0],
                        Pocket.CONDITION_UNKNOWN)
                condition = pocket.condition

            pocket.label = CleanText(tds[i])(tr)
            pocket.quantity = CleanDecimal(tds[i + 3], replace_dots=True)(tr)
            pocket.amount = CleanDecimal(tds[i + 4], replace_dots=True)(tr)

            # A label whose first word contains "PEI" is compared as "PEE".
            if 'PEI' in label.split()[0]:
                label = 'PEE'
            if Regexp(CleanText(tds[i]),
                      r'\(([\w]+).*\)$')(tr) not in label.split()[0]:
                continue

            yield pocket
Ejemplo n.º 11
0
        class item(ItemElement):
            """Build a ``SensCritiquenCalendarEvent`` from one listing entry."""

            klass = SensCritiquenCalendarEvent

            def condition(self):
                """Accept every entry, or only the one whose id equals ``env['_id']`` when it is set."""
                wanted_id = self.env['_id'] if '_id' in self.env else None
                if not wanted_id:
                    return True

                entry_id = Format(
                    u'%s#%s#%s',
                    Regexp(Link('.'), '/film/(.*)'),
                    FormatDate(
                        "%Y%m%d%H%M",
                        Date('div[@class="elgr-guide-details"]/div[@class="elgr-data-diffusion"]')),
                    CleanText('./div/span[@class="d-offset"]', replace=[(' ', '-')]),
                )(self)
                return entry_id == wanted_id

            def validate(self, obj):
                """Keep events inside the ``date_from``/``date_to`` window, or any event when ``_id`` is in env."""
                env = self.env
                if 'date_from' in env and env['date_from'] and obj.start_date > env['date_from']:
                    upper_bound = env['date_to']
                    if not upper_bound:
                        return True
                    # Without an end date, the start date is compared to the upper bound.
                    if empty(obj.end_date):
                        inside = obj.start_date < upper_bound
                    else:
                        inside = obj.end_date <= upper_bound
                    if inside:
                        return True

                return '_id' in env

            obj_id = Format(
                u'%s#%s#%s',
                Regexp(Link('.'), '/film/(.*)'),
                FormatDate("%Y%m%d%H%M", Date('div/div[@class="elgr-data-diffusion"]')),
                CleanText('./div/span[@class="d-offset"]', replace=[(' ', '-')]),
            )
            obj_start_date = Date('div/div[@class="elgr-data-diffusion"]')
            obj_summary = Format(
                '%s - %s',
                Regexp(CleanText('./div/img/@alt'), '^Affiche(.*)'),
                CleanText('./div/span[@class="d-offset"]'),
            )
Ejemplo n.º 12
0
    class get_housing(ItemElement):
        """Fill a ``Housing`` object from an advert detail page."""

        klass = Housing

        obj_id = Env('_id')
        obj_title = CleanText('//h1[@class="desc clearfix"]/span[@class="title"]')
        obj_cost = CleanDecimal('//h1[@class="desc clearfix"]/span[@class="prix"]')
        # Currency symbol found in the price text; falls back to the euro sign.
        obj_currency = Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="prix"]'),
                              '.*([%s%s%s])' % (u'€', u'$', u'£'), default=u'€')
        # Raw string: "\d" is an invalid escape sequence in a plain literal.
        # "\xb2" (superscript two) is now resolved by the regex engine and
        # matches the same character.
        obj_area = CleanDecimal(Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="title"]'),
                                r'(.*?)(\d*) m\xb2(.*?)', '\\2'), default=NotAvailable)
        obj_location = CleanText('//div[@class="text-annonce"]/h2')
        obj_text = CleanText(CleanHTML('//div[@class="text-annonce-container"]/p'))
        obj_station = CleanText('//div[@class="metro"]')
        obj_phone = CleanText('(//span[@class="telephone hide-tel"])[1]')
        obj_url = BrowserURL('housing', _id=Env('_id'))

        def obj_details(self):
            """Return a dict of the footer label/value pairs plus the energy class entry."""
            details = dict()
            for item in XPath('//div[@class="footer-descriptif"]/ul/li')(self):
                key = CleanText('./span[@class="label"]')(item)
                # The value is the item text with the label removed from it.
                value = CleanText('.', replace=[(key, '')])(item)
                if value and key:
                    details[key] = value

            key = CleanText('//div[@class="classe-energie-content"]/div/div/span')(self)
            value = Format('%s(%s)', CleanText('//div[@class="classe-energie-content"]/div/div/p'),
                           CleanText('//div[@class="classe-energie-content"]/div/@class',
                                     replace=[('-', ' ')]))(self)
            if value and key:
                details[key] = value
            return details

        def obj_photos(self):
            """Return one ``HousingPhoto`` per showcase thumbnail source."""
            return [
                HousingPhoto(u'%s' % img)
                for img in XPath('//div[@class="showcase-thumbnail"]/img/@src')(self)
            ]
Ejemplo n.º 13
0
        class item(ItemElement):
            """Build a ``Housing`` (sale of land by a professional) from one listing entry."""

            klass = Housing

            obj_id = QueryValue(
                Attr('.//div[has-class("presentationItem")]/h2/a', 'href'),
                'idter')

            obj_url = AbsoluteLink('.//h2/a')

            obj_type = POSTS_TYPES.SALE

            obj_advert_type = ADVERT_TYPES.PROFESSIONAL

            obj_house_type = HOUSE_TYPES.LAND

            obj_title = CleanText('.//div[@class="presentationItem"]/h2/a')

            # Raw string: "\d" is an invalid escape sequence in a plain literal.
            obj_area = CleanDecimal(
                Regexp(CleanText('.//div[@class="presentationItem"]/h3'),
                       r'surface de (\d+) m²'))

            obj_cost = CleanDecimal(
                CleanText('.//div[@class="presentationItem"]/h3/span[1]',
                          replace=[(".", ""), (" €", "")]))

            obj_currency = Currency.get_currency(u'€')

            obj_date = Date(
                CleanText(
                    './/div[@class="presentationItem"]//span[@class="majItem"]',
                    replace=[("Mise à jour : ", "")]))

            obj_text = CleanText('.//div[@class="presentationItem"]/p')

            obj_phone = CleanText(
                './/div[@class="divBoutonContact"]/div[@class="phone-numbers-bloc"]/p[1]/strong'
            )

            def obj_photos(self):
                """Return a one-element list with the first non-empty photo, or an empty list."""
                for photo in self.xpath(
                        './/div[has-class("photoItemListe")]/img/@data-src'):
                    if photo:
                        return [HousingPhoto(BASE_URL + '/' + photo)]
                # The loop never breaks, so the original for/else was a plain fall-through.
                return []

            obj_utilities = UTILITIES.UNKNOWN
Ejemplo n.º 14
0
        class item(ItemElement):
            """Build a ``GaugeMeasure`` from one row of measurements."""

            klass = GaugeMeasure
            # Raw string: "\d" is an invalid escape sequence in a plain literal.
            verif = re.compile(r"\d\d.\d\d.\d+ \d\d:\d\d")

            # The "D.M.Y H:M" text is rewritten to "Y-M-D H:M" before parsing.
            obj_date = DateTime(
                Regexp(CleanText('.'), r'(\d+)\.(\d+)\.(\d+) (\d+):(\d+)',
                       r'\3-\2-\1 \4:\5'))
            sensor_types = [u'Level', u'Flow']

            def obj_level(self):
                """Return the float value of the cell for the current sensor, or NotAvailable."""
                # The cell index is the position of the sensor name in sensor_types, plus one.
                index = self.sensor_types.index(self.env['sensor'].name) + 1
                try:
                    return float(self.el[index].text_content())
                except ValueError:
                    # The cell text is not a number.
                    return NotAvailable
Ejemplo n.º 15
0
    class get_job_advert(ItemElement):
        """Fill a ``BaseJobAdvert`` from an advert detail page."""

        klass = BaseJobAdvert

        obj_description = Join('\n', '//div[@id="annonce-detail"]/p[@class="text"]', textCleaner=CleanHTML)
        obj_id = Env('_id')
        obj_url = BrowserURL('advert_page', _id=Env('_id'))
        # Raw string: "\d" is an invalid escape sequence in a plain literal.
        obj_publication_date = Date(Regexp(CleanText('//div[@id="annonce-detail"]/p[@class="infos"]'),
                                           r'(\d{2}/\d{2}/\d{4})', default=NotAvailable), default=NotAvailable)
        obj_title = CleanText('//div[@id="annonce"]/div/div/h1')
        obj_society_name = CleanText('//section[@class="entp-resume"]/h1/a')

        obj_contract_type = CleanText('//dl[@class="infos-annonce"]/dt[span[@class="picto picto-contrat-grey"]]/following-sibling::dd[1]')
        obj_place = CleanText('//dl[@class="infos-annonce"]/dt[span[@class="picto picto-geolocalisation-grey"]]/following-sibling::dd[1]')
        # The pay is the paragraph just before the "infos" one, without its "Salaire : " prefix.
        obj_pay = CleanText('//div[@id="annonce-detail"]/p[@class="infos"]/preceding-sibling::p[1]',
                            replace=[('Salaire : ', '')])
Ejemplo n.º 16
0
        class item(ItemElement):
            """Build a ``Message`` from one stream entry; entries without a status id are dropped."""

            klass = Message

            # Both the id and the sender are captured from the status link.
            obj_id = Regexp(
                Link('./div/div/small/a', default=''),
                '/.+/status/(.+)',
                default=None,
            )

            # Title: taken from the cleaned paragraph text by the pattern below.
            obj_title = Regexp(
                CleanText(
                    './div/p',
                    replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')],
                ),
                '(.{50}|.+).+',
            )
            obj_content = CleanText(
                './div/p',
                replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')],
            )
            obj_sender = Regexp(
                Link('./div/div/small/a', default=''),
                '/(.+)/status/.+',
                default=None,
            )
            obj_date = DatetimeFromTimestamp(
                Attr(
                    './div/div[@class="stream-item-header"]/small/a/span | ./div/div[@class="ProfileTweet-authorDetails"]/span/a/span',
                    'data-time',
                )
            )

            def validate(self, obj):
                """Reject objects whose id could not be extracted."""
                has_id = obj.id is not None
                return has_id
Ejemplo n.º 17
0
        class item(ItemElement):
            """Build an ``Investment`` from one table row; rows without a quantity are skipped."""

            klass = Investment

            def condition(self):
                """Only handle rows whose quantity cell could be parsed."""
                quantity = Field('quantity')(self)
                return quantity is not NotAvailable

            obj_label = CleanText('./th')
            obj_quantity = CleanDecimal(
                TableCell('quantity'),
                default=NotAvailable,
            )
            obj_unitvalue = CleanDecimal(TableCell('unitvalue'))
            obj_valuation = CleanDecimal(TableCell('valuation'))
            # The parsed cell value is divided by 100.
            obj_portfolio_share = Eval(
                lambda percent: percent / 100,
                CleanDecimal(TableCell('portfolio_share')),
            )
            # The code is captured from the link: an "isin=" parameter or a PDF file name.
            obj_code = Regexp(Link('./th/a'), r'isin=(\w+)|/(\w+)\.pdf')
            obj_code_type = Investment.CODE_TYPE_ISIN
Ejemplo n.º 18
0
        class item(ItemElement):
            """Build a ``Transaction`` whose amount is the sum of its investments' valuations."""

            klass = Transaction

            obj_date = Date(
                Regexp(CleanText('.//div[1]'), r'(\d{2}\/\d{2}\/\d{4})'),
                dayfirst=True,
            )
            # Label: preceding heading followed by the date text.
            obj_label = Format(
                '%s %s',
                CleanText('./preceding::h3[1]'),
                Regexp(CleanText('./div[1]'), r'(\d{2}\/\d{2}\/\d{4})'),
            )

            def obj_amount(self):
                """Add up the valuation of every investment of this transaction."""
                total = 0
                for investment in Field('investments')(self):
                    total += investment.valuation
                return total

            def obj_investments(self):
                """Return one ``Investment`` per row of the table in the following sibling div."""
                def build(row):
                    # Label from the first cell; valuation from the <p> of the
                    # second cell when it parses, else from the cell itself.
                    investment = Investment()
                    investment.label = CleanText('./td[1]')(row)
                    investment.valuation = Coalesce(
                        CleanDecimal.French('./td[2]/p', default=NotAvailable),
                        CleanDecimal.French('./td[2]'),
                    )(row)
                    return investment

                rows = self.xpath('./following-sibling::div[1]//tbody/tr')
                return [build(row) for row in rows]
Ejemplo n.º 19
0
        class item(ItemElement):
            """Build a ``Housing`` from one offer of the result list."""

            klass = Housing

            # Id: the part of env "type" before its last dash, then the offer id.
            obj_id = Format(
                '%s-%s', Regexp(Env('type'), '(.*)-.*'),
                CleanText('./@id', replace=[('header-offer-', '')]))
            obj_title = CleanText(
                './div/div/div[@class="offer-details-wrapper"]/div/div/p[@class="offer-type"]/span/@title'
            )
            obj_area = CleanDecimal(
                CleanText(
                    './div/div/div[@class="offer-details-wrapper"]/div/div/div/div/h3/a/span[@class="offer-area-number"]',
                    default=NotAvailable))
            # Cost: the price text before the currency symbol; Decimal(0) when unparsable.
            obj_cost = CleanDecimal(Regexp(CleanText(
                './div/div/div[@class="offer-details-wrapper"]/div/div/p[@class="offer-price"]/span',
                default=NotAvailable),
                                           '(.*) [%s%s%s]' %
                                           (u'€', u'$', u'£'),
                                           default=NotAvailable),
                                    default=Decimal(0))
            obj_currency = Regexp(CleanText(
                './div/div/div[@class="offer-details-wrapper"]/div/div/p[@class="offer-price"]/span',
                default=NotAvailable),
                                  '.* ([%s%s%s])' % (u'€', u'$', u'£'),
                                  default=u'€')
            # Raw string: "\d" is an invalid escape sequence in a plain literal.
            obj_date = Date(
                Regexp(
                    CleanText(
                        './div/div/div[has-class("offer-picture-more")]/div/p[@class="offer-update"]'
                    ), r".*(\d{2}/\d{2}/\d{4}).*"))
            obj_text = CleanText(
                './div/div/div[@class="offer-details-wrapper"]/div/div/div/p[has-class("offer-description")]/span'
            )
            obj_location = CleanText(
                './div/div/div[@class="offer-details-wrapper"]/div/div/div/div/h2'
            )
Ejemplo n.º 20
0
    def iter_investments(self, account):
        """Yield an ``Investment`` for each row returned by ``iter_invest_rows``.

        Label, URL and valuation come from the row.  ``diff`` and
        ``diff_ratio`` come from the row or from the ``elem_diff`` element,
        depending on which of the two holds the percentage.
        """
        for row, elem_repartition, elem_pocket, elem_diff in self.iter_invest_rows(
                account=account):
            inv = Investment()
            inv._account = account
            inv._el_pocket = elem_pocket
            inv.label = CleanText('.//td[1]')(row)
            _url = Link('.//td[1]/a', default=None)(row)
            if _url:
                inv._url = self.absurl(_url)
            else:
                # If _url is None, self.absurl returns the BASEURL, so we need to set the value manually.
                inv._url = None
            inv.valuation = MyDecimal('.//td[2]')(row)

            # On all Cmes children the row shows percentages and the popup shows absolute values in currency.
            # On Cmes it is mirrored, the popup contains the percentage.
            is_mirrored = '%' in row.text_content()

            if not is_mirrored:
                inv.diff = MyDecimal('.//td[3]')(row)
                if elem_diff is not None:
                    inv.diff_ratio = Eval(
                        lambda x: x / 100,
                        MyDecimal(
                            Regexp(CleanText('.'),
                                   r'([+-]?[\d\s]+[\d,]+)\s*%')))(elem_diff)
            elif elem_diff is not None:
                # The guard now also covers the read from elem_diff, which was
                # previously dereferenced before being checked against None.
                inv.diff = MyDecimal('.')(elem_diff)
                inv.diff_ratio = Eval(
                    lambda x: x / 100,
                    MyDecimal(
                        Regexp(CleanText('.//td[3]'),
                               r'([+-]?[\d\s]+[\d,]+)\s*%')))(row)
            yield inv
Ejemplo n.º 21
0
        class item(ItemElement):
            """Build an ``Advisor`` from the ``data`` entry of the parsed document."""

            klass = Advisor

            obj_name = Format('%s %s %s', Dict('data/civilite'),
                              Dict('data/prenom'), Dict('data/nom'))
            # Raw string: "\w" is an invalid escape sequence in a plain literal.
            # The lookahead makes the capture start at the first word character.
            obj_email = Regexp(Dict('data/mail'),
                               r'(?=\w)(.*)',
                               default=NotAvailable)
            # Spaces are removed from the phone, mobile and fax numbers.
            obj_phone = CleanText(Dict('data/telephone'), replace=[(' ', '')])
            obj_mobile = CleanText(Dict('data/mobile'), replace=[(' ', '')])
            obj_fax = CleanText(Dict('data/fax'), replace=[(' ', '')])
            obj_agency = Dict('data/agence')
            obj_address = Format('%s %s %s', Dict('data/adresseAgence'),
                                 Dict('data/codePostalAgence'),
                                 Dict('data/villeAgence'))
Ejemplo n.º 22
0
        class item(ItemElement):
            """Build a ``Housing`` from one offer; ids are prefixed with ``colocation-``."""

            klass = Housing

            obj_id = Format(
                'colocation-%s',
                CleanText('./div/header/@id', replace=[('header-offer-', '')]))
            obj_title = CleanText(
                CleanHTML(
                    './div/header/section/p[@class="property-type"]/span/@title'
                ))

            obj_area = CleanDecimal(
                './div/header/section/p[@class="offer-attributes"]/a/span[@class="offer-area-number"]',
                default=0)

            obj_cost = CleanDecimal('./div/header/section/p[@class="price"]',
                                    default=0)
            # Currency symbol found in the price text; falls back to the euro sign.
            obj_currency = Regexp(
                CleanText('./div/header/section/p[@class="price"]',
                          default=NotAvailable),
                '.* ([%s%s%s])' % (u'€', u'$', u'£'),
                default=u'€')

            obj_text = CleanText(
                './div/div[@class="content-offer"]/section[has-class("content-desc")]/p/span[has-class("offer-text")]/@title'
            )

            # Raw string: "\d" is an invalid escape sequence in a plain literal.
            obj_date = Date(
                Regexp(
                    CleanText(
                        './div/header/section/p[has-class("update-date")]'),
                    r".*(\d{2}/\d{2}/\d{4}).*"))

            obj_location = CleanText(
                '(./div/div[@class="content-offer"]/section[has-class("content-desc")]/p)[1]/span/@title'
            )
Ejemplo n.º 23
0
    def obj_type(self):
        """Deduce the account type by matching regex patterns against page text.

        The account label is tried first.  If no pattern matches it, the text
        captured from the preceding ``avoirs`` block is tried.  Returns
        ``Account.TYPE_UNKNOWN`` when neither matches.
        """
        # Regex pattern -> account type; the optional letters cover plural forms.
        patterns = {
            'comptes? bancaires?': Account.TYPE_CHECKING,
            "plan d'epargne populaire": Account.TYPE_SAVINGS,
            'livrets?': Account.TYPE_SAVINGS,
            'epargnes? logement': Account.TYPE_SAVINGS,
            "autres produits d'epargne": Account.TYPE_SAVINGS,
            'compte relais': Account.TYPE_SAVINGS,
            'comptes? titres? et pea': Account.TYPE_MARKET,
            'compte-titres': Account.TYPE_MARKET,
            'assurances? vie': Account.TYPE_LIFE_INSURANCE,
            'prêt': Account.TYPE_LOAN,
            'crédits?': Account.TYPE_LOAN,
            'plan d\'epargne en actions': Account.TYPE_PEA,
            'comptes? attente': Account.TYPE_CHECKING,
            'perp': Account.TYPE_PERP,
            'assurances? retraite': Account.TYPE_PERP,
        }

        def lookup(text):
            # Return the type of the first pattern found in the lowercased text, else None.
            lowered = text.lower()
            for pattern, account_type in patterns.items():
                if re.findall(pattern, lowered):
                    return account_type
            return None

        # First attempt: the account label.
        matched = lookup(Field('label')(self))
        if matched is not None:
            return matched

        # Second attempt: the text following the leading number in the "avoirs" block.
        section_text = Regexp(
            CleanText('../../preceding-sibling::div[@class="avoirs"][1]/span[1]'),
            r'(\d+) (.*)',
            '\\2',
        )(self)
        matched = lookup(section_text)
        if matched is not None:
            return matched

        return Account.TYPE_UNKNOWN
Ejemplo n.º 24
0
        class item(ItemElement):
            """Build an ``Account`` from one synthesis row; rows holding a chart cell are skipped."""

            klass = Account

            # First word of the label -> account type.
            TYPE = {
                'Livret': Account.TYPE_SAVINGS,
                'Compte': Account.TYPE_CHECKING,
                'PEA': Account.TYPE_PEA,
                'PEA-PME': Account.TYPE_PEA,
                'Compte-titres': Account.TYPE_MARKET,
                'Assurance-vie': Account.TYPE_LIFE_INSURANCE,
                'Crédit': Account.TYPE_LOAN,
            }

            # Id: first run of digits in the title/text blocks.
            obj_id = (
                CleanText('./td//div[contains(@class, "-synthese-title") or contains(@class, "-synthese-text")]')
                & Regexp(pattern=r'(\d+)')
            )
            obj_label = CleanText('./td//div[contains(@class, "-synthese-title")]')
            obj_balance = MyDecimal(
                './td//div[contains(@class, "-synthese-num")]',
                replace_dots=True,
            )
            obj_currency = FrenchTransaction.Currency(
                './td//div[contains(@class, "-synthese-num")]'
            )
            obj_type = Map(
                Regexp(Field('label'), r'^([^ ]*)'),
                TYPE,
                default=Account.TYPE_UNKNOWN,
            )

            def obj_url(self):
                """Resolve the row's ``data-href`` against the page URL."""
                relative = CleanText('./@data-href')(self)
                return urljoin(self.page.url, relative)

            obj__card_balance = CleanDecimal(
                './td//div[@class="synthese-encours"][last()]/div[2]',
                default=None,
            )

            def condition(self):
                """Only handle rows that have no ``chart`` cell."""
                chart_cells = self.el.xpath('./td[@class="chart"]')
                return not chart_cells
Ejemplo n.º 25
0
    def __init__(self, page):
        """Download the keypad image of ``page`` and compute the bounding box of each key.

        The image URL is looked up in the page styles, then in its scripts,
        and finally on the ``imageCVS`` image, in which case a smaller grid
        size is used.
        """
        document = page.doc

        img_url = Regexp(CleanText('//style'), r'background:url\((.*?)\)', default=None)(document)
        if not img_url:
            img_url = Regexp(CleanText('//script'), r'IMG_ALL = "(.*?)"', default=None)(document)

        if img_url:
            size = 252
        else:
            img_url = document.xpath('//img[@id="imageCVS"]')[0].attrib['src']
            size = 146

        # Each key is a square cell, a quarter of the image wide.
        cell = size // 4
        left, top = 0, 0
        coords = {}
        for index, _button in enumerate(document.xpath('//div[@id="imageclavier"]//button')):
            coords['%02d' % index] = (left + 4, top + 4, left + cell - 8, top + cell - 8)
            # Move one cell to the right, wrapping to the next row at the image edge.
            if (left + cell + 1) >= size:
                top += cell + 1
                left = 0
            else:
                left += cell + 1

        image_bytes = page.browser.open(img_url).content
        VirtKeyboard.__init__(self, BytesIO(image_bytes), coords, self.color)

        self.check_symbols(self.symbols, page.browser.responses_dirname)
Ejemplo n.º 26
0
    class get_profile(ItemElement):
        """Fill a ``Person`` from the personal details form (``idCoordonneePersonnelle``)."""

        klass = Person

        # Each value is the own text (children excluded) of the cell that
        # holds the matching <strong> caption.
        obj_email = CleanText(
            '//form[@id="idCoordonneePersonnelle"]//table//strong[contains(text(), "e-mail")]/parent::td',
            children=False)

        obj_phone = CleanText(
            '//form[@id="idCoordonneePersonnelle"]//table//strong[contains(text(), "mobile")]/parent::td',
            children=False)

        # Only the part before the first "/" is kept.
        # Raw string: "\/" is an invalid escape sequence in a plain literal.
        obj_address = Regexp(
            CleanText(
                '//form[@id="idCoordonneePersonnelle"]//table//strong[contains(text(), "adresse fiscale")]/parent::td',
                children=False), r'^(.*?)\/')
Ejemplo n.º 27
0
    class get_video(ItemElement):
        """Fill a ``BaseVideo`` from the ``window.initials`` JSON embedded in the page."""

        klass = BaseVideo

        obj_nsfw = True
        obj_ext = 'mp4'
        obj_title = Attr('//meta[@property="og:title"]', 'content')
        obj_id = Env('id')

        # JSON object assigned to window.initials, parsed once and reused below.
        obj__props = Eval(
            json.loads,
            Regexp(
                RawText('//script[contains(text(),"window.initials =")]'),
                r'window.initials = (.*);\n',
            ),
        )

        obj_duration = Base(Field('_props'), Dict('videoModel/duration'))
        obj_url = Base(Field('_props'), Dict('videoModel/mp4File'))

        def obj__page(self):
            """Return the URL of the page being parsed."""
            page_url = self.page.url
            return page_url
Ejemplo n.º 28
0
    class get_job_advert(ItemElement):
        """Fill a ``BaseJobAdvert`` from an advert detail page."""

        klass = BaseJobAdvert

        obj_id = Env('id')
        obj_url = BrowserURL('advert_page', id=Env('id'))
        # The page title is used for both the advert title and the job name.
        obj_title = CleanText('//title')
        obj_job_name = CleanText('//title')
        obj_society_name = CleanText('//div[2]/div[@class="col-md-9"]/h4[1]')
        obj_publication_date = Date(CleanText(
            '//div[2]/div[@class="col-md-9"]/small',
            replace=[(u'Ajoutée le', '')]),
                                    parse_func=parse_french_date)
        # Place: the text preceding the trailing parenthesised part.
        # Raw string: "\(" and "\)" are invalid escapes in a plain literal.
        obj_place = Regexp(CleanText('//div[2]/div[@class="col-md-9"]/h4[2]'),
                           r'(.*) \(.*\)')
        obj_description = CleanHTML('//div[4]/div[@class="col-md-9"]')
Ejemplo n.º 29
0
        class item(ItemElement):
            """Build a ``Bill`` (PDF) from one entry of the parsed document."""

            klass = Bill

            def obj_url(self):
                """Resolve the PDF path found in ``sOperation`` against the page URL."""
                pdf_path = Regexp(Dict('sOperation'), r'&quot;(/.*\.pdf)')(self)
                return urljoin(self.page.url, pdf_path)

            # Bill number captured from the URL; shared by the id and the label.
            _num = Regexp(Field('url'), r'facture_(\d+).pdf')

            obj_id = Format('%s_%s', Env('subid'), _num)
            obj_date = Eval(datetime.fromtimestamp, Dict('sTimestamp'))
            obj_label = Format('Facture %s', _num)
            obj_price = CleanDecimal(Dict('fMontant'))
            obj_currency = Currency(Dict('sMontant'))
            obj_type = 'bill'
            obj_format = 'pdf'
Ejemplo n.º 30
0
        class item(ItemElement):
            """Build a ``Transaction`` from one row; label and date come from the page's "smltitle" paragraphs."""

            klass = Transaction

            obj_amount = MyDecimal('./th[@scope="rowgroup"][2]')
            obj_label = CleanText('(//p[contains(@id, "smltitle")])[2]')
            obj_raw = Transaction.Raw(Field('label'))
            obj_date = Date(
                Regexp(
                    CleanText('(//p[contains(@id, "smltitle")])[1]'),
                    r'(\d{1,2}/\d{1,2}/\d+)',
                ),
                dayfirst=True,
            )

            def obj__account_label(self):
                """Return the row's account label, mapped through ``ACCOUNTS_SPE_LABELS`` when listed there."""
                raw_label = CleanText('./th[@scope="rowgroup"][1]')(self)
                special_labels = self.page.ACCOUNTS_SPE_LABELS
                return special_labels.get(raw_label, raw_label)
Ejemplo n.º 31
0
    class get_transfer(ItemElement):
        """Fill a ``Transfer`` from a transfer summary page."""

        klass = Transfer

        # Amount, execution date and label are the three <strong> of the summary paragraph.
        obj_amount = CleanDecimal(
            '//p[@class="tabTxt tabTxt2"]/strong[1]',
            replace_dots=True,
        )
        obj_exec_date = Date(
            CleanText('//p[@class="tabTxt tabTxt2"]/strong[2]'),
            dayfirst=True,
        )
        obj_label = Regexp(
            CleanText('//p[@class="tabTxt tabTxt2"]/strong[3]'),
            u'« (.*) »',
        )
        obj_account_id = Regexp(
            CleanText('//div[@class="transAction"]/div[@class="inner"]/div[@class="first"]//small'),
            r'N°(\w+)',
        )
        obj_recipient_id = Regexp(
            CleanText('//div[@class="transAction"]/div[@class="inner"]/div[not(@class="first")]//small'),
            r'N°(\w+)',
            default=None,
        )

        def obj_recipient_iban(self):
            """Return the recipient text without spaces when no recipient id was found, else None."""
            if Field('recipient_id')(self) is not None:
                return None
            iban_text = CleanText(
                '//div[@class="transAction"]/div[@class="inner"]/div[not(@class="first")]//span[@class="tabTxt"]'
            )(self)
            return iban_text.replace(' ', '')
Ejemplo n.º 32
0
        class item(ItemElement):
            """Describe how one result row is turned into a ``BaseJobAdvert``."""

            klass = BaseJobAdvert

            # The id is rebuilt from the job link as "<d-or-empty>#<path>".
            # Raw strings keep the regex escapes intact without relying on
            # Python passing unknown escape sequences such as "\?" through.
            obj_id = Regexp(
                Link('./td/div/div[@class="jobTitleContainer"]/a'),
                r'http://offre-(d?)emploi.monster.fr:80/(.*?)(.aspx|\?).*',
                r'\1#\2')
            obj_society_name = CleanText(
                './td/div/div[@class="companyContainer"]/div/a')
            obj_title = CleanText('./td/div/div[@class="jobTitleContainer"]/a')
            obj_publication_date = MonsterDate(
                CleanText('td/div/div[@class="fnt20"]'))
            # The place is optional: NotAvailable when the link has no title.
            obj_place = CleanText(
                './td/div/div[@class="jobLocationSingleLine"]/a/@title',
                default=NotAvailable)
Ejemplo n.º 33
0
    class get_video(ItemElement):
        """Describe how the video page is turned into a ``YoupornVideo``."""

        klass = YoupornVideo

        obj_author = CleanText('//div[has-class("submitByLink")]')
        # Date extraction is currently disabled:
        #obj_date = Date('//div[@id="stats-date"]')
        obj_duration = NotAvailable
        obj_ext = 'mp4'
        obj_id = Env('id')
        # Rating is the integer in front of the "%" sign, out of obj_rating_max.
        obj_rating = (CleanText('//div[@class="videoRatingPercentage"]')
                      & Regexp(pattern=r'(\d+)%')
                      & Type(type=int))
        obj_rating_max = 100
        obj_thumbnail = NotAvailable
        obj_title = CleanText('//h1')

        def obj_url(self):
            """Return the value of the ``videoUrl`` entry found in the page text."""
            # The captured group keeps its surrounding double quotes; loads()
            # (presumably json.loads -- confirm against the file's imports)
            # then decodes it.
            quoted_url = re.search('videoUrl":(".*?")', self.page.text).group(1)
            return loads(quoted_url)
Ejemplo n.º 34
0
 def populate(self, accounts):
     """Complete ``accounts`` from the navigation menu and append card accounts.

     For every account, each link of every "nav-category" entry is examined:

     * a link whose label contains 'CARTE' and whose balance is truthy is
       turned into a new card ``Account`` (collected once, appended to
       ``accounts`` at the end);
     * a link whose label and balance equal those of the account is used to
       set the account's type (when unset), ``_history_page`` and ``_webid``.

     :param accounts: list of accounts, modified in place
     """
     cards = []
     for account in accounts:
         for category in self.doc.xpath('//li[@class="nav-category"]'):
             title = CleanText().filter(category.xpath('./h3'))
             for a in category.xpath('./ul/li//a'):
                 name_spans = a.xpath('.//span[@class="nav-category__name"]')
                 label = CleanText().filter(name_spans)
                 balance_el = a.xpath('.//span[@class="nav-category__value"]')
                 balance = CleanDecimal(
                     replace_dots=True, default=NotAvailable).filter(balance_el)

                 if 'CARTE' in label and balance:
                     # Card entry: build a dedicated account for it.
                     card = Account()
                     card.balance = balance
                     card.label = label
                     card.currency = FrenchTransaction.Currency().filter(
                         balance_el)
                     card._link = Link().filter(a.xpath('.'))
                     card._history_page = card._link
                     # The id is whatever follows the last "=" of the link.
                     webid = Regexp(pattern='([^=]+)$').filter(
                         Link().filter(a.xpath('.')))
                     card.id = webid
                     card._webid = webid
                     card.type = Account.TYPE_CARD
                     if card not in cards:
                         cards.append(card)
                     continue

                 if not (account.label == label and account.balance == balance):
                     continue

                 # This menu entry is the one of the current account.
                 if not account.type:
                     account.type = AccountsPage.ACCOUNT_TYPES.get(
                         title, Account.TYPE_UNKNOWN)

                 if account.type == Account.TYPE_LOAN:
                     history_page = None
                 elif account.type in (Account.TYPE_LIFE_INSURANCE,
                                       Account.TYPE_MARKET):
                     # Use the link itself, without its trailing slash.
                     history_page = re.sub('/$', '', Link().filter(a.xpath('.')))
                 elif '/compte/cav' in a.attrib['href'] or 'titulaire' not in self.url:
                     history_page = self.browser.other_transactions
                 else:
                     history_page = self.browser.budget_transactions
                 account._history_page = history_page

                 account._webid = Attr(
                     None, 'data-account-label').filter(name_spans)
     accounts.extend(cards)
Ejemplo n.º 35
0
    def get_list(self):
        """Return the accounts listed in the "ecli" table of the page.

        Rows with class "entete" set the type applied to the accounts that
        follow them. Rows whose first cell has no link are skipped.

        :return: list of ``Account`` objects, each with a ``_link`` attribute
                 (None when no link could be extracted from ``onclick``)
        """
        current_type = Account.TYPE_UNKNOWN
        found = []
        rows = self.doc.xpath(
            '//div[@class="finance"]/form/table[@class="ecli"]/tr')

        for row in rows:
            if row.attrib.get('class', '') == 'entete':
                heading = row.find('th').text.strip()
                current_type = self.ACCOUNT_TYPES.get(
                    heading, Account.TYPE_UNKNOWN)
                continue

            cells = row.findall('td')
            anchor = cells[0].find('a')

            # Skip accounts that can't be accessed
            if anchor is None:
                continue

            raw_balance = cells[-1].text.strip()

            account = Account()
            # Join every text fragment of the first cell, then collapse runs of
            # spaces, non-breaking spaces, bullets and line breaks.
            fragments = [txt.strip() for txt in cells[0].itertext()]
            joined = u' '.join(fragments)
            account.label = re.sub(u'[ \xa0\u2022\r\n\t]+', u' ', joined).strip()
            account.id = Regexp(pattern=u'N° ((.*?) |(.*))').filter(
                account.label).strip()
            account.type = current_type
            if raw_balance:
                account.balance = Decimal(
                    FrenchTransaction.clean_amount(raw_balance))
                account.currency = account.get_currency(raw_balance)

            if 'onclick' not in anchor.attrib:
                account._link = anchor.attrib['href'].strip()
            else:
                # The link is the second argument of the submitForm() call.
                found_call = re.search(
                    r"javascript:submitForm\(([\w_]+),'([^']+)'\);",
                    anchor.attrib['onclick'])
                if found_call:
                    account._link = found_call.group(2)
                else:
                    self.logger.warning('Unable to find link for %r' %
                                        account.label)
                    account._link = None

            found.append(account)

        return found
Ejemplo n.º 36
0
    def get_not_rounded_valuations(self):
        """Return a dict mapping each investment label to its valuation.

        When the page shows a page count ("x/<count>"), every page from 1 to
        that count is opened through the browser; otherwise only the current
        page is read. Valuations come from the 7th cell of each detail row.

        :return: dict of ``{label: Decimal valuation}``
        """
        def prepare_url(url, fields):
            """Return ``url`` with the query parameters of ``fields`` set.

            Existing parameters named in ``fields`` are dropped and the new
            values are appended after the remaining ones.
            """
            components = urlparse(url)
            # Membership test and items() work on both Python 2 and 3
            # (iterkeys()/iteritems() exist on Python 2 only).
            query_pairs = [(f, v) for (f, v) in parse_qsl(components.query)
                           if f not in fields]
            query_pairs.extend(fields.items())

            new_query_str = urlencode(query_pairs)

            new_components = (components.scheme, components.netloc,
                              components.path, components.params,
                              new_query_str, components.fragment)

            return urlunparse(new_components)

        not_rounded_valuations = {}
        pages = []

        try:
            page_count = CleanDecimal(
                Regexp(
                    CleanText(
                        u'(//table[form[contains(@name, "detailCompteTitresForm")]]//tr[1])[1]/td[3]/text()'
                    ), r'\/(.*)'))(self.doc)
            # range() needs a real integer: a Decimal is rejected on Python 3.
            for i in range(1, int(page_count) + 1):
                pages.append(
                    self.browser.open(
                        prepare_url(self.browser.url, {
                            'action': '11',
                            'idCptSelect': '1',
                            'numPage': i
                        })).page)
        except RegexpError:  # no multiple page
            pages.append(self)

        for page in pages:
            for inv in page.doc.xpath(
                    u'//table[contains(., "Détail du compte")]//tr[2]//table/tr[position() > 1]'
            ):
                # Only rows with more than two cells describe an investment.
                if len(inv.xpath('.//td')) > 2:
                    valuation = CleanDecimal('.//td[7]/text()',
                                             replace_dots=True)(inv)
                    label = CleanText('.//td[1]/a/text()')(inv)
                    not_rounded_valuations[label] = valuation

        return not_rounded_valuations
Ejemplo n.º 37
0
 def parse(self, el):
     """Load the first product of the page's ``ava_data`` script variable.

     The object assigned to ``var ava_data`` is extracted from the page
     scripts, patched so that ``json.loads`` accepts it, and the first entry
     of its 'products' list is stored in ``self.house_json_datas``.

     :param el: not used by this method
     """
     raw = Regexp(CleanText('//script'),
                  "var ava_data = ({.+?});")(self)
     # Wrap these names in double quotes, in this order.
     for bare_name in ("logged", "lengthcarrousel", "products"):
         raw = raw.replace(bare_name, '"%s"' % bare_name)
     # Drop the comment embedded in the script.
     raw = raw.replace("// // ANNONCES_SIMILAIRE / RECO", "")
     self.house_json_datas = json.loads(raw)['products'][0]
Ejemplo n.º 38
0
        class item(ItemElement):
            """Describe how one link is turned into a ``SongLyrics``."""

            klass = SongLyrics

            obj_title = CleanText('.', default=NotAvailable)
            # The artist is whatever follows "Paroles " in the current
            # breadcrumb entry.
            obj_artist = Regexp(
                CleanText('//div[has-class("breadcrumb")]//span[has-class("breadcrumb-current")]'),
                'Paroles (.*)')
            obj_content = NotLoaded

            def obj_id(self):
                """Return '<a>|<b>' built from the last two segments of the href.

                ``<a>`` is the second-to-last segment; ``<b>`` is the last one
                with 'paroles-' removed.
                """
                segments = CleanText('./@href')(self).split('/')
                first_part = segments[-2]
                second_part = segments[-1].replace('paroles-', '')
                return '%s|%s' % (first_part, second_part)
Ejemplo n.º 39
0
    def parse(self, el):
        """Fill ``self.env`` with 'unitvalue', 'vdate', 'code' and 'code_type'.

        The unit value and valuation date are searched in the spans of the
        'label' cell. The code is then looked up by submitting the operation
        form and, depending on the site, following the fund URL found in the
        response. Whenever no code can be determined, 'code' and 'code_type'
        are set to NotAvailable.

        :param el: not used by this method
        """
        # Trying to find vdate and unitvalue
        unitvalue, vdate = None, None
        for span in TableCell('label')(self)[0].xpath('.//span'):
            if unitvalue is None:
                unitvalue = Regexp(CleanText('.'), r'^([\d,]+)$', default=None)(span)
            if vdate is None:
                # A date shown next to a maturity mention is not a valuation date.
                parent_text = CleanText('./parent::div')(span)
                if not any(x in parent_text for x in [u"échéance", "Maturity"]):
                    vdate = Regexp(CleanText('.'), r'^([\d\/]+)$', default=None)(span)
        self.env['unitvalue'] = MyDecimal().filter(unitvalue) if unitvalue else NotAvailable
        self.env['vdate'] = Date(dayfirst=True).filter(vdate) if vdate else NotAvailable

        page = None
        link_id = Attr(u'.//a[contains(@title, "détail du fonds")]', 'id', default=None)(self)
        inv_id = Attr('.//a[contains(@id, "linkpdf")]', 'id', default=None)(self)

        if link_id and inv_id:
            form = self.page.get_form('//div[@id="operation"]//form')
            form['idFonds'] = inv_id.split('-', 1)[-1]
            form['org.richfaces.ajax.component'] = form[link_id] = link_id
            page = self.page.browser.open(form['javax.faces.encodedURL'], data=dict(form)).page

            if "hsbc.fr" in self.page.browser.BASEURL: # special space for HSBC
                m = re.search(r'fundid=(\w+).+SH=(\w+)', CleanText('//complete', default="")(page.doc))

                if m: # had to put full url to skip redirections.
                    page = page.browser.open('https://www.assetmanagement.hsbc.com/feedRequest?feed_data=gfcFundData&cod=FR&client=FCPE&fId=%s&SH=%s&lId=fr' % m.groups()).page
            elif "consulteroperations" not in self.page.browser.url: # not on history
                # The boolean argument is grouped so that the URL is captured
                # for both "true" and "false"; without the group the
                # alternation splits the whole pattern and the "false" case
                # never captures the URL.
                url = Regexp(CleanText('//complete'), r"openUrlFichesFonds\('(.*?)',(?:true|false)\).*", default=NotAvailable)(page.doc)

                if url is NotAvailable:
                    # redirection to a useless graphplot page with url like /portal/salarie-sg/fichefonds?idFonds=XXX&source=/portal/salarie-sg/monepargne/mesavoirs
                    # or on bnp, look for plot display function in a script
                    assert CleanText('//redirect/@url')(page.doc) or CleanText('//script[contains(text(), "afficherGraphique")]')(page.doc)
                    self.env['code'] = NotAvailable
                    self.env['code_type'] = NotAvailable
                    return

                useless_urls = (
                    # pdf... http://docfinder.is.bnpparibas-ip.com/api/files/040d05b3-1776-4991-aa49-f0cd8717dab8/1536
                    'http://docfinder.is.bnpparibas-ip.com/',
                    # Redirection to a useless page with url like "https://epargne-salariale.axa-im.fr/fr/"
                    'https://epargne-salariale.axa-im.fr/fr/',
                )

                # str.startswith accepts a tuple of prefixes.
                if url.startswith(useless_urls):
                    self.env['code'] = NotAvailable
                    self.env['code_type'] = NotAvailable
                    return

                # These URLs carry the ISIN code themselves: no need to open them.
                match = re.match(r'http://www.cpr-am.fr/fr/fonds_detail.php\?isin=([A-Z0-9]+)', url)
                match = match or re.match(r'http://www.cpr-am.fr/particuliers/product/view/([A-Z0-9]+)', url)
                if match:
                    self.env['code'] = match.group(1)
                    self.env['code_type'] = Investment.CODE_TYPE_ISIN
                    return

                if url.startswith('http://fr.swisslife-am.com/fr/'):
                    self.page.browser.session.cookies.set('location', 'fr')
                    self.page.browser.session.cookies.set('prof', 'undefined')

                page = self.page.browser.open(url).page

        try:
            self.env['code'] = page.get_code()
            self.env['code_type'] = page.CODE_TYPE
        # Handle page is None and page has not get_code method
        except AttributeError:
            self.env['code'] = NotAvailable
            self.env['code_type'] = NotAvailable
Ejemplo n.º 40
0
    def iter_payment_details(self, sub):
        """Yield one ``Detail`` per non-empty row of each beneficiary's table.

        Nothing is yielded when the page heading does not match
        "... le <date> pour un montant de ...". Detail ids are built as
        ``<sub._id>.<YYYYMMDD>.<line>``, where ``line`` restarts at 1 for
        each beneficiary.

        :param sub: object whose ``_id`` attribute prefixes every detail id
        """
        def amount_chars(text):
            # Keep only digits, commas and minus signs.
            return re.sub(r'[^\d,-]+', '', text)

        heading = self.doc.xpath('//div[@class="entete container"]/h2')[0].text.strip()
        m = re.match('.*le (.*) pour un montant de.*', heading)
        if not m:
            return

        blocs_benes = self.doc.xpath('//span[contains(@id,"nomBeneficiaire")]')
        blocs_prestas = self.doc.xpath('//table[@id="tableauPrestation"]')
        for i, bloc_bene in enumerate(blocs_benes):
            bene = bloc_bene.text
            id_date = datetime.strptime(m.group(1), '%d/%m/%Y').date()
            base_id = sub._id + "." + id_date.strftime("%Y%m%d")
            line = 1
            last_date = None
            # The i-th table holds the rows of the i-th beneficiary.
            for tr in blocs_prestas[i].xpath('.//tr'):
                tds = tr.xpath('.//td')
                if not tds:
                    continue

                det = Detail()

                # TO TEST: daily allowances (indemnités journalières): no
                # similar case could be tested on the new version of the site.
                if len(tds) == 4:
                    date_str = Regexp(pattern=r'.*<br/>(\d+/\d+/\d+)\).*').filter(tds[0].text)
                    det.id = base_id + "." + str(line)
                    det.label = tds[0].xpath('.//span')[0].text.strip()

                    jours = tds[1].text
                    if jours is None:
                        jours = '0'

                    montant = tds[2].text
                    if montant is None:
                        montant = '0'

                    price = tds[3].text
                    if price is None:
                        price = '0'

                    if date_str is None or date_str == '':
                        det.infos = u''
                        det.datetime = last_date
                    else:
                        det.infos = '%s (%sj) * %s€' % (date_str, amount_chars(jours), amount_chars(montant))
                        # NOTE(review): date_str looks like a bare date here, so
                        # taking the 4th space-separated token seems suspicious
                        # -- confirm against a real page before changing it.
                        det.datetime = datetime.strptime(date_str.split(' ')[3], '%d/%m/%Y').date()
                        last_date = det.datetime
                    det.price = Decimal(amount_chars(price).replace(',', '.'))

                if len(tds) == 5:
                    date_str = Regexp(pattern=r'\w*(\d{2})/(\d{2})/(\d{4}).*', template='\\1/\\2/\\3', default="").filter("".join(tds[0].itertext()))
                    det.id = base_id + "." + str(line)
                    det.label = '%s - %s' % (bene, tds[0].xpath('.//span')[0].text.strip())

                    paye = tds[1].text
                    if paye is None:
                        paye = '0'

                    base = tds[2].text
                    if base is None:
                        base = '0'

                    tdtaux = tds[3].xpath('.//span')[0].text
                    if tdtaux is None:
                        taux = '0'
                    else:
                        taux = tdtaux.strip()

                    tdprice = tds[4].xpath('.//span')[0].text
                    if tdprice is None:
                        price = '0'
                    else:
                        price = tdprice.strip()

                    if date_str is None or date_str == '':
                        # Rows without a date inherit the date of the previous row.
                        det.infos = u''
                        det.datetime = last_date
                    else:
                        det.infos = u' Payé %s€ / Base %s€ / Taux %s%%' % (amount_chars(paye), amount_chars(base), amount_chars(taux))
                        det.datetime = datetime.strptime(date_str, '%d/%m/%Y').date()
                        last_date = det.datetime
                    det.price = Decimal(amount_chars(price).replace(',', '.'))
                line += 1
                yield det
Ejemplo n.º 41
0
 def obj_label(self):
     """Return the card's top label without its last " -..." part, if any.

     Trailing dashes and trailing whitespace are removed from the result.
     """
     extract = Regexp(CleanText('.//div[@class="c-card-ghost__top-label"]'),
                      pattern=r'^(.*?)(?: -[^-]*)?$')
     return extract(self).rstrip('-').rstrip()
Ejemplo n.º 42
0
 def obj_split_path(self):
     """Return ``[<CREATIVE site id>, <path>]`` for this link.

     ``<path>`` is the part of the href after the two-character prefix
     segment, with '/' replaced by '^'; it is u'accueil' when the href does
     not match.
     """
     # Raw string: "\w" must reach the regex engine as written.
     path = Regexp(CleanText('./@href'), r'/\w{2}/(.*)', default=u'accueil')(self)
     return [SITE.CREATIVE.get('id'), path.replace('/', '^')]
Ejemplo n.º 43
0
 def get_params(self):
     """Return ``[algolia_app_id, algolia_api_key]`` read from the page scripts.

     Both values are captured together, joined with '|', and split again on
     '|'.
     """
     pattern = '"algolia_app_id":"(.*)","algolia_api_key":"(.*)","algolia_api_index_taxonomy".*'
     joined = Regexp(CleanText('//script'), pattern, '\\1|\\2')(self.doc)
     return joined.split('|')
Ejemplo n.º 44
0
 def obj_url(self):
     """Return the "Download torrent" link without its query string, as https."""
     # Raw string: "\?" must reach the regex engine as a literal question mark.
     link = AbsoluteLink('//div[has-class("torrentinfo")]//div[has-class("dltorrent")]//a[text()="Download torrent"]')
     url = Regexp(link, r'(^.*)\?.*', '\\1')(self)
     return url.replace('http://', 'https://')
Ejemplo n.º 45
0
 def obj_url(self):
     """Return the first "tt-name" link without its query string, as https."""
     # Raw string: "\?" must reach the regex engine as a literal question mark.
     link = AbsoluteLink('.//div[has-class("tt-name")]/a[1]')
     url = Regexp(link, r'(^.*)\?.*', '\\1')(self)
     return url.replace('http://', 'https://')
Ejemplo n.º 46
0
 def obj_rdate(self):
     """Return the date embedded in the raw label, or the 'date' field.

     The raw label is searched for " dd/mm/yy " or a space-delimited run of
     six digits. When nothing is found, the value of the 'date' field is
     returned instead.
     """
     # Raw string: "\d" must reach the regex engine as written.
     # NOTE(review): "(?!NUM)" is immediately followed by a space, so it looks
     # like it can never reject anything -- a lookbehind may have been
     # intended; confirm before changing.
     s = Regexp(Field('raw'), r' (\d{2}/\d{2}/\d{2}) | (?!NUM) (\d{6}) ', default=NotAvailable)(self)
     if not s:
         return Field('date')(self)
     digits = s.replace('/', '')
     # Rebuild "dd-mm-yy" for the day-first date parser.
     return Date(dayfirst=True).filter('%s-%s-%s' % (digits[:2], digits[2:4], digits[4:]))
Ejemplo n.º 47
0
 def obj_split_path(self):
     """Return ``[<CINEMA site id>]`` followed by the href's path segments.

     The segments are those found after the two-character prefix segment of
     the link's href.
     """
     # Raw string: "\w" must reach the regex engine as written.
     path = Regexp(CleanText('./a/@href'), r'/\w{2}/(.*)')(self)
     return [SITE.CINEMA.get('id')] + path.split('/')