Python BrokenPageError Beispiele, weboob.deprecated.browser.BrokenPageError Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: pages.py Projekt: linura/weboob

    def set_details(self, v):
        v.author = u'European Parliament'
        obj = self.parser.select(self.document.getroot(),
                                 'meta[name=available]', 1)
        if obj is not None:
            value = obj.attrib['content']
            m = re.match('(\d\d)-(\d\d)-(\d\d\d\d)\s*(\d\d):(\d\d)', value)
            if not m:
                raise BrokenPageError('Unable to parse datetime: %r' % value)
            day = m.group(1)
            month = m.group(2)
            year = m.group(3)
            hour = m.group(4)
            minute = m.group(5)
            v.date = datetime.datetime(year=int(year),
                                       month=int(month),
                                       day=int(day),
                                       hour=int(hour),
                                       minute=int(minute))

        obj = self.parser.select(self.document.getroot(), 'span.ep_subtitle',
                                 1)
        if obj is not None:
            span = self.parser.select(obj, 'span.ep_date', 1)
            value = span.text
            m = re.match(
                '(\d\d):(\d\d)\s*\/\s*(\d\d):(\d\d)\s*-\s*(\d\d)-(\d\d)-(\d\d\d\d)',
                value)
            if not m:
                raise BrokenPageError('Unable to parse datetime: %r' % value)
            bhour = m.group(1)
            bminute = m.group(2)
            ehour = m.group(3)
            eminute = m.group(4)
            day = m.group(5)
            month = m.group(6)
            year = m.group(7)

            start = datetime.datetime(year=int(year),
                                      month=int(month),
                                      day=int(day),
                                      hour=int(bhour),
                                      minute=int(bminute))
            end = datetime.datetime(year=int(year),
                                    month=int(month),
                                    day=int(day),
                                    hour=int(ehour),
                                    minute=int(eminute))

            v.duration = end - start

Beispiel #2

0

Datei anzeigen

    def get_history(self):
        tables = self.document.xpath('//table[@id="table-detail-operation"]')
        if len(tables) == 0:
            tables = self.document.xpath('//table[@id="table-detail"]')
        if len(tables) == 0:
            tables = self.document.getroot().cssselect('table.table-detail')
        if len(tables) == 0:
            try:
                self.parser.select(self.document.getroot(), 'td.no-result', 1)
            except BrokenPageError:
                raise BrokenPageError('Unable to find table?')
            else:
                return

        for tr in tables[0].xpath('.//tr'):
            tds = tr.findall('td')
            if len(tds) < 4:
                continue

            t = Transaction(0)
            date = u''.join(
                [txt.strip() for txt in tds[self.COL_DATE].itertext()])
            raw = u''.join(
                [txt.strip() for txt in tds[self.COL_TEXT].itertext()])
            debit = u''.join(
                [txt.strip() for txt in tds[self.COL_DEBIT].itertext()])
            credit = u''.join(
                [txt.strip() for txt in tds[self.COL_CREDIT].itertext()])

            t.parse(date, re.sub(r'[ ]+', ' ', raw))
            t.set_amount(credit, debit)

            yield t

Beispiel #3

0

Datei anzeigen

    def get_history(self):
        txt = self.get_from_js('ListeMvts_data = new Array(', ');')

        if txt is None:
            no_trans = self.get_from_js('js_noMvts = new Ext.Panel(', ')')
            if no_trans is not None:
                # there is no transactions for this account, this is normal.
                return
            else:
                raise BrokenPageError(
                    'Unable to find transactions list in scripts')

        data = json.loads('[%s]' % txt.replace('"', '\\"').replace("'", '"'))

        for line in data:
            t = Transaction(line[self.COL_ID])

            if self.is_coming is not None:
                t.type = t.TYPE_CARD
                date = self.parser.strip(line[self.COL_DEBIT_DATE])
            else:
                date = self.parser.strip(line[self.COL_DATE])
            raw = self.parser.strip(line[self.COL_LABEL])

            t.parse(date, raw)
            t.set_amount(line[self.COL_VALUE])

            if t.date is NotAvailable:
                continue

            if self.set_coming(t):
                continue

            yield t

Beispiel #4

0

Datei anzeigen

    def set_details(self, v):
        for li in self.parser.select(self.document.getroot(), 'ul.spaced li'):
            span = li.find('label')
            name = span.text.strip()
            value = span.tail.strip()

            if name == 'Duration:':
                m = re.match('((\d+)hrs)?\s*((\d+)min)?\s*((\d+)sec)?', value)
                if not m:
                    raise BrokenPageError('Unable to parse datetime: %r' % value)
                hours = m.group(2) or 0
                minutes = m.group(4) or 0
                seconds = m.group(6) or 0
                v.duration = datetime.timedelta(hours=int(hours),
                                                minutes=int(minutes),
                                                seconds=int(seconds))
            elif name == 'Submitted:':
                author = li.find('i')
                if author is None:
                    author = li.find('a')
                if author is None:
                    v.author = unicode(value)
                else:
                    v.author = unicode(author.text)
            elif name == 'Rating:':
                value = li.find('span').text
                v.rating = int(value.rstrip('%'))
                v.rating_max = 100
            elif name == 'Date:':
                v.date = parse_dt(value)

Beispiel #5

0

Datei anzeigen

Datei: browser.py Projekt: yang2lalang/weboob

 def iter_station_departures(self, station_id, arrival_id=None):
     url = u'http://widget.canaltp.fr/Prochains_departs_15122009/dev/index.php?gare=%s' % unicode(
         station_id)
     result = self.openurl(url.encode('utf-8')).read()
     result = result
     departure = ''
     for line in result.split('&'):
         if '=' not in line:
             raise BrokenPageError('Unable to parse result: %s' % line)
         key, value = line.split('=', 1)
         if key == 'nomgare':
             departure = value
         elif key.startswith('ligne'):
             _type, unknown, _time, arrival, served, late, late_reason = value.split(
                 ';', 6)
             yield {
                 'type':
                 to_unicode(_type),
                 'time':
                 datetime.combine(date.today(),
                                  time(*[int(x)
                                         for x in _time.split(':')])),
                 'departure':
                 to_unicode(departure),
                 'arrival':
                 to_unicode(arrival).strip(),
                 'late':
                 late and time(0, int(late.split()[0])) or time(),
                 'late_reason':
                 to_unicode(late_reason).replace('\n', '').strip()
             }

Beispiel #6

0

Datei anzeigen

    def get_list(self):
        accounts = []

        txt = self.get_from_js('_data = new Array(', ');', is_list=True)

        if txt is None:
            raise BrokenPageError('Unable to find accounts list in scripts')

        data = json.loads('[%s]' % txt.replace("'", '"'))

        for line in data:
            a = Account()
            a.id = line[self.COL_ID].replace(' ', '')
            a._acc_nb = a.id.split('_')[0] if len(
                a.id.split('_')) > 1 else None
            fp = StringIO(
                unicode(line[self.COL_LABEL]).encode(self.browser.ENCODING))
            a.label = self.parser.tocleanstring(
                self.parser.parse(fp, self.browser.ENCODING).xpath(
                    '//div[@class="libelleCompteTDB"]')[0])
            # This account can be multiple life insurance accounts
            if a.label == 'ASSURANCE VIE-BON CAPI-SCPI-DIVERS *':
                continue

            a.balance = Decimal(
                FrenchTransaction.clean_amount(line[self.COL_BALANCE]))
            a.currency = a.get_currency(line[self.COL_BALANCE])
            a.type = self.get_account_type(a.label)
            if line[self.COL_HISTORY] == 'true':
                a._inv = False
                a._link = self.get_history_link()
                a._args = {
                    '_eventId': 'clicDetailCompte',
                    '_ipc_eventValue': '',
                    '_ipc_fireEvent': '',
                    'deviseAffichee': 'DEVISE',
                    'execution': self.get_execution(),
                    'idCompteClique': line[self.COL_ID],
                }
            else:
                a._inv = True
                a._args = {
                    '_ipc_eventValue': line[self.COL_ID],
                    '_ipc_fireEvent': line[self.COL_FIRE_EVENT],
                }
                a._link = self.document.xpath(
                    '//form[@name="changePageForm"]')[0].attrib['action']

            if a.id.find('_CarteVisa') >= 0:
                accounts[-1]._card_ids.append(a._args)
                if not accounts[-1].coming:
                    accounts[-1].coming = Decimal('0.0')
                accounts[-1].coming += a.balance
                continue

            a._card_ids = []
            accounts.append(a)

        return accounts

Beispiel #7

0

Datei anzeigen

 def get_messages_link(self):
     """
     Get the link to the messages page, which seems to have an identifier in it.
     """
     for link in self.parser.select(self.document.getroot(),
                                    'div#pantalon div.interieur a'):
         if 'MessagesRecus' in link.attrib.get('href', ''):
             return link.attrib['href']
     raise BrokenPageError('Unable to find the link to the messages page')

Beispiel #8

0

Datei anzeigen

 def login3(self, passwd):
     self.browser.select_form(name='Main')
     self.browser['codconf'] = passwd.encode('utf-8')
     a = self.document.xpath('//a[@title="Valider"]')[0]
     m = re.match("javascript:RedirectToDeiPart\('([^']+)'\);", a.attrib['href'])
     if not m:
         raise BrokenPageError('Unable to find validate URL')
     self.browser.form.action = m.group(1)
     self.browser.submit(nologin=True)

Beispiel #9

0

Datei anzeigen

 def iter_videos(self):
     if self.document is None or self.document['data'] is None:
         raise BrokenPageError('Unable to find JSON data')
     for data in self.document['data']:
         video = GDCVaultVideo.get_video_from_json(data)
         # TODO: split type 4 videos into id and id#slides
         if video is None:
             continue
         yield video

Beispiel #10

0

Datei anzeigen

Datei: pages.py Projekt: skeptycal/weboob-devel

    def set_video_url(self, video):

        embed_page = self.browser.readurl(
            'http://www.dailymotion.com/embed/video/%s' % video.id)

        m = re.search('playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
                      embed_page)
        if not m:
            raise BrokenPageError('Unable to find information about video')

        info = json.loads(m.group(1))
        qualities = info.get('metadata').get('qualities')
        for key in ['2160', '1440', '1080', '720', '480', '380', '240']:
            if qualities.get(key):
                max_quality = key
                break
        else:
            raise BrokenPageError(u'Unable to extract video URL')

        video.url = unicode(qualities.get(max_quality)[0].get('url'))

Beispiel #11

0

Datei anzeigen

    def get_url(self):
        download_div = self.parser.select(self.document.getroot(), 'ul.downloadList li')
        if len(download_div) < 1:
            raise BrokenPageError('Unable to find file URL')

        a = self.parser.select(download_div[0], 'a', 1)
        m = re.match('^(\w+) - .*', a.text)
        if m:
            ext = m.group(1).lower()
        else:
            ext = u'flv'
        return unicode(a.attrib['href']), unicode(ext)

Beispiel #12

0

Datei anzeigen

Datei: pages.py Projekt: sourcery-ai-bot/weboob

    def set_video_url(self, video):

        embed_page = self.browser.readurl(
            'http://www.dailymotion.com/embed/video/%s' % video.id)

        m = re.search('var info = ({.*?}),[^{"]', embed_page)
        if not m:
            raise BrokenPageError('Unable to find information about video')

        info = json.loads(m.group(1))
        for key in [
                'stream_h264_hd1080_url', 'stream_h264_hd_url',
                'stream_h264_hq_url', 'stream_h264_url', 'stream_h264_ld_url'
        ]:
            if info.get(key):
                max_quality = key
                break
        else:
            raise BrokenPageError(u'Unable to extract video URL')

        video.url = unicode(info[max_quality])

Beispiel #13

0

Datei anzeigen

Datei: browser.py Projekt: dermorz/weboob

    def iter_videos(self):
        # When no results are found, the website returns random results
        sb = self.parser.select(self.document.getroot(),
                                'div.search form input.searchbox', 1)
        if sb.value == 'No Results Found':
            return

        #Extracting meta data from results page
        vidbackdrop_list = self.parser.select(self.document.getroot(),
                                              'div.vidBackdrop    ')
        for vidbackdrop in vidbackdrop_list:
            url = self.parser.select(vidbackdrop, 'a', 1).attrib['href']
            _id = url[2:]

            video = CappedVideo(_id)
            video.set_empty_fields(NotAvailable, ('url', ))

            video.title = to_unicode(
                self.parser.select(vidbackdrop, 'div.vidTitle a', 1).text)
            video.author = to_unicode(
                self.parser.select(vidbackdrop, 'div.vidAuthor a', 1).text)

            thumbnail_url = 'http://cdn.capped.tv/pre/%s.png' % _id
            video.thumbnail = Thumbnail(thumbnail_url)
            video.thumbnail.url = to_unicode(video.thumbnail.id)

            #we get the description field
            duration_tmp = self.parser.select(vidbackdrop, 'div.vidInfo', 1)
            #we remove tabs and spaces
            duration_tmp2 = duration_tmp.text[7:]
            #we remove all fields exept time
            duration_tmp3 = duration_tmp2.split(' ')[0]
            #we transform it in datetime format
            parts = duration_tmp3.split(':')
            if len(parts) == 1:
                hours = minutes = 0
                seconds = parts[0]
            elif len(parts) == 2:
                hours = 0
                minutes, seconds = parts
            elif len(parts) == 3:
                hours, minutes, seconds = parts
            else:
                raise BrokenPageError('Unable to parse duration %r' %
                                      duration_tmp)

            video.duration = datetime.timedelta(hours=int(hours),
                                                minutes=int(minutes),
                                                seconds=int(seconds))

            yield video

Beispiel #14

0

Datei anzeigen

Datei: accounts_list.py Projekt: sourcery-ai-bot/weboob

    def get_list(self):
        for tr in self.document.getiterator('tr'):
            if 'LGNTableRow' not in tr.attrib.get('class', '').split():
                continue

            account = Account()
            for td in tr.getiterator('td'):
                if td.attrib.get('headers', '') == 'TypeCompte':
                    a = td.find('a')
                    if a is None:
                        break
                    account.label = self.parser.tocleanstring(a)
                    for pattern, actype in self.TYPES.iteritems():
                        if account.label.startswith(pattern):
                            account.type = actype
                    account._link_id = a.get('href', '')

                elif td.attrib.get('headers', '') == 'NumeroCompte':
                    account.id = self.parser.tocleanstring(td).replace(
                        u'\xa0', '')

                elif td.attrib.get('headers', '') == 'Libelle':
                    text = self.parser.tocleanstring(td)
                    if text != '':
                        account.label = text

                elif td.attrib.get('headers', '') == 'Solde':
                    div = td.xpath('./div[@class="Solde"]')
                    if len(div) > 0:
                        balance = self.parser.tocleanstring(div[0])
                        if len(balance) > 0 and balance not in ('ANNULEE',
                                                                'OPPOSITION'):
                            try:
                                account.balance = Decimal(
                                    FrenchTransaction.clean_amount(balance))
                            except InvalidOperation:
                                raise BrokenPageError(
                                    'Unable to parse balance %r' % balance)
                            account.currency = account.get_currency(balance)
                        else:
                            account.balance = NotAvailable

            if not account.label or empty(account.balance):
                continue

            if 'CARTE_' in account._link_id:
                account.type = account.TYPE_CARD
                account.coming = account.balance
                account.balance = Decimal('0')

            yield account

Beispiel #15

0

Datei anzeigen

Datei: pages.py Projekt: dasimon/weboob

    def get_history(self, date_guesser, state=None):
        seen = set()
        lines = self.document.xpath('(//table[@class="ca-table"])[2]/tr')
        debit_date = None
        for i, line in enumerate(lines):
            is_balance = line.xpath('./td/@class="cel-texte cel-neg"')

            # It is possible to have three or four columns.
            cols = [self.parser.tocleanstring(td) for td in line.xpath('./td')]
            date = cols[0]
            label = cols[1]
            amount = cols[-1]

            t = Transaction()
            t.set_amount(amount)
            t.label = t.raw = label

            if is_balance:
                m = re.search('(\d+ [^ ]+ \d+)', label)
                if not m:
                    raise BrokenPageError(
                        'Unable to read card balance in history: %r' % label)
                if state is None:
                    debit_date = parse_french_date(m.group(1))
                else:
                    debit_date = state

                # Skip the first line because it is balance
                if i == 0:
                    continue

                t.date = t.rdate = debit_date

                # Consider the second one as a positive amount to reset balance to 0.
                t.amount = -t.amount
                state = t.date
            else:
                day, month = map(int, date.split('/', 1))
                t.rdate = date_guesser.guess_date(day, month)
                t.date = debit_date

            t.type = t.TYPE_CARD
            try:
                t.id = t.unique_id(seen)
            except UnicodeEncodeError:
                self.logger.debug(t)
                self.logger.debug(t.label)
                raise

            yield state, t

Beispiel #16

0

Datei anzeigen

Datei: browser.py Projekt: sourcery-ai-bot/weboob

    def go_on_accounts_list(self):
        for taskInfoOID in self.ACCOUNT_URLS:
            self.location(self.buildurl('/cyber/internet/StartTask.do', taskInfoOID=taskInfoOID, token=self.token))
            if not self.page.is_error():
                self.ACCOUNT_URLS = [taskInfoOID]
                break
        else:
            raise BrokenPageError('Unable to go on the accounts list page')

        if self.page.is_short_list():
            self.select_form(nr=0)
            self.set_all_readonly(False)
            self['dialogActionPerformed'] = 'EQUIPEMENT_COMPLET'
            self['token'] = self.page.build_token(self['token'])
            self.submit()

Beispiel #17

0

Datei anzeigen

Datei: pages.py Projekt: skeptycal/weboob-devel

    def iter_videos(self):
        for div in self.parser.select(self.document.getroot(),
                                      'div.sd_video_listitem'):
            smalldiv = self.parser.select(div, 'div.sd_video_preview', 1)
            _id = smalldiv.attrib.get('data-id', None)

            if _id is None:
                self.browser.logger.warning('Unable to find the ID of a video')
                continue

            video = DailymotionVideo(_id)
            video.title = unicode(
                self.parser.select(div, 'div a img',
                                   1).attrib['title']).strip()
            video.author = unicode(
                self.parser.select(div, 'a.link-on-hvr', 1).text).strip()
            video.description = NotAvailable
            try:
                parts = self.parser.select(div, 'div.badge-duration',
                                           1).text.split(':')
            except BrokenPageError:
                # it's probably a live, np.
                video.duration = NotAvailable
            else:
                if len(parts) == 1:
                    seconds = parts[0]
                    hours = minutes = 0
                elif len(parts) == 2:
                    minutes, seconds = parts
                    hours = 0
                elif len(parts) == 3:
                    hours, minutes, seconds = parts
                else:
                    raise BrokenPageError(
                        'Unable to parse duration %r' %
                        self.parser.select(div, 'div.duration', 1).text)
                video.duration = datetime.timedelta(hours=int(hours),
                                                    minutes=int(minutes),
                                                    seconds=int(seconds))
            url = unicode(
                self.parser.select(div, 'img.preview', 1).attrib['data-src'])
            # remove the useless anti-caching
            url = re.sub('\?\d+', '', url)
            video.thumbnail = BaseImage(url)
            video.thumbnail.url = video.thumbnail.id

            video.set_empty_fields(NotAvailable, ('url', ))
            yield video

Beispiel #18

0

Datei anzeigen

Datei: pages.py Projekt: skeptycal/weboob-devel

    def get_list(self):
        accounts = []

        txt = self.get_from_js('_data = new Array(', ');', is_list=True)

        if txt is None:
            raise BrokenPageError('Unable to find accounts list in scripts')

        data = json.loads('[%s]' % txt.replace("'", '"'))

        for line in data:
            a = Account()
            a.id = line[self.COL_ID].replace(' ', '')
            fp = StringIO(
                unicode(line[self.COL_LABEL]).encode(self.browser.ENCODING))
            a.label = self.parser.tocleanstring(
                self.parser.parse(fp, self.browser.ENCODING).xpath(
                    '//div[@class="libelleCompteTDB"]')[0])
            a.balance = Decimal(
                FrenchTransaction.clean_amount(line[self.COL_BALANCE]))
            a.currency = a.get_currency(line[self.COL_BALANCE])
            a.type = self.get_account_type(a.label)
            a._link = self.get_history_link()
            if line[self.COL_HISTORY] == 'true':
                a._args = {
                    '_eventId': 'clicDetailCompte',
                    '_ipc_eventValue': '',
                    '_ipc_fireEvent': '',
                    'deviseAffichee': 'DEVISE',
                    'execution': self.get_execution(),
                    'idCompteClique': line[self.COL_ID],
                }
            else:
                a._args = None

            if a.id.find('_CarteVisa') >= 0:
                accounts[-1]._card_ids.append(a._args)
                if not accounts[-1].coming:
                    accounts[-1].coming = Decimal('0.0')
                accounts[-1].coming += a.balance
                continue

            a._card_ids = []
            accounts.append(a)

        return iter(accounts)

Beispiel #19

0

Datei anzeigen

Datei: pages.py Projekt: skeptycal/weboob-devel

    def set_video_metadata(self, video):

        head = self.parser.select(self.document.getroot(), 'head', 1)

        video.title = unicode(
            self.parser.select(head, 'meta[property="og:title"]',
                               1).get("content")).strip()
        video.author = unicode(
            self.parser.select(head, 'meta[name="author"]',
                               1).get("content")).strip()

        url = unicode(
            self.parser.select(head, 'meta[property="og:image"]',
                               1).get("content")).strip()
        # remove the useless anti-caching
        url = re.sub('\?\d+', '', url)
        video.thumbnail = BaseImage(url)
        video.thumbnail.url = video.thumbnail.id

        try:
            parts = self.parser.select(head, 'meta[property="video:duration"]',
                                       1).get("content").strip().split(':')
        except BrokenPageError:
            # it's probably a live, np.
            video.duration = NotAvailable
        else:
            if len(parts) == 1:
                seconds = parts[0]
                hours = minutes = 0
            elif len(parts) == 2:
                minutes, seconds = parts
                hours = 0
            elif len(parts) == 3:
                hours, minutes, seconds = parts
            else:
                raise BrokenPageError('Unable to parse duration %r' % parts)
            video.duration = datetime.timedelta(hours=int(hours),
                                                minutes=int(minutes),
                                                seconds=int(seconds))

        try:
            video.description = html2text(
                self.parser.select(head, 'meta[property="og:description"]',
                                   1).get("content")).strip() or unicode()
        except BrokenPageError:
            video.description = u''

Beispiel #20

0

Datei anzeigen

Datei: pages.py Projekt: linura/weboob

    def get_video(self, video=None):
        _id = to_unicode(self.group_dict['id'])
        if video is None:
            video = JacquieEtMichelVideo(_id)
        title_el = self.parser.select(self.document.getroot(), 'h1', 1)
        video.title = to_unicode(title_el.text.strip())
        video.description = self.document.xpath(
            '//meta[@name="description"]')[0].attrib['content']

        for script in self.document.xpath('.//script'):
            if script.text is None:
                continue
            m = re.search('"(http://[^"]+.mp4)"', script.text, re.MULTILINE)
            if m:
                video.url = to_unicode(m.group(1))
                break

        if not video.url:
            raise BrokenPageError('Unable to find URL')

        video.set_empty_fields(NotAvailable)

        return video

Beispiel #21

0

Datei anzeigen

Datei: browser.py Projekt: yang2lalang/weboob

    def get_content(self, _id):
        url, _id = self.parse_id(_id)

        if url is None:
            return None

        self.location(url)
        self.page.url = self.absurl(url)

        if self.is_on_page(CommentPage):
            content = self.page.get_comment()
        elif self.is_on_page(ContentPage):
            m = re.match('.*#comment-(\d+)$', url)
            if m:
                content = self.page.get_comment(int(m.group(1)))
            else:
                content = self.page.get_article()
        else:
            raise BrokenPageError('Not on a content or comment page (%r)' % self.page)

        if _id is not None:
            content.id = _id
        return content

Beispiel #22

0

Datei anzeigen

Datei: search.py Projekt: sourcery-ai-bot/weboob

    def iter_videos(self):
        try:
            ul = self.parser.select(self.document.getroot(),
                                    'div.container-videos ul', 1)
        except BrokenPageError:
            # It means there are no results.
            return
        for li in ul.findall('li'):
            url = li.find('a').find('img').attrib['src']

            id = re.sub(self.URL_REGEXP, r'\2', url)
            video = InaVideo(id)

            video.thumbnail = BaseImage(u'http://boutique.ina.fr%s' % url)
            video.thumbnail.url = video.thumbnail.id

            # The title is poorly encoded is the source, we have to encode/decode it again
            video.title = unicode(self.parser.select(
                li, 'p.titre',
                1).text).encode('raw_unicode_escape').decode('utf8')

            date = self.parser.select(li, 'p.date', 1).text
            day, month, year = [int(s) for s in date.split('/')]
            video.date = datetime.datetime(year, month, day)

            duration = self.parser.select(li, 'p.duree', 1).text
            m = re.match(r'((\d+)h)?((\d+)min)?(\d+)s', duration)
            if m:
                video.duration = datetime.timedelta(hours=int(m.group(2) or 0),
                                                    minutes=int(
                                                        m.group(4) or 0),
                                                    seconds=int(m.group(5)))
            else:
                raise BrokenPageError('Unable to match duration (%r)' %
                                      duration)

            yield video

Beispiel #23

0

Datei anzeigen

Datei: pages.py Projekt: dasimon/weboob

    def iter_accounts(self, next_pages):
        params = self.get_params()

        account = None
        currency = None
        for th in self.document.xpath('//table[@id="TabCtes"]//thead//th'):
            m = re.match('.*\((\w+)\)$', th.text)
            if m and currency is None:
                currency = Account.get_currency(m.group(1))

        for tr in self.document.xpath('//table[@id="TabCtes"]/tbody/tr'):
            cols = tr.xpath('./td')

            id = self.parser.tocleanstring(cols[self.COL_ID])
            if len(id) > 0:
                if account is not None:
                    yield account
                account = Account()
                account.id = id.replace(' ', '')
                account.type = Account.TYPE_CARD
                account.balance = account.coming = Decimal('0')
                account._next_debit = datetime.date.today()
                account._prev_debit = datetime.date(2000, 1, 1)
                account.label = u' '.join([
                    self.parser.tocleanstring(cols[self.COL_TYPE]),
                    self.parser.tocleanstring(cols[self.COL_LABEL])
                ])
                account.currency = currency
                account._params = None
                account._invest_params = None
                account._coming_params = params.copy()
                account._coming_params[
                    'dialogActionPerformed'] = 'SELECTION_ENCOURS_CARTE'
                account._coming_params[
                    'attribute($SEL_$%s)' %
                    tr.attrib['id'].split('_')[0]] = tr.attrib['id'].split(
                        '_', 1)[1]
            elif account is None:
                raise BrokenPageError('Unable to find accounts on cards page')
            else:
                account._params = params.copy()
                account._params[
                    'dialogActionPerformed'] = 'SELECTION_ENCOURS_CARTE'
                account._params[
                    'attribute($SEL_$%s)' %
                    tr.attrib['id'].split('_')[0]] = tr.attrib['id'].split(
                        '_', 1)[1]

            date_col = self.parser.tocleanstring(cols[self.COL_DATE])
            m = re.search('(\d+)/(\d+)/(\d+)', date_col)
            if not m:
                self.logger.warning('Unable to parse date %r' % date_col)
                continue

            date = datetime.date(*reversed(map(int, m.groups())))
            if date.year < 100:
                date = date.replace(year=date.year + 2000)

            amount = Decimal(
                FrenchTransaction.clean_amount(
                    self.parser.tocleanstring(cols[self.COL_AMOUNT])))

            if not date_col.endswith('(1)'):
                # debited
                account.coming += -abs(amount)
                account._next_debit = date
            elif date > account._prev_debit:
                account._prev_balance = -abs(amount)
                account._prev_debit = date

        if account is not None:
            yield account

        # Needed to preserve navigation.
        btn = self.document.xpath('.//button/span[text()="Retour"]')
        if len(btn) > 0:
            btn = btn[0].getparent()
            actions = self.get_button_actions()
            _params = params.copy()
            _params.update(actions[btn.attrib['id']])
            self.browser.openurl('/cyber/internet/ContinueTask.do',
                                 urllib.urlencode(_params))

Beispiel #24

0

Datei anzeigen

    def get_list(self):
        def check_valid_url(url):
            pattern = [
                '/restitution/cns_detailAVPAT.html',
                '/restitution/cns_detailPea.html',
                '/restitution/cns_detailAlterna.html',
            ]

            for p in pattern:
                if url.startswith(p):
                    return False
            return True

        for tr in self.document.getiterator('tr'):
            if 'LGNTableRow' not in tr.attrib.get('class', '').split():
                continue

            account = Account()
            for td in tr.getiterator('td'):
                if td.attrib.get('headers', '') == 'TypeCompte':
                    a = td.find('a')
                    if a is None:
                        break
                    account.label = self.parser.tocleanstring(a)
                    account._link_id = a.get('href', '')
                    for pattern, actype in self.TYPES.iteritems():
                        if account.label.startswith(pattern):
                            account.type = actype
                            break
                    else:
                        if account._link_id.startswith('/asv/asvcns10.html'):
                            account.type = Account.TYPE_LIFE_INSURANCE
                    # Website crashes when going on theses URLs
                    if not check_valid_url(account._link_id):
                        account._link_id = None

                elif td.attrib.get('headers', '') == 'NumeroCompte':
                    account.id = self.parser.tocleanstring(td).replace(
                        u'\xa0', '')

                elif td.attrib.get('headers', '') == 'Libelle':
                    text = self.parser.tocleanstring(td)
                    if text != '':
                        account.label = text

                elif td.attrib.get('headers', '') == 'Solde':
                    div = td.xpath('./div[@class="Solde"]')
                    if len(div) > 0:
                        balance = self.parser.tocleanstring(div[0])
                        if len(balance) > 0 and balance not in ('ANNULEE',
                                                                'OPPOSITION'):
                            try:
                                account.balance = Decimal(
                                    FrenchTransaction.clean_amount(balance))
                            except InvalidOperation:
                                raise BrokenPageError(
                                    'Unable to parse balance %r' % balance)
                            account.currency = account.get_currency(balance)
                        else:
                            account.balance = NotAvailable

            if not account.label or empty(account.balance):
                continue

            if account._link_id and 'CARTE_' in account._link_id:
                account.type = account.TYPE_CARD

            if account.type == Account.TYPE_UNKNOWN:
                self.logger.debug('Unknown account type: %s', account.label)

            yield account

Beispiel #25

0

Datei anzeigen

    def get_accounts(self):
        accounts = {}
        content = self.document.xpath(
            '//div[@id="main"]//div[@class="col first"]')[0]

        # Primary currency account
        primary_account = Account()
        primary_account.type = Account.TYPE_CHECKING

        # Total currency balance.
        # If there are multiple currencies, this balance is all currencies
        # converted to the main currency.
        try:
            balance = content.xpath('.//h3/span[@class="balance"]')
            if not balance:
                balance = content.xpath('.//li[@class="balance"]//span/strong')
            balance = balance[0].text_content().strip()
            primary_account.balance = AmTr.decimal_amount(balance)
            primary_account.currency = Account.get_currency(balance)
            primary_account.id = unicode(primary_account.currency)
            primary_account.label = u'%s %s*' % (self.browser.username,
                                                 balance.split()[-1])
        except IndexError:
            primary_account.balance = NotAvailable
            primary_account.label = u'%s' % (self.browser.username)
        accounts[primary_account.id] = primary_account

        # The following code will only work if the user enabled multiple currencies.
        balance = content.xpath(
            './/div[@class="body"]//ul/li[@class="balance"]/span')
        table = content.xpath('.//table[@id="balanceDetails"]//tbody//tr')

        # sanity check
        if bool(balance) is not bool(table):
            raise BrokenPageError(
                'Unable to find all required multiple currency entries')

        # Primary currency balance.
        # If the user enabled multiple currencies, we get this one instead.
        # An Account object has only one currency; secondary currencies should be other accounts.
        if balance:
            balance = balance[0].text_content().strip()
            primary_account.balance = AmTr.decimal_amount(balance)
            # The primary currency of the "head balance" is the same; ensure we got the right one
            assert primary_account.currency == primary_account.get_currency(
                balance)

        for row in table:
            balance = row.xpath('.//td')[-1].text_content().strip()
            account = Account()
            account.type = Account.TYPE_CHECKING
            # XXX it ignores 5+ devises, so it's bad, but it prevents a crash, cf #1216
            try:
                account.balance = AmTr.decimal_amount(balance)
            except InvalidOperation:
                continue
            account.currency = Account.get_currency(balance)
            account.id = unicode(account.currency)
            account.label = u'%s %s' % (self.browser.username,
                                        balance.split()[-1])
            if account.id == primary_account.id:
                assert account.balance == primary_account.balance
                assert account.currency == primary_account.currency
            elif account.currency:
                accounts[account.id] = account

        return accounts

Beispiel #26

0

Datei anzeigen

    def get_video(self, video=None):
        # check for slides id variant
        want_slides = False
        m = re.match('.*#slides', self.url)
        if m:
            want_slides = True
            # not sure it's safe
            self.group_dict['id'] += '#slides'

        if video is None:
            video = GDCVaultVideo(self.group_dict['id'])

        # the config file has it too, but in CDATA and only for type 4
        obj = self.parser.select(self.document.getroot(), 'title')
        title = None
        if len(obj) > 0:
            try:
                title = unicode(obj[0].text)
            except UnicodeDecodeError as e:
                title = None

        if title is None:
            obj = self.parser.select(self.document.getroot(),
                                     'meta[name=title]')
            if len(obj) > 0:
                if 'content' in obj[0].attrib:
                    try:
                        # FIXME: 1013483 has buggus title (latin1)
                        # for now we just pass it as-is
                        title = obj[0].attrib['content']
                    except UnicodeDecodeError as e:
                        # XXX: this doesn't even works!?
                        title = obj[0].attrib['content'].decode('iso-5589-15')

        if title is not None:
            title = title.strip()
            m = re.match('GDC Vault\s+-\s+(.*)', title)
            if m:
                title = m.group(1)
            video.title = title

        #TODO: POST back the title to /search.php and filter == id to get
        # cleaner (JSON) data... (though it'd be much slower)

        # try to find an iframe (type 3 and 4)
        obj = self.parser.select(self.document.getroot(), 'iframe')
        if len(obj) == 0:
            # type 1 or 2 (swf+js)
            # find which script element contains the swf args
            for script in self.parser.select(self.document.getroot(),
                                             'script'):
                m = re.match(
                    ".*new SWFObject.*addVariable\('type', '(.*)'\).*",
                    unicode(script.text), re.DOTALL)
                if m:
                    video.ext = m.group(1)

                m = re.match(
                    ".*new SWFObject.*addVariable\(\"file\", encodeURIComponent\(\"(.*)\"\)\).*",
                    unicode(script.text), re.DOTALL)
                if m:
                    video.url = "http://gdcvault.com%s" % (m.group(1))
                    # TODO: for non-free (like 769),
                    # must be logged to use /mediaProxy.php

                    # FIXME: doesn't seem to work yet, we get 2 bytes as html
                    # 769 should give:
                    # http://twvideo01.ubm-us.net/o1/gdcradio-net/2007/gdc/GDC07-4889.mp3
                    # HACK: we use mechanize directly here for now... FIXME
                    #print "asking for redirect on '%s'" % (video.url)
                    #self.browser.addheaders += [['Referer', 'http://gdcvault.com/play/%s' % self.group_dict['id']]]
                    #print self.browser.addheaders
                    self.browser.set_handle_redirect(False)
                    try:
                        self.browser.open_novisit(video.url)
                        # headers = req.info()
                        # if headers.get('Content-Type', '') == 'text/html' and headers.get('Content-Length', '') == '2':
                        # print 'BUG'

                        #print req.code
                    except HTTPError as e:
                        #print e.getcode()
                        if e.getcode() == 302 and hasattr(e, 'hdrs'):
                            #print e.hdrs['Location']
                            video.url = unicode(e.hdrs['Location'])
                    self.browser.set_handle_redirect(True)

                    video.set_empty_fields(NotAvailable)
                    return video

            #XXX: raise error?
            return None

        obj = obj[0]
        if obj is None:
            return None
        # type 3 or 4 (iframe)
        # get the config file for the rest
        iframe_url = obj.attrib['src']

        # 1015020 has a boggus url
        m = re.match('http:/event(.+)', iframe_url)
        if m:
            iframe_url = 'http://event' + m.group(1)

        # print iframe_url
        # 1013798 has player169.html
        # 1012186 has player16x9.html
        # some other have /somethingplayer.html...
        # 1441 has a space in the xml filename, which we must not strip
        m = re.match(
            '(http:.*/)[^/]*player[0-9a-z]*\.html\?.*xmlURL=([^&]+\.xml).*\&token=([^& ]+)',
            iframe_url)

        if not m:
            m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url)
            if m is None:
                return None
            # TODO: must be logged to use /mediaProxy.php
            # type 3 (pdf slides)
            video.ext = u'pdf'
            video.url = "http://gdcvault.com%s" % (unicode(iframe_url))

            # HACK: we use mechanize directly here for now... FIXME
            # print "asking for redirect on '%s'" % (video.url)
            self.browser.set_handle_redirect(False)
            try:
                self.browser.open_novisit(video.url)
            except HTTPError as e:
                if e.getcode() == 302 and hasattr(e, 'hdrs'):
                    video.url = unicode(e.hdrs['Location'])
            self.browser.set_handle_redirect(True)

            video.set_empty_fields(NotAvailable)
            return video

        # type 4 (dual screen video)

        # token doesn't actually seem required
        # 1441 has a space in the xml filename
        xml_filename = urllib.quote(m.group(2))
        config_url = m.group(1) + xml_filename + '?token=' + m.group(3)

        # self.browser.addheaders += [['Referer', 'http://gdcvault.com/play/%s' % self.group_dict['id']]]
        # print self.browser.addheaders
        # TODO: fix for 1015021 & others (forbidden)
        #config = self.browser.openurl(config_url).read()
        config = self.browser.get_document(self.browser.openurl(config_url))

        obj = self.parser.select(config.getroot(), 'akamaihost', 1)
        host = obj.text
        if host is None:
            raise BrokenPageError('Missing tag in xml config file')

        if host == "smil":
            # the rtmp URL is described in a smil file,
            # with several available bitrates
            obj = self.parser.select(config.getroot(), 'speakervideo', 1)
            smil = self.browser.get_document(self.browser.openurl(obj.text))
            obj = self.parser.select(smil.getroot(), 'meta', 1)
            # TODO: error checking
            base = obj.attrib.get('base', '')
            best_bitrate = 0
            path = None
            obj = self.parser.select(smil.getroot(), 'video')
            # choose the best bitrate
            for o in obj:
                rate = int(o.attrib.get('system-bitrate', 0))
                if rate > best_bitrate:
                    path = o.attrib.get('src', '')
            video.url = unicode(base + '/' + path)

        else:
            # not smil, the rtmp url is directly here as host + path
            # for id 1373 host is missing '/ondemand'
            # only add it when only a domain is specified without path
            m = re.match('^[^\/]+$', host)
            if m:
                host += "/ondemand"

            videos = {}

            obj = self.parser.select(config.getroot(), 'speakervideo', 1)
            if obj.text is not None:
                videos['speaker'] = 'rtmp://' + host + '/' + urllib.quote(
                    obj.text)

            obj = self.parser.select(config.getroot(), 'slidevideo', 1)
            if obj.text is not None:
                videos['slides'] = 'rtmp://' + host + '/' + urllib.quote(
                    obj.text)

            # print videos
            # XXX
            if 'speaker' in videos:
                video.url = unicode(videos['speaker'])
            elif 'slides' in videos:
                # 1016627 only has slides, so fallback to them
                video.url = unicode(videos['slides'])

            if want_slides:
                if 'slides' in videos:
                    video.url = unicode(videos['slides'])
            # if video.url is none: raise ? XXX

        obj = self.parser.select(config.getroot(), 'date', 1)
        if obj.text is not None:
            # 1016634 has "Invalid Date"
            try:
                video.date = parse_dt(obj.text)
            except ValueError as e:
                video.date = NotAvailable

        obj = self.parser.select(config.getroot(), 'duration', 1)
        m = re.match('(\d\d):(\d\d):(\d\d)', obj.text)
        if m:
            video.duration = datetime.timedelta(hours=int(m.group(1)),
                                                minutes=int(m.group(2)),
                                                seconds=int(m.group(3)))

        obj = self.parser.select(config.getroot(), 'speaker', 1)
        #print obj.text_content()

        #self.set_details(video)

        video.set_empty_fields(NotAvailable)
        return video

        obj = self.parser.select(self.document.getroot(), 'title')
        if len(obj) < 1:
            return None
        title = obj[0].text.strip()
        m = re.match('GDC Vault\s+-\s+(.*)', title)
        if m:
            title = m.group(1)