def set_details(self, v):
    v.author = u'European Parliament'
    obj = self.parser.select(self.document.getroot(), 'meta[name=available]', 1)
    if obj is not None:
        value = obj.attrib['content']
        m = re.match(r'(\d\d)-(\d\d)-(\d\d\d\d)\s*(\d\d):(\d\d)', value)
        if not m:
            raise BrokenPageError('Unable to parse datetime: %r' % value)
        day = m.group(1)
        month = m.group(2)
        year = m.group(3)
        hour = m.group(4)
        minute = m.group(5)
        v.date = datetime.datetime(year=int(year),
                                   month=int(month),
                                   day=int(day),
                                   hour=int(hour),
                                   minute=int(minute))

    obj = self.parser.select(self.document.getroot(), 'span.ep_subtitle', 1)
    if obj is not None:
        span = self.parser.select(obj, 'span.ep_date', 1)
        value = span.text
        m = re.match(r'(\d\d):(\d\d)\s*/\s*(\d\d):(\d\d)\s*-\s*(\d\d)-(\d\d)-(\d\d\d\d)', value)
        if not m:
            raise BrokenPageError('Unable to parse datetime: %r' % value)
        bhour = m.group(1)
        bminute = m.group(2)
        ehour = m.group(3)
        eminute = m.group(4)
        day = m.group(5)
        month = m.group(6)
        year = m.group(7)
        start = datetime.datetime(year=int(year), month=int(month), day=int(day),
                                  hour=int(bhour), minute=int(bminute))
        end = datetime.datetime(year=int(year), month=int(month), day=int(day),
                                hour=int(ehour), minute=int(eminute))
        v.duration = end - start
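# A standalone, hedged sketch of the subtitle parsing done above: both times
# share the broadcast date, so the duration is simply end minus start. The
# helper name and the sample value are illustrative, not part of the page class.
import datetime
import re

def _parse_ep_duration(subtitle):
    m = re.match(r'(\d\d):(\d\d)\s*/\s*(\d\d):(\d\d)\s*-\s*(\d\d)-(\d\d)-(\d\d\d\d)', subtitle)
    if not m:
        raise ValueError('Unable to parse datetime: %r' % subtitle)
    bhour, bminute, ehour, eminute, day, month, year = map(int, m.groups())
    start = datetime.datetime(year, month, day, bhour, bminute)
    end = datetime.datetime(year, month, day, ehour, eminute)
    return end - start

# _parse_ep_duration(u'18:30 / 19:45 - 25-12-2014') -> datetime.timedelta(0, 4500)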
def get_history(self):
    tables = self.document.xpath('//table[@id="table-detail-operation"]')
    if len(tables) == 0:
        tables = self.document.xpath('//table[@id="table-detail"]')
    if len(tables) == 0:
        tables = self.document.getroot().cssselect('table.table-detail')
    if len(tables) == 0:
        try:
            self.parser.select(self.document.getroot(), 'td.no-result', 1)
        except BrokenPageError:
            raise BrokenPageError('Unable to find the transactions table')
        else:
            # A 'no result' cell: the account simply has no transactions.
            return

    for tr in tables[0].xpath('.//tr'):
        tds = tr.findall('td')
        if len(tds) < 4:
            continue
        t = Transaction(0)
        date = u''.join([txt.strip() for txt in tds[self.COL_DATE].itertext()])
        raw = u''.join([txt.strip() for txt in tds[self.COL_TEXT].itertext()])
        debit = u''.join([txt.strip() for txt in tds[self.COL_DEBIT].itertext()])
        credit = u''.join([txt.strip() for txt in tds[self.COL_CREDIT].itertext()])
        t.parse(date, re.sub(r'[ ]+', ' ', raw))
        t.set_amount(credit, debit)
        yield t
def get_history(self):
    txt = self.get_from_js('ListeMvts_data = new Array(', ');')
    if txt is None:
        no_trans = self.get_from_js('js_noMvts = new Ext.Panel(', ')')
        if no_trans is not None:
            # There are no transactions for this account; this is normal.
            return
        else:
            raise BrokenPageError('Unable to find transactions list in scripts')

    data = json.loads('[%s]' % txt.replace('"', '\\"').replace("'", '"'))
    for line in data:
        t = Transaction(line[self.COL_ID])
        if self.is_coming is not None:
            t.type = t.TYPE_CARD
            date = self.parser.strip(line[self.COL_DEBIT_DATE])
        else:
            date = self.parser.strip(line[self.COL_DATE])
        raw = self.parser.strip(line[self.COL_LABEL])
        t.parse(date, raw)
        t.set_amount(line[self.COL_VALUE])

        if t.date is NotAvailable:
            continue
        if self.set_coming(t):
            continue
        yield t
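# Hedged sketch of the quote-swapping trick used by get_history() above: the
# page embeds a JavaScript array with single-quoted strings, which is not
# valid JSON, so double quotes are escaped and single quotes converted before
# json.loads(). The sample row below is invented for illustration.
import json

js_fragment = """['OP1', 'PAIEMENT CB "CAFE"', '-12,50']"""
json_text = '[%s]' % js_fragment.replace('"', '\\"').replace("'", '"')
rows = json.loads(json_text)
# rows == [[u'OP1', u'PAIEMENT CB "CAFE"', u'-12,50']]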
def set_details(self, v):
    for li in self.parser.select(self.document.getroot(), 'ul.spaced li'):
        label = li.find('label')
        name = label.text.strip()
        value = label.tail.strip()
        if name == 'Duration:':
            m = re.match(r'((\d+)hrs)?\s*((\d+)min)?\s*((\d+)sec)?', value)
            if not m:
                raise BrokenPageError('Unable to parse duration: %r' % value)
            hours = m.group(2) or 0
            minutes = m.group(4) or 0
            seconds = m.group(6) or 0
            v.duration = datetime.timedelta(hours=int(hours),
                                            minutes=int(minutes),
                                            seconds=int(seconds))
        elif name == 'Submitted:':
            author = li.find('i')
            if author is None:
                author = li.find('a')
            if author is None:
                v.author = unicode(value)
            else:
                v.author = unicode(author.text)
        elif name == 'Rating:':
            value = li.find('span').text
            v.rating = int(value.rstrip('%'))
            v.rating_max = 100
        elif name == 'Date:':
            v.date = parse_dt(value)
def iter_station_departures(self, station_id, arrival_id=None):
    url = u'http://widget.canaltp.fr/Prochains_departs_15122009/dev/index.php?gare=%s' % unicode(station_id)
    result = self.openurl(url.encode('utf-8')).read()

    departure = ''
    for line in result.split('&'):
        if '=' not in line:
            raise BrokenPageError('Unable to parse result: %s' % line)
        key, value = line.split('=', 1)
        if key == 'nomgare':
            departure = value
        elif key.startswith('ligne'):
            _type, unknown, _time, arrival, served, late, late_reason = value.split(';', 6)
            yield {'type': to_unicode(_type),
                   'time': datetime.combine(date.today(), time(*[int(x) for x in _time.split(':')])),
                   'departure': to_unicode(departure),
                   'arrival': to_unicode(arrival).strip(),
                   'late': late and time(0, int(late.split()[0])) or time(),
                   'late_reason': to_unicode(late_reason).replace('\n', '').strip()}
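# Hedged sketch of the payload format iter_station_departures() expects:
# '&'-separated key=value pairs, where each 'ligne*' value packs seven
# ';'-separated fields. The sample payload is invented; only the field order
# matters.
sample = 'nomgare=Paris Nord&ligne1=TER;?;14:35;Creil;;5 min;incident voyageur'
for line in sample.split('&'):
    key, value = line.split('=', 1)
    if key.startswith('ligne'):
        _type, unknown, _time, arrival, served, late, late_reason = value.split(';', 6)
        # _type == 'TER', _time == '14:35', arrival == 'Creil', late == '5 min'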
def get_list(self):
    accounts = []

    txt = self.get_from_js('_data = new Array(', ');', is_list=True)
    if txt is None:
        raise BrokenPageError('Unable to find accounts list in scripts')

    data = json.loads('[%s]' % txt.replace("'", '"'))
    for line in data:
        a = Account()
        a.id = line[self.COL_ID].replace(' ', '')
        a._acc_nb = a.id.split('_')[0] if len(a.id.split('_')) > 1 else None
        fp = StringIO(unicode(line[self.COL_LABEL]).encode(self.browser.ENCODING))
        a.label = self.parser.tocleanstring(
            self.parser.parse(fp, self.browser.ENCODING).xpath('//div[@class="libelleCompteTDB"]')[0])
        # This label may cover multiple life insurance accounts, so skip it.
        if a.label == 'ASSURANCE VIE-BON CAPI-SCPI-DIVERS *':
            continue
        a.balance = Decimal(FrenchTransaction.clean_amount(line[self.COL_BALANCE]))
        a.currency = a.get_currency(line[self.COL_BALANCE])
        a.type = self.get_account_type(a.label)
        if line[self.COL_HISTORY] == 'true':
            a._inv = False
            a._link = self.get_history_link()
            a._args = {'_eventId':        'clicDetailCompte',
                       '_ipc_eventValue': '',
                       '_ipc_fireEvent':  '',
                       'deviseAffichee':  'DEVISE',
                       'execution':       self.get_execution(),
                       'idCompteClique':  line[self.COL_ID],
                      }
        else:
            a._inv = True
            a._args = {'_ipc_eventValue': line[self.COL_ID],
                       '_ipc_fireEvent':  line[self.COL_FIRE_EVENT],
                      }
            a._link = self.document.xpath('//form[@name="changePageForm"]')[0].attrib['action']

        if a.id.find('_CarteVisa') >= 0:
            # A deferred card: attach its coming amount to the previous account.
            accounts[-1]._card_ids.append(a._args)
            if not accounts[-1].coming:
                accounts[-1].coming = Decimal('0.0')
            accounts[-1].coming += a.balance
            continue

        a._card_ids = []
        accounts.append(a)

    return accounts
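# Hedged mini-example of the label extraction above: each COL_LABEL value is
# itself an HTML fragment, so it is re-parsed and the text of its
# 'libelleCompteTDB' div is taken. A direct lxml sketch (the real code goes
# through self.parser and the browser's encoding):
from StringIO import StringIO
import lxml.html

fragment = '<div class="libelleCompteTDB">COMPTE CHEQUES</div>'
doc = lxml.html.parse(StringIO(fragment))
label = doc.xpath('//div[@class="libelleCompteTDB"]')[0].text
# label == 'COMPTE CHEQUES'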
def get_messages_link(self):
    """
    Get the link to the messages page, which seems to have an
    identifier in it.
    """
    for link in self.parser.select(self.document.getroot(), 'div#pantalon div.interieur a'):
        if 'MessagesRecus' in link.attrib.get('href', ''):
            return link.attrib['href']

    raise BrokenPageError('Unable to find the link to the messages page')
def login3(self, passwd):
    self.browser.select_form(name='Main')
    self.browser['codconf'] = passwd.encode('utf-8')
    a = self.document.xpath('//a[@title="Valider"]')[0]
    m = re.match(r"javascript:RedirectToDeiPart\('([^']+)'\);", a.attrib['href'])
    if not m:
        raise BrokenPageError('Unable to find validate URL')
    self.browser.form.action = m.group(1)
    self.browser.submit(nologin=True)
def iter_videos(self):
    if self.document is None or self.document['data'] is None:
        raise BrokenPageError('Unable to find JSON data')
    for data in self.document['data']:
        video = GDCVaultVideo.get_video_from_json(data)
        # TODO: split type 4 videos into id and id#slides
        if video is None:
            continue
        yield video
def set_video_url(self, video):
    embed_page = self.browser.readurl('http://www.dailymotion.com/embed/video/%s' % video.id)

    m = re.search(r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', embed_page)
    if not m:
        raise BrokenPageError('Unable to find information about video')

    info = json.loads(m.group(1))
    qualities = info.get('metadata').get('qualities')
    # Pick the best available quality, from highest to lowest; the 'else'
    # clause runs only when no 'break' occurred, i.e. none was found.
    for key in ['2160', '1440', '1080', '720', '480', '380', '240']:
        if qualities.get(key):
            max_quality = key
            break
    else:
        raise BrokenPageError(u'Unable to extract video URL')

    video.url = unicode(qualities.get(max_quality)[0].get('url'))
def get_url(self):
    download_div = self.parser.select(self.document.getroot(), 'ul.downloadList li')
    if len(download_div) < 1:
        raise BrokenPageError('Unable to find file URL')
    a = self.parser.select(download_div[0], 'a', 1)
    m = re.match(r'^(\w+) - .*', a.text)
    if m:
        ext = m.group(1).lower()
    else:
        ext = u'flv'
    return unicode(a.attrib['href']), unicode(ext)
def set_video_url(self, video):
    embed_page = self.browser.readurl('http://www.dailymotion.com/embed/video/%s' % video.id)

    m = re.search('var info = ({.*?}),[^{"]', embed_page)
    if not m:
        raise BrokenPageError('Unable to find information about video')

    info = json.loads(m.group(1))
    for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url',
                'stream_h264_hq_url', 'stream_h264_url',
                'stream_h264_ld_url']:
        if info.get(key):
            max_quality = key
            break
    else:
        raise BrokenPageError(u'Unable to extract video URL')

    video.url = unicode(info[max_quality])
def iter_videos(self):
    # When no results are found, the website returns random results.
    sb = self.parser.select(self.document.getroot(), 'div.search form input.searchbox', 1)
    if sb.value == 'No Results Found':
        return

    # Extract the metadata from the results page.
    vidbackdrop_list = self.parser.select(self.document.getroot(), 'div.vidBackdrop ')
    for vidbackdrop in vidbackdrop_list:
        url = self.parser.select(vidbackdrop, 'a', 1).attrib['href']
        _id = url[2:]
        video = CappedVideo(_id)
        video.set_empty_fields(NotAvailable, ('url',))

        video.title = to_unicode(self.parser.select(vidbackdrop, 'div.vidTitle a', 1).text)
        video.author = to_unicode(self.parser.select(vidbackdrop, 'div.vidAuthor a', 1).text)

        thumbnail_url = 'http://cdn.capped.tv/pre/%s.png' % _id
        video.thumbnail = Thumbnail(thumbnail_url)
        video.thumbnail.url = to_unicode(video.thumbnail.id)

        # The duration is in the 'vidInfo' block; strip the leading tabs
        # and spaces, then keep only the time field.
        duration_el = self.parser.select(vidbackdrop, 'div.vidInfo', 1)
        duration_str = duration_el.text[7:].split(' ')[0]
        # Convert it to a timedelta.
        parts = duration_str.split(':')
        if len(parts) == 1:
            hours = minutes = 0
            seconds = parts[0]
        elif len(parts) == 2:
            hours = 0
            minutes, seconds = parts
        elif len(parts) == 3:
            hours, minutes, seconds = parts
        else:
            raise BrokenPageError('Unable to parse duration %r' % duration_str)
        video.duration = datetime.timedelta(hours=int(hours),
                                            minutes=int(minutes),
                                            seconds=int(seconds))
        yield video
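# The colon-split duration logic above also appears in other pages of this
# section; a shared helper could factor it out. Hypothetical sketch, not
# part of any module here:
import datetime

def parse_colon_duration(text):
    """Parse 'SS', 'MM:SS' or 'HH:MM:SS' into a timedelta."""
    parts = text.strip().split(':')
    if not 1 <= len(parts) <= 3:
        raise ValueError('Unable to parse duration %r' % text)
    hours, minutes, seconds = ([0, 0] + [int(p) for p in parts])[-3:]
    return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)

# parse_colon_duration('1:02:03') -> datetime.timedelta(0, 3723)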
def get_list(self):
    for tr in self.document.getiterator('tr'):
        if 'LGNTableRow' not in tr.attrib.get('class', '').split():
            continue

        account = Account()
        for td in tr.getiterator('td'):
            if td.attrib.get('headers', '') == 'TypeCompte':
                a = td.find('a')
                if a is None:
                    break
                account.label = self.parser.tocleanstring(a)
                for pattern, actype in self.TYPES.iteritems():
                    if account.label.startswith(pattern):
                        account.type = actype
                account._link_id = a.get('href', '')

            elif td.attrib.get('headers', '') == 'NumeroCompte':
                account.id = self.parser.tocleanstring(td).replace(u'\xa0', '')

            elif td.attrib.get('headers', '') == 'Libelle':
                text = self.parser.tocleanstring(td)
                if text != '':
                    account.label = text

            elif td.attrib.get('headers', '') == 'Solde':
                div = td.xpath('./div[@class="Solde"]')
                if len(div) > 0:
                    balance = self.parser.tocleanstring(div[0])
                    if len(balance) > 0 and balance not in ('ANNULEE', 'OPPOSITION'):
                        try:
                            account.balance = Decimal(FrenchTransaction.clean_amount(balance))
                        except InvalidOperation:
                            raise BrokenPageError('Unable to parse balance %r' % balance)
                        account.currency = account.get_currency(balance)
                    else:
                        account.balance = NotAvailable

        if not account.label or empty(account.balance):
            continue

        if 'CARTE_' in account._link_id:
            account.type = account.TYPE_CARD
            account.coming = account.balance
            account.balance = Decimal('0')

        yield account
def get_history(self, date_guesser, state=None):
    seen = set()
    lines = self.document.xpath('(//table[@class="ca-table"])[2]/tr')
    debit_date = None
    for i, line in enumerate(lines):
        # XPath comparison: True when the line has a 'cel-texte cel-neg' cell.
        is_balance = line.xpath('./td/@class="cel-texte cel-neg"')

        # There may be three or four columns; only the first two and the
        # last one are used.
        cols = [self.parser.tocleanstring(td) for td in line.xpath('./td')]
        date = cols[0]
        label = cols[1]
        amount = cols[-1]

        t = Transaction()
        t.set_amount(amount)
        t.label = t.raw = label

        if is_balance:
            m = re.search(r'(\d+ [^ ]+ \d+)', label)
            if not m:
                raise BrokenPageError('Unable to read card balance in history: %r' % label)
            if state is None:
                debit_date = parse_french_date(m.group(1))
            else:
                debit_date = state

            # Skip the first line, because it is the balance.
            if i == 0:
                continue

            t.date = t.rdate = debit_date
            # Consider the second balance line as a positive amount to
            # reset the balance to 0.
            t.amount = -t.amount
            state = t.date
        else:
            day, month = map(int, date.split('/', 1))
            t.rdate = date_guesser.guess_date(day, month)
            t.date = debit_date
            t.type = t.TYPE_CARD
            try:
                t.id = t.unique_id(seen)
            except UnicodeEncodeError:
                self.logger.debug(t)
                self.logger.debug(t.label)
                raise

        yield state, t
def go_on_accounts_list(self):
    for taskInfoOID in self.ACCOUNT_URLS:
        self.location(self.buildurl('/cyber/internet/StartTask.do',
                                    taskInfoOID=taskInfoOID,
                                    token=self.token))
        if not self.page.is_error():
            self.ACCOUNT_URLS = [taskInfoOID]
            break
    else:
        raise BrokenPageError('Unable to go on the accounts list page')

    if self.page.is_short_list():
        self.select_form(nr=0)
        self.set_all_readonly(False)
        self['dialogActionPerformed'] = 'EQUIPEMENT_COMPLET'
        self['token'] = self.page.build_token(self['token'])
        self.submit()
def iter_videos(self):
    for div in self.parser.select(self.document.getroot(), 'div.sd_video_listitem'):
        smalldiv = self.parser.select(div, 'div.sd_video_preview', 1)
        _id = smalldiv.attrib.get('data-id', None)
        if _id is None:
            self.browser.logger.warning('Unable to find the ID of a video')
            continue

        video = DailymotionVideo(_id)
        video.title = unicode(self.parser.select(div, 'div a img', 1).attrib['title']).strip()
        video.author = unicode(self.parser.select(div, 'a.link-on-hvr', 1).text).strip()
        video.description = NotAvailable
        try:
            parts = self.parser.select(div, 'div.badge-duration', 1).text.split(':')
        except BrokenPageError:
            # It is probably a live stream, so there is no duration.
            video.duration = NotAvailable
        else:
            if len(parts) == 1:
                seconds = parts[0]
                hours = minutes = 0
            elif len(parts) == 2:
                minutes, seconds = parts
                hours = 0
            elif len(parts) == 3:
                hours, minutes, seconds = parts
            else:
                raise BrokenPageError('Unable to parse duration %r' % ':'.join(parts))
            video.duration = datetime.timedelta(hours=int(hours),
                                                minutes=int(minutes),
                                                seconds=int(seconds))
        url = unicode(self.parser.select(div, 'img.preview', 1).attrib['data-src'])
        # Remove the useless anti-caching query string.
        url = re.sub(r'\?\d+', '', url)
        video.thumbnail = BaseImage(url)
        video.thumbnail.url = video.thumbnail.id
        video.set_empty_fields(NotAvailable, ('url',))
        yield video
def get_list(self):
    accounts = []

    txt = self.get_from_js('_data = new Array(', ');', is_list=True)
    if txt is None:
        raise BrokenPageError('Unable to find accounts list in scripts')

    data = json.loads('[%s]' % txt.replace("'", '"'))
    for line in data:
        a = Account()
        a.id = line[self.COL_ID].replace(' ', '')
        fp = StringIO(unicode(line[self.COL_LABEL]).encode(self.browser.ENCODING))
        a.label = self.parser.tocleanstring(
            self.parser.parse(fp, self.browser.ENCODING).xpath('//div[@class="libelleCompteTDB"]')[0])
        a.balance = Decimal(FrenchTransaction.clean_amount(line[self.COL_BALANCE]))
        a.currency = a.get_currency(line[self.COL_BALANCE])
        a.type = self.get_account_type(a.label)
        a._link = self.get_history_link()
        if line[self.COL_HISTORY] == 'true':
            a._args = {'_eventId':        'clicDetailCompte',
                       '_ipc_eventValue': '',
                       '_ipc_fireEvent':  '',
                       'deviseAffichee':  'DEVISE',
                       'execution':       self.get_execution(),
                       'idCompteClique':  line[self.COL_ID],
                      }
        else:
            a._args = None

        if a.id.find('_CarteVisa') >= 0:
            # A deferred card: attach its coming amount to the previous account.
            accounts[-1]._card_ids.append(a._args)
            if not accounts[-1].coming:
                accounts[-1].coming = Decimal('0.0')
            accounts[-1].coming += a.balance
            continue

        a._card_ids = []
        accounts.append(a)

    return iter(accounts)
def set_video_metadata(self, video):
    head = self.parser.select(self.document.getroot(), 'head', 1)

    video.title = unicode(self.parser.select(head, 'meta[property="og:title"]', 1).get("content")).strip()
    video.author = unicode(self.parser.select(head, 'meta[name="author"]', 1).get("content")).strip()

    url = unicode(self.parser.select(head, 'meta[property="og:image"]', 1).get("content")).strip()
    # Remove the useless anti-caching query string.
    url = re.sub(r'\?\d+', '', url)
    video.thumbnail = BaseImage(url)
    video.thumbnail.url = video.thumbnail.id

    try:
        parts = self.parser.select(head, 'meta[property="video:duration"]', 1).get("content").strip().split(':')
    except BrokenPageError:
        # It is probably a live stream, so there is no duration.
        video.duration = NotAvailable
    else:
        if len(parts) == 1:
            seconds = parts[0]
            hours = minutes = 0
        elif len(parts) == 2:
            minutes, seconds = parts
            hours = 0
        elif len(parts) == 3:
            hours, minutes, seconds = parts
        else:
            raise BrokenPageError('Unable to parse duration %r' % parts)
        video.duration = datetime.timedelta(hours=int(hours),
                                            minutes=int(minutes),
                                            seconds=int(seconds))

    try:
        video.description = html2text(
            self.parser.select(head, 'meta[property="og:description"]', 1).get("content")).strip() or unicode()
    except BrokenPageError:
        video.description = u''
def get_video(self, video=None):
    _id = to_unicode(self.group_dict['id'])
    if video is None:
        video = JacquieEtMichelVideo(_id)
    title_el = self.parser.select(self.document.getroot(), 'h1', 1)
    video.title = to_unicode(title_el.text.strip())
    video.description = self.document.xpath('//meta[@name="description"]')[0].attrib['content']
    for script in self.document.xpath('.//script'):
        if script.text is None:
            continue
        m = re.search(r'"(http://[^"]+\.mp4)"', script.text, re.MULTILINE)
        if m:
            video.url = to_unicode(m.group(1))
            break
    if not video.url:
        raise BrokenPageError('Unable to find URL')
    video.set_empty_fields(NotAvailable)
    return video
def get_content(self, _id):
    url, _id = self.parse_id(_id)
    if url is None:
        return None

    self.location(url)
    self.page.url = self.absurl(url)

    if self.is_on_page(CommentPage):
        content = self.page.get_comment()
    elif self.is_on_page(ContentPage):
        m = re.match(r'.*#comment-(\d+)$', url)
        if m:
            content = self.page.get_comment(int(m.group(1)))
        else:
            content = self.page.get_article()
    else:
        raise BrokenPageError('Not on a content or comment page (%r)' % self.page)

    if _id is not None:
        content.id = _id
    return content
def iter_videos(self):
    try:
        ul = self.parser.select(self.document.getroot(), 'div.container-videos ul', 1)
    except BrokenPageError:
        # It means there are no results.
        return

    for li in ul.findall('li'):
        url = li.find('a').find('img').attrib['src']
        id = re.sub(self.URL_REGEXP, r'\2', url)
        video = InaVideo(id)

        video.thumbnail = BaseImage(u'http://boutique.ina.fr%s' % url)
        video.thumbnail.url = video.thumbnail.id

        # The title is poorly encoded in the source, so we have to
        # encode/decode it again.
        video.title = unicode(self.parser.select(li, 'p.titre', 1).text).encode('raw_unicode_escape').decode('utf8')

        date = self.parser.select(li, 'p.date', 1).text
        day, month, year = [int(s) for s in date.split('/')]
        video.date = datetime.datetime(year, month, day)

        duration = self.parser.select(li, 'p.duree', 1).text
        m = re.match(r'((\d+)h)?((\d+)min)?(\d+)s', duration)
        if m:
            video.duration = datetime.timedelta(hours=int(m.group(2) or 0),
                                                minutes=int(m.group(4) or 0),
                                                seconds=int(m.group(5)))
        else:
            raise BrokenPageError('Unable to match duration (%r)' % duration)

        yield video
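# Hedged sketch of the re-encoding trick used for video.title above: when
# UTF-8 bytes were mis-decoded as Latin-1 (mojibake), encoding back with
# 'raw_unicode_escape' and decoding as UTF-8 restores the text. Invented
# Python 2 example:
mojibake = u'\xc3\xa9t\xc3\xa9'  # the UTF-8 bytes of u'\xe9t\xe9' read as Latin-1
fixed = mojibake.encode('raw_unicode_escape').decode('utf8')
# fixed == u'\xe9t\xe9'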
def iter_accounts(self, next_pages):
    params = self.get_params()

    account = None
    currency = None
    for th in self.document.xpath('//table[@id="TabCtes"]//thead//th'):
        m = re.match(r'.*\((\w+)\)$', th.text)
        if m and currency is None:
            currency = Account.get_currency(m.group(1))

    for tr in self.document.xpath('//table[@id="TabCtes"]/tbody/tr'):
        cols = tr.xpath('./td')

        id = self.parser.tocleanstring(cols[self.COL_ID])
        if len(id) > 0:
            if account is not None:
                yield account
            account = Account()
            account.id = id.replace(' ', '')
            account.type = Account.TYPE_CARD
            account.balance = account.coming = Decimal('0')
            account._next_debit = datetime.date.today()
            account._prev_debit = datetime.date(2000, 1, 1)
            account.label = u' '.join([self.parser.tocleanstring(cols[self.COL_TYPE]),
                                       self.parser.tocleanstring(cols[self.COL_LABEL])])
            account.currency = currency
            account._params = None
            account._invest_params = None
            account._coming_params = params.copy()
            account._coming_params['dialogActionPerformed'] = 'SELECTION_ENCOURS_CARTE'
            account._coming_params['attribute($SEL_$%s)' % tr.attrib['id'].split('_')[0]] = tr.attrib['id'].split('_', 1)[1]
        elif account is None:
            raise BrokenPageError('Unable to find accounts on cards page')
        else:
            account._params = params.copy()
            account._params['dialogActionPerformed'] = 'SELECTION_ENCOURS_CARTE'
            account._params['attribute($SEL_$%s)' % tr.attrib['id'].split('_')[0]] = tr.attrib['id'].split('_', 1)[1]

            date_col = self.parser.tocleanstring(cols[self.COL_DATE])
            m = re.search(r'(\d+)/(\d+)/(\d+)', date_col)
            if not m:
                self.logger.warning('Unable to parse date %r' % date_col)
                continue

            date = datetime.date(*reversed(map(int, m.groups())))
            if date.year < 100:
                date = date.replace(year=date.year + 2000)

            amount = Decimal(FrenchTransaction.clean_amount(self.parser.tocleanstring(cols[self.COL_AMOUNT])))

            if not date_col.endswith('(1)'):
                # debited
                account.coming += -abs(amount)
                account._next_debit = date
            elif date > account._prev_debit:
                account._prev_balance = -abs(amount)
                account._prev_debit = date

    if account is not None:
        yield account

    # Needed to preserve navigation.
    btn = self.document.xpath('.//button/span[text()="Retour"]')
    if len(btn) > 0:
        btn = btn[0].getparent()
        actions = self.get_button_actions()
        _params = params.copy()
        _params.update(actions[btn.attrib['id']])
        self.browser.openurl('/cyber/internet/ContinueTask.do', urllib.urlencode(_params))
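# Hedged mini-example of the date handling in iter_accounts() above: the
# regex groups are (day, month, year), so reversing them yields the
# datetime.date argument order, and two-digit years are shifted into the
# 2000s. Values are invented (Python 2, where map() returns a list).
import datetime

groups = ('15', '03', '14')                     # from a cell like '15/03/14'
d = datetime.date(*reversed(map(int, groups)))  # datetime.date(14, 3, 15)
if d.year < 100:
    d = d.replace(year=d.year + 2000)           # datetime.date(2014, 3, 15)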
def get_list(self):
    def check_valid_url(url):
        pattern = ['/restitution/cns_detailAVPAT.html',
                   '/restitution/cns_detailPea.html',
                   '/restitution/cns_detailAlterna.html',
                  ]
        for p in pattern:
            if url.startswith(p):
                return False
        return True

    for tr in self.document.getiterator('tr'):
        if 'LGNTableRow' not in tr.attrib.get('class', '').split():
            continue

        account = Account()
        for td in tr.getiterator('td'):
            if td.attrib.get('headers', '') == 'TypeCompte':
                a = td.find('a')
                if a is None:
                    break
                account.label = self.parser.tocleanstring(a)
                account._link_id = a.get('href', '')
                for pattern, actype in self.TYPES.iteritems():
                    if account.label.startswith(pattern):
                        account.type = actype
                        break
                else:
                    if account._link_id.startswith('/asv/asvcns10.html'):
                        account.type = Account.TYPE_LIFE_INSURANCE
                # The website crashes when going on these URLs.
                if not check_valid_url(account._link_id):
                    account._link_id = None

            elif td.attrib.get('headers', '') == 'NumeroCompte':
                account.id = self.parser.tocleanstring(td).replace(u'\xa0', '')

            elif td.attrib.get('headers', '') == 'Libelle':
                text = self.parser.tocleanstring(td)
                if text != '':
                    account.label = text

            elif td.attrib.get('headers', '') == 'Solde':
                div = td.xpath('./div[@class="Solde"]')
                if len(div) > 0:
                    balance = self.parser.tocleanstring(div[0])
                    if len(balance) > 0 and balance not in ('ANNULEE', 'OPPOSITION'):
                        try:
                            account.balance = Decimal(FrenchTransaction.clean_amount(balance))
                        except InvalidOperation:
                            raise BrokenPageError('Unable to parse balance %r' % balance)
                        account.currency = account.get_currency(balance)
                    else:
                        account.balance = NotAvailable

        if not account.label or empty(account.balance):
            continue

        if account._link_id and 'CARTE_' in account._link_id:
            account.type = account.TYPE_CARD

        if account.type == Account.TYPE_UNKNOWN:
            self.logger.debug('Unknown account type: %s', account.label)

        yield account
def get_accounts(self):
    accounts = {}
    content = self.document.xpath('//div[@id="main"]//div[@class="col first"]')[0]

    # Primary currency account.
    primary_account = Account()
    primary_account.type = Account.TYPE_CHECKING

    # Total currency balance.
    # If there are multiple currencies, this balance is all currencies
    # converted to the main currency.
    try:
        balance = content.xpath('.//h3/span[@class="balance"]')
        if not balance:
            balance = content.xpath('.//li[@class="balance"]//span/strong')
        balance = balance[0].text_content().strip()
        primary_account.balance = AmTr.decimal_amount(balance)
        primary_account.currency = Account.get_currency(balance)
        primary_account.id = unicode(primary_account.currency)
        primary_account.label = u'%s %s*' % (self.browser.username, balance.split()[-1])
    except IndexError:
        primary_account.balance = NotAvailable
        primary_account.label = u'%s' % (self.browser.username)
    accounts[primary_account.id] = primary_account

    # The following code will only work if the user enabled multiple currencies.
    balance = content.xpath('.//div[@class="body"]//ul/li[@class="balance"]/span')
    table = content.xpath('.//table[@id="balanceDetails"]//tbody//tr')

    # Sanity check: either both the multi-currency balance node and the
    # details table are present, or neither is.
    if bool(balance) is not bool(table):
        raise BrokenPageError('Unable to find all required multiple currency entries')

    # Primary currency balance.
    # If the user enabled multiple currencies, we get this one instead.
    # An Account object has only one currency; secondary currencies should
    # be other accounts.
    if balance:
        balance = balance[0].text_content().strip()
        primary_account.balance = AmTr.decimal_amount(balance)
        # The primary currency of the "head balance" is the same; ensure
        # we got the right one.
        assert primary_account.currency == primary_account.get_currency(balance)

    for row in table:
        balance = row.xpath('.//td')[-1].text_content().strip()
        account = Account()
        account.type = Account.TYPE_CHECKING
        # XXX: this ignores the fifth and further currencies; not ideal,
        # but it prevents a crash, cf. #1216.
        try:
            account.balance = AmTr.decimal_amount(balance)
        except InvalidOperation:
            continue
        account.currency = Account.get_currency(balance)
        account.id = unicode(account.currency)
        account.label = u'%s %s' % (self.browser.username, balance.split()[-1])
        if account.id == primary_account.id:
            assert account.balance == primary_account.balance
            assert account.currency == primary_account.currency
        elif account.currency:
            accounts[account.id] = account

    return accounts
def get_video(self, video=None):
    # Check for the slides id variant.
    want_slides = False
    m = re.match('.*#slides', self.url)
    if m:
        want_slides = True
        # Not sure it is safe.
        self.group_dict['id'] += '#slides'

    if video is None:
        video = GDCVaultVideo(self.group_dict['id'])

    # The config file has the title too, but in CDATA and only for type 4.
    obj = self.parser.select(self.document.getroot(), 'title')
    title = None
    if len(obj) > 0:
        try:
            title = unicode(obj[0].text)
        except UnicodeDecodeError:
            title = None
    if title is None:
        obj = self.parser.select(self.document.getroot(), 'meta[name=title]')
        if len(obj) > 0:
            if 'content' in obj[0].attrib:
                try:
                    # FIXME: 1013483 has a buggy title (latin1);
                    # for now we just pass it as-is.
                    title = obj[0].attrib['content']
                except UnicodeDecodeError:
                    # XXX: this doesn't even work!?
                    title = obj[0].attrib['content'].decode('iso-8859-15')
    if title is not None:
        title = title.strip()
        m = re.match(r'GDC Vault\s+-\s+(.*)', title)
        if m:
            title = m.group(1)
        video.title = title
    # TODO: POST back the title to /search.php with filter == id to get
    # cleaner (JSON) data... (though it'd be much slower)

    # Try to find an iframe (type 3 and 4).
    obj = self.parser.select(self.document.getroot(), 'iframe')
    if len(obj) == 0:
        # Type 1 or 2 (swf+js): find which script element contains the
        # swf args.
        for script in self.parser.select(self.document.getroot(), 'script'):
            m = re.match(r".*new SWFObject.*addVariable\('type', '(.*)'\).*",
                         unicode(script.text), re.DOTALL)
            if m:
                video.ext = m.group(1)
            m = re.match(r".*new SWFObject.*addVariable\(\"file\", encodeURIComponent\(\"(.*)\"\)\).*",
                         unicode(script.text), re.DOTALL)
            if m:
                video.url = "http://gdcvault.com%s" % (m.group(1))
                # TODO: for non-free videos (like 769), we must be logged
                # in to use /mediaProxy.php.
                # FIXME: doesn't seem to work yet; we get 2 bytes of html.
                # 769 should give:
                # http://twvideo01.ubm-us.net/o1/gdcradio-net/2007/gdc/GDC07-4889.mp3
                # HACK: we use mechanize directly here for now... FIXME
                self.browser.set_handle_redirect(False)
                try:
                    self.browser.open_novisit(video.url)
                except HTTPError as e:
                    if e.getcode() == 302 and hasattr(e, 'hdrs'):
                        video.url = unicode(e.hdrs['Location'])
                self.browser.set_handle_redirect(True)
                video.set_empty_fields(NotAvailable)
                return video
        # XXX: raise an error instead?
        return None

    obj = obj[0]
    if obj is None:
        return None

    # Type 3 or 4 (iframe): get the config file for the rest.
    iframe_url = obj.attrib['src']

    # 1015020 has a bogus url.
    m = re.match('http:/event(.+)', iframe_url)
    if m:
        iframe_url = 'http://event' + m.group(1)

    # 1013798 has player169.html, 1012186 has player16x9.html, and some
    # others have /somethingplayer.html...
    # 1441 has a space in the xml filename, which we must not strip.
    m = re.match(r'(http:.*/)[^/]*player[0-9a-z]*\.html\?.*xmlURL=([^&]+\.xml).*\&token=([^& ]+)',
                 iframe_url)
    if not m:
        m = re.match(r'/play/mediaProxy\.php\?sid=(\d+)', iframe_url)
        if m is None:
            return None
        # TODO: we must be logged in to use /mediaProxy.php.
        # Type 3 (pdf slides).
        video.ext = u'pdf'
        video.url = "http://gdcvault.com%s" % (unicode(iframe_url))
        # HACK: we use mechanize directly here for now... FIXME
        self.browser.set_handle_redirect(False)
        try:
            self.browser.open_novisit(video.url)
        except HTTPError as e:
            if e.getcode() == 302 and hasattr(e, 'hdrs'):
                video.url = unicode(e.hdrs['Location'])
        self.browser.set_handle_redirect(True)
        video.set_empty_fields(NotAvailable)
        return video

    # Type 4 (dual screen video).
    # The token doesn't actually seem required.
    # 1441 has a space in the xml filename.
    xml_filename = urllib.quote(m.group(2))
    config_url = m.group(1) + xml_filename + '?token=' + m.group(3)

    # TODO: fix for 1015021 & others (forbidden)
    config = self.browser.get_document(self.browser.openurl(config_url))

    obj = self.parser.select(config.getroot(), 'akamaihost', 1)
    host = obj.text
    if host is None:
        raise BrokenPageError('Missing tag in xml config file')
    if host == "smil":
        # The rtmp URL is described in a smil file, with several
        # available bitrates.
        obj = self.parser.select(config.getroot(), 'speakervideo', 1)
        smil = self.browser.get_document(self.browser.openurl(obj.text))
        obj = self.parser.select(smil.getroot(), 'meta', 1)
        # TODO: error checking
        base = obj.attrib.get('base', '')
        best_bitrate = 0
        path = None
        obj = self.parser.select(smil.getroot(), 'video')
        # Choose the best bitrate (remember the maximum seen so far).
        for o in obj:
            rate = int(o.attrib.get('system-bitrate', 0))
            if rate > best_bitrate:
                best_bitrate = rate
                path = o.attrib.get('src', '')
        video.url = unicode(base + '/' + path)
    else:
        # Not smil: the rtmp url is directly here as host + path.
        # For id 1373, host is missing '/ondemand'; only add it when a
        # bare domain is specified without a path.
        m = re.match(r'^[^\/]+$', host)
        if m:
            host += "/ondemand"
        videos = {}
        obj = self.parser.select(config.getroot(), 'speakervideo', 1)
        if obj.text is not None:
            videos['speaker'] = 'rtmp://' + host + '/' + urllib.quote(obj.text)
        obj = self.parser.select(config.getroot(), 'slidevideo', 1)
        if obj.text is not None:
            videos['slides'] = 'rtmp://' + host + '/' + urllib.quote(obj.text)
        # XXX
        if 'speaker' in videos:
            video.url = unicode(videos['speaker'])
        elif 'slides' in videos:
            # 1016627 only has slides, so fall back to them.
            video.url = unicode(videos['slides'])
        if want_slides:
            if 'slides' in videos:
                video.url = unicode(videos['slides'])
        # XXX: raise an error if video.url is None?

    obj = self.parser.select(config.getroot(), 'date', 1)
    if obj.text is not None:
        # 1016634 has "Invalid Date".
        try:
            video.date = parse_dt(obj.text)
        except ValueError:
            video.date = NotAvailable

    obj = self.parser.select(config.getroot(), 'duration', 1)
    m = re.match(r'(\d\d):(\d\d):(\d\d)', obj.text)
    if m:
        video.duration = datetime.timedelta(hours=int(m.group(1)),
                                            minutes=int(m.group(2)),
                                            seconds=int(m.group(3)))

    obj = self.parser.select(config.getroot(), 'speaker', 1)
    # self.set_details(video)

    video.set_empty_fields(NotAvailable)
    return video
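# Hedged standalone sketch of the smil bitrate selection used in get_video()
# above, assuming lxml elements carrying 'system-bitrate' and 'src'
# attributes; the helper name is illustrative.
def best_video_src(video_nodes):
    """Return the src of the highest-bitrate <video> node, or None."""
    if not video_nodes:
        return None
    best = max(video_nodes, key=lambda v: int(v.attrib.get('system-bitrate', 0)))
    return best.attrib.get('src', '')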