def filename_from_html_content(html):
    """Deduce a download filename ("<program> <date> <title>.m4a") from the
    HTML of a programme page.

    Reads the 'displaydate' and 'og:title' (falling back to 'twitter:title')
    meta arguments, splits the programme name off the end of the title, and
    trims trailing date/time words from the title.

    Fix: trace message typo "truing" -> "trying".
    """
    trace(8, 'Trying to deduce filename from html-content.')
    programname = ''
    displaydate = find_html_meta_argument(html, 'displaydate')
    # programid = find_html_meta_argument(html, 'programid')
    # change date from 20141210 to 2014-12-10
    if len(displaydate) == 8:
        displaydate = displaydate[:-4] + '-' + displaydate[-4:-2] + '-' + displaydate[-2:]
    title = find_html_meta_argument(html, 'og:title')
    idx = title.rfind(' - ')
    if idx < 0:
        trace(8, 'programname is not part of og:title, trying twitter:title')
        title = find_html_meta_argument(html, 'twitter:title')
        idx = title.rfind(' - ')
    if idx > 0:
        # Everything after the last ' - ' is the programme name.
        programname = title[idx + 3:].strip()
        title = title[:idx]
        programname = common.unescape_html(programname)
        programname = programname.replace('/', ' ').rstrip(' .,!')
        if programname == 'Lordagsmorgon i P2':
            programname = 'Lordagsmorgon'
        trace(7, 'programname is ' + programname)
    parts = title.split(' ')
    # Trim date/time from the end: remember the index of the last word that
    # is NOT a time/number, the word 'kl', a Swedish month or a weekday.
    lastToKeep = 0
    for i, part in enumerate(parts):
        # trace(9, 'idx=' + str(i) + ': "' + part + '"')
        if (
            not re.match(r'\d+(:\d+)*', part)  # skip time like 12:24:00
            and part != 'kl'
            and not common.is_swe_month(part)
            and not common.is_swe_weekday(part)
        ):
            lastToKeep = i
    if lastToKeep == 0:
        trace(4, 'didn\'t find any valid name-parts in title. Keeping as is: "', title, '"')
    else:
        title = ' '.join(parts[0:lastToKeep + 1])
        title = common.unescape_html(title)
        title = title.replace('/', ' ').strip(' .,!')
        trace(4, 'new title is ' + title + '\nskipped index ', lastToKeep, 'to ', len(parts) - 1)
    filename = programname + ' ' + displaydate + ' ' + title + '.m4a'
    trace(4, 'filename: ' + filename)
    return filename
def pp_info(url, nick):
    """
    Return the transcript and link to the image of a Profound Programmer page.

    Returns a list of truncated strings: optional quoted title, the
    [transcript], and the image URL (HD version when available). On download
    failure, returns a single error string addressed to *nick*.

    Fix: error message grammar ("change format" -> "changed format").
    """
    try:
        data = common.read_url(url)
    except HTTPError:
        return '{}: kunde inte ladda sidan: {}'.format(nick, url)
    # Pulls the image URL, the bracketed transcript and an optional HD link
    # out of the post markup (re.VERBOSE: literal spaces are escaped).
    main_re = re.compile(r"""
        <li\ class="post\ photo">
        \s*
        <img\ src="(?P<img>.+?)"
        .+?
        <div\ class="caption"><p>
        \[(?P<transcript>.+?)\]
        </p>
        \s*
        (<p><a\ href="(?P<hdimg>.+?)">\[HD\ Version\]</a>)?
    """, re.DOTALL | re.VERBOSE)
    # Splits an optional leading quoted title off the transcript text.
    transcript_re = re.compile(
        r'text\s?:? (“(?P<title1>.+?)”|‘(?P<title2>.+?)’)?([,;] )?(?P<transcript>.+)',
        re.DOTALL)
    result = main_re.search(data)
    if not result:
        print(url)
        raise AttributeError('.profound could not match the regex! Has theprofoundprogrammer.org changed format?')
    rawtranscript = transcript_re.match(common.unescape_html(sanitize(result.group('transcript'))))
    title = None
    if rawtranscript:
        # title1/title2 are alternatives in the regex, so at most one is set.
        for t in ('title1', 'title2'):
            if rawtranscript.group(t):
                title = rawtranscript.group(t)
        transcript = rawtranscript.group('transcript')
    else:
        transcript = common.unescape_html(result.group('transcript'))
    # Prefer the HD image when the post offers one.
    if result.group('hdimg'):
        image = result.group('hdimg')
    else:
        image = result.group('img')
    out = ['[{}]'.format(transcript)] + [image]
    if title:
        out = ['"{}"'.format(title)] + out
    return [common.truncate(x, 400) for x in out]
def xkcd_info(url, nick):
    """
    Return the transcript and title of an xkcd page.

    Returns a list of truncated strings: a "title – url" header followed by
    up to three lines of the transcript. On download failure, returns a
    single error string addressed to *nick*.

    Bug fix: title_re previously ended with '</div>' instead of '</title>',
    so it never matched the <title> tag and the code always fell back to the
    ctitle regex.
    """
    try:
        data = common.read_url(url)
    except HTTPError:
        return '{}: kunde inte ladda sidan: {}'.format(nick, url)
    title_re = re.compile(r'<title>xkcd: (.+?)</title>')
    # Fallback for pages where the <title> tag doesn't match.
    titlebackup_re = re.compile(r'<div id="ctitle">(.+?)</div>')
    transcript_re = re.compile(
        r'<div id="transcript" .+?>(?P<transcript>.*?)(\{\{(?P<alt>.+?)\}\})?</div>',
        re.DOTALL)
    # Transcript
    result = transcript_re.search(data)
    transcript = [line.strip() for line in result.group('transcript').splitlines() if line.strip()]
    if not transcript:
        transcript = ['Ingen beskrivning än!']
    # Unused for now - also broken if no transcript is available
    # alttext = result.group('alt').strip()
    # Title
    title = title_re.search(data)
    if not title:
        title = titlebackup_re.search(data)
    firstline = '{} – {}'.format(title.group(1), url)
    return [common.truncate(common.unescape_html(x), 400) for x in [firstline] + transcript[:3]]
def sanitise(text):
    """Clean scraped HTML text: drop non-breaking spaces, decode escape
    sequences, unescape HTML entities and rewrite <sup>/<sub> fraction
    markup into plain "n/m" and "^(...)" notation.
    """
    # nbsp ugly shit hack
    cleaned = text.replace('\xa0', '')
    # NOTE(review): unicode_escape assumes latin-1 input; presumably the
    # scraped text is ASCII-safe at this point — confirm with callers.
    decode = codecs.getdecoder('unicode_escape')
    cleaned = common.unescape_html(decode(cleaned)[0])
    # Note the special ⁄-char (not a regular slash) between </sup> and <sub>
    cleaned = re.sub(r'<sup>(\d+)</sup>⁄<sub>(\d+)</sub>', r' \1/\2', cleaned)
    cleaned = cleaned.replace('<sup>', '^(').replace('</sup>', ')')
    return cleaned.replace('( ', '(')
def parse(cls, api, json):
    """Build a *cls* instance from a search-API JSON dict.

    Each key becomes an attribute on the new instance; 'created_at' values
    are parsed into datetimes and 'source' values are unescaped and parsed
    as HTML. *api* is accepted for interface compatibility but unused here.
    """
    obj = cls()
    for key, value in json.items():
        if key == 'created_at':
            value = parse_search_datetime(value)
        elif key == 'source':
            value = parse_html_value(unescape_html(value))
        setattr(obj, key, value)
    return obj
def fixhtml(text):
    """Flatten an HTML fragment to plain text: turn <br/> tags and hard
    line breaks into spaces, strip all remaining tags and unescape HTML
    entities.

    Fixes: parameter renamed from 'str' (shadowed the builtin); the two
    single-character substitutions for '\\n' and '\\r' merged into one
    equivalent character-class sub.
    """
    # A run of <br/> (or <br />) tags becomes a single space.
    text = re.sub(r'(<br ?/>)+', r' ', text)
    # Each hard line break becomes a space (same result as the former
    # separate \n and \r substitutions).
    text = re.sub(r'[\n\r]', r' ', text)
    # Strip any remaining tags.
    text = re.sub(r'(<.+?>)+', r'', text)
    return common.unescape_html(text)