def find_lyric(self, url):
    """Download the lyric for the song behind ``url`` and store it.

    The numeric id is taken from the ``item-<id>.html`` part of ``url``,
    the matching ``item_js.php`` resource is fetched, and the
    ``document.write('...')`` wrapper around the lyric is removed before
    the text is cleaned up and saved to ``self.lyric``.

    Returns:
        True when ``self.lyric`` was set; False when no id could be
        extracted from ``url`` or the download returned nothing.
    """
    # Raw string: '\.' inside a plain literal is an invalid escape sequence.
    pattern = r'item-([0-9]+)\.html'
    song_id = common.get_first_group_by_pattern(url, pattern)
    if not song_id:
        logging.info('Failed to get id of url [%s]', url)
        return False

    song_url = 'http://www.kasi-time.com/item_js.php?no=' + song_id
    data = common.get_url_content(song_url)
    if not data:
        logging.info('Failed to get content of url [%s]', song_url)
        return False

    lyric = data.decode('utf-8', 'ignore')

    # Applied in order: drop the JavaScript wrapper, then turn markup into
    # plain text.
    # NOTE(review): the last replacement used to read as "space -> space",
    # which does nothing; presumably a non-breaking space was intended, so
    # both the entity and the U+00A0 character are handled -- confirm.
    replacements = (
        ("document.write('", ""),
        ("');", ""),
        ("<br>", "\n"),
        ("&nbsp;", " "),
        (u"\u00a0", " "),
    )
    for old, new in replacements:
        lyric = lyric.replace(old, new)

    lyric = common.htmlspecialchars_decode(lyric)
    lyric = common.unicode2string(lyric)
    lyric = common.strip_slash(lyric)
    lyric = lyric.strip()

    # test for half to full
    lyric = common.half2full(lyric)

    self.lyric = lyric
    return True
def find_song_info(self, content):
    """Fill in title, artist, lyricist and composer from ``content``.

    The song information is looked up inside the ``lyricBlock`` section of
    the page. The title comes from its ``<h2>`` element; the remaining
    fields are matched with one regular expression each and are optional:
    a missing one is only logged at debug level.

    Returns:
        True once the information block was found (even if individual
        fields are missing); False when the block itself is not present.
    """
    info_block = common.find_string_by_prefix_suffix(
        content, "<div id='lyricBlock'>", '</table>', False)
    if not info_block:
        # Without the block every later lookup would operate on nothing.
        logging.info('Failed to find song info block')
        return False

    title = common.find_string_by_prefix_suffix(
        info_block, '<h2>', '</h2>', False)
    if title:
        self.title = common.htmlspecialchars_decode(
            common.unicode2string(title))
    else:
        logging.debug('Failed to get title')

    patterns = {
        'artist': u'>歌:(.*?)</td>',
        'lyricist': u'>作詞:(.*?)</td>',
        'composer': u'>作曲:(.*?)</td>',
    }
    for key, pattern in patterns.items():
        value = common.get_first_group_by_pattern(info_block, pattern)
        if value:
            value = common.strip_tags(
                common.htmlspecialchars_decode(value)).strip()
            setattr(self, key, value)
        else:
            # Lazy %-args: the message is only built if debug is enabled.
            logging.debug('Failed to get %s, pattern: %s', key, pattern)

    return True
def find_song_info(self, url):
    """Download ``url`` and fill in title, artist, lyricist and composer.

    The page is decoded as ``sjis`` (undecodable bytes are ignored) and
    each field is matched with its own regular expression. Every field
    that is found is stored as an attribute of the same name.

    Returns:
        True when all four fields were found; False when the download
        returned nothing or at least one field is missing (fields that
        were found are still set).
    """
    html = common.get_url_content(url)
    if not html:
        # Nothing to decode; bail out instead of failing on .decode().
        logging.info('Failed to get content of url [%s]', url)
        return False

    html = html.decode('sjis', 'ignore')

    patterns = {
        'title': u'<h2[^>]*>([^<]+)</h2>',
        'artist': u'歌手:<h3.*?><a href="/artist/[0-9]+/".*?>(.+?)</a></h3>',
        'lyricist': u'作詞:<h4.*?>([^<]+)</h4>',
        'composer': u'作曲:<h4.*?>([^<]+)</h4>',
    }

    ret = True
    for key, pattern in patterns.items():
        value = common.get_first_group_by_pattern(html, pattern)
        if not value:
            logging.info('Failed to get %s of url [%s]', key, url)
            ret = False
            continue
        value = common.unicode2string(common.strip_tags(value))
        setattr(self, key, value)

    return ret
def parse_artist_title(self, html):
    """Set ``self.title`` and ``self.artist`` from the page description.

    The text between the ``"description" content="`` marker and the
    trailing Japanese phrase is expected to look like ``title / artist``.
    Nothing is stored unless exactly two parts are found. Always returns
    None.
    """
    head = '"description" content="'
    tail = u'の歌詞ページです'

    description = common.get_string_by_start_end_string(head, tail, html)
    if not description:
        return None

    # Remove both markers from the extracted text before splitting it.
    description = description.replace(head, '').replace(tail, '').strip()

    parts = description.split(' / ')
    if len(parts) != 2:
        return None

    title, artist = parts
    self.title = common.unicode2string(title)
    self.artist = common.unicode2string(artist)
    return None
def parse_composer(self, html):
    """Store the composer found in ``html`` in ``self.composer``.

    The value is the text between the composer label and the next tab
    character. Returns False when nothing is found, None otherwise.
    """
    found = common.find_string_by_prefix_suffix(
        html, '<b>作曲:</b>', '\t', False)

    if found:
        decoded = common.unicode2string(found)
        self.composer = common.htmlspecialchars_decode(decoded).strip()
        return None

    logging.debug('Failed to find composer')
    return False
def find_lyric(self, content):
    """Extract the lyric from the ``lyricBody`` paragraph of ``content``.

    ``<br />`` tags are removed, the text is decoded and stripped, and the
    result is saved to ``self.lyric``.

    Returns:
        True when ``self.lyric`` was set; False when the lyric paragraph
        could not be found.
    """
    lyric = common.find_string_by_prefix_suffix(
        content, "<p id='lyricBody'>", "</p>", False)
    if not lyric:
        # Guard before calling str methods on a missing result.
        logging.info('Failed to find lyric body')
        return False

    lyric = lyric.replace('<br />', '')
    lyric = common.htmlspecialchars_decode(common.unicode2string(lyric))

    self.lyric = lyric.strip()
    return True
def find_lyric(self, html):
    """Extract the lyric from the ``<div id="lyrics">`` section of ``html``.

    ``<br/>`` tags become newlines, remaining tags are stripped, and the
    result is saved to ``self.lyric``.

    Returns:
        True when ``self.lyric`` was set; False when the lyric section
        could not be found.
    """
    raw_lyric = common.get_string_by_start_end_string(
        '<div id="lyrics">', '</div>', html)
    if not raw_lyric:
        # Guard before calling str methods on a missing result.
        logging.info('Failed to find lyric')
        return False

    raw_lyric = raw_lyric.replace('<br/>', '\n')
    raw_lyric = common.unicode2string(raw_lyric)

    self.lyric = common.strip_tags(raw_lyric).strip()
    return True
def get_lyric_1st_part(self, html):
    """Return the lyric text held in the ``<canvas id="lyrics" ...>`` element.

    Tags are stripped from the extracted span and the remainder is passed
    through ``common.unicode2string``. Returns None (after logging) when
    the canvas element is not found.
    """
    canvas = common.get_string_by_start_end_string(
        '<canvas id="lyrics" ', '</canvas>', html)

    if canvas:
        return common.unicode2string(common.strip_tags(canvas))

    logging.info('Failed to get lyric string')
    return None
def parse_lyricist(self, html):
    """Store the lyricist found in ``html`` in ``self.lyricist``.

    The value is the text between the lyricist label and the next tab
    character. Returns False when nothing is found, None otherwise.
    """
    # The leftover 'find me LYRICIST' debugging trace was dropped so this
    # method mirrors parse_composer.
    prefix = '<b>作詞:</b>'
    suffix = '\t'

    raw_string = common.find_string_by_prefix_suffix(
        html, prefix, suffix, False)
    if not raw_string:
        logging.debug('Failed to find lyricist')
        return False

    self.lyricist = common.htmlspecialchars_decode(
        common.unicode2string(raw_string)).strip()
def parse_lyric(self, url, html):
    """Extract the lyric from the ``lyric-trunk`` div and store it.

    ``url`` is accepted but not used here. Returns True when
    ``self.lyric`` was set, False (after logging an error) when the div
    could not be found.
    """
    trunk = common.get_string_by_start_end_string(
        '<div id="lyric-trunk">', '</div>', html)

    if trunk:
        plain = common.strip_tags(trunk)
        self.lyric = common.unicode2string(plain).strip()
        return True

    logging.error('Failed to parse lyric')
    return False
def find_lyric(self, html):
    """Extract the lyric from the ``lyricbox`` div of ``html``.

    The lyric box is located first; the lyric itself is the part of that
    box between ``</script>`` and the next ``<!--``. ``<br />`` tags
    become newlines and remaining tags are stripped before the text is
    saved to ``self.lyric``.

    Returns:
        True when ``self.lyric`` was set; False when either extraction
        step finds nothing.
    """
    # NOTE(review): the last argument differs between the two lookups
    # (True here, False below); its meaning is defined by the helper.
    box = common.find_string_by_prefix_suffix(
        html, "<div class='lyricbox'>", '<!--', True)
    if not box:
        logging.info('Failed to find lyric box')
        return False

    lyric = common.find_string_by_prefix_suffix(
        box, '</script>', '<!--', False)
    if not lyric:
        logging.info('Failed to find lyric in lyric box')
        return False

    lyric = lyric.replace('<br />', '\n')
    lyric = common.unicode2string(lyric).strip()
    lyric = common.strip_tags(lyric).strip()

    self.lyric = lyric
    return True
def parse_lyric(self, html):
    """Extract the lyric from the ``body`` div paragraph and store it.

    CRLF pairs are removed from ``html`` first. ``<br />`` tags in the
    extracted paragraph become newlines, and the text is passed through
    ``common.unicode2string`` and ``common.half2full`` before being saved
    to ``self.lyric``.

    Returns True on success, False (after logging) when the paragraph is
    not found.
    """
    flattened = html.replace('\r\n', '')

    body = common.find_string_by_prefix_suffix(
        flattened, "<div class='body'><p>", '</p>', False)
    if not body:
        logging.info('Failed to parse lyric from html [%s]', flattened)
        return False

    text = body.replace('<br />', '\n').strip()
    self.lyric = common.half2full(common.unicode2string(text))
    return True
def sanitize(self, src):
    """Return ``src`` after HTML special-character decoding and
    ``common.unicode2string`` conversion, applied in that order."""
    decoded = common.htmlspecialchars_decode(src)
    return common.unicode2string(decoded)