Example #1
0
    def find_lyric(self, url):
        """Fetch the lyric for the song referenced by *url* into ``self.lyric``.

        The numeric id is taken from the ``item-<id>.html`` part of *url* and
        used to request the ``item_js.php`` endpoint. The response is decoded
        as UTF-8, its ``document.write('...');`` wrapper is stripped, and
        ``<br>`` / ``&nbsp;`` are turned into newlines / spaces before the
        remaining clean-up helpers from ``common`` are applied.

        Returns:
            True when the lyric was stored, False when the id could not be
            extracted from *url* or the content could not be fetched.
        """
        # Raw string: ``\.`` must reach the regex engine verbatim instead of
        # relying on Python passing an unrecognised string escape through.
        pattern = r'item-([0-9]+)\.html'

        song_id = common.get_first_group_by_pattern(url, pattern)

        if not song_id:
            logging.info('Failed to get id of url [%s]', url)
            return False

        song_url = 'http://www.kasi-time.com/item_js.php?no=' + song_id
        data = common.get_url_content(song_url)
        if not data:
            logging.info('Failed to get content of url [%s]', song_url)
            return False

        lyric = data.decode('utf-8', 'ignore')
        # The payload is a JavaScript snippet; drop the call wrapper so only
        # the markup passed to document.write remains.
        lyric = lyric.replace("document.write('", "")
        lyric = lyric.replace("');", "")
        lyric = lyric.replace("<br>", "\n")
        lyric = lyric.replace("&nbsp;", " ")
        lyric = common.htmlspecialchars_decode(lyric)
        lyric = common.unicode2string(lyric)
        lyric = common.strip_slash(lyric)
        lyric = lyric.strip()

        # NOTE: flagged as "test for half to full" by the original author.
        lyric = common.half2full(lyric)

        self.lyric = lyric

        return True
Example #2
0
    def find_song_info(self, content):
        """Extract song metadata from the lyric page markup in *content*.

        The block between ``<div id='lyricBlock'>`` and ``</table>`` is
        located first; the title is read from its ``<h2>`` element and the
        artist / lyricist / composer from the labelled table cells. Each
        value that is found is stored on ``self`` under the matching
        attribute name; a missing value is only logged at debug level.

        Returns:
            Always True.
        """
        info_block = common.find_string_by_prefix_suffix(
            content, "<div id='lyricBlock'>", '</table>', False)

        title = common.find_string_by_prefix_suffix(info_block, '<h2>', '</h2>', False)
        self.title = common.htmlspecialchars_decode(common.unicode2string(title))

        patterns = {
            'artist': u'>歌:(.*?)</td>',
            'lyricist': u'>作詞:(.*?)</td>',
            'composer': u'>作曲:(.*?)</td>'
        }

        for key, pattern in patterns.items():
            value = common.get_first_group_by_pattern(info_block, pattern)
            if value:
                value = common.strip_tags(common.htmlspecialchars_decode(value)).strip()
                setattr(self, key, value)
            else:
                # Lazy logging arguments: the message is only formatted when
                # debug output is actually enabled.
                logging.debug('Failed to get %s, pattern: %s', key, pattern)

        return True
Example #3
0
    def find_song_info(self, url):
        """Download *url* and extract title, artist, lyricist and composer.

        The page is decoded as Shift_JIS (undecodable bytes are ignored) and
        each field is looked up with its own regular expression. Every value
        that is found is stored on ``self`` under the matching attribute name.

        Returns:
            True when all four fields were found; False when the page could
            not be fetched or at least one field is missing.
        """
        html = common.get_url_content(url)
        if not html:
            # Without this guard a failed fetch would blow up on .decode().
            logging.info('Failed to get content of url [%s]', url)
            return False

        html = html.decode('sjis', 'ignore')

        patterns = {
            'title': u'<h2[^>]*>([^<]+)</h2>',
            'artist': u'歌手:<h3.*?><a href="/artist/[0-9]+/".*?>(.+?)</a></h3>',
            'lyricist': u'作詞:<h4.*?>([^<]+)</h4>',
            'composer': u'作曲:<h4.*?>([^<]+)</h4>'
        }

        ret = True
        for key, pattern in patterns.items():
            value = common.get_first_group_by_pattern(html, pattern)

            if not value:
                # Keep going so the remaining fields are still collected.
                logging.info('Failed to get %s of url [%s]', key, url)
                ret = False
            else:
                value = common.unicode2string(common.strip_tags(value))
                setattr(self, key, value)

        return ret
Example #4
0
    def parse_artist_title(self, html):
        """Read title and artist from the page's ``description`` meta content.

        The text between ``"description" content="`` and the trailing
        Japanese phrase is expected to look like ``<title> / <artist>``. When
        it splits into exactly two parts they are stored as ``self.title``
        and ``self.artist``; otherwise nothing is stored.

        Returns:
            None in every case.
        """
        head = '"description" content="'
        tail = u'の歌詞ページです'

        description = common.get_string_by_start_end_string(head, tail, html)
        if not description:
            return None

        # Remove the delimiters in case the helper kept them in its result.
        cleaned = description.replace(head, '').replace(tail, '').strip()

        parts = cleaned.split(' / ')
        if len(parts) != 2:
            return None

        title, artist = parts
        self.title = common.unicode2string(title)
        self.artist = common.unicode2string(artist)
        return None
Example #5
0
    def parse_composer(self, html):
        """Extract the composer from *html* and store it as ``self.composer``.

        The value is the text between the bold label (numeric character
        references spelling "作曲：") and the next tab character.

        Returns:
            False when the label could not be found, None otherwise.
        """
        label = '<b>&#20316;&#26354;&#65306;</b>'
        terminator = '\t'

        found = common.find_string_by_prefix_suffix(html, label, terminator, False)
        if found:
            decoded = common.htmlspecialchars_decode(common.unicode2string(found))
            self.composer = decoded.strip()
            return None

        logging.debug('Failed to find composer')
        return False
Example #6
0
    def find_lyric(self, content):
        """Extract the lyric body from *content* and store it as ``self.lyric``.

        The lyric is the text between ``<p id='lyricBody'>`` and ``</p>``.
        ``<br />`` tags are removed (not converted to newlines) before the
        text is decoded and stripped.

        Returns:
            True when the lyric was stored, False when the lyric body could
            not be found.
        """
        # Imported locally so this method does not rely on a module-level
        # import being present.
        import logging

        lyric = common.find_string_by_prefix_suffix(
            content, "<p id='lyricBody'>", "</p>", False)
        if not lyric:
            # Guard against a missing block instead of failing on .replace().
            logging.info('Failed to find lyric body')
            return False

        lyric = lyric.replace('<br />', '')
        lyric = common.htmlspecialchars_decode(common.unicode2string(lyric))
        lyric = lyric.strip()

        self.lyric = lyric

        return True
Example #7
0
    def find_lyric(self, html):
        """Extract the lyric from *html* and store it as ``self.lyric``.

        The lyric is taken from the ``<div id="lyrics">`` element; ``<br/>``
        tags become newlines and any remaining tags are stripped.

        Returns:
            True when the lyric was stored, False when the lyrics element
            could not be found.
        """
        # Imported locally so this method does not rely on a module-level
        # import being present.
        import logging

        prefix = '<div id="lyrics">'
        suffix = '</div>'
        raw_lyric = common.get_string_by_start_end_string(prefix, suffix, html)
        if not raw_lyric:
            # Guard against a missing block instead of failing on .replace().
            logging.info('Failed to find lyric')
            return False

        raw_lyric = raw_lyric.replace('<br/>', '\n')
        raw_lyric = common.unicode2string(raw_lyric)
        raw_lyric = common.strip_tags(raw_lyric).strip()

        self.lyric = raw_lyric

        return True
Example #8
0
    def get_lyric_1st_part(self, html):
        """Return the lyric text embedded in the ``<canvas id="lyrics">`` element.

        The canvas markup is located in *html*, its tags are stripped and the
        result is passed through ``common.unicode2string``.

        Returns:
            The converted lyric text, or None when the canvas element could
            not be found.
        """
        canvas = common.get_string_by_start_end_string(
            '<canvas id="lyrics" ', '</canvas>', html)
        if canvas:
            return common.unicode2string(common.strip_tags(canvas))

        logging.info('Failed to get lyric string')
        return None
Example #9
0
    def parse_lyricist(self, html):
        """Extract the lyricist from *html* and store it as ``self.lyricist``.

        The value is the text between the bold label (numeric character
        references spelling "作詞：") and the next tab character.

        Returns:
            False when the label could not be found, None otherwise.
        """
        prefix = '<b>&#20316;&#35422;&#65306;</b>'
        suffix = '\t'

        raw_string = common.find_string_by_prefix_suffix(html, prefix, suffix, False)
        if not raw_string:
            logging.debug('Failed to find lyricist')
            return False

        self.lyricist = common.htmlspecialchars_decode(common.unicode2string(raw_string)).strip()
Example #10
0
    def parse_lyric(self, url, html):
        """Extract the lyric from *html* and store it as ``self.lyric``.

        The lyric is taken from the ``<div id="lyric-trunk">`` element, with
        tags stripped and surrounding whitespace removed. *url* is accepted
        but not used by this method.

        Returns:
            True when the lyric was stored, False when the element could not
            be found.
        """
        trunk = common.get_string_by_start_end_string(
            '<div id="lyric-trunk">', '</div>', html)
        if trunk:
            plain = common.strip_tags(trunk)
            self.lyric = common.unicode2string(plain).strip()
            return True

        logging.error('Failed to parse lyric')
        return False
Example #11
0
    def find_lyric(self, html):
        """Extract the lyric from *html* and store it as ``self.lyric``.

        The ``<div class='lyricbox'>`` section (up to the next ``<!--``) is
        located first; the lyric is the part of that section between
        ``</script>`` and ``<!--``. ``<br />`` tags become newlines and any
        remaining tags are stripped.

        Returns:
            True when the lyric was stored, False when either the lyric box
            or the lyric inside it could not be found.
        """
        # Imported locally so this method does not rely on a module-level
        # import being present.
        import logging

        prefix = "<div class='lyricbox'>"
        suffix = '<!--'
        # NOTE(review): the final True/False flag presumably controls whether
        # prefix and suffix are kept in the result - confirm against common.
        line = common.find_string_by_prefix_suffix(html, prefix, suffix, True)
        if not line:
            logging.info('Failed to find lyric box')
            return False

        prefix = '</script>'
        suffix = '<!--'
        lyric = common.find_string_by_prefix_suffix(line, prefix, suffix, False)
        if not lyric:
            # Guard against a missing block instead of failing on .replace().
            logging.info('Failed to find lyric inside lyric box')
            return False

        lyric = lyric.replace('<br />', '\n')
        lyric = common.unicode2string(lyric).strip()
        lyric = common.strip_tags(lyric).strip()

        self.lyric = lyric
        return True
Example #12
0
    def parse_lyric(self, html):
        """Extract the lyric from *html* and store it as ``self.lyric``.

        CRLF sequences are removed from *html* first, then the text between
        ``<div class='body'><p>`` and ``</p>`` is taken as the lyric.
        ``<br />`` tags become newlines before the text is stripped and run
        through ``common.unicode2string`` and ``common.half2full``.

        Returns:
            True when the lyric was stored, False when it could not be found.
        """
        html = html.replace('\r\n', '')

        body = common.find_string_by_prefix_suffix(
            html, "<div class='body'><p>", '</p>', False)
        if body:
            text = body.replace('<br />', '\n').strip()
            text = common.unicode2string(text)
            self.lyric = common.half2full(text)
            return True

        logging.info('Failed to parse lyric from html [%s]', html)
        return False
Example #13
0
 def sanitize(self, src):
     """Return *src* after HTML-entity decoding and ``common.unicode2string``."""
     decoded = common.htmlspecialchars_decode(src)
     return common.unicode2string(decoded)