Example #1
0
    end_str = conf['block_end']
    sub_content = ''
    if special_str in content:
        start_index = content.index(special_str)
        sub_content = content[start_index + len(special_str):]
        end_index = sub_content.index(end_str)
        sub_content = sub_content[0:end_index]
        #sub_content = re.sub(r'</?\w+[^>]*>',' ', sub_content)

    ###
    end_mark = conf['end_mark']
    song_mark = conf['song_mark']
    singer_mark = conf['singer_mark']
    find_index = 0

    sub_content = tools.str_replace(sub_content)
    i = 0
    while song_mark in sub_content:
        #{
        song = ''
        singer = ''
        if 'simple_parse' in conf and conf['simple_parse']:
            start_index = sub_content.index(song_mark)
            sub_content = sub_content[start_index + len(song_mark):]
            end_index = sub_content.index(end_mark)
            html = sub_content[0:end_index]

            song_singer = callback(html, conf['callback'])
            #tools.debug(song_singer, 1)

            song = song_singer['song']
Example #2
0
    def xiami_parse_detail(self, html, module, conf, info_obj, fp_obj, logger):
        """Collect album detail fields from an album page into ``detail_data``.

        The first five ``<td valign="top"`` cells found in ``html`` are
        assigned, in order, to ``artist``, ``language``, ``company``,
        ``publish_time`` and ``album_type``.  The ``album_intro`` block, when
        present, is tag-stripped and HTML-unescaped.

        Args:
            html: Page markup; passed through ``tools.str_replace`` first.
            module: Name used only in log messages.
            conf: Not used in the visible part of this method.
            info_obj: Mapping providing the ``"url"`` and ``"album"`` keys.
            fp_obj: Mapping providing the ``"dp_fp"`` and ``"dd_fp"`` keys.
            logger: Object with an ``info`` method.
        """
        parser = HTMLParser.HTMLParser()
        # NOTE(review): these two are fetched but not used in the visible part
        # of the method; presumably output handles -- confirm against callers.
        dp_fp = fp_obj["dp_fp"]
        dd_fp = fp_obj["dd_fp"]

        url = info_obj["url"]
        logger.info("Start parse url for %s, url: %s" % (module, url))

        html = tools.str_replace(html)
        detail_data = {
            "url": url,
            "album": info_obj["album"],
            "artist": "",
            "language": "",
            "company": "",
            "publish_time": "",
            "album_type": "",
            "album_intro": "",
        }
        start = '<td valign="top"'
        end = "</td>"
        end_len = len(end)

        # The cells are positional: the n-th matching cell fills the n-th name.
        parse = ["artist", "language", "company", "publish_time", "album_type"]
        for tag in parse:
            s_index = html.find(start)
            if s_index == -1:
                break

            # Search for the closing tag only after the opening one.
            e_index = html.find(end, s_index)
            if e_index == -1:
                # Unterminated cell: stop instead of slicing with -1, which
                # would store a few characters of junk and mis-advance html.
                break

            sub_html = html[s_index : e_index + end_len]
            detail_data[tag] = tools.strip_html_tag(sub_html)

            # Consume the cell so the next iteration sees the following one.
            html = html[e_index + end_len :]

        # album intro
        start = '<div id="album_intro"'
        end = '<div class="album_intro_toggle'
        s_index = html.find(start)
        if s_index != -1:
            # The end marker only counts when it follows the start marker;
            # without one, the intro runs to the end of the remaining markup.
            e_index = html.find(end, s_index)
            if e_index == -1:
                e_index = len(html)
            data = tools.strip_html_tag(html[s_index:e_index])
            try:
                data = parser.unescape(data)
            except (UnicodeDecodeError, UnicodeEncodeError) as e:
                # Best effort: keep the still-escaped text when decoding fails.
                logger.info("Parse data error. [Exception]: %s" % (e))
            detail_data["album_intro"] = data
Example #3
0
    def xiami_parse_detail(self, html, module, conf, info_obj, fp_obj, logger):
        '''Collect album detail fields from an album page into ``detail_data``.

        The first five ``<td valign="top"`` cells found in ``html`` are
        assigned, in order, to ``artist``, ``language``, ``company``,
        ``publish_time`` and ``album_type``.  The ``album_intro`` block, when
        present, is tag-stripped and HTML-unescaped.

        Args:
            html: Page markup; passed through ``tools.str_replace`` first.
            module: Name used only in log messages.
            conf: Not used in the visible part of this method.
            info_obj: Mapping providing the ``'url'`` and ``'album'`` keys.
            fp_obj: Mapping providing the ``'dp_fp'`` and ``'dd_fp'`` keys.
            logger: Object with an ``info`` method.
        '''
        parser = HTMLParser.HTMLParser()
        # NOTE(review): these two are fetched but not used in the visible part
        # of the method; presumably output handles -- confirm against callers.
        dp_fp = fp_obj['dp_fp']
        dd_fp = fp_obj['dd_fp']

        url = info_obj['url']
        logger.info('Start parse url for %s, url: %s' % (module, url))

        html = tools.str_replace(html)
        detail_data = {
            'url': url,
            'album': info_obj['album'],
            'artist': '',
            'language': '',
            'company': '',
            'publish_time': '',
            'album_type': '',
            'album_intro': '',
        }
        start = '<td valign="top"'
        end = '</td>'
        end_len = len(end)

        # The cells are positional: the n-th matching cell fills the n-th name.
        parse = ['artist', 'language', 'company', 'publish_time', 'album_type']
        for tag in parse:
            s_index = html.find(start)
            if s_index == -1:
                break

            # Search for the closing tag only after the opening one.
            e_index = html.find(end, s_index)
            if e_index == -1:
                # Unterminated cell: stop instead of slicing with -1, which
                # would store a few characters of junk and mis-advance html.
                break

            sub_html = html[s_index:e_index + end_len]
            detail_data[tag] = tools.strip_html_tag(sub_html)

            # Consume the cell so the next iteration sees the following one.
            html = html[e_index + end_len:]

        # album intro
        start = '<div id="album_intro"'
        end = '<div class="album_intro_toggle'
        s_index = html.find(start)
        if s_index != -1:
            # The end marker only counts when it follows the start marker;
            # without one, the intro runs to the end of the remaining markup.
            e_index = html.find(end, s_index)
            if e_index == -1:
                e_index = len(html)
            data = tools.strip_html_tag(html[s_index:e_index])
            try:
                data = parser.unescape(data)
            except (UnicodeDecodeError, UnicodeEncodeError) as e:
                # Best effort: keep the still-escaped text when decoding fails.
                logger.info('Parse data error. [Exception]: %s' % (e))
            detail_data['album_intro'] = data
Example #4
0
    def xiami_parse_url(self, html, module, conf, fp, logger):
        """Write one tab-separated album record per ``<li>`` item to ``fp``.

        Every ``<li>...</li>`` item in ``html`` produces a line of the form
        ``album<TAB>singer<TAB>url<TAB>year<NEWLINE>``.  The url is the first
        ``href`` of the item, made absolute against ``http://www.xiami.com``.

        Args:
            html: Page markup; passed through ``tools.str_replace`` first.
            module: Name used only in log messages.
            conf: Not used by this method.
            fp: Object with a ``write`` method that receives each record.
            logger: Object with ``info`` and ``warn`` methods.

        Returns:
            Always ``0``.
        """
        logger.info("Start parse url for %s" % module)

        start_tag = "<li>"
        end_tag = "</li>"
        deep_url = "http://www.xiami.com%s"
        u_start = 'href="'
        u_end = '"'

        # Fields are extracted in this order; each one consumes the markup
        # it matched, so later fields are only searched after earlier ones.
        parse = [
            {"name": "album", "start": '<a class="song"', "end": "</a>"},
            {"name": "singer", "start": '<a class="singer"', "end": "</a>"},
            {"name": "year", "start": '<p class="year"', "end": "</p>"},
        ]

        html = tools.str_replace(html)
        while start_tag in html:
            start_index = html.find(start_tag)
            body_index = start_index + len(start_tag)
            # Only a closing tag that follows the opening tag ends the item;
            # a stray earlier "</li>" must not produce an empty record.
            end_index = html.find(end_tag, body_index)
            if end_index == -1:
                # Unterminated final item: its body is the rest of the page.
                end_index = len(html)

            sub_html = html[body_index:end_index]

            parse_info = {"album": "", "singer": "", "url": "", "year": ""}

            # parse deep page url
            url = "#"
            u_s_index = sub_html.find(u_start)
            if u_s_index != -1:
                rest = sub_html[u_s_index + len(u_start) :]
                u_e_index = rest.find(u_end)
                # Accept the attribute only when its closing quote exists.
                if u_e_index != -1:
                    url = rest[:u_e_index]
                    sub_html = rest[u_e_index + len(u_end) :]
            # startswith() is safe for an empty href, unlike url[0].
            if not url.startswith("/"):
                url = "/%s" % url
            parse_info["url"] = deep_url % url

            # parse album name, singer and year
            for obj in parse:
                start = obj["start"]
                end = obj["end"]
                data = ""

                s_index = sub_html.find(start)
                if s_index != -1:
                    e_index = sub_html.find(end, s_index)
                    if e_index == -1:
                        # No closing tag: the field runs to the item's end.
                        e_index = len(sub_html)
                    tmp_html = sub_html[s_index : e_index + len(end)]
                    data = tools.strip_html_tag(tmp_html)
                    sub_html = sub_html[e_index + len(end) :]
                parse_info[obj["name"]] = data

            # album singer url year
            w_str = "%s\t%s\t%s\t%s\n" % (
                parse_info["album"],
                parse_info["singer"],
                parse_info["url"],
                parse_info["year"],
            )
            try:
                fp.write(w_str)
            except Exception:
                # Best effort: a failed record must not abort the page, but
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                logger.warn("Write data fail for %s." % module)

            html = html[end_index + len(end_tag) :]

        logger.info("parse url is completed for %s." % module)

        return 0
Example #5
0
    def xiami_parse_url(self, html, module, conf, fp, logger):
        '''Write one tab-separated album record per ``<li>`` item to ``fp``.

        Every ``<li>...</li>`` item in ``html`` produces a line of the form
        ``album<TAB>singer<TAB>url<TAB>year<NEWLINE>``.  The url is the first
        ``href`` of the item, made absolute against ``http://www.xiami.com``.

        Args:
            html: Page markup; passed through ``tools.str_replace`` first.
            module: Name used only in log messages.
            conf: Not used by this method.
            fp: Object with a ``write`` method that receives each record.
            logger: Object with ``info`` and ``warn`` methods.

        Returns:
            Always ``0``.
        '''
        logger.info('Start parse url for %s' % module)

        start_tag = '<li>'
        end_tag = '</li>'
        deep_url = 'http://www.xiami.com%s'
        u_start = 'href="'
        u_end = '"'

        # Fields are extracted in this order; each one consumes the markup
        # it matched, so later fields are only searched after earlier ones.
        parse = [
            {
                'name': 'album',
                'start': '<a class="song"',
                'end': '</a>',
            },
            {
                'name': 'singer',
                'start': '<a class="singer"',
                'end': '</a>',
            },
            {
                'name': 'year',
                'start': '<p class="year"',
                'end': '</p>',
            },
        ]

        html = tools.str_replace(html)
        while start_tag in html:
            start_index = html.find(start_tag)
            body_index = start_index + len(start_tag)
            # Only a closing tag that follows the opening tag ends the item;
            # a stray earlier '</li>' must not produce an empty record.
            end_index = html.find(end_tag, body_index)
            if end_index == -1:
                # Unterminated final item: its body is the rest of the page.
                end_index = len(html)

            sub_html = html[body_index:end_index]

            parse_info = {'album': '', 'singer': '', 'url': '', 'year': ''}

            # parse deep page url
            url = '#'
            u_s_index = sub_html.find(u_start)
            if u_s_index != -1:
                rest = sub_html[u_s_index + len(u_start):]
                u_e_index = rest.find(u_end)
                # Accept the attribute only when its closing quote exists.
                if u_e_index != -1:
                    url = rest[:u_e_index]
                    sub_html = rest[u_e_index + len(u_end):]
            # startswith() is safe for an empty href, unlike url[0].
            if not url.startswith('/'):
                url = '/%s' % url
            parse_info['url'] = deep_url % url

            # parse album name, singer and year
            for obj in parse:
                start = obj['start']
                end = obj['end']
                data = ''

                s_index = sub_html.find(start)
                if s_index != -1:
                    e_index = sub_html.find(end, s_index)
                    if e_index == -1:
                        # No closing tag: the field runs to the item's end.
                        e_index = len(sub_html)
                    tmp_html = sub_html[s_index:e_index + len(end)]
                    data = tools.strip_html_tag(tmp_html)
                    sub_html = sub_html[e_index + len(end):]
                parse_info[obj['name']] = data

            # album singer url year
            w_str = '%s\t%s\t%s\t%s\n' % (
                parse_info['album'], parse_info['singer'], parse_info['url'],
                parse_info['year'])
            try:
                fp.write(w_str)
            except Exception:
                # Best effort: a failed record must not abort the page, but
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                logger.warn('Write data fail for %s.' % module)

            html = html[end_index + len(end_tag):]

        logger.info('parse url is completed for %s.' % module)

        return 0