Example #1
    def get_xml_parameters(self, url):
        bytes = common.get_url_content(url)

        pattern = "query +: +'([^']+)'"
        query = common.get_first_group_by_pattern(bytes, pattern)

        return query
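
All of these examples call into the project's common module, whose source is not shown here. The sketch below is a minimal, hypothetical stand-in for the two most heavily used helpers, get_url_content and get_first_group_by_pattern; the names come from the snippets, but the bodies are assumptions written in Python 2 to match the examples, not the project's actual implementation.

# Hypothetical stand-ins for the common module helpers used throughout.
import logging
import re
import urllib2

def get_url_content(url, data=None, headers=None):
    """Fetch a URL and return the raw response bytes, or None on failure.

    When data is given, the request is sent as a POST, which matches how the
    later examples pass post_data together with extra headers.
    """
    try:
        request = urllib2.Request(url, data, headers or {})
        return urllib2.urlopen(request).read()
    except Exception as e:
        logging.error('Failed to fetch url [%s]: %s', url, e)
        return None

def get_first_group_by_pattern(text, pattern):
    """Return the first capture group of pattern found in text, or None."""
    match = re.search(pattern, text)
    return match.group(1) if match else None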
Example #2
    def find_lyric(self, url):
        pattern = 'surl=([^&=]+)'

        song_id = common.get_first_group_by_pattern(url, pattern)

        if not song_id:
            logging.error('Failed to get id of url [%s]', url)
            return False

        song_url = 'http://www.animap.jp/kasi/phpflash/flashphp.php?unum=' + song_id
        data = common.get_url_content(song_url)
        if not data:
            logging.error('Failed to get content of url [%s]', song_url)
            return False

        prefix = 'test2='
        pos = data.find(prefix)
        if pos == -1:
            logging.error('Failed to find lyric position of url [%s]', url)
            return False

        lyric = data[pos + len(prefix):]
        lyric = lyric.decode('sjis').strip()

        # convert half-width characters to full-width
        lyric = common.half2full(lyric)

        self.lyric = lyric

        return True
Example #3
def download_search_result():
    import re
    import urlparse

    url = 'http://www.uta-net.com/search/?Aselect=1&Bselect=3&Keyword=KOKIA&sort=6'
    output = 'uta_net.search.txt'

    html = common.get_url_content(url)
    if not html:
        logging.error('Failed to download url [%s]' % (url, ))
        return False

    pattern = '<td class="side td1"><a href="([^"]+)">'
    songs = re.findall(pattern, html)

    out = open(output, 'wb')
    for song in songs:
        print song
        # site_url and get_lyric are assumed to be module-level definitions
        song_url = urlparse.urljoin(site_url, song)
        full = get_lyric(song_url)

        out.write(full.encode('utf-8'))
        out.write('\n\n=====\n')

    out.close()
Example #4
    def find_lyric(self, url):
        pattern = r'item-([0-9]+)\.html'

        song_id = common.get_first_group_by_pattern(url, pattern)

        if not song_id:
            logging.info('Failed to get id of url [%s]', url)
            return False

        song_url = 'http://www.kasi-time.com/item_js.php?no=' + song_id
        data = common.get_url_content(song_url)
        if not data:
            logging.info('Failed to get content of url [%s]', song_url)
            return False

        lyric = data.decode('utf-8', 'ignore')
        lyric = lyric.replace("document.write('", "")
        lyric = lyric.replace("');", "")
        lyric = lyric.replace("<br>", "\n")
        lyric = lyric.replace("&nbsp;", " ")
        lyric = common.htmlspecialchars_decode(lyric)
        lyric = common.unicode2string(lyric)
        lyric = common.strip_slash(lyric)
        lyric = lyric.strip()

        # convert half-width characters to full-width
        lyric = common.half2full(lyric)

        self.lyric = lyric

        return True
Example #5
    def find_song_info(self, url):
        ret = True
        resp = common.get_url_content(url)

        encoding = 'sjis'
        html = resp.decode(encoding, 'ignore')

        prefix = '<table border=0 cellpadding=0 cellspacing=5>'
        suffix = '</td></table>'
        infoString = common.get_string_by_start_end_string(prefix, suffix, html)

        self.title = common.strip_tags(
            common.get_string_by_start_end_string('<td>', '</td>', infoString)
        )

        self.artist = common.strip_tags(
            common.get_string_by_start_end_string('<td><a href=', '</a></td>', infoString)
        )

        prefix = '<table border=0 cellpadding=0 cellspacing=0>'
        suffix = '</td></table>'
        lyricAndMusic = common.get_string_by_start_end_string(prefix, suffix, infoString)

        pattern = u'作詞 : (.*)<br>'
        self.lyricist = common.get_first_group_by_pattern(lyricAndMusic, pattern)

        pattern = u'作曲 : (.*)</td>'
        self.composer = common.get_first_group_by_pattern(lyricAndMusic, pattern)

        return ret
Example #6
    def find_song_info(self, url):
        ret = True
        html = common.get_url_content(url)

        encoding = 'sjis'
        html = html.decode(encoding, 'ignore')

        patterns = {
            'title': u'<h2[^>]*>([^<]+)</h2>',
            'artist': u'歌手:<h3.*?><a href="/artist/[0-9]+/".*?>(.+?)</a></h3>',
            'lyricist': u'作詞:<h4.*?>([^<]+)</h4>',
            'composer': u'作曲:<h4.*?>([^<]+)</h4>'
        }

        for key in patterns:
            pattern = patterns[key]

            value = common.get_first_group_by_pattern(html, pattern)

            if not value:
                logging.info('Failed to get %s of url [%s]', key, url)
                ret = False
            else:
                value = common.unicode2string(common.strip_tags(value))
                setattr(self, key, value)

        return ret
Example #7
    def find_song_info(self, url):
        ret = True
        html = common.get_url_content(url)

        encoding = 'euc_jp'
        html = html.decode(encoding, 'ignore')

        patterns = {
            'title': 'title',
            'artist': 'artist',
            'lyricist': 'sakusi',
            'composer': 'sakyoku',
        }

        for key in patterns:
            key_for_pattern = patterns[key]

            pattern = u'<INPUT type="hidden" name=%s value="([^"]*)">' % (key_for_pattern, )
            value = common.get_first_group_by_pattern(html, pattern)

            if not value:
                logging.info('Failed to get %s of url [%s]', key, url)
                ret = False
            else:
                value = common.htmlspecialchars_decode(value).strip()
                setattr(self, key, value)

        return ret
Example #8
    def get_lyric_json(self, content):
        line = content.replace('\r\n', '')

        prefix = 'function showLyric'
        suffix = '$.ajax'
        line = common.find_string_by_prefix_suffix(line, prefix, suffix)
        if not line:
            logging.error('Failed to find the showLyric block in content')
            return False

        prefix = 'var data ='
        suffix = ';'
        line = common.find_string_by_prefix_suffix(line, prefix, suffix)
        if not line:
            logging.error('Failed to find the lyric post data in content')
            return False

        post_data = self.convert_js_to_url(line)
        logging.debug('post data: %s' % (post_data, ))

        lyric_url = 'http://music-book.jp/music/MusicDetail/GetLyric'
        raw_json = common.get_url_content(lyric_url, data=post_data)
        if not raw_json:
            logging.error('Failed to get json of url [%s]' % (lyric_url, ))
            return False

        return json.loads(raw_json)
Example #9
    def get_html(self, url):
        data = common.get_url_content(url)
        if not data:
            logging.error('Failed to get content of url [%s]', url)
            return False

        html = data.decode('utf-8', 'ignore')
        return html
Example #10
    def get_html(self, url):
        html = common.get_url_content(url)
        if not html:
            return False

        html = html.decode('sjis', 'ignore')

        return html
Example #11
    def get_image_list(self):
        content = common.get_url_content(self.url)

        image_list = self.parse_image_list(content)

        if len(image_list) > 0:
            image_list = ['%s%s' % (self.image_server, img, ) for img in image_list]
        return image_list
Example #12
    def get_page_content(self, url):
        content = common.get_url_content(url)
        if not content:
            logging.info('Failed to get content of url [%s]', url)
            return False

        content = content.decode('utf-8', 'ignore')

        return content
Example #13
    def get_image_server(self):
        url = "http://%s%s" % (self.host, self.serverListJS)
        content = common.get_url_content(url)

        self.image_server = self.parse_image_server(content)

        if self.image_server == "":
            raise Exception("failed to find image_server")

        logging.debug("image_server: %s" % (self.image_server,))
Example #14
def get_weibo_urls_in_pagebar(page, bar, id):
	pre_page, page, pagebar = get_page_paras(page, bar)
	#http://weibo.com/u/1760242980
	pagebar_url = 'http://weibo.com/p/aj/mblog/mbloglist?pre_page=' + str(pre_page) + '&page=' + str(page) + '&id=100505' + str(id) + '&pagebar=' + str(pagebar)
	print pagebar_url
	content = common.get_url_content(pagebar_url)
	urlList = re.findall(reSettings.weibo_url_pat, content)
	urlList = [(url.replace('\\', '')) for url in urlList]
	#print len(urlList)
	return urlList
Example #15
    def get_image_server(self):
        url = 'http://%s%s' % (self.host, self.serverListJS)
        content = common.get_url_content(url)

        self.image_server = self.parse_image_server(content)

        if self.image_server == '':
            raise Exception('failed to find image_server')

        logging.debug('image_server: %s' % (self.image_server, ) )
Example #16
    def get_lyric_html(self, url):
        encoding = "sjis"

        raw = common.get_url_content(url)
        if not raw:
            logging.error("Failed to get content of url [%s]", url)
            return False

        html = raw.decode(encoding, "ignore")
        return html
Example #17
    def get_xml(self, query):
        # http://rio.yahooapis.jp/RioWebService/V2/getLyrics?appid=7vOgnk6xg64IDggn6YEl3IQxmbj1qqkQzTpAx5nGwl9HnfPX3tZksE.oYhEw3zA-&lyrics_id=Y152097&results=1&multi_htmlspecialchars_flag=1
        xmlpath = 'http://rio.yahooapis.jp/RioWebService/V2/getLyrics?appid=%s&%s' % (
            '7vOgnk6xg64IDggn6YEl3IQxmbj1qqkQzTpAx5nGwl9HnfPX3tZksE.oYhEw3zA-', unquote(query)
        )
        
        bytes = common.get_url_content(xmlpath)

        logging.debug(bytes)

        return bytes
Example #18
    def get_url_content(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2141.0 Safari/537.36'
        }

        html = common.get_url_content(url, data=None, headers=headers)
        if not html:
            logging.error('Failed to get html of url [%s]' % (url, ))
            return False

        return html.decode('utf-8', 'ignore')
Example #19
def get_follower_infos(id):
	followerListUrl = 'http://weibo.com/p/100505' + str(id) + '/follow?relate=fans&page='
	followerList = []
	page = 1
	while True:
		url = followerListUrl + str(page)
		page = page + 1
		content = common.get_url_content(url)
		friend_list_pat = re.compile(r'<li class=\\\"clearfix S_line1\\\".*?uid=(.*?)&fnick=(.*?)&sex=(.*?)\\\">')
		follower_list_list = re.findall(friend_list_pat, content)
		if len(follower_list_list) == 0:
			break
		for x in follower_list_list:
			followerList.append(Friend.FriendInfo(x[0], x[1], x[2]))
	return followerList
Example #20
    def get_hidden_params(self):
        """ get new hidden parameters of kashinavi. need zlib support. """ 
        url = 'http://www.kashinavi.com/song_view.swf'

        # add random value to avoid Google cache old value
        url = '%s?time=%.0f' % (url, time.time())
        logging.debug('url:%s' % (url, ))

        data = common.get_url_content(url)

        if data[0:3] == 'CWS':
            # compressed swf
            compressed_str = data[8:]
            uncompressed_str = zlib.decompress(compressed_str)
        elif data[0:3] == 'FWS':
            # uncompressed swf
            uncompressed_str = data
        else:
            # not valid swf, just return
            return

        prefix = '\0LoadVars\0'
        suffix = '\0myLoadVars'

        u = uncompressed_str 
        m = u[u.find(prefix)+len(prefix): u.find(suffix)]
        items = m.split('\0')

        if len(items) != 3:
            # unexpected swf content; cannot locate the hidden parameter pair
            logging.error('Failed to find hidden parameters in swf')
            return

        target_name = items[1]
        target_value = items[2]

        middle_value = m.replace('\0', ' ')

        query_prefix = '%s=%s&file_no=' % (target_name, target_value, )

        para = {
            'query_prefix': query_prefix,
            'middle_value': middle_value,
        }

        logging.debug('query_prefix:%s' % (query_prefix, ))

        return para
Example #21
    def get_song_json(self, song_id):
        json_url = 'https://mspxy.joysound.com/Common/Lyric'
        post_data = 'kind=naviGroupId&selSongNo=%s&interactionFlg=0&apiVer=1.0' % (song_id, )
        headers = {
            'X-JSP-APP-NAME': '0000800'
        }

        json_str = common.get_url_content(json_url, post_data, headers)

        if not json_str:
            logging.info('Failed to get json from url [%s]', json_url)
            return False

        obj = json.loads(json_str)

        return obj
Example #22
def getAndSaveData():
    for year in years:
        for no in range(1, gameCount + 1):
            gameUrl = GAME_URL.format(year=year, gameNo=no)
            url = DATA_URL.format(year=year) + gameUrl

            try:
                text = common.get_url_content(url)
            except Exception:
                continue

            data = common.parse_json_str(text)

            stt = data["g"]["stt"]
            # skip games that have not finished yet
            if not stt.endswith("Final"):
                print '%s game is not finished' % gameUrl
                continue
            vls = data["g"]["vls"]
            hls = data["g"]["hls"]

            vs = int(vls['s'])
            hs = int(hls['s'])
            # 1.0 if the home team won, 0.0 if it lost
            w = 1.0 if hs > vs else 0.0

            di = {}
            di.update({"win": w})

            # per-stat difference between home and visiting team
            vsts = vls['tstsg']
            hsts = hls['tstsg']
            for k in hsts:
                di.update({k: int(hsts[k]) - int(vsts[k])})

            # team names
            vn = vls["ta"]
            hn = hls["ta"]
            di.update({"home": hn, "away": vn})
            # game date
            date = data["g"]["gdtutc"]
            di.update({"date": date})

            key = gameUrl
            cli.hset("gamedetaildiff", key, str(di))
            print "%s save successfully." % key
            cli.hset("gamedetail", key, text)
Example #23
    def find_song_info(self, url):
        ret = True
        html = common.get_url_content(url)

        encoding = 'euc_jp'
        html = html.decode(encoding, 'ignore')

        prefix = '<TABLE cellspacing="1"'
        suffix = '</TABLE>'
        info_table = common.find_string_by_prefix_suffix(html, prefix, suffix)

        def get_info_value_by_key(html, key):
            valuePrefix = '#ffffff>&nbsp;'
            valueSuffix = '</TD>'
            lenPrefix = len(valuePrefix)
            posKey = html.find(key)
            logging.debug('key position: %d' % (posKey))

            posStart = html.find(valuePrefix, posKey) + lenPrefix
            posEnd = html.find(valueSuffix, posStart)
            logging.debug('position [%d:%d]' % (posStart, posEnd))

            value = html[posStart:posEnd]
            return value

        patterns = {
            'title': u'曲名</TD>',
            'artist': u'歌手</TD>',
            'lyricist': u'作詞</TD>',
            'composer': u'作曲</TD>',
        }

        for key in patterns:
            pattern = patterns[key]
            value = get_info_value_by_key(info_table, pattern)

            if not value:
                logging.info('Failed to get %s of url [%s]', key, url)
                ret = False
            else:
                value = common.htmlspecialchars_decode(value).strip()
                setattr(self, key, value)

        return ret
Example #24
def get_followed_infos(id):
	next_cursor = 0
	followedListUrl = 'https://api.weibo.com/2/friendships/friends.json?uid=' + str(id) + '&cursor=' + str(next_cursor) + '&access_token=2.00YlCmSCMYmPKE8da11d9359t_17GE'
	followedList = []
	page = 1
	while True:
		url = followedListUrl + str(page)
		print page
		page = page + 1
		print url
		content = common.get_url_content(url)
		friend_list_pat = re.compile(r'<li class=\\\"clearfix S_line1\\\".*?uid=(.*?)&fnick=(.*?)&sex=(.*?)\\\">')
		followed_list_list = re.findall(friend_list_pat, content)
		if len(followed_list_list) == 0:
			break
		print url
		for x in followed_list_list:
			followedList.append(Friend.FriendInfo(x[0], x[1], x[2]))
	return followedList
Example #25
    def get_lyric_content(self, url):
        lyric_api_url = 'http://kashisearch.jp/api/lyrics'

        lyric_id = self.get_lyric_id(url)
        if not lyric_id:
            return False

        post_data = 'id=%s' % (lyric_id, )
        headers = {
            'X-Requested-With': 'XMLHttpRequest' 
        }
        raw_lyric = common.get_url_content(lyric_api_url, post_data, headers)
        if not raw_lyric:
            logging.error('Failed to get lyric content, url [%s]', url)
            return False

        raw_lyric = raw_lyric.decode('utf-8', 'ignore')

        return raw_lyric
Example #26
    def parse_page(self):
        url = self.url

        content = common.get_url_content(url)
        if not content:
            logging.info('Failed to get content of url [%s]', url)
            return False

        content = content.decode('utf-8', 'ignore')

        if not self.find_lyric(content):
            logging.info('Failed to get lyric of url [%s]', url)
            return False

        if not self.find_song_info(content):
            logging.info('Failed to get song info of url [%s]', url)
            return False 

        return True
Example #27
    def find_song_info(self, url):
        ret = True
        html = common.get_url_content(url)

        encoding = 'utf-8'
        html = html.decode(encoding, 'ignore')

        pattern = '<h1>(.*)</h1>'
        value = common.get_first_group_by_pattern(html, pattern)
        if value:
            self.title = value.strip()
        else:
            logging.error('Failed to find title of url [%s]', url)
            ret = False

        prefix = '<div class="person_list">'
        suffix = '</div>'
        info_table = common.find_string_by_prefix_suffix(html, prefix, suffix)

        patterns = {
            'artist': u'歌手',
            'lyricist': u'作詞者',
            'composer': u'作曲者',
            'arranger': u'編曲者',
        }

        for key in patterns:
            pattern = patterns[key]

            prefix = u'<th>%s</th>' % (pattern)
            suffix = '</td>'

            value = common.find_string_by_prefix_suffix(info_table, prefix, suffix, False)
            if not value:
                continue
            value = common.strip_tags(value).strip()
            if value:
                setattr(self, key, value)

        return ret
Example #28
    def find_lyric(self, url):
        pattern = r'\?([0-9]+)'

        song_id = common.get_first_group_by_pattern(url, pattern)
        if not song_id:
            logging.error('Failed to get id of url [%s]', url)
            return False

        params = self.get_hidden_params()

        query = '%s%s&time=%s' % (params['query_prefix'], song_id, time.localtime(), )
        logging.debug('query:%s' % (query, ))

        post_url = 'http://www.kashinavi.com/cgi-bin/kashi.cgi'
        resp = common.get_url_content(post_url, query)
        if not resp:
            logging.error('Failed to get content of url [%s], query [%s]', post_url, query)
            return False

        raw_lyric = resp.decode('utf-8', 'ignore')

        # if parsing rule changed, return debug info
        if raw_lyric.find(u'歌詞ナビTOPページより') > 0:
            self.lyric = '''
Site rule changed!
Please contact franklai
LoadVars::%s::myLoadVars
''' % (params['middle_value'])
            return True
        
        # else remove the useless part and return lyric
        front_str = 'kashi='
        start = raw_lyric.find(front_str) + len(front_str)
        lyric = raw_lyric[start:]
        lyric = lyric.strip()

        self.lyric = lyric

        return True
Example #29
    def find_lyric(self, url):
        pattern = '/[a-z]+/([0-9]+)/'

        song_id = common.get_first_group_by_pattern(url, pattern)
        if not song_id:
            # try old pattern
            # http://www.uta-net.com/user/phplib/view_0.php?ID=17248
            pattern = 'ID=([0-9]+)'
            song_id = common.get_first_group_by_pattern(url, pattern)

        if not song_id:
            logging.info('Failed to get id of url [%s]', url)
            return False

        showkasi_pattern = 'http://www.uta-net.com/user/phplib/swf/showkasi.php?ID=%s&WIDTH=530&HEIGHT=810'
        song_url = showkasi_pattern % (song_id, )
        data = common.get_url_content(song_url)
        if not data:
            logging.info('Failed to get content of url [%s]', song_url)
            return False

        prefix = '<\0\0'
        suffix = '\0'
        lyric = common.find_string_by_prefix_suffix(data, prefix, suffix, False)

        if not lyric:
            logging.error('Failed to get lyric of url [%s]', url)
            return False

        lyric = unicode(lyric, 'utf8')
        lyric = lyric.strip()

        # convert half-width characters to full-width
        lyric = common.half2full(lyric)

        self.lyric = lyric

        return True
Example #30
    def get_lyric_html(self, url):
        raw = common.get_url_content(url)

        html = raw.decode('utf-8', 'ignore')

        return html
Example #31
    def get_content(self):
        """Get the content of the wiki enquiry."""
        formatted_url = common.format_url(self.WIKIAPI, self.WIKIAPI_FLAGS, self.keywords)
        self.raw_results = common.get_url_content(formatted_url)
        return