def get_xml_parameters(self, url):
    # Pull the "query : '...'" value embedded in the page's JavaScript.
    data = common.get_url_content(url)
    pattern = "query +: +'([^']+)'"
    query = common.get_first_group_by_pattern(data, pattern)
    return query

def find_lyric(self, url):
    pattern = 'surl=([^&=]+)'
    song_id = common.get_first_group_by_pattern(url, pattern)
    if not song_id:
        logging.error('Failed to get id of url [%s]', url)
        return False

    song_url = 'http://www.animap.jp/kasi/phpflash/flashphp.php?unum=' + song_id
    data = common.get_url_content(song_url)
    if not data:
        logging.error('Failed to get content of url [%s]', song_url)
        return False

    prefix = 'test2='
    pos = data.find(prefix)
    if pos == -1:
        logging.error('Failed to find lyric position of url [%s]', url)
        return False

    lyric = data[pos + len(prefix):]
    lyric = lyric.decode('sjis').strip()

    # test for half to full
    lyric = common.half2full(lyric)

    self.lyric = lyric
    return True

def download_search_result():
    url = 'http://www.uta-net.com/search/?Aselect=1&Bselect=3&Keyword=KOKIA&sort=6'
    output = 'uta_net.search.txt'
    html = common.get_url_content(url)
    if not html:
        logging.error('Failed to download url [%s]', url)
        return False

    import re
    import urlparse

    pattern = '<td class="side td1"><a href="([^"]+)">'
    songs = re.findall(pattern, html)

    out = open(output, 'wb')
    for song in songs:
        print song
        # site_url and get_lyric are assumed to be defined elsewhere in this module
        song_url = urlparse.urljoin(site_url, song)
        full = get_lyric(song_url)
        out.write(full.encode('utf-8'))
        out.write('\n\n=====\n')
    out.close()

def find_lyric(self, url):
    pattern = r'item-([0-9]+)\.html'
    song_id = common.get_first_group_by_pattern(url, pattern)
    if not song_id:
        logging.info('Failed to get id of url [%s]', url)
        return False

    song_url = 'http://www.kasi-time.com/item_js.php?no=' + song_id
    data = common.get_url_content(song_url)
    if not data:
        logging.info('Failed to get content of url [%s]', song_url)
        return False

    # The endpoint returns a document.write() JavaScript snippet; unwrap it.
    lyric = data.decode('utf-8', 'ignore')
    lyric = lyric.replace("document.write('", "")
    lyric = lyric.replace("');", "")
    lyric = lyric.replace('<br>', '\n')
    # normalize HTML non-breaking spaces
    lyric = lyric.replace('&nbsp;', ' ')

    lyric = common.htmlspecialchars_decode(lyric)
    lyric = common.unicode2string(lyric)
    lyric = common.strip_slash(lyric)
    lyric = lyric.strip()

    # test for half to full
    lyric = common.half2full(lyric)

    self.lyric = lyric
    return True

def find_song_info(self, url):
    ret = True
    resp = common.get_url_content(url)
    encoding = 'sjis'
    html = resp.decode(encoding, 'ignore')

    prefix = '<table border=0 cellpadding=0 cellspacing=5>'
    suffix = '</td></table>'
    info_string = common.get_string_by_start_end_string(prefix, suffix, html)

    self.title = common.strip_tags(
        common.get_string_by_start_end_string('<td>', '</td>', info_string))
    self.artist = common.strip_tags(
        common.get_string_by_start_end_string('<td><a href=', '</a></td>', info_string))

    prefix = '<table border=0 cellpadding=0 cellspacing=0>'
    suffix = '</td></table>'
    lyric_and_music = common.get_string_by_start_end_string(prefix, suffix, info_string)

    pattern = u'作詞 : (.*)<br>'
    self.lyricist = common.get_first_group_by_pattern(lyric_and_music, pattern)

    pattern = u'作曲 : (.*)</td>'
    self.composer = common.get_first_group_by_pattern(lyric_and_music, pattern)

    return ret

def find_song_info(self, url):
    ret = True
    html = common.get_url_content(url)
    encoding = 'sjis'
    html = html.decode(encoding, 'ignore')

    patterns = {
        'title': u'<h2[^>]*>([^<]+)</h2>',
        'artist': u'歌手:<h3.*?><a href="/artist/[0-9]+/".*?>(.+?)</a></h3>',
        'lyricist': u'作詞:<h4.*?>([^<]+)</h4>',
        'composer': u'作曲:<h4.*?>([^<]+)</h4>',
    }
    for key, pattern in patterns.items():
        value = common.get_first_group_by_pattern(html, pattern)
        if not value:
            logging.info('Failed to get %s of url [%s]', key, url)
            ret = False
        else:
            value = common.unicode2string(common.strip_tags(value))
            setattr(self, key, value)

    return ret

def find_song_info(self, url):
    ret = True
    html = common.get_url_content(url)
    encoding = 'euc_jp'
    html = html.decode(encoding, 'ignore')

    # Song metadata is carried in hidden <INPUT> fields; map our attribute
    # names to the site's form field names.
    patterns = {
        'title': 'title',
        'artist': 'artist',
        'lyricist': 'sakusi',
        'composer': 'sakyoku',
    }
    for key, field_name in patterns.items():
        pattern = u'<INPUT type="hidden" name=%s value="([^"]*)">' % (field_name, )
        value = common.get_first_group_by_pattern(html, pattern)
        if not value:
            logging.info('Failed to get %s of url [%s]', key, url)
            ret = False
        else:
            value = common.htmlspecialchars_decode(value).strip()
            setattr(self, key, value)

    return ret

def get_lyric_json(self, content):
    line = content.replace('\r\n', '')

    # Narrow down to the showLyric() function body, then to its data variable.
    prefix = 'function showLyric'
    suffix = '$.ajax'
    line = common.find_string_by_prefix_suffix(line, prefix, suffix)
    if not line:
        logging.error('Failed to find showLyric block in content')
        return False

    prefix = 'var data ='
    suffix = ';'
    line = common.find_string_by_prefix_suffix(line, prefix, suffix)
    if not line:
        logging.error('Failed to find lyric data in showLyric block')
        return False

    post_data = self.convert_js_to_url(line)
    logging.debug('post data: %s', post_data)

    lyric_url = 'http://music-book.jp/music/MusicDetail/GetLyric'
    raw_json = common.get_url_content(lyric_url, data=post_data)
    if not raw_json:
        logging.error('Failed to get json of url [%s]', lyric_url)
        return False

    return json.loads(raw_json)

def get_html(self, url):
    data = common.get_url_content(url)
    if not data:
        logging.error('Failed to get content of url [%s]', url)
        return False

    html = data.decode('utf-8', 'ignore')
    return html

def get_html(self, url):
    html = common.get_url_content(url)
    if not html:
        return False

    html = html.decode('sjis', 'ignore')
    return html

def get_image_list(self):
    content = common.get_url_content(self.url)
    images = self.parse_image_list(content)
    if len(images) > 0:
        # Image paths are relative; prepend the resolved image server host.
        images = ['%s%s' % (self.image_server, img) for img in images]
    return images

def get_page_content(self, url):
    content = common.get_url_content(url)
    if not content:
        logging.info('Failed to get content of url [%s]', url)
        return False

    content = content.decode('utf-8', 'ignore')
    return content

def get_image_server(self):
    url = "http://%s%s" % (self.host, self.serverListJS)
    content = common.get_url_content(url)
    self.image_server = self.parse_image_server(content)
    if self.image_server == "":
        raise Exception("failed to find image_server")
    logging.debug("image_server: %s" % (self.image_server, ))

def get_weibo_urls_in_pagebar(page, bar, id):
    pre_page, page, pagebar = get_page_paras(page, bar)
    # e.g. http://weibo.com/u/1760242980
    pagebar_url = ('http://weibo.com/p/aj/mblog/mbloglist?pre_page=' + str(pre_page)
                   + '&page=' + str(page) + '&id=100505' + str(id)
                   + '&pagebar=' + str(pagebar))
    print pagebar_url
    content = common.get_url_content(pagebar_url)
    url_list = re.findall(reSettings.weibo_url_pat, content)
    # The JSON payload escapes slashes, so strip the backslashes.
    url_list = [url.replace('\\', '') for url in url_list]
    return url_list

def get_lyric_html(self, url):
    encoding = "sjis"
    raw = common.get_url_content(url)
    if not raw:
        logging.error("Failed to get content of url [%s]", url)
        return False

    html = raw.decode(encoding, "ignore")
    return html

def get_xml(self, query):
    # http://rio.yahooapis.jp/RioWebService/V2/getLyrics?appid=7vOgnk6xg64IDggn6YEl3IQxmbj1qqkQzTpAx5nGwl9HnfPX3tZksE.oYhEw3zA-&lyrics_id=Y152097&results=1&multi_htmlspecialchars_flag=1
    xmlpath = 'http://rio.yahooapis.jp/RioWebService/V2/getLyrics?appid=%s&%s' % (
        '7vOgnk6xg64IDggn6YEl3IQxmbj1qqkQzTpAx5nGwl9HnfPX3tZksE.oYhEw3zA-',
        unquote(query),
    )
    data = common.get_url_content(xmlpath)
    logging.debug(data)
    return data

def get_url_content(self, url):
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/39.0.2141.0 Safari/537.36'),
    }
    html = common.get_url_content(url, data=None, headers=headers)
    if not html:
        logging.error('Failed to get html of url [%s]', url)
        return False

    return html.decode('utf-8', 'ignore')

def get_follower_infos(id):
    follower_list_url = 'http://weibo.com/p/100505' + str(id) + '/follow?relate=fans&page='
    follower_list = []
    # The uid/nick/sex triples sit inside JSON-escaped HTML, hence the \\\" in
    # the pattern. Compile once, outside the paging loop.
    friend_list_pat = re.compile(
        r'<li class=\\\"clearfix S_line1\\\".*?uid=(.*?)&fnick=(.*?)&sex=(.*?)\\\">')
    page = 1
    while True:
        url = follower_list_url + str(page)
        page = page + 1
        content = common.get_url_content(url)
        matches = friend_list_pat.findall(content)
        if len(matches) == 0:
            break
        for x in matches:
            follower_list.append(Friend.FriendInfo(x[0], x[1], x[2]))
    return follower_list

def get_hidden_params(self):
    """
    Get the new hidden parameters of kashinavi. Needs zlib support.
    """
    url = 'http://www.kashinavi.com/song_view.swf'
    # add a random value to avoid Google caching the old value
    url = '%s?time=%.0f' % (url, time.time())
    logging.debug('url:%s' % (url, ))

    data = common.get_url_content(url)
    if not data:
        return
    if data[0:3] == 'CWS':
        # compressed swf: the body after the 8-byte header is zlib-compressed
        compressed_str = data[8:]
        uncompressed_str = zlib.decompress(compressed_str)
    elif data[0:3] == 'FWS':
        # uncompressed swf
        uncompressed_str = data
    else:
        # not a valid swf, just return
        return

    # The parameter name/value pair sits between the LoadVars markers,
    # NUL-separated.
    prefix = '\0LoadVars\0'
    suffix = '\0myLoadVars'
    u = uncompressed_str
    m = u[u.find(prefix) + len(prefix):u.find(suffix)]
    items = m.split('\0')
    if len(items) != 3:
        logging.error('Unexpected LoadVars layout: %s', m.replace('\0', ' '))
        return
    target_name = items[1]
    target_value = items[2]
    middle_value = m.replace('\0', ' ')

    query_prefix = '%s=%s&file_no=' % (target_name, target_value, )
    para = {
        'query_prefix': query_prefix,
        'middle_value': middle_value,
    }
    logging.debug('query_prefix:%s' % (query_prefix, ))
    return para

def get_song_json(self, song_id):
    json_url = 'https://mspxy.joysound.com/Common/Lyric'
    post_data = 'kind=naviGroupId&selSongNo=%s&interactionFlg=0&apiVer=1.0' % (song_id, )
    headers = {
        'X-JSP-APP-NAME': '0000800',
    }
    json_str = common.get_url_content(json_url, post_data, headers)
    if not json_str:
        logging.info('Failed to get json from url [%s]', json_url)
        return False

    obj = json.loads(json_str)
    return obj

def getAndSaveData():
    for year in years:
        for no in range(1, gameCount + 1):
            gameUrl = GAME_URL.format(year=year, gameNo=no)
            url = DATA_URL.format(year=year) + gameUrl
            key = gameUrl
            try:
                text = common.get_url_content(url)
            except Exception:
                continue
            data = common.parse_json_str(text)

            # has the game finished?
            stt = data["g"]["stt"]
            if not stt.endswith("Final"):
                print '%s game is not finished' % key
                continue

            vls = data["g"]["vls"]
            hls = data["g"]["hls"]
            vs = int(vls['s'])
            hs = int(hls['s'])

            # did the home team win or lose?
            w = 1.0 if hs > vs else 0.0
            di = {}
            di.update({"win": w})

            # per-stat difference between home and visitor
            vsts = vls['tstsg']
            hsts = hls['tstsg']
            for k in hsts:
                di.update({k: int(hsts[k]) - int(vsts[k])})

            # team names
            vn = vls["ta"]
            hn = hls["ta"]
            di.update({"home": hn, "away": vn})

            # game date
            date = data["g"]["gdtutc"]
            di.update({"date": date})

            cli.hset("gamedetaildiff", key, str(di))
            print "%s saved successfully." % key
            cli.hset("gamedetail", key, text)

def find_song_info(self, url):
    ret = True
    html = common.get_url_content(url)
    encoding = 'euc_jp'
    html = html.decode(encoding, 'ignore')

    prefix = '<TABLE cellspacing="1"'
    suffix = '</TABLE>'
    info_table = common.find_string_by_prefix_suffix(html, prefix, suffix)

    def get_info_value_by_key(table_html, key):
        # The value cell follows the key cell and opens with bgcolor #ffffff.
        value_prefix = '#ffffff> '
        value_suffix = '</TD>'
        pos_key = table_html.find(key)
        logging.debug('key position: %d' % (pos_key, ))
        pos_start = table_html.find(value_prefix, pos_key) + len(value_prefix)
        pos_end = table_html.find(value_suffix, pos_start)
        logging.debug('position [%d:%d]' % (pos_start, pos_end))
        return table_html[pos_start:pos_end]

    patterns = {
        'title': u'曲名</TD>',
        'artist': u'歌手</TD>',
        'lyricist': u'作詞</TD>',
        'composer': u'作曲</TD>',
    }
    for key, pattern in patterns.items():
        value = get_info_value_by_key(info_table, pattern)
        if not value:
            logging.info('Failed to get %s of url [%s]', key, url)
            ret = False
        else:
            value = common.htmlspecialchars_decode(value).strip()
            setattr(self, key, value)

    return ret

def get_followed_infos(id):
    next_cursor = 0
    followed_list_url = ('https://api.weibo.com/2/friendships/friends.json?uid=' + str(id)
                         + '&cursor=' + str(next_cursor)
                         + '&access_token=2.00YlCmSCMYmPKE8da11d9359t_17GE')
    followed_list = []
    friend_list_pat = re.compile(
        r'<li class=\\\"clearfix S_line1\\\".*?uid=(.*?)&fnick=(.*?)&sex=(.*?)\\\">')
    page = 1
    while True:
        url = followed_list_url + '&page=' + str(page)
        print url
        page = page + 1
        content = common.get_url_content(url)
        followed_matches = friend_list_pat.findall(content)
        if len(followed_matches) == 0:
            break
        for x in followed_matches:
            followed_list.append(Friend.FriendInfo(x[0], x[1], x[2]))
    return followed_list

def get_lyric_content(self, url):
    lyric_api_url = 'http://kashisearch.jp/api/lyrics'
    lyric_id = self.get_lyric_id(url)
    if not lyric_id:
        return False

    post_data = 'id=%s' % (lyric_id, )
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
    }
    raw_lyric = common.get_url_content(lyric_api_url, post_data, headers)
    if not raw_lyric:
        logging.error('Failed to get lyric content, url [%s]', url)
        return False

    raw_lyric = raw_lyric.decode('utf-8', 'ignore')
    return raw_lyric

def parse_page(self):
    url = self.url
    content = common.get_url_content(url)
    if not content:
        logging.info('Failed to get content of url [%s]', url)
        return False

    content = content.decode('utf-8', 'ignore')

    if not self.find_lyric(content):
        logging.info('Failed to get lyric of url [%s]', url)
        return False
    if not self.find_song_info(content):
        logging.info('Failed to get song info of url [%s]', url)
        return False

    return True

def find_song_info(self, url):
    ret = True
    html = common.get_url_content(url)
    encoding = 'utf-8'
    html = html.decode(encoding, 'ignore')

    pattern = '<h1>(.*)</h1>'
    value = common.get_first_group_by_pattern(html, pattern)
    if value:
        self.title = value.strip()
    else:
        logging.error('Failed to find title of url [%s]', url)
        ret = False

    prefix = '<div class="person_list">'
    suffix = '</div>'
    info_table = common.find_string_by_prefix_suffix(html, prefix, suffix)

    patterns = {
        'artist': u'歌手',
        'lyricist': u'作詞者',
        'composer': u'作曲者',
        'arranger': u'編曲者',
    }
    for key, label in patterns.items():
        prefix = u'<th>%s</th>' % (label, )
        suffix = '</td>'
        value = common.find_string_by_prefix_suffix(info_table, prefix, suffix, False)
        if not value:
            continue
        value = common.strip_tags(value).strip()
        if value:
            setattr(self, key, value)

    return ret

def find_lyric(self, url):
    pattern = r'\?([0-9]+)'
    song_id = common.get_first_group_by_pattern(url, pattern)
    if not song_id:
        logging.error('Failed to get id of url [%s]', url)
        return False

    params = self.get_hidden_params()

    query = '%s%s&time=%.0f' % (params['query_prefix'], song_id, time.time(), )
    logging.debug('query:%s' % (query, ))

    post_url = 'http://www.kashinavi.com/cgi-bin/kashi.cgi'
    resp = common.get_url_content(post_url, query)
    if not resp:
        logging.error('Failed to get content of url [%s], query [%s]', post_url, query)
        return False

    raw_lyric = resp.decode('utf-8', 'ignore')

    # if the parsing rule changed, return debug info
    if raw_lyric.find(u'歌詞ナビTOPページより') > 0:
        self.lyric = '''
Site rule changed!
Please contact franklai

LoadVars::%s::myLoadVars
''' % (params['middle_value'])
        return True

    # otherwise remove the useless part and return the lyric
    front_str = 'kashi='
    start = raw_lyric.find(front_str) + len(front_str)
    lyric = raw_lyric[start:]
    lyric = lyric.strip()

    self.lyric = lyric
    return True

def find_lyric(self, url):
    pattern = r'/[a-z]+/([0-9]+)/'
    song_id = common.get_first_group_by_pattern(url, pattern)
    if not song_id:
        # try the old pattern
        # http://www.uta-net.com/user/phplib/view_0.php?ID=17248
        pattern = 'ID=([0-9]+)'
        song_id = common.get_first_group_by_pattern(url, pattern)
        if not song_id:
            logging.info('Failed to get id of url [%s]', url)
            return False

    showkasi_pattern = 'http://www.uta-net.com/user/phplib/swf/showkasi.php?ID=%s&WIDTH=530&HEIGHT=810'
    song_url = showkasi_pattern % (song_id, )
    data = common.get_url_content(song_url)
    if not data:
        logging.info('Failed to get content of url [%s]', song_url)
        return False

    # The lyric text is embedded in the swf between '<\0\0' and a NUL byte.
    prefix = '<\0\0'
    suffix = '\0'
    lyric = common.find_string_by_prefix_suffix(data, prefix, suffix, False)
    if not lyric:
        logging.error('Failed to get lyric of url [%s]', url)
        return False

    lyric = unicode(lyric, 'utf8')
    lyric = lyric.strip()

    # test for half to full
    lyric = common.half2full(lyric)

    self.lyric = lyric
    return True

def get_lyric_html(self, url):
    raw = common.get_url_content(url)
    if not raw:
        return False

    html = raw.decode('utf-8', 'ignore')
    return html

def get_content(self):
    """Get the content of the wiki enquiry."""
    formatted_url = common.format_url(self.WIKIAPI, self.WIKIAPI_FLAGS, self.keywords)
    self.raw_results = common.get_url_content(formatted_url)
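
# Every snippet above leans on a shared `common` module that is not shown
# here. Below is a minimal, hypothetical sketch of the two helpers used most,
# assuming a Python 2 environment with urllib2; the real module likely does
# more (cookie handling, retries, POST encoding), so treat this as an
# illustration of the expected contract, not the actual implementation.
import logging
import re
import urllib2


def get_url_content(url, data=None, headers=None):
    # Fetch a URL and return the raw response body; `data`, when given,
    # turns the request into a POST (urllib2 semantics). Returns None on
    # failure so callers can use the `if not content:` checks seen above.
    try:
        request = urllib2.Request(url, data, headers or {})
        return urllib2.urlopen(request).read()
    except Exception:
        logging.exception('Failed to fetch url [%s]', url)
        return None


def get_first_group_by_pattern(text, pattern):
    # Return the first capture group of the first match, or None when the
    # pattern does not match.
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    return None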