forked from thuandt/OnlineMusicDownloader
/
ZingMP3Parser.py
88 lines (78 loc) · 3.08 KB
/
ZingMP3Parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" ZingMp3Parser - Parser data from http://mp3.zing.vn
Parse the web page to get the XML URL
Fetch the XML file and parse it to get the song data
"""
from urllib import urlopen
from HTMLParser import HTMLParser
from xml.etree import ElementTree as ET
import gzip
from StringIO import StringIO
# Module metadata: authorship, copyright, license, version and release status.
__author__ = "Thuan.D.T (MrTux)"
__copyright__ = "Copyright (c) 2011 Thuan.D.T (MrTux) "
__credits__ = ["Thuan.D.T"]
__license__ = "GPL"
__version__ = "0.0.1"
__maintainer__ = "Thuan.D.T (MrTux)"
__email__ = "mrtux@ubuntu-vn.org"
__status__ = "Development"
class ZingMP3Parser(HTMLParser):
    """Extract song metadata from an mp3.zing.vn web page.

    The page is expected to contain a ``<param name="flashvars" value="...">``
    tag whose value holds an ``xmlURL=`` entry.  That URL is downloaded and
    parsed as XML; every ``<item>`` element directly under the root
    contributes exactly one entry to each of the parallel lists
    ``song_name``, ``song_artist``, ``song_link`` and ``song_type``.
    """

    def __init__(self, url):
        """Download ``url`` and collect the songs it references.

        url: link to mp3.zing.vn web page

        When the page has no line containing the flashvars ``<param>`` tag
        the four lists are left empty instead of failing inside the HTML
        parser.  Errors raised while downloading or while parsing the XML
        propagate to the caller.
        """
        HTMLParser.__init__(self)
        self.song_name = []
        self.song_artist = []
        self.song_link = []
        self.song_type = []

        page = self._fetch(url)
        # Only the first line carrying the flashvars <param> tag is handed
        # to the HTML parser; the rest of the page is never parsed.
        for line in page.split("\n"):
            if '<param name="flashvars" value="' in line:
                self.feed(line)
                break

    def handle_starttag(self, tag, attrs):
        """React to ``<param name="flashvars">`` by loading its playlist.

        tag: lower-cased tag name supplied by HTMLParser
        attrs: list of (name, value) pairs supplied by HTMLParser

        Every other tag is ignored, as is a flashvars tag that has no
        ``value`` attribute or no non-empty ``xmlURL=`` entry.
        """
        if tag != 'param':
            return
        attributes = dict(attrs)
        # .get() rather than [] so a <param> without a name is skipped
        # instead of raising KeyError.
        if attributes.get('name') != 'flashvars':
            return

        flashvars = attributes.get('value') or ''
        xml_url = None
        for entry in flashvars.split('&'):
            if 'xmlURL=' in entry:
                xml_url = entry.replace('xmlURL=', '')
                break
        if not xml_url:
            # Nothing to download; leave the lists untouched.
            return
        self._load_playlist(xml_url)

    def music_data(self):
        """Return the collected data as a 4-tuple of parallel lists.

        song_name: list of song name
        song_artist: list of artist
        song_link: list of media link
        song_type: list of media file type
        """
        return (self.song_name, self.song_artist,
                self.song_link, self.song_type)

    def _load_playlist(self, xml_url):
        """Download the XML at ``xml_url``; append one entry per ``<item>``."""
        root = ET.fromstring(self._fetch(xml_url))
        # Walk item by item so the four lists stay index-aligned even when
        # an item lacks one of its children or its ``type`` attribute.
        for item in root.findall('./item'):
            title = self._text_of(item, 'title') or ''
            performer = self._text_of(item, 'performer') or ''
            self.song_name.append(title.strip())
            self.song_artist.append(performer.strip())
            self.song_link.append(self._text_of(item, 'source'))
            self.song_type.append(item.get('type'))

    @staticmethod
    def _text_of(item, child_tag):
        """Return the text of the first ``child_tag`` child, or None."""
        child = item.find(child_tag)
        if child is None:
            return None
        return child.text

    @staticmethod
    def _fetch(url):
        """Return the body served at ``url``, gunzipped when so labelled."""
        response = urlopen(url)
        try:
            gzipped = response.info().get('Content-Encoding') == "gzip"
            body = response.read()
        finally:
            # Release the connection whether or not the read succeeded.
            response.close()
        if gzipped:
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
        return body