-
Notifications
You must be signed in to change notification settings - Fork 1
/
parser.py
123 lines (86 loc) · 3.87 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from urllib import quote
from optparse import OptionParser
from grab import Grab
from grab.spider import Spider, Task
from elixir import session
import model
# Extra HTTP headers for outgoing requests: ask for UTF-8 responses and
# present a Googlebot user agent string.
additional_headers = {
    'Accept-Charset': 'utf-8',
    'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
}
class ShoutCastCom(Spider):
    """Spider that crawls the www.shoutcast.com station directory.

    The crawl starts at the front page, records primary genres, follows
    each one to its sub-genres, and pages through the station listing of
    every genre.  Records are created through ``model.get_or_create`` and
    committed via the Elixir ``session``.
    """

    def __init__(self, *kargs, **kwargs):
        """Initialise the spider and install the module-level headers."""
        super(ShoutCastCom, self).__init__(*kargs, **kwargs)
        self.setup_grab(headers=additional_headers)

    def task_generator(self):
        """Yield the initial ``genres`` task for the site front page."""
        yield Task(name='genres',
                   url='http://www.shoutcast.com/')

    def task_genres(self, grab, task):
        """Handle the front page: record primary genres and fan out.

        For every primary genre found, yields a ``subgenres`` task (a POST
        to ``genre.jsp``) and the first ``stations`` task for that genre.
        """
        genres = grab.xpath_list('//li[@class="prigen"]/a/text()')
        for genre in genres:
            genre_record = model.get_or_create(model.Genre,
                                               name=genre,
                                               parent=None)
            # Use a distinct name so the response passed in as ``grab`` is
            # not shadowed by the outgoing request object.
            subgenres_grab = self.create_grab_instance()
            subgenres_grab.setup(url='http://www.shoutcast.com/genre.jsp',
                                 post=dict(genre=genre))
            yield Task(name='subgenres',
                       genre=genre_record,
                       grab=subgenres_grab)
            yield self.new_stations_task(genre=genre_record,
                                         start_index=0)

    def task_subgenres(self, grab, task):
        """Handle a genre page: record sub-genres and queue their stations.

        Each sub-genre is stored with ``task.genre`` as its parent, and a
        first-page ``stations`` task is yielded for it.
        """
        subgenres = grab.xpath_list('//li[@class="secgen"]/a/text()')
        for subgenre in subgenres:
            subgenre_record = model.get_or_create(model.Genre,
                                                  name=subgenre,
                                                  parent=task.genre)
            yield self.new_stations_task(genre=subgenre_record,
                                         start_index=0)

    def new_stations_task(self, genre, start_index, count=100):
        """Build a ``stations`` task for one page of a genre's listing.

        :param genre: genre record; its ``name`` is placed in the URL.
        :param start_index: offset of the first station to request.
        :param count: number of stations requested per page.
        :returns: a ``Task`` carrying ``genre`` and the offset as ``last``.
        """
        name = genre.name
        # urllib.quote on Python 2 raises KeyError for non-ASCII unicode
        # input, so text is encoded to UTF-8 bytes before quoting.
        if not isinstance(name, bytes):
            name = name.encode('utf-8')
        url = 'http://www.shoutcast.com/genre-ajax/%s' % (quote(name))
        grab = self.create_grab_instance()
        grab.setup(url=url,
                   post=dict(strIndex=start_index, count=count))
        return Task(name='stations',
                    genre=genre,
                    grab=grab,
                    last=start_index)

    def task_stations(self, grab, task):
        """Handle one page of stations: queue the next page and store rows.

        Yields a follow-up ``stations`` task when the page advertises
        "show more", then creates stream, bitrate and station records for
        every listed station and commits the session.
        """
        stations = grab.xpath_list('//div[@class="dirlist"]')
        # Only page forward when this page actually contained stations;
        # otherwise the offset would not advance and the same request
        # would be re-issued indefinitely.
        if stations and grab.xpath_exists(
                '//span[contains(text(), "show more")]'):
            yield self.new_stations_task(
                genre=task.genre,
                start_index=task.last + len(stations))
        for dirlist in stations:
            info = dirlist.xpath('./div[1]/a[1]')[0]
            url, name = info.get('href'), info.get('name')
            stream_name = dirlist.xpath(
                './div[@class="dirtype"]/text()')[0]
            bitrate_name = dirlist.xpath(
                './div[@class="dirbitrate"]/text()')[0]
            stream = model.get_or_create(model.Stream,
                                         name=stream_name)
            bitrate = model.get_or_create(model.Bitrate,
                                          name=bitrate_name)
            station = model.get_or_create(model.Station,
                                          name=name,
                                          url=url,
                                          stream=stream,
                                          bitrate=bitrate)
            # The same station can be seen again for this genre (for
            # example on another page); avoid linking the genre twice.
            if task.genre not in station.genres:
                station.genres.append(task.genre)
        if stations:
            session.commit()
def main():
    """Parse command-line options, run the spider, then exit.

    Options:
        -t  number of spider threads, parsed as an integer (default 10).
    """
    option_parser = OptionParser(
        description=u'Парсер радиостанций www.ShoutCast.com')
    # Without an explicit type optparse stores the value as a string, so
    # ``-t 5`` would hand '5' to the spider while the default is the int 10.
    option_parser.add_option('-t',
                             action="store",
                             type='int',
                             dest='threads_count',
                             default=10,
                             help=u'количество потоков')
    options, _ = option_parser.parse_args()
    spider = ShoutCastCom(thread_number=options.threads_count)
    spider.run()
    sys.exit()
if __name__ == '__main__':
    # Start the crawl only when run as a script, not when imported.
    main()