/
video_main.py
106 lines (81 loc) · 2.56 KB
/
video_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# -*- coding: utf-8 -*-
from pyquery import PyQuery as pq
from thread_pool import ThreadPool
from models import VideoTopic, get_session
from util.http_utils import HttpAgent
class HenHenVideoPage(object):
    """One listing page of the video site, scraped into VideoTopic rows.

    Args:
        url: address of the listing page to download.
        video_type: category id passed to every VideoTopic built from
            this page.
    """

    def __init__(self, url, video_type):
        self.http = HttpAgent(timeout=10)
        self.url = url
        self.video_type = video_type

    @staticmethod
    def _extract_video_id(href):
        """Return the file name of *href* without directory or extension.

        Only the part after the last ``/`` is inspected, so dots in a host
        name or directory cannot truncate the id: ``'/vod/123.html'`` and
        ``'http://www.example.com/vod/123.html'`` both give ``'123'``. A
        file name without any dot is returned unchanged.
        """
        basename = href.rsplit('/', 1)[-1]
        return basename.partition('.')[0]

    def get_video_info(self):
        """Download the page and collect the title and id of every entry.

        Entries whose anchor has no ``href`` are skipped, because no id can
        be derived from them.

        Returns:
            A ``(title_list, video_id_list)`` tuple of two lists of equal
            length; position ``i`` of each list describes the same entry.
        """
        title_list = []
        video_id_list = []
        # Only the body is used; the response object is ignored.
        (_resp, content) = self.http.request(self.url)
        for tag_box in pq(content)('.list-pianyuan-box'):
            link = pq(tag_box)('.list-pianyuan-box-l a')
            href = link.attr('href')
            if not href:
                # Skip both the title and the id so the lists stay aligned.
                continue
            title_list.append(link.attr('title'))
            video_id_list.append(self._extract_video_id(href))
        return (title_list, video_id_list)

    def save_to_db(self):
        """Scrape the page and store one VideoTopic per entry found.

        Each topic is added and committed individually, then its id is
        printed. The session is closed whether or not saving succeeds.
        """
        (title_list, video_id_list) = self.get_video_info()
        session = get_session()
        try:
            for title, video_id in zip(title_list, video_id_list):
                topic = VideoTopic(title, 'henhenlu',
                                   self.video_type,
                                   video_id)
                session.add(topic)
                session.commit()
                print(video_id + ' is ok')
        finally:
            session.close()
def check_page_max():
    """Probe category 7 listing pages upward from page 11.

    Prints a line for every page requested and stops at the first page
    whose response status is ``'404'`` (or after page 1999).
    """
    agent = HttpAgent()
    url_template = 'http://www.toutoulu.com/vodlist/7_%s.html'
    for page in range(11, 2000):
        (response, _body) = agent.request(url_template % page)
        if response['status'] != '404':
            print('%d is ok' % page)
            continue
        # First missing page found: report it and stop probing.
        print('%d is 404' % page)
        break
def thread_pool_job(url, video_type):
    """Pool task: scrape the listing page at *url* and save its entries."""
    HenHenVideoPage(url, video_type).save_to_db()
# Number of listing pages per video type id; main() crawls pages
# 1..count of the type it is working on.
page_info = dict([
    ('1', 567),
    ('2', 80),
    ('3', 175),
    ('4', 27),
    ('5', 33),
    ('6', 172),
    ('7', 20),
])
def main(video_type='7'):
    """Crawl every listing page of one video type through a thread pool.

    Args:
        video_type: key of ``page_info`` naming the category to crawl.
            Defaults to ``'7'``, the value that used to be hard-coded.

    Raises:
        KeyError: if ``video_type`` is not a key of ``page_info``.
    """
    # Look the page count up first so an unknown type fails before the
    # pool is started.
    page_count = page_info[video_type]
    thread_pool = ThreadPool(50)
    thread_pool.start()
    base_url = 'http://www.toutoulu.com/vodlist/%s_%s.html'
    # Queue one job per listing page; pages are numbered from 1.
    for page_num in range(1, page_count + 1):
        url = base_url % (video_type, page_num)
        print('add task %s' % url)
        thread_pool.add_task(thread_pool_job, url, video_type)
    thread_pool.wait_done()
# Start the crawl. This call sits at module level, so it also runs when the
# module is imported, not only when it is executed as a script.
# NOTE(review): consider an `if __name__ == '__main__':` guard if this
# module is ever imported from elsewhere.
main()
# Listing-page counts per video type, written as <type>_<pages>
# (same values as page_info):
# 1_567
# 2_80
# 3_175
# 4_27
# 5_33
# 6_172
# 7_20
# Example: scrape a single listing page by hand.
# page = HenHenVideoPage('http://www.toutoulu.com/vodlist/1_1.html', '1')
# print page.get_video_info()