/
douban_movies.py
107 lines (92 loc) · 2.99 KB
/
douban_movies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# -*- coding: utf-8 -*-
#!/usr/bin/env python
import os
import re
import time
import urllib
import urllib.request
from multiprocessing.dummy import Pool as ThreadPool
from queue import Queue
from threading import Lock, Thread

import requests
class Fetcher:
    """Fetch queued requests with a fixed set of daemon worker threads.

    Requests handed to :meth:`push` are taken by the workers, opened with a
    shared ``urllib`` opener, and each ``(request, answer)`` pair is made
    available through :meth:`pop`.  A failed fetch yields the string
    ``'error'`` as its answer instead of raising.
    """

    def __init__(self, threads, subject):
        """Create the queues and start the worker threads.

        Args:
            threads: Number of daemon worker threads to start.
            subject: Value stored on the instance and passed unchanged to
                every worker as its ``sub`` argument.
        """
        self.opener = urllib.request.build_opener(urllib.request.HTTPHandler)
        self.lock = Lock()
        self.q_req = Queue()
        self.q_ans = Queue()
        self.threads = threads
        self.subject = subject
        # Set the counter before any worker exists so no thread can ever
        # observe the instance without it.
        self.running = 0
        for _ in range(threads):
            # ``args`` must be a tuple; a bare ``subject`` would be unpacked
            # into several positional arguments and the worker would die
            # with a TypeError.
            worker = Thread(target=self.threadget, args=(subject,), daemon=True)
            worker.start()

    def __del__(self):
        """Wait for every pushed request to be processed."""
        q_req = getattr(self, 'q_req', None)
        if q_req is None:
            # __init__ failed before the queues were created.
            return
        time.sleep(0.5)
        q_req.join()
        # q_ans is intentionally not joined: task_done() is never called on
        # it, so joining would block forever once a single answer had been
        # queued.

    def taskleft(self):
        """Return the number of requests queued, in flight, or unread."""
        return self.q_req.qsize() + self.q_ans.qsize() + self.running

    def push(self, req):
        """Queue ``req`` for the workers to open."""
        self.q_req.put(req)

    def pop(self, ans=None):
        """Block until an answer is available and return ``(req, ans)``.

        Args:
            ans: Ignored.  Accepted only so that existing calls passing a
                positional argument keep working.
        """
        return self.q_ans.get()

    def download_imag(self, subject):
        """Save the cover of ``subject`` via the module-level ``download_imag``.

        Returns whatever the module-level function returns.
        """
        # Inside a method body the bare name resolves to the module-level
        # function, not to this method.
        return download_imag(subject)

    def threadget(self, sub):
        """Worker loop: take requests from the queue and fetch them forever.

        Args:
            sub: The ``subject`` given to the constructor.  The fetch loop
                itself does not use it.
        """
        while True:
            req = self.q_req.get()
            with self.lock:  # keep the counter update atomic
                self.running += 1
            try:
                # Close the response as soon as its body has been read.
                with self.opener.open(req) as resp:
                    ans = resp.read()
            except Exception:
                # Best effort: a failed fetch is reported as an 'error'
                # answer so the consumer still gets one result per request.
                ans = 'error'
                print(ans)
            self.q_ans.put((req, ans))
            with self.lock:
                self.running -= 1
            self.q_req.task_done()
            # Short pause between requests handled by the same worker.
            time.sleep(0.1)
# Serialises updates to the module-level ``count`` so that downloads running
# on several threads cannot lose increments.
_count_lock = Lock()


def download_imag(
    subject,
    dest_dir='/users/peibibing/PycharmProjects/douban/douban_movie',
    timeout=30,
):
    """Download one cover image and save it as ``<title>.jpg``.

    Args:
        subject: Mapping with a ``'cover'`` entry (the image URL) and a
            ``'title'`` entry (used as the file name; assumed to be a str).
        dest_dir: Directory the image is written to.  It is created if it
            does not exist.  Defaults to the originally hard-coded location.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``'ok'`` when the image was saved, ``'error'`` when the server
        answered with an HTTP error status (nothing is written in that case).

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    global count
    # requests.get opens and closes its own session, so no session object is
    # left open for every downloaded image.
    resp = requests.get(subject['cover'], timeout=timeout)
    if not resp.ok:
        # Do not store an HTML error page under a .jpg name.
        print('error', resp.status_code, subject['cover'])
        return 'error'

    name = subject['title']
    # A path separator inside the title would point into a sub-directory
    # that does not exist.
    for sep in ('/', '\\'):
        name = name.replace(sep, '_')

    os.makedirs(dest_dir, exist_ok=True)
    path = os.path.join(dest_dir, '%s.jpg' % name)
    with open(path, 'wb') as f:
        f.write(resp.content)

    with _count_lock:
        count += 1
        done = count
    print(done)
    return 'ok'
def get_subject(url):
    """Fetch ``url`` and return the ``'subjects'`` entry of its JSON body.

    Args:
        url: Address of a JSON endpoint whose top-level object has a
            ``'subjects'`` key.

    Returns:
        The value stored under ``'subjects'`` in the decoded response.

    Raises:
        requests.RequestException: On connection problems, timeouts or an
            HTTP error status.
        KeyError: If the decoded body has no ``'subjects'`` key.
    """
    # The headers used to be built but never sent.  ``Host`` is left out on
    # purpose: requests derives it from the URL, and a fixed value would be
    # wrong for any other host or after a redirect.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Connection': 'keep-alive',
    }
    resp = requests.get(url=url, headers=headers, timeout=30)
    # Fail with the HTTP status instead of an obscure JSON decoding error.
    resp.raise_for_status()
    return resp.json()['subjects']
# Number of covers saved so far; download_imag increments and prints it.
count = 0

if __name__ == "__main__":
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=6000&page_start=0'
    subjects = get_subject(url)
    # Download the covers on five worker threads.  map() returns only after
    # every download has finished, so leaving the ``with`` block merely
    # releases the pool's threads.
    with ThreadPool(5) as pool:
        pool.map(download_imag, subjects)