-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.py
112 lines (90 loc) · 3.37 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# coding: utf-8
from grab.spider import Spider, Task
from youtube_dl import YoutubeDL
import model
class PornHubCategoryParser(Spider):
    """Spider that collects the site's categories and their preview images.

    The URL stored in ``model.site_categories`` is queued as the only
    initial task.  While handling that page the spider registers every
    category link and category name with the model and schedules one
    ``image`` task per preview thumbnail.
    """

    # Class-level default only -- it is never mutated.  Each instance installs
    # its own list in __init__, so URLs cannot pile up across instances.
    initial_urls = []
    #priority_mode='const'

    def __init__(self, model):
        """Bind the spider to *model* and queue the category index URL.

        :param model: object the results are written to.  This class reads
            ``site_categories``, ``site`` and ``categories_image_path`` from
            it and calls ``addCategoryUrl`` and ``addCategory`` on it.
        """
        super(PornHubCategoryParser, self).__init__()
        self.model = model
        self.cat_index = 0  # index of the next category, counted from 0
        # Per-instance list: appending to the shared class attribute would make
        # every later instance re-crawl the URLs queued by earlier ones.
        self.initial_urls = [self.model.site_categories]

    def task_initial(self, grab, task):
        """Handle the category index page.

        First registers every category URL (``model.site`` + link ``href``)
        with the model.  Then, for each category thumbnail, yields an
        ``image`` task carrying the current category index as ``num`` and,
        once the generator is resumed, advances the index and registers the
        thumbnail's ``alt`` text as the category name.
        """
        links_xpath = "//div[@class='category-wrapper']/a"

        # find category url and add new category to the model
        for link in grab.doc.select(links_xpath):
            self.model.addCategoryUrl(self.model.site + link.attr('href'))

        # download category previews
        for image in grab.doc.select(links_xpath + "/img"):
            yield Task("image", url=image.attr('src'), num=self.cat_index)
            self.cat_index += 1
            self.model.addCategory(image.attr('alt'))

    def task_image(self, grab, task):
        """Save a downloaded preview as ``<categories_image_path>/<num>.jpg``."""
        path = '{path}/{i}.jpg'.format(
            i=task.num, path=self.model.categories_image_path)
        grab.response.save(path)
class PornHubPageParser(Spider):
    """Spider that scrapes one video listing page into the model.

    The page URL passed to the constructor is queued as the only initial
    task.  Every video block found on that page is handed to
    ``model.addPornVideo`` together with the category id and, when the page
    is the category landing page, the detected pagination URL prefix.
    """

    # Class-level default only -- it is never mutated.  Each instance installs
    # its own list in __init__, so URLs cannot pile up across instances.
    initial_urls = []
    #priority_mode='const'

    def __init__(self, model, page_url, category_id, page = 0):
        """Bind the spider to *model* and queue *page_url*.

        :param model: object whose ``addPornVideo`` receives the results.
        :param page_url: URL of the listing page to fetch.
        :param category_id: index of the category being parsed, from 0.
        :param page: page number; 0 and 1 both denote the category landing
            page.
        """
        super(PornHubPageParser, self).__init__()
        self.model = model
        self.category_id = category_id  # from 0
        self.page = page  # from 0 or 1
        # Per-instance list: appending to the shared class attribute would make
        # every new parser re-crawl all previously requested pages and file
        # their videos under this instance's category and page.
        self.initial_urls = [page_url]
        print("page parser created for page", page)

    def task_initial(self, grab, task):
        """Extract the videos of the fetched page and add them to the model.

        Each video is a dict with a ``'vkey'`` entry and, when a matching
        image anchor exists, a ``'name'`` entry.  For the landing page
        (page 0 or 1) the ``href`` of the pagination link labelled ``2`` is
        taken, minus its last character, as the page URL prefix; otherwise an
        empty string is passed on.
        """
        print("parsing {c} page {p}".format(c = self.category_id, p = self.page))

        # get vkeys: one dict per video block on this page
        porn = [{'vkey': block.attr('_vkey')}
                for block in grab.doc.select("//li[@class='videoblock']")]

        # append names to vkeys; zip() pairs the two selections so a surplus
        # anchor cannot index past the end of the list
        # NOTE(review): the name is read from the '_vkey' attribute as in the
        # original code -- confirm this is the intended attribute.
        anchors = grab.doc.select("//li[@class='videoblock']/a[@class='img']")
        for video, anchor in zip(porn, anchors):
            video['name'] = anchor.attr('_vkey')

        # if this is the category url (page 0 or 1), detect the page url;
        # both values are accepted so this class agrees with the caller that
        # treats either one as the landing page
        page_url = str()
        if self.page in (0, 1):
            pager_links = grab.doc.select(
                "//li[@class='page_number']/a[@class='greyButton']")
            for link in pager_links:
                if link.text() == '2':
                    # drop the trailing character of the href
                    page_url = link.attr('href')[0:-1]

        # add porn videos to the model
        for video in porn:
            self.model.addPornVideo(video, self.category_id, page_url)

    # Disabled preview download handler, kept for reference:
    #def task_image(self, grab, task):
    #    path = '{path}/{vkey}.jpg'.format(vkey = task.vkey, path = self.model.videos_preview_image_path)
    #    grab.response.save(path)
class PyJizzParser(object):
    """Facade that drives the spiders and the youtube-dl info lookup.

    On construction the parser registers itself on the model as
    ``model.parser`` and creates a ``YoutubeDL`` instance used by
    :meth:`getInfo`.
    """

    # Template of the video page URL handed to youtube-dl; ``{v}`` is replaced
    # by the video key.  Kept as a class attribute so it can be overridden.
    VIDEO_URL_TEMPLATE = 'http://www.pornhub.com/view_video.php?viewkey={v}'

    def __init__(self, model):
        """Attach the parser to *model* and prepare the youtube-dl client.

        :param model: object the spiders write to; it receives a back
            reference to this parser in its ``parser`` attribute.
        """
        self.model = model
        self.model.parser = self
        self.ydl = YoutubeDL()
        self.ydl.add_default_info_extractors()

    def parseCategories(self):
        """Run the category spider to completion against the model."""
        spider = PornHubCategoryParser(self.model)
        spider.run()

    def parseCategoryPage(self, category, page = 1):
        """Run the page spider for one page of one category.

        :param category: index into ``model.categories_url`` and
            ``model.porn``.
        :param page: page number.  0 and 1 both select the category URL
            itself; any other value is appended to ``model.site`` plus the
            ``'page_url'`` stored in ``model.porn[category]``.
        """
        if page in (0, 1):
            url = self.model.categories_url[category]
        else:
            url = "{site}{page_url}{page}".format(
                site=self.model.site,
                page_url=self.model.porn[category]['page_url'],
                page=page)
        print("page parser creating for page", page)
        spider = PornHubPageParser(self.model, url, category, page)
        spider.run()
        print("page parser exit for page", page)

    def getInfo(self, vkey):
        """Return youtube-dl's extracted info for the video *vkey*.

        The lookup is made with ``download=False``, so only the info is
        requested.

        :param vkey: video key inserted into ``VIDEO_URL_TEMPLATE``.
        :returns: whatever ``YoutubeDL.extract_info`` returns for that URL.
        """
        video_url = self.VIDEO_URL_TEMPLATE.format(v=vkey)
        return self.ydl.extract_info(video_url, download=False)
if __name__ == '__main__':
    # Manual run: build a model, scrape the category list, scrape the
    # landing page of category 0, then dump what the model collected.
    storage = model.Porn()
    scraper = PyJizzParser(storage)
    scraper.parseCategories()
    scraper.parseCategoryPage(0)
    #scraper.parseCategoryPage(0, 3)
    print(storage.porn)