"""
The Zalando Crawler crawls the https://www.zalando.co.uk website and downloads images and related data content for both men and women.
The two major functions of the Zalando class are building the database by first crawling all data and then assigning attributes in incremental steps.
"""
import requests
import json
from py2neo import Graph,authenticate,Node,Relationship
from bs4 import BeautifulSoup
import os,sys
from xml.etree.ElementTree import Element, SubElement, Comment, tostring
class Zalando:
    def __init__(self):
        self.logger = open("zalando_logger", "a")
        self.logger.write("Checking for directory...\n")
        self.logger.write("Creating Zalando Object...\nInitializing Object variables...\n")
        self.set_directory()
        self.BASE_URL = "https://www.zalando.co.uk"
        self.initial_page = ['https://www.zalando.co.uk/womens-clothing', 'https://www.zalando.co.uk/mens-clothing']
        self.parameters = ['Women', 'Men']
        self.subcategories = [dict() for x in range(2)]
        authenticate("localhost:7474", "neo4j", "awdr.;/")
        self.zalando_graph = Graph()
        try:
            self.zalando_graph.schema.create_uniqueness_constraint('Zalando', 'article_number')
        except Exception as e:
            self.logger.write("uniqueness constraint already set\n")
        self.count = 0
        self.xml_root = Element('Properties')
        self.properties = open("clothing_properties.xml", "w")
        self.logger.write("Zalando Object: " + str(self) + "\n")
        for i in range(len(self.initial_page)):
            page_html = requests.get(self.initial_page[i]).content
            page_soup = BeautifulSoup(page_html, "lxml")
            self.get_clothing_attributes(page_soup, i)
            self.build_zalando_database(page_soup, self.parameters[i])
            self.set_attributes(i)
        self.properties.write(tostring(self.xml_root))
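    # Create (or reuse) the Zalando/ working directory and chdir into it; the log file,
    # the XML attribute file and all downloaded images are written relative to it.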
    def set_directory(self):
        print "Setting up directory"
        if os.path.exists("Zalando"):
            self.logger.write("Zalando directory exists. Changing directory...\n")
            os.chdir("Zalando")
        else:
            self.logger.write("Zalando directory does not exist. Creating Zalando...\n")
            os.makedirs("Zalando")
            os.chdir("Zalando")
        self.logger.write("setting up required xml files...\n")
        # create xml files for attributes
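    # Second pass: for every filter value collected by get_clothing_attributes, walk the
    # filtered catalogue pages and set the corresponding property (type/color/pattern/...)
    # on the already-created product nodes, matched by article number.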
    def set_attributes(self, i):
        for key in self.subcategories[i]:
            print self.subcategories[i][key].split("*")[1]
            product_type = self.subcategories[i][key].split("*")[0]
            next_page = self.BASE_URL + self.subcategories[i][key].split("*")[1]
            while next_page is not None:
                try:
                    attribute_soup = BeautifulSoup(requests.get(next_page).content, "lxml")
                except Exception as e:
                    # one blunt retry if the request fails
                    attribute_soup = BeautifulSoup(requests.get(next_page).content, "lxml")
                catalog_articles_list = attribute_soup.find("ul", "catalogArticlesList")
                for li in catalog_articles_list.findAll("li"):
                    if str(type(li)) == "<class 'bs4.element.Tag'>":
                        product_id = li['id']
                        try:
                            # match nodes by the label of the gender currently being crawled
                            result = self.zalando_graph.cypher.execute("Match (n:%s) where n.article_number = '%s' set n.%s = '%s' " % (self.parameters[i], product_id, product_type, key))
                        except Exception as e:
                            print e
                if attribute_soup.find("a", "catalogPagination_button catalogPagination_button-next") is not None:
                    next_page = self.BASE_URL + attribute_soup.find("a", "catalogPagination_button catalogPagination_button-next")['href']
                else:
                    next_page = None
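    # Fetch a single product page, collect its details into a dict, download all product
    # images into per-image folders, and create the product node in Neo4j.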
    def get_product_details(self, product_url, gender):
        print self.count
        self.count = self.count + 1
        try:
            product_html = requests.get(product_url).content
        except Exception as e:
            # one blunt retry if the request fails
            product_html = requests.get(product_url).content
        product_soup = BeautifulSoup(product_html, "lxml")
        product_image = product_soup.find("img", "articleMedia_imagePlaceholder")
        article_span = product_soup.find("span", "sku")
        product_name = product_soup.find("h1", "productName noBg")
        brand = product_name.contents[1]
        product_info = dict()
        product_info["article_number"] = article_span.string
        product_info["brand"] = brand.string
        product_info["gender"] = gender
        ul = article_span.parent.parent
        self.logger.write("%s being explored" % (article_span.string) + "\n")
        # collect the "key: value" detail entries listed alongside the article number
        for li in ul.contents:
            if li.string is not None and str(type(li)) == "<class 'bs4.element.Tag'>":
                if len(li.string.split(":")) > 1:
                    product_info[li['class'][0]] = li.string.split(":")[1].encode("ascii", "ignore")
        n = Node(gender, "Zalando", "Product")
        for key in product_info:
            n.properties[key] = product_info[key]
        # download every product image into a folder named after the image file
        while product_image is not None:
            url = product_image['src']
            file_name = url.split('/')[-1]
            folder_name = file_name.split('@')[0]
            if os.path.exists(folder_name):
                # already downloaded on a previous run; skip the remaining images for this product
                break
            os.makedirs(folder_name)
            os.chdir(folder_name)
            url = url.replace("detail", "large")
            try:
                r = requests.get(url)
            except Exception as e:
                r = requests.get(url)
            open(file_name, "wb").write(r.content)
            os.chdir("..")
            n["file_system_url"] = "Zalando/" + folder_name
            n["image_url"] = url
            product_image = product_image.nextSibling.nextSibling
        try:
            self.zalando_graph.create(n)
        except Exception as e:
            self.logger.write("Node %s already exists" % (article_span.string))
    def get_clothing_attributes(self, page_soup, i):
        gender_dictionary = dict()
        gender_dictionary[0] = "/womens-clothing-"
        gender_dictionary[1] = "/mens-clothing-"
        child_element = SubElement(self.xml_root, self.parameters[i])
        self.logger.write("Getting clothing attributes\n")
        subcategories = page_soup.find("ul", "subCat")
        self.subcategories[i] = dict()
        type_element = SubElement(child_element, "type")
        for li in subcategories.children:
            if str(type(li)) == "<class 'bs4.element.Tag'>" and li.contents[0].name == "a":
                self.subcategories[i][li.contents[0]['href'].replace(gender_dictionary[i], "")[:-1]] = "type*" + li.contents[0]['href']
                SubElement(type_element, "value").text = li.contents[0]['href'].replace(gender_dictionary[i], "")[:-1]
        color = page_soup.find("div", "cFilter zFilter zal_color")
        color_element = SubElement(child_element, "color")
        for li in color.find_all("div")[1].find("div").find("ul").find_all("li"):
            if str(type(li)) == "<class 'bs4.element.Tag'>" and li.contents[0].name == "a":
                self.subcategories[i][li.contents[0]['class'][0]] = "color*" + li.contents[0]['href']
                SubElement(color_element, "value").text = li.contents[0]['class'][0]
        pattern = page_soup.find("div", "cFilter zFilter zal_pattern")
        pattern_element = SubElement(child_element, "pattern")
        for li in pattern.find_all("div")[1].find("div").find("ul").find_all("li"):
            if str(type(li)) == "<class 'bs4.element.Tag'>" and li.contents[2].name == "a":
                self.subcategories[i][li.contents[0]['id']] = "pattern*" + li.contents[2]['href']
                SubElement(pattern_element, "value").text = li.contents[0]['id']
        # the occasion filter is only scraped from the women's landing page (i == 0)
        if i == 0:
            occasion = page_soup.find("div", "cFilter zFilter zal_occasion")
            occasion_element = SubElement(child_element, "occasion")
            for li in occasion.find_all("div")[1].find("div").find("ul").find_all("li"):
                if str(type(li)) == "<class 'bs4.element.Tag'>" and li.contents[2].name == "a":
                    self.subcategories[i][li.contents[0]['id']] = "occasion*" + li.contents[2]['href']
                    SubElement(occasion_element, "value").text = li.contents[0]['id']
        material = page_soup.find("div", "cFilter zFilter zal_upper_material")
        material_element = SubElement(child_element, "material")
        for li in material.find_all("div")[1].find("div").find("ul").find_all("li"):
            if str(type(li)) == "<class 'bs4.element.Tag'>" and li.contents[2].name == "a":
                self.subcategories[i][li.contents[0]['id']] = "material*" + li.contents[2]['href']
                SubElement(material_element, "value").text = li.contents[0]['id']
        self.logger.write(self.parameters[i] + " " + str(self.subcategories[i]) + "\n\n")
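    # First pass: walk the paginated catalogue recursively and create a node (with images)
    # for every listed product.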
    def build_zalando_database(self, page_soup, gender):
        # link to the next catalogue page, if any
        page_a = page_soup.find("a", "catalogPagination_button catalogPagination_button-next")
        catalog_articles_list = page_soup.find("ul", "catalogArticlesList")
        for li in catalog_articles_list.findAll("li"):
            a = li.find("a", "catalogArticlesList_productBox")
            self.logger.write(self.BASE_URL + a["href"] + " being explored \n")
            self.get_product_details(self.BASE_URL + a["href"], gender)
        if page_a is None:
            return
        next_page = self.BASE_URL + page_a['href']
        self.build_zalando_database(BeautifulSoup(requests.get(next_page).content, "lxml"), gender)
print "Creating Zalando Object"
Zalando()