def parse_professional(self, response):
    # The profile page embeds its structured data as JSON-LD; grab the first block.
    json_scripts = response.xpath(
        '//script[@type="application/ld+json"][1]/text()').extract_first()
    try:
        pro = json.loads(json_scripts)
        # The JSON-LD payload is sometimes a list of objects; use the first one.
        if isinstance(pro, list):
            pro = pro[0]
        item = HouzzItem()
        # Pre-populate every field so missing data ends up as an empty string.
        for field in ('name', 'telephone', 'website', 'sms', 'email',
                      'streetAddress', 'addressLocality', 'addressRegion',
                      'addressCountry', 'postalCode', 'schedules', 'category',
                      'information', 'paymentAccepted', 'ratingstars',
                      'comments', 's_tel'):
            item[field] = ''
        item['name'] = pro['name']
        item['telephone'] = pro['telephone']
        item['website'] = response.xpath(
            '//a[@compid="Profile_Website"]/@href').extract_first()
        item['streetAddress'] = pro['address']['streetAddress']
        item['addressLocality'] = pro['address']['addressLocality']
        item['addressRegion'] = pro['address']['addressRegion']
        item['addressCountry'] = pro['address']['addressCountry']
        item['postalCode'] = pro['address']['postalCode']
        item['category'] = response.xpath(
            '//span[@itemprop="child"]//span[@itemprop="title"]/text()'
        ).extract_first()
        info_list = response.xpath(
            '//div[@class="professional-info-content"]/text()').extract()
        item['information'] = ', '.join(info_list)
        item['ratingstars'] = pro['aggregateRating']['ratingValue']
        yield item
    except Exception:
        # Profiles without the expected JSON-LD structure are skipped.
        pass
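# For reference, parse_professional above assumes a JSON-LD payload roughly shaped
# like the following (an illustrative sketch, not captured from a live page; only
# the keys the method actually reads are shown):
#
#     {
#         "name": "Example Studio",
#         "telephone": "+44 20 0000 0000",
#         "address": {
#             "streetAddress": "1 Example Street",
#             "addressLocality": "London",
#             "addressRegion": "Greater London",
#             "addressCountry": "United Kingdom",
#             "postalCode": "SW1A 1AA"
#         },
#         "aggregateRating": {"ratingValue": "4.9"}
#     }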
def parse_details(self, response):
    items = HouzzItem()
    # Make sure only un-cached / new records are saved in the spreadsheet.
    if "cached" not in response.flags:
        try:
            PhoneNumber = response.xpath(
                "//div[@compid='Profile_Phone']/span[@class='pro-contact-text']/text()"
            )[0].extract()
        except Exception:
            PhoneNumber = "-"
        try:
            ContactPersonRAW = response.xpath(
                "normalize-space(//div[@class='info-list-text']/b[text()='Contact']/../text())"
            )[0].extract()
            ContactPerson = ContactPersonRAW.split(": ")[1]
        except Exception:
            ContactPerson = "-"
        try:
            LocationRAW = response.xpath(
                "//div[@class='info-list-text']/b[text()='Location']/..")
            Street = LocationRAW.xpath(
                "./span[@itemprop='streetAddress']/text()")[0].extract()
            AddressLocality = LocationRAW.xpath(
                "./span[@itemprop='addressLocality']/text()")[0].extract()
            AddressRegion = LocationRAW.xpath(
                "./span[@itemprop='addressRegion']/text()")[0].extract()
            PostalCode = LocationRAW.xpath(
                "./span[@itemprop='postalCode']/text()")[0].extract()
            AddressCountry = LocationRAW.xpath(
                "./span[@itemprop='addressCountry']/text()")[0].extract()
            Location = ", ".join([Street, AddressLocality, AddressRegion,
                                  PostalCode, AddressCountry])
        except Exception:
            # Fall back to the plain-text address when the itemprop spans are missing.
            Location = BeautifulSoup(
                response.xpath(
                    "//div[@class='info-list-text']/b[text()='Location']/.."
                )[0].extract(), 'lxml').get_text()
            Location = Location.replace("Location: ", "")
        items["category"] = response.meta['category']
        items["posttitle"] = response.meta['posttitle']
        items["posthref"] = response.meta['posthref']
        items["location"] = Location
        items["contact"] = ContactPerson
        items["phone"] = PhoneNumber
        yield items
        self.logger.info("Item processed!")
        # yield scrapy.FormRequest(GoogleURL, formdata=DataObject, callback=self.dummy,
        #                          method="POST", dont_filter=True, meta={"refresh_cache": True})
    else:
        # self.logger.info("Page is cached!")
        pass
def parse_content(self, response):
    count = response.xpath(
        "//h1[@class='header-2 header-dt-1 main-title']/text()").extract_first()
    # Strip everything but digits from the results-count heading.
    count = int(re.sub(r"[^\d-]+", "", count))
    if response.meta["first"]:
        self.total_cnt += count
        print "Total:", self.total_cnt
    script_element = response.xpath("//div[contains(@class, 'whiteCard pro-card')]")
    if len(script_element) > 0:
        self.found_count += len(script_element)
        for row in script_element:
            item = HouzzItem()
            item["url"] = response.url
            try:
                item["sub_url"] = row.xpath(
                    ".//a[@class='pro-title']/@href").extract_first()
                # Skip profiles that have already been queued.
                if item["sub_url"] in sub_url_list:
                    self.exists_count += 1
                    continue
                sub_url_list.append(item["sub_url"])
                item["phone"] = row.xpath(
                    ".//span[@class='pro-list-item--text']/text()").extract_first()
                addressLocality = row.xpath(
                    ".//li[@class='pro-list-item pro-location']//span[@itemprop='addressLocality']/text()").extract_first()
                if addressLocality is None:
                    addressLocality = ""
                addressRegion = row.xpath(
                    ".//li[@class='pro-list-item pro-location']//span[@itemprop='addressRegion']/text()").extract_first()
                if addressRegion is None:
                    addressRegion = ""
                postalCode = row.xpath(
                    ".//li[@class='pro-list-item pro-location']//span[@itemprop='postalCode']/text()").extract_first()
                if postalCode is None:
                    postalCode = ""
                addressCountry = row.xpath(
                    ".//li[@class='pro-list-item pro-location']//span[@itemprop='addressCountry']/text()").extract_first()
                if addressCountry is None:
                    addressCountry = ""
                item["location"] = " ".join([
                    addressLocality.encode("utf-8"),
                    addressRegion.encode("utf-8"),
                    postalCode.encode("utf-8"),
                    addressCountry.encode("utf-8"),
                ])
                item["name"] = row.xpath(
                    ".//a[@class='pro-title']/text()").extract_first().encode("utf-8")
                req = self.set_proxies(item["sub_url"], self.parse_detail)
                req.meta["item"] = item
                yield req
            except Exception:
                print "Parsing Error ->", response.url
    else:
        print "Not Found ->", response.url
    next_link = response.xpath("//a[@class='navigation-button next']")
    if len(next_link) > 0:
        url = next_link.xpath("@href").extract_first()
        req = self.set_proxies(url, self.parse_content)
        req.meta["first"] = False
        yield req
    else:
        print "Not Found Next Page ->", response.url
        print "Total Items:", self.total_cnt, "Found Items:", self.found_count, "Exists Count:", self.exists_count
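# The HouzzItem class used throughout is imported from the project's items module,
# which is not part of this section. A minimal sketch covering the fields referenced
# by the methods above (the real definition may declare more fields):
#
#     class HouzzItem(scrapy.Item):
#         name = scrapy.Field()
#         telephone = scrapy.Field()
#         website = scrapy.Field()
#         sms = scrapy.Field()
#         email = scrapy.Field()
#         streetAddress = scrapy.Field()
#         addressLocality = scrapy.Field()
#         addressRegion = scrapy.Field()
#         addressCountry = scrapy.Field()
#         postalCode = scrapy.Field()
#         schedules = scrapy.Field()
#         category = scrapy.Field()
#         information = scrapy.Field()
#         paymentAccepted = scrapy.Field()
#         ratingstars = scrapy.Field()
#         comments = scrapy.Field()
#         s_tel = scrapy.Field()
#         url = scrapy.Field()
#         sub_url = scrapy.Field()
#         location = scrapy.Field()
#         phone = scrapy.Field()
#         contact = scrapy.Field()
#         posttitle = scrapy.Field()
#         posthref = scrapy.Field()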
def start_requests(self):
    # --- one-off debug requests, kept commented out ---
    # req = self.set_proxies("https://www.houzz.co.uk/pro/black-and-milk-residential/black-and-milk-interior-design-london", self.parse_detail)
    # yield req
    # return
    # req = self.set_proxies("http://www.kiadesigns.co.uk/", self.parse_website_url)
    # item = HouzzItem()
    # req.meta["item"] = item
    # yield req
    # return
    # req = self.set_proxies("https://www.houzz.co.uk/professionals/interior-designers/c/Moldova/d/100/p/30", self.parse_content)
    # item = HouzzItem()
    # req.meta["first"] = True
    # yield req
    # return
    if self.category == "email":
        # Crawl the websites collected earlier and look for e-mail addresses.
        with open(source_csv_file) as csvfile:
            reader = csv.reader(csvfile)
            print("-----------------CSV Read------------------")
            i = 0
            item_list = []
            for item in reader:
                if i > 0:
                    website = item[7]
                    if website != "null":
                        website = website.replace("http://", "")
                        if ('@' not in website and '.' in website
                                and 'facebook' not in website
                                and 'google' not in website
                                and 'twitter' not in website):
                            item_list.append("http://" + website)
                i += 1
        i = 0
        for url in item_list:
            if i > 20:
                return
            req = self.set_proxies(url, self.parse_website_url)
            yield req
            i += 1
    elif self.category == "merge":
        # Merge the scraped profile rows with the e-mails found on their websites.
        item_list = []
        with open(source_csv_file) as csvfile:
            reader = csv.reader(csvfile)
            i = 0
            for item in reader:
                if i > 0:
                    obj = {}
                    obj["contact"] = item[1]
                    obj["location"] = item[2]
                    obj["name"] = item[3]
                    if item[4] != "null" and item[4].strip() != ",":
                        obj["phone"] = item[4]
                    else:
                        obj["phone"] = ""
                    obj["sub_url"] = item[5]
                    obj["url"] = item[6]
                    if item[7] != "null":
                        obj["website_url"] = item[7]
                    else:
                        obj["website_url"] = ""
                    obj["contact_url"] = []
                    obj["email"] = []
                    item_list.append(obj)
                i += 1
        email_item_list = []
        with open(dest_csv_file) as csvfile:
            reader = csv.reader(csvfile)
            i = 0
            for item in reader:
                if i > 0:
                    obj = {}
                    obj["website_url"] = item[1]
                    obj["email"] = item[2].split(",")
                    obj["contact_url"] = item[0]
                    email_item_list.append(obj)
                i += 1
        # Attach every e-mail and contact URL found for a website to its profile row,
        # skipping duplicates.
        for item in item_list:
            for email_item in email_item_list:
                if item["website_url"] == email_item["website_url"]:
                    for sub_email in email_item["email"]:
                        if sub_email not in item["email"]:
                            item["email"].append(sub_email)
                    if email_item["contact_url"] not in item["contact_url"]:
                        item["contact_url"].append(email_item["contact_url"])
        with open(email_csv_file, 'w') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow([
                "Name", "Phone", "Location", "Contact", "Web Site",
                "Email", "Url", "Country & City URL"
            ])
            for item in item_list:
                csv_writer.writerow([
                    item["name"],
                    item["phone"],
                    item["location"],
                    ",".join(item["contact_url"]),
                    item["website_url"],
                    ",".join(item["email"]),
                    item["sub_url"],
                    item["url"],
                ])
    else:
        # A numeric category selects a 200-URL slice of start_urls to crawl.
        if self.category == "" or int(self.category) == 0:
            raise CloseSpider("Index is not valid.")
        if int(self.category) > 7:
            raise CloseSpider("Index must not be higher than 7.")
        index = int(self.category)
        url_list = []
        i = (index - 1) * 200
        last_index = min(index * 200, len(self.start_urls))
        while i < last_index:
            url_list.append(self.start_urls[i])
            i += 1
        # for url in self.start_urls:
        for url in url_list:
            url = url + "/d/100"
            req = self.set_proxies(url, self.parse_content)
            req.meta["first"] = True
            yield req
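# Every method in this spider builds its requests through self.set_proxies(url, callback),
# whose definition is not included in this section. A minimal sketch of such a helper,
# assuming proxies are kept in a plain list attribute (self.proxy_list is an assumed name,
# and "import random" would be needed at module level) and routed through Scrapy's
# HttpProxyMiddleware via request.meta["proxy"]:
#
#     def set_proxies(self, url, callback):
#         req = scrapy.Request(url, callback=callback, dont_filter=True)
#         if getattr(self, "proxy_list", None):
#             req.meta["proxy"] = random.choice(self.proxy_list)
#         return req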