Example #1
    def parse_professional(self, response):
        # Grab the first JSON-LD <script> block, which carries the
        # professional's structured data (requires `import json`).
        json_scripts = response.xpath(
            '//script[@type="application/ld+json"][1]/text()').extract_first()
        pro = json.loads(json_scripts)

        try:
            # The decoded JSON-LD is a list of entities; take the first one.
            pro = pro[0]

            item = HouzzItem()

            # Pre-fill every field so values that are never scraped still
            # export as empty strings rather than missing columns.
            item['name'] = ''

            item['telephone'] = ''
            item['website'] = ''
            item['sms'] = ''
            item['email'] = ''

            item['streetAddress'] = ''
            item['addressLocality'] = ''
            item['addressRegion'] = ''
            item['addressCountry'] = ''
            item['postalCode'] = ''

            item['schedules'] = ''
            item['category'] = ''
            item['information'] = ''
            item['paymentAccepted'] = ''

            item['ratingstars'] = ''
            item['comments'] = ''
            item['s_tel'] = ''

            item['name'] = pro['name']

            item['telephone'] = pro['telephone']
            item['website'] = response.xpath(
                '//a[@compid="Profile_Website"]/@href').extract_first()

            item['streetAddress'] = pro['address']['streetAddress']
            item['addressLocality'] = pro['address']['addressLocality']
            item['addressRegion'] = pro['address']['addressRegion']
            item['addressCountry'] = pro['address']['addressCountry']
            item['postalCode'] = pro['address']['postalCode']

            item['category'] = response.xpath(
                '//span[@itemprop="child"]//span[@itemprop="title"]/text()'
            ).extract_first()
            info_list = response.xpath(
                '//div[@class="professional-info-content"]/text()').extract()
            # Join the info lines into a single comma-separated string.
            item['information'] = ', '.join(info_list)
            item['ratingstars'] = pro['aggregateRating']['ratingValue']

            yield item
        except (KeyError, IndexError, TypeError):
            # Silently skip profiles whose JSON-LD lacks the expected fields.
            pass
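
All four examples assume a HouzzItem declared in the project's items.py, which none of the snippets show. A minimal sketch under that assumption, using only field names that actually appear in the examples:

    # items.py -- hypothetical reconstruction; only the field names are
    # taken from the snippets themselves.
    import scrapy

    class HouzzItem(scrapy.Item):
        # Fields used by Example #1
        name = scrapy.Field()
        telephone = scrapy.Field()
        website = scrapy.Field()
        sms = scrapy.Field()
        email = scrapy.Field()
        streetAddress = scrapy.Field()
        addressLocality = scrapy.Field()
        addressRegion = scrapy.Field()
        addressCountry = scrapy.Field()
        postalCode = scrapy.Field()
        schedules = scrapy.Field()
        category = scrapy.Field()
        information = scrapy.Field()
        paymentAccepted = scrapy.Field()
        ratingstars = scrapy.Field()
        comments = scrapy.Field()
        s_tel = scrapy.Field()
        # Additional fields used by Examples #2-#4
        posttitle = scrapy.Field()
        posthref = scrapy.Field()
        location = scrapy.Field()
        contact = scrapy.Field()
        phone = scrapy.Field()
        url = scrapy.Field()
        sub_url = scrapy.Field()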
Example #2
    def parse_details(self, response):
        items = HouzzItem()
        # Make sure only un-cached / new records are saved in the spreadsheet.
        if "cached" not in response.flags:
            try:
                PhoneNumber = response.xpath(
                    "//div[@compid='Profile_Phone']/span[@class='pro-contact-text']/text()"
                )[0].extract()
            except IndexError:
                PhoneNumber = "-"
            try:
                ContactPersonRAW = response.xpath(
                    "normalize-space(//div[@class='info-list-text']/b[text()='Contact']/../text())"
                )[0].extract()
                ContactPerson = ContactPersonRAW.split(": ")[1]
            except IndexError:
                ContactPerson = "-"
            try:
                LocationRAW = response.xpath(
                    "//div[@class='info-list-text']/b[text()='Location']/..")
                Street = LocationRAW.xpath(
                    "./span[@itemprop='streetAddress']/text()")[0].extract()
                AddressLocality = LocationRAW.xpath(
                    "./span[@itemprop='addressLocality']/text()")[0].extract()
                AddressRegion = LocationRAW.xpath(
                    "./span[@itemprop='addressRegion']/text()")[0].extract()
                PostalCode = LocationRAW.xpath(
                    "./span[@itemprop='postalCode']/text()")[0].extract()
                AddressCountry = LocationRAW.xpath(
                    "./span[@itemprop='addressCountry']/text()")[0].extract()
                Location = ", ".join([Street, AddressLocality, AddressRegion,
                                      PostalCode, AddressCountry])
            except IndexError:
                # Fall back to the block's plain text when the itemprop spans
                # are missing (requires BeautifulSoup from bs4).
                Location = BeautifulSoup(
                    response.xpath(
                        "//div[@class='info-list-text']/b[text()='Location']/.."
                    )[0].extract(), 'lxml').get_text()
                Location = Location.replace("Location: ", "")
         items["category"] = response.meta['category'],
         items["posttitle"] = response.meta['posttitle'],
         items["posthref"] = response.meta['posthref'],
         items["location"] = Location,
         items["contact"] = ContactPerson,
         items["phone"] = PhoneNumber
         yield items
         self.logger.info("Item processed!")
         #yield scrapy.FormRequest(GoogleURL, formdata=DataObject, callback=self.dummy, method="POST", dont_filter=True, meta={"refresh_cache":True})
     else:
         # self.logger.info("Page is cached!")
         pass
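
The "cached" flag tested at the top of parse_details is added by Scrapy's built-in HttpCacheMiddleware to responses served from the local cache. A minimal settings sketch to enable it (standard Scrapy settings; the values shown are assumptions, not from the original project):

    # settings.py -- enable the HTTP cache so response.flags can carry "cached"
    HTTPCACHE_ENABLED = True
    HTTPCACHE_DIR = 'httpcache'        # cache directory inside the project
    HTTPCACHE_EXPIRATION_SECS = 0      # 0 = cached pages never expire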
Example #3
    def parse_content(self, response):
        count = response.xpath("//h1[@class='header-2 header-dt-1 main-title']/text()").extract_first()
        # Strip everything but digits (e.g. "1,234 professionals" -> 1234).
        count = int(re.sub(r"[^\d-]+", "", count))

        if response.meta["first"]:
            self.total_cnt += count
            print "Total:", self.total_cnt
        
        script_element = response.xpath("//div[contains(@class, 'whiteCard pro-card')]")
        
        if len(script_element) > 0:
            self.found_count += len(script_element)

            for row in script_element:
                item = HouzzItem()
                item["url"] = response.url
                try:
                    item["sub_url"] = row.xpath(".//a[@class='pro-title']/@href").extract_first()
                    # Skip profiles that were already queued in this run.
                    if item["sub_url"] in sub_url_list:
                        self.exists_count += 1
                        continue
                    sub_url_list.append(item["sub_url"])

                    item["phone"] = row.xpath(".//span[@class='pro-list-item--text']/text()").extract_first()
                    
                    # extract_first() returns None when a node is absent,
                    # so default each address part to an empty string.
                    addressLocality = row.xpath(".//li[@class='pro-list-item pro-location']//span[@itemprop='addressLocality']/text()").extract_first() or ""
                    addressRegion = row.xpath(".//li[@class='pro-list-item pro-location']//span[@itemprop='addressRegion']/text()").extract_first() or ""
                    postalCode = row.xpath(".//li[@class='pro-list-item pro-location']//span[@itemprop='postalCode']/text()").extract_first() or ""
                    addressCountry = row.xpath(".//li[@class='pro-list-item pro-location']//span[@itemprop='addressCountry']/text()").extract_first() or ""

                    item["location"] = " ".join([
                        addressLocality.encode("utf-8"),
                        addressRegion.encode("utf-8"),
                        postalCode.encode("utf-8"),
                        addressCountry.encode("utf-8"),
                    ])

                    item["name"] = row.xpath(".//a[@class='pro-title']/text()").extract_first().encode("utf-8")
                    req = self.set_proxies(item["sub_url"], self.parse_detail)
                    req.meta["item"] = item
                    yield req
                except:
                    # Broad catch: log the failing page and move on.
                    print "Parsing Error ->", response.url

        else:
            print "Not Found ->", response.url
        
        next_link = response.xpath("//a[@class='navigation-button next']")
        if len(next_link) > 0:
            url = next_link.xpath("@href").extract_first()
            req = self.set_proxies(url, self.parse_content)
            req.meta["first"] = False
            yield req
        else:
            print "Not Found Next Page ->", response.url
            print "Total Items:", self.total_cnt, "Found Items:", self.found_count ,"Exists Count:", self.exists_count
Example #4
    def start_requests(self):
        # req = self.set_proxies("https://www.houzz.co.uk/pro/black-and-milk-residential/black-and-milk-interior-design-london", self.parse_detail)
        # yield req
        # return

        # req = self.set_proxies("http://www.kiadesigns.co.uk/", self.parse_website_url)
        # item = HouzzItem()
        # req.meta["item"] = item
        # yield req
        # return

        # req = self.set_proxies("https://www.houzz.co.uk/professionals/interior-designers/c/Moldova/d/100/p/30", self.parse_content)
        # item = HouzzItem()
        # req.meta["first"] = True
        # yield req
        # return

        if self.category == "email":
            with open(source_csv_file) as csvfile:
                reader = csv.reader(csvfile)
                print("-----------------CSV Read------------------")
                i = 0

                item_list = []
                for item in reader:
                    if i > 0:
                        obj = {}
                        website = item[7]
                        if website != "null":
                            website = website.replace("http://", "")
                            if '@' not in website and '.' in website and 'facebook' not in website and 'google' not in website and 'twitter' not in website:
                                item_list.append("http://" + website)
                    i += 1

                i = 0
                for url in item_list:
                    if i > 20:
                        return
                    req = self.set_proxies(url, self.parse_website_url)
                    yield req
                    i += 1
        elif self.category == "merge":
            item_list = []
            houzzItem = HouzzItem()
            with open(source_csv_file) as csvfile:
                reader = csv.reader(csvfile)
                i = 0
                for item in reader:
                    if i > 0:
                        obj = {}
                        obj["contact"] = item[1]
                        obj["location"] = item[2]
                        obj["name"] = item[3]
                        if item[4] != "null" and item[4].strip() != ",":
                            obj["phone"] = item[4]
                        else:
                            obj["phone"] = ""

                        obj["sub_url"] = item[5]
                        obj["url"] = item[6]
                        if item[7] != "null":
                            obj["website_url"] = item[7]
                        else:
                            obj["website_url"] = ""

                        obj["contact_url"] = []
                        obj["email"] = []
                        item_list.append(obj)
                    i += 1

            email_item_list = []
            with open(dest_csv_file) as csvfile:
                reader = csv.reader(csvfile)
                i = 0
                for item in reader:
                    if i > 0:
                        obj = {}
                        obj["website_url"] = item[1]
                        obj["email"] = item[2].split(",")
                        obj["contact_url"] = item[0]

                        email_item_list.append(obj)
                    i += 1

            # Merge scraped e-mail addresses into the matching profile rows,
            # de-duplicating both the e-mails and the contact-page URLs.
            for item in item_list:
                for email_item in email_item_list:
                    if item["website_url"] == email_item["website_url"]:
                        for sub_email in email_item["email"]:
                            if sub_email not in item["email"]:
                                item["email"].append(sub_email)

                                if email_item["contact_url"] not in item["contact_url"]:
                                    item["contact_url"].append(
                                        email_item["contact_url"])

            with open(email_csv_file, 'w') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow([
                    "Name", "Phone", "Location", "Contact", "Web Site",
                    "Email", "Url", "Country & City URL"
                ])
                for item in item_list:
                    csv_writer.writerow([
                        item["name"],
                        item["phone"],
                        item["location"],
                        ",".join(item["contact_url"]),
                        item["website_url"],
                        ",".join(item["email"]),
                        item["sub_url"],
                        item["url"],
                    ])

        else:
            # Check the empty string first: int("") would raise ValueError.
            if self.category == "" or int(self.category) == 0:
                raise CloseSpider("Index is not valid.")

            if int(self.category) > 7:
                raise CloseSpider("Index must not be greater than 7.")

            # Each index covers a 200-URL slice of start_urls (index 1 ->
            # URLs 0-199, index 2 -> 200-399, ...); slicing clamps at the
            # end of the list automatically.
            index = int(self.category)
            url_list = self.start_urls[(index - 1) * 200:index * 200]

            for url in url_list:
                url = url + "/d/100"
                req = self.set_proxies(url, self.parse_content)
                req.meta["first"] = True
                yield req
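
The category value that selects between the branches above is a spider argument, passed with Scrapy's -a option; the spider name houzz is an assumption:

    scrapy crawl houzz -a category=email   # harvest e-mails from saved websites
    scrapy crawl houzz -a category=merge   # merge the e-mail CSV into the profile CSV
    scrapy crawl houzz -a category=3       # crawl the third 200-URL slice of start_urls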