def __VenueParser(self, venueElement):
    """Parse one venue listing element, scrape its detail page and write the venue to file.

    :param venueElement: lxml element for one search-result listing
                         (assumed shape based on the XPaths below -- TODO confirm).
    Side effects: appends to self.listLink, increments self.index,
    writes the venue via Venue.writeToFile. All errors are caught and
    printed (best-effort scraping loop).
    """
    try:
        img_link = []
        # Listings flagged with a green "label-success" badge are featured ads.
        ad_type = "none"
        if venueElement.find('.//span[@class="label label-success"]') is not None:
            ad_type = "featured"
        divs = venueElement.xpath('./div')
        logo_ = divs[0].find('.//img')
        if logo_ is not None:
            img_link.append(self.__url__ + logo_.get('src'))
        url__ = venueElement.xpath(
            './div[@class="col-xs-9 col-sm-9 col-md-9 listing-body"]//div[@class="h4 listing-heading"]/a'
        )
        # FIX: xpath() returns a list (never None), so the old "!= None" guard was
        # always true and url__[0] raised IndexError on an empty result. Use
        # truthiness to guard the empty-list case instead.
        if url__:
            url__ = url__[0].get('href')
            url__ = self.__url__ + url__
            # Skip detail pages we have already scraped in this run.
            existing = [x for x in self.listLink if url__ in x]
            if not existing:
                self.listLink.append(url__)
                print('Scraping' + ' : ' + url__)
                xmlDoc = Util.getRequestsXML(
                    url__, '//body/div[@class="page-wrapper"]')
                ven = Venue()
                ven.name = xmlDoc.find(
                    './/div[@class="page-heading"]//h1').text
                content = xmlDoc.find(
                    './/div[@class="container page-content"]')
                if content is not None:
                    # Description: drop the image wrapper div, then join the text.
                    des_img = content.find('.//div[@class="article-body"]')
                    if des_img is not None:
                        div_img = des_img.xpath('.//img/parent::div')
                        if len(div_img) > 0:
                            des_img.remove(div_img[0])
                        des = ' '.join(des_img.itertext())
                        ven.description = des
                    ven.country = self._language
                    ven.scrape_page = url__
                    ven.hqdb_featured_ad_type = ad_type
                    offices_ = content.xpath(
                        './/div[@id="offices"]/parent::div/div[@class="row"]')
                    # Coordinates come from data-* attributes on the embedded map.
                    div_maps = offices_[0].find('.//div[@class="google-map"]')
                    if div_maps is not None:
                        ven.latitude = div_maps.get('data-lat')
                        ven.longitude = div_maps.get('data-lng')
                    info_ = offices_[0].xpath('./div[@class="col-md-5 col-sm-6"]')
                    info_ = info_[0]
                    ul = info_.xpath('./ul')
                    # Collect phone numbers, skipping 0800 (freephone) numbers.
                    phones = []
                    for u in ul:
                        phone_ = u.xpath('./li/a')
                        for phone in phone_:
                            if phone.get('title') == 'Phone Number':
                                phone = phone.text.replace(' ', '')
                                if phone.startswith('0800'):
                                    continue
                                else:
                                    phones.append(phone)
                    if len(ul) >= 2:
                        # NOTE(review): guard checks for >= 2 lists but reads ul[0];
                        # looks intentional (address list is first) -- confirm.
                        ul_2 = ul[0]
                        li__ = ul_2.xpath('./li')
                        address = ''
                        for li in li__:
                            if li.get('class') != 'text-bold':
                                address = '\n'.join(li.itertext())
                        # Last three address lines are street / city / postcode.
                        addressArr = address.split('\n')
                        if len(addressArr) >= 3:
                            ven.street = addressArr[len(addressArr) - 3]
                            ven.city = addressArr[len(addressArr) - 2].split(',')[0]
                            ven.zipcode = addressArr[len(addressArr) - 1]
                        if ven.zipcode is not None:
                            # Validate the postcode against the UK postcode regex.
                            results = re.search(self.ukReg, ven.zipcode, flags=0)
                            # Known bad value seen in the source data.
                            if ven.zipcode == 'Rotherham, South Yorkshire':
                                ven.zipcode = ''
                                ven.street = None
                            if results is None:
                                ven.zipcode = None
                    (ven.office_number, ven.office_number2,
                     ven.mobile_number, ven.mobile_number2) = self.processPhones(phones)
                    # Right sidebar: website link, review counts, social links.
                    rightSidebar = xmlDoc.xpath(
                        './/div[@class="col-md-3 page-sidebar"]/div[@class="section"]')
                    for right in rightSidebar:
                        website = right.xpath(
                            './a[contains(text(),"Visit Our Website")]')
                        if len(website) > 0:
                            website = website[0].get('href')
                            # Some "websites" are actually Facebook pages.
                            if website.find('facebook.com') == -1:
                                ven.business_website = website
                            else:
                                ven.facebook = website
                        reviews = right.xpath('./p/strong')
                        if len(reviews) >= 3:
                            ven.hqdb_nr_reviews = reviews[2].text
                            ven.hqdb_review_score = reviews[1].text
                        follows = right.xpath('./ul/li/a')
                        for foll in follows:
                            follow_link = foll.get('href')
                            if follow_link.find('facebook.com') != -1:
                                if ven.facebook is None:
                                    ven.facebook = self.addHTTP(follow_link)
                            if follow_link.find('twitter.com') != -1:
                                if ven.twitter is None:
                                    ven.twitter = self.addHTTP(follow_link)
                    # Gallery images from the carousel.
                    img_find = xmlDoc.xpath(
                        '//div[@id="galleries"]/parent::div/div[@class="carousel slide equal-height"]//img')
                    for ig in img_find:
                        img_link.append(self.__url__ + ig.get('src'))
                    if len(img_link) > 0:
                        ven.img_link = img_link
                    self.index += 1
                    ven.writeToFile(self.folder, self.index, ven.name, False)
            else:
                # Duplicate URL: print a framed banner and skip.
                print('\nduplicate'.upper())
                print('*' * (len(url__) + 4))
                print('*' + ' ' * (len(url__) + 2) + '*')
                print('* ' + url__ + ' *')
                print('*' + ' ' * (len(url__) + 2) + '*')
                print('*' * (len(url__) + 4) + '\n')
    except Exception as ex:
        # Best-effort scrape: log the error and move on to the next venue.
        print(ex)
def __VenueParser(self, jsonItems, hash):
    """Build a Venue from one JSON search-result item and its scraped profile page.

    :param jsonItems: dict for one company from the search API (keys used:
                      serviceSlug, companySlug, id, restriction, companyName,
                      coordinates, logo, satisfaction_rating).
    :param hash: query-string hash appended to the request URL.
    :return: a populated Venue, or None when the URL was already scraped or
             the profile page could not be fetched.
    Side effects: appends the profile URL to self.list_url.
    """
    url = self.__url__ + 'profile/' + jsonItems.get(
        'serviceSlug') + '/' + jsonItems.get(
            'companySlug') + '-' + jsonItems.get('id') + '?hash=' + hash
    # Same URL without the hash is used as the dedup key.
    url__ = self.__url__ + 'profile/' + jsonItems.get(
        'serviceSlug') + '/' + jsonItems.get(
            'companySlug') + '-' + jsonItems.get('id')
    id_ = str(jsonItems.get('id'))
    existing = [x for x in self.list_url if url__ in x]
    if len(existing) > 0:
        print('this venues existed in list')
        return None
    print('Scrapping: ' + url)
    ven = Venue()
    services_v = []
    ven.category = jsonItems.get('restriction').get('name')
    ven.adid = str(jsonItems.get('id'))
    ven.name = jsonItems.get('companyName')
    ven.latitude = jsonItems.get('coordinates').get('lat')
    ven.longitude = jsonItems.get('coordinates').get('long')
    ven.venue_images = jsonItems.get('logo')
    # Review score: keep one decimal for fractional ratings, else as-is.
    points_ = jsonItems.get('satisfaction_rating')
    if str(points_).find('.') >= 0:
        ven.hqdb_review_score = str(round(points_, 1))
    else:
        ven.hqdb_review_score = str(points_)
    ven.country = 'gb'
    ven.scrape_page = url
    self.list_url.append(url__)
    xmlRequest = Util.getRequestsXML(url, '//div[@class="container-fluid"]')
    if xmlRequest is None:
        return None
    stringAddress = xmlRequest.find(
        './/span[@class="profile-meta__address"]').text.replace(',,', ',')
    # FIX: removed leftover debug line that unconditionally overwrote
    # stringAddress with a hardcoded Kingswinford address, so every venue
    # got the same formatted_address/zipcode.
    ven.formatted_address = self.removeNameFromAdd(
        ven.name.strip(), stringAddress).replace('PO BOX', '').replace(
            'PO Box', '').replace('Po Box', '')
    # Provisional postcode: last comma-separated chunk of the address.
    zipArr = stringAddress.split(',')
    ven.zipcode = zipArr[len(zipArr) - 1]
    # UK postcode pattern; prefer the later match in the string when it
    # differs from the last-chunk guess.
    ex_ = re.search(
        '([Gg][Ii][Rr]0[Aa]{2})|((([A-Za-z][0-9]{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9]?[A-Za-z]))))\s?[0-9][A-Za-z]{2})',
        stringAddress, flags=0)
    if ex_ is not None:
        zip_c = ex_.group(0)
        if ven.zipcode != zip_c:
            poZip_c = stringAddress.find(zip_c)
            poZipcode = stringAddress.find(ven.zipcode)
            if len(ven.zipcode.strip()) > 1:
                if poZip_c > poZipcode:
                    ven.zipcode = zip_c
    if ex_ is None:
        # No valid UK postcode anywhere in the address.
        if ven.zipcode is not None:
            ven.zipcode = None
    if ven.formatted_address.endswith(','):
        # FIX: old code sliced off TWO characters, chopping a real character
        # in addition to the trailing comma.
        ven.formatted_address = ven.formatted_address[:-1]
    # Phone: take the first usable number; 07-prefixed numbers are mobiles.
    phoneLabel = xmlRequest.xpath('.//span[@class="phone-label"]/parent::a')
    if len(phoneLabel) > 0:
        for phone_ in phoneLabel:
            phone = phone_.get('data-phone').replace('\n', '').replace(' ', '')
            # NOTE(review): "<= 0" also accepts 'Shownumber' at position 0 --
            # looks like it should be "< 0"; preserved as-is, confirm intent.
            if phone.find('Shownumber') <= 0:
                phone = self.validatePhone(phone)
                for rePhone in self.listPhoneremove:
                    if phone == rePhone:
                        phone = None
                if phone is not None:
                    if phone.startswith('07'):
                        ven.mobile_number = phone
                    else:
                        ven.office_number = phone
                    break
    # Services: two-level list, category span + child service items.
    services = xmlRequest.find('.//ul[@class="advice-area__level-one"]')
    if services is not None:
        list_ser = services.xpath('./li')
        for ser_name in list_ser:
            # feedback 3 : add category service
            cate = ser_name.find('./span').text.strip()
            list_services = ser_name.xpath('./ul/li')
            for service__ in list_services:
                service = Service()
                service.service_category = cate + ' advice'
                service.service = service__.text + ' advice'
                services_v.append(service)
        ven.services = services_v
    # Accreditations (feedback 3), de-duplicated by substring containment.
    certi = []
    cer = xmlRequest.xpath('.//div[@class="profile-team__skill-item collapsed"]')
    for c in cer:
        inCerti = [x_ for x_ in certi if c.text in x_]
        if len(inCerti) <= 0:
            certi.append(c.text)
    ven.accreditations = ', '.join(certi)
    # Social / website links (feedback 3), dispatched by the li class.
    follow = xmlRequest.xpath('//div[@class="profile__follow"]/ul/li')
    for fol in follow:
        values_fol = fol.get('class')
        if values_fol == 'icon-soc-tw':
            ven.twitter = fol.find('./a').get('href')
        if values_fol == 'icon-soc-www':
            ven.business_website = fol.find('./a').get('href')
        if values_fol == 'icon-soc-fb':
            ven.facebook = fol.find('./a').get('href')
    # Description (feedback 3): sentence breaks become ' | ' separators.
    des_1 = xmlRequest.find('.//div[@class="profile__text-block "]/p')
    if des_1 is not None:
        ven.description = ''.join(des_1.itertext()).replace('.\n', ' | ')
    des_2 = xmlRequest.find(
        './/div[@class="profile__text-block spacing-bottom-xs-0"]/p')
    if des_2 is not None:
        ven.description += ' -Our services: ' + ''.join(
            des_2.itertext()).replace('.\n', ' | ')
    if ven.description is not None:
        if ven.description.endswith(' | '):
            # FIX: the suffix is three characters; old code removed only two,
            # leaving a dangling ' |'.
            ven.description = ven.description[:-3]
    return ven