def __VenueParser(self, url, cateName):
    """Scrape one uksecurity-directory.co.uk venue page into a Venue record.

    url      -- absolute URL of the venue detail page
    cateName -- category label to attach to the resulting Venue

    Skips URLs already seen (self.listlink), otherwise parses name, featured
    flag, website/social links, services, description, phones and address,
    then writes the Venue to file via ven.writeToFile. Errors are logged and
    swallowed so one bad page does not abort the whole crawl.
    """
    # Duplicate guard: every URL is remembered; repeats only bump a counter.
    existing = [x for x in self.listlink if url in x]
    self.listlink.append(url)
    if len(existing) > 0:
        self.countduplicate += 1
        print('[INFO] Duplicate count = ' + str(self.countduplicate))
        return
    try:
        print('Scraping url: ' + url)
        xmlDoc = Util.getRequestsXML(url, '//div[@class="gdl-page-content"]')
        xmlDoc = xmlDoc.xpath('//div[@class="gdl-page-content"]/div')[0]
        ven = Venue()
        imgs = []
        ven.category = cateName
        ven.scrape_page = url
        ven.country = self._language
        ven.name = xmlDoc.find('./div/h2').text
        ven.hqdb_featured_ad_type = 'none'
        isFeatured = xmlDoc.find('./div[@class="stickytag"]/img')
        if isFeatured is not None:
            if isFeatured.get('title') == 'Featured Listing':
                ven.hqdb_featured_ad_type = 'featured'
        divInfo = xmlDoc.xpath('./div[@class="listing-details cf"]/div')
        town_ = ''
        area_ = ''
        zipcode = ''
        listPhone_ = []
        for div__ in divInfo:
            label = div__.find('./label')
            if label is None:
                continue
            label_ = label.text
            if label_ == 'Business Website Address:':
                website = div__.find('./span/a')
                if website is not None:
                    website = website.get('href')
                    isFacebook = website.find('facebook.com')
                    # BUG FIX: was 'twiter.com' (misspelled), so Twitter
                    # links were never recognised and leaked into
                    # business_website.
                    isTwitter = website.find('twitter.com')
                    if isFacebook == -1 and isTwitter == -1:
                        ven.business_website = website
                    else:
                        if isFacebook != -1:
                            ven.facebook = website
                        # BUG FIX: was `if ven.twitter != -1` — compared the
                        # Venue attribute instead of the find() result.
                        if isTwitter != -1:
                            ven.twitter = website
            if label_ == 'Security Services:':
                sers = []
                for ser in div__.xpath('./span/a'):
                    serv = Service()
                    serv.service = ser.text
                    sers.append(serv)
                if len(sers) > 0:
                    ven.services = sers
                    ven.pricelist_link = [ven.scrape_page]
            if label_ == 'Long Business Description:':
                des = div__.find('./span')
                if des is not None:
                    ven.description = ' '.join(des.itertext())
            if label_ == 'Business Phone Number:':
                phone = div__.find('./span').text
                # findSplitPhone returns the separator character used
                # between multiple numbers, or None for a single number.
                findsplistPPhone = self.findSplitPhone(phone)
                if findsplistPPhone is None:
                    listPhone_ = [phone]
                else:
                    listPhone_ = phone.split(findsplistPPhone)
                (ven.office_number, ven.office_number2, ven.mobile_number,
                 ven.mobile_number2) = self.processPhones(listPhone_)
            if label_ == 'Postcode:':
                zipcode = div__.find('./span').text
            if label_ == 'Town:':
                town_ = div__.find('./span').text
            if label_ == 'Area:':
                area_ = div__.find('./span').text
        zipcode = self.validateZipcode(zipcode)
        # NOTE(review): removed a leftover per-URL debug hook that printed a
        # blank line for one specific venue URL.
        # Drop venues whose phones validate as non-GB numbers.
        if (ven.office_number == 'NOT_GB' or ven.office_number2 == 'NOT_GB'
                or ven.mobile_number == 'NOT_GB'
                or ven.mobile_number2 == 'NOT_GB'):
            return
        # Some listings repeat a phone number in the Town field; blank it.
        for p in listPhone_:
            if p == town_:
                town_ = ''
                break
        ven.zipcode = zipcode
        ven.formatted_address = ', '.join([area_, town_, zipcode])
        ven.formatted_address = self.refixFormatAddress(
            ven.formatted_address.replace('0000000', ''))
        for thumb in xmlDoc.xpath('./div[@class="listing-thumbnail"]//a/img'):
            imgs.append(thumb.get('src'))
        for img in xmlDoc.xpath('./div[@class="extra-images"]//a/img'):
            imgs.append(img.get('src'))
        if len(imgs) > 0:
            ven.img_link = imgs
        self.index = self.index + 1
        ven.writeToFile(self.folder, self.index, ven.name, False)
    except Exception as ex:
        print('[ERROR] ' + url + ': ' + str(ex))
def __VenueParser(self, url):
    """Scrape one French locksmith listing page into a Venue and write it out.

    url -- absolute URL of the listing page.

    Parses address, phone (mobile vs landline by prefix), website/social
    links, opening hours, rating and description, geocodes the assembled
    address via self.getLatlng, and writes the Venue to file. Errors are
    logged and swallowed.
    """
    try:
        print('Scraping: ' + url)
        xmlDoc = Util.getRequestsXML(url, '//div[@id="main"]')
        if xmlDoc is None:
            return
        ven = Venue()
        ven.scrape_page = url
        ven.country = self._language
        ven.name = xmlDoc.find('.//h1').text
        overview = xmlDoc.find('.//div[@class="overview"]')
        for opt in overview.xpath('./div[@class="options row"]/div'):
            for div__ in opt.xpath('./div'):
                strong = div__.find('./strong')
                if strong is None:
                    continue
                if strong.text == 'Adresse:':
                    street = div__.find('./span[@itemprop="streetAddress"]')
                    if street is not None:
                        ven.street = street.text
                    zipcode = div__.find('./span[@itemprop="postalCode"]')
                    if zipcode is not None:
                        ven.zipcode = zipcode.text
                    city = div__.find('./span[@itemprop="addressLocality"]')
                    if city is not None:
                        ven.city = city.text
                if strong.text == 'Téléphone:':
                    phone = ''.join(div__.itertext()).replace(
                        ' ', '').replace('.', '').replace('Téléphone:', '')
                    # French mobiles start 06/07 (or 6/7 when the leading
                    # zero was dropped); everything else is a landline.
                    if phone.startswith(('06', '07', '7', '6')):
                        ven.mobile_number = self.validatePhone__(phone)
                    else:
                        ven.office_number = self.validatePhone__(phone)
                if strong.text == 'Site Web:':
                    website = ''.join(div__.itertext()).replace('Site Web:', '')
                    if website.find('facebook.com') != -1:
                        ven.facebook = self.addHTTP(website)
                        continue
                    if website.find('twitter.com') != -1:
                        ven.twitter = self.addHTTP(website)
                        continue
                    ven.business_website = self.addHTTP(website)
                if strong.text == 'Horaires:':
                    openning = ''.join(div__.itertext()).replace('Horaires:', '')
                    # Known "always open" formats map to a canonical string.
                    for format in self.openFormat:
                        if openning.strip() == format:
                            ven.opening_hours_raw = 'Lundi au Dimanche: 0h00 - 24h00'
                    if ven.opening_hours_raw is None:
                        ven.opening_hours_raw = openning
                if strong.text == 'Votez pour ce serrurier:':
                    score = div__.find(
                        './span[@class="thevotescount"]/span[@itemprop="ratingValue"]')
                    if score is not None:
                        ven.hqdb_review_score = score.text
        descElement = overview.find('./div[@class="contenu"]')
        if descElement is not None:
            ven.description = ' | '.join(descElement.itertext())
        if ven.description is not None:
            ven.description = ven.description.strip()
            if ven.description.startswith('|'):
                ven.description = ven.description[1:]
            if ven.description.endswith('|'):
                ven.description = ven.description[:-1]
            ven.description = ven.description.replace('| \n |', '|')
            # Fewer than three words is noise, not a description.
            if len(ven.description.split()) < 3:
                ven.description = None
        address = []
        if ven.street is not None and len(ven.street.strip()) > 0:
            address.append(ven.street)
        if ven.city is not None and len(ven.city.strip()) > 0:
            address.append(ven.city)
        if ven.zipcode is not None and len(ven.zipcode.strip()) > 0:
            address.append(ven.zipcode)
        (ven.latitude, ven.longitude) = self.getLatlng(', '.join(address))
        ven.is_get_by_address = True
        self.index += 1
        ven.writeToFile(self.folder, self.index, ven.name, False)
    except Exception as ex:
        print('[ERROR] ' + url)
        print(ex)
def __VenueParser(self, venueElement):
    """Parse one listing card, follow its detail page, and write a Venue.

    venueElement -- lxml element for one search-result card; the detail URL
    is extracted from it, de-duplicated against self.listLink, then the
    detail page is scraped (description, geo coords, phones, address,
    website/social links, reviews, gallery images). Errors are printed and
    swallowed.
    """
    try:
        img_link = []
        ad_type = "none"
        if venueElement.find('.//span[@class="label label-success"]') is not None:
            ad_type = "featured"
        divs = venueElement.xpath('./div')
        logo_ = divs[0].find('.//img')
        if logo_ is not None:
            img_link.append(self.__url__ + logo_.get('src'))
        anchors = venueElement.xpath(
            './div[@class="col-xs-9 col-sm-9 col-md-9 listing-body"]'
            '//div[@class="h4 listing-heading"]/a')
        # BUG FIX: xpath() returns a list and never None, so the old
        # `!= None` check was always true and anchors[0] could raise
        # IndexError on an empty result. Guard on length instead.
        if len(anchors) > 0:
            url__ = self.__url__ + anchors[0].get('href')
            existing = [x for x in self.listLink if url__ in x]
            if len(existing) <= 0:
                self.listLink.append(url__)
                print('Scraping' + ' : ' + url__)
                xmlDoc = Util.getRequestsXML(
                    url__, '//body/div[@class="page-wrapper"]')
                ven = Venue()
                ven.name = xmlDoc.find('.//div[@class="page-heading"]//h1').text
                content = xmlDoc.find('.//div[@class="container page-content"]')
                if content is not None:
                    des_img = content.find('.//div[@class="article-body"]')
                    if des_img is not None:
                        # Drop the embedded image container before
                        # flattening the body text.
                        div_img = des_img.xpath('.//img/parent::div')
                        if len(div_img) > 0:
                            des_img.remove(div_img[0])
                        ven.description = ' '.join(des_img.itertext())
                    ven.country = self._language
                    ven.scrape_page = url__
                    ven.hqdb_featured_ad_type = ad_type
                    offices_ = content.xpath(
                        './/div[@id="offices"]/parent::div/div[@class="row"]')
                    div_maps = offices_[0].find('.//div[@class="google-map"]')
                    if div_maps is not None:
                        ven.latitude = div_maps.get('data-lat')
                        ven.longitude = div_maps.get('data-lng')
                    info_ = offices_[0].xpath(
                        './div[@class="col-md-5 col-sm-6"]')[0]
                    ul = info_.xpath('./ul')
                    phones = []
                    for u in ul:
                        for a in u.xpath('./li/a'):
                            if a.get('title') == 'Phone Number':
                                number = a.text.replace(' ', '')
                                # 0800 freephone numbers are not kept.
                                if not number.startswith('0800'):
                                    phones.append(number)
                    if len(ul) >= 2:
                        address = ''
                        for li in ul[0].xpath('./li'):
                            if li.get('class') != 'text-bold':
                                address = '\n'.join(li.itertext())
                        addressArr = address.split('\n')
                        if len(addressArr) >= 3:
                            ven.street = addressArr[len(addressArr) - 3]
                            ven.city = addressArr[len(addressArr) - 2].split(',')[0]
                            ven.zipcode = addressArr[len(addressArr) - 1]
                            if ven.zipcode is not None:
                                results = re.search(self.ukReg, ven.zipcode, flags=0)
                                # Known bad listing: locality text in the
                                # postcode slot.
                                if ven.zipcode == 'Rotherham, South Yorkshire':
                                    ven.zipcode = ''
                                    ven.street = None
                                if results is None:
                                    ven.zipcode = None
                    (ven.office_number, ven.office_number2, ven.mobile_number,
                     ven.mobile_number2) = self.processPhones(phones)
                    for right in xmlDoc.xpath(
                            './/div[@class="col-md-3 page-sidebar"]/div[@class="section"]'):
                        website = right.xpath(
                            './a[contains(text(),"Visit Our Website")]')
                        if len(website) > 0:
                            website = website[0].get('href')
                            if website.find('facebook.com') == -1:
                                ven.business_website = website
                            else:
                                ven.facebook = website
                        reviews = right.xpath('./p/strong')
                        if len(reviews) >= 3:
                            ven.hqdb_nr_reviews = reviews[2].text
                            ven.hqdb_review_score = reviews[1].text
                        for foll in right.xpath('./ul/li/a'):
                            follow_link = foll.get('href')
                            if follow_link.find('facebook.com') != -1:
                                if ven.facebook is None:
                                    ven.facebook = self.addHTTP(follow_link)
                            if follow_link.find('twitter.com') != -1:
                                if ven.twitter is None:
                                    ven.twitter = self.addHTTP(follow_link)
                    for ig in xmlDoc.xpath(
                            '//div[@id="galleries"]/parent::div'
                            '/div[@class="carousel slide equal-height"]//img'):
                        img_link.append(self.__url__ + ig.get('src'))
                    if len(img_link) > 0:
                        ven.img_link = img_link
                    self.index += 1
                    ven.writeToFile(self.folder, self.index, ven.name, False)
            else:
                # Duplicate: print a boxed banner around the URL.
                print('\nduplicate'.upper())
                print('*' * (len(url__) + 4))
                print('*' + ' ' * (len(url__) + 2) + '*')
                print('* ' + url__ + ' *')
                print('*' + ' ' * (len(url__) + 2) + '*')
                print('*' * (len(url__) + 4) + '\n')
    except Exception as ex:
        print(ex)
def __VenueParser(self, jsonItems, hash):
    """Build a Venue from one unbiased.co.uk search-result JSON item.

    jsonItems -- dict with serviceSlug / companySlug / id / coordinates /
                 restriction / satisfaction_rating etc. (schema assumed from
                 usage — confirm against the caller)
    hash      -- query-string hash appended to the profile URL

    Returns the populated Venue, or None when the URL was already scraped
    or the profile page could not be fetched.
    """
    url = (self.__url__ + 'profile/' + jsonItems.get('serviceSlug') + '/'
           + jsonItems.get('companySlug') + '-' + jsonItems.get('id')
           + '?hash=' + hash)
    url__ = (self.__url__ + 'profile/' + jsonItems.get('serviceSlug') + '/'
             + jsonItems.get('companySlug') + '-' + jsonItems.get('id'))
    id_ = str(jsonItems.get('id'))
    existing = [x for x in self.list_url if url__ in x]
    if len(existing) > 0:
        print('this venues existed in list')
        return None
    print('Scrapping: ' + url)
    ven = Venue()
    services_v = []
    ven.category = jsonItems.get('restriction').get('name')
    ven.adid = str(jsonItems.get('id'))
    ven.name = jsonItems.get('companyName')
    ven.latitude = jsonItems.get('coordinates').get('lat')
    ven.longitude = jsonItems.get('coordinates').get('long')
    ven.venue_images = jsonItems.get('logo')
    points_ = jsonItems.get('satisfaction_rating')
    # Keep one decimal for fractional scores, the bare value otherwise.
    if str(points_).find('.') >= 0:
        ven.hqdb_review_score = str(round(points_, 1))
    else:
        ven.hqdb_review_score = str(points_)
    ven.country = 'gb'
    ven.scrape_page = url
    self.list_url.append(url__)
    xmlRequest = Util.getRequestsXML(url, '//div[@class="container-fluid"]')
    if xmlRequest is None:
        return None
    stringAddress = xmlRequest.find(
        './/span[@class="profile-meta__address"]').text.replace(',,', ',')
    # BUG FIX: removed a leftover hard-coded debug address
    # ('1st and 2nd Floor Offices, 446 - 452 High street, ...') that
    # overwrote the scraped address for every venue.
    ven.formatted_address = self.removeNameFromAdd(
        ven.name.strip(), stringAddress).replace('PO BOX', '').replace(
            'PO Box', '').replace('Po Box', '')
    zipArr = stringAddress.split(',')
    ven.zipcode = zipArr[len(zipArr) - 1]
    # UK postcode pattern; prefer the regex hit when it appears later in
    # the address than the naive last-comma-segment guess.
    ex_ = re.search(
        '([Gg][Ii][Rr]0[Aa]{2})|((([A-Za-z][0-9]{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9]?[A-Za-z]))))\s?[0-9][A-Za-z]{2})',
        stringAddress, flags=0)
    if ex_ is not None:
        zip_c = ex_.group(0)
        if ven.zipcode != zip_c:
            poZip_c = stringAddress.find(zip_c)
            poZipcode = stringAddress.find(ven.zipcode)
            if len(ven.zipcode.strip()) > 1:
                if poZip_c > poZipcode:
                    ven.zipcode = zip_c
    else:
        ven.zipcode = None
    if ven.formatted_address.endswith(','):
        # BUG FIX: was slicing off two characters; only the single trailing
        # comma should be removed.
        ven.formatted_address = ven.formatted_address[:-1]
    for phone_ in xmlRequest.xpath('.//span[@class="phone-label"]/parent::a'):
        phone = phone_.get('data-phone').replace('\n', '').replace(' ', '')
        # Skip "Show number" placeholders that hide the real number.
        if phone.find('Shownumber') <= 0:
            phone = self.validatePhone(phone)
            for rePhone in self.listPhoneremove:
                if phone == rePhone:
                    phone = None
            if phone is not None:
                if phone.startswith('07'):
                    ven.mobile_number = phone
                else:
                    ven.office_number = phone
                break
    services = xmlRequest.find('.//ul[@class="advice-area__level-one"]')
    if services is not None:
        for ser_name in services.xpath('./li'):
            # feedback 3 : add category service
            cate = ser_name.find('./span').text.strip()
            for service__ in ser_name.xpath('./ul/li'):
                service = Service()
                service.service_category = cate + ' advice'
                service.service = service__.text + ' advice'
                services_v.append(service)
        ven.services = services_v
    # append accreditations (feedback 3), de-duplicated by substring.
    certi = []
    for c in xmlRequest.xpath(
            './/div[@class="profile-team__skill-item collapsed"]'):
        inCerti = [x_ for x_ in certi if c.text in x_]
        if len(inCerti) <= 0:
            certi.append(c.text)
    ven.accreditations = ', '.join(certi)
    # add follow : fb, twi, website (feedback 3)
    for fol in xmlRequest.xpath('//div[@class="profile__follow"]/ul/li'):
        values_fol = fol.get('class')
        if values_fol == 'icon-soc-tw':
            ven.twitter = fol.find('./a').get('href')
        if values_fol == 'icon-soc-www':
            ven.business_website = fol.find('./a').get('href')
        if values_fol == 'icon-soc-fb':
            ven.facebook = fol.find('./a').get('href')
    # description (feedback 3)
    des_1 = xmlRequest.find('.//div[@class="profile__text-block "]/p')
    if des_1 is not None:
        ven.description = ''.join(des_1.itertext()).replace('.\n', ' | ')
    des_2 = xmlRequest.find(
        './/div[@class="profile__text-block spacing-bottom-xs-0"]/p')
    if des_2 is not None:
        # BUG FIX: guard against des_1 being absent — `None += str`
        # previously raised TypeError.
        extra = ' -Our services: ' + ''.join(
            des_2.itertext()).replace('.\n', ' | ')
        ven.description = (ven.description or '') + extra
    if ven.description is not None:
        if ven.description.endswith(' | '):
            # BUG FIX: the ' | ' suffix is three characters; the old code
            # stripped only two, leaving a trailing space.
            ven.description = ven.description[:-3]
    return ven