def __VenueParser(self, xmlE, index):
    """Build a Venue from one search-result element and write it to file.

    The listing element supplies the id, thumbnail, detail-page link, name,
    short description and the "city, zipcode" contact line.  The detail page
    (fetched with ``Util.getRequestsXML``) then supplies the street,
    portfolio images, full description, accreditations and services.

    Args:
        xmlE: element for a single search result.  It must contain the
            photo link and the ``search_result_details`` div; those are
            treated as mandatory and are not guarded.
        index: not used by this method (it was only referenced by a
            commented-out debug print).

    Returns:
        None.  The venue is written with ``ven.writeToFile``; a result whose
        detail URL is already in ``self.list_url`` is skipped.
    """
    site = self.__url__

    def joined_text(items):
        # A missing node text counts as empty instead of raising TypeError.
        return ', '.join(item.text or '' for item in items).strip()

    photo_link = xmlE.find(
        './div[@class="search_result_photo"]/div[@class="photo"]/a')
    scrape_page = site + photo_link.get('href')

    # Guard clause: skip pages that were already scraped before doing any
    # further work on this result.
    if any(scrape_page in seen for seen in self.list_url):
        print('This venues exist in list')
        return
    self.list_url.append(scrape_page)

    ven = Venue()
    ven.adid = xmlE.get('id')
    ven.category = 'architecural technologist'
    ven.venue_images = site + photo_link.find('./img').get('src')
    ven.scrape_page = scrape_page

    details = xmlE.find('.//div[@class="search_result_details"]')
    ven.name = details.find('./div[@class="title"]/h3/a').text
    contact_text = details.find('./div[@class="contact"]').text
    ven.description = details.find('./div[@class="desc"]').text

    # The contact line ends with "..., <city>, <zipcode>".
    contact_parts = (contact_text or '').split(',')
    if len(contact_parts) >= 2:
        ven.zipcode = self.check_zip(contact_parts[-1])
        ven.city = contact_parts[-2]

    # Scrape the detail page.
    detail_root = Util.getRequestsXML(
        ven.scrape_page, '//div[@class="architect_header"]/parent::div')
    if detail_root is not None:
        header_info = detail_root.find(
            './/div[@class="architect_header"]'
            '/div[@class="architect_header_info"]')
        # Drop the heading so only the address text remains.
        heading = header_info.find('./h2')
        if heading is not None:
            header_info.remove(heading)
        address_text = ' '.join(header_info.itertext())

        # Fall back to the third-from-last address part when the listing
        # gave no usable city; needs at least three comma-separated parts.
        if ven.city is None or len(ven.city) < 2:
            address_parts = address_text.split(',')
            if len(address_parts) >= 3:
                ven.city = address_parts[-3]

        # The street is whatever precedes the city in the address text.
        # When the city cannot be located the street is left unset rather
        # than slicing with a -1 "not found" position.
        city = (ven.city or '').strip()
        city_pos = address_text.find(city) if city else -1
        street = None
        if city_pos > 0:
            street = address_text[:city_pos - 1]
            if street.endswith(','):
                street = street[:-1]
            if 'PO BOX' in street.upper():
                street = None
        ven.street = street

        portfolio = detail_root.find('.//div[@class="architect_portfolio"]')
        portfolio_images = []
        if portfolio is not None:
            portfolio_images = portfolio.xpath(
                './div[@class="architect_portfolio_photo"]//img')
        ven.img_link = [site + image.get('src') for image in portfolio_images]

        # The detail statement replaces the listing description when present.
        statement = detail_root.find(
            './/div[@class="architect_info_statement"]')
        if statement is not None:
            ven.description = ' '.join(statement.itertext())
        else:
            ven.description = ven.description or ''

        info_lists = detail_root.xpath('//div[@class="architect_info"]/ul')
        info_paragraphs = detail_root.xpath('//div[@class="architect_info"]/p')
        info_headings = detail_root.xpath('//div[@class="architect_info"]/h3')

        # When an "Affiliations" heading exists, the last paragraph holds them.
        accreditations = ''
        has_affiliations = any(
            (heading.text or '').strip() == 'Affiliations'
            for heading in info_headings)
        if has_affiliations and info_paragraphs:
            accreditations = info_paragraphs[-1].text
        ven.accreditations = accreditations

        if len(info_paragraphs) >= 2:
            intro = info_paragraphs[0].text
            label = info_paragraphs[1].text
            if intro is not None:
                ven.description += ' ' + intro
            if label is not None and label != 'None':
                ven.description += ' ' + label + ': '

        # The three lists are read as: [0] sectors, [1] services,
        # [2] specialist experience.
        services = []
        if len(info_lists) >= 3:
            sector_items = info_lists[0].xpath('./li')
            service_items = info_lists[1].xpath('./li')
            specialist_items = info_lists[2].xpath('./li')

            specialist_text = ''
            if specialist_items:
                specialist_text = ('. Specialist Experience: ' +
                                   joined_text(specialist_items)).strip()

            # Specialist experience is only appended together with sectors.
            if sector_items:
                ven.description = (ven.description + ' ' +
                                   joined_text(sector_items) + '.' +
                                   specialist_text)
                ven.description = ven.description.replace(
                    ', ,', ', ').replace('..', '.')

            for item in service_items:
                service = Service()
                service.service = item.text
                services.append(service)
        ven.services = services

    # NOTE(review): the original indentation was lost; the write-out below is
    # assumed to run even when the detail page could not be fetched -- confirm.
    ven.pricelist_link = [ven.scrape_page]
    ven.country = 'gb'
    # Geocoding through self.getLatlng was already disabled in the original.

    file_index = self.addIndex()
    try:
        print('Writing index: ' + str(file_index))
        ven.writeToFile(self.folder, file_index, ven.name.replace(':', ''),
                        False)
    except Exception as ex:
        # Best effort: report the failure and carry on with the next result.
        print(ex)
def __VenueParser(self, jsonItems, hash):
    """Build a Venue from one JSON search item plus its profile page.

    Args:
        jsonItems: mapping for a single search result.  The keys read are
            ``serviceSlug``, ``companySlug``, ``id``, ``restriction``,
            ``companyName``, ``coordinates``, ``logo`` and
            ``satisfaction_rating``.
        hash: value appended to the profile URL as ``?hash=...``.

    Returns:
        The populated Venue, or None when the profile was already scraped
        (its URL is in ``self.list_url``) or the profile page could not be
        fetched.
    """
    venue_id = str(jsonItems.get('id'))
    profile_url = (self.__url__ + 'profile/' + jsonItems.get('serviceSlug') +
                   '/' + jsonItems.get('companySlug') + '-' + venue_id)
    url = profile_url + '?hash=' + hash

    # Duplicates are tracked on the URL without the hash.
    if any(profile_url in seen for seen in self.list_url):
        print('this venues existed in list')
        return None

    print('Scrapping: ' + url)
    ven = Venue()
    ven.category = jsonItems.get('restriction').get('name')
    ven.adid = venue_id
    ven.name = jsonItems.get('companyName')
    ven.latitude = jsonItems.get('coordinates').get('lat')
    ven.longitude = jsonItems.get('coordinates').get('long')
    ven.venue_images = jsonItems.get('logo')

    # Ratings with a fractional part are rounded to one decimal place.
    rating = jsonItems.get('satisfaction_rating')
    if '.' in str(rating):
        ven.hqdb_review_score = str(round(rating, 1))
    else:
        ven.hqdb_review_score = str(rating)

    ven.country = 'gb'
    ven.scrape_page = url
    self.list_url.append(profile_url)

    root = Util.getRequestsXML(url, '//div[@class="container-fluid"]')
    if root is None:
        return None

    # Address.  The scraped text is used as-is; a hard-coded debug address
    # that used to overwrite it has been removed.
    address_node = root.find('.//span[@class="profile-meta__address"]')
    raw_address = address_node.text if address_node is not None else None
    address = (raw_address or '').replace(',,', ',')

    formatted = self.removeNameFromAdd(ven.name.strip(), address)
    for po_box in ('PO BOX', 'PO Box', 'Po Box'):
        formatted = formatted.replace(po_box, '')
    if formatted.endswith(','):
        # Drop only the trailing comma (previously two characters were cut).
        formatted = formatted[:-1]
    ven.formatted_address = formatted

    # Zipcode: start from the last comma-separated part, then prefer the
    # UK-postcode regex match when it sits later in the address than that
    # part.  No postcode match at all means no zipcode.
    last_part = address.split(',')[-1]
    ven.zipcode = last_part
    postcode = re.search(
        r'([Gg][Ii][Rr]0[Aa]{2})|((([A-Za-z][0-9]{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9]?[A-Za-z]))))\s?[0-9][A-Za-z]{2})',
        address,
        flags=0)
    if postcode is None:
        ven.zipcode = None
    else:
        candidate = postcode.group(0)
        if (last_part != candidate and len(last_part.strip()) > 1 and
                address.find(candidate) > address.find(last_part)):
            ven.zipcode = candidate

    # Phone: store the first number that validates and is not blacklisted.
    for anchor in root.xpath('.//span[@class="phone-label"]/parent::a'):
        raw_phone = anchor.get('data-phone')
        if raw_phone is None:
            continue
        phone = raw_phone.replace('\n', '').replace(' ', '')
        # NOTE(review): "<= 0" also accepts values that START with
        # 'Shownumber'; kept as in the original -- confirm whether "< 0"
        # was intended.
        if phone.find('Shownumber') <= 0:
            phone = self.validatePhone(phone)
            if phone in self.listPhoneremove:
                phone = None
            if phone is not None:
                if phone.startswith('07'):
                    ven.mobile_number = phone
                else:
                    ven.office_number = phone
                # NOTE(review): original indentation was lost; assumed to
                # stop after the first stored number.
                break

    # Services: every second-level entry becomes a Service under its
    # first-level category; both get an " advice" suffix.
    services = []
    advice_root = root.find('.//ul[@class="advice-area__level-one"]')
    if advice_root is not None:
        for group in advice_root.xpath('./li'):
            label = group.find('./span')
            label_text = label.text if label is not None else None
            category = (label_text or '').strip()
            for leaf in group.xpath('./ul/li'):
                if leaf.text is None:
                    continue
                service = Service()
                service.service_category = category + ' advice'
                service.service = leaf.text + ' advice'
                services.append(service)
    ven.services = services

    # Accreditations: skip a skill whose text is already contained in one
    # that was collected earlier.
    skills = []
    for node in root.xpath(
            './/div[@class="profile-team__skill-item collapsed"]'):
        text = node.text
        if text is None:
            continue
        if not any(text in kept for kept in skills):
            skills.append(text)
    ven.accreditations = ', '.join(skills)

    # Social links: the <li> class decides which venue field gets the href.
    social_fields = {
        'icon-soc-tw': 'twitter',
        'icon-soc-www': 'business_website',
        'icon-soc-fb': 'facebook',
    }
    for entry in root.xpath('//div[@class="profile__follow"]/ul/li'):
        field = social_fields.get(entry.get('class'))
        link = entry.find('./a')
        if field is not None and link is not None:
            setattr(ven, field, link.get('href'))

    # Description: main text block, optionally followed by the services
    # block; sentence breaks (".\n") become " | " separators.
    main_text = root.find('.//div[@class="profile__text-block "]/p')
    if main_text is not None:
        ven.description = ''.join(main_text.itertext()).replace(
            '.\n', ' | ')
    services_text = root.find(
        './/div[@class="profile__text-block spacing-bottom-xs-0"]/p')
    if services_text is not None:
        # Tolerate a missing main block instead of failing on None + str.
        ven.description = ((ven.description or '') + ' -Our services: ' +
                           ''.join(services_text.itertext()).replace(
                               '.\n', ' | '))
    if ven.description is not None and ven.description.endswith(' | '):
        # Remove the whole trailing separator (three characters).
        ven.description = ven.description[:-3]
    return ven