def __VenueParser(self, urlVenues):
    print 'Scraping: ' + urlVenues
    urlVenues = urlVenues.split('/#/')
    if len(urlVenues) == 2:
        ven = Venue()
        ven.category = urlVenues[1]
        xmlDoc = Util.getRequestsXML(urlVenues[0], '//div[@id="container"]')
        subTable = xmlDoc.xpath('./div/table')
        if len(subTable) == 3:
            subTable = subTable[1]
            subtd = subTable.xpath('./tr/td/table/tr/td')
            if len(subtd) == 2:
                subtd = subtd[1]
        return ven
    else:
        return None
def __VenueParser(self, url):
    try:
        if self.checkDuplicate(self.urlList, url) == False:
            ven = Venue()
            print '[SCRAPING]: ' + url
            #ven.scrape_page = url
            ven.country = self._language
            self.urlList.append(url)
            xmlDoc = Util.getRequestsXML(url, '//div[@class="content"]/main')
            xmlDoc = xmlDoc.find('./main')
            name = xmlDoc.find('./h2')
            ven.name = name.text
            des = xmlDoc.find('./div[@class="clearfix"]')
            if des is not None:
                imgs = []
                img = des.xpath('.//img')
                for im in img:
                    imgs.append(self.__url__ + im.get('src'))
                    des.remove((im.getparent()).getparent())
                if len(imgs) > 0:
                    ven.img_link = imgs
                ven.description = ''.join(des.itertext())
            map_and_phone_number = xmlDoc.xpath('./div/div[@class="footer row"]')
            isMulti = False
            if len(map_and_phone_number) > 1:
                isMulti = True
            countVenues_ = 0
            for clone_ in map_and_phone_number:
                countVenues_ += 1
                self._cloneVenues(ven, clone_, countVenues_, url, isMulti)
        else:
            print '[DUPLICATE]: ' + url
    except Exception, ex:
        print '[ERROR]: ' + url
        print ex
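# NOTE: checkDuplicate is defined elsewhere in the repo. Judging from how it is
# called above (a list and a url, returning a boolean), a minimal sketch might
# look like this -- hypothetical, not the actual implementation:
def checkDuplicate(self, urlList, url):
    # True when this url was already scraped in the current run
    return url in urlList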
def __VenueParser(self, item, city):
    existing = [x for x in self.listLink if item in x]
    self.listLink.append(item)
    if len(existing) <= 0:
        try:
            xmlDoc = Util.getRequestsXML(item, '/html')
            ven = Venue()
            ven.scrape_page = item
            #ven.city = city
            ven.name = xmlDoc.xpath('//div[@class="row top-buffer"]/h3')[0].text
            (ven.latitude, ven.longitude) = self.getLatlng(xmlDoc)
            xmlcontent = xmlDoc.find('.//div[@class="tab-content"]')
            services_schedule_info = xmlcontent.xpath(
                './div/div[@class="row top-buffer"]/h4/parent::div')[0]
            if services_schedule_info is not None:
                services_schedule_info = ''.join(
                    services_schedule_info.itertext()).split('\n')
                for it in services_schedule_info:
                    if it.find('Style:') != -1:
                        it = it[0:it.find('Schedule')].strip()
                        ser_name = it[it.find('Style:') + len('Style:'):
                                      it.find('Ability level')]
                        cost = len(it)
                        cost_ = ['Cost:', 'Concession cost:']
                        char_cost = ''
                        for c in cost_:
                            if it.find(c) != -1:
                                cost = it.find(c)
                                char_cost = c
                                break
                        if cost == -1:
                            cost = len(it)
                        ser_des = it[it.find('Ability level:') +
                                     len('Ability level:'):cost]
                        ser_price = it[cost + len(char_cost):
                                       it.find('GBP') + len('GBP')]
                        ser = Service()
                        ser.service = ser_name
                        ser.description = ser_des
                        ser.price = ser_price.replace('-', '')
                        ven.services = [ser]
                    if it.find('a.m.') != -1 or it.find('p.m.') != -1:
                        ven.opening_hours_raw = it.strip().replace(
                            '.Monday', ' | Monday').replace(
                            '.Tuesday', ' | Tuesday').replace(
                            '.Wednesday', ' | Wednesday').replace(
                            '.Thursday', ' | Thursday').replace(
                            '.Friday', ' | Friday').replace(
                            '.Saturday', ' | Saturday').replace(
                            '.Sunday', ' | Sunday')
                        ven.opening_hours_raw = self.formatOpenhour(
                            ven.opening_hours_raw)
            address = xmlcontent.find('.//address')
            if address is not None:
                address = ''.join(address.itertext()).replace(
                    'United Kingdom', '').strip()
                address = self.validateAddress(address)
                ven.country = 'gb'
                # Republic of Ireland listings get 'ie'; Northern Ireland stays 'gb'
                if address.upper().find('IRELAND') != -1:
                    if address.upper().find('NORTHERN IRELAND') == -1:
                        ven.country = 'ie'
                if address.endswith(','):
                    address = address[0:-1]
                ven.formatted_address = address
            posted = xmlcontent.find('./div/div[@class="row"]/p')
            imgs = xmlcontent.xpath('.//a/img')
            img_ = []
            for img in imgs:
                img_.append(img.get('src'))
            ven.img_link = img_
            if posted is not None:
                ven.hqdb_ad_posted = posted.text.replace('Last updated', '')
                split_posted = ven.hqdb_ad_posted.split(',')
                if len(split_posted) >= 3:
                    ven.hqdb_ad_posted = ', '.join(
                        split_posted[0:len(split_posted) - 1])
            ven.category = self.category
            des_info = xmlcontent.xpath('//div[@class="row top-buffer"]/h3')[1]
            des_info = des_info.getparent()
            des__ = des_info.xpath('./p')
            ven.pricelist_link = [ven.scrape_page]
            ven.hqdb_featured_ad_type = 'none'
            ven.description = ''
            for des in des__:
                ven.description += ''.join(des.itertext()) + ' '
                des_info.remove(des)
            info = '____'.join(des_info.itertext())
            a = des_info.find('./a')
            if a is not None:
                a = a.get('href')
                if a.find('facebook.com') == -1:
                    ven.business_website = a
                else:
                    if a.startswith('http:'):
                        a = a.replace('http:', 'https:')
                    ven.facebook = a
            info = info.split('__')
            for inf in range(0, len(info)):
                if info[inf] == 'Qualifications:':
                    ven.accreditations = info[inf + 2]
                if info[inf] == 'Phone:':
                    phone = info[inf + 2].strip()
                    pattern = '(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)'
                    find_charSplit = self.findSplitPhone(phone)
                    if find_charSplit is None:
                        issMail = re.search(pattern, phone, flags=0)
                        if issMail is not None:
                            ven.business_email = issMail.group(0)
                            continue
                        phone = phone.replace('Mobile:', '').replace(
                            'ext.225', '').replace('O7', '07').replace(' ', '')
                        if (phone.startswith('07') or phone.startswith('447')
                                or phone.startswith('+447')
                                or phone.startswith('00447')
                                or phone.startswith('+44(0)7')
                                or phone.startswith('44(0)7')
                                or phone.startswith('004407')):
                            ven.mobile_number = self.validatePhone__(phone, ven.country)
                        else:
                            ven.office_number = self.validatePhone__(phone, ven.country)
                    else:
                        phone = phone.split(find_charSplit)
                        for p in phone:
                            issMail = re.search(pattern, p, flags=0)
                            if issMail is not None:
                                ven.business_email = issMail.group(0)
                                continue
                            p = p.replace('Mobile', '').replace(
                                'ext225', '').replace('O7', '07').replace(' ', '')
                            if (p.startswith('07') or p.startswith('447')
                                    or p.startswith('+447')
                                    or p.startswith('00447')
                                    or p.startswith('+44(0)7')
                                    or p.startswith('44(0)7')
                                    or p.startswith('004407')):
                                if ven.mobile_number is not None:
                                    ven.mobile_number2 = self.validatePhone__(p, ven.country)
                                else:
                                    ven.mobile_number = self.validatePhone__(p, ven.country)
                            else:
                                if ven.office_number is not None:
                                    ven.office_number2 = self.validatePhone__(p, ven.country)
                                else:
                                    ven.office_number = self.validatePhone__(p, ven.country)
            isPhoneOverSea = self.checkPhoneOverSea([
                ven.office_number, ven.office_number2,
                ven.mobile_number, ven.mobile_number2
            ])
            if isPhoneOverSea == False:
                index = self.addIndex()
                print str(index) + ' Scraping: ' + city + '---' + ven.scrape_page
                ven.writeToFile(self.folder, index, ven.name, False)
        except Exception, ex:
            print ex
    return
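# NOTE: findSplitPhone is implemented elsewhere in the repo. Since its return
# value is passed straight to str.split(), it most likely returns the delimiter
# separating multiple phone numbers in one field, or None for a single number.
# A minimal sketch under that assumption (the delimiter set is hypothetical):
def findSplitPhone(self, phone):
    for delimiter in ('/', ';', ',', ' or '):  # assumed delimiter candidates
        if delimiter in phone:
            return delimiter
    return None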
def __VenueParser(self, element):
    try:
        self.urlmarks += 1
        print '[COUNT]: ' + str(self.urlmarks)
        featured = 'featured'
        onclick = element.xpath('./td//a/@onclick')[0]
        detailLink = onclick[onclick.find("MM_openBrWindow('") +
                             len("MM_openBrWindow('"):onclick.find("','','")]
        detailLink = self.__url__ + '/' + detailLink
        xmlDoc = Util.getRequestsXML(detailLink, '//table[@cellspacing="3"]')
        xmlDoc = xmlDoc[0]
        ven = Venue()
        ven.hqdb_featured_ad_type = featured
        ven.country = self._language
        ven.scrape_page = detailLink
        detail_ = xmlDoc.xpath('./tr/td/table')
        detail_1 = detail_[0]
        detail_2 = detail_[2]
        basicInfo = detail_1.find('./tr/td/table/tr/td[@class="text"]')
        email_website = basicInfo.getparent().xpath('//table//div[@align="right"]/a')
        for aTag in email_website:
            link__ = aTag.get('href')
            if link__.find('mailto') != -1:
                ven.business_email = link__.replace('mailto:', '')
            elif link__.find('http') != -1:
                ven.business_website = link__
        openxml = detail_1.xpath('./tr')
        openxml = openxml[2].find('./td/table')  # opening-hours table
        rows = openxml.xpath('./tr')
        # German weekday -> Italian weekday (the site serves both languages)
        dayofweek = {
            'Montag': 'Lunedì',
            'Dienstag': 'Martedì',
            'Mittwoch': 'Mercoledì',
            'Donnerstag': 'Giovedì',
            'Freitag': 'Venerdì',
            'Samstag': 'Sabato',
            'Sonntag': 'Domenica'
        }
        opening_hour_array = []
        for row in rows:
            tds = row.xpath('./td')
            if len(tds) > 0 and tds[0].text is not None:
                if dayofweek.get(tds[0].text, "NULL") != "NULL":
                    record = ''
                    count_ = 0
                    for td in tds:
                        if dayofweek.get(td.text, "NULL") != "NULL":
                            record += dayofweek.get(td.text) + ": "
                        elif td.text.strip() != '-':
                            record += td.text.replace('.', ':') + ", "
                        else:
                            count_ += 1
                    record = record.strip()
                    if record.endswith(','):
                        record = record[0:-1]
                    if count_ < 3:
                        opening_hour_array.append(record)
        ven.opening_hours_raw = ' | '.join(opening_hour_array)
        basicInfo_ = ''.join(basicInfo.itertext()).split('\n')
        if basicInfo_[len(basicInfo_) - 1].find('Fax') != -1:
            basicInfo_ = basicInfo_[0:-1]
        phonenumber = basicInfo_[-1].strip().replace('Tel.', '').replace(' ', '')
        zip_ci = basicInfo_[-2]
        street = basicInfo_[-3]
        contactName = basicInfo_[-4]
        name = ' '.join(basicInfo_[0:-4])
        (ven.office_number, ven.office_number2, ven.mobile_number,
         ven.mobile_number2) = self.processPhone([phonenumber])
        (ven.city, ven.zipcode) = self.processZipCity(zip_ci)
        if ven.zipcode is not None:
            if self.validateZip(ven.zipcode) is None:
                return
        ven.street = street
        ven.name_of_contact = contactName
        ven.name = name
        services = detail_2.xpath('./tr[@valign="top"]/td')
        if len(services) > 0:
            services = services[0].text
            ven.services = self.__ServicesParser(services)
        self.index += 1
        ven.writeToFile(self.folder, self.index, ven.name, False)
    except Exception, ex:
        print ex
def __VenueParser_2(self, element):
    try:
        featured = 'none'
        self.urlmarks += 1
        print '[COUNT]: ' + str(self.urlmarks)
        ven = Venue()
        ven.scrape_page = self._url_lstVenues + '#' + str(self.urlmarks)
        td = element.find('./td')
        div = td.find('./div')
        if div is not None:
            a = div.find('./a').get('href')
            ven.business_website = a
            td.remove(div)
        basicInfo = ''.join(td.itertext())
        street = ''
        contactName = ''
        name = ''
        basicInfo_ = basicInfo.split('\n')
        if basicInfo_[len(basicInfo_) - 1].find('Fax') != -1:
            basicInfo_ = basicInfo_[0:-1]
        # walk the info block from the bottom: phone, then city/zip, then street
        positionInArr = -1
        phoneNumber = basicInfo_[positionInArr].strip().replace(
            'Tel.', '').replace(' ', '')
        positionInArr -= 1
        local = basicInfo_[positionInArr]
        positionInArr -= 1
        street = basicInfo_[positionInArr]
        if self.isContactName(street) == True:
            contactName = street
            street = ''
            positionInArr += 1
        positionInArr -= 1
        if contactName == '':
            contactName = basicInfo_[positionInArr]
            if self.isContactName(contactName) == False:
                positionInArr += 1
                contactName = ''
        name = ' '.join(basicInfo_[0:positionInArr])
        (ven.office_number, ven.office_number2, ven.mobile_number,
         ven.mobile_number2) = self.processPhone([phoneNumber])
        (ven.city, ven.zipcode) = self.processZipCity(local)
        if ven.zipcode is not None:
            if self.validateZip(ven.zipcode) is None:
                return
        ven.street = street
        ven.name_of_contact = contactName
        ven.name = name
        ven.country = self._language
        ven.hqdb_featured_ad_type = featured
        self.venues[ven.scrape_page] = ven
    except Exception, ex:
        print ex
def __VenueParser(self, xmlE, index):
    ven = Venue()
    ven.adid = xmlE.get('id')
    ven.category = 'architectural technologist'
    photos = xmlE.find('./div[@class="search_result_photo"]/div[@class="photo"]/a')
    ven.venue_images = self.__url__ + photos.find('./img').get('src')
    ven.scrape_page = self.__url__ + photos.get('href')
    existing = [x for x in self.list_url if ven.scrape_page in x]
    if len(existing) > 0:
        print 'This venue already exists in list'
        return
    self.list_url.append(ven.scrape_page)
    details_ = xmlE.find('.//div[@class="search_result_details"]')
    ven.name = details_.find('./div[@class="title"]/h3/a').text
    contacts_ = details_.find('./div[@class="contact"]').text
    ven.description = details_.find('./div[@class="desc"]').text
    contact__ = contacts_.split(',')
    if len(contact__) >= 2:
        ven.zipcode = contact__[len(contact__) - 1]
        if ven.zipcode is not None:
            ven.zipcode = self.check_zip(ven.zipcode)
        ven.city = contact__[len(contact__) - 2]
    # scrape the detail page
    xmlInfo = Util.getRequestsXML(
        ven.scrape_page, '//div[@class="architect_header"]/parent::div')
    if xmlInfo is not None:
        addressInfo = xmlInfo.find(
            './/div[@class="architect_header"]/div[@class="architect_header_info"]')
        h2 = addressInfo.find('./h2')
        if h2 is not None:
            addressInfo.remove(h2)
        address__ = ' '.join(addressInfo.itertext())
        if ven.city is None:
            __address = address__.split(',')
            ven.city = __address[len(__address) - 3]
        if len(ven.city) < 2:
            __address = address__.split(',')
            ven.city = __address[len(__address) - 3]
        street = address__[0:address__.find(ven.city.strip()) - 1]
        if street.endswith(','):
            street = street[0:len(street) - 1]
        if street.upper().find('PO BOX') >= 0:
            street = None
        ven.street = street
        img = []
        img_info = xmlInfo.find('.//div[@class="architect_portfolio"]')
        photos_ = img_info.xpath('./div[@class="architect_portfolio_photo"]//img')
        for photo in photos_:
            img.append(self.__url__ + photo.get('src'))
        ven.img_link = img
        sers = []
        des = xmlInfo.find('.//div[@class="architect_info_statement"]')
        ven.description = ' '.join(des.itertext())
        services = xmlInfo.xpath('//div[@class="architect_info"]/ul')
        desP = xmlInfo.xpath('//div[@class="architect_info"]/p')
        affi = xmlInfo.xpath('//div[@class="architect_info"]/h3')
        isAffiliations = ''
        for aff in affi:
            if aff.text.strip() == 'Affiliations':
                isAffiliations = desP[len(desP) - 1].text
        ven.accreditations = isAffiliations
        if len(desP) >= 2:
            p1 = desP[0].text
            p2 = desP[1].text
            if p1 is not None:
                ven.description += ' ' + p1
            if p2 is not None and p2 != 'None':
                ven.description += ' ' + p2 + ': '
        if len(services) >= 3:
            services_ = services[1]
            listSer = services_.xpath('./li')
            listDes_2 = services[2].xpath('./li')
            des_2 = ''
            if len(listDes_2) > 0:
                des_2 = '. Specialist Experience: '
                for des2 in listDes_2:
                    des_2 += des2.text + ', '
                des_2 = des_2.strip()
                if des_2.endswith(','):
                    des_2 = des_2[0:-1]
            listDes = services[0].xpath('./li')
            if len(listDes) > 0:
                desSectors = ''
                for lides in listDes:
                    desSectors += lides.text + ', '
                desSectors = desSectors.strip()
                if desSectors.endswith(','):
                    desSectors = desSectors[0:-1]
                ven.description = ven.description + ' ' + desSectors + '.' + des_2
            ven.description = ven.description.replace(', ,', ', ').replace('..', '.')
            for ser in listSer:
                serv = Service()
                serv.service = ser.text
                sers.append(serv)
            ven.services = sers
        ven.pricelist_link = [ven.scrape_page]
        ven.country = 'gb'
        indexc = self.addIndex()
        try:
            print 'Writing index: ' + str(indexc)
            ven.writeToFile(self.folder, indexc, ven.name.replace(':', ''), False)
        except Exception, ex:
            print ex
    return
def __VenueParser(self, url__, name__):
    print 'Scraping: ' + url__
    existing = [x for x in self.venuesList if url__ in x]
    if len(existing) > 0:
        return None
    city = url__.split('/')[3].replace('city-', '').replace('-', ' ')
    xmlDoc = Util.getRequestsXML(url__, '/html/body')
    if xmlDoc is None:
        return None
    ven = Venue()
    sers = []
    ven.name = name__
    ven.city = city
    ven.scrape_page = url__
    td = xmlDoc.xpath('//td[@class="welcome-padding"]')
    iter__ = ''.join(td[0].itertext())
    iter__ = iter__[iter__.find('Driving School:') + len('Driving School:'):
                    iter__.find('[Edit Text]')].replace('\n', '|').replace('\t', '')
    iter__ = iter__.replace('|||', ' | ')
    rep = '|' + name__
    iter__ = iter__[0:iter__.find(rep)]
    rep = ' | |'
    iter__ = iter__[0:iter__.find(rep)]
    ven.description = iter__
    div = td[0].xpath('./div')
    if len(div) < 5:
        return None
    # div info = position of div gray-line[0] + 1
    div_info = 0
    for div_ in div:
        if div_.find('./script') is not None:
            div_info = 3
    info = div[div_info]
    info_ = ''.join(info.itertext())
    address = info_[0:info_.find('Phone')].replace(name__, '').replace(
        city, ',' + city).replace(',,', ',').replace(', ,', ',').split(',')
    street = ', '.join(address[0:len(address)])
    street = street[0:street.find(city) - 1]
    if street.endswith(','):
        street = street[0:len(street) - 1]
    zipcode = address[len(address) - 1]
    street__ = street.upper()
    if street__.find('PO BOX') == -1:
        ven.street = street.replace('n/a', '').replace('***', '').replace(
            '6 weldon place croy', '').replace(
            'cumbernauld41 napier square bellshill ml4 1tb', '').replace(
            'P.O. Box 1048', '')
        if ven.street == '-':
            ven.street = None
    ven.zipcode = self.validateZipcode(zipcode)
    phone = info_[info_.find('Phone:') + len('Phone:'):
                  info_.find('Fax:')].replace(' ', '')
    if phone.isdigit():
        if phone.startswith('07') or phone.startswith('7'):
            ven.mobile_number = self.validatePhone(phone)
            ven.mobile_number = self.validatePhone__(ven.mobile_number, 'gb')
        else:
            ven.office_number = self.validatePhone(phone)
            ven.office_number = self.validatePhone__(ven.office_number, 'gb')
    services_ = info_[info_.find('Services Offered:') + len('Services Offered:'):
                      info_.find('Areas Served:')].strip().replace(';', ',')
    if services_ != 'None Listed - [Edit]':
        services_ = services_.replace('/', ',').replace(',,', ',').split(',')
        for s in services_:
            name = self.validateServices(s)
            if len(name) >= 5:
                name__ = name.split()
                for n in name__:
                    name = self.validateNameServices(name)
                if len(name.strip()) >= 5:
                    services = Service()
                    services.service = name
                    sers.append(services)
    stringfind = 'No Website'
    if info_.find('No Website') == -1:
        stringfind = 'Website'
    area_coverd = info_[info_.find('Areas Served:') + len('Areas Served:'):
                        info_.find(stringfind)].strip().replace(';', ',')
    if area_coverd != 'None Listed - [Edit]':
        ven.areas_covered = area_coverd
    ven.services = sers
    reviewer = len(xmlDoc.xpath('//td[@class="review-box"]'))
    if reviewer > 0:
        ven.hqdb_nr_reviews = str(reviewer)
    # star images: fullstar.gif = 1, halfstar.gif = 0.5, emptystar.gif = 0
    scoreInfo = div[div_info + 1]
    tr = scoreInfo.xpath('./table/tr')
    tr = tr[1]
    img_core = tr.xpath('./td')[1]
    img_core = img_core.xpath('./table/tr/td/img')
    score__ = 0.0
    for score in img_core:
        score_ = score.get('src')
        if score_ == 'http://www.drivingschoolsfinder.co.uk/halfstar.gif':
            score__ += 0.5
        if score_ == 'http://www.drivingschoolsfinder.co.uk/fullstar.gif':
            score__ += 1
    if score__ > 0:
        ven.hqdb_review_score = str(score__).replace('.0', '')
    ven.country = 'gb'
    emails_ = re.findall(r'[\w\.-]+@[\w\.-]+', info_)
    for email_ in emails_:
        ven.business_email = email_
    if ven.business_email is not None and ven.business_email.startswith('http'):
        ven.business_email = None
    if info_.find('No Website') == -1:
        arrays__ = info_.split(' ')
        for i in range(0, len(arrays__)):
            if arrays__[i].find('Website') >= 0:
                web_ = arrays__[i + 1].replace('\t', ' ').replace(
                    '\n', ' ').split()[0].replace('No', '')
                ven.business_website = self.formatWeb_(web_)
                print ven.business_website
                break
    address_ = ''
    if ven.street is None:
        address_ = ven.city + ', ' + ven.zipcode
    else:
        if ven.zipcode is not None:
            address_ = ven.street + ', ' + ven.city + ', ' + ven.zipcode
        else:
            address_ = ven.street + ', ' + ven.city
    ven.pricelist_link = [ven.scrape_page]
    # get lat/lng
    if address_ != '':
        try:
            (ven.latitude, ven.longitude) = self.getLatlng(address_, 'UK')
        except Exception, ex:
            Util.log.running_logger.error(ven.scrape_page + ' : ' + str(ex))
            return None
    ven.is_get_by_address = True
    return ven
def __VenueParser_2(self, element, cate, scrape_pages):
    subB = element.find('./div/a')
    link = subB.get('href')
    try:
        existing = [x for x in self.listLink if link in x]
        if len(existing) <= 0:
            print 'Scraping Feature : ' + link
            self.listLink.append(link)
            ven = Venue()
            ven.country = self._language
            ven.hqdb_featured_ad_type = 'featured'
            ven.category = cate
            ven.scrape_page = scrape_pages
            subDiv = element.find('./div[@class="resultado nada"]')
            div = subDiv.find('./a/div')
            ven.name = div.find('./h2').text
            ven.name = self.replaceName(ven.name)
            address = div.xpath('./p[@itemprop="address"]/span')
            if address is not None:
                for span in address:
                    itemprop = span.get('itemprop')
                    if itemprop == 'street-address':
                        ven.street = span.text
                    if itemprop == 'postal-code':
                        ven.zipcode = span.text
                    if itemprop == 'locality':
                        ven.city = span.text
                        if ven.city == '' or ven.city is None:
                            continue
                        find_slash = ven.city.find('/')
                        find_comma = ven.city.find(',')
                        if find_slash != -1 and find_comma != -1:
                            ven.city = ven.city.split('/')[0]
                            if ven.city.find(',') != -1:
                                ven.city = ven.city.split(',')[1]
                        ven.city = ven.city.split(',')[0]
                        ven.city = ven.city.split('/')[0]
            if ven.street is not None:
                ven.street = self.validateStreet(ven.street)
            if ven.city is not None:
                re_City = re.search(
                    '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                    ven.city, flags=0)
                if re_City is not None:
                    ven.city = ven.city.replace(re_City.group(0), '')
            if ven.zipcode is not None:
                ven.zipcode = ven.zipcode.strip()
                if len(ven.zipcode) >= 5:
                    re_zipcode = re.search(
                        '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                        ven.zipcode, flags=0)
                    if re_zipcode is not None:
                        if re_zipcode.group(0) != ven.zipcode:
                            ven.zipcode = None
                    else:
                        ven.zipcode = None
                else:
                    ven.zipcode = '0' + ven.zipcode
                    rezipcode = re.search(
                        '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                        ven.zipcode, flags=0)
                    if rezipcode is None:
                        ven.zipcode = None
                    else:
                        if ven.zipcode != rezipcode.group(0):
                            ven.zipcode = None
                try:
                    if int(ven.zipcode) > 52080 or int(ven.zipcode) < 1000:
                        ven.zipcode = None
                except Exception, ex:
                    ven.zipcode = None
            description = div.find('./p[@class="descripcion"]').text
            if description is not None:
                ven.description = description
            imgs = subDiv.xpath('./a/figure/img')
            if len(imgs) > 0:
                imgs_ = []
                for im in imgs:
                    imgs_.append(im.get('src'))
                ven.img_link = imgs_
            footer = subDiv.xpath('./div[@class="iconos"]/ul/li')
            for fo in footer:
                text__ = fo.find('./a').text
                if text__ == 'Mandar mail':
                    ven.business_website = fo.find('./a').get('href')
                if text__ == 'Ver teléfono':
                    phone = fo.find('./span[@class="telefono"]').text
                    if (phone.startswith('+346') or phone.startswith('+347')
                            or phone.startswith('7') or phone.startswith('6')):
                        ven.mobile_number = self.validatePhone__(phone)
                    else:
                        ven.office_number = self.validatePhone__(phone)
            ven.writeToFile(self.folder, self.addIndex(), ven.name, False)
        else:
            print '[DUPLICATE]: ' + link
    except Exception, ex:
        print ex
def __VenueParser(self, hqdb_type, linkItems, subcate, cate):
    existing = [x for x in self.linkIn if linkItems in x]
    if len(existing) > 0:
        print 'This venue already exists in list'
        return None
    self.linkIn.append(linkItems)
    xmlPages = self.getRequest(linkItems)
    if xmlPages is None:
        return None
    xmlVen = xmlPages.xpath('//div[@class="page_move"]')
    cate__ = xmlPages.find('.//meta[@name="Description"]')
    if len(xmlVen) == 0:
        return None
    name = xmlVen[0].xpath('.//h2')
    if len(name) <= 0:
        name = ''
    else:
        name = name[0].text.strip()
    noneValues = {'ZERO', 'NULL'}
    if name.upper() in noneValues:
        return None
    ven = Venue()
    if cate__ is not None:
        ven.category = cate__.get('content').split(',')[0]
    ven.name = self.getNamefromUrl(linkItems)
    ven.hqdb_featured_ad_type = hqdb_type
    ven.scrape_page = linkItems
    address_ = ''
    img_link = []
    divInfo = xmlVen[0].find(
        './/div[@class="content_wrapper content_wrapper_main clearfix"]/div')
    if divInfo is not None:
        mainInfo = divInfo.xpath('./section')
        if len(mainInfo) >= 2:
            leftInfo = mainInfo[0]
            rightInfo = mainInfo[1]
            tableInfo = leftInfo.find('./div/div[@class="profile_top_left"]/table')
            trinfo = tableInfo.xpath('./tr')
            for tr_ in trinfo:
                td = tr_.xpath('./td')
                if len(td) < 2:
                    continue
                key_ = ''.join(td[0].itertext()).strip()
                values_ = ' '.join(td[1].itertext()).strip().replace(
                    'keine Angabe', '').replace('NULL', '').replace('null', '')
                if key_ == 'Ansprechpartner:':
                    if values_ is not None and len(values_) > 2:
                        ven.name_of_contact = values_
                        ven.name += ', ' + ven.name_of_contact
                if key_ == 'Addresse:':
                    address_ = values_
                    ven.formatted_address = self.validateFormat(address_)
                if key_ == 'Homepage:':
                    a_ = td[1].find('./a')
                    if a_ is not None:
                        ven.business_website = a_.get('href')
                mobileCode = ['015', '016', '017', '+4915', '+4916', '+4917']
                if key_ == 'Tel:':
                    values_ = values_.replace('/', '').replace(' ', '').replace('Tel', '')
                    for mCode in mobileCode:
                        if values_.startswith(mCode):
                            ven.mobile_number = self.validatePhone__(
                                self.validatePhone(values_), 'de')
                            break
                    if ven.mobile_number is None:
                        ven.office_number = self.validatePhone__(
                            self.validatePhone(values_), 'de')
            img_ = leftInfo.find('./div/div[@class="profile_top_right"]/img')
            if img_ is not None:
                img_link.append(img_.get('src'))
            rating = leftInfo.xpath('.//section[@id="ratings"]/div')
            if len(rating) >= 2:
                rating1 = ''.join(rating[0].itertext()).strip().split()[1]
                rating2 = ''.join(rating[1].itertext()).strip()
                if len(rating2) > 0:
                    rating2 = rating2.split()[0]
                    if rating2.find('/') != -1:
                        rating2 = rating2.split('/')[0].replace(',', '.')
                    try:
                        float(rating2)
                    except Exception, ex:
                        rating2 = None
                ven.hqdb_nr_reviews = rating1
                ven.hqdb_review_score = rating2
            if ven.hqdb_review_score is None:
                scoreIn = xmlVen[0].xpath(
                    '//div[@class="float_box"]//span[@class="txtLight"]/parent::div')
                if len(scoreIn) > 0:
                    core_ = scoreIn[0].text.replace(',', '.')
                    try:
                        float(core_)
                    except Exception, ex:
                        core_ = None
                    ven.hqdb_review_score = core_
            if (ven.formatted_address.strip() == '' and ven.office_number is None
                    and ven.office_number2 is None and ven.mobile_number is None
                    and ven.mobile_number2 is None):
                return None
            zipFrom = self.findZipcode(ven.formatted_address)
            if zipFrom is not None:
                (ven.latitude, ven.longitude) = self.getLatlng(zipFrom, 'DE')
            if ven.latitude is None and ven.longitude is None:
                Util.log.running_logger.info(
                    ven.formatted_address + ' : cannot get GEO code')
            redirecPhotos = rightInfo.find(
                './nav/div/ul/li[@class="tabOff tab_foto"]/a')
            if redirecPhotos is not None:
                linkPhotos = redirecPhotos.get('href')
                if linkPhotos.startswith('/'):
                    linkPhotos = self.__url__ + linkPhotos
                xpathPhotos = Util.getRequestsXML(
                    linkPhotos, '//div[@class="portfolio thumbs"]/a')
                if xpathPhotos is not None:
                    listImg = xpathPhotos.xpath('./a')
                    for __img in listImg:
                        img_link.append(__img.get('data-thumb'))
            desElement = rightInfo.find('./div/div[@id="cont_about"]')
            des = ''
            divTag = desElement.xpath('//div[@class="overview"]')
            for divDes in divTag:
                des += ' '.join(divDes.itertext())
            ven.description = self.validateDes(des)
            certi = rightInfo.find('.//div/div[@id="cont_certs"]')
            tablecerti = certi.find('./table')
            if tablecerti is not None:
                ven.accreditations = ''.join(tablecerti.itertext()).replace(
                    'Geprüfte Zertifikate:', '')
    ven.img_link = img_link
    ven.country = 'de'
    ven.is_get_by_address = True
    return ven
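# NOTE: findZipcode is defined elsewhere in the repo. Since this site is German
# ('de') and other parsers here treat a valid zipcode as exactly five digits, a
# plausible sketch is a 5-digit scan over the formatted address (hypothetical):
import re

def findZipcode(self, formatted_address):
    match = re.search(r'\b\d{5}\b', formatted_address or '')
    return match.group(0) if match else None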
def __VenueParser(self, venueElement):
    try:
        img_link = []
        ad_type = "none"
        if venueElement.find('.//span[@class="label label-success"]') is not None:
            ad_type = "featured"
        divs = venueElement.xpath('./div')
        logo_ = divs[0].find('.//img')
        if logo_ is not None:
            img_link.append(self.__url__ + logo_.get('src'))
        url__ = venueElement.xpath(
            './div[@class="col-xs-9 col-sm-9 col-md-9 listing-body"]'
            '//div[@class="h4 listing-heading"]/a')
        if len(url__) > 0:
            url__ = self.__url__ + url__[0].get('href')
            existing = [x for x in self.listLink if url__ in x]
            if len(existing) <= 0:
                self.listLink.append(url__)
                print 'Scraping : ' + url__
                xmlDoc = Util.getRequestsXML(url__, '//body/div[@class="page-wrapper"]')
                ven = Venue()
                ven.name = xmlDoc.find('.//div[@class="page-heading"]//h1').text
                content = xmlDoc.find('.//div[@class="container page-content"]')
                if content is not None:
                    des_img = content.find('.//div[@class="article-body"]')
                    if des_img is not None:
                        div_img = des_img.xpath('.//img/parent::div')
                        if len(div_img) > 0:
                            des_img.remove(div_img[0])
                        ven.description = ' '.join(des_img.itertext())
                    ven.country = self._language
                    ven.scrape_page = url__
                    ven.hqdb_featured_ad_type = ad_type
                    offices_ = content.xpath(
                        './/div[@id="offices"]/parent::div/div[@class="row"]')
                    div_maps = offices_[0].find('.//div[@class="google-map"]')
                    if div_maps is not None:
                        ven.latitude = div_maps.get('data-lat')
                        ven.longitude = div_maps.get('data-lng')
                    info_ = offices_[0].xpath('./div[@class="col-md-5 col-sm-6"]')
                    info_ = info_[0]
                    ul = info_.xpath('./ul')
                    phones = []
                    for u in ul:
                        phone_ = u.xpath('./li/a')
                        for phone in phone_:
                            if phone.get('title') == 'Phone Number':
                                phone = phone.text.replace(' ', '')
                                if phone.startswith('0800'):
                                    continue
                                phones.append(phone)
                    if len(ul) >= 2:
                        ul_2 = ul[0]
                        li__ = ul_2.xpath('./li')
                        address = ''
                        for li in li__:
                            if li.get('class') != 'text-bold':
                                address = '\n'.join(li.itertext())
                        addressArr = address.split('\n')
                        if len(addressArr) >= 3:
                            ven.street = addressArr[len(addressArr) - 3]
                            ven.city = addressArr[len(addressArr) - 2].split(',')[0]
                            ven.zipcode = addressArr[len(addressArr) - 1]
                            if ven.zipcode is not None:
                                results = re.search(self.ukReg, ven.zipcode, flags=0)
                                if ven.zipcode == 'Rotherham, South Yorkshire':
                                    ven.zipcode = ''
                                    ven.street = None
                                if results is None:
                                    ven.zipcode = None
                    (ven.office_number, ven.office_number2, ven.mobile_number,
                     ven.mobile_number2) = self.processPhones(phones)
                    # right sidebar: //div[@class="col-md-3 page-sidebar"]/div
                    rightSidebar = xmlDoc.xpath(
                        './/div[@class="col-md-3 page-sidebar"]/div[@class="section"]')
                    for right in rightSidebar:
                        website = right.xpath('./a[contains(text(),"Visit Our Website")]')
                        if len(website) > 0:
                            website = website[0].get('href')
                            if website.find('facebook.com') == -1:
                                ven.business_website = website
                            else:
                                ven.facebook = website
                        reviews = right.xpath('./p/strong')
                        if len(reviews) >= 3:
                            ven.hqdb_nr_reviews = reviews[2].text
                            ven.hqdb_review_score = reviews[1].text
                        follows = right.xpath('./ul/li/a')
                        for foll in follows:
                            follow_link = foll.get('href')
                            if follow_link.find('facebook.com') != -1:
                                if ven.facebook is None:
                                    ven.facebook = self.addHTTP(follow_link)
                            if follow_link.find('twitter.com') != -1:
                                if ven.twitter is None:
                                    ven.twitter = self.addHTTP(follow_link)
                    img_find = xmlDoc.xpath(
                        '//div[@id="galleries"]/parent::div'
                        '/div[@class="carousel slide equal-height"]//img')
                    for ig in img_find:
                        img_link.append(self.__url__ + ig.get('src'))
                    if len(img_link) > 0:
                        ven.img_link = img_link
                    self.index += 1
                    ven.writeToFile(self.folder, self.index, ven.name, False)
            else:
                print '\nduplicate'.upper()
                print '*' * (len(url__) + 4)
                print '*' + ' ' * (len(url__) + 2) + '*'
                print '* ' + url__ + ' *'
                print '*' + ' ' * (len(url__) + 2) + '*'
                print '*' * (len(url__) + 4) + '\n'
    except Exception, ex:
        print ex
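# NOTE: self.ukReg is initialised elsewhere; it is presumably a UK-postcode
# pattern like the one used verbatim in the unbiased.co.uk parser below:
# ukReg = '([Gg][Ii][Rr]0[Aa]{2})|((([A-Za-z][0-9]{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9]?[A-Za-z]))))\s?[0-9][A-Za-z]{2})'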
def __VenueParser(self, url):
    try:
        print 'Scraping: ' + url
        xmlDoc = Util.getRequestsXML(url, '//div[@id="main"]')
        if xmlDoc is not None:
            ven = Venue()
            ven.scrape_page = url
            ven.country = self._language
            ven.name = xmlDoc.find('.//h1').text
            overview = xmlDoc.find('.//div[@class="overview"]')
            option = overview.xpath('./div[@class="options row"]/div')
            for opt in option:
                div_ = opt.xpath('./div')
                for div__ in div_:
                    strong = div__.find('./strong')
                    if strong is None:
                        continue
                    if strong.text == 'Adresse:':
                        street = div__.find('./span[@itemprop="streetAddress"]')
                        if street is not None:
                            ven.street = street.text
                        zipcode = div__.find('./span[@itemprop="postalCode"]')
                        if zipcode is not None:
                            ven.zipcode = zipcode.text
                        city = div__.find('./span[@itemprop="addressLocality"]')
                        if city is not None:
                            ven.city = city.text
                    if strong.text == 'Téléphone:':
                        phone = ''.join(div__.itertext()).replace(' ', '').replace(
                            '.', '').replace('Téléphone:', '')
                        if (phone.startswith('06') or phone.startswith('07')
                                or phone.startswith('7') or phone.startswith('6')):
                            ven.mobile_number = self.validatePhone__(phone)
                        else:
                            ven.office_number = self.validatePhone__(phone)
                    if strong.text == 'Site Web:':
                        website = ''.join(div__.itertext()).replace('Site Web:', '')
                        if website.find('facebook.com') != -1:
                            ven.facebook = self.addHTTP(website)
                            continue
                        if website.find('twitter.com') != -1:
                            ven.twitter = self.addHTTP(website)
                            continue
                        ven.business_website = self.addHTTP(website)
                    if strong.text == 'Horaires:':
                        openning = ''.join(div__.itertext()).replace('Horaires:', '')
                        for format in self.openFormat:
                            if openning.strip() == format:
                                ven.opening_hours_raw = 'Lundi au Dimanche: 0h00 - 24h00'
                        if ven.opening_hours_raw is None:
                            ven.opening_hours_raw = openning
                    if strong.text == 'Votez pour ce serrurier:':
                        score = div__.find(
                            './span[@class="thevotescount"]'
                            '/span[@itemprop="ratingValue"]')
                        if score is not None:
                            ven.hqdb_review_score = score.text
            descElement = overview.find('./div[@class="contenu"]')
            if descElement is not None:
                ven.description = ' | '.join(descElement.itertext())
                if ven.description is not None:
                    ven.description = ven.description.strip()
                    if ven.description.startswith('|'):
                        ven.description = ven.description[1:len(ven.description)]
                    if ven.description.endswith("|"):
                        ven.description = ven.description[0:len(ven.description) - 1]
                    ven.description = ven.description.replace('| \n |', '|')
                    if len(ven.description.split()) < 3:
                        ven.description = None
            address = []
            if ven.street is not None and len(ven.street.strip()) > 0:
                address.append(ven.street)
            if ven.city is not None and len(ven.city.strip()) > 0:
                address.append(ven.city)
            if ven.zipcode is not None and len(ven.zipcode.strip()) > 0:
                address.append(ven.zipcode)
            address_ = ', '.join(address)
            (ven.latitude, ven.longitude) = self.getLatlng(address_)
            ven.is_get_by_address = True
            self.index += 1
            ven.writeToFile(self.folder, self.index, ven.name, False)
    except Exception, ex:
        print '[ERROR] ' + url
        print ex
def __VenueParser(self, jsonItems, hash):
    url = self.__url__ + 'profile/' + jsonItems.get('serviceSlug') + '/' + \
        jsonItems.get('companySlug') + '-' + jsonItems.get('id') + '?hash=' + hash
    url__ = self.__url__ + 'profile/' + jsonItems.get('serviceSlug') + '/' + \
        jsonItems.get('companySlug') + '-' + jsonItems.get('id')
    existing = [x for x in self.list_url if url__ in x]
    if len(existing) > 0:
        print 'This venue already exists in list'
        return None
    print 'Scraping: ' + url
    ven = Venue()
    services_v = []
    ven.category = jsonItems.get('restriction').get('name')
    ven.adid = str(jsonItems.get('id'))
    ven.name = jsonItems.get('companyName')
    ven.latitude = jsonItems.get('coordinates').get('lat')
    ven.longitude = jsonItems.get('coordinates').get('long')
    ven.venue_images = jsonItems.get('logo')
    points_ = jsonItems.get('satisfaction_rating')
    if str(points_).find('.') >= 0:
        ven.hqdb_review_score = str(round(points_, 1))
    else:
        ven.hqdb_review_score = str(points_)
    ven.country = 'gb'
    ven.scrape_page = url
    self.list_url.append(url__)
    xmlRequest = Util.getRequestsXML(url, '//div[@class="container-fluid"]')
    if xmlRequest is not None:
        stringAddress = xmlRequest.find(
            './/span[@class="profile-meta__address"]').text.replace(',,', ',')
        # debug override, kept disabled:
        #stringAddress = '1st and 2nd Floor Offices, 446 - 452 High street, Kingswinford, West Midlands,'
        ven.formatted_address = self.removeNameFromAdd(
            ven.name.strip(), stringAddress).replace('PO BOX', '').replace(
            'PO Box', '').replace('Po Box', '')
        zipArr = stringAddress.split(',')
        ven.zipcode = zipArr[len(zipArr) - 1]
        ex_ = re.search(
            '([Gg][Ii][Rr]0[Aa]{2})|((([A-Za-z][0-9]{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9]?[A-Za-z]))))\s?[0-9][A-Za-z]{2})',
            stringAddress, flags=0)
        if ex_ is not None:
            zip_c = ex_.group(0)
            if ven.zipcode != zip_c:
                poZip_c = stringAddress.find(zip_c)
                poZipcode = stringAddress.find(ven.zipcode)
                if len(ven.zipcode.strip()) > 1:
                    if poZip_c > poZipcode:
                        ven.zipcode = zip_c
        if ex_ is None:
            if ven.zipcode is not None:
                ven.zipcode = None
        if ven.formatted_address.endswith(','):
            ven.formatted_address = ven.formatted_address[
                0:len(ven.formatted_address) - 2]
        phoneLabel = xmlRequest.xpath('.//span[@class="phone-label"]/parent::a')
        if len(phoneLabel) > 0:
            for phone_ in phoneLabel:
                phone = phone_.get('data-phone').replace('\n', '').replace(' ', '')
                if phone.find('Shownumber') <= 0:
                    phone = self.validatePhone(phone)
                    for rePhone in self.listPhoneremove:
                        if phone == rePhone:
                            phone = None
                    if phone is not None:
                        if phone.startswith('07'):
                            ven.mobile_number = phone
                        else:
                            ven.office_number = phone
                        break
        services = xmlRequest.find('.//ul[@class="advice-area__level-one"]')
        if services is not None:
            list_ser = services.xpath('./li')
            for ser_name in list_ser:
                # feedback 3: attach the category to each service
                cate = ser_name.find('./span').text.strip()
                list_services = ser_name.xpath('./ul/li')
                for service__ in list_services:
                    service = Service()
                    service.service_category = cate + ' advice'
                    service.service = service__.text + ' advice'
                    services_v.append(service)
            ven.services = services_v
        # accreditations (feedback 3)
        certi = []
        cer = xmlRequest.xpath('.//div[@class="profile-team__skill-item collapsed"]')
        for c in cer:
            inCerti = [x_ for x_ in certi if c.text in x_]
            if len(inCerti) <= 0:
                certi.append(c.text)
        ven.accreditations = ', '.join(certi)
        # social links: facebook, twitter, website (feedback 3)
        follow = xmlRequest.xpath('//div[@class="profile__follow"]/ul/li')
        for fol in follow:
            values_fol = fol.get('class')
            if values_fol == 'icon-soc-tw':
                ven.twitter = fol.find('./a').get('href')
            if values_fol == 'icon-soc-www':
                ven.business_website = fol.find('./a').get('href')
            if values_fol == 'icon-soc-fb':
                ven.facebook = fol.find('./a').get('href')
        # description (feedback 3)
        des_1 = xmlRequest.find('.//div[@class="profile__text-block "]/p')
        if des_1 is not None:
            ven.description = ''.join(des_1.itertext()).replace('.\n', ' | ')
        des_2 = xmlRequest.find(
            './/div[@class="profile__text-block spacing-bottom-xs-0"]/p')
        if des_2 is not None:
            ven.description += ' -Our services: ' + ''.join(
                des_2.itertext()).replace('.\n', ' | ')
        if ven.description is not None:
            if ven.description.endswith(' | '):
                ven.description = ven.description[0:len(ven.description) - 2]
        return ven
    else:
        return None
def __VenueParser(self):
    print 'Scraping:'
    ven = Venue()
    return ven
def __VenueParser(self, element, cate, scrappages):
    subA = element.find('./div/a')
    link = subA.get('href')
    try:
        existing = [x for x in self.listLink if link in x]
        print 'Scraping : ' + link
        if len(existing) <= 0:
            self.listLink.append(link)
            ven = Venue()
            ven.scrape_page = link
            ven.category = cate
            ven.country = self._language
            ven.hqdb_featured_ad_type = "none"
            address = subA.xpath('./div/p/span')
            for span in address:
                itemprop = span.get('itemprop')
                if itemprop == 'street-address':
                    ven.street = span.text
                if itemprop == 'postal-code':
                    ven.zipcode = span.text
                if itemprop == 'locality':
                    # keep the part before the first "," and before the "/"
                    ven.city = span.text
                    if ven.city == '' or ven.city is None:
                        continue
                    find_slash = ven.city.find('/')
                    find_comma = ven.city.find(',')
                    if find_slash != -1 and find_comma != -1:
                        ven.city = ven.city.split('/')[0]
                        if ven.city.find(',') != -1:
                            ven.city = ven.city.split(',')[1]
                    ven.city = ven.city.split(',')[0]
                    ven.city = ven.city.split('/')[0]
            if ven.street is not None:
                ven.street = self.validateStreet(ven.street)
            if ven.city is not None:
                re_City = re.search(
                    '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                    ven.city, flags=0)
                if re_City is not None:
                    ven.city = ven.city.replace(re_City.group(0), '')
            if ven.zipcode is not None:
                ven.zipcode = ven.zipcode.strip()
                if len(ven.zipcode) >= 5:
                    re_zipcode = re.search(
                        '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                        ven.zipcode, flags=0)
                    if re_zipcode is not None:
                        if re_zipcode.group(0) != ven.zipcode:
                            ven.zipcode = None
                    else:
                        ven.zipcode = None
                else:
                    ven.zipcode = '0' + ven.zipcode
                    rezipcode = re.search(
                        '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                        ven.zipcode, flags=0)
                    if rezipcode is None:
                        ven.zipcode = None
                    else:
                        if ven.zipcode != rezipcode.group(0):
                            ven.zipcode = None
                try:
                    if int(ven.zipcode) > 52080 or int(ven.zipcode) < 1000:
                        ven.zipcode = None
                except Exception, ex:
                    ven.zipcode = None
            detail = Util.getRequestsXML(link, '//div[@id="contenido"]')
            ven.name = detail.find('.//h1').text
            ven.name = self.replaceName(ven.name)
            phone = detail.find('.//span[@class="telefonoCliente"]')
            if phone is not None:
                phone = phone.text
                if phone.startswith('6') or phone.startswith('7'):
                    ven.mobile_number = self.validatePhone__(phone)
                else:
                    ven.office_number = self.validatePhone__(phone)
            maps = detail.find('.//div[@id="mymap"]/img')
            if maps is not None:
                maps = maps.get('src')
                (ven.latitude, ven.longitude) = self.getLatlng(maps)
            ven.writeToFile(self.folder, self.addIndex(), ven.name, False)
        else:
            print '[DUPLICATE]: ' + link
    except Exception, ex:
        print ex
def __VenueParser(self, link):
    print 'Scraping: ' + link
    existing = [x for x in self.link_venues if link in x]
    if len(existing) > 0:
        print 'Len existing : ' + str(len(existing))
        return None
    xmlBody = Util.getRequestsXML(link, '//div[@id="fiche-artisan"]')
    if xmlBody is not None and len(xmlBody) > 0:
        ven = Venue()
        name_ = xmlBody.xpath('.//h1/parent::div')
        if len(name_) > 0:
            name_h1 = name_[0].find('./h1')
            name_h2 = name_[0].find('.//h2')
            if name_h2 is not None:
                ven.name = name_h2.text
            else:
                ven.name = name_h1.text
        else:
            return None
        xmldiv = xmlBody.find('.//div[@class="row nomargin"]/div')
        if xmldiv is None:
            return None
        span_ = xmldiv.xpath('./span')
        for i_ in span_:
            if i_.get('class') == 'street-address text-hide-mobile':
                ven.street = i_.text
                if ven.street is not None:
                    ven.street = self.validateStreet2(ven.street).replace(
                        '43442491700012', '')
                    if ven.street.strip() == '.':
                        ven.street = None
            if i_.get('class') == 'postal-code':
                ven.zipcode = self.validateZipcode(i_.text)
            if i_.get('class') == 'locality':
                ven.city = i_.text
        a = xmlBody.find(
            './/a[@class="col m12 s4 tel waves-effect waves-light btn center btn-fix bleu"]')
        if a is not None:
            phone = a.get('href').replace('tel:', '').replace(' ', '')
            if phone.startswith('07') or phone.startswith('06'):
                ven.mobile_number = self.validatePhone__(phone, 'FR')
            else:
                ven.office_number = self.validatePhone__(phone, 'FR')
        logo = xmlBody.find('.//div[@class="center-align"]/img')
        if logo is not None:
            ven.img_link = [self.__url__ + logo.get('src')]
        ven.scrape_page = link
        ven.pricelist_link = [link]
        listServices = xmlBody.xpath('//li/div[@class="collapsible-body"]/div/a')
        sers = []
        for ser in listServices:
            servic = Service()
            servic.service = ser.text
            sers.append(servic)
            self.services.append(servic)
        ven.services = sers
        if ven.city is not None and ven.zipcode is not None:
            if ven.street is not None and len(ven.street) > 0:
                add_ = ven.street + ', ' + ven.city + ', ' + ven.zipcode
            else:
                add_ = ven.city + ', ' + ven.zipcode
        else:
            add_ = None
        (ven.latitude, ven.longitude) = self.getLatlng(add_, 'FR')
        if ven.latitude is None and ven.longitude is None:
            Util.log.coordinate_logger.error(
                ven.scrape_page + ' : Cannot get GEO code')
        self.link_venues.append(link)
        ven.country = 'fr'
        desc = xmlBody.find('.//p[@id="description"]')
        desc_ = ''
        if desc is not None:
            desc_ = ''.join(desc.itertext()).strip().replace(
                '\n', '|').replace('\t', '')
        title = xmlBody.find('.//div[@class="container"]//h2')
        if title is not None and desc is not None:
            desc_ = title.text + ' | ' + desc_
        img_link_arr = []
        desc_ = self.replace__(desc_)
        desc_ = self.replaceSame(desc_, '||', '|').replace('|', ' | ')
        ven.description = desc_
        img_link = xmlBody.find('.//div[@class="realisations"]/img')
        if img_link is not None:
            img_link_arr.append(self.__url__ + img_link.get('src'))
        multi_img = xmlBody.xpath('//div[@class="3photo realisations"]/div/img')
        for it in multi_img:
            img_link_arr.append(self.__url__ + it.get('src'))
        if len(img_link_arr) > 0:
            ven.img_link = img_link_arr
        nr_reviewer = xmlBody.xpath('//div[@class="avisoperation row"]')
        if len(nr_reviewer) > 0:
            ven.hqdb_nr_reviews = str(len(nr_reviewer))
        ven.is_get_by_address = True
        return ven
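# NOTE: replaceSame is defined elsewhere in the repo. From its call site above
# (collapsing '||' runs into single '|') it appears to repeat a replacement until
# the string stops changing. A minimal sketch under that assumption (hypothetical):
def replaceSame(self, text, old, new):
    # repeat until no occurrence of the longer separator remains
    while old in text:
        text = text.replace(old, new)
    return text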
def __VenueParser(self, url, cateName):
    existing = [x for x in self.listlink if url in x]
    self.listlink.append(url)
    if len(existing) > 0:
        self.countduplicate += 1
        print '[INFO] Duplicate count = ' + str(self.countduplicate)
        return
    try:
        print 'Scraping url: ' + url
        xmlDoc = Util.getRequestsXML(url, '//div[@class="gdl-page-content"]')
        xmlDoc = xmlDoc.xpath('//div[@class="gdl-page-content"]/div')[0]
        ven = Venue()
        imgs = []
        ven.category = cateName
        ven.scrape_page = url
        ven.country = self._language
        ven.name = xmlDoc.find('./div/h2').text
        ven.hqdb_featured_ad_type = 'none'
        isFeatured = xmlDoc.find('./div[@class="stickytag"]/img')
        if isFeatured is not None:
            if isFeatured.get('title') == 'Featured Listing':
                ven.hqdb_featured_ad_type = 'featured'
        divInfo = xmlDoc.xpath('./div[@class="listing-details cf"]/div')
        town_ = ''
        area_ = ''
        zipcode = ''
        listPhone_ = []
        for div__ in divInfo:
            label = div__.find('./label')
            if label is None:
                continue
            label_ = label.text
            if label_ == 'Business Website Address:':
                website = div__.find('./span/a')
                if website is not None:
                    website = website.get('href')
                    isFacebook = website.find('facebook.com')
                    isTwitter = website.find('twitter.com')
                    if isFacebook == -1 and isTwitter == -1:
                        ven.business_website = website
                    else:
                        if isFacebook != -1:
                            ven.facebook = website
                        if isTwitter != -1:
                            ven.twitter = website
            if label_ == 'Security Services:':
                serviceStr = div__.xpath('./span/a')
                sers = []
                for ser in serviceStr:
                    serv = Service()
                    serv.service = ser.text
                    sers.append(serv)
                if len(sers) > 0:
                    ven.services = sers
                    ven.pricelist_link = [ven.scrape_page]
            if label_ == 'Long Business Description:':
                des = div__.find('./span')
                if des is not None:
                    ven.description = ' '.join(des.itertext())
            if label_ == 'Business Phone Number:':
                phone = div__.find('./span').text
                findsplistPPhone = self.findSplitPhone(phone)
                if findsplistPPhone is None:
                    listPhone_ = [phone]
                else:
                    listPhone_ = phone.split(findsplistPPhone)
                (ven.office_number, ven.office_number2, ven.mobile_number,
                 ven.mobile_number2) = self.processPhones(listPhone_)
            if label_ == 'Postcode:':
                zipcode = div__.find('./span').text
            if label_ == 'Town:':
                town_ = div__.find('./span').text
            if label_ == 'Area:':
                area_ = div__.find('./span').text
        zipcode = self.validateZipcode(zipcode)
        if (ven.office_number == 'NOT_GB' or ven.office_number2 == 'NOT_GB'
                or ven.mobile_number == 'NOT_GB' or ven.mobile_number2 == 'NOT_GB'):
            return
        for p in listPhone_:
            if p == town_:
                town_ = ''
                break
        ven.zipcode = zipcode
        ven.formatted_address = ', '.join([area_, town_, zipcode])
        ven.formatted_address = self.refixFormatAddress(
            ven.formatted_address.replace('0000000', ''))
        extraImg = xmlDoc.xpath('./div[@class="extra-images"]//a/img')
        listingThumbnail = xmlDoc.xpath('./div[@class="listing-thumbnail"]//a/img')
        for thumb in listingThumbnail:
            imgs.append(thumb.get('src'))
        for img in extraImg:
            imgs.append(img.get('src'))
        if len(imgs) > 0:
            ven.img_link = imgs
        self.index = self.index + 1
        ven.writeToFile(self.folder, self.index, ven.name, False)
    except Exception, ex:
        print '[ERROR] ' + url + ': ' + str(ex)
def __VenueParser(self, element, listData, scrape_page):
    try:
        classLI = element.get('class')
        featured = 'none'
        if classLI.find('TOP') != -1:
            featured = 'featured'
        ven = Venue()
        # strip the HTML-encoded apostrophe entity from the name
        ven.name = element.find('./a').text.replace('&#039;', '')
        ven.country = self._language
        ven.hqdb_featured_ad_type = featured
        ven.scrape_page = scrape_page
        div = element.find('./div')
        if div.find('./font') is not None:
            font = div.xpath('./font')
            for font_ in font:
                div.remove(font_)
        a = div.find('./a')
        if a is not None:
            ven.business_website = self.check_website(a.get('href'))
            div.remove(a)
        content = '|'.join(div.itertext())
        content_ = content.split('|')
        phone = ''
        position = -1
        if content_[position].find('Telefoon:') != -1:
            phone = content_[position].replace('Telefoon:', '').replace(
                ' ', '').replace('-', '').replace('PR', '')
            if phone.startswith('00'):
                phone = '+' + phone[2:len(phone)]
            if phone.startswith('31'):
                phone = '+' + phone
            if phone.startswith('+'):
                if not phone.startswith('+31'):
                    phone = None
            if phone is not None:
                if phone.startswith('06') or phone.startswith('+316'):
                    ven.mobile_number = self.validatePhone__(phone)
                else:
                    ven.office_number = self.validatePhone__(phone)
            position -= 1
        city_zipcode = content_[position]
        street = ' '.join(content_[0:position])
        (ven.zipcode, ven.city) = self.__processAddress(city_zipcode)
        ven.street = street
        (ven.latitude, ven.longitude) = self.processlatlng(listData, city_zipcode)
        index = self.index + 1
        self.index = index
        ven.writeToFile(self.folder, index, ven.name, False)
    except Exception, ex:
        print ex
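# NOTE: __processAddress is defined elsewhere in the repo. Dutch postcodes are
# four digits plus two letters ('1234 AB'), so a plausible sketch of the
# (zipcode, city) split this parser expects -- hypothetical, not the actual code:
import re

def __processAddress(self, city_zipcode):
    match = re.match(r'\s*(\d{4}\s?[A-Z]{2})\s+(.*)', city_zipcode)
    if match:
        return (match.group(1), match.group(2).strip())
    return (None, city_zipcode.strip())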