def __VenueParser(self, element):
    # Parse one venue row from a listing table, fetch its detail page and
    # write the resulting Venue record to file.
    #
    # element: lxml element for one listing-table row; the detail-page URL is
    #   extracted from the row's MM_openBrWindow(...) onclick handler.
    # Returns None. Side effects: increments self.urlmarks and self.index and
    #   writes a venue file via ven.writeToFile. Any per-venue failure is
    #   printed and swallowed so the crawl continues (legacy best-effort style).
    try:
        self.urlmarks += 1
        print '[COUNT]: ' + str(self.urlmarks)
        featured = 'featured'
        # Detail link is embedded in JS: MM_openBrWindow('<rel-url>','','...')
        onclick = element.xpath('./td//a/@onclick')[0]
        detailLink = onclick[onclick.find("MM_openBrWindow('") +
                             len("MM_openBrWindow('"):onclick.find("','','")]
        detailLink = self.__url__ + '/' + detailLink
        xmlDoc = Util.getRequestsXML(detailLink, '//table[@cellspacing="3"]')
        xmlDoc = xmlDoc[0]
        ven = Venue()
        ven.hqdb_featured_ad_type = featured
        ven.country = self._language
        ven.scrape_page = detailLink
        # detail_[0]: basic-info table, detail_[2]: services table.
        detail_ = xmlDoc.xpath('./tr/td/table')
        detail_1 = detail_[0]
        detail_2 = detail_[2]
        basicInfo = detail_1.find('./tr/td/table/tr/td[@class="text"]')
        # Right-aligned anchors hold the email (mailto:) and website links.
        email_website = basicInfo.getparent().xpath(
            '//table//div[@align="right"]/a')
        for aTag in email_website:
            link__ = aTag.get('href')
            if link__.find('mailto') != -1:
                ven.business_email = link__.replace('mailto:', '')
            else:
                if link__.find('http') != -1:
                    ven.business_website = link__
        openxml = detail_1.xpath('./tr')
        openxml = openxml[2].find('./td/table')  # table openning hour
        rows = openxml.xpath('./tr')
        # Maps German weekday names (as they appear on the page) to the
        # Italian labels used in the output.
        dayofweek = {
            'Montag': 'Lunedì',
            'Dienstag': 'Martedì',
            'Mittwoch': 'Mercoledì',
            'Donnerstag': 'Giovedì',
            'Freitag': 'Venerdì',
            'Samstag': 'Sabato',
            'Sonntag': 'Domenica'
        }
        openning_hour_array = []
        for row in rows:
            tds = row.xpath('./td')
            if len(tds) > 0:
                if tds[0].text != None:
                    # Only rows whose first cell is a weekday are schedule rows.
                    if dayofweek.get(tds[0].text, "NULL") != "NULL":
                        record = ''
                        count_ = 0  # number of '-' (empty slot) cells in this row
                        for td in tds:
                            if dayofweek.get(td.text, "NULL") != "NULL":
                                record += dayofweek.get(td.text) + ": "
                            else:
                                if td.text.strip() != '-':
                                    # e.g. '14.30' -> '14:30'
                                    record += td.text.replace('.', ':') + ", "
                                else:
                                    count_ += 1
                        record = record.strip()
                        if record.endswith(','):
                            record = record[0:-1]
                        # Skip rows where 3+ slots are '-' (mostly closed).
                        if count_ < 3:
                            openning_hour_array.append(record)
        ven.opening_hours_raw = ' | '.join(openning_hour_array)
        # Basic-info text, split on newlines, is expected to end with:
        # ..., contact name, street, "zip city", "Tel. ..." and an optional
        # trailing "Fax ..." line which is dropped first; everything before
        # the last four lines is the venue name.
        basicInfo_ = ''.join(basicInfo.itertext()).split('\n')
        if basicInfo_[len(basicInfo_) - 1].find('Fax') != -1:
            basicInfo_ = basicInfo_[0:-1]
        phonenumber = basicInfo_[-1].strip().replace('Tel.', '').replace(' ', '')
        zip_ci = basicInfo_[-2]
        street = basicInfo_[-3]
        contactName = basicInfo_[-4]
        name = ' '.join(basicInfo_[0:-4])
        (ven.office_number, ven.office_number2, ven.mobile_number,
         ven.mobile_number2) = self.processPhone([phonenumber])
        (ven.city, ven.zipcode) = self.processZipCity(zip_ci)
        # Drop the whole venue when its zipcode fails validation.
        if ven.zipcode != None:
            if self.validateZip(ven.zipcode) == None:
                return
        ven.street = street
        ven.name_of_contact = contactName
        ven.name = name
        services = detail_2.xpath('./tr[@valign="top"]/td')
        if len(services) > 0:
            service_ = []  # NOTE(review): unused local, kept as-is
            services = services[0].text
            ven.services = self.__ServicesParser(services)
        self.index += 1
        ven.writeToFile(self.folder, self.index, ven.name, False)
    except Exception, ex:
        # Best-effort scraping: report and keep going with the next venue.
        print ex
def __VenueParser(self, item, city):
    # Scrape a single venue detail page (UK site) into a Venue record.
    #
    # item: absolute URL of the venue detail page.
    # city: city label, used only in the progress print-out.
    # Duplicate URLs (already present in self.listLink) are skipped.
    # Side effects: appends item to self.listLink, bumps the shared index via
    #   self.addIndex() and writes the venue file. Always returns None; any
    #   failure is printed and swallowed.
    existing = [x for x in self.listLink if item in x]
    self.listLink.append(item)
    if len(existing) <= 0:
        try:
            xmlDoc = Util.getRequestsXML(item, '/html')
            ven = Venue()
            ven.scrape_page = item
            #ven.city = city
            ven.name = xmlDoc.xpath(
                '//div[@class="row top-buffer"]/h3')[0].text
            (ven.latitude, ven.longitude) = self.getLatlng(xmlDoc)
            xmlcontent = xmlDoc.find('.//div[@class="tab-content"]')
            # Block containing the services / schedule free text.
            services_schedule_info = xmlcontent.xpath(
                './div/div[@class="row top-buffer"]/h4/parent::div')[0]
            if services_schedule_info != None:
                services_schedule_info = ''.join(
                    services_schedule_info.itertext()).split('\n')
                for it in services_schedule_info:
                    # Lines shaped like:
                    # "Style: X ... Ability level: Y ... Cost: Z GBP Schedule..."
                    if it.find('Style:') != -1:
                        it = it[0:it.find('Schedule')]
                        it = it.strip()
                        ser_name = it[it.find('Style:') +
                                      len('Style:'):it.find('Ability level')]
                        cost = len(it)
                        cost_ = ['Cost:', 'Concession cost:']
                        char_cost = ''
                        for c in cost_:
                            if it.find(c) != -1:
                                cost = it.find(c)
                                char_cost = c
                                break
                        #cost = it.find('Cost:')
                        if cost == -1:
                            cost = len(it)
                        ser_des = it[it.find('Ability level:') +
                                     len('Ability level:'):cost]
                        ser_price = it[cost +
                                       len(char_cost):it.find('GBP') +
                                       len('GBP')]
                        ser = Service()
                        ser.service = ser_name
                        ser.description = ser_des
                        ser.price = ser_price.replace('-', '')
                        ven.services = [ser]
                    # Opening hours: day names glued to the previous sentence
                    # (".Monday") are re-joined with ' | ' separators.
                    if it.find('a.m.') != -1 or it.find('p.m.') != -1:
                        ven.opening_hours_raw = it.strip().replace(
                            '.Monday', ' | Monday').replace(
                                '.Tuesday', ' | Tuesday').replace(
                                    '.Wednesday', ' | Wednesday').replace(
                                        '.Thursday', ' | Thursday').replace(
                                            '.Friday', ' | Friday').replace(
                                                '.Saturday',
                                                ' | Saturday').replace(
                                                    '.Sunday', ' | Sunday')
                        ven.opening_hours_raw = self.formatOpenhour(
                            ven.opening_hours_raw)
            address = xmlcontent.find('.//address')
            if address != None:
                #print ET.dump(address)
                address = ''.join(address.itertext()).replace(
                    'United Kingdom', '').strip()
                address = self.validateAddress(address)
                #address ='Ward Park Arras Pavilion,Gransha Road,Bangor,Northern Ireland,BT20 4TN'
                ven.country = 'gb'
                # NOTE(review): addresses containing 'Northern Ireland' get
                # country 'ie' — confirm this is the intended mapping.
                if address.upper().find('Ireland'.upper()) != -1:
                    if address.upper().find(
                            'Northern Ireland'.upper()) != -1:
                        ven.country = 'ie'
                if address.endswith(','):
                    address = address[0:-1]
                ven.formatted_address = address
            posted = xmlcontent.find('./div/div[@class="row"]/p')
            imgs = xmlcontent.xpath('.//a/img')
            img_ = []
            for img in imgs:
                img_.append(img.get('src'))
            ven.img_link = img_
            if posted != None:
                ven.hqdb_ad_posted = posted.text.replace('Last updated', '')
                split_posted = ven.hqdb_ad_posted.split(',')
                # Drop the last comma-separated part when there are 3+ parts.
                if len(split_posted) >= 3:
                    ven.hqdb_ad_posted = ', '.join(
                        split_posted[0:len(split_posted) - 1])
            ven.category = self.category
            #ven.country ='gb'
            des_info = xmlcontent.xpath(
                '//div[@class="row top-buffer"]/h3')[1]
            #print des_info.text
            des_info = des_info.getparent()
            des__ = des_info.xpath('./p')
            ven.pricelist_link = [ven.scrape_page]
            ven.hqdb_featured_ad_type = 'none'
            ven.description = ''
            # Collect description paragraphs, removing them from the tree so
            # the remaining itertext() only carries the labelled info lines.
            for des in des__:
                ven.description += ''.join(des.itertext()) + ' '
                des_info.remove(des)
            info = '____'.join(des_info.itertext())
            a = des_info.find('./a')
            if a != None:
                a = a.get('href')
                if a.find('facebook.com') == -1:
                    ven.business_website = a
                else:
                    if a.startswith('http:'):
                        a = a.replace('http:', 'https:')
                    ven.facebook = a
            # info is now a '__'-separated token list; a label's value sits
            # two slots after the label itself.
            info = info.split('__')
            for inf in range(0, len(info)):
                if info[inf] == 'Qualifications:':
                    ven.accreditations = info[inf + 2]
                if info[inf] == 'Phone:':
                    phone = info[inf + 2].strip()
                    pattren = '(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)'
                    '''isEmail = re.search(pattren, phone, flags=0)
                    if isEmail!=None:
                        ven.business_email = isEmail.group(0)
                        continue'''
                    find_charSplit = self.findSplitPhone(phone)
                    if find_charSplit == None:
                        # Single value: it may actually be an email address.
                        issMail = re.search(pattren, phone, flags=0)
                        if issMail != None:
                            ven.business_email = issMail.group(0)
                            continue
                        phone = phone.replace('Mobile:', '').replace(
                            'ext.225', '').replace('O7', '07').replace(' ', '')
                        # UK mobile prefixes -> mobile_number, else office.
                        if phone.startswith('07') or phone.startswith(
                                '447') or phone.startswith(
                                    '+447') or phone.startswith(
                                        '00447') or phone.startswith(
                                            '+44(0)7') or phone.startswith(
                                                '44(0)7') or phone.startswith(
                                                    '004407'):
                            ven.mobile_number = self.validatePhone__(
                                phone, ven.country)
                        else:
                            ven.office_number = self.validatePhone__(
                                phone, ven.country)
                    else:
                        # Several values separated by the detected split char;
                        # classify each one independently.
                        phone = phone.split(find_charSplit)
                        for p in phone:
                            issMail = re.search(pattren, p, flags=0)
                            if issMail != None:
                                ven.business_email = issMail.group(0)
                                continue
                            p = p.replace('Mobile', '').replace(
                                'ext225', '').replace('O7', '07').replace(
                                    ' ', '')
                            if p.startswith('07') or p.startswith(
                                    '447') or p.startswith(
                                        '+447') or p.startswith(
                                            '00447') or p.startswith(
                                                '+44(0)7') or p.startswith(
                                                    '44(0)7') or p.startswith(
                                                        '004407'):
                                if ven.mobile_number != None:
                                    ven.mobile_number2 = self.validatePhone__(
                                        p, ven.country)
                                else:
                                    ven.mobile_number = self.validatePhone__(
                                        p, ven.country)
                            else:
                                if ven.office_number != None:
                                    ven.office_number2 = self.validatePhone__(
                                        p, ven.country)
                                else:
                                    ven.office_number = self.validatePhone__(
                                        p, ven.country)
            # Skip writing the venue when its numbers look overseas
            # (semantics delegated to self.checkPhoneOverSea).
            isPhoneOverSea = self.checkPhoneOverSea([
                ven.office_number, ven.office_number2, ven.mobile_number,
                ven.mobile_number2
            ])
            if isPhoneOverSea == False:
                index = self.addIndex()
                print str(
                    index) + ' Scapping: ' + city + '---' + ven.scrape_page
                #ven.is_get_by_address =True
                ven.writeToFile(self.folder, index, ven.name, False)
        except Exception, ex:
            print ex
    return
def __VenueParser(self, url__, name__):
    # Scrape one drivingschoolsfinder.co.uk school page into a Venue.
    #
    # url__: detail-page URL; the city is parsed from its 4th path segment
    #   (shape: /city-<City-Name>/<id>-driving-<name>.html).
    # name__: school name as shown on the listing page.
    # Returns the populated Venue, or None when the URL was already seen,
    #   the page could not be fetched, the layout is unexpected (< 5 divs),
    #   or geocoding failed.
    print 'Scraping: ' + url__
    existing = [x for x in self.venuesList if url__ in x]
    if len(existing) > 0:
        return None
    #url__ ='http://www.drivingschoolsfinder.co.uk/city-Accrington/1846198-driving-Terrys-School-of-Motoring.html'
    #name__ ='Terrys School of Motoring'
    city = url__.split('/')[3].replace('city-', '').replace('-', ' ')
    xmlDoc = Util.getRequestsXML(url__, '/html/body')
    if xmlDoc == None:
        return None
    else:
        ven = Venue()
        sers = []
        ven.name = name__
        ven.city = city
        ven.scrape_page = url__
        td = xmlDoc.xpath('//td[@class="welcome-padding"]')
        # Description: text between 'Driving School:' and '[Edit Text]',
        # with newlines/tabs normalised into ' | ' separators and trailing
        # boilerplate (repeated name, empty segments) cut off.
        iter__ = ''.join(td[0].itertext())
        iter__ = iter__[iter__.find('Driving School:') +
                        len('Driving School:'):iter__.
                        find('[Edit Text]')].replace('\n', '|').replace(
                            '\t', '')
        iter__ = iter__.replace('|||', ' | ')
        rep = '|' + name__
        iter__ = iter__[0:iter__.find(rep)]
        rep = ' | |'
        iter__ = iter__[0:iter__.find(rep)]
        ven.description = iter__
        div = td[0].xpath('./div')
        if len(div) < 5:
            return None
        else:
            # div info = position div gray-line[0]+1
            # Contact-info div index: 0 normally, 3 when any child div
            # contains a <script>.
            div_info = 0
            for div_ in div:
                if div_.find('./script') != None:
                    div_info = 3
            info = div[div_info]
            info_ = ''.join(info.itertext())
            # Address is everything before 'Phone'; strip the school name,
            # mark the city with a leading comma, then split on commas.
            address = info_[0:info_.find('Phone')].replace(
                name__, '').replace(city, ',' + city).replace(
                    ',,', ',').replace(', ,', ',').split(',')
            #street = ', '.join(address[0:len(address)-2]).replace(','+city,'')
            street = ', '.join(address[0:len(address)])
            street = street[0:street.find(city) - 1]
            if street.endswith(','):
                street = street[0:len(street) - 1]
            zipcode = address[len(address) - 1]
            street__ = street.upper()
            # PO-box addresses are dropped; a few known-bad scraped strings
            # are scrubbed from the street text.
            if street__.find('PO BOX') == -1:
                ven.street = street.replace('n/a', '').replace(
                    '***', '').replace('6 weldon place croy', '').replace(
                        'cumbernauld41 napier square bellshill ml4 1tb',
                        '').replace('P.O. Box 1048', '')
                if ven.street == '-':
                    ven.street = None
            ven.zipcode = self.validateZipcode(zipcode)
            phone = info_[info_.find('Phone:') +
                          len('Phone:'):info_.find('Fax:')].replace(' ', '')
            if phone.isdigit():
                # NOTE(review): bitwise '|' on booleans — works, but 'or'
                # is the conventional form.
                if phone.startswith('07') | phone.startswith('7'):
                    ven.mobile_number = self.validatePhone(phone)
                    ven.mobile_number = self.validatePhone__(
                        ven.mobile_number, 'gb')
                else:
                    ven.office_number = self.validatePhone(phone)
                    ven.office_number = self.validatePhone__(
                        ven.office_number, 'gb')
            services_ = info_[info_.find('Services Offered:') +
                              len('Services Offered:'):info_.
                              find('Areas Served:')].strip().replace(';', ',')
            if services_ != 'None Listed - [Edit]':
                services_ = services_.replace('/', ',').replace(
                    ',,', ',').split(',')
                for s in services_:
                    name = self.validateServices(s)
                    if len(name) >= 5:
                        name__ = name.split()
                        for n in name__:
                            # NOTE(review): validates 'name' once per token,
                            # always passing the whole name, never 'n'.
                            name = self.validateNameServices(name)
                        if len(name.strip()) >= 5:
                            services = Service()
                            services.service = name
                            sers.append(services)
            #ven.description = ven.description +' | ' +services_
            # 'Areas Served:' text runs up to 'Website' (or 'No Website').
            stringfind = 'No Website'
            if info_.find('No Website') == -1:
                stringfind = 'Website'
            area_coverd = info_[info_.find('Areas Served:') +
                                len('Areas Served:'):info_.
                                find(stringfind)].strip().replace(';', ',')
            #area_coverd = area_coverd[0:area_coverd.find(stringfind)]
            if area_coverd != 'None Listed - [Edit]':
                ven.areas_covered = area_coverd
            ven.services = sers
            reviewer = len(xmlDoc.xpath('//td[@class="review-box"]'))
            if reviewer > 0:
                ven.hqdb_nr_reviews = str(reviewer)
            scoreInfo = div[div_info + 1]
            #http://www.drivingschoolsfinder.co.uk/halfstar.gif +0.5
            #http://www.drivingschoolsfinder.co.uk/fullstar.gif +1
            #http://www.drivingschoolsfinder.co.uk/emptystar.gif +0
            tr = scoreInfo.xpath('./table/tr')
            tr = tr[1]
            img_core = tr.xpath('./td')[1]
            img_core = img_core.xpath('./table/tr/td/img')
            score__ = 0.0
            # Star rating: sum the star images per the mapping above.
            for score in img_core:
                score_ = score.get('src')
                if score_ == 'http://www.drivingschoolsfinder.co.uk/halfstar.gif':
                    score__ += 0.5
                if score_ == 'http://www.drivingschoolsfinder.co.uk/fullstar.gif':
                    score__ += 1
                if score_ == 'http://www.drivingschoolsfinder.co.uk/emptystar.gif':
                    score__ += 0
            if score__ > 0:
                ven.hqdb_review_score = str(score__).replace('.0', '')
            ven.country = 'gb'
            emails_ = re.findall(r'[\w\.-]+@[\w\.-]+', info_)
            for email_ in emails_:
                ven.business_email = email_
            # website_ = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', info_)
            # for web_ in website_:
            #     ven.business_website = web_
            # NOTE(review): business_email is unconditionally reset to None
            # below, discarding the regex match above — looks like leftover
            # debugging; confirm before relying on the email output.
            if ven.business_email != None:
                if ven.business_email.startswith('http'):
                    ven.business_email = None
                ven.business_email = None
            if info_.find('No Website') == -1:
                # Website URL is the token following the word 'Website'.
                arrays__ = info_.split(' ')
                for i in range(0, len(arrays__)):
                    if arrays__[i].find('Website') >= 0:
                        web_ = arrays__[i + 1].replace('\t', ' ').replace(
                            '\n', ' ').split()[0].replace('No', '')
                        ven.business_website = self.formatWeb_(web_)
                        print ven.business_website
                        break
            # Build the geocoding query from whichever address parts exist.
            address_ = ''
            if ven.street == None:
                address_ = ven.city + ', ' + ven.zipcode
                #ven.formatted_address = ven.city+', '+ven.zipcode
            else:
                if ven.zipcode != None:
                    address_ = ven.street + ', ' + ven.city + ', ' + ven.zipcode
                else:
                    address_ = ven.street + ', ' + ven.city
            ven.pricelist_link = [ven.scrape_page]
            ''' get lat -lng '''
            if address_ != '':
                try:
                    (ven.latitude,
                     ven.longitude) = self.getLatlng(address_, 'UK')
                except Exception, ex:
                    Util.log.running_logger.error(ven.scrape_page + ' : ' +
                                                  ex)
                    return None
            ven.is_get_by_address = True
            return ven