def __VenueParser(self, url): try: if self.checkDuplicate(self.urlList, url) == False: ven = Venue() print '[SCRAPING]:' + url #ven.scrape_page= url ven.country = self._language self.urlList.append(url) xmlDoc = Util.getRequestsXML(url, '//div[@class="content"]/main') xmlDoc = xmlDoc.find('./main') name = xmlDoc.find('./h2') ven.name = name.text des = xmlDoc.find('./div[@class="clearfix"]') if des != None: imgs = [] img = des.xpath('.//img') for im in img: imgs.append(self.__url__ + im.get('src')) des.remove((im.getparent()).getparent()) if len(imgs) > 0: ven.img_link = imgs ven.description = ''.join(des.itertext()) pass map_and_phone_number = xmlDoc.xpath( './div/div[@class="footer row"]') isMulti = False if len(map_and_phone_number) > 1: isMulti = True countVenues_ = 0 for clone_ in map_and_phone_number: countVenues_ += 1 self._cloneVenues(ven, clone_, countVenues_, url, isMulti) else: print '[DUPLICATE]: ' + url except Exception, ex: print '[ERROR]: ' + url print ex
# NOTE(review): this second __VenueParser overload parses a search-result item
# (non-featured listing): it extracts the detail link, street/zipcode/city
# spans, normalizes the Spanish 5-digit zipcode (pads a leading '0', rejects
# values outside 01000-52080), then fetches the detail page for name, phone
# (mobile if it starts with 6/7, office otherwise) and map lat/lng, and writes
# the Venue to file. The definition is TRUNCATED in this chunk — the trailing
# "else:" (duplicate-link branch) and the outer try's except clause continue
# past the visible source — so the code is left byte-identical here.
# NOTE(review): source is whitespace-mangled (whole method on one physical
# line) and uses Python 2 syntax ("print x", "except Exception, ex").
def __VenueParser(self, element, cate, scrappages): subA = element.find('./div/a') link = subA.get('href') try: #if link =='http://es.qdq.com/f:1-GV9-6082/': # print existing = [x for x in self.listLink if link in x] print 'Scraping : ' + link if len(existing) <= 0: self.listLink.append(link) ven = Venue() #ven.name = subA.find('./div/h2').text ven.scrape_page = link #ven.subcategory = cate ven.category = cate ven.country = self._language ven.hqdb_featured_ad_type = "none" address = subA.xpath('./div/p/span') for span in address: itemprop = span.get('itemprop') if itemprop == 'street-address': ven.street = span.text if itemprop == 'postal-code': ven.zipcode = span.text if itemprop == 'locality': # before the first "," and before the "/" ven.city = span.text #.split(',')[0] if ven.city == '' or ven.city == None: continue find_slash = ven.city.find('/') find_comma = ven.city.find(',') if find_slash != -1 and find_comma != -1: ven.city = ven.city.split('/')[0] if ven.city.find(',') != -1: ven.city = ven.city.split(',')[1] ven.city = ven.city.split(',')[0] ven.city = ven.city.split('/')[0] if ven.street != None: ven.street = self.validateStreet(ven.street) if ven.city != None: re_City = re.search( '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})', ven.city, flags=0) if re_City != None: ven.city = ven.city.replace(re_City.group(0), '') if ven.zipcode != None: ven.zipcode = ven.zipcode.strip() if len(ven.zipcode) >= 5: re_zipcode = re.search( '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})', ven.zipcode, flags=0) if re_zipcode != None: if re_zipcode.group(0) != ven.zipcode: ven.zipcode = None else: ven.zipcode = None else: ven.zipcode = '0' + ven.zipcode rezipcode = re.search( '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})', ven.zipcode, flags=0) if rezipcode == None: ven.zipcode = None else: if ven.zipcode != rezipcode.group(0): ven.zipcode = None try: if int(ven.zipcode) > 52080 or int(ven.zipcode) < 1000: ven.zipcode = None except Exception, ex: ven.zipcode = None 
detail = Util.getRequestsXML(link, '//div[@id="contenido"]') ven.name = detail.find('.//h1').text #.replace('Niño','Niño') #ven.name = Validator.RevalidName(ven.name) ven.name = self.replaceName(ven.name) phone = detail.find('.//span[@class="telefonoCliente"]') if phone != None: phone = phone.text if phone.startswith('6') or phone.startswith('7'): ven.mobile_number = '' + phone ven.mobile_number = self.validatePhone__( ven.mobile_number) else: ven.office_number = '' + phone ven.office_number = self.validatePhone__( ven.office_number) maps = detail.find('.//div[@id="mymap"]/img') if maps != None: maps = maps.get('src') (ven.latitude, ven.longitude) = self.getLatlng(maps) #ven.is_get_by_address =True ven.writeToFile(self.folder, self.addIndex(), ven.name, False) else:
# NOTE(review): __VenueParser_2 parses a FEATURED search-result item
# (hqdb_featured_ad_type = 'featured'): it reads the name/address spans from
# the result card itself, applies the same city/zipcode normalization as the
# non-featured parser (pads a leading '0', rejects zipcodes outside
# 01000-52080), then pulls description, images, website link and phone
# (mobile for +346/+347/6/7 prefixes) from the card footer, and writes the
# Venue to file. The definition is TRUNCATED in this chunk — the trailing
# "else:" branch and the except clause for the outer try continue past the
# visible source — so the code is left byte-identical here.
# NOTE(review): the zipcode/city normalization block is duplicated verbatim
# from __VenueParser — candidate for extraction into a shared helper once the
# full file is in view.
def __VenueParser_2(self, element, cate, scrape_pages): subB = element.find('./div/a') link = subB.get('href') try: existing = [x for x in self.listLink if link in x] if len(existing) <= 0: print 'Scraping Feature : ' + link self.listLink.append(link) ven = Venue() ven.country = self._language ven.hqdb_featured_ad_type = 'featured' ven.category = cate #ven.subcategory = cate ven.scrape_page = scrape_pages subDiv = element.find('./div[@class="resultado nada"]') div = subDiv.find('./a/div') ven.name = div.find('./h2').text #.replace('Niño','Niño') '''if ven.name =='Niño de la Virgen': print''' #ven.name = Validator.RevalidName(ven.name) ven.name = self.replaceName(ven.name) address = div.xpath('./p[@itemprop="address"]/span') if address != None: for span in address: itemprop = span.get('itemprop') if itemprop == 'street-address': ven.street = span.text if itemprop == 'postal-code': ven.zipcode = span.text if itemprop == 'locality': ven.city = span.text #.split(',')[0] if ven.city == '' or ven.city == None: continue find_slash = ven.city.find('/') find_comma = ven.city.find(',') if find_slash != -1 and find_comma != -1: ven.city = ven.city.split('/')[0] if ven.city.find(',') != -1: ven.city = ven.city.split(',')[1] ven.city = ven.city.split(',')[0] ven.city = ven.city.split('/')[0] if ven.street != None: ven.street = self.validateStreet(ven.street) if ven.city != None: re_City = re.search( '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})', ven.city, flags=0) if re_City != None: ven.city = ven.city.replace(re_City.group(0), '') if ven.zipcode != None: ven.zipcode = ven.zipcode.strip() if len(ven.zipcode) >= 5: re_zipcode = re.search( '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})', ven.zipcode, flags=0) if re_zipcode != None: if re_zipcode.group(0) != ven.zipcode: ven.zipcode = None else: ven.zipcode = None else: ven.zipcode = '0' + ven.zipcode rezipcode = re.search( '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})', ven.zipcode, flags=0) if rezipcode == None: 
ven.zipcode = None else: if ven.zipcode != rezipcode.group(0): ven.zipcode = None try: if int(ven.zipcode) > 52080 or int(ven.zipcode) < 1000: ven.zipcode = None except Exception, ex: ven.zipcode = None description = div.find('./p[@class="descripcion"]').text if description != None: ven.description = description imgs = subDiv.xpath('./a/figure/img') if len(imgs) > 0: imgs_ = [] for im in imgs: imgs_.append(im.get('src')) ven.img_link = imgs_ footer = subDiv.xpath('./div[@class="iconos"]/ul/li') for fo in footer: text__ = fo.find('./a').text if text__ == 'Mandar mail': ven.business_website = fo.find('./a').get('href') if text__ == 'Ver teléfono': phone = fo.find('./span[@class="telefono"]').text if phone.startswith('+346') or phone.startswith( '+347') or phone.startswith( '7') or phone.startswith('6'): ven.mobile_number = self.validatePhone__(phone) else: ven.office_number = self.validatePhone__(phone) #ven.is_get_by_address =True ven.writeToFile(self.folder, self.addIndex(), ven.name, False) else: