def __VenueParser(self, link):
        """Scrape one garage detail page and return the populated ``Venue``.

        The page is fetched with ``Util.getRequestsXML`` restricted to the
        ``div#fiche-artisan`` subtree. Name, address, phone, images,
        services, description and review count are copied onto a new
        ``Venue``; coordinates are requested from ``self.getLatlng``.

        Args:
            link: URL of the detail page. It is appended to
                ``self.link_venues`` after the geocoding step.

        Returns:
            The ``Venue``, or ``None`` when the link is already known, the
            page could not be loaded, or the name / address blocks are
            missing.
        """
        print('Scrapping: ' + link)
        # Substring match: the link counts as already handled when it occurs
        # inside any stored link.
        existing = [x for x in self.link_venues if link in x]
        if len(existing) > 0:
            print('Len existing : ' + str(len(existing)))
            return None
        xmlBody = Util.getRequestsXML(link, '//div[@id="fiche-artisan"]')
        if xmlBody is None or len(xmlBody) == 0:
            return None

        ven = Venue()
        name_ = xmlBody.xpath('.//h1/parent::div')
        if len(name_) == 0:
            return None
        # Prefer the <h2> found in the heading block, fall back to the <h1>.
        name_h1 = name_[0].find('./h1')
        name_h2 = name_[0].find('.//h2')
        if name_h2 is not None:
            ven.name = name_h2.text
        elif name_h1 is not None:
            ven.name = name_h1.text

        xmldiv = xmlBody.find('.//div[@class="row nomargin"]/div')
        if xmldiv is None:
            return None
        for span in xmldiv.xpath('./span'):
            css_class = span.get('class')
            if css_class == 'street-address text-hide-mobile':
                ven.street = span.text
                if ven.street is not None:
                    # The literal number below is stripped from the street
                    # text after validation.
                    ven.street = self.validateStreet2(ven.street).replace(
                        '43442491700012', '')
                    if ven.street.strip() == '.':
                        ven.street = None
            elif css_class == 'postal-code':
                ven.zipcode = self.validateZipcode(span.text)
            elif css_class == 'locality':
                ven.city = span.text

        a = xmlBody.find(
            './/a[@class="col m12 s4 tel waves-effect waves-light btn center btn-fix bleu"]'
        )
        href = a.get('href') if a is not None else None
        if href is not None:
            phone = href.replace('tel:', '').replace(' ', '')
            # Numbers starting with 06/07 are stored as mobile numbers.
            if phone.startswith(('07', '06')):
                ven.mobile_number = self.validatePhone__(phone, 'FR')
            else:
                ven.office_number = self.validatePhone__(phone, 'FR')

        logo = xmlBody.find('.//div[@class="center-align"]/img')
        if logo is not None:
            ven.img_link = [self.__url__ + logo.get('src')]
        ven.scrape_page = link
        ven.pricelist_link = [link]

        # NOTE(review): this XPath starts with '//' and is therefore not
        # relative to xmlBody - confirm that is intended.
        listServices = xmlBody.xpath(
            '//li/div[@class="collapsible-body"]/div/a')
        sers = []
        for ser in listServices:
            servic = Service()
            servic.service = ser.text
            sers.append(servic)
            self.services.append(servic)
        ven.services = sers

        # add_ stays None when city or zipcode is missing; it is passed to
        # getLatlng unchanged in that case.
        if ven.city is not None and ven.zipcode is not None:
            if ven.street is not None and len(ven.street) > 0:
                add_ = ven.street + ', ' + ven.city + ', ' + ven.zipcode
            else:
                add_ = ven.city + ', ' + ven.zipcode
        else:
            add_ = None
        (ven.latitude, ven.longitude) = self.getLatlng(add_, 'FR')
        if ven.latitude is None and ven.longitude is None:
            Util.log.coordinate_logger.error(ven.scrape_page +
                                             ' : Cannot get GEO code')
        self.link_venues.append(link)
        ven.country = 'fr'

        desc = xmlBody.find('.//p[@id="description"]')
        desc_ = ''
        if desc is not None:
            desc_ = ''.join(desc.itertext()).strip().replace(
                '\n', '|').replace('\t', '')
        title = xmlBody.find('.//div[@class="container"]//h2')
        # Guard title.text as well: an <h2> without text would otherwise
        # raise TypeError on the concatenation.
        if title is not None and desc is not None and title.text is not None:
            desc_ = title.text + ' | ' + desc_
        desc_ = self.replace__(desc_)
        desc_ = self.replaceSame(desc_, '||', '|').replace('|', ' | ')
        ven.description = desc_

        # Gallery images. When any are found they replace the logo-only list
        # assigned above.
        # NOTE(review): the logo is dropped in that case - confirm intended.
        img_link_arr = []
        img_link = xmlBody.find('.//div[@class="realisations"]/img')
        if img_link is not None:
            img_link_arr.append(self.__url__ + img_link.get('src'))
        multi_img = xmlBody.xpath(
            '//div[@class="3photo realisations"]/div/img')
        for it in multi_img:
            img_link_arr.append(self.__url__ + it.get('src'))
        if len(img_link_arr) > 0:
            ven.img_link = img_link_arr

        nr_reviewer = xmlBody.xpath('//div[@class="avisoperation row"]')
        if len(nr_reviewer) > 0:
            ven.hqdb_nr_reviews = str(len(nr_reviewer))
        ven.is_get_by_address = True
        return ven
    def __VenueParser(self, url__, name__):
        """Scrape one driving-school page and return the populated ``Venue``.

        The city is taken from the fourth ``/``-separated URL segment
        (``city-<Name>``). Description, address, phone, services, covered
        areas, review count/score and website are cut out of the page text
        with ``str.find`` based slicing between fixed labels.

        Args:
            url__: URL of the school page.
            name__: School name; stored on the venue and removed from the
                address text.

        Returns:
            The ``Venue``, or ``None`` when the URL is already in
            ``self.venuesList``, the page cannot be loaded, the expected
            layout is missing, or geocoding raises.
        """

        def between(text, start_label, end_label):
            # Text between two labels using plain str.find slicing; a missing
            # label yields -1 from find() and is used as-is.
            start = text.find(start_label) + len(start_label)
            return text[start:text.find(end_label)]

        print('Scraping: ' + url__)
        existing = [x for x in self.venuesList if url__ in x]
        if len(existing) > 0:
            return None
        city = url__.split('/')[3].replace('city-', '').replace('-', ' ')
        xmlDoc = Util.getRequestsXML(url__, '/html/body')
        if xmlDoc is None:
            return None

        ven = Venue()
        sers = []
        ven.name = name__
        ven.city = city
        ven.scrape_page = url__
        td = xmlDoc.xpath('//td[@class="welcome-padding"]')
        if len(td) == 0:
            # The main content cell is missing, so nothing can be parsed.
            return None

        iter__ = ''.join(td[0].itertext())
        iter__ = between(iter__, 'Driving School:', '[Edit Text]').replace(
            '\n', '|').replace('\t', '')
        iter__ = iter__.replace('|||', ' | ')
        # NOTE(review): when a marker below is absent, find() returns -1 and
        # the slice drops the last character - confirm that is acceptable.
        iter__ = iter__[0:iter__.find('|' + name__)]
        iter__ = iter__[0:iter__.find('  |  |')]
        ven.description = iter__

        div = td[0].xpath('./div')
        if len(div) < 5:
            return None

        # The info block is div[3] when any div holds a <script>, else div[0].
        div_info = 0
        if any(d.find('./script') is not None for d in div):
            div_info = 3
        info_ = ''.join(div[div_info].itertext())

        address = info_[0:info_.find('Phone')].replace(name__, '').replace(
            city, ',' + city).replace(',,', ',').replace(', ,', ',').split(',')
        street = ', '.join(address)
        street = street[0:street.find(city) - 1]
        if street.endswith(','):
            street = street[:-1]
        zipcode = address[len(address) - 1]
        # PO box addresses are not stored as a street.
        if street.upper().find('PO BOX') == -1:
            ven.street = street.replace('n/a', '').replace(
                '***', '').replace('6 weldon place croy', '').replace(
                    'cumbernauld41 napier square bellshill ml4 1tb',
                    '').replace('P.O. Box 1048', '')
        if ven.street == '-':
            ven.street = None
        ven.zipcode = self.validateZipcode(zipcode)

        phone = between(info_, 'Phone:', 'Fax:').replace(' ', '')
        if phone.isdigit():
            if phone.startswith(('07', '7')):
                ven.mobile_number = self.validatePhone__(
                    self.validatePhone(phone), 'gb')
            else:
                ven.office_number = self.validatePhone__(
                    self.validatePhone(phone), 'gb')

        services_ = between(info_, 'Services Offered:',
                            'Areas Served:').strip().replace(';', ',')
        if services_ != 'None Listed - [Edit]':
            for s in services_.replace('/', ',').replace(',,',
                                                         ',').split(','):
                name = self.validateServices(s)
                if len(name) >= 5:
                    # validateNameServices is applied once per word of the
                    # validated name (a local loop, so name__ is untouched).
                    for _ in name.split():
                        name = self.validateNameServices(name)
                if len(name.strip()) >= 5:
                    services = Service()
                    services.service = name
                    sers.append(services)

        stringfind = 'No Website'
        if info_.find('No Website') == -1:
            stringfind = 'Website'
        area_coverd = between(info_, 'Areas Served:',
                              stringfind).strip().replace(';', ',')
        if area_coverd != 'None Listed - [Edit]':
            ven.areas_covered = area_coverd

        ven.services = sers
        reviewer = len(xmlDoc.xpath('//td[@class="review-box"]'))
        if reviewer > 0:
            ven.hqdb_nr_reviews = str(reviewer)

        # Rating: every star image adds its value (half 0.5, full 1, empty 0).
        # Missing rows/cells now simply yield no score instead of IndexError.
        star_values = {
            'http://www.drivingschoolsfinder.co.uk/halfstar.gif': 0.5,
            'http://www.drivingschoolsfinder.co.uk/fullstar.gif': 1,
        }
        rows = div[div_info + 1].xpath('./table/tr')
        cells = rows[1].xpath('./td') if len(rows) > 1 else []
        stars = cells[1].xpath('./table/tr/td/img') if len(cells) > 1 else []
        score__ = 0.0
        for star in stars:
            score__ += star_values.get(star.get('src'), 0)
        if score__ > 0:
            ven.hqdb_review_score = str(score__).replace('.0', '')
        ven.country = 'gb'

        # NOTE(review): the previous code extracted e-mail addresses with
        # r'[\w\.-]+@[\w\.-]+' and then unconditionally reset the field, so
        # no e-mail was ever kept. That outcome is preserved here; confirm
        # whether e-mails should be stored.
        ven.business_email = None

        if info_.find('No Website') == -1:
            tokens = info_.split(' ')
            for i, token in enumerate(tokens):
                if token.find('Website') >= 0:
                    # The URL is the first whitespace-separated piece of the
                    # token following the label; skip it when absent.
                    pieces = tokens[i + 1].split() if i + 1 < len(
                        tokens) else []
                    if pieces:
                        web_ = pieces[0].replace('No', '')
                        ven.business_website = self.formatWeb_(web_)
                        print(ven.business_website)
                    break

        # Join only the parts that are present so a None zipcode or street
        # no longer raises TypeError.
        address_ = ', '.join(
            [p for p in (ven.street, ven.city, ven.zipcode) if p is not None])
        ven.pricelist_link = [ven.scrape_page]
        if address_ != '':
            try:
                (ven.latitude,
                 ven.longitude) = self.getLatlng(address_, 'UK')
            except Exception as ex:
                # str(ex): concatenating the exception object itself would
                # raise TypeError inside the handler.
                Util.log.running_logger.error(ven.scrape_page + ' : ' +
                                              str(ex))
                return None
        ven.is_get_by_address = True
        return ven
예제 #3
0
    def __VenueParser(self, xmlE, index):
        """Build a ``Venue`` from one search-result element and write it out.

        Basic fields come from the search-result element ``xmlE``; the detail
        page behind its photo link is then fetched for address, portfolio
        images, description, affiliations and services. The venue is written
        with ``ven.writeToFile`` under an index from ``self.addIndex()``.
        Nothing is written when the detail page cannot be loaded.

        Args:
            xmlE: Search-result element (its ``id`` becomes ``ven.adid``).
            index: Not used by this method.

        Returns:
            Always ``None``.
        """
        ven = Venue()
        ven.adid = xmlE.get('id')
        ven.category = 'architecural technologist'
        photos = xmlE.find(
            './div[@class="search_result_photo"]/div[@class="photo"]/a')
        if photos is None:
            # Without the photo link there is no detail URL to scrape.
            return
        thumb = photos.find('./img')
        if thumb is not None:
            ven.venue_images = self.__url__ + thumb.get('src')
        ven.scrape_page = self.__url__ + photos.get('href')
        existing = [x for x in self.list_url if ven.scrape_page in x]
        if len(existing) > 0:
            print('This venues exist in list')
            return
        self.list_url.append(ven.scrape_page)

        details_ = xmlE.find('.//div[@class="search_result_details"]')
        ven.name = details_.find('./div[@class="title"]/h3/a').text
        contacts_ = details_.find('./div[@class="contact"]').text
        ven.description = details_.find('./div[@class="desc"]').text
        # The contact line is comma separated; the last two pieces are used
        # as city and zipcode.
        contact__ = (contacts_ or '').split(',')
        if len(contact__) >= 2:
            ven.zipcode = self.check_zip(contact__[-1])
            ven.city = contact__[-2]

        xmlInfo = Util.getRequestsXML(
            ven.scrape_page, '//div[@class="architect_header"]/parent::div')
        if xmlInfo is None:
            return

        addressInfo = xmlInfo.find(
            './/div[@class="architect_header"]/div[@class="architect_header_info"]'
        )
        if addressInfo is not None:
            h2 = addressInfo.find('./h2')
            if h2 is not None:
                # NOTE(review): lxml's remove() also drops the element's tail
                # text - confirm no address text follows the <h2>.
                addressInfo.remove(h2)
            address__ = ' '.join(addressInfo.itertext())
            if ven.city is None or len(ven.city) < 2:
                parts = address__.split(',')
                ven.city = parts[len(parts) - 3]
            street = address__[0:address__.find(ven.city.strip()) - 1]
            # The street is only stored when the cut ends on a comma.
            if street.endswith(','):
                street = street[:-1]
                if street.upper().find('PO BOX') >= 0:
                    street = None
                ven.street = street

        img = []
        img_info = xmlInfo.find('.//div[@class="architect_portfolio"]')
        if img_info is not None:
            for photo in img_info.xpath(
                    './div[@class="architect_portfolio_photo"]//img'):
                img.append(self.__url__ + photo.get('src'))
        ven.img_link = img

        sers = []
        des = xmlInfo.find('.//div[@class="architect_info_statement"]')
        if des is not None:
            ven.description = ' '.join(des.itertext())
        elif ven.description is None:
            # Keep the description a string so the appends below work.
            ven.description = ''

        services = xmlInfo.xpath('//div[@class="architect_info"]/ul')
        desP = xmlInfo.xpath('//div[@class="architect_info"]/p')
        affi = xmlInfo.xpath('//div[@class="architect_info"]/h3')
        for aff in affi:
            # When an "Affiliations" heading exists, the last <p> of the info
            # block is stored as the accreditations text.
            if aff.text is not None and aff.text.strip(
            ) == 'Affiliations' and len(desP) > 0:
                ven.accreditations = desP[len(desP) - 1].text

        if len(desP) >= 2:
            p1 = desP[0].text
            p2 = desP[1].text
            if p1 is not None:
                ven.description += ' ' + p1
            if p2 is not None and p2 != 'None':
                ven.description += ' ' + p2 + ': '

        if len(services) >= 3:
            # The three <ul> lists are used as: [0] appended to the
            # description, [1] services, [2] "Specialist Experience".
            listSer = services[1].xpath('./li')

            listDes_2 = services[2].xpath('./li')
            des_2 = ''
            if len(listDes_2) > 0:
                des_2 = '. Specialist Experience: ' + ''.join(
                    (li.text or '') + ', ' for li in listDes_2)
                des_2 = des_2.strip()
                if des_2.endswith(','):
                    des_2 = des_2[0:-1]

            listDes = services[0].xpath('./li')
            if len(listDes) > 0:
                desSectors = ''.join(
                    (li.text or '') + ', ' for li in listDes).strip()
                if desSectors.endswith(','):
                    desSectors = desSectors[0:-1]
                ven.description = ven.description + ' ' + desSectors + '.' + des_2
                ven.description = ven.description.replace(
                    ', ,', ', ').replace('..', '.')
            for ser in listSer:
                serv = Service()
                serv.service = ser.text
                sers.append(serv)
        ven.services = sers
        ven.pricelist_link = [ven.scrape_page]
        ven.country = 'gb'
        # No geocoding is performed here; the getLatlng call was already
        # disabled (commented out) in this parser.
        indexc = self.addIndex()
        try:
            print('Writing index: ' + str(indexc))
            ven.writeToFile(self.folder, indexc, ven.name.replace(':', ''),
                            False)
        except Exception as ex:
            print(ex)
            return
    def __VenueParser(self, venueElement):
        """Scrape the business page linked from one listing and write it out.

        The listing element supplies the logo, the "featured" flag and the
        link to the business page. That page is fetched and parsed for name,
        description, coordinates, phones, address, website / social links,
        reviews and gallery images. The venue is written with
        ``ven.writeToFile`` using ``self.index`` (incremented first).

        Any exception raised while parsing is caught and printed, so one bad
        listing does not propagate to the caller.

        Args:
            venueElement: Listing element from the search results page.

        Returns:
            Always ``None``.
        """
        try:
            img_link = []
            ad_type = "none"
            if venueElement.find(
                    './/span[@class="label label-success"]') is not None:
                ad_type = "featured"
            divs = venueElement.xpath('./div')
            logo_ = divs[0].find('.//img') if divs else None
            if logo_ is not None:
                img_link.append(self.__url__ + logo_.get('src'))

            links = venueElement.xpath(
                './div[@class="col-xs-9 col-sm-9 col-md-9 listing-body"]//div[@class="h4 listing-heading"]/a'
            )
            # xpath() returns a list, never None: test for emptiness.
            if not links:
                return
            url__ = self.__url__ + links[0].get('href')

            existing = [x for x in self.listLink if url__ in x]
            if len(existing) > 0:
                print('\nduplicate'.upper())
                print('*' * (len(url__) + 4))
                print('*' + ' ' * (len(url__) + 2) + '*')
                print('* ' + url__ + ' *')
                print('*' + ' ' * (len(url__) + 2) + '*')
                print('*' * (len(url__) + 4) + '\n')
                return

            self.listLink.append(url__)
            print('Scraping' + ' : ' + url__)

            xmlDoc = Util.getRequestsXML(
                url__, '//body/div[@class="page-wrapper"]')
            if xmlDoc is None:
                print('[ERROR] Cannot load: ' + url__)
                return
            ven = Venue()
            ven.name = xmlDoc.find('.//div[@class="page-heading"]//h1').text
            content = xmlDoc.find('.//div[@class="container page-content"]')
            if content is None:
                # Nothing is written for pages without the content container.
                return

            des_img = content.find('.//div[@class="article-body"]')
            if des_img is not None:
                div_img = des_img.xpath('.//img/parent::div')
                # remove() only accepts direct children; a nested wrapper
                # would raise ValueError, so it is left in place instead.
                if len(div_img) > 0 and div_img[0].getparent() is des_img:
                    des_img.remove(div_img[0])
                ven.description = ' '.join(des_img.itertext())
            ven.country = self._language
            ven.scrape_page = url__
            ven.hqdb_featured_ad_type = ad_type

            # A missing offices row or info column raises IndexError here and
            # is reported by the handler at the bottom (venue not written).
            offices_ = content.xpath(
                './/div[@id="offices"]/parent::div/div[@class="row"]')
            div_maps = offices_[0].find('.//div[@class="google-map"]')
            if div_maps is not None:
                ven.latitude = div_maps.get('data-lat')
                ven.longitude = div_maps.get('data-lng')
            info_ = offices_[0].xpath('./div[@class="col-md-5 col-sm-6"]')[0]
            ul = info_.xpath('./ul')

            phones = []
            for u in ul:
                for anchor in u.xpath('./li/a'):
                    if anchor.get('title') != 'Phone Number':
                        continue
                    if anchor.text is None:
                        continue
                    number = anchor.text.replace(' ', '')
                    # Numbers starting with 0800 are skipped.
                    if not number.startswith('0800'):
                        phones.append(number)

            if len(ul) >= 2:
                for li in ul[0].xpath('./li'):
                    if li.get('class') == 'text-bold':
                        continue
                    # Address lines: [-3] street, [-2] city (text before the
                    # first comma), [-1] zipcode.
                    addressArr = '\n'.join(li.itertext()).split('\n')
                    if len(addressArr) >= 3:
                        ven.street = addressArr[len(addressArr) - 3]
                    ven.city = addressArr[len(addressArr) -
                                          2].split(',')[0]
                    ven.zipcode = addressArr[len(addressArr) - 1]
                    zip_match = re.search(self.ukReg, ven.zipcode, flags=0)
                    if ven.zipcode == 'Rotherham, South Yorkshire':
                        ven.zipcode = ''
                        ven.street = None
                    if zip_match is None:
                        ven.zipcode = None

            (ven.office_number, ven.office_number2, ven.mobile_number,
             ven.mobile_number2) = self.processPhones(phones)

            rightSidebar = xmlDoc.xpath(
                './/div[@class="col-md-3 page-sidebar"]/div[@class="section"]'
            )
            for right in rightSidebar:
                website = right.xpath(
                    './a[contains(text(),"Visit Our Website")]')
                if len(website) > 0:
                    website = website[0].get('href')
                    # A facebook link in the website slot is stored as the
                    # facebook page instead.
                    if website.find('facebook.com') == -1:
                        ven.business_website = website
                    else:
                        ven.facebook = website
                reviews = right.xpath('./p/strong')
                if len(reviews) >= 3:
                    ven.hqdb_nr_reviews = reviews[2].text
                    ven.hqdb_review_score = reviews[1].text
                for foll in right.xpath('./ul/li/a'):
                    follow_link = foll.get('href')
                    if follow_link is None:
                        continue
                    if follow_link.find('facebook.com') != -1:
                        if ven.facebook is None:
                            ven.facebook = self.addHTTP(follow_link)
                    if follow_link.find('twitter.com') != -1:
                        if ven.twitter is None:
                            ven.twitter = self.addHTTP(follow_link)

            img_find = xmlDoc.xpath(
                '//div[@id="galleries"]/parent::div/div[@class="carousel slide equal-height"]//img'
            )
            for ig in img_find:
                img_link.append(self.__url__ + ig.get('src'))
            if len(img_link) > 0:
                ven.img_link = img_link

            self.index += 1
            ven.writeToFile(self.folder, self.index, ven.name, False)
        except Exception as ex:
            print(ex)
    def __VenueParser(self, url):
        """Scrape one detail page under ``div#main`` and write the venue.

        The labelled rows of the overview block (address, phone, website,
        opening hours, rating) are copied onto a new ``Venue``, the
        description is taken from the ``contenu`` block, coordinates come
        from ``self.getLatlng`` and the result is written with
        ``ven.writeToFile`` under the next ``self.index``.

        Any exception is caught and printed together with the URL.

        Args:
            url: URL of the detail page.

        Returns:
            Always ``None``.
        """
        try:
            print('Scraping: ' + url)
            xmlDoc = Util.getRequestsXML(url, '//div[@id="main"]')
            if xmlDoc is None:
                return
            ven = Venue()
            ven.scrape_page = url
            ven.country = self._language
            ven.name = xmlDoc.find('.//h1').text
            overview = xmlDoc.find('.//div[@class="overview"]')

            for column in overview.xpath('./div[@class="options row"]/div'):
                for cell in column.xpath('./div'):
                    strong = cell.find('./strong')
                    if strong is None:
                        continue
                    # Each cell is identified by the text of its <strong>.
                    label = strong.text
                    if label == 'Adresse:':
                        street = cell.find('./span[@itemprop="streetAddress"]')
                        if street is not None:
                            ven.street = street.text
                        zipcode = cell.find('./span[@itemprop="postalCode"]')
                        if zipcode is not None:
                            ven.zipcode = zipcode.text
                        city = cell.find(
                            './span[@itemprop="addressLocality"]')
                        if city is not None:
                            ven.city = city.text
                    elif label == 'Téléphone:':
                        raw_phone = ''.join(cell.itertext())
                        phone = raw_phone.replace(' ', '').replace(
                            '.', '').replace('Téléphone:', '')
                        # 06/07 (with or without the leading 0) are mobiles.
                        if phone.startswith(('06', '07', '7', '6')):
                            ven.mobile_number = self.validatePhone__(phone)
                        else:
                            ven.office_number = self.validatePhone__(phone)
                    elif label == 'Site Web:':
                        website = ''.join(cell.itertext()).replace(
                            'Site Web:', '')
                        if website.find('facebook.com') != -1:
                            ven.facebook = self.addHTTP(website)
                        elif website.find('twitter.com') != -1:
                            ven.twitter = self.addHTTP(website)
                        else:
                            ven.business_website = self.addHTTP(website)
                    elif label == 'Horaires:':
                        openning = ''.join(cell.itertext()).replace(
                            'Horaires:', '')
                        # Entries of self.openFormat are mapped to one fixed
                        # all-week, all-day string.
                        if any(openning.strip() == known
                               for known in self.openFormat):
                            ven.opening_hours_raw = 'Lundi au Dimanche: 0h00 - 24h00'
                        if ven.opening_hours_raw is None:
                            ven.opening_hours_raw = openning
                    elif label == 'Votez pour ce serrurier:':
                        score = cell.find(
                            './span[@class="thevotescount"]/span[@itemprop="ratingValue"]'
                        )
                        if score is not None:
                            ven.hqdb_review_score = score.text

            descElement = overview.find('./div[@class="contenu"]')
            if descElement is not None:
                text = ' | '.join(descElement.itertext()).strip()
                # Trim one separator left over at either end of the text.
                if text.startswith('|'):
                    text = text[1:]
                if text.endswith("|"):
                    text = text[:-1]
                text = text.replace('| \n |', '|')
                # Descriptions shorter than three words are discarded.
                ven.description = text if len(text.split()) >= 3 else None

            address = [
                part for part in (ven.street, ven.city, ven.zipcode)
                if part is not None and len(part.strip()) > 0
            ]
            address_ = ', '.join(address)
            (ven.latitude, ven.longitude) = self.getLatlng(address_)
            ven.is_get_by_address = True
            self.index += 1
            ven.writeToFile(self.folder, self.index, ven.name, False)
        except Exception as ex:
            print('[ERROR] ' + url)
            print(ex)
예제 #6
0
    def __VenueParser(self, element, cate, scrappages):
        # Parse one (non-featured) listing element into a Venue and write it
        # to file.
        #
        # element    -- listing node; its './div/a' child carries the detail
        #               link and the address spans.
        # cate       -- stored as ven.category.
        # scrappages -- not referenced in the visible part of this method.
        #
        # NOTE(review): this excerpt is cut off at the trailing `else:`; the
        # else body and the `except` clause belonging to the `try` below are
        # not visible here.
        subA = element.find('./div/a')
        link = subA.get('href')
        try:

            #if link =='http://es.qdq.com/f:1-GV9-6082/':
            #    print
            # Substring match: the link counts as already seen when it is
            # contained in any link recorded in self.listLink.
            existing = [x for x in self.listLink if link in x]
            print 'Scraping : ' + link
            if len(existing) <= 0:

                self.listLink.append(link)

                ven = Venue()
                #ven.name = subA.find('./div/h2').text
                ven.scrape_page = link
                #ven.subcategory = cate
                ven.category = cate
                ven.country = self._language
                ven.hqdb_featured_ad_type = "none"
                # Address parts are read from the spans' `itemprop` attribute.
                address = subA.xpath('./div/p/span')
                for span in address:
                    itemprop = span.get('itemprop')
                    if itemprop == 'street-address':
                        ven.street = span.text
                    if itemprop == 'postal-code':
                        ven.zipcode = span.text
                    if itemprop == 'locality':

                        # before the first "," and before the "/"

                        ven.city = span.text  #.split(',')[0]
                        if ven.city == '' or ven.city == None:
                            continue

                        find_slash = ven.city.find('/')
                        find_comma = ven.city.find(',')
                        # When both separators occur: keep the text before the
                        # first '/', and if that still holds a ',' take the
                        # piece right after its first ','.
                        if find_slash != -1 and find_comma != -1:
                            ven.city = ven.city.split('/')[0]
                            if ven.city.find(',') != -1:
                                ven.city = ven.city.split(',')[1]
                        # In every case, finally cut at the first ',' and '/'.
                        ven.city = ven.city.split(',')[0]
                        ven.city = ven.city.split('/')[0]
                if ven.street != None:
                    ven.street = self.validateStreet(ven.street)
                if ven.city != None:
                    # The pattern matches five consecutive digits whose first
                    # two are not both '0'; the first such run is removed from
                    # the city text.
                    re_City = re.search(
                        '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                        ven.city,
                        flags=0)
                    if re_City != None:
                        ven.city = ven.city.replace(re_City.group(0), '')
                if ven.zipcode != None:
                    ven.zipcode = ven.zipcode.strip()
                    if len(ven.zipcode) >= 5:
                        # Keep the zipcode only when the whole string equals
                        # the five-digit match (same pattern as for the city).
                        re_zipcode = re.search(
                            '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                            ven.zipcode,
                            flags=0)
                        if re_zipcode != None:
                            if re_zipcode.group(0) != ven.zipcode:
                                ven.zipcode = None
                        else:
                            ven.zipcode = None
                    else:
                        # Shorter values get a single leading '0' and must
                        # then equal the five-digit match, otherwise they are
                        # dropped.
                        ven.zipcode = '0' + ven.zipcode
                        rezipcode = re.search(
                            '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                            ven.zipcode,
                            flags=0)
                        if rezipcode == None:
                            ven.zipcode = None
                        else:
                            if ven.zipcode != rezipcode.group(0):
                                ven.zipcode = None

                # Drop zipcodes outside 1000..52080. A missing or non-numeric
                # zipcode makes int() raise and is reset to None as well.
                # NOTE(review): presumably the valid Spanish postal-code
                # range -- confirm.
                try:
                    if int(ven.zipcode) > 52080 or int(ven.zipcode) < 1000:
                        ven.zipcode = None
                except Exception, ex:
                    ven.zipcode = None

                # The name, phone and map image come from the detail page.
                # Neither `detail` nor the <h1> lookup is checked for None, so
                # a missing node raises inside the enclosing try.
                detail = Util.getRequestsXML(link, '//div[@id="contenido"]')
                ven.name = detail.find('.//h1').text  #.replace('Niño','Niño')
                #ven.name = Validator.RevalidName(ven.name)
                ven.name = self.replaceName(ven.name)
                phone = detail.find('.//span[@class="telefonoCliente"]')
                if phone != None:
                    phone = phone.text
                    # Numbers starting with '6' or '7' are stored as mobile
                    # numbers, everything else as office numbers.
                    if phone.startswith('6') or phone.startswith('7'):
                        ven.mobile_number = '' + phone
                        ven.mobile_number = self.validatePhone__(
                            ven.mobile_number)
                    else:
                        ven.office_number = '' + phone
                        ven.office_number = self.validatePhone__(
                            ven.office_number)
                # Coordinates are derived from the map image's `src` value by
                # self.getLatlng.
                maps = detail.find('.//div[@id="mymap"]/img')
                if maps != None:
                    maps = maps.get('src')
                    (ven.latitude, ven.longitude) = self.getLatlng(maps)
                #ven.is_get_by_address =True
                ven.writeToFile(self.folder, self.addIndex(), ven.name, False)
            # NOTE(review): branch for an already-seen link; its body is
            # missing from this excerpt.
            else:
예제 #7
0
    def __VenueParser_2(self, element, cate, scrape_pages):
        # Parse one featured listing element into a Venue and write it to
        # file. Unlike the non-featured parser, every field is read from the
        # listing element itself; no detail page is requested here.
        #
        # element      -- listing node containing './div/a' and
        #                 './div[@class="resultado nada"]'.
        # cate         -- stored as ven.category.
        # scrape_pages -- stored as ven.scrape_page.
        #
        # NOTE(review): this excerpt is cut off at the trailing `else:`; the
        # else body and the `except` clause belonging to the `try` below are
        # not visible here.
        subB = element.find('./div/a')
        link = subB.get('href')

        try:
            # Substring match: the link counts as already seen when it is
            # contained in any link recorded in self.listLink.
            existing = [x for x in self.listLink if link in x]
            if len(existing) <= 0:
                print 'Scraping Feature : ' + link

                self.listLink.append(link)
                ven = Venue()
                ven.country = self._language
                ven.hqdb_featured_ad_type = 'featured'
                ven.category = cate
                #ven.subcategory = cate
                ven.scrape_page = scrape_pages
                # Neither subDiv, div nor the <h2> lookup is checked for None;
                # a missing node raises inside the enclosing try.
                subDiv = element.find('./div[@class="resultado nada"]')
                div = subDiv.find('./a/div')
                ven.name = div.find('./h2').text  #.replace('Niño','Niño')
                '''if ven.name =='Niño de la Virgen':
                    print'''
                #ven.name = Validator.RevalidName(ven.name)
                ven.name = self.replaceName(ven.name)
                # Address parts are read from the spans' `itemprop` attribute.
                address = div.xpath('./p[@itemprop="address"]/span')
                if address != None:
                    for span in address:
                        itemprop = span.get('itemprop')
                        if itemprop == 'street-address':
                            ven.street = span.text
                        if itemprop == 'postal-code':
                            ven.zipcode = span.text
                        if itemprop == 'locality':
                            ven.city = span.text  #.split(',')[0]
                            if ven.city == '' or ven.city == None:
                                continue
                            find_slash = ven.city.find('/')
                            find_comma = ven.city.find(',')
                            # When both separators occur: keep the text before
                            # the first '/', and if that still holds a ','
                            # take the piece right after its first ','.
                            if find_slash != -1 and find_comma != -1:
                                ven.city = ven.city.split('/')[0]
                                if ven.city.find(',') != -1:
                                    ven.city = ven.city.split(',')[1]
                            # In every case, finally cut at the first ',' and
                            # '/'.
                            ven.city = ven.city.split(',')[0]
                            ven.city = ven.city.split('/')[0]
                    if ven.street != None:
                        ven.street = self.validateStreet(ven.street)
                    if ven.city != None:
                        # The pattern matches five consecutive digits whose
                        # first two are not both '0'; the first such run is
                        # removed from the city text.
                        re_City = re.search(
                            '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                            ven.city,
                            flags=0)
                        if re_City != None:
                            ven.city = ven.city.replace(re_City.group(0), '')
                    if ven.zipcode != None:
                        ven.zipcode = ven.zipcode.strip()
                        if len(ven.zipcode) >= 5:
                            # Keep the zipcode only when the whole string
                            # equals the five-digit match.
                            re_zipcode = re.search(
                                '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                                ven.zipcode,
                                flags=0)
                            if re_zipcode != None:
                                if re_zipcode.group(0) != ven.zipcode:
                                    ven.zipcode = None
                            else:
                                ven.zipcode = None
                        else:
                            # Shorter values get a single leading '0' and must
                            # then equal the five-digit match, otherwise they
                            # are dropped.
                            ven.zipcode = '0' + ven.zipcode
                            rezipcode = re.search(
                                '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                                ven.zipcode,
                                flags=0)
                            if rezipcode == None:
                                ven.zipcode = None
                            else:
                                if ven.zipcode != rezipcode.group(0):
                                    ven.zipcode = None

                # Drop zipcodes outside 1000..52080. A missing or non-numeric
                # zipcode makes int() raise and is reset to None as well.
                # NOTE(review): presumably the valid Spanish postal-code
                # range -- confirm.
                try:
                    if int(ven.zipcode) > 52080 or int(ven.zipcode) < 1000:
                        ven.zipcode = None
                except Exception, ex:
                    ven.zipcode = None

                # The find() result is dereferenced without a None check; a
                # listing without this paragraph raises inside the enclosing
                # try.
                description = div.find('./p[@class="descripcion"]').text
                if description != None:
                    ven.description = description
                # Collect the `src` of every image of the listing.
                imgs = subDiv.xpath('./a/figure/img')
                if len(imgs) > 0:
                    imgs_ = []
                    for im in imgs:
                        imgs_.append(im.get('src'))
                    ven.img_link = imgs_
                # The icon bar entries are told apart by their link text.
                footer = subDiv.xpath('./div[@class="iconos"]/ul/li')
                for fo in footer:
                    text__ = fo.find('./a').text
                    # NOTE(review): the href of the 'Mandar mail' entry is
                    # stored as the business website -- confirm this is the
                    # intended field.
                    if text__ == 'Mandar mail':
                        ven.business_website = fo.find('./a').get('href')
                    if text__ == 'Ver teléfono':
                        phone = fo.find('./span[@class="telefono"]').text
                        # Prefixes '+346', '+347', '7' and '6' are stored as
                        # mobile numbers, everything else as office numbers.
                        if phone.startswith('+346') or phone.startswith(
                                '+347') or phone.startswith(
                                    '7') or phone.startswith('6'):
                            ven.mobile_number = self.validatePhone__(phone)
                        else:
                            ven.office_number = self.validatePhone__(phone)
                #ven.is_get_by_address =True
                ven.writeToFile(self.folder, self.addIndex(), ven.name, False)
            # NOTE(review): branch for an already-seen link; its body is
            # missing from this excerpt.
            else:
    def __VenueParser(self, hqdb_type, linkItems, subcate, cate):
        """Scrape one profile page and build a ``Venue`` from it.

        Args:
            hqdb_type: value stored in ``ven.hqdb_featured_ad_type``.
            linkItems: URL of the profile page; stored as ``ven.scrape_page``
                and remembered in ``self.linkIn``.
            subcate: value stored in ``ven.subcategory``.
            cate: value stored in ``ven.category``.

        Returns:
            The populated ``Venue``. ``None`` when the link was already
            processed, the page or its expected containers are missing, the
            page heading is a ``ZERO``/``NULL`` placeholder, or neither an
            address nor any phone number could be extracted.
        """
        #linkItems ='https://www.blauarbeit.de/p/reinigungsservice/schwarzheide/rene_nowak/11894.htm'
        # Substring match: the link counts as already seen when it is
        # contained in any link recorded so far.
        existing = [x for x in self.linkIn if linkItems in x]
        if len(existing) > 0:
            print('This venue exist in list')
            return None
        self.linkIn.append(linkItems)

        xmlPages = self.getRequest(linkItems)
        if xmlPages is None:
            return None
        xmlVen = xmlPages.xpath('//div[@class="page_move"]')
        if len(xmlVen) == 0:
            return None
        name = xmlVen[0].xpath('.//h2')
        if len(name) <= 0:
            name = ''
        else:
            # .text is None for an empty heading element.
            name = (name[0].text or '').strip()
        noneValues = {'ZERO', 'NULL'}
        if name.upper() in noneValues:
            print('return none')
            return None

        ven = Venue()
        # The heading is only used for the placeholder check above; the
        # venue name itself is derived from the URL.
        ven.name = self.getNamefromUrl(linkItems)
        ven.hqdb_featured_ad_type = hqdb_type
        ven.scrape_page = linkItems
        ven.subcategory = subcate
        ven.category = cate
        address_ = ''
        img_link = []
        divInfo = xmlVen[0].find(
            './/div[@class="content_wrapper content_wrapper_main clearfix"]/div'
        )
        if divInfo is not None:
            mainInfo = divInfo.xpath('./section')
            if len(mainInfo) >= 2:
                leftInfo = mainInfo[0]
                rightInfo = mainInfo[1]

                # --- key/value table: contact, address, homepage, phone ---
                tableInfo = leftInfo.find(
                    './div/div[@class="profile_top_left"]/table')
                # find() yields None when the table is absent.
                if tableInfo is not None:
                    trinfo = tableInfo.xpath('./tr')
                else:
                    trinfo = []
                for tr_ in trinfo:
                    td = tr_.xpath('./td')
                    if len(td) < 2:
                        print('continue')
                        continue
                    key_ = ''.join(td[0].itertext()).strip()
                    # Placeholder texts are blanked out of the value.
                    values_ = ' '.join(td[1].itertext()).strip().replace(
                        'keine Angabe', '').replace('NULL',
                                                    '').replace('null', '')
                    if key_ == 'Ansprechpartner:':
                        if len(values_) > 2:
                            ven.name_of_contact = values_
                            ven.name += ', ' + ven.name_of_contact

                    elif key_ == 'Addresse:':
                        address_ = values_
                        (ven.street, ven.city,
                         ven.zipcode) = self.processAddress(address_)
                        if ven.city is not None:
                            checkCity = ven.city.split()
                            if checkCity and checkCity[0].isdigit():
                                if len(checkCity[0]) == 5:
                                    # The city starts with a five-digit
                                    # number: use it as the zipcode and move
                                    # the previously parsed "zipcode" to the
                                    # end of the street.
                                    if (ven.street is not None
                                            and ven.zipcode is not None):
                                        ven.street += ' ' + ven.zipcode
                                    ven.zipcode = checkCity[0]
                                    ven.city = ven.city.replace(
                                        ven.zipcode, '')
                                else:
                                    # Unusable split: keep only the raw city
                                    # text as the formatted address.
                                    ven.city = None
                                    ven.street = None
                                    ven.zipcode = None
                                    ven.formatted_address = ' '.join(
                                        checkCity)

                        # Only five-character zipcodes are kept.
                        if ven.zipcode is not None and len(ven.zipcode) != 5:
                            ven.zipcode = None

                    elif key_ == 'Homepage:':
                        a_ = td[1].find('./a')
                        if a_ is not None:
                            ven.business_website = a_.get('href')

                    elif key_ == 'Tel:':
                        values_ = values_.replace('/',
                                                  '').replace(' ', '').replace(
                                                      'Tel', '')
                        # These prefixes are stored as mobile numbers,
                        # everything else as office numbers.
                        if values_.startswith(('01', '+0041', '0041')):
                            ven.mobile_number = self.validatePhone__(
                                self.validatePhone(values_), 'de')
                        else:
                            ven.office_number = self.validatePhone__(
                                self.validatePhone(values_), 'de')

                # --- profile image ---
                img_ = leftInfo.find(
                    './div/div[@class="profile_top_right"]/img')
                if img_ is not None:
                    img_link.append(img_.get('src'))

                # --- ratings: second token of the first div is the review
                # count, first token of the second div holds "score/max" ---
                rating = leftInfo.xpath('.//section[@id="ratings"]/div')
                if len(rating) >= 2:
                    countTokens = ''.join(rating[0].itertext()).split()
                    scoreTokens = ''.join(rating[1].itertext()).split()
                    if len(countTokens) > 1:
                        ven.hqdb_nr_reviews = countTokens[1]
                    if scoreTokens:
                        rating2 = scoreTokens[0].split('/')[0].replace(
                            ',', '.')
                        try:
                            float(rating2)
                        except ValueError:
                            # Not a number: store no score at all.
                            rating2 = None
                        ven.hqdb_review_score = rating2

                if ven.hqdb_review_score is None:
                    # Fallback score location. The leading '//' makes this
                    # query run against the whole document.
                    scoreIn = xmlVen[0].xpath(
                        '//div[@class="float_box"]//span[@class="txtLight"]/parent::div'
                    )
                    if len(scoreIn) > 0:
                        core_ = (scoreIn[0].text or '').replace(',', '.')
                        try:
                            float(core_)
                        except ValueError:
                            core_ = None
                        ven.hqdb_review_score = core_

                # A venue without any address text and without any phone
                # number is discarded.
                if (address_.strip() == '' and ven.office_number is None
                        and ven.office_number2 is None
                        and ven.mobile_number is None
                        and ven.mobile_number2 is None):
                    print('None address and phone number')
                    return None

                # --- address string used for the geocode lookup ---
                address_ = ', '.join(
                    [ven.street or '', ven.city or '', ven.zipcode or ''])
                address_ = address_.strip().replace(', ,',
                                                    ',').replace(',,', ',')
                if address_.startswith(','):
                    address_ = address_[1:]
                if address_.endswith(','):
                    address_ = address_[:-1]

                if ven.formatted_address is not None:
                    address_ = ven.formatted_address

                zipFrom = self.findZipcode(address_)
                if zipFrom is not None:
                    (ven.latitude,
                     ven.longitude) = self.getLatlng(zipFrom, 'DE')
                # Report a failure only when no coordinates were obtained
                # (the success path used to log the same failure message).
                if zipFrom is None or (ven.latitude is None
                                       and ven.longitude is None):
                    Util.log.running_logger.info(address_ +
                                                 ' : cannot get GEO code')

                # --- photo gallery (separate request) ---
                redirecPhotos = rightInfo.find(
                    './nav/div/ul/li[@class="tabOff tab_foto"]/a')
                if redirecPhotos is not None:
                    linkPhotos = redirecPhotos.get('href')
                    # get() returns None when the anchor has no href.
                    if linkPhotos is not None:
                        if linkPhotos.startswith('/'):
                            linkPhotos = self.__url__ + linkPhotos
                        # Pause before issuing the extra request.
                        time.sleep(1)
                        xpathPhotos = Util.getRequestsXML(
                            linkPhotos, '//div[@class="portfolio thumbs"]/a')
                        if xpathPhotos is not None:
                            for thumb in xpathPhotos.xpath('./a'):
                                img_link.append(thumb.get('data-thumb'))

                # --- description ---
                desElement = rightInfo.find('./div/div[@id="cont_about"]')
                des = ''
                if desElement is not None:
                    # NOTE(review): the leading '//' searches the whole
                    # document, not just desElement -- confirm whether './/'
                    # was intended.
                    des = ''.join(
                        ' '.join(divDes.itertext())
                        for divDes in desElement.xpath(
                            '//div[@class="overview"]'))
                ven.description = self.validateDes(des)

                # A "street" containing '@' is an e-mail address, not a
                # street.
                if ven.street is not None and ven.street.find('@') >= 0:
                    ven.street = None

                # --- certificates ---
                certi = rightInfo.find('.//div/div[@id="cont_certs"]')
                tablecerti = certi.find('./table') if certi is not None else None
                if tablecerti is not None:
                    certi_ = ''.join(tablecerti.itertext()).replace(
                        'Geprüfte Zertifikate:', '')
                    ven.accreditations = certi_

                ven.img_link = img_link
                ven.country = 'de'
                ven.is_get_by_address = True
                return ven