Code example #1
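    # Parses a venue <li> element from a listing page: extracts the name, website,
    # phone number (from the Dutch 'Telefoon:' label), street, city/zipcode and
    # lat/lng, then writes the Venue record to file.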
    def __VenueParser(self, element, listData, scrape_page):
        try:
            classLI = element.get('class')
            featured = 'none'
            if classLI.find('TOP') != -1:
                featured = 'featured'
            ven = Venue()
            ven.name = element.find('./a').text.replace("'", '')
            ven.country = self._language
            ven.hqdb_featured_ad_type = featured
            ven.scrape_page = scrape_page

            div = element.find('./div')
            if div.find('./font') != None:
                font = div.xpath('./font')
                #div.remove(font)
                for font_ in font:
                    div.remove(font_)

            a = div.find('./a')
            if a != None:
                ven.business_website = self.check_website(a.get('href'))
                div.remove(a)
            content = '|'.join(div.itertext())
            content_ = content.split('|')
            phone = ''
            position = -1
            if content_[position].find('Telefoon:') != -1:
                phone = content_[position].replace('Telefoon:', '').replace(
                    ' ', '').replace('-', '').replace('PR', '')

                if phone.startswith('00'):
                    phone = '+' + phone[2:]
                if phone.startswith('31'):
                    phone = '+' + phone
                # Discard international numbers that are not Dutch (+31).
                if phone.startswith('+') and not phone.startswith('+31'):
                    phone = None

                if phone != None:
                    if phone.startswith('06') or phone.startswith('+316'):
                        ven.mobile_number = self.validatePhone__(phone)
                    else:
                        ven.office_number = self.validatePhone__(phone)
                position -= 1
            city_zipcode = content_[position]
            street = ' '.join(content_[0:position])
            (ven.zipcode, ven.city) = self.__processAddress(city_zipcode)
            ven.street = street
            (ven.latitude,
             ven.longitude) = self.processlatlng(listData, city_zipcode)

            index = self.index + 1
            self.index = index
            ven.writeToFile(self.folder, index, ven.name, False)
        except Exception, ex:
            print ex
Code example #2
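    # Fetches a uksecurity-directory.co.uk detail page (skipping duplicate URLs)
    # and extracts website or social links, services, description, phone numbers,
    # postcode/town/area and images into a Venue record, then writes it to file.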
    def __VenueParser(self, url, cateName):
        existing = [x for x in self.listlink if url in x]
        self.listlink.append(url)
        if len(existing) > 0:
            self.countduplicate += 1
            print '[INFO] Duplicate count = ' + str(self.countduplicate)
            return
        try:
            print 'Scraping url: ' + url

            #url = 'http://www.uksecurity-directory.co.uk/the-directory/1905/ecpcco/'

            xmlDoc = Util.getRequestsXML(url,
                                         '//div[@class="gdl-page-content"]')
            xmlDoc = xmlDoc.xpath('//div[@class="gdl-page-content"]/div')[0]
            ven = Venue()

            imgs = []

            ven.category = cateName
            ven.scrape_page = url
            ven.country = self._language
            ven.name = xmlDoc.find('./div/h2').text
            ven.hqdb_featured_ad_type = 'none'
            isFeatured = xmlDoc.find('./div[@class="stickytag"]/img')
            if isFeatured != None:
                if isFeatured.get('title') == 'Featured Listing':
                    ven.hqdb_featured_ad_type = 'featured'
            divInfo = xmlDoc.xpath('./div[@class="listing-details cf"]/div')
            town_ = ''
            area_ = ''
            zipcode = ''
            listPhone_ = []
            for div__ in divInfo:
                label = div__.find('./label')
                if label != None:
                    label_ = label.text
                    if label_ == 'Business Website Address:':
                        website = div__.find('./span/a')
                        if website != None:
                            website = website.get('href')
                            isFacebook = website.find('facebook.com')
                            isTwitter = website.find('twitter.com')
                            if isFacebook == -1 and isTwitter == -1:
                                ven.business_website = website
                            else:
                                if isFacebook != -1:
                                    ven.facebook = website
                                if isTwitter != -1:
                                    ven.twitter = website
                    if label_ == 'Security Services:':
                        serviceStr = div__.xpath('./span/a')
                        sers = []
                        for ser in serviceStr:
                            serv = Service()
                            serv.service = ser.text
                            sers.append(serv)
                        if len(sers) > 0:
                            ven.services = sers
                            ven.pricelist_link = [ven.scrape_page]
                    if label_ == 'Long Business Description:':
                        des = div__.find('./span')
                        if des != None:
                            des = ' '.join(des.itertext())
                            ven.description = des
                    if label_ == 'Business Phone Number:':
                        phone = div__.find('./span').text
                        #phone = self.formatPhone(phone)
                        findsplistPPhone = self.findSplitPhone(phone)
                        #(ven.office_number,ven.office_number2,ven.mobile_number,ven.mobile_number2) = self.processPhones([phone])
                        if findsplistPPhone == None:
                            listPhone_ = [phone]
                        else:
                            listPhone_ = phone.split(findsplistPPhone)
                        (ven.office_number, ven.office_number2,
                         ven.mobile_number,
                         ven.mobile_number2) = self.processPhones(listPhone_)

                    if label_ == 'Postcode:':
                        zipcode = div__.find('./span').text
                    if label_ == 'Town:':
                        town_ = div__.find('./span').text
                    if label_ == 'Area:':
                        area_ = div__.find('./span').text
                    zipcode = self.validateZipcode(zipcode)
            if ven.office_number == 'NOT_GB' or ven.office_number2 == 'NOT_GB' or ven.mobile_number == 'NOT_GB' or ven.mobile_number2 == 'NOT_GB':
                return
            for p in listPhone_:
                if p == town_:
                    town_ = ''
                    break

            ven.zipcode = zipcode
            ven.formatted_address = ', '.join([area_, town_, zipcode])

            ven.formatted_address = self.refixFormatAddress(
                ven.formatted_address.replace('0000000', ''))
            extraImg = xmlDoc.xpath('./div[@class="extra-images"]//a/img')
            listingThumbnail = xmlDoc.xpath(
                './div[@class="listing-thumbnail"]//a/img')
            for thumb in listingThumbnail:
                imgs.append(thumb.get('src'))
            for img in extraImg:
                imgs.append(img.get('src'))
            if len(imgs) > 0:
                ven.img_link = imgs
            self.index = self.index + 1
            ven.writeToFile(self.folder, self.index, ven.name, False)

        except Exception, ex:
            print '[ERROR] ' + url + ': ' + str(ex)
Code example #3
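    # Fetches a venue detail page and extracts name, lat/lng, services with
    # prices, opening hours, address, description, images, phones and email,
    # writing the Venue to file unless a phone number is flagged as overseas.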
    def __VenueParser(self, item, city):
        existing = [x for x in self.listLink if item in x]
        self.listLink.append(item)
        if len(existing) <= 0:
            try:
                xmlDoc = Util.getRequestsXML(item, '/html')
                ven = Venue()
                ven.scrape_page = item
                #ven.city = city
                ven.name = xmlDoc.xpath(
                    '//div[@class="row top-buffer"]/h3')[0].text
                (ven.latitude, ven.longitude) = self.getLatlng(xmlDoc)
                xmlcontent = xmlDoc.find('.//div[@class="tab-content"]')
                services_schedule_info = xmlcontent.xpath(
                    './div/div[@class="row top-buffer"]/h4/parent::div')[0]
                if services_schedule_info != None:
                    services_schedule_info = ''.join(
                        services_schedule_info.itertext()).split('\n')
                    for it in services_schedule_info:
                        if it.find('Style:') != -1:
                            it = it[0:it.find('Schedule')]
                            it = it.strip()
                            ser_name = it[it.find('Style:') + len('Style:'):it.
                                          find('Ability level')]
                            cost = len(it)
                            cost_ = ['Cost:', 'Concession cost:']
                            char_cost = ''
                            for c in cost_:
                                if it.find(c) != -1:
                                    cost = it.find(c)
                                    char_cost = c
                                    break
                            #cost = it.find('Cost:')
                            if cost == -1:
                                cost = len(it)

                            ser_des = it[it.find('Ability level:') +
                                         len('Ability level:'):cost]
                            ser_price = it[cost +
                                           len(char_cost):it.find('GBP') +
                                           len('GBP')]
                            ser = Service()
                            ser.service = ser_name
                            ser.description = ser_des
                            ser.price = ser_price.replace('-', '')
                            ven.services = [ser]
                        if it.find('a.m.') != -1 or it.find('p.m.') != -1:
                            ven.opening_hours_raw = it.strip()
                            for day in ('Monday', 'Tuesday', 'Wednesday',
                                        'Thursday', 'Friday', 'Saturday',
                                        'Sunday'):
                                ven.opening_hours_raw = ven.opening_hours_raw.replace(
                                    '.' + day, ' | ' + day)
                            ven.opening_hours_raw = self.formatOpenhour(
                                ven.opening_hours_raw)
                address = xmlcontent.find('.//address')
                if address != None:

                    #print ET.dump(address)
                    address = ''.join(address.itertext()).replace(
                        'United Kingdom', '').strip()
                    address = self.validateAddress(address)

                    #address ='Ward Park Arras Pavilion,Gransha Road,Bangor,Northern Ireland,BT20 4TN'
                    ven.country = 'gb'
                    if address.upper().find('Ireland'.upper()) != -1:
                        if address.upper().find(
                                'Northern Ireland'.upper()) != -1:
                            ven.country = 'ie'
                    if address.endswith(','):
                        address = address[0:-1]
                    ven.formatted_address = address
                posted = xmlcontent.find('./div/div[@class="row"]/p')
                imgs = xmlcontent.xpath('.//a/img')
                img_ = []
                for img in imgs:
                    img_.append(img.get('src'))

                ven.img_link = img_
                if posted != None:
                    ven.hqdb_ad_posted = posted.text.replace(
                        'Last updated', '')
                    split_posted = ven.hqdb_ad_posted.split(',')
                    if len(split_posted) >= 3:
                        ven.hqdb_ad_posted = ', '.join(
                            split_posted[0:len(split_posted) - 1])
                ven.category = self.category
                #ven.country ='gb'
                des_info = xmlcontent.xpath(
                    '//div[@class="row top-buffer"]/h3')[1]
                #print des_info.text
                des_info = des_info.getparent()
                des__ = des_info.xpath('./p')

                ven.pricelist_link = [ven.scrape_page]
                ven.hqdb_featured_ad_type = 'none'

                ven.description = ''
                for des in des__:
                    ven.description += ''.join(des.itertext()) + ' '
                    des_info.remove(des)
                info = '____'.join(des_info.itertext())
                a = des_info.find('./a')
                if a != None:
                    a = a.get('href')
                    if a.find('facebook.com') == -1:
                        ven.business_website = a
                    else:
                        if a.startswith('http:'):
                            a = a.replace('http:', 'https:')
                        ven.facebook = a

                info = info.split('__')

                for inf in range(0, len(info)):
                    if info[inf] == 'Qualifications:':
                        ven.accreditations = info[inf + 2]
                    if info[inf] == 'Phone:':

                        phone = info[inf + 2].strip()

                        pattern = '(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)'
                        '''isEmail  = re.search(pattern, phone, flags=0)
                        if isEmail!=None:
                            ven.business_email = isEmail.group(0)
                            continue'''
                        find_charSplit = self.findSplitPhone(phone)
                        if find_charSplit == None:

                            issMail = re.search(pattern, phone, flags=0)
                            if issMail != None:
                                ven.business_email = issMail.group(0)
                                continue
                            phone = phone.replace('Mobile:', '').replace(
                                'ext.225', '').replace('O7', '07').replace(' ', '')
                            if phone.startswith(('07', '447', '+447', '00447',
                                                 '+44(0)7', '44(0)7', '004407')):
                                ven.mobile_number = self.validatePhone__(
                                    phone, ven.country)
                            else:
                                ven.office_number = self.validatePhone__(
                                    phone, ven.country)
                        else:
                            phone = phone.split(find_charSplit)
                            for p in phone:
                                issMail = re.search(pattern, p, flags=0)
                                if issMail != None:
                                    ven.business_email = issMail.group(0)
                                    continue
                                p = p.replace('Mobile', '').replace(
                                    'ext225', '').replace('O7', '07').replace(' ', '')
                                if p.startswith(('07', '447', '+447', '00447',
                                                 '+44(0)7', '44(0)7', '004407')):
                                    if ven.mobile_number != None:
                                        ven.mobile_number2 = self.validatePhone__(
                                            p, ven.country)
                                    else:
                                        ven.mobile_number = self.validatePhone__(
                                            p, ven.country)
                                else:
                                    if ven.office_number != None:
                                        ven.office_number2 = self.validatePhone__(
                                            p, ven.country)
                                    else:
                                        ven.office_number = self.validatePhone__(
                                            p, ven.country)
                isPhoneOverSea = self.checkPhoneOverSea([
                    ven.office_number, ven.office_number2, ven.mobile_number,
                    ven.mobile_number2
                ])
                if isPhoneOverSea == False:
                    index = self.addIndex()
                    print str(index) + ' Scraping: ' + city + '---' + ven.scrape_page
                    #ven.is_get_by_address =True
                    ven.writeToFile(self.folder, index, ven.name, False)
            except Exception, ex:
                print ex
                return
Code example #4
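 # Parses a listing table row: follows the MM_openBrWindow detail link, then
 # extracts email/website, opening hours (German day names mapped to Italian),
 # phone, address, contact name and services, and writes the Venue to file.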
 def __VenueParser(self, element):
     try:
         self.urlmarks += 1
         print '[COUNT]: ' + str(self.urlmarks)
         featured = 'featured'
         onclick = element.xpath('./td//a/@onclick')[0]
         start = onclick.find("MM_openBrWindow('") + len("MM_openBrWindow('")
         end = onclick.find("','','")
         detailLink = onclick[start:end]
         detailLink = self.__url__ + '/' + detailLink
         xmlDoc = Util.getRequestsXML(detailLink,
                                      '//table[@cellspacing="3"]')
         xmlDoc = xmlDoc[0]
         ven = Venue()
         ven.hqdb_featured_ad_type = featured
         ven.country = self._language
         ven.scrape_page = detailLink
         detail_ = xmlDoc.xpath('./tr/td/table')
         detail_1 = detail_[0]
         detail_2 = detail_[2]
         basicInfo = detail_1.find('./tr/td/table/tr/td[@class="text"]')
         email_website = basicInfo.getparent().xpath(
             '//table//div[@align="right"]/a')
         for aTag in email_website:
             link__ = aTag.get('href')
             if link__.find('mailto') != -1:
                 ven.business_email = link__.replace('mailto:', '')
             else:
                 if link__.find('http') != -1:
                     ven.business_website = link__
         openxml = detail_1.xpath('./tr')
         openxml = openxml[2].find('./td/table')  # table of opening hours
         rows = openxml.xpath('./tr')
         dayofweek = {
             'Montag': 'Lunedì',
             'Dienstag': 'Martedì',
             'Mittwoch': 'Mercoledì',
             'Donnerstag': 'Giovedì',
             'Freitag': 'Venerdì',
             'Samstag': 'Sabato',
             'Sonntag': 'Domenica'
         }
         opening_hour_array = []
         for row in rows:
             tds = row.xpath('./td')
             if len(tds) > 0:
                 if tds[0].text != None:
                     if dayofweek.get(tds[0].text, "NULL") != "NULL":
                         record = ''
                         count_ = 0
                         for td in tds:
                             if dayofweek.get(td.text, "NULL") != "NULL":
                                 record += dayofweek.get(td.text) + ": "
                             else:
                                 if td.text.strip() != '-':
                                     record += td.text.replace('.',
                                                               ':') + ", "
                                 else:
                                     count_ += 1
                         record = record.strip()
                         if record.endswith(','):
                             record = record[0:-1]
                         if count_ < 3:
                             opening_hour_array.append(record)
         ven.opening_hours_raw = ' | '.join(opening_hour_array)
         basicInfo_ = ''.join(basicInfo.itertext()).split('\n')
         if basicInfo_[len(basicInfo_) - 1].find('Fax') != -1:
             basicInfo_ = basicInfo_[0:-1]
         phonenumber = basicInfo_[-1].strip().replace('Tel.',
                                                      '').replace(' ', '')
         zip_ci = basicInfo_[-2]
         street = basicInfo_[-3]
         contactName = basicInfo_[-4]
         name = ' '.join(basicInfo_[0:-4])
         (ven.office_number, ven.office_number2, ven.mobile_number,
          ven.mobile_number2) = self.processPhone([phonenumber])
         (ven.city, ven.zipcode) = self.processZipCity(zip_ci)
         if ven.zipcode != None:
             if self.validateZip(ven.zipcode) == None:
                 return
         ven.street = street
         ven.name_of_contact = contactName
         ven.name = name
         services = detail_2.xpath('./tr[@valign="top"]/td')
         if len(services) > 0:
             service_ = []
             services = services[0].text
             ven.services = self.__ServicesParser(services)
         self.index += 1
         ven.writeToFile(self.folder, self.index, ven.name, False)
     except Exception, ex:
         print ex
Code example #5
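    # Parses a venue table row directly from the listing page (no detail request):
    # extracts website, phone, zipcode/city, street and contact name, and stores
    # the Venue in self.venues keyed by its scrape page.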
    def __VenueParser_2(self, element):
        try:
            featured = 'none'
            #print ET.dump(element)
            self.urlmarks += 1
            print '[COUNT]: ' + str(self.urlmarks)
            ven = Venue()
            ven.scrape_page = self._url_lstVenues + '#' + str(self.urlmarks)
            td = element.find('./td')
            div = td.find('./div')
            if div != None:
                a = div.find('./a').get('href')
                ven.business_website = a
                td.remove(div)
            basicInfo = ''.join(td.itertext())
            #basicInfo =td.text

            street = ''
            contactName = ''
            name = ''

            basicInfo_ = basicInfo.split('\n')
            if basicInfo_[len(basicInfo_) - 1].find('Fax') != -1:
                basicInfo_ = basicInfo_[0:-1]
            positionInArr = -1
            phoneNumber = basicInfo_[positionInArr].strip().replace(
                'Tel.', '').replace(' ', '')
            positionInArr -= 1
            local = basicInfo_[positionInArr]
            positionInArr -= 1
            street = basicInfo_[positionInArr]
            if self.isContactName(street) == True:
                contactName = street
                street = ''
                positionInArr += 1
            positionInArr -= 1
            if contactName == '':
                contactName = basicInfo_[positionInArr]

            if self.isContactName(contactName) == False:
                positionInArr += 1
                contactName = ''
            name = ' '.join(basicInfo_[0:positionInArr])

            (ven.office_number, ven.office_number2, ven.mobile_number,
             ven.mobile_number2) = self.processPhone([phoneNumber])
            (ven.city, ven.zipcode) = self.processZipCity(local)
            if ven.zipcode != None:
                if self.validateZip(ven.zipcode) == None:
                    return
            ven.street = street
            ven.name_of_contact = contactName
            ven.name = name
            ven.country = self._language
            ven.hqdb_featured_ad_type = featured
            self.venues[ven.scrape_page] = ven
        except Exception, ex:
            print ex
Code example #6
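    # Fetches a drivingschoolsfinder.co.uk detail page: extracts description,
    # street/zipcode, phone, services, areas covered, review count and star
    # rating, email/website, geocodes the address and returns the Venue.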
    def __VenueParser(self, url__, name__):
        print 'Scraping: ' + url__
        existing = [x for x in self.venuesList if url__ in x]
        if len(existing) > 0:
            return None
        #url__ ='http://www.drivingschoolsfinder.co.uk/city-Accrington/1846198-driving-Terrys-School-of-Motoring.html'
        #name__ ='Terrys School of Motoring'
        city = url__.split('/')[3].replace('city-', '').replace('-', ' ')
        xmlDoc = Util.getRequestsXML(url__, '/html/body')
        if xmlDoc == None:
            return None
        else:
            ven = Venue()
            sers = []
            ven.name = name__
            ven.city = city
            ven.scrape_page = url__
            td = xmlDoc.xpath('//td[@class="welcome-padding"]')
            iter__ = ''.join(td[0].itertext())
            iter__ = iter__[iter__.find('Driving School:') +
                            len('Driving School:'):iter__.
                            find('[Edit Text]')].replace('\n', '|').replace(
                                '\t', '')
            iter__ = iter__.replace('|||', ' | ')
            rep = '|' + name__
            iter__ = iter__[0:iter__.find(rep)]
            rep = '  |  |'
            iter__ = iter__[0:iter__.find(rep)]
            ven.description = iter__
            div = td[0].xpath('./div')

            if len(div) < 5:
                return None
            else:
                # div info = position div gray-line[0]+1
                div_info = 0
                for div_ in div:
                    if div_.find('./script') != None:
                        div_info = 3
                info = div[div_info]
                info_ = ''.join(info.itertext())
                address = info_[0:info_.find('Phone')].replace(
                    name__, '').replace(city,
                                        ',' + city).replace(',,', ',').replace(
                                            ', ,', ',').split(',')
                #street = ', '.join(address[0:len(address)-2]).replace(','+city,'')
                street = ', '.join(address[0:len(address)])
                street = street[0:street.find(city) - 1]
                if street.endswith(','):
                    street = street[0:len(street) - 1]
                zipcode = address[len(address) - 1]
                street__ = street.upper()
                if street__.find('PO BOX') == -1:
                    ven.street = street.replace('n/a', '').replace(
                        '***', '').replace('6 weldon place croy', '').replace(
                            'cumbernauld41 napier square bellshill ml4 1tb',
                            '').replace('P.O. Box 1048', '')
                if ven.street == '-':
                    ven.street = None
                ven.zipcode = self.validateZipcode(zipcode)

                phone = info_[info_.find('Phone:') +
                              len('Phone:'):info_.find('Fax:')].replace(
                                  ' ', '')
                if phone.isdigit():
                    if phone.startswith('07') or phone.startswith('7'):
                        ven.mobile_number = self.validatePhone(phone)
                        ven.mobile_number = self.validatePhone__(
                            ven.mobile_number, 'gb')
                    else:
                        ven.office_number = self.validatePhone(phone)
                        ven.office_number = self.validatePhone__(
                            ven.office_number, 'gb')
                services_ = info_[info_.find('Services Offered:') +
                                  len('Services Offered:'):info_.
                                  find('Areas Served:')].strip().replace(
                                      ';', ',')
                if services_ != 'None Listed - [Edit]':
                    services_ = services_.replace('/',
                                                  ',').replace(',,',
                                                               ',').split(',')
                    for s in services_:
                        name = self.validateServices(s)
                        if len(name) >= 5:
                            name__ = name.split()
                            for n in name__:
                                name = self.validateNameServices(name)
                        if len(name.strip()) >= 5:
                            services = Service()
                            services.service = name
                            sers.append(services)

                    #ven.description = ven.description +' | ' +services_
                stringfind = 'No Website'
                if info_.find('No Website') == -1:
                    stringfind = 'Website'
                area_coverd = info_[info_.find('Areas Served:') +
                                    len('Areas Served:'):info_.
                                    find(stringfind)].strip().replace(
                                        ';', ',')
                #area_coverd = area_coverd[0:area_coverd.find(stringfind)]
                if area_coverd != 'None Listed - [Edit]':
                    ven.areas_covered = area_coverd

                ven.services = sers
                reviewer = len(xmlDoc.xpath('//td[@class="review-box"]'))
                if reviewer > 0:
                    ven.hqdb_nr_reviews = str(reviewer)
                scoreInfo = div[div_info + 1]
                #http://www.drivingschoolsfinder.co.uk/halfstar.gif +0.5
                #http://www.drivingschoolsfinder.co.uk/fullstar.gif +1
                #http://www.drivingschoolsfinder.co.uk/emptystar.gif +0
                tr = scoreInfo.xpath('./table/tr')
                tr = tr[1]
                img_core = tr.xpath('./td')[1]
                img_core = img_core.xpath('./table/tr/td/img')
                score__ = 0.0
                for score in img_core:
                    score_ = score.get('src')
                    if score_ == 'http://www.drivingschoolsfinder.co.uk/halfstar.gif':
                        score__ += 0.5
                    if score_ == 'http://www.drivingschoolsfinder.co.uk/fullstar.gif':
                        score__ += 1
                    if score_ == 'http://www.drivingschoolsfinder.co.uk/emptystar.gif':
                        score__ += 0
                if score__ > 0:
                    ven.hqdb_review_score = str(score__).replace('.0', '')
                ven.country = 'gb'
                emails_ = re.findall(r'[\w\.-]+@[\w\.-]+', info_)
                for email_ in emails_:
                    ven.business_email = email_
            #    website_ = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', info_)
            #    for web_  in website_:
            #       ven.business_website = web_
                if ven.business_email != None:
                    if ven.business_email.startswith('http'):
                        ven.business_email = None
                if info_.find('No Website') == -1:
                    arrays__ = info_.split(' ')
                    for i in range(0, len(arrays__)):
                        if arrays__[i].find('Website') >= 0:
                            web_ = arrays__[i + 1].replace('\t', ' ').replace(
                                '\n', ' ').split()[0].replace('No', '')
                            ven.business_website = self.formatWeb_(web_)
                            print ven.business_website
                            break
                address_ = ''
                if ven.street == None:
                    address_ = ven.city + ', ' + ven.zipcode
                    #ven.formatted_address = ven.city+', '+ven.zipcode
                else:
                    if ven.zipcode != None:
                        address_ = ven.street + ', ' + ven.city + ', ' + ven.zipcode
                    else:
                        address_ = ven.street + ', ' + ven.city
                ven.pricelist_link = [ven.scrape_page]
                ''' get lat -lng '''
                if address_ != '':
                    try:
                        (ven.latitude,
                         ven.longitude) = self.getLatlng(address_, 'UK')
                    except Exception, ex:
                        Util.log.running_logger.error(ven.scrape_page + ' : ' + str(ex))
                        return None
            ven.is_get_by_address = True
            return ven
Code example #7
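 # Fetches a blauarbeit.de profile page: extracts category, contact name,
 # address, phone, website, ratings, images, description and certificates,
 # geocodes the zipcode and returns the Venue (or None for duplicates/failures).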
 def __VenueParser(self,hqdb_type, linkItems,subcate,cate):    
         #linkItems ='https://www.blauarbeit.de/p/modernisierung/_sanierung/berlin/daniel_kutscher/576667.htm'
         existing=[x for x in self.linkIn if linkItems in x]
         if len(existing)>0:
             print 'This venue already exists in the list'
             return None
         self.linkIn.append(linkItems)
         
         
         
         xmlPages = self.getRequest(linkItems)
         if xmlPages==None:
             return None
       
     
         xmlVen = xmlPages.xpath('//div[@class="page_move"]')
         cate__ = xmlPages.find('.//meta[@name="Description"]')
         
         if len(xmlVen)==0:
             return None
     
         name = xmlVen[0].xpath('.//h2')
         if len(name) <=0:
             name =''
         else:
             name = name[0].text.strip()
         noneValues ={'ZERO','NULL'}
         if name.upper() in noneValues:
             return None
         ven = Venue()
         
         if cate__!=None:
             ven.category = cate__.get('content').split(',')[0]
             
             
         nameFromUrl = self.getNamefromUrl(linkItems)
         ven.name =  nameFromUrl
         ven.hqdb_featured_ad_type = hqdb_type
         #ven.name =name
         ven.scrape_page = linkItems
         #ven.subcategory = subcate
         #ven.category= cate
         address_= ''
         #ven.formatted_address=''
         img_link= []
         divInfo = xmlVen[0].find('.//div[@class="content_wrapper content_wrapper_main clearfix"]/div')
         if divInfo!=None:
             mainInfo =  divInfo.xpath('./section')    
             if len(mainInfo)>=2:
                 leftInfo =  mainInfo[0]
                 rightInfo = mainInfo[1]
                 tableInfo = leftInfo.find('./div/div[@class="profile_top_left"]/table')
                 trinfo = tableInfo.xpath('./tr')
                 for tr_ in trinfo:
                     td =tr_.xpath('./td')
                     if len(td)<2:
                         continue
                     key_ = ''.join(td[0].itertext()).strip()
                     values_ = ' '.join(td[1].itertext()).strip().replace('keine Angabe','').replace('NULL','').replace('null','')
                     if key_ =='Ansprechpartner:':
                         if values_!=None and len(values_)>2:
                             #values_ =''
                             ven.name_of_contact = values_
                             ven.name +=', '+ ven.name_of_contact
                         
                     if key_ =='Addresse:':
                         
                         address_ =  values_
                         ven.formatted_address = self.validateFormat(address_)

                         '''(ven.street,ven.city,ven.zipcode) = self.processAddress(address_)
                         if ven.street!=None:
                             ven.street = self.validateStreet2(ven.street)
                         #ven.formatted_address = address_
                         if ven.city!=None:
                             checkCity = ven.city.split() 
                             if len(checkCity)>0:
                                 if checkCity[0].isdigit():
                                     if len(checkCity[0])==5:
                                         if ven.street!=None:
                                             ven.street+=' '+ ven.zipcode
                                         ven.zipcode = checkCity[0]
                                         ven.city = ven.city.replace(ven.zipcode,'')
                                     else:
                                         ven.city = None
                                         ven.street = None
                                         ven.zipcode= None
                                         ven.formatted_address = ' '.join(checkCity)
                             
                             
                             
                         if ven.zipcode!=None:
                             if len(ven.zipcode)==5:
                                 ven.zipcode = ven.zipcode
                             else:
                                 ven.zipcode = None'''
                         
                         
                         
                     if key_ =='Homepage:':
                         a_ = td[1].find('./a')
                         if a_ !=None:
                             ven.business_website = a_.get('href')
                     mobileCode =['015','016','017','+4915','+4916','+4917']
                     if key_ =='Tel:':
                         values_ = values_.replace('/', '').replace(' ', '').replace('Tel', '')
                         
                         #values_ ='01735465435'
                         
                         
                         for mCode in mobileCode:
                             if values_.startswith(mCode):
                                 ven.mobile_number = self.validatePhone__(self.validatePhone(values_), 'de')
                                 break
                         if ven.mobile_number==None:
                             ven.office_number = self.validatePhone__(self.validatePhone(values_), 'de')
                         
                         '''if values_.startswith('01')| values_.startswith('+0041')| values_.startswith('0041'):
                             ven.mobile_number = self.validatePhone__(self.validatePhone(values_), 'de')
                         else:
                             ven.office_number = self.validatePhone__(self.validatePhone(values_), 'de')'''
                         
                 img_ = leftInfo.find('./div/div[@class="profile_top_right"]/img')
                 if img_!=None:
                     img_ =img_.get('src')
                     img_link.append(img_)
                 rating = leftInfo.xpath('.//section[@id="ratings"]/div')
                 if len(rating)>=2:
                     rating1 = ''.join(rating[0].itertext()).strip().split()[1]
                     rating2 = ''.join(rating[1].itertext()).strip()
                     if len(rating2)>0:
                         rating2 = rating2.split()[0]
                         if rating2.find('/')!=-1:
                             rating2 =  rating2.split('/')[0].replace(',','.')
                     try:
                         float(rating2)
                     except Exception,ex:
                         rating2=None
                     ven.hqdb_nr_reviews = rating1
                     ven.hqdb_review_score = rating2
                 
                 
                 if ven.hqdb_review_score==None:
                     scoreIn = xmlVen[0].xpath('//div[@class="float_box"]//span[@class="txtLight"]/parent::div')
                     if len(scoreIn)>0:
                         core_ = scoreIn[0].text.replace(',','.')
                         try:
                             float(core_)
                         except Exception,ex:
                             core_ =None
                         ven.hqdb_review_score = core_
                 script_ = xmlPages.xpath('./head/script')
                 if ven.formatted_address.strip()=='' and ven.office_number==None and ven.office_number2 ==None and ven.mobile_number ==None and ven.mobile_number2 ==None:
                     return None
                 
                 '''streetTemp = ven.street
                 cityTemp =ven.city
                 zipcodeTemp =ven.zipcode
                 
                 if streetTemp ==None:
                     streetTemp =''
                 if ven.city ==None:
                     cityTemp = ''
                 if ven.zipcode ==None:
                     zipcodeTemp =''
                 address_ = streetTemp+', '+cityTemp+', '+zipcodeTemp
                 address_ = address_.strip().replace(', ,', ',').replace(',,', ',')
                 if address_.startswith(','):
                     address_ =address_[1:len(address_)]
                 if address_.endswith(','):
                     address_ = address_[0:len(address_)-1]
                     
                 if ven.formatted_address!=None:
                     address_ = ven.formatted_address'''
                 
                 #if len(address_.strip())>5:
                 #    (ven.latitude,ven.longitude)  = self.getLatlng(address_,'DE') #script_
                 zipFrom = self.findZipcode(ven.formatted_address)
                 if zipFrom!=None:
                     (ven.latitude,ven.longitude) = self.getLatlng(zipFrom, 'DE')
                     if ven.latitude ==None and ven.longitude==None:
                         Util.log.running_logger.info(ven.formatted_address+' : cannot get GEO code')
                 redirecPhotos= rightInfo.find('./nav/div/ul/li[@class="tabOff tab_foto"]/a')
                 if redirecPhotos!=None:
                     linkPhotos =  redirecPhotos.get('href')
                     if linkPhotos.startswith('/'):
                         linkPhotos = self.__url__+ linkPhotos
                     #time.sleep(1)
                     xpathPhotos =  Util.getRequestsXML(linkPhotos, '//div[@class="portfolio thumbs"]/a')
                     if xpathPhotos!=None:
                         listImg = xpathPhotos.xpath('./a')
                         for __img in listImg:
                             img_link.append(__img.get('data-thumb'))
                 
                 
                 desElement= rightInfo.find('./div/div[@id="cont_about"]')
                 
                 
                 '''
                 pTag = desElement.xpath('//div[@class="overview"]/p')
                 des = ''
                 for desE in pTag :
                     if ''.join(desE.itertext()).find('<xml>')>=0:
                         continue
                     des+=''.join(desE.itertext())
                 h5Tag = desElement.xpath('//div[@class="overview"]/h5')
                 for desE_ in h5Tag:
                     if ''.join(desE_.itertext()).find('<xml>')>=0:
                         continue
                     des += ''.join(desE_.itertext())
                 divTag =desElement.xpath('//div[@class="overview"]/h5')
                 for div_ in divTag:
                     if ''.join(div_.itertext()).find('<xml>')>=0:
                         continue
                     des+= ''.join(div_.itertext())
                 if len(pTag)==0 and len(h5Tag) ==0:
                     if desElement.find('.//div[@class="overview"]')!=None:
                         des =  desElement.find('.//div[@class="overview"]').text
                 ven.description = self.validateDes(des)
                 '''
                 des =''
                 divTag = desElement.xpath('//div[@class="overview"]')
                 for divDes in divTag:
                     des+= ' '.join(divDes.itertext())
                 ven.description = self.validateDes(des)

                 certi = rightInfo.find('.//div/div[@id="cont_certs"]')
                 tablecerti =  certi.find('./table')
                 if tablecerti!=None:
                     certi_ = ''.join(tablecerti.itertext()).replace('Geprüfte Zertifikate:','')
                     ven.accreditations = certi_
                 ven.img_link = img_link
                 ven.country ='de'
                 ven.is_get_by_address = True
                 return ven
Code example #8
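    # Parses a listing element: follows the detail link (skipping duplicates),
    # then extracts description, lat/lng, address, phones, website/social links,
    # reviews and gallery images, and writes the Venue to file.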
    def __VenueParser(self, venueElement):
        try:
            img_link = []
            ad_type = "none"
            if venueElement.find(
                    './/span[@class="label label-success"]') != None:
                ad_type = "featured"
            divs = venueElement.xpath('./div')
            logo_ = divs[0].find('.//img')
            if logo_ != None:
                img_link.append(self.__url__ + logo_.get('src'))
            url__ = venueElement.xpath(
                './div[@class="col-xs-9 col-sm-9 col-md-9 listing-body"]//div[@class="h4 listing-heading"]/a'
            )
            if url__ != None:
                url__ = url__[0].get('href')
                url__ = self.__url__ + url__
                '''
                files = open('D:\\test.txt','a')
                files.write(url__+'\r\n')
                files.close()
                '''
                existing = [x for x in self.listLink if url__ in x]
                if len(existing) <= 0:
                    self.listLink.append(url__)
                    print 'Scraping' + ' : ' + url__

                    #if url__ =='http://www.garagesandrecovery.co.uk/business/dorset-auto-repirs':
                    #    print 'Debug'

                    xmlDoc = Util.getRequestsXML(
                        url__, '//body/div[@class="page-wrapper"]')
                    ven = Venue()
                    ven.name = xmlDoc.find(
                        './/div[@class="page-heading"]//h1').text
                    content = xmlDoc.find(
                        './/div[@class="container page-content"]')
                    if content != None:
                        des_img = content.find('.//div[@class="article-body"]')
                        if des_img != None:
                            div_img = des_img.xpath('.//img/parent::div')
                            if len(div_img) > 0:
                                des_img.remove(div_img[0])
                            des = ' '.join(des_img.itertext())
                            ven.description = des
                        ven.country = self._language
                        ven.scrape_page = url__
                        ven.hqdb_featured_ad_type = ad_type
                        offices_ = content.xpath(
                            './/div[@id="offices"]/parent::div/div[@class="row"]'
                        )
                        div_maps = offices_[0].find(
                            './/div[@class="google-map"]')
                        if div_maps != None:
                            ven.latitude = div_maps.get('data-lat')
                            ven.longitude = div_maps.get('data-lng')
                        info_ = offices_[0].xpath(
                            './div[@class="col-md-5 col-sm-6"]')
                        info_ = info_[0]
                        ul = info_.xpath('./ul')
                        phones = []
                        for u in ul:
                            phone_ = u.xpath('./li/a')
                            for phone in phone_:
                                if phone.get('title') == 'Phone Number':
                                    phone = phone.text.replace(' ', '')
                                    if phone.startswith('0800'):
                                        continue
                                    else:
                                        phones.append(phone)
                        if len(ul) >= 2:
                            ul_2 = ul[0]
                            li__ = ul_2.xpath('./li')

                            address = ''
                            for li in li__:
                                if li.get('class') != 'text-bold':
                                    address = '\n'.join(li.itertext())
                                    addressArr = address.split('\n')
                                    if len(addressArr) >= 3:
                                        ven.street = addressArr[len(addressArr)
                                                                - 3]
                                    ven.city = addressArr[len(addressArr) -
                                                          2].split(',')[0]
                                    ven.zipcode = addressArr[len(addressArr) -
                                                             1]
                                    if ven.zipcode != None:
                                        results = re.search(self.ukReg,
                                                            ven.zipcode,
                                                            flags=0)
                                        if ven.zipcode == 'Rotherham, South Yorkshire':
                                            ven.zipcode = ''
                                            ven.street = None
                                        if results == None:
                                            ven.zipcode = None

                        (ven.office_number, ven.office_number2,
                         ven.mobile_number,
                         ven.mobile_number2) = self.processPhones(phones)

                        # right sidebar : //div[@class="col-md-3 page-sidebar"]/div
                        rightSidebar = xmlDoc.xpath(
                            './/div[@class="col-md-3 page-sidebar"]/div[@class="section"]'
                        )
                        for right in rightSidebar:

                            website = right.xpath(
                                './a[contains(text(),"Visit Our Website")]')
                            if len(website) > 0:
                                website = website[0].get('href')
                                if website.find('facebook.com') == -1:
                                    ven.business_website = website
                                else:
                                    ven.facebook = website
                            reviews = right.xpath('./p/strong')
                            if len(reviews) >= 3:
                                ven.hqdb_nr_reviews = reviews[2].text
                                ven.hqdb_review_score = reviews[1].text
                            follows = right.xpath('./ul/li/a')
                            for foll in follows:
                                follow_link = foll.get('href')
                                if follow_link.find('facebook.com') != -1:
                                    if ven.facebook == None:
                                        ven.facebook = self.addHTTP(
                                            follow_link)
                                if follow_link.find('twitter.com') != -1:
                                    if ven.twitter == None:
                                        ven.twitter = self.addHTTP(follow_link)

                        img_find = xmlDoc.xpath(
                            '//div[@id="galleries"]/parent::div/div[@class="carousel slide equal-height"]//img'
                        )
                        for ig in img_find:
                            img_link.append(self.__url__ + ig.get('src'))

                        if len(img_link) > 0:
                            ven.img_link = img_link
                        self.index += 1
                        ven.writeToFile(self.folder, self.index, ven.name,
                                        False)
                        #img_link : //div[@id="galleries"]/parent::div/div[@class="carousel slide equal-height"]//img

                else:
                    print '\nduplicate'.upper()
                    print '*' * (len(url__) + 4)
                    print '*' + ' ' * (len(url__) + 2) + '*'
                    print '* ' + url__ + ' *'
                    print '*' + ' ' * (len(url__) + 2) + '*'
                    print '*' * (len(url__) + 4) + '\n'

        except Exception, ex:
            print ex
Code example #9
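    # Fetches a locksmith detail page: reads address, phone, website/social
    # links, opening hours and rating from the overview block, builds the
    # description, geocodes the address and writes the Venue to file.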
    def __VenueParser(self, url):
        try:
            print 'Scraping: ' + url
            xmlDoc = Util.getRequestsXML(url, '//div[@id="main"]')
            if xmlDoc != None:
                ven = Venue()
                ven.scrape_page = url
                ven.country = self._language
                ven.name = xmlDoc.find('.//h1').text
                overview = xmlDoc.find('.//div[@class="overview"]')
                option = overview.xpath('./div[@class="options row"]/div')
                for opt in option:
                    div_ = opt.xpath('./div')
                    for div__ in div_:
                        strong = div__.find('./strong')
                        if strong != None:
                            if strong.text == 'Adresse:':
                                street = div__.find(
                                    './span[@itemprop="streetAddress"]')
                                if street != None:
                                    ven.street = street.text
                                zipcode = div__.find(
                                    './span[@itemprop="postalCode"]')
                                if zipcode != None:
                                    ven.zipcode = zipcode.text
                                city = div__.find(
                                    './span[@itemprop="addressLocality"]')
                                if city != None:
                                    ven.city = city.text
                            if strong.text == 'Téléphone:':
                                phone = ''.join(div__.itertext()).replace(
                                    ' ',
                                    '').replace('.',
                                                '').replace('Téléphone:', '')
                                if phone.startswith(('06', '07', '6', '7')):
                                    ven.mobile_number = self.validatePhone__(
                                        phone)
                                else:
                                    ven.office_number = self.validatePhone__(
                                        phone)

                            if strong.text == 'Site Web:':
                                website = ''.join(div__.itertext()).replace(
                                    'Site Web:', '')
                                if website.find('facebook.com') != -1:
                                    ven.facebook = self.addHTTP(website)
                                    continue
                                if website.find('twitter.com') != -1:
                                    ven.twitter = self.addHTTP(website)
                                    continue
                                ven.business_website = self.addHTTP(website)
                            if strong.text == 'Horaires:':
                                openning = ''.join(div__.itertext()).replace(
                                    'Horaires:', '')
                                for format in self.openFormat:
                                    if openning.strip() == format:
                                        ven.opening_hours_raw = 'Lundi au Dimanche: 0h00 - 24h00'
                                if ven.opening_hours_raw == None:
                                    ven.opening_hours_raw = openning
                            if strong.text == 'Votez pour ce serrurier:':
                                score = div__.find(
                                    './span[@class="thevotescount"]/span[@itemprop="ratingValue"]'
                                )
                                if score != None:
                                    ven.hqdb_review_score = score.text
                descElement = overview.find('./div[@class="contenu"]')
                if descElement != None:
                    ven.description = ' | '.join(descElement.itertext())
                    if ven.description != None:
                        ven.description = ven.description.strip()
                        if ven.description.startswith('|'):
                            ven.description = ven.description[
                                1:len(ven.description)]
                        if ven.description.endswith("|"):
                            ven.description = ven.description[
                                0:len(ven.description) - 1]
                        ven.description = ven.description.replace(
                            '| \n |', '|')
                        if len(ven.description.split()) < 3:
                            ven.description = None
                address = []
                if ven.street != None and len(ven.street.strip()) > 0:
                    address.append(ven.street)
                if ven.city != None and len(ven.city.strip()) > 0:
                    address.append(ven.city)
                if ven.zipcode != None and len(ven.zipcode.strip()) > 0:
                    address.append(ven.zipcode)
                address_ = ', '.join(address)
                (ven.latitude, ven.longitude) = self.getLatlng(address_)
                ven.is_get_by_address = True
                self.index += 1
                ven.writeToFile(self.folder, self.index, ven.name, False)
        except Exception, ex:
            print '[ERROR] ' + url
            print ex
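
The mobile/office split above keys off the French 06/07 mobile prefixes once spaces and dots are stripped from the 'Téléphone:' value. A minimal standalone sketch of that classification, under the same prefix assumption (classify_fr_phone is a hypothetical helper, not part of the scraper):

import re

def classify_fr_phone(raw):
    # Keep only digits and '+', mirroring the replace() chain in the parser.
    digits = re.sub(r'[^0-9+]', '', raw)
    if digits.startswith('+33'):
        digits = '0' + digits[3:]  # e.g. +33612345678 -> 0612345678
    kind = 'mobile' if digits[:2] in ('06', '07') else 'office'
    return kind, digits

# classify_fr_phone('06 12 34 56 78')  -> ('mobile', '0612345678')
# classify_fr_phone('01.23.45.67.89')  -> ('office', '0123456789')
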
Code example #10
    def __VenueParser(self, jsonItems, hash):
        url = self.__url__ + 'profile/' + jsonItems.get(
            'serviceSlug') + '/' + jsonItems.get(
                'companySlug') + '-' + jsonItems.get('id') + '?hash=' + hash
        url__ = self.__url__ + 'profile/' + jsonItems.get(
            'serviceSlug') + '/' + jsonItems.get(
                'companySlug') + '-' + jsonItems.get('id')
        id_ = str(jsonItems.get('id'))
        existing = [x for x in self.list_url if url__ in x]
        if len(existing) > 0:
            print 'This venue already exists in the list'
            return None
        if len(existing) <= 0:
            print 'Scraping: ' + url
            ven = Venue()
            services_v = []
            ven.category = jsonItems.get('restriction').get('name')
            ven.adid = str(jsonItems.get('id'))
            ven.name = jsonItems.get('companyName')

            ven.latitude = jsonItems.get('coordinates').get('lat')
            ven.longitude = jsonItems.get('coordinates').get('long')
            ven.venue_images = jsonItems.get('logo')
            points_ = jsonItems.get('satisfaction_rating')
            if str(points_).find('.') >= 0:
                ven.hqdb_review_score = str(round(points_, 1))
            else:
                ven.hqdb_review_score = str(points_)
            #ven.img_link = [url]
            #ven.description = jsonItems.get('salesPitch')
            ven.country = 'gb'
            ven.scrape_page = url
            #ven.pricelist_link = [url]
            self.list_url.append(url__)
            #url ='https://www.unbiased.co.uk/profile/financial-adviser/stiles-company-financial-services-petersfield-ltd-511274'
            xmlRequest = Util.getRequestsXML(
                url, '//div[@class="container-fluid"]')
            if xmlRequest != None:
                stringAddress = xmlRequest.find(
                    './/span[@class="profile-meta__address"]').text.replace(
                        ',,', ',')

                # debug fixture kept for reference; the scraped address above is used as-is
                #stringAddress = '1st and 2nd Floor Offices, 446 - 452 High street, Kingswinford, West Midlands,'

                ven.formatted_address = self.removeNameFromAdd(
                    ven.name.strip(),
                    stringAddress).replace('PO BOX',
                                           '').replace('PO Box', '').replace(
                                               'Po Box', '')
                zipArr = stringAddress.split(',')
                ven.zipcode = zipArr[len(zipArr) - 1]
                ex_ = re.search(
                    '([Gg][Ii][Rr]0[Aa]{2})|((([A-Za-z][0-9]{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9]?[A-Za-z]))))\s?[0-9][A-Za-z]{2})',
                    stringAddress,
                    flags=0)

                if ex_ != None:
                    zip_c = ex_.group(0)
                    #ven.zipcode = zip_c
                    #ven.formatted_address = ven.formatted_address.replace(ven.zipcode,'').strip()
                    if ven.zipcode != zip_c:
                        poZip_c = stringAddress.find(zip_c)
                        poZipcode = stringAddress.find(ven.zipcode)
                        if len(ven.zipcode.strip()) > 1:
                            if poZip_c > poZipcode:
                                ven.zipcode = zip_c
                if ex_ == None:
                    if ven.zipcode != None:
                        ven.zipcode = None
                if ven.formatted_address.endswith(','):
                    # drop only the trailing comma
                    ven.formatted_address = ven.formatted_address[:-1]
                phoneLabel = xmlRequest.xpath(
                    './/span[@class="phone-label"]/parent::a')

                if len(phoneLabel) > 0:
                    for phone_ in phoneLabel:
                        phone = phone_.get('data-phone').replace('\n',
                                                                 '').replace(
                                                                     ' ', '')
                        if phone.find('Shownumber') <= 0:
                            phone = self.validatePhone(phone)
                            for rePhone in self.listPhoneremove:
                                if phone == rePhone:
                                    phone = None
                            if phone != None:
                                if phone.startswith('07'):
                                    ven.mobile_number = phone
                                else:
                                    ven.office_number = phone
                                break
                services = xmlRequest.find(
                    './/ul[@class="advice-area__level-one"]')
                if services != None:
                    list_ser = services.xpath('./li')
                    for ser_name in list_ser:

                        # feedback 3 : add category service

                        cate = ser_name.find('./span').text.strip()
                        list_services = ser_name.xpath('./ul/li')
                        for service__ in list_services:
                            service = Service()
                            service.service_category = cate + ' advice'
                            service.service = service__.text + ' advice'
                            services_v.append(service)

                ven.services = services_v

                # append accreditations feedback 3
                certi = []
                cer = xmlRequest.xpath(
                    './/div[@class="profile-team__skill-item collapsed"]')
                for c in cer:
                    inCerti = [x_ for x_ in certi if c.text in x_]
                    if len(inCerti) <= 0:
                        certi.append(c.text)

                ven.accreditations = ', '.join(certi)

                # add follow :  fb, twi, website feedback 3
                follow = xmlRequest.xpath(
                    '//div[@class="profile__follow"]/ul/li')
                for fol in follow:
                    values_fol = fol.get('class')
                    if values_fol == 'icon-soc-tw':
                        ven.twitter = fol.find('./a').get('href')
                    if values_fol == 'icon-soc-www':
                        ven.business_website = fol.find('./a').get('href')
                    if values_fol == 'icon-soc-fb':
                        ven.facebook = fol.find('./a').get('href')

                # description feedback 3

                des_1 = xmlRequest.find(
                    './/div[@class="profile__text-block "]/p')
                if des_1 != None:
                    ven.description = ''.join(des_1.itertext()).replace(
                        '.\n', ' | ')
                des_2 = xmlRequest.find(
                    './/div[@class="profile__text-block spacing-bottom-xs-0"]/p'
                )
                if des_2 != None:
                    services_text = ' -Our services: ' + ''.join(
                        des_2.itertext()).replace('.\n', ' | ')
                    # append to the existing description, or start one if des_1 was missing
                    if ven.description != None:
                        ven.description += services_text
                    else:
                        ven.description = services_text
                if ven.description != None:
                    if ven.description.endswith(' | '):
                        # strip the full ' | ' separator (3 characters)
                        ven.description = ven.description[:-3]
                return ven
        else:
            return None
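
The postcode handling in the example above relies on the standard UK postcode regex. A minimal sketch of using the same pattern to pull a postcode out of a comma-separated address string (extract_postcode and the sample address are illustrative, not part of the scraper):

import re

# Same UK postcode pattern as used in the parser above.
UK_POSTCODE = re.compile(
    r'([Gg][Ii][Rr]0[Aa]{2})|((([A-Za-z][0-9]{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})'
    r'|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9]?[A-Za-z]))))\s?[0-9][A-Za-z]{2})')

def extract_postcode(address):
    # Return the first postcode-looking match, or None when there is none.
    match = UK_POSTCODE.search(address)
    return match.group(0) if match else None

# extract_postcode('446-452 High Street, Kingswinford, DY6 8AW') -> 'DY6 8AW'
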
Code example #11
    def __VenueParser_2(self, element, cate, scrape_pages):
        subB = element.find('./div/a')
        link = subB.get('href')

        try:
            existing = [x for x in self.listLink if link in x]
            if len(existing) <= 0:
                print 'Scraping featured: ' + link

                self.listLink.append(link)
                ven = Venue()
                ven.country = self._language
                ven.hqdb_featured_ad_type = 'featured'
                ven.category = cate
                #ven.subcategory = cate
                ven.scrape_page = scrape_pages
                subDiv = element.find('./div[@class="resultado nada"]')
                div = subDiv.find('./a/div')
                ven.name = div.find('./h2').text
                #ven.name = Validator.RevalidName(ven.name)
                ven.name = self.replaceName(ven.name)
                address = div.xpath('./p[@itemprop="address"]/span')
                if address != None:
                    for span in address:
                        itemprop = span.get('itemprop')
                        if itemprop == 'street-address':
                            ven.street = span.text
                        if itemprop == 'postal-code':
                            ven.zipcode = span.text
                        if itemprop == 'locality':
                            ven.city = span.text  #.split(',')[0]
                            if ven.city == '' or ven.city == None:
                                continue
                            find_slash = ven.city.find('/')
                            find_comma = ven.city.find(',')
                            if find_slash != -1 and find_comma != -1:
                                ven.city = ven.city.split('/')[0]
                                if ven.city.find(',') != -1:
                                    ven.city = ven.city.split(',')[1]
                            ven.city = ven.city.split(',')[0]
                            ven.city = ven.city.split('/')[0]
                    if ven.street != None:
                        ven.street = self.validateStreet(ven.street)
                    if ven.city != None:
                        re_City = re.search(
                            '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                            ven.city,
                            flags=0)
                        if re_City != None:
                            ven.city = ven.city.replace(re_City.group(0), '')
                    if ven.zipcode != None:
                        ven.zipcode = ven.zipcode.strip()
                        if len(ven.zipcode) >= 5:
                            re_zipcode = re.search(
                                '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                                ven.zipcode,
                                flags=0)
                            if re_zipcode != None:
                                if re_zipcode.group(0) != ven.zipcode:
                                    ven.zipcode = None
                            else:
                                ven.zipcode = None
                        else:
                            ven.zipcode = '0' + ven.zipcode
                            rezipcode = re.search(
                                '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                                ven.zipcode,
                                flags=0)
                            if rezipcode == None:
                                ven.zipcode = None
                            else:
                                if ven.zipcode != rezipcode.group(0):
                                    ven.zipcode = None

                try:
                    if int(ven.zipcode) > 52080 or int(ven.zipcode) < 1000:
                        ven.zipcode = None
                except Exception, ex:
                    ven.zipcode = None

                description = div.find('./p[@class="descripcion"]').text
                if description != None:
                    ven.description = description
                imgs = subDiv.xpath('./a/figure/img')
                if len(imgs) > 0:
                    imgs_ = []
                    for im in imgs:
                        imgs_.append(im.get('src'))
                    ven.img_link = imgs_
                footer = subDiv.xpath('./div[@class="iconos"]/ul/li')
                for fo in footer:
                    text__ = fo.find('./a').text
                    if text__ == 'Mandar mail':
                        ven.business_website = fo.find('./a').get('href')
                    if text__ == 'Ver teléfono':
                        phone = fo.find('./span[@class="telefono"]').text
                        if phone.startswith('+346') or phone.startswith(
                                '+347') or phone.startswith(
                                    '7') or phone.startswith('6'):
                            ven.mobile_number = self.validatePhone__(phone)
                        else:
                            ven.office_number = self.validatePhone__(phone)
                #ven.is_get_by_address =True
                ven.writeToFile(self.folder, self.addIndex(), ven.name, False)
            else: