예제 #1
0
    def __VenueParser(self, xmlE, index):
        """Build a Venue from one search-result element and write it to file.

        Reads id, thumbnail, detail-page link, name, contact line and short
        description from ``xmlE``, fetches the detail page through
        ``Util.getRequestsXML`` and adds street, portfolio images, description,
        accreditations and services before calling ``ven.writeToFile``.

        Args:
            xmlE: element of one search result; only ``get``, ``find`` are
                used on it (presumably an lxml element -- confirm at caller).
            index: not used by this method; kept for the callers' signature.

        Returns:
            None in every case. Nothing is written when the result has no
            detail link, when its URL is already in ``self.list_url``, or when
            ``Util.getRequestsXML`` returns None.
        """

        def _text_at(node, path):
            # Text of the first match of `path` under `node`, or None when
            # either the node or the match is missing.
            found = node.find(path) if node is not None else None
            return found.text if found is not None else None

        ven = Venue()
        ven.adid = xmlE.get('id')
        ven.category = 'architecural technologist'

        photos = xmlE.find(
            './div[@class="search_result_photo"]/div[@class="photo"]/a')
        href = photos.get('href') if photos is not None else None
        if href is None:
            # Without the link there is no detail page to scrape or to
            # de-duplicate on, so this result is skipped.
            return
        thumb = photos.find('./img')
        if thumb is not None and thumb.get('src') is not None:
            ven.venue_images = self.__url__ + thumb.get('src')
        ven.scrape_page = self.__url__ + href

        # NOTE(review): this is a substring test against every stored URL, not
        # an equality test, so a URL that is a prefix of an already stored one
        # is also treated as seen -- confirm that is intended.
        if any(ven.scrape_page in seen for seen in self.list_url):
            print('This venues exist in list')
            return
        self.list_url.append(ven.scrape_page)

        details_ = xmlE.find('.//div[@class="search_result_details"]')
        ven.name = _text_at(details_, './div[@class="title"]/h3/a')
        contacts_ = _text_at(details_, './div[@class="contact"]')
        ven.description = _text_at(details_, './div[@class="desc"]')

        # The contact line is split on commas; the last piece is taken as the
        # zip code and the one before it as the city.
        contact__ = contacts_.split(',') if contacts_ else []
        if len(contact__) >= 2:
            ven.zipcode = self.check_zip(contact__[-1])
            ven.city = contact__[-2]

        # ---- detail page ----
        xmlInfo = Util.getRequestsXML(
            ven.scrape_page, '//div[@class="architect_header"]/parent::div')
        if xmlInfo is None:
            return

        addressInfo = xmlInfo.find(
            './/div[@class="architect_header"]/div[@class="architect_header_info"]'
        )
        if addressInfo is not None:
            # Drop the heading so only the address text is joined.
            h2 = addressInfo.find('./h2')
            if h2 is not None:
                addressInfo.remove(h2)
            address__ = ' '.join(addressInfo.itertext())

            # Fall back to the third-from-last comma part when the search
            # result gave no usable city.
            if ven.city is None or len(ven.city) < 2:
                parts = address__.split(',')
                if len(parts) >= 3:
                    ven.city = parts[-3]

            city = ven.city.strip() if ven.city else ''
            cityPos = address__.find(city) if city else -1
            # find() gives -1 when the city is absent and 0 when the address
            # starts with it; neither leaves a street part to cut out.
            if cityPos > 0:
                street = address__[0:cityPos - 1]
                if street.endswith(','):
                    street = street[:-1]
                    if 'PO BOX' in street.upper():
                        street = None
                    ven.street = street

        # Portfolio images, made absolute with the site base URL.
        img_info = xmlInfo.find('.//div[@class="architect_portfolio"]')
        photos_ = []
        if img_info is not None:
            photos_ = img_info.xpath(
                './div[@class="architect_portfolio_photo"]//img')
        ven.img_link = [
            self.__url__ + photo.get('src') for photo in photos_
            if photo.get('src') is not None
        ]

        # The statement on the detail page replaces the search-result text.
        statement = xmlInfo.find('.//div[@class="architect_info_statement"]')
        if statement is not None:
            ven.description = ' '.join(statement.itertext())
        elif ven.description is None:
            ven.description = ''

        services = xmlInfo.xpath('//div[@class="architect_info"]/ul')
        desP = xmlInfo.xpath('//div[@class="architect_info"]/p')
        affi = xmlInfo.xpath('//div[@class="architect_info"]/h3')

        # When an "Affiliations" heading exists, the last paragraph of the
        # info block is stored as the accreditations.
        for aff in affi:
            if (aff.text or '').strip() == 'Affiliations' and desP:
                ven.accreditations = desP[-1].text

        if len(desP) >= 2:
            p1 = desP[0].text
            p2 = desP[1].text
            if p1 is not None:
                ven.description += ' ' + p1
            if p2 is not None and p2 != 'None':
                ven.description += ' ' + p2 + ': '

        sers = []
        if len(services) >= 3:
            # Third list: appended under a "Specialist Experience" label.
            des_2 = ''
            listDes_2 = services[2].xpath('./li')
            if len(listDes_2) > 0:
                des_2 = '. Specialist Experience: ' + ''.join(
                    (item.text or '') + ', ' for item in listDes_2)
                des_2 = des_2.strip()
                if des_2.endswith(','):
                    des_2 = des_2[:-1]

            # First list: appended to the description as a comma list.
            listDes = services[0].xpath('./li')
            if len(listDes) > 0:
                desSectors = ''.join(
                    (item.text or '') + ', ' for item in listDes).strip()
                if desSectors.endswith(','):
                    desSectors = desSectors[:-1]
                ven.description = (
                    ven.description + ' ' + desSectors + '.' + des_2)
                ven.description = ven.description.replace(
                    ', ,', ', ').replace('..', '.')

            # Second list: one Service object per item.
            for ser in services[1].xpath('./li'):
                serv = Service()
                serv.service = ser.text
                sers.append(serv)

        ven.services = sers
        ven.pricelist_link = [ven.scrape_page]
        ven.country = 'gb'

        indexc = self.addIndex()
        try:
            print('Writing index: ' + str(indexc))
            ven.writeToFile(self.folder, indexc, ven.name.replace(':', ''),
                            False)
        except Exception as ex:
            # Best effort: a failed write is reported and the venue dropped.
            print(ex)
            return
    def __VenueParser(self, jsonItems, hash):
        """Build a Venue from one JSON search item plus its profile page.

        Category, id, name, coordinates, logo and rating come from
        ``jsonItems``. Address, zip code, phone, services, accreditations,
        social links and description are scraped from the profile page
        fetched with ``Util.getRequestsXML``.

        Args:
            jsonItems: item exposing ``get``; the keys read are
                ``serviceSlug``, ``companySlug``, ``id``, ``restriction``,
                ``companyName``, ``coordinates``, ``logo`` and
                ``satisfaction_rating``.
            hash: string appended to the profile URL as the ``hash`` query
                value (shadows the builtin; the name is kept for callers).

        Returns:
            The populated Venue, or None when the profile URL is already in
            ``self.list_url`` or ``Util.getRequestsXML`` returns None.
        """
        # url__ (without the hash) is the de-duplication key; url is fetched.
        url__ = (self.__url__ + 'profile/' + jsonItems.get('serviceSlug') +
                 '/' + jsonItems.get('companySlug') + '-' +
                 jsonItems.get('id'))
        url = url__ + '?hash=' + hash

        # NOTE(review): substring test against every stored URL, not equality.
        if any(url__ in seen for seen in self.list_url):
            print('this venues existed in list')
            return None

        print('Scrapping: ' + url)
        ven = Venue()
        ven.category = jsonItems.get('restriction').get('name')
        ven.adid = str(jsonItems.get('id'))
        ven.name = jsonItems.get('companyName')
        ven.latitude = jsonItems.get('coordinates').get('lat')
        ven.longitude = jsonItems.get('coordinates').get('long')
        ven.venue_images = jsonItems.get('logo')

        # Ratings with a decimal point are rounded to one decimal place.
        points_ = jsonItems.get('satisfaction_rating')
        if '.' in str(points_):
            ven.hqdb_review_score = str(round(points_, 1))
        else:
            ven.hqdb_review_score = str(points_)

        ven.country = 'gb'
        ven.scrape_page = url
        self.list_url.append(url__)

        xmlRequest = Util.getRequestsXML(
            url, '//div[@class="container-fluid"]')
        if xmlRequest is None:
            return None

        # ---- address and zip code ----
        # The scraped address is used as-is; a hard-coded sample address that
        # used to overwrite it for every venue has been removed.
        stringAddress = xmlRequest.find(
            './/span[@class="profile-meta__address"]').text.replace(',,', ',')

        ven.formatted_address = self.removeNameFromAdd(
            ven.name.strip(), stringAddress).replace('PO BOX', '').replace(
                'PO Box', '').replace('Po Box', '')

        # First guess: the text after the last comma. It is replaced by the
        # regex match when that match sits later in the address, and cleared
        # when the address holds no postcode-shaped text at all.
        ven.zipcode = stringAddress.split(',')[-1]
        ex_ = re.search(
            r'([Gg][Ii][Rr]0[Aa]{2})|((([A-Za-z][0-9]{1,2})|(([A-Za-z][A-Ha-hJ-Yj-y][0-9]{1,2})|(([A-Za-z][0-9][A-Za-z])|([A-Za-z][A-Ha-hJ-Yj-y][0-9]?[A-Za-z]))))\s?[0-9][A-Za-z]{2})',
            stringAddress,
            flags=0)
        if ex_ is None:
            ven.zipcode = None
        else:
            zip_c = ex_.group(0)
            if ven.zipcode != zip_c and len(ven.zipcode.strip()) > 1:
                if stringAddress.find(zip_c) > stringAddress.find(
                        ven.zipcode):
                    ven.zipcode = zip_c

        # Drop exactly the one trailing comma.
        if ven.formatted_address.endswith(','):
            ven.formatted_address = ven.formatted_address[:-1]

        # ---- phone: first usable number wins ----
        phoneLabel = xmlRequest.xpath(
            './/span[@class="phone-label"]/parent::a')
        for phone_ in phoneLabel:
            phone = phone_.get('data-phone').replace('\n', '').replace(
                ' ', '')
            # NOTE(review): "<= 0" lets through values that START with
            # 'Shownumber' as well as values without it -- confirm intent.
            if phone.find('Shownumber') <= 0:
                phone = self.validatePhone(phone)
                if phone in self.listPhoneremove:
                    phone = None
                if phone is not None:
                    if phone.startswith('07'):
                        ven.mobile_number = phone
                    else:
                        ven.office_number = phone
                    break

        # ---- services, grouped by their top-level advice area ----
        services_v = []
        services = xmlRequest.find('.//ul[@class="advice-area__level-one"]')
        if services is not None:
            for ser_name in services.xpath('./li'):
                cate = ser_name.find('./span').text.strip()
                for service__ in ser_name.xpath('./ul/li'):
                    service = Service()
                    service.service_category = cate + ' advice'
                    service.service = service__.text + ' advice'
                    services_v.append(service)
        ven.services = services_v

        # ---- accreditations: each text kept once, in page order ----
        certi = []
        for c in xmlRequest.xpath(
                './/div[@class="profile-team__skill-item collapsed"]'):
            # Entries without text cannot be matched or joined; skip them.
            if c.text is None:
                continue
            if not any(c.text in known for known in certi):
                certi.append(c.text)
        ven.accreditations = ', '.join(certi)

        # ---- social / website links ----
        for fol in xmlRequest.xpath('//div[@class="profile__follow"]/ul/li'):
            values_fol = fol.get('class')
            if values_fol == 'icon-soc-tw':
                ven.twitter = fol.find('./a').get('href')
            if values_fol == 'icon-soc-www':
                ven.business_website = fol.find('./a').get('href')
            if values_fol == 'icon-soc-fb':
                ven.facebook = fol.find('./a').get('href')

        # ---- description: up to two text blocks joined ----
        des_1 = xmlRequest.find('.//div[@class="profile__text-block "]/p')
        if des_1 is not None:
            ven.description = ''.join(des_1.itertext()).replace(
                '.\n', ' | ')
        des_2 = xmlRequest.find(
            './/div[@class="profile__text-block spacing-bottom-xs-0"]/p')
        if des_2 is not None:
            # The first block may be missing, so start from '' if needed.
            ven.description = (ven.description or '') + (
                ' -Our services: ' + ''.join(des_2.itertext()).replace(
                    '.\n', ' | '))
        # Remove the whole trailing ' | ' separator (three characters).
        if ven.description is not None and ven.description.endswith(' | '):
            ven.description = ven.description[:-3]
        return ven