コード例 #1
0
    def __VenueParser_2(self, element):
        """Parse one listing row into a non-featured ``Venue`` and store it.

        The text of the row's first ``<td>`` is split on newlines and read
        from the bottom up: an optional trailing line containing ``Fax`` is
        dropped, then come the phone line, the zip/city line, and a street
        line and/or a contact-name line (told apart with
        ``self.isContactName``).  Every line above those is joined with
        spaces to form the venue name.

        If the cell contains a ``<div>``, the ``href`` of its ``<a>`` (when
        there is one) becomes the business website and the ``<div>`` is
        removed before the text is collected.

        On success the venue is stored in ``self.venues`` under its
        ``scrape_page`` key (``self._url_lstVenues`` + ``#`` + the running
        ``self.urlmarks`` counter).  Nothing is returned.  The row is dropped
        silently when ``self.validateZip`` returns ``None`` for a parsed zip
        code; any exception raised while parsing (missing ``<td>``, too few
        text lines, ...) is printed and the row is skipped.
        """
        try:
            self.urlmarks += 1
            print('[COUNT]: ' + str(self.urlmarks))
            ven = Venue()
            ven.scrape_page = self._url_lstVenues + '#' + str(self.urlmarks)

            td = element.find('./td')
            div = td.find('./div')
            # Compare with None explicitly: an element's truth value reflects
            # whether it has children, not whether it was found.
            if div is not None:
                link = div.find('./a')
                # A <div> without a link must not cost us the whole venue.
                if link is not None:
                    ven.business_website = link.get('href')
                # Drop the <div> so its text does not leak into the lines below.
                td.remove(div)

            lines = ''.join(td.itertext()).split('\n')
            if lines[-1].find('Fax') != -1:
                lines = lines[:-1]

            phoneNumber = lines[-1].strip().replace('Tel.', '').replace(' ', '')
            local = lines[-2]
            street = lines[-3]

            # The explicit ``== True`` / ``== False`` comparisons are kept on
            # purpose: isContactName's return type is not visible from here.
            if self.isContactName(street) == True:
                # No street line: the line above zip/city is the contact.
                contactName = street
                street = ''
                name_end = -3
            else:
                contactName = lines[-4]
                if self.isContactName(contactName) == False:
                    # No contact line either; that line belongs to the name.
                    contactName = ''
                    name_end = -3
                else:
                    name_end = -4
            name = ' '.join(lines[0:name_end])

            (ven.office_number, ven.office_number2, ven.mobile_number,
             ven.mobile_number2) = self.processPhone([phoneNumber])
            (ven.city, ven.zipcode) = self.processZipCity(local)
            if ven.zipcode is not None and self.validateZip(ven.zipcode) is None:
                return

            ven.street = street
            ven.name_of_contact = contactName
            ven.name = name
            ven.country = self._language
            ven.hqdb_featured_ad_type = 'none'
            self.venues[ven.scrape_page] = ven
        except Exception as ex:
            # Best-effort scraping: report the problem and move on to the next row.
            print(ex)
コード例 #2
0
 def __VenueParser(self, element):
     """Parse one featured listing row, scrape its detail page and write the venue.

     The detail URL is cut out of the row's ``MM_openBrWindow('...','','``
     ``onclick`` handler and fetched through ``Util.getRequestsXML``.  From
     the detail page this reads:

     * e-mail / website from the right-aligned link block,
     * opening hours, translating German day names to Italian and joining
       the days with ``' | '`` (a day with three or more empty slots is
       left out),
     * name, contact, street, zip/city and phone from the newline-separated
       basic-info cell, read from the bottom up (an optional trailing line
       containing ``Fax`` is dropped first),
     * services, via ``self.__ServicesParser``.

     The venue is written with ``ven.writeToFile`` using the incremented
     ``self.index``.  Nothing is returned.  The venue is dropped silently
     when ``self.validateZip`` returns ``None`` for a parsed zip code; any
     exception raised while parsing is printed and the row is skipped.
     """
     try:
         self.urlmarks += 1
         print('[COUNT]: ' + str(self.urlmarks))

         onclick = element.xpath('./td//a/@onclick')[0]
         opener = "MM_openBrWindow('"
         detailLink = onclick[onclick.find(opener) + len(opener):
                              onclick.find("','','")]
         detailLink = self.__url__ + '/' + detailLink
         xmlDoc = Util.getRequestsXML(detailLink,
                                      '//table[@cellspacing="3"]')
         xmlDoc = xmlDoc[0]

         ven = Venue()
         ven.hqdb_featured_ad_type = 'featured'
         ven.country = self._language
         ven.scrape_page = detailLink

         tables = xmlDoc.xpath('./tr/td/table')
         detail_1 = tables[0]
         detail_2 = tables[2]

         basicInfo = detail_1.find('./tr/td/table/tr/td[@class="text"]')
         # NOTE(review): the expression starts with '//', so it is matched
         # against the whole document rather than only below the parent --
         # confirm that this is intended.
         email_website = basicInfo.getparent().xpath(
             '//table//div[@align="right"]/a')
         for aTag in email_website:
             href = aTag.get('href')
             if href is None:
                 # An anchor without href carries neither e-mail nor website.
                 continue
             if 'mailto' in href:
                 ven.business_email = href.replace('mailto:', '')
             elif 'http' in href:
                 ven.business_website = href

         # The third row of the first detail table holds the opening-hours table.
         hours_table = detail_1.xpath('./tr')[2].find('./td/table')
         dayofweek = {
             'Montag': 'Lunedì',
             'Dienstag': 'Martedì',
             'Mittwoch': 'Mercoledì',
             'Donnerstag': 'Giovedì',
             'Freitag': 'Venerdì',
             'Samstag': 'Sabato',
             'Sonntag': 'Domenica'
         }
         opening_hours = []
         for row in hours_table.xpath('./tr'):
             tds = row.xpath('./td')
             # Only rows whose first cell is a known German day name count.
             if not tds or tds[0].text not in dayofweek:
                 continue
             record = ''
             empty_slots = 0
             for td in tds:
                 text = td.text
                 if text in dayofweek:
                     record += dayofweek[text] + ": "
                 elif text is None or text.strip() == '-':
                     # '-' marks a slot without hours; a cell with no text
                     # at all is treated the same way instead of raising
                     # and losing the whole venue.
                     empty_slots += 1
                 else:
                     record += text.replace('.', ':') + ", "
             record = record.strip()
             if record.endswith(','):
                 record = record[0:-1]
             if empty_slots < 3:
                 opening_hours.append(record)
         ven.opening_hours_raw = ' | '.join(opening_hours)

         basicInfo_ = ''.join(basicInfo.itertext()).split('\n')
         if basicInfo_[-1].find('Fax') != -1:
             basicInfo_ = basicInfo_[0:-1]
         phonenumber = basicInfo_[-1].strip().replace('Tel.',
                                                      '').replace(' ', '')
         zip_ci = basicInfo_[-2]
         street = basicInfo_[-3]
         contactName = basicInfo_[-4]
         name = ' '.join(basicInfo_[0:-4])

         (ven.office_number, ven.office_number2, ven.mobile_number,
          ven.mobile_number2) = self.processPhone([phonenumber])
         (ven.city, ven.zipcode) = self.processZipCity(zip_ci)
         if ven.zipcode is not None and self.validateZip(ven.zipcode) is None:
             return

         ven.street = street
         ven.name_of_contact = contactName
         ven.name = name

         services = detail_2.xpath('./tr[@valign="top"]/td')
         if len(services) > 0:
             ven.services = self.__ServicesParser(services[0].text)

         self.index += 1
         ven.writeToFile(self.folder, self.index, ven.name, False)
     except Exception as ex:
         # Best-effort scraping: report the problem and move on to the next row.
         print(ex)
コード例 #3
0
 def __VenueParser(self, hqdb_type, linkItems, subcate, cate):
     """Scrape one venue profile page and return the populated ``Venue``.

     Args:
         hqdb_type: value stored as ``ven.hqdb_featured_ad_type``.
         linkItems: URL of the profile page; also stored as
             ``ven.scrape_page`` and used to derive the venue name via
             ``self.getNamefromUrl``.
         subcate: not used by this method.
         cate: not used by this method; the category comes from the page's
             ``Description`` meta tag instead.

     Returns:
         The ``Venue``, or ``None`` when

         * an entry of ``self.linkIn`` already contains ``linkItems``,
         * ``self.getRequest`` returns ``None``,
         * the page lacks the ``page_move`` container, the content wrapper
           or its two ``<section>`` children,
         * the page heading is the placeholder ``ZERO`` / ``NULL``, or
         * neither a formatted address nor any phone number was found.

     Optional page parts that are missing (contact table, ratings text,
     photo link, description, certificates) are skipped rather than
     raising.
     """
     # Substring containment against every known link, not plain equality.
     if any(linkItems in seen for seen in self.linkIn):
         print('This venue exist in list')
         return None
     self.linkIn.append(linkItems)

     xmlPages = self.getRequest(linkItems)
     if xmlPages is None:
         return None

     xmlVen = xmlPages.xpath('//div[@class="page_move"]')
     cate__ = xmlPages.find('.//meta[@name="Description"]')
     if len(xmlVen) == 0:
         return None

     # The heading is only used to reject placeholder profiles; the stored
     # name is derived from the URL below.
     heading = xmlVen[0].xpath('.//h2')
     name = (heading[0].text or '').strip() if heading else ''
     if name.upper() in {'ZERO', 'NULL'}:
         return None

     ven = Venue()
     if cate__ is not None:
         content = cate__.get('content')
         if content is not None:
             ven.category = content.split(',')[0]

     ven.name = self.getNamefromUrl(linkItems)
     ven.hqdb_featured_ad_type = hqdb_type
     ven.scrape_page = linkItems
     img_link = []

     divInfo = xmlVen[0].find('.//div[@class="content_wrapper content_wrapper_main clearfix"]/div')
     if divInfo is None:
         return None
     mainInfo = divInfo.xpath('./section')
     if len(mainInfo) < 2:
         return None
     leftInfo = mainInfo[0]
     rightInfo = mainInfo[1]

     # --- contact table: one "key: value" pair per row -------------------
     mobile_prefixes = ('015', '016', '017', '+4915', '+4916', '+4917')
     tableInfo = leftInfo.find('./div/div[@class="profile_top_left"]/table')
     rows = tableInfo.xpath('./tr') if tableInfo is not None else []
     for tr_ in rows:
         td = tr_.xpath('./td')
         if len(td) < 2:
             continue
         key_ = ''.join(td[0].itertext()).strip()
         # Strip the site's "not specified" placeholders from the value.
         values_ = ' '.join(td[1].itertext()).strip().replace('keine Angabe', '').replace('NULL', '').replace('null', '')
         if key_ == 'Ansprechpartner:':
             if len(values_) > 2:
                 ven.name_of_contact = values_
                 ven.name += ', ' + ven.name_of_contact
         elif key_ == 'Addresse:':
             ven.formatted_address = self.validateFormat(values_)
         elif key_ == 'Homepage:':
             a_ = td[1].find('./a')
             if a_ is not None:
                 ven.business_website = a_.get('href')
         elif key_ == 'Tel:':
             values_ = values_.replace('/', '').replace(' ', '').replace('Tel', '')
             if values_.startswith(mobile_prefixes):
                 ven.mobile_number = self.validatePhone__(self.validatePhone(values_), 'de')
             # Anything that did not yield a mobile number is filed as office.
             if ven.mobile_number is None:
                 ven.office_number = self.validatePhone__(self.validatePhone(values_), 'de')

     img_ = leftInfo.find('./div/div[@class="profile_top_right"]/img')
     if img_ is not None:
         img_link.append(img_.get('src'))

     # --- ratings ---------------------------------------------------------
     rating = leftInfo.xpath('.//section[@id="ratings"]/div')
     if len(rating) >= 2:
         # The review count is the second whitespace-separated token.
         count_tokens = ''.join(rating[0].itertext()).strip().split()
         rating1 = count_tokens[1] if len(count_tokens) > 1 else None
         rating2 = ''.join(rating[1].itertext()).strip()
         if len(rating2) > 0:
             rating2 = rating2.split()[0]
             if rating2.find('/') != -1:
                 rating2 = rating2.split('/')[0].replace(',', '.')
         # The score is kept as text, but only if it parses as a number.
         try:
             float(rating2)
         except ValueError:
             rating2 = None
         ven.hqdb_nr_reviews = rating1
         ven.hqdb_review_score = rating2

     if ven.hqdb_review_score is None:
         # Fall back to the score shown in the floating box.
         scoreIn = xmlVen[0].xpath('//div[@class="float_box"]//span[@class="txtLight"]/parent::div')
         if len(scoreIn) > 0:
             core_ = (scoreIn[0].text or '').replace(',', '.')
             try:
                 float(core_)
             except ValueError:
                 core_ = None
             ven.hqdb_review_score = core_

     # --- a venue without address and without any phone is useless --------
     has_address = bool((ven.formatted_address or '').strip())
     has_phone = any(number is not None for number in (
         ven.office_number, ven.office_number2,
         ven.mobile_number, ven.mobile_number2))
     if not has_address and not has_phone:
         return None

     # --- geocode by the zip code found in the address ---------------------
     zipFrom = None
     if ven.formatted_address is not None:
         zipFrom = self.findZipcode(ven.formatted_address)
     if zipFrom is not None:
         (ven.latitude, ven.longitude) = self.getLatlng(zipFrom, 'DE')
         if ven.latitude is None and ven.longitude is None:
             Util.log.running_logger.info(ven.formatted_address + ' : cannot get GEO code')

     # --- photo gallery ------------------------------------------------------
     redirecPhotos = rightInfo.find('./nav/div/ul/li[@class="tabOff tab_foto"]/a')
     if redirecPhotos is not None:
         linkPhotos = redirecPhotos.get('href')
         if linkPhotos is not None:
             if linkPhotos.startswith('/'):
                 linkPhotos = self.__url__ + linkPhotos
             # NOTE(review): the query already selects the <a> elements and
             # './a' is then applied to the result -- verify against what
             # Util.getRequestsXML actually returns.
             xpathPhotos = Util.getRequestsXML(linkPhotos, '//div[@class="portfolio thumbs"]/a')
             if xpathPhotos is not None:
                 for thumb in xpathPhotos.xpath('./a'):
                     img_link.append(thumb.get('data-thumb'))

     # --- description ----------------------------------------------------------
     desElement = rightInfo.find('./div/div[@id="cont_about"]')
     if desElement is not None:
         # NOTE(review): the expression starts with '//', so it matches every
         # "overview" div in the document, not only those below desElement.
         des = ''.join(' '.join(divDes.itertext())
                       for divDes in desElement.xpath('//div[@class="overview"]'))
         ven.description = self.validateDes(des)

     # --- certificates -----------------------------------------------------------
     certi = rightInfo.find('.//div/div[@id="cont_certs"]')
     tablecerti = certi.find('./table') if certi is not None else None
     if tablecerti is not None:
         ven.accreditations = ''.join(tablecerti.itertext()).replace('Geprüfte Zertifikate:', '')

     ven.img_link = img_link
     ven.country = 'de'
     ven.is_get_by_address = True
     return ven