def _coordinate_prefix_matches(given, reference):
    """Return True when ``given`` and ``reference`` share the compared prefix.

    The prefix runs up to two characters past the first '.' in
    ``reference`` (two characters in total when there is no '.'), and is
    never longer than either string.
    """
    limit = min(len(reference), reference.find('.') + 3, len(given))
    return given[:limit] == reference[:limit]


def ValidateGeoCode(fulladdress, country, lat, lng, scrape_page=None):
    """Check scraped coordinates against the geocoder result for an address.

    Args:
        fulladdress: address text sent to ``Util.getGEOCode``.
        country: country code; for 'de' and 'at' German umlauts and 'ß' are
            transliterated before geocoding.
        lat, lng: scraped coordinates as strings (they are compared
            character by character).
        scrape_page: label used as the prefix of the error log line.

    Returns:
        False only when the geocoder answered with status OK and ``lat`` or
        ``lng`` disagrees with the first result on the compared prefix.
        True in every other case, including when any of the four main
        arguments is None or the geocoder gave no usable answer.
    """
    if fulladdress is None or country is None or lat is None or lng is None:
        return True

    if country in ('de', 'at'):
        GermanyChar = [['ä', 'ae'], ['ö', 'oe'], ['ü', 'ue'], ['Ä', 'Ae'],
                       ['Ö', 'Oe'], ['Ü', 'Ue'], ['ß', 'ss']]
        for special, plain in GermanyChar:
            fulladdress = fulladdress.replace(special, plain)
    fulladdress = Util.removedoubleSpace(fulladdress)

    jsonLocation = Util.getGEOCode(fulladdress, country)
    status = None
    if jsonLocation is not None:
        status = jsonLocation.get('status')
    # A missing response or missing status used to crash on .upper(); report
    # it through the same warning instead.
    status = status.upper() if status is not None else 'NO RESPONSE'
    if status != 'OK':
        Util.log.coordinate_logger.warning(
            fulladdress + ',' + country + ': cannot get GEO code ' + status)
        return True

    location = jsonLocation.get('results')[0].get('geometry').get('location')
    checks = (
        ('latitude', lat, str(location.get('lat'))),
        ('longitude', lng, str(location.get('lng'))),
    )
    for label, given, reference in checks:
        if not _coordinate_prefix_matches(given, reference):
            # %-formatting keeps the log call working when scrape_page is None.
            Util.log.coordinate_logger.error(
                '%s: invalid %s (%s,%s)' % (scrape_page, label, lat, lng))
            return False
    return True
def __getListVenues(self, link, city):
    """Visit every result page under ``link`` and parse each venue link.

    The page count is read from the text after 'of' in the pager paragraph
    of the first response. Each page is then requested with
    ``&page=<n>`` appended, and every anchor matched by the two row XPaths
    is passed to ``self.__VenueParser`` together with ``city``. Any
    exception raised while paging is printed and ends the walk.
    """
    container_xpath = '//body/div[@class="container"]/div[@class="container"]'
    row_xpaths = (
        '//div[@class="row top-buffer"]/a',
        '//div[@class="row top-buffer border-row"]/a',
    )
    first_page = Util.getRequestsXML(link, container_xpath)
    if first_page is None:
        return
    pager = first_page.find('.//div[@class="row centered-text"]/p')
    if pager is None:
        return
    pager_text = pager.text
    try:
        page_total = int(pager_text.split('of')[1])
        for page_no in range(1, page_total + 1):
            page_url = link + '&page=' + str(page_no)
            page_doc = Util.getRequestsXML(page_url, container_xpath)
            for row_xpath in row_xpaths:
                for anchor in page_doc.xpath(row_xpath):
                    self.__VenueParser(self.__url__ + anchor.get('href'),
                                       city)
        Util.log.running_logger.info(city + ' Done')
    except Exception as ex:
        print(ex)
def __getListVenues(self, city):
    """Parse and write out every venue linked from the page at ``city``.

    ``city`` is used as the request URL. When the response holds at least
    two matched tables, each anchor in the cells of the second one is
    handed to ``self.__VenueParser`` (href, link text). Every venue that
    parses is written to ``self.folder`` under a fresh index from
    ``self.addIndex()``. A '<city> Done' warning is logged in all cases.
    """
    tables = Util.getRequestsXML(city,
                                 '//td[@class="welcome-padding"]/table')
    if tables is not None and len(tables) >= 2:
        for cell in tables[1].xpath('./tr/td'):
            for anchor in cell.xpath('./a'):
                venue = self.__VenueParser(anchor.get('href'), anchor.text)
                if venue is None:
                    continue
                slot = self.addIndex()
                print('Writing to Index: ' + str(slot))
                venue.writeToFile(self.folder, slot, venue.name, False)
    Util.log.running_logger.warning(city + ' Done')
def doWork(self):
    """Store the phone-code list on the instance, then crawl the venues."""
    phone_codes = Util.getPhoneCodeList()
    self.phoneCodeList = phone_codes
    self.__getListVenues()
def doWork(self):
    """Entry point: load ``self.phoneCodeList`` and run the venue listing."""
    codes = Util.getPhoneCodeList()
    self.phoneCodeList = codes
    self.__getListVenues()
def getLatlng(self, address, countr):
    """Geocode ``address`` and return its coordinates as a pair of strings.

    Args:
        address: address text; None, empty or whitespace-only input is
            answered without calling the geocoder.
        countr: country value passed through to ``Util.getGEOCode``.

    Returns:
        ``(str(lat), str(lng))`` taken from the first result that carries a
        'geometry' entry, or ``(None, None)`` when there is no such result,
        the status is not 'OK', or the lookup raises. The return value is
        always a 2-tuple, so callers can unpack it unconditionally.
    """
    no_match = (None, None)
    if address is None or address.strip() == '':
        return no_match
    try:
        response = Util.getGEOCode(address, countr)
        if response is None or response.get('status') != 'OK':
            return no_match
        for entry in response.get('results') or []:
            geometry = entry.get('geometry')
            if geometry is not None:
                location = geometry.get('location')
                return (str(location.get('lat')), str(location.get('lng')))
    except Exception:
        # Best-effort lookup: any failure means "no coordinates".
        return no_match
    # Status was OK but no result had geometry; this path used to return
    # a bare None.
    return no_match
def getRequest(self, pages, value, xpath_):
    """Fetch one search-result page and return the parsed response.

    Builds the search URL from ``value`` (strNearRegion) and ``pages``
    (intPage), prints it between two rows of asterisks, and returns
    whatever ``Util.getRequestsXML`` yields for ``xpath_``.
    """
    banner = '*' * 25
    url = ''.join([
        'http://www.architecturalindex.com/consumers/search/search.asp?strNearRegion=',
        str(value),
        '&intPage=',
        str(pages),
    ])
    print(banner)
    print('Request: ' + url)
    print(banner)
    return Util.getRequestsXML(url, xpath_)
def getListRegion(self):
    """Run the venue listing for every region link on the start page.

    Region anchors whose text equals the empty string are skipped (the
    original comment mentions the Channel Islands here). Every other
    anchor's href is appended to ``self.__url__`` and crawled.
    """
    region_list = Util.getRequestsXML(
        self.__url__, '//div[@class="col-md-9 page-main"]/div/ul')
    if region_list is None:
        return
    for anchor in region_list.xpath('./ul/li/a'):
        if anchor.text == '':
            continue
        self.__getListVenues(self.__url__ + anchor.get('href'))
def doWork(self):
    """Fill ``self.phoneCodeList`` and then process the list of venues."""
    loaded_codes = Util.getPhoneCodeList()
    self.phoneCodeList = loaded_codes
    self.__getListVenues()
def getCategory(self):
    """Return the category entries as '<text>\\t<absolute url>' strings.

    The list is empty when the directory index request yields None.
    """
    index_doc = Util.getRequestsXML(
        'https://www.blauarbeit.de/branchenbuch/index.html',
        '//div[@class="box_w box_r"]/ul/li/a')
    if index_doc is None:
        return []
    return [
        anchor.text + '\t' + self.__url__ + anchor.get('href')
        for anchor in index_doc.xpath('./a')
    ]
def __getListVenues(self, urlRegion):
    """Parse every business listing element found on the region page."""
    listing_xpath = (
        '//div[@class="row listing listing-horizontal-xs '
        'listing-horizontal-sm listing-horizontal-md business"]'
    )
    listings = Util.getRequestsXML(urlRegion, listing_xpath)
    if listings is None:
        return
    for entry in listings.xpath('./div'):
        self.__VenueParser(entry)
def getListcategory(self):
    """Start one thread per directory category to list its venues.

    Each category anchor element is passed as the single argument to
    ``self.__getListVenues``. The threads are started and not joined.
    """
    directory = Util.getRequestsXML(self.__url__ + '/the-directory',
                                    '//div[@id="wpbdp-categories"]/ul/li')
    if directory is None:
        return
    for category in directory.xpath('./li/a'):
        worker = threading.Thread(target=self.__getListVenues,
                                  args=(category, ))
        worker.start()
def __getListVenues_2(self):
    """Parse every list item under the page's 'links' block.

    Each item is passed to ``self.__VenueParser`` together with the result
    of ``self.getListGeoCode`` for the page and a scrape-page label made of
    the list URL plus '#<1-based position>'.
    """
    page = Util.getRequestsXML(self._url_lstVenues, '/html')
    geo_data = self.getListGeoCode(page)
    entries = page.xpath('//div[@class="links"]/ul/li')
    for position, entry in enumerate(entries, 1):
        self.__VenueParser(entry, geo_data,
                           self._url_lstVenues + '#' + str(position))
def __getListVenues(self):
    """Send each table row to the parser that matches its markup.

    Rows that contain an anchor with an ``onclick`` attribute go to
    ``self.__VenueParser``; all other rows go to ``self.__VenueParser_2``.
    """
    table = Util.getRequestsXML(self._url_lstVenues, self._xpath_lstVenues)
    for row in table.xpath('./tr'):
        has_onclick = len(row.xpath('./td//a/@onclick')) > 0
        parse = self.__VenueParser if has_onclick else self.__VenueParser_2
        parse(row)
def doWork(self):
    """Load the phone codes onto the instance and crawl the venue list."""
    phone_code_list = Util.getPhoneCodeList()
    self.phoneCodeList = phone_code_list
    self.__getListVenues()
def ReValidPhone(phone, type='phone'):
    """Normalise a phone string, returning None when nothing is left.

    None and '' give None. Otherwise the value goes through
    ``ReValidString`` and then ``Util.removeSpecialChar(phone, type)``, and
    all spaces are removed. An empty result is returned as None.
    """
    if phone is None:
        return phone
    if phone == "":
        return None
    phone = ReValidString(phone)
    if phone is None:
        return phone
    phone = Util.removeSpecialChar(phone, type).replace(' ', '')
    return None if phone == '' else phone
def doWork(self):
    """Print a street-validation sample, then crawl the venue list.

    Any exception raised by the crawl is printed instead of propagated.
    """
    sample_street = '15863 A Baña'
    print(self.validateStreet(sample_street))
    self.phoneCodeList = Util.getPhoneCodeList()
    try:
        self.__getListVenues()
    except Exception as ex:
        print(ex)
def __list_city(self):
    """Collect city links from the list page into ``self.__city__``.

    A link is appended only when it is not contained in (``in``) any entry
    already held in ``self.__city__``.
    """
    page = Util.getRequestsXML(self._url_lstVenues,
                               '//td[@class="welcome-padding"]')
    for anchor in page.xpath('//table/tr/td/a'):
        href = anchor.get('href')
        if not any(href in known for known in self.__city__):
            self.__city__.append(href)
def doWork(self):
    """Crawl cities and venues, then write the parsed venues to one CSV.

    The output path is ``<folder>/<chain>_<validated name>_Venues.csv``.
    Nothing is written when no cities or no venues were collected. A venue
    whose parsing raises is reported (printed and logged) and skipped.
    """
    safe_name = Validation.RevalidName(self.__name__)
    self.outFile = ''.join(
        [self.folder, '/', self._chain_, '_', safe_name, '_Venues.csv'])
    self.phoneCodeList = Util.getPhoneCodeList()
    self.__getListCities()
    if len(self.listCities) <= 0:
        return
    # Deduplicate the cities before listing their venues.
    self.listCities = list(set(self.listCities))
    self.__getListVenues()
    if len(self._lstVenues) <= 0:
        return
    rows = []
    for pending in self._lstVenues:
        try:
            parsed = self.__VenueParser(pending)
            if parsed is not None:
                rows.append(parsed.toOrderDict(False))
        except Exception as ex:
            failure = "URL: " + pending.scrape_page + ": " + ex.message
            print(failure)
            Util.log.error(failure)
    Util.writelist2File(rows, self.outFile)
def ReValidPrice(price, pattern=None):
    """Clean up a price string, returning None when it is unusable.

    Commas become dots, carriage returns and newlines are removed, and the
    text is stripped. When ``pattern`` is given, the cleaned text must
    match it from the start (``re.match``). An empty result gives None;
    otherwise the text is passed through ``Util.removedoubleSpace`` and
    stripped.
    """
    if price is None:
        return None
    cleaned = price.strip().replace(",", ".").strip()
    for line_break in ('\r\n', '\r', '\n'):
        cleaned = cleaned.replace(line_break, '')
    cleaned = cleaned.strip()
    if pattern is not None and re.match(pattern, cleaned) is None:
        return None
    if cleaned.strip() == '':
        return None
    return Util.removedoubleSpace(cleaned).strip()
def doWork(self):
    """Crawl, filter, and write every venue held in ``self.venues``.

    The keys are snapshotted before the loop. ``self.index`` is
    incremented once per venue and used as that venue's file index.
    """
    self.phoneCodeList = Util.getPhoneCodeList()
    self.__getListVenues()
    self.filterVenues()
    for key in list(self.venues):
        venue = self.venues.get(key)
        self.index += 1
        venue.writeToFile(self.folder, self.index, venue.name, False)
def doWork(self):
    """Crawl by post code and print the final value of ``self.index``."""
    phone_codes = Util.getPhoneCodeList()
    self.phoneCodeList = phone_codes
    self.getByPostCode()
    print(str(self.index))
def __getListVenues(self, postcode):
    """Parse every venue list item returned for ``postcode``.

    Only the first node matched by ``self._xpath_lstVenues`` is used. Each
    of its ``ul/li`` items is passed to ``self.__VenueParser`` with the
    page's geo-code data and a label '<list url><postcode>#<position>'.
    """
    page_url = self._url_lstVenues + postcode
    page = Util.getRequestsXML(page_url, '/html')
    geo_data = self.getListGeoCode(page)
    containers = page.xpath(self._xpath_lstVenues)
    if len(containers) <= 0:
        return
    for position, entry in enumerate(containers[0].xpath('./ul/li'), 1):
        self.__VenueParser(
            entry, geo_data,
            self._url_lstVenues + postcode + '#' + str(position))
def __getListVenues(self, countr):
    """Search firms in ``countr`` by every key 0-9 and a-z.

    For each key the result pages are requested one after another until a
    request fails or the matched table has no children. Each firm link
    found is parsed in its own thread, with dispatching paused while
    ``self.checkAlive()`` reports six or more workers.

    Args:
        countr: country value forwarded to ``self.url_`` and to
            ``self.__VenueParser``; also printed when the search is done.
    """
    table_xpath = ('//table[@class="table-responsive firm-search-results '
                   'expandable-rows"]')
    # list(...) keeps the concatenation valid where range() is not a list.
    search_keys = list(range(0, 10)) + [
        chr(x) for x in range(ord('a'), ord('z') + 1)
    ]
    for post in search_keys:
        countRequest = 1
        url = self.url_(1, post, countRequest, countr)
        print('Find with: ' + str(post))
        isTable = Util.getRequestsXML(url, table_xpath)
        while isTable is not None and len(isTable) > 0:
            page = countRequest * 4 - 3
            url = self.url_(page, post, countRequest, countr)
            print('find with url: ' + url)
            isTable = Util.getRequestsXML(url, table_xpath)
            if isTable is None:
                # A failed page request used to crash on .xpath(); stop
                # paging this key and go on to the next one.
                break
            hides = isTable.xpath('//tr/td/h5/a')
            print('found: ' + str(len(hides)))
            while len(hides) > 0:
                if self.checkAlive() < 6:
                    link = hides.pop().get('href')
                    worker = threading.Thread(target=self.__VenueParser,
                                              args=(self.__url__ + link,
                                                    countr))
                    worker.start()
                    self.threadRunning.append(worker)
                else:
                    # Worker limit reached: wait before checking again.
                    time.sleep(1)
            countRequest += 1
    print(countr + ' done')
def getRegions(self):
    """Start one venue-listing thread per region link, six at a time.

    Region hrefs are taken from the end of the list. Each started thread
    is appended to ``self.runningThread``. While ``self.countThread()``
    reports six or more threads, the loop sleeps briefly instead of
    spinning.
    """
    # Local import so this method does not depend on a module-level one.
    import time

    menu = Util.getRequestsXML(self._url_lstVenues,
                               '//ul[@id="menu-menu-adresses"]/li/a')
    if menu is None:
        return
    pending = menu.xpath('./a')
    while len(pending) > 0:
        if self.countThread() < 6:
            href = pending.pop()
            worker = threading.Thread(target=self.__getListVenues,
                                      args=(href.get('href'), ))
            worker.start()
            self.runningThread.append(worker)
        else:
            # The original busy-waited here, using a full core while
            # waiting for a worker slot.
            time.sleep(0.1)
def doWork(self):
    """Crawl venues for each post code from ``self.postcode()``.

    Each post code is printed between two runs of ten dashes before it is
    processed. The first exception is printed and ends the whole loop.
    """
    self.phoneCodeList = Util.getPhoneCodeList()
    postcodes = self.postcode()
    rule = '-' * 10
    try:
        for post in postcodes:
            print(rule + post + rule)
            self.__getListVenues(post)
    except Exception as ex:
        print(ex)
def getListVenues(self, xmlElement, cate):
    """Page through a category's results 100 rows at a time.

    For each page, 'estirar gratuito' items are parsed in worker threads
    (paused while ``self.checkAlive()`` reports ten or more) and
    'estirar ' items are parsed inline with ``self.__VenueParser_2``.
    Both kinds are added to ``self.countingVenues``. Paging stops at the
    first page with no 'estirar gratuito' items.

    A page that raises is printed and skipped, as before. After three
    failing pages in a row the walk now stops, because the original
    ``continue`` retried forever when the request kept failing.

    NOTE(review): a page with only 'estirar ' items ends the walk before
    those items are parsed — confirm this is intended.

    Args:
        xmlElement: category anchor; its href is the base of the page URLs.
        cate: category value forwarded to both parsers.
    """
    max_consecutive_failures = 3
    failures = 0
    pages = -100
    while True:
        pages += 100
        numbers = 0
        try:
            link___ = xmlElement.get('href') + '/pag-' + str(
                pages) + '/rows-100/s:/'
            listVenues__ = Util.getRequestsXML(
                link___, '//div[@id="listadoResultados"]')
            listVenues = listVenues__.xpath(
                '//li[@class="estirar gratuito"]')
            listVenues_2 = listVenues__.xpath('//li[@class="estirar "]')
            # The page was fetched and queried, so reset the failure count.
            failures = 0
            self.countingVenues += len(listVenues)
            self.countingVenues += len(listVenues_2)
            if len(listVenues) <= 0:
                break
            while len(listVenues) > 0:
                if self.checkAlive() < 10:
                    numbers += 1
                    scrappages = link___ + '#' + str(numbers)
                    thread1 = threading.Thread(target=self.__VenueParser,
                                               args=(listVenues[-1], cate,
                                                     scrappages))
                    self.threadRunning.append(thread1)
                    thread1.start()
                    listVenues.pop()
                else:
                    time.sleep(0.1)
            while len(listVenues_2) > 0:
                if self.checkAlive() < 10:
                    numbers += 1
                    scrape_pages = link___ + '#00' + str(numbers)
                    self.__VenueParser_2(listVenues_2[-1], cate,
                                         scrape_pages)
                    listVenues_2.pop()
                else:
                    time.sleep(0.1)
        except Exception as ex:
            print(ex)
            failures += 1
            if failures >= max_consecutive_failures:
                # Persistent failure: stop instead of looping forever.
                break
def doWork(self):
    """Crawl each country in ``self.listCountry`` not yet marked running.

    A country is skipped when it is contained in (``in``) any entry of
    ``self.countryRunning``. Otherwise it is recorded there and crawled in
    a thread that is joined at once, so countries run one after another.
    """
    self.phoneCodeList = Util.getPhoneCodeList()
    for country in self.listCountry:
        if any(country in running for running in self.countryRunning):
            continue
        self.countryRunning.append(country)
        worker = threading.Thread(target=self.__getListVenues,
                                  args=(country, ))
        worker.start()
        worker.join()
def __getListVenues(self):
    """Collect venue URLs by walking areas, then cities, then listings.

    Every listing href found is appended to ``self.list_url``. A level
    whose request yields None or a childless node is skipped.

    The original guards were written ``x==None | len(x)<=0``. Because
    ``|`` binds tighter than the comparisons, that evaluated
    ``None | len(x)`` and raised TypeError on every call. They are now
    ``x is None or len(x) <= 0``.
    """
    print("Getting list of Venues")
    xmlArea = Util.getRequestsXML(self.__url__, self._xpath_lstVenues)
    if xmlArea is None or len(xmlArea) <= 0:
        return
    for area in xmlArea.xpath('./a'):
        xmlCity = Util.getRequestsXML(area.get('href'),
                                      self._xpath_lstVenues)
        if xmlCity is None or len(xmlCity) <= 0:
            continue
        for city in xmlCity.xpath('./a'):
            xmlContent = Util.getRequestsXML(
                city.get('href'), '//div[@class="content_column"]')
            if xmlContent is None or len(xmlContent) <= 0:
                continue
            items = xmlContent.xpath(
                './div[@class="listings"]/div[@class="list"]'
                '/div[@class="listing_box"]/a')
            for item in items:
                self.list_url.append(item.get('href'))
def doWork(self):
    """Crawl every region and print the final value of ``self.index``."""
    phone_codes = Util.getPhoneCodeList()
    self.phoneCodeList = phone_codes
    self.getListRegion()
    print(str(self.index))