def _scrape_city(soup):
    """Extract the city (addressLocality) from a profile page, or None."""
    if not soup.select('div.address-container'):
        return None
    city = soup.find('span', itemprop='addressLocality')
    # Guard against a missing or empty span; city.string would raise
    # AttributeError on None.
    if city is None or city.string is None:
        return None
    return cp1252(city.string)
def _do_search(self, search_term):
    """Search hitta.se, follow a "did you mean" suggestion if offered,
    and fall back to a company-only search when nothing is found."""
    search_term = cp1252(search_term)
    logger.dump('searching %s' % search_term)
    query = {'vad': search_term}
    url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)
    self.url = url
    logger.dump("Get: %s" % url)
    self._try_mech_open(url)
    tree = html.fromstring(self.mech.response().read())

    # Follow hitta.se's "did you mean" suggestion, if any.
    hrefs = tree.xpath('//a[contains(@data-track-params,"did_you_mean")]/@href')
    if hrefs:
        url = "http://www.hitta.se" + hrefs[0]
        logger.dump("Did you mean: %s" % url)
        self._try_mech_open(url)
        tree = html.fromstring(self.mech.response().read())

    # No company hits: retry as an explicit company ('ftg') search.
    if not tree.xpath('//div[@class="company-name"]/h2/a/@href'):
        query = {'vad': search_term, 'sida': 1, 'typ': 'ftg'}
        url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)
        logger.dump("Get Clinic From Place: %s" % url)
        self._try_mech_open(url)
def search(self, row_h):
    """Try each generated search term until one yields hits; write a
    NOT FOUND row if every term comes back empty."""
    body = ""
    for search_term in self.search_term_gen(row_h):
        self._do_search(search_term)
        # Read the response exactly once per request; a second read()
        # on the same mechanize response returns an empty string.
        body = cp1252(self.mech.response().read().decode('utf-8'))
        if not re.search("gav ingen träff", body):  # "gave no hits"
            break
    if re.search("gav ingen träff", body):
        result_h = {}
        result_h['SEARCH_NAME'] = row_h['PARENT_WORKPLACE_NAME']
        result_h['URL'] = "NOT FOUND"
        csv_eniro.write_row_h(result_h)
    for result_h in self._scrape_result():
        result_h['SEARCH_NAME'] = row_h['PARENT_WORKPLACE_NAME']
        result_h['MATCHING_CRITERIA'] = search_term
        csv_eniro.write_row_h(result_h)
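# search() above expects search_term_gen to yield queries from most to
# least specific, stopping at the first one that returns hits. The real
# generator is not shown in this module; the sketch below is a
# hypothetical illustration of the shape it needs. Only
# PARENT_WORKPLACE_NAME appears in the code above; the CITY field and
# the term ordering are assumptions.

def search_term_gen(self, row_h):
    """Yield search terms from most to least specific (hypothetical)."""
    name = row_h['PARENT_WORKPLACE_NAME']
    city = row_h.get('CITY', '')  # assumed field, may not exist
    if city:
        yield '%s %s' % (name, city)  # name qualified by city first
    yield name                        # then the bare workplace name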
def _scrape_addr(soup):
    """Extract the street address lines from a profile page, or None."""
    if not soup.select('div.address-container'):
        return None
    container = soup.select('div.address-container')[0]
    addr_lines = container.select('div.address__line')
    addr_lines = [line.string for line in addr_lines if line.string is not None]
    address = "\n".join(addr_lines)
    return cp1252(address)
def _do_search(self, search_term):
    """Run the first step of an eniro.se supersearch for search_term."""
    search_term = cp1252(search_term)
    logger.dump('searching %s' % search_term)
    self.mech.addheaders = [('Host', 'www.eniro.se')]
    query = {'what': 'supersearch', 'search_word': search_term}
    url = 'http://www.eniro.se/query?' + urllib.urlencode(query)
    logger.dump("Step1 Get: %s" % url)
    self.mech.open(url, timeout=60)
def _scrape_home_url(soup):
    """Extract the clinic's website link from a profile page, or None."""
    website = soup.find('a', {'class': 'website'})
    if website is None:
        return None
    website = website.get('href', None)
    if website is not None:
        website = cp1252(website)
        # A root-relative href already starts with '/', so don't append
        # another slash after the host name.
        if website.startswith('/'):
            website = 'http://www.hitta.se' + website
    return website
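# Joining a root-relative href by string concatenation is easy to get
# wrong; urlparse.urljoin handles relative, root-relative, and absolute
# hrefs uniformly. A minimal sketch of the same normalization, with the
# hitta.se base URL as the only assumption:
import urlparse

def _normalize_href(href, base='http://www.hitta.se/'):
    """Resolve href against the site base; absolute URLs pass through."""
    return urlparse.urljoin(base, href)

# _normalize_href('/foo')                -> 'http://www.hitta.se/foo'
# _normalize_href('http://example.com/') -> 'http://example.com/'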
def _do_search(self, search_term):
    """Search hitta.se; fall back to a company-only search when no
    company hits are found."""
    search_term = cp1252(search_term)
    logger.dump('searching %s' % search_term)
    query = {'vad': search_term}
    url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)
    logger.dump("Get: %s" % url)
    self._try_mech_open(url)
    tree = html.fromstring(self.mech.response().read())

    # No company hits: retry as an explicit company ('ftg') search.
    if not tree.xpath('//div[@class="company-name"]/h2/a/@href'):
        query = {'vad': search_term, 'sida': 1, 'typ': 'ftg'}
        url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)
        logger.dump("Get Clinic From Place: %s" % url)
        self._try_mech_open(url)
def _scrape_result(self):
    """Yield one result dict per search hit on the current result page."""
    tree = html.fromstring(self.mech.response().read())
    articles = tree.xpath('//article[@data-hit-number]')
    for article in articles:
        # Re-parse the article so the '//' XPaths below are scoped to
        # this hit rather than the whole page.
        article = html.fromstring(html.tostring(article))
        # Build a fresh dict per hit; reusing one dict across iterations
        # makes every yielded row alias (and overwrite) the same object.
        result_h = {}
        result_h['URL'] = self._first_element(article.xpath('//a[contains(@class,"profile-page-link")]/@href'))
        result_h['URL'] = "http://gulasidorna.eniro.se" + result_h['URL']
        result_h['FOUND_NAME'] = self._first_element(article.xpath('//a[contains(@class,"profile-page-link")]/text()'))
        result_h['WEBSITE'] = self._first_element(article.xpath('//a[contains(@class,"hit-homepage-link")]/text()'))
        # Drop 1177.se links; they point at the national health portal,
        # not the clinic's own site.
        result_h['WEBSITE'] = re.sub(r'.*1177\.se.*', '', result_h['WEBSITE'])
        result_h['PHONE'] = self._first_element(article.xpath('//span[contains(@class,"hit-phone-number")]/text()'))
        result_h['PHONE'] = re.sub(r'\s+', '', result_h['PHONE'])
        result_h['ADDRESS'] = self._first_element(article.xpath('//span[contains(@class,"street-address")]/text()'))
        result_h['ZIP'] = self._first_element(article.xpath('//span[contains(@class,"postal-code")]/text()'))
        result_h['ZIP'] = re.sub(r'\s+', '', result_h['ZIP'])
        result_h['CITY'] = self._first_element(article.xpath('//span[contains(@class,"locality")]/text()'))
        yield cp1252(result_h)
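# Why the fresh dict per hit matters: a generator that mutates and
# yields one shared dict hands out N references to the same object, so
# any caller that collects the rows before writing them sees only the
# last hit's data. A minimal self-contained illustration with
# hypothetical helper names:

def _rows_shared():
    row = {}
    for i in range(3):
        row['N'] = i
        yield row  # same object every time

def _rows_fresh():
    for i in range(3):
        yield {'N': i}  # new object every time

# list(_rows_shared()) == [{'N': 2}, {'N': 2}, {'N': 2}]  (aliased)
# list(_rows_fresh())  == [{'N': 0}, {'N': 1}, {'N': 2}]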
def parse(self):
    """Collect all doctor names from each row's clinic home pages."""
    for row_h in self.doctor_match_result_reader:
        # Skip rows without any clinic URL (None or empty string).
        if not row_h["FOUND_DOCTORS_URL"]:
            csv_hitta_vardcentraler.write_row_h(row_h)
            continue
        self.doctor_tag = row_h["FOUND_DOCTOR_TAG"]
        all_names = []
        for home_url in row_h["FOUND_DOCTORS_URL"].split("|"):
            all_names += self.get_all_names(home_url)
        row_h["FOUND_ALL_NAMES"] = cp1252("|".join(all_names))
        csv_hitta_vardcentraler.write_row_h(row_h)
def parse(self):
    """Collect all doctor names from each row's clinic home pages and
    stamp the row with today's date."""
    for row_h in self.doctor_match_result_reader:
        # Skip rows without any clinic URL (None or empty string).
        if not row_h['FOUND_DOCTORS_URL']:
            csv_hitta_vardcentraler.write_row_h(row_h)
            continue
        self.doctor_tag = row_h['FOUND_DOCTOR_TAG']
        all_names = []
        for home_url in row_h['FOUND_DOCTORS_URL'].split('|'):
            all_names += self.get_all_names(home_url)
        row_h['FOUND_ALL_NAMES'] = cp1252('|'.join(all_names))
        row_h['DATE'] = datetime.datetime.today().strftime('%Y%m%d')
        csv_hitta_vardcentraler.write_row_h(row_h)
def _scrape_name(soup):
    """Extract the clinic name from a profile page heading, or None."""
    heading = soup.find('h1', {'itemprop': 'name'})
    if heading is not None:
        return cp1252(''.join(heading.stripped_strings))
    return None
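# How the soup-based helpers above fit together: fetch one hitta.se
# profile page, parse it, and pull out each field. A minimal sketch;
# _scrape_profile is a hypothetical wrapper, and the BeautifulSoup /
# urllib2 imports are assumptions about the surrounding module.
import urllib2
from bs4 import BeautifulSoup

def _scrape_profile(url):
    """Return a dict with the name, address, city, and website of one
    profile page (hypothetical composition of the helpers above)."""
    soup = BeautifulSoup(urllib2.urlopen(url, timeout=60).read(), 'html.parser')
    return {
        'FOUND_NAME': _scrape_name(soup),
        'ADDRESS': _scrape_addr(soup),
        'CITY': _scrape_city(soup),
        'WEBSITE': _scrape_home_url(soup),
    }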