# -*- coding: utf-8 -*-
# Imports used by the snippets below. `logger` and `csv_hitta` are
# project-local helpers (sketched further down where they are first used).
import re
import urllib
import mechanize
from urlparse import urlparse
from lxml import html
from bs4 import BeautifulSoup


def open(self, url, data=None,
         timeout=mechanize._sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    """Open `url`, retrying on errors and following eniro.se's 303 redirect
    to gulasidorna.eniro.se by hand."""
    # Only string URLs get the retry/redirect treatment; anything else
    # falls through and the previous response is returned unchanged.
    if isinstance(url, str):
        success = 0
        error_303 = 0
        while success == 0 and error_303 == 0:
            try:
                if re.search(r'www\.eniro\.se', url):
                    self.addheaders = [('Host', 'www.eniro.se')]
                mechanize.Browser.open(self, url, data, timeout=timeout)
            except Exception as error:
                print "Error: {}".format(error)
                if str(error) == 'HTTP Error 303: See Other':
                    error_303 = 1
            else:
                success = 1
            if error_303 == 1:
                # Follow the 303's Location header manually.
                logger.dump('Skipping: {}'.format(url))
                header = self.response().info()
                location = header['location']
                try:
                    logger.dump('Step2 Get: {}'.format(location))
                    self.addheaders = [('Host', 'gulasidorna.eniro.se')]
                    mechanize.Browser.open(self, location, data, timeout=timeout)
                except Exception as error:
                    print "Error: {}".format(error)
                    if str(error) != 'HTTP Error 404: Not Found':
                        error_303 = 0  # retry the whole sequence
                else:
                    success = 1
    return self.response()
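
# The open() override above presumably lives in a mechanize.Browser subclass;
# a minimal sketch of that wiring (the class name EniroBrowser and both
# handler settings are assumptions, not from the original source):
class EniroBrowser(mechanize.Browser):
    def __init__(self):
        mechanize.Browser.__init__(self)
        self.set_handle_robots(False)    # scrapers commonly skip robots.txt
        self.set_handle_redirect(False)  # so the 303 surfaces as an error,
                                         # as the override above expects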
def search(self):
    """Search hitta.se for 'vårdcentral' (health centre) and scrape every hit."""
    search_term = 'vårdcentral'
    self._do_search(search_term)
    for link in self._company_links():
        # Retry the result page until it opens.
        while True:
            success = 0
            try:
                logger.dump("Result Get: " + link)
                self.result_mech.open(link)
                success = 1
            except Exception as error:
                # Logging the error itself can fail on odd encodings; ignore that.
                try:
                    logger.dump(error)
                except Exception:
                    pass
            if success == 1:
                break
        row_h = self._scrape_result()
        row_h['URL'] = link
        row_h['SEARCH_NAME'] = search_term
        csv_hitta.write_row_h(row_h)
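
# csv_hitta.write_row_h() is referenced throughout but never shown; a
# plausible sketch, assuming it appends one dict per call to a CSV file
# (the file name and field names are assumptions):
import csv

class CsvWriter(object):
    def __init__(self, path, fieldnames):
        self.fh = open(path, 'wb')
        self.writer = csv.DictWriter(self.fh, fieldnames=fieldnames,
                                     extrasaction='ignore')
        self.writer.writeheader()

    def write_row_h(self, row_h):
        self.writer.writerow(row_h)
        self.fh.flush()

csv_hitta = CsvWriter('hitta_results.csv',
                      ['URL', 'SEARCH_NAME', 'MATCHING_CRITERIA'])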
def search(self):
    """Search for 'vårdcentral' and log each company result link (no scraping)."""
    search_term = 'vårdcentral'
    self._do_search(search_term)
    logger.dump("Get Links:")
    for link in self._company_links():
        logger.dump("Result Get: " + link)
def _do_search(self, search_term):
    """Run a supersearch query against www.eniro.se."""
    search_term = cp1252(search_term)
    logger.dump('searching %s' % search_term)
    self.mech.addheaders = [('Host', 'www.eniro.se')]
    query = {'what': 'supersearch', 'search_word': search_term}
    url = 'http://www.eniro.se/query?' + urllib.urlencode(query)
    logger.dump("Step1 Get: %s" % url)
    self.mech.open(url, timeout=60)
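
# cp1252() is referenced but not defined in these snippets; a minimal sketch,
# assuming it re-encodes a UTF-8 byte string to Windows-1252, which the
# Swedish directory sites expect in query strings:
def cp1252(s):
    if isinstance(s, unicode):
        return s.encode('cp1252')
    return s.decode('utf-8').encode('cp1252')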
def _try_mech_open(self, url):
    """Open `url`, retrying until it succeeds (never gives up)."""
    while True:
        is_succeed = 0
        try:
            self.mech.open(url)
            is_succeed = 1
        except Exception:
            logger.dump("Error search: %s" % url)
            logger.dump("RETRY: %s" % url)
        if is_succeed == 1:
            break
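
# _try_mech_open() retries forever, which can hang the whole run on a dead
# URL. A bounded variant with linear backoff (a sketch, not in the original):
import time

def _try_mech_open_bounded(self, url, max_retries=5, delay=2):
    for attempt in range(1, max_retries + 1):
        try:
            self.mech.open(url)
            return True
        except Exception:
            logger.dump("Error search: %s" % url)
            logger.dump("RETRY %d/%d: %s" % (attempt, max_retries, url))
            time.sleep(delay * attempt)  # back off a little more each time
    return False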
def _company_links(self):
    """def: _company_links"""
    tree = html.fromstring(self.mech.response().read())
    for link in tree.xpath('//div[@class="company-name"]/h2/a/@href'):
        yield 'http://www.hitta.se/' + link
    if self._do_next_page():
        logger.dump("Next Page:")
        for link in self._company_links():
            yield link
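
# _do_next_page() is not shown; a plausible sketch that follows hitta.se's
# pagination link when one exists (the XPath is an assumption). Note that
# _company_links() recurses once per page, so very deep result sets would
# grow the call stack:
def _do_next_page(self):
    tree = html.fromstring(self.mech.response().read())
    hrefs = tree.xpath('//a[@rel="next"]/@href')
    if not hrefs:
        return False
    self._try_mech_open('http://www.hitta.se' + hrefs[0])
    return True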
def _do_search(self, search_term):
    """Search hitta.se; if no company hits, retry as a company search."""
    search_term = cp1252(search_term)
    logger.dump('searching %s' % search_term)
    query = {'vad': search_term}
    url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)
    logger.dump("Get: %s" % url)
    self._try_mech_open(url)
    tree = html.fromstring(self.mech.response().read())
    if not tree.xpath('//div[@class="company-name"]/h2/a/@href'):
        # No company hits: retry restricted to company listings ('typ': 'ftg').
        query = {'vad': search_term, 'sida': 1, 'typ': 'ftg'}
        url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)
        logger.dump("Get Clinic From Place: %s" % url)
        self._try_mech_open(url)
def _do_search(self, search_term):
    """Search hitta.se, following a 'did you mean' suggestion if offered."""
    search_term = cp1252(search_term)
    logger.dump('searching %s' % search_term)
    query = {'vad': search_term}
    url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)
    self.url = url
    logger.dump("Get: %s" % url)
    self._try_mech_open(url)
    tree = html.fromstring(self.mech.response().read())
    # Follow a "did you mean" suggestion if hitta.se offers one.
    hrefs = tree.xpath('//a[contains(@data-track-params,"did_you_mean")]/@href')
    if hrefs:
        url = "http://www.hitta.se" + hrefs[0]
        print "Did you mean: " + url
        self._try_mech_open(url)
        tree = html.fromstring(self.mech.response().read())
    if not tree.xpath('//div[@class="company-name"]/h2/a/@href'):
        # Still no company hits: retry restricted to company listings ('typ': 'ftg').
        query = {'vad': search_term, 'sida': 1, 'typ': 'ftg'}
        url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)
        logger.dump("Get Clinic From Place: %s" % url)
        self._try_mech_open(url)
def find_match(self, full_name_list):
    """Scan the current page for each doctor name; return a list of match dicts."""
    is_succeed = 0
    try:
        soup = BeautifulSoup(self.mech.response().read(), "html.parser")
        is_succeed = 1
    except Exception:
        logger.dump("Error Socket Timeout, Skipping")
    if is_succeed != 1:
        return []
    matches = []
    for full_name in full_name_list:
        tags = soup.find_all(lambda tag: self._find_doctor(tag, full_name))
        if len(tags) == 0:
            continue
        match = {}
        match['URL'] = self.mech.geturl()
        match['SEARCH_DOCTOR_NAME'] = full_name
        match['FOUND_DOCTOR_NAME'] = tags[0].string.encode('cp1252')
        match['FOUND_DOCTOR_TAG'] = tags[0].name
        matches.append(match)
    return matches
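
# _find_doctor() is not shown; a minimal sketch of a predicate suitable for
# BeautifulSoup's find_all (the matching rule is an assumption, and
# full_name is assumed to be a cp1252-encoded byte string):
def _find_doctor(self, tag, full_name):
    if tag.string is None:
        return False
    return full_name.decode('cp1252').lower() in tag.string.lower()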
def _get_best_matching_row_h(self, vrow_h, hitta_results_rows_h):
    """Pick the best hitta.se candidate by applying match filters in priority order.

    Each filter narrows the candidate set only if it leaves at least one row;
    a filter that would eliminate every candidate is skipped.
    """
    crows_h = list(self._compare_rows(vrow_h, hitta_results_rows_h))
    if len(crows_h) == 0:
        return {}
    if len(crows_h) == 1:
        return crows_h[0]
    logger.dump('Found Multiple partial matches for %s' % crows_h[0]['SEARCH_NAME'])
    logger.dump('Applying Filter to get best match')
    filters = [
        lambda row_h: row_h['NAME_MATCH?'] == 'EXACT',
        lambda row_h: row_h['NAME_MATCH?'] == 'PARTIAL',
        lambda row_h: row_h['ADDRESS_MATCH?'] == 'EXACT',
        lambda row_h: row_h['PHONE_MATCH?'] == 'EXACT',
        lambda row_h: row_h['ADDRESS_MATCH?'] == 'PARTIAL',
        # Remove sub clinics.
        lambda row_h: not re.search('Distriktssköterska|mottagning',
                                    row_h['FOUND_NAME']),
        lambda row_h: row_h['ZIP_MATCH?'] == 'EXACT',
        lambda row_h: row_h['CITY_MATCH?'] == 'EXACT',
    ]
    rows_h = crows_h
    for keep in filters:
        filtered_rows_h = [row_h for row_h in rows_h if keep(row_h)]
        if len(filtered_rows_h) == 1:
            return filtered_rows_h[0]
        if filtered_rows_h:
            rows_h = filtered_rows_h  # narrow only when something survives
    return rows_h[0]
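
# Example of how the cascade resolves a tie (hypothetical data): given the
# candidates
#   [{'NAME_MATCH?': 'EXACT', ...}, {'NAME_MATCH?': 'PARTIAL', ...}]
# the first filter keeps only the EXACT row and returns it at once; the
# remaining filters never run. A filter that matches nothing (e.g. no row
# with an EXACT phone match) leaves the candidate set untouched.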
def search(self, vrow_h):
    """Search hitta.se for a workplace, relaxing the search term step by step
    until something matches."""
    search_term = vrow_h['PARENT_WORKPLACE_NAME'] + ',' + vrow_h['PARENT_WORKPLACE_ADDRESS']
    self._do_search(search_term)
    # -------------------------------------------------------------------------
    # Special case
    if vrow_h['PARENT_WORKPLACE_NAME'] == 'Familjeläkarna i Sverige AB':
        vrow_h['PARENT_WORKPLACE_NAME'] = 'Familjeläkarna'
    # -------------------------------------------------------------------------
    # 'inget resultat' is Swedish for "no results".
    if re.search('inget resultat', self.mech.response().read()):
        logger.dump("NO RESULT FOUND!!")
        # Retry with the workplace name only.
        search_term = vrow_h['PARENT_WORKPLACE_NAME']
        self._do_search(search_term)
        if re.search('inget resultat', self.mech.response().read()):
            logger.dump("NO RESULT FOUND!!")
            logger.dump("Searching with address, zip, city!!!!")
            search_term = ("%(PARENT_WORKPLACE_ADDRESS)s, "
                           "%(PARENT_WORKPLACE_ZIP)s %(PARENT_WORKPLACE_CITY)s" % vrow_h)
            self._do_search(search_term)
            if re.search('inget resultat', self.mech.response().read()):
                logger.dump("NO RESULT FOUND!!")
                logger.dump("Searching with address, city!!!!")
                search_term = "%(PARENT_WORKPLACE_ADDRESS)s, %(PARENT_WORKPLACE_CITY)s" % vrow_h
                self._do_search(search_term)
                if re.search('inget resultat', self.mech.response().read()):
                    logger.dump("NO RESULT FOUND!! Skipping...")
                    row_h = {}
                    row_h['URL'] = 'NOT FOUND'
                    row_h['SEARCH_NAME'] = vrow_h['PARENT_WORKPLACE_NAME']
                    csv_hitta.write_row_h(row_h)
                    return
    link_found = 0
    for link in self._company_links():
        link_found = 1
        # Retry the result page until it opens.
        while True:
            success = 0
            try:
                logger.dump("Result Get: " + link)
                self.mech.open(link)
                success = 1
            except Exception as error:
                logger.dump(error)
            if success == 1:
                break
        row_h = self._scrape_result()
        row_h['URL'] = link
        row_h['SEARCH_NAME'] = vrow_h['PARENT_WORKPLACE_NAME']
        row_h['MATCHING_CRITERIA'] = search_term
        csv_hitta.write_row_h(row_h)
    if link_found == 0:
        # No result links at all: hitta.se probably went straight to a single
        # hit, so scrape the page we are already on.
        row_h = self._scrape_result()
        row_h['URL'] = self.url
        row_h['SEARCH_NAME'] = vrow_h['PARENT_WORKPLACE_NAME']
        row_h['MATCHING_CRITERIA'] = search_term
        csv_hitta.write_row_h(row_h)
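
# _scrape_result() is not shown; a plausible sketch that pulls the company
# name off the current hitta.se result page (the XPath and field name are
# assumptions):
def _scrape_result(self):
    tree = html.fromstring(self.mech.response().read())
    row_h = {}
    names = tree.xpath('//h1//text()')
    row_h['FOUND_NAME'] = names[0].strip().encode('cp1252') if names else ''
    return row_h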
def find(self, home_url, full_name_list):
    """Crawl a clinic's website looking for each doctor's name.

    Yields one match dict per name, marked FOUND or NOT FOUND.
    """
    logger.dump("HOME Get: %s" % home_url)
    # Open the home page, retrying up to 5 times.
    retry_count = 1
    while retry_count <= 5:
        is_succeed = 0
        try:
            self.mech.open(home_url, timeout=60.0)
            is_succeed = 1
        except Exception:
            logger.dump("Error Getting: %s" % home_url)
            logger.dump("##### RETRYING #####: %s" % home_url)
            retry_count += 1
        if is_succeed == 1:
            break
    if retry_count > 5:
        # The full URL never opened; fall back to the bare domain.
        parsed_uri = urlparse(home_url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        is_succeed = 0
        try:
            self.mech.open(domain, timeout=60.0)
            is_succeed = 1
        except Exception:
            logger.dump("Error Getting: %s" % home_url)
            logger.dump("### SKIPPING: %s" % home_url)
        if is_succeed == 0:
            return
    home_url = self.mech.geturl()
    logger.dump("HOME URL: %s" % home_url)
    matches_h = {}
    # Check the landing page itself first.
    for match in self.find_match(full_name_list):
        match['HOME_URL'] = home_url
        match['MATCH?'] = "FOUND"
        matches_h[match['SEARCH_DOCTOR_NAME']] = match
    EXCLUDED_LINK_EXTENSIONS = ('jpg', 'gif', 'jpeg', 'pdf', 'doc', 'docx',
                                'ppt', 'txt', 'png', 'zip', 'rar', 'mp3')
    parsed_uri = urlparse(home_url)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    for link in set(l.absolute_url for l in self.mech.links()):
        # Skip binary/document downloads.
        if link.split('.')[-1].lower() in EXCLUDED_LINK_EXTENSIONS:
            continue
        # Only follow links on the clinic's own domain (with or without www).
        if domain not in link and re.sub(r'www\.', '', domain) not in link:
            continue
        # Skip directories, social networks and other external services.
        if re.search(r'hitta\.se|google|twitter|itunes\.|app\.eu|linkedin|'
                     r'facebook|mailto:|apple\.|1177\.', link):
            continue
        # if re.search(r'jll\.se', link): continue  # This domain hangs while getting request
        if len(matches_h) == len(full_name_list):
            break  # every doctor has already been found
        retry_count = 1
        while retry_count <= 5:
            is_succeed = 0
            try:
                self.mech.open(link, timeout=60.0)
                logger.dump("Search Get: %s" % link)
                for match in self.find_match(full_name_list):
                    match['HOME_URL'] = home_url
                    match['MATCH?'] = "FOUND"
                    if match['SEARCH_DOCTOR_NAME'] not in matches_h:
                        matches_h[match['SEARCH_DOCTOR_NAME']] = match
                is_succeed = 1
            except Exception:
                logger.dump("Error getting %s" % link)
                logger.dump("#### RETRY {} ###".format(retry_count))
                retry_count += 1
            if is_succeed == 1:
                break
    for full_name in full_name_list:
        if full_name in matches_h:
            yield matches_h[full_name]
        else:
            match = {}
            match['HOME_URL'] = home_url
            match['SEARCH_DOCTOR_NAME'] = full_name
            match['MATCH?'] = 'NOT FOUND'
            yield match
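
# A typical driver for find() (a sketch; the class name DoctorFinder, the
# URL and the doctor names are hypothetical):
finder = DoctorFinder()
for match in finder.find('http://www.example-clinic.se/',
                         ['Anna Svensson', 'Erik Lund']):
    csv_hitta.write_row_h(match)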