Code example #1
def _scrape_city(soup):
    """Extract the city (addressLocality) from the address block."""
    containers = soup.select('div.address-container')
    if not containers:
        return None
    # Scope the locality lookup to the address container itself.
    city = containers[0].find('span', itemprop='addressLocality')
    return cp1252(city.string) if city else None
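
Every example funnels scraped values through a cp1252 helper that these excerpts never define. A minimal sketch, assuming it re-encodes text (and, as in example #8, whole result dicts) for Windows-1252 CSV output; this is an inference, not the author's code:

def cp1252(value):
    # Hypothetical reconstruction of the undefined helper; an assumption.
    if isinstance(value, dict):
        return dict((k, cp1252(v)) for k, v in value.items())
    if isinstance(value, unicode):
        return value.encode('cp1252', 'replace')  # Python 2 unicode -> bytes
    return value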
Code example #2
    def _do_search(self, search_term):
        """Search hitta.se for search_term, following any did-you-mean link
        and falling back to a company-type query when nothing matches."""
        search_term = cp1252(search_term)
        logger.dump('searching %s' % search_term)

        query = {'vad': search_term}
        url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)
        self.url = url

        logger.dump("Get: %s" % url)
        self._try_mech_open(url)
        tree = html.fromstring(self.mech.response().read())

        # Follow hitta.se's "did you mean" suggestion when one is offered.
        dym = tree.xpath('//a[contains(@data-track-params,"did_you_mean")]/@href')
        if dym:
            url = "http://www.hitta.se" + dym[0]
            print "Did you mean: " + url
            self._try_mech_open(url)
            tree = html.fromstring(self.mech.response().read())

        # No company hits: retry with an explicit company ('ftg') search.
        if not tree.xpath('//div[@class="company-name"]/h2/a/@href'):
            query = {'vad': search_term, 'sida': 1, 'typ': 'ftg'}
            url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)
            logger.dump("Get Clinic From Place: %s" % url)
            self._try_mech_open(url)
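
Examples #2, #3, and #7 call self._try_mech_open, which is also not shown. A plausible sketch, assuming it is a retry wrapper around mechanize's open() and that time is imported at module level:

    def _try_mech_open(self, url, retries=3):
        # Hypothetical retry wrapper; the real helper is not in these excerpts.
        for attempt in range(retries):
            try:
                return self.mech.open(url, timeout=60)
            except Exception:
                if attempt == retries - 1:
                    raise  # give up after the last attempt
                time.sleep(2 ** attempt)  # back off before retrying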
Code example #3
    def search(self, row_h):
        """Try each generated search term until one returns hits; write a
        NOT FOUND row if every term comes up empty."""
        found = False
        for search_term in self.search_term_gen(row_h):
            self._do_search(search_term)
            # "gav ingen träff" is hitta.se's no-results message. Remember
            # the outcome instead of re-reading the response afterwards.
            content = cp1252(self.mech.response().read().decode('utf-8'))
            if not re.search("gav ingen träff", content):
                found = True
                break

        if not found:
            result_h = {}
            result_h['SEARCH_NAME'] = row_h['PARENT_WORKPLACE_NAME']
            result_h['URL'] = "NOT FOUND"
            csv_eniro.write_row_h(result_h)
            return

        for result_h in self._scrape_result():
            result_h['SEARCH_NAME'] = row_h['PARENT_WORKPLACE_NAME']
            result_h['MATCHING_CRITERIA'] = search_term
            csv_eniro.write_row_h(result_h)
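
search() drives self.search_term_gen(row_h), another helper missing from these excerpts. A hedged sketch, assuming it yields progressively broader terms; only PARENT_WORKPLACE_NAME is attested in the code above, and the CITY field is a guess:

    def search_term_gen(self, row_h):
        # Hypothetical: exact name first, then a looser name-plus-city term.
        yield row_h['PARENT_WORKPLACE_NAME']
        if row_h.get('CITY'):
            yield '%s %s' % (row_h['PARENT_WORKPLACE_NAME'], row_h['CITY'])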
Code example #4
def _scrape_addr(soup):
    """Join the visible address lines into a newline-separated string."""
    containers = soup.select('div.address-container')
    if not containers:
        return None

    addr_lines = containers[0].select('div.address__line')
    lines = [line.string for line in addr_lines if line.string is not None]
    return cp1252("\n".join(lines))
Code example #5
    def _do_search(self, search_term):
        """Query eniro.se's supersearch endpoint for search_term."""
        search_term = cp1252(search_term)
        logger.dump('searching %s' % search_term)

        # Note: assigning addheaders replaces all previously set headers.
        self.mech.addheaders = [('Host', 'www.eniro.se')]
        query = {'what': 'supersearch', 'search_word': search_term}
        url = 'http://www.eniro.se/query?' + urllib.urlencode(query)
        logger.dump("Step1 Get: %s" % url)
        self.mech.open(url, timeout=60)
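
How self.mech is constructed never appears in these excerpts. One plausible setup, assumptions throughout:

import mechanize

def _make_browser():
    # Hypothetical construction of self.mech; not taken from the excerpts.
    mech = mechanize.Browser()
    mech.set_handle_robots(False)  # the scrapers fetch result pages directly
    mech.addheaders = [('User-Agent', 'Mozilla/5.0')]
    return mech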
Code example #6
def _scrape_home_url(soup):
    """Return the hit's website URL, absolutized when it is site-relative."""
    website = soup.find('a', {'class': 'website'})
    if website is None:
        return None

    website = website.get('href', None)
    if website is not None:
        website = cp1252(website)
        # hrefs beginning with '/' are site-relative; absolutize them
        # without introducing a double slash.
        if website.startswith('/'):
            website = 'http://www.hitta.se' + website
    return website
Code example #7
    def _do_search(self, search_term):
        """Search hitta.se for search_term, retrying as a company search
        when no company hits come back."""
        search_term = cp1252(search_term)
        logger.dump('searching %s' % search_term)

        query = {'vad': search_term}
        url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)

        logger.dump("Get: %s" % url)
        self._try_mech_open(url)
        tree = html.fromstring(self.mech.response().read())

        # No company hits: retry with an explicit company ('ftg') search.
        if not tree.xpath('//div[@class="company-name"]/h2/a/@href'):
            query = {'vad': search_term, 'sida': 1, 'typ': 'ftg'}
            url = "http://www.hitta.se/s%C3%B6k?" + urllib.urlencode(query)
            logger.dump("Get Clinic From Place: %s" % url)
            self._try_mech_open(url)
Code example #8
    def _scrape_result(self):
        """Yield one result dict per hit article on the eniro result page."""
        tree = html.fromstring(self.mech.response().read())
        for article in tree.xpath('//article[@data-hit-number]'):
            # Re-parse the article so the absolute xpaths below are scoped
            # to this hit alone rather than the whole page.
            article = html.fromstring(html.tostring(article))
            result_h = {}  # fresh dict per hit, so yielded rows stay independent
            result_h['URL'] = self._first_element(article.xpath('//a[contains(@class,"profile-page-link")]/@href'))
            result_h['URL'] = "http://gulasidorna.eniro.se" + result_h['URL']
            result_h['FOUND_NAME'] = self._first_element(article.xpath('//a[contains(@class,"profile-page-link")]/text()'))
            result_h['WEBSITE'] = self._first_element(article.xpath('//a[contains(@class,"hit-homepage-link")]/text()'))
            # Drop links to 1177.se (the public health portal, not a homepage).
            result_h['WEBSITE'] = re.sub(r'.*1177\.se.*', '', result_h['WEBSITE'])
            result_h['PHONE'] = self._first_element(article.xpath('//span[contains(@class,"hit-phone-number")]/text()'))
            result_h['PHONE'] = re.sub(r'\s+', '', result_h['PHONE'])
            result_h['ADDRESS'] = self._first_element(article.xpath('//span[contains(@class,"street-address")]/text()'))
            result_h['ZIP'] = self._first_element(article.xpath('//span[contains(@class,"postal-code")]/text()'))
            result_h['ZIP'] = re.sub(r'\s+', '', result_h['ZIP'])
            result_h['CITY'] = self._first_element(article.xpath('//span[contains(@class,"locality")]/text()'))
            yield cp1252(result_h)
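
_scrape_result passes self._first_element output straight into re.sub and string concatenation, so the helper must return a string even when an xpath matches nothing. A one-line sketch consistent with that usage, though the real definition is not in these excerpts:

    def _first_element(self, elements):
        # Inferred from usage: first match, or '' so callers never see None.
        return elements[0] if elements else ''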
Code example #9
    def parse(self):
        """Collect doctor names from each found URL and write the row out."""
        for row_h in self.doctor_match_result_reader:
            # A missing or empty FOUND_DOCTORS_URL means nothing to scrape;
            # pass the row through unchanged.
            if not row_h["FOUND_DOCTORS_URL"]:
                csv_hitta_vardcentraler.write_row_h(row_h)
                continue

            self.doctor_tag = row_h["FOUND_DOCTOR_TAG"]

            all_names = []
            for home_url in row_h["FOUND_DOCTORS_URL"].split("|"):
                all_names.extend(self.get_all_names(home_url))
            row_h["FOUND_ALL_NAMES"] = cp1252("|".join(all_names))
            csv_hitta_vardcentraler.write_row_h(row_h)
Code example #10
    def parse(self):
        """As in example #9, but also stamps each row with today's date."""
        for row_h in self.doctor_match_result_reader:
            if not row_h['FOUND_DOCTORS_URL']:
                csv_hitta_vardcentraler.write_row_h(row_h)
                continue

            self.doctor_tag = row_h['FOUND_DOCTOR_TAG']

            all_names = []
            for home_url in row_h['FOUND_DOCTORS_URL'].split("|"):
                all_names.extend(self.get_all_names(home_url))
            row_h['FOUND_ALL_NAMES'] = cp1252('|'.join(all_names))
            row_h['DATE'] = datetime.datetime.today().strftime('%Y%m%d')

            csv_hitta_vardcentraler.write_row_h(row_h)
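
Both parse() variants call self.get_all_names(home_url) after stashing FOUND_DOCTOR_TAG, but the helper is not shown. A hedged sketch, assuming BeautifulSoup is imported at module level and that doctor_tag holds a CSS selector for the elements carrying staff names:

    def get_all_names(self, home_url):
        # Hypothetical: fetch the clinic page and pull the text of every
        # element matching doctor_tag. The selector semantics are a guess.
        self._try_mech_open(home_url)
        soup = BeautifulSoup(self.mech.response().read(), 'html.parser')
        return [el.get_text(strip=True) for el in soup.select(self.doctor_tag)]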
Code example #11
def _scrape_name(soup):
    """Return the text of the page's itemprop="name" heading, if present."""
    heading = soup.find('h1', {'itemprop': 'name'})
    if heading is not None:
        return cp1252(''.join(heading.stripped_strings))
    return None
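
The four soup-based scrapers (examples #1, #4, #6, and #11) compose naturally. A usage sketch; the fetch step and the dict keys are assumptions, not taken from the excerpts:

from bs4 import BeautifulSoup

def scrape_profile(html_text):
    # Hedged driver for the scrapers above; field names are illustrative.
    soup = BeautifulSoup(html_text, 'html.parser')
    return {
        'FOUND_NAME': _scrape_name(soup),
        'CITY': _scrape_city(soup),
        'ADDRESS': _scrape_addr(soup),
        'WEBSITE': _scrape_home_url(soup),
    }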