def run(cls, req: Requester) -> Iterator[Clinic]:
    """Walk the state -> location -> profile pages and yield each parsed clinic.

    The page at ``cls.base_url`` is handed to ``cls._get_states`` to obtain
    the state URLs. Each state page is handed to ``cls._get_location`` to
    obtain location URLs, and each location page is handed to
    ``cls._get_clinics`` to obtain ``(profile_url, name)`` pairs. Every
    profile page is then fetched and passed to ``cls._parse_profile``.

    Progress is written to ``sys.stdout``; each message starts with a
    carriage return so it overwrites the previous one on a terminal.

    Args:
        req: Object used to fetch pages through its ``get_page_bs`` method.

    Yields:
        Whatever ``cls._parse_profile`` returns for each profile, in the
        order the profiles are discovered.
    """
    sys.stdout.write(f'\r{cls.company_name_upper}: Processing.')

    states = cls._get_states(req.get_page_bs(cls.base_url))
    states_len = len(states)

    # Count the clinics actually yielded. The number of location pages is
    # not the number of clinics: one location page may list several
    # profiles, and the summary line below reports "clinics".
    total_clinic_count = 0

    for state_count, state_url in enumerate(states, start=1):
        sys.stdout.write(
            f'\r{cls.company_name_upper}: Processing state {state_count}/{states_len}.'
        )
        locations = cls._get_location(req.get_page_bs(state_url))
        locations_len = len(locations)

        for location_count, location_url in enumerate(locations, start=1):
            sys.stdout.write(
                f'\r{cls.company_name_upper}: Processing state {state_count}/{states_len} and location {location_count}/{locations_len}.'
            )
            profiles = cls._get_clinics(req.get_page_bs(location_url))
            for profile_url, name in profiles:
                clinic = cls._parse_profile(
                    req.get_page_bs(profile_url), name, profile_url)
                total_clinic_count += 1
                yield clinic

    sys.stdout.write(
        f'\r{cls.company_name_upper}: Processed {states_len} states to find {total_clinic_count} clinics.\n'
    )
def run(cls, req: Requester) -> Iterator[Clinic]:
    """Yield one clinic per URL across all community URL groups.

    The page at ``cls.test_urls['communities'][1]`` is fetched and handed to
    ``cls._get_communities``, which supplies a sequence of URL groups. Each
    URL in each group is fetched and passed, together with the URL itself,
    to ``cls._get_clinic``.

    Progress is written to ``sys.stdout``; each message starts with a
    carriage return so it overwrites the previous one on a terminal.

    Args:
        req: Object used to fetch pages through its ``get_page_bs`` method.

    Yields:
        Whatever ``cls._get_clinic`` returns for each URL.
    """
    sys.stdout.write(f'\r{cls.company_name_upper}: Processing.')

    index_page = req.get_page_bs(cls.test_urls['communities'][1])
    communities = cls._get_communities(index_page)
    communities_len = len(communities)
    total_location_count = 0

    for communities_count, community_urls in enumerate(communities, start=1):
        sys.stdout.write(
            f'\r{cls.company_name_upper}: Processing community {communities_count}/{communities_len}.'
        )
        locations_len = len(community_urls)

        for location_count, url in enumerate(community_urls, start=1):
            sys.stdout.write(
                f'\r{cls.company_name_upper}: Processing community {communities_count}/{communities_len} and location {location_count}/{locations_len}.'
            )
            page = req.get_page_bs(url)
            yield cls._get_clinic(page, url)

        # The running total is only advanced once a whole group is done.
        total_location_count += locations_len

    sys.stdout.write(
        f'\r{cls.company_name_upper}: Processed {communities_len} communities to find {total_location_count} clinics.\n'
    )
class TestRequesterClass(unittest.TestCase):
    """Smoke test for ``Requester``: both fetch methods return something.

    The test requests https://www.google.com, so it needs network access.
    """

    _req: Requester

    def setUp(self):
        """Create a fresh ``Requester`` before each test."""
        # 0.2 is the sole constructor argument (presumably a delay between
        # requests -- confirm against Requester).
        requester = Requester(0.2)
        self._req = requester

    def test_valid(self):
        """Fetching a reachable URL yields a non-None string and page."""
        page_as_str = self._req.get_page_str('https://www.google.com')
        self.assertIsNotNone(page_as_str)

        page_as_bs = self._req.get_page_bs('https://www.google.com')
        self.assertIsNotNone(page_as_bs)
def run(cls, req: Requester) -> Iterator[Clinic]:
    """Query the location-search endpoint once and yield a clinic per result.

    A single POST is sent to ``https://manage.fyzical.com/locationsearch``
    through ``req.form_json``. The ``'data'`` entry of the response is
    iterated and every item is passed to ``cls._get_profile``.

    Progress is written to ``sys.stdout``; each message starts with a
    carriage return so it overwrites the previous one on a terminal.

    Args:
        req: Object used to issue the request through its ``form_json``
            method.

    Yields:
        Whatever ``cls._get_profile`` returns for each location entry.
    """
    sys.stdout.write(f'\r{cls.company_name_upper}: Processing.')

    resp = req.form_json('post',
                         'https://manage.fyzical.com/locationsearch',
                         data={
                             'lng': -100,
                             'lat': 40,
                             'radius': 5000,
                         })
    locations = resp['data']
    locations_len = len(locations)
    total_location_count = locations_len

    # Number the entries from 1. The counter used to be initialised to 0 and
    # never advanced, so every progress line read "0/N".
    for location_count, location in enumerate(locations, start=1):
        sys.stdout.write(
            f'\r{cls.company_name_upper}: Processing location {location_count}/{locations_len}.'
        )
        yield cls._get_profile(location)

    sys.stdout.write(
        f'\r{cls.company_name_upper}: Processed {total_location_count} clinics.\n'
    )
def run(cls, req: Requester) -> Iterator[Clinic]:
    """Query the ``store_search`` AJAX endpoint once and yield a clinic per result.

    A single GET is sent to ``{cls.base_url}/wp-admin/admin-ajax.php?``
    through ``req.form_json``. The returned collection is iterated and every
    item is passed to ``cls._get_profile``.

    Progress is written to ``sys.stdout``; each message starts with a
    carriage return so it overwrites the previous one on a terminal.

    Args:
        req: Object used to issue the request through its ``form_json``
            method.

    Yields:
        Whatever ``cls._get_profile`` returns for each location entry.
    """
    sys.stdout.write(f'\r{cls.company_name_upper}: Processing.')

    locations = req.form_json('get',
                              f'{cls.base_url}/wp-admin/admin-ajax.php?',
                              params={
                                  'action': 'store_search',
                                  'lat': 30,
                                  'long': -80,
                                  'max_results': 25,
                                  'search_radius': 50,
                                  'autoload': 1,
                              })
    locations_len = len(locations)
    total_location_count = locations_len

    # Number the entries from 1. The counter used to be initialised to 0 and
    # never advanced, so every progress line read "0/N".
    for location_count, location in enumerate(locations, start=1):
        sys.stdout.write(
            f'\r{cls.company_name_upper}: Processing location {location_count}/{locations_len}.'
        )
        yield cls._get_profile(location)

    sys.stdout.write(
        f'\r{cls.company_name_upper}: Processed {total_location_count} clinics.\n'
    )
def run(cls, req: Requester) -> Iterator[Clinic]:
    """Yield a clinic for every location returned by a per-state marker search.

    The page at ``cls.test_urls['states'][1]`` is handed to
    ``cls._get_states``. For each state a ``markersearch`` POST is sent to
    ``{cls.base_url}/wp-admin/admin-ajax.php?`` and the response is handed to
    ``cls._get_locations`` to obtain location URLs. Each location page is
    fetched and passed, with its URL, to ``cls._get_profile``.

    Progress is written to ``sys.stdout``; each message starts with a
    carriage return so it overwrites the previous one on a terminal.

    Args:
        req: Object used for requests through ``get_page_bs`` and
            ``form_json``.

    Yields:
        Whatever ``cls._get_profile`` returns for each location URL.
    """
    sys.stdout.write(f'\r{cls.company_name_upper}: Processing.')

    states_page = req.get_page_bs(cls.test_urls['states'][1])
    states = cls._get_states(states_page)
    states_len = len(states)
    total_location_count = 0

    for state_count, state in enumerate(states, start=1):
        sys.stdout.write(
            f'\r{cls.company_name_upper}: Processing state {state_count}/{states_len}.'
        )
        search_result = req.form_json(
            'post',
            f'{cls.base_url}/wp-admin/admin-ajax.php?',
            data={
                'action': 'markersearch',
                'method': 'state',
                'state': state,
            },
        )
        locations = cls._get_locations(search_result)
        locations_len = len(locations)
        total_location_count += locations_len

        for location_count, location_url in enumerate(locations, start=1):
            sys.stdout.write(
                f'\r{cls.company_name_upper}: Processing state {state_count}/{states_len} and location {location_count}/{locations_len}.'
            )
            location_page = req.get_page_bs(location_url)
            yield cls._get_profile(location_page, location_url)

    sys.stdout.write(
        f'\r{cls.company_name_upper}: Processed {states_len} states to find {total_location_count} clinics.\n'
    )
def run(cls, req: Requester) -> Iterator[Clinic]:
    """Yield every clinic listed on each state page.

    The page at ``cls.states_url`` is handed to ``cls._get_states``, which
    supplies ``(state_url, state)`` pairs. Each state page is fetched and
    passed, with the state value, to ``cls._get_clinics``. The items it
    returns are yielded one by one.

    Progress is written to ``sys.stdout``; each message starts with a
    carriage return so it overwrites the previous one on a terminal.

    Args:
        req: Object used to fetch pages through its ``get_page_bs`` method.

    Yields:
        Each item of the collections returned by ``cls._get_clinics``.
    """
    sys.stdout.write(f'\r{cls.company_name_upper}: Processing.')

    states = cls._get_states(req.get_page_bs(cls.states_url))
    states_len = len(states)
    total_location_count = 0

    for state_count, (state_url, state) in enumerate(states, start=1):
        sys.stdout.write(
            f'\r{cls.company_name_upper}: Processing state {state_count}/{states_len}.'
        )
        state_page = req.get_page_bs(state_url)
        locations = cls._get_clinics(state_page, state)
        total_location_count += len(locations)

        for clinic in locations:
            yield clinic

    sys.stdout.write(
        f'\r{cls.company_name_upper}: Processed {states_len} states to find {total_location_count} clinics.\n'
    )
def run(cls, req: Requester) -> Iterator[Clinic]:
    """Yield one clinic per data entry extracted from the base page.

    The page at ``cls.base_url`` is handed to ``cls._get_data``; every entry
    it returns is passed to ``cls._get_clinic``.

    Progress is written to ``sys.stdout``; each message starts with a
    carriage return so it overwrites the previous one on a terminal.

    Args:
        req: Object used to fetch the page through its ``get_page_bs``
            method.

    Yields:
        Whatever ``cls._get_clinic`` returns for each entry.
    """
    sys.stdout.write(f'\r{cls.company_name_upper}: Processing.')

    base_page = req.get_page_bs(cls.base_url)
    data = cls._get_data(base_page)
    data_len = len(data)

    for data_count, entry in enumerate(data, start=1):
        sys.stdout.write(
            f'\r{cls.company_name_upper}: Processing location {data_count}/{data_len}.'
        )
        yield cls._get_clinic(entry)

    sys.stdout.write(
        f'\r{cls.company_name_upper}: Processed {data_len} clinics.\n')
def setUp(self):
    """Create the ``Requester`` stored on ``self._req`` for the test."""
    # 0.2 is the sole constructor argument (presumably a delay between
    # requests -- confirm against Requester).
    requester = Requester(0.2)
    self._req = requester
def _get_clinic(cls, req: Requester, url) -> Clinic:
    """Fetch ``url`` as a string and build a clinic from it.

    Args:
        req: Object used to fetch the page through its ``get_page_str``
            method.
        url: Address of the page; also forwarded to
            ``cls._get_clinic_info``.

    Returns:
        Whatever ``cls._get_clinic_info`` returns for the fetched page.
    """
    return cls._get_clinic_info(req.get_page_str(url), url)
def _get_locations(cls, req: Requester, url: str) -> [str]:
    """Fetch ``url`` as a string and extract the location URLs from it.

    Args:
        req: Object used to fetch the page through its ``get_page_str``
            method.
        url: Address of the page to fetch.

    Returns:
        Whatever ``cls._get_location_urls`` returns for the fetched page.
    """
    return cls._get_location_urls(req.get_page_str(url))
# Downloads the pages listed in each selected scraper's ``test_urls`` into
# ``<out_location>/<company_name>/`` so they can be used as test files.
from pathlib import Path

from ptls.requester import Requester
from ptls.args import Args, get_args

args: Args = get_args(default_location='./data/test_files')
req: Requester = Requester(args.network_delay)


def download(file: Path, url: str):
    """Fetch ``url`` as text and store it at ``file``.

    ``req.get_page_str`` is treated as returning ``str``, so the content is
    written in text mode. The previous binary-mode write raised ``TypeError``
    for ``str`` data and never closed the file handle. ``write_text`` opens,
    writes and closes the file in one call.

    Args:
        file: Destination path; an existing file is overwritten.
        url: Address of the page to download.
    """
    # UTF-8 is chosen explicitly so the result does not depend on the
    # platform's default encoding.
    file.write_text(req.get_page_str(url), encoding='utf-8')


for scraper in args.scrapers:
    print(f'Downloading {scraper.company_name} test files...')
    scraper_path: Path = args.out_location.joinpath(scraper.company_name)
    scraper_path.mkdir(parents=True, exist_ok=True)
    # Each ``test_urls`` value is a (relative file name, url) pair.
    for (p, url) in scraper.test_urls.values():
        download(scraper_path.joinpath(p), url)