# The functions below are collected from separate scraper modules; the
# imports they rely on are gathered here for reference. Project-specific
# helpers and globals (google_chrome_driver, finalize_teardown, get_resource,
# load_flow, add_source, scraper, clear_bool_values, _get_columns_mapping_dict,
# URL, BASE, BASE_URL, MAIN_PAGES, slugs, headers, scraped_ids) are assumed
# to come from the surrounding repository.
import base64
import logging
import os
import re
import shutil
import tarfile
import tempfile
import time
import zipfile

import requests
import dataflows as DF
from dataflows import (Flow, concatenate, load, printer, set_primary_key,
                       set_type, update_resource)
from pyquery import PyQuery as pq
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait


def scrape():
    gcd = google_chrome_driver()
    driver = gcd.driver
    driver.get(URL)
    # Wait for the report table to render before parsing the page source.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "ReportContent")))
    page = pq(driver.page_source)
    rows = page.find('#ReportContent table.table tr')
    for row in rows:
        _row = [pq(td) for td in pq(row).find('td')]
        if len(_row) == 0:
            continue
        ret = dict(zip(('tender_id', 'page_title', 'claim_date', 'description'),
                       [td.text() for td in _row]))
        link = _row[0].find('a').attr('href')
        if not link.startswith('http'):
            link = BASE + link
        ret.update(dict(
            tender_type='call_for_bids',
            page_url=URL,
            publication_id=0,
            tender_type_he='קול קורא',  # "call for proposals"
            publisher='ועדת העזבונות',  # "the Estates Committee"
            start_date=None,
            documents=[dict(link=link,
                            description='מסמכי הקול הקורא')],  # "call documents"
            contact='מוחמד זחלקה',
            contact_email='*****@*****.**',
        ))
        yield ret
    gcd.teardown()
def fetcher(parameters):
    skip = 0
    gcd = None
    try:
        while True:
            url = URL.format(**parameters, limit=skip + 10, skip=skip)
            skip += 10
            results = None
            # First try a plain HTTP request; fall back to a headless
            # Chrome session if the request fails or returns bad JSON.
            if gcd is None:
                try:
                    results = requests.get(url)
                    results = results.json()
                except Exception as e:
                    print('FAILED to load from %s: %s' % (url, e))
                    # `results` is still the raw Response if only .json() failed.
                    if results and hasattr(results, 'content'):
                        print('FAILED to parse JSON <pre>%s</pre>' % results.content[:2048])
                    gcd = google_chrome_driver()
            if gcd is not None:
                results = gcd.json(url)
            results = results.get('results', [])
            yield from results
            if len(results) == 0:
                break
    finally:
        if gcd is not None:
            gcd.teardown()
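# Usage sketch for the paginated fetcher above. The values are hypothetical:
# the real URL template and parameter names live in the surrounding module.
# It assumes URL takes `limit`/`skip` plus caller-supplied fields, e.g.:
#
#   URL = 'https://example.com/api/search?q={query}&limit={limit}&skip={skip}'
#
#   for record in fetcher({'query': 'tenders'}):
#       print(record)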
def get_explanations(url):
    logging.info('Connecting to %r', url)
    try:
        # Stream the archive straight to a temp file...
        resp = requests.get(url, stream=True, timeout=300).raw
        outfile = tempfile.NamedTemporaryFile(delete=False,
                                              suffix=os.path.basename(url))
        shutil.copyfileobj(resp, outfile)
        outfile.close()  # flush before the archive readers reopen it by name
        archive = outfile.name
    except Exception:
        # ...and fall back to downloading through headless Chrome.
        gcl = google_chrome_driver()
        archive = gcl.download(url)
        gcl.teardown()
    if '.tar.gz' in url:
        t_archive = tarfile.open(name=archive, mode='r|gz')
        files = ((os.path.basename(member.name), t_archive.extractfile(member))
                 for member in t_archive
                 if member is not None and member.isfile())
    elif '.zip' in url:
        z_archive = zipfile.ZipFile(archive)
        files = ((os.path.basename(member.filename), z_archive.open(member))
                 for member in z_archive.filelist)
    else:
        assert False, 'unsupported archive type: %s' % url
    for name, item in files:
        contents = base64.b64encode(item.read()).decode('ascii')
        yield {'contents': contents, 'orig_name': name}
    os.unlink(archive)
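# A minimal consumption sketch (hypothetical URL): decode each yielded
# record back into bytes and write it out under its original name.
#
#   for item in get_explanations('https://example.com/explanations.tar.gz'):
#       with open(item['orig_name'], 'wb') as f:
#           f.write(base64.b64decode(item['contents']))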
def flow(*_):
    gcd = google_chrome_driver()
    download = gcd.download(
        'https://data.gov.il/dataset/246d949c-a253-4811-8a11-41a137d3d613/resource/f004176c-b85f-4542-8901-7b3176f9a054/download/f004176c-b85f-4542-8901-7b3176f9a054.csv'
    )
    gcd.teardown()  # the file is on disk, so the driver can be released
    return Flow(
        load(download, cast_strategy=load.CAST_TO_STRINGS),
        concatenate(_get_columns_mapping_dict(),
                    target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government', type='boolean',
                 falseValues=['לא'], trueValues=['כן']),  # "no" / "yes"
        set_type('company_is_mafera', type='boolean',
                 falseValues=['לא'],  # "no"
                 trueValues=['מפרה', 'התראה']),  # "in breach", "warning"
        set_type('company_last_report_year', type='integer'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True}, resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
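# `clear_bool_values` and `_get_columns_mapping_dict` are defined in the
# source module. As an illustration only, a dataflows row processor of this
# shape could serve as `clear_bool_values`, assuming its job is to blank out
# boolean fields that did not cast cleanly:
#
#   def clear_bool_values(row):
#       for field in ('company_is_government', 'company_is_mafera'):
#           if not isinstance(row.get(field), bool):
#               row[field] = None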
def flow(*_):
    global gcd
    gcd = google_chrome_driver(wait=False)
    return DF.Flow(
        scraper(gcd),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        finalize_teardown(gcd),
    )
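# `finalize_teardown` is defined elsewhere in the repository. A minimal
# sketch of what such a step could look like, assuming dataflows' rows-
# processor convention, so the driver stays alive until the row stream has
# been fully consumed:
#
#   def finalize_teardown(gcd):
#       def step(rows):
#           yield from rows
#           gcd.teardown()
#       return step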
def wrapper(wait=False):
    gcd = None
    try:
        gcd = google_chrome_driver(wait=wait)
        return scraper(gcd)
    finally:
        # Runs as soon as `scraper` returns, so `scraper` is expected to
        # hand back fully materialized results rather than a lazy generator
        # (the driver is gone by the time a caller would iterate).
        logging.info('Tearing down %r', gcd)
        if gcd:
            gcd.teardown()
def wrapper(year):
    gcd = None
    try:
        gcd = google_chrome_driver(initial='http://example.com/')
        return scraper(gcd, year)
    finally:
        logging.info('Tearing down %r', gcd)
        if gcd:
            gcd.teardown()
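# If `scraper` were a generator function, the teardown in the wrappers above
# would fire before any rows are consumed. A generator-safe variant (a
# sketch, using the same assumed helpers) defers teardown until iteration
# completes:
#
#   def wrapper_streaming(year):
#       gcd = google_chrome_driver(initial='http://example.com/')
#       try:
#           yield from scraper(gcd, year)
#       finally:
#           logging.info('Tearing down %r', gcd)
#           gcd.teardown()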
def flow(*_):
    global gcd
    gcd = google_chrome_driver(initial=BASE_URL + MAIN_PAGES[0], wait=False)
    # Local-debugging alternative: attach to an already-running Chrome
    # instead of launching a new one.
    # import selenium
    # class G:
    #     def __init__(self):
    #         chrome_options = selenium.webdriver.ChromeOptions()
    #         chrome_options.debugger_address = 'localhost:9222'
    #         self.driver = selenium.webdriver.Chrome(options=chrome_options)
    # gcd = G()
    return DF.Flow(
        scraper(gcd),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
def flow(parameters):
    dataset_name = str(parameters['dataset-name'])
    resource_name = str(parameters['resource-name'])
    resource = parameters.get('resource', {})
    resource.update({
        'dpp:streaming': True,
    })
    # Reuse a driver passed in by batch_flow, or spin up our own.
    gcd = parameters.get('gcd') or google_chrome_driver()
    url, path = get_resource(gcd, dataset_name, resource_name)
    args = {'name': resource_name, 'http_timeout': 30}
    if '.xls' in path:
        args['force_strings'] = True
    return Flow(
        add_source('{}/{}'.format(dataset_name, resource_name), url),
        load(path, **args),
        update_resource(resource_name, **resource),
        finalize_teardown(gcd),
    )
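# Hypothetical parameters for the flow above; the key names mirror what it
# reads, while the dataset/resource identifiers are placeholders:
#
#   flow({
#       'dataset-name': 'some-dataset',
#       'resource-name': 'some-resource',
#       'resource': {'title': 'Some Resource'},
#   })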
def get_explanations(url):
    logging.info('Connecting to %r', url)
    gcl = google_chrome_driver()
    archive = gcl.download(url)
    gcl.teardown()
    if '.tar.gz' in url:
        t_archive = tarfile.open(name=archive, mode='r|gz')
        files = ((os.path.basename(member.name), t_archive.extractfile(member))
                 for member in t_archive
                 if member is not None and member.isfile())
    elif '.zip' in url:
        z_archive = zipfile.ZipFile(archive)
        files = ((os.path.basename(member.filename), z_archive.open(member))
                 for member in z_archive.filelist)
    else:
        assert False
    for name, item in files:
        contents = base64.b64encode(item.read()).decode('ascii')
        yield {'contents': contents, 'orig_name': name}
    os.unlink(archive)
def scrape():
    gcd = google_chrome_driver(wait=False)
    driver = gcd.driver

    def prepare(msg=''):
        logging.info('PREPARING')
        driver.get("https://www.misim.gov.il/mm_lelorasham/firstPage.aspx")
        # time.sleep(3)
        # driver.execute_script('Display(1)');
        while True:
            try:
                bakasha = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "RadioBakasha1")))
                bakasha.click()
                time.sleep(3)
                return
            except TimeoutException:
                logging.warning(
                    'Failed to find radio button, retrying in a few (%s)', msg)
                time.sleep(3600)

    prepare()

    # Prepare options
    time.sleep(3)
    logging.info('GETTING OPTIONS')
    page = driver.page_source
    option_re = re.compile(r'<option value="(\d+)">([^<]+)</option>')
    options = option_re.findall(page)
    logging.info('GETTING OPTIONS: %r', options)
    options = dict((k.strip(), v.strip()) for k, v in options)

    def select_option(selection_):
        logging.info('OPTION %s (%s)', selection_, options[selection_])
        select = Select(driver.find_element_by_id('DropdownlistSugYeshut'))
        select.select_by_value(selection_)
        # driver.find_element_by_css_selector('option[value="%s"]' % selection_).click()
        time.sleep(3)
        search_button = driver.find_element_by_id('btnHipus')
        ActionChains(driver).move_to_element(search_button)\
            .move_to_element_with_offset(search_button, xoffset=10, yoffset=10)\
            .click()\
            .perform()

    for selection in options.keys():
        if slugs.get(options[selection]) is None:
            logging.warning('SKIPPING option #%s (%s)', selection, options[selection])
            continue
        logging.info('TRYING option #%s (%s)', selection, options[selection])
        prepare('#%s (%s)' % (selection, options[selection]))
        select_option(selection)
        while True:
            try:
                WebDriverWait(driver, 60, poll_frequency=5).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "#dgReshima tr.row1")))
            except TimeoutException:
                logging.warning('FAILED TO GET ROWS, RETRYING OPTION')
                time.sleep(10)
                prepare()
                select_option(selection)
                continue
            page = driver.page_source
            row_re = re.compile(r'(<td.+</td>)')
            for line in page.split('\n'):
                row = row_re.findall(line)
                if len(row) > 0:
                    row = row[0]
                    logging.info('ROW %r', row)
                    row = ([slugs[options[selection]]] +
                           [pq(x).text() for x in pq(row).find('td')])
                    if len(row) == 1:
                        continue
                    logging.info('ROW %r', row)
                    datum = dict(zip(headers, row))
                    the_id = datum['id']
                    # De-duplicate across pages and options.
                    if the_id not in scraped_ids:
                        scraped_ids.add(the_id)
                        yield datum
            # Paginate while a "next" button (btnHaba) is present.
            if 'btnHaba' in page:
                try:
                    next_button = driver.find_element_by_id('btnHaba')
                except NoSuchElementException:
                    break
                ActionChains(driver).move_to_element(next_button)\
                    .move_to_element_with_offset(next_button, xoffset=10, yoffset=10)\
                    .click()\
                    .perform()
                time.sleep(3)
            else:
                break
    gcd.teardown()
def get_gcd():
    return google_chrome_driver()
def batch_flow(parameters):
    # Share a single driver across all flows in the batch.
    gcd = google_chrome_driver()
    return Flow(*[flow(dict(**p, gcd=gcd)) for p in parameters['batch']])
def flow(parameters):
    gcl = google_chrome_driver()
    # Download through headless Chrome, then hand the local path to the
    # regular load flow.
    parameters['from'] = gcl.download(parameters['from'])
    gcl.teardown()
    return load_flow(parameters)