def scrape():
    gcd = google_chrome_driver()
    driver = gcd.driver

    driver.get(URL)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "ReportContent")))
    page = pq(driver.page_source)
    rows = page.find('#ReportContent table.table tr')
    for row in rows:
        _row = [pq(td) for td in pq(row).find('td')]
        if len(_row) == 0:
            continue
        ret = dict(
            zip(('tender_id', 'page_title', 'claim_date', 'description'),
                [td.text() for td in _row]))
        link = _row[0].find('a').attr('href')
        if not link.startswith('http'):
            link = BASE + link
        ret.update(
            dict(tender_type='call_for_bids',
                 page_url=URL,
                 publication_id=0,
                 tender_type_he='קול קורא',
                 publisher='ועדת העזבונות',
                 start_date=None,
                 documents=[dict(link=link, description='מסמכי הקול הקורא')],
                 contact='מוחמד זחלקה',
                 contact_email='*****@*****.**'))
        yield ret
    gcd.teardown()
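
All of these snippets depend on a google_chrome_driver() helper that is never shown on this page. The sketch below is a guess at the interface they rely on (a .driver attribute plus json(), download() and teardown() methods); the real helper is project-specific, so treat every detail here as an assumption.

import json
import tempfile

import requests
from selenium import webdriver


class google_chrome_driver:
    """Hypothetical reconstruction of the helper used throughout this page."""

    def __init__(self, initial=None, wait=True):
        # 'wait' is accepted for signature compatibility; its effect in
        # the original helper is unknown.
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)
        if initial:
            self.driver.get(initial)

    def json(self, url):
        # Load a JSON endpoint in the browser and parse the response body.
        self.driver.get(url)
        return json.loads(self.driver.find_element_by_tag_name('pre').text)

    def download(self, url):
        # The original presumably downloads through the browser; as a
        # stand-in, fetch with requests and return a temporary file path.
        response = requests.get(url)
        out = tempfile.NamedTemporaryFile(delete=False)
        out.write(response.content)
        out.close()
        return out.name

    def teardown(self):
        self.driver.quit()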
Example #2
def fetcher(parameters):
    skip = 0
    gcd = None
    try:
        while True:
            url = URL.format(**parameters, limit=skip + 10, skip=skip)
            skip += 10
            results = None
            if gcd is None:
                # Try plain HTTP first; fall back to a headless browser
                # once a request fails or returns unparseable JSON.
                try:
                    results = requests.get(url)
                    results = results.json()
                except Exception as e:
                    print('FAILED to load from %s: %s' % (url, e))
                    if results is not None and hasattr(results, 'content'):
                        print('FAILED to parse JSON <pre>%s</pre>' %
                              results.content[:2048])
                    gcd = google_chrome_driver()
            if gcd is not None:
                results = gcd.json(url)
            results = results.get('results', [])
            yield from results
            if len(results) == 0:
                break
    finally:
        if gcd is not None:
            gcd.teardown()
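
A usage sketch for fetcher(): it pages through an API ten records at a time, switching from plain requests to the headless browser after the first failure. The URL template and parameter name below are invented for illustration.

# Illustrative only; the real URL template lives in the original module.
URL = 'https://example.com/api/search?q={query}&limit={limit}&skip={skip}'

for record in fetcher({'query': 'tenders'}):
    print(record)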
def get_explanations(url):
    logging.info('Connecting to %r', url)
    try:
        resp = requests.get(url, stream=True, timeout=300).raw
        outfile = tempfile.NamedTemporaryFile(delete=False,
                                              suffix=os.path.basename(url))
        shutil.copyfileobj(resp, outfile)
        archive = outfile.name
    except Exception:
        gcl = google_chrome_driver()
        archive = gcl.download(url)
        gcl.teardown()

    if '.tar.gz' in url:
        t_archive = tarfile.open(name=archive, mode='r|gz')
        files = ((os.path.basename(member.name), t_archive.extractfile(member))
                 for member in t_archive
                 if member is not None and member.isfile())
    elif '.zip' in url:
        z_archive = zipfile.ZipFile(archive)
        files = ((os.path.basename(member.filename), z_archive.open(member))
                 for member in z_archive.filelist)
    else:
        assert False, 'unsupported archive type: %s' % url

    for name, item in files:
        contents = base64.b64encode(item.read()).decode('ascii')
        yield {'contents': contents, 'orig_name': name}

    os.unlink(archive)
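
Note that mode='r|gz' opens the tarball as a forward-only stream, so members must be consumed in iteration order; the generator above respects that by reading each file before advancing. A small consumption sketch (URL and output directory invented):

import base64

for entry in get_explanations('https://example.com/explanations.tar.gz'):
    payload = base64.b64decode(entry['contents'])
    with open('/tmp/' + entry['orig_name'], 'wb') as out:
        out.write(payload)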
def flow(*_):
    gcd = google_chrome_driver()
    download = gcd.download(
        'https://data.gov.il/dataset/246d949c-a253-4811-8a11-41a137d3d613/resource/f004176c-b85f-4542-8901-7b3176f9a054/download/f004176c-b85f-4542-8901-7b3176f9a054.csv'
    )
    return Flow(
        load(download, cast_strategy=load.CAST_TO_STRINGS),
        concatenate(_get_columns_mapping_dict(),
                    target=dict(name='company-details')),
        set_type('id', type='string'),
        set_type('company_registration_date', type='date', format='%d/%m/%Y'),
        set_type('company_is_government',
                 type='boolean',
                 falseValues=['לא'],
                 trueValues=['כן']),
        set_type('company_is_mafera',
                 type='boolean',
                 falseValues=['לא'],
                 trueValues=['מפרה', 'התראה']),
        set_type('company_last_report_year', type='integer'),
        clear_bool_values,
        update_resource(**{'dpp:streaming': True},
                        resources='company-details'),
        set_primary_key(['id'], resources='company-details'),
        printer(),
    )
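
clear_bool_values and _get_columns_mapping_dict() are module-level helpers that this excerpt does not include. Assuming standard dataflows conventions, the returned Flow can be executed directly:

# Run the flow and materialize its rows (stock dataflows usage).
results, datapackage, stats = flow().results()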
def flow(*_):
    global gcd
    gcd = google_chrome_driver(wait=False)
    return DF.Flow(
        scraper(gcd),
        DF.update_resource(-1, **{'dpp:streaming': True}),
        finalize_teardown(gcd),
    )
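
finalize_teardown() appears here and in a later example but is never defined on this page. One plausible shape, assuming it is a dataflows package processor that quits the browser only after the last row has streamed through:

# Hypothetical sketch; the original finalize_teardown is not shown.
def finalize_teardown(gcd):
    def step(package):
        yield package.pkg   # pass the datapackage descriptor through
        yield from package  # stream every resource to completion
        gcd.teardown()      # only then close the browser
    return step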
Example #6
def wrapper(wait=False):
    gcd = None
    try:
        gcd = google_chrome_driver(wait=wait)
        return scraper(gcd)
    finally:
        logging.info('Tearing down %r', gcd)
        if gcd:
            gcd.teardown()
Example #7
def wrapper(year):
    gcd = None
    try:
        gcd = google_chrome_driver(initial='http://example.com/')
        return scraper(gcd, year)
    finally:
        logging.info('Tearing down %r', gcd)
        if gcd:
            gcd.teardown()
Example #8
def flow(*_):
    global gcd
    gcd = google_chrome_driver(initial=BASE_URL + MAIN_PAGES[0], wait=False)
    # import selenium
    # class G:
    #     def __init__(self):
    #         chrome_options = selenium.webdriver.ChromeOptions()
    #         chrome_options.debugger_address = 'localhost:9222'
    #         self.driver = selenium.webdriver.Chrome(options=chrome_options)
    # gcd = G()
    return DF.Flow(
        scraper(gcd),
        DF.update_resource(-1, **{'dpp:streaming': True}),
    )
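
The commented-out G class above attaches Selenium to an already-running Chrome through the DevTools protocol; for that to work, Chrome must have been started with remote debugging enabled on the matching port (the standard --remote-debugging-port=9222 flag).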
Example #9
def flow(parameters):
    dataset_name = str(parameters['dataset-name'])
    resource_name = str(parameters['resource-name'])
    resource = parameters.get('resource', {})
    resource.update({
        'dpp:streaming': True,
    })

    gcd = parameters.get('gcd') or google_chrome_driver()
    url, path = get_resource(gcd, dataset_name, resource_name)

    args = {'name': resource_name, 'http_timeout': 30}
    if '.xls' in path:
        args['force_strings'] = True

    return Flow(
        add_source('{}/{}'.format(dataset_name, resource_name), url),
        load(path, **args),
        update_resource(resource_name, **resource),
        finalize_teardown(gcd),
    )
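
get_resource() and add_source() are project helpers not shown in this excerpt. add_source presumably records provenance on the datapackage; one guess at its shape, built on the stock dataflows update_package step:

# Hypothetical sketch; the original add_source is not shown on this page.
from dataflows import update_package

def add_source(title, url):
    return update_package(sources=[{'title': title, 'path': url}])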
def get_explanations(url):
    logging.info('Connecting to %r', url)
    gcl = google_chrome_driver()
    archive = gcl.download(url)
    gcl.teardown()

    if '.tar.gz' in url:
        t_archive = tarfile.open(name=archive, mode='r|gz')
        files = ((os.path.basename(member.name), t_archive.extractfile(member))
                 for member in t_archive
                 if member is not None and member.isfile())
    elif '.zip' in url:
        z_archive = zipfile.ZipFile(archive)
        files = ((os.path.basename(member.filename), z_archive.open(member))
                 for member in z_archive.filelist)
    else:
        assert False, 'unsupported archive type: %s' % url

    for name, item in files:
        contents = base64.b64encode(item.read()).decode('ascii')
        yield {'contents': contents, 'orig_name': name}

    os.unlink(archive)
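
This variant of get_explanations() differs from the earlier one only in the download step: it always goes through the browser instead of first attempting a streamed requests.get() with a temporary-file fallback.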
Example #11
def scrape():

    gcd = google_chrome_driver(wait=False)
    driver = gcd.driver

    def prepare(msg=''):
        logging.info('PREPARING')
        driver.get("https://www.misim.gov.il/mm_lelorasham/firstPage.aspx")
        # time.sleep(3)
        # driver.execute_script('Display(1)');
        while True:
            try:
                bakasha = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "RadioBakasha1")))
                bakasha.click()
                time.sleep(3)
                return
            except TimeoutException:
                logging.warning(
                    'Failed to find radio button, retrying in an hour (%s)',
                    msg)
                time.sleep(3600)

    prepare()

    # Prepare options
    time.sleep(3)
    logging.info('GETTING OPTIONS')
    page = driver.page_source
    option_re = re.compile(r'<option value="(\d+)">([^<]+)</option>')
    options = option_re.findall(page)
    logging.info('GETTING OPTIONS: %r', options)
    options = dict((k.strip(), v.strip()) for k, v in options)

    def select_option(selection_):
        logging.info('OPTION %s (%s)', selection_, options[selection_])
        select = Select(driver.find_element_by_id('DropdownlistSugYeshut'))
        select.select_by_value(selection_)
        # driver.find_element_by_css_selector('option[value="%s"]' % selection_).click()
        time.sleep(3)
        search_button = driver.find_element_by_id('btnHipus')
        ActionChains(driver).move_to_element(search_button)\
                            .move_to_element_with_offset(search_button, xoffset=10, yoffset=10)\
                            .click()\
                            .perform()

    for selection in options.keys():
        if slugs.get(options[selection]) is None:
            logging.warning('SKIPPING option #%s (%s)', selection,
                            options[selection])
            continue
        logging.info('TRYING option #%s (%s)', selection, options[selection])
        prepare('#%s (%s)' % (selection, options[selection]))
        select_option(selection)
        while True:
            try:
                WebDriverWait(driver, 60, poll_frequency=5).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "#dgReshima tr.row1")))
            except TimeoutException:
                logging.warning('FAILED TO GET ROWS, RETRYING OPTION')
                time.sleep(10)
                prepare()
                select_option(selection)
                continue

            page = driver.page_source
            row_re = re.compile('(<td.+</td>)')
            for line in page.split('\n'):
                row = row_re.findall(line)
                if len(row) > 0:
                    row = row[0]
                    logging.info('ROW %r', row)
                    row = ([slugs[options[selection]]] +
                           [pq(x).text() for x in pq(row).find('td')])
                    if len(row) == 1:
                        continue
                    logging.info('ROW %r', row)
                    datum = dict(zip(headers, row))
                    the_id = datum['id']
                    if the_id not in scraped_ids:
                        scraped_ids.add(the_id)
                        yield datum

            if 'btnHaba' in page:
                try:
                    next_button = driver.find_element_by_id('btnHaba')
                except NoSuchElementException:
                    break
                ActionChains(driver).move_to_element(next_button)\
                                    .move_to_element_with_offset(next_button, xoffset=10, yoffset=10)\
                                    .click()\
                                    .perform()
                time.sleep(3)
            else:
                break

    gcd.teardown()
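
The scraper above leans on module-level slugs, headers and scraped_ids that the excerpt does not include. Their assumed shapes, for illustration only (every concrete value invented):

# Illustrative assumptions about the module-level state used above.
slugs = {'חברה': 'company'}        # maps entity-type labels to slugs
headers = ['kind', 'id', 'name']   # zipped against each row; must contain 'id'
scraped_ids = set()                # de-duplicates entities across result pages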
Example #12
def get_gcd():
    return google_chrome_driver()
Example #13
def batch_flow(parameters):
    gcd = google_chrome_driver()
    return Flow(*[flow(dict(**p, gcd=gcd)) for p in parameters['batch']])
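
A usage sketch for batch_flow(): every entry in the batch shares the one browser instance created up front (dataset and resource names invented):

parameters = {'batch': [
    {'dataset-name': 'some-dataset', 'resource-name': 'resource-a'},
    {'dataset-name': 'some-dataset', 'resource-name': 'resource-b'},
]}
batch_flow(parameters).process()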
Example #14
def flow(parameters):
    gcl = google_chrome_driver()
    parameters['from'] = gcl.download(parameters['from'])
    gcl.teardown()
    return load_flow(parameters)