    'name': 'titulo',
    'description': ''
}, {
    'type': 'String',
    'name': 'estado',
    'description': ''
}]}]})

scraperhelper.setPrintTimeTo(True)
browser = scraperhelper.initBrowser()

# input file
sessions_data = json.load(open('./data/sesiones.simple.1418.json'))
scraperhelper.pt('Get sessions from file')

# output lists
data = []
errors = []

# main script GO!
counting = 0
for session in sessions_data['data']:
    counting = counting + 1
    saved = False
    try:
        sesion = {
            "fecha": session['fecha'],
            "sesion": session['sesion'],
            "estado": session['estado'],
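# scraperhelper.pt and scraperhelper.setPrintTimeTo are used throughout
# these scripts but defined elsewhere in the repo. Judging by the calls
# above, pt is a progress logger that can stamp elapsed time; a guess at
# the idea, not the repo's actual implementation:
import time

_print_time = False
_start = time.time()

def setPrintTimeTo(value):
    global _print_time
    _print_time = value

def pt(message):
    # prefix each progress message with seconds since start when enabled
    if _print_time:
        print('[%8.2fs] %s' % (time.time() - _start, message))
    else:
        print(message)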
    {'type': 'String', 'name': 'prmid', 'description': ''}
]})

scraperhelper.setPrintTimeTo(True)
browser = scraperhelper.initBrowser()

# output lists
data = []
errors = []

# main script GO!
try:
    browser.get('https://www.camara.cl/trabajamos/sala_sesiones.aspx')
    scraperhelper.pt('Get First Site')
    option_selected = browser.find_element_by_css_selector(
        '#ctl00_mainPlaceHolder_ddlLegislaturas option[selected]'
    ).get_attribute('value')
    scraperhelper.pt('Get Handy Elements')
    while int(option_selected) > 45:
        scraperhelper.pt('Get year id: ' + str(option_selected) + ' ----------')
        page_number = browser.find_element_by_css_selector(
            '#detail .pages ul li.current').text
        subcount = 1
        while True:
            scraperhelper.pt('Get Sessions: Page ' + str(subcount))
            subcount = subcount + 1
            rows = browser.find_elements_by_css_selector(
                '#detail table.tabla tbody tr')
            for row in rows:
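# The loop above reads the highlighted page number before paginating. A
# minimal sketch of the next-page step, assuming the pager exposes a
# "next" link; the li.next selector and the wait-for-change check are
# assumptions, not code from this repo:
from selenium.webdriver.support.ui import WebDriverWait

def go_to_next_page(browser, previous_page_number, timeout=10):
    # click the pager's "next" link (assumed selector) ...
    browser.find_element_by_css_selector('#detail .pages ul li.next a').click()
    # ... then block until the highlighted page number changes, so the
    # next read of the sessions table sees the new page
    WebDriverWait(browser, timeout).until(
        lambda b: b.find_element_by_css_selector(
            '#detail .pages ul li.current').text != previous_page_number)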
    {'type': 'List', 'name': 'contra', 'description': ''},
    {'type': 'List', 'name': 'abstencion', 'description': ''},
    {'type': 'List', 'name': 'articulo_quinto', 'description': ''},
    {'type': 'List', 'name': 'pareos', 'description': ''},
]})

scraperhelper.setPrintTimeTo(True)
browser = scraperhelper.initBrowser()

# input files
sessions_data = json.load(open('./data/sesiones.extended.1418.json'))
saved_data = json.load(open('./data/votaciones.extended.1418.json'))
errors_data = json.load(open('./data/errors/votaciones.extended.1418.json'))
scraperhelper.pt('Get files')

# output lists
data = []
errors = []

# preload previously scraped data
for d in saved_data['data']:
    data.append(d)
scraperhelper.pt('Preload Data - ' + str(len(saved_data['data'])))

# main script GO!
counting = 0
for votacion_prmid in errors_data:
    counting = counting + 1
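# This fragment is a retry pass: it preloads everything already scraped
# into data and then revisits only the prmids recorded in the errors
# file. Condensed, the flow presumably amounts to (loop body assumed):
data = list(saved_data['data'])              # keep earlier results as-is
for votacion_prmid in errors_data:           # revisit only the failures
    url = ('https://www.camara.cl/trabajamos/sala_votacion_detalle.aspx'
           '?prmId=' + str(votacion_prmid))  # the detail page used later in this section
    # ... scrape the page, then append to data or re-record in errors ...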
    'name': 'periodo',
    'description': ''
}]})

scraperhelper.setPrintTimeTo(True)
browser = scraperhelper.initBrowser()

# output lists
data = []
errors = []

# main script GO!
try:
    browser.get('https://www.camara.cl/camara/diputados.aspx')
    scraperhelper.pt('Get Current Reps Site')
    content = browser.find_elements_by_css_selector('li.alturaDiputado h4 a')
    for el in content:
        data.append({
            "prmid": scraperhelper.getQueryParametersFromUrl(
                el.get_attribute('href'))[0],
            "nombre": str(el.text.replace('SR. ', '').replace('SRA. ', '')),
            "periodo": "2014-2018"
        })
except TimeoutException as ex:
    scraperhelper.pt('PAGE TimeoutException ERROR')
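# getQueryParametersFromUrl comes from the local scraperhelper module and
# is not shown in this section. Its use above (index [0] yields the prmId
# from a profile link) suggests it returns the query-string values in
# order; a stand-in built on the standard library, as an assumption:
from urllib.parse import urlparse, parse_qsl

def getQueryParametersFromUrl(url):
    # e.g. '...?prmId=1008' -> ['1008']
    return [value for _, value in parse_qsl(urlparse(url).query)]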
            {'type': 'List', 'name': 'region', 'description': ''},
            {'type': 'String', 'name': 'distrito', 'description': ''},
            {'type': 'String', 'name': 'partido', 'description': ''},
            {'type': 'String', 'name': 'tipo', 'description': ''}
        ]
    }
]})

scraperhelper.setPrintTimeTo(True)
browser = scraperhelper.initBrowser()

# input file
mociones = json.load(open('./data/mociones.simple.1418.json'))
scraperhelper.pt('Get Motions from file')

# saved files
output_saved = json.load(open('./data/mociones.extended.1418.json'))
errors_saved = json.load(open('./data/errors/mociones.extended.1418.json'))
indexes_saved = scraperhelper.getSavedIndexes(output_saved['data'], 'prmid')
scraperhelper.pt('Get saved files')

# output lists
data = output_saved['data']
errors = errors_saved

# main script GO!
all_count = len(mociones)
counting = 0
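# getSavedIndexes also lives in scraperhelper. Given that it receives the
# previously saved records plus a key name, and that the scripts resume
# from saved output, it presumably just collects those key values so
# already-scraped ids can be skipped; a minimal stand-in under that
# assumption:
def getSavedIndexes(records, key):
    # e.g. getSavedIndexes(output_saved['data'], 'prmid') -> ['1008', ...]
    return [record[key] for record in records]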
browser = scraperhelper.initBrowser()

# output lists
data = []
errors = []

# main script GO!
page = 1
try:
    browser.get('https://www.camara.cl/trabajamos/presolucion.aspx')
    browser.execute_script(
        "__doPostBack('ctl00$mainPlaceHolder$lblistadogral','')")
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located(
            (By.ID, 'ctl00_mainPlaceHolder_pnlResultados')))
    scraperhelper.pt('Get First Site and load list')
    page_number = browser.find_element_by_css_selector(
        '#detail .pages ul li.current').text
    while True:
        for tr in browser.find_elements_by_css_selector(
                '#main table.tabla tbody tr'):
            cols = tr.find_elements_by_tag_name('td')
            if len(cols) > 5:
                res = {
                    "ingreso": cols[0].text,
                    "numero": cols[1].text,
                    "titulo": cols[2].text,
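# The __doPostBack call above fires an ASP.NET server-side event, so the
# script waits for the results panel instead of a normal page load. The
# same pattern as a reusable helper (the helper itself is hypothetical;
# the ids come from the fragment above):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def do_postback_and_wait(browser, event_target, result_id, timeout=10):
    # trigger the postback the way the page's own links do ...
    browser.execute_script("__doPostBack(arguments[0], '')", event_target)
    # ... and block until the refreshed container is in the DOM
    WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.ID, result_id)))

# usage matching the call above:
# do_postback_and_wait(browser, 'ctl00$mainPlaceHolder$lblistadogral',
#                      'ctl00_mainPlaceHolder_pnlResultados')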
    'name': 'partido',
    'description': ''
}, {
    'type': 'String',
    'name': 'calidad',
    'description': ''
}]}]})

scraperhelper.setPrintTimeTo(True)
browser = scraperhelper.initBrowser()

# input file
resoluciones = json.load(open('./data/resoluciones.simple.1418.json'))
scraperhelper.pt('Get input file')

# saved files (fall back to empty state on a first run)
try:
    output_saved = json.load(open('./data/resoluciones.extended.1418.json'))
    errors_saved = json.load(
        open('./data/errors/resoluciones.extended.1418.json'))
    indexes_saved = scraperhelper.getSavedIndexes(output_saved['data'],
                                                  'res_prmid')
    data = output_saved['data']
    errors = errors_saved
except FileNotFoundError:
    output_saved = []
    errors_saved = []
    indexes_saved = []
    data = []
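# The try/except FileNotFoundError block above bootstraps a first run
# with empty state. The same idea as a hypothetical helper:
import json

def load_or_default(path, default):
    # return parsed JSON, or the given default when the file is missing
    try:
        with open(path) as f:
            return json.load(f)
    except FileNotFoundError:
        return default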
        # parliamentary committee paragraphs are concatenated into one string
        committees = summary[2].find_elements_by_css_selector('p')
        for co in committees:
            rep_extended['comite_parlamentario'] = rep_extended[
                'comite_parlamentario'] + co.text
        rep_extended['telefono'] = ficha.find_element_by_css_selector(
            'div.phones p').text.replace('Teléfono: ', '')
        rep_extended['correo'] = ficha.find_element_by_css_selector(
            'li.email a').text
        data.append(rep_extended)
        saved = True
    except TimeoutException as ex:
        scraperhelper.pt('PAGE TimeoutException ERROR')
    except NoSuchElementException as ex:
        scraperhelper.pt('PAGE NoSuchElementException ERROR')
    except StaleElementReferenceException as ex:
        scraperhelper.pt('PAGE StaleElementReferenceException ERROR')
    except WebDriverException as ex:
        scraperhelper.pt('PAGE WebDriverException ERROR')
    finally:
        scraperhelper.pt('Loaded Representative ' + rep['prmid'])
        if not saved:
            errors.append(rep['prmid'])
            print('----------- WITH ERROR! -------------')

scraperhelper.closeSeleniumBrowser(browser)
scraperhelper.saveToFile('diputados.extended.1418', data, errors)
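# scraperhelper.saveToFile is not shown here either. From the way other
# scripts in this section read the results back (records under
# './data/<name>.json' wrapped as {'data': [...]}, error ids under
# './data/errors/<name>.json'), a plausible sketch, assuming exactly
# those paths and that envelope:
import json

def saveToFile(name, data, errors):
    with open('./data/' + name + '.json', 'w') as f:
        json.dump({'data': data}, f, ensure_ascii=False)
    with open('./data/errors/' + name + '.json', 'w') as f:
        json.dump(errors, f, ensure_ascii=False)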
    'description': ''
}, {
    'type': 'List',
    'name': 'pareos',
    'description': ''
},
]})

scraperhelper.setPrintTimeTo(True)
browser = scraperhelper.initBrowser()

# input file
sessions_data = json.load(open('./data/sesiones.extended.1418.json'))
scraperhelper.pt('Get sessions from file')

# output lists
data = []
errors = []

# main script GO!
counting = 0
for session in sessions_data['data']:
    for voting in session['votaciones']:
        counting = counting + 1
        saved = False
        try:
            # Go to 'Detalle Votaciones'
            browser.get(
                'https://www.camara.cl/trabajamos/sala_votacion_detalle.aspx?prmId='
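# The fragment above cuts off mid-call while assembling the detail URL by
# string concatenation. For reference, an equivalent construction with
# the standard library, assuming the voting record exposes its id under
# 'prmid' as other records in these scripts do:
from urllib.parse import urlencode

url = ('https://www.camara.cl/trabajamos/sala_votacion_detalle.aspx?'
       + urlencode({'prmId': voting['prmid']}))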
    'name': 'entrega',
    'description': ''
}, {
    'type': 'String',
    'name': 'documento_link',
    'description': ''
}]}]})

scraperhelper.setPrintTimeTo(True)
browser = scraperhelper.initBrowser()

# input file
sesiones = json.load(open('./data/sesiones.extended.1418.json'))
scraperhelper.pt('Get input file')

# saved files
output_saved = json.load(open('./data/acuerdos.extended.1418.json'))
errors_saved = json.load(open('./data/errors/acuerdos.extended.1418.json'))
indexes_saved = scraperhelper.getSavedIndexes(output_saved['data'],
                                              'acuerdo_prmid')
scraperhelper.pt('Get saved files')

# output lists
data = output_saved['data']
errors = errors_saved

# main script GO!
all_count = len(sesiones['data'])
counting = 0
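# With indexes_saved in hand, the main loop presumably skips agreements
# already present in the output before hitting the network. A sketch of
# that resume guard (the per-session 'acuerdos' list and the loop shape
# are assumptions; 'acuerdo_prmid' is the key passed to getSavedIndexes
# above):
for sesion in sesiones['data']:
    counting = counting + 1
    for acuerdo_prmid in sesion.get('acuerdos', []):
        if acuerdo_prmid in indexes_saved:
            continue  # already scraped on a previous run
        # ... fetch and parse this acuerdo ...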