def get_registraduria_with_params():
    """Scrape contratos.gov.co search results for every combination of the
    query parameters below, then parse the downloaded pages.

    Workflow:
      1. ``scrape`` expands the cartesian product of the values in ``params``,
         fetches each resulting URL, and stores the raw responses in
         ``scrape_data_folder``.
      2. ``parse`` runs ``SampleHTMLParser`` (which counts the anchors and
         paragraphs in each file) over everything in ``scrape_data_folder``
         and writes the results to ``extracted_data_folder``.
    """
    scrape_data_folder = "registraduria_raw_data"
    extracted_data_folder = "registraduria_extracted_data"
    params = {
        "objeto": [
            "10000000", "11000000", "12000000", "15000000", "13000000",
            "14000000", "27000000", "20000000", "21000000",
        ],
        "paginaObjetivo": ["1"],
        "cuantias": ["1", "2", "3"],
    }
    # BUGFIX: the original literal contained the mojibake "®istrosXPagina" —
    # "&reg" inside "&registrosXPagina" had been decoded as the HTML entity
    # &reg; (the ® character), so that query parameter was silently lost.
    # Restored to "&registrosXPagina=10".
    url = "https://www.contratos.gov.co/consultas/resultadosConsulta.do?&ctl00$ContentPlaceHolder1$hidIDProducto=-1&ctl00$ContentPlaceHolder1$hidRedir=&departamento=&ctl00$ContentPlaceHolder1$hidNombreDemandante=-1&ctl00$ContentPlaceHolder1$hidNombreProducto=-1&fechaInicial=&ctl00$ContentPlaceHolder1$hidIdEmpresaC=0&ctl00$ContentPlaceHolder1$hidIdOrgV=-1&ctl00$ContentPlaceHolder1$hidIDProductoNoIngresado=-1&ctl00$ContentPlaceHolder1$hidRangoMaximoFecha=&fechaFinal=&desdeFomulario=true&ctl00$ContentPlaceHolder1$hidIdOrgC=-1&ctl00$ContentPlaceHolder1$hidIDRubro=-1&tipoProceso=&registrosXPagina=10&numeroProceso=&municipio=0&estado=0&ctl00$ContentPlaceHolder1$hidNombreProveedor=-1&ctl00$ContentPlaceHolder1$hidIdEmpresaVenta=-1"
    print("scraping registraduria...")
    scrape(url, output_folder=scrape_data_folder, params_and_values=params, workers=10)
    print("parsing registraduria..")
    parse(input_folder=scrape_data_folder, output_folder=extracted_data_folder, parser=SampleHTMLParser(), workers=8)
def busqueda_de_vias():
    """Fetch the per-department road pages from monitoreoinvias.com and
    extract the search results from the downloaded files.

    Raw pages go to the module-level ``scrape_busqueda_data_folder``;
    parsed output goes to ``extracted_busqueda_data_folder``.
    """
    # Two-digit department codes substituted into the "id" query parameter.
    department_ids = [
        "05", "15", "17", "18", "19", "27", "25", "95",
        "41", "52", "54", "63", "66", "68", "73", "76",
    ]
    url = "https://monitoreoinvias.com/publico-departamento.php"
    scrape(
        url,
        output_folder=scrape_busqueda_data_folder,
        params_and_values={"id": department_ids},
        workers=10,
    )
    parse(
        input_folder=scrape_busqueda_data_folder,
        output_folder=extracted_busqueda_data_folder,
        parser=MonitoreoViasParserSearchResults(),
        workers=10,
    )
def busqueda_de_vias():
    """Scrape monitoreoinvias.com once per department id and parse the
    resulting pages with ``MonitoreoViasParserSearchResults``.

    Output locations come from the module-level globals
    ``scrape_busqueda_data_folder`` and ``extracted_busqueda_data_folder``.
    """
    base_url = "https://monitoreoinvias.com/publico-departamento.php"
    # "id" selects the department whose roads are listed on the page.
    query_values = {
        "id": [
            "05", "15", "17", "18",
            "19", "27", "25", "95",
            "41", "52", "54", "63",
            "66", "68", "73", "76",
        ],
    }
    scrape(base_url,
           output_folder=scrape_busqueda_data_folder,
           params_and_values=query_values,
           workers=10)
    parse(input_folder=scrape_busqueda_data_folder,
          output_folder=extracted_busqueda_data_folder,
          parser=MonitoreoViasParserSearchResults(),
          workers=10)
def get_registraduria_with_params():
    """Combine all parameter values in ``params``, scrape every resulting
    contratos.gov.co URL, and parse the downloaded files.

    Step 1: ``scrape`` saves one raw file per parameter combination into
    ``scrape_data_folder``.
    Step 2: ``parse`` applies ``SampleHTMLParser`` — which counts the anchors
    and paragraphs in each file — and writes its output to
    ``extracted_data_folder``.
    """
    scrape_data_folder = "registraduria_raw_data"
    extracted_data_folder = "registraduria_extracted_data"
    params = {
        "objeto": [
            "10000000", "11000000", "12000000", "15000000", "13000000",
            "14000000", "27000000", "20000000", "21000000",
        ],
        "paginaObjetivo": ["1"],
        "cuantias": ["1", "2", "3"],
    }
    # BUGFIX: restored "&registrosXPagina=10" — the original string held the
    # mojibake "®istrosXPagina" because "&reg" had been interpreted as the
    # HTML entity &reg; (®), dropping the parameter from the request.
    url = "https://www.contratos.gov.co/consultas/resultadosConsulta.do?&ctl00$ContentPlaceHolder1$hidIDProducto=-1&ctl00$ContentPlaceHolder1$hidRedir=&departamento=&ctl00$ContentPlaceHolder1$hidNombreDemandante=-1&ctl00$ContentPlaceHolder1$hidNombreProducto=-1&fechaInicial=&ctl00$ContentPlaceHolder1$hidIdEmpresaC=0&ctl00$ContentPlaceHolder1$hidIdOrgV=-1&ctl00$ContentPlaceHolder1$hidIDProductoNoIngresado=-1&ctl00$ContentPlaceHolder1$hidRangoMaximoFecha=&fechaFinal=&desdeFomulario=true&ctl00$ContentPlaceHolder1$hidIdOrgC=-1&ctl00$ContentPlaceHolder1$hidIDRubro=-1&tipoProceso=&registrosXPagina=10&numeroProceso=&municipio=0&estado=0&ctl00$ContentPlaceHolder1$hidNombreProveedor=-1&ctl00$ContentPlaceHolder1$hidIdEmpresaVenta=-1"
    print("scraping registraduria...")
    scrape(url, output_folder=scrape_data_folder, params_and_values=params, workers=10)
    print("parsing registraduria..")
    parse(input_folder=scrape_data_folder, output_folder=extracted_data_folder, parser=SampleHTMLParser(), workers=8)