def _id_unidad_from_href(href):
    """Return the integer ``idUnidOrganica`` query value found in *href*.

    Raises ``ValueError`` when the text following ``idUnidOrganica=`` (up to
    the next ``&``) is not an integer.
    """
    # Read what follows the parameter name up to the next "&", so the result
    # does not depend on idUnidOrganica being the first query parameter.
    valor = href.partition("idUnidOrganica=")[2].split("&", 1)[0]
    return int(valor)


def tratar_gob_es(total, visto, id, raiz, padre):
    """Parse the saved page of one organic unit and recurse into its children.

    The page is read from ``fuentes/administracion.gob.es/id_<id>.html``
    (``id`` zero-padded to six digits). Every ``section div`` whose text looks
    like ``key: value`` is inspected for the unit code, the address and the
    links to child units. When a unit code is found, an ``Organismo`` is
    created, stored in ``visto[id]`` and each child id is processed
    recursively with this unit's code as its ``padre``.

    Args:
        total: Denominator used only for the progress percentage. A falsy
            value (e.g. 0) is reported as 0% instead of dividing by zero.
        visto: Dict mapping already processed ids to their ``Organismo``;
            updated in place.
        id: Numeric id of the unit to process (name kept for callers, even
            though it shadows the builtin).
        raiz: Value forwarded unchanged as ``idRaiz`` to every ``Organismo``
            created in this traversal.
        padre: Code of the parent unit, or a falsy value when there is none.

    Returns:
        None. Results are accumulated in ``visto``.
    """
    if id in visto:
        # Already processed: just remember the additional parent.
        if padre:
            visto[id].idPadres.add(padre)
        return

    porcentaje = len(visto) * 100 / total if total else 0
    # "\r" keeps the progress report on a single console line.
    print("%3d%% completado: %6d" % (porcentaje, id), end="\r")

    fn = "fuentes/administracion.gob.es/id_%06d.html" % id
    if not os.path.isfile(fn):
        return

    soup = soup_from_file(fn)
    # Drop the ".hideAccessible" nodes so their text does not end up in the
    # parsed "key: value" pairs.
    for n in soup.select(".hideAccessible"):
        n.extract()

    codigo = None
    hijos = set()
    deDireccion = None
    latlon = None

    for div in soup.select("section div"):
        # Line breaks become spaces so adjacent text fragments do not fuse.
        for br in div.findAll("br"):
            br.replaceWith(" ")
        txt = re_bk.sub(" ", div.get_text()).strip()
        if ":" not in txt:
            continue
        key, value = [i.strip() for i in txt.split(":", 1)]
        if key == "Código de unidad orgánica":
            codigo = value
        elif key == "Estructura orgánica":
            hijos = {
                _id_unidad_from_href(a.attrs["href"])
                for a in div.select("a[href]")
                if "idUnidOrganica=" in a.attrs["href"]
            }
        elif key == "Dirección":
            deDireccion = value

    if not codigo:
        # Without a unit code there is nothing to register.
        return

    deOrganismo = re_bk.sub(
        " ", soup.select("h1.ppg-heading")[0].get_text()).strip()

    # re_map both selects the <img> by its src and captures the value stored
    # as latlon in its first group.
    img = soup.find("img", attrs={"src": re_map})
    if img:
        latlon = re_map.search(img.attrs["src"]).group(1)

    # NOTE(review): padre is only added to idPadres when the unit is seen
    # again (branch at the top); it is not passed here on first creation.
    # Confirm that Organismo / the caller records the first parent elsewhere.
    org = Organismo(codigo, deOrganismo, deDireccion, latlon=latlon,
                    idRaiz=raiz, idUnidOrganica=id)
    visto[id] = org

    for h in hijos:
        tratar_gob_es(total, visto, h, raiz, codigo)
# Tuples of (year, letter code, BOE identifier).
convocatorias = (
    (2016, 'L', 'BOE-A-2018-991'),
    (2015, 'L', 'BOE-A-2016-12467'),
)

# Progress denominator: one step for the province table plus one per
# spreadsheet, PDF and convocatoria.
total = 1 + len(xlss) + len(pdfs) + len(convocatorias)
count = 1

print("Leyendo puestos")
print("%3d%% completado: cod_provincia.htm" % (count * 100 / total,),
      end="\r")

# Lookup tables; "provincias" maps the numeric province code to its name.
idde = {"provincias": {}}
soup = soup_from_file("fuentes/cod_provincia.htm")
for tr in soup.select("table.miTabla tr"):
    tds = [td.get_text().strip() for td in tr.findAll("td")]
    # Only two-cell rows whose first cell is a numeric code are data rows.
    if len(tds) != 2 or not tds[0].isdigit():
        continue
    cod, prov = tds
    idde["provincias"][int(cod)] = prov

todos = []
organismos = {}

# Only the statements visible in this chunk are reproduced for this loop.
for xls in xlss:
    count += 1
    print("%3d%% completado: %-30s" %
          (count * 100 / total, os.path.basename(xls)), end="\r")
    wb = xlrd.open_workbook(xls)
    sh = wb.sheet_by_index(0)