def scrape_grants_for_fy(year):
    b.open(PAST_GRANTS_URL)
    try:
        b.select_form(name="Form1")
        # Search the whole financial year: 1 April `year` to 31 March `year + 1`.
        b["oUcStartDate$ddlDay"] = ["1"]
        b["oUcStartDate$ddlMonth"] = ["4"]
        b["oUcStartDate$ddlYear"] = [str(year)]
        b["oUcEndDate$ddlDay"] = ["31"]
        b["oUcEndDate$ddlMonth"] = ["3"]
        b["oUcEndDate$ddlYear"] = [str(year + 1)]
        resp = b.submit()
    except mechanize._form.ItemNotFoundError:
        print("ERROR: could not submit form. This usually means you're "
              "trying to scrape for a year that doesn't exist "
              "on the GOTW website.", file=sys.stderr)
        raise

    page = PyQuery(resp.read())
    # Every result row except the grid header describes one grant.
    for r in page("table tr:not(.GridHeader)"):
        grant = {}
        anchors = PyQuery(r).find('a')
        grant['id'] = anchors.eq(0).attr.title
        grant['title'] = anchors.eq(0).text()
        grant['pi'] = pi = {}
        pi['id'] = util.extract_id(anchors.eq(1).attr.href, 'Person')
        pi['name'] = anchors.eq(1).text()
        grant['organisation'] = org = {}
        org['id'] = util.extract_id(anchors.eq(2).attr.href, 'Organisation')
        org['name'] = anchors.eq(2).text()
        grant['department'] = dept = {}
        dept['id'] = util.extract_id(anchors.eq(3).attr.href, 'Department')
        dept['name'] = anchors.eq(3).text()
        value = PyQuery(r).find('span').eq(0).attr.title
        grant['value'] = util.extract_monetary_value(value)
        yield grant

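# A minimal usage sketch, not part of the scraper itself: it assumes the
# module-level browser `b` and PAST_GRANTS_URL are already configured. The
# helper name and the JSON layout below are illustrative only.
def _example_dump_fy_to_json(year, path):
    import json  # local import so the sketch stays self-contained
    grants = list(scrape_grants_for_fy(year))
    with open(path, "w") as f:
        json.dump(grants, f, indent=2)

# e.g. _example_dump_fy_to_json(2012, "grants-2012-13.json")
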
def _extract_multiple_ids(elem, type):
    # Collect an {"id", "name"} dict for every link of the given GOTW
    # entity type (e.g. "Person", "Department") found under `elem`.
    res = []
    for el in (PyQuery(x) for x in elem.find("a")):
        o = {}
        o["id"] = util.extract_id(el.attr.href, type)
        o["name"] = el.text()
        res.append(o)
    return res

def _scrape_pi(g, el):
    pi_el = el.find("a#hlPrincipalInvestigator + a").eq(0)
    g["pi"] = pi = {}
    pi["id"] = util.extract_id(pi_el.attr.href, "Person")
    pi["name"] = pi_el.text()

def _scrape_departments(o, el):
    o["departments"] = ds = []
    for e in (PyQuery(x) for x in el.find("table#dgDetails tr td a")):
        # Only links that point at a department page belong in the list.
        if e.attr.href.startswith("NGBOViewDepartment.aspx?DepartmentId="):
            ds.append({"id": util.extract_id(e.attr.href, "Department"),
                       "name": e.text()})
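
# Illustrative only: a sketch of how the helpers above might be combined once
# a grant detail page has been fetched into a PyQuery document `el`. The
# function name and the "#tblOtherInvestigators" selector are assumptions,
# not selectors taken from the real GOTW markup.
def _example_scrape_grant_detail(el):
    g = {}
    _scrape_pi(g, el)
    g["otherInvestigators"] = _extract_multiple_ids(
        el.find("#tblOtherInvestigators"), "Person")
    return g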