def incorporate_row(row, graph):
    """Merge one row of candidate registration data into ``graph``.

    The row is matched to an existing candidate run in ``graph`` (same
    general election, same division, same candidate) and the run, its
    party and its registration agent are enriched with the row's data.

    Columns read from ``row`` (by index):
        1  -- run identifier, stored as an integer ``dgeqRunId``
        3  -- party name, stored as the party's ``FOAF.name``
        5  -- registration agent name, expected as ``"Last, First"``
        6  -- integer division identifier, looked up via ``dgeqDivisionId``
        7  -- gender flag; ``'F'`` means female, anything else male
        9  -- candidate last name
        10 -- candidate first name
        11 -- registration date, stored verbatim as a literal

    Args:
        row: indexable sequence of strings with at least 12 entries.
        graph: graph already containing the divisions and candidate runs;
            it is modified in place.

    Raises:
        StopIteration: if no division in ``graph`` carries the row's
            division identifier.
        AssertionError: if the graph does not contain exactly one matching
            candidate run.
    """
    general_election_id = ns_election['72']  # change after the next elections
    bsq = int(row[6])
    division_id = next(graph.subjects(ns_property.dgeqDivisionId, Literal(bsq)))

    lastname = row[9]
    firstname = row[10]
    gender = 'female' if row[7] == 'F' else 'male'
    candidate_id = ns_candidate[str2id(lastname + '_' + firstname)]
    graph.add((candidate_id, FOAF.gender, Literal(gender)))

    query = """select distinct ?run ?party where {
        ?elecnode rdf:type type:ProvincialElection .
        ?elecnode prop:generalElection <%s> .
        ?elecnode prop:division <%s> .
        ?run prop:election ?elecnode .
        ?run prop:runningCandidate <%s> .
        ?run prop:runningFor ?party .
    }
    """ % (general_election_id, division_id, candidate_id)
    qres = graph.query(
        query, initNs={'rdf': RDF, 'prop': ns_property, 'type': ns_type})

    # Explicit check instead of a bare ``assert`` so that it still runs under
    # ``python -O``; AssertionError is kept so existing callers see the same
    # exception type.
    runs = list(qres)
    if len(runs) != 1:
        raise AssertionError(
            'expected exactly one run for candidate %s in division %s, '
            'found %d' % (candidate_id, division_id, len(runs)))
    run_id, party_id = runs[0]

    graph.add((run_id, ns_property.dgeqRunId, Literal(int(row[1]))))
    graph.add((run_id, ns_property.dateRegistered, Literal(row[11])))
    graph.add((party_id, RDF.type, FOAF.Organization))
    graph.add((party_id, FOAF.name, Literal(row[3])))

    agent_name = row[5]
    try:
        # Only the unpacking is guarded: a name that does not split into
        # exactly two parts on ', ' is deliberately ignored.
        agent_lastname, agent_firstname = agent_name.split(', ')
    except ValueError:
        # Messed-up agent names are knowingly skipped.
        pass
    else:
        # Outside the ``try`` so that a ValueError raised by the helpers
        # below is no longer silently swallowed.
        agent_id = ns_agent[str2id(agent_lastname + '_' + agent_firstname)]
        save_person(graph, agent_id, agent_firstname, agent_lastname)
        graph.add((run_id, ns_property.registrationAgent, agent_id))
def get_graph_from_soup(soup, dgeqid):
    """Build a graph from a parsed general-election results page.

    The page is expected to contain an ``<h2 id="e">`` header holding a
    four-digit year and a ``<table class="tableau">`` whose rows are one of:

    * header rows without any ``<td>`` (skipped);
    * division rows, whose first cell has the CSS class
      ``circonscription-precedante``; each one starts a new election node
      for that division;
    * summary rows, whose first cell has ``colspan="4"``; the four numbers
      following colons are stored as valid, invalid, total and admissible
      voter counts of the current election node;
    * candidate rows, whose first cell reads ``"Last, First (Party)"`` and
      whose second cell holds the vote count.

    The first candidate row after a division row is flagged with ``won``;
    this presumably relies on the page listing the winner first.

    Args:
        soup: parsed results page (called as ``soup(tag, ...)`` to search).
        dgeqid: key of the general election in ``ns_election``.

    Returns:
        A new ``Graph`` describing the general election, its divisions,
        candidates and candidate runs.

    Raises:
        IndexError: if the year header or the results table is missing.
        ValueError: if a summary or candidate row appears before any
            division row, if a summary row has fewer than four counts, or
            if a candidate cell is not of the form ``"Last, First (Party)"``.
    """
    graph = Graph()

    # extract election year
    header = soup('h2', id='e')[0].get_text()
    election_year = str2int(re.findall(r"\d{4}", header)[0])
    print(election_year)

    table = soup('table', class_='tableau')[0]
    rows = table('tr')

    general_election_id = ns_election[dgeqid]
    graph.add((general_election_id, RDF.type, ns_type.ProvincialGeneralElection))
    graph.add((general_election_id, ns_property.year, Literal(election_year)))

    # Patterns are loop-invariant, so compile them once.
    summary_count_re = re.compile(r"(?<=:)[\s\d]+")
    candidate_re = re.compile(r"(.+?), (.+) \((.+)\)")
    # Order matches the order of the counts in a summary row.
    summary_properties = (
        ns_property.validVoteCount,
        ns_property.invalidVoteCount,
        ns_property.totalVoteCount,
        ns_property.admissibleVoterCount,
    )

    # State carried from the latest division row to the rows that follow it.
    election_id = None
    first = False

    for row in rows:
        cell = row.td
        if not cell:
            # header row, skip
            continue

        if 'circonscription-precedante' in cell.attrs.get('class', ()):
            # division name row: open a new election node for this division
            name = cell.get_text().strip()
            division_id = ns_division[str2id(name)]
            graph.add((division_id, RDF.type, ns_type.ProvincialDivision))
            graph.add((division_id, DC.description, Literal(name)))
            election_id = BNode()
            graph.add((election_id, RDF.type, ns_type.ProvincialElection))
            graph.add((election_id, ns_property.generalElection,
                       general_election_id))
            graph.add((election_id, ns_property.division, division_id))
            first = True
            continue

        if election_id is None:
            # Without this guard the code below would fail on an unbound
            # local, hiding the actual problem with the page.
            raise ValueError(
                'result row found before any division row: %r'
                % cell.get_text().strip())

        if cell.attrs.get('colspan') == '4':
            # summary row
            summary = cell.get_text().strip()
            counts = summary_count_re.findall(summary)
            if len(counts) < len(summary_properties):
                raise ValueError(
                    'summary row has fewer than %d counts: %r'
                    % (len(summary_properties), summary))
            for prop, raw_count in zip(summary_properties, counts):
                graph.add((election_id, prop, Literal(str2int(raw_count))))
            continue

        # normal row: one candidate and their vote count
        cells = row.find_all('td')
        name = cells[0].get_text().strip()
        m = candidate_re.match(name)
        if m is None:
            raise ValueError(
                'cannot parse candidate cell as "Last, First (Party)": %r'
                % name)
        lastname, firstname, party_label = m.groups()
        party_id = str2party(party_label)
        candidate_id = ns_candidate[str2id(lastname + '_' + firstname)]
        votes = str2int(cells[1].get_text())

        graph.add((candidate_id, RDF.type, ns_type.ProvincialCandidate))
        save_person(graph, candidate_id, firstname, lastname)

        run_id = BNode()
        graph.add((run_id, RDF.type, ns_type.ProvincialCandidateRun))
        graph.add((run_id, ns_property.runningCandidate, candidate_id))
        graph.add((run_id, ns_property.election, election_id))
        graph.add((run_id, ns_property.runningFor, party_id))
        graph.add((run_id, ns_property.voteCount, Literal(votes)))
        if first:
            # first candidate listed after the division row
            graph.add((run_id, ns_property.won, Literal(True)))
            first = False

    return graph