# Collect one (battle URI, provenance URL) pair per SPARQL binding. A binding
# whose 'prov' entry carries no 'value' is stored with an empty provenance.
for result in results['results']['bindings']:
    battlesWithProv.append(
        (result['battle']['value'], result['prov'].get('value', ''))
    )

# Drop duplicate pairs (the resulting order is arbitrary).
battlesWithProv = list(set(battlesWithProv))
print("{} instances identified.".format(len(battlesWithProv)))

print("Downloading Wikipedia pages for each identified battle.")
# Each failed download is recorded as (battle URI, attempted URL, HTTP status).
dlErrors = []
with progress.Bar(expected_size=len(battlesWithProv)) as bar:
    for i, (b, p) in enumerate(battlesWithProv):
        name = uriToName(b)
        if p != '':
            # Percent-encode the provenance URL, keeping URL punctuation as-is.
            url = urllib.parse.quote(p.encode('utf-8'), ':/=?()')
        else:
            # No provenance available: fall back to the English Wikipedia page.
            url = "https://en.wikipedia.org/wiki/{}".format(name)
        try:
            urllib.request.urlretrieve(
                url, "rawData/html/battles/{}.html".format(name)
            )
        except urllib.error.HTTPError as err:
            # Store the real status code ('404' for a missing page, as before)
            # instead of labelling every HTTP failure as a 404.
            dlErrors.append((b, url, str(err.code)))
        bar.show(i + 1)

print("{} pages successfully downloaded.".format(len(battlesWithProv) - len(dlErrors)))
# Extract the combatants of every listed battle from its downloaded HTML page.
#
# Steps performed by the statements below:
#   1. Read "processedData/listBattles.csv" and store column 0 of every row
#      except the first one in `battles`.
#   2. For each battle, parse "rawData/html/battles/<uriToName(battle)>.html"
#      with the 'lxml' parser and look up the text node "Belligerents".
#   3. Take the second next sibling of that node's grandparent and walk its
#      <td> cells; the cell index is recorded as `team`.
#   4. For every <a> in a cell that has no <img> child, keep the part of its
#      href after the last '/', pass it through nameToUri, and append
#      (battle, team, href) to `combatants` unless the result contains
#      'File:' or '#'.
#
# NOTE(review): this fragment ends inside the `try:` block; its handler is
# not visible here, so the behaviour on a missing "Belligerents" node or a
# missing href cannot be confirmed from this view.
# NOTE(review): the handle returned by open() for the HTML file is passed
# straight to BeautifulSoup and is not closed explicitly in the visible code.
from bs4 import BeautifulSoup from clint.textui import progress battles = [] with open("processedData/listBattles.csv") as f: w = csv.reader(f, delimiter=',', quotechar='"') for i, row in enumerate(w): if i != 0: battles.append(row[0]) battlesWithCombatants = [] battlesWithoutCombatants = [] print("Identifying combatants for battles in downloaded HTML files...") with progress.Bar(expected_size=len(battles)) as bar: for i, battle in enumerate(battles): n = uriToName(battle) s = BeautifulSoup(open("rawData/html/battles/{}.html".format(n)), 'lxml') address = s.find(text="Belligerents") try: tr = address.parent.parent.nextSibling.nextSibling td = tr.find_all('td') combatants = [] for team, cell in enumerate(td): anchors = cell.find_all('a') for a in anchors: if not a.img: href = a.get('href') href = href[href.rfind('/')+1:] href = nameToUri(href) if 'File:' not in href and '#' not in href: combatants.append((battle, team, href))
# Collect one (x URI, y URI) pair per SPARQL binding.
# NOTE(review): presumably "x is part of y" - confirm against the query.
for result in results["results"]["bindings"]:
    battlesPartOf.append((result["x"]["value"], result["y"]["value"]))

# Build a graph with one vertex per distinct URI and one edge per pair.
g = Graph()
g.vertex_properties["uri"] = g.new_vertex_property("string")
g.vertex_properties["Label"] = g.new_vertex_property("string")
# URI -> vertex lookup, kept on the graph object so each URI is added once.
g.nv = {}
for o, t in battlesPartOf:
    endpoints = []
    for uri in (o, t):
        if uri not in g.nv:
            # First time this URI is seen: create and label its vertex.
            vertex = g.add_vertex()
            g.nv[uri] = vertex
            g.vp.uri[vertex] = uri
            g.vp.Label[vertex] = uriToName(uri)
        endpoints.append(g.nv[uri])
    v1, v2 = endpoints
    g.add_edge(v1, v2)
g.save("graphs/battlesPartOf.graphml")

# newline="" lets the csv module control line endings itself; without it,
# rows end in "\r\r\n" on platforms that translate "\n" on write.
with open("processedData/battlesPartOf.csv", "w", newline="") as f:
    w = csv.writer(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_NONNUMERIC)