Example #1
import json

# The project's own helper modules, assumed importable under these names:
# es provides the scraping utilities, gfd provides GraphMaker
import es
import gfd


def graphs_from_files():
	filenames = ["Adam Smith Business School",
				"Dental School",
				"School of Chemistry",
				"School of Critical Studies",
				"School of Culture and Creative Arts",
				"School of Education"
				]

	schools_tree = es.get_tree("http://www.gla.ac.uk/schools/")
	# EXSLT regular-expressions namespace lets the XPath predicate filter hrefs by regex
	ns = 'http://exslt.org/regular-expressions'
	path = '//div[@class="row standardContent"]//a[re:match(@href, "schools/[A-Za-z]+/")]'
	a_elems = schools_tree.xpath(path, namespaces={'re':ns})
	base_url = "http://www.gla.ac.uk"
	urls = []
	names = []

	# Build each school's staff-page URL from the link's relative href
	for a in a_elems:
		staff_page_url = base_url + a.get("href") + "staff/"
		urls.append(staff_page_url)
		school_name = a.text
		names.append(school_name)

	school_names_urls = list(zip(names, urls))
	print(school_names_urls)

	for name, url in school_names_urls:
		if name in filenames:
			# Load the previously scraped coauthor data for this school
			with open("../coauthor_data/" + name + ".txt") as f:
				d = json.load(f)

			staff_names = es.get_names(url)
			gm = gfd.GraphMaker(d, staff_names)
			gm.write_to_file(name + " graph")
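Both examples filter anchor tags with a regex inside XPath via lxml's EXSLT extensions. A minimal, self-contained sketch of just that technique (the HTML snippet here is made up for illustration):

from lxml import etree

html = """
<div class="row standardContent">
	<a href="/schools/business/">Adam Smith Business School</a>
	<a href="/media/contact/">Not a school link</a>
</div>
"""

tree = etree.HTML(html)
# lxml exposes the EXSLT regular-expressions functions under this namespace;
# re:match keeps only the <a> elements whose href matches the pattern
ns = {'re': 'http://exslt.org/regular-expressions'}
links = tree.xpath('//a[re:match(@href, "schools/[A-Za-z]+/")]', namespaces=ns)
for a in links:
	print(a.get("href"), "->", a.text)  # only the /schools/... link survives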
Example #2
# The project's own helper modules, assumed importable under these names:
# es provides the scraping utilities, gfd provides GraphMaker
import es
import gfd


def get_and_graph():
	schools_tree = es.get_tree("http://www.gla.ac.uk/schools/")
	ns = 'http://exslt.org/regular-expressions'
	path = '//div[@class="row standardContent"]//a[re:match(@href, "schools/[A-Za-z]+/")]'
	a_elems = schools_tree.xpath(path, namespaces={'re':ns})
	base_url = "http://www.gla.ac.uk"
	urls = []
	names = []

	for a in a_elems:
		staff_page_url = base_url + a.get("href") + "staff/"
		urls.append(staff_page_url)
		school_name = a.text
		names.append(school_name)

	school_names_urls = list(zip(names, urls))
	print(school_names_urls)

	# Skip Physics for now because its staff list is huge (SOCS was already done in an earlier run)
	school_names_urls = [tup for tup in school_names_urls if "Physics" not in tup[0]]

	# For each remaining school ([10:] presumably skips schools already processed)
	for name, url in school_names_urls[10:]:
		print(name, url)

		# Normalise the scraped Humanities link text to the school's proper name
		if "Humanities" in name:
			name = "School of Humanities"
		author_name_urls = es.get_author_name_urls(name, url)
		# (writing these to file for safe keeping is already handled inside es)
		coauthor_dict = es.get_coauthors_dict(author_name_urls, name)

		# Reorder each author name from "Surname, Forename" to "Forename Surname"
		for paper_id, data in coauthor_dict.items():
			authors = data["authors"]
			newauthors = [(anu[0].split(", ")[1] + " " + anu[0].split(", ")[0], anu[1]) for anu in authors]
			coauthor_dict[paper_id]["authors"] = newauthors

		# Do the same for author_name_urls
		# TODO is this necessary? We match against URLs, so gm could be given just the URLs
		author_name_urls = [(anu[0].split(", ")[1] + " " + anu[0].split(", ")[0], anu[1]) for anu in author_name_urls]
		# Build the graph, compute its metrics, and write it out
		gm = gfd.GraphMaker()
		gm.populate_graph(coauthor_dict, author_name_urls)
		gm.add_metrics()
		gm.add_just_school_community()
		gm.write_to_file("../newestgraphs/" + name + ".json")