Code example #1
Score: 0
File: makegraph.py — Project: tvzeller/SumProj
def get_and_graph():
	schools_tree = es.get_tree("http://www.gla.ac.uk/schools/")
	ns = 'http://exslt.org/regular-expressions'
	path = '//div[@class="row standardContent"]//a[re:match(@href, "schools/[A-Za-z]+/")]'
	a_elems = schools_tree.xpath(path, namespaces={'re':ns})
	base_url = "http://www.gla.ac.uk"
	urls = []
	names = []

	for a in a_elems:
		staff_page_url = base_url + a.get("href") + "staff/"
		urls.append(staff_page_url)
		school_name = a.text
		names.append(school_name)

	school_names_urls = zip(names, urls)
	print school_names_urls

	#remove SOCS as done already, physics for now cause it's huge
	for tup in school_names_urls[:]:
		if "Physics" in tup[0]:
			school_names_urls.remove(tup)


	# For each school
	for name, url in school_names_urls[10:]:
		print name, url

		if "Humanities" in name:
			name = "School of Humanities"
		author_name_urls = es.get_author_name_urls(name, url)
		# write these to file for safe keeping
		# ALREADY BEING DONE BY ES
		#with open("../nameurls/" + name + ".txt", 'w') as f:
		#	json.dump(author_name_urls)
		coauthor_dict = es.get_coauthors_dict(author_name_urls, name)
		# extract just names from name urls and put in list
		#author_names = [author_name for author_name, author_url in author_name_urls]

		# Put names in Title First Name Last Name order
		for paper_id, data in coauthor_dict.items():
			authors = data["authors"]
			newauthors = [(anu[0].split(", ")[1] + " " + anu[0].split(", ")[0], anu[1]) for anu in authors]
			coauthor_dict[paper_id]["authors"] = newauthors

		# Do the same for author_name_urls
		# TODO is this necessary? Because we're checking against urls - could even just give gm the urls
		author_name_urls = [(anu[0].split(", ")[1] + " " + anu[0].split(", ")[0], anu[1]) for anu in author_name_urls]
		# now make graph
		gm = gfd.GraphMaker()
		gm.populate_graph(coauthor_dict, author_name_urls)
		gm.add_metrics()
		gm.add_just_school_community()
		gm.write_to_file("../newestgraphs/" + name + ".json")
Code example #2
Score: 0
File: data_processor.py — Project: tvzeller/SumProj
def get_enlighten_data():
	"""
	Get data for each school from Enlighten using enlighten_scraper.

	Returns a dict keyed by school name; each value is a tuple
	(data_dict, author_name_urls) where data_dict is keyed by paper id with
	paper metadata as values, and author_name_urls is a list of
	(name, enlighten url) pairs for the authors in that school.
	"""
	# NOTE: the old data_dicts / author_name_urls_list accumulators were dead
	# code (their appends were commented out) and have been removed.
	school_data = {}
	for schoolname, schoolurl in es.get_school_name_urls():
		author_name_urls = es.get_author_name_urls(schoolname, schoolurl)
		data_dict = es.get_coauthors_dict(author_name_urls, schoolname)
		school_data[schoolname] = (data_dict, author_name_urls)

	return school_data
Code example #3
Score: 0
File: do_stats.py — Project: tvzeller/SumProj
a_elems = schools_tree.xpath(path, namespaces={'re':ns})

base_url = "http://www.gla.ac.uk"
urls = []
names = []

for a in a_elems:
	staff_page_url = base_url + a.get("href") + "staff/"
	urls.append(staff_page_url)
	school_name = a.text
	names.append(school_name)

school_names_urls = zip(names, urls)
print school_names_urls

start_index = int(sys.argv[1])
end_index = int(sys.argv[2])

# TODO REMOVE SLICING
# THIS is temporary to just do the schools we haven't done yet
for schl_name, schl_url in school_names_urls[start_index:end_index]:
	
	author_name_urls = es.get_author_name_urls(schl_url, schl_name)
	titles_dict = es.get_titles_dict(author_name_urls)
	stats = cs.Stats(authors_dict, name)
	stats.write_to_file("stats_results/stats_test.txt")