import json

# NOTE: `es` (scraper helpers: get_tree, get_names, get_author_name_urls,
# get_coauthors_dict) and `gfd` (provides GraphMaker) are project-local
# modules; their import lines are not shown in the original source.


def graphs_from_files():
    # Schools whose co-author data has already been scraped to disk.
    filenames = [
        "Adam Smith Business School",
        "Dental School",
        "School of Chemistry",
        "School of Critical Studies",
        "School of Culture and Creative Arts",
        "School of Education",
    ]

    # Parse the university's schools index page and pick out links to the
    # individual school pages using an EXSLT regex inside the XPath query.
    schools_tree = es.get_tree("http://www.gla.ac.uk/schools/")
    ns = 'http://exslt.org/regular-expressions'
    path = '//div[@class="row standardContent"]//a[re:match(@href, "schools/[A-Za-z]+/")]'
    a_elems = schools_tree.xpath(path, namespaces={'re': ns})

    # Build (school name, staff page URL) pairs from the anchor elements.
    base_url = "http://www.gla.ac.uk"
    urls = []
    names = []
    for a in a_elems:
        staff_page_url = base_url + a.get("href") + "staff/"
        urls.append(staff_page_url)
        school_name = a.text
        names.append(school_name)
    school_names_urls = list(zip(names, urls))
    print(school_names_urls)

    # For each school with saved co-author data: load the data, scrape the
    # current staff names, and write out a co-authorship graph.
    for name, url in school_names_urls:
        if name in filenames:
            with open("../coauthor_data/" + name + ".txt") as f:
                d = json.load(f)
            staff_names = es.get_names(url)
            gm = gfd.GraphMaker(d, staff_names)
            gm.write_to_file(name + " graph")
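# Both functions in this file repeat the same index-page scrape. A minimal
# sketch of a shared helper they could call instead -- assuming es.get_tree()
# returns an lxml tree, as the xpath() calls suggest; the helper name itself
# is hypothetical, not part of the original code:

def get_school_names_urls():
    """Return (school name, staff page URL) pairs from the schools index."""
    schools_tree = es.get_tree("http://www.gla.ac.uk/schools/")
    ns = 'http://exslt.org/regular-expressions'
    path = '//div[@class="row standardContent"]//a[re:match(@href, "schools/[A-Za-z]+/")]'
    a_elems = schools_tree.xpath(path, namespaces={'re': ns})
    base_url = "http://www.gla.ac.uk"
    return [(a.text, base_url + a.get("href") + "staff/") for a in a_elems]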
def get_and_graph():
    # Same school-index scrape as graphs_from_files() (see the shared helper
    # sketched above).
    schools_tree = es.get_tree("http://www.gla.ac.uk/schools/")
    ns = 'http://exslt.org/regular-expressions'
    path = '//div[@class="row standardContent"]//a[re:match(@href, "schools/[A-Za-z]+/")]'
    a_elems = schools_tree.xpath(path, namespaces={'re': ns})

    base_url = "http://www.gla.ac.uk"
    urls = []
    names = []
    for a in a_elems:
        staff_page_url = base_url + a.get("href") + "staff/"
        urls.append(staff_page_url)
        school_name = a.text
        names.append(school_name)
    school_names_urls = list(zip(names, urls))
    print(school_names_urls)

    # Remove SoCS as it's done already, and Physics for now because it's huge.
    # Iterate over a copy so removing items doesn't skip elements.
    for tup in school_names_urls[:]:
        if "Physics" in tup[0]:
            school_names_urls.remove(tup)

    # For each remaining school (the [10:] slice presumably skips schools
    # handled in an earlier run):
    for name, url in school_names_urls[10:]:
        print(name, url)
        if "Humanities" in name:
            name = "School of Humanities"
        author_name_urls = es.get_author_name_urls(name, url)
        # Write these to file for safekeeping -- already being done by es:
        # with open("../nameurls/" + name + ".txt", 'w') as f:
        #     json.dump(author_name_urls, f)
        coauthor_dict = es.get_coauthors_dict(author_name_urls, name)

        # Extract just the names from the name/URL pairs, if ever needed:
        # author_names = [author_name for author_name, author_url in author_name_urls]

        # Reorder each paper's author names from "Last, First" to
        # "Title First Name Last Name" order.
        for paper_id, data in coauthor_dict.items():
            authors = data["authors"]
            newauthors = [(anu[0].split(", ")[1] + " " + anu[0].split(", ")[0], anu[1])
                          for anu in authors]
            coauthor_dict[paper_id]["authors"] = newauthors

        # Do the same for author_name_urls.
        # TODO: is this necessary? We're checking against URLs, so gm could
        # even be given just the URLs.
        author_name_urls = [(anu[0].split(", ")[1] + " " + anu[0].split(", ")[0], anu[1])
                            for anu in author_name_urls]

        # Now build the graph, compute metrics, and write it out.
        gm = gfd.GraphMaker()
        gm.populate_graph(coauthor_dict, author_name_urls)
        gm.add_metrics()
        gm.add_just_school_community()
        gm.write_to_file("../newestgraphs/" + name + ".json")
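# A minimal usage sketch, assuming this module is run as a script and that
# the es/gfd helper modules are importable. Which function to call depends
# on whether co-author data has already been saved to disk:

if __name__ == "__main__":
    # graphs_from_files()  # rebuild graphs from saved co-author data
    get_and_graph()        # scrape from scratch and build graphs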