def scrape_professor(school_name,
                     faculty_url,
                     extracts_title,
                     extracts_name,
                     extracts_cv_url=None,
                     extracts_personal_url=None,
                     extracts_gscholar_url=None,
                     extracts_papers=None):
    """Scrape a single faculty page into a Professor record.

    :param school_name: name of the school, stored on the Professor
    :param faculty_url: URL of the faculty directory page to scrape
    :param extracts_title: callable(tree) -> job title string or None
    :param extracts_name: callable(tree) -> professor's name
    :param extracts_cv_url: optional callable(faculty_url, tree) -> CV URL
    :param extracts_personal_url: optional callable(faculty_url, tree) -> homepage URL
    :param extracts_gscholar_url: optional callable(faculty_url, tree) -> Google Scholar URL
    :param extracts_papers: optional callable(faculty_url, tree) -> (paper_list_url, papers)
    :return: a Professor object, or None if the page could not be fetched,
        no job title was found, or the title is not tenure track
    """
    tree = get_tree(faculty_url)
    if tree is None:
        return None
    job_title = strip_whitespace(extracts_title(tree))
    if job_title is None:
        print("WARNING: job title not found on " + faculty_url)
        return None
    if not title_is_tenure_track(job_title):
        return None
    name = extracts_name(tree)
    # optional extractors: only call the ones that were provided
    cv_link = None if extracts_cv_url is None else extracts_cv_url(faculty_url, tree)
    personal_url = None if extracts_personal_url is None else extracts_personal_url(faculty_url, tree)
    google_scholar_url = None if extracts_gscholar_url is None else extracts_gscholar_url(faculty_url, tree)

    prof = Professor(name=name,
                     title=job_title,
                     cv_url=cv_link,
                     school=school_name,
                     faculty_directory_url=faculty_url,
                     personal_url=personal_url,
                     google_scholar_url=google_scholar_url)

    if extracts_papers is not None:
        paper_list_url, papers = extracts_papers(faculty_url, tree)
        # save paper list to disk; an empty list is treated the same as None
        # (truthiness replaces the redundant `len(papers) > 0` check)
        if paper_list_url and papers:
            prof.paper_list_url = paper_list_url
            save_paper_list('paper_list', prof, papers)
    return prof
def get_title(tree):
    """Return the first string on the page that matches a known job title, or None."""
    # The title's location varies from page to page, so gather every
    # candidate element from both known locations and test each in turn.
    candidates = (css_select(tree, 'ul#ctl00_content_titles li i') +
                  css_select(tree, 'ul#ctl00_content_areas li'))
    for element in candidates:
        text = strip_whitespace(element.text)
        if text is not None and is_job_title(text):
            return text
    return None
def get_papers(url, tree):
    """Collect journal-article citations from the page.

    :param url: the faculty page URL (used to build the returned anchor URL)
    :param tree: parsed HTML tree of the faculty page
    :return: (url + '#research', list of citation strings); the list is
        empty if no "Journal articles" heading is found.
    """
    papers = []
    # find the list of divs for journal publications
    for heading in css_select(tree, 'h3'):
        if 'Journal articles' in heading.text:
            # keep collecting <p> citations until we reach the
            # "Awards and Honors" section that follows the publications
            node = heading  # renamed from `next` to avoid shadowing the builtin
            while node is not None:
                if node.name == 'p':
                    papers.append(strip_whitespace(node.text))
                elif node.name == 'div' and 'Awards and Honors' in node.text:
                    break
                node = node.next_element
    return url + '#research', papers
def get_papers(url, tree):
    """Scrape publication citations from the bulleted "Publications" list.

    :param url: the faculty page URL
    :param tree: parsed HTML tree of the faculty page
    :return: (url, list of citations) or (None, None) if no publications
        section is found.
    """
    # find the bulleted list for publications
    for heading in css_select(tree, '#center-col strong'):
        if 'Publications' in heading.text:
            # look for the first <UL> after the publications header
            node = heading  # renamed from `next` to avoid shadowing the builtin
            while node is not None:
                if node.name == 'ul':
                    return url, [
                        strip_whitespace(li.text.replace(' PDF.', ''))
                        for li in css_select(node, 'li')
                    ]
                node = node.next_element
    return None, None
def get_papers(faculty_url, tree):
    """Look up this professor's bibliography page in the library directory.

    Matches the professor's name against the directory links by edit
    distance, then scrapes the "Published Works" section of the matched
    bibliography page.

    :param faculty_url: the faculty page URL (unused except by convention)
    :param tree: parsed HTML tree of the faculty page
    :return: (bib_url, list of citations), or (None, None) when the
        directory is empty, no sufficiently close name match exists, or no
        "Published" heading is found.
    """
    name = get_name(tree)

    # download faculty directory, if not already downloaded (cached in a
    # module-level global so we fetch it at most once)
    global library_directory_tree
    if not library_directory_tree:
        library_directory_tree = get_tree(library_directory_url)

    # iterate through faculty names, looking for the best match
    anchors = css_select(library_directory_tree, 'table.table-striped a')
    if not anchors:
        # guard: min() would raise ValueError on an empty sequence
        return None, None
    closest_match = min(
        anchors,
        key=lambda a: editdistance.eval(name, strip_whitespace(a.text)))
    # require that the closest match be pretty close to accept it;
    # 3 characters would allow for a missing initial and period.
    # NOTE: compare against the whitespace-stripped text, consistent with
    # the key used to select the match above.
    if editdistance.eval(name, strip_whitespace(closest_match.text)) > 3:
        return None, None

    # download bibliography page
    bib_url = closest_match.get('href')
    bib_tree = get_tree(bib_url)

    # find the "Published Works" section
    for heading in css_select(bib_tree, 'div.rich-text h2'):
        if 'Published' in heading.text:
            # keep collecting citation paragraphs until the next <H2>
            papers = []
            node = heading.next_element  # renamed from `next` (builtin shadow)
            while node:
                if node.name == 'p':
                    citation = strip_whitespace(node.text)
                    # drop trailing link
                    citation = citation.split('http://')[0]
                    if len(citation) > 0:
                        papers.append(citation)
                elif node.name == 'h2':
                    break
                node = node.next_element
            return bib_url, papers
    return None, None
def get_papers(faculty_url, tree):
    # Scrape article citations from the research "entries" blocks.
    # Returns (faculty_url + '#research', citations) from the FIRST entries
    # block processed, or (None, None) if there are no entries divs at all.
    for e in css_select(tree, 'div.leftResearch div.entries'):
        # check that we're in the right section
        # NOTE(review): previous_sibling['class'] is compared to a plain
        # string; in BeautifulSoup the 'class' attribute is a LIST, so this
        # equality may never hold, and previous_sibling may be a bare text
        # node that cannot be subscripted — confirm which parser css_select
        # wraps before relying on this break condition.
        if e.previous_sibling[
            'class'] == 'tabSubheading' and e.previous_sibling.text != 'Articles':
            break
        paper_list = []
        for c in css_select(e, 'div.entry div.copy'):
            citation = c.text
            # some articles have an abstract, which I want to ignore:
            # remove each nested <div>'s text from the citation string
            for abstract in css_select(c, 'div'):
                citation = citation.replace(abstract.text, '')
            paper_list.append(strip_whitespace(citation))
        # returns after the first entries block (loop never advances past it)
        return faculty_url + '#research', paper_list
    return None, None
def get_papers(url, tree):
    """Scrape journal-article citations from a GSB publications listing.

    :param url: the faculty page URL
    :param tree: parsed HTML tree of the faculty page
    :return: (url, list of citations) or (None, None) if the
        "Journal Articles" section is not found.
    """
    # find the list of Journal Articles
    for heading in css_select(tree, 'div.view-gsb-publications-listing h2.title'):
        if 'Journal Articles' in heading.text:
            # look for the first <div class="view-content"> under the
            # Journal Articles header
            candidate = heading
            while candidate is not None:
                # .get('class') returns None for class-less elements, which
                # made `'view-content' in None` raise TypeError; substitute
                # an empty list so the membership test is always safe
                if candidate.name == 'div' and 'view-content' in (
                        candidate.get('class') or []):
                    return url, [
                        strip_whitespace(row.get_text())
                        for row in css_select(candidate, 'div.views-row')
                    ]
                candidate = candidate.next_element
    return None, None
def get_papers(url, tree):
    """Fetch a faculty member's publications tab and scrape article citations.

    :param url: the faculty page URL
    :param tree: parsed HTML tree of the faculty page (unused; the
        publications live on a separate tab that is fetched here)
    :return: (publications_tab_url, list of citations) or (None, None).
    """
    # add "&facInfo=pub" to the faculty url to get the url for the publications tab
    pub_list_url = url + "&facInfo=pub"
    wait()  # rate-limit before fetching another page
    pub_tree = get_tree(pub_list_url)

    # find the bulleted list for publications
    for heading in css_select(pub_tree, '.tab-content h3'):
        if 'Articles' in heading.text:
            # look for the first <OL> under the publications header
            node = heading  # renamed from `next` to avoid shadowing the builtin
            while node is not None:
                if node.name == 'ol':
                    return pub_list_url, [
                        strip_whitespace(
                            li.text.replace('View Details', '').replace('Citation:', ''))
                        for li in css_select(node, 'div.citation')
                    ]
                node = node.next_element
    return None, None
def papers_in_top_journals(
        professors: List[Professor]) -> Dict[Professor, List[AnyStr]]:
    """Return a dict mapping each professor to their top-journal citations.

    Also prints the total number of papers loaded and a warning listing for
    any professor with an implausibly high top-journal count.

    :param professors: professors whose saved paper lists should be scanned
    :return: dict mapping each Professor to a list of citation strings
        (note: values are lists, hence Dict[Professor, List[AnyStr]] —
        the previous Dict[Professor, AnyStr] annotation was wrong)
    """
    top_papers = {}
    # also count the total number of papers since start_date
    total_papers = 0
    for p in professors:
        candidates = load_papers(p)
        total_papers += len(candidates)
        # keep only the papers published in the top journals
        top_papers[p] = [
            citation for (journal, citation) in candidates
            if is_a_top_journal(journal)
        ]
        # detect anomalies: an unusually high count usually indicates a
        # scraping or journal-matching error, so dump the list for review
        if len(top_papers[p]) > 30:
            print("\nWARNING: found %d top papers for %s" %
                  (len(top_papers[p]), p.slug()))
            for paper in sorted(top_papers[p],
                                key=lambda citation: get_year(citation)):
                print("\t" + strip_whitespace(paper))
    print("\nTotal of %d papers since %s\n" % (total_papers, starting_year))
    return top_papers
def get_title(tag):
    """Return the first non-empty text fragment in the cell's first <p>.

    :param tag: an HTML element containing a <td><p> title cell
    :return: the stripped title string, or None when there is no matching
        <td p> element or no non-empty text fragment inside it.
    """
    paragraphs = css_select(tag, 'td p')
    if not paragraphs:
        # guard: indexing [0] on an empty result raised IndexError
        return None
    for e in paragraphs[0].children:
        # only bare text nodes count; nested tags are skipped
        if isinstance(e, NavigableString):
            text = strip_whitespace(e)
            if len(text) > 0:
                return text
    return None
def get_papers(url, tree):
    """Return the page URL and the publication citations bulleted on it."""
    # publications are the <li> items inside the first tab's list
    bullet_items = css_select(tree, 'div#tabs-1 ul li')
    citations = [strip_whitespace(item.text) for item in bullet_items]
    return url, citations
def norm_str(my_string):
    """Normalize a string for fuzzy comparison: lowercase, drop non-letters, trim."""
    lowered = my_string.lower()
    letters_only = lowered.translate(non_letter_remover)
    return strip_whitespace(letters_only)