import re
import urllib
import pickle
import util

prof_pic_url = []
prof_no_pics = file('profs_no_pics').read().split('\n')[:-1]
prof_no_pics = [p.strip() for p in prof_no_pics]
#print prof_no_pics

# Build a picsearch face-search query URL for each professor missing a picture.
for prof in prof_no_pics:
    prof_pic_url.append('http://www.picsearch.com/index.cgi?' + urllib.urlencode([('q', prof), ('face', 'yes')]))

prof_pics = []
pic_pat = "img style=\"padding-top: [\d\.]*px;\" src=\"(.*?)\" height=\"\d*?\" width=\"\d*?\" alt=\".*?; result 1\""
for name, prof_url in zip(prof_no_pics, prof_pic_url):
    f = util.dl_and_prep(prof_url)
    # Take the first search result's image URL, if any; reset each iteration
    # so a previous professor's link never carries over.
    prof_pic_link = ''
    matches = re.findall(pic_pat, f)
    if len(matches) > 0:
        prof_pic_link = matches[0]
    if not prof_pic_link:
        print prof_url
    prof_dict = {}
    prof_dict['name'] = name
    prof_dict['image'] = prof_pic_link
    print prof_pic_link
    prof_pics.append(prof_dict)

pickle.dump(prof_pics, file('prof_dicts/images.dat', 'w'))
print "Done!"
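Every script in this collection calls util.dl_and_prep before running its regexes, but the helper itself is not shown here. The sketch below is only an assumption about its interface (fetch a URL and return the page source as one whitespace-collapsed string, so lazy ".*?" patterns can match across tag boundaries without re.DOTALL); the real util.py may differ.

# Hypothetical sketch of the local util.dl_and_prep helper (assumption only).
import re
import urllib2

def dl_and_prep(url):
    page = urllib2.urlopen(url).read()
    # Collapse newlines and runs of whitespace so patterns spanning what were
    # multiple lines of HTML can still match.
    return re.sub('\s+', ' ', page)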
import re
import urllib
import pickle
import util

# Penn SEAS directory profiles are numbered sequentially.
prof_profiles = []
for value in range(2, 109):
    prof_profiles.append("http://www.seas.upenn.edu/directory/profile.php?ID=" + str(value))

penn_profs = []
for prof in prof_profiles:
    f = util.dl_and_prep(prof)
    name = ""
    title = ""
    department = ""
    email = ""
    awards = ""
    research = ""
    education = ""
    image = ""

    # The name sits in the page's <h1>; split into first and last.
    name_matches = re.findall("<h1>(.*?) (.*?)</h1>", f)
    if len(name_matches) > 0:
        first_name = name_matches[0][0]
        last_name = name_matches[0][1]
        name = first_name.strip() + ' ' + last_name.strip()
        #print name

    # Title and department share an <h2>, separated by <br /> tags; check the
    # same two-group pattern that is used for extraction.
    title_matches = re.findall("<h2>(.*?)<br />(.*?)<br", f)
    if len(title_matches) > 0:
        title = title_matches[0][0]
        department = title_matches[0][1].strip()
        department = re.sub('\s{2}', ' ', department)
        #print department
        #print title, department
import re
import util
#import pdb; pdb.set_trace()  # leftover debugging hook

# Pick a seed topic; earlier alternatives are kept here commented out.
#seed_topics = ["robotics", "neurodevelopment"]
#seed_topics = ["robotics"]
#seed_topics = ["computational biology"]
#seed_topics = ["neurodevelopment"]
seed_topics = ["biochemistry"]

dict = {}

#also... list of ___ topics
for topic in seed_topics:
    #Check if there is an "outline of ___" page
    topic_underscored = topic.replace(" ", "_")
    url_outline = "http://en.wikipedia.org/wiki/Outline_of_" + topic_underscored
    try:
        f1 = util.dl_and_prep(url_outline)
        all_links = re.findall("title=\"(.*?)\"", f1)
        # Red links carry a " (page does not exist)" suffix in their title; strip it.
        for index, link in enumerate(all_links):
            if "page does not exist" in link:
                all_links[index] = link[:-22]
        good_links = []
        bad_links = []
        for l in all_links:
            # Discard long, disambiguated, or meta ("Outline of"/"Index of"/"List of") titles.
            if ((l.count(" ") > 3) or (len(l) > 35) or (":" in l) or ("(" in l)
                    or ("Outline of" in l) or ("Index of" in l) or ("List of" in l)):
                bad_links += [l]
            else:
                good_links += [l]
        print "BAD LINKS:\n"
import re
import urllib
import pickle
import util

f = util.dl_and_prep("http://bbs.yale.edu/people/index.aspx")
# Skip the first 31 and last 5 links on the index page (navigation etc.) and
# point each profile link at its "-3" research sub-page.
all_links = re.findall("<li><a href=\"(.*?)\"", f)
prof_links = [all_links[i].replace(".profile", "-3.profile") for i in range(31, len(all_links) - 5)]

yale_profs = []
for prof in prof_links:
    g = util.dl_and_prep(prof)
    x = prof.replace("-3.profile", ".profile")
    h = util.dl_and_prep(x)
    print x
    image = ""
    name = ""
    title = ""
    research = ""
    keywords = ""

    image_matches = re.findall("h3><img class=\"bordered floatrt\" src=\"(.*?)\"", h)
    if len(image_matches) > 0:
        image = image_matches[0]

    name_matches = re.findall("name=\"keywords\" content=\"(.*?),(.*?),", g)
    if len(name_matches) > 0:
        first_name = name_matches[0][0]
        last_name = name_matches[0][1]
        name = first_name.strip() + ' ' + last_name.strip()
        #print name

    title_matches = re.findall("</h1><p>(.*?)</p>", g)
    if len(title_matches) > 0:
        title = title_matches[0]
        #print title

    interest_matches = re.findall("Research Interests</h3><p>(.*?)</p>", g)
    if len(interest_matches) > 0:
        interests = interest_matches[0]
import re
import pickle
import util

pat = """<tr VALIGN=TOP bgcolor='.*?'><td><a href='(.*?)'>(.*?)</a></td><td>(.*?) </td><td>(.*?)</td></tr>"""

# Stanford's engineering people index is paginated by surname initial, A-Z.
results = []
for c in range(ord('A'), ord('[')):
    url = "http://soe.stanford.edu/research/pers_index_results.php?index=%s" % chr(c)
    doc = util.dl_and_prep(url)
    results += re.findall(pat, doc)
print len(results), 'total professors'

output = []
for prof in results:
    pd = {}
    pd['lab_website'] = 'http://soe.stanford.edu/research/%s' % prof[0]
    pd['source'] = 'http://soe.stanford.edu/research/%s' % prof[0]
    pd['name'] = prof[1]
    #extract the primary department from within the <b> tags
    if '<b>' in prof[2]:
        pd['department'] = re.findall('<b>(.*?)</b>', prof[2])[0]
    else:
        pd['department'] = util.prep_department(util.remove_tags(prof[2]))
    research = prof[3].replace('&nbsp;', '').strip()
    if len(research) > 0:
        pd['keywords'] = util.split_and_clean(research, ',')
    pd['school'] = 'Stanford University'
    personal_page = util.dl_and_prep(pd['lab_website'])
    summary = re.findall('<h3>Research Statement</h3><p>(.*?)</p><h3>Degrees</h3>', personal_page)
    if summary:
        pd['research_summary'] = util.html_escape(summary[0].strip())
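The Stanford scraper also leans on a few small util helpers whose definitions are not included above. The sketches below are assumptions about what they likely do, not the actual util.py code.

# Assumed behaviour only; the real helpers live in util.py and may differ.
import re

def remove_tags(html):
    # Strip any remaining markup from a table-cell snippet.
    return re.sub('<.*?>', '', html)

def split_and_clean(text, sep):
    # Split on a separator and drop empty or whitespace-only pieces.
    return [piece.strip() for piece in text.split(sep) if piece.strip()]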
#import scrapemark
import re
import util
import pickle

pat2 = """<tr><td valign="top"><div class="name"><a href="(.*?)">(.*?)</a></div><div>(.*?)</div>.*?<img src="(.*?)" alt.*?</td>.*?<td valign="top">.*?</td>.*?<td valign="top"><div>(.*?)</div></td></tr>"""

# Faculty index pages are split by surname initial, A through Z inclusive.
results = []
for c in range(ord('A'), ord('Z') + 1):
    doc = util.dl_and_prep('http://people.cs.cmu.edu/Faculty/' + chr(c))
    print 'Got one'
    results += re.findall(pat2.strip(), doc)

final_dicts = []
for prof in results:
    pd = {}
    pd['source'] = "http://people.cs.cmu.edu" + prof[0]
    # Names appear as "Last, First"; flip them to "First Last".
    pd['name'] = prof[1][prof[1].find(',')+1:].strip() + ' ' + prof[1][:prof[1].find(',')].strip()
    pd['title'] = prof[2]
    pd['image'] = "http://people.cs.cmu.edu" + prof[3]
    pd['school'] = 'Carnegie Mellon'
    pd['department'] = 'Computer Science'
    research = util.research_interest_extractor(prof[4])
    if research:
        pd['keywords'] = research
    else:
        pd['research_summary'] = prof[4]
    final_dicts.append(pd)

#website name title interests
print 'got', len(final_dicts)
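util.research_interest_extractor decides here whether a faculty blurb is a keyword list or free prose, but its real logic is not shown. A plausible stand-in, purely as an assumption, would treat short comma- or semicolon-separated blurbs as keyword lists and return nothing otherwise.

# Hypothetical stand-in for util.research_interest_extractor (assumption only).
def research_interest_extractor(text):
    sep = ';' if ';' in text else ','
    parts = [p.strip() for p in text.split(sep) if p.strip()]
    # Treat the blurb as keywords only if it splits into several short pieces.
    if len(parts) >= 2 and all(len(p) < 60 for p in parts):
        return parts
    return None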
import re
import urllib2
import pickle
import util

all_prof_info = []
courses = file('mitcourses.lst').read().split('\n')[:-1]

# HACK: a few department pages have no parsable <title>, so their names are
# hard-coded here and consumed in order as they are encountered.
missing_index = 0
missing_courses = ['Chemistry', 'Biological Engineering', 'Anthropology', 'Music and Theater Arts', 'Comparative Media Studies']

for course in courses:
    url = "http://web.mit.edu/urop/research/profiles/%s.html" % course
    f = util.dl_and_prep(url)
    # The UROP pages use several slightly different <dt>/<dd> markups, so try
    # each variant pattern in turn; each captures (link, name, area, blurb).
    prof_info = re.findall("<dt><a href=\"(.*?)\"><strong>Prof\. (.*?)</strong></a>,.*?,.*?,.*?<.*?>(.*?)</a>\s*</dt>\s*<dd>(.*?)<", f)
    prof_info_alt = re.findall("<dt><strong><a href=\"(.*?)\">.*?\. (.*?)</a></strong>,.*?,.*?,.*?<.*?>(.*?)</a></dt>\s*<dd>(.*?)<", f)
    prof_info_alt2 = re.findall("<dt><strong><a href=\"(.*?)\">.*?\. (.*?)</a>,</strong>.*?,.*?,.*?<.*?>(.*?)</a></dt>\s*<dd>(.*?)<", f)
    prof_info_alt3 = re.findall("<dt><strong></strong><a href=\"(.*?)\">.*?\. (.*?)</a>,.*?,.*?,.*?<.*?>(.*?)</a></dt>\s*<dd>(.*?)<", f)
#    prof_info_alt4 = re.findall("<dt><a href=\"(.*?)\"><strong>.*?\. (.*?)</strong></a>,.*?,.*?,.*?<.*?>(.*?)</a>\s*<strong>\(On Leave\)</strong></dt>\s*<dd>(.*?)<", f)
#    prof_info_alt5 = re.findall("<dt><strong><a href=\"(.*?)\">.*?\.(.*?)</a></strong>.*?,.*?,.*?<.*?>(.*?)</a>\s*<strong>\(On Leave\)</strong></dt>\s*<dd>(.*?)<", f)
    prof_info_alt6 = re.findall("<dt><strong><a href=\"(.*?)\">.*?\. (.*?)</a></strong>,\s*.*?,\s*.*?,\s*.*?\s*<.*?>(.*?)</a></dt>\s*<dd>(.*?)<", f)

    department = re.findall("<title>MIT UROP: Current Research - (.*?):.*?</title>", f)
    if not department:
        department = [missing_courses[missing_index]]
        missing_index += 1

    course_profs = []
    #print department
    for prof in prof_info:
        course_profs.append(prof)
    for prof in prof_info_alt:
        course_profs.append(prof)
    for prof in prof_info_alt2:
import re
import urllib
import pickle
import util

alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
alphabet_url = []
for letter in alphabet:
    alphabet_url.append("http://research.brown.edu/includes/collaborators_namesearch.php?letter=" + letter)

prof_ids = []
for url in alphabet_url:
    f = util.dl_and_prep(url)
    prof_links = re.findall('</option><option value="(\d*)">', f)
    for prof_link in prof_links:
        prof_ids.append(prof_link)
# print prof_ids
# print len(prof_ids)

prof_profiles = []
for value in prof_ids:
    prof_profiles.append("http://research.brown.edu/research/profile.php?id=" + value)
# print prof_profiles

brown_profs = []
for prof in prof_profiles:
    first_name = ""
    last_name = ""
    title = ""
import re
import urllib
import pickle
import util

all_prof_links = []
for num in range(0, 48):
    f = util.dl_and_prep("http://vcresearch.berkeley.edu/faculty-expertise?page=%s&name=&expertise_area=&term_node_tid_depth=" % num)
    matches = re.findall("class=\"views-field views-field-title\">.*?<a href=\"(.*?)\"", f)
    prof_links = ["http://vcresearch.berkeley.edu" + m for m in matches]
    for prof_link in prof_links:
        all_prof_links.append(prof_link)
#print len(all_prof_links)

berkeley_profs = []
#import pdb; pdb.set_trace()
for prof in all_prof_links:
    name = ''
    image = ''
    title = ''
    url = ''
    email = ''
    keywords = []
    summary = ''
    department = ''
    g = util.dl_and_prep(prof)

    name_matches = re.findall("<h1 class=\"title\" id=\"page-title\">(.*?)</h1>", g)
    if len(name_matches) > 0:
        name = name_matches[0]
        #print name

    image_matches = re.findall("field field-type-filefield field-field-faculty-image\">.*?odd\">.*?img src=\"(.*?)\"", g)
    if len(image_matches) > 0:
        image = image_matches[0]
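Each of these scrapers builds a list of professor dicts and pickles it to disk, as the picsearch script does with prof_dicts/images.dat. A minimal sketch of reading one of those pickles back for downstream use; prof_dicts/images.dat is the only file name actually shown above, so any other path would be hypothetical.

# Load the picsearch results dumped earlier and print what was collected.
import pickle

prof_pics = pickle.load(file('prof_dicts/images.dat'))
for prof in prof_pics:
    print prof['name'], prof.get('image', '')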