Example #1
import re
import urllib
import pickle
import util

prof_pic_url =[]
prof_no_pics = file('profs_no_pics').read().split('\n')[:-1]
prof_no_pics = [p.strip() for p in prof_no_pics]
#print prof_no_pics

for prof in prof_no_pics:
    prof_pic_url.append('http://www.picsearch.com/index.cgi?' + urllib.urlencode([('q', prof), ('face','yes')]))

prof_pics = []
 
for name, prof_url in zip(prof_no_pics, prof_pic_url):
    f = util.dl_and_prep(prof_url)
    # reset on every iteration so a failed match does not reuse the previous professor's picture
    prof_pic_link = ''
    matches = re.findall("img style=\"padding-top: [\d\.]*px;\" src=\"(.*?)\" height=\"\d*?\" width=\"\d*?\" alt=\".*?; result 1\"", f)
    if len(matches) > 0:
        prof_pic_link = matches[0]
    if not prof_pic_link:
        print prof_url
    prof_dict = {}
    prof_dict['name'] = name        
    prof_dict['image'] = prof_pic_link
    print prof_pic_link
    prof_pics.append(prof_dict)

pickle.dump(prof_pics, file('prof_dicts/images.dat', 'w'))
print "Done!"
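All of these examples lean on util.dl_and_prep, a helper from the author's own util module that is not shown on this page. A minimal sketch of what it presumably does is below; this is an assumption based on how it is called (fetch the URL and return the page source as one string, with newlines collapsed so the single-line regexes can match across tags), not the actual implementation.

# Hypothetical stand-in for util.dl_and_prep -- an assumption, not the
# author's real helper, which may differ.
import re
import urllib2

def dl_and_prep(url):
    # download the page and collapse newlines so patterns like
    # "<h1>(.*?)</h1>" can match content that spans several source lines
    html = urllib2.urlopen(url).read()
    return re.sub(r'\s*\n\s*', ' ', html)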
Example #2
import re
import urllib
import pickle
import util

prof_profiles =[]
for value in range(2,109):
    prof_profiles.append("http://www.seas.upenn.edu/directory/profile.php?ID="+str(value))

penn_profs =[]
for prof in prof_profiles:
    f=util.dl_and_prep(prof)
    name =""
    title =""
    department =""
    email =""
    awards =""
    research=""
    education=""
    image=""
    h1_matches = re.findall("<h1>(.*?)&nbsp;(.*?)</h1>",f)
    if len(h1_matches) > 0:
        first_name, last_name = h1_matches[0]
        name = first_name.strip() + ' ' + last_name.strip()
        #print name
    # title and department are the first two <br />-separated lines of the <h2>
    h2_matches = re.findall("<h2>(.*?)<br />(.*?)<br",f)
    if len(h2_matches) > 0:
        title = h2_matches[0][0]
        department = h2_matches[0][1].strip()
        department = re.sub('\s{2}', ' ', department)
        #print department
        #print title, department
Example #3
import re
import util
#import pdb; pdb.set_trace()
#seed_topics = ["robotics", "neurodevelopment"]
#seed_topics = ["robotics"]
#seed_topics = ["computational biology"]
#seed_topics = ["neurodevelopment"]
seed_topics = ["biochemistry"]
dict = {}

 #also... list of ___ topics
 
for topic in seed_topics:
    #Check if there is an "outline of ___" page
    topic_underscored = topic.replace(" ","_")
    url_outline = "http://en.wikipedia.org/wiki/Outline_of_" + topic_underscored
    try:
        f1 = util.dl_and_prep(url_outline)
        all_links=re.findall("title=\"(.*?)\"", f1)

        for index, link in enumerate(all_links):
            if "page does not exist" in link:
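                # drop the trailing " (page does not exist)" suffix (22 characters) from red-link titles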
                all_links[index] = link[:-22]
        
        good_links = []
        bad_links = []
        for l in all_links:
            if (l.count(" ")>3) or (len(l)>35) or (":" in l) or ("(" in l) or ("Outline of" in l) or ("Index of" in l) or ("List of" in l):
                bad_links += [l]
            else:
                good_links += [l]

        print "BAD LINKS:\n"
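The snippet is cut off here, but the link filter above is self-contained enough to demonstrate on its own. A small standalone illustration with made-up link titles (hypothetical, not taken from the real outline page):

# Standalone demo of the good/bad link filter from the example above.
# The sample titles are hypothetical, for illustration only.
sample_links = ["Enzyme", "Protein folding", "Category:Biochemistry",
                "List of biochemists", "Structural and functional relationships of enzymes"]
good_links, bad_links = [], []
for l in sample_links:
    if (l.count(" ")>3) or (len(l)>35) or (":" in l) or ("(" in l) or ("Outline of" in l) or ("Index of" in l) or ("List of" in l):
        bad_links += [l]
    else:
        good_links += [l]
print good_links   # ['Enzyme', 'Protein folding']
print bad_links    # the category, list, and overly long titles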
Example #4
import re
import urllib
import pickle
import util

f = util.dl_and_prep("http://bbs.yale.edu/people/index.aspx")
all_links = re.findall("<li><a href=\"(.*?)\"",f)
# skip the first 31 and last 5 links (presumably navigation), then point each profile at its "-3" detail page
prof_links = [link.replace(".profile", "-3.profile") for link in all_links[31:-5]]

yale_profs = []
for prof in prof_links:
    g=util.dl_and_prep(prof)
    x = prof.replace("-3.profile", ".profile")
    h = util.dl_and_prep(x)
    print x
    image =""
    name =""
    title =""
    research =""
    keywords ="" 
    if len(re.findall("h3><img class=\"bordered floatrt\" src=\"(.*?)\"",h))>0:
        image = re.findall("h3><img class=\"bordered floatrt\" src=\"(.*?)\"",h)[0]
    if len(re.findall("name=\"keywords\" content=\"(.*?),(.*?),",g))>0:
        first_name = re.findall("name=\"keywords\" content=\"(.*?),(.*?),",g)[0][0]
        last_name = re.findall("name=\"keywords\" content=\"(.*?),(.*?),",g)[0][1]
        name = first_name.strip() + ' ' + last_name.strip()
        #print name
    if len(re.findall("</h1><p>(.*?)</p>",g))>0:
        title = re.findall("</h1><p>(.*?)</p>",g)[0]
        #print title
    if len(re.findall("Research Interests</h3><p>(.*?)</p>",g))>0:
        interests = re.findall("Research Interests</h3><p>(.*?)</p>",g)[0]
Example #5
import re
import pickle
import util
pat = """<tr VALIGN=TOP bgcolor='.*?'><td><a href='(.*?)'>(.*?)</a></td><td>(.*?)&nbsp;</td><td>(.*?)</td></tr>"""
results = []
for c in range(ord('A'), ord('[')):  # ord('[') is ord('Z') + 1, so this covers A through Z inclusive
  url = "http://soe.stanford.edu/research/pers_index_results.php?index=%s" % chr(c)
  doc = util.dl_and_prep(url)
  results += re.findall(pat, doc)

print len(results), 'total professors'
output = []
for prof in results:
  pd = {}
  pd['lab_website'] = 'http://soe.stanford.edu/research/%s' % prof[0]
  pd['source'] = 'http://soe.stanford.edu/research/%s' % prof[0]
  pd['name'] = prof[1]
  #extract the primary department from within the <b> tags
  if '<b>' in prof[2]:
    pd['department'] = re.findall('<b>(.*?)</b>', prof[2])[0]
  else:
    pd['department'] = util.prep_department(util.remove_tags(prof[2]))
  research = prof[3].replace('&nbsp;', '').strip()
  if len(research) > 0:
    pd['keywords'] = util.split_and_clean(research, ',')
  
  pd['school'] = 'Stanford University'
  personal_page = util.dl_and_prep(pd['lab_website'])
  summary = re.findall('<h3>Research Statement</h3><p>(.*?)</p><h3>Degrees</h3>', personal_page)
  if summary:
    pd['research_summary'] = util.html_escape(summary[0].strip())
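util.split_and_clean, util.prep_department, util.remove_tags, and util.html_escape are also unshown helpers from the author's util module. As a hedged sketch of the one driving the keyword extraction, split_and_clean presumably splits on the given separator, trims whitespace, and drops empty pieces; the real helper may behave differently:

# Hypothetical sketch of util.split_and_clean -- an assumption based on
# how it is called above, not the author's actual implementation.
def split_and_clean(text, sep):
    # split on `sep`, trim surrounding whitespace, and drop empty fragments
    return [piece.strip() for piece in text.split(sep) if piece.strip()]

# e.g. split_and_clean('robotics, control systems, ', ',')
#      -> ['robotics', 'control systems']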
Example #6
#import scrapemark
import re
import util
import pickle

pat2 = """<tr><td valign="top"><div class="name"><a href="(.*?)">(.*?)</a></div><div>(.*?)</div>.*?<img src="(.*?)" alt.*?</td>.*?<td valign="top">.*?</td>.*?<td valign="top"><div>(.*?)</div></td></tr>"""
results = []
for c in range(ord('A'), ord('Z') + 1):  # + 1 so the 'Z' page is included
  doc = util.dl_and_prep('http://people.cs.cmu.edu/Faculty/' + chr(c))
  print 'Got one'
  results += re.findall(pat2.strip(), doc)

final_dicts = [] 
for prof in results:
  pd = {}
  pd['source'] = "http://people.cs.cmu.edu" + prof[0]
  pd['name'] = prof[1][prof[1].find(',')+1:].strip() + ' ' + prof[1][:prof[1].find(',')].strip()
  pd['title'] = prof[2]
  pd['image'] = "http://people.cs.cmu.edu" + prof[3]
  pd['school'] = 'Carnegie Mellon'
  pd['department'] = 'Computer Science'
   
  research = util.research_interest_extractor(prof[4])
  if research:
    pd['keywords'] = research
  else:
    pd['research_summary'] = prof[4]
  final_dicts.append(pd)
 #website name title interests 

print 'got', len(final_dicts)
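util.research_interest_extractor is likewise not shown; from the call above it returns a list of keywords when it can pull them out of the interests cell, and something falsy otherwise, in which case the raw text is stored as research_summary. A purely hypothetical stand-in, not the author's code:

# Hypothetical stand-in for util.research_interest_extractor -- a guess
# based only on how the function is used above; the real util.py may differ.
import re

def research_interest_extractor(text):
    # treat short comma/semicolon-separated text as a keyword list;
    # return [] (falsy) so the caller falls back to a research_summary
    parts = [p.strip() for p in re.split('[,;]', text) if p.strip()]
    if parts and all(len(p) < 60 for p in parts):
        return parts
    return []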
Example #7
import re
import urllib2
import pickle
import util
all_prof_info =[]
courses = file('mitcourses.lst').read().split('\n')[:-1]
# HACK: some department pages lack the expected <title>, so fall back to this hard-coded list of names
missing_index = 0
missing_courses = ['Chemistry', 'Biological Engineering', 'Anthropology', 'Music and Theater Arts',
    'Comparative Media Studies']
for course in courses:
    url = "http://web.mit.edu/urop/research/profiles/%s.html" % course
    f = util.dl_and_prep(url)
    prof_info=re.findall("<dt><a href=\"(.*?)\"><strong>Prof\. (.*?)</strong></a>,.*?,.*?,.*?<.*?>(.*?)</a>\s*</dt>\s*<dd>(.*?)<", f)
    prof_info_alt= re.findall("<dt><strong><a href=\"(.*?)\">.*?\. (.*?)</a></strong>,.*?,.*?,.*?<.*?>(.*?)</a></dt>\s*<dd>(.*?)<",f)
    prof_info_alt2= re.findall("<dt><strong><a href=\"(.*?)\">.*?\. (.*?)</a>,</strong>.*?,.*?,.*?<.*?>(.*?)</a></dt>\s*<dd>(.*?)<",f)
    prof_info_alt3= re.findall("<dt><strong></strong><a href=\"(.*?)\">.*?\. (.*?)</a>,.*?,.*?,.*?<.*?>(.*?)</a></dt>\s*<dd>(.*?)<",f)
#    prof_info_alt4= re.findall("<dt><a href=\"(.*?)\"><strong>.*?\. (.*?)</strong></a>,.*?,.*?,.*?<.*?>(.*?)</a>\s*<strong>\(On Leave\)</strong></dt>\s*<dd>(.*?)<",f)
 #   prof_info_alt5= re.findall("<dt><strong><a href=\"(.*?)\">.*?\.(.*?)</a></strong>.*?,.*?,.*?<.*?>(.*?)</a>\s*<strong>\(On Leave\)</strong></dt>\s*<dd>(.*?)<",f)
    prof_info_alt6= re.findall("<dt><strong><a href=\"(.*?)\">.*?\. (.*?)</a></strong>,\s*.*?,\s*.*?,\s*.*?\s*<.*?>(.*?)</a></dt>\s*<dd>(.*?)<",f)
    department = re.findall("<title>MIT UROP: Current Research - (.*?):.*?</title>", f)
    if not department:
      department = [missing_courses[missing_index]]
      missing_index += 1
    course_profs = []
    #print department
    for prof in prof_info:
        course_profs.append(prof)
    for prof in prof_info_alt:
        course_profs.append(prof)
    for prof in prof_info_alt2:
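The snippet is truncated here, but the successive loops appear to pool the matches from each pattern variant into course_profs (several patterns are needed because the department pages format their <dt> entries inconsistently). An equivalent, more compact pooling would be the one-liner below; this is a sketch, not the original code:

# sketch: findall already returns lists, so the variants can simply be concatenated
course_profs = prof_info + prof_info_alt + prof_info_alt2 + prof_info_alt3 + prof_info_alt6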
Example #8
import re
import urllib
import pickle
import util

alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
alphabet_url = []
for letter in alphabet:
    alphabet_url.append("http://research.brown.edu/includes/collaborators_namesearch.php?letter=" + letter)


prof_ids = []
for url in alphabet_url:
    f = util.dl_and_prep(url)
    prof_links = re.findall('</option><option value="(\d*)">', f)
    for prof_link in prof_links:
        prof_ids.append(prof_link)
# print prof_ids
# print len(prof_ids)

prof_profiles = []
for value in prof_ids:
    prof_profiles.append("http://research.brown.edu/research/profile.php?id=" + value)
# print prof_profiles

brown_profs = []

for prof in prof_profiles:
    first_name = ""
    last_name = ""
    title = ""
Example #9
import re
import urllib
import pickle
import util

all_prof_links =[]
for num in range(0,48):
    f = util.dl_and_prep("http://vcresearch.berkeley.edu/faculty-expertise?page=%s&name=&expertise_area=&term_node_tid_depth=" % num)
    prof_links = ["http://vcresearch.berkeley.edu" + link for link in re.findall("class=\"views-field views-field-title\">.*?<a href=\"(.*?)\"",f)]
    for prof_link in prof_links:
        all_prof_links.append(prof_link)

#print len(all_prof_links)
berkeley_profs = []
#import pdb; pdb.set_trace()
for prof in all_prof_links:
    name=''
    image=''
    title=''
    url=''
    email=''
    keywords=[]
    summary=''
    department=''
    g = util.dl_and_prep(prof)
    if len(re.findall("<h1 class=\"title\" id=\"page-title\">(.*?)</h1>",g))>0:
        name = re.findall("<h1 class=\"title\" id=\"page-title\">(.*?)</h1>",g)[0]
        #print name

    if len(re.findall("field field-type-filefield field-field-faculty-image\">.*?odd\">.*?img src=\"(.*?)\"",g))>0:
        image = re.findall("field field-type-filefield field-field-faculty-image\">.*?odd\">.*?img src=\"(.*?)\"",g)[0]