Exemple #1
0
  # Body of the per-professor loop: fill in the dictionary `pd` from the
  # listing row `prof` and from the professor's own page, then validate it
  # and add it to `output`.
  # Row fields as indexed below: prof[0] = relative page URL, prof[1] = name,
  # prof[2] = department markup, prof[3] = research-interest text.
  pd['source'] = 'http://soe.stanford.edu/research/%s' % prof[0]
  pd['name'] = prof[1]
  # Extract the primary department from within the <b> tags.
  # NOTE(review): this branch stores the raw regex capture, while the else
  # branch runs the value through util.prep_department -- confirm the
  # asymmetry is intended.
  if '<b>' in prof[2]:
    pd['department'] = re.findall('<b>(.*?)</b>', prof[2])[0]
  else:
    pd['department'] = util.prep_department(util.remove_tags(prof[2]))
  # Drop &nbsp; entities and surrounding whitespace; 'keywords' is only set
  # when something is left, split on commas by util.split_and_clean.
  research = prof[3].replace('&nbsp;', '').strip()
  if len(research) > 0:
    pd['keywords'] = util.split_and_clean(research, ',')

  pd['school'] = 'Stanford University'
  # NOTE(review): pd['lab_website'] is not assigned anywhere in the visible
  # part of this loop body; presumably it is set before this point --
  # confirm, otherwise this lookup raises KeyError.
  personal_page = util.dl_and_prep(pd['lab_website'])
  # Capture the paragraph between the "Research Statement" and "Degrees"
  # headings. No re.DOTALL is passed, so '.' does not match newlines;
  # presumably util.dl_and_prep returns the page without line breaks --
  # TODO confirm.
  summary = re.findall('<h3>Research Statement</h3><p>(.*?)</p><h3>Degrees</h3>', personal_page)
  if summary:
    pd['research_summary'] = util.html_escape(summary[0].strip())
  # The photo path is the first single-quoted 'images/photos_faculty_staff/...'
  # string on the page, made absolute with the same base URL as 'source'.
  # NOTE(review): on any failure (e.g. IndexError when no photo is found) this
  # drops into the interactive debugger -- looks like a debugging leftover that
  # will stall an unattended run; pd['image'] stays unset afterwards.
  try:
    pd['image'] = 'http://soe.stanford.edu/research/%s' % re.findall('\'(images/photos_faculty_staff/.*?)\'', personal_page)[0]
  except Exception:
    import pdb; pdb.set_trace()
  # NOTE(review): unlike the optional fields around it, the title is indexed
  # without a guard, so a page with no "Title:" cell raises IndexError here.
  pd['title'] = re.findall("Title:</td><td class=\"data\">(.*?)</td>", personal_page)[0]
  # The URL cell is matched with upper-case TD tags, unlike the lower-case
  # pattern used for the title above; 'personal_website' is optional.
  personal_website = re.findall("URL:</TD><TD class=\"data\"><a href='(.*?)'", personal_page)
  if personal_website:
    pd['personal_website'] = personal_website[0]
  # Progress output (Python 2 print statement).
  print pd['name'], pd['department']
  util.validate_professor(pd)
  output.append(pd)

# Persist the collected professor dictionaries. The file is opened in a
# `with` block so the handle is flushed and closed even if pickling fails,
# and in binary mode, which is what pickle expects on every platform
# (the Python 2-only `file()` builtin in text mode was used before).
with open('prof_dicts/stanford.dat', 'wb') as out_file:
    pickle.dump(output, out_file)
# Parenthesised single-argument form prints the same text under Python 2.
print('Done!')
Exemple #2
0
    # Tag every scraped row with the department name and the page URL it
    # came from. Each row is rebuilt as a new tuple with the two extra
    # fields appended at the end.
    for prof in course_profs:
      mod_profs.append(tuple(prof) + (department[0], url))

    # Fold the tagged rows into the running list of all professors.
    all_prof_info += mod_profs

# Turn every scraped MIT row into a professor dictionary, validate it, and
# pickle the resulting list to prof_dicts/mit.dat.
# Row layout as indexed below:
#   prof[0] personal website, prof[1] name, prof[2] email,
#   prof[3] research text, prof[4] department, prof[5] source URL.
prof_dictionary_list = []
for prof in all_prof_info:
    # Skip rows in which any field still contains a '<' character
    # (presumably leftover HTML markup from the scrape -- confirm).
    if any('<' in p for p in prof):
        continue

    prof_dictionary = {}
    prof_dictionary['name'] = prof[1]
    prof_dictionary['personal_website'] = prof[0]
    prof_dictionary['email'] = prof[2]

    # Store whatever util.research_interest_extractor returns as 'keywords'
    # when it is truthy; otherwise keep the raw text as 'research_summary'.
    research = util.research_interest_extractor(prof[3])
    if research:
        prof_dictionary['keywords'] = research
    else:
        prof_dictionary['research_summary'] = prof[3]

    prof_dictionary['school'] = 'MIT'
    prof_dictionary['department'] = util.prep_department(prof[4])
    prof_dictionary['source'] = prof[5]
    # Progress output; the parenthesised single-argument form prints the
    # same text under Python 2.
    print(prof[5])
    util.validate_professor(prof_dictionary)
    prof_dictionary_list.append(prof_dictionary)

# Open the output in a `with` block so the handle is flushed and closed even
# if pickling fails, and in binary mode, which is what pickle expects on
# every platform (the Python 2-only `file()` builtin in text mode was used
# before).
with open('prof_dicts/mit.dat', 'wb') as out_file:
    pickle.dump(prof_dictionary_list, out_file)