pd['source'] = 'http://soe.stanford.edu/research/%s' % prof[0] pd['name'] = prof[1] #extract the primary deptmartment from within the <b> tags if '<b>' in prof[2]: pd['department'] = re.findall('<b>(.*?)</b>', prof[2])[0] else: pd['department'] = util.prep_department(util.remove_tags(prof[2])) research = prof[3].replace(' ', '').strip() if len(research) > 0: pd['keywords'] = util.split_and_clean(research, ',') pd['school'] = 'Stanford University' personal_page = util.dl_and_prep(pd['lab_website']) summary = re.findall('<h3>Research Statement</h3><p>(.*?)</p><h3>Degrees</h3>', personal_page) if summary: pd['research_summary'] = util.html_escape(summary[0].strip()) try: pd['image'] = 'http://soe.stanford.edu/research/%s' % re.findall('\'(images/photos_faculty_staff/.*?)\'', personal_page)[0] except Exception: import pdb; pdb.set_trace() pd['title'] = re.findall("Title:</td><td class=\"data\">(.*?)</td>", personal_page)[0] personal_website = re.findall("URL:</TD><TD class=\"data\"><a href='(.*?)'", personal_page) if personal_website: pd['personal_website'] = personal_website[0] print pd['name'], pd['department'] util.validate_professor(pd) output.append(pd) pickle.dump(output, file('prof_dicts/stanford.dat', 'w')) print 'Done!'
for prof in course_profs: prof = list(prof) prof.append(department[0]) prof.append(url) mod_profs.append(tuple(prof)) all_prof_info += mod_profs prof_dictionary_list=[] for prof in all_prof_info: if any('<' in p for p in prof): continue prof_dictionary={} prof_dictionary['name']=prof[1] prof_dictionary['personal_website']=prof[0] prof_dictionary['email']=prof[2] research = util.research_interest_extractor(prof[3]) if research: prof_dictionary['keywords'] = research else: prof_dictionary['research_summary'] = prof[3] prof_dictionary['school'] = 'MIT' prof_dictionary['department'] = util.prep_department(prof[4]) prof_dictionary['source'] = prof[5] print prof[5] util.validate_professor(prof_dictionary) prof_dictionary_list.append(prof_dictionary) pickle.dump(prof_dictionary_list, file('prof_dicts/mit.dat', 'w'))