# Tag every scraped professor tuple with its department and source URL.
# NOTE(review): `course_profs`, `department`, `url`, `mod_profs` and
# `all_prof_info` are bound before this excerpt; these two statements look like
# the tail of a per-department loop -- confirm their nesting against the full
# file.
mod_profs.extend(
    tuple(prof) + (department[0], url) for prof in course_profs
)
all_prof_info += mod_profs

# Convert the raw tuples into professor dictionaries.
# Tuple layout as used below: (personal_website, name, email, research text,
# department, source url).
prof_dictionary_list = []
for prof in all_prof_info:
    # Skip any record in which a field contains '<' (presumably leftover
    # HTML from an imperfect regex capture).
    if any('<' in p for p in prof):
        continue

    prof_dictionary = {
        'name': prof[1],
        'personal_website': prof[0],
        'email': prof[2],
    }

    # Store extracted keywords when the extractor returns something truthy;
    # otherwise keep the raw research text as the summary.
    research = util.research_interest_extractor(prof[3])
    if research:
        prof_dictionary['keywords'] = research
    else:
        prof_dictionary['research_summary'] = prof[3]

    prof_dictionary['school'] = 'MIT'
    prof_dictionary['department'] = util.prep_department(prof[4])
    prof_dictionary['source'] = prof[5]
    # Progress output; single-argument form prints the same on Python 2 and 3.
    print(prof[5])

    util.validate_professor(prof_dictionary)
    prof_dictionary_list.append(prof_dictionary)

# Pickles are byte streams: write in binary mode and close the handle
# deterministically instead of leaking an anonymous file object.
with open('prof_dicts/mit.dat', 'wb') as out_file:
    pickle.dump(prof_dictionary_list, out_file)
# Fetch one page of the Stanford SoE faculty index and collect its matches.
# NOTE(review): `c`, `pat` and `results` are bound before this excerpt; these
# three statements look like the body of a loop over index letters -- confirm
# their nesting against the full file.
url = "http://soe.stanford.edu/research/pers_index_results.php?index=%s" % chr(c)
doc = util.dl_and_prep(url)
results += re.findall(pat, doc)

# Same output as the original two-item print statement, valid on Python 2 and 3.
print('%d total professors' % len(results))

output = []
for prof in results:
    # prof is a regex match tuple: (relative profile path, name,
    # department markup, research keywords text).
    pd = {}
    pd['lab_website'] = 'http://soe.stanford.edu/research/%s' % prof[0]
    pd['source'] = 'http://soe.stanford.edu/research/%s' % prof[0]
    pd['name'] = prof[1]

    # Extract the primary department from within the <b> tags.
    if '<b>' in prof[2]:
        pd['department'] = re.findall('<b>(.*?)</b>', prof[2])[0]
    else:
        pd['department'] = util.prep_department(util.remove_tags(prof[2]))

    # NOTE(review): as written this removes every space before splitting on
    # commas; the literal may originally have been an HTML entity or a
    # non-breaking space -- confirm against the original file.
    research = prof[3].replace(' ', '').strip()
    if research:
        pd['keywords'] = util.split_and_clean(research, ',')

    pd['school'] = 'Stanford University'

    # The individual profile page supplies the optional summary and photo,
    # plus the title and personal URL.
    personal_page = util.dl_and_prep(pd['lab_website'])

    summary = re.findall('<h3>Research Statement</h3><p>(.*?)</p><h3>Degrees</h3>', personal_page)
    if summary:
        pd['research_summary'] = util.html_escape(summary[0].strip())

    # A profile without a photo simply gets no 'image' key. The previous
    # version dropped into pdb here, which stalls an unattended run.
    photos = re.findall("'(images/photos_faculty_staff/.*?)'", personal_page)
    if photos:
        pd['image'] = 'http://soe.stanford.edu/research/%s' % photos[0]

    # Raises IndexError if the profile page has no "Title:" row.
    pd['title'] = re.findall("Title:</td><td class=\"data\">(.*?)</td>", personal_page)[0]
    personal_website = re.findall("URL:</TD><TD class=\"data\"><a href='(.*?)'", personal_page)
# Parse one Brown researcher page into local fields.
# NOTE(review): `prof` (the page URL) is bound before this excerpt, and
# first_name / last_name / title get no default here; if they are not
# initialised earlier, a page lacking those keys raises NameError below or
# reuses a previous value -- confirm against the full file.
department = ""
email = ""
bio = ""
summary = ""
interests = ""
image = ""

g = util.dl_and_prep(prof)

# The page text contains "[key] => value" pairs. Each field is searched once;
# re.search(...).group(1) is the same value as re.findall(...)[0].
# Single-token fields capture up to the first whitespace character.
match = re.search(r"\[firstname\] => (.*?)\s", g)
if match:
    first_name = match.group(1)

match = re.search(r"\[lastname\] => (.*?)\s", g)
if match:
    last_name = match.group(1)

# Free-text fields capture the remainder of the line, and only match when the
# next non-blank text starts with "[".
match = re.search(r"\[title\] => (.*?)\n\s*\[", g)
if match:
    title = match.group(1)

match = re.search(r"\[department\] => (.*?)\n\s*\[", g)
if match:
    # NOTE(review): normalisation and the debug print are assumed to belong
    # inside this branch; the collapsed source does not show their nesting.
    department = util.prep_department(match.group(1))
    print(department)

match = re.search(r"\[email\] => (.*?)\s", g)
if match:
    email = match.group(1)

match = re.search(r"\[bio\] => (.*?)\n\s*\[", g)
if match:
    bio = match.group(1)

match = re.search(r"\[summary\] => (.*?)\n\s*\[", g)
if match:
    summary = match.group(1)

match = re.search(r"\[interests\] => (.*?)\n\s*\[", g)
if match:
    interests = match.group(1)

# First <img> after the rightColumn marker; the src is appended to the host.
match = re.search(r'class=rightColumn>.*?<img src="(.*?)"', g)
if match:
    image = "http://research.brown.edu" + match.group(1)

name = first_name + " " + last_name
brown_prof_dict = {}
brown_prof_dict["name"] = name