import os

import yaledirectory

# Credentials are read from the environment.
session_cookie = os.environ['PEOPLE_SEARCH_SESSION']
csrf_token = os.environ['CSRF_TOKEN']

# "api" name can be whatever is most convenient for your program
api = yaledirectory.API(session_cookie, csrf_token)

# Look up a single person by NetID.
erik = api.person(netid='ekb33')
print(erik.email)

# Fetch name-pronunciation data for an email address.
pronunciation = api.pronounce(email='*****@*****.**')
print(pronunciation.recording_url)
print(pronunciation.phonetic_spelling)

# Search by first name, filtered by school code.
dylans = api.people(first_name='Dylan', school='YC')
for person in dylans:
    print(f'{person.display_name} is in {person.residential_college_name}')

# Free-text search, filtered by department.
johns = api.people(search_term='John', department='Public Health')
print('Found at least %d people named John in Public Health department.'
      % len(johns))

# With include_total=True the call returns a (results, total) pair.
print('Performing a broad request with total included')
matches, total = api.people(netid='e', include_total=True)
print('There are %d matching people, but only %d were returned this time.'
      % (total, len(matches)))
def __init__(self, cache, people_search_session_cookie, csrf_token):
    """Initialise the parent class with ``cache`` and create the directory client.

    ``people_search_session_cookie`` and ``csrf_token`` are handed straight
    to ``yaledirectory.API``; the resulting client is kept on
    ``self.directory``.
    """
    super().__init__(cache)
    directory_client = yaledirectory.API(
        people_search_session_cookie,
        csrf_token,
    )
    self.directory = directory_client
def scrape(face_book_cookie, people_search_session_cookie, csrf_token):
    """Scrape the Face Book and the directory, then rebuild the Person table.

    Steps, in order:

    1. Parse every person container found in the Face Book HTML.
    2. Enrich each person with their directory entry and hosted photo.
    3. Run the ``compare_years`` checks for 'pre2020' and 'fall2020'.
    4. Add every directory entry whose NetID has not been seen yet.
    5. Merge in records from the departmental scraper, matched by email
       first and by name as a fallback.
    6. Delete all existing ``Person`` rows and insert the new ones.

    Args:
        face_book_cookie: Passed to ``get_html`` and sent as the ``Cookie``
            header when downloading Face Book photos.
        people_search_session_cookie: First argument to ``yaledirectory.API``.
        csrf_token: Second argument to ``yaledirectory.API``.

    Returns:
        None. Returns early, without touching the database, when the
        Face Book page yields no person containers.
    """
    # Uncomment for quick testing
    # directory = yaledirectory.API(people_search_session_cookie, csrf_token)
    # people = []
    # directory_entries = read_directory(directory, 'aa')
    # for entry in directory_entries:
    #     print('Parsing directory entry with NetID ' + entry.netid)
    #     person = add_directory_to_person({}, entry)
    #     people.append(person)

    html = get_html(face_book_cookie)
    tree = get_tree(html)
    containers = get_containers(tree)

    if not containers:
        print(
            'No people were found on this page. There may be something wrong with authentication, aborting.'
        )
        return

    directory = yaledirectory.API(people_search_session_cookie, csrf_token)
    watermark_mask = Image.open('app/res/watermark_mask.png')

    image_uploader = ImageUploader()
    print('Already hosting {} images.'.format(len(image_uploader.files)))

    # Maps email address -> index of that person in `people`.
    emails = {}
    people = []

    for container in containers:
        person = {
            'school': 'Yale College',
            'school_code': 'YC',
        }

        person['last_name'], person['first_name'] = clean_name(
            container.find('h5', {'class': 'yalehead'}).text)
        person['year'] = clean_year(
            container.find('div', {'class': 'student_year'}).text)
        pronoun = container.find('div', {'class': 'student_info_pronoun'}).text
        person['pronoun'] = pronoun if pronoun else None

        info = container.find_all('div', {'class': 'student_info'})

        person['college'] = info[0].text.replace(' College', '')
        try:
            person['email'] = info[1].find('a').text
        except AttributeError:
            # No <a> element in this block; the email may be filled in
            # from the directory entry further down.
            pass
            #person.email = guess_email(person)

        trivia = info[1].find_all(text=True, recursive=False)
        try:
            # Optional leading room row, then optional trailing birthday
            # and major rows. Running out of rows raises IndexError, which
            # simply ends this parsing step.
            room = trivia.pop(0) if RE_ROOM.match(trivia[0]) else None
            if room:
                person['residence'] = room
                result = RE_ROOM.search(room)
                (person['building_code'], person['entryway'],
                 person['floor'], person['suite'],
                 person['room']) = result.groups()
            person['birthday'] = (trivia.pop()
                                  if RE_BIRTHDAY.match(trivia[-1]) else None)
            person['major'] = trivia.pop() if trivia[-1] in MAJORS else None
            if person['major'] and person['major'] in MAJOR_FULL_NAMES:
                person['major'] = MAJOR_FULL_NAMES[person['major']]
        except IndexError:
            pass

        # Rows ending in ' /' carry an access code or phone number and are
        # consumed here; every other row is kept as an address line.
        new_trivia = []
        for raw_row in trivia:
            row = raw_row.strip()
            if row.endswith(' /'):
                row = row.rstrip(' /')
                if RE_ACCESS_CODE.match(row):
                    person['access_code'] = row
                if RE_PHONE.match(row):
                    person['phone'] = clean_phone(row)
                # A single row collected before this one is taken as the
                # residence when none has been found yet.
                if len(new_trivia) == 1 and not person.get('residence'):
                    person['residence'] = new_trivia.pop(0)
            else:
                new_trivia.append(row)
        trivia = new_trivia

        # Handle first row of address being duplicated for residence
        if (len(trivia) >= 2 and trivia[0] == trivia[1]
                and not person.get('residence')):
            person['residence'] = trivia.pop(0)

        person['address'] = '\n'.join(trivia)

        person['leave'] = False
        person['eli_whitney'] = False

        directory_entry = get_directory_entry(directory, person)
        if directory_entry is not None:
            person['netid'] = directory_entry.netid
            person['upi'] = directory_entry.upi
            if not person.get('email'):
                person['email'] = directory_entry.email
            if (not person.get('year')
                    and directory_entry.student_expected_graduation_year):
                person['year'] = int(
                    directory_entry.student_expected_graduation_year)
                # This may not always be the case. But it's probably a safe bet.
                person['eli_whitney'] = True
            person = add_directory_to_person(person, directory_entry)
        else:
            print('Could not find directory entry.')

        image_id = clean_image_id(container.find('img')['src'])
        if image_id:
            image_filename = image_uploader.get_image_filename(
                image_id, person)
            if image_filename in image_uploader.files:
                person['image'] = image_uploader.get_file_url(image_filename)
            else:
                print('Image has not been processed yet.')
                image_r = requests.get(
                    'https://students.yale.edu/facebook/Photo?id='
                    + str(image_id),
                    headers={
                        'Cookie': face_book_cookie,
                    },
                    stream=True)
                image_r.raw.decode_content = True
                try:
                    im = Image.open(image_r.raw)

                    # Paste mask over watermark
                    im.paste(watermark_mask, (0, 0), watermark_mask)

                    output = BytesIO()
                    im.save(output, format='JPEG', mode='RGB')

                    person['image'] = image_uploader.upload_image(
                        output, image_filename)
                except OSError:
                    # "Cannot identify image" error
                    print('PIL could not identify image.')

        # Record the index before appending so it points at this person.
        if person.get('email'):
            emails[person['email']] = len(people)
        people.append(person)

    # Check leaves
    # NOTE(review): assumes compare_years keeps the indices stored in
    # `emails` valid for the list it returns -- confirm.
    people = compare_years('pre2020', people, emails)
    people = compare_years('fall2020', people, emails)

    # Fetch non-undergrad users by iterating netids
    # Get set of netids for students we've already processed
    checked_netids = {
        person_dict.get('netid')
        for person_dict in people if 'netid' in person_dict
    }
    directory_entries = read_directory(directory)
    for entry in directory_entries:
        if entry.netid in checked_netids:
            continue
        print('Parsing directory entry with NetID ' + entry.netid)
        checked_netids.add(entry.netid)
        person = add_directory_to_person({}, entry)
        # `emails` is a dict, so register the index by key; entries
        # without an email are still added to `people`.
        if person.get('email'):
            emails[person['email']] = len(people)
        people.append(person)

    # Add data from departmental scraper
    departmental = Departmental()
    department_people = departmental.scrape()
    for record in department_people:
        person_i = None
        if record.get('email'):
            person_i = emails.get(record['email'])
        # Compare against None explicitly: index 0 is a valid match.
        if person_i is None:
            for i, candidate in enumerate(people):
                if name_matches(candidate, record['name']):
                    person_i = i
                    break

        # Add in data if we found a match
        if person_i is not None:
            people[person_i] = add_departmental_to_person(
                people[person_i], record)

    # Store people into database
    Person.query.delete()
    for person_dict in people:
        # Drop empty values, but keep explicit False flags.
        fields = {
            k: v
            for k, v in person_dict.items() if v or isinstance(v, bool)
        }
        db.session.add(Person(**fields))
    db.session.commit()
    print('Done.')