Esempio n. 1
0
import yaledirectory
import os

# "api" name can be whatever is most convenient for your program
api = yaledirectory.API(
    os.environ['PEOPLE_SEARCH_SESSION'],
    os.environ['CSRF_TOKEN'],
)

# Look up a single person by NetID and show their email address.
person_by_netid = api.person(netid='ekb33')
print(person_by_netid.email)

# Request pronunciation data for an email address.
name_audio = api.pronounce(email='*****@*****.**')
print(name_audio.recording_url)
print(name_audio.phonetic_spelling)

# Search with structured filters and report each match's college.
for match in api.people(first_name='Dylan', school='YC'):
    print(f'{match.display_name} is in {match.residential_college_name}')

# Combine a free-text search term with a department filter.
johns = api.people(search_term='John', department='Public Health')
john_count = len(johns)
print('Found at least %d people named John in Public Health department.' %
      john_count)

# With include_total=True the call is unpacked into (results, total).
print('Performing a broad request with total included')
matches, total = api.people(netid='e', include_total=True)
summary = 'There are %d matching people, but only %d were returned this time.'
print(summary % (total, len(matches)))
Esempio n. 2
0
 def __init__(self, cache, people_search_session_cookie, csrf_token):
     """Initialise the parent with ``cache`` and create a directory client.

     The client is built from the people-search session cookie and the CSRF
     token, and is stored on ``self.directory``.
     """
     super().__init__(cache)
     directory_client = yaledirectory.API(
         people_search_session_cookie,
         csrf_token,
     )
     self.directory = directory_client
Esempio n. 3
0
def scrape(face_book_cookie, people_search_session_cookie, csrf_token):
    """Scrape the face book, directory and departmental data into the database.

    Parses every student container found on the face book page, enriches each
    person with their directory entry and a hosted image, appends people who
    only appear in the directory, merges in departmental records and finally
    replaces all stored ``Person`` rows with the freshly scraped data.

    Args:
        face_book_cookie: Cookie used for face book requests (passed to
            ``get_html`` and sent as the ``Cookie`` header for photos).
        people_search_session_cookie: Session cookie passed to
            ``yaledirectory.API``.
        csrf_token: CSRF token passed to ``yaledirectory.API``.

    Returns:
        None. Returns early, without touching the database, when the face
        book page yields no people.
    """
    # Uncomment for quick testing
    # directory = yaledirectory.API(people_search_session_cookie, csrf_token)
    # people = []
    # directory_entries = read_directory(directory, 'aa')
    # for entry in directory_entries:
    #     print('Parsing directory entry with NetID ' + entry.netid)
    #     person = add_directory_to_person({}, entry)
    #     people.append(person)

    html = get_html(face_book_cookie)
    tree = get_tree(html)
    containers = get_containers(tree)

    if len(containers) == 0:
        print(
            'No people were found on this page. There may be something wrong with authentication, aborting.'
        )
        return

    directory = yaledirectory.API(people_search_session_cookie, csrf_token)
    watermark_mask = Image.open('app/res/watermark_mask.png')

    image_uploader = ImageUploader()
    print('Already hosting {} images.'.format(len(image_uploader.files)))

    # Maps an email address to the index of its owner inside ``people``.
    emails = {}
    people = []

    for container in containers:
        person = {
            'school': 'Yale College',
            'school_code': 'YC',
        }

        person['last_name'], person['first_name'] = clean_name(
            container.find('h5', {
                'class': 'yalehead'
            }).text)
        person['year'] = clean_year(
            container.find('div', {
                'class': 'student_year'
            }).text)
        pronoun = container.find('div', {'class': 'student_info_pronoun'}).text
        person['pronoun'] = pronoun if pronoun else None

        info = container.find_all('div', {'class': 'student_info'})

        person['college'] = info[0].text.replace(' College', '')
        try:
            person['email'] = info[1].find('a').text
        except AttributeError:
            # No email link in this container; the directory entry looked up
            # below may still supply one.
            pass
        trivia = info[1].find_all(text=True, recursive=False)
        try:
            # Optional leading room row, then optional trailing birthday and
            # major rows. Running out of rows raises IndexError, which simply
            # ends this positional parsing.
            room = trivia.pop(0) if RE_ROOM.match(trivia[0]) else None
            if room:
                person['residence'] = room
                result = RE_ROOM.search(room)
                (person['building_code'], person['entryway'], person['floor'],
                 person['suite'], person['room']) = result.groups()
            person['birthday'] = trivia.pop() if RE_BIRTHDAY.match(
                trivia[-1]) else None
            person['major'] = trivia.pop() if trivia[-1] in MAJORS else None
            if person['major'] and person['major'] in MAJOR_FULL_NAMES:
                person['major'] = MAJOR_FULL_NAMES[person['major']]
        except IndexError:
            pass

        # Rows ending in ' /' carry an access code or phone number; all other
        # rows are kept as address lines.
        new_trivia = []
        for raw_row in trivia:
            row = raw_row.strip()
            if row.endswith(' /'):
                row = row.rstrip(' /')
                if RE_ACCESS_CODE.match(row):
                    person['access_code'] = row
                if RE_PHONE.match(row):
                    person['phone'] = clean_phone(row)
                if len(new_trivia) == 1 and not person.get('residence'):
                    person['residence'] = new_trivia.pop(0)
            else:
                new_trivia.append(row)
        trivia = new_trivia

        # Handle first row of address being duplicated for residence
        if len(trivia) >= 2 and trivia[0] == trivia[1] and not person.get(
                'residence'):
            person['residence'] = trivia.pop(0)

        person['address'] = '\n'.join(trivia)

        person['leave'] = False
        person['eli_whitney'] = False

        directory_entry = get_directory_entry(directory, person)
        if directory_entry is not None:
            person['netid'] = directory_entry.netid
            person['upi'] = directory_entry.upi
            if not person.get('email'):
                person['email'] = directory_entry.email
            if not person.get(
                    'year'
            ) and directory_entry.student_expected_graduation_year:
                person['year'] = int(
                    directory_entry.student_expected_graduation_year)
                # This may not always be the case. But it's probably a safe bet.
                person['eli_whitney'] = True
            person = add_directory_to_person(person, directory_entry)
        else:
            print('Could not find directory entry.')

        image_id = clean_image_id(container.find('img')['src'])
        if image_id:
            image_filename = image_uploader.get_image_filename(
                image_id, person)
            if image_filename in image_uploader.files:
                person['image'] = image_uploader.get_file_url(image_filename)
            else:
                print('Image has not been processed yet.')
                image_r = requests.get(
                    'https://students.yale.edu/facebook/Photo?id=' +
                    str(image_id),
                    headers={
                        'Cookie': face_book_cookie,
                    },
                    stream=True)
                image_r.raw.decode_content = True
                try:
                    im = Image.open(image_r.raw)

                    # Paste mask over watermark
                    im.paste(watermark_mask, (0, 0), watermark_mask)

                    output = BytesIO()
                    im.save(output, format='JPEG', mode='RGB')

                    person['image'] = image_uploader.upload_image(
                        output, image_filename)
                except OSError:
                    # "Cannot identify image" error
                    print('PIL could not identify image.')

        # Record the index this person is about to occupy in ``people``.
        if person.get('email'):
            emails[person['email']] = len(people)
        people.append(person)

    # Check leaves
    # NOTE(review): the email index built above stays valid only if
    # compare_years keeps existing people at their positions - confirm.
    people = compare_years('pre2020', people, emails)
    people = compare_years('fall2020', people, emails)

    # Fetch non-undergrad users by iterating netids
    # Get set of netids for students we've already processed
    checked_netids = {
        person_dict.get('netid')
        for person_dict in people if 'netid' in person_dict
    }
    directory_entries = read_directory(directory)
    for entry in directory_entries:
        if entry.netid not in checked_netids:
            print('Parsing directory entry with NetID ' + entry.netid)
            checked_netids.add(entry.netid)
            person = add_directory_to_person({}, entry)
            # ``emails`` is a dict (email -> index), so register the new
            # person the same way as above; entries without an email are
            # simply not indexed.
            if person.get('email'):
                emails[person['email']] = len(people)
            people.append(person)

    # Add data from departmental scraper
    departmental = Departmental()
    department_people = departmental.scrape()
    for record in department_people:
        person_i = None
        if record.get('email'):
            person_i = emails.get(record['email'])
        # Compare against None explicitly: index 0 is a valid match.
        if person_i is None:
            for i, person in enumerate(people):
                if name_matches(person, record['name']):
                    person_i = i
                    break

        # Add in data if we found a match
        if person_i is not None:
            # Merge into the matched person (not a stale loop variable) and
            # store the result back so it reaches the database.
            people[person_i] = add_departmental_to_person(
                people[person_i], record)

    # Store people into database
    Person.query.delete()
    for person_dict in people:
        # Drop empty values but keep booleans, so False flags are stored.
        db.session.add(
            Person(
                **{
                    k: v
                    for k, v in person_dict.items()
                    if v or isinstance(v, bool)
                }))
    db.session.commit()
    print('Done.')