def getFullQueryResults(query, qualifier='lastname'):
    """
    Returns a DOM tree element object from http://www.buffalo.edu/directory/
    with the full list of results.

    Normally a query to the directory will return results in pages of 10
    entries each, which is inconvenient to scrape. Hence, this function makes
    an initial request, reads the quantity of results from it, then makes a
    second request that returns the full list of results on a single page.

    Parameters:
        query: the search term, forwarded to getQueryPageTree
        qualifier: forwarded to getQueryPageTree as `qualifier`
            (defaults to 'lastname')

    Returns:
        Whatever getQueryPageTree returns for the second request, made with
        `perpage` set to the total result count.

    Raises:
        NameError: if the first response contains a `p.bodyalt` element,
            which this code treats as the "no results" marker. The exception
            carries the same message that is printed.
    """
    # Make first request
    firstQueryTree = getQueryPageTree(query, qualifier=qualifier)

    # See if query results in any records. If not, raise NameError
    if selector('p.bodyalt')(firstQueryTree):
        message = "No results found for %s '%s'" % (qualifier, query)
        # Single-argument call form prints identically on Python 2 and 3
        print(message)
        raise NameError(message)

    # Get number of entries: the count is taken from the last
    # whitespace-separated token of the `p.result_count` element's text
    resultCountElement = selector('p.result_count')(firstQueryTree)[0]
    resultCount = int(resultCountElement.text.split()[-1])

    # Get full list response
    return getQueryPageTree(query, perpage=resultCount, qualifier=qualifier)
def getUBIT(targetName, targetMajor):
    """
    Returns a dict of students' info, keyed by their UBIT name, based on name
    and major criteria.

    This function makes the query request to UB Directory (searching by the
    last whitespace-separated token of `targetName`), parses the DOM tree,
    and loops over the entries, filtering by major and name matching.

    Not sure how reliable the returned data is, based on name matching. Some
    of the names provided by the input file may not completely match the
    names from the entries on the UB Directory, due to personal updates
    introducing nicknames, spelling corrections, or other unpredictable
    variations in name.

    It also returns all likely matches, instead of one sure match every time.
    This is due to the fact that shared surnames might exist in the same
    major.

    Parameters:
        targetName: full name of the person to look up; the last token is
            used as the surname
        targetMajor: compared case-insensitively against each entry's
            'Department' field

    Returns:
        dict mapping UBIT name -> candidate dict. Each candidate dict is the
        result of dl2dict on the entry's `dl.item_info`, with 'Name', 'UBIT'
        and 'Email' set by this function.

    Raises:
        KeyError: if no viable match is discovered (including when
            `targetName` is empty or only whitespace).
        NameError: propagated from getFullQueryResults when the directory
            reports no results for the surname.
    """
    # Handle target information. Only the surname is needed: it drives both
    # the directory query and the name filter below.
    nameTokens = targetName.split()
    if not nameTokens:
        # No surname to search for; report it like any other failed lookup
        raise KeyError("No viable candidates found for %s" % targetName)
    lastName = nameTokens[-1]

    # Loop-invariant, case-folded comparison keys
    lastNameLower = lastName.lower()
    majorLower = targetMajor.lower()

    # Make list of `li` entries from UB directory query
    queryResultsTree = getFullQueryResults(lastName)
    entryList = selector('ul.content_list')(queryResultsTree)[0]
    entryListElements = entryList.findall('li')

    # Loop over `li`s and check each entry for match with target info
    candidates = {}
    for liElement in entryListElements:
        # Get the 'item_info' `dl` from the entry and turn it into a dict for
        # easy access
        descriptionListElement = selector('dl.item_info')(liElement)[0]
        candidate = dl2dict(descriptionListElement)

        # The entry's heading link carries both the display name (its text)
        # and the UBIT name (in its href)
        nameLink = liElement.find('h3').find('a')

        # Add their name (an empty link has text None)
        candidateName = (nameLink.text or '').strip()
        candidate['Name'] = candidateName

        # Add their UBIT: last path segment of the href, with anything from
        # the first '&' or '?' onwards dropped
        queryPart = nameLink.get('href').split('/')[-1]
        candidateUBIT = queryPart.split('&')[0].split('?')[0]
        candidate['UBIT'] = candidateUBIT

        # Update their email
        candidate['Email'] = candidateUBIT + '@buffalo.edu'

        # Ignore candidates not belonging to the right department. An entry
        # with no 'Department' field is skipped rather than raising a
        # KeyError that callers would mistake for "no candidates found".
        if (candidate.get('Department') or '').lower() != majorLower:
            continue

        # ...and those whose last name does not exist in their entry name
        if lastNameLower not in candidateName.lower():
            continue

        # NOTE(review): first names are deliberately not checked here,
        # presumably because directory entries may use nicknames (see the
        # docstring) - confirm before enabling such a filter.

        # Candidate made it through tests, add them
        candidates[candidateUBIT] = candidate

    if not candidates:
        raise KeyError("No viable candidates found for %s" % targetName)
    return candidates