Python selector Examples, lxml.cssselect.selector Python Examples

Example #1

0

Show file

File: ubdir_scraper.py Project: jackma93/Commencement-System-Firmware

def getFullQueryResults(query, qualifier='lastname'):
    """
    Return the DOM tree of a UB Directory query with every result on one page.

    The directory (http://www.buffalo.edu/directory/) normally returns its
    results in pages of 10 entries each, which is inconvenient to scrape.
    This function therefore makes an initial request, reads the total result
    count from it, then makes a second request with `perpage` set to that
    count so the full list comes back at once.

    Args:
        query: Search term passed through to `getQueryPageTree`.
        qualifier: Field the query is matched against; defaults to
            'lastname'.

    Returns:
        The tree returned by `getQueryPageTree` for the full-list request.

    Raises:
        NameError: If the first response contains a `p.bodyalt` element,
            which this scraper treats as the "no results" marker.
    """
    # Make first request (default page size) just to learn the result count
    firstQueryTree = getQueryPageTree(query, qualifier=qualifier)

    # A `p.bodyalt` element signals that the query matched no records
    if selector('p.bodyalt')(firstQueryTree):
        message = "No results found for %s '%s'" % (qualifier, query)
        # Single-argument print() emits the same text on Python 2 and 3
        print(message)
        # Carry the message on the exception so callers can report it
        raise NameError(message)

    # The count is the last whitespace-separated token of `p.result_count`
    resultCountElement = selector('p.result_count')(firstQueryTree)[0]
    resultCount = int(resultCountElement.text.split()[-1])

    # Get full list response
    return getQueryPageTree(query, perpage=resultCount, qualifier=qualifier)

Example #2

0

Show file

File: ubdir_scraper.py Project: jackma93/Commencement-System-Firmware

def getFullQueryResults(query, qualifier='lastname'):
    """
    Return the DOM tree of a UB Directory query with every result on one page.

    The directory (http://www.buffalo.edu/directory/) normally returns its
    results in pages of 10 entries each, which is inconvenient to scrape.
    This function therefore makes an initial request, reads the total result
    count from it, then makes a second request with `perpage` set to that
    count so the full list comes back at once.

    Args:
        query: Search term passed through to `getQueryPageTree`.
        qualifier: Field the query is matched against; defaults to
            'lastname'.

    Returns:
        The tree returned by `getQueryPageTree` for the full-list request.

    Raises:
        NameError: If the first response contains a `p.bodyalt` element,
            which this scraper treats as the "no results" marker.
    """
    # Make first request (default page size) just to learn the result count
    firstQueryTree = getQueryPageTree(query, qualifier=qualifier)

    # A `p.bodyalt` element signals that the query matched no records
    if selector('p.bodyalt')(firstQueryTree):
        message = "No results found for %s '%s'" % (qualifier, query)
        # Single-argument print() emits the same text on Python 2 and 3
        print(message)
        # Carry the message on the exception so callers can report it
        raise NameError(message)

    # The count is the last whitespace-separated token of `p.result_count`
    resultCountElement = selector('p.result_count')(firstQueryTree)[0]
    resultCount = int(resultCountElement.text.split()[-1])

    # Get full list response
    return getQueryPageTree(query, perpage=resultCount, qualifier=qualifier)

Example #3

0

Show file

File: ubdir_scraper.py Project: jackma93/Commencement-System-Firmware

def getUBIT(targetName, targetMajor):
    """
    Return a dict of students' info, keyed by UBIT name, matching a name and
        major.

    This function queries the UB Directory by the target's last name (the
        final whitespace-separated token of `targetName`), parses the DOM
        tree, and loops over the entries, keeping those whose department
        equals `targetMajor` and whose displayed name contains the last name
        (both comparisons are case-insensitive).
    Not sure how reliable the returned data is, based on name matching. Some of
        the names provided by the input file may not completely match the names
        from the entries on the UB Directory, due to personal updates
        introducing nicknames, spelling corrections, or other unpredictable
        variations in name.
    It also returns all likely matches, instead of one sure match every time,
        because shared surnames might exist in the same major.

    Args:
        targetName: Full name of the student; must contain at least one
            non-whitespace token.
        targetMajor: Department name the candidate's 'Department' field must
            equal (ignoring case).

    Returns:
        Dict mapping each matching UBIT name to that candidate's info. Each
        candidate holds whatever `dl2dict` produced for the entry plus the
        'Name', 'UBIT' and 'Email' keys set here.

    Raises:
        ValueError: If `targetName` is empty or only whitespace.
        KeyError: If no entry passes the department and last-name filters.
    """
    # Handle target information
    nameTokens = targetName.split()
    if not nameTokens:
        # Fail with a clear message instead of an IndexError on the lookup
        raise ValueError("targetName must contain at least one name token")
    # Lowercase once; these are compared against every entry below
    lastName = nameTokens[-1]
    lastNameLower = lastName.lower()
    majorLower = targetMajor.lower()

    # Make list of `li` entries from UB directory query
    queryResultsTree = getFullQueryResults(lastName)
    entryList = selector('ul.content_list')(queryResultsTree)[0]
    entryListElements = entryList.findall('li')

    # Loop over `li`s and check each entry for match with target info
    candidates = {}
    for liElement in entryListElements:
        # Get the 'item_info' `dl` from the entry and turn it into a dict for
        #   easy access
        descriptionListElement = selector('dl.item_info')(liElement)[0]
        candidate = dl2dict(descriptionListElement)

        # The entry's heading link carries both the display name (its text)
        #   and the UBIT name (last path segment of its href)
        nameLink = liElement.find('h3').find('a')
        candidateName = nameLink.text.strip()
        candidate['Name'] = candidateName
        # Strip any query string from the last path segment to get the UBIT
        queryPart = nameLink.get('href').split('/')[-1]
        candidateUBIT = queryPart.split('&')[0].split('?')[0]
        candidate['UBIT'] = candidateUBIT
        # Update their email
        candidate['Email'] = candidateUBIT + '@buffalo.edu'

        # Ignore candidates not belonging to the right department. An entry
        #   with no 'Department' cannot match; skip it rather than leak a
        #   KeyError that callers would mistake for "no viable candidates".
        try:
            department = candidate['Department']
        except KeyError:
            continue
        if department.lower() != majorLower:
            continue
        # ...and those whose last name does not exist in their entry name
        if lastNameLower not in candidateName.lower():
            continue
        # NOTE: first names are deliberately not matched; directory names may
        #   use nicknames or corrected spellings (see docstring).

        # Candidate made it through tests, add them
        candidates[candidateUBIT] = candidate

    if not candidates:
        raise KeyError("No viable candidates found for %s" % targetName)
    return candidates

Example #4

0

Show file

File: ubdir_scraper.py Project: jackma93/Commencement-System-Firmware

def getUBIT(targetName, targetMajor):
    """
    Return a dict of students' info, keyed by UBIT name, matching a name and
        major.

    This function queries the UB Directory by the target's last name (the
        final whitespace-separated token of `targetName`), parses the DOM
        tree, and loops over the entries, keeping those whose department
        equals `targetMajor` and whose displayed name contains the last name
        (both comparisons are case-insensitive).
    Not sure how reliable the returned data is, based on name matching. Some of
        the names provided by the input file may not completely match the names
        from the entries on the UB Directory, due to personal updates
        introducing nicknames, spelling corrections, or other unpredictable
        variations in name.
    It also returns all likely matches, instead of one sure match every time,
        because shared surnames might exist in the same major.

    Args:
        targetName: Full name of the student; must contain at least one
            non-whitespace token.
        targetMajor: Department name the candidate's 'Department' field must
            equal (ignoring case).

    Returns:
        Dict mapping each matching UBIT name to that candidate's info. Each
        candidate holds whatever `dl2dict` produced for the entry plus the
        'Name', 'UBIT' and 'Email' keys set here.

    Raises:
        ValueError: If `targetName` is empty or only whitespace.
        KeyError: If no entry passes the department and last-name filters.
    """
    # Handle target information
    nameTokens = targetName.split()
    if not nameTokens:
        # Fail with a clear message instead of an IndexError on the lookup
        raise ValueError("targetName must contain at least one name token")
    # Lowercase once; these are compared against every entry below
    lastName = nameTokens[-1]
    lastNameLower = lastName.lower()
    majorLower = targetMajor.lower()

    # Make list of `li` entries from UB directory query
    queryResultsTree = getFullQueryResults(lastName)
    entryList = selector('ul.content_list')(queryResultsTree)[0]
    entryListElements = entryList.findall('li')

    # Loop over `li`s and check each entry for match with target info
    candidates = {}
    for liElement in entryListElements:
        # Get the 'item_info' `dl` from the entry and turn it into a dict for
        #   easy access
        descriptionListElement = selector('dl.item_info')(liElement)[0]
        candidate = dl2dict(descriptionListElement)

        # The entry's heading link carries both the display name (its text)
        #   and the UBIT name (last path segment of its href)
        nameLink = liElement.find('h3').find('a')
        candidateName = nameLink.text.strip()
        candidate['Name'] = candidateName
        # Strip any query string from the last path segment to get the UBIT
        queryPart = nameLink.get('href').split('/')[-1]
        candidateUBIT = queryPart.split('&')[0].split('?')[0]
        candidate['UBIT'] = candidateUBIT
        # Update their email
        candidate['Email'] = candidateUBIT + '@buffalo.edu'

        # Ignore candidates not belonging to the right department. An entry
        #   with no 'Department' cannot match; skip it rather than leak a
        #   KeyError that callers would mistake for "no viable candidates".
        try:
            department = candidate['Department']
        except KeyError:
            continue
        if department.lower() != majorLower:
            continue
        # ...and those whose last name does not exist in their entry name
        if lastNameLower not in candidateName.lower():
            continue
        # NOTE: first names are deliberately not matched; directory names may
        #   use nicknames or corrected spellings (see docstring).

        # Candidate made it through tests, add them
        candidates[candidateUBIT] = candidate

    if not candidates:
        raise KeyError("No viable candidates found for %s" % targetName)
    return candidates