from pyquery import PyQuery as pq

import scraper
import constants

# Maps each heading on the North Dakota staff-directory page to the sport
# id(s) in our database.  A list value means one heading covers several of
# our sports (e.g. a combined men's/women's program).
sports = {
    "Baseball": constants.BASEBALL,
    "Basketball - Men's": constants.MENS_BASKETBALL,
    "Basketball - Women's": constants.WOMENS_BASKETBALL,
    "Cheer Team": constants.CHEERLEADING,
    "Football": constants.FOOTBALL,
    "Golf": [constants.MENS_GOLF, constants.WOMENS_GOLF],
    "Hockey - Men's": constants.MENS_ICE_HOCKEY,
    "Hockey - Women's": constants.WOMENS_ICE_HOCKEY,
    "Soccer": constants.WOMENS_SOCCER,
    "Softball": constants.SOFTBALL,
    "Swimming and Diving": [constants.MENS_SWIMMING_DIVING,
                            constants.WOMENS_SWIMMING_DIVING],
    "Tennis": [constants.MENS_TENNIS, constants.WOMENS_TENNIS],
    "Track and Field / Cross Country": [constants.MENS_CROSS_COUNTRY,
                                        constants.WOMENS_CROSS_COUNTRY,
                                        constants.MENS_TRACK_FIELD,
                                        constants.WOMENS_TRACK_FIELD],
    "Volleyball": constants.WOMENS_VOLLEYBALL
}

print("Scraping North Dakota")
cxn = scraper.get_connection()
# college is a row whose fields we index: [0] = college id, [1] = staff URL.
college = scraper.get_college(cxn, "North Dakota")
d = pq(url=college[1])

for key, sport in sports.items():
    print(sport)
    # Each sport's staff listing starts in the row following the <h3>
    # heading's enclosing row (two levels up from the <h3> itself).
    header = d('h3:contains("' + key + '")')
    info_row = header.parent().parent().next()
    # Walk successive rows until the second cell is blank (end of listing).
    # FIX: guard on len() first so a structurally different trailing row
    # (fewer than 2 children) stops the loop instead of raising IndexError,
    # matching the guard style used by the Denver scraper.
    while (len(info_row.children()) > 1 and
           scraper.strip_string(info_row.children()[1].text)):
        scraper.parse_row(cxn, college[0], college[1], sport,
                          info_row.children(),
                          ["name", "title", "phone", "email"],
                          {'phone_prefix': '(701) 77'})
        info_row = info_row.next()

scraper.close_connection(cxn)
from pyquery import PyQuery as pq

import scraper
import constants

# Maps each heading on the Denver staff-directory page to the sport id(s)
# in our database.  A list value means one heading covers several sports.
sports = {
    "Men's Basketball": constants.MENS_BASKETBALL,
    "Women's Basketball": constants.WOMENS_BASKETBALL,
    "Gymnastics": constants.WOMENS_GYMNASTICS,
    "Men's Golf": constants.MENS_GOLF,
    "Women's Golf": constants.WOMENS_GOLF,
    "Men's Ice Hockey": constants.MENS_ICE_HOCKEY,
    "Men's Lacrosse": constants.MENS_LACROSSE,
    "Women's Lacrosse": constants.WOMENS_LACROSSE,
    "Skiing": [constants.MENS_SKIING, constants.WOMENS_SKIING],
    "Men's Soccer": constants.MENS_SOCCER,
    "Women's Soccer": constants.WOMENS_SOCCER,
    "Swimming and Diving": [constants.MENS_SWIMMING_DIVING,
                            constants.WOMENS_SWIMMING_DIVING],
    "Men's Tennis": constants.MENS_TENNIS,
    "Women's Tennis": constants.WOMENS_TENNIS,
    "Women's Volleyball": constants.WOMENS_VOLLEYBALL
}

print("Scraping Denver")
cxn = scraper.get_connection()
# college is a row whose fields we index: [0] = college id, [1] = staff URL.
college = scraper.get_college(cxn, "Denver")
d = pq(url=college[1])

for key, sport in sports.items():
    print(sport)
    # Find the <font> element whose stripped text exactly equals the heading
    # and which carries a "face" attribute (distinguishes the heading from
    # other <font> tags containing the same text).
    # FIX: use pyquery's explicit two-argument callback form
    # (lambda i, this: ...) instead of a one-argument lambda that relies on
    # pyquery injecting `this` into the lambda's globals — consistent with
    # the Michigan scraper and not dependent on that injection hack.
    header = (d('font:contains("' + key + '")')
              .filter(lambda i, this: scraper.strip_string(this.text) == key)
              .filter(lambda i, this: this.get("face")))
    # Staff rows start in the row following the heading's enclosing row
    # (three levels up from the <font> element).
    info_row = header.parent().parent().parent().next()
    # Rows with at most one child mark the end of this sport's listing.
    while len(info_row.children()) > 1:
        # First column holds both name and email for the same cell.
        scraper.parse_row(cxn, college[0], college[1], sport,
                          info_row.children(),
                          [["name", "email"], "title", "phone"])
        info_row = info_row.next()

scraper.close_connection(cxn)
# NOTE(review): truncated fragment of the Lake Superior State scraper.  The
# imports and the opening of the `sports` dict are cut off before this chunk,
# and the chunk ends mid-statement at `for sp in sport:` with no loop body —
# presumably the missing tail saves the parsed coach once per sport id, like
# the other scrapers; confirm against the full file.  Left byte-identical
# because a safe rewrite is not possible from this partial view.
"Track & Field, Men's and Women's" : [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD], "Volleyball," : constants.WOMENS_VOLLEYBALL } print ("Scraping Lake Superior State") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Lake Superior State University") d = pq(url=college[1]) lines = d("p") for line in lines: for key, sport in sports.items(): if line.text and line.text.startswith(key): rest_of_line = line.text_content()[len(key):] fields = rest_of_line.split(",") name_and_title = fields[0].split("coach") title = scraper.strip_string(name_and_title[0]).capitalize() + " Coach" name = scraper.strip_string(name_and_title[1]) phone_index = 1 if not name: name = scraper.strip_string(fields[1]) phone_index = 2 phone = scraper.strip_string(fields[phone_index]) if not phone[:1].isdigit(): email = phone phone = scraper.strip_string(fields[phone_index + 1]) else: email = scraper.strip_string(fields[phone_index + 1]) if isinstance(sport, list): for sp in sport:
# NOTE(review): this chunk starts mid-file — the imports (re, etree, scraper,
# pyquery) and the `sports` dict (mapping sport name -> (link text, href)
# pairs, judging by how `keys` is indexed below) are above this chunk.
cxn = scraper.get_connection()
# college is a row whose fields we index: [0] = college id, [1] = staff URL.
college = scraper.get_college(cxn, "Michigan")
d = pq(url=college[1])

# Matches '<a href="PROFILE_URL">NAME</a> (SPORT)' in a serialized table cell.
# FIX: raw string — the original non-raw pattern contained the invalid string
# escapes '\/' and '\(' and only worked because Python passes unknown escapes
# through verbatim (a SyntaxWarning on modern Pythons).  The compiled regex
# is unchanged ('\/' and '/' match the same character).
coach_name_pattern = re.compile(r'<a href="(.*)">(.*)</a> \((.*)\)')

for sport, keys in sports.items():
    print(sport)
    # keys[0] is the anchor's link text, keys[1] its exact href; climb from
    # the matched <a> to its enclosing table row.
    info_row = (d('a:contains("' + keys[0] + '")')
                .filter(lambda i, this: this.get("href") == keys[1])
                .parent().parent())
    print(info_row)
    # Some rows contain 2 coaches: the name/phone/email cells each hold
    # several values separated by <br />, so split the serialized HTML of
    # each cell into parallel per-coach lists.
    if len(info_row.children()[1].getchildren()) > 1:
        coach_elements = info_row.children()[1]
        coaches_names = str(etree.tostring(coach_elements),
                            encoding='utf8').split("<br />")
        phone_elements = info_row.children()[2]
        phone_numbers = (str(etree.tostring(phone_elements), encoding='utf8')
                         .replace("<td>", "").replace("</td>", "")
                         .split("<br />"))
        email_elements = info_row.children()[3]
        emails = (str(etree.tostring(email_elements), encoding='utf8')
                  .replace("<td>", "").replace("</td>", "")
                  .split("<br />"))
        for i, coach in enumerate(coaches_names):
            m = coach_name_pattern.search(coach)
            profile_url = scraper.strip_string(m.group(1))
            name = scraper.strip_string(m.group(2))
            title = scraper.strip_string(m.group(3) + " Head Coach")
            phone = scraper.strip_string(phone_numbers[i])
            # Directory lists only the local part; append the campus domain.
            email = scraper.strip_string(emails[i]) + "@umich.edu"
            scraper.save_coach(cxn, college[0],
                               scraper.get_sport_id(cxn, sport),
                               name, title, phone, email, profile_url)
    else:
        # Single-coach row: first cell is skipped (None), remaining cells map
        # positionally onto name/phone/email.
        scraper.parse_row(cxn, college[0], college[1], sport,
                          info_row.children(),
                          [None, "name", "phone", "email"],
                          {'email_suffix': "@umich.edu",
                           'title': 'Head Coach'})

scraper.close_connection(cxn)