"Ice Hockey, Men's" : constants.MENS_ICE_HOCKEY, "Ice Hockey, Women's" : constants.WOMENS_ICE_HOCKEY, "Lacrosse, Men's" : constants.MENS_LACROSSE, "Rifle, Men's/Women's" : [constants.MENS_RIFLE, constants.WOMENS_RIFLE], "Rowing, Women's" : constants.WOMENS_ROWING, "Soccer, Men's" : constants.MENS_SOCCER, "Soccer, Women's" : constants.WOMENS_SOCCER, "Softball, Women's" : constants.SOFTBALL, "Swimming, Men's" : constants.MENS_SWIMMING_DIVING, "Swimming, Women's" : constants.WOMENS_SWIMMING_DIVING, "Tennis, Men's" : constants.MENS_TENNIS, "Tennis, Women's" : constants.WOMENS_TENNIS, "Track & Field/Cross Country, Men's" : [constants.MENS_CROSS_COUNTRY, constants.MENS_TRACK_FIELD], "Track & Field/Cross Country, Women's" : [constants.WOMENS_CROSS_COUNTRY, constants.WOMENS_TRACK_FIELD], "Volleyball, Women's" : constants.WOMENS_VOLLEYBALL, "Wrestling" : constants.WRESTLING } print ("Scraping Ohio State") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Ohio State University") d = pq(url=college[1]) for key, sport in sports.items(): print(sport) header = d('b:contains("' + key + '")') table = header.parent().parent().parent() first = True for row in table("tr"): if not first: scraper.parse_row(cxn, college[0], college[1], sport, row.getchildren(), ["name", "title", "phone", "email"], {'phone_prefix' : '(614) '}) first = False scraper.close_connection(cxn)
import scraper
import constants

# Scrapes the University of Massachusetts coaching directory.
#
# `sports` maps the link text that introduces each sport's section on the
# page to the sport constant handed to scraper.parse_row.
sports = {
    "BASEBALL": constants.BASEBALL,
    "MEN'S BASKETBALL": constants.MENS_BASKETBALL,
    "WOMEN'S BASKETBALL": constants.WOMENS_BASKETBALL,
    "WOMEN'S ROWING": constants.WOMENS_ROWING,
    "MEN'S CROSS COUNTRY": constants.MENS_CROSS_COUNTRY,
    "WOMEN'S CROSS COUNTRY": constants.WOMENS_CROSS_COUNTRY,
    "FIELD HOCKEY": constants.FIELD_HOCKEY,
    "FOOTBALL": constants.FOOTBALL,
    "ICE HOCKEY": constants.MENS_ICE_HOCKEY,
    "MEN'S LACROSSE": constants.MENS_LACROSSE,
    "WOMEN'S LACROSSE": constants.WOMENS_LACROSSE,
    "MEN'S SOCCER": constants.MENS_SOCCER,
    "WOMEN'S SOCCER": constants.WOMENS_SOCCER,
    "SOFTBALL": constants.SOFTBALL,
    "MEN'S SWIMMING & DIVING": constants.MENS_SWIMMING_DIVING,
    "WOMEN'S SWIMMING & DIVING": constants.WOMENS_SWIMMING_DIVING,
    "WOMEN'S TENNIS": constants.WOMENS_TENNIS,
    "MEN'S TRACK & FIELD": constants.MENS_TRACK_FIELD,
    "WOMEN'S TRACK & FIELD": constants.WOMENS_TRACK_FIELD,
}

print("Scraping Umass")
cxn = scraper.get_connection()
try:
    college = scraper.get_college(cxn, "Massachusetts")
    # college[0] and college[1] are forwarded to parse_row; college[1] is
    # also the URL that gets fetched.
    d = pq(url=college[1])

    for key, sport in sports.items():
        print(sport)

        # ':contains' is a substring match, so "MEN'S ..." also matches
        # "WOMEN'S ..."; keep only links whose own text starts with the key.
        # An <a> whose first child is an element has text None, hence the
        # `or ""` guard.
        header = d('a:contains("' + key + '")').filter(
            lambda i, this: (this.text or "").startswith(key)
        )

        # Advance to the first following sibling that carries a bgcolor
        # attribute.  Stop when the siblings run out: an empty selection
        # stays empty under .next(), which would otherwise loop forever.
        info_row = header.parent().next()
        while info_row and not info_row.attr("bgcolor"):
            info_row = info_row.next()

        # Consume consecutive rows that have between 2 and 6 cells.
        # NOTE(review): the field list presumably mirrors the column order
        # of the page, with None for ignored columns -- confirm against
        # scraper.parse_row.
        while 1 < len(info_row.children()) < 7:
            scraper.parse_row(
                cxn,
                college[0],
                college[1],
                sport,
                info_row.children(),
                ["name", "phone", None, None, "title", "email"],
                {'phone_prefix': "(413) "},
            )
            info_row = info_row.next()
finally:
    # Always release the connection, even when the page layout breaks the
    # scrape part-way through.
    # NOTE(review): if close_connection commits, rows saved before a failure
    # are now persisted -- confirm that is acceptable.
    scraper.close_connection(cxn)
"Women's Crew" : constants.WOMENS_ROWING, "Men's Cross Country" : constants.MENS_CROSS_COUNTRY, "Women's Cross Country" : constants.WOMENS_CROSS_COUNTRY, "Fencing" : [constants.MENS_FENCING, constants.WOMENS_FENCING], "Field Hockey" : constants.FIELD_HOCKEY, "Football" : constants.FOOTBALL, "Men's Golf" : constants.MENS_GOLF, "Women's Golf" : constants.WOMENS_GOLF, "Gymnastics" : constants.WOMENS_GYMNASTICS, "Men's Ice Hockey" : constants.MENS_ICE_HOCKEY, "Women's Ice Hockey" : constants.WOMENS_ICE_HOCKEY, "Men's Lacrosse" : constants.MENS_LACROSSE, "Women's Lacrosse" : constants.WOMENS_LACROSSE, "Skiing" : constants.WOMENS_SKIING, "Men's Soccer" : constants.MENS_SOCCER, "Women's Soccer" : constants.WOMENS_SOCCER, "Softball" : constants.SOFTBALL, "Men's Swimming & Diving" : constants.MENS_SWIMMING_DIVING, "Women's Swimming & Diving" : constants.WOMENS_SWIMMING_DIVING, "Men's Tennis" : constants.MENS_TENNIS, "Women's Tennis" : constants.WOMENS_TENNIS, "Men's Track & Field" : constants.MENS_TRACK_FIELD, "Women's Track & Field" : constants.WOMENS_TRACK_FIELD, "Volleyball" : constants.WOMENS_VOLLEYBALL, "Men's Water Polo" : constants.MENS_WATER_POLO, "Women's Water Polo" : constants.WOMENS_WATER_POLO, "Wrestling" : constants.WRESTLING} print ("Scraping Brown") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Brown University") d = pq(url=college[1]) for key, sport in sports.items(): header = d('h2:contains("' + key + '")') table = header.next() coaches = table("tr[class^='roster-row']") for coach in coaches: scraper.parse_row(cxn, college[0], college[1], sport, coach, ["name", "title", "phone", "email"]) scraper.close_connection(cxn)
"footbl" : constants.FOOTBALL, "hockey" : constants.MENS_ICE_HOCKEY, "m-bball" : constants.MENS_BASKETBALL, "m-golf" : constants.MENS_GOLF, "m-soccer" : constants.MENS_SOCCER, "c-swim" : [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING], "m-tennis" : constants.MENS_TENNIS, "softbl" : constants.SOFTBALL, "spirit" : constants.CHEERLEADING, "mtrack" : [constants.MENS_TRACK_FIELD, constants.MENS_CROSS_COUNTRY, constants.WOMENS_TRACK_FIELD, constants.WOMENS_CROSS_COUNTRY], "w-bball" : constants.WOMENS_BASKETBALL, "w-golf" : constants.WOMENS_GOLF, "w-gym" : constants.WOMENS_GYMNASTICS, "rowing" : constants.WOMENS_ROWING, "w-soccer" : constants.WOMENS_SOCCER, "w-tennis" : constants.WOMENS_TENNIS, "volley" : constants.WOMENS_VOLLEYBALL, "wrestle" : constants.WRESTLING } print ("Scraping Michigan State") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Michigan State University") d = pq(url=college[1]) for key, sport in sports.items(): print (sport) info_row = d('a[name="' + key + '"]').parent().parent().next().next().next() # Consideration for bad page formatting if len(info_row.children()) < 4: info_row = d('a[name="' + key + '"]').parent().next().next().next() while len(info_row.children()) > 1: scraper.parse_row(cxn, college[0], college[1], sport, info_row.children(), ["title", "name", "phone", "email"], {'phone_prefix' : '(517) '}) info_row = info_row.next() scraper.close_connection(cxn)
import scraper
import constants

# Scrapes the Boston College coaching directory.
#
# `sports` maps the `name` attribute of each sport's anchor on the page to
# the sport constant (or list of constants) handed to scraper.parse_row.
sports = {
    "m-basebl": constants.BASEBALL,
    "m-baskbl": constants.MENS_BASKETBALL,
    "w-baskbl": constants.WOMENS_BASKETBALL,
    "cheer": constants.CHEERLEADING,
    "xc": constants.MENS_CROSS_COUNTRY,
    "w-xc": constants.WOMENS_CROSS_COUNTRY,
    "fence": constants.MENS_FENCING,
    "f-hockey": constants.FIELD_HOCKEY,
    "m-footbl": constants.FOOTBALL,
    "m-golf": constants.MENS_GOLF,
    "w-golf": constants.WOMENS_GOLF,
    "m-ihockey": constants.MENS_ICE_HOCKEY,
    "w-ihockey": constants.WOMENS_ICE_HOCKEY,
    "w-lax": constants.WOMENS_LACROSSE,
    "w-row": constants.WOMENS_ROWING,
    "sail": constants.SAILING,
    "skiing": [constants.MENS_SKIING, constants.WOMENS_SKIING],
    "m-soccer": constants.MENS_SOCCER,
    "w-soccer": constants.WOMENS_SOCCER,
    "softbl": constants.SOFTBALL,
    "swim": [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "m-tennis": constants.MENS_TENNIS,
    "w-tennis": constants.WOMENS_TENNIS,
    "track": constants.MENS_TRACK_FIELD,
    "w-tf": constants.WOMENS_TRACK_FIELD,
    "w-volley": constants.WOMENS_VOLLEYBALL,
}

print("Scraping BC")
cxn = scraper.get_connection()
try:
    college = scraper.get_college(cxn, "Boston College")
    # college[0] and college[1] are forwarded to parse_row; college[1] is
    # also the URL that gets fetched.
    d = pq(url=college[1])

    for key, sport in sports.items():
        print(sport)

        # The staff rows start at the sibling following the anchor's
        # grandparent element.
        anchor = d('a[name="' + key + '"]')
        info_row = anchor.parent().parent().next()

        # Keep consuming siblings until one has at most one child; a missing
        # anchor yields an empty selection and the loop is skipped.
        while len(info_row.children()) > 1:
            scraper.parse_row(
                cxn,
                college[0],
                college[1],
                sport,
                info_row.children(),
                ["name", "title", "phone", "email"],
            )
            info_row = info_row.next()
finally:
    # Always release the connection, even if fetching or parsing raises.
    # NOTE(review): if close_connection commits, rows saved before a failure
    # are now persisted -- confirm that is acceptable.
    scraper.close_connection(cxn)
"Cheerleading" : constants.CHEERLEADING, "Cross Country" : [constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY], "Fencing" : [constants.MENS_FENCING, constants.WOMENS_FENCING], "Football" : constants.FOOTBALL, "Golf" : constants.MENS_GOLF, "Gymnastics (M)" : constants.MENS_GYMNASTICS, "Gymnastics (W)" : constants.WOMENS_GYMNASTICS, "Ice Hockey" : constants.MENS_ICE_HOCKEY, "Lacrosse" : constants.MENS_LACROSSE, "Rifle" : constants.MENS_RIFLE, "Soccer (M)" : constants.MENS_SOCCER, "Soccer (W)" : constants.WOMENS_SOCCER, "Swimming & Diving (M)" : constants.MENS_SWIMMING_DIVING, "Swimming & Diving (W)" : constants.WOMENS_SWIMMING_DIVING, "Tennis (M)" : constants.MENS_TENNIS, "Tennis (W)" : constants.WOMENS_TENNIS, "Track & Field" : [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD], "Volleyball" : constants.WOMENS_VOLLEYBALL, "Water Polo" : constants.MENS_WATER_POLO, "Wresting" : constants.WRESTLING } print ("Scraping Air Force") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Air Force Academy") d = pq(url=college[1]) for key, sport in sports.items(): print (sport); strong = d("strong") header = strong('a:contains("' + key + '")') table = header.parent().parent().parent().parent().parent().parent().next() coaches = table("tr") for coach in coaches: scraper.parse_row(cxn, college[0], college[1], sport, coach, [["name", "email"], "title", "phone"], {'phone_prefix' : "(719) "}) scraper.close_connection(cxn)
from pyquery import PyQuery as pq

import scraper
import constants

# Scrapes the Canisius College coaching directory.
#
# `sports` maps the heading text of each sport's section on the page to the
# sport constant (or list of constants) handed to scraper.parse_row.
sports = {
    "Baseball": constants.BASEBALL,
    "Men's Basketball": constants.MENS_BASKETBALL,
    "Women's Basketball": constants.WOMENS_BASKETBALL,
    "Cross Country": [constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY],
    "Golf": constants.MENS_GOLF,
    "Hockey": constants.MENS_ICE_HOCKEY,
    "Men's Lacrosse": constants.MENS_LACROSSE,
    "Women's Lacrosse": constants.WOMENS_LACROSSE,
    "Rowing": constants.WOMENS_ROWING,
    "Men's Soccer": constants.MENS_SOCCER,
    "Women's Soccer": constants.WOMENS_SOCCER,
    "Softball": constants.SOFTBALL,
    "Swimming and Diving": [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "Volleyball": constants.WOMENS_VOLLEYBALL,
}

print("Scraping Canisius")
cxn = scraper.get_connection()
try:
    college = scraper.get_college(cxn, "Canisius College")
    # college[0] and college[1] are forwarded to parse_row; college[1] is
    # also the URL that gets fetched.
    d = pq(url=college[1])

    for key, sport in sports.items():
        # The staff table is the sibling following the heading's
        # great-great-grandparent element.
        header = d('font:contains("' + key + '")')
        table = header.parent().parent().parent().parent().next()

        for index, row in enumerate(table("tr")):
            # The first <tr> of each table is never parsed.  Skip it by
            # position rather than by a flag that is reassigned from
            # parse_row's return value, which would drop alternate rows
            # whenever that value is truthy.
            if index == 0:
                continue
            scraper.parse_row(
                cxn,
                college[0],
                college[1],
                sport,
                row.getchildren(),
                ["name", "title", "phone", "email"],
            )
finally:
    # Always release the connection, even if fetching or parsing raises.
    # NOTE(review): if close_connection commits, rows saved before a failure
    # are now persisted -- confirm that is acceptable.
    scraper.close_connection(cxn)
from pyquery import PyQuery as pq

import scraper
import constants

# Scrapes the Wisconsin coaching directory.  Rows with class "even" are
# processed first, then rows with class "odd"; a row is parsed only when the
# text of its second <td> is a key of scraper.sports.

print("Scraping Wisconsin")
cxn = scraper.get_connection()
try:
    college = scraper.get_college(cxn, "Wisconsin")
    # college[0] and college[1] are forwarded to parse_row; college[1] is
    # also the URL that gets fetched.
    d = pq(url=college[1])

    for class_name in ["even", "odd"]:
        for row in d.items("tr." + class_name):
            for idx, element in enumerate(row.items("td")):
                if idx != 1:
                    continue
                # Only the second cell decides whether the row is a coach
                # entry; read its text once and stop scanning afterwards.
                sport_name = element.text()
                if sport_name in scraper.sports:
                    scraper.parse_row(
                        cxn,
                        college[0],
                        college[1],
                        scraper.sports[sport_name],
                        row.children(),
                        [["name", "email"], "", "title", "phone"],
                    )
                break
finally:
    # Always release the connection, even if fetching or parsing raises.
    # NOTE(review): if close_connection commits, rows saved before a failure
    # are now persisted -- confirm that is acceptable.
    scraper.close_connection(cxn)
def _cell_fragments(cell, strip_td=True):
    """Serialize a table cell and split its markup on "<br />".

    With ``strip_td`` the literal "<td>" / "</td>" tags are removed before
    splitting.  Serializing with ``encoding="unicode"`` yields a str directly
    and keeps non-ASCII characters as-is instead of numeric character
    references.
    """
    markup = etree.tostring(cell, encoding="unicode")
    if strip_td:
        markup = markup.replace("<td>", "").replace("</td>", "")
    return markup.split("<br />")


cxn = scraper.get_connection()
try:
    college = scraper.get_college(cxn, "Michigan")
    # college[0] and college[1] are forwarded to parse_row; college[1] is
    # also the URL that gets fetched.
    d = pq(url=college[1])

    # Groups: (1) link href, (2) link text, (3) parenthesised text that
    # follows the link.  Raw string so the backslashes reach the regex
    # engine without invalid-escape warnings.
    coach_name_pattern = re.compile(r'<a href="(.*)">(.*)</a> \((.*)\)')

    for sport, keys in sports.items():
        print(sport)

        # keys[0] is text the link must contain, keys[1] the exact href it
        # must have; the row is the link's grandparent element.
        info_row = (
            d('a:contains("' + keys[0] + '")')
            .filter(lambda i, this: this.get("href") == keys[1])
            .parent()
            .parent()
        )
        print(info_row)
        cells = info_row.children()

        # Some rows contain 2 coaches
        if len(cells[1].getchildren()) > 1:
            # Names, phones and emails are parallel "<br />"-separated lists
            # in the second, third and fourth cells.
            coaches_names = _cell_fragments(cells[1], strip_td=False)
            phone_numbers = _cell_fragments(cells[2])
            emails = _cell_fragments(cells[3])

            for i, coach in enumerate(coaches_names):
                m = coach_name_pattern.search(coach)
                if m is None:
                    # e.g. an empty fragment after a trailing "<br />"
                    print("Skipping unparseable coach entry: " + coach)
                    continue

                profile_url = scraper.strip_string(m.group(1))
                name = scraper.strip_string(m.group(2))
                title = scraper.strip_string(m.group(3) + " Head Coach")
                phone = scraper.strip_string(phone_numbers[i])
                email = scraper.strip_string(emails[i]) + "@umich.edu"
                scraper.save_coach(
                    cxn,
                    college[0],
                    scraper.get_sport_id(cxn, sport),
                    name,
                    title,
                    phone,
                    email,
                    profile_url,
                )
        else:
            scraper.parse_row(
                cxn,
                college[0],
                college[1],
                sport,
                cells,
                [None, "name", "phone", "email"],
                {'email_suffix': "@umich.edu", 'title': 'Head Coach'},
            )
finally:
    # Always release the connection, even if fetching or parsing raises.
    # NOTE(review): if close_connection commits, rows saved before a failure
    # are now persisted -- confirm that is acceptable.
    scraper.close_connection(cxn)
"Rifle" : [constants.MENS_RIFLE, constants.WOMENS_RIFLE], "Men's Soccer" : constants.MENS_SOCCER, "Women's Soccer" : constants.WOMENS_SOCCER, "Softball" : constants.SOFTBALL, "Swimming and Diving" : [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING], "Men's Tennis" : constants.MENS_TENNIS, "Women's Tennis" : constants.WOMENS_TENNIS, "Track & Field/Cross Country" : [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD, constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY], "Volleyball" : constants.WOMENS_VOLLEYBALL, "Wrestling" : constants.WRESTLING} print ("Scraping Army") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Army") d = pq(url=college[1]) for key, sport in sports.items(): print (sport); finder = d('strong:contains("' + key + '")') if not finder: finder = d('span:contains("' + key + '")').filter(lambda i, this: not 'Sprint' in this.text) while not finder.is_("tr"): finder = finder.parent() info_row = finder.next().next() while not info_row.is_("tr"): info_row = info_row.next() while info_row.children(): print(info_row) scraper.parse_row(cxn, college[0], college[1], sport, info_row.children(), ["name", "title", "phone", "email"], {'email_suffix' : '@usma.edu', 'phone_prefix' : '(845) 938-', 'truncate_name' : "- @"}) info_row = info_row.next() scraper.close_connection(cxn)