# Canisius College coaching-staff scraper.
#
# Loads the page whose URL is stored in college[1], finds the <font> heading of
# each sport, and hands every row of the table that follows it (except the
# first one) to scraper.parse_row.
from pyquery import PyQuery as pq
import scraper
import constants

# Heading text on the page -> sport constant(s). A list is used where one
# heading covers several sports.
sports = {
    "Baseball": constants.BASEBALL,
    "Men's Basketball": constants.MENS_BASKETBALL,
    "Women's Basketball": constants.WOMENS_BASKETBALL,
    "Cross Country": [constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY],
    "Golf": constants.MENS_GOLF,
    "Hockey": constants.MENS_ICE_HOCKEY,
    "Men's Lacrosse": constants.MENS_LACROSSE,
    "Women's Lacrosse": constants.WOMENS_LACROSSE,
    "Rowing": constants.WOMENS_ROWING,
    "Men's Soccer": constants.MENS_SOCCER,
    "Women's Soccer": constants.WOMENS_SOCCER,
    "Softball": constants.SOFTBALL,
    "Swimming and Diving": [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "Volleyball": constants.WOMENS_VOLLEYBALL,
}

print("Scraping Canisius")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Canisius College")
d = pq(url=college[1])

for key, sport in sports.items():
    header = d('font:contains("' + key + '")')
    # The staff table is the next sibling of the heading's great-great-grandparent.
    table = header.parent().parent().parent().parent().next()
    for index, row in enumerate(table("tr")):
        # Skip the first <tr> (presumably a column-header row). The previous
        # "first" flag was overwritten with parse_row's return value, so a
        # truthy return would have caused the following row to be skipped too.
        if index == 0:
            continue
        scraper.parse_row(cxn, college[0], college[1], sport, row.getchildren(),
                          ["name", "title", "phone", "email"])

scraper.close_connection(cxn)
"Women's Crew" : constants.WOMENS_ROWING, "Men's Cross Country" : constants.MENS_CROSS_COUNTRY, "Women's Cross Country" : constants.WOMENS_CROSS_COUNTRY, "Fencing" : [constants.MENS_FENCING, constants.WOMENS_FENCING], "Field Hockey" : constants.FIELD_HOCKEY, "Football" : constants.FOOTBALL, "Men's Golf" : constants.MENS_GOLF, "Women's Golf" : constants.WOMENS_GOLF, "Gymnastics" : constants.WOMENS_GYMNASTICS, "Men's Ice Hockey" : constants.MENS_ICE_HOCKEY, "Women's Ice Hockey" : constants.WOMENS_ICE_HOCKEY, "Men's Lacrosse" : constants.MENS_LACROSSE, "Women's Lacrosse" : constants.WOMENS_LACROSSE, "Skiing" : constants.WOMENS_SKIING, "Men's Soccer" : constants.MENS_SOCCER, "Women's Soccer" : constants.WOMENS_SOCCER, "Softball" : constants.SOFTBALL, "Men's Swimming & Diving" : constants.MENS_SWIMMING_DIVING, "Women's Swimming & Diving" : constants.WOMENS_SWIMMING_DIVING, "Men's Tennis" : constants.MENS_TENNIS, "Women's Tennis" : constants.WOMENS_TENNIS, "Men's Track & Field" : constants.MENS_TRACK_FIELD, "Women's Track & Field" : constants.WOMENS_TRACK_FIELD, "Volleyball" : constants.WOMENS_VOLLEYBALL, "Men's Water Polo" : constants.MENS_WATER_POLO, "Women's Water Polo" : constants.WOMENS_WATER_POLO, "Wrestling" : constants.WRESTLING} print ("Scraping Brown") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Brown University") d = pq(url=college[1]) for key, sport in sports.items(): header = d('h2:contains("' + key + '")') table = header.next() coaches = table("tr[class^='roster-row']") for coach in coaches: scraper.parse_row(cxn, college[0], college[1], sport, coach, ["name", "title", "phone", "email"]) scraper.close_connection(cxn)
"Men's & Women's Cross Country" : [constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY], "Field Hockey" : constants.FIELD_HOCKEY, "Football" : constants.FOOTBALL, "Men's Golf" : constants.MENS_GOLF, "Men's Hockey" : constants.MENS_ICE_HOCKEY, "Men's Lacrosse" : constants.MENS_LACROSSE, "Women's Lacrosse" : constants.WOMENS_LACROSSE, "Men's Soccer" : constants.MENS_SOCCER, "Women's Soccer" : constants.WOMENS_SOCCER, "Softball" : constants.SOFTBALL, "Men's & Women's Swimming & Diving" : [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING], "Men's & Women's Tennis" : [constants.MENS_TENNIS, constants.WOMENS_TENNIS], "Men's & Women's Track & Field" : [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD], "Women's Volleyball" : constants.WOMENS_VOLLEYBALL } print ("Scraping Bentley") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Bentley University") d = pq(url=college[1]) for key, sport in sports.items(): print (sport); header = d('h1:contains("' + key + '")') tables = header.next() coaches = tables("tr[class^='roster-row']") for coach in coaches: scraper.parse_row(cxn, college[0], college[1], sport, coach, ["name", "title", "phone", "email"]) scraper.close_connection(cxn)
"footbl" : constants.FOOTBALL, "hockey" : constants.MENS_ICE_HOCKEY, "m-bball" : constants.MENS_BASKETBALL, "m-golf" : constants.MENS_GOLF, "m-soccer" : constants.MENS_SOCCER, "c-swim" : [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING], "m-tennis" : constants.MENS_TENNIS, "softbl" : constants.SOFTBALL, "spirit" : constants.CHEERLEADING, "mtrack" : [constants.MENS_TRACK_FIELD, constants.MENS_CROSS_COUNTRY, constants.WOMENS_TRACK_FIELD, constants.WOMENS_CROSS_COUNTRY], "w-bball" : constants.WOMENS_BASKETBALL, "w-golf" : constants.WOMENS_GOLF, "w-gym" : constants.WOMENS_GYMNASTICS, "rowing" : constants.WOMENS_ROWING, "w-soccer" : constants.WOMENS_SOCCER, "w-tennis" : constants.WOMENS_TENNIS, "volley" : constants.WOMENS_VOLLEYBALL, "wrestle" : constants.WRESTLING } print ("Scraping Michigan State") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Michigan State University") d = pq(url=college[1]) for key, sport in sports.items(): print (sport) info_row = d('a[name="' + key + '"]').parent().parent().next().next().next() # Consideration for bad page formatting if len(info_row.children()) < 4: info_row = d('a[name="' + key + '"]').parent().next().next().next() while len(info_row.children()) > 1: scraper.parse_row(cxn, college[0], college[1], sport, info_row.children(), ["title", "name", "phone", "email"], {'phone_prefix' : '(517) '}) info_row = info_row.next() scraper.close_connection(cxn)
# UMass coaching-staff scraper.
#
# For each sport the matching <a> heading is located, the rows after it are
# skipped until one carrying a "bgcolor" attribute appears, and every following
# row with 2-6 cells is handed to scraper.parse_row.
import scraper
import constants

# Heading text (upper-case on the page) -> sport constant.
sports = {
    "BASEBALL": constants.BASEBALL,
    "MEN'S BASKETBALL": constants.MENS_BASKETBALL,
    "WOMEN'S BASKETBALL": constants.WOMENS_BASKETBALL,
    "WOMEN'S ROWING": constants.WOMENS_ROWING,
    "MEN'S CROSS COUNTRY": constants.MENS_CROSS_COUNTRY,
    "WOMEN'S CROSS COUNTRY": constants.WOMENS_CROSS_COUNTRY,
    "FIELD HOCKEY": constants.FIELD_HOCKEY,
    "FOOTBALL": constants.FOOTBALL,
    "ICE HOCKEY": constants.MENS_ICE_HOCKEY,
    "MEN'S LACROSSE": constants.MENS_LACROSSE,
    "WOMEN'S LACROSSE": constants.WOMENS_LACROSSE,
    "MEN'S SOCCER": constants.MENS_SOCCER,
    "WOMEN'S SOCCER": constants.WOMENS_SOCCER,
    "SOFTBALL": constants.SOFTBALL,
    "MEN'S SWIMMING & DIVING": constants.MENS_SWIMMING_DIVING,
    "WOMEN'S SWIMMING & DIVING": constants.WOMENS_SWIMMING_DIVING,
    "WOMEN'S TENNIS": constants.WOMENS_TENNIS,
    "MEN'S TRACK & FIELD": constants.MENS_TRACK_FIELD,
    "WOMEN'S TRACK & FIELD": constants.WOMENS_TRACK_FIELD,
}

print("Scraping Umass")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Massachusetts")
d = pq(url=college[1])

for key, sport in sports.items():
    print(sport)
    # ":contains" is a substring match, so "BASKETBALL" would also hit
    # "MEN'S BASKETBALL"; keep only anchors whose own text starts with the key.
    # An anchor with no direct text has .text == None, hence the "or".
    header = d('a:contains("' + key + '")').filter(
        lambda i, this: (this.text or "").startswith(key)
    )
    info_row = header.parent().next()
    # Advance to the first row that has a bgcolor attribute. The emptiness
    # check stops the loop when the heading was not found or the rows run out;
    # without it an empty selection would be advanced forever.
    while info_row and not info_row.attr("bgcolor"):
        info_row = info_row.next()
    # Staff rows have between 2 and 6 cells; anything else ends the section.
    while 1 < len(info_row.children()) < 7:
        scraper.parse_row(cxn, college[0], college[1], sport, info_row.children(),
                          ["name", "phone", None, None, "title", "email"],
                          {'phone_prefix': "(413) "})
        info_row = info_row.next()

scraper.close_connection(cxn)
# University of New Hampshire coaching-staff scraper.
#
# NOTE(review): unlike the sibling scripts, the sport *name* itself is passed
# to scraper.parse_row here rather than a value from ``constants`` -- confirm
# that parse_row accepts plain names.
from pyquery import PyQuery as pq
import scraper

sports = [
    "Cheerleading",
    "Football",
    "Field Hockey",
    "Gymnastics",
    "Men's Basketball",
    "Men's Ice Hockey",
    "Men's Skiing",
    "Men's Soccer",
    "Men's Track & Field",
    "Men's Cross Country",
    "Women's Basketball",
    "Women's Ice Hockey",
    "Women's Lacrosse",
    "Women's Skiing",
    "Women's Soccer",
    "Women's Swimming & Diving",
    "Women's Track & Field",
    "Women's Volleyball",
    "Women's Cross Country",
]

print("Scraping UNH")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "New Hampshire")
d = pq(url=college[1])

for sport in sports:
    print(sport)
    # The element following the sport's <h1> contains the roster rows.
    section = d('h1:contains("{}")'.format(sport)).next()
    for roster_row in section("tr[class^='roster-row']"):
        scraper.parse_row(
            cxn,
            college[0],
            college[1],
            sport,
            roster_row,
            ["name", "title", "phone", "email"],
        )

scraper.close_connection(cxn)
"Cheerleading" : constants.CHEERLEADING, "Cross Country" : [constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY], "Fencing" : [constants.MENS_FENCING, constants.WOMENS_FENCING], "Football" : constants.FOOTBALL, "Golf" : constants.MENS_GOLF, "Gymnastics (M)" : constants.MENS_GYMNASTICS, "Gymnastics (W)" : constants.WOMENS_GYMNASTICS, "Ice Hockey" : constants.MENS_ICE_HOCKEY, "Lacrosse" : constants.MENS_LACROSSE, "Rifle" : constants.MENS_RIFLE, "Soccer (M)" : constants.MENS_SOCCER, "Soccer (W)" : constants.WOMENS_SOCCER, "Swimming & Diving (M)" : constants.MENS_SWIMMING_DIVING, "Swimming & Diving (W)" : constants.WOMENS_SWIMMING_DIVING, "Tennis (M)" : constants.MENS_TENNIS, "Tennis (W)" : constants.WOMENS_TENNIS, "Track & Field" : [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD], "Volleyball" : constants.WOMENS_VOLLEYBALL, "Water Polo" : constants.MENS_WATER_POLO, "Wresting" : constants.WRESTLING } print ("Scraping Air Force") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Air Force Academy") d = pq(url=college[1]) for key, sport in sports.items(): print (sport); strong = d("strong") header = strong('a:contains("' + key + '")') table = header.parent().parent().parent().parent().parent().parent().next() coaches = table("tr") for coach in coaches: scraper.parse_row(cxn, college[0], college[1], sport, coach, [["name", "email"], "title", "phone"], {'phone_prefix' : "(719) "}) scraper.close_connection(cxn)
# Wisconsin coaching-staff scraper.
#
# The page is one striped table: every "even"/"odd" row whose second cell holds
# a sport name known to scraper.sports is forwarded to scraper.parse_row.
from pyquery import PyQuery as pq
import scraper
import constants

print("Scraping Wisconsin")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Wisconsin")
d = pq(url=college[1])

for stripe in ("even", "odd"):
    for row in d.items("tr.{}".format(stripe)):
        for position, cell in enumerate(row.items("td")):
            # Only the second <td> (index 1) is inspected for a sport name.
            if position != 1:
                continue
            if cell.text() in scraper.sports:
                scraper.parse_row(
                    cxn,
                    college[0],
                    college[1],
                    scraper.sports[cell.text()],
                    row.children(),
                    [["name", "email"], "", "title", "phone"],
                )

scraper.close_connection(cxn)
"Field Hockey" : constants.FIELD_HOCKEY, "Football" : constants.FOOTBALL, "Golf - Men's" : constants.MENS_GOLF, "Golf - Women's" : constants.WOMENS_GOLF, "Ice Hockey - Men's" : constants.MENS_ICE_HOCKEY, "Ice Hockey - Women's" : constants.WOMENS_ICE_HOCKEY, "Lacrosse - Men's" : constants.MENS_LACROSSE, "Lacrosse - Women's" : constants.WOMENS_LACROSSE, "Soccer - Men's" : constants.MENS_SOCCER, "Soccer - Women's" : constants.WOMENS_SOCCER, "Softball" : constants.SOFTBALL, "Swimming & Diving - Men's" : constants.MENS_SWIMMING_DIVING, "Swimming & Diving - Women's" : constants.WOMENS_SWIMMING_DIVING, "Tennis - Men's" : constants.MENS_TENNIS, "Tennis - Women's" : constants.WOMENS_TENNIS, "Track and Field - Men's" : constants.MENS_TRACK_FIELD, "Track and Field - Women's" : constants.WOMENS_TRACK_FIELD, "Volleyball - Women's" : constants.WOMENS_VOLLEYBALL, "Water Polo" : [constants.MENS_WATER_POLO, constants.WOMENS_WATER_POLO], "Wrestling" : constants.WRESTLING } print ("Scraping Princeton") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Princeton University") d = pq(url=college[1]) print(d) for key, sport in sports.items(): print(sport) header = d('font:contains("' + key + ':")') print(header) info_row = header.parent().parent().parent().next() print(info_row) #while not info_row.attr("bgcolor"): # scraper.parse_row(cxn, college[0], college[1], sport, info_row.children(), [(["name", "title"], ","), "phone", "email"], # {'phone_prefix' : '(609) '}) # info_row = info_row.next() # print(info_row)
"M&W/Cross Country/Track & Field" : [constants.MENS_CROSS_COUNTRY, constants.MENS_TRACK_FIELD, constants.WOMENS_CROSS_COUNTRY, constants.WOMENS_TRACK_FIELD], "M&W Fencing" : [constants.MENS_FENCING, constants.WOMENS_FENCING], "Field Hockey" : constants.FIELD_HOCKEY, "Football" : constants.FOOTBALL, "Men's Golf" : constants.MENS_GOLF, "Women's Golf" : constants.WOMENS_GOLF, "Men's Gymnastics" : constants.MENS_GYMNASTICS, "Women's Gymnastics" : constants.WOMENS_GYMNASTICS, "Men's Ice Hockey" : constants.MENS_ICE_HOCKEY, "Women's Ice Hockey" : constants.WOMENS_ICE_HOCKEY, "Men's Lacrosse" : constants.MENS_LACROSSE, "Women's Lacrosse" : constants.WOMENS_LACROSSE, "Men's Soccer" : constants.MENS_SOCCER, "Women's Soccer" : constants.WOMENS_SOCCER, "Softball" : constants.SOFTBALL, "Men's & Women's Swimming & Diving" : [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING], "Men's Tennis" : constants.MENS_TENNIS, "Women's Tennis" : constants.WOMENS_TENNIS, "Women's Volleyball" : constants.WOMENS_VOLLEYBALL, "Wrestling" : constants.WRESTLING } print ("Scraping Penn State") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Penn State University") d = pq(url=college[1]) for key, sport in sports.items(): print(sport) header = d('span:contains("' + key + ':")') table = header.parent().parent().parent().parent() for idx, row in enumerate(table("tr")): if idx > 1: scraper.parse_row(cxn, college[0], college[1], sport, row.getchildren(), ["name", "title", "phone", "email"], {'phone_prefix' : '(814) '}) scraper.close_connection(cxn)
# Bemidji State University coaching-staff scraper.
#
# Each sport is introduced by a <th>; the rows after the heading's row are
# passed to scraper.parse_row for as long as they have more than two cells.
from pyquery import PyQuery as pq
import scraper
import constants

sports = {
    "Baseball": constants.BASEBALL,
    "Men's Basketball": constants.MENS_BASKETBALL,
    "Women's Basketball": constants.WOMENS_BASKETBALL,
    "Football": constants.FOOTBALL,
    "Men's Golf": constants.MENS_GOLF,
    "Women's Golf": constants.WOMENS_GOLF,
    "Men's Ice Hockey": constants.MENS_ICE_HOCKEY,
    "Women's Ice Hockey": constants.WOMENS_ICE_HOCKEY,
    "Soccer": constants.WOMENS_SOCCER,
    "Softball": constants.SOFTBALL,
    "Tennis": constants.WOMENS_TENNIS,
    "Track/Cross": [constants.WOMENS_TRACK_FIELD, constants.WOMENS_CROSS_COUNTRY],
    "Volleyball": constants.WOMENS_VOLLEYBALL,
}

print("Scraping Bemidji State")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Bemidji State University")
d = pq(url=college[1])

for heading_text, sport in sports.items():
    print(sport)
    heading_cell = d('th:contains("{}")'.format(heading_text))
    current_row = heading_cell.parent().next()
    while True:
        cells = current_row.children()
        # A row with two cells or fewer marks the end of this sport's staff.
        if len(cells) <= 2:
            break
        scraper.parse_row(
            cxn,
            college[0],
            college[1],
            sport,
            cells,
            ["name", "title", "phone", "email"],
        )
        current_row = current_row.next()

scraper.close_connection(cxn)
constants.SOFTBALL : ["Softball" , "/softball"], constants.MENS_SWIMMING_DIVING : ["Swimming/Diving (M)" , "/swimming-m"], constants.WOMENS_SWIMMING_DIVING : ["Swimming/Diving (W)" , "/swimming-w"], constants.MENS_TENNIS : ["Tennis (M)" , "/tennis-m"], constants.WOMENS_TENNIS : ["Tennis (W)" , "/tennis-w"], constants.MENS_TRACK_FIELD : ["Track/Field (M)" , "/track-m"], constants.WOMENS_TRACK_FIELD : ["Track/Field (W)" , "/track-w"], constants.WOMENS_VOLLEYBALL : ["Volleyball" , "/volleyball"], constants.WOMENS_WATER_POLO : ["Water Polo" , "/waterpolo"], constants.WRESTLING : ["Wrestling" , "/wrestling"], constants.CHEERLEADING : ["Cheerleading" , "/spirit"] } print ("Scraping Michigan") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Michigan") d = pq(url=college[1]) coach_name_pattern = re.compile('<a href="(.*)">(.*)<\/a> \((.*)\)') for sport, keys in sports.items(): print (sport) info_row = d('a:contains("' + keys[0] + '")').filter(lambda i, this: this.get("href") == keys[1]).parent().parent() print (info_row) # Some rows contain 2 coaches if len(info_row.children()[1].getchildren()) > 1: coaches = [] coach_elements = info_row.children()[1] coaches_names = str(etree.tostring(coach_elements), encoding='utf8').split("<br />") phone_elements = info_row.children()[2] phone_numbers = str(etree.tostring(phone_elements), encoding='utf8').replace("<td>", "").replace("</td>","").split("<br />") email_elements = info_row.children()[3] emails = str(etree.tostring(email_elements), encoding='utf8').replace("<td>", "").replace("</td>","").split("<br />")
# University of Maine coaching-staff scraper.
#
# Each sport has an <h2> heading; the element after it contains roster rows
# (class attribute starting with "roster-row") that go to scraper.parse_row.
from pyquery import PyQuery as pq
import scraper
import constants

sports = {
    "Baseball": constants.BASEBALL,
    "Basketball, Men's": constants.MENS_BASKETBALL,
    "Basketball, Women's": constants.WOMENS_BASKETBALL,
    "Cross Country, Men's and Women's": [constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY],
    "Field Hockey": constants.FIELD_HOCKEY,
    "Football": constants.FOOTBALL,
    "Ice Hockey, Men's": constants.MENS_ICE_HOCKEY,
    "Ice Hockey, Women's": constants.WOMENS_ICE_HOCKEY,
    "Soccer, Women's": constants.WOMENS_SOCCER,
    "Softball": constants.SOFTBALL,
    "Swimming": [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "Track and Field, Men's and Women's": [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD],
}

print("Scraping Maine")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Maine")
d = pq(url=college[1])

for heading_text, sport in sports.items():
    print(sport)
    section = d('h2:contains("{}")'.format(heading_text)).next()
    for roster_row in section("tr[class^='roster-row']"):
        scraper.parse_row(
            cxn,
            college[0],
            college[1],
            sport,
            roster_row,
            ["name", "title", "phone", "email"],
        )

scraper.close_connection(cxn)
# Alaska-Anchorage coaching-staff scraper.
#
# Sport headings are <strong> elements; the rows after the heading's
# grandparent are parsed until a row with a "class" attribute (or the end of
# the siblings) is reached.
from pyquery import PyQuery as pq
import scraper
import constants

sports = {
    "BASKETBALL-M": constants.MENS_BASKETBALL,
    "BASKETBALL-W": constants.WOMENS_BASKETBALL,
    "CROSS COUNTRY": [constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY],
    "GYMNASTICS": constants.WOMENS_GYMNASTICS,
    "HOCKEY": constants.MENS_ICE_HOCKEY,
    "SKIING": [constants.MENS_SKIING, constants.WOMENS_SKIING],
    "TRACK & FIELD": [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD],
    "VOLLEYBALL": constants.WOMENS_VOLLEYBALL,
}

cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Alaska-Anchorage")
d = pq(url=college[1])

for heading_text, sport in sports.items():
    print(sport)
    heading = d('strong:contains("{}")'.format(heading_text))
    current_row = heading.parent().parent().next()
    while current_row:
        # A row carrying a class attribute ends the current sport's block.
        if current_row.attr("class"):
            break
        scraper.parse_row(
            cxn,
            college[0],
            college[1],
            sport,
            current_row.children(),
            [(["name", "title"], ","), "phone", "email"],
        )
        current_row = current_row.next()

scraper.close_connection(cxn)
# North Dakota coaching-staff scraper.
#
# Each sport has an <h3> heading; the rows following the heading's grandparent
# are parsed for as long as their second cell contains text.
from pyquery import PyQuery as pq
import scraper
import constants

sports = {
    "Baseball": constants.BASEBALL,
    "Basketball - Men's": constants.MENS_BASKETBALL,
    "Basketball - Women's": constants.WOMENS_BASKETBALL,
    "Cheer Team": constants.CHEERLEADING,
    "Football": constants.FOOTBALL,
    "Golf": [constants.MENS_GOLF, constants.WOMENS_GOLF],
    "Hockey - Men's": constants.MENS_ICE_HOCKEY,
    "Hockey - Women's": constants.WOMENS_ICE_HOCKEY,
    "Soccer": constants.WOMENS_SOCCER,
    "Softball": constants.SOFTBALL,
    "Swimming and Diving": [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "Tennis": [constants.MENS_TENNIS, constants.WOMENS_TENNIS],
    "Track and Field / Cross Country": [
        constants.MENS_CROSS_COUNTRY,
        constants.WOMENS_CROSS_COUNTRY,
        constants.MENS_TRACK_FIELD,
        constants.WOMENS_TRACK_FIELD,
    ],
    "Volleyball": constants.WOMENS_VOLLEYBALL,
}

print("Scraping North Dakota")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "North Dakota")
d = pq(url=college[1])

for key, sport in sports.items():
    print(sport)
    header = d('h3:contains("' + key + '")')
    info_row = header.parent().parent().next()
    while True:
        cells = info_row.children()
        # Stop when the row has no second cell (heading not found, or the rows
        # ran out) instead of raising IndexError on cells[1], and when the
        # second cell is blank, which ends the sport's block.
        if len(cells) < 2 or not scraper.strip_string(cells[1].text):
            break
        scraper.parse_row(cxn, college[0], college[1], sport, cells,
                          ["name", "title", "phone", "email"],
                          {'phone_prefix': '(701) 77'})
        info_row = info_row.next()

scraper.close_connection(cxn)
# University of Denver coaching-staff scraper.
#
# Sport headings are <font> elements that carry a "face" attribute and whose
# stripped text equals the sport name exactly; the rows after the heading's
# great-grandparent are parsed while they have more than one cell.
from pyquery import PyQuery as pq
import scraper
import constants

sports = {
    "Men's Basketball": constants.MENS_BASKETBALL,
    "Women's Basketball": constants.WOMENS_BASKETBALL,
    "Gymnastics": constants.WOMENS_GYMNASTICS,
    "Men's Golf": constants.MENS_GOLF,
    "Women's Golf": constants.WOMENS_GOLF,
    "Men's Ice Hockey": constants.MENS_ICE_HOCKEY,
    "Men's Lacrosse": constants.MENS_LACROSSE,
    "Women's Lacrosse": constants.WOMENS_LACROSSE,
    "Skiing": [constants.MENS_SKIING, constants.WOMENS_SKIING],
    "Men's Soccer": constants.MENS_SOCCER,
    "Women's Soccer": constants.WOMENS_SOCCER,
    "Swimming and Diving": [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "Men's Tennis": constants.MENS_TENNIS,
    "Women's Tennis": constants.WOMENS_TENNIS,
    "Women's Volleyball": constants.WOMENS_VOLLEYBALL,
}

print("Scraping Denver")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Denver")
d = pq(url=college[1])

for key, sport in sports.items():
    print(sport)
    candidates = d('font:contains("{}")'.format(key))
    # One-argument callbacks: PyQuery makes the current element available
    # under the name ``this`` while the filter runs.
    exact_matches = candidates.filter(lambda i: scraper.strip_string(this.text) == key)
    header = exact_matches.filter(lambda i: this.get("face"))

    current_row = header.parent().parent().parent().next()
    while True:
        cells = current_row.children()
        if len(cells) <= 1:
            break
        scraper.parse_row(
            cxn,
            college[0],
            college[1],
            sport,
            cells,
            [["name", "email"], "title", "phone"],
        )
        current_row = current_row.next()

scraper.close_connection(cxn)
# Notre Dame coaching-staff scraper.
#
# Sport headings are <b> elements; the rows after the heading's grandparent
# are parsed while they have between 2 and 4 cells.
sports = {
    "Baseball": constants.BASEBALL,
    "Men's Basketball": constants.MENS_BASKETBALL,
    "Women's Basketball": constants.WOMENS_BASKETBALL,
    "Cheerleading": constants.CHEERLEADING,
    "Men's Cross Country": constants.MENS_CROSS_COUNTRY,
    "Women's Cross Country": constants.WOMENS_CROSS_COUNTRY,
    "Fencing": [constants.MENS_FENCING, constants.WOMENS_FENCING],
    "Football": constants.FOOTBALL,
    "Men's Golf": constants.MENS_GOLF,
    "Women's Golf": constants.WOMENS_GOLF,
    "Hockey": constants.MENS_ICE_HOCKEY,
    "Men's Lacrosse": constants.MENS_LACROSSE,
    "Women's Lacrosse": constants.WOMENS_LACROSSE,
    "Rowing": constants.WOMENS_ROWING,
    "Men's Soccer": constants.MENS_SOCCER,
    "Women's Soccer": constants.WOMENS_SOCCER,
    "Softball": constants.SOFTBALL,
    "Men's Swimming & Diving": constants.MENS_SWIMMING_DIVING,
    "Women's Swimming & Diving": constants.WOMENS_SWIMMING_DIVING,
    "Men's Tennis": constants.MENS_TENNIS,
    "Women's Tennis": constants.WOMENS_TENNIS,
    "Track & Field": [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD],
    "Volleyball": constants.WOMENS_VOLLEYBALL,
}

print("Scraping Notre Dame")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Notre Dame")
d = pq(url=college[1])

for heading_text, sport in sports.items():
    print(sport)
    heading = d('b:contains("{}")'.format(heading_text))
    current_row = heading.parent().parent().next()
    while True:
        cells = current_row.children()
        # Staff rows have 2-4 cells; any other count ends the block.
        if not (1 < cells.length < 5):
            break
        scraper.parse_row(
            cxn,
            college[0],
            college[1],
            sport,
            cells,
            ["name", "title", "phone", "email"],
            {'phone_prefix': "(574) "},
        )
        current_row = current_row.next()

scraper.close_connection(cxn)
# Providence College coaching-staff scraper.
#
# Sport headings are <strong> elements whose own text starts with the sport
# name; the rows after the heading's grandparent are parsed while they have
# 2 or 3 cells.
from pyquery import PyQuery as pq
import scraper
import constants

sports = {
    "MEN'S BASKETBALL": constants.MENS_BASKETBALL,
    "WOMEN'S BASKETBALL": constants.WOMENS_BASKETBALL,
    "CHEERLEADING": constants.CHEERLEADING,
    "MEN'S AND WOMEN'S CROSS COUNTRY / TRACK AND FIELD": [
        constants.MENS_CROSS_COUNTRY,
        constants.WOMENS_CROSS_COUNTRY,
        constants.MENS_TRACK_FIELD,
        constants.WOMENS_TRACK_FIELD,
    ],
    "FIELD HOCKEY": constants.FIELD_HOCKEY,
    "MEN'S ICE HOCKEY": constants.MENS_ICE_HOCKEY,
    "WOMEN'S ICE HOCKEY": constants.WOMENS_ICE_HOCKEY,
    "MEN'S LACROSSE": constants.MENS_LACROSSE,
    "MEN'S SOCCER": constants.MENS_SOCCER,
    "WOMEN'S SOCCER": constants.WOMENS_SOCCER,
    "SOFTBALL": constants.SOFTBALL,
    "MEN'S AND WOMEN'S SWIMMING": [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "WOMEN'S TENNIS": constants.WOMENS_TENNIS,
    "WOMEN'S VOLLEYBALL": constants.WOMENS_VOLLEYBALL,
}

print("Scraping Providence")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Providence College")
d = pq(url=college[1])

for key, sport in sports.items():
    print(sport)
    candidates = d('strong:contains("{}")'.format(key))
    # ":contains" is a substring match; keep only headings that begin with
    # the key so that e.g. "MEN'S SOCCER" does not match "WOMEN'S SOCCER".
    header = candidates.filter(lambda i, this: this.text.startswith(key))
    current_row = header.parent().parent().next()
    while True:
        cells = current_row.children()
        if not (1 < cells.length < 4):
            break
        scraper.parse_row(
            cxn,
            college[0],
            college[1],
            sport,
            cells,
            [(["name", "title"], ","), "email", "phone"],
        )
        current_row = current_row.next()

scraper.close_connection(cxn)
# Connecticut coaching-staff scraper (fragment; no close_connection call is
# visible here, the script may continue below).
#
# The sport heading is looked up as <strong>, then <b>, then <a>, keeping only
# elements whose own text starts with the sport name.
sports = {
    "BASEBALL" : constants.BASEBALL,
    "MEN'S BASKETBALL" : constants.MENS_BASKETBALL,
    "WOMEN'S BASKETBALL" : constants.WOMENS_BASKETBALL,
    "MEN'S CROSS COUNTRY/TRACK & FIELD" : [constants.MENS_CROSS_COUNTRY, constants.MENS_TRACK_FIELD],
    "WOMEN'S CROSS COUNTRY/TRACK & FIELD" : [constants.WOMENS_CROSS_COUNTRY, constants.WOMENS_TRACK_FIELD],
    "WOMEN'S FIELD HOCKEY" : constants.FIELD_HOCKEY,
    "FOOTBALL" : constants.FOOTBALL,
    "GOLF" : constants.MENS_GOLF,
    "MEN'S ICE HOCKEY" : constants.MENS_ICE_HOCKEY,
    "WOMEN'S ICE HOCKEY" : constants.WOMENS_ICE_HOCKEY,
    "WOMEN'S LACROSSE" : constants.WOMENS_LACROSSE,
    "WOMEN'S ROWING" : constants.WOMENS_ROWING,
    "MEN'S SOCCER" : constants.MENS_SOCCER,
    "WOMEN'S SOCCER" : constants.WOMENS_SOCCER,
    "SOFTBALL" : constants.SOFTBALL,
    "MEN'S & WOMEN'S SWIMMING/DIVING" : [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "MEN'S TENNIS" : constants.MENS_TENNIS,
    "WOMEN'S TENNIS" : constants.WOMENS_TENNIS,
    "WOMEN'S VOLLEYBALL" : constants.WOMENS_VOLLEYBALL }

print ("Scraping Connecticut")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Connecticut")
d = pq(url=college[1])
for key, sport in sports.items():
    header = d('strong:contains("' + key + '")').filter(lambda i, this: this.text.startswith(key))
    if not header:
        # Fallback 1: heading marked up with <b>.
        header = d('b:contains("' + key + '")').filter(lambda i, this: this.text.startswith(key))
        if not header:
            # Fallback 2: heading is a link.
            # NOTE(review): the original indentation was lost; the extra
            # parent() hop is assumed to apply to this <a> fallback only --
            # confirm against the original file.
            header = d('a:contains("' + key + '")').filter(lambda i, this: this.text.startswith(key))
            header = header.parent()
    # First candidate staff row: two siblings past the heading's great-grandparent.
    info_row = header.parent().parent().parent().next().next()
    # Skip one more row when the candidate has fewer than two cells.
    if len(info_row.children()) < 2:
        info_row = info_row.next()
    # Parse rows until one with at most a single cell is reached.
    while len(info_row.children()) > 1:
        scraper.parse_row(cxn, college[0], college[1], sport, info_row.children(), ["name", "title", "email"])
        info_row = info_row.next()
# Lake Superior State University coaching-staff scraper (fragment; the
# handling of each matched paragraph continues below this fragment).
#
# The page lists staff as <p> paragraphs; a paragraph whose own text starts
# with one of the keys below is split on commas to extract the fields.
from pyquery import PyQuery as pq
import scraper
import constants

# Paragraph prefix -> sport constant(s). Some keys end in a comma
# ("Softball,", "Volleyball,"); the key is matched as a prefix of the
# paragraph text and then sliced off.
sports = {
    "Basketball, Men's" : constants.MENS_BASKETBALL,
    "Basketball, Women's" : constants.WOMENS_BASKETBALL,
    "Cross Country, Men's and Women's" : [constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY],
    "Golf, Men's" : constants.MENS_GOLF,
    "Golf, Women's" : constants.WOMENS_GOLF,
    "Ice Hockey, Men's" : constants.MENS_ICE_HOCKEY,
    "Softball," : constants.SOFTBALL,
    "Tennis, Men's and Women's" : [constants.MENS_TENNIS, constants.WOMENS_TENNIS],
    "Track & Field, Men's and Women's" : [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD],
    "Volleyball," : constants.WOMENS_VOLLEYBALL }

print ("Scraping Lake Superior State")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Lake Superior State University")
d = pq(url=college[1])
lines = d("p")
for line in lines:
    for key, sport in sports.items():
        if line.text and line.text.startswith(key):
            # Everything after the sport prefix, including text of child elements.
            rest_of_line = line.text_content()[len(key):]
            fields = rest_of_line.split(",")
            # The first field is split around the word "coach": the part
            # before it becomes the title, the part after it the name.
            # NOTE(review): raises IndexError if "coach" is absent from the
            # first field.
            name_and_title = fields[0].split("coach")
            title = scraper.strip_string(name_and_title[0]).capitalize() + " Coach"
            name = scraper.strip_string(name_and_title[1])
            phone_index = 1
            # If no name followed "coach", take it from the next
            # comma-separated field and shift phone_index accordingly.
            if not name:
                name = scraper.strip_string(fields[1])
                phone_index = 2
# Miami University (Ohio) coaching-staff scraper.
#
# Sport headings are <strong> elements; the rows after the heading's
# great-grandparent are parsed while they have more than one cell.
from pyquery import PyQuery as pq
import scraper
import constants

sports = {
    "Baseball": constants.BASEBALL,
    "Basketball (Men's)": constants.MENS_BASKETBALL,
    "Basketball (Women's)": constants.WOMENS_BASKETBALL,
    "Cheerleading": constants.CHEERLEADING,
    "Field Hockey": constants.FIELD_HOCKEY,
    "Football": constants.FOOTBALL,
    "Golf": constants.MENS_GOLF,
    "Ice Hockey": constants.MENS_ICE_HOCKEY,
    "Soccer": constants.WOMENS_SOCCER,
    "Softball": constants.SOFTBALL,
    "Swimming & Diving (Men's)": constants.MENS_SWIMMING_DIVING,
    "Swimming & Diving (Women's)": constants.WOMENS_SWIMMING_DIVING,
    "Tennis": constants.WOMENS_TENNIS,
    "Track & Field/Cross Country (Men's)": [constants.MENS_CROSS_COUNTRY, constants.MENS_TRACK_FIELD],
    "Track & Field/Cross Country (Women's)": [constants.WOMENS_CROSS_COUNTRY, constants.WOMENS_TRACK_FIELD],
    "Volleyball": constants.WOMENS_VOLLEYBALL,
}

print("Scraping Miami (OH)")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Miami University (Ohio)")
d = pq(url=college[1])

for heading_text, sport in sports.items():
    heading = d('strong:contains("{}")'.format(heading_text))
    current_row = heading.parent().parent().parent().next()
    while True:
        cells = current_row.children()
        if len(cells) <= 1:
            break
        scraper.parse_row(
            cxn,
            college[0],
            college[1],
            sport,
            cells,
            ["name", "title", "phone", "email"],
        )
        current_row = current_row.next()

scraper.close_connection(cxn)
# Boston University coaching-staff scraper.
#
# Every sport has an <a name="..."> anchor; the rows after the anchor's
# grandparent are parsed until a row with a "style" attribute is reached.
from pyquery import PyQuery as pq
import scraper
import constants

# Anchor name on the page -> sport constant(s).
bu_sports = {
    "mbb": constants.MENS_BASKETBALL,
    "wbb": constants.WOMENS_BASKETBALL,
    "mcrew": constants.MENS_ROWING,
    "mwxc": [
        constants.MENS_CROSS_COUNTRY,
        constants.WOMENS_CROSS_COUNTRY,
        constants.MENS_TRACK_FIELD,
        constants.WOMENS_TRACK_FIELD,
    ],
    "fh": constants.FIELD_HOCKEY,
    "wgolf": constants.WOMENS_GOLF,
    "mih": constants.MENS_ICE_HOCKEY,
    "wih": constants.WOMENS_ICE_HOCKEY,
    "mlax": constants.MENS_LACROSSE,
    "wlax": constants.WOMENS_LACROSSE,
    "wrow": constants.WOMENS_ROWING,
    "msoc": constants.MENS_SOCCER,
    "wsoc": constants.WOMENS_SOCCER,
    "sb": constants.SOFTBALL,
    "mwsd": [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "mten": constants.MENS_TENNIS,
    "wten": constants.WOMENS_TENNIS,
    "wr": constants.WRESTLING,
}

print("Scraping BU")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Boston University")
d = pq(url=college[1])

for key, sport in bu_sports.items():
    print(sport)
    anchor = d('a[name="' + key + '"]')
    info_row = anchor.parent().parent().next()
    # A row with a style attribute ends the sport's block. The emptiness check
    # is required as well: an empty selection (anchor missing, or no styled
    # row after the last sport) reports attr("style") as None forever, which
    # previously made this loop spin without end.
    while info_row and info_row.attr("style") is None:
        scraper.parse_row(cxn, college[0], college[1], sport, info_row.children(),
                          ["name", "title", "email", "phone"])
        info_row = info_row.next()

scraper.close_connection(cxn)
# Boston College coaching-staff scraper.
#
# Every sport has an <a name="..."> anchor; the rows after the anchor's
# grandparent are parsed while they have more than one cell.
import scraper
import constants

# Anchor name on the page -> sport constant(s).
sports = {
    "m-basebl": constants.BASEBALL,
    "m-baskbl": constants.MENS_BASKETBALL,
    "w-baskbl": constants.WOMENS_BASKETBALL,
    "cheer": constants.CHEERLEADING,
    "xc": constants.MENS_CROSS_COUNTRY,
    "w-xc": constants.WOMENS_CROSS_COUNTRY,
    "fence": constants.MENS_FENCING,
    "f-hockey": constants.FIELD_HOCKEY,
    "m-footbl": constants.FOOTBALL,
    "m-golf": constants.MENS_GOLF,
    "w-golf": constants.WOMENS_GOLF,
    "m-ihockey": constants.MENS_ICE_HOCKEY,
    "w-ihockey": constants.WOMENS_ICE_HOCKEY,
    "w-lax": constants.WOMENS_LACROSSE,
    "w-row": constants.WOMENS_ROWING,
    "sail": constants.SAILING,
    "skiing": [constants.MENS_SKIING, constants.WOMENS_SKIING],
    "m-soccer": constants.MENS_SOCCER,
    "w-soccer": constants.WOMENS_SOCCER,
    "softbl": constants.SOFTBALL,
    "swim": [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "m-tennis": constants.MENS_TENNIS,
    "w-tennis": constants.WOMENS_TENNIS,
    "track": constants.MENS_TRACK_FIELD,
    "w-tf": constants.WOMENS_TRACK_FIELD,
    "w-volley": constants.WOMENS_VOLLEYBALL,
}

print("Scraping BC")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Boston College")
d = pq(url=college[1])

for anchor_name, sport in sports.items():
    print(sport)
    current_row = d('a[name="{}"]'.format(anchor_name)).parent().parent().next()
    while True:
        cells = current_row.children()
        if not cells.length > 1:
            break
        scraper.parse_row(
            cxn,
            college[0],
            college[1],
            sport,
            cells,
            ["name", "title", "phone", "email"],
        )
        current_row = current_row.next()

scraper.close_connection(cxn)
# Ferris State University staff-directory scraper.
#
# Looks up the college record, loads the page at college[1], finds each
# sport's <h3> heading and hands the roster rows of the element that follows
# it to scraper.parse_row.
from pyquery import PyQuery as pq

import scraper
import constants

# Heading text searched for on the page -> sport constant passed to
# scraper.parse_row.
sports = {
    "Men's Basketball": constants.MENS_BASKETBALL,
    "Women's Basketball": constants.WOMENS_BASKETBALL,
    "Men's Cross Country": constants.MENS_CROSS_COUNTRY,
    "Women's Cross Country": constants.WOMENS_CROSS_COUNTRY,
    "Football": constants.FOOTBALL,
    "Men's Golf": constants.MENS_GOLF,
    "Women's Golf": constants.WOMENS_GOLF,
    "Men's Ice Hockey": constants.MENS_ICE_HOCKEY,
    "Women's Soccer": constants.WOMENS_SOCCER,
    "Softball": constants.SOFTBALL,
    "Men's Tennis": constants.MENS_TENNIS,
    "Women's Tennis": constants.WOMENS_TENNIS,
    "Men's Track & Field": constants.MENS_TRACK_FIELD,
    "Women's Track & Field": constants.WOMENS_TRACK_FIELD,
    "Women's Volleyball": constants.WOMENS_VOLLEYBALL,
}

print("Scraping Ferris State")

cxn = scraper.get_connection()
try:
    college = scraper.get_college(cxn, "Ferris State University")
    d = pq(url=college[1])

    for key, sport in sports.items():
        print(sport)
        header = d('h3:contains("' + key + '")')
        # The staff listing is the element immediately after the heading.
        tables = header.next()
        # Only rows whose class attribute starts with "roster-row" are parsed.
        coaches = tables("tr[class^='roster-row']")
        for coach in coaches:
            scraper.parse_row(
                cxn,
                college[0],
                college[1],
                sport,
                coach,
                ["name", "title", "phone", "email"],
            )
finally:
    # Release the connection even if fetching or parsing raised.
    scraper.close_connection(cxn)
"Softball": constants.SOFTBALL, "Swimming & Diving - Men's": constants.MENS_SWIMMING_DIVING, "Swimming & Diving - Women's": constants.WOMENS_SWIMMING_DIVING, "Tennis - Men's": constants.MENS_TENNIS, "Tennis - Women's": constants.WOMENS_TENNIS, "Track, Field & Cross Country": [ constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY, constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD, ], "Volleyball - Women's": constants.WOMENS_VOLLEYBALL, "Waterpolo - Men's and Women's": [constants.MENS_WATER_POLO, constants.WOMENS_WATER_POLO], "Wrestling": constants.WRESTLING, } print("Scraping Harvard") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Harvard University") d = pq(url=college[1]) for key, sport in sports.items(): print(sport) header = d('h2:contains("' + key + '")') tables = header.next() coaches = tables("tr[class^='roster-row']") for coach in coaches: scraper.parse_row(cxn, college[0], college[1], sport, coach, ["name", "title", "phone", "email"]) scraper.close_connection(cxn)
# Merrimack College staff-directory scraper.
#
# Looks up the college record, loads the page at college[1], finds each
# sport's <h2> heading and hands the roster rows found two siblings after it
# to scraper.parse_row.
from pyquery import PyQuery as pq  # pq() is called below; import it explicitly

import scraper
import constants

# Heading text searched for on the page -> sport constant(s). The value (a
# single constant or a list of them) is passed to scraper.parse_row unchanged.
sports = {
    "Baseball": constants.BASEBALL,
    "Cross Country": [constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY],
    "Men's Basketball": constants.MENS_BASKETBALL,
    "Women's Basketball": constants.WOMENS_BASKETBALL,
    "Women's Crew": constants.WOMENS_ROWING,
    "Field Hockey": constants.FIELD_HOCKEY,
    "Football": constants.FOOTBALL,
    "Women's Golf": constants.WOMENS_GOLF,
    "Men's Ice Hockey": constants.MENS_ICE_HOCKEY,
    "Women's Ice Hockey": constants.WOMENS_ICE_HOCKEY,
    "Men's Lacrosse": constants.MENS_LACROSSE,
    "Women's Lacrosse": constants.WOMENS_LACROSSE,
    "Men's Soccer": constants.MENS_SOCCER,
    "Women's Soccer": constants.WOMENS_SOCCER,
    "Softball": constants.SOFTBALL,
    "Women's Swimming": constants.WOMENS_SWIMMING_DIVING,
    "Tennis": [constants.MENS_TENNIS, constants.WOMENS_TENNIS],
    "Indoor and Outdoor Track": [
        constants.MENS_TRACK_FIELD,
        constants.WOMENS_TRACK_FIELD,
    ],
    "Women's Volleyball": constants.WOMENS_VOLLEYBALL,
}

print("Scraping Merrimack")

cxn = scraper.get_connection()
try:
    college = scraper.get_college(cxn, "Merrimack College")
    d = pq(url=college[1])

    for key, sport in sports.items():
        print(sport)
        header = d('h2:contains("' + key + '")')
        # The staff listing is the second sibling after the heading.
        table = header.next().next()
        # Only rows whose class attribute starts with "roster-row" are parsed.
        coaches = table("tr[class^='roster-row']")
        for coach in coaches:
            scraper.parse_row(
                cxn,
                college[0],
                college[1],
                sport,
                coach,
                ["name", "title", "phone", "email"],
            )
finally:
    # Release the connection even if fetching or parsing raised.
    scraper.close_connection(cxn)
"Golf, Men's" : constants.MENS_GOLF, "Golf, Women's" : constants.WOMENS_GOLF, "Gymnastics, Men's" : constants.MENS_GYMNASTICS, "Gymnastics, Women's" : constants.WOMENS_GYMNASTICS, "Ice Hockey, Men's" : constants.MENS_ICE_HOCKEY, "Ice Hockey, Women's" : constants.WOMENS_ICE_HOCKEY, "Lacrosse, Men's" : constants.MENS_LACROSSE, "Rifle, Men's/Women's" : [constants.MENS_RIFLE, constants.WOMENS_RIFLE], "Rowing, Women's" : constants.WOMENS_ROWING, "Soccer, Men's" : constants.MENS_SOCCER, "Soccer, Women's" : constants.WOMENS_SOCCER, "Softball, Women's" : constants.SOFTBALL, "Swimming, Men's" : constants.MENS_SWIMMING_DIVING, "Swimming, Women's" : constants.WOMENS_SWIMMING_DIVING, "Tennis, Men's" : constants.MENS_TENNIS, "Tennis, Women's" : constants.WOMENS_TENNIS, "Track & Field/Cross Country, Men's" : [constants.MENS_CROSS_COUNTRY, constants.MENS_TRACK_FIELD], "Track & Field/Cross Country, Women's" : [constants.WOMENS_CROSS_COUNTRY, constants.WOMENS_TRACK_FIELD], "Volleyball, Women's" : constants.WOMENS_VOLLEYBALL, "Wrestling" : constants.WRESTLING } print ("Scraping Ohio State") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Ohio State University") d = pq(url=college[1]) for key, sport in sports.items(): print(sport) header = d('b:contains("' + key + '")') table = header.parent().parent().parent() first = True for row in table("tr"): if not first: scraper.parse_row(cxn, college[0], college[1], sport, row.getchildren(), ["name", "title", "phone", "email"], {'phone_prefix' : '(614) '}) first = False scraper.close_connection(cxn)
"Women's Basketball" : constants.WOMENS_BASKETBALL, "Football" : constants.FOOTBALL, "Golf" : constants.MENS_GOLF, "Gymnastics" : constants.MENS_GYMNASTICS, "Hockey" : constants.MENS_ICE_HOCKEY, "Lacrosse" : constants.MENS_LACROSSE, "Rifle" : [constants.MENS_RIFLE, constants.WOMENS_RIFLE], "Men's Soccer" : constants.MENS_SOCCER, "Women's Soccer" : constants.WOMENS_SOCCER, "Softball" : constants.SOFTBALL, "Swimming and Diving" : [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING], "Men's Tennis" : constants.MENS_TENNIS, "Women's Tennis" : constants.WOMENS_TENNIS, "Track & Field/Cross Country" : [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD, constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY], "Volleyball" : constants.WOMENS_VOLLEYBALL, "Wrestling" : constants.WRESTLING} print ("Scraping Army") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Army") d = pq(url=college[1]) for key, sport in sports.items(): print (sport); finder = d('strong:contains("' + key + '")') if not finder: finder = d('span:contains("' + key + '")').filter(lambda i, this: not 'Sprint' in this.text) while not finder.is_("tr"): finder = finder.parent() info_row = finder.next().next() while not info_row.is_("tr"): info_row = info_row.next() while info_row.children(): print(info_row) scraper.parse_row(cxn, college[0], college[1], sport, info_row.children(), ["name", "title", "phone", "email"], {'email_suffix' : '@usma.edu', 'phone_prefix' : '(845) 938-', 'truncate_name' : "- @"})