from pyquery import PyQuery as pq

import scraper
import constants

# Maps each heading on the North Dakota staff-directory page to the sport
# id(s) in our database.  A list value means one heading covers several of
# our sports (e.g. a combined men's/women's program).
sports = {
    "Baseball": constants.BASEBALL,
    "Basketball - Men's": constants.MENS_BASKETBALL,
    "Basketball - Women's": constants.WOMENS_BASKETBALL,
    "Cheer Team": constants.CHEERLEADING,
    "Football": constants.FOOTBALL,
    "Golf": [constants.MENS_GOLF, constants.WOMENS_GOLF],
    "Hockey - Men's": constants.MENS_ICE_HOCKEY,
    "Hockey - Women's": constants.WOMENS_ICE_HOCKEY,
    "Soccer": constants.WOMENS_SOCCER,
    "Softball": constants.SOFTBALL,
    "Swimming and Diving": [constants.MENS_SWIMMING_DIVING,
                            constants.WOMENS_SWIMMING_DIVING],
    "Tennis": [constants.MENS_TENNIS, constants.WOMENS_TENNIS],
    "Track and Field / Cross Country": [constants.MENS_CROSS_COUNTRY,
                                        constants.WOMENS_CROSS_COUNTRY,
                                        constants.MENS_TRACK_FIELD,
                                        constants.WOMENS_TRACK_FIELD],
    "Volleyball": constants.WOMENS_VOLLEYBALL
}

print("Scraping North Dakota")
cxn = scraper.get_connection()
# college is a row whose fields we index: [0] = college id, [1] = staff URL.
college = scraper.get_college(cxn, "North Dakota")
d = pq(url=college[1])

for key, sport in sports.items():
    print(sport)
    # Each sport's staff listing starts in the row following the <h3>
    # heading's enclosing row (two levels up from the <h3> itself).
    header = d('h3:contains("' + key + '")')
    info_row = header.parent().parent().next()
    # Walk successive rows until the second cell is blank (end of listing).
    # FIX: guard on len() first so a structurally different trailing row
    # (fewer than 2 children) stops the loop instead of raising IndexError,
    # matching the guard style used by the Denver scraper.
    while (len(info_row.children()) > 1 and
           scraper.strip_string(info_row.children()[1].text)):
        scraper.parse_row(cxn, college[0], college[1], sport,
                          info_row.children(),
                          ["name", "title", "phone", "email"],
                          {'phone_prefix': '(701) 77'})
        info_row = info_row.next()

scraper.close_connection(cxn)
from pyquery import PyQuery as pq

import scraper
import constants

# Maps each heading on the Denver staff-directory page to the sport id(s)
# in our database.  A list value means one heading covers several sports.
sports = {
    "Men's Basketball": constants.MENS_BASKETBALL,
    "Women's Basketball": constants.WOMENS_BASKETBALL,
    "Gymnastics": constants.WOMENS_GYMNASTICS,
    "Men's Golf": constants.MENS_GOLF,
    "Women's Golf": constants.WOMENS_GOLF,
    "Men's Ice Hockey": constants.MENS_ICE_HOCKEY,
    "Men's Lacrosse": constants.MENS_LACROSSE,
    "Women's Lacrosse": constants.WOMENS_LACROSSE,
    "Skiing": [constants.MENS_SKIING, constants.WOMENS_SKIING],
    "Men's Soccer": constants.MENS_SOCCER,
    "Women's Soccer": constants.WOMENS_SOCCER,
    "Swimming and Diving": [constants.MENS_SWIMMING_DIVING,
                            constants.WOMENS_SWIMMING_DIVING],
    "Men's Tennis": constants.MENS_TENNIS,
    "Women's Tennis": constants.WOMENS_TENNIS,
    "Women's Volleyball": constants.WOMENS_VOLLEYBALL
}

print("Scraping Denver")
cxn = scraper.get_connection()
# college is a row whose fields we index: [0] = college id, [1] = staff URL.
college = scraper.get_college(cxn, "Denver")
d = pq(url=college[1])

for key, sport in sports.items():
    print(sport)
    # Find the <font> element whose stripped text exactly equals the heading
    # and which carries a "face" attribute (distinguishes the heading from
    # other <font> tags containing the same text).
    # FIX: use pyquery's explicit two-argument callback form
    # (lambda i, this: ...) instead of a one-argument lambda that relies on
    # pyquery injecting `this` into the lambda's globals — consistent with
    # the Michigan scraper and not dependent on that injection hack.
    header = (d('font:contains("' + key + '")')
              .filter(lambda i, this: scraper.strip_string(this.text) == key)
              .filter(lambda i, this: this.get("face")))
    # Staff rows start in the row following the heading's enclosing row
    # (three levels up from the <font> element).
    info_row = header.parent().parent().parent().next()
    # Rows with at most one child mark the end of this sport's listing.
    while len(info_row.children()) > 1:
        # First column holds both name and email for the same cell.
        scraper.parse_row(cxn, college[0], college[1], sport,
                          info_row.children(),
                          [["name", "email"], "title", "phone"])
        info_row = info_row.next()

scraper.close_connection(cxn)
# NOTE(review): truncated fragment of the Lake Superior State scraper.  The
# imports and the opening of the `sports` dict are cut off before this chunk,
# and the chunk ends mid-statement at `for sp in sport:` with no loop body —
# presumably the missing tail saves the parsed coach once per sport id, like
# the other scrapers; confirm against the full file.  Left byte-identical
# because a safe rewrite is not possible from this partial view.
"Track & Field, Men's and Women's" : [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD], "Volleyball," : constants.WOMENS_VOLLEYBALL } print ("Scraping Lake Superior State") cxn = scraper.get_connection() college = scraper.get_college(cxn, "Lake Superior State University") d = pq(url=college[1]) lines = d("p") for line in lines: for key, sport in sports.items(): if line.text and line.text.startswith(key): rest_of_line = line.text_content()[len(key):] fields = rest_of_line.split(",") name_and_title = fields[0].split("coach") title = scraper.strip_string(name_and_title[0]).capitalize() + " Coach" name = scraper.strip_string(name_and_title[1]) phone_index = 1 if not name: name = scraper.strip_string(fields[1]) phone_index = 2 phone = scraper.strip_string(fields[phone_index]) if not phone[:1].isdigit(): email = phone phone = scraper.strip_string(fields[phone_index + 1]) else: email = scraper.strip_string(fields[phone_index + 1]) if isinstance(sport, list): for sp in sport:
# NOTE(review): this chunk starts mid-file — the imports (re, etree, scraper,
# pyquery) and the `sports` dict (mapping sport name -> (link text, href)
# pairs, judging by how `keys` is indexed below) are above this chunk.
cxn = scraper.get_connection()
# college is a row whose fields we index: [0] = college id, [1] = staff URL.
college = scraper.get_college(cxn, "Michigan")
d = pq(url=college[1])

# Matches '<a href="PROFILE_URL">NAME</a> (SPORT)' in a serialized table cell.
# FIX: raw string — the original non-raw pattern contained the invalid string
# escapes '\/' and '\(' and only worked because Python passes unknown escapes
# through verbatim (a SyntaxWarning on modern Pythons).  The compiled regex
# is unchanged ('\/' and '/' match the same character).
coach_name_pattern = re.compile(r'<a href="(.*)">(.*)</a> \((.*)\)')

for sport, keys in sports.items():
    print(sport)
    # keys[0] is the anchor's link text, keys[1] its exact href; climb from
    # the matched <a> to its enclosing table row.
    info_row = (d('a:contains("' + keys[0] + '")')
                .filter(lambda i, this: this.get("href") == keys[1])
                .parent().parent())
    print(info_row)
    # Some rows contain 2 coaches: the name/phone/email cells each hold
    # several values separated by <br />, so split the serialized HTML of
    # each cell into parallel per-coach lists.
    if len(info_row.children()[1].getchildren()) > 1:
        coach_elements = info_row.children()[1]
        coaches_names = str(etree.tostring(coach_elements),
                            encoding='utf8').split("<br />")
        phone_elements = info_row.children()[2]
        phone_numbers = (str(etree.tostring(phone_elements), encoding='utf8')
                         .replace("<td>", "").replace("</td>", "")
                         .split("<br />"))
        email_elements = info_row.children()[3]
        emails = (str(etree.tostring(email_elements), encoding='utf8')
                  .replace("<td>", "").replace("</td>", "")
                  .split("<br />"))
        for i, coach in enumerate(coaches_names):
            m = coach_name_pattern.search(coach)
            profile_url = scraper.strip_string(m.group(1))
            name = scraper.strip_string(m.group(2))
            title = scraper.strip_string(m.group(3) + " Head Coach")
            phone = scraper.strip_string(phone_numbers[i])
            # Directory lists only the local part; append the campus domain.
            email = scraper.strip_string(emails[i]) + "@umich.edu"
            scraper.save_coach(cxn, college[0],
                               scraper.get_sport_id(cxn, sport),
                               name, title, phone, email, profile_url)
    else:
        # Single-coach row: first cell is skipped (None), remaining cells map
        # positionally onto name/phone/email.
        scraper.parse_row(cxn, college[0], college[1], sport,
                          info_row.children(),
                          [None, "name", "phone", "email"],
                          {'email_suffix': "@umich.edu",
                           'title': 'Head Coach'})

scraper.close_connection(cxn)