def getStyle(text):
    """Return the dance style named or implied by an event description.

    The checks run in this order and the first hit wins:

    1. Empty / spaces-only text -> ``"none"``.
    2. An explicit style word in the cleaned text (split on single spaces):
       ``rhythm``, ``latin``, ``smooth``, ``fun``, or ``ballroom`` /
       ``standard`` (both reported as ``"standard"``).
    3. If the text contains an American or International marker (entries of
       ``set_library.american_styles`` / ``international_styles``, matched as
       substrings), the dance names decide between smooth/rhythm or
       standard/latin.  If no dance matches, the marker itself
       (``"american"`` or ``"intl"``) is returned.
    4. With no marker: ``"other"`` if an entry of ``set_library.other_dances``
       matches, else the first matching entry of ``set_library.dance_styles``,
       else ``"none"``.

    Entries of the ``*_dances`` and ``dance_styles`` collections are used as
    regular-expression patterns.

    Args:
        text: Event description text.

    Returns:
        A style string as described above.
    """
    if re.search(r"^ *$", text):
        return "none"

    # cleanText() was previously re-evaluated inside every loop iteration;
    # compute it once (assumes cleanText is a pure function of its input).
    cleaned = cleanText(text)

    # International markers are checked last, so they win when both kinds
    # of marker are present.
    am_or_intl = ""
    if any(word in cleaned for word in set_library.american_styles):
        am_or_intl = "american"
    if any(word in cleaned for word in set_library.international_styles):
        am_or_intl = "intl"

    # An explicit style word settles the question immediately.
    explicit_styles = {
        "rhythm": "rhythm",
        "latin": "latin",
        "smooth": "smooth",
        "ballroom": "standard",
        "standard": "standard",
        "fun": "fun",
    }
    for word in cleaned.split(" "):
        if word in explicit_styles:
            return explicit_styles[word]

    # Figure out style from dances and am_or_intl
    if am_or_intl == "american":
        if any(re.search(dance, cleaned)
               for dance in set_library.smooth_dances):
            return "smooth"
        if any(re.search(dance, cleaned)
               for dance in set_library.rhythm_dances):
            return "rhythm"
    elif am_or_intl == "intl":
        if any(re.search(dance, cleaned)
               for dance in set_library.standard_dances):
            return "standard"
        if any(re.search(dance, cleaned)
               for dance in set_library.latin_dances):
            return "latin"

    if am_or_intl != "":
        # Marker found but no recognisable dance: report the marker itself.
        return am_or_intl

    if any(re.search(dance, cleaned) for dance in set_library.other_dances):
        return "other"
    for dance_style in set_library.dance_styles:
        if re.search(dance_style, cleaned):
            return dance_style
    return "none"
def getAge(event_text):
    """Return the age designations found in an event description.

    Entries of ``set_library.ages`` are treated as regular-expression
    patterns and must be followed by a space in the cleaned text to count
    as a match.  Entries that are not also in ``set_library.age_groups`` are
    tried first; only if none of those match is the full ``ages`` collection
    tried.  Backslashes are removed from each matched entry before it is
    reported.

    Args:
        event_text: Event description text.

    Returns:
        A list of matched ages, or ``["none"]`` when nothing matches.

        NOTE(review): for an input that is exactly one space the bare string
        ``"none"`` is returned instead of a list.  This is kept as-is because
        callers may depend on it, but it is inconsistent with every other
        path -- confirm whether it is intentional.
    """
    if re.search(r"^ $", event_text):
        return "none"

    # cleanText() was previously re-evaluated for every candidate age;
    # compute it once (assumes cleanText is a pure function of its input).
    cleaned = cleanText(event_text)

    def matching(candidates):
        # Matched entries with their regex-escaping backslashes removed.
        return [
            age.replace("\\", "")
            for age in candidates
            if re.search(age + ' ', cleaned)
        ]

    age_group = matching(set_library.ages - set_library.age_groups)
    if not age_group:
        age_group = matching(set_library.ages)
    if not age_group:
        age_group.append("none")
    return age_group
def buildO2CMCompTable(comp_ids, quick):
    """Scrape the O2CM results index and store each new competition in the DB.

    Starting at the current month and walking backwards one month at a time
    until the year 2004 is reached, the results page at results.o2cm.com is
    filtered by month/year and every table row that links to a competition
    is examined.  Competitions whose id is not yet in ``comp_ids`` are
    inserted into the ``competitions`` table (created if missing) and their
    id is added to ``comp_ids``.

    Args:
        comp_ids: Set of competition ids that are already stored.  It is
            mutated: every newly inserted id is added.
        quick: If truthy, stop (return) as soon as a competition that is
            already in ``comp_ids`` is encountered.

    Returns:
        None.
    """
    print("\nScraping competitions from o2cm")

    # Connect to dreamhost db
    mydb = database.getDB()
    cursor = mydb.cursor()

    # try/finally so the cursor is also released on the `quick` early return
    # and when scraping raises.
    try:
        # Create the competitions table if it does not exist yet
        cursor.execute("CREATE TABLE IF NOT EXISTS competitions " +
                       "(comp_id varchar(255) NOT NULL, " +
                       "comp_name varchar(255), " +
                       "is_nqe varchar(255), " +
                       "date varchar(255), " +
                       "PRIMARY KEY (comp_id))")

        # NOTE(review): "%s" placeholders assume a MySQL-style DB-API driver
        # (paramstyle "format"/"pyformat") behind database.getDB() -- confirm.
        # Parameterizing keeps scraped names containing apostrophes from
        # breaking (or injecting into) the statement.
        insert = ("INSERT INTO competitions "
                  "(comp_id, comp_name, is_nqe, date) "
                  "VALUES (%s, %s, %s, %s)")

        # Initialize date variables
        current_date = datetime.datetime.now()
        year = current_date.year
        month = current_date.month

        while year != 2004:
            print("searching", year, month)

            # Initialize web driver
            options = webdriver.ChromeOptions()
            options.add_argument('headless')
            driver = webdriver.Chrome(options=options)
            try:
                # GET request to competition results page
                driver.get('http://results.o2cm.com/')

                # Find form and filter by year and month
                month_element = driver.find_element_by_id("inmonth")
                month_element.clear()
                month_element.send_keys(month)
                year_element = driver.find_element_by_id("inyear")
                year_element.clear()
                year_element.send_keys(year)
                driver.find_element_by_name("Go").click()

                allCompResultsPage = BeautifulSoup(driver.page_source,
                                                   'html.parser')
            finally:
                # One browser is started per month; shut it down so browser
                # processes do not pile up over the whole crawl.
                driver.quit()

            compHTMLTable = allCompResultsPage.select(
                'table[id=main_tbl]')[0]
            rows = compHTMLTable.select('tr')

            # Countdown to help sort results from same date
            counter = len(rows)

            # Iterate through table of Competitions
            for row in rows:
                # An anchor element with href attribute indicates link to
                # competition results page
                links = row.select('a[href]')
                if links:
                    link = links[0]

                    # Comp ID
                    comp_id = re.search('(?<=event=).*?(?=&)',
                                        link['href']).group(0)
                    print(comp_id)

                    if comp_id in comp_ids:
                        print("competition id " + comp_id +
                              "' already present")
                        if quick:
                            return
                    else:
                        comp_ids.add(comp_id)

                        # Date: first cell holds the day text; its first
                        # three characters go to numericalMonth and the
                        # text from index 4 on is used as the day.
                        day_string = row.select('td')[0].get_text(strip=True)
                        month_string = scraper_utils.numericalMonth(
                            day_string[0:3])
                        day = day_string[4:]

                        # Competition Name
                        comp_name = link.get_text(strip=True)

                        # NQE? If a competition's name includes the string
                        # 'NQE', we assume it's an NQE.
                        is_NQE = bool(
                            re.search("nqe",
                                      scraper_utils.cleanText(comp_name)))

                        date = '-'.join(
                            [str(year), month_string, day, str(counter)])

                        # Add comp to db
                        values = (comp_id, comp_name, str(is_NQE), date)
                        print(insert, values)
                        cursor.execute(insert, values)
                        mydb.commit()

                # Decrement counter (once per table row)
                counter = counter - 1

            # Update date variables
            month = month - 1
            if month == 0:
                month = 12
                year = year - 1
    finally:
        # close db cursor
        cursor.close()
def _parsePlacementRow(heat_id, row_text, clean_row, competitor_ids):
    """Split one cleaned result row into the fields of a placement record.

    Rows look like ``'1) 210 Jackson Fossen & Claire Thompson - MN'`` or
    ``'8) TBA TBA& TBA TBA'``.  The caller must already have verified that
    the row contains an ampersand (the follow name is taken from the text
    after it).

    Args:
        heat_id: Id of the event the row belongs to.
        row_text: The row text as scraped, stored unchanged in the record.
        clean_row: ``row_text`` after ``scraper_utils.cleanText``.
        competitor_ids: Mapping from formatted competitor name to the id
            taken from the competition's competitor drop-down.

    Returns:
        ``[heat_id, numeric_placement, competitor_number, lead_name,
        lead_id, follow_name, follow_id, location, row_text]``.
    """
    # Overall placement in event: up to three leading digits before ')'.
    numeric_placement = "0"
    placement_match = re.search(r'^\d\d?\d?(?=\))', clean_row)
    if placement_match:
        numeric_placement = placement_match.group(0)
        # Blank out the consumed "N)" so it is not parsed again below.
        clean_row = re.sub(
            numeric_placement + r"\)",
            scraper_utils.subSpace(numeric_placement + ")"), clean_row)
        clean_row = re.sub(r'^\s*', '', clean_row)

    # Competitor Number
    competitor_number = "X"
    number_match = re.search(r'^\s*\d+', clean_row)
    if number_match:
        competitor_number = re.sub(r'\s', '', number_match.group(0))
        clean_row = re.sub(competitor_number,
                           scraper_utils.subSpace(competitor_number),
                           clean_row)

    # Couple/Competitors
    # Lead name is any text before an ampersand
    lead_name = " "
    lead_match = re.search(r'^.*(?=\ *\&)', clean_row)
    if lead_match:
        # Remove leading/trailing spaces from lead name
        lead_name = lead_match.group(0).strip(' ')

    # Lead ID (for this comp anyway); '' when the name is unknown
    lead_id = competitor_ids.get(lead_name) or ''

    # Follow name is any text after an ampersand, minus leading spaces
    follow_name = re.search(r'(?<=\&).*$', clean_row).group(0).lstrip(' ')

    # Filter potential location info: a trailing "- XX" style suffix
    location = ''
    location_pattern = r'\ *(\-\ )?\-\ \ ?[\w\s\-\(\)]+$'
    location_match = re.search(location_pattern, follow_name)
    if location_match:
        location = re.sub(r'^\ *(\-\ )?\-\ \ ?', '',
                          location_match.group(0))
        # Remove location info from follow_name
        follow_name = re.sub(location_pattern, '', follow_name)

    # Remove trailing space/hyphens from follow_name
    follow_name = re.sub(r'[\ \-]*$', '', follow_name)

    # Follow ID (for this comp anyway); '' when the name is unknown
    follow_id = competitor_ids.get(follow_name) or ''

    # Complete Placement
    # TO DO: couple_id? lead_id? follow_id?
    return [
        heat_id, numeric_placement, competitor_number, lead_name, lead_id,
        follow_name, follow_id, location, row_text
    ]


def _scrapeCompPlacements(comp_id):
    """Scrape every placement of one competition and pass it to checkPlacement.

    If the competition page has no submit button, the competition id is
    appended to ``output/no-button-comp.txt`` and nothing else is done.
    Rows that do not look like a placement are appended to
    ``./output/failed-pattern.txt``.
    """
    # Initialize web driver
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(options=options)
    try:
        # GET request to competition results page
        driver.get('http://results.o2cm.com/?event=' + comp_id)

        # Locate 'submit' button and click to reach complete results page
        driver.find_element_by_xpath("//input[@type='submit']").click()
        comp_all_page = BeautifulSoup(driver.page_source, 'html.parser')
    except NoSuchElementException:
        print('No button for ' + comp_id)
        with open("output/no-button-comp.txt", "a") as no_button_file:
            no_button_file.write(comp_id)
            no_button_file.write("\n")
        return
    finally:
        # The page is parsed from a snapshot, so the browser can go away
        # immediately instead of leaking one process per competition.
        driver.quit()

    results_table = comp_all_page.select('table[width]')[1]

    # Parse competitor drop down, get competitor ID info
    competitor_ids = {}
    competitors = comp_all_page.find(id='selEnt')
    if competitors is not None:
        # add each competitor number and text to data structure
        for competitor in competitors.find_all('option'):
            name = format_name(competitor.get_text(strip=True))
            competitor_ids[name] = competitor['value']

    heat_id = ''

    # For every event at a competition, find the things
    for row in results_table.find_all('tr'):
        links = row.select('a')

        # A row with an anchor element denotes a new event
        if links:
            # Get heat_id, e.g. from
            # scoresheet3.asp?event=okc19&heatid=40453020&bclr=#FFFFFF&tclr=#000000
            href = links[0]['href']
            heat_id = comp_id + re.search('(?<=heatid=).*?(?=&)',
                                          href).group(0)
            continue

        # Another entry in same event
        cells = row.select('td')
        if len(cells) < 3:
            continue
        row_text = cells[2].get_text(strip=True)
        clean_row = scraper_utils.cleanText(row_text)

        # '----' separates rounds; empty rows carry no placement
        if clean_row == '----' or clean_row == '':
            continue

        # Expected shape: "<placement>) <number> <lead> & <follow> ..."
        if not re.search(
                r'^\d+\)\s\d?\d?\d?\s?[\w\d\"\'\`\-\.\,\?\!\_\/\#\(\)\s]*\s?&\s?',
                clean_row):
            with open("./output/failed-pattern.txt", "a") as failed_file:
                failed_file.write(clean_row)
                failed_file.write("\n")
        else:
            checkPlacement(
                _parsePlacementRow(heat_id, row_text, clean_row,
                                   competitor_ids))


def buildO2CMPlacementsTable(comp_ids, quick):
    """Scrape placements for every competition not already in ``comp_ids``.

    Starting at the current month and walking backwards one month at a time
    until the year 2004 is reached, the results page at results.o2cm.com is
    filtered by month/year.  Every anchor whose href carries an ``event=``
    parameter is treated as a competition; competitions whose id is not in
    ``comp_ids`` are scraped and each placement is handed to
    ``checkPlacement``.

    Args:
        comp_ids: Set of competition ids whose placements are already
            stored.  This function does not add newly scraped ids to it.
        quick: If truthy, stop (return) as soon as a competition that is
            already in ``comp_ids`` is encountered.

    Returns:
        None.
    """
    print("\nScraping O2CM placements")

    current_date = datetime.datetime.now()
    year = current_date.year
    month = current_date.month

    while year != 2004:
        print("searching", year, month)

        # Initialize web driver
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        driver = webdriver.Chrome(options=options)
        try:
            # GET request to competition results page
            driver.get('http://results.o2cm.com/')

            # Find form and filter by year and month
            month_element = driver.find_element_by_id("inmonth")
            month_element.clear()
            month_element.send_keys(month)
            year_element = driver.find_element_by_id("inyear")
            year_element.clear()
            year_element.send_keys(year)
            driver.find_element_by_name("Go").click()

            allCompResultsPage = BeautifulSoup(driver.page_source,
                                               'html.parser')
        finally:
            # One browser is started per month; shut it down once the page
            # snapshot has been parsed.
            driver.quit()

        # Iterates through every competition on results.o2cm.com
        for comp in allCompResultsPage.select('a'):
            # e.g. scoresheet3.asp?event=okc19&heatid=40453020&bclr=#FFFFFF&tclr=#000000
            comp_match = re.search('(?<=event=).*?(?=&)', comp['href'])
            if not comp_match:
                continue
            comp_id = comp_match.group(0)
            print(comp_id)

            if comp_id in comp_ids:
                print(comp_id, "present in DB")
                if quick:
                    return
                continue

            # NOTE(review): unlike the events scraper, the new id is
            # deliberately not added to comp_ids here, matching the original
            # behavior (callers may share the set between scrapers).
            _scrapeCompPlacements(comp_id)

        # Update date variables
        month = month - 1
        if month == 0:
            month = 12
            year = year - 1
def _parseEventText(raw_event_text):
    """Derive status, age, style, level and dances from an event title.

    Each attribute is obtained with get<Attribute>(event_text).  After a
    determination is made, the value for that field is replaced in
    event_text (via subSpace) to avoid the same section of the string being
    interpreted multiple ways.  The intermediate text is printed after every
    step.

    Returns:
        Tuple ``(status, age_group, style, skill_level, dances)``.
        ``skill_level`` is set to the string ``"none"`` when getLevel
        returned an empty list and nothing is left of the title once all
        recognised parts are removed.
    """
    event_text = cleanText(raw_event_text)
    print(raw_event_text)
    print(event_text)

    # Status
    status = getStatus(event_text)
    if len(status) == 1:
        event_text = re.sub(status[0], subSpace(status[0]), event_text)
    print(event_text)

    # Age
    age_group = getAge(event_text)
    if len(age_group) == 1:
        event_text = re.sub(age_group[0], subSpace(age_group[0]),
                            event_text)
    print(event_text)

    # Style
    style = getStyle(event_text)
    event_text = re.sub(style, subSpace(style), event_text)
    print(event_text)

    # Level
    skill_level = getLevel(event_text)
    if len(skill_level) == 1:
        event_text = re.sub(skill_level[0], subSpace(skill_level[0]),
                            event_text)
    print(event_text)

    # Dances
    # NOTE(review): the value kept in `dances` (and later stored in the
    # event summary) is the regex-escaped form with spaces removed, and it
    # is escaped a second time further down -- confirm this is intended.
    dances = getDances(event_text)
    dances = re.sub(" ", "", re.escape(dances))
    event_text = re.sub(dances, subSpace(dances), event_text)
    print(event_text)

    # More figuring out level, should move to getLevel
    stripped_event_text = cleanText(event_text)
    stripped_event_text = re.sub(" ", "", stripped_event_text)
    for stat in status:
        stripped_event_text = re.sub(stat, "", stripped_event_text)
    for age in age_group:
        stripped_event_text = re.sub(age, "", stripped_event_text)
    stripped_event_text = re.sub(style, "", stripped_event_text)
    stripped_event_text = re.sub(re.escape(dances), "", stripped_event_text)
    if stripped_event_text == "" and skill_level == []:
        skill_level = "none"

    return status, age_group, style, skill_level, dances


def _scrapeCompEvents(comp_id, heat_uids):
    """Scrape every event of one competition and pass each to checkEvent.

    Args:
        comp_id: Competition id used in the results URL.
        heat_uids: Set of event ids seen so far; it is updated with every
            event id found here.  An id that is already present is logged
            to ``./output/heat-ids.txt`` as a duplicate.

    If the competition page has no submit button, the competition id is
    appended to ``output/no-button-comp.txt`` and nothing else is done.
    """
    # Initialize web driver
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(options=options)
    try:
        # GET request to competition results page
        driver.get('http://results.o2cm.com/?event=' + comp_id)

        # Locate 'submit' button and click to reach complete results page
        driver.find_element_by_xpath("//input[@type='submit']").click()
        comp_all_page = BeautifulSoup(driver.page_source, 'html.parser')
    except NoSuchElementException:
        print('No button for ' + comp_id)
        with open("output/no-button-comp.txt", "a") as no_button_file:
            no_button_file.write(comp_id)
            no_button_file.write("\n")
        return
    finally:
        # The page is parsed from a snapshot, so the browser can go away
        # immediately instead of leaking one process per competition.
        driver.quit()

    results_table = comp_all_page.select('table[width]')[1]

    heat_id = ''
    status = []
    age_group = []
    style = ''
    skill_level = []
    dances = []
    num_rounds = 1
    num_couples = 0
    raw_event_text = ""

    # For every event at a competition, find the things
    for row in results_table.find_all('tr'):
        links = row.select('a')

        # A row with an anchor element denotes a new event
        if links:
            # Save info from previous event if it exists
            if heat_id != '':
                checkEvent([
                    comp_id, heat_id, status, age_group, style, skill_level,
                    num_couples, num_rounds, dances, raw_event_text
                ])

            # Reset counts at every event header so rows seen before the
            # first header can never be attributed to the first event.
            num_couples = 0
            num_rounds = 1

            # Get heat_id, e.g. from
            # scoresheet3.asp?event=okc19&heatid=40453020&bclr=#FFFFFF&tclr=#000000
            href = links[0]['href']
            heat_id = comp_id + re.search('(?<=heatid=).*?(?=&)',
                                          href).group(0)

            # Check for duplicate event ID
            if heat_id in heat_uids:
                # Write heat_id to file
                with open("./output/heat-ids.txt", "a") as heat_ids_file:
                    heat_ids_file.write(
                        str([comp_id, heat_id, "DUPLICATE"]))
                    heat_ids_file.write("\n")
            # Remember the id; without this the duplicate check above could
            # never fire.
            heat_uids.add(heat_id)

            raw_event_text = links[0].get_text(strip=True)
            status, age_group, style, skill_level, dances = _parseEventText(
                raw_event_text)

        # Another entry in same event
        else:
            cells = row.select('td')
            if len(cells) >= 3:
                row_text = cells[2].get_text(strip=True)
                if row_text == '----':
                    # Previous Round
                    num_rounds = num_rounds + 1
                else:
                    # Another Result
                    num_couples = num_couples + 1

    # Flush the last event of the competition.
    event_summary = [
        comp_id, heat_id, status, age_group, style, skill_level, num_couples,
        num_rounds, dances, raw_event_text
    ]
    print(event_summary)
    # checkEvent validates each event and writes valid ones to
    # Events table/file, invalid ones to invalid collection
    checkEvent(event_summary)


def buildO2CMEventsTable(comp_ids, quick):
    """Scrape events for every competition not already in ``comp_ids``.

    Starting at the current month and walking backwards one month at a time
    until the year 2004 is reached, the results page at results.o2cm.com is
    filtered by month/year.  Every anchor in the main results table is
    treated as a competition; competitions whose id is not in ``comp_ids``
    are scraped and a summary of each event is handed to ``checkEvent``.

    Args:
        comp_ids: Set of competition ids whose events are already stored.
            It is mutated: every newly scraped id is added.
        quick: If truthy, stop (return) as soon as a competition that is
            already in ``comp_ids`` is encountered.

    Returns:
        None.
    """
    print("\nScraping O2CM events")

    current_date = datetime.datetime.now()
    year = current_date.year
    month = current_date.month

    while year != 2004:
        print("searching", year, month)

        # Initialize web driver
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        driver = webdriver.Chrome(options=options)
        try:
            # GET request to competition results page
            driver.get('http://results.o2cm.com/')

            # Find form and filter by year and month
            month_element = driver.find_element_by_id("inmonth")
            month_element.clear()
            month_element.send_keys(month)
            year_element = driver.find_element_by_id("inyear")
            year_element.clear()
            year_element.send_keys(year)
            driver.find_element_by_name("Go").click()

            allCompResultsPage = BeautifulSoup(driver.page_source,
                                               'html.parser')
        finally:
            # One browser is started per month; shut it down once the page
            # snapshot has been parsed.
            driver.quit()

        compHTMLTable = allCompResultsPage.select('table[id=main_tbl]')[0]

        # Track event ID's to check for potential duplicates (per month)
        heat_uids = set()

        # Iterates through every competition on results.o2cm.com
        for comp in compHTMLTable.select('a'):
            # e.g. scoresheet3.asp?event=okc19&heatid=40453020&bclr=#FFFFFF&tclr=#000000
            comp_id = re.search('(?<=event=).*?(?=&)',
                                comp['href']).group(0)
            print(comp_id)

            if comp_id in comp_ids:
                print(comp_id, "present in DB")
                if quick:
                    return
                continue

            comp_ids.add(comp_id)
            _scrapeCompEvents(comp_id, heat_uids)

        # Update date variables
        month = month - 1
        if month == 0:
            month = 12
            year = year - 1