import datetime

import birkie_fetcher
from configs import Configs
from RaceResultStore import RaceResultStore
import skinnyski_fetcher

config = Configs()
history_years = config.get_as_int("HISTORY_LENGTH")

# Seasons to scrape. Pinned to a single season for now; the variant driven by
# HISTORY_LENGTH is kept for reference:
# SEASONS = [str(datetime.date.today().year - year_delta) for year_delta in range(1, history_years)]
SEASONS = ['2015']

# Divisions to scrape. Only the citizen division is enabled; the variant that
# also includes the high school division is kept for reference:
# DIVISIONS = {"highschool":config.get_as_string("HS_DIVISION"), "citizen":config.get_as_string("CITIZEN_DIVISION")}
DIVISIONS = {"citizen": config.get_as_string("CITIZEN_DIVISION")}

race_store = RaceResultStore()

######################
# start control flow
######################
for season in SEASONS:
    for division in DIVISIONS.values():
        # get low hanging fruit from skinnyski
        for race_info in skinnyski_fetcher.get_race_infos(season, division):
            # Only top-level race_infos are checked against the store here.
            # A race_info that spawns further race_infos is not covered, so
            # those downstream ones will need their own store check (todo).
            if race_info not in race_store:
                skinnyski_fetcher.process_race(race_info, race_store)
                continue
            print("Skipping race already present %s" % (race_info, ))
"""
classes representing all the data we hold about races & results
"""
from abc import ABCMeta, abstractmethod
from configs import Configs
from datetime import datetime
import mysql.connector
from pdf_serializer import write_pdf_and_text

# settings read once, at import time, from the project config
# (presumably db credentials and database names - confirm against configs)
config = Configs()
DB_USER = config.get_as_string("DB_USER")
DB_PASSWORD = config.get_as_string("DB_PASSWORD")
RACE_DB = config.get_as_string("RACE_DB")
STRUCTURED_RESULTS_DB = config.get_as_string("STRUCTURED_RESULTS_DB")
UNSTRUCTURED_RESULTS_DB = config.get_as_string("UNSTRUCTURED_RESULTS_DB")


class RaceInfo:
    """
    metadata about a race that can be used to uniquely identify the race
    """

    def __init__(self, season, division, d, u, n):
        """
        build a RaceInfo from the raw, scraped description of a race

        todo typing, although is a somewhat tall order due to unreliable data
        :param season: season (str)
        :param division: race division (hs, citizen, etc) (str)
        :param d: date, arbitrary format (str)
        :param u: url (str)
        :param n: race name (str)
        """
import re
import requests
import urllib2
from configs import Configs
from HTMLParser import HTMLParser
import gopher_state_fetcher
import itiming_fetcher
import mtec_fetcher
from RaceResults import RaceInfo, UnstructuredPDFRaceResults, UnstructuredTextRaceResults

# settings read once, at import time, from the project config
config = Configs()
# results url, stored under the RESULTS_URL config key
SKINNYSKI_URL = config.get_as_string("RESULTS_URL")
DB_USER = config.get_as_string("DB_USER")
DB_PASSWORD = config.get_as_string("DB_PASSWORD")
RACE_DB = config.get_as_string("RACE_DB")

######################################
# todo fetcher abstract class
#   fetchers may delegate to other fetchers, eg. skinnyski->mtec hosted
#
# todo create a prefetcher to generate RaceInfos, as in birkie_fetcher
######################################


class SkinnySkiRaceInfoParser(HTMLParser):
    """
    a subclass of HTMLParser to override tag/data handlers
    """

    def __init__(self, season, division):
        # since we're overriding the init member function of HTMLParser, need to run superclass's init
import os
import subprocess
from configs import Configs

# settings read once, at import time, from the project config
config = Configs()
RACE_DB = config.get_as_string("RACE_DB")
# NOTE(review): presumably the pdf-to-text converter command/path - confirm against configs
PDF_TO_TEXT = config.get_as_string("PDF_TO_TEXT")
# "data/" directory underneath config.SCRAPERTOP
DATA_DIR = os.path.join(config.SCRAPERTOP, "data/")

###############################################
# utilities to serialize pdf data
# todo merge this into UnstructuredRaceResults
##############################################


def write_pdf_and_text(pdf_content, race_id):
    """
    save a pdf and text file to the local fs

    :param pdf_content: pdf blob (str)
    :param race_id: the race_id associated with the race
    :return: the text content of pdf (str)
    """
    # pdf path: <DATA_DIR>/pdf/<race_id>.pdf
    fpath = os.path.join(DATA_DIR,"pdf/")
    path_fname = os.path.join(fpath, str(race_id))
    path_fname_ext = "%s.pdf" % (path_fname, )

    # text path: <DATA_DIR>/text/<race_id>.txt
    txt_path = os.path.join(DATA_DIR, "text/")
    txt_dest = os.path.join(txt_path, str(race_id))
    txt_dest_ext = "%s.txt" % (txt_dest, )

    # build the data dest dirs, if not there
import requests
from HTMLParser import HTMLParser
import mysql.connector
import urllib2
from configs import Configs
from RaceResults import RaceResult, StructuredRaceResults, RaceInfo, UnstructuredPDFRaceResults

# settings read once, at import time, from the project config
config = Configs()
DB_USER = config.get_as_string("DB_USER")
DB_PASSWORD = config.get_as_string("DB_PASSWORD")

# division ids for forming the url correspond to list index
RACES = ["51K Skate","55K Classic","24K Skate","24K Classic"]

# bunch of boilerplate, only variable params are page # (100 per page), year, and divId
# NOTE(review): the three %d slots appear in the order path segment (presumably
# the year), pageNum_rsOverall (page #), divID - confirm against the call sites
BASE_URL_FORMAT_2014ON = "http://birkie.pttiming.com/results/%d/index.php?page=1150&r_page=division&pageNum_rsOverall=%d&divID=%d"

URL_PREFETCH_2007ON = "http://results.birkie.com"
# yikes! this will spit raw sql errors if you supply malformed queries
# the two %s slots fill the event_id and page_number query parameters
BASE_URL_FORMAT_2007ON = "http://results.birkie.com/index.php?event_id=%s&page_number=%s"
URL_2007ON_DB_URL_PAGE = 0
URL_2014ON_DB_DIV_ID = 0

URL_PREFETCH_PRE2007 = "http://www.birkie.com/ski/events/birkie/results/"  # todo this is dynamic

# race name and division used for birkie races
BIRKIE_RACE_NAME = "American Birkebeiner"
BIRKIE_RACE_DIVISION = config.get_as_string("CITIZEN_DIVISION")


class Birkie2014Parser(HTMLParser):
    """
    a custom parser for results on birkie.pttiming.com
    """