コード例 #1
0
import datetime

import birkie_fetcher
from configs import Configs
from RaceResultStore import RaceResultStore
import skinnyski_fetcher

# Project configuration accessor; the settings below are read through it.
config = Configs()

# Value of the "HISTORY_LENGTH" setting, as an int. Only the disabled SEASONS
# expression below refers to it; the active SEASONS list does not.
history_years = config.get_as_int("HISTORY_LENGTH")
# Disabled: derive the seasons from the current year, going back history_years.
# NOTE(review): range(1, history_years) yields history_years - 1 seasons --
# confirm that is intended before re-enabling.
# SEASONS = [str(datetime.date.today().year - year_delta) for year_delta in range(1, history_years)]
# Seasons to scrape, as strings; currently pinned to a single season.
SEASONS = ['2015']
# Disabled: scrape both the high-school and the citizen division.
#DIVISIONS = {"highschool":config.get_as_string("HS_DIVISION"), "citizen":config.get_as_string("CITIZEN_DIVISION")}
# Divisions to scrape: label -> value of the matching config setting.
# The control flow below iterates over the values only.
DIVISIONS = {"citizen": config.get_as_string("CITIZEN_DIVISION")}
# Store checked (via `in`) to skip races already present; it is also handed to
# the fetcher when a race is processed.
race_store = RaceResultStore()
######################
# start control flow
######################

for season in SEASONS:
    for division in DIVISIONS.values():
        # skinnyski is the easiest source, so start by listing its races
        # for this season/division pair
        race_infos = skinnyski_fetcher.get_race_infos(season, division)

        for race_info in race_infos:
            # Only the race_infos listed above are de-duplicated here. A
            # race_info that spawns further race_infos downstream is not yet
            # checked against the store (todo).
            if race_info in race_store:
                print("Skipping race already present %s" % (race_info, ))
                continue

            skinnyski_fetcher.process_race(race_info, race_store)
コード例 #2
0
"""
classes representing all the data we hold about races & results
"""
from abc import ABCMeta, abstractmethod
from configs import Configs
from datetime import datetime
import mysql.connector

from pdf_serializer import write_pdf_and_text

# Module-level settings: each is read from the project configuration under the
# key of the same name when this module is imported.
config = Configs()
# NOTE(review): presumably the credentials used with mysql.connector (imported
# above) -- confirm at the point of use.
DB_USER = config.get_as_string("DB_USER")
DB_PASSWORD = config.get_as_string("DB_PASSWORD")
# NOTE(review): the three *_DB settings are not used in the visible part of this
# module; presumably database/table identifiers -- confirm.
RACE_DB = config.get_as_string("RACE_DB")
STRUCTURED_RESULTS_DB = config.get_as_string("STRUCTURED_RESULTS_DB")
UNSTRUCTURED_RESULTS_DB = config.get_as_string("UNSTRUCTURED_RESULTS_DB")


class RaceInfo:
    """
    metadata about a race that can be used to uniquely identify the race
    """
    def __init__(self, season, division, d, u, n):
        """
        todo typing, although is a somewhat tall order due to unreliable data
        :param season: season (str)
        :param division: race division (hs, citizen, etc) (str)
        :param d: date, arbitrary format (str)
        :param u: url (str)
        :param n: race name (str)
        """
コード例 #3
0
import re
import requests
import urllib2

from configs import Configs
from HTMLParser import HTMLParser
import gopher_state_fetcher
import itiming_fetcher
import mtec_fetcher
from RaceResults import RaceInfo, UnstructuredPDFRaceResults, UnstructuredTextRaceResults

# Module-level settings, read from the project configuration when this module
# is imported.
config = Configs()
# Value of the "RESULTS_URL" setting.
# NOTE(review): presumably the skinnyski results page this fetcher scrapes -- confirm.
SKINNYSKI_URL = config.get_as_string("RESULTS_URL")
# NOTE(review): presumably database credentials and the race database/table
# name; not used in the visible part of this module -- confirm.
DB_USER = config.get_as_string("DB_USER")
DB_PASSWORD = config.get_as_string("DB_PASSWORD")
RACE_DB = config.get_as_string("RACE_DB")

######################################
# todo fetcher abstract class
# fetchers may delegate to other fetchers, eg. skinnyski->mtec hosted
#
# todo create a prefetcher to generate RaceInfos, as in birkie_fetcher
######################################


class SkinnySkiRaceInfoParser(HTMLParser):
    """
        a subclass of HTMLParser to override tag/data handlers
    """
    def __init__(self, season, division):
        # since we're overriding the init member function of HTMLParser, need to run superclass's init
コード例 #4
0
ファイル: pdf_serializer.py プロジェクト: holub008/skiscraper
import os
import subprocess

from configs import Configs

# Module-level settings, read from the project configuration when this module
# is imported.
config = Configs()
RACE_DB = config.get_as_string("RACE_DB")
# Value of the "PDF_TO_TEXT" setting.
# NOTE(review): presumably the command or path of a pdf-to-text converter run
# through subprocess (imported above) -- confirm.
PDF_TO_TEXT = config.get_as_string("PDF_TO_TEXT")
# Root directory for serialized data: <config.SCRAPERTOP>/data/.
# write_pdf_and_text builds its "pdf/" and "text/" destinations beneath it.
DATA_DIR = os.path.join(config.SCRAPERTOP, "data/")
###############################################
# utilities to serialize pdf data
# todo merge this into UnstructuredRaceResults
##############################################

def write_pdf_and_text(pdf_content, race_id):
    """
    save a pdf and text file to the local fs
    :param pdf_content: pdf blob (str)
    :param race_id: the race_id associated with the race
    :return: the text content of pdf (str)
    """

    fpath = os.path.join(DATA_DIR,"pdf/")
    path_fname = os.path.join(fpath, str(race_id))
    path_fname_ext = "%s.pdf" % (path_fname, )

    txt_path = os.path.join(DATA_DIR, "text/")
    txt_dest = os.path.join(txt_path, str(race_id))
    txt_dest_ext = "%s.txt" % (txt_dest, )

    # build the data dest dirs, if not there
コード例 #5
0
ファイル: birkie_fetcher.py プロジェクト: holub008/skiscraper
import requests
from HTMLParser import HTMLParser
import mysql.connector
import urllib2

from configs import Configs
from RaceResults import RaceResult, StructuredRaceResults, RaceInfo, UnstructuredPDFRaceResults

# Module-level settings, read from the project configuration when this module
# is imported.
config = Configs()
DB_USER = config.get_as_string("DB_USER")
DB_PASSWORD = config.get_as_string("DB_PASSWORD")

# Race names; a race's index in this list is the division id used when forming
# the results url.
RACES = ["51K Skate","55K Classic","24K Skate","24K Classic"]
# Results url template with three integer (%d) placeholders. Everything else is
# boilerplate; the variable parts are the page # (100 results per page), the
# year, and the divId.
# NOTE(review): placeholder order looks like (year, page number, divID) from
# their positions in the url -- confirm against the code that formats it.
BASE_URL_FORMAT_2014ON = "http://birkie.pttiming.com/results/%d/index.php?page=1150&r_page=division&pageNum_rsOverall=%d&divID=%d"
# NOTE(review): presumably the page fetched first to discover event ids for the
# template below -- confirm.
URL_PREFETCH_2007ON = "http://results.birkie.com"
# Results url template with two string (%s) placeholders: event_id, then
# page_number.
# yikes! this site will spit raw sql errors if you supply malformed queries
BASE_URL_FORMAT_2007ON = "http://results.birkie.com/index.php?event_id=%s&page_number=%s"
# NOTE(review): both constants are 0 and their use is not visible here;
# presumably placeholder values stored in the db for these url styles -- confirm.
URL_2007ON_DB_URL_PAGE = 0
URL_2014ON_DB_DIV_ID = 0
# NOTE(review): presumably the index page for results before 2007 -- confirm.
URL_PREFETCH_PRE2007 = "http://www.birkie.com/ski/events/birkie/results/"

# todo this is dynamic
BIRKIE_RACE_NAME = "American Birkebeiner"
# Division recorded for birkie races: the value of the "CITIZEN_DIVISION" setting.
BIRKIE_RACE_DIVISION = config.get_as_string("CITIZEN_DIVISION")

class Birkie2014Parser(HTMLParser):
    """
    a custom parser for results on birkie.pttiming.com
    """