Example #1
def scrape_games():
    parser = utils.incremental_date_range_cmd_line_parser()
    utils.ensure_exists('static/scrape_data')
    os.chdir('static/scrape_data')

    args = parser.parse_args()
    last_month = ''

    for cur_date in utils.daterange(datetime.date(2010, 10, 15), 
                                    datetime.date.today()):
        str_date = time.strftime("%Y%m%d", cur_date.timetuple())
        if not utils.includes_day(args, str_date):
            if DEBUG:
                print 'skipping', str_date, 'because not in cmd line arg daterange'
            continue
        mon = time.strftime("%b%y", cur_date.timetuple())
        if mon != last_month:
            print
            # Two spaces per elapsed day, so the one-character status
            # marks printed below line up with the day of the month.
            print mon, cur_date.day*"  ",
            sys.stdout.flush()
            last_month = mon
        ret = scrape_date(str_date, cur_date, passive=args.passive)
        if ret == DOWNLOADED:
            print 'o',
        elif ret == REPACKAGED:
            print 'O',
        elif ret == ERROR:
            print '!',
        elif ret == MISSING:
            print '_',
        else:
            print '.',
        sys.stdout.flush()
    print
    os.chdir('../..')
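All of the examples on this page call the same helper, utils.incremental_date_range_cmd_line_parser(), whose definition is not shown here. A minimal sketch, assuming it wraps argparse and exposes the flags the examples actually read (args.passive, args.incremental, plus a YYYYMMDD date range consumed by utils.includes_day), might look like this; the flag names and defaults are inferred, not confirmed:

# Hypothetical sketch of utils.incremental_date_range_cmd_line_parser().
# The real helper is not shown on this page; flag names are inferred from
# args.passive, args.incremental, and the YYYYMMDD strings used above.
import argparse

def incremental_date_range_cmd_line_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--startdate', default='00000000',
                        help='inclusive lower bound, YYYYMMDD')
    parser.add_argument('--enddate', default='99999999',
                        help='inclusive upper bound, YYYYMMDD')
    parser.add_argument('--passive', action='store_true',
                        help='do not fetch anything new')
    parser.add_argument('--incremental', action='store_true',
                        help='skip days whose output already exists')
    return parser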
Example #2
def main():
    # print AnnotateGame(codecs.open(fn, 'r', encoding='utf-8').read()).encode(
    #    'utf-8')
    # return
    args = utils.incremental_date_range_cmd_line_parser().parse_args()
    print args
    days = os.listdir("static/scrape_data")
    days.sort()
    for year_month_day in days:
        if not utils.includes_day(args, year_month_day):
            continue

        if args.incremental and os.path.exists("parsed_out/%s-0.json" % year_month_day):
            print "skipping", year_month_day, "because already done"
            continue

        try:
            print "trying", year_month_day
            convert_to_json(year_month_day)
        except ParseTurnHeaderError, e:
            print e
            return
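utils.includes_day(args, year_month_day) is also assumed rather than shown. Since YYYYMMDD strings sort chronologically, a plausible sketch is a plain lexicographic range check against the startdate/enddate flags sketched under Example #1:

# Hypothetical sketch of utils.includes_day(); assumes args carries the
# startdate/enddate strings from the parser sketched under Example #1.
def includes_day(args, yyyymmdd):
    # YYYYMMDD strings compare correctly as plain strings.
    return args.startdate <= yyyymmdd <= args.enddate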
Example #3
def scrape_games():
    parser = utils.incremental_date_range_cmd_line_parser()
    utils.ensure_exists('static/scrape_data')
    os.chdir('static/scrape_data')

    args = parser.parse_args()
    last_month = ''
    
    # Goko updates logs in real time; wait a day so the list is finalized.
    yesterday = datetime.date.today() - datetime.timedelta(days=1)

    for cur_date in utils.daterange(default_startdate, yesterday, reverse=True):
        str_date = time.strftime("%Y%m%d", cur_date.timetuple())
        if not utils.includes_day(args, str_date):
            if DEBUG:
                print 'skipping', str_date, 'because not in cmd line arg daterange'
            continue
        mon = time.strftime("%b%y", cur_date.timetuple())
        if mon != last_month:
            print
            print mon, cur_date.day*"  ",
            sys.stdout.flush()
            last_month = mon
        ret = scrape_date(str_date, cur_date, passive=args.passive)
        if ret == DOWNLOADED:
            print 'o',
        elif ret == REPACKAGED:
            print 'O',
        elif ret == ERROR:
            print '!',
        elif ret == MISSING:
            print '_',
        else:
            print '.',
        sys.stdout.flush()
    print
    os.chdir('../..')
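This variant walks the range newest-first via utils.daterange(default_startdate, yesterday, reverse=True); default_startdate is a module-level constant not shown in the snippet. The generator itself is also assumed; a minimal sketch with that signature could be:

# Hypothetical sketch of utils.daterange(); yields datetime.date objects
# from start (inclusive) up to end (exclusive), newest first when
# reverse=True. The real implementation is not shown on this page.
import datetime

def daterange(start, end, reverse=False):
    days = [start + datetime.timedelta(days=n)
            for n in range((end - start).days)]
    if reverse:
        days.reverse()
    for day in days:
        yield day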
Example #4

#!/usr/bin/python

import logging
import logging.handlers
import os
import os.path
import pymongo
import re
import sys

import utils

from keys import *

parser = utils.incremental_date_range_cmd_line_parser()
find_id = re.compile(r"game-.*\.html")


def process_file(filename, incremental, games_table, log):
    yyyymmdd = filename[:8]

    if incremental:
        contents = open("parsed_out/" + filename, "r").read()
        if contents.strip() == "[]":
            log.warning("empty contents in %s (make parser not dump empty files?)", filename)
            return

        assert find_id.search(contents), "could not get id from %s in file %s" % (contents[:100], filename)

        found_all = True
        for match in find_id.finditer(contents):
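A quick, self-contained demonstration of the id regex used above; the game id and JSON fragment below are invented for illustration:

# Hypothetical demo of the id regex; the filename inside the JSON is
# made up for illustration.
import re

find_id = re.compile(r'game-.*\.html')
sample = '[{"_id": "game-20101015-0000-abcd.html", "players": []}]'
print find_id.search(sample).group(0)  # game-20101015-0000-abcd.html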
Example #5

#!/usr/bin/python

import os
import pymongo
import re
import sys

import argparse
import utils

parser = utils.incremental_date_range_cmd_line_parser()

def main():
    args = parser.parse_args()
    games_table = pymongo.Connection().test.games
    games_table.ensure_index('players')
    games_table.ensure_index('supply')
    data_files_to_load = os.listdir('parsed_out')
    data_files_to_load.sort()
    find_id = re.compile(r'game-.*\.html')
    done = set()
    for fn in data_files_to_load:
        yyyymmdd = fn[:8]
        print yyyymmdd
        if not utils.includes_day(args, yyyymmdd):
            print 'skipping', fn, 'because not in range'
            continue

        if args.incremental:
            if yyyymmdd in done:
                print 'skipping', fn, 'because done'
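The two ensure_index calls above make later lookups by player name or kingdom supply cheap. A hedged usage sketch against that collection, using the same legacy pymongo API (pymongo.Connection) as the example; the player name is made up:

# Hypothetical query sketch; 'players' is indexed by the example above,
# and the player name here is invented for illustration.
import pymongo

games_table = pymongo.Connection().test.games
for game in games_table.find({'players': 'some_player'}).limit(5):
    print game['_id']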