Example #1
0
#!/usr/bin/python

from globals import TeamStat, getInFile, marshal
import string
from lxml import etree
from whohas import whohas

# Parse the saved standings page and pickle one TeamStat per table row.
tree = etree.parse(getInFile('raw/standings.html'),etree.HTMLParser())
standings = []

# One standings table per group; the first eight uppercase letters (A..H)
# are the group names used in each table's "summary" attribute.
for letter in string.ascii_uppercase[:8]:
  # [0] raises IndexError if the page has no table for this group.
  table = tree.xpath('//table[@summary="Group Group %c"]'%(letter))[0]
  for row in table.findall('./tbody/tr'):
    # A row must hold exactly these eight cells in this order; any other
    # cell count raises ValueError on unpacking.
    # list(row) replaces the deprecated lxml Element.getchildren().
    (tdTeam, tdPlayed, tdWins, tdDraws, tdLosses,
     tdGoalsFor, tdGoalsAgainst, tdPoints) = list(row)

    # The team cell's <img> supplies both the code (from its src) and the
    # name (from its title), so look it up once.
    img = tdTeam.find('.//img')

    ts = TeamStat()
    ts.group = letter
    # Characters -7..-5 of the src, upper-cased: the three characters that
    # precede the last four (presumably a ".ext" suffix -- confirm).
    ts.teamCode = img.get('src')[-7:-4].upper()
    # Re-encode the title as Latin-1 and decode those bytes as UTF-8; this
    # undoes UTF-8 text that was decoded as Latin-1 during parsing.
    # bytes.decode() is equivalent to the former unicode(bytes, 'utf-8')
    # call without relying on the Python-2-only builtin.
    ts.teamName = img.get('title').encode('latin1').decode('utf-8')
    ts.played = int(tdPlayed.text)
    ts.wins = int(tdWins.text)
    ts.draws = int(tdDraws.text)
    ts.losses = int(tdLosses.text)
    ts.goalsFor = int(tdGoalsFor.text)
    ts.goalsAgainst = int(tdGoalsAgainst.text)
    # Both operands are already ints, so no extra int() is needed.
    ts.goalsDiff = ts.goalsFor - ts.goalsAgainst
    ts.points = int(tdPoints.text)

    standings.append(ts)

# Persist the complete list of TeamStat objects.
marshal(standings,'parsed/gss.pkl')
Example #2
0
#!/usr/bin/python

from globals import MatchStat, getInFile, marshal, unmarshal
from lxml import etree
from whohas import whohas
import sys, re

# Parse a saved match page and start filling in a MatchStat from its header.
tree = etree.parse(getInFile(''),etree.HTMLParser())

# Everything read below lives inside the match header <div>; [0] raises
# IndexError if the page has no such element.
div = tree.xpath('//div[@id="fwcMatchHeader"]')[0]
ms = MatchStat()
# The match number is the span text with its first six characters dropped
# (presumably a "Match " prefix -- confirm against the page).
ms.number = int(div.xpath('./div[@class="footer"]/div[@class="info"]/span[@class="matchInfo L"]')[0].text[6:])

# Do not re-parse a match whose pickle already holds a MatchStat.
try:
  oldms = unmarshal('parsed/match%02d.pkl'%(ms.number),None)
  if isinstance(oldms,MatchStat):
    # Parenthesised form prints the same text under the Python 2 print
    # statement and is also valid as a Python 3 function call.
    print("Skipping, parsed/match%02d.pkl exists"%(ms.number))
    sys.exit(0) # MatchStat exists, abort parsing.
except IOError:
  pass # file not found means go on, the MatchStat does not yet exist!

# Evaluate the footer's info spans once; they are indexed three times below.
infoSpans = div.xpath('./div[@class="footer"]/div[@class="info"]/span')

# Sanity check: the first info span must yield the same match number as the
# class-qualified lookup above.
assert ms.number == int(infoSpans[0].text[6:])

# Second span: keep the first run of word characters, spaces and hyphens
# that starts and ends on a word character (strips surrounding whitespace
# and punctuation).
ms.group = infoSpans[1].text
ms.group = re.search(r'\w[\w -]+\w', ms.group).group()
# Reduce "Group X" to "X". startswith() is used because the slice below
# removes the first six characters, which is only correct when the
# "GROUP " marker is actually the prefix.
if ms.group.upper().startswith('GROUP '):
  ms.group = ms.group[6:].upper()


# Raw text of the third info span and of the header's "time" element
# (presumably the match date and kick-off time -- confirm downstream).
whdate = infoSpans[2].text
whtime = div.xpath('./div[@class="match"]/div[@class="time"]')[0].text
Example #3
0
#!/usr/bin/python

from globals import MatchStat, getInFile, marshal, unmarshal
import re

### Part one: parsing the match report txt

# Read the whole match report and decode it to a unicode string.
# getInFile comes from the project's globals module; the meaning of the
# empty-string argument is not visible here -- TODO confirm.
fp = getInFile('')
fn = fp.name
contents = fp.read() # data is utf-8 encoded
contents = unicode(contents, 'utf-8')

# Pull two three-letter codes out of the upper-cased file name, which must
# contain "_HHH-AAA_": group 'h' first, group 'a' second (presumably home
# and away -- confirm). If the name does not match, re.search returns None
# and .groupdict() raises AttributeError.
codes = re.search(r'_(?P<h>[A-Z]{3})-(?P<a>[A-Z]{3})_',fn.upper()).groupdict()
assert(all(codes.values()))

# Find the title line: a line that directly follows a form feed (U+000C) at
# the start of a line and reads "<name> (AAA)<name> (HHH)". Note the order:
# the 'aaaa'/'a' pair comes first here and the 'hhhh'/'h' pair second.
# Names may not contain parentheses.
matchTitleMatch = re.search(ur"(?<=^\u000c)(?P<aaaa>[^()]+) \((?P<a>[A-Z]{3})\)(?P<hhhh>[^()]+) \((?P<h>[A-Z]{3})\)$",contents,re.M)
assert(matchTitleMatch)
matchTitle = matchTitleMatch.group()
teams = matchTitleMatch.groupdict()
assert(all(teams.values()))
# The codes in the title must agree with the codes in the file name.
assert(all([ codes[key]==teams[key] for key in ['h','a'] ]))

# Find the score line "<hhhh> - <aaaa> <hg>:<ag>", optionally followed by
# " a.e.t.", a parenthesised list of one or two "d:d" scores, and
# " d:d PSO" (presumably extra time, intermediate scores and a penalty
# shoot-out -- confirm). Raises AttributeError if no line matches.
# NOTE(review): the dots in "a.e.t." are unescaped, so each matches any
# character.
scoreDict = re.search(r'^(?P<hhhh>.+) - (?P<aaaa>.+) (?P<hg>\d+):(?P<ag>\d+)( a.e.t.)?( \(\d+:\d+(, \d+:\d+)?\))?( \d+:\d+ PSO)?$',contents,re.M).groupdict()
# The names on the score line must equal the names on the title line.
assert(all([ scoreDict[key]==teams[key] for key in ['hhhh','aaaa'] ]))
score = ( int(scoreDict['hg']), int(scoreDict['ag']) )

# Goals are collected into this list by the code that follows.
goals=list()
# re.match anchors at the very start of contents, so the text must begin
# with "Goals Scored:\n". Whatever lies between that header and the literal
# title line is captured as 'goals'; re.S lets '.' span newlines. The group
# is optional, so goalsLine is None when the title follows the header
# directly. Raises AttributeError if the overall pattern does not match.
goalsLine = re.match(r'Goals Scored:\n((?P<goals>.+)\n)?'+re.escape(matchTitle), contents, re.S).group('goals')
if goalsLine: 
  for m in re.finditer(r"\((?P<who>[A-Z]{3})\)\s(?P<when>\d+)'(\+(?P<whenplus>\d+))?(?P<og>\sown goal)?",goalsLine):
    d = m.groupdict()