#!/usr/bin/python
# Parse the saved group standings page and pickle one TeamStat per table row.
from globals import TeamStat, getInFile, marshal
import string
from lxml import etree
from whohas import whohas

tree = etree.parse(getInFile('raw/standings.html'), etree.HTMLParser())
standings = []

# The first eight uppercase letters (A-H) each select one standings table.
for letter in string.ascii_uppercase[:8]:
    table = tree.xpath('//table[@summary="Group Group %c"]' % letter)[0]
    for row in table.findall('./tbody/tr'):
        # Unpacking raises unless the row has exactly these eight children.
        (tdTeam, tdPlayed, tdWins, tdDraws, tdLosses,
         tdGoalsFor, tdGoalsAgainst, tdPoints) = row.getchildren()
        img = tdTeam.find('.//img')

        ts = TeamStat()
        ts.group = letter
        # The three characters sitting just before the last four of the image
        # path (presumably a ".ext" suffix), upper-cased.
        ts.teamCode = img.get('src')[-7:-4].upper()
        # Turn the title back into Latin-1 bytes and decode those as UTF-8
        # (earlier the title attribute was stored exactly as returned).
        titleBytes = img.get('title').encode('latin1')
        ts.teamName = unicode(titleBytes, 'utf-8')

        # Plain integer columns, assigned in table order.
        for attr, cell in (('played', tdPlayed),
                           ('wins', tdWins),
                           ('draws', tdDraws),
                           ('losses', tdLosses),
                           ('goalsFor', tdGoalsFor),
                           ('goalsAgainst', tdGoalsAgainst)):
            setattr(ts, attr, int(cell.text))
        ts.goalsDiff = int(ts.goalsFor - ts.goalsAgainst)
        ts.points = int(tdPoints.text)

        standings.append(ts)

marshal(standings, 'parsed/gss.pkl')
#!/usr/bin/python
# Read the header block of a saved match page into a MatchStat.
from globals import MatchStat, getInFile, marshal, unmarshal
from lxml import etree
from whohas import whohas
import sys, re

tree = etree.parse(getInFile(''), etree.HTMLParser())
div = tree.xpath('//div[@id="fwcMatchHeader"]')[0]

ms = MatchStat()
# The match number is whatever follows the first six characters of the
# "matchInfo L" span.
numberSpan = div.xpath('./div[@class="footer"]/div[@class="info"]/span[@class="matchInfo L"]')[0]
ms.number = int(numberSpan.text[6:])

# Stop early when a pickled MatchStat for this match number is already there.
try:
    oldms = unmarshal('parsed/match%02d.pkl' % ms.number, None)
    if isinstance(oldms, MatchStat):
        print("Skipping, parsed/match%02d.pkl exists" % ms.number)
        sys.exit(0)  # nothing left to parse for this match
except IOError:
    # No pickle could be read, so this match has not been parsed yet: carry on.
    pass

# All spans of the footer's info block; read by position below.
infoSpans = div.xpath('./div[@class="footer"]/div[@class="info"]/span')

# The first span has to agree with the number taken from "matchInfo L".
assert ms.number == int(infoSpans[0].text[6:])

# Second span: keep the run of word characters, spaces and dashes that starts
# and ends on a word character; a label containing "GROUP " is cut down to
# what follows its first six characters, upper-cased.
groupLabel = re.search(r'\w[\w -]+\w', infoSpans[1].text).group()
if 'GROUP ' in groupLabel.upper():
    groupLabel = groupLabel[6:].upper()
ms.group = groupLabel

whdate = infoSpans[2].text
whtime = div.xpath('./div[@class="match"]/div[@class="time"]')[0].text
#!/usr/bin/python
# Extract team codes, team names, the final score and the goals line from a
# match report text.
from globals import MatchStat, getInFile, marshal, unmarshal
import re

### Part one: parsing the match report txt
fp = getInFile('')
fn = fp.name
contents = unicode(fp.read(), 'utf-8')  # data is utf-8 encoded

# The upper-cased file name has to contain "_XXX-YYY_"; XXX lands in group
# 'h' and YYY in group 'a' (presumably home and away).
codesMatch = re.search(r'_(?P<h>[A-Z]{3})-(?P<a>[A-Z]{3})_', fn.upper())
codes = codesMatch.groupdict()
assert all(codes.values())

# Title line: directly after a form feed (U+000C) that opens a line come two
# "Name (COD)" entries back to back, running to the end of that line. The
# first entry fills groups aaaa/a, the second hhhh/h.
# The pattern is the unicode prefix plus the raw tail; the joined text is the
# same pattern as a single raw-unicode literal would give.
titlePattern = (
    u"(?<=^\u000c)"
    + r"(?P<aaaa>[^()]+) \((?P<a>[A-Z]{3})\)(?P<hhhh>[^()]+) \((?P<h>[A-Z]{3})\)$"
)
matchTitleMatch = re.search(titlePattern, contents, re.M)
assert matchTitleMatch
matchTitle = matchTitleMatch.group()
teams = matchTitleMatch.groupdict()
assert all(teams.values())
# Codes found in the title must equal the ones from the file name.
assert all([codes[key] == teams[key] for key in ['h', 'a']])

# Score line: "<hhhh> - <aaaa> <hg>:<ag>", optionally followed by an
# " a.e.t." marker, a parenthesised list of one or two further scores and a
# trailing " n:n PSO".
scoreMatch = re.search(
    r'^(?P<hhhh>.+) - (?P<aaaa>.+) (?P<hg>\d+):(?P<ag>\d+)( a.e.t.)?( \(\d+:\d+(, \d+:\d+)?\))?( \d+:\d+ PSO)?$',
    contents,
    re.M,
)
scoreDict = scoreMatch.groupdict()
# Team names on the score line must equal the ones in the title line.
assert all([scoreDict[key] == teams[key] for key in ['hhhh', 'aaaa']])
score = (int(scoreDict['hg']), int(scoreDict['ag']))

goals = []
# re.match anchors at the very start of the text: "Goals Scored:" and a
# newline, then an optional goals text (re.S lets it span several lines) that
# ends with a newline, then the title line itself.
goalsMatch = re.match(
    r'Goals Scored:\n((?P<goals>.+)\n)?' + re.escape(matchTitle),
    contents,
    re.S,
)
goalsLine = goalsMatch.group('goals')
if goalsLine:
    # One hit per goal: "(COD) MM'" with an optional "+N" and an optional
    # " own goal" marker.
    goalPattern = r"\((?P<who>[A-Z]{3})\)\s(?P<when>\d+)'(\+(?P<whenplus>\d+))?(?P<og>\sown goal)?"
    for m in re.finditer(goalPattern, goalsLine):
        d = m.groupdict()