def _find_event_zone(description):
    """
    Return the token that precedes the zone marker in an event description.

    The marker 'Zone,' is looked up first; if it is absent, a bare 'Zone'
    token is tried. Returns None when neither marker is present.
    """
    tokens = description.split()
    # Zone is not always indicated; when it is, the marker is either
    # 'Zone,' or (for certain events) a bare 'Zone' without the comma.
    for marker in ('Zone,', 'Zone'):
        try:
            return tokens[tokens.index(marker) - 1]
        except ValueError:
            continue
    return None


def raw_harvest(year, game_num, away_acronym, home_acronym, away_roster, home_roster):
    """
    Extract play-by-play information from the report tree returned by
    Operations.germinate_report_seed(year, game_num, 'PL', '02'), in the
    form of raw, unspecified events.

    Parameters:
        year, game_num -- forwarded to Operations.germinate_report_seed
        away_acronym, home_acronym -- forwarded unchanged to each Event
        away_roster, home_roster -- forwarded to
            Operations.chop_on_ice_branch to resolve the on-ice players

    Returns a list of Event objects, one per '<tr class="evenColor">' row.

    Raises AssertionError if the zone token found in a description is not
    one of 'Neu.', 'Off.', 'Def.'. Raises IndexError if a row has fewer
    than seven text cells.
    """
    tree = Operations.germinate_report_seed(year, game_num, 'PL', '02')

    events = []  # holds the unspecified events, in document order

    for item in tree.xpath('//table/tr[@class="evenColor"]'):

        event_raw = item.xpath('./td/text()')

        # NOTE: event_raw[4] is intentionally not read here.
        num = int(event_raw[0])
        per_num = int(event_raw[1])
        strength = unicode(event_raw[2])
        time = unicode(event_raw[3])
        event_type = unicode(event_raw[5])
        description = unicode(event_raw[6])

        # A bit redundant, done also before pruning events
        zone = _find_event_zone(description)
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped when running under `python -O`; the exception type and
        # message are the same as before.
        if zone not in ('Neu.', 'Off.', 'Def.', None):
            raise AssertionError("ERROR: Event zone(%s) invalid" % (zone,))

        # Goals have an additional row in the description cell for assists.
        # Guard the length first: a row without an eighth text node is
        # treated as having no assist line instead of raising IndexError.
        if (event_type == 'GOAL' and len(event_raw) > 7
                and 'Assist' in event_raw[7]):
            description = unicode(" ".join(event_raw[6:8]))

        players_on_ice = item.xpath('./td/table')

        home_on_ice = []
        away_on_ice = []

        # On-ice lists are only filled when exactly two nested tables are
        # found (first = away, second = home); otherwise both stay empty.
        if len(players_on_ice) == 2:  # Perhaps make this more robust?
            away_on_ice = Operations.chop_on_ice_branch(
                players_on_ice[0], away_roster)
            home_on_ice = Operations.chop_on_ice_branch(
                players_on_ice[1], home_roster)

        event = Event(num, per_num, strength, time, event_type, zone,
                      description, away_acronym, home_acronym,
                      away_on_ice, home_on_ice)

        events.append(event)

    return events