def getBowlerInfo(url):
    """Scrape a bowler's name and (if listed) bowling style from his
    Cricinfo profile page.

    Returns a dict with key 'name' and, when a "Bowling style" row is
    present on the profile, key 'type'.
    """
    soup = BeautifulSoup(urlopen(url).read(), "html5lib")
    info = {
        'name': Preprocess.preprocess(
            soup.find(class_='ciPlayernametxt').find('h1').text).strip()
    }
    # The profile lists several "player information" rows; only the
    # bowling-style row is of interest here.
    for row in soup.find_all(class_='ciPlayerinformationtxt'):
        if 'Bowling style' in row.text:
            info['type'] = Preprocess.preprocess(row.find('span').text)
            break
    return info
def scrapeByYear(player, country, year):
    """Scrape every ODI series of a season that involves *country*.

    Fetches Cricinfo's season index page, locates the section headed
    "One-Day Internationals", and hands each series whose teams or
    date/location mention *country* off to scrapeSeries().

    :param player: player name, forwarded to scrapeSeries()
    :param country: country name used to filter relevant series
    :param year: season string, e.g. "2016" or "2016/17"
    """
    webpage = urlopen(
        "http://www.espncricinfo.com/ci/engine"
        "/series/index.html?season=" + year + ";view=season") \
        .read()
    soup = BeautifulSoup(webpage, "html5lib")
    match_types = soup.find_all(class_='match-section-head')
    all_serieses = soup.find_all(class_='series-summary-wrap')
    # Find the index of the "One-Day Internationals" section heading so the
    # matching series-summary wrap can be selected.  (Replaced the original
    # `for i in range(len(...))` with enumerate.)
    # NOTE(review): if the heading is missing, k stays 0 and the first
    # section is used — this mirrors the original fallback behaviour.
    k = 0
    for idx, section in enumerate(match_types):
        if Preprocess.preprocess(
                section.find('h2').text) == 'One-Day Internationals':
            k = idx
            break
    odi_serieses = all_serieses[k]
    for series in odi_serieses.find_all(class_='series-summary-block'):
        series_url = "http://www.espncricinfo.com" + series.find(
            class_='teams').find('a').get('href')
        # A series is relevant if the country appears either in the team
        # names or in the date/location line (covers tri-series hosted
        # elsewhere).
        if country in series.find(class_='teams').text \
                or country in series.find(class_='date-location').text:
            scrapeSeries(player, country, series_url)
def scrapeSeries(player, country, url):
    """Walk a series page and scrape every scorecard that involves *country*.

    The counter ``i`` indexes ``match_summaries`` and is advanced once per
    "Scorecard" link so the summary text stays aligned with the match link,
    even for matches that are skipped by the country filter.
    """
    print("series : " + url)
    webpage = urlopen(url).read()
    soup = BeautifulSoup(webpage, "html5lib")
    # One summary <span> per match, in the same order as the menu links below.
    match_summaries = soup.find_all('span', class_='potMatchLink')
    i = 0
    for match in soup.find_all(class_='potMatchMenuLink'):
        # Each match has several menu links; only the "Scorecard" one is used.
        if "Scorecard" in match.text:
            match_url = match.get('href')
            # Cricinfo scorecard URLs embed the team names in kebab-case,
            # e.g. "sri-lanka"; match against that form.
            country_name = '-'.join(country.lower().strip().split(" "))
            if country_name in match_url:
                scrapeMatch(
                    player, country, match_url,
                    Preprocess.preprocess(match_summaries[i].parent.text))
            i += 1
def scrapeMatch(player, country, url, heading):
    """Scrape one match scorecard and record *player*'s dismissal.

    Finds the player's row in the batting scorecards, extracts his innings
    figures, the dismissal description and (when applicable) the bowler's
    profile, then appends the assembled dict to the module-level
    ``dismissals`` list.  NOTE(review): a second, two-argument
    ``scrapeMatch`` is defined later in this file and shadows this one at
    import time — confirm which is intended to be live.

    :param player: batsman's name (substring-matched against scorecard rows)
    :param country: the batsman's country
    :param url: scorecard URL
    :param heading: match summary text, e.g. "1st ODI: A v B at Venue - date"
    """
    print('match : ' + url)
    # url = url.replace('scorecard', 'commentary').strip('/') + '?innings=2&filter=wickets'
    webpage = urlopen(url).read()
    soup = BeautifulSoup(webpage, "html5lib")
    scorecards = soup.find_all(class_='scorecard-section batsmen')
    dismissal = {}
    player_dismissal = 0  # sentinel: stays 0 when the player is not found
    isBreak = False
    wayOut = ""
    player_name = player
    numbers = 0
    i = 0  # index of the innings (scorecard) the player batted in
    for scorecard in scorecards:
        for row in scorecard.find_all(class_='flex-row'):
            if row.find(class_='wrap batsmen'):
                player_name = Preprocess.preprocess(
                    row.find(class_='wrap batsmen').find(
                        class_='cell batsmen').find('a').text)
                if player in player_name:
                    wayOut = row.find(class_='wrap batsmen').find(
                        class_='cell commentary').text
                    # For actual dismissals the commentary cell carries a
                    # link whose text is the canonical "wayOut" string.
                    if "not out" not in wayOut:
                        wayOut = Preprocess.preprocess(
                            row.find(class_='wrap batsmen').find(
                                class_='cell commentary').find('a').text)
                    # Column headings (R, B, 4s, 6s, ...) and the matching
                    # numeric cells of the player's row.
                    number_headings_temp = scorecard.find(
                        class_='wrap header').find_all(class_='cell runs')
                    number_headings = [
                        Preprocess.preprocess(x.text)
                        for x in number_headings_temp
                    ]
                    numbers = row.find(class_='wrap batsmen').find_all(
                        class_='cell runs')
                    player_dismissal = row.find(class_='content')
                    isBreak = True
                    break
        if isBreak:
            break
        i += 1
    if player_dismissal != 0:
        dismissal['player'] = player_name
        dismissal['player_innings'] = {}
        dismissal['player_innings']['runs'] = Preprocess.preprocess(
            numbers[number_headings.index("R")].text)
        dismissal['player_innings']['balls'] = Preprocess.preprocess(
            numbers[number_headings.index("B")].text)
        dismissal['player_innings']['4s'] = Preprocess.preprocess(
            numbers[number_headings.index("4s")].text)
        dismissal['player_innings']['6s'] = Preprocess.preprocess(
            numbers[number_headings.index("6s")].text)
        # dismissal['venue'] = heading.split("-")[0].split('at')[1].strip()
        dismissal['date'] = heading.split("-")[1].strip()
        dismissal['stadium'] = Preprocess.preprocess(
            soup.find(class_='stadium-details').find('span').text)
        dismissal['innings'] = i + 1
        dismissal['bowler'] = {}
        dismissal['team'] = {}
        dismissal['opposition'] = {}
        dismissal['team']['country'] = country
        dismissal['wayOut'] = wayOut
        dismissal['scoreAt'] = Preprocess.preprocess(
            player_dismissal.find_all('span')[1].text).strip()
        dismissal['ball'] = Preprocess.preprocess(
            player_dismissal.find_all('span')[0].text).strip()
        dismissal['description'] = Preprocess.preprocess(
            player_dismissal.text).strip()
        # heading looks like "<n>th ODI: A v B at Venue - date"; whichever
        # team name does not contain *country* is the opposition.
        countries = heading.split("-")[0].split(":")[1].split("at")[0].split(
            "v")
        for c in countries:
            if country not in c:
                dismissal['opposition']['country'] = c.strip()
                break
        if len(scorecards) > 1:
            # With two innings cards, 1 - i flips to the other side's card.
            dismissal['opposition']['total'] = Preprocess.preprocess(
                scorecards[1 - i].find(class_='wrap total')
                .find_all('div')[1].text)
        else:
            dismissal['opposition']['total'] = 'DNB'
        dismissal['team']['total'] = Preprocess.preprocess(
            scorecards[i].find(class_='wrap total')
            .find_all('div')[1].text)
        dismissal['bowler'] = {}
        # Run-outs / retired-hurt have no credited bowler.
        if 'run out' not in wayOut and 'retired hurt' not in wayOut:
            bowler = None
            # The bowler's surname follows the standalone token 'b' in the
            # dismissal string (e.g. "c Smith b Starc").
            temp = wayOut.split(" ")
            for t in temp:
                if t.strip() == 'b':
                    bowler = temp[temp.index(t) + 1]
            isBreak = False
            # Fixed: guard against bowler staying None (unparsable dismissal
            # string) — `None in link.text` raises TypeError.  This matches
            # the sibling two-argument scrapeMatch version in this file.
            if bowler is not None:
                for bowlerSection in soup.find_all(
                        class_='scorecard-section bowling'):
                    for link in bowlerSection.find_all('a'):
                        # print(link.text)
                        if bowler in link.text:
                            dismissal['bowler'] = getBowlerInfo(
                                link.get('href'))
                            isBreak = True
                            break
                    if isBreak:
                        break
        print(dismissal)
        dismissals.append(dismissal)
def getPlayerInfo(url):
    """Scrape a player's Cricinfo profile page.

    :param url: profile page URL
    :returns: dict with 'bio' (name, country and every labelled information
        row) and 'statistics' ("Batting and fielding averages",
        "Bowling averages" as nested dicts keyed by format/row heading,
        and "Recent matches" as a list of per-match dicts).
    """
    # Sending the http request
    webpage = urlopen(url).read()
    # making the soup! yummy ;)
    soup = BeautifulSoup(webpage, "html5lib")
    data = {}
    data['bio'] = {}
    data['statistics'] = {}
    data['bio']['name'] = Preprocess.preprocess(
        soup.find(class_='ciPlayernametxt').find('h1').text).strip()
    # NOTE(review): the 'coutry' key is misspelled but kept as-is because
    # consumers of the emitted data may already rely on it.
    data['bio']['coutry'] = soup.find(
        class_='PlayersSearchLink').find('b').text
    bio_details = soup.find_all(class_='ciPlayerinformationtxt')
    for det in bio_details:
        data['bio'][det.find('b').text] = Preprocess.preprocess(
            det.find('span').text).strip()
    # Keep only statistics tables that have a header row.  Fixed: the
    # original removed items from the list while iterating over it, which
    # silently skips the element following every removed one.
    stat_tables = [
        table for table in soup.find_all(class_='engineTable')
        if table.find('thead')
    ]
    stat_headings = [
        Preprocess.preprocess(head.text)
        for head in soup.find_all('span', class_='ciPhotoWidgetLink')
    ]
    # Keep only the three known sections (same remove-while-iterating bug
    # fixed here); stat_headings[k] is assumed to label stat_tables[k].
    stat_headings = [
        head for head in stat_headings
        if head in ("Batting and fielding averages", "Bowling averages",
                    "Recent matches")
    ]
    for head in stat_headings:
        data['statistics'][head] = {}
    for k in range(len(stat_tables)):
        keys = []
        if stat_headings[k] != 'Recent matches':
            # Batting and bowling averages: one row per format.
            for col_name in stat_tables[k].find_all('th'):
                key = Preprocess.preprocess(col_name.text)
                if key == '10':
                    # Disambiguate the "10" (ten-wicket hauls) column.
                    key += 'w'
                keys.append(key)
            for row in stat_tables[k].find('tbody').find_all('tr'):
                tds = row.find_all('td')
                # First cell is the row heading (format name), possibly linked.
                if tds[0].find('a'):
                    head = Preprocess.preprocess(
                        tds[0].find('a').find('span').find('b').text)
                else:
                    head = Preprocess.preprocess(tds[0].find('b').text)
                data['statistics'][stat_headings[k]][head] = {}
                for j in range(1, len(tds)):
                    data['statistics'][stat_headings[k]][head][
                        keys[j]] = Preprocess.preprocess(tds[j].text)
        else:
            # Recent matches: a flat list of per-match dicts.
            for col_name in stat_tables[k].find_all('th'):
                keys.append(Preprocess.preprocess(col_name.text))
            matches = []
            for row in stat_tables[k].find('tbody').find_all('tr'):
                tds = row.find_all('td')
                match = {}
                for i in range(len(tds)):
                    if tds[i].find('a'):
                        d = Preprocess.preprocess(
                            tds[i].find('a').text).replace(' ', '')
                    else:
                        # Fixed: the original called a bare, undefined
                        # `preprocess` here, which raises NameError at
                        # runtime; a single Preprocess.preprocess pass with
                        # the space-strip is sufficient.
                        d = Preprocess.preprocess(
                            tds[i].text).replace(' ', '')
                    if keys[i] == 'Opposition':
                        # Cells read like "v Australia" — drop the 'v'.
                        d = d.strip('v')
                    match[keys[i]] = d
                matches.append(match)
            data['statistics'][stat_headings[k]] = matches
    return data
def run(self):
    """Interactive loop: train, test, or load a respiratory-rate model.

    Prompts the user for a choice — t (train), e (test), l (load
    pre-trained), anything else quits.  A model must be trained or loaded
    before testing; testing with no model fails at model.predict.
    """
    model = None
    scaler = None
    while True:
        self.find_data_files()
        print(
            "Enter:\nt - to train the model, \ne - to test a trained model, \nl - to load a pre-trained model\nq - to quit"
        )
        print(
            "IMPORTANT: Always train or load a model before testing it!\n")
        choice = input("Your choice: ")
        # Fixed: the original compared strings with `is`, which tests object
        # identity and only works by accident of small-string interning.
        if choice == "l":
            model = load_model("model.h5")
        elif choice == "t" or choice == "e":
            print("Data files:\n")
            for i, file in enumerate(self.__files):
                print("{} - {}".format(i + 1, file))
            if choice == "t":
                number = input(
                    "\nPlease select the file to train the model on: ")
            else:
                number = input(
                    "\nPlease select the file to test the model on: ")
            # Menu is 1-based; lists are 0-based.
            index = int(number) - 1
            if 0 <= index < len(self.__files):
                data = self.dr.read_set(self.__data_indices[index])
                pp = Preprocess()
                data, scaler = pp.clean_up(data)
                data = pp.convert_to_supervised(data, sample_shift=0)
                if choice == "t":
                    train, test = pp.prepare_sets(data, 0.2)
                    train_X, train_y = pp.make_input_output(
                        train, remove_resp_from_input=True)
                    test_X, test_y = pp.make_input_output(
                        test, remove_resp_from_input=True)
                    trainer = RespRatePredictor()
                    self.dr.plot(data)
                    model = trainer.make_network(
                        input_shape=(train_X.shape[1], train_X.shape[2]))
                    model = trainer.fit_network(model, train_X, train_y,
                                                test_X, test_y)
                    # Fixed off-by-one: `index` was already decremented
                    # above, so the saved file name now matches the data
                    # set actually trained on (the original used
                    # self.__data_indices[index - 1]).
                    model.save("model_{0:0>2}.h5".format(
                        self.__data_indices[index]))
                else:
                    all_X, all_y = pp.make_input_output(
                        data.drop("Time [s]", axis=1),
                        remove_resp_from_input=True)
                    predict_y = model.predict(all_X, batch_size=640)
                    # min_ = scaler.min_[1]
                    # scale_ = scaler.scale_[1]
                    # predict_y = (predict_y - min_) / scale_
                    predicted = pnd.DataFrame(
                        {"RESP_PREDICTED": predict_y.flatten()})
                    fused = pnd.concat([data, predicted], axis=1)
                    self.dr.plot(fused)
                    self.dr.plot_detail(fused)
            else:
                # Out-of-range selection: show the menu again.
                continue
        else:
            break
def scrapeMatch(dismissal, url):
    """Enrich a partially-built *dismissal* dict from a match scorecard.

    Locates the batsman's row (matching on his surname), then fills in
    'scoreAt', 'ball', 'description', team/opposition totals and, when a
    bowler was credited, his profile via getBowlerInfo().

    :param dismissal: dict pre-populated with 'batsman', 'team' and
        'opposition' sub-dicts (see scrapePlayerDismissals)
    :param url: scorecard URL
    :returns: the enriched dict, or False when no scorecard / no matching
        batsman row was found
    """
    print('match : ' + url)
    webpage = urlopen(url).read()
    soup = BeautifulSoup(webpage, "html5lib")
    scorecards = soup.find_all(class_='scorecard-section batsmen')
    if len(scorecards) == 0:
        # e.g. abandoned match with no play.
        return False
    player_dismissal = None
    isBreak = False
    wayOut = ""
    player_name = dismissal['batsman']['name']
    i = 0  # index of the innings (scorecard) the batsman appears in
    for scorecard in scorecards:
        for row in scorecard.find_all(class_='flex-row'):
            if row.find(class_='wrap batsmen'):
                player_name = Preprocess.preprocess(
                    row.find(class_='wrap batsmen').find(
                        class_='cell batsmen').find('a').text)
                # Match on the batsman's surname (last whitespace token)
                # against each token of the scorecard name.
                nameMatch = False
                if len(dismissal['batsman']['name'].split(" ")) > 1:
                    n = dismissal['batsman']['name'].split(" ")[-1]
                    for n2 in player_name.split(" "):
                        if n in n2:
                            nameMatch = True
                            break
                if nameMatch:
                    # Prefer the linked dismissal text when present.
                    if row.find(class_='wrap batsmen').find(
                            class_='cell commentary').find('a'):
                        wayOut = Preprocess.preprocess(
                            row.find(class_='wrap batsmen').find(class_='cell commentary') \
                            .find('a').text)
                    else:
                        wayOut = Preprocess.preprocess(
                            row.find(class_='wrap batsmen').find(
                                class_='cell commentary').text)
                    player_dismissal = row.find(class_='content')
                    isBreak = True
                    break
        if isBreak:
            break
        i += 1
    if player_dismissal != None:
        dismissal['bowler'] = {}
        dismissal['scoreAt'] = Preprocess.preprocess(
            player_dismissal.find_all('span')[1].text).strip()
        dismissal['ball'] = Preprocess.preprocess(
            player_dismissal.find_all('span')[0].text).strip()
        # Strip the span children so only the free-text commentary remains
        # for the 'description' field.
        for s in player_dismissal.find_all('span'):
            s.decompose()
        dismissal['description'] = Preprocess.preprocess(
            player_dismissal.text).strip()
        if len(scorecards) > 1:
            # With two innings cards, 1 - i flips to the other side's card.
            dismissal['opposition']['total'] = Preprocess.preprocess(scorecards[1 - i] \
                .find(class_='wrap total') \
                .find_all('div')[1].text)
        else:
            dismissal['opposition']['total'] = 'DNB'
        dismissal['team']['total'] = Preprocess.preprocess(scorecards[i] \
            .find(class_='wrap total') \
            .find_all('div')[1].text)
        dismissal['bowler'] = {}
        # Run-outs / retired-hurt have no credited bowler.
        if 'run out' not in wayOut and 'retired hurt' not in wayOut:
            bowler = None
            # The bowler's surname follows the standalone 'b' token in the
            # dismissal string (e.g. "c Smith b Starc").
            temp = wayOut.split(" ")
            for t in temp:
                if t.strip() == 'b':
                    bowler = temp[temp.index(t) + 1]
            isBreak = False
            if bowler is not None:
                for bowlerSection in soup.find_all(
                        class_='scorecard-section bowling'):
                    for link in bowlerSection.find_all('a'):
                        if bowler in link.text:
                            dismissal['bowler'] = getBowlerInfo(link.get('href'))
                            isBreak = True
                            break
                    if isBreak:
                        break
    else:
        # Batsman's row was not found on this scorecard.
        return False
    return dismissal
def scrapePlayerDismissals(player_name):
    """Scrape every ODI dismissal of *player_name* from Cricinfo statsguru
    and dump them to Samples/Dismissals/<name>-odi-dismissals.json.

    Searches statsguru for the player, reads his batting hand from the
    profile, then walks the innings-by-innings list (outs only), building
    one dismissal dict per row and enriching it with the match scorecard
    via scrapeMatch(dismissal, url).
    """
    # Renamed from `id`, which shadowed the builtin.
    dismissal_id = 0
    dismissals = []
    webpage = urlopen('http://stats.espncricinfo.com'
                      '/ci/engine/stats/analysis.html?'
                      'search=' + ('+'.join(player_name.split(' '))) +
                      ';template=analysis').read()
    soup = BeautifulSoup(webpage, "html5lib")
    player_link = None
    for link in soup.find_all('a'):
        if 'One-Day Internationals player' in link.text:
            player_link = link
            break
    if player_link is not None:
        # The country sits in the second cell of the search-result row.
        player_country = Preprocess.preprocess(
            player_link.parent.parent.find_all('td')[1].text)
        soup_batsman = BeautifulSoup(
            urlopen("http://stats.espncricinfo.com" + player_link.get('href')),
            "html5lib").find(class_='ciPhotoContainer')
        batsman = {}
        batsman['name'] = player_name
        for p in soup_batsman.find_all('p'):
            if 'right-hand bat' in p.text:
                batsman['batting-hand'] = 'right'
                break
            elif 'left-hand bat' in p.text:
                batsman['batting-hand'] = 'left'
                break
        # Innings-by-innings batting list, dismissals only (outs=1),
        # chronological order.
        player_url = player_link.get('href').split(';')[0]
        innings_url = 'http://stats.espncricinfo.com' + player_url + \
            ';filter=advanced;orderby=start;outs=1;' \
            'template=results;type=batting;view=innings'
        innings_webpage = urlopen(innings_url)
        innings_html = BeautifulSoup(innings_webpage, "html5lib")
        innings_table = None
        for table in innings_html.find_all(class_='engineTable'):
            if table.find('caption') and \
                    'Innings by innings list' in table.find('caption').text:
                innings_table = table
                break
        for row in innings_table.find('tbody').find_all('tr'):
            dismissal = {}
            dismissal['batsman'] = batsman
            dismissal['player_innings'] = {}
            dismissal['dismissal'] = {}
            dismissal['opposition'] = {}
            dismissal['team'] = {}
            dismissal['team']['country'] = player_country
            # Column positions are fixed by statsguru's innings list layout
            # (1-based, replacing the original manual counter).
            for i, data in enumerate(row.find_all('td'), start=1):
                if i == 1:
                    dismissal['player_innings'][
                        'runs'] = Preprocess.preprocess(data.text).strip()
                elif i == 3:
                    dismissal['player_innings'][
                        'balls'] = Preprocess.preprocess(data.text).strip()
                elif i == 4:
                    dismissal['player_innings']['4s'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 5:
                    dismissal['player_innings']['6s'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 7:
                    dismissal['player_innings'][
                        'batting_position'] = Preprocess.preprocess(
                            data.text).strip()
                elif i == 8:
                    dismissal['dismissal']['wayOut'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 9:
                    dismissal['team_innings'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 11:
                    dismissal['opposition']['country'] = Preprocess.preprocess(
                        data.find('a').text).strip()
                elif i == 12:
                    dismissal['Stadium'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 13:
                    dismissal['date'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 14:
                    # Last cell links to the scorecard; scrapeMatch either
                    # enriches the dict or returns False.
                    scorecard_url = "http://www.espncricinfo.com" + data.find(
                        'a').get('href')
                    dismissal = scrapeMatch(dismissal, scorecard_url)
            if dismissal:
                dismissal['id'] = dismissal_id
                dismissals.append(dismissal)
                dismissal_id += 1
            print(dismissal)
        with open(
                'Samples/Dismissals/' + '-'.join(player_name.split(" ")) +
                '-odi-dismissals.json', 'w') as outfile:
            json.dump(dismissals, outfile)
# i += 1 # break # tot += 1 # if not detectedBallMovment: # text += desc + "\n" # text = '' # # wickets = [] # i = 0 # tot = 0 custom_Sent_tokenizer = PunktSentenceTokenizer() for dismissal in dismissals: if 'description' in dismissal: desc = Preprocess.preprocess(dismissal['description']) # desc = desc.replace(',', '') tokenized = custom_Sent_tokenizer.tokenize(desc) for i in tokenized: words = nltk.word_tokenize(i) tagged = nltk.pos_tag(words) print(desc) print(tagged) break # detectedLength = False # if dismissal['dismissal']['wayOut'] != 'run out': # for ball_length_list in ball_length: # if not detectedLength: # for k in ball_length_list[1]: # if all(w in desc for w in k): # text += (ball_length_list[0] + ", " + desc + "\n")