def data_cleansing_2():
    """Clean every scraped per-fight DataFrame in place, stamp it with its
    fighter/fight IDs, then load each frame into the appropriate DB table.

    Reads the module-level ``ufcfightdata`` structure, which is (from the
    iteration pattern here) a nested mapping:
    fighter_id -> fight_index -> fight_url -> [DataFrame, ...].
    """
    start = t.time()
    # Pass 1: split/clean each DataFrame via pd_df_spltr and tag identifiers.
    for fighter in ufcfightdata:
        # fightdict already IS ufcfightdata[fighter][ft_index]; no need to
        # re-index the outer structure (the original did both).
        for fightdict in ufcfightdata[fighter].values():
            for fight, listdf in fightdict.items():
                for df in listdf:
                    pd_df_spltr(df)
                    df['Fighter_ID'] = fighter
                    # Slice [34:50] extracts the 16-char fight hash from the
                    # fight-details URL — TODO confirm offsets against the
                    # actual key format.
                    df['Fight_ID'] = fight[34:50]
    # Pass 2 (kept separate so all cleansing finishes before any DB load,
    # as in the original): the first frame per fight holds round-level
    # fighter stats, the remainder are per-area stats.
    for fighter in ufcfightdata:
        for fightdict in ufcfightdata[fighter].values():
            for listdf in fightdict.values():
                for i, df in enumerate(listdf):
                    if i == 0:
                        # import data to fighter_data table
                        load_to_db(df, "fighter_round_fact")
                    else:
                        load_to_db(df, "fight_area_round_fact")
    elapsed = t.time() - start
    print('This took %s seconds' % elapsed)
def get_fight_end_times():
    """Scrape ufcstats.com fight-detail pages for outcome metadata
    (weight class, ending round/time, fight format, referee, judges,
    outcome detail) and load the result into the fight_outcome_dim table.

    NOTE(review): sometimes missing data — how to treat?  A page that
    yields more or fewer than 9 items will break the hard-coded column
    assignment at the bottom (ValueError on .columns).
    """
    ufcfightdata={}
    # Judge scorecard digits (e.g. "29-28" totals): any <i> element whose
    # text contains one of these substrings is treated as a score line and
    # skipped.  NOTE(review): substring matching on bare digits is fragile.
    scores =['23','24','25','26','27','28','29','30','42','43','44','45','46','47','48','49','50']
    # NOTE(review): file handle is never closed — prefer a `with` block.
    ufcfightdataf= pickle.load( open( "data/ufcfightdata.pickle", "rb" ) )
    fightersl = list(ufcfightdataf.keys())
    # Resume point: only fighters from index 1501 onward — presumably the
    # earlier ones were already processed in a prior run; TODO confirm.
    keys_to_extract = fightersl[1501:]
    # keys_to_extract = []
    # for fighter in fighters:
    #     keys_to_extract.append(fighter[36:52])
    ufcfightdata = {key: ufcfightdataf[key] for key in keys_to_extract}
    start = t.time()
    fight_detail_im = pd.DataFrame()
    for fighter in ufcfightdata:
        for ft_index,fightdict in ufcfightdata[fighter].items():
            for fight_url in ufcfightdata[fighter][ft_index]:
                #print(fight_url)
                fight_details_df = pd.DataFrame()
                r = requests.get(fight_url)
                soup = bs.BeautifulSoup(r.content,'lxml')
                # Title and detail <i> items; skip judge score lines.
                for a in soup.find_all('i', class_=["b-fight-details__fight-title","b-fight-details__text-item"]):
                    if any(score in a.text for score in scores):
                        continue
                    else:
                        # NOTE(review): DataFrame.append is deprecated and
                        # removed in pandas 2.0 — migrate to pd.concat.
                        fight_details_df = fight_details_df.append([a.text.replace('\n','').strip()])
                # The outcome-detail element carries inline style
                # 'font-style: normal' on this site.
                for a in soup.find_all('i', attrs={'style': 'font-style: normal'}):
                    fight_details_df = fight_details_df.append([a.text.replace('\n','').strip()])
                # Transpose: the scraped items (one per row) become one row
                # of columns for this fight.
                fight_details_df = fight_details_df.T
                fight_details_df['Fighter_ID'] = fighter
                # 16-char fight hash from the URL — TODO confirm offsets.
                fight_details_df['Fight_ID'] = fight_url[34:50]
                fight_detail_im= fight_detail_im.append(fight_details_df)
    stop = t.time()
    time= stop-start
    print('This took %s seconds' %time)
    # Assumes exactly 9 scraped items per page plus the two ID columns.
    fight_detail_im.columns=['Weight_Class','Round_End','Time_End','Fight_format','Referee','Judge1','Judge2','Judge3','Outcome_Detail']
    load_to_db(fight_detail_im, 'fight_outcome_dim')
def get_fight_odds():
    """Scrape each fighter's historical betting odds from
    bestfightodds.com, pair the two fighters of every bout side-by-side,
    and load the result into the fight_odd_fact table.
    """
    with open(r'data\fighter_odds.json', 'r') as fp:
        fighter_odds = json.load(fp)
    # Resume point: skip the first 100 fighters — presumably already
    # loaded in an earlier run; TODO confirm against the database.
    fighter_odds = list(fighter_odds.keys())[100:]
    # fighter_odds_test=[]
    # fighter_odds_test.append('https://www.bestfightodds.com/fighters/Tony-Ferguson-2568')
    # fighter_odds_test.append("https://www.bestfightodds.com/fighters/Israel-Adesanya-7845")
    # fighter_odds_test.append("https://www.bestfightodds.com/fighters/Jan-Blachowicz-2371")
    start = t.time()
    # FIX: accumulate frames in a list and concatenate once at the end.
    # DataFrame.append was deprecated and removed in pandas 2.0, and
    # per-iteration appends are quadratic in total rows anyway.
    frames = []
    for fighter_url in fighter_odds:
        # NOTE(review): slicing at 39 matches len of
        # "https://www.bestfightodds.com/fighters/", which suggests
        # fighter_url is already a full URL — prepending the host again
        # below may be wrong; confirm the JSON key format.
        dfs = pd.read_html("https://www.bestfightodds.com" + fighter_url)
        # First table on the page holds the odds history.
        df = dfs[0]
        df.columns = [
            'Fighter_Name', 'Open', 'Close_range_Lower', 'Remove',
            'Close_range_Upper', 'Remove1', 'Remove2', 'Event'
        ]
        # Every third row (index % 3 == 0) is a spacer/header row.
        df = df[df.index % 3 != 0]
        df = df.drop(columns=["Remove", "Remove1", "Remove2"])
        df['Fighter_ID'] = fighter_url[39:]
        frames.append(df)
    if frames:
        finaldf = pd.concat(frames)
    else:
        # Preserve the original's empty-but-columned frame when no
        # fighters were scraped.
        finaldf = pd.DataFrame(columns=[
            'Fighter_Name', 'Open', 'Close_range_Lower',
            'Close_range_Upper', 'Event', 'Fighter_ID'
        ])
    finaldf = finaldf.reset_index(drop=True)
    # Rows alternate fighter A / fighter B of the same bout: split on
    # even/odd index and rejoin side-by-side so one row = one bout.
    finaldf1 = finaldf[finaldf.index % 2 == 0].reset_index(drop=True)
    finaldf2 = finaldf[finaldf.index % 2 != 0].reset_index(drop=True)
    finaldf1.columns = [
        'Fighter_Name0', 'Open0', 'Close_range_Lower0',
        'Close_range_Upper0', 'Event_Name', 'Fighter_Id_0'
    ]
    finaldf2.columns = [
        'Fighter_Name1', 'Open1', 'Close_range_Lower1',
        'Close_range_Upper1', 'Event_Date', 'Fighter_Id_1'
    ]
    finaldf3 = finaldf1.join(finaldf2)
    stop = t.time()
    time = stop - start
    print('This took %s seconds' % time)
    load_to_db(finaldf3, 'fight_odd_fact')
def db_insert_fighter_dim(all_fighters):
    """Flatten the nested fighter mapping into a two-column DataFrame
    (Fighter_ID, Fighter_Name) and load it into the fighter_dim table.

    all_fighters: mapping of group -> {fighter_id: fighter_name}
    (inferred from the iteration below — TODO confirm with caller).

    Run once a month (scheduled task).
    """
    def fighter_dic_to_df():
        # BUG FIX: the original built fighter_df as a local of this inner
        # function and never returned it, so the outer load_to_db call
        # raised NameError.  Return the frame instead.
        fighter_id, fighter_name = [], []
        for group in all_fighters:
            for fid, fname in all_fighters[group].items():
                fighter_id.append(fid)
                fighter_name.append(fname)
        fighter_df = pd.DataFrame([fighter_id, fighter_name]).T
        fighter_df.columns = ['Fighter_ID', 'Fighter_Name']
        return fighter_df

    fighter_df = fighter_dic_to_df()
    load_to_db(fighter_df, "fighter_dim")
def get_fite_dates_results():
    """Scrape each fighter's ufcstats.com page for their fight dates and
    fight-details links, then load two tables: fight_date_dim and
    fight_id_dim.

    NOTE(review): function name looks like a typo for
    get_fight_dates_results — kept as-is since callers may depend on it.
    """
    with open(r'data\ufcfighters.json','r') as fp:
        fighters = json.load(fp)
    # Month abbreviations: a table cell containing one of these is
    # treated as a fight-date cell.
    dates =['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
    # fighters =[]
    # fighters.append('http://ufcstats.com/fighter-details/22a92d7f62195791')
    # fighters.append('http://ufcstats.com/fighter-details/787bb1f087ccff8a')
    fighters = list(fighters.keys())
    # Batch limit: only the first 500 fighters per run.
    fighters = fighters[:500]
    ufcfightdatedim={}
    ufcfightid_dim={}
    start = t.time()
    for fighter in fighters:
        # Slice [36:52] extracts the 16-char fighter hash from the
        # fighter-details URL — TODO confirm offsets.
        ufcfightdatedim[fighter[36:52]]={}
        r = requests.get(fighter)
        soup = bs.BeautifulSoup(r.content,'lxml')
        cnt=0
        for p in soup.find_all('p',attrs={"class":"b-fight-details__table-text"} ):
            if any(x in p.text for x in dates):
                #print(p.text,cnt)
                ufcfightdatedim[fighter[36:52]][cnt] = p.text.strip()
                cnt+=1
        ufcfightid_dim[fighter[36:52]]={}
        cnt=0
        for a in soup.find_all('a', href=True):
            if "fight-details" in a['href']:
                # print(a.text,a['href'])
                # NOTE(review): `a['href'] not in ufcfightid_dim` tests the
                # OUTER dict (keyed by fighter hash), not the per-fighter
                # inner dict — this condition is always True; likely a bug.
                if a['href'] not in ufcfightid_dim and a.text !='next' and 'Matchup' not in a.text:
                    ufcfightid_dim[fighter[36:52]][cnt] = {a['href']: a.text}
                    cnt+=1
    stop = t.time()
    time= stop-start
    print('This took %s seconds' %time)
    datedim_df = pd.DataFrame()
    fightid_dimdf = pd.DataFrame()
    for fighter in ufcfightdatedim:
        # NOTE(review): from_dict is given a dict_items view, not a dict —
        # behavior is pandas-version-sensitive; verify it yields the
        # expected two columns (index, date).
        df = pd.DataFrame.from_dict(ufcfightdatedim[fighter].items())
        df['Fighter_Id'] = fighter
        # NOTE(review): DataFrame.append is deprecated (removed in
        # pandas 2.0) — migrate to pd.concat.
        datedim_df = datedim_df.append(df)
        for fight_index, fightdata in ufcfightid_dim[fighter].items():
            # NOTE(review): slicing str(dict.keys()) / str(dict.values())
            # relies on the exact repr text ("dict_keys([...") plus URL
            # length — extremely fragile; extract from the dict directly.
            fightid_dimdf = fightid_dimdf.append( [[fight_index, fighter, str(fightdata.keys())[46:62], str(fightdata.values())[14:] ]] )
    datedim_df.columns=['Fight_Index','Fight_Date','Fighter_ID']
    fightid_dimdf.columns=['Fight_Index','Fighter_ID','Fight_ID','Result']
    #import data to fighter_data table
    load_to_db(datedim_df,"fight_date_dim")
    load_to_db(fightid_dimdf,"fight_id_dim")
def get_fight_odds():
    """Scrape historical betting odds per fighter from bestfightodds.com,
    pair the two fighters of each bout side-by-side, and load the result
    into the fight_odd_fact table.

    NOTE(review): this function is BROKEN as a free function — it reads
    `self.fighter_lst_address` and `self.bestfightOddsUrl` but takes no
    `self` parameter (NameError at runtime).  It was evidently lifted
    from a class; restore the `self` parameter or replace the attribute
    reads with module-level config.  It also shadows/duplicates the other
    `get_fight_odds` defined earlier in this file.
    """
    fighter_odds = h.load_json(self.fighter_lst_address)
    # with open(self.fighter_lst_addres,'r') as fp:
    #     fighter_odds = json.load(fp)
    # get only the difference between the latest fighter odd IDs and
    # what's in the database; send that through for data scraping and
    # creating the dfs
    # fighter_odds = list(fighter_odds.keys())[100:]
    # fighter_odds_test=[]
    # fighter_odds_test.append('https://www.bestfightodds.com/fighters/Tony-Ferguson-2568')
    # fighter_odds_test.append("https://www.bestfightodds.com/fighters/Israel-Adesanya-7845")
    # fighter_odds_test.append("https://www.bestfightodds.com/fighters/Jan-Blachowicz-2371")
    finaldf = pd.DataFrame(columns=['Fighter_Name','Open','Close_range_Lower','Close_range_Upper','Event','Fighter_ID'])
    start = t.time()
    for fighter_url in fighter_odds:
        # First table on the page holds the odds history.
        dfs= pd.read_html(self.bestfightOddsUrl+fighter_url)
        df=dfs[0]
        df.columns =['Fighter_Name','Open','Close_range_Lower','Remove','Close_range_Upper','Remove1','Remove2','Event']
        # Every third row is a spacer/header row — drop it.
        df = df[df.index %3 !=0]
        df = df.drop(columns=["Remove","Remove1","Remove2"])
        # Fighter id: URL tail after the /fighters/ prefix (offset 39).
        df['Fighter_ID']= fighter_url[39:]
        # NOTE(review): DataFrame.append is deprecated (removed in
        # pandas 2.0) — migrate to pd.concat.
        finaldf= finaldf.append(df)
    finaldf=finaldf.reset_index(drop=True)
    # Rows alternate fighter A / fighter B of the same bout: split on
    # even/odd index and rejoin side-by-side so one row = one bout.
    finaldf1 = finaldf[finaldf.index%2==0]
    finaldf2 = finaldf[finaldf.index%2!=0]
    finaldf1=finaldf1.reset_index(drop=True)
    finaldf2=finaldf2.reset_index(drop=True)
    finaldf1.columns=['Fighter_Name0','Open0','Close_range_Lower0','Close_range_Upper0','Event_Name','Fighter_Id_0']
    finaldf2.columns=['Fighter_Name1','Open1','Close_range_Lower1','Close_range_Upper1','Event_Date','Fighter_Id_1']
    finaldf3= finaldf1.join(finaldf2)
    stop = t.time()
    time= stop-start
    print('This took %s seconds' %time)
    h.load_to_db(finaldf3,'fight_odd_fact')

# NOTE(review): commented-out recursive crawler kept for reference —
# consider deleting or moving to version control history.
# def get_all_bestfightOdds_IDs(opponent, cnt, fighter_lst):
#     for fighter_url in fighter_lst:
#         if fighter_url == opponent:
#             ## extract the web page for the fighter
#             r = requests.get("https://www.bestfightodds.com"+fighter_url)
#             soup = bs.BeautifulSoup(r.content,'lxml')
#             # for each fighter, go through their opponents
#             # each link to their opponent contains the string "fighters" within it
#             for a in soup.find_all('a', href=re.compile('fighters')):
#                 # filter out the fighter urls that are the fighter themselves
#                 if fighter_url not in a['href']:
#                     # if the fighter's opponent is not already in the dictionary
#                     if a['href'] not in fighter_lst:
#                         # add the ID to the new dictionary
#                         # fighter_lst[a['href']] = a.text
#                         fighter_lst.append(a['href'])
#                         # some checking
#                         cnt+=1
#                         print(a.text,' added #',cnt)
#                         # send the opponent to scrape all the IDs of their opponents
#                         opponent= a['href']
#                         # then recursively get the IDs of their opponents
#                         get_all_bestfightOdds_IDs(opponent, cnt, fighter_lst)
#     return fighter_lst
# fighter_list= ['/fighters/Israel-Adesanya-7845']
# fighter_lst.append('/fighters/Israel-Adesanya-7845')
# cnt=0
# opponent = '/fighters/Israel-Adesanya-7845'
# get_all_bestfightOdds_IDs(opponent, cnt, fighter_lst)
# # how to terminate? count was going down for some reason
# ## save output
# c = str(datetime.date.today())
# with open(r'data\fighter_odds_%s.json' %c,'w') as fp:
#     json.dump(fighter_lst,fp)
# fidf = pd.DataFrame(fighter_lst, columns=['Fighter_ID'])
# h.load_to_db(fidf,'fighter_id_recursive')