Example 1
def data_cleansing_2():
    start = t.time()
    # First pass: split each raw per-fight table and tag it with its fighter and fight IDs.
    for fighter in ufcfightdata:
        for ft_index, fightdict in ufcfightdata[fighter].items():
            for fight, listdf in fightdict.items():
                for df in listdf:
                    pd_df_spltr(df)
                    df['Fighter_ID'] = fighter
                    # 16-character fight ID taken from the ufcstats fight-details URL
                    df['Fight_ID'] = fight[34:50]

    # Second pass: the first table of each fight is loaded into fighter_round_fact,
    # the remaining tables into fight_area_round_fact.
    for fighter in ufcfightdata:
        for ft_index, fightdict in ufcfightdata[fighter].items():
            for fight, listdf in fightdict.items():
                for i, df in enumerate(listdf):
                    if i == 0:
                        load_to_db(df, "fighter_round_fact")
                    else:
                        load_to_db(df, "fight_area_round_fact")

    stop = t.time()
    elapsed = stop - start
    print('This took %s seconds' % elapsed)
Example 2
def get_fight_end_times():
    # Some fight pages are missing fields; one way to treat that is sketched
    # after this function.
    scores = ['23', '24', '25', '26', '27', '28', '29', '30',
              '42', '43', '44', '45', '46', '47', '48', '49', '50']

    with open("data/ufcfightdata.pickle", "rb") as f:
        ufcfightdataf = pickle.load(f)

    fightersl = list(ufcfightdataf.keys())
    keys_to_extract = fightersl[1501:]

    # keys_to_extract = []
    # for fighter in fighters:
    #     keys_to_extract.append(fighter[36:52])

    ufcfightdata = {key: ufcfightdataf[key] for key in keys_to_extract}

    start = t.time()

    detail_rows = []

    for fighter in ufcfightdata:
        for ft_index, fightdict in ufcfightdata[fighter].items():
            for fight_url in fightdict:
                r = requests.get(fight_url)
                soup = bs.BeautifulSoup(r.content, 'lxml')

                details = []
                # Fight title and detail items, skipping any that carry a
                # scorecard total from `scores`.
                for a in soup.find_all('i', class_=["b-fight-details__fight-title",
                                                    "b-fight-details__text-item"]):
                    if any(score in a.text for score in scores):
                        continue
                    details.append(a.text.replace('\n', '').strip())

                # Remaining detail values rendered with normal font style.
                for a in soup.find_all('i', attrs={'style': 'font-style: normal'}):
                    details.append(a.text.replace('\n', '').strip())

                fight_details_df = pd.DataFrame([details])
                fight_details_df['Fighter_ID'] = fighter
                # 16-character fight ID taken from the ufcstats fight-details URL
                fight_details_df['Fight_ID'] = fight_url[34:50]
                detail_rows.append(fight_details_df)

    stop = t.time()
    elapsed = stop - start
    print('This took %s seconds' % elapsed)

    fight_detail_im = pd.concat(detail_rows, ignore_index=True)
    fight_detail_im.columns = ['Weight_Class', 'Round_End', 'Time_End', 'Fight_format',
                               'Referee', 'Judge1', 'Judge2', 'Judge3', 'Outcome_Detail']

    load_to_db(fight_detail_im, 'fight_outcome_dim')
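The note about missing data in get_fight_end_times is left open in the original. One minimal way to treat it is to pad or truncate each scraped detail list to the seven fields expected before the two ID columns are added, so the final nine-column assignment cannot fail; pad_details below is a hypothetical helper, not part of the original pipeline.

def pad_details(details, expected=7, filler=None):
    # Keep at most `expected` scraped values and pad short lists with a filler
    # so every fight lines up with the nine-column schema used above.
    details = list(details)[:expected]
    return details + [filler] * (expected - len(details))

# Usage sketch inside the scraping loop:
#     fight_details_df = pd.DataFrame([pad_details(details)])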
Example 3
def get_fight_odds():
    with open(r'data\fighter_odds.json', 'r') as fp:
        fighter_odds = json.load(fp)

    fighter_odds = list(fighter_odds.keys())[100:]
    # fighter_odds_test = []
    # fighter_odds_test.append('https://www.bestfightodds.com/fighters/Tony-Ferguson-2568')
    # fighter_odds_test.append("https://www.bestfightodds.com/fighters/Israel-Adesanya-7845")
    # fighter_odds_test.append("https://www.bestfightodds.com/fighters/Jan-Blachowicz-2371")

    # for fighter_url in fighter_odds_test:

    odds_frames = []

    start = t.time()
    for fighter_url in fighter_odds:
        # Take the first table on the fighter's page (the odds table used below).
        dfs = pd.read_html("https://www.bestfightodds.com" + fighter_url)
        df = dfs[0]
        df.columns = [
            'Fighter_Name', 'Open', 'Close_range_Lower', 'Remove',
            'Close_range_Upper', 'Remove1', 'Remove2', 'Event'
        ]
        # Drop every third row of the scraped table, then the helper columns.
        df = df[df.index % 3 != 0]
        df = df.drop(columns=["Remove", "Remove1", "Remove2"])
        df['Fighter_ID'] = fighter_url[39:]
        odds_frames.append(df)

    finaldf = pd.concat(odds_frames, ignore_index=True)

    # Even and odd rows are paired up and joined side by side into one row per matchup.
    finaldf1 = finaldf[finaldf.index % 2 == 0].reset_index(drop=True)
    finaldf2 = finaldf[finaldf.index % 2 != 0].reset_index(drop=True)
    finaldf1.columns = [
        'Fighter_Name0', 'Open0', 'Close_range_Lower0', 'Close_range_Upper0',
        'Event_Name', 'Fighter_Id_0'
    ]
    finaldf2.columns = [
        'Fighter_Name1', 'Open1', 'Close_range_Lower1', 'Close_range_Upper1',
        'Event_Date', 'Fighter_Id_1'
    ]

    finaldf3 = finaldf1.join(finaldf2)

    stop = t.time()
    elapsed = stop - start
    print('This took %s seconds' % elapsed)

    load_to_db(finaldf3, 'fight_odd_fact')
Example 4
def db_insert_fighter_dim(all_fighters):
    # Run once a month (scheduled task).

    def fighter_dic_to_df():
        # Flatten the nested fighter dictionary into Fighter_ID / Fighter_Name pairs.
        fighter_id, fighter_name = [], []
        for i in all_fighters:
            for j, k in all_fighters[i].items():
                fighter_id.append(j)
                fighter_name.append(k)
        fighter_df = pd.DataFrame([fighter_id, fighter_name]).T
        fighter_df.columns = ['Fighter_ID', 'Fighter_Name']
        return fighter_df

    fighter_df = fighter_dic_to_df()

    load_to_db(fighter_df, "fighter_dim")
Example 5
def get_fight_dates_results():
    with open(r'data\ufcfighters.json', 'r') as fp:
        fighters = json.load(fp)

    dates = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    # fighters = []
    # fighters.append('http://ufcstats.com/fighter-details/22a92d7f62195791')
    # fighters.append('http://ufcstats.com/fighter-details/787bb1f087ccff8a')

    fighters = list(fighters.keys())[:500]

    ufcfightdatedim = {}
    ufcfightid_dim = {}

    start = t.time()

    for fighter in fighters:
        # 16-character fighter ID taken from the ufcstats fighter-details URL.
        fighter_id = fighter[36:52]
        ufcfightdatedim[fighter_id] = {}
        ufcfightid_dim[fighter_id] = {}

        r = requests.get(fighter)
        soup = bs.BeautifulSoup(r.content, 'lxml')

        # Fight dates: table cells whose text contains a month abbreviation.
        cnt = 0
        for p in soup.find_all('p', attrs={"class": "b-fight-details__table-text"}):
            if any(x in p.text for x in dates):
                ufcfightdatedim[fighter_id][cnt] = p.text.strip()
                cnt += 1

        # Fight links: one entry per distinct fight-details URL on the page.
        seen = set()
        cnt = 0
        for a in soup.find_all('a', href=True):
            if ("fight-details" in a['href'] and a['href'] not in seen
                    and a.text != 'next' and 'Matchup' not in a.text):
                seen.add(a['href'])
                ufcfightid_dim[fighter_id][cnt] = {a['href']: a.text}
                cnt += 1

    stop = t.time()
    elapsed = stop - start
    print('This took %s seconds' % elapsed)

    datedim_rows = []
    fightid_rows = []

    for fighter in ufcfightdatedim:
        df = pd.DataFrame(list(ufcfightdatedim[fighter].items()),
                          columns=['Fight_Index', 'Fight_Date'])
        df['Fighter_ID'] = fighter
        datedim_rows.append(df)

        for fight_index, fightdata in ufcfightid_dim[fighter].items():
            fight_url, result = next(iter(fightdata.items()))
            # 16-character fight ID taken from the ufcstats fight-details URL.
            fightid_rows.append([fight_index, fighter, fight_url[34:50], result.strip()])

    datedim_df = pd.concat(datedim_rows, ignore_index=True)
    fightid_dimdf = pd.DataFrame(fightid_rows,
                                 columns=['Fight_Index', 'Fighter_ID', 'Fight_ID', 'Result'])

    load_to_db(datedim_df, "fight_date_dim")
    load_to_db(fightid_dimdf, "fight_id_dim")
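The two dimensions produced above share a per-fighter Fight_Index counter, so fight dates can be attached to fight results by joining on Fighter_ID and Fight_Index. A minimal sketch, assuming the two counters stay aligned for every fighter; attach_fight_dates is not part of the original pipeline.

def attach_fight_dates(fightid_dimdf, datedim_df):
    # Line each fight up with its date via the shared per-fighter index.
    return fightid_dimdf.merge(datedim_df,
                               on=['Fighter_ID', 'Fight_Index'],
                               how='left')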
Example 6
    def get_fight_odds(self):

        fighter_odds = h.load_json(self.fighter_lst_address)
        # with open(self.fighter_lst_address, 'r') as fp:
        #     fighter_odds = json.load(fp)

        # Get only the difference between the latest fighter odds IDs and what's
        # already in the database, and send just that through for scraping and
        # building the frames; one way to compute the difference is sketched
        # after this method.

        # fighter_odds = list(fighter_odds.keys())[100:]
        # fighter_odds_test = []
        # fighter_odds_test.append('https://www.bestfightodds.com/fighters/Tony-Ferguson-2568')
        # fighter_odds_test.append("https://www.bestfightodds.com/fighters/Israel-Adesanya-7845")
        # fighter_odds_test.append("https://www.bestfightodds.com/fighters/Jan-Blachowicz-2371")

        odds_frames = []

        start = t.time()
        for fighter_url in fighter_odds:
            # Take the first table on the fighter's page (the odds table used below).
            dfs = pd.read_html(self.bestfightOddsUrl + fighter_url)
            df = dfs[0]
            df.columns = ['Fighter_Name', 'Open', 'Close_range_Lower', 'Remove',
                          'Close_range_Upper', 'Remove1', 'Remove2', 'Event']
            # Drop every third row of the scraped table, then the helper columns.
            df = df[df.index % 3 != 0]
            df = df.drop(columns=["Remove", "Remove1", "Remove2"])
            df['Fighter_ID'] = fighter_url[39:]
            odds_frames.append(df)

        finaldf = pd.concat(odds_frames, ignore_index=True)

        # Even and odd rows are paired up and joined side by side into one row per matchup.
        finaldf1 = finaldf[finaldf.index % 2 == 0].reset_index(drop=True)
        finaldf2 = finaldf[finaldf.index % 2 != 0].reset_index(drop=True)

        finaldf1.columns = ['Fighter_Name0', 'Open0', 'Close_range_Lower0',
                            'Close_range_Upper0', 'Event_Name', 'Fighter_Id_0']
        finaldf2.columns = ['Fighter_Name1', 'Open1', 'Close_range_Lower1',
                            'Close_range_Upper1', 'Event_Date', 'Fighter_Id_1']

        finaldf3 = finaldf1.join(finaldf2)

        stop = t.time()
        elapsed = stop - start
        print('This took %s seconds' % elapsed)

        h.load_to_db(finaldf3, 'fight_odd_fact')
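
The comment in get_fight_odds above calls for scraping only fighters whose odds are not yet in the database. A minimal sketch of that difference step, assuming fighter_odds is the scraped list of bestfightodds fighter URLs, that Fighter_ID is stored as fighter_url[39:] as above, and that existing_fighter_ids() is a hypothetical helper returning the Fighter_ID values already present in fight_odd_fact.

def new_fighter_urls(fighter_odds, existing_ids):
    # Keep only the fighters whose ID has not been loaded yet, preserving order.
    existing = set(existing_ids)
    return [url for url in fighter_odds if url[39:] not in existing]

# Usage sketch (existing_fighter_ids() is hypothetical):
#     fighter_odds = new_fighter_urls(fighter_odds, existing_fighter_ids())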
        
        




# def get_all_bestfightOdds_IDs(opponent, cnt, fighter_lst):
    
#     for fighter_url in fighter_lst:
#         if fighter_url == opponent:
            
#             ##extract the web page for the fighter
#             r = requests.get("https://www.bestfightodds.com"+fighter_url)
#             soup = bs.BeautifulSoup(r.content,'lxml')
              
#             #for each fighter, go through their opponents
#             #each link to their opponent contains the string "fighters" within it
#             for a in soup.find_all('a', href=re.compile('fighters')):
                
#                 #filter out the fighter urls that are the fighter themselves
#                 if fighter_url not in a['href']:
                    
#                     #if the fighter's opponent is not already in the dictionary
#                     if a['href'] not in fighter_lst:
                        
                        
#                         #add the ID to the new dictionary
# #                        fighter_lst[a['href']] = a.text
#                         fighter_lst.append(a['href'])
                        
#                         #some checking
#                         cnt+=1
#                         print(a.text,' added #',cnt)
                        
#                         #send the opponent to scrape all the IDs of their opponents
#                         opponent=  a['href']
                        
#                         #then recursively get the IDs of their opponents
#                         get_all_bestfightOdds_IDs(opponent, cnt, fighter_lst)                   
            
#     return fighter_lst


# fighter_lst = ['/fighters/Israel-Adesanya-7845']
# cnt=0
# opponent = '/fighters/Israel-Adesanya-7845'

# get_all_bestfightOdds_IDs(opponent, cnt, fighter_lst)
# #how to terminate? count was going down for some reason 

# ##save output
# c  = str(datetime.date.today())
# with open(r'data\fighter_odds_%s.json' %c,'w') as fp:
#     json.dump(fighter_lst,fp)

# fidf = pd.DataFrame(fighter_lst, columns=['Fighter_ID'])
# h.load_to_db(fidf,'fighter_id_recursive')
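
The commented-out recursive crawler above notes two open problems: how to terminate, and a count that "was going down" (cnt is a plain int, so increments made in deeper recursive calls never propagate back up). An iterative sketch under the same assumptions (bestfightodds fighter paths scraped with requests and BeautifulSoup); crawl_bestfightodds_ids is not part of the original script.

import re
import requests
import bs4 as bs

def crawl_bestfightodds_ids(seed='/fighters/Israel-Adesanya-7845'):
    # Depth-first walk over opponent links with an explicit stack instead of
    # recursion: the seen set guarantees termination once no new IDs appear,
    # and the running count only ever increases.
    seen = {seed}
    order = [seed]
    stack = [seed]
    while stack:
        fighter_url = stack.pop()
        r = requests.get('https://www.bestfightodds.com' + fighter_url)
        soup = bs.BeautifulSoup(r.content, 'lxml')
        for a in soup.find_all('a', href=re.compile('fighters')):
            href = a['href']
            if href != fighter_url and href not in seen:
                seen.add(href)
                order.append(href)
                stack.append(href)
                print(a.text, 'added #', len(order))
    return order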