def main(train=False, analyze=False):
    """
    Run training (if train=True), predict scores, and then run analysis
    routines (if analyze=True).

    The ``works`` table is read from PostgreSQL, enriched by the genre, topic
    and score predictors, and written back as ``works_flask`` (replacing any
    existing table of that name).

    :param train: when True, rebuild the genre/topic/score models before the
        corresponding prediction step.
    :param analyze: when True, run the exploration and plotting routines on
        the predicted frame.
    :return: None
    """
    engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
        username, password, host, port, dbname))
    df = pd.read_sql_query("SELECT * FROM works", engine)

    # The confusion-matrix counts only exist after make_score() has run, so
    # track them explicitly instead of relying on names that may be unbound.
    score_counts = None

    if train:
        make_genres(df)
    df = predict_genres(df)

    if train:
        make_topics(df)
    df = predict_topics(df)

    if train:
        detrend_dates(df)
        # NOTE(review): assumes make_score() belongs to the training step,
        # mirroring make_genres()/make_topics() above — confirm.
        # make_score() returns (tp, tn, fp, fn, ...); only the first four
        # values are consumed by this function.
        score_counts = make_score(df)[:4]
    df = predict_score(df)

    df.to_sql('works_flask', engine, if_exists='replace')

    if analyze:
        initial_exploration(df)
        plot_nominees()
        analyze_genres(df)
        if score_counts is not None:
            confusion_matrix(*score_counts)
        else:
            # Previously this path raised NameError (tp/tn/fp/fn unbound).
            print('Skipping confusion matrix: it requires train=True.')
        analyze_topics()
def constraint3(x):
    """
    Constraint on the raw gap between the recorded and re-predicted score.

    Stores the candidate alphas ``x`` in ``temp_df['alpha']``, re-runs
    ``predict_score`` and returns ``old_score - new_score`` when
    ``old_error`` is non-negative, otherwise ``new_score - old_score``.

    NOTE(review): ``temp_df``, ``wod_time``, ``old_score`` and ``old_error``
    are free variables taken from the enclosing scope. ``constraint1`` calls
    ``predict_score([df, wod_time], wod_format)``; this one passes
    ``(df, wod_time)`` — confirm which signature is intended.
    """
    # The candidate vector is written straight into the shared frame.
    temp_df['alpha'] = x
    new_score = predict_score(temp_df, wod_time)

    if old_error >= 0.0:
        return old_score - new_score
    return new_score - old_score
def process_movie(topic, title_tag, rank, url_home, audiance_link, utils_predics):
    """
    Scrape one movie page, score it and publish the result to Kafka.

    The movie page (``url_home`` + the tag's ``href``) supplies the audience
    score, the property table and the critic reviews; the audience page
    (movie URL + ``audiance_link``) supplies additional user reviews. The
    assembled ``Movie`` is passed through ``predict_score`` and its
    serialized form is sent to ``topic``.

    :param topic: Kafka topic; converted with ``str()`` before sending.
    :param title_tag: parsed anchor tag holding the title text and ``href``.
    :param rank: parsed node whose ``.next`` text looks like ``"97%"``.
    :param url_home: site root that ``href`` is appended to.
    :param audiance_link: path suffix of the audience-review page.
    :param utils_predics: opaque helper object forwarded to ``predict_score``.
    :return: the object returned by ``predict_score``, or the string
        ``'FAILED'`` when either page request raises ``HTTPError``.
    """
    title = str(title_tag.next).strip()
    movie_url = url_home + title_tag.get("href")

    try:
        movie_page = uReq(movie_url).read()
    except HTTPError:
        print(f"ERROR can't access to server LINK : {movie_url} SKIPING.........")
        return 'FAILED'

    movie_data = soup(movie_page, "lxml")
    movie_propreties = get_movie_propreties(movie_data)

    def prop(key, default='unknown'):
        # Same membership-then-index lookup as before, written once.
        return movie_propreties[key] if key in movie_propreties else default

    # The score board (or its attribute) may be absent; treat that like an
    # empty score instead of failing on None["audiencescore"].
    score_board = movie_data.find("score-board", {"class": "scoreboard"})
    raw_score = score_board.get("audiencescore") if score_board is not None else None
    score = int(raw_score) * 1e-2 if raw_score else 0

    genres = prop('Genre')
    original_language = prop('Original Language')
    director = prop('Director')
    writer = prop('Writer')
    date_theatre = prop('Release Date (Theaters)')
    date_streaming = prop('Release Date (Streaming)')
    box_office = prop('Box Office (Gross USA)', 0)
    duree = prop('Runtime', 0)

    # A page without a reviews section simply contributes no critic reviews.
    reviews_section = movie_data.find("div", {"id": "reviews"})
    reviews = []
    if reviews_section is not None:
        reviews = [elem.p.next.strip()
                   for elem in reviews_section.find_all("li")
                   if elem.p is not None]

    audience_url = movie_url + audiance_link
    try:
        movie_AD = uReq(audience_url).read()
    except HTTPError:
        print(f"ERROR can't access to server LINK : {audience_url} SKIPING.........")
        return 'FAILED'

    reviews_AD = soup(movie_AD, "lxml")
    audience_box = reviews_AD.find("div", {"id": "movieUserReviewsContent"})
    res = []
    if audience_box is not None:
        review_class = "audience-reviews__review js-review-text clamp clamp-8 js-clamp"
        for item in audience_box.findAll("li", {"class": "audience-reviews__item"}):
            text_tag = item.find("p", {"class": review_class})
            # Items without the expected paragraph are skipped.
            if text_tag is not None:
                res.append(text_tag.next)

    reviews = list(clean_df_column(reviews + res))

    movie = Movie(name=title,
                  rank=int(rank.next.split('%')[0]) * 1e-2,
                  genres=genres,
                  score=score,
                  reviews=reviews,
                  langage=original_language,
                  director=director,
                  writer=writer,
                  date_theatre=date_theatre,
                  date_streaming=date_streaming,
                  box_office=box_office,
                  duree=duree)
    movie = predict_score(movie, utils_predics)

    # send() is asynchronous: flush so the record actually leaves the buffer,
    # and always close the producer so its connections are not leaked.
    producer = KafkaProducer(bootstrap_servers='localhost:9092')
    try:
        producer.send(str(topic), bytes(movie.serialize(), encoding='utf-8'))
        producer.flush()
    finally:
        producer.close()
    return movie
def constraint1(x):
    """
    Constraint comparing the relative error before and after new alphas.

    Stores the candidate alphas ``x`` in ``temp_df['alpha']``, re-predicts the
    score and derives ``new_error = (new_score - old_score) / old_score``.
    Returns ``old_error - new_error`` when ``old_error`` is non-negative,
    otherwise ``new_error - old_error``.

    NOTE(review): ``temp_df``, ``wod_time``, ``wod_format``, ``old_score`` and
    ``old_error`` are free variables taken from the enclosing scope.
    """
    # The candidate vector is written straight into the shared frame.
    temp_df['alpha'] = x
    new_score = predict_score([temp_df, wod_time], wod_format)
    new_error = (new_score - old_score) / old_score

    if old_error >= 0.0:
        return old_error - new_error
    return new_error - old_error
def api_prop(m, topic, utils_predics):
    """
    Build a ``Movie`` from API metadata and tweets, score it and publish it.

    The full record for ``m`` is fetched through the module-level ``ia``
    client, tweets for the title hashtag are fetched through the module-level
    ``api`` client and used as reviews, and the serialized result of
    ``predict_score`` is sent to the Kafka ``topic``.

    :param m: search-result object exposing ``getID()`` and, optionally, a
        ``'rating'`` entry.
    :param topic: Kafka topic; converted with ``str()`` before sending.
    :param utils_predics: opaque helper object forwarded to ``predict_score``.
    :return: the object returned by ``predict_score``.
    """
    b = ia.get_movie(str(m.getID()))
    title = b['title']

    # rank and score are both the rating divided by ten.
    rating = float(m['rating']) / 10 if 'rating' in m else 0
    rank = rating
    score = rating

    # Empty lists fall back to 'unknown' instead of raising IndexError.
    genre_list = b['genres'] if 'genres' in b else []
    genres = genre_list[0] if genre_list else 'unknown'
    langage = b['languages'] if 'languages' in b else 'unknown'
    directors = b['directors'] if 'directors' in b else []
    director = directors[0]['name'] if directors else 'unknown'
    writers = b['writer'] if 'writer' in b else []
    writer = writers[0]['name'] if writers else 'unknown'

    # The same field feeds both dates.
    air_date = b['original air date'] if 'original air date' in b else 'unknown'
    date_theatre = air_date
    date_streaming = air_date

    # Only lookup/shape failures mean "no usable budget"; anything else
    # (e.g. KeyboardInterrupt) must propagate.
    try:
        box_office = str(b['box office']['Budget'].split(',')[0]).split('$')[1]
    except (KeyError, IndexError, AttributeError, TypeError):
        box_office = 'unknown'

    try:
        duree = b['runtimes'][0] if b['runtimes'][0] else 0
    except (KeyError, IndexError, TypeError):
        duree = 0

    hashtag = '#' + title.strip().lower().replace(" ", "")
    tweet = api.search(hashtag,
                       tweet_mode="extended",
                       count=50,
                       exclude_replies=True)
    reviews = list(clean_df_column([t.full_text for t in tweet]))

    movie = Movie(name=title,
                  rank=rank,
                  genres=genres,
                  score=score,
                  reviews=reviews,
                  langage=langage,
                  director=director,
                  writer=writer,
                  date_theatre=date_theatre,
                  date_streaming=date_streaming,
                  box_office=box_office,
                  duree=duree)
    movie = predict_score(movie, utils_predics)

    # send() is asynchronous: flush so the record actually leaves the buffer,
    # and always close the producer so its connections are not leaked.
    producer = KafkaProducer(bootstrap_servers='localhost:9092')
    try:
        producer.send(str(topic), bytes(movie.serialize(), encoding='utf-8'))
        producer.flush()
    finally:
        producer.close()
    return movie
data.append(single_row)

# Debug aid for checking the items in data[]:
# for _ in data:
#     print(_)

# List with 0 as 'absent' values
refined_data_list = refine_list(data)

# List with final score which is appended at the end of lists
ielts_scores_list = ielts_scores(refined_data_list)

# Ask for the four section scores in a fixed order.
print("Enter your scores please")
speaking_score, listening_score, reading_score, writing_score = (
    float(input(f"{section} score: "))
    for section in ("Speaking", "Listening", "Reading", "Writing")
)
user_scores = [speaking_score, listening_score, reading_score, writing_score]

print()
# The band score shown to the user is the plain mean of the four sections.
band_score = sum(user_scores) / len(user_scores)
print(f"User band score: {band_score}")

predicted_score = predict_score(ielts_scores_list, user_scores)
print(f"Predicted score using DTR: {float(predicted_score)}")
def read_wods(wod_format, new_wod_df, new_wod_bool):
    """
    Re-fit movement alphas against recorded AMRAP results.

    For every WOD row the description is parsed into (reps, movement) pairs,
    the recorded score is spread over the movements, the score is predicted
    with the current alphas, ``change_alphas`` proposes new alphas, and
    ``alpha_library.alpha_df`` plus ``Data/alpha_library.csv`` are updated.

    TODO:
        1.) Edit method to work with other WOD types, like RoundsForTime

    :param wod_format: enumeration that represents the WOD type
    :param new_wod_df: dataframe containing WOD information
    :param new_wod_bool: boolean that tells the function if this WOD is new
        or not (in the library or not); when it is ``False`` the WODs are
        read from ``Data/amrap_wod_memory.csv`` instead of ``new_wod_df``
    :return: None
    """
    if new_wod_bool is False:
        df_amrap = pd.read_csv('Data/amrap_wod_memory.csv',
                               names=['format', 'time_limit', 'score', 'WOD'])
    else:
        df_amrap = new_wod_df

    if wod_format != WodFormat.AMRAP:
        if wod_format == WodFormat.RoundsForTime:
            print('gimme some time...gosh\n')
        return

    for k in range(df_amrap.shape[0]):
        wod_row = df_amrap.iloc[k]
        wod_time = wod_row.time_limit
        wod_score = wod_row.score
        wod_str_pre = wod_row.WOD

        # Entries are '|'-terminated, so the piece after the last '|' is
        # dropped.
        reps_per_set = []
        movement_rows = []
        supported = True
        for entry in wod_str_pre.split('|')[0:-1]:
            entry = entry.strip()
            reps = int(re.search(r'[0-9]+', entry).group())
            reps_per_set.append(reps)

            movement = re.sub(r"\d+|\s", "", entry)
            if 'run' in movement:
                # Runs keep their unit text: only the leading rep count and
                # one leading whitespace character are removed.
                movement = re.sub(r"^\s", "", entry.lstrip(digits))

            # Exact-match lookup. The previous substring/regex membership
            # test could accept a name that the equality lookup below did
            # not find, and float() of the empty result then raised.
            known = alpha_library.alpha_df['movement'] == movement
            if not known.any():
                print('{} are not currently support\n'
                      'Skipping WOD:\n'
                      '{}\n'.format(movement, wod_str_pre))
                supported = False
                break
            alpha_temp = float(
                alpha_library.alpha_df.loc[known, 'alpha'].iloc[0])
            movement_rows.append((reps, reps, movement, alpha_temp))

        if not supported:
            continue
        if not movement_rows:
            # e.g. a description without any '|' terminator; nothing to fit.
            print('No movements could be parsed from WOD:\n'
                  '{}\n'.format(wod_str_pre))
            continue

        reps_per_movement_df = pd.DataFrame(
            movement_rows,
            columns=['reps_in_set', 'reps_performed', 'movement', 'alpha'])

        reps_per_round = sum(reps_per_set)
        rounds_complete = int(wod_score / reps_per_round)
        reps_last_round = wod_score % reps_per_round
        reps_per_movement_df['reps_performed'] = (
            reps_per_movement_df['reps_performed'] * rounds_complete)

        # Determine how many reps were completed in the last round of the
        # workout. Writes go through a single .iloc[row, col] indexer:
        # chained df[col].iloc[i] += ... may update a temporary copy.
        performed_col = reps_per_movement_df.columns.get_loc('reps_performed')
        set_sizes = reps_per_movement_df['reps_in_set'].tolist()
        num_movements = len(set_sizes)
        i = 0
        while reps_last_round > 0:
            idx = i % num_movements
            done = min(reps_last_round, set_sizes[idx])
            reps_per_movement_df.iloc[idx, performed_col] += done
            reps_last_round -= done
            i += 1

        movements = np.array(
            np.unique(reps_per_movement_df['movement'].values))
        library_movements = alpha_library.alpha_df['movement']
        movements_in_alpha_library = library_movements[
            library_movements.isin(movements)].values
        movements_not_in_alpha_library = set(movements) ^ set(
            movements_in_alpha_library)

        if len(movements_not_in_alpha_library) != 0:
            # Report the movements that are missing (the old message printed
            # the supported ones).
            print(
                'The following movements are NOT supported yet (Im workin on it!):\n'
                '{}\n'.format(sorted(movements_not_in_alpha_library)))
            continue

        # you have all movements' alphas, and now you can train/predict
        predicted_score_old_alphas = predict_score(
            [reps_per_movement_df, wod_time], wod_format)
        error_old_alphas = (predicted_score_old_alphas -
                            wod_score) / wod_score

        new_alphas = change_alphas(reps_per_movement_df, wod_score,
                                   error_old_alphas, wod_time, 0.1,
                                   wod_format)
        temp_df = reps_per_movement_df.copy()
        temp_df['alpha'] = new_alphas
        predicted_score_new_alpha = predict_score([temp_df, wod_time],
                                                  wod_format)
        if new_wod_bool:
            add_reps_and_alpha(temp_df)
        error_new_alphas = (predicted_score_new_alpha -
                            wod_score) / wod_score

        print('old alpha error: {}\n'
              'old alpha predicted score: {}\n'
              'new alpha error: {}\n'
              'new alpha predicted score: {}\n'
              'actual score: {}\n'.format(
                  error_old_alphas, predicted_score_old_alphas,
                  error_new_alphas, predicted_score_new_alpha, wod_score))

        # adjust the alpha_df with new alpha values (a movement repeated in
        # the WOD gets the mean of its fitted alphas)
        for movement in movements:
            alpha_temp = float(
                np.mean(temp_df.loc[temp_df['movement'] == movement]['alpha']))
            alpha_library.alpha_df.loc[
                alpha_library.alpha_df['movement'] == movement,
                'alpha'] = alpha_temp

        # Mode 'w' already truncates; the context manager closes the file
        # even if a write fails.
        with open('Data/alpha_library.csv', 'w') as library_file:
            for _, row in alpha_library.alpha_df.iterrows():
                library_file.write('{}, {}\n'.format(row['movement'],
                                                     row['alpha']))
def change_alphas(wod_df, old_score, old_error, wod_time, change_limit,
                  wod_format):
    """
    This function adjusts alphas in an attempt to make future predictions
    more accurate. The alphas are adjusted based on the difference between
    actual and predicted WOD scores.

    The alphas are optimized with SLSQP inside a band derived from
    ``change_limit``. If the optimized alphas predict worse than the old
    ones, the search is repeated once with ``change_limit`` reduced by 0.1
    (never below 0); if it is still worse with no room left, the original
    alphas are returned.

    :param wod_df: dataframe with ``movement``, ``alpha`` and
        ``reps_performed`` columns, one row per movement in the WOD
    :param old_score: recorded score the prediction is compared against
    :param old_error: relative error of the prediction made with the
        current alphas; its sign selects the search band
    :param wod_time: forwarded to ``predict_score`` together with the frame
    :param change_limit: fraction of each alpha used to build the bounds
    :param wod_format: forwarded to ``predict_score``
    :return: array of new alphas, one per row of ``wod_df``
    """
    num_unique_movements = len(np.unique(wod_df['movement']))
    # Private float copy so nothing below can write through to wod_df.
    x_init = np.array(wod_df['alpha'], dtype=float)
    reps = np.asarray(wod_df['reps_performed'])

    # NOTE(review): for a non-negative error the start point is shifted up
    # by change_limit before the band is built, so the band is not
    # symmetric with the negative-error case — confirm this is intended.
    if old_error >= 0.0:
        x0 = x_init + change_limit * x_init
        lower_bound = x0
        upper_bound = x0 + change_limit * x0
    else:
        x0 = x_init
        lower_bound = x_init - change_limit * x_init
        upper_bound = x_init
    bounds = list(zip(lower_bound, upper_bound))

    temp_df = wod_df.copy()

    def relative_error(alphas):
        # Every constraint uses the same call shape as the final check.
        temp_df['alpha'] = alphas
        new_score = predict_score([temp_df, wod_time], wod_format)
        return (new_score - old_score) / old_score

    def objective(x):
        # |old_score - sum(reps * alpha)|, computed directly instead of
        # building the expression as a string and passing it to eval().
        total = 0.0
        for i in range(len(x0)):
            j = i % num_unique_movements
            total += reps[j] * x[j]
            # Repeated movements share the alpha of their first occurrence.
            x[i] = x[j]
        return abs(old_score - total)

    def constraint1(x):
        new_error = relative_error(x)
        if old_error >= 0.0:
            return old_error - new_error
        return new_error - old_error

    def constraint2(x):
        # Currently not passed to the optimizer.
        new_error = relative_error(x)
        return new_error if old_error >= 0.0 else -new_error

    def constraint3(x):
        # Currently not passed to the optimizer.
        temp_df['alpha'] = x
        new_score = predict_score([temp_df, wod_time], wod_format)
        if old_error >= 0.0:
            return old_score - new_score
        return new_score - old_score

    cons = [{'type': 'ineq', 'fun': constraint1}]

    z = minimize(objective,
                 x0,
                 method='SLSQP',
                 bounds=bounds,
                 constraints=cons)
    new_alphas = z['x']

    # The old `while` never recomputed new_error and so could never exit.
    # The recursive call already retries with ever smaller limits, so a
    # single conditional retry is enough here.
    if abs(relative_error(new_alphas)) > abs(old_error):
        if change_limit > 0:
            new_alphas = change_alphas(wod_df, old_score, old_error,
                                       wod_time,
                                       max(change_limit - 0.1, 0.0),
                                       wod_format)
        else:
            # No room left to shrink the band: keep the alphas unchanged.
            new_alphas = x_init.copy()

    alpha_change = x_init - new_alphas
    print('old alphas: {} \n'
          'new alphas: {} \n'
          'change in alpha: {}'.format(
              list(x_init[0:num_unique_movements]),
              new_alphas[0:num_unique_movements], list(alpha_change)))
    return new_alphas
def new_wod_prediction():
    """
    Description
    -----------
    Coordinates all the proper method calls to enter a new WOD and get a
    prediction by calling the following functions:
        - add_wod_to_memory()
        - predict_score()
    Proper descriptions of the methods can be found where they are defined

    Important Variable Descriptions
    -------------------------------
    wod_obj will have the following components based on the type of workout
    entered:
        -AMRAP: wod_obj = List[wod_format, wod_df, wod_time]
            wod_format : enum
                Determines which type of WOD the workout is
            wod_df : Dataframe
                format : score : WOD
                RoundsForTime : 120 : 15 pushups|10 pullups|
            wod_time : string
                Represents the amount of time allotted for the AMRAP in
                mm:ss format
        -RoundsForTime: wod_obj = List[wod_format, wod_df, rounds]
            wod_format : enum
                Determines which type of WOD the workout is
            wod_df : Dataframe
                format : score : WOD
                RoundsForTime : 120 : 15 pushups|10 pullups|
            rounds : int
                The number of rounds that need to be completed in the WOD

    :return: None, but .csv files are used to store the new WOD and its
        results. The csv files have different formats depending on the
        format of the WOD:
            -AMRAPs -> wod format, time allotted, score, wod description
            -RoundsForTime -> wod format, rounds, score, wod description
    """
    wod_obj = add_wod_to_memory(new_wod=True)
    wod_format = wod_obj[0]

    alpha_df = pd.read_csv('Data/alpha_library.csv',
                           names=['movement', 'alpha'])

    # The two supported formats follow the same steps; they differ only in
    # which memory file the result is appended to.
    if wod_format == WodFormat.AMRAP:
        memory_path = 'Data/amrap_wod_memory.csv'
    elif wod_format == WodFormat.RoundsForTime:
        memory_path = 'Data/rft_wod_memory.csv'
    else:
        return

    wod_df = wod_obj[1]
    # Time allotted for an AMRAP, number of rounds for RoundsForTime.
    wod_limit = wod_obj[2]

    wod_str_pre = wod_df.iloc[0].WOD
    # Entries are '|'-terminated, so the piece after the last '|' is dropped.
    wod_str = wod_str_pre.split('|')[0:-1]

    prediction_df = pd.DataFrame(parse_wod(wod_str, alpha_df))
    prediction_df.columns = ['reps_in_set', 'movement', 'alpha']
    predicted_score = predict_score([prediction_df, wod_limit], wod_format)

    scoreReveal = input(
        "When would you like to see your predicted score? (now/after)\n")
    if scoreReveal == 'now':
        print('Predicted Score: {}\n'.format(predicted_score))
        actual_score = int(input('What was your score?\n'))
    else:
        actual_score = int(input('What was your score?\n'))
        print('Predicted Score: {}\n'.format(predicted_score))

    wod_df['score'] = [actual_score]
    read_wods(wod_format, wod_df, new_wod_bool=True)

    # Context manager so the appended row is flushed and the handle closed.
    with open(memory_path, 'a') as memory_file:
        memory_file.write('{}, {}, {}, {}\n'.format(wod_df.format[0],
                                                    wod_limit, actual_score,
                                                    wod_str_pre))