def calculate_wins_per_team_per_season():
    """Get wins per season per team"""
    df_regular_season = get_table("t_original_regular_season_compact_results")
    df_wins_per_team_per_season = df_regular_season.groupby(
        ["season", "w_team_id"]).size().reset_index()
    df_wins_per_team_per_season.columns = ["season", "team_id", "wins"]
    write_table(df_wins_per_team_per_season, "wins_per_team_per_season")
def combine_datasets():
    poverty_table, _ = utils.read_table("poverty_data_clean.csv", True)
    crime_table, _ = utils.read_table("crime_data_clean.csv", True)
    rent_table, _ = utils.read_table("rent_data_no_dups.csv", True)
    combined_table = []
    combined_header = ["County", "State",
                       "Pov_Num_All", "Pov_Pct_All", "Median_Income",
                       "Crime_Rate_per_100000", "Murder", "Rape", "Robbery",
                       "Aggravated_Assault", "Burglary", "Larceny",
                       "Vehicle_Theft", "Arson", "Population", "Mean_Rent",
                       "Median_Rent", "Latitude", "Longitude"]
    # match rows across the three tables on the shared county name
    for poverty_row in poverty_table:
        for crime_row in crime_table:
            for rent_row in rent_table:
                if poverty_row[2] == crime_row[0] and crime_row[0] == rent_row[3]:
                    new_add = [poverty_row[2], poverty_row[1],
                               poverty_row[3], poverty_row[4], poverty_row[9],
                               crime_row[2], crime_row[3], crime_row[4],
                               crime_row[5], crime_row[6], crime_row[7],
                               crime_row[8], crime_row[9], crime_row[10],
                               crime_row[11], rent_row[9], rent_row[10],
                               rent_row[7], rent_row[8]]
                    combined_table.append(new_add)
    # drop duplicate (county, state) pairs, keeping the last occurrence
    no_dups = dict(((x[0], x[1]), x) for x in combined_table)
    new_table = list(no_dups.values())
    new_table.insert(0, combined_header)
    utils.write_table("combined_data.csv", new_table)
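# The triple nested loop above is O(|poverty| * |crime| * |rent|). A sketch
# of the same county-name join using dict lookups instead, assuming county
# names are effectively unique per table (last occurrence wins, mirroring
# the dedup step above); illustrative only, not the project's code:
def combine_datasets_indexed():
    poverty_table, _ = utils.read_table("poverty_data_clean.csv", True)
    crime_table, _ = utils.read_table("crime_data_clean.csv", True)
    rent_table, _ = utils.read_table("rent_data_no_dups.csv", True)
    crime_by_county = {row[0]: row for row in crime_table}
    rent_by_county = {row[3]: row for row in rent_table}
    combined = []
    for p in poverty_table:
        county = p[2]
        if county in crime_by_county and county in rent_by_county:
            c, r = crime_by_county[county], rent_by_county[county]
            combined.append([p[2], p[1], p[3], p[4], p[9],
                             c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9],
                             c[10], c[11], r[9], r[10], r[7], r[8]])
    return combined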
def calculate_losses_per_team_per_season():
    """Get losses per season per team"""
    df_regular_season = get_table("t_original_regular_season_compact_results")
    df_losses_per_team_per_season = df_regular_season.groupby(
        ["season", "l_team_id"]).size().reset_index()
    df_losses_per_team_per_season.columns = ["season", "team_id", "losses"]
    write_table(df_losses_per_team_per_season, "losses_per_team_per_season")
def calculate_seed_rank_per_team_per_season():
    df_seed_rank_per_team_per_season = get_table(
        "t_original_ncaa_tourney_seeds")
    # strip the leading region letter and the optional "a/b" suffix
    # (which might be of interest later on)
    df_seed_rank_per_team_per_season["seed_rank"] = \
        df_seed_rank_per_team_per_season["seed"].apply(
            lambda seed: int(seed[1:3]))
    df_seed_rank_per_team_per_season["seed_region"] = \
        df_seed_rank_per_team_per_season["seed"].apply(lambda seed: seed[0])
    df_seed_rank_per_team_per_season.drop("seed", axis=1, inplace=True)
    write_table(df_seed_rank_per_team_per_season,
                "seed_rank_region_per_team_per_season")
def normalize_combined():
    combined_table, header = utils.read_table("combined_data.csv", True)
    columns = []
    new_header = []
    # keep only the columns of interest
    for x in range(len(header)):
        if x not in [2, 6, 7, 8, 9, 10, 11, 12, 13, 16]:
            new_header.append(header[x])
            columns.append(utils.get_column(combined_table, x))
    # derived attribute: annual rent as a percentage of median income
    columns.append([
        round(columns[6][i] * 12 * 100 / columns[3][i], 1)
        for i in range(len(columns[0]))
    ])
    new_header.append("Pct_Income_as_Rent")
    # note: after the filter, indices 7/8 hold Latitude/Longitude, so the
    # derived Pct_Income_as_Rent column sits at index 9
    columns[2] = normalize_data(columns[2])  # Poverty
    columns[3] = normalize_data(columns[3])  # Median Income
    columns[4] = discretize_data(columns[4], 5)  # Crime Rate
    columns[5] = normalize_data(columns[5])  # Population
    columns[6] = normalize_data(columns[6])  # Rent
    columns[9] = normalize_data(columns[9])  # Rent as percent of income
    new_table = []
    for x in range(len(columns[0])):
        new_table.append([column[x] for column in columns])
    new_table.insert(0, new_header)
    utils.write_table("combined_data_normalized.csv", new_table)
    columns[2] = discretize_data(columns[2], 3)  # Poverty
    columns[3] = discretize_data(columns[3], 3)  # Median Income
    #columns[4] = discretize_data(columns[4], 3)  # Crime Rate
    columns[5] = discretize_data(columns[5], 5)  # Population
    columns[6] = discretize_data(columns[6], 3)  # Rent
    columns[9] = discretize_data(columns[9], 5)  # Rent as percent of income
    new_table = []
    for x in range(len(columns[0])):
        new_table.append([column[x] for column in columns])
    new_table.insert(0, new_header)
    utils.write_table("combined_data_discretized.csv", new_table)
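# normalize_data and discretize_data are called above but not defined in this
# snippet. Minimal sketches of what they presumably do, under two common
# assumptions: min-max scaling to [0, 1] and equal-width binning into labels
# 1..bins (hypothetical stand-ins, not the project's actual helpers):
def normalize_data_sketch(values):
    low, high = min(values), max(values)
    return [round((v - low) / (high - low), 4) for v in values]

def discretize_data_sketch(values, bins):
    low, high = min(values), max(values)
    width = (high - low) / bins
    # map each value to a bin label in 1..bins (max value clamps to bins)
    return [min(int((v - low) / width) + 1, bins) for v in values]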
def clean_rent():
    table = []
    infile = open("rent_data.csv", "r")
    lines = infile.readlines()
    for line in lines:
        newline = line.strip()  # removes whitespace characters
        values = newline.split(",")  # splits on comma
        #utils.convert_to_float(values)
        # drop Hawaii and Alaska rows
        if values[2] != "HI" and values[2] != "AK":
            table.append(values)
    utils.write_table("rent_data_clean.csv", table)
    # drop duplicate rows keyed on columns 1 and 3, keeping the last occurrence
    no_dups = dict(((x[1], x[3]), x) for x in table)
    new_table = list(no_dups.values())
    utils.write_table("rent_data_no_dups.csv", new_table)
    infile.close()
def calculate_wins_per_team_per_season_by_ot():
    """Get regular season wins split (binary) by OT"""
    df_regular_season = get_table("t_original_regular_season_compact_results")
    # aggregate wins in regulation
    df_wins_per_team_per_season_no_ot = \
        df_regular_season[df_regular_season["num_ot"] == 0]\
        .groupby(["season", "w_team_id"]).size().reset_index()
    # cosmetics
    df_wins_per_team_per_season_no_ot.rename(columns={
        "w_team_id": "team_id",
        0: "wins_no_ot"
    }, inplace=True)
    # aggregate wins in overtime
    df_wins_per_team_per_season_ot = \
        df_regular_season[df_regular_season["num_ot"] > 0]\
        .groupby(["season", "w_team_id"]).size().reset_index()
    # cosmetics
    df_wins_per_team_per_season_ot.rename(columns={
        "w_team_id": "team_id",
        0: "wins_ot"
    }, inplace=True)
    # join outer(!) to include teams that never or only won via OT
    df_wins_per_team_per_season_by_ot = pd.merge(
        df_wins_per_team_per_season_no_ot,
        df_wins_per_team_per_season_ot,
        on=["season", "team_id"],
        how="outer")
    # cosmetics: a missing side of the outer join means zero such wins
    df_wins_per_team_per_season_by_ot.fillna(0, inplace=True)
    df_wins_per_team_per_season_by_ot["wins_ot"] = \
        df_wins_per_team_per_season_by_ot["wins_ot"].astype(int)
    df_wins_per_team_per_season_by_ot["wins_no_ot"] = \
        df_wins_per_team_per_season_by_ot["wins_no_ot"].astype(int)
    write_table(df_wins_per_team_per_season_by_ot,
                "wins_per_team_per_season_by_ot")
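# Why the outer join above matters, on toy data (assumes pandas imported as
# pd; the team ids are made up): team 42 only ever won in overtime, so an
# inner join would drop it entirely, while the outer join keeps it with
# wins_no_ot filled to 0.
import pandas as pd

no_ot = pd.DataFrame({"season": [2019], "team_id": [7], "wins_no_ot": [20]})
ot = pd.DataFrame({"season": [2019], "team_id": [42], "wins_ot": [3]})
merged = pd.merge(no_ot, ot, on=["season", "team_id"], how="outer").fillna(0)
print(merged)  # both team 7 (wins_ot 0) and team 42 (wins_no_ot 0) appear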
def clean_poverty():
    table = []
    infile = open("poverty_data.csv", "r")
    lines = infile.readlines()
    for line in lines:
        newline = remove_quotes(line)
        newline = newline.strip()  # removes whitespace characters
        values = newline.split(",")  # splits on comma
        #utils.convert_to_float(values)
        # drop Hawaii and Alaska rows
        if values[1] != "HI" and values[1] != "AK":
            table.append(values)
    # strip the three leading junk characters (presumably a UTF-8 BOM) from
    # the first header field
    header = table.pop(0)
    header[0] = header[0][3:]
    table.insert(0, header)
    utils.write_table("poverty_data_clean.csv", table)
    infile.close()
def calculate_mean_stats_per_team_per_season():
    import re
    df_detailed_results = get_table("t_original_ncaa_tourney_detailed_results")
    df_results_winner = df_detailed_results[[
        'season', 'w_team_id', 'w_score', 'wfgm', 'wfga', 'wfgm3', 'wfga3',
        'wftm', 'wfta', 'wor', 'wdr', 'w_ast', 'wto', 'w_stl', 'w_blk', 'wpf'
    ]]
    df_results_loser = df_detailed_results[[
        'season', 'l_team_id', 'l_score', 'lfgm', 'lfga', 'lfgm3', 'lfga3',
        'lftm', 'lfta', 'lor', 'ldr', 'l_ast', 'lto', 'l_stl', 'l_blk', 'lpf'
    ]]
    # NOTE: str.lstrip("w_") strips any leading run of the characters 'w'
    # and '_', not the literal prefix "w_"; an anchored regex makes the
    # intent explicit ("season" stays untouched, "w_team_id"/"wfgm" both
    # lose their prefix).
    df_results_winner.columns = [
        re.sub(r"^w_?", "", col) for col in df_results_winner.columns
    ]
    df_results_loser.columns = [
        re.sub(r"^l_?", "", col) for col in df_results_loser.columns
    ]
    df_mean_stats_per_team_per_season = \
        df_results_winner.append(df_results_loser)\
        .groupby(["season", "team_id"]).mean().reset_index()
    write_table(df_mean_stats_per_team_per_season,
                "mean_stats_per_team_per_season")
def calculate_mean_score_per_team_per_season():
    """Get the average score per team per season"""
    # do not name this variable `pd`: that would shadow the pandas import
    df = get_table("t_original_regular_season_compact_results")
    # cover case team == winner
    df_scores_winner = df[["season", "w_team_id", "w_score"]]
    df_scores_winner.columns = ["season", "team_id", "score"]
    # cover case team == loser
    df_scores_loser = df[["season", "l_team_id", "l_score"]]
    df_scores_loser.columns = ["season", "team_id", "score"]
    # combine winner & loser frames
    df_scores_teams = df_scores_winner.append(df_scores_loser)
    df_mean_scores_per_team_per_season = df_scores_teams.groupby(
        ["season", "team_id"])["score"].mean().reset_index()
    df_mean_scores_per_team_per_season.columns = [
        "season", "team_id", "score_avg"
    ]
    write_table(df_mean_scores_per_team_per_season,
                "mean_score_per_team_per_season")
def calculate_ncaa_losses_per_team_by_ot():
    df_ncaa = get_table("t_original_ncaa_tourney_compact_results")
    # aggregate losses in regulation
    df_losses_per_team_historic_ncaa_no_ot = \
        df_ncaa[df_ncaa["num_ot"] == 0].groupby("l_team_id").size().reset_index()
    # cosmetics
    df_losses_per_team_historic_ncaa_no_ot.rename(columns={
        "l_team_id": "team_id",
        0: "losses_no_ot"
    }, inplace=True)
    # aggregate losses in overtime
    df_losses_per_team_historic_ncaa_ot = \
        df_ncaa[df_ncaa["num_ot"] > 0].groupby("l_team_id").size().reset_index()
    # cosmetics
    df_losses_per_team_historic_ncaa_ot.rename(columns={
        "l_team_id": "team_id",
        0: "losses_ot"
    }, inplace=True)
    df_losses_per_team_historic_ncaa_by_ot = pd.merge(
        df_losses_per_team_historic_ncaa_no_ot,
        df_losses_per_team_historic_ncaa_ot,
        on=["team_id"],
        how="outer")
    # cosmetics
    df_losses_per_team_historic_ncaa_by_ot.fillna(0, inplace=True)
    df_losses_per_team_historic_ncaa_by_ot["losses_ot"] = \
        df_losses_per_team_historic_ncaa_by_ot["losses_ot"].astype(int)
    df_losses_per_team_historic_ncaa_by_ot["losses_no_ot"] = \
        df_losses_per_team_historic_ncaa_by_ot["losses_no_ot"].astype(int)
    write_table(df_losses_per_team_historic_ncaa_by_ot,
                "ncaa_losses_per_team_by_ot")
def clean_crime():
    table = []
    infile = open("crime_data.csv", "r")
    lines = infile.readlines()
    header = lines.pop(0).strip().split(",")
    header.insert(1, "State_Name")
    table.insert(0, header)
    for line in lines:
        newline = remove_quotes(line)
        newline = newline.strip()  # removes whitespace characters
        values = newline.split(",")  # splits on comma
        # split the trailing ", XX" state abbreviation out of the county field
        values.insert(1, values[0][-2:])
        values[0] = values[0][:-3]
        #utils.convert_to_float(values)
        # drop Hawaii and Alaska rows
        if values[1] != "HI" and values[1] != "AK":
            table.append(values)
    utils.write_table("crime_data_clean.csv", table)
    infile.close()
def singular_value_decomposition_pp(data, latent_factors_size, epochs):
    """
    Based on the code available in:
    https://github.com/cheungdaven/recommendation
    Based on the paper:
    https://people.engr.tamu.edu/huangrh/Spring16/papers_course/matrix_factorization.pdf
    """
    random.seed()
    users_items, users, items = data_treatment.retrieve_guide_features(
        data['Historic Data'])
    matrix_users_items = data_treatment.mount_matrix_user_item(users_items)
    ratings_mean = utils.measure_average_rating(data['Historic Data'])
    # a matrix users x items
    historic_rating_matrix = model.generate_historic_data_matrix(
        data['Historic Data'], 'users', users, items, ratings_mean)
    #users_mean = utils.measure_row_mean(historic_rating_matrix)
    #historic_rating_matrix = utils.subtraction_matrix_row_mean(historic_rating_matrix, users_mean)
    # users latent matrix
    p_matrix = algebric_operations.generate_random_matrix(
        latent_factors_size, len(users))
    # items latent matrix
    q_matrix = algebric_operations.generate_random_matrix(
        latent_factors_size, len(items))
    # implicit item-factor matrix (the y_j factors of SVD++)
    y_matrix = algebric_operations.generate_random_matrix(
        len(items), latent_factors_size)
    ratings = calculate_first_estimation(users, users_items,
                                         latent_factors_size, y_matrix, items)
    residual_items = [random.uniform(0, 1) for item in range(0, len(items))]
    residual_users = [random.uniform(0, 1) for user in range(0, len(users))]
    for epoch in range(epochs):
        for row in matrix_users_items:
            user, item = row[0], row[1]
            user_index, item_index = users[user], items[item]
            amount_items = len(users_items[user])
            # divide every value of the user's array by the square root of
            # the number of items the user has rated
            ratings[user] = list(
                map(lambda value: value / math.sqrt(amount_items),
                    ratings[user]))
            # retrieve all the values of a specific column
            column_array = retrieve_column(p_matrix, users[user])
            ratings[user] = algebric_operations.sum_two_arrays(
                ratings[user], column_array)
            predicted_rating = ratings_mean + residual_items[item_index] \
                + residual_users[user_index] + svd_prediction(
                    ratings[user], retrieve_column(q_matrix, item_index))
            measured_error = historic_rating_matrix[user_index][item_index] \
                - predicted_rating
            # error_metric(historic_rating_matrix[users[user]][item_index], predicted_rating)
            # cost O(n)
            p_matrix = _update_p_matrix(p_matrix, q_matrix, user_index,
                                        item_index, measured_error)
            # cost O(n)
            q_matrix = _update_q_matrix(q_matrix, p_matrix, user_index,
                                        item_index, user, amount_items,
                                        ratings, measured_error)
            # reconstruction matrix - this will be the closest to the
            # original matrix - cost O(n**2)
            y_matrix = _update_y_matrix(y_matrix, q_matrix, users_items, user,
                                        items, amount_items, measured_error)
            # cost O(1)
            residual_items = _update_residual_items(residual_items,
                                                    item_index, measured_error)
            # cost O(1)
            residual_users = _update_residual_users(residual_users,
                                                    user_index, measured_error)
        print(
            svd_rmse(historic_rating_matrix, matrix_users_items, users, items,
                     ratings_mean, residual_users, residual_items, ratings,
                     q_matrix, y_matrix, users_items, latent_factors_size))
    predictions = make_prediction(historic_rating_matrix,
                                  data['Prediction Data'], ratings,
                                  ratings_mean, users, items, q_matrix,
                                  residual_users, residual_items, users_items,
                                  y_matrix, latent_factors_size)
    for index, prediction in enumerate(predictions):
        data['Prediction Data'][index].append(str(prediction))
    data['Prediction Data'].insert(0, ['UserId', 'ItemId', 'Prediction'])
    utils.write_table(data['Prediction Data'], "Outputs/predictions.txt")
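# For reference, the loop above implements the SVD++ prediction
#   r_hat(u, i) = mu + b_i + b_u + q_i . (p_u + |N(u)|^(-1/2) * sum_{j in N(u)} y_j)
# where mu is ratings_mean, b_i/b_u are the residual terms, and the
# parenthesized sum is what `ratings[user]` accumulates. A minimal sketch of
# what svd_prediction presumably computes (a plain dot product of two
# equal-length factor vectors; hypothetical stand-in, not the real helper):
def svd_prediction_sketch(user_factors, item_factors):
    return sum(p * q for p, q in zip(user_factors, item_factors))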
def tick_attendence(self, img_name=None, save_annotated=True,
                    add_vector=True, n_neighbors=1):
    global Label_test
    global location_list
    global vector_list
    global mode

    def line_select_callback(click, release):
        global Label_test
        global location_list
        global target
        row1, col2, row2, col1 = (int(click.ydata), int(release.xdata),
                                  int(release.ydata), int(click.xdata))
        Label_test.append(int(target))
        location_list.append((row1, col2, row2, col1))
        print(special_layout(
            f"Added {self.dict_[str(target)]['name']} to annotated image.\n"
            f"Amount of targets: {len(location_list)}"))
        plt.close()

    def onclick(event):
        global Label_test
        global location_list
        global vector_list
        global mode
        col, row = event.xdata, event.ydata
        for i in range(len(location_list)):
            row1, col2, row2, col1 = location_list[i]
            if row1 < row < row2 and col1 < col < col2:
                if mode == '2':
                    try:
                        correction = input(special_layout(
                            f"You selected {self.dict_[str(Label_test[i])]['name']} "
                            f"({Label_test[i]})\n***Correct -> 1 No correction -> 0"))
                        if int(correction):
                            correct_label = input(special_layout(
                                f"Who is this?:\n\n"
                                f"{dict_5row_layout(self.dict_, 'name', blank=15, each_row=5, count_value=False)}"
                                f"\n***Please type a number"))
                            print(special_layout(
                                f"{self.dict_[str(Label_test[i])]['name']} -> "
                                f"{self.dict_[str(correct_label)]['name']}"))
                            Label_test[i] = correct_label
                    except:
                        pass
                    break
                elif mode == '3':
                    try:
                        delete = input(special_layout(
                            f"Confirm deleting {self.dict_[str(Label_test[i])]['name']} "
                            f"({Label_test[i]}) at [{row1}:{row2},{col1}:{col2}]?"
                            f"\n***yes -> 1 no -> 0"))
                        if int(delete):
                            Label_test.pop(i)
                            location_list.pop(i)
                            vector_list.pop(i)
                    except:
                        pass
                    break
        plt.close()

    def toggle_selector(event):
        toggle_selector.RS.set_active(True)

    def object_mode_change(event):
        global target
        global mode
        if event.key == '1':
            mode = '1'
            print(special_layout("Add label on annotated image."))
            try:
                target = input(special_layout(
                    f"Select target label:\n\n"
                    f"{dict_5row_layout(self.dict_, 'name', blank=15, each_row=5, count_value=False)}"))
                print(f"You will annotate {self.dict_[str(target)]['name']} ({target})")
                plt.close()
            except:
                pass
        elif event.key == '2':
            mode = '2'
            print(special_layout("Change label on annotated image."))
            plt.close()
        elif event.key == '3':
            mode = '3'
            print(special_layout("Delete label on annotated image."))
            plt.close()
        elif event.key == 'q':
            mode = 'q'
            print(special_layout("Finish correction..."))
            plt.close()
        else:
            print(special_layout(
                "Please press one of the following keys:\n\n"
                "Add annotation -> 1\nClick to show and change a label -> 2\n"
                "Delete annotation -> 3\nExit correction -> q"))

    # resolve the image name, falling back to the newest class image
    if img_name is None:
        try:
            img_name = change_image_name(self.classname)
        except:
            img_name = sorted(
                Path(f"./data/{self.classname}/image/class").glob("*.jpg"))[-1].name
    img_path = f'./data/{self.classname}/image/class/{img_name}'
    print(special_layout(f"Detect and encode faces on image ({img_name})"))
    array = load_image(img_path)
    location_list, vector_list = face_location_encoding(array)
    Label_test = list(face_prediction(self.classname, vector_list))
    annotated = draw_box(load_image(img_path), location_list, False,
                         Label_test, self.dict_)
    # let the user flip between the individual and the previous model
    count = 0
    while True:
        print(special_layout("Showing the annotated image...\nEnter q"))
        fig, ax = plt.subplots(1)
        plt.imshow(annotated)
        plt.show()
        if count % 2 == 0:
            individual = input(special_layout("Try the individual model?\n1: yes 0: no"))
            if int(individual):
                Label_test = list(face_prediction(self.classname, vector_list,
                                                  only_individual=True))
                annotated = draw_box(load_image(img_path), location_list,
                                     False, Label_test, self.dict_)
            else:
                break
        else:
            back = input(special_layout("Try the previous model?\n1: yes 0: no"))
            if int(back):
                Label_test = list(face_prediction(self.classname, vector_list))
                annotated = draw_box(load_image(img_path), location_list,
                                     False, Label_test, self.dict_)
                plt.close()
            else:
                break
        count += 1
    # interactive correction loop
    print(special_layout("Showing the annotated image...\n"
                         "Press H to see the instructions :)"))
    mode = '2'
    while True:
        annotated = draw_box(load_image(img_path), location_list, False,
                             Label_test, self.dict_)
        fig, ax = plt.subplots(1)
        plt.imshow(annotated)
        if mode == '1':
            toggle_selector.RS = RectangleSelector(
                ax, line_select_callback,
                drawtype='box', useblit=True,
                button=[1], minspanx=5, minspany=5,
                spancoords='pixels', interactive=True)
            plt.connect('key_press_event', toggle_selector)
            plt.connect('key_press_event', object_mode_change)
        elif mode == '2' or mode == '3':
            Cursor(ax,
                   horizOn=False,  # visibility of the horizontal line
                   vertOn=False)   # visibility of the vertical line
            fig.canvas.mpl_connect('button_press_event', onclick)
            plt.connect('key_press_event', object_mode_change)
        plt.show()
        if mode == 'q':
            break
    if save_annotated:
        create_annotated_dir(self.classname)
        plt.imsave(img_path.replace('class', 'annotated'), annotated)
    # write table
    write_table(self.dict_, self.classname, Label_test, img_name)
    # add vectors / modelling
    if add_vector:
        vector_correct = input(special_layout(
            "Add all faces into our knn model?\n***yes -> 1 no -> 0"))
        if int(vector_correct) != 1:
            # not confirmed yet: run another correction round
            mode = '2'
            while True:
                annotated = draw_box(load_image(img_path), location_list,
                                     False, Label_test, self.dict_)
                fig, ax = plt.subplots(1)
                plt.imshow(annotated)
                if mode == '1':
                    toggle_selector.RS = RectangleSelector(
                        ax, line_select_callback,
                        drawtype='box', useblit=True,
                        button=[1], minspanx=5, minspany=5,
                        spancoords='pixels', interactive=True)
                    plt.connect('key_press_event', toggle_selector)
                    plt.connect('key_press_event', object_mode_change)
                elif mode == '2' or mode == '3':
                    Cursor(ax, horizOn=False, vertOn=False)
                    fig.canvas.mpl_connect('button_press_event', onclick)
                    plt.connect('key_press_event', object_mode_change)
                plt.show()
                if mode == 'q':
                    break
        add_vector_location_img(self.dict_, self.classname, vector_list,
                                Label_test, location_list, img_name)
        vector_train = []
        label_train = []
        print(special_layout("Vector amount summary:"))
        print(col_layout('Label', 'Vector(individual) amount',
                         'Vector(class) amount', 'Total'))
        for label, each_dict in self.dict_.items():
            total = (len(each_dict['vector(individual)'])
                     + len(each_dict['vector(class)']))
            print(col_layout(str(label) + '.' + each_dict['name'],
                             len(each_dict['vector(individual)']),
                             len(each_dict['vector(class)']), total))
            vector_train = (vector_train + each_dict['vector(individual)']
                            + each_dict['vector(class)'])
            label_train += [int(label) for i in range(total)]
        knn_modelling(self.classname, vector_train, label_train,
                      n_neighbors=n_neighbors)
    # final: write the label dictionary back to disk
    with open(json_path, 'w') as doc:
        doc.write(json.dumps(self.dict_))
    # reminder
    print(special_layout(f'Renewed label dictionary: {json_path}'))
def calculate_ncaa_losses_per_team():
    """Get all NCAA losses per team"""
    df_ncaa = get_table("t_original_ncaa_tourney_compact_results")
    df_ncaa_losses_per_team = df_ncaa.groupby("l_team_id").size().reset_index()
    df_ncaa_losses_per_team.columns = ["team_id", "losses"]
    write_table(df_ncaa_losses_per_team, "ncaa_losses_per_team")
        # body of the per-message loop (the enclosing loop, `try:`, and
        # variable setup are not shown in this fragment)
        arg_map = {'name': nick}
        rst = sales_solr.sales_search(arg_map, page_index, countofpage,
                                      solr_ip_port)
        print(rst)
        lines = []
        lines.append(subject.strip())
        lines.append(arg_map['name'].strip())
        lines.append(str(rst[0]))
        if rst[0]:
            total += 1
        num += 1
        result = parse_eml(msg)
        # pdb.set_trace()
        lines.append(result.get(u'联系人', '').strip())  # contact person
        lines.append(result.get(u'手机', '').strip())  # mobile phone
        lines.append(result.get(u'座机', '').strip())  # landline
        lines.append(result.get(u'地址', '').strip())  # address
        lines.append(result.get('email', '').strip())
        linecsv.append(','.join(lines))
        # print '*****************************************'
    except:
        pass

print(total)
print(num)
with open('d:/naren/test.csv', 'wb') as file:
    file.writelines('\n'.join(linecsv).encode('utf8'))
emlutils.write_table(linecsv)
server.quit()
end = time.time()
print(end - start)
def non_negative_matrix_factorization(data, latent_factors_size, epochs,
                                      output_file=None, test=False):
    """
    Based on the code available in:
    https://github.com/cheungdaven/recommendation/blob/master/recSysNMF.py
    We also used as a guide:
    https://blog.acolyer.org/2019/02/18/the-why-and-how-of-nonnegative-matrix-factorization/
    Class 08 - Collaborative Filtering: Matrix Factorization
    """
    random.seed()
    users_items, users, items, users_ratings, items_ratings = \
        data_treatment.retrieve_guide_features(data['Historic Data'])
    ratings_mean = utils.measure_average_rating(data['Historic Data'])
    # users latent matrix
    p_matrix = algebric_operations.generate_random_matrix(
        latent_factors_size, len(users))
    # items latent matrix
    q_matrix = algebric_operations.generate_random_matrix(
        latent_factors_size, len(items))
    epochs_rmse = []
    for epoch in range(epochs):
        for row in data['Historic Data']:
            user, item, historic_rating = row[0], row[1], row[2]
            user_index, item_index = users[user], items[item]
            error = float(historic_rating) - (
                users_ratings[user] + items_ratings[item] + nmf_prediction(
                    retrieve_column(p_matrix, user_index),
                    retrieve_column(q_matrix, item_index))) / 3
            p_matrix = _update_matrixes(p_matrix, q_matrix, user_index,
                                        item_index, error)
            q_matrix = _update_matrixes(q_matrix, p_matrix, item_index,
                                        user_index, error)
        epochs_rmse.append(
            measure_rmse(data['Historic Data'], p_matrix, q_matrix, users,
                         items, users_ratings, items_ratings))
        print(epoch, epochs_rmse[-1])
    predictions = make_prediction(data['Prediction Data'], p_matrix, q_matrix,
                                  ratings_mean, users, items, users_ratings,
                                  items_ratings)
    if test:
        return predictions, epochs_rmse
    for index, prediction in enumerate(predictions):
        data['Prediction Data'][index].append(str(prediction))
    data['Prediction Data'].insert(0, ['UserId', 'ItemId', 'Prediction'])
    utils.write_table(data['Prediction Data'], output_file)
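# _update_matrixes is not defined in this snippet. A minimal sketch of the
# usual SGD step it presumably performs, with hypothetical learning-rate and
# regularization constants (the project's real helper and values may differ):
LEARNING_RATE = 0.005  # hypothetical step size
REGULARIZATION = 0.02  # hypothetical L2 penalty

def _update_matrixes_sketch(matrix, other_matrix, index, other_index, error):
    # matrices are latent_factors_size x n, so column `index` holds one
    # user's (or item's) factor vector; nudge each factor against the error
    # gradient, shrinking it slightly to discourage overfitting
    for factor in range(len(matrix)):
        gradient = (error * other_matrix[factor][other_index]
                    - REGULARIZATION * matrix[factor][index])
        matrix[factor][index] += LEARNING_RATE * gradient
    return matrix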
def main():
    table = utils.read_table("auto-data-clean.txt")
    for row in table:
        del row[-2]
    utils.write_table("auto-data-no-names.txt", table)