def clean_movie_genre(cleanTable, nprocesses):
    """Normalize non-empty movie_genre values in `cleanTable` against the
    distinct genres of groundtruth_movies, fanning the rows out over
    `nprocesses` worker processes.

    :param cleanTable: name of the table whose movie_genre column is cleaned
    :param nprocesses: number of worker processes in the pool
    """
    gt_genres = database_handler.select("SELECT DISTINCT genre FROM groundtruth_movies ORDER BY genre")
    rows = database_handler.select("SELECT id, movie_genre FROM {0} where movie_genre <> '' ".format(cleanTable))
    # Index the ground-truth genres by position so the matcher can return an index.
    gtGenresDict = dict()
    for i, gt_genre in enumerate(gt_genres):
        gtGenresDict[i] = gt_genre["genre"]
    rows = list(rows)
    if not rows:
        return  # nothing to clean; also avoids a zero chunk length below
    pool = mp.Pool(nprocesses)
    # BUG FIX: max(1, ...) guards against chunk_len == 0 when
    # len(rows) < nprocesses, which made xrange(0, n, 0) raise ValueError.
    chunk_len = max(1, len(rows) // nprocesses)
    chunks = [rows[x:x + chunk_len] for x in xrange(0, len(rows), chunk_len)]
    for chunk in chunks:
        pool.apply_async(clean_movie_genre_in_parallel, args=(gtGenresDict, chunk, cleanTable))
    pool.close()
    pool.join()
def clean_media_type(clean_table):
    """Normalize the media_type column of `clean_table` to one of the
    canonical values DVD / BLURAY / VHS via fuzzy sequence matching.

    :param clean_table: name of the table whose media_type column is cleaned
    """
    media_type = "media_type"
    query = "SELECT id, {0} FROM {1}".format(media_type, clean_table)
    rows = database_handler.select(query)
    gt = ["DVD", "BLURAY", "VHS"]
    # Positional dict, as required by string_matcher.get_matching_seq.
    gt_dict = dict()
    gt_dict[0] = gt[0]
    gt_dict[1] = gt[1]
    gt_dict[2] = gt[2]
    for row in rows:
        media_seq = get_seq_from_row(row, ["media_type"])
        best_matching_type = ""
        if media_seq:
            best_matching_index = string_matcher.get_matching_seq(media_seq, gt_dict)
            best_matching_type = gt_dict[best_matching_index]
        update_query = "UPDATE {0} SET {1}='{2}' WHERE id={3}".format(clean_table, media_type, best_matching_type, row["id"])
        # BUG FIX: the UPDATE statement was built but never executed, so the
        # function had no effect; execute it like the sibling cleaners do.
        database_handler.update(update_query)
def clean_movie_in_parallel(groundtruth, rows, movie):
    """Worker: for each dirty row, fetch ground-truth rows of the same genre
    and overwrite the title with the best fuzzy match.

    :param groundtruth: name of the ground-truth movies table
    :param rows: chunk of dirty rows (dicts with 'id', 'title', 'genre')
    :param movie: name of the table to update
    """
    for row in rows:
        # BUG FIX: escape single quotes so genres/titles containing
        # apostrophes neither break nor inject into the SQL — consistent
        # with the '' escaping used in clean_customer_in_parallel.
        genre = row['genre'].replace("'", "''")
        select_query = "select * from {0} where genre='{1}'".format(groundtruth, genre)
        ground_truth_rows = database_handler.select(select_query)
        matching_seq = string_matcher.get_matching_mov_title(row['title'], ground_truth_rows, "title")
        matching_seq = matching_seq.replace("'", "''")
        id_as_string = str(row["id"])
        update_query = "update {0} set title='{1}' where id={2}".format(movie, matching_seq, id_as_string)
        database_handler.update(update_query)
def clean_customer(clean_table, nprocesses):
    """Clean customer attributes of `clean_table` in parallel against the
    ground-truth address table, matching rows by customer_birthday.

    :param clean_table: name of the table to clean
    :param nprocesses: number of worker processes in the pool
    """
    query = "SELECT id, customer_firstname, customer_lastname, customer_street, customer_birthday, customer_gender FROM {0} where customer_birthday is not null".format(clean_table)
    rows = list(database_handler.select(query))
    if not rows:
        return  # nothing to clean; also avoids a zero chunk length below
    gt_table = tables["gt_address"]
    pool = mp.Pool(nprocesses)
    # BUG FIX: max(1, ...) guards against chunk_len == 0 when
    # len(rows) < nprocesses, which made xrange(0, n, 0) raise ValueError.
    chunk_len = max(1, len(rows) // nprocesses)
    chunks = [rows[x:x + chunk_len] for x in xrange(0, len(rows), chunk_len)]
    for chunk in chunks:
        pool.apply_async(clean_customer_in_parallel, args=(gt_table, chunk, clean_table,))
    pool.close()
    pool.join()
def clean_movie(clean_table, nprocesses):
    """Clean movie_title / movie_genre of `clean_table` in parallel against
    the ground-truth movies table.

    :param clean_table: name of the table to clean
    :param nprocesses: number of worker processes in the pool
    """
    # if mode is set to "test", use corresponding test table
    title_attr = "movie_title"
    genre_attr = "movie_genre"
    clean_select = "SELECT {0}, {1}, {2} FROM {3}".format("id", title_attr, genre_attr, clean_table)
    rows = database_handler.select(clean_select)
    gt_table = tables["gt_movies"]
    gt_select = "SELECT id, title, genre FROM {0}".format(gt_table)
    gt_rows = database_handler.select(gt_select)
    # Pre-build "title+genre" sequences indexed by position for the matcher.
    gt_seq_dict = dict()
    for i, gt_row in enumerate(gt_rows):
        gt_seq_dict[i] = "".join(gt_row[col] for col in ["title", "genre"])
    rows = list(rows)
    if not rows:
        return  # nothing to clean; also avoids a zero chunk length below
    pool = mp.Pool(nprocesses)
    # BUG FIX: max(1, ...) guards against chunk_len == 0 when
    # len(rows) < nprocesses, which made xrange(0, n, 0) raise ValueError.
    chunk_len = max(1, len(rows) // nprocesses)
    chunks = [rows[x:x + chunk_len] for x in xrange(0, len(rows), chunk_len)]
    for chunk in chunks:
        pool.apply_async(clean_movie_in_parallel, args=(gt_seq_dict, gt_rows, chunk, clean_table, title_attr, genre_attr,))
    pool.close()
    pool.join()
def clean_customer(mode, nprocesses):
    """Clean the movielens customer table in parallel against the
    ground-truth address table.

    :param mode: currently unused — test-table switching is commented out
    :param nprocesses: number of worker processes in the pool
    """
    customer = tables["ml_cus"]
    #if mode == "test":
    #    customer = tables["ml_cus_test"]
    query = "SELECT id, birthday, name, street, gender FROM {0}".format(customer)
    rows = list(database_handler.select(query))
    if not rows:
        return  # nothing to clean; also avoids a zero chunk length below
    groundtruth = tables["gt_address"]
    pool = mp.Pool(nprocesses)
    # BUG FIX: max(1, ...) guards against chunk_len == 0 when
    # len(rows) < nprocesses, which made xrange(0, n, 0) raise ValueError.
    chunk_len = max(1, len(rows) // nprocesses)
    chunks = [rows[x:x + chunk_len] for x in xrange(0, len(rows), chunk_len)]
    for chunk in chunks:
        pool.apply_async(clean_customer_in_parallel, args=(groundtruth, chunk, customer,))
    pool.close()
    pool.join()
def clean_movie(mode, nprocesses):
    """Clean the movielens movie table in parallel against the ground-truth
    movies table.

    :param mode: currently unused — test-table switching is commented out
    :param nprocesses: number of worker processes in the pool
    """
    movie = tables["ml_mov"]
    #if mode == "test":
    #    movie = tables["ml_mov_test"]
    query = "SELECT * FROM {0}".format(movie)
    rows = list(database_handler.select(query))
    if not rows:
        return  # nothing to clean; also avoids a zero chunk length below
    groundtruth = tables["gt_movies"]
    pool = mp.Pool(nprocesses)
    # BUG FIX: max(1, ...) guards against chunk_len == 0 when
    # len(rows) < nprocesses, which made xrange(0, n, 0) raise ValueError.
    chunk_len = max(1, len(rows) // nprocesses)
    chunks = [rows[x:x + chunk_len] for x in xrange(0, len(rows), chunk_len)]
    for chunk in chunks:
        pool.apply_async(clean_movie_in_parallel, args=(groundtruth, chunk, movie,))
    pool.close()
    pool.join()
def clean_media_type(cleanTable, nprocesses):
    """Clean the non-empty media_type values of `cleanTable` in parallel
    against the canonical set DVD / BLURAY / VHS.

    :param cleanTable: name of the table whose media_type column is cleaned
    :param nprocesses: number of worker processes in the pool
    """
    query = "SELECT id, media_type FROM {0} where media_type <> '' ".format(cleanTable)
    rows = list(database_handler.select(query))
    if not rows:
        return  # nothing to clean; also avoids a zero chunk length below
    # Positional dict of canonical values, as the matcher expects.
    gtDict = dict(enumerate(["DVD", "BLURAY", "VHS"]))
    pool = mp.Pool(nprocesses)
    # BUG FIX: max(1, ...) guards against chunk_len == 0 when
    # len(rows) < nprocesses, which made xrange(0, n, 0) raise ValueError.
    chunk_len = max(1, len(rows) // nprocesses)
    chunks = [rows[x:x + chunk_len] for x in xrange(0, len(rows), chunk_len)]
    for chunk in chunks:
        pool.apply_async(clean_media_type_parallel, args=(gtDict, chunk, cleanTable))
    pool.close()
    pool.join()
def clean_customer_in_parallel(gt_table, rows, clean_table):
    """Worker: repair the name/street/gender fields of each customer row by
    matching it against ground-truth people sharing the same birthday, then
    write the best match (plus city 'NEW YORK') back to `clean_table`.
    """
    dirty_cols = ["customer_firstname", "customer_lastname", "customer_street", "customer_gender"]
    gt_cols = ["firstname", "lastname", "street", "gender"]
    for row in rows:
        gt_query = "SELECT firstname, lastname, street, gender FROM {0} WHERE birth_date='{1}'".format(gt_table, row["customer_birthday"])
        gt_rows = database_handler.select(gt_query)
        # Concatenate the dirty attributes into a single sequence to match on.
        cus_seq = ''.join(row[col] for col in dirty_cols)
        # Positional dict of ground-truth sequences for the matcher.
        gt_seq_dict = dict(
            (idx, ''.join(gt_row[col] for col in gt_cols))
            for idx, gt_row in enumerate(gt_rows))
        best_matching_row = gt_rows[string_matcher.get_matching_seq(cus_seq, gt_seq_dict)]
        # Double single quotes so the values survive SQL string literals.
        firstname, lastname, street, gender = (
            best_matching_row[col].replace("'", "''") for col in gt_cols)
        update_query = ("UPDATE {0} SET customer_firstname='{1}', customer_lastname='{2}', "
                        "customer_street='{3}', customer_gender='{4}', customer_city='NEW YORK' "
                        "WHERE id={5}").format(clean_table, firstname, lastname, street, gender, row["id"])
        database_handler.update(update_query)
def clean_customer_in_parallel(groundtruth, rows, customer):
    """Worker: rebuild name, street, city and gender of each customer row
    from the best-matching ground-truth person with the same birthday.
    """
    gt_cols = ["firstname", "lastname", "street", "gender"]
    for row in rows:
        gt_query = "SELECT firstname, lastname, street, gender FROM {0} WHERE birth_date='{1}'".format(groundtruth, row["birthday"])
        groundtruth_rows = database_handler.select(gt_query)
        # Inlined sequence building (avoids a get_seq_from_row call per row).
        cus_seq = ''.join(row[col] for col in ["name", "street", "gender"])
        gt_seq_dict = {}
        for idx, gt_row in enumerate(groundtruth_rows):
            gt_seq_dict[idx] = ''.join(gt_row[col] for col in gt_cols)
        match = groundtruth_rows[string_matcher.get_matching_seq(cus_seq, gt_seq_dict)]
        # Double single quotes so the values survive SQL string literals.
        name = ' '.join([match["firstname"], match["lastname"]]).replace("'", "''")
        street = match["street"].replace("'", "''")
        gender = match["gender"].replace("'", "''")
        update_query = "UPDATE {0} SET name='{1}', street='{2}', city='{3}',gender='{4}' WHERE id={5}".format(customer, name, street, "NEW YORK", gender, row["id"])
        database_handler.update(update_query)