Ejemplo n.º 1
0
def clean_movie_genre(cleanTable,nprocesses):
	gt_genres = database_handler.select("SELECT DISTINCT genre FROM groundtruth_movies ORDER BY genre")
	rows = database_handler.select("SELECT id, movie_genre FROM {0} where movie_genre <> '' ".format(cleanTable))
	gtGenresDict = dict()
	i=0
	for gt_genre in gt_genres:
		gtGenresDict[i] = gt_genre["genre"]
		i+=1
	pool = mp.Pool(nprocesses)
	rows = list(rows)
	chunk_len = int(len(rows)/nprocesses)
	chunks = [rows[x:x+chunk_len] for x in xrange(0,len(rows),chunk_len)]
	for chunk in chunks:
		pool.apply_async(clean_movie_genre_in_parallel,args=(gtGenresDict,chunk,cleanTable))
	pool.close()
	pool.join()		
Ejemplo n.º 2
0
def clean_media_type(clean_table):
    media_type = "media_type"
    query = "SELECT id, {0} FROM {1}".format(media_type, clean_table)

    rows = database_handler.select(query)

    gt = ["DVD","BLURAY","VHS"]
    gt_dict = dict()
    gt_dict[0] = gt[0]
    gt_dict[1] = gt[1]
    gt_dict[2] = gt[2]

    for row in rows:
        media_seq = get_seq_from_row(row, ["media_type"]) 

        best_matching_index = ""
        best_matching_type = "" 

        if not media_seq:
            best_matching_type = ""
        else:
            best_matching_index = string_matcher.get_matching_seq(media_seq, gt_dict)
            best_matching_type = gt_dict[best_matching_index]
            #print ' {0} {1} => {2} '.format(row["id"],row["media_type"],best_matching_type)
            update_query = "UPDATE {0} SET {1}='{2}' WHERE id={3}".format(clean_table, media_type, best_matching_type, row["id"])
Ejemplo n.º 3
0
def clean_movie_in_parallel(groundtruth,rows,movie):
		for row in rows:
			select_query = "select * from {0} where genre='{1}'".format(groundtruth, row['genre']) 
			ground_truth_rows = database_handler.select(select_query)
			matching_seq = string_matcher.get_matching_mov_title(row['title'],ground_truth_rows,"title")
			id_as_string = str(row["id"])
			update_query = "update {0} set title='{1}' where id={2}".format(movie,matching_seq,id_as_string)
			database_handler.update(update_query)
Ejemplo n.º 4
0
def clean_customer(clean_table,nprocesses):
	query = "SELECT id, customer_firstname, customer_lastname, customer_street, customer_birthday, customer_gender FROM {0} where customer_birthday is not null".format(clean_table)
	rows = database_handler.select(query)
	gt_table = tables["gt_address"]
	pool = mp.Pool(nprocesses)
	rows = list(rows)
	chunk_len = int(len(rows)/nprocesses)
	chunks = [rows[x:x+chunk_len] for x in xrange(0,len(rows),chunk_len)]
	for chunk in chunks:
		pool.apply_async(clean_customer_in_parallel,args=(gt_table,chunk,clean_table,))
	pool.close()
	pool.join()
Ejemplo n.º 5
0
def clean_movie(clean_table,nprocesses): # if mode is set to "test", use corresponding test table 
	title_attr = "movie_title"
	genre_attr = "movie_genre"
	clean_select = "SELECT {0}, {1}, {2} FROM {3}".format("id", title_attr, genre_attr, clean_table)
	rows = database_handler.select(clean_select)
	gt_table = tables["gt_movies"]
	gt_select = "SELECT id, title, genre FROM {0}".format(gt_table)
	gt_rows = database_handler.select(gt_select)
	gt_seq_dict = dict()
	i = 0
	for gt_row in gt_rows:
		gt_seq_dict[i] = "".join(gt_row[col] for col in ["title", "genre"])
		i += 1
	pool = mp.Pool(nprocesses)
	rows = list(rows)
	chunk_len = int(len(rows)/nprocesses)
	chunks = [rows[x:x+chunk_len] for x in xrange(0,len(rows),chunk_len)]
	for chunk in chunks:
		pool.apply_async(clean_movie_in_parallel,args=(gt_seq_dict,gt_rows,chunk,clean_table,title_attr,genre_attr,))
	pool.close()
	pool.join()
Ejemplo n.º 6
0
def clean_customer(mode,nprocesses):
	customer = tables["ml_cus"]
	#if mode == "test":
	#	customer = tables["ml_cus_test"]
	query = "SELECT id, birthday, name, street, gender FROM {0}".format(customer)
	rows = database_handler.select(query)
	groundtruth = tables["gt_address"]
	pool = mp.Pool(nprocesses)
	rows = list(rows)
	chunk_len = int(len(rows)/nprocesses)
	chunks = [rows[x:x+chunk_len] for x in xrange(0,len(rows),chunk_len)]
	for chunk in chunks:
		pool.apply_async(clean_customer_in_parallel, args=(groundtruth,chunk,customer,))
	pool.close()
	pool.join()
Ejemplo n.º 7
0
def clean_movie(mode,nprocesses): 
	movie = tables["ml_mov"]
	#if mode == "test":
	#	movie = tables["ml_mov_test"]
	query = "SELECT * FROM {0}".format(movie)
	rows = database_handler.select(query)
	groundtruth = tables["gt_movies"]
	pool = mp.Pool(nprocesses)
	rows = list(rows)
	chunk_len = int(len(rows)/nprocesses)
	chunks = [rows[x:x+chunk_len] for x in xrange(0,len(rows),chunk_len)]
	for chunk in chunks:
		pool.apply_async(clean_movie_in_parallel, args=(groundtruth,chunk,movie,))
	pool.close()
	pool.join()
Ejemplo n.º 8
0
def clean_media_type(cleanTable,nprocesses):
	query = "SELECT id, media_type FROM {0} where media_type <> '' ".format(cleanTable)
	rows = database_handler.select(query)
	gt = ["DVD","BLURAY","VHS"]
	gtDict = dict()
	gtDict[0] = gt[0]
	gtDict[1] = gt[1]
	gtDict[2] = gt[2]
	pool = mp.Pool(nprocesses)
	rows = list(rows)
	chunk_len = int(len(rows)/nprocesses)
	chunks = [rows[x:x+chunk_len] for x in xrange(0,len(rows),chunk_len)]
	for chunk in chunks:
		#print len(chunk)
		pool.apply_async(clean_media_type_parallel,args=(gtDict,chunk,cleanTable))
	pool.close()
	pool.join()
Ejemplo n.º 9
0
def clean_customer_in_parallel(gt_table,rows,clean_table):
	for row in rows:
		gt_query = "SELECT firstname, lastname, street, gender FROM {0} WHERE birth_date='{1}'".format(gt_table, row["customer_birthday"])
		gt_rows = database_handler.select(gt_query)
		cus_seq = ''.join(row[col] for col in ["customer_firstname", "customer_lastname", "customer_street", "customer_gender"])
		gt_seq_dict = dict()
		i = 0
		for gt_row in gt_rows:
			gt_seq_dict[i] = ''.join(gt_row[col] for col in ["firstname", "lastname", "street","gender"])
			i += 1
		best_matching_index = string_matcher.get_matching_seq(cus_seq, gt_seq_dict)
		best_matching_row = gt_rows[best_matching_index]
		firstname = best_matching_row["firstname"].replace("'", "''")
		lastname = best_matching_row["lastname"].replace("'", "''")
		street = best_matching_row["street"].replace("'", "''")
		gender = best_matching_row["gender"].replace("'", "''")
		#print '{0} {1} {2} {3} => {4} {5} {6} {7}'.format(row['customer_firstname'],row['customer_lastname'],row['customer_street'],row['customer_gender'],firstname,lastname,street,gender)
		update_query = "UPDATE {0} SET customer_firstname='{1}', customer_lastname='{2}', customer_street='{3}', customer_gender='{4}', customer_city='NEW YORK' WHERE id={5}".format(clean_table,firstname,lastname,street,gender,row["id"])
		database_handler.update(update_query)
Ejemplo n.º 10
0
def clean_customer_in_parallel(groundtruth,rows,customer):
	for row in rows:
			gt_query = "SELECT firstname, lastname, street, gender FROM {0} WHERE birth_date='{1}'".format(groundtruth, row["birthday"])
			groundtruth_rows = database_handler.select(gt_query)
			# avoid function call: replace get_seq_from_row with:
			cus_seq = ''.join(row[col] for col in ["name", "street", "gender"])
			gt_seq_dict = dict()
			i = 0
			for gt_row in groundtruth_rows:
				# avoid function call: replace get_seq_from_row with:
				gt_seq_dict[i] = ''.join(gt_row[col] for col in ["firstname", "lastname", "street", "gender"]) 
				i += 1
			best_matching_index = string_matcher.get_matching_seq(cus_seq, gt_seq_dict)
			best_matching_row = groundtruth_rows[best_matching_index]
			name = ' '.join([best_matching_row["firstname"],best_matching_row["lastname"]])
			name = name.replace("'", "''")
			street = best_matching_row["street"]
			street = street.replace("'", "''")
			gender = best_matching_row["gender"]
			gender = gender.replace("'","''")
			update_query = "UPDATE {0} SET name='{1}', street='{2}', city='{3}',gender='{4}' WHERE id={5}".format(customer, name, street,"NEW YORK", gender, row["id"])
			database_handler.update(update_query)