def load_data_without_nouns() : 
	'''
	stores breadcrumbs, urls and corresponding id's for products without nouns
	in breadcrumbs_file, urls_file and ids_file respectively
	
	:expects: 
		None
	:returns:
		None
	'''

	print("Log : load_data_without_nouns : started")
	####################
	# FILE DECLARATIONS
	####################

	# store in a location where it can be accessed the fastest
	mysql_output_file = working_directory + 'data/incorrect_nouns/mysql_output.txt'	# temporary file used to store
																					# the output of an sql query

	ag_output_file = working_directory +'data/incorrect_nouns/ag_output_file.txt'	# used to store the output of 
																					# an "ag (grep)" command

	breadcrumbs_file = working_directory + 'data/incorrect_nouns/breadcrumbs.txt'	# used for stroing the breadcrumbs
																					# for products with no nouns

	urls_file = working_directory + 'data/incorrect_nouns/urls.txt'					# used for storing the url's for
																					# products with no nouns so that
																					# they can be used to retrieve 
																					# breadcrumbs in the case none are
																					# available

	ids_file = working_directory + 'data/incorrect_nouns/ids.txt'					# used for storing id's for
																					# corresponding products 
	# cleaning files
	remove_file(mysql_output_file, supress_output=True)
	remove_file(ag_output_file)
	remove_file(breadcrumbs_file)
	remove_file(urls_file)
	remove_file(ids_file)

	####################################################################################################
	# OBTAINING PRODUCT PMI's FROM "flipkart_listing_products" TABLE WHICH DO NOT HAVE A NOUN ASSOCIATED
	####################################################################################################

	# the pmi's for the proucts not having a noun are stored in the file "ag_output_file.txt, along with their id's"
	print('Log : load_data_without_nouns : reading products with no nouns associated\n\t...')
	i = 0

	# initializing initial value of j
	j = 1000000
	# while number_of_products/j > 10 :
	# 	j *= 10

	# maximizing the amount of data read from the mysql server in one query
	# since fetching data from the server is computationally expensive
	while j > 0 : 
		while i < number_of_products :
			mysql_command = 'select id, pmi from ' + table_name + ' where id >= ' + str(i) + ' and id < ' + str(i + j) + ' into outfile "' + mysql_output_file + '"'
			mysql_comm.run_mysql_command(cursor, mysql_command, print_output=False)	
			i += j
			os.system("ag -v '\"noun\"' " + mysql_output_file + " >> " + ag_output_file)
			os.system('rm ' + mysql_output_file)
			print_string = '\treading upto id : ' + str(i) + ' / ' + str(number_of_products) 
			print(print_string, end='\r')
			sys.stdout.flush()
		i -= j
		j = j / 10

	#################################################################################
	# OBTAINING NOUNS AND BREADCRUMBS FOR PRODUCTS WHICH DO NOT HAVE NOUNS ASSOCIATED
	#################################################################################

	print('Log : load_data_without_nouns : Loading breadcrumbs and urls\n\t...')
	os.system("ag -o '\"breadcrumbs[^\]]*\]' " + ag_output_file  + " >> " + breadcrumbs_file)
	os.system("ag -o '\"url\":\"[^\"]*\"' " + ag_output_file + " >> " + urls_file)

	##########################################################
	# OBTAINING AND STORING CORRESPONDING ID's IN THE IDS FILE
	##########################################################

	print('Log : load_data_without_nouns : getting associated ids\n\t...')
	number_of_products_without_nouns = ''
	with open(ag_output_file, 'r') as f : 
		number_of_products_without_nouns = len(f.read().split('\n'))
	with open(ag_output_file, 'r') as f :
		ids = [] 
		lines = f.read().split('\n')
		for line in lines : 
			ids.append(line.split('\t')[0].split(':')[-1])
		print("\tgot the ids")

		with open(ids_file, 'w') as f :
			f.write('')
		with open(ids_file, 'a') as f : 
			for id in ids : 
				f.write(str(id) + '\n') 
		
		print('\tdone writing ids to ids_file')
		# print('Log : load_data_without_nouns : getting breadcrumbs and urls')
		# count = 0
		# for id in ids : 
		# 	mysql_command = 'select pmi from flipkart_listing_products where id = ' + str(id) + ' into outfile "' + mysql_output_file + '"'
		# 	mysql_comm.run_mysql_command(cursor, mysql_command, print_output=False)
		# 	# if you get an error here, please install "ag (the silver searcher)"
		# 	os.system("ag -o '\"breadcrumbs[^\]]*\]' " + mysql_output_file  + " >> " + breadcrumbs_file)
		# 	os.system("ag -o '\"url\":\"[^\"]*\"' " + mysql_output_file + " >> " + urls_file)
		# 	os.system('rm ' + mysql_output_file)
		# 	count += 1
		# 	print('\t' + str(count) + ' / ' + str(number_of_products_without_nouns), end='\r')
		# 	sys.stdout.flush()

	print('Log : load_data_without_nouns number of products without nouns : ' + str(number_of_products_without_nouns))
	print('Log : load_data_without_nouns : DONE')
output_file = working_directory + 'temp/output.txt'

############################################
# INITIALIZING CONNECTION WITH MYSQL SERVER
############################################

# before initiating this part, make sure that the mysqldb server is running
# and the database is loaded as the name "t

if len(sys.argv) == 1 : 
	mysql_terminal = MySQLdb.connect('localhost')
else :
	mysql_terminal = MySQLdb.connect(sys.argv[1])
cursor = mysql_terminal.cursor()

mysql_comm.run_mysql_command(cursor, 'USE ' + database_name)
mysql_comm.run_mysql_command(cursor, 'SHOW TABLES')


#####################################################################
# CALCULATING NUMBER OF ROWS IN THE "flipkart_listing_products" TABLE
#####################################################################

# uncomment the part below if you don't know the number of rows in the table
# (keeping in mind that it might take a considerable amount of time to do so
# for a large table)
# for the convenience, this has already been done once and is stored as 
# number_of_products = 740826729

# print('calculating the number of rows in the table ...')
# number_of_products = int(mysql_comm.run_mysql_command(cursor, 'SELECT COUNT(*) FROM flipkart_listing_products', print_output=False))
def load_data_with_nouns() : 
	'''
	stores nouns, breadcrumbs and corresponding id's for products with nouns
	in nouns_file, breadcrumbs_file and ids_file respectively
	
	:expects: 
		None
	:returns:
		None
	'''

	###################
	# FILE DECLARATIONS
	###################

	print('Log : load_data_with_nouns : starting')

	# store in a location where it can be accessed the fastest
	mysql_output_file = working_directory + 'data/correct_nouns/mysql_output.txt'	# temporary file used to store
																					# the output of an sql query

	ag_output_file = working_directory +'data/correct_nouns/ag_output_file.txt'		# used to store the output of 
																					# an "ag (grep)" command

	nouns_file = working_directory + 'data/correct_nouns/nouns.txt'					# used for nouns

	breadcrumbs_file = working_directory + 'data/correct_nouns/breadcrumbs.txt'		# used for storing breadcrumbs
																					# for products that have nouns

	ids_file = working_directory + 'data/correct_nouns/ids.txt'						# used for storing id's for 
																					# correspoonding products

	# cleaning files
	remove_file(mysql_output_file, supress_output=True)
	remove_file(ag_output_file)
	remove_file(breadcrumbs_file)
	remove_file(nouns_file)
	remove_file(ids_file)

	################################################################################################
	# OBTAINING PRODUCT PMI's FROM "flipkart_listing_products" TABLE WHICH DO HAVE A NOUN ASSOCIATED
	################################################################################################

	# the pmi's for the proucts not having a noun are stored in the file "ag_output_file.txt, along with their id's"
	print('Log : load_data_with_nouns : reading products with nouns associated\n\t...')
	i = 0

	# initializing initial value of j
	j = 1
	while number_of_products/j > 10 :
		j *= 10

	# maximizing the amount of data read from the mysql server in one query
	# since fetching data from the server is computationally expensive
	while j > 0 : 
		while i < number_of_products :
			mysql_command = 'select id, pmi from ' + table_name + ' where id >= ' + str(i) + ' and id < ' + str(i + j) + ' into outfile "' + mysql_output_file + '"'
			mysql_comm.run_mysql_command(cursor, mysql_command, print_output=False)	
			i += j
			os.system("ag '\"noun\"' " + mysql_output_file + " >> " + ag_output_file)
			os.system('rm ' + mysql_output_file)
			print_string = '\treading upto id : ' + str(i) + ' / ' + str(number_of_products) 
			print(print_string, end='\r')
			sys.stdout.flush()
		i -= j
		j = j / 10

	#############################################################################
	# OBTAINING NOUNS AND BREADCRUMBS FOR PRODUCTS WHICH DO HAVE NOUNS ASSOCIATED
	#############################################################################

	print('Log : load_data_without_nouns : Loading breadcrumbs and urls\n\t...')
	os.system("ag -o '\"noun\":[^:]*\",' " + ag_output_file + " >> " + nouns_file)
	os.system("ag -o '\"breadcrumbs[^\]]*\]' " + ag_output_file  + " >> " + breadcrumbs_file)

	##########################################################
	# OBTAINING AND STORING CORRESPONDING ID's IN THE IDS FILE
	##########################################################

	print('Log : load_data_with_nouns : getting associated ids\n\t...')
	number_of_products_without_nouns = ''
	with open(ag_output_file, 'r') as f : 
		number_of_products_without_nouns = len(f.read().split('\n'))
	print('Log : load_data_with_nouns : number of products with nouns : ' + str(number_of_products_without_nouns))
	with open(ag_output_file, 'r') as f :
		ids = [] 
		lines = f.read().split('\n')
		for line in lines : 
			ids.append(line.split('\t')[0].split(':')[-1])

	try : 
		os.remove(ids_file)
	except OSError : 
		pass
	with open(ids_file, 'a') as f: 
		for id in ids : 
			f.write(str(id) + '\n')

	print('Log : load_data_with_nouns : done')
def load_data() : 
	'''
	stores breadcrumbs, urls and corresponding id's for products without nouns
	in breadcrumbs_file, urls_file and ids_file respectively
	
	:expects: 
		None
	:returns:
		None
	'''

	print("Log : load_data : started")
	####################
	# FILE DECLARATIONS
	####################

	# store in a location where it can be accessed the fastest
	incorrect_nouns_mysql_output_file = working_directory + 'data/incorrect_nouns/mysql_output.txt'	# temporary file used to store
																									# the output of an sql query

	incorrect_nouns_ag_output_file = working_directory +'data/incorrect_nouns/ag_output_file.txt'	# used to store the output of 
																									# an "ag (grep)" command

	incorrect_nouns_breadcrumbs_file = working_directory + 'data/incorrect_nouns/breadcrumbs.txt'	# used for stroing the breadcrumbs
																									# for products with no nouns

	incorrect_nouns_urls_file = working_directory + 'data/incorrect_nouns/urls.txt'					# used for storing the url's for
																									# products with no nouns so that
																									# they can be used to retrieve 
																									# breadcrumbs in the case none are
																									# available

	incorrect_nouns_output_file = working_directory + 'data/incorrect_nouns/output.txt'

	incorrect_nouns_ids_file = working_directory + 'data/incorrect_nouns/ids.txt'					# used for storing id's for
																									# corresponding products 

	correct_nouns_mysql_output_file = working_directory + 'data/correct_nouns/mysql_output.txt'		# temporary file used to store
																									# the output of an sql query

	correct_nouns_ag_output_file = working_directory +'data/correct_nouns/ag_output_file.txt'		# used to store the output of 
																									# an "ag (grep)" command

	correct_nouns_nouns_file = working_directory + 'data/correct_nouns/nouns.txt'					# used for nouns

	correct_nouns_breadcrumbs_file = working_directory + 'data/correct_nouns/breadcrumbs.txt'		# used for storing breadcrumbs
																									# for products that have nouns

	correct_nouns_ids_file = working_directory + 'data/correct_nouns/ids.txt'						# used for storing id's for 
																									# correspoonding products

	correct_nouns_output_file = working_directory + 'data/correct_nouns/output.txt'

	# cleaning files
	remove_file(incorrect_nouns_mysql_output_file, supress_output=True)
	remove_file(incorrect_nouns_ag_output_file)
	remove_file(incorrect_nouns_breadcrumbs_file)
	remove_file(incorrect_nouns_urls_file)
	remove_file(incorrect_nouns_ids_file)
	remove_file(incorrect_nouns_output_file)

	remove_file(correct_nouns_mysql_output_file, supress_output=True)
	remove_file(correct_nouns_ag_output_file)
	remove_file(correct_nouns_breadcrumbs_file)
	remove_file(correct_nouns_nouns_file)
	remove_file(correct_nouns_ids_file)
	remove_file(correct_nouns_output_file)

	####################################################################################################
	# OBTAINING PRODUCT PMI's FROM "flipkart_listing_products" TABLE WHICH DO NOT HAVE A NOUN ASSOCIATED
	####################################################################################################

	# the pmi's for the proucts not having a noun are stored in the file "incorrect_nouns_ag_output_file.txt, along with their id's"
	print('Log : load_data_without_nouns : reading products\n\t...')
	i = 0

	# initializing initial value of j
	j_max = 1000000
	j = 1
	# while number_of_products/j > 10 :
	# 	j *= 10
	# if j > j_max : 
	# 	j = j_max

	# maximizing the amount of data read from the mysql server in one query
	# since fetching data from the server is computationally expensive
	while j > 0 : 
		while i < number_of_products :
			mysql_command = 'select id, pmi from ' + table_name + ' where id >= ' + str(i) + ' and id < ' + str(i + j) + ' into outfile "' + incorrect_nouns_mysql_output_file + '"'
			mysql_comm.run_mysql_command(cursor, mysql_command, print_output=False)	
			i += j
			os.system("ag -v '\"noun\"' " + incorrect_nouns_mysql_output_file + " >> " + incorrect_nouns_ag_output_file)
			os.system("ag '\"noun\"' " + incorrect_nouns_mysql_output_file + " >> " + correct_nouns_ag_output_file)
			
			#################################################################################
			# OBTAINING NOUNS AND BREADCRUMBS FOR PRODUCTS WHICH DO NOT HAVE NOUNS ASSOCIATED
			#################################################################################
			incorrect_nouns_breadcrumbs_command = "var=$(ag -o '\"breadcrumbs[^\]]*\]' " + incorrect_nouns_ag_output_file
			incorrect_nouns_breadcrumbs_command += ") && echo $var >> " + incorrect_nouns_breadcrumbs_file
			incorrect_nouns_breadcrumbs_command += " && echo \"id=$var ,\" >> " + incorrect_nouns_output_file

			correct_nouns_breadcrumbs_command = "var=$(ag -o '\"breadcrumbs[^\]]*\]' " + correct_nouns_ag_output_file
			correct_nouns_breadcrumbs_command += ") && echo $var >> " + correct_nouns_breadcrumbs_file
			correct_nouns_breadcrumbs_command += " && echo \"id=$var ,\" >> " + correct_nouns_output_file

			os.system(incorrect_nouns_breadcrumbs_command)
			os.system(correct_nouns_breadcrumbs_command)
			
			if i == 3 : 
				raise SystemExit(0)
			else : 
				print(i)

			os.system("var=$(ag -o '\"breadcrumbs[^\]]*\]' " + incorrect_nouns_ag_output_file  + ") && echo $var >> " + incorrect_nouns_breadcrumbs_file + "")
			os.system("ag -o '\"url\":\"[^\"]*\"' " + incorrect_nouns_ag_output_file + " >> " + incorrect_nouns_urls_file)
			os.system("ag -o '\"noun\":[^:]*\",' " + correct_nouns_ag_output_file + " >> " + correct_nouns_nouns_file)
			os.system("ag -o '\"breadcrumbs[^\]]*\]' " + correct_nouns_ag_output_file  + " >> " + correct_nouns_breadcrumbs_file)
			
			##########################################################
			# OBTAINING AND STORING CORRESPONDING ID's IN THE IDS FILE
			##########################################################

			os.system("ag -o ':.*\t{\"id' " + incorrect_nouns_ag_output_file + " | ag -o ':.*\t' | ag -o '[^:]*\t' >> " + incorrect_nouns_ids_file)
			os.system("ag -o ':.*\t{\"id' " + correct_nouns_ag_output_file + " | ag -o ':.*\t' | ag -o '[^:]*\t' >> " + correct_nouns_ids_file)

			# Cleaning up before next iteration and printing
			remove_file(incorrect_nouns_ag_output_file)
			remove_file(correct_nouns_ag_output_file)
			remove_file(incorrect_nouns_mysql_output_file)
			print_string = '\treading upto id : ' + str(i) + ' / ' + str(number_of_products) 
			print(print_string, end='\r')
			sys.stdout.flush()
		i -= j
		j = j / 10

	# print('Log : load_data_without_nouns : Loading breadcrumbs and urls\n\t...')


	# print('Log : load_data_without_nouns : getting associated ids\n\t...')

	# number_of_products_without_nouns = ''
	# with open(incorrect_nouns_ag_output_file, 'r') as f : 
	# 	number_of_products_without_nouns = len(f.read().split('\n'))
	# with open(incorrect_nouns_ag_output_file, 'r') as f :
	# 	ids = [] 
	# 	lines = f.read().split('\n')
	# 	for line in lines : 
	# 		ids.append(line.split('\t')[0].split(':')[-1])
	# 	print("\tgot the ids")

	# 	with open(incorrect_nouns_ids_file, 'w') as f :
	# 		f.write('')
	# 	with open(incorrect_nouns_ids_file, 'a') as f : 
	# 		for id in ids : 
	# 			f.write(str(id) + '\n') 
		
	# 	print('\tdone writing ids to incorrect_nouns_ids_file')
	# print('Log : load_data_without_nouns number of products without nouns : ' + str(number_of_products_without_nouns))

	# number_of_products_without_nouns = ''
	# with open(correct_nouns_ag_output_file, 'r') as f : 
	# 	number_of_products_without_nouns = len(f.read().split('\n'))
	# with open(correct_nouns_ag_output_file, 'r') as f :
	# 	ids = [] 
	# 	lines = f.read().split('\n')
	# 	for line in lines : 
	# 		ids.append(line.split('\t')[0].split(':')[-1])

	# try : 
	# 	os.remove(correct_nouns_ids_file)
	# except OSError : 
	# 	pass
	# with open(correct_nouns_ids_file, 'a') as f: 
	# 	for id in ids : 
	# 		f.write(str(id) + '\n')
	# print('Log : load_data_with_nouns : number of products with nouns : ' + str(number_of_products_without_nouns))
	
	print('Log : load_data_without_nouns : DONE')