def get_total_uploaded(table_name):
    '''Count the rows of table_name whose uploaded column is 1.

    Opens a new olrcdb.DatabaseConnection, runs a COUNT(*) query through it
    and returns the first column of the single row fetched from the result.
    '''
    template = "SELECT COUNT(*) FROM {0} WHERE uploaded=1"
    sql = template.format(table_name)

    # Keep the connection bound to a local for the whole function so it
    # stays alive while the result is being read.
    db = olrcdb.DatabaseConnection()
    cursor = db.execute_query(sql)

    row = cursor.fetchone()
    return row[0]
def get_min_id(table_name):
    '''Return the minimum id from table_name where uploaded=0.

    Opens a new olrcdb.DatabaseConnection and runs a MIN(id) query. The value
    is converted with int() before being returned.

    Terminates the program through sys.exit, with a "Nothing to upload"
    message, when the query yields no minimum.
    '''
    query = "SELECT MIN(id) FROM {0} WHERE uploaded=0".format(table_name)
    connect = olrcdb.DatabaseConnection()
    result = connect.execute_query(query)
    min_id = result.fetchone()[0]

    # Compare against None explicitly: a plain truthiness test would also
    # reject a legitimate minimum id of 0 and exit even though rows are
    # waiting. (Assumes execute_query returns a DB-API style cursor that maps
    # SQL NULL to None -- confirm against olrcdb.)
    if min_id is None:
        sys.exit("Nothing to upload from table {0}".format(table_name))

    return int(min_id)
def upload_table(lock, range, table_name, counter, speed):
    '''
    Given a table_name, upload all the paths from the table where upload is 0.
    Using the range value, complete a BATCH worth of uploads at a time.

    Arguments:
    lock -- object exposing acquire()/release(); held while range.value is
        read or advanced and while counter.value is incremented (presumably
        a lock shared between worker processes -- confirm against caller).
    range -- object whose .value is the first id of the next unclaimed
        batch; it is advanced by BATCH every time a batch is claimed.
    table_name -- table the rows are read from; also the prefix of the
        '.upload.error.log' file that failed paths are appended to.
    counter -- object whose .value is incremented once per successful upload.
    speed -- passed through unchanged to print_status.

    Rows are read positionally: column 0 is handed to set_uploaded as the row
    id and column 1 is handed to upload_file as the path.
    '''
    # NOTE(review): FAILED_COUNT is a plain module global; if this function
    # runs in several processes each one would update its own copy -- confirm.
    global FAILED_COUNT, BATCH

    connect = olrcdb.DatabaseConnection()

    # In order for the current process to upload a unique set of files,
    # acquire the lock to read from range's value.
    lock.acquire()

    # Invariant: the lock is held every time the loop condition is evaluated.
    while range.value <= TOTAL:

        # Grab a "BATCH" worth of file paths to upload.
        query = ("SELECT * FROM {0} WHERE uploaded=0"
                 " AND id >= {1} AND id <{2}".format(
                     table_name, range.value, range.value + BATCH))

        # Let other processes know this batch has been accounted for.
        range.value += BATCH
        lock.release()

        # Fetch results.
        result = connect.execute_query(query)
        path_tuple = result.fetchone()

        # Loop until we run out of rows from the batch.
        while path_tuple:

            # If the upload is successful, update the database.
            if upload_file(path_tuple[1]):
                lock.acquire()
                counter.value += 1
                lock.release()
                set_uploaded(path_tuple[0], table_name)
            else:
                FAILED_COUNT += 1
                # The with-block closes the log even if the write raises.
                with open(table_name + '.upload.error.log', 'a') as error_log:
                    error_log.write("\rFailed: {0}\n".format(
                        path_tuple[1].encode('utf-8')))

            print_status(counter, lock, speed, table_name)

            # Get the next row of this batch.
            path_tuple = result.fetchone()

        # Re-take the lock before the loop condition reads range.value again.
        lock.acquire()

    # Executes on the last range: drop the lock taken for the final check.
    lock.release()
def set_uploaded(id, table_name):
    '''Flag the row of table_name with the given id as uploaded.

    Builds an UPDATE statement that sets uploaded to '1' for the matching id
    and executes it on a freshly opened olrcdb.DatabaseConnection. Nothing is
    returned.
    '''
    template = "UPDATE {0} set uploaded='1' WHERE id='{1}'"
    statement = template.format(table_name, id)

    db = olrcdb.DatabaseConnection()
    db.execute_query(statement)
# Check required environment variables have been set; without them there is
# nothing to connect to, so report which ones are needed and stop.
if not env_vars_set():
    set_env_message = "The following environment variables need to be " \
        "set:\n"
    set_env_message += " \n".join(REQUIRED_VARIABLES)
    set_env_message += "\nPlease set these environment variables to " \
        "connect to the OLRC."
    print(set_env_message)
    # sys.exit rather than the site-provided exit(), which is not guaranteed
    # to exist. NOTE(review): status 0 is kept from the original even though
    # this is a failure path -- confirm whether callers rely on it.
    sys.exit(0)

# Start a fresh error log for this run, stamped with the current time.
# The with-block closes the file even if the write fails.
with open(table_name + '.prepare.error.log', 'w+') as error_log:
    error_log.write("From execution {0}:\n".format(str(
        datetime.datetime.now())))

# Create the table, then hand the connection, directory and table name to
# prepare_upload.
connect = olrcdb.DatabaseConnection()
connect.create_table(table_name)
prepare_upload(connect, directory, table_name)

# Report the final tally on stdout; "\r" returns to the start of the line
# before the count is written.
sys.stdout.flush()
sys.stdout.write("\r{0} parsed. ".format(COUNT))
if FAILED != 0:
    sys.stdout.write("\n{0} FAILED. See error.log.".format(FAILED))

# Log the final count to <table_name>.prepare.out as well.
with open(table_name + ".prepare.out", 'w+') as final_count:
    final_count.write("\r{0} parsed. ".format(COUNT))