def getSingularTable(self):
    """Return the name of the only table in the datafile.

    Returns:
        str: the single table name.

    Raises:
        ValueError: if the datafile contains zero tables or more than one.
    """
    if self.debug:
        print(
            "Attempting to find the name of the table in the Datafile (assuming only one table exists)"
        )
    found_tables = self.getTableNames()
    how_many = len(found_tables)
    # exactly one table is acceptable; guard against both error cases first
    if how_many == 0:
        raise ValueError(
            fileErrorMsg("Datafile does not contain any tables",
                         self.filepath))
    if how_many > 1:
        raise ValueError(
            fileErrorMsg(
                "Datafile should only contain 1 table, but instead has multiple: {}"
                .format(found_tables),
                self.filepath,
            ))
    only_table = found_tables[0]
    if self.debug:
        print(
            "One table was found in Datafile -- the table name is: {}".
            format(only_table))
    return only_table
def __isConnectionOpen(self, error_if_closed=None):
    """Report whether this datafile currently holds an open connection.

    Args:
        error_if_closed: when truthy, raise instead of returning False
            for a closed/missing connection (defaults to False).

    Returns:
        bool: True when ``self.connection`` exists and is an instance of
        the expected connection class for this file format.

    Raises:
        AttributeError: if the connection object has an unexpected type,
            or if it is closed and ``error_if_closed`` is set.
    """
    if self.debug:
        status = (
            "Connection status: {}".format(self.connection)
            if hasattr(self, "connection")
            else "Connection status: [doesn't exist]"
        )
        print(status)
    if error_if_closed is None:
        error_if_closed = False
    # a missing attribute and an explicit None both count as "closed"
    conn = getattr(self, "connection", None)
    if conn is None:
        is_open = False
    elif isinstance(conn, self.fileformat.connection_class):
        is_open = True
    else:
        # a connection of the wrong type is a programming error, not "closed"
        raise AttributeError("\n".join([
            "connection type appears to be invalid:",
            "> Expected: {}".format(self.fileformat.connection_class),
            "> Actual: {}".format(type(conn)),
        ]))
    if error_if_closed and not is_open:
        raise AttributeError(
            fileErrorMsg("datafile connection is closed", self.filepath))
    return is_open
def __returnConnection(self):
    """Open and return a validated connection to the datafile.

    In ``create_new`` mode no connection is opened: any existing file is
    deleted, the destination directory is checked for write access, and
    None is returned. Otherwise an open, lightly-validated connection
    object for the detected file format is returned.

    Raises:
        ConnectionError: on any failure. NOTE(review): the PermissionError
            raised inside the try block is caught by the outer bare except
            and resurfaces as ConnectionError — callers cannot catch
            PermissionError from here.
    """
    error_msg = "Unable to connect to input data"
    try:
        # if file exists, and not creating a new db, then throw error
        if self.create_new:
            # creating a new file: clear out any previous file first
            if os.path.isfile(self.filepath):
                try:
                    deleteFile(self.filepath, debug=self.debug)
                except:
                    raise PermissionError(
                        "unable to delete file: {}".format(self.filepath))
            # verify the destination directory is writable before proceeding
            if not os.access(os.path.dirname(self.filepath), os.W_OK):
                raise PermissionError(
                    "unable to write to filepath: {}".format(
                        self.filepath))
            # no connection is opened in create_new mode
            return None
        elif fileExists(self.filepath,
                        throw_error=not (self.create_new),
                        msg=error_msg):
            # open connection and attempt to read one row
            # to confirm that it is a valid file
            if self.fileformat.filetype == "sqlite":
                connection = sqlite3.connect(self.filepath)
                connection.execute("select * from sqlite_master limit 1")
            elif self.fileformat.filetype == "yxdb":
                connection = pyxdb.AlteryxYXDB()
                connection.open(self.filepath)
                # test that its a real yxdb file
                connection.get_num_records()
            else:
                self.__formatNotSupportedYet()
            return connection
    except:
        # best-effort close; `connection` may be unbound at this point,
        # which the inner bare except deliberately swallows
        try:
            connection.close()
        except:
            pass
        raise ConnectionError(fileErrorMsg(error_msg, self.filepath))
def writeData(self, pandas_df, table, metadata=None, batch_size=1):
    """Write a pandas DataFrame out to the datafile.

    Args:
        pandas_df: the DataFrame to persist.
        table: output table name (used by sqlite; logged for yxdb).
        metadata: optional dict of column metadata keyed by column name.
            For sqlite only the "type_length" entries are used (passed as
            the pandas ``dtype`` argument); for yxdb a full per-column
            dict ("type", "length", optional "source"/"description") is
            required — None raises inside the try and is reported/re-raised.
        batch_size: number of records appended per yxdb write call;
            1 writes record-by-record.

    Returns:
        The input DataFrame, unchanged, on success.
    """
    if self.debug:
        # BUG FIX: tag was "[CachedData.writeData]", inconsistent with every
        # other debug line in this method
        print('[Datafile.writeData] Attempting to write data to table "{}"'
              .format(table))
        print("[Datafile.writeData] metadata: {}".format(metadata))
    try:
        if self.fileformat.filetype == "sqlite":
            self.__createConnection()
            # prepare dtype arg for pandas
            dtypes = {}
            if isinstance(metadata, dict):
                for col in metadata:
                    col_metadata = metadata[col]
                    # fall back to the metadata key when no explicit name
                    if "name" in col_metadata:
                        name = col_metadata["name"]
                    else:
                        name = col
                    if "type_length" in col_metadata:
                        type_length = col_metadata["type_length"]
                        if type_length is not None:
                            dtypes[name] = type_length
            # pandas expects None (not an empty dict) for "no dtype overrides"
            if not dtypes:
                dtypes = None
            if self.debug:
                print("[Datafile.writeData] dtypes: {}".format(dtypes))
            # write to database
            pandas_df.to_sql(
                table,
                self.connection,
                if_exists="replace",
                index=False,
                dtype=dtypes,
            )
        elif self.fileformat.filetype == "yxdb":
            # prepare metadata dict for AlteryxYXDB().create_from_dict (list)
            metadata_list = []
            # maps column index -> builtin type name used to coerce numpy
            # scalars to base python before writing
            column_conversions = {}
            pythontool_source = "PythonTool:"
            for index, col in enumerate(metadata.keys()):
                metadata_col = metadata[col]
                if self.debug:
                    print("\n[Datafile.writeData] input column: {}".format(
                        metadata_col))
                field_name = col
                alteryx_type = metadata_col["type"]
                field_type = pyxdbLookupFieldTypeEnum(alteryx_type)
                field_length = metadata_col["length"]
                field_size = field_length[0]
                # scale only applies when a second length component exists;
                # -1 signals "no scale"
                if len(field_length) > 1:
                    field_scale = int(field_length[1])
                else:
                    field_scale = -1
                # prepare source metadata: append the PythonTool tag to an
                # existing "tool:"-style source, or use it as the whole
                # source when none was provided
                if "source" in metadata_col:
                    source = metadata_col["source"]
                else:
                    source = ""
                if len(source) > 0:
                    if source[-1:] == ":":
                        source = "{}{}".format(source, pythontool_source)
                else:
                    source = pythontool_source
                if "description" in metadata_col:
                    description = metadata_col["description"]
                else:
                    description = ""
                yxdb_metadata = {
                    "name": field_name,
                    "type": field_type,
                    "size": field_size,
                    "scale": field_scale,
                    "source": source,
                    "description": description,
                }
                if alteryx_type == "Boolean":
                    column_conversions[index] = "bool"
                elif alteryx_type in ("Byte", "Int16", "Int32", "Int64"):
                    column_conversions[index] = "int"
                elif alteryx_type in ("Float", "Fixed Decimal", "Double"):
                    column_conversions[index] = "float"
                if self.debug:
                    print("[Datafile.writeData] yxdb column: {}".format(
                        yxdb_metadata))
                metadata_list.append(yxdb_metadata)
            if self.debug:
                print("\nmetadata_list: {}".format(metadata_list))
                print(
                    "\ncolumn_conversions: {}".format(column_conversions))
            try:
                self.__createConnection(metadata_list)
                row_count = pandas_df.shape[0]
                if self.debug:
                    print("[Datafile.writeData] row count: {}".format(
                        row_count))
                rows_i = 0
                for i in range(row_count):
                    if self.debug:
                        print("[Datafile.writeData] i: {}".format(i))
                    # get row (as list) from pandas dataframe
                    row = list(pandas_df.iloc[i])
                    # convert numeric types to base python (instead of
                    # numpy types, which are used by pandas)
                    for col_i in column_conversions:
                        if pd.isnull(row[col_i]):
                            row[col_i] = None
                        else:
                            row[col_i] = getattr(
                                builtins,
                                column_conversions[col_i])(row[col_i])
                    if batch_size == 1:
                        self.connection.append_record(row)
                    else:
                        if rows_i == 0:
                            rows = [None] * batch_size
                        rows[rows_i] = row
                        rows_i += 1
                        if rows_i == batch_size:
                            self.connection.append_records(rows)
                            # BUG FIX: reset the batch cursor so the next
                            # record starts a fresh batch; previously rows_i
                            # was never reset, so the first record after a
                            # full batch hit rows[batch_size] -> IndexError
                            rows_i = 0
                        elif i == row_count - 1:
                            # flush the final, partially-filled batch
                            self.connection.append_records(rows[:rows_i])
            finally:
                # always close the yxdb writer, even on failure
                try:
                    self.connection.close()
                except:
                    pass
        else:
            self.__formatNotSupportedYet()
        if self.debug:
            print(
                fileErrorMsg(
                    'Success writing output table "{}"'.format(table),
                    self.filepath))
        return pandas_df
    except:
        print(
            fileErrorMsg(
                'Error: unable to write output table "{}"'.format(table),
                self.filepath,
            ))
        raise
def getData(self, table=None, batch_size=1):
    """Read a table from the datafile into a pandas DataFrame.

    Args:
        table: table name; when None, the file's single table is used.
        batch_size: number of records read per yxdb call; 1 reads
            record-by-record.

    Returns:
        pandas.DataFrame with the table's contents.
    """
    if self.debug:
        print('Attempting to get data from table "{}"'.format(table))
    self.__isConnectionOpen(error_if_closed=True)
    # if no table specified, check to see if there is only table and use that
    if table is None:
        table = self.getSingularTable()
    self.__validateTableName(table)
    # now that the table name has been retrieved, get the data as pandas df
    try:
        if self.fileformat.filetype == "sqlite":
            # table name was validated above, before being interpolated
            query_result = pd.read_sql_query(
                "select * from {}".format(table), self.connection)
        elif self.fileformat.filetype == "yxdb":
            # (another temporary solution)
            # get metadata (column names)
            colnames = [col["name"] for col in self.getMetadata()]
            # reset pointer back to first line
            self.openConnection()
            num_records = self.connection.get_num_records()
            if num_records > 0:
                self.connection.go_record(0)
            # read records in batches into a pre-sized list so batch slices
            # can be assigned in place; when num_records == 0 the loop is
            # skipped and an empty frame (with column names) is produced
            i = 0
            data = [None] * num_records
            while i < num_records:
                if batch_size == 1:
                    data[i] = self.connection.read_record()
                else:
                    # assumes read_records(k) returns exactly k rows —
                    # TODO confirm against the pyxdb API
                    if i + batch_size < num_records:
                        data[i:i + batch_size] = self.connection.read_records(
                            batch_size)
                    else:
                        # final (possibly short) batch
                        data[i:] = self.connection.read_records(
                            num_records - i)
                i += batch_size
            query_result = pd.DataFrame(data, columns=colnames)
        else:
            self.__formatNotSupportedYet()
        if self.debug:
            print(
                fileErrorMsg(
                    'Success reading input table "{}" '.format(table),
                    self.filepath))
        return query_result
    except:
        print(
            fileErrorMsg(
                'Error: unable to read input table "{}"'.format(table),
                self.filepath,
            ))
        raise
def getMetadata(self, table=None):
    """Return per-column metadata for a table as a list of dicts.

    Each element has the keys "name", "type", "source", "description";
    "type" is a display string of the form "{type} ({size}.{scale})".

    Args:
        table: table name; must be None for yxdb files. When None, the
            file's single table is used.

    Raises:
        ValueError: if a table name is supplied for a yxdb file.
    """
    if self.debug:
        print('Attempting to get metadata from table "{}"'.format(table))
    self.__isConnectionOpen(error_if_closed=True)
    # a yxdb file holds exactly one (unnamed) table, so naming one is an error
    if self.fileformat.filetype == "yxdb" and table is not None:
        raise ValueError(" ".join([
            "specifying a table name ({})".format(table),
            "for a yxdb file ({}) does not make sense".format(
                self.filename),
        ]))
    # if no table specified, check to see if there is only table and use that
    if table is None:
        table = self.getSingularTable()
    self.__validateTableName(table)
    try:
        if self.fileformat.filetype == "sqlite":
            # table name was validated above, before being interpolated
            query = "pragma table_info({})"
            if self.debug:
                print("getMetadata (sqlite query): {}".format(query))
            query_result = pd.read_sql_query(query.format(table),
                                             self.connection)
            if self.debug:
                print("getMetadata (sqlite query result): {}".format(
                    query_result))
            # now massage that pragma query result into our expected format...
            column_metadata_list = []
            for index, field in query_result.iterrows():
                field_dict = {
                    "name": field["name"],
                    # type is a string concatenation of "{type} {size}.{scale}"
                    "type": field["type"],
                    # source and description are lost/unavailable with sqlite
                    "source": "",
                    "description": "",
                }
                column_metadata_list.append(field_dict)
        elif self.fileformat.filetype == "yxdb":
            # reset pointer back to first line
            self.openConnection()
            raw_metadata = self.connection.get_record_meta()
            # now massage the metadata returned into our expected format...
            column_metadata_list = []
            for column_metadata in raw_metadata:
                # concatenate type, size, and scale into the same string
                # shape that the sqlite branch produces (this is what the
                # downstream parsing/validation expects; it works, and it
                # doesn't seem worth rewriting because getting metadata
                # happens with relative infrequency)
                type_name = str(column_metadata["type"])
                size = column_metadata["size"]
                scale = column_metadata["scale"]
                if column_metadata["scale"] > 0:
                    length = "{}.{}".format(size, scale)
                else:
                    length = str(size)
                type_length = "{} ({})".format(type_name, length)
                field_dict = {
                    "name": column_metadata["name"],
                    "type": type_length,
                    "source": column_metadata["source"],
                    "description": column_metadata["description"],
                }
                column_metadata_list.append(field_dict)
        else:
            self.__formatNotSupportedYet()
        if self.debug:
            print(
                fileErrorMsg(
                    'Success reading metadata from table "{}" '.format(
                        table),
                    self.filepath,
                ))
        return column_metadata_list
    except:
        print(
            fileErrorMsg(
                'Error: unable to read metadata for table "{}"'.format(
                    table),
                self.filepath,
            ))
        raise
def __createConnection(self, metadata=None):
    """Create a fresh connection for writing, stored on ``self.connection``.

    Args:
        metadata: must be None for sqlite. For yxdb, either a list of
            per-column metadata dicts (as expected by
            ``AlteryxYXDB().create_from_dict``) or a dict keyed by column
            name, which is converted to that list form here.

    Raises:
        ValueError: metadata supplied for sqlite, or missing for yxdb.
        TypeError: yxdb metadata that is neither dict nor list.
    """
    if self.debug:
        print("Attempting to create new data connection: {}".format(
            self.filepath))
    error_msg = "Unable to create new data connection"
    try:
        if self.fileformat.filetype == "sqlite":
            if metadata is not None:
                raise ValueError(
                    "metadata not currently supported for creating an empty sqlite file"
                )
            connection = sqlite3.connect(self.filepath)
            # touching sqlite_master verifies the target is a usable db
            connection.execute("select * from sqlite_master limit 1")
            self.connection = connection
        elif self.fileformat.filetype == "yxdb":
            if metadata is None:
                raise ValueError(
                    "metadata is currently required for creating an empty yxdb file"
                )
            # if given a dict, then convert it to list format
            if self.debug:
                print("DataFile.__createConnection() -- input metadata:")
                print(metadata)
            if isinstance(metadata, dict):
                metadata_list = []
                if self.debug:
                    print("---")
                    print("metadata: {}".format(metadata))
                for key in metadata:
                    if self.debug:
                        print("key: {}".format(key))
                    # each column becomes {"name": key, **attributes}
                    metadata_d = {"name": key}
                    metadata_attr = metadata[key]
                    if self.debug:
                        print("metadata_attr: {}".format(metadata_attr))
                    for attr in metadata_attr:
                        metadata_d[attr] = metadata_attr[attr]
                    metadata_list.append(metadata_d)
                # BUG FIX: this trailing "---" print was unguarded; every
                # other diagnostic print here is behind self.debug
                if self.debug:
                    print("---")
                    print(
                        "DataFile.__createConnection() -- adjusted metadata:"
                    )
                    print(metadata_list)
            else:
                metadata_list = metadata
            if not isinstance(metadata_list, list):
                raise TypeError(
                    "AlteryxYXDB().create_from_dict() requires a list (ironically) of metadata, with each element being a dict representing a column"
                )
            self.connection = pyxdb.AlteryxYXDB()
            # BUG FIX: pass the converted metadata_list; previously the raw
            # `metadata` argument was passed, so the dict->list conversion
            # above was computed and then silently discarded
            self.connection.create_from_dict(self.filepath, metadata_list)
        else:
            self.__formatNotSupportedYet()
        if self.debug:
            print("Successfully created new data connection: {}".format(
                self.filepath))
    except Exception as err:
        # best-effort close; `connection` is only bound in the sqlite
        # branch, and the inner bare except swallows the NameError otherwise
        try:
            connection.close()
        except:
            pass
        print(err)
        print(fileErrorMsg(error_msg, self.filepath))
        raise