def query_to_df(self, query_string, header="HEADER"):
    """
    Given a query, run it and return the requested data as a DataFrame.

    The query is wrapped in ``COPY (...) TO STDOUT WITH CSV`` and the CSV
    output is buffered in memory, parsed with pandas (``as_of_date`` is
    parsed as a date) and indexed by ``entity_id`` and ``as_of_date``. The
    query result must therefore contain both of those columns.

    :param query_string: query to send
    :param header: text appended to the COPY statement indicating whether a
        header row should be included in the CSV output
    :type query_string: str
    :type header: str

    :return: the query result, indexed by (entity_id, as_of_date), after
        being passed through ``downcast_matrix``
    :rtype: whatever ``downcast_matrix`` returns (presumably a
        pandas.DataFrame -- confirm against its definition)
    """
    logging.debug("Copying to CSV query %s", query_string)
    copy_sql = "COPY ({query}) TO STDOUT WITH CSV {head}".format(
        query=query_string, head=header)
    out = io.StringIO()
    conn = self.db_engine.raw_connection()
    # Release the cursor and the connection even if the COPY fails, so a
    # bad query does not leave a connection checked out.
    try:
        cur = conn.cursor()
        try:
            cur.copy_expert(copy_sql, out)
        finally:
            cur.close()
    finally:
        conn.close()
    # Rewind the buffer so pandas reads the CSV from the beginning.
    out.seek(0)
    df = pandas.read_csv(out, parse_dates=["as_of_date"])
    df.set_index(["entity_id", "as_of_date"], inplace=True)
    return downcast_matrix(df)
def test_downcast_matrix():
    """Downcasting must keep every value equal while using less memory."""
    original = matrix_creator()
    shrunk = downcast_matrix(original)

    # the contents have to survive the downcast unchanged
    cells_equal = shrunk == original
    assert cells_equal.all().all()

    # a downcast that does not save memory would be pointless
    bytes_after = shrunk.memory_usage().sum()
    bytes_before = original.memory_usage().sum()
    assert bytes_after < bytes_before
def matrix(self):
    """The raw matrix. Will load from storage into memory if not already loaded.

    On first access the matrix is obtained from ``self._load()``, indexed by
    the columns listed in ``self.metadata['indices']`` (unless its index
    names already match) and passed through ``downcast_matrix``. The result
    is cached and returned directly on later accesses.

    :return: the cached, indexed and downcast matrix
    """
    if self.__matrix is None:
        # Work on a local and cache only once every step has succeeded.
        # Otherwise a failure in set_index/downcast_matrix would leave a
        # half-processed frame cached and silently returned by later calls.
        loaded = self._load()
        # Is the index already in place?
        if loaded.index.names != self.metadata['indices']:
            loaded.set_index(self.metadata['indices'], inplace=True)
        self.__matrix = downcast_matrix(loaded)
    return self.__matrix
def _preprocess_and_split_matrix(self, matrix_with_labels):
    """Apply the usual post-load preprocessing and split off the labels.

    The frame is indexed by the columns named in ``self.metadata['indices']``
    when its index names do not already match (depending on the storage, the
    index may not be serializable), then downcast, and finally the label
    column is popped out of it.

    Note that the index is set in place on the frame that was passed in.

    :param matrix_with_labels: loaded matrix that still contains the label
        column
    :return: tuple ``(design_matrix, labels)`` -- the downcast matrix without
        the label column, and the label column itself
    """
    index_columns = self.metadata['indices']
    frame = matrix_with_labels

    # index names differ from the configured ones: set the index in place
    if frame.index.names != index_columns:
        frame.set_index(index_columns, inplace=True)

    downcast = downcast_matrix(frame)
    label_values = downcast.pop(self.label_column_name)
    return downcast, label_values
def _preprocess_and_split_matrix(self, matrix_with_labels):
    """Perform desired preprocessing that we generally want to do after loading a matrix

    This includes setting the index (depending on the storage, may not be
    serializable), verifying that the ``as_of_date`` index level holds
    ``datetime64[ns]`` values, and downcasting. The label column is then
    popped out of the downcast matrix.

    Note that the index is set in place on the frame that was passed in.

    :param matrix_with_labels: loaded matrix that still contains the label
        column
    :return: tuple ``(design_matrix, labels)`` -- the downcast matrix without
        the label column, and the label column itself
    :raises ValueError: if ``as_of_date`` is not one of the index names, or
        if that index level is not of dtype ``datetime64[ns]``
    """
    if matrix_with_labels.index.names != self.indices:
        matrix_with_labels.set_index(self.indices, inplace=True)

    index = matrix_with_labels.index
    # list.index raises ValueError when 'as_of_date' is not an index name
    index_of_date = index.names.index('as_of_date')
    # Only a MultiIndex exposes ``levels``; a single-level index carries the
    # dtype itself, so fall back to that instead of failing on ``.levels``.
    if hasattr(index, 'levels'):
        date_dtype = index.levels[index_of_date].dtype
    else:
        date_dtype = index.dtype
    if date_dtype != "datetime64[ns]":
        raise ValueError(
            "Expected index level 'as_of_date' to have dtype "
            f"datetime64[ns], but found {date_dtype}"
        )

    matrix_with_labels = downcast_matrix(matrix_with_labels)
    labels = matrix_with_labels.pop(self.label_column_name)
    design_matrix = matrix_with_labels
    return design_matrix, labels