def load_df_from_csvfile(
    file_obj,
    skiprows: Optional[int] = 0,
    skipfooter: Optional[int] = 0,
) -> pd.DataFrame:
    """Load a data frame from a CSV file.

    Given a file object, try to read the content as a CSV file and transform
    it into a data frame. The skiprows and skipfooter parameters are the
    number of lines to skip from the top and bottom of the file (see
    read_csv in pandas). It also tries to convert as many columns as
    possible to date/time format (testing the conversion on every string
    column).

    :param file_obj: File object to read the CSV content

    :param skiprows: Number of lines to skip at the top of the document

    :param skipfooter: Number of lines to skip at the bottom of the document

    :return: Resulting data frame, or raise an exception.
    """
    data_frame = pd.read_csv(
        file_obj,
        index_col=False,
        infer_datetime_format=True,
        quotechar='"',
        skiprows=skiprows,
        skipfooter=skipfooter,
        encoding='utf-8')

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    return pandas.detect_datetime_columns(data_frame)

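# A minimal usage sketch for load_df_from_csvfile (illustration only; the CSV
# content below is made up). pd.read_csv accepts any object with a read()
# method, so an open file handle or an io.StringIO work equally well.
def _example_load_csv():
    import io
    csv_content = io.StringIO('name,when\nalice,2020-01-01\nbob,2020-02-01')
    data_frame = load_df_from_csvfile(csv_content)
    # 'when' should come back as datetime64 once detect_datetime_columns runs
    print(data_frame.dtypes)
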
def test_table_pandas_create(self):
    # Create a second workflow
    response = self.client.post(
        reverse('workflow:api_workflows'),
        {'name': tests.wflow_name + '2', 'attributes': {'one': 'two'}},
        format='json')

    # Get the newly created workflow
    workflow = models.Workflow.objects.get(id=response.data['id'])

    # Transform new table into a data frame
    r_df = pd.DataFrame(self.new_table)
    r_df = pandas.detect_datetime_columns(r_df)

    # Upload the table
    self.client.post(
        reverse('table:api_pops', kwargs={'wid': workflow.id}),
        {'data_frame': serializers.df_to_string(r_df)},
        format='json')

    # Refresh the workflow (it has been updated)
    workflow = models.Workflow.objects.get(id=workflow.id)

    # Load the df from the db
    dframe = pandas.load_table(workflow.get_data_frame_table_name())

    # Compare both elements
    self.compare_tables(r_df, dframe)

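# The test above round-trips the frame through serializers.df_to_string. A
# minimal sketch of what such a helper pair could look like, assuming (not
# verified here) a pickle + base64 encoding; _df_to_string / _string_to_df
# are hypothetical stand-ins, not the project's actual functions.
import base64
import pickle


def _df_to_string(data_frame) -> str:
    return base64.b64encode(pickle.dumps(data_frame)).decode('ascii')


def _string_to_df(payload: str):
    return pickle.loads(base64.b64decode(payload.encode('ascii')))
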
def _load_df_from_sqlconnection(
    conn_item: models.SQLConnection,
    run_params: Dict,
) -> pd.DataFrame:
    """Load a DF from a SQL connection.

    :param conn_item: SQLConnection object with the connection parameters.

    :param run_params: Dictionary with the execution parameters.

    :return: Data frame or raise an exception.
    """
    if conn_item.db_password:
        password = conn_item.db_password
    else:
        password = run_params['db_password']

    if conn_item.db_table:
        table_name = conn_item.db_table
    else:
        table_name = run_params['db_table']

    db_engine = pandas.create_db_engine(
        conn_item.conn_type,
        conn_item.conn_driver,
        conn_item.db_user,
        password,
        conn_item.db_host,
        conn_item.db_name)

    # Try to fetch the data
    data_frame = pd.read_sql_table(table_name, db_engine)

    # Remove the engine
    db_engine.dispose()

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    return pandas.detect_datetime_columns(data_frame)

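# pandas.create_db_engine is defined elsewhere in the project; a plausible
# sketch with SQLAlchemy, assuming (hypothetically) that conn_type and
# conn_driver map to the dialect+driver portion of a standard SQLAlchemy URL:
from sqlalchemy import create_engine


def _example_create_db_engine(
        conn_type, conn_driver, user, password, host, name):
    driver = '+{0}'.format(conn_driver) if conn_driver else ''
    url = '{0}{1}://{2}:{3}@{4}/{5}'.format(
        conn_type, driver, user, password, host, name)
    # e.g. 'postgresql+psycopg2://user:pwd@localhost/dbname'
    return create_engine(url)
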
def test_table_pandas_update(self):
    # Get the only workflow in the fixture
    workflow = Workflow.objects.all()[0]

    # Transform new table into a data frame
    r_df = pd.DataFrame(self.new_table)
    r_df = detect_datetime_columns(r_df)

    # Upload a new table
    response = self.client.put(
        reverse('table:api_pops', kwargs={'wid': workflow.id}),
        {'data_frame': df_to_string(r_df)},
        format='json')

    # Refresh the workflow (it has been updated)
    workflow = Workflow.objects.get(id=workflow.id)

    # Load the df from the db
    df = load_table(workflow.get_data_frame_table_name())

    # Compare both elements
    self.compare_tables(r_df, df)

    # Check that the rest of the information is correct
    workflow = Workflow.objects.get(id=workflow.id)
    self.assertTrue(check_wf_df(workflow))

def test_table_json_create(self):
    # Create a second workflow
    response = self.client.post(
        reverse('workflow:api_workflows'),
        {'name': tests.wflow_name + '2', 'attributes': {'one': 'two'}},
        format='json')

    # Get the newly created workflow
    workflow = Workflow.objects.get(id=response.data['id'])

    # Upload the table
    response = self.client.post(
        reverse('table:api_ops', kwargs={'wid': workflow.id}),
        {'data_frame': self.new_table},
        format='json')

    # Refresh the workflow (it has been updated)
    workflow = Workflow.objects.get(id=workflow.id)

    # Load the df from the db
    df = load_table(workflow.get_data_frame_table_name())

    # Transform new table into a data frame
    r_df = pd.DataFrame(self.new_table)
    r_df = detect_datetime_columns(r_df)

    # Compare both elements
    self.compare_tables(r_df, df)

    # Check that the rest of the information is correct
    self.assertTrue(check_wf_df(workflow))

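# self.new_table is not shown in this excerpt; pd.DataFrame(self.new_table)
# implies a column-oriented dict. A hypothetical example of the shape the
# tests assume:
_example_new_table = {
    'email': ['student1@example.com', 'student2@example.com'],
    'when': ['2020-01-01 12:00:00', '2020-01-02 12:00:00'],
    'registered': [True, False]}
# pd.DataFrame(_example_new_table) yields one column per key, and
# detect_datetime_columns would then convert 'when' to datetime64.
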
def to_internal_value(self, data):
    """Create the data frame from the given data, detecting date/time columns."""
    try:
        df = pd.DataFrame(data)

        # Detect date/time columns
        df = detect_datetime_columns(df)
    except Exception as exc:
        raise serializers.ValidationError(exc)

    return df

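# Illustration of how this to_internal_value behaves (names hypothetical):
# any payload that pd.DataFrame rejects becomes a DRF ValidationError
# instead of an uncaught exception bubbling out of the serializer.
def _example_field_validation(field):
    good = field.to_internal_value({'age': [10, 20]})  # -> 2-row DataFrame
    try:
        field.to_internal_value(42)  # not table-like; constructor raises
    except serializers.ValidationError as exc:
        print('rejected:', exc)
    return good
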
def load_df_from_s3(
    aws_key: str,
    aws_secret: str,
    bucket_name: str,
    file_path: str,
    skiprows: Optional[int] = 0,
    skipfooter: Optional[int] = 0,
) -> pd.DataFrame:
    """Load data from a S3 bucket.

    Given the bucket name and file path, try to read the content and
    transform it into a data frame. It also tries to convert as many columns
    as possible to date/time format (testing the conversion on every string
    column).

    :param aws_key: Key to access the S3 bucket

    :param aws_secret: Secret to access the S3 bucket

    :param bucket_name: Bucket name

    :param file_path: Path to access the file within the bucket

    :param skiprows: Number of lines to skip at the top of the document

    :param skipfooter: Number of lines to skip at the bottom of the document

    :return: Resulting data frame, or raise an exception.
    """
    path_prefix = ''
    if aws_key and aws_secret:
        # If key/secret are given, create prefix
        path_prefix = '{0}:{1}@'.format(aws_key, aws_secret)

    if settings.ONTASK_TESTING:
        uri = 'file:///{0}/{1}'.format(bucket_name, file_path)
    else:
        uri = 's3://{0}{1}/{2}'.format(path_prefix, bucket_name, file_path)

    data_frame = pd.read_csv(
        smart_open.open(uri),
        index_col=False,
        infer_datetime_format=True,
        quotechar='"',
        skiprows=skiprows,
        skipfooter=skipfooter,
        encoding='utf-8',
    )

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    return pandas.detect_datetime_columns(data_frame)

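# The URIs built above take these two forms (values are made up):
#   production: 's3://AKIAEXAMPLE:secretEXAMPLE@my-bucket/data/file.csv'
#   testing:    'file:///my-bucket/data/file.csv'
# smart_open.open dispatches on the URI scheme, so the same pd.read_csv call
# covers both. Note that embedding credentials in the URI can leak them into
# logs; smart_open also accepts a boto3 client via transport_params if that
# is a concern.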
def test_table_JSON_get(self):
    # Get the only workflow in the fixture
    workflow = models.Workflow.objects.all()[0]

    # Get the data through the API
    response = self.client.get(
        reverse('table:api_ops', kwargs={'wid': workflow.id}))

    # Transform the response into a data frame
    r_df = pd.DataFrame(response.data['data_frame'])
    r_df = pandas.detect_datetime_columns(r_df)

    # Load the df from the db
    dframe = pandas.load_table(workflow.get_data_frame_table_name())

    # Compare both elements
    self.compare_tables(r_df, dframe)

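# The same endpoint can be exercised outside Django's test client. A sketch
# with the requests library; the URL path, host, and token authentication
# are assumptions inferred from the route name 'table:api_ops', not
# confirmed routes of the project.
import requests


def _example_get_table(wid, token):
    response = requests.get(
        'http://localhost:8000/table/{0}/ops/'.format(wid),
        headers={'Authorization': 'Token {0}'.format(token)})
    return response.json()['data_frame']
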
def test_table_pandas_JSON_get(self):
    # Get the only workflow in the fixture
    workflow = models.Workflow.objects.all()[0]

    # Get the data through the API
    response = self.client.get(
        reverse('table:api_merge', kwargs={'wid': workflow.id}))
    workflow = models.Workflow.objects.all()[0]

    # Transform the src_df in the response into a data frame
    r_df = pd.DataFrame(response.data['src_df'])
    r_df = pandas.detect_datetime_columns(r_df)

    # Load the df from the db
    dframe = pandas.load_table(workflow.get_data_frame_table_name())

    # Compare both elements and check wf df consistency
    self.compare_tables(r_df, dframe)

def test_table_pandas_update(self):
    # Get the only workflow in the fixture
    workflow = models.Workflow.objects.all()[0]

    # Transform new table into a data frame
    r_df = pd.DataFrame(self.new_table)
    r_df = pandas.detect_datetime_columns(r_df)

    # Upload a new table
    self.client.put(
        reverse('table:api_pops', kwargs={'wid': workflow.id}),
        {'data_frame': serializers.df_to_string(r_df)},
        format='json')

    # Refresh the workflow (it has been updated)
    workflow = models.Workflow.objects.get(id=workflow.id)

    # Load the df from the db
    dframe = pandas.load_table(workflow.get_data_frame_table_name())

    # Compare both elements
    self.compare_tables(r_df, dframe)

def load_df_from_excelfile(file_obj, sheet_name: str) -> pd.DataFrame:
    """Load a data frame from a sheet in an excel file.

    Given a file object, try to read the content as an Excel file and
    transform it into a data frame. The sheet_name is the name of the sheet
    to read. It also tries to convert as many columns as possible to
    date/time format (testing the conversion on every string column).

    :param file_obj: File object to read the Excel content

    :param sheet_name: Sheet in the file to read

    :return: Resulting data frame, or raise an exception.
    """
    # read_excel does not accept the CSV-specific arguments
    # infer_datetime_format and quotechar (passing them raises TypeError in
    # recent pandas), so only the supported parameters are used here.
    data_frame = pd.read_excel(
        file_obj,
        sheet_name=sheet_name,
        index_col=False)

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    return pandas.detect_datetime_columns(data_frame)

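# Usage sketch (the file name and sheet name are hypothetical):
def _example_load_excel():
    with open('enrolment.xlsx', 'rb') as file_obj:
        return load_df_from_excelfile(file_obj, 'Sheet1')
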
def batch_load_df_from_athenaconnection(
    workflow: models.Workflow,
    conn: models.AthenaConnection,
    run_params: Dict,
    log_item: models.Log,
):
    """Batch load a DF from an Athena connection.

    run_params contains:

    - aws_secret_access_key: Optional[str] = None
    - aws_session_token: Optional[str] = None
    - table_name: Optional[str] = None
    - key_column_name: Optional[str] = None
    - merge_method: Optional[str] = None

    Example of the underlying pyathena usage::

        from pyathena import connect
        from pyathena.pandas_cursor import PandasCursor

        cursor = connect(
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            aws_session_token=aws_session_token,
            s3_staging_dir=staging_dir,
            region_name=region_name)

        df = pd.read_sql('SELECT * FROM given_table_name', cursor)
        print(df.describe())
        print(df.head())

    :param workflow: Workflow to store the new data

    :param conn: AthenaConnection object with the connection parameters.

    :param run_params: Dictionary with additional connection parameters

    :param log_item: Log object to reflect the status of the execution

    :return: Nothing.
    """
    staging_dir = 's3://{0}'.format(conn.aws_bucket_name)
    if conn.aws_file_path:
        staging_dir = staging_dir + '/' + conn.aws_file_path

    cursor = connect(
        aws_access_key_id=conn.aws_access_key,
        aws_secret_access_key=run_params['aws_secret_access_key'],
        aws_session_token=run_params['aws_session_token'],
        s3_staging_dir=staging_dir,
        region_name=conn.aws_region_name)

    # read_sql_table requires an SQLAlchemy connectable, so fetch the data
    # with read_sql and an explicit SELECT over the given table name.
    data_frame = pd.read_sql(
        'SELECT * FROM {0}'.format(run_params['table_name']),
        cursor)

    # Strip white space from all string columns and try to convert to
    # datetime just in case
    data_frame = pandas.detect_datetime_columns(data_frame)
    pandas.verify_data_frame(data_frame)

    col_names, col_types, is_key = pandas.store_temporary_dataframe(
        data_frame,
        workflow)

    upload_data = {
        'initial_column_names': col_names,
        'col_types': col_types,
        'src_is_key_column': is_key,
        'rename_column_names': col_names[:],
        'columns_to_upload': [True] * len(col_names),
        'keep_key_column': is_key[:]}

    if not workflow.has_data_frame():
        # Regular load operation
        pandas.store_workflow_table(workflow, upload_data)
        log_item.payload['col_names'] = col_names
        log_item.payload['col_types'] = col_types
        log_item.payload['column_unique'] = is_key
        log_item.payload['num_rows'] = workflow.nrows
        log_item.payload['num_cols'] = workflow.ncols
        log_item.save(update_fields=['payload'])
        return

    # Merge operation
    upload_data['dst_column_names'] = workflow.get_column_names()
    upload_data['dst_is_unique_column'] = workflow.get_column_unique()
    # Keep only the destination columns flagged as unique (key) columns
    upload_data['dst_unique_col_names'] = [
        cname for idx, cname in enumerate(upload_data['dst_column_names'])
        if upload_data['dst_is_unique_column'][idx]]
    upload_data['src_selected_key'] = run_params['merge_key']
    upload_data['dst_selected_key'] = run_params['merge_key']
    upload_data['how_merge'] = run_params['merge_method']

    dst_df = pandas.load_table(workflow.get_data_frame_table_name())
    src_df = pandas.load_table(workflow.get_data_frame_upload_table_name())

    try:
        pandas.perform_dataframe_upload_merge(
            workflow,
            dst_df,
            src_df,
            upload_data)
    except Exception as exc:
        # Nuke the temporary table
        sql.delete_table(workflow.get_data_frame_upload_table_name())
        raise Exception(
            _('Unable to perform merge operation: {0}').format(str(exc)))

    col_names, col_types, is_key = workflow.get_column_info()
    log_item.payload['col_names'] = col_names
    log_item.payload['col_types'] = col_types
    log_item.payload['column_unique'] = is_key
    log_item.payload['num_rows'] = workflow.nrows
    log_item.payload['num_cols'] = workflow.ncols
    log_item.save(update_fields=['payload'])

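# Conceptually, the merge branch above reduces to a pandas merge keyed on the
# selected key column. A stripped-down sketch of the idea (the real
# perform_dataframe_upload_merge also handles column renames, type checks,
# and overlapping columns):
def _example_merge(dst_df, src_df, key, how):
    # how is one of 'inner', 'outer', 'left', 'right'
    return pd.merge(dst_df, src_df, how=how, on=key)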