def test_copy(mock_data):
    """Test copying of files."""
    src_dirname, dest_dirname = mock_data
    with psycopg2.connect(port='5432', host='localhost', **db_args) as conn:
        db_table = Table('log_file', conn)
        sensor_file_table = Table('log_sensor_file', conn)
        db_rows = copy_files(src_dirname, dest_dirname, db_table,
                             sensor_file_table)
def test_delete(mock_data):
    """Test deleting files."""
    src_dirname, dest_dirname = mock_data
    with psycopg2.connect(port='5432', host='localhost', **db_args) as conn:
        db_table = Table('log_file', conn)
        delete_files(db_table, target_dir=src_dirname,
                     suitable_dest_dir=dest_dirname,
                     threshold=0.1, older_than=-1)
        df = db_table.query(where='is_deleted=True')
        assert df.src_dirname.isna().all()  # directly written, not copied
        assert (df.dest_dirname == src_dirname).all()
def test_write(mock_data):
    """Test writing files."""
    src_dirname, _ = mock_data
    with psycopg2.connect(port='5432', host='localhost', **db_args) as conn:
        sensor_file_table = Table('log_sensor_file', conn)
        db_table = Table('log_file', conn)

        for id in range(5):
            with NamedTemporaryFile(dir=src_dirname, delete=False) as fp:
                fp.write(b'Hello world!')
            dest_dir, fname = os.path.split(fp.name)
            sensor_file_table.insert_rows(
                [(f'sensor_file_{id}', [fname])],
                cols=['log_sensor_file_id', 'sensor_file_path'])
        write_files(sensor_file_table, db_table, dest_dir)

        # Write a file to a subfolder (created here in case the fixture
        # does not provide it).
        dest_dir2 = os.path.join(src_dirname, 'test_subfolder')
        os.makedirs(dest_dir2, exist_ok=True)
        with NamedTemporaryFile(dir=dest_dir2, delete=False) as fp:
            fp.write(b'test')
        _, fname = os.path.split(fp.name)
        fname = os.path.join('test_subfolder', fname)
        sensor_file_table.insert_rows(
            [(f'sensor_file{id + 1}', [fname])],
            cols=['log_sensor_file_id', 'sensor_file_path'])
        write_files(sensor_file_table, db_table, dest_dir)

        # No error should be thrown if there are no new files to write.
        write_files(sensor_file_table, db_table, dest_dir)
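# The tests above rely on a ``mock_data`` fixture defined elsewhere in the
# test module. The sketch below is an assumption about its shape, not the
# repository's actual fixture: the tests only need a pair of existing
# directories, with a ``test_subfolder`` under the source for test_write.
import pytest


@pytest.fixture
def mock_data(tmp_path):
    """Hypothetical fixture returning (src_dirname, dest_dirname)."""
    src_dirname = tmp_path / 'src'
    dest_dirname = tmp_path / 'dest'
    src_dirname.mkdir()
    dest_dirname.mkdir()
    (src_dirname / 'test_subfolder').mkdir()  # used by test_write
    return str(src_dirname), str(dest_dirname)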
schemaname != 'information_schema'; """ tables = query(query_tables_cmd) table_ids = [table[1] for table in tables] #### Create graph A = pgv.AGraph(directed=True, repulsiveforce=10.0, overlap=False, splines='curved') # Add nodes for table_id in table_ids: table = Table(table_id, conn) rows = [] rows.append(f'<th><td bgcolor="lightsalmon"><b>{table_id}</b></td></th>') for column_name in table.column_names: rows.append(f'' '<tr>' f'<td port="{column_name}">{column_name}</td>' '</tr>') label = "<<table border='0' cellborder='1' cellspacing='0' cellpadding='4'>" label += '\n'.join(rows) label += "</table>>" A.add_node(table_id, shape='plaintext', label=label) # Add edges for table_id in table_ids:
import psycopg2

from neurobooth_terra import Table, create_table
from neurobooth_terra.fixes import OptionalSSHTunnelForwarder
from neurobooth_terra.dataflow import copy_files

from config import ssh_args, db_args

src_dir = '/autofs/nas/neurobooth/data_test/'
dest_dir = '/autofs/nas/neurobooth/data_test_backup/'

with OptionalSSHTunnelForwarder(**ssh_args) as tunnel:
    with psycopg2.connect(port=tunnel.local_bind_port,
                          host=tunnel.local_bind_host, **db_args) as conn:
        sensor_file_table = Table('log_sensor_file', conn)
        db_table = Table('log_file', conn)
        copy_files(src_dir, dest_dir, db_table, sensor_file_table)

# xdf and csv are not represented; other files did not get written due to
# non-graceful exits.
#
# Possible ways of handling retrospective entries added to log_sensor_file:
# 1. They do not affect write_files, assuming the files still exist on NAS;
#    it will simply copy over the new entries to log_file.
# 2. However, retrospective entries should be added for copy operations.
#    For example, if the data was copied from NAS to DrWho, the subsequent
#    rsync operation will not produce an output for that file and hence
#    it will not be added automatically to log_file.
# 3. For deletion, there are two options:
#    a. Retrospectively add date_verified by looking at the file header (?)
#    b. Use date_verified as the current date and let the deletion happen
#       x days later.
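# A minimal sketch of option 3b, assuming log_file carries a date_verified
# column and that the queried dataframe is indexed by the table's primary
# key (both are assumptions; the column and key names are illustrative):
import datetime

with OptionalSSHTunnelForwarder(**ssh_args) as tunnel:
    with psycopg2.connect(port=tunnel.local_bind_port,
                          host=tunnel.local_bind_host, **db_args) as conn:
        db_table = Table('log_file', conn)
        today = datetime.datetime.today().strftime('%Y-%m-%d')
        # Stamp unverified rows with the current date so that deletion
        # kicks in x days later.
        df = db_table.query(where='date_verified IS NULL')
        for pk in df.index:
            db_table.update_row(pk, (today,), cols=['date_verified'])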
import psycopg2

from neurobooth_terra import Table
from neurobooth_terra.fixes import OptionalSSHTunnelForwarder

from config import ssh_args, db_args


def sanitize_date(s):
    """Format the first timestamp of a date_times array as YYYY-MM-DD."""
    return s[0].strftime('%Y-%m-%d')


with OptionalSSHTunnelForwarder(**ssh_args) as tunnel:
    with psycopg2.connect(port=tunnel.local_bind_port,
                          host=tunnel.local_bind_host, **db_args) as conn:
        table_task = Table('log_task', conn)
        table_session = Table('log_session', conn)

        task_df = table_task.query(
            where='log_session_id IS NULL AND date_times IS NOT NULL')
        session_df = table_session.query().reset_index()

        task_df['date_times'] = task_df['date_times'].apply(sanitize_date)
        task_groups = task_df.groupby(by=['subject_id', 'date_times'])

        log_session_id = int(session_df.log_session_id.max() + 1)
        for group, df in task_groups:
            table_session.insert_rows(
                cols=['log_session_id', 'subject_id', 'date',
                      'application_id'],
                vals=[(log_session_id, group[0],
# Don't run rsync on the weekend; run this file on the weekend instead.

if dry_run:
    stats = shutil.disk_usage(target_dir)
    # Set the threshold just below current usage to guarantee that the
    # dry run has something to delete.
    threshold = stats.used / stats.total - 0.1
    older_than = 1
else:
    threshold = 0.9
    older_than = 30

with OptionalSSHTunnelForwarder(**ssh_args) as tunnel:
    with psycopg2.connect(port=tunnel.local_bind_port,
                          host=tunnel.local_bind_host, **db_args) as conn:
        if dry_run:
            # Operate on a scratch copy of the table so that a dry run
            # does not modify the real table.
            copy_table(src_table_id=table_id,
                       target_table_id=table_id + '_copy',
                       conn=conn)
            db_table = Table(table_id + '_copy', conn)
        else:
            db_table = Table(table_id, conn)
        delete_files(db_table, target_dir, suitable_dest_dir,
                     threshold=threshold, older_than=older_than,
                     dry_run=dry_run)
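        # A follow-up sketch for inspecting a dry run: deletions are only
        # marked in the scratch copy, so query it the same way the tests
        # do (assumes is_deleted is the marker column, as in test_delete):
        if dry_run:
            df = db_table.query(where='is_deleted=True')
            print(f'{len(df)} files would be deleted')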
def test_psql_connection():
    """Test that we can connect to the database."""
    conn = psycopg2.connect(connect_str)

    table_id = 'test'
    drop_table(table_id, conn)
    assert 'test' not in list_tables(conn)

    column_names = ['subject_id', 'first_name_birth', 'last_name_birth',
                    'Age']
    dtypes = ['VARCHAR (255)', 'VARCHAR (255)', 'VARCHAR (255)', 'INTEGER']
    table_subject = create_table(table_id, conn=conn,
                                 column_names=column_names, dtypes=dtypes)
    with pytest.raises(Exception, match='already exists'):
        table_subject = create_table(table_id, conn=conn,
                                     column_names=column_names,
                                     dtypes=dtypes)
    conn.rollback()  # XXX: needed for the error testing above

    table_subject.insert_rows([('x5dc', 'mainak', 'jas', 21),
                               ('y5d3', 'anoopum', 'gupta', 25)],
                              cols=column_names)
    with pytest.raises(ValueError, match='vals must be a list of tuple'):
        table_subject.insert_rows('blah', ['subject_id'])
    with pytest.raises(ValueError, match='entries in vals must be tuples'):
        table_subject.insert_rows(['blah'], ['subject_id'])
    with pytest.raises(ValueError, match='tuple length must match'):
        table_subject.insert_rows([('x5dc', 'mainak', 'jas')],
                                  ['subject_id'])

    assert sorted(table_subject.data_types) == sorted(dtypes)
    table_subject.close()

    table_test = Table('test', conn)
    assert table_test.primary_key == ['subject_id']

    # test updating a row
    table_test.update_row('y5d3', ('blah', 'anupum', 'gupta', 32),
                          cols=column_names)
    df = table_test.query()
    assert 'blah' in df.index

    with pytest.raises(ValueError, match='vals must be a tuple'):
        table_test.update_row('blah', 'mainak', ['first_name_birth'])

    # test updating a row partially
    table_test.update_row('blah', ('mainak',), ['first_name_birth'])
    df = table_test.query()
    assert df.loc['blah']['first_name_birth'] == 'mainak'

    with pytest.raises(ValueError, match='column blah is not present'):
        table_test.update_row('blah', ('mainak',), ['blah'])

    with pytest.raises(ValueError, match='length of vals'):
        table_test.update_row('blah', ('mainak',),
                              ['first_name_birth', 'last_name_birth'])

    # test dropping a column
    table_test.drop_column(col='subject_id')
    assert 'subject_id' not in table_test.column_names

    # test adding an auto-incrementing default value to the column
    table_test.add_column(col='subject_id', dtype='VARCHAR')
    table_test.alter_column(col='subject_id', default=dict(prefix='SUBJ'))
    pk_val = table_test.insert_rows(
        [('mainak', 'jas', 21)],
        cols=['first_name_birth', 'last_name_birth', 'Age'])
    assert pk_val == 'SUBJ1'
    df = table_test.query()
    assert 'SUBJ1' in df.index

    # test insertion of date
    table_id = 'test_consent'
    drop_table(table_id, conn)
    column_names = ['subject_id', 'consent_date']
    dtypes = ['VARCHAR (255)', 'date']
    table_consent = create_table(table_id, conn, column_names, dtypes)
    date = datetime.datetime.today().strftime('%Y-%m-%d')
    table_consent.insert_rows([('x5dc', date)], cols=column_names)

    conn.close()
metadata.to_csv('data_dictionary_modified.csv')

table_infos = get_tables_structure(metadata,
                                   include_surveys=survey_ids.keys())

metadata = metadata.reset_index()
rows_metadata, cols_metadata = dataframe_to_tuple(
    metadata,
    df_columns=['field_name', 'redcap_form_name', 'database_table_name',
                'redcap_form_description', 'feature_of_interest', 'question',
                'response_array'])

with OptionalSSHTunnelForwarder(**ssh_args) as tunnel:
    with psycopg2.connect(port=tunnel.local_bind_port,
                          host=tunnel.local_bind_host, **db_args) as conn:
        table_metadata = Table('rc_data_dictionary', conn)
        table_metadata.insert_rows(rows_metadata, cols_metadata,
                                   on_conflict='update')

        # for table_id, table_info in table_infos.items():
        for table_id, _ in survey_ids.items():
            table_info = table_infos[table_id]
            print(f'Overwriting table {table_id}')
            drop_table('rc_' + table_id, conn)
            table = create_table('rc_' + table_id, conn,
                                 table_info['columns'], table_info['dtypes'],
                                 primary_key=['subject_id',
                                              'redcap_event_name'])
            df = fetch_survey(project, survey_name=table_id,
                              survey_id=survey_ids[table_id])
            df = df.rename(columns={'record_id': 'subject_id'})