def main():
    if not table_empty(TABLE_NAME):
        print(f"Table '{TABLE_NAME}' already populated, skipping loading of new records")
        return

    path_in = os.path.join(data_dir, FILE_NAME_IN)
    path_out = os.path.join(data_dir, FILE_NAME_OUT)
    statement = load_statement.format(dbname=sql.Identifier(TABLE_NAME))

    with open(path_in) as infile:
        reader = csv.DictReader(infile, delimiter=';')
        fieldnames = reader.fieldnames
        with open(path_out, 'w', newline='') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames, delimiter=';')
            writer.writeheader()
            for row in reader:
                # Strip surrounding white space and remove the '.'
                # thousands separators from the values.
                row.update({
                    fieldname: value.strip().replace('.', '')
                    for (fieldname, value) in row.items()
                })
                writer.writerow(row)

    connection = get_connection()
    cursor = connection.cursor()
    try:
        print("Loading CBS energy use records, this might take a minute or so.")
        cursor.execute(statement, (path_out,))
        cursor.close()
        connection.commit()
        connection.close()
        print("Done.")
    except UndefinedFile:
        print(f"\nError: CBS energy use data file not found.\nExpected file at {path_out}.")
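# For reference, `load_statement` is defined elsewhere in this module. Since
# the loader passes a server-side file path and catches UndefinedFile, it is
# presumably a COPY; a minimal sketch, with the exact options being an
# assumption rather than something taken from this file:
#
#     load_statement = sql.SQL(
#         "COPY {dbname} FROM %s WITH (FORMAT csv, DELIMITER ';', HEADER)"
#     )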
def delete_labels_without_vbo_id():
    # Around 88 000 energy labels have no associated vbo_id; this is
    # unhelpful in analysis because we can't link them to building years
    # or dwelling types from the BAG.
    # Note: delete_labels_not_in_bag() is not enough, since it ignores
    # labels with NULL as vbo_id.
    print('Deleting labels without vbo_id...')
    delete_statement = "DELETE FROM energy_labels WHERE vbo_id IS NULL"
    connection = get_connection()
    cursor = connection.cursor()
    cursor.execute(delete_statement)
    print(f'\tDeleted {cursor.rowcount} labels')

    # After this, we can set the column to not accept NULL values anymore.
    not_null_statement = '''
    ALTER TABLE energy_labels
    ALTER COLUMN vbo_id SET NOT NULL'''
    print('\tSetting column to NOT NULL')
    cursor.execute(not_null_statement)

    cursor.close()
    connection.commit()
    connection.close()
def main():
    if not table_empty(TABLE_NAME):
        print(f"Table '{TABLE_NAME}' already populated, skipping loading of new records")
        return

    path = os.path.join(data_dir, FILE_NAME)
    statement = load_statement.format(dbname=sql.Identifier(TABLE_NAME))

    connection = get_connection()
    cursor = connection.cursor()
    try:
        print("Loading RVO Warmtenetten records, this might take a minute or so.")
        cursor.execute(statement, (path,))
        cursor.close()
        connection.commit()
        connection.close()
        print("Done.")
    except UndefinedFile:
        print(f"\nError: RVO Warmtenetten data file not found.\nExpected file at {path}.")
def get_huri_rel_cegsb01_data(std_batches):
    connect = database_utils.get_connection()
    cursor = connect.cursor()
    disease_table = 'hi2018_paper.OMIM_disease_tissue'

    # Read the list of GTEx tissues to analyze.
    with open('../data/katjas_data/GTEx/expression_datasets/'
              'GTExv6_JPnorm_subtypes_no_cell_no_testis.txt', 'r') as file1:
        tissues = [line[:-1] for line in file1.readlines()]

    analysis1 = Analysis('analysis1')
    analysis1.tissues = tissues
    analysis1.get_std_batch_assay_id_dict(cursor, std_batches)
    analysis1.load_diseases(cursor, disease_table)
    print('Number of diseases loaded:', len(analysis1.diseases))
    analysis1.load_genes(cursor, tissues)
    print('Number of genes loaded:', len(analysis1.genes))
    analysis1.load_disease_alleles(cursor)
    print('Number of alleles loaded:', len(analysis1.alleles))
    analysis1.load_interactions(cursor)
    print('Number of interactions loaded:', len(analysis1.interactions))
    return analysis1
def format_bioplex_complexes(infile, outfile):
    connect = database_utils.get_connection()
    df_bp = pandas.read_table(infile)

    # Map Entrez gene IDs to Ensembl gene IDs.
    query = """select distinct ensembl_gene_id_short, entrez_gene_id
               from horfeome_annotation_gencode27.gencode2entrez
               where entrez_gene_id is not NULL"""
    geneid_ensg_map = pandas.read_sql(query, connect)
    df_bp = df_bp.merge(geneid_ensg_map,
                        left_on='GeneID',
                        right_on='entrez_gene_id',
                        how='inner').drop(['entrez_gene_id'], axis=1)

    # Collect the set of BioPlex cluster IDs per gene; report genes that
    # belong to more than one cluster.
    gene_ids = df_bp['ensembl_gene_id_short'].unique()
    annot_dict = {}
    for gene_id in gene_ids:
        annot_dict[gene_id] = set(
            df_bp.loc[df_bp['ensembl_gene_id_short'] == gene_id,
                      'Cluster Number'].values)
        if len(annot_dict[gene_id]) > 1:
            print(gene_id)

    target = open(outfile, 'w')
    target.write('ensembl_gene_id\tBioplex_complex_ids\n')
    for gene_id, cluster_ids in annot_dict.items():
        target.write(gene_id + '\t' +
                     '|'.join([str(i) for i in sorted(cluster_ids)]) + '\n')
    target.close()
def load_cbs_table(table_id, typed_data_set=False):
    '''
    Create a Postgres table with the required structure, download the data
    from CBS, and load the data into the Postgres table.
    '''
    try:
        table_name = get_sanitized_cbs_table_title(table_id, typed_data_set)
    except ConnectionError as e:
        print(f'\tERROR! Could not load CBS table due to a ConnectionError. Are you connected to the internet? Error:\n\t{e}')
        # Do not continue processing.
        return

    print(f'Loading CBS table {table_name}')

    # Do not load data if the table already exists.
    if table_exists(table_name):
        print('Table already exists, skipping downloading and loading.')
    else:
        print('Creating table...')
        create_table_for_cbs_table(table_id, table_name)

        print('Downloading data...')
        if typed_data_set:
            data = cbsodata.get_meta(table_id, 'TypedDataSet')
        else:
            data = cbsodata.get_data(table_id)

        print('Inserting data...')
        sanitize_key_dict = get_sanitize_key_dict(table_id)

        connection = get_connection()
        cursor = connection.cursor()
        for row in data:
            # Before we can insert the data, we have to manipulate the key
            # so it matches the columns in the created Postgres table, and
            # we might need to trim the white space from the values.
            row_dict = {
                sanitize_key_dict[key]: sanitize_data(value)
                for key, value in row.items()
            }
            # TODO: it's probably more efficient to just pass a list,
            # I think the required keys are always there. And then we can
            # more easily batch these INSERTs.
            insert_dict(table_name, row_dict, cursor)
        cursor.close()
        connection.commit()
        connection.close()

        add_indices(table_name)
    print()
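# A minimal sketch of the batching mentioned in the TODO above, using
# psycopg2's execute_values helper. The helper name and the assumption that
# every sanitized row shares the same keys are mine, not this module's:
from psycopg2.extras import execute_values

def insert_dicts_batched(table_name, rows, cursor, page_size=1000):
    '''Batch-insert a list of dicts that all share the same keys.'''
    if not rows:
        return
    columns = list(rows[0].keys())
    statement = sql.SQL('INSERT INTO {table} ({columns}) VALUES %s').format(
        table=sql.Identifier(table_name),
        columns=sql.SQL(', ').join(map(sql.Identifier, columns)))
    values = [[row[column] for column in columns] for row in rows]
    execute_values(cursor, statement.as_string(cursor), values,
                   page_size=page_size)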
def main():
    path = os.path.join(data_dir, FILE_NAME)
    statement = load_statement.format(dbname=sql.Identifier(TABLE_NAME))

    connection = get_connection()
    cursor = connection.cursor()
    try:
        print("Loading household consumption records")
        cursor.execute(statement, (path,))
    except UndefinedFile:
        print(f"\nError: household consumption data file not found.\nExpected file at {path}.")

    cursor.close()
    connection.commit()
    connection.close()
def main():
    try:
        with open(path, 'r') as file:
            header = file.readline().strip()
            column_names = header.split(';')
    except FileNotFoundError:
        print(f"Error: WoON survey data file not found.\nExpected file at {path}.")
        return

    columns = [(sanitize_column_name(column_name), 'text')
               for column_name in column_names]

    print('Creating table...')
    create_table(TABLE_NAME, columns)

    if not table_empty(TABLE_NAME):
        print(f"Table '{TABLE_NAME}' already populated, skipping loading of new records")
        return

    connection = get_connection()
    cursor = connection.cursor()

    # TODO: make this idempotent somehow?
    statement = load_statement.format(table_name=sql.Identifier(TABLE_NAME))
    try:
        print("Loading WoON survey data...")
        cursor.execute(statement, (path,))
    except UndefinedFile:
        print(f"Error: WoON survey data file not found.\nExpected file at {path}.")

    columns_to_alter = ['ew_huis', 'ew_pers']
    for column in columns_to_alter:
        alter_column_to_number(cursor, column)

    print("Committing...")
    cursor.close()
    connection.commit()
    connection.close()
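# `alter_column_to_number` is defined elsewhere; a plausible sketch of what
# it might do, where the target type and the comma-to-dot decimal conversion
# are assumptions about the WoON export, not verified here:
#
#     def alter_column_to_number(cursor, column):
#         cursor.execute(sql.SQL('''
#             ALTER TABLE {table}
#             ALTER COLUMN {column} TYPE double precision
#             USING replace({column}, ',', '.')::double precision
#         ''').format(table=sql.Identifier(TABLE_NAME),
#                     column=sql.Identifier(column)))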
def load_energy_labels_data():
    connection = get_connection()
    cursor = connection.cursor()

    path = os.path.join(data_dir, FILENAME)
    insert_statement = sql.SQL('INSERT INTO {table_name} VALUES %s').format(
        table_name=sql.Identifier(env['EP_ONLINE_DBNAME']))

    # Bind variables so we can pass a unary function to fast_iter.
    bounded_handle_element = partial(handle_element,
                                     cursor=cursor,
                                     insert_statement=insert_statement)

    context = ET.iterparse(path, tag='Pandcertificaat')
    fast_iter(context, bounded_handle_element)

    cursor.close()
    connection.commit()
    connection.close()
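# `fast_iter` is defined elsewhere; the tag-filtered iterparse above suggests
# lxml.etree, and the name matches the well-known streaming pattern that
# clears each element after handling it, keeping memory use flat on large XML
# files. A minimal sketch, assuming that pattern:
#
#     def fast_iter(context, func):
#         for _event, element in context:
#             func(element)
#             element.clear()
#             while element.getprevious() is not None:
#                 del element.getparent()[0]
#         del context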
def delete_labels_not_in_bag():
    # Side note: my naive first query was:
    #   DELETE FROM energy_labels
    #   WHERE vbo_id NOT IN (SELECT vbo_id FROM bag)
    # but that takes ages! This LEFT JOIN method seems backwards ("JOIN and
    # then check for NULL to see if the JOIN was successful?!"), but it is
    # much faster: Postgres can plan it as an anti-join, which it typically
    # cannot do for NOT IN because of how NOT IN treats NULLs.
    print('\tDeleting labels not in BAG...')
    delete_statement = '''
    DELETE FROM energy_labels
    WHERE vbo_id IN (
        SELECT energy_labels.vbo_id
        FROM energy_labels
        LEFT JOIN bag ON energy_labels.vbo_id = bag.vbo_id
        WHERE bag.vbo_id IS NULL
    )'''
    connection = get_connection()
    cursor = connection.cursor()
    cursor.execute(delete_statement)
    print(f'\tDeleted {cursor.rowcount} labels')
    cursor.close()
    connection.commit()
    connection.close()
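# An equivalent anti-join formulation uses NOT EXISTS, which Postgres plans
# the same way; shown only as an alternative sketch, not what this module
# runs:
#
#     DELETE FROM energy_labels
#     WHERE NOT EXISTS (
#         SELECT 1 FROM bag WHERE bag.vbo_id = energy_labels.vbo_id
#     )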
                outstring = outstring[:-1]
            else:
                outstring = str(edge[attr])
            target.write('\t' + outstring)
        target.write('\n')
    target.close()


if __name__ == '__main__':
    network_name = sys.argv[1]
    tissues = sys.argv[2].split(',')
    output_folder = sys.argv[3]

    connect = database_utils.get_connection()
    cursor = connect.cursor()

    with open(config.NT_cond_file, 'r') as file1:
        all_tissues = file1.readlines()
    all_tissues = [c[:-1] for c in all_tissues]
    if len(tissues) == 1 and tissues[0] == 'all':
        tissues = all_tissues

    print('load gene expression data')
    GTEx = pandas.read_table(config.NT_expr_file, index_col=0)
    TiPmatrix = pandas.read_table(config.NT_pref_expr_file, index_col=0)
    GTEx_tissues = TiPmatrix.columns
    for tissue in GTEx_tissues: