def build_database(catalogue, foldername, indices=(), extra_function=None):
    """Build the database tables for one catalogue.

    The file list comes from ``get_files(catalogue, foldername)`` and the
    column definitions from ``parse_readme(foldername)``. For every entry in
    the parsed definitions the table ``<foldername>_<name before first dot>``
    is dropped, recreated, filled from all files whose name starts with the
    entry's file name, and indexed.

    Args:
        catalogue (str): catalogue name, passed on to ``get_files``.
        foldername (str): passed on to ``get_files``, ``parse_readme`` and
            ``parse_datafile``; also used as the table-name prefix.
        indices (iterable): column names to index. A name that is not a
            column of a given table is skipped for that table.
        extra_function (callable): optional zero-argument callable that is
            invoked once after all tables are built.
    """
    # print("") rather than a bare ``print`` statement: the statement form is
    # a SyntaxError on Python 3, while print("") emits the same empty line on
    # both Python 2 and Python 3.
    print("")
    print("Building database for {} ({})".format(catalogue, foldername))
    t1 = time.time()

    files = get_files(catalogue, foldername)
    datadicts = parse_readme(foldername)
    db = SkyMapDatabase()

    for f, dds in datadicts.items():
        table = "{}_{}".format(foldername, f.split(".")[0])
        db.drop_table(table)

        columns = []
        lc_columns = set()
        datatypes = []
        for dd in dds:
            label = dd["label"]

            # Column names are compared case-insensitively; a clash gets a
            # numeric suffix. The suffix is always appended to the original
            # label, so multi-digit suffixes ("_10", "_11", ...) stay correct
            # instead of being built on a partially stripped previous name.
            c = label
            i = 1
            while c.lower() in lc_columns:
                c = "{}_{}".format(label, i)
                i += 1

            lc_columns.add(c.lower())
            columns.append(c)
            datatypes.append(dd['format'])

        db.create_table(table, columns, datatypes)

        # One definition can match several data files (prefix match on name).
        real_files = [fn for fn in files if fn.startswith(f)]
        for real_file in real_files:
            parse_datafile(db, foldername, real_file, table, dds, columns)

        # Index only the requested columns that this table actually has.
        for ind in indices:
            if ind in columns:
                db.add_index(table, ind)

    t2 = time.time()
    print("")
    print("")
    print("Time: {} s".format(t2 - t1))

    if extra_function:
        extra_function()
def split_tyc():
    """Split the ``TYC`` column of ``hiptyc_tyc_main`` into three INT columns.

    Steps, all executed through a ``SkyMapDatabase`` instance:

    1. Add the columns ``TYC1``, ``TYC2`` and ``TYC3`` after ``TYC``.
    2. (Re)create the SQL function ``SPLIT_TYC(str, pos)``, which trims the
       string, collapses runs of spaces and returns the ``pos``-th
       space-separated field cast to an unsigned integer.
    3. Fill the new columns with fields 1, 2 and 3 of ``TYC``.
    4. Index each new column and add a unique combined index named ``TYC``
       over all three.
    """
    db = SkyMapDatabase()

    db.commit_query(
        " ALTER TABLE hiptyc_tyc_main"
        " ADD COLUMN `TYC1` INT AFTER `TYC`,"
        " ADD COLUMN `TYC2` INT AFTER `TYC1`,"
        " ADD COLUMN `TYC3` INT AFTER `TYC2` "
    )

    db.commit_query("""DROP FUNCTION IF EXISTS SPLIT_TYC""")

    # The WHILE loop must look for (and replace) a DOUBLE space. Replacing a
    # single space with a single space never changes the string, so the loop
    # would never terminate for any value that contains a space.
    db.commit_query(
        " CREATE FUNCTION SPLIT_TYC(str VARCHAR(255), pos INT) RETURNS INT"
        " BEGIN"
        " SET str = TRIM(str);"
        " WHILE INSTR(str, '  ') > 0 DO"
        " SET str = REPLACE(str, '  ', ' ');"
        " END WHILE;"
        " SET str = REPLACE("
        " SUBSTRING("
        " SUBSTRING_INDEX(str, ' ', pos),"
        " CHAR_LENGTH( SUBSTRING_INDEX(str, ' ', pos - 1) ) + 1"
        " ) , ' ', '' );"
        " RETURN CAST(str AS UNSIGNED);"
        " END; "
    )

    db.commit_query(
        " UPDATE hiptyc_tyc_main SET"
        " TYC1=SPLIT_TYC(TYC, 1),"
        " TYC2=SPLIT_TYC(TYC, 2),"
        " TYC3=SPLIT_TYC(TYC, 3) "
    )

    db.add_index("hiptyc_tyc_main", "TYC1")
    db.add_index("hiptyc_tyc_main", "TYC2")
    db.add_index("hiptyc_tyc_main", "TYC3")
    db.add_multiple_column_index(
        "hiptyc_tyc_main", ("TYC1", "TYC2", "TYC3"), "TYC", unique=True
    )
def split_tyc():
    """Derive integer ``TYC1``/``TYC2``/``TYC3`` columns from ``TYC``.

    Works on the table ``hiptyc_tyc_main``: the three INT columns are added
    after ``TYC`` and filled from fixed character ranges of ``TYC``
    (``substr`` start/length 1/4, 5/6 and 11/2), each cast to an unsigned
    integer. Afterwards every new column gets its own index, and a unique
    combined index named ``TYC`` is created over all three.
    """
    table_name = "hiptyc_tyc_main"
    part_columns = ("TYC1", "TYC2", "TYC3")

    add_columns_sql = (
        " ALTER TABLE hiptyc_tyc_main"
        " ADD COLUMN `TYC1` INT AFTER `TYC`,"
        " ADD COLUMN `TYC2` INT AFTER `TYC1`,"
        " ADD COLUMN `TYC3` INT AFTER `TYC2` "
    )
    fill_columns_sql = (
        " UPDATE hiptyc_tyc_main SET"
        " TYC1=CAST(substr(TYC, 1, 4) AS UNSIGNED),"
        " TYC2=CAST(substr(hiptyc_tyc_main.TYC, 5, 6) AS UNSIGNED),"
        " TYC3=CAST(substr(hiptyc_tyc_main.TYC, 11, 2) AS UNSIGNED) "
    )

    database = SkyMapDatabase()

    # Schema change first, then populate the freshly added columns.
    for statement in (add_columns_sql, fill_columns_sql):
        database.commit_query(statement)

    # Single-column indices, followed by the unique composite index.
    for column in part_columns:
        database.add_index(table_name, column)
    database.add_multiple_column_index(table_name, part_columns, "TYC", unique=True)
def build_database(catalogue, foldername, indices=(), extra_function=None):
    """Downloads the datafiles for a catalog and builds a local database for it.

    For every file described by ``parse_readme(foldername)`` the table
    ``<foldername>_<name before first dot>`` is dropped, recreated and filled
    from all downloaded files whose name starts with the described file name.
    Indices are added once all tables have been filled.

    Args:
        catalogue (str): the name of the catalog
        foldername (str): the folder where to save the data; also used as the
            table-name prefix
        indices (iterable): the columns to generate indices for; a column that
            does not exist in a given table is skipped for that table
        extra_function (function): a zero-argument function to call after the
            database is built
    """
    print()
    print(f"Building database for {catalogue} ({foldername})")
    t1 = time.time()

    files = download_files(catalogue, foldername)
    datadicts = parse_readme(foldername)
    db = SkyMapDatabase()

    column_name_dict = {}
    for filename, coldefs in datadicts.items():
        datatypes = [coldef["format"] for coldef in coldefs]

        # SQL is case insensitive, and Vizier sometimes has column names in the
        # same file that have equivalent names. So, the column names are
        # checked and a numeric suffix is added when needed. The suffix is
        # always appended to the original label, which keeps multi-digit
        # suffixes ("_10", "_11", ...) correct.
        column_names = []
        taken_lowercase = set()
        for coldef in coldefs:
            label = coldef["label"]
            column_name = label
            i = 1
            while column_name.lower() in taken_lowercase:
                column_name = f"{label}_{i}"
                i += 1
            taken_lowercase.add(column_name.lower())
            column_names.append(column_name)

        file_stem = filename.split(".")[0]
        table = f"{foldername}_{file_stem}"
        column_name_dict[table] = column_names

        # Clear the database table
        db.drop_table(table)
        db.create_table(table, column_names, datatypes)

        # For large catalogs, the data can be spread over multiple files, so
        # loop over all files that share the described file's name as prefix
        real_files = [fn for fn in files if fn.startswith(filename)]
        for real_file in real_files:
            parse_datafile(db, foldername, real_file, table, coldefs, column_names)

    # Add indices, only for the columns each table actually has
    for table, column_names in column_name_dict.items():
        for ind in indices:
            if ind in column_names:
                db.add_index(table, ind)

    t2 = time.time()
    print()
    print()
    print(f"Time: {t2-t1} s")

    if extra_function:
        extra_function()