def format_file(f, full_path, directory, ext):
    global difference_files
    # read the current contents of the file
    # use a separate handle so the file name 'f' stays intact for get_formatted_text below
    infile = open_utf8(full_path, 'r')
    old_lines = infile.read().split('\n')
    infile.close()

    new_text = get_formatted_text(f, full_path, directory, ext)
    if check_only:
        new_lines = new_text.split('\n')
        old_lines = [x for x in old_lines if '...' not in x]
        new_lines = [x for x in new_lines if '...' not in x]
        diff_result = difflib.unified_diff(old_lines, new_lines)
        total_diff = ""
        for diff_line in diff_result:
            total_diff += diff_line + "\n"
        total_diff = total_diff.strip()

        if len(total_diff) > 0:
            print("----------------------------------------")
            print("----------------------------------------")
            print("Found differences in file " + full_path)
            print("----------------------------------------")
            print("----------------------------------------")
            print(total_diff)
            difference_files.append(full_path)
    else:
        tmpfile = full_path + ".tmp"
        outfile = open_utf8(tmpfile, 'w+')
        outfile.write(new_text)
        outfile.close()
        os.rename(tmpfile, full_path)

def copy_if_different(src, dest):
    if os.path.isfile(dest):
        # dest exists, check if the files are different
        with open_utf8(src, 'r') as f:
            source_text = f.read()
        with open_utf8(dest, 'r') as f:
            dest_text = f.read()
        if source_text == dest_text:
            # print("Skipping copy of " + src + ", identical copy already exists at " + dest)
            return
    # print("Copying " + src + " to " + dest)
    shutil.copyfile(src, dest)

def get_file_contents(fpath, add_line_numbers=False):
    with open_utf8(fpath, 'r') as f:
        result = f.read()
    if add_line_numbers:
        return '#line 1 "%s"\n' % (fpath, ) + result
    else:
        return result

def generate_amalgamation(source_file, header_file):
    # construct duckdb.hpp from these headers
    generate_duckdb_hpp(header_file)

    # now construct duckdb.cpp
    print("------------------------")
    print("-- Writing " + source_file + " --")
    print("------------------------")

    # scan all the .cpp files
    with open_utf8(temp_source, 'w+') as sfile:
        header_file_name = header_file.split(os.sep)[-1]
        sfile.write('#include "' + header_file_name + '"\n\n')
        sfile.write("#ifndef DUCKDB_AMALGAMATION\n#error header mismatch\n#endif\n\n")
        sfile.write("#if (!defined(DEBUG) && !defined NDEBUG)\n#define NDEBUG\n#endif\n\n")

        for compile_dir in compile_directories:
            sfile.write(write_dir(compile_dir))

        sfile.write('\n\n/*\n')
        license_idx = 0
        for license in licenses:
            sfile.write("\n\n\n### THIRD PARTY LICENSE #%s ###\n\n" % str(license_idx + 1))
            sfile.write(write_file(license))
            license_idx += 1
        sfile.write('\n\n*/\n')

    copy_if_different(temp_header, header_file)
    copy_if_different(temp_source, source_file)
    try:
        os.remove(temp_header)
        os.remove(temp_source)
    except:
        pass

def gather_file(current_file, source_files, header_files):
    global linenumbers
    global written_files
    if not need_to_write_file(current_file, False):
        return ""
    written_files[current_file] = True

    # first read this file
    with open_utf8(current_file, 'r') as f:
        text = f.read()

    (statements, includes) = get_includes(current_file, text)
    # find the linenr of the final #include statement we parsed
    if len(statements) > 0:
        index = text.find(statements[-1])
        linenr = len(text[:index].split('\n'))

    # now write all the dependencies of this header first
    for i in range(len(includes)):
        include_text = write_file(includes[i])
        if linenumbers and i == len(includes) - 1:
            # for the last include statement, we also include a #line directive
            include_text += '\n#line %d "%s"\n' % (linenr, current_file)
        if includes[i].endswith('.cpp') or includes[i].endswith('.cc') or includes[i].endswith('.c'):
            # source file inclusions are inlined into the main text
            text = text.replace(statements[i], include_text)
        else:
            text = text.replace(statements[i], '')
            header_files.append(include_text)

    # add the initial line here
    if linenumbers:
        text = '\n#line 1 "%s"\n' % (current_file,) + text
    source_files.append(cleanup_file(text))

def write_file(current_file, ignore_excluded=False):
    global linenumbers
    global written_files
    if not need_to_write_file(current_file, ignore_excluded):
        return ""
    written_files[current_file] = True

    # first read this file
    with open_utf8(current_file, 'r') as f:
        text = f.read()

    if current_file.startswith("third_party") and not current_file.endswith("LICENSE"):
        lic_idx = find_license(current_file)
        text = "\n\n// LICENSE_CHANGE_BEGIN\n// The following code up to LICENSE_CHANGE_END is subject to THIRD PARTY LICENSE #%s\n// See the end of this file for a list\n\n" % str(lic_idx + 1) + text + "\n\n// LICENSE_CHANGE_END\n"

    (statements, includes) = get_includes(current_file, text)
    # find the linenr of the final #include statement we parsed
    if len(statements) > 0:
        index = text.find(statements[-1])
        linenr = len(text[:index].split('\n'))

    # now write all the dependencies of this header first
    for i in range(len(includes)):
        include_text = write_file(includes[i])
        if linenumbers and i == len(includes) - 1:
            # for the last include statement, we also include a #line directive
            include_text += '\n#line %d "%s"\n' % (linenr, current_file)
        text = text.replace(statements[i], include_text)

    # add the initial line here
    if linenumbers:
        text = '\n#line 1 "%s"\n' % (current_file,) + text
    # print(current_file)
    # now read the header and write it
    return cleanup_file(text)

def format_tpch_queries(target_dir, tpch_in, comment):
    with open_utf8(tpch_in, 'r') as f:
        text = f.read()
    for i in range(1, 23):
        qnr = '%02d' % (i, )
        target_file = os.path.join(target_dir, 'q' + qnr + '.benchmark')
        new_text = '''# name: %s
# description: Run query %02d from the TPC-H benchmark (%s)
# group: [sf1]

template %s
QUERY_NUMBER=%d
QUERY_NUMBER_PADDED=%02d''' % (target_file, i, comment, tpch_in, i, i)
        with open_utf8(target_file, 'w+') as f:
            f.write(new_text)

def analyze_include_file(fpath, already_included_files, prev_include=""):
    if fpath in already_included_files:
        return
    if fpath in amalgamation.always_excluded:
        return
    if fpath not in cached_includes:
        # print(fpath)
        with open_utf8(fpath, 'r') as f:
            text = f.read()
        (statements, includes) = amalgamation.get_includes(fpath, text)
        cached_includes[fpath] = includes
    else:
        includes = cached_includes[fpath]

    if fpath in include_counts:
        include_counts[fpath] += 1
    else:
        include_counts[fpath] = 1

    if fpath not in include_chains:
        include_chains[fpath] = {}
    if prev_include not in include_chains[fpath]:
        include_chains[fpath][prev_include] = 0
    include_chains[fpath][prev_include] += 1

    already_included_files.append(fpath)
    if fpath.endswith('.h') or fpath.endswith('.hpp'):
        prev_include = fpath
    for include in includes:
        analyze_include_file(include, already_included_files, prev_include)

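# The counters filled in by analyze_include_file can be summarized after the analysis pass.
# A minimal sketch, assuming the include_counts/include_chains globals above; the helper name
# print_include_report is hypothetical and not part of the original script:
def print_include_report(top_n=10):
    # most frequently included files first
    for fpath in sorted(include_counts, key=include_counts.get, reverse=True)[:top_n]:
        print("%4d  %s" % (include_counts[fpath], fpath))
        # show which headers pulled this file in, and how often
        for parent, count in sorted(include_chains[fpath].items(), key=lambda kv: -kv[1]):
            print("        %4d via %s" % (count, parent if parent else "<top level>"))
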
def generate_parquet_amalgamation(source_file, header_file):
    # construct duckdb.hpp from these headers
    generate_parquet_hpp(header_file)

    print("------------------------")
    print("-- Writing " + source_file + " --")
    print("------------------------")

    # scan all the .cpp files
    with open_utf8(temp_source, 'w+') as sfile:
        header_file_name = header_file.split(os.sep)[-1]
        sfile.write('''#include "duckdb.hpp"

#ifdef DUCKDB_AMALGAMATION
#ifndef DUCKDB_AMALGAMATION_EXTENDED
#error Parquet amalgamation requires extended DuckDB amalgamation (--extended)
#endif
#endif

''')
        sfile.write('#include "' + header_file_name + '"\n\n')

        for compile_dir in compile_directories:
            sfile.write(amalgamation.write_dir(compile_dir))

    amalgamation.copy_if_different(temp_header, header_file)
    amalgamation.copy_if_different(temp_source, source_file)
    try:
        os.remove(temp_header)
        os.remove(temp_source)
    except:
        pass

def generate_unity_build(entries, unity_name, linenumbers):
    ub_file = os.path.join(target_dir, f'ub_{unity_name}.cpp')
    with open_utf8(ub_file, 'w+') as f:
        for entry in entries:
            if linenumbers:
                f.write('#line 0 "{}"\n'.format(convert_backslashes(entry)))
            f.write('#include "{}"\n\n'.format(convert_backslashes(entry)))
    return ub_file

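# Hedged usage sketch for generate_unity_build above. The file names are illustrative and
# target_dir is assumed to be set by the surrounding script; example_unity_build itself is
# not part of the original code. It bundles a group of sources into one unity-build
# translation unit so the build compiles a single generated file instead of each source.
def example_unity_build():
    sources = ['src/common/types.cpp', 'src/common/vector.cpp']
    ub_file = generate_unity_build(sources, 'common', linenumbers=True)
    # ub_file now contains, for every entry:
    #   #line 0 "src/common/types.cpp"
    #   #include "src/common/types.cpp"
    return ub_file
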
def generate_unity_build(entries, idx, linenumbers):
    ub_file = os.path.join(target_dir, 'amalgamation-{}.cpp'.format(str(idx)))
    with open_utf8(ub_file, 'w+') as f:
        for entry in entries:
            if linenumbers:
                f.write('#line 0 "{}"\n'.format(convert_backslashes(entry)))
            f.write('#include "{}"\n\n'.format(convert_backslashes(entry)))
    return ub_file

def rewrite(file_in, file_out):
    # print(file_in)
    a_file = open_utf8(file_in, "r")
    out = open_utf8(file_out, "a")
    for line in a_file:
        if '#pragma once' in line:
            continue
        found = False
        for header in headers:
            if header in line:
                found = True
                break
        if found:
            out.write("// %s" % line)
        else:
            out.write(line)
    out.write("\n")
    out.close()

def generate_parquet_hpp(header_file):
    print("-----------------------")
    print("-- Writing " + header_file + " --")
    print("-----------------------")
    with open_utf8(temp_header, 'w+') as hfile:
        hfile.write("/*\n")
        hfile.write(amalgamation.write_file("LICENSE"))
        hfile.write("*/\n\n")
        hfile.write("#pragma once\n")
        for fpath in amalgamation.main_header_files:
            hfile.write(amalgamation.write_file(fpath))

def generate_duckdb_hpp(header_file):
    print("-----------------------")
    print("-- Writing " + header_file + " --")
    print("-----------------------")
    with open_utf8(temp_header, 'w+') as hfile:
        hfile.write("/*\n")
        hfile.write(write_file("LICENSE"))
        hfile.write("*/\n\n")
        hfile.write("#pragma once\n")
        hfile.write("#define DUCKDB_AMALGAMATION 1\n")
        hfile.write("#define DUCKDB_SOURCE_ID \"%s\"\n" % git_commit_hash())
        hfile.write("#define DUCKDB_VERSION \"%s\"\n" % git_dev_version())
        for fpath in main_header_files:
            hfile.write(write_file(fpath))

def create_tpcds_header(tpch_dir):
    result = """/* THIS FILE WAS AUTOMATICALLY GENERATED BY generate_csv_header.py */
#pragma once

const int TPCDS_QUERIES_COUNT = 99;
const int TPCDS_TABLE_COUNT = 24;

"""
    # write the queries
    result += write_dir(tpcds_queries, "TPCDS_QUERIES")
    result += write_dir(tpcds_answers_sf001, "TPCDS_ANSWERS_SF0_01")
    result += write_dir(tpcds_answers_sf1, "TPCDS_ANSWERS_SF1")

    with open_utf8(tpcds_header, 'w+') as f:
        f.write(result)

def generate_amalgamation(source_file, header_file):
    # construct duckdb.hpp from these headers
    generate_duckdb_hpp(header_file)

    # now construct duckdb.cpp
    print("------------------------")
    print("-- Writing " + source_file + " --")
    print("------------------------")

    # scan all the .cpp files
    with open_utf8(temp_source, 'w+') as sfile:
        header_file_name = header_file.split(os.sep)[-1]
        sfile.write('#include "' + header_file_name + '"\n\n')
        sfile.write("#ifndef DUCKDB_AMALGAMATION\n#error header mismatch\n#endif\n\n")

        for compile_dir in compile_directories:
            sfile.write(write_dir(compile_dir))

        # for windows we write file_system.cpp last
        # this is because it includes windows.h which contains a lot of #define statements that mess up the other code
        sfile.write(write_file(file_system_cpp, True))

        sfile.write('\n\n/*\n')
        license_idx = 0
        for license in licenses:
            sfile.write("\n\n\n### THIRD PARTY LICENSE #%s ###\n\n" % str(license_idx + 1))
            sfile.write(write_file(license))
            license_idx += 1
        sfile.write('\n\n*/\n')

    copy_if_different(temp_header, header_file)
    copy_if_different(temp_source, source_file)
    try:
        os.remove(temp_header)
        os.remove(temp_source)
    except:
        pass

def build_package(target_dir, linenumbers=False):
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)

    scripts_dir = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(scripts_dir)
    import amalgamation
    sys.path.append(os.path.join(scripts_dir, '..', 'extension', 'parquet'))
    import parquet_amalgamation

    prev_wd = os.getcwd()
    os.chdir(os.path.join(scripts_dir, '..'))

    # obtain the list of source files from the amalgamation
    source_list = amalgamation.list_sources()
    include_list = amalgamation.list_include_dirs()
    include_files = amalgamation.list_includes()

    def copy_file(src, target_dir):
        # get the path
        full_path = src.split(os.path.sep)
        current_path = target_dir
        for i in range(len(full_path) - 1):
            current_path = os.path.join(current_path, full_path[i])
            if not os.path.isdir(current_path):
                os.mkdir(current_path)
        target_name = full_path[-1]
        target_file = os.path.join(current_path, target_name)
        amalgamation.copy_if_different(src, target_file)

    # now do the same for the parquet extension
    parquet_include_directories = parquet_amalgamation.include_directories

    include_files += amalgamation.list_includes_files(parquet_include_directories)

    include_list += parquet_include_directories
    source_list += parquet_amalgamation.source_files

    for src in source_list:
        copy_file(src, target_dir)

    for inc in include_files:
        copy_file(inc, target_dir)

    # handle pragma_version.cpp: paste #define DUCKDB_SOURCE_ID there
    # read the source id
    proc = subprocess.Popen(['git', 'rev-parse', 'HEAD'],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            cwd=os.path.join(scripts_dir, '..'))
    githash = proc.stdout.read().strip().decode('utf8')
    # open the file and read the current contents
    fpath = os.path.join(target_dir, 'src', 'function', 'table', 'version', 'pragma_version.cpp')
    with open_utf8(fpath, 'r') as f:
        text = f.read()
    # now add the DUCKDB_SOURCE_ID define, if it is not there already
    found = False
    lines = text.split('\n')
    for i in range(len(lines)):
        if '#define DUCKDB_SOURCE_ID ' in lines[i]:
            lines[i] = '#define DUCKDB_SOURCE_ID "{}"'.format(githash)
            found = True
            break
    if not found:
        text = '#ifndef DUCKDB_SOURCE_ID\n#define DUCKDB_SOURCE_ID "{}"\n#endif\n'.format(githash) + text
    else:
        # rebuild the text from the (possibly modified) lines
        text = '\n'.join(lines)
    with open_utf8(fpath, 'w+') as f:
        f.write(text)

    def file_is_excluded(fname):
        for entry in excluded_objects:
            if entry in fname:
                return True
        return False

    def generate_unity_build(entries, idx, linenumbers):
        ub_file = os.path.join(target_dir, 'amalgamation-{}.cpp'.format(str(idx)))
        with open_utf8(ub_file, 'w+') as f:
            for entry in entries:
                if linenumbers:
                    f.write('#line 0 "{}"\n'.format(convert_backslashes(entry)))
                f.write('#include "{}"\n\n'.format(convert_backslashes(entry)))
        return ub_file

    def generate_unity_builds(source_list, nsplits, linenumbers):
        source_list.sort()
        files_per_split = len(source_list) / nsplits
        new_source_files = []
        current_files = []
        idx = 1
        for entry in source_list:
            if not entry.startswith('src'):
                new_source_files.append(os.path.join('duckdb', entry))
                continue

            current_files.append(entry)
            if len(current_files) > files_per_split:
                new_source_files.append(generate_unity_build(current_files, idx, linenumbers))
                current_files = []
                idx += 1
        if len(current_files) > 0:
            new_source_files.append(generate_unity_build(current_files, idx, linenumbers))
            current_files = []
            idx += 1
        return new_source_files

    original_sources = source_list
    source_list = generate_unity_builds(source_list, 8, linenumbers)

    os.chdir(prev_wd)
    return ([convert_backslashes(x) for x in source_list if not file_is_excluded(x)],
            [convert_backslashes(x) for x in include_list],
            [convert_backslashes(x) for x in original_sources])

def generate_amalgamation(source_file, header_file):
    def copy_if_different(src, dest):
        if os.path.isfile(dest):
            # dest exists, check if the files are different
            with open_utf8(src, 'r') as f:
                source_text = f.read()
            with open_utf8(dest, 'r') as f:
                dest_text = f.read()
            if source_text == dest_text:
                print("Skipping copy of " + src + ", identical copy already exists at " + dest)
                return
        print("Copying " + src + " to " + dest)
        shutil.copyfile(src, dest)

    # the header is unchanged
    copy_if_different('extension/parquet/include/parquet-extension.hpp', header_file)

    # now concat all the source/header files while removing known files
    out = open_utf8(temp_source, "w")
    out.write("// Parquet reader amalgamation\n\n#include \"%s\"\n" % os.path.basename(header_file))
    out.close()

    def myglob(path, pattern):
        wd = os.getcwd()
        os.chdir(path)
        files = glob.glob(pattern)
        os.chdir(wd)
        return [f.replace('\\', '/') for f in files]

    headers = ["parquet-extension.hpp"] + myglob("third_party/parquet", "*.h") + \
        myglob("third_party", "thrift/thrift/*.h") + \
        myglob("third_party", "thrift/thrift/**/*.h") + \
        ['protocol/TCompactProtocol.tcc'] + \
        myglob("third_party/snappy", "*.h") + \
        myglob("third_party/miniz", "*.hpp")

    def rewrite(file_in, file_out):
        # print(file_in)
        a_file = open_utf8(file_in, "r")
        out = open_utf8(file_out, "a")
        for line in a_file:
            if '#pragma once' in line:
                continue
            found = False
            for header in headers:
                if header in line:
                    found = True
                    break
            if found:
                out.write("// %s" % line)
            else:
                out.write(line)
        out.write("\n")
        out.close()

    # inline all the headers first
    def rewrite_prefix(prefix, files):
        for f in files:
            rewrite("%s/%s" % (prefix, f), temp_source)

    # the local and overall order of these rewrites matters.
    rewrite_prefix('third_party/thrift/thrift', [
        'transport/PlatformSocket.h', 'config.h', 'thrift-config.h', 'Thrift.h', 'TLogging.h',
        'transport/TTransportException.h', 'transport/TTransport.h', 'protocol/TProtocolException.h',
        'protocol/TProtocol.h', 'protocol/TVirtualProtocol.h', 'protocol/TCompactProtocol.h',
        'protocol/TCompactProtocol.tcc', 'transport/TVirtualTransport.h', 'transport/TBufferTransports.h',
        'TBase.h', 'TToString.h', 'protocol/TProtocol.cpp', 'transport/TTransportException.cpp',
        'transport/TBufferTransports.cpp'
    ])

    rewrite_prefix('third_party/parquet', [
        'windows_compatibility.h', 'parquet_types.h', 'parquet_constants.h', 'parquet_types.cpp',
        'parquet_constants.cpp'
    ])

    rewrite_prefix('third_party/snappy', ['snappy-stubs-public.h', 'snappy.h'])
    rewrite_prefix('third_party/miniz', ['miniz.hpp'])
    # miniz.cpp is already in duckdb.cpp
    rewrite('third_party/utf8proc/include/utf8proc_wrapper.hpp', temp_source)

    # 'main'
    rewrite('extension/parquet/parquet-extension.cpp', temp_source)

    # snappy last because tons of #defines
    rewrite_prefix('third_party/snappy', [
        'snappy-stubs-internal.h', 'snappy-internal.h', 'snappy-sinksource.h', 'snappy-stubs-internal.cc',
        'snappy-sinksource.cc', 'snappy.cc'
    ])

    copy_if_different(temp_source, source_file)

GENERATED_HEADER = 'include/tpce_generated.hpp'
GENERATED_SOURCE = 'tpce_generated.cpp'

TPCE_DIR = os.path.join('third_party', 'tpce-tool')
GENERATED_HEADER = os.path.join(TPCE_DIR, GENERATED_HEADER)
GENERATED_SOURCE = os.path.join(TPCE_DIR, GENERATED_SOURCE)

current_table = None
tables = {}

print(GENERATED_HEADER)
print(GENERATED_SOURCE)

header = open_utf8(GENERATED_HEADER, 'w+')
source = open_utf8(GENERATED_SOURCE, 'w+')

for fp in [header, source]:
    fp.write("""
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
// THIS FILE IS GENERATED BY gentpcecode.py, DO NOT EDIT MANUALLY //
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////

""")

header.write("""
#include "duckdb/catalog/catalog.hpp"
#include "duckdb/main/appender.hpp"
def get_formatted_text(f, full_path, directory, ext):
    if not can_format_file(full_path):
        print("Eek, cannot format file " + full_path + " but attempted to format anyway")
        exit(1)
    if f == 'list.hpp':
        # fill in list file
        file_list = [
            os.path.join(dp, f) for dp, dn, filenames in os.walk(directory) for f in filenames
            if os.path.splitext(f)[1] == '.hpp' and not f.endswith("list.hpp")
        ]
        file_list = [x.replace('src/include/', '') for x in file_list]
        file_list.sort()
        result = ""
        for x in file_list:
            result += '#include "%s"\n' % (x)
        return result

    if ext == ".hpp" and directory.startswith("src/include"):
        f = open_utf8(full_path, 'r')
        lines = f.readlines()
        f.close()

        # format header in files
        header_middle = "// " + os.path.relpath(full_path, base_dir) + "\n"
        text = header_top + header_middle + header_bottom
        is_old_header = True
        for line in lines:
            if not (line.startswith("//") or line.startswith("\n")) and is_old_header:
                is_old_header = False
            if not is_old_header:
                text += line

    if ext == '.test' or ext == '.test_slow':
        f = open_utf8(full_path, 'r')
        lines = f.readlines()
        f.close()

        found_name = False
        found_group = False
        group_name = full_path.split('/')[-2]
        new_path_line = '# name: ' + full_path + '\n'
        new_group_line = '# group: [' + group_name + ']' + '\n'
        found_diff = False
        for i in range(0, len(lines)):
            line = lines[i]
            if line.startswith('# name: ') or line.startswith('#name: '):
                if found_name:
                    print("Error formatting file " + full_path + ", multiple lines starting with # name found")
                    exit(1)
                found_name = True
                if lines[i] != new_path_line:
                    lines[i] = new_path_line
            if line.startswith('# group: ') or line.startswith('#group: '):
                if found_group:
                    print("Error formatting file " + full_path + ", multiple lines starting with # group found")
                    exit(1)
                found_group = True
                if lines[i] != new_group_line:
                    lines[i] = new_group_line
        if not found_group:
            lines = [new_group_line] + lines
        if not found_name:
            lines = [new_path_line] + lines
        return ''.join(lines)

    proc_command = format_commands[ext].split(' ') + [full_path]
    proc = subprocess.Popen(proc_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    new_text = proc.stdout.read().decode('utf8')
    stderr = proc.stderr.read().decode('utf8')
    if len(stderr) > 0:
        print(os.getcwd())
        print("Failed to format file " + full_path)
        print(' '.join(proc_command))
        print(stderr)
        exit(1)
    return new_text

def get_formatted_text(f, full_path, directory, ext):
    if not can_format_file(full_path):
        print("Eek, cannot format file " + full_path + " but attempted to format anyway")
        exit(1)
    if f == 'list.hpp':
        # fill in list file
        file_list = [
            os.path.join(dp, f) for dp, dn, filenames in os.walk(directory) for f in filenames
            if os.path.splitext(f)[1] == '.hpp' and not f.endswith("list.hpp")
        ]
        file_list = [x.replace('src/include/', '') for x in file_list]
        file_list.sort()
        result = ""
        for x in file_list:
            result += '#include "%s"\n' % (x)
        return result

    if ext == ".hpp" and directory.startswith("src/include"):
        with open_utf8(full_path, 'r') as f:
            lines = f.readlines()

        # format header in files
        header_middle = "// " + os.path.relpath(full_path, base_dir) + "\n"
        text = header_top + header_middle + header_bottom
        is_old_header = True
        for line in lines:
            if not (line.startswith("//") or line.startswith("\n")) and is_old_header:
                is_old_header = False
            if not is_old_header:
                text += line

    if ext == '.test' or ext == '.test_slow' or ext == '.test_coverage' or ext == '.benchmark':
        f = open_utf8(full_path, 'r')
        lines = f.readlines()
        f.close()

        found_name = False
        found_group = False
        group_name = full_path.split('/')[-2]
        new_path_line = '# name: ' + full_path + '\n'
        new_group_line = '# group: [' + group_name + ']' + '\n'
        found_diff = False

        # Find description.
        found_description = False
        for line in lines:
            if line.lower().startswith('# description:') or line.lower().startswith('#description:'):
                if found_description:
                    print("Error formatting file " + full_path + ", multiple lines starting with # description found")
                    exit(1)
                found_description = True
                new_description_line = '# description: ' + line.split(':', 1)[1].strip() + '\n'

        # Filter old meta.
        meta = ['#name:', '# name:', '#description:', '# description:', '#group:', '# group:']
        lines = [line for line in lines if not any(line.lower().startswith(m) for m in meta)]

        # Clean up empty leading lines.
        while lines and not lines[0].strip():
            lines.pop(0)

        # Ensure header is prepended.
        header = [new_path_line]
        if found_description:
            header.append(new_description_line)
        header.append(new_group_line)
        header.append('\n')

        return ''.join(header + lines)

    proc_command = format_commands[ext].split(' ') + [full_path]
    proc = subprocess.Popen(proc_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    new_text = proc.stdout.read().decode('utf8')
    stderr = proc.stderr.read().decode('utf8')
    if len(stderr) > 0:
        print(os.getcwd())
        print("Failed to format file " + full_path)
        print(' '.join(proc_command))
        print(stderr)
        exit(1)
    return new_text.replace('\r', '')

def replace_in_file(fname, regex, replace):
    with open_utf8(fname, 'r') as f:
        contents = f.read()
    contents = re.sub(regex, replace, contents)
    with open_utf8(fname, 'w+') as f:
        f.write(contents)

with open(input, 'r') as f:
    text = f.read()

new_text = '{ "result"' + text.split('{ "result"')[tree_index + 1]

input += '.tmp'
with open(input, 'w+') as f:
    f.write(new_text)

duckdb_query_graph.generate(input, output)

with open(output, 'r') as f:
    text = f.read()

# inline javascript files
javascript_base = os.path.join('tools', 'pythonpkg', 'duckdb_query_graph')
with open(os.path.join(javascript_base, 'raphael.js'), 'r') as f:
    raphael = f.read()
with open(os.path.join(javascript_base, 'treant.js'), 'r') as f:
    treant = f.read()

text = text.replace('<script src="../../raphael.js"></script>', '<script>' + raphael + '</script>')
text = text.replace('<script src="../../treant.js"></script>', '<script>' + treant + '</script>')

with open_utf8(output, 'w+') as f:
    f.write(text)

if open_output:
    os.system('open "' + output.replace('"', '\\"') + '"')

def build_package(target_dir, extensions, linenumbers=False):
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)

    scripts_dir = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(scripts_dir)
    import amalgamation

    prev_wd = os.getcwd()
    os.chdir(os.path.join(scripts_dir, '..'))

    # obtain the list of source files from the amalgamation
    source_list = amalgamation.list_sources()
    include_list = amalgamation.list_include_dirs()
    include_files = amalgamation.list_includes()

    def copy_file(src, target_dir):
        # get the path
        full_path = src.split(os.path.sep)
        current_path = target_dir
        for i in range(len(full_path) - 1):
            current_path = os.path.join(current_path, full_path[i])
            if not os.path.isdir(current_path):
                os.mkdir(current_path)
        target_name = full_path[-1]
        target_file = os.path.join(current_path, target_name)
        amalgamation.copy_if_different(src, target_file)

    # include the main extension helper
    include_files += [os.path.join('extension', 'extension_helper.hpp')]
    # include the separate extensions
    for ext in extensions:
        ext_path = os.path.join(scripts_dir, '..', 'extension', ext)
        include_package(ext, ext_path, include_files, include_list, source_list)

    for src in source_list:
        copy_file(src, target_dir)

    for inc in include_files:
        copy_file(inc, target_dir)

    # handle pragma_version.cpp: paste #define DUCKDB_SOURCE_ID and DUCKDB_VERSION there
    curdir = os.getcwd()
    os.chdir(os.path.join(scripts_dir, '..'))
    githash = git_commit_hash()
    dev_version = git_dev_version()
    os.chdir(curdir)
    # open the file and read the current contents
    fpath = os.path.join(target_dir, 'src', 'function', 'table', 'version', 'pragma_version.cpp')
    with open_utf8(fpath, 'r') as f:
        text = f.read()
    # now add the DUCKDB_SOURCE_ID define, if it is not there already
    found_hash = False
    found_dev = False
    lines = text.split('\n')
    for i in range(len(lines)):
        if '#define DUCKDB_SOURCE_ID ' in lines[i]:
            lines[i] = '#define DUCKDB_SOURCE_ID "{}"'.format(githash)
            found_hash = True
            break
        if '#define DUCKDB_VERSION ' in lines[i]:
            lines[i] = '#define DUCKDB_VERSION "{}"'.format(dev_version)
            found_dev = True
            break
    if not found_hash:
        lines = [
            '#ifndef DUCKDB_SOURCE_ID', '#define DUCKDB_SOURCE_ID "{}"'.format(githash), '#endif'
        ] + lines
    if not found_dev:
        lines = [
            '#ifndef DUCKDB_VERSION', '#define DUCKDB_VERSION "{}"'.format(dev_version), '#endif'
        ] + lines
    text = '\n'.join(lines)
    with open_utf8(fpath, 'w+') as f:
        f.write(text)

    def file_is_excluded(fname):
        for entry in excluded_objects:
            if entry in fname:
                return True
        return False

    def generate_unity_build(entries, idx, linenumbers):
        ub_file = os.path.join(target_dir, 'amalgamation-{}.cpp'.format(str(idx)))
        with open_utf8(ub_file, 'w+') as f:
            for entry in entries:
                if linenumbers:
                    f.write('#line 0 "{}"\n'.format(convert_backslashes(entry)))
                f.write('#include "{}"\n\n'.format(convert_backslashes(entry)))
        return ub_file

    def generate_unity_builds(source_list, nsplits, linenumbers):
        source_list.sort()
        files_per_split = len(source_list) / nsplits
        new_source_files = []
        current_files = []
        idx = 1
        for entry in source_list:
            if not entry.startswith('src'):
                new_source_files.append(os.path.join('duckdb', entry))
                continue

            current_files.append(entry)
            if len(current_files) > files_per_split:
                new_source_files.append(generate_unity_build(current_files, idx, linenumbers))
                current_files = []
                idx += 1
        if len(current_files) > 0:
            new_source_files.append(generate_unity_build(current_files, idx, linenumbers))
            current_files = []
            idx += 1
        return new_source_files

    original_sources = source_list
    source_list = generate_unity_builds(source_list, 8, linenumbers)

    os.chdir(prev_wd)
    return ([convert_backslashes(x) for x in source_list if not file_is_excluded(x)],
            [convert_backslashes(x) for x in include_list],
            [convert_backslashes(x) for x in original_sources])

def generate_amalgamation_splits(source_file, header_file, nsplits):
    # construct duckdb.hpp from these headers
    generate_duckdb_hpp(header_file)

    # gather all files to read and write
    source_files = []
    header_files = []
    for compile_dir in compile_directories:
        if compile_dir != src_dir:
            continue
        gather_files(compile_dir, source_files, header_files)

    # for windows we write file_system.cpp last
    # this is because it includes windows.h which contains a lot of #define statements that mess up the other code
    source_files.append(write_file(os.path.join('src', 'common', 'file_system.cpp'), True))

    # write duckdb-internal.hpp
    if '.hpp' in header_file:
        internal_header_file = header_file.replace('.hpp', '-internal.hpp')
    elif '.h' in header_file:
        internal_header_file = header_file.replace('.h', '-internal.h')
    else:
        raise Exception("Unknown extension of header file")

    temp_internal_header = internal_header_file + '.tmp'
    with open_utf8(temp_internal_header, 'w+') as f:
        write_license(f)
        for hfile in header_files:
            f.write(hfile)

    # count the total amount of bytes in the source files
    total_bytes = 0
    for sfile in source_files:
        total_bytes += len(sfile)

    # now write the individual splits
    # we approximate the splitting up by making every file have roughly the same amount of bytes
    split_bytes = total_bytes / nsplits
    current_bytes = 0
    partitions = []
    partition_names = []
    current_partition = []
    current_partition_idx = 1
    for sfile in source_files:
        current_partition.append(sfile)
        current_bytes += len(sfile)
        if current_bytes >= split_bytes:
            partition_names.append(str(current_partition_idx))
            partitions.append(current_partition)
            current_partition = []
            current_bytes = 0
            current_partition_idx += 1
    if len(current_partition) > 0:
        partition_names.append(str(current_partition_idx))
        partitions.append(current_partition)
        current_partition = []
        current_bytes = 0

    # generate partitions from the third party libraries
    for compile_dir in compile_directories:
        if compile_dir != src_dir:
            partition_names.append(compile_dir.split(os.sep)[-1])
            partitions.append(write_dir(compile_dir))

    header_file_name = header_file.split(os.sep)[-1]
    internal_header_file_name = internal_header_file.split(os.sep)[-1]

    partition_fnames = []
    current_partition = 0
    for partition in partitions:
        partition_name = source_file.replace('.cpp', '-%s.cpp' % (partition_names[current_partition], ))
        temp_partition_name = partition_name + '.tmp'
        partition_fnames.append([partition_name, temp_partition_name])
        with open_utf8(temp_partition_name, 'w+') as f:
            write_license(f)
            f.write('#include "%s"\n#include "%s"' % (header_file_name, internal_header_file_name))
            f.write('''
#ifndef DUCKDB_AMALGAMATION
#error header mismatch
#endif
''')
            for sfile in partition:
                f.write(sfile)
        current_partition += 1

    copy_if_different(temp_header, header_file)
    copy_if_different(temp_internal_header, internal_header_file)
    try:
        os.remove(temp_header)
        os.remove(temp_internal_header)
    except:
        pass
    for p in partition_fnames:
        copy_if_different(p[1], p[0])
        try:
            os.remove(p[1])
        except:
            pass

def format_file(f, full_path, directory, ext, sort_includes):
    if not os.path.isfile(full_path):
        return
    if f == 'list.hpp':
        # fill in list file
        list = [
            os.path.join(dp, f) for dp, dn, filenames in os.walk(directory) for f in filenames
            if os.path.splitext(f)[1] == '.hpp' and not f.endswith("list.hpp")
        ]
        list = [x.replace('src/include/', '') for x in list]
        list.sort()
        with open_utf8(full_path, "w") as file:
            for x in list:
                file.write('#include "%s"\n' % (x))
    elif ext == ".hpp" and directory.startswith("src/include"):
        # format header in files
        header_middle = "// " + os.path.relpath(full_path, base_dir) + "\n"
        file = open_utf8(full_path, "r")
        lines = file.readlines()
        file.close()

        file = open_utf8(full_path, "w")
        file.write(header_top + header_middle + header_bottom)
        is_old_header = True
        for line in lines:
            if not (line.startswith("//") or line.startswith("\n")) and is_old_header:
                is_old_header = False
            if not is_old_header:
                file.write(line)
        file.close()
    elif ext == ".txt" and f != 'CMakeLists.txt':
        return
    elif ext == '.test' or ext == '.test_slow':
        try:
            with open_utf8(full_path, "r") as file_:
                lines = file_.readlines()
        except:
            return
        found_name = False
        found_group = False
        group_name = full_path.split('/')[-2]
        new_path_line = '# name: ' + full_path + '\n'
        new_group_line = '# group: [' + group_name + ']' + '\n'
        found_diff = False
        for i in range(0, len(lines)):
            line = lines[i]
            if line.startswith('# name: ') or line.startswith('#name: '):
                if found_name:
                    print("Error formatting file " + full_path + ", multiple lines starting with # name found")
                    exit(1)
                found_name = True
                if lines[i] != new_path_line:
                    found_diff = True
                    lines[i] = new_path_line
            if line.startswith('# group: ') or line.startswith('#group: '):
                if found_group:
                    print("Error formatting file " + full_path + ", multiple lines starting with # group found")
                    exit(1)
                found_group = True
                if lines[i] != new_group_line:
                    found_diff = True
                    lines[i] = new_group_line
        if not found_group:
            lines = [new_group_line] + lines
            found_diff = True
        if not found_name:
            lines = [new_path_line] + lines
            found_diff = True
        if found_diff:
            print(full_path)
            print(new_path_line)
            print(new_group_line)
            with open_utf8(full_path, "w+") as file_:
                file_.write(''.join(lines))
        return
    format_command = format_commands[ext]
    cmd = format_command.replace("${FILE}", full_path).replace("${SORT_INCLUDES}", "1" if sort_includes else "0")
    print(cmd)
    os.system(cmd)
    # remove empty lines at beginning and end of file
    with open_utf8(full_path, 'r') as fp:
        text = fp.read()
    text = text.strip() + "\n"
    with open_utf8(full_path, 'w+') as fp:
        fp.write(text)

namespace duckdb_libpgquery {
#define PG_KEYWORD(a,b,c) {a,b,c},

const PGScanKeyword ScanKeywords[] = {
"""

for tpl in kwlist:
    kwtext += 'PG_KEYWORD("%s", %s, %s)\n' % (strip_p(tpl[0]).lower(), tpl[0], tpl[1])

kwtext += """
};

const int NumScanKeywords = lengthof(ScanKeywords);

} // namespace duckdb_libpgquery
"""

with open_utf8(kwlist_header, 'w+') as f:
    f.write(kwtext)

# generate the final main.y.tmp file
# first read the template file
with open_utf8(template_file, 'r') as f:
    text = f.read()

# now perform a series of replacements in the file to construct the final yacc file


def get_file_contents(fpath, add_line_numbers=False):
    with open_utf8(fpath, 'r') as f:
        result = f.read()
    if add_line_numbers:
        return '#line 1 "%s"\n' % (fpath, ) + result

flex_file_path = os.path.join(pg_path, 'scan.l')
target_file = os.path.join(pg_path, 'src_backend_parser_scan.cpp')

proc = subprocess.Popen([flex_bin, '--nounistd', '-o', target_file, flex_file_path],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
stdout = proc.stdout.read().decode('utf8')
stderr = proc.stderr.read().decode('utf8')
if proc.returncode != None or len(stderr) > 0:
    print("Flex failed")
    print("stdout: ", stdout)
    print("stderr: ", stderr)
    exit(1)

with open_utf8(target_file, 'r') as f:
    text = f.read()

# add the libpg_query namespace
text = text.replace(
    '''
#ifndef FLEXINT_H
#define FLEXINT_H
''', '''
#ifndef FLEXINT_H
#define FLEXINT_H
namespace duckdb_libpgquery {
''')

text = text.replace('register ', '')

text = text + "\n} /* duckdb_libpgquery */\n"

def read_list_from_file(fname):
    with open_utf8(fname, 'r') as f:
        return f.read().split('\n')

try_remove_file(gen_storage_target)
try_remove_file(gen_storage_target + '.wal')


def run_command_in_shell(cmd):
    print(cmd)
    res = subprocess.run([shell_proc, '--batch', '-init', '/dev/null', gen_storage_target],
                         capture_output=True,
                         input=bytearray(cmd, 'utf8'))
    stdout = res.stdout.decode('utf8').strip()
    stderr = res.stderr.decode('utf8').strip()
    if res.returncode != 0:
        print("Failed to create database file!")
        print("----STDOUT----")
        print(stdout)
        print("----STDERR----")
        print(stderr)


with open_utf8(gen_storage_script, 'r') as f:
    cmd = f.read()

run_command_in_shell(cmd)

# FIXME: force a checkpoint
run_command_in_shell('select * from integral_values')
run_command_in_shell('select * from integral_values')

try_remove_file(gen_storage_target + '.wal')