def check_for_duplicates(paths, hash=hashlib.sha1, error_log=log):
    """Report duplicate files found under *paths* in an Excel workbook.

    Every file returned by ``dl.dir_list`` for each entry of *paths* is
    digested with *hash*.  Two files are treated as duplicates when both
    their digest and their size on disk match.  The first file seen with a
    given (digest, size) pair becomes the reference; every later match is
    written as one worksheet row:

    * columns 1-3: "Open" hyperlink, file name and full path of the later file
    * columns 5-7: the same three values for the reference file

    The workbook is saved as ``<wb_rev>_<wb_save_name>_<timestamp>.xlsx``
    inside the hard-coded ``wb_save_path``.

    Args:
        paths: Iterable of root directories handed to ``dl.dir_list``.
        hash: Zero-argument factory returning an object with ``update()``
            and ``digest()`` (defaults to ``hashlib.sha1``).
        error_log: Object exposing ``get_logger(name)``; the logger it
            returns receives any failure raised while writing a row.

    Returns:
        None.  The result is the saved workbook.
    """
    exclude = [
        "00_Archive", "00_archive", "01_Archive", "01_archive",
        "00_Document_templates", "01_Deleted lines", "02_Red_Corex",
        "03_Additional_Workfiles", "SKID", "Bare",
    ]
    wb_save_path = Path("N:\\DGAVRIC\\_ITTER")
    wb_save_name = "duplicated_files_proba"
    wb_rev = "02"  # workbook revision
    log_file = "error_logfile"
    time_now = datetime.now().strftime("%Y-%m-%d_%H%M%S")

    wb = Workbook()
    ws = wb.active
    r = 1  # next worksheet row to fill
    logger = error_log.get_logger(f"{wb_save_name}_{log_file}")

    hashes = {}  # (digest, size) -> first file seen with that identity
    for path in paths:
        # NOTE(review): the keyword names used here (typ/lookup/extension)
        # must match the dir_list helper that is in scope - confirm.
        fn_list = dl.dir_list(path, typ="f", lookup="*", extension="*",
                              exclude=exclude)
        for file in fn_list:
            hashobj = hash()
            # Close the handle as soon as the file is digested; the previous
            # bare open() left one descriptor open per scanned file.
            with open(file, "rb") as stream:
                for chunk in chunk_reader(stream):
                    hashobj.update(chunk)
            file_id = (hashobj.digest(), os.path.getsize(file))

            duplicate = hashes.get(file_id)
            if duplicate is None:
                hashes[file_id] = file
                continue

            # Best effort: a row that cannot be written is logged and the
            # scan carries on with the next file.
            try:
                ws.cell(r, 1, f'=HYPERLINK("{file}","Open")')
                ws.cell(r, 2, file.name)
                ws.cell(r, 3, str(file))  # file path
                ws.cell(r, 5, f'=HYPERLINK("{duplicate}","Open")')
                ws.cell(r, 6, duplicate.name)
                ws.cell(r, 7, str(duplicate))  # file path
                r += 1
            except Exception as error:
                logger.exception(f"{wb_save_name}_{file} --> {error}")

    wb.save(
        Path.joinpath(wb_save_path,
                      f"{wb_rev}_{wb_save_name}_{time_now}.xlsx"))
    wb.close()
from datetime import datetime
from pathlib import Path

from openpyxl import Workbook
from openpyxl.utils.cell import column_index_from_string, get_column_letter

import dir_list_r01 as dl

# Root folder handed to dl.dir_list below.
root_path = Path(
    "J:\\32_IZ224_SIEMENS_Herne\\60_Construction\\20_Sx_Working\\50_Workfiles")
# Names passed to dl.dir_list through its `exclude` argument.
exclude = [
    "00_Archive", "01_Archive", "00_Document_templates", "01_Deleted lines",
    "SKID"
]
# Collect the entries whose name contains two digits followed by "BR".
# NOTE(review): `re` is used here but no `import re` is visible in this
# excerpt - confirm it is imported elsewhere in the file.
fn_list = list()
pattern = r"(\d\d)(BR)"
for i in dl.dir_list(root_path, obj_type="d", exclude=exclude):
    if re.search(pattern, i.name):
        fn_list.append(i)


def buy(**kwargs):
    """Write one cell per keyword argument into row 1 of a new workbook.

    Each keyword value is indexed as ``value[0]`` (a column letter, converted
    with ``column_index_from_string``) and ``value[1]`` (the cell content).
    The keyword names themselves are not used.  In the visible part of the
    function the workbook is neither saved nor returned.
    """
    time_now = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    # wb_save_path = Path("D:\\00_HERNE\\_tracking")
    # wb_save_name = "_wf_list"
    wb = Workbook()  # workbook
    ws = wb.active  # active worksheet of the workbook
    r = 1  # initial row number
    for name, value in kwargs.items():
        # NOTE(review): `{value[1]}` is a set literal holding value[1], not
        # value[1] itself - confirm this is intended as the cell value.
        ws.cell(r, column_index_from_string(value[0]), {value[1]})  # system
time_now = datetime.now().strftime( "%Y-%m-%d_%H%M%S") # date/time in format as (Y-m-d_HMS) # root_path = Path("J:\\32_IZ224_SIEMENS_Herne\\60_Construction\\10_Sx_Input\\30_Sx_Project_Documentation\\10_Mechanical_Engineering_Project\\50_H&S_drawings") # main path root_path = Path( "J:\\32_IZ224_SIEMENS_Herne\\60_Construction\\20_Sx_Working\\50_Workfiles\\01_Deleted lines" ) exclude_dir = [ "00_Archive", "00_archive", "01_Archive", "01_archive", "00_Document_templates", "SKID", "02_Red_Corex", "03_Additional_Workfiles", "Deleted" ] # excluded folders (these are skipped) dlist = dl.dir_list(root_path, obj_type="f", src_for="60*BR*", ext="pdf", exclude=exclude_dir) # list of required files wb_save_path = Path("D:\\00_HERNE\\_tracking\\") # workbook save path wb_file_name = "deleted_lines_pdf_parsed_support_list" # workbook save filename wb_rev = "00" # workbook revision wb = Workbook() # workbook ws = wb.active # workbook sheet activate r = 1 # initial row number log_file = "error_logfile" log = log.get_logger(f"{wb_file_name}_{log_file}") for file in dlist: try:
from dir_list_r01 import dir_list
from pathlib import Path
from pdf_parser import parse, re_split, parse2, get_pdf_content_lines, parse3, parse4, parse5
import os

# Scratch script: lists the PDF files below main_dir twice, compares the two
# listings element by element and prints a membership check.
main_dir = Path("D:/_test_ground/_zeran")

# File stems with their last three characters removed.
ref_ls = [pdf.stem[:-3] for pdf in dir_list(main_dir, extension="pdf")]
con_ls = [pdf.stem[:-3] for pdf in dir_list(main_dir, extension="pdf")]
# latest = max(ref_ls, key=os.path.getctime)

# True when both listings agree pairwise (up to the shorter one).
test = all(left == right for left, right in zip(ref_ls, con_ls))
# print(test)

print(any(stem in ref_ls for stem in ref_ls))

file = Path(r"D:\_test_ground\_zeran\01_LBA_01_LB-HP\2018-07-25\Z214LBA25BR010_00.pdf")
# file = Path(r"D:\_test_ground\_zeran\01_LBA_01_LB-HP\2018-07-25\Z214LBA10BR010_00.pdf")

delimiters = (" ", "\n")

# print(parse(file))
# print(list(filter(None, re_split(delimiters, parse(file), maxsplit=0))))
# print(re_split(delimiters, parse(file), maxsplit=0))
# print(parse2(file))
# print(searchInPDF(file))
# Script excerpt: sets up a workbook and a logger, then starts writing an
# "Open" hyperlink for every listed PDF whose path does not contain "redmark".
time_now = datetime.now().strftime("%Y-%m-%d_%H%M%S")
root_path = Path("D:\\00_PRJS\\ITER\\08_Ax_Tender_Documentation")
# Disabled alternative roots:
# root_path = Path("J:\\32_IZ224_SIEMENS_Herne\\60_Construction\\20_Sx_Working\\50_Workfiles")
# root_path = Path("J:\\32_IZ224_SIEMENS_Herne\\60_Construction\\10_Sx_Input\\30_Sx_Project_Documentation\\10_Mechanical_Engineering_Project\\50_H&S_drawings")
# Names passed to dl.dir_list through its `exclude` argument.
exclude = ["00_Archive", "00_archive", "01_Archive", "01_archive",
           "00_Document_templates", "02_Red_Corex", "03_Additional_Workfiles",
           "SKID", "Deleted", "Bare"]
# ls = dl.dir_list(root_path, ext="pdf", exclude=exclude)  # list of valves
# The bare string literal below has no effect at runtime; it holds a disabled
# name-pattern filter.
""" # fn_list = list() pattern = r"(\d\d)(BQ)" for i in dl.dir_list(root_path, exclude=exclude): if re.search(pattern, i.name): fn_list.append(i) """
# NOTE(review): the exact matching rules of src_for/ext are defined by the
# dir_list helper, which is not visible here - confirm against that module.
fn_list = dl.dir_list(root_path, obj_type="f", src_for="*_*_*_*_*_V*.*.*",
                      ext="pdf", exclude=exclude)
wb_save_path = Path("D:\\00_PRJS\\ITER")
wb_save_name = "procedure_list"
wb_rev = "00"  # workbook revision
wb = Workbook()  # workbook
ws = wb.active  # active worksheet of the workbook
r = 1  # initial row number
# patt = r"(\d{9})"
log_file = "error_logfile"
# From here on the name `log` refers to the logger returned by get_logger,
# no longer to the object that provided get_logger.
log = log.get_logger(f"{wb_save_name}_{log_file}")
for file in fn_list:
    # Skip every path whose string form contains "redmark".
    if "redmark" not in str(file):
        try:
            ws.cell(r, 1, f'=HYPERLINK("{file}","Open")')