import os
import sys
import re
import sqlite3
import globals
from comms_db import CommsDBTable

# Report script: for every directory-name fragment on the skip lists in
# ``globals``, print the fragment and the number of records in the FILTERED
# database whose path contains it. Fragments are used exactly as configured
# (they are not lower-cased here).
path_frag_filter = globals.SKIP_DIR_NAME_FRAGS + globals.SKIP_DIR_NAMES

sqlite_file = '/Users/ostwald/Documents/Comms/FILTERED.sqlite'
db = CommsDBTable(sqlite_file)

for frag in path_frag_filter:
    where_clause = "WHERE path like '%{}%'".format(frag)
    recs = db.select('*', where_clause)
    # One line per fragment: "<fragment> <number of matching records>".
    print(' '.join((frag, str(len(recs)))))
class Reaper(DupManager):
    """
    Works through the duplicate sets loaded by DupManager and removes the
    redundant records from a sqlite database wrapped by CommsDBTable.

    The methods rely on ``self.dup_map`` (key -> list of paths) and
    ``self.find_disk_1_dups()``; neither is defined in this class, so they
    are presumably supplied by DupManager - verify against that class.
    """

    # Captures the disc number in paths such as '.../CIC-ExternalDisk1/disc 12/...'
    _DISC_NUM_PAT = re.compile("CIC-ExternalDisk1/disc ([0-9]+)")

    def __init__(self, dup_data_path, sqlite_file):
        """
        :param dup_data_path: handed unchanged to DupManager.__init__
        :param sqlite_file: database file the records are deleted from
        """
        DupManager.__init__(self, dup_data_path)
        self.db = CommsDBTable(sqlite_file)

    @staticmethod
    def _sql_quote(value):
        """Double the single quotes in value so it can sit inside a '...' SQL literal."""
        return value.replace("'", "''")

    def delete_record(self, path):
        """
        Delete the record(s) whose path equals the given path.

        Failures are reported on stdout and otherwise ignored (best effort),
        so one bad record does not stop a de-dup run.
        """
        try:
            self.db.delete_record(
                "path = '{}'".format(self._sql_quote(path)))
        except Exception:
            print('delete_record ERROR: {}'.format(sys.exc_info()))

    def select_record(self, path):
        """
        :param path: exact path to look up
        :return: list of all rows (every column) whose path equals the given path
        """
        conn = sqlite3.connect(self.db.sqlite_file)
        try:
            c = conn.cursor()
            query = "SELECT * from {} WHERE path = '{}'".format(
                self.db.table_name, self._sql_quote(path))
            c.execute(query)
            return c.fetchall()
        finally:
            # the connection is private to this call; always release it
            conn.close()

    def filter_by_paths(self):
        """
        using the paths defined in globals, delete all images from relevant paths

        this is only necessary if global paths have changed since database was created

        The match is case-insensitive: fragments are lower-cased and compared
        against LOWER(path). Prints the generated clause and the number of
        records affected.
        """
        frags = [
            self._sql_quote(x.lower())
            for x in globals.SKIP_DIR_NAME_FRAGS + globals.SKIP_DIR_NAMES
        ]
        if not frags:
            # an empty "()" clause is invalid SQL; nothing to filter on
            print('records affected: 0')
            return

        clause = "(" + " OR ".join(
            ["LOWER(path) like '%{}%'".format(f) for f in frags]) + ")"
        print(clause)

        conn = sqlite3.connect(self.db.sqlite_file)
        try:
            c = conn.cursor()
            query = "DELETE from {} WHERE {}".format(self.db.table_name, clause)
            c.execute(query)
            print('records affected: {}'.format(conn.total_changes))
            conn.commit()
        finally:
            conn.close()

    def dedup_1(self):
        """
        Dry run: for each dup set returned by find_disk_1_dups, report which
        paths would be kept and which would be deleted. Nothing is deleted.

        (Original author's note: these dup sets all have at least one
        ExternalDisk1 version.)

        Keepers are chosen by the first non-empty group, in this order:
        - paths matching 'CIC-ExternalDisk1/disc N'
        - other paths containing 'CIC-ExternalDisk1'
        - everything else
        Ignorable paths (see is_ignorable) are never keepers, so they always
        appear in the 'To Delete' list.
        """
        dup_sets = self.find_disk_1_dups()
        disc_pat = re.compile("CIC-ExternalDisk1/disc [0-9]*")

        for key in dup_sets:
            disc_pat_keepers = []  # these match disc_pat
            other_keepers = []     # these just have CIC-ExternalDisk1
            others = []

            # print all in the set
            print('\n{}'.format(key))
            for path in self.dup_map[key]:
                print('- {}'.format(path))
                if self.is_ignorable(path):
                    continue
                if disc_pat.search(path):
                    disc_pat_keepers.append(path)
                elif 'CIC-ExternalDisk1' in path:
                    other_keepers.append(path)
                else:
                    others.append(path)

            to_keep = None
            if disc_pat_keepers:
                to_keep = disc_pat_keepers
                print("\n disc_pat_keepers")
            elif other_keepers:
                to_keep = other_keepers
                print("\n other keepers")
            elif others:
                to_keep = others
                print("\n others")
            else:
                print('\n WARN: No Keepers found for {}'.format(key))

            if to_keep is not None:
                for p in to_keep:
                    print(' '.join((' -', p)))
                print('\n To Delete')
                for path in self.dup_map[key]:
                    if path not in to_keep:
                        # report only; deletion is intentionally not performed here
                        print(' '.join((' x', path)))

    def dedup_2(self):
        """
        We are looking for chances to delete non-CIC-ExternalDisk1 dups.

        So if there is a CIC-ExternalDisk1, then delete all the
        non-CIC-ExternalDisk1 dups in the dupset

        Unlike dedup_1 this really deletes records (via delete_record).
        """
        dup_sets = self.find_disk_1_dups()
        total_to_delete = 0

        for key in dup_sets:
            externalDisk1_dups = []  # these contain CIC-ExternalDisk1
            other_dups = []          # these do NOT have CIC-ExternalDisk1

            # print all in the set
            print('\n{}'.format(key))
            for path in self.dup_map[key]:
                print('- {}'.format(path))
                if "CIC-ExternalDisk1/" in path:
                    externalDisk1_dups.append(path)
                else:
                    other_dups.append(path)

            # only delete when a CIC-ExternalDisk1 copy survives
            if other_dups and externalDisk1_dups:
                print('\n - to delete')
                for dup in other_dups:
                    print(' '.join((' -', dup)))
                    total_to_delete += 1
                    self.delete_record(dup)

        print('total to delete: {}'.format(total_to_delete))

    def is_ignorable(self, path):
        """
        :return: True if path contains one of the known working-directory
                 names, else False
        """
        ignorables = [
            'design and work files',
            'work files',
            'work files restore',
            'ignore these',
            'need to be archived'
        ]
        return any(i in path for i in ignorables)

    def _choose_keeper(self, sorted_paths):
        """
        Pick the path to keep from a non-empty, sorted list of paths.

        The path with the numerically lowest 'disc N' wins (so disc 2 beats
        disc 10); ties go to the path that sorts first. When no path carries
        a disc number the first path is kept.
        """
        numbered = []
        for path in sorted_paths:
            m = self._DISC_NUM_PAT.search(path)
            if m:
                numbered.append((int(m.group(1)), path))
        if numbered:
            return min(numbered)[1]
        return sorted_paths[0]

    def dedup_4(self, dowrites=0, first_only=True):
        """
        For each dupset,
        - if a disk num is found, keep the lowest
        - else, sort paths and keep the first
        and mark every other path in the set for deletion.

        With the defaults this is a dry run over the first dup set only,
        which is what the previously hard-coded flags did. After a run with
        dowrites=1 and first_only=False there should be no more dups in the
        database.

        NOTE(review): the collapsed source does not show whether the original
        'break' ended the outer loop; first_only=True assumes it did - confirm.

        :param dowrites: when true, really delete the redundant records
        :param first_only: when true, stop after the first dup set
        """
        dup_sets = [self.dup_map[checksum]
                    for checksum in sorted(self.dup_map.keys())]
        total_to_delete = 0

        print('{} dupsets'.format(len(dup_sets)))

        for dup_set in dup_sets:
            if not dup_set:
                # nothing to keep or delete in an empty set
                continue

            dup_set.sort()  # in place, as before: dup_map lists end up sorted
            dup_to_keep = self._choose_keeper(dup_set)
            dups_to_kill = [p for p in dup_set if p != dup_to_keep]

            for dup in dups_to_kill:
                print(' '.join(('x -', dup)))
                total_to_delete += 1
                if dowrites:
                    self.delete_record(dup)

            # verbose summary of the decision for this set
            print(' '.join((' - ', dup_to_keep)))
            for d in dups_to_kill:
                print(' '.join((' x ', d)))

            if first_only:
                break

        print('total to delete: {}'.format(total_to_delete))

    def dedup_5(self):
        """
        Runs dedup_4 with its default settings (see dedup_4 for the rules
        used to choose which path of each dupset is kept).
        """
        self.dedup_4()
def __init__(self):
    """Open a CommsDBTable on the file named by ``self.de_dup_db_file``."""
    db_file = self.de_dup_db_file
    self.db = CommsDBTable(db_file)
def __init__(self, sqlite_file):
    """
    :param sqlite_file: database file; remembered on the instance and
                        opened as ``self.db``
    """
    self.sqlite_file = sqlite_file
    db_file = self.sqlite_file
    self.db = CommsDBTable(db_file)
def __init__(self, base_dir, level=0, recursive=True):
    """
    Initialise the DirLister part, then look up the names for ``base_dir``
    in the composite database.

    :param base_dir: passed through to DirLister.__init__
    :param level: passed through to DirLister.__init__
    :param recursive: passed through to DirLister.__init__
    """
    DirLister.__init__(self, base_dir, level, recursive)
    composite_db = CommsDBTable(globals.composite_sqlite_file)
    self.db = composite_db
    # self.base_dir is read back from the instance (not set in this
    # method - presumably by DirLister.__init__)
    self.all_names = composite_db.list_dir(self.base_dir)
def __init__(self, dup_data_path):
    """
    Initialise the DupFinder part, then open the two databases named by
    ``self.composite_sqlite_file`` and ``self.dedupe_sqlite_file``.

    :param dup_data_path: passed through to DupFinder.__init__
    """
    DupFinder.__init__(self, dup_data_path)
    composite_file = self.composite_sqlite_file
    self.composite_DB = CommsDBTable(composite_file)
    dedupe_file = self.dedupe_sqlite_file
    self.dedupe_DB = CommsDBTable(dedupe_file)
class Writer:
    """
    writes files to disk - only if they do not exist at that path

    write records to a dest_sqlite_file
    - an existing dest_sqlite_file is deleted at __init__ only when
      start_with_frest_dest_sqlite_file is set
    """
    src_base_dir = '/Volumes/archives/CommunicationsImageCollection'
    dest_base_dir = '/Volumes/cic-de-duped'
    start_with_frest_dest_sqlite_file = False
    dowrites = 1  # when false, files are not copied; would-be copies are printed

    def __init__(self, src_sqlite_file, dest_sqlite_file, path_pat=None):
        """
        :param src_sqlite_file: database file from which records are selected
        :param dest_sqlite_file: the database to which records are added
        :param path_pat: used to select the records that will be written
        """
        self.src_sqlite_file = src_sqlite_file
        self.dest_sqlite_file = dest_sqlite_file

        if self.start_with_frest_dest_sqlite_file:
            if os.path.exists(self.dest_sqlite_file):
                print('deleting exsting db file at: {}'.format(self.dest_sqlite_file))
                os.remove(self.dest_sqlite_file)

        self.src_db = CommsDBTable(self.src_sqlite_file)
        self.dest_db = CommsDBTable(self.dest_sqlite_file)
        self.records = self.get_records_to_write(path_pat)
        print('there are {} records'.format(len(self.records)))

    def get_records_to_write(self, path_pat=None):
        """
        :param path_pat: substring the record path must contain; None selects
                         every record, sorted by path
        :return: the selected rows from the source database
        """
        if path_pat is None:
            return self.src_db.select_all_records(sort_spec='path ASC')
        # double any quote so the pattern cannot terminate the SQL literal
        escaped = path_pat.replace("'", "''")
        return self.src_db.select('*', "WHERE path LIKE '%{}%'".format(escaped))

    def get_dest_path(self, src_path):
        """
        Map a source path to its destination path: src_base_dir is swapped
        for dest_base_dir, and a leading '/Staging' becomes '/Field Projects'.

        :param src_path: path found in the source database
        :return: the corresponding path under dest_base_dir
        """
        dest_rel_path = src_path.replace(self.src_base_dir, '', 1)
        if dest_rel_path.startswith('/Staging'):
            # only the leading segment is renamed; a nested '/Staging'
            # further down the path must be left alone
            dest_rel_path = dest_rel_path.replace('/Staging', '/Field Projects', 1)
        return self.dest_base_dir + dest_rel_path

    def write_all_records(self, start=0):
        """
        Write every record from index start onwards, printing progress
        after each 100th record.

        :param start: index into self.records at which to begin
        """
        num_recs = len(self.records)
        for i, rec in enumerate(self.records[start:], start + 1):
            self.write_record(rec)
            if i > 0 and i % 100 == 0:
                print(u'{}/{}'.format(i, num_recs))

    def write_record(self, rec):
        """
        Add the record to the destination database (unless its path is
        already there) and copy its file (unless it already exists).

        :param rec: a db_record
        :return:
        """
        rec = list(rec)    # to clone row of data
        src_path = rec[1]  # path is the second field
        dest_path = self.get_dest_path(src_path)

        # The stored value stays unescaped: add_dest_record binds values as
        # parameters. Only the hand-built WHERE clause needs quotes doubled.
        rec[1] = dest_path
        if not self.db_rec_exists(dest_path.replace("'", "''")):
            self.add_dest_record(rec)

        dest_file_path = globals.normalize_file_path(dest_path)
        src_file_path = globals.normalize_file_path(src_path)

        if self.dowrites:
            dest_dir = os.path.dirname(dest_file_path)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
            if not os.path.exists(dest_file_path):
                shutil.copy2(src_file_path, dest_file_path)
        elif not os.path.exists(dest_file_path):
            print(' '.join((' - would have copied from ', src_file_path)))
            print(u' - {} ({})'.format(dest_file_path, rec[4]))

    def db_rec_exists(self, path):
        """
        return True if this path exists in the Destination DB

        :param path: path with single quotes already doubled for SQL
        :return: the count reported by the destination table (truthy when the
                 path exists), or False if the lookup failed
        """
        normalized = path  # so the error message works even if normalizing fails
        try:
            normalized = globals.normalize_db_path(path)
            return self.dest_db.count_selected(
                "WHERE path = '{}'".format(normalized))
        except Exception:
            print(u'ERROR: db_rec_exists choked on "{}"'.format(normalized))
            return False

    def add_dest_record(self, row):
        """
        Insert row into the destination table.

        Values are bound as statement parameters, so quotes and non-ASCII
        text need no escaping. Non-text values are still converted with
        str() first, as the earlier quoted-literal version did.

        On any failure the error is printed and the program exits.

        :param row: values in the order of the destination schema fields
        """
        text_types = (type(u''), type(''))
        values = [v if isinstance(v, text_types) else str(v) for v in row]
        placeholders = ','.join(['?'] * len(values))
        statement = "INSERT INTO {tn} ({fn}) VALUES ({fv})".format(
            tn=self.dest_db.table_name,
            fn=self.dest_db.schema.quoted_schema,
            fv=placeholders)

        conn = sqlite3.connect(self.dest_sqlite_file)
        try:
            conn.cursor().execute(statement, values)
            conn.commit()
        except Exception:
            print(repr(values))
            print('ERROR: {}'.format(sys.exc_info()))
            traceback.print_stack()
            sys.exit()
        finally:
            # runs on the sys.exit() path too, so the connection never leaks
            conn.close()
print 'Copy error: {}'.format(err) else: # print '- would have copied {}: {}'.format(status, record.path) pass not_found.append(record.path) elif i > 0 and i % 100 == 0: print status return not_found if __name__ == '__main__': sqlite_file = '/Users/ostwald/Documents/Comms/Composite_DB/cic-de-duped.sqlite' db = CommsDBTable(sqlite_file) if 1: # where_clause = "where path LIKE '/Volumes/cic-de-duped/CIC-ExternalDisk1/disc %'" # where_clause = "where path LIKE '/Volumes/cic-de-duped/CIC-ExternalDisk1/disc 15/%'" where_clause = "where path LIKE '/Volumes/cic-de-duped/CIC-ExternalDisk6/%'" # where_clause = "WHERE path like '%cic-de-duped/Field Projects%'" records = db.select('file_name, path', where_clause) print '{} records selected'.format(len(records)) not_found = verify_records(records) print '\n{} were not found'.format(len(not_found)) for path in not_found: print path if 0: records = db.select_all_records()
class DirCmp:
    """
    Compares directories by the checksums of the records stored under them
    in a sqlite database wrapped by CommsDBTable.
    """
    verbose = 0  # when true, compare() also lists each missing checksum

    def __init__(self, sqlite_file):
        """
        :param sqlite_file: database file holding the records to compare
        """
        self.db = CommsDBTable(sqlite_file)

    def compare(self, dirA, dirB):
        """
        Report the checksums found under dirA that are not found under dirB.

        :param dirA: path prefix of the first directory
        :param dirB: path prefix of the second directory
        :return: checksums in dirA but not in dirB, in dirA order (a checksum
                 occurring several times in dirA is listed each time)
        """
        dirA_cksums = [row[0] for row in self.get_dir_items(dirA)]
        dirB_cksums = [row[0] for row in self.get_dir_items(dirB)]

        print('DirA: {} ({} chksums)'.format(dirA, len(dirA_cksums)))
        print('DirB: {} ({} chksums)'.format(dirB, len(dirB_cksums)))

        # set membership keeps this linear instead of scanning dirB per item
        in_dirB = set(dirB_cksums)
        items = [n for n in dirA_cksums if n not in in_dirB]

        print('{} Items in dirA but not in dirB'.format(len(items)))
        if self.verbose:
            for n in items:
                print(n)
        return items

    def get_dir_items(self, path):
        """
        :param path: path prefix to match
        :return: (check_sum, path) rows for every record whose path starts
                 with the given prefix
        """
        # double any quote so the prefix cannot terminate the SQL literal
        escaped = path.replace("'", "''")
        return self.db.select('check_sum, path',
                              "WHERE path LIKE '{}%'".format(escaped))

    def get_sub_dirs_OFF(self, path):
        """
        Earlier variant of get_sub_dirs (the _OFF suffix suggests it is
        disabled - confirm before use).

        Returns the distinct directories exactly one level below
        os.path.dirname(path) that directly contain at least one record.
        NOTE: because of the dirname call, path must end with '/' to name
        the directory itself.
        """
        base_dir = os.path.dirname(path)
        segments = len(base_dir.split('/'))
        sub_dirs = set()
        for item in self.get_dir_items(base_dir):
            sub_dir = os.path.dirname(item[1])
            if len(sub_dir.split('/')) == segments + 1:
                sub_dirs.add(sub_dir)
        return list(sub_dirs)

    def get_sub_dirs(self, path):
        """
        :param path: directory path, with or without a trailing '/'
        :return: paths of the immediate child directories of path that have
                 at least one record somewhere below them, in the order they
                 are first seen
        """
        base_dir = path.rstrip('/')
        base_dir_len = len(base_dir.split('/'))

        # the trailing '/' keeps siblings that merely share a name prefix
        # (e.g. '/a/bc' when asking for '/a/b') out of the result
        items = self.get_dir_items(base_dir + '/')
        print('{} items in {}'.format(len(items), path))

        child_names = []
        seen = set()
        for item in items:
            splits = os.path.dirname(item[1]).split('/')
            if len(splits) > base_dir_len:
                child_name = splits[base_dir_len]
                if child_name not in seen:
                    seen.add(child_name)
                    child_names.append(child_name)
        return [os.path.join(path, x) for x in child_names]

    def top_level_compare(self, dirA, dirB):
        """
        Compare dirA and dirB in both directions, list their immediate
        sub-directories, then compare (both directions) every sub-directory
        of dirA that has a same-named sub-directory in dirB.
        """
        self.compare(dirA, dirB)
        self.compare(dirB, dirA)

        subDirsA = self.get_sub_dirs(dirA)
        subDirsB = self.get_sub_dirs(dirB)

        print('\nsubDirsA')
        for s in subDirsA:
            print(s)
        print('\nsubDirsB')
        for s in subDirsB:
            print(s)

        for subDirA in subDirsA:
            name = os.path.basename(subDirA)
            subDirB = os.path.join(dirB, name)
            if subDirB in subDirsB:
                print('\n{} is in both'.format(name))
                self.compare(subDirA, subDirB)
                self.compare(subDirB, subDirA)
            else:
                print('\n{} is in A but not in B'.format(name))