import os, sys, re
import sqlite3
import globals
from comms_db import CommsDBTable

# Report how many records in the FILTERED database still have a path that
# contains one of the skip fragments configured in the project ``globals``
# module.
#
# An earlier, lower-cased variant of the filter list is kept for reference:
# path_frag_filter = map (lambda x:x.lower(), globals.SKIP_DIR_NAME_FRAGS + globals.SKIP_DIR_NAMES)
path_frag_filter = globals.SKIP_DIR_NAME_FRAGS + globals.SKIP_DIR_NAMES

# NOTE: hard-coded, machine-specific database location.
sqlite_file = '/Users/ostwald/Documents/Comms/FILTERED.sqlite'
db = CommsDBTable (sqlite_file)

for frag in path_frag_filter:
    # The fragment is used as-is (case preserved) inside a LIKE '%...%' pattern.
    recs = db.select ('*', "WHERE path like '%{}%'".format(frag))
    # One output line per fragment: the fragment, then the number of rows selected.
    print frag, len(recs)
Exemple #2
0
class Reaper(DupManager):
    def __init__(self, dup_data_path, sqlite_file):
        """
        Initialise the DupManager base and open the database table wrapper.

        :param dup_data_path: passed straight through to DupManager.__init__
        :param sqlite_file: path of the sqlite file wrapped by CommsDBTable
        """
        DupManager.__init__(self, dup_data_path)
        self.db = CommsDBTable(sqlite_file)

    def delete_record(self, path):
        """
        Delete the record(s) whose ``path`` column equals *path*.

        This is best effort: a failure is reported on stdout and swallowed so
        that a long de-dup run is not aborted by a single bad record.

        :param path: value of the path column of the record to delete
        :return: None
        """
        # NOTE(review): path is interpolated into the clause unescaped, so a
        # path containing an apostrophe will make the delete fail (and be
        # reported below) - confirm how paths are stored before escaping here.
        try:
            self.db.delete_record("path = '{}'".format(path))
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still
            # interrupt a long-running de-dup pass.
            print 'delete_record ERROR: {}'.format(sys.exc_info())

    def select_record(self, path):
        """
        Return every row whose ``path`` column equals *path*.

        A dedicated connection to ``self.db.sqlite_file`` is opened for the
        query and always closed again, even when the query raises.

        :param path: value of the path column to look up
        :return: list of row tuples as returned by cursor.fetchall()
        """
        query = "SELECT * from {} WHERE path = '{}'".format(
            self.db.table_name, path)

        conn = sqlite3.connect(self.db.sqlite_file)
        try:
            c = conn.cursor()
            c.execute(query)
            return c.fetchall()
        finally:
            # the original leaked the connection on every call
            conn.close()

    def filter_by_paths(self):
        """
        using the paths defined in globals, delete all images from relavant paths

        this is only necessary if global paths have changed since database was created
        """
        path_frag_filter = map(
            lambda x: x.lower(),
            globals.SKIP_DIR_NAME_FRAGS + globals.SKIP_DIR_NAMES)
        clause = "(" + " OR ".join(
            map(lambda x: "LOWER(path) like '%{}%'".format(x),
                path_frag_filter)) + ")"

        print clause

        conn = sqlite3.connect(self.db.sqlite_file)
        c = conn.cursor()
        query = "DELETE from {} WHERE {}".format(self.db.table_name, clause)

        c.execute(query)
        print 'records affected: {}'.format(conn.total_changes)
        conn.commit()

    def dedup_1(self):
        """
        the dup_sets found by find_disk_1_dups all have at least one ExternalDisk1 version
        - first delete the
        :return:
        """
        dup_sets = self.find_disk_1_dups()

        disc_pat = re.compile("CIC-ExternalDisk1/disc [0-9]*")

        for key in dup_sets:

            disc_pat_keepers = []  # these match disc_pat
            other_keepers = []  # these just have CIC-ExternalDisk1
            others = []

            # print all in the set
            print '\n{}'.format(key)
            for path in self.dup_map[key]:
                print '- {}'.format(path)
                if self.is_ignorable(path): continue
                if disc_pat.search(path):
                    disc_pat_keepers.append(path)
                elif 'CIC-ExternalDisk1' in path:
                    other_keepers.append(path)
                else:
                    others.append(path)

            to_keep = None

            if len(disc_pat_keepers) > 0:
                to_keep = disc_pat_keepers
                print "\n  disc_pat_keepers"
                for p in disc_pat_keepers:
                    print '  -', p

            elif len(other_keepers) > 0:
                to_keep = other_keepers

                print "\n  other keepers"
                for p in other_keepers:
                    print '  -', p

            elif len(others) > 0:
                to_keep = others
                print "\n  others"
                for p in others:
                    print '  -', p

            else:
                print '\n  WARN: No Keepers found for {}'.format(key)

            if to_keep is not None:
                print '\n  To Delete'
                for path in self.dup_map[key]:
                    if not path in to_keep:
                        print '     x', path
                        # self.delete_record (path)

    def dedup_2(self):
        """
        We are looking for chances to delete non-CIC-ExternalDisk1 dups. So if there is
        a CIC-ExternalDisk1, then delete all the non-CIC-ExternalDisk1 dups in the dupset
        """
        dup_sets = self.find_disk_1_dups()

        disc_pat = re.compile("CIC-ExternalDisk1/disc [0-9]*")

        total_to_delete = 0

        for key in dup_sets:

            externalDisk1_dups = []  # these contain CIC-ExternalDisk1
            other_dups = []  # these do NOT have CIC-ExternalDisk1

            # print all in the set
            print '\n{}'.format(key)
            for path in self.dup_map[key]:
                print '- {}'.format(path)
                if "CIC-ExternalDisk1/" in path:
                    externalDisk1_dups.append(path)
                else:
                    other_dups.append(path)

            if len(other_dups) > 0 and len(externalDisk1_dups) > 0:
                print '\n - to delete'
                for dup in other_dups:
                    print '  -', dup
                    total_to_delete += 1
                    self.delete_record(dup)

        print 'total to delete: {}'.format(total_to_delete)

    def is_ignorable(self, path):
        """
        Tell whether *path* contains one of the "ignorable" folder names.

        The match is a plain, case-sensitive substring test.

        :param path: path string to inspect
        :return: 1 if any ignorable fragment occurs in path, else 0
        """
        ignorables = (
            'design and work files',
            'work files',
            'work files restore',
            'ignore these',
            'need to be archived',
        )
        return 1 if any(fragment in path for fragment in ignorables) else 0

    def dedup_4(self):
        """
        After this round there will be no more dups in the database.

        For each dupset,

        - if a disk num is found, keep the lowest
        - else, sort paths and keep the first
        """
        dup_set_keys = sorted(self.dup_map.keys())

        dup_sets = []  # these will have at least one copy on CIC-ExternalDisk1
        for checksum in dup_set_keys:
            dup_sets.append(self.dup_map[checksum])

        disc_pat = re.compile("CIC-ExternalDisk1/disc ([0-9]+)")

        total_to_delete = 0
        dowrites = 0

        print '{} dupsets'.format(len(dup_sets))

        for dup_set in dup_sets:
            dup_to_keep = None

            # print all in the set
            dup_set.sort()
            for path in dup_set:
                #print u'- {}'.format(path)

                m = disc_pat.search(path)
                if m:
                    dup_to_keep = path
                    break

            # if we haven't found a disc N dup, then just pick one
            if dup_to_keep is None:
                dup_to_keep = dup_set[0]

            dups_to_kill = filter(lambda x: x != dup_to_keep, dup_set)

            for dup in dups_to_kill:
                print 'x  -', dup
                total_to_delete += 1
                if dowrites:
                    self.delete_record(dup)

            if 1:  # VERBOSE
                print ' - ', dup_to_keep
                for d in dups_to_kill:
                    print '  x  ', d

            break

        print 'total to delete: {}'.format(total_to_delete)

    def dedup_5(self):
        """
        Run the dedup_4 pass; this method only delegates to it.

        See dedup_4 for the rule used to pick the path kept in each dup set.
        """
        self.dedup_4()
Exemple #3
0
 def __init__(self):
     """
     Open the table wrapper for the database file named by
     self.de_dup_db_file.

     NOTE(review): de_dup_db_file is not assigned here; presumably it is a
     class attribute of the enclosing class - confirm.
     """
     self.db = CommsDBTable(self.de_dup_db_file)
Exemple #4
0
 def __init__(self, sqlite_file):
     """
     Remember the database location and open a table wrapper for it.

     :param sqlite_file: path of the sqlite file handed to CommsDBTable
     """
     self.sqlite_file = sqlite_file
     self.db = CommsDBTable(self.sqlite_file)
Exemple #5
0
 def __init__(self, base_dir, level=0, recursive=True):
     """
     Initialise the DirLister base, then load the names the composite
     database lists for the base directory.

     :param base_dir: passed through to DirLister.__init__
     :param level: passed through to DirLister.__init__ (default 0)
     :param recursive: passed through to DirLister.__init__ (default True)
     """
     DirLister.__init__(self, base_dir, level, recursive)
     self.db = CommsDBTable(globals.composite_sqlite_file)
     # presumably self.base_dir was set by DirLister.__init__ - confirm
     self.all_names = self.db.list_dir(self.base_dir)
Exemple #6
0
 def __init__(self, dup_data_path):
     """
     Initialise the DupFinder base and open both database table wrappers.

     :param dup_data_path: passed through to DupFinder.__init__

     NOTE(review): composite_sqlite_file and dedupe_sqlite_file are not
     assigned here; presumably class attributes or set by the base - confirm.
     """
     DupFinder.__init__(self, dup_data_path)
     self.composite_DB = CommsDBTable(self.composite_sqlite_file)
     self.dedupe_DB = CommsDBTable(self.dedupe_sqlite_file)
Exemple #7
0
class Writer:
    """
    writes files to disk
    - only if they do not exist at that path
    write records to a dest_sqlite_file
    - existing dest_sqlite_file deleted at __init__
    """

    src_base_dir = '/Volumes/archives/CommunicationsImageCollection'
    dest_base_dir = '/Volumes/cic-de-duped'
    start_with_frest_dest_sqlite_file = False
    dowrites = 1

    def __init__ (self, src_sqlite_file, dest_sqlite_file, path_pat=None):
        """

        :param src_sqlite_file: database file from which records are selected
        :param dest_sqlite_file: the database to which records are added
        :param path_pat: used to select the records that will be written
        """
        self.src_sqlite_file = src_sqlite_file
        self.dest_sqlite_file = dest_sqlite_file

        if self.start_with_frest_dest_sqlite_file:
            if os.path.exists(self.dest_sqlite_file):
                print 'deleting exsting db file at: {}'.format(self.dest_sqlite_file)
                os.remove(self.dest_sqlite_file)

        self.src_db = CommsDBTable(self.src_sqlite_file)
        self.dest_db = CommsDBTable(self.dest_sqlite_file)

        # self.records = map (DBRecord, self.src_db.select_all_records())
        # self.records = self.src_db.select_all_records(sort_spec='path ASC')
        self.records = self.get_records_to_write(path_pat)
        print 'there are {} records'.format(len(self.records))

    def get_records_to_write (self, path_pat=None):
        # print 'get_records_to_write: "{}"'.format(path_pat)
        if path_pat is None:
            return self.src_db.select_all_records(sort_spec='path ASC')
        else:
            return self.src_db.select('*', "WHERE path LIKE '%{}%'".format(path_pat))

    def get_dest_path (self, src_path):
        dest_rel_path = src_path.replace(self.src_base_dir, '')
        if dest_rel_path.startswith('/Staging'):
            dest_rel_path = dest_rel_path.replace ('/Staging', '/Field Projects')
        # dest_path = src_path.replace (self.src_base_dir, self.dest_base_dir)
        dest_path = self.dest_base_dir + dest_rel_path
        return dest_path

    def write_all_records (self, start=0):
        num_recs = len(self.records)

        i = start
        for rec in self.records[start:]:
            self.write_record(rec)
            i += 1
            if i > 0 and i % 100 == 0:
                print u'{}/{}'.format(i, num_recs)

    def write_record (self, rec):
        """
        Add one record to the destination database (unless a record with the
        same destination path already exists) and copy its file to the
        destination (unless a file already exists there).

        When the class flag dowrites is off, no file is copied; a
        "would have copied" line is printed instead.

        :param rec: a db_record; field 1 is the path, field 4 is printed in
            the summary line
        :return: None
        """
        rec = list(rec)  # to clone row of data
        src_path = rec[1] # path is the second field
        dest_path = self.get_dest_path(src_path)

        # escape quote for sql
        # NOTE(review): add_dest_record escapes quotes again when it builds
        # the INSERT, so the stored path appears to end up with doubled
        # apostrophes - confirm against globals.normalize_db_path before
        # changing either side.
        rec[1] = dest_path.replace ("'", "''")
        if not self.db_rec_exists(rec[1]):
            self.add_dest_record(rec)

        # File-system operations use the unescaped paths, passed through the
        # project's normalize_file_path helper.
        dest_file_path = globals.normalize_file_path(dest_path)
        src_file_path = globals.normalize_file_path(src_path)
        if self.dowrites:
            # create the destination directory on demand
            if not os.path.exists(os.path.dirname(dest_file_path)):
                os.makedirs(os.path.dirname(dest_file_path))
            # never overwrite an existing destination file
            if not os.path.exists(dest_file_path):
                shutil.copy2 (src_file_path, dest_file_path)
        else:
            if not os.path.exists(dest_file_path):
                print ' - would have copied from ', src_file_path
        print u' - {} ({})'.format(dest_file_path, rec[4])

    def db_rec_exists(self, path):
        """
        Tell whether a record with this path exists in the destination DB.

        :param path: path to look up; it is placed inside a quoted SQL
            literal, so the caller must already have escaped apostrophes
        :return: the count reported by the destination db (truthy when at
            least one record matches), or False when the lookup failed
        """
        # Fallback so the error message below can always be built, even when
        # normalize_db_path itself is what raised.
        normalized = path
        try:
            normalized = globals.normalize_db_path(path)
            return self.dest_db.count_selected("WHERE path = '{}'".format(normalized))
        except Exception:
            print u'ERROR: db_rec_exists choked on "{}"'.format(normalized)
            return False

    def add_dest_record (self,row):

        conn = sqlite3.connect(self.dest_sqlite_file)
        c = conn.cursor()

        # quoted_schema = ','.join(map (lambda x:"'%s'" % x, HOSTS_SCHEMA_SPEC))
        quoted_schema = self.dest_db.schema.quoted_schema

        # put data list together to match with schema fields
        quoted_values = ','.join(map (lambda x:u"'{}'".format(str (x).replace("'", "''")), row))  # current

        try:
            c.execute("INSERT INTO {tn} ({fn}) VALUES ({fv})" \
                      .format(tn=self.dest_db.table_name, fn=quoted_schema, fv=quoted_values.encode('utf8')))
        except:
            print 'quoted_values is a {}'.format(type(quoted_values))
            print quoted_values
            print('ERROR: {}'.format(sys.exc_info()))
            traceback.print_stack()
            sys.exit()

        conn.commit()
        conn.close()
                    print 'Copy error: {}'.format(err)
            else:
                # print '- would have copied {}: {}'.format(status, record.path)
                pass

            not_found.append(record.path)

        elif i > 0 and i % 100 == 0:
            print status

    return not_found


# Script entry point: select records from the de-duped database by path
# pattern, pass them to verify_records, and list the paths it reports as
# not found.
if __name__ == '__main__':
    # NOTE: hard-coded, machine-specific database location.
    sqlite_file = '/Users/ostwald/Documents/Comms/Composite_DB/cic-de-duped.sqlite'
    db = CommsDBTable(sqlite_file)

    if 1:
        # Alternative selections, kept for manual switching:
        # where_clause = "where path LIKE '/Volumes/cic-de-duped/CIC-ExternalDisk1/disc %'"
        # where_clause = "where path LIKE '/Volumes/cic-de-duped/CIC-ExternalDisk1/disc 15/%'"
        where_clause = "where path LIKE '/Volumes/cic-de-duped/CIC-ExternalDisk6/%'"
        # where_clause = "WHERE path like '%cic-de-duped/Field Projects%'"
        records = db.select('file_name, path', where_clause)
        print '{} records selected'.format(len(records))
        # verify_records returns the paths it could not find
        not_found = verify_records(records)
        print '\n{} were not found'.format(len(not_found))
        for path in not_found:
            print path

    # disabled branch
    if 0:
        records = db.select_all_records()
Exemple #9
0
class DirCmp:

    verbose = 0

    def __init__(self, sqlite_file):
        """
        Open the table wrapper used for all directory queries.

        :param sqlite_file: path of the sqlite file handed to CommsDBTable
        """
        self.db = CommsDBTable(sqlite_file)

    def compare(self, dirA, dirB):
        dirA_items = self.get_dir_items(dirA)
        dirA_cksums = map(lambda x: x[0], dirA_items)

        dirB_items = self.get_dir_items(dirB)
        dirB_cksums = map(lambda x: x[0], dirB_items)

        print 'DirA: {} ({} chksums)'.format(dirA, len(dirA_cksums))
        print 'DirB: {} ({} chksums)'.format(dirB, len(dirB_cksums))

        # print 'Items in {} but not in {}'.format(dirA, dirB)
        items = []
        for n in dirA_cksums:
            if not n in dirB_cksums:
                items.append(n)
        print '{} Items in dirA but not in dirB'.format(len(items))

        if self.verbose:
            for n in items:
                print n

    def get_dir_items(self, path):
        rows = self.db.select('check_sum, path',
                              "WHERE path LIKE '{}%'".format(path))
        return rows

    def get_sub_dirs_OFF(self, path):
        base_dir = os.path.dirname(path)
        segments = len(base_dir.split('/'))
        items = self.get_dir_items(base_dir)
        sub_dirs = []
        for item in items:
            sub_dir = os.path.dirname(item[1])
            if len(sub_dir.split('/')) == segments + 1:
                sub_dirs.append(sub_dir)
        return list(set(sub_dirs))

    def get_sub_dirs(self, path):
        base_dir = os.path.dirname(path)
        base_dir_len = len(base_dir.split('/'))
        items = self.get_dir_items(base_dir)
        print '{} items in {}'.format(len(items), path)
        sub_dirs = []
        child_names = []
        for item in items:
            dir_path = os.path.dirname(item[1])
            splits = dir_path.split('/')
            if len(splits) > base_dir_len:
                child_name = splits[base_dir_len]
                if not child_name in child_names:
                    child_names.append(child_name)
        return map(lambda x: os.path.join(path, x), child_names)

    def top_level_compare(self, dirA, dirB):

        dc.compare(dirA, dirB)
        dc.compare(dirB, dirA)

        subDirsA = self.get_sub_dirs(dirA)
        subDirsB = self.get_sub_dirs(dirB)

        print '\nsubDirsA'
        for s in subDirsA:
            print s

        print '\nsubDirsB'
        for s in subDirsB:
            print s

        for subDirA in subDirsA:
            name = os.path.basename(subDirA)
            # print name
            subDirB = os.path.join(dirB, name)
            # if subDirsB.index (subDirB) > -1:
            if subDirB in subDirsB:
                print '\n{} is in both'.format(name)
                dc.compare(subDirA, subDirB)
                dc.compare(subDirB, subDirA)
            else:
                print '\n{} is in A but not in B'.format(name)