Example #1
def __init__(self, path, name, wb_interface, monitor=False):
    self.path = path
    self.name = name
    self.wb_interface = wb_interface
    self.monitor = monitor
    #c self.Q = myQueue()
    self.Q = pyQueue()
    self.R = pyQueue()
    self.response = None
    self.localReset = Signal(bool(0))
    self.error = None
    self.value = None
    # transaction information (simulation only)
    self._write = Signal(bool(0))  # write command in progress
    self._read = Signal(bool(0))  # read command in progress
    self._address = 0  # address of current/last transaction
    self._data = 0  # ??? @todo: is this used ???
    self._write_data = 0  # holds the data written
    self._read_data = 0  # holds the data read
    # Utility signals
    self.inprog = Signal(bool(0))
    self.iswrite = Signal(bool(0))
    self.done = Signal(bool(0))
    # bus transaction timeout in clock ticks
    self.timeout = 10000
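
The constructor above only sets up the command queue (self.Q) and response queue (self.R); the snippet does not show how they are serviced. Below is a minimal, hypothetical sketch of the request/response pattern they imply; the helper name, the dict layout, and the blocking get are assumptions, not part of the original class.

# Hypothetical sketch only: 'bus' is an instance of the class whose __init__
# is shown above; the command dict keys and the helper name are invented here.
def write_transaction(bus, address, data, timeout_s=1.0):
    # enqueue a write command for whatever coroutine or thread services bus.Q
    bus.Q.put({'cmd': 'write', 'address': address, 'data': data})
    # block until the servicing side posts a response on bus.R
    return bus.R.get(timeout=timeout_s)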
Example #2
def scrape_tree_meta(paths, cliargs, reindex_dict):
    jobstart = time.time()
    worker = get_worker_name()
    tree_dirs = []
    tree_files = []
    tree_crawltimes = []
    qumulo = cliargs['qumulo']
    totalcrawltime = 0
    # amount of time (sec) before starting threads to help crawl files
    filethreadtime = diskover.config['filethreadtime']

    for path in paths:
        threadsstarted = False
        starttime = time.time()
        root, files = path
        if qumulo:
            import diskover_qumulo
            if root['path'] != '/':
                root_path = root['path'].rstrip(os.path.sep)
            else:
                root_path = root['path']
            dmeta = diskover_qumulo.qumulo_get_dir_meta(worker, root, cliargs, reindex_dict, redis_conn)
        else:
            root_path = root
            dmeta = get_dir_meta(worker, root, cliargs, reindex_dict)
        if dmeta == "sametimes":
            # fetch meta data for directory and all its files (doc sources) from index2 since
            # directory times haven't changed
            dir_source, files_source = get_metadata(root_path, cliargs)
            datenow = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
            for file_source in files_source:
                # update indexed at time
                file_source['indexing_date'] = datenow
                # update worker name
                file_source['worker_name'] = worker
                tree_files.append(('file', file_source))
            if dir_source:
                # update indexed at time
                dir_source['indexing_date'] = datenow
                # update worker name
                dir_source['worker_name'] = worker
                tree_dirs.append(dir_source)
                elapsed = time.time() - starttime
                tree_crawltimes.append({
                        "path": root_path,
                        "worker_name": worker,
                        "crawl_time": round(elapsed, 10),
                        "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f"),
                        "_type": "crawlstat"})
                totalcrawltime += elapsed
        else:  # get meta from disk since times in Redis differ from times on disk
            for file in files:
                # spawn threads to help with getting file meta if running long
                if (time.time() - starttime) > filethreadtime:
                    if not threadsstarted:
                        bot_logger.info('*** %s taking more than %s sec to crawl, starting threads to help scrape file meta'
                                        % (root, filethreadtime))
                        # set up python Queue for threaded file meta scraping
                        file_in_thread_q = pyQueue()
                        file_out_thread_q = pyQueue()
                        start_file_threads(file_in_thread_q, file_out_thread_q)
                    threadsstarted = True
                    if qumulo:
                        file_in_thread_q.put((worker, file, cliargs, reindex_dict))
                    else:
                        file_in_thread_q.put((worker, os.path.join(root, file), cliargs, reindex_dict))
                else:
                    if qumulo:
                        fmeta = diskover_qumulo.qumulo_get_file_meta(worker, file, cliargs, reindex_dict)
                    else:
                        fmeta = get_file_meta(worker, os.path.join(root, file), cliargs, reindex_dict)
                    if fmeta:
                        tree_files.append(fmeta)
            if threadsstarted:
                bot_logger.info('*** Waiting for threads to finish...')
                # wait for threads to finish
                file_in_thread_q.join()
                bot_logger.info('*** Adding file meta thread results for %s' % root)
                # get all files and add to tree_files
                while file_out_thread_q.qsize():
                    tree_files.append(file_out_thread_q.get())
            if dmeta:
                tree_dirs.append(dmeta)
                elapsed = time.time() - starttime
                tree_crawltimes.append({
                    "path": root_path,
                    "worker_name": worker,
                    "crawl_time": round(elapsed, 10),
                    "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f"),
                    "_type": "crawlstat"})
                totalcrawltime += elapsed

    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_adder(worker, (tree_dirs, tree_files, tree_crawltimes), cliargs, totalcrawltime)

    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED JOB, Elapsed Time: ' + str(elapsed_time))
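
start_file_threads() is called above but not shown in this snippet. Below is a minimal sketch of what such a helper could look like, assuming the worker threads consume the (worker, path, cliargs, reindex_dict) tuples queued above, reuse get_file_meta() for the non-qumulo case, and call task_done() so that file_in_thread_q.join() can return; the thread count and the worker body are assumptions.

from threading import Thread

def start_file_threads(in_q, out_q, num_threads=8):
    # Hypothetical sketch; the real implementation is not part of this snippet.
    def worker():
        while True:
            worker_name, path, cliargs, reindex_dict = in_q.get()
            try:
                # reuse the same per-file scraper called in the unthreaded branch above
                fmeta = get_file_meta(worker_name, path, cliargs, reindex_dict)
                if fmeta:
                    out_q.put(fmeta)
            finally:
                in_q.task_done()  # lets in_q.join() unblock once every item is processed
    for _ in range(num_threads):
        t = Thread(target=worker)
        t.daemon = True
        t.start()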
Example #3
import time
import hashlib
try:
    from Queue import Queue as pyQueue
except ImportError:
    from queue import Queue as pyQueue
from threading import Thread, RLock
from diskover import config, plugins, progress_bar
from diskover_bot_module import get_worker_name, auto_tag, es_bulk_add, file_excluded

fake_dirs = []
buckets = []
workername = get_worker_name()

# create queue and threads for bulk adding to ES
s3queue = pyQueue()
s3threadlock = RLock()


def process_line(row, tree_dirs, tree_files, cliargs):
    global fake_dirs

    n = 2
    # S3 Inventory csv column headers
    inventory_dict = {'s3_bucket': row[0], 's3_key': row[1]}
    try:
        inventory_dict['s3_size'] = int(row[n])
        n = n + 1
    except IndexError:
        pass
    try:
Example #4
                  ),
                  result_ttl=config['redis_ttl'])

    logger.info(
        '%s possible dupe file hashes have been enqueued, worker bots processing dupes...'
        % possibledupescount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()


# set up python Queue for threaded file md5 checking
file_in_thread_q = pyQueue()
file_out_thread_q = pyQueue()
start_file_threads()
Example #5
def verify_dupes(hashgroup, cliargs):
    """This is the verify dupes function.
    It processes the files in hashgroup to verify whether they are duplicates.
    The first few bytes at the beginning and end of each file are
    compared and, if they match, an md5 check is run on the files.
    If the files are duplicates, their dupe_md5 field
    is updated to their md5sum.
    Returns hashgroup, or None if no duplicates remain.
    """

    # number of bytes to check at start and end of file
    read_bytes = config['dupes_checkbytes']
    # minimum bytes to read if the file is smaller than the above
    min_read_bytes = 1

    # Add first and last few bytes for each file to dictionary

    # create a new dictionary with files that have same byte hash
    hashgroup_bytes = {}
    for file in hashgroup['files']:
        try:
            f = open(file['filename'], 'rb')
        except (IOError, OSError):
            continue
        except Exception:
            continue
        # read the first few bytes; fall back to a single byte if the read fails
        try:
            bytes_f = base64.b64encode(f.read(read_bytes))
        except (IOError, OSError):
            try:
                bytes_f = base64.b64encode(f.read(min_read_bytes))
            except Exception:
                continue
        try:
            f.seek(-read_bytes, os.SEEK_END)
            bytes_l = base64.b64encode(f.read(read_bytes))
        except (IOError, OSError):
            try:
                f.seek(-min_read_bytes, os.SEEK_END)
                bytes_l = base64.b64encode(f.read(min_read_bytes))
            except Exception:
                continue
        f.close()

        # create hash of bytes
        bytestring = str(bytes_f) + str(bytes_l)
        bytehash = hashlib.md5(bytestring.encode('utf-8')).hexdigest()

        # create new key for each bytehash and
        # set value as new list and add file
        hashgroup_bytes.setdefault(bytehash, []).append(file['filename'])

    # remove any bytehash key that only has 1 item (no duplicate)
    for key, value in list(hashgroup_bytes.items()):
        if len(value) < 2:
            filename = value[0]
            del hashgroup_bytes[key]
            # remove file from hashgroup
            for i in range(len(hashgroup['files'])):
                if hashgroup['files'][i]['filename'] == filename:
                    del hashgroup['files'][i]
                    break

    # run md5 sum check if bytes were same
    hashgroup_md5 = {}
    # set up python Queue for threaded file md5 checking
    file_in_thread_q = pyQueue()
    file_out_thread_q = pyQueue()
    start_file_threads(file_in_thread_q, file_out_thread_q)

    # do md5 check on files with same byte hashes
    for key, value in list(hashgroup_bytes.items()):
        for filename in value:
            # add file into thread queue
            file_in_thread_q.put((filename, cliargs))

        # wait for threads to finish
        file_in_thread_q.join()

        # drain md5 results and group files by md5 sum
        while file_out_thread_q.qsize():
            item = file_out_thread_q.get()
            file, md5 = item
            # create new key for each md5 sum and set value as new list and
            # add file
            hashgroup_md5.setdefault(md5, []).append(file)

    # remove any md5sum key that only has 1 item (no duplicate)
    for key, value in list(hashgroup_md5.items()):
        if len(value) < 2:
            filename = value[0]
            del hashgroup_md5[key]
            # remove file from hashgroup
            for i in range(len(hashgroup['files'])):
                if hashgroup['files'][i]['filename'] == filename:
                    del hashgroup['files'][i]
                    break
        else:
            md5 = key

    if len(hashgroup['files']) >= 2:
        # update hashgroup's md5sum key
        hashgroup['md5sum'] = md5
        return hashgroup
    else:
        return None
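
Stripped of the queueing and the hashgroup bookkeeping, the two-stage check described in the docstring reduces to: group files by a hash of their first and last few bytes, then confirm the groups that still collide with a full MD5. A self-contained sketch of that idea follows; the 2 KB read size and the helper names are illustrative, not taken from the original.

import hashlib
import os

def quick_byte_hash(path, read_bytes=2048):
    # hash of the first and last few bytes, used as a cheap prescreen
    with open(path, 'rb') as f:
        head = f.read(read_bytes)
        try:
            f.seek(-read_bytes, os.SEEK_END)
        except OSError:
            f.seek(0)  # file is smaller than read_bytes
        tail = f.read(read_bytes)
    return hashlib.md5(head + tail).hexdigest()

def full_md5(path, chunk_size=1024 * 1024):
    # full-content MD5, only run on files that pass the prescreen
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

def find_dupes(paths):
    # stage 1: group candidate paths by the cheap byte hash
    by_bytes = {}
    for p in paths:
        by_bytes.setdefault(quick_byte_hash(p), []).append(p)
    # stage 2: confirm surviving groups with a full MD5
    by_md5 = {}
    for group in by_bytes.values():
        if len(group) < 2:
            continue  # unique byte hash, cannot be a dupe
        for p in group:
            by_md5.setdefault(full_md5(p), []).append(p)
    return {md5: g for md5, g in by_md5.items() if len(g) > 1}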
Example #6
def __init__(self):
    self.queue = pyQueue()
    self.tracklist = None
    self.tracklistpointer = 0
Example #7
def verify_dupes(hashgroup, cliargs):
    """This is the verify dupes function.
    It processes the files in hashgroup to verify whether they are duplicates.
    The first few bytes at the beginning and end of each file are
    compared and, if they match, an md5 check is run on the files.
    If the files are duplicates, their dupe_md5 field
    is updated to their md5sum.
    Returns hashgroup, or None if no duplicates remain.
    """

    # number of bytes to check at start and end of file
    read_bytes = diskover.config['dupes_checkbytes']
    # minimum bytes to read if the file is smaller than the above
    min_read_bytes = 1

    bot_logger = diskover_worker_bot.bot_logger

    if cliargs['verbose'] or cliargs['debug']:
        bot_logger.info('Processing %s files in hashgroup: %s' %
                        (len(hashgroup['files']), hashgroup['filehash']))

    # Add first and last few bytes for each file to dictionary

    # create a new dictionary with files that have same byte hash
    hashgroup_bytes = {}
    for file in hashgroup['files']:
        if cliargs['verbose'] or cliargs['debug']:
            bot_logger.info('Checking bytes: %s' % file['filename'])
        try:
            f = open(file['filename'], 'rb')
        except (IOError, OSError):
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.warning('Error opening file %s' % file['filename'])
            continue
        except Exception:
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.warning('Error opening file %s' % file['filename'])
            continue
        # read the first few bytes; fall back to a single byte if the read fails
        try:
            bytes_f = base64.b64encode(f.read(read_bytes))
        except (IOError, OSError):
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.info(
                    'Can\'t read first %s bytes of %s, trying first byte' %
                    (str(read_bytes), file['filename']))
            try:
                bytes_f = base64.b64encode(f.read(min_read_bytes))
            except Exception:
                if cliargs['verbose'] or cliargs['debug']:
                    bot_logger.warning('Error reading bytes of %s, giving up' %
                                       file['filename'])
                continue
        try:
            f.seek(-read_bytes, os.SEEK_END)
            bytes_l = base64.b64encode(f.read(read_bytes))
        except (IOError, OSError):
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.info(
                    'Can\'t read last %s bytes of %s, trying last byte' %
                    (str(read_bytes), file['filename']))
            try:
                f.seek(-min_read_bytes, os.SEEK_END)
                bytes_l = base64.b64encode(f.read(min_read_bytes))
            except Exception:
                if cliargs['verbose'] or cliargs['debug']:
                    bot_logger.warning('Error reading bytes of %s, giving up' %
                                       file['filename'])
                continue
        f.close()

        # create hash of bytes
        bytestring = str(bytes_f) + str(bytes_l)
        bytehash = hashlib.md5(bytestring.encode('utf-8')).hexdigest()

        if cliargs['verbose'] or cliargs['debug']:
            bot_logger.info('Byte hash: %s (%s)' %
                            (bytehash, file['filename']))

        # create new key for each bytehash and
        # set value as new list and add file
        hashgroup_bytes.setdefault(bytehash, []).append(file['filename'])

    # remove any bytehash key that only has 1 item (no duplicate)
    for key, value in list(hashgroup_bytes.items()):
        if len(value) < 2:
            filename = value[0]
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.info('Unique file (bytes diff), removing: %s' %
                                filename)
            del hashgroup_bytes[key]
            # remove file from hashgroup
            for i in range(len(hashgroup['files'])):
                if hashgroup['files'][i]['filename'] == filename:
                    del hashgroup['files'][i]
                    break

    # run md5 sum check if bytes were same
    hashgroup_md5 = {}
    # set up python Queue for threaded file md5 checking
    file_in_thread_q = pyQueue()
    file_out_thread_q = pyQueue()
    start_file_threads(file_in_thread_q, file_out_thread_q)

    # do md5 check on files with same byte hashes
    for key, value in list(hashgroup_bytes.items()):
        if cliargs['verbose'] or cliargs['debug']:
            bot_logger.info('Comparing MD5 sums for bytehash: %s' % key)
        for filename in value:
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.info('Checking MD5: %s' % filename)
            # add file into thread queue
            file_in_thread_q.put((filename, cliargs, bot_logger))

        bot_logger.info('*** Waiting for threads to finish...')
        # wait for threads to finish
        file_in_thread_q.join()
        bot_logger.info('*** Adding file md5 thread results for bytehash: %s' %
                        key)
        # drain md5 results and group files by md5 sum
        while file_out_thread_q.qsize():
            item = file_out_thread_q.get()
            file, md5 = item
            # create new key for each md5 sum and set value as new list and
            # add file
            hashgroup_md5.setdefault(md5, []).append(file)

    # remove any md5sum key that only has 1 item (no duplicate)
    for key, value in list(hashgroup_md5.items()):
        if len(value) < 2:
            filename = value[0]
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.info('Unique file (MD5 diff), removing: %s' %
                                filename)
            del hashgroup_md5[key]
            # remove file from hashgroup
            for i in range(len(hashgroup['files'])):
                if hashgroup['files'][i]['filename'] == filename:
                    del hashgroup['files'][i]
                    break
        else:
            md5 = key

    if len(hashgroup['files']) >= 2:
        if cliargs['verbose'] or cliargs['debug']:
            bot_logger.info('Found %s dupes in hashgroup' %
                            len(hashgroup['files']))
        # update hashgroup's md5sum key
        hashgroup['md5sum'] = md5
        return hashgroup
    else:
        return None
Example #8
                'change_percent_items_subdirs': changepercent_items_subdirs
            }
        }
        doclist.append(d)

    diskover.index_bulk_add(es, doclist, diskover.config, cliargs)

    elapsed_time = round(time.time() - jobstart, 6)
    bot_logger.info('*** FINISHED JOB, Elapsed Time: ' + str(elapsed_time))


# set up bot logging
bot_logger = bot_log_setup()

# create thread queue for files
filequeue = pyQueue()
filequeue_meta = pyQueue()
esbulkqueue = pyQueue()
# create threads to get file meta
for i in range(4):
    t = Thread(target=file_meta_collector)
    t.daemon = True
    t.start()
# create threads to bulk add docs to es
for i in range(4):
    t = Thread(target=es_bulk_adder)
    t.daemon = True
    t.start()

if __name__ == '__main__':
    # parse cli arguments into cliargs dictionary