def __init__(self, path, name, wb_interface, monitor=False):
    self.path = path
    self.name = name
    self.wb_interface = wb_interface
    self.monitor = monitor

    # self.Q = myQueue()
    self.Q = pyQueue()
    self.R = pyQueue()
    self.response = None
    self.localReset = Signal(bool(0))
    self.error = None
    self.value = None

    # transaction information (simulation only)
    self._write = Signal(bool(0))    # write command in progress
    self._read = Signal(bool(0))     # read command in progress
    self._address = 0                # address of current/last transaction
    self._data = 0                   # ??? @todo: is this used ???
    self._write_data = 0             # holds the data written
    self._read_data = 0              # holds the data read

    # Utility signals
    self.inprog = Signal(bool(0))
    self.iswrite = Signal(bool(0))
    self.done = Signal(bool(0))

    # bus transaction timeout in clock ticks
    self.timeout = 10000
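# The sketch below is not part of the original class; it is a minimal,
# hypothetical illustration of how the command queue (Q) and response queue (R)
# created above are typically paired in a simulation driver: the caller
# enqueues a transaction, the bus process services it and posts a result, and
# the caller blocks on R until that result arrives. The helper name and the
# tuple layout are assumptions for illustration only.
def _example_write_transaction(bus, address, data):
    # producer side: hand a (command, address, data) tuple to the bus model
    bus.Q.put(('write', address, data))
    # the simulation side would pop the tuple with bus.Q.get(), drive the
    # wishbone signals, and answer with bus.R.put(result)
    # consumer side: block until the bus model reports completion or error
    return bus.R.get()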
def scrape_tree_meta(paths, cliargs, reindex_dict):
    jobstart = time.time()
    worker = get_worker_name()
    tree_dirs = []
    tree_files = []
    tree_crawltimes = []
    qumulo = cliargs['qumulo']
    totalcrawltime = 0
    # amount of time (sec) before starting threads to help crawl files
    filethreadtime = diskover.config['filethreadtime']

    for path in paths:
        threadsstarted = False
        starttime = time.time()
        root, files = path
        if qumulo:
            import diskover_qumulo
            if root['path'] != '/':
                root_path = root['path'].rstrip(os.path.sep)
            else:
                root_path = root['path']
            dmeta = diskover_qumulo.qumulo_get_dir_meta(worker, root, cliargs, reindex_dict, redis_conn)
        else:
            root_path = root
            dmeta = get_dir_meta(worker, root, cliargs, reindex_dict)

        if dmeta == "sametimes":
            # fetch meta data for directory and all its files (doc sources) from index2
            # since directory times haven't changed
            dir_source, files_source = get_metadata(root_path, cliargs)
            datenow = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
            for file_source in files_source:
                # update indexed at time
                file_source['indexing_date'] = datenow
                # update worker name
                file_source['worker_name'] = worker
                tree_files.append(('file', file_source))
            if dir_source:
                # update indexed at time
                dir_source['indexing_date'] = datenow
                # update worker name
                dir_source['worker_name'] = worker
                tree_dirs.append(dir_source)
                elapsed = time.time() - starttime
                tree_crawltimes.append({
                    "path": root_path,
                    "worker_name": worker,
                    "crawl_time": round(elapsed, 10),
                    "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f"),
                    "_type": "crawlstat"})
                totalcrawltime += elapsed
        else:
            # get meta off disk since times different in Redis than on disk
            for file in files:
                # spawn threads to help with getting file meta if running long
                if (time.time() - starttime) > filethreadtime:
                    if not threadsstarted:
                        bot_logger.info('*** %s taking more than %s to crawl, starting threads to help scrape file meta'
                                        % (root, filethreadtime))
                        # set up python Queue for threaded file meta scraping
                        file_in_thread_q = pyQueue()
                        file_out_thread_q = pyQueue()
                        start_file_threads(file_in_thread_q, file_out_thread_q)
                        threadsstarted = True
                    if qumulo:
                        file_in_thread_q.put((worker, file, cliargs, reindex_dict))
                    else:
                        file_in_thread_q.put((worker, os.path.join(root, file), cliargs, reindex_dict))
                else:
                    if qumulo:
                        fmeta = diskover_qumulo.qumulo_get_file_meta(worker, file, cliargs, reindex_dict)
                    else:
                        fmeta = get_file_meta(worker, os.path.join(root, file), cliargs, reindex_dict)
                    if fmeta:
                        tree_files.append(fmeta)
            if threadsstarted:
                bot_logger.info('*** Waiting for threads to finish...')
                # wait for threads to finish
                file_in_thread_q.join()
                bot_logger.info('*** Adding file meta thread results for %s' % root)
                # get all files and add to tree_files
                while file_out_thread_q.qsize():
                    tree_files.append(file_out_thread_q.get())
            if dmeta:
                tree_dirs.append(dmeta)
                elapsed = time.time() - starttime
                tree_crawltimes.append({
                    "path": root_path,
                    "worker_name": worker,
                    "crawl_time": round(elapsed, 10),
                    "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f"),
                    "_type": "crawlstat"})
                totalcrawltime += elapsed

    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_adder(worker, (tree_dirs, tree_files, tree_crawltimes), cliargs, totalcrawltime)

    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED JOB, Elapsed Time: ' + str(elapsed_time))
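# start_file_threads() is called above but its body is not part of this
# snippet. The sketch below is an assumption about its shape, for illustration
# only: spawn daemon workers that pull the (worker, path, cliargs, reindex_dict)
# tuples queued above, scrape the file meta, and push results onto the
# out-queue so the qsize()/get() drain loop above can collect them. The thread
# count and the direct call to get_file_meta() are assumptions (the qumulo
# branch would need diskover_qumulo.qumulo_get_file_meta instead).
from threading import Thread

def start_file_threads(file_in_thread_q, file_out_thread_q, num_threads=8):
    def file_meta_worker():
        while True:
            worker, path, cliargs, reindex_dict = file_in_thread_q.get()
            try:
                fmeta = get_file_meta(worker, path, cliargs, reindex_dict)
                if fmeta:
                    file_out_thread_q.put(fmeta)
            finally:
                # task_done() is what allows file_in_thread_q.join() to return
                file_in_thread_q.task_done()

    for _ in range(num_threads):
        t = Thread(target=file_meta_worker)
        t.daemon = True
        t.start()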
import time
import hashlib
try:
    from Queue import Queue as pyQueue
except ImportError:
    from queue import Queue as pyQueue
from threading import Thread, RLock
from diskover import config, plugins, progress_bar
from diskover_bot_module import get_worker_name, auto_tag, es_bulk_add, file_excluded


fake_dirs = []
buckets = []
workername = get_worker_name()

# create queue and threads for bulk adding to ES
s3queue = pyQueue()
s3threadlock = RLock()


def process_line(row, tree_dirs, tree_files, cliargs):
    global fake_dirs

    n = 2
    # S3 Inventory csv column headers
    inventory_dict = {'s3_bucket': row[0], 's3_key': row[1]}
    try:
        inventory_dict['s3_size'] = int(row[n])
        n = n + 1
    except IndexError:
        pass
    try:
                  ), result_ttl=config['redis_ttl'])

    logger.info('%s possible dupe file hashes have been enqueued, worker bots processing dupes...'
                % possibledupescount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()


# set up python Queue for threaded file md5 checking
file_in_thread_q = pyQueue()
file_out_thread_q = pyQueue()
start_file_threads()
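# worker_bots_busy() is used in the polling loop above but not defined in this
# snippet. The sketch below is an assumed shape using the rq API, for
# illustration only: report True while any of the given rq queues still holds
# jobs or any worker bot is currently busy. redis_conn is assumed to be the
# module's Redis connection; the actual implementation may differ.
from rq import Worker

def worker_bots_busy(queues):
    for q in queues:
        if len(q) > 0:
            return True
    for w in Worker.all(connection=redis_conn):
        if w.get_state() == 'busy':
            return True
    return False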
def verify_dupes(hashgroup, cliargs):
    """This is the verify dupes function.
    It processes files in hashgroup to verify if they are duplicates.
    The first few bytes at beginning and end of files are compared and, if the same,
    an md5 check is run on the files. If the files are duplicates, their dupe_md5
    field is updated to their md5sum.
    Returns hashgroup.
    """

    # number of bytes to check at start and end of file
    read_bytes = config['dupes_checkbytes']
    # min bytes to read of file size less than above
    min_read_bytes = 1

    # Add first and last few bytes for each file to dictionary,
    # creating a new dictionary with files that have same byte hash
    hashgroup_bytes = {}
    for file in hashgroup['files']:
        try:
            f = open(file['filename'], 'rb')
        except (IOError, OSError):
            continue
        except Exception:
            continue
        # check if file is only 1 byte
        try:
            bytes_f = base64.b64encode(f.read(read_bytes))
        except (IOError, OSError):
            try:
                bytes_f = base64.b64encode(f.read(min_read_bytes))
            except Exception:
                continue
        try:
            f.seek(-read_bytes, os.SEEK_END)
            bytes_l = base64.b64encode(f.read(read_bytes))
        except (IOError, OSError):
            try:
                f.seek(-min_read_bytes, os.SEEK_END)
                bytes_l = base64.b64encode(f.read(min_read_bytes))
            except Exception:
                continue
        f.close()

        # create hash of bytes
        bytestring = str(bytes_f) + str(bytes_l)
        bytehash = hashlib.md5(bytestring.encode('utf-8')).hexdigest()

        # create new key for each bytehash and
        # set value as new list and add file
        hashgroup_bytes.setdefault(bytehash, []).append(file['filename'])

    # remove any bytehash key that only has 1 item (no duplicate)
    for key, value in list(hashgroup_bytes.items()):
        if len(value) < 2:
            filename = value[0]
            del hashgroup_bytes[key]
            # remove file from hashgroup
            for i in range(len(hashgroup['files'])):
                if hashgroup['files'][i]['filename'] == filename:
                    del hashgroup['files'][i]
                    break

    # run md5 sum check if bytes were same
    hashgroup_md5 = {}

    # set up python Queue for threaded file md5 checking
    file_in_thread_q = pyQueue()
    file_out_thread_q = pyQueue()
    start_file_threads(file_in_thread_q, file_out_thread_q)

    # do md5 check on files with same byte hashes
    for key, value in list(hashgroup_bytes.items()):
        for filename in value:
            # add file into thread queue
            file_in_thread_q.put((filename, cliargs))

        # wait for threads to finish
        file_in_thread_q.join()

        # get all files and add to tree_files
        while file_out_thread_q.qsize():
            item = file_out_thread_q.get()
            file, md5 = item
            # create new key for each md5 sum and set value as new list and
            # add file
            hashgroup_md5.setdefault(md5, []).append(file)

    # remove any md5sum key that only has 1 item (no duplicate)
    for key, value in list(hashgroup_md5.items()):
        if len(value) < 2:
            filename = value[0]
            del hashgroup_md5[key]
            # remove file from hashgroup
            for i in range(len(hashgroup['files'])):
                if hashgroup['files'][i]['filename'] == filename:
                    del hashgroup['files'][i]
                    break
        else:
            md5 = key

    if len(hashgroup['files']) >= 2:
        # update hashgroup's md5sum key
        hashgroup['md5sum'] = md5
        return hashgroup
    else:
        return None
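# start_file_threads() as used above is assumed to wrap an md5 worker of the
# following shape: take (filename, cliargs) off the in-queue, hash the file in
# chunks, and put (filename, md5) on the out-queue, which matches the tuple
# unpacked in the drain loop above. The helper names, chunk size, thread count
# and error handling are illustrative assumptions, not the project's code.
import hashlib
from threading import Thread

def _example_md5_worker(file_in_thread_q, file_out_thread_q):
    while True:
        filename, cliargs = file_in_thread_q.get()
        try:
            md5 = hashlib.md5()
            with open(filename, 'rb') as f:
                for chunk in iter(lambda: f.read(1024 * 1024), b''):
                    md5.update(chunk)
            file_out_thread_q.put((filename, md5.hexdigest()))
        except (IOError, OSError):
            pass  # unreadable file: skip it rather than kill the thread
        finally:
            file_in_thread_q.task_done()

def _example_start_file_threads(file_in_thread_q, file_out_thread_q, num_threads=8):
    for _ in range(num_threads):
        t = Thread(target=_example_md5_worker, args=(file_in_thread_q, file_out_thread_q))
        t.daemon = True
        t.start()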
def __init__(self):
    self.queue = pyQueue()
    self.tracklist = None
    self.tracklistpointer = 0
def verify_dupes(hashgroup, cliargs):
    """This is the verify dupes function.
    It processes files in hashgroup to verify if they are duplicates.
    The first few bytes at beginning and end of files are compared and, if the same,
    an md5 check is run on the files. If the files are duplicates, their dupe_md5
    field is updated to their md5sum.
    Returns hashgroup.
    """

    # number of bytes to check at start and end of file
    read_bytes = diskover.config['dupes_checkbytes']
    # min bytes to read of file size less than above
    min_read_bytes = 1

    bot_logger = diskover_worker_bot.bot_logger

    if cliargs['verbose'] or cliargs['debug']:
        bot_logger.info('Processing %s files in hashgroup: %s'
                        % (len(hashgroup['files']), hashgroup['filehash']))

    # Add first and last few bytes for each file to dictionary,
    # creating a new dictionary with files that have same byte hash
    hashgroup_bytes = {}
    for file in hashgroup['files']:
        if cliargs['verbose'] or cliargs['debug']:
            bot_logger.info('Checking bytes: %s' % file['filename'])
        try:
            f = open(file['filename'], 'rb')
        except (IOError, OSError):
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.warning('Error opening file %s' % file['filename'])
            continue
        except Exception:
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.warning('Error opening file %s' % file['filename'])
            continue
        # check if file is only 1 byte
        try:
            bytes_f = base64.b64encode(f.read(read_bytes))
        except (IOError, OSError):
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.info('Can\'t read first %s bytes of %s, trying first byte'
                                % (str(read_bytes), file['filename']))
            try:
                bytes_f = base64.b64encode(f.read(min_read_bytes))
            except Exception:
                if cliargs['verbose'] or cliargs['debug']:
                    bot_logger.warning('Error reading bytes of %s, giving up' % file['filename'])
                continue
        try:
            f.seek(-read_bytes, os.SEEK_END)
            bytes_l = base64.b64encode(f.read(read_bytes))
        except (IOError, OSError):
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.info('Can\'t read last %s bytes of %s, trying last byte'
                                % (str(read_bytes), file['filename']))
            try:
                f.seek(-min_read_bytes, os.SEEK_END)
                bytes_l = base64.b64encode(f.read(min_read_bytes))
            except Exception:
                if cliargs['verbose'] or cliargs['debug']:
                    bot_logger.warning('Error reading bytes of %s, giving up' % file['filename'])
                continue
        f.close()

        # create hash of bytes
        bytestring = str(bytes_f) + str(bytes_l)
        bytehash = hashlib.md5(bytestring.encode('utf-8')).hexdigest()

        if cliargs['verbose'] or cliargs['debug']:
            bot_logger.info('Byte hash: %s (%s)' % (bytehash, file['filename']))

        # create new key for each bytehash and
        # set value as new list and add file
        hashgroup_bytes.setdefault(bytehash, []).append(file['filename'])

    # remove any bytehash key that only has 1 item (no duplicate)
    for key, value in list(hashgroup_bytes.items()):
        if len(value) < 2:
            filename = value[0]
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.info('Unique file (bytes diff), removing: %s' % filename)
            del hashgroup_bytes[key]
            # remove file from hashgroup
            for i in range(len(hashgroup['files'])):
                if hashgroup['files'][i]['filename'] == filename:
                    del hashgroup['files'][i]
                    break

    # run md5 sum check if bytes were same
    hashgroup_md5 = {}

    # set up python Queue for threaded file md5 checking
    file_in_thread_q = pyQueue()
    file_out_thread_q = pyQueue()
    start_file_threads(file_in_thread_q, file_out_thread_q)

    # do md5 check on files with same byte hashes
    for key, value in list(hashgroup_bytes.items()):
        if cliargs['verbose'] or cliargs['debug']:
            bot_logger.info('Comparing MD5 sums for bytehash: %s' % key)
        for filename in value:
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.info('Checking MD5: %s' % filename)
            # add file into thread queue
            file_in_thread_q.put((filename, cliargs, bot_logger))

        bot_logger.info('*** Waiting for threads to finish...')
        # wait for threads to finish
        file_in_thread_q.join()

        bot_logger.info('*** Adding file md5 thread results for bytehash: %s' % key)
        # get all files and add to tree_files
        while file_out_thread_q.qsize():
            item = file_out_thread_q.get()
            file, md5 = item
            # create new key for each md5 sum and set value as new list and
            # add file
            hashgroup_md5.setdefault(md5, []).append(file)

    # remove any md5sum key that only has 1 item (no duplicate)
    for key, value in list(hashgroup_md5.items()):
        if len(value) < 2:
            filename = value[0]
            if cliargs['verbose'] or cliargs['debug']:
                bot_logger.info('Unique file (MD5 diff), removing: %s' % filename)
            del hashgroup_md5[key]
            # remove file from hashgroup
            for i in range(len(hashgroup['files'])):
                if hashgroup['files'][i]['filename'] == filename:
                    del hashgroup['files'][i]
                    break
        else:
            md5 = key

    if len(hashgroup['files']) >= 2:
        if cliargs['verbose'] or cliargs['debug']:
            bot_logger.info('Found %s dupes in hashgroup' % len(hashgroup['files']))
        # update hashgroup's md5sum key
        hashgroup['md5sum'] = md5
        return hashgroup
    else:
        return None
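# A minimal usage sketch for verify_dupes(). The hashgroup shape (a 'filehash'
# plus a list of file dicts keyed by 'filename') is inferred from how the
# function indexes it above; the paths and hash value below are made up, and
# cliargs only carries the keys the function actually reads here.
cliargs = {'verbose': False, 'debug': False}
example_hashgroup = {
    'filehash': 'd41d8cd98f00b204e9800998ecf8427e',
    'files': [
        {'filename': '/data/reports/q1.pdf'},
        {'filename': '/backup/reports/q1.pdf'},
    ],
}
verified = verify_dupes(example_hashgroup, cliargs)
if verified is not None:
    # every file left in verified['files'] shares the md5 in verified['md5sum']
    print(verified['md5sum'], [f['filename'] for f in verified['files']])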
                'change_percent_items_subdirs': changepercent_items_subdirs
            }
        }
        doclist.append(d)

    diskover.index_bulk_add(es, doclist, diskover.config, cliargs)

    elapsed_time = round(time.time() - jobstart, 6)
    bot_logger.info('*** FINISHED JOB, Elapsed Time: ' + str(elapsed_time))


# set up bot logging
bot_logger = bot_log_setup()

# create thread queue for files
filequeue = pyQueue()
filequeue_meta = pyQueue()
esbulkqueue = pyQueue()

# create threads to get file meta
for i in range(4):
    t = Thread(target=file_meta_collector)
    t.daemon = True
    t.start()

# create threads to bulk add docs to es
for i in range(4):
    t = Thread(target=es_bulk_adder)
    t.daemon = True
    t.start()


if __name__ == '__main__':
    # parse cli arguments into cliargs dictionary