def GET(self):
    if librarian.is_traktor_running():
        response = {"status": "error", "message": "Please quit Traktor first."}
    else:
        cleaner = Cleaner(Library.instance())
        cleaner.remove_duplicates()
        logger.debug(u"Duplicate removal complete")
        response = cleaner.get_result()
        response["status"] = "ok"
    web.header("Cache-Control", "no-cache")
    return json.dumps(response)
def main():
    parser = argparse.ArgumentParser(description="OpsGenie Alert Classifier")
    parser.add_argument('file', type=lambda x: is_valid_file(parser, x), metavar='FILE',
                        help="file to work with")
    parser.add_argument("--clean", nargs='+', dest="clean",
                        help="create a 'clean' file with whitelisted columns from a raw file")
    parser.add_argument("--remove", nargs='+', dest="remove",
                        help="match rows to remove based on the 'Message' column")
    parser.add_argument("--count", nargs='?', dest="count", default=None, const=None,
                        help="count of alerts grouped by specified column name")
    parser.add_argument("--fuzzy-count", nargs='?', dest="fuzzy_count", default=None, const=None,
                        help="fuzzy count of alerts grouped by specified column name")
    parser.add_argument("--limit", nargs='?', dest="limit", default=20, const=20, type=int,
                        help="limit number of results returned (default: 20)")
    parser.add_argument("--interval", nargs='+', dest="interval",
                        help="time interval in hours to filter alerts")
    parser.add_argument("--match", nargs='?', dest="match", default=None, const=None,
                        help="regex match against specified column name for count")
    parser.add_argument("--update-minutes", nargs='?', dest="update_minutes", default=None, const=None,
                        help="number of minutes between 'CreatedAt' and 'UpdatedAt'")
    parser.add_argument("--outfile", nargs='?', dest="outfile", default=None, const=None,
                        help="optional file to output results of count")
    parser.add_argument("--threshold", nargs='?', dest="threshold", default=90, const=90, type=int,
                        help="threshold for alert fuzzy match (default: 90)")
    parser.add_argument("--remove-numbers", dest="remove_numbers", action="store_true", default=False,
                        help="remove numbers from alias before doing fuzzy matching (default: False). "
                             "To be used in conjunction with the fuzzy threshold flag")
    parser.add_argument("--alias-strip-list", type=lambda x: is_valid_file(parser, x), dest='strip_file',
                        metavar="FILE", help="csv file with a column of values to strip")
    args = parser.parse_args()

    if args.clean:
        if not args.file.endswith("raw.csv"):
            parser.error("The file {} does not end with 'raw.csv'".format(args.file))
        Cleaner.clean(args.file, args.clean, args.remove)
    elif args.count:
        counter = Counter()
        counter.count(file=args.file, column=args.count, limit=args.limit,
                      interval=args.interval, match=args.match,
                      update_minutes=args.update_minutes, outfile=args.outfile)
    elif args.fuzzy_count:
        fuzzy_counter = FuzzyCounter()
        fuzzy_counter.count(file=args.file, column=args.fuzzy_count, limit=args.limit,
                            threshold=args.threshold, remove_numbers=args.remove_numbers,
                            outfile=args.outfile, alias_strip_list=args.strip_file)
def redact(png_path):
    """
    Takes the path to a PNG image file and overwrites it in place with a
    cleaned (redacted) version of the image.
    """
    file = Image.open(png_path)
    imarray = np.array(file)
    clean_array = Cleaner.find_redactions(imarray)
    file = Image.fromarray(clean_array)
    file.save(png_path, 'PNG')
def main():
    try:
        lib = Library(conf.library_dir)
        logger.debug("Starting")
        if conf.action == "clean":
            cleaner = Cleaner(lib)
            print("Removing duplicates..."),
            cleaner.remove_duplicates()
            print("DONE")
            cleaner.report()
            if not conf.test:
                lib.flush()
                print("\nTraktor library updated.")
            else:
                print("\nTest run. No changes made to the library.")
        elif conf.action == "export":
            exporter = Exporter(lib, conf.export_dir)
            exporter.export()
    except Exception as e:
        logger.error(e, exc_info=False)
def main():
    try:
        lib = Library(conf.library_dir)
        if conf.action == "clean":
            cleaner = Cleaner(lib)
            print("Removing duplicates..."),
            cleaner.remove_duplicates()
            print("DONE")
            cleaner.report()
            if not conf.test:
                lib.flush()
                print("\nTraktor library updated.")
            else:
                print("\nTest run. No changes made to the library.")
        elif conf.action == "export":
            exporter = Exporter(lib, conf.export_dir)
            exporter.export()
    except Exception as e:
        logger.error(e, exc_info=False)
def run(settings):
    """
    Executes a single run where certain datasets might or might not be snapshotted
    """
    now = datetime.now()
    yda = datetime.now() - timedelta(1)
    today = '{0:04d}{1:02d}{2:02d}'.format(now.year, now.month, now.day)
    yesterday = '{0:04d}{1:02d}{2:02d}'.format(yda.year, yda.month, yda.day)

    snapshots = ZFS.get_snapshots()
    datasets = ZFS.get_datasets()
    for dataset in datasets:
        if dataset in settings:
            try:
                dataset_settings = settings[dataset]
                local_snapshots = snapshots.get(dataset, [])

                take_snapshot = dataset_settings['snapshot'] is True
                replicate = dataset_settings['replicate'] is not None

                # Decide whether we need to handle this dataset
                execute = False
                if take_snapshot is True or replicate is True:
                    if dataset_settings['time'] == 'trigger':
                        # We wait until we find a trigger file in the filesystem
                        trigger_filename = '{0}/.trigger'.format(dataset_settings['mountpoint'])
                        if os.path.exists(trigger_filename):
                            Manager.logger.info('Trigger found on {0}'.format(dataset))
                            os.remove(trigger_filename)
                            execute = True
                    else:
                        trigger_time = dataset_settings['time'].split(':')
                        hour = int(trigger_time[0])
                        minutes = int(trigger_time[1])
                        if (now.hour > hour or (now.hour == hour and now.minute >= minutes)) and today not in local_snapshots:
                            Manager.logger.info('Time passed for {0}'.format(dataset))
                            execute = True

                if execute is True:
                    # Pre-execution command
                    if dataset_settings['preexec'] is not None:
                        Helper.run_command(dataset_settings['preexec'], '/')

                    if take_snapshot is True:
                        # Take today's snapshot
                        Manager.logger.info('Taking snapshot {0}@{1}'.format(dataset, today))
                        ZFS.snapshot(dataset, today)
                        local_snapshots.append(today)
                        Manager.logger.info('Taking snapshot {0}@{1} complete'.format(dataset, today))

                    # Replicating, if required
                    if replicate is True:
                        Manager.logger.info('Replicating {0}'.format(dataset))
                        replicate_settings = dataset_settings['replicate']
                        push = replicate_settings['target'] is not None
                        remote_dataset = replicate_settings['target'] if push else replicate_settings['source']
                        remote_snapshots = ZFS.get_snapshots(remote_dataset, replicate_settings['endpoint'])
                        last_common_snapshot = None
                        if remote_dataset in remote_snapshots:
                            if push is True:
                                # If pushing, we search for the last local snapshot that is remotely available
                                for snapshot in local_snapshots:
                                    if snapshot in remote_snapshots[remote_dataset]:
                                        last_common_snapshot = snapshot
                            else:
                                # Else, we search for the last remote snapshot that is locally available
                                for snapshot in remote_snapshots[remote_dataset]:
                                    if snapshot in local_snapshots:
                                        last_common_snapshot = snapshot
                        if last_common_snapshot is not None:
                            # There's a common snapshot
                            previous_snapshot = None
                            if push is True:
                                for snapshot in local_snapshots:
                                    if snapshot == last_common_snapshot:
                                        previous_snapshot = last_common_snapshot
                                        continue
                                    if previous_snapshot is not None:
                                        # There is a snapshot on this host that is not yet on the other side.
                                        size = ZFS.get_size(dataset, previous_snapshot, snapshot)
                                        Manager.logger.info(' {0}@{1} > {0}@{2} ({3})'.format(dataset, previous_snapshot, snapshot, size))
                                        ZFS.replicate(dataset, previous_snapshot, snapshot, remote_dataset,
                                                      replicate_settings.get('buffer_size', BUFFER_SIZE),
                                                      replicate_settings['endpoint'], direction='push',
                                                      compression=replicate_settings['compression'])
                                        ZFS.hold(dataset, snapshot)
                                        ZFS.hold(remote_dataset, snapshot, replicate_settings['endpoint'])
                                        ZFS.release(dataset, previous_snapshot)
                                        ZFS.release(remote_dataset, previous_snapshot, replicate_settings['endpoint'])
                                        previous_snapshot = snapshot
                            else:
                                for snapshot in remote_snapshots[remote_dataset]:
                                    if snapshot == last_common_snapshot:
                                        previous_snapshot = last_common_snapshot
                                        continue
                                    if previous_snapshot is not None:
                                        # There is a remote snapshot that is not yet on the local host.
                                        size = ZFS.get_size(remote_dataset, previous_snapshot, snapshot, replicate_settings['endpoint'])
                                        Manager.logger.info(' {0}@{1} > {0}@{2} ({3})'.format(remote_dataset, previous_snapshot, snapshot, size))
                                        ZFS.replicate(remote_dataset, previous_snapshot, snapshot, dataset,
                                                      replicate_settings.get('buffer_size', BUFFER_SIZE),
                                                      replicate_settings['endpoint'], direction='pull',
                                                      compression=replicate_settings['compression'])
                                        ZFS.hold(dataset, snapshot)
                                        ZFS.hold(remote_dataset, snapshot, replicate_settings['endpoint'])
                                        ZFS.release(dataset, previous_snapshot)
                                        ZFS.release(remote_dataset, previous_snapshot, replicate_settings['endpoint'])
                                        previous_snapshot = snapshot
                        elif push is True and len(local_snapshots) > 0:
                            # No common snapshot
                            if remote_dataset not in remote_snapshots:
                                # No remote snapshot, full replication
                                snapshot = local_snapshots[-1]
                                size = ZFS.get_size(dataset, None, snapshot)
                                Manager.logger.info(' {0}@ > {0}@{1} ({2})'.format(dataset, snapshot, size))
                                ZFS.replicate(dataset, None, snapshot, remote_dataset,
                                              replicate_settings.get('buffer_size', BUFFER_SIZE),
                                              replicate_settings['endpoint'], direction='push',
                                              compression=replicate_settings['compression'])
                                ZFS.hold(dataset, snapshot)
                                ZFS.hold(remote_dataset, snapshot, replicate_settings['endpoint'])
                        elif push is False and remote_dataset in remote_snapshots and len(remote_snapshots[remote_dataset]) > 0:
                            # No common snapshot
                            if len(local_snapshots) == 0:
                                # No local snapshot, full replication
                                snapshot = remote_snapshots[remote_dataset][-1]
                                size = ZFS.get_size(remote_dataset, None, snapshot, replicate_settings['endpoint'])
                                Manager.logger.info(' {0}@ > {0}@{1} ({2})'.format(remote_dataset, snapshot, size))
                                ZFS.replicate(remote_dataset, None, snapshot, dataset,
                                              replicate_settings.get('buffer_size', BUFFER_SIZE),
                                              replicate_settings['endpoint'], direction='pull',
                                              compression=replicate_settings['compression'])
                                ZFS.hold(dataset, snapshot)
                                ZFS.hold(remote_dataset, snapshot, replicate_settings['endpoint'])
                        Manager.logger.info('Replicating {0} complete'.format(dataset))

                    # Post-execution command
                    if dataset_settings['postexec'] is not None:
                        Helper.run_command(dataset_settings['postexec'], '/')

                # Cleaning the snapshots (cleaning is mandatory)
                if today in local_snapshots or yesterday in local_snapshots:
                    Cleaner.clean(dataset, local_snapshots, dataset_settings['schema'])
            except Exception as ex:
                Manager.logger.error('Exception: {0}'.format(str(ex)))
def run(settings):
    """
    Executes a single run where certain datasets might or might not be snapshotted
    """
    now = datetime.now()
    today = "{0:04d}{1:02d}{2:02d}".format(now.year, now.month, now.day)

    snapshots = ZFS.get_snapshots()
    datasets = ZFS.get_datasets()
    for dataset in datasets:
        if dataset in settings:
            try:
                dataset_settings = settings[dataset]
                local_snapshots = snapshots.get(dataset, [])

                take_snapshot = dataset_settings["snapshot"] is True
                replicate = dataset_settings["replicate"] is not None

                # Decide whether we need to handle this dataset
                execute = False
                if take_snapshot is True or replicate is True:
                    if dataset_settings["time"] == "trigger":
                        # We wait until we find a trigger file in the filesystem
                        trigger_filename = "{0}/.trigger".format(dataset_settings["mountpoint"])
                        if os.path.exists(trigger_filename):
                            Manager.logger.info("Trigger found on {0}".format(dataset))
                            os.remove(trigger_filename)
                            execute = True
                    else:
                        trigger_time = dataset_settings["time"].split(":")
                        hour = int(trigger_time[0])
                        minutes = int(trigger_time[1])
                        if (now.hour > hour or (now.hour == hour and now.minute >= minutes)) and today not in local_snapshots:
                            Manager.logger.info("Time passed for {0}".format(dataset))
                            execute = True

                if execute is True:
                    # Pre-execution command
                    if dataset_settings["preexec"] is not None:
                        Helper.run_command(dataset_settings["preexec"], "/")

                    if take_snapshot is True:
                        # Take today's snapshot
                        Manager.logger.info("Taking snapshot {0}@{1}".format(dataset, today))
                        ZFS.snapshot(dataset, today)
                        local_snapshots.append(today)
                        Manager.logger.info("Taking snapshot {0}@{1} complete".format(dataset, today))

                    # Replicating, if required
                    if replicate is True:
                        Manager.logger.info("Replicating {0}".format(dataset))
                        replicate_settings = dataset_settings["replicate"]
                        push = replicate_settings["target"] is not None
                        remote_dataset = replicate_settings["target"] if push else replicate_settings["source"]
                        remote_snapshots = ZFS.get_snapshots(remote_dataset, replicate_settings["endpoint"])
                        last_common_snapshot = None
                        if remote_dataset in remote_snapshots:
                            if push is True:
                                # If pushing, we search for the last local snapshot that is remotely available
                                for snapshot in local_snapshots:
                                    if snapshot in remote_snapshots[remote_dataset]:
                                        last_common_snapshot = snapshot
                            else:
                                # Else, we search for the last remote snapshot that is locally available
                                for snapshot in remote_snapshots[remote_dataset]:
                                    if snapshot in local_snapshots:
                                        last_common_snapshot = snapshot
                        if last_common_snapshot is not None:
                            # There's a common snapshot
                            previous_snapshot = None
                            if push is True:
                                for snapshot in local_snapshots:
                                    if snapshot == last_common_snapshot:
                                        previous_snapshot = last_common_snapshot
                                        continue
                                    if previous_snapshot is not None:
                                        # There is a snapshot on this host that is not yet on the other side.
                                        size = ZFS.get_size(dataset, previous_snapshot, snapshot)
                                        Manager.logger.info(" {0}@{1} > {0}@{2} ({3})".format(dataset, previous_snapshot, snapshot, size))
                                        ZFS.replicate(dataset, previous_snapshot, snapshot, remote_dataset,
                                                      replicate_settings["endpoint"], direction="push",
                                                      compression=replicate_settings["compression"])
                                        previous_snapshot = snapshot
                            else:
                                for snapshot in remote_snapshots[remote_dataset]:
                                    if snapshot == last_common_snapshot:
                                        previous_snapshot = last_common_snapshot
                                        continue
                                    if previous_snapshot is not None:
                                        # There is a remote snapshot that is not yet on the local host.
                                        size = ZFS.get_size(remote_dataset, previous_snapshot, snapshot, replicate_settings["endpoint"])
                                        Manager.logger.info(" {0}@{1} > {0}@{2} ({3})".format(remote_dataset, previous_snapshot, snapshot, size))
                                        ZFS.replicate(remote_dataset, previous_snapshot, snapshot, dataset,
                                                      replicate_settings["endpoint"], direction="pull",
                                                      compression=replicate_settings["compression"])
                                        previous_snapshot = snapshot
                        elif push is True and len(local_snapshots) > 0:
                            # No common snapshot
                            if remote_dataset not in remote_snapshots:
                                # No remote snapshot, full replication
                                snapshot = local_snapshots[-1]
                                size = ZFS.get_size(dataset, None, snapshot)
                                Manager.logger.info(" {0}@ > {0}@{1} ({2})".format(dataset, snapshot, size))
                                ZFS.replicate(dataset, None, snapshot, remote_dataset,
                                              replicate_settings["endpoint"], direction="push",
                                              compression=replicate_settings["compression"])
                        elif push is False and remote_dataset in remote_snapshots and len(remote_snapshots[remote_dataset]) > 0:
                            # No common snapshot
                            if len(local_snapshots) == 0:
                                # No local snapshot, full replication
                                snapshot = remote_snapshots[remote_dataset][-1]
                                size = ZFS.get_size(remote_dataset, None, snapshot, replicate_settings["endpoint"])
                                Manager.logger.info(" {0}@ > {0}@{1} ({2})".format(remote_dataset, snapshot, size))
                                ZFS.replicate(remote_dataset, None, snapshot, dataset,
                                              replicate_settings["endpoint"], direction="pull",
                                              compression=replicate_settings["compression"])
                        Manager.logger.info("Replicating {0} complete".format(dataset))

                    # Post-execution command
                    if dataset_settings["postexec"] is not None:
                        Helper.run_command(dataset_settings["postexec"], "/")

                # Cleaning the snapshots (cleaning is mandatory)
                if today in local_snapshots:
                    Cleaner.clean(dataset, local_snapshots, dataset_settings["schema"])
            except Exception as ex:
                Manager.logger.error("Exception: {0}".format(str(ex)))
import os
import sys

congress_id = ""
if len(sys.argv) > 3 or len(sys.argv) < 2:
    print("Please enter a valid parameter:")
    print("Parameter: Congress term number")
    print("Option: --skip, avoid data cleaning")
    sys.exit()
if len(sys.argv) == 2:
    congress_id = str(sys.argv[1])
    if os.path.isfile("rawData/" + "speeches_" + congress_id + ".txt") and os.path.isfile("rawData/" + congress_id + "_SpeakerMap.txt"):
        print("cleaning ....")
        data_cleaner = Cleaner([congress_id])
        data_cleaner.clean_pipeline()
        print("classifying ....")
        congress_classifier = Classifier([congress_id])
        congress_classifier.base_pipeline()
        print("done.")
        sys.exit()
    else:
        print("There are no speeches or SpeakerMap text files to process for congress " + congress_id)
        print("Please put the target congress raw text data into the rawData directory")
        sys.exit()
def run(settings):
    """
    Executes a single run where certain datasets might or might not be snapshotted
    """
    now = datetime.now()
    yda = datetime.now() - timedelta(1)
    today = '{0:04d}{1:02d}{2:02d}'.format(now.year, now.month, now.day)
    yesterday = '{0:04d}{1:02d}{2:02d}'.format(yda.year, yda.month, yda.day)

    snapshots = ZFS.get_snapshots()
    datasets = ZFS.get_datasets()
    for dataset in datasets:
        if dataset in settings:
            try:
                dataset_settings = settings[dataset]
                local_snapshots = snapshots.get(dataset, [])

                take_snapshot = dataset_settings['snapshot'] is True
                replicate = dataset_settings['replicate'] is not None

                # Decide whether we need to handle this dataset
                execute = False
                if take_snapshot is True or replicate is True:
                    if dataset_settings['time'] == 'trigger':
                        # We wait until we find a trigger file in the filesystem
                        trigger_filename = '{0}/.trigger'.format(dataset_settings['mountpoint'])
                        if os.path.exists(trigger_filename):
                            Manager.logger.info('Trigger found on {0}'.format(dataset))
                            os.remove(trigger_filename)
                            execute = True
                    else:
                        trigger_time = dataset_settings['time'].split(':')
                        hour = int(trigger_time[0])
                        minutes = int(trigger_time[1])
                        if (now.hour > hour or (now.hour == hour and now.minute >= minutes)) and today not in local_snapshots:
                            Manager.logger.info('Time passed for {0}'.format(dataset))
                            execute = True

                if execute is True:
                    # Pre-execution command
                    if dataset_settings['preexec'] is not None:
                        Helper.run_command(dataset_settings['preexec'], '/')

                    if take_snapshot is True:
                        # Take today's snapshot
                        Manager.logger.info('Taking snapshot {0}@{1}'.format(dataset, today))
                        ZFS.snapshot(dataset, today)
                        local_snapshots.append(today)
                        Manager.logger.info('Taking snapshot {0}@{1} complete'.format(dataset, today))

                    # Replicating, if required
                    if replicate is True:
                        Manager.logger.info('Replicating {0}'.format(dataset))
                        replicate_settings = dataset_settings['replicate']
                        push = replicate_settings['target'] is not None
                        remote_dataset = replicate_settings['target'] if push else replicate_settings['source']
                        remote_snapshots = ZFS.get_snapshots(remote_dataset, replicate_settings['endpoint'])
                        last_common_snapshot = None
                        if remote_dataset in remote_snapshots:
                            if push is True:
                                # If pushing, we search for the last local snapshot that is remotely available
                                for snapshot in local_snapshots:
                                    if snapshot in remote_snapshots[remote_dataset]:
                                        last_common_snapshot = snapshot
                            else:
                                # Else, we search for the last remote snapshot that is locally available
                                for snapshot in remote_snapshots[remote_dataset]:
                                    if snapshot in local_snapshots:
                                        last_common_snapshot = snapshot
                        if last_common_snapshot is not None:
                            # There's a common snapshot
                            previous_snapshot = None
                            if push is True:
                                for snapshot in local_snapshots:
                                    if snapshot == last_common_snapshot:
                                        previous_snapshot = last_common_snapshot
                                        continue
                                    if previous_snapshot is not None:
                                        # There is a snapshot on this host that is not yet on the other side.
                                        size = ZFS.get_size(dataset, previous_snapshot, snapshot)
                                        Manager.logger.info(' {0}@{1} > {0}@{2} ({3})'.format(dataset, previous_snapshot, snapshot, size))
                                        ZFS.replicate(dataset, previous_snapshot, snapshot, remote_dataset,
                                                      replicate_settings.get('buffer_size', BUFFER_SIZE),
                                                      replicate_settings['endpoint'], direction='push',
                                                      compression=replicate_settings['compression'])
                                        ZFS.hold(dataset, snapshot)
                                        ZFS.hold(remote_dataset, snapshot, replicate_settings['endpoint'])
                                        ZFS.release(dataset, previous_snapshot)
                                        ZFS.release(remote_dataset, previous_snapshot, replicate_settings['endpoint'])
                                        previous_snapshot = snapshot
                            else:
                                for snapshot in remote_snapshots[remote_dataset]:
                                    if snapshot == last_common_snapshot:
                                        previous_snapshot = last_common_snapshot
                                        continue
                                    if previous_snapshot is not None:
                                        # There is a remote snapshot that is not yet on the local host.
                                        size = ZFS.get_size(remote_dataset, previous_snapshot, snapshot, replicate_settings['endpoint'])
                                        Manager.logger.info(' {0}@{1} > {0}@{2} ({3})'.format(remote_dataset, previous_snapshot, snapshot, size))
                                        ZFS.replicate(remote_dataset, previous_snapshot, snapshot, dataset,
                                                      replicate_settings.get('buffer_size', BUFFER_SIZE),
                                                      replicate_settings['endpoint'], direction='pull',
                                                      compression=replicate_settings['compression'])
                                        ZFS.hold(dataset, snapshot)
                                        ZFS.hold(remote_dataset, snapshot, replicate_settings['endpoint'])
                                        ZFS.release(dataset, previous_snapshot)
                                        ZFS.release(remote_dataset, previous_snapshot, replicate_settings['endpoint'])
                                        previous_snapshot = snapshot
                        elif push is True and len(local_snapshots) > 0:
                            # No common snapshot
                            if remote_dataset not in remote_snapshots:
                                # No remote snapshot, full replication
                                snapshot = local_snapshots[-1]
                                size = ZFS.get_size(dataset, None, snapshot)
                                Manager.logger.info(' {0}@ > {0}@{1} ({2})'.format(dataset, snapshot, size))
                                ZFS.replicate(dataset, None, snapshot, remote_dataset,
                                              replicate_settings.get('buffer_size', BUFFER_SIZE),
                                              replicate_settings['endpoint'], direction='push',
                                              compression=replicate_settings['compression'])
                                ZFS.hold(dataset, snapshot)
                                ZFS.hold(remote_dataset, snapshot, replicate_settings['endpoint'])
                        elif push is False and remote_dataset in remote_snapshots and len(remote_snapshots[remote_dataset]) > 0:
                            # No common snapshot
                            if len(local_snapshots) == 0:
                                # No local snapshot, full replication
                                snapshot = remote_snapshots[remote_dataset][-1]
                                size = ZFS.get_size(remote_dataset, None, snapshot, replicate_settings['endpoint'])
                                Manager.logger.info(' {0}@ > {0}@{1} ({2})'.format(remote_dataset, snapshot, size))
                                ZFS.replicate(remote_dataset, None, snapshot, dataset,
                                              replicate_settings.get('buffer_size', BUFFER_SIZE),
                                              replicate_settings['endpoint'], direction='pull',
                                              compression=replicate_settings['compression'])
                                ZFS.hold(dataset, snapshot)
                                ZFS.hold(remote_dataset, snapshot, replicate_settings['endpoint'])
                        Manager.logger.info('Replicating {0} complete'.format(dataset))

                    # Post-execution command
                    if dataset_settings['postexec'] is not None:
                        Helper.run_command(dataset_settings['postexec'], '/')

                # Cleaning the snapshots (cleaning is mandatory)
                if today in local_snapshots or yesterday in local_snapshots:
                    Cleaner.clean(dataset, local_snapshots, dataset_settings['schema'])
            except Exception as ex:
                Manager.logger.error('Exception: {0}'.format(str(ex)))
import sys

from store import Storer
from clean import Cleaner
from csv_dumper import CsvDumper

url_param = sys.argv[-1]

saved_page_loc = Storer.html_page(url_param)
Cleaner.start(saved_page_loc)
CsvDumper.start()
    [plot.fitted_histogram, y['LogSalePrice']],
    # [plot.qq, y['SalePrice']],
    # [plot.qq, y['Log1SalePrice']],
]
# plot.view(plots)

# y.to_csv('y.csv', index=False)
y_np = y.drop('SalePrice', axis=1).to_numpy()

train_id = x_train['Id']
test_id = x_test['Id']
x_train.drop('Id', axis=1, inplace=True)
x_test.drop('Id', axis=1, inplace=True)

cleaner = Cleaner(x_train, x_test)
cleaner.clean(variables)

# linear = regression.build('Linear')
# linear_cv = regression.cross_validate(linear, cleaner.x_train_np, y_np)
# print('LINEAR', linear_cv)

lasso = regression.build('Lasso', alpha=0.002)
lasso_cv = regression.cross_validate(lasso, cleaner.x_train_np, y_np)

elastic_net = regression.build('ElasticNet', alpha=0.002)
elastic_net_cv = regression.cross_validate(elastic_net, cleaner.x_train_np, y_np)

kernel_ridge = regression.build('KernelRidge')
kernel_ridge_cv = regression.cross_validate(kernel_ridge, cleaner.x_train_np, y_np)