def generate(self):
    def writeTweetAndGetNextTweet(tweet, tweetType, iterator):
        # print tweetType, trainingTime, testTime
        tweet["tweet_type"] = tweetType
        Utilities.writeAsJsonToFile(tweet, self.outputCombinedFile)
        return iterator.next()

    trainingFileIterator = Utilities.iterateTweetsFromFileWithTerminatingNone(self.inputTrainingSetFile)
    testFileIterator = Utilities.iterateTweetsFromFileWithTerminatingNone(self.inputTestSetFile)
    trainingTweet, testTweet = trainingFileIterator.next(), testFileIterator.next()
    trainingTime, testTime = None, None
    while trainingTweet != None or testTweet != None:
        if trainingTweet != None:
            trainingTime = datetime.strptime(trainingTweet["created_at"], Settings.twitter_api_time_format)
        if testTweet != None:
            testTime = datetime.strptime(testTweet["created_at"], Settings.twitter_api_time_format)
        if trainingTweet != None and testTweet != None:
            if testTime < trainingTime:
                testTweet = writeTweetAndGetNextTweet(testTweet, TweetType.test, testFileIterator)
            else:
                trainingTweet = writeTweetAndGetNextTweet(trainingTweet, TweetType.train, trainingFileIterator)
        elif trainingTweet == None:
            while testTweet != None:
                testTweet = writeTweetAndGetNextTweet(testTweet, TweetType.test, testFileIterator)
        else:
            while trainingTweet != None:
                trainingTweet = writeTweetAndGetNextTweet(trainingTweet, TweetType.train, trainingFileIterator)
def sendRequest(self):
    try:
        self.params = [param.items()[0] for param in self.params]
        params = urllib.urlencode(self.params)
        Utilities.debug("Opening connection to " + self.base_url)
        self.conn = httplib.HTTPConnection(self.base_url, 80)
        headers = {"User-Agent": self.getUserAgent(),
                   "Content-Type": "application/x-www-form-urlencoded",
                   "Accept": "text/json"}
        #Utilities.debug(headers)
        #Utilities.debug(params)
        self.conn.request("GET", self.req_file, params, headers)
        resp = self.conn.getresponse()
        response = resp.read()
        #Utilities.debug(response)
        self.done.emit(json.loads(response))
        return json.loads(response)
    except:
        self.fail.emit()
def __init__(self, currentTime, dataType, numberOfExperts):
    self.currentTime = currentTime
    self.numberOfExperts = numberOfExperts
    self.inputTrainingSetFile = Utilities.getTrainingFile(currentTime, dataType, self.numberOfExperts)
    self.inputTestSetFile = Utilities.getTestFile(currentTime, dataType, self.numberOfExperts, bottom=True)
    self.outputCombinedFile = Utilities.getStreamingSetsFile(currentTime, dataType, numberOfExperts)
    Utilities.createDirectory(self.outputCombinedFile)
def request(self, method, url, data=[]):
    if Iugu.getApiKey() is None:
        Utilities.authFromEnv()
    if Iugu.getApiKey() is None:
        # Message translates to: "API key not configured. Use Iugu.setApiKey(...) to configure it."
        raise IuguAuthenticationException("Chave de API não configurada. Utilize Iugu.setApiKey(...) para configurar.")
    headers = self.__defaultHeaders()
    (response_body, response_code) = self._requestWithCURL(method, url, headers, data)
    try:
        response = json.loads(response_body)
    except ValueError:
        raise IuguObjectNotFound(response_body)
    if response_code == 404:
        raise IuguObjectNotFound(response_body)
    # Note: the original compared type(...) against the string "str", which is always unequal;
    # isinstance is the intended check.
    if response.errors is not None:
        if not isinstance(response.errors, str) and len(response.errors) == 0:
            response.errors = None
        elif not isinstance(response.errors, str) and len(response.errors) > 0:
            response.errors = response.errors
    if response.errors is not None and isinstance(response.errors, str):
        response.errors = response.errors
    iugu_last_api_response_code = response_code
    return response
def to_xml(self, library=None):
    d = self.to_dict()
    res = "<book>\n"
    for x in d:
        if x == "authors":
            if len(d["authors"]) > 0:
                res = res + " <authors>\n" + \
                      " <name>" + ("</name>\n <name>".join([Utilities.escape_xml(y) for y in d["authors"]])) + "</name>\n" + \
                      " </authors>\n"
        elif x == "categories" and library != None:
            if len(d["categories"]) > 0:
                res = res + " <categories>\n"
                col = library.categories.collection
                for i in d["categories"]:
                    res = res + \
                          " <item>\n" + \
                          " " + "".join(["<n color='" + col[y].color + "'>" + Utilities.escape_xml(col[y].name) + "</n>"
                                         for y in library.categories.get_full_category_ids(i)]) + "\n" + \
                          " </item>\n"
                res = res + " </categories>\n"
        elif x == "isbn10":
            res = res + " <isbn10>" + ISBN.to_string(d["isbn10"], set_hyphen=False) + "</isbn10>\n"
        elif x == "isbn13":
            res = res + " <isbn13>" + ISBN.to_string(d["isbn13"], set_hyphen=False) + "</isbn13>\n"
        elif d[x] != None:
            res = res + " <" + x + ">" + Utilities.escape_xml(str(d[x])) + "</" + x + ">\n"
    return res + "</book>\n"
def close(self):
    """
    None -> None

    Deletes residual files from the simulation
    """
    Utilities.clean_retrosheet_files()
def __init__(self, inputstream, dictionary):
    Utilities.debug('Reader init')
    self.tokenMap = dictionary
    self.rawIn = inputstream
    self.inn = ByteArray()
    self.buf = bytearray(1024)
    self.bufSize = 0
    self.readSize = 1
def __init__(self, conn, reader, writer, digest):
    super(WALogin, self).__init__()
    self.conn = conn
    self.out = writer
    self.inn = reader
    self.digest = digest
    Utilities.debug("WALOGIN INIT")
def generateStatsForGlobalClassifier():
    classifier = GlobalClassifier()
    classifier.load()
    currentDay = Settings.startTime
    while currentDay <= Settings.endTime:
        data = {'day': datetime.strftime(currentDay, Settings.twitter_api_time_format),
                'metric': 'aucm',
                'data_type': DocumentType.typeRuuslUnigram,
                'test_data_days': 1}
        data['value'] = classifier.getAUCM(TestDocuments(currentTime=currentDay,
                                                         numberOfExperts=Settings.numberOfExperts,
                                                         dataType=DocumentType.typeRuuslUnigram,
                                                         noOfDays=1).iterator())
        Utilities.writeAsJsonToFile(data, Settings.stats_for_global_classifier)
        currentDay += timedelta(days=1)
def nextTree(self):
    stanzaSize = self.readInt16(self.rawIn, 1)
    self.inn.buf = []
    self.fillBuffer(stanzaSize)
    ret = self.nextTreeInternal()
    Utilities.debug("<<")
    if ret is not None:
        '''Utilities.debug(ret.toString());'''
        return ret
def message_status_update(self, fmsg):
    Utilities.debug("Message status updated {0}".format(fmsg.status))
    contact = fmsg.getContact()
    modelData = fmsg.getModelData()
    modelData["Contact"] = contact.getModelData()
    if fmsg.status == WAXMPP.message_store.store.Message.STATUS_SENT:
        self.messageSent.emit(modelData)
    elif fmsg.status == WAXMPP.message_store.store.Message.STATUS_DELIVERED:
        self.messageDelivered.emit(modelData)
def __init__(self, currentTime, numberOfExperts):
    super(DocumentTypeRuuslUnigramWithMeta, self).__init__(
        currentTime, DocumentType.typeRuuslUnigramWithMeta, numberOfExperts
    )
    self.inputTrainingSetFile = Utilities.getTrainingFile(
        currentTime, DocumentType.typeRuuslUnigram, self.numberOfExperts
    )
    self.inputTestSetFile = Utilities.getTestFile(
        currentTime, DocumentType.typeRuuslUnigram, self.numberOfExperts, bottom=True
    )
def generateStatsForTrainingDataPerDay():
    currentDay = Settings.startTime
    noOfDays = 1
    while currentDay <= Settings.endTime:
        classDistribution = defaultdict(int)
        for d in Utilities.getTweets(fileNameMethod=Utilities.getTrainingFile,
                                     dataDirection=DataDirection.past,
                                     currentTime=currentDay,
                                     numberOfExperts=Settings.numberOfExperts,
                                     dataType=DocumentType.typeRuuslUnigram,
                                     noOfDays=noOfDays):
            classDistribution[d[1]] += 1
        data = {'day': datetime.strftime(currentDay, Settings.twitter_api_time_format),
                'class_distribution': classDistribution}
        Utilities.writeAsJsonToFile(data, Settings.stats_for_training_data)
        currentDay += timedelta(days=1)
def write(self, node, needsFlush=0):
    if node is None:
        self.out.write(0)
    else:
        Utilities.debug(">>")
        '''Utilities.debug(node.toString());'''
        self.writeInternal(node)
        self.flushBuffer(needsFlush)
        self.out.buf = []
def setup(self):
    """
    None -> None

    Downloads and parses necessary retrosheet data for the simulation
    """
    retro = Retrosheet(self.simYear)
    Utilities.ensure_gamelog_files_exist(self.simYear)
    Utilities.ensure_boxscore_files_exist(self.simYear, 'HOU')
    retro.clean_used_files()
def generateStatsForTopFeatures():
    global maxLength
    currentDay = Settings.startTime
    noOfDays = 1
    while currentDay <= Settings.endTime:
        classifier = FixedWindowClassifier(currentTime=currentDay,
                                           numberOfExperts=Settings.numberOfExperts,
                                           dataType=DocumentType.typeRuuslUnigram,
                                           noOfDays=noOfDays)
        classifier.load()
        data = {'day': datetime.strftime(currentDay, Settings.twitter_api_time_format),
                'classifier_length': noOfDays,
                'number_of_experts': Settings.numberOfExperts,
                'data_type': DocumentType.typeRuuslUnigram}
        data['features'] = classifier.showMostInformativeFeatures(2000)
        Utilities.writeAsJsonToFile(data, Settings.stats_for_most_informative_features)
        currentDay += timedelta(days=1)
def __repr__(self):
    """
    Give a verbose representation for a word in the format
    <form>@<lemma><categories>, for example: "<form>@<lemma>('pl',)"

    @rtype: str
    """
    z = Utilities.unidecode(self.__form) + "@" + `self.__lemma`
    if len(self.categories) == 0:
        return z
    else:
        return z + Utilities.tuple_str(self.categories)
def generateStatsToDetermineFixedWindowLength():
    global maxLength
    currentDay = Settings.startTime
    while currentDay <= Settings.endTime:
        for noOfDays in Utilities.getClassifierLengthsByDay(currentDay, maxLength):
            classifier = FixedWindowClassifier(currentTime=currentDay,
                                               numberOfExperts=Settings.numberOfExperts,
                                               dataType=DocumentType.typeRuuslUnigram,
                                               noOfDays=noOfDays)
            classifier.load()
            data = {'day': datetime.strftime(currentDay, Settings.twitter_api_time_format),
                    'classifier_length': noOfDays,
                    'metric': 'aucm',
                    'number_of_experts': Settings.numberOfExperts,
                    'data_type': DocumentType.typeRuuslUnigram,
                    'test_data_days': 1}
            data['value'] = classifier.getAUCM(TestDocuments(currentTime=currentDay + timedelta(days=1),
                                                             numberOfExperts=Settings.numberOfExperts,
                                                             dataType=DocumentType.typeRuuslUnigram,
                                                             noOfDays=1).iterator())
            Utilities.writeAsJsonToFile(data, Settings.stats_to_determine_fixed_window_length)
        currentDay += timedelta(days=1)
def streamStart(self):
    stanzaSize = self.readInt16(self.rawIn, 1)
    self.fillBuffer(stanzaSize)
    tag = self.inn.read()
    size = self.readListSize(tag)
    tag = self.inn.read()
    if tag != 1:
        Utilities.debug(tag)
        raise Exception("expecting STREAM_START in streamStart")
    attribCount = (size - 2 + size % 2) / 2
    attributes = self.readAttributes(attribCount)
def generateStatsObservePerformanceByRelabelingDocuments():
    global maxLength, idealModelLength
    currentDay = Settings.startTime
    while currentDay <= Settings.endTime:
        noOfDaysList = list(set([idealModelLength]).intersection(set(Utilities.getClassifierLengthsByDay(currentDay, maxLength))))
        for noOfDays in noOfDaysList:
            classifier = FixedWindowWithRelabeledDocumentsClassifier(currentTime=currentDay,
                                                                     numberOfExperts=Settings.numberOfExperts,
                                                                     dataType=DocumentType.typeRuuslUnigram,
                                                                     noOfDays=noOfDays)
            classifier.load()
            data = {'day': datetime.strftime(currentDay, Settings.twitter_api_time_format),
                    'classifier_length': noOfDays,
                    'metric': 'aucm',
                    'number_of_experts': Settings.numberOfExperts,
                    'data_type': DocumentType.typeRuuslUnigram,
                    'test_data_days': 1}
            data['value'] = classifier.getAUCM(TestDocuments(currentTime=currentDay + timedelta(days=1),
                                                             numberOfExperts=Settings.numberOfExperts,
                                                             dataType=DocumentType.typeRuuslUnigram,
                                                             noOfDays=1).iterator())
            Utilities.writeAsJsonToFile(data, Settings.stats_to_observe_performance_by_relabeling_documents)
        currentDay += timedelta(days=1)
def generateDataSetStats125():
    currentDay = Settings.startTime
    while currentDay <= Settings.endTime:
        data = {'day': datetime.strftime(currentDay, Settings.twitter_api_time_format),
                'train_classes': defaultdict(int),
                'test_classes': defaultdict(int)}
        inputTrainingSetFile = Utilities.getTrainingFile(currentDay, DocumentType.typeRuuslUnigram, Settings.numberOfExperts)
        inputTestSetFile = Utilities.getTestFile(currentDay, DocumentType.typeRuuslUnigram, Settings.numberOfExperts, bottom=True)
        for file, tweetType in [(inputTrainingSetFile, 'training'), (inputTestSetFile, 'test')]:
            for tweet in Utilities.iterateTweetsFromFile(file):
                if tweetType == 'training':
                    data['train_classes'][tweet['class']] += 1
                else:
                    data['test_classes'][tweet['class']] += 1
        Utilities.writeAsJsonToFile(data, Settings.stats_for_dataset_125)
        currentDay += timedelta(days=1)
def quit(self):
    Utilities.debug("got quit!!!")
    #self.connMonitor.exit()
    #self.conn.disconnect()
    '''del self.connMonitor
    del self.conn.inn
    del self.conn.out
    del self.conn.login
    del self.conn.stanzaReader'''
    #del self.conn
    self.doQuit.emit()
def checkConnection(self):
    try:
        if self.conn.state == 0:
            raise Exception("Not connected")
        elif self.conn.state == 2:
            self.conn.sendPing()
    except:
        print "Connection crashed, reason: %s" % sys.exc_info()[1]
        self.networkDisconnected()
        self.networkAvailable()
    Utilities.debug("CHECK PASSEDDDDDDDDDDDDDDD")
def __repr__(self):
    """
    Return a verbose string representation.

    @rtype: str
    """
    form, entry_form, id = self._content[0:3]
    categories = self.categories()
    r = ["{"]
    r.append(Utilities.unidecode(form) + "@" + Utilities.unidecode(entry_form) + "." + str(id))
    if self.categories():
        r.append(categories)
    r.append("}")
    return "".join(r)
def generate(self):
    for inputFile, outputFile in [
        (self.inputTrainingSetFile, self.outputTrainingSetFile),
        (self.inputTestSetFile, self.outputTestSetFile),
    ]:
        for tweet in Utilities.iterateTweetsFromFile(inputFile):
            data = {}
            for k in DocumentType.keys:
                data[k] = tweet[k]
            data["screen_name"] = tweet["screen_name"]
            data["user_id"] = tweet["user_id"]
            data["document"] = tweet["document"] + DocumentTypeRuuslUnigramWithMeta.getUrlMeta(data["text"])
            Utilities.writeAsJsonToFile(data, outputFile)
def generate(self):
    for inputFile, outputFile in [
        (self.inputTrainingSetFile, self.outputTrainingSetFile),
        (self.inputTestSetFile, self.outputTestSetFile),
    ]:
        for tweet in Utilities.iterateTweetsFromFile(inputFile):
            data = {}
            for k in DocumentType.keys:
                data[k] = tweet[k]
            data["screen_name"] = tweet["user"]["screen_name"]
            data["user_id"] = tweet["user"]["id_str"]
            data["document"] = self.modifyDocument(data["text"])
            Utilities.writeAsJsonToFile(data, outputFile)
def generateDataForGlobalClassifier():
    inputDataFile = "/home/kykamath/projects/Classifiers/src/lda_svm/global_classifier/data/global_classifier"
    classToIntMap = {"sports": 1, "politics": 2, "entertainment": 3, "technology": 4}
    for line in open(inputDataFile):
        try:
            classType, term = line.strip().split()
            stringClassType = Utilities.getTopicForIndex(classType)
            if stringClassType in classToIntMap:
                Utilities.writeAsJsonToFile(
                    {"class": stringClassType, "data": [term]}, Settings.globalClassifierData
                )
        except:
            pass
def get_tasks():
    u = request.form['url'].lower()
    url = Utilities.get_shortened_url(u)
    url_3 = Utilities.get_shortened_url(u, 3)
    return_only_parent = False
    # If url is the same as its parent url, return everything just for the parent.
    # Don't redundantly return for both the parent and itself.
    if url == url_3 or url + '/' == url_3:
        return_only_parent = True
    ds = DataStore()
    if not return_only_parent:
        all_urls = Utilities.modify_url(url)
        print all_urls
        # If the same url is also a parent url, return all results of the parent
        # and skip individual url results.
        for url in all_urls:
            result = ds.fetch(url)
            if result == False:
                print " Tried for url " + url
            else:
                x = {"result": result}
                return jsonify(x)
    # If for our exact url and its modifications nothing got returned
    outer_url = "parent::" + Utilities.get_shortened_url(url, 3)
    print outer_url
    result = ds.fetch_all_from_parent(outer_url)
    if result:
        x = {"result": result}
        return jsonify(x)
    else:
        if outer_url[-1] == '/':
            result = ds.fetch_all_from_parent(outer_url[:-1])
        else:
            result = ds.fetch_all_from_parent(outer_url + '/')
        if result:
            x = {"result": result}
            return jsonify(x)
    # If there is still nothing to show
    return 'No Response'
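# Hedged usage sketch (not part of the original app): get_tasks() reads the target URL from
# POST form data, so a client would call it roughly like this. The endpoint path "/tasks",
# the host/port, and the example URL are hypothetical placeholders.
import requests

resp = requests.post("http://localhost:5000/tasks",
                     data={"url": "http://example.com/some/page"})
print(resp.text)  # either a JSON {"result": ...} payload or the literal string 'No Response'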
def run(self):
    """
    Custom runner for OWD initiative
    It takes as arguments the parameters that gaiatest command would need
    For example:
    python ffox_test_runner_py --testvars=<testvars path> --address=localhost:2828 <tests path | test suite path>
    """
    # Preprocess
    parser = BaseMarionetteOptions(usage='%prog [options] test_file_or_dir <test_file_or_dir> ...')
    structured.commandline.add_logging_group(parser)
    options, tests = parser.parse_args(self.args[1:])
    parser.verify_usage(options, tests)

    # Traverse the tbpl logs option list and create directories if required
    for f in options.log_tbpl:
        d = f[:f.rfind('/')]
        if not os.path.exists(d):
            os.makedirs(d)

    logger = structured.commandline.setup_logging(options.logger_name, options)
    options.logger = logger

    # Remove default stdout logger from mozilla logger
    to_delete = filter(lambda h: h.stream.name == '<stdout>', logger.handlers)
    for d in to_delete:
        logger.remove_handler(d)

    location = self.parse_toolkit_location(self.args)
    options.toolkit_location = location

    # Hit the runner
    Utilities.connect_device()
    self.runner = self.start_test_runner(self.runner_class, options, tests)

    # Show the results via console and prepare the details
    self.process_runner_results()
    self.edit_html_results()
    self.edit_test_details()
    self.display_results()

    if self.runner.testvars['graphics']['enabled']:
        total_results_count = [self.passed, self.unexpected_failures, self.automation_failures,
                               self.expected_failures, self.unexpected_passed, self.skipped]
        self.graphics = Graphics(results_by_suite=self.results_by_suite,
                                 total_results_count=total_results_count,
                                 output_dir=self.runner.testvars['graphics']['graphics_dir'])
        self.graphics.generate_all_graphics()

    # Generate CSV results (if required)
    is_cert = self.runner.testvars['general']['is_cert_device']
    Utilities.generate_csv_reports(self, is_cert)
def search_user(user: str, stars: int, full: bool, utilities: Utilities):
    start = time()
    rate_limit_start, _ = utilities.get_rate_limit()
    user = User(name=user, minimal_stars=stars, full_search=full, utilities=utilities)
    user.get_all_repositories_parallel()
    # s.get_all_repositories()
    rate_limit_end, rate_reset = utilities.get_rate_limit()
    user.printout()
    end = time()
    print("-" * 100)
    print("TIME: " + str(end - start))
    print("Rate limit remaining: {}\nRate limit will be reset in {} seconds.".format(rate_limit_end, rate_reset))
    print("API rate used for this user: {}".format(rate_limit_start - rate_limit_end))
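# Hedged usage sketch: how search_user might be invoked. The username "octocat" and the
# star threshold are hypothetical values, and this assumes Utilities() can be constructed
# here the same way the surrounding project constructs it.
utilities = Utilities()
search_user("octocat", stars=50, full=False, utilities=utilities)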
def __init__(self):
    self.utilities = Utilities()
    self.aspect_classifier = AspectClassifier()
    self.random_states = [11, 22, 33, 44, 55]
def compress_image(oimg, block_size=4, step_size=2, spatial_factor=2,
                   intensity_shrinkage=0.75, max_x_offset=None, max_y_offset=None,
                   err_func=ImageUtils.mse, verbosity=0):
    if verbosity > 0:
        print(Utilities.whoami())
        argdict = locals().copy()
        for k in argdict.keys():
            val = argdict[k]
            if not Utilities.is_iterable(val):
                print(" {0}: {1}".format(k, argdict[k]))
            else:
                print(" {0} is iterable".format(k))
    if verbosity > 0:
        print("orig dims: {0}, {1}".format(oimg.shape[0], oimg.shape[1]))
    cimg = ImageUtils.trim_image(oimg, spatial_factor=spatial_factor, block_size=block_size, verbosity=verbosity)
    if verbosity > 0:
        print("trimmed dims: {0}, {1}".format(cimg.shape[0], cimg.shape[1]))
    if max_x_offset is None:
        max_x_offset = cimg.shape[1] - block_size
    if max_y_offset is None:
        max_y_offset = cimg.shape[0] - block_size
    dimg = ImageUtils.spatial_shrink(cimg, spatial_factor=spatial_factor)
    print("dimg_wd = {0}, dimg_ht = {1}".format(dimg.shape[0], dimg.shape[1]))
    FCode = namedtuple("FCode", ["dx", "dy", "mean_add", "rx", "ry", "err"])
    codes = []
    for rx in range(0, cimg.shape[1], block_size):
        if verbosity > 0:
            print("rx={0}".format(rx), end='')
        for ry in range(0, cimg.shape[0], block_size):
            parts = Compressor.find_best_params(
                cimg, dimg, rx, ry,
                block_size=block_size, step_size=step_size,
                spatial_factor=spatial_factor, intensity_shrinkage=intensity_shrinkage,
                max_x_offset=max_x_offset, max_y_offset=max_y_offset,
                err_func=err_func, verbosity=verbosity)
            dx, dy, mean_add, x, y, err, tries = parts
            code = FCode(dx, dy, mean_add, x, y, err)
            codes.append(code)
        print("--")
    params = OrderedDict()
    params['img_ht'] = cimg.shape[0]
    params['img_wd'] = cimg.shape[1]
    params['block_size'] = block_size
    params['step_size'] = step_size
    params['spatial_factor'] = spatial_factor
    params['intensity_shrinkage'] = intensity_shrinkage
    params['codes'] = codes
    return params
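# Hedged usage sketch: run the block-based compressor over a small random grayscale image.
# It assumes compress_image is exposed on the Compressor class (the body above already calls
# Compressor.find_best_params) and that images are 2-D numpy arrays, as the .shape[0]/.shape[1]
# indexing suggests.
import numpy as np

img = np.random.randint(0, 256, size=(64, 64)).astype(float)
params = Compressor.compress_image(img, block_size=4, step_size=2,
                                    spatial_factor=2, intensity_shrinkage=0.75)
print(len(params['codes']), "codes for a", params['img_ht'], "x", params['img_wd'], "image")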
    accuracy = np.mean((predictions > 0.5) == y_t)
    conf_matrix = pd.crosstab(y_t, predictions, rownames=['Actual'], colnames=['Predicted'])
    return accuracy, conf_matrix, self.c_

# Test usage
if 1:
    """
    Sigmoid prediction accuracy: 0.720
    ReLU prediction accuracy: 0.750
    TanH prediction accuracy: 0.710
    """
    ut = Utilities()
    # Pulling the data into a DataFrame
    data = pd.read_csv('student_data.csv')
    # One-hot encode the rank column
    processed_data = ut.one_hot_encoder(data, "rank")
    # Scaling the columns
    processed_data['gre'] = processed_data['gre'] / 800
    processed_data['gpa'] = processed_data['gpa'] / 4.0
    # Split the data 2/3 train and 1/3 test
    train_data, test_data = ut.test_train_split(processed_data)
    # Splitting inputs and labels
def setup(self):
    """ Sets up the GAN """
    # TODO new method called from init opt passed
    print("Attack type: " + self.attack_type)

    conn = SQLConnector()
    data = conn.pull_kdd99(attack=self.attack_type, num=5000)
    dataframe = pd.DataFrame.from_records(data=data, columns=conn.pull_kdd99_columns(allQ=True))

    # ==========
    # ENCODING
    # ==========
    # https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
    d = defaultdict(LabelEncoder)

    # Splitting the data into features and labels. Want labels to be consistent with evaluator
    # encoding, so we use the utils attack_to_num function
    features = dataframe.iloc[:, :41]
    attack_labels = dataframe.iloc[:, 41:]
    for i in range(0, attack_labels.size):
        attack_labels.at[i, 'attack_type'] = util.attacks_to_num(attack_labels.at[i, 'attack_type'])

    features = features.apply(lambda x: d[x.name].fit_transform(x))  # fit is encoded dataframe

    # feature scaling, recommended from github implementation
    self.scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled_features = self.scaler.fit_transform(features.astype(float))
    scaled_df = pd.DataFrame(data=scaled_features)

    # Join the separately encoded sections back into one dataframe
    dataframe = scaled_df.join(attack_labels)
    dataset = dataframe.values  # transform to ndarray
    print(dataset)

    # TODO: Feature scaling? May be necessary. Has to be on a per-feature basis?

    # Splitting up the evaluation dataset. Should maybe be moved?
    eval_dataset = pd.read_csv('PortsweepAndNonportsweep.csv', header=None)
    eval_dataset = eval_dataset.values
    self.eval_dataset_X = eval_dataset[:, 0:41].astype(int)
    self.eval_dataset_Y = eval_dataset[:, 41]

    validationToTrainRatio = 0.05
    validationSize = int(validationToTrainRatio * len(self.eval_dataset_X))
    self.eval_validation_data = self.eval_dataset_X[:validationSize]
    self.eval_validation_labels = self.eval_dataset_Y[:validationSize]
    self.eval_dataset_X = self.eval_dataset_X[validationSize:]
    self.eval_dataset_Y = self.eval_dataset_Y[validationSize:]

    testToTrainRatio = 0.05
    testSize = int(testToTrainRatio * len(self.eval_dataset_X))
    self.eval_test_data = self.eval_dataset_X[:testSize]
    self.eval_test_labels = self.eval_dataset_Y[:testSize]
    self.eval_dataset_X = self.eval_dataset_X[testSize:]
    self.eval_dataset_Y = self.eval_dataset_Y[testSize:]

    # to visually judge encoded dataset
    print("Real encoded " + self.attack_type + " attacks:")
    print(dataset[:1])

    # Set X as our input data and Y as our label
    self.X_train = dataset[:, 0:41].astype(float)
    Y_train = dataset[:, 41]

    # labels for data. 1 for valid attacks, 0 for fake (generated) attacks
    self.valid = np.ones((self.batch_size, 1))
    self.fake = np.zeros((self.batch_size, 1))
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
import pandas as pd

from utilities import Utilities

util = Utilities()
[training_set_X, training_set_Y] = util.read_input_data()
Y = training_set_Y[0:100]


def sanitize_data():
    X = []
    for i in range(0, 100):
        conversation_with_tags = training_set_X[i]
        conversation = util.remove_tags(conversation_with_tags)
        conversation = util.remove_punctuation(conversation)
        # sets everything to lowercase and splits on the spaces by default
        conversation = conversation.lower().split()
        #conversation = util.stem(conversation)  # stemming (taking the root of the word)
        conversation = util.lemmatize(conversation)
        X.append(conversation)
    return X, Y
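# Minimal usage sketch for the preprocessing above: build the cleaned conversations and
# confirm that inputs and labels stay aligned (both are truncated to the first 100 rows).
X, Y = sanitize_data()
print(len(X), len(Y))   # both should be 100
print(X[0][:10])        # first ten tokens of the first cleaned conversation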
def __init__(self):
    """Constructor
    """
    self._utilities = Utilities()
class CommentLevelEvaluation:

    def __init__(self):
        self.data_file = 'mmh_dataset.csv'
        self.utilities = Utilities()
        # self.Processor = Processor()
        self.storage_path = 'comment-level-datasets-2/'
        # self.storage_path = 'r-combine-outputs/'
        self.random_states = [111, 122, 133, 144, 155]

    def generate_datasets(self, dataset_initial):
        X = self.utilities.read_from_csv(self.data_file)
        y = [0] * len(X)  # fake labels
        for random_state in self.random_states:
            X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=random_state)
            for row in X_test:
                row[0] = row[0].replace('**$**', "")
            self.utilities.save_list_as_csv(
                X_train, self.storage_path + dataset_initial + '_train_' + str(random_state) + '.csv')
            self.utilities.save_list_as_csv(
                X_test, self.storage_path + dataset_initial + '_test_' + str(random_state) + '.csv')

    def run_experiment(self, dataset_initial):
        for random_state in self.random_states:
            X_train = self.storage_path + dataset_initial + '_train_' + str(random_state) + '.csv'
            X_test = self.storage_path + dataset_initial + '_test_' + str(random_state) + '.csv'
            settings = {
                'training_file': X_train,
                'data_file': X_test,
                'max_reviews': None,  # Options: 0 to any integer | default: None (all)
                'output_file': self.storage_path + dataset_initial + '_output_' + str(random_state) + '.csv'
            }
            processor = Processor(settings=settings)
            processor.run()

    def merge_aspect_classes(self, aspects):
        group_1 = ['staff attitude and professionalism', 'communication']
        group_2 = ['care quality', 'resource', 'process']
        group_3 = ['environment', 'food', 'parking']
        group_4 = ['waiting time']
        group_5 = ['other', 'noise']
        groups = [group_1, group_2, group_3, group_4, group_5]
        new_aspects = []
        for aspect in aspects:
            for group in groups:
                if aspect in group:
                    # all members will be replaced by the first member of the group
                    new_aspects.append(group[0])
                    break
        return new_aspects

    def calculate_comment_level_scores_for_categories(self, y_test, y_pred):
        categories = []
        for aspects in y_test:
            categories = categories + aspects
        categories = list(set(categories))
        cat_scores = {}
        for category in categories:
            test_binary = []
            pred_binary = []
            for index, test_categories in enumerate(y_test):
                pred_categories = y_pred[index]
                if category in test_categories:
                    test_binary.append(1)
                else:
                    test_binary.append(0)
                if category in pred_categories:
                    pred_binary.append(1)
                else:
                    pred_binary.append(0)
            scores = {
                'precision': precision_score(test_binary, pred_binary),
                'recall': recall_score(test_binary, pred_binary),
                'f1-score': f1_score(test_binary, pred_binary)
            }
            cat_scores[category] = scores
        return cat_scores

    def calculate_comment_level_scores_for_categories_backup(self, y_test, y_pred):
        categories = []
        for aspects in y_test:
            categories = categories + aspects
        categories = list(set(categories))
        category_f_scores = {}
        for category in categories:
            true_positives = 0
            false_positives = 0
            false_negatives = 0
            true_negatives = 0
            for index, test_categories in enumerate(y_test):
                pred_categories = y_pred[index]
                if category in test_categories and category in pred_categories:
                    true_positives += 1
                elif category in test_categories and category not in pred_categories:
                    false_negatives += 1
                elif category not in test_categories and category in pred_categories:
                    false_positives += 1
                else:
                    true_negatives += 1
            # print [true_positives, false_positives, false_negatives, true_negatives]
            if float(true_positives + false_positives) > 0:
                precision = true_positives / float(true_positives + false_positives)
            else:
                precision = 0
            if true_positives / float(true_positives + false_negatives):
                recall = true_positives / float(true_positives + false_negatives)
            else:
                recall = 0
            f_score = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
            category_f_scores[category] = f_score
        return category_f_scores

    def calculate_accuracy(self, dataset_initials):
        overall_precisions = []
        overall_recalls = []
        overall_f1_scores = []
        envs = []
        wts = []
        saaps = []
        cqs = []
        ots = []
        for random_state in self.random_states:
            X_test = self.utilities.read_from_csv(
                self.storage_path + dataset_initials + '_test_' + str(random_state) + '.csv')
            X_pred = self.utilities.read_from_csv(
                'r-combine-outputs/' + dataset_initials + '_combined_confidence_' + str(random_state) + '.csv')
            y_test = []
            y_pred = []
            for index, row in enumerate(X_test):
                del row[0]
                aspects = []
                for item in row:
                    if item:
                        aspects.append(item.rsplit(' ', 1)[0])
                y_test.append(list(set(self.merge_aspect_classes(aspects))))
                predicted_row = X_pred[index]
                del predicted_row[0]
                aspects = []
                for item in predicted_row:
                    if item:
                        aspects.append(item)
                y_pred.append(list(set(aspects)))
            true_positives = 0
            false_positives = 0
            false_negatives = 0
            true_negatives = 0
            for index, test in enumerate(y_test):
                pred = y_pred[index]
                pred_minus_test = [item for item in pred if item not in test]
                test_minus_pred = [item for item in test if item not in pred]
                if len(pred_minus_test) == 0 and len(test_minus_pred) == 0:
                    true_positives += 1
                # elif len(pred_minus_test) > 0 and len(test_minus_pred) == 0:
                elif len(pred_minus_test) > 0:
                    false_positives += 1
                # elif len(test_minus_pred) > 0 and len(pred_minus_test) == 0:
                elif len(test_minus_pred) > 0:
                    false_negatives += 1
                else:
                    true_negatives += 1
            precision = true_positives / float(true_positives + false_positives)
            recall = true_positives / float(true_positives + false_negatives)
            overall_f1_score = (2 * precision * recall) / (precision + recall)
            overall_accuracy = (true_positives + true_negatives) / float(len(y_test))
            #print overall_accuracy
            overall_precisions.append(precision)
            overall_recalls.append(recall)
            overall_f1_scores.append(overall_f1_score)
            category_scores = self.calculate_comment_level_scores_for_categories(y_test, y_pred)
            score_name = 'f1-score'
            envs.append(category_scores['environment'][score_name])
            wts.append(category_scores['waiting time'][score_name])
            saaps.append(category_scores['staff attitude and professionalism'][score_name])
            cqs.append(category_scores['care quality'][score_name])
            ots.append(category_scores['other'][score_name])
        # print overall_precisions
        precision = sum(overall_precisions) / float(len(overall_precisions))
        recall = sum(overall_recalls) / float(len(overall_recalls))
        f1_score = sum(overall_f1_scores) / float(len(overall_f1_scores))
        environment = sum(envs) / float(len(envs))
        waiting_time = sum(wts) / float(len(wts))
        staff_attitude = sum(saaps) / float(len(saaps))
        care_quality = sum(cqs) / float(len(cqs))
        other = sum(ots) / float(len(ots))
        #print "precision\trecall\tf1_score\tenvironment\twaiting_time\tstaff_attitude\tcare_quality\tother"
        print '%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f' % (
            precision, recall, f1_score, environment, waiting_time,
            staff_attitude, care_quality, other)

    def calculate_per_system_accuracy(self, dataset_initials):
        overall_precisions = []
        overall_recalls = []
        overall_f1_scores = []
        envs = []
        wts = []
        saaps = []
        cqs = []
        ots = []
        for random_state in self.random_states:
            X_test = self.utilities.read_from_csv(
                self.storage_path + dataset_initials + '_test_' + str(random_state) + '.csv')
            # system A output
            # X_pred = self.utilities.read_from_csv(self.storage_path + dataset_initials + '_output_' + str(random_state) + '.csv')
            # system B output
            X_pred = self.utilities.read_from_csv(
                'r-combine-outputs/' + dataset_initials + '_output_confidence_' + str(random_state) + '.csv')
            y_test = []
            y_pred = []
            for index, row in enumerate(X_test):
                del row[0]
                aspects = []
                for item in row:
                    if item:
                        aspects.append(item.rsplit(' ', 1)[0])
                        # aspects.append(item)
                y_test.append(list(set(self.merge_aspect_classes(aspects))))
                predicted_row = X_pred[index]
                del predicted_row[0]
                aspects = []
                for item in predicted_row:
                    if item:
                        aspects.append(item.rsplit(' ', 1)[0])
                        # aspects.append(item)
                y_pred.append(list(set(aspects)))
            true_positives = 0
            false_positives = 0
            false_negatives = 0
            true_negatives = 0
            for index, test in enumerate(y_test):
                pred = y_pred[index]
                pred_minus_test = [item for item in pred if item not in test]
                test_minus_pred = [item for item in test if item not in pred]
                if len(pred_minus_test) == 0 and len(test_minus_pred) == 0:
                    true_positives += 1
                # elif len(pred_minus_test) > 0 and len(test_minus_pred) == 0:
                elif len(pred_minus_test) > 0:
                    false_positives += 1
                # elif len(test_minus_pred) > 0 and len(pred_minus_test) == 0:
                elif len(test_minus_pred) > 0:
                    false_negatives += 1
                else:
                    true_negatives += 1
            precision = true_positives / float(true_positives + false_positives)
            recall = true_positives / float(true_positives + false_negatives)
            overall_f1_score = (2 * precision * recall) / (precision + recall)
            overall_accuracy = (true_positives + true_negatives) / float(len(y_test))
            #print overall_accuracy
            overall_precisions.append(precision)
            overall_recalls.append(recall)
            overall_f1_scores.append(overall_f1_score)
            category_scores = self.calculate_comment_level_scores_for_categories(y_test, y_pred)
            score_name = 'f1-score'
            envs.append(category_scores['environment'][score_name])
            wts.append(category_scores['waiting time'][score_name])
            saaps.append(category_scores['staff attitude and professionalism'][score_name])
            cqs.append(category_scores['care quality'][score_name])
            ots.append(category_scores['other'][score_name])
        precision = sum(overall_precisions) / float(len(overall_precisions))
        recall = sum(overall_recalls) / float(len(overall_recalls))
        f1_score = sum(overall_f1_scores) / float(len(overall_f1_scores))
        environment = sum(envs) / float(len(envs))
        waiting_time = sum(wts) / float(len(wts))
        staff_attitude = sum(saaps) / float(len(saaps))
        care_quality = sum(cqs) / float(len(cqs))
        other = sum(ots) / float(len(ots))
        #print "precision\trecall\tf1_score\tenvironment\twaiting_time\tstaff_attitude\tcare_quality\tother"
        print '%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f' % (
            precision, recall, f1_score, environment, waiting_time,
            staff_attitude, care_quality, other)
def copyAllToMaster(nodeIP, sourcePath):
    ret_status = [
        True,
        "Step 1(of 18) - Check config files",
        "Step 2(of 18) - Check Directory for node",
        "Step 3(of 18) - Check directory for backup node",
        "Step 4(of 18) - check Home directory ",
        "Step 5(of 18) - Create Temp Folder",
        "Step 6(of 18) - Copy Main Node config to temp ",
        "Step 7(of 18) - Truncate Node Directory ",
        "Step 8(of 18) -Extract Node Files to Temp ",
        "Step 9(of 18) - Get Zip File Name",
        "Step 10(of 18) - Copy node files from temp to node directory",
        "Step 11(of 18) - Copy node Config file to node Directory",
        "Step 12(of 18) - Create Temp Folder",
        "Step 13(of 18) - Copy Backup Node config to temp ",
        "Step 14(of 18) - Truncate Backup Node Directory ",
        "Step 15(of 18) -Extract Backup Node Files to Temp ",
        "Step 16(of 18) - Get Zip File Name",
        "Step 17(of 18) - Copy Backup node files from temp to node directory",
        "Step 18(of 18) - Copy Backup node Config file to backup node Directory"
    ]
    print("\n Running startBackupNodeExecution() \n")
    try:
        backup_node_ip = nodestatus.getBackupNodeIp(nodeIP)
        if (backup_node_ip == ""):
            ret_status[0] = False
            ret_status.append('ERROR CODE 2002: No Backup Node Ip found for Node IP :' + nodeIP)
            return ret_status
        ret_status[1] = ret_status[1] + " :Done"
        sourcePathList1 = sourcePath.split(os.sep)
        sourcePathList2 = sourcePathList1[len(sourcePathList1) - 1].split('.')
        sourcePathFileName = sourcePathList2[0]
        if not os.path.isdir(".." + os.sep + "files" + os.sep + nodeIP):
            ret_status[0] = False
            ret_status.append('ERROR CODE 2001: Home Directory for node :' + nodeIP + " doesn't exist")
            return ret_status
        elif not os.path.isdir(".." + os.sep + "files" + os.sep + backup_node_ip):
            ret_status[0] = False
            ret_status[2] = ret_status[2] + " : Done"
            ret_status.append('ERROR CODE 2001: Home Directory for backup node :' + nodeIP + " doesn't exist")
            return ret_status
        elif not os.path.isfile(sourcePath):
            ret_status[0] = False
            ret_status[2] = ret_status[2] + " : Done"
            ret_status[3] = ret_status[3] + " : Done"
            ret_status.append('ERROR CODE 2002: Invalid Source Path: :' + sourcePath)
            return ret_status
        else:
            ret_status[2] = ret_status[2] + " : Done"
            ret_status[3] = ret_status[3] + " : Done"
            ret_status[4] = ret_status[4] + " : Done"

            ########## Managing Main Node Copies ##########
            utilities.createOrReplace(".." + os.sep + "temp" + os.sep)
            ret_status[5] = ret_status[5] + " : Done"
            shutil.move(
                ".." + os.sep + "files" + os.sep + nodeIP + os.sep + "configClient.xml",
                ".." + os.sep + "temp" + os.sep + "configClient.xml")
            ret_status[6] = ret_status[6] + " : Done"
            shutil.rmtree(".." + os.sep + "files" + os.sep + nodeIP)
            ret_status[7] = ret_status[7] + " : Done"
            zip_ref = zipfile.ZipFile(sourcePath, 'r')
            zip_ref.extractall(".." + os.sep + "temp")
            zip_ref.close()
            ret_status[8] = ret_status[8] + " : Done"
            source1 = sourcePath.split(os.sep)
            source2 = source1[len(source1) - 1]
            source3 = source2.split(".zip")[0]
            ret_status[9] = ret_status[9] + " : Done"
            shutil.copytree(".." + os.sep + "temp" + os.sep + source3,
                            ".." + os.sep + "files" + os.sep + nodeIP)
            ret_status[10] = ret_status[10] + " : Done"
            shutil.move(
                ".." + os.sep + "temp" + os.sep + "configClient.xml",
                ".." + os.sep + "files" + os.sep + nodeIP + os.sep + "configClient.xml")
            ret_status[11] = ret_status[11] + " : Done"

            ########## Managing Backup Node Copies ##########
            utilities.createOrReplace(".." + os.sep + "temp" + os.sep)
            ret_status[12] = ret_status[12] + " : Done"
            shutil.move(
                ".." + os.sep + "files" + os.sep + backup_node_ip + os.sep + "configClient.xml",
                ".." + os.sep + "temp" + os.sep + "configClient.xml")
            ret_status[13] = ret_status[13] + " : Done"
            shutil.rmtree(".." + os.sep + "files" + os.sep + backup_node_ip)
            ret_status[14] = ret_status[14] + " : Done"
            #utilities.zipdir(sourcePath,"../temp")
            zip_ref = zipfile.ZipFile(sourcePath, 'r')
            zip_ref.extractall(".." + os.sep + "temp")
            zip_ref.close()
            ret_status[15] = ret_status[15] + " : Done"
            source1 = sourcePath.split(os.sep)
            source2 = source1[len(source1) - 1]
            source3 = source2.split(".zip")[0]
            ret_status[16] = ret_status[16] + " : Done"
            shutil.copytree(".." + os.sep + "temp" + os.sep + source3,
                            ".." + os.sep + "files" + os.sep + backup_node_ip)
            ret_status[17] = ret_status[17] + " : Done"
            shutil.move(
                ".." + os.sep + "temp" + os.sep + "configClient.xml",
                ".." + os.sep + "files" + os.sep + backup_node_ip + os.sep + "configClient.xml")
            ret_status[18] = ret_status[18] + " : Done"
    except:
        ret_status[0] = False
        ret_status.append("\n**** Exception Occurred: " + str(sys.exc_info()[1]) + str(traceback.print_exc()))
    print("\n Done \n")
    return ret_status
def _run_rkhunter(self):
    Avalon.info('Launching rkhunter')
    Utilities.execute(['rkhunter'], std_in=sys.stdin, std_out=sys.stdout, std_err=sys.stderr)
def search_wrapper(client_defined_expand,
                   client_defined_goal_state_check,
                   client_defined_hashed_state,
                   client_defined_compute_state_cost=None,
                   start_state_hash=None,
                   start_state=None,
                   search_type="bfs",
                   debug=False):
    results = {
        "path_to_goal": None,
        "cost_of_path": 0,
        "nodes_expanded": 0,
        "search_depth": 0,
        "max_search_depth": 0,
        "running_time": 0,
        "max_ram_usage": 0
    }

    def update_stats(max_search_depth=None, increment_expanded=False):
        if (max_search_depth is not None and results["max_search_depth"] < max_search_depth):
            results["max_search_depth"] = max_search_depth
        if (increment_expanded == True):
            results["nodes_expanded"] += 1

    # Wrapper to generate node
    def generate_node(node_options):
        return Node(**node_options)

    # Wrap the client_defined_expand and add stats
    def expand_with_stats(state, state_hash):
        children = client_defined_expand(state, state_hash)
        return children

    # Wrapper to track the cost (heuristic) of a given node -- only applicable in A-Star
    def compute_state_cost(state, state_hash):
        if (client_defined_compute_state_cost is not None):
            return client_defined_compute_state_cost(state, state_hash)
        else:
            return 1

    start_time = Utilities.get_current_time()
    node_solution = Algorithms.search(
        expand=expand_with_stats,
        goal_state_check=client_defined_goal_state_check,
        hashed_state=client_defined_hashed_state,
        generate_node=generate_node,
        compute_state_cost=compute_state_cost,
        update_stats=update_stats,
        start_state_hash=start_state_hash,
        start_state=start_state,
        search_type=search_type,
        debug=debug,
    )
    if (node_solution is not None):
        results["path_to_goal"] = Algorithms.get_node_path_to_root(node_solution)
        results["search_depth"] = len(results["path_to_goal"])
        results["cost_of_path"] = node_solution.cost
    end_time = Utilities.get_current_time()
    results["running_time"] = end_time - start_time
    max_ram_usage_in_bytes = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss  # in bytes
    results["max_ram_usage"] = max_ram_usage_in_bytes / 1000000
    return results
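# Hedged usage sketch (not from the original project): wiring caller-supplied callbacks into
# search_wrapper. The exact structure that client_defined_expand must return is dictated by
# Algorithms.search and Node, which are not shown here, so the callback bodies below are
# placeholders rather than a working state space.
def my_expand(state, state_hash):
    return []  # placeholder: return child descriptions in whatever form Node/Algorithms.search expect

def my_goal_check(state, state_hash):
    return state == "goal"

def my_hashed_state(state):
    return str(state)

stats = search_wrapper(my_expand, my_goal_check, my_hashed_state,
                       start_state="start", start_state_hash="start",
                       search_type="bfs", debug=False)
print(stats["nodes_expanded"], stats["running_time"], stats["path_to_goal"])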
def __init__(self):
    self.classifier = Classifier()
    self.dataset = 'bbc-dataset-500-rows.csv'
    self.utilities = Utilities()
def __init__(self,
             num_to_keep=None,
             user=None,
             db_pwd=None,
             target_db=None,
             host=None,
             tables=[],
             logging_level=logging.INFO,
             unittests=False):
    '''
    Most parameters have defaults that can be set once and for all.
    The main params that might change are num_to_keep, and tables.

    The num_to_keep integer value declares how many of the newest backup
    tables to keep for each table.

    The tables list of table names may contain a mix of table root names
    (e.g. AssignmentSubmission, Terms), and backup table names
    (e.g. Terms_2019_01_10_14_14_40_123456). For root names, all backup
    tables are collected and the num_to_keep newest are retained. For
    backup table names only those specific tables are removed.

    @param num_to_keep: how many of the latest backup tables to retain for each aux table
    @type num_to_keep: int
    @param user: MySQL user for login
    @type user: str
    @param db_pwd: password for logging into MySQL. Don't use for security reasons.
        Instead, put the pwd into $HOME/.ssh/canvas_pwd
    @type db_pwd: str
    @param target_db: MySQL where aux tables reside.
    @type target_db: str
    @param host: MySQL host name
    @type host: str
    @param tables: list of specific tables to consider. If None, backups for all
        aux tables are trimmed.
    @type tables: [str]
    @param logging_level: how much information to provide during runtime
    @type logging_level: logging.loglevel
    @param unittests: whether this instantiation is from a unittest
    @type unittests: boolean
    '''
    # Get local configuration info:
    self.config_info = ConfigInfo()

    # Access to common functionality:
    self.utils = Utilities()

    # Note: the original referenced the bare name config_info here, which is undefined
    # at this point; self.config_info is the instance created above.
    if user is None:
        user = self.config_info.default_user
    if host is None:
        host = self.config_info.default_host
    if db_pwd is None:
        db_pwd = self.utils.get_db_pwd(host, unittests=unittests)
    elif db_pwd == True:
        db_pwd = self.utils.get_db_pwd(host, ask_user=True, unittests=unittests)
    if target_db is None:
        target_db = self.config_info.canvas_db_aux
    self.target_db = target_db

    if num_to_keep is None:
        self.num_to_keep = BackupRemover.default_num_backups_to_keep
    else:
        self.num_to_keep = num_to_keep

    # Better name for tables to consider removing:
    tables_to_consider = tables

    # Unittests expect a db name in self.db:
    self.db = target_db

    self.db_obj = self.utils.log_into_mysql(user, db_pwd, db=target_db, host=host)

    self.utils.setup_logging(logging_level)
    if unittests:
        self.db_name = target_db
        return

    # Get names of all tables in the target_db
    all_tables = self.utils.get_existing_tables_in_dir(self.db_obj, return_all=True, target_db=target_db)

    # If caller specified only specific tables/backup tables to
    # remove, weed out all table names not in caller's list:
    all_tables_to_consider = self.find_tables_to_consider(all_tables, tables_to_consider)

    self.remove_old_backups(all_tables_to_consider)
    self.close()
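# Hedged usage sketch based on the docstring above: trim the backups of one aux table,
# keeping only the two newest copies. "Terms" is just an example root name; credentials
# fall back to the configured defaults / $HOME/.ssh/canvas_pwd as described in the parameter
# documentation, and the removal work happens inside the constructor itself.
BackupRemover(num_to_keep=2, tables=['Terms'])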
class LoadHistoryLister(object):
    '''
    Reads table LoadLog. Lists date of latest refresh for each table.
    Lists missing tables, and list of all tables.
    '''

    load_table_name = 'LoadLog'

    #-------------------------
    # Constructor
    #--------------

    def __init__(self, latest_only=False, unittests=False):
        '''
        Constructor
        '''
        config_info = ConfigInfo()
        self.utils = Utilities()

        # For convenience:
        self.load_table_name = LoadHistoryLister.load_table_name

        if unittests:
            self.aux_db = 'Unittest'
        else:
            self.aux_db = config_info.canvas_db_aux

        # Get results as dictionaries:
        if unittests:
            self.db_obj = self.utils.log_into_mysql(config_info.test_default_user,
                                                    self.utils.get_db_pwd(config_info.test_default_host,
                                                                          unittests=unittests),
                                                    db=self.aux_db,
                                                    host=config_info.test_default_host,
                                                    cursor_class=Cursors.DICT
                                                    )
            # Let unittests call methods on their own:
            return
        else:
            self.db_obj = self.utils.log_into_mysql(config_info.default_user,
                                                    self.utils.get_db_pwd(config_info.default_host,
                                                                          unittests=unittests),
                                                    db=config_info.canvas_db_aux,
                                                    host=config_info.default_host,
                                                    cursor_class=Cursors.DICT
                                                    )
        try:
            success = self.print_latest_refresh(latest_only)
            if success:
                self.print_missing_tables()
            # self.backup_availability()
        finally:
            self.db_obj.close()

    #-------------------------
    # print_latest_refresh
    #--------------

    def print_latest_refresh(self, latest_only=False, out_fd=sys.stdout, load_log_content=None):
        '''
        Pretty print a list of aux tables that exist in the database.

        @param latest_only: if True, only the most recent refresh event for
            each table will be shown.
        @type latest_only: bool
        @param out_fd: if provided, a file-like object to which output is
            written. Default: stdout. Used by unittests, but could also be
            used to write the report to a file.
        @type out_fd: file-like
        @param load_log_content: a list of dicts reflecting the content of
            the LoadLog table. Only used by unittests!
        @type load_log_content: [{}]
        @return: True for success, False for failure
        @rtype: bool
        '''
        try:
            # Only read content of LoadLog table if
            # unittests did not pass in their own in
            # the call:
            # Result will be:
            #   [{tbl_name : <str>, num_rows : <int>, time_refreshed : datetime},
            #    {tbl_name : <str>, num_rows : <int>, time_refreshed : datetime},
            #      ..
            #   ]
            if load_log_content is None:
                self.utils.ensure_load_log_table_existence(self.load_table_name, self.db_obj)
                load_log_content = self.db_obj.query(f"SELECT * FROM {self.aux_db}.{self.load_table_name}")
        except ValueError as e:
            out_fd.write(f"Cannot list tables: {repr(e)}\n")
            return False

        # Pull all row-dicts out from the query result:
        tbl_dicts = [tbl_dict for tbl_dict in load_log_content]

        # Sort the dicts by table name:
        sorted_tbl_dicts = sorted(tbl_dicts, key=lambda one_dict: one_dict['tbl_name'])

        out_fd.write(f"\nAux tables in {self.aux_db}:\n\n")
        tbl_nm_header = 'Table Name'
        load_time_header = 'Last Refreshed'
        num_rows_header = 'Num Rows'

        # Print the header:
        out_fd.write(f'{tbl_nm_header:>30} {load_time_header:^25} {num_rows_header:^5}\n')

        # If requested, only show the latest update
        # for each table:
        if latest_only:
            sorted_tbl_dicts = self.keep_latest_dict(sorted_tbl_dicts)

        # For each result dict, pull out the table name,
        # time refreshed, and number of rows. Assign them
        # to variables:
        for tbl_entry_dict in sorted_tbl_dicts:
            tbl_nm = tbl_entry_dict['tbl_name']
            num_rows = tbl_entry_dict['num_rows']
            # Get a UTC datetime obj (b/c we initialize
            # each MySQL session to be UTC):
            utc_load_datetime = tbl_entry_dict['time_refreshed']
            # Tell this 'unaware' datetime obj that it is UTC:
            tz_aware_load_datetime = utc_load_datetime.replace(tzinfo=timezone.utc)
            localized_datetime = tz_aware_load_datetime.astimezone(tz=None)
            load_time_str = localized_datetime.strftime("%Y-%m-%d %H:%M:%S %Z")
            # The ':>30' means "right-justify; allow 30 chars".
            # The ':^25' means "center-justify; allow 25 chars".
            out_fd.write(f"{tbl_nm:>30} {load_time_str:^25} {num_rows:^5}\n")

        return True

    #-------------------------
    # keep_latest_dict
    #--------------

    def keep_latest_dict(self, load_event_dicts):
        '''
        Given a list of dicts with table-name, load-date, and row num keys,
        return a new list with only the dicts that describe the most recent
        table refresh.

        @param load_event_dicts: array of dict describing table refresh events.
        @type load_event_dicts: [{}]
        '''
        # Dict {tbl_name : load_event_dict} to hold
        # the most recent dict for the respective table.
        # Use an ordered dict to not mess up order of
        # passed-in dicts:
        latest_dicts = OrderedDict()
        for load_event_dict in load_event_dicts:
            tbl_nm = load_event_dict['tbl_name']
            try:
                if load_event_dict['time_refreshed'] > latest_dicts[tbl_nm]['time_refreshed']:
                    latest_dicts[tbl_nm] = load_event_dict
            except KeyError:
                # First time we see an entry for this table:
                latest_dicts[tbl_nm] = load_event_dict

        res = [newest_refresh_dict for newest_refresh_dict in latest_dicts.values()]
        return res

    #-------------------------
    # print_missing_tables
    #--------------

    def print_missing_tables(self, num_cols=4):
        '''
        Print the tables that are missing in the aux tables database.
        Print in column form, alpha sorted.

        @param num_cols: number of table names in one row
        @type num_cols: int
        @return: True for success, False for failure
        @rtype: bool
        '''
        all_tables = set(self.utils.create_table_name_array())
        tables_present = self.utils.get_tbl_names_in_schema(self.db_obj, self.aux_db)
        tables_present = set([table_dict['TABLE_NAME'] for table_dict in tables_present])

        missing_tables = all_tables - tables_present
        if len(missing_tables) == 0:
            print("No missing tables.")
            return True

        self.utils.print_columns(missing_tables, 'Missing Tables:', num_cols=num_cols, alpha=True)
        return True
                    help='Save data to CSV')
parser.add_argument('-wimages',
                    action='store_true',
                    default=False,
                    dest='wimages',
                    help='Save images in query folder')

arguments = parser.parse_args()

if arguments.dataset:
    datasetPath = arguments.dataset
else:
    parser.print_help()
    print("-dataset <datasetPath>")
    sys.exit(1)

util = Utilities()

# experiment directory
expDir = "sift_experiments"

qBuildings = ['22', '39', '60']
qidx = 0

## Prepare Dataset ##
dataset, queryList = util.createDataset(datasetPath, qBuildings)

# creating result lists & house rank list of (<image>, #inliers, #inliers, #accuracy)
resList = np.zeros(len(dataset), [('idx', 'int16'), ('imageId', 'a28'),
                                  ('inliers', 'int16'), ('percent', 'float')])
rankedClassList = np.zeros(15, [('idx', 'int16'), ('imageId', 'a28'),
                                ('inliers', 'int16'), ('percent', 'float'),
                                ('building', 'int8')])
class Bot:
    """Bot object each group will have that handles checking for commands and processing them"""

    def __init__(self, group, yt_key=None, delim="$", refresh_group_interval=600):
        """
        :param group: the group this object will read messages from
        :param yt_key: youtube api key. need it to use yt_search but not needed for other commands
        :param delim: the first character that will let the bot know it is a command. default is "$"
        """
        self.group = group
        self.delim = delim
        self.ult = Utilities(yt_key)
        self.tags = Tags(group.name, group.id, group.members)
        self.valid_commands = ["avatar", "git", "yt", "tag", "help"]
        Timer(refresh_group_interval, self.reload_group).start()

    def get_message(self):
        """
        :return: returns the latest message from a group. if there is an error, return None
        """
        try:
            return self.group.messages.list()[0]
        except Exception as err:
            # Note: the original string was missing the f prefix, so the placeholders were never filled in.
            Utilities.log(f"Exception: {self.group.name}: bot.get_message: {err}")
            return None

    def reload_tags(self):
        """
        :return: reloads the tags in the Tag object
        """
        self.tags.reload_tags()

    def reload_group(self, stop=Event()):
        """
        :param stop: threading Event. not set by default so this method would be called every 10 minutes
        :return: updates the group name, group id, and group members of a group every 10 minutes
        """
        self.group.refresh_from_server()
        self.tags.update_members(self.group.members)
        self.tags.update_group_id(self.group.id)
        self.tags.update_group_name(self.group.name)
        self.tags.save_tags()
        if not stop.is_set():
            Timer(600, self.reload_group).start()

    def save_tags(self):
        """
        :return: writes the tags to a file for the group
        """
        self.tags.save_tags()

    def find_owner_name(self, user_id):
        """
        :param user_id: user_id of a member in the group
        :return: returns the nickname associated with the user_id
        """
        return list(filter(lambda x: x.user_id == user_id, self.group.members))[0]

    def find_avatar(self, message, mentions):
        """
        :param message: the avatar command. checks to see if it's a help call or actual usage
        :param mentions: list of attachments with the message. uses it to check for mentions
        :return: avatar url of person mentioned or an error message saying to mention the user
        """
        if message[2] == "help":
            return "Usage: avatar [person]"
        else:
            mentions = list(filter(lambda x: x.type == "mentions", mentions))
            if len(mentions) == 1:
                user_id = mentions[0].user_ids[0]
                return self.find_owner_name(user_id).image_url
            return "Please mention the one person you want the avatar of"

    def send_message(self, message):
        """
        :param message: message that will be sent to the group
        :return: message should post in the group
        """
        try:
            if isinstance(message, list):
                for res in message:
                    self.group.post(res)
            else:
                self.group.post(message)
        except Exception as err:
            Utilities.log(f"Exception: {self.group.name}: bot.send_message: {err}")

    def process_message(self, message):
        """
        :param message: checks if the message is a valid command and executes the command it is associated with
        :return: results of the command executed
        """
        if message is not None:
            try:
                message_text = message.text.lower()
                delim = message_text[:len(self.delim)]
                message_text = message_text[len(self.delim):]
                message_text = message_text.split(" ")
                command = message_text[0]
                if delim == self.delim and command in self.valid_commands:
                    user_id = message.user_id
                    owner = self.find_owner_name(user_id)
                    Utilities.log(
                        f"{self.group.name}: Processing from {owner}: {message_text}, Command: {command}"
                    )
                    result = None
                    if command == "help":
                        result = self.ult.post_help()
                    if command == "avatar":
                        result = self.find_avatar(message_text, message.attachments)
                    if command == "git":
                        result = self.ult.git()
                    if command == "yt":
                        query = ' '.join(message_text[1:])
                        result = self.ult.yt_search(query)
                    if command == "tag":
                        result = self.tags.parse_commands(message_text, user_id, message.attachments)
                    if result is not None:
                        Utilities.log(f"{self.group.name}: posting \"{result}\"")
                        self.send_message(result)
            except Exception as err:
                if isinstance(err, googleapiclient.errors.HttpError):
                    self.send_message(str(err))
                if message.text is None:
                    pass
                else:
                    Utilities.log(f"{self.group.name}: bot.process_message: {err}")
def generateAnswer(choice):
    '''
    This function takes in a choice as input, loads the corresponding model
    for that choice, and uses the loaded model to predict the output and the weights.

    Parameters:
        choice (str) : It can be either 'single' or 'double'

    Returns:
        story (list) : A list of sentences in the story
        question (str) : The question
        correct_answer (str) : The correct answer
        weights1 (numpy array) : The array of weights for the outer hop
        weights2 (numpy array) : The array of weights for the inner hop
        predicted_answer (str) : The answer predicted by the model
    '''
    tar = tarfile.open('Data/babi_tasks_1-20_v1-2.tar.gz')

    challenges = {
        # QA1 with 10,000 samples
        'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
        # QA2 with 10,000 samples
        'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
    }

    if choice == 'single':
        ## Single Supporting Fact Challenge
        ss_train_stories, ss_test_stories, \
        ss_stories_train, ss_questions_train, ss_answers_train, \
        ss_stories_test, ss_questions_test, ss_answers_test, \
        ss_story_maxlen, ss_story_maxsents, ss_question_maxlen, \
        ss_vocab, ss_vocab_size, ss_word2idx = \
            Preprocess.getData(challenges['single_supporting_fact_10k'], tar)

        ss_idx2word = {value: key for key, value in ss_word2idx.items()}

        single_model = Utilities.loadModel('single_model')
        single_debug_model = Utilities.loadModel('single_debug_model')

        story, question, correct_answer, weights2, predicted_answer = \
            Models.predictSingleModelAnswer(ss_test_stories, ss_stories_test, ss_questions_test,
                                            ss_idx2word, single_model, single_debug_model)
        weights1 = np.zeros(weights2.shape)

        K.clear_session()
        return story, question, correct_answer, weights1, weights2, predicted_answer
    else:
        ## Two Supporting Fact Challenge
        ts_train_stories, ts_test_stories, \
        ts_stories_train, ts_questions_train, ts_answers_train, \
        ts_stories_test, ts_questions_test, ts_answers_test, \
        ts_story_maxlen, ts_story_maxsents, ts_question_maxlen, \
        ts_vocab, ts_vocab_size, ts_word2idx = \
            Preprocess.getData(challenges['two_supporting_facts_10k'], tar)

        ts_idx2word = {value: key for key, value in ts_word2idx.items()}

        double_model = Utilities.loadModel('double_model')
        double_debug_model = Utilities.loadModel('double_debug_model')

        story, question, correct_answer, weights1, weights2, predicted_answer = \
            Models.predictDoubleModelAnswer(ts_test_stories, ts_stories_test, ts_questions_test,
                                            ts_idx2word, double_model, double_debug_model)

        K.clear_session()
        return story, question, correct_answer, weights1, weights2, predicted_answer
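# Hedged usage sketch: query the single-supporting-fact model once. This assumes the bAbI
# tarball and the pickled 'single_model'/'single_debug_model' files referenced above are
# already present on disk.
story, question, correct_answer, weights1, weights2, predicted_answer = generateAnswer('single')
print(question, '->', predicted_answer, '(expected:', correct_answer, ')')
print('attention weight shapes:', weights1.shape, weights2.shape)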
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression

wine_white_results = open('wine_white_results.txt', 'w')
wine_red_results = open('wine_red_results.txt', 'w')

# read data into a DataFrame
data = pd.read_csv("winequality-white.csv", delimiter=';')
data2 = pd.read_csv("winequality-red.csv", delimiter=';')

X = data[[
    "fixed acidity", "volatile acidity", "citric acid", "residual sugar",
    "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
    "pH", "sulphates", "alcohol"
]]
y = data[["quality"]]

_indent = Utilities.draw_whatever("-", 6)

# calculate correlation matrix
corMat = DataFrame(data.iloc[:, :11].corr())
wine_white_results.writelines(_indent + 'correlation matrix' + _indent + '\n')
wine_white_results.writelines(str(corMat) + '\n')

# instantiate a linear regression model, and fit with X and y
model = LinearRegression()
model = model.fit(X, y.values.ravel())

# check the accuracy on the training set
score = model.score(X, y)
wine_white_results.write("accuracy :" + str(score) + '\n')

# print intercept and coefficients
wine_white_results.write("intercept_ :" + str(model.intercept_) + '\n')
import os, sys, vtk
import numpy as np

script_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(script_path, '../src'))
from utilities import Utilities

rescale_path = '/work/lpzmateo/data/DL_shapes/shapes/no_features_no_groups/rescaled'
csv_path = '/work/lpzmateo/data/DL_shapes/shapes/no_features_no_groups/dataset_description.csv'

util = Utilities()
csv_dict = util.readDictCSVFile(csv_path)

print('getting params')
# Scan every shape and record the global minimum and maximum point coordinates.
min_coord = None
max_coord = None
for path in csv_dict['VTK Files']:
    reader = vtk.vtkPolyDataReader()
    reader.SetFileName(path)
    reader.Update()
    polydata = reader.GetOutput()

    for i in range(polydata.GetNumberOfPoints()):
        ptn = polydata.GetPoint(i)
        for n in ptn:
            if min_coord is None or n < min_coord:
                min_coord = n
            if max_coord is None or n > max_coord:
                max_coord = n
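# A minimal sketch (an assumption, not part of the original script) of how the
# min_coord / max_coord gathered above could be used to rescale every shape
# into [0, 1] and write it under rescale_path.
for path in csv_dict['VTK Files']:
    reader = vtk.vtkPolyDataReader()
    reader.SetFileName(path)
    reader.Update()
    polydata = reader.GetOutput()
    points = polydata.GetPoints()
    for i in range(polydata.GetNumberOfPoints()):
        x, y, z = polydata.GetPoint(i)
        points.SetPoint(i,
                        (x - min_coord) / (max_coord - min_coord),
                        (y - min_coord) / (max_coord - min_coord),
                        (z - min_coord) / (max_coord - min_coord))
    writer = vtk.vtkPolyDataWriter()
    writer.SetFileName(os.path.join(rescale_path, os.path.basename(path)))
    writer.SetInputData(polydata)
    writer.Write()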
def train(self): """ Trains the GAN system """ # break condition for training (when diverging) loss_increase_count = 0 prev_g_loss = 0 conn = SQLConnector() idx = np.arange(self.batch_size) for epoch in range(self.max_epochs): #selecting batch_size random attacks from our training data #idx = np.random.randint(0, X_train.shape[0], batch_size) attacks = self.X_train[idx] # generate a matrix of noise vectors noise = np.random.normal(0, 1, (self.batch_size, 41)) # create an array of generated attacks gen_attacks = self.generator.predict(noise) # loss functions, based on what metrics we specify at model compile time c_loss_real = self.critic.train_on_batch(attacks, self.valid) c_loss_fake = self.critic.train_on_batch(gen_attacks, self.fake) d_loss = 0.5 * np.add(c_loss_real, c_loss_fake) for l in self.critic.layers: weights = l.get_weights() weights = [ np.clip(w, -self.clip_value, self.clip_value) for w in weights ] l.set_weights(weights) # generator loss function g_loss = self.gan.train_on_batch(noise, self.valid) if epoch % 500 == 0: print( "%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss, g_loss - prev_g_loss, loss_increase_count)) gen_attacks = self.scaler.inverse_transform(gen_attacks) predicted_gen_attack_labels = self.evaluator.predict( gen_attacks).transpose().astype(int) gen_attack_labels = np.full(predicted_gen_attack_labels.shape, 1) print("Generated attack labels: ") print(gen_attack_labels) print("Predicted labels of generated attacks: ") print(predicted_gen_attack_labels) right = (predicted_gen_attack_labels == 1).sum() wrong = (predicted_gen_attack_labels != 1).sum() accuracy = (right / float(right + wrong)) print("5 generated attacks: ") print(gen_attacks[:5, :]) print() print("Accuracy of evaluator on generated data: %.4f " % accuracy) if accuracy > .50: conn.write_gens(gen_attacks, util.attacks_to_num(self.attack_type)) layersstr = str(self.generator_layers[0]) + "," + str( self.generator_layers[1]) + "," + str(self.generator_layers[2]) attack_num = util.attacks_to_num(self.attack_type) conn.write_hypers(layerstr=layersstr, attack_encoded=attack_num, accuracy=accuracy)
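# The clipping of critic weights and the self.valid / self.fake label arrays
# above suggest a WGAN-style setup. A minimal sketch (an assumption, not code
# from this repository) of the Wasserstein loss such a critic/GAN pair is
# typically compiled with, and of the +/-1 target arrays:
import numpy as np
from keras import backend as K

def wasserstein_loss(y_true, y_pred):
    # Minimizing the mean of label * critic score approximates the
    # Wasserstein distance between real and generated batches.
    return K.mean(y_true * y_pred)

batch_size = 64
valid = -np.ones((batch_size, 1))  # target for real attacks
fake = np.ones((batch_size, 1))    # target for generated attacks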
def driver(): TimerUtility.start_timer('drv_driver') name = "_cumcolor_urban_" output_dir = Scenario.get_scen_value("output_dir") landuse_flag = len(Scenario.get_scen_value("landuse_data_file")) > 0 nrows = IGrid.nrows ncols = IGrid.ncols total_pixels = IGrid.get_total_pixels() z_cumulate = PGrid.get_cumulate() sim_landuse = PGrid.get_land1() # Create Annual Landuse Probability File if Processing.get_processing_type() == Globals.mode_enum["predict"]: if landuse_flag: LandClass.init_annual_prob(total_pixels) # Monte Carlo Simulation Driver.monte_carlo(z_cumulate, sim_landuse) if Processing.get_processing_type() == Globals.mode_enum["predict"]: # Output Urban Images if IGrid.using_gif: filename = f"{output_dir}cumulate_urban.gif" else: filename = f"{output_dir}cumulate_urban.tif" IGrid.echo_meta(f"{output_dir}cumulate_urban.tfw", "urban") colortable = Color.get_grayscale_table() ImageIO.write_gif(z_cumulate, colortable, filename, "", nrows, ncols) Utilities.write_z_prob_grid(z_cumulate.gridData, name) if landuse_flag: cum_prob, cum_uncert = LandClass.build_prob_image(total_pixels) #print(cum_prob) # Output Cumulative Prob Image if IGrid.using_gif: filename = f"{output_dir}cumcolor_landuse.gif" else: filename = f"{output_dir}cumcolor_landuse.tif" IGrid.echo_meta(f"{output_dir}cumcolor_landuse.tfw", "landuse") cum_prob_grid = IGrid.wrap_list(cum_prob) ImageIO.write_gif(cum_prob_grid, Color.get_landuse_table(), filename, "", nrows, ncols) # Output Cumulative Uncertainty Image if IGrid.using_gif: filename = f"{output_dir}uncertainty.landuse.gif" else: filename = f"{output_dir}uncertainty.landuse.tif" IGrid.echo_meta(f"{output_dir}uncertainty.landuse.tfw", "landuse") cum_uncert_grid = IGrid.wrap_list(cum_uncert) ImageIO.write_gif(cum_uncert_grid, Color.get_grayscale_table(), filename, "", nrows, ncols) if not landuse_flag or Processing.get_processing_type( ) == Globals.mode_enum['predict']: fmatch = 0.0 else: landuse1 = IGrid.igrid.get_landuse_igrid(1) fmatch = Driver.fmatch(sim_landuse, landuse1, landuse_flag, total_pixels) Stats.analyze(fmatch) TimerUtility.stop_timer('drv_driver')
def spread(z, avg_slope): TimerUtility.start_timer('spr_spread') sng = 0 sdc = 0 og = 0 rt = 0 nrows = IGrid.nrows ncols = IGrid.ncols total_pixels = nrows * ncols road_gravity = Coeff.get_current_road_gravity() diffusion = Coeff.get_current_diffusion() breed = Coeff.get_current_breed() spread = Coeff.get_current_spread() excld = IGrid.igrid.get_excld_grid() roads = IGrid.igrid.get_road_grid_by_year( Processing.get_current_year()) slope = IGrid.igrid.get_slope_grid() nrows = IGrid.nrows ncols = IGrid.ncols # Zero the growth array for this time period delta = [0] * (nrows * ncols) # Get slope rates slope_weights = Spread.get_slope_weights() # Phase 1N3 - Spontaneous Neighborhood Growth and Spreading sng, sdc = Spread.phase1n3(diffusion, breed, z.gridData, delta, slope, excld, slope_weights, sng, sdc) # Phase 4 - Organic Growth og = Spread.phase4(spread, z.gridData, excld, delta, slope, slope_weights, og) # Phase 5 - Road Influence Growth rt = Spread.phase5(road_gravity, diffusion, breed, z.gridData, delta, slope, excld, roads, slope_weights, rt) Utilities.condition_gt_gif(delta, UGMDefines.PHASE5G, delta, 0) Utilities.condition_ge_gif(excld, 100, delta, 0) # Now place growth array into current array num_growth_pix = 0 avg_slope = 0.0 for i in range(total_pixels): if z.gridData[i] == 0 and delta[i] > 0: # New growth being placed into array avg_slope += slope[i] z.gridData[i] = delta[i] num_growth_pix += 1 pop = 0 for pixels in z.gridData: if pixels >= UGMDefines.PHASE0G: pop += 1 if num_growth_pix == 0: avg_slope = 0.0 else: avg_slope /= num_growth_pix TimerUtility.stop_timer('spr_spread') return avg_slope, num_growth_pix, sng, sdc, og, rt, pop
class ShapeEvaluator(): def __init__(self): self.input_description_path=None self.input_description=None self.model_info_path=None self.model_info=None self.dataset_info_path=None self.dataset_info=None self.tfrecord_info_path=None self.tfrecord_info=None self.output_dir=None self.util=Utilities() def setInputDescription(self,path): self.input_description_path=path self.input_description=self.util.readDictCSVFile(path) def setModelInformation(self,path): self.model_info_path=path self.model_info=self.util.readJSONFile(path) self.dataset_info_path=self.model_info['dataset_info_path'] self.dataset_info=self.util.readJSONFile(self.dataset_info_path) self.tfrecord_info_path=self.dataset_info['tfrecord_info'] self.tfrecord_info=self.util.readJSONFile(self.tfrecord_info_path) def setOutputDirectory(self,path): self.output_dir=path def evaluate(self): print('Starting evaluation') graph = tf.Graph() with graph.as_default(): if self.model_info['model_type']=='classification': from classification_nn import ClassificationNN nn=ClassificationNN() nn.setTFRecordInfo(tfrecord_info=self.tfrecord_info) with tf.variable_scope("evaluation_data"): if 'VTK Files' in self.input_description: data_extractor=ShapeDataExtractor() data_extractor.setCSVDescription(self.input_description_path) if self.tfrecord_info['extraction_info']['points_feature']: data_extractor.setPointFeature(self.tfrecord_info['extraction_info']['points_feature']['feature_names']) if self.tfrecord_info['extraction_info']['cells_feature']: data_extractor.setCellFeature(self.tfrecord_info['extraction_info']['cells_feature']['feature_names']) data_extractor.setOutputDirectory(os.path.join(self.output_dir,'tfrecords')) tfrecord_info_path=data_extractor.extractAndSave() nn.setTFRecordInfo(tfrecord_info_path=tfrecord_info_path) dataset=nn.extractSet(self.input_description['TFRecords'], batch_size=len(self.input_description['TFRecords']), num_epochs=None, shuffle_buffer_size=None, variable_scope='evaluation_set') ite = dataset.make_initializable_iterator() data_tuple=ite.get_next() nn.setTFRecordInfo(tfrecord_info=self.tfrecord_info) ops=nn.getOps(data_tuple=data_tuple, # images=None, is_training=False, #learning_rate=self.learning_rate, # decay_steps=10000, # decay_rate=0.96, # staircase=False, ps_device="/cpu:0", w_device="/cpu:0") with tf.Session() as sess: #Global Variables Initialisation sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()]) #Initialazing The Iterators sess.run([ite.initializer]) #Initializing the model saver saver = tf.train.Saver() saver.restore(sess,self.model_info['model_path']) #eval feed_dict=nn.getEvaluationParameters() predictions = sess.run(ops['class_prediction'],feed_dict=feed_dict) #convert digit into original class name for i in range(len(predictions)): predictions[i]=self.tfrecord_info['class_corres_digit_to_name'][str(predictions[i])] new_description=self.input_description new_description['Predictions']=predictions prediction_path=os.path.join(self.output_dir,'prediction_description.csv') self.util.writeDictCSVFile(new_description,prediction_path) print('Prediction description saved: %s'%(prediction_path)) return prediction_path
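# Hypothetical usage sketch for ShapeEvaluator; the CSV/JSON paths below are
# placeholders, not files shipped with the original project.
evaluator = ShapeEvaluator()
evaluator.setInputDescription('/path/to/dataset_description.csv')
evaluator.setModelInformation('/path/to/model_info.json')
evaluator.setOutputDirectory('/path/to/output_dir')
prediction_csv = evaluator.evaluate()
print('Predictions written to', prediction_csv)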
def createConfigNode(node_ip, backup_node_ip, executableList=[]): ret_status = [ True, "Step 1(of 5) - Getting config.xml", "Step 2(of 5) - Checking Config File", "Step(3 of 5) - Creating Node Folders", "Step 4(of 5) - Writing to Node Config Files", "Step 5(of 5) - Writing to main Config File" ] print("\n Running createConfigNode() \n") try: values = NodeStatus.getNodeList() tree = ElementTree.parse(".." + os.sep + "config.xml") ret_status[1] = ret_status[1] + " Done" root = tree.getroot() if node_ip in values: ret_status[0] = False ret_status[2] = ret_status[ 2] + 'ERROR CODE 1003: Node with Ip Address :' + node_ip + " already present in config.xml" return ret_status else: ret_status[2] = ret_status[2] + " : Done" node_element = ElementTree.Element('node') node_ip_element = ElementTree.Element('nip') node_ip_element.text = node_ip node_bip_element = ElementTree.Element('nbip') node_bip_element.text = backup_node_ip node_executable_element = ElementTree.Element('executables') executable_files = [] for files in executableList[:]: node_file_element = ElementTree.Element('file') node_file_element.text = files node_executable_element.append(node_file_element) node_element.append(node_ip_element) node_element.append(node_bip_element) node_element.append(node_executable_element) root.append(node_element) tree = ElementTree.ElementTree(root) ########## Creating Folders for the nodes ############################## utilities.checkExistOrCreate(".." + os.sep + "files") path = ".." + os.sep + "files" + os.sep + node_ip pathbkup = ".." + os.sep + "files" + os.sep + backup_node_ip utilities.createOrReplace(path) utilities.createOrReplace(pathbkup) ret_status[3] = ret_status[3] + " : Done" ########## Writing to all the config files ############################## child_root = ElementTree.Element("client") child_root.append(node_element) tree_child = ElementTree.ElementTree(child_root) tree_child.write(path + os.sep + "configClient.xml") tree_child.write(pathbkup + os.sep + "configClient.xml") ret_status[4] = ret_status[4] + " : Done" tree.write(".." + os.sep + "config.xml") ret_status[5] = ret_status[5] + " : Done" except: ret_status[0] = False ret_status.append("\n**** Exception Occurred: " + str(sys.exc_info()[1]) + str(traceback.print_exc())) print("\n Done \n") return ret_status
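# Hypothetical usage sketch for createConfigNode(); the IP addresses and the
# executable name are placeholders. The returned list carries the overall
# success flag followed by a per-step status message.
status = createConfigNode('192.168.0.10', '192.168.0.11', ['job.py'])
for line in status[1:]:
    print(line)
print('Success' if status[0] else 'Failed')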
def main(): print() conn = SQLConnector() data = conn.pull_all_attacks(num=10000) dataframe = pd.DataFrame.from_records( data=data, columns=conn.pull_kdd99_columns(allQ=True)) d = defaultdict(LabelEncoder) features = dataframe.iloc[:, :41] attack_labels = dataframe.iloc[:, 41:] for i in range(0, attack_labels.size): attack_labels.at[i, 'attack_type'] = util.attacks_to_num( attack_labels.at[i, 'attack_type']) fit = features.apply(lambda x: d[x.name].fit_transform(x)) unbalanced_df = fit.join(attack_labels) balanced_df = unbalanced_df.copy(deep=True) gen_data = np.asarray(conn.read_gen_attacks_acc_thresh(.90, 1000)) gen_df = pd.DataFrame.from_records( gen_data, columns=conn.pull_kdd99_columns(allQ=True)) gen_df = gen_df.fillna(0) balanced_df = pd.concat([balanced_df, gen_df]) print(len(balanced_df)) unbalanced_array = unbalanced_df.values balanced_array = balanced_df.values # BEGIN LOOP # Create two identical multi-class classifiers, make sure their output dimensions match the number of classes in our data layers = [16, 32, 16] alpha = 0.1 dropout = 0.3 unb_labels = unbalanced_array[:, 41] [unb_classes, unb_counts] = np.unique(unb_labels, return_counts=True) print("Unique classes in unbalanced labels: ") print(unb_classes) print("Counts for the classes in unbalanced labels: ") print(unb_counts) unb_class_count = len(unb_classes) print("Number of classes in unbalanced dataset: " + str(unb_class_count)) bal_labels = balanced_array[:, 41] [bal_classes, bal_counts] = np.unique(bal_labels, return_counts=True) dummy_bal_labels = np_utils.to_categorical(bal_labels) bal_class_count = len(bal_classes) print("Number of classes in balanced dataset: " + str(bal_class_count)) print("Unique classes in balanced labels: ") print(bal_classes) print("Counts for the classes in balanced labels: ") print(bal_counts) for j in range(100): unbalanced_classifier = build_discriminator(layers, alpha, dropout, unb_class_count) balanced_classifier = build_discriminator(layers, alpha, dropout, bal_class_count) optimizer = Adam(.001) unbalanced_classifier.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) balanced_classifier.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) # encoding labels, classifier wants them in range 0 to num_classes unb_enc = LabelEncoder() bal_enc = LabelEncoder() unb_labels = unbalanced_array[:, 41] bal_labels = balanced_array[:, 41] unb_enc = unb_enc.fit(unb_labels) bal_enc = bal_enc.fit(bal_labels) unbalanced_array[:, 41] = unb_enc.transform(unbalanced_array[:, 41]) balanced_array[:, 41] = bal_enc.transform(balanced_array[:, 41]) [unb_classes, _] = np.unique(unbalanced_array[:, 41], return_counts=True) train_data = unbalanced_array[:, :41].astype(int) unb_cm = train(unbalanced_classifier, unbalanced_array, train_data) bal_cm = train(balanced_classifier, balanced_array, train_data) print("Metrics for iteration " + str(j)) # print("Confusion matrix of unbalanced: ") # print print("Accuracy of unbalanced: " + str(getmetrics(unb_cm))) # print("Confusion matrix of balanced: ") # print(bal_cm) print("Accuracy of balanced" + str(getmetrics(bal_cm))) print("Diff: " + str(getmetrics(bal_cm) - getmetrics(unb_cm)))
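# getmetrics() is called above but not defined in this snippet; a minimal
# sketch (an assumption) of one plausible implementation that derives overall
# accuracy from a confusion matrix:
import numpy as np

def getmetrics(cm):
    cm = np.asarray(cm, dtype=float)
    # correct predictions sit on the diagonal
    return np.trace(cm) / cm.sum()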
def find_best_params(img, dimg, rx, ry, block_size, step_size, spatial_factor,
                     intensity_shrinkage, max_x_offset, max_y_offset,
                     err_func=ImageUtils.rmse, verbosity=0):
    if verbosity > 0:
        print("<{0}>".format(ry), end='')
    if verbosity > 1:
        argdict = locals().copy()
        for k in argdict.keys():
            val = argdict[k]
            if not Utilities.is_iterable(val):
                print(" {0}: {1}".format(k, argdict[k]))
            else:
                print(" {0} is iterable".format(k))

    if verbosity > 1:
        print("rx= {0}, ry= {1}".format(rx, ry))
    if rx > 0:
        pass

    # Search window in the domain image, centered on the range block's position.
    left = max(int(rx / 2) - max_x_offset, 0)
    right = min(int(rx / 2) + max_x_offset, dimg.shape[1] - block_size)
    up = max(int(ry / 2) - max_y_offset, 0)
    down = min(int(ry / 2) + max_y_offset, dimg.shape[0] - block_size)
    if (left >= right) or (up >= down):
        import pdb
        pdb.set_trace()
    if verbosity > 0:
        pass

    rblock = img[ry:ry + block_size, rx:rx + block_size]
    rmean = np.mean(rblock)

    best_err = np.finfo('float').max
    tries = 0
    best_x = rx
    best_y = ry
    best_mean_add = 0.0  # fixed: was misspelled as best_mean_adj

    for dx in range(left, right, step_size):
        for dy in range(up, down, step_size):
            temp = dimg[dy:dy + block_size, dx:dx + block_size] * intensity_shrinkage
            if (temp.shape[0] != rblock.shape[0]) or (temp.shape[1] != rblock.shape[1]):
                warnings.warn("domain block shape {0} does not match range block shape {1}".format(
                    temp.shape, rblock.shape))
            dmean = np.mean(temp)
            mean_add = rmean - dmean
            dblock = temp + mean_add
            dblock = np.clip(dblock, 0, 255)
            newmean = np.mean(dblock)
            if (dblock.shape[0] != rblock.shape[0]) or (dblock.shape[1] != rblock.shape[1]):
                msg = "range and domain have different shapes"
                msg += "rx={0}, ry={1}".format(rx, ry)
                raise RuntimeError(msg)
            err = np.finfo('float').max
            try:
                if (dblock.shape[0] != rblock.shape[0]) or (dblock.shape[1] != rblock.shape[1]):
                    warnings.warn("adjusted domain block shape {0} does not match range block shape {1}".format(
                        dblock.shape, rblock.shape))
                err = err_func(rblock, dblock)
            except Exception as e:
                emsg = Utilities.last_exception_info()
                print(emsg)
                raise RuntimeError(emsg)
            tries += 1
            if err < best_err:
                best_x = dx
                best_y = dy
                best_mean_add = mean_add
                best_err = err
    if tries == 0:
        msg = "tries==0, rx={0}, ry= {1}".format(rx, ry)
        raise RuntimeError(msg)
    if best_x > dimg.shape[1] - block_size or best_y > dimg.shape[0] - block_size:
        msg = "codes out of range, x= {0}, y={1}".format(best_x, best_y)
        msg += "image wd= {0}, ht= {1}".format(dimg.shape[1], dimg.shape[0])
        print(msg)
        raise RuntimeError(msg)
    return (best_x, best_y, best_mean_add, rx, ry, best_err, tries)
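# A hedged usage sketch with synthetic images (an assumption, not part of the
# original module): block_size, offsets and step_size are arbitrary, and a
# local RMSE stands in for ImageUtils.rmse so the call is self-contained.
import numpy as np

def _rmse(a, b):
    return np.sqrt(np.mean((a.astype(float) - b.astype(float)) ** 2))

img = np.random.randint(0, 256, size=(64, 64)).astype(float)
dimg = img.copy()  # the domain image is normally a downsampled copy of img

code = find_best_params(img, dimg, rx=8, ry=8,
                        block_size=8, step_size=2, spatial_factor=2,
                        intensity_shrinkage=0.7,
                        max_x_offset=4, max_y_offset=4,
                        err_func=_rmse, verbosity=0)
print(code)  # (best_x, best_y, best_mean_add, rx, ry, best_err, tries)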
help='Save image to file') parser.add_argument('-o', action='store_true', default=False, dest='o', help='Save data to CSV') arguments = parser.parse_args() if arguments.im1: img1Path = str(arguments.im1)[2:-2] else: parser.print_help() print("-img1: Query Image") sys.exit(1) util = Utilities() #image counter n = 0 ## Prepare Dataset ## dataset = [] listImages = glob.glob('dataset/*.jpg') for i in listImages: dataset.append(i.split('/')[-1]) #creating a list of (<image>,#inliers) pairs resList = np.zeros(len(dataset), [('idx', 'int16'), ('imageId', 'a28'), ('inliers', 'int16'), ('percent', 'float')]) print("\n================")
step_size = 2 verbosity = 1 intensity_shrinkage = 0.7 oimg = ImageUtils.trim_image(small_img, spatial_factor=spatial_factor, block_size=block_size, verbosity=verbosity) # show image plt.imshow(oimg, cmap=plt.get_cmap('gray')) #, vmin=0, vmax=1) plt.show() # compress Comp = Compressor() start = Utilities.now() print(start) params = Comp.compress_image(oimg, block_size=block_size, spatial_factor=spatial_factor, intensity_shrinkage=intensity_shrinkage, err_func=ImageUtils.rmse, max_x_offset=None, max_y_offset=None, verbosity=1) end = Utilities.now() print(end) cdf = pd.DataFrame(params['codes']) print("rmse= {0}".format(np.sqrt(cdf['err'].mean()))) # decompress Decomp = Decompressor()
from groupmembers import print_group_members


def main():
    print_group_members()

    # initialize the utilities class: downloads the mnist data and initializes the
    # input variable x and the predicted output label variable y_
    num_iterations = 20000
    batch_size = 50
    learning_rate = 0.5
    utility_obj = Utilities(num_iterations, batch_size, learning_rate)

    # Read the USPS data from the proj3_images folder and store it for further use
    utility_obj.get_usps_data()

    # create the logistic regression model, train it using the mnist data and test it
    # using the mnist and usps data sets
    logistic_regression(utility_obj)

    # create single layer neural network model, train using mnist and test it using mnist and usps
    num_neurons = 100
    # single_layer_nn(utility_obj, num_neurons)

    # create convolutional neural network model, train using mnist and test it using mnist and usps
    # train_cnn(utility_obj)
class BackupRemover(object):
    '''
    Utility to remove all but a given number of table backups from the aux directory.
    '''

    default_num_backups_to_keep = 2

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 num_to_keep=None,
                 user=None,
                 db_pwd=None,
                 target_db=None,
                 host=None,
                 tables=[],
                 logging_level=logging.INFO,
                 unittests=False):
        '''
        Most parameters have defaults that can be set once and for all.
        The main params that might change are num_to_keep, and tables.

        The num_to_keep integer value declares how many of the newest backup
        tables to keep for each table.

        The tables list of table names may contain a mix of table root names
        (e.g. AssignmentSubmission, Terms), and backup table names
        (e.g. Terms_2019_01_10_14_14_40_123456)

        For root names, all backup tables are collected and the num_to_keep
        newest are retained. For backup table names only those specific
        tables are removed.

        @param num_to_keep: how many of the latest backup tables to retain
            for each aux table
        @type num_to_keep: int
        @param user: MySQL user for login
        @type user: str
        @param db_pwd: password for logging into MySQL. Don't use for
            security reasons. Instead, put the pwd into $HOME/.ssh/canvas_pwd
        @type db_pwd: str
        @param target_db: MySQL where aux tables reside.
        @type target_db: str
        @param host: MySQL host name
        @type host: str
        @param tables: list of specific tables to consider. If None,
            backups for all aux tables are trimmed.
        @type tables: [str]
        @param logging_level: how much information to provide during runtime
        @type logging_level: logging.loglevel
        @param unittests: whether this instantiation is from a unittest
        @type unittests: boolean
        '''

        # Get local configuration info:
        self.config_info = ConfigInfo()

        # Access to common functionality:
        self.utils = Utilities()

        if user is None:
            user = self.config_info.default_user

        if host is None:
            host = self.config_info.default_host

        if db_pwd is None:
            db_pwd = self.utils.get_db_pwd(host, unittests=unittests)
        elif db_pwd == True:
            db_pwd = self.utils.get_db_pwd(host, ask_user=True, unittests=unittests)

        if target_db is None:
            target_db = self.config_info.canvas_db_aux

        self.target_db = target_db

        if num_to_keep is None:
            self.num_to_keep = BackupRemover.default_num_backups_to_keep
        else:
            self.num_to_keep = num_to_keep

        # Better name for tables to consider removing:
        tables_to_consider = tables

        # Unittests expect a db name in self.db:
        self.db = target_db

        self.db_obj = self.utils.log_into_mysql(user, db_pwd, db=target_db, host=host)

        self.utils.setup_logging(logging_level)
        if unittests:
            self.db_name = target_db
            return

        # Get names of all tables in the target_db
        all_tables = self.utils.get_existing_tables_in_dir(self.db_obj,
                                                           return_all=True,
                                                           target_db=target_db)

        # If caller specified only specific tables/backup tables to
        # remove, weed out all table names not in caller's list:
        all_tables_to_consider = self.find_tables_to_consider(
            all_tables, tables_to_consider)

        self.remove_old_backups(all_tables_to_consider)
        self.close()

    #-------------------------
    # find_tables_to_consider
    #--------------

    def find_tables_to_consider(self, table_nm_list, specific_tables):
        '''
        Given a mixed list of root and backup table names, and a list of
        specific table names, return a new list of effectively wanted tables.

        If a table root name occurs in specific_tables, then all of that
        root name's backup versions in table_nm_list are retained.

        For backup table names in specific_tables, only those backup names
        are retained, not the others of the same root.

        If specific_tables is empty or None, table_nm_list is returned.
@param table_nm_list: list of all aux tables and backups @type table_nm_list: [str] @param specific_tables: possibly empty list of specific tables to remove. If empty list or none: return the full table_nm_list @type specific_tables: [str] ''' if specific_tables is None: return table_nm_list if len(specific_tables) == 0: return table_nm_list # Collect all root names in specific_tables: roots = [ tbl_name for tbl_name in specific_tables if self.utils.is_aux_table(tbl_name) ] # Remember root table names, so we can # keep all their backup names in the returned list: new_all = table_nm_list.copy() for tbl_nm in table_nm_list: # Is it a backup name whose root table name # is in the list to consider? if self.utils.is_backup_name(tbl_nm) and \ self.utils.get_root_name(tbl_nm) in roots: # Keep the backup name: continue # At this pt the table must explicitly be # in the keep list to survive: if tbl_nm not in specific_tables: new_all.remove(tbl_nm) return new_all #------------------------- # remove_old_backups #-------------- def remove_old_backups(self, all_table_names): ''' Given a list of aux table names, find the backup tables among them. Then delete all but the newest self.num_to_keep backup tables from the database. @param all_table_names: list of table names to consider removing. @type all_table_names: [str] ''' # Map table name --> list of its backups backup_tables = {} for tbl_nm in all_table_names: if self.utils.is_aux_table(tbl_nm): # Found root of an official table name. # (as apposed to a backup table) if tbl_nm not in backup_tables: # Initialize an entry for this tbl: backup_tables[tbl_nm] = [] continue # Is it a backup table name? if self.utils.is_backup_name(tbl_nm): # Get root of the backup table name: root_nm = self.utils.get_root_name(tbl_nm) # Add to dict: try: backup_tables[root_nm].append(tbl_nm) except KeyError: # Hadn't seen this root table yet: backup_tables[root_nm] = [tbl_nm] # Go through dict; for each table, sort its existing backup tables # by their dates, which are part of the names: # We'll modify backup_tables in the following # loop, so use a copy: backup_tables_copy = backup_tables.copy() for (tbl_nm, backup_names_list) in backup_tables_copy.items(): # Sort the backup tbl names by their date, newest first: sorted_backups = sorted(backup_names_list, key=lambda name: self.get_date(name), reverse=True) backup_tables[tbl_nm] = sorted_backups # Go through, and remove all but the first num_to_keep # backup tables in each list: for backup_nm_list in backup_tables.values(): # Chop off all names after the first num_to_keep names, # unless the name list is shorter than num_to_keep, # in which case we keep what we have: if len(backup_nm_list) <= self.num_to_keep: continue for to_delete in backup_nm_list[self.num_to_keep:]: self.db_obj.dropTable(to_delete) self.utils.log_info(f"Removing old backup table {to_delete}") self.utils.log_info( f"In {self.target_db}: no more than {self.num_to_keep} backup tables left per table." ) #------------------------- # get_date #-------------- def get_date(self, backup_tbl_nm): (_root, date_str, _dateobj) = self.utils.backup_table_name_components(backup_tbl_nm) return date_str #------------------------- # close #-------------- def close(self): try: self.db_obj.close() except Exception: pass
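# Hypothetical usage sketch for BackupRemover; the table name and num_to_keep
# value are placeholders. Construction does all the work: it logs into MySQL,
# groups backup tables by root name, and drops all but the newest ones.
if __name__ == '__main__':
    # Keep only the three newest backups of the Terms aux table:
    BackupRemover(num_to_keep=3, tables=['Terms'])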
# Used for plotting. CATEGORIES_50_NICO = [ 'book', 'book', 'book', 'book', 'book', 'book', 'book', 'book', 'book', 'book', 'hairbrush', 'hairbrush', 'hairbrush', 'hairbrush', 'hairbrush', 'hairbrush', 'hairbrush', 'hairbrush', 'hairbrush', 'hairbrush', 'hair clip', 'hair clip', 'hair clip', 'hair clip', 'hair clip', 'hair clip', 'hair clip', 'hair clip', 'hair clip', 'hair clip', 'flower', 'flower', 'flower', 'flower', 'flower', 'flower', 'flower', 'flower', 'flower', 'flower', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass', 'glass' ] # ------------------------------------ Initialization -------------------------------------------------------------- rgwr = GammaGWR() utils = Utilities() learning = Learning() args = utils.parse_arguments() # Get data. original_data = utils.load_data(args.dataset).values original_data_normalized = utils.normalize_data(original_data, DATA_DIMENSION) # original_data_normalized = original_data # Get training data. train_data = original_data_normalized[np.in1d( original_data_normalized[:, SESSION_COLUMN], TRAIN_SESSIONS)] train_data = train_data[np.in1d(train_data[:, INSTANCE_COLUMN], TRAIN_INSTANCES)] train_data = utils.reduce_number_of_frames(train_data, FACTOR_FRAMES)