import scipy.io
import numpy as np

from util.logger import LoggerBuilder

logger = LoggerBuilder().with_name("file_reader").build()


def read_matlab_file(path):
    dataset = scipy.io.loadmat(path)
    logger.debug("Loaded matlab file %s", path)
    for key, value in dataset.items():
        # Skip scipy.io.loadmat service entries such as '__header__' and '__version__'.
        if '__' not in key:
            logger.debug("\tFound entry '%s' with size %s", key,
                         value.shape if hasattr(value, 'shape') else len(value))
    return dataset


def parse_dataset_file(dataset_file_path, expected_labels=None):
    if ".txt" in dataset_file_path:
        dataset = read_dataset_txt_file(dataset_file_path)
        # Prepend a bias column of ones; the remaining columns hold the raw features.
        features = np.ones(dataset.shape)
        features[:, 1:] = dataset[:, 0:-1]
        results = dataset[:, -1:]
        return dataset, features, results
    if ".mat" in dataset_file_path:
        dataset = read_matlab_file(dataset_file_path)
        if expected_labels is None:
            return dataset
        dataset_entries = {}
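
# Usage sketch for read_matlab_file (illustrative, not part of the original
# module): writes a throwaway .mat file with scipy.io.savemat so the example is
# self-contained; the temp path and the 'X' key are assumptions for demonstration.
if __name__ == "__main__":
    import os
    import tempfile

    tmp_path = os.path.join(tempfile.gettempdir(), "file_reader_demo.mat")
    scipy.io.savemat(tmp_path, {"X": np.random.rand(5, 2)})
    demo_dataset = read_matlab_file(tmp_path)  # logs each non-service entry and its shape
    print(demo_dataset["X"].shape)             # -> (5, 2)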
import numpy as np
import scipy.optimize as op

from abstract_lab import Lab
from ml_6 import util, graph
from ml_6.k_mean import k_mean_algorithm, hierarchical_clustering
from ml_6.util import translate_mat_to_compressed_img, read_image
from util.logger import LoggerBuilder
from util.file.matlab_file_reader import read_matlab_file

logger = LoggerBuilder().with_name("lab6").build()

DATA_PATH_1 = "./ml_6/resources/ex6data1.mat"
DATA_PATH_2 = "./ml_6/resources/bird_small.mat"
INIT_CENTERS_COUNT = 3
ITERATIONS_COUNT = 100
BIRDSMALL_CLASSES_COUNT = 16
WOLF_CLASSES_COUNT = 16
BIRDSMALL_IMAGE_PATH = "./ml_6/images/bird_small.jpg"
WOLF_IMAGE_PATH = "./ml_6/images/wolf.jpg"


class SixthLab(Lab):
    def __init__(self):
        pass

    def run_lab(self):
        # (1)
        dataset = read_matlab_file(DATA_PATH_1)
        x = dataset.get("X")
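
# The lab delegates clustering to k_mean_algorithm (ml_6.k_mean), whose body is
# not shown here. Below is a minimal, self-contained sketch of the classic
# K-means iteration such a function typically performs; the function name and
# the random initialization are illustrative assumptions, not the lab's code.
def k_means_sketch(x, centers_count=INIT_CENTERS_COUNT, iterations=ITERATIONS_COUNT):
    rng = np.random.default_rng(0)
    # Initialize centroids from random distinct samples.
    centroids = x[rng.choice(x.shape[0], centers_count, replace=False)].astype(float)
    for _ in range(iterations):
        # Assignment step: nearest centroid index for every sample.
        distances = np.linalg.norm(x[:, None, :] - centroids[None, :, :], axis=2)
        labels = distances.argmin(axis=1)
        # Update step: each centroid moves to the mean of its assigned samples.
        for k in range(centers_count):
            if np.any(labels == k):
                centroids[k] = x[labels == k].mean(axis=0)
    return centroids, labels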
import time
from functools import wraps

from util.logger import LoggerBuilder

logger = LoggerBuilder().with_name("profiler").build()


def timed(func):
    """This decorator logs the execution time of the decorated function."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time_ns()
        result = func(*args, **kwargs)
        end = time.time_ns()
        logger.debug("Method '{}' ran in {} ms".format(func.__name__,
                                                       round((end - start) / 1000000, 2)))
        # Return the result of the timed call; do not invoke func a second time.
        return result
    return wrapper
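
# Usage sketch (illustrative, not part of the original module): the logger built
# above must be configured at DEBUG level for the timing record to appear.
@timed
def slow_sum(n):
    return sum(range(n))

if __name__ == "__main__":
    slow_sum(1_000_000)  # logs e.g. "Method 'slow_sum' ran in 25.3 ms"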
import numpy as np
import pandas
import scipy.optimize as op
from sklearn.preprocessing import PolynomialFeatures

from util.logger import LoggerBuilder
from util.timed import timed

EPSILON = 1e-5

logger = LoggerBuilder().with_name("logistic_regression").build()


def sigmoid(z):
    return 1. / (1 + np.exp(-z))


def calc_cost_function(x, y, theta, learning_rate, m):
    z = x @ theta
    h = sigmoid(z)
    # calc_loss already averages over the m samples, so no further division by m.
    cost = calc_loss(h, y)
    gradient = np.dot(x.T, (h - y)) / m
    # One gradient-descent step.
    theta = theta - learning_rate * gradient
    return cost, gradient, theta


def calc_loss(h, y):
    # Cross-entropy loss; EPSILON guards against log(0).
    return (-y * np.log(h + EPSILON) - (1 - y) * np.log(1 - h + EPSILON)).mean()
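
# Training sketch (illustrative, not part of the original module): runs plain
# gradient descent with calc_cost_function on a tiny synthetic dataset. The
# data, learning rate, and iteration count are assumptions for demonstration.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    m = 100
    x = np.hstack([np.ones((m, 1)), rng.normal(size=(m, 1))])  # bias column + one feature
    y = (x[:, 1:] > 0).astype(float)                           # linearly separable labels
    theta = np.zeros((2, 1))
    for _ in range(500):
        cost, gradient, theta = calc_cost_function(x, y, theta, 0.1, m)
    logger.info("final cost: %.4f", cost)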
import re

from ml_5.external import PorterStemmer
from ml_5.util import convert_to_features
from util.file.data_loader import read_file
from util.logger import LoggerBuilder

logger = LoggerBuilder().with_name("text_processing").build()


def is_spam(name, file_path, model, vocabulary, vocabulary_size):
    email = process_text(read_file(file_path), vocabulary)
    email_features = convert_to_features(email, vocabulary_size)
    logger.info('%s is %s', name, 'spam' if model.predict(email_features) == 1 else 'not spam')


def process_text(content, vocabulary):
    content = content.lower()
    # Strip HTML tags.
    content = re.compile('<[^<>]+>').sub(' ', content)
    # Normalize numbers, URLs, e-mail addresses, and dollar signs to fixed tokens.
    content = re.compile('[0-9]+').sub(' number ', content)
    content = re.compile(r'(http|https)://[^\s]*').sub(' httpaddr ', content)
    content = re.compile(r'[^\s]+@[^\s]+').sub(' emailaddr ', content)
    content = re.compile('[$]+').sub(' dollar ', content)
    # Split on punctuation and whitespace; '-' sits at the end of the class so it
    # is a literal hyphen rather than an accidental '.-:' character range.
    content = re.split(r'[ @$/#.:&*+=\[\]?!(){},">_<;%\n\r-]', content)
    content = [word for word in content if len(word) > 0]

    # Stem the email contents word by word.
    stemmer = PorterStemmer()
    processed_content = []
    word_indices = []
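
# Illustrative check of the normalization steps above (process_text itself is
# truncated here, so the substitutions are replayed inline on a sample string).
if __name__ == "__main__":
    sample = "Buy now at http://example.com for $100! Mail me@example.com"
    s = sample.lower()
    s = re.compile('[0-9]+').sub(' number ', s)
    s = re.compile(r'(http|https)://[^\s]*').sub(' httpaddr ', s)
    s = re.compile(r'[^\s]+@[^\s]+').sub(' emailaddr ', s)
    s = re.compile('[$]+').sub(' dollar ', s)
    print(s)  # -> "buy now at  httpaddr  for  dollar  number ! mail  emailaddr "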