Ejemplo n.º 1
0
import scipy.io
import numpy as np

from util.logger import LoggerBuilder

logger = LoggerBuilder().with_name("file_reader").build()


def read_matlab_file(path):
    """Load a MATLAB ``.mat`` file and log a summary of its contents.

    Every entry whose key does not contain a double underscore is reported
    at DEBUG level together with its ``shape`` (or its ``len`` when the value
    has no ``shape`` attribute).

    :param path: location of the ``.mat`` file, passed to ``scipy.io.loadmat``.
    :return: the dictionary produced by ``scipy.io.loadmat``, unmodified.
    """
    dataset = scipy.io.loadmat(path)
    logger.debug("Load matlab file %s", path)

    for key in dataset:
        # Keys containing '__' (e.g. '__header__') are not reported.
        if '__' in key:
            continue
        value = dataset[key]
        size = value.shape if hasattr(value, 'shape') else len(value)
        logger.debug('\tFound entry \'%s\' with size %s', key, size)
    return dataset


def parse_dataset_file(dataset_file_path, expected_labels=None):
    if ".txt" in dataset_file_path:
        dataset = read_dataset_txt_file(dataset_file_path)
        features = np.ones(dataset.shape)
        features[:, 1:] = dataset[:, 0:-1]
        results = dataset[:, -1:]
        return dataset, features, results
    if ".mat" in dataset_file_path:
        dataset = read_matlab_file(dataset_file_path)
        if expected_labels is None:
            return dataset
        dataset_entries = {}
Ejemplo n.º 2
0
import numpy as np
import scipy.optimize as op

from abstract_lab import Lab
from ml_6 import util, graph
from ml_6.k_mean import k_mean_algorithm, hierarchical_clustering
from ml_6.util import translate_mat_to_compressed_img, read_image
from util.logger import LoggerBuilder
from util.file.matlab_file_reader import read_matlab_file

# Module-level logger for this lab, built with the name "lab6".
logger = LoggerBuilder().with_name("lab6").build()

# MATLAB data files, given as paths relative to the current working directory.
DATA_PATH_1 = "./ml_6/resources/ex6data1.mat"
DATA_PATH_2 = "./ml_6/resources/bird_small.mat"
# NOTE(review): presumably the number of initial cluster centers and the
# iteration limit for the clustering run — confirm against their usage.
INIT_CENTERS_COUNT = 3
ITERATIONS_COUNT = 100
# NOTE(review): presumably the number of clusters (colours) used for each
# image — confirm against their usage.
BIRDSMALL_CLASSES_COUNT = 16
WOLF_CLASSES_COUNT = 16

# Image files, given as paths relative to the current working directory.
BIRDSMALL_IMAGE_PATH = "./ml_6/images/bird_small.jpg"
WOLF_IMAGE_PATH = "./ml_6/images/wolf.jpg"


class SixthLab(Lab):
    def __init__(self):
        """Create the lab; no instance state is set up here."""
        # NOTE(review): the base class initialiser is not invoked — confirm
        # that Lab does not require it.
        pass

    def run_lab(self):
        # (1)
        dataset = read_matlab_file(DATA_PATH_1)
        x = dataset.get("X")
Ejemplo n.º 3
0
import time

from functools import wraps

from util.logger import LoggerBuilder

logger = LoggerBuilder().with_name("profiler").build()


def timed(func):
    """Decorate *func* so that every call logs its execution time.

    The wrapped function is invoked exactly once per call. Its elapsed time
    in milliseconds (rounded to two decimals) is written to the module logger
    at DEBUG level, and its return value is passed through unchanged.
    Exceptions raised by *func* propagate to the caller without a log entry.

    :param func: the callable to measure.
    :return: a wrapper with the same signature and metadata as *func*.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter_ns is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start = time.perf_counter_ns()
        result = func(*args, **kwargs)
        elapsed_ms = round((time.perf_counter_ns() - start) / 1000000, 2)
        logger.debug("Method '%s' ran in %s ms", func.__name__, elapsed_ms)
        # Return the result of the single, timed invocation; calling func a
        # second time would repeat its side effects and double the runtime.
        return result

    return wrapper
Ejemplo n.º 4
0
import numpy as np
import pandas
import scipy.optimize as op

from util.logger import LoggerBuilder
from sklearn.preprocessing import PolynomialFeatures
from util.timed import timed

# Small offset added to the arguments of np.log in calc_loss, keeping them
# away from zero when a prediction is exactly 0 or 1.
EPSYLON = 1e-5

# Module-level logger, built with the name "logistic_regression".
logger = LoggerBuilder().with_name("logistic_regression").build()


def sigmoid(z):
    """Return the logistic function ``1 / (1 + e**(-z))`` of *z*.

    Works element-wise on NumPy arrays and also on plain numeric scalars.

    :param z: a number or a NumPy array.
    :return: the sigmoid of *z*, with the same shape as *z*.
    """
    # np.exp saturates to inf for very negative z instead of raising
    # OverflowError the way Python's float power does for scalar input;
    # the quotient then evaluates cleanly to 0.0, so the overflow warning
    # carries no information here.
    with np.errstate(over="ignore"):
        return 1. / (1 + np.exp(-z))


def calc_cost_function(x, y, theta, learning_rate, m):
    """Evaluate the logistic-regression cost and take one gradient step.

    :param x: feature matrix; multiplied with *theta* to obtain the logits.
    :param y: target values, compared against the sigmoid predictions.
    :param theta: current parameter vector.
    :param learning_rate: step size applied to the gradient.
    :param m: divisor used to average the gradient
        (presumably the number of samples — confirm with callers).
    :return: tuple ``(cost, gradient, theta)`` where ``theta`` is the
        parameter vector after the update step.
    """
    h = sigmoid(x @ theta)
    # calc_loss already averages the per-sample losses, so its result is the
    # cost; dividing by m once more would shrink it by a further factor of m.
    cost = calc_loss(h, y)
    gradient = np.dot(x.T, (h - y)) / m
    theta = theta - learning_rate * gradient
    return cost, gradient, theta


def calc_loss(h, y):
    """Return the mean binary cross-entropy between *h* and *y*.

    EPSYLON is added inside both logarithms before they are evaluated.

    :param h: predicted values.
    :param y: target values.
    :return: the mean of the element-wise loss.
    """
    positive_term = -y * np.log(h + EPSYLON)
    negative_term = (1 - y) * np.log(1 - h + EPSYLON)
    return (positive_term - negative_term).mean()
Ejemplo n.º 5
0
import re

from ml_5.external import PorterStemmer
from ml_5.util import convert_to_features
from util.file.data_loader import read_file
from util.logger import LoggerBuilder

logger = LoggerBuilder().with_name("text_processing").build()


def is_spam(name, file_path, model, vocabulary, vocabulary_size):
    """Classify the e-mail stored at *file_path* and log the verdict.

    The file content is run through ``process_text``, converted with
    ``convert_to_features`` and handed to ``model.predict``; a prediction
    equal to 1 is reported as spam. Nothing is returned.

    :param name: label used for the e-mail in the log message.
    :param file_path: location of the e-mail, passed to ``read_file``.
    :param model: object exposing ``predict``.
    :param vocabulary: vocabulary passed to ``process_text``.
    :param vocabulary_size: size passed to ``convert_to_features``.
    """
    raw_text = read_file(file_path)
    email = process_text(raw_text, vocabulary)
    email_features = convert_to_features(email, vocabulary_size)
    prediction = model.predict(email_features)
    verdict = 'spam' if prediction == 1 else 'not spam'
    logger.info('%s is %s', name, verdict)


def process_text(content, vocabulary):
    content = content.lower()
    content = re.compile('<[^<>]+>').sub(' ', content)
    content = re.compile('[0-9]+').sub(' number ', content)
    content = re.compile('(http|https)://[^\s]*').sub(' httpaddr ', content)
    content = re.compile('[^\s]+@[^\s]+').sub(' emailaddr ', content)
    content = re.compile('[$]+').sub(' dollar ', content)
    content = re.split('[ @$/#.-:&*+=\[\]?!(){},' '">_<;%\n\r]', content)
    content = [word for word in content if len(word) > 0]

    # Stem the email contents word by word
    stemmer = PorterStemmer()
    processed_content = []
    word_indices = []