Beispiel #1
0
def get_x_matrix(var_to_calc, restricted=False):
    """
    Build the regressor (design) matrix for one variant.

    The variant's data is read from four consecutive columns of
    data/4problem.csv, starting at column (var_to_calc - 1) * 4:
    x_2, x_3, x_4 and y_1, in that order.

    :param var_to_calc: 1-based variant number
    :param restricted: if True only the constant and x_2 are included,
        otherwise the constant, x_2, x_3 and x_4
    :return: 2-D numpy array with one row per observation; the first
        column is filled with ones
    """
    first_column = (var_to_calc - 1) * 4

    # All four columns are read, in file order, for either model.
    x_2, x_3, x_4, y_1 = (
        read_column_from_csv(column_number=first_column + offset,
                             file='data/4problem.csv')
        for offset in range(4)
    )

    # y_1 only supplies the number of observations for the column of ones.
    intercept = numpy.ones((len(y_1), 1), dtype=float)

    regressors = [x_2] if restricted else [x_2, x_3, x_4]

    # column_stack turns every 1-D sequence into a column before joining.
    return numpy.column_stack([intercept] + regressors)
Beispiel #2
0
def ls(var_to_calc, ridge=0, restricted=False):
    """
    Estimate regression coefficients by (ridge-regularised) least squares.

    Computes (X^T * X + ridge * I)^-1 * X^T * Y, where X is the matrix
    returned by get_x_matrix and Y is the fourth column of the variant's
    block in data/4problem.csv.

    The penalty matrix is sized from X, so the ridge coefficient is
    honoured for both the unrestricted and the restricted model. The
    penalty is added to every diagonal element, i.e. to all coefficients
    including the one for the constant column.

    :param var_to_calc: variant from 1 to 10
    :param ridge: ridge coefficient; 0 gives ordinary least squares
    :param restricted: model with restriction or not
    :return: array of LS(least squares) coefficients, one per column of X
    """

    x = get_x_matrix(var_to_calc, restricted)

    # (X^T * X) + ridge * I
    # numpy.dot is a matrix multiplication; the identity matrix matches the
    # number of regressors instead of assuming four of them.
    x_step1 = numpy.dot(x.T, x) + float(ridge) * numpy.eye(x.shape[1])

    # (X^T * X + ridge * I)^-1
    x_step2 = linal.inv(x_step1)

    # (X^T * X + ridge * I)^-1 * X^T
    x_step3 = numpy.dot(x_step2, x.T)

    # (X^T * X + ridge * I)^-1 * X^T * Y
    y_1 = read_column_from_csv(column_number=3 + (var_to_calc - 1) * 4,
                               file='data/4problem.csv')
    y_1 = numpy.array([y_1]).T  # make Y a column vector
    coefficient_vector = numpy.dot(x_step3, y_1)

    # The product is a single column; return it as a flat array.
    return coefficient_vector.T[0]
Beispiel #3
0
        for i in range(len(prev_centers)):
            prev_centers[i] = list(centers[i])  # we must copy this way.

        for i in range(len(centers)):  # calculating new centers
            sx = 0
            sy = 0
            for j in range(len(clusters[i])):
                sx += clusters[i][j][0]
                sy += clusters[i][j][1]
            if len(clusters[i]
                   ) > 0:  # situation when cluster is empty is possible
                centers[i][0] = round(sx / len(clusters[i]), 5)
                centers[i][1] = round(sy / len(clusters[i]), 5)

        is_continue = False
        for i in range(len(centers)):  # decide must we continue or not
            if centers[i] not in prev_centers:
                is_continue = True
                break

    return clusters


# The columns for variant `v_number` sit next to each other in the file:
# the first one is read into X and the following one into Y.
first_column = (v_number - 1) * 2
X = read_column_from_csv(first_column, 'data/7problem.csv')
Y = read_column_from_csv(first_column + 1, 'data/7problem.csv')

# Plot the raw points first, then the clustering result for k = 2, 3 and 4.
draw_plot(X, Y)
for k in (2, 3, 4):
    clusters = clustering(k, X, Y)
    draw_clusters(clusters)
Beispiel #4
0
import pandas as pd
import statsmodels.formula.api as smf
import math
from lib import rss, ess
from scipy.stats import f, norm, chi
import numpy as np

from lib import mk_data_var, read_column_from_csv

# TODO: set this to your own variant number
v_number = 1

mk_data_var(v_number)

# All columns below are read from the data file named after the variant.
data_file = 'data/6problem_{}.csv'.format(v_number)

# Columns 0, 1, 3 and 4 are read in this order; column 2 is not used here.
class_1, class_2, sex, survived = (
    read_column_from_csv(column, data_file, type='f')
    for column in (0, 1, 3, 4)
)

df = pd.DataFrame({
    "class_1": class_1,
    "class_2": class_2,
    "sex": sex,
Beispiel #5
0
    d3=1, если квартира трёхкомнатная, 0 иначе;
    d4=1, если квартира четырёхкомнатная, 0 иначе;
    dist расстояние от центра Москвы (в км);
    walk=1,  если до метро можно быстро дойти пешком, 0 иначе;
    brick=1,  если дом кирпичный, 0 иначе;
    bal=1,  если есть балкон, 0 иначе;
    floor=0,  если этаж первый или последний, 1 иначе.
"""

variation = 1

# TODO: build your own .csv file from the data Furmanov sent. Remove every row
# that contains empty cells. The empty rows differ between variants, so remove
# only the ones that are empty in your own variant. After that, save the
# result as data/5problem.csv.

# Column indices advance by 11 per variant; the first eight columns of the
# selected variant are read here, in file order.
first_column = (variation - 1) * 11
bal, brick, d2, d3, d4, dist, floor, price = (
    read_column_from_csv(column_number=first_column + offset,
                         file='data/5problem.csv')
    for offset in range(8)
)
totsp = read_column_from_csv(column_number=8 + (variation - 1) * 11,
Beispiel #6
0
 методом наименьших квадратов""")
print()
# LS (least squares) estimates of the coefficients
# b1 b2 b3 b4
coefficient_vector = ls(variation)
print("Коэфициенты b1 b2 b3 b4:")
print(coefficient_vector)
print()

##############################################################
######## Test the overall significance of the regression #####
##############################################################

print("2. Проверьте значимость регрессии в целом")
print()
# The four columns of the selected variant: regressors x_2..x_4 and y_1.
x_2 = read_column_from_csv(column_number=0 + (variation - 1) * 4,
                           file='data/4problem.csv')
x_3 = read_column_from_csv(column_number=1 + (variation - 1) * 4,
                           file='data/4problem.csv')
x_4 = read_column_from_csv(column_number=2 + (variation - 1) * 4,
                           file='data/4problem.csv')
y_1 = read_column_from_csv(column_number=3 + (variation - 1) * 4,
                           file='data/4problem.csv')

# Fitted value b1 + b2*x_2 + b3*x_3 + b4*x_4 for every observation.
# Iterating over the columns themselves (rather than a fixed count of 40)
# keeps this correct for any number of rows in the data file.
y_estimation = [
    coefficient_vector[0] + coefficient_vector[1] * x_2_i +
    coefficient_vector[2] * x_3_i + coefficient_vector[3] * x_4_i
    for x_2_i, x_3_i, x_4_i in zip(x_2, x_3, x_4)
]

# Sums of squares of the unrestricted ("ur") model, as computed by the
# project's ess / rss helpers.
ess_ur = ess(y_1, y_estimation)
rss_ur = rss(y_1, y_estimation)
Beispiel #7
0
 методом наименьших квадратов""")
print()
# LS (least squares) estimates of the coefficients
# b1 b2 b3 b4
coefficient_vector = ls(variation)
print("Коэфициенты b1 b2 b3 b4:")
print(coefficient_vector)
print()

##############################################################
######## Test the overall significance of the regression #####
##############################################################

print("2. Проверьте значимость регрессии в целом")
print()
# The four columns of the selected variant: regressors x_2..x_4 and y_1.
x_2 = read_column_from_csv(column_number=0 + (variation - 1) * 4,
                           file=file_path)
x_3 = read_column_from_csv(column_number=1 + (variation - 1) * 4,
                           file=file_path)
x_4 = read_column_from_csv(column_number=2 + (variation - 1) * 4,
                           file=file_path)
y_1 = read_column_from_csv(column_number=3 + (variation - 1) * 4,
                           file=file_path)

# Fitted value b1 + b2*x_2 + b3*x_3 + b4*x_4 for every observation.
# Iterating over the columns themselves (rather than a fixed count of 40)
# keeps this correct for any number of rows in the data file.
y_estimation = [
    coefficient_vector[0] + coefficient_vector[1] * x_2_i +
    coefficient_vector[2] * x_3_i + coefficient_vector[3] * x_4_i
    for x_2_i, x_3_i, x_4_i in zip(x_2, x_3, x_4)
]

# Sums of squares of the unrestricted ("ur") model, as computed by the
# project's ess / rss helpers.
ess_ur = ess(y_1, y_estimation)
rss_ur = rss(y_1, y_estimation)