Esempio n. 1
0
def cross_validate(data, k=3, cons=True):
    """
    Cross-validate the line-specific linear learner over a lambda grid.

    For every value in ``numpy.arange(0.1, 1, 0.2)`` the data is split with
    ``cv_data_split(data, k, cons)``. Each resulting fold serves once as the
    test set while all remaining folds are flattened into the training set.
    The mean of the per-fold mean absolute errors is printed per lambda.
    Nothing is returned.
    """
    for reg in numpy.arange(0.1, 1, 0.2):
        folds = cv_data_split(data, k, cons)  # a new split for every lambda
        fold_errors = []
        for held_out, test_fold in enumerate(folds):
            # Flatten every fold except the held-out one into training data.
            training = []
            for pos, fold in enumerate(folds):
                if pos != held_out:
                    training.extend(fold)

            model = LineSpecificLearner(LinearLearner(reg))(training)

            predicted, actual = [], []
            for sample in test_fold:
                predicted.append(model(sample))
                actual.append(
                    lpputils.tsdiff(sample[ARR_IDX], sample[DEP_IDX]))
            fold_errors.append(mean_absolute_error(actual, predicted))
        print("Lambda : {0} , Errors: {1}".format(reg,
                                                  np.mean(fold_errors)))
Esempio n. 2
0
def visualize(train_data, _month, day_s, day_e):
    """
    Print and plot the average travel time per departure hour.

    Rows of ``train_data`` whose departure (``row[DEP_IDX]``) falls in month
    ``_month`` and on a day of month inside ``range(day_s, day_e)`` (so
    ``day_e`` is exclusive) are grouped by departure hour. The 24 per-hour
    averages of ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])`` are printed
    and passed to ``plt.plot`` / ``plt.show``.

    Each row is parsed exactly once; the day filter is a ``range`` membership
    test rather than a full pass over the data per requested day.
    """
    days = range(day_s, day_e)
    times = np.zeros(24)
    cnts = np.zeros(24)
    for row in train_data:
        date = lpputils.parsedate(row[DEP_IDX])
        if date.month == _month and date.day in days:
            times[date.hour] += lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])
            cnts[date.hour] += 1

    # The tiny constant avoids a division by zero for hours without trips;
    # those hours come out as 0.0.
    norm_times = [
        float(total) / (float(count) + 0.0000000000001)
        for total, count in zip(times, cnts)
    ]
    print(norm_times)
    plt.plot(norm_times, label='the data')
    plt.show()
Esempio n. 3
0
def zgradi_matrike(linija, training):
    """
    Build the feature matrix (plus targets or row metadata) for ``linija``.

    Holiday dates are read from ``prazniki_in_dela_prosti_dnevi.csv``: the
    header row is skipped and, for every other row, the text before the first
    ``;`` of the first field is parsed as ``%d.%m.%Y``. The resulting list of
    dates is handed to ``napolni_x`` together with each data row.

    Args:
        linija: iterable of data rows. ``row[-3]`` and ``row[-1]`` are the
            two timestamps used below and ``row[3]`` is reported as the route.
        training: selects what is returned (see below).

    Returns:
        When ``training`` is true: ``(X, Y)`` where ``X`` is
        ``linear.append_ones`` applied to the stacked feature rows and ``Y``
        holds ``tsdiff(parsedate(row[-3]), parsedate(row[-1]))`` per row.
        Otherwise: ``(route, dejanski_cas, originalen_datum, X)`` with the
        raw ``row[3]``, ``row[-1]`` and ``row[-3]`` values per row.
    """
    # The context manager closes the holiday file once it has been read.
    with open("prazniki_in_dela_prosti_dnevi.csv", "rt",
              encoding="latin1") as holiday_file:
        branje = csv.reader(holiday_file)
        next(branje)  # skip the header row
        prazniki = [
            datetime.datetime.strptime(vrstica[0].split(";", 1)[0],
                                       "%d.%m.%Y").date()
            for vrstica in branje
        ]

    if training:
        x = []
        y = []
        for d in linija:
            x.append(napolni_x(d, prazniki))
            y.append(
                lpputils.tsdiff(lpputils.parsedate(d[-3]),
                                lpputils.parsedate(d[-1])))
        X = linear.append_ones(np.array(x))
        Y = np.array(y)
        return X, Y

    x = []
    originalen_datum = []
    dejanski_cas = []
    route = []
    for d in linija:
        originalen_datum.append(d[-3])
        dejanski_cas.append(d[-1])
        route.append(d[3])
        x.append(napolni_x(d, prazniki))
    X = linear.append_ones(np.array(x))
    return route, dejanski_cas, originalen_datum, X
Esempio n. 4
0
def prepare_NN_data(data, test=False):
    """
    Convert raw rows into a sparse feature matrix and a target array.

    Every row is turned into a feature vector with ``parse_row``. Unless
    ``test`` is true, the target of a row is
    ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])``; with ``test`` set the
    returned target array stays empty.

    Returns a ``(X, y)`` tuple: a ``scipy.sparse.csr_matrix`` and a NumPy
    array.
    """
    features = []
    targets = []
    collect_targets = not test
    for record in data:
        features.append(parse_row(record))
        if collect_targets:
            targets.append(
                lpputils.tsdiff(record[ARR_IDX], record[DEP_IDX]))

    return scipy.sparse.csr_matrix(features), np.array(targets)
Esempio n. 5
0
def model_to_csv(train_data, filename):
    """
    Dump one CSV line per training row to ``filename``.

    The file starts with the header ``line,month,day,hour,travel``. For every
    row the columns are ``row[2]``, the departure month, the departure
    weekday (``date.weekday()``), the departure hour and
    ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])``.

    The file is opened in a ``with`` block so it is flushed and closed even
    if a row fails to parse.
    """
    with open(filename, 'w') as out:
        out.write('line,month,day,hour,travel\n')
        for row in train_data:
            date = lpputils.parsedate(row[DEP_IDX])
            travel = lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])
            out.write('{0},{1},{2},{3},{4}\n'.format(
                row[2], date.month, date.weekday(), date.hour, travel))
Esempio n. 6
0
    def __call__(self, data):
        """
        Fit ``self.regressor`` on ``data`` and return a predictor around it.

        Each row contributes ``parse_row(row)`` as its feature vector and
        ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])`` as its target. The
        features are handed to the regressor as a ``csr_matrix``.
        """
        features, targets = [], []
        for record in data:
            features.append(parse_row(record))
            targets.append(lpputils.tsdiff(record[ARR_IDX], record[DEP_IDX]))

        self.regressor.fit(scipy.sparse.csr_matrix(features),
                           np.array(targets))
        print("Regressor has been fitted !")
        return RFRegressorPredictor(self.regressor)
Esempio n. 7
0
def bus_average(bus_id=None, data=None):
    """
    Build or query the normalised average travel time per bus.

    The mode is selected by ``bus_id``:

    * ``bus_id is None`` (build): every row of ``data`` adds its travel time
      ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])`` to a ``[total, count]``
      entry in the module-level ``buses`` dict, keyed by ``row[BUS_IDX]``.
      ``buses`` is then replaced by ``{bus: average / largest average}``.
      Returns ``None``.
    * otherwise (lookup): return the stored value for ``bus_id``. For an
      unknown bus the mean of all stored values is printed and returned.

    NOTE: after a build ``buses`` holds plain floats rather than
    ``[total, count]`` pairs, so the build step cannot be repeated on the
    same dict.
    """
    global buses
    if bus_id is None:
        for row in data:
            duration = lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])
            if row[BUS_IDX] not in buses:
                buses[row[BUS_IDX]] = [duration, 1]
            else:
                entry = buses[row[BUS_IDX]]
                # Accumulate the total; assigning here would keep only the
                # last trip and make the "average" meaningless.
                entry[0] += duration
                entry[1] += 1
        averages = {
            bus: float(total) / float(count)
            for bus, (total, count) in buses.items()
        }
        # Compute the normaliser once rather than once per bus.
        largest = max(averages.values())
        buses = {bus: avg / largest for bus, avg in averages.items()}
    else:
        try:
            return buses[bus_id]
        except KeyError:
            fallback = sum(buses.values()) / float(len(buses))
            print(fallback)
            return fallback
Esempio n. 8
0
def driver_average(driver_id=None, data=None):
    """
    Build or query the normalised average travel time per driver.

    The mode is selected by ``driver_id``:

    * ``driver_id is None`` (build): every row of ``data`` adds its travel
      time ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])`` to a
      ``[total, count]`` entry in the module-level ``drivers`` dict, keyed by
      ``row[DRV_IDX]``. ``drivers`` is then replaced by
      ``{driver: average / largest average}``. Returns ``None``.
    * otherwise (lookup): return the stored value for ``driver_id``. For an
      unknown driver the mean of all stored values is printed and returned.

    NOTE: after a build ``drivers`` holds plain floats rather than
    ``[total, count]`` pairs, so the build step cannot be repeated on the
    same dict.
    """
    global drivers
    if driver_id is None:
        for row in data:
            duration = lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])
            if row[DRV_IDX] not in drivers:
                drivers[row[DRV_IDX]] = [duration, 1]
            else:
                entry = drivers[row[DRV_IDX]]
                # Accumulate the total; assigning here would keep only the
                # last trip and make the "average" meaningless.
                entry[0] += duration
                entry[1] += 1
        averages = {
            driver: float(total) / float(count)
            for driver, (total, count) in drivers.items()
        }
        # Compute the normaliser once rather than once per driver.
        largest = max(averages.values())
        drivers = {
            driver: avg / largest for driver, avg in averages.items()
        }
    else:
        try:
            return drivers[driver_id]
        except KeyError:
            fallback = sum(drivers.values()) / float(len(drivers))
            print(fallback)
            return fallback
Esempio n. 9
0
    def __call__(self, data):
        """
        Fit a linear model on ``data`` and return a ``LinearRegClassifier``.

        Each row contributes ``parse_row(row)`` as features and
        ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])`` as target. The sparse
        feature matrix is passed through ``append_ones``; the parameter
        vector is the first element returned by ``fmin_l_bfgs_b`` when
        minimising ``cost_grad_linear`` from an all-zero start, with
        ``self.lambda_`` passed as an extra argument.
        """
        features, targets = [], []
        for record in data:
            features.append(parse_row(record))
            targets.append(lpputils.tsdiff(record[ARR_IDX], record[DEP_IDX]))

        design = append_ones(scipy.sparse.csr_matrix(features))
        target_vec = np.array(targets)

        start = numpy.zeros(design.shape[1])
        result = fmin_l_bfgs_b(cost_grad_linear,
                               x0=start,
                               args=(design, target_vec, self.lambda_))
        return LinearRegClassifier(result[0])
Esempio n. 10
0
def absolute_error(cas, napoved):
    """
    Return the absolute value of ``lpputils.tsdiff(cas, napoved)``.

    A ``cas`` of ``"?"`` yields ``0`` without calling ``tsdiff``.
    """
    if cas != "?":
        return abs(lpputils.tsdiff(cas, napoved))
    return 0
    def __call__(self, data):
        """
        Return an ``AverageTripClassifier`` built from the mean delay.

        The delay of a row is ``lpputils.tsdiff(row[-1], row[-3])``; the mean
        over all rows of ``data`` is handed to the classifier. Empty ``data``
        raises ``ZeroDivisionError``.
        """
        total = 0
        count = 0
        for row in data:
            total += lpputils.tsdiff(row[-1], row[-3])
            count += 1

        return AverageTripClassifier(total / count)
    return x


if __name__ == "__main__":
    f = gzip.open("train_pred.csv.gz", "rt")
    reader = csv.reader(f, delimiter="\t")
    next(reader)
    # ['Registration', 'Driver ID', 'Route', 'Route Direction', 'Route description', 'First station', 'Departure time', 'Last station', 'Arrival time']

    data = [d for d in reader]
    noLines = len(data)

    Y = numpy.zeros(noLines)
    X = numpy.zeros([noLines, 7])
    for i, line in enumerate(data):
        Y[i] = lpputils.tsdiff(line[-1], line[-3])  # določimo čas vožnje
        odhod = lpputils.parsedate(line[-1])
        X[i] = getAttributes(odhod)

    lr = linear.LinearLearner(lambda_=1.)
    napovednik = lr(X, Y)

    f = gzip.open("test_pred.csv.gz", "rt")
    test = csv.reader(f, delimiter="\t")
    next(reader)  # skip legend

    fo = open("naloga3.txt", "wt")
    for l in test:
        odhod = lpputils.parsedate(l[-3])
        nov_primer = numpy.array(getAttributes(odhod))
        #print(nov_primer)