def cross_validate(data, k=3, cons=True):
    """Report the cross-validated error for a grid of lambda values.

    For every candidate lambda in ``numpy.arange(0.1, 1, 0.2)`` the data is
    split with ``cv_data_split(data, k, cons)``.  Each resulting fold is held
    out once while a ``LineSpecificLearner`` wrapping ``LinearLearner(lambda)``
    is trained on the rows of all remaining folds.  The mean of the per-fold
    mean absolute errors is printed for each lambda; nothing is returned.
    """
    for lam in numpy.arange(0.1, 1, 0.2):
        fold_errors = []
        folds = cv_data_split(data, k, cons)  # a fresh split per lambda
        for held_out, test_rows in enumerate(folds):
            # Training set: the rows of every fold except the held-out one.
            train_rows = []
            for fold_no, fold in enumerate(folds):
                if fold_no != held_out:
                    train_rows.extend(fold)

            learner = LineSpecificLearner(LinearLearner(lam))
            model = learner(train_rows)

            predicted, actual = [], []
            for row in test_rows:
                predicted.append(model(row))
                actual.append(lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX]))
            fold_errors.append(mean_absolute_error(actual, predicted))
        print("Lambda : {0} , Errors: {1}".format(lam, np.mean(fold_errors)))
def visualize(train_data, _month, day_s, day_e):
    """Print and plot the mean travel time per departure hour.

    Rows whose departure timestamp (``row[DEP_IDX]``) lies in month
    ``_month`` and on a day of month in ``range(day_s, day_e)`` (``day_e``
    excluded) are grouped by departure hour.  For each of the 24 hours the
    summed ``lpputils.tsdiff(arrival, departure)`` is divided by the number
    of matching rows; the resulting list is printed and shown as a
    matplotlib line plot.

    Args:
        train_data: iterable of rows indexable by ``DEP_IDX`` and ``ARR_IDX``.
        _month: month number to select.
        day_s: first day of month to include.
        day_e: day of month at which to stop (exclusive).
    """
    total_time = np.zeros(24)
    trip_count = np.zeros(24)

    # One pass over the rows with a range test; the previous version rescanned
    # (and re-parsed) every row once for each day in the range.
    for row in train_data:
        departure = lpputils.parsedate(row[DEP_IDX])
        if departure.month == _month and day_s <= departure.day < day_e:
            total_time[departure.hour] += lpputils.tsdiff(
                row[ARR_IDX], row[DEP_IDX])
            trip_count[departure.hour] += 1

    # The tiny epsilon keeps hours without any matching trip at 0.0 instead
    # of raising a division-by-zero warning.
    norm_times = [
        float(total) / (float(count) + 0.0000000000001)
        for total, count in zip(total_time, trip_count)
    ]
    print(norm_times)

    plt.plot(norm_times, label='the data')
    plt.show()
def zgradi_matrike(linija, training):
    """Build the feature matrix (and targets) for the rows of one line.

    Holiday dates are loaded from ``prazniki_in_dela_prosti_dnevi.csv``: the
    header row is skipped and, for every following record, the text before
    the first ``;`` of the first CSV column is parsed as ``%d.%m.%Y``.  The
    holiday list is handed to ``napolni_x`` together with each row, and a
    column of ones is appended to the stacked features with
    ``linear.append_ones``.

    Args:
        linija: iterable of rows; ``row[-3]``, ``row[-1]`` and (when not
            training) ``row[3]`` are read.
        training: selects the return shape, see below.

    Returns:
        ``(X, Y)`` when ``training`` is truthy, where ``Y`` holds one
        ``lpputils.tsdiff`` value per row; otherwise
        ``(route, dejanski_cas, originalen_datum, X)`` where the three lists
        hold ``row[3]``, ``row[-1]`` and ``row[-3]`` of every row.
    """
    # The context manager closes the holiday file; it used to stay open.
    with open("prazniki_in_dela_prosti_dnevi.csv", "rt",
              encoding="latin1") as holiday_file:
        branje = csv.reader(holiday_file)
        next(branje)  # skip the header row
        prazniki = [
            datetime.datetime.strptime(
                record[0].split(";", 1)[0], "%d.%m.%Y").date()
            for record in branje
        ]

    if training:
        x = []
        y = []
        for row in linija:
            x.append(napolni_x(row, prazniki))
            # NOTE(review): other call sites in this file pass the raw
            # strings to tsdiff as (row[-1], row[-3]); here the values are
            # pre-parsed and the order is reversed -- confirm this is intended.
            y.append(
                lpputils.tsdiff(lpputils.parsedate(row[-3]),
                                lpputils.parsedate(row[-1])))
        X = linear.append_ones(np.array(x))
        Y = np.array(y)
        return X, Y

    x = []
    originalen_datum = []
    dejanski_cas = []
    route = []
    for row in linija:
        originalen_datum.append(row[-3])
        dejanski_cas.append(row[-1])
        route.append(row[3])
        x.append(napolni_x(row, prazniki))
    X = linear.append_ones(np.array(x))
    return route, dejanski_cas, originalen_datum, X
def prepare_NN_data(data, test=False):
    """Convert raw rows into a sparse feature matrix and a target vector.

    Every row is encoded with ``parse_row``.  Unless ``test`` is truthy, the
    target ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])`` is collected as
    well; with ``test`` set, the returned target array is empty.

    Returns:
        ``(X, y)`` -- a ``scipy.sparse.csr_matrix`` of features and a numpy
        array of targets.
    """
    features = []
    targets = []
    for row in data:
        features.append(parse_row(row))
        if test:
            continue  # test rows contribute no target
        targets.append(lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX]))
    return scipy.sparse.csr_matrix(features), np.array(targets)
def model_to_csv(train_data, filename):
    """Write one CSV record per row of ``train_data`` to ``filename``.

    The file starts with the header ``line,month,day,hour,travel``.  For
    each row the record holds ``row[2]``, then the month, weekday (the
    ``day`` column is ``date.weekday()``, not the day of month) and hour of
    the departure timestamp ``row[DEP_IDX]``, and finally
    ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])``.

    Args:
        train_data: iterable of rows.
        filename: path of the file to create or overwrite.
    """
    # The context manager guarantees the file is flushed and closed, even if
    # a row fails to parse; previously the handle was never closed.
    with open(filename, 'w') as out:
        out.write('line,month,day,hour,travel\n')
        for row in train_data:
            departure = lpputils.parsedate(row[DEP_IDX])
            travel = lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])
            out.write('{0},{1},{2},{3},{4}\n'.format(
                row[2], departure.month, departure.weekday(),
                departure.hour, travel))
def __call__(self, data):
    """Fit ``self.regressor`` on ``data`` and return a predictor for it.

    Each row is encoded with ``parse_row``; its target is
    ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])``.  The features are passed
    to the regressor as a ``scipy.sparse.csr_matrix`` and the targets as a
    numpy array.  A confirmation message is printed after fitting.

    Returns:
        ``RFRegressorPredictor`` wrapping the fitted regressor.
    """
    samples = [
        (parse_row(row), lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX]))
        for row in data
    ]
    features = scipy.sparse.csr_matrix([encoded for encoded, _ in samples])
    targets = np.array([target for _, target in samples])

    self.regressor.fit(features, targets)
    print("Regressor has been fitted !")
    return RFRegressorPredictor(self.regressor)
def bus_average(bus_id=None, data=None):
    """Build or query the per-bus relative travel-time table ``buses``.

    Training mode (``bus_id is None``): for every row of ``data`` the
    duration ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])`` is accumulated
    under ``row[BUS_IDX]`` in the module-level ``buses`` dict as
    ``[total, count]``.  ``buses`` is then replaced by a dict mapping each
    bus to its mean duration divided by the largest mean.  Training expects
    ``buses`` to hold ``[total, count]`` lists (e.g. to be empty); after
    training it holds floats, so training again without resetting it is not
    supported.  Returns ``None``.

    Lookup mode (``bus_id`` given): return ``buses[bus_id]``.  For an unknown
    id the mean of all stored values is printed and returned.

    Args:
        bus_id: key to look up, or ``None`` to train.
        data: iterable of rows; only used in training mode.
    """
    global buses

    if bus_id is not None:
        try:
            return buses[bus_id]
        except KeyError:
            fallback = sum(buses.values()) / float(len(buses))
            print(fallback)
            return fallback

    for row in data:
        duration = lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])
        key = row[BUS_IDX]
        if key in buses:
            # Accumulate the total; the old code overwrote it with the latest
            # duration, so the "average" was last duration / count.
            buses[key][0] += duration
            buses[key][1] += 1
        else:
            buses[key] = [duration, 1]

    averages = {
        bus: float(total) / float(count)
        for bus, (total, count) in buses.items()
    }
    # Computed once instead of once per bus; the default only matters for an
    # empty table, where it is never used.
    longest = max(averages.values(), default=0.0)
    buses = {bus: mean / longest for bus, mean in averages.items()}
def driver_average(driver_id=None, data=None):
    """Build or query the per-driver relative travel-time table ``drivers``.

    Training mode (``driver_id is None``): for every row of ``data`` the
    duration ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])`` is accumulated
    under ``row[DRV_IDX]`` in the module-level ``drivers`` dict as
    ``[total, count]``.  ``drivers`` is then replaced by a dict mapping each
    driver to their mean duration divided by the largest mean.  Training
    expects ``drivers`` to hold ``[total, count]`` lists (e.g. to be empty);
    after training it holds floats, so training again without resetting it
    is not supported.  Returns ``None``.

    Lookup mode (``driver_id`` given): return ``drivers[driver_id]``.  For an
    unknown id the mean of all stored values is printed and returned.

    Args:
        driver_id: key to look up, or ``None`` to train.
        data: iterable of rows; only used in training mode.
    """
    global drivers

    if driver_id is not None:
        try:
            return drivers[driver_id]
        except KeyError:
            fallback = sum(drivers.values()) / float(len(drivers))
            print(fallback)
            return fallback

    for row in data:
        duration = lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])
        key = row[DRV_IDX]
        if key in drivers:
            # Accumulate the total; the old code overwrote it with the latest
            # duration, so the "average" was last duration / count.
            drivers[key][0] += duration
            drivers[key][1] += 1
        else:
            drivers[key] = [duration, 1]

    averages = {
        driver: float(total) / float(count)
        for driver, (total, count) in drivers.items()
    }
    # Computed once instead of once per driver; the default only matters for
    # an empty table, where it is never used.
    longest = max(averages.values(), default=0.0)
    drivers = {driver: mean / longest for driver, mean in averages.items()}
def __call__(self, data):
    """Build a prediction model for the training rows in ``data``.

    Each row is encoded with ``parse_row`` and paired with the target
    ``lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX])``.  The sparse feature
    matrix is extended with ``append_ones`` and the parameters are found by
    minimising ``cost_grad_linear`` with L-BFGS-B, starting from a zero
    vector and passing ``self.lambda_`` as the regularisation argument.

    Returns:
        ``LinearRegClassifier`` holding the fitted parameter vector.
    """
    samples = [
        (parse_row(row), lpputils.tsdiff(row[ARR_IDX], row[DEP_IDX]))
        for row in data
    ]
    design = scipy.sparse.csr_matrix([encoded for encoded, _ in samples])
    design = append_ones(design)
    targets = np.array([target for _, target in samples])

    start = numpy.zeros(design.shape[1])
    result = fmin_l_bfgs_b(
        cost_grad_linear,
        x0=start,
        args=(design, targets, self.lambda_),
    )
    # The first element of the optimiser's result is the minimiser itself.
    return LinearRegClassifier(result[0])
def absolute_error(cas, napoved):
    """Return the absolute ``lpputils.tsdiff`` between ``cas`` and ``napoved``.

    A ``cas`` equal to the placeholder string ``"?"`` contributes an error
    of 0 without calling ``lpputils.tsdiff``.
    """
    return 0 if cas == "?" else abs(lpputils.tsdiff(cas, napoved))
def __call__(self, data):
    """Return a classifier that holds the mean trip duration of ``data``.

    The duration of each row is ``lpputils.tsdiff(row[-1], row[-3])``; their
    arithmetic mean is handed to ``AverageTripClassifier``.  An empty
    ``data`` raises ``ZeroDivisionError``.
    """
    durations = []
    for trip in data:
        durations.append(lpputils.tsdiff(trip[-1], trip[-3]))
    return AverageTripClassifier(sum(durations) / len(durations))
return x if __name__ == "__main__": f = gzip.open("train_pred.csv.gz", "rt") reader = csv.reader(f, delimiter="\t") next(reader) # ['Registration', 'Driver ID', 'Route', 'Route Direction', 'Route description', 'First station', 'Departure time', 'Last station', 'Arrival time'] data = [d for d in reader] noLines = len(data) Y = numpy.zeros(noLines) X = numpy.zeros([noLines, 7]) for i, line in enumerate(data): Y[i] = lpputils.tsdiff(line[-1], line[-3]) # določimo čas vožnje odhod = lpputils.parsedate(line[-1]) X[i] = getAttributes(odhod) lr = linear.LinearLearner(lambda_=1.) napovednik = lr(X, Y) f = gzip.open("test_pred.csv.gz", "rt") test = csv.reader(f, delimiter="\t") next(reader) # skip legend fo = open("naloga3.txt", "wt") for l in test: odhod = lpputils.parsedate(l[-3]) nov_primer = numpy.array(getAttributes(odhod)) #print(nov_primer)