def __init__(self):
    """Load the data, fit a linear model and write the test predictions.

    Reads "train_pred.csv.gz" and "test_pred.csv.gz" through
    ``self.read_file``, builds the feature matrix with ``self.day_time`` and
    the target vector with ``self.duration``, fits a
    ``linear.LinearLearner(lambda_=1.)`` and writes one line per test row to
    "predtekmovanje_results.txt".

    Side effects: sets ``self.data`` and ``self.test_data`` and
    creates/overwrites the results file in the current working directory.
    """
    self.data = self.read_file("train_pred.csv.gz")
    self.test_data = self.read_file("test_pred.csv.gz")

    # Feature rows for training / prediction and the training targets.
    X = np.vstack(self.day_time(self.data))
    y = np.array(self.duration(self.data))
    test_matrix = np.vstack(self.day_time(self.test_data))

    lr = linear.LinearLearner(lambda_=1.)
    napovednik = lr(X, y)
    result = [napovednik(line) for line in test_matrix]

    # The context manager guarantees the file is flushed and closed even if
    # formatting one of the rows raises (the handle used to be left open).
    with open("predtekmovanje_results.txt", "wt") as fo:
        for l, e in zip(result, self.test_data):
            # e[6] is combined with the prediction by lpputil.tsadd;
            # presumably the departure timestamp -- TODO confirm.
            fo.write(lpputil.tsadd(e[6], l) + "\n")
def train_routes(routes_to_train, train):
    """Fit one linear model per route.

    Args:
        routes_to_train: iterable of values matched against ``train["Route"]``.
        train: table indexed like a pandas DataFrame with the columns
            "Route", "RouteDirection", "TravelTime", "DepartureTime" and
            "ArrivalTime"; every remaining column is used as a feature.
            "TravelTime" must support ``.dt.total_seconds()``.

    Returns:
        dict mapping each route to the model returned by
        ``linear.LinearLearner(lambda_=1.)`` fitted on that route's rows.
    """
    models_routes = {}
    for route in routes_to_train:
        # Use the `train` argument: the body previously read a module-level
        # `my_train` and silently ignored the table passed by the caller.
        X_train = train[train["Route"] == route]
        # Target: travel time in seconds.
        Y_train = X_train["TravelTime"].dt.total_seconds()
        # Drop the target and the identifying/time columns from the features.
        X_train = X_train.drop([
            "TravelTime", "DepartureTime", "ArrivalTime", "Route",
            "RouteDirection"
        ], axis=1)
        X_train = np.array(X_train)
        Y_train = np.array(Y_train)
        lr = linear.LinearLearner(lambda_=1.)
        models_routes[route] = lr(X_train, Y_train)
    return models_routes
def train_route_dirs(route_dirs_to_train, train):
    """Fit one linear model per route direction.

    Args:
        route_dirs_to_train: iterable of values matched against
            ``train["RouteDirection"]``.
        train: table indexed like a pandas DataFrame with the columns
            "Route", "RouteDirection", "TravelTime", "DepartureTime" and
            "ArrivalTime"; every remaining column is used as a feature.

    Returns:
        A two-element list ``[routes_to_train, models_dirs]``: the routes
        collected for directions without training rows, and a dict mapping
        each direction that has rows to its fitted model.
    """
    non_feature_columns = [
        "TravelTime", "DepartureTime", "ArrivalTime", "Route",
        "RouteDirection"
    ]
    fallback_routes = []
    direction_models = {}

    for direction in route_dirs_to_train:
        direction_mask = train["RouteDirection"] == direction
        subset = train[direction_mask]

        if subset.shape[0] != 0:
            # Target is the travel time in seconds; features are what is left
            # after dropping the target and the identifying/time columns.
            targets = subset["TravelTime"].dt.total_seconds()
            features = subset.drop(non_feature_columns, axis=1)
            learner = linear.LinearLearner(lambda_=1.)
            direction_models[direction] = learner(
                np.array(features), np.array(targets))
            continue

        # No rows for this direction: train later with the route number.
        # NOTE(review): this lookup applies the same filter that just matched
        # zero rows, so it appears to always be empty and no route is ever
        # collected -- confirm which table the route should come from.
        matching_routes = train.loc[direction_mask, "Route"]
        if len(matching_routes) > 0:
            fallback_routes.append(matching_routes.iloc[0])

    return [fallback_routes, direction_models]
import linear
import numpy

if __name__ == "__main__":
    # Minimal usage example: fit a regularised linear model on three
    # two-feature examples and predict a new one.
    X = numpy.array([[1, 3], [2, 2], [3, 3]])
    y = numpy.array([10, 11, 12])

    lr = linear.LinearLearner(lambda_=1.)
    napovednik = lr(X, y)

    # The first coefficient is the constant factor.
    # print() calls replace the Python 2 print statements, which are a
    # syntax error under Python 3.
    print("Koeficienti", napovednik.th)

    nov_primer = numpy.array([2, 11])
    print("Napoved", napovednik(nov_primer))
X = linear.append_ones(np.array(x)) return route, dejanski_cas, originalen_datum, X f = gzip.open("train.csv.gz", "rt", encoding="latin1") reader = csv.reader(f, delimiter="\t") next(reader) linije = {} for primer in reader: if primer[3] in linije: linije[primer[3]].append(primer) else: linije[primer[3]] = [primer] linearna_regresija = linear.LinearLearner() for linija in linije.keys(): x, y = zgradi_matrike(linije[linija], True) linije[linija] = linearna_regresija(x, y) f = gzip.open("test.csv.gz", "rt", encoding="latin1") #za izpis MAE spremeni v "train.csv.gz" vrstica = csv.reader(f, delimiter="\t") next(vrstica) ime, dejanski_cas, primeri, testni_X = zgradi_matrike(vrstica, False) datoteka = open("napovedi_tekmovanje.txt", "wt", encoding="latin1") mae_mesec = 11 mae = 0 stevilo_primerov = 0
def read_file(file_path):
    """Read the data rows of a gzip-compressed, tab-separated file.

    Args:
        file_path: path of a gzip file containing UTF-8 text with
            tab-delimited columns; the first row is treated as a header.

    Returns:
        A list of rows (each a list of strings) without the header row.
        An empty file yields an empty list.
    """
    # The context manager closes the gzip handle; it used to be leaked.
    with gzip.open(file_path, "rt", encoding="UTF-8") as f:
        reader = csv.reader(f, delimiter="\t")
        # Skip the table header; the default avoids StopIteration on an
        # empty file.
        next(reader, None)
        return list(reader)


if __name__ == "__main__":
    # Read the files we learn from and the ones we test on.
    data = read_file("train.csv.gz")
    test_data = read_file("test.csv.gz")

    # Build the model.
    l = SeparateBySetLearner(linear.LinearLearner(lambda_=1.))
    c = l(data)

    # `with` closes the results file even if a prediction raises.
    with open("results.txt", "wt") as fo:
        for l in test_data:
            fo.write(lpputil.tsadd(l[-3], c(l)) + "\n")

    # Validate on internal data.
    data, test_data, real = loci_po_mesecu(data)
    l = SeparateBySetLearner(linear.LinearLearner(lambda_=1.))
    c = l(data)
    results = []
    for l in test_data:
        results.append(lpputil.tsadd(l[-3], c(l)))
def learn(data, DEC):
    """Fit one regularised linear model per route from ranked features.

    Args:
        data: iterable of records read by the module-level accessors
            getReg, getDri, getRoute, getArr and getDep. It is iterated
            several times, so it must be re-iterable (e.g. a list).
        DEC: passed unchanged to ``mapTime`` as its third argument.

    Returns:
        The tuple ``(models, avg_line, map_route, map_reg, map_dri, mpx_reg,
        mpx_dri, mpx_tim, sr, sd, st)``. ``models[i]`` is the model fitted
        for mapped route index ``i``, ``avg_line`` is the mean of the
        per-record ``.seconds`` durations, the ``map_*`` values are the first
        results of ``mapData``, the ``mpx_*`` lists hold the second results of
        ``rankData`` per route, and ``sr``/``sd``/``st`` hold the number of
        distinct feature values per route.

    Raises:
        ZeroDivisionError: when ``data`` has no records, because ``avg_line``
            divides by ``len(line_tim)``.
    """
    # Lookup tables from raw value to index. The first result is used below
    # as an index into per-route lists; the inverse maps are not used here.
    # presumably "reg" is the vehicle registration and "dri" the driver --
    # TODO confirm against getReg/getDri.
    map_reg, inv_map_reg = mapData([getReg(l) for l in data])
    map_dri, inv_map_dri = mapData([getDri(l) for l in data])
    map_route, inv_map_route = mapData([getRoute(l) for l in data])
    N = len(map_route)

    # First pass: collect durations overall and per mapped route.
    line_tim = []
    line_route_tim = [[] for _ in range(N)]
    lm_data = [(0, 0) for _ in range(N)]  # (sum of seconds, count) per route
    for l in data:
        mp_route = map_route[getRoute(l)]
        dtx = timeDifference(getArr(l), getDep(l), FMT)
        # NOTE(review): if dtx is a datetime.timedelta, `.seconds` drops the
        # days component -- confirm trips never span more than a day.
        line_tim.append(dtx.seconds)
        line_route_tim[mp_route].append(dtx.seconds)
        x, y = lm_data[mp_route]
        lm_data[mp_route] = x + dtx.seconds, y + 1
    avg_line = sum(line_tim) / len(line_tim)
    # avg_route and avg_data are computed but neither used nor returned.
    avg_route = [
        sum(line_route_tim[i]) / len(line_route_tim[i]) for i in range(N)
    ]
    avg_data = [x / max(1, y) for x, y in lm_data]

    # Second pass: per-route feature columns (mapped reg, mapped dri, mapped
    # departure time) and the target column dy (duration in seconds).
    dr = [[] for _ in range(N)]
    dd = [[] for _ in range(N)]
    dt = [[] for _ in range(N)]
    dy = [[] for _ in range(N)]
    for l in data:
        mp_route = map_route[getRoute(l)]
        dr[mp_route].append(map_reg[getReg(l)])
        dd[mp_route].append(map_dri[getDri(l)])
        dt[mp_route].append(mapTime(getDep(l), FMT, DEC))
        dy[mp_route].append(timeDifference(getArr(l), getDep(l), FMT).seconds)

    # [MAPPED_ROUTE][X] ... X == 0 ? AVG : X == 1 ? RANK FOR THE I-TH EXAMPLE
    # rankData returns a pair: the first element is indexed as [0]/[1] per
    # the note above, the second is kept in mpx_* and returned to the caller.
    lm_reg = []
    lm_dri = []
    lm_tim = []
    mpx_reg = []
    mpx_dri = []
    mpx_tim = []
    for i in range(N):
        x, y = rankData(dr[i], dy[i])
        lm_reg.append(x)
        mpx_reg.append(y)
        x, y = rankData(dd[i], dy[i])
        lm_dri.append(x)
        mpx_dri.append(y)
        x, y = rankData(dt[i], dy[i])
        lm_tim.append(x)
        mpx_tim.append(y)

    # Number of distinct feature values per route, used to scale the ranks.
    sr = [len(set(dr[i])) for i in range(N)]
    sd = [len(set(dd[i])) for i in range(N)]
    st = [len(set(dt[i])) for i in range(N)]

    # Scale the rank entries in place. The loop length comes from lm_reg, so
    # lm_dri[i][1] and lm_tim[i][1] must be at least as long.
    for i in range(N):
        for j in range(len(lm_reg[i][1])):
            lm_reg[i][1][j] /= sr[i]
            lm_dri[i][1][j] /= sd[i]
            lm_tim[i][1][j] /= st[i]

    # One model per route on five features: the three scaled ranks, the
    # squared time rank and the sum of the three ranks.
    models = []
    for i in range(N):
        #print("i = %2d %s" % (i, inv_map_route[i]))
        Y = numpy.array(dy[i])
        X = numpy.array([
            [
                #lm_reg[i][0][j],
                lm_reg[i][1][j],
                #lm_dri[i][0][j],
                lm_dri[i][1][j],
                #lm_tim[i][0][j],
                lm_tim[i][1][j],
                lm_tim[i][1][j]**2,
                lm_reg[i][1][j] + lm_dri[i][1][j] + lm_tim[i][1][j]
            ] for j in range(len(dy[i]))
        ])
        lr = linear.LinearLearner(lambda_=17)
        models.append(lr(X, Y))
    return models, avg_line, map_route, map_reg, map_dri, mpx_reg, mpx_dri, mpx_tim, sr, sd, st