def train():
    """Train an XGBoost classifier and write test-set class probabilities.

    Steps performed:

    1. Load ``data/clean_train.csv`` and split off the ``category`` column
       as the target.
    2. Fit on the first 600000 rows and print the score obtained on the
       remaining rows (a simple hold-out check).
    3. Refit the same model on the full train set.
    4. Predict class probabilities for ``data/clean_test.csv``.
    5. Map the numeric class ids back to names using
       ``data/clean_train.csv_classes`` and write ``data/res.csv`` with an
       ``Id`` index column.

    All paths are relative to the current working directory.
    """
    from data import input_reader
    import pandas as pd

    # Rows before this position are used for fitting, the rest for scoring.
    holdout_start = 600000

    print("Читаем трейнсет")
    train_csv = input_reader.load("data/clean_train.csv")
    train_true = train_csv['category'].tolist()
    # `axis` is passed by keyword and `.values` replaces `as_matrix()`:
    # the positional axis and `as_matrix` were removed from pandas.
    train_features = train_csv.drop('category', axis=1).values

    fit_features = train_features[:holdout_start, :]
    fit_true = train_true[:holdout_start]
    holdout_features = train_features[holdout_start:, :]
    holdout_true = train_true[holdout_start:]

    model = xgb.XGBClassifier(max_depth=80,
                              n_estimators=30,
                              learning_rate=0.05,
                              nthread=4,
                              subsample=0.7,
                              colsample_bytree=0.7,
                              silent=True)

    print("Учимся")
    model.fit(fit_features, fit_true)

    print("Оцениваем на части трейна")
    score = model.score(holdout_features, holdout_true)
    print("Результат =", score)

    print("Учимся на полном сете")
    model.fit(train_features, train_true)

    print("Читаем тест сет")
    test_csv = input_reader.load("data/clean_test.csv")
    test_features = test_csv.values

    print("Предсказываем вероятности")
    predicted_probas = model.predict_proba(test_features)

    # Resolve class names for the output columns.
    # NOTE: the literal string '__file__' (not the module attribute) is
    # resolved here, so `cur_dir` is effectively the current working
    # directory, consistent with the other relative paths above.
    cur_dir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(cur_dir, "data/clean_train.csv_classes")
    clz_map = pd.read_csv(filename, index_col=False)
    # The classes file has a single data row whose column headers are the
    # stringified class ids; `.iloc[0]` replaces the removed `.ix[0]`.
    class_names = clz_map.iloc[0]
    cols = [class_names[str(c)].upper() for c in model.classes_]
    print(cols)

    print("Пишем результат")
    res_df = pd.DataFrame(predicted_probas, columns=cols)
    res_df.to_csv("data/res.csv", index=True, index_label="Id")
    print("Готово!")
def plot_gen(is_demo):
    """Render and save bar charts of crime frequencies grouped by column.

    Loads the train set, then for each of several columns (category,
    resolution, police district, day of week, address, description) counts
    the rows per distinct value, plots a horizontal bar chart, saves it to a
    PNG in the current directory and shows it.

    Args:
        is_demo: when true the data set is read from ``data/train.csv.zip``,
            otherwise from ``../data/train.csv.zip``.
    """
    # Plotting options
    sns.set_style("whitegrid")
    sns.despine()

    # One of these palettes is picked at random for every chart.
    # NOTE(review): 'spectral' may not be a valid palette name on newer
    # matplotlib releases - confirm against the installed version.
    palettes = (
        'Set2', 'Paired', 'colorblind', 'husl',
        'Set1', 'coolwarm', 'RdYlGn', 'spectral'
    )

    def rname(old):
        # Translate an index label from English to Russian.
        return translate(old, "en", "ru")

    def plot_bar(df, title, filename):
        # Draw `df` as a horizontal bar chart, save it and display it.
        df = df.rename(rname)
        bar = df.plot(kind='barh',
                      title=title,
                      fontsize=8,
                      figsize=(12, 8),
                      stacked=False,
                      width=1,
                      color=sns.color_palette(np.random.choice(palettes),
                                              len(df)),
                      )
        bar.figure.savefig(filename)
        plt.show()

    def plot_top_crimes(df, column, title, fname, items=0):
        # Plot the `items` most frequent values of `column`
        # (all values when `items` is 0).
        col_freq = df.groupby(column).size()
        col_freq.index = col_freq.index.map(capwords)
        # `Series.sort` was removed from pandas; `sort_values` replaces it.
        # Sort ascending then reverse, so the largest group comes first.
        ranked = col_freq.sort_values(ascending=True).iloc[::-1]
        if items:
            # The previous slice(-1, -items, -1) stopped one element early:
            # a "top 20" chart had 19 bars and items=0 dropped the smallest
            # group (e.g. only 6 weekdays were plotted).
            ranked = ranked.iloc[:items]
        plot_bar(ranked, title, fname)

    df = load("../data/train.csv.zip" if not is_demo else "data/train.csv.zip")
    # Column names are referenced in lower case below; normalise them once.
    df.columns = df.columns.map(operator.methodcaller('lower'))

    plot_top_crimes(df, 'category', 'Количество преступлений (по типу)', 'category.png')
    plot_top_crimes(df, 'resolution', 'Результаты расследования', 'resolution.png')
    plot_top_crimes(df, 'pddistrict', 'Активность полиции', 'police.png')
    plot_top_crimes(df, 'dayofweek', 'Преступления по дням недели', 'weekly.png')
    plot_top_crimes(df, 'address', 'Адреса преступлений (топ 20)', 'location.png', items=20)
    plot_top_crimes(df, 'descript', 'Конкретные преступления (топ 20)', 'descript.png', items=20)
def clean_test_set(filename, output_filename, day_of_week_map, weather_type_map, weather_json, log=False):
    """Convert the raw test set into a numeric feature CSV.

    For every input row the date is split into day/month/year/hour/minute,
    the day of week and the weather type are replaced by their numeric ids,
    and the temperature for that date is attached.

    Args:
        filename: path of the raw test set, read with ``load``.
        output_filename: path of the CSV to write (no index column).
        day_of_week_map: mapping from capitalised day name to numeric id.
        weather_type_map: mapping from weather type to numeric id.
        weather_json: path of a JSON file indexed as
            ``[str(year)][str(month)][str(day)]`` whose entries provide
            ``'weather'`` and ``'temperature'``.
        log: when true, print progress each time the percentage increases.

    Raises:
        KeyError: if a date is missing from the weather data, or a day of
            week / weather type is missing from the supplied maps.
    """
    with open(weather_json, 'r') as weather_file:
        weather_data = json.load(weather_file)

    initial_df = load(filename)
    length = initial_df.shape[0]

    # Iterate column-wise instead of fetching each row with the removed
    # `.ix` indexer; this is positional and considerably faster.
    records = zip(initial_df["Dates"], initial_df["DayOfWeek"],
                  initial_df["X"], initial_df["Y"])

    arr = []
    last_percent = -1
    for i, (raw_date, raw_day, lon, lat) in enumerate(records):
        if log:
            # Integer arithmetic keeps the percentage exact.
            percent = i * 100 // length
            if percent > last_percent:
                last_percent = percent
                print(str(percent) + "%")
                print(str(i) + " строка")

        d = parse(raw_date)

        weather_dic = weather_data[str(d.year)][str(d.month)][str(d.day)]
        wt = weather_type_map[weather_dic['weather']]
        temperature = weather_dic['temperature']

        day_of_week = day_of_week_map[string.capwords(raw_day)]

        arr.append([d.day, d.month, d.year, d.hour, d.minute,
                    day_of_week, lon, lat, wt, temperature])

    columns = ['day', 'month', 'year', 'hour', 'minute', 'day_of_week',
               'lon', 'lat', 'weather', 'temperature']
    new_df = pd.DataFrame(arr, columns=columns)
    new_df.to_csv(output_filename, index=False)
def _dump_id_mapping(mapping, path):
    """Write a name -> id mapping as a one-row CSV keyed by stringified id."""
    by_id = {str(idx): [name] for name, idx in mapping.items()}
    pd.DataFrame(by_id).to_csv(path, index=False)


def clean_train_set(filename, output_filename, weather_json, log=False):
    """Convert the raw train set into a numeric feature CSV.

    For every input row the timestamp taken from the frame's index is split
    into day/month/year/hour/minute; category, day of week and weather type
    are replaced by numeric ids; the temperature for that date is attached.

    ``category_map``, ``day_of_week_map`` and ``weather_type_map`` are not
    defined in this function; they are expected to be dicts available in the
    enclosing module scope. They are extended in place with any value not
    seen before.

    Besides ``output_filename`` three companion files are written, each a
    one-row CSV whose column headers are the numeric ids:
    ``output_filename + '_classes'``, ``+ '_days'`` and ``+ '_weather'``.

    Args:
        filename: path of the raw train set, read with ``load``.
        output_filename: path of the CSV to write (no index column).
        weather_json: path of a JSON file indexed as
            ``[str(year)][str(month)][str(day)]`` whose entries provide
            ``'weather'`` and ``'temperature'``.
        log: when true, print progress each time the percentage increases.

    Returns:
        The tuple ``(day_of_week_map, weather_type_map)``.

    Raises:
        KeyError: if a date is missing from the weather data.
    """
    with open(weather_json, 'r') as weather_file:
        weather_data = json.load(weather_file)

    initial_df = load(filename)
    length = initial_df.shape[0]

    # Iterate column-wise instead of fetching each row with the removed
    # `.ix` indexer; the timestamps come from the index.
    records = zip(initial_df.index, initial_df['Category'],
                  initial_df['DayOfWeek'], initial_df['X'], initial_df['Y'])

    arr = []
    last_percent = -1
    for i, (d, raw_category, raw_day, lon, lat) in enumerate(records):
        if log:
            # Integer arithmetic keeps the percentage exact.
            percent = i * 100 // length
            if percent > last_percent:
                last_percent = percent
                print(str(percent) + "%")
                print(str(i) + " строка")

        # New ids continue from the current map size. The previous per-call
        # counters restarted at -1, so a map that already held entries
        # (e.g. on a second call) received colliding ids.
        category = category_map.setdefault(string.capwords(raw_category),
                                           len(category_map))
        day_of_week = day_of_week_map.setdefault(string.capwords(raw_day),
                                                 len(day_of_week_map))

        weather_dic = weather_data[str(d.year)][str(d.month)][str(d.day)]
        wt = weather_type_map.setdefault(weather_dic['weather'],
                                         len(weather_type_map))
        temperature = weather_dic['temperature']

        arr.append([d.day, d.month, d.year, d.hour, d.minute,
                    day_of_week, lon, lat, wt, temperature, category])

    columns = ['day', 'month', 'year', 'hour', 'minute', 'day_of_week',
               'lon', 'lat', 'weather', 'temperature', 'category']
    new_df = pd.DataFrame(arr, columns=columns)
    new_df.to_csv(output_filename, index=False)

    _dump_id_mapping(category_map, output_filename + '_classes')
    _dump_id_mapping(day_of_week_map, output_filename + '_days')
    _dump_id_mapping(weather_type_map, output_filename + '_weather')

    return day_of_week_map, weather_type_map
def map_gen(is_demo):
    """Draw the most frequent crime categories as dots on a map image.

    Loads a map picture and the train set, selects five categories by
    frequency, paints one small coloured ellipse per matching row at the
    position computed from its longitude/latitude, adds a legend in the
    top-left corner and saves the result as ``test.png``.

    Args:
        is_demo: when true, inputs are read from ``data/`` and only the
            first 15% of the rows are drawn; otherwise inputs are read from
            ``../data/`` and every row is processed.
    """
    # NOTE: the literal string '__file__' (not the module attribute) is
    # resolved here, so `cur_dir` is effectively the current working
    # directory.
    cur_dir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(cur_dir, 'data/map.png' if is_demo else '../data/map.png')
    base_map = Image.open(filename)
    width, height = base_map.size

    # Top-left corner of the map picture.
    lon_l = -122.51456
    lat_l = 37.81155
    # Bottom-right corner of the map picture.
    lon_r = -122.37173
    lat_r = 37.69238

    lx = lon2x(lon_l)
    ly = lat2y_m(lat_l)
    rx = lon2x(lon_r)
    ry = lat2y_m(lat_r)

    real_width = math.fabs(rx - lx)
    real_height = math.fabs(ry - ly)

    def calc_on_map_point(lon, lat):
        # Convert longitude/latitude to pixel coordinates on the picture.
        px = lon2x(lon)
        py = lat2y_m(lat)
        # Projected coordinates relative to the picture's top-left corner.
        local_x = math.fabs(px - lx)
        local_y = math.fabs(py - ly)
        # Scale the relative offset by the projected extent of the picture
        # and multiply by the picture size in pixels.
        img_x = (local_x / real_width) * width
        img_y = (local_y / real_height) * height
        return (img_x, img_y)

    rad = 2

    def point_to_ellipse(p):
        # Bounding box of a dot of radius `rad` centred on point `p`.
        return p[0] - rad, p[1] - rad, p[0] + rad, p[1] + rad

    print("Загружаем датасет")
    df = load('data/train.csv.zip' if is_demo else '../data/train.csv.zip')
    print("Загружено")

    # Draw on an RGBA canvas with the map pasted in, so the semi-transparent
    # fill colours below can be used.
    canvas = Image.new("RGBA", (width, height), 'white')
    draw = ImageDraw.Draw(canvas, "RGBA")
    canvas.paste(base_map)

    def top_crimes(df, items=0):
        # Return up to `items` category names ordered by frequency.
        # Lower-cases the column names of `df` in place; the drawing loop
        # below relies on that when it reads "category", "x" and "y".
        df.columns = df.columns.map(operator.methodcaller('lower'))
        col_freq = df.groupby("category").size()
        col_freq.index = col_freq.index.map(string.capwords)
        # `Series.sort` was removed from pandas; `sort_values` replaces it.
        col_freq = col_freq.sort_values(ascending=False)
        cols = list(col_freq.index)
        # The second and third most frequent categories are skipped.
        # NOTE(review): presumably these are catch-all categories - confirm.
        cols = cols[0:1] + cols[3:]
        return cols[:items]

    print("Достаём топ-5 преступлений")
    top = top_crimes(df, items=5)

    opacity = 220
    color_list = [(255, 0, 0, opacity),
                  (0, 255, 0, opacity),
                  (0, 0, 255, opacity),
                  (255, 127, 80, opacity),
                  (139, 0, 139, opacity),
                  (255, 105, 180, opacity),
                  (0, 0, 0, opacity),
                  (188, 143, 143, opacity)]
    colors_map = dict(zip(top, color_list))

    print("Начинаем рисовать на карте")
    length = df.shape[0]
    if is_demo:
        # Draw only a leading subset of the rows instead of all of them
        # (the original note mentions roughly 100,000 points).
        length = int(length * 0.15)

    # Iterate column-wise over the first `length` rows instead of fetching
    # each row with the removed `.ix` indexer.
    subset = df.iloc[:length]
    records = zip(subset["category"], subset["x"], subset["y"])

    last_percent = -1
    for row_no, (raw_category, lon, lat) in enumerate(records):
        # Integer arithmetic keeps the percentage exact.
        percent = row_no * 100 // length
        if percent > last_percent:
            last_percent = percent
            print(str(percent) + "%")
            print(str(row_no) + " строка")

        category = string.capwords(raw_category)
        if category in colors_map:
            color = colors_map[category]
            p = calc_on_map_point(lon, lat)
            draw.ellipse(point_to_ellipse(p), fill=color, outline=color)

    # Draw the legend.
    lineheight = 35
    padding = 20
    space = 20
    legend_rad = 10

    font = truetype(font="times.ttf", size=15)

    legend_right = width / 3
    legend_down = (len(top)) * lineheight + padding
    draw.rectangle((0, 0, legend_right, legend_down), 'white', 'black')

    for row_no, category in enumerate(top):
        x = padding
        y = row_no * lineheight + padding
        color = colors_map[category]
        draw.ellipse((x, y, x + (2 * legend_rad), y + (2 * legend_rad)),
                     fill=color, outline='black')
        draw.text((x + (2 * legend_rad) + space, y),
                  translate(category, "en", "ru"),
                  fill='black', font=font)

    canvas.save('test.png', 'PNG')