Example #1
0
def train():
    """Fit an XGBoost classifier on the cleaned train set and export predictions.

    Reads ``data/clean_train.csv``, fits the model on the first ``sz`` rows and
    prints its score on the remaining rows, then refits on the whole set,
    predicts class probabilities for ``data/clean_test.csv`` and writes them to
    ``data/res.csv`` (index column labelled ``Id``, one column per class named
    after the upper-cased label read from ``data/clean_train.csv_classes``).
    """
    from data import input_reader
    import pandas as pd

    print("Читаем трейнсет")
    train_csv = input_reader.load("data/clean_train.csv")
    train_true = train_csv['category'].tolist()
    # `axis` is passed by keyword and `.values` replaces `as_matrix()`:
    # both old spellings are gone from current pandas releases.
    train_features = train_csv.drop('category', axis=1).values

    # Row at which the train set is cut into a fitting part and a hold-out part.
    sz = 600000

    train__train_features = train_features[:sz, :]
    train__train_true = train_true[:sz]

    train__test_features = train_features[sz:, :]
    train__test_true = train_true[sz:]

    model = xgb.XGBClassifier(max_depth=80, n_estimators=30, learning_rate=0.05, nthread=4, subsample=0.7,
                              colsample_bytree=0.7, silent=True)
    print("Учимся")
    model.fit(train__train_features, train__train_true)

    print("Оцениваем на части трейна")
    score = model.score(train__test_features, train__test_true)
    print("Результат =", score)

    # The hold-out score above is informational only; the final model sees every row.
    print("Учимся на полном сете")
    model.fit(train_features, train_true)

    print("Читаем тест сет")
    test_csv = input_reader.load("data/clean_test.csv")
    test_features = test_csv.values
    print("Предсказываем вероятности")
    predicted_probas = model.predict_proba(test_features)

    # Translate the numeric class ids back into category names.
    # NOTE: '__file__' is a quoted literal, so realpath() resolves it against
    # the current working directory rather than this module's location.
    cur_dir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(cur_dir, "data/clean_train.csv_classes")
    clz_map = pd.read_csv(filename, index_col=False)
    # The mapping file is read as a single row keyed by the class id (as text).
    r = clz_map.iloc[0]

    cols = [r[str(c)].upper() for c in model.classes_]

    print(cols)

    print("Пишем результат")
    res_df = pd.DataFrame(predicted_probas, columns=cols)
    res_df.to_csv("data/res.csv", index=True, index_label="Id")
    print("Готово!")
def plot_gen(is_demo):
    """Plot horizontal bar charts of crime counts grouped by several columns.

    The train set is read through ``load``; for each of a fixed list of columns
    the number of rows per distinct value is plotted, saved to a PNG file
    (relative path) and shown on screen.

    :param is_demo: when true the data is read from ``data/train.csv.zip``,
        otherwise from ``../data/train.csv.zip``.
    """
    # Plotting Options
    sns.set_style("whitegrid")
    sns.despine()

    def rname(old):
        """Translate an index label from English to Russian."""
        return translate(old, "en", "ru")

    def plot_bar(df, title, filename):
        """Draw ``df`` as a horizontal bar chart, save it to ``filename`` and show it."""
        # One of these palettes is picked at random for every chart.
        # NOTE(review): 'spectral' may not be recognised by recent matplotlib
        # releases - confirm against the installed version.
        palettes = (
            'Set2', 'Paired', 'colorblind', 'husl',
            'Set1', 'coolwarm', 'RdYlGn', 'spectral'
        )
        df = df.rename(rname)
        bar = df.plot(kind='barh',
                      title=title,
                      fontsize=8,
                      figsize=(12, 8),
                      stacked=False,
                      width=1,
                      color=sns.color_palette(np.random.choice(palettes), len(df)),
                      )

        bar.figure.savefig(filename)

        plt.show()

    def plot_top_crimes(df, column, title, fname, items=0):
        """Count rows per value of ``column`` and plot the most frequent ones.

        :param items: number of most frequent values to keep; ``0`` keeps all.
        """
        # Lower-casing happens in place on the shared frame, so the lower-case
        # column names used by the calls below always match.
        df.columns = df.columns.map(operator.methodcaller('lower'))
        col_freq = df.groupby(column).size()
        col_freq.index = col_freq.index.map(capwords)

        # Series.sort() no longer exists in pandas; sort_values() is its replacement.
        col_freq = col_freq.sort_values(ascending=True)
        # Most frequent first.  Reversing and cutting with iloc keeps exactly
        # `items` values (and every value when items == 0); a reversed slice
        # ending at -items stops one element short in both cases.
        ranked = col_freq.iloc[::-1]
        if items > 0:
            ranked = ranked.iloc[:items]
        plot_bar(ranked, title, fname)

    df = load("../data/train.csv.zip" if not is_demo else "data/train.csv.zip")

    plot_top_crimes(df, 'category', 'Количество преступлений (по типу)', 'category.png')
    plot_top_crimes(df, 'resolution', 'Результаты расследования', 'resolution.png')
    plot_top_crimes(df, 'pddistrict', 'Активность полиции', 'police.png')
    plot_top_crimes(df, 'dayofweek', 'Преступления по дням недели', 'weekly.png')
    plot_top_crimes(df, 'address', 'Адреса преступлений (топ 20)', 'location.png', items=20)
    plot_top_crimes(df, 'descript', 'Конкретные преступления (топ 20)', 'descript.png', items=20)
def clean_test_set(filename, output_filename, day_of_week_map, weather_type_map, weather_json,  log=False):
    """Convert the raw test set into a numeric feature table and save it as CSV.

    Every input row becomes one output row with the columns ``day``, ``month``,
    ``year``, ``hour``, ``minute``, ``day_of_week``, ``lon``, ``lat``,
    ``weather`` and ``temperature``.

    :param filename: path handed to ``load`` to obtain the raw data frame; the
        frame must provide ``Dates``, ``DayOfWeek``, ``X`` and ``Y``.
    :param output_filename: destination CSV (written without the index).
    :param day_of_week_map: mapping from capitalised day name to its code.
    :param weather_type_map: mapping from weather type to its code.
    :param weather_json: path of a JSON file indexed as
        ``[year][month][day]`` (string keys) whose entries hold ``weather``
        and ``temperature``.
    :param log: when true, progress is printed each time the integer
        percentage of processed rows increases.
    :raises KeyError: if a date, day name or weather type is missing from the
        corresponding mapping.
    """
    with open(weather_json, 'r') as myfile:
        # Line breaks are stripped before parsing, exactly as before, so the
        # set of files accepted by this loader does not change.
        weather_data = json.loads(myfile.read().replace('\n', ''))

    initial_df = load(filename)
    length = initial_df.shape[0]
    arr = []
    last_percent = -1
    # iterrows() walks the rows in positional order; the `.ix` indexer that
    # used to fetch row i has been removed from pandas.
    for i, (_, row) in enumerate(initial_df.iterrows()):
        if log:
            percent = int((i / length) * 100)
            if percent > last_percent:
                last_percent = percent
                print(str(percent) + "%")
                print(str(i) + " строка")

        d = parse(row["Dates"])

        # The weather file is keyed by the un-padded decimal text of each date part.
        weather_dic = weather_data[str(d.year)][str(d.month)][str(d.day)]
        wt = weather_type_map[weather_dic['weather']]
        temperature = weather_dic['temperature']

        day_of_week = day_of_week_map[string.capwords(row['DayOfWeek'])]

        arr.append([d.day, d.month, d.year, d.hour, d.minute,
                    day_of_week, row['X'], row['Y'], wt, temperature])

    columns = ['day', 'month', 'year', 'hour', 'minute', 'day_of_week', 'lon', 'lat', 'weather', 'temperature']
    new_df = pd.DataFrame(arr, columns=columns)
    new_df.to_csv(output_filename, index=False)
def clean_train_set(filename, output_filename, weather_json, log=False):
    """Convert the raw train set into a numeric feature table and save it as CSV.

    Every input row becomes one output row with the columns ``day``, ``month``,
    ``year``, ``hour``, ``minute``, ``day_of_week``, ``lon``, ``lat``,
    ``weather``, ``temperature`` and ``category``.  Category, day-of-week and
    weather labels are replaced by integer codes handed out in order of first
    appearance and recorded in the module-level ``category_map``,
    ``day_of_week_map`` and ``weather_type_map`` dictionaries.  Each mapping is
    also dumped next to the output as ``<output_filename>_classes``,
    ``_days`` and ``_weather``.

    :param filename: path handed to ``load``; the resulting frame must carry
        the timestamp in its index and provide ``Category``, ``DayOfWeek``,
        ``X`` and ``Y``.
    :param output_filename: destination CSV (written without the index).
    :param weather_json: path of a JSON file indexed as
        ``[year][month][day]`` (string keys) whose entries hold ``weather``
        and ``temperature``.
    :param log: when true, progress is printed each time the integer
        percentage of processed rows increases.
    :return: tuple ``(day_of_week_map, weather_type_map)``.
    """
    with open(weather_json, 'r') as myfile:
        # Line breaks are stripped before parsing, exactly as before, so the
        # set of files accepted by this loader does not change.
        weather_data = json.loads(myfile.read().replace('\n', ''))

    # Last code handed out per mapping; the first new label therefore gets 0.
    # NOTE(review): this presumes the three module-level maps start out
    # empty - confirm where they are defined.
    last_day_of_week = -1
    last_category = -1
    last_weather_type = -1

    initial_df = load(filename)
    length = initial_df.shape[0]
    arr = []
    last_percent = -1
    # iterrows() yields (index label, row) in positional order; it replaces the
    # removed `.ix` indexer and the separate index lookup.
    for i, (d, row) in enumerate(initial_df.iterrows()):
        if log:
            percent = int((i / length) * 100)
            if percent > last_percent:
                last_percent = percent
                print(str(percent) + "%")
                print(str(i) + " строка")

        category = string.capwords(row['Category'])
        if category not in category_map:
            last_category += 1
            category_map[category] = last_category

        day_of_week = string.capwords(row['DayOfWeek'])
        if day_of_week not in day_of_week_map:
            last_day_of_week += 1
            day_of_week_map[day_of_week] = last_day_of_week

        # The weather file is keyed by the un-padded decimal text of each date part.
        weather_dic = weather_data[str(d.year)][str(d.month)][str(d.day)]
        weather_type = weather_dic['weather']
        if weather_type not in weather_type_map:
            last_weather_type += 1
            weather_type_map[weather_type] = last_weather_type

        arr.append([d.day, d.month, d.year, d.hour, d.minute,
                    day_of_week_map[day_of_week], row['X'], row['Y'],
                    weather_type_map[weather_type], weather_dic['temperature'],
                    category_map[category]])

    columns = ['day', 'month', 'year', 'hour', 'minute', 'day_of_week', 'lon', 'lat', 'weather', 'temperature', 'category']
    new_df = pd.DataFrame(arr, columns=columns)
    new_df.to_csv(output_filename, index=False)

    # Each mapping is stored as a one-row table: the column header is the
    # integer code (as text) and the single cell is the original label.
    for suffix, mapping in (('_classes', category_map),
                            ('_days', day_of_week_map),
                            ('_weather', weather_type_map)):
        clz_df = pd.DataFrame({str(code): [label] for label, code in mapping.items()})
        clz_df.to_csv(output_filename + suffix, index=False)

    return day_of_week_map, weather_type_map
def map_gen(is_demo):
    """Draw the most frequent crime categories as coloured dots on a map image.

    The background map and the train set are loaded, every row whose category
    is among the selected ones is drawn as a small filled ellipse at its
    projected position, a legend is added in the top-left corner and the
    result is saved as ``test.png``.

    :param is_demo: when true the inputs are read from ``data/`` and only the
        first 15% of the rows are drawn; otherwise inputs come from
        ``../data/`` and every row is drawn.
    """
    # NOTE: '__file__' is a quoted literal, so realpath() resolves it against
    # the current working directory rather than this module's location.
    cur_dir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(cur_dir, 'data/map.png' if is_demo else '../data/map.png')
    image = Image.open(filename)

    width, height = image.size

    # Top-left corner of the map image.
    lon_l = -122.51456
    lat_l = 37.81155

    # Bottom-right corner of the map image.
    lon_r = -122.37173
    lat_r = 37.69238

    lx = lon2x(lon_l)
    ly = lat2y_m(lat_l)
    rx = lon2x(lon_r)
    ry = lat2y_m(lat_r)
    real_width = math.fabs(rx - lx)
    real_height = math.fabs(ry - ly)

    def calc_on_map_point(lon, lat):
        """Convert a longitude/latitude pair into pixel coordinates on the image."""
        px = lon2x(lon)
        py = lat2y_m(lat)

        # Projected coordinates measured from the top-left corner of the image.
        local_x = math.fabs(px - lx)
        local_y = math.fabs(py - ly)

        # Scale the relative offset by (image size / projected size).
        img_x = (local_x / real_width) * width
        img_y = (local_y / real_height) * height
        return (img_x, img_y)

    rad = 2

    def point_to_ellipse(p):
        """Return the bounding box of a dot of radius ``rad`` centred on ``p``."""
        return p[0] - rad, p[1] - rad, p[0] + rad, p[1] + rad

    print("Загружаем датасет")
    df = load('data/train.csv.zip' if is_demo else '../data/train.csv.zip')
    print("Загружено")

    # Draw on an RGBA canvas holding a copy of the map so that the
    # semi-transparent dot colours can be used.
    canvas = Image.new("RGBA", (width, height), 'white')
    draw = ImageDraw.Draw(canvas, "RGBA")
    canvas.paste(image)
    image = canvas

    def top_crimes(df, items=0):
        """Return up to ``items`` category names ordered by descending frequency.

        The categories ranked second and third are always left out.
        """
        # Lower-cases the column names of the shared frame in place; the
        # drawing loop below relies on the lower-case names.
        df.columns = df.columns.map(operator.methodcaller('lower'))
        col_freq = df.groupby("category").size()
        col_freq.index = col_freq.index.map(string.capwords)
        # Series.sort() no longer exists in pandas; sort_values() is its replacement.
        col_freq = col_freq.sort_values(ascending=False)
        cols = list(col_freq.index)
        # NOTE(review): ranks 2 and 3 are skipped on purpose by the original
        # code - presumably catch-all categories; confirm.
        cols = cols[0:1] + cols[3:]
        return cols[:items]

    print("Достаём топ-5 преступлений")
    top = top_crimes(df, items=5)

    opacity = 220
    color_list = [(255, 0, 0, opacity), (0, 255, 0, opacity), (0, 0, 255, opacity), (255, 127, 80, opacity),
                  (139, 0, 139, opacity), (255, 105, 180, opacity), (0, 0, 0, opacity),
                  (188, 143, 143, opacity)]

    colors_map = {name: color_list[idx] for idx, name in enumerate(top)}

    print("Начинаем рисовать на карте")
    length = df.shape[0]
    if is_demo:
        # Demo mode draws only the first 15% of the rows instead of all of them.
        length = int(length * 0.15)
    last_percent = -1
    # Walk the three needed columns side by side over the first `length` rows
    # (positional); this replaces a per-row lookup through the removed `.ix`
    # indexer.
    subset = df.iloc[:length]
    for i, (raw_category, lon, lat) in enumerate(zip(subset["category"], subset["x"], subset["y"])):
        percent = int((i / length) * 100)
        if percent > last_percent:
            last_percent = percent
            print(str(percent) + "%")
            print(str(i) + " строка")
        category = string.capwords(raw_category)
        if category in colors_map:
            p = calc_on_map_point(lon, lat)
            color = colors_map[category]
            draw.ellipse(point_to_ellipse(p), fill=color, outline=color)

    # Draw the legend: a white box with one colour swatch and label per category.
    lineheight = 35
    padding = 20
    space = 20
    legend_rad = 10
    font = truetype(font="times.ttf", size=15)
    legend_right = width / 3
    legend_down = (len(top)) * lineheight + padding
    draw.rectangle((0, 0, legend_right, legend_down), 'white', 'black')
    for row_no, category in enumerate(top):
        x = padding
        y = row_no * lineheight + padding
        color = colors_map[category]
        draw.ellipse((x, y, x + (2 * legend_rad), y + (2 * legend_rad)), fill=color, outline='black')
        draw.text((x + (2 * legend_rad) + space, y), translate(category, "en", "ru"), fill='black', font=font)

    image.save('test.png', 'PNG')