def start(self):
        """Load the raw training data and initialise an empty feature table.

        Reads ``data/raw/ubaar-competition/train.csv`` (relative to
        ``project_root()``) as UTF-8 with the ``ID`` column as index and
        stores it on ``self.data``. ``self.features`` starts out as an empty
        DataFrame. Control is then handed to ``self.get_day_feature`` via
        ``self.next`` (presumably a flow-framework step transition; confirm
        against the enclosing class).
        """
        train_csv = os.path.join(
            project_root(), 'data', 'raw', 'ubaar-competition', 'train.csv')

        self.data = pd.read_csv(train_csv, encoding="utf-8", index_col="ID")
        self.features = pd.DataFrame()

        self.next(self.get_day_feature)
import os
import pandas as pd
from feature_extraction.date_utils import date_features
from feature_extraction.coords_features import coord_features
from feature_extraction.other_features import raw_features, categorical_features
from feature_extraction.path_utils import project_root
import xgboost as xgb
import joblib

# Raw competition training set, indexed by its "ID" column.
_raw_train_path = os.path.join(
    project_root(), 'data', 'raw', 'ubaar-competition', 'train.csv')
raw_data = pd.read_csv(_raw_train_path, encoding="utf-8", index_col="ID")

# Only the column labels of the processed feature table are kept here.
_features_path = os.path.join(
    project_root(), 'data', 'processed', 'ubaar_features.csv')
all_features_cols = pd.read_csv(
    _features_path, encoding="utf-8", index_col="ID").columns

# Previously persisted model object, deserialised with joblib.
_model_path = os.path.join(project_root(), 'data', 'processed', 'model.bin')
model = joblib.load(_model_path)

# Column names that num_cols_dict maps to the float type.
num_cols = [
    'sourceLatitude',
    'sourceLongitude',
    'destinationLatitude',
    'destinationLongitude',
    'distanceKM',
    'taxiDurationMin',
    'weight',
    'price',
]
num_cols_dict = dict.fromkeys(num_cols, float)


def _add_missing_cat_columns(features, all_features_cols):
    missing_columns = [
        c for c in all_features_cols if c not in features.columns
import pandas as pd
import os

import plotly.express as px

from feature_extraction.path_utils import project_root
from feature_extraction.coords_features import coords_clusters_dbscan, coords_clusters_kmeans

if __name__ == '__main__':
    # Script entry point: cluster the trip coordinates of the training set,
    # plot the source points coloured by cluster, save the figure as HTML
    # under data/processed and open it.

    data = pd.read_csv(os.path.join(project_root(), 'data', 'raw', 'ubaar-competition', 'train.csv'),
                       encoding="utf-8", index_col="ID")

    # Take an explicit copy: a column is added to this frame below, and
    # assigning into a frame obtained by indexing `data` otherwise triggers
    # pandas' SettingWithCopyWarning.
    coords = data[["sourceLatitude", "sourceLongitude",
                   "destinationLatitude", "destinationLongitude"]].copy()

    # Active clustering helper is coords_clusters_dbscan. The alternative is
    # coords_clusters_kmeans(coords, n_clusters=50), which is unpacked the
    # same way. Only the first element of the returned pair is used.
    cluster_labels, _ = coords_clusters_dbscan(coords)
    coords['cluster_src'] = cluster_labels

    fig = px.scatter_mapbox(coords, lat="sourceLatitude", lon="sourceLongitude", zoom=3, height=900,
                            color='cluster_src', title="Clusters")
    # NOTE(review): mapbox_zoom=2 here appears to supersede the zoom=3 passed
    # above - confirm which zoom level is intended.
    fig.update_layout(mapbox_style="stamen-terrain", mapbox_zoom=2, mapbox_center_lat=41,
                      margin={"r": 0, "t": 0, "l": 0, "b": 0})
    fig.write_html(os.path.join(project_root(), "data", "processed", "clusters.html"))

    fig.show()


 def save(self):
     """Write ``self.features`` to CSV, then move on to ``self.end``.

     The output file is ``data/processed/ubaar_features.csv`` under
     ``project_root()``.
     """
     output_path = os.path.join(
         project_root(), 'data', 'processed', 'ubaar_features.csv')
     self.features.to_csv(output_path)
     self.next(self.end)
# Example #5
0
import pandas as pd
import os

import plotly.express as px
import reverse_geocoder as rg

from feature_extraction.path_utils import project_root

if __name__ == '__main__':
    src_dest = 'source'

    data = pd.read_csv(os.path.join(project_root(), 'data', 'raw',
                                    'ubaar-competition', 'train.csv'),
                       encoding="utf-8",
                       index_col="ID")

    coords = data[[f'{src_dest}Latitude', f'{src_dest}Longitude']]
    localisations = rg.search([tuple(row) for row in coords.values])

    data[f'{src_dest}_city'] = [l['name'] for l in localisations]
    data[f'{src_dest}_province'] = [l['admin1'] for l in localisations]

    # data['price_per_km'] = data['price'] / data['distanceKM']
    city_ave_prices = dict(data.groupby(f'{src_dest}_city')['price'].mean())

    data['ave_price'] = data.apply(
        lambda x: city_ave_prices[x[f'{src_dest}_city']], axis=1)

    fig = px.scatter_mapbox(data,
                            lat=f"{src_dest}Latitude",
                            lon=f"{src_dest}Longitude",