def get_accidents(datasize, host, port):
    """
    Args:
        datasize: number of positively identified items to generate
        host: the host for the dataset (accidents)
        port: the port for the host
    Returns accidents dataset
    """
    # Get the accidents data
    database = MongoDBConnect(host, port)
    with database as db:
        cursor = db.get_all(collection='accidents', limit=datasize,
                            order=1)  # asc
        db_accidents = json_normalize(list(cursor))  # flatten weather json
    _logger.info('Retrieved accident data from data source')

    # Set correct data types as necessary
    db_accidents[features.get('lat')] = pd.to_numeric(
        db_accidents[features.get('lat')])
    db_accidents[features.get('long')] = pd.to_numeric(
        db_accidents[features.get('long')])
    db_accidents[features.get('datetime')] = pd.to_datetime(
        db_accidents[features.get('datetime')])
    # Sunrise/sunset arrive as Unix epoch seconds in the weather data
    db_accidents[features.get('sunrise')] = pd.to_datetime(
        db_accidents[features.get('weatherSunrise')], unit='s')
    db_accidents[features.get('sunset')] = pd.to_datetime(
        db_accidents[features.get('weatherSunset')], unit='s')

    # Append any joined information (new.street_name, new.speed_limit, pop_sq_mile, median_age)
    accidents = join_features(db_accidents)
    return accidents
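
# Example usage (illustrative only): the host/port values below are hypothetical
# and assume a local MongoDB instance exposing the 'accidents' collection.
#
#     accidents = get_accidents(datasize=1000, host='localhost', port=27017)
#     print(accidents[[features.get('lat'), features.get('long')]].head())
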
def extract_signals(signals, row):
    """ Extract signal proximity using haversine distance
    Args:
        signals: list of valid signal X,Y coords
        row: the dataset training row
    Returns:
        number of signals nearby
    """
    dists = haversine_np(signals["Y"], signals["X"], row[features.get('lat')],
                         row[features.get('long')])
    signals_near = len(dists[dists < 500])
    return signals_near
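
# haversine_np is a project helper defined elsewhere; the sketch below shows one
# common vectorized formulation, assuming decimal-degree inputs and distances in
# meters. The 500 threshold above is in whatever unit the real haversine_np uses.
def _haversine_np_sketch(lat1, lon1, lat2, lon2):
    """Illustrative vectorized haversine distance in meters (not the project's)."""
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = (np.sin(dlat / 2.0) ** 2 +
         np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2)
    return 6371000.0 * 2 * np.arcsin(np.sqrt(a))  # Earth radius ~6,371 km
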
def extract_road_info(volumes, roads, row):
    """ Extract road info (volumes, curves, lengths, names)
    Args:
        volumes: grouped objects based on reference data
        roads: roads info created from reference data
        row: the dataset training row
    Returns:
        road volumes, curves, lengths, names
    """
    first_word = find_first_word(row[features.get('address')])
    # Road information
    if first_word:
        # Compute the street-name match once and reuse it for all road columns
        street_mask = roads["STREETNAME"].str.contains(first_word, na=False)
        vols = volumes[volumes["ROUTE"].str.contains(first_word,
                                                     na=False)]["2016"]
        curves = roads[street_mask]["curve"]
        lengths = roads[street_mask]["ShapeSTLength"]
        roads_matched = roads[street_mask]["STREETNAME"]
        freq_roads = roads_matched.mode()

        road_name = (freq_roads.iloc[0]
                     if not freq_roads.empty else "GENERIC_STREET")
        return vols.mean(), curves.mean(), lengths.mean(), road_name
    else:
        return None, None, None, "GENERIC_STREET"
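
# find_first_word is a project helper; the sketch below is only a guess at its
# intent, assuming it returns the first alphabetic token of the address
# (upper-cased to match the reference street names) or None when nothing usable
# is found.
def _find_first_word_sketch(address):
    """Illustrative only; the project's find_first_word may behave differently."""
    if not isinstance(address, str):
        return None
    for token in address.split():
        if token.isalpha():
            return token.upper()
    return None
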
def extract_pop_info(polygons, row):
    """ Extract population information for single instance
    Based on polygon coordinates from reference data
    Args:
        polygons: Shapely polygons to inspect (list)
        row: the dataset training row
    Returns:
        tuple: median_age, median_pop for instance
    """
    # Default to None when no polygon contains the point
    median_age = None
    median_pop = None
    for poly_obj in polygons:
        if poly_obj["poly"].contains(
                Point(row[features.get('lat')], row[features.get('long')])):
            median_age = poly_obj["median_age"]
            median_pop = poly_obj["pop_sq_mile"]
            break
    return median_age, median_pop
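
# Illustrative check of the polygon lookup (hypothetical values): each entry in
# `polygons` is expected to carry a shapely polygon plus its census attributes.
def _example_pop_lookup():
    from shapely.geometry import Polygon
    square = {
        "poly": Polygon([(35.0, -81.0), (35.0, -80.0),
                         (36.0, -80.0), (36.0, -81.0)]),
        "median_age": 34.2,      # hypothetical census values
        "pop_sq_mile": 2500.0,
    }
    row = {features.get('lat'): 35.2, features.get('long'): -80.8}
    return extract_pop_info([square], row)  # -> (34.2, 2500.0)
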
def create_train_test_data(datasize, host, port, imbalance_multiplier,
                           test_size):
    """
    Args:
        datasize: number of positively identified items to generate
        host: the host for the dataset (accidents)
        port: the port for the host
        imbalance_multiplier: Multiplier of the non-accident size
        test_size: test data size proportion
    Returns X_train, y_train, X_test, y_test, and feature names
    """
    # Get actual accidents
    accidents = get_accidents(datasize, host, port)

    # Create the oversampling of non-accidents
    non_accidents = generate_non_accidents(data=accidents,
                                           iterations=imbalance_multiplier)
    # Identify accidents vs. non-accidents
    accidents[features.get('is_accident')] = 1
    non_accidents[features.get('is_accident')] = 0

    # Join final training dataset (accidents with non-accidents) with key features
    trainset = pd.concat([accidents, non_accidents])
    feature_cols = [
        features.get('division'),
        features.get('weatherTemp'),
        features.get('weatherRain3'),
        features.get('weatherVisibility'),
        features.get('weatherWindSpeed'),
        features.get('sunrise_hour'),
        features.get('month'),
        features.get('hour'),
        features.get('day_of_week'),
        features.get('day'),
        features.get('road_curve'),
        features.get('road_length'),
        features.get('road_volume'),
        features.get('signals_near'),
        features.get('road_speed'),
        features.get('pop_sq_mile'),
        features.get('median_age'),
        features.get('is_accident')
    ]
    try:
        trainset = trainset[feature_cols]
    except KeyError:
        _logger.error(
            'Feature key not found in dataset, adding missing features...')
        trainset = trainset.reindex(columns=feature_cols)

    # Return train set and final holdout set based on defined percent
    X = trainset.iloc[:, :-1].values
    y = trainset[features.get('is_accident')].values
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=1234)

    return X_train, y_train, X_test, y_test, trainset.columns.values[:-1]
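
# Illustrative downstream usage (hypothetical connection settings; the model is
# an example choice, not necessarily the project's own, and assumes the selected
# features are numeric or already encoded):
def _example_training_run():
    from sklearn.ensemble import RandomForestClassifier
    X_train, y_train, X_test, y_test, names = create_train_test_data(
        datasize=1000, host='localhost', port=27017,
        imbalance_multiplier=3, test_size=0.2)
    clf = RandomForestClassifier(n_estimators=100, random_state=1234)
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test)
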
def generate_non_accidents(data, iterations):
    """
    Args:
        data: dataframe of existing accidents to utilize for generation
        iterations: iterations to perform for generating training data, ie, (1, 2, ...)
    Returns dataset of non-accidents
    Method of generation:
    For each positive sample (accident) change value of one feature from the following features:
    ( hour, day, road )
    If the result is negative, we add to negative pool of samples
    Dataset should contain at least 3-4 times negative samples to positive for proper oversampling
    """
    choices = [features.get('hour'), features.get('day'), features.get('road')]
    hours = data[features.get('hour')].unique()
    days = data[features.get('day')].unique()
    roads = data[features.get('road')].unique()
    cols = data.columns.tolist()
    non_accidents = pd.DataFrame(columns=cols)
    for _ in itertools.repeat(None, iterations):
        non_accs = pd.DataFrame(columns=cols)
        for i, row in data.iterrows():
            acc_rec = row.copy()
            # Pick one feature to perturb for this sample, per the method above
            feature_choice = random.choice(choices)
            if feature_choice == features.get('hour'):
                # .item() replaces the deprecated np.asscalar
                random_choice = np.random.choice(hours, 1).item()
            elif feature_choice == features.get('day'):
                random_choice = np.random.choice(days, 1).item()
            else:
                random_choice = np.random.choice(roads, 1).item()
            acc_rec[feature_choice] = random_choice
            # Discard the perturbed record if it still matches a real accident
            if ((data[features.get('day')] == acc_rec[features.get('day')]) &
                (data[features.get('hour')] == acc_rec[features.get('hour')]) &
                (data[features.get('road')]
                 == acc_rec[features.get('road')])).any():
                continue
            else:
                non_accs.loc[i] = acc_rec
        non_accidents = pd.concat([non_accidents, non_accs], ignore_index=True)

    _logger.info("Generated {0} non-accidents to go with {1} accidents".format(
        len(non_accidents), len(data)))
    return non_accidents
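
# Example usage (illustrative): with iterations=3, at most three perturbed
# negatives are generated per accident, which targets the 3-4x
# negative-to-positive ratio described in the docstring.
#
#     non_accidents = generate_non_accidents(data=accidents, iterations=3)
#     print(len(non_accidents) / len(accidents))
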
def join_features(data):
    """
    Args:
        data: dataframe to join based on
    Returns modified existing dataframe to join new features
    Features added:
        - Time series info
        - Traffic info (signals, traffic volumes, population)
        - Road info (curvature, length, KMeans grouping)
        - Any other census information
    """
    # Load reference data
    population, roads, signals, traffic_vol = load_reference_data()

    # Time information
    data[features.get('month')] = data[features.get('datetime')].dt.month
    data[features.get('day')] = data[features.get('datetime')].dt.day
    data[features.get('hour')] = data[features.get('datetime')].dt.hour
    data[features.get('minute')] = data[features.get('datetime')].dt.minute
    data[features.get('day_of_week')] = data[features.get(
        'datetime')].dt.dayofweek

    # Load road information
    roads["curve"] = create_roads(roads)

    # Load polygons from census information
    polygons = create_polygons(population)

    # Traffic info
    meck_vols = traffic_vol[(traffic_vol["COUNTY"] == "MECKLENBURG")
                            & (traffic_vol["2016"] != ' ')][["ROUTE", "2016"]]
    meck_vols["2016"] = meck_vols["2016"].astype(int)
    grouped = meck_vols.groupby(["ROUTE"], as_index=False).mean()

    # Main data join with other features
    mean_vols, mean_curves, mean_lengths, signals_near, road_names, ages, pops = (
        [], [], [], [], [], [], [])

    for _, row in data.iterrows():

        # Road information (volumes, curves, lengths)
        vol, curve, length, name = extract_road_info(grouped, roads, row)
        mean_vols.append(vol)
        mean_curves.append(curve)
        mean_lengths.append(length)
        road_names.append(name)

        # Signals proximity
        signals_near.append(extract_signals(signals, row))

        # Population density and median age (single polygon lookup per row)
        median_age, pop_sq_mile = extract_pop_info(polygons, row)
        ages.append(median_age)
        pops.append(pop_sq_mile)

    data[features.get('road')] = road_names
    data[features.get('road_curve')] = mean_curves
    data[features.get('road_length')] = mean_lengths
    data[features.get('road_volume')] = mean_vols
    data[features.get('signals_near')] = signals_near
    data[features.get('road_speed')] = data[features.get('address')].apply(
        extract_speed)
    data[features.get('median_age')] = ages
    data[features.get('pop_sq_mile')] = pops

    # Clean data before further preprocessing
    cleansed_data = clean_data(data)

    # Weather: take the primary weather category ('main') for each record
    cleansed_data[features.get('weatherCategory')] = cleansed_data[
        features.get('weather')].apply(lambda w: w[0]['main'])
    cleansed_data[features.get('sunrise_hour')] = pd.DatetimeIndex(
        cleansed_data[features.get('sunrise')]).hour
    cleansed_data[features.get('sunrise_minute')] = pd.DatetimeIndex(
        cleansed_data[features.get('sunrise')]).minute
    cleansed_data[features.get('sunset_hour')] = pd.DatetimeIndex(
        cleansed_data[features.get('sunset')]).hour
    cleansed_data[features.get('sunset_minute')] = pd.DatetimeIndex(
        cleansed_data[features.get('sunset')]).minute

    _logger.info(
        'Added features... joined data features including spatial data')
    return cleansed_data