def returnPopularitydistanceGraphLocations(D_results):
    '''
    Build popularity-vs-distance scatter charts for the AS-IS and TO-BE scenarios.

    Parameters
    ----------
    D_results : pandas.DataFrame
        Dataframe containing the columns:
        idNode : node id defined from a graph (as-is scenario)
        popularity : popularity of the idNode (as-is scenario)
        distance : distance from the input-output
        new_idNode : idNode to-be (to-be scenario)
        new_distance : distance of the new_idNode from the input-output (to-be scenario)

    Returns
    -------
    figure_out : dict
        Dictionary of figures with the charts, keyed 'asis' and 'tobe'.
    '''

    def _popularity_distance_chart(node_col, dist_col, title):
        # aggregate per node, drop popularity outliers via IQR, then plot
        grouped = D_results.groupby([node_col]).agg({'popularity': ['sum'], dist_col: ['mean']}).reset_index()
        grouped.columns = ['idNode', 'popularity', 'distance']
        grouped, _ = cleanUsingIQR(grouped, features=['popularity'])
        fig = plt.figure()
        plt.scatter(grouped['popularity'], grouped['distance'])
        plt.xlabel('Popularity')
        plt.ylabel('Distance')
        plt.title(title)
        return fig

    figure_out = {}

    # as-is scenario
    D_results['distance'] = D_results['distance'].astype(float)
    figure_out['asis'] = _popularity_distance_chart('idNode', 'distance', "AS-IS Scenario")

    # to-be scenario
    D_results['new_distance'] = D_results['new_distance'].astype(float)
    figure_out['tobe'] = _popularity_distance_chart('new_idNode', 'new_distance', "TO-BE Scenario")

    return figure_out
def asisTobeBubblePopDist(D_results, cleanData=False):
    '''
    Plot popularity against distance for the as-is and to-be configurations.

    D_results must hold idNode/popularity/distance (as-is) and
    new_idNode/new_distance (to-be). If cleanData is True, popularity
    outliers are removed with the IQR rule before plotting.

    Returns a dictionary of figures keyed 'pop_dist_asis' and 'pop_dist_tobe'.
    '''
    output_figures = {}
    if cleanData:
        D_results, _ = cleanUsingIQR(D_results, ['popularity'])

    def _distance_popularity_chart(node_col, dist_col, title):
        # aggregate popularity (sum) and distance (mean) per node, then scatter
        grouped = D_results.groupby([node_col]).agg({'popularity': ['sum'], dist_col: ['mean']}).reset_index()
        grouped.columns = ['idNode', 'popularity', 'distance']
        fig = plt.figure()
        plt.scatter(grouped['distance'], grouped['popularity'])
        plt.xlabel('Distance (m)')
        plt.ylabel('Popularity')
        plt.title(title)
        return fig

    # AS-IS graph
    D_results['distance'] = D_results['distance'].astype(float)
    output_figures['pop_dist_asis'] = _distance_popularity_chart('idNode', 'distance', "AS-IS configuration")

    # TO-BE graph
    D_results['new_distance'] = D_results['new_distance'].astype(float)
    output_figures['pop_dist_tobe'] = _distance_popularity_chart('new_idNode', 'new_distance', "TO-BE configuration")

    return output_figures
def import_graph_drive(D_node, latCol, lonCol, D_plant, plantLatitude, plantLongitude, cleanOutliers=False):
    '''
    Import a drivable road network (via osmnx) covering all nodes and plants.

    D_node is the table containing the nodes of the network;
    latCol / lonCol are the latitude and longitude attributes of the node collection.
    D_plant is the table containing the plants of the network;
    plantLatitude / plantLongitude are the latitude and longitude attributes of the plant collection.
    cleanOutliers is True to remove latitude and longitude outliers using IQR.

    Returns the road graph and a dataframe with the coverage tuple.
    '''
    coverage_stats = (1, np.nan)

    # remove latitude and longitude outliers, tracking the coverage
    if cleanOutliers:
        D_node, coverage_stats = cleanUsingIQR(D_node, [latCol, lonCol])

    # bounding box over every node and plant location
    latitudes = list(D_node[latCol]) + list(D_plant[plantLatitude])
    longitudes = list(D_node[lonCol]) + list(D_plant[plantLongitude])

    # osmnx bbox order: north, south, east, west
    G = ox.graph_from_bbox(max(latitudes), min(latitudes), max(longitudes), min(longitudes), network_type='drive')

    return G, pd.DataFrame(coverage_stats)
def returnbubbleGraphAsIsToBe(D_results, cleanData=False):
    '''
    Return the graph with storage plant layout and picking bubbles

    Parameters
    ----------
    D_results : pandas.DataFrame
        Movements with as-is coordinates (loccodex, loccodey), to-be
        coordinates (loccodexTOBE, loccodeyTOBE) and a popularity column.
    cleanData : bool, optional
        If True, popularity outliers are removed with the IQR rule.
        The default is False.

    Returns
    -------
    figure_out : dict
        Dictionary of output figures keyed 'pick_layout_asis' and
        'pick_layout_tobe'.
    '''

    def normaliseVector(x):
        # FIX: guard the zero-range case (all values equal) which previously
        # divided by zero and produced NaN bubble sizes; return a constant
        # vector instead, consistent with scaleSize in spaceProductivity
        if min(x) == max(x):
            return pd.Series(1.0, index=x.index)
        return (x - min(x)) / (max(x) - min(x))

    figure_out = {}
    if cleanData:
        D_results, _ = cleanUsingIQR(D_results, ['popularity'])

    # graph as/is: one bubble per location, sized by total popularity
    D_graph = D_results.groupby(['loccodex', 'loccodey'])['popularity'].agg(['sum']).reset_index()
    D_graph['size'] = normaliseVector(D_graph['sum']) * 100
    fig1 = plt.figure()
    plt.scatter(D_graph.loccodex, D_graph.loccodey, D_graph['size'])
    plt.title("Warehouse as-is")
    figure_out['pick_layout_asis'] = fig1

    # graph to/be: same chart on the to-be coordinates
    D_graph = D_results.groupby(['loccodexTOBE', 'loccodeyTOBE'])['popularity'].agg(['sum']).reset_index()
    D_graph['size'] = normaliseVector(D_graph['sum']) * 100
    fig2 = plt.figure()
    plt.scatter(D_graph.loccodexTOBE, D_graph.loccodeyTOBE, D_graph['size'])
    plt.title("Warehouse to-be")
    figure_out['pick_layout_tobe'] = fig2

    return figure_out
def spaceProductivity(D_movements, variableToPlot, inout_column, x_col, y_col, z_col, graphType='2D', cleanData=False):
    '''
    Parameters
    ----------
    D_movements : TYPE pandas dataframe
        DESCRIPTION. pandas dataframe with movements; must contain a 'PERIOD' column
    variableToPlot : string
        DESCRIPTION. string with the column to plot, or "popularity" for movement count
    inout_column : TYPE string
        DESCRIPTION. string of the column with inout ('+' inbound, '-' outbound)
    x_col : TYPE string
        DESCRIPTION. string of the column with x coordinates
    y_col : TYPE string
        DESCRIPTION. string of the column with y coordinates
    z_col : TYPE string
        DESCRIPTION. string of the column with z coordinates
    graphType : TYPE string, optional
        DESCRIPTION. The default is '2D'. 2D or 3D depending on the graph type
    cleanData : TYPE boolean, optional
        DESCRIPTION. The default is False. if True, IQR is used to clean
        popularity of each location

    Returns
    -------
    figure_output : TYPE dict
        DESCRIPTION. dictionary of output figures (one per period and direction)
    '''

    def scaleSize(series):
        # map values to [0, 1]; a constant series maps to all ones
        if min(series) == max(series):
            return [1 for i in range(0, len(series))]
        else:
            return (series - min(series)) / (max(series) - min(series))

    def _renderFigures(D_direction, direction_label, key_prefix):
        # render one figure per period for a single flow direction
        if len(D_direction) == 0:
            return
        # clean data
        if cleanData:
            D_warehouse_grouped, _ = cleanUsingIQR(D_direction, features=['POPULARITY'], capacityField=[])
        else:
            D_warehouse_grouped = D_direction
        # create figures
        for period in set(D_warehouse_grouped['PERIOD']):
            # .copy(): the boolean filter returns a view-like slice; writing
            # SIZE into it would trigger SettingWithCopy behaviour
            D_filtered = D_warehouse_grouped[D_warehouse_grouped['PERIOD'] == period].copy()
            D_filtered['SIZE'] = scaleSize(D_filtered['POPULARITY'])
            # scale size
            D_filtered['SIZE'] = 100 * D_filtered['SIZE']
            # graphType 2-Dimensional
            if graphType == '2D':
                fig = plt.figure()
                plt.scatter(D_filtered['LOCCODEX'], D_filtered['LOCCODEY'], D_filtered['SIZE'], c=D_filtered['SIZE'])
                plt.colorbar()
                plt.title(f"Warehouse {direction_label} productivity, period:{period}")
                plt.xlabel("Warehouse front (x)")
                plt.ylabel("Warehouse depth (y)")
                figure_output[f"{key_prefix}_productivity_2D_{period}"] = fig
            # graphtype 3-Dimensional
            elif graphType == '3D':
                fig = plt.figure()
                fig.add_subplot(111, projection='3d')
                plt.scatter(x=D_filtered['LOCCODEX'],
                            y=D_filtered['LOCCODEY'],
                            zs=D_filtered['LOCCODEZ'],
                            s=D_filtered['SIZE'],
                            c=D_filtered['SIZE'])
                plt.colorbar()
                plt.xlabel("Warehouse front (x)")
                plt.ylabel("Warehouse depth (y)")
                plt.title(f"Warehouse {direction_label} productivity, period:{period}")
                figure_output[f"{key_prefix}_productivity_3D_{period}"] = fig

    figure_output = {}

    # group data per period, direction and location
    if graphType == '3D':
        group_cols = ['PERIOD', inout_column, x_col, y_col, z_col]
        out_cols = ['PERIOD', 'INOUT', 'LOCCODEX', 'LOCCODEY', 'LOCCODEZ', 'POPULARITY']
    elif graphType == '2D':
        group_cols = ['PERIOD', inout_column, x_col, y_col]
        out_cols = ['PERIOD', 'INOUT', 'LOCCODEX', 'LOCCODEY', 'POPULARITY']
    if variableToPlot == 'popularity':
        D_mov = D_movements.groupby(group_cols).size().reset_index()
    else:
        D_mov = D_movements.groupby(group_cols).sum()[variableToPlot].reset_index()
    D_mov.columns = out_cols

    # split data into inbound and outbound.
    # FIX: the grouped frame's flag column was renamed to 'INOUT' above, so
    # filter on that name — filtering on inout_column raised a KeyError
    # whenever inout_column != 'INOUT'
    _renderFigures(D_mov[D_mov['INOUT'] == '+'], 'INBOUND', 'IN')
    _renderFigures(D_mov[D_mov['INOUT'] == '-'], 'OUTBOUND', 'OUT')

    return figure_output
def calculateMultipleOptimalLocation(D_table, timeColumns, distanceType, latCol, lonCol, codeCol_node, descrCol_node, cleanOutliers=False, k=1, method='kmeans'):
    '''
    Define k facility locations using a clustering (aggregation) method.

    Each row of D_table is a node of the network.
    timeColumns is the list of columns of the time horizon holding the flow data.
    latCol identifies the latitude of the node.
    lonCol identifies the longitude of the node.
    codeCol_node is a column with the code of the node (the same appearing in plantListName).
    descrCol_node is a column with the description of the node.
    cleanOutliers, if True, uses IQR to remove latitude and longitude outliers.
    k is the number of optimal points to define.
    method is the method to cluster the points: kmeans, gmm.
    distanceType selects the metric: rectangular, gravity or euclidean.

    Returns a dataframe D_res with the ID, LATITUDE, LONGITUDE and YEAR for
    each flow, adding the columns COST and FLOW representing the distance
    travelled (COST) and the flow intensity (FLOW); FLOW_norm scales the
    flows between 0 and 100. Returns a dataframe D_res_optimal with the
    optimal latitude and longitude for each time frame and cluster, with a
    COST and FLOW column, and a dictionary of coverage statistics.
    On an unknown method it prints a message and returns ([], [], []).
    '''
    # clean the data and compute the coverage statistics
    output_coverages = {}
    analysisFieldList = [latCol, lonCol]
    outputCoverages, _ = getCoverageStats(D_table, analysisFieldList, capacityField=timeColumns[0])
    D_table = D_table.dropna(subset=[latCol, lonCol])
    if cleanOutliers:
        D_table, coverages, = cleanUsingIQR(D_table, [latCol, lonCol])
        outputCoverages = (coverages[0] * outputCoverages[0], coverages[1] * outputCoverages[1])
    output_coverages['coverages'] = pd.DataFrame(outputCoverages)

    # replace the remaining nulls with zeros
    D_table = D_table.fillna(0)

    # identify the years from the time columns
    yearsColumns = timeColumns

    # cluster the points.
    # FIX: assign the raw label array, not pd.DataFrame(labels): after
    # dropna/IQR cleaning D_table's index has gaps, so the previous
    # index-aligned assignment scattered NaNs into CLUSTER and silently
    # dropped rows from every cluster
    if method == 'kmeans':
        km = cluster.KMeans(n_clusters=k).fit(D_table[[latCol, lonCol]])
        D_table['CLUSTER'] = km.labels_
    elif method == 'gmm':
        gmm = GaussianMixture(n_components=k, covariance_type='full').fit(D_table[[latCol, lonCol]])
        D_table['CLUSTER'] = gmm.predict(D_table[[latCol, lonCol]])
    else:
        print("No valid clustering method")
        return [], [], []

    # prepare the output frames
    D_res = pd.DataFrame(columns=[codeCol_node, descrCol_node, latCol, lonCol, 'YEAR', 'COST', 'CLUSTER'])
    D_res_optimal = pd.DataFrame(columns=['PERIOD', latCol, lonCol, 'YEAR', 'COST', 'FLOW', 'CLUSTER'])

    # analyse each cluster separately
    for cluster_id in set(D_table['CLUSTER']):
        D_table_filtered = D_table[D_table['CLUSTER'] == cluster_id]
        for year in yearsColumns:
            D_filter_columns = [codeCol_node, descrCol_node, latCol, lonCol, year, 'CLUSTER']
            D_filtered = D_table_filtered[D_filter_columns]
            D_filtered = D_filtered.rename(columns={year: 'FLOW'})
            D_filtered['YEAR'] = year
            # define the optimal location for this cluster/year and price
            # every node against it
            if distanceType.lower() == 'rectangular':
                lat_optimal, lon_optimal = optimalLocationRectangularDistance(D_filtered, latCol, lonCol, 'FLOW')
                D_filtered['COST'] = func_rectangularDistanceCost(D_filtered[lonCol], D_filtered[latCol], lon_optimal, lat_optimal, D_filtered['FLOW'])
            elif distanceType.lower() == 'gravity':
                lat_optimal, lon_optimal = optimalLocationGravityProblem(D_filtered, latCol, lonCol, 'FLOW')
                D_filtered['COST'] = func_gravityDistanceCost(D_filtered[lonCol], D_filtered[latCol], lon_optimal, lat_optimal, D_filtered['FLOW'])
            elif distanceType.lower() == 'euclidean':
                lat_optimal, lon_optimal = optimalLocationEuclideanDistance(D_filtered, latCol, lonCol, 'FLOW')
                D_filtered['COST'] = func_euclideanDistanceCost(D_filtered[lonCol], D_filtered[latCol], lon_optimal, lat_optimal, D_filtered['FLOW'])
            D_res = D_res.append(D_filtered)
            # NOTE(review): COST/FLOW totals below accumulate over every
            # cluster/year processed so far, not just the current one —
            # confirm this is intended
            D_res_optimal = D_res_optimal.append(pd.DataFrame([[f"OPTIMAL LOCATION YEAR: {year}",
                                                                lat_optimal,
                                                                lon_optimal,
                                                                year,
                                                                sum(D_res['COST']),
                                                                sum(D_res['FLOW']),
                                                                cluster_id
                                                                ]], columns=D_res_optimal.columns))

    #D_res['COST_norm']=(D_res['COST']-min(D_res['COST']))/(max(D_res['COST'])-min(D_res['COST']))*10
    D_res['FLOW_norm'] = (D_res['FLOW'] - min(D_res['FLOW'])) / (max(D_res['FLOW']) - min(D_res['FLOW'])) * 100
    D_res = D_res.rename(columns={'COST': 'COST_TOBE'})
    return D_res, D_res_optimal, output_coverages
def calculateOptimalLocation(D_table, timeColumns, distanceType, latCol, lonCol, codeCol_node, descrCol_node, cleanOutliers=False):
    '''
    Locate the single optimal facility position for each time frame.

    Each row of D_table is a node of the network.
    timeColumns is the list of columns of the time horizon holding the flow data.
    latCol / lonCol identify the latitude and longitude of the node.
    codeCol_node is a column with the code of the node (the same appearing in plantListName).
    descrCol_node is a column with the description of the node.
    cleanOutliers, if True, uses IQR to remove latitude and longitude outliers.
    distanceType selects the metric: rectangular, gravity or euclidean.

    Returns a dataframe D_res with the ID, LATITUDE, LONGITUDE and YEAR for
    each flow, with COST (distance travelled, renamed COST_TOBE) and FLOW
    (flow intensity); FLOW_norm scales the flows between 0 and 100. Also
    returns a dataframe D_res_optimal with the optimal latitude and longitude
    for each time frame (with total COST and FLOW) and a dictionary of
    coverage statistics.
    '''
    # clean the data and compute the coverage statistics
    output_coverages = {}
    stats, _ = getCoverageStats(D_table, [latCol, lonCol], capacityField=timeColumns[0])
    D_table = D_table.dropna(subset=[latCol, lonCol])
    if cleanOutliers:
        D_table, iqr_cov = cleanUsingIQR(D_table, [latCol, lonCol])
        stats = (iqr_cov[0] * stats[0], iqr_cov[1] * stats[1])
    output_coverages['coverages'] = pd.DataFrame(stats)

    # any flow value still null becomes zero
    D_table = D_table.fillna(0)

    # prepare the output frames
    D_res = pd.DataFrame(columns=[codeCol_node, descrCol_node, latCol, lonCol, 'YEAR', 'COST', ])
    D_res_optimal = pd.DataFrame(columns=['PERIOD', latCol, lonCol, 'YEAR', 'COST', 'FLOW'])

    metric = distanceType.lower()
    for year in timeColumns:
        yearly = D_table[[codeCol_node, descrCol_node, latCol, lonCol, year]]
        yearly = yearly.rename(columns={year: 'FLOW'})
        yearly['YEAR'] = year

        # locate the optimal point for this year and price every node against it
        if metric == 'rectangular':
            lat_optimal, lon_optimal = optimalLocationRectangularDistance(yearly, latCol, lonCol, 'FLOW')
            yearly['COST'] = func_rectangularDistanceCost(yearly[lonCol], yearly[latCol], lon_optimal, lat_optimal, yearly['FLOW'])
        elif metric == 'gravity':
            lat_optimal, lon_optimal = optimalLocationGravityProblem(yearly, latCol, lonCol, 'FLOW')
            yearly['COST'] = func_gravityDistanceCost(yearly[lonCol], yearly[latCol], lon_optimal, lat_optimal, yearly['FLOW'])
        elif metric == 'euclidean':
            lat_optimal, lon_optimal = optimalLocationEuclideanDistance(yearly, latCol, lonCol, 'FLOW')
            yearly['COST'] = func_euclideanDistanceCost(yearly[lonCol], yearly[latCol], lon_optimal, lat_optimal, yearly['FLOW'])

        D_res = D_res.append(yearly)
        optimal_row = pd.DataFrame([[f"OPTIMAL LOCATION YEAR: {year}",
                                     lat_optimal,
                                     lon_optimal,
                                     year,
                                     sum(D_res['COST']),
                                     sum(D_res['FLOW']),
                                     ]], columns=D_res_optimal.columns)
        D_res_optimal = D_res_optimal.append(optimal_row)

    #D_res['COST_norm']=(D_res['COST']-min(D_res['COST']))/(max(D_res['COST'])-min(D_res['COST']))*10
    D_res['FLOW_norm'] = (D_res['FLOW'] - min(D_res['FLOW'])) / (max(D_res['FLOW']) - min(D_res['FLOW'])) * 100
    D_res = D_res.rename(columns={'COST': 'COST_TOBE'})
    return D_res, D_res_optimal, output_coverages
def defineDistanceTableEstimator(D_mov, lonCol_From_mov, latCol_From_mov, lonCol_To_mov, latCol_To_mov, G, cleanOutliersCoordinates=False, capacityField='QUANTITY'):
    '''
    Benchmark euclidean/rectangular/gravity distance estimators against the
    real road distance computed on graph G.

    D_mov is the dataframe with movements.
    lonCol_From_mov / latCol_From_mov are the D_mov columns with the longitude
    and latitude of the loading node.
    lonCol_To_mov / latCol_To_mov are the D_mov columns with the longitude and
    latitude of the discharging node.
    G is a road graph obtained with osmnx.
    cleanOutliersCoordinates is True to remove outliers in latitude and longitude.
    capacityField is a field of capacity to measure the coverage statistics on it.

    Returns the distance table (one row per distinct origin-destination pair,
    with the real road distance and the three estimates) and a dataframe with
    the coverage statistics; prints the MSE of each estimator.
    '''
    # clean data and get coverages
    analysisFieldList = [lonCol_From_mov, latCol_From_mov, lonCol_To_mov, latCol_To_mov]
    coverages, _ = getCoverageStats(D_mov, analysisFieldList, capacityField=capacityField)
    D_dist = D_mov[[lonCol_From_mov, latCol_From_mov, lonCol_To_mov, latCol_To_mov]].drop_duplicates().dropna().reset_index()
    if cleanOutliersCoordinates:
        D_dist, coverages_outl = cleanUsingIQR(D_dist, [lonCol_From_mov, latCol_From_mov, lonCol_To_mov, latCol_To_mov])
        coverages = (coverages[0] * coverages_outl[0], coverages[1] * coverages_outl[1])
    df_coverages = pd.DataFrame(coverages)

    D_dist['REAL_DISTANCE'] = np.nan
    D_dist['MERCATOR_X_FROM'] = np.nan
    D_dist['MERCATOR_Y_FROM'] = np.nan
    D_dist['MERCATOR_X_TO'] = np.nan
    D_dist['MERCATOR_Y_TO'] = np.nan

    for index, row in D_dist.iterrows():
        # get the coordinates
        lonFrom = row[lonCol_From_mov]
        latFrom = row[latCol_From_mov]
        lonTo = row[lonCol_To_mov]
        latTo = row[latCol_To_mov]

        # snap both endpoints to the closest node on the graph and measure
        # the shortest road path between them
        node_from = ox.get_nearest_node(G, (latFrom, lonFrom), method='euclidean')
        node_to = ox.get_nearest_node(G, (latTo, lonTo), method='euclidean')
        length = nx.shortest_path_length(G=G, source=node_from, target=node_to, weight='length')
        # FIX: write via df.loc[row, col]; the original chained assignment
        # (df[col].loc[idx] = v) writes to a temporary under copy-on-write
        # pandas and the distances would remain NaN
        D_dist.loc[index, 'REAL_DISTANCE'] = length

        # convert into mercator coordinates
        x_merc_from, y_merc_from = mercatorProjection(latFrom, lonFrom)
        x_merc_to, y_merc_to = mercatorProjection(latTo, lonTo)
        D_dist.loc[index, 'MERCATOR_X_FROM'] = x_merc_from
        D_dist.loc[index, 'MERCATOR_Y_FROM'] = y_merc_from
        D_dist.loc[index, 'MERCATOR_X_TO'] = x_merc_to
        D_dist.loc[index, 'MERCATOR_Y_TO'] = y_merc_to

    # distance estimates; the 1000 factor presumably converts km to metres
    # to match the graph's 'length' unit — TODO confirm against mercatorProjection
    D_dist['EUCLIDEAN_DISTANCE'] = 1000 * func_euclideanDistanceCost(D_dist['MERCATOR_X_FROM'], D_dist['MERCATOR_Y_FROM'], D_dist['MERCATOR_X_TO'], D_dist['MERCATOR_Y_TO'], 1)
    D_dist['RECTANGULAR_DISTANCE'] = 1000 * func_rectangularDistanceCost(D_dist['MERCATOR_X_FROM'], D_dist['MERCATOR_Y_FROM'], D_dist['MERCATOR_X_TO'], D_dist['MERCATOR_Y_TO'], 1)
    D_dist['GRAVITY_DISTANCE'] = 1000 * func_gravityDistanceCost(D_dist['MERCATOR_X_FROM'], D_dist['MERCATOR_Y_FROM'], D_dist['MERCATOR_X_TO'], D_dist['MERCATOR_Y_TO'], 1)

    # mean squared error of each estimator against the real road distance
    error_euclidean = mean_squared_error(D_dist['REAL_DISTANCE'], D_dist['EUCLIDEAN_DISTANCE'])
    error_rectangular = mean_squared_error(D_dist['REAL_DISTANCE'], D_dist['RECTANGULAR_DISTANCE'])
    error_gravity = mean_squared_error(D_dist['REAL_DISTANCE'], D_dist['GRAVITY_DISTANCE'])
    print(f"MSE EUCLIDEAN: {np.round(error_euclidean,2)}")
    print(f"MSE RECTANGULAR: {np.round(error_rectangular,2)}")
    print(f"MSE GRAVITY: {np.round(error_gravity,2)}")

    return D_dist, df_coverages