def test_get_dataset_dtypes(self):
    """Check the dtype map returned by ``get_dataset_dtypes(None)``.

    Every expected column is reported with dtype ``'O'`` and nothing else.
    """
    # Columns expected per table; each one maps to {'dtype': 'O'}.
    object_columns = {
        'airlines': ('carrier',),
        'airports': ('dest',),
        'flights': ('dest', 'carrier', 'flight_id'),
        'trip_logs': ('flight_id',),
    }
    expected = {
        table: {column: {'dtype': 'O'} for column in columns}
        for table, columns in object_columns.items()
    }

    result = get_dataset_dtypes(None)
    self.assertEqual(expected, result)
    def test_get_dataset_dtypes(self):
        """Follow the relationship dict through every discovery stage.

        The dict returned by ``get_dataset_dtypes(None)`` is passed in turn to
        ``find_related_cols_by_name``, ``find_related_cols_by_content`` and
        ``find_parent_child_relationships`` (each called with ``None`` as its
        first argument) and compared with the expected dict after every call.
        None of the expectations contains a ``'key_candidate'`` entry.
        """
        # (table, column) -> (related 'table.column', expected 'type' value)
        links = {
            ('airlines', 'carrier'): ('flights.carrier', 'Parent'),
            ('airports', 'dest'): ('flights.dest', 'Parent'),
            ('flights', 'dest'): ('airports.dest', 'Child'),
            ('flights', 'carrier'): ('airlines.carrier', 'Child'),
            ('flights', 'flight_id'): ('trip_logs.flight_id', 'Parent'),
            ('trip_logs', 'flight_id'): ('flights.flight_id', 'Child'),
        }

        def expect(describe):
            """Build the nested table -> column -> info dict from ``links``."""
            tables = {}
            for (table, column), (target, role) in links.items():
                tables.setdefault(table, {})[column] = describe(target, role)
            return tables

        # Stage 1: only the dtype of each column is present.
        expected = expect(lambda target, role: {'dtype': 'O'})
        result = get_dataset_dtypes(None)
        self.assertEqual(expected, result)

        # Stage 2: each column lists its related column with empty details.
        expected = expect(lambda target, role: {
            'dtype': 'O',
            'relationships': [{target: {}}],
        })
        result = find_related_cols_by_name(None, result)
        self.assertEqual(expected, result)

        # The content-based search is checked against the same expectation.
        result = find_related_cols_by_content(None, result)
        self.assertEqual(expected, result)

        # Stage 3: every relationship now carries its 'type'.
        expected = expect(lambda target, role: {
            'dtype': 'O',
            'relationships': [{target: {'type': role}}],
        })
        result = find_parent_child_relationships(None, result)
        self.assertEqual(expected, result)
# Example #3
# 0
    def test_find_primary_key_candidates(self):
        """Check ``find_primary_key_candidates`` when called with ``None``.

        The dtype map from ``get_dataset_dtypes(None)`` is verified first and
        then handed to ``find_primary_key_candidates``; the result must carry
        a ``'key_candidate'`` flag on every column.
        """
        # Expected 'key_candidate' flag for every column, grouped by table.
        candidate_flags = {
            'airlines': {'carrier': True},
            'airports': {'dest': True},
            'flights': {'dest': False, 'carrier': False, 'flight_id': True},
            'trip_logs': {'flight_id': False},
        }

        # Get initial relationships_dict
        expected = {
            table: {column: {'dtype': 'O'} for column in flags}
            for table, flags in candidate_flags.items()
        }
        result = get_dataset_dtypes(None)
        self.assertEqual(expected, result)

        # Same columns, now with the key-candidate flag added.
        expected = {
            table: {
                column: {'dtype': 'O', 'key_candidate': is_candidate}
                for column, is_candidate in flags.items()
            }
            for table, flags in candidate_flags.items()
        }
        result = find_primary_key_candidates(None, result)
        self.assertEqual(expected, result)
    def test_find_primary_key_candidates(self):
        """Check key-candidate detection against the CSV files on disk.

        The dtype map from ``get_dataset_dtypes(None)`` is verified first.
        The four tables are then read from ``<repository root>/data`` and
        passed to ``find_primary_key_candidates`` with that map. Every column
        of every table must come back with a ``'key_candidate'`` flag, and the
        columns from the dtype map must keep their ``'dtype'`` entry.
        """
        # Columns that carry {'dtype': 'O'} in the initial relationships dict.
        typed_columns = {
            'airlines': ('carrier',),
            'airports': ('dest',),
            'flights': ('dest', 'carrier', 'flight_id'),
            'trip_logs': ('flight_id',),
        }

        # Get initial relationships_dict
        expected = {
            table: {column: {'dtype': 'O'} for column in columns}
            for table, columns in typed_columns.items()
        }
        result = get_dataset_dtypes(None)
        self.assertEqual(expected, result)

        # Load each table from <repository root>/data/<table>/<table>.csv.
        repo_root = git.Repo(
            '.', search_parent_directories=True).working_tree_dir
        data_dir = os.path.join(repo_root, 'data')
        dataframe_dict = {}
        for table in ('airlines', 'flights', 'airports', 'trip_logs'):
            csv_path = os.path.join(data_dir, table, table + '.csv')
            dataframe_dict[table] = pd.read_csv(csv_path)

        # Every column expected in the result, grouped by table.
        all_columns = {
            'airlines': ('carrier',),
            'airports': ('dest', 'dest_city', 'dest_state'),
            'flights': (
                'carrier', 'dest', 'distance_group', 'first_trip_logs_time',
                'flight_id', 'flight_num', 'origin', 'origin_city',
                'origin_state',
            ),
            'trip_logs': (
                'air_time', 'arr_delay', 'arr_time', 'canceled',
                'carrier_delay', 'date_scheduled', 'dep_delay', 'dep_time',
                'distance', 'diverted', 'flight_id', 'late_aircraft_delay',
                'national_airspace_delay', 'scheduled_arr_time',
                'scheduled_dep_time', 'scheduled_elapsed_time',
                'security_delay', 'taxi_in', 'taxi_out', 'trip_log_id',
                'weather_delay',
            ),
        }
        # The only columns expected to be flagged as key candidates.
        key_columns = {
            ('airlines', 'carrier'),
            ('airports', 'dest'),
            ('flights', 'flight_id'),
            ('trip_logs', 'trip_log_id'),
        }

        expected = {}
        for table, columns in all_columns.items():
            table_info = {}
            for column in columns:
                info = {'key_candidate': (table, column) in key_columns}
                # Only columns from the initial dict have a dtype recorded.
                if column in typed_columns[table]:
                    info['dtype'] = 'O'
                table_info[column] = info
            expected[table] = table_info

        result = find_primary_key_candidates(dataframe_dict, result)
        self.assertEqual(expected, result)
    def test_find_related_cols_by_name(self):
        """Check ``find_related_cols_by_name`` against the CSV files on disk.

        Starting from ``get_dataset_dtypes(None)``, the four tables are read
        from ``<repository root>/data`` and passed to the function under test.
        Each column must come back with its dtype and a one-element
        ``'relationships'`` list naming the related column with empty details.
        No ``'key_candidate'`` entry is expected.
        """
        result = get_dataset_dtypes(None)

        # column -> related 'table.column', grouped by table.
        related_columns = {
            'airlines': {'carrier': 'flights.carrier'},
            'airports': {'dest': 'flights.dest'},
            'flights': {
                'dest': 'airports.dest',
                'carrier': 'airlines.carrier',
                'flight_id': 'trip_logs.flight_id',
            },
            'trip_logs': {'flight_id': 'flights.flight_id'},
        }
        expected = {
            table: {
                column: {'dtype': 'O', 'relationships': [{target: {}}]}
                for column, target in columns.items()
            }
            for table, columns in related_columns.items()
        }

        # Load each table from <repository root>/data/<table>/<table>.csv.
        repo_root = git.Repo(
            '.', search_parent_directories=True).working_tree_dir
        data_dir = os.path.join(repo_root, 'data')
        dataframe_dict = {}
        for table in ('airlines', 'flights', 'airports', 'trip_logs'):
            csv_path = os.path.join(data_dir, table, table + '.csv')
            dataframe_dict[table] = pd.read_csv(csv_path)

        result = find_related_cols_by_name(dataframe_dict, result)
        self.assertEqual(expected, result)
# Example #6
# 0
        if not os.path.exists(file_with_path):
            if click.confirm('OK to download demo featuretools data?',
                             default=False):
                es = ft.demo.load_flight(verbose=True)
                save_demo_data(es, file_list)
                break


# demonstration - this will be removed later
if __name__ == "__main__":
    # Show which interpreter is running the demonstration.
    print(sys.version, sys.executable, sep="\n")

    # Download example data (if it doesn't exist)
    download_data()

    print(dt.load_csv_to_df(None))

    relationship_dict = dt.get_dataset_dtypes(None)
    print(relationship_dict)

    # Refine the dict one stage at a time, printing it after every stage.
    # Each stage is looked up on ``dt`` only when its turn comes.
    for stage_name in ('find_primary_key_candidates',
                       'find_related_cols_by_name',
                       'find_parent_child_relationships'):
        relationship_dict = getattr(dt, stage_name)(None, relationship_dict)
        print(relationship_dict)
# Example #7
# 0
    download_data()

    # Load the csv files into dataframes
    print('=================')
    print('Loading CSV Files')
    dataframe_dict = dt.load_csv_to_df('data',
                                       include_hidden=False,
                                       traverse_subdir=True,
                                       ignore_errors=True,
                                       follow_symlink=False)
    print('Found the following tables:')
    print(dataframe_dict.keys())

    print('================')
    print("get datatypes...")
    relationship_dict = dt.get_dataset_dtypes(dataframe_dict)
    pp.pprint(relationship_dict)

    print('===================')
    print("get primary keys...")
    relationship_dict = dt.find_primary_key_candidates(dataframe_dict, relationship_dict)
    pp.pprint(relationship_dict)

    print('===============================')
    print('Finding related columns by name')
    relationship_dict = dt.find_related_cols_by_name(dataframe_dict, relationship_dict)
    # print('standard relationship dict unfiltered for relationships: ')
    pp.pprint(relationship_dict)

    print('===============================')
    print('Find related columns by content')