def test_get_dataset_dtypes(self):
    """Check the dict returned by ``get_dataset_dtypes(None)``.

    The expected value maps table name -> column name -> ``{'dtype': 'O'}``
    for the columns listed below.
    """
    columns_by_table = {
        'airlines': ('carrier',),
        'airports': ('dest',),
        'flights': ('dest', 'carrier', 'flight_id'),
        'trip_logs': ('flight_id',),
    }
    # Every listed column is expected to carry the same single dtype entry.
    expected = {
        table: {column: {'dtype': 'O'} for column in columns}
        for table, columns in columns_by_table.items()
    }

    result = get_dataset_dtypes(None)

    self.assertEqual(expected, result)
def test_get_dataset_dtypes(self):
    """Run the relationship dict through each discovery step and check it.

    Steps, in order: ``get_dataset_dtypes``, ``find_related_cols_by_name``,
    ``find_related_cols_by_content`` and ``find_parent_child_relationships``.
    Every step is called with ``None`` as its first argument and, after the
    first, with the previous step's result. ``key_candidate`` is not part of
    any expected dict here; this test never calls
    ``find_primary_key_candidates``.
    """
    # table -> column -> (related column, relationship type for this column)
    links = {
        'airlines': {'carrier': ('flights.carrier', 'Parent')},
        'airports': {'dest': ('flights.dest', 'Parent')},
        'flights': {
            'dest': ('airports.dest', 'Child'),
            'carrier': ('airlines.carrier', 'Child'),
            'flight_id': ('trip_logs.flight_id', 'Parent'),
        },
        'trip_logs': {'flight_id': ('flights.flight_id', 'Child')},
    }

    def expected_for(describe):
        """Build table -> column -> info, with a fresh info dict per column."""
        return {
            table: {
                column: describe(target, kind)
                for column, (target, kind) in columns.items()
            }
            for table, columns in links.items()
        }

    # Step 1: only the dtype is recorded.
    expected = expected_for(lambda target, kind: {'dtype': 'O'})
    result = get_dataset_dtypes(None)
    self.assertEqual(expected, result)

    # Step 2: each column gains one relationship with an empty detail dict.
    expected = expected_for(
        lambda target, kind: {'dtype': 'O', 'relationships': [{target: {}}]}
    )
    result = find_related_cols_by_name(None, result)
    self.assertEqual(expected, result)

    # Step 3: the content-based pass is asserted against the same expectation.
    result = find_related_cols_by_content(None, result)
    self.assertEqual(expected, result)

    # Step 4: every relationship is labelled 'Parent' or 'Child'.
    expected = expected_for(
        lambda target, kind: {
            'dtype': 'O',
            'relationships': [{target: {'type': kind}}],
        }
    )
    result = find_parent_child_relationships(None, result)
    self.assertEqual(expected, result)
def test_find_primary_key_candidates(self):
    """Check ``find_primary_key_candidates`` when called with ``None``.

    The dict from ``get_dataset_dtypes(None)`` is verified first, then passed
    on; the second result must add a ``key_candidate`` flag to every column.
    """
    # table -> column -> expected value of 'key_candidate'
    key_flags = {
        'airlines': {'carrier': True},
        'airports': {'dest': True},
        'flights': {'dest': False, 'carrier': False, 'flight_id': True},
        'trip_logs': {'flight_id': False},
    }

    # Starting point: the relationships dict holds dtypes only.
    expected = {
        table: {column: {'dtype': 'O'} for column in flags}
        for table, flags in key_flags.items()
    }
    result = get_dataset_dtypes(None)
    self.assertEqual(expected, result)

    # After the key search each column also reports its flag.
    expected = {
        table: {
            column: {'dtype': 'O', 'key_candidate': is_key}
            for column, is_key in flags.items()
        }
        for table, flags in key_flags.items()
    }
    result = find_primary_key_candidates(None, result)
    self.assertEqual(expected, result)
def test_find_primary_key_candidates(self):
    """Check ``find_primary_key_candidates`` against the CSV demo tables.

    The dtype dict from ``get_dataset_dtypes(None)`` is verified first. The
    four CSV files under ``<repo root>/data`` are then loaded and handed to
    ``find_primary_key_candidates`` together with that dict. The expected
    result has an entry with ``key_candidate`` for every column listed below;
    only columns already present in the dtype dict keep their ``dtype``.
    """
    # Starting point: the relationships dict holds dtypes only.
    dtypes_only = {
        'airlines': {'carrier': {'dtype': 'O'}},
        'airports': {'dest': {'dtype': 'O'}},
        'flights': {
            'dest': {'dtype': 'O'},
            'carrier': {'dtype': 'O'},
            'flight_id': {'dtype': 'O'},
        },
        'trip_logs': {'flight_id': {'dtype': 'O'}},
    }
    result = get_dataset_dtypes(None)
    self.assertEqual(dtypes_only, result)

    # The data directory sits at the top of the enclosing git working tree.
    repo_root = git.Repo('.', search_parent_directories=True).working_tree_dir
    data = os.path.join(repo_root, 'data')
    dataframe_dict = {
        table: pd.read_csv(os.path.join(data, table, table + '.csv'))
        for table in ('airlines', 'flights', 'airports', 'trip_logs')
    }

    # Columns expected in the result, per table.
    columns_by_table = {
        'airlines': ['carrier'],
        'airports': ['dest', 'dest_city', 'dest_state'],
        'flights': [
            'carrier', 'dest', 'distance_group', 'first_trip_logs_time',
            'flight_id', 'flight_num', 'origin', 'origin_city',
            'origin_state',
        ],
        'trip_logs': [
            'air_time', 'arr_delay', 'arr_time', 'canceled',
            'carrier_delay', 'date_scheduled', 'dep_delay', 'dep_time',
            'distance', 'diverted', 'flight_id', 'late_aircraft_delay',
            'national_airspace_delay', 'scheduled_arr_time',
            'scheduled_dep_time', 'scheduled_elapsed_time',
            'security_delay', 'taxi_in', 'taxi_out', 'trip_log_id',
            'weather_delay',
        ],
    }
    # The only columns expected to be flagged as key candidates.
    key_columns = {
        ('airlines', 'carrier'),
        ('airports', 'dest'),
        ('flights', 'flight_id'),
        ('trip_logs', 'trip_log_id'),
    }

    expected = {}
    for table, columns in columns_by_table.items():
        expected[table] = {}
        for column in columns:
            # Start from a copy of the dtype entry when one exists.
            info = dict(dtypes_only[table].get(column, {}))
            info['key_candidate'] = (table, column) in key_columns
            expected[table][column] = info

    result = find_primary_key_candidates(dataframe_dict, result)
    self.assertEqual(expected, result)
def test_find_related_cols_by_name(self):
    """Check ``find_related_cols_by_name`` against the CSV demo tables.

    The dict from ``get_dataset_dtypes(None)`` is passed in together with the
    four CSV tables under ``<repo root>/data``. Each column listed below must
    come back with one relationship whose detail dict is empty.
    ``key_candidate`` is not part of the expectation; this test never calls
    ``find_primary_key_candidates``.
    """
    result = get_dataset_dtypes(None)

    # table -> column -> the related column expected for it
    related_column = {
        'airlines': {'carrier': 'flights.carrier'},
        'airports': {'dest': 'flights.dest'},
        'flights': {
            'dest': 'airports.dest',
            'carrier': 'airlines.carrier',
            'flight_id': 'trip_logs.flight_id',
        },
        'trip_logs': {'flight_id': 'flights.flight_id'},
    }
    expected = {
        table: {
            column: {'dtype': 'O', 'relationships': [{target: {}}]}
            for column, target in columns.items()
        }
        for table, columns in related_column.items()
    }

    # The data directory sits at the top of the enclosing git working tree.
    repo_root = git.Repo('.', search_parent_directories=True).working_tree_dir
    data = os.path.join(repo_root, 'data')
    dataframe_dict = {
        table: pd.read_csv(os.path.join(data, table, table + '.csv'))
        for table in ('airlines', 'flights', 'airports', 'trip_logs')
    }

    result = find_related_cols_by_name(dataframe_dict, result)
    self.assertEqual(expected, result)
if not os.path.exists(file_with_path): if click.confirm('OK to download demo featuretools data?', default=False): es = ft.demo.load_flight(verbose=True) save_demo_data(es, file_list) break # demonstration - this will be removed later if __name__ == "__main__": print(sys.version) print(sys.executable) # Download example data (if it doesn't exist) download_data() print(dt.load_csv_to_df(None)) relationship_dict = dt.get_dataset_dtypes(None) print(relationship_dict) relationship_dict = dt.find_primary_key_candidates(None, relationship_dict) print(relationship_dict) relationship_dict = dt.find_related_cols_by_name(None, relationship_dict) print(relationship_dict) relationship_dict = dt.find_parent_child_relationships( None, relationship_dict) print(relationship_dict)
download_data() # Load the csv files into dataframes print('=================') print('Loading CSV Files') dataframe_dict = dt.load_csv_to_df('data', include_hidden=False, traverse_subdir=True, ignore_errors=True, follow_symlink=False) print('Found the following tables:') print(dataframe_dict.keys()) print('================') print("get datatypes...") relationship_dict = dt.get_dataset_dtypes(dataframe_dict) pp.pprint(relationship_dict) print('===================') print("get primary keys...") relationship_dict = dt.find_primary_key_candidates(dataframe_dict, relationship_dict) pp.pprint(relationship_dict) print('===============================') print('Finding related columns by name') relationship_dict = dt.find_related_cols_by_name(dataframe_dict, relationship_dict) # print('standard relationship dict unfiltered for relationships: ') pp.pprint(relationship_dict) print('===============================') print('Find related columns by content')