def main(argv):
  del argv  # Unused.
  logging.info("Creating Wikigeo dataset.")

  # Extract items.
  if FLAGS.osm_path is not None:
    output_prefix = FLAGS.region.lower() + '_osm'
    results = extract.get_data_by_region_with_osm(
      regions.get_region(FLAGS.region), FLAGS.osm_path)
  else:
    output_prefix = FLAGS.region.lower()
    results = extract.get_data_by_region(regions.get_region(FLAGS.region))
  logging.info(f'Found {len(results)} items.')

  # Split data into train, dev, and test sets.
  splits = extract.split_dataset(results, 0.8, 0.1)

  # Output to disk.
  if not os.path.exists(FLAGS.output_dir):
    os.makedirs(FLAGS.output_dir)
  for split_name, split_data in splits.items():
    logging.info(f'The size of the {split_name} set is {len(split_data)}.')
    output_path = os.path.join(
      FLAGS.output_dir, f'{output_prefix}_{split_name}.json')
    extract.write_files(output_path, split_data)
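# Hedged sketch of the split ratios passed above: the arguments 0.8 and 0.1
# are assumed to mean 80% train, 10% dev, and the remaining 10% test.
# extract.split_dataset may shuffle or partition differently; this is only an
# illustration of the assumed semantics, not the project's implementation.
def split_80_10_10(items):
  n = len(items)
  train_end = int(0.8 * n)
  dev_end = int(0.9 * n)
  return {
    'train': items[:train_end],
    'dev': items[train_end:dev_end],
    'test': items[dev_end:],
  }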
def setUpClass(cls):
  # Set up the test regions and query the Wikidata items within them.
  cls.manhattan_region = regions.get_region('Manhattan')
  cls.pittsburgh_region = regions.get_region('Pittsburgh')
  cls.items_lean = query.get_geofenced_wikidata_items(
    cls.manhattan_region)
  cls.items_info = query.get_geofenced_info_wikidata_items(
    cls.manhattan_region)
def create_dataset(data_dir: Text, region: Text, s2level: int) -> Tuple[
    CabbyDataset, CabbyDataset, CabbyDataset, Dict[int, int]]:
  '''Loads the data and creates the train, validate and test datasets.
  Arguments:
    data_dir: The directory of the data.
    region: The region of the data.
    s2level: The s2level of the cells.
  Returns:
    The train, validate and test sets and the dictionary of labels to cellids.
  '''
  train_ds = pd.read_json(data_dir + '/' + 'train.json')
  valid_ds = pd.read_json(data_dir + '/' + 'dev.json')
  test_ds = pd.read_json(data_dir + '/' + 'test.json')

  # Get labels.
  get_region = regions.get_region(region)
  unique_cellid = util.cellids_from_polygon(get_region, s2level)
  label_to_cellid = {idx: cellid for idx, cellid in enumerate(unique_cellid)}
  cellid_to_label = {cellid: idx for idx, cellid in enumerate(unique_cellid)}

  # Create Cabby dataset.
  train_dataset = CabbyDataset(train_ds, s2level, cellid_to_label)
  val_dataset = CabbyDataset(valid_ds, s2level, cellid_to_label)
  test_dataset = CabbyDataset(test_ds, s2level, cellid_to_label)

  return train_dataset, val_dataset, test_dataset, label_to_cellid
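# Hedged usage sketch for create_dataset above. It assumes CabbyDataset
# behaves like a torch.utils.data.Dataset, that the train/dev/test JSON files
# exist under the (hypothetical) path below, and that an s2level of 12 is a
# sensible choice; all of these are illustrative assumptions.
import torch

train_set, val_set, test_set, label_to_cellid = create_dataset(
  './data/wikigeo/pittsburgh', 'Pittsburgh', 12)

train_loader = torch.utils.data.DataLoader(
  train_set, batch_size=32, shuffle=True)

# A predicted class index (e.g. an argmax over model logits) maps back to an
# S2 cell id through label_to_cellid.
predicted_label = 7
predicted_cellid = label_to_cellid[predicted_label]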
def main(argv):
  del argv  # Unused.
  map_region = map_structure.Map(
    regions.get_region(FLAGS.region), FLAGS.min_s2_level, FLAGS.directory)

  # Create a file with multiple layers of data.
  walker = walk.Walker(map_region)
  walker.generate_and_save_rvs_routes(FLAGS.path, FLAGS.n_samples, FLAGS.n_cpu)
def create_dataset(
    data_dir: str,
    region: str,
    s2level: int,
    infer_only: bool = False
) -> dataset_item.TextGeoDataset:
  '''Loads data and creates the train, validate and test datasets.
  Arguments:
    data_dir: The directory of the data.
    region: The region of the data.
    s2level: The s2level of the cells.
    infer_only: If True, only the test set is created.
  Returns:
    A TextGeoDataset holding the train, validate and test sets and the
    dictionary of labels to cellids.
  '''
  train_ds = pd.read_json(os.path.join(data_dir, 'train.json'))
  valid_ds = pd.read_json(os.path.join(data_dir, 'dev.json'))
  test_ds = pd.read_json(os.path.join(data_dir, 'test.json'))

  # Get labels.
  get_region = regions.get_region(region)
  unique_cellid = gutil.cellids_from_polygon(get_region.polygon, s2level)
  label_to_cellid = {idx: cellid for idx, cellid in enumerate(unique_cellid)}
  cellid_to_label = {cellid: idx for idx, cellid in enumerate(unique_cellid)}

  points = gutil.get_centers_from_s2cellids(unique_cellid)
  unique_cells_df = pd.DataFrame({'point': points, 'cellid': unique_cellid})
  unique_cells_df['far'] = unique_cells_df.point.apply(
    lambda x: gutil.far_cellid(x, unique_cells_df))

  vec_cells = util.binary_representation(
    unique_cells_df.cellid.to_numpy(), dim=CELLID_DIM)
  tens_cells = torch.tensor(vec_cells)

  # Create WikiGeo dataset.
  train_dataset = None
  val_dataset = None
  logging.info("Starting to create the splits")
  if not infer_only:
    train_dataset = TextGeoSplit(
      train_ds, s2level, unique_cells_df, cellid_to_label)
    logging.info(
      f"Finished creating the train set with {len(train_dataset)} samples")
    val_dataset = TextGeoSplit(
      valid_ds, s2level, unique_cells_df, cellid_to_label)
    logging.info(
      f"Finished creating the validation set with {len(val_dataset)} samples")
  test_dataset = TextGeoSplit(
    test_ds, s2level, unique_cells_df, cellid_to_label)
  logging.info(
    f"Finished creating the test set with {len(test_dataset)} samples")

  return dataset_item.TextGeoDataset.from_TextGeoSplit(
    train_dataset, val_dataset, test_dataset, np.array(unique_cellid),
    tens_cells, label_to_cellid)
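# Minimal standalone sketch (not the project's util.binary_representation) of
# one way a 64-bit S2 cell id could be encoded as a fixed-width binary vector;
# the real helper and the value of CELLID_DIM may differ from this assumption.
import numpy as np
import torch

def cellid_to_bits(cellid: int, dim: int = 64) -> np.ndarray:
  # Most-significant bit first; one float per bit position.
  return np.array(
    [(cellid >> shift) & 1 for shift in range(dim - 1, -1, -1)],
    dtype=np.float32)

# Stack a few (illustrative) cell ids into a matrix and wrap it in a tensor,
# mirroring how vec_cells and tens_cells are built above.
cells = [9926595690882924544, 9926595693030408192]
tens = torch.tensor(np.stack([cellid_to_bits(c) for c in cells]))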
def main(argv): del argv # Unused. logging.info("Starting to build map of {} at level {}.".format( FLAGS.region, FLAGS.min_s2_level)) map = map_structure.Map(regions.get_region(FLAGS.region), FLAGS.min_s2_level) logging.info("Created map of {} at level {}.".format( FLAGS.region, FLAGS.min_s2_level)) if FLAGS.directory is not None: # Write to disk. map.write_map(FLAGS.directory) logging.info("Map written to => {}".format(FLAGS.directory)) # Load from disk. map_new = map_structure.Map(regions.get_region(FLAGS.region), FLAGS.min_s2_level, FLAGS.directory) logging.info('Number of POI found: {0}'.format(map_new.poi.shape[0]))
def testQueryWithOSM(self):
  samples = extract.get_data_by_region_with_osm(
    regions.get_region('Pittsburgh_small'))
  self.assertEqual(samples[0].sample_type, 'Wikipedia_page')

  wikidata_sample = samples[4]
  self.assertEqual(wikidata_sample.sample_type, 'Wikidata')
  self.assertEqual(
    wikidata_sample.text,
    ('Renaissance Revival architecture, building, building in '
     'Pennsylvania, United States, Birmingham Public School.'))

  foundFigleaf = False
  for sample in samples:
    if sample.title == 'Figleaf':
      foundFigleaf = True
      self.assertEqual(sample.sample_type, 'OSM')
      self.assertEqual(
        sample.text, 'Figleaf and building and East Carson Street.')
  self.assertTrue(foundFigleaf)
def testSingleOutput(self):
  # Get Pittsburgh items. Also tests cabby.geo.util.item and
  # cabby.data.wikidata.query.
  pittsburgh_region = regions.get_region('Pittsburgh')
  pittsburgh_items = [
    item.WikidataEntity.from_sparql_result(result)
    for result in query.get_geofenced_wikidata_items(pittsburgh_region)
  ]
  pittsburgh_index = {e.qid: e for e in pittsburgh_items}

  # Select five POIs in Pittsburgh.
  market_square = pittsburgh_index['Q6770726']
  warhol_museum = pittsburgh_index['Q751172']
  carnegie_library = pittsburgh_index['Q5043895']
  reserve_bank = pittsburgh_index['Q5440376']
  heinz_hall = pittsburgh_index['Q12059806']

  # Check computed distances from Market Square to the other POIs.
  goal = market_square.location
  pois = [warhol_museum, carnegie_library, reserve_bank, heinz_hall]
  obtained_distances = observe.get_all_distances(goal, pois)
  expected_distances = {
    'Q751172': 0.1497106143476055,
    'Q5043895': 0.39191208288190965,
    'Q5440376': 0.8607457546797966,
    'Q12059806': 0.09590394273539078
  }
  for qid, expected_distance in expected_distances.items():
    self.assertIn(qid, obtained_distances.keys())
    self.assertAlmostEqual(obtained_distances[qid], expected_distance)

  start = Point(-79.992383, 40.446844)  # Near Senator Heinz Center
  obtained_pivot = observe.get_pivot_poi(start, goal, pois)
  self.assertEqual(obtained_pivot, 'Q12059806')  # Should be Heinz Hall.
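# Hedged illustration of the distance checks above: one way kilometre-scale
# distances between two POIs could be computed, using geopy's geodesic
# distance. observe.get_all_distances may use a different method or units,
# and the coordinates below are illustrative only, not the Wikidata values.
from geopy.distance import geodesic
from shapely.geometry import Point

def km_between(p1: Point, p2: Point) -> float:
  # The shapely Points in the test are (longitude, latitude);
  # geodesic expects (latitude, longitude) pairs.
  return geodesic((p1.y, p1.x), (p2.y, p2.x)).km

market_square = Point(-80.0025, 40.4406)
heinz_hall = Point(-80.0034, 40.4426)
print(km_between(market_square, heinz_hall))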
def setUpClass(cls):
  # Process the map for an area in D.C. and create a walker over it.
  cls.map = map_structure.Map(regions.get_region('DC'), 18)
  cls.walker = walk.Walker(rand_sample=False, map=cls.map)
def setUpClass(cls):
  # Process the map for an area in D.C.
  cls.map = map_structure.Map(regions.get_region('DC'), 18)
def main(argv):
  del argv  # Unused.
  results = query.get_geofenced_wikidata_items(
    regions.get_region(FLAGS.region))
  print('The number of Wikidata items found is: {}'.format(len(results)))