Example #1
def main(argv):
  del argv  # Unused.
  
  logging.info("Creating Wikigeo dataset.")

  # Extract items.
  if FLAGS.osm_path is not None:
    output_prefix = FLAGS.region.lower() + '_osm'
    results = extract.get_data_by_region_with_osm(
      regions.get_region(FLAGS.region), FLAGS.osm_path)
  else:
    output_prefix = FLAGS.region.lower()
    results = extract.get_data_by_region(regions.get_region(FLAGS.region))
  logging.info(f'Found {len(results)} items.')
  
  # Split data into train, dev, and test sets.
  splits = extract.split_dataset(results, 0.8, 0.1)

  # Output to disk.
  if not os.path.exists(FLAGS.output_dir):
    os.makedirs(FLAGS.output_dir)

  for split_name, split_data in splits.items():
    logging.info(f'The size of the {split_name} set is {len(split_data)}.')
    output_path = os.path.join(
      FLAGS.output_dir, f'{output_prefix}_{split_name}.json')
    extract.write_files(output_path, split_data)
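
The main() above reads FLAGS.region, FLAGS.osm_path, and FLAGS.output_dir, but the flag definitions are outside this snippet. A minimal sketch of how the entry point could be wired with absl, assuming these three flags and their meanings (the real declarations in the repository may differ):

# Hypothetical absl wiring for the main() above; the actual flag
# definitions live elsewhere in the repository.
from absl import app
from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_string('region', None, 'Name of the region to extract.')
flags.DEFINE_string('osm_path', None, 'Optional path to an OSM extract.')
flags.DEFINE_string('output_dir', None, 'Directory for the JSON splits.')
flags.mark_flag_as_required('region')

if __name__ == '__main__':
  app.run(main)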
Example #2
    @classmethod
    def setUpClass(cls):

        # Set up the regions and query geofenced Wikidata items.
        cls.manhattan_region = regions.get_region('Manhattan')
        cls.pittsburgh_region = regions.get_region('Pittsburgh')
        cls.items_lean = query.get_geofenced_wikidata_items(
            cls.manhattan_region)
        cls.items_info = query.get_geofenced_info_wikidata_items(
            cls.manhattan_region)
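
setUpClass runs once for the whole test class, so the two Wikidata queries above are issued a single time and shared by every test method. A self-contained illustration of the pattern, with invented class and test names:

import unittest

class SharedSetupTest(unittest.TestCase):
  @classmethod
  def setUpClass(cls):
    # Runs once per class; expensive setup (queries, map loading) goes here.
    cls.shared_items = list(range(100))

  def test_items_are_shared(self):
    self.assertEqual(len(self.shared_items), 100)

if __name__ == '__main__':
  unittest.main()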
Example #3
def create_dataset(
    data_dir: Text,
    region: Text,
    s2level: int
) -> Tuple[CabbyDataset, CabbyDataset, CabbyDataset, Dict[int, int]]:
  '''Loads the data and creates train, validation, and test datasets.
  Arguments:
    data_dir: The directory of the data.
    region: The region of the data.
    s2level: The s2level of the cells.
  Returns:
    The train, validation, and test sets and the dictionary of labels to cellids.
  '''

  train_ds = pd.read_json(os.path.join(data_dir, 'train.json'))
  valid_ds = pd.read_json(os.path.join(data_dir, 'dev.json'))
  test_ds = pd.read_json(os.path.join(data_dir, 'test.json'))

  # Get labels.
  region_data = regions.get_region(region)
  unique_cellid = util.cellids_from_polygon(region_data, s2level)
  label_to_cellid = {idx: cellid for idx, cellid in enumerate(unique_cellid)}
  cellid_to_label = {cellid: idx for idx, cellid in enumerate(unique_cellid)}

  # Create Cabby dataset.
  train_dataset = CabbyDataset(train_ds, s2level, cellid_to_label)
  val_dataset = CabbyDataset(valid_ds, s2level, cellid_to_label)
  test_dataset = CabbyDataset(test_ds, s2level, cellid_to_label)

  return train_dataset, val_dataset, test_dataset, label_to_cellid
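
Assuming CabbyDataset subclasses torch.utils.data.Dataset (its definition is not part of this listing), the returned splits plug directly into standard PyTorch loaders. The directory, region name, and S2 level below are placeholders:

# Hypothetical usage of create_dataset; arguments are examples only.
from torch.utils.data import DataLoader

train_set, val_set, test_set, label_to_cellid = create_dataset(
  '/path/to/data', 'Manhattan', 12)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
for batch in train_loader:
  pass  # feed each batch to a model here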
Example #4
def main(argv):
  del argv  # Unused.
  map_region = map_structure.Map(
    regions.get_region(FLAGS.region), FLAGS.min_s2_level, FLAGS.directory)

  # Generate RVS routes over the map and save them to disk.
  walker = walk.Walker(map_region)
  walker.generate_and_save_rvs_routes(FLAGS.path, FLAGS.n_samples, FLAGS.n_cpu)
Example #5
def create_dataset(
          data_dir: str, 
          region: str, 
          s2level: int, 
          infer_only: bool = False
) -> dataset_item.TextGeoDataset:
  '''Loads the data and creates train, validation, and test datasets.
  Arguments:
    data_dir: The directory of the data.
    region: The region of the data.
    s2level: The s2level of the cells.
    infer_only: If True, only the test set is built; the train and
      validation sets are skipped.
  Returns:
    The train, validation, and test sets and the dictionary of labels to cellids.
  '''
  train_ds = pd.read_json(os.path.join(data_dir, 'train.json'))
  valid_ds = pd.read_json(os.path.join(data_dir, 'dev.json'))
  test_ds = pd.read_json(os.path.join(data_dir, 'test.json'))
  # Get labels.
  region_data = regions.get_region(region)
  unique_cellid = gutil.cellids_from_polygon(region_data.polygon, s2level)
  label_to_cellid = {idx: cellid for idx, cellid in enumerate(unique_cellid)}
  cellid_to_label = {cellid: idx for idx, cellid in enumerate(unique_cellid)}

  points = gutil.get_centers_from_s2cellids(unique_cellid)

  unique_cells_df = pd.DataFrame({'point': points, 'cellid': unique_cellid})
  
  unique_cells_df['far'] = unique_cells_df.point.apply(
      lambda x: gutil.far_cellid(x, unique_cells_df))

  vec_cells = util.binary_representation(
    unique_cells_df.cellid.to_numpy(), dim=CELLID_DIM)
  tens_cells = torch.tensor(vec_cells)

  # Create WikiGeo dataset.
  train_dataset = None
  val_dataset = None
  logging.info("Starting to create the splits")
  if not infer_only:
    train_dataset = TextGeoSplit(
      train_ds, s2level, unique_cells_df, cellid_to_label)
    logging.info(
      f"Finished creating the train set with {len(train_dataset)} samples")
    val_dataset = TextGeoSplit(
      valid_ds, s2level, unique_cells_df, cellid_to_label)
    logging.info(
      f"Finished creating the validation set with {len(val_dataset)} samples")
  test_dataset = TextGeoSplit(
    test_ds, s2level, unique_cells_df, cellid_to_label)
  logging.info(
    f"Finished creating the test set with {len(test_dataset)} samples")

  return dataset_item.TextGeoDataset.from_TextGeoSplit(
    train_dataset, val_dataset, test_dataset, np.array(unique_cellid), 
    tens_cells, label_to_cellid)
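
util.binary_representation is not shown in this listing. A minimal sketch of what such a helper might do, assuming it unpacks each 64-bit S2 cell ID into a fixed-length 0/1 vector of CELLID_DIM bits (the real implementation may differ):

import numpy as np

CELLID_DIM = 64  # assumption: one bit per position of a 64-bit S2 cell ID

def binary_representation_sketch(
    cellids: np.ndarray, dim: int = CELLID_DIM) -> np.ndarray:
  # Shift each cell ID right by 0..dim-1 bits and keep the lowest bit,
  # yielding a (len(cellids), dim) matrix of 0/1 values.
  shifts = np.arange(dim, dtype=np.uint64)
  bits = (cellids[:, None].astype(np.uint64) >> shifts) & np.uint64(1)
  return bits.astype(np.int64)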
Example #6
def main(argv):
    del argv  # Unused.

    logging.info("Starting to build map of {} at level {}.".format(
        FLAGS.region, FLAGS.min_s2_level))

    map_region = map_structure.Map(
        regions.get_region(FLAGS.region), FLAGS.min_s2_level)
    logging.info("Created map of {} at level {}.".format(
        FLAGS.region, FLAGS.min_s2_level))

    if FLAGS.directory is not None:
        # Write to disk.
        map_region.write_map(FLAGS.directory)
        logging.info("Map written to => {}".format(FLAGS.directory))

        # Load from disk.
        map_new = map_structure.Map(regions.get_region(FLAGS.region),
                                    FLAGS.min_s2_level, FLAGS.directory)

        logging.info('Number of POIs found: {0}'.format(map_new.poi.shape[0]))
Example #7
    def testQueryWithOSM(self):
        samples = extract.get_data_by_region_with_osm(
            regions.get_region('Pittsburgh_small'))
        self.assertEqual(samples[0].sample_type, 'Wikipedia_page')

        wikidata_sample = samples[4]
        self.assertEqual(wikidata_sample.sample_type, 'Wikidata')
        self.assertEqual(
            wikidata_sample.text,
            ('Renaissance Revival architecture, building, building in '
             'Pennsylvania, United States, Birmingham Public School.'))

        foundFigleaf = False
        for sample in samples:
            if sample.title == 'Figleaf':
                foundFigleaf = True
                self.assertEqual(sample.sample_type, 'OSM')
                self.assertEqual(
                    sample.text,
                    'Figleaf and building and East Carson Street.')
        self.assertTrue(foundFigleaf)
Example #8
    def testSingleOutput(self):

        # Get Pittsburgh items. Also tests cabby.geo.util.item and
        # cabby.data.wikidata.query.
        pittsburgh_region = regions.get_region('Pittsburgh')
        pittsburgh_items = [
            item.WikidataEntity.from_sparql_result(result)
            for result in query.get_geofenced_wikidata_items(pittsburgh_region)
        ]
        pittsburgh_index = {e.qid: e for e in pittsburgh_items}

        # Select five POIs in Pittsburgh.
        market_square = pittsburgh_index['Q6770726']
        warhol_museum = pittsburgh_index['Q751172']
        carnegie_library = pittsburgh_index['Q5043895']
        reserve_bank = pittsburgh_index['Q5440376']
        heinz_hall = pittsburgh_index['Q12059806']

        # Check computed distances from Warhol Museum to the others.
        goal = market_square.location
        pois = [warhol_museum, carnegie_library, reserve_bank, heinz_hall]
        obtained_distances = observe.get_all_distances(goal, pois)
        expected_distances = {
            'Q751172': 0.1497106143476055,
            'Q5043895': 0.39191208288190965,
            'Q5440376': 0.8607457546797966,
            'Q12059806': 0.09590394273539078
        }

        for qid, expected_distance in expected_distances.items():
            self.assertIn(qid, obtained_distances.keys())
            self.assertAlmostEqual(obtained_distances[qid], expected_distance)

        start = Point(-79.992383, 40.446844)  # Near Senator Heinz Center
        obtained_pivot = observe.get_pivot_poi(start, goal, pois)
        self.assertEqual(obtained_pivot, 'Q12059806')  # Should be Heinz Hall.
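
observe.get_all_distances and observe.get_pivot_poi are exercised here but not shown. As a hedged illustration only (not the library's implementation), per-POI distances from a goal could be computed with a great-circle formula over (longitude, latitude) points like the start point above; that the expected values are kilometers is an assumption:

import math
from shapely.geometry import Point

def haversine_km(p1: Point, p2: Point) -> float:
  # Great-circle distance in kilometers between two (lon, lat) points.
  lon1, lat1, lon2, lat2 = map(math.radians, (p1.x, p1.y, p2.x, p2.y))
  a = (math.sin((lat2 - lat1) / 2) ** 2
       + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
  return 2 * 6371.0 * math.asin(math.sqrt(a))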
Example #9
    @classmethod
    def setUpClass(cls):

        # Build the map for the DC region at S2 level 18.
        cls.map = map_structure.Map(regions.get_region('DC'), 18)
        cls.walker = walk.Walker(rand_sample=False, map=cls.map)
Example #10
    @classmethod
    def setUpClass(cls):

        # Process the map for an area in D.C.
        cls.map = map_structure.Map(regions.get_region('DC'), 18)
Example #11
def main(argv):
    del argv  # Unused.
    results = query.get_geofenced_wikidata_items(
        regions.get_region(FLAGS.region))
    print('The number of Wikidata items found is: {}'.format(len(results)))