def split(dataset,
          output_path,
          train_frac,
          val_frac,
          test_frac,
          source_id='landsat8'):
    """Randomly splits a dataset into train, test and val datasets on disk."""
    images = list(dataset.load_images(source_id))

    n_images = len(images)
    n_train = int(round(n_images * train_frac))
    n_test = int(round(n_images * test_frac))
    # Give the validation split the remainder so the three counts always sum
    # to n_images, even when the independently rounded fractions do not.
    n_val = n_images - n_train - n_test

    indices = list(range(n_images))
    random.shuffle(indices)

    splits = [
        ("_train", indices[:n_train]),
        ("_test", indices[n_train:n_train + n_test]),
        ("_val", indices[n_train + n_test:]),
    ]

    for suffix, split_indices in splits:
        if not split_indices:
            continue
        split_dataset = storage.DiskDataset(output_path + suffix)

        for index in split_indices:
            image, image_metadata = images[index]
            mask = dataset.load_image(image_metadata['location_id'], 'mask')
            mask_metadata = dataset.image_metadata(
                image_metadata['location_id'], 'mask')

            copy_entry(split_dataset, mask, mask_metadata.to_dict(), image,
                       image_metadata.to_dict(), source_id)
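
# A minimal usage sketch (the paths here are hypothetical): with the
# fractions below, roughly 70% of the images are copied to /out/split_train,
# 15% to /out/split_test, and the remainder to /out/split_val.
dataset = storage.DiskDataset("/data/landsat")
split(dataset, "/out/split", train_frac=0.7, val_frac=0.15, test_frac=0.15)
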
def main(args):
    input_datasets = [storage.DiskDataset(path) for path in args.input_dirs]
    output_dataset = storage.DiskDataset(args.output_dir)

    total_input_images = sum(
        len(dataset.metadata()) for dataset in input_datasets)
    logging.info("Merging %d input datasets with a combined %d images.",
                 len(input_datasets), total_input_images)

    storage.merge_datasets(input_datasets,
                           output_dataset,
                           remove_existing_images=args.remove_existing_images)

    logging.info("%s now contains %d images", output_dataset.base_dir,
                 len(output_dataset.metadata()))
def main(args):
    source_id = 'landsat8'
    dataset = storage.DiskDataset(args.storage_path)

    predictions = []

    for image, image_metadata in dataset.load_images(source_id):
        mask = dataset.load_image(image_metadata['location_id'], 'mask')
        prediction = dataset.load_image(image_metadata['location_id'],
                                        source_id + '_inference_timeagg')

        assert mask.shape == prediction.shape, (
            "Prediction and mask must have the same shape.")

        # Ground truth goes in 'has_mine'; the model output goes in 'prediction'.
        predictions.append(
            pd.DataFrame({
                "has_mine": np.reshape(mask, (mask.size,)),
                "prediction": np.reshape(prediction, (prediction.size,)),
            }))

    predictions = pd.concat(predictions)
    predictions['example_id'] = range(len(predictions))

    confusion_matrix = pd.pivot_table(predictions,
                                      index="has_mine",
                                      columns="prediction",
                                      values="example_id",
                                      fill_value=0,
                                      aggfunc="count")

    print(confusion_matrix)
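
# A toy sketch of the pivot_table confusion matrix above (hypothetical
# labels): rows are ground-truth values, columns are model predictions,
# and each cell counts examples.
toy = pd.DataFrame({
    "has_mine": [0, 0, 1, 1, 1, 0],
    "prediction": [0, 1, 1, 1, 0, 0],
})
toy["example_id"] = range(len(toy))
print(pd.pivot_table(toy,
                     index="has_mine",
                     columns="prediction",
                     values="example_id",
                     fill_value=0,
                     aggfunc="count"))
# prediction  0  1
# has_mine
# 0           2  1
# 1           1  2
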
def main(args):
    dataset = storage.DiskDataset(args.base_dir)

    # Fetch mining masks, locations.
    masks = ee_utils.load_feature_collection_from_fusion_table(
        mask.FUSION_TABLES["duckworthd"])
    locations = ee_utils.load_feature_collection_from_fusion_table(
        "ft:1AFPyNO4MpeeV9TAD-dJj7xsnhS4splJ4DHVZnobG")
    locations = locations[locations.pcode.isin(masks.pcode)]
    assert len(locations) == len(locations.pcode.unique()), \
        "Each location must have a unique pcode."

    # Fetch features for each location with a valid mask.
    logging.info("Loading features for {} locations.".format(len(locations)))
    feature_specs = [{
        "source": "mask",
        "table": "duckworthd"
    }, {
        "source": "landsat8",
        "start_date": "2016-10-01",
        "end_date": "2017-10-01"
    }]

    # Download with gevent.
    pool = gevent.pool.Pool(args.max_concurrent_requests)
    lock = gevent.lock.Semaphore()
    for _, location in locations.iterrows():
        for feature_spec in feature_specs:
            pool.spawn(add_image_to_dataset, dataset, location, feature_spec,
                       lock)
    pool.join()
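
# A minimal, self-contained sketch of the pool-plus-semaphore pattern above:
# the pool caps the number of concurrent greenlets, and the semaphore
# serializes access to a shared resource (stdout stands in for the dataset).
import gevent.lock
import gevent.pool

def download_one(task_id, lock):
    with lock:
        print("finished task", task_id)

pool = gevent.pool.Pool(4)
lock = gevent.lock.Semaphore()
for task_id in range(10):
    pool.spawn(download_one, task_id, lock)
pool.join()
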
def main(args):
    # Compare with a tolerance: sums of floats such as 0.7 + 0.2 + 0.1 do not
    # equal 1.0 exactly.
    assert abs(args.train_frac + args.val_frac + args.test_frac - 1.0) < 1e-6, \
        "train_frac, val_frac and test_frac must sum to 1.0"

    dataset = storage.DiskDataset(args.data_input_path)

    split(dataset, args.data_output_path, args.train_frac, args.val_frac,
          args.test_frac)
def load_images_storage(image_input_dir, source_id='landsat8', bqa_index=11):
    dataset = storage.DiskDataset(image_input_dir)

    images_features = []
    masks = []
    bqas = []

    for image, image_metadata in dataset.load_images(source_id):
        # Drop the final date layer; the matching metadata entry is dropped below.
        image = image[:, :, :, 0:-1]

        mask = dataset.load_image(image_metadata['location_id'], 'mask')

        image_metadata['metadata']['dates'] = image_metadata['metadata'][
            'dates'][0:-1]

        # Transpose so the axis order is [dates, x, y, bands] (needed for the reshapes below).
        image = np.transpose(image, [3, 0, 1, 2])
        image_databands = image[:, :, :, :bqa_index]

        bqa = image[:, :, :, bqa_index]

        n_dates, n_x, n_y, n_bands = np.shape(image_databands)

        # Encode the month as cos(2*pi*month/12) rather than one-hot encoding a
        # categorical variable: the feature stays continuous, and the 2*pi
        # factor wraps the year around so January and December land in
        # similar places.
        months = [
            math.cos(2.0 * math.pi * float(date[4:6]) / 12.0)
            for date in image_metadata['metadata']['dates']
        ]

        dates = np.expand_dims(np.repeat(months, n_x * n_y), 1)

        # Repeat the mask once per date, then flatten it and append it to the list of masks.
        mask = np.repeat(np.expand_dims(mask, axis=0), n_dates, axis=0)
        masks.append(np.reshape(mask, (n_x * n_y * n_dates)))

        image_feature = np.reshape(image_databands,
                                   (n_x * n_y * n_dates, n_bands))
        image_features = np.hstack([image_feature, dates])

        # Append the numpy array to a list of numpy arrays
        images_features.append(image_features)

        bqa_features = np.reshape(bqa, (n_x * n_y * n_dates))
        bqas.append(bqa_features)

    bqas = np.concatenate(bqas, axis=0)
    images_features = np.concatenate(images_features, axis=0)
    masks = np.concatenate(masks, axis=0)

    return images_features, bqas, masks
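
# A small self-contained check of the cyclic month encoding used above: with
# the 2*pi factor, December (12) and January (1) map to nearby values while
# June sits at the opposite extreme. Note that cos alone maps months m and
# 12 - m to the same value (1 and 11 below); pairing it with the matching
# sine feature would remove that ambiguity.
import math

for month in (1, 6, 11, 12):
    print(month, round(math.cos(2.0 * math.pi * month / 12.0), 3))
# 1 0.866
# 6 -1.0
# 11 0.866
# 12 1.0
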
def getDistance2WaterFeature(pcode, fp=water_fp):
    '''
    Input:
      pcode: the identifier for a mining site.

    Output:
      A 100x100 image patch for the mining site. Each pixel value is the
      distance in meters from that pixel to the nearest water body.

    If the image patch already exists, load and return it; otherwise,
    create it in the dataset first, then return it.
    '''

    # Look up the lat/lng for the site by matching its pcode.
    ipis = ee.FeatureCollection(
        'ft:1P1f-A2Sl44YJEqtD1FvA1z7QtDFsRut1QziMD-nV').getInfo()
    feature = next(f for f in ipis['features']
                   if f['properties']['pcode'] == pcode)
    lng, lat = feature['geometry']['coordinates'][:2]

    # Open an on-disk image dataset (may not exist yet).
    dataset = storage.DiskDataset("/tmp/dataset")

    # Add a new image to the dataset. Images are indexed by pcode and image
    # source (e.g. "distance2water"). Both are arbitrary strings.

    location_id = pcode
    source_id = "distance_to_ground_feature"

    metadata = {"bands": "water"}
    if not dataset.has_image(location_id, source_id):
        image = calculateDistanceMatrix(lat, lng, fp)

        dataset.add_image(location_id, source_id, image, metadata)
        return image
    else:
        # The image already exists, so load it from the dataset.
        image = dataset.load_image(location_id, source_id)
        return image
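
# A minimal usage sketch (the pcode below is hypothetical): the first call
# computes and caches the 100x100 patch, the second loads it from disk.
patch = getDistance2WaterFeature("hypothetical_pcode")
patch_again = getDistance2WaterFeature("hypothetical_pcode")  # cached copy
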
def main(args):
    model = load_model(args.model_path)
    dataset = storage.DiskDataset(args.data_path)

    inference_store(dataset, model)