def __init__(self, labels, dataset, ids, rasters, patches, use_rasters=True):
    """
    :param labels: list of labels
    :param dataset: list of (latitude, longitude)
    :param ids: list of identifiers
    :param rasters: path to the rasters root
    :param patches: path to the patches root
    """
    self.labels = labels
    self.dataset = dataset
    self.ids = ids
    self.one_hot_size = 34
    self.one_hot = np.eye(self.one_hot_size)
    self.rasters = rasters
    self.patches = patches

    global patch_extractor
    if patch_extractor is None and rasters is not None and use_rasters:
        # 256 is mandatory as images have been extracted in 256 and will be stacked in the __getitem__ method
        patch_extractor = PatchExtractor(rasters, size=256, verbose=True)
        patch_extractor.add_all()
    self.extractor = patch_extractor
def __init__(self, extractor, dataset, labels):
    self.extractor = extractor
    self.labels = labels
    self.dataset = dataset

def __len__(self):
    return len(self.labels)

def __getitem__(self, idx):
    tensor = self.extractor[self.dataset[idx]]
    return torch.from_numpy(tensor).float(), self.labels[idx]


if __name__ == '__main__':
    patch_extractor = PatchExtractor(
        '/Users/ykarmim/Documents/Cours/Master/Projet_DAC/geoLifeClef/data/rasters_GLC19',
        size=64, verbose=True)
    # patch_extractor.append('proxi_eau_fast')
    # patch_extractor.append('text')
    patch_extractor.add_all()

    # example of dataset
    dataset_list = [(43.61, 3.88), (42.61, 4.88), (46.15, -1.1), (49.54, -1.7)]
    labels_list = [0, 1, 0, 1]

    dataset_pytorch = GeoLifeClefDataset(patch_extractor, dataset_list, labels_list)
    print(len(dataset_pytorch), 'elements in the dataset')
from environmental_raster_glc import PatchExtractor

extractor = PatchExtractor('/home/data/rasters_GLC19', size=64, verbose=True)
extractor.append('chbio_1')
extractor.append('text')
extractor.append('clc')
extractor.append('bs_top')
extractor.append('oc_top')
# extractor.add_all()

print(extractor[43.61, 3.88].shape)
from environmental_raster_glc import PatchExtractor

extractor = PatchExtractor('../data/rasters_GLC19', size=64, verbose=True)
# extractor.append('proxi_eau_fast')
extractor.append('text')
extractor.append('clc')
extractor.append('bs_top')
extractor.append('oc_top')
# extractor.add_all()

extractor.plot((43.61, 3.88))
true_idx = self.data.index[idx]
# Reads the env tensor from the patch extractor
# (fixed: latitude and longitude were swapped in the original assignment)
lat, lng = self.data.loc[true_idx, 'Latitude'], self.data.loc[true_idx, 'Longitude']
patch = self.patch_extractor[lat, lng]
return {'lat': lat, 'lng': lng, 'patch': patch}


if __name__ == '__main__':
    # Test
    from environmental_raster_glc import PatchExtractor
    from environmental_raster_glc import raster_metadata

    # building the patch extractor
    # some channels are set to be avoided in the 'exception' list
    ext = PatchExtractor('../data/rasters_GLC19', size=64, verbose=False)
    exception = ('alti', 'proxi_eau_fast')
    for channel in raster_metadata:
        if channel not in exception:
            ext.append(channel)

    df = pd.read_csv('example_occurrences.csv', sep=';', header='infer',
                     quotechar='"', low_memory=True)
    df = df[['Longitude', 'Latitude', 'glc19SpId', 'scName']]
    if not (len(df.dropna(axis=0, how='all')) == len(df)):
        raise Exception("nan lines in dataframe, cannot build the dataset!")
    df = df.astype({'glc19SpId': 'int64'})

    glc_dataset = GLCDataset(df[['Longitude', 'Latitude']], df['glc19SpId'],
                             scnames=df[['glc19SpId', 'scName']], patch_extractor=ext)

    print(len(glc_dataset), 'occurrences in the dataset')
def __init__(self, extractor, dataset, labels):
    self.extractor = extractor
    self.labels = labels
    self.dataset = dataset

def __len__(self):
    return len(self.labels)

def __getitem__(self, idx):
    tensor = self.extractor[self.dataset[idx]]
    return torch.from_numpy(tensor).float(), self.labels[idx]


if __name__ == '__main__':
    patch_extractor = PatchExtractor('/data/rasters_GLC19', size=64, verbose=True)
    patch_extractor.append('chbio_1')
    patch_extractor.append('text')
    # patch_extractor.add_all()

    # example of dataset
    dataset_list = [(43.61, 3.88), (42.61, 4.88), (46.15, -1.1), (49.54, -1.7)]
    labels_list = [0, 1, 0, 1]

    dataset_pytorch = GeoLifeClefDataset(patch_extractor, dataset_list, labels_list)
    print(len(dataset_pytorch), 'elements in the dataset')
from environmental_raster_glc import PatchExtractor

extractor = PatchExtractor('/home/data/rasters_GLC20/soilgrids/', size=256, verbose=True)
# extractor.append('bdticm')
# extractor.append('text')
# extractor.append('clc')
# extractor.append('bs_top')
# extractor.append('oc_top')
extractor.add_all()

extractor.plot((37.746517, -122.423786))
from environmental_raster_glc import PatchExtractor
from environmental_raster_glc import raster_metadata

extractor = PatchExtractor('../data/rasters_GLC19', size=1, verbose=True)
# extractor.append('chbio_1')
for channel in raster_metadata.keys():
    if channel not in {'alti', 'clc', 'proxi_eau_fast'}:
        extractor.append(channel)

print(f'Dimension of the extractor: {len(extractor)}')
print(f'Dimension of a point: {extractor[43.61, 3.88].shape}')
print(f'Environmental tensor at (43.61, 3.88) (Montpellier):\n{extractor[43.61, 3.88]}')
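# Hedged sketch, not part of the original example: with size=1 each appended
# raster contributes a single value per location, so the extracted tensors can
# be flattened into plain feature vectors, e.g. for scikit-learn models.
# The two test points below are reused from the examples above.
import numpy as np

points = [(43.61, 3.88), (42.61, 4.88)]
X = np.stack([extractor[p].flatten() for p in points])
print(X.shape)  # (2, number of appended rasters)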
def __init__(self, extractor, dataset, labels):
    self.extractor = extractor
    self.labels = labels
    self.dataset = dataset

def __len__(self):
    return len(self.labels)

def __getitem__(self, idx):
    tensor = self.extractor[self.dataset[idx]]
    return torch.from_numpy(tensor).float(), self.labels[idx]


if __name__ == '__main__':
    patch_extractor = PatchExtractor('/home/data/rasters_GLC19', size=64, verbose=True)
    patch_extractor.add_all()

    dataset_list = [(43.61, 3.88), (42.61, 4.88)]
    labels_list = [0, 1]

    dataset_pytorch = GeoLifeClefDataset(patch_extractor, dataset_list, labels_list)
    print(dataset_pytorch[0])
    print(len(dataset_pytorch))

    data_loader = test_loader = torch.utils.data.DataLoader(dataset_pytorch, shuffle=True, batch_size=8)
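    # Hedged sketch, not part of the original script: iterating over the
    # DataLoader defined above. Batch shapes depend on the rasters added by
    # add_all(); with size=64 each patch is (n_channels, 64, 64).
    for batch_idx, (patches, labels) in enumerate(data_loader):
        print(batch_idx, patches.shape, labels)
        break  # only inspect the first batch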
                 sep=';', header='infer', quotechar='"', low_memory=True)
df = df.dropna(axis=0, how='all')
# keep only the columns required for extraction, to free up memory
df = df[['Latitude', 'Longitude']]

# MODIFIED: now all the files are saved in the same directory and their
# names are simply the index of the row in the dataframe.

batch_size = 10000  # number of patches to extract simultaneously

# testing the destination directory
if not os.path.isdir(args.destination):
    os.mkdir(args.destination)

ext = PatchExtractor(args.rasters, size=args.size, verbose=True)

positions = []
# exception = ('proxi_eau_fast', 'alti', 'clc')  # add rasters that don't fit into memory
exception = tuple()

export_idx = 0
for idx, occurrence in enumerate(df.iterrows()):
    # adding an occurrence latitude and longitude
    positions.append((occurrence[1].Latitude, occurrence[1].Longitude))
    # if the batch is full, extract and export
    if len(positions) == batch_size or idx == len(df) - 1:
def extract_environmental_data(dataset, rasters, destination=None,
                               mean_window_size=None, patch_size=1,
                               row_number_limit=None):
    """Builds a dataset containing the latitude, longitude and the vectors built
    from the environmental tensors associated with each occurrence, and optionally
    saves it to a csv file. Used to fit scikit-learn models.

    If the environmental tensors are already row vectors (i.e. the values of the
    environmental variables at the location), they are loaded directly into a new
    dataframe. Otherwise, either the mean of each channel is taken under a window
    of size mean_window_size, or the tensors are flattened into long row vectors.
    This last option is very expensive in memory and will not work on datasets
    containing 250k+ occurrences.

    :param dataset: the occurrences csv file, containing Latitude and Longitude
        columns, and glc19SpId, the labels column
    :param rasters: the directory where the rasters are located
    :param destination: an optional csv file where to save the data. The script
        takes quite some time, so it is useful to save the result in a file.
    :param mean_window_size: if not None, takes the mean value of each channel
        under this window size
    :param patch_size: size of the channels in the patches. 1 means channels are
        scalar values at this position, >1 means they are arrays around this position.
    :param row_number_limit: a maximum number of rows to extract. By default all
        the data is extracted.
    :return: a new dataframe containing the locations concatenated with their
        environmental vectors
    """
    n_env_features = len(raster_metadata.keys())
    rasters_names = sorted(raster_metadata.keys())

    if patch_size == 1 and mean_window_size:
        raise Exception('Patches are already vectors of scalars (size 1), '
                        'cannot provide a window size')
    if patch_size == 1 or mean_window_size:
        shape_env = n_env_features
    else:
        shape_env = n_env_features * patch_size * patch_size
    print('Will build row vectors of size', shape_env)

    # Reads the csv file containing the occurrences
    df = pd.read_csv(dataset, sep=';', header='infer', quotechar='"')\
           .dropna(axis=0, how='all')

    # test data file: different label column name
    if 'glc19SpId' in df.columns:
        target_column = 'glc19SpId'
    elif 'glc19TestOccId' in df.columns:
        target_column = 'glc19TestOccId'
    else:
        raise Exception('Unknown target column in the data')
    df = df.astype({target_column: 'int64'})
    # keep only the required columns, to free up memory
    df = df[['Latitude', 'Longitude', target_column]]

    ext = PatchExtractor(rasters, size=patch_size, verbose=True)
    positions = []
    # exception = ('proxi_eau_fast', 'alti', 'clc')  # add rasters that don't fit into memory
    exception = tuple()
    env_vectors = list()
    # number of values per channel, 1 if patches are vectors
    n_features_per_channel = 1
    if not row_number_limit:
        row_number_limit = len(df)

    print('Starting')
    try:
        positions = list(zip(df.Latitude, df.Longitude))[:row_number_limit]
        print('Loading rasters and extracting..')
        variables = []
        for raster in rasters_names:
            if raster in exception:
                continue
            ext.clean()
            ext.append(raster)
            variable = np.stack([ext[p] for p in positions])
            variables.append(variable)
        ext.clean()
        variables = np.concatenate(variables, axis=1)
        # the shape of variables is (batch_size, nb_rasters, size, size)

        print('Building env vectors..')
        # build the env vector for each occurrence in the batch
        for p_idx, patch in enumerate(variables):
            if mean_window_size:
                patch = np.array([
                    ch[ch.shape[0] // 2 - mean_window_size // 2:
                       ch.shape[0] // 2 + mean_window_size // 2,
                       ch.shape[1] // 2 - mean_window_size // 2:
                       ch.shape[1] // 2 + mean_window_size // 2].mean()
                    for ch in patch
                ])
            elif len(patch.shape) > 1:
                n_features_per_channel = patch[0].shape[0] * patch[0].shape[1]
            # flatten to build the row vector
            lat, lng = positions[p_idx]
            env_vectors.append(np.concatenate(([lat, lng], patch), axis=None))
        print('Done! Building the dataframe')

    except MemoryError as e:
        # fixed: the original called the exception instance instead of raising a new one
        raise MemoryError(
            f'Ran out of memory, was able to extract {len(env_vectors)} rows'
        ) from e

    if n_features_per_channel == 1:
        header_env = rasters_names
    else:
        header_env = []
        for name in rasters_names:
            header_env.extend([name + f'__{i}' for i in range(n_features_per_channel)])
    header = ['Latitude', 'Longitude'] + header_env

    env_df = pd.DataFrame(env_vectors, columns=header, dtype='float64')
    print('Saving on disk')
    # concatenate the column of the species' labels
    target_df = df[target_column].reset_index(drop=True).loc[:row_number_limit]
    env_df = pd.concat((env_df, target_df), axis=1)
    if destination:
        env_df.to_csv(destination, sep=';', index=False, quotechar='"')
    return env_df
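# Hedged usage sketch, not part of the original script: the occurrence file and
# rasters directory are reused from the examples above, and the destination
# file name 'env_vectors.csv' is an assumption.
if __name__ == '__main__':
    env_df = extract_environmental_data('example_occurrences.csv',
                                        '../data/rasters_GLC19',
                                        destination='env_vectors.csv',
                                        patch_size=1)
    print(env_df.head())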
def train(num_epochs=10, batch_size=8, R=16, BN=False, name="TEST",
          folder_rasters="../Data/rasters_2019", occur_file="../Data/full_occur.csv",
          taxanames="../Data/test/taxaNameTest.csv", onlytest=0, w=2, opt="sgd",
          loss="ce", gam=2, LR=0.001, sep=True, decay=True, patience=2, scale=0.2,
          tolerance=10, metric="acc", Kacc=30, alt=1, drop=1, weighted=True,
          actemb=None, act="leakyrelu", actfc="relu", vocab_size=1055, window=50,
          EMBNP=100, regional=20, archi=0, grinton_mode=0, gpus=1, init_weights=None,
          runmode=0):
    print("Radius ", R, sep=" ")
    print("BatchNorm ", BN)

    clim_vars = [
        'alti', 'etp', 'chbio_1', 'chbio_2', 'chbio_3', 'chbio_4', 'chbio_5',
        'chbio_6', 'chbio_7', 'chbio_8', 'chbio_9', 'chbio_10', 'chbio_11',
        'chbio_12', 'chbio_13', 'chbio_14', 'chbio_15', 'chbio_16', 'chbio_17',
        'chbio_18', 'chbio_19', 'proxi_eau_fast'
    ]
    pedo_vars = [
        'awc_top', 'bs_top', 'cec_top', 'crusting', 'dgh', 'dimp', 'erodi',
        'oc_top', 'pd_top', 'text'
    ]
    comps = {
        'clim': (clim_vars, 0),
        'landcover': (["clc"], 1),
        'pedo': (pedo_vars, 1)
    }

    patch_extractors = []
    if archi != 2:
        for k in comps.keys():
            trt = comps.get(k)[1]
            varlist = comps.get(k)[0]
            if trt == 0:
                patch_extractor = PatchExtractor(folder_rasters, size=R, verbose=True)
                for v in varlist:
                    patch_extractor.append(v, normalized=True)
                patch_extractors.append(patch_extractor)
            else:
                # one patch extractor for each raster (case of categorical variables)
                for v in varlist:
                    pe = PatchExtractor(folder_rasters, size=R, verbose=True)
                    pe.append(v, normalized=False)
                    patch_extractors.append(pe)

    allspecies = pd.read_csv(taxanames, sep=";", decimal=".")
    testspecies = allspecies[allspecies.test == True]["glc19SpId"]
    del allspecies

    th = 1
    dataset = pd.read_csv("../Data/occurrence/full_occur.csv", sep=";", decimal=".")
    if onlytest == 1:
        # Train only on test species
        dataset = dataset[dataset["glc19SpId"].isin(testspecies.tolist())]
        th = 0
    classes = dataset["glc19SpId"].sort_values().unique()
    prevs = np.array([len(dataset[dataset["glc19SpId"] == c]) for c in classes])
    freq = classes[np.where(prevs > th)[0]]
    dataset = dataset[dataset["glc19SpId"].isin(freq)]
    class_codes = np.array(range(0, len(freq)))
    for i in range(len(freq)):
        dataset["glc19SpId"] = dataset["glc19SpId"].replace(freq[i], class_codes[i])
    teststatus = [w * int(x in testspecies.tolist()) for x in freq]

    encoding = pd.DataFrame(
        data=np.stack((freq, class_codes, prevs[np.where(prevs > th)[0]], teststatus),
                      axis=1),
        columns=["glc19SpId", "codes", "prevalences", "teststatus"])
    encoding.to_csv(name + "_trace_encoding.csv", sep=";", decimal=".", index=False)

    partitions = OccurrencePartition("stratified", dataset)
    partitions.cross_val_part(0.9, encoding)
    partitions.shuffle_idx()

    # Example of dataset
    x_train = [tuple(x) for x in
               (dataset.iloc[partitions.train_idx, :])[["Latitude", "Longitude"]].values]
    x_val = [tuple(x) for x in
             (dataset.iloc[partitions.val_idx, :])[["Latitude", "Longitude"]].values]
    y_train = dataset.iloc[partitions.train_idx, :]["glc19SpId"]
    y_val = dataset.iloc[partitions.val_idx, :]["glc19SpId"]

    data_train_generator = GlcGenerator(patch_extractors, x_train, y_train, comps,
                                        batch_size, shuffle=True, name="train",
                                        folder_np="../Data/retained_np/",
                                        window=window, vocab_size=vocab_size,
                                        archi=archi)
    data_val_generator = GlcGenerator(patch_extractors, x_val, y_val, comps,
                                      batch_size, shuffle=False, name="valid",
                                      folder_np="../Data/retained_np/",
                                      window=window, vocab_size=vocab_size,
                                      archi=archi)

    gp = GrintonParams(NBPLANTS=partitions.get_poolsize(), R=R, BN=BN,
                       NBNP=vocab_size, window=window, EMBNP=EMBNP)
    gp.update_params(alt=alt, drop=drop, act=act, actfc=actfc, actemb=actemb)

    if archi != 2:
        # First architecture: Grinnell
        Grinnell = GrinnellianNiche(sep, archi)
        if sep:
            Grinnell.create_grinnell(gp.Anames, gp.Pnames, gp.topoHydroClim_params,
                                     gp.pedo_params, gp.anthropo_params,
                                     gp.ft_params_sep, gp.spat_params_list, gp.trt,
                                     gp.feat_names_list, gp.im_name_list)
        else:
            Grinnell.create_grinnell(gp.Anames, gp.Pnames, gp.topoHydroClim_params,
                                     gp.pedo_params, gp.anthropo_params,
                                     gp.ft_params_join, gp.spat_params_list, gp.trt,
                                     gp.feat_names_list, gp.im_name_list)
        # Plot model params and architecture
        Grinnell.plot_grinnell(name + "_grinnell.png")
        # Grinnell.grinnell.summary()

    if archi != 1:
        # Second architecture: Elton
        if archi == 2:
            isfinal = True
        else:
            isfinal = False
        Elton = EltonianNiche(final=isfinal)
        Elton.create_elton(bio_params=gp.bio_params, emb=0)
        # Elton.plot_elton(name + "_elton.png")
        # Elton.elton.summary()

    if archi == 1:
        parallel_model = Grinnell.grinnell
    elif archi == 2:
        parallel_model = Elton.elton
    else:
        grinton = Grinton(Grinnell.grinnell, Elton.elton)
        if grinton_mode:
            grinton.create_grinton(gp.ensemble_grinton_params)
        else:
            grinton.create_grinton(gp.joint_grinton_params)
        # grinton.plot_grinton(name + "_grinton.png")
        parallel_model = grinton.grinton
    parallel_model.summary()

    # use GPUs for data parallelism
    if gpus > 1:
        parallel_model = multi_gpu_model(parallel_model, gpus=4)
    if init_weights is not None:
        parallel_model.load_weights(init_weights)

    if opt == "adam":
        optimizer = adam(lr=LR)
    else:
        optimizer = sgd(lr=LR, momentum=0.9, nesterov=True)
    if loss == "fl":
        obj = focal_loss_softmax(gamma=gam)
    else:
        obj = "sparse_categorical_crossentropy"
    if metric == "acc":
        met = "sparse_categorical_accuracy"
    else:
        met = topk_accuracy(Kacc)
    parallel_model.compile(optimizer, obj, [met])
    # Grinnell.compile_grinnell(optimizer, obj, [topk_accuracy(3)])

    if runmode:
        # Callbacks
        callbcks = []
        # TensorBoard
        tbc = TensorBoard(log_dir='./logs_' + name)
        callbcks.append(tbc)
        # Training hyperparameters update => weight decay
        if decay:
            wdc = ReduceLROnPlateau(monitor='val_loss', factor=scale,
                                    patience=patience, min_lr=0.000001,
                                    min_delta=0.001)
            esc = EarlyStopping(monitor='val_loss', min_delta=1E-4,
                                patience=tolerance)
            callbcks.append(wdc)
            callbcks.append(esc)
        # Checkpointing the model periodically
        filepath = name + "_weights.{epoch:02d}-{val_loss:.2f}.hdf5"
        cpc = ModelCheckpoint(filepath, monitor='val_loss', verbose=0,
                              save_best_only=False, save_weights_only=True,
                              mode='auto', period=5)
        callbcks.append(cpc)

        # Start training
        # batch_size = 2
        if onlytest == 2:
            classweights = [max(1, encoding.loc[i, "teststatus"])
                            for i in range(len(encoding))]
        elif weighted:
            classweights = (1 / encoding["prevalences"].values).tolist()
        else:
            classweights = np.ones(len(encoding))
        # oi = parallel_model.get_weights()
        # init_weights = "elton_standalone_weights.15-6.57.hdf5"
        # ol = parallel_model.get_weights()

        print("Train Mode")
        train_history = parallel_model.fit_generator(
            generator=data_train_generator,
            steps_per_epoch=(len(partitions.train_idx) // batch_size),
            epochs=num_epochs,
            verbose=1,
            validation_data=data_val_generator,
            validation_steps=(len(partitions.val_idx) // batch_size),
            use_multiprocessing=False,
            shuffle=True,
            callbacks=callbcks,
            class_weight=classweights,
            workers=1)
        print("End of training")
        # perform_test = Grinnell.grinnell.evaluate_generator(generator=data_val_generator, verbose=1)
        # trainex = data_train_generator.extracted
        # validex = data_val_generator.extracted

        # Save model
        print("Saving model")
        path = name + ".h5"
        parallel_model.save_weights(path)
        return train_history
    else:
        if onlytest == 1:
            encod_file = "pretrain/test_encoding.csv"
        else:
            encod_file = "pretrain/full_encoding.csv"
        predict(parallel_model, patch_extractors, comps,
                testset="../Data/test/testSet.csv",
                encoding_file=encod_file, R=R, archi=archi,
                folder_rasters=folder_rasters, window=window,
                vocab_size=vocab_size, run_name=name)
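# Hedged usage sketch, not part of the original script: runs the training branch
# (runmode=1) with mostly default hyperparameters. The run name "example_run" is
# an assumption, and the default ../Data/ paths must exist for this to work.
if __name__ == '__main__':
    history = train(num_epochs=5, batch_size=8, R=16, archi=0,
                    name="example_run", runmode=1)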
class GeoLifeClefDataset:
    def __init__(self, extractor, dataset, labels):
        self.extractor = extractor
        self.labels = labels
        self.dataset = dataset

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        tensor = self.extractor[self.dataset[idx]]
        return tensor, self.labels[idx]


if __name__ == '__main__':
    patch_extractor = PatchExtractor('../rasters GLC19', size=8, verbose=True)
    patch_extractor.add_all()

    # dataset
    df = pd.read_csv("../PL_trusted.csv", sep=';')
    classes = set(df['glc19SpId'])
    df = pd.concat([
        df.drop('glc19SpId', axis=1),
        pd.get_dummies(df['glc19SpId'], dtype=int)
    ], axis=1)
    dataset_list = list(zip(df["Latitude"], df["Longitude"]))
    labels_list = (df.iloc[:, 10:]).values

    train_ds = GeoLifeClefDataset(patch_extractor, dataset_list[:230000],
                                  labels_list[:230000])
    test_ds = GeoLifeClefDataset(patch_extractor, dataset_list[230000:],
                                 labels_list[230000:])
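    # Hedged sketch, not part of the original script: inspecting one sample of
    # the train split; the patch shape depends on the rasters loaded by add_all().
    patch, one_hot_label = train_ds[0]
    print('patch shape:', patch.shape)
    print('label vector length:', len(one_hot_label), '- number of classes:', len(classes))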
class GeoLifeClefDataset(Dataset):
    def __init__(self, extractor, dataset, labels):
        self.extractor = extractor
        self.labels = labels
        self.dataset = dataset

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        tensor = self.extractor[self.dataset[idx]]
        return torch.from_numpy(tensor).float(), self.labels[idx]


if __name__ == '__main__':
    patch_extractor = PatchExtractor('../rasters GLC19', size=64, verbose=True)
    # patch_extractor.add_all()
    patch_extractor.append('chbio_1')
    patch_extractor.append('chbio_2')
    patch_extractor.append('chbio_3')
    patch_extractor.append('chbio_4')
    patch_extractor.append('chbio_5')
    patch_extractor.append('chbio_6')
    patch_extractor.append('chbio_7')
    patch_extractor.append('chbio_8')
    patch_extractor.append('chbio_9')
    patch_extractor.append('chbio_10')
    patch_extractor.append('text')

    # dataset
from environmental_raster_glc import PatchExtractor

extractor = PatchExtractor('/data/rasters_GLC19', size=64, verbose=True)
extractor.append('proxi_eau_fast')
# extractor.append('text')
# extractor.append('clc')
# extractor.append('bs_top')
# extractor.append('oc_top')
# extractor.add_all()

extractor.plot((43.61, 3.88))