from collections import OrderedDict
from pathlib import Path, PurePosixPath
from tempfile import gettempdir
from typing import Tuple, Union

import numpy as np
import pandas as pd

# ismn-internal imports; module paths follow the ismn package layout
from ismn.base import IsmnRoot
from ismn.filehandlers import DataFile, StaticMetaFile
from ismn.meta import Depth, MetaData, MetaVar
from ismn.const import CSV_META_TEMPLATE, IsmnFileError


@classmethod
def from_metadata_csv(
    cls, data_root, meta_csv_file, network=None, temp_root=gettempdir()
):
    """
    Load a previously created and stored filelist from
    :func:`ismn.filecollection.IsmnFileCollection.to_metadata_csv`.

    Parameters
    ----------
    data_root : IsmnRoot or str or Path
        Path where the ismn data is stored. Can also be a zip file.
    meta_csv_file : str or Path
        Csv file where the metadata is stored.
    network : list, optional (default: None)
        List of networks that are considered. Filehandlers for other
        networks are set to None.
    temp_root : str or Path, optional (default: gettempdir())
        Temporary folder where extracted data is copied during reading
        from a zip archive.
    """
    if network is not None:
        network = np.atleast_1d(network)

    if isinstance(data_root, IsmnRoot):
        root = data_root
    else:
        root = IsmnRoot(data_root)

    print(f"Found existing ismn metadata in {meta_csv_file}.")

    metadata_df = _load_metadata_df(meta_csv_file)

    filelist = OrderedDict([])

    all_networks = metadata_df["network"]["val"].values
    columns = np.array(list(metadata_df.columns))

    for i, row in enumerate(metadata_df.values):  # todo: slow!?? parallelise?
        this_nw = all_networks[i]

        if (network is not None) and not np.isin([this_nw], network)[0]:
            # skip files that belong to a network that was not requested
            continue
        else:
            vars = np.unique(columns[:-2][:, 0])
            vals = row[:-2].reshape(-1, 3)

            metadata = MetaData(
                [
                    MetaVar.from_tuple(
                        (vars[i], vals[i][2], vals[i][0], vals[i][1])
                    )
                    for i in range(len(vars))
                ]
            )

            f = DataFile(
                root=root,
                file_path=str(PurePosixPath(row[-2])),
                load_metadata=False,
                temp_root=temp_root,
            )

        f.metadata = metadata
        f.file_type = row[-1]

        this_nw = f.metadata["network"].val

        if this_nw not in filelist.keys():
            filelist[this_nw] = []

        filelist[this_nw].append(f)

    if network is None:
        cls.metadata_df = metadata_df
    else:
        flags = np.isin(metadata_df["network"]["val"].values, network)
        cls.metadata_df = metadata_df.loc[flags]

    return cls(root, filelist=filelist)
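# Usage sketch (not part of the library): reload a collection from a
# metadata csv previously written by to_metadata_csv. The archive and csv
# paths are hypothetical placeholders; `network=` restricts which
# filehandlers are kept, as documented above.
def _example_from_metadata_csv():  # hypothetical helper, illustration only
    from ismn.filecollection import IsmnFileCollection

    coll = IsmnFileCollection.from_metadata_csv(
        "Data_separate_files_19500101_20201231.zip",  # placeholder archive
        "python_metadata/Data_separate_files_19500101_20201231.csv",
        network=["SCAN"],  # keep filehandlers for this network only
    )
    # assuming the constructor exposes the OrderedDict as `filelist`,
    # keyed by network name
    for f in coll.filelist["SCAN"]:
        print(f.metadata["station"].val, f.file_type)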
def _read_station_dir(
    root: Union[IsmnRoot, Path, str],
    stat_dir: Union[Path, str],
    temp_root: Path,
) -> Tuple[list, list]:
    """
    Parallelizable function to read the metadata for all files in a single
    station directory.

    Returns
    -------
    filelist : list
        Tuples of (network, station, filehandler) for each data file found.
    infos : list
        Log messages collected while processing the directory.
    """
    infos = []

    if not isinstance(root, IsmnRoot):
        proc_root = True
        root = IsmnRoot(root)
    else:
        proc_root = False

    csv = root.find_files(stat_dir, "*.csv")

    try:
        if len(csv) == 0:
            raise IsmnFileError(
                "Expected 1 csv file for station, found 0. "
                "Use empty static metadata."
            )
        else:
            if len(csv) > 1:
                infos.append(
                    f"Expected 1 csv file for station, found {len(csv)}. "
                    f"Use first file in dir."
                )
            static_meta_file = StaticMetaFile(
                root, csv[0], load_metadata=True, temp_root=temp_root
            )
            station_meta = static_meta_file.metadata
    except IsmnFileError as e:
        infos.append(f"Error loading static meta for station: {e}")
        station_meta = MetaData(
            [MetaVar(k, v) for k, v in CSV_META_TEMPLATE.items()]
        )

    data_files = root.find_files(stat_dir, "*.stm")

    filelist = []

    for file_path in data_files:
        try:
            f = DataFile(root, file_path, temp_root=temp_root)
        except IOError as e:
            infos.append(f"Error loading ismn file: {e}")
            continue

        f.metadata.merge(station_meta, inplace=True)

        f.metadata = f.metadata.best_meta_for_depth(
            Depth(
                f.metadata["instrument"].depth.start,
                f.metadata["instrument"].depth.end,
            )
        )

        network = f.metadata["network"].val
        station = f.metadata["station"].val

        filelist.append((network, station, f))

        infos.append(f"Processed file {file_path}")

    if proc_root:
        root.close()

    return filelist, infos
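# Usage sketch (not part of the library): because _read_station_dir opens
# and closes its own IsmnRoot when given a plain path (the `proc_root`
# branch above), it can be mapped over station directories in worker
# processes. The archive path and station directories are hypothetical
# placeholders.
def _example_parallel_station_scan():  # hypothetical helper, illustration only
    from functools import partial
    from multiprocessing import Pool
    from tempfile import gettempdir

    stat_dirs = ["SCAN/AAMU-jtg", "SCAN/Abrams"]  # placeholders
    func = partial(
        _read_station_dir,
        "/path/to/ismn_archive.zip",  # str root: each worker builds its own IsmnRoot
        temp_root=gettempdir(),
    )
    with Pool(2) as pool:
        for filelist, infos in pool.map(func, stat_dirs):
            print(f"{len(filelist)} files read, {len(infos)} log messages")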
@classmethod
def from_metadata_csv(
    cls, data_root, meta_csv_file, network=None, temp_root=gettempdir()
):
    """
    Load a previously created and stored filelist from a metadata csv file
    (see :func:`ismn.filecollection.IsmnFileCollection.to_metadata_csv`).

    Parameters
    ----------
    data_root : IsmnRoot or str or Path
        Path where the ismn data is stored. Can also be a zip file.
    meta_csv_file : str or Path
        Csv file where the metadata is stored.
    network : list, optional (default: None)
        List of networks that are considered. Filehandlers for other
        networks are set to None.
    temp_root : str or Path, optional (default: gettempdir())
        Temporary folder where extracted data is copied during reading
        from a zip archive.
    """
    if network is not None:
        network = np.atleast_1d(network)

    if isinstance(data_root, IsmnRoot):
        root = data_root
    else:
        root = IsmnRoot(data_root)

    print(f"Found existing ismn metadata in {meta_csv_file}.")

    metadata_df = pd.read_csv(
        meta_csv_file, index_col=0, header=[0, 1], low_memory=False, engine="c"
    )

    # parse date cols as datetime
    for col in ["timerange_from", "timerange_to"]:
        metadata_df[col, "val"] = pd.to_datetime(metadata_df[col, "val"])

    lvars = []
    for c in metadata_df.columns:
        if c[0] not in lvars:
            lvars.append(c[0])

    # we assume triples for all vars except these, so they must be at the end
    assert lvars[-2:] == ["file_path", "file_type"], (
        "file_type and file_path must be at the end."
    )

    filelist = OrderedDict([])

    all_networks = metadata_df["network"]["val"].values
    columns = np.array(list(metadata_df.columns))

    for i, row in enumerate(metadata_df.values):  # todo: slow!?? parallelise?
        this_nw = all_networks[i]

        if (network is not None) and not np.isin([this_nw], network)[0]:
            # skip files that belong to a network that was not requested
            continue
        else:
            vars = np.unique(columns[:-2][:, 0])
            vals = row[:-2].reshape(-1, 3)

            metadata = MetaData(
                [
                    MetaVar.from_tuple(
                        (vars[i], vals[i][2], vals[i][0], vals[i][1])
                    )
                    for i in range(len(vars))
                ]
            )

            f = DataFile(
                root=root,
                file_path=str(PurePosixPath(row[-2])),
                load_metadata=False,
                temp_root=temp_root,
            )

        f.metadata = metadata
        f.file_type = row[-1]

        this_nw = f.metadata["network"].val

        if this_nw not in filelist.keys():
            filelist[this_nw] = []

        filelist[this_nw].append(f)

    return cls(root, filelist=filelist)
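# Sketch (illustrative, not part of the library) of the row decoding used
# in the loop above: each metadata variable occupies three alphabetically
# sorted sub-columns in the csv, (depth_from, depth_to, val), with
# file_path and file_type as the two trailing columns. A row is therefore
# reshaped to (n_vars, 3) and re-ordered into the (name, value, depth_from,
# depth_to) tuples that MetaVar.from_tuple receives. Values below are
# made up for illustration.
def _example_row_decoding():  # hypothetical helper, illustration only
    import numpy as np

    # one row: clay_fraction triple, network triple, then file_path/file_type
    row = np.array(
        [0.0, 0.3, 38.0,                      # clay_fraction
         np.nan, np.nan, "SCAN",              # network (no depth attached)
         "SCAN/Abrams/some_file.stm", "Ceop_sep"],
        dtype=object,
    )
    names = ["clay_fraction", "network"]      # sorted first-level column labels
    vals = row[:-2].reshape(-1, 3)
    for name, (d_from, d_to, val) in zip(names, vals):
        print((name, val, d_from, d_to))      # argument order of MetaVar.from_tuple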