def test_complicated_groups(self):
    """Check slices of fields that live inside (possibly nested) group fields."""

    def read(label_name):
        # Every product in this test is read lazily and without console output.
        return pds4_read(self.data(label_name), lazy_load=True, quiet=True)

    # A deeply nested field (nested within 3 group fields)
    deep_table = read('af.xml')[9]
    _check_array_equal(
        deep_table.field(0)[11, 2, 5, 2:5],
        [-0.52061242, -0.51312923, -0.50972084],
        'float64')

    group_tables = read('test_group_fields.xml')

    # Two fields within one group field
    two_field_table = group_tables[0]
    _check_array_equal(
        two_field_table.field(0)[9, 5, 2:5],
        [331.28526671, 328.97851487, 327.87342654],
        'float64')
    _check_array_equal(
        two_field_table.field(1)[3, 7, 1:4],
        [277.80563195, 281.21064631, 279.24594501],
        'float64')

    # Three fields within one group field
    three_field_table = group_tables[1]
    _check_array_equal(
        three_field_table.field(0)[20, 7:10],
        [207., 208., 209.],
        'float64')
    _check_array_equal(
        three_field_table.field(1)[9, 5, 2:5],
        [331.28526671, 328.97851487, 327.87342654],
        'float64')
    _check_array_equal(
        three_field_table.field(2)[3, 7, 1:4],
        [277.80563195, 281.21064631, 279.24594501],
        'float64')
def test_simple_groups(self):
    """Check singly nested, 1D group fields in binary and delimited tables."""

    # --- Binary tables ---
    binary_product = pds4_read(self.data('af.xml'), lazy_load=True, quiet=True)

    # Single nested, 1D group field holding floats
    float_table = binary_product[11]
    expected_floats = [-0.08405675, 0.60469515, -0.79200899]
    _check_array_equal(float_table.field(10)[0], expected_floats, 'float64')

    # Single nested, 1D group field holding strings (last field of the table)
    string_table = binary_product[13]
    expected_strings = [
        'mvn_app_rel_150601_150607_v01.bc',
        'mvn_sc_rel_150601_150607_v01.bc ',
    ]
    _check_array_equal(string_table.field(-1)[0, 3:5], expected_strings, 'U32')

    # --- Delimited table ---
    delimited_product = pds4_read(
        self.data('Product_DelimitedTable.xml'), lazy_load=True, quiet=True)

    # Single nested, 1D group field: last record of the last field
    delimited_table = delimited_product[0]
    expected_ints = [5, 1, 1, 1, 1, 1, 0, 0, 0, 0]
    _check_array_equal(delimited_table.field(-1)[-1], expected_ints, 'int8')
def setup(self):
    """Run the parent setup, then lazily read ``test_array_data_types.xml``.

    The full structure list returned by ``pds4_read`` is kept on
    ``self.structures``.
    """
    super(TestArrayDataTypes, self).setup()

    label_path = self.data('test_array_data_types.xml')
    self.structures = pds4_read(label_path, lazy_load=True, quiet=True)
def read_dat_pds4(filename, write_csv=False, quiet=True):
    """Read the first table of a PDS4 ``.dat`` product into a DataFrame.

    Column order and data types are preserved, except that columns stored
    in a non-native byte order are converted to native byte order.
    The ``.dat`` file and its ``.xml`` label must exist in the same
    directory.

    Parameters
    ----------
    filename : str
        Path of the ``.dat`` data file or of its ``.xml`` label. A ``.dat``
        path is mapped to the label by swapping the extension for ``.xml``.
    write_csv : bool, optional
        When true, also write the table next to the label as ``<name>.csv``.
    quiet : bool, optional
        Passed through to ``pds4_tools.pds4_read``.

    Returns
    -------
    pandas.DataFrame
        One column per field of the first structure, keyed by field name.

    Raises
    ------
    TypeError
        If ``filename`` ends in neither ``.dat`` nor ``.xml``
        (case-insensitive).
    """
    if filename[-4:].lower() == ".dat":
        filename = filename[:-4] + ".xml"
    if filename[-4:].lower() != ".xml":
        raise TypeError("Unknown filetype: {ext}".format(ext=filename[-4:]))

    structures = pds4_tools.pds4_read(filename, quiet=quiet)

    dat_dict = OrderedDict()
    for field in structures[0].fields:
        name = field.meta_data["name"]
        dat_dtype = field.meta_data["data_type"]
        dtype = pds4_tools.reader.data_types.pds_to_numpy_type(dat_dtype)
        data = np.array(field, dtype=dtype)
        if not data.dtype.isnative:
            # ndarray.newbyteorder() no longer exists in NumPy 2.0; casting
            # to the native-order twin of the dtype keeps the values and
            # works on every NumPy version.
            data = data.astype(data.dtype.newbyteorder("="))
        dat_dict[name] = data

    dataframe = pd.DataFrame(dat_dict)
    if write_csv:
        # Swap only the trailing extension. str.replace() would leave an
        # upper-case ".XML" untouched (overwriting the label with the CSV)
        # and would also rewrite ".xml" elsewhere in the path.
        dataframe.to_csv(filename[:-4] + ".csv", index=False)
    return dataframe
def setup(self):
    """Run the parent setup, then lazily read ``test_table_data_types.xml``.

    Only the first structure of the product is kept, on ``self.table``.
    """
    super(TestTableDataTypes, self).setup()

    label_path = self.data('test_table_data_types.xml')
    self.table = pds4_read(label_path, lazy_load=True, quiet=True)[0]
def setup(self):
    """Run the parent setup, then lazily read ``Product_DelimitedTable.xml``.

    Only the first structure of the product is kept, on ``self.structure``.
    """
    super(TestDelimitedTable, self).setup()

    label_path = self.data('Product_DelimitedTable.xml')
    self.structure = pds4_read(label_path, lazy_load=True, quiet=True)[0]
def setup(self):
    """Run the parent setup, then lazily read ``colors.xml``.

    Only the first structure of the product is kept, on ``self.structure``.
    """
    super(TestCharacterTable, self).setup()

    label_path = self.data('colors.xml')
    self.structure = pds4_read(label_path, lazy_load=True, quiet=True)[0]
def load_file(self, filespec, numhdu=None, dstobj=None, **kwdargs):
    """Load one array structure of a local PDS4 product as a numpy array.

    Parameters
    ----------
    filespec : str
        Plain path or ``file:`` URL of the PDS4 label.
    numhdu : optional
        Index of the structure to load. When ``None``, the first structure
        for which ``is_array()`` is true is used.
    dstobj : optional
        When given, ``dstobj.set_data()`` is called with the oriented array.
    **kwdargs
        Accepted for interface compatibility; not used here.

    Returns
    -------
    tuple
        ``(array, structure index, None)``.

    Raises
    ------
    IOError
        If ``filespec`` has a URL scheme other than ``file`` or none.
    InvalidPDS4Data
        If ``numhdu`` is ``None`` and the product has no array structure.
    """
    import numpy as np
    from urllib.parse import urlparse
    from pds4_tools import pds4_read
    from .exceptions import InvalidPDS4Data

    urlinfo = urlparse(filespec)
    if urlinfo.scheme not in ['file', '']:
        raise IOError('File must be local: {}'.format(filespec))

    struct = pds4_read(urlinfo.path)

    if numhdu is not None:
        i = numhdu
    else:
        # Pick the first array structure in the product.
        i = next(
            (idx for idx in range(len(struct)) if struct[idx].is_array()),
            None)
        if i is None:
            raise InvalidPDS4Data('No image found in {}'.format(filespec))

    structure = struct[i]
    im = np.array(structure.data)

    # Ginga draws from bottom to top, left to right.  Transform the data
    # so that, drawn this way, it is displayed in the correct orientation.
    meta_data = structure.meta_data
    disp_dir = meta_data.display_settings['Display_Direction']
    haxis = meta_data.get_axis_array(disp_dir['horizontal_display_axis'])

    # PDS4 data is Last Index Fastest with axis numbering starting at 1;
    # numpy arrays are also Last Index Fastest but start at 0.  If the
    # horizontal axis is PDS4 axis 1, swap so that it becomes numpy axis 1.
    if haxis['sequence_number'] == 1:
        im = im.T

    flip_horizontal = 'Right to Left' in disp_dir['horizontal_display_direction']
    flip_vertical = 'Top to Bottom' in disp_dir['vertical_display_direction']

    if flip_horizontal:
        im = im[:, ::-1]
    if flip_vertical:
        im = im[::-1]

    if dstobj is not None:
        dstobj.set_data(im)

    return im, i, None
def process(path):
    """Build a Catalina observation object from one archived PDS4 label.

    ``path`` is appended to ``ARCHIVE_PREFIX`` and the label at that
    location is read.  The telescope code taken from the logical
    identifier selects the observation class; the product ID, start/stop
    MJD, exposure, field-of-view corners and (when present) the limiting
    magnitude are then copied onto the new object.

    Raises ``ValueError`` when the telescope code matches none of the
    known Catalina classes.
    """
    label = pds4_read(
        ARCHIVE_PREFIX + path, lazy_load=True, quiet=True).label
    lid = label.find("Identification_Area/logical_identifier").text

    # Telescope code: first three characters of the sixth colon-separated
    # component of the LID, upper-cased.
    tel = lid.split(":")[5][:3].upper()
    for observation_class in (
            CatalinaBigelow, CatalinaLemmon, CatalinaKittPeak):
        if tel in observation_class._telescopes:
            obs = observation_class()
            break
    else:
        raise ValueError(f"Unknown telescope {tel}")

    def mjd_of(element_name):
        # Parse a Time_Coordinates child element and return its MJD.
        element = label.find(
            "Observation_Area/Time_Coordinates/" + element_name)
        return Time(element.text).mjd

    obs.product_id = lid
    obs.mjd_start = mjd_of("start_date_time")
    obs.mjd_stop = mjd_of("stop_date_time")
    # 86400 converts the MJD difference (days) to seconds.
    obs.exposure = round((obs.mjd_stop - obs.mjd_start) * 86400, 3)

    survey = label.find(".//survey:Survey")

    ra = []
    dec = []
    corner_names = ("Top Left", "Top Right", "Bottom Right", "Bottom Left")
    for corner in corner_names:
        corner_xpath = (
            "survey:Image_Corners"
            f"/survey:Corner_Position[survey:corner_identification='{corner}']"
            "/survey:Coordinate")
        coordinate = survey.find(corner_xpath)
        ra.append(float(coordinate.find("survey:right_ascension").text))
        dec.append(float(coordinate.find("survey:declination").text))
    obs.set_fov(ra, dec)

    # NOTE(review): the predicate below tests a child element named
    # survey:Percentage_Limit inside survey:Percentage_Limit -- confirm
    # against the survey dictionary that this is the intended element.
    maglimit = survey.find(
        "survey:Limiting_Magnitudes"
        "/survey:Percentage_Limit[survey:Percentage_Limit='50']"
        "/survey:limiting_magnitude")
    if maglimit is not None:
        obs.maglimit = float(maglimit.text)

    return obs
def inventory(base_path):
    """Iterate over all files of interest.

    The collection inventory CSV under ``base_path`` is read first to
    collect the LIDs of all Spacewatch ``.fits`` data products.  The data
    directories are then searched for XML labels, and every label whose
    logical identifier is in that set is yielded once.  LIDs left without
    a label are reported through the ``add-spacewatch`` logger.

    Parameters
    ----------
    base_path : str
        Directory containing the ``gbo.ast.spacewatch.survey`` collection.

    Returns
    -------
    labels : iterator of tuples
        Path and pds4_tools label object.

    Raises
    ------
    FileNotFoundError
        If the inventory CSV does not exist.  As this is a generator, the
        error is raised when iteration starts.
    """
    logger = logging.getLogger("add-spacewatch")

    data_dir = f"{base_path}/gbo.ast.spacewatch.survey/data"
    inventory_fn = (
        f"{data_dir}/collection_gbo.ast.spacewatch.survey_data_inventory.csv"
    )
    # Check the inventory file itself (not just base_path) and name it in
    # the message.
    if not os.path.exists(inventory_fn):
        raise FileNotFoundError(f"Missing inventory list {inventory_fn}")

    # Read in all relevant LIDs from the inventory.
    lid_prefix = 'P,urn:nasa:pds:gbo.ast.spacewatch.survey:data:sw_'
    lids = set()
    with open(inventory_fn, 'r') as inf:
        for line in inf:
            if not line.startswith(lid_prefix):
                continue
            if '.fits' not in line:
                continue
            # Drop the leading "P," and everything from the "::" version
            # separator on.  Unlike a fixed-width slice, this also works
            # for a final line without newline or a longer version ID.
            lids.add(line[2:].strip().partition('::')[0])

    # Search directory-by-directory for labels with those LIDs.
    for fn in iglob(f"{data_dir}/20*/*/*/*.xml"):
        label = pds4_read(fn, lazy_load=True, quiet=True).label
        lid = label.find("Identification_Area/logical_identifier").text
        if lid in lids:
            lids.remove(lid)
            yield fn, label

    # Did we find all the labels?
    if lids:
        logger.error(f'{len(lids)} LIDs were not found.')
import png
import glob
import numpy as np
from pds4_tools import pds4_read

# Convert every file in the current directory matching "*.*L" into a PNG
# written next to it as "<original name>.png".
for product_name in glob.glob('*.*L'):
    # Read the product without console output.
    product = pds4_read(product_name, quiet=True)

    # Data of the first structure as a plain numpy array.
    pixels = np.array(product[0].data)

    # Flatten to rows of 2352*3 values each.
    # NOTE(review): presumably 2352 pixels x 3 colour channels per row --
    # confirm against the product label.
    rows = pixels.reshape(-1, 2352 * 3)

    # Scale by 256 and store as unsigned 16-bit integers.
    scaled = rows * 256
    img16 = np.array(np.uint16(scaled))

    # Encode as RGB and save.
    image = png.from_array(img16, 'RGB')
    image.save(f"{product_name}.png")
# group_data1.ravel()[::2].reshape(21, 10, 5)
#
# Table 1, column 2
# group_data1.ravel()[1::2].reshape(21, 10, 5)
#
# Table 2, column 1
# group_data2.ravel()[::11].reshape(21, 10)
#
# Table 2, column 2
# same as table 1, column 1
#
# Table 2, column 3
# same as table 1, column 2

# Source values: the PIXEL_CORNER_LON field of structure 9 in af.xml
af_path = os.path.join(os.path.dirname(__file__), '..', 'data/af.xml')
structures = pds4_read(af_path)
original_data = structures[9]['PIXEL_CORNER_LON']

# Table 1: two columns, each of shape (21,10,5), built from two copies of
# the flattened source values
group_data1 = np.asarray([original_data.ravel()] * 2).reshape(21, 10, 10)

# Table 2: three columns, where the first has shape (21,10) and the other
# two have shapes (21,10,5).  The values 0..209 are inserted before every
# 10th element of the flattened table 1 data.
group_data2 = np.insert(
    group_data1.copy().ravel(),
    list(range(0, 21 * 10 * 10, 10)),
    list(range(210)),
).reshape(21, 10, 11)

# Ensure data is MSB: on little-endian hosts swap the bytes in place
if sys.byteorder == 'little':
    group_data1.byteswap(inplace=True)
    group_data2.byteswap(inplace=True)
def setup(self):
    """Run the parent setup, then lazily read ``af.xml``.

    The structure at index 1 is kept on ``self.structure``.
    """
    super(TestArrayStructure, self).setup()

    label_path = self.data('af.xml')
    self.structure = pds4_read(label_path, lazy_load=True, quiet=True)[1]
# group_data1.ravel()[::2].reshape(21, 10, 5)
#
# Table 1, column 2
# group_data1.ravel()[1::2].reshape(21, 10, 5)
#
# Table 2, column 1
# group_data2.ravel()[::11].reshape(21, 10)
#
# Table 2, column 2
# same as table 1, column 1
#
# Table 2, column 3
# same as table 1, column 2

# Source values: the PIXEL_CORNER_LON field of structure 9 in af.xml,
# read without console output
af_path = os.path.join(os.path.dirname(__file__), '..', 'data/af.xml')
structures = pds4_read(af_path, quiet=True)
original_data = structures[9]['PIXEL_CORNER_LON']

# Table 1: two columns, each of shape (21,10,5), built from two copies of
# the flattened source values
group_data1 = np.asarray([original_data.ravel()] * 2).reshape(21, 10, 10)

# Table 2: three columns, where the first has shape (21,10) and the other
# two have shapes (21,10,5).  The values 0..209 are inserted before every
# 10th element of the flattened table 1 data.
group_data2 = np.insert(
    group_data1.copy().ravel(),
    list(range(0, 21 * 10 * 10, 10)),
    list(range(210)),
).reshape(21, 10, 11)

# Ensure data is MSB: on little-endian hosts swap the bytes in place
if sys.byteorder == 'little':
    group_data1.byteswap(inplace=True)
    group_data2.byteswap(inplace=True)
def setup(self):
    """Run the parent setup, then lazily read ``af.xml``.

    The structure at index 3 is kept on ``self.structure``.
    """
    super(TestBinaryTable, self).setup()

    label_path = self.data('af.xml')
    self.structure = pds4_read(label_path, lazy_load=True, quiet=True)[3]
def setup(self):
    """Run the parent setup, then lazily read ``af.xml``.

    The full structure list returned by ``pds4_read`` is kept on
    ``self.structures``.
    """
    super(TestStructureList, self).setup()

    label_path = self.data('af.xml')
    self.structures = pds4_read(label_path, lazy_load=True, quiet=True)
def load_hierarchy(path):
    """Convert the character tables of every PDS4 label under ``path`` to FITS.

    The directory tree is walked for ``*.xml`` files.  Each label is read
    with ``pds4_read``; for every table structure, the fields named by its
    ``Field_Character`` elements are written as single-precision (``'E'``)
    columns of a FITS binary table at ``./out/<label dir>/<label stem>.fits``.
    An existing output file is never overwritten.  Progress and the total
    number of records are printed to stdout.

    Parameters
    ----------
    path : str
        Root directory to search for PDS4 labels.
    """
    xml_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, files in os.walk(path)
        for filename in fnmatch.filter(files, '*.xml')
    ]

    # Single-argument print(...) produces the same text on Python 2 and 3.
    print('Processing %d XML files' % len(xml_paths))

    num_records = 0
    for xml_path in xml_paths:
        struct_list = pds4_read(xml_path)
        print('*' * 80)
        print(xml_path)
        print('Processing %d structs' % len(struct_list))

        # See SBN dev wiki for pds4_read usage:
        # http://sbndev.astro.umd.edu/wiki/Python_PDS4_Tools#pds4_read
        for struct in struct_list:
            print('-' * 80)
            if not isinstance(struct, TableStructure):
                print('Unknown struct type encountered: %s' % (type(struct),))
                continue

            # xpath tester: https://codebeautify.org/Xpath-Tester#
            fields = [
                elt.text
                for elt in struct.label.findall('.//Field_Character/name')
            ]

            # See astropy docs for writing fits tables:
            # http://docs.astropy.org/en/stable/io/fits/#creating-a-new-table-file
            #
            # Iterate over the field names only: the per-field format was
            # never used, and pairing names with field_format elements
            # silently dropped fields whenever a format was missing.
            cols = []
            for field in fields:
                try:
                    cols.append(
                        fits.Column(name=field, format='E',
                                    array=struct[field]))
                except ValueError as err:
                    # Best effort: fields that cannot be stored as 'E'
                    # are left out, but now reported.
                    print('Skipping field %s: %s' % (field, err))

            if not cols:
                continue

            print('Writing fits file...')
            tbhdu = fits.BinTableHDU.from_columns(fits.ColDefs(cols))
            prihdr = fits.Header()
            prihdr['COMMENT'] = 'Converted by PDSKit from %s' % xml_path
            prihdu = fits.PrimaryHDU(header=prihdr)
            thdulist = fits.HDUList([prihdu, tbhdu])

            out_dirpath = './out/%s' % os.path.dirname(xml_path)
            if not os.path.exists(out_dirpath):
                os.makedirs(out_dirpath)

            out_filename = os.path.basename(xml_path).split('.')[0]
            out_path = '%s.fits' % os.path.join(out_dirpath, out_filename)
            if not os.path.exists(out_path):
                thdulist.writeto(out_path)

            num_records += len(struct.data)

    print('Total number of records: %d' % num_records)
def read_table(label_file, table_name=None, index_col=None, quiet=True):
    """Read one table of a PDS4 product into a pandas DataFrame.

    The product is read with pds4_tools and the selected table is copied
    into a ``pds4_df``.  Columns whose PDS4 data type contains ``'Date'``
    are converted to timezone-naive Timestamps.

    Parameters
    ----------
    label_file : str
        Path of the PDS4 label.
    table_name : str, optional
        ``id`` of the table to read.  By default the first table in the
        product is used.
    index_col : str, optional
        Field to use as the index.  If omitted, or not found among the
        top-level fields, a time field is used instead: the only time
        field; or, with several, ``TIME_UTC`` when it is a column and
        otherwise the first time field.
    quiet : bool, optional
        Suppress the informational messages about the product and the
        selected table.  ``pds4_read`` itself is always called quietly.

    Returns
    -------
    pds4_df or None
        The table, with ``path`` and ``filename`` attributes set from
        ``label_file``; ``None`` if the product has no tables or
        ``table_name`` is not one of them.

    Notes
    -----
    Only simple 2D tables are fully supported.  Fields nested inside a
    group field are attached as object columns holding one array per
    record when their first dimension matches the number of rows;
    otherwise they are skipped with a warning.
    """
    product = pds4_read(label_file, quiet=True)
    labelpath = Path(label_file)

    num_arrays = 0
    tables = []
    for structure in product.structures:
        if structure.is_array():
            num_arrays += 1
        elif structure.is_table():
            tables.append(structure.id)

    if not tables:
        log.error('no tables found in this product')
        return None

    if not quiet:
        log.info('product {:s} has {:d} tables and {:d} arrays'.format(
            labelpath.name, len(tables), num_arrays))

    if table_name is None:
        table = product[tables[0]]
    elif table_name in tables:
        table = product[table_name]
    else:
        log.error('table name {:s} not found in product'.format(table_name))
        return None

    if not quiet:
        log.info('using table {:s}'.format(table.id))

    # clunky way to get the names of group fields to ignore for now
    table_manifest = TableManifest.from_label(product[table.id].label)

    time_cols = []
    fields = []
    group_fields = []
    for i in range(len(table_manifest)):
        entry = table_manifest[i]
        if entry.is_group():
            continue

        name = entry.full_name()
        if table_manifest.get_parent_by_idx(i):
            # Field lives inside a group: handled separately below.
            group_fields.append(name)
            continue

        fields.append(name)
        if 'Date' in entry['data_type']:
            time_cols.append(name)

    # TODO: fix nested tables (group fields)
    # TODO: fix handling of masked arrays (in particular missing vals in
    # CSVs trigger this)
    data = pds4_df(table.data, columns=fields)
    for field in fields:
        data[field] = table.data[field]

    for group_field in group_fields:
        field_name = group_field.split(',')[1].strip()
        field_data = table[group_field]

        if field_data.shape[0] != len(data):
            log.warning(
                'group field length does not match table length - skipping!')
            continue

        # Assign the whole column at once.  Element-wise chained
        # assignment (data[col].iat[idx] = ...) writes to a temporary
        # under pandas Copy-on-Write and leaves the frame unchanged.
        data[field_name] = pd.Series(
            [field_data[idx] for idx in range(len(data))],
            index=data.index, dtype=object)

    path, filename = os.path.split(label_file)
    data.path = path
    data.filename = filename

    for col in time_cols:
        data[col] = pd.to_datetime(data[col]).dt.tz_localize(None)

    if index_col is not None:
        if index_col in fields:
            data.set_index(index_col, drop=True, inplace=True)
            # Report the field actually used; time_cols may be empty here.
            log.info('data indexed with field {:s}'.format(index_col))
        else:
            log.warning(
                'requested index field {:s} not found'.format(index_col))
            index_col = None

    if index_col is None:
        if not time_cols:
            log.warning(
                'no time-based columns found, returned data will not be time-indexed'
            )
        else:
            # Prefer TIME_UTC when several time fields exist.
            if len(time_cols) > 1 and 'TIME_UTC' in data.columns:
                time_index = 'TIME_UTC'
            else:
                time_index = time_cols[0]
            data.set_index(time_index, drop=True, inplace=True)
            log.info('data time-indexed with field {:s}'.format(time_index))

    return data