def __init__(self, catalog=None, name=None, stub=False):
    """Create a new `Entry` object with the given `name`.

    Arguments
    ---------
    catalog : `astrocats.catalog.catalog.Catalog` instance
        The parent catalog object of which this entry belongs.
    name : str
        The name of this entry, e.g. `SN1987A` for a `Supernova` entry.
    stub : bool
        Whether or not this instance represents a 'stub' (see above).

    """
    super(Entry, self).__init__()
    self.catalog = catalog
    self.filename = None
    self.dupe_of = []
    self._stub = stub
    if catalog:
        # Share the parent catalog's logger.
        self._log = catalog.log
    else:
        # No parent given: fabricate a bare catalog so `self.catalog`
        # is always usable.  Import here to avoid a circular import.
        from astrocats.catalog.catalog import Catalog
        self._log = logging.getLogger()
        self.catalog = Catalog(None, self._log)
    self[self._KEYS.NAME] = name
    return
class Entry(OrderedDict):
    """Class representing an individual element of each Catalog.

    For example, a single supernova in the supernova catalog, this object
    handles and manages the addition of data for this `Entry`, using different
    `CatDict` instances (e.g. `Photometry`).

    Notes
    -----
    - Stubs: a stub is the most minimal entry, containing an entry's 'name'
      and possible aliases.  These instances are used to represent entries
      which are known to exist (e.g. have already been saved) for cross
      referencing and duplicate removal.
      + The `Entry.get_stub` method returns the 'stub' corresponding to the
        Entry instance.  i.e. it returns a *new object* with only the name
        and aliases copied over.

    Attributes
    ----------
    catalog : `astrocats.catalog.catalog.Catalog` object
        Pointer to the parent catalog object of which this entry is a member.
    filename : str or 'None'
        If this entry is loaded from a file, its (full path and) filename.
    _log : `logging.Logger` object
        Pointer to the logger from the parent catalog.
    _stub : bool
        Whether this instance represents a 'stub' (see above).
    _KEYS : `astrocats.catalog.key.KeyCollection` object
        The associated object which contains the different dictionary keys
        used in this type (e.g. `Supernova`) entry.

    """

    _KEYS = ENTRY

    def __init__(self, catalog=None, name=None, stub=False):
        """Create a new `Entry` object with the given `name`.

        Arguments
        ---------
        catalog : `astrocats.catalog.catalog.Catalog` instance
            The parent catalog object of which this entry belongs.
        name : str
            The name of this entry, e.g. `SN1987A` for a `Supernova` entry.
        stub : bool
            Whether or not this instance represents a 'stub' (see above).

        """
        super(Entry, self).__init__()
        self.catalog = catalog
        self.filename = None
        self.dupe_of = []
        self._stub = stub
        if catalog:
            self._log = catalog.log
        else:
            # No parent catalog given: create a bare one so that
            # `self.catalog` is always usable.  Local import avoids a
            # circular dependency.
            from astrocats.catalog.catalog import Catalog
            self._log = logging.getLogger()
            self.catalog = Catalog(None, self._log)
        self[self._KEYS.NAME] = name
        return

    def __repr__(self):
        """Return JSON representation of self."""
        jsonstring = dict_to_pretty_string({ENTRY.NAME: self})
        return jsonstring

    def _append_additional_tags(self, quantity, source, cat_dict):
        """Append additional bits of data to an existing quantity.

        Called when a newly added quantity is found to be a duplicate.
        Subclasses override this hook; the base implementation is a no-op.
        """
        pass

    def _get_save_path(self, bury=False):
        """Return the (directory, filename) that this Entry should be saved to.

        Arguments
        ---------
        bury : bool
            If True, save to the catalog's 'boneyard' repository instead of
            the normal output repository.

        Raises
        ------
        RuntimeError
            If no output repositories are configured.
        """
        filename = self.get_filename(self[self._KEYS.NAME])

        # Put objects that shouldn't belong in this catalog in the boneyard
        if bury:
            outdir = self.catalog.get_repo_boneyard()
        # Get normal repository save directory
        else:
            repo_folders = self.catalog.PATHS.get_repo_output_folders()
            # If no repo folders exist, raise an error -- cannot save
            if not len(repo_folders):
                err_str = (
                    "No output data repositories found. Cannot save.\n"
                    "Make sure that repo names are correctly configured "
                    "in the `input/repos.json` file, and either manually or "
                    "automatically (using `astrocats CATALOG git-clone`) "
                    "clone the appropriate data repositories.")
                self.catalog.log.error(err_str)
                raise RuntimeError(err_str)
            outdir = repo_folders[0]

        return outdir, filename

    def _ordered(self, odict):
        """Convert the object into a plain OrderedDict, recursively sorted."""
        ndict = OrderedDict()

        # `CatDict` and `Entry` provide their own key-ordering function
        if isinstance(odict, CatDict) or isinstance(odict, Entry):
            key = odict.sort_func
        else:
            key = None

        nkeys = list(sorted(odict.keys(), key=key))
        for key in nkeys:
            if isinstance(odict[key], OrderedDict):
                odict[key] = self._ordered(odict[key])
            if isinstance(odict[key], list):
                # Recurse into lists whose elements are (or may be) dicts
                if (not (odict[key] and
                         not isinstance(odict[key][0], OrderedDict))):
                    nlist = []
                    for item in odict[key]:
                        if isinstance(item, OrderedDict):
                            nlist.append(self._ordered(item))
                        else:
                            nlist.append(item)
                    odict[key] = nlist
            ndict[key] = odict[key]

        return ndict

    def get_hash(self, keys=[]):
        """Return a unique hash associated with the listed keys.

        If `keys` is empty, all of this entry's keys are used.  The hash is
        the first 16 hex digits of a SHA-512 over the sorted JSON encoding.
        """
        if not len(keys):
            keys = list(self.keys())

        string_rep = ''
        oself = self._ordered(deepcopy(self))
        for key in keys:
            string_rep += json.dumps(oself.get(key, ''), sort_keys=True)

        return hashlib.sha512(string_rep.encode()).hexdigest()[:16]

    def _clean_quantity(self, quantity):
        """Clean quantity value before it is added to entry.

        Returns False (and does not modify `quantity`) when the value is
        empty; otherwise normalizes numeric strings and strips whitespace
        in place, returning True.
        """
        value = quantity.get(QUANTITY.VALUE, '').strip()
        error = quantity.get(QUANTITY.E_VALUE, '').strip()
        unit = quantity.get(QUANTITY.U_VALUE, '').strip()
        kind = quantity.get(QUANTITY.KIND, '')

        if isinstance(kind, list) and not isinstance(kind, string_types):
            kind = [x.strip() for x in kind]
        else:
            kind = kind.strip()

        if not value:
            return False

        # Normalize numeric representations (e.g. '1.50' -> '1.5')
        if is_number(value):
            value = '%g' % Decimal(value)
        if error:
            error = '%g' % Decimal(error)

        if value:
            quantity[QUANTITY.VALUE] = value
        if error:
            quantity[QUANTITY.E_VALUE] = error
        if unit:
            quantity[QUANTITY.U_VALUE] = unit
        if kind:
            quantity[QUANTITY.KIND] = kind

        return True

    def __deepcopy__(self, memo):
        """Define how an `Entry` should be deep copied.

        The `catalog` reference is shared (not copied) with the new object.
        """
        new_entry = self.__class__(self.catalog)
        for key in self:
            if not key.startswith('__') and key != 'catalog':
                new_entry[key] = deepcopy(self[key])
        return new_entry

    def _load_data_from_json(self,
                             fhand,
                             clean=False,
                             merge=True,
                             pop_schema=True,
                             ignore_keys=[],
                             compare_to_existing=True,
                             gzip=False,
                             filter_on={}):
        """Load this entry's data from the JSON file `fhand`.

        The file must contain exactly one top-level key (the entry name).

        Raises
        ------
        ValueError
            If the json file has more than one top-level key.
        RuntimeError
            If entries remain in `data` after conversion.
        """
        # FIX: check for overwrite??
        self._log.debug("_load_data_from_json(): {}\n\t{}".format(
            self.name(), fhand))
        # Store the filename this was loaded from
        self.filename = fhand

        if gzip:
            jfil = gz.open(fhand, 'rb')
        else:
            jfil = codecs.open(fhand, 'r')

        data = json.load(jfil, object_pairs_hook=OrderedDict)
        name = list(data.keys())
        if len(name) != 1:
            err = "json file '{}' has multiple keys: {}".format(
                fhand, list(name))
            self._log.error(err)
            raise ValueError(err)
        name = name[0]
        # Remove the outmost dict level
        data = data[name]
        self._log.debug("Name: {}".format(name))

        # Delete ignored keys
        for key in ignore_keys:
            if key in data:
                del data[key]

        # Convert the OrderedDict data from json into class structure i.e.
        # `Sources` will be extracted and created from the dict Everything
        # that remains afterwards should be okay to just store to this
        # `Entry`
        self._convert_odict_to_classes(
            data,
            clean=clean,
            merge=merge,
            pop_schema=pop_schema,
            compare_to_existing=compare_to_existing,
            filter_on=filter_on)
        if len(data):
            err_str = ("Remaining entries in `data` after "
                       "`_convert_odict_to_classes`.")
            err_str += "\n{}".format(dict_to_pretty_string(data))
            self._log.error(err_str)
            raise RuntimeError(err_str)

        jfil.close()

        # If object doesnt have a name yet, but json does, store it
        self_name = self[ENTRY.NAME]
        if len(self_name) == 0:
            self[ENTRY.NAME] = name
        # Warn if there is a name mismatch
        elif self_name.lower().strip() != name.lower().strip():
            self._log.warning("Object name '{}' does not match name in json:"
                              "'{}'".format(self_name, name))

        self.check()
        return

    def _convert_odict_to_classes(self,
                                  data,
                                  clean=False,
                                  merge=True,
                                  pop_schema=True,
                                  compare_to_existing=True,
                                  filter_on={}):
        """Convert `OrderedDict` into `Entry` or its derivative classes.

        Recognized sections (`sources`, `photometry`, `spectra`, `errors`,
        `models`) are popped from `data` and converted to their `CatDict`
        subclasses; everything left over is treated as a `Quantity`.
        """
        self._log.debug("_convert_odict_to_classes(): {}".format(self.name()))
        self._log.debug("This should be a temporary fix. Dont be lazy.")

        # Setup filters. Currently only used for photometry.
        fkeys = list(filter_on.keys())

        # Handle 'name'
        name_key = self._KEYS.NAME
        if name_key in data:
            self[name_key] = data.pop(name_key)

        # Handle 'schema'
        schema_key = self._KEYS.SCHEMA
        if schema_key in data:
            # Schema should be re-added every execution (done elsewhere) so
            # just delete the old entry
            if pop_schema:
                data.pop(schema_key)
            else:
                self[schema_key] = data.pop(schema_key)

        # Cleanup 'internal' repository stuff
        if clean:
            # Add data to `self` in ways accomodating 'internal' formats and
            # leeway.  Removes each added entry from `data` so the remaining
            # stuff can be handled normally
            data = self.clean_internal(data)

        # Handle 'sources'
        # ----------------
        src_key = self._KEYS.SOURCES
        if src_key in data:
            # Remove from `data`
            sources = data.pop(src_key)
            self._log.debug("Found {} '{}' entries".format(
                len(sources), src_key))
            self._log.debug("{}: {}".format(src_key, sources))

            for src in sources:
                self.add_source(allow_alias=True, **src)

        # Handle `photometry`
        # -------------------
        photo_key = self._KEYS.PHOTOMETRY
        if photo_key in data:
            photoms = data.pop(photo_key)
            self._log.debug("Found {} '{}' entries".format(
                len(photoms), photo_key))
            phcount = 0
            for photo in photoms:
                skip = False
                # Apply the `filter_on` restrictions, if any
                for fkey in fkeys:
                    if fkey in photo and photo[fkey] not in filter_on[fkey]:
                        skip = True
                if skip:
                    continue
                self._add_cat_dict(
                    Photometry,
                    self._KEYS.PHOTOMETRY,
                    compare_to_existing=compare_to_existing,
                    **photo)
                phcount += 1
            self._log.debug("Added {} '{}' entries".format(
                phcount, photo_key))

        # Handle `spectra`
        # ---------------
        spec_key = self._KEYS.SPECTRA
        if spec_key in data:
            # When we are cleaning internal data, we don't always want to
            # require all of the normal spectrum data elements.
            spectra = data.pop(spec_key)
            self._log.debug("Found {} '{}' entries".format(
                len(spectra), spec_key))
            for spec in spectra:
                self._add_cat_dict(
                    Spectrum,
                    self._KEYS.SPECTRA,
                    compare_to_existing=compare_to_existing,
                    **spec)

        # Handle `error`
        # --------------
        err_key = self._KEYS.ERRORS
        if err_key in data:
            errors = data.pop(err_key)
            self._log.debug("Found {} '{}' entries".format(
                len(errors), err_key))
            for err in errors:
                self._add_cat_dict(Error, self._KEYS.ERRORS, **err)

        # Handle `models`
        # ---------------
        model_key = self._KEYS.MODELS
        if model_key in data:
            # When we are cleaning internal data, we don't always want to
            # require all of the normal spectrum data elements.
            model = data.pop(model_key)
            self._log.debug("Found {} '{}' entries".format(
                len(model), model_key))
            for mod in model:
                self._add_cat_dict(
                    Model,
                    self._KEYS.MODELS,
                    compare_to_existing=compare_to_existing,
                    **mod)

        # Handle everything else --- should be `Quantity`s
        # ------------------------------------------------
        if len(data):
            self._log.debug("{} remaining entries, assuming `Quantity`".format(
                len(data)))
            # Iterate over remaining keys
            for key in list(data.keys()):
                vals = data.pop(key)
                # All quantities should be in lists of that quantity
                #   E.g. `aliases` is a list of alias quantities
                if not isinstance(vals, list):
                    vals = [vals]
                self._log.debug("{}: {}".format(key, vals))
                for vv in vals:
                    self._add_cat_dict(
                        Quantity,
                        key,
                        check_for_dupes=merge,
                        compare_to_existing=compare_to_existing,
                        **vv)

        if merge and self.dupe_of:
            self.merge_dupes()

        return

    def _check_cat_dict_source(self, cat_dict_class, key_in_self, **kwargs):
        """Check that a source exists and that a quantity isn't erroneous.

        Returns the source string, or None when the data should be skipped
        (erroneous or private).

        Raises
        ------
        CatDictError
            If no source is given, or the source is not a comma-delimited
            list of integers.
        """
        # Make sure that a source is given
        source = kwargs.get(cat_dict_class._KEYS.SOURCE, None)
        if source is None:
            raise CatDictError(
                "{}: `source` must be provided!".format(
                    self[self._KEYS.NAME]),
                warn=True)
        # Check that source is a list of integers
        for x in source.split(','):
            if not is_integer(x):
                raise CatDictError(
                    "{}: `source` is comma-delimited list of "
                    " integers!".format(self[self._KEYS.NAME]),
                    warn=True)
        # If this source/data is erroneous, skip it
        if self.is_erroneous(key_in_self, source):
            self._log.info("This source is erroneous, skipping")
            return None
        # If this source/data is private, skip it
        if (self.catalog.args is not None and not self.catalog.args.private
                and self.is_private(key_in_self, source)):
            self._log.info("This source is private, skipping")
            return None
        return source

    def _init_cat_dict(self, cat_dict_class, key_in_self, **kwargs):
        """Initialize a CatDict object, checking for errors.

        Returns the new instance, or None if construction raised a
        (warnable) `CatDictError`.
        """
        # Catch errors associated with crappy, but not unexpected data
        try:
            new_entry = cat_dict_class(self, key=key_in_self, **kwargs)
        except CatDictError as err:
            if err.warn:
                self._log.info("'{}' Not adding '{}': '{}'".format(self[
                    self._KEYS.NAME], key_in_self, str(err)))
            return None
        return new_entry

    def _add_cat_dict(self,
                      cat_dict_class,
                      key_in_self,
                      check_for_dupes=True,
                      compare_to_existing=True,
                      **kwargs):
        """Add a `CatDict` to this `Entry`.

        CatDict only added if initialization succeeds and it doesn't already
        exist within the Entry.  Returns True on success, False on failure;
        if an existing duplicate absorbed the new data, the (new) entry is
        returned instead so callers can augment the old one.
        """
        # Make sure that a source is given, and is valid (nor erroneous)
        if cat_dict_class != Error:
            try:
                source = self._check_cat_dict_source(cat_dict_class,
                                                     key_in_self, **kwargs)
            except CatDictError as err:
                if err.warn:
                    self._log.info("'{}' Not adding '{}': '{}'".format(self[
                        self._KEYS.NAME], key_in_self, str(err)))
                return False

            if source is None:
                return False

        # Try to create a new instance of this subclass of `CatDict`
        new_entry = self._init_cat_dict(cat_dict_class, key_in_self, **kwargs)
        if new_entry is None:
            return False

        # Compare this new entry with all previous entries to make sure is new
        if compare_to_existing and cat_dict_class != Error:
            for item in self.get(key_in_self, []):
                if new_entry.is_duplicate_of(item):
                    item.append_sources_from(new_entry)
                    # Return the entry in case we want to use any additional
                    # tags to augment the old entry
                    return new_entry

        # If this is an alias, add it to the parent catalog's reverse
        # dictionary linking aliases to names for fast lookup.
        if key_in_self == self._KEYS.ALIAS:
            # Check if this adding this alias makes us a dupe, if so mark
            # ourselves as a dupe.
            if (check_for_dupes and 'aliases' in dir(self.catalog) and
                    new_entry[QUANTITY.VALUE] in self.catalog.aliases):
                possible_dupe = self.catalog.aliases[new_entry[QUANTITY.VALUE]]
                # print(possible_dupe)
                if (possible_dupe != self[self._KEYS.NAME] and
                        possible_dupe in self.catalog.entries):
                    self.dupe_of.append(possible_dupe)
            if 'aliases' in dir(self.catalog):
                self.catalog.aliases[new_entry[QUANTITY.VALUE]] = self[
                    self._KEYS.NAME]

        self.setdefault(key_in_self, []).append(new_entry)

        if (key_in_self == self._KEYS.ALIAS and check_for_dupes and
                self.dupe_of):
            self.merge_dupes()

        return True

    @classmethod
    def get_filename(cls, name):
        """Convert from an `Entry` name into an appropriate filename."""
        fname = name.replace('/', '_')
        return fname

    @classmethod
    def init_from_file(cls,
                       catalog,
                       name=None,
                       path=None,
                       clean=False,
                       merge=True,
                       pop_schema=True,
                       ignore_keys=[],
                       compare_to_existing=True,
                       try_gzip=False,
                       filter_on={}):
        """Construct a new `Entry` instance from an input file.

        The input file can be given explicitly by `path`, or a path will
        be constructed appropriately if possible.

        Arguments
        ---------
        catalog : `astrocats.catalog.catalog.Catalog` instance
            The parent catalog object of which this entry belongs.
        name : str or 'None'
            The name of this entry, e.g. `SN1987A` for a `Supernova` entry.
            If no `path` is given, a path is constructed by trying to find
            a file in one of the 'output' repositories with this `name`.
            note: either `name` or `path` must be provided.
        path : str or 'None'
            The absolutely path of the input file.
            note: either `name` or `path` must be provided.
        clean : bool
            Whether special sanitization processing should be done on the
            input data.  This is mostly for input files from the 'internal'
            repositories.

        """
        if not catalog:
            from astrocats.catalog.catalog import Catalog
            log = logging.getLogger()
            catalog = Catalog(None, log)

        catalog.log.debug("init_from_file()")
        if name is None and path is None:
            err = ("Either entry `name` or `path` must be specified to load "
                   "entry.")
            # BUG FIX: `log` was only bound when no catalog was passed;
            # always use the catalog's logger.
            catalog.log.error(err)
            raise ValueError(err)

        # If the path is given, use that to load from
        load_path = ''
        if path is not None:
            load_path = path
            name = ''
        # If the name is given, try to find a path for it
        else:
            repo_paths = catalog.PATHS.get_repo_output_folders()
            for rep in repo_paths:
                filename = cls.get_filename(name)
                newpath = os.path.join(rep, filename + '.json')
                if os.path.isfile(newpath):
                    load_path = newpath
                    break

        if load_path is None or not os.path.isfile(load_path):
            # FIX: is this warning worthy?
            return None

        # Create a new `Entry` instance
        new_entry = cls(catalog, name)

        # Check if .gz file
        if try_gzip and not load_path.endswith('.gz'):
            try_gzip = False

        # Fill it with data from json file
        new_entry._load_data_from_json(
            load_path,
            clean=clean,
            merge=merge,
            pop_schema=pop_schema,
            ignore_keys=ignore_keys,
            compare_to_existing=compare_to_existing,
            gzip=try_gzip,
            filter_on=filter_on)

        return new_entry

    def add_alias(self, alias, source, clean=True):
        """Add an alias, optionally 'cleaning' the alias string.

        Calls the parent `catalog` method `clean_entry_name` - to apply the
        same name-cleaning as is applied to entry names themselves.

        Returns
        -------
        alias : str
            The stored version of the alias (cleaned or not).

        """
        if clean:
            alias = self.catalog.clean_entry_name(alias)
        self.add_quantity(self._KEYS.ALIAS, alias, source)
        return alias

    def add_error(self, value, **kwargs):
        """Add an `Error` instance to this entry."""
        kwargs.update({ERROR.VALUE: value})
        self._add_cat_dict(Error, self._KEYS.ERRORS, **kwargs)
        return

    def add_photometry(self, compare_to_existing=True, **kwargs):
        """Add a `Photometry` instance to this entry."""
        self._add_cat_dict(
            Photometry,
            self._KEYS.PHOTOMETRY,
            compare_to_existing=compare_to_existing,
            **kwargs)
        return

    def merge_dupes(self):
        """Merge two entries that correspond to the same entry."""
        for dupe in self.dupe_of:
            if dupe in self.catalog.entries:
                if self.catalog.entries[dupe]._stub:
                    # merge = False to avoid infinite recursion
                    self.catalog.load_entry_from_name(
                        dupe, delete=True, merge=False)
                self.catalog.copy_entry_to_entry(
                    self.catalog.entries[dupe], self)
                del self.catalog.entries[dupe]
        self.dupe_of = []

    def add_quantity(self,
                     quantities,
                     value,
                     source,
                     check_for_dupes=True,
                     compare_to_existing=True,
                     **kwargs):
        """Add a `Quantity` instance to this entry.

        Returns False if any quantity turned out to duplicate an existing
        one (and was merged into it), True otherwise.
        """
        success = True
        for quantity in listify(quantities):
            kwargs.update({QUANTITY.VALUE: value, QUANTITY.SOURCE: source})
            cat_dict = self._add_cat_dict(
                Quantity,
                quantity,
                compare_to_existing=compare_to_existing,
                check_for_dupes=check_for_dupes,
                **kwargs)
            # `_add_cat_dict` returns the CatDict itself when a duplicate
            # absorbed the new data
            if isinstance(cat_dict, CatDict):
                self._append_additional_tags(quantity, source, cat_dict)
                success = False

        return success

    def add_self_source(self):
        """Add a source that refers to the catalog itself.

        For now this points to the Open Supernova Catalog by default.

        """
        return self.add_source(
            bibcode=self.catalog.OSC_BIBCODE,
            name=self.catalog.OSC_NAME,
            url=self.catalog.OSC_URL,
            secondary=True)

    def add_source(self, allow_alias=False, **kwargs):
        """Add a `Source` instance to this entry.

        Returns the source's alias string (existing alias when a duplicate
        is found), or None when construction failed.
        """
        if not allow_alias and SOURCE.ALIAS in kwargs:
            err_str = "`{}` passed in kwargs, this shouldn't happen!".format(
                SOURCE.ALIAS)
            self._log.error(err_str)
            raise RuntimeError(err_str)

        # Set alias number to be +1 of current number of sources
        if SOURCE.ALIAS not in kwargs:
            kwargs[SOURCE.ALIAS] = str(self.num_sources() + 1)
        source_obj = self._init_cat_dict(Source, self._KEYS.SOURCES, **kwargs)
        if source_obj is None:
            return None

        for item in self.get(self._KEYS.SOURCES, ''):
            if source_obj.is_duplicate_of(item):
                return item[item._KEYS.ALIAS]

        self.setdefault(self._KEYS.SOURCES, []).append(source_obj)
        return source_obj[source_obj._KEYS.ALIAS]

    def add_model(self, allow_alias=False, **kwargs):
        """Add a `Model` instance to this entry.

        Returns the model's alias string (existing alias when a duplicate
        is found), or None when construction failed.
        """
        if not allow_alias and MODEL.ALIAS in kwargs:
            # BUG FIX: error message previously formatted with SOURCE.ALIAS
            err_str = "`{}` passed in kwargs, this shouldn't happen!".format(
                MODEL.ALIAS)
            self._log.error(err_str)
            raise RuntimeError(err_str)

        # Set alias number to be +1 of current number of models
        if MODEL.ALIAS not in kwargs:
            kwargs[MODEL.ALIAS] = str(self.num_models() + 1)
        model_obj = self._init_cat_dict(Model, self._KEYS.MODELS, **kwargs)
        if model_obj is None:
            return None

        for item in self.get(self._KEYS.MODELS, ''):
            if model_obj.is_duplicate_of(item):
                return item[item._KEYS.ALIAS]

        self.setdefault(self._KEYS.MODELS, []).append(model_obj)
        return model_obj[model_obj._KEYS.ALIAS]

    def add_spectrum(self, compare_to_existing=True, **kwargs):
        """Add a `Spectrum` instance to this entry."""
        spec_key = self._KEYS.SPECTRA
        # Make sure that a source is given, and is valid (nor erroneous)
        source = self._check_cat_dict_source(Spectrum, spec_key, **kwargs)
        if source is None:
            return None

        # Try to create a new instance of `Spectrum`
        new_spectrum = self._init_cat_dict(Spectrum, spec_key, **kwargs)
        if new_spectrum is None:
            return None

        is_dupe = False
        for item in self.get(spec_key, []):
            # Only the `filename` should be compared for duplicates. If a
            # duplicate is found, that means the previous `exclude` array
            # should be saved to the new object, and the old deleted
            if new_spectrum.is_duplicate_of(item):
                if SPECTRUM.EXCLUDE in new_spectrum:
                    item[SPECTRUM.EXCLUDE] = new_spectrum[SPECTRUM.EXCLUDE]
                elif SPECTRUM.EXCLUDE in item:
                    item.update(new_spectrum)
                is_dupe = True
                break

        if not is_dupe:
            self.setdefault(spec_key, []).append(new_spectrum)
        return

    def check(self):
        """Check that the entry has the required fields.

        Raises
        ------
        ValueError
            If the entry's name is missing or empty.
        """
        # Make sure there is a schema key in dict
        if self._KEYS.SCHEMA not in self:
            self[self._KEYS.SCHEMA] = self.catalog.SCHEMA.URL
        # Make sure there is a name key in dict
        if (self._KEYS.NAME not in self or
                len(self[self._KEYS.NAME]) == 0):
            raise ValueError("Entry name is empty:\n\t{}".format(
                json.dumps(self, indent=2)))
        return

    def clean_internal(self, data=None):
        """Clean input from 'internal', human added data.

        This is used in the 'Entry.init_from_file' method.  Base
        implementation is a pass-through; subclasses override.
        """
        return data

    def extra_aliases(self):
        """Return aliases considered when merging duplicates."""
        return []

    def get_aliases(self, includename=True):
        """Retrieve the aliases of this object as a list of strings.

        Arguments
        ---------
        includename : bool
            Include the 'name' parameter in the list of aliases.
        """
        # empty list if doesnt exist
        alias_quanta = self.get(self._KEYS.ALIAS, [])
        aliases = [aq[QUANTITY.VALUE] for aq in alias_quanta]
        if includename and self[self._KEYS.NAME] not in aliases:
            aliases = [self[self._KEYS.NAME]] + aliases
        return aliases

    def get_entry_text(self, fname):
        """Retrieve the raw text from a file (transparently handles .gz)."""
        if fname.split('.')[-1] == 'gz':
            with gz.open(fname, 'rt') as f:
                filetext = f.read()
        else:
            with codecs.open(fname, 'r') as f:
                filetext = f.read()
        return filetext

    def get_source_by_alias(self, alias):
        """Given an alias, find the corresponding source in this entry.

        If the given alias doesn't exist (e.g. there are no sources), then a
        `ValueError` is raised.

        Arguments
        ---------
        alias : str
            The str-integer (e.g. '8') of the target source.

        Returns
        -------
        source : `astrocats.catalog.source.Source` object
            The source object corresponding to the passed alias.

        """
        for source in self.get(self._KEYS.SOURCES, []):
            if source[self._KEYS.ALIAS] == alias:
                return source
        raise ValueError("Source '{}': alias '{}' not found!".format(self[
            self._KEYS.NAME], alias))

    def get_stub(self):
        """Get a new `Entry` which contains the 'stub' of this one.

        The 'stub' is only the name and aliases.

        Usage:
        -----
        To convert a normal entry into a stub (for example), overwrite the
        entry in place, i.e.
        >>> entries[name] = entries[name].get_stub()

        Returns
        -------
        stub : `astrocats.catalog.entry.Entry` subclass object
            The type of the returned object is this instance's type.

        """
        stub = type(self)(self.catalog, self[self._KEYS.NAME], stub=True)
        if self._KEYS.ALIAS in self:
            stub[self._KEYS.ALIAS] = self[self._KEYS.ALIAS]
        if self._KEYS.DISTINCT_FROM in self:
            stub[self._KEYS.DISTINCT_FROM] = self[self._KEYS.DISTINCT_FROM]
        if self._KEYS.RA in self:
            stub[self._KEYS.RA] = self[self._KEYS.RA]
        if self._KEYS.DEC in self:
            stub[self._KEYS.DEC] = self[self._KEYS.DEC]
        if self._KEYS.DISCOVER_DATE in self:
            stub[self._KEYS.DISCOVER_DATE] = self[self._KEYS.DISCOVER_DATE]
        if self._KEYS.SOURCES in self:
            stub[self._KEYS.SOURCES] = self[self._KEYS.SOURCES]
        return stub

    def is_erroneous(self, field, sources):
        """Check if attribute has been marked as being erroneous.

        `sources` is a comma-delimited string of source aliases.
        """
        if self._KEYS.ERRORS in self:
            my_errors = self[self._KEYS.ERRORS]
            for alias in sources.split(','):
                source = self.get_source_by_alias(alias)
                bib_err_values = [
                    err[ERROR.VALUE] for err in my_errors
                    if err[ERROR.KIND] == SOURCE.BIBCODE and
                    err[ERROR.EXTRA] == field
                ]
                if (SOURCE.BIBCODE in source and
                        source[SOURCE.BIBCODE] in bib_err_values):
                    return True

                name_err_values = [
                    err[ERROR.VALUE] for err in my_errors
                    if err[ERROR.KIND] == SOURCE.NAME and
                    err[ERROR.EXTRA] == field
                ]
                if (SOURCE.NAME in source and
                        source[SOURCE.NAME] in name_err_values):
                    return True

        return False

    def is_private(self, key, sources):
        """Check if attribute is private (all of its sources are private)."""
        # aliases are always public.
        if key == ENTRY.ALIAS:
            return False
        return all([
            SOURCE.PRIVATE in self.get_source_by_alias(x)
            for x in sources.split(',')
        ])

    def name(self):
        """Return own name, or None when unset."""
        try:
            return self[self._KEYS.NAME]
        except KeyError:
            return None

    def num_sources(self):
        """Return the current number of sources stored in this instance.

        Returns
        -------
        len : int
            The *integer* number of existing sources.
        """
        return len(self.get(self._KEYS.SOURCES, []))

    def num_models(self):
        """Return the current number of models stored in this instance.

        Returns
        -------
        len : int
            The *integer* number of existing models.
        """
        return len(self.get(self._KEYS.MODELS, []))

    def priority_prefixes(self):
        """Return prefixes to given priority when merging duplicate entries."""
        return ()

    def sanitize(self):
        """Sanitize the data (sort it, etc.) before writing it to disk.

        Template method that can be overridden in each catalog's subclassed
        `Entry` object.
        """
        name = self[self._KEYS.NAME]

        aliases = self.get_aliases(includename=False)
        if name not in aliases:
            # Assign the first source to alias, if not available assign us.
            if self._KEYS.SOURCES in self:
                self.add_quantity(self._KEYS.ALIAS, name, '1')
                if self._KEYS.ALIAS not in self:
                    source = self.add_self_source()
                    self.add_quantity(self._KEYS.ALIAS, name, source)
            else:
                source = self.add_self_source()
                self.add_quantity(self._KEYS.ALIAS, name, source)

        if self._KEYS.ALIAS in self:
            self[self._KEYS.ALIAS].sort(
                key=lambda key: alias_priority(name, key[QUANTITY.VALUE]))
        else:
            self._log.error(
                'There should be at least one alias for `{}`.'.format(name))

        if self._KEYS.PHOTOMETRY in self:
            # Sort photometry by time, then band, then magnitude
            self[self._KEYS.PHOTOMETRY].sort(
                key=lambda x: (
                    (float(x[PHOTOMETRY.TIME])
                     if isinstance(x[PHOTOMETRY.TIME],
                                   (basestring, float, int))
                     else min([float(y) for y in x[PHOTOMETRY.TIME]]))
                    if PHOTOMETRY.TIME in x else 0.0,
                    x[PHOTOMETRY.BAND] if PHOTOMETRY.BAND in x else '',
                    float(x[PHOTOMETRY.MAGNITUDE])
                    if PHOTOMETRY.MAGNITUDE in x else ''))

        if (self._KEYS.SPECTRA in self and list(
                filter(None, [
                    SPECTRUM.TIME in x for x in self[self._KEYS.SPECTRA]
                ]))):
            self[self._KEYS.SPECTRA].sort(
                key=lambda x: (
                    float(x[SPECTRUM.TIME]) if SPECTRUM.TIME in x else 0.0,
                    x[SPECTRUM.FILENAME] if SPECTRUM.FILENAME in x else ''))

        if self._KEYS.SOURCES in self:
            # Remove orphan sources
            source_aliases = [
                x[SOURCE.ALIAS] for x in self[self._KEYS.SOURCES]
            ]
            # Sources with the `PRIVATE` attribute are always retained
            source_list = [
                x[SOURCE.ALIAS] for x in self[self._KEYS.SOURCES]
                if SOURCE.PRIVATE in x
            ]
            for key in self:
                # if self._KEYS.get_key_by_name(key).no_source:
                if (key in [
                        self._KEYS.NAME, self._KEYS.SCHEMA,
                        self._KEYS.SOURCES, self._KEYS.ERRORS
                ]):
                    continue
                for item in self[key]:
                    source_list += item[item._KEYS.SOURCE].split(',')

            new_src_list = sorted(
                list(set(source_aliases).intersection(source_list)))
            new_sources = []
            for source in self[self._KEYS.SOURCES]:
                if source[SOURCE.ALIAS] in new_src_list:
                    new_sources.append(source)
                else:
                    self._log.info('Removing orphaned source from `{}`.'
                                   .format(name))

            # Drop the key entirely when no sources survive, rather than
            # storing an empty list
            if not new_sources:
                del self[self._KEYS.SOURCES]
            else:
                self[self._KEYS.SOURCES] = new_sources

    def save(self, bury=False, final=False):
        """Write entry to JSON file in the proper location.

        Arguments
        ---------
        bury : bool
        final : bool
            If this is the 'final' save, perform additional sanitization and
            cleaning operations.

        """
        outdir, filename = self._get_save_path(bury=bury)

        if final:
            self.sanitize()

        # FIX: use 'dump' not 'dumps'
        jsonstring = json.dumps(
            {self[self._KEYS.NAME]: self._ordered(self)},
            indent='\t' if sys.version_info[0] >= 3 else 4,
            separators=(',', ':'),
            ensure_ascii=False)
        if not os.path.isdir(outdir):
            raise RuntimeError("Output directory '{}' for event '{}' does "
                               "not exist.".format(outdir,
                                                   self[self._KEYS.NAME]))
        save_name = os.path.join(outdir, filename + '.json')
        with codecs.open(save_name, 'w', encoding='utf8') as sf:
            sf.write(jsonstring)

        if not os.path.exists(save_name):
            raise RuntimeError("File '{}' was not saved!".format(save_name))

        return save_name

    def set_preferred_name(self):
        """Set a preferred name for the entry."""
        return self[self._KEYS.NAME]

    def sort_func(self, key):
        """Used to sort keys when writing Entry to JSON format.

        Should be supplemented/overridden by inheriting classes.
        """
        if key == self._KEYS.SCHEMA:
            return 'aaa'
        if key == self._KEYS.NAME:
            return 'aab'
        if key == self._KEYS.SOURCES:
            return 'aac'
        if key == self._KEYS.ALIAS:
            return 'aad'
        if key == self._KEYS.MODELS:
            return 'aae'
        if key == self._KEYS.PHOTOMETRY:
            return 'zzy'
        if key == self._KEYS.SPECTRA:
            return 'zzz'
        return key
def init_from_file(cls,
                   catalog,
                   name=None,
                   path=None,
                   clean=False,
                   merge=True,
                   pop_schema=True,
                   ignore_keys=[],
                   compare_to_existing=True,
                   try_gzip=False,
                   filter_on={}):
    """Construct a new `Entry` instance from an input file.

    The input file can be given explicitly by `path`, or a path will
    be constructed appropriately if possible.

    Arguments
    ---------
    catalog : `astrocats.catalog.catalog.Catalog` instance
        The parent catalog object of which this entry belongs.
    name : str or 'None'
        The name of this entry, e.g. `SN1987A` for a `Supernova` entry.
        If no `path` is given, a path is constructed by trying to find
        a file in one of the 'output' repositories with this `name`.
        note: either `name` or `path` must be provided.
    path : str or 'None'
        The absolutely path of the input file.
        note: either `name` or `path` must be provided.
    clean : bool
        Whether special sanitization processing should be done on the
        input data.  This is mostly for input files from the 'internal'
        repositories.

    """
    if not catalog:
        from astrocats.catalog.catalog import Catalog
        log = logging.getLogger()
        catalog = Catalog(None, log)

    catalog.log.debug("init_from_file()")
    if name is None and path is None:
        err = ("Either entry `name` or `path` must be specified to load "
               "entry.")
        # BUG FIX: `log` was only bound when no catalog was passed;
        # always use the catalog's logger.
        catalog.log.error(err)
        raise ValueError(err)

    # If the path is given, use that to load from
    load_path = ''
    if path is not None:
        load_path = path
        name = ''
    # If the name is given, try to find a path for it
    else:
        repo_paths = catalog.PATHS.get_repo_output_folders()
        for rep in repo_paths:
            filename = cls.get_filename(name)
            newpath = os.path.join(rep, filename + '.json')
            if os.path.isfile(newpath):
                load_path = newpath
                break

    if load_path is None or not os.path.isfile(load_path):
        # FIX: is this warning worthy?
        return None

    # Create a new `Entry` instance
    new_entry = cls(catalog, name)

    # Check if .gz file
    if try_gzip and not load_path.endswith('.gz'):
        try_gzip = False

    # Fill it with data from json file
    new_entry._load_data_from_json(
        load_path,
        clean=clean,
        merge=merge,
        pop_schema=pop_schema,
        ignore_keys=ignore_keys,
        compare_to_existing=compare_to_existing,
        gzip=try_gzip,
        filter_on=filter_on)

    return new_entry
def generate_event_list(self, event_list):
    """Generate a list of events and/or convert events to JSON format.

    Each item of `event_list` that looks like a non-JSON data file
    (contains a '.', exists on disk, does not end in '.json') is parsed —
    first with astropy's CDS and LaTeX table readers, then as delimited
    text — interactively prompting the user for anything ambiguous
    (delimiter, event name, photometry source, time offsets).  Parsed
    photometry is written to one '<name>.json' file per event via
    `Entry`/`entabbed_json_dump`, and the produced filenames replace the
    original item in the returned list.  Items that do not look like data
    files are passed through unchanged.

    Returns
    -------
    new_event_list : list of str
        Input items, with converted data files replaced by the JSON
        file names they produced.
    """
    prt = self._printer

    # Column-identity map (column key -> index/spec); deliberately kept
    # across loop iterations so consecutive files with the same layout
    # can reuse it (the user is asked whether the layout is the same).
    cidict = OrderedDict()
    intro_shown = False

    new_event_list = []
    previous_file = None
    for event in event_list:
        rsource = {SOURCE.NAME: self._DEFAULT_SOURCE}
        use_self_source = None
        new_events = []
        toffset = Decimal('0')
        # Only treat as a convertible data file: has an extension, exists,
        # and is not already JSON.
        if ('.' in event and os.path.isfile(event) and
                not event.endswith('.json')):
            if not intro_shown:
                prt.message('converter_info')
                intro_shown = True

            prt.message('converting_to_json', [event])

            with open(event, 'r') as f:
                ftxt = f.read()

            # Try a couple of table formats from astropy.
            table = None
            try:
                table = read(ftxt, Reader=Cds, guess=False)
            except Exception:
                pass
            else:
                prt.message('convert_cds')
                flines = [table.colnames] + [
                    list(x) for x in np.array(table).tolist()]
                for i in range(len(flines)):
                    flines[i] = [str(x) for x in flines[i]]

            try:
                table = read(ftxt, Reader=Latex, guess=False)
            except Exception:
                pass
            else:
                prt.message('convert_latex')
                flines = [table.colnames] + [
                    list(x) for x in np.array(table).tolist()]

            # Fall back to hand-rolled delimited-text parsing when neither
            # astropy reader succeeded.
            if table is None:
                # Count to try and determine delimiter.
                delims = [' ', '\t', ',', ';', '|', '&']
                delimnames = [
                    'Space: ` `', 'Tab: `\t`', 'Comma: `,`',
                    'Semi-colon: `;`', 'Bar: `|`', 'Ampersand: `&`']
                delim = None
                delimcounts = [ftxt.count(x) for x in delims]
                maxdelimcount = max(delimcounts)
                delim = delims[delimcounts.index(maxdelimcount)]
                # If two delimiter options are close in count, ask user.
                for i, x in enumerate(delimcounts):
                    if x > 0.5 * maxdelimcount and delims[i] != delim:
                        delim = None
                if delim is None:
                    odelims = list(np.array(delimnames)[
                        np.array(delimcounts) > 0])
                    # NOTE(review): the prompt index is into the filtered
                    # `odelims` list but is used to index the unfiltered
                    # `delims` — these can disagree when some delimiters
                    # had zero counts; verify intended behavior.
                    delim = delims[prt.prompt(
                        'delim', kind='option', options=odelims) - 1]
                # `ad` = all delimiters except the chosen one; used below
                # as a strip set.
                ad = list(delims)
                ad.remove(delim)
                ad = ''.join(ad)

                fsplit = ftxt.splitlines()
                # Normalize common typography: drop '$', turn '+/-'
                # markers and '(' into extra delimiter-separated fields.
                fsplit = [
                    x.replace('$', '').replace('\\pm', delim)
                    .replace('±', delim).replace('(', delim + '(')
                    .strip(ad + '()# ').replace('′', "'")
                    for x in fsplit]
                flines = []
                for fs in fsplit:
                    flines.append(list(
                        csv.reader([fs], delimiter=delim))[0])

                flines = [[
                    x.strip(ad + '#$()\\') for x in y] for y in flines]

                # Find band columns if they exist and insert error columns
                # if they don't exist.
                for fi, fl in enumerate(list(flines)):
                    flcopy = list(fl)
                    offset = 0
                    # Only header-like rows (no numeric cells) are edited.
                    if not any([is_number(x) for x in fl]):
                        for fci, fc in enumerate(fl):
                            if (fc in self._band_names and
                                    (fci == len(fl) - 1 or
                                     fl[fci + 1] not in self._emagstrs)):
                                flcopy.insert(fci + 1 + offset, 'e mag')
                                offset += 1
                    flines[fi] = flcopy

                # Find the most frequent column count. These are probably
                # the tables we wish to read.
                flens = [len(x) for x in flines]
                ncols = Counter(flens).most_common(1)[0][0]

                # A lone cell just before the table body is taken as a
                # potential event name and prepended to data rows.
                newlines = []
                potential_name = None
                for fi, fl in enumerate(flines):
                    if (len(fl) and flens[fi] == 1 and
                            fi < len(flines) - 1 and
                            flens[fi + 1] == ncols and not len(newlines)):
                        potential_name = fl[0]
                    if flens[fi] == ncols:
                        if potential_name is not None and any(
                                [is_number(x) for x in fl]):
                            newlines.append([potential_name] + list(fl))
                        else:
                            newlines.append(list(fl))
                flines = newlines
                for fi, fl in enumerate(flines):
                    if len(fl) == ncols and potential_name is not None:
                        if not any([is_number(x) for x in fl]):
                            flines[fi] = ['name'] + list(fl)

            # If none of the rows contain numeric data, the file
            # is likely a list of transient names.
            if (len(flines) and (not any(
                    any([is_number(x) or x == '' for x in y])
                    for y in flines) or len(flines) == 1)):
                new_events = [
                    it for s in flines for it in s]

            # If last row is numeric, then likely this is a file with
            # transient data.
            elif (len(flines) > 1 and
                  any([is_number(x) for x in flines[-1]])):

                # Check that each row has the same number of columns.
                if len(set([len(x) for x in flines])) > 1:
                    print(set([len(x) for x in flines]))
                    raise ValueError(
                        'Number of columns in each row not '
                        'consistent!')

                # Reuse column identities from the previous file only if
                # the user confirms the layout is the same.
                if len(cidict) and len(new_event_list):
                    msg = ('is_file_same'
                           if previous_file else 'is_event_same')
                    reps = [previous_file] if previous_file else [''.join(
                        new_event_list[-1].split('.')[:-1])]
                    text = prt.text(msg, reps)
                    is_same = prt.prompt(text, message=False, kind='bool')
                    if not is_same:
                        cidict = OrderedDict()

                # If the first row has no numbers it is likely a header.
                if not len(cidict):
                    self.assign_columns(cidict, flines)

                # `perms`: number of photometry permutations per row
                # (e.g. several band columns per line).
                perms = 1
                for key in cidict:
                    if isinstance(cidict[key], list) and not isinstance(
                            cidict[key], string_types):
                        if cidict[key][0] != 'j':
                            perms = len(cidict[key])

                # Get event name (if single event) or list of names from
                # table.
                event_names = []
                if ENTRY.NAME in cidict:
                    for fi, fl in enumerate(flines):
                        flines[fi][cidict[ENTRY.NAME]] = name_clean(
                            fl[cidict[ENTRY.NAME]])
                    event_names = list(sorted(set([
                        x[cidict[ENTRY.NAME]] for x in flines[
                            self._first_data:]])))
                    new_events = [x + '.json' for x in event_names]
                else:
                    # No name column: guess the event name from the file
                    # name, and ask the user to confirm or re-enter it.
                    new_event_name = '.'.join(event.split(
                        '.')[:-1]).split('/')[-1]
                    text = prt.message(
                        'is_event_name', [new_event_name], prt=False)
                    is_name = prt.prompt(text, message=False,
                                         kind='bool', default='y')
                    if not is_name:
                        new_event_name = ''
                        while new_event_name.strip() == '':
                            new_event_name = prt.prompt(
                                'enter_name', kind='string')
                    event_names.append(new_event_name)
                    new_events = [new_event_name + '.json']

                # Create a new event, populate the photometry, and dump
                # to a JSON file in the run directory.
                entries = OrderedDict(
                    [(x, Entry(name=x)) for x in event_names])

                # Clean up the data a bit now that we know the column
                # identities.

                # Strip common prefixes/suffixes from band names
                if PHOTOMETRY.BAND in cidict:
                    bi = cidict[PHOTOMETRY.BAND]
                    # d=True strips a common prefix, d=False a common
                    # suffix, one character at a time.
                    for d in [True, False]:
                        if not isinstance(bi, (int, np.integer)):
                            break
                        strip_cols = []
                        lens = [len(x[bi])
                                for x in flines[self._first_data:]]
                        llen = min(lens)
                        ra = range(llen) if d else range(-1, -llen - 1, -1)
                        for li in ra:
                            letter = None
                            for row in list(flines[self._first_data:]):
                                if letter is None:
                                    letter = row[bi][li]
                                elif row[bi][li] != letter:
                                    letter = None
                                    break
                            if letter is not None:
                                strip_cols.append(li)
                            else:
                                break
                        # Don't strip the whole band string away.
                        if len(strip_cols) == llen:
                            break
                        for ri in range(len(flines[self._first_data:])):
                            flines[self._first_data + ri][bi] = ''.join(
                                [c for i, c in enumerate(flines[
                                    self._first_data + ri][bi])
                                 if (i if d else i - len(flines[
                                     self._first_data + ri][bi]))
                                 not in strip_cols])

                # Detect small (non-MJD) or large (JD) time values and
                # offer the user an offset to bring them onto MJD.
                if (PHOTOMETRY.TIME in cidict and
                        (not isinstance(cidict[PHOTOMETRY.TIME], list) or
                         len(cidict[PHOTOMETRY.TIME]) <= 2)):
                    bi = cidict[PHOTOMETRY.TIME]

                    if isinstance(bi, list) and not isinstance(
                            bi, string_types) and isinstance(
                            bi[0], string_types) and bi[0] == 'jd':
                        bi = bi[-1]

                    mmtimes = [float(x[bi])
                               for x in flines[self._first_data:]]
                    mintime, maxtime = min(mmtimes), max(mmtimes)

                    if mintime < 10000:
                        while True:
                            try:
                                response = prt.prompt(
                                    'small_time_offset', kind='string')
                                if response is not None:
                                    toffset = Decimal(response)
                                break
                            except Exception:
                                pass
                    elif maxtime > 60000 and cidict[
                            PHOTOMETRY.TIME][0] != 'jd':
                        isjd = prt.prompt(
                            'large_time_offset', kind='bool', default='y')
                        if isjd:
                            # JD -> MJD conversion constant.
                            toffset = Decimal('-2400000.5')

                # Main per-row photometry extraction.
                for row in flines[self._first_data:]:
                    photodict = {}
                    rname = (row[cidict[ENTRY.NAME]]
                             if ENTRY.NAME in cidict else event_names[0])
                    for pi in range(perms):
                        sources = set()
                        for key in cidict:
                            if key in self._bool_keys:
                                rval = row[cidict[key]]

                                if rval in self._FALSE_VALS:
                                    rval = False
                                elif rval in self._TRUE_VALS:
                                    rval = True

                                # NOTE(review): `type(rval) != 'bool'`
                                # compares a type object to the STRING
                                # 'bool' and is therefore always True;
                                # likely `type(rval) is not bool` was
                                # intended — confirm before changing.
                                if type(rval) != 'bool':
                                    try:
                                        rval = bool(rval)
                                    except Exception:
                                        pass
                                if type(rval) != 'bool':
                                    try:
                                        rval = bool(float(rval))
                                    except Exception:
                                        rval = True

                                if not rval:
                                    continue
                                row[cidict[key]] = rval
                            elif key == 'reference':
                                # A 19-char string is assumed to be an
                                # ADS bibcode.
                                if (isinstance(cidict[key],
                                               string_types) and
                                        len(cidict[key]) == 19):
                                    new_src = entries[rname].add_source(
                                        bibcode=cidict[key])
                                    sources.update(new_src)
                                    row[cidict[key]] = new_src
                            elif key == ENTRY.NAME:
                                continue
                            elif (isinstance(key, Key) and
                                  key.type == KEY_TYPES.TIME and
                                  isinstance(cidict[key], list) and not
                                  isinstance(cidict[key], string_types)):
                                # Multi-column date: either
                                # ['j', ...cols] (year-month-day pieces)
                                # or ['jd', col] (Julian date).
                                tval = np.array(row)[np.array(cidict[key][
                                    1:], dtype=int)]
                                if cidict[key][0] == 'j':
                                    date = '-'.join(
                                        [x.zfill(2) for x in tval])
                                    date = self._month_rep.sub(
                                        lambda x: self._MONTH_IDS[
                                            x.group()], date)
                                    photodict[key] = str(
                                        astrotime(date,
                                                  format='isot').mjd)
                                elif cidict[key][0] == 'jd':
                                    photodict[key] = str(
                                        jd_to_mjd(Decimal(tval[-1])))
                                continue

                            # Generic column: `val` is either a literal
                            # string value or a column index.
                            val = cidict[key]
                            if (isinstance(val, list) and not
                                    isinstance(val, string_types)):
                                val = val[pi]
                                if isinstance(val, string_types):
                                    if val != '':
                                        photodict[key] = val
                                else:
                                    photodict[key] = row[val]
                            else:
                                if isinstance(val, string_types):
                                    if val != '':
                                        photodict[key] = val
                                else:
                                    photodict[key] = row[val]

                        # Data type 2: count rates -> magnitudes.
                        if self._data_type == 2:
                            if self._zp:
                                photodict[
                                    PHOTOMETRY.ZERO_POINT] = self._zp
                            else:
                                photodict[PHOTOMETRY.ZERO_POINT] = (
                                    row[cidict[
                                        PHOTOMETRY.ZERO_POINT][pi]]
                                    if isinstance(cidict[
                                        PHOTOMETRY.ZERO_POINT],
                                        list) else
                                    row[cidict[PHOTOMETRY.ZERO_POINT]])
                            zpp = photodict[PHOTOMETRY.ZERO_POINT]
                            cc = (
                                row[cidict[PHOTOMETRY.COUNT_RATE][pi]]
                                if isinstance(cidict[
                                    PHOTOMETRY.COUNT_RATE], list) else
                                row[cidict[PHOTOMETRY.COUNT_RATE]])
                            ecc = (
                                row[cidict[PHOTOMETRY.E_COUNT_RATE][pi]]
                                if isinstance(cidict[
                                    PHOTOMETRY.E_COUNT_RATE], list) else
                                row[cidict[PHOTOMETRY.E_COUNT_RATE]])
                            # '<' marks an upper limit: use error only.
                            if '<' in cc:
                                set_pd_mag_from_counts(
                                    photodict, ec=cc.strip('<'), zp=zpp)
                            else:
                                set_pd_mag_from_counts(
                                    photodict, c=cc, ec=ecc, zp=zpp)
                        # Data type 3: flux densities -> magnitudes.
                        elif self._data_type == 3:
                            photodict[
                                PHOTOMETRY.U_FLUX_DENSITY] = self._ufd
                            if PHOTOMETRY.U_FLUX_DENSITY in cidict:
                                photodict[PHOTOMETRY.U_FLUX_DENSITY] = (
                                    row[cidict[
                                        PHOTOMETRY.U_FLUX_DENSITY][pi]]
                                    if isinstance(cidict[
                                        PHOTOMETRY.
                                        U_FLUX_DENSITY], list) else
                                    row[cidict[
                                        PHOTOMETRY.U_FLUX_DENSITY]])
                            # Default unit when none supplied.
                            if photodict[
                                    PHOTOMETRY.U_FLUX_DENSITY] == '':
                                photodict[
                                    PHOTOMETRY.U_FLUX_DENSITY] = 'µJy'
                            fd = (
                                row[cidict[PHOTOMETRY.FLUX_DENSITY][pi]]
                                if isinstance(cidict[
                                    PHOTOMETRY.FLUX_DENSITY], list) else
                                row[cidict[PHOTOMETRY.FLUX_DENSITY]])
                            efd = (
                                row[cidict[
                                    PHOTOMETRY.E_FLUX_DENSITY][pi]]
                                if isinstance(cidict[
                                    PHOTOMETRY.E_FLUX_DENSITY],
                                    list) else
                                row[cidict[PHOTOMETRY.E_FLUX_DENSITY]])

                            # Convert mJy/Jy to µJy before conversion.
                            mult = Decimal('1')
                            ufd = photodict[PHOTOMETRY.U_FLUX_DENSITY]
                            if ufd.lower() in [
                                    'mjy', 'millijy', 'millijansky']:
                                mult = Decimal('1e3')
                            elif ufd.lower() in ['jy', 'jansky']:
                                mult = Decimal('1e6')

                            if '<' in fd:
                                set_pd_mag_from_flux_density(
                                    photodict, efd=str(
                                        Decimal(fd.strip('<')) * mult))
                            else:
                                set_pd_mag_from_flux_density(
                                    photodict, fd=Decimal(fd) * mult,
                                    efd=Decimal(efd) * mult)
                        # No source column was found: ask the user once
                        # and reuse the answer for all remaining rows.
                        if not len(sources):
                            if use_self_source is None:
                                sopts = [
                                    ('Bibcode', 'b'), ('Last name', 'l')]
                                if self._require_source:
                                    sel_str = 'must_select_source'
                                else:
                                    sel_str = 'select_source'
                                text = prt.text(sel_str)
                                skind = prt.prompt(
                                    text, kind='option', options=sopts,
                                    default='b', none_string=(
                                        None if self._require_source else
                                        'Neither, tag MOSFiT as source'))
                                if skind == 'b':
                                    rsource = {}
                                    bibcode = ''
                                    # Loop until a string matching the
                                    # 19-char ADS bibcode pattern is
                                    # entered.
                                    while len(bibcode) != 19:
                                        bibcode = prt.prompt(
                                            'bibcode', kind='string',
                                            allow_blank=False)
                                        bibcode = bibcode.strip()
                                        if (re.search(
                                                '[0-9]{4}..........'
                                                '[\.0-9]{4}'
                                                '[A-Za-z]',
                                                bibcode) is None):
                                            bibcode = ''
                                    rsource[SOURCE.BIBCODE] = bibcode
                                    use_self_source = False
                                elif skind == 'l':
                                    rsource = {}
                                    last_name = prt.prompt(
                                        'last_name', kind='string')
                                    rsource[SOURCE.NAME] = (
                                        last_name.strip().title() +
                                        ' et al., in preparation')
                                    use_self_source = False
                                elif skind == 'n':
                                    use_self_source = True

                            photodict[PHOTOMETRY.SOURCE] = entries[
                                rname].add_source(**rsource)

                        # '<'/'>' in magnitudes, count rates, or flux
                        # densities mark limits: set the flag and strip
                        # the marker (and drop the now-meaningless
                        # error values).
                        if any([x in photodict.get(
                                PHOTOMETRY.MAGNITUDE, '')
                                for x in ['<', '>']]):
                            photodict[PHOTOMETRY.UPPER_LIMIT] = True
                            photodict[
                                PHOTOMETRY.MAGNITUDE] = photodict[
                                    PHOTOMETRY.MAGNITUDE].strip('<>')

                        if '<' in photodict.get(
                                PHOTOMETRY.COUNT_RATE, ''):
                            photodict[PHOTOMETRY.UPPER_LIMIT] = True
                            photodict[
                                PHOTOMETRY.COUNT_RATE] = photodict[
                                    PHOTOMETRY.COUNT_RATE].strip('<')
                            if PHOTOMETRY.E_COUNT_RATE in photodict:
                                del(photodict[PHOTOMETRY.E_COUNT_RATE])

                        if '<' in photodict.get(
                                PHOTOMETRY.FLUX_DENSITY, ''):
                            photodict[PHOTOMETRY.UPPER_LIMIT] = True
                            photodict[
                                PHOTOMETRY.FLUX_DENSITY] = photodict[
                                    PHOTOMETRY.FLUX_DENSITY].strip('<')
                            if PHOTOMETRY.E_FLUX_DENSITY in photodict:
                                del(photodict[
                                    PHOTOMETRY.E_FLUX_DENSITY])

                        # Apply offset time if set.
                        if (PHOTOMETRY.TIME in photodict and
                                toffset != Decimal('0')):
                            photodict[PHOTOMETRY.TIME] = str(
                                Decimal(photodict[PHOTOMETRY.TIME]) +
                                toffset)

                        # Skip entries for which key values are not
                        # expected type.
                        if not all([
                                is_number(photodict.get(x, ''))
                                for x in photodict.keys()
                                if (PHOTOMETRY.get_key_by_name(x).type ==
                                    KEY_TYPES.NUMERIC)]):
                            continue

                        # Skip placeholder values.
                        if float(photodict.get(
                                PHOTOMETRY.MAGNITUDE, 0.0)) > 50.0:
                            continue

                        # Add system if specified by user.
                        if (self._system is not None and
                                PHOTOMETRY.SYSTEM not in photodict):
                            photodict[PHOTOMETRY.SYSTEM] = self._system

                        # Remove keys not in the `PHOTOMETRY` class.
                        for key in list(photodict.keys()):
                            if key not in PHOTOMETRY.vals():
                                del(photodict[key])

                        # Add the photometry.
                        entries[rname].add_photometry(**photodict)

                # Write each populated entry to '<name>.json', optionally
                # merging with an existing file (the user is asked once).
                merge_with_existing = None
                for ei, entry in enumerate(entries):
                    entries[entry].sanitize()
                    if os.path.isfile(new_events[ei]):
                        if merge_with_existing is None:
                            merge_with_existing = prt.prompt(
                                'merge_with_existing', default='y')
                        if merge_with_existing:
                            existing = Entry.init_from_file(
                                catalog=None,
                                name=event_names[ei],
                                path=new_events[ei],
                                merge=False,
                                pop_schema=False,
                                ignore_keys=[ENTRY.MODELS],
                                compare_to_existing=False)
                            Catalog().copy_entry_to_entry(
                                existing, entries[entry])

                    oentry = entries[entry]._ordered(entries[entry])
                    entabbed_json_dump(
                        {entry: oentry}, open(new_events[ei], 'w'),
                        separators=(',', ':'))

                self._converted.extend([
                    [event_names[x], new_events[x]]
                    for x in range(len(event_names))])

            new_event_list.extend(new_events)
            previous_file = event
        else:
            # Not a convertible data file; pass the item through as-is.
            new_event_list.append(event)

    return new_event_list