Example 1
 def contents(self):
     """A table summarizing the HDUs.
     """
     if self._contents is None:
         self._contents = list()
         self._contents.append(self.contents_header)
         for k in range(self.nhdr):
             if 'EXTNAME' in self.headers[k]:
                 extname = self.headers[k]['EXTNAME'].strip()
             else:
                 extname = ''
                 #
                 # Don't complain about missing EXTNAME on primary, empty HDUs.
                 # See https://github.com/desihub/desidatamodel/issues/69
                 #
                 if k > 0:
                     log.warning("HDU%d has no EXTNAME set!", k)
             if k > 0:
                 if 'ZTENSION' in self.headers[k]:
                     exttype = self.headers[k]['ZTENSION'].strip()
                 else:
                     exttype = self.headers[k]['XTENSION'].strip()
             else:
                 exttype = 'IMAGE'
             self._contents.append((self.hduname.format(k)+'_',
                                    extname, exttype,
                                    '*Brief Description*'))
     return self._contents
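
For reference, a hedged sketch of the table this property builds: the header row from contents_header followed by one row per HDU. The header tuple and HDU contents below are hypothetical, not taken from a real file.

# Illustrative final value of self._contents (all rows hypothetical):
contents = [
    ('Number', 'EXTNAME', 'Type', 'Contents'),     # assumed contents_header
    ('HDU0_', '', 'IMAGE', '*Brief Description*'), # empty EXTNAME allowed on primary HDU
    ('HDU1_', 'FIBERASSIGN', 'BINTABLE', '*Brief Description*'),
]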
Example 2
 def contents(self):
     """A table summarizing the HDUs.
     """
     if self._contents is None:
         self._contents = list()
         self._contents.append(self.contents_header)
         for k in range(self.nhdr):
             if 'EXTNAME' in self.headers[k]:
                 extname = self.headers[k]['EXTNAME'].strip()
             else:
                 extname = ''
                 #
                 # Don't complain about missing EXTNAME on primary, empty HDUs.
                 # See https://github.com/desihub/desidatamodel/issues/69
                 #
                 if k > 0:
                     log.warning("HDU%d has no EXTNAME set!", k)
             if k > 0:
                 if 'ZTENSION' in self.headers[k]:
                     exttype = self.headers[k]['ZTENSION'].strip()
                 else:
                     exttype = self.headers[k]['XTENSION'].strip()
             else:
                 exttype = 'IMAGE'
             self._contents.append((self.hduname.format(k) + '_', extname,
                                    exttype, '*Brief Description*'))
     return self._contents
Example 3
def update_truth(filepath, hdu=2, chunksize=50000, skip=('SLOPES', 'EMLINES')):
    """Add data from columns in other HDUs of the Truth table.

    Parameters
    ----------
    filepath : :class:`str`
        Full path to the data file.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 2).
    chunksize : :class:`int`, optional
        If set, update database `chunksize` rows at a time (default 50000).
    skip : :class:`tuple`, optional
        Do not load columns with these names (default ``('SLOPES', 'EMLINES')``).
    """
    tcls = Truth
    tn = tcls.__tablename__
    t = tcls.__table__
    if filepath.endswith('.fits'):
        with fits.open(filepath) as hdulist:
            data = hdulist[hdu].data
    elif filepath.endswith('.ecsv'):
        data = Table.read(filepath, format='ascii.ecsv')
    else:
        log.error("Unrecognized data file, %s!", filepath)
        return
    log.info("Read data from %s HDU %s", filepath, hdu)
    try:
        colnames = data.names
    except AttributeError:
        colnames = data.colnames
    for col in colnames:
        if data[col].dtype.kind == 'f':
            bad = np.isnan(data[col])
            if np.any(bad):
                nbad = bad.sum()
                log.warning("%d rows of bad data detected in column " +
                            "%s of %s.", nbad, col, filepath)
    log.info("Integrity check complete on %s.", tn)
    # if rowfilter is None:
    #     good_rows = np.ones((maxrows,), dtype=np.bool)
    # else:
    #     good_rows = rowfilter(data[0:maxrows])
    # data_list = [data[col][0:maxrows][good_rows].tolist() for col in colnames]
    data_list = [data[col].tolist() for col in colnames if col not in skip]
    data_names = [col.lower() for col in colnames if col not in skip]
    data_names[0] = 'b_targetid'
    finalrows = len(data_list[0])
    log.info("Initial column conversion complete on %s.", tn)
    del data
    data_rows = list(zip(*data_list))
    del data_list
    log.info("Converted columns into rows on %s.", tn)
    for k in range(finalrows//chunksize + 1):
        data_chunk = [dict(zip(data_names, row))
                      for row in data_rows[k*chunksize:(k+1)*chunksize]]
        q = t.update().where(t.c.targetid == bindparam('b_targetid'))
        if len(data_chunk) > 0:
            engine.execute(q, data_chunk)
            log.info("Updated %d rows in %s.",
                     min((k+1)*chunksize, finalrows), tn)
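
The update loop above writes rows in slices of chunksize; a self-contained sketch of that slicing pattern, showing why the final slice can be empty and is guarded by the length check:

rows = list(range(12))
chunksize = 5
for k in range(len(rows)//chunksize + 1):
    chunk = rows[k*chunksize:(k+1)*chunksize]
    if len(chunk) > 0:  # last slice is empty when len(rows) divides evenly
        print(k, chunk)
# 0 [0, 1, 2, 3, 4]
# 1 [5, 6, 7, 8, 9]
# 2 [10, 11]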
Example 4
def collect_files(root, files):
    """Scan a directory tree for files that correspond to data model files.

    Parameters
    ----------
    root : :class:`str`
        Path to real files on disk.
    files : :class:`list`
        A list of data model files.

    Notes
    -----
    Files are analyzed using this algorithm:

    * The first file that matches a regexp becomes the 'prototype' for that
      data model file.
    * If no files match a data model file, then files of that type are
      'missing'.
    * If a file does not match any regular expression, it is 'extraneous'.
    * If a file matches a regular expression that already has a prototype,
      it is 'ignored'.
    """
    ignore_directories = ('logs', 'scripts')
    include_extensions = ('.fits', '.fits.fz')
    for dirpath, dirnames, filenames in os.walk(root):
        for d in ignore_directories:
            try:
                dirnames.remove(d)
            except ValueError:
                pass
        include_filenames = list()
        for e in include_extensions:
            include_filenames += [f for f in filenames if f.endswith(e)]
        for f in include_filenames:
            extraneous_file = True
            fullname = os.path.join(dirpath, f)
            for r in files:
                if r.regexp is not None:
                    m = r.regexp.match(fullname)
                    if m is not None:
                        extraneous_file = False
                        if r.prototype is None:
                            r.prototype = fullname
            if extraneous_file:
                log.warning("Extraneous file detected: %s", fullname)
    #
    # Scan for missing files, but don't penalize (here) data models that
    # don't have a valid regular expression.  Files with bad regexeps will
    # be flagged elsewhere.
    #
    for r in files:
        if r.regexp is not None and r.prototype is None:
            log.warning("No files found matching %s!", r.filename)
    return
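
A minimal, self-contained sketch of the prototype/extraneous bookkeeping described in the Notes above, using a stand-in class; every name, pattern, and path here is hypothetical:

import logging
import re

log = logging.getLogger(__name__)

class ModelFile:
    """Stand-in for a data model file carrying a compiled regexp."""
    def __init__(self, filename, pattern):
        self.filename = filename
        self.regexp = re.compile(pattern)
        self.prototype = None

files = [ModelFile('spectra.rst', r'.*/spectra-[0-9]+\.fits$')]
for fullname in ('/data/spectra-123.fits', '/data/spectra-456.fits',
                 '/data/junk.fits'):
    matched = [r for r in files if r.regexp.match(fullname) is not None]
    if not matched:
        log.warning("Extraneous file detected: %s", fullname)  # junk.fits
    for r in matched:
        if r.prototype is None:
            r.prototype = fullname  # first match wins; spectra-456 is ignored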
Example 5
def collect_files(root, files):
    """Scan a directory tree for files that correspond to data model files.

    Parameters
    ----------
    root : :class:`str`
        Path to real files on disk.
    files : :class:`list`
        A list of data model files.

    Notes
    -----
    Files are analyzed using this algorithm:

    * The first file that matches a regexp becomes the 'prototype' for that
      data model file.
    * If no files match a data model file, then files of that type are
      'missing'.
    * If a file does not match any regular expression, it is 'extraneous'.
    * If a file matches a regular expression that already has a prototype,
      it is 'ignored'.
    """
    ignore_directories = ('logs', 'scripts')
    include_extensions = ('.fits', '.fits.fz')
    for dirpath, dirnames, filenames in os.walk(root):
        for d in ignore_directories:
            try:
                dirnames.remove(d)
            except ValueError:
                pass
        include_filenames = list()
        for e in include_extensions:
            include_filenames += [f for f in filenames if f.endswith(e)]
        for f in include_filenames:
            extraneous_file = True
            fullname = os.path.join(dirpath, f)
            for r in files:
                if r.regexp is not None:
                    m = r.regexp.match(fullname)
                    if m is not None:
                        extraneous_file = False
                        if r.prototype is None:
                            r.prototype = fullname
            if extraneous_file:
                log.warning("Extraneous file detected: %s", fullname)
    #
    # Scan for missing files, but don't penalize (here) data models that
    # don't have a valid regular expression.  Files with bad regexeps will
    # be flagged elsewhere.
    #
    for r in files:
        if r.regexp is not None and r.prototype is None:
            log.warning("No files found matching %s!", r.filename)
    return
Example 6
def extract_keywords(hdr):
    """Extract interesting keywords from a FITS header.

    Parameters
    ----------
    hdr : :class:`~astropy.io.fits.Header`
        The header to parse.

    Returns
    -------
    :class:`list`
        A list of tuples containing the metadata of interesting keywords.
    """
    keywords = list()
    for key in hdr:
        if extrakey(key):
            # Escape &, <, > in strings, but don't choke on int/float
            value = hdr[key]
            if isinstance(value, bool):
                ktype = 'bool'
                value = ('F', 'T')[int(value)]
            if isinstance(value, (str,)):
                value = escape(value)
                if value == 'T' or value == 'F':
                    ktype = 'bool'
                else:
                    ktype = 'str'
            if isinstance(value, int):
                value = str(value)
                ktype = 'int'
            if isinstance(value, float):
                value = str(value)
                ktype = 'float'
            if key.endswith('_'):
                key = key[0:len(key)-1] + '\\_'
            try:
                if value.endswith('_'):
                    value = value[0:len(value)-1] + '\\_'
            except AttributeError:
                ktype = 'Unknown'
                log.warning("Raised AttributeError on %s = %s.", key, value)
            keywords.append((key, value, ktype, escape(hdr.comments[key])))
    return keywords
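
extrakey() and escape() belong to the surrounding module, so a full call is not reproduced here; this hedged sketch only shows the kind of input header and the (key, value, ktype, comment) tuple shape the function returns. The keywords used are illustrative.

from astropy.io import fits

hdr = fits.Header()
hdr['BUNIT'] = ('electron', 'Physical unit of the image')
hdr['GAINCOR'] = (True, 'Gain correction applied?')
# extract_keywords(hdr) would yield tuples of the form:
#   ('BUNIT', 'electron', 'str', 'Physical unit of the image')
#   ('GAINCOR', 'T', 'bool', 'Gain correction applied?')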
Example 7
def extract_keywords(hdr):
    """Extract interesting keywords from a FITS header.

    Parameters
    ----------
    hdr : :class:`~astropy.io.fits.Header`
        The header to parse.

    Returns
    -------
    :class:`list`
        A list of tuples containing the metadata of interesting keywords.
    """
    keywords = list()
    for key in hdr:
        if extrakey(key):
            # Escape &, <, > in strings, but don't choke on int/float
            value = hdr[key]
            if isinstance(value, bool):
                ktype = 'bool'
                value = ('F', 'T')[int(value)]
            if isinstance(value, (str, )):
                value = escape(value)
                if value == 'T' or value == 'F':
                    ktype = 'bool'
                else:
                    ktype = 'str'
            if isinstance(value, int):
                value = str(value)
                ktype = 'int'
            if isinstance(value, float):
                value = str(value)
                ktype = 'float'
            if key.endswith('_'):
                key = key[0:len(key) - 1] + '\\_'
            try:
                if value.endswith('_'):
                    value = value[0:len(value) - 1] + '\\_'
            except AttributeError:
                ktype = 'Unknown'
                log.warning("Raised AttributeError on %s = %s.", key, value)
            keywords.append((key, value, ktype, escape(hdr.comments[key])))
    return keywords
Example 8
 def hdumeta(self):
     """Metadata associated with each HDU.
     """
     if self._hdumeta is None:
         self._hdumeta = list()
         for k in range(self.nhdr):
             meta = dict()
             meta['title'] = self.hduname.format(k)
             meta['extname'] = self.contents[k + 1][1]
             meta['keywords'] = extract_keywords(self.headers[k])
             if 'XTENSION' in self.headers[k]:
                 meta['extension'] = self.headers[k]['XTENSION'].strip()
                 if meta['extension'] == 'IMAGE':
                     meta['format'] = self.image_format(self.headers[k])
                 elif meta['extension'] == 'BINTABLE':
                     try:
                         meta['format'] = self.columns(k, self.error)
                     except DataModelError:
                         meta['format'] = self.image_format(self.headers[k])
                         try:
                             meta['extension'] = self.headers[k][
                                 'ZTENSION'].strip()
                         except KeyError:
                             try:
                                 i = self.headers[k]['ZIMAGE']
                                 if i:
                                     meta['extension'] = 'IMAGE'
                             except KeyError:
                                 log.warning(
                                     "Possible malformed compressed data in HDU %d of %s.",
                                     k, self.filename)
                 else:
                     w = ("Unknown extension type: " +
                          "{extension}.").format(**meta)
                     meta['format'] = w
                     log.warning(w)
             else:
                 meta['extension'] = 'IMAGE'
                 meta['format'] = self.image_format(self.headers[k])
             self._hdumeta.append(meta)
     return self._hdumeta
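
For reference, a hedged illustration of the shape of one entry in the list this property returns; all values below are hypothetical:

meta = {
    'title': 'HDU1',
    'extname': 'FIBERASSIGN',
    'extension': 'BINTABLE',
    'format': [('TARGETID', 'int64', '', 'Unique target ID')],  # from self.columns()
    'keywords': [('NAXIS2', '1000', 'int', '')],                # from extract_keywords()
}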
Example 9
 def hdumeta(self):
     """Metadata associated with each HDU.
     """
     if self._hdumeta is None:
         self._hdumeta = list()
         for k in range(self.nhdr):
             meta = dict()
             meta['title'] = self.hduname.format(k)
             meta['extname'] = self.contents[k+1][1]
             meta['keywords'] = extract_keywords(self.headers[k])
             if 'XTENSION' in self.headers[k]:
                 meta['extension'] = self.headers[k]['XTENSION'].strip()
                 if meta['extension'] == 'IMAGE':
                     meta['format'] = self.image_format(self.headers[k])
                 elif meta['extension'] == 'BINTABLE':
                     try:
                         meta['format'] = self.columns(k, self.error)
                     except DataModelError:
                         meta['format'] = self.image_format(self.headers[k])
                         try:
                             meta['extension'] = self.headers[k]['ZTENSION'].strip()
                         except KeyError:
                             try:
                                 i = self.headers[k]['ZIMAGE']
                                 if i:
                                     meta['extension'] = 'IMAGE'
                             except KeyError:
                                 log.warning("Possible malformed compressed data in HDU %d of %s.",
                                             k, self.filename)
                 else:
                     w = ("Unknown extension type: " +
                          "{extension}.").format(**meta)
                     meta['format'] = w
                     log.warning(w)
             else:
                 meta['extension'] = 'IMAGE'
                 meta['format'] = self.image_format(self.headers[k])
             self._hdumeta.append(meta)
     return self._hdumeta
Example 10
    def check_unit(self, unit, error=False):
        """Check units for consistency with FITS standard, while allowing
        some special exceptions.

        Parameters
        ----------
        unit : :class:`str`
            The unit to parse.
        error : :class:`bool`, optional
            If ``True``, failure to interpret the unit raises an
            exception.

        Returns
        -------
        :class:`str`
            If a special exception is detected, the name of the unit
            is returned.  Otherwise, ``None``.

        Raises
        ------
        :exc:`ValueError`
            If `error` is set and the unit can't be parsed.
        """
        try:
            au = Unit(unit, format='fits')
        except ValueError as e:
            bad_unit = str(e).split()[0]
            if any(u in bad_unit for u in self._acceptable_units):
                return bad_unit
            else:
                if error:
                    log.critical(str(e))
                    raise
                else:
                    log.warning(str(e))
        return None
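
The parser underneath this check is astropy's Unit with the FITS format; a self-contained sketch of the failure path ('maggie' is just a stand-in for the kind of unit _acceptable_units might allow):

from astropy.units import Unit

try:
    au = Unit('maggie', format='fits')
except ValueError as e:
    # The first token of the message names the offending unit.
    print(str(e).split()[0])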
Example 11
    def get_regexp(self, root, error=False):
        """Obtain the regular expression used to match files on disk.

        Parameters
        ----------
        root : :class:`str`
            Path to real files on disk.
        error : :class:`bool`, optional
            If ``True``, failure to find a regular expression raises an
            exception instead of just a warning.

        Returns
        -------
        regular expression
            The regular expression found, or ``None`` if not found.
            The regular expression is also stored internally.

        Raises
        ------
        :exc:`~desimodel.DataModelError`
            If `error` is set and problems with the data model file are
            detected.
        """
        with open(self.filename) as dm:
            for line in dm.readlines():
                if line.startswith('See :doc:'):
                    self.ref = self._cross_reference(line)
                    log.debug("Cross reference detected %s -> %s.",
                              self.filename, self.ref)
                    break
                if self._regexpline.match(line) is not None:
                    d = os.path.dirname(self.filename).replace(self.section,
                                                               root)
                    for k in self._d2r:
                        d = d.replace(k, self._d2r[k])
                    r = line.strip().split()[1].replace('``', '')
                    self.regexp = re.compile(os.path.join(d, r))
                    break
        if self.regexp is None and self.ref is not None:
            with open(self.ref) as dm:
                for line in dm.readlines():
                    #
                    # Hopefully cross-references are not nested.
                    #
                    # if line.startswith('See :doc:'):
                    #     self.ref = self._cross_reference(line)
                    #     break
                    if self._regexpline.match(line) is not None:
                        d = os.path.dirname(self.filename).replace(self.section,
                                                                   root)
                        for k in self._d2r:
                            d = d.replace(k, self._d2r[k])
                        r = line.strip().split()[1].replace('``', '')
                        self.regexp = re.compile(os.path.join(d, r))
                        break
        if self.regexp is None:
            m = "%s has no file regexp!"
            if error:
                log.critical(m, self.filename)
                raise DataModelError(m % self.filename)
            else:
                log.warning(m, self.filename)
        return self.regexp
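
The token extraction on a matched line is plain whitespace splitting plus removal of reST literal markup; a standalone sketch, where the exact line format is an assumption rather than the documented syntax:

line = 'Regexp: ``spectra-[0-9]+\\.fits``\n'
r = line.strip().split()[1].replace('``', '')
print(r)  # spectra-[0-9]+\.fits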
Example 12
    def validate_prototype(self, error=False):
        """Compares a model's prototype data file to the data models.

        Parameters
        ----------
        error : :class:`bool`, optional
            If ``True``, failure to extract certain required metadata raises an
            exception.

        Notes
        -----
        * Use set theory to compare the data headers to model headers.  This should
          automatically find missing headers, extraneous headers, etc.
        """
        if self.prototype is None:
            #
            # A warning should have been issued already, so just skip silently.
            #
            return
        log.info("Comparing %s to %s.", self.prototype, self.filename)
        if self._stub is None:
            self._stub = Stub(self.prototype, error=error)
        stub_meta = self._stub_meta = self._stub.hdumeta
        modelmeta = self.extract_metadata(error=error)
        #
        # Check number of headers.
        #
        if self._stub.nhdr != len(modelmeta):
            log.warning("Prototype file %s has the wrong number of " +
                        "sections (HDUs) according to %s.",
                        self.prototype, self.filename)
            return
        for i in range(self._stub.nhdr):
            dkw = stub_meta[i]['keywords']
            mkw = modelmeta[i]['keywords']
            #
            # Check number of keywords.
            #
            if len(dkw) != len(mkw):
                log.warning("Prototype file %s has the wrong number of " +
                            "HDU%d keywords according to %s.",
                            self.prototype, i, self.filename)
                continue
            #
            # If number of keywords is correct, check them individually.
            #
            for j in range(len(dkw)):
                if dkw[j][0] != mkw[j][0]:
                    log.warning("Prototype file %s has a keyword " +
                                "mismatch (%s != %s) in HDU%d according to " +
                                "%s.", self.prototype, dkw[j][0], mkw[j][0], i,
                                self.filename)
            #
            # Check the extension type.
            #
            dex = stub_meta[i]['extension']
            try:
                mex = modelmeta[i]['extension']
            except KeyError:
                mex = "Extension type not found"
            if dex != mex:
                log.warning("Prototype file %s has an extension type " +
                            "mismatch in HDU%d (%s != %s) " +
                            "according to %s.",
                            self.prototype, i, dex, mex, self.filename)
                continue
            #
            # Check for EXTNAME
            #
            dexex = stub_meta[i]['extname']
            mexex = modelmeta[i]['extname']
            if dexex == '' and i > 0:
                log.warning("Prototype file %s has no EXTNAME in HDU%d.",
                            self.prototype, i)
            if (dexex != '' and mexex != '' and dexex != mexex):
                log.warning("Prototype file %s has an EXTNAME mismatch " +
                            "in HDU%d (%s != %s) " +
                            "according to %s.",
                            self.prototype, i, dexex, mexex, self.filename)
            #
            # If the extension type is correct, check the contents of the
            # extension.
            #
            dexf = stub_meta[i]['format']
            try:
                mexf = modelmeta[i]['format']
            except KeyError:
                mexf = "Extension format not found"
            if dex == 'IMAGE':
                try:
                    icomma = dexf.index(',')
                except ValueError:
                    icomma = len(dexf)
                if dexf[:icomma] != mexf[:icomma]:
                    log.warning("Prototype file %s has an extension " +
                                "format mismatch in HDU%d " +
                                "according to %s.",
                                self.prototype, i, self.filename)
            else:
                dexf = dexf[1:]  # Get rid of header line.
                if len(dexf) != len(mexf):
                    log.warning("Prototype file %s has the wrong " +
                                "number of HDU%d columns according to %s.",
                                self.prototype, i, self.filename)
                else:
                    for j in range(len(dexf)):
                        if dexf[j][0] != mexf[j][0]:
                            log.warning("Prototype file %s has a " +
                                        "column name mismatch (%s != %s) " +
                                        "in HDU%d according to %s.",
                                        self.prototype, dexf[j][0], mexf[j][0],
                                        i, self.filename)
        return
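
For IMAGE HDUs the comparison above only checks the format string up to the first comma, so the data type must agree while dimensions may differ; a standalone sketch with hypothetical format strings:

dexf = 'Data: FITS image [float64, 4096x4096]'
mexf = 'Data: FITS image [float64, 4096x2048]'
try:
    icomma = dexf.index(',')
except ValueError:
    icomma = len(dexf)
print(dexf[:icomma] == mexf[:icomma])  # True: same type, dimensions ignored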
Example 13
    def extract_metadata(self, error=False):
        """Extract metadata from a data model file.

        Parameters
        ----------
        error : :class:`bool`, optional
            If ``True``, failure to extract certain required metadata raises an
            exception.

        Returns
        -------
        :class:`list`
            Metadata in a form similar to :class:`~desidatamodel.stub.Stub`
            metadata.

        Raises
        ------
        :exc:`~desidatamodel.DataModelError`
            If `error` is set and the HDU has no `EXTNAME` keyword.
        """
        metafile = self.filename
        if self.ref is not None:
            metafile = self.ref
        if self._metafile_data is None:
            with open(metafile) as f:
                self._metafile_data = f.read()
        lines = self._metafile_data.split('\n')
        hdu_sections = [i for i, l in enumerate(lines)
                        if (self._hduline.match(l) is not None or
                            self._hduspan.match(l) is not None)]
        self.hdumeta = list()
        for k in range(len(hdu_sections)):
            try:
                section = lines[hdu_sections[k]:hdu_sections[k+1]]
            except IndexError:
                section = lines[hdu_sections[k]:]
            m = self._hduspan.match(section[0])
            if m is not None:
                #
                # Detected HDU span.
                #
                g = m.groups()
                spanstart = int(g[0])
                spanend = int(g[1])
                log.debug('Detected range specification from HDU %d to HDU %d',
                          spanstart, spanend)
                spanref = [l for l in section if l.startswith('Data:')][0]
                spanext = spanref[spanref.lower().index('see') + 4:].replace('.', '')
                spanmeta = [m for m in self.hdumeta if m['extname'] == spanext][0]
                spanname = [l.split('=')[1].strip() for l in section
                            if l.startswith('EXTNAME = ')][0]
                extnames = [p.strip() for p in spanname.split(',')]
                if len(range(spanstart, spanend+1)) == len(extnames):
                    for i, l in enumerate(range(spanstart, spanend+1)):
                        meta = dict()
                        meta['title'] = 'HDU{0:d}'.format(l)
                        meta['extname'] = extnames[i]
                        meta['extension'] = spanmeta['extension']
                        meta['format'] = spanmeta['format']
                        meta['keywords'] = spanmeta['keywords']
                        self.hdumeta.append(meta)
                else:
                    log.warning(('Range specification from HDU %d to HDU %d ' +
                                 'does not have a matching EXTNAME specification'),
                                spanstart, spanend)
                continue
            meta = dict()
            meta['title'] = section[0]
            if 'Empty HDU.' in section:
                meta['extension'] = 'IMAGE'
                meta['format'] = 'Empty HDU.'
            image_data = [l for l in section if l.startswith('Data:')]
            if image_data:
                meta['extension'] = 'IMAGE'
                meta['format'] = image_data[0]
            try:
                rdtc = section.index('Required Data Table Columns')
            except ValueError:
                rdtc = None
            if rdtc is not None:
                meta['extension'] = 'BINTABLE'
                table = [i for i, l in enumerate(section[rdtc:])
                         if self._tableboundary.match(l) is not None][1:3]
                columns = list(map(len, section[rdtc:][table[0]].split()))
                table_lines = section[rdtc:][table[0]+1:table[1]]
                meta['format'] = [self._extract_columns(t, columns)
                                  for t in table_lines]
                for mk in meta['format']:
                    if not mk[1]:
                        m = "Missing type for column %s in HDU %d of %s!"
                        if error:
                            log.critical(m, mk[0], k, metafile)
                            raise DataModelError(m % (mk[0], k, metafile))
                        else:
                            log.warning(m, mk[0], k, metafile)
                    if mk[2]:
                        bad_unit = self.check_unit(mk[2], error=error)
                        if bad_unit:
                            log.debug("Non-standard (but acceptable) unit %s detected for column %s in HDU %d of %s.",
                                      bad_unit, mk[0], k, metafile)
            try:
                rhk = section.index('Required Header Keywords')
            except ValueError:
                meta['keywords'] = []
            else:
                table = [i for i, l in enumerate(section[rhk:])
                         if self._tableboundary.match(l) is not None][1:3]
                columns = list(map(len, section[rhk:][table[0]].split()))
                table_lines = section[rhk:][table[0]+1:table[1]]
                meta['keywords'] = [self._extract_columns(t, columns)
                                    for t in table_lines]
                for mk in meta['keywords']:
                    if not mk[2]:
                        m = "Missing type for keyword %s in HDU %d of %s!"
                        if error:
                            log.critical(m, mk[0], k, metafile)
                            raise DataModelError(m % (mk[0], k, metafile))
                        else:
                            log.warning(m, mk[0], k, metafile)
                    if mk[0] == 'BUNIT':
                        bad_unit = self.check_unit(mk[1], error=error)
                        if bad_unit:
                            log.debug("Non-standard (but acceptable) unit %s detected for keyword %s in HDU %d of %s.",
                                      bad_unit, mk[0], k, metafile)
            #
            # Need to know the format by this point!
            #
            if 'format' not in meta:
                m = "Unable to determine format for HDU %d in %s!"
                log.critical(m, k, metafile)
                raise DataModelError(m % (k, metafile))
            #
            # See https://github.com/desihub/desidatamodel/issues/69 for
            # the detailed policy on EXTNAME.
            #
            try:
                meta['extname'] = [l.split()[2] for l in section
                                   if l.startswith('EXTNAME = ')][0]
            except IndexError:
                meta['extname'] = ''
                if (k > 0 or (k == 0 and meta['format'] != 'Empty HDU.')):
                    m = "HDU %d in %s has no EXTNAME!"
                    if error:
                        log.critical(m, k, metafile)
                        raise DataModelError(m % (k, metafile))
                    else:
                        log.warning(m, k, metafile)
                else:
                    if k == 0 and meta['format'] == 'Empty HDU.':
                        if len(meta['keywords']) > 0:
                            m = "HDU %d in %s should have EXTNAME = 'PRIMARY'."
                            log.warning(m, k, metafile)
            else:
                if k == 0:
                    if meta['format'] == 'Empty HDU.':
                        if len(meta['keywords']) > 0:
                            m = "HDU %d in %s should have EXTNAME = 'PRIMARY'."
                            log.warning(m, k, metafile)
                    else:
                        if meta['extname'] == 'PRIMARY':
                            m = "HDU %d in %s should have a more meaningful EXTNAME than 'PRIMARY'."
                            log.warning(m, k, metafile)
            self.hdumeta.append(meta)
        return self.hdumeta
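
The 'Data:' cross-reference handling for HDU spans reduces to string slicing; a standalone sketch of that step, where the sentence wording is an assumption:

spanref = 'Data: Identical to HDU1, see FLUX.'
spanext = spanref[spanref.lower().index('see') + 4:].replace('.', '')
print(spanext)  # FLUX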
Example 14
def load_fiberassign(datapath,
                     maxpass=4,
                     hdu='FIBERASSIGN',
                     q3c=False,
                     latest_epoch=False,
                     last_column='NUMOBS_MORE'):
    """Load fiber assignment files into the fiberassign table.

    Tile files can appear in multiple epochs, so for a given tileid, load
    the tile file with the largest value of epoch.  In the "real world",
    a tile file appears in each epoch until it is observed, therefore
    the tile file corresponding to the actual observation is the one
    with the largest epoch.

    Parameters
    ----------
    datapath : :class:`str`
        Full path to the directory containing tile files.
    maxpass : :class:`int`, optional
        Search for pass numbers up to this value (default 4).
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 'FIBERASSIGN').
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    latest_epoch : :class:`bool`, optional
        If set, search for the latest tile file among several epochs.
    last_column : :class:`str`, optional
        Do not load columns past this name (default 'NUMOBS_MORE').
    """
    fiberpath = os.path.join(datapath, 'fiberassign*.fits')
    log.info("Using tile file search path: %s.", fiberpath)
    tile_files = glob.glob(fiberpath)
    if len(tile_files) == 0:
        log.error("No tile files found!")
        return
    log.info("Found %d tile files.", len(tile_files))
    #
    # Find the latest epoch for every tile file.
    #
    latest_tiles = dict()
    if latest_epoch:
        tileidre = re.compile(r'/(\d+)/fiberassign/fiberassign\-(\d+)\.fits$')
        for f in tile_files:
            m = tileidre.search(f)
            if m is None:
                log.error("Could not match %s!", f)
                continue
            epoch, tileid = map(int, m.groups())
            if tileid in latest_tiles:
                if latest_tiles[tileid][0] < epoch:
                    latest_tiles[tileid] = (epoch, f)
            else:
                latest_tiles[tileid] = (epoch, f)
    else:
        for f in tile_files:
            # fiberassign-TILEID.fits
            tileid = int(
                re.match(r'fiberassign\-(\d+)\.fits', os.path.basename(f))[1])
            latest_tiles[tileid] = (0, f)
    log.info("Identified %d tile files for loading.", len(latest_tiles))
    #
    # Read the identified tile files.
    #
    data_index = None
    for tileid in latest_tiles:
        epoch, f = latest_tiles[tileid]
        with fits.open(f) as hdulist:
            data = hdulist[hdu].data
        log.info("Read data from %s HDU %s", f, hdu)
        for col in data.names[:data_index]:
            if data[col].dtype.kind == 'f':
                bad = np.isnan(data[col])
                if np.any(bad):
                    nbad = bad.sum()
                    log.warning(
                        "%d rows of bad data detected in column " +
                        "%s of %s.", nbad, col, f)
                    #
                    # This replacement may be deprecated in the future.
                    #
                    if col in ('TARGET_RA', 'TARGET_DEC', 'FIBERASSIGN_X',
                               'FIBERASSIGN_Y'):
                        data[col][bad] = -9999.0
                assert not np.any(np.isnan(data[col]))
                assert np.all(np.isfinite(data[col]))
        n_rows = len(data)
        if data_index is None:
            data_index = data.names.index(last_column) + 1
        data_list = ([[tileid] * n_rows] +
                     [data[col].tolist() for col in data.names[:data_index]])
        data_names = ['tileid'
                      ] + [col.lower() for col in data.names[:data_index]]
        log.info("Initial column conversion complete on tileid = %d.", tileid)
        data_rows = list(zip(*data_list))
        log.info("Converted columns into rows on tileid = %d.", tileid)
        dbSession.bulk_insert_mappings(
            FiberAssign, [dict(zip(data_names, row)) for row in data_rows])
        log.info("Inserted %d rows in %s for tileid = %d.", n_rows,
                 FiberAssign.__tablename__, tileid)
        dbSession.commit()
    if q3c:
        q3c_index('fiberassign', ra='target_ra')
    return
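
The 'largest epoch wins' selection reduces to a dictionary update; a self-contained sketch with made-up epochs and tile IDs:

latest_tiles = dict()
for epoch, tileid, f in ((0, 100, 'a.fits'), (1, 100, 'b.fits'),
                         (0, 101, 'c.fits')):
    if tileid not in latest_tiles or latest_tiles[tileid][0] < epoch:
        latest_tiles[tileid] = (epoch, f)
print(latest_tiles)  # {100: (1, 'b.fits'), 101: (0, 'c.fits')}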
Example 15
def load_file(filepath, tcls, hdu=1, expand=None, convert=None, index=None,
              rowfilter=None, q3c=False, chunksize=50000, maxrows=0):
    """Load a data file into the database, assuming that column names map
    to database column names with no surprises.

    Parameters
    ----------
    filepath : :class:`str`
        Full path to the data file.
    tcls : :class:`sqlalchemy.ext.declarative.api.DeclarativeMeta`
        The table to load, represented by its class.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 1).
    expand : :class:`dict`, optional
        If set, map FITS column names to one or more alternative column names.
    convert : :class:`dict`, optional
        If set, convert the data for a named (database) column using the
        supplied function.
    index : :class:`str`, optional
        If set, add a column that just counts the number of rows.
    rowfilter : callable, optional
        If set, apply this filter to the rows to be loaded.  The function
        should return :class:`bool`, with ``True`` meaning a good row.
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    chunksize : :class:`int`, optional
        If set, load database `chunksize` rows at a time (default 50000).
    maxrows : :class:`int`, optional
        If set, stop loading after `maxrows` are loaded.  Alternatively,
        set `maxrows` to zero (0) to load all rows.
    """
    tn = tcls.__tablename__
    if filepath.endswith('.fits'):
        with fits.open(filepath) as hdulist:
            data = hdulist[hdu].data
    elif filepath.endswith('.ecsv'):
        data = Table.read(filepath, format='ascii.ecsv')
    else:
        log.error("Unrecognized data file, %s!", filepath)
        return
    if maxrows == 0:
        maxrows = len(data)
    log.info("Read data from %s HDU %s", filepath, hdu)
    try:
        colnames = data.names
    except AttributeError:
        colnames = data.colnames
    for col in colnames:
        if data[col].dtype.kind == 'f':
            bad = np.isnan(data[col][0:maxrows])
            if np.any(bad):
                nbad = bad.sum()
                log.warning("%d rows of bad data detected in column " +
                            "%s of %s.", nbad, col, filepath)
                #
                # Temporary workaround for bad flux values, see
                # https://github.com/desihub/desitarget/issues/397
                #
                if col in ('FLUX_R', 'FIBERFLUX_R', 'FIBERTOTFLUX_R'):
                    data[col][0:maxrows][bad] = -9999.0
    log.info("Integrity check complete on %s.", tn)
    if rowfilter is None:
        good_rows = np.ones((maxrows,), dtype=bool)
    else:
        good_rows = rowfilter(data[0:maxrows])
    data_list = [data[col][0:maxrows][good_rows].tolist() for col in colnames]
    data_names = [col.lower() for col in colnames]
    finalrows = len(data_list[0])
    log.info("Initial column conversion complete on %s.", tn)
    if expand is not None:
        for col in expand:
            i = data_names.index(col.lower())
            if isinstance(expand[col], str):
                #
                # Just rename a column.
                #
                log.debug("Renaming column %s (at index %d) to %s.", data_names[i], i, expand[col])
                data_names[i] = expand[col]
            else:
                #
                # Assume this is an expansion of an array-valued column
                # into individual columns.
                #
                del data_names[i]
                del data_list[i]
                for j, n in enumerate(expand[col]):
                    log.debug("Expanding column %d of %s (at index %d) to %s.", j, col, i, n)
                    data_names.insert(i + j, n)
                    data_list.insert(i + j, data[col][:, j].tolist())
                log.debug(data_names)
    log.info("Column expansion complete on %s.", tn)
    del data
    if convert is not None:
        for col in convert:
            i = data_names.index(col)
            data_list[i] = [convert[col](x) for x in data_list[i]]
    log.info("Column conversion complete on %s.", tn)
    if index is not None:
        data_list.insert(0, list(range(1, finalrows+1)))
        data_names.insert(0, index)
        log.info("Added index column '%s'.", index)
    data_rows = list(zip(*data_list))
    del data_list
    log.info("Converted columns into rows on %s.", tn)
    for k in range(finalrows//chunksize + 1):
        data_chunk = [dict(zip(data_names, row))
                      for row in data_rows[k*chunksize:(k+1)*chunksize]]
        if len(data_chunk) > 0:
            engine.execute(tcls.__table__.insert(), data_chunk)
            log.info("Inserted %d rows in %s.",
                     min((k+1)*chunksize, finalrows), tn)
    # for k in range(finalrows//chunksize + 1):
    #     data_insert = [dict([(col, data_list[i].pop(0))
    #                          for i, col in enumerate(data_names)])
    #                    for j in range(chunksize)]
    #     session.bulk_insert_mappings(tcls, data_insert)
    #     log.info("Inserted %d rows in %s..",
    #              min((k+1)*chunksize, finalrows), tn)
    # session.commit()
    # dbSession.commit()
    if q3c:
        q3c_index(tn)
    return
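
A hypothetical invocation sketch, assuming a mapped Target class and a targets file whose DCHISQ column holds five values per row; every name and path here is illustrative, not the loader's actual call site:

load_file('/path/to/targets.fits', Target, hdu=1, index='id',
          expand={'DCHISQ': ('dchisq_psf', 'dchisq_rex', 'dchisq_dev',
                             'dchisq_exp', 'dchisq_comp')},
          convert={'ra': float},
          q3c=True, chunksize=25000)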
Example 16
    def extract_metadata(self, error=False):
        """Extract metadata from a data model file.

        Parameters
        ----------
        error : :class:`bool`, optional
            If ``True``, failure to extract certain required metadata raises an
            exception.

        Returns
        -------
        :class:`list`
            Metadata in a form similar to :class:`~desidatamodel.stub.Stub`
            metadata.

        Raises
        ------
        :exc:`~desidatamodel.DataModelError`
            If `error` is set and the HDU has no `EXTNAME` keyword.
        """
        metafile = self.filename
        if self.ref is not None:
            metafile = self.ref
        if self._metafile_data is None:
            with open(metafile) as f:
                self._metafile_data = f.read()
        lines = self._metafile_data.split('\n')
        hdu_sections = [
            i for i, l in enumerate(lines)
            if (self._hduline.match(l) is not None
                or self._hduspan.match(l) is not None)
        ]
        self.hdumeta = list()
        for k in range(len(hdu_sections)):
            try:
                section = lines[hdu_sections[k]:hdu_sections[k + 1]]
            except IndexError:
                section = lines[hdu_sections[k]:]
            m = self._hduspan.match(section[0])
            if m is not None:
                #
                # Detected HDU span.
                #
                g = m.groups()
                spanstart = int(g[0])
                spanend = int(g[1])
                log.debug('Detected range specification from HDU %d to HDU %d',
                          spanstart, spanend)
                spanref = [l for l in section if l.startswith('Data:')][0]
                spanext = spanref[spanref.lower().index('see') + 4:].replace(
                    '.', '')
                spanmeta = [
                    m for m in self.hdumeta if m['extname'] == spanext
                ][0]
                spanname = [
                    l.split('=')[1].strip() for l in section
                    if l.startswith('EXTNAME = ')
                ][0]
                extnames = [p.strip() for p in spanname.split(',')]
                if len(range(spanstart, spanend + 1)) == len(extnames):
                    for i, l in enumerate(range(spanstart, spanend + 1)):
                        meta = dict()
                        meta['title'] = 'HDU{0:d}'.format(l)
                        meta['extname'] = extnames[i]
                        meta['extension'] = spanmeta['extension']
                        meta['format'] = spanmeta['format']
                        meta['keywords'] = spanmeta['keywords']
                        self.hdumeta.append(meta)
                else:
                    log.warning(
                        ('Range specification from HDU %d to HDU %d ' +
                         'does not have a matching EXTNAME specification'),
                        spanstart, spanend)
                continue
            meta = dict()
            meta['title'] = section[0]
            if 'Empty HDU.' in section:
                meta['extension'] = 'IMAGE'
                meta['format'] = 'Empty HDU.'
            image_data = [l for l in section if l.startswith('Data:')]
            if image_data:
                meta['extension'] = 'IMAGE'
                meta['format'] = image_data[0]
            try:
                rdtc = section.index('Required Data Table Columns')
            except ValueError:
                rdtc = None
            if rdtc is not None:
                meta['extension'] = 'BINTABLE'
                table = [
                    i for i, l in enumerate(section[rdtc:])
                    if self._tableboundary.match(l) is not None
                ][1:3]
                columns = list(map(len, section[rdtc:][table[0]].split()))
                table_lines = section[rdtc:][table[0] + 1:table[1]]
                meta['format'] = [
                    self._extract_columns(t, columns) for t in table_lines
                ]
                for mk in meta['format']:
                    if not mk[1]:
                        m = "Missing type for column %s in HDU %d of %s!"
                        if error:
                            log.critical(m, mk[0], k, metafile)
                            raise DataModelError(m % (mk[0], k, metafile))
                        else:
                            log.warning(m, mk[0], k, metafile)
                    if mk[2]:
                        bad_unit = self.check_unit(mk[2], error=error)
                        if bad_unit:
                            log.debug(
                                "Non-standard (but acceptable) unit %s detected for column %s in HDU %d of %s.",
                                bad_unit, mk[0], k, metafile)
            try:
                rhk = section.index('Required Header Keywords')
            except ValueError:
                meta['keywords'] = []
            else:
                table = [
                    i for i, l in enumerate(section[rhk:])
                    if self._tableboundary.match(l) is not None
                ][1:3]
                columns = list(map(len, section[rhk:][table[0]].split()))
                table_lines = section[rhk:][table[0] + 1:table[1]]
                meta['keywords'] = [
                    self._extract_columns(t, columns) for t in table_lines
                ]
                for mk in meta['keywords']:
                    if not mk[2]:
                        m = "Missing type for keyword %s in HDU %d of %s!"
                        if error:
                            log.critical(m, mk[0], k, metafile)
                            raise DataModelError(m % (mk[0], k, metafile))
                        else:
                            log.warning(m, mk[0], k, metafile)
                    if mk[0] == 'BUNIT':
                        bad_unit = self.check_unit(mk[1], error=error)
                        if bad_unit:
                            log.debug(
                                "Non-standard (but acceptable) unit %s detected for keyword %s in HDU %d of %s.",
                                bad_unit, mk[0], k, metafile)
            #
            # Need to know the format by this point!
            #
            if 'format' not in meta:
                m = "Unable to determine format for HDU %d in %s!"
                log.critical(m, k, metafile)
                raise DataModelError(m % (k, metafile))
            #
            # See https://github.com/desihub/desidatamodel/issues/69 for
            # the detailed policy on EXTNAME.
            #
            try:
                meta['extname'] = [
                    l.split()[2] for l in section if l.startswith('EXTNAME = ')
                ][0]
            except IndexError:
                meta['extname'] = ''
                if (k > 0 or (k == 0 and meta['format'] != 'Empty HDU.')):
                    m = "HDU %d in %s has no EXTNAME!"
                    if error:
                        log.critical(m, k, metafile)
                        raise DataModelError(m % (k, metafile))
                    else:
                        log.warning(m, k, metafile)
                else:
                    if k == 0 and meta['format'] == 'Empty HDU.':
                        if len(meta['keywords']) > 0:
                            m = "HDU %d in %s should have EXTNAME = 'PRIMARY'."
                            log.warning(m, k, metafile)
            else:
                #
                # If we reach here, meta['extname'] *is* defined.
                #
                if k == 0:
                    if meta['format'] == 'Empty HDU.':
                        if len(meta['keywords']
                               ) > 0 and meta['extname'] != 'PRIMARY':
                            m = "HDU %d in %s has acceptable alternative EXTNAME = '%s'."
                            log.debug(m, k, metafile, meta['extname'])
                    else:
                        if meta['extname'] == 'PRIMARY':
                            m = "HDU %d in %s should have a more meaningful EXTNAME than 'PRIMARY'."
                            log.warning(m, k, metafile)
            self.hdumeta.append(meta)
        return self.hdumeta
Example 17
    def validate_prototype(self, error=False):
        """Compares a model's prototype data file to the data models.

        Parameters
        ----------
        error : :class:`bool`, optional
            If ``True``, failure to extract certain required metadata raises an
            exception.

        Notes
        -----
        * Use set theory to compare the data headers to model headers.  This should
          automatically find missing headers, extraneous headers, etc.
        """
        if self.prototype is None:
            #
            # A warning should have been issued already, so just skip silently.
            #
            return
        log.info("Comparing %s to %s.", self.prototype, self.filename)
        if self._stub is None:
            self._stub = Stub(self.prototype, error=error)
        stub_meta = self._stub_meta = self._stub.hdumeta
        modelmeta = self.extract_metadata(error=error)
        #
        # Check number of headers.
        #
        if self._stub.nhdr != len(modelmeta):
            log.warning(
                "Prototype file %s has the wrong number of " +
                "sections (HDUs) according to %s.", self.prototype,
                self.filename)
            return
        for i in range(self._stub.nhdr):
            dkw = stub_meta[i]['keywords']
            mkw = modelmeta[i]['keywords']
            #
            # Check number of keywords.
            #
            if len(dkw) != len(mkw):
                log.warning(
                    "Prototype file %s has the wrong number of " +
                    "HDU%d keywords according to %s.", self.prototype, i,
                    self.filename)
                continue
            #
            # If number of keywords is correct, check them individually.
            #
            for j in range(len(dkw)):
                if dkw[j][0] != mkw[j][0]:
                    log.warning(
                        "Prototype file %s has a keyword " +
                        "mismatch (%s != %s) in HDU%d according to " + "%s.",
                        self.prototype, dkw[j][0], mkw[j][0], i, self.filename)
            #
            # Check the extension type.
            #
            dex = stub_meta[i]['extension']
            try:
                mex = modelmeta[i]['extension']
            except KeyError:
                mex = "Extension type not found"
            if dex != mex:
                log.warning(
                    "Prototype file %s has an extension type " +
                    "mismatch in HDU%d (%s != %s) " + "according to %s.",
                    self.prototype, i, dex, mex, self.filename)
                continue
            #
            # Check for EXTNAME
            #
            dexex = stub_meta[i]['extname']
            mexex = modelmeta[i]['extname']
            if dexex == '' and i > 0:
                log.warning("Prototype file %s has no EXTNAME in HDU%d.",
                            self.prototype, i)
            if (dexex != '' and mexex != '' and dexex != mexex):
                log.warning(
                    "Prototype file %s has an EXTNAME mismatch " +
                    "in HDU%d (%s != %s) " + "according to %s.",
                    self.prototype, i, dexex, mexex, self.filename)
            #
            # If the extension type is correct, check the contents of the
            # extension.
            #
            dexf = stub_meta[i]['format']
            try:
                mexf = modelmeta[i]['format']
            except KeyError:
                mexf = "Extension format not found"
            if dex == 'IMAGE':
                try:
                    icomma = dexf.index(',')
                except ValueError:
                    icomma = len(dexf)
                if dexf[:icomma] != mexf[:icomma]:
                    log.warning(
                        "Prototype file %s has an extension " +
                        "format mismatch in HDU%d " + "according to %s.",
                        self.prototype, i, self.filename)
            else:
                dexf = dexf[1:]  # Get rid of header line.
                if len(dexf) != len(mexf):
                    log.warning(
                        "Prototype file %s has the wrong " +
                        "number of HDU%d columns according to %s.",
                        self.prototype, i, self.filename)
                else:
                    for j in range(len(dexf)):
                        if dexf[j][0] != mexf[j][0]:
                            log.warning(
                                "Prototype file %s has a " +
                                "column name mismatch (%s != %s) " +
                                "in HDU%d according to %s.", self.prototype,
                                dexf[j][0], mexf[j][0], i, self.filename)
        return
Example 18
def load_fiberassign(datapath, maxpass=4, hdu='FIBERASSIGN', q3c=False,
                     latest_epoch=False, last_column='SUBPRIORITY'):
    """Load fiber assignment files into the fiberassign table.

    Tile files can appear in multiple epochs, so for a given tileid, load
    the tile file with the largest value of epoch.  In the "real world",
    a tile file appears in each epoch until it is observed, therefore
    the tile file corresponding to the actual observation is the one
    with the largest epoch.

    Parameters
    ----------
    datapath : :class:`str`
        Full path to the directory containing tile files.
    maxpass : :class:`int`, optional
        Search for pass numbers up to this value (default 4).
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 'FIBERASSIGN').
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    latest_epoch : :class:`bool`, optional
        If set, search for the latest tile file among several epochs.
    last_column : :class:`str`, optional
        Do not load columns past this name (default 'SUBPRIORITY').
    """
    fiberpath = os.path.join(datapath, 'tile*.fits')
    log.info("Using tile file search path: %s.", fiberpath)
    tile_files = glob.glob(fiberpath)
    if len(tile_files) == 0:
        log.error("No tile files found!")
        return
    log.info("Found %d tile files.", len(tile_files))
    #
    # Find the latest epoch for every tile file.
    #
    latest_tiles = dict()
    if latest_epoch:
        tileidre = re.compile(r'/(\d+)/fiberassign/tile-(\d+)\.fits$')
        for f in tile_files:
            m = tileidre.search(f)
            if m is None:
                log.error("Could not match %s!", f)
                continue
            epoch, tileid = map(int, m.groups())
            if tileid in latest_tiles:
                if latest_tiles[tileid][0] < epoch:
                    latest_tiles[tileid] = (epoch, f)
            else:
                latest_tiles[tileid] = (epoch, f)
    else:
        for f in tile_files:
            # tile_TILEID.fits or tile-TILEID.fits
            tileid = int(re.match(r'tile[-_](\d+)\.fits',
                         os.path.basename(f))[1])
            latest_tiles[tileid] = (0, f)
    log.info("Identified %d tile files for loading.", len(latest_tiles))
    #
    # Read the identified tile files.
    #
    data_index = None
    for tileid in latest_tiles:
        epoch, f = latest_tiles[tileid]
        with fits.open(f) as hdulist:
            data = hdulist[hdu].data
        log.info("Read data from %s HDU %s", f, hdu)
        for col in data.names[:data_index]:
            if data[col].dtype.kind == 'f':
                bad = np.isnan(data[col])
                if np.any(bad):
                    nbad = bad.sum()
                    log.warning("%d rows of bad data detected in column "
                                "%s of %s.", nbad, col, f)
                    #
                    # This replacement may be deprecated in the future.
                    #
                    if col in ('TARGET_RA', 'TARGET_DEC', 'DESIGN_X', 'DESIGN_Y'):
                        data[col][bad] = -9999.0
                assert not np.any(np.isnan(data[col]))
                assert np.all(np.isfinite(data[col]))
        n_rows = len(data)
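        #
        # Compute the column cutoff once, from the first file read: load
        # everything up to and including last_column.
        #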
        if data_index is None:
            data_index = data.names.index(last_column) + 1
        data_list = ([[tileid]*n_rows] +
                     [data[col].tolist() for col in data.names[:data_index]])
        data_names = ['tileid'] + [col.lower() for col in data.names[:data_index]]
        log.info("Initial column conversion complete on tileid = %d.", tileid)
        data_rows = list(zip(*data_list))
        log.info("Converted columns into rows on tileid = %d.", tileid)
        dbSession.bulk_insert_mappings(FiberAssign, [dict(zip(data_names, row))
                                                     for row in data_rows])
        log.info("Inserted %d rows in %s for tileid = %d.",
                 n_rows, FiberAssign.__tablename__, tileid)
        dbSession.commit()
    if q3c:
        q3c_index('fiberassign', ra='target_ra')
    return
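
The latest-epoch bookkeeping above is easy to exercise in isolation. Here is
a minimal sketch, using hypothetical paths that follow the same
EPOCH/fiberassign/tile-TILEID.fits layout assumed by the regular expression:

import re

tile_files = ['/1/fiberassign/tile-001122.fits',
              '/2/fiberassign/tile-001122.fits',
              '/1/fiberassign/tile-003344.fits']
tileidre = re.compile(r'/(\d+)/fiberassign/tile-(\d+)\.fits$')
latest_tiles = dict()
for f in tile_files:
    m = tileidre.search(f)
    if m is None:
        continue
    epoch, tileid = map(int, m.groups())
    # Keep only the file with the largest epoch for each tileid.
    if tileid not in latest_tiles or latest_tiles[tileid][0] < epoch:
        latest_tiles[tileid] = (epoch, f)
# latest_tiles == {1122: (2, '/2/fiberassign/tile-001122.fits'),
#                  3344: (1, '/1/fiberassign/tile-003344.fits')}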
Example no. 19
    def get_regexp(self, root, error=False):
        """Obtain the regular expression used to match files on disk.

        Parameters
        ----------
        root : :class:`str`
            Path to real files on disk.
        error : :class:`bool`, optional
            If ``True``, failure to find a regular expression raises an
            exception instead of just a warning.

        Returns
        -------
        regular expression
            The regular expression found, or ``None`` if not found.
            The regular expression is also stored internally.

        Raises
        ------
        :exc:`~desimodel.DataModelError`
            If `error` is set and problems with the data model file are
            detected.
        """
        with open(self.filename) as dm:
            for line in dm.readlines():
                if line.startswith('See :doc:'):
                    self.ref = self._cross_reference(line)
                    log.debug("Cross reference detected %s -> %s.",
                              self.filename, self.ref)
                    break
                if self._regexpline.match(line) is not None:
                    d = os.path.dirname(self.filename).replace(
                        self.section, root)
                    for k in self._d2r:
                        d = d.replace(k, self._d2r[k])
                    r = line.strip().split()[1].replace('``', '')
                    self.regexp = re.compile(os.path.join(d, r))
                    break
        if self.regexp is None and self.ref is not None:
            with open(self.ref) as dm:
                for line in dm.readlines():
                    #
                    # Hopefully cross-references are not nested.
                    #
                    # if line.startswith('See :doc:'):
                    #     self.ref = self._cross_reference(line)
                    #     break
                    if self._regexpline.match(line) is not None:
                        d = os.path.dirname(self.filename).replace(
                            self.section, root)
                        for k in self._d2r:
                            d = d.replace(k, self._d2r[k])
                        r = line.strip().split()[1].replace('``', '')
                        self.regexp = re.compile(os.path.join(d, r))
                        break
        if self.regexp is None:
            m = "%s has no file regexp!"
            if error:
                log.critical(m, self.filename)
                raise DataModelError(m % self.filename)
            else:
                log.warning(m, self.filename)
        return self.regexp
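
The regexp extraction above hinges on a single line of the data model file.
A minimal sketch of that line handling, assuming the line takes the form
shown below (the real marker is whatever self._regexpline matches, which is
defined elsewhere in the class):

import os
import re

# Hypothetical data model line: the regexp is the second whitespace-delimited
# token, wrapped in double backticks.
line = r'Regexp: ``fibermap-[0-9]{8}\.fits``'
d = '/desi/spectro/data'  # directory after the section -> root replacement
r = line.strip().split()[1].replace('``', '')
regexp = re.compile(os.path.join(d, r))
assert regexp.match('/desi/spectro/data/fibermap-20200101.fits') is not None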
Example no. 20
def load_file(filepath,
              tcls,
              hdu=1,
              expand=None,
              convert=None,
              index=None,
              rowfilter=None,
              q3c=False,
              chunksize=50000,
              maxrows=0):
    """Load a data file into the database, assuming that column names map
    to database column names with no surprises.

    Parameters
    ----------
    filepath : :class:`str`
        Full path to the data file.
    tcls : :class:`sqlalchemy.ext.declarative.api.DeclarativeMeta`
        The table to load, represented by its class.
    hdu : :class:`int` or :class:`str`, optional
        Read a data table from this HDU (default 1).
    expand : :class:`dict`, optional
        If set, map FITS column names to one or more alternative column names.
    convert : :class:`dict`, optional
        If set, convert the data for a named (database) column using the
        supplied function.
    index : :class:`str`, optional
        If set, add a column with this name containing a 1-based row number.
    rowfilter : callable, optional
        If set, apply this filter to the rows to be loaded.  The function
        should return :class:`bool`, with ``True`` meaning a good row.
    q3c : :class:`bool`, optional
        If set, create q3c index on the table.
    chunksize : :class:`int`, optional
        If set, load database `chunksize` rows at a time (default 50000).
    maxrows : :class:`int`, optional
        If set, stop loading after `maxrows` are loaded.  Alternatively,
        set `maxrows` to zero (0) to load all rows.
    """
    tn = tcls.__tablename__
    if filepath.endswith('.fits'):
        with fits.open(filepath) as hdulist:
            data = hdulist[hdu].data
    elif filepath.endswith('.ecsv'):
        data = Table.read(filepath, format='ascii.ecsv')
    else:
        log.error("Unrecognized data file, %s!", filepath)
        return
    if maxrows == 0:
        maxrows = len(data)
    log.info("Read data from %s HDU %s", filepath, hdu)
    try:
        colnames = data.names
    except AttributeError:
        colnames = data.colnames
    for col in colnames:
        if data[col].dtype.kind == 'f':
            bad = np.isnan(data[col][0:maxrows])
            if np.any(bad):
                nbad = bad.sum()
                log.warning("%d rows of bad data detected in column %s of %s.",
                            nbad, col, filepath)
                #
                # Temporary workaround for bad flux values, see
                # https://github.com/desihub/desitarget/issues/397
                #
                if col in ('FLUX_R', 'FIBERFLUX_R', 'FIBERTOTFLUX_R'):
                    data[col][0:maxrows][bad] = -9999.0
    log.info("Integrity check complete on %s.", tn)
    if rowfilter is None:
        good_rows = np.ones((maxrows, ), dtype=bool)
    else:
        good_rows = rowfilter(data[0:maxrows])
    data_list = [data[col][0:maxrows][good_rows].tolist() for col in colnames]
    data_names = [col.lower() for col in colnames]
    finalrows = len(data_list[0])
    log.info("Initial column conversion complete on %s.", tn)
    if expand is not None:
        for col in expand:
            i = data_names.index(col.lower())
            if isinstance(expand[col], str):
                #
                # Just rename a column.
                #
                log.debug("Renaming column %s (at index %d) to %s.",
                          data_names[i], i, expand[col])
                data_names[i] = expand[col]
            else:
                #
                # Assume this is an expansion of an array-valued column
                # into individual columns.
                #
                del data_names[i]
                del data_list[i]
                for j, n in enumerate(expand[col]):
                    log.debug("Expanding column %d of %s (at index %d) to %s.",
                              j, col, i, n)
                    data_names.insert(i + j, n)
                    data_list.insert(i + j, data[col][:, j].tolist())
                log.debug(data_names)
    log.info("Column expansion complete on %s.", tn)
    del data
    if convert is not None:
        for col in convert:
            i = data_names.index(col)
            data_list[i] = [convert[col](x) for x in data_list[i]]
    log.info("Column conversion complete on %s.", tn)
    if index is not None:
        data_list.insert(0, list(range(1, finalrows + 1)))
        data_names.insert(0, index)
        log.info("Added index column '%s'.", index)
    data_rows = list(zip(*data_list))
    del data_list
    log.info("Converted columns into rows on %s.", tn)
    for k in range(finalrows // chunksize + 1):
        data_chunk = [
            dict(zip(data_names, row))
            for row in data_rows[k * chunksize:(k + 1) * chunksize]
        ]
        if len(data_chunk) > 0:
            engine.execute(tcls.__table__.insert(), data_chunk)
            log.info("Inserted %d rows in %s.",
                     min((k + 1) * chunksize, finalrows), tn)
    # for k in range(finalrows//chunksize + 1):
    #     data_insert = [dict([(col, data_list[i].pop(0))
    #                          for i, col in enumerate(data_names)])
    #                    for j in range(chunksize)]
    #     session.bulk_insert_mappings(tcls, data_insert)
    #     log.info("Inserted %d rows in %s..",
    #              min((k+1)*chunksize, finalrows), tn)
    # session.commit()
    # dbSession.commit()
    if q3c:
        q3c_index(tn)
    return
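
The expand branch is the least obvious step in load_file. Below is a
minimal, self-contained sketch of the array-valued case, mirroring the same
insert/delete logic; the column and target names are illustrative only:

import numpy as np

data = {'DCHISQ': np.array([[1.0, 2.0], [3.0, 4.0]])}
data_names = ['dchisq']
data_list = [data['DCHISQ'].tolist()]
expand = {'DCHISQ': ('dchisq_psf', 'dchisq_rex')}

for col in expand:
    i = data_names.index(col.lower())
    # Remove the array-valued column, then insert one scalar column
    # per element, preserving the original position.
    del data_names[i]
    del data_list[i]
    for j, n in enumerate(expand[col]):
        data_names.insert(i + j, n)
        data_list.insert(i + j, data[col][:, j].tolist())

# data_names == ['dchisq_psf', 'dchisq_rex']
# data_list == [[1.0, 3.0], [2.0, 4.0]]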