Example #1
0
    def export_identifier_metadata(self, dump_path, dataset_blobs):
        """Collect identifier (doi/protocol) metadata for *dataset_blobs*,
        dump it as json to ``dump_path / 'identifier-metadata.json'`` and
        return the resulting blob.

        When ``self.latest`` is set and a previously exported blob exists,
        that cached blob is reused instead of re-fetching remote metadata.
        """

        if (self.latest and self.latest_id_met_path.exists()):
            blob_id_met = self.latest_id_met

        else:

            def fetch(id):  # FIXME error proof version ...
                # best effort: log and return None on network failure so a
                # single bad identifier does not abort the whole export
                try:
                    metadata = id.metadata()
                    metadata['id'] = id.identifier  # FIXME normalization ...
                    return metadata
                except requests.exceptions.HTTPError as e:
                    logd.error(e)
                except (requests.exceptions.ConnectionError,
                        requests.exceptions.SSLError) as e:
                    log.error(e)

            # retrieve doi metadata and materialize it in the dataset
            _dois = {
                idlib.Auto(id) if not isinstance(id, idlib.Stream) else id
                for blob in dataset_blobs for id in chain(
                    adops.get(blob, ['meta', 'protocol_url_or_doi'],
                              on_failure=[]),
                    adops.get(blob, ['meta', 'originating_article_doi'],
                              on_failure=[]),
                    # TODO data["links"]?
                    # guard: not every dataset has a doi; indexing directly
                    # raised KeyError for those blobs
                    [blob['meta']['doi']] if 'doi' in blob['meta'] else [])
                if id is not None
            }

            dois = [d for d in _dois if isinstance(d, idlib.Doi)]
            metadatas = Async(rate=10)(deferred(fetch)(d) for d in dois)
            # fetch returns None on failure; report those separately
            bads = [
                {
                    'id': d,
                    'reason': 'no metadata'
                }  # TODO more granular reporting e.g. 404
                for d, m in zip(dois, metadatas) if m is None
            ]
            metadatas = [m for m in metadatas if m is not None]
            blob_id_met = {
                'id': 'identifier-metadata',  # TODO is this ok ?
                'identifier_metadata': metadatas,
                'errors': bads,
                'meta': {
                    'count': len(metadatas)
                },
                'prov': {
                    'timestamp_export_start': self.timestamp,
                    'export_system_identifier': Path.sysid,
                    'export_hostname': gethostname(),
                    'export_project_path':
                    self.export_source_path.cache.anchor,
                },
            }

        with open(dump_path / 'identifier-metadata.json', 'wt') as f:
            json.dump(blob_id_met, f, sort_keys=True, indent=2, cls=JEncode)

        return blob_id_met
Example #2
0
    def _completeness(self, data):
        """Summarize curation completeness for one dataset blob.

        Returns a tuple of (submission_index, curation_index, error_index,
        folder_name, id, award_number, organ) extracted from *data*, with
        organ values normalized to ``OntTerm`` where possible.
        """
        accessor = JT(data)  # can go direct if elements are always present
        try:
            organ = adops.get(data, ['meta', 'organ'])
        except Exception:  # was a bare except; don't mask KeyboardInterrupt/SystemExit
            organ = None

        if isinstance(organ, (list, tuple)):
            if len(organ) == 1:
                organ, = organ
                organ = OntTerm(organ)
            else:
                organ = [OntTerm(o) for o in organ]

        elif organ == 'othertargets':
            pass  # sentinel value: leave untouched
        elif organ:
            organ = OntTerm(organ)

        return (
            accessor.status.submission_index,
            accessor.status.curation_index,
            accessor.status.error_index,
            accessor.query('meta', 'folder_name'),
            accessor.id,  #if 'id' in dowe else None,
            accessor.query('meta', 'award_number'),
            organ,
        )
Example #3
0
 def protocol_uris(self,
                   outer_self=self
                   ):  # FIXME this needs to be pipelined
     # Yield each entry of ['meta']['protocol_url_or_doi'] from the outer
     # object's data; yields nothing when that path is absent.
     # NOTE(review): ``outer_self=self`` binds the enclosing instance at
     # definition time -- presumably this def sits inside a method of
     # another class; confirm against the defining scope.
     try:
         yield from adops.get(outer_self.data(),
                              ['meta', 'protocol_url_or_doi'])
     except exc.NoSourcePathError:
         pass
Example #4
0
    def update_from_ir(self, ir):
        """Update dataset rows from the internal representation *ir*.

        Temporarily forces ``OntTerm.query._instrumented`` to ``OntTerm``
        so lookups resolve consistently, restoring the previous value on
        exit even if the update fails.
        """
        oi = OntTerm.query._instrumented
        if oi is not OntTerm:
            OntTerm.query._instrumented = OntTerm

        try:
            dataset_blobs = ir['datasets']
            self._wat = self.values[8]  # NOTE(review): purpose unclear; confirm
            for blob in dataset_blobs:
                # removed unused local ``meta = blob['meta']``
                self._update_dataset_metadata(
                    id=blob['id'],
                    name=adops.get(blob, ['meta', 'folder_name'],
                                   on_failure=''),
                    award=adops.get(blob, ['meta', 'award_number'],
                                    on_failure=''),
                )
        finally:
            # FIXME this is so dumb :/
            OntTerm.query._instrumented = oi
Example #5
0
 def allOf(obj):
     """Yield the resolved type for every entry of ``obj['allOf']``.

     Known ``$ref`` targets come straight from ``types``; unknown ones are
     looked up in ``schema`` and constructed via ``top``.  Entries without
     a ``$ref`` are only logged.
     """
     for entry in obj['allOf']:
         if '$ref' not in entry:
             log.debug(f'{obj}')
             continue
         ref = entry['$ref']
         if ref in types:
             yield types[ref]
         else:
             jpath = ref_to_list(ref)
             resolved = adops.get(schema, jpath)
             yield top(jpath[-1], resolved)
Example #6
0
    def update_from_ir(self, ir):
        """Update dataset rows (including species) from the internal
        representation *ir*, then commit the changes.

        Temporarily forces ``OntTerm.query._instrumented`` to ``OntTerm``
        so lookups resolve consistently, restoring the previous value on
        exit even if the update fails.
        """
        oi = OntTerm.query._instrumented
        if oi is not OntTerm:
            OntTerm.query._instrumented = OntTerm

        def cformat(cell):
            # OntTerm values render via asCell; everything else passes through
            if isinstance(cell, OntTerm):
                cell = cell.asCell()

            return cell

        try:
            dataset_blobs = ir['datasets']
            self._wat = self.values[8]  # NOTE(review): purpose unclear; confirm
            for blob in dataset_blobs:
                # removed unused local ``meta = blob['meta']``
                #species = adops.get(blob, ['subjects', int, 'species'], on_failure='')  # TODO not implemented
                if 'subjects' in blob:
                    species = '\n'.join(
                        sorted({
                            cformat(s['species'])
                            for s in blob['subjects'] if 'species' in s
                        }))
                else:
                    species = ''

                self._update_dataset_metadata(
                    id=blob['id'],
                    name=adops.get(blob, ['meta', 'folder_name'],
                                   on_failure=''),
                    award=adops.get(blob, ['meta', 'award_number'],
                                    on_failure=''),
                    species=species,
                )
        finally:
            # FIXME this is so dumb :/
            OntTerm.query._instrumented = oi
        log.debug(self.uncommitted())
        self.commit()
Example #7
0
            def f(self, path=path, terminal=term, term_list=term_list):
                """Fetch ``path`` from ``self._blob`` and post-process it.

                A missing path falls back to ``[]`` or ``terminal({})``;
                otherwise the value is passed through ``terminal`` and/or
                mapped element-wise when ``term_list`` is set.
                """
                try:
                    value = adops.get(
                        self._blob, path
                    )  # FIXME / and * would need to be implemented on top of this
                except exc.NoSourcePathError:
                    if not term_list and terminal is not None:
                        return terminal({})
                    # FIXME was return None but that breaks missing keys
                    # that should contain lists
                    return []

                # FIXME type check
                if term_list:
                    if terminal:
                        return [terminal(o) for o in value]
                    raise NotImplementedError('hrm')
                if terminal:
                    return terminal(value)
                return value
Example #8
0
 def keywords(self):
     """Yield each entry of ``['meta']['keywords']`` from ``self.data()``.

     Yields nothing when that path is absent from the data.
     """
     try:
         yield from adops.get(self.data(), ['meta', 'keywords'])
     except exc.NoSourcePathError:
         pass
Example #9
0
 def timestamp_export_start(self):
     """Return ``['prov']['timestamp_export_start']`` from ``self.data``.

     NOTE(review): ``self.data`` is accessed without calling it, unlike
     sibling accessors that use ``self.data()`` -- presumably a property
     here; confirm against the defining class.
     """
     return adops.get(self.data, ['prov', 'timestamp_export_start'])
Example #10
0
 def deref(ref):
     """Return the node of ``schema`` that the json reference *ref* names."""
     jpath = ref_to_list(ref)
     return adops.get(schema, jpath)
Example #11
0
    def export_identifier_metadata(self, dump_path, latest_path,
                                   dataset_blobs):
        """Collect identifier (doi/protocol) metadata for *dataset_blobs*,
        dump it as json to ``dump_path / self.id_metadata`` and return the
        resulting blob.

        When ``self.latest`` is set and a previous export exists under
        *latest_path*, that cached blob is loaded and reused instead of
        re-fetching remote metadata.
        """

        latest_id_met_path = latest_path / self.id_metadata
        if (self.latest and latest_id_met_path.exists()):
            with open(latest_id_met_path, 'rt') as f:
                blob_id_met = json.load(f)

        else:
            import requests

            def fetch(id):  # FIXME error proof version ...
                # best effort: log and return None on remote failure so a
                # single bad identifier does not abort the whole export
                try:
                    metadata = id.metadata()
                    metadata['id'] = id
                    return metadata
                except (requests.exceptions.HTTPError,
                        idlib.exc.RemoteError) as e:
                    logd.error(e)
                except (requests.exceptions.ConnectionError,
                        requests.exceptions.SSLError,
                        idlib.exc.ResolutionError) as e:
                    log.error(e)

            def autoid_report_error(id, blob):
                # malformed ids are logged and skipped rather than fatal
                try:
                    return idlib.Auto(id)
                except idlib.exc.MalformedIdentifierError:
                    msg = f'{blob["id"]} bad id: {id}'
                    logd.error(msg)
                    return None

            def coerce(id, blob):
                # normalize to idlib streams: pass streams through, revive
                # serialized (dict) ids, parse raw strings (None on failure)
                if isinstance(id, idlib.Stream):
                    return id
                if isinstance(id, dict):
                    return fromJson(id)
                return autoid_report_error(id, blob)

            # retrieve doi metadata and materialize it in the dataset
            _dois = {
                coerce(id, blob)
                for blob in dataset_blobs for id in chain(
                    adops.get(blob, ['meta', 'protocol_url_or_doi'],
                              on_failure=[]),
                    adops.get(blob, ['meta', 'originating_article_doi'],
                              on_failure=[]),
                    # TODO data["links"]?
                    [blob['meta']['doi']] if 'doi' in blob['meta'] else [])
                if id is not None
            }

            dois = [d for d in _dois if isinstance(d, idlib.Doi)]
            metadatas = Async(rate=10)(deferred(fetch)(d) for d in dois)
            # fetch returns None on failure; report those separately
            bads = [
                {
                    'id': d,
                    'reason': 'no metadata'
                }  # TODO more granular reporting e.g. 404
                for d, m in zip(dois, metadatas) if m is None
            ]
            metadatas = [m for m in metadatas if m is not None]
            blob_id_met = {
                'id': 'identifier-metadata',  # TODO is this ok ?
                'identifier_metadata': metadatas,
                'errors': bads,
                'meta': {
                    'count': len(metadatas)
                },
                'prov': {
                    'timestamp_export_start': self.timestamp,
                    'export_system_identifier': Path.sysid,
                    'export_hostname': gethostname(),
                    'export_project_path':
                    self.export_source_path.cache.anchor,
                },
            }

        with open(dump_path / self.id_metadata, 'wt') as f:
            json.dump(blob_id_met, f, sort_keys=True, indent=2, cls=JEncode)

        return blob_id_met