コード例 #1
0
    def to_normalized(self, doc):
        """Convert a stored document row into a ``NormalizedDocument``.

        Every truthy field of ``doc`` whose key is not in the exclusion set
        is copied over; values that parse as JSON are decoded, anything else
        is kept unchanged.

        :param doc: anything accepted by ``dict()`` (a mapping or an iterable
            of key/value pairs).
        :returns: a ``NormalizedDocument`` built with ``validate=False`` and
            ``clean=False``, or ``None`` when the row carries no truthy
            ``providerUpdatedDateTime``.
        """
        # 'versions' is excluded here, so the resulting dict never carries a
        # 'versions' entry and no per-version conversion is required.
        excluded = frozenset((
            'docID', 'doc', 'filetype', 'timestamps', 'source', 'versions',
            'key',
        ))

        # make the new dict actually contain real items
        normed = {}
        for key, value in dict(doc).items():
            if not value or key in excluded:
                continue
            try:
                normed[key] = json.loads(value)
            except (ValueError, TypeError):
                # Not valid JSON (or not a string at all): keep the raw value.
                normed[key] = value

        # No datetime means the document wasn't normalized (probably wasn't on the approved list)
        updated = normed.get('providerUpdatedDateTime')
        if not updated:
            return None

        # TODO - fix odd circular import that makes us import this here
        # Imported only once we know a datetime actually has to be formatted.
        from scrapi.base.helpers import datetime_formatter
        normed['providerUpdatedDateTime'] = datetime_formatter(
            updated.isoformat())

        return NormalizedDocument(normed, validate=False, clean=False)
コード例 #2
0
ファイル: cassandra.py プロジェクト: hmoco/scrapi
    def to_normalized(self, doc):
        """Build a ``NormalizedDocument`` from a stored row, or return ``None``.

        Truthy fields whose key is not in the skip list are copied; each value
        is decoded with ``json.loads`` when possible and kept as-is otherwise.
        A truthy ``versions`` field is converted to a list of strings.

        :param doc: anything accepted by ``dict()``.
        :returns: ``NormalizedDocument(..., validate=False, clean=False)``, or
            ``None`` when there is no truthy ``providerUpdatedDateTime``.
        """
        skipped = ('docID', 'doc', 'filetype', 'timestamps', 'source')

        # make the new dict actually contain real items
        fields = {}
        for name, raw in dict(doc).items():
            if not raw or name in skipped:
                continue
            try:
                decoded = json.loads(raw)
            except (ValueError, TypeError):
                decoded = raw
            fields[name] = decoded

        versions = fields.get('versions')
        if versions:
            fields['versions'] = [str(version) for version in versions]

        # TODO - fix odd circular import that makes us import this here
        from scrapi.base.helpers import datetime_formatter

        # No datetime means the document wasn't normalized (probably wasn't on the approved list)
        updated = fields.get('providerUpdatedDateTime')
        if not updated:
            return None

        fields['providerUpdatedDateTime'] = datetime_formatter(updated.isoformat())
        return NormalizedDocument(fields, validate=False, clean=False)
コード例 #3
0
 def schema(self):
     """Return the dict describing how each output field is extracted.

     Most entries pair a path string with a callable applied to the value
     found there; ``uris`` is a nested dict of plain paths and
     ``otherProperties`` is whatever ``build_properties`` returns.

     NOTE(review): the path strings are presumably resolved against the
     harvested JSON by the harvester base class -- confirm with the caller.
     """
     def first_or_empty(values):
         # First element of a non-empty sequence, '' for anything falsy.
         return values[0] if values else ''

     def subtitle_text(value):
         # A non-empty list contributes its first item; any other value is
         # passed through, with falsy values collapsing to ''.
         if isinstance(value, list) and value:
             return value[0]
         return value or ''

     def issued_date(date_parts):
         # Only the first group of date parts is used, space-joined.
         return datetime_formatter(' '.join(date_parts[0]))

     def contributors(authors):
         return [
             process_contributor(
                 '{} {}'.format(author.get('given'), author.get('family')),
                 author.get('ORCID')
             )
             for author in authors
         ]

     other_properties = build_properties(
         ('referenceCount', '/reference-count'),
         ('updatePolicy', '/update-policy'),
         ('depositedTimestamp', '/deposited/timestamp'),
         ('Empty', '/trash/not-here'),
         ('Empty2', '/')
     )

     return {
         'title': ('/title', first_or_empty),
         'description': ('/subtitle', subtitle_text),
         'providerUpdatedDateTime': ('/issued/date-parts', issued_date),
         'uris': {'canonicalUri': '/URL'},
         'contributors': ('/author', contributors),
         'otherProperties': other_properties,
     }
コード例 #4
0
 def schema(self):
     """Assemble and return the field-extraction mapping.

     Keys are output field names. Values are a ``(path, callable)`` tuple,
     a nested dict of paths (``uris``), or the result of
     ``build_properties`` (``otherProperties``).

     NOTE(review): how the paths and callables are consumed is not visible
     here -- confirm against the harvester that reads this mapping.
     """
     def pick_title(titles):
         if titles:
             return titles[0]
         return ''

     def pick_description(subtitle):
         # Non-empty list -> first item; otherwise the value itself, or ''
         # when that value is falsy.
         if isinstance(subtitle, list) and subtitle:
             return subtitle[0]
         return subtitle or ''

     def format_issued(date_parts):
         # Space-join the first group of date parts before formatting.
         return datetime_formatter(' '.join(date_parts[0]))

     def to_contributor(entry):
         full_name = '{} {}'.format(entry.get('given'), entry.get('family'))
         return process_contributor(full_name, entry.get('ORCID'))

     def to_contributors(authors):
         return [to_contributor(entry) for entry in authors]

     mapping = {}
     mapping['title'] = ('/title', pick_title)
     mapping['description'] = ('/subtitle', pick_description)
     mapping['providerUpdatedDateTime'] = ('/issued/date-parts', format_issued)
     mapping['uris'] = {'canonicalUri': '/URL'}
     mapping['contributors'] = ('/author', to_contributors)
     mapping['otherProperties'] = build_properties(
         ('referenceCount', '/reference-count'),
         ('updatePolicy', '/update-policy'),
         ('depositedTimestamp', '/deposited/timestamp'),
         ('Empty', '/trash/not-here'),
         ('Empty2', '/')
     )
     return mapping