コード例 #1
0
    def test_failing_transformation_with_raises(self):
        """A malformed schema expression raises when RAISE_IN_TRANSFORMER is on.

        Sets ``base.settings.RAISE_IN_TRANSFORMER`` to True, replaces the
        ``title`` entry of the harvester schema with a deliberately malformed
        expression, and expects ``XPathEvalError`` while the harvested records
        are normalized.
        """
        base.settings.RAISE_IN_TRANSFORMER = True

        # The backslash is written as an explicit escape: a bare '\/' is an
        # invalid escape sequence that recent Python versions warn about.
        # The runtime value of the string is unchanged.
        self.harvester.schema = updated_schema(
            TEST_SCHEMA, {'title': 'A completely 1n\\/@lid expre55ion'})

        with pytest.raises(XPathEvalError):
            for record in self.harvester.harvest():
                self.harvester.normalize(record)
コード例 #2
0
    def test_arg_kwargs(self):
        """Check schema entries whose XPaths are supplied through ``pack``.

        ``pack`` is called with positional XPaths, keyword XPaths, and both;
        the test asserts the values the processing functions produce for
        ``title`` and for the three ``otherProperties`` entries.
        """
        def process_title(title, title1="test"):
            head = title[0]
            tail = title1[0] if isinstance(title1, list) else title1
            return head + tail

        def process_title2(title1="test"):
            if isinstance(title1, list):
                return title1[0]
            return title1

        positional = ("//dc:title/node()", )
        keyword = {"title1": "//dc:title/node()"}

        title_rule = (pack(*positional, **keyword), process_title)
        extra_rules = build_properties(
            ('title2', (pack(*positional), process_title)),
            ('title3', (pack(**keyword), process_title2)),
            ('title4', (pack('//dc:title/node()', title1='//dc:title/node()'), process_title))
        )
        self.harvester.schema = updated_schema(
            TEST_SCHEMA,
            {'title': title_rule, 'otherProperties': extra_rules}
        )

        # Normalize everything first so harvesting problems surface before
        # any assertion runs.
        normalized_docs = []
        for record in self.harvester.harvest(days_back=1):
            normalized_docs.append(self.harvester.normalize(record))

        for doc in normalized_docs:
            assert doc['title'] == "TestTest"
            extras = doc['otherProperties']
            assert extras[0]['properties']['title2'] == 'Testtest'
            assert extras[1]['properties']['title3'] == 'Test'
            assert extras[2]['properties']['title4'] == "TestTest"
コード例 #3
0
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and a ``uris`` rule.

     The rule pairs the header-identifier and ``dc:identifier`` XPaths with
     ``oai_process_uris_addis_ababa``.
     """
     uris_rule = (
         '//ns0:header/ns0:identifier/node()',
         '//dc:identifier/node()',
         oai_process_uris_addis_ababa,
     )
     return helpers.updated_schema(self._schema, {"uris": uris_rule})
コード例 #4
0
ファイル: icpsr.py プロジェクト: AndrewSallans/scrapi
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and ICPSR ``uris`` rules.

     ``canonicalUri`` and ``objectUris`` are both derived from the
     ``dc:identifier`` XPath, each with its own processing callable.
     """
     identifier_xpath = '//dc:identifier/node()'
     canonical_rule = (
         identifier_xpath,
         helpers.compose(create_icpsr_url, helpers.single_result),
     )
     object_rules = [(identifier_xpath, icpsr_exttract_doi)]
     overrides = {
         "uris": {
             "canonicalUri": canonical_rule,
             "objectUris": object_rules,
         },
     }
     return helpers.updated_schema(self._schema, overrides)
コード例 #5
0
 def schema(self):
     """Return ``updated_schema`` applied to ``self._schema`` and a ``contributors`` rule.

     The rule pairs the ``dc:creator`` and ``dc:contributor`` XPaths with
     ``aoi_process_contributors_bhl``.
     """
     contributors_rule = (
         '//dc:creator/node()',
         '//dc:contributor/node()',
         aoi_process_contributors_bhl,
     )
     return updated_schema(self._schema, {'contributors': contributors_rule})
コード例 #6
0
class DryadHarvester(OAIHarvester):
    """OAI harvester settings for the Dryad Data Repository.

    ``normalize`` returns None for records whose ``dc:status`` is
    ``deleted`` or ``item is not available`` and for records whose
    ``dc:type`` is not ``article`` (both compared case-insensitively).
    """

    short_name = 'dryad'
    long_name = 'Dryad Data Repository'
    url = 'http://www.datadryad.org/oai/request'

    base_url = 'http://www.datadryad.org/oai/request'
    property_list = [
        'rights', 'format', 'relation', 'date',
        'identifier', 'type', 'setSpec',
    ]
    timezone_granularity = True

    # OAISCHEMA with objectUris taken from dc:relation and dc:identifier.
    schema = helpers.updated_schema(schemas.OAISCHEMA, {
        "uris": {
            "objectUris": (
                '//dc:relation/node()',
                '//dc:identifier/node()',
                format_dois_dryad,
            ),
        },
    })

    def normalize(self, raw_doc):
        """Normalize ``raw_doc`` unless its status or type rules it out.

        ``raw_doc['doc']`` is parsed as XML and ``raw_doc['docID']`` is used
        in the log messages. Returns None for skipped records, otherwise
        whatever the delegated ``normalize`` call returns.
        """
        tree = etree.XML(raw_doc['doc'])

        status_nodes = tree.xpath('//dc:status/node()', namespaces=self.namespaces)
        status = status_nodes[0] if status_nodes else ''
        if str(status).lower() in ('deleted', 'item is not available'):
            logger.info('Not normalizing record with ID {}, status {}'.format(raw_doc['docID'], status))
            return None

        type_nodes = tree.xpath('//dc:type/node()', namespaces=self.namespaces)
        doc_type = type_nodes[0] if type_nodes else ''
        if doc_type.lower() != 'article':
            logger.info('Not normalizing record with ID {}, type {}'.format(raw_doc['docID'], doc_type))
            return None

        # NOTE(review): super() is anchored at OAIHarvester rather than
        # DryadHarvester, so any normalize defined on OAIHarvester itself is
        # skipped -- confirm this is intended.
        return super(OAIHarvester, self).normalize(raw_doc)
コード例 #7
0
ファイル: datacite.py プロジェクト: NeuroVault/scrapi
 def schema(self):
     """Return ``updated_schema`` applied to ``self._schema`` with description and uris rules.

     ``description`` uses ``get_second_description``; both URI fields are
     derived from the ``dc:identifier`` XPath via ``oai_extract_dois``.
     """
     description_rule = ("//dc:description/node()", get_second_description)
     uri_rules = {
         "canonicalUri": (
             '//dc:identifier/node()',
             compose(single_result, oai_extract_dois),
         ),
         "objectUris": ('//dc:identifier/node()', oai_extract_dois),
     }
     return updated_schema(
         self._schema,
         {"description": description_rule, "uris": uri_rules},
     )
コード例 #8
0
    def test_failing_transformation_wont_raise(self):
        """With RAISE_IN_TRANSFORMER off, a malformed expression ends in ValidationError.

        Sets ``base.settings.RAISE_IN_TRANSFORMER`` to False, replaces the
        ``title`` entry of the harvester schema with a deliberately malformed
        expression, and expects ``ValidationError`` while the harvested
        records are normalized.
        """
        # 50 is the numeric value of logging.CRITICAL (presumably this is a
        # standard-library logger -- confirm).
        base.transformer.logger.setLevel(50)
        base.settings.RAISE_IN_TRANSFORMER = False

        # The backslash is written as an explicit escape: a bare '\/' is an
        # invalid escape sequence that recent Python versions warn about.
        # The runtime value of the string is unchanged.
        self.harvester.schema = updated_schema(
            TEST_SCHEMA, {'title': 'A completely 1n\\/@lid expre55ion'})

        with pytest.raises(ValidationError):
            for record in self.harvester.harvest():
                self.harvester.normalize(record)
コード例 #9
0
ファイル: dryad.py プロジェクト: felliott/scrapi
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and an ``objectUris`` rule.

     The rule pairs the ``dc:relation`` and ``dc:identifier`` XPaths with
     ``format_dois_dryad``.
     """
     object_uris_rule = (
         '//dc:relation/node()',
         '//dc:identifier/node()',
         format_dois_dryad,
     )
     overrides = {"uris": {"objectUris": object_uris_rule}}
     return helpers.updated_schema(self._schema, overrides)
コード例 #10
0
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and a ``canonicalUri`` rule.

     The rule pairs the header-identifier XPath with the composition of
     ``oai_extract_url_pubmedcentral`` and ``helpers.single_result``.
     """
     to_canonical = helpers.compose(
         oai_extract_url_pubmedcentral, helpers.single_result)
     canonical_rule = ('//ns0:header/ns0:identifier/node()', to_canonical)
     overrides = {"uris": {"canonicalUri": canonical_rule}}
     return helpers.updated_schema(self._schema, overrides)
コード例 #11
0
ファイル: __init__.py プロジェクト: jeffreyliu3230/scrapi
    def schema(self):
        """Return ``OAISCHEMA`` updated with one ``otherProperties`` rule per property.

        For every name in ``self.property_list`` the rule pairs that name's
        ``dc:`` and ``ns0:`` XPaths with ``self.resolve_property``. The rules
        go through ``build_properties`` and the result is passed to
        ``updated_schema``.
        """
        rules = []
        for item in self.property_list:
            dc_xpath = '//dc:{}/node()'.format(item)
            ns0_xpath = '//ns0:{}/node()'.format(item)
            rules.append((item, (dc_xpath, ns0_xpath, self.resolve_property)))

        overrides = {'otherProperties': build_properties(*rules)}
        return updated_schema(OAISCHEMA, overrides)
コード例 #12
0
ファイル: __init__.py プロジェクト: bdyetton/scrapi
    def schema(self):
        """Return ``OAISCHEMA`` updated with ``otherProperties`` built from ``self.property_list``.

        Each property name yields a ``(name, rule)`` pair whose rule holds the
        name's ``dc:`` and ``ns0:`` XPaths followed by
        ``self.resolve_property``.
        """
        def rule_for(name):
            # (name, (dc xpath, ns0 xpath, resolver)) as build_properties is given.
            return (name, (
                '//dc:{}/node()'.format(name),
                '//ns0:{}/node()'.format(name),
                self.resolve_property,
            ))

        other_properties = build_properties(
            *[rule_for(item) for item in self.property_list])

        return updated_schema(OAISCHEMA, {'otherProperties': other_properties})
コード例 #13
0
 def schema(self):
     """Return ``updated_schema`` applied to ``self._schema`` with description and uris rules.

     ``description`` is processed by ``get_second_description``;
     ``canonicalUri`` and ``objectUris`` both read the ``dc:identifier``
     XPath and use ``oai_extract_dois``.
     """
     canonical_rule = ('//dc:identifier/node()', compose(single_result, oai_extract_dois))
     object_rule = ('//dc:identifier/node()', oai_extract_dois)
     overrides = {
         "description": ("//dc:description/node()", get_second_description),
         "uris": {"canonicalUri": canonical_rule, "objectUris": object_rule},
     }
     return updated_schema(self._schema, overrides)
コード例 #14
0
    def test_failing_transformation_with_raises(self):
        """Expect ``XPathEvalError`` for a malformed expression when raising is enabled.

        ``base.settings.RAISE_IN_TRANSFORMER`` is switched on and the
        ``title`` schema entry is replaced by a deliberately malformed
        expression before every harvested record is normalized.
        """
        base.settings.RAISE_IN_TRANSFORMER = True

        # '\\/' spells the same backslash-slash pair the test always used;
        # the unescaped form is an invalid escape sequence that recent
        # Python versions warn about.
        bad_title = {'title': 'A completely 1n\\/@lid expre55ion'}
        self.harvester.schema = updated_schema(TEST_SCHEMA, bad_title)

        with pytest.raises(XPathEvalError):
            for record in self.harvester.harvest():
                self.harvester.normalize(record)
コード例 #15
0
    def test_failing_transformation_wont_raise(self):
        """Expect ``ValidationError`` for a malformed expression when raising is disabled.

        ``base.settings.RAISE_IN_TRANSFORMER`` is switched off and the
        ``title`` schema entry is replaced by a deliberately malformed
        expression before every harvested record is normalized.
        """
        # 50 is the numeric value of logging.CRITICAL (presumably this is a
        # standard-library logger -- confirm).
        base.transformer.logger.setLevel(50)
        base.settings.RAISE_IN_TRANSFORMER = False

        # '\\/' spells the same backslash-slash pair the test always used;
        # the unescaped form is an invalid escape sequence that recent
        # Python versions warn about.
        bad_title = {'title': 'A completely 1n\\/@lid expre55ion'}
        self.harvester.schema = updated_schema(TEST_SCHEMA, bad_title)

        with pytest.raises(ValidationError):
            for record in self.harvester.harvest():
                self.harvester.normalize(record)
コード例 #16
0
ファイル: icpsr.py プロジェクト: zamattiac/scrapi
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and ICPSR ``uris`` rules.

     ``canonicalUri`` composes ``create_icpsr_url`` with
     ``helpers.single_result``; ``objectUris`` is a one-element list using
     ``icpsr_exttract_doi``. Both read the ``dc:identifier`` XPath.
     """
     canonical = ('//dc:identifier/node()', helpers.compose(create_icpsr_url, helpers.single_result))
     objects = [('//dc:identifier/node()', icpsr_exttract_doi)]
     return helpers.updated_schema(
         self._schema,
         {"uris": {"canonicalUri": canonical, "objectUris": objects}},
     )
コード例 #17
0
ファイル: bhl.py プロジェクト: jeffreyliu3230/scrapi
class BHLHarvester(OAIHarvester):
    """OAI harvester settings for the Biodiversity Heritage Library."""

    short_name = 'bhl'
    long_name = 'Biodiversity Heritage Library OAI Repository'
    url = 'http://www.biodiversitylibrary.org/'

    base_url = 'http://www.biodiversitylibrary.org/oai'

    # OAISCHEMA with contributors read from dc:creator and dc:contributor.
    schema = updated_schema(OAISCHEMA, {
        'contributors': (
            '//dc:creator/node()',
            '//dc:contributor/node()',
            aoi_process_contributors_bhl,
        ),
    })
    property_list = [
        'type',
        'date',
        'relation',
        'setSpec',
        'rights',
    ]
コード例 #18
0
class ScholarsbankHarvester(OAIHarvester):
    """OAI harvester settings for Scholars Bank, University of Oregon."""

    short_name = 'scholarsbank'
    long_name = 'Scholars Bank University of Oregon'
    url = 'http://scholarsbank.uoregon.edu'
    timezone_granularity = True

    base_url = 'http://scholarsbank.uoregon.edu/oai/request'
    property_list = [
        'type',
        'source',
        'format',
        'relation',
        'date',
        'description',
        'setSpec',
        'identifier',
    ]

    # OAISCHEMA with the description processed by second_result.
    schema = updated_schema(OAISCHEMA, {
        'description': ('//dc:description/node()', second_result),
    })
コード例 #19
0
class PubMedCentralHarvester(OAIHarvester):
    """OAI harvester settings for PubMed Central."""

    short_name = 'pubmedcentral'
    long_name = 'PubMed Central'
    url = 'http://www.ncbi.nlm.nih.gov/pmc/'

    # OAISCHEMA with canonicalUri derived from the OAI header identifier.
    schema = helpers.updated_schema(schemas.OAISCHEMA, {
        "uris": {
            "canonicalUri": (
                '//ns0:header/ns0:identifier/node()',
                helpers.compose(
                    oai_extract_url_pubmedcentral,
                    helpers.single_result,
                ),
            ),
        },
    })

    base_url = 'http://www.pubmedcentral.nih.gov/oai/oai.cgi'
    property_list = [
        'type',
        'source',
        'rights',
        'format',
        'setSpec',
        'date',
        'identifier',
    ]
コード例 #20
0
    def test_constants(self):
        """Check that ``CONSTANT`` schema entries show up in normalized documents.

        ``tags`` and one ``otherProperties`` entry are defined with
        ``CONSTANT`` values; every normalized record must carry them.
        """
        tags_rule = (CONSTANT(['X']), lambda x: x)
        constant_property = {
            'name': CONSTANT('test'),
            'properties': {'test': CONSTANT('test')},
            'uri': CONSTANT('http://example.com'),
            'description': CONSTANT('A test field'),
        }
        self.harvester.schema = updated_schema(
            TEST_SCHEMA,
            {'tags': tags_rule, 'otherProperties': [constant_property]},
        )

        normalized_docs = []
        for record in self.harvester.harvest(days_back=1):
            normalized_docs.append(self.harvester.normalize(record))

        for doc in normalized_docs:
            assert doc['otherProperties'][0]['properties']['test'] == 'test'
            assert doc['tags'] == ['X']
コード例 #21
0
    def test_constants(self):
        """Verify constant-valued schema entries survive normalization.

        The schema gets a ``tags`` rule and a single ``otherProperties``
        entry, all built from ``CONSTANT``; the assertions check the values
        found in each normalized record.
        """
        overrides = {
            'tags': (CONSTANT(['X']), lambda x: x),
            'otherProperties': [
                {
                    'name': CONSTANT('test'),
                    'properties': {'test': CONSTANT('test')},
                    'uri': CONSTANT('http://example.com'),
                    'description': CONSTANT('A test field'),
                },
            ],
        }
        self.harvester.schema = updated_schema(TEST_SCHEMA, overrides)

        results = [self.harvester.normalize(record) for record in self.harvester.harvest(days_back=1)]

        for normalized in results:
            first_property = normalized['otherProperties'][0]
            assert first_property['properties']['test'] == 'test'
            assert normalized['tags'] == ['X']
コード例 #22
0
    def test_arg_kwargs(self):
        """Exercise ``pack`` with positional-only, keyword-only and mixed XPaths.

        The processing functions receive the packed values; the assertions
        pin the resulting ``title`` and the ``title2``/``title3``/``title4``
        properties of each normalized record.
        """
        def process_title(title, title1="test"):
            first = title[0]
            if isinstance(title1, list):
                return first + title1[0]
            return first + title1

        def process_title2(title1="test"):
            return title1[0] if isinstance(title1, list) else title1

        args = ("//dc:title/node()", )
        kwargs = {"title1": "//dc:title/node()"}

        overrides = {
            'title': (pack(*args, **kwargs), process_title),
            'otherProperties': build_properties(
                ('title2', (pack(*args), process_title)),
                ('title3', (pack(**kwargs), process_title2)),
                ('title4', (pack('//dc:title/node()', title1='//dc:title/node()'), process_title)),
            ),
        }
        self.harvester.schema = updated_schema(TEST_SCHEMA, overrides)

        results = [self.harvester.normalize(record) for record in self.harvester.harvest(days_back=1)]

        for normalized in results:
            assert normalized['title'] == "TestTest"
            others = normalized['otherProperties']
            assert others[0]['properties']['title2'] == 'Testtest'
            assert others[1]['properties']['title3'] == 'Test'
            assert others[2]['properties']['title4'] == "TestTest"
コード例 #23
0
ファイル: __init__.py プロジェクト: NeuroVault/scrapi
 def _schema(self):
     """Return ``OAISCHEMA`` updated with ``self.formatted_properties``."""
     extra_properties = self.formatted_properties
     return updated_schema(OAISCHEMA, extra_properties)
コード例 #24
0
ファイル: utils.py プロジェクト: erinspace/scrapi
    }],
    'description': 'This study seeks to understand how humans impact\
            the dietary patterns of eight free-ranging vervet monkey\
            (Chlorocebus pygerythrus) groups in South Africa using stable\
            isotope analysis.',
    'providerUpdatedDateTime': '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'test'
    }
}


# DOESCHEMA with a title override and three otherProperties rules that all
# read the dc:title XPath.
TEST_SCHEMA = updated_schema(
    DOESCHEMA,
    {
        "title": ("//dc:title/node()", lambda x: "Title overwritten"),
        "otherProperties": build_properties(
            ("title1", ("//dc:title/node()", single_result)),
            (
                "title2",
                ("//dc:title/node()", lambda x: single_result(x).lower()),
            ),
            (
                "title3",
                (
                    "//dc:title/node()",
                    "//dc:title/node()",
                    lambda x, y: single_result(x) + single_result(y).lower(),
                ),
            ),
        ),
    },
)


# Prefix -> namespace URI for the rdf, dc and dcq prefixes.
TEST_NAMESPACES = dict(
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    dc='http://purl.org/dc/elements/1.1/',
    dcq='http://purl.org/dc/terms/',
)


TEST_XML_DOC = b'''
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcq="http://purl.org/dc/terms/">
        <records count="97" morepages="true" start="1" end="10">
コード例 #25
0
ファイル: pubmedcentral.py プロジェクト: erinspace/scrapi
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and a ``canonicalUri`` rule.

     The rule reads the header-identifier XPath and applies
     ``oai_extract_url_pubmedcentral`` composed with ``helpers.single_result``.
     """
     canonical_rule = (
         '//ns0:header/ns0:identifier/node()',
         helpers.compose(oai_extract_url_pubmedcentral, helpers.single_result),
     )
     return helpers.updated_schema(
         self._schema, {"uris": {"canonicalUri": canonical_rule}})
コード例 #26
0
ファイル: bhl.py プロジェクト: NeuroVault/scrapi
 def schema(self):
     """Return ``updated_schema`` applied to ``self._schema`` and a ``contributors`` rule.

     Contributors are read from the ``dc:creator`` and ``dc:contributor``
     XPaths and processed by ``aoi_process_contributors_bhl``.
     """
     overrides = {
         'contributors': (
             '//dc:creator/node()',
             '//dc:contributor/node()',
             aoi_process_contributors_bhl,
         ),
     }
     return updated_schema(self._schema, overrides)
コード例 #27
0
 def _schema(self):
     """Return the result of ``updated_schema(OAISCHEMA, self.formatted_properties)``."""
     properties = self.formatted_properties
     return updated_schema(OAISCHEMA, properties)
コード例 #28
0
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and a ``uris`` rule.

     The rule pairs the ``dc:identifier`` and ``dc:relation`` XPaths with
     ``helpers.oai_process_uris``.
     """
     uris_rule = (
         '//dc:identifier/node()',
         '//dc:relation/node()',
         helpers.oai_process_uris,
     )
     return helpers.updated_schema(self._schema, {"uris": uris_rule})
コード例 #29
0
ファイル: utils.py プロジェクト: jeffreyliu3230/scrapi
            the dietary patterns of eight free-ranging vervet monkey\
            (Chlorocebus pygerythrus) groups in South Africa using stable\
            isotope analysis.',
    'providerUpdatedDateTime':
    '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'test'
    }
}

# DOESCHEMA with the title replaced and three extra title-derived properties.
TEST_SCHEMA = updated_schema(DOESCHEMA, {
    "title": ("//dc:title/node()", lambda x: "Title overwritten"),
    "otherProperties": build_properties(
        ("title1", ("//dc:title/node()", single_result)),
        ("title2", ("//dc:title/node()", lambda x: single_result(x).lower())),
        ("title3", (
            "//dc:title/node()",
            "//dc:title/node()",
            lambda x, y: single_result(x) + single_result(y).lower(),
        )),
    ),
})

# XML namespace prefixes mapped to their URIs.
TEST_NAMESPACES = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",  # RDF syntax
    "dc": "http://purl.org/dc/elements/1.1/",  # Dublin Core elements
    "dcq": "http://purl.org/dc/terms/",  # Dublin Core terms
}

TEST_XML_DOC = '''
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcq="http://purl.org/dc/terms/">
        <records count="97" morepages="true" start="1" end="10">
コード例 #30
0
ファイル: smithsonian.py プロジェクト: NeuroVault/scrapi
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and a ``uris`` rule.

     The rule pairs the ``dc:identifier`` XPath with
     ``helpers.oai_process_uris``.
     """
     uris_rule = ('//dc:identifier/node()', helpers.oai_process_uris)
     return helpers.updated_schema(self._schema, {"uris": uris_rule})
コード例 #31
0
ファイル: umontreal.py プロジェクト: zamattiac/scrapi
 def schema(self):
     """Return ``updated_schema`` applied to ``self._schema`` and a ``languages`` rule.

     The rule pairs the ``dc:language`` XPath with
     ``umontreal_language_processor``.
     """
     languages_rule = ('//dc:language/node()', umontreal_language_processor)
     return updated_schema(self._schema, {'languages': languages_rule})
コード例 #32
0
ファイル: utils.py プロジェクト: Eleonore9/scrapi
    # },
    'description': 'This study seeks to understand how humans impact\
            the dietary patterns of eight free-ranging vervet monkey\
            (Chlorocebus pygerythrus) groups in South Africa using stable\
            isotope analysis.',
    'providerUpdatedDateTime': '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'crossref'
    }
}


# BASEXMLSCHEMA with only the title overridden; the otherProperties overrides
# inside the dict are commented out.
TEST_SCHEMA = updated_schema(
    BASEXMLSCHEMA,
    {
        "title": ("//dc:title/node()", lambda x: "Title overwritten"),
        # "otherProperties": {
        #     "title1": "//dc:title/node()",
        #     "title2": ["//dc:title/node()", lambda x: x.lower()],
        #     "title3": ["//dc:title/node()", "//dc:title/node()", lambda x, y: x + y.lower()]
        # }
    },
)


def get_leaves(d, leaves=None):
    """Collect the ``(key, value)`` pairs of every non-dict value in ``d``.

    Nested dicts are walked recursively; only their non-dict values are
    collected, in iteration order.

    Args:
        d: The (possibly nested) dict to walk.
        leaves: Optional list to append to. A new list is created when it is
            None; otherwise the given list is extended in place.

    Returns:
        The list of ``(key, value)`` pairs (the same object as ``leaves``
        when one was passed in).
    """
    if leaves is None:
        leaves = []

    for k, v in d.items():
        if isinstance(v, dict):
            # The recursive call appends straight into the shared list, so
            # its result must not be extended onto ``leaves`` again (that
            # would duplicate every entry collected so far).
            get_leaves(v, leaves)
        else:
            leaves.append((k, v))

    return leaves
コード例 #33
0
ファイル: pcurio.py プロジェクト: AndrewSallans/scrapi
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and a ``uris`` rule.

     The rule pairs the header-identifier and ``dc:identifier`` XPaths with
     ``oai_process_pcurio``.
     """
     uris_rule = (
         '//ns0:header/ns0:identifier/node()',
         '//dc:identifier/node()',
         oai_process_pcurio,
     )
     return helpers.updated_schema(self._schema, {"uris": uris_rule})
コード例 #34
0
ファイル: pubmedcentral.py プロジェクト: zamattiac/scrapi
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and a ``uris`` rule.

     The rule pairs the header-identifier and ``dc:identifier`` XPaths with
     ``format_uris_pubmedcentral``.
     """
     overrides = {
         "uris": (
             '//ns0:header/ns0:identifier/node()',
             '//dc:identifier/node()',
             format_uris_pubmedcentral,
         ),
     }
     return helpers.updated_schema(self._schema, overrides)
コード例 #35
0
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and a ``description`` rule.

     The rule pairs the ``dc:description`` XPath with ``second_result``.
     """
     description_rule = ('//dc:description/node()', second_result)
     return helpers.updated_schema(
         self._schema, {'description': description_rule})
コード例 #36
0
ファイル: mblwhoilibrary.py プロジェクト: kms6bn/scrapi
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and a ``uris`` rule.

     The rule reads the ``dc:identifier`` and ``dc:relation`` XPaths and
     applies ``helpers.oai_process_uris``.
     """
     overrides = {
         "uris": (
             "//dc:identifier/node()",
             "//dc:relation/node()",
             helpers.oai_process_uris,
         ),
     }
     return helpers.updated_schema(self._schema, overrides)
コード例 #37
0
ファイル: pubmedcentral.py プロジェクト: AndrewSallans/scrapi
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and a ``uris`` rule.

     URIs are read from the header-identifier and ``dc:identifier`` XPaths
     and processed by ``format_uris_pubmedcentral``.
     """
     header_xpath = '//ns0:header/ns0:identifier/node()'
     identifier_xpath = '//dc:identifier/node()'
     uris_rule = (header_xpath, identifier_xpath, format_uris_pubmedcentral)
     return helpers.updated_schema(self._schema, {"uris": uris_rule})
コード例 #38
0
ファイル: dryad.py プロジェクト: erinspace/scrapi
 def schema(self):
     """Return ``helpers.updated_schema`` applied to ``self._schema`` and an ``objectUris`` rule.

     Object URIs are read from the ``dc:relation`` and ``dc:identifier``
     XPaths and processed by ``format_dois_dryad``.
     """
     relation_xpath = '//dc:relation/node()'
     identifier_xpath = '//dc:identifier/node()'
     object_uris = (relation_xpath, identifier_xpath, format_dois_dryad)
     return helpers.updated_schema(
         self._schema, {"uris": {"objectUris": object_uris}})