Example #1
def normalize(raw_doc):
    """A function for parsing the list of XML objects returned by the 
    consume function.
    Returns a list of Json objects in a format that can be recognized 
    by the OSF scrapi."""
    raw_doc_str = raw_doc.get('doc')
    terms_url = 'http://purl.org/dc/terms/'
    elements_url = 'http://purl.org/dc/elements/1.1/'
    record = etree.XML(raw_doc_str)

    title = record.find(str(etree.QName(elements_url, 'title'))).text
    description = record.find(str(etree.QName(elements_url, 'description'))).text or ''

    normalized_dict = {
        'title': copy_to_unicode(title),
        'description': copy_to_unicode(description),
        'contributors': get_contributors(record),
        'properties': get_properties(record),
        'id': get_ids(record, raw_doc),
        'source': NAME,
        'dateCreated': get_date_created(record),
        'dateUpdated': get_date_updated(record),
        'tags': get_tags(record)
    }
    return NormalizedDocument(normalized_dict)
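
Most of these normalizers lean on a copy_to_unicode helper that is not shown here. A minimal sketch, assuming it only coerces parsed values to unicode text (the actual scrapi utility may behave differently):

def copy_to_unicode(value):
    # Hypothetical sketch of the helper assumed above; the real scrapi
    # implementation may differ. Coerce lxml text nodes or byte strings
    # to plain unicode text, treating None as an empty string.
    if value is None:
        return u''
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return u'{}'.format(value)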
Example #2
def normalize(raw_doc):
    doc = raw_doc.get('doc')
    record = etree.XML(doc)

    set_spec = record.xpath('ns0:header/ns0:setSpec/node()',
                            namespaces=NAMESPACES)[0]

    if set_spec.replace('publication:', '') not in series_name_list:
        return None

    # title
    title = record.xpath('//dc:title/node()', namespaces=NAMESPACES)[0]

    # description
    description = (record.xpath('//dc:description/node()',
                                namespaces=NAMESPACES) or [''])[0]

    normalized_dict = {
        'title': copy_to_unicode(title),
        'contributors': get_contributors(record),
        'properties': get_properties(record),
        'description': copy_to_unicode(description),
        'id': get_ids(record, raw_doc),
        'tags': get_tags(record),
        'source': NAME,
        'dateCreated': get_date_created(record),
        'dateUpdated': get_date_updated(record),
    }

    return NormalizedDocument(normalized_dict)
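
The NAMESPACES mapping used by these xpath calls is not shown in the snippet. A minimal sketch using the standard OAI-PMH and Dublin Core namespace URIs; the prefix names themselves are assumptions inferred from the expressions above:

# Assumed namespace map for the ns0/dc/oai_dc prefixes used in these
# OAI-PMH examples; the URIs are the standard OAI-PMH 2.0 and Dublin Core
# element namespaces.
NAMESPACES = {
    'ns0': 'http://www.openarchives.org/OAI/2.0/',
    'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
    'dc': 'http://purl.org/dc/elements/1.1/',
}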
Example #3
    def normalize(self, raw_doc):
        transformed = self.transform(etree.XML(raw_doc['doc']),
                                     fail=settings.RAISE_IN_TRANSFORMER)
        transformed['shareProperties'] = {
            'source': self.short_name,
            'docID': raw_doc['docID'],
            'filetype': raw_doc['filetype']
        }
        return NormalizedDocument(transformed, clean=True)
Example #4
def test_cross_db_with_versions(canonical,
                                destination,
                                monkeypatch,
                                index='test'):
    new_title = 'How to be really good at Zoo Tycoon: The Definitive Guide'

    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    # Get a version in there too
    new_normalized = copy.deepcopy(NORMALIZED.attributes)
    new_normalized['title'] = new_title
    canonical_processor.process_normalized(RAW,
                                           NormalizedDocument(new_normalized))

    destination_processor = get_processor(destination)

    # Check that the canonical_processor versions are there, and the destination's are not
    canonical_versions = list(
        canonical_processor.get_versions(docID=RAW['docID'],
                                         source=RAW['source']))
    assert len(canonical_versions) == 3
    assert canonical_versions[1].normalized['title'] == NORMALIZED['title']
    assert canonical_versions[2].normalized['title'] == new_title

    destination_doc = destination_processor.get(docID=RAW['docID'],
                                                source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db,
                  target_db=destination,
                  dry=False,
                  sources=['test'],
                  index=index,
                  versions=True)

    # Check that the document made it to the destination and is still in the canonical
    destination_versions = list(
        destination_processor.get_versions(docID=RAW['docID'],
                                           source=RAW['source']))
    assert len(destination_versions) == 3
    assert destination_versions[1].normalized['title'] == NORMALIZED['title']
    assert destination_versions[2].normalized['title'] == new_title

    canonical_doc = canonical_processor.get(docID=RAW['docID'],
                                            source=RAW['source'])
    assert canonical_doc
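
The canonical and destination arguments are presumably supplied by parametrized fixtures covering every processor pairing. A hypothetical decorator-based setup that produces the same cross product; the processor names below are placeholders, not the project's actual list:

import pytest

DATABASES = ['cassandra', 'elasticsearch', 'postgres']  # placeholder names

@pytest.mark.parametrize('canonical', DATABASES)
@pytest.mark.parametrize('destination', DATABASES)
def test_pairing_example(canonical, destination):
    # Mirrors the early-return guard above: migrating a processor onto
    # itself is not a meaningful cross-database case.
    if canonical == destination:
        pytest.skip('same processor on both sides')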
Example #5
    def normalize(self, raw):
        raw_data = json.loads(raw['doc'])
        document = raw_data['jsonData']

        # This is a workaround because the push API did not have proper email validation
        for contributor in document['contributors']:
            if contributor.get('email', None) == '':
                del contributor['email']

        # If the status is marked deleted in the push API, mark it in shareProperties
        if raw_data['status'] == 'deleted':
            document['shareProperties']['status'] = 'deleted'

        return NormalizedDocument(document)
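
For reference, the raw['doc'] payload this method consumes looks roughly like the following; the field values are illustrative and the real push API schema may carry more keys:

import json

# Illustrative payload only; the structure is inferred from normalize() above.
example_raw = {
    'doc': json.dumps({
        'status': 'deleted',
        'jsonData': {
            'title': 'An example pushed record',
            'contributors': [{'name': 'A. Author', 'email': ''}],
            'shareProperties': {}
        }
    })
}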
Example #6
def normalize(raw_doc):
    raw_doc = raw_doc.get('doc')
    record = etree.XML(raw_doc)

    normalized_dict = {
        'title': get_title(record),
        'contributors': get_contributors(record),
        'properties': get_properties(record),
        'description': get_description(record),
        'id': get_ids(record),
        'tags': get_tags(record),
        'source': NAME,
        'dateCreated': get_date_created(record),
        'dateUpdated': get_date_updated(record)
    }

    return NormalizedDocument(normalized_dict)
Example #7
    def normalize(self, raw_doc, property_list):
        str_result = raw_doc.get('doc')
        result = etree.XML(str_result)

        # TODO: add series name filtering support
        payload = {
            'source': self.name,
            'title': self.get_title(result),
            'description': self.get_description(result),
            'id': self.get_ids(result, raw_doc),
            'contributors': self.get_contributors(result),
            'tags': self.get_tags(result),
            'properties': self.get_properties(result, property_list),
            'dateUpdated': self.get_date_updated(result),
            'dateCreated': self.get_date_created(result)
        }

        return NormalizedDocument(payload)
Example #8
def normalize(raw_doc):
    raw_doc_text = raw_doc.get('doc')
    doc = etree.XML(raw_doc_text)

    title = doc.xpath("//atom:entry/atom:title/node()", namespaces=NAMESPACES)[0]
    description = (doc.xpath("//atom:summary/node()", namespaces=NAMESPACES) or [""])[0]

    normalized_dict = {
        "title": copy_to_unicode(title),
        "contributors": get_contributors(doc),
        "properties": get_properties(doc),
        "description": copy_to_unicode(description),
        "id": get_ids(doc, raw_doc),
        "source": NAME,
        "tags": get_tags(doc),
        "dateCreated": get_date_created(doc),
        "dateUpdated": get_date_updated(doc)
    }

    return NormalizedDocument(normalized_dict)
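
Here the assumed namespace map only needs the atom prefix; the URI below is the standard Atom syndication namespace:

# Assumed mapping for the atom: prefix used in the xpath expressions above.
NAMESPACES = {'atom': 'http://www.w3.org/2005/Atom'}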
Example #9
def normalize(raw_doc):
    doc = raw_doc.get('doc')
    record = etree.XML(doc)

    title = (record.xpath('//dc:title/node()', namespaces=NAMESPACES) or [''])[0]
    description = (record.xpath('ns0:metadata/oai_dc:dc/dc:description/node()', namespaces=NAMESPACES) or [''])[0]

    normalized_dict = {
        'title': copy_to_unicode(title),
        'contributors': get_contributors(record),
        'properties': get_properties(record),
        'description': copy_to_unicode(description),
        'tags': get_tags(record),
        'id': get_ids(record, raw_doc),
        'source': NAME,
        'dateUpdated': get_date_updated(record),
        'dateCreated': get_date_created(record),
    }

    # import json; print(json.dumps(normalized_dict, indent=4))
    return NormalizedDocument(normalized_dict)
Example #10
def normalize(raw_doc):
    doc = raw_doc.get('doc')
    record = etree.XML(doc)

    # Load the list of approved series names from a file
    with open(os.path.join(os.path.dirname(__file__),
                           'series_names.txt')) as series_names:
        series_name_list = [
            word.replace('\n', '') for word in series_names
        ]

    set_spec = record.xpath('ns0:header/ns0:setSpec/node()',
                            namespaces=NAMESPACES)[0]

    if set_spec.replace('publication:', '') not in series_name_list:
        print('Series not in approved list, not normalizing...')
        return None

    title = (record.xpath('//dc:title/node()', namespaces=NAMESPACES)
             or [''])[0]
    description = (record.xpath('ns0:metadata/oai_dc:dc/dc:description/node()',
                                namespaces=NAMESPACES) or [''])[0]

    normalized_dict = {
        'title': copy_to_unicode(title),
        'contributors': get_contributors(record),
        'properties': get_properties(record),
        'description': copy_to_unicode(description),
        'tags': get_tags(record),
        'id': get_ids(record, raw_doc),
        'source': NAME,
        'dateUpdated': get_date_updated(record),
        'dateCreated': get_date_created(record),
    }

    # import json; print(json.dumps(normalized_dict, indent=4))
    return NormalizedDocument(normalized_dict)
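
The inline file handling above could be lifted into a small helper. A sketch under the assumption that series_names.txt simply holds one approved series name per line:

import os

def load_series_names(path=None):
    # Hypothetical helper mirroring the inline logic above: read one
    # approved series name per line and strip the trailing newlines.
    path = path or os.path.join(os.path.dirname(__file__), 'series_names.txt')
    with open(path) as series_names:
        return [word.replace('\n', '') for word in series_names]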
Example #11
def normalize(raw_doc):
    raw_doc_text = raw_doc.get('doc')
    doc = etree.XML(raw_doc_text)

    title = (doc.xpath("str[@name='title']/node()") or [''])[0]
    description = (doc.xpath("str[@name='abstract']/node()") or [''])[0]

    normalized_dict = {
        'title': copy_to_unicode(title),
        'contributors': get_contributors(doc),
        'properties': get_properties(doc),
        'description': copy_to_unicode(description),
        'id': get_ids(doc, raw_doc),
        'tags': get_tags(doc),
        'source': NAME,
        'dateCreated': get_date_created(doc),
        'dateUpdated': get_date_updated(doc)
    }

    if normalized_dict['id']['url'] == u'':
        return None

    # import json; print(json.dumps(normalized_dict['contributors'], indent=4))
    return NormalizedDocument(normalized_dict)
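
The attribute-based xpath expressions above suggest a Solr-style result document. A minimal illustrative input, with the element layout inferred from the code rather than from the actual source API:

from lxml import etree

# Illustrative input only; the real feed may nest these elements differently.
sample = etree.XML(
    b'<doc>'
    b'<str name="title">An example title</str>'
    b'<str name="abstract">An example abstract</str>'
    b'</doc>'
)
print(sample.xpath("str[@name='title']/node()"))  # ['An example title']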
Example #12
import pytest
import utils

from scrapi.linter.document import NormalizedDocument, RawDocument
from scrapi.processing.elasticsearch import es, ElasticsearchProcessor

test_db = ElasticsearchProcessor()

RAW = RawDocument(utils.RAW_DOC)
NORMALIZED = NormalizedDocument(utils.RECORD)


@pytest.mark.elasticsearch
def test_process_normalized():
    test_db.process_normalized(RAW, NORMALIZED, index='test')

    results = es.search(index='test', doc_type=RAW['source'])
    assert (len(results['hits']['hits']) == 1)


@pytest.mark.elasticsearch
def test_versions():
    NORMALIZED['source'] = RAW['source']
    NORMALIZED['_id'] = RAW['docID']
    test_db.process_normalized(RAW, NORMALIZED, index='test')

    old_title = NORMALIZED['title']
    result = es.search(index='test', doc_type=RAW['source'])['hits']['hits'][0]

    assert (result['_source']['title'] == old_title)
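
These tests are gated behind the elasticsearch pytest marker, so they can be selected on their own. A sketch assuming the marker is registered in the project's pytest configuration and a local Elasticsearch instance is reachable:

import pytest

# Run only the Elasticsearch-backed tests; everything else is deselected.
pytest.main(['-m', 'elasticsearch'])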
Example #13
    def normalize(self, raw_doc):
        transformed = self.transform(json.loads(raw_doc['doc']),
                                     fail=settings.RAISE_IN_TRANSFORMER)
        transformed['shareProperties'] = {'source': self.short_name}
        return NormalizedDocument(transformed)
Example #14
from . import utils

from mock import mock_open, patch

from scrapi.processing.storage import StorageProcessor
from scrapi.linter.document import RawDocument, NormalizedDocument

test_db = StorageProcessor()

RAW = RawDocument(utils.RAW_DOC)
NORMALIZED = NormalizedDocument(utils.NORMALIZED_DOC)


@patch('scrapi.processing.storage.os')
def test_process_normalized(mock_os):
    mock_os.path.exists.return_value = False
    filename = 'archive/{}/{}/normalized.json'.format(RAW['source'],
                                                      RAW['docID'])
    m = mock_open()
    with patch('scrapi.processing.storage.open', m, create=True):
        test_db.process_normalized(RAW, NORMALIZED)

    m.assert_called_once_with(filename, 'w')


@patch('scrapi.processing.storage.os')
def test_process_raw(mock_os):
    mock_os.path.exists.return_value = False
    filename = 'archive/{}/{}/raw.{}'.format(RAW['source'], RAW['docID'],
                                             RAW['filetype'])
    m = mock_open()