def normalize(raw_doc): """A function for parsing the list of XML objects returned by the consume function. Returns a list of Json objects in a format that can be recognized by the OSF scrapi.""" raw_doc_str = raw_doc.get('doc') terms_url = 'http://purl.org/dc/terms/' elements_url = 'http://purl.org/dc/elements/1.1/' record = etree.XML(raw_doc_str) title = record.find(str(etree.QName(elements_url, 'title'))).text description = record.find(str(etree.QName(elements_url, 'description'))).text or '' normalized_dict = { 'title': copy_to_unicode(title), 'description': copy_to_unicode(description), 'contributors': get_contributors(record), 'properties': get_properties(record), 'id': get_ids(record, raw_doc), 'source': NAME, 'dateCreated': get_date_created(record), 'dateUpdated' : get_date_updated(record), 'tags': get_tags(record) } return NormalizedDocument(normalized_dict)
def normalize(raw_doc):
    doc = raw_doc.get('doc')
    record = etree.XML(doc)

    # Skip records whose series is not on the approved list
    set_spec = record.xpath('ns0:header/ns0:setSpec/node()', namespaces=NAMESPACES)[0]
    if set_spec.replace('publication:', '') not in series_name_list:
        return None

    title = record.xpath('//dc:title/node()', namespaces=NAMESPACES)[0]
    description = (record.xpath('//dc:description/node()', namespaces=NAMESPACES) or [''])[0]

    normalized_dict = {
        'title': copy_to_unicode(title),
        'contributors': get_contributors(record),
        'properties': get_properties(record),
        'description': copy_to_unicode(description),
        'id': get_ids(record, raw_doc),
        'tags': get_tags(record),
        'source': NAME,
        'dateCreated': get_date_created(record),
        'dateUpdated': get_date_updated(record),
    }
    return NormalizedDocument(normalized_dict)
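# Self-contained sketch of the "(xpath(...) or [default])[0]" idiom used for
# description above (assumes lxml; the record and namespaces are illustrative):
from lxml import etree as _etree

_NAMESPACES = {'dc': 'http://purl.org/dc/elements/1.1/'}
_record = _etree.XML(
    b'<record xmlns:dc="http://purl.org/dc/elements/1.1/">'
    b'<dc:title>Example</dc:title>'
    b'</record>'
)
# xpath() returns a list; "or [default]" supplies a fallback when it is empty
_title = (_record.xpath('//dc:title/node()', namespaces=_NAMESPACES) or [''])[0]
_description = (_record.xpath('//dc:description/node()', namespaces=_NAMESPACES) or [''])[0]
assert _title == 'Example' and _description == ''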
def normalize(self, raw_doc):
    transformed = self.transform(etree.XML(raw_doc['doc']), fail=settings.RAISE_IN_TRANSFORMER)
    transformed['shareProperties'] = {
        'source': self.short_name,
        'docID': raw_doc['docID'],
        'filetype': raw_doc['filetype']
    }
    return NormalizedDocument(transformed, clean=True)
def test_cross_db_with_versions(canonical, destination, monkeypatch, index='test'):
    new_title = 'How to be really good at Zoo Tycoon: The Definitive Guide'
    if canonical == destination:
        return
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    # Get a version in there too
    new_normalized = copy.deepcopy(NORMALIZED.attributes)
    new_normalized['title'] = new_title
    canonical_processor.process_normalized(RAW, NormalizedDocument(new_normalized))

    destination_processor = get_processor(destination)

    # Check that the canonical processor has the versions and the destination does not
    canonical_versions = list(
        canonical_processor.get_versions(docID=RAW['docID'], source=RAW['source']))
    assert len(canonical_versions) == 3
    assert canonical_versions[1].normalized['title'] == NORMALIZED['title']
    assert canonical_versions[2].normalized['title'] == new_title

    destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db, target_db=destination, dry=False, sources=['test'], index=index, versions=True)

    # Check that the document made it to the destination and is still in the canonical
    destination_versions = list(
        destination_processor.get_versions(docID=RAW['docID'], source=RAW['source']))
    assert len(destination_versions) == 3
    assert destination_versions[1].normalized['title'] == NORMALIZED['title']
    assert destination_versions[2].normalized['title'] == new_title

    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc
def normalize(self, raw):
    raw_data = json.loads(raw['doc'])
    document = raw_data['jsonData']

    # Workaround: the push API did not validate contributor emails properly,
    # so drop empty email strings rather than passing them through
    for contributor in document['contributors']:
        if contributor.get('email', None) == '':
            del contributor['email']

    # If the push API marked the record deleted, record that in shareProperties
    if raw_data['status'] == 'deleted':
        document['shareProperties']['status'] = 'deleted'

    return NormalizedDocument(document)
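# Standalone sketch of the push-API workaround above, run against a
# hypothetical payload (names and values are illustrative, not real data):
import json as _json

_raw = {'doc': _json.dumps({
    'status': 'deleted',
    'jsonData': {
        'contributors': [{'name': 'A. Author', 'email': ''}],
        'shareProperties': {},
    },
})}
_data = _json.loads(_raw['doc'])
_document = _data['jsonData']
for _contributor in _document['contributors']:
    if _contributor.get('email', None) == '':
        del _contributor['email']  # drop empty emails the API let through
if _data['status'] == 'deleted':
    _document['shareProperties']['status'] = 'deleted'
assert 'email' not in _document['contributors'][0]
assert _document['shareProperties']['status'] == 'deleted'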
def normalize(raw_doc):
    raw_doc = raw_doc.get('doc')
    record = etree.XML(raw_doc)

    normalized_dict = {
        'title': get_title(record),
        'contributors': get_contributors(record),
        'properties': get_properties(record),
        'description': get_description(record),
        'id': get_ids(record),
        'tags': get_tags(record),
        'source': NAME,
        'dateCreated': get_date_created(record),
        'dateUpdated': get_date_updated(record)
    }
    return NormalizedDocument(normalized_dict)
def normalize(self, raw_doc, property_list):
    str_result = raw_doc.get('doc')
    result = etree.XML(str_result)

    # TODO: add series names filtering support
    payload = {
        'source': self.name,
        'title': self.get_title(result),
        'description': self.get_description(result),
        'id': self.get_ids(result, raw_doc),
        'contributors': self.get_contributors(result),
        'tags': self.get_tags(result),
        'properties': self.get_properties(result, property_list),
        'dateUpdated': self.get_date_updated(result),
        'dateCreated': self.get_date_created(result)
    }
    return NormalizedDocument(payload)
def normalize(raw_doc): raw_doc_text = raw_doc.get('doc') doc = etree.XML(raw_doc_text) title = doc.xpath("//atom:entry/atom:title/node()", namespaces=NAMESPACES)[0] description = (doc.xpath("//atom:summary/node()", namespaces=NAMESPACES) or [""])[0] normalized_dict = { "title": copy_to_unicode(title), "contributors": get_contributors(doc), "properties": get_properties(doc), "description": copy_to_unicode(description), "id": get_ids(doc, raw_doc), "source": NAME, "tags": get_tags(doc), "dateCreated": get_date_created(doc), "dateUpdated": get_date_updated(doc) } return NormalizedDocument(normalized_dict)
def normalize(raw_doc):
    doc = raw_doc.get('doc')
    record = etree.XML(doc)

    title = (record.xpath('//dc:title/node()', namespaces=NAMESPACES) or [''])[0]
    description = (record.xpath('ns0:metadata/oai_dc:dc/dc:description/node()', namespaces=NAMESPACES) or [''])[0]

    normalized_dict = {
        'title': copy_to_unicode(title),
        'contributors': get_contributors(record),
        'properties': get_properties(record),
        'description': copy_to_unicode(description),
        'tags': get_tags(record),
        'id': get_ids(record, raw_doc),
        'source': NAME,
        'dateUpdated': get_date_updated(record),
        'dateCreated': get_date_created(record),
    }
    return NormalizedDocument(normalized_dict)
def normalize(raw_doc):
    doc = raw_doc.get('doc')
    record = etree.XML(doc)

    # Load the list of approved series names from a file
    with open(os.path.join(os.path.dirname(__file__), 'series_names.txt')) as series_names:
        series_name_list = [word.replace('\n', '') for word in series_names]

    set_spec = record.xpath('ns0:header/ns0:setSpec/node()', namespaces=NAMESPACES)[0]
    if set_spec.replace('publication:', '') not in series_name_list:
        print('Series not in approved list, not normalizing...')
        return None

    title = (record.xpath('//dc:title/node()', namespaces=NAMESPACES) or [''])[0]
    description = (record.xpath('ns0:metadata/oai_dc:dc/dc:description/node()', namespaces=NAMESPACES) or [''])[0]

    normalized_dict = {
        'title': copy_to_unicode(title),
        'contributors': get_contributors(record),
        'properties': get_properties(record),
        'description': copy_to_unicode(description),
        'tags': get_tags(record),
        'id': get_ids(record, raw_doc),
        'source': NAME,
        'dateUpdated': get_date_updated(record),
        'dateCreated': get_date_created(record),
    }
    return NormalizedDocument(normalized_dict)
def normalize(raw_doc): raw_doc_text = raw_doc.get('doc') doc = etree.XML(raw_doc_text) title = (doc.xpath("str[@name='title']/node()") or [''])[0] description = (doc.xpath("str[@name='abstract']/node()") or [''])[0] normalized_dict = { 'title': copy_to_unicode(title), 'contributors': get_contributors(doc), 'properties': get_properties(doc), 'description': copy_to_unicode(description), 'id': get_ids(doc, raw_doc), 'tags': get_tags(doc), 'source': NAME, 'dateCreated': get_date_created(doc), 'dateUpdated': get_date_updated(doc) } if normalized_dict['id']['url'] == u'': return None #import json; print json.dumps(normalized_dict['contributors'], indent=4) return NormalizedDocument(normalized_dict)
import pytest

import utils

from scrapi.linter.document import NormalizedDocument, RawDocument
from scrapi.processing.elasticsearch import es, ElasticsearchProcessor

test_db = ElasticsearchProcessor()

RAW = RawDocument(utils.RAW_DOC)
NORMALIZED = NormalizedDocument(utils.RECORD)


@pytest.mark.elasticsearch
def test_process_normalized():
    test_db.process_normalized(RAW, NORMALIZED, index='test')
    results = es.search(index='test', doc_type=RAW['source'])
    assert len(results['hits']['hits']) == 1


@pytest.mark.elasticsearch
def test_versions():
    NORMALIZED['source'] = RAW['source']
    NORMALIZED['_id'] = RAW['docID']
    test_db.process_normalized(RAW, NORMALIZED, index='test')
    old_title = NORMALIZED['title']
    result = es.search(index='test', doc_type=RAW['source'])['hits']['hits'][0]
    assert result['_source']['title'] == old_title
def normalize(self, raw_doc):
    transformed = self.transform(json.loads(raw_doc['doc']), fail=settings.RAISE_IN_TRANSFORMER)
    transformed['shareProperties'] = {'source': self.short_name}
    return NormalizedDocument(transformed)
from . import utils

from mock import mock_open, patch

from scrapi.processing.storage import StorageProcessor
from scrapi.linter.document import RawDocument, NormalizedDocument

test_db = StorageProcessor()

RAW = RawDocument(utils.RAW_DOC)
NORMALIZED = NormalizedDocument(utils.NORMALIZED_DOC)


@patch('scrapi.processing.storage.os')
def test_process_normalized(mock_os):
    mock_os.path.exists.return_value = False
    filename = 'archive/{}/{}/normalized.json'.format(RAW['source'], RAW['docID'])
    m = mock_open()
    with patch('scrapi.processing.storage.open', m, create=True):
        test_db.process_normalized(RAW, NORMALIZED)
    m.assert_called_once_with(filename, 'w')


@patch('scrapi.processing.storage.os')
def test_process_raw(mock_os):
    mock_os.path.exists.return_value = False
    filename = 'archive/{}/{}/raw.{}'.format(RAW['source'], RAW['docID'], RAW['filetype'])
    m = mock_open()
    # The original snippet is truncated here; mirroring test_process_normalized,
    # it presumably continues along these lines:
    with patch('scrapi.processing.storage.open', m, create=True):
        test_db.process_raw(RAW)
    m.assert_called_once_with(filename, 'w')
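# Minimal standalone demonstration of the mock_open pattern these tests rely
# on (assumes the mock package is installed; the path is hypothetical):
from mock import mock_open, patch

def _write_marker(path):
    with open(path, 'w') as f:
        f.write('done')

_m = mock_open()
# Patch the built-in open so no file is actually created
# (on Python 2, the target would be '__builtin__.open')
with patch('builtins.open', _m, create=True):
    _write_marker('archive/example/marker.txt')
_m.assert_called_once_with('archive/example/marker.txt', 'w')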