Exemple #1
0
def process():
    """Process the document carried by the current request and index it.

    The JSON-encoded ``doc`` and the ``timestamp`` are read from the form
    data for POST requests and from the query string for any other method.
    The decoded document is stamped with the timestamp, handed to
    ``process_docs.process`` and then pushed to ``search.update``.

    Returns:
        The value returned by ``process_docs.process``.
    """
    # Select the parameter source once instead of duplicating both reads.
    params = request.form if request.method == 'POST' else request.args

    payload = json.loads(params.get('doc'))
    stamp = params.get('timestamp')
    payload['timestamp'] = stamp

    result = process_docs.process(payload, stamp)
    # The decoded (and stamped) request document is what gets indexed,
    # not the return value of process_docs.process.
    search.update('scrapi', payload, payload['source'], payload['id'])
    return result
Exemple #2
0
def process():
    """Decode, process and index the document sent with the request.

    For POST requests ``doc`` (a JSON string) and ``timestamp`` come from
    the submitted form; for every other method they come from the query
    string.  The timestamp is written into the decoded document before it
    is passed to ``process_docs.process`` and to ``search.update``.

    Returns:
        Whatever ``process_docs.process`` returns for the document.
    """
    if request.method == 'POST':
        source = request.form
    else:
        source = request.args

    document = json.loads(source.get('doc'))
    when = source.get('timestamp')
    document['timestamp'] = when

    outcome = process_docs.process(document, when)
    # Index the stamped request document under its own source and id.
    search.update('scrapi', document, document['source'], document['id'])
    return outcome
Exemple #3
0
def _normalize(result, timestamp, registry, manifest):
    """Normalize a raw result, process it and push it to the search index.

    Args:
        result: Raw document; its ``doc_id`` is read with ``result.get``.
        timestamp: Object exposing ``isoformat()``; also forwarded to the
            normalizer and to ``process_docs.process``.
        registry: Mapping keyed by ``manifest['directory']`` whose entries
            provide a ``'normalize'`` callable.
        manifest: Mapping supplying the ``'directory'`` and ``'name'`` keys.

    Returns:
        The value returned by ``process_docs.process``.  When it is not
        ``None`` its ``attributes`` have been extended with ``source``,
        ``location`` and ``iso_timestamp`` and it has been sent to
        ``search.update``.
    """
    iso_timestamp = timestamp.isoformat()
    normalized = registry[manifest['directory']]['normalize'](result, timestamp)
    logger.info('Document {0} normalized successfully'.format(result.get("doc_id")))
    doc = process_docs.process(normalized, timestamp)
    if doc is not None:
        doc.attributes['source'] = manifest['name']
        # Store the path as a plain string.  The previous code ended this
        # assignment with a trailing comma, which silently wrapped the path
        # in a one-element tuple.
        doc.attributes['location'] = (
            "archive/{source}/{doc_id}/{timestamp}/normalized.json".format(
                source=manifest['directory'],
                doc_id=doc.get('id').get('service_id'),
                timestamp=doc.get('timestamp'),
            )
        )
        doc.attributes['iso_timestamp'] = str(iso_timestamp)
        logger.info('Document {0} processed successfully'.format(result.get("doc_id")))
        search.update('scrapi', doc.attributes, manifest['directory'], result.get("doc_id"))
    return doc
Exemple #4
0
    def test_process_legal(self):
        """Check that a raw and a normalized document both reach the archive.

        First a raw document is processed and the directory holding its
        ``raw.json`` must be named after the returned timestamp.  Then a
        normalized document built from the same identifiers is processed
        and a ``normalized.json`` must exist somewhere under the same
        archive directory.
        """
        raw_doc = RawDocument({
            'doc': json.dumps({'Hello': 'world'}),
            'source': 'TEST',
            'doc_id': 37,
            'filetype': 'json'
        })
        ts = str(process_docs.process_raw(raw_doc, 'test-version'))

        doc_id = raw_doc.get('doc_id')
        archive_root = 'archive/TEST/{0}'.format(doc_id)

        # No early exit: when several directories contain raw.json the last
        # one visited determines the timestamp.
        timestamp = None
        for current_dir, _subdirs, _files in os.walk(archive_root):
            if os.path.isfile(current_dir + '/raw.json'):
                timestamp = current_dir.split('/')[-1]
        assert timestamp == ts

        stamp_text = str(timestamp)
        normalized_fields = {
            'title': "TEST PROJECT",
            'contributors': [
                {'full_name': 'Me, Myself', 'email': '*****@*****.**'},
                {'full_name': 'And I', 'email': '*****@*****.**'},
            ],
            'properties': {},
            'meta': {},
            'id': {
                'service_id': doc_id,
                'doi': 'Not available',
                'url': 'fake.stuff.org/{}'.format(doc_id),
            },
            'source': raw_doc.get('source'),
            'timestamp': stamp_text,
            'tags': ['1', '2', '3'],
            'date_created': stamp_text,
            'description': 'science stuff',
        }
        doc = NormalizedDocument(normalized_fields)

        assert process_docs.process(doc, timestamp)

        # A normalized.json anywhere below the archive root is sufficient.
        found = any(
            os.path.isfile(current_dir + '/normalized.json')
            for current_dir, _subdirs, _files in os.walk(archive_root)
        )
        assert found
Exemple #5
0
def _normalize(result, timestamp, registry, manifest):
    """Run the registered normalizer on a result, then process and index it.

    Args:
        result: Raw document; ``result.get("doc_id")`` is used for logging
            and as the identifier passed to ``search.update``.
        timestamp: Object with an ``isoformat()`` method; passed unchanged
            to the normalizer and to ``process_docs.process``.
        registry: Mapping from ``manifest['directory']`` to an entry holding
            a ``'normalize'`` callable.
        manifest: Mapping that provides ``'directory'`` and ``'name'``.

    Returns:
        The result of ``process_docs.process``.  If it is not ``None`` its
        ``attributes`` gain ``source``, ``location`` and ``iso_timestamp``
        and are sent to ``search.update`` before it is returned.
    """
    iso_timestamp = timestamp.isoformat()
    normalized = registry[manifest['directory']]['normalize'](result,
                                                              timestamp)
    logger.info('Document {0} normalized successfully'.format(
        result.get("doc_id")))
    doc = process_docs.process(normalized, timestamp)
    if doc is not None:
        doc.attributes['source'] = manifest['name']
        # The location must be a string.  A stray trailing comma after the
        # .format(...) call used to turn this value into a 1-tuple.
        doc.attributes['location'] = (
            "archive/{source}/{doc_id}/{timestamp}/normalized.json".format(
                source=manifest['directory'],
                doc_id=doc.get('id').get('service_id'),
                timestamp=doc.get('timestamp'),
            )
        )
        doc.attributes['iso_timestamp'] = str(iso_timestamp)
        logger.info('Document {0} processed successfully'.format(
            result.get("doc_id")))
        search.update('scrapi', doc.attributes, manifest['directory'],
                      result.get("doc_id"))
    return doc