Example #1
def insert_meta(b, f, c):
    """Insert TIKA extracted metadata and content.

    Input:
        b: S3 read bucket name
        f: S3 object key of the source document
        c: MongoDB collection name
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db[c]

    # Fetch the raw document bytes from S3.
    session = aws.create_session()
    doc_stream = aws.get_s3_object(session, b, f).get()["Body"].read()

    # Hash the bytes so duplicate uploads can be detected.
    sha1 = fl.create_sha(doc_stream, True)

    # Look up any existing document with the same SHA1; the projection keeps
    # only the original object's key.
    meta_exists = list(col.find({"sha1": sha1}, {"key": True, "_id": False}))

    if meta_exists:
        # Duplicate upload: store a stub document that points at the original key.
        doc = dict()
        doc['key'] = f
        doc['sha1'] = sha1
        doc['uuid'] = fl.create_uuid()
        doc['duplicate'] = meta_exists[0]['key']
        success = pu.create_doc(col, doc)
        return success

    # Not a duplicate: extract content and metadata with TIKA.
    doc = get_tika_content_stream(doc_stream)
    doc['key'] = f
    doc['sha1'] = sha1
    doc['uuid'] = fl.create_uuid()

    if 'content' in doc:
        if doc['content'] != "":
            # Write the extracted text to the writable bucket and keep only a
            # boolean flag in the mongo document.
            s3_txt = aws.write_dict_json(doc['content'])
            write_s3_txt = aws.put_s3_object(session, b + "-writable",
                                             doc['key'] + "/extracted.json",
                                             s3_txt)
            doc['content'] = True
        else:
            # Drop empty content so it is not stored.
            doc.pop('content', None)

    if 'metadata' in doc:
        # Persist the TIKA metadata next to the extracted text.
        s3_meta = aws.write_dict_json(doc['metadata'])
        write_s3_meta = aws.put_s3_object(session, b + "-writable",
                                          doc['key'] + "/metadata.json",
                                          s3_meta)

    if 'attachments' in doc:
        if doc['attachments'] != []:
            # Record the attachment count, then replace each attachment with the
            # result of inserting its own metadata document.
            doc['no_attach'] = len(doc['attachments'])
            attachments = doc['attachments']
            doc['attachments'] = [
                insert_attachments_meta(db, attachments.get(x), x, c, b, f,
                                        session) for x in attachments
            ]
    success = pu.create_doc(col, doc)

    return success
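In these snippets `py`, `aws`, `fl`, and `pu` are module-level aliases from the source repository; Example #4 shows that `aws` is `docproc.awsutil`, while the pymongo and file/hash helpers are not shown here. Neither is `get_tika_content_stream`. Below is a minimal sketch of how that helper and the hashing helper could look, using the tika-python package; the package choice, signatures, and bodies are assumptions, not the repository's actual code.

import hashlib

from tika import parser  # assumed dependency: pip install tika


def get_tika_content_stream(doc_stream):
    """Hypothetical sketch: run Apache Tika over raw document bytes.

    Returns a dict with 'metadata' and 'content' keys, which is the shape
    insert_meta expects; extracting 'attachments' would need tika's unpack
    module rather than the plain parser.
    """
    parsed = parser.from_buffer(doc_stream)
    return {
        'metadata': parsed.get('metadata', {}),
        'content': parsed.get('content') or "",
    }


def create_sha(doc_stream, binary=True):
    """Hypothetical stand-in for fl.create_sha: SHA-1 hex digest of the bytes."""
    data = doc_stream if binary else doc_stream.encode('utf-8')
    return hashlib.sha1(data).hexdigest()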
Example #2
def create_document(b, k):
    """Create mongo document.

    Creates a mongo document from an S3 JSON file.
    Input:
        b: S3 bucket name
        k: S3 key
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db['greenbook']

    # Read the JSON object from S3 and insert it as a document.
    session = aws.create_session()
    s3_obj = aws.get_s3_object(session, b, k)
    success = pu.create_doc(col, aws.read_s3_json(s3_obj))

    return success
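Example #2 leans on two helpers that are not shown: aws.read_s3_json and pu.create_doc. A rough sketch of what they might do, based only on how they are called in these snippets (names and bodies are assumptions, not the repository's actual code):

import json


def read_s3_json(s3_obj):
    """Hypothetical stand-in for aws.read_s3_json: parse a JSON S3 object body."""
    return json.loads(s3_obj.get()["Body"].read())


def create_doc(col, doc):
    """Hypothetical stand-in for pu.create_doc: insert and return the new _id."""
    return col.insert_one(doc).inserted_id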
Example #3
def extract_postcode(d, b):
    """Extract postcode.

    Input:
        d: mongo document (only its _id is used)
        b: S3 bucket for the extracted postcodes
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db['greenbook']

    session = aws.create_session()

    # Re-fetch the document and scan its text for UK postcodes.
    doc_id = d['_id']
    doc = col.find_one({"_id": doc_id})
    postcodes = find_ukpc(doc['text'])
    if postcodes:
        # Store the postcodes on the document and also write them to S3.
        doc['postcode'] = postcodes
        pc_key = "testing/" + doc['key'] + "/postcodes.json"
        s3_pc = aws.write_dict_json(postcodes)
        write_s3 = aws.put_s3_object(session, b, pc_key, s3_pc)
        success = pu.update_doc(col, doc_id, doc)
    else:
        success = None

    return success
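find_ukpc is not defined in the snippet; given its name and how the result is stored, it presumably returns a list of UK postcodes found in the text. A hypothetical regex-based sketch (the real implementation may differ):

import re

# Rough UK postcode shape (outward code + inward code); not a full validator.
UKPC_RE = re.compile(r"\b[A-Z]{1,2}[0-9][A-Z0-9]?\s*[0-9][A-Z]{2}\b", re.IGNORECASE)


def find_ukpc(text):
    """Hypothetical stand-in for find_ukpc: return all postcode-like matches."""
    return UKPC_RE.findall(text or "")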
Example #4
import boto3
import os
from rq import Queue
from redis import Redis
import docproc.awsutil as aws
from docproc.mgtika import insert_meta
import time

start_time = time.time()

# Connect to Redis and create the RQ work queue.
redis_conn = Redis(host='redis')
q = Queue(connection=redis_conn)

# Buckets, read prefix, and target collection come from the environment.
session = aws.create_session()
rd = os.environ['S3_READ_BUCKET']
wt = os.environ['S3_WRITE_BUCKET']
rd_path = os.environ['S3_READ_PATH']
col = os.environ['COLLECTION']
rd_bucket = aws.get_s3_bucket(session, rd)

# Enqueue one insert_meta job per object under the read prefix.
jobs = 0
runs = 1
for run in range(runs):
    for obj in rd_bucket.objects.filter(Prefix=rd_path):
        job = q.enqueue(insert_meta, rd, obj.key, col)
        jobs += 1
print("Submitted: " + str(jobs) + " jobs")
elapsed_time = time.time() - start_time
print("submission time:", elapsed_time)