def insert_html_images(d):
    """Render a document's raw HTML to an image and record it in page_images.

    The raw HTML is fetched from GridFS, written to a temporary ``.html``
    file, and rendered to ``pageimg.jpg`` (relative to the current working
    directory) with imgkit while a virtual display is running. The image is
    then imported into GridFS and the value returned by that import is
    appended to the document's ``page_images`` list (created when absent).
    Finally the document is written back to ``docs.aug_meta``.

    Inputs:
        d: Mapping returned by a pymongo find; only ``d['_id']`` is read.
    Output:
        The value returned by ``pu.update_doc`` (described by the original
        author as a boolean success indicator).
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db['aug_meta']
    doc_id = d['_id']
    doc = col.find_one({"_id": doc_id})
    html_file = pu.get_from_gridfs(db, doc['raw_file'])

    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        # The context manager closes and removes the temporary file even if
        # rendering fails; the original `f.delete` was a no-op attribute
        # access, so the file was never cleaned up.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.html') as f:
            f.write(html_file)
            # Flush so the renderer, which opens the file by name, sees the
            # complete content.
            f.flush()
            # NOTE(review): the output path is fixed, so concurrent calls
            # sharing a working directory would overwrite each other.
            imgkit.from_file(f.name, 'pageimg.jpg')
    finally:
        # Always shut the virtual display down so repeated calls do not
        # accumulate display processes.
        display.stop()

    # NOTE(review): `py` is used as the pymongo module elsewhere in this
    # function; confirm `import_to_gridfs` really lives on `py` and not on
    # the `pu` helper module.
    b = py.import_to_gridfs(db, 'pageimg.jpg', 'image')
    if 'page_images' not in doc:
        doc['page_images'] = []
    doc['page_images'].append(b)
    success = pu.update_doc(col, doc_id, doc)
    return success
def insert_content_type(d):
    """Store a standardized content type at the top level of a document.

    Reads ``metadata['Content-Type']`` from the stored document, passes it
    through ``standardize_content_type`` and writes the result under the
    top-level ``Content-Type`` key: the first element as ``Content`` and,
    when exactly two elements are returned, the second as ``Charset``.

    Inputs:
        d: Returned ObjectId dictionary from pymongo find; only
           ``d['_id']`` is read.
    Output:
        The value returned by ``pu.update_doc`` (boolean success indicator
        according to the original documentation).
    """
    collection = py.MongoClient('mongo')['docs']['aug_meta']
    record_id = d['_id']
    record = collection.find_one({"_id": record_id})

    parts = standardize_content_type(record['metadata']['Content-Type'])
    entry = {'Content': parts[0]}
    if len(parts) == 2:
        entry['Charset'] = parts[1]
    record['Content-Type'] = entry

    return pu.update_doc(collection, record_id, record)
def insert_glove(d):
    """Attach a GloVe-based document vector to a stored document.

    The document's ``content`` field is passed to ``generate_glove``
    together with the module-level ``model`` and the constant ``300``
    (presumably the vector dimensionality -- confirm against
    ``generate_glove``). The result is converted with ``.tolist()`` and
    stored at ``ml-features.glove``.

    ``model`` must already exist in this module's namespace so that it is
    loaded only once per container.

    Inputs:
        d: Returned ObjectId dictionary from pymongo find; only
           ``d['_id']`` is read.
    Output:
        The value returned by ``pu.update_doc`` (boolean success indicator
        according to the original documentation).
    """
    collection = py.MongoClient('mongo')['docs']['aug_meta']
    record_id = d['_id']
    record = collection.find_one({"_id": record_id})

    embedding = generate_glove(record['content'], model, 300)

    # Create the feature container on first use, then add this vector to it.
    features = record.setdefault('ml-features', dict())
    features['glove'] = embedding.tolist()

    return pu.update_doc(collection, record_id, record)
def insert_doc2vec(d):
    """Attach a doc2vec document vector to a stored document.

    The document's ``content`` field is passed to ``generate_doc2vec``
    together with the module-level ``model`` and the constant ``300``
    (presumably the vector dimensionality -- confirm against
    ``generate_doc2vec``). The result is converted with ``.tolist()`` and
    stored at ``ml-features.doc2vec``.

    The word2vec model must already be loaded as ``model`` in this module's
    namespace so that it is not reloaded on every call.

    Inputs:
        d: Returned ObjectId dictionary from pymongo find; only
           ``d['_id']`` is read.
    Output:
        The value returned by ``pu.update_doc`` (boolean success indicator
        according to the original documentation).
    """
    mongo = py.MongoClient('mongo')
    collection = mongo['docs']['aug_meta']
    target_id = d['_id']
    stored = collection.find_one({"_id": target_id})

    vector = generate_doc2vec(stored['content'], model, 300)

    # Make sure the feature container exists before writing into it.
    stored.setdefault('ml-features', dict())
    stored['ml-features']['doc2vec'] = vector.tolist()

    outcome = pu.update_doc(collection, target_id, stored)
    return outcome
def insert_pdf_images(d):
    """Insert page images for a PDF document.

    Fetches the document's raw file from GridFS, hands it to
    ``import_page_images`` together with a scratch directory, stores the
    returned value under ``page_images`` (replacing any existing value) and
    writes the document back to ``docs.aug_meta``.

    Inputs:
        d: Returned ObjectId dictionary from pymongo find; only
           ``d['_id']`` is read.
    Output:
        The value returned by ``pu.update_doc``.
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db['aug_meta']
    doc_id = d['_id']
    temp_dir = tempfile.mkdtemp()
    try:
        doc = col.find_one({"_id": doc_id})
        pdf_file = pu.get_from_gridfs(db, doc['raw_file'])
        # NOTE(review): the meaning of the final flag (False here, True in
        # insert_office_images) is defined by import_page_images.
        images = import_page_images(db, pdf_file, temp_dir, False)
        doc['page_images'] = images
        success = pu.update_doc(col, doc_id, doc)
    finally:
        # Clean up the scratch directory even when a step above raises;
        # previously a failure left it behind.
        fl.clean_temp_files(temp_dir)
    return success
def insert_office_images(d):
    """Insert page images for an office document.

    Fetches the document's raw file from GridFS, writes it to a temporary
    file on disk, and passes that file's path to ``import_page_images``
    together with a scratch directory. The returned value is stored under
    ``page_images`` (replacing any existing value) and the document is
    written back to ``docs.aug_meta``.

    Inputs:
        d: Returned ObjectId dictionary from pymongo find; only
           ``d['_id']`` is read.
    Output:
        The value returned by ``pu.update_doc``.
    """
    client = py.MongoClient('mongo')
    db = client['docs']
    col = db['aug_meta']
    doc_id = d['_id']
    doc = col.find_one({"_id": doc_id})
    office_file = pu.get_from_gridfs(db, doc['raw_file'])

    temp_dir = tempfile.mkdtemp()
    f = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    try:
        # Close the file before it is read by name: the original left it
        # open and unflushed, so buffered bytes might not have reached disk
        # when import_page_images opened the path.
        with f:
            f.write(office_file)
        # NOTE(review): the meaning of the final flag (True here, False in
        # insert_pdf_images) is defined by import_page_images.
        images = import_page_images(db, f.name, temp_dir, True)
        doc['page_images'] = images
        success = pu.update_doc(col, doc_id, doc)
    finally:
        # Remove the scratch directory and the temp file even on failure.
        fl.clean_temp_files(temp_dir, f.name)
    return success
def extract_postcode(d, b):
    """Extract postcodes from a document and persist them.

    Runs ``find_ukpc`` over the ``text`` field of the matching document in
    ``docs.greenbook``. When the result is truthy it is stored under the
    document's ``postcode`` key, serialised with ``aws.write_dict_json`` and
    uploaded under ``testing/<key>/postcodes.json``, after which the
    document is written back to MongoDB.

    Inputs:
        d: Returned ObjectId dictionary from pymongo find; only
           ``d['_id']`` is read.
        b: Passed through as the second argument of ``aws.put_s3_object``
           (presumably the S3 bucket -- confirm against that helper).
    Output:
        The value returned by ``pu.update_doc`` when postcodes were found,
        otherwise ``None``.
    """
    collection = py.MongoClient('mongo')['docs']['greenbook']
    s3_session = aws.create_session()
    record_id = d['_id']
    record = collection.find_one({"_id": record_id})

    found = find_ukpc(record['text'])
    if not found:
        # Nothing extracted: the document is left untouched.
        return None

    record['postcode'] = found
    object_key = "testing/" + record['key'] + "/postcodes.json"
    payload = aws.write_dict_json(found)
    aws.put_s3_object(s3_session, b, object_key, payload)
    return pu.update_doc(collection, record_id, record)