def pdf(request, filename):
    # the file lives in the OCR bucket when the viewer was opened from the OCR app
    if request.GET.get('src') == 'ocr':
        s3 = S3(settings.AWS_OCR_BUCKET)
    else:
        s3 = S3(settings.AWS_ANNOTATIONS_BUCKET)
    url = s3.get_presigned_url(filename)
    return render(request, 'viewer.html', {'pdf_url': url})

def check_complete(request):
    # check whether the OCR output file exists yet
    filename = request.POST.get('filename')
    task_id = request.POST.get('task_id')
    obj = TaskResult.objects.filter(task_id=task_id)
    if obj.exists():
        obj = obj.first()
        if obj.status == 'SUCCESS':
            s3 = S3(settings.AWS_OCR_BUCKET)
            download_url = s3.get_presigned_download_url(filename)
            response = {
                'status': obj.status,
                'successful': True,
                'download_url': download_url,
                'error_detail': None,
            }
        else:
            response = {
                'status': obj.status,
                'successful': False,
                'download_url': None,
                'error_detail': obj.result,
            }
        return JsonResponse(response)
    else:
        raise HTTPExceptions.NOT_FOUND

def download(request, filename):
    s3 = S3(settings.AWS_OCR_BUCKET)
    if s3.check_file_exists(filename):
        url = s3.get_presigned_url(filename)
        return redirect(url)
    raise HTTPExceptions.NOT_FOUND

def _soffice_process(tempfile_path, filename, md5_hash, process_type):
    '''Create the processed file, upload it to S3, and store a reference.'''
    # LibreOffice needs an individual user-installation environment to run
    # multiple instances; name the directory after the md5 hash, which is
    # already unique to this upload.
    loffice_environ_path = os.path.join('/tmp', md5_hash)
    os.makedirs(loffice_environ_path, exist_ok=True)
    s = filename.split('.')
    child_name = '.'.join(s[:-1]) + '.' + process_type
    extension = s[-1]
    outpath = os.path.join('/tmp', child_name)
    # os.system does not raise on failure, so check the exit status explicitly
    status = os.system(
        '/usr/bin/soffice -env:UserInstallation=file://%s '
        '--headless --convert-to %s %s --outdir %s'
        % (loffice_environ_path, process_type, tempfile_path, '/tmp'))
    if status != 0:
        raise HTTPExceptions.UNPROCESSABLE_ENTITY
    s3 = S3(settings.AWS_ANNOTATIONS_BUCKET)
    with open(outpath, 'rb') as saved_file:
        s3.save_to_bucket(child_name, saved_file)
    # save a reference to the db
    ref = FileUpload(filename=child_name, md5_hash=md5_hash,
                     extension=extension, is_original=False)
    ref.save()
    cleanup_temp_file(child_name)
    cleanup_temp_file(filename)
    # remove the LibreOffice environment directory
    shutil.rmtree(loffice_environ_path, ignore_errors=True)
    return child_name

def download(request, filename):
    s3 = S3(settings.AWS_OCR_BUCKET)
    if s3.check_file_exists(filename):
        url = s3.get_presigned_download_url(filename, expire=240000)
        r = requests.get(url=url, stream=True)
        r.raise_for_status()
        response = HttpResponse(r.raw, content_type='application/pdf')
        response['Content-Disposition'] = f'inline; filename={filename}'
        return response
    raise HTTPExceptions.NOT_FOUND

def csv_view(request, filename):
    s3 = S3(settings.AWS_ANNOTATIONS_BUCKET)
    file_obj = s3.download_fileobj_from_bucket(filename)
    csv_data = file_obj.getvalue().decode('utf-8', 'ignore')
    reader = csv.reader(csv_data.splitlines())
    full_content = [row for row in reader]
    headers = full_content[0]
    content = full_content[1:]
    return render(request, 'csv_table.html', locals())

def download_static(request, filename):
    '''Download documents addressed with the docdrop-v1 URL format.'''
    s3 = S3(settings.AWS_ANNOTATIONS_BUCKET)
    if s3.check_file_exists(filename):
        url = s3.get_presigned_url(filename, expire=240000,
                                   content_type="application/pdf")
        r = requests.get(url=url, stream=True)
        r.raise_for_status()
        response = HttpResponse(r.raw, content_type='application/pdf')
        response['Content-Disposition'] = f'inline; filename={filename}'
        return response
    raise HTTPExceptions.NOT_FOUND

def ocr_pdf(filename, parent_id, md5_hash, force_flag):
    if not os.path.exists('/tmp/ocr_clients'):
        os.makedirs('/tmp/ocr_clients')
    lockfile = os.path.join('/tmp/ocr_clients', md5_hash)
    try:
        # prevent too many heavy OCR processes from running at once
        current_process_count = len(os.listdir('/tmp/ocr_clients'))
        if current_process_count >= int(settings.MAX_SIM_OCR_PROCESSES):
            raise MaxProcessesExceededError()
        # register this process by creating a lockfile; mode 'x' fails if it exists
        try:
            f = open(lockfile, 'x')
            f.close()
        except FileExistsError:
            raise FileInProcessError()
        input_path = os.path.join('/tmp', filename)
        # download the file from S3 and save it locally
        s3 = S3(settings.AWS_OCR_BUCKET)
        file_obj = s3.download_fileobj_from_bucket(filename)
        with open(input_path, 'wb') as tmpfile:
            tmpfile.write(file_obj.getbuffer())
        basename = '.'.join(filename.split('.')[:-1])
        if force_flag:
            processed_filename = basename + '_ocr_force.pdf'
            ocr_option = '--force-ocr'
        else:
            processed_filename = basename + '_ocr.pdf'
            ocr_option = ''
        output_path = os.path.join('/tmp', processed_filename)
        cmd = '/usr/bin/ocrmypdf {} {} {}'.format(ocr_option, input_path, output_path)
        subprocess.check_output(cmd, shell=True)
        # save the OCRd file to S3 and hash it for the db record
        with open(output_path, 'rb') as file_:
            s3.save_to_bucket(processed_filename, file_)
            file_.seek(0)
            hash_ = md5(file_.read()).hexdigest()
        # record the result in the db
        ref = OCRUpload(filename=processed_filename, md5_hash=hash_,
                        is_original=False, is_forced=force_flag,
                        parent_id=parent_id)
        ref.save()
        # release the lockfile and clean up temp files
        os.remove(lockfile)
        cleanup_temp_file(filename)
        cleanup_temp_file(processed_filename)
    except Exception as e:
        try:
            os.remove(lockfile)
            cleanup_temp_file(filename)
            cleanup_temp_file(processed_filename)
        except Exception:
            pass
        raise e

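# save_temp_file() and cleanup_temp_file() are shared helpers that are not
# defined in this file. The sketch below is only an illustration of the
# behaviour their call sites imply (a (md5_hash, path) return value and
# removal of the /tmp copy); the project's actual helpers may differ.
from hashlib import md5  # likely already imported at module level


def save_temp_file(filename, file_):
    '''Write an uploaded file to /tmp/<filename>; return (md5_hash, path).'''
    tempfile_path = os.path.join('/tmp', filename)
    hasher = md5()
    with open(tempfile_path, 'wb') as out:
        for chunk in file_.chunks():  # Django UploadedFile API
            hasher.update(chunk)
            out.write(chunk)
    return hasher.hexdigest(), tempfile_path


def cleanup_temp_file(filename):
    '''Remove /tmp/<filename> if it exists.'''
    try:
        os.remove(os.path.join('/tmp', filename))
    except FileNotFoundError:
        pass
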
def upload(request):
    if request.method == 'POST':
        file_ = request.FILES.get('pdf-file')
        processing_error = None
        if file_ is None:
            raise HTTPExceptions.NOT_ACCEPTABLE  # error code 406
        filename = file_.name
        if not filename or len(filename) < 3 or '.' not in filename:
            raise SuspiciousFileOperation('improper file name')
        filename = sanitize(filename)
        filename = filename.replace("'", '').replace('"', '')
        filename = re.sub(r"[(),\s]+", "-", filename)
        temp = filename.split('.')
        basename = '.'.join(temp[:-1])
        extension = temp[-1]
        if extension not in ('pdf', 'PDF'):
            raise SuspiciousFileOperation('improper file type')
        basename = basename[:60]
        new_filename = '{0}-{1}.{2}'.format(basename, randword(5), extension)
        # save to /tmp
        md5_hash, tempfile_path = save_temp_file(new_filename, file_)
        # does the file already exist in the system?
        existing_name = check_ocr_file_exists(md5_hash)
        # does it already have a text layer?
        if check_pdf_has_text(new_filename):
            processing_error = ('This PDF already has text. Use the "Force OCR" button to '
                                'overwrite the text with a fresh OCR if desired. If the file '
                                'was OCRd on a previous upload, those results will be provided.')
        if not existing_name:
            already_exists = False
            # upload the original to S3
            s3 = S3(settings.AWS_OCR_BUCKET)
            with open(tempfile_path, 'rb') as saved_file:
                s3.save_to_bucket(new_filename, saved_file)
            ref = OCRUpload(filename=new_filename, md5_hash=md5_hash, is_original=True)
            ref.save()
            cleanup_temp_file(new_filename)
        else:
            already_exists = True
            cleanup_temp_file(new_filename)
            new_filename = existing_name
        data = {
            'file_info': {
                'filename': filename,
                'size': file_.size,
                'new_filename': new_filename,
                'processing_error': processing_error,
                'tempfile_path': tempfile_path,
                'already_exists': already_exists,
                'md5_hash': md5_hash,
            }
        }
        return JsonResponse(data)
    return HttpResponseNotAllowed(['POST'])

def upload(request):
    if request.method == 'POST':
        file_ = request.FILES['file']
        filename = file_.name
        if not filename or len(filename) < 3 or '.' not in filename:
            raise SuspiciousFileOperation('improper file name')
        filename = sanitize(filename)
        filename = filename.replace("'", '').replace('"', '')
        filename = re.sub(r"[(),\s]+", "-", filename)
        temp = filename.split('.')
        basename = '.'.join(temp[:-1])
        extension = temp[-1]
        basename = basename[:60]
        new_filename = '{0}-{1}.{2}'.format(basename, randword(5), extension)
        # save the file to disk temporarily; it is deleted after the upload to S3
        md5_hash, tempfile_path = save_temp_file(new_filename, file_)
        extension = extension.lower()
        # if the file (or a processed child) already exists, return its name
        existing_name = check_file_exists(md5_hash)
        if existing_name:
            cleanup_temp_file(new_filename)
            return HttpResponse(existing_name)
        # convert to another file type if needed
        process_to_file_type = False
        if extension in ['doc', 'docx', 'odt', 'ott', 'rtf', 'odp', 'ppt', 'pptx']:
            process_to_file_type = 'pdf'
        if extension in ['xls', 'xlsx', 'ods']:
            process_to_file_type = 'csv'
        if process_to_file_type:
            child_name = _soffice_process(
                tempfile_path, new_filename, md5_hash, process_to_file_type)
            if child_name:
                cleanup_temp_file(child_name)
                return HttpResponse(child_name)
            cleanup_temp_file(child_name)
            raise HTTPExceptions.UNPROCESSABLE_ENTITY
        if extension == 'pdf':
            # reject image-only PDFs: the file must already have a text layer
            if not check_pdf_has_text(new_filename):
                cleanup_temp_file(new_filename)
                raise HTTPExceptions.NOT_ACCEPTABLE  # error code 406
        # upload to cloud
        s3 = S3(settings.AWS_ANNOTATIONS_BUCKET)
        with open(tempfile_path, 'rb') as saved_file:
            s3.save_to_bucket(new_filename, saved_file)
        # save a reference to the db
        ref = FileUpload(filename=new_filename, md5_hash=md5_hash,
                         extension=extension, is_original=True)
        ref.save()
        cleanup_temp_file(new_filename)
        return HttpResponse(new_filename)
    return HttpResponseNotAllowed(['POST'])

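# check_pdf_has_text() is referenced by both upload views but defined
# elsewhere. One plausible implementation, sketched here with pypdf purely
# for illustration; the real helper may use a different PDF library.
from pypdf import PdfReader  # assumed dependency for this sketch only


def check_pdf_has_text(filename):
    '''Return True if any page of /tmp/<filename> already has a text layer.'''
    reader = PdfReader(os.path.join('/tmp', filename))
    for page in reader.pages:
        text = page.extract_text() or ''
        if text.strip():
            return True
    return False
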
def epub(request, filename):
    s3 = S3(settings.AWS_ANNOTATIONS_BUCKET)
    url = s3.get_presigned_url(filename)
    return render(request, 'epub.html', {'book_url': url})
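

# The S3 wrapper used throughout the views above is defined elsewhere. The
# class below is a minimal sketch of what those call sites assume, built on
# boto3; method names are taken from the calls above, but the implementation
# details are an illustration, not the project's actual helper.
import io

import boto3
from botocore.exceptions import ClientError


class S3:
    '''Thin wrapper around a single S3 bucket (illustrative sketch).'''

    def __init__(self, bucket):
        self.bucket = bucket
        self.client = boto3.client('s3')

    def get_presigned_url(self, key, expire=3600, content_type=None):
        # presigned GET URL suitable for viewing the object in the browser
        params = {'Bucket': self.bucket, 'Key': key}
        if content_type:
            params['ResponseContentType'] = content_type
        return self.client.generate_presigned_url(
            'get_object', Params=params, ExpiresIn=expire)

    def get_presigned_download_url(self, key, expire=3600):
        # same as above, but instructs the browser to download the file
        params = {
            'Bucket': self.bucket,
            'Key': key,
            'ResponseContentDisposition': 'attachment; filename={}'.format(key),
        }
        return self.client.generate_presigned_url(
            'get_object', Params=params, ExpiresIn=expire)

    def save_to_bucket(self, key, fileobj):
        self.client.upload_fileobj(fileobj, self.bucket, key)

    def download_fileobj_from_bucket(self, key):
        buffer = io.BytesIO()
        self.client.download_fileobj(self.bucket, key, buffer)
        buffer.seek(0)
        return buffer

    def check_file_exists(self, key):
        try:
            self.client.head_object(Bucket=self.bucket, Key=key)
            return True
        except ClientError:
            return False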