def pdf_ocr(bucket_name, blob_names, data_dir="raw_ocr/"):
    """Run async Vision DOCUMENT_TEXT_DETECTION on PDFs in a GCS bucket.

    Every blob in ``blob_names`` is submitted in a single batch request;
    the raw JSON output is written back into the same bucket under
    ``data_dir``, two pages per output file.
    """
    mime_type = 'application/pdf'
    batch_size = 2  # pages grouped into each json output file

    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    requests = []
    print("Reading file(s): ")
    for name in blob_names:
        src = "gs://" + bucket_name + '/' + name
        print("Reading {}...".format(src))
        input_config = vision.InputConfig(
            gcs_source=vision.GcsSource(uri=src), mime_type=mime_type)

        dst = "gs://" + bucket_name + '/' + data_dir + name
        print("Saving raw ocr data to {}...".format(dst))
        output_config = vision.OutputConfig(
            gcs_destination=vision.GcsDestination(uri=dst),
            batch_size=batch_size)

        requests.append(vision.AsyncAnnotateFileRequest(
            features=[feature],
            input_config=input_config,
            output_config=output_config))

    # One batch call covers every file; block until the service is done.
    operation = client.async_batch_annotate_files(requests=requests)
    print('Waiting for the operation to finish.')
    operation.result(timeout=420)
    print('Done')
def async_detect_document(gcs_source_uri, gcs_destination_uri, number_of_pages):
    """OCR a PDF on GCS and save the Vision JSON output locally.

    Runs async DOCUMENT_TEXT_DETECTION on ``gcs_source_uri`` (a PDF),
    writing results under ``gcs_destination_uri``; all pages are grouped
    into ``number_of_pages``-sized JSON output files. The newest output
    blob under the destination prefix is then downloaded and re-saved as
    ``Json<source-basename>.json`` in the working directory.

    Returns the parsed JSON response (dict).
    """
    import json
    import re
    import os
    from google.cloud import vision
    from google.cloud import storage

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'
    # How many pages should be grouped into each json output file.
    batch_size = number_of_pages

    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    input_config = vision.InputConfig(
        gcs_source=vision.GcsSource(uri=gcs_source_uri), mime_type=mime_type)
    output_config = vision.OutputConfig(
        gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
        batch_size=batch_size)
    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(requests=[async_request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    output_file_name = 'Json' + os.path.basename(gcs_source_uri) + '.json'

    # Download the converted output. Bug fix: the bucket and prefix are
    # now derived from the destination URI instead of a hard-coded
    # "mypdf_1" bucket that was scanned in full — the old code grabbed
    # the newest blob in the whole bucket, which could be an unrelated
    # file.
    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/?(.*)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)
    blobs = [(blob, blob.updated)
             for blob in storage_client.list_blobs(bucket_name, prefix=prefix)]

    # Sort by last-updated time and take the most recent output blob.
    latest = sorted(blobs, key=lambda tup: tup[1])[-1][0]
    json_data = json.loads(latest.download_as_string())

    with open(output_file_name, 'w') as outfile:
        json.dump(json_data, outfile)
    print("Successfully created the json file")
    return json_data
def async_detect_document(gcs_source_uri, gcs_destination_uri, file_type):
    """OCR with PDF/TIFF as source files on GCS"""
    # file_type must be one of the supported mime types:
    # 'application/pdf' or 'image/tiff'.
    mime_type = file_type
    batch_size = 1  # one page per JSON output file

    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=vision.InputConfig(
            gcs_source=vision.GcsSource(uri=gcs_source_uri),
            mime_type=mime_type),
        output_config=vision.OutputConfig(
            gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
            batch_size=batch_size))

    operation = client.async_batch_annotate_files(requests=[request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # The output now lives on GCS; list everything under the
    # destination prefix.
    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket = storage_client.get_bucket(match.group(1))
    blob_list = list(bucket.list_blobs(prefix=match.group(2)))

    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # With batch_size=1 the first output file holds exactly the first
    # page; return its fullTextAnnotation.
    response = json.loads(blob_list[0].download_as_string())
    return response['responses'][0]['fullTextAnnotation']
def detect_text_from_pdf(gcs_source_uri, gcs_destination_uri):
    """OCR a GCS-hosted PDF and return its parsed full text.

    Submits an async DOCUMENT_TEXT_DETECTION request, waits for it,
    then reads the first JSON output file, concatenates the full text
    of every page it contains, and hands the result to parse_text().
    """
    mime_type = 'application/pdf'
    batch_size = 50  # pages per JSON output file

    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=vision.InputConfig(
            gcs_source=vision.GcsSource(uri=gcs_source_uri),
            mime_type=mime_type),
        output_config=vision.OutputConfig(
            gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
            batch_size=batch_size))

    operation = client.async_batch_annotate_files(requests=[request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=10000)

    # The results were written to GCS; locate them via the destination URI.
    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket = storage_client.get_bucket(match.group(1))
    blob_list = list(bucket.list_blobs(prefix=match.group(2)))

    # Only the first output file is read; with batch_size=50 it covers
    # the first 50 pages of the input.
    response = json.loads(blob_list[0].download_as_string())
    full_text = ''.join(res['fullTextAnnotation']['text']
                        for res in response['responses'])
    return parse_text(full_text)
def detect_pdf_text(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS.

    Returns the list of JSON output blobs written under the
    destination prefix (the caller parses them).
    """
    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'
    batch_size = 50  # pages grouped into each json output file

    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    input_config = vision.InputConfig(
        gcs_source=vision.GcsSource(uri=gcs_source_uri), mime_type=mime_type)
    output_config = vision.OutputConfig(
        gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
        batch_size=batch_size)
    request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(requests=[request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # The OCR output now sits on GCS; collect the blobs under the prefix.
    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket = storage_client.get_bucket(match.group(1))
    blob_list = list(bucket.list_blobs(prefix=match.group(2)))

    print('Output files:')
    for blob in blob_list:
        print(blob.name)
    return blob_list
def from_document(source_uri: str, destination_uri: str):
    """OCR a PDF on GCS and return its text plus the output blob names.

    Returns a tuple ``(texts, names)``: ``texts`` holds the full text
    of the first response in every JSON output file (each file covers
    two pages), ``names`` the corresponding output blobs' names.
    """
    mime_type = 'application/pdf'  # 'application/pdf' or 'image/tiff'
    batch_size = 2  # pages grouped into each json output file

    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=vision.InputConfig(
            gcs_source=vision.GcsSource(uri=source_uri),
            mime_type=mime_type),
        output_config=vision.OutputConfig(
            gcs_destination=vision.GcsDestination(uri=destination_uri),
            batch_size=batch_size))

    operation = client.async_batch_annotate_files(requests=[request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=360)

    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    bucket = storage_client.get_bucket(match.group(1))
    blob_list = list(bucket.list_blobs(prefix=match.group(2)))
    blob_names = [blob.name for blob in blob_list]

    # Each JSON file covers batch_size pages; only the first response
    # of every file is collected (i.e. page 1 of each two-page batch).
    text_response = []
    for output in blob_list:
        response = json.loads(output.download_as_string())
        annotation = response['responses'][0]['fullTextAnnotation']
        text_response.append(annotation['text'])
    return text_response, blob_names
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """Kick off async PDF OCR on GCS and block until it completes.

    Results are written as JSON to ``gcs_destination_uri``; nothing is
    returned — callers read the output from GCS themselves.
    """
    mime_type = 'application/pdf'
    batch_size = 100  # pages grouped into each json output file

    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=vision.InputConfig(
            gcs_source=vision.GcsSource(uri=gcs_source_uri),
            mime_type=mime_type),
        output_config=vision.OutputConfig(
            gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
            batch_size=batch_size))

    operation = client.async_batch_annotate_files(requests=[request])
    print('Waiting for operation to finish')
    operation.result(timeout=400)
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""
    import json
    import re
    from google.cloud import vision
    from google.cloud import storage

    # NOTE(review): the entire annotate request is disabled behind
    # ``if False`` — presumably to avoid re-submitting (and re-billing)
    # the OCR job when the output already exists on GCS. Only the
    # result-listing/printing below actually runs; confirm this is
    # intentional before shipping.
    if False:
        # Supported mime_types are: 'application/pdf' and 'image/tiff'
        mime_type = 'application/pdf'

        client = vision.ImageAnnotatorClient()

        feature = vision.Feature(
            type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

        gcs_source = vision.GcsSource(uri=gcs_source_uri)
        input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

        gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
        # No batch_size is given, so the service's default page grouping
        # applies.
        output_config = vision.OutputConfig(gcs_destination=gcs_destination)

        async_request = vision.AsyncAnnotateFileRequest(
            features=[feature], input_config=input_config, output_config=output_config)

        operation = client.async_batch_annotate_files(requests=[async_request])

        print('Waiting for the operation to finish.')
        operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Print the full text of every page from every output file.
    print('Full text:\n')
    for output in blob_list:
        json_string = output.download_as_string()
        response = json.loads(json_string)
        # Pages with no detected text lack a 'fullTextAnnotation' key,
        # so guard before printing.
        for page in response['responses']:
            if 'fullTextAnnotation' in page.keys():
                print(page['fullTextAnnotation']['text'])
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    # Purpose: OCR an Aadhaar-card PDF on GCS with Vision
    # DOCUMENT_TEXT_DETECTION, then scrape structured fields (name,
    # gender, dates, addresses, VID, Aadhaar number, phone) out of the
    # raw text of the first result page. Returns a dict of the fields.
    #
    # NOTE(review): ``mime_type``, ``batch_size`` and ``client`` are not
    # defined in this function — they must exist at module level or this
    # raises NameError. Confirm where they are set.
    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(batch_size=batch_size,
                                        gcs_destination=gcs_destination)
    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)
    operation = client.async_batch_annotate_files(
        requests=[async_request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix.
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    output = blob_list[0]
    json_string = output.download_as_string()
    # NOTE(review): the decoded value is discarded — ``bytes.decode``
    # returns a new str, it does not mutate ``json_string``. json.loads
    # below still receives the raw bytes; likely a latent bug.
    json_string.decode('iso8859-1')
    response = json.loads(json_string)

    # The actual response for the first page of the input file.
    first_page_response = response['responses'][0]
    annotation = first_page_response['fullTextAnnotation']

    # ---- Field extraction from the raw OCR text ----------------------
    text = {}
    text['raw'] = annotation['text']

    # NAMEHINDI: the line right after the last line containing "To" is
    # taken as the (Hindi) name. The try below also wraps every other
    # extractor; any uncaught failure falls through to the final except.
    namehindi = None
    try:
        newlist1 = []
        for xx in annotation['text'].split('\n'):
            newlist1.append(xx)
        newlist1 = list(filter(lambda x: len(x) > 1, newlist1))
        a = 0
        str2 = "To"
        for no in newlist1:
            if str2 in no:
                b = a  # remember index of the line containing "To"
            a = a + 1
        namehindi = newlist1[b + 1]
        text['namehindi'] = namehindi
        # #NAMEENGLISH
        # translator = Translator()
        # print(namehindi)
        # c = translator.translate(namehindi,dest='en')
        # wordlist = text['raw'].split("\n")
        # # name = get_close_matches(c.text, wordlist)
        # text['NameEnglish'] = name[0]

        # GENDER: collect gender words found in the text, female terms
        # first; the male scan only runs when no female term matched.
        gender = []
        female_str = {"Female", "महिला", "FEMALE", "స్త్రీ"}
        male_str = {"Male", "పురుషుడు", "MALE", "ਮਰਦ", "पुरुष", "male"}
        for wordlist in text['raw'].split('\n'):
            for g in female_str:
                if re.search(g, wordlist):
                    if g not in gender:
                        print(g)
                        gender.append(g)
        if (len(gender) == 0):
            for wordlist in text['raw'].split('\n'):
                for g in male_str:
                    if re.search(g, wordlist):
                        if g not in gender:
                            print("MALE" + g)
                            gender.append(g)
        # Join the vernacular and English terms as "<local>/<ENGLISH>".
        # NOTE(review): assumes exactly two distinct matches were found;
        # an IndexError here is swallowed by the outer except.
        if gender[0] == "MALE" or gender[0] == "FEMALE" or gender[0] == "Male":
            gender_string = gender[1] + "/" + gender[0]
        else:
            gender_string = gender[0] + "/" + gender[1]
        text["gender string"] = gender_string

        # Download date, e.g. "Download Date: 01/02/2020".
        match = re.search(r'Dow\w+ Date[ :]*\d+[ -/]\d+[ -/]\d+', text['raw'])
        if (match != None):
            text["Downloaddate"] = match.group()
        else:
            pass

        # Issue date, e.g. "Issue Date: 01/02/2020".
        m = re.search(r'Iss\w+ Date[ :]*\d+[ -/]\d+[ -/]\d+', text['raw'])
        if (m != None):
            text["Issuedate"] = m.group()
        else:
            pass

        # ENG ADDRESS: the lines from the one containing "Address:" up
        # to the line holding a 6-digit PIN code.
        addres_hin = None
        try:
            newlist = []
            for xx in text['raw'].split('\n'):
                newlist.append(xx)
            newlist = list(filter(lambda x: len(x) > 0, newlist))
            a = 0
            str = "Address:"  # NOTE(review): shadows builtin ``str``
            for no in newlist:
                a = a + 1
                # 6-digit PIN code marks the end of the address block.
                c = re.search(r"(?<!\d)\d{6}(?!\d)", no)  # r"\(\d[- \d()]\d", line)[0]
                if c:
                    d = a
                if str in no:
                    b = a
            addres_hin = newlist[b]
            while (b < d - 1):
                addres_hin = addres_hin + "\n" + newlist[b + 1]
                b = b + 1
        except Exception:
            pass
        text['engAddress'] = addres_hin

        # VID: a 16-digit Virtual ID line, re-grouped in blocks of 4.
        g = None
        try:
            newlist = []
            for xx in text['raw'].split('\n'):
                newlist.append(xx)
            newlist = list(filter(lambda x: len(x) > 12, newlist))
            for no in newlist:
                # NOTE(review): "[VID : 0-9]" is a character class
                # (V, I, D, space, colon, digits), not the literal
                # prefix "VID :" — confirm this matches the intent.
                if re.match("^[VID : 0-9]+$", no):
                    g = no
                    g = g.replace("VID:", "")
                    g = g.replace(" ", "")
                    g = ' '.join(re.findall(r'.{1,4}', g))
        except Exception:
            pass
        text['VID'] = g

        # AADHAAR NO: a 12-digit number (possibly masked with "XXXX"),
        # re-grouped into blocks of 4.
        aadharno = None
        try:
            newlist = []
            str = "XXXX"  # NOTE(review): shadows builtin ``str`` again
            for xx in text['raw'].split('\n'):
                newlist.append(xx)
            newlist = list(filter(lambda x: len(x) > 11, newlist))
            for word in newlist:
                # NOTE(review): ``and`` binds tighter than ``or``, so
                # the length check applies only to the "XXXX" branch.
                if re.match("^[0-9 ]+$", word) or str in word and len(word) == 12:
                    aadharno = word
                    aadharno = aadharno.replace(" ", "")
                    aadharno = ' '.join(re.findall(r'.{1,4}', aadharno))
        except Exception:
            pass
        text['Adhaar no'] = aadharno

        # DOB: keep the last line containing any date-of-birth label.
        wordlist = None
        birth_str = {"जन्म तिथि", "DOB", "ਜਨਮ ਮਿਤੀ", "పుట్టిన తేదీ", "DOB:",
                     "పుట్టిన తిథి:", "Date of Birth"}
        for i in birth_str:
            for wordlist in text['raw'].split('\n'):
                if re.search(i, wordlist):
                    text["DOB"] = wordlist
                    pass

        # HINDI ADDRESS: like the English address, but keyed on
        # vernacular "Address:" labels; stops at the PIN-code line.
        address = None
        try:
            newlist = []
            for xx in text['raw'].split('\n'):
                newlist.append(xx)
            newlist = list(filter(lambda x: len(x) > 0, newlist))
            a = 0
            str_a = ""
            str = ["पता:", "ਪਤਾ:", "पत्ता", "पत्ता:", "చిరునామా:"]
            b = 0
            d = 0
            for no in newlist:
                a = a + 1
                c = re.search(r"(?<!\d)\d{6}(?!\d)", no)
                if c:
                    d = a
                for i in str:
                    if i in no:
                        str_a = i
                        b = a
                        print(b)
                    # NOTE(review): ``&`` binds tighter than the
                    # comparisons, so this parses as the chained
                    # comparison ``d > (b & b) != 0`` — which happens to
                    # equal ``d > b and b != 0`` because b & b == b.
                    if d > b & b != 0:
                        break
                if d > b & b != 0:
                    break
            address = newlist[b]
            while (b < d - 1):
                address = address + "\n" + newlist[b + 1]
                b = b + 1
            text['hindiAddress'] = str_a + "\n" + address
        except Exception as e:
            print(traceback.print_exc())
            pass

        # MOBILE NO: a bare 10-digit numeric line.
        phone = None
        try:
            newlist = []
            for xx in text['raw'].split('\n'):
                newlist.append(xx)
            newlist = list(filter(lambda x: len(x) > 5, newlist))
            for word in newlist:
                if re.match("^[0-9 ]+$", word) and len(word) == 10:
                    phone = word
            text['mobile no'] = phone
        except Exception:
            pass
    except Exception as e:
        print(traceback.print_exc())
        pass
    return text
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""
    import json
    import re
    from google.cloud import vision
    from google.cloud import storage

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'
    batch_size = 2  # pages grouped into each json output file

    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=vision.InputConfig(
            gcs_source=vision.GcsSource(uri=gcs_source_uri),
            mime_type=mime_type),
        output_config=vision.OutputConfig(
            gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
            batch_size=batch_size))

    operation = client.async_batch_annotate_files(requests=[request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # The output now lives on GCS under the destination prefix.
    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket = storage_client.get_bucket(match.group(1))

    # Keep only real output objects — skip "directory" placeholders.
    blob_list = [b for b in bucket.list_blobs(prefix=match.group(2))
                 if not b.name.endswith('/')]
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # The first output file covers the first two pages (batch_size=2);
    # print the full text of page one. The response also carries
    # pages/blocks/paragraphs/words/symbols with confidence scores and
    # bounding boxes.
    response = json.loads(blob_list[0].download_as_string())
    annotation = response['responses'][0]['fullTextAnnotation']
    print('Full text:\n')
    print(annotation['text'])
def async_detect_document(userName, fileName):
    """OCR a PDF stored on GCS and append its text to the shared buffer.

    Reads ``gs://graduation_bucket/<fileName>``, runs async
    DOCUMENT_TEXT_DETECTION, writes the JSON output under
    ``results/<userName>/<fileName>/`` in the same bucket, then appends
    the full text of every page to ``globalVariable.fullText``.

    Returns None.
    """
    from google.cloud import vision
    from google.cloud import storage
    import json

    bucket_name = "graduation_bucket"
    client = vision.ImageAnnotatorClient()

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'
    # How many pages should be grouped into each json output file.
    # Bug fix: batch_size was computed but never passed to OutputConfig,
    # so only the service default (20, per the original author's note)
    # ever applied. Pass it explicitly so the variable is meaningful.
    batch_size = 20

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    gcs_source_uri = f"gs://{bucket_name}/{fileName}"
    input_config = vision.InputConfig(
        gcs_source=vision.GcsSource(uri=gcs_source_uri), mime_type=mime_type)

    # The results "folder" is created implicitly when objects are written.
    gcs_destination_uri = f"gs://{bucket_name}/results/{userName}/{fileName}/"
    output_config = vision.OutputConfig(
        gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
        batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)
    operation = client.async_batch_annotate_files(requests=[async_request])
    operation.result(timeout=420)

    # Collect the full text of every page from every output file into
    # the module-level accumulator.
    storage_client = storage.Client()
    prefix = f"results/{userName}/{fileName}/"
    bucket = storage_client.get_bucket(bucket_name)
    for blob in bucket.list_blobs(prefix=prefix):
        response = json.loads(blob.download_as_string())
        for page_response in response['responses']:
            globalVariable.fullText += page_response['fullTextAnnotation']['text']
    return
# Supported mime_types are: 'application/pdf' and 'image/tiff'.
mime_type = 'application/pdf'
# Pages per JSON output file — chosen larger than any expected document
# so the whole PDF fits in a single JSON response.
batch_size = 100

client = vision.ImageAnnotatorClient()
feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

# OCR every PDF in pdf_list, writing each result to the matching entry
# of json_list.
# Example source: 'gs://BUCKET_NAME/225423441-Roberson-Joseph-A078-360-606-BIA-Nov-18-2013.pdf'
# Example destination: 'gs://BUCKET_NAME/...-output-X-to-Y.json'
for i, gcs_source_uri in enumerate(pdf_list):
    gcs_destination_uri = json_list[i]

    request = vision.AsyncAnnotateFileRequest(
        features=[feature],
        input_config=vision.InputConfig(
            gcs_source=vision.GcsSource(uri=gcs_source_uri),
            mime_type=mime_type),
        output_config=vision.OutputConfig(
            gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
            batch_size=batch_size))

    operation = client.async_batch_annotate_files(requests=[request])
    print('Waiting for', json_list[i], 'to finish.')
    operation.result(timeout=420)

    # Progress report — roughly a minute per PDF.
    print('PDF Number:', i + 1, 'out of', len(json_list), 'completed.')
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """
    Annotates PDF document with text detection and saves .txt files to
    the local folder.

    Parameters
    ----------
    gcs_source_uri : gcs path to PDF image
    gcs_destination_uri : gcs prefix where the JSON output is written

    Returns
    -------
    None.
    """
    client = vision.ImageAnnotatorClient()
    batch_size = 10  # pages grouped into each json output file
    mime_type = 'application/pdf'

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination,
                                        batch_size=batch_size)
    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)
    operation = client.async_batch_annotate_files(requests=[async_request])
    operation.result(timeout=180)

    # List the JSON output files written under the destination prefix.
    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket = storage_client.get_bucket(match.group(1))
    blob_list = list(bucket.list_blobs(prefix=match.group(2)))

    # Index 0 is skipped (as in the original — presumably the
    # destination "folder" placeholder object); write the first page's
    # text of each remaining output file to chart<n>.txt.
    for n in range(1, len(blob_list)):
        json_string = blob_list[n].download_as_string()
        try:
            response = json.loads(json_string)
            annotation = response['responses'][0]['fullTextAnnotation']
            # Bug fix: the file handle was previously opened and never
            # closed; a context manager guarantees it is released.
            with open('chart{}.txt'.format(n), 'w+') as file:
                file.write(annotation['text'])
        except json.JSONDecodeError:
            # Some blobs are not valid JSON (e.g. empty placeholders).
            print('jsondecode')
    return None
def p2a_ocr_pdf(bucket, pdf_blob):
    """Submit a Vision OCR job for one PDF blob.

    See https://cloud.google.com/vision/docs/pdf

    The raw JSON results are written back into the same bucket under a
    prefix built from the first four characters of the PDF's name; when
    ANNOTATION_MODE is on, the PDF is additionally rendered to PNG
    files for annotation.
    """
    # Input: the PDF as it sits in the bucket.
    gcs_source_uri = "gs://{}/{}".format(bucket.name, pdf_blob.name)

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'
    batch_size = 2  # pages grouped into each json output file

    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

    # Output: the first 4 chars of the file name serve as a pdf_id prefix.
    pdf_id = pdf_blob.name.replace(".pdf", "")[:4]
    gcs_destination_uri = "gs://{}/{}".format(bucket.name, pdf_id + "_")
    output_config = vision.OutputConfig(
        gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
        batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)
    operation = client.async_batch_annotate_files(requests=[async_request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Convert the PDF to PNG files for annotation.
    if ANNOTATION_MODE:
        convert_pdf2png(bucket, pdf_blob)
def loadPdfText(fPath):
    """OCR the PDF at ``fPath`` in the revaise bucket and return its text.

    Submits an async DOCUMENT_TEXT_DETECTION request, gathers the full
    text of every page from the JSON output files under TextOutput/,
    saves it to park.txt, deletes the output blobs, and returns the
    accumulated text.
    """
    gcs_source_uri = "gs://revaise.appspot.com/" + fPath
    print(gcs_source_uri)
    gcs_destination_uri = "gs://revaise.appspot.com/TextOutput/"

    mime_type = "application/pdf"  # 'application/pdf' or 'image/tiff'
    batch_size = 2  # pages grouped into each json output file

    client = vision.ImageAnnotatorClient()
    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)
    print(input_config)
    # Bug fix: the destination was previously also built with
    # vision.GcsSource (wrong class) in a dead assignment that was then
    # overwritten; only the GcsDestination below is needed. A stray
    # one-second time.sleep(1) before the request was removed as well.
    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(gcs_destination=gcs_destination,
                                        batch_size=batch_size)
    print(output_config)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)
    operation = client.async_batch_annotate_files(requests=[async_request])
    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Collect every page's text from every output blob.
    storage_client = storage.Client()
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    bucket = storage_client.get_bucket(match.group(1))
    blob_list = list(bucket.list_blobs(prefix=match.group(2)))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    txt = ""
    for output in blob_list:
        json_string = output.download_as_string()
        # The destination "folder" object downloads as empty bytes.
        if json_string != b'':
            response = json.loads(json_string)
            for j in response['responses']:
                txt += j['fullTextAnnotation']['text'] + " "

    with open("park.txt", "w+") as f:
        f.write(txt)

    # Clean up the intermediate JSON output.
    for blob in blob_list:
        blob.delete()
    print(txt)
    return txt