def process_file(self, _doc_path, _output_folder_container):
    """Run one document through the extraction pipeline and send its task.

    Steps, in order: copy the input file into a new ``p<identifier>`` folder
    under ``_output_folder_container``, run PDFigCapX on that folder, run
    FigSplit on the ``<identifier>`` sub-folder, then create, insert and
    send the document through the ``Task`` service.

    Args:
        _doc_path: path of the input document.
        _output_folder_container: folder holding one sub-folder per
            processed document; created when it does not exist.

    Returns:
        The truthy value returned by ``Task.send_task`` on success,
        otherwise ``None``. Every ``None`` outcome is reported through
        ``self.log_error``; failures that happen after the document folder
        was created also remove it through ``self.remove_folder``.

    Raises:
        IOError: if ``_doc_path`` does not exist.
    """
    if not exists(_doc_path):
        raise IOError("input document not found: %s" % _doc_path)
    if not exists(_output_folder_container):
        mkdir(_output_folder_container)

    filename = basename(_doc_path)
    # Drops the last four characters, i.e. assumes a ".pdf"-style extension.
    doc_identifier = filename[:-4]

    # Create the output folder for the extracted content and place the PDF
    # inside. A pre-existing folder is only reported, never removed, because
    # this call did not create it.
    document_folder_path = join(_output_folder_container, 'p' + doc_identifier)
    if exists(document_folder_path):
        error = "output document folder path exists, skipping document %s" % filename
        self.log_error(_output_folder_container, error)
        return None
    mkdir(document_folder_path)
    copy(_doc_path, join(document_folder_path, filename))

    def abort(error):
        # Shared failure path: report the error and discard the folder
        # created above so the document can be retried later.
        self.log_error(_output_folder_container, error)
        self.remove_folder(document_folder_path, _output_folder_container)
        return None

    # Extract figures and captions
    fcx = PDFigCapX(self.chromedriver_path, self.xpdf_pdftohtml_path,
                    self.imagemagick_convert_path)
    _, total_figs, total_figs_success = fcx.extract(
        document_folder_path, document_folder_path)
    print("PDFigCapx (%d/%d)\n" % (total_figs_success, total_figs))
    if total_figs_success != total_figs:
        return abort(
            "PDFigCapX could not process all the content for %s" % filename)

    # Split the figures in the document
    # NOTE(review): presumably PDFigCapX writes its results into the
    # <identifier> sub-folder handed to FigSplit here -- confirm.
    fsw = FigSplitWrapper(self.figsplit_url)
    figcapx_output_path = join(document_folder_path, doc_identifier)
    total_splits, total_splits_success = fsw.split(figcapx_output_path)
    print("FigSplit (%d/%d)\n" % (total_splits_success, total_splits))
    if total_splits_success != total_splits:
        return abort(
            "FigSplit could not process all the figures for %s" % filename)

    # Store the document and queue the task
    task_service = Task(self.insert_document_service_uri,
                        self.send_task_service_uri)
    document = task_service.create_document(document_folder_path)
    saved_document, saving_error = task_service.insert_document(document)
    if not saved_document:
        # %-formatting instead of "+": the error may be None or a non-string
        # object, and concatenation would raise TypeError before the
        # failure is logged and the folder cleaned up.
        return abort(
            "Error inserting document in the database \n%s" % (saving_error,))

    task = task_service.send_task(saved_document['_id'],
                                  saved_document['name'],
                                  self.organization, self.group_name)
    if not task:
        return abort("Error creating task")
    return task
# Manual smoke test for the Task client: builds a document from an already
# processed output folder, inserts it and sends the task. It talks to the
# services on localhost:3020, so they must be running.
import sys
from os import getcwd
from os.path import join, abspath

# Task is imported from the parent of the current working directory, so the
# script has to be launched from its own folder.
current_folder = getcwd()
source_folder = abspath(join(current_folder, '..'))
sys.path.append(source_folder)

from Task import Task

insert_document_service_uri = 'http://localhost:3020/api/insertFromPipe'
send_task_service_uri = 'http://localhost:3020/api/sendPipeTask'
input_test_document_path = abspath(
    join(current_folder, '..', '..', 'output', 'p15350224'))

t = Task(insert_document_service_uri, send_task_service_uri)
d = t.create_document(input_test_document_path)

# NOTE(review): the pipeline unpacks insert_document() as a
# (document, error) pair. Treating that pair as the document makes the
# check below always true and the ['_id'] lookup fail, so unpack it here.
# A bare document is still accepted in case Task returns one -- confirm
# against Task.insert_document.
insert_result = t.insert_document(d)
if isinstance(insert_result, tuple):
    saved_document, saving_error = insert_result
else:
    saved_document, saving_error = insert_result, None

if saved_document:
    task_result = t.send_task(saved_document['_id'], saved_document['name'],
                              'uic', 'uic')
    if task_result:
        print("Great!")
    else:
        print("Too bad")
else:
    print("Error inserting document: %s" % (saving_error,))