Esempio n. 1
0
    def process_file(self, _doc_path, _output_folder_container):
        """Extract and split the figures of one document, then submit a task.

        The document is copied into a fresh ``p<identifier>`` folder inside
        ``_output_folder_container``. PDFigCapX and FigSplit are then run on
        that folder, the resulting document is inserted through the Task
        service, and a task is sent for it.

        On any processing failure the error is reported via
        ``self.log_error``, the document folder is removed via
        ``self.remove_folder``, and ``None`` is returned. If the document
        folder already exists, the error is logged and the document is
        skipped without removing anything.

        Args:
            _doc_path: Path of the document to process.
            _output_folder_container: Folder that holds one sub-folder per
                processed document; created when it does not exist.

        Returns:
            The value returned by ``Task.send_task`` when every step
            succeeds, otherwise ``None``.

        Raises:
            IOError: If ``_doc_path`` does not exist.
        """
        if not exists(_doc_path):
            raise IOError("document not found: %s" % _doc_path)
        if not exists(_output_folder_container):
            mkdir(_output_folder_container)

        filename = basename(_doc_path)
        # Drops the last four characters, i.e. assumes a three-letter
        # extension such as ".pdf".
        doc_identifier = filename[:-4]

        # Create the output folder for the extracted content and place the
        # PDF inside.
        document_folder_path = join(_output_folder_container,
                                    'p' + doc_identifier)
        if exists(document_folder_path):
            error = "output document folder path exists, skipping document %s" % filename
            self.log_error(_output_folder_container, error)
            # The folder was not created by this call, so it is left alone.
            return None
        mkdir(document_folder_path)
        copy(_doc_path, join(document_folder_path, filename))

        # Extract figures and captions.
        fcx = PDFigCapX(self.chromedriver_path, self.xpdf_pdftohtml_path,
                        self.imagemagick_convert_path)
        total_elems, total_figs, total_figs_success = fcx.extract(
            document_folder_path, document_folder_path)
        # Single-argument parenthesised print behaves the same on Python 2
        # and Python 3.
        print("PDFigCapx (%d/%d)\n" % (total_figs_success, total_figs))
        if total_figs_success != total_figs:
            error = "PDFigCapX could not process all the content for %s" % filename
            self.log_error(_output_folder_container, error)
            self.remove_folder(document_folder_path, _output_folder_container)
            return None

        # Split the figures in the document.
        fsw = FigSplitWrapper(self.figsplit_url)
        figcapx_output_path = join(document_folder_path, doc_identifier)
        total_splits, total_splits_success = fsw.split(figcapx_output_path)
        print("FigSplit (%d/%d)\n" % (total_splits_success, total_splits))
        if total_splits_success != total_splits:
            error = "FigSplit could not process all the figures for %s" % filename
            self.log_error(_output_folder_container, error)
            self.remove_folder(document_folder_path, _output_folder_container)
            return None

        # Register the document and create the task for it.
        task_service = Task(self.insert_document_service_uri,
                            self.send_task_service_uri)
        document = task_service.create_document(document_folder_path)
        saved_document, saving_error = task_service.insert_document(document)
        if not saved_document:
            # %-formatting instead of "+" so a non-string error value
            # (None, an exception object) cannot raise TypeError here.
            error = ("Error inserting document in the database \n%s"
                     % (saving_error,))
            self.log_error(_output_folder_container, error)
            self.remove_folder(document_folder_path, _output_folder_container)
            return None

        task = task_service.send_task(saved_document['_id'],
                                      saved_document['name'],
                                      self.organization, self.group_name)
        if task:
            return task

        error = "Error creating task"
        self.log_error(_output_folder_container, error)
        self.remove_folder(document_folder_path, _output_folder_container)
        return None
Esempio n. 2
0
# Manual smoke test for the Task service client: builds a document from an
# already-processed output folder, inserts it, and sends a task for it.
# The paths are resolved relative to the current working directory.
import sys
from os import getcwd
from os.path import join, abspath

current_folder = getcwd()
source_folder = abspath(join(current_folder, '..'))
# Make the parent folder importable so the Task module below resolves.
sys.path.append(source_folder)

from Task import Task

insert_document_service_uri = 'http://localhost:3020/api/insertFromPipe'
send_task_service_uri = 'http://localhost:3020/api/sendPipeTask'
input_test_document_path = abspath(
    join(current_folder, '..', '..', 'output', 'p15350224'))
t = Task(insert_document_service_uri, send_task_service_uri)

d = t.create_document(input_test_document_path)
# process_file unpacks insert_document() as (saved_document, saving_error).
# A non-empty tuple is always truthy and cannot be indexed by '_id', so
# unpack it here; a bare document return value is still accepted.
insert_result = t.insert_document(d)
if isinstance(insert_result, tuple):
    saved_document, saving_error = insert_result
else:
    saved_document, saving_error = insert_result, None

if saved_document:
    task_result = t.send_task(saved_document['_id'], saved_document['name'],
                              'uic', 'uic')
    # Single-argument parenthesised print works on Python 2 and Python 3.
    if task_result:
        print("Great!")
    else:
        print("Too bad")
else:
    # Report the failure instead of exiting silently.
    print("Insert failed: %s" % (saving_error,))

#print(str(res))