Esempio n. 1
0
import os
import glob
import sys
import shutil
from luigi import Parameter, BoolParameter
from luiginlp.engine import Task, TargetInfo, StandardWorkflowComponent, registercomponent, InputComponent, Parallel, run, ComponentParameters, InputFormat
from luiginlp.util import replaceextension, DirectoryHandler, getlog
from luiginlp.modules.pdf import Pdf2images
from luiginlp.modules.folia import Foliacat, FoliaHOCR

# Module-level log handle obtained from luiginlp.util.getlog().
# NOTE(review): not referenced anywhere in the visible part of this snippet.
log = getlog()

class Tesseract(Task):
    """Does OCR on a TIFF image, outputs a hOCR file"""
    executable = 'tesseract'

    language = Parameter()
    outputdir = Parameter(default="")

    in_tiff = None #input slot

    def out_hocr(self):
        """Build the hOCR output target that corresponds to the TIFF input.

        The path is derived from the input path by passing it through
        ``replaceextension`` with the ``.tif``/``.tiff`` extensions and
        ``.hocr``. When ``outputdir`` is set to something other than ``'.'``,
        only the base name of that path is kept and it is joined onto
        ``outputdir``; otherwise the derived path is used unchanged.
        """
        hocr_path = replaceextension(self.in_tiff().path, ('.tif', '.tiff'), '.hocr')
        if not self.outputdir or self.outputdir == '.':
            # No explicit output directory: keep the path derived from the input.
            return TargetInfo(self, hocr_path)
        relocated = os.path.join(self.outputdir, os.path.basename(hocr_path))
        return TargetInfo(self, relocated)

    def run(self):
        self.ex(self.in_tiff().path, self.out_hocr().path[:-5], #output path without hocr extension (-5), Tesseract adds it already
                l=self.language,
                c="tessedit_create_hocr=T",
Esempio n. 2
0
import sys
import os
import unittest
import glob
import shutil
import luiginlp
import luigi
import json
from luiginlp.engine import Task, StandardWorkflowComponent, PassParameters, InputFormat, InputComponent, InputSlot, Parameter, IntParameter, registercomponent, ParallelBatch
from luiginlp.util import getlog, chunk

# Module-level log handle obtained from luiginlp.util.getlog().
# NOTE(review): not referenced anywhere in the visible part of this snippet.
log = getlog()


class VoweleaterTask(Task):
    """Demonstration task that pipes a text file through an external tool.

    The input file is handed to ``sed`` on standard input and whatever it
    prints on standard output is written to the output file. The sed
    expression used deletes every ASCII vowel (upper and lower case).
    """

    executable = 'sed'
    in_txt = InputSlot()
    # Not referenced by the methods visible in this class.
    encoding = Parameter(default='utf-8')

    def out_txt(self):
        """Return the output target derived from the ``txt`` input.

        Delegates to ``outputfrominput``, asking it to strip ``.txt`` and add
        ``.novowels.txt``.
        """
        naming = dict(
            inputformat='txt',
            stripextension='.txt',
            addextension='.novowels.txt',
        )
        return self.outputfrominput(**naming)

    def run(self):
        """Run the executable with the vowel-deleting expression.

        Standard input is taken from the input slot's path and standard
        output is sent to the output target's path.
        """
        source_path = self.in_txt().path
        target_path = self.out_txt().path
        # Keyword names are kept literal: ``ex`` receives them as option names.
        self.ex(e='s/[aeiouAEIOU]//g',
                __stdin_from=source_path,
                __stdout_to=target_path)