Example #1
class CassiusImport(Debuggable):
    def __init__(self):
        # read command-line arguments
        self.args = self.read_command_line()

        # absolute first priority is to initialize debugger so that anything triggered here can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'cassius-import')

        self.in_file = self.args['<in-file>']
        self.out_file = self.args['<out-file>']

        self.dir = os.path.dirname(os.path.abspath(__file__))

        if self.args['--debug']:
            self.debug.enable_debug()

        self.debug.enable_prompt(Interactive(self.args['--debug']))

    @staticmethod
    def read_command_line():
        return docopt(__doc__, version='cassius-import v0.1')

    def run(self):
        command = "java -cp '{0}{1}saxon9.jar':'{0}{1}..{1}runtime{1}xml-resolver-1.1.jar':'{0}{1}..{1}runtime{1}' net.sf.saxon.Transform -r:org.apache.xml.resolver.tools.CatalogResolver -y:org.apache.xml.resolver.tools.ResolvingXMLReader -x:org.apache.xml.resolver.tools.ResolvingXMLReader -u -o '{2}' '{3}' '{0}{1}..{1}transform{1}xsl{1}cassius-main.xsl'".format(
            self.dir, os.sep, self.out_file, self.in_file)
        #command = "java -jar '{0}{1}saxon9.jar';'{0}{1}..{1}runtime{1}xml-resolver-1.1.jar' -o '{2}' '{3}' '{0}{1}..{1}transform{1}xsl{1}cassius-main.xsl'".format(self.dir, os.sep, self.out_file, self.in_file)

        #-r org.apache.xml.resolver.tools.CatalogResolver -catalog '{0}{1}..{1}runtime{1}catalog.xml'

        self.debug.print_debug(self,
                               u'Running saxon transform (JATS -> CaSSius)')

        subprocess.call(command, stdin=None, shell=True)
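
The example shows only the class; the module docstring that docopt parses and the script entry point are not included. The sketch below is a guess at how such a tool is typically wired up: the usage text and option names are assumptions inferred from the keys read in __init__ ('<in-file>', '<out-file>', '--debug'), not the project's actual docstring.

# Hypothetical module docstring for docopt (the real one is not shown in the example):
#
# """cassius-import: run the Saxon transform from JATS XML to CaSSius.
#
# Usage:
#     cassius-import.py <in-file> <out-file> [options]
#
# Options:
#     -d, --debug    Enable debug output and interactive prompts.
# """

if __name__ == '__main__':
    # read_command_line() passes the module docstring to docopt via __doc__
    CassiusImport().run()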
Example #3
class Disseminate(Debuggable):

    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.settings = Settings(self.args)
        self.gv = GV(self.settings)
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.out_type = self.args.get('--out-type').lower()
        self.script_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))

    @staticmethod
    def read_command_line():
        """
        Reads the command-line parameters and generates a docopt dictionary.

        Returns
        -------
        docopt : dictionary
          A dictionary where keys are the names of command-line elements and values are the parsed
          values of those elements.
        """
        return docopt(__doc__, version='Disseminate 0.1')



    def get_saxon_path(self):
        """Checks if saxon is available in the default path

        Returns
        --------
        saxon : str or bool
            Path to the Saxon jar if it is available, otherwise False.

        """

        s = os.path.join(self.script_path, self.gv.apps.get('saxon'))
        if os.path.isfile(s):
            return s
        elif self.args.get('--saxon'):
            if os.path.isfile(self.args.get('--saxon')):
                return self.args.get('--saxon')
            else:
                return False

        else:
            return False

    def get_module_name(self):
        """
        Reads the name of the module for debugging and logging

        Returns
        -------
        name : str
            Name of the module
        """
        name = 'disseminate'
        return name

    def process(self, args):
        """Runs  typesetter with given arguments

        Creates the execution path for the conversion process. Output, exit code and system error codes are captured and returned.


        Parameters
        ----------
        args : list
            application arguments in the correct order.


        Returns
        -------
        output : str
            system standard output.
        err : str
            system standard error.
        exit_code : int
            system exit code.

        See Also
        --------
        subprocess.Popen()

        """

        m = ' '.join(args).strip().split(' ')
        print(' '.join(args))
        process = Popen(m, stdout=PIPE)
        output, err = process.communicate()
        exit_code = process.wait()
        if exit_code == 1:
            print(err)
            sys.exit(1)

        return output, err, exit_code

    def run(self):
        """
        Runs converters

        See Also
        --------
        create_output, create_pdf

        """
        self.create_output(self.out_type)


    def create_output(self, out_type):
        """
        Creates FO or PDF output, depending on the requested output type

        Parameters
        ----------
        out_type : str
            Output type ('fo' or 'pdf')


        See Also
        --------
        run_saxon(), get_saxon_path()
        """

        formatters = self.args.get('--formatter').split(',')
        mediums = self.args.get('--medium').split(',')
        for f in formatters:
            f = f.lower()
            for m in mediums:
                m = m.lower()
                self.gv.create_dirs_recursive(self.args.get('<path>').split(os.pathsep))
                if self.out_type=='fo':
                    self.debug.print_console(self, self.gv.RUNNING_FO_CONVERSION)
                    saxon_path = self.get_saxon_path()
                    args = self.run_saxon(saxon_path,f, m)
                if self.out_type=='pdf':
                    self.debug.print_console(self, self.gv.RUNNING_PDF_CONVERSION)
                    args = self.run_fop_processor(f, m)
                output, err, exit_code = self.process(args)


    def run_fop_processor(self,  formatter, medium):

        args = []
        if formatter.lower() == 'fop':
            pth = os.path.join(self.script_path, self.gv.apps.get('fop'))
            if self.gv.check_program(pth):

                args = self.run_apache_fop(pth,formatter, medium)

        elif formatter.lower() == 'ah':
            pth = self.gv.apps.get('ah')
            if self.gv.check_program(pth):
                args = self.run_ah_fop(pth,formatter, medium)
        return args

    def run_ah_fop(self, pth, formatter, medium):
        args=[pth]
        args.append('-d')
        args.append('{}/{}.{}.{}.fo'.format(os.path.dirname(self.f), self.gv.uuid, formatter, medium))
        args.append('-o')
        args.append('{}/{}.{}.{}.pdf'.format(self.dr, self.gv.uuid, formatter, medium))

        return args



    def run_apache_fop(self, pth, formatter, medium):
        style_path = '{}/configurations/fop/conf/{}.{}.xml'.format(self.script_path, formatter,medium)
        args = [pth]
        args.append('-fo')
        args.append('{}/{}.{}.{}.fo'.format(os.path.dirname(self.f),self.gv.uuid, formatter, medium))
        args.append('-pdf')
        args.append('{}/{}.{}.{}.pdf'.format(self.dr,self.gv.uuid, formatter, medium))
        args.append('-c')
        args.append(style_path)
        return args



    def run_saxon(self, saxon_path, formatter, medium):
        """
        Creates the executable path for saxon

        Parameters
        ----------
        saxon_path : str
            absolute path of the saxon binary jar file
        formatter : str
            name of the FO formatter
        medium : str
            name of the medium

        Returns
        -------
        args : list
            List of arguments for the saxon execution path

        """
        args = ["java", "-jar", saxon_path]
        if self.args.get('--xsl'):
            xsl = self.script_path.split(os.sep)
            xsl.append('stylesheets')
            xsl.append(self.args.get('--xsl'))
            args.append("-xsl:" + os.sep.join(xsl))

        s = self.args.get('<input_file>')
        if os.path.exists(s):
            args.append("-s:" + s)
        else:
            self.debug.print_debug(self, self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST + ' ' + s)
            sys.exit(1)
        file_name = '.'.join([self.gv.uuid,formatter.lower(),medium.lower(),'fo'])
        args.append("-o:" + os.path.join(self.args.get('<path>'), file_name))
        args.append('formatter=' + formatter.lower())
        args.append('medium=' + medium.lower())
        return args
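
The process() method above wraps subprocess.Popen and returns the captured standard output, standard error and exit code. A minimal standalone sketch of that pattern, using the Python interpreter as a harmless stand-in command (and additionally piping stderr, which the method above leaves unredirected):

import sys
from subprocess import Popen, PIPE

args = [sys.executable, '--version']        # stand-in for the Saxon/FOP argument list
process = Popen(args, stdout=PIPE, stderr=PIPE)
output, err = process.communicate()
exit_code = process.wait()
print(exit_code, (output or err).decode().strip())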
Example #4
class Merge(Debuggable):
    """
    Standalone processing object which merges the current JATS/BITS XML file into the body of a BITS-XML document.

    """
    def __init__(self):

        self.args = self.read_command_line()
        self.debug = Debug()
        self.settings = Settings(self.args)
        self.gv = GV(self.settings)
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.scheme = self.args.get('<scheme>')
        self.set_numbering_tags = self.args.get('--set-numbering-tags')
        self.tr = etree.parse(os.path.join(self.dr, self.f))

        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()

    @staticmethod
    def read_command_line():
        """
        Reads the command-line parameters and generates a docopt dictionary.

        Returns
        -------
        docopt : dictionary
          A dictionary where keys are the names of command-line elements and values are the parsed
          values of those elements.
        """
        return docopt(__doc__, version='xmlMerge 0.0.1')

    def create_output_bits(self):
        """
        Creates the BITS output file. A new file is generated if none is found;
        otherwise the current file is appended to the book body as a book-part.

        See Also
        --------
        create_book_part_bits, create_book_bits, do_file_io

        """
        fuf = os.path.join(self.dr, self.gv.uuid)
        pt = os.path.join(self.dr, os.path.basename(self.gv.uuid))

        trf = None
        if os.path.isfile(fuf):
            trf = etree.parse(fuf)
            bp = trf.find(".//book-body")
            book_part = self.create_book_part_bits()
            bp.append(book_part)
        else:
            trf = self.create_book_bits()
        trf = self.process(trf)

        self.do_file_io(
            etree.tostring(trf,
                           pretty_print=False,
                           xml_declaration=True,
                           encoding='UTF-8',
                           standalone='yes'), 'w', pt)

    def create_output_jats(self):
        """
        Creates the JATS output file. A new file is generated if none is found; otherwise body sections, references and footnotes are appended to the existing file.

        See Also
        --------
        create_book_part_bits, create_book_bits, do_file_io

        """
        fuf = os.path.join(self.dr, self.gv.uuid)
        pt = os.path.join(self.dr, os.path.basename(self.gv.uuid))

        trf = None
        if os.path.isfile(fuf):
            trf = etree.parse(fuf)
            bpf = trf.find(".//body")
            f, bd, bk = self.get_xml_parts()
            if bd is not None:
                for sec in list(bd):
                    bpf.append(sec)

            bkrf = trf.find(".//back/ref-list")
            for r in bk.findall('.//ref-list/ref'):
                bkrf.append(r)

            bkff = trf.find(".//back/fn-group")
            for fn in bk.findall('.//fn-group/fn'):
                bkff.append(fn)

        else:
            trf = self.create_journal_jats()

        trf = self.process(trf)
        self.do_file_io(
            etree.tostring(trf,
                           pretty_print=False,
                           xml_declaration=True,
                           encoding='UTF-8',
                           standalone='yes'), 'w', pt)

    def process(self, tr):
        """
        Processes the BITS-XML file and applies all transformations to the element tree

        Parameters
        ----------
        tr : elementtree
            element tree as input

        Returns
        -------
        tr : elementtree
            transformed element tree

        See Also
        --------
        globals.set_numbering_tags(), set_book_part_attributes()

        """
        tr = self.gv.set_numbering_tags(self.set_numbering_tags.split(','),
                                        tr) if self.set_numbering_tags else tr

        self.set_book_part_attributes(tr)

        return tr

    def set_book_part_attributes(self, tr):
        """
        Adds id and book-part-type attributes to each book-part element

        Parameters
        ----------
        tr : elementtree
            element tree as input


        Returns
        -------
        tr : elementtree
            transformed element tree


        """
        book_parts = tr.findall('.//book-part')
        for i, b in enumerate(book_parts):
            b.attrib['id'] = "ch_" + str(i)
            b.attrib['book-part-type'] = "chapter"
        return tr

    def create_metadata_path(self, metadata):
        """
        Creates the correct folder path for the metadata file. Metadata files are expected in a sub-folder named 'metadata'.

        Parameters
        ----------
        metadata : str
            Suffix of the metadata  files

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure

        Notes
        -----
        We assume that  metadata files are stored in a sub-folder named metadata
        """
        p = os.path.dirname(self.f).split(os.sep)
        del p[-4:]
        name, ext = os.path.splitext(os.path.basename(self.gv.uuid))
        file_name = [name, '.', metadata, '.', 'xml']
        p.append('metadata')
        p.append(''.join(file_name))

        pth = os.sep.join(p)
        self.debug.print_debug(self, 'merging headers ' + str(pth))
        return pth

    def get_module_name(self):
        """
        Reads the name of the module for debugging and logging

        Returns
        -------
        name : str
            Name of the module
        """
        name = 'merge'
        return name

    def create_book_bits(self):
        """
        Creates a full BITS XML book and optionally adds metadata

        Returns
        -------
        book : elementtree
            Element tree which conforms to the BITS XML schema.

        See Also
        ---------
        create_metadata_path, create_book_part_bits

        """
        nsmap = {
            'xlink': "http://www.w3.org/1999/xlink",
            'mml': "http://www.w3.org/1998/Math/MathML",
            "xml": "http://www.w3.org/XML/1998/namespace"
        }
        book = etree.Element(etree.QName('book'), nsmap=nsmap)
        book.attrib['dtd-version'] = "2.1"
        book.attrib[etree.QName(
            '{http://www.w3.org/XML/1998/namespace}lang')] = "de"
        book.attrib['book-type'] = "proceedings"

        metadata = self.args.get('--metadata')

        if metadata:
            pth = self.create_metadata_path(metadata)
            self.debug.print_console(self, 'merging headers ' + str(pth))
            if os.path.isfile(pth):
                bp = etree.parse(pth).find('.//book-meta')
                book.insert(0, bp)
            else:
                self.debug.print_console(
                    self, self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST + str(pth))
                #sys.exit(1)

        else:
            sys.exit('Metadata argument undefined')
        bd = etree.Element("book-body")
        bpbd = self.create_book_part_bits()
        bd.append(bpbd)
        book.append(bd)

        return book

    def create_journal_jats(self):
        """
        Creates a full JATS XML article and optionally adds metadata

        Returns
        -------
        journal : elementtree
            Element tree which conforms to the JATS XML schema.

        See Also
        ---------
        create_metadata_path, create_book_part_bits

        """

        nsmap = {
            'xlink': "http://www.w3.org/1999/xlink",
            'mml': "http://www.w3.org/1998/Math/MathML",
            "xml": "http://www.w3.org/XML/1998/namespace"
        }
        journal = etree.Element(etree.QName('article'), nsmap=nsmap)
        journal.attrib['dtd-version'] = "3.0"
        journal.attrib[etree.QName(
            '{http://www.w3.org/XML/1998/namespace}lang')] = "de"

        f, bd, bk = self.get_xml_parts()

        metadata = self.args.get('--metadata')

        if metadata:
            pth = self.create_metadata_path(metadata)
            if os.path.isfile(pth):
                bpm = etree.parse(pth).find('.')
                if bpm is not None:
                    if bpm.getroottree().getroot().tag == 'front':
                        journal.insert(0, bpm)
                    else:
                        self.debug.print_debug(self,
                                               'front metadata unspecified')
                        sys.exit(1)
        else:
            journal.insert(0, f)

        journal.append(bd)
        if len(bk) > 0:
            journal.append(bk)
        else:
            back = etree.Element(etree.QName('back'))
            back.append(etree.Element(etree.QName('fn-group')))
            back.append(etree.Element(etree.QName('ref-list')))
            journal.append(back)
        return journal

    def create_book_part_bits(self):
        """
        Reads a JATS XML file and creates a book-part element tree according to BITS-XML.

        Returns
        -------
        bp : elementtree
            Book part elementTree
        """
        f, bd, bk = self.get_xml_parts()

        bp = etree.Element("book-part")

        if f is not None:
            if len(f):
                bp.append(f)
        if bd is not None:
            bp.append(bd)
        if bk is not None:
            bp.append(bk)
        return bp

    def get_xml_parts(self):
        """
        Returns the front matter, body and back matter of a JATS XML file, in that order

        Returns
        -------
        f : elementtree
            Front-matter of JATS elementTree
        bd : elementtree
            Body of JATS elementTree
        bk : elementtree
            Back-matter of JATS elementTree

        """
        r = self.tr.getroot()
        f = r.find(".//front")
        if f is None:
            f = r.find(".//book-part-meta")
        bd = r.find(".//body")
        bk = r.find(".//back")
        return f, bd, bk

    def do_file_io(self, s, mode, pth):
        """
        Executes read or write operations on a path

        Parameters
        ----------
        s: str
            Content to be written or None for read
        mode: str
            'w' for write, 'r' for read
        pth : str
            Path to the file to be read or written

        Raises
        ------
        IOError
            I/O operation fails

        """
        try:
            w = open(pth, mode)
            if mode == 'w':
                w.write(s.rstrip('\r\n'))
                w.close()
            if mode == 'r':
                o = w.read()
                w.close()
        except IOError as i:
            self.debug.print_debug(self, i)
            print(i)
            sys.exit(1)

    def run(self):
        """
        Runs the configuration on the processing object. Processes the JATS-XML file and merges it into the full BITS-XML file.

        See Also
        --------
        create_output_bits

        Warning
        -------
        function create_output_jats not yet used

        """

        self.gv.create_dirs_recursive(self.dr.split('/'))
        if self.scheme == 'bits':
            self.create_output_bits()

        elif self.scheme == 'jats':
            self.tr = self.create_output_jats()
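
The core of the merge is get_xml_parts() plus create_book_part_bits(): the front, body and back of the JATS input are extracted and re-wrapped in a BITS book-part, which set_book_part_attributes() then labels. A condensed, self-contained sketch of that step, using a toy JATS document invented for illustration:

from lxml import etree

# toy JATS input, invented purely for this illustration
jats = etree.fromstring(
    b"<article><front><article-title>Sample</article-title></front>"
    b"<body><sec><p>Text</p></sec></body>"
    b"<back><ref-list/></back></article>")

front, body, back = (jats.find('.//' + tag) for tag in ('front', 'body', 'back'))

book_part = etree.Element('book-part')      # BITS wrapper for one chapter
book_part.set('id', 'ch_0')
book_part.set('book-part-type', 'chapter')
for part in (front, body, back):
    if part is not None:
        book_part.append(part)

print(etree.tostring(book_part, pretty_print=True).decode())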
Example #5
class GV(object):
    '''    Global variables    '''

    def __init__(self, settings):
        # GLOBAL VARIABLES
        self.settings = settings


        #application paths
        self.apps = {'fop': 'fop/fop/fop',
                     'saxon': 'tools/meTypeset/runtime/saxon9.jar',
                     'ah': '/usr/AHFormatterV65_64/run.sh',
                     'xep': '/usr/local/xep/bin/xep/xep'
                    }


        # projects
        self.PROJECT_INPUT_FILE_JSON_IS_NOT_VALID = 'project input file json is not valid'
        self.PROJECT_INPUT_FILE_TYPE_IS_NOT_SPECIFIED = 'project input file type is not specified'
        self.PROJECT_INPUT_FILE_HAS_MORE_THAN_TWO_DOTS = 'project input file has more than two dots'
        self.PROJECT_INPUT_FILE_DOES_NOT_EXIST = 'project input_file does not exist'
        self.PROJECT_IS_NOT_ACTIVE = 'project is not active'
        self.PROJECT_OUTPUT_FILE_IS_NOT_DEFINED = 'project output file is not defined'
        self.PROJECT_OUTPUT_FILE_TYPE_IS_NOT_SPECIFIED = 'project output file type is not defined'
        self.PROJECT_OUTPUT_FILE_WAS_NOT_CREATED = 'project output file was not created'
        self.PROJECT_TYPESETTER_IS_NOT_AVAILABLE = 'project typesetter is not available'
        self.PROJECT_TYPESETTER_IS_NOT_SPECIFIED = 'project typesetter is not specified'
        self.PROJECT_TYPESETTER_NAME_IS_NOT_SPECIFIED = 'project typesetter name is not specified'
        self.PROJECT_TYPESETTER_VAR_IS_NOT_SPECIFIED = 'project typesetter variable is not specified'
        self.PROJECT_TYPESETTERS_ARE_NOT_SPECIFIED = 'project typesetters are not specified'
        self.PROJECTS_VAR_IS_NOT_SPECIFIED = 'project variable is not specified'
        self.PROJECT_TYPESETTER_PROCESS_METHOD_NOT_SPECIFIED='project typesetter process method not specified'
        self.PROJECTS_TYPESETTER_RUNS_WITH_NO_ARGUMENTS = 'projects typesetter runs with no arguments'


        # typesetter errors
        self.TYPESETTER_ARGUMENTS_NOT_DEFINED = "typesetter arguments not defined"
        self.TYPESETTER_EXECUTABLE_VARIABLE_IS_UNDEFINED = 'typesetter executable variable is undefined'
        self.TYPESETTER_FILE_OUTPUT_TYPE_IS_UNDEFINED = 'typesetter file output type is undefined'
        self.TYPESETTER_METADATA_FILE_WAS_NOT_SPECIFIED = 'Metadata file wasn\'t specified '
        self.TYPESETTER_METYPESET_RUNS_WITH_DEFAULT_METADATA_FILE = 'typesetter metypeset runs with default metadata file'
        self.TYPESETTER_IS_NOT_SPECIFIED = 'typesetter is not specified '
        self.TYPESETTER_PATH_IS_NOT_SPECIFIED = 'typesetter path is not specified '
        self.TYPESETTER_BINARY_IS_UNAVAILABLE = 'typesetter binary is unavailable '
        self.TYPESETTER_RUNS_WITH_NO_ARGUMENTS = 'typesetter runs with no arguments'

        # xml
        self.RUNNING_FO_CONVERSION = 'running FO conversion'
        self.RUNNING_PDF_CONVERSION = 'running PDF conversion'
        self.XML_ELEMENT_NOT_FOUND = 'xml element not found'
        self.XML_FILE_NOT_CREATED = 'xml file not created'
        self.XML_INPUT_FILE_IS_NOT_FOUND = 'xml input file is not found'
        self.XML_INPUT_FILE_IS_NOT_VALID = 'xml input file is not valid'
        self.SAXON_IS_NOT_AVAILABLE = 'saxon is not available'
        self.FOP_PATH_IS_NOT_AVAILABLE='fop path is not available'

        # WORDS
        self.OUTPUT = 'Output'

        self.debug = Debug()
        self.numeral_map = numeral_map

        #LOG Object
        self.log= []

        self.uuid = 'mpt'
        self.version = '0.0.1'

    @staticmethod
    def fatal_error(module, message):
        """
        Prints a formatted error message and exits

        Parameters
        ----------
        module: python module
             Returns the name of the module
        message: str
            Error message


        See Also
        --------
        module.get_module_name()

        """
        print(('[FATAL ERROR] [{0}] {1}'.format(
            module.get_module_name(), message)))
        sys.exit(1)

    def is_json(self, s):
        """
        Checks whether a string is a valid JSON string

        Parameters
        ----------
        s : str
            JSON data as string

        Returns
        -------
        bool
            True if the string parses as JSON, False otherwise

        """
        try:
            json.loads(s)
        except ValueError:
            return False
        return True

    def read_json(self, pth):
        """
        Reads a json file from system path or exits

        Parameters
        ----------
        pth: str
             path of the  file in the folder structure

        Returns
        -------
        json : json
            json object

        """
        if os.path.isfile(pth):
            with open(pth) as j:
                return json.load(j)

        else:

            try:
                r = requests.get(pth, verify=False, stream=True)
                if r.status_code==200:
                    return r.json()
                else:
                    self.debug.print_debug(self, self.PROJECT_INPUT_FILE_JSON_IS_NOT_VALID)
                    sys.exit(1)
            except requests.exceptions.ConnectionError as ce:
                self.debug.print_debug(self, str(ce))
                sys.exit(1)



    def create_dirs_recursive(self, pth):
        """
        Recursively creates directories for a system path; exits if a directory cannot be created

        Parameters
        ----------
        pth : list
            path segments of the directory to be created

        Returns
        -------
        p : str
            the created path

        """
        p = ''
        for path in pth:
            p = p + os.path.sep + path.strip('/').strip('/')
            if not os.path.exists(p):
                try:
                    os.makedirs(p)
                except OSError as o:
                    print(o)
                    sys.exit(1)
        return p

    def set_numbering_tags(self, tags, tr):
        """
        Automatically numbers the listed element types by assigning sequential ids

        Parameters
        ----------
        tags : list
            list of element tag names to number
        tr : elementtree
            element tree to modify

        Returns
        -------
        tr : elementtree
            element tree with numbered ids


        """
        for tag in tags:
            sh = tr.findall('.//' + tag)
            sid = 1
            for i in sh:
                i.set('id', tag.replace('-', '') + str(sid))
                sid += 1
        return tr

    def check_program(self, p):
        """
        Checks whether the program or typesetter is installed and executable

        Parameters
        ----------
        p : str
            Program path

        Returns
        -------
        str or None
            Absolute path of the executable if it is found, otherwise None

        """

        def is_exe(f_path):
            """
            Checks whether path is available and executable
            Parameters
            ---------
            f_path: str
                File path

            Returns
            --------
            boolean: bool
                True or False

            """
            return os.path.isfile(f_path) and os.access(f_path, os.X_OK)

        fpath, fname = os.path.split(p)
        if fpath:
            if is_exe(p):
                return p
        else:
            for path in os.environ["PATH"].split(os.pathsep):
                path = path.strip('"')
                exe_file = os.path.join(path, p)
                if is_exe(exe_file):
                    return exe_file

        return None
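
Two of the helpers above are easy to exercise in isolation: set_numbering_tags() assigns sequential ids derived from the tag name, and check_program() behaves roughly like a which lookup. The sketch below assumes the surrounding module's Debug class and numeral_map are importable, since GV() uses both, and passes a placeholder settings object that this example never touches.

from lxml import etree

gv = GV(settings=None)                  # placeholder; GV only stores settings here

tree = etree.fromstring('<book><sec/><sec/><fig/></book>').getroottree()
gv.set_numbering_tags(['sec', 'fig'], tree)
print(etree.tostring(tree).decode())
# -> <book><sec id="sec1"/><sec id="sec2"/><fig id="fig1"/></book>

print(gv.check_program('java'))         # absolute path of java if it is on PATH, else None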
Example #6
class Disseminate(Debuggable):

    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.gv = GV()
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.out_type = self.args.get('--out-type').lower()
        self.script_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))

    @staticmethod
    def read_command_line():
        """
        Reads the command-line parameters and generates a docopt dictionary.

        Returns
        -------
        docopt : dictionary
          A dictionary where keys are the names of command-line elements and values are the parsed
          values of those elements.
        """
        return docopt(__doc__, version='Disseminate 0.1')



    def get_saxon_path(self):
        """Checks if saxon is available in the default path

        Returns
        --------
        saxon : str or bool
            Path to the Saxon jar if it is available, otherwise False.

        """

        s = os.path.join(self.script_path, self.gv.METYPESET_PATH)
        if os.path.isfile(s):
            return s
        elif self.args.get('--saxon'):
            if os.path.isfile(self.args.get('--saxon')):
                return self.args.get('--saxon')
            else:
                return False

        else:
            return False

    def get_module_name(self):
        """
        Reads the name of the module for debugging and logging

        Returns
        -------
        name : str
            Name of the module
        """
        name = 'OUTPUT Generation'
        return name

    def process(self, args):
        """Runs  typesetter with given arguments

        Creates the execution path for the conversion process. Output, exit code and system error codes are captured and returned.


        Parameters
        ----------
        args : list
            application arguments in the correct order.


        Returns
        -------
        output : str
            system standard output.
        err : str
            system standard error.
        exit_code : int
            system exit code.

        See Also
        --------
        subprocess.Popen()

        """

        m = ' '.join(args).strip().split(' ')
        print(' '.join(args))
        process = Popen(m, stdout=PIPE)
        output, err = process.communicate()
        exit_code = process.wait()
        if exit_code == 1:
            print(err)
            sys.exit(1)
        return output, err, exit_code

    def run(self):
        """
        Runs converters

        See Also
        --------
        create_output, create_pdf

        """
        self.create_output(self.out_type)


    def create_output(self, out_type):
        """
        Creates FO or PDF output, depending on the requested output type

        Parameters
        ----------
        out_type : str
            Output type ('fo' or 'pdf')


        See Also
        --------
        run_saxon(), get_saxon_path()
        """

        formatters = self.args.get('--formatter').split(',')
        mediums = self.args.get('--medium').split(',')
        for f in formatters:
            f = f.lower()
            for m in mediums:
                m = m.lower()
                self.gv.create_dirs_recursive(self.args.get('<path>').split(os.pathsep))
                if self.out_type=='fo':
                    self.debug.print_console(self, self.gv.RUNNING_FO_CONVERSION)
                    saxon_path = self.get_saxon_path()
                    args = self.run_saxon(saxon_path,f, m)
                if self.out_type=='pdf':
                    self.debug.print_console(self, self.gv.RUNNING_PDF_CONVERSION)
                    args = self.run_fop_processor(f, m)
                output, err, exit_code = self.process(args)
                print(output)

    def run_fop_processor(self,  formatter, medium):

        args = []
        if formatter.lower() == 'fop':
            pth = os.path.join(self.script_path, self.gv.APACHE_FOP_PATH)
            if self.gv.check_program(pth):
                args = self.run_apache_fop(pth,formatter, medium)

        elif formatter.lower() == 'ah':
            pth = self.gv.ANTENNA_HOUSE_FOP_PATH
            if self.gv.check_program(pth):
                args = self.run_ah_fop(pth,formatter, medium)
        return args

    def run_ah_fop(self, pth, formatter, medium):
        args=[pth]
        args.append('-d')
        args.append('{}/{}.{}.{}.fo'.format(os.path.dirname(self.f), self.gv.uuid, formatter, medium))
        args.append('-o')
        args.append('{}/{}.{}.{}.pdf'.format(self.dr, self.gv.uuid, formatter, medium))

        return args






    def run_apache_fop(self, pth, formatter, medium):
        style_path = '{}/configurations/fop/conf/{}.{}.xml'.format(self.script_path, formatter,medium)
        args = [pth]
        args.append('-fo')
        args.append('{}/{}.{}.{}.fo'.format(os.path.dirname(self.f),self.gv.uuid, formatter, medium))
        args.append('-pdf')
        args.append('{}/{}.{}.{}.pdf'.format(self.dr,self.gv.uuid, formatter, medium))
        args.append('-c')
        args.append(style_path)
        return args



    def run_saxon(self, saxon_path, formatter, medium):
        """
        Creates the executable path for saxon

        Parameters
        ----------
        saxon_path : str
            absolute path of the saxon binary jar file
        formatter : str
            name of the FO formatter
        medium : str
            name of the medium

        Returns
        -------
        args : list
            List of arguments for the saxon execution path

        """
        args = ["java", "-jar", saxon_path]
        if self.args.get('--xsl'):
            xsl = self.script_path.split(os.sep)[:-1]
            xsl.append('stylesheets')
            xsl.append(self.args.get('--xsl'))
            args.append("-xsl:" + os.sep.join(xsl))

        s = self.args.get('<input_file>')
        if os.path.exists(s):
            args.append("-s:" + s)
        else:
            self.debug.print_debug(self, self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST + ' ' + s)
            sys.exit(1)
        file_name = '.'.join([self.gv.uuid,formatter.lower(),medium.lower(),'fo'])
        args.append("-o:" + os.path.join(self.args.get('<path>'), file_name))
        args.append('formatter=' + formatter.lower())
        args.append('medium=' + medium.lower())


        return args
Example #7
class ChronicWordFreq(Debuggable):
    def __init__(self):
        # read command-line arguments
        self.args = self.read_command_line()

        # absolute first priority is to initialize debugger so that anything triggered here can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'CWF')

        self.corpus = self.args['<corpus_directory>']
        self.words = self.args['<word_list>'].split(",")
        self.output = self.args['<output_csv>']
        self.terms = {}
        self.years = []
        self.year_count = {}

        if self.args['--debug']:
            self.debug.enable_debug()

        self.debug.enable_prompt(Interactive(self.args['--debug']))

    @staticmethod
    def read_command_line():
        return docopt(__doc__, version='chronicWordFreq 0.1')

    def read_file(self, file):
        match = re.search(r'\d{4}', file)
        year = match.group(0) if match else 'NODATE'

        if year == 'NODATE':
            self.debug.print_debug(self, u'No date detected in filename: {0}. Ignoring.'.format(file))
            return

        self.debug.print_debug(self, u'Processing {0} for year {1}.'.format(file, year))

        if not year in self.years:
            self.years.append(year)

        if not year in self.year_count:
            self.year_count[year] = 1
        else:
            self.year_count[year] += 1

        with open(join(self.corpus, file)) as f:
            content = f.read()
            content = content.upper()

            for word in self.words:
                if word.upper() in content:
                    if word in self.terms:
                        if year in self.terms[word]:
                            current_value = self.terms[word][year]
                            current_value += 1
                            self.terms[word][year] = current_value
                        else:
                            self.terms[word][year] = 1
                    else:
                        self.terms[word] = {year: 1}
                    self.debug.print_debug(self, u'Found {0} in {1}.'.format(word, file))

    def read_dir(self):
        files = [f for f in listdir(self.corpus) if isfile(join(self.corpus, f))]
        return files

    def write_output(self):
        self.years.sort()

        output_list = [u'{0},{1}\n'.format('Word', ",".join(self.years))]

        for word in self.words:
            line = word

            if word in self.terms:
                for year in self.years:
                    if year in self.terms[word]:
                        percent = (float(self.terms[word][year]) / float(self.year_count[year])) * 100
                        line += u',{0}'.format(percent)
                    else:
                        line += u',0'
                output_list.append(line + '\n')

        with open(self.output, 'w') as f:
            f.writelines(output_list)

    def run(self):
        file_list = self.read_dir()

        for file in file_list:
            self.read_file(file)

        self.write_output()
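
The CSV written by write_output() has one row per search word and one column per year found in the file names; each cell is the percentage of that year's files whose text contains the word (matching is case-insensitive, and a file is counted at most once per word). A small worked example of the arithmetic, with invented numbers:

# Suppose 4 corpus files carry the year 1900 in their names and 3 of them contain "whale":
year_count = {'1900': 4}
terms = {'whale': {'1900': 3}}

percent = (float(terms['whale']['1900']) / float(year_count['1900'])) * 100
print(percent)      # 75.0 -> the CSV row for that year would read: whale,75.0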
Example #8
class Merge(Debuggable):
    """
    Standalone processing object which merges the current JATS/BITS XML file into the body of a BITS-XML document.

    """

    def __init__(self):

        self.args = self.read_command_line()
        self.debug = Debug()
        self.gv = GV()
        self.uid = self.gv.uuid
        self.dr = self.args.get("<path>")
        self.f = self.args.get("<input_file>")
        self.scheme = self.args.get("<scheme>")
        self.set_numbering_tags = self.args.get("--set-numbering-tags")
        self.tr = etree.parse(os.path.join(self.dr, self.f))

        Debuggable.__init__(self, "Main")
        if self.args.get("--debug"):
            self.debug.enable_debug()

    @staticmethod
    def read_command_line():
        """
        Reads the command-line parameters and generates a docopt dictionary.

        Returns
        -------
        docopt : dictionary
          A dictionary where keys are the names of command-line elements and values are the parsed
          values of those elements.
        """
        return docopt(__doc__, version="xmlMerge 0.0.1")

    def create_output_bits(self):
        """
        Creates the BITS output file. A new file is generated if none is found;
        otherwise the current file is appended to the book body as a book-part.

        See Also
        --------
        create_book_part_bits, create_book_bits, do_file_io

        """
        fuf = os.path.join(self.dr, self.uid)
        pt = os.path.join(self.dr, os.path.basename(self.uid))

        trf = None
        if os.path.isfile(fuf):
            trf = etree.parse(fuf)
            bp = trf.find(".//book-body")
            book_part = self.create_book_part_bits()
            bp.append(book_part)
        else:
            trf = self.create_book_bits()
        trf = self.process(trf)

        self.do_file_io(
            etree.tostring(trf, pretty_print=True, xml_declaration=True, encoding="UTF-8", standalone="yes"), "w", pt
        )

    def process(self, tr):
        """
        Processes the BITS-XML file and applies all transformations to the element tree

        Parameters
        ----------
        tr : elementtree
            element tree as input

        Returns
        -------
        tr : elementtree
            transformed element tree

        See Also
        --------
        globals.set_numbering_tags(), set_book_part_attributes()

        """
        tr = self.gv.set_numbering_tags(self.set_numbering_tags.split(","), tr) if self.set_numbering_tags else tr

        self.set_book_part_attributes(tr)

        return tr

    def set_book_part_attributes(self, tr):
        """
        Adds id and book-part-type attributes to each book-part element

        Parameters
        ----------
        tr : elementtree
            element tree as input


        Returns
        -------
        tr : elementtree
            transformed element tree


        """
        book_parts = tr.findall(".//book-part")
        for i, b in enumerate(book_parts):
            b.attrib["id"] = "ch_" + str(i)
            b.attrib["book-part-type"] = "chapter"
        return tr

    def create_metadata_path(self, metadata):
        """
        Creates the correct folder path for the metadata file. Metadata files are expected in a sub-folder named 'metadata'.

        Parameters
        ----------
        metadata : str
            Suffix of the metadata  files

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure

        Notes
        -----
        We assume that  metadata files are stored in a sub-folder named metadata
        """
        p = os.path.dirname(self.f).split(os.sep)
        del p[-4:]
        name, ext = os.path.splitext(os.path.basename(self.uid))
        file_name = [name, ".", metadata, ext]
        p.append("metadata")
        p.append("".join(file_name))
        pth = os.sep.join(p)
        return pth

    def create_book_bits(self):
        """
        Creates a full BITS XML book and optionally adds metadata

        Returns
        -------
        book : elementtree
            Element tree which conforms to the BITS XML schema.

        See Also
        ---------
        create_metadata_path, create_book_part_bits

        """
        nsmap = {
            "xlink": "http://www.w3.org/1999/xlink",
            "mml": "http://www.w3.org/1998/Math/MathML",
            "xml": "http://www.w3.org/XML/1998/namespace",
        }
        book = etree.Element(etree.QName("book"), nsmap=nsmap)
        book.attrib["dtd-version"] = "2.1"
        book.attrib[etree.QName("{http://www.w3.org/XML/1998/namespace}lang")] = "de"
        book.attrib["book-type"] = "proceedings"

        metadata = self.args.get("--metadata")
        if metadata:
            pth = self.create_metadata_path(metadata)
            if os.path.isfile(pth):
                bp = etree.parse(pth).find(".//book-meta")
                book.insert(0, bp)

        bd = etree.Element("book-body")
        bpbd = self.create_book_part_bits()
        bd.append(bpbd)
        book.append(bd)

        return book

    def create_book_part_bits(self):
        """
        Reads a JATS XML file and creates a book-part element tree according to BITS-XML.

        Returns
        -------
        bp : elementtree
            Book part elementTree
        """

        f, bd, bk = self.get_xml_parts()

        bp = etree.Element("book-part")

        if f is not None:
            if len(f):
                bp.append(f)
        bp.append(bd)
        bp.append(bk)
        return bp

    def get_xml_parts(self):
        """
        Returns the front matter, body and back matter of a JATS XML file, in that order

        Returns
        -------
        f : elementtree
            Front-matter of JATS elementTree
        bd : elementtree
            Body of JATS elementTree
        bk : elementtree
            Back-matter of JATS elementTree

        """
        r = self.tr.getroot()
        f = r.find(".//front")
        if f is None:
            f = r.find(".//book-part-meta")
        bd = r.find(".//body")
        bk = r.find(".//back")
        return f, bd, bk

    def do_file_io(self, s, mode, pth):
        """
        Executes read or write operations on a path

        Parameters
        ----------
        s: str
            Content to be written or None for read
        mode: str
            'w' for write, 'r' for read
        pth : str
            Path to the file to be read or written

        Raises
        ------
        IOError
            I/O operation fails

        """
        try:
            w = open(pth, mode)
            if mode == "w":
                w.write(s)
                w.close()
            if mode == "r":
                o = w.read()
                w.close()
        except IOError as i:
            self.debug.print_debug(self, i)
            print(i)
            sys.exit(1)

    def run(self):
        """
        Runs the configuration on the processing object. Processes the JATS-XML file and merges it into the full BITS-XML file.

        See Also
        --------
        create_output_bits

        Warning
        -------
        function create_output_jats not yet used

        """

        self.gv.create_dirs_recursive(self.dr.split("/"))
        if self.scheme == "bits":
            self.create_output_bits()

        elif self.scheme == "jats":
            self.tr = self.create_output_jats(self.tr)
Example #9
class KernelDensity(Debuggable):
    def __init__(self):
        # read command-line arguments
        self.args = self.read_command_line()

        # absolute first priority is to initialize debugger so that anything triggered here can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'plotsummary')

        self.in_dir = self.args['<directory>']
        self.term_file = self.args['<term_file>']

        self.terms = [line.strip().lower() for line in open(self.term_file)]

        self.dir = os.path.dirname(os.path.abspath(__file__))

        if self.args['--debug']:
            self.debug.enable_debug()

        self.debug.enable_prompt(Interactive(self.args['--debug']))

        if self.args['--caption']:
            self.caption = self.args['--caption']
        else:
            self.caption = 'Term Plot'

        if self.args['--nostem']:
            self.nostem = self.args['--nostem']
        else:
            self.nostem = None

        if self.args['single']:
            self.action = 'single'
        elif self.args['group']:
            self.second_term_file = self.args['<second_term_file>']
            self.term_name = self.args['<term_name>']
            self.second_term_name = self.args['<second_term_name>']
            self.second_terms = [line.strip().lower() for line in open(self.second_term_file)]
            self.action = 'group'
        elif self.args['hist']:
            self.action = 'hist'
        elif self.args['rawcount']:
            self.action = 'rawcount'

    @staticmethod
    def read_command_line():
        return docopt(__doc__, version='kernel-density-estimation v0.1')

    def run(self):
        if self.args['--debug']:
            if self.nostem:
                with open(self.nostem) as f:
                    nostem_words = set(f.read().splitlines())
            else:
                nostem_words = []

            for term in self.terms:
                if not term in nostem_words:
                    self.debug.print_debug(self, u'{0} will be stemmed to {1}'.format(term, Text.show_stem(term)))
                else:
                    self.debug.print_debug(self, u'{0} will not be stemmed'.format(term))

            if self.action == 'group':
                for term in self.second_terms:
                    if not term in nostem_words:
                        self.debug.print_debug(self, u'{0} will be stemmed to {1}'.format(term, Text.show_stem(term)))
                    else:
                        self.debug.print_debug(self, u'{0} will not be stemmed'.format(term))

        file_list = listdir(self.in_dir)

        for file_name in file_list:
            if file_name.endswith(".txt"):
                self.plot(file_name)

    def plot(self, file_name):
        self.debug.print_debug(self, u'Loading ' + file_name)
        textplot = Text.from_file(join(self.in_dir, file_name), self.debug, nostem=self.nostem)

        self.debug.print_debug(self, u'Plotting ' + file_name)
        if self.action == 'single':
            graph = textplot.plot_terms(self.terms, self.caption)

        elif self.action == 'group':
            graph = textplot.plot_terms_two_groups(self.terms, self.term_name, self.second_terms,self.second_term_name, self.caption)

        elif self.action == 'hist':
            graph = textplot.plot_terms_histogram(self.terms, self.caption, 5000)
        elif self.action == 'rawcount':
            graph = textplot.plot_terms_raw_count(self.terms, self.caption, 5000)

        self.debug.print_debug(self, u'Saving ' + file_name.replace('.txt', '.png'))
        graph.savefig(join(self.in_dir, file_name.replace('.txt', '.png')))

        graph.close()
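
The plotting itself happens inside the Text class, which is not part of this listing. As a rough standalone sketch of the underlying idea (smooth the token offsets at which a term occurs with a Gaussian kernel and plot the estimated density), something like the following works; the toy text, tokenisation and default bandwidth are arbitrary choices, not those of the project:

import numpy as np
from scipy.stats import gaussian_kde
import matplotlib
matplotlib.use('Agg')                       # render to a file, no display required
import matplotlib.pyplot as plt

tokens = ('whale ship sea whale captain sea storm whale ' * 40).split()
offsets = np.array([i for i, tok in enumerate(tokens) if tok == 'whale'], dtype=float)

density = gaussian_kde(offsets)             # kernel density estimate over token offsets
xs = np.linspace(0, len(tokens), 500)

plt.plot(xs, density(xs), label='whale')
plt.xlabel('token offset')
plt.ylabel('estimated density')
plt.title('Term Plot')                      # mirrors the default --caption above
plt.legend()
plt.savefig('whale.png')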
Example #10
class MPT(Debuggable):
    """
    MPT class object, which initializes the properties and defines the methods.

    """
    def __init__(self):

        self.args = self.read_command_line()
        self.debug = Debug()
        self.settings = Settings(self.args)
        self.gv = GV(self.settings)
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()

        self.current_result = datetime.datetime.now().strftime(
            "%Y_%m_%d-%H-%M-%S-") + str(uuid.uuid4())[:4]
        self.config = None
        self.all_typesetters = None
        self.script_folder = os.path.dirname(os.path.realpath(__file__))

    @staticmethod
    def read_command_line():
        """
        Reads the command-line parameters and generates a docopt dictionary.

        Returns
        -------
        docopt : dictionary
          A dictionary where keys are the names of command-line elements and values are the parsed
          values of those elements.
        """
        return docopt(__doc__, version='heiMPT 0.0.1')

    def get_module_name(self):
        """
        Reads the name of the module for debugging and logging

        Returns
        -------
        name : str
            Name of the module
        """
        name = 'heiMPT'
        return name

    def call_typesetter(self, args):
        """Runs  typesetter with given arguments

        Creates the execution path for a typesetter or an application and runs it as a system process. Output,
        exit code and system error codes are captured and returned.


        Parameters
        ----------
        args : list
            application arguments in the correct order.


        Returns
        -------
        output : str
            system standard output.
        err : str
            system standard error.
        exit_code : int
            system exit code.

        See Also
        --------
        subprocess.Popen()

        """
        args_str = ' '.join(args)

        if ': ' in args_str:

            args_str = args_str.replace(': ', ':')
            self.debug.print_debug(
                self,
                "Merging command: file into command:file, can be a problem for some applications"
            )
        m = args_str.strip().split(' ')
        process = Popen(m, stdout=PIPE)
        output, err = process.communicate()
        exit_code = process.wait()
        return output, err, exit_code

    def arguments_parse(self, t_props):
        """
        Reads typesetter properties from the JSON configuration and creates the arguments.


        Parameters
        ----------
        t_props : dictionary
            typesetter properties


        Returns
        -------
        args : list
            application execution path and arguments in the correct order.

        """

        args = []
        if t_props.get('executable'):
            args = [t_props.get('executable')]
        else:
            self.debug.print_debug(
                self, self.gv.TYPESETTER_EXECUTABLE_VARIABLE_IS_UNDEFINED)
            sys.exit(1)
        arguments = t_props.get("arguments")
        if arguments:
            arguments = collections.OrderedDict(sorted(arguments.items()))
            for a in arguments:
                args.append(arguments[a])
        return args

    def create_output_path(self, p, p_id, args, prefix, uid):
        """
        Creates the output path for  the current file

        Output folder is  constructed using project_name, current_time,  sequence number of the current typesetter
        and the sequence number of the current file.

        Parameters
        ---------
        p: dictionary
            json program properties
        p_id:  int
            typesetter id
        args : list
            application arguments in the correct order.
        prefix: str
            file name prefix  of  the current file
        uid: str
            unique id of the current typesetter

        Returns
        --------
        True: boolean
            Returns True if the output file is created

        See Also
        --------
        os.makedirs()

        """
        config_args = p.get('typesetters')[p_id].get("arguments")
        if config_args is None:
            self.debug.print_debug(self,
                                   self.gv.TYPESETTER_ARGUMENTS_NOT_DEFINED)
            sys.exit(1)
        ts_args = collections.OrderedDict(sorted(config_args.items()))
        out_type = p.get('typesetters')[p_id].get("out_type")
        out_path = os.path.join(p.get('path'), uid)

        for i in ts_args:
            arg = ts_args[i]

            if arg == '--create-dir':
                args.append(out_path)

            else:
                args.append(arg)
        self.debug.print_debug(self, '{} {}'.format('Execute', ' '.join(args)))
        return True

    def run_typesetter(self, p, pre_path, pre_out_type, p_id, uid, f_id,
                       f_name, args):
        """
        Creates the temporary output path, calls the typesetter and writes the output to the correct path for a
        certain file

        Parameters
        ---------
        p: dictionary
            json program properties
        pre_path: str
            project path of the previous iteration
        pre_out_type : str
            output type of the previous iteration
        p_id:  int
            typesetter id
        uid: str
            unique id of the current typesetter
        f_id:  int
              sequence number of the current file
        f_name:  str
              name of the current file
        args : list
            application arguments in the correct order.

        Returns
        --------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter

        See Also
        --------

        call_typesetter, organize_output

        """

        p_path = ''
        pf_type = ''
        prefix = f_name.split('.')[0]
        if p_id == min(i for i in p['typesetters']):
            f_path = os.path.join(p.get('path'), f_name)

        elif p.get("chain"):
            f_path = os.path.join(pre_path, prefix + '.' + pre_out_type)

        if os.path.isfile(f_path) or p['typesetters'].get(p_id).get('expand'):
            self.debug.print_console(
                self, '\t{}:\t {} '.format('Processing', prefix))
            self.gv.log.append(prefix)
            args.append(f_path)
            self.create_output_path(p, p_id, args, prefix, uid)
            output, err, exit_code = self.call_typesetter(args)
            self.debug.print_debug(self, output.decode('utf-8'))
            p_path = self.organize_output(p, p_id, prefix, f_id, uid, args)

            pf_type = p.get('typesetters')[p_id].get("out_type")

        else:
            self.debug.print_debug(
                self, self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST + ' ' +
                os.path.join(f_path))

        return p_path, pf_type

    def typeset_file(self, p, pre_path, pre_out_type, p_id, uid, f_id, f_name):
        """
        Typesets the current file

        Parameters
        ---------
        p: dictionary
            json program properties
        pre_path: str
            project path of the previous iteration
        pre_out_type : str
            output type of the previous iteration
        p_id:  int
            typesetter id
        uid: str
            unique id of the current typesetter
        f_id:  int
              sequence number of the current file
        f_name:  str
              name of the current file

        Returns
        --------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter


        See Also
        --------
        run_typesetter

        """
        t_props = self.all_typesetters.get(
            p.get('typesetters')[p_id].get("name"))
        p_path, pf_type = '', ''

        if t_props:
            mt = self.arguments_parse(t_props)
            if self.gv.check_program(t_props.get('executable')):
                p_path, pf_type = self.run_typesetter(p, pre_path,
                                                      pre_out_type, p_id, uid,
                                                      f_id, f_name, mt)

            else:
                self.debug.print_debug(
                    self,
                    t_props.get('executable') +
                    self.gv.TYPESETTER_BINARY_IS_UNAVAILABLE)
        else:
            self.debug.print_debug(self,
                                   self.gv.PROJECT_TYPESETTER_IS_NOT_AVAILABLE)
        return p_path, pf_type

    def typeset_files(self, p, pre_path, pre_out_type, pre_id):
        """
        Typeset all files of a  certain project

        Parameters
        ---------
        p: dictionary
            json program properties
        pre_path: str
            project path of the previously executed typesetter
        pre_out_type: str
            project file type of the previously executed typesetter
        pre_id : int
            id of the typesetter to run in this step

        Returns
        --------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter


        See Also
        --------
        typeset_file

        """
        p_path, pf_type = '', ''

        uid = str(uuid.uuid4())

        project_files = collections.OrderedDict(
            sorted((int(key), value)
                   for key, value in list(p.get('files').items())))
        if p.get('typesetters')[pre_id].get("expand"):
            f_name = self.gv.uuid
            p_path, pf_type = self.typeset_file(p, pre_path, pre_out_type,
                                                pre_id, uid, 0, f_name)

        else:
            for f_id in project_files:
                f_name = project_files[f_id]
                p_path, pf_type = self.typeset_file(p, pre_path, pre_out_type,
                                                    pre_id, uid, f_id, f_name)

        return p_path, pf_type

    def typeset_project(self, p):
        """
        Typesets a certain project

        Parameters
        ---------
        p: dictionary
            json program properties

        Returns
        --------
        True: boolean
            Returns True if all the typesetters in the project have run successfully.


        See Also
        --------
        typeset_files

        """
        typesetters_ordered, temp_path, temp_pre_out_type = '', '', ''
        pre_path = ''
        prev_out_type = ''

        if p.get('active'):
            self.debug.print_console(self, 'PROJECT : ' + p.get('name'))
            self.gv.log.append(p.get("name"))
            ts = p.get('typesetters')
            if ts:
                typesetters_ordered = collections.OrderedDict(
                    sorted(ts.items()))
            else:
                self.debug.print_debug(
                    self, self.gv.PROJECT_TYPESETTERS_ARE_NOT_SPECIFIED)

            if self.all_typesetters is None:
                self.debug.print_debug(
                    self, self.gv.PROJECT_TYPESETTER_VAR_IS_NOT_SPECIFIED)
                sys.exit(1)

            for p_id in typesetters_ordered:
                self.debug.print_console(
                    self, ' '.join([
                        'Step', p_id, ':', '\t',
                        p.get('typesetters')[p_id].get("name")
                    ]))
                self.gv.log.append('{} {}'.format(
                    p_id,
                    p.get('typesetters')[p_id].get("name")))
                temp_path, temp_pre_out_type = self.typeset_files(
                    p, pre_path, prev_out_type, p_id)

                pre_path = temp_path
                prev_out_type = temp_pre_out_type

        else:
            self.debug.print_debug(
                self, self.gv.PROJECT_IS_NOT_ACTIVE + ' ' + p.get('name'))
        return True

    def typeset_all_projects(self):
        """
        Typeset all projects defined in the json file

        Returns
        --------
        True: boolean
            Returns True if the typesetters of every project have been run

        See Also
        --------
        typeset_project

        """
        projects = self.config.get('projects')
        if projects:
            for p in projects:
                self.typeset_project(p)

        else:
            self.debug.print_debug(self, self.gv.PROJECTS_VAR_IS_NOT_SPECIFIED)
        return True
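
    # Configuration sketch (hedged): a minimal, hypothetical json file of the shape
    # these methods expect -- a top-level "typesetters" map (name -> executable) plus
    # a "projects" list whose entries carry "files" and per-step "typesetters" keyed
    # by their execution order:
    #
    #   {
    #     "typesetters": {"xsltproc": {"executable": "xsltproc"}},
    #     "projects": [{
    #       "name": "demo", "path": "/tmp/demo", "active": true, "chain": true,
    #       "files": {"1": "chapter1.xml"},
    #       "typesetters": {"1": {"name": "xsltproc", "out_type": "xml",
    #                             "process": true,
    #                             "arguments": {"1": "stylesheet.xsl"}}}
    #     }]
    #   }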

    def organize_output(self, p, p_id, prefix, f_id, uid, args):
        """
        Copy the temporary results into the  final project path

        This method reads the temporary results of the current typesetter step and copies them into the correct output
        folder. The output folder is constructed from the project name, the current time, the sequence number of the
        current typesetter and the sequence number of the current file. Customized tool-specific actions are also handled here.



        Parameters
        ------------
        p: dict
            json program properties
        p_id:  int
            typesetter id
        prefix: str
            file name prefix  of  the current file
        f_id:  int
              sequence number of the current file
        uid: str
            unique id of the current typesetter
        args: list
            tool parameters; the executable file is the first element
        Returns
        --------
        project_path: str
            Final path for the current file


        See Also
        --------
        create_merged_file, gv.create_dirs_recursive

        """
        p_name = p.get('typesetters')[p_id].get("name")

        t_path = [p.get('path'), uid]
        if args:
            if any('meTypeset.py' in arg for arg in args):
                t_path += ['nlm']
        else:
            t_path += [p.get('path'), uid]

        out_type = p['typesetters'][p_id].get('out_type')

        if out_type is None:
            self.debug.print_console(
                self, self.gv.PROJECT_OUTPUT_FILE_TYPE_IS_NOT_SPECIFIED)
            sys.exit(1)
        project_path = [
            p.get('path'), p['name'], self.current_result, p_id + '_' + p_name,
            out_type
        ]

        temp_dir = os.path.join(p.get('path'), uid)

        if p['typesetters'][p_id].get('merge'):
            self.create_merged_file(p, p_id, project_path, t_path)
            if len(list(p.get('files').items())) == f_id:
                shutil.rmtree(temp_dir)
        elif p['typesetters'][p_id].get('expand'):
            for filename in os.listdir(temp_dir):
                p_path = self.gv.create_dirs_recursive(project_path)
                f_path = '{}{}{}'.format(p_path, SEP, filename)
                os.rename(os.path.join(temp_dir, filename), f_path)
            shutil.rmtree(temp_dir)
        elif p['typesetters'][p_id].get('process'):
            if p_name.lower() == 'metypeset' and not os.path.exists(
                    SEP.join(t_path)):
                t_path.append('nlm')
            t_path.append(prefix + '.' + out_type)
            p_path = self.gv.create_dirs_recursive(project_path)
            f_path = '{}{}{}.{}'.format(p_path, SEP, prefix, out_type)
            try:
                os.rename(SEP.join(t_path), f_path)
                shutil.rmtree(temp_dir)
            except FileNotFoundError:
                print('File not found\t{}'.format(SEP.join(t_path)))
                sys.exit(1)

        else:
            self.debug.print_debug(
                self, self.gv.PROJECT_TYPESETTER_PROCESS_METHOD_NOT_SPECIFIED)
        if len(list(p.get('typesetters').items())) == int(p_id) and int(
                f_id) == len(list(p.get('files').items())):
            zip_path = ''.join([p.get('path'), SEP, p['name']])
            shutil.make_archive('{}/{}'.format(zip_path, p.get("name")), 'zip',
                                zip_path)

        return SEP.join(project_path)
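
    # Layout sketch (derived from project_path above, with hypothetical values):
    # the results of each step land under
    #   <path>/<project-name>/<YYYY_MM_DD-HH-MM-xxxxxxxx>/<p_id>_<typesetter-name>/<out_type>/
    # and, once the last typesetter has handled the last file, the whole
    # <path>/<project-name> tree is archived as <project-name>.zip inside that folder.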

    def create_merged_file(self, p, p_id, project_path, t_path):
        """
        Create a combined file from a set of input files

        Parameters
        ------------
        p: dict
            json program properties
        p_id:  int
            typesetter id
        t_path : list
            temporary output path components
        project_path : list
            output path components to be created

        See Also
        --------
        create_named_file()


        """
        t_path.append(self.gv.uuid)
        p_path = self.gv.create_dirs_recursive(project_path)

        f_path = '{}{}{}.xml'.format(p_path, SEP, self.gv.uuid)
        shutil.copy2(SEP.join(t_path), f_path)
        self.create_named_file(p, p_id, p_path, t_path)
        return f_path

    def create_named_file(
        self,
        p,
        p_id,
        p_path,
        t_path,
    ):
        """
        Copy the uniquely named output file to a named file, if one is configured

        Parameters
        ----------
        p: dict
            json program properties
        p_id:  int
            typesetter id
        t_path : list
            temporary output path components
        p_path : str
            output directory for the current typesetter

        """
        f = p['typesetters'][p_id].get('out_file')
        if f:
            shutil.copy2(SEP.join(t_path), '{}{}{}'.format(p_path, SEP, f))
        return

    def run_modules(self):
        """
        Run MPT in module mode

        """
        # Run import modules
        if self.args.get('import'):
            sys.path.insert(
                0, os.path.join(self.script_folder, 'plugins', 'import'))
            import ImportInterface
            if self.args.get('omp'):
                m = "omp"
                plugin_package = __import__(m, fromlist=['*'])
                plugin_module = getattr(plugin_package, m)
                # Find class inheriting from the Import abstract class in the module
                for name in dir(plugin_module):
                    candidate = getattr(plugin_module, name)
                    if inspect.isclass(candidate)\
                            and issubclass(candidate, ImportInterface.Import)\
                            and candidate is not ImportInterface.Import:
                        plugin_class = candidate
                        print(("Found import plugin", name, plugin_class))
                        plugin = plugin_class()
                        self.debug.print_console(self, str(self.args))
                        plugin.run(self.args,
                                   {'base-path': self.script_folder})

                # try:
                #    plugin_module = __import__(m)
                #    plugin_module.plugin.run()
                # except Exception as e:
                #    print('{} {}: {}'.format(m, 'method  import failed', e))
                #    sys.exit(0)
        else:
            self.debug.fatal_error(self, "Unsupported arguments: " + self.args)
        return
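
    # Plugin sketch (hedged): run_modules() looks for a concrete subclass of
    # ImportInterface.Import inside plugins/import/omp/omp.py, so a minimal
    # (hypothetical) plugin could look like this:
    #
    #   import ImportInterface
    #
    #   class OMPImport(ImportInterface.Import):
    #       def run(self, args, settings):
    #           # args is the docopt dictionary, settings carries 'base-path'
    #           print('importing from OMP with', settings['base-path'])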

    def check_applications(self):
        """
        Check if program binaries are available 

        """
        ps = self.config.get('projects')
        psf = [s for s in ps if s.get('active')]  # active projects (currently unused)
        ts = self.config.get('typesetters')

        for p in [ts[i]['arguments'] for i in ts]:
            for k in [
                    j for j in list(p.values()) if j.find('--formatter') == 0
            ]:
                for l in k.split('=')[1].split(','):
                    if not self.gv.check_program(self.gv.apps.get(l.lower())):
                        self.debug.fatal_error(
                            self, '{} {}'.format(
                                self.gv.apps.get(l.lower()),
                                self.gv.TYPESETTER_BINARY_IS_UNAVAILABLE))
                        sys.exit(1)

        for p in [ts[i]['executable'] for i in ts]:
            if not self.gv.check_program(p):
                self.debug.fatal_error(
                    self, '{} {}'.format(
                        p, self.gv.TYPESETTER_BINARY_IS_UNAVAILABLE))
                sys.exit(1)
class KernelDensity (Debuggable):
    def __init__(self):
        # read  command line arguments
        self.args = self.read_command_line()

        # absolute first priority is to initialize debugger so that anything triggered here can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'plotsummary')

        self.in_dir = self.args['<directory>']

        if self.args['<term_file>']:
            self.term_file = self.args['<term_file>']

            self.terms = [line.strip().lower() for line in open(self.term_file)]

        elif self.args["<first_term>"] and self.args["<second_term>"]:
            self.terms = []
            self.terms.append(self.args["<first_term>"])
            self.terms.append(self.args["<second_term>"])

        elif self.args["<term>"]:
            self.terms = []
            self.terms.append(self.args["<term>"])

        if self.args["<count>"]:
            self.max = int(self.args["<count>"])

        self.dir = os.path.dirname(os.path.abspath(__file__))

        if self.args['--debug']:
            self.debug.enable_debug()

        self.debug.enable_prompt(Interactive(self.args['--debug']))

        if self.args['--caption']:
            self.caption = self.args['--caption']
        else:
            self.caption = 'Term Plot'

        if self.args['--nostem']:
            self.nostem = self.args['--nostem']
        else:
            self.nostem = None

        if self.args['single']:
            self.action = 'single'
        elif self.args['group']:
            self.second_term_file = self.args['<second_term_file>']
            self.term_name = self.args['<term_name>']
            self.second_term_name = self.args['<second_term_name>']
            self.second_terms = [line.strip().lower() for line in open(self.second_term_file)]
            self.action = 'group'
        elif self.args['hist']:
            self.action = 'hist'
        elif self.args['rawcount']:
            self.action = 'rawcount'
        elif self.args['overlap']:
            self.action = 'overlap'
        elif self.args['search']:
            self.action = 'search'

        if self.args['--words']:
            self.words = int(self.args['--words'])
        else:
            self.words = 5000

    @staticmethod
    def read_command_line():
        return docopt(__doc__, version='kernel-density-estimation v0.1')

    def run(self):
        if self.args['--debug']:
            if self.nostem:
                with open(self.nostem) as f:
                    nostem_words = set(f.read().splitlines())
            else:
                nostem_words = []

            for term in self.terms:
                if term not in nostem_words and term != Text.show_stem(term):
                    self.debug.print_debug(self, u'{0} will be stemmed to {1}'.format(term, Text.show_stem(term)))
                else:
                    self.debug.print_debug(self, u'{0} will not be stemmed'.format(term))

            if self.action == 'group':
                for term in self.second_terms:
                    if term not in nostem_words:
                        self.debug.print_debug(self, u'{0} will be stemmed to {1}'.format(term, Text.show_stem(term)))
                    else:
                        self.debug.print_debug(self, u'{0} will not be stemmed'.format(term))

        file_list = listdir(self.in_dir)

        for file_name in file_list:
            if file_name.endswith(".txt"):
                self.plot(file_name)

    def plot(self, file_name):
        self.debug.print_debug(self, u'Loading ' + file_name)

        textplot = Text.from_file(join(self.in_dir, file_name), self.debug, nostem=self.nostem)

        self.debug.print_debug(self, u'Plotting ' + file_name)

        if self.action == 'single':
            graph = textplot.plot_terms(self.terms, self.caption)

        elif self.action == 'group':
            graph = textplot.plot_terms_two_groups(self.terms, self.term_name, self.second_terms, self.second_term_name, self.caption)

        elif self.action == 'hist':
            graph = textplot.plot_terms_histogram(self.terms, self.caption, self.words)

        elif self.action == 'rawcount':
            graph = textplot.plot_terms_raw_count(self.terms, self.caption, self.words)

        elif self.action == 'overlap':
            graph = textplot.plot_kde_overlap(self.terms)

        elif self.action == 'search':
            newterms = textplot.anchored_scores(self.terms[0])

            count = 0
            self.debug.print_(self, u'Top twenty correlated terms (with more than one occurrence) for {0}: '.format(self.terms[0]))

            for item in newterms:
                if len(textplot.terms[item]) > 1 and item != textplot.stem(self.terms[0]):
                    if count > self.max:
                        break

                    self.debug.print_(self, item)
                    count += 1

        if self.action != 'search':
            self.debug.print_debug(self, u'Saving ' + file_name.replace('.txt', '.png'))

            graph.savefig(join(self.in_dir, file_name.replace('.txt', '.png')))
            graph.close()
Beispiel #12
0
class MPT(Debuggable):
    """
    MPT Class Object,  which initializes the properties and defines the methods.

    """

    def __init__(self):

        self.args = self.read_command_line()
        self.debug = Debug()
        self.gv = GV()
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()

        self.current_result = datetime.datetime.now().strftime(
            "%Y_%m_%d-%H-%M-") + str(uuid.uuid4())[:8]
        self.config = self.gv.read_json(self.args['<config_file>'])
        self.all_typesetters = self.config.get('typesetters')

    def run(self):
        """
        Runs the MPT  Module, which typesets all the projects defined in the json input file

        Returns
        --------
        True: boolean
            Returns True if all the projects are typeset

        See Also
        --------
        typeset_all_projects

        """
        self.typeset_all_projects()
        return True

    @staticmethod
    def read_command_line():
        """
        Reads and  generates a docopt dictionary from the command line parameters.

        Returns
        -------
        docopt : dictionary
          A dictionary where keys are the names of command-line elements and values are the parsed values of those
          elements.
        """
        return docopt(__doc__, version='mpt 0.0.1')

    def get_module_name(self):
        """
        Reads the name of the module for debugging and logging

        Returns
        -------
        name : str
            Name of the module
        """
        name = 'MPT'
        return name

    def call_typesetter(self, args):
        """Runs  typesetter with given arguments

        Creates the execution path for a typesetter or an application and runs it  as a system process. Output,
        exit-code and  system error codes are captured and returned.


        Parameters
        ----------
        args : list
            application arguments in the correct order.


        Returns
        -------
        output : str
            system standard output.
        err : str
            system standard error.
        exit_code : int
            system exit code.

        See Also
        --------
        subprocess.Popen()

        """
        m = ' '.join(args).strip().split(' ')
        self.debug.print_console(self, ' '.join(m))
        process = Popen(m, stdout=PIPE)
        output, err = process.communicate()
        exit_code = process.wait()
        return output, err, exit_code
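
    # Usage sketch (hypothetical arguments; the executable must be on the PATH):
    #
    #   output, err, exit_code = mpt.call_typesetter(['xmllint', '--noout', 'in.xml'])
    #   if exit_code != 0:
    #       mpt.debug.print_debug(mpt, output.decode('utf-8'))
    #
    # Note that only stdout is piped here, so err is expected to be None.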

    def arguments_parse(self, t_props):
        """
        Reads typesetter properties from the json configuration and creates the argument list.


        Parameters
        ----------
        t_props : dictionary
            typesetter properties


        Returns
        -------
        args : list
            application execution path and arguments in the correct order.

        """

        args = []
        if t_props.get('executable'):
            args = [t_props.get('executable')]
        else:
            self.debug.print_debug(
                self, self.gv.TYPESETTER_EXECUTABLE_VARIABLE_IS_UNDEFINED)
            sys.exit(1)
        arguments = t_props.get("arguments")
        if arguments:
            arguments = collections.OrderedDict(sorted(arguments.items()))
            for a in arguments:
                args.append(arguments[a])
        return args
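
    # Example (hedged, with a hypothetical typesetter entry): the arguments map is
    # sorted by key and appended after the executable, so
    #
    #   arguments_parse({'executable': 'xsltproc',
    #                    'arguments': {'1': '--novalid', '2': '--xinclude'}})
    #
    # would return ['xsltproc', '--novalid', '--xinclude'].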

    def create_output_path(
            self,
            p,
            p_id,
            args,
            prefix,
            uid):
        """
        Creates the output path for  the current file

        Output folder is  constructed using project_name, current_time,  sequence number of the current typesetter
        and the sequence number of the current file.

        Parameters
        ---------
        p: dictionary
            json program properties
        p_id:  int
            typesetter id
        args : list
            application arguments in the correct order.
        prefix: str
            file name prefix  of  the current file
        uid: str
            unique id of the current typesetter

        Returns
        --------
        True: boolean
            Returns True if the output file is created

        See Also
        --------
        os.makedirs()

        """
        ts_args = collections.OrderedDict(
            sorted(p.get('typesetters')[p_id].get("arguments").items()))
        out_type = p.get('typesetters')[p_id].get("out_type")
        out_path = os.path.join(p.get('path'), uid)

        for i in ts_args:
            arg = ts_args[i]
            if arg == 'create_output_directory()':
                args.append(out_path)

            elif arg == 'create_output_file()':
                if not os.path.exists(out_path):
                    os.makedirs(out_path)
                args.append(
                    os.path.join(
                        out_path,
                        prefix +
                        '.' +
                        out_type))
            else:
                args.append(arg)
        return True
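
    # Example (hedged): with a hypothetical argument map such as
    #   {"1": "-o", "2": "create_output_file()"}
    # the placeholder is replaced by "<project-path>/<uid>/<prefix>.<out_type>"
    # (the directory is created first), while "create_output_directory()" is
    # replaced by the bare "<project-path>/<uid>" directory.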

    def run_typesetter(
            self,
            p,
            pre_path,
            pre_out_type,
            p_id,
            uid,
            f_id,
            f_name,
            args):
        """
        Creates the temporary output path, calls the typesetter and writes the output to the correct path for a
        certain file

        Parameters
        ---------
        p: dictionary
            json program properties
        pre_path: str
            project path of the previous iteration
        pre_out_type : str
            output type of the previous iteration
        p_id:  int
            typesetter id
        uid: str
            unique id of the current typesetter
        f_id:  int
              sequence number of the current file
        f_name:  str
              name of the current file
        args : list
            application arguments in the correct order.

        Returns
        --------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter

        See Also
        --------

        call_typesetter, organize_output

        """

        p_path = ''
        pf_type = ''
        prefix = f_name.split('.')[0]
        if p_id == min(i for i in p['typesetters']):
            f_path = os.path.join(p.get('path'), f_name)

        elif p.get("chain"):
            f_path = os.path.join(pre_path, prefix + '.' + pre_out_type)

        if os.path.isfile(f_path) or p['typesetters'].get(p_id).get('expand'):
            args.append(f_path)
            self.create_output_path(p, p_id,  args, prefix, uid)
            output, err, exit_code = self.call_typesetter(args)
            self.debug.print_debug(self, output.decode('utf-8'))
            p_path = self.organize_output(
                p,
                p_id,
                prefix,
                f_id,
                uid)

            pf_type = p.get('typesetters')[p_id].get("out_type")

        else:
            self.debug.print_debug(
                self,
                self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST + ' ' +
                os.path.join(f_path))

        return p_path, pf_type

    def typeset_file(
            self,
            p,
            pre_path,
            pre_out_type,
            p_id,
            uid,
            f_id,
            f_name
    ):
        """
        Typesets the current file

        Parameters
        ---------
        p: dictionary
            json program properties
        pre_path: str
            project path of the previous iteration
        pre_out_type : str
            output type of the previous iteration
        p_id:  int
            typesetter id
        uid: str
            unique id of the current typesetter
        f_id:  int
              sequence number of the current file
        f_name:  str
              name of the current file

        Returns
        --------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter


        See Also
        --------
        run_typesetter

        """
        t_props = self.all_typesetters.get(p.get('typesetters')[p_id].get("name"))
        p_path, pf_type = '',''

        if t_props:
            mt = self.arguments_parse(t_props)
            if self.gv.check_program(t_props.get('executable')):
                p_path, pf_type = self.run_typesetter(
                    p,
                    pre_path,
                    pre_out_type,
                    p_id,
                    uid,
                    f_id,
                    f_name,
                    mt)

            else:
                self.debug.print_debug(self, self.gv.TYPESETTER_BINARY_IS_UNAVAILABLE)
        else:
            self.debug.print_debug(
                self, self.gv.PROJECT_TYPESETTER_IS_NOT_AVAILABLE)
        return p_path, pf_type

    def typeset_files(
            self,
            p,
            pre_path,
            pre_out_type,
            pre_id):
        """
        Typeset all files of a  certain project

        Parameters
        ---------
        p: dictionary
            json program properties
        pre_path: str
            project path of the previously executed typesetter
        pre_out_type: str
            project file type of the previously executed typesetter
        pre_id : int
            id of the typesetter to run in this step

        Returns
        --------
        p_path : str
            project output path of the current typesetter
        pf_type : str
            project file type of the current typesetter


        See Also
        --------
        typeset_file

        """
        p_path, pf_type = '', ''

        uid = str(uuid.uuid4())

        project_files = collections.OrderedDict(
            sorted((int(key), value) for key, value in p.get('files').items()))
        if p.get('typesetters')[pre_id].get("expand"):
            f_name = self.gv.uuid
            p_path, pf_type = self.typeset_file(
                p,
                pre_path,
                pre_out_type,
                pre_id,
                uid,
                0,
                f_name
            )


        else:
            for f_id in project_files:
                f_name = project_files[f_id]
                p_path, pf_type = self.typeset_file(
                        p,
                        pre_path,
                        pre_out_type,
                        pre_id,
                        uid,
                        f_id,
                        f_name
                    )

        return p_path, pf_type

    def typeset_project(self, p):
        """
        Typesets a certain project

        Parameters
        ---------
        p: dictionary
            json program properties

        Returns
        --------
        True: boolean
            Returns True if all the typesetters in the project have run successfully.


        See Also
        --------
        typeset_files

        """
        typesetters_ordered, temp_path, temp_pre_out_type = '', '', ''
        pre_path = ''
        prev_out_type = ''

        if p.get('active'):
            ts = p.get('typesetters')
            if ts:
                typesetters_ordered = collections.OrderedDict(
                    sorted(ts.items()))
            else:
                self.debug.print_debug(
                    self, self.gv.PROJECT_TYPESETTERS_ARE_NOT_SPECIFIED)

            if self.all_typesetters is None:
                self.debug.print_debug(
                    self, self.gv.PROJECT_TYPESETTER_VAR_IS_NOT_SPECIFIED)
                sys.exit(1)

            for p_id in typesetters_ordered:
                self.debug.print_console(self, ' '.join(['Running Typesetter', p_id, ':', p.get('typesetters')[p_id].get("name")]))
                temp_path, temp_pre_out_type = self.typeset_files(
                    p,
                    pre_path,
                    prev_out_type,
                    p_id
                )

                pre_path = temp_path
                prev_out_type = temp_pre_out_type
                self.debug.print_console(self, ' '.join(['ls -al',temp_path]))

        else:
            self.debug.print_debug(self, self.gv.PROJECT_IS_NOT_ACTIVE)
        return True

    def typeset_all_projects(self):
        """
        Typeset all projects defined in the json file

        Returns
        --------
        True: boolean
            Returns True if the typesetters of every project have been run

        See Also
        --------
        typeset_project

        """
        projects = self.config.get('projects')
        if projects:
            for p in projects:
                self.typeset_project(p)

        else:
            self.debug.print_debug(self, self.gv.PROJECTS_VAR_IS_NOT_SPECIFIED)
        return True


    def organize_output(
            self,
            p,
            p_id,
            prefix,
            f_id,
            uid):
        """
        Copy the temporary results into the  final project path

        This method reads the temporary results of the current typesetter step and copies them into the correct output
        folder. The output folder is constructed from the project name, the current time, the sequence number of the
        current typesetter and the sequence number of the current file. Customized tool-specific actions are also handled here.



        Parameters
        ------------
        p: dict
            json program properties
        p_id:  int
            typesetter id
        prefix: str
            file name prefix  of  the current file
        f_id:  int
              sequence number of the current file
        uid: str
            unique id of the current typesetter

        Returns
        --------
        project_path: str
            Final path for the current file


        See Also
        --------
        create_merged_file, gv.create_dirs_recursive

        """
        p_name = p.get('typesetters')[p_id].get("name")
        t_path = ([p.get('path'), uid, 'nlm']
                  if p_name == 'metypeset' else [p.get('path'), uid])
        out_type = p['typesetters'][p_id]['out_type']
        project_path = [p.get('path'), p['name'], self.current_result,
                        p_id + '_' + p_name, out_type]
        temp_dir = os.path.join(p.get('path'), uid)

        if p['typesetters'][p_id].get('merge'):
            self.create_merged_file(p, p_id, project_path, t_path)
            if len(p.get('files').items()) == f_id:
                shutil.rmtree(temp_dir)
        elif p['typesetters'][p_id].get('expand'):
            for filename in os.listdir(temp_dir):
                p_path = self.gv.create_dirs_recursive(project_path)
                f_path = '{}{}{}'.format(p_path, SEP, filename)
                os.rename(os.path.join(temp_dir, filename), f_path)
            shutil.rmtree(temp_dir)
        elif p['typesetters'][p_id].get('process'):
            t_path.append(prefix + '.' + out_type)
            p_path = self.gv.create_dirs_recursive(project_path)
            f_path = '{}{}{}.{}'.format(p_path, SEP, prefix, out_type)
            os.rename(SEP.join(t_path), f_path)
            shutil.rmtree(temp_dir)
        else:
            self.debug.print_debug(self, self.gv.PROJECT_TYPESETTER_PROCESS_METHOD_NOT_SPECIFIED)

        #self.debug.print_console(self, '{}  {}'.format(self.gv.OUTPUT,f_path))

        return SEP.join(project_path)

    def create_merged_file(self, p, p_id, project_path, t_path):
        """
        Create a combined file from a set of input files

        Parameters
        ------------
        p: dict
            json program properties
        p_id:  int
            typesetter id
        t_path : list
            temporary output path components
        project_path : list
            output path components to be created

        See Also
        --------
        create_named_file()


        """
        t_path.append(self.gv.uuid)
        p_path = self.gv.create_dirs_recursive(project_path)
        f_path = '{}{}{}.xml'.format(p_path, SEP, self.gv.uuid)
        shutil.copy2(SEP.join(t_path), f_path)
        self.create_named_file(p, p_id, p_path, t_path)
        return f_path

    def create_named_file(self, p, p_id, p_path, t_path):
        """
        Copy the uniquely named output file to a named file, if one is configured

        p: dict
            json program properties
        p_id:  int
            typesetter id
        t_path : list
            temporary output path components
        p_path : str
            output directory for the current typesetter

        """
        f = p['typesetters'][p_id].get('out_file')
        if f:
            f_path = '{}{}{}'.format(p_path, SEP, f)
            shutil.copy2(SEP.join(t_path), f_path)
        return
Beispiel #13
0
class MePrePrint (Debuggable):
    def __init__(self):
        # read  command line arguments
        self.args = docopt(__doc__, version='meTypeset 0.1')

        # initialize debugger
        self.debug = Debug()
        self.debug.enable_debug()
        Debuggable.__init__(self, 'mePrePrint')

        # get arguments
        self.doc_type = self.args['--type']
        self.title = self.args['--article_title']
        self.name = self.args['--author']
        self.copyright_year = self.args['--year']
        self.copyright = self.args['--copyright']
        self.citation = self.args['--citation']
        self.url = self.args['--url']

        if self.doc_type == 'preprint':
            self.version = 'pre-print (not peer reviewed)'
        elif self.doc_type == 'postprint':
            self.version = 'post-print (peer reviewed)'
        elif self.doc_type == 'final':
            self.version = 'final publisher'

    @staticmethod
    def copy(src, dst):
        try:
            shutil.copytree(src, dst)
        except OSError as exc:
            if exc.errno == errno.ENOTDIR:
                shutil.copy(src, dst)
            else: raise

    @staticmethod
    def do_replace(in_string, replace_text, substitute):
        return in_string.replace(replace_text, substitute)

    @staticmethod
    def zip_dir(path, zip_file, final):
        relative = os.path.join(os.path.abspath(os.path.join(path, os.pardir)), final)
        for root, dirs, files in os.walk(path):
            for file_name in files:
                zip_file.write(os.path.join(root, file_name), os.path.relpath(os.path.join(root, file_name),
                                                                              os.path.join(path, relative)))

    def create_coversheet(self, destination):
        # copy the coversheet to a temporary directory
        src = self.args['<input_cover>']
        self.debug.print_debug(self, u'Copying coversheet')
        os.mkdir(os.path.join(destination, u'coversheet'))

        z = zipfile.ZipFile(src, "r")

        z.extractall(os.path.join(destination, u'coversheet'))

        # open the document XML
        self.debug.print_debug(self, u'Replacing cover sheet variables')
        with open(os.path.join(destination, u'coversheet/word/document.xml'), 'r+') as doc_file:
            contents = doc_file.read()

            contents = self.do_replace(contents, '{ARTICLE_TITLE}', self.title)
            contents = self.do_replace(contents, '{AUTHOR_NAME}', self.name)
            contents = self.do_replace(contents, '{VERSION}', self.version)
            contents = self.do_replace(contents, '{JOURNAL_CITATION}', self.citation)
            contents = self.do_replace(contents, '{URL}', self.url)
            contents = self.do_replace(contents, '{COPYRIGHT}', self.copyright)
            contents = self.do_replace(contents, '{COPYRIGHT_YEAR}', self.copyright_year)

            doc_file.seek(0)
            doc_file.write(contents)
            doc_file.truncate()

        self.debug.print_debug(self, u'Replacing cover sheet hyperlinks')
        with open(os.path.join(destination, u'coversheet/word/_rels/document.xml.rels'),
                  'r+') as doc_file:

            contents = doc_file.read()

            contents = self.do_replace(contents, '{URL}', self.url)

            doc_file.seek(0)
            doc_file.write(contents)
            doc_file.truncate()

        # re-package the file into a docx
        z = zipfile.ZipFile(os.path.join(destination, u'final_cover.docx'), "w")

        self.zip_dir(os.path.join(destination, u'coversheet'), z, 'coversheet')

        pdf = os.path.join(destination, u'final_cover.pdf')

        command = 'unoconv -f pdf {0}'.format(os.path.join(destination, u'final_cover.docx'))

        self.debug.print_debug(self, 'Running: {0}'.format(command))

        subprocess.call(command.split(' '))

        # return the path to the generated cover sheet PDF
        return pdf

    def run(self):
        # create temporary directory
        temp_dir = tempfile.mkdtemp()
        self.debug.print_debug(self, u'Making temporary directory {0}'.format(temp_dir))

        # create the coversheet
        pdf = self.create_coversheet(temp_dir)

        # convert the user's document into a PDF
        user_file = os.path.join(temp_dir, u'user.docx')
        user_pdf = os.path.join(temp_dir, u'user.pdf')
        shutil.copy(self.args['<input_article>'], user_file)
        command = 'unoconv -f pdf {0}'.format(user_file)
        self.debug.print_debug(self, 'Running: {0}'.format(command))
        subprocess.call(command.split(' '))

        # join the PDFs
        command = 'pdfunite {0} {1} {2}'.format(pdf, user_pdf, self.args['<output_file>'])
        self.debug.print_debug(self, 'Running: {0}'.format(command))
        subprocess.call(command.split(' '))

        # remove the temporary directory
        self.debug.print_debug(self, u'Removing temporary directory {0}'.format(temp_dir))
        shutil.rmtree(temp_dir)
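
    # External tooling note (assumption): both conversion steps shell out to
    # unoconv (docx -> pdf) and pdfunite (PDF concatenation), so both binaries
    # must be installed and on the PATH, and the coversheet docx must contain the
    # {ARTICLE_TITLE}, {AUTHOR_NAME}, {VERSION}, {JOURNAL_CITATION}, {URL},
    # {COPYRIGHT} and {COPYRIGHT_YEAR} placeholders replaced above.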
Beispiel #14
0
class Prepare(Debuggable):
    """
    Standalone Processing object to combine, clean and modify a JATS XML file and optionally inject BITS Metadata headers.

    Features
    --------
    add Id numbering for any tag type, clean comments, remove unused references,
    set numbering, add unique ids to certain tag types, sort references

    """
    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.settings = Settings(self.args)
        self.gv = GV(self.settings)
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.stand_alone = self.args.get('--stand-alone')
        self.tr = etree.parse(os.path.join(self.dr, self.f))

    @staticmethod
    def read_command_line():
        """
        Reads and  generates a docopt dictionary from the command line parameters.

        Returns
        -------
        docopt : dictionary
          A dictionary where keys are the names of command-line elements and values are the parsed values of those
          elements.
        """
        return docopt(__doc__, version='xml 0.1')

    def citations_to_references(self):
        """ Removes  mixed-citation block, adds as a <sec> Section element

        Returns
         -------
         tr : elementtree

        """

        t = self.tr.getroot()
        bd = t.find('.//body')
        sc = etree.Element('sec')
        ttl = etree.Element('title')
        ttl.text = 'References'
        sc.append(ttl)
        mc = t.findall('.//mixed-citation')
        if len(mc) > 0:
            for r in mc:
                r.tag = 'p'
                sc.append(r)
            bd.append(sc)
            rlst = t.find('.//ref-list')
            rlst.getparent().remove(rlst)
            bck = t.find('.//back')
            bck.append(etree.Element('ref-list'))

        return self.tr

    def clean_references(self):
        """ removes  references, which are not linked.

         Parameters
         -----------
         tag : str
            name of the XML tag

         Returns
         -------
         tr : elementtree

         See Also
         --------
         remove_element, remove_tags

        """
        r = self.tr.getroot()

        for e in r.findall('.//back/ref-list/ref'):
            if e.attrib.get('id'):
                if r.find(".//xref[@ref-type='bibr'][@rid='" +
                          e.attrib.get('id') + "']") is None:
                    self.remove_element(e)
            else:
                self.remove_element(e)
        for e in r.findall(".//xref[@ref-type='bibr']"):
            if r.find(".//back/ref-list/ref[@id='" + e.attrib.get('rid') +
                      "']") is None:
                if e.getparent() is not None:
                    for c in e.getparent().getiterator():
                        if c.tag == 'xref' and c.attrib.get(
                                'ref-type') == 'bibr':
                            self.remove_tags(c)
        return self.tr
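
    # Example (hedged): given
    #   <ref-list><ref id="b1">...</ref><ref id="b2">...</ref></ref-list>
    # and a body that only contains <xref ref-type="bibr" rid="b1"/>, the unused
    # <ref id="b2"> is removed; conversely, an xref whose rid has no matching
    # <ref> is unwrapped (replaced by its own text).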

    def remove_tags(self, e):
        """
        Takes an etree element and replaces it with its own text

        Parameters
        ----------
        e : element
            Element to be replaced

        """
        if e.getparent() is not None:
            previous = e.getprevious()
            if previous is not None:
                if previous.tail:
                    if e.text:
                        previous.tail = previous.tail + e.text
                    if e.tail:
                        previous.tail = previous.tail + e.tail
                    e.getparent().remove(e)

    def remove_element(self, e):
        """
        Remove any element only if it has a parent

        Parameters
        ----------
        e : element
            Element to be removed

        """
        if e.getparent() is not None:
            e.getparent().remove(e)

    def set_uuids_for_back_matter(self, tags):
        """
        Add unique id tags to  any of the sub-elements of the back matter

        Parameters
        ----------
        tags: list
         list of elements

        Returns
        -------
        tr : elementtree

        """
        for s in tags:
            f = {}
            ref_type = 'bibr' if s == 'ref' else s
            fns = self.tr.getroot().findall(''.join(
                ['.//xref/[@ref-type="', ref_type, '"]']))
            for i in fns:
                rid = ''.join(['bibd', str(uuid.uuid4())])
                f[i.attrib['rid']] = rid
                i.set('rid', rid)
            for m in list(f.keys()):
                n = self.tr.getroot().find(''.join(
                    ['.//' + s + '/[@id="', m, '"]']))
                if n is not None:
                    n.set('id', f[m]) if len(n) > 0 else ''
        return self.tr

    def set_numbering_values(self, tag, attr, value, count, range_list):
        """
        Adds numerical values to  a  tag  in arguments list

        Parameters
        ---------
        tag: str
            xml tag name
        attr: str
            attribute name
        value :str
            value name
        count : int
            current sequence number
        range_list : list
           lower and upper level for the  numbering

        See Also
        --------
        set_roman_numbers

        """
        searchTag = './/' + tag + '[@' + attr + '="' + value + '"]'
        elems = self.tr.getroot().findall(searchTag)
        range_count = 1
        for elem in elems:
            elem.text, range_count = self.set_roman_numbers(
                count, range_count, range_list)
            count += 1

        return self.tr, count

    def convert_int_to_roman(self, i):
        """
        Converts an integer number into a roman number

        Parameters
        ---------
        i : int
            integer number

        Returns
        -------
        result : str
            Roman number

        """
        result = []
        for integer, numeral in self.gv.numeral_map:
            count = i // integer
            result.append(numeral * count)
            i -= integer * count
        return ''.join(result)
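
    # Example (assuming gv.numeral_map is the usual descending (value, numeral)
    # sequence such as [(1000, 'M'), (900, 'CM'), ..., (4, 'IV'), (1, 'I')]):
    #
    #   convert_int_to_roman(4)    -> 'IV'
    #   convert_int_to_roman(2016) -> 'MMXVI'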

    def set_roman_numbers(self, count, r_count, range_list):
        """
        Converts a given set of elements defined by range_array into roman numbers

        Parameters
        ---------
        count :int
        r_count : int
        range_list : list
            lower and upper level for the  numbering

        Returns
        -------
        val : str
        r_count: int

        See Also
        --------
        convert_int_to_roman

        """

        val = str(count)
        if int(range_list[0]) <= count <= int(range_list[1]):
            val = self.convert_int_to_roman(r_count).lower()
            r_count += 1
        else:
            val = str(count - r_count + 1)
        return val, r_count
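
    # Example (hedged): starting from count=1, r_count=1 and range_list=[1, 3],
    # successive calls label the first three elements 'i', 'ii', 'iii' and then
    # fall back to arabic numbering, so elements 4 and 5 become '1' and '2'.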

    def merge_metadata(self, metadata):
        """
        Reads a metadata file path and merges its content into the metadata section

        Parameters
        ----------
        metadata : str
             suffix  of the metadata files

        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        See Also
        -------
        create_metadata_path

        """
        r = self.tr.getroot()

        pth = self.create_metadata_path(metadata)

        if os.path.isfile(pth):
            fr = r.find('.//front')
            if len(fr):
                bg = r.find('.//body').getparent()
                fr.getparent().remove(fr)
                bpm = etree.parse(pth).find('.//book-part-meta')
                if bpm is None:
                    bpm = etree.parse(pth).find('.')
                    if bpm is not None:
                        if bpm.getroottree().getroot().tag == 'front':
                            bg.insert(0, bpm)
                        else:
                            self.debug.print_debug(
                                self, 'front or bookpart metadata unspecified')
                            sys.exit(1)
                else:
                    bg.insert(0, bpm)
            else:
                self.debug.print_debug(self, 'front metadata unspecified')
        else:
            self.debug.print_debug(
                self, pth + self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST)
            sys.exit(1)
        return self.tr

    def create_metadata_path(self, metadata):
        """
        Creates the correct folder path for the metadata file. Metadata files are expected to live in a folder named 'metadata'

        Parameters
        ----------
        metadata : str
            Suffix of the metadata  files

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure

        Notes
        -----
        We assume that  metadata files are stored in a sub-folder named metadata
        """
        p = os.path.dirname(self.f).split(os.sep)
        f = os.path.basename(self.f)
        name, ext = os.path.splitext(f)
        file_name = [name, '.', metadata, ext]

        if not self.stand_alone or not os.path.exists(os.sep.join(p)):
            del p[-4:]
        p.append('metadata')
        p.append(''.join(file_name))
        pth = os.sep.join(p)
        return pth
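
    # Example (hedged, hypothetical paths): for self.f =
    # 'projects/book/out/10_x/xml/chapter1.xml' and metadata='bits', the last four
    # folders are dropped (non-stand-alone mode) and the method returns
    # 'projects/metadata/chapter1.bits.xml'.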

    def sort_by_tags(self, tag_list, elem):
        """
        Sorts a list of elements alphabetically

        Parameters
        ----------
        tag_list : list
            A list of tag types
        elem : Element
            Element to be modified

        """
        data = []
        for e in elem:
            vl = []
            for tag in tag_list:
                vl.append(e.findtext(".//" + tag))
            vl.append(e)
            data.append(tuple(vl))

        data.sort()
        elem[:] = [item[-1] for item in data]

    def sort_references(self, tag_list):
        """
        Sort references based on the  sub-elements list

        Parameters
        ----------
        tag_list : list
            A list of tag types


        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        See Also
        --------
        sort_by_tags
        """
        elem = self.tr.find('./back/ref-list')
        self.sort_by_tags(tag_list, elem)

        return self.tr

    def sort_footnotes(self, tag_list):
        """
        Sort footnotes based on the  sub-elements list

        Parameters
        ----------
        tag_list : list
            A list of tag types


        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        See Also
        --------
        sort_by_tags
        """
        elem = self.tr.find('./back/fn-group')
        self.sort_by_tags(tag_list, elem)

        return self.tr

    def process(self):
        """
        Process  JATS-XML file and do all transformations into the elementtree

        See Also
        --------
        merge_metadata, set_numbering_tags,set_uuids_for_back_matter,sort_footnotes,sort_references,set_numbering_values

        """

        citations_to_references = self.args.get('--citations-to-references')
        clean_references = self.args.get('--clean-references')
        set_numbering_tags = self.args.get('--set-numbering-tags')
        set_unique_ids = self.args.get('--set-uuids')
        sort_footnotes = self.args.get('--sort-footnotes')
        sort_references = self.args.get('--sort-references')
        set_numbering_values = self.args.get('--set-numbering-values')

        metadata = self.args.get('--metadata')
        self.tr = self.merge_metadata(metadata) if metadata else self.tr

        self.tr = self.citations_to_references(
        ) if citations_to_references else self.tr
        self.tr = self.clean_references() if clean_references else self.tr
        self.tr = self.gv.set_numbering_tags(
            set_numbering_tags.split(','),
            self.tr) if set_numbering_tags else self.tr
        self.tr = self.set_uuids_for_back_matter(
            set_unique_ids.split(',')) if set_unique_ids else self.tr
        self.tr = self.sort_footnotes(
            sort_footnotes.split(',')) if sort_footnotes else self.tr
        self.tr = self.sort_references(
            sort_references.split(',')) if sort_references else self.tr

        if set_numbering_values:
            for s in set_numbering_values.split(';'):
                vals = s.split(',')

                count = 1
                range_count = [0, 0]

                if len(vals) > 3:
                    r = vals[3].lstrip('{').rstrip('}').split(':')
                    range_count = [int(r[0]), int(r[1])]
                self.tr, count = self.set_numbering_values(vals[0], vals[1],
                                                           vals[2], count,
                                                           range_count)

        self.gv.create_dirs_recursive(self.dr.split('/'))
        self.create_xml_file(os.path.join(self.dr, os.path.basename(self.f)))

    def get_module_name(self):
        """
        Reads the name of the module for debugging and logging

        Returns
        -------
        name : str
            Name of the module
        """
        name = 'prepare'
        return name

    def create_xml_file(self, pth):
        """
        Write the current elementTree into the file path

        Parameters
        ----------
        pth : str
            Correct path of the metadata file in the folder structure

        Raises
        ------
        IOError
            I/O operation fails

        Notes
        -----
        The default configuration writes a normalized XML file with an XML declaration

        """

        try:

            self.tr.write(pth, pretty_print=False, xml_declaration=True)
            print()
        except IOError as e:
            print(e)
            self.debug.print_debug(self, self.gv.XML_FILE_NOT_CREATED)

    def run(self):
        """
        Runs the configuration on the processing object

        See Also
        --------
        process


        """
        self.process()
Beispiel #15
0
class GV(object):
    '''    Global variables    '''

    def __init__(self):
        # GLOBAL VARIABLES

        #application paths
        self.APACHE_FOP_PATH = u'fop/fop'
        self.METYPESET_PATH = u'meTypeset/runtime/saxon9.jar'
        self.ANTENNA_HOUSE_FOP_PATH = u'/usr/AHFormatterV61_64/run.sh'
        self.XEP_FOP_PATH = u'/usr/local/xep/bin/xep/xep'

        # projects
        self.PROJECT_INPUT_FILE_JSON_IS_NOT_VALID = u'project input file json is not valid'
        self.PROJECT_INPUT_FILE_TYPE_IS_NOT_SPECIFIED = u'project input file type is not specified'
        self.PROJECT_INPUT_FILE_HAS_MORE_THAN_TWO_DOTS = u'project input file has more than two dots'
        self.PROJECT_INPUT_FILE_DOES_NOT_EXIST = u'project input_file does not exist'
        self.PROJECT_IS_NOT_ACTIVE = u'project is not active'
        self.PROJECT_OUTPUT_FILE_IS_NOT_DEFINED = u'project output file is not defined'
        self.PROJECT_OUTPUT_FILE_WAS_NOT_CREATED = u'project output file was not created'
        self.PROJECT_TYPESETTER_IS_NOT_AVAILABLE = u'project typesetter is not available'
        self.PROJECT_TYPESETTER_IS_NOT_SPECIFIED = u'project typesetter is not specified'
        self.PROJECT_TYPESETTER_NAME_IS_NOT_SPECIFIED = u'project typesetter name is not specified'
        self.PROJECT_TYPESETTER_VAR_IS_NOT_SPECIFIED = u'project typesetter variable is not specified'
        self.PROJECT_TYPESETTERS_ARE_NOT_SPECIFIED = u'project typesetters are not specified'
        self.PROJECTS_VAR_IS_NOT_SPECIFIED = u'project variable is not specified'
        self.PROJECT_TYPESETTER_PROCESS_METHOD_NOT_SPECIFIED = u'project typesetter process method not specified'
        self.PROJECTS_TYPESETTER_RUNS_WITH_NO_ARGUMENTS = u'projects typesetter runs with no arguments'


        # typesetter errors
        self.TYPESETTER_EXECUTABLE_VARIABLE_IS_UNDEFINED = u'typesetter executable variable is undefined'
        self.TYPESETTER_FILE_OUTPUT_TYPE_IS_UNDEFINED = u'typesetter file output type is undefined'
        self.TYPESETTER_METADATA_FILE_WAS_NOT_SPECIFIED = u'Metadata file wasn\'t specified '
        self.TYPESETTER_METYPESET_RUNS_WITH_DEFAULT_METADATA_FILE = u'typesetter metypeset runs with default metadata file'
        self.TYPESETTER_IS_NOT_SPECIFIED = u'typesetter is not specified '
        self.TYPESETTER_PATH_IS_NOT_SPECIFIED = u'typesetter path is not specified '
        self.TYPESETTER_BINARY_IS_UNAVAILABLE = u'typesetter binary is unavailable '
        self.TYPESETTER_RUNS_WITH_NO_ARGUMENTS = u'typesetter runs with no arguments'

        # xml
        self.RUNNING_FO_CONVERSION = u'running FO conversion'
        self.RUNNING_PDF_CONVERSION = u'running PDF conversion'
        self.XML_ELEMENT_NOT_FOUND = u'xml element not found'
        self.XML_FILE_NOT_CREATED = u'xml file not created'
        self.XML_INPUT_FILE_IS_NOT_FOUND = u'xml input file is not found'
        self.XML_INPUT_FILE_IS_NOT_VALID = u'xml input file is not valid'
        self.SAXON_IS_NOT_AVAILABLE = u'saxon is not available'
        self.FOP_PATH_IS_NOT_AVAILABLE = u'fop path is not available'

        # WORDS
        self.OUTPUT = u'Output'

        self.debug = Debug()
        self.numeral_map = numeral_map

        self.uuid = '4e4dd8cf-26bf-4893-b037-1fd3bf08f112'
        self.version = '0.0.1'

    @staticmethod
    def fatal_error(module, message):
        """
        Prints a formatted error message and exits

        Parameters
        ----------
        module : object
            Module instance whose get_module_name() provides the message prefix
        message : str
            Error message


        See Also
        --------
        module.get_module_name()

        """
        print(u'[FATAL ERROR] [{0}] {1}'.format(
            module.get_module_name(), message))
        sys.exit(1)

    def is_json(self, s):
        """
        Checks whether a string is a valid JSON string

        Parameters
        ----------
        s : str
            JSON data as string

        Returns
        -------
        object or bool
            The parsed JSON object if s is valid, otherwise False

        """
        try:
            return json.loads(s)
        except ValueError:
            return False
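
    # Illustration (not part of the original source): is_json doubles as parser and
    # validator, returning the decoded object for valid input and False otherwise, e.g.
    #
    #   gv.is_json('{"active": true}')  ->  {'active': True}
    #   gv.is_json('not json')          ->  False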

    def read_json(self, pth):
        """
        Reads a json file from system path or exits

        Parameters
        ----------
        pth : str
            Path of the file in the folder structure

        Returns
        -------
        json : dict or list
            Parsed JSON content

        """
        if os.path.isfile(pth):
            with open(pth) as j:
                return json.load(j)
        else:
            self.debug.print_debug(
                self, self.PROJECT_INPUT_FILE_JSON_IS_NOT_VALID)
            sys.exit(1)

    def create_dirs_recursive(self, pth):
        """
        Recursively creates the directories for a system path, skipping segments that already exist

        Parameters
        ----------
        pth : list
            List of path segments to be created

        Returns
        -------
        p : str
            The created system path

        """
        p = ''
        for path in pth:
            p = p + os.path.sep + path.strip('/')
            if not os.path.exists(p):
                try:
                    os.makedirs(p)
                except OSError as o:
                    print(o)
                    sys.exit(1)
        return p
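
    # Illustration (not part of the original source): the method expects a list of path
    # segments rather than a single string, which is why callers pass a split() path.
    # On a POSIX system, e.g.
    #
    #   gv.create_dirs_recursive('output/book/xml'.split('/'))
    #
    # creates /output, /output/book and /output/book/xml as needed and returns
    # '/output/book/xml'.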

    def set_numbering_tags(self, tags, tr):
        """
        Automatic numbering of the list of elements

        Parameters
        ----------
        tags: list
         list of elements

        Returns
        -------
        tr : elementtree


        """
        for tag in tags:
            sh = tr.findall('.//' + tag)
            sid = 1
            for i in sh:
                i.set('id', tag.replace('-', '') + str(sid))
                sid += 1
        return tr
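
    # Illustration (not part of the original source): for every tag name the method assigns
    # sequential id attributes, with hyphens stripped from the tag. Passing, e.g.,
    # ['sec', 'table-wrap'] sets id="sec1", id="sec2", ... on <sec> elements and
    # id="tablewrap1", id="tablewrap2", ... on <table-wrap> elements.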

    def check_program(self, p):
        """
        Checks  whether a  the program or typesetter is installed and executable

        Parameters
        ---------
        p: str
            Program path

        Returns
        --------
        None: bool
            Returns None , if  program exists

        """

        def is_exe(f_path):
            """
            Checks whether path is available and executable

            Parameters
            ----------
            f_path : str
                File path

            Returns
            -------
            bool
                True if the path exists and is executable, otherwise False

            """
            return os.path.isfile(f_path) and os.access(f_path, os.X_OK)

        fpath, fname = os.path.split(p)
        if fpath:
            if is_exe(p):
                return p
        else:
            for path in os.environ["PATH"].split(os.pathsep):
                path = path.strip('"')
                exe_file = os.path.join(path, p)
                if is_exe(exe_file):
                    return exe_file

        return None
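
    # Minimal usage sketch (an assumption, not part of the original example): check_program
    # behaves like a simplified shutil.which(), returning the resolved executable or None:
    #
    #   gv = GV()
    #   gv.check_program('java')           # e.g. '/usr/bin/java' if found on PATH, else None
    #   gv.check_program(gv.XEP_FOP_PATH)  # the given path itself if it is executable, else None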
Beispiel #16
0
class Process(Debuggable):
    """
    Standalone Processing object to combine, clean and modify a JATS XML file and optionally inject BITS Metadata headers.

    Features
    --------
    add Id numbering for any tag type, clean comments, remove unused references,
    set numbering, add unique ids to certain tag types, sort references

    """

    def __init__(self):
        self.args = self.read_command_line()
        self.debug = Debug()
        self.gv = GV()
        Debuggable.__init__(self, 'Main')
        if self.args.get('--debug'):
            self.debug.enable_debug()
        self.dr = self.args.get('<path>')
        self.f = self.args.get('<input_file>')
        self.tr = etree.parse(os.path.join(self.dr, self.f))

    @staticmethod
    def read_command_line():
        """
        Reads and generates a docopt dictionary from the command line parameters.

        Returns
        -------
        docopt : dictionary
            A dictionary, where keys are names of command-line elements such as options and arguments, and values
            are the parsed values of those elements.
        """
        return docopt(__doc__, version='xml 0.1')

    def remove_references(self):
        """
        Removes references that are not linked from the text

        Returns
        -------
        tr : elementtree
            Element tree with orphaned references removed

        See Also
        --------
        remove_element, remove_tags

        """
        r = self.tr.getroot()

        for e in r.findall('.//back/ref-list/ref'):
            if e.attrib.get('id'):
                if r.find(".//xref[@ref-type='bibr'][@rid='" + e.attrib.get('id') + "']") is None:
                    self.remove_element(e)
            else:
                self.remove_element(e)
        for e in r.findall(".//xref[@ref-type='bibr']"):
            if r.find(".//back/ref-list/ref[@id='" + e.attrib.get('rid') + "']") is None:
                if e.getparent() is not None:
                    for c in e.getparent().getiterator():
                        if c.tag == 'xref' and c.attrib.get('ref-type') == 'bibr':
                            self.remove_tags(c)
        return self.tr
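
    # Illustration (not part of the original example, ids are sample data): a
    # <ref id="B7"> in the back matter with no matching <xref ref-type="bibr" rid="B7">
    # in the text is removed; conversely, if an <xref ref-type="bibr"> points at a
    # non-existent <ref>, the bibliographic xrefs under its parent are stripped to
    # plain text via remove_tags.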

    def remove_tags(self, e):
        """
        Takes an etree element and replaces it with its own text

        Parameters
        ----------
        e : element
            Element to be replaced

        """
        if e.getparent() is not None:
            previous = e.getprevious()
            if previous is not None:
                if previous.tail:
                    if e.text:
                        previous.tail = previous.tail + e.text
                    if e.tail:
                        previous.tail = previous.tail + e.tail
                    e.getparent().remove(e)

    def remove_element(self, e):
        """
        Remove any element only if it has a parent

        Parameters
        ----------
        e : element
            Element to be replaced

        """
        if e.getparent() is not None:
            e.getparent().remove(e)

    def set_uuids_for_back_matter(self, tags):
        """
        Add unique id tags to  any of the sub-elements of the back matter

        Parameters
        ----------
        tags: list
         list of elements

        Returns
        -------
        tr : elementtree

        """
        for s in tags:
            f = {}
            ref_type = 'bibr' if s == 'ref' else s
            fns = self.tr.getroot().findall(
                ''.join(['.//xref/[@ref-type="', ref_type, '"]']))
            for i in fns:
                rid = ''.join(['bibd', uuid.uuid4().hex])
                f[i.attrib['rid']] = rid
                i.set('rid', rid)
            for m in f.keys():
                n = self.tr.getroot().find(
                    ''.join(['.//' + s + '/[@id="', m, '"]']))
                if n is not None and len(n) > 0:
                    n.set('id', f[m])
        return self.tr
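
    # Illustration (not part of the original example): set_uuids_for_back_matter(['ref'])
    # visits every <xref ref-type="bibr">, replaces its rid with a fresh value of the form
    # 'bibd' + uuid4 hex digits, and rewrites the id of the corresponding <ref> element so
    # that the cross-reference still resolves after renaming.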

    def set_numbering_values(
            self,
            tag,
            attr,
            value,
            count,
            range_list):
        """
        Adds numerical values to  a  tag  in arguments list

        Parameters
        ---------
        tag: str
            xml tag name
        attr: str
            attribute name
        value :str
            value name
        count : int
            current sequence number
        range_list : list
           lower and upper level for the  numbering

        See Also
        --------
        set_roman_numbers

        """
        search_tag = './/' + tag + '[@' + attr + '="' + value + '"]'
        elems = self.tr.getroot().findall(search_tag)
        range_count = 1
        for elem in elems:
            elem.text, range_count = self.set_roman_numbers(
                count, range_count, range_list)
            count += 1

        return self.tr, count

    def convert_int_to_roman(self, i):
        """
        Converts an integer number into a roman number

        Parameters
        ---------
        i : int
            integer number

        Returns
        -------
        result : str
            Roman number

        """
        result = []
        for integer, numeral in self.gv.numeral_map:
            count = i // integer
            result.append(numeral * count)
            i -= integer * count
        return ''.join(result)
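
    # Illustration (not part of the original example): assuming gv.numeral_map is the
    # conventional ordered mapping ((1000, 'M'), (900, 'CM'), ..., (4, 'IV'), (1, 'I')),
    # the greedy subtraction above yields, e.g.
    #
    #   convert_int_to_roman(4)    -> 'IV'
    #   convert_int_to_roman(2024) -> 'MMXXIV'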

    def set_roman_numbers(self, count, r_count, range_list):
        """
        Converts a given set of elements defined by range_array into roman numbers

        Parameters
        ---------
        count :int
        r_count : int
        range_list : list
            lower and upper level for the  numbering

        Returns
        -------
        val : str
        r_count: int

        See Also
        --------
        convert_int_to_roman

        """

        val = str(count)
        if int(range_list[0]) <= count <= int(range_list[1]):
            val = self.convert_int_to_roman(r_count).lower()
            r_count += 1
        else:
            val = str(count - r_count + 1)
        return val, r_count
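
    # Worked example (not part of the original source): with range_list = [3, 6] and the
    # counter starting at 1, successive calls return
    #
    #   count:  1    2    3    4     5      6     7    8
    #   value: '1'  '2'  'i'  'ii'  'iii'  'iv'  '3'  '4'
    #
    # i.e. counts inside the range are rendered as lowercase Roman numerals and the arabic
    # numbering resumes afterwards, offset by the size of the Roman block.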

    def merge_metadata(self, metadata):
        """
        reads a metadata file path and  merge its content into the metadata section

        Parameters
        ----------
        metadata : str
             suffix  of the metadata files

        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        See Also
        -------
        create_metadata_path

        """
        r = self.tr.getroot()

        pth = self.create_metadata_path(metadata)

        if os.path.isfile(pth):
            fr = r.find('.//front')
            fr.getparent().remove(fr)
            bpm = etree.parse(pth).find('.//book-part-meta')
            bg = r.find('.//body').getparent()
            bg.insert(0, bpm)

        else:
            self.debug.print_debug(self, pth +
                                   self.gv.PROJECT_INPUT_FILE_DOES_NOT_EXIST)

        return self.tr
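
    # Illustration (not part of the original example): when the derived metadata file
    # exists, the document's <front> element is removed and the <book-part-meta> block
    # from that file is inserted as the first child of <body>'s parent, effectively
    # swapping the JATS front matter for BITS book-part metadata; otherwise only a debug
    # message is emitted.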

    def create_metadata_path(self, metadata):
        """
        creates the correct folder path for the metadata file. Metadata files should be in a folder : metadata

        Parameters
        ----------
        metadata : str
            Suffix of the metadata  files

        Returns
        -------
        pth : str
            Correct path of the metadata file in the folder structure

        Notes
        -----
        We assume that  metadata files are stored in a sub-folder named metadata
        """
        p = os.path.dirname(self.f).split(os.sep)
        del p[-4:]
        f = os.path.basename(self.f)
        name, ext = os.path.splitext(f)
        file_name = [name, '.', metadata, ext]
        p.append('metadata')
        p.append(''.join(file_name))
        pth = os.sep.join(p)
        return pth
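
    # Illustration (the concrete folder layout is an assumption, not part of the original
    # example): with self.f = 'project/out/a/b/c/d/chapter1.xml' and metadata = 'book',
    # the four trailing directories of the dirname are dropped, 'metadata' is appended and
    # the suffix is spliced into the file name, giving
    #
    #   project/out/metadata/chapter1.book.xml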

    def sort_by_tags(self, tag_list, elem):
        """
        Sorts  a   list  of elements alphabetically

        Parameters
        ----------
        tag_list : list
            A list of tag types
        elem : Element
            Element to be modified

        """
        data = []
        for e in elem:
            vl = []
            for tag in tag_list:
                vl.append(e.findtext(".//" + tag))

            vl.append(e)
            data.append(tuple(vl))

        data.sort()
        elem[:] = [item[-1] for item in data]

    def sort_references(self, tag_list):
        """
        Sort references based on the  sub-elements list

        Parameters
        ----------
        tag_list : list
            A list of tag types


        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        See Also
        --------
        sort_by_tags
        """
        elem = self.tr.find('./back/ref-list')
        self.sort_by_tags(tag_list, elem)

        return self.tr

    def sort_footnotes(self, tag_list):
        """
        Sort footnotes based on the  sub-elements list

        Parameters
        ----------
        tag_list : list
            A list of tag types


        Returns
        -------
        tr : elementTree
            Element tree of the  current file

        See Also
        --------
        sort_by_tags
        """
        elem = self.tr.find('./back/fn-group')
        self.sort_by_tags(tag_list, elem)

        return self.tr

    def process(self):
        """
        Process  JATS-XML file and do all transformations into the elementtree

        See Also
        --------
        merge_metadata, set_numbering_tags,set_uuids_for_back_matter,sort_footnotes,sort_references,set_numbering_values

        """

        clean_references = self.args.get('--clean-references')

        set_numbering_tags = self.args.get('--set-numbering-tags')
        set_unique_ids = self.args.get('--set-uuids')
        sort_footnotes = self.args.get('--sort-footnotes')
        sort_references = self.args.get('--sort-references')
        set_numbering_values = self.args.get('--set-numbering-values')

        metadata = self.args.get('--metadata')
        self.tr = self.merge_metadata(metadata) if metadata else self.tr

        self.tr = self.remove_references() if clean_references else self.tr
        self.tr = self.gv.set_numbering_tags(set_numbering_tags.split(
            ','), self.tr) if set_numbering_tags else self.tr
        self.tr = self.set_uuids_for_back_matter(
            set_unique_ids.split(',')) if set_unique_ids else self.tr
        self.tr = self.sort_footnotes(
            sort_footnotes.split(',')) if sort_footnotes else self.tr
        self.tr = self.sort_references(
            sort_references.split(',')) if sort_references else self.tr

        if set_numbering_values:
            for s in set_numbering_values.split(';'):
                vals = s.split(',')

                count = 1
                range_count = [0, 0]

                if len(vals) > 3:
                    r = vals[3].lstrip('{').rstrip('}').split(':')
                    range_count = [int(r[0]), int(r[1])]
                self.tr, count = self.set_numbering_values(
                    vals[0], vals[1], vals[2], count, range_count)

        self.gv.create_dirs_recursive(self.dr.split('/'))
        self.create_xml_file(os.path.join(self.dr, os.path.basename(self.f)))
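
    # Illustration of the --set-numbering-values syntax parsed above (tag and attribute
    # names here are only example data): each ';'-separated entry has the form
    # 'tag,attr,value[,{lower:upper}]', so
    #
    #   --set-numbering-values="page,content-type,chapter,{1:4}"
    #
    # numbers every <page content-type="chapter"> element, rendering counts 1 to 4 as
    # lowercase Roman numerals and continuing with arabic numbers afterwards.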

    def create_xml_file(self, pth):
        """
        Write the current elementTree into the file path

        Parameters
        ----------
        pth : str
            Correct path of the metadata file in the folder structure

        Raises
        ------
        IOError
            I/O operation fails

        Notes
        -----
        Default configuration writes a normalized XML file with an XML declaration

        """

        try:
            self.tr.write(
                pth,
                pretty_print=False,
                xml_declaration=True
            )
            print()
        except IOError as e:
            print(e)
            self.debug.print_debug(self, self.gv.XML_FILE_NOT_CREATED)

    def run(self):
        """
        Runs the configuration on the processing object

        See Also
        --------
        process


        """
        self.process()
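
# Minimal driver sketch (an assumption, not shown in the original example; file name and
# tag lists are sample data): the class is built around docopt, so a typical entry point
# would simply construct Process and call run(); the exact usage string lives in this
# module's __doc__, and the flag spellings below are inferred from the args.get() calls
# above.
#
#   if __name__ == '__main__':
#       Process().run()
#
#   python process.py <path> <input_file> --metadata=book --clean-references \
#       --set-numbering-tags=sec,fig --sort-references=surname,year --debug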