コード例 #1
0
ファイル: tree_tagger.py プロジェクト: xflows/tf_taggers
    def __init__(self,
                 path_to_home=None,
                 language='english',
                 encoding='utf-8',
                 verbose=False,
                 abbreviation_list=None,
                 widget_id=None,
                 trained=False):
        """
        Initialize the TreeTagger.

        :param path_to_home: Explicit location handed to ``find_binary``
            for both the tagger and the trainer binary.
        :param language: Language of the tagger. The default is english,
            which is the only language this constructor knows a binary
            name for.
        :param encoding: The encoding used by the model. Unicode tokens
            passed to the tag() and batch_tag() methods are converted to
            this charset when they are sent to TreeTagger.
            The default is utf-8.

            This parameter is ignored for str tokens, which are sent as-is.
            The caller must ensure that tokens are encoded in the right charset.
        :param verbose: Forwarded to ``find_binary``.
        :param abbreviation_list: Stored unchanged on ``self._abbr_list``.
        :param widget_id: Stored unchanged on ``self.widget_id``.
        :param trained: Stored unchanged on ``self.trained``.
        :raises LookupError: If ``language`` is not supported.
        """
        if language != 'english':
            # Previously this fell through and failed later with an
            # unbound local variable; fail early with a clear message.
            raise LookupError(
                'TreeTagger language %r is not supported!' % (language,))
        treetagger_bin_name = 'tree-tagger'
        train_treetagger_bin_name = 'train-tree-tagger'

        tagger_bin = os.path.join(settings.TREE_TAGGER, 'bin')
        # Materialise the search path: it is handed to find_binary twice,
        # and a bare map() iterator would be exhausted after the first call
        # on Python 3.
        treetagger_paths = [os.path.expanduser(tagger_bin)]

        self._abbr_list = abbreviation_list
        self.widget_id = widget_id
        self.params = os.path.join(settings.TREE_TAGGER, 'lib')
        self.trained = trained
        self._encoding = encoding

        self._treetagger_bin = find_binary(treetagger_bin_name,
                                           path_to_home,
                                           env_vars=('TREETAGGER',
                                                     'TREETAGGER_HOME'),
                                           searchpath=treetagger_paths,
                                           url=_treetagger_url,
                                           verbose=verbose)

        self._train_treetagger_bin = find_binary(train_treetagger_bin_name,
                                                 path_to_home,
                                                 env_vars=('TREETAGGER',
                                                           'TREETAGGER_HOME'),
                                                 searchpath=treetagger_paths,
                                                 url=_treetagger_url,
                                                 verbose=verbose)
コード例 #2
0
ファイル: malt.py プロジェクト: approximatelylinear/nltk
    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the C{malt} package.  This
        searches for a directory containing the malt jar and stores the
        result of the lookup in C{self._malt_bin}.

        :param bin: The full path to the C{malt} binary.  If not
            specified, then nltk will search the system for a C{malt}
            binary; and if one is not found, it will raise a
            C{LookupError} exception.
        :type bin: str
        :param verbose: Forwarded to C{find_binary}.
        """
        #: Glob patterns for the directories that should be searched for
        #: the malt executables.
        _malt_path = ['.',
                     '/usr/lib/malt-1*',
                     '/usr/share/malt-1*',
                     '/usr/local/bin',
                     '/usr/local/malt-1*',
                     '/usr/local/bin/malt-1*',
                     '/usr/local/share/malt-1*']

        # Expand wildcards in _malt_path, keeping the first occurrence of
        # each directory so nothing is searched twice.  A plain loop avoids
        # depending on ``reduce`` being available as a builtin.
        malt_path = []
        for pattern in _malt_path:
            for directory in glob.glob(pattern):
                if directory not in malt_path:
                    malt_path.append(directory)

        # Find the malt binary.
        self._malt_bin = find_binary('malt.jar', bin,
            searchpath=malt_path, env_vars=['MALTPARSERHOME'],
            url='http://w3.msi.vxu.se/~jha/maltparser/index.html',
            verbose=verbose)
コード例 #3
0
    def __init__(self,
                 path_to_home=None,
                 language='german',
                 verbose=False,
                 abbreviation_list=None):
        """
        Locate the ``tree-tagger-<language>`` executable.

        :param path_to_home: Explicit location handed to ``find_binary``.
        :param language: Must be a member of ``_treetagger_languages``.
        :param verbose: Forwarded to ``find_binary``.
        :param abbreviation_list: Stored unchanged on ``self._abbr_list``.
        :raises LookupError: If ``language`` is not in
            ``_treetagger_languages``.
        """
        candidate_dirs = [
            '.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
            '/Applications/bin', '~/bin', '~/Applications/bin',
            '~/work/tmp/treetagger/cmd', '~/tree-tagger/cmd',
            '/home/wanderson/Documentos/Tagger/cmd',
            '/home/wanderson/Documentos/TreeTagger/cmd',
            '/home/portservice/TreeTagger/cmd'
        ]
        # Resolve "~" so every entry is a usable directory name.
        search_dirs = [os.path.expanduser(entry) for entry in candidate_dirs]
        self._abbr_list = abbreviation_list

        if language not in _treetagger_languages:
            raise LookupError('Language not in language list!')
        binary_name = 'tree-tagger-' + language

        try:
            located = find_binary(binary_name,
                                  path_to_home,
                                  env_vars=('TREETAGGER', 'TREETAGGER_HOME'),
                                  searchpath=search_dirs,
                                  url=None,
                                  verbose=verbose)
        except LookupError:
            # Best effort: report the problem and leave _treetagger_bin unset.
            print('NLTK was unable to find the TreeTagger bin!')
        else:
            self._treetagger_bin = located
コード例 #4
0
    def config_malt(self, bin=None, verbose=False):
        """
        Configure NLTK's interface to the ``malt`` package.  This
        searches for a directory containing the malt jar and stores the
        result of the lookup in ``self._malt_bin``.

        :param bin: The full path to the ``malt`` binary.  If not
            specified, then nltk will search the system for a ``malt``
            binary; and if one is not found, it will raise a
            ``LookupError`` exception.
        :type bin: str
        :param verbose: Forwarded to ``find_binary``.
        """
        #: Glob patterns for the directories that should be searched for
        #: the malt executables.
        _malt_path = ['.',
                     '/usr/lib/malt-1*',
                     '/usr/share/malt-1*',
                     '/usr/local/bin',
                     '/usr/local/malt-1*',
                     '/usr/local/bin/malt-1*',
                     '/usr/local/share/malt-1*']

        # Expand wildcards in _malt_path.  Each directory is kept once, in
        # first-match order; a plain loop also avoids relying on ``reduce``
        # being a builtin.
        malt_path = []
        for pattern in _malt_path:
            for directory in glob.glob(pattern):
                if directory not in malt_path:
                    malt_path.append(directory)

        # Find the malt binary.
        self._malt_bin = find_binary('malt.jar', bin,
            searchpath=malt_path, env_vars=['MALTPARSERHOME'],
            url='http://www.maltparser.org/',
            verbose=verbose)
コード例 #5
0
 def _find_binary(self, name, bin_dir, verbose=False):
     """Look up the executable ``name`` through ``find_binary``.

     Both ``name`` and ``name + '.exe'`` are accepted as file names;
     ``bin_dir`` is passed as the explicit binary location.
     """
     lookup_options = dict(
         path_to_bin=bin_dir,
         env_vars=['CANDCHOME'],
         url='http://svn.ask.it.usyd.edu.au/trac/candc/',
         binary_names=[name, name + '.exe'],
         verbose=verbose,
     )
     return find_binary(name, **lookup_options)
コード例 #6
0
    def __init__(self,
                 path_to_home=None,
                 language='english',
                 verbose=False,
                 abbreviation_list=None):
        """
        Locate the TreeTagger chunker script for ``language``.

        :param path_to_home: Directory containing the chunker script. When
            ``None`` no explicit path is passed to ``find_binary``, so the
            lookup relies on the environment variables and search path.
        :param language: Must be a member of ``_treetagger_languages``.
        :param verbose: Forwarded to ``find_binary``.
        :param abbreviation_list: Path to an abbreviation file. It is kept
            on ``self._abbr_list`` only if it names an existing regular
            file; otherwise ``self._abbr_list`` is ``None``.
        :raises LookupError: If ``language`` is not in
            ``_treetagger_languages``.
        """
        treetagger_paths = [
            '.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
            '/usr/local/treetagger/cmd', '~/treetagger/cmd'
        ]
        treetagger_paths = [os.path.expanduser(p) for p in treetagger_paths]
        self._abbr_list = abbreviation_list

        if language not in _treetagger_languages:
            raise LookupError('Language not in language list!')
        if sys.platform.startswith("win"):
            treetagger_bin_name = 'chunk-' + language
        else:
            treetagger_bin_name = 'tagger-chunker-' + language

        # os.path.join(None, ...) raises TypeError, so only build an
        # explicit path when the caller actually supplied a directory.
        if path_to_home is None:
            path_to_bin = None
        else:
            path_to_bin = os.path.join(path_to_home, treetagger_bin_name)

        try:
            self._treetagger_bin = find_binary(
                treetagger_bin_name,
                path_to_bin,
                env_vars=('TREETAGGER', 'TREETAGGER_HOME'),
                searchpath=treetagger_paths,
                url=_treetagger_url,
                verbose=verbose)
        except LookupError:
            # Best effort: report the problem and leave _treetagger_bin unset.
            print('NLTK was unable to find the TreeTagger bin!')

        # isfile() is already False for paths that do not exist.
        if abbreviation_list is not None and \
                not os.path.isfile(abbreviation_list):
            self._abbr_list = None
コード例 #7
0
ファイル: inference.py プロジェクト: Garnovski/nltk-drt
 def _find_binary(self, name, verbose=False):
     """Look up the executable ``name`` through ``find_binary``.

     The search covers ``Theorem.BINARY_LOCATIONS`` and the ``PROVER9HOME``
     environment variable; only the exact file name ``name`` is accepted.
     """
     located = find_binary(
         name,
         searchpath=Theorem.BINARY_LOCATIONS,
         env_vars=['PROVER9HOME'],
         url='http://www.cs.unm.edu/~mccune/prover9/',
         binary_names=[name],
         verbose=verbose,
     )
     return located
コード例 #8
0
def config_tadm(bin=None):
    """Locate the ``tadm`` binary and store it in the module-level ``_tadm_bin``.

    :param bin: Explicit location handed to ``find_binary``; the ``TADM``
        environment variable is consulted as well.
    """
    global _tadm_bin
    tadm_path = find_binary(
        "tadm",
        bin,
        env_vars=["TADM"],
        binary_names=["tadm"],
        url="http://tadm.sf.net",
    )
    _tadm_bin = tadm_path
コード例 #9
0
ファイル: tadm.py プロジェクト: chethankumarka/SuaaS
def config_tadm(bin=None):
    """Find the ``tadm`` executable and remember it in ``_tadm_bin``.

    :param bin: Explicit location handed to ``find_binary``; the
        ``TADM_DIR`` environment variable is consulted as well.
    """
    global _tadm_bin
    lookup = {
        'env_vars': ['TADM_DIR'],
        'binary_names': ['tadm'],
        'url': 'http://tadm.sf.net',
    }
    _tadm_bin = find_binary('tadm', bin, **lookup)
コード例 #10
0
ファイル: chunkers.py プロジェクト: paudan/opennlp_python
    def __init__(self,
                 path_to_bin=None,
                 path_to_chunker=None,
                 verbose=False,
                 use_punc_tag=False):
        """
        Initialize the OpenNLPChunker.

        :param path_to_bin: Path to bin directory of OpenNLP installation.
            When ``None`` no explicit path is passed to ``find_binary``, so
            the lookup relies on the environment variables and search path.
        :param path_to_chunker: The path to OpenNLP POS chunker .bin file.
        :param verbose: Forwarded to ``find_binary``.
        :param use_punc_tag: Whether standalone punctuation marks should be tagged using PUNC tag
        :raises LookupError: If ``path_to_chunker`` is ``None``.
        """
        if path_to_chunker is None:
            raise LookupError('OpenNLP model file is not set!')
        self._model_path = path_to_chunker
        self.use_punc_tag = use_punc_tag

        opennlp_paths = [
            '.', '/usr/bin', '/usr/local/apache-opennlp',
            '/opt/local/apache-opennlp', '~/apache-opennlp'
        ]
        opennlp_paths = [os.path.expanduser(p) for p in opennlp_paths]

        opennlp_bin_name = "opennlp"
        if sys.platform.startswith("win"):
            opennlp_bin_name += ".bat"

        # os.path.join(None, ...) raises TypeError, so only build an
        # explicit path when the caller actually supplied a directory.
        if path_to_bin is None:
            explicit_bin = None
        else:
            explicit_bin = os.path.join(path_to_bin, opennlp_bin_name)

        try:
            self._opennlp_bin = find_binary(
                opennlp_bin_name,
                explicit_bin,
                env_vars=('OPENNLP_HOME', 'OPENNLP'),
                searchpath=opennlp_paths,
                verbose=verbose)
        except LookupError:
            # Best effort: report the problem and leave _opennlp_bin unset.
            print('Unable to find the Apache OpenNLP run file!')
コード例 #11
0
ファイル: annotation.py プロジェクト: irit-melodi/educe
    def _repr_png_(self):
        """Draws and outputs in PNG for ipython.

        PNG is used instead of PDF, since it can be displayed in the qt
        console and has wider browser support.

        :return: The PNG file contents, base64-encoded and decoded to str.
        """
        with tempfile.NamedTemporaryFile() as f_tmp:
            # The temporary file only provides a unique base name.
            in_path = '{0:}.ps'.format(f_tmp.name)
            out_path = '{0:}.png'.format(f_tmp.name)
            try:
                # generate PostScript using the drawing utils of NLTK
                self.to_ps(in_path)
                # convert to PNG with ghostscript
                gs_bin = find_binary('gs',
                                     binary_names=['gswin32c.exe',
                                                   'gswin64c.exe'],
                                     env_vars=['PATH'], verbose=False)
                # Build the argument list element by element so that
                # whitespace inside the temporary paths cannot split an
                # argument in two.
                gs_args = (['-q', '-dEPSCrop'] +
                           str(_GS_PARAMS['png']).split() +
                           ['-dSAFER', '-dBATCH', '-dNOPAUSE',
                            '-sOutputFile={0:}'.format(out_path),
                            in_path])
                subprocess.call([gs_bin] + gs_args)
                with open(out_path, 'rb') as sr:
                    res = sr.read()
            finally:
                # Remove whatever was produced, even if a step above failed.
                for leftover in (in_path, out_path):
                    if os.path.exists(leftover):
                        os.remove(leftover)
            # return the encoded+decoded bytes of the PNG file
            return base64.b64encode(res).decode()
コード例 #12
0
    def __init__(self, path_to_model, path_to_bin=None,
                 encoding=_hunpos_charset, verbose=False):
        """
        Starts the hunpos-tag executable and establishes a connection with it.

        :param path_to_model: The model file.
        :param path_to_bin: The hunpos-tag binary.
        :param encoding: The encoding used by the model. Unicode tokens
            passed to the tag() and batch_tag() methods are converted to
            this charset when they are sent to hunpos-tag.
            The default is ISO-8859-1 (Latin-1).

            This parameter is ignored for str tokens, which are sent as-is.
            The caller must ensure that tokens are encoded in the right charset.
        :param verbose: Forwarded to ``find_binary`` and ``find_file``.
        """
        # Mark the tagger as closed until the subprocess really exists, so
        # the attribute is defined even when one of the lookups below
        # raises.  NOTE(review): presumably read by the class's cleanup
        # code, which is not visible here -- confirm.
        self._closed = True

        hunpos_paths = ['.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
                        '/Applications/bin', '~/bin', '~/Applications/bin']
        # A real list (not a one-shot map() iterator) can be iterated by
        # find_binary as often as it needs.
        hunpos_paths = [os.path.expanduser(p) for p in hunpos_paths]

        self._hunpos_bin = find_binary(
                'hunpos-tag', path_to_bin,
                env_vars=('HUNPOS', 'HUNPOS_HOME'),
                searchpath=hunpos_paths,
                url=_hunpos_url,
                verbose=verbose)

        self._hunpos_model = find_file(path_to_model,
                env_vars=('HUNPOS', 'HUNPOS_HOME'), verbose=verbose)
        self._encoding = encoding
        self._hunpos = Popen([self._hunpos_bin, self._hunpos_model],
                             shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        self._closed = False
コード例 #13
0
ファイル: inference.py プロジェクト: yash0307/nltk-drt
 def _find_binary(self, name, verbose=False):
     """Return whatever ``find_binary`` locates for the executable ``name``.

     ``Theorem.BINARY_LOCATIONS`` is used as the search path and
     ``PROVER9HOME`` as the environment variable.
     """
     prover9_url = 'http://www.cs.unm.edu/~mccune/prover9/'
     accepted_names = [name]
     return find_binary(name,
                        searchpath=Theorem.BINARY_LOCATIONS,
                        env_vars=['PROVER9HOME'],
                        url=prover9_url,
                        binary_names=accepted_names,
                        verbose=verbose)
コード例 #14
0
 def _find_binary(self, name, bin_dir, verbose=False):
     """Return whatever ``find_binary`` locates for the executable ``name``.

     ``bin_dir`` is the explicit location, ``CANDC`` the environment
     variable; the file may be called ``name`` or ``name + '.exe'``.
     """
     exe_names = [name, name + '.exe']
     candc_url = 'http://svn.ask.it.usyd.edu.au/trac/candc/'
     return find_binary(name,
                        path_to_bin=bin_dir,
                        env_vars=['CANDC'],
                        url=candc_url,
                        binary_names=exe_names,
                        verbose=verbose)
コード例 #15
0
    def __init__(self, path_to_model, path_to_bin=None,
                 encoding=_hunpos_charset, verbose=False):
        """
        Launch the hunpos-tag executable with pipes on stdin, stdout and
        stderr.

        :param path_to_model: The model file.
        :param path_to_bin: The hunpos-tag binary.
        :param encoding: The encoding used by the model. Unicode tokens
            passed to the tag() and tag_sents() methods are converted to
            this charset when they are sent to hunpos-tag.
            The default is ISO-8859-1 (Latin-1).

            This parameter is ignored for str tokens, which are sent as-is.
            The caller must ensure that tokens are encoded in the right charset.
        :param verbose: Forwarded to ``find_binary`` and ``find_file``.
        """
        # Stays True until the subprocess has been started successfully.
        self._closed = True

        search_dirs = [
            os.path.expanduser(entry)
            for entry in ('.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
                          '/Applications/bin', '~/bin', '~/Applications/bin')
        ]
        hunpos_env = ('HUNPOS_TAGGER',)

        self._hunpos_bin = find_binary('hunpos-tag', path_to_bin,
                                       env_vars=hunpos_env,
                                       searchpath=search_dirs,
                                       url=_hunpos_url,
                                       verbose=verbose)
        self._hunpos_model = find_file(path_to_model,
                                       env_vars=hunpos_env,
                                       verbose=verbose)
        self._encoding = encoding

        command = [self._hunpos_bin, self._hunpos_model]
        self._hunpos = Popen(command, shell=False,
                             stdin=PIPE, stdout=PIPE, stderr=PIPE)
        self._closed = False
コード例 #16
0
def config_mallet(mallet_home=None):
    """
    Set up the module-level Mallet configuration.

    Stores the Mallet installation directory in ``_mallet_home`` and a
    classpath made of every ``.jar`` in its ``lib`` directory in
    ``_mallet_classpath``.

    :type mallet_home: str
    :param mallet_home: The full path to the mallet directory. If not
        specified, then NLTK will search the system for a mallet directory;
        and if one is not found, it will raise a ``LookupError`` exception.
    :raises ValueError: If the installation has no ``lib`` directory.
    """
    global _mallet_home, _mallet_classpath

    # The binary itself is not used; its location identifies the install.
    mallethon_bin = find_binary('mallet',
                                mallet_home,
                                env_vars=['MALLET', 'MALLET_HOME'],
                                binary_names=['mallethon'],
                                url='http://mallet.cs.umass.edu')

    # Strip the file name and then its directory to get the home directory.
    _mallet_home = os.path.dirname(os.path.dirname(mallethon_bin))

    lib_dir = os.path.join(_mallet_home, 'lib')
    if not os.path.isdir(lib_dir):
        raise ValueError(
            'While configuring mallet: directory %r not found.' % lib_dir)

    # Every jar in lib/, in sorted order, joined with the platform separator.
    jar_paths = [os.path.join(lib_dir, entry)
                 for entry in sorted(os.listdir(lib_dir))
                 if entry.endswith('.jar')]
    _mallet_classpath = os.path.pathsep.join(jar_paths)
コード例 #17
0
    def _repr_png_(self):
        """Render this object as PNG for ipython.

        PNG is used instead of PDF, since it can be displayed in the qt
        console and has wider browser support.

        :return: The PNG file contents, base64-encoded and decoded to str.
        """
        with tempfile.NamedTemporaryFile() as f_tmp:
            # The temporary file only provides a unique base name.
            base_name = f_tmp.name
            in_path = '{0:}.ps'.format(base_name)
            out_path = '{0:}.png'.format(base_name)

            # Step 1: PostScript via the drawing utils of NLTK.
            self.to_ps(in_path)

            # Step 2: PostScript -> PNG with ghostscript.
            gs_executable = find_binary(
                'gs',
                binary_names=['gswin32c.exe', 'gswin64c.exe'],
                env_vars=['PATH'],
                verbose=False)
            gs_template = ('-q -dEPSCrop {2:} -dSAFER -dBATCH -dNOPAUSE '
                           '-sOutputFile={0:} {1:}')
            gs_options = gs_template.format(
                out_path, in_path, _GS_PARAMS['png']).split()
            subprocess.call([gs_executable] + gs_options)

            # Step 3: read the image back and drop both intermediate files.
            with open(out_path, 'rb') as png_file:
                png_bytes = png_file.read()
            os.remove(in_path)
            os.remove(out_path)
            return base64.b64encode(png_bytes).decode()
コード例 #18
0
def config_tadm(bin=None):
    """Resolve the ``tadm`` executable into the module-level ``_tadm_bin``.

    :param bin: Explicit location handed to ``find_binary``; the ``TADM``
        environment variable is consulted as well.
    """
    global _tadm_bin
    program = 'tadm'
    _tadm_bin = find_binary(program,
                            bin,
                            env_vars=['TADM'],
                            binary_names=[program],
                            url='http://tadm.sf.net')
コード例 #19
0
ファイル: mallet.py プロジェクト: carriercomm/PrologMUD
def config_mallet(mallet_home=None):
    """
    Configure NLTK's interface to the Mallet machine learning package.

    Sets the module-level ``_mallet_home`` and ``_mallet_classpath``.

    :type mallet_home: str
    :param mallet_home: The full path to the mallet directory. If not
        specified, then NLTK will search the system for a mallet directory;
        and if one is not found, it will raise a ``LookupError`` exception.
    :raises ValueError: If ``<mallet home>/lib`` is not a directory.
    """
    global _mallet_home, _mallet_classpath

    # Only the location of this binary matters: it pins down the install dir.
    lookup = dict(
        env_vars=["MALLET", "MALLET_HOME"],
        binary_names=["mallethon"],
        url="http://mallet.cs.umass.edu",
    )
    mallethon_bin = find_binary("mallet", mallet_home, **lookup)

    # <home>/<bin dir>/mallethon -> <home>
    bin_dir, _binary_file = os.path.split(mallethon_bin)
    _mallet_home, _bin_dir_name = os.path.split(bin_dir)

    lib_dir = os.path.join(_mallet_home, "lib")
    if not os.path.isdir(lib_dir):
        raise ValueError("While configuring mallet: directory %r not found." % lib_dir)

    # Classpath: every jar under lib/, sorted by file name.
    jars = []
    for filename in sorted(os.listdir(lib_dir)):
        if filename.endswith(".jar"):
            jars.append(os.path.join(lib_dir, filename))
    _mallet_classpath = os.path.pathsep.join(jars)
コード例 #20
0
ファイル: treetagger.py プロジェクト: mbwolff/SonGenApp
    def __init__(self,
                 path_to_treetagger=None,
                 language='english',
                 verbose=False,
                 abbreviation_list=None):
        """
        Initialize the TreeTaggerChunker.

        The chunker script is searched for in the current directory and in
        the platform-specific script directory (``bat`` on win32, ``cmd``
        otherwise) of the TreeTagger installation.  The installation
        directory comes from the ``TREETAGGER_HOME`` environment variable
        if it is set, otherwise from ``path_to_treetagger``.

        :param path_to_treetagger: TreeTagger installation directory.
        :param language: Default language is english. Must be one of
            ``self.get_installed_lang()``.
        :param verbose: Forwarded to ``find_binary``.
        :param abbreviation_list: Stored unchanged on ``self._abbr_list``.
        :raises LookupError: If no installation directory is available or
            the language is not installed.
        """
        self._path_to_treetagger = path_to_treetagger if path_to_treetagger else None

        on_windows = _platform == "win32"
        script_dir = 'bat' if on_windows else 'cmd'

        # The environment variable wins over the constructor argument.
        if 'TREETAGGER_HOME' in os.environ:
            install_root = os.environ['TREETAGGER_HOME']
        elif self._path_to_treetagger:
            install_root = self._path_to_treetagger
        else:
            raise LookupError(
                'Set \'TREETAGGER_HOME\' or use path_to_treetagger!')

        search_dirs = [
            os.path.expanduser(entry)
            for entry in ('.', os.path.normpath(
                os.path.join(install_root, script_dir)))
        ]

        self._abbr_list = abbreviation_list

        if language not in self.get_installed_lang():
            raise LookupError('Language not installed!')
        if on_windows:
            chunker_script = 'chunk-' + language + '.bat'
        else:
            chunker_script = 'tagger-chunker-' + language

        try:
            located = find_binary(chunker_script,
                                  searchpath=search_dirs,
                                  url=_treetagger_url,
                                  verbose=verbose)
        except LookupError:
            # Best effort: report the problem and leave the attribute unset.
            print('NLTK was unable to find the TreeTagger Chunker bin!')
        else:
            self._treetagger_chunker_bin = located
コード例 #21
0
ファイル: mallet.py プロジェクト: approximatelylinear/nltk
def config_mallet(mallet_home=None):
    """
    Configure NLTK's interface to the C{mallet} machine learning
    package.

    Sets the module-level C{_mallet_home} and C{_mallet_classpath}.

    :param mallet_home: The full path to the C{mallet} directory.  If
        not specified, then nltk will search the system for a
        C{mallet} directory; and if one is not found, it will raise a
        C{LookupError} exception.
    :type mallet_home: str
    :raises ValueError: If the mallet directory has no C{lib}
        subdirectory.
    """
    global _mallet_home, _mallet_classpath

    # We don't actually care about this binary -- we just use it to
    # make sure we've found the right directory.
    mallethon_bin = find_binary(
        'mallet', mallet_home,
        env_vars=['MALLET',  'MALLET_HOME'],
        binary_names=['mallethon'],
        # The original URL carried a stray trailing '>'.
        url='http://mallet.cs.umass.edu')
    # Record the location where mallet lives.
    bin_dir = os.path.split(mallethon_bin)[0]
    _mallet_home = os.path.split(bin_dir)[0]
    # Construct a classpath for using mallet.
    lib_dir = os.path.join(_mallet_home, 'lib')
    if not os.path.isdir(lib_dir):
        raise ValueError('While configuring mallet: directory %r '
                         'not found.' % lib_dir)
    # Use the platform's path-list separator (':' on POSIX, ';' on
    # Windows) instead of a hard-coded ':'.
    _mallet_classpath = os.path.pathsep.join(
        os.path.join(lib_dir, filename)
        for filename in sorted(os.listdir(lib_dir))
        if filename.endswith('.jar'))
コード例 #22
0
ファイル: boxer.py プロジェクト: navikohli/nltk
 def _find_binary(self, name, bin_dir, verbose=False):
     """Delegate the lookup of the executable ``name`` to ``find_binary``.

     ``bin_dir`` is the explicit location and ``CANDCHOME`` the environment
     variable; ``name`` with and without an ``.exe`` suffix is accepted.
     """
     with_exe_suffix = name + ".exe"
     located = find_binary(
         name,
         path_to_bin=bin_dir,
         env_vars=["CANDCHOME"],
         url="http://svn.ask.it.usyd.edu.au/trac/candc/",
         binary_names=[name, with_exe_suffix],
         verbose=verbose,
     )
     return located
コード例 #23
0
ファイル: tree.py プロジェクト: steve3p0/LING511
    def write_tree_stream(self, nltk_tree):
        """
        Draw ``nltk_tree`` and return the rendered PNG image.

        The tree is drawn on an NLTK canvas, printed to a temporary
        PostScript file and converted to PNG with Ghostscript.

        :param nltk_tree: The tree handed to ``tree_to_treesegment``.
        :return: The contents of the PNG file as a ``bytearray``.
        :raises LookupError: If the Ghostscript executable cannot be found.
        """
        import os
        import subprocess
        import sys
        import tempfile
        from nltk.draw.tree import tree_to_treesegment
        from nltk.draw.util import CanvasFrame
        from nltk.internals import find_binary

        _canvas_frame = CanvasFrame()
        widget = tree_to_treesegment(_canvas_frame.canvas(), nltk_tree)
        _canvas_frame.add_widget(widget)
        _, _, w, h = widget.bbox()
        # print_to_file uses scrollregion to set the width and height of the pdf.
        _canvas_frame.canvas()["scrollregion"] = (0, 0, w, h)
        with tempfile.NamedTemporaryFile() as tmp_file:
            # The temporary file only provides a unique base name.
            in_path = "{0:}.ps".format(tmp_file.name)
            out_path = "{0:}.png".format(tmp_file.name)
            try:
                _canvas_frame.print_to_file(in_path)
                _canvas_frame.destroy_widget(widget)
                try:
                    gs_bin = find_binary(
                        "gs",
                        binary_names=["gswin32c.exe", "gswin64c.exe"],
                        env_vars=["PATH"],
                        verbose=False,
                    )
                except LookupError:
                    pre_error_message = str(
                        "The Ghostscript executable isn't found.\n"
                        "See http://web.mit.edu/ghostscript/www/Install.htm\n"
                        "If you're using a Mac, you can try installing\n"
                        "https://docs.brew.sh/Installation then `brew install ghostscript`"
                    )
                    print(pre_error_message, file=sys.stderr)
                    # Carry the explanation on the exception as well.
                    raise LookupError(pre_error_message)

                subprocess.call(
                    [gs_bin] +
                    "-q -dEPSCrop -sDEVICE=png16m -r90 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}"
                    .format(out_path, in_path).split())

                with open(out_path, "rb") as sr:
                    res = sr.read()
            finally:
                # Remove whatever was produced, even if a step above failed.
                for leftover in (in_path, out_path):
                    if os.path.exists(leftover):
                        os.remove(leftover)
            return bytearray(res)
コード例 #24
0
    def __init__(self, path_to_treetagger=None, language='english',
                 verbose=False, abbreviation_list=None):
        """
        Initialize the TreeTagger.

        The tagger script is searched for in the current directory and in
        the platform-specific directory (``bin`` on win32, ``cmd``
        otherwise) of the TreeTagger installation.  The installation
        directory comes from the ``TREETAGGER_HOME`` environment variable
        if it is set, otherwise from ``path_to_treetagger``.

        :param path_to_treetagger: TreeTagger installation directory.
        :param language: Default language is english. Must be one of
            ``self.get_installed_lang()``.
        :param verbose: Forwarded to ``find_binary``.
        :param abbreviation_list: Stored unchanged on ``self._abbr_list``.
        :raises LookupError: If no installation directory is available or
            the language is not installed.
        """
        if not path_to_treetagger:
            path_to_treetagger = None
        self._path_to_treetagger = path_to_treetagger

        is_win32 = _platform == "win32"

        # The environment variable wins over the constructor argument.
        if 'TREETAGGER_HOME' in os.environ:
            base_dir = os.environ['TREETAGGER_HOME']
        elif self._path_to_treetagger:
            base_dir = self._path_to_treetagger
        else:
            raise LookupError('Set \'TREETAGGER_HOME\' or use path_to_treetagger!')

        if is_win32:
            tt_path = os.path.normpath(os.path.join(base_dir, 'bin'))
        else:
            tt_path = os.path.normpath(os.path.join(base_dir, 'cmd'))
        treetagger_paths = [os.path.expanduser('.'),
                            os.path.expanduser(tt_path)]

        self._abbr_list = abbreviation_list

        if language not in self.get_installed_lang():
            raise LookupError('Language not installed!')
        if is_win32:
            script_name = 'tag-' + language + '.bat'
        else:
            script_name = 'tree-tagger-' + language

        try:
            self._treetagger_bin = find_binary(script_name,
                                               searchpath=treetagger_paths,
                                               url=_treetagger_url,
                                               verbose=verbose)
        except LookupError:
            # Best effort: report the problem and leave _treetagger_bin unset.
            print('NLTK was unable to find the TreeTagger bin!')
コード例 #25
0
    def __init__(self,
                 path_to_home=None,
                 language='german',
                 encoding='utf8',
                 verbose=False):
        """
        Initialize the TreeTagger.

        :param path_to_home: Explicit location handed to ``find_binary``.
        :param language: Default language is german. Must be listed in
            ``_treetagger_languages[encoding]``.
        :param encoding: The encoding used by the model. Unicode tokens
            passed to the tag() and batch_tag() methods are converted to
            this charset when they are sent to TreeTagger.
            The default is utf8.

            This parameter is ignored for str tokens, which are sent as-is.
            The caller must ensure that tokens are encoded in the right charset.
        :param verbose: Forwarded to ``find_binary``.
        :raises LookupError: If the encoding/language combination is
            unknown.
        """
        candidate_dirs = [
            '/u/metanet/nlptools/tree-tagger-3.2/bin',
            '/u/metanet/nlptools/tree-tagger-3.2/cmd',
            '~/metanet/nlptools/tree-tagger-3.2/bin',
            '~/metanet/nlptools/tree-tagger-3.2/cmd', '/usr/bin',
            '/usr/local/bin', '/opt/local/bin', '/Applications/bin', '~/bin',
            '~/Applications/bin', '~/work/TreeTagger/cmd'
        ]
        search_dirs = map(os.path.expanduser, candidate_dirs)

        try:
            if language not in _treetagger_languages[encoding]:
                raise LookupError(
                    'NLTK was unable to find the TreeTagger bin!')
            if encoding == u'latin-1':
                # the executable has no encoding information for latin-1
                binary_name = 'tree-tagger-' + language
                self._encoding = u'latin-1'
            else:
                binary_name = 'tree-tagger-' + language + u'-' + encoding
                self._encoding = encoding
        except KeyError:
            # Unknown encoding: reported the same way as an unknown language.
            raise LookupError('NLTK was unable to find the TreeTagger bin!')

        self._treetagger_bin = find_binary(
            binary_name,
            path_to_home,
            env_vars=('TREETAGGER', 'TREETAGGER_HOME'),
            searchpath=search_dirs,
            url=_treetagger_url,
            verbose=verbose)

        if encoding in _treetagger_charset:
            self._encoding = encoding
コード例 #26
0
ファイル: prover9.py プロジェクト: willowjar/scratchNLP
 def _find_binary(self, name, verbose=False):
     """Look up the executable ``name`` through ``internals.find_binary``.

     The search path is ``self.binary_locations()`` plus, when it is set,
     ``self._binary_location``.  The file may be called ``name`` or
     ``name + '.exe'``.
     """
     # Work on a copy: ``+=`` would extend the list returned by
     # binary_locations() in place, which grows it on every call if that
     # method hands out a shared list.
     binary_locations = list(self.binary_locations())
     if self._binary_location is not None:
         binary_locations.append(self._binary_location)
     return internals.find_binary(
         name,
         searchpath=binary_locations,
         env_vars=['PROVER9HOME'],
         url='http://www.cs.unm.edu/~mccune/prover9/',
         binary_names=[name, name + '.exe'],
         verbose=verbose)
コード例 #27
0
ファイル: treetagger.py プロジェクト: estnltk/pfe
    def __init__(self, path_to_home=None, language="english", encoding="latin-1", verbose=False):
        """
        Initialize the TreeTagger.

        :param path_to_home: Explicit location handed to ``find_binary``.
        :param language: Default language is english. Must be listed in
            ``_treetagger_languages[encoding]``.
        :param encoding: The encoding used by the model. Unicode tokens
            passed to the tag() and batch_tag() methods are converted to
            this charset when they are sent to TreeTagger.
            The default is latin-1.

            This parameter is ignored for str tokens, which are sent as-is.
            The caller must ensure that tokens are encoded in the right charset.
        :param verbose: Forwarded to ``find_binary``.
        :raises LookupError: If the encoding/language combination is
            unknown.
        """
        treetagger_paths = map(os.path.expanduser, [
            ".",
            "/usr/bin",
            "/usr/local/bin",
            "/opt/local/bin",
            "/Applications/bin",
            "~/bin",
            "~/Applications/bin",
            "~/work/TreeTagger/cmd",
        ])

        try:
            known_languages = _treetagger_languages[encoding]
            if language not in known_languages:
                raise LookupError("NLTK was unable to find the TreeTagger bin!")
            if encoding != u"latin-1":
                executable = "tree-tagger-" + language + u"-" + encoding
                self._encoding = encoding
            else:
                # the executable has no encoding information for latin-1
                executable = "tree-tagger-" + language
                self._encoding = u"latin-1"
        except KeyError:
            # Unknown encoding: reported the same way as an unknown language.
            raise LookupError("NLTK was unable to find the TreeTagger bin!")

        lookup_env = ("TREETAGGER", "TREETAGGER_HOME")
        self._treetagger_bin = find_binary(
            executable,
            path_to_home,
            env_vars=lookup_env,
            searchpath=treetagger_paths,
            url=_treetagger_url,
            verbose=verbose,
        )

        if encoding in _treetagger_charset:
            self._encoding = encoding
コード例 #28
0
ファイル: prover9.py プロジェクト: willowjar/scratchNLP
 def config_prover9(self, binary_location, verbose=False):
     """Locate the ``prover9`` executable and remember where it lives.

     :param binary_location: Explicit location handed to
         ``internals.find_binary``.  ``None`` resets both
         ``self._prover9_bin`` and ``self._binary_location`` to ``None``
         without searching.
     :param verbose: Forwarded to ``internals.find_binary``.
     """
     if binary_location is None:
         self._binary_location = None
         self._prover9_bin = None
     else:
         name = 'prover9'
         self._prover9_bin = internals.find_binary(
             name,
             path_to_bin=binary_location,
             env_vars=['PROVER9HOME'],
             url='http://www.cs.unm.edu/~mccune/prover9/',
             binary_names=[name, name + '.exe'],
             verbose=verbose)
         # Store the directory that contains the binary.  The previous
         # ``rsplit(os.path.sep, 1)`` stored a two-element list, which
         # _find_binary then appended to a search path of directory names.
         self._binary_location = os.path.dirname(self._prover9_bin)
コード例 #29
0
ファイル: annotation.py プロジェクト: irit-melodi/educe
 def to_pdf(self, filename):
     """Write this object's image representation to ``filename`` as PDF.

     A PostScript file is written next to ``filename`` (same name, ``.ps``
     extension), converted with ghostscript and then removed.
     """
     # Step 1: PostScript via the drawing utils of NLTK.
     stem = os.path.splitext(filename)[0]
     ps_path = '{0:}.ps'.format(stem)
     self.to_ps(ps_path)

     # Step 2: PostScript -> PDF with ghostscript.
     gs_executable = find_binary(
         'gs',
         binary_names=['gswin32c.exe', 'gswin64c.exe'],
         env_vars=['PATH'],
         verbose=False)
     gs_template = ('-q -dEPSCrop {2:} -dSAFER -dBATCH -dNOPAUSE '
                    '-sOutputFile={0:} {1:}')
     gs_options = gs_template.format(
         filename, ps_path, _GS_PARAMS['pdf']).split()
     subprocess.call([gs_executable] + gs_options)

     # Step 3: the intermediate PostScript file is no longer needed.
     os.remove(ps_path)
コード例 #30
0
def config_megam(bin=None):
    """Set up NLTK's interface to the ``megam`` maxent optimization package.

    The located binary is stored in the module-level ``_megam_bin``.

    :param bin: The full path to the ``megam`` binary.  If not specified,
        then nltk will search the system for a ``megam`` binary; and if
        one is not found, it will raise a ``LookupError`` exception.
    :type bin: str
    """
    global _megam_bin
    # Executable names that find_binary is asked to look for.
    candidate_names = ['megam.opt', 'megam', 'megam_686', 'megam_i686.opt']
    _megam_bin = find_binary(
        'megam',
        bin,
        env_vars=['MEGAM'],
        binary_names=candidate_names,
        url='http://www.umiacs.umd.edu/~hal/megam/index.html',
    )
コード例 #31
0
    def __init__(self, path_to_home=None, language='german',
                 encoding='utf8', verbose=False, abbreviation_list=None):
        """
        Initialize the TreeTagger.

        :param path_to_home: The TreeTagger binary.
        :param language: Default language is german.
        :param encoding: The encoding used by the model. Unicode tokens
            passed to the tag() and batch_tag() methods are converted to
            this charset when they are sent to TreeTagger.
            The default is utf8.

            This parameter is ignored for str tokens, which are sent as-is.
            The caller must ensure that tokens are encoded in the right charset.
        :param verbose: Forwarded to ``find_binary``.
        :param abbreviation_list: Stored unchanged on ``self._abbr_list``.
        :raises LookupError: If ``encoding`` is not a key of
            ``_treetagger_languages``, or ``language`` is not listed for
            that encoding.
        """
        treetagger_paths = ['.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
                        '/Applications/bin', '~/bin', '~/Applications/bin',
                        '~/work/TreeTagger/cmd', '~/tree-tagger/cmd', '/tree-tagger/bin', '/tree-tagger/cmd', '/var/opt/treetagger/bin', '/var/opt/treetagger/cmd']
        # Build a real list: on Python 3 ``map`` yields a one-shot iterator,
        # which is exhausted after a single pass and is always truthy, so
        # the search path could not be safely re-read by ``find_binary``.
        treetagger_paths = [os.path.expanduser(p) for p in treetagger_paths]
        self._abbr_list = abbreviation_list

        try:
            if language in _treetagger_languages[encoding]:
                # The executable name carries no encoding suffix, so both
                # branches resolve to the same binary name.
                treetagger_bin_name = 'tree-tagger-' + language
                if encoding == u'latin-1':
                    self._encoding = u'latin-1'
                else:
                    self._encoding = encoding
            else:
                raise LookupError('NLTK was unable to find the TreeTagger bin!')
        except KeyError:
            # Unknown encoding: report it the same way as an unknown language.
            raise LookupError('NLTK was unable to find the TreeTagger bin!')

        self._treetagger_bin = find_binary(
                treetagger_bin_name, path_to_home,
                env_vars=('TREETAGGER', 'TREETAGGER_HOME'),
                searchpath=treetagger_paths,
                url=_treetagger_url,
                verbose=verbose)
        print(u'### {}'.format(self._treetagger_bin))

        if encoding in _treetagger_charset:
            self._encoding = encoding
        print(u'#### {}'.format(self._encoding))
コード例 #32
0
 def to_pdf(self, filename):
     """Produce a PDF image of this object at ``filename``.

     The method first writes a ``.ps`` file (same base name as
     ``filename``) via ``self.to_ps``, runs ghostscript on it, and finally
     removes the ``.ps`` file.

     :param filename: Destination path of the PDF.
     """
     base_name = os.path.splitext(filename)[0]
     ps_file = '{0:}.ps'.format(base_name)
     self.to_ps(ps_file)

     # The ghostscript executable comes first, followed by its options.
     command = [
         find_binary('gs',
                     binary_names=['gswin32c.exe', 'gswin64c.exe'],
                     env_vars=['PATH'],
                     verbose=False)
     ]
     options = '-q -dEPSCrop {2:} -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}'
     command.extend(
         options.format(filename, ps_file, _GS_PARAMS['pdf']).split())
     subprocess.call(command)

     # Drop the intermediate PostScript file.
     os.remove(ps_file)
コード例 #33
0
ファイル: megam.py プロジェクト: gpaulbr/ERelp
def config_megam(bin=None):
    """
    Set up NLTK's interface to the C{megam} maxent optimization package
    by locating its binary and storing the result in the module-level
    C{_megam_bin}.

    @param bin: The full path to the C{megam} binary.  If not specified,
        then nltk will search the system for a C{megam} binary; and if
        one is not found, it will raise a C{LookupError} exception.
    @type bin: C{string}
    """
    global _megam_bin
    # Lookup options handed to find_binary as keyword arguments.
    lookup_options = {
        'env_vars': ['MEGAM',  'MEGAMHOME'],
        'binary_names': ['megam.opt', 'megam', 'megam_686', 'megam_i686.opt'],
        'url': 'http://www.cs.utah.edu/~hal/megam/',
    }
    _megam_bin = find_binary('megam', bin, **lookup_options)
コード例 #34
0
def config_megam(bin=None):
    """
    Locate the ``megam`` maxent optimization binary for NLTK and remember
    it in the module-level ``_megam_bin``.

    :param bin: The full path to the ``megam`` binary.  If not specified,
        then nltk will search the system for a ``megam`` binary; and if
        one is not found, it will raise a ``LookupError`` exception.
    :type bin: str
    """
    global _megam_bin
    located = find_binary(
        'megam',
        bin,
        env_vars=['MEGAM'],
        binary_names=['megam.opt', 'megam', 'megam_686', 'megam_i686.opt'],
        url='http://www.umiacs.umd.edu/~hal/megam/index.html')
    # Publish the result only after the lookup has succeeded.
    _megam_bin = located
コード例 #35
0
    def __init__(self,
                 path_to_home=None,
                 language='german',
                 verbose=False,
                 abbreviation_list=None):
        """
        Initialize the TreeTagger.

        :param path_to_home: The TreeTagger binary.
        :param language: Default language is german.  It must be present
            in ``_treetagger_languages``.
        :param verbose: Forwarded to ``find_binary``.
        :param abbreviation_list: Stored unchanged on ``self._abbr_list``.
        :raises LookupError: If ``language`` is not in
            ``_treetagger_languages``.

        If the binary cannot be located, a message is printed and
        ``self._treetagger_bin`` is left unset; no exception escapes.
        """
        self._abbr_list = abbreviation_list
        self.language = language

        if language not in _treetagger_languages:
            raise LookupError('Language not in language list!')

        # The launcher's name prefix depends on the platform.
        if _platform == "win32":
            name_prefix = 'tag-'
        else:
            name_prefix = 'tree-tagger-'
        binary_name = name_prefix + language

        candidate_dirs = [
            '.', '/usr/bin', '/usr/local/bin', '/opt/local/bin',
            '/Applications/bin', '~/bin', '~/Applications/bin',
            '~/work/tmp/treetagger/cmd', '~/TreeTagger/cmd'
        ]
        search_dirs = [os.path.expanduser(d) for d in candidate_dirs]

        try:
            self._treetagger_bin = find_binary(
                binary_name,
                path_to_home,
                env_vars=('TREETAGGER', 'TREETAGGER_HOME'),
                searchpath=search_dirs,
                url=_treetagger_url,
                verbose=verbose)
        except LookupError:
            print('NLTK was unable to find the TreeTagger bin!')
コード例 #36
0
ファイル: megam.py プロジェクト: tamamshud/nltk
def config_megam(bin=None):
    """
    Point NLTK at the ``megam`` maxent optimization package by resolving
    its binary and saving it in the module-level ``_megam_bin``.

    :param bin: The full path to the ``megam`` binary.  If not specified,
        then nltk will search the system for a ``megam`` binary; and if
        one is not found, it will raise a ``LookupError`` exception.
    :type bin: str
    """
    global _megam_bin
    # Environment variables and download page handed to find_binary.
    env_names = ["MEGAM", "MEGAMHOME"]
    home_page = "http://www.cs.utah.edu/~hal/megam/"
    _megam_bin = find_binary(
        "megam",
        bin,
        env_vars=env_names,
        binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
        url=home_page,
    )
コード例 #37
0
ファイル: ptbconv.py プロジェクト: Sandy4321/nltk_contrib
def _run_ptbconv(num, format='D', verbose=False):
    """Run the ``ptbconv`` binary on one treebank file and return its output.

    :param num: Number of the file; it is formatted as ``wsj_%04d.mrg``
        inside the ``combined`` directory under ``_treebank_path()``.
    :param format: Appended to ``-`` to form the single command-line
        option passed to ``ptbconv``.
    :param verbose: When true, the stripped standard error of the process
        is printed.
    :return: Whatever the process wrote to standard output.
    """
    ptbconv_bin = find_binary('ptbconv',
                              env_vars=['PTBCONV'],
                              url='http://www.jaist.ac.jp/~h-yamada/',
                              verbose=False)

    # os.path.join accepts any number of components; no reduce() needed.
    input_filename = os.path.join(_treebank_path(), 'combined',
                                  'wsj_%04d.mrg' % num)

    cmd = [ptbconv_bin, '-' + format]
    # Open the input inside a ``with`` block so the handle is closed once
    # the child process has finished, instead of being leaked.
    with open(input_filename) as infile:
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             stdin=infile)
        (stdout, stderr) = p.communicate()

    if verbose:
        # Parenthesized form is valid on both Python 2 and Python 3.
        print(stderr.strip())

    return stdout
コード例 #38
0
ファイル: ptbconv.py プロジェクト: sushengyang/NLP-project
def _run_ptbconv(num, format='D', verbose=False):
    """Convert one treebank file with ``ptbconv`` and return the raw output.

    :param num: File number, used to build the name ``wsj_%04d.mrg`` in
        the ``combined`` directory under ``_treebank_path()``.
    :param format: Option letter(s); ``'-' + format`` is the only argument
        given to the binary.
    :param verbose: When true, print the process's stripped standard error.
    :return: The standard output captured from the process.
    """
    ptbconv_bin = find_binary('ptbconv',
                              env_vars=['PTBCONV'],
                              url='http://www.jaist.ac.jp/~h-yamada/',
                              verbose=False)

    # A single os.path.join call replaces the reduce() over the components.
    input_filename = os.path.join(_treebank_path(), 'combined',
                                  'wsj_%04d.mrg' % num)

    cmd = [ptbconv_bin, '-' + format]
    # The context manager guarantees the input file is closed after the
    # subprocess has consumed it, even if communicate() raises.
    with open(input_filename) as infile:
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             stdin=infile)
        (stdout, stderr) = p.communicate()

    if verbose:
        # Works as a statement on Python 2 and as a call on Python 3.
        print(stderr.strip())

    return stdout
コード例 #39
0
ファイル: tnt.py プロジェクト: Sandy4321/nltk_contrib
def config_tnt(bin=None, verbose=False):
    """
    Configure the location of TnT Executable

    The resolved path is cached in the module-level C{_tnt_bin}.  A cached
    value is reused only when no explicit C{bin} is given, so passing
    C{bin} always triggers a fresh lookup.

    @param bin: Path to the TnT executable
    @type bin: C{str}
    @param verbose: Forwarded to C{find_binary}.
    @return: The path of the TnT binary.
    """
    # Without this declaration the assignment below makes ``_tnt_bin`` a
    # local name, so the cache check could never see a stored value.
    global _tnt_bin

    try:
        cached = _tnt_bin
    except NameError:
        # The module has not defined ``_tnt_bin`` yet.
        cached = None

    if cached and bin is None:
        return cached

    # Find the tnt binary.
    _tnt_bin = find_binary('tnt', bin,
        searchpath=tnt_search, env_vars=['TNTHOME'],
        url='http://www.coli.uni-saarland.de/~thorsten/tnt/',
        verbose=verbose)

    return _tnt_bin
コード例 #40
0
    def __init__(self, path_to_home=None, language='german',
                 verbose=False, abbreviation_list=None):
        """
        Initialize the TreeTagger.

        :param path_to_home: The TreeTagger binary.
        :param language: Default language is german.  It must be present
            in ``_treetagger_languages``.
        :param verbose: Forwarded to ``find_binary``.
        :param abbreviation_list: Stored unchanged on ``self._abbr_list``.
        :raises LookupError: If ``language`` is not in
            ``_treetagger_languages``.

        A failed binary lookup is reported with a printed message only;
        in that case ``self._treetagger_bin`` is not assigned.
        """
        self._abbr_list = abbreviation_list

        # Reject unsupported languages before doing any lookup work.
        if language not in _treetagger_languages:
            raise LookupError('Language not in language list!')
        binary_name = ('tag-' if _platform == "win32" else 'tree-tagger-') + language

        search_dirs = [
            os.path.expanduser(directory)
            for directory in ('.', '/usr/bin', '/usr/local/bin',
                              '/opt/local/bin', '/Applications/bin', '~/bin',
                              '~/Applications/bin',
                              '~/work/tmp/treetagger/cmd', '~/tree-tagger/cmd')
        ]

        try:
            located = find_binary(
                binary_name, path_to_home,
                env_vars=('TREETAGGER', 'TREETAGGER_HOME'),
                searchpath=search_dirs,
                url=_treetagger_url,
                verbose=verbose)
        except LookupError:
            print('NLTK was unable to find the TreeTagger bin!')
        else:
            self._treetagger_bin = located
コード例 #41
0
ファイル: taggers.py プロジェクト: florecista/film-analysis
    def __init__(self,
                 path_to_bin=None,
                 path_to_model=None,
                 language='en',
                 verbose=False):
        """
        Initialize the OpenNLPTagger.

        :param path_to_bin: Path to bin directory of OpenNLP installation.
            When ``None`` (the default), no explicit location is handed to
            ``find_binary``, which is then given only the environment
            variables and search paths listed below.
        :param path_to_model: The path to OpenNLP POS tagger .bin file.
        :param language: Language to use; default setting is 'en'.
        :param verbose: Forwarded to ``find_binary``.
        :raises LookupError: If ``path_to_model`` is ``None`` or
            ``language`` is not in ``_opennlp_languages``.

        If the run file cannot be located, a message is printed and
        ``self._opennlp_bin`` is left unset.
        """
        if path_to_model is None:
            raise LookupError('OpenNLP model file is not set!')
        self._model_path = path_to_model

        opennlp_paths = [
            '.', '/usr/bin', '/usr/local/apache-opennlp',
            '/opt/local/apache-opennlp', '~/apache-opennlp'
        ]
        opennlp_paths = list(map(os.path.expanduser, opennlp_paths))

        if language in _opennlp_languages:
            opennlp_bin_name = "opennlp"
            if sys.platform.startswith("win"):
                opennlp_bin_name += ".bat"
        else:
            raise LookupError('Language not in language list!')

        # ``path_to_bin`` defaults to None, and os.path.join(None, ...)
        # raises before any lookup can happen; only build an explicit
        # path when a directory was actually supplied.
        if path_to_bin is None:
            explicit_bin = None
        else:
            explicit_bin = os.path.join(path_to_bin, opennlp_bin_name)

        try:
            self._opennlp_bin = find_binary(
                opennlp_bin_name,
                explicit_bin,
                env_vars=('OPENNLP_HOME', 'OPENNLP'),
                searchpath=opennlp_paths,
                verbose=verbose)
        except LookupError:
            print('Unable to find the Apache OpenNLP run file!')
コード例 #42
0
ファイル: tnt.py プロジェクト: sushengyang/NLP-project
def config_tnt(bin=None, verbose=False):
    """
    Configure the location of TnT Executable

    The path that is found is remembered in the module-level C{_tnt_bin}
    and returned directly on later calls that do not pass C{bin}; an
    explicit C{bin} always causes a new lookup.

    @param bin: Path to the TnT executable
    @type bin: C{str}
    @param verbose: Forwarded to C{find_binary}.
    @return: The path of the TnT binary.
    """
    # ``_tnt_bin`` is assigned below; it must be declared global or Python
    # treats it as a local and the cache check fails on every call.
    global _tnt_bin

    try:
        cached = _tnt_bin
    except NameError:
        # No module-level ``_tnt_bin`` exists yet.
        cached = None

    if cached and bin is None:
        return cached

    # Find the tnt binary.
    _tnt_bin = find_binary('tnt',
                           bin,
                           searchpath=tnt_search,
                           env_vars=['TNTHOME'],
                           url='http://www.coli.uni-saarland.de/~thorsten/tnt/',
                           verbose=verbose)

    return _tnt_bin