Esempio n. 1
0
from glob import glob
from sys import stderr
from xml.etree.cElementTree import ElementTree, tostring

from daeso.utils.cli import DaesoArgParser
from daeso.utils.etree import write


# TODO:
# - this is not too smart yet - you can do this with sed,
#   but escaping all the slashes is a pain
# - windows support


parser = DaesoArgParser(description=__doc__)


parser.add_argument("corpus", nargs="+", help="parallel graph corpus")

parser.add_argument(
    "-p",
    "--path-prefix-pair",
    nargs=2,
    default=["", ""],
    metavar="DIR",
    help="a pair of path prefixes specifying what to change from and to",
)

parser.add_argument(
    "-t", "--test", action="store_true", help="perform a dry run without changing anything for real (implies -V)"
Esempio n. 2
0
  
  The graphs must have an "id" attribute which is identical to the "id" of a
  corresponding sentence (<s> element, by default) in the marked-up text
  files. That is, a <links>'s "from_id" and "to_id" must identify
  corresponding graphs in the "from" and "to" graphbanks respectively.
  
  The graphbanks are assumed to be in GraphML format, unless specified
  otherwise by means of the --source-graphbank-format and
  --target-graphbank-format option.
  
  The default set of alignment relations for the parallel graph corpus is the
  Daeso set, but you can change it using the --relations option.
  
""" + epilog

parser = DaesoArgParser(description=__doc__, epilog=epilog)

parser.add_argument("-p",
                    "--parallel-text-corpora",
                    metavar="CORPUS",
                    nargs="+",
                    default=(),
                    help='parallel text corpora')

parser.add_argument("-s",
                    "--source-graphbanks",
                    metavar="GRAPHBANK",
                    nargs="+",
                    default=(),
                    help='source graphbanks')
Esempio n. 3
0
"""
copy a parallel graph corpus

Automatically takes care of the internal references to graph bank files.
Usage is similar to the "cp" shell command.
"""

__authors__ = 'Erwin Marsi <*****@*****.**>'

from os.path import isdir, basename, join, samefile, exists
from sys import stderr

from daeso.utils.cli import DaesoArgParser
from daeso.pgc.corpus import ParallelGraphCorpus, LOAD_NONE

# Command line interface, similar to "cp": one or more source corpora
# followed by a single target (a file or a directory).
parser = DaesoArgParser(description=__doc__)

parser.add_argument(
    "source",
    nargs="+",
    help="source parallel graph corpus file",
)

parser.add_argument(
    "target",
    help="either a target parallel graph corpus file or a target directory",
)

# Flag, off by default.
parser.add_argument(
    "-o", "--overwrite",
    action="store_true",
    help="overwrite existing file",
)

args = parser.parse_args()
Esempio n. 4
0
  
  The graphs must have an "id" attribute which is identical to the "id" of a
  corresponding sentence (<s> element, by default) in the marked-up text
  files. That is, a <links>'s "from_id" and "to_id" must identify
  corresponding graphs in the "from" and "to" graphbanks respectively.
  
  The graphbanks are assumed to be in GraphML format, unless specified
  otherwise by means of the --source-graphbank-format and
  --target-graphbank-format option.
  
  The default set of alignment relations for the parallel graph corpus is the
  Daeso set, but you can change it using the --relations option.
  
""" + epilog 

parser = DaesoArgParser(description=__doc__, epilog=epilog)


parser.add_argument(
    "-p", "--parallel-text-corpora", 
    metavar="CORPUS",
    nargs="+",
    default=(),
    help='parallel text corpora')

parser.add_argument(
    "-s", "--source-graphbanks", 
    metavar="GRAPHBANK",
    nargs="+",
    default=(),
    help='source graphbanks')
Esempio n. 5
0
# TODO:
# - silence warning about meta-data


__authors__ = 'Erwin Marsi <*****@*****.**>'


import sys

from daeso.utils.cli import DaesoArgParser
from daeso.utils.opsys import multiglob
from daeso.pgc.corpus import ParallelGraphCorpus


parser = DaesoArgParser(description=__doc__)

    
parser.add_argument(
    "file",
    nargs="+",
    metavar="FILE",
    help="parallel graph corpus filename, "
    "or quoted file name pattern for parallel graph corpora"
    )

parser.add_argument(
    "-f", "--format",
    action="store_true",
    help="output indented XML"
    )
Esempio n. 6
0
align graphs in parallel graph corpus
"""

__authors__ = 'Erwin Marsi <*****@*****.**>'
__version__ = "0.9"


import imp

from daeso.utils.cli import DaesoArgParser
from daeso.utils.opsys import multiglob
from daeso.pgc.corpus import ParallelGraphCorpus

from daeso_nl.ga.setup import set_up_corpus_aligner

parser = DaesoArgParser(description=__doc__, version=__version__)


parser.add_argument(
    "pgc_files",
    nargs="+",
    metavar="FILE",
    help="parallel graph corpus file"
    )

parser.add_argument(
    "-c", "--config",
    metavar="FILE",
    help="configuration file to set up a corpus aligner")  

parser.add_argument(
Esempio n. 7
0
# Base directory of the corpora, read from the environment; falls back to
# an empty string when DAESO_CORPUS is not set.
corpus_dir = getenv("DAESO_CORPUS", "")

if not corpus_dir:
    # Terminate the warning with a newline so that whatever is written to
    # stderr next does not end up glued to it on the same line.
    stderr.write("Warning: environment variable DAESO_CORPUS not found!\n")


def expand_globs(corpus_dir, globs):
    """
    Collect the files matched by a sequence of glob patterns.

    Every pattern in globs is handed to relglob together with corpus_dir,
    and the results are concatenated, in pattern order, into a single new
    list. No de-duplication is done, so a file matched by several patterns
    is listed once per match.
    """
    return [
        filename
        for pattern in globs
        for filename in relglob(corpus_dir, pattern)
    ]


parser = DaesoArgParser(description=__doc__, version=__version__)

# One or more glob patterns; the help text refers to the option exactly as
# it is spelled on the command line (--corpus-dir, with a hyphen).
parser.add_argument(
    "pgc_glob",
    nargs="+",
    help=("glob (i.e. filename pattern) for parallel graph corpora, "
          "interpreted relative to the corpus base directory "
          "(cf. --corpus-dir)"))

# The default comes from the module-level corpus_dir value, which is also
# echoed in the help text.
parser.add_argument(
    "-c",
    "--corpus-dir",
    default=corpus_dir,
    help="pgc filenames are interpreted relative to this base directory "
    "(default is '" + corpus_dir + "')")
Esempio n. 8
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
simple server providing access to the Alpino parser for Dutch through XML-RPC
"""

__author__ = 'Erwin Marsi <*****@*****.**>'

__version__ = "0.9"
from sys import exit

from daeso.utils.cli import DaesoArgParser
from daeso_nl.alpino.server import start_server, DEFAULT_HOST, DEFAULT_PORT

parser = DaesoArgParser(description=__doc__, version="%(prog)s version " +
                        __version__)


parser.add_argument("-H", "--host", 
                    default="%s:%d" % (DEFAULT_HOST, DEFAULT_PORT),
                    metavar="HOST[:PORT]",
                    help="name or IP address of host (default is '%s') "
                    "optionally followed by a port number "
                    "(default is %d)" % (DEFAULT_HOST, DEFAULT_PORT))

parser.add_argument("-c", "--command",
                    help="command line to start Alpino parser")

parser.add_argument("-o", "--out_dir",
                    help="directory for writing temporary files")
Esempio n. 9
0
"""
parallel text corpus diff

reports difference in text alignments between two parallel text corpora
"""

from daeso.utils.cli import DaesoArgParser
from daeso.ptc.diff import print_diff


__authors__ = 'Erwin Marsi <*****@*****.**>'

    

parser = DaesoArgParser(description=__doc__)


# The two corpora to compare: the reference ("true") alignments and the
# predicted ones.
parser.add_argument(
    "true_corpus", 
    help="parallel text corpus containing true alignments")

parser.add_argument(
    "pred_corpus", 
    help="parallel text corpus containing predicted alignments")


# Restricts the comparison to alignments involving a single tag.
parser.add_argument(
    "-t", "--tag", 
    default="s", 
    help='only consider alignments involving this tag (default is "s")')
Esempio n. 10
0
# (at your option) any later version.
#
# The Algraeph program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Erwin Marsi <*****@*****.**>"

from daeso.utils.cli import DaesoArgParser
from graeph.release import version, description

parser = DaesoArgParser(description=description.strip(), version=version)

parser.add_argument("corpus_file",
                    metavar="FILE",
                    nargs="?",
                    help="parallel graph corpus file")

parser.add_argument("-d",
                    "--dot_exec",
                    metavar="FILE",
                    help='"dot" graph drawing program')

parser.add_argument(
    "-r",
    "--redirect",
    action='store_true',
Esempio n. 11
0
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__version__ = "1.0"
__author__ = "Erwin Marsi"

from htxt.gui import Hitaext
from daeso.utils.cli import DaesoArgParser

# Program description shown in the command line help.
description = """
Hitaext: hierarchical text alignment tool
"""

parser = DaesoArgParser(description=description.strip(), version=__version__)

# The corpus file is optional (nargs="?").
parser.add_argument("corpus_file",
                    metavar="FILE",
                    nargs="?",
                    help="parallel text corpus file")

parser.add_argument(
    "-r",
    "--redirect",
    action='store_true',
    help="redirect output written to stdout and stderr streams "
    "to a pop-up window")

args = parser.parse_args()
Esempio n. 12
0
__authors__ = 'Erwin Marsi <*****@*****.**>'

from glob import glob
from sys import stderr
from xml.etree.cElementTree import ElementTree, tostring

from daeso.utils.cli import DaesoArgParser
from daeso.utils.etree import write

# TODO:
# - this is not too smart yet - you can do this with sed,
#   but escaping all the slashes is a pain
# - windows support

parser = DaesoArgParser(description=__doc__)

parser.add_argument("corpus", nargs="+", help="parallel graph corpus")

parser.add_argument(
    "-p",
    "--path-prefix-pair",
    nargs=2,
    default=["", ""],
    metavar="DIR",
    help="a pair of path prefixes specifying what to change from and to")

parser.add_argument(
    "-t",
    "--test",
    action="store_true",
Esempio n. 13
0
__authors__ = 'Erwin Marsi <*****@*****.**>'


import os
import sys

from daeso.utils.cli import DaesoArgParser
from daeso.pgc.corpus import ParallelGraphCorpus


def log(s):
    """
    Write s to stderr, prefixed with "***", when running verbosely.

    Reads the verbose flag from the global args object; nothing is
    written when the flag is off.
    """
    if not args.verbose:
        return

    print >>sys.stderr, "***", s
        
parser = DaesoArgParser(description=__doc__)

parser.add_argument(
    "filename",
    metavar="FILE",
    help="parallel graph corpus"
    )

parser.add_argument(
    "-f", "--format",
    action="store_true",
    help="output indented XML"
    )

parser.add_argument(
    "-p", "--parts",
Esempio n. 14
0
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__version__ = "1.0"
__author__ = "Erwin Marsi"

from htxt.gui import Hitaext
from daeso.utils.cli import DaesoArgParser


# Program description shown in the command line help.
description = """
Hitaext: hierarchical text alignment tool
"""

parser = DaesoArgParser(description=description.strip(), version=__version__)

# The corpus file is optional (nargs="?").
parser.add_argument(
    "corpus_file",
    metavar="FILE",
    nargs="?", 
    help="parallel text corpus file")

parser.add_argument(
    "-r", "--redirect",
    action='store_true',
    help="redirect output written to stdout and stderr streams "
    "to a pop-up window")

args = parser.parse_args()
Esempio n. 15
0
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
dump instance base

Converts a Timbl instance file to a Timbl instance base file given a graph
aligner configuration. Timbl options and filenames are taken from the
configuration file (cf. the "timbl_inst_fname" and "timbl_ib_fname"
attributes).
"""

__authors__ = 'Erwin Marsi <*****@*****.**>'
__version__ = "0.9"

import imp 

from daeso.utils.cli import DaesoArgParser
from daeso_nl.ga.setup import dump_inst_base

# Command line interface: a single positional argument naming the
# configuration file.
parser = DaesoArgParser(description=__doc__, version=__version__)
parser.add_argument("config",
                    metavar="FILE",
                    help="configuration file to set up a corpus aligner")

args = parser.parse_args()

# The configuration file is Python source: load it as a module named
# "config" and pass that module to dump_inst_base.
config = imp.load_source("config", args.config)
dump_inst_base(config)
Esempio n. 16
0
__authors__ = 'Erwin Marsi <*****@*****.**>'

from glob import glob

from daeso.utils.cli import DaesoArgParser, epilog
from daeso.gb.gbstats import gb_stats

epilog = """
Remarks:
  * Many columns will have zero values, because a parallel graph corpus 
    is required to get alignment information; see pgc_stats.py
  * Failed parses will only be excluded for graph banks in 'alpino' format.
  
""" + epilog

parser = DaesoArgParser(description=__doc__, epilog=epilog)

parser.add_argument("pattern", help="*quoted* pattern for graph bank files")

parser.add_argument("-F",
                    "--format",
                    metavar="STRING",
                    default="alpino",
                    dest="format",
                    help="treebank format (defaults to 'alpino')")

parser.add_argument("-a",
                    "--with-all",
                    action="store_true",
                    dest="with_all",
                    help="include all, sets options -efp")
Esempio n. 17
0
# - handle encoding errors
# - reset cache


__author__ = 'Erwin Marsi <*****@*****.**>'
__version__ = '0.9'

import sys
import socket

from xmlrpclib import ServerProxy, Fault
from daeso.utils.cli import DaesoArgParser
from daeso_nl.alpino.server import DEFAULT_HOST, DEFAULT_PORT


parser = DaesoArgParser(description=__doc__, version=__version__)



parser.add_argument(
    "-H", "--host", 
    default="%s:%d" % (DEFAULT_HOST, DEFAULT_PORT),
    metavar="HOST[:PORT]",
    help="name or IP address of host (default is '%s') "
    "optionally followed by a port number "
    "(default is %d)" % (DEFAULT_HOST, DEFAULT_PORT))

parser.add_argument(
    "-i", "--input-encoding", 
    default="utf8", 
    metavar="utf8|latin1|ascii|...",
Esempio n. 18
0
copy a parallel graph corpus

Automatically takes care of the internal references to graph bank files.
Usage is similar to the "cp" shell command.
"""

__authors__ = 'Erwin Marsi <*****@*****.**>'


from os.path import isdir, basename, join, samefile, exists
from sys import stderr

from daeso.utils.cli import DaesoArgParser
from daeso.pgc.corpus import ParallelGraphCorpus, LOAD_NONE

parser = DaesoArgParser(description=__doc__)
                        

parser.add_argument(
    "source",
    nargs="+",
    help="source parallel graph corpus file"
    )

parser.add_argument(
    "target",
    help="either a target parallel graph corpus file or "
    "a target directory"
    )

parser.add_argument(
Esempio n. 19
0
        zip_arch.write(corpus_filename, arch_filename)

        corpus = ParallelGraphCorpus(inf=corpus_filename,
                                     graph_loading=LOAD_NONE)

        for gb in corpus._graphbanks():
            gb_filename = gb.get_file_path()
            # add graphbank files to archive
            arch_filename = os.path.join(arch_dir,
                                         os.path.basename(gb_filename))
            zip_arch.write(gb_filename, arch_filename)

    zip_arch.close()


# Command line interface: the zip archive to write, followed by one or
# more corpus filenames or quoted filename patterns.
parser = DaesoArgParser(description=__doc__, epilog=epilog)

parser.add_argument(
    "zip_file",
    metavar="ZIP_FILE",
    help="filename of zip archive",
)

parser.add_argument(
    "pgc_files",
    nargs="+",
    metavar="CORPUS_FILE",
    help=("parallel graph corpus filename, or quoted file name pattern "
          "for parallel graph corpora"),
)

args = parser.parse_args()

pgc_zip(args.zip_file, args.pgc_files)
Esempio n. 20
0
from glob import glob
from sys import exit

from daeso.utils.cli import DaesoArgParser, epilog
from daeso.pgc.pgcstats import pgc_stats

# Script-specific examples and remarks, placed in front of the shared
# epilog imported from daeso.utils.cli.
epilog = """
Examples:
  $ pgc_stats.py -efpu "*.pgc" 
  
Remarks:
  * Failed parses will only be excluded for graph banks in 'alpino' format.

""" + epilog

parser = DaesoArgParser(description=__doc__, epilog=epilog)

parser.add_argument("pattern",
                    help="*quoted* pattern for parallel graph corpus files")

parser.add_argument("-a",
                    "--with-all",
                    action="store_true",
                    dest="with_all",
                    help="include all, sets options -efpru")

#parser.add_argument("-c", "--csv", action="store_true",
#dest="csv",
#help="output in comma separated values")

parser.add_argument("-e",
Esempio n. 21
0
__authors__ = 'Erwin Marsi <*****@*****.**>'

import os
import sys

from daeso.utils.cli import DaesoArgParser
from daeso.pgc.corpus import ParallelGraphCorpus


def log(s):
    """
    Emit a diagnostic message on stderr in verbose mode.

    The message s is preceded by "***". Whether anything is written is
    decided by the verbose attribute of the global args object.
    """
    verbose = args.verbose

    if verbose:
        print >>sys.stderr, "***", s


parser = DaesoArgParser(description=__doc__)

parser.add_argument("filename", metavar="FILE", help="parallel graph corpus")

parser.add_argument("-f",
                    "--format",
                    action="store_true",
                    help="output indented XML")

parser.add_argument("-p",
                    "--parts",
                    default=2,
                    type=int,
                    metavar="N",
                    help="number of parts")
Esempio n. 22
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
simple server providing access to the Alpino parser for Dutch through XML-RPC
"""

__author__ = 'Erwin Marsi <*****@*****.**>'

__version__ = "0.9"

from sys import exit

from daeso.utils.cli import DaesoArgParser
from daeso_nl.alpino.server import start_server, DEFAULT_HOST, DEFAULT_PORT

parser = DaesoArgParser(description=__doc__,
                        version="%(prog)s version " + __version__)

parser.add_argument("-H",
                    "--host",
                    default="%s:%d" % (DEFAULT_HOST, DEFAULT_PORT),
                    metavar="HOST[:PORT]",
                    help="name or IP address of host (default is '%s') "
                    "optionally followed by a port number "
                    "(default is %d)" % (DEFAULT_HOST, DEFAULT_PORT))

parser.add_argument("-c",
                    "--command",
                    help="command line to start Alpino parser")

parser.add_argument("-o",
                    "--out_dir",
Esempio n. 23
0
        zip_arch.write(corpus_filename, arch_filename)
        
        corpus = ParallelGraphCorpus(inf=corpus_filename,
                                     graph_loading=LOAD_NONE)

        for gb in corpus._graphbanks():
            gb_filename = gb.get_file_path()
            # add graphbank files to archive
            arch_filename = os.path.join( arch_dir,
                                          os.path.basename(gb_filename) )
            zip_arch.write(gb_filename, arch_filename)
            
    zip_arch.close()
    

parser = DaesoArgParser(description=__doc__,
                        epilog=epilog)

parser.add_argument(
    "zip_file", 
    metavar="ZIP_FILE",
    help="filename of zip archive")

parser.add_argument(
    "pgc_files", 
    nargs="+", 
    metavar="CORPUS_FILE",
    help="parallel graph corpus filename, "
    "or quoted file name pattern for parallel graph corpora")

args = parser.parse_args()
Esempio n. 24
0
"""
evaluation of text alignment in parallel text corpora

reports precision, recall and F-score on alignment for a certain tag
for one or more pairs of true and predicted parallel text corpora
"""

from daeso.utils.cli import DaesoArgParser
from daeso.ptc.evaluate import eval_alignment


__authors__ = "Erwin Marsi <*****@*****.**>"


parser = DaesoArgParser(description=__doc__)


# Both corpus lists are mandatory: without them there is nothing to
# evaluate, and a missing option would otherwise surface as a TypeError
# on len(None) further down.
parser.add_argument(
    "-t", "--true_corpora",
    nargs="+",
    required=True,
    help="parallel text corpus containing true alignments")

parser.add_argument(
    "-p", "--pred_corpora",
    nargs="+",
    required=True,
    help="parallel text corpus containing predicted alignments")

parser.add_argument(
    "--tag",
    default="s",
    help='only consider alignments involving this tag (default is "s")')


args = parser.parse_args()


# True and predicted corpora are paired up positionally, so the two lists
# must be equally long. Report a mismatch as a usage error rather than
# with an assert, which is stripped when Python runs with -O.
if len(args.true_corpora) != len(args.pred_corpora):
    parser.error("the number of true corpora (%d) differs from the number "
                 "of predicted corpora (%d)" % (len(args.true_corpora),
                                                len(args.pred_corpora)))

eval_alignment(zip(args.true_corpora, args.pred_corpora), args.tag)
Esempio n. 25
0
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
parallel text corpus diff

reports difference in text alignments between two parallel text corpora
"""

from daeso.utils.cli import DaesoArgParser
from daeso.ptc.diff import print_diff

__authors__ = 'Erwin Marsi <*****@*****.**>'

parser = DaesoArgParser(description=__doc__)

# The two corpora to compare: the reference ("true") alignments and the
# predicted ones.
parser.add_argument("true_corpus",
                    help="parallel text corpus containing true alignments")

parser.add_argument(
    "pred_corpus", help="parallel text corpus containing predicted alignments")

# Restricts the comparison to alignments involving a single tag.
parser.add_argument(
    "-t",
    "--tag",
    default="s",
    help='only consider alignments involving this tag (default is "s")')

parser.add_argument("-e",
                    "--encoding",
Esempio n. 26
0
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
dump instance base

Converts a Timbl instance file to a Timbl instance base file given a graph
aligner configuration. Timbl options and filenames are taken from the
configuration file (cf. the "timbl_inst_fname" and "timbl_ib_fname"
attributes).
"""

__authors__ = 'Erwin Marsi <*****@*****.**>'
__version__ = "0.9"

import imp

from daeso.utils.cli import DaesoArgParser
from daeso_nl.ga.setup import dump_inst_base

# Command line interface: a single positional argument naming the
# configuration file.
parser = DaesoArgParser(description=__doc__, version=__version__)

parser.add_argument(
    "config",
    metavar="FILE",
    help="configuration file to set up a corpus aligner",
)

args = parser.parse_args()

# The configuration file is Python source: load it as a module named
# "config" and pass that module to dump_inst_base.
config = imp.load_source("config", args.config)
dump_inst_base(config)
Esempio n. 27
0
"""

__author__ = 'Erwin Marsi <*****@*****.**>'
__version__ = "0.9"

import imp
import sys

from daeso.utils.cli import DaesoArgParser

from daeso_nl.ga.setup import set_up_align_server
from daeso_nl.ga.server import (start_server, DEFAULT_HOST, DEFAULT_PORT)

#-------------------------------------------------------------------------------

parser = DaesoArgParser(description=__doc__, version=__version__)

parser.add_argument("config",
                    metavar="FILE",
                    help="configuration file to set up a graph align server")

parser.add_argument(
    "-H",
    "--host",
    default="%s:%d" % (DEFAULT_HOST, DEFAULT_PORT),
    metavar="HOST[:PORT]",
    help="name or IP address of host (default is '%s') " % DEFAULT_HOST +
    "optionally followed by a port number (default is %d)" % DEFAULT_PORT)

parser.add_argument("-l", "--log", action="store_true", help="log requests")
Esempio n. 28
0
            if verbose:
                banks = graph_pair.get_banks()
                
                columns = [
                    banks.source.get_file_path(),
                    banks.target.get_file_path(),
                    graphs.source.id,
                    graphs.target.id,
                    nodes.source,
                    nodes.target
                    ] + columns
                
            print delimiter.join(columns).encode("utf-8")
            
            
parser = DaesoArgParser(description=__doc__.strip())

parser.add_argument(
    "corpus", 
    nargs="+",
    metavar="FILE",
    help="parallel graph corpus file"
    )

parser.add_argument(
    "-d", "--delimiter",
    default="\t",
    help="column delimiter string (default is tab character '\\t')"
    )

parser.add_argument(
Esempio n. 29
0
from glob import glob

from daeso.utils.cli import DaesoArgParser, epilog
from daeso.gb.gbstats import gb_stats


epilog = """
Remarks:
  * Many columns will have zero values, because a parallel graph corpus 
    is required to get alignment information; see pgc_stats.py
  * Failed parses will only be excluded for graph banks in 'alpino' format.
  
""" + epilog  


parser = DaesoArgParser(description=__doc__, epilog=epilog)

parser.add_argument(
    "pattern", 
    help="*quoted* pattern for graph bank files")

parser.add_argument(
    "-F", "--format", 
    metavar="STRING",
    default="alpino", 
    dest="format", 
    help="treebank format (defaults to 'alpino')")

parser.add_argument(
    "-a", "--with-all", 
    action="store_true", 
Esempio n. 30
0
                graphs.target.get_node_token_string(nodes.target)
            ]

            if verbose:
                banks = graph_pair.get_banks()

                columns = [
                    banks.source.get_file_path(),
                    banks.target.get_file_path(), graphs.source.id,
                    graphs.target.id, nodes.source, nodes.target
                ] + columns

            print delimiter.join(columns).encode("utf-8")


parser = DaesoArgParser(description=__doc__.strip())

parser.add_argument("corpus",
                    nargs="+",
                    metavar="FILE",
                    help="parallel graph corpus file")

parser.add_argument(
    "-d",
    "--delimiter",
    default="\t",
    help="column delimiter string (default is tab character '\\t')")

parser.add_argument(
    "-V",
    "--verbose",
Esempio n. 31
0
# The Algraeph program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


__author__ = "Erwin Marsi <*****@*****.**>"


from daeso.utils.cli import DaesoArgParser
from graeph.release import version, description

parser = DaesoArgParser(description=description.strip(), 
                        version=version)

parser.add_argument(
    "corpus_file",
    metavar="FILE",
    nargs="?", 
    help="parallel graph corpus file")

parser.add_argument(
    "-d", "--dot_exec",
    metavar="FILE", 
    help='"dot" graph drawing program')

parser.add_argument(
    "-r", "--redirect",
    action='store_true',
Esempio n. 32
0
"""
align graphs in parallel graph corpus
"""

__authors__ = 'Erwin Marsi <*****@*****.**>'
__version__ = "0.9"

import imp

from daeso.utils.cli import DaesoArgParser
from daeso.utils.opsys import multiglob
from daeso.pgc.corpus import ParallelGraphCorpus

from daeso_nl.ga.setup import set_up_corpus_aligner

parser = DaesoArgParser(description=__doc__, version=__version__)

parser.add_argument("pgc_files",
                    nargs="+",
                    metavar="FILE",
                    help="parallel graph corpus file")

parser.add_argument("-c",
                    "--config",
                    metavar="FILE",
                    help="configuration file to set up a corpus aligner")

parser.add_argument("-x",
                    "--clear",
                    action="store_true",
                    help="remove all existing alignments")
Esempio n. 33
0
"""
parallel graph corpus diff

reports difference in node alignments between two parallel graph corpora
"""

from daeso.utils.cli import DaesoArgParser
from daeso.pgc.corpus import ParallelGraphCorpus
from daeso.pgc.diff import pgc_diff


__authors__ = 'Erwin Marsi <*****@*****.**>'

    

parser = DaesoArgParser(description=__doc__)


parser.add_argument(
    "corpus1", 
    help="first parallel graph corpus")

parser.add_argument(
    "corpus2", 
    help="second parallel graph corpus")


parser.add_argument(
    "-1", "--first_annotator", 
    metavar="NAME", 
    default="First annotator", 
Esempio n. 34
0
# Base directory of the corpora, read from the environment; falls back to
# an empty string when DAESO_CORPUS is not set.
corpus_dir = getenv("DAESO_CORPUS", "")

if not corpus_dir:
    # Terminate the warning with a newline so that whatever is written to
    # stderr next does not end up glued to it on the same line.
    stderr.write("Warning: environment variable DAESO_CORPUS not found!\n")


def expand_globs(corpus_dir, globs):
    """
    Expand glob patterns into one flat list of files.

    relglob is called once per pattern, with corpus_dir as its first
    argument; whatever it yields is appended to the result in the order
    the patterns were given. Returns a new list (empty if globs is empty).
    """
    matches = []
    for glob_pattern in globs:
        matches += relglob(corpus_dir, glob_pattern)
    return matches


parser = DaesoArgParser(description=__doc__, version=__version__)


# One or more glob patterns; the help text refers to the option exactly as
# it is spelled on the command line (--corpus-dir, with a hyphen).
parser.add_argument(
    "pgc_glob", 
    nargs="+",
    help=( "glob (i.e. filename pattern) for parallel graph corpora, "
           "interpreted relative to the corpus base directory "
           "(cf. --corpus-dir)"))

# The default comes from the module-level corpus_dir value, which is also
# echoed in the help text.
parser.add_argument(
    "-c", "--corpus-dir", 
    default=corpus_dir,
    help="pgc filenames are interpreted relative to this base directory "
    "(default is '" + corpus_dir + "')")
Esempio n. 35
0
use pgc_diff.py
"""

# TODO:
# - check for at least two input files


__authors__ = 'Erwin Marsi <*****@*****.**>'



from daeso.utils.cli import DaesoArgParser
from daeso.pgc.agreement import run_eval


parser = DaesoArgParser(description=__doc__)

parser.add_argument(
    "corpus_fns",
    nargs="+",
    metavar="corpus",
    help="parallel graph corpus file (at least two are required)")

parser.add_argument(
    "-a", "--annotator",
    dest="annotators",
    metavar="CC", 
    action="append",
    help="initials of the annotator of a parallel graph corpus files "
    "(default is 'A1', 'A2', etc.) Repeat this option as many times as " 
    "there are corpus files")
Esempio n. 36
0
# TODO:
# - handle encoding errors
# - reset cache

__author__ = 'Erwin Marsi <*****@*****.**>'
__version__ = '0.9'

import sys
import socket

from xmlrpclib import ServerProxy, Fault
from daeso.utils.cli import DaesoArgParser
from daeso_nl.alpino.server import DEFAULT_HOST, DEFAULT_PORT
from daeso_nl.alpino.client import alpino_client

parser = DaesoArgParser(description=__doc__, version=__version__)

parser.add_argument("-H",
                    "--host",
                    default="%s:%d" % (DEFAULT_HOST, DEFAULT_PORT),
                    metavar="HOST[:PORT]",
                    help="name or IP address of host (default is '%s') "
                    "optionally followed by a port number "
                    "(default is %d)" % (DEFAULT_HOST, DEFAULT_PORT))

parser.add_argument("-i",
                    "--input-encoding",
                    default="utf8",
                    metavar="utf8|latin1|ascii|...",
                    help="character encoding of input (default is utf8)")
Esempio n. 37
0

import imp
import sys

from daeso.utils.cli import DaesoArgParser

from daeso_nl.ga.setup import set_up_align_server
from daeso_nl.ga.server import ( 
    start_server, 
    DEFAULT_HOST,
    DEFAULT_PORT )

#-------------------------------------------------------------------------------

parser = DaesoArgParser(description=__doc__, version=__version__)

parser.add_argument(
    "config",
    metavar="FILE",
    help="configuration file to set up a graph align server")  

parser.add_argument(
    "-H", "--host", 
    default="%s:%d" % (DEFAULT_HOST, DEFAULT_PORT),
    metavar="HOST[:PORT]",
    help="name or IP address of host (default is '%s') " % DEFAULT_HOST +
    "optionally followed by a port number (default is %d)" % DEFAULT_PORT)

parser.add_argument(
    "-l", "--log", 
Esempio n. 38
0
parallel graph corpora, and generates an analysis in terms of a number of
statistics.

For a more detailed analysis of the differences between a pair of annotations,
use pgc_diff.py
"""

# TODO:
# - check for at least two input files

__authors__ = 'Erwin Marsi <*****@*****.**>'

from daeso.utils.cli import DaesoArgParser
from daeso.pgc.agreement import run_eval

parser = DaesoArgParser(description=__doc__)

parser.add_argument(
    "corpus_fns",
    nargs="+",
    metavar="corpus",
    help="parallel graph corpus file (at least two are required)")

parser.add_argument(
    "-a",
    "--annotator",
    dest="annotators",
    metavar="CC",
    action="append",
    help="initials of the annotator of a parallel graph corpus files "
    "(default is 'A1', 'A2', etc.) Repeat this option as many times as "
Esempio n. 39
0
from daeso.utils.cli import DaesoArgParser, epilog
from daeso.pgc.pgcstats import pgc_stats 


# Script-specific examples and remarks, placed in front of the shared
# epilog imported from daeso.utils.cli.
epilog = """
Examples:
  $ pgc_stats.py -efpu "*.pgc" 
  
Remarks:
  * Failed parses will only be excluded for graph banks in 'alpino' format.

""" + epilog


parser = DaesoArgParser(description=__doc__,
                        epilog=epilog)

parser.add_argument(
    "pattern",
    help="*quoted* pattern for parallel graph corpus files")

parser.add_argument(
    "-a", "--with-all", 
    action="store_true", 
    dest="with_all", 
    help="include all, sets options -efpru")

#parser.add_argument("-c", "--csv", action="store_true", 
                  #dest="csv", 
                  #help="output in comma separated values")
Esempio n. 40
0
"""
evaluation of text alignment in parallel text corpora

reports precision, recall and F-score on alignment for a certain tag
for one or more pairs of true and predicted parallel text corpora
"""

from daeso.utils.cli import DaesoArgParser
from daeso.ptc.evaluate import eval_alignment


__authors__ = 'Erwin Marsi <*****@*****.**>'

    

parser = DaesoArgParser(description=__doc__)


parser.add_argument(
    "-t", "--true_corpora", 
    nargs = "+",
    help="parallel text corpus containing true alignments")

parser.add_argument(
    "-p", "--pred_corpora", 
    nargs = "+",
    help="parallel text corpus containing predicted alignments")

parser.add_argument(
    "--tag", 
    default="s", 
Esempio n. 41
0
Meta-data of everything but the first corpus is discarded!
"""

# TODO:
# - silence warning about meta-data

__authors__ = 'Erwin Marsi <*****@*****.**>'

import sys

from daeso.utils.cli import DaesoArgParser
from daeso.utils.opsys import multiglob
from daeso.pgc.corpus import ParallelGraphCorpus

parser = DaesoArgParser(description=__doc__)

parser.add_argument("file",
                    nargs="+",
                    metavar="FILE",
                    help="parallel graph corpus filename, "
                    "or quoted file name pattern for parallel graph corpora")

parser.add_argument("-f",
                    "--format",
                    action="store_true",
                    help="output indented XML")

parser.add_argument("-V",
                    "--verbose",
                    action="store_true",