Example 1
    def __init__(self, foldtitle=True, ignorecase=False, accelchars="", termlength=3,
                 sourcelanguage="en", invert=False, stopfile=None):
        self.foldtitle = foldtitle
        self.ignorecase = ignorecase
        self.accelchars = accelchars
        self.termlength = termlength

        self.sourcelanguage = sourcelanguage
        self.invert = invert

        self.stopwords = {}
        self.stoprelist = []
        self.stopfoldtitle = True
        self.stopignorecase = False

        if stopfile is None:
            try:
                stopfile = file_discovery.get_abs_data_filename('stoplist-%s' % self.sourcelanguage)
            except Exception:
                # No stoplist ships for this language; continue without one
                pass
        self.stopfile = stopfile
        self.parse_stopword_file()

        # handles c-format and python-format
        self.formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
        # handles XML/HTML elements (<foo>text</foo> => text)
        self.xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
        # handles XML/HTML entities (&#32; &#x20; &amp; &my_entity;)
        self.xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.:-]*);",
                                    flags=re.UNICODE | re.IGNORECASE)

        self.units = 0
        self.glossary = {}
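The constructor above wires itself to a bundled stopword list discovered via file_discovery. A minimal instantiation sketch, assuming the class is TerminologyExtractor from translate.tools.poterminology (as the option parser in Example 3 suggests):

from translate.tools.poterminology import TerminologyExtractor

extractor = TerminologyExtractor(sourcelanguage="en", termlength=2)
print(extractor.stopfile)  # absolute path of the auto-discovered 'stoplist-en'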
Example 2
def find_langmodel_files():
    from translate.misc.file_discovery import get_abs_data_filename
    lmdir = path.abspath(get_abs_data_filename('langmodels'))
    if not path.isdir(lmdir):
        return []

    return [(path.join(TARGET_DATA_DIR, 'langmodels'), glob(path.join(lmdir, '*.*')))]
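find_langmodel_files() is shaped for a setuptools data_files entry: it returns a single (target-directory, [files]) pair, or an empty list when no language models are present. A hedged sketch of how it would plug into setup() (the package metadata here is purely illustrative):

from setuptools import setup

setup(
    name="example-package",             # hypothetical metadata
    version="0.1",
    data_files=find_langmodel_files(),  # [] or [(<TARGET_DATA_DIR>/langmodels, [...])]
)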
Example 3
def main():
    formats = {"po": ("po", None), "pot": ("pot", None), None: ("po", None)}
    parser = TerminologyOptionParser(formats)

    parser.add_option(
        "-u", "--update", type="string", dest="update",
        metavar="UPDATEFILE", help="update terminology in UPDATEFILE")

    parser.add_option("-S", "--stopword-list", type="string", metavar="STOPFILE", dest="stopfile",
                      help="read stopword (term exclusion) list from STOPFILE (default %s)" %
                      file_discovery.get_abs_data_filename('stoplist-en'))

    parser.set_defaults(foldtitle=True, ignorecase=False)
    parser.add_option(
        "-F", "--fold-titlecase", callback=fold_case_option,
        action="callback", help="fold \"Title Case\" to lowercase (default)")
    parser.add_option(
        "-C", "--preserve-case", callback=preserve_case_option,
        action="callback", help="preserve all uppercase/lowercase")
    parser.add_option(
        "-I", "--ignore-case", dest="ignorecase",
        action="store_true", help="make all terms lowercase")

    parser.add_option(
        "", "--accelerator", dest="accelchars", default="",
        metavar="ACCELERATORS", help="ignore the given accelerator characters when matching")

    parser.add_option(
        "-t", "--term-words", type="int", dest="termlength", default=3,
        help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
    parser.add_option(
        "", "--nonstop-needed", type="int", dest="nonstopmin", default=1,
        help="omit terms with less than MIN nonstop words (default 1)", metavar="MIN")
    parser.add_option(
        "", "--inputs-needed", type="int", dest="inputmin",
        help="omit terms appearing in less than MIN input files (default 2, or 1 if only one input file)", metavar="MIN")
    parser.add_option(
        "", "--fullmsg-needed", type="int", dest="fullmsgmin", default=1,
        help="omit full message terms appearing in less than MIN different messages (default 1)", metavar="MIN")
    parser.add_option(
        "", "--substr-needed", type="int", dest="substrmin", default=2,
        help="omit substring-only terms appearing in less than MIN different messages (default 2)", metavar="MIN")
    parser.add_option(
        "", "--locs-needed", type="int", dest="locmin", default=2,
        help="omit terms appearing in less than MIN different original source files (default 2)", metavar="MIN")

    parser.add_option(
        "", "--sort", dest="sortorders", action="append",
        type="choice", choices=TerminologyExtractor.sortorders_default, metavar="ORDER",
        help="output sort order(s): %s (may repeat option, default is all in above order)" % ', '.join(TerminologyExtractor.sortorders_default))

    parser.add_option(
        "", "--source-language", dest="sourcelanguage", default="en",
        help="the source language code (default 'en')", metavar="LANG")
    parser.add_option(
        "-v", "--invert", dest="invert",
        action="store_true", default=False, help="invert the source and target languages for terminology")
    parser.set_usage()
    parser.description = __doc__
    parser.run()
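main() references fold_case_option and preserve_case_option without defining them. A plausible sketch of these optparse callbacks, inferred from the "-F"/"-C" help text and the foldtitle/ignorecase defaults (an assumption, not copied from the library):

def fold_case_option(option, opt_str, value, parser):
    # "-F": fold "Title Case" terms to lowercase, leave other casing alone
    parser.values.foldtitle = True
    parser.values.ignorecase = False


def preserve_case_option(option, opt_str, value, parser):
    # "-C": keep every term exactly as cased in the source
    parser.values.foldtitle = parser.values.ignorecase = False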
Example 4
def get_abs_data_filename(path_parts, basedirs=None):
    """Get the absolute path to the given file- or directory name in Virtaal's
        data directory.

        @type  path_parts: list
        @param path_parts: The path parts that can be joined by os.path.join().
        """
    if basedirs is None:
        basedirs = []
    # Build a new list so a caller-supplied basedirs is not mutated
    basedirs = basedirs + [
        os.path.join(os.path.dirname(unicode(__file__, sys.getfilesystemencoding())), os.path.pardir),
    ]
    return file_discovery.get_abs_data_filename(path_parts, basedirs=basedirs)
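The wrapper simply prepends the package's parent directory to the search path before delegating to translate's file_discovery. A hedged usage sketch (the resource name is illustrative, not a guaranteed Virtaal file):

icon_path = get_abs_data_filename(["icons", "virtaal.png"])  # hypothetical resource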
Example 5
class LanguageIdentifier(object):
    MODEL_DIR = get_abs_data_filename('langmodels')
    """The directory containing the ngram language model files."""
    CONF_FILE = 'fpdb.conf'
    """
    The name of the file that contains language name-code pairs
    (relative to ``MODEL_DIR``).
    """
    def __init__(self, model_dir=None, conf_file=None):
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            raise ValueError('Directory does not exist: %s' % (model_dir))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file))

        self._lang_codes = {}
        self._load_config(conf_file)
        self.ngram = NGram(model_dir)

    def _load_config(self, conf_file):
        """Load the mapping of language names to language codes as given in the
            configuration file."""
        # Read the whole config once; 'with' ensures the file is closed
        with open(conf_file) as fp:
            lines = fp.read().splitlines()
        for line in lines:
            parts = line.split()
            if not parts or line.startswith('#'):
                continue  # Skip comments and blank lines
            lname, lcode = parts[0], parts[1]

            # Strip any leading directory components from lname
            lname = path.split(lname)[-1]
            if extsep in lname:
                # Remove the file extension if present
                lname = lname[:lname.rindex(extsep)]

            # Strip a trailing '-utf8' suffix and any leftover '-'/'_' separator
            if lcode.endswith('-utf8'):
                lcode = lcode[:-len('-utf8')]
            if lcode.endswith('-') or lcode.endswith('_'):
                lcode = lcode[:-1]

            self._lang_codes[lname] = lcode

    def identify_lang(self, text):
        """Identify the language of the text in the given string."""
        if not text:
            return None
        result = self.ngram.classify(text)
        if result in self._lang_codes:
            result = self._lang_codes[result]
        return result

    def identify_source_lang(self, instore):
        """Identify the source language of the given translation store or
            units.

            :type  instore: ``TranslationStore`` or list or tuple of
                ``TranslationUnit``s.
            :param instore: The translation store to extract source text from.
            :returns: The identified language's code or ``None`` if the language
                could not be identified."""
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None

        text = u' '.join(unit.source for unit in instore[:50]
                         if unit.istranslatable() and unit.source)
        if not text:
            return None
        return self.identify_lang(text)

    def identify_target_lang(self, instore):
        """Identify the target language of the given translation store or
            units.

            :type  instore: ``TranslationStore`` or list or tuple of
                ``TranslationUnit``s.
            :param instore: The translation store to extract target text from.
            :returns: The identified language's code or ``None`` if the language
                could not be identified."""
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None

        text = u' '.join(unit.target for unit in instore[:200]
                         if unit.istranslatable() and unit.target)
        if not text:
            return None
        return self.identify_lang(text)
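A short usage sketch for the class above, assuming the ngram models and fpdb.conf ship in the library's langmodels data directory as the code implies:

identifier = LanguageIdentifier()  # picks up MODEL_DIR and CONF_FILE defaults
print(identifier.identify_lang(u"Die kat sit op die mat"))  # a language code, e.g. 'af'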
Example 6
        for fname in glob.glob(path.normcase(folder)):
            lang = path.split(fname)[-1][:-size]
            n = _NGram()

            file = open(fname, 'r')
            for line in file.readlines():
                n.addText(line)
            file.close()

            n.normalise()
            self.ngrams[lang] = n

    def save(self, folder, ext='.lm'):
        for lang in self.ngrams.keys():
            fname = path.join(folder, lang + ext)
            file = open(fname, 'w')
            for v, k in self.ngrams[lang].sorted_by_score():
                file.write("%s\t %d\n" % (k, v))
            file.close()

if __name__ == '__main__':
    import sys

    # Should you want to generate your own .lm files
    #conf = Generate('/tmp')
    #conf.save('/tmp')

    text = sys.stdin.readline()
    from translate.misc.file_discovery import get_abs_data_filename
    l = NGram(get_abs_data_filename('langmodels'))
    print l.classify(text)
Example 7
        for fname in glob.glob(path.normcase(folder)):
            lang = path.split(fname)[-1][:-size]
            n = _NGram()

            with open(fname, encoding="utf-8") as fp:
                for line in fp:
                    n.addText(line)

            n.normalise()
            self.ngrams[lang] = n

    def save(self, folder, ext=".lm"):
        for lang in self.ngrams.keys():
            fname = path.join(folder, lang + ext)
            with open(fname, mode="w", encoding="utf-8") as fp:
                for v, k in self.ngrams[lang].sorted_by_score():
                    fp.write("%s\t %d\n" % (k, v))


if __name__ == "__main__":
    import sys

    # Should you want to generate your own .lm files
    # conf = Generate('/tmp')
    # conf.save('/tmp')

    text = sys.stdin.readline()
    from translate.misc.file_discovery import get_abs_data_filename

    lm = NGram(get_abs_data_filename("langmodels"))
    print(lm.classify(text))
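A hedged round trip over the Python 3 version above, assuming the NGram class lives in translate.lang.ngram (an assumed module path) and that its constructor loads every '*.lm' model in the given folder:

import tempfile

from translate.lang.ngram import NGram  # assumed module path
from translate.misc.file_discovery import get_abs_data_filename

lm = NGram(get_abs_data_filename("langmodels"))
outdir = tempfile.mkdtemp()
lm.save(outdir)  # re-serialises each loaded model as '<lang>.lm'
print(lm.classify("the quick brown fox jumps over the lazy dog"))  # best-matching model name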