コード例 #1
0
def _analyze(db, fin, fout, backoff, cache):
    """Run morphological analysis on every whitespace token read from *fin*.

    Each token's serialized analyses are written to *fout* followed by a
    blank line. When *cache* is truthy the analyzer keeps an internal cache
    of 1024 entries.
    """
    analyzer = (Analyzer(db, backoff, cache_size=1024) if cache
                else Analyzer(db, backoff))

    line = force_unicode(fin.readline())

    while line:
        # Defensive guard against an empty read (EOF also ends the loop).
        if not line:
            line = force_unicode(fin.readline())
            continue

        for token in _tokenize(line.strip()):
            serialized = _serialize_analyses(fout, token,
                                             analyzer.analyze(token),
                                             db.order)

            # Python 2 output streams expect encoded bytes.
            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))

            fout.write('\n\n')

        line = force_unicode(fin.readline())
コード例 #2
0
    def clean(self,
              ar_raw_text: str,
              normalize_teh_marbuta: bool = False,
              normalize_alef: bool = True,
              is_punct: bool = False,
              is_de_diacritize: bool = True,
              is_to_lower: bool = True) -> str:
        """Clean and normalize a raw Arabic text string.

        Args:
            ar_raw_text: Raw input text.
            normalize_teh_marbuta: Normalize teh marbuta during normalization.
            normalize_alef: Normalize alef variants during normalization.
            is_punct: Forwarded to the secondary cleaner selection.
            is_de_diacritize: Remove diacritics from the result.
            is_to_lower: Lowercase the text before mapping.

        Returns:
            The cleaned, normalized text with collapsed whitespace.
        """
        clean_text = ar_raw_text.strip("\n").strip()

        if is_to_lower:
            clean_text = clean_text.lower()

        clean_text = force_unicode(clean_text)
        if six.PY3:
            clean_text = self.clean_mapper.map_string(clean_text)
        else:
            # Python 2: the mapper is fed an encoded string.
            clean_text = force_encoding(clean_text)
            clean_text = self.clean_mapper.map_string(clean_text)

        clean_text = self._get_sec_cleaner(
            is_punct=is_punct).clean_text(clean_text)

        clean_text = self._normalize(
            arb_text=clean_text,
            normalize_teh_marbuta=normalize_teh_marbuta,
            normalize_alef=normalize_alef)
        if is_de_diacritize:
            clean_text = self._de_diacritize(arb_text=clean_text)

        # Collapse every run of consecutive spaces down to a single space.
        # The previous fixed 5-pass replace only handled runs of up to
        # 2**5 spaces; this loop handles runs of any length.
        while "  " in clean_text:
            clean_text = clean_text.replace("  ", " ")

        return clean_text.strip()
コード例 #3
0
def _arclean(mapper, fin, fout):
    """Map every line of *fin* through *mapper* and write it to *fout*."""
    for raw_line in fin:
        cleaned = mapper.map_string(force_unicode(raw_line))
        # Python 2 output streams expect encoded bytes.
        fout.write(cleaned if six.PY3 else force_encoding(cleaned))
    fout.flush()
コード例 #4
0
def _analyze(db, fin, fout, backoff, cache):
    """Analyze whitespace tokens from *fin*, writing serialized results to
    *fout*.

    When *cache* is truthy, serialized output is memoized per token so a
    repeated token is never re-analyzed.
    """
    analyzer = CalimaStarAnalyzer(db, backoff)
    memo = {} if cache else None

    line = force_unicode(fin.readline())

    while line:
        # Defensive guard against an empty read (EOF also ends the loop).
        if not line:
            line = force_unicode(fin.readline())
            continue

        for token in _tokenize(line.strip()):
            if cache and token in memo:
                serialized = memo[token]
            else:
                serialized = _serialize_analyses(fout, token,
                                                 analyzer.analyze(token),
                                                 db.order)
                if cache:
                    memo[token] = serialized

            # Python 2 output streams expect encoded bytes.
            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))

            fout.write('\n\n')

        line = force_unicode(fin.readline())
コード例 #5
0
def _reinflect(db, fin, fout):
    """Reinflect words read from *fin*, writing serialized analyses to *fout*.

    Each non-empty input line holds a word and a feature specification.
    Invalid lines and morphology errors are reported on stderr (with the
    line number when not reading from stdin) without stopping the run.
    """
    reinflector = Reinflector(db)

    line = force_unicode(fin.readline())
    line_num = 1

    while line:
        line = line.strip()

        if not line:
            line = force_unicode(fin.readline())
            line_num += 1
            continue

        parsed = _parse_reinflector_line(line)

        if parsed is None:
            if fin is sys.stdin:
                sys.stderr.write('Error: Invalid input line.\n')
            else:
                sys.stderr.write(
                    'Error: Invalid input line. [{}]\n'.format(line_num))
        else:
            word, feats = parsed[0], parsed[1]

            try:
                serialized = _serialize_analyses(
                    fout, word, reinflector.reinflect(word, feats), db.order)

                # Python 2 output streams expect encoded bytes.
                if six.PY3:
                    fout.write(serialized)
                else:
                    fout.write(force_encoding(serialized))

                fout.write('\n\n')
            except MorphologyError as error:
                # Raised by the analyzer, generator, or reinflector.
                if fin is sys.stdin:
                    sys.stderr.write('Error: {}.\n'.format(error.msg))
                else:
                    sys.stderr.write('Error: {}. [{}]\n'.format(
                        error.msg, line_num))

        line = force_unicode(fin.readline())
        line_num += 1
コード例 #6
0
def _analyze(db, fin, fout, backoff, cache, num_disambig=None):
    """Analyze tokens from *fin*, optionally keeping only the top analyses.

    Args:
        db: Morphology database; its ``order`` is used for serialization.
        fin: Input stream read line by line.
        fout: Output stream; one blank-line-separated record per token.
        backoff: Backoff mode passed to the analyzer.
        cache: When truthy, the analyzer caches up to 1024 entries.
        num_disambig: When not None, disambiguate each token and keep that
            many top analyses instead of all analyses.
    """
    if cache:
        analyzer = CalimaStarAnalyzer(db, backoff, cache_size=1024)
    else:
        analyzer = CalimaStarAnalyzer(db, backoff)

    disambig = None

    if num_disambig is not None:
        disambig = MLEDisambiguator(analyzer)

    line = force_unicode(fin.readline())

    while line:
        if len(line) == 0:
            line = force_unicode(fin.readline())
            continue

        line = line.strip()
        tokens = _tokenize(line)

        for token in tokens:
            # Analyze each token exactly once. Previously the token was
            # analyzed unconditionally and then re-analyzed (or the first
            # result discarded) below, doubling the analyzer work.
            if num_disambig is not None:
                dambg = disambig.disambiguate([token], num_disambig)
                analyses = [a.analysis for a in dambg[0].analyses]
            else:
                analyses = analyzer.analyze(token)

            serialized = _serialize_analyses(fout, token, analyses, db.order)

            # Python 2 output streams expect encoded bytes.
            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))

            fout.write('\n\n')

        line = force_unicode(fin.readline())
コード例 #7
0
def main():  # pragma: no cover
    """Command-line entry point for the transliteration tool.

    Parses docopt arguments, optionally lists the builtin schemes, then
    transliterates input line by line using the selected scheme. Exits with
    status 1 on any error and 0 on success.
    """
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        if arguments['--list']:
            for scheme in _BUILTIN_SCHEMES:
                print("{}   {}".format(scheme[0].ljust(20), scheme[1]))
            sys.exit(0)

        if arguments['--scheme'] is not None:
            if arguments['--scheme'] not in [s[0] for s in _BUILTIN_SCHEMES]:
                sys.stderr.write('Error: {} is not a valid scheme.\n'
                                 'Run `camel_transliterate -l` to see the list'
                                 ' of available schemes.'
                                 '\n'.format(repr(arguments['--scheme'])))
                sys.exit(1)

            if arguments['--marker'] is None:
                marker = '@@IGNORE@@'
            else:
                marker = arguments['--marker']

            ignore_markers = arguments['--ignore-markers']
            strip_markers = arguments['--strip-markers']

            # Open files (or just use stdin and stdout)
            fin, fout = _open_files(arguments['FILE'], arguments['--output'])

            # Load the CharMapper and initialize a Transliterator with it
            try:
                mapper = CharMapper.builtin_mapper(arguments['--scheme'])
                trans = Transliterator(mapper, marker)
            except Exception:  # pylint: disable=W0703
                sys.stderr.write('Error: Could not load builtin scheme'
                                 ' {}.\n'.format(repr(arguments['--scheme'])))
                sys.exit(1)

            # Transliterate lines
            try:
                for line in fin:
                    line = force_unicode(line)

                    # Python 2 output streams expect encoded bytes.
                    if six.PY3:
                        fout.write(
                            trans.transliterate(line, strip_markers,
                                                ignore_markers))
                    else:
                        fout.write(
                            force_encoding(
                                trans.transliterate(line, strip_markers,
                                                    ignore_markers)))
                fout.flush()

            # If everything worked so far, this shouldn't happen
            except Exception:  # pylint: disable=W0703
                # Fixed typos in the user-facing message
                # ("unkown" -> "unknown", "occured" -> "occurred").
                sys.stderr.write('Error: An unknown error occurred during '
                                 'transliteration.\n')
                sys.exit(1)

            # Cleanup
            if arguments['FILE'] is not None:
                fin.close()
            if arguments['--output'] is not None:
                fout.close()

        sys.exit(0)
    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
コード例 #8
0
def _generate(db, fin, fout, backoff):
    """Generate word forms for each 'lemma feats' line read from *fin*.

    Serialized analyses are written to *fout*, one blank-line-separated
    record per input line. With ``backoff == 'REINFLECT'``, a lemma that
    yields no generated analyses is dediacritized and reinflected instead.
    Invalid lines, missing features, and generation errors are reported on
    stderr (with the line number when not reading from stdin) without
    stopping the run.
    """
    generator = Generator(db)
    reinflector = Reinflector(db) if backoff == 'REINFLECT' else None

    line = force_unicode(fin.readline())
    line_num = 1

    while line:
        line = line.strip()

        if len(line) == 0:
            line = force_unicode(fin.readline())
            line_num += 1
            continue

        parsed = _parse_generator_line(line)

        if parsed is None:
            if fin is sys.stdin:
                sys.stderr.write('Error: Invalid input line.\n')
            else:
                sys.stderr.write(
                    'Error: Invalid input line ({}).\n'.format(line_num))

        else:
            lemma = parsed[0]
            feats = parsed[1]

            # Make sure lemma and pos are specified first
            if lemma is None:
                if fin is sys.stdin:
                    sys.stderr.write('Error: Missing lex/lemma feature.\n')
                else:
                    sys.stderr.write(
                        'Error: Missing lex/lemma feature. [{}].\n'.format(
                            line_num))
            elif 'pos' not in feats:
                if fin is sys.stdin:
                    sys.stderr.write('Error: Missing pos feature.\n')
                else:
                    sys.stderr.write(
                        'Error: Missing pos feature. [{}]\n'.format(line_num))
            else:
                try:
                    analyses = generator.generate(lemma, feats)

                    if len(analyses) == 0 and backoff == 'REINFLECT':
                        word = _dediac(lemma)
                        analyses = reinflector.reinflect(word, feats)

                    serialized = _serialize_analyses(fout, lemma, analyses,
                                                     db.order, True)

                    # Python 2 output streams expect encoded bytes.
                    if six.PY3:
                        fout.write(serialized)
                    else:
                        fout.write(force_encoding(serialized))

                    fout.write('\n\n')
                except GeneratorError as error:
                    if fin is sys.stdin:
                        sys.stderr.write('Error: {}.\n'.format(error.msg))
                    else:
                        sys.stderr.write('Error: {}. [{}]\n'.format(
                            error.msg, line_num))

        # Bug fix: this read previously used force_encoding, unlike every
        # other read in this loop (and the priming read above). On Python 2
        # that re-encoded the line to bytes, breaking the unicode invariant
        # the loop body depends on.
        line = force_unicode(fin.readline())
        line_num += 1