Example #1
def _analyze(db, fin, fout, backoff, cache):
    if cache:
        analyzer = Analyzer(db, backoff, cache_size=1024)
    else:
        analyzer = Analyzer(db, backoff)

    line = force_unicode(fin.readline())

    while line:
        if len(line) == 0:
            line = force_unicode(fin.readline())
            continue

        line = line.strip()
        tokens = _tokenize(line)

        for token in tokens:
            analyses = analyzer.analyze(token)

            serialized = _serialize_analyses(fout, token, analyses, db.order)

            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))

            fout.write('\n\n')

        line = force_unicode(fin.readline())
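A minimal, self-contained sketch of the analyzer this helper wraps, assuming the module layout of recent camel_tools releases (older versions expose the same classes as CalimaStarDB/CalimaStarAnalyzer); the word and backoff mode below are only illustrative:

from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer

# Load the default analysis database and analyze a single word.
db = MorphologyDB.builtin_db()
analyzer = Analyzer(db, 'NOAN_PROP', cache_size=1024)
for analysis in analyzer.analyze(u'كتاب'):
    # Each analysis is a dict mapping feature names to values.
    print(analysis.get('lex'), analysis.get('pos'))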
Example #2
def _reinflect(db, fin, fout):
    reinflector = Reinflector(db)

    line = force_unicode(fin.readline())
    line_num = 1

    while line:
        line = line.strip()

        if len(line) == 0:
            line = force_unicode(fin.readline())
            line_num += 1
            continue

        parsed = _parse_reinflector_line(line)

        if parsed is None:
            if fin is sys.stdin:
                sys.stderr.write('Error: Invalid input line.\n')
            else:
                sys.stderr.write(
                    'Error: Invalid input line. [{}]\n'.format(line_num))

        else:
            word = parsed[0]
            feats = parsed[1]

            try:
                analyses = reinflector.reinflect(word, feats)

                serialized = _serialize_analyses(fout, word, analyses,
                                                 db.order)

                if six.PY3:
                    fout.write(serialized)
                else:
                    fout.write(force_encoding(serialized))

                fout.write('\n\n')
            except MorphologyError as error:
                # This could be thrown by the analyzer, generator, or
                # reinflector.
                if fin is sys.stdin:
                    sys.stderr.write('Error: {}.\n'.format(error.msg))
                else:
                    sys.stderr.write('Error: {}. [{}]\n'.format(
                        error.msg, line_num))

        line = force_unicode(fin.readline())
        line_num += 1
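For reference, a hedged sketch of calling the Reinflector directly with a feature dict, which is what _parse_reinflector_line produces from each input line; the database flags and feature values below are assumptions, not taken from this module:

from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.reinflector import Reinflector

# Load a database usable for reinflection and request a plural form.
db = MorphologyDB.builtin_db(flags='r')
reinflector = Reinflector(db)
for a in reinflector.reinflect(u'كتاب', {'num': 'p'}):
    print(a.get('diac'), a.get('num'))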
Example #3
    def clean(self,
              ar_raw_text: str,
              normalize_teh_marbuta: bool = False,
              normalize_alef: bool = True,
              is_punct: bool = False,
              is_de_diacritize: bool = True,
              is_to_lower: bool = True) -> str:
        clean_text = ar_raw_text.strip("\n").strip()

        if is_to_lower:
            clean_text = clean_text.lower()

        clean_text = force_unicode(clean_text)
        if six.PY3:
            clean_text = self.clean_mapper.map_string(clean_text)
        else:
            clean_text = force_encoding(clean_text)
            clean_text = self.clean_mapper.map_string(clean_text)

        clean_text = self._get_sec_cleaner(
            is_punct=is_punct).clean_text(clean_text)

        clean_text = self._normalize(
            arb_text=clean_text,
            normalize_teh_marbuta=normalize_teh_marbuta,
            normalize_alef=normalize_alef)
        if is_de_diacritize:
            clean_text = self._de_diacritize(arb_text=clean_text)

        # Crude whitespace collapsing: each pass halves runs of spaces,
        # so up to 32 consecutive spaces end up as one.
        for _ in range(5):
            clean_text = clean_text.replace("  ", " ")

        return clean_text.strip()
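The five-pass space collapsing above only shrinks runs of up to 32 consecutive spaces to one; a single regex substitution handles runs of any length. A stand-alone sketch of that alternative (plain Python, independent of camel_tools):

import re

_MULTISPACE_RE = re.compile(r' {2,}')

def collapse_spaces(text):
    # Replace any run of two or more spaces with a single space.
    return _MULTISPACE_RE.sub(' ', text)

print(collapse_spaces('a   b        c'))  # -> 'a b c'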
Example #4
def _arclean(mapper, fin, fout):
    for line in fin:
        line = force_unicode(line)

        if six.PY3:
            fout.write(mapper.map_string(line))
        else:
            fout.write(force_encoding(mapper.map_string(line)))
    fout.flush()
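A short usage sketch for this helper, assuming it lives in the same module as above and that CharMapper is importable from camel_tools.utils.charmap; 'arclean' is the builtin cleaning map used by the camel_arclean CLI:

import sys

from camel_tools.utils.charmap import CharMapper

# Clean Arabic text from stdin and write the result to stdout.
mapper = CharMapper.builtin_mapper('arclean')
_arclean(mapper, sys.stdin, sys.stdout)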
Example #5
def _serialize_analyses(fout, word, analyses, order, generation=False):
    buff = collections.deque()
    buff.append(u'#{}: {}'.format(u'LEMMA' if generation else u'WORD',
                                  force_unicode(word)))

    if len(analyses) == 0:
        buff.append(u'NO_ANALYSIS')
    else:
        sub_buff = set()
        for a in analyses:
            output = u' '.join([
                u'{}:{}'.format(force_unicode(f), force_unicode(a[f]))
                for f in order if f in a
            ])
            sub_buff.add(output)
        buff.extend(sub_buff)

    return u'\n'.join(buff)
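A toy call with plain dicts illustrating the serialized shape, assuming the surrounding module's imports (collections, force_unicode) are in place; no database is needed, and None is passed for fout because the function never uses it. The feature names and values are invented for the example, and the two analysis lines may print in either order since duplicates are collected in a set:

analyses = [{'diac': u'كِتاب', 'pos': 'noun'},
            {'diac': u'كُتّاب', 'pos': 'noun'}]
print(_serialize_analyses(None, u'كتاب', analyses, ['diac', 'pos']))
# #WORD: كتاب
# diac:كِتاب pos:noun
# diac:كُتّاب pos:noun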
Example #6
def _analyze(db, fin, fout, backoff, cache, num_disambig=None):
    if cache:
        analyzer = CalimaStarAnalyzer(db, backoff, cache_size=1024)
    else:
        analyzer = CalimaStarAnalyzer(db, backoff)
    disambig = None

    if num_disambig is not None:
        disambig = MLEDisambiguator(analyzer)

    line = force_unicode(fin.readline())

    while line:
        if len(line) == 0:
            line = force_unicode(fin.readline())
            continue

        line = line.strip()
        tokens = _tokenize(line)

        for token in tokens:
            # Use the disambiguator's ranked analyses when requested;
            # otherwise fall back to plain analysis.
            if num_disambig is not None:
                dambg = disambig.disambiguate([token], num_disambig)
                analyses = [a.analysis for a in dambg[0].analyses]
            else:
                analyses = analyzer.analyze(token)

            serialized = _serialize_analyses(fout, token, analyses, db.order)

            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))

            fout.write('\n\n')

        line = force_unicode(fin.readline())
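For reference, a hedged sketch of the disambiguator used above via the pretrained helper available in recent camel_tools releases; disambiguate() returns one result per input token, each carrying scored analyses like those unpacked in the loop above:

from camel_tools.disambig.mle import MLEDisambiguator

# The pretrained MLE disambiguator bundles its own analyzer.
mle = MLEDisambiguator.pretrained()
result = mle.disambiguate([u'كتب'], top=2)
for scored in result[0].analyses:
    print(scored.score, scored.analysis.get('lex'))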
Example #7
def _analyze(db, fin, fout, backoff, cache):
    analyzer = CalimaStarAnalyzer(db, backoff)
    memoize_table = {} if cache else None

    line = force_unicode(fin.readline())

    while line:
        if len(line) == 0:
            line = force_unicode(fin.readline())
            continue

        line = line.strip()
        tokens = _tokenize(line)

        for token in tokens:
            if cache and token in memoize_table:
                if six.PY3:
                    fout.write(memoize_table[token])
                else:
                    fout.write(force_encoding(memoize_table[token]))

                fout.write('\n\n')
            else:
                analyses = analyzer.analyze(token)
                serialized = _serialize_analyses(fout, token, analyses,
                                                 db.order)

                if cache:
                    memoize_table[token] = serialized

                if six.PY3:
                    fout.write(serialized)
                else:
                    fout.write(force_encoding(serialized))

                fout.write('\n\n')

        line = force_unicode(fin.readline())
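Examples #1 and #6 rely on the analyzer's own cache_size argument instead of a hand-rolled table. As a design note, the per-token memoization above can also be expressed with Python 3's functools.lru_cache; in this stand-alone sketch analyze_cached is a hypothetical stand-in for analyzer.analyze, not a camel_tools API:

from functools import lru_cache

@lru_cache(maxsize=None)
def analyze_cached(token):
    # Stand-in for analyzer.analyze(token); results are cached per token.
    return ['analysis-of-' + token]

print(analyze_cached(u'كتاب'))
print(analyze_cached.cache_info())  # hits increase as tokens repeat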
Example #8
    def _parse_dbfile(self, fpath):
        with open(fpath, 'r') as dbfile:
            # Process DEFINES
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###DEFINES###':
                    continue

                if line == '###DEFAULTS###':
                    break

                toks = line.split(u' ')

                # Check if line has the minimum viable format
                if len(toks) < 3 or toks[0] != 'DEFINE':
                    raise DatabaseParseError(
                        'invalid DEFINES line {}'.format(repr(line)))

                new_define = toks[1]
                val_set = set()

                # Parse values for defined keyword
                for tok in toks[2:]:
                    subtoks = tok.split(':')

                    # If it's a malformed entry, raise a parse error
                    if len(subtoks) != 2 and subtoks[0] != toks[1]:
                        raise DatabaseParseError(
                            'invalid key value pair {} in DEFINES'.format(
                                repr(tok)))

                    # If it's an open class, we use None instead of a set
                    if len(toks) == 3 and subtoks[1] == '*open*':
                        val_set = None
                        break

                    val_set.add(subtoks[1])

                self.defines[new_define] = (
                    list(val_set) if val_set is not None else None)

            # Process DEFAULTS
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###ORDER###':
                    break

                toks = line.split(u' ')

                if len(toks) < 2 or toks[0] != 'DEFAULT':
                    raise DatabaseParseError(
                        'invalid DEFAULTS line {}'.format(repr(line)))

                parsed_default = self._parse_defaults_line_toks(toks[1:])

                if self._defaultKey not in parsed_default:
                    raise DatabaseParseError(
                        'DEFAULTS line {} missing {} value'.format(
                            repr(line), self._defaultKey))

                dkey = parsed_default[self._defaultKey]
                self.defaults[dkey] = parsed_default

            # Process ORDER
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###STEMBACKOFF###':
                    self.compute_feats.update(self.order)
                    break

                toks = line.split(u' ')

                if (self.order is not None or len(toks) < 2 or
                        toks[0] != 'ORDER'):
                    raise DatabaseParseError(
                        'invalid ORDER line {}'.format(repr(line)))

                if toks[1] not in self.defines:
                    raise DatabaseParseError(
                        'invalid feature {} in ORDER line.'.format(
                            repr(toks[1])))

                self.order = toks[1:]

            # Process STEMBACKOFFS
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###PREFIXES###':
                    break

                toks = line.split(u' ')

                if len(toks) < 3 or toks[0] != 'STEMBACKOFF':
                    raise DatabaseParseError(
                        'invalid STEMBACKOFFS line {}'.format(repr(line)))

                self.stem_backoffs[toks[1]] = toks[2:]

            # Process PREFIXES
            for line in dbfile:
                line = force_unicode(line)
                parts = line.split(u'\t')

                if len(parts) != 3:
                    if line.strip() == '###SUFFIXES###':
                        break
                    raise DatabaseParseError(
                        'invalid PREFIXES line {}'.format(repr(line)))

                prefix = parts[0].strip()
                category = parts[1]
                analysis = self._parse_analysis_line_toks(
                    parts[2].strip().split(u' '))

                if self._withAnalysis:
                    if prefix not in self.prefix_hash:
                        self.prefix_hash[prefix] = []
                    self.prefix_hash[prefix].append((category, analysis))

                if self._withGeneration:
                    # FIXME: Make sure analyses for category are unique?
                    if category not in self.prefix_cat_hash:
                        self.prefix_cat_hash[category] = []
                    self.prefix_cat_hash[category].append(analysis)

            # Process SUFFIXES
            for line in dbfile:
                line = force_unicode(line)
                parts = line.split(u'\t')

                if len(parts) != 3:
                    if line.strip() == '###STEMS###':
                        break
                    raise DatabaseParseError(
                        'invalid SUFFIXES line {}'.format(repr(line)))

                suffix = parts[0].strip()
                category = parts[1]
                analysis = self._parse_analysis_line_toks(
                    parts[2].strip().split(u' '))

                if self._withAnalysis:
                    if suffix not in self.suffix_hash:
                        self.suffix_hash[suffix] = []
                    self.suffix_hash[suffix].append((category, analysis))

                if self._withGeneration:
                    # FIXME: Make sure analyses for category are unique?
                    if category not in self.suffix_cat_hash:
                        self.suffix_cat_hash[category] = []
                    self.suffix_cat_hash[category].append(analysis)

            # Process STEMS
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###TABLE AB###':
                    break

                parts = line.split(u'\t')

                if len(parts) != 3:
                    raise DatabaseParseError(
                        'invalid STEMS line {}'.format(repr(line)))

                stem = parts[0]
                category = parts[1]
                analysis = self._parse_analysis_line_toks(parts[2].split(u' '))

                if self._withAnalysis:
                    if stem not in self.stem_hash:
                        self.stem_hash[stem] = []
                    self.stem_hash[stem].append((category, analysis))

                if self._withGeneration:
                    # FIXME: Make sure analyses for category are unique?
                    lemma = analysis['lex']
                    lemma_key = _LEMMA_SPLIT_RE.split(lemma)[0]
                    analysis['stemcat'] = category
                    if lemma_key not in self.lemma_hash:
                        self.lemma_hash[lemma_key] = []
                    self.lemma_hash[lemma_key].append(analysis)

            # Process prefix_stem compatibility table
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###TABLE BC###':
                    break

                toks = line.split()

                if len(toks) != 2:
                    raise DatabaseParseError(
                        'invalid TABLE AB line {}'.format(repr(line)))

                prefix_cat = toks[0]
                stem_cat = toks[1]

                if self._withAnalysis:
                    if prefix_cat not in self.prefix_stem_compat:
                        self.prefix_stem_compat[prefix_cat] = set()
                    self.prefix_stem_compat[prefix_cat].add(stem_cat)

                if self._withGeneration:
                    if stem_cat not in self.stem_prefix_compat:
                        self.stem_prefix_compat[stem_cat] = set()
                    self.stem_prefix_compat[stem_cat].add(prefix_cat)

            # Process stem_suffix compatibility table
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###TABLE AC###':
                    break

                toks = line.split()

                if len(toks) != 2:
                    raise DatabaseParseError(
                        'invalid TABLE BC line {}'.format(repr(line)))

                stem_cat = toks[0]
                suffix_cat = toks[1]

                if stem_cat not in self.stem_suffix_compat:
                    self.stem_suffix_compat[stem_cat] = set()
                self.stem_suffix_compat[stem_cat].add(suffix_cat)

            # Process prefix_suffix compatibility table
            for line in dbfile:
                line = force_unicode(line).strip()

                toks = line.split()

                if len(toks) != 2:
                    raise DatabaseParseError(
                        'invalid TABLE AC line {}'.format(repr(line)))

                prefix_cat = toks[0]
                suffix_cat = toks[1]

                if prefix_cat not in self.prefix_suffix_compat:
                    self.prefix_suffix_compat[prefix_cat] = set()
                self.prefix_suffix_compat[prefix_cat].add(suffix_cat)

            if self._withAnalysis:
                for prefix in self.prefix_hash.keys():
                    self.max_prefix_size = max(self.max_prefix_size,
                                               len(prefix))
                for suffix in self.suffix_hash.keys():
                    self.max_suffix_size = max(self.max_suffix_size,
                                               len(suffix))
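The parser above exploits a property of file iterators: a for-loop that breaks leaves the read position where it stopped, so each subsequent loop resumes at the next section marker. A minimal stand-alone illustration of that pattern with a toy two-section buffer (the content is synthetic; only the markers mirror the checks above):

import io

data = io.StringIO(u'###DEFINES###\n'
                   u'DEFINE pos noun:noun verb:verb\n'
                   u'###DEFAULTS###\n'
                   u'DEFAULT pos:noun\n')

# The first loop consumes lines up to the DEFAULTS marker...
for line in data:
    line = line.strip()
    if line == '###DEFINES###':
        continue
    if line == '###DEFAULTS###':
        break
    print('defines:', line)

# ...and the second loop resumes right after it.
for line in data:
    print('defaults:', line.strip())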
Example #9
def main():  # pragma: no cover
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        if arguments['--list']:
            for scheme in _BUILTIN_SCHEMES:
                print("{}   {}".format(scheme[0].ljust(20), scheme[1]))
            sys.exit(0)

        if arguments['--scheme'] is not None:
            if arguments['--scheme'] not in [s[0] for s in _BUILTIN_SCHEMES]:
                sys.stderr.write('Error: {} is not a valid scheme.\n'
                                 'Run `camel_transliterate -l` to see the list'
                                 ' of available schemes.'
                                 '\n'.format(repr(arguments['--scheme'])))
                sys.exit(1)

            if arguments['--marker'] is None:
                marker = '@@IGNORE@@'
            else:
                marker = arguments['--marker']

            ignore_markers = arguments['--ignore-markers']
            strip_markers = arguments['--strip-markers']

            # Open files (or just use stdin and stdout)
            fin, fout = _open_files(arguments['FILE'], arguments['--output'])

            # Load the CharMapper and initialize a Transliterator with it
            try:
                mapper = CharMapper.builtin_mapper(arguments['--scheme'])
                trans = Transliterator(mapper, marker)
            except Exception:  # pylint: disable=W0703
                sys.stderr.write('Error: Could not load builtin scheme'
                                 ' {}.\n'.format(repr(arguments['--scheme'])))
                sys.exit(1)

            # Transliterate lines
            try:
                for line in fin:
                    line = force_unicode(line)

                    if six.PY3:
                        fout.write(
                            trans.transliterate(line, strip_markers,
                                                ignore_markers))
                    else:
                        fout.write(
                            force_encoding(
                                trans.transliterate(line, strip_markers,
                                                    ignore_markers)))
                fout.flush()

            # If everything worked so far, this shouldn't happen
            except Exception:  # pylint: disable=W0703
                sys.stderr.write('Error: An unknown error occurred during '
                                 'transliteration.\n')
                sys.exit(1)

            # Cleanup
            if arguments['FILE'] is not None:
                fin.close()
            if arguments['--output'] is not None:
                fout.close()

        sys.exit(0)
    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
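A direct-usage sketch of the transliteration pieces this command wires together, assuming the class locations in recent camel_tools releases; 'ar2bw' (Arabic script to Buckwalter) is one of the builtin schemes listed by --list:

from camel_tools.utils.charmap import CharMapper
from camel_tools.utils.transliterate import Transliterator

# Build a transliterator with the same default marker used above.
mapper = CharMapper.builtin_mapper('ar2bw')
trans = Transliterator(mapper, '@@IGNORE@@')
print(trans.transliterate(u'ذهب الولد إلى المدرسة'))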
Example #10
def _generate(db, fin, fout, backoff):
    generator = Generator(db)
    reinflector = Reinflector(db) if backoff == 'REINFLECT' else None

    line = force_unicode(fin.readline())
    line_num = 1

    while line:
        line = line.strip()

        if len(line) == 0:
            line = force_unicode(fin.readline())
            line_num += 1
            continue

        parsed = _parse_generator_line(line)

        if parsed is None:
            if fin is sys.stdin:
                sys.stderr.write('Error: Invalid input line.\n')
            else:
                sys.stderr.write(
                    'Error: Invalid input line ({}).\n'.format(line_num))

        else:
            lemma = parsed[0]
            feats = parsed[1]

            # Make sure lemma and pos are specified first
            if lemma is None:
                if fin is sys.stdin:
                    sys.stderr.write('Error: Missing lex/lemma feature.\n')
                else:
                    sys.stderr.write(
                        'Error: Missing lex/lemma feature. [{}].\n'.format(
                            line_num))
            elif 'pos' not in feats:
                if fin is sys.stdin:
                    sys.stderr.write('Error: Missing pos feature.\n')
                else:
                    sys.stderr.write(
                        'Error: Missing pos feature. [{}]\n'.format(line_num))
            else:
                try:
                    analyses = generator.generate(lemma, feats)

                    if len(analyses) == 0 and backoff == 'REINFLECT':
                        word = _dediac(lemma)
                        analyses = reinflector.reinflect(word, feats)

                    serialized = _serialize_analyses(fout, lemma, analyses,
                                                     db.order, True)

                    if six.PY3:
                        fout.write(serialized)
                    else:
                        fout.write(force_encoding(serialized))

                    fout.write('\n\n')
                except GeneratorError as error:
                    if fin is sys.stdin:
                        sys.stderr.write('Error: {}.\n'.format(error.msg))
                    else:
                        sys.stderr.write('Error: {}. [{}]\n'.format(
                            error.msg, line_num))

        line = force_unicode(fin.readline())
        line_num += 1
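To round out the example, a hedged sketch of calling the Generator directly with the lemma and feature dict that _parse_generator_line would produce; the database flags, lemma, and feature values here are assumptions for illustration:

from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.generator import Generator

# Generation needs a database loaded with generation flags, a lemma,
# and at least a 'pos' feature (as the checks above enforce).
db = MorphologyDB.builtin_db(flags='g')
generator = Generator(db)
for a in generator.generate(u'كَتَب', {'pos': 'verb', 'asp': 'p'}):
    print(a.get('diac'))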