Ejemplo n.º 1
0
    def __init__(self, analyzer, mle_path=None, top=1, cache_size=100000):
        """Initialize with a morphological analyzer and an optional MLE model.

        Args:
            analyzer (:obj:`Analyzer`): Analyzer used to produce candidate
                analyses.
            mle_path (:obj:`str`, optional): Path to a JSON MLE model file.
                Defaults to None (no model loaded).
            top (:obj:`int`, optional): Number of top analyses to keep;
                values below 1 are clamped to 1. Defaults to 1.
            cache_size (:obj:`int`, optional): Maximum size of the LFU cache
                used for scored analyses; negative values are clamped to 0.
                Defaults to 100000.

        Raises:
            :obj:`ValueError`: If `analyzer`, `top`, or `cache_size` has an
                invalid type.
        """

        # Validate in the same order as documented so callers get the same
        # error for the same bad argument.
        if not isinstance(analyzer, Analyzer):
            raise ValueError('Invalid analyzer instance.')
        if not isinstance(top, int):
            raise ValueError('Invalid value for top.')
        if not isinstance(cache_size, int):
            raise ValueError('Invalid value for cache_size.')

        if mle_path is None:
            self._mle = None
        else:
            with open(mle_path, 'r', encoding='utf-8') as mle_fp:
                self._mle = json.load(mle_fp)

            # TODO: Remove this when MLE files are fixed
            for analysis in self._mle.values():
                analysis['lex'] = strip_lex(analysis['lex'])

        self._analyzer = analyzer

        # Clamp tuning knobs into their valid ranges instead of raising.
        self._top = top if top >= 1 else 1
        self._cache = LFUCache(cache_size if cache_size >= 0 else 0)

        # Memoize scoring per instance through the LFU cache.
        self._scored_analyses = cached(self._cache)(self._scored_analyses)
Ejemplo n.º 2
0
    def generate(self, lemma, feats):
        """Generate surface forms and their associated analyses for a given 
        lemma and a given set of (possibly underspecified) features. 
        The surface form is accessed through the `diac` feature.

        Args:
            lemma (:obj:`str`): Lemma to generate from.
            feats (:obj:`dict`): Dictionary of features. Must contain 'pos'
                feature.
                See :doc:`/reference/camel_morphology_features` for
                more information on features and their values.

        Returns:
            :obj:`list` of :obj:`dict`: List of generated analyses.
            See :doc:`/reference/camel_morphology_features` for more
            information on features and their values.

        Raises:
            :obj:`~camel_tools.morphology.errors.InvalidGeneratorFeature`: If
                a feature is given that is not defined in database.
            :obj:`~camel_tools.morphology.errors.InvalidGeneratorFeatureValue`:
                If an invalid value is given to a feature or if 'pos' feature
                is not defined.
        """

        # Lemmas are indexed in stripped form (the STEMS parser strips 'lex'
        # before populating lemma_hash), so strip before looking up.
        lemma = strip_lex(lemma)
        if lemma not in self._db.lemma_hash:
            return []

        # Validate each requested feature and its value against the
        # database's DEFINES table; a None entry means an open value class.
        for feat in feats:
            if feat not in self._db.defines:
                raise InvalidGeneratorFeature(feat)
            elif (self._db.defines[feat] is not None and
                  feats[feat] not in self._db.defines[feat]):
                raise InvalidGeneratorFeatureValue(feat, feats[feat])

        # 'pos' is mandatory: defaults below are keyed by POS.
        if 'pos' not in feats or feats['pos'] not in self._db.defines['pos']:
            raise InvalidGeneratorFeatureValue('pos', feats.get('pos', None))

        # Shallow copy so filling in defaults below does not mutate the
        # caller's dict.
        feats = copy.copy(feats)

        default = self._db.defaults[feats['pos']]
        default_feat_set = frozenset(default.keys())
        feat_set = frozenset(feats.keys())

        # Any requested feature with no default entry for this POS cannot
        # be generated for.
        if not feat_set.issubset(default_feat_set):
            return []

        # Set default values for undefined feats
        for feat in ['prc0', 'prc1', 'prc2', 'prc3', 'enc0', 'enc1', 'enc2']:
            if feat not in feats and feat in default:
                feats[feat] = default[feat]

        stem_feats_list = self._db.lemma_hash[lemma]
        analyses = collections.deque()

        # Try every stem of this lemma against every compatible
        # prefix/suffix pair, keeping only combinations whose merged
        # features agree with the request.
        for stem_feats in stem_feats_list:

            # Skip stems that contradict explicitly requested values.
            if 'vox' in feats and stem_feats['vox'] != feats['vox']:
                continue
            if 'rat' in feats and stem_feats['rat'] != feats['rat']:
                continue
            if 'pos' in feats and stem_feats['pos'] != feats['pos']:
                continue

            # A stem is unusable if it hard-codes a clitic value ('0' means
            # unset) different from the requested one.
            ignore_stem = False
            for feat in ['prc0', 'prc1', 'prc2', 'prc3', 'enc0', 'enc1', 'enc2']:
                if feat not in feats:
                    continue
                if (feat in stem_feats and
                        stem_feats[feat] != '0' and
                        feats[feat] != stem_feats[feat]):
                    ignore_stem = True
                    break

            if ignore_stem:
                continue

            prefix_cats = self._db.stem_prefix_compat[stem_feats['stemcat']]
            suffix_cats = self._db.stem_suffix_compat[stem_feats['stemcat']]

            for prefix_cat in prefix_cats:
                if prefix_cat not in self._db.prefix_cat_hash:
                    continue

                prefix_feats_list = self._db.prefix_cat_hash[prefix_cat]
                for prefix_feats in prefix_feats_list:
                    ignore_prefix = False

                    # A requested non-zero proclitic must be realized by the
                    # prefix or already carried by the stem; and when the
                    # prefix does carry the feature, the value must match.
                    for feat in ['prc0', 'prc1', 'prc2', 'prc3']:
                        if feat not in feats:
                            continue
                        if ((feats[feat] != '0' and
                             feat not in prefix_feats and
                             stem_feats.get(feat, '0') != feats[feat]) or
                            (feat in prefix_feats and
                             feats[feat] != prefix_feats[feat])):
                            ignore_prefix = True
                            break

                    if ignore_prefix:
                        continue

                    for suffix_cat in suffix_cats:
                        if suffix_cat not in self._db.suffix_cat_hash:
                            continue
                        suffix_feats_list = (
                            self._db.suffix_cat_hash[suffix_cat])
                        for suffix_feats in suffix_feats_list:
                            # Prefix and suffix categories must themselves
                            # be compatible with each other.
                            if ((prefix_cat not in
                                 self._db.prefix_suffix_compat) or
                                (suffix_cat not in
                                 self._db.prefix_suffix_compat[prefix_cat])):
                                continue

                            ignore_suffix = False

                            # Same logic as the proclitic check above, but
                            # for enclitics carried by the suffix.
                            for feat in ['enc0', 'enc1', 'enc2']:
                                if feat not in feats:
                                    continue
                                if ((feats[feat] != '0' and
                                     feat not in suffix_feats and
                                     stem_feats.get(feat, '0') != feats[feat])
                                    or (feat in suffix_feats and
                                        feats[feat] != suffix_feats[feat])):
                                    ignore_suffix = True
                                    break

                            if ignore_suffix:
                                continue

                            merged = merge_features(self._db, prefix_feats,
                                                    stem_feats, suffix_feats)

                            # Final check: the merged analysis must agree
                            # with every explicitly requested feature.
                            ignore_analysis = False
                            for feat in feats.keys():
                                if (feat in merged and
                                        merged[feat] != feats[feat]):
                                    ignore_analysis = True
                                    break

                            if not ignore_analysis:
                                analyses.append(merged)

        return list(analyses)
Ejemplo n.º 3
0
    def _parse_dbfile(self, fpath):
        """Parse a morphology database file and populate the lookup tables.

        The file is consumed strictly section by section: DEFINES, DEFAULTS,
        ORDER, TOKENIZATIONS, STEMBACKOFFS, PREFIXES, SUFFIXES, STEMS, and
        the three compatibility tables (AB: prefix-stem, BC: stem-suffix,
        AC: prefix-suffix). Each ``for line in dbfile`` loop below reads up
        to (and consumes) its section's terminating ``###...###`` marker, so
        the next loop resumes exactly where the previous one stopped.

        Args:
            fpath (:obj:`str`): Path to the database file to parse.

        Raises:
            :obj:`DatabaseParseError`: If a line does not conform to its
                section's expected format.
        """
        with open(fpath, 'r', encoding='utf-8') as dbfile:
            # Process DEFINES
            for line in dbfile:
                # Fixed: was a redundant double assignment
                # (`line = line = ...`).
                line = force_unicode(line).strip()

                if line == '###DEFINES###':
                    continue

                if line == '###DEFAULTS###':
                    break

                toks = line.split(u' ')

                # Check if line has the minimum viable format
                if len(toks) < 3 or toks[0] != 'DEFINE':
                    raise DatabaseParseError('invalid DEFINES line {}'.format(
                        repr(line)))

                new_define = toks[1]
                val_set = set()

                # Parse values for defined keyword
                for tok in toks[2:]:
                    subtoks = tok.split(':')

                    # If it's a malformed entry, ignore it
                    # NOTE(review): this raises only when BOTH conditions
                    # hold; a malformed pair whose key equals toks[1] is
                    # silently accepted. Possibly `or` was intended --
                    # left unchanged to preserve behavior.
                    if len(subtoks) != 2 and subtoks[0] != toks[1]:
                        raise DatabaseParseError(
                            'invalid key value pair {} in DEFINES'.format(
                                repr(tok)))

                    # If it's an open class, we use None instead of a set
                    if len(toks) == 3 and subtoks[1] == '*open*':
                        val_set = None
                        break

                    val_set.add(subtoks[1])

                self.defines[new_define] = (list(val_set)
                                            if val_set is not None else None)

            # Process DEFAULTS: one parsed-default dict per POS, keyed by
            # the value of self._defaultKey.
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###ORDER###':
                    break

                toks = line.split(u' ')

                if len(toks) < 2 or toks[0] != 'DEFAULT':
                    raise DatabaseParseError('invalid DEFAULTS line {}'.format(
                        repr(line)))

                parsed_default = self._parse_defaults_line_toks(toks[1:])

                if self._defaultKey not in parsed_default:
                    raise DatabaseParseError(
                        'DEFAULTS line {} missing {} value'.format(
                            repr(line), self._defaultKey))

                dkey = parsed_default[self._defaultKey]
                self.defaults[dkey] = parsed_default

            # Process ORDER
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###TOKENIZATIONS###':
                    self.compute_feats = frozenset(self.order)
                    break

                toks = line.split(u' ')

                # NOTE(review): this guard only fires when ALL three
                # conditions hold, so it is nearly inert; `len(toks) < 2 or
                # toks[0] != 'ORDER'` was likely intended. Left unchanged.
                if (self.order is not None and len(toks) < 2
                        and toks[0] != 'ORDER'):
                    raise DatabaseParseError('invalid ORDER line {}'.format(
                        repr(line)))

                if toks[1] not in self.defines:
                    raise DatabaseParseError(
                        'invalid feature {} in ORDER line.'.format(
                            repr(toks[1])))

                # Each ORDER line replaces the previous one; only the last
                # line's feature list is kept.
                self.order = toks[1:]

            # Process TOKENIZATIONS
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###STEMBACKOFF###':
                    # Freeze the accumulated set once the section ends.
                    self.tokenizations = frozenset(self.tokenizations)
                    break

                toks = line.split(u' ')

                # NOTE(review): same near-inert `and` chain as in ORDER,
                # and it tests `self.order` rather than tokenization state
                # -- looks like a copy-paste; confirm intent. Left as-is.
                if (self.order is not None and len(toks) < 2
                        and toks[0] != 'TOKENIZATION'):
                    raise DatabaseParseError(
                        'invalid TOKENIZATION line {}'.format(repr(line)))

                if toks[1] not in self.defines:
                    raise DatabaseParseError(
                        'invalid feature {} in TOKENIZATION line.'.format(
                            repr(toks[1])))

                self.tokenizations.update(toks[1:])

            # Process STEMBACKOFFS: maps a backoff stem to its categories.
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###PREFIXES###':
                    break

                toks = line.split(u' ')

                if len(toks) < 3 or toks[0] != 'STEMBACKOFF':
                    raise DatabaseParseError(
                        'invalid STEMBACKOFFS line {}'.format(repr(line)))

                self.stem_backoffs[toks[1]] = toks[2:]

            # Process PREFIXES. Lines here are tab-separated
            # (surface form, category, analysis); the section terminator is
            # only recognized when a line does not split into three parts.
            for line in dbfile:
                line = force_unicode(line)
                parts = line.split(u'\t')

                if len(parts) != 3:
                    if line.strip() == '###SUFFIXES###':
                        break
                    raise DatabaseParseError('invalid PREFIXES line {}'.format(
                        repr(line)))

                prefix = parts[0].strip()
                category = parts[1]
                analysis = self._parse_analysis_line_toks(
                    parts[2].strip().split(u' '))

                # Analysis direction: look up analyses by surface prefix.
                if self._withAnalysis:
                    if prefix not in self.prefix_hash:
                        self.prefix_hash[prefix] = []
                    self.prefix_hash[prefix].append((category, analysis))

                # Generation direction: look up analyses by category.
                if self._withGeneration:
                    # FIXME: Make sure analyses for category are unique?
                    if category not in self.prefix_cat_hash:
                        self.prefix_cat_hash[category] = []
                    self.prefix_cat_hash[category].append(analysis)

            # Process SUFFIXES (same shape and indexing as PREFIXES).
            for line in dbfile:
                line = force_unicode(line)
                parts = line.split(u'\t')

                if len(parts) != 3:
                    if line.strip() == '###STEMS###':
                        break
                    raise DatabaseParseError('invalid SUFFIXES line {}'.format(
                        repr(line)))

                suffix = parts[0].strip()
                category = parts[1]
                analysis = self._parse_analysis_line_toks(
                    parts[2].strip().split(u' '))

                if self._withAnalysis:
                    if suffix not in self.suffix_hash:
                        self.suffix_hash[suffix] = []
                    self.suffix_hash[suffix].append((category, analysis))

                if self._withGeneration:
                    # FIXME: Make sure analyses for category are unique?
                    if category not in self.suffix_cat_hash:
                        self.suffix_cat_hash[category] = []
                    self.suffix_cat_hash[category].append(analysis)

            # Process STEMS
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###TABLE AB###':
                    break

                parts = line.split(u'\t')

                if len(parts) != 3:
                    raise DatabaseParseError('invalid STEMS line {}'.format(
                        repr(line)))

                stem = parts[0]
                category = parts[1]
                analysis = self._parse_analysis_line_toks(parts[2].split(u' '))
                # Lemmas are indexed in stripped form everywhere.
                analysis['lex'] = strip_lex(analysis['lex'])

                if self._withAnalysis:
                    if stem not in self.stem_hash:
                        self.stem_hash[stem] = []
                    self.stem_hash[stem].append((category, analysis))

                if self._withGeneration:
                    # FIXME: Make sure analyses for category are unique?
                    lemma_key = analysis['lex']
                    analysis['stemcat'] = category
                    if lemma_key not in self.lemma_hash:
                        self.lemma_hash[lemma_key] = []
                    self.lemma_hash[lemma_key].append(analysis)

            # Process prefix_stem compatibility table (TABLE AB), indexed
            # in both directions depending on the enabled flags.
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###TABLE BC###':
                    break

                toks = line.split()

                if len(toks) != 2:
                    raise DatabaseParseError('invalid TABLE AB line {}'.format(
                        repr(line)))

                prefix_cat = toks[0]
                stem_cat = toks[1]

                if self._withAnalysis:
                    if prefix_cat not in self.prefix_stem_compat:
                        self.prefix_stem_compat[prefix_cat] = set()
                    self.prefix_stem_compat[prefix_cat].add(stem_cat)

                if self._withGeneration:
                    if stem_cat not in self.stem_prefix_compat:
                        self.stem_prefix_compat[stem_cat] = set()
                    self.stem_prefix_compat[stem_cat].add(prefix_cat)

            # Process stem_suffix compatibility table (TABLE BC).
            for line in dbfile:
                line = force_unicode(line).strip()

                if line == '###TABLE AC###':
                    break

                toks = line.split()

                if len(toks) != 2:
                    raise DatabaseParseError('invalid TABLE BC line {}'.format(
                        repr(line)))

                stem_cat = toks[0]
                suffix_cat = toks[1]

                # NOTE(review): unlike TABLE AB, this table is populated
                # regardless of _withAnalysis/_withGeneration -- confirm
                # this is intended.
                if stem_cat not in self.stem_suffix_compat:
                    self.stem_suffix_compat[stem_cat] = set()
                self.stem_suffix_compat[stem_cat].add(suffix_cat)

            # Process prefix_suffix compatibility table (TABLE AC); last
            # section, so it runs until end of file with no marker check.
            for line in dbfile:
                line = force_unicode(line).strip()

                toks = line.split()

                if len(toks) != 2:
                    raise DatabaseParseError('invalid TABLE AC line {}'.format(
                        repr(line)))

                prefix_cat = toks[0]
                suffix_cat = toks[1]

                if prefix_cat not in self.prefix_suffix_compat:
                    self.prefix_suffix_compat[prefix_cat] = set()
                self.prefix_suffix_compat[prefix_cat].add(suffix_cat)

            # Precompute maximum affix lengths for the analyzer's
            # segmentation loop.
            if self._withAnalysis:
                for prefix in self.prefix_hash.keys():
                    self.max_prefix_size = max(self.max_prefix_size,
                                               len(prefix))
                for suffix in self.suffix_hash.keys():
                    self.max_suffix_size = max(self.max_suffix_size,
                                               len(suffix))