コード例 #1
0
def matching_structattr(corp: manatee.Corpus, struct: str, attr: str, val: str,
                        search_attr: str) -> Tuple[List[str], int, int]:
    """
    Return values of search_attr matching the provided structural attribute
    [struct].[attr] = [val].

    Arguments:
        corp -- a manatee corpus instance
        struct -- structure name (e.g. 'doc')
        attr -- structural attribute name within struct
        val -- required value of [struct].[attr]
        search_attr -- the structural attribute whose values are collected

    Returns:
        a 3-tuple (sorted list of unique matching values,
                   total concordance size,
                   number of lines actually examined = min(size, size_limit));
        ([], 0, 0) if the attribute does not exist in the corpus
    """
    try:
        size_limit = 1000000
        ans = set()
        query = '<{struct} {attr}="{attr_val}">[]'.format(struct=struct,
                                                          attr=attr,
                                                          attr_val=val)
        conc = manatee.Concordance(corp, query, 0, -1)
        conc.sync()
        size = conc.size()

        # collect search_attr values via the KWIC lines' reference column
        kw = manatee.KWICLines(corp, conc.RS(True, 0,
                                             size_limit), '-1', '1', 'word',
                               '', '', '={}.{}'.format(struct, search_attr))
        while kw.nextline():
            refs = kw.get_ref_list()
            if len(refs) > 0:
                ans.add(refs[0])
        return sorted(ans), size, min(size, size_limit)
    except RuntimeError as ex:
        if 'AttrNotFound' in str(ex):
            return [], 0, 0
        # bare 'raise' is the idiomatic way to re-raise the active exception
        raise
コード例 #2
0
    def _load_raw_sent(self, corpus, corpus_id, token_id, kwic_len, tree_attrs):
        """
        Retrieve a sentence via Manatee
        Args:
            corpus (manatee.Corpus): a corpus instance
            corpus_id (str): corpus ID
            token_id (int): token number/id
            kwic_len (int): number of tokens in KWIC
            tree_attrs (list of str): a list of positional attributes required by tree nodes/edges

        Returns (dict):
            data: a list of strings (Manatee raw format)
            kwic_pos: a tuple (first_kwic_idx, kwic_length)
            None is returned if Manatee yields no line for the query
        """
        encoding = corpus.get_conf('ENCODING')
        sentence_struct = self._conf.get_sentence_struct(corpus_id)
        conc = manatee.Concordance(corpus, ' '.join(
            '[#%d]' % k for k in range(token_id, token_id + kwic_len)), 1, -1)
        conc.sync()
        kl = manatee.KWICLines(corpus, conc.RS(True, 0, 1),
                               '-1:%s' % sentence_struct,
                               '1:%s' % sentence_struct,
                               ','.join(tree_attrs),
                               ','.join(tree_attrs), '', '')
        if kl.nextline():
            left_tk = kl.get_left()
            kwic_tk = kl.get_kwic()
            # Each token appears to occupy 4 items in KWICLines output
            # (TODO confirm against Manatee docs); use integer division so
            # kwic_pos holds int indices — true division ('/') would produce
            # floats under Python 3.
            return dict(data=[import_string(s, from_encoding=encoding)
                              for s in left_tk + kwic_tk + kl.get_right()],
                        kwic_pos=(len(left_tk) // 4, len(kwic_tk) // 4))
コード例 #3
0
    def get_cached_conc_sizes(self, corp, q=None, cachefile=None):
        """
        Return (possibly intermediate) sizes of a cached concordance.

        arguments:
        corp -- manatee.Corpus instance
        q -- a list containing preprocessed query
        cachefile -- if not provided then the path is determined automatically
        using CACHE_ROOT_DIR and corpus name, corpus name and the query

        returns:
        a dictionary {
            finished : 0/1,
            concsize : int,
            fullsize : int,
            relconcsize : float (concordance size recalculated to a million corpus),
            arf : ARF of the result (this is calculated only for the finished result, i.e. no intermediate values)
        }
        """
        import struct

        if q is None:
            q = []
        ans = dict(finished=False,
                   concsize=None,
                   fullsize=None,
                   relconcsize=None)
        if not cachefile:  # AJAX call
            q = tuple(q)
            subchash = getattr(corp, 'subchash', None)
            cache_map = self._cache_factory.get_mapping(corp)
            cachefile = cache_map.cache_file_path(subchash, q)
            status = cache_map.get_calc_status(subchash, q)
            if status.error is not None:
                raise ConcCalculationStatusException(
                    'Concordance calculation failed', status.error)

        if cachefile and os.path.isfile(cachefile):
            # use a context manager so the cache file is closed even on error
            # (the original left the handle open - a resource leak)
            with open(cachefile, 'rb') as cache:
                # fixed binary header layout: 'finished' flag at offset 15,
                # full size (int64) right after it, conc. size (int32) at 32
                cache.seek(15)
                finished = bool(ord(cache.read(1)))
                (fullsize, ) = struct.unpack('q', cache.read(8))
                cache.seek(32)
                (concsize, ) = struct.unpack('i', cache.read(4))

            # normalize size to "per million tokens"; prefer fullsize when known
            if fullsize > 0:
                relconcsize = 1000000.0 * fullsize / corp.search_size()
            else:
                relconcsize = 1000000.0 * concsize / corp.search_size()

            # ARF is only meaningful for a finished, non-subcorpus concordance
            if finished and not is_subcorpus(corp):
                conc = manatee.Concordance(corp, cachefile)
                result_arf = round(conc.compute_ARF(), 2)
            else:
                result_arf = None

            ans['finished'] = finished
            ans['concsize'] = concsize
            ans['fullsize'] = fullsize
            ans['relconcsize'] = relconcsize
            ans['arf'] = result_arf
        return ans
コード例 #4
0
ファイル: validate.py プロジェクト: czcorpus/ictools
def find_struct_begin(corp, alignment, sentence_attr, struct_name, struct_idx):
    """
    Locate the structure <struct_name #struct_idx> in the corpus and pass the
    resulting single-line concordance to _find_refs.
    Prints an error (but still continues) when the structure is not found
    exactly once.
    """
    query = '<{0} #{1}>[]'.format(struct_name, struct_idx)
    conc = manatee.Concordance(corp, query, 0, -1)
    conc.sync()
    num_hits = conc.size()
    if num_hits != 1:
        print('ERROR: <{0} #{1}> not found'.format(struct_name, struct_idx))
    _find_refs(conc, sentence_attr, alignment, struct_idx)
    return None
コード例 #5
0
 def _load_raw_sent(self, corpus, canonical_corpus_id, token_id,
                    tree_attrs):
     """
     Fetch the sentence containing the given token as a list of raw
     Manatee-format strings (decoded via import_string); returns None
     when Manatee yields no line.
     """
     enc = corpus.get_conf('ENCODING')
     sent_struct = self._conf.get_sentence_struct(canonical_corpus_id)
     conc = manatee.Concordance(corpus, '[#%d]' % token_id, 1, -1)
     conc.sync()
     attr_spec = ','.join(tree_attrs)
     kl = manatee.KWICLines(corpus, conc.RS(True, 0, 1),
                            '-1:%s' % sent_struct, '1:%s' % sent_struct,
                            attr_spec, attr_spec, '', '')
     if not kl.nextline():
         return None
     tokens = kl.get_left() + kl.get_kwic() + kl.get_right()
     return [import_string(tk, from_encoding=enc) for tk in tokens]
コード例 #6
0
ファイル: tagextract.py プロジェクト: anukat2015/kontext
    def generate_kwiclines(self, query, corpus):
        """
        Parameters
        ----------
        query : str
          a query to be used to extract all tag values
        corpus : manatee.Corpus
          a corpus instance (NOTE(review): the original docstring said
          'a corpus name' but the value is passed straight to
          manatee.Concordance - confirm against callers)

        Returns
        -------
        list
          a sorted list of all unique tag values as found in the corpus
          (the original docstring claimed a set, but sorted() returns a list)
        """
        conc = manatee.Concordance(corpus, query, 0)
        kw = manatee.KWICLines(conc, '-1#', '1#', 'tag', 'tag', '', '#', 0)
        ans = set()
        for i in range(conc.size()):
            kw.nextline(i)
            ans.add(kw.get_kwic()[0].strip())
        # the intermediate tuple() conversion was redundant - sorted()
        # accepts any iterable
        return sorted(ans)
コード例 #7
0
ファイル: __init__.py プロジェクト: mzimandl/kontext
def add_structattr_support(corp: KCorpus, attrs, token_id):
    """
    A decorator function which turns 'fetch_posattr' into
    a more general function which is able to load
    structural attributes too. The load is performed only
    once for all possible structural attributes.

    Arguments:
        corp -- a KCorpus instance
        attrs -- attribute names; items containing '.' are treated as
                 structural attributes and preloaded here
        token_id -- token position used to fetch the structattr values
    """

    data = {}
    refs = [x for x in attrs if '.' in x]
    refs_mapping = {}
    for n in refs:
        if n:
            # Manatee reports refs under their LABEL (if defined), so map
            # labels back to the original dotted attribute names
            lab = corp.get_conf(f'{n}.LABEL')
            refs_mapping[lab if lab else n] = n

    if len(refs) > 0:
        conc = manatee.Concordance(corp.unwrap(),
                                   '[#{}]'.format(int(token_id)), 1, -1)
        conc.sync()
        rs = conc.RS(True, 0, 0)
        kl = manatee.KWICLines(corp.unwrap(), rs, '-1', '1', 'word', '', '',
                               ','.join(refs))
        if kl.nextline():
            refs_str = kl.get_refs()
            for kv in refs_str.split(','):
                if '=' in kv:
                    # split only on the first '=' so attribute values that
                    # themselves contain '=' don't raise ValueError on unpack
                    k, v = kv.split('=', 1)
                    k = refs_mapping.get(k)
                    # skip unknown keys; the original stored them under None,
                    # which could never match a dotted attr lookup anyway
                    if k is not None:
                        data[k] = v

    def decorator(fn):
        def wrapper(corp, attr, token_id, num_tokens):
            # structattrs are answered from the preloaded cache
            if '.' in attr:
                return data[attr]
            return fn(corp, attr, token_id, num_tokens)

        return wrapper

    return decorator
コード例 #8
0
ファイル: base.py プロジェクト: mzimandl/kontext
    def get_cached_conc_sizes(self,
                              corp: KCorpus,
                              q: Tuple[str, ...] = None) -> Dict[str, Any]:
        """
        arguments:
        corp --
        q -- a list containing preprocessed query
        using CACHE_ROOT_DIR and corpus name, corpus name and the query

        returns:
        a dictionary {
            finished : 0/1,
            concsize : int,
            fullsize : int,
            relconcsize : float (concordance size recalculated to a million corpus),
            arf : ARF of the result (this is calculated only for the finished result, i.e. no intermediate values),
            error : str or None
        }
        """
        import struct

        if q is None:
            q = ()
        ans = dict(finished=False,
                   concsize=0,
                   fullsize=0,
                   relconcsize=0,
                   error=None)
        cache_map = self._cache_factory.get_mapping(corp)
        status = cache_map.get_calc_status(corp.subchash, q)
        if not status:
            raise ConcCalculationStatusException(
                'Concordance calculation not found', None)
        status.check_for_errors(TASK_TIME_LIMIT)
        if status.error:
            ans['finished'] = True
            ans['error'] = status.error
        elif status.cachefile and os.path.isfile(status.cachefile):
            # use a context manager so the cache file is closed even on error
            # (the original left the handle open - a resource leak)
            with open(status.cachefile, 'rb') as cache:
                # fixed binary header layout: 'finished' flag at offset 15,
                # full size (int64) right after it, conc. size (int32) at 32
                cache.seek(15)
                finished = bool(ord(cache.read(1)))
                (fullsize, ) = struct.unpack('q', cache.read(8))
                cache.seek(32)
                (concsize, ) = struct.unpack('i', cache.read(4))

            # normalize size to "per million tokens"; prefer fullsize when known
            if fullsize > 0:
                relconcsize = 1000000.0 * fullsize / corp.search_size
            else:
                relconcsize = 1000000.0 * concsize / corp.search_size

            # ARF is only meaningful for a finished, non-subcorpus concordance
            if finished and not corp.is_subcorpus:
                conc = manatee.Concordance(corp.unwrap(), status.cachefile)
                result_arf = round(conc.compute_ARF(), 2)
            else:
                result_arf = None

            ans['finished'] = finished
            ans['concsize'] = concsize
            ans['fullsize'] = fullsize
            ans['relconcsize'] = relconcsize
            ans['arf'] = result_arf
        return ans