コード例 #1
0
ファイル: pydic_base.py プロジェクト: pombredanne/pydic
    def __init__(self, path):
        """
        Load a dictionary from ``path``.

        A directory is read through ``read_pydic_index()``; a regular file is
        indexed through ``make_memory_pydic_index()``.

        :param path: directory or file to load the dictionary from
        :raises RuntimeError: when ``path`` is neither a directory nor a file
        """
        self.path = path

        is_directory = os.path.isdir(self.get_path())
        # Guard clause: reject anything that is neither a directory nor a file
        if not is_directory and not os.path.isfile(self.path):
            raise RuntimeError("Wrong pydic input resource")

        if is_directory:
            self.read_pydic_index(self.get_path())
        else:
            self.make_memory_pydic_index(self.get_path())

        self.accents = Accents()
        self.memory_recno = ''
コード例 #2
0
    def __init__(self, tf, silent=True):
        """
        Build every processing stage from ``tf``, reporting progress via
        ``self.report()`` before and after each stage.

        :param tf: object handed to each stage constructor
        :param silent: stored on ``self.silent`` before the first report
        """
        self.silent = silent

        def run_stage(label, build):
            # Announce the stage, build it, then confirm completion
            self.report('processing %s...' % label)
            built = build()
            self.report('\tdone')
            return built

        self.accents = run_stage('accents', lambda: Accents(tf))
        self.quants = run_stage('quants', lambda: Quants(tf).quants)
        self.preps = run_stage('preps', lambda: Preps(tf).preps)
        self.noms = run_stage(
            'nominals', lambda: Nominals(tf, preps=self.preps).nominals)

        # Sets shared by the pair-based stages below
        shared_sets = dict(quants=self.quants, preps=self.preps, noms=self.noms)

        self.conj = run_stage(
            'conjunction pairs', lambda: Conjunction(tf, **shared_sets))
        self.cons = run_stage(
            'construct pairs', lambda: Construct(tf, **shared_sets))
コード例 #3
0
ファイル: pydic_base.py プロジェクト: ymachkivskiy/pydic
    def __init__(self, path):
        """
        Load a dictionary from ``path``.

        A directory is read through ``read_pydic_index()``; a regular file is
        indexed through ``make_memory_pydic_index()``.

        :param path: directory or file to load the dictionary from
        :raises RuntimeError: when ``path`` is neither a directory nor a file
        """
        self.path = path

        is_directory = os.path.isdir(self.get_path())
        # Guard clause: reject anything that is neither a directory nor a file
        if not is_directory and not os.path.isfile(self.path):
            raise RuntimeError("Wrong pydic input resource")

        if is_directory:
            self.read_pydic_index(self.get_path())
        else:
            self.make_memory_pydic_index(self.get_path())

        self.accents = Accents()
        self.memory_recno = ''
コード例 #4
0
ファイル: shell.py プロジェクト: micahkemp/on-air-sign
 def inner_cube_accents(self):
     """
     Return the component produced by an ``Accents`` object configured
     from this object's ``inner_*`` dimensions and ``accent_count``.
     """
     settings = dict(
         name="inner_cube_accents",
         accent_radius=self.inner_accent_radius,
         edge_radius=self.inner_edge_radius,
         length=self.inner_length,
         center_length=self.inner_accent_center_length,
         width=self.inner_width,
         height=self.inner_height,
         vertical_margin=self.inner_accent_vertical_margin,
         count=self.accent_count,
         offset=self.inner_accent_offset,
     )
     accents = Accents(**settings)
     return accents.component()
コード例 #5
0
ファイル: shell.py プロジェクト: micahkemp/on-air-sign
 def outer_cube_accents(self):
     """
     Return the component produced by an ``Accents`` object configured
     from this object's outer dimensions and ``accent_count``.
     """
     settings = dict(
         name="outer_cube_accents",
         accent_radius=self.accent_radius,
         edge_radius=self.edge_radius,
         length=self.length,
         center_length=self.accent_center_length,
         width=self.width,
         height=self.height,
         vertical_margin=self.accent_vertical_margin,
         count=self.accent_count,
     )
     accents = Accents(**settings)
     return accents.component()
コード例 #6
0
ファイル: pydic_base.py プロジェクト: ymachkivskiy/pydic
class PyDic(object):
    """
    Abstraction layer for accessing single dictionary

    The dictionary is loaded either from a directory holding a prebuilt index
    (its record files are memory-mapped) or from a plain text file that is
    indexed in memory when the object is created.
    """
    # struct format of one record offset stored in the recno index
    RECNO_INDEX_FMT = '<L'
    # record format handed to marisa_trie.RecordTrie
    MARISA_HASH_FMT = '<L'
    DIR_EXTENSION = 'pydic'
    # separates the shared prefix and the suffixes inside one stored record
    INTERNAL_DELIMITER = ':'

    def __init__(self, path):
        """
        Load a dictionary from ``path``.

        :param path: directory with a prebuilt index, or a text file to index
        :raises RuntimeError: when ``path`` is neither a directory nor a file
        """
        self.path = path
        if os.path.isdir(self.get_path()):
            self.read_pydic_index(self.get_path())
        elif os.path.isfile(self.path):
            self.make_memory_pydic_index(self.get_path())
        else:
            raise RuntimeError("Wrong pydic input resource")
        self.accents = Accents()

        self.memory_recno = ''

    def __iter__(self):
        """
        Iterate over a PyDicId for every record, numbered from 1 to ``len(self)``.
        """
        return imap(lambda i: PyDicId(i, self.name), xrange(1, len(self) + 1))

    def __len__(self):
        """
        Number of records in the dictionary.
        """
        return self.recno_size

    def is_inmemory(self):
        """
        Checks if dictionary is in in-memory only mode. It is needed by other modules
        willing to write some intermediate file structures to pydic folder.
        """
        return os.path.isfile(self.path)

    def get_path(self, join_with=None):
        """
        Returns the dictionary path, optionally joined with a file name.

        :param join_with: name to append to the dictionary path
        :return: ``self.path`` or ``self.path`` joined with ``join_with``
        """
        if join_with:
            return os.path.join(self.path, join_with)
        return self.path

    def forms_for_prefix(self, prefix):
        """
        :param prefix: prefix of words to search
        :type prefix: unicode
        :return: list of words which have given prefix
        """

        return self.hash.keys(prefix)

    def id(self, form):
        """
        Returns a list of PyDicId that match a given word form

        :param form: word form
        :type form: unicode
        :return: list of PyDicId or empty list
        """
        try:
            records = self.hash[form.lower()]
        except KeyError:
            return []
        return [PyDicId(record[0], self.name) for record in records]

    def a_id(self, form):
        """
        Accents agnostic version of method ``id()``

        :param form: form
        :type form: unicode
        :return: list of PyDicId or empty list
        """
        ids = set(self.id(form))
        for w in self.accents.make_accents(form):
            ids.update(self.id(w))
        return list(ids)

    @require_valid_pydic_id
    def id_forms(self, pydic_id):
        """
        Returns list of forms for a given PyDicId

        :param pydic_id: PyDicId or string id
        :type pydic_id: PyDicId, string
        :return: list of unicode strings or empty list
        """
        if self.is_inmemory():
            position = pydic_id.id - 1
            # A negative index would silently wrap around to the end of the
            # list and return an unrelated record
            if position < 0:
                return []
            try:
                offset = self.recno_index[position]
            except IndexError:
                return []
        else:
            entry_size = struct.calcsize(PyDic.RECNO_INDEX_FMT)
            try:
                self.recno_index.seek((pydic_id.id - 1) * entry_size)
                offset = struct.unpack(PyDic.RECNO_INDEX_FMT,
                                       self.recno_index.read(entry_size))[0]
            except (ValueError, struct.error):
                # ValueError: seek outside the mapped index;
                # struct.error: short read at the very end of the index
                return []

        self.recno.seek(offset)
        return self.__decode_form(self.recno.readline().rstrip().decode('utf-8'))

    def word_forms(self, form):
        """
        Returns list of list of forms for a given form

        :param form: word form
        :type form: unicode
        :return: list of lists of unicode strings or empty list
        """
        return [self.id_forms(pydic_id) for pydic_id in self.id(form)]

    def a_word_forms(self, form, mapping=AccentsTable.PL):
        """
        Accent agnostic version of word_forms method.

        :param form: word form
        :type form: unicode
        :param mapping: accepted for interface compatibility; not used here
        :return: list of lists of unicode strings or empty list
        """
        return [self.id_forms(pydic_id) for pydic_id in self.a_id(form)]

    def __decode_form(self, string):
        """
        Internal function to decode string format stored in Recno

        :param string: stored record, ``prefix:suffix1:suffix2...``
        :return: list of forms rebuilt as prefix + suffix
        """
        bits = string.split(PyDic.INTERNAL_DELIMITER)
        prefix = bits[0]
        return [prefix + suffix for suffix in bits[1:]]

    @require_valid_pydic_id
    def id_base(self, pydic_id):
        """
        Returns a base form of word given as PyDicId

        :param pydic_id: PyDicId
        :type pydic_id: PyDicId, string
        :return: unicode string or ``None``
        """
        try:
            return self.id_forms(pydic_id)[0]
        except IndexError:
            return None

    def word_base(self, form):
        """
        Returns a list of base forms of form

        :param form: word form
        :type form: unicode string
        :return: list of unicode strings or empty list
        """
        return list(set(self.id_base(pydic_id) for pydic_id in self.id(form)))

    def a_word_base(self, form):
        """
        Accents agnostic version of ``word_base()`` method

        :param form: word form
        :type form: unicode string
        :return: list of unicode strings or empty list
        """
        return list(set(self.id_base(pydic_id) for pydic_id in self.a_id(form)))

    def read_pydic_index(self, dic_path):
        """
        Loads name, forms trie and memory-mapped record files of a prebuilt
        dictionary. File names are resolved through ``get_path()``.

        :param dic_path: stored on ``self.dic_path``
        """
        self.dic_path = dic_path

        with open(self.get_path(NAME_FILENAME)) as name_file:
            self.name = name_file.read().strip()
        self.hash = marisa_trie.RecordTrie(PyDic.MARISA_HASH_FMT)
        self.hash.load(self.get_path(FORMS_HASH_FILENAME))

        # The mmap objects keep their own handle on the files, so the Python
        # file objects can be closed as soon as the mappings exist
        with open(self.get_path(FORMS_RECNO_FILENAME), 'r+b') as recno_file:
            self.recno = mmap(recno_file.fileno(), 0)
        with open(self.get_path(FORMS_RECNO_INDEX_FILENAME), 'r+b') as recno_index_file:
            self.recno_index = mmap(recno_index_file.fileno(), 0)
        # Floor division: the record count has to stay an integer
        self.recno_size = self.recno_index.size() // struct.calcsize(
            PyDic.RECNO_INDEX_FMT)

    def make_memory_pydic_index(self, from_source, name=None, delimiter=',',
                                verbose=False):
        """
        Builds an in-memory index from a text file.

        :param from_source: path of the text file to index
        :param name: forwarded to ``make_pydic_index()``
        :param delimiter: separator between forms on one line
        :param verbose: forwarded to ``make_pydic_index()``
        """
        # The trie constructor consumes the whole source, so the file can be
        # closed right after the index is built
        with open(from_source) as source:
            self.hash, self.recno, self.recno_index = PyDic.make_pydic_index(
                from_source=source,
                to_path=None,
                name=name,
                delimiter=delimiter,
                verbose=verbose)

        # NOTE(review): ``name`` is not used for self.name; the source path is
        # - confirm this is intended
        self.name = from_source
        self.recno_size = len(self.recno_index)

    @staticmethod
    def make_pydic_index(from_source, to_path, name, delimiter=',', verbose=False):
        """
        Builds a dictionary index from an iterable of delimited lines.

        :param from_source: iterable of lines, each holding delimited forms
        :param to_path: target directory, or ``None`` to keep everything in memory
        :param name: dictionary name written to the name file (only with ``to_path``)
        :param delimiter: separator between forms on one line
        :param verbose: when true, prints every indexed record to stderr
        :return: tuple ``(forms trie, recno, recno_index)``; with ``to_path``
            the last two are open files, otherwise a StringIO and a list
        :raises ConfigurationErrorException: when ``to_path`` already holds a
            name file
        """
        if to_path is not None and os.path.exists(os.path.join(to_path, NAME_FILENAME)):
            raise ConfigurationErrorException(
                'Cowardly refusing to create dictionary in non empty directory')

        if to_path is not None and not os.path.exists(to_path):
            os.makedirs(to_path)

        if to_path is not None:
            with open(os.path.join(to_path, NAME_FILENAME), 'w+') as name_file:
                name_file.write(name.encode('utf-8') + '\n')

        recno = StringIO()
        recno_index = []
        if to_path is not None:
            recno = open(os.path.join(to_path, FORMS_RECNO_FILENAME), 'w+b')
            recno_index = open(os.path.join(to_path, FORMS_RECNO_INDEX_FILENAME), 'w+b')

        def get_next_form(recno, recno_index, ):
            # Writes one record per non-empty source line and yields
            # (lowercased form, (record number,)) pairs for the trie

            file_offset = 0
            recno_counter = 0

            for line in from_source:
                bits = [raw.strip().decode('utf-8') for raw in line.split(delimiter)]
                bits = [bit for bit in bits if bit]
                if bits:
                    recno_counter += 1

                    bits = list(OrderedDict.fromkeys(bits))  # stable unique
                    bits_prefixed = PyDic.common_prefix(bits)

                    bits_str = (PyDic.INTERNAL_DELIMITER.join(bits_prefixed)).encode(
                        'utf-8') + '\n'

                    recno.write(bits_str)

                    if to_path is None:
                        recno_index.append(file_offset)
                    else:
                        recno_index.write(
                            struct.pack(PyDic.RECNO_INDEX_FMT, file_offset))

                    if verbose:
                        print >> sys.stderr, "[", recno_counter, "]", bits[0]

                    for bit in bits:
                        yield bit.lower(), (recno_counter, )
                    file_offset += len(bits_str)
            # The generator simply ends here; an explicit ``raise StopIteration``
            # is redundant and turns into RuntimeError under PEP 479

        forms_index = marisa_trie.RecordTrie(PyDic.MARISA_HASH_FMT,
                                             get_next_form(recno, recno_index))
        if to_path is not None:
            # Push buffered records to disk; the files stay open for the caller
            recno.flush()
            recno_index.flush()
            forms_index.save(os.path.join(to_path, FORMS_HASH_FILENAME))

        return forms_index, recno, recno_index

    @staticmethod
    def common_prefix(word_list):
        """
        For a list of words produces a list of [optimal prefix, suffix1, suffix2...]

        :param word_list: words to factor
        :return: longest shared prefix followed by the remainder of each word;
            ``['']`` for an empty input
        """
        words = list(word_list)
        # os.path.commonprefix compares character by character
        prefix = os.path.commonprefix(words)
        cut = len(prefix)
        return [prefix] + [word[cut:] for word in words]
コード例 #7
0
ファイル: pydic_base.py プロジェクト: pombredanne/pydic
class PyDic(object):
    """
    Abstraction layer for accessing single dictionary

    The dictionary is loaded either from a directory holding a prebuilt index
    (its record files are memory-mapped) or from a plain text file that is
    indexed in memory when the object is created.
    """
    # struct format of one record offset stored in the recno index
    RECNO_INDEX_FMT = '<L'
    # record format handed to marisa_trie.RecordTrie
    MARISA_HASH_FMT = '<L'
    DIR_EXTENSION = 'pydic'
    # separates the shared prefix and the suffixes inside one stored record
    INTERNAL_DELIMITER = ':'

    def __init__(self, path):
        """
        Load a dictionary from ``path``.

        :param path: directory with a prebuilt index, or a text file to index
        :raises RuntimeError: when ``path`` is neither a directory nor a file
        """
        self.path = path
        if os.path.isdir(self.get_path()):
            self.read_pydic_index(self.get_path())
        elif os.path.isfile(self.path):
            self.make_memory_pydic_index(self.get_path())
        else:
            raise RuntimeError("Wrong pydic input resource")
        self.accents = Accents()

        self.memory_recno = ''

    def __iter__(self):
        """
        Iterate over a PyDicId for every record, numbered from 1 to ``len(self)``.
        """
        return imap(lambda i: PyDicId(i, self.name), xrange(1, len(self) + 1))

    def __len__(self):
        """
        Number of records in the dictionary.
        """
        return self.recno_size

    def is_inmemory(self):
        """
        Checks if dictionary is in in-memory only mode. It is needed by other modules
        willing to write some intermediate file structures to pydic folder.
        """
        return os.path.isfile(self.path)

    def get_path(self, join_with=None):
        """
        Returns the dictionary path, optionally joined with a file name.

        :param join_with: name to append to the dictionary path
        :return: ``self.path`` or ``self.path`` joined with ``join_with``
        """
        if join_with:
            return os.path.join(self.path, join_with)
        return self.path

    def id(self, form):
        """
        Returns a list of PyDicId that match a given word form

        :param form: word form
        :type form: unicode
        :return: list of PyDicId or empty list
        """
        try:
            records = self.hash[form.lower()]
        except KeyError:
            return []
        return [PyDicId(record[0], self.name) for record in records]

    def a_id(self, form):
        """
        Accents agnostic version of method ``id()``

        :param form: form
        :type form: unicode
        :return: list of PyDicId or empty list
        """
        ids = set(self.id(form))
        for w in self.accents.make_accents(form):
            ids.update(self.id(w))
        return list(ids)

    @require_valid_pydic_id
    def id_forms(self, pydic_id):
        """
        Returns list of forms for a given PyDicId

        :param pydic_id: PyDicId or string id
        :type pydic_id: PyDicId, string
        :return: list of unicode strings or empty list
        """
        if self.is_inmemory():
            position = pydic_id.id - 1
            # A negative index would silently wrap around to the end of the
            # list and return an unrelated record
            if position < 0:
                return []
            try:
                offset = self.recno_index[position]
            except IndexError:
                return []
        else:
            entry_size = struct.calcsize(PyDic.RECNO_INDEX_FMT)
            try:
                self.recno_index.seek((pydic_id.id - 1) * entry_size)
                offset = struct.unpack(PyDic.RECNO_INDEX_FMT,
                                       self.recno_index.read(entry_size))[0]
            except (ValueError, struct.error):
                # ValueError: seek outside the mapped index;
                # struct.error: short read at the very end of the index
                return []

        self.recno.seek(offset)
        return self.__decode_form(self.recno.readline().rstrip().decode('utf-8'))

    def word_forms(self, form):
        """
        Returns list of list of forms for a given form

        :param form: word form
        :type form: unicode
        :return: list of lists of unicode strings or empty list
        """
        return [self.id_forms(pydic_id) for pydic_id in self.id(form)]

    def a_word_forms(self, form, mapping=AccentsTable.PL):
        """
        Accent agnostic version of word_forms method.

        :param form: word form
        :type form: unicode
        :param mapping: accepted for interface compatibility; not used here
        :return: list of lists of unicode strings or empty list
        """
        return [self.id_forms(pydic_id) for pydic_id in self.a_id(form)]

    def __decode_form(self, string):
        """
        Internal function to decode string format stored in Recno

        :param string: stored record, ``prefix:suffix1:suffix2...``
        :return: list of forms rebuilt as prefix + suffix
        """
        bits = string.split(PyDic.INTERNAL_DELIMITER)
        prefix = bits[0]
        return [prefix + suffix for suffix in bits[1:]]

    @require_valid_pydic_id
    def id_base(self, pydic_id):
        """
        Returns a base form of word given as PyDicId

        :param pydic_id: PyDicId
        :type pydic_id: PyDicId, string
        :return: unicode string or ``None``
        """
        try:
            return self.id_forms(pydic_id)[0]
        except IndexError:
            return None

    def word_base(self, form):
        """
        Returns a list of base forms of form

        :param form: word form
        :type form: unicode string
        :return: list of unicode strings or empty list
        """
        return list(set(self.id_base(pydic_id) for pydic_id in self.id(form)))

    def a_word_base(self, form):
        """
        Accents agnostic version of ``word_base()`` method

        :param form: word form
        :type form: unicode string
        :return: list of unicode strings or empty list
        """
        return list(set(self.id_base(pydic_id) for pydic_id in self.a_id(form)))

    def read_pydic_index(self, dic_path):
        """
        Loads name, forms trie and memory-mapped record files of a prebuilt
        dictionary. File names are resolved through ``get_path()``.

        :param dic_path: stored on ``self.dic_path``
        """
        self.dic_path = dic_path

        with open(self.get_path(NAME_FILENAME)) as name_file:
            self.name = name_file.read().strip()
        self.hash = marisa_trie.RecordTrie(PyDic.MARISA_HASH_FMT)
        self.hash.load(self.get_path(FORMS_HASH_FILENAME))

        # The mmap objects keep their own handle on the files, so the Python
        # file objects can be closed as soon as the mappings exist
        with open(self.get_path(FORMS_RECNO_FILENAME), 'r+b') as recno_file:
            self.recno = mmap(recno_file.fileno(), 0)
        with open(self.get_path(FORMS_RECNO_INDEX_FILENAME), 'r+b') as recno_index_file:
            self.recno_index = mmap(recno_index_file.fileno(), 0)
        # Floor division: the record count has to stay an integer
        self.recno_size = self.recno_index.size() // struct.calcsize(
            PyDic.RECNO_INDEX_FMT)

    def make_memory_pydic_index(self, from_source, name=None, delimiter=',',
                                verbose=False):
        """
        Builds an in-memory index from a text file.

        :param from_source: path of the text file to index
        :param name: forwarded to ``make_pydic_index()``
        :param delimiter: separator between forms on one line
        :param verbose: forwarded to ``make_pydic_index()``
        """
        # The trie constructor consumes the whole source, so the file can be
        # closed right after the index is built
        with open(from_source) as source:
            self.hash, self.recno, self.recno_index = PyDic.make_pydic_index(
                from_source=source,
                to_path=None,
                name=name,
                delimiter=delimiter,
                verbose=verbose)

        # NOTE(review): ``name`` is not used for self.name; the source path is
        # - confirm this is intended
        self.name = from_source
        self.recno_size = len(self.recno_index)

    @staticmethod
    def make_pydic_index(from_source, to_path, name, delimiter=',', verbose=False):
        """
        Builds a dictionary index from an iterable of delimited lines.

        :param from_source: iterable of lines, each holding delimited forms
        :param to_path: target directory, or ``None`` to keep everything in memory
        :param name: dictionary name written to the name file (only with ``to_path``)
        :param delimiter: separator between forms on one line
        :param verbose: when true, prints every indexed record to stderr
        :return: tuple ``(forms trie, recno, recno_index)``; with ``to_path``
            the last two are open files, otherwise a StringIO and a list
        :raises ConfigurationErrorException: when ``to_path`` already holds a
            name file
        """
        if to_path is not None and os.path.exists(os.path.join(to_path, NAME_FILENAME)):
            raise ConfigurationErrorException(
                'Cowardly refusing to create dictionary in non empty directory')

        if to_path is not None and not os.path.exists(to_path):
            os.makedirs(to_path)

        if to_path is not None:
            with open(os.path.join(to_path, NAME_FILENAME), 'w+') as name_file:
                name_file.write(name.encode('utf-8') + '\n')

        recno = StringIO()
        recno_index = []
        if to_path is not None:
            recno = open(os.path.join(to_path, FORMS_RECNO_FILENAME), 'w+b')
            recno_index = open(os.path.join(to_path, FORMS_RECNO_INDEX_FILENAME), 'w+b')

        def get_next_form(recno, recno_index, ):
            # Writes one record per non-empty source line and yields
            # (lowercased form, (record number,)) pairs for the trie

            file_offset = 0
            recno_counter = 0

            for line in from_source:
                bits = [raw.strip().decode('utf-8') for raw in line.split(delimiter)]
                bits = [bit for bit in bits if bit]
                if bits:
                    recno_counter += 1

                    bits = list(OrderedDict.fromkeys(bits))  # stable unique
                    bits_prefixed = PyDic.common_prefix(bits)

                    bits_str = (PyDic.INTERNAL_DELIMITER.join(bits_prefixed)).encode(
                        'utf-8') + '\n'

                    recno.write(bits_str)

                    if to_path is None:
                        recno_index.append(file_offset)
                    else:
                        recno_index.write(
                            struct.pack(PyDic.RECNO_INDEX_FMT, file_offset))

                    if verbose:
                        print >> sys.stderr, "[", recno_counter, "]", bits[0]

                    for bit in bits:
                        yield bit.lower(), (recno_counter, )
                    file_offset += len(bits_str)
            # The generator simply ends here; an explicit ``raise StopIteration``
            # is redundant and turns into RuntimeError under PEP 479

        forms_index = marisa_trie.RecordTrie(PyDic.MARISA_HASH_FMT,
                                             get_next_form(recno, recno_index))
        if to_path is not None:
            # Push buffered records to disk; the files stay open for the caller
            recno.flush()
            recno_index.flush()
            forms_index.save(os.path.join(to_path, FORMS_HASH_FILENAME))

        return forms_index, recno, recno_index

    @staticmethod
    def common_prefix(word_list):
        """
        For a list of words produces a list of [optimal prefix, suffix1, suffix2...]

        :param word_list: words to factor
        :return: longest shared prefix followed by the remainder of each word;
            ``['']`` for an empty input
        """
        words = list(word_list)
        # os.path.commonprefix compares character by character
        prefix = os.path.commonprefix(words)
        cut = len(prefix)
        return [prefix] + [word[cut:] for word in words]
コード例 #8
0
from accents import Accents
from tf.app import use
# Load the 'bhsa' app through tf.app.use; this module's globals() dict is
# passed as ``hoist`` (presumably so use() can inject names here - confirm
# against the Text-Fabric documentation).
A = use('bhsa', hoist=globals(), silent=True)

# Construct the Accents helper from the loaded app object.
AObj = Accents(A)
# Bare expressions: their values are discarded when run as a script and are
# only displayed in an interactive / notebook session.
# Top-level keys of the atype2name2set mapping:
AObj.atype2name2set.keys()
# Keys of the nested mapping stored under "disjunct":
AObj.atype2name2set["disjunct"].keys()