Code Example #1
File: basvoc.py, Project: tjade273/lingpy
    def __init__(self, infile=None, col='list', row='key', conf=None):
        QLCParserWithRowsAndCols.__init__(
            self, infile or util.data_path('swadesh', 'swadesh.qlc'), row, col,
            conf or util.data_path('conf', 'swadesh.rc'))

        # get row and key index
        if not hasattr(self, '_rowidx'):
            # add indices to alias dictionary for swadesh lists
            for i, col in enumerate(self.cols):
                self._meta[col] = self._array[np.nonzero(self._array[:, i]),
                                              i][0]
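A minimal usage sketch for this constructor (the enclosing class and its import path, lingpy.meaning.basvoc.BasVoc, are assumptions, since the snippet only shows the method): with no arguments it falls back to the Swadesh data bundled with LingPy.

    from lingpy.meaning.basvoc import BasVoc  # import path assumed

    wl = BasVoc()        # infile=None -> util.data_path('swadesh', 'swadesh.qlc')
    print(wl.cols[:3])   # the Swadesh lists registered as columns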
Code Example #2
def compile_dvt(path=''):
    """
    Function compiles diacritics, vowels, and tones.

    Notes
    -----
    Diacritics, vowels, and tones are defined in the :file:`data/models/dvt/` directory
    of the LingPy package and automatically loaded when loading the LingPy
    library. The values are defined as the constants
    :py:obj:`rcParams['vowels']`, :py:obj:`rcParams['diacritics']`, and
    :py:obj:`rcParams['tones']`. Their core purpose is to guide the
    tokenization of IPA strings (cf.
    :py:func:`~lingpy.sequence.sound_classes.ipa2tokens`). In order to change the
    variables, one simply has to change the text files :file:`diacritics`,
    :file:`tones`, and
    :file:`vowels` in the :file:`data/models/dvt` directory. The structure of
    these files is fairly simple: each line contains a vowel or a diacritic
    character, with diacritics preceded by a dash.
    
    See also
    --------
    lingpy.data.model.Model
    lingpy.data.derive.compile_model
    """
    log.info("Compiling diacritics and vowels...")

    # get the path to the models
    if not path:
        file_path = util.data_path('models', 'dvt')
    elif path in ['evolaemp', 'el']:
        file_path = util.data_path('models', 'dvt_el')
    else:
        file_path = path

    def _read_string(name):
        # normalize stuff
        # TODO: this is potentially dangerous and it is important to decide whether
        # TODO: switching to NFD might not be a better choice
        return util.read_text_file(os.path.join(file_path, name),
                                   normalize='NFC').replace('\n', '')

    diacritics = _read_string('diacritics').replace('-', '')
    vowels = ''.join(
        [v for v in _read_string('vowels') if v not in diacritics])
    tones = _read_string('tones')

    dvt = (diacritics, vowels, tones)

    if path in ['evolaemp', 'el']:
        cache.dump(dvt, 'dvt_el')
    else:
        cache.dump(dvt, 'dvt')

    log.info("Diacritics and sound classes were successfully compiled.")
Code Example #3
File: derive.py, Project: xrotwang/lingpy
def compile_dvt(path=''):
    """
    Function compiles diacritics, vowels, and tones.

    Notes
    -----
    Diacritics, vowels, and tones are defined in the :file:`data/models/dvt/` directory
    of the LingPy package and automatically loaded when loading the LingPy
    library. The values are defined as the constants
    :py:obj:`rcParams['vowels']`, :py:obj:`rcParams['diacritics']`, and
    :py:obj:`rcParams['tones']`. Their core purpose is to guide the
    tokenization of IPA strings (cf.
    :py:func:`~lingpy.sequence.sound_classes.ipa2tokens`). In order to change the
    variables, one simply has to change the text files :file:`diacritics`,
    :file:`tones`, and
    :file:`vowels` in the :file:`data/models/dvt` directory. The structure of
    these files is fairly simple: each line contains a vowel or a diacritic
    character, with diacritics preceded by a dash.
    
    See also
    --------
    lingpy.data.model.Model
    lingpy.data.derive.compile_model
    """
    log.info("Compiling diacritics and vowels...")

    # get the path to the models
    if not path:
        file_path = util.data_path('models', 'dvt')
    elif path in ['evolaemp', 'el']:
        file_path = util.data_path('models', 'dvt_el')
    else:
        file_path = path

    def _read_string(name):
        # normalize stuff
        # TODO: this is potentially dangerous and it is important to decide whether
        # TODO: switching to NFD might not be a better choice
        return util.read_text_file(
            os.path.join(file_path, name), normalize='NFC').replace('\n', '')

    diacritics = _read_string('diacritics').replace('-', '')
    vowels = ''.join([v for v in _read_string('vowels') if v not in diacritics])
    tones = _read_string('tones')

    dvt = (diacritics, vowels, tones)

    if path in ['evolaemp', 'el']:
        cache.dump(dvt, 'dvt_el')
    else:
        cache.dump(dvt, 'dvt')

    log.info("Diacritics and sound classes were successfully compiled.")
Code Example #4
File: basvoc.py, Project: anukat2015/lingpy
    def __init__(self, infile=None, col='list', row='key', conf=None):
        QLCParserWithRowsAndCols.__init__(
            self,
            infile or util.data_path('swadesh', 'swadesh.qlc'),
            row,
            col,
            conf or util.data_path('conf', 'swadesh.rc'))

        # get row and key index
        if not hasattr(self, '_rowidx'):
            # add indices to alias dictionary for swadesh lists
            for i, col in enumerate(self.cols):
                self._meta[col] = self._array[np.nonzero(self._array[:, i]), i][0]
Code Example #5
File: test_parser.py, Project: kadster/lingpy
def test_init(test_data):
    p = QLCParser({0: ['a']})
    QLCParser(p)
    with pytest.raises(IOError):
        QLCParser('not-existing-file')
    with pytest.raises(TypeError):
        QLCParser(None)
    with pytest.raises(ValueError):
        QLCParserWithRowsAndCols({0: ['a']}, 'x', 'y', {})

    with pytest.raises(ValueError):
        QLCParserWithRowsAndCols(
            {
                0: ['concept', 'language', 'bla'],
                1: ['bla', 'blu']
            }, 'concept', 'language', '')

    p2 = QLCParserWithRowsAndCols(str(test_data / 'bad_file2.tsv'), 'concept',
                                  'language', data_path('conf', 'wordlist.rc'))

    assert p2.get_entries('cogid')[0][-1] == 'ff'
    with pytest.raises(KeyError):
        p2.__getitem__(tuple([2000, 'bla']))
    assert p2[3, 'language'] == 'l3'
    assert p2[3, 'nothing'] is None
Code Example #6
File: wordlist.py, Project: kadster/lingpy
    def __init__(self, filename, row='concept', col='doculect', conf=None):
        QLCParserWithRowsAndCols.__init__(
            self, filename, row, col, conf or util.data_path('conf', 'wordlist.rc'))

        # setup other local temporary storage
        self._etym_dict = {}

        # check for taxa in meta
        if 'taxa' in self._alias:
            if self._alias['taxa'] not in self._meta:
                self._meta[self._alias['taxa']] = self.cols
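A hedged usage example: since conf defaults to None, the packaged wordlist.rc is picked up automatically; the input filename below is hypothetical.

    from lingpy import Wordlist

    wl = Wordlist('my_wordlist.tsv')  # hypothetical file with concept/doculect columns
    print(wl.height, wl.width)        # number of concepts and of doculects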
Code Example #7
File: wordlist.py, Project: LinguList/lingpy
    def __init__(self, filename, row='concept', col='doculect', conf=None):
        QLCParserWithRowsAndCols.__init__(
            self, filename, row, col, conf or util.data_path('conf', 'wordlist.rc'))

        # setup other local temporary storage
        self._etym_dict = {}

        # check for taxa in meta
        if 'taxa' in self._alias:
            if self._alias['taxa'] not in self._meta:
                self._meta[self._alias['taxa']] = self.cols
Code Example #8
    def test_init(self):
        p = QLCParser({0: ['a']})
        QLCParser(p)
        self.assertRaises(IOError, QLCParser, 'not-existing-file')
        self.assertRaises(TypeError, QLCParser, None)
        self.assertRaises(ValueError, QLCParserWithRowsAndCols, {0: ['a']}, 'x', 'y', {})

        self.assertRaises(ValueError, QLCParserWithRowsAndCols,
                          {0: ['concept', 'language', 'bla'],
                           1: ['bla', 'blu']}, 'concept', 'language', '')

        p2 = QLCParserWithRowsAndCols(test_data('bad_file2.tsv'), 'concept',
                                      'language', data_path('conf', 'wordlist.rc'))
        assert p2.get_entries('cogid')[0][-1] == 'ff'
        self.assertRaises(KeyError, p2.__getitem__, tuple([2000, 'bla']))
        assert p2[3, 'language'] == 'l3'
        assert p2[3, 'nothing'] is None
Code Example #9
    def __init__(self, model, path=None):
        new_path = lambda *cmps: \
            os.path.join(path or util.data_path('models'), model, *cmps)
        self.name = model

        # try to load the converter
        try:
            self.converter = cache.load(model + '.converter')
        except:
            compile_model(model, path)
            self.converter = cache.load(model + '.converter')

        # always give preference to scorer matrix files
        if os.path.isfile(new_path('matrix')):
            self.scorer = read_scorer(new_path('matrix'))
        elif os.path.isfile(new_path('scorer.bin')):
            try:
                self.scorer = cache.load(model + '.scorer')
            except compat.FileNotFoundError:
                pass
        # if none of the above fits, leave it
        else:
            pass

        # read information from the info-file
        self.info = {}

        info = util.read_text_file(new_path('INFO'))
        data = ['description', 'compiler', 'source', 'date', 'vowels', 'tones']

        for line in data:
            try:
                self.info[line] = re.findall('@' + line + ': (.*)', info)[0]
            except:
                self.info[line] = 'unknown'

        # check for vowels and tones
        if "vowels" in self.info:
            self.vowels = self.info['vowels']
        if "tones" in self.info:
            self.tones = self.info['tones']
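A usage sketch for this constructor; that 'sca' names a sound-class model bundled with LingPy is an assumption, and the attribute access mirrors the code above:

    from lingpy.data.model import Model

    sca = Model('sca')  # compiled via compile_model on first use, then cached
    print(sca.name, sca.info.get('compiler', 'unknown'))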
Code Example #10
File: model.py, Project: anukat2015/lingpy
    def __init__(self, model, path=None):
        new_path = lambda *cmps: \
            os.path.join(path or util.data_path('models'), model, *cmps)
        self.name = model

        # try to load the converter
        try:
            self.converter = cache.load(model + '.converter')
        except:
            compile_model(model, path)
            self.converter = cache.load(model + '.converter')

        # always give preference to scorer matrix files
        if os.path.isfile(new_path('matrix')):
            self.scorer = read_scorer(new_path('matrix'))
        elif os.path.isfile(new_path('scorer.bin')):
            try:
                self.scorer = cache.load(model + '.scorer')
            except compat.FileNotFoundError:
                pass
        # if none of the above fits, leave it
        else:
            pass

        # read information from the info-file
        self.info = {}

        info = util.read_text_file(new_path('INFO'))
        data = ['description', 'compiler', 'source', 'date', 'vowels', 'tones']

        for line in data:
            try:
                self.info[line] = re.findall('@' + line + ': (.*)', info)[0]
            except:
                self.info[line] = 'unknown'

        # check for vowels and tones
        if "vowels" in self.info:
            self.vowels = self.info['vowels']
        if "tones" in self.info:
            self.tones = self.info['tones']
Code Example #11
def read_conf(conf=''):
    # load the configuration file
    if not conf:
        conf = util.data_path('conf', 'qlc.rc')

    # read the file defined by its path in conf
    tmp = [line.split('\t') for line in util.read_config_file(conf)]

    aliasD, classD, class_stringD, alias2D = {}, {}, {}, {}
    for name, cls, alias in tmp:
        # make sure the name itself is there
        aliasD[name.lower()] = aliasD[name.upper()] = name
        classD[name.lower()] = classD[name.upper()] = eval(cls)
        class_stringD[name.lower()] = class_stringD[name.upper()] = cls

        # add the aliases
        for a in alias.split(','):
            aliasD[a.lower()] = aliasD[a.upper()] = name
            classD[a.lower()] = classD[a.upper()] = eval(cls)
            class_stringD[a.lower()] = class_stringD[a.upper()] = cls

        alias2D[name] = sorted(set(alias.split(','))) + [name]

    return aliasD, classD, class_stringD, alias2D
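A sketch of calling read_conf with its default configuration (grounded in the fallback above; that 'concept' is among the names defined in qlc.rc is an assumption):

    aliasD, classD, class_stringD, alias2D = read_conf()

    # each name maps to itself plus all of its aliases, in lower and upper case
    print(aliasD['concept'], class_stringD['concept'])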
Code Example #12
File: sampa.py, Project: vermillionbee/lingpy
@date: 2007/07/19
"""
from __future__ import print_function, division, unicode_literals
import re
import sys
import codecs

from lingpy.util import data_path


# data for sampa2ipa (Peter Kleiweg's implementation)
xsdata = []
_xsKeys = [' ']
xs = {' ': ' '}

for line in codecs.open(data_path('ipa', 'sampa.csv'), 'r', 'utf-8'):
    line = line.strip('\n').strip('\r')
    if line and not line.startswith('#'):
        key, val = line.split('\t')
        if key in xs and xs[key] != val:
            raise ValueError("Keys encode too many values.")
        _xsKeys.append(key)
        xs[key] = eval('"""' + val + '"""')

_kk = []
for _k in _xsKeys:
    _kk.append(re.escape(_k))
_kk.sort(reverse=True)  # long before short
_xsPat = '|'.join(_kk)
reXS = re.compile('(' + _xsPat + ')|(.)')
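An illustrative sketch (not part of the original module) of how the compiled reXS pattern could tokenize an X-SAMPA string: group 1 matches a known key, group 2 catches any single leftover character. The input string is hypothetical.

    def tokenize_xsampa(text):
        return [m.group(0) for m in reXS.finditer(text)]

    print(tokenize_xsampa('t_hOn'))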
Code Example #13
File: wordlist.py, Project: kadster/lingpy
    def from_cldf(
            cls, 
            path,
            columns=(
                'parameter_id',
                'concept_name',
                'language_id',
                'language_name',
                'value',
                'form',
                'segments',
                'language_glottocode',
                'concept_concepticon_id',
                'language_latitude',
                'language_longitude',
                'cognacy'
                ),
            namespace=(
               ('concept_name', 'concept'),
               ('language_id', 'doculect'),
               ('segments', 'tokens'),
               ('language_glottocode', 'glottolog'),
               ('concept_concepticon_id', 'concepticon'),
               ('language_latitude', 'latitude'),
               ('language_longitude', 'longitude'),
               ('cognacy', 'cognacy'),
               ('cogid_cognateset_id', 'cogid')
               ),
            filter=lambda row: row["form"],
            **kwargs):
        """Load a CLDF dataset.

        Open a CLDF Dataset, with or without metadata (only Wordlist
        datasets are supported for now, because other modules don't seem to
        make sense for LingPy), and transform it into this class. Columns from
        the FormTable are imported in lowercase; columns from LanguageTable,
        ParameterTable, and CognateTable are prefixed with `language_`,
        `concept_`, and `cogid_` and converted to lowercase.

        Notes
        -----
        CLDF's default column names for wordlists are different from LingPy's,
        so you probably have to use::

        >>> lingpy.Wordlist.from_cldf(
            "Wordlist-metadata.json",
            )

        in order to avoid errors from LingPy not finding required columns.

        Parameters
        ----------
        columns: list or tuple 
          The list of columns to import. (default: all columns)

        filter: function: rowdict → bool
          A condition function for importing only some rows. (default: lambda row: row["form"])

        All other parameters are passed on to the `cls` constructor.

        Returns
        -------
        A `cls` object representing the CLDF dataset

        """
        kw = {
                'row': 'concept',
                'col': 'doculect',
                'conf': util.data_path('conf', 'wordlist.rc'),
                }
        kwargs.update(kw)
        
        if isinstance(namespace, tuple):
            namespace = dict(namespace)

        # get the datatypes from the configuration for the namespaced columns
        datatypes = read_conf(kwargs['conf'])[1]

        # Load the dataset.
        fname = Path(path)
        if not fname.exists():
            raise FileNotFoundError('{:} does not exist'.format(fname))
        if fname.suffix == '.json':
            dataset = pycldf.dataset.Dataset.from_metadata(fname)
        else:
            dataset = pycldf.dataset.Dataset.from_data(fname)

        if dataset.module == "Wordlist":
            # First, make a list of cognate codes if they are in a separate table.
            cognateset_assignments = {}
            try:
                form_reference = dataset["CognateTable", "formReference"].name
                for row in dataset["CognateTable"].iterdicts():
                    cognateset_assignments[row[form_reference]] = row
            except KeyError:
                # Either there are no cognate codes, or they are in the form
                # table. Both options are fine.
                pass

            f_id = dataset["FormTable", "id"].name

            # Access columns by type, not by name.
            language_column = dataset["FormTable", "languageReference"].name
            parameter_column = dataset["FormTable", "parameterReference"].name

            try:
                l_id = dataset["LanguageTable", "id"].name
                languages = {l[l_id]: l
                            for l in dataset["LanguageTable"].iterdicts()}
            except KeyError:
                l_id = "ID"
                languages = bounce_as_id

            try:
                c_id = dataset["ParameterTable", "id"].name
                concepts = {c[c_id]: c
                            for c in dataset["ParameterTable"].iterdicts()}
            except KeyError:
                c_id = "ID"
                concepts = bounce_as_id


            # create dictionary
            D = {0: columns} # Reserve the header
            for row in dataset["FormTable"].iterdicts():
                # TODO: Improve prefixing behaviour
                s = {"cogid_{:}".format(key).lower(): value
                     for key, value in cognateset_assignments.get(
                             row[f_id], {}).items()}
                s.update(
                    {"language_{:}".format(key).lower(): value
                     for key, value in languages[row[language_column]].items()})
                s.update(
                    {"concept_{:}".format(key).lower(): value
                     for key, value in concepts[row[parameter_column]].items()})
                s.update({k.lower(): v for k, v in row.items()})

                if not filter(s):
                    continue

                # check for numeric ID
                try:
                    idx = int(row[f_id])
                except ValueError:
                    idx = len(D)
                while idx in D:
                    idx += 1

                if not D[0]:
                    columns = list(s.keys())
                    D[0] = [c.lower() for c in columns]

                D[idx] = [
                    datatypes.get(namespace.get(column, ''), lambda x: x)(
                        s.get(column, ''))
                    for column in columns]
            D[0] = [namespace.get(c, c) for c in columns]
            if len(D[0]) != len(set(D[0])):
                log.warning('|'.join(columns))
                log.warning('|'.join(D[0]))
                raise ValueError('name space clashes, cannot parse data')

            # convert to wordlist and return
            return cls(D, **kwargs)
        else:
            # For most LingPy applications, it might be best to see whether we got
            # a Wordlist module.
            raise ValueError("LingPy has no procedures for CLDF {:} data.".format(
                dataset.module))
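A usage sketch matching the docstring's own example; the metadata path is hypothetical:

    wl = Wordlist.from_cldf(
        'Wordlist-metadata.json',
        filter=lambda row: row['form'],  # the default filter, shown explicitly
    )
    print(wl.columns)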
Code Example #14
File: parser.py, Project: javiervz/lingpy
    def __init__(self, filename, conf=''):
        """
        Parse the data from scratch if it has not been loaded from a pickled version.
        """
        self.log = log.get_logger()

        # try to load the data
        internal_import = False

        # check whether it's a dictionary from which we load
        if isinstance(filename, dict):
            input_data = filename
            if 'filename' not in input_data:
                self.filename = rcParams['filename']
            internal_import = True
            # check that the input is well-formed; there was a bug with a
            # wrong evaluation which is hopefully fixed by now
            tmp_keys = [k for k in input_data if isinstance(k, int)]
            if len(input_data[0]) != len(input_data[tmp_keys[0]]):
                print(input_data[0], input_data[tmp_keys[0]])
                raise ValueError("[!] Wrong input format!")  # pragma: no cover
        # check whether it's another wordlist-object
        elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
            input_data = dict([(key, [v for v in value]) for key, value in \
                    filename._data.items()])
            input_data.update(filename._meta.items())
            input_data[0] = [
                a for a, b in sorted(
                    filename.header.items(), key=lambda x: x[1], reverse=False)
            ]
            internal_import = True
            self.filename = rcParams['filename']
        # or whether the data is an actual file
        elif isinstance(filename, string_types) and os.path.isfile(filename):
            input_data = read_qlc(filename)
            self.filename = filename
        # raise an error otherwise
        elif isinstance(filename, string_types):
            raise IOError("Input file '{0}' does not exist.".format(filename))
        else:
            raise TypeError(
                "Unrecognized type for 'filename' argument: {0}".format(
                    type(filename).__name__))

        # load the configuration file
        if not conf:
            conf = util.data_path('conf', 'qlc.rc')

        # read the file defined by its path in conf
        tmp = [line.split('\t') for line in util.read_config_file(conf)]

        # define two attributes, _alias, and _class which store the aliases and
        # the datatypes (classes) of the given entries
        self._alias, self._class, self._class_string, self._alias2 = {}, {}, {}, {}
        for name, cls, alias in tmp:
            # make sure the name itself is there
            self._alias[name.lower()] = self._alias[name.upper()] = name
            self._class[name.lower()] = self._class[name.upper()] = eval(cls)
            self._class_string[name.lower()] = self._class_string[
                name.upper()] = cls

            # add the aliases
            for a in alias.split(','):
                self._alias[a.lower()] = self._alias[a.upper()] = name
                self._class[a.lower()] = self._class[a.upper()] = eval(cls)
                self._class_string[a.lower()] = self._class_string[
                    a.upper()] = cls

            self._alias2[name] = sorted(set(alias.split(','))) + [name]

        # append the names in data[0] to self.conf to make sure that all data
        # is covered, even the types which are not specifically defined in the
        # conf file. the datatype defaults here to "str"
        for name in input_data[0]:
            if name.lower() not in self._alias:
                self._alias[name.lower()] = name.lower()
                self._class[name.lower()] = str
            if name.upper() not in self._alias:
                self._alias[name.upper()] = name.lower()
                self._class[name.upper()] = str

        # add empty alias for empty strings XXX why was that? I can't remember
        # why this was important XXX
        self._alias[''] = ''

        # the header stores the indices of the data in the original data dictionary
        self.header = dict(
            zip([self._alias[x] for x in input_data[0]],
                range(len(input_data[0]))))

        # now create a specific header which has all aliases
        self._header = {k: v for k, v in self.header.items()}

        # add a sorted header for reference
        self.columns = sorted(self.header, key=lambda x: self.header[x])

        # assign all aliases to the header
        for alias in self._alias:
            try:
                self._header[alias] = self._header[self._alias[alias]]
            except:
                pass

        # assign the data as attribute to the word list class. Note that we
        # need to check for the type here, but since numpy also offers integer
        # types, we don't check for type(x) == int, but instead use the
        # str.isnumeric method, which is True only if the key is an
        # integer
        self._data = {
            int(k): v
            for k, v in input_data.items() if k != 0 and str(k).isnumeric()
        }
        # check for same length of all columns
        check_errors = ''
        for k, v in self._data.items():
            if len(v) != len(self.header):
                check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                    k, len(v), len(self.header))
        if check_errors:
            raise ValueError(check_errors + '\n' +
                             ', '.join(sorted(self.header)))

        # iterate over self._data and change the values according to the
        # functions (only needed when reading from file)
        if not internal_import:
            heads = sorted(self._header.items(), key=lambda x: x[1])
            for key in self._data:
                check = []
                for head, i in heads:
                    if i not in check:
                        logstring = 'Problem with row {0} in col {1}, expected' + \
                                    ' «{4}» as datatype but received «{3}» ' + \
                                    ' (ROW: {2}, entry {5}).'
                        try:
                            self._data[key][i] = self._class[head](
                                self._data[key][i])
                            check.append(i)
                        except KeyError:
                            log.warning(
                                logstring.format(
                                    key, i,
                                    '|'.join([str(x) for x in self._data[key]
                                              ]), self._data[key][i],
                                    self._class[head], head))
                        except ValueError:
                            log.warning(
                                logstring.format(
                                    key, i,
                                    '|'.join([str(x) for x in self._data[key]
                                              ]), self._data[key][i],
                                    self._class[head], head))

        # create entry attribute of the wordlist
        self.entries = sorted(
            set([b.lower() for a, b in self._alias.items() if b]))

        # assign meta-data
        self._meta = {}
        for key in [k for k in input_data if type(k) != int]:
            self._meta[key] = input_data[key]
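A small sketch of the conf fallback in this constructor: passing no conf and passing the packaged default explicitly are equivalent (the input file is hypothetical):

    p1 = QLCParser('data.qlc')
    p2 = QLCParser('data.qlc', conf=util.data_path('conf', 'qlc.rc'))
    assert p1.header == p2.header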
Code Example #15
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------

    model : str
        A string indicating the name of the model which shall be created.

    path : str
        A string indicating the path where the model folder is stored.

    Notes
    -----
    A model is defined by a folder placed in the :file:`data/models` directory
    of the LingPy package. The name of the folder reflects the name of the
    model. It contains the files :file:`converter` and :file:`INFO`, plus the
    optional files :file:`matrix` and :file:`scorer`. The format requirements
    for these files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key is
        preceded by an ``@`` and followed by a colon and the value is written
        right next to the key in the same line, e.g.::
            
            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII letter) representing the
        class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...
    
    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...
        
        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary.  The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.
    
    See also
    --------
    lingpy.data.model.Model
    compile_dvt

    """
    log.info("Compiling model <" + model + ">...")
    # get the path to the models
    new_path = lambda *cmps: os.path.join(path or util.data_path('models'),
                                          model, *cmps)

    log.debug("Model-Path: %s" % new_path())  # call the lambda to log the model folder

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in range(len(chars))]
        for (i, charA), (j,
                         charB) in util.multicombinations2(enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
Code Example #16
File: sampa.py, Project: anukat2015/lingpy
@date: 2007/07/19
"""
from __future__ import print_function, division, unicode_literals
import re
import sys
import codecs

from lingpy.util import data_path


# data for sampa2ipa (Peter Kleiweg's implementation)
xsdata = []
_xsKeys = [' ']
xs = {' ': ' '}

for line in codecs.open(data_path('ipa', 'sampa.csv'), 'r', 'utf-8'):
    line = line.strip('\n').strip('\r')
    if line and not line.startswith('#'):
        key, val = line.split('\t')
        try:
            assert key not in xs
        except:
            sys.stderr.write(key + '\n')
            sys.stderr.flush()
        _xsKeys.append(key)
        xs[key] = eval('"""' + val + '"""')

_kk = []
for _k in _xsKeys:
    _kk.append(re.escape(_k))
_kk.sort(reverse=True)  # long before short
Code Example #17
File: parser.py, Project: lingpy/lingpy
    def __init__(self, filename, conf=''):
        """
        Parse the data from scratch if it has not been loaded from a pickled version.
        """
        self.log = log.get_logger()

        # try to load the data
        internal_import = False

        # check whether it's a dictionary from which we load
        if isinstance(filename, dict):
            input_data = filename
            if 'filename' not in input_data:
                self.filename = rcParams['filename']
            internal_import = True
            # check that the input is well-formed; there was a bug with a
            # wrong evaluation which is hopefully fixed by now
            tmp_keys = [k for k in input_data if isinstance(k, int)]
            if len(input_data[0]) != len(input_data[tmp_keys[0]]):
                print(input_data[0], input_data[tmp_keys[0]])
                raise ValueError("[!] Wrong input format!")  # pragma: no cover
        # check whether it's another wordlist-object
        elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
            input_data = dict([(key, [v for v in value]) for key, value in \
                    filename._data.items()])
            input_data.update(filename._meta.items())
            input_data[0] = [a for a, b in sorted(
                filename.header.items(),
                key=lambda x: x[1],
                reverse=False)]
            internal_import = True
            self.filename = rcParams['filename']
        # or whether the data is an actual file
        elif isinstance(filename, string_types) and os.path.isfile(filename):
            input_data = read_qlc(filename)
            self.filename = filename
        # raise an error otherwise
        elif isinstance(filename, string_types):
            raise IOError("Input file '{0}' does not exist.".format(filename))
        else:
            raise TypeError("Unrecognized type for 'filename' argument: {0}".format(
                type(filename).__name__))

        # load the configuration file
        if not conf:
            conf = util.data_path('conf', 'qlc.rc')

        # read the file defined by its path in conf
        tmp = [line.split('\t') for line in util.read_config_file(conf)]

        # define two attributes, _alias, and _class which store the aliases and
        # the datatypes (classes) of the given entries
        self._alias, self._class, self._class_string, self._alias2 = {}, {}, {}, {}
        for name, cls, alias in tmp:
            # make sure the name itself is there
            self._alias[name.lower()] = self._alias[name.upper()] = name
            self._class[name.lower()] = self._class[name.upper()] = eval(cls)
            self._class_string[name.lower()] = self._class_string[name.upper()] = cls

            # add the aliases
            for a in alias.split(','):
                self._alias[a.lower()] = self._alias[a.upper()] = name
                self._class[a.lower()] = self._class[a.upper()] = eval(cls)
                self._class_string[a.lower()] = self._class_string[a.upper()] = cls

            self._alias2[name] = sorted(set(alias.split(','))) + [name]

        # append the names in data[0] to self.conf to make sure that all data
        # is covered, even the types which are not specifically defined in the
        # conf file. the datatype defaults here to "str"
        for name in input_data[0]:
            if name.lower() not in self._alias:
                self._alias[name.lower()] = name.lower()
                self._class[name.lower()] = str
            if name.upper() not in self._alias:
                self._alias[name.upper()] = name.lower()
                self._class[name.upper()] = str

        # add empty alias for empty strings XXX why was that? I can't remember
        # why this was important XXX
        self._alias[''] = ''

        # the header stores the indices of the data in the original data dictionary
        self.header = dict(
            zip([self._alias[x] for x in input_data[0]], range(len(input_data[0]))))

        # now create a specific header which has all aliases
        self._header = {k: v for k, v in self.header.items()}

        # add a sorted header for reference
        self.columns = sorted(self.header, key=lambda x: self.header[x])

        # assign all aliases to the header
        for alias in self._alias:
            try:
                self._header[alias] = self._header[self._alias[alias]]
            except:
                pass

        # assign the data as attribute to the word list class. Note that we
        # need to check for the type here, but since numpy also offers integer
        # types, we don't check for type(x) == int, but instead use the
        # str.isnumeric method, which is True only if the key is an
        # integer
        self._data = {
            int(k): v for k, v in input_data.items() if k != 0 and str(k).isnumeric()}
        # check for same length of all columns
        check_errors = ''
        for k, v in self._data.items():
            if len(v) != len(self.header):
                check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                    k, len(v), len(self.header))
        if check_errors:
            raise ValueError(check_errors + '\n' + ', '.join(sorted(self.header)))

        # iterate over self._data and change the values according to the
        # functions (only needed when reading from file)
        if not internal_import:
            heads = sorted(self._header.items(), key=lambda x: x[1])
            for key in self._data:
                check = []
                for head, i in heads:
                    if i not in check:
                        logstring = 'Problem with row {0} in col {1}, expected' + \
                                    ' «{4}» as datatype but received «{3}» ' + \
                                    ' (ROW: {2}, entry {5}).'
                        try:
                            self._data[key][i] = self._class[head](self._data[key][i])
                            check.append(i)
                        except KeyError:
                            log.warning(
                                logstring.format(
                                    key,
                                    i,
                                    '|'.join([str(x) for x in self._data[key]]),
                                    self._data[key][i],
                                    self._class[head],
                                    head))
                        except ValueError:
                            log.warning(
                                logstring.format(
                                    key,
                                    i,
                                    '|'.join([str(x) for x in self._data[key]]),
                                    self._data[key][i],
                                    self._class[head],
                                    head))

        # create entry attribute of the wordlist
        self.entries = sorted(set([b.lower() for a, b in self._alias.items() if b]))

        # assign meta-data
        self._meta = {}
        for key in [k for k in input_data if type(k) != int]:
            self._meta[key] = input_data[key]
Code Example #18
File: derive.py, Project: xrotwang/lingpy
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------

    model : str
        A string indicating the name of the model which shall be created.

    path : str
        A string indicating the path where the model folder is stored.

    Notes
    -----
    A model is defined by a folder placed in the :file:`data/models` directory
    of the LingPy package. The name of the folder reflects the name of the
    model. It contains the files :file:`converter` and :file:`INFO`, plus the
    optional files :file:`matrix` and :file:`scorer`. The format requirements
    for these files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key is
        preceded by an ``@`` and followed by a colon and the value is written
        right next to the key in the same line, e.g.::
            
            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII letter) representing the
        class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...
    
    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...
        
        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary.  The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.
    
    See also
    --------
    lingpy.data.model.Model
    compile_dvt

    """
    log.info("Compiling model <" + model + ">...")
    # get the path to the models
    new_path = lambda *cmps: os.path.join(path or util.data_path('models'), model, *cmps)

    log.debug("Model-Path: %s" % new_path())  # call the lambda to log the model folder

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in
                  range(len(chars))]
        for (i, charA), (j, charB) in util.multicombinations2(enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")