Beispiel #1
0
    def __init__(self, model, path=None):
        """
        Load the sound-class model ``model``, compiling it first if needed.

        Parameters
        ----------
        model : str
            The name of the model, which is also the name of its folder.
        path : str (default=None)
            The directory in which the model folder is stored. If it is not
            given, ``util.data_path('models')`` is used.

        Notes
        -----
        The attribute ``scorer`` is only set if a ``matrix`` file exists in
        the model folder, or if a ``scorer.bin`` file exists and the cached
        scorer can be loaded.
        """
        def new_path(*cmps):
            # build a path inside the folder of this model
            return os.path.join(
                path or util.data_path('models'), model, *cmps)

        self.name = model

        # try to load the converter from the cache; if that fails for any
        # ordinary reason, compile the model and load it again
        try:
            self.converter = cache.load(model + '.converter')
        except Exception:
            compile_model(model, path)
            self.converter = cache.load(model + '.converter')

        # give always preference to scorer matrix files
        if os.path.isfile(new_path('matrix')):
            self.scorer = read_scorer(new_path('matrix'))
        elif os.path.isfile(new_path('scorer.bin')):
            try:
                self.scorer = cache.load(model + '.scorer')
            except FileNotFoundError:
                # deliberately best-effort: without a cached scorer the
                # instance is left without a ``scorer`` attribute
                pass

        # read information from the info-file; keys which are not defined in
        # the file are reported as 'unknown'
        self.info = {}
        info = util.read_text_file(new_path('INFO'))
        for key in ['description', 'compiler', 'source', 'date', 'vowels',
                    'tones']:
            match = re.search('@' + key + ': (.*)', info)
            self.info[key] = match.group(1) if match else 'unknown'

        # the loop above always fills both keys, so they can be copied
        # without any further check
        self.vowels = self.info['vowels']
        self.tones = self.info['tones']
Beispiel #2
0
    def __init__(self, model, path=None):
        """
        Load the sound-class model ``model``, compiling it first if needed.

        Parameters
        ----------
        model : str
            The name of the model, which is also the name of its folder.
        path : str (default=None)
            The directory in which the model folder is stored. If it is not
            given, ``util.data_path('models')`` is used.

        Notes
        -----
        The attribute ``scorer`` is only set if a ``matrix`` file exists in
        the model folder, or if a ``scorer.bin`` file exists and the cached
        scorer can be loaded.
        """
        def new_path(*cmps):
            # build a path inside the folder of this model
            return os.path.join(
                path or util.data_path('models'), model, *cmps)

        self.name = model

        # try to load the converter from the cache; if that fails for any
        # ordinary reason, compile the model and load it again
        try:
            self.converter = cache.load(model + '.converter')
        except Exception:
            compile_model(model, path)
            self.converter = cache.load(model + '.converter')

        # give always preference to scorer matrix files
        if os.path.isfile(new_path('matrix')):
            self.scorer = read_scorer(new_path('matrix'))
        elif os.path.isfile(new_path('scorer.bin')):
            try:
                self.scorer = cache.load(model + '.scorer')
            except compat.FileNotFoundError:
                # deliberately best-effort: without a cached scorer the
                # instance is left without a ``scorer`` attribute
                pass

        # read information from the info-file; keys which are not defined in
        # the file are reported as 'unknown'
        self.info = {}
        info = util.read_text_file(new_path('INFO'))
        for key in ['description', 'compiler', 'source', 'date', 'vowels',
                    'tones']:
            match = re.search('@' + key + ': (.*)', info)
            self.info[key] = match.group(1) if match else 'unknown'

        # the loop above always fills both keys, so they can be copied
        # without any further check
        self.vowels = self.info['vowels']
        self.tones = self.info['tones']
Beispiel #3
0
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------

    model : str
        A string indicating the name of the model which shall be created.

    path : str
        A string indicating the path where the model-folder is stored. If it
        is not given, ``util.data_path('models')`` is used.

    Notes
    -----
    A model is defined by a folder placed in :file:`data/models` directory of
    the LingPy package. The name of the folder reflects the name of the model.
    It contains three files: the file :file:`converter`, the file :file:`INFO`,
    and the optional file :file:`scorer`. The format requirements for these
    files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key is
        preceded by an ``@`` and followed by a colon and the value is written
        right next to the key in the same line, e.g.::

            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII-letter) representing
        the class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...

    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

        If this file exists, it takes precedence over the ``scorer`` file.

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``,``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...

        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details). The
        resulting scorer is also written to the ``matrix`` file of the model.

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary.  The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.

    See also
    --------
    lingpy.data.model.Model
    compile_dvt

    """
    log.info("Compiling model <" + model + ">...")

    # get the path to the models
    def new_path(*cmps):
        return os.path.join(path or util.data_path('models'), model, *cmps)

    # log the folder of the model (calling the helper without components),
    # not the helper function object itself
    log.debug("Model-Path: %s" % new_path())

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set(s[0] for s in score_dict))
        matrix = [[0 for _ in chars] for _ in chars]
        for (i, charA), (j, charB) in util.multicombinations2(
                enumerate(chars)):
            if i < j:
                # pairs missing from the scoring dictionary score -100
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        # store the computed scorer as matrix file in the model folder
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
Beispiel #4
0
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will be
        ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key. All
        meta-data found in the file is stored under its own (string) key.

    Raises
    ------
    IndexError
        If the file contains no tab-separated data line at all, or if a block
        opened with ``<name>`` is never closed with ``</name>``.
    Exception
        If the first column is an ID column and one of its values cannot be
        converted to an integer.

    Notes
    -----
    Lines starting with ``@`` are read as ``@key: value`` meta-data, lines
    starting with ``<`` open a typed block (``json``, ``tre``, ``nwk``,
    ``csv``, ``msa``, ``dst``, ``scorer``, ``taxa``) that runs until the
    matching closing tag, and all other non-empty lines are split at tabs and
    treated as rows of the data table.
    """
    from collections import deque

    # a deque allows removing lines from the front in constant time, whereas
    # list.pop(0) has to shift all remaining lines on every call
    lines = deque(read_text_file(infile, lines=True, normalize="NFC"))
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.popleft()
        if line.startswith(comment) or not line:
            continue

        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    # repeated keys are collected in a list
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        log.warning(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords, given as key="value" pairs after
            # the name of the block
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {
                    k: v[1:-1]
                    for k, v in [key.split('=') for key in tmp.split(' ')[1:]]
                }
            else:
                dtype = tmp.strip()
                keys = {}

            # collect all lines up to the closing tag of the block
            tmp = []

            while True:
                line = lines.popleft()
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]

            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}

                if not keys:
                    keys["id"] = "1"

                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    # SECURITY: eval() executes an expression taken verbatim
                    # from the input file, so only trusted files must be read
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        cells = l.split('\t')
                        a = cells[0]
                        b = [transf(cell) for cell in cells[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                # use the dataset name from the meta-data if there is one,
                # otherwise derive it from the name of the input file
                tmp_msa = {}
                if 'dataset' in meta:
                    tmp_msa['dataset'] = meta['dataset']
                else:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append(
                            [x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                # prefer integer identifiers, fall back to the raw string
                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa

            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                # mirror the cells above the diagonal to get a symmetric
                # matrix with zeros on the diagonal
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, row in enumerate(matrix):
                    for j, cell in enumerate(row):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                keys.setdefault('id', 'basic')
                meta['scorer'][keys['id']] = scorer

            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:  # pragma: no cover
        raise Exception("Error processing line {0}:\n".format(j) +
                        str(data[1:][j]) + '\nOriginal error message: ' +
                        str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    for m in meta:
        d[m] = meta[m]

    # if only named trees are given, use the one with the smallest
    # identifier as the default tree
    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
Beispiel #5
0
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------

    model : str
        A string indicating the name of the model which shall be created.

    path : str
        A string indicating the path where the model-folder is stored. If it
        is not given, ``util.data_path('models')`` is used.

    Notes
    -----
    A model is defined by a folder placed in :file:`data/models` directory of
    the LingPy package. The name of the folder reflects the name of the model.
    It contains three files: the file :file:`converter`, the file :file:`INFO`,
    and the optional file :file:`scorer`. The format requirements for these
    files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key is
        preceded by an ``@`` and followed by a colon and the value is written
        right next to the key in the same line, e.g.::

            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII-letter) representing
        the class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...

    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

        If this file exists, it takes precedence over the ``scorer`` file.

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``,``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...

        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details). The
        resulting scorer is also written to the ``matrix`` file of the model.

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary.  The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.

    See also
    --------
    lingpy.data.model.Model
    compile_dvt

    """
    log.info("Compiling model <" + model + ">...")

    # get the path to the models
    def new_path(*cmps):
        return os.path.join(path or util.data_path('models'), model, *cmps)

    # log the folder of the model (calling the helper without components),
    # not the helper function object itself
    log.debug("Model-Path: %s" % new_path())

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set(s[0] for s in score_dict))
        matrix = [[0 for _ in chars] for _ in chars]
        for (i, charA), (j, charB) in util.multicombinations2(
                enumerate(chars)):
            if i < j:
                # pairs missing from the scoring dictionary score -100
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        # store the computed scorer as matrix file in the model folder
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
Beispiel #6
0
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will be
        ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key. All
        meta-data found in the file is stored under its own (string) key.

    Raises
    ------
    IndexError
        If the file contains no tab-separated data line at all, or if a block
        opened with ``<name>`` is never closed with ``</name>``.
    Exception
        If the first column is an ID column and one of its values cannot be
        converted to an integer.

    Notes
    -----
    Lines starting with ``@`` are read as ``@key: value`` meta-data, lines
    starting with ``<`` open a typed block (``json``, ``tre``, ``nwk``,
    ``csv``, ``msa``, ``dst``, ``scorer``, ``taxa``) that runs until the
    matching closing tag, and all other non-empty lines are split at tabs and
    treated as rows of the data table.
    """
    from collections import deque

    # a deque allows removing lines from the front in constant time, whereas
    # list.pop(0) has to shift all remaining lines on every call
    lines = deque(read_text_file(infile, lines=True, normalize="NFC"))
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.popleft()
        if line.startswith(comment) or not line:
            continue

        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    # repeated keys are collected in a list
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        warn(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords, given as key="value" pairs after
            # the name of the block
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {k: v[1:-1]
                        for k, v in [key.split('=') for key in tmp.split(' ')[1:]]}
            else:
                dtype = tmp.strip()
                keys = {}

            # collect all lines up to the closing tag of the block
            tmp = []

            while True:
                line = lines.popleft()
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]

            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}

                if not keys:
                    keys["id"] = "1"

                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    # SECURITY: eval() executes an expression taken verbatim
                    # from the input file, so only trusted files must be read
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        cells = l.split('\t')
                        a = cells[0]
                        b = [transf(cell) for cell in cells[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                # use the dataset name from the meta-data if there is one,
                # otherwise derive it from the name of the input file
                tmp_msa = {}
                if 'dataset' in meta:
                    tmp_msa['dataset'] = meta['dataset']
                else:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append([x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                # prefer integer identifiers, fall back to the raw string
                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa

            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                # mirror the cells above the diagonal to get a symmetric
                # matrix with zeros on the diagonal
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, row in enumerate(matrix):
                    for j, cell in enumerate(row):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                if 'id' not in keys:
                    keys['id'] = 'basic'
                meta['scorer'][keys['id']] = scorer

            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:
        raise Exception("Error processing line {0}:\n".format(j) +
                        str(data[1:][j]) + '\nOriginal error message: ' + str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    for m in meta:
        d[m] = meta[m]

    # if only named trees are given, use the one with the smallest
    # identifier as the default tree
    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d