def read_scorer(infile):
    """
    Read a scoring function from a file into a ScoreDict object.

    Parameters
    ----------
    infile : str
        The path to the input file that shall be read as a scoring dictionary.
        The matrix format is a simple csv-file in which the scoring matrix is
        displayed, with negative values indicating high differences between
        sound segments (or sound classes) and positive values indicating high
        similarity. The matrix should be symmetric, columns should be
        separated by tabstops, and the first column should provide the
        alphabet for which the scoring function is defined.

    Returns
    -------
    scoredict : ~lingpy.algorithm.misc.ScoreDict
        A ScoreDict instance which can be directly passed to LingPy's
        alignment functions.
    """
    # XXX note that we need a better check here, the previous version also
    # caused a very hard-to-track bug in our system! XXX
    if "\t" in infile and "\n" in infile:
        # the argument is not a path but the matrix content itself
        data = [x.split('\t') for x in infile.split('\n') if x]
    else:
        data = csv2list(infile)

    return misc.ScoreDict(
        [l[0] for l in data],
        [[float(x) for x in l[1:]] for l in data if l])
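
# Illustration only (hypothetical helper, not part of the original module):
# a minimal sketch of how read_scorer handles an in-memory matrix string with
# three invented sound classes K, A, S; the values are made up for the example.
def _example_read_scorer():
    matrix_text = (
        "K\t10.0\t-10.0\t2.0\n"
        "A\t-10.0\t5.0\t-10.0\n"
        "S\t2.0\t-10.0\t10.0\n")
    # a string containing tabs and newlines triggers the in-memory branch;
    # a file path would instead be parsed via csv2list
    scorer = read_scorer(matrix_text)
    assert scorer['K', 'S'] == 2.0  # similarity score of the classes K and S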
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------
    model : str
        A string indicating the name of the model which shall be created.
    path : str
        A string indicating the path where the model-folder is stored.

    Notes
    -----
    A model is defined by a folder placed in the :file:`data/models` directory
    of the LingPy package. The name of the folder reflects the name of the
    model. It contains the file :file:`converter`, the file :file:`INFO`, and,
    optionally, the files :file:`matrix` and :file:`scorer`. The format
    requirements for these files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key
        is preceded by an ``@`` and followed by a colon and the value is
        written right next to the key in the same line, e.g.::

            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII letter) representing
        the class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...

    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell of each row contains the
        character name, the following cells contain the scores in redundant
        form (with both triangles being filled)::

            B  10.0  -10.0    5.0  ...
            E  -10.0   5.0  -10.0  ...
            F   5.0  -10.0   10.0  ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated
        by digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...

        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary
    are created and stored as a binary. The model can be loaded with help of
    the :py:class:`~lingpy.data.model.Model` class and used in the various
    classes and functions provided by the library.

    See also
    --------
    lingpy.data.model.Model
    compile_dvt
    """
    log.info("Compiling model <" + model + ">...")

    # get the path to the models
    new_path = lambda *cmps: os.path.join(
        path or util.data_path('models'), model, *cmps)
    log.debug("Model-Path: %s" % new_path())

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")
    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in range(len(chars))]
        for (i, charA), (j, charB) in util.multicombinations2(enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]
        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
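
# Illustration only (hypothetical helper, not part of the original module):
# a sketch of compiling a custom model; the model name and directory are
# invented, and the folder is expected to contain the files described in the
# docstring above ('converter', 'INFO', and optionally 'matrix' or 'scorer').
def _example_compile_model():
    compile_model('asjp_custom', '/tmp/custom_models')
    # afterwards, the converter (and, if a matrix or scorer file was given,
    # the scorer) are cached as 'asjp_custom.converter' and
    # 'asjp_custom.scorer', and the model can be loaded via
    # lingpy.data.model.Model, as described in the docstring.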
def get_partial_scorer(self, **keywords):
    """
    Create a scoring function based on sound correspondences.

    Parameters
    ----------
    method : str (default='shuffle')
        Select between "markov", for automatically generated random strings,
        and "shuffle", for random strings taken directly from the data.
    ratio : tuple (default=(3, 2))
        Define the ratio between derived and original score for
        sound-matches.
    vscale : float (default=0.5)
        Define a scaling factor for vowels, in order to decrease their score
        in the calculations.
    runs : int (default=1000)
        Choose the number of random runs that shall be made in order to
        derive the random distribution.
    threshold : float (default=0.7)
        The threshold which is used to select those words that are compared
        in order to derive the attested distribution.
    modes : list (default=[("global", -2, 0.5), ("local", -1, 0.5)])
        The modes which are used in order to derive the distributions from
        pairwise alignments.
    factor : float (default=0.3)
        The scaling factor for sound segments with identical prosodic
        environment.
    force : bool (default=False)
        Force recalculation of existing distribution.
    preprocessing : bool (default=False)
        Select whether SCA-analysis shall be used to derive a preliminary set
        of cognates from which the attested distribution shall be derived.
    rands : int (default=1000)
        If "method" is set to "markov", this parameter defines the number of
        strings to produce for the calculation of the random distribution.
    limit : int (default=10000)
        If "method" is set to "markov", this parameter defines the limit
        above which no more search for unique strings will be carried out.
    cluster_method : {"upgma", "single", "complete"} (default="upgma")
        Select the method to be used for the calculation of cognates in the
        preprocessing phase, if "preprocessing" is set to c{True}.
    gop : int (default=-2)
        If "preprocessing" is selected, define the gap opening penalty for
        the preprocessing calculation of cognates.
    unattested : {int, float} (default=-5)
        If a pair of sounds is not attested in the data, but expected by the
        alignment algorithm that computes the expected distribution, the
        score would be -infinity. Yet in order to smooth this behaviour and
        to reduce the strictness, we set a default negative value which does
        not necessarily need to be too high, since it may well be that we
        miss a potentially good pairing in the first runs of alignment
        analyses. Use this keyword to adjust this parameter.
    unexpected : {int, float} (default=0.000001)
        If a pair is encountered in a given alignment but not expected
        according to the randomized alignments, the score could not be
        calculated, since we would have to divide by zero. For this reason,
        we set a very small constant by which the score is divided in this
        case. Note that this constant is only relevant in those cases where
        the shuffling procedure was not carried out long enough.
""" kw = dict( method=rcParams['lexstat_scoring_method'], ratio=rcParams['lexstat_ratio'], vscale=rcParams['lexstat_vscale'], runs=rcParams['lexstat_runs'], threshold=rcParams['lexstat_scoring_threshold'], modes=rcParams['lexstat_modes'], factor=rcParams['align_factor'], restricted_chars=rcParams['restricted_chars'], force=False, preprocessing=False, rands=rcParams['lexstat_rands'], limit=rcParams['lexstat_limit'], cluster_method=rcParams['lexstat_cluster_method'], gop=rcParams['align_gop'], preprocessing_threshold=rcParams[ 'lexstat_preprocessing_threshold'], preprocessing_method=rcParams['lexstat_preprocessing_method'], subset=False, defaults=False, unattested=-5, unexpected=0.00001, smooth=1) kw.update(keywords) if kw['defaults']: return kw # get parameters and store them in string params = dict( ratio=kw['ratio'], vscale=kw['vscale'], runs=kw['runs'], scoring_threshold=kw['threshold'], preprocessing_threshold=kw['preprocessing_threshold'], modestring=':'.join('{0}-{1}-{2:.2f}'.format(a, abs(b), c) for a, b, c in kw['modes']), factor=kw['factor'], restricted_chars=kw['restricted_chars'], method=kw['method'], preprocessing='{0}:{1}:{2}'.format(kw['preprocessing'], kw['cluster_method'], kw['gop']), unattested=kw['unattested'], unexpected=kw['unexpected'], smooth=kw['smooth']) parstring = '_'.join([ '{ratio[0]}:{ratio[1]}', '{vscale:.2f}', '{runs}', '{scoring_threshold:.2f}', '{modestring}', '{factor:.2f}', '{restricted_chars}', '{method}', '{preprocessing}', '{preprocessing_threshold}', '{unexpected:.2f}', '{unattested:.2f}' ]).format(**params) # check for existing attributes if hasattr(self, 'cscorer') and not kw['force']: log.warning( "An identical scoring function has already been calculated, " "force recalculation by setting 'force' to 'True'.") return # check for attribute if hasattr(self, 'params') and not kw['force']: if 'cscorer' in self.params: if self.params['cscorer'] == params: log.warning( "An identical scoring function has already been " "calculated, force recalculation by setting 'force'" " to 'True'.") return else: log.warning( "A different scoring function has already been calculated, " "overwriting previous settings.") # store parameters self.params = {'cscorer': params} self._meta['params'] = self.params self._stamp += "# Parameters: " + parstring + '\n' # get the correspondence distribution self._corrdist = self._get_partial_corrdist(**kw) # get the random distribution self._randist = self._get_partial_randist(**kw) # get the average gop gop = sum([m[1] for m in kw['modes']]) / len(kw['modes']) # create the new scoring matrix matrix = [[c for c in line] for line in self.bscorer.matrix] char_dict = self.bscorer.chars2int for (i, tA), (j, tB) in util.multicombinations2(enumerate(self.cols)): for charA, charB in product( list(self.freqs[tA]) + [util.charstring(i + 1)], list(self.freqs[tB]) + [util.charstring(j + 1)]): exp = self._randist.get((tA, tB), {}).get((charA, charB), False) att = self._corrdist.get((tA, tB), {}).get((charA, charB), False) # in the following we follow the former lexstat protocol if att <= kw['smooth'] and i != j: att = False if att and exp: score = np.log2((att**2) / (exp**2)) elif att and not exp: score = np.log2((att**2) / kw['unexpected']) elif exp and not att: score = kw['unattested'] # XXX gop ??? else: # elif not exp and not att: score = -90 # ??? 
            # combine the scores
            if rcParams['gap_symbol'] not in charA + charB:
                sim = self.bscorer[charA, charB]
            else:
                sim = gop

            # get the real score
            rscore = (kw['ratio'][0] * score + kw['ratio'][1] * sim) \
                / sum(kw['ratio'])

            try:
                iA = char_dict[charA]
                iB = char_dict[charB]

                # use the vowel scale
                if charA[4] in self.vowels and charB[4] in self.vowels:
                    matrix[iA][iB] = matrix[iB][iA] = kw['vscale'] * rscore
                else:
                    matrix[iA][iB] = matrix[iB][iA] = rscore
            except:
                pass

    self.cscorer = misc.ScoreDict(self.chars, matrix)
    self._meta['scorer']['cscorer'] = self.cscorer
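
# Illustration only (hypothetical helper, not part of the original module):
# a sketch of deriving and using the partial scorer; it assumes the method is
# available on lingpy.compare.partial.Partial and that "partial_wordlist.tsv"
# is a suitably formatted, invented input file.
def _example_get_partial_scorer():
    from lingpy.compare.partial import Partial

    part = Partial("partial_wordlist.tsv")

    # derive the correspondence-based scorer; the keyword values merely
    # restate the defaults documented above
    part.get_partial_scorer(runs=1000, threshold=0.7, ratio=(3, 2))

    # the resulting ScoreDict is stored as part.cscorer and is picked up by
    # subsequent LexStat-style analyses such as partial cognate clustering
    part.partial_cluster(method='lexstat', threshold=0.55, ref='cogids')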