def __init__(self, SFInA, SFInB, SLangIDa, SLangIDb):
        """
        Compare two word lists and print cognate candidates for each word.

        Every entry of the first list is compared with every entry of the
        second list via ``md060graphonoLev.clGraphonolev.computeLevenshtein``,
        which returns ``(Lev0, Lev1, Lev0Norm, Lev1Norm)``.  Entries of the
        second list whose normalised distance is at most 0.36 are collected,
        separately for the Lev0 score and for the Lev1 score, and passed to
        ``self.printCognates``.  A progress line is written to stderr for
        every fifth word of the first list.

        Args:
            SFInA: input handed to ``self.readWordList`` for the first list;
                the result is iterated as (word, PoS, frequency) triples.
            SFInB: input handed to ``self.readWordList`` for the second list.
            SLangIDa: language identifier forwarded to ``computeLevenshtein``
                for words of the first list.
            SLangIDb: language identifier forwarded to ``computeLevenshtein``
                for words of the second list.
        """
        # The debug file is still created/truncated as before, but nothing in
        # this constructor writes to it, so the handle is closed right away
        # instead of being left open for the whole run.
        with open('md050crosslevenshtein.debug', 'w'):
            pass

        LWordsA = self.readWordList(SFInA)
        LWordsB = self.readWordList(SFInB)
        # graphonological object with phonological features over graphemes
        OGraphonolev = md060graphonoLev.clGraphonolev()

        ICounter = 0  # words of list A processed so far
        ICounterRec = 0  # words of list A with at least one candidate
        for (SWordA, SPoSA, IFrqA) in LWordsA:
            try:
                LogFrqA = math.log(IFrqA)
            except (ValueError, TypeError):
                # zero/negative or non-numeric frequency: fall back to 0
                LogFrqA = 0
            ICounter += 1
            if ICounter % 5 == 0:
                sys.stderr.write(SWordA + ' ' + str(ICounter) + '\n')

            # NOTE: an earlier variant used self.computeLevenshtein here and
            # normalised by the average word length with a 0.30 threshold.
            LCognates = []  # candidates selected on the Lev0 score
            LCognates1 = []  # candidates selected on the Lev1 score
            for (SWordB, SPoSB, IFrqB) in LWordsB:
                (Lev0, Lev1, Lev0Norm,
                 Lev1Norm) = OGraphonolev.computeLevenshtein(
                     SWordA, SWordB, SLangIDa, SLangIDb)
                if Lev0Norm <= 0.36:
                    LCognates.append((Lev0Norm, Lev0, SWordB, SPoSB, IFrqB))
                if Lev1Norm <= 0.36:
                    LCognates1.append((Lev1Norm, Lev1, SWordB, SPoSB, IFrqB))

            if LCognates or LCognates1:
                ICounterRec += 1
                # share of processed words that had candidates so far
                ACognPerCent = ICounterRec / ICounter
                # the format string picks its fields from the local names
                sys.stdout.write(
                    '\t{, %(ICounterRec)d, %(ICounter)d, %(SWordA)s, %(SPoSA)s, frq=%(IFrqA)d, ln=%(LogFrqA).2f, have-cognates: %(ACognPerCent).2f : \n'
                    % locals())
                self.printCognates(LCognates, LogFrqA, SPoSA)
                sys.stdout.write('\t\n')
                self.printCognates(LCognates1, LogFrqA, SPoSA)
        '''
    def __init__(self, SFInA, SFInB, SLangIDa, SLangIDb):
        """
        Compare two word lists and print cognate candidates for each word.

        Every entry of the first list is compared with every entry of the
        second list via ``md060graphonoLev.clGraphonolev.computeLevenshtein``,
        which returns ``(Lev0, Lev1, Lev0Norm, Lev1Norm)``: Lev0 is the
        baseline Levenshtein distance, Lev1 the graphonological one.  Entries
        whose normalised distance is at most 0.4 are collected per score and
        printed with ``self.printCognate`` under the headings ``BASELINE:``
        and ``GRAPHONOLOGICAL:``.  A progress line is written to stderr for
        every word of the first list.

        Args:
            SFInA: input handed to ``self.readWordList`` for the first list;
                the result is iterated as (word, PoS, frequency) triples.
                Also used to derive the debug file name, so presumably a
                file path -- confirm against readWordList.
            SFInB: input handed to ``self.readWordList`` for the second list.
            SLangIDa: language identifier forwarded to ``computeLevenshtein``
                for words of the first list.
            SLangIDb: language identifier forwarded to ``computeLevenshtein``
                for words of the second list.
        """
        # Debug file name derived from the first input name.  It is only
        # needed when the debugging constructor call below is enabled.
        SNameIName, _ = os.path.splitext(SFInA)
        SFDebug = SNameIName + '-md050crosslevenshtein.debug'
        LWordsA = self.readWordList(SFInA)
        LWordsB = self.readWordList(SFInB)
        # graphonological object with phonological features over graphemes
        # debugging variant:
        # OGraphonolev = md060graphonoLev.clGraphonolev(Debug = True, DebugFile = SFDebug)
        OGraphonolev = md060graphonoLev.clGraphonolev()

        ICounter = 0  # words of list A processed so far
        for (SWordA, SPoSA, IFrqA) in LWordsA:
            try:
                LogFrqA = math.log(IFrqA)
            except (ValueError, TypeError):
                # zero/negative or non-numeric frequency: fall back to 0
                LogFrqA = 0
            ICounter += 1
            # progress report for every word
            sys.stderr.write(SWordA + ' ' + str(ICounter) + '\n')

            # NOTE: an earlier variant used self.computeLevenshtein here and
            # normalised by the average word length with a 0.30 threshold;
            # the thresholds below were previously 0.36.
            LCognates = []  # baseline Lev candidates
            LCognates1 = []  # graphonological Lev candidates
            for (SWordB, SPoSB, IFrqB) in LWordsB:
                (Lev0, Lev1, Lev0Norm,
                 Lev1Norm) = OGraphonolev.computeLevenshtein(
                     SWordA, SWordB, SLangIDa, SLangIDb)
                if Lev0Norm <= 0.4:
                    LCognates.append((Lev0Norm, Lev0, SWordB, SPoSB, IFrqB))
                if Lev1Norm <= 0.4:
                    LCognates1.append((Lev1Norm, Lev1, SWordB, SPoSB, IFrqB))

            if LCognates or LCognates1:
                # the format string picks its fields from the local names
                sys.stdout.write(
                    '%(SWordA)s\t%(SPoSA)s\tfrq=%(IFrqA)d\t\n' % locals())
                sys.stdout.flush()
                sys.stdout.write('BASELINE:\n')
                self.printCognate(LCognates, LogFrqA, SPoSA)
                sys.stdout.write('GRAPHONOLOGICAL:\n')
                self.printCognate(LCognates1, LogFrqA, SPoSA)
                sys.stdout.write('\n\n')
                sys.stdout.flush()

        '''
Example #3
0
    def __init__(self, SFInA, SFInB, SLangIDa, SLangIDb):
        """
        Evaluate how well reference translations rank among candidates.

        ``self.readWordList`` is expected to return a pair (entries, index);
        entries are iterated as (word, PoS, frequency, reference targets).
        For every entry of the first list:

        1. each reference target is scored against the source word with
           ``md060graphonoLev.clGraphonolev.computeLevenshtein`` (returning
           ``(Lev0, Lev1, Lev0Norm, Lev1Norm)``: baseline and graphonological
           distance, raw and normalised) and the scores are printed;
        2. the entry is skipped if none of its targets occurs in the index
           of the second list;
        3. every entry of the second list with the same PoS is scored;
        4. for each reference target, in ascending order of its baseline
           normalised distance, the ranks returned by ``self.findRanks`` are
           printed if the target was scored in step 3, otherwise a
           ``NOT found`` line is printed.

        Args:
            SFInA: input handed to ``self.readWordList`` for the first list.
                Also used to derive the debug file name, so presumably a
                file path -- confirm against readWordList.
            SFInB: input handed to ``self.readWordList`` for the second list.
            SLangIDa: language identifier forwarded to ``computeLevenshtein``
                for the source word.
            SLangIDb: language identifier forwarded to ``computeLevenshtein``
                for target / second-list words.
        """
        # Debug file name derived from the first input name.  It is only
        # needed when the debugging constructor call below is enabled.
        SNameIName, _ = os.path.splitext(SFInA)
        SFDebug = SNameIName + '-md050crosslevenshtein.debug'
        LWordsA, LWordsIndexA = self.readWordList(SFInA)
        LWordsB, LWordsIndexB = self.readWordList(SFInB)
        # graphonological object with phonological features over graphemes
        # debugging variant:
        # OGraphonolev = md060graphonoLev.clGraphonolev(Debug = True, DebugFile = SFDebug)
        OGraphonolev = md060graphonoLev.clGraphonolev()

        ICounter = 0  # words of list A processed so far
        for (SWordA, SPoSA, IFrqA, LTargetsEval) in LWordsA:
            ICounter += 1
            # progress report for every word
            sys.stderr.write(SWordA + ' ' + str(ICounter) + '\n')
            print(SWordA)
            print(str(LTargetsEval))

            # Score the reference targets and count how many of them are
            # present in the second list at all.
            ICountEvalInTarget = 0
            DTargetsEval = {}  # (target, PoS) -> (Lev0Norm, Lev1Norm, Lev0, Lev1)
            for STargetE in LTargetsEval:
                (Lev0E, Lev1E, Lev0NormE,
                 Lev1NormE) = OGraphonolev.computeLevenshtein(
                     SWordA, STargetE, SLangIDa, SLangIDb)
                print(
                    '%(SWordA)s,%(STargetE)s,L-G:%(Lev0E)f,%(Lev1E)f,NL-G:%(Lev0NormE)f,%(Lev1NormE)f'
                    % locals())
                DTargetsEval[(STargetE, SPoSA)] = (Lev0NormE, Lev1NormE,
                                                   Lev0E, Lev1E)
                if STargetE in LWordsIndexB:
                    ICountEvalInTarget += 1

            if ICountEvalInTarget == 0:
                # no reference can possibly be found in the second list
                print('\t: not in the reference list!')
                continue

            # Score every same-PoS entry of the second list.  The references
            # attached to second-list entries are not used.
            DCognates = {}  # (word, PoS) -> (Lev0Norm, Lev1Norm, Lev0, Lev1)
            for (SWordB, SPoSB, IFrqB, LTargetsEval2) in LWordsB:
                if SPoSB != SPoSA:
                    continue  # restrict search to the same pos space
                (Lev0, Lev1, Lev0Norm,
                 Lev1Norm) = OGraphonolev.computeLevenshtein(
                     SWordA, SWordB, SLangIDa, SLangIDb)
                DCognates[(SWordB, SPoSB)] = (Lev0Norm, Lev1Norm, Lev0, Lev1)

            # Visit the reference targets best-first.  The sort key reads the
            # stored baseline normalised distance (value[0]); the previous
            # key indexed the (target, PoS) key tuple instead, which sorted
            # by the first character of the PoS tag and failed on an empty
            # PoS.
            for (TKeyE, TScoresE) in sorted(DTargetsEval.items(),
                                            key=lambda TItem: TItem[1][0]):
                STargetE = TKeyE[0]
                if TKeyE in DCognates:
                    # one (rank, list length, ranked list) triple per score
                    # index requested from findRanks
                    LLSortCog = self.findRanks(TKeyE, DCognates, [0, 1])
                    for (ICogRank, ILenLSortCog, LSortCog) in LLSortCog:
                        sys.stdout.write(
                            '%(SWordA)s\t%(STargetE)s\t%(ICogRank)d\t%(ILenLSortCog)d\t\n'
                            % locals())
                        # ICogRank is deliberately rebound to the rank of
                        # each listed candidate for the detail lines
                        for (ICogRank, TKey, TVals) in LSortCog:
                            SKey = str(TKey)
                            SVals = str(TVals)
                            sys.stdout.write(
                                '\t%(ICogRank)d\t%(SKey)s\t%(SVals)s\n' %
                                locals())

                    (Lev0Norm, Lev1Norm, Lev0, Lev1) = DCognates[TKeyE]
                    sys.stdout.write(
                        '%(STargetE)s\t%(SPoSA)s\t%(Lev0Norm)f\t%(Lev1Norm)f\t -- found \n'
                        % locals())
                    sys.stdout.flush()
                else:
                    (Lev0NormE, Lev1NormE, Lev0E, Lev1E) = TScoresE
                    sys.stdout.write(
                        '%(STargetE)s\t%(SPoSA)s\t%(Lev0NormE)f\t%(Lev1NormE)f\t -- NOT found \n'
                        % locals())
                    sys.stdout.flush()
        '''
    def __init__(self, SFInA, SFInB, SLangIDa, SLangIDb):
        """
        Print cognate candidates for words whose references are attainable.

        ``self.readWordList`` is expected to return a pair (entries, index);
        entries are iterated as (word, PoS, frequency, reference targets).
        For every entry of the first list the reference targets are scored
        against the source word with
        ``md060graphonoLev.clGraphonolev.computeLevenshtein`` (returning
        ``(Lev0, Lev1, Lev0Norm, Lev1Norm)``: baseline and graphonological
        distance, raw and normalised) and the scores are printed.  Entries
        with no target in the index of the second list are skipped.  For the
        others, all second-list entries with a normalised distance of at
        most 0.5 are collected per score and printed with
        ``self.printCognate`` under ``BASELINE:`` and ``GRAPHONOLOGICAL:``.

        Args:
            SFInA: input handed to ``self.readWordList`` for the first list.
                Also used to derive the debug file name, so presumably a
                file path -- confirm against readWordList.
            SFInB: input handed to ``self.readWordList`` for the second list.
            SLangIDa: language identifier forwarded to ``computeLevenshtein``
                for the source word.
            SLangIDb: language identifier forwarded to ``computeLevenshtein``
                for target / second-list words.
        """
        # Debug file name derived from the first input name.  It is only
        # needed when the debugging constructor call below is enabled.
        SNameIName, _ = os.path.splitext(SFInA)
        SFDebug = SNameIName + '-md050crosslevenshtein.debug'
        LWordsA, LWordsIndexA = self.readWordList(SFInA)
        LWordsB, LWordsIndexB = self.readWordList(SFInB)
        # graphonological object with phonological features over graphemes
        # debugging variant:
        # OGraphonolev = md060graphonoLev.clGraphonolev(Debug = True, DebugFile = SFDebug)
        OGraphonolev = md060graphonoLev.clGraphonolev()

        ICounter = 0  # words of list A processed so far
        for (SWordA, SPoSA, IFrqA, LTargetsEval) in LWordsA:
            try:
                LogFrqA = math.log(IFrqA)
            except (ValueError, TypeError):
                # zero/negative or non-numeric frequency: fall back to 0
                LogFrqA = 0
            ICounter += 1
            # progress report for every word
            sys.stderr.write(SWordA + ' ' + str(ICounter) + '\n')
            print(SWordA)
            print(str(LTargetsEval))

            # Score the reference targets and count how many of them are
            # present in the second list at all.
            ICountEvalInTarget = 0
            for STargetE in LTargetsEval:
                (Lev0E, Lev1E, Lev0NormE,
                 Lev1NormE) = OGraphonolev.computeLevenshtein(
                     SWordA, STargetE, SLangIDa, SLangIDb)
                print(
                    '%(SWordA)s,%(STargetE)s,L-G:%(Lev0E)f,%(Lev1E)f,NL-G:%(Lev0NormE)f,%(Lev1NormE)f'
                    % locals())
                if STargetE in LWordsIndexB:
                    ICountEvalInTarget += 1

            if ICountEvalInTarget == 0:
                # no reference can possibly be found in the second list
                print('\t: not in the reference list!')
                continue

            # NOTE: an earlier variant used self.computeLevenshtein here and
            # normalised by the average word length with a 0.30 threshold;
            # the thresholds below were previously 0.36.
            # The references attached to second-list entries are not used.
            LCognates = []  # baseline Lev candidates
            LCognates1 = []  # graphonological Lev candidates
            for (SWordB, SPoSB, IFrqB, LTargetsEval2) in LWordsB:
                (Lev0, Lev1, Lev0Norm,
                 Lev1Norm) = OGraphonolev.computeLevenshtein(
                     SWordA, SWordB, SLangIDa, SLangIDb)
                if Lev0Norm <= 0.5:
                    LCognates.append((Lev0Norm, Lev0, SWordB, SPoSB, IFrqB))
                if Lev1Norm <= 0.5:
                    LCognates1.append((Lev1Norm, Lev1, SWordB, SPoSB, IFrqB))

            if LCognates or LCognates1:
                # the format string picks its fields from the local names
                sys.stdout.write('%(SWordA)s\t%(SPoSA)s\tfrq=%(IFrqA)d\t\n' %
                                 locals())
                sys.stdout.flush()
                sys.stdout.write('BASELINE:\n')
                self.printCognate(LCognates, LogFrqA, SPoSA)
                sys.stdout.write('GRAPHONOLOGICAL:\n')
                self.printCognate(LCognates1, LogFrqA, SPoSA)
                sys.stdout.write('\n\n')
                sys.stdout.flush()
        '''