def __init__(self, SFInA, SFInB, SLangIDa, SLangIDb):
    '''
    Constructor: score every word of list A against every word of list B
    and print the cognate candidates.

    The whole run is performed here; this method assigns no attributes
    itself.

    Args:
        SFInA: argument handed to self.readWordList for list A; the result
            is iterated as (word, PoS, frequency) triples.
        SFInB: the same for list B.
        SLangIDa: language id of list A, forwarded to computeLevenshtein.
        SLangIDb: language id of list B, forwarded to computeLevenshtein.

    Side effects:
        - creates/truncates 'md050crosslevenshtein.debug' in the current
          working directory (nothing is written to it);
        - stderr: "<word> <counter>" for every fifth word of list A;
        - stdout: for each A word that has at least one candidate with a
          normalised distance <= 0.36 under either score, a header line
          followed by the two candidate listings produced by
          self.printCognates (Lev0-based first, then Lev1-based).
    '''
    AMaxNormDist = 0.36  # cut-off applied to both normalised distances

    # The debug file used to be opened and then neither written to nor
    # closed.  Keep the create/truncate side effect, but release the handle
    # immediately instead of leaking it.
    with open('md050crosslevenshtein.debug', 'w'):
        pass

    LWordsA = self.readWordList(SFInA)
    LWordsB = self.readWordList(SFInB)
    # graphonological object with phonological features over graphemes
    OGraphonolev = md060graphonoLev.clGraphonolev()

    ICounterRec = 0  # number of A words with at least one candidate
    for ICounter, (SWordA, SPoSA, IFrqA) in enumerate(LWordsA, 1):
        try:
            LogFrqA = math.log(IFrqA)
        except (ValueError, TypeError):
            # zero, negative or non-numeric frequency: fall back to 0
            LogFrqA = 0

        if ICounter % 5 == 0:
            sys.stderr.write(SWordA + ' ' + str(ICounter) + '\n')

        # An earlier variant scored pairs with self.computeLevenshtein,
        # normalised by the mean word length, with a 0.30 cut-off; it was
        # replaced by the scorer below, which returns two distances
        # (Lev0, Lev1) and their normalised forms in a single call.
        LCognates = []   # candidates accepted on Lev0Norm
        LCognates1 = []  # candidates accepted on Lev1Norm
        for (SWordB, SPoSB, IFrqB) in LWordsB:
            (Lev0, Lev1, Lev0Norm, Lev1Norm) = OGraphonolev.computeLevenshtein(
                SWordA, SWordB, SLangIDa, SLangIDb)
            if Lev0Norm <= AMaxNormDist:
                LCognates.append((Lev0Norm, Lev0, SWordB, SPoSB, IFrqB))
            if Lev1Norm <= AMaxNormDist:
                LCognates1.append((Lev1Norm, Lev1, SWordB, SPoSB, IFrqB))

        if not (LCognates or LCognates1):
            continue

        ICounterRec += 1
        # share of the A words seen so far that have candidates
        ACognPerCent = ICounterRec / ICounter
        # Explicit arguments instead of "% locals()", so the output no
        # longer depends on the names of the local variables.
        sys.stdout.write(
            '\t{, %d, %d, %s, %s, frq=%d, ln=%.2f, have-cognates: %.2f : \n'
            % (ICounterRec, ICounter, SWordA, SPoSA, IFrqA, LogFrqA,
               ACognPerCent))
        self.printCognates(LCognates, LogFrqA, SPoSA)
        sys.stdout.write('\t\n')
        self.printCognates(LCognates1, LogFrqA, SPoSA)

# NOTE(review): in the whitespace-collapsed original a lone ''' followed this
# method on the same line; it is not part of the method and is left out here --
# check the quoting of the surrounding file when applying this edit.
def __init__(self, SFInA, SFInB, SLangIDa, SLangIDb):
    '''
    Constructor: score every word of list A against every word of list B
    and print the cognate candidates ("simple production version").

    The whole run is performed here; this method assigns no attributes
    itself.

    Args:
        SFInA: argument handed to self.readWordList for list A; the result
            is iterated as (word, PoS, frequency) triples.
        SFInB: the same for list B.
        SLangIDa: language id of list A, forwarded to computeLevenshtein.
        SLangIDb: language id of list B, forwarded to computeLevenshtein.

    Side effects:
        - stderr: "<word> <counter>" for every word of list A;
        - stdout: for each A word that has at least one candidate with a
          normalised distance <= 0.4 under either score, a
          "<word>\\t<PoS>\\tfrq=<n>\\t" line, then a BASELINE section and
          a GRAPHONOLOGICAL section, each written by self.printCognate.
    '''
    AMaxNormDist = 0.4  # cut-off for both normalised distances (was 0.36)

    LWordsA = self.readWordList(SFInA)
    LWordsB = self.readWordList(SFInB)
    # graphonological object with phonological features over graphemes.
    # Debug output is switched off; it used to be enabled with
    #   clGraphonolev(Debug=True, DebugFile=<SFInA without extension>
    #                                       + '-md050crosslevenshtein.debug')
    OGraphonolev = md060graphonoLev.clGraphonolev()

    for ICounter, (SWordA, SPoSA, IFrqA) in enumerate(LWordsA, 1):
        try:
            LogFrqA = math.log(IFrqA)
        except (ValueError, TypeError):
            # zero, negative or non-numeric frequency: fall back to 0
            LogFrqA = 0

        # progress trace for every word (the old "% 1" test was always true)
        sys.stderr.write(SWordA + ' ' + str(ICounter) + '\n')

        LCognates = []   # accepted on the baseline Levenshtein distance
        LCognates1 = []  # accepted on the graphonological distance
        for (SWordB, SPoSB, IFrqB) in LWordsB:
            # Lev0 is baseline Levenshtein distance
            # Lev1 is graphonological Levenshtein distance
            (Lev0, Lev1, Lev0Norm, Lev1Norm) = OGraphonolev.computeLevenshtein(
                SWordA, SWordB, SLangIDa, SLangIDb)
            if Lev0Norm <= AMaxNormDist:
                LCognates.append((Lev0Norm, Lev0, SWordB, SPoSB, IFrqB))
            if Lev1Norm <= AMaxNormDist:
                LCognates1.append((Lev1Norm, Lev1, SWordB, SPoSB, IFrqB))

        # NOTE(review): the source was whitespace-collapsed; the report
        # below is taken to be conditional on having candidates, as in the
        # earlier revision of this method -- confirm.
        if not (LCognates or LCognates1):
            continue

        # Explicit arguments instead of "% locals()", so the output no
        # longer depends on the names of the local variables.
        sys.stdout.write('%s\t%s\tfrq=%d\t\n' % (SWordA, SPoSA, IFrqA))
        sys.stdout.flush()
        # printCognate (singular) is the reduced-output printer used by
        # this production version instead of printCognates.
        sys.stdout.write('BASELINE:\n')
        self.printCognate(LCognates, LogFrqA, SPoSA)
        sys.stdout.write('GRAPHONOLOGICAL:\n')
        self.printCognate(LCognates1, LogFrqA, SPoSA)
        sys.stdout.write('\n\n')
        sys.stdout.flush()

# NOTE(review): in the whitespace-collapsed original a lone ''' followed this
# method on the same line; it is not part of the method and is left out here --
# check the quoting of the surrounding file when applying this edit.
def __init__(self, SFInA, SFInB, SLangIDa, SLangIDb):
    '''
    Constructor: evaluation run that ranks reference translations among
    all same-PoS candidates of list B.

    The whole run is performed here; this method assigns no attributes
    itself.

    Args:
        SFInA: argument handed to self.readWordList for list A.  The call
            returns a pair; its first element is iterated as
            (word, PoS, frequency, reference-targets) tuples.
        SFInB: the same for list B; the second element of the returned
            pair is used for membership tests on target words.
        SLangIDa: language id of list A, forwarded to computeLevenshtein.
        SLangIDb: language id of list B, forwarded to computeLevenshtein.

    Side effects:
        - stderr: "<word> <counter>" for every word of list A;
        - stdout: per A word, the word, its reference targets and the
          distances to each target.  Words with no target in list B's
          index are reported as not in the reference list and skipped.
          For the rest, each target (ordered by baseline normalised
          distance) gets the rank listings from self.findRanks and a
          "-- found" line, or a "-- NOT found" line if it is absent from
          the same-PoS candidates.
    '''
    # The index of list A is not needed here.
    LWordsA, _ = self.readWordList(SFInA)
    LWordsB, LWordsIndexB = self.readWordList(SFInB)
    # graphonological object with phonological features over graphemes.
    # Debug output is switched off; it used to be enabled with
    #   clGraphonolev(Debug=True, DebugFile=<SFInA without extension>
    #                                       + '-md050crosslevenshtein.debug')
    OGraphonolev = md060graphonoLev.clGraphonolev()

    for ICounter, (SWordA, SPoSA, _, LTargetsEval) in enumerate(LWordsA, 1):
        # progress trace for every word (the old "% 1" test was always true)
        sys.stderr.write(SWordA + ' ' + str(ICounter) + '\n')
        print(SWordA)
        print(str(LTargetsEval))

        # Score the reference targets and count how many of them occur in
        # list B at all.
        ICountEvalInTarget = 0
        # (target, PoS) -> (Lev0Norm, Lev1Norm, Lev0, Lev1)
        DTargetsEval = {}
        for STargetE in LTargetsEval:
            (Lev0E, Lev1E, Lev0NormE, Lev1NormE) = OGraphonolev.computeLevenshtein(
                SWordA, STargetE, SLangIDa, SLangIDb)
            print('%s,%s,L-G:%f,%f,NL-G:%f,%f'
                  % (SWordA, STargetE, Lev0E, Lev1E, Lev0NormE, Lev1NormE))
            DTargetsEval[(STargetE, SPoSA)] = (Lev0NormE, Lev1NormE, Lev0E, Lev1E)
            if STargetE in LWordsIndexB:
                ICountEvalInTarget += 1

        if ICountEvalInTarget == 0:
            # None of the references is in list B, so none can be found:
            # skip the expensive search.  (Statistics could be collected
            # here instead.)
            print('\t: not in the reference list!')
            continue

        # Score every list-B word with the same PoS.  The per-threshold
        # candidate lists of earlier revisions (cut-offs 0.30/0.36/0.5)
        # were replaced on 14/04/2017 by this dictionary, so that targets
        # can be looked up and ranked by either score.
        # (word, PoS) -> (Lev0Norm, Lev1Norm, Lev0, Lev1)
        DCognates = {}
        # the 4th field (targets of B entries) is not used
        for (SWordB, SPoSB, _, _) in LWordsB:
            if SPoSB != SPoSA:
                continue  # restrict search to the same pos space
            (Lev0, Lev1, Lev0Norm, Lev1Norm) = OGraphonolev.computeLevenshtein(
                SWordA, SWordB, SLangIDa, SLangIDb)
            DCognates[(SWordB, SPoSB)] = (Lev0Norm, Lev1Norm, Lev0, Lev1)

        # Report the targets, closest (by baseline normalised distance)
        # first.  The old key, k[1][0] applied to .keys(), read the first
        # character of the PoS tag: it never ordered by distance and raised
        # IndexError on an empty tag.  Over .items() the same key shape
        # addresses the stored distance.
        LTargetsByDist = sorted(DTargetsEval.items(), key=lambda TItem: TItem[1][0])
        for ((STargetE, _), (Lev0NormE, Lev1NormE, _, _)) in LTargetsByDist:
            TTarget = (STargetE, SPoSA)
            if TTarget not in DCognates:
                sys.stdout.write('%s\t%s\t%f\t%f\t -- NOT found \n'
                                 % (STargetE, SPoSA, Lev0NormE, Lev1NormE))
                sys.stdout.flush()
                continue

            # one ranking per score index: 0 = Lev0Norm, 1 = Lev1Norm
            LLSortCog = self.findRanks(TTarget, DCognates, [0, 1])
            for (ICogRank, ILenLSortCog, LSortCog) in LLSortCog:
                sys.stdout.write('%s\t%s\t%d\t%d\t\n'
                                 % (SWordA, STargetE, ICogRank, ILenLSortCog))
                for (IItemRank, TKey, TVals) in LSortCog:
                    sys.stdout.write('\t%d\t%s\t%s\n'
                                     % (IItemRank, str(TKey), str(TVals)))

            # NOTE(review): the source was whitespace-collapsed; this
            # summary line is taken to be written once per target, after
            # the rank listings -- confirm.
            (Lev0Norm, Lev1Norm, _, _) = DCognates[TTarget]
            sys.stdout.write('%s\t%s\t%f\t%f\t -- found \n'
                             % (STargetE, SPoSA, Lev0Norm, Lev1Norm))
            sys.stdout.flush()

# NOTE(review): in the whitespace-collapsed original a lone ''' followed this
# method on the same line; it is not part of the method and is left out here --
# check the quoting of the surrounding file when applying this edit.
def __init__(self, SFInA, SFInB, SLangIDa, SLangIDb):
    '''
    Constructor: cognate search restricted to list-A words whose reference
    translations occur in list B.

    The whole run is performed here; this method assigns no attributes
    itself.

    Args:
        SFInA: argument handed to self.readWordList for list A.  The call
            returns a pair; its first element is iterated as
            (word, PoS, frequency, reference-targets) tuples.
        SFInB: the same for list B; the second element of the returned
            pair is used for membership tests on target words.
        SLangIDa: language id of list A, forwarded to computeLevenshtein.
        SLangIDb: language id of list B, forwarded to computeLevenshtein.

    Side effects:
        - stderr: "<word> <counter>" for every word of list A;
        - stdout: per A word, the word, its reference targets and the
          distances to each target.  Words with no target in list B's
          index are reported as not in the reference list and skipped.
          For the rest, if any list-B word has a normalised distance
          <= 0.5 under either score, a "<word>\\t<PoS>\\tfrq=<n>\\t" line
          is written, then a BASELINE and a GRAPHONOLOGICAL section, each
          produced by self.printCognate.
    '''
    AMaxNormDist = 0.5  # cut-off for both normalised distances (was 0.36)

    # The index of list A is not needed here.
    LWordsA, _ = self.readWordList(SFInA)
    LWordsB, LWordsIndexB = self.readWordList(SFInB)
    # graphonological object with phonological features over graphemes.
    # Debug output is switched off; it used to be enabled with
    #   clGraphonolev(Debug=True, DebugFile=<SFInA without extension>
    #                                       + '-md050crosslevenshtein.debug')
    OGraphonolev = md060graphonoLev.clGraphonolev()

    for ICounter, (SWordA, SPoSA, IFrqA, LTargetsEval) in enumerate(LWordsA, 1):
        try:
            LogFrqA = math.log(IFrqA)
        except (ValueError, TypeError):
            # zero, negative or non-numeric frequency: fall back to 0
            LogFrqA = 0

        # progress trace for every word (the old "% 1" test was always true)
        sys.stderr.write(SWordA + ' ' + str(ICounter) + '\n')
        print(SWordA)
        print(str(LTargetsEval))

        # Print the distances to the reference targets and count how many
        # of the targets occur in list B at all.
        ICountEvalInTarget = 0
        for STargetE in LTargetsEval:
            (Lev0E, Lev1E, Lev0NormE, Lev1NormE) = OGraphonolev.computeLevenshtein(
                SWordA, STargetE, SLangIDa, SLangIDb)
            print('%s,%s,L-G:%f,%f,NL-G:%f,%f'
                  % (SWordA, STargetE, Lev0E, Lev1E, Lev0NormE, Lev1NormE))
            if STargetE in LWordsIndexB:
                ICountEvalInTarget += 1

        if ICountEvalInTarget == 0:
            # None of the references is in list B, so none can be found:
            # skip the expensive search.  (Statistics could be collected
            # here instead.)
            print('\t: not in the reference list!')
            continue

        LCognates = []   # accepted on the baseline Levenshtein distance
        LCognates1 = []  # accepted on the graphonological distance
        # the 4th field (targets of B entries) is not used -- the large
        # reference word list is not expected to carry references
        for (SWordB, SPoSB, IFrqB, _) in LWordsB:
            # Lev0 is baseline Levenshtein distance
            # Lev1 is graphonological Levenshtein distance
            (Lev0, Lev1, Lev0Norm, Lev1Norm) = OGraphonolev.computeLevenshtein(
                SWordA, SWordB, SLangIDa, SLangIDb)
            if Lev0Norm <= AMaxNormDist:
                LCognates.append((Lev0Norm, Lev0, SWordB, SPoSB, IFrqB))
            if Lev1Norm <= AMaxNormDist:
                LCognates1.append((Lev1Norm, Lev1, SWordB, SPoSB, IFrqB))

        # NOTE(review): the source was whitespace-collapsed; the report
        # below is taken to be conditional on having candidates, as in the
        # earliest revision of this method -- confirm.
        if not (LCognates or LCognates1):
            continue

        # Explicit arguments instead of "% locals()", so the output no
        # longer depends on the names of the local variables.
        sys.stdout.write('%s\t%s\tfrq=%d\t\n' % (SWordA, SPoSA, IFrqA))
        sys.stdout.flush()
        # printCognate (singular) is the reduced-output printer used by
        # this production version instead of printCognates.
        sys.stdout.write('BASELINE:\n')
        self.printCognate(LCognates, LogFrqA, SPoSA)
        sys.stdout.write('GRAPHONOLOGICAL:\n')
        self.printCognate(LCognates1, LogFrqA, SPoSA)
        sys.stdout.write('\n\n')
        sys.stdout.flush()

# NOTE(review): in the whitespace-collapsed original a lone ''' followed this
# method on the same line; it is not part of the method and is left out here --
# check the quoting of the surrounding file when applying this edit.