Example #1
def createDistroFormFile(startpath):
    distribution = LangDistro()

    # Feed every line of the file into the shared distribution
    with open(startpath, encoding='utf-8') as f:
        for line in f:
            createLangDistribution(line, distribution)

    # Drop rare entries, sort by frequency, and normalize to percentages
    distribution.removeBelowMinimal(MINIMAL_OCCURANCE)
    distribution.sort()
    distribution.makePercentage()

    return distribution
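These examples rely on a LangDistro class that is not shown on this page. A minimal sketch of what such a dict-backed counter might look like, purely an assumption to make the snippets self-contained:

class LangDistro(dict):
    # Hypothetical stand-in for the real LangDistro; the actual
    # implementation is not shown in these examples.
    def appenddist(self, key):
        # Increment the count for a character, n-gram, or label
        self[key] = self.get(key, 0) + 1

    def removeBelowMinimal(self, minimal):
        # Drop entries that occur fewer than `minimal` times
        for key in [k for k, v in self.items() if v < minimal]:
            del self[key]

    def sort(self):
        # Reorder entries by descending count (dicts keep insertion order)
        ordered = sorted(self.items(), key=lambda kv: kv[1], reverse=True)
        self.clear()
        self.update(ordered)

    def makePercentage(self):
        # Convert raw counts into percentages of the total
        total = sum(self.values())
        if total:
            for key in self:
                self[key] = self[key] / total * 100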
Example #2
def createLangDistribution(line, distribution=None):
    # When called without a distribution, build a fresh one and
    # finalize it (sort + percentages) before returning
    if distribution is None:
        flag = True
        distribution = LangDistro()
    else:
        flag = False

    # Count every standard character, case-insensitively
    for c in line:
        if standardchar(c):
            distribution.appenddist(c.lower())

    if flag:
        distribution.sort()
        distribution.makePercentage()

    return distribution
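standardchar is also used but never defined here; it presumably decides which characters belong to the alphabet the distributions are built over. A hedged sketch under that assumption:

def standardchar(c):
    # Hypothetical filter; the real predicate is not shown on this page.
    # Here it simply accepts alphabetic characters.
    return c.isalpha()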
Example #3
from collections import Counter

def createLangDistribution(line, distribution=None, ngram=2):
    # Same idea as above, but counting character n-grams instead of
    # single characters
    if distribution is None:
        flag = True
        distribution = LangDistro()
    else:
        flag = False

    # Keep only standard characters, lower-cased
    raw = ''
    for c in line:
        if standardchar(c):
            raw += c.lower()

    # Count all n-grams; the range stops ngram - 1 short of the end so
    # that every slice has the full n-gram length
    res = Counter(raw[idx:idx + ngram] for idx in range(len(raw) - ngram + 1))
    # Merge the new counts into the existing distribution
    total = Counter(distribution) + res
    distribution.update(total)
    if flag:
        distribution.sort()
        distribution.makePercentage()

    return distribution
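A quick standalone check of the n-gram slicing (the text and names here are just for illustration):

from collections import Counter

text = "hello"
ngram = 2
# Slices: "he", "el", "ll", "lo" -> len(text) - ngram + 1 == 4 of them
bigrams = Counter(text[i:i + ngram] for i in range(len(text) - ngram + 1))
print(bigrams)  # Counter({'he': 1, 'el': 1, 'll': 1, 'lo': 1})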
Example #4
import numpy as np
from tensorflow import keras

training_padded = np.array(training_padded)
training_labels = np.array(y_new_train)
testing_padded = np.array(testing_padded)
testing_labels = np.array(y_new_test)

print("Loading Model")
model = keras.models.load_model('GRU.h5')
model.summary()

classifications = model.predict(testing_padded)
classifications = classifications.tolist()

sumtests = LangDistro()  # how many tests there were for each language
sumfails = LangDistro()  # how many tests our algorithm failed
# Run the tests!
print("Running tests...")
counter = 0
for x in range(len(testing_labels)):

    # Increase the test count by 1
    sumtests.appenddist(LANGS[testing_labels[x]])
    # Prepare the failed-test counter for the current language
    if LANGS[testing_labels[x]] not in sumfails:
        sumfails[LANGS[testing_labels[x]]] = 0

    # The predicted language is the index of the highest score
    result = classifications[counter].index(max(classifications[counter]))

    # If the language is wrong, increase the failure count
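The prediction step above is a plain-list argmax; with NumPy arrays the same lookup is usually written with np.argmax. A minimal illustration (the values are made up):

import numpy as np

scores = np.array([0.1, 0.7, 0.2])
predicted = int(np.argmax(scores))  # index of the highest score -> 1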
Example #5
def runDistroTests():
    # Load our corpus
    dist = loadCompleteDistro()

    sumtests = []
    sumfails = []
    sumtests.append(LangDistro())  # how many tests there were for each language
    sumtests.append(LangDistro())
    sumtests.append(LangDistro())

    sumfails.append(LangDistro())  # how many tests our algorithm failed
    sumfails.append(LangDistro())
    sumfails.append(LangDistro())

    ansline = []
    print("Running tests...")
    counter = 0
    # Load the test lines
    with open(FILE_TEST_PATH, "r", encoding='utf-8') as testf:
        testline = testf.readlines()
    # ...and the languages they were written in - our correct answers
    with open(FILE_ANSWER_PATH, "r", encoding='utf-8') as answerf:
        anslineraw = answerf.readlines()

    # Strip the redundant trailing newline
    for x in anslineraw:
        ansline.append(x.rstrip('\n'))

    # Run the tests!
    for x in range(len(ansline)):

        # Increase the test count by 1
        sumtests[0].appenddist(ansline[x])
        sumtests[1].appenddist(ansline[x])
        sumtests[2].appenddist(ansline[x])
        # Prepare the failed-test counters for the current language
        if ansline[x] not in sumfails[0]:
            sumfails[0][ansline[x]] = 0
            sumfails[1][ansline[x]] = 0
            sumfails[2][ansline[x]] = 0

        # Analyze the text
        newdistro = createLangDistribution(testline[x])

        # Here we call our comparison algorithms: each compares the
        # distribution of the unknown-language text against our stored
        # languages and returns the name of the most similar language
        result1 = BhattacharyyaDistance(newdistro, dist)
        result2 = KullbackLeibner(newdistro, dist)
        result3 = Hellinger(newdistro, dist)

        # If a language is wrong, increase that measure's failure count
        if result1 != ansline[x]:
            sumfails[0].appenddist(ansline[x])
        if result2 != ansline[x]:
            sumfails[1].appenddist(ansline[x])
        if result3 != ansline[x]:
            sumfails[2].appenddist(ansline[x])
        counter += 1

        print(f'Tested {counter}/{len(ansline)}')

    # Average per-language accuracy for each of the three measures
    avg = []
    for i in range(3):
        mysum = 0
        for x in sumtests[i].keys():
            mysum += (sumtests[i][x] - sumfails[i][x]) / sumtests[i][x] * 100
        avg.append(mysum / len(sumtests[i].keys()))

    # Print the statistics
    print("Printing Stats...")
    print("Bhattacharyya:")
    for x in sumtests[0].keys():
        print("Lang: \"{0}\" Failed: {1} Tested: {2} Accuracy: {3:0.3f}%".format(
            x, sumfails[0][x], sumtests[0][x],
            (sumtests[0][x] - sumfails[0][x]) / sumtests[0][x] * 100))
    print("Avg: " + str(avg[0]) + "%")
    print("KullbackLeibner:")
    for x in sumtests[1].keys():
        print("Lang: \"{0}\" Failed: {1} Tested: {2} Accuracy: {3:0.3f}%".format(
            x, sumfails[1][x], sumtests[1][x],
            (sumtests[1][x] - sumfails[1][x]) / sumtests[1][x] * 100))
    print("Avg: " + str(avg[1]) + "%")
    print("Hellinger:")
    for x in sumtests[2].keys():
        print("Lang: \"{0}\" Failed: {1} Tested: {2} Accuracy: {3:0.3f}%".format(
            x, sumfails[2][x], sumtests[2][x],
            (sumtests[2][x] - sumfails[2][x]) / sumtests[2][x] * 100))
    print("Avg: " + str(avg[2]) + "%")