예제 #1
0
def learnLemmasByOrderOfScore(getSentenceScore):
    """Build the order in which lemmas should be learned.

    Scheme: learn words as they become possible to learn through sentences.
    While hasDirectlyLearnableSentence() reports a learnable sentence, the
    pair returned by getHighestScoringUnforcedSentencePair() is processed and
    the only uncovered lemma of each sentence in it is learned. When no
    sentence is directly learnable, the lemma returned by
    getHighestScoringLemma() is learned without an example sentence.

    Side effects: loads every text in the "Texts" directory through
    TextParser and prints progress lines to stdout.

    Args:
        getSentenceScore: Sentence-scoring callable. It is not called here
            directly; it is only forwarded to the queue-building and
            lemma-learning helpers.

    Returns:
        A list of ``(lemma, sentence)`` tuples in learning order. For lemmas
        learned without a sentence the second element is the string
        ``"NONE"``.
    """
    # Initialize: load all texts in the "Texts" folder.
    TextParser.addAllTextsFromDirectoryToDatabase("Texts")

    # Original author's note: will only contain sentences with at most one
    # missing word, ordered by the missing word's frequency.
    directlyUnlockableLemmasScore, sentencePairsBySentenceScore, directlyUnlockableLemmas = getPriorityQueueOfDirectlyLearnableSentencesByLemmaFrequency(getSentenceScore)
    lemmasByFrequency = getPriorityQueueOfLemmasByFrequency()

    # Lemmas one is forced to learn (no single sentence isolates them) versus
    # lemmas learned through a sentence. Both lists are only handed to the
    # learning helper in this function.
    forcedToLearn = []
    notForcedToLearn = []
    orderedLearningList = []

    # First remove everything that is not a true "word" (for example names)
    # by learning the NotAWordLemma lemma.
    learnLemmaAndHandleSentencesWithLemmaFrequency(TextParser.NotAWordLemma, notForcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)

    # Invariant checked by the asserts below: every learned lemma leaves
    # lemmasByFrequency, so i + len(lemmasByFrequency) stays constant.
    i = 0
    numberOfLemmas = len(lemmasByFrequency)
    print("Start learning lemmas: " + str(numberOfLemmas))

    while not hasLearnedAllLemmas(lemmasByFrequency):
        (highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore) = getHighestScoringDirectlyLearnablePair(directlyUnlockableLemmasScore, sentencePairsBySentenceScore)

        while hasDirectlyLearnableSentence(directlyUnlockableLemmas):
            currentSentencePair = getHighestScoringUnforcedSentencePair(sentencePairsBySentenceScore, highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore)

            assert i + len(lemmasByFrequency) == numberOfLemmas

            # No new word in the sentence pair: nothing to learn from it.
            # NOTE(review): this skips the refresh of the highest-scoring
            # directly learnable pair at the end of the loop body -- confirm
            # that the helper above makes progress on its own in that case.
            if hasNoNewLemmas(currentSentencePair):
                continue

            assert i + len(lemmasByFrequency) == numberOfLemmas

            # A new pair of words to learn: let's do it!
            # TODO (*): a bug remains to be fixed in connection with updating
            # sentences when the sentence score depends on other sentences.
            for sentence in currentSentencePair:
                if sentence is None:
                    continue
                if sentence.associatedLearnableSentence is not None:
                    # Detach this sentence from the sentence it was
                    # score-dependent on before learning its lemma.
                    sentence.associatedLearnableSentence.scoreDependentSentences.remove(sentence)

                newLemma = sentence.getOnlyUncoveredLemma()
                orderedLearningList.append((newLemma, sentence))
                learnLemmaAndHandleSentencesWithLemmaFrequency(newLemma, notForcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)
                # Progress is printed for every sentence-learned lemma.
                print(str(i) + ", " + newLemma.getRawLemma() + ", " + str(newLemma.getFrequency()) + " -> " + sentence.rawSentence)
                i += 1

                assert i + len(lemmasByFrequency) == numberOfLemmas

            (highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore) = getHighestScoringDirectlyLearnablePair(directlyUnlockableLemmasScore, sentencePairsBySentenceScore)

        if hasLearnedAllLemmas(lemmasByFrequency):  # All words were learned in the loop above
            break

        # There are no more free words: time to learn a frequent word.
        newLemma = getHighestScoringLemma(lemmasByFrequency)
        orderedLearningList.append((newLemma, "NONE"))
        learnLemmaAndHandleSentencesWithLemmaFrequency(newLemma, forcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)
        # Only the first 6000 learned lemmas are logged on this path.
        if i < 6000:
            print(str(i) + ", " + newLemma.getRawLemma() + ", " + str(newLemma.getFrequency()) + " -> " + "NONE")
        i += 1
        assert i + len(lemmasByFrequency) == numberOfLemmas

    # NOTE(review): orderedLearningList also holds the forced ("NONE")
    # entries, so this count is not limited to directly learned lemmas --
    # confirm whether the message or the count is what was intended.
    print("Learned directly " + str(len(orderedLearningList)) + " of " + str(numberOfLemmas) + " lemmas.")
    return orderedLearningList
예제 #2
0
            listOfWordStems.add(wordStem)
        else:
            wordToWordStem[wordConjugation] = {wordStem}

def hasLearnedAllLemmas(lemmasByFrequency):
    """Return True when ``lemmasByFrequency`` has no entries left."""
    return not len(lemmasByFrequency)


def hasDirectlyLearnableSentence(directlyUnlockableLemmas):
    """Return True when ``directlyUnlockableLemmas`` holds at least one entry."""
    return len(directlyUnlockableLemmas) > 0
                        
if __name__ == '__main__':
    # Script entry point. Flip shouldResetSaveData to True to re-parse the
    # "Texts" directory and store the result under the name "everything";
    # otherwise the stored data is loaded and the learning order is computed.
    shouldResetSaveData = False
    if not shouldResetSaveData:
        test = TextParser.loadProcessedData("everything")
        # Disabled diagnostic: count lemmas ending in "ed".
        #numberOfConjugatedVerbs = 0
        #for lemma in TextParser.allLemmas:
        #    if lemma.endswith("ed"):
        #        numberOfConjugatedVerbs += 1
        learningList = learnLemmasByOrderOfScore(getSentenceScoreByNextUnlockableLemma)
        print(len(learningList))
    else:
        TextParser.addAllTextsFromDirectoryToDatabase("Texts")
        TextParser.saveProcessedData(TextParser.everything, "everything")

    # Possible improvements:
        # Better lemma classifier. For example, many -ed inflections are classified as their own word. Could probably remove 1/10 to 1/20 of all lemmas.
        # Remove names and the like.
        # Speed improvements.
        # Remove very short sentences and very long sentences.