def learnLemmasByOrderOfScore(getSentenceScore):
    """Greedily build an ordered list of lemmas to learn, sentence by sentence.

    Scheme: learn words as they become possible to learn, in terms of
    sentences, in order of score. Whenever a sentence with exactly one
    unknown lemma exists ("directly learnable"), that lemma is learned via
    the best-scoring such sentence; when none exists, the most frequent
    remaining lemma is learned "forced" (paired with "NONE").

    :param getSentenceScore: scoring callable forwarded to the priority-queue
        builders and to learnLemmaAndHandleSentencesWithLemmaFrequency.
    :returns: list of (lemma, sentence-or-"NONE") tuples in learning order.
    """
    # Initialize: load all texts in the Texts folder.
    TextParser.addAllTextsFromDirectoryToDatabase("Texts")
    # Will only contain sentences with fewer than or equal to one missing
    # word, marked in order of the missing word's frequency.
    directlyUnlockableLemmasScore, sentencePairsBySentenceScore, directlyUnlockableLemmas = getPriorityQueueOfDirectlyLearnableSentencesByLemmaFrequency(getSentenceScore)
    lemmasByFrequency = getPriorityQueueOfLemmasByFrequency()
    # Track which words one is forced to learn without being able to isolate
    # them to one sentence, versus those learned via a sentence.
    forcedToLearn = []
    notForcedToLearn = []
    orderedLearningList = []
    # First remove all tokens that are not true "words" (for example names)
    # by learning the NotAWordLemma pseudo-lemma.
    learnLemmaAndHandleSentencesWithLemmaFrequency(TextParser.NotAWordLemma, notForcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)
    i = 0
    numberOfLemmas = len(lemmasByFrequency)
    print("Start learning lemmas: " + str(len(lemmasByFrequency)))
    highestScoringDirectlyLearnableSentencePair = None
    highestScoringDirectlyLearnableSentencePairScore = None
    while not hasLearnedAllLemmas(lemmasByFrequency):
        (highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore) = getHighestScoringDirectlyLearnablePair(directlyUnlockableLemmasScore, sentencePairsBySentenceScore)
        # Inner loop: keep learning while some sentence has exactly one
        # unknown lemma.
        while hasDirectlyLearnableSentence(directlyUnlockableLemmas):
            currentSentencePair = getHighestScoringUnforcedSentencePair(sentencePairsBySentenceScore, highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore)
            # Invariant: learned count plus remaining count is constant.
            assert i + len(lemmasByFrequency) == numberOfLemmas
            # No new word in the sentence: skip it.
            if hasNoNewLemmas(currentSentencePair):
                continue
            assert i + len(lemmasByFrequency) == numberOfLemmas
            # A new pair of words to learn: let's do it!
            # (Removed dead debug assignment `kage = 1` from the original.)
            # TODO (*) A bug still needs to be fixed concerning updating
            # sentences whose sentence score depends on other sentences.
            for sentence in currentSentencePair:
                if sentence is None:
                    continue
                # Detach this sentence from the learnable sentence whose
                # score it previously depended on.
                if sentence.associatedLearnableSentence is not None:
                    sentence.associatedLearnableSentence.scoreDependentSentences.remove(sentence)
                newLemma = sentence.getOnlyUncoveredLemma()
                orderedLearningList.append((newLemma, sentence))
                learnLemmaAndHandleSentencesWithLemmaFrequency(newLemma, notForcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)
                # NOTE(review): the original guard was `i % 1 == 0 or i < 4000`,
                # which is always true (`i % 1 == 0` holds for every i) — so the
                # print is unconditional. Possibly `i % 100 == 0` was intended;
                # behavior preserved here. TODO confirm with the author.
                print(str(i) + ", " + newLemma.getRawLemma() + ", " + str(newLemma.getFrequency()) + " -> " + sentence.rawSentence)
                i += 1
            assert i + len(lemmasByFrequency) == numberOfLemmas
            (highestScoringDirectlyLearnableSentencePair, highestScoringDirectlyLearnableSentencePairScore) = getHighestScoringDirectlyLearnablePair(directlyUnlockableLemmasScore, sentencePairsBySentenceScore)
        if hasLearnedAllLemmas(lemmasByFrequency):
            # All words were learned in the loop above.
            break
        # There are no more free words: time to learn a frequent word,
        # forced, without a supporting sentence.
        newLemma = getHighestScoringLemma(lemmasByFrequency)
        orderedLearningList.append((newLemma, "NONE"))
        learnLemmaAndHandleSentencesWithLemmaFrequency(newLemma, forcedToLearn, sentencePairsBySentenceScore, lemmasByFrequency, directlyUnlockableLemmasScore, directlyUnlockableLemmas, getSentenceScore)
        if i < 6000:
            print(str(i) + ", " + newLemma.getRawLemma() + ", " + str(newLemma.getFrequency()) + " -> " + "NONE")
        i += 1
        assert i + len(lemmasByFrequency) == numberOfLemmas
    print("Learned directly " + str(len(orderedLearningList)) + " of " + str(numberOfLemmas) + " lemmas.")
    return orderedLearningList
# NOTE(review): the next three statements are the tail of a word-stem
# bookkeeping branch whose enclosing `def`/`if` header is not visible in this
# chunk — presumably an `if wordConjugation in wordToWordStem:` lookup that
# bound `listOfWordStems` — TODO confirm against the full file.
        listOfWordStems.add(wordStem)
    else:
        # First stem seen for this conjugation: start a fresh set.
        wordToWordStem[wordConjugation] = {wordStem}


def hasLearnedAllLemmas(lemmasByFrequency):
    # All lemmas are learned once the frequency priority queue is empty.
    return len(lemmasByFrequency) == 0


def hasDirectlyLearnableSentence(directlyUnlockableLemmas):
    # A directly learnable sentence exists while at least one lemma remains
    # unlockable (i.e. some sentence has exactly one unknown word).
    can = len(directlyUnlockableLemmas) != 0
    return can


if __name__ == '__main__':
    # Toggle to rebuild the parsed-text database from the Texts folder
    # instead of loading the cached "everything" snapshot.
    shouldResetSaveData = False
    if shouldResetSaveData:
        TextParser.addAllTextsFromDirectoryToDatabase("Texts")
        TextParser.saveProcessedData(TextParser.everything, "everything")
    else:
        test = TextParser.loadProcessedData("everything")
    #numberOfConjugatedVerbs = 0
    #for lemma in TextParser.allLemmas:
    #    if lemma.endswith("ed"):
    #        numberOfConjugatedVerbs += 1
    learningList = learnLemmasByOrderOfScore(getSentenceScoreByNextUnlockableLemma)
    print(len(learningList))
    # Possible improvements:
    # Better lemma classifier. For example, many -ed conjugations get
    # classified as their own word. Could probably remove 1/10 to 1/20 of all
    # lemmas.
    # Remove names and the like.
    # Speed improvements.
    # Remove very short sentences and very long sentences.