def calculateEmissionProbabilities(self, state: object, observations: list, emittedSymbols: list) -> dict:
    """
    calculateEmissionProbabilities calculates the emission probabilities for a specific state. The method takes the
    state, an array of observations (where each observation is an array of states) and an array of instances (where
    each instance is an array of emitted symbols).

    PARAMETERS
    ----------
    state : object
        The state for which the emission probabilities are calculated.
    observations : list
        An array of instances, where each instance consists of an array of states.
    emittedSymbols : list
        An array of instances, where each instance consists of an array of symbols.

    RETURNS
    -------
    dict
        Emission probabilities for a single state. Contains a probability for each emitted symbol.
    """
    counts = CounterHashMap()
    emissionProbabilities = {}
    for i in range(len(observations)):
        for j in range(len(observations[i])):
            currentState = observations[i][j]
            currentSymbol = emittedSymbols[i][j]
            if currentState == state:
                counts.put(currentSymbol)
    total = counts.sumOfCounts()
    for symbol in counts:
        emissionProbabilities[symbol] = counts[symbol] / total
    return emissionProbabilities
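# Illustration (not from the original source): a standalone sketch that mirrors the counting logic
# above with plain dicts, using made-up toy states and symbols. The key assumption is that
# observations[i][j] and emittedSymbols[i][j] are aligned element by element.
observations = [["HOT", "COLD", "HOT"], ["COLD", "COLD"]]
emittedSymbols = [["3", "1", "2"], ["1", "1"]]
state = "HOT"

counts = {}
for stateSequence, symbolSequence in zip(observations, emittedSymbols):
    for currentState, currentSymbol in zip(stateSequence, symbolSequence):
        if currentState == state:
            counts[currentSymbol] = counts.get(currentSymbol, 0) + 1

total = sum(counts.values())
emissionProbabilities = {symbol: count / total for symbol, count in counts.items()}
print(emissionProbabilities)  # {'3': 0.5, '2': 0.5} -> "HOT" emitted "3" once and "2" once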
def test_MaxThreshold2(self):
    counterHashMap = CounterHashMap()
    for i in range(1000000):
        counterHashMap.put(str(randrange(100)))
    probability = counterHashMap.count(counterHashMap.max()) / 1000000.0
    self.assertIsNotNone(counterHashMap.max(probability - 0.001))
    self.assertIsNone(counterHashMap.max(probability + 0.001))
def test_Put2(self):
    counterHashMap = CounterHashMap()
    for i in range(1000):
        counterHashMap.put(randrange(1000))
    count = 0
    for i in range(1000):
        count += counterHashMap.count(i)
    self.assertEqual(1000, count)
def test_Add3(self):
    counterHashMap1 = CounterHashMap()
    for i in range(1000):
        counterHashMap1.put(i)
    counterHashMap2 = CounterHashMap()
    for i in range(500, 1000):
        counterHashMap2.putNTimes(1000 + i, i + 1)
    counterHashMap1.add(counterHashMap2)
    self.assertEqual(1500, len(counterHashMap1))
def test_NERCorpus(self):
    counter = CounterHashMap()
    nerCorpus = NERCorpus("../nerdata.txt")
    self.assertEqual(27556, nerCorpus.sentenceCount())
    self.assertEqual(492233, nerCorpus.numberOfWords())
    for i in range(nerCorpus.sentenceCount()):
        namedEntitySentence = nerCorpus.getSentence(i)
        for j in range(namedEntitySentence.wordCount()):
            namedEntityWord = namedEntitySentence.getWord(j)
            counter.put(namedEntityWord.getNamedEntityType())
    self.assertEqual(438976, counter[NamedEntityType.NONE])
    self.assertEqual(23878, counter[NamedEntityType.PERSON])
    self.assertEqual(16931, counter[NamedEntityType.ORGANIZATION])
    self.assertEqual(12448, counter[NamedEntityType.LOCATION])
def test_Add2(self):
    counterHashMap1 = CounterHashMap()
    counterHashMap1.put("item1")
    counterHashMap1.put("item2")
    counterHashMap1.put("item1")
    counterHashMap1.put("item2")
    counterHashMap1.put("item1")
    counterHashMap2 = CounterHashMap()
    counterHashMap2.put("item4")
    counterHashMap2.putNTimes("item5", 4)
    counterHashMap2.put("item2")
    counterHashMap1.add(counterHashMap2)
    self.assertEqual(3, counterHashMap1.count("item1"))
    self.assertEqual(3, counterHashMap1.count("item2"))
    self.assertEqual(1, counterHashMap1.count("item4"))
    self.assertEqual(4, counterHashMap1.count("item5"))
def test_TransitionWith(self):
    transitionCounts = CounterHashMap()
    for state in self.stateList:
        transitions = self.fsm.getTransitions(state)
        for transition in transitions:
            transitionCounts.put(str(transition))
    topList = transitionCounts.topN(5)
    self.assertEqual("0", topList[0][1])
    self.assertEqual(111, topList[0][0])
    self.assertEqual("lAr", topList[1][1])
    self.assertEqual(37, topList[1][0])
    self.assertEqual("DHr", topList[2][1])
    self.assertEqual(28, topList[2][0])
    self.assertEqual("Hn", topList[3][1])
    self.assertEqual(24, topList[3][0])
    self.assertEqual("lArH", topList[4][1])
    self.assertEqual(23, topList[4][0])
def test_TransitionWithName(self):
    transitionCounts = CounterHashMap()
    for state in self.stateList:
        transitions = self.fsm.getTransitions(state)
        for transition in transitions:
            transitionCounts.put(transition.withName())
    topList = transitionCounts.topN(5)
    self.assertEqual(None, topList[0][1])
    self.assertEqual(52, topList[0][0])
    self.assertEqual("^DB+VERB+CAUS", topList[1][1])
    self.assertEqual(33, topList[1][0])
    self.assertEqual("^DB+VERB+PASS", topList[2][1])
    self.assertEqual(31, topList[2][0])
    self.assertEqual("A3PL", topList[3][1])
    self.assertEqual(28, topList[3][0])
    self.assertEqual("LOC", topList[4][1])
    self.assertEqual(24, topList[4][0])
def getMaximum(classLabels: list) -> str:
    """
    Given an array of class labels, returns the most frequently occurring one.

    PARAMETERS
    ----------
    classLabels : list
        An array of class labels.

    RETURNS
    -------
    str
        The class label that occurs most often in the array of class labels (the mode of the class label list).
    """
    frequencies = CounterHashMap()
    for label in classLabels:
        frequencies.put(label)
    return frequencies.max()
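# Illustration (not from the original source): a hypothetical call with made-up labels showing
# the expected behavior of getMaximum.
labels = ["spam", "ham", "spam", "spam", "ham"]
print(getMaximum(labels))  # "spam" -> it occurs 3 times out of 5, more than any other label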
def classify(self, actualClass: str, predictedClass: str):
    """
    The classify method takes two strings, the actual class and the predicted class, as inputs. If the matrix
    dictionary already contains the given actual class as a key, the CounterHashMap stored under that key is
    retrieved; otherwise a new CounterHashMap is created. The given predicted class is then put into this
    CounterHashMap, which is stored back in the matrix dictionary under the actual class key.

    PARAMETERS
    ----------
    actualClass : str
        String input actual class.
    predictedClass : str
        String input predicted class.
    """
    if actualClass in self.__matrix:
        counterHashMap = self.__matrix[actualClass]
    else:
        counterHashMap = CounterHashMap()
    counterHashMap.put(predictedClass)
    self.__matrix[actualClass] = counterHashMap
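# Illustration (not from the original source): a standalone sketch that mirrors the classify
# logic with a plain dict of dicts instead of CounterHashMap values.
matrix = {}

def classify(actualClass, predictedClass):
    row = matrix.setdefault(actualClass, {})
    row[predictedClass] = row.get(predictedClass, 0) + 1

classify("cat", "cat")
classify("cat", "dog")
classify("cat", "cat")
classify("dog", "dog")
print(matrix)  # {'cat': {'cat': 2, 'dog': 1}, 'dog': {'dog': 1}}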
def test_StartEndStates(self):
    endStateCount = 0
    for state in self.stateList:
        if state.isEndState():
            endStateCount = endStateCount + 1
    self.assertEqual(35, endStateCount)
    posCounts = CounterHashMap()
    for state in self.stateList:
        posCounts.put(state.getPos())
    self.assertEqual(1, posCounts.get("HEAD"))
    self.assertEqual(6, posCounts.get("PRON"))
    self.assertEqual(1, posCounts.get("PROP"))
    self.assertEqual(8, posCounts.get("NUM"))
    self.assertEqual(7, posCounts.get("ADJ"))
    self.assertEqual(1, posCounts.get("INTERJ"))
    self.assertEqual(1, posCounts.get("DET"))
    self.assertEqual(1, posCounts.get("ADVERB"))
    self.assertEqual(1, posCounts.get("QUES"))
    self.assertEqual(1, posCounts.get("CONJ"))
    self.assertEqual(26, posCounts.get("VERB"))
    self.assertEqual(1, posCounts.get("POSTP"))
    self.assertEqual(1, posCounts.get("DUP"))
    self.assertEqual(11, posCounts.get("NOUN"))
def test_Max(self):
    counterHashMap = CounterHashMap()
    counterHashMap.put("item1")
    counterHashMap.put("item2")
    counterHashMap.put("item3")
    counterHashMap.put("item1")
    counterHashMap.put("item2")
    counterHashMap.put("item1")
    self.assertEqual("item1", counterHashMap.max())
def test_DependencyCorpus(self):
    relationCounts = CounterHashMap()
    corpus = TurkishDependencyTreeBankCorpus("../metu-treebank.xml")
    self.assertEqual(5635, corpus.sentenceCount())
    wordCount = 0
    for i in range(corpus.sentenceCount()):
        sentence = corpus.getSentence(i)
        wordCount += sentence.wordCount()
        for j in range(sentence.wordCount()):
            word = sentence.getWord(j)
            if word.getRelation() is not None:
                relationCounts.put(word.getRelation().getTurkishDependencyType())
    self.assertEqual(11692, relationCounts.get(TurkishDependencyType.MODIFIER))
    self.assertEqual(903, relationCounts.get(TurkishDependencyType.INTENSIFIER))
    self.assertEqual(1142, relationCounts.get(TurkishDependencyType.LOCATIVE_ADJUNCT))
    self.assertEqual(240, relationCounts.get(TurkishDependencyType.VOCATIVE))
    self.assertEqual(7261, relationCounts.get(TurkishDependencyType.SENTENCE))
    self.assertEqual(16, relationCounts.get(TurkishDependencyType.EQU_ADJUNCT))
    self.assertEqual(159, relationCounts.get(TurkishDependencyType.NEGATIVE_PARTICLE))
    self.assertEqual(4481, relationCounts.get(TurkishDependencyType.SUBJECT))
    self.assertEqual(2476, relationCounts.get(TurkishDependencyType.COORDINATION))
    self.assertEqual(2050, relationCounts.get(TurkishDependencyType.CLASSIFIER))
    self.assertEqual(73, relationCounts.get(TurkishDependencyType.COLLOCATION))
    self.assertEqual(1516, relationCounts.get(TurkishDependencyType.POSSESSOR))
    self.assertEqual(523, relationCounts.get(TurkishDependencyType.ABLATIVE_ADJUNCT))
    self.assertEqual(23, relationCounts.get(TurkishDependencyType.FOCUS_PARTICLE))
    self.assertEqual(1952, relationCounts.get(TurkishDependencyType.DETERMINER))
    self.assertEqual(1361, relationCounts.get(TurkishDependencyType.DATIVE_ADJUNCT))
    self.assertEqual(202, relationCounts.get(TurkishDependencyType.APPOSITION))
    self.assertEqual(289, relationCounts.get(TurkishDependencyType.QUESTION_PARTICLE))
    self.assertEqual(597, relationCounts.get(TurkishDependencyType.S_MODIFIER))
    self.assertEqual(10, relationCounts.get(TurkishDependencyType.ETOL))
    self.assertEqual(8338, relationCounts.get(TurkishDependencyType.OBJECT))
    self.assertEqual(271, relationCounts.get(TurkishDependencyType.INSTRUMENTAL_ADJUNCT))
    self.assertEqual(85, relationCounts.get(TurkishDependencyType.RELATIVIZER))
    self.assertEqual(53993, wordCount)
def test_MaxThreshold1(self):
    counterHashMap = CounterHashMap()
    counterHashMap.put("item1")
    counterHashMap.put("item2")
    counterHashMap.put("item3")
    counterHashMap.put("item1")
    counterHashMap.put("item2")
    counterHashMap.put("item1")
    self.assertEqual("item1", counterHashMap.max(0.4999))
    self.assertNotEqual("item1", counterHashMap.max(0.5001))
def test_TopN1(self):
    counterHashMap = CounterHashMap()
    counterHashMap.put("item1")
    counterHashMap.put("item2")
    counterHashMap.put("item3")
    counterHashMap.put("item1")
    counterHashMap.put("item2")
    counterHashMap.put("item1")
    self.assertEqual("item1", counterHashMap.topN(1)[0][1])
    self.assertEqual("item2", counterHashMap.topN(2)[1][1])
    self.assertEqual("item3", counterHashMap.topN(3)[2][1])
def test_Put1(self):
    counterHashMap = CounterHashMap()
    counterHashMap.put("item1")
    counterHashMap.put("item2")
    counterHashMap.put("item3")
    counterHashMap.put("item1")
    counterHashMap.put("item2")
    counterHashMap.put("item1")
    self.assertEqual(3, counterHashMap.count("item1"))
    self.assertEqual(2, counterHashMap.count("item2"))
    self.assertEqual(1, counterHashMap.count("item3"))
def test_Add1(self):
    counterHashMap1 = CounterHashMap()
    counterHashMap1.put("item1")
    counterHashMap1.put("item2")
    counterHashMap1.put("item3")
    counterHashMap1.put("item1")
    counterHashMap1.put("item2")
    counterHashMap1.put("item1")
    counterHashMap2 = CounterHashMap()
    counterHashMap2.putNTimes("item1", 2)
    counterHashMap2.putNTimes("item2", 3)
    counterHashMap2.putNTimes("item3", 6)
    counterHashMap2.putNTimes("item1", 2)
    counterHashMap2.putNTimes("item2", 3)
    counterHashMap2.putNTimes("item1", 2)
    counterHashMap1.add(counterHashMap2)
    self.assertEqual(9, counterHashMap1.count("item1"))
    self.assertEqual(8, counterHashMap1.count("item2"))
    self.assertEqual(7, counterHashMap1.count("item3"))
def test_SumOfCounts(self):
    counterHashMap = CounterHashMap()
    for i in range(1000):
        counterHashMap.put(randrange(1000))
    self.assertEqual(1000, counterHashMap.sumOfCounts())
class Corpus:

    paragraphs: list
    sentences: list
    wordList: CounterHashMap
    fileName: str

    def __init__(self, fileName=None, splitterOrChecker=None):
        """
        Constructor of Corpus class which takes a file name as an input. It then reads the input file line by line
        and calls the addSentence method with each read line.

        PARAMETERS
        ----------
        fileName : str
            String file name input that will be read.
        """
        self.sentences = []
        self.paragraphs = []
        self.wordList = CounterHashMap()
        if fileName is not None:
            self.fileName = fileName
            file = open(fileName, "r", encoding='utf8')
            lines = file.readlines()
            if splitterOrChecker is not None:
                if isinstance(splitterOrChecker, SentenceSplitter):
                    for line in lines:
                        sentences = splitterOrChecker.split(line.strip())
                        paragraph = Paragraph()
                        for sentence in sentences:
                            paragraph.addSentence(sentence)
                        self.addParagraph(paragraph)
                elif isinstance(splitterOrChecker, LanguageChecker):
                    for line in lines:
                        sentence = Sentence(line.strip(), splitterOrChecker)
                        self.addSentence(sentence)
            else:
                for line in lines:
                    self.addSentence(Sentence(line.strip()))

    def combine(self, corpus: Corpus):
        """
        The combine method takes a Corpus as an input and adds each sentence of its sentences list.

        PARAMETERS
        ----------
        corpus : Corpus
            Corpus type input.
        """
        for sentence in corpus.sentences:
            self.addSentence(sentence)

    def addSentence(self, s: Sentence):
        """
        The addSentence method takes a Sentence as an input. It adds the given input to the sentences list, loops
        through each word in the sentence, and puts these words into the wordList CounterHashMap.

        PARAMETERS
        ----------
        s : Sentence
            Sentence type input that will be added to the sentences list and whose words will be added to the
            wordList CounterHashMap.
        """
        self.sentences.append(s)
        for i in range(s.wordCount()):
            w = s.getWord(i)
            self.wordList.put(w)

    def numberOfWords(self) -> int:
        """
        The numberOfWords method loops through the sentences list and accumulates the number of words in each
        sentence.

        RETURNS
        -------
        int
            size, which holds the total number of words.
        """
        size = 0
        for s in self.sentences:
            size += s.wordCount()
        return size

    def contains(self, word: str) -> bool:
        """
        The contains method takes a String word as an input and checks whether the wordList CounterHashMap has the
        given word; it returns True if so, False otherwise.

        PARAMETERS
        ----------
        word : str
            String input to check.

        RETURNS
        -------
        bool
            True if wordList has the given word, False otherwise.
        """
        return Word(word) in self.wordList

    def addParagraph(self, p: Paragraph):
        """
        The addParagraph method takes a Paragraph type input. It gets the sentences in the given paragraph, adds
        them to the sentences list, and adds the words in those sentences to the wordList CounterHashMap.

        PARAMETERS
        ----------
        p : Paragraph
            Paragraph type input to add sentences and wordList.
        """
        self.paragraphs.append(p)
        for i in range(p.sentenceCount()):
            self.addSentence(p.getSentence(i))

    def getFileName(self) -> str:
        """
        Getter for the file name.

        RETURNS
        -------
        str
            file name.
        """
        return self.fileName

    def getWordList(self) -> set:
        """
        Getter for the wordList.

        RETURNS
        -------
        set
            The keySet of wordList.
        """
        return set(self.wordList.keys())

    def wordCount(self) -> int:
        """
        The wordCount method returns the size of the wordList CounterHashMap.

        RETURNS
        -------
        int
            The size of the wordList CounterHashMap.
        """
        return len(self.wordList)

    def getCount(self, word: Word) -> int:
        """
        The getCount method returns the count value of the given word.

        PARAMETERS
        ----------
        word : Word
            Word type input to check.

        RETURNS
        -------
        int
            The count value of the given word.
""" return self.wordList[word] def sentenceCount(self) -> int: """ The sentenceCount method returns the size of the sentences list. RETURNS ------- int The size of the sentences list. """ return len(self.sentences) def getSentence(self, index: int) -> Sentence: """ Getter for getting a sentence at given index. PARAMETERS ---------- index : int index to get sentence from. RETURNS ------- Sentence The sentence at given index. """ return self.sentences[index] def paragraphCount(self) -> int: """ The paragraphCount method returns the size of the paragraphs list. RETURNS ------- int The size of the paragraphs list. """ return len(self.paragraphs) def getParagraph(self, index: int) -> Paragraph: """ Getter for getting a paragraph at given index. PARAMETERS ---------- index : int index to get paragraph from. RETURNS ------- Paragraph The paragraph at given index. """ return self.paragraphs[index] def maxSentenceLength(self) -> int: """ The maxSentenceLength method finds the sentence with the maximum number of words and returns this number. RETURNS ------- int maximum length. """ maxLength = 0 for s in self.sentences: if s.wordCount() > maxLength: maxLength = s.wordCount() return maxLength def getAllWordsAsList(self) -> list: """ The getAllWordsAsList method creates new list of lists and adds each word in each sentence of sentences list into new list. RETURNS ------- list Newly created and populated list. """ allWords = [] for i in range(self.sentenceCount()): allWords.append(self.getSentence(i).getWords()) return allWords def shuffleSentences(self, seed: int): """ The shuffleSentences method randomly shuffles sentences list with given seed value. PARAMETERS ---------- seed : int value to randomize shuffling. """ random.seed(seed) random.shuffle(self.sentences) def getTrainCorpus(self, foldNo: int, foldCount: int) -> Corpus: """ The getTrainCorpus method takes two integer inputs foldNo and foldCount for determining train data size and count of fold respectively. Initially creates a new empty Corpus, then finds the sentenceCount as N. Then, starting from the index 0 it loops through the index (foldNo * N) / foldCount and add each sentence of sentences list to new Corpus. Later on, starting from the index ((foldNo + 1) * N) / foldCount, it loops through the index N and add each sentence of sentences list to new Corpus. PARAMETERS ---------- foldNo : int Integer input for train set size. foldCount : int Integer input for counting fold. RETURNS ------- Corpus The newly created and populated Corpus. """ trainCorpus = Corpus() N = self.sentenceCount() for i in range((foldNo * N) // foldCount): trainCorpus.addSentence(self.sentences[i]) for i in range(((foldNo + 1) * N) // foldCount, N): trainCorpus.addSentence(self.sentences[i]) return trainCorpus def getTestCorpus(self, foldNo: int, foldCount: int) -> Corpus: """ The getTestCorpus method takes two integer inputs foldNo and foldCount for determining test data size and count of fold respectively. Initially creates a new empty Corpus, then finds the sentenceCount as N. Then, starting from the index (foldNo * N) / foldCount it loops through the index ((foldNo + 1) * N) / foldCount and add each sentence of sentences list to new Corpus. PARAMETERS ---------- foldNo : int Integer input for test size. foldCount : int Integer input counting fold. RETURNS ------- Corpus The newly created and populated Corpus. 
""" testCorpus = Corpus() N = self.sentenceCount() for i in range((foldNo * N) // foldCount, ((foldNo + 1) * N) // foldCount): testCorpus.addSentence(self.sentences[i]) return testCorpus
def nextWordPos(nextParseList: FsmParseList) -> str:
    # Counts the part-of-speech tag of every parse in the list and returns the most frequent one.
    _map = CounterHashMap()
    for i in range(nextParseList.size()):
        _map.put(nextParseList.getFsmParse(i).getPos())
    return _map.max()
def test_Put3(self):
    counterHashMap = CounterHashMap()
    for i in range(1000000):
        counterHashMap.put(randrange(1000000))
    self.assertAlmostEqual(len(counterHashMap) / 1000000.0, 0.632, 3)