Beispiel #1
0
	def ngramStemmer (self, wordList, size, equality):
		"""Reduce wordList according to the n-gram stemming method.

		Every pair of distinct terms is compared through
		Advas.compareNgramLists(). When the similarity of a pair exceeds
		`equality`, the two terms are conflated: the longer term is dropped
		and the shorter one is kept (on equal length the term that appears
		later in wordList is dropped).

		Parameters:
			wordList: sequence of hashable terms (strings) to reduce.
			size:     length of the n-grams each term is split into.
			          NOTE: earlier revisions ignored this argument and
			          always used bigrams (2).
			equality: similarity threshold; a pair is conflated only when
			          its similarity is strictly greater than this value.

		Returns:
			A list of the remaining terms, without duplicates, in the order
			of their first occurrence in wordList.
		"""

		# Drop duplicates up front while keeping first-occurrence order.
		# Otherwise a term that occurs twice would be compared with itself
		# and - presumably scoring maximal similarity - be removed entirely.
		uniqueTerms = []
		seenTerms = set()
		for term in wordList:
			if term not in seenTerms:
				seenTerms.add(term)
				uniqueTerms.append(term)

		# with fewer than two distinct terms there is nothing to compare
		if len(uniqueTerms) < 2:
			return uniqueTerms

		ngramAdvas = Advas("","")

		# Derive the n-grams of every term exactly once instead of once per
		# compared pair.
		# NOTE(review): assumes compareNgramLists() does not modify the
		# lists it receives - confirm against its implementation.
		ngramLists = []
		for term in uniqueTerms:
			termNgram = Ngram(term, size)
			termNgram.deriveNgrams()
			ngramLists.append(termNgram.getNgrams())

		# collect the terms to be removed
		stopTerms = set()
		for i, term1 in enumerate(uniqueTerms):
			for j in range(i):
				term2 = uniqueTerms[j]

				# calculate n-gram value
				ngramSimilarity = ngramAdvas.compareNgramLists(ngramLists[i], ngramLists[j])

				if ngramSimilarity > equality:
					# these terms are so similar that they can be conflated:
					# remove the longer term, keep the shorter one
					if len(term2) > len(term1):
						stopTerms.add(term2)
					else:
						stopTerms.add(term1)

		# remove all the items which were marked for removal
		return [term for term in uniqueTerms if term not in stopTerms]
		 
Beispiel #2
0
    def getNgramsByWord(self, word, ngramSize):
        """Return the n-grams of a single word.

        An Ngram object is built from `word` and `ngramSize`; its n-grams
        are returned when deriveNgrams() reports success. An empty list is
        returned when `ngramSize` is falsy (0, None, ...) or when the
        derivation does not succeed.
        """
        if ngramSize:
            ngram = Ngram(word, ngramSize)
            derived = ngram.deriveNgrams()
            if derived:
                return ngram.getNgrams()

        # no usable size given, or nothing could be derived
        return []
Beispiel #3
0
    def getNgramsByLine(self, ngramSize):
        """Return one n-gram list per line of the text.

        The lines come from self.splitParagraph(). For each line an Ngram
        object is built with `ngramSize`; the line contributes its n-grams
        when deriveNgrams() reports success and an empty list otherwise, so
        the result has exactly one entry per line. A falsy `ngramSize`
        (0, None, ...) yields an empty list without looking at the text.
        """
        if ngramSize:
            ngramsPerLine = []

            # process the text line by line
            for singleLine in self.splitParagraph():
                ngram = Ngram(singleLine, ngramSize)
                ngramsPerLine.append(ngram.getNgrams() if ngram.deriveNgrams() else [])

            return ngramsPerLine

        return []
Beispiel #4
0
    def calcSuccVariety(self):
        """Collect, for every letter of self.term, the letters that follow it.

        The term is split into n-grams of size 2 (two-letter combinations).
        For each combination the first element is used as key and the second
        element is recorded as one of its successors.

        Returns:
            A dict mapping each leading letter to the list of distinct
            letters that directly follow it, in order of first appearance
            in the derived n-grams.
        """

        # derive two-letter combinations
        ngramObject = Ngram(self.term, 2)
        ngramObject.deriveNgrams()

        # count appearances of the second letter
        varietyList = {}
        for entry in ngramObject.getNgrams():
            letter1 = entry[0]
            letter2 = entry[1]

            # setdefault replaces dict.has_key(), which no longer exists in
            # Python 3; it creates the successor list on first sight
            successors = varietyList.setdefault(letter1, [])
            if letter2 not in successors:
                # repeated combinations are recorded only once
                successors.append(letter2)

        return varietyList