def _pairGain(self, wordPair):
    """Return the good-to-bad occurrence ratio of ``wordPair``.

    The pair is counted with ``TextHelpers.countwordpairoccurences`` in
    ``self._goodText`` and in ``self._badText``; the result is the first
    count divided by the second.

    If the pair does not occur in ``self._badText`` at all, the divisor is
    replaced by ``0.1 / self._badWordCount`` instead of zero.
    """
    countPair = TextHelpers.countwordpairoccurences
    goodCount = countPair(wordPair, self._goodText)
    badCount = countPair(wordPair, self._badText)
    # A zero count cannot be used as a divisor; substitute a small value.
    divisor = badCount if badCount != 0.0 else 0.1 / self._badWordCount
    return goodCount / divisor
def _stringGain(self, string):
    """Return the good-to-bad occurrence ratio of ``string``.

    Occurrences are counted with
    ``TextHelpers.countstringoccurencesinword`` in ``self._goodText`` and in
    ``self._badText``. When the bad-text count is zero, the divisor
    ``0.1 / self._badWordCount`` is used in its place.
    """
    occurrencesInGood = TextHelpers.countstringoccurencesinword(
        string, self._goodText)
    occurrencesInBad = TextHelpers.countstringoccurencesinword(
        string, self._badText)
    if occurrencesInBad == 0:
        # Never seen in the bad text: divide by the small substitute value.
        return occurrencesInGood / (0.1 / self._badWordCount)
    return occurrencesInGood / occurrencesInBad
def __init__(self, keyword):
    """Load the texts for ``keyword`` and a second set for ``'-' + keyword``.

    For each of the two text sets the constructor stores the raw text, its
    word count (``TextHelpers.countwords``) and a copy with the stopwords
    removed (``TextHelpers.removeWords``).

    NOTE(review): the ``'-'`` prefix presumably asks ``Texts.GetText`` for
    texts that do *not* belong to the keyword -- confirm against ``Texts``.
    """
    stopwords = Texts.GetStopwords()
    self.__stopwords = stopwords

    # Texts fetched for the keyword itself.
    keywordTexts = Texts.GetText(keyword)
    self.__texts = keywordTexts
    self.__textWordCount = TextHelpers.countwords(keywordTexts)
    self.__reducedTexts = TextHelpers.removeWords(keywordTexts, stopwords)

    # Texts fetched for the '-'-prefixed keyword.
    remainingTexts = Texts.GetText('-' + keyword)
    self.__otherTexts = remainingTexts
    self.__otherTextsWordCount = TextHelpers.countwords(remainingTexts)
    self.__reducedOtherTexts = TextHelpers.removeWords(
        remainingTexts, stopwords)
def __init__(self, keyword):
    """Collect feature and non-feature texts for ``keyword`` from decisions.

    Stores the keyword and a paragraph margin of 3, fetches the decisions
    through ``__getDecisionsWithKeyordInKeywordsField``, splits them into
    feature texts and other texts via
    ``__getRelevantPargaphsFromDecisions``, and finally keeps a
    stopword-free version of both text sets.
    """
    # Set before the helper calls below, which run on this instance and may
    # read these attributes.
    self.__paragarphMargin = 3
    self.__keyword = keyword
    self.__decisions = self.__getDecisionsWithKeyordInKeywordsField()

    featureTexts, otherTexts = self.__getRelevantPargaphsFromDecisions()
    self.__featureTexts = featureTexts
    self.__otherTexts = otherTexts

    stopwords = Texts.GetStopwords()
    self.__stopwords = stopwords
    self.__reducedFeatureTexts = TextHelpers.removeWords(
        featureTexts, stopwords)
    self.__reducedOtherTexts = TextHelpers.removeWords(
        otherTexts, stopwords)
def _extractFeatures(self):
    """Build word features from the words most specific to the good text.

    Every distinct word of ``self._goodText`` is scored as

        count_in_good / self._goodWordCount
            - count_in_bad / self._badWordCount

    where the counts come from ``TextHelpers.countwordoccurences``. The
    ``self._featureNumber`` highest-scoring words are kept.

    Returns:
        A list of ``WordClassificationFeature`` objects, one per kept word,
        ordered from highest to lowest score.
    """
    goodText = self._goodText
    badText = self._badText
    goodWordCount = self._goodWordCount
    badWordCount = self._badWordCount
    countWord = TextHelpers.countwordoccurences

    # Tokenise the good text once; both frequencies are only ever needed
    # for exactly this vocabulary.
    vocabulary = set(TextHelpers.getwords(goodText))

    frequencyGaps = {
        word: (countWord(word, goodText) / goodWordCount
               - countWord(word, badText) / badWordCount)
        for word in vocabulary
    }

    keptWords = sorted(
        frequencyGaps, key=frequencyGaps.get,
        reverse=True)[:self._featureNumber]
    return [WordClassificationFeature(word) for word in keptWords]
def _extractFeatures(self):
    """Build word-pair features from the highest-gain pairs of the good text.

    Each pair returned by ``TextHelpers.getwordpairs(self._goodText)`` is
    scored with ``self._pairGain``; the ``self._featureNumber`` pairs with
    the largest gain are kept.

    Returns:
        A list of ``StringInWordPairClassificationFeature`` objects, ordered
        from highest to lowest gain.
    """
    gainByPair = {}
    for candidate in TextHelpers.getwordpairs(self._goodText):
        gainByPair[candidate] = self._pairGain(candidate)

    ranked = sorted(gainByPair, key=gainByPair.get, reverse=True)
    bestPairs = ranked[:self._featureNumber]

    features = []
    for (first, second) in bestPairs:
        features.append(StringInWordPairClassificationFeature(first, second))
    return features
def _extractFeatures(self):
    """Build string features from the highest-gain strings of the good text.

    Distinct words of ``self._goodText`` no longer than ``self._minlength``
    are scored whole with ``self._stringGain``; the remaining words are
    handed to ``self._getSubstringGains``. Both gain tables are merged,
    the ``self._featureNumber`` best entries are selected, and only those
    with a gain above 1.0 become features.

    Returns:
        A list of ``StringClassificationFeature`` objects, ordered from
        highest to lowest gain.
    """
    vocabulary = set(TextHelpers.getwords(self._goodText))

    shortWords = {
        candidate for candidate in vocabulary
        if len(candidate) <= self._minlength
    }
    shortGains = {}
    for candidate in shortWords:
        shortGains[candidate] = self._stringGain(candidate)

    longWords = vocabulary - shortWords
    # The dict returned by the helper is extended in place with the
    # short-word gains and then serves as the combined gain table.
    gains = self._getSubstringGains(longWords)
    gains.update(shortGains)

    ranked = sorted(gains, key=gains.get, reverse=True)

    features = []
    for candidate in ranked[:self._featureNumber]:
        if gains[candidate] > 1.0:
            features.append(StringClassificationFeature(candidate))
    return features
def CountOccurrences(self, text):
    """Count this feature's string pair in ``text``.

    Delegates to ``TextHelpers.countstringpairsinwords`` with
    ``self._string1``, ``self._string2``, the constant ``0`` and ``text``.

    NOTE(review): the meaning of the third argument (``0``) is defined by
    ``TextHelpers.countstringpairsinwords`` -- confirm there.
    """
    countPairs = TextHelpers.countstringpairsinwords
    occurrences = countPairs(self._string1, self._string2, 0, text)
    return occurrences
def CountOccurrences(self, text):
    """Count this feature's string in ``text``.

    Delegates to ``TextHelpers.countstringoccurencesinword`` with
    ``self._string`` and ``text`` and returns its result unchanged.
    """
    countString = TextHelpers.countstringoccurencesinword
    occurrences = countString(self._string, text)
    return occurrences
def CountOccurrences(self, text):
    """Count this feature's word in ``text``.

    Delegates to ``TextHelpers.countwordoccurences`` with ``self._word``
    and ``text`` and returns its result unchanged.
    """
    countWord = TextHelpers.countwordoccurences
    occurrences = countWord(self._word, text)
    return occurrences
def GetNonFeatureTextWordCount(self):
    """Return the word count of the stored non-feature ("other") texts.

    The count is computed on each call with ``TextHelpers.countwords``.
    """
    countWords = TextHelpers.countwords
    wordCount = countWords(self.__otherTexts)
    return wordCount
def GetFeatureTextWordCount(self):
    """Return the word count of the stored feature texts.

    The count is computed on each call with ``TextHelpers.countwords``.
    """
    countWords = TextHelpers.countwords
    wordCount = countWords(self.__featureTexts)
    return wordCount