def _pairGain(self, wordPair):
     """Return the good-to-bad ratio for ``wordPair``.

     The value is the result of ``TextHelpers.countwordpairoccurences`` on
     the good text divided by the same measure on the bad text.

     When the pair never occurs in the bad text the denominator is replaced
     by a small smoothing value, ``0.1 / self._badWordCount``, so unseen
     pairs get a large but finite gain.  If the bad corpus is empty
     (``self._badWordCount == 0``) the smoothing value falls back to ``0.1``
     instead of raising ``ZeroDivisionError``.

     :param wordPair: the pair to score; passed unchanged to ``TextHelpers``.
     :return: the gain as a number (higher means more typical of good text).
     """
     goodFrequency = TextHelpers.countwordpairoccurences(
         wordPair, self._goodText)
     badFrequency = TextHelpers.countwordpairoccurences(
         wordPair, self._badText)
     if badFrequency == 0:
         # ``or 1`` only kicks in for an empty bad corpus; every non-zero
         # word count yields exactly the original smoothing value.
         badFrequency = 0.1 / (self._badWordCount or 1)
     return goodFrequency / badFrequency
 def _stringGain(self, string):
     """Return the good-to-bad ratio for ``string``.

     The value is the result of ``TextHelpers.countstringoccurencesinword``
     on the good text divided by the same measure on the bad text.

     When the string never occurs in the bad text the denominator is
     replaced by a small smoothing value, ``0.1 / self._badWordCount``.  If
     the bad corpus is empty (``self._badWordCount == 0``) the smoothing
     value falls back to ``0.1`` instead of raising ``ZeroDivisionError``.

     :param string: the string to score; passed unchanged to ``TextHelpers``.
     :return: the gain as a number (higher means more typical of good text).
     """
     goodFrequency = TextHelpers.countstringoccurencesinword(
         string, self._goodText)
     badFrequency = TextHelpers.countstringoccurencesinword(
         string, self._badText)
     if badFrequency == 0:
         # ``or 1`` only kicks in for an empty bad corpus; every non-zero
         # word count yields exactly the original smoothing value.
         badFrequency = 0.1 / (self._badWordCount or 1)
     return goodFrequency / badFrequency
    def __init__(self, keyword):
        """Load the texts for ``keyword`` and the texts for ``'-' + keyword``.

        For both text collections the word count and a copy with the stop
        words removed are stored on the instance.

        :param keyword: key handed to ``Texts.GetText``.
        """
        stopwords = Texts.GetStopwords()
        self.__stopwords = stopwords

        # Texts fetched for the keyword itself.
        texts = Texts.GetText(keyword)
        self.__texts = texts
        self.__textWordCount = TextHelpers.countwords(texts)
        self.__reducedTexts = TextHelpers.removeWords(texts, stopwords)

        # NOTE(review): the '-' prefix presumably selects the complementary
        # texts (those not belonging to the keyword) -- confirm against
        # Texts.GetText.
        otherTexts = Texts.GetText('-' + keyword)
        self.__otherTexts = otherTexts
        self.__otherTextsWordCount = TextHelpers.countwords(otherTexts)
        self.__reducedOtherTexts = TextHelpers.removeWords(otherTexts,
                                                           stopwords)
Esempio n. 4
0
    def __init__(self, keyword):
        """Collect feature and non-feature paragraphs for ``keyword``.

        The decisions are looked up first, then split into feature texts and
        other texts; finally a stop-word-free copy of each collection is
        stored.

        :param keyword: stored on the instance before the private lookup
            helpers are called.
        """
        # These attributes are assigned before the private helpers run, in
        # the same order as before; the helpers presumably read them.
        self.__paragarphMargin = 3
        self.__keyword = keyword
        self.__decisions = self.__getDecisionsWithKeyordInKeywordsField()

        featureTexts, otherTexts = self.__getRelevantPargaphsFromDecisions()
        self.__featureTexts = featureTexts
        self.__otherTexts = otherTexts

        stopwords = Texts.GetStopwords()
        self.__stopwords = stopwords
        self.__reducedFeatureTexts = TextHelpers.removeWords(
            featureTexts, stopwords)
        self.__reducedOtherTexts = TextHelpers.removeWords(
            otherTexts, stopwords)
    def _extractFeatures(self):
        """Return word features that are most over-represented in good text.

        Every distinct word of the good text is scored with its relative
        frequency in the good text minus its relative frequency in the bad
        text.  The ``self._featureNumber`` highest-scoring words are wrapped
        in ``WordClassificationFeature`` objects.

        An empty bad corpus (``self._badWordCount == 0``) is treated as a
        bad frequency of zero for every word instead of raising
        ``ZeroDivisionError``.

        :return: list of ``WordClassificationFeature``, best score first.
        """
        # Build the candidate set once; both frequencies are only ever
        # needed for words that occur in the good text.
        goodWords = set(TextHelpers.getwords(self._goodText))
        goodWordCount = self._goodWordCount
        badWordCount = self._badWordCount

        def frequencyGap(word):
            goodFrequency = TextHelpers.countwordoccurences(
                word, self._goodText) / goodWordCount
            if not badWordCount:
                # No bad text at all: nothing to subtract.
                return goodFrequency
            badFrequency = TextHelpers.countwordoccurences(
                word, self._badText) / badWordCount
            return goodFrequency - badFrequency

        gaps = {word: frequencyGap(word) for word in goodWords}

        keptWords = sorted(gaps, key=gaps.get,
                           reverse=True)[:self._featureNumber]

        return [WordClassificationFeature(word) for word in keptWords]
    def _extractFeatures(self):
        """Return word-pair features ranked by their gain.

        Each pair returned by ``TextHelpers.getwordpairs`` for the good text
        is scored with ``self._pairGain``; the ``self._featureNumber`` pairs
        with the highest gain are turned into
        ``StringInWordPairClassificationFeature`` objects.

        :return: list of features, highest gain first.
        """
        gainByPair = {}
        for candidate in TextHelpers.getwordpairs(self._goodText):
            gainByPair[candidate] = self._pairGain(candidate)

        ranked = sorted(gainByPair, key=gainByPair.get, reverse=True)
        best = ranked[:self._featureNumber]

        # Each kept pair is unpacked into the two constructor arguments.
        return [
            StringInWordPairClassificationFeature(first, second)
            for first, second in best
        ]
    def _extractFeatures(self):
        """Return string features whose gain exceeds 1.0.

        Words of the good text no longer than ``self._minlength`` are scored
        as a whole with ``self._stringGain``; the remaining words are handed
        to ``self._getSubstringGains``.  Both gain tables are merged, the
        ``self._featureNumber`` best entries are selected, and those with a
        gain above 1.0 become ``StringClassificationFeature`` objects.

        :return: list of features, highest gain first.
        """
        vocabulary = set(TextHelpers.getwords(self._goodText))

        shortWords = set()
        for word in vocabulary:
            if len(word) <= self._minlength:
                shortWords.add(word)
        shortGains = {word: self._stringGain(word) for word in shortWords}

        # The substring table is extended in place; on a key clash the
        # whole-word gain of a short word wins.
        gains = self._getSubstringGains(vocabulary - shortWords)
        gains.update(shortGains)

        ranked = sorted(gains, key=gains.get, reverse=True)
        best = ranked[:self._featureNumber]

        return [
            StringClassificationFeature(candidate) for candidate in best
            if gains[candidate] > 1.0
        ]
Esempio n. 8
0
 def CountOccurrences(self, text):
     """Return the string-pair count of this feature for ``text``.

     Delegates to ``TextHelpers.countstringpairsinwords`` with the two
     strings stored on the instance.

     :param text: the text to examine; passed through unchanged.
     """
     first, second = self._string1, self._string2
     # NOTE(review): the literal 0 is forwarded as-is; its meaning is
     # defined by TextHelpers.countstringpairsinwords -- confirm there.
     return TextHelpers.countstringpairsinwords(first, second, 0, text)
Esempio n. 9
0
 def CountOccurrences(self, text):
     """Return the count of this feature's string for ``text``.

     Delegates to ``TextHelpers.countstringoccurencesinword``.

     :param text: the text to examine; passed through unchanged.
     """
     needle = self._string
     return TextHelpers.countstringoccurencesinword(needle, text)
Esempio n. 10
0
 def CountOccurrences(self, text):
     """Return the count of this feature's word for ``text``.

     Delegates to ``TextHelpers.countwordoccurences``.

     :param text: the text to examine; passed through unchanged.
     """
     word = self._word
     return TextHelpers.countwordoccurences(word, text)
Esempio n. 11
0
 def GetNonFeatureTextWordCount(self):
     """Return ``TextHelpers.countwords`` of the stored non-feature texts."""
     nonFeatureTexts = self.__otherTexts
     return TextHelpers.countwords(nonFeatureTexts)
Esempio n. 12
0
 def GetFeatureTextWordCount(self):
     """Return ``TextHelpers.countwords`` of the stored feature texts."""
     featureTexts = self.__featureTexts
     return TextHelpers.countwords(featureTexts)