Beispiel #1
0
    def __init__(
        self,
        path_to_stop="./english.dat",
        additional_stop=None,
        remove_stop=None,
        min_unigram=10,
        min_ngram=5,
        exclude=None,
        language="english",
    ):
        """Build the stopword set and create empty count tables.

        Args:
            path_to_stop: Argument handed to ``StopWords``.
            additional_stop: Optional iterable of words added to the
                stopword set. ``None`` means no additions.
            remove_stop: Optional iterable of words removed from the
                stopword set when present. ``None`` means no removals.
            min_unigram: Stored as ``_min_uni`` (presumably the minimum
                unigram count threshold -- confirm against its users).
            min_ngram: Stored as ``_min_ngram`` (presumably the minimum
                n-gram count threshold -- confirm against its users).
            exclude: Optional iterable of words stored as the set
                ``_exclude_from_bigram``. ``None`` means empty.
            language: Key used to index the ``StopWords`` object; also
                echoed in the progress message.
        """
        # None sentinels replace the former mutable list defaults, which
        # were shared between every call of this constructor.
        additional_stop = () if additional_stop is None else additional_stop
        remove_stop = () if remove_stop is None else remove_stop
        exclude = () if exclude is None else exclude

        print("Loading stopwords for %s" % language)
        self._stopwords = StopWords(path_to_stop)[language] | set(additional_stop)
        for ii in remove_stop:
            if ii in self._stopwords:
                self._stopwords.remove(ii)

        self._exclude_from_bigram = set(exclude)

        self._min_uni = min_unigram
        self._min_ngram = min_ngram

        # Count tables; every missing key reads as 0.
        self._dual = defaultdict(int)
        self._left = defaultdict(int)
        self._right = defaultdict(int)

        self._unigram = defaultdict(int)

        # Characters treated as invalid: everything in `punctuation`.
        self._invalid_chars = set(punctuation)
Beispiel #2
0
class BigramFinder:
    """Count unigrams and bigrams in token streams and score the bigrams.

    The methods are meant to be used in three steps: ``create_counts``
    (step 1), ``find_ngrams`` (step 2) and ``real_ngrams`` (step 3).
    """

    def __init__(
        self,
        path_to_stop="./english.dat",
        additional_stop=None,
        remove_stop=None,
        min_unigram=10,
        min_ngram=5,
        exclude=None,
        language="english",
    ):
        """Build the stopword set and create empty count tables.

        Args:
            path_to_stop: Argument handed to ``StopWords``.
            additional_stop: Optional iterable of extra stopwords.
                ``None`` means no additions.
            remove_stop: Optional iterable of words to drop from the
                stopword set when present. ``None`` means no removals.
            min_unigram: Unigrams counted fewer times than this are
                discarded.
            min_ngram: Bigrams counted fewer times than this are discarded.
            exclude: Optional iterable of words; a bigram containing any of
                them always scores 0.0. ``None`` means empty.
            language: Key used to index the ``StopWords`` object; also
                echoed in the progress message.
        """
        # None sentinels replace the former mutable list defaults, which
        # were shared between every call of this constructor.
        additional_stop = () if additional_stop is None else additional_stop
        remove_stop = () if remove_stop is None else remove_stop
        exclude = () if exclude is None else exclude

        print("Loading stopwords for %s" % language)
        self._stopwords = StopWords(path_to_stop)[language] | set(additional_stop)
        for ii in remove_stop:
            if ii in self._stopwords:
                self._stopwords.remove(ii)

        self._exclude_from_bigram = set(exclude)

        self._min_uni = min_unigram
        self._min_ngram = min_ngram

        # bigram -> count, plus per-position marginals filled by find_ngrams.
        self._dual = defaultdict(int)
        self._left = defaultdict(int)
        self._right = defaultdict(int)
        # Defined up front so score() cannot fail with AttributeError when
        # it is called before find_ngrams().
        self._total = 0

        self._unigram = defaultdict(int)

        self._invalid_chars = set(punctuation)

    def normalize_word(self, word):
        """Lower-case ``word`` and strip every invalid character from it.

        Returns:
            The cleaned word, or ``""`` when the cleaned word has fewer
            than two characters or is a stopword.
        """
        reduced = "".join(x for x in word.lower() if x not in self._invalid_chars)
        if len(reduced) > 1 and reduced not in self._stopwords:
            return reduced
        return ""

    def score(self, bigram, array=None):
        """Score a ``(left, right)`` bigram from the collected counts.

        Args:
            bigram: Two-element tuple of words.
            array: Optional 2x2 scratch array that is overwritten with the
                table built for this bigram; a fresh one is allocated when
                omitted.

        Returns:
            0.0 when either word is in the exclude set, otherwise the first
            element of the p-values returned by ``chisquare``.
        """
        left, right = bigram

        if any(x in self._exclude_from_bigram for x in bigram):
            return 0.0

        # Identity test: `not array` raises ValueError for a multi-element
        # numpy array, which made the optional argument unusable.
        if array is None:
            array = zeros((2, 2))

        # .get() keeps scoring read-only; indexing the defaultdicts would
        # insert zero-count entries for unseen keys.
        array[0, 0] = self._dual.get(bigram, 0)
        array[0, 1] = self._left.get(left, 0)
        array[1, 0] = self._right.get(right, 0)
        either = array[1, 0] + array[0, 1] - array[0, 0]
        array[1, 1] = self._total - either

        # NOTE(review): chisquare is applied to the 2x2 table as-is and only
        # the first returned p-value is used -- confirm this is the intended
        # statistic (a contingency-table test may have been meant).
        stat, p = chisquare(array)
        return p[0]

    def set_counts(self, counts):
        """Add precomputed word counts to the unigram table.

        Words whose count is below the unigram minimum are ignored, as are
        words that normalize to the empty string (stopwords, too short).

        Args:
            counts: Mapping from raw word to its count.
        """
        for word in counts:
            count = counts[word]
            if count < self._min_uni:
                continue
            reduced = self.normalize_word(word)
            # Same guard as create_counts: without it every rejected word
            # was accumulated under the "" key.
            if reduced:
                self._unigram[reduced] += count

    def create_counts(self, tokens):
        """Step 1: count normalized tokens, then drop the rare unigrams.

        Args:
            tokens: Iterable of raw word strings.
        """
        for word in tokens:
            reduced = self.normalize_word(word)
            if reduced:
                self._unigram[reduced] += 1

        # Collect first, delete afterwards: a dict must not change size
        # while it is being iterated.
        rare = [ww for ww, count in self._unigram.items() if count < self._min_uni]
        for ww in rare:
            del self._unigram[ww]

    def add_ngram_counts(self, tokens):
        """Count each bigram of ``tokens`` whose words are all known unigrams.

        Args:
            tokens: Token sequence handed to ``ibigrams``.
        """
        for ngram in ibigrams(tokens):
            if all(x in self._unigram for x in ngram):
                self._dual[ngram] += 1

    def find_ngrams(self, tokens=None):
        """Step 2: prune rare bigrams and rebuild the marginal counts.

        Args:
            tokens: Optional token sequence whose bigrams are counted first.
                ``None`` (the default) adds nothing.
        """
        if tokens is not None:
            self.add_ngram_counts(tokens)

        rare = [ng for ng, count in self._dual.items() if count < self._min_ngram]
        for ngram in rare:
            del self._dual[ngram]

        # Rebuild all three aggregates from scratch. Previously only _total
        # was reset, so a second call double-counted _left and _right.
        self._total = 0
        self._left.clear()
        self._right.clear()
        for (ll, rr), count in self._dual.items():
            self._total += count
            self._left[ll] += count
            self._right[rr] += count

    def real_ngrams(self, cutoff):
        """Step 3: return ``{bigram: count}`` for bigrams scoring above ``cutoff``."""
        return {
            ngram: count
            for ngram, count in self._dual.items()
            if self.score(ngram) > cutoff
        }

    def print_ngrams(self, limit=10):
        """Print at most ``limit`` bigrams with their counts and score.

        Each row shows the bigram, the left-word count, the right-word
        count, the bigram count and the score.
        """
        for shown, ii in enumerate(self._dual):
            # `>=` so exactly `limit` rows are printed (was limit + 1).
            if shown >= limit:
                break
            print(ii, self._left[ii[0]], self._right[ii[1]], self._dual[ii], self.score(ii))

    def print_unigrams(self, limit=10):
        """Print at most ``limit`` unigrams with their counts."""
        for shown, ii in enumerate(self._unigram):
            # `>=` so exactly `limit` rows are printed (was limit + 1).
            if shown >= limit:
                break
            print(ii, self._unigram[ii])