Example #1
0
 def addForm(self, form_tokens, form_postag=None, form_occs=1 ):
     """
     Register one form of this object on its 'label' and 'postag' edges.

     The form's label is built from form_tokens with
     PyTextMiner.form_label, then self.addEdge is called twice with
     that label as the key:
       - 'label'  edge, with form_occs (default 1)
       - 'postag' edge, with form_postag (["?"] when None is passed)

     NOTE(review): what addEdge does with an already-known key
     (increment vs. overwrite) is not visible here - confirm there.
     """
     # a fresh placeholder list per call, never a shared mutable default
     postag = ["?"] if form_postag is None else form_postag
     label = PyTextMiner.form_label( form_tokens )
     for edge_type, value in (('label', form_occs), ('postag', postag)):
         self.addEdge(edge_type, label, value)
Example #2
0
    def ngramize(ngrams, minSize, maxSize, tagTokens, filters, stemmer):
        """
        common ngramizing method

        Slides a window of every size from minSize to maxSize (inclusive)
        over the words of tagTokens and builds one NGram candidate per
        window. The candidate's id is computed from the stemmed words,
        its label and content from the original words.

        A candidate is kept only when filtering.apply_filters(candidate,
        filters) returns True. A kept candidate whose id is new is stored
        in ngrams; otherwise the stored entry gets the new form added,
        its major form updated and its 'occs' increased.

        @tagTokens == [[word1 tokens], [word2 tokens], etc]

        Returns the ngrams mapping that was passed in, updated in place.
        """
        # words first, then their stems, then the tags
        content = tagger.TreeBankPosTagger.getContent(tagTokens)
        stemmedcontent = [stemmer.stem(word) for word in content]
        tags = tagger.TreeBankPosTagger.getTag(tagTokens)
        total = len(content)
        for start in range(total):
            for size in range(minSize, maxSize + 1):
                if start + size > total:
                    # sizes only grow, so no larger window fits from here
                    break
                # each use below takes its own fresh slice of the lists
                window = slice(start, start + size)
                # id made from the stemmed words, label from the real tokens
                ngid = ngram.NGram.getNormId(stemmedcontent[window])
                candidate = ngram.NGram(
                    content[window],
                    id = ngid,
                    label = PyTextMiner.form_label(content[window]),
                    occs = 1,
                    postag = tags[window]
                )
                if filtering.apply_filters(candidate, filters) is not True:
                    continue
                if ngid not in ngrams:
                    # first sighting: the candidate becomes the stored entry
                    ngrams[ngid] = candidate
                    continue
                # already known: merge this occurrence into the stored entry
                known = ngrams[ngid]
                known.addForm( content[window], tags[window], 1 )
                known.updateMajorForm()
                known['occs'] += candidate['occs']
        return ngrams