def addForm(self, form_tokens, form_postag=None, form_occs=1):
    """
    Record one form of this object on its 'label' and 'postag' edges.

    form_tokens -- tokens of the form; turned into the edge key by
                   PyTextMiner.form_label
    form_postag -- tags matching form_tokens; ["?"] is used when None
    form_occs   -- value passed to the 'label' edge (defaults to 1)
    """
    # fall back to a placeholder tag list when the caller supplied none
    postag = ["?"] if form_postag is None else form_postag
    edge_key = PyTextMiner.form_label(form_tokens)
    # both edges are keyed by the same label built from the tokens
    self.addEdge('label', edge_key, form_occs)
    self.addEdge('postag', edge_key, postag)
def ngramize(ngrams, minSize, maxSize, tagTokens, filters, stemmer):
    """
    Common ngramizing method.

    Slides windows of minSize..maxSize words over the tagged tokens, builds
    an NGram for each window and keeps those accepted by the filters.

    ngrams    -- dict of NGram instances keyed by normalized id; updated
                 in place and returned
    minSize   -- smallest window length, in words
    maxSize   -- largest window length, in words (inclusive)
    tagTokens -- [[word1 tokens], [word2 tokens], etc]
    filters   -- passed as-is to filtering.apply_filters
    stemmer   -- object whose stem(word) is applied to every word

    Returns the ngrams dict.
    """
    # words extracted from tagTokens, and their stemmed counterparts
    words = tagger.TreeBankPosTagger.getContent(tagTokens)
    stems = [stemmer.stem(word) for word in words]
    # tags extracted from tagTokens
    tags = tagger.TreeBankPosTagger.getTag(tagTokens)

    total = len(words)
    for start in range(total):
        for size in range(minSize, maxSize + 1):
            if start + size > total:
                # sizes only grow from here, so no larger window fits either
                break
            window = slice(start, start + size)
            # the id comes from the stemmed words, the label from the real ones
            ngid = ngram.NGram.getNormId(stems[window])
            candidate = ngram.NGram(
                words[window],
                id=ngid,
                label=PyTextMiner.form_label(words[window]),
                occs=1,
                postag=tags[window],
            )
            if filtering.apply_filters(candidate, filters) is not True:
                continue
            if ngid not in ngrams:
                # first time this normalized ngram is seen
                ngrams[ngid] = candidate
                continue
            # already known: register this form and add its occurrences
            known = ngrams[ngid]
            known.addForm(words[window], tags[window], 1)
            known.updateMajorForm()
            known['occs'] += candidate['occs']
    return ngrams