def ngramize(self, document, minSize, maxSize, tagTokens, filters, stemmer):
    """
    common tagTokens decomposition method

    Slides a window of minSize..maxSize consecutive words over the content
    extracted from @tagTokens. Each window is identified by getNodeId() of
    its stemmed words and is handled in one of three ways:
      - already linked to @document: the document 'NGram' edge is
        incremented through addEdge()
      - unknown to the `ngrams` collection: an NGram dict is built and
        passed to filtering.apply_filters(); only if accepted it is saved
        and linked to @document
      - stored in the collection but not linked to @document: its stored
        'occs' counter is incremented by one and it is linked to @document

    @document is a dict exposing document['edges']['NGram']
    @minSize and @maxSize are the inclusive bounds of the ngram length
    @tagTokens == [[word1 tokens], [word2 tokens], etc]
    @filters is handed over unchanged to filtering.apply_filters()
    @stemmer is an object exposing stem(word)

    NOTE(review): nothing is returned from here although `document` is
    rebound to the result of addEdge(); callers only see the new edges if
    addEdge() mutates the dict in place -- TODO confirm.
    NOTE(review): assuming self.mongodb is a PyMongo database, save() and
    update() are deprecated in PyMongo 3 and removed in PyMongo 4.
    """
    # content is the list of words from tagTokens
    content = self.getContent(tagTokens)
    stemmedcontent = [stemmer.stem(word) for word in content]
    # tags is the list of tags from tagTokens
    tags = self.getTag(tagTokens)
    total = len(content)
    for start in range(total):
        for size in range(minSize, maxSize + 1):
            end = start + size
            if end > total:
                # sizes only grow from here, no larger window can fit either
                break
            # id made from the stemmedcontent and label made from the real tokens
            sha256ngid = getNodeId(stemmedcontent[start:end])
            # ngram is already attached to this document
            if sha256ngid in document['edges']['NGram']:
                document = addEdge(document, 'NGram', sha256ngid, 1)
                continue
            # ngram is already in the corpus but not in this document:
            # increment its total occurrences, then link it
            if self.mongodb.ngrams.find_one({'_id': sha256ngid}) is not None:
                self.mongodb.ngrams.update(
                    {'_id': sha256ngid}, {"$inc": {"occs": 1}})
                document = addEdge(document, 'NGram', sha256ngid, 1)
                continue
            # brand new ngram: create the NGram object to pass it through
            # the filters (every slice is a fresh list, nothing is shared)
            label = getNodeLabel(content[start:end])
            ngram = {
                '_id': sha256ngid,
                'label': label,
                'content': content[start:end],
                'edges': {
                    'postag': {label: tags[start:end]},
                    'label': {label: 1},
                },
                'postag': tags[start:end],
                'category': "NGram",
                'occs': 1,
            }
            # application defined filtering, rejected ngrams are dropped
            if filtering.apply_filters(ngram, filters) is True:
                # the stored node keeps its tags as a comma separated string
                ngram['postag'] = ",".join(ngram['postag'])
                # increment document occurrences
                document = addEdge(document, 'NGram', sha256ngid, 1)
                # save the new NGram
                self.mongodb.ngrams.save(ngram)
                # single parenthesised argument: same output with the
                # Python 2 print statement and the Python 3 print function
                print("saving ngram %s" % ngram['postag'])
def add(self, stopng):
    """
    Adds a stop-ngram to the object

    The ngram is registered with the value 1 under its getNodeLabel() key
    in self.words[len(stopng)]; self.words is padded with empty dicts
    until that index exists.

    @stopng must be a list of words
    raises Exception if @stopng is not a list
    """
    if not isinstance(stopng, list):
        # wrap the value in a 1-tuple: a bare tuple on the right of % is
        # unpacked as the argument list, so a tuple of two or more words
        # would raise TypeError instead of reporting the invalid ngram
        raise Exception(
            "%s is not a valid ngram (not a list)" % (stopng,))
    size = len(stopng)
    # make sure the bucket for ngrams of this length exists
    while len(self.words) <= size:
        self.words.append({})
    self.words[size][getNodeLabel(stopng)] = 1
def ngramize(self, document, minSize, maxSize, tagTokens, filters, stemmer):
    """
    common tagTokens decomposition method

    Slides a window of minSize..maxSize consecutive words over the content
    extracted from @tagTokens. Each window is identified by getNodeId() of
    its stemmed words and is handled in one of three ways:
      - already linked to @document: the document 'NGram' edge is
        incremented through addEdge()
      - unknown to the `ngrams` collection: an NGram dict is built and
        passed to filtering.apply_filters(); only if accepted it is saved
        and linked to @document
      - stored in the collection but not linked to @document: its stored
        'occs' counter is incremented by one and it is linked to @document

    @document is a dict exposing document['edges']['NGram']
    @minSize and @maxSize are the inclusive bounds of the ngram length
    @tagTokens == [[word1 tokens], [word2 tokens], etc]
    @filters is handed over unchanged to filtering.apply_filters()
    @stemmer is an object exposing stem(word)

    NOTE(review): nothing is returned from here although `document` is
    rebound to the result of addEdge(); callers only see the new edges if
    addEdge() mutates the dict in place -- TODO confirm.
    NOTE(review): assuming self.mongodb is a PyMongo database, save() and
    update() are deprecated in PyMongo 3 and removed in PyMongo 4.
    """
    # content is the list of words from tagTokens
    content = self.getContent(tagTokens)
    stemmedcontent = [stemmer.stem(word) for word in content]
    # tags is the list of tags from tagTokens
    tags = self.getTag(tagTokens)
    total = len(content)
    for start in range(total):
        for size in range(minSize, maxSize + 1):
            end = start + size
            if end > total:
                # sizes only grow from here, no larger window can fit either
                break
            # id made from the stemmedcontent and label made from the real tokens
            sha256ngid = getNodeId(stemmedcontent[start:end])
            # ngram is already attached to this document
            if sha256ngid in document['edges']['NGram']:
                document = addEdge(document, 'NGram', sha256ngid, 1)
                continue
            # ngram is already in the corpus but not in this document:
            # increment its total occurrences, then link it
            if self.mongodb.ngrams.find_one({'_id': sha256ngid}) is not None:
                self.mongodb.ngrams.update(
                    {'_id': sha256ngid}, {"$inc": {"occs": 1}})
                document = addEdge(document, 'NGram', sha256ngid, 1)
                continue
            # brand new ngram: create the NGram object to pass it through
            # the filters (every slice is a fresh list, nothing is shared)
            label = getNodeLabel(content[start:end])
            ngram = {
                '_id': sha256ngid,
                'label': label,
                'content': content[start:end],
                'edges': {
                    'postag': {label: tags[start:end]},
                    'label': {label: 1},
                },
                'postag': tags[start:end],
                'category': "NGram",
                'occs': 1,
            }
            # application defined filtering, rejected ngrams are dropped
            if filtering.apply_filters(ngram, filters) is True:
                # the stored node keeps its tags as a comma separated string
                ngram['postag'] = ",".join(ngram['postag'])
                # increment document occurrences
                document = addEdge(document, 'NGram', sha256ngid, 1)
                # save the new NGram
                self.mongodb.ngrams.save(ngram)
                # single parenthesised argument: same output with the
                # Python 2 print statement and the Python 3 print function
                print("saving ngram %s" % ngram['postag'])