def ngramize(self, document, minSize, maxSize, tagTokens, filters, stemmer):
    """
    Common tagTokens decomposition method.

    Slides a window of every size from minSize to maxSize (inclusive) over
    the tokens and, for each resulting n-gram, records one occurrence on
    the document through addEdge(document, 'NGram', id, 1).

    The n-gram id is computed by getNodeId from the *stemmed* tokens, while
    the label is computed by getNodeLabel from the original tokens.

    For each n-gram:
      - id already in document['edges']['NGram']: only the edge is
        incremented;
      - id unknown to self.mongodb.ngrams: a candidate n-gram is built and
        passed to filtering.apply_filters; only when that returns True is
        it linked to the document and saved (rejected candidates are
        neither saved nor linked);
      - id known to the collection but not to the document: its stored
        'occs' counter is incremented and the edge is added.

    @document dict exposing document['edges']['NGram']
    @minSize smallest n-gram size
    @maxSize largest n-gram size (inclusive)
    @tagTokens == [[word1 tokens], [word2 tokens], etc]
    @filters passed unchanged to filtering.apply_filters
    @stemmer object providing stem(word)

    Returns nothing. NOTE(review): document is rebound to the value
    returned by addEdge, so the caller only sees the new edges if addEdge
    mutates its argument in place -- confirm against addEdge.
    """
    # content is the list of words from tagTokens
    content = self.getContent(tagTokens)
    stemmedcontent = [stemmer.stem(word) for word in content]
    # tags is the list of tags from tagTokens
    tags = self.getTag(tagTokens)
    total = len(content)
    for start in range(total):
        for size in range(minSize, maxSize + 1):
            end = start + size
            if end > total:
                # sizes only grow, so every later window overruns as well
                break
            # id made from the stemmedcontent and label made from the real tokens
            sha256ngid = getNodeId(stemmedcontent[start:end])
            # ngram is already attached to this cable
            if sha256ngid in document['edges']['NGram']:
                document = addEdge(document, 'NGram', sha256ngid, 1)
                continue
            if self.mongodb.ngrams.find_one({'_id': sha256ngid}) is not None:
                # was already in the corpus and not in this document:
                # increment the ngram total occurrences, then link it
                self.mongodb.ngrams.update(
                    {'_id': sha256ngid}, {"$inc": {"occs": 1}})
                document = addEdge(document, 'NGram', sha256ngid, 1)
                continue
            # not in the corpus yet: create the NGram object to pass it
            # through the filters
            label = getNodeLabel(content[start:end])
            ngram = {
                '_id': sha256ngid,
                'label': label,
                'content': content[start:end],
                'edges': {
                    'postag': {label: tags[start:end]},
                    'label': {label: 1},
                },
                'postag': tags[start:end],
                'category': "NGram",
                'occs': 1,
            }
            # application defined filtering (must return exactly True)
            if filtering.apply_filters(ngram, filters) is True:
                # the stored node keeps its tags as a comma separated string
                ngram['postag'] = ",".join(ngram['postag'])
                # increment document occurrences
                document = addEdge(document, 'NGram', sha256ngid, 1)
                # save the new NGram
                self.mongodb.ngrams.save(ngram)
                # single pre-formatted argument: prints the same text under
                # Python 2 and is valid syntax under Python 3
                print("saving ngram %s" % ngram['postag'])
Example #2
0
 def add(self, stopng):
     """
     Adds a stop-ngram to the object.

     self.words is indexed by n-gram length: self.words[k] is a dict whose
     keys are the labels (as computed by getNodeLabel) of the registered
     stop-ngrams made of k words. It is padded with empty dicts as needed
     so that index len(stopng) exists.

     @stopng must be a list of words
     Raises Exception if stopng is not a list.
     """
     if not isinstance(stopng, list):
         # wrap the value in a 1-tuple: a bare tuple on the right of '%'
         # is unpacked as format arguments, which made tuple input fail
         # with an unrelated TypeError instead of this message
         raise Exception(
             "%s is not a valid ngram (not a list)" % (stopng,))
     size = len(stopng)
     # make sure a bucket exists for n-grams of this length
     while len(self.words) <= size:
         self.words.append({})
     self.words[size][getNodeLabel(stopng)] = 1
 def ngramize(self, document, minSize, maxSize, tagTokens, filters,
              stemmer):
     """
     Common tagTokens decomposition method.

     Slides a window of every size from minSize to maxSize (inclusive) over
     the tokens and, for each resulting n-gram, records one occurrence on
     the document through addEdge(document, 'NGram', id, 1).

     The n-gram id is computed by getNodeId from the *stemmed* tokens,
     while the label is computed by getNodeLabel from the original tokens.

     For each n-gram:
       - id already in document['edges']['NGram']: only the edge is
         incremented;
       - id unknown to self.mongodb.ngrams: a candidate n-gram is built
         and passed to filtering.apply_filters; only when that returns
         True is it linked to the document and saved (rejected candidates
         are neither saved nor linked);
       - id known to the collection but not to the document: its stored
         'occs' counter is incremented and the edge is added.

     @document dict exposing document['edges']['NGram']
     @minSize smallest n-gram size
     @maxSize largest n-gram size (inclusive)
     @tagTokens == [[word1 tokens], [word2 tokens], etc]
     @filters passed unchanged to filtering.apply_filters
     @stemmer object providing stem(word)

     Returns nothing. NOTE(review): document is rebound to the value
     returned by addEdge, so the caller only sees the new edges if addEdge
     mutates its argument in place -- confirm against addEdge.
     """
     # content is the list of words from tagTokens
     content = self.getContent(tagTokens)
     stemmedcontent = [stemmer.stem(word) for word in content]
     # tags is the list of tags from tagTokens
     tags = self.getTag(tagTokens)
     total = len(content)
     for start in range(total):
         for size in range(minSize, maxSize + 1):
             end = start + size
             if end > total:
                 # sizes only grow, so every later window overruns as well
                 break
             # id made from the stemmedcontent and label made from the
             # real tokens
             sha256ngid = getNodeId(stemmedcontent[start:end])
             # ngram is already attached to this cable
             if sha256ngid in document['edges']['NGram']:
                 document = addEdge(document, 'NGram', sha256ngid, 1)
                 continue
             known = self.mongodb.ngrams.find_one({'_id': sha256ngid})
             if known is not None:
                 # was already in the corpus and not in this document:
                 # increment the ngram total occurrences, then link it
                 self.mongodb.ngrams.update(
                     {'_id': sha256ngid}, {"$inc": {"occs": 1}})
                 document = addEdge(document, 'NGram', sha256ngid, 1)
                 continue
             # not in the corpus yet: create the NGram object to pass it
             # through the filters
             label = getNodeLabel(content[start:end])
             ngram = {
                 '_id': sha256ngid,
                 'label': label,
                 'content': content[start:end],
                 'edges': {
                     'postag': {label: tags[start:end]},
                     'label': {label: 1},
                 },
                 'postag': tags[start:end],
                 'category': "NGram",
                 'occs': 1,
             }
             # application defined filtering (must return exactly True)
             if filtering.apply_filters(ngram, filters) is True:
                 # the stored node keeps its tags as a comma separated
                 # string
                 ngram['postag'] = ",".join(ngram['postag'])
                 # increment document occurrences
                 document = addEdge(document, 'NGram', sha256ngid, 1)
                 # save the new NGram
                 self.mongodb.ngrams.save(ngram)
                 # single pre-formatted argument: prints the same text
                 # under Python 2 and is valid syntax under Python 3
                 print("saving ngram %s" % ngram['postag'])