class BOWBuilderSimple(AbstractProducer):
    """Creates bow for each class.

    Words taken from consumed urls are accumulated in one ``CounterManager``.
    When the number of counters exceeds ``settings.bowTempSize`` (or the parser
    reports ``linklistEOF``), the counters are put into the output queue and
    counting restarts with an empty ``CounterManager``.
    """

    def customInitialization(self):
        """Take the parser from ``kwargs["parserInstance"]`` and create empty counters."""
        self.parser = self.kwargs["parserInstance"]
        self.initializeCounters()

    def initializeCounters(self):
        """Initialization for BOWBuilderSimple is trivial (unlike BOWBuilderComplex)."""
        self.counters = CounterManager()
        # 026 Title is used by FilesAdaptor to determine column.
        self.counters.title = "word"

    def consume(self):
        """Ask the parser for the next url and return it.

        Every call increments ``self.cx``; every tenth call with a truthy url
        also writes a debug log line.
        """
        url = self.parser.next()
        # 026 Just non-mandatory logging:
        # NOTE(review): self.cx is not initialised in this class -- presumably
        # AbstractProducer sets it up; confirm.
        self.cx += 1
        if url and self.cx % 10 == 0:
            self.logger.debug("Processing url number: %i. Last processed url: %s" % (self.cx, url.Composed))
        return url

    def verifySemiProduct(self, semiProduct):
        """Return True when ``semiProduct`` is a SmartURL instance, False otherwise."""
        return isinstance(semiProduct, SmartURL)

    def produce(self, semiProduct):
        """Count the words of ``semiProduct`` and flush the counters when due.

        Doesn't create a product every iteration. Just accumulates words until
        the number of counters exceeds ``settings.bowTempSize`` (or the parser
        reports ``linklistEOF``). Then the old counters are put into the output
        queue and counting starts again from an empty ``CounterManager``.

        Returns:
            True if the counters were added to the output queue, False otherwise.
        """
        words = semiProduct.Words(self.settings.urlWordsSeparators)
        for word in words:
            if word not in self.settings.commonWords:
                self.counters.increment(word)

        if self.counters.NumberOfCounters > self.settings.bowTempSize or self.parser.linklistEOF:
            # 026 Put full counters to output queue.
            self._enqueueCounters()
            # 026 Create new empty counters:
            self.counters = CounterManager()
            self.counters.title = "word"
            return True
        return False

    def finalize(self):
        """Put the current counters into the output queue (they are not reset here)."""
        self._enqueueCounters()

    def _enqueueCounters(self):
        """Put ``self.counters`` into the output queue while holding ``outputCondition``."""
        self.outputCondition.acquire()
        try:
            self.outputQueue.put(self.counters)
        finally:
            # Release even when put() raises, so the lock is never left held.
            self.outputCondition.release()
def test_updateBOW(self):
    """Check that updateBOW stores counters and accumulates on repeated calls.

    When a specific counter is sent to update for the first time, it gets
    INSERTed into the db. The second time it is just updated, so the stored
    values double while the number of rows stays the same.
    """
    self.filesAdaptor.connectDB()

    expected = [["word1", 1], ["word2", 2], ["word3", 3]]
    counters = CounterManager()
    counters.title = "word"
    for word, amount in expected:
        counters.increment(word, amount)

    def checkRows(multiplier):
        # Compare the BOW table, ordered by word, with the expected pairs;
        # each stored value must equal the original amount times multiplier.
        self.filesAdaptor.DBcursor.execute("select * from BOW order by word asc;")
        seen = 0
        for row in self.filesAdaptor.DBcursor:
            word, amount = expected[seen]
            self.assertEqual(row[1], word)
            self.assertEqual(row[2], amount * multiplier)
            seen += 1
        self.assertEqual(seen, len(expected))

    # First update and check of the counters.
    result = self.filesAdaptor.updateBOW(counters)
    checkRows(1)
    self.assertEqual(result, len(expected))

    # Second update. The counters shall be doubled, while the count of them
    # shall stay the same.
    self.filesAdaptor.updateBOW(counters)
    checkRows(2)
    # NOTE(review): `result` still holds the return value of the FIRST
    # updateBOW call; the second call's return value is not captured.
    self.assertEqual(result, len(expected))
def produce(self, semiProduct):
    """Count the words of ``semiProduct`` and flush the counters when due.

    Doesn't create a product every iteration. Just accumulates words until
    the number of counters exceeds ``settings.bowTempSize`` (or the parser
    reports ``linklistEOF``). Then the old counters are put into the output
    queue and counting starts again from an empty ``CounterManager``.

    Args:
        semiProduct: object whose ``Words(separators)`` call yields the words
            to count; words found in ``settings.commonWords`` are skipped.

    Returns:
        True if the counters were added to the output queue, False otherwise.
    """
    words = semiProduct.Words(self.settings.urlWordsSeparators)
    for word in words:
        if word not in self.settings.commonWords:
            self.counters.increment(word)

    if not (self.counters.NumberOfCounters > self.settings.bowTempSize or self.parser.linklistEOF):
        return False

    # 026 Put full counters to output queue.
    self.outputCondition.acquire()
    try:
        self.outputQueue.put(self.counters)
    finally:
        # Release even when put() raises, so the lock is never left held.
        self.outputCondition.release()

    # 026 Create new empty counters:
    self.counters = CounterManager()
    self.counters.title = "word"
    return True
def initializeCounters(self):
    """Create the empty counter set for BOWBuilderSimple.

    Trivial here (unlike BOWBuilderComplex): a single CounterManager whose
    title is set to "word".
    """
    freshCounters = CounterManager()
    # 026 FilesAdaptor uses the title to determine the column.
    freshCounters.title = "word"
    self.counters = freshCounters