def __parseTweet(self, tweet):
    """Classify the URLs of ``tweet`` and push the accepted ones to the model.

    URLs are visited in the order returned by ``tweet.urls()``. A URL is
    rejected when it reports an error or, after classification, when it is
    a root URL, its language is not ``"en"``, or ``"short"`` is among its
    document classes. The first rejected URL is removed through the URL
    builder and stops the loop; URLs accepted before it have already been
    handed to the model.
    """
    for url in tweet.urls():
        if url.isError():
            rejection = u"Tweet bad: wrong url: "
        else:
            # Classification has to run before the checks below, because
            # they read the document classes stored on the URL.
            classifier = TxtClassificatorWrapper.instance()
            url.setDocumentClasses(classifier.classify(url.getText()))
            unwanted = (
                url.isRoot()
                or url.lang() != "en"
                or "short" in url.documentClasses()
            )
            rejection = u"Tweet bad: " if unwanted else None

        if rejection is not None:
            logger.info(rejection + unicode(tweet) + u" " + unicode(url))
            self.__urlBuilder.delete(url)
            break

        logger.info(u"Tweet good: " + unicode(tweet) + u" " + unicode(url))
        logger.info(u"URL: " + unicode(url))
        self.__model.updateUrl(url)
def _doSmthElse(self):
    """Serve the pending GUI-related requests signalled through flags.

    Three flags (objects exposing ``isSet()`` / ``clear()``) are checked
    independently; each one is cleared before its request is handled:

    * refresh GUI: publishes the final URLs on ``"update.urls"``;
    * show probability distribution: takes the stored URL, resets it to
      ``None`` and asks the classifier for ``probDist`` of its text;
    * refresh status bar: publishes cache and iterator statistics on
      ``"update.statusBar"``.
    """
    if self.__refreshGui.isSet():
        logger.info("Send data to GUI")
        self.__refreshGui.clear()
        urls_payload = {"urls": self.__tweetResolvedListener.finalUrls()}
        Publisher.sendMessage("update.urls", data=urls_payload)

    if self.__showProbDist.isSet():
        pending_url = self.__probDistUrl
        self.__showProbDist.clear()
        self.__probDistUrl = None
        classifier = TxtClassificatorWrapper.instance()
        # NOTE(review): the returned distribution is not used anywhere in
        # this method -- confirm whether it was meant to be published.
        classifier.probDist(pending_url.getText())

    if self.__refreshStatusBar.isSet():
        self.__refreshStatusBar.clear()
        status_payload = {
            "cache": self.__urlResolver.cacheHitRate(),
            "position": self.__iter.position(),
            "position_end": self.__iter.count(),
            "current_file_c": self.__iter.currentFile(),
            "last_file_c": self.__iter.filesCount(),
        }
        Publisher.sendMessage("update.statusBar", data=status_payload)
def _classifier(self):
    """Return the object produced by ``TxtClassificatorWrapper.instance()``."""
    wrapper = TxtClassificatorWrapper.instance()
    return wrapper