def runPart(self):
    """Process one element of the input iterator and queue its URLs.

    The element being processed is kept in ``self.__elem`` until it has been
    handled completely. When ``Full`` interrupts the call, the element stays
    there and the next call starts over with the same element instead of
    pulling a new one from ``self.__iter``.

    Elements without a ``u"text"`` key are not turned into tweets. For the
    others, the URLs of the retweeted status (when present) and of the tweet
    itself are passed to ``self.__urlResolver.addUrlToQueue``.

    :return: None.
    :raises NothingToDo: when ``self.__iter`` is exhausted.
    """
    try:
        # Reuse the element left over from an interrupted call, if any.
        # next() is the builtin spelling of the iterator protocol.
        s = self.__elem or next(self.__iter)
        self.__elem = s
        if u"text" in s:
            try:
                retweeted = None
                if u"retweeted_status" in s:
                    retweeted = TweetText(
                        s[u"retweeted_status"],
                        self.__urlBuilder,
                        self.__userBuilder,
                        None,
                    )
                if retweeted:
                    for url in retweeted.urls():
                        self.__urlResolver.addUrlToQueue(url)
                tweet = TweetText(
                    s,
                    self.__urlBuilder,
                    self.__userBuilder,
                    retweeted.id() if retweeted else None,
                )
                for url in tweet.urls():
                    self.__urlResolver.addUrlToQueue(url)
            except UrlException as e:
                # unicode() rather than str(): concatenating a byte string
                # with a unicode literal fails on non-ASCII messages.
                logger.warning(u"Cannot build url: " + unicode(e))
        # NOTE(review): assumed to run for every element, with or without
        # text -- confirm against the original layout.
        self._doSmthElse()
        # Fully handled: the next call pulls a fresh element.
        self.__elem = None
    except Full:
        # presumably raised by addUrlToQueue when the resolver queue is
        # full -- self.__elem is deliberately left set so the element is
        # retried on the next call.
        return
    except StopIteration:
        raise NothingToDo()
def __getTimelineFeatures(self, timeline):
    """Compute normalized category factors for the URLs found in a timeline.

    Runs in two phases, each announced through ``self.__changePhase``:

    1. ``GET_TIMELINE_URLS`` -- every timeline entry is wrapped in a
       ``TweetText`` and each of its URLs is passed to
       ``self.__urlResolver.addUrlToQueue``.
    2. ``GET_TIMELINE_FEATURES`` -- every URL that does not report an error
       is classified; label and language frequencies are then counted.

    Side effects: ``self.__proc`` is updated with a 0-100 progress value
    inside both loops, and the raw frequency items are stored in
    ``self.__catFreq`` and ``self.__langFreq``.

    :param timeline: iterable of raw entries accepted by ``TweetText``.
    :return: dict mapping each category label (except ``'short'``,
        ``'medium'`` and ``'long'``) to its frequency divided by the highest
        frequency among those labels; empty when there are no such labels.
    :raises ValueError: when an entry cannot be turned into a ``TweetText``;
        the offending entry is the exception argument.
    """
    logger.info(u"Get timeline features")
    tweets = []
    self.__changePhase(PHASE["GET_TIMELINE_URLS"])
    for t in timeline:
        try:
            tweet = TweetText(t, self.__urlBuilder, self.__userBuilder)
        except Exception:
            # Not a bare except: KeyboardInterrupt/SystemExit must propagate
            # unchanged instead of being reported as a bad entry.
            logger.exception(u"Error: \"" + unicode(t) + u"\"")
            raise ValueError(t)
        logger.debug(u"Tweet:" + unicode(tweet))
        tweets.append(tweet)

    urls = []
    tweetCount = len(tweets)
    for ti, tweet in enumerate(tweets, 1):
        for url in tweet.urls():
            self.__breakIfStopped()
            self.__urlResolver.addUrlToQueue(url)
            urls.append(url)
        logger.info(u"Tweet:" + unicode(tweet))
        self.__proc = 100 * float(ti) / float(tweetCount)

    # Categories
    self.__changePhase(PHASE["GET_TIMELINE_FEATURES"])
    url2labels = {}
    urlCount = len(urls)
    for ui, url in enumerate(urls, 1):
        self.__breakIfStopped()
        if not url.isError():
            logger.debug(u"Classify " + unicode(url.getUrl()))
            # Keyed by expanded URL, so URLs expanding to the same address
            # are counted once.
            url2labels[url.getExpandedUrl()] = self._classifier().classify(url.getText())
        # Progress advances for every URL, including those in error.
        self.__proc = 100 * float(ui) / float(urlCount)

    labelsFreq = FreqDist()
    for labels in url2labels.values():
        for label in labels:
            labelsFreq.inc(label)
    catFreq = labelsFreq.items()
    self.__catFreq = catFreq
    logger.info(u"Categories: " + unicode(catFreq))

    labelPairs = [(item[0], item[1]) for item in catFreq
                  if item[0] not in ['short', 'medium', 'long']]
    # Normalization: scale by the highest frequency. The maximum is computed
    # once, and only when there is something to scale (max() of an empty
    # sequence raises ValueError).
    if labelPairs:
        topFreq = float(max(freq for _, freq in labelPairs))
        labelsFreqValues = {label: float(freq) / topFreq for label, freq in labelPairs}
    else:
        labelsFreqValues = {}
    logger.info(u"Category factors: " + unicode(labelsFreqValues))

    # Languages
    langFreq = FreqDist()
    for u in urls:
        langFreq.inc(u.lang())
    langItems = langFreq.items()
    self.__langFreq = langItems
    logger.info(u"Languages: " + unicode(langItems))
    return labelsFreqValues