Esempio n. 1
0
    def runPart(self):
        """Process one stream element and queue the URLs of its tweet(s).

        An element left pending by a previous call (``self.__elem``) is
        retried first; otherwise the next element is pulled from
        ``self.__iter``. When the element has a ``u"text"`` key, a
        ``TweetText`` is built for it (and for its ``u"retweeted_status"``
        entry, when present) and every URL of both is handed to
        ``self.__urlResolver.addUrlToQueue``. Afterwards
        ``self._doSmthElse()`` is called and the pending element is cleared.

        If ``Full`` is raised anywhere during processing, the method returns
        silently and the element stays pending so the next call retries it.

        Raises:
            NothingToDo: when ``StopIteration`` is raised, i.e. normally when
                ``self.__iter`` is exhausted.
        """
        try:
            # A falsy pending element counts as "nothing pending".
            elem = self.__elem or next(self.__iter)
            # Remember the element so that a Full error can be retried later.
            self.__elem = elem
            if u"text" in elem:
                try:
                    retweeted = None
                    if u"retweeted_status" in elem:
                        retweeted = TweetText(
                            elem[u"retweeted_status"],
                            self.__urlBuilder,
                            self.__userBuilder,
                            None,
                        )
                    # NOTE(review): when a retry happens after Full, URLs that
                    # were already queued are queued again -- confirm the
                    # resolver tolerates duplicates.
                    if retweeted:
                        for url in retweeted.urls():
                            self.__urlResolver.addUrlToQueue(url)

                    retweetedId = retweeted.id() if retweeted else None
                    tweet = TweetText(elem, self.__urlBuilder, self.__userBuilder, retweetedId)
                    for url in tweet.urls():
                        self.__urlResolver.addUrlToQueue(url)
                except UrlException as e:
                    # Let logging format the exception lazily: concatenating
                    # str(e) to a unicode literal can itself raise on
                    # non-ASCII messages and would escape this handler.
                    logger.warning(u"Cannot build url: %s", e)
            self._doSmthElse()
            # Fully processed: nothing is pending any more.
            self.__elem = None
        except Full:
            # Keep self.__elem set; the next call retries the same element.
            return
        except StopIteration:
            raise NothingToDo()
Esempio n. 2
0
    def __getTimelineFeatures(self, timeline):
        """Compute normalized category factors for the URLs of a timeline.

        Steps, in order:
          1. Build a ``TweetText`` for every item of ``timeline``.
          2. Queue every URL of every tweet in ``self.__urlResolver``.
          3. Classify the text of every URL that is not in error and count
             how often each label occurs (one vote per expanded URL).
          4. Count the languages reported by all URLs.

        Side effects: switches the phase through ``self.__changePhase``,
        updates the progress percentage ``self.__proc``, and stores the raw
        category and language frequencies in ``self.__catFreq`` and
        ``self.__langFreq``.

        Args:
            timeline: iterable of raw tweet objects accepted by ``TweetText``.

        Returns:
            dict mapping each category label other than 'short', 'medium'
            and 'long' to its frequency divided by the highest such
            frequency. Empty when no such label was found.

        Raises:
            ValueError: when a ``TweetText`` cannot be built from an item of
                ``timeline``; the offending item is the exception argument.
        """
        logger.info(u"Get timeline features")
        self.__changePhase(PHASE["GET_TIMELINE_URLS"])

        tweets = []
        for t in timeline:
            try:
                tweet = TweetText(t, self.__urlBuilder, self.__userBuilder)
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must not be
                # converted into ValueError.
                logger.exception(u"Error: \"" + unicode(t) + u"\"")
                raise ValueError(t)
            logger.debug(u"Tweet:" + unicode(tweet))
            tweets.append(tweet)

        urls = []
        tweetCount = float(len(tweets))
        for done, tweet in enumerate(tweets, 1):
            for url in tweet.urls():
                self.__breakIfStopped()
                self.__urlResolver.addUrlToQueue(url)
                urls.append(url)
            logger.info(u"Tweet:" + unicode(tweet))
            self.__proc = 100 * float(done) / tweetCount

        # Categories
        self.__changePhase(PHASE["GET_TIMELINE_FEATURES"])
        url2labels = {}
        urlCount = float(len(urls))
        for done, url in enumerate(urls, 1):
            self.__breakIfStopped()
            if not url.isError():
                logger.debug(u"Classify " + unicode(url.getUrl()))
                # Keyed by expanded URL, so a URL seen several times
                # contributes its labels only once.
                url2labels[url.getExpandedUrl()] = self._classifier().classify(url.getText())
            self.__proc = 100 * float(done) / urlCount

        labelsFreq = FreqDist()
        for labels in url2labels.values():
            for label in labels:
                labelsFreq.inc(label)
        catItems = labelsFreq.items()
        self.__catFreq = catItems
        logger.info(u"Categories: "  + unicode(catItems))

        # The length labels are excluded from the returned factors.
        topicFreqs = [
            (item[0], item[1])
            for item in catItems
            if item[0] not in ['short', 'medium', 'long']
        ]
        # Normalization: scale by the highest frequency, computed once.
        if topicFreqs:
            maxFreq = float(max(freq for _, freq in topicFreqs))
            labelsFreqValues = {label: float(freq) / maxFreq for label, freq in topicFreqs}
        else:
            labelsFreqValues = {}
        logger.info(u"Category factors: "  + unicode(labelsFreqValues))

        # Languages
        # NOTE(review): unlike classification above, URLs in error are counted
        # here as well -- confirm this is intended.
        langFreq = FreqDist()
        for url in urls:
            langFreq.inc(url.lang())
        self.__langFreq = langFreq.items()
        logger.info(u"Languages: " + unicode(langFreq.items()))

        return labelsFreqValues