Example 1
class TopicScrapingTask:
    def __init__(self, topic_scraper, article_scraping_task):
        """
        @type topic_scraper: Scraper
        @type article_scraping_task: ArticleScrapingTask
        """
        self._topic_scraper = topic_scraper
        self._article_scraping_task = article_scraping_task
        self._logger = Logger(self.__class__.__name__)

    def run(self, topic_url):
        for parser in self._topic_scraper.scrape(topic_url):
            assert isinstance(parser, TopicParser)
            self._logger.info('Scraping topic at %s.' % topic_url)

            articles = []
            for article in parser.get_articles():
                try:
                    if self._article_scraping_task.run(article):
                        articles.append(article)
                    else:
                        self._logger.warn('Could not parse article body at %s', article.url)

                except IOError as e:
                    self._logger.error('Failed scraping article: %s' % e)
                    continue

            # NOTE: returns after the first parser yielded by scrape(); if the
            # generator yields nothing, run() falls through and returns None.
            return articles
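
A minimal wiring sketch showing how the two tasks compose (ArticleScrapingTask is defined in the next example; the no-argument Scraper() construction and the topic URL are assumptions, since the source does not show how Scraper is built):

scraper = Scraper()
article_task = ArticleScrapingTask(scraper)
topic_task = TopicScrapingTask(scraper, article_task)
articles = topic_task.run('http://example.com/topics/python')  # hypothetical URL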
Example 2
class ArticleScrapingTask:
    def __init__(self, scraper, min_word_count_heuristic=100):
        """
        @type scraper: Scraper
        @type min_word_count_heuristic: int
        """
        self._scraper = scraper
        self._min_word_count_heuristic = min_word_count_heuristic
        self._logger = Logger(self.__class__.__name__)

    def run(self, article):
        """
        @type article: Article
        """
        if article.url:
            self._logger.info('Scraping %s.', article.url)

            for parser in self._scraper.scrape(article.url):
                assert isinstance(parser, ArticleParser)

                # The final URL of the article may differ from the requested one
                # (e.g. after redirects); the scraper passes the final URL to
                # each parser it constructs.
                article.url = parser.url

                title = parser.get_title()
                publish_date = parser.get_publish_date()
                preview_image_url = parser.get_preview_image_url()
                body = parser.get_body()

                if title:
                    article.title = title
                if publish_date:
                    article.publish_date = publish_date
                if preview_image_url:
                    article.preview_image_url = preview_image_url

                if body and self._is_article_body(body):
                    article.body = body
                elif article.description and self._is_article_body(article.description):
                    article.body = article.description
                else:
                    # No usable body from the page or the description: give up
                    # on this parser and report failure.
                    break

                return True
        return False

    def _is_article_body(self, body):
        return self._count_words(body) > self._min_word_count_heuristic

    @staticmethod
    def _count_words(s):
        return len(s.split())
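
The body heuristic above can be exercised in isolation: a text qualifies as an article body only when its word count exceeds min_word_count_heuristic. A minimal sketch, assuming the project's Logger is importable (passing scraper=None is safe here because run() is never called; the threshold of 5 is illustrative):

task = ArticleScrapingTask(scraper=None, min_word_count_heuristic=5)
task._is_article_body('one two three four five six')  # True: 6 words > 5
task._is_article_body('too short')                    # False: 2 words <= 5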
Example 3
import numpy as np
from sklearn.decomposition import PCA
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer releases
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline


class KeywordAlgorithm(Algorithm):
    name = 'keyword'

    def __init__(self):
        Algorithm.__init__(self)
        self._score_mapper = ScoreMapper()
        self._logger = Logger(self.__class__.__name__)

    def train(self, articles, states):
        self._params.extractor = KeywordFeatureExtractor(finder=KeywordFinder(), text_key=lambda a: a.title)

        features = np.array(self._params.extractor.train_extract(articles))
        scores = np.array(self._score_mapper.map_batch_score(states))

        regression = LinearRegression(fit_intercept=True)

        n_features = features.shape[1]
        self._logger.info('Feature space uses %d keywords', n_features)

        if n_features >= 100:
            # Reduce dimensionality before regression; the grid search picks
            # the number of PCA components.
            param_grid = {
                'pca__n_components': range(50, n_features, 50)
            }
            pca = PCA(n_components=100)  # overridden by the grid search
            pipeline = Pipeline([('pca', pca), ('regression', regression)])
            # score_func is the legacy scikit-learn parameter; newer releases
            # take scoring= instead.
            clf = GridSearchCV(pipeline, param_grid, n_jobs=1, verbose=0, cv=3, score_func=top_item_scorer)
        else:
            clf = regression

        self._params.classifier = clf

        self._params.classifier.fit(features, scores)

    def score(self, articles):
        self._logger.info('Feature space uses %d keywords', self._params.extractor.keyword_count())

        features = self._params.extractor.extract(articles)
        return self._params.classifier.predict(np.array(features))
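
The same PCA-then-regression grid search can be sketched standalone against the current scikit-learn API, where scoring= replaces the legacy score_func= used above; the synthetic features and scores below are illustrative only:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)
features = rng.rand(200, 120)  # 200 samples, 120 keyword features
scores = rng.rand(200)

pipeline = Pipeline([('pca', PCA()), ('regression', LinearRegression())])
param_grid = {'pca__n_components': [50, 100]}  # mirrors range(50, n_features, 50)
clf = GridSearchCV(pipeline, param_grid, cv=3)  # default R^2 scoring
clf.fit(features, scores)
print(clf.best_params_)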
Example 4
import threading


class WorkerThread(threading.Thread):
    def __init__(self, worker_id, task_queue, completed_queue):
        """
        @type worker_id: int
        @type task_queue: Queue.Queue
        @type completed_queue: Queue.Queue
        """
        # Note: Python 3's threading.Thread also stores its name in _name;
        # passing the same value to Thread.__init__ below keeps the two
        # attributes consistent.
        self._name = '%s-%d' % (self.__class__.__name__, worker_id)

        threading.Thread.__init__(self, name=self._name)

        self._logger = Logger(self._name)
        self._id = worker_id
        self._task_queue = task_queue
        self._completed_queue = completed_queue
        self._continue = True

    def stop(self):
        self._continue = False

    def run(self):
        while self._continue:
            self.work()
        self.exit()

    def work(self):
        raise NotImplementedError

    def exit(self):
        self._logger.info('Exiting.')

    @classmethod
    def initializer(cls, *args, **kwargs):
        """Return a subclass of cls with the extra constructor arguments
        pre-bound, so callers can instantiate every worker with the uniform
        (worker_id, task_queue, completed_queue) signature."""
        class _WorkerThread(cls):
            def __init__(self, worker_id, task_queue, completed_queue):
                cls.__init__(self, worker_id, task_queue, completed_queue, *args, **kwargs)

        return _WorkerThread
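
A brief usage sketch of the initializer() factory: a subclass that needs extra constructor arguments can have them pre-bound, after which a pool can still create every worker through the uniform (worker_id, task_queue, completed_queue) signature. The EchoWorker subclass and the 'done' prefix below are hypothetical; the source shows no concrete worker:

import queue  # 'Queue' on Python 2, matching the docstrings above

class EchoWorker(WorkerThread):
    def __init__(self, worker_id, task_queue, completed_queue, prefix):
        WorkerThread.__init__(self, worker_id, task_queue, completed_queue)
        self._prefix = prefix  # hypothetical extra argument

    def work(self):
        task = self._task_queue.get()  # blocks until a task is queued
        self._completed_queue.put('%s: %s' % (self._prefix, task))

worker_cls = EchoWorker.initializer('done')  # pre-binds prefix='done'
worker = worker_cls(0, queue.Queue(), queue.Queue())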