Beispiel #1
0
 def launch(self):
     '''
         launch extractor

         Runs @_launch through Helper.timerun and logs the time it took.
         An IOError raised while launching is logged and suppressed, so
         launching stays best-effort; any other exception propagates.
     :return: None
     '''
     try:
         # The result of @_launch is not needed, only the elapsed time.
         time_used, _ = Helper.timerun(self._launch)
         logger.info("extractor: launch extractor - %s, time used: %fs",
                     self.name(), time_used)
     except IOError as e:
         # Keep going, but leave a trace of the failure instead of
         # swallowing it without any diagnostics.
         logger.info("extractor: launch extractor - %s, error: %s",
                     self.name(), e)
Beispiel #2
0
    def update(self, uri, extras):
        '''
            update uri context with the extras data of its crawl response

            Delegates to @_update through Helper.timerun and logs how long
            the call took.
        :param uri: object, Uri object
        :param extras: dict, extras data for crawled response
        :return: None, the result of @_update is discarded
        '''
        elapsed, _ = Helper.timerun(self._update, uri, extras)
        logger.info(
            "linker: update link %s, updated. time used:%fs",
            uri.url(), elapsed)
Beispiel #3
0
 def shutdown(self):
     '''
         shutdown extractor

         Runs @_shutdown through Helper.timerun and logs the time it took.
         Any Exception raised on the way is logged and suppressed, so a
         failing shutdown never propagates to the caller.
     :return: None
     '''
     try:
         # The result of @_shutdown is not needed, only the elapsed time.
         time_used, _ = Helper.timerun(self._shutdown)
         logger.info("extractor: shutdown extractor - %s, time used: %fs",
                     self.name(), time_used)
     except Exception as e:
         # Format the exception itself: its `message` attribute is
         # deprecated, empty for multi-argument exceptions and absent on
         # Python 3.
         logger.info("extractor: shutdown extractor - %s, error: %s",
                     self.name(), e)
Beispiel #4
0
 def persist(self):
     '''
         persist extractor data

         Runs @_persist through Helper.timerun and logs the time it took.
         Any Exception raised on the way is logged and suppressed, so a
         failing persist never propagates to the caller.
     :return: None
     '''
     try:
         # The result of @_persist is not needed, only the elapsed time.
         time_used, _ = Helper.timerun(self._persist)
         logger.info("extractor: persist extractor - %s, time used: %fs",
                     self.name(), time_used)
     except Exception as e:
         # Format the exception itself: its `message` attribute is
         # deprecated, empty for multi-argument exceptions and absent on
         # Python 3.
         logger.info("extractor: persist extractor - %s, error: %s",
                     self.name(), e)
Beispiel #5
0
    def parse(self, uri, content):
        '''
            timed, logged wrapper around the actual @_parse method

            Uris rejected by @accept are not parsed at all.
        :param uri: object, uri for the @content
        :param content: string, content for the @url
        :return: list, list with @Uri objects, or None when @uri is not
                 accepted
        '''
        if self.accept(uri):
            elapsed, found = Helper.timerun(self._parse, uri, content)
            logger.info(
                "parser: parse links: %s, parsed. links: %d, time used: %fs",
                uri.url(), len(found), elapsed)
            return found

        return None
Beispiel #6
0
    def pull(self):
        '''
            pull next link from linker

            Calls @_pull through Helper.timerun and logs the outcome
            together with the time used.
        :return: object, the result of the pulled link's uri() call, or
                 None when @_pull yields no link
        '''
        elapsed, next_link = Helper.timerun(self._pull)

        # Nothing left to hand out: report it and stop early.
        if next_link is None:
            logger.info(
                "linker: pull link none, no more links. time used: %fs",
                elapsed)
            return None

        logger.info("linker: pull link %s, pulled. time used: %fs",
                    next_link.uri().url(), elapsed)
        return next_link.uri()
Beispiel #7
0
    def extract(self, uri, content):
        '''
            extract data from content

            Timed, logged wrapper around @_extract; uris rejected by
            @accept are skipped.
        :param uri: object, @Uri object of content
        :param content: string, content of @uri
        :return: object, whatever @_extract returns, or None when @uri is
                 not accepted
        '''
        if self.accept(uri):
            elapsed, extracted = Helper.timerun(self._extract, uri, content)
            logger.info(
                "extractor: extract data from: %s, extracted. time used: %fs",
                uri.url(), elapsed)
            return extracted

        return None
Beispiel #8
0
    def push(self, uri):
        '''
            push a uri to linker

            A uri that already exists, or that @accept rejects, is only
            logged and not pushed. Otherwise @_push is called through
            Helper.timerun and the time used is logged.
        :param uri: object, Uri object
        :return: None in every case; the result of @_push is discarded
                 NOTE(review): this used to be documented as returning the
                 key of the stored link - confirm whether callers need it
        '''
        if self.exists(uri):
            logger.info("linker: push link %s, exists.", uri.url())
        elif not self.accept(uri):
            logger.info("linker: push link %s, filtered.", uri.url())
        else:
            elapsed, _ = Helper.timerun(self._push, uri)
            logger.info(
                "linker: push link %s, pushed. time used:%fs",
                uri.url(), elapsed)