def _persist(self):
    '''
    persist linker state (filter, configures and links) to work directory
    :return:
    '''
    # persist filter of linker
    self.__filter.persist()

    # save configures of linker; close the file even if json.dump raises
    fconfigs = Helper.open(self.workdir(), self.__CONFIG_FILE_NAME, "w")
    try:
        json.dump(self.__configs.encode(), fconfigs)
    finally:
        fconfigs.close()

    # save links of linker; close the file even if json.dump raises
    flinks = Helper.open(self.workdir(), self.__LINKS_FILE_NAME, "w")
    try:
        json.dump(self.__links.encode(), flinks)
    finally:
        flinks.close()
def _launch(self):
    '''
    launch linker: restore filter, configures and links from work directory
    :return:
    '''
    # launch filter of linker
    self.__filter.launch()

    # load configures of linker; close the file even if json.load raises
    fconfigs = Helper.open(self.workdir(), self.__CONFIG_FILE_NAME, "r")
    try:
        jsonobj = json.load(fconfigs)
    finally:
        fconfigs.close()
    self.__configs.decode(jsonobj)

    # load links of linker; close the file even if json.load raises
    flinks = Helper.open(self.workdir(), self.__LINKS_FILE_NAME, "r")
    try:
        jsonobj = json.load(flinks)
    finally:
        flinks.close()
    self.__links.decode(jsonobj)
def _key(self, uri):
    '''
    generate key for @uri
    :param uri: object, @Uri object
    :return: string, md5 of url
    '''
    url = uri.url()
    return Helper.md5(url)
def launch(self):
    '''
    launch extractor
    :return:
    '''
    try:
        time_used, ret = Helper.timerun(self._launch)
        logger.info("extractor: launch extractor - %s, time used: %fs", self.name(), time_used)
    except IOError as e:
        # `except IOError, e` is python-2-only syntax; `as` works on 2.6+ and 3.
        # an IOError here presumably means there is no persisted state yet
        # (first launch) — log it instead of silently swallowing. TODO confirm.
        logger.info("extractor: launch extractor - %s, error: %s", self.name(), e)
def update(self, uri, extras):
    '''
    update uri context with crawl response extras data
    :param uri: object, Uri object
    :param extras: dict, extras data for crawled response
    :return:
    '''
    time_used, _ = Helper.timerun(self._update, uri, extras)
    logger.info(
        "linker: update link %s, updated. time used:%fs",
        uri.url(), time_used)
def shutdown(self):
    '''
    shutdown extractor
    :return:
    '''
    try:
        time_used, ret = Helper.timerun(self._shutdown)
        logger.info("extractor: shutdown extractor - %s, time used: %fs", self.name(), time_used)
    except Exception as e:
        # `except Exception, e` is python-2-only syntax and e.message is
        # deprecated (removed in python 3); log the exception object itself
        logger.info("extractor: shutdown extractor - %s, error: %s", self.name(), e)
def persist(self):
    '''
    persist extractor data
    :return:
    '''
    try:
        time_used, ret = Helper.timerun(self._persist)
        logger.info("extractor: persist extractor - %s, time used: %fs", self.name(), time_used)
    except Exception as e:
        # `except Exception, e` is python-2-only syntax and e.message is
        # deprecated (removed in python 3); log the exception object itself
        logger.info("extractor: persist extractor - %s, error: %s", self.name(), e)
def _parse(self, uri, content):
    '''
    parse "img" tag links from @content
    :param uri: object, Uri object the @content belongs to
    :param content: string, page content to parse
    :return: list, list with @Uri objects for each parsed img src
    '''
    # regex for parsing "img" tag's links.
    # NOTE: the original pattern used a greedy `.*` before ` src=`; with two
    # img tags on one line it skipped to the last src and dropped the first.
    # `[^>]*` keeps the match inside a single tag.
    regex = re.compile(r'<img[^>]*\ssrc="([^"]+)"[^>]*>', re.IGNORECASE)

    # links parsed
    links = []

    # parse links from content, resolving each src against the page url
    urls = regex.findall(content)
    for url in urls:
        url = Helper.combine_path(uri.url(), url)
        links.append(Uri(url, uri.url()))

    return links
def parse(self, uri, content):
    '''
    parse wrapper for actual @_parse method
    :param uri: object, uri for the @content
    :param content: string, content for the @url
    :return: list, list with @Uri objects, or None if @uri is not accepted
    '''
    # only parse uris this parser accepts
    if self.accept(uri):
        time_used, links = Helper.timerun(self._parse, uri, content)
        logger.info(
            "parser: parse links: %s, parsed. links: %d, time used: %fs",
            uri.url(), len(links), time_used)
        return links

    return None
def pull(self):
    '''
    pull next link from linker
    :return: object, Uri object of the pulled link, or None if no more links
    '''
    time_used, link = Helper.timerun(self._pull)

    # no more links in the linker
    if link is None:
        logger.info(
            "linker: pull link none, no more links. time used: %fs",
            time_used)
        return None

    logger.info("linker: pull link %s, pulled. time used: %fs", link.uri().url(), time_used)
    return link.uri()
def extract(self, uri, content):
    '''
    extract data from content
    :param uri: object, @Uri object of content
    :param content: string, content of @uri
    :return: object, extract result object, or None if @uri is not accepted
    '''
    # only extract from uris this extractor accepts
    if self.accept(uri):
        time_used, result = Helper.timerun(self._extract, uri, content)
        logger.info(
            "extractor: extract data from: %s, extracted. time used: %fs",
            uri.url(), time_used)
        return result

    return None
def push(self, uri):
    '''
    push a uri to linker
    :param uri: object, Uri object
    :return: object, key of stored link
    '''
    url = uri.url()

    # skip links that are already stored
    if self.exists(uri):
        logger.info("linker: push link %s, exists.", url)
        return

    # skip links rejected by the filter
    if not self.accept(uri):
        logger.info("linker: push link %s, filtered.", url)
        return

    time_used, _ = Helper.timerun(self._push, uri)
    logger.info("linker: push link %s, pushed. time used:%fs", url, time_used)
def _persist(self):
    '''
    persist filter patterns to a file in the work directory
    :return:
    '''
    # renamed from `file` which shadowed the builtin;
    # try/finally ensures the handle is closed even if json.dump raises
    fpatterns = Helper.open(self.workdir(), self.name(), "w")
    try:
        json.dump(self.__patterns, fpatterns)
    finally:
        fpatterns.close()
def _launch(self):
    '''
    load filter patterns from the work directory and apply them
    :return:
    '''
    # renamed from `file` which shadowed the builtin;
    # try/finally ensures the handle is closed even if json.load raises
    fpatterns = Helper.open(self.workdir(), self.name(), "r")
    try:
        patterns = json.load(fpatterns)
    finally:
        fpatterns.close()

    # `*` unpacks any iterable; the tuple() conversion was redundant
    self._filter(*patterns)