def __init__(self, library_name, parser, **kwargs):
    """Set up the parser and the list of files to parse for a library.

    The file list comes from one of two sources, selected by ``kwargs``:

    * ``index`` is given: the index is fetched through ``resolver.cache``,
      parsed with ``html.parse`` and queried with the ``xpath`` expression.
      Every result matched by ``regexp`` (anchored at the start, as with
      ``re.match``) is prefixed with ``base``, and the resulting list is
      sorted.
    * otherwise: the entries of ``files`` are used, in the given order.

    Args:
        library_name: Name of the library; stored on the instance and
            passed to ``parser``.
        parser: Callable invoked as ``parser(library_name)``; its result
            is stored as ``self.parser``.
        **kwargs: Either ``index``, ``xpath``, ``regexp`` and ``base``, or
            ``files`` (an iterable of file locations).

    Raises:
        KeyError: If a required keyword argument is missing.
    """
    self.library_name = library_name
    self.parser = parser(self.library_name)

    if 'index' in kwargs:
        # Read every required option before doing any I/O, so that a
        # missing one fails immediately instead of after the index has
        # already been fetched and parsed.
        xpath = kwargs['xpath']
        base = kwargs['base']
        # Compile once; re.compile also accepts an already-compiled pattern.
        matches = re.compile(kwargs['regexp']).match

        # NOTE(review): resolver.cache presumably returns a local copy of
        # the index that html.parse can read -- confirm against resolver.
        local_file = resolver.cache(kwargs['index'])
        root = html.parse(local_file)
        uris = root.xpath(xpath)
        self.files = sorted(base + uri for uri in uris if matches(uri))
    else:
        # Copy so that the instance does not share the caller's list, and
        # so that a one-shot iterable is not exhausted by the logging loop
        # below before parse() gets to use it.
        self.files = list(kwargs['files'])

    for entry in self.files:
        log.debug(self.library_name + ': ' + entry)
def parse(self):
    """Run ``self.parser`` over every entry of ``self.files``.

    Each entry is first passed to ``resolver.cache``; whatever that call
    returns (presumably a local copy of the file -- confirm against
    ``resolver``) is handed to ``self.parser.parse``. Entries are handled
    strictly one after another, in list order.
    """
    # The generator is lazy, so each entry is cached immediately before it
    # is parsed rather than caching everything up front.
    cached_files = (resolver.cache(location) for location in self.files)
    for cached in cached_files:
        self.parser.parse(cached)