def parse(self, response):
    # Extract any tables on the start page, then follow every outgoing link.
    # self.store_html(response)
    for item in self.parse_tables(response):
        yield item
    links = Selector(response).xpath("//a/@href").extract()
    for l in links:
        normLink = Utility.normalize(response.url, l)
        yield Request(normLink, callback=self.follow_links)
def follow_links(self, response):
    # Stop the crawl once the table limit has been reached.
    if next(self.table_counter) > self.table_limit:
        raise CloseSpider(reason="Enough tables")
    # self.store_html(response)
    for item in self.parse_tables(response):
        yield item
    links = Selector(response).xpath("//a/@href").extract()
    for l in links:
        normLink = Utility.normalize(response.url, l)
        yield Request(normLink, callback=self.follow_links)
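# A minimal sketch (an assumption, not the project's actual layout) of the
# spider class the two callbacks above would sit in: the imports they need,
# the table counter/limit they consult, and a stub parse_tables().
# TableSpider, its attributes, and the stub item fields are hypothetical;
# Utility.normalize(base, href) is assumed to behave like urljoin(base, href),
# i.e. to resolve a relative link against the page URL.
import itertools

from scrapy import Request, Selector, Spider
from scrapy.exceptions import CloseSpider


class TableSpider(Spider):
    name = "tables"                      # hypothetical spider name
    start_urls = ["http://example.com"]  # placeholder seed URL

    table_limit = 1000                   # assumed cap on collected tables
    table_counter = itertools.count(1)   # next() yields the running count

    def parse_tables(self, response):
        # Hypothetical stub: emit one item per <table> element on the page.
        for table in response.xpath("//table"):
            yield {"url": response.url, "n_rows": len(table.xpath(".//tr"))}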
def main():
    """A program to generate a seeds file for the web crawler."""
    # crawlerdb example
    # db = load_db("../data/crawlerdb")
    # print('DB at first run')
    # db.display()
    # db.add_one_html()
    # db.add_n_tables(100)
    # db.display()
    # print('saving database...')
    # save_db(db, "../data/crawlerdb")
    # print('load database...')
    # db2 = load_db("../data/crawlerdb")
    # db2.display()

    parser = argparse.ArgumentParser()
    parser.add_argument("-r", "--rootdir", type=str, help="path to the root directory")
    parser.add_argument("-c", "--countries", type=str, help="file containing the country list")
    parser.add_argument("-t", "--transports", type=str, help="file containing the transportation modes")
    parser.add_argument("-s", "--seeds", type=str, help="file to write the generated seeds to")
    parser.add_argument("-l", "--loglevel", type=str, help="set the logging level")
    args = parser.parse_args()

    if args.loglevel:
        numeric_level = getattr(logging, args.loglevel.upper(), None)
        if not isinstance(numeric_level, int):
            raise ValueError('Invalid log level: %s' % args.loglevel)
        logging.basicConfig(level=numeric_level)

    countries = args.rootdir + '/' + args.countries
    transports = args.rootdir + '/' + args.transports
    seeds = args.rootdir + '/' + args.seeds
    Utility.generate_seeds(countries, transports, seeds)
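# Hedged usage sketch: main() assumes argparse, logging, and the project's
# Utility helper are imported at the top of the file, and the script would
# typically be wired up with a standard entry-point guard like the one below.
# The script name and file names in the sample invocation are illustrative only:
#
#   python generate_seeds.py -r ../data -c countries.txt -t transports.txt \
#       -s seeds.txt -l INFO
if __name__ == '__main__':
    main()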