コード例 #1
0
ファイル: table_spider.py プロジェクト: totucuong/tablesuf
    def parse(self, response):
        """
        Handle a start response: emit its tables, then schedule its links.

        Yields every item produced by ``self.parse_tables(response)``,
        followed by one ``Request`` per ``href`` attribute of the ``<a>``
        elements in the response. Each href is passed through
        ``Utility.normalize`` together with ``response.url`` (presumably
        to turn relative links into absolute ones -- confirm against
        Utility), and the resulting requests are handled by
        ``self.follow_links``.
        """
        # HTML storage is currently switched off: self.store_html(response)
        for table_item in self.parse_tables(response):
            yield table_item

        hrefs = Selector(response).xpath("//a/@href").extract()
        for href in hrefs:
            yield Request(Utility.normalize(response.url, href),
                          callback=self.follow_links)
コード例 #2
0
ファイル: table_spider.py プロジェクト: totucuong/tablesuf
    def follow_links(self, response):
        """
        Handle a followed response: emit its tables and keep crawling.

        ``self.table_counter`` is advanced once per call; when the value
        it returns is greater than ``self.table_limit`` the spider is
        stopped by raising ``CloseSpider``. Otherwise this yields every
        item from ``self.parse_tables(response)`` and then one ``Request``
        per ``<a href>`` found in the response, each routed back to this
        same callback.

        Raises:
            CloseSpider: with reason "Enough tables" once the counter
                exceeds the limit.
        """
        responses_seen = next(self.table_counter)
        if responses_seen > self.table_limit:
            raise CloseSpider(reason="Enough tables")

        # HTML storage is currently switched off: self.store_html(response)
        for table_item in self.parse_tables(response):
            yield table_item

        hrefs = Selector(response).xpath("//a/@href").extract()
        for href in hrefs:
            yield Request(Utility.normalize(response.url, href),
                          callback=self.follow_links)
コード例 #3
0
ファイル: generator.py プロジェクト: totucuong/tablesuf
def main(argv=None):
    """
    Generate the seeds file for the web crawler.

    Parses the command line, optionally configures the logging level, and
    hands the countries, transports and seeds file paths (each prefixed
    with the root directory and a '/') to ``Utility.generate_seeds``.

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (the default) argparse reads them from ``sys.argv[1:]``.

    Raises:
        ValueError: if ``--loglevel`` does not name an integer level
            defined by the ``logging`` module.
        SystemExit: raised by argparse when a required option is missing
            or the command line cannot be parsed.
    """
    parser = argparse.ArgumentParser(
        description="A program to generate seeds file for the web crawler.")
    # All four path options are used unconditionally below, so they are
    # declared required: a missing one now produces a usage error instead
    # of a TypeError from concatenating None with a string.
    parser.add_argument("-r", "--rootdir", type=str, required=True,
                        help="path to root directory")
    parser.add_argument("-c", "--countries", type=str, required=True,
                        help="a file contains country list")
    parser.add_argument("-t", "--transports", type=str, required=True,
                        help="a file contains transportation mode")
    parser.add_argument("-s", "--seeds", type=str, required=True,
                        help="a file contains seed")
    parser.add_argument("-l", "--loglevel", type=str, help="set logging level")
    args = parser.parse_args(argv)

    if args.loglevel:
        # Map a level name such as "debug" to logging.DEBUG; anything that
        # is not an integer attribute of the logging module is rejected.
        numeric_level = getattr(logging, args.loglevel.upper(), None)
        if not isinstance(numeric_level, int):
            raise ValueError('Invalid log level: %s' % args.loglevel)
        logging.basicConfig(level=numeric_level)

    # File names are given relative to the root directory.
    countries = args.rootdir + '/' + args.countries
    transports = args.rootdir + '/' + args.transports
    seeds = args.rootdir + '/' + args.seeds
    Utility.generate_seeds(countries, transports, seeds)