logging.basicConfig(
    format='%(levelname)7s - %(name)s - %(asctime)s: %(message)s',
    filename='run.log', level=log_level)
console = logging.StreamHandler()
console.setFormatter(
    logging.Formatter('%(levelname)7s - %(name)-8s: %(message)s'))
logging.getLogger('').addHandler(console)
log = logging.getLogger('main')

# ----------------------------------------------------------------
# Load various components, and configure the modules that control
# the crawling process
#
corpus_table = CorpusTable.CorpusTable(args.dbdir)       # Storage layer
spider = HTTPClient.HTTPClient()                         # Retrieval code
url_normaliser = Normalisation.URLNormaliser()           # URL normaliser
feature_extractor = Features.Features(
    url_normaliser, ['title', 'h1'])                     # Feature extractor

# URL Fitness Function
# url_rank_function = SimplicityURLRank.SimplicityURLRank()  # Prefer simple URLs
# url_rank_function = SampleURLRank.SampleURLRank()          # Sample code
url_rank_function = HumanReadableURLRank.HumanReadableURLRank()  # Prefer human-readable URLs

page_filters = [                                         # Filters for page rejection
    # FuzzyDuplicateFilter.FuzzyDuplicateFilter(corpus_table),  # Fuzzy hash using ssdeep
    DuplicateFilter.DuplicateFilter(corpus_table),       # Perfect duplicate checker
    MinimumLengthFilter.MinimumLengthFilter(100),        # Min length
    MaximumLengthFilter.MaximumLengthFilter(800000),     # Max length
    URLCountFilter.URLCountFilter(0, 1000),              # URL count
    MetadataRegexpFilter.MetadataRegexpFilter(
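        # The filter classes above are defined in their own modules and
        # are not shown in this listing; each one presumably exposes a
        # per-page accept/reject hook, and a fetched page enters the
        # corpus only if every filter in the list accepts it.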