def test_urls(self):
    self.assertEqual(
        tuple(get_dynamic_urls(('http://host1/{1:2}/?param={3:4}',))),
        (('http://host1/1/?param=3', [1, 3], [1, 1]),
         ('http://host1/1/?param=4', [1, 4], [1, 1]),
         ('http://host1/2/?param=3', [2, 3], [1, 1]),
         ('http://host1/2/?param=4', [2, 4], [1, 1])))
def test_multiple_urls(self):
    self.assertEqual(
        tuple(get_dynamic_urls(('http://host1/{1:2}/?param={3:4}',
                                'http://host2/dummy?param={11:12}'))),
        (('http://host1/1/?param=3', [1, 3], [1, 1]),
         ('http://host1/1/?param=4', [1, 4], [1, 1]),
         ('http://host1/2/?param=3', [2, 3], [1, 1]),
         ('http://host1/2/?param=4', [2, 4], [1, 1]),
         ('http://host2/dummy?param=11', [11], [2]),
         ('http://host2/dummy?param=12', [12], [2])))
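# A minimal sketch of the expansion the two tests above exercise, assuming the
# real get_dynamic_urls in Allanon differs in its details. Each {a:b} marker in
# a raw URL is expanded over the inclusive range a..b; for every combination of
# values the generator yields (url, ids, ids_digit_len), where ids_digit_len
# carries the digit count of each range's upper bound (presumably used later to
# zero-pad generated filenames). Names with a _sketch suffix are hypothetical,
# not part of the package.
import itertools
import re

_RANGE_MARKER = re.compile(r'\{(\d+):(\d+)\}')

def get_dynamic_urls_sketch(raw_urls):
    for raw_url in raw_urls:
        # collect every {a:b} range found in the raw URL, left to right
        bounds = [(int(a), int(b)) for a, b in _RANGE_MARKER.findall(raw_url)]
        digit_lens = [len(str(b)) for _, b in bounds]
        for ids in itertools.product(*(range(a, b + 1) for a, b in bounds)):
            url = raw_url
            for value in ids:
                # replace markers one at a time, one per generated id
                url = _RANGE_MARKER.sub(str(value), url, count=1)
            yield url, list(ids), list(digit_lens)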
def main(options=None, *args):
    if not options:
        # invocation from the command line
        options, args = parser.parse_args()
    if len(args) < 1 or options.help:
        # custom version of the help, to be able to keep \n in the description
        result = ['Allanon: a crawler for visiting a predictable set of URLs, '
                  'and downloading resources from them\n']
        result.append(parser.get_usage())
        result.append(DESCRIPTION + "\n")
        result.append(parser.format_option_help(parser.formatter))
        result.append('By Luca Fabbri - luca<at>keul.it\n')
        result.append('See https://github.com/keul/Allanon for detailed documentation or '
                      'to provide bug reports.')
        print("\n".join(result))
        sys.exit(0)
    if options.user_agent:
        config.USER_AGENT = options.user_agent
    if options.timeout:
        config.TIMEOUT = options.timeout
    if options.sleep:
        config.SLEEP_TIME = options.sleep
    # first, expand the command line URLs sequence
    try:
        urls = get_dynamic_urls(args)
        index_digit_len = 0
        # optimization: we only need to count all the URLs when the filename
        # model requires the global index
        if options.filename_model and '%INDEX' in options.filename_model:
            urls = tuple(urls)
            index_digit_len = len(str(len(urls)))
        # when not downloading directly, look for inner resources
        if options.search_queries:
            urls = search_resources(urls, options.search_queries)
        for index, urls_data in enumerate(urls):
            if options.offset and (index + 1) <= options.offset:
                print("Skipping resource %d due to offset settings" % (index + 1))
                continue
            url, ids, max_ids = urls_data
            rg = ResourceGrabber(url)
            rg.download(options.destination_directory, options.filename_model, ids,
                        index + 1, ids_digit_len=max_ids,
                        index_digit_len=index_digit_len,
                        duplicate_check=options.duplicate_check)
            time.sleep(options.sleep)
    except KeyboardInterrupt:
        print("\nTerminated by user action")
        sys.exit(1)
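# Hypothetical command line session matching the URL syntax tested above (the
# console script name "allanon" comes from the project; no flags are shown
# because their exact spellings are not verified against the option parser):
#
#   $ allanon "http://host1/{1:2}/?param={3:4}"
#
# This would expand to four URLs and download one resource per combination of
# the two ranges, sleeping options.sleep seconds between requests.
#
# A conventional entry point guard, in case the module is run directly; the
# package may instead rely on a console_scripts entry point.
if __name__ == '__main__':
    main()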