Example #1
0
 def test_urls(self):
     """A single pattern URL expands to every combination of its ranges."""
     expected = (
         ('http://host1/1/?param=3', [1, 3], [1, 1]),
         ('http://host1/1/?param=4', [1, 4], [1, 1]),
         ('http://host1/2/?param=3', [2, 3], [1, 1]),
         ('http://host1/2/?param=4', [2, 4], [1, 1]),
     )
     generated = tuple(get_dynamic_urls(('http://host1/{1:2}/?param={3:4}',)))
     self.assertEqual(generated, expected)
Example #2
0
 def test_multiple_urls(self):
     """Several pattern URLs expand in order, each with its own ids/max-ids."""
     sources = ('http://host1/{1:2}/?param={3:4}',
                'http://host2/dummy?param={11:12}',)
     expected = (
         ('http://host1/1/?param=3', [1, 3], [1, 1]),
         ('http://host1/1/?param=4', [1, 4], [1, 1]),
         ('http://host1/2/?param=3', [2, 3], [1, 1]),
         ('http://host1/2/?param=4', [2, 4], [1, 1]),
         ('http://host2/dummy?param=11', [11], [2]),
         ('http://host2/dummy?param=12', [12], [2]),
     )
     self.assertEqual(tuple(get_dynamic_urls(sources)), expected)
Example #3
0
File: main.py Project: keul/Allanon
def main(options=None, *args):
    """Command-line entry point: expand the URL patterns in *args* and
    download every resulting resource.

    Parameters:
        options: parsed option object (optparse-style). When falsy, the
            module-level ``parser`` is asked to parse ``sys.argv`` and both
            ``options`` and ``args`` are taken from it.
        *args: the URL patterns to expand (ignored when ``options`` is falsy).

    Exits with status 0 after printing help (no args or ``--help``), and
    with status 1 on ``KeyboardInterrupt``.
    """
    if not options:
        # invocation from command line
        options, args = parser.parse_args()

    if len(args) < 1 or options.help:
        # personal version of the help, to being able to keep \n in description
        result = ['Allanon: a crawler for visit a predictable set of URLs, '
                  'and download resources from them\n']
        result.append(parser.get_usage())
        result.append(DESCRIPTION + "\n")
        result.append(parser.format_option_help(parser.formatter))
        result.append('By Luca Fabbri - luca<at>keul.it\n')
        result.append('See https://github.com/keul/Allanon for detailed documentation or '
                      'provide bug report.')
        # plain print (the original double parentheses were a 2to3 artifact)
        print("\n".join(result))
        sys.exit(0)

    # Propagate command-line overrides into the global configuration module.
    if options.user_agent:
        config.USER_AGENT = options.user_agent
    if options.timeout:
        config.TIMEOUT = options.timeout
    if options.sleep:
        config.SLEEP_TIME = options.sleep

    # first, command line URLs sequence
    try:
        urls = get_dynamic_urls(args)
        index_digit_len = 0

        # optimization: we don't need to count all the URLs in that case
        if options.filename_model and '%INDEX' in options.filename_model:
            urls = tuple(urls)
            index_digit_len = len(str(len(urls)))

        # in case we are not directly downloading, we need to look for inner resources
        if options.search_queries:
            urls = search_resources(urls, options.search_queries)

        # enumerate from 1 so the counter matches the human-facing resource number
        for index, urls_data in enumerate(urls, start=1):
            if options.offset and index <= options.offset:
                print("Skipping resource %d due to offset settings" % index)
                continue
            url, ids, max_ids = urls_data
            rg = ResourceGrabber(url)
            rg.download(options.destination_directory, options.filename_model, ids, index,
                        ids_digit_len=max_ids,
                        index_digit_len=index_digit_len,
                        duplicate_check=options.duplicate_check)
            # throttle between downloads (0 is a no-op for time.sleep)
            time.sleep(options.sleep)
    except KeyboardInterrupt:
        print("\nTerminated by user action")
        sys.exit(1)