Ejemplo n.º 1
0
    def realize(self, item):
        """Assemble the wget-lua argument list for a ``list:<value>`` item.

        ``item['item_name']`` is split on its first ``:`` into a type and a
        value, and both are stored back on ``item``.  For ``list`` items the
        URL list named by the value is downloaded; every accepted URL is
        queued together with a ``tumblr-static-url`` WARC header.  The
        argument list is finally passed to the module-level ``realize``.

        Raises:
            AssertionError: ``item_name`` contains no ``:``.
            Exception: the list request did not answer with HTTP 200, or the
                item type is not ``list``.
        """
        cmd = [
            WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies',
            '--lua-script', 'tumblr-static.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'tumblr-static-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('tumblr-static-item: %(item_name)s'),
        ]

        name = item['item_name']
        assert ':' in name
        kind, value = name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        # Only one item type is understood; reject everything else up front.
        if kind != 'list':
            raise Exception('Unknown item')

        response = requests.get(
            'http://grafana.fvz.io/.well-known/at/{}'.format(value))
        if response.status_code != 200:
            raise Exception('Could not get URLs list from github.')

        for line in response.text.splitlines():
            line = line.strip()
            candidates = [line]
            if '%20' in line:
                # A line containing "%20" is tried both as its separate
                # pieces and as one URL with every "%20" removed.
                candidates = line.split('%20')
                candidates.append(line.replace('%20', ''))
            for candidate in candidates:
                if len(candidate) == 0:
                    continue
                if not re.search(r'^https?://[^/]+/', candidate):
                    continue
                if 'www.tumblr.com' in candidate:
                    continue
                cmd.extend(['--warc-header',
                            'tumblr-static-url: {}'.format(candidate)])
                cmd.append(candidate)

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            cmd.extend(['--bind-address', address])
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(cmd, item)
Ejemplo n.º 2
0
    def realize(self, item):
        """Assemble the wget-at argument list for crawling one domain.

        ``item['item_name']`` is used as the domain: it restricts the crawl
        via ``--domains``, is recorded in a ``domain`` WARC header, and is
        seeded over both ``http`` and ``https``.  When the name contains
        exactly one dot, the ``www.`` variants are seeded as well.  The
        argument list is passed to the module-level ``realize``.
        """
        cmd = [
            WGET_AT, '-U', USER_AGENT, '-nv', '--content-on-error',
            '--lua-script', 'domains.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'domains-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('domains-item: %(item_name)s'),
            '--warc-dedup-url-agnostic',
        ]

        domain = item['item_name']
        cmd.extend(['--domains', domain])
        cmd.extend(['--warc-header', 'domain: ' + domain])

        seed_templates = ['http://{}/', 'https://{}/']
        if domain.count('.') == 1:
            # Exactly one dot: also try the "www." host.
            seed_templates.extend(['http://www.{}/', 'https://www.{}/'])
        cmd.extend(template.format(domain) for template in seed_templates)

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            cmd.extend(['--bind-address', address])
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(cmd, item)
Ejemplo n.º 3
0
    def realize(self, item):
        """Assemble the wget-at argument list for one Mercurial repository.

        ``item['item_name']`` is used unchanged as the repository value: it
        is stored as ``item['item_value']``, recorded in a
        ``mercurial-repository`` WARC header, and seeded with
        ``?cmd=capabilities`` appended.  The WARC file name carries a
        ``-main`` suffix and a ``warc-type: main`` header.  The argument list
        is passed to the module-level ``realize``.
        """
        cmd = [
            WGET_AT,
            '-U', USER_AGENT,
            '-nv',
            '--no-cookies',
            '--content-on-error',
            '--lua-script', 'mercurial.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s-main'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'mercurial-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('mercurial-item: %(item_name)s'),
            '--warc-dedup-url-agnostic',
        ]

        repository = item['item_name']
        item['item_value'] = repository

        cmd.extend(['--warc-header',
                    'mercurial-repository: ' + str(repository)])
        cmd.extend(['--warc-header', 'warc-type: main'])
        cmd.append(repository + '?cmd=capabilities')

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            cmd.extend(['--bind-address', address])
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(cmd, item)
    def realize(self, item):
        """Assemble the wpull argument list for a ``<type>:<value>`` item.

        ``item['item_name']`` is split on its first ``:``; both parts are
        stored back on ``item``.  If the value contains ``-videos`` the
        youtube-dl options are added.  The URL list is fetched from
        ``http://master.newsbuddy.net/<value>`` and, only when the response
        is HTTP 200, each stripped line is appended as a URL argument.  The
        argument list is passed to the module-level ``realize``.
        """
        kind, value = item['item_name'].split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        cmd = [
            WPULL_EXE, '-nv',
            '-U', 'ArchiveTeam; Googlebot/2.1',
            '--no-check-certificate', '--no-robots',
            '--dns-timeout', '20', '--connect-timeout', '20',
            '--read-timeout', '900', '--session-timeout', '1800',
            '--tries', '5', '--waitretry', '5', '--max-redirect', '20',
            '--output-file', ItemInterpolation('%(item_dir)s/wpull.log'),
            '--database', ItemInterpolation('%(item_dir)s/wpull.db'),
            '--delete-after', '--page-requisites', '--no-parent',
            '--concurrent', '5',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--level', '0', '--page-requisites-level', '5',
            '--span-hosts-allow', 'page-requisites',
            '--warc-header', 'pipeline-py-sha256: ' + PIPELINE_SHA256,
            '--warc-header',
            'warrior-install-sh-sha256: ' + WARRIOR_INSTALL_SHA256,
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'newsgrabber-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('ftp-item: %(item_name)s'),
            '--reject-regex',
            r'^https?://launcher\.spot\.im/spot/(www\.spot\.im/launcher/|launcher\.spot\.im/|modules/launcher/){3,}bundle\.js$',
        ]

        if '-videos' in value:
            cmd.extend(['--youtube-dl', '--youtube-dl-exe', YOUTUBE_DL_EXE])

        response = requests.get('http://master.newsbuddy.net/' + value)
        if response.status_code == 200:
            # Any other status is ignored and no URLs are queued.
            cmd.extend(line.strip() for line in response.text.splitlines())

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            cmd.extend(['--bind-address', address])
            print('')
            print('*** Wpull will bind address at {0} ***'.format(address))
            print('')

        return realize(cmd, item)
Ejemplo n.º 5
0
    def realize(self, item):
        """Assemble the wget-lua argument list for an ``ffaddon:<id>`` item.

        ``item['item_name']`` is split on its first ``:``; both parts are
        stored back on ``item``.  For ``ffaddon`` items the add-on page and
        the v3/v4 API endpoints are seeded and the identifier is recorded in
        a ``firefox-addon-identifier`` WARC header.  The argument list is
        passed to the module-level ``realize``.

        Raises:
            AssertionError: ``item_name`` contains no ``:``.
            Exception: the item type is not ``ffaddon``.
        """
        cmd = [
            WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies',
            '--lua-script', 'firefox-addons.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            '--domains', 'mozilla.org',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'firefox-addons-dld-script-version: ' + VERSION,
            '--warc-header',
            ItemInterpolation('firefox-addons-item: %(item_name)s'),
        ]

        name = item['item_name']
        assert ':' in name
        kind, value = name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        if kind != 'ffaddon':
            raise Exception('Unknown item')

        cmd.extend(['--warc-header',
                    'firefox-addon-identifier: {}'.format(value)])
        seed_templates = (
            'https://addons.mozilla.org/en-US/firefox/addon/{}/',
            'https://services.addons.mozilla.org/api/v3/addons/addon/{}/',
            'https://services.addons.mozilla.org/api/v4/addons/addon/{}/',
        )
        for template in seed_templates:
            cmd.append(template.format(value))

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            cmd.extend(['--bind-address', address])
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(cmd, item)
Ejemplo n.º 6
0
    def realize(self, item):
        """Assemble the wget-at argument list for a batch of Parler items.

        The zstd dictionary returned by ``ZstdDict.get_dict()`` is written to
        ``<item_dir>/zstdict`` and its id is stored on ``item``.
        ``item['item_name']`` holds one or more ``<type>:<value>`` names
        separated by NUL characters; each one is recorded in an
        ``x-wget-at-project-item-name`` WARC header, queued as an
        ``item-name://`` pseudo-URL, and then expanded by type:

        * ``post``    -> ``https://parler.com/post/<value>``
        * ``profile`` -> ``https://parler.com/profile/<value>``
        * ``url``     -> the value itself

        A newline-separated copy of the names is stored as
        ``item['item_name_newline']``.  The argument list is passed to the
        module-level ``realize``.

        Raises:
            ValueError: an item name has an unsupported type (or no ``:``).
        """
        wget_args = [
            WGET_AT, '-U', USER_AGENT, '-nv', '--content-on-error',
            '--lua-script', 'parler.lua', '-o',
            ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate', '--output-document',
            ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output',
            '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf',
            '--no-parent', '--page-requisites', '--timeout', '30', '--tries',
            'inf', '--domains', 'parler.com', '--span-hosts', '--waitretry',
            '30', '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team', '--warc-header',
            'x-wget-at-project-version: ' + VERSION, '--warc-header',
            'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic', '--warc-compression-use-zstd',
            '--warc-zstd-dict-no-include'
        ]

        # Store the compression dictionary next to the item so wget can
        # load it through --warc-zstd-dict.
        dict_data = ZstdDict.get_dict()
        with open(os.path.join(item['item_dir'], 'zstdict'), 'wb') as f:
            f.write(dict_data['dict'])
        item['dict_id'] = dict_data['id']
        item['dict_project'] = TRACKER_ID
        wget_args.extend([
            '--warc-zstd-dict',
            ItemInterpolation('%(item_dir)s/zstdict'),
        ])

        for item_name in item['item_name'].split('\0'):
            wget_args.extend(
                ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
            wget_args.append('item-name://' + item_name)
            item_type, item_value = item_name.split(':', 1)
            if item_type == 'post':
                wget_args.extend(
                    ['--warc-header', 'parler-post: {}'.format(item_value)])
                wget_args.append(
                    'https://parler.com/post/{}'.format(item_value))
            elif item_type == 'profile':
                # Profiles get their own header name; the previous code
                # reused the 'parler-post' header from the branch above.
                wget_args.extend(
                    ['--warc-header', 'parler-profile: {}'.format(item_value)])
                wget_args.append(
                    'https://parler.com/profile/{}'.format(item_value))
            elif item_type == 'url':
                wget_args.append(item_value)
            else:
                raise ValueError('item_type not supported.')

        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 7
0
    def realize(self, item):
        """Assemble the wget-at argument list for one Pastebin paste.

        The zstd dictionary returned by ``ZstdDict.get_dict()`` is written to
        ``<item_dir>/zstdict`` and its id is stored on ``item``.  The paste
        key is ``item['item_name']`` itself unless it is longer than eight
        characters; in that case any ``b36.`` substring is removed, the rest
        is parsed as a base-36 integer and converted with
        ``self.int_to_str``.  The key is stored as ``item['item_value']``,
        recorded in a ``pastebin-paste`` WARC header and seeded as
        ``https://pastebin.com/<key>``.  The argument list is passed to the
        module-level ``realize``.
        """
        cmd = [
            WGET_AT, '-U', USER_AGENT, '-nv', '--no-cookies',
            '--content-on-error',
            '--lua-script', 'pastebin.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            '--domains', 'pastebin.com',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'pastebin-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('pastebin-item: %(item_name)s'),
            '--warc-dedup-url-agnostic', '--warc-compression-use-zstd',
            '--warc-zstd-dict-no-include',
        ]

        zstd_dict = ZstdDict.get_dict()
        dict_path = os.path.join(item['item_dir'], 'zstdict')
        with open(dict_path, 'wb') as handle:
            handle.write(zstd_dict['dict'])
        item['dict_id'] = zstd_dict['id']
        item['dict_project'] = TRACKER_ID
        cmd.extend(['--warc-zstd-dict',
                    ItemInterpolation('%(item_dir)s/zstdict')])

        paste_key = item['item_name']
        if len(paste_key) > 8:
            # Long names are treated as base-36 encoded numbers.
            number = int(paste_key.replace('b36.', ''), 36)
            paste_key = self.int_to_str(number)
        item['item_value'] = paste_key

        cmd.extend(['--warc-header', 'pastebin-paste: ' + str(paste_key)])
        cmd.append('https://pastebin.com/{}'.format(paste_key))

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            cmd.extend(['--bind-address', address])
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(cmd, item)
Ejemplo n.º 8
0
    def realize(self, item):
        """Assemble the wget-lua argument list for a Sketch item.

        ``item['item_name']`` is split on its first ``:``; both parts are
        stored back on ``item``.

        * ``sketches``: the id list is fetched from
          ``http://103.230.141.2/sketch/<value>``; every non-empty line is
          recorded in a ``sketch-sketch-id`` WARC header and seeded as a
          ``sharedsketch`` API URL.
        * ``user``: the value is recorded in a ``sketch-user-id`` WARC header
          and seeded as an ``artist`` API URL.

        The argument list is passed to the module-level ``realize``.

        Raises:
            AssertionError: ``item_name`` contains no ``:``.
            Exception: the item type is neither ``sketches`` nor ``user``.
        """
        wget_args = [
            WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies', '--lua-script',
            'sketch.lua', '-o',
            ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate', '--output-document',
            ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output',
            '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf',
            '--no-parent', '--page-requisites', '--timeout', '30', '--tries',
            'inf', '--domains', 'sonymobile.com', '--span-hosts',
            '--waitretry', '30', '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team', '--warc-header',
            'sketch-dld-script-version: ' + VERSION, '--warc-header',
            ItemInterpolation('sketch-item: %(item_name)s'), '--header',
            'Accept-Encoding: gzip', '--compression', 'gzip'
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        http_client = httpclient.HTTPClient()
        # try/finally so the client is closed even when the fetch fails or
        # the item type is rejected.
        try:
            if item_type == 'sketches':
                r = http_client.fetch(
                    'http://103.230.141.2/sketch/' + item_value,
                    method='GET')
                for s in r.body.decode('utf-8', 'ignore').splitlines():
                    s = s.strip()
                    if len(s) == 0:
                        continue
                    wget_args.extend(
                        ['--warc-header', 'sketch-sketch-id: {}'.format(s)])
                    wget_args.append(
                        'https://sketch.sonymobile.com/api/1/sharedsketch/{}'.
                        format(s))
            elif item_type == 'user':
                # The format string needs a placeholder; without it the
                # header was written with no user id at all.
                wget_args.extend(
                    ['--warc-header',
                     'sketch-user-id: {}'.format(item_value)])
                wget_args.append(
                    'https://sketch.sonymobile.com/api/1/artist/{}'.format(
                        item_value))
            else:
                raise Exception('Unknown item')
        finally:
            http_client.close()

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 9
0
    def realize(self, item):
        """Assemble the wget-lua argument list for a codebender sketch range.

        ``item['item_name']`` is split on its first ``:``; both parts are
        stored back on ``item``.  For ``sketches`` items the value is a
        ``<start>-<stop>`` range (inclusive); each id in the range gets a
        ``codebender-sketch`` WARC header plus the sketch page and download
        URLs, each with and without ``?noCookies=true``.  The argument list
        is passed to the module-level ``realize``.

        Note that ``user`` passes the type assertion but is still rejected
        with ``Exception('Unknown item')``.

        Raises:
            AssertionError: ``item_name`` has no ``:`` or the type is neither
                ``sketches`` nor ``user``.
            Exception: the item type is not ``sketches``.
        """
        # Cookies are loaded from cookies.txt rather than disabled.
        cmd = [
            WGET_LUA, '-U', USER_AGENT, '-nv',
            '--load-cookies', 'cookies.txt',
            '--lua-script', 'codebender.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            '--domains', 'codebender.cc',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'codebender-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('codebender-item: %(item_name)s'),
        ]

        name = item['item_name']
        assert ':' in name
        kind, value = name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        assert kind in ('sketches', 'user')
        if kind != 'sketches':
            raise Exception('Unknown item')

        first, last = value.split('-')
        for number in range(int(first), int(last) + 1):
            sketch_id = str(number)
            page = 'https://codebender.cc/sketch:' + sketch_id
            download = 'https://codebender.cc/utilities/download/' + sketch_id
            cmd.extend(['--warc-header', 'codebender-sketch: ' + sketch_id])
            cmd.extend([
                page,
                page + '?noCookies=true',
                download,
                download + '?noCookies=true',
            ])

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            cmd.extend(['--bind-address', address])
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(cmd, item)
Ejemplo n.º 10
0
    def realize(self, item):
        """Assemble the wget-at argument list for So-net U-Page+ user dirs.

        ``item['item_name']`` holds one or more names separated by NUL
        characters.  ``http://`` is removed from each name, which is then
        recorded in an ``x-wget-at-project-item-name`` WARC header, queued as
        an ``item-name://`` pseudo-URL and split on its first ``:``.  For
        ``userdir`` items the value is ``<hostname>/<dir>``; four URL forms
        of that directory are seeded.  A newline-separated copy of the
        original names is stored as ``item['item_name_newline']``.  The
        argument list is passed to the module-level ``realize``.

        Raises:
            ValueError: an item name has a type other than ``userdir`` (or no
                ``:``).
            IndexError: a ``userdir`` value contains no ``/``.
        """
        cmd = [
            WGET_AT, '-U', USER_AGENT, '-nv', '--content-on-error',
            '--lua-script', 'so-net-u-page-plus.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            '--domains', 'upp.so-net.ne.jp',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
        ]

        for raw_name in item['item_name'].split('\0'):
            name = raw_name.replace('http://', '')
            cmd.extend(['--warc-header',
                        'x-wget-at-project-item-name: ' + name])
            cmd.append('item-name://' + name)

            kind, value = name.split(':', 1)
            if kind != "userdir":
                raise ValueError('item_type not supported.')

            cmd.extend(['--warc-header',
                        'so-net-u-page-plus-userdir: ' + value])
            parts = value.split("/")
            host = parts[0]
            directory = parts[1]
            base = 'http://{}.upp.so-net.ne.jp/{}'.format(host, directory)
            # Several spellings of the same directory are seeded because it
            # is not known which one the server answers.
            cmd.extend([
                base + '/',
                base + '/index.htm',
                base + '/index.html',
                base,
            ])

        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            cmd.extend(['--bind-address', address])
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(cmd, item)
Ejemplo n.º 11
0
    def realize(self, item):
        """Assemble the wget-lua argument list for an ``oldcourse`` item.

        ``item['item_name']`` is split on its first ``:``; both parts are
        stored back on ``item``.  For ``oldcourse`` items a login request is
        first issued through ``os.system`` with randomly generated CSRF
        values so that ``cookies.txt`` is written, the downloaded ``v3``
        response file is removed, and the course page is seeded.  The
        argument list is passed to the module-level ``realize``.

        Raises:
            AssertionError: ``item_name`` has no ``:`` or the type is not
                ``oldcourse``.
            OSError: the ``v3`` file left by the login request cannot be
                removed (e.g. it was never created).
        """
        wget_args = [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "--load-cookies", "cookies.txt",
            "--lua-script", "coursera.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--recursive", "--level=inf",
            "--no-parent",
            "--page-requisites",
            "--timeout", "30",
            "--tries", "inf",
            "--domains", "coursera.org",
            "--span-hosts",
            "--waitretry", "30",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "coursera-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("coursera-user: %(item_name)s"),
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        # One-element tuple: ('oldcourse') without the comma is just a
        # string, which turned this into a substring test.
        assert item_type in ('oldcourse',)

        if item_type == 'oldcourse':
            # The CSRF values only have to agree between cookie and headers,
            # so random digit strings are used.
            csrf_token = ''.join(random.choice(string.digits) for i in range(20))
            csrf2_cookie = 'csrf2_token_' + ''.join(random.choice(string.digits) for i in range(8))
            csrf2_token = ''.join(random.choice(string.digits) for i in range(24))
            cookie = "csrftoken=%s; %s=%s" % (csrf_token, csrf2_cookie, csrf2_token)
            # NOTE(review): the "[email protected]" text in --post-data looks
            # like an e-mail obfuscation artifact; confirm the intended
            # login parameters before relying on this request.
            login_command = (
                WGET_LUA
                + " --save-cookies cookies.txt --keep-session-cookies"
                + " --post-data '[email protected]&password=123456&webrequest=true'"
                + " --header='Cookie: " + cookie + "'"
                + " --header='X-CSRFToken: " + csrf_token + "'"
                + " --header='X-CSRF2-Cookie: " + csrf2_cookie + "'"
                + " --header='X-CSRF2-Token: " + csrf2_token + "'"
                + " https://www.coursera.org/api/login/v3"
            )
            os.system(login_command)
            # wget saves the login response as "v3"; only the cookies matter.
            os.remove('v3')
            wget_args.append('https://www.coursera.org/course/' + item_value)
        else:
            raise Exception('Unknown item')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 12
0
    def realize(self, item):
        """Assemble the wpull argument list for crawling one domain.

        ``item['item_name']`` is used as the domain and seeded as
        ``http://<domain>``.  The crawl is recursive to depth 2 and uses the
        ``examplecity.py`` script.  The argument list is passed to the
        module-level ``realize``.
        """
        # No user agent or --domains/--hostnames restriction is passed.
        cmd = [
            WPULL_EXE, '-nv',
            '--python-script', 'examplecity.py',
            '-o', ItemInterpolation('%(item_dir)s/wpull.log'),
            '--no-check-certificate',
            '--database', ItemInterpolation('%(item_dir)s/wpull.db'),
            '--delete-after', '--no-robots', '--no-cookies', '--rotate-dns',
            '--recursive', '--level=2',
            '--no-parent', '--page-requisites',
            '--span-hosts-allow', 'page-requisites,linked-pages',
            '--timeout', '30', '--tries', '2',
            '--wait', '0.5', '--random-wait', '--waitretry', '5',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'examplecity-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('examplecity-user: %(item_name)s'),
        ]

        cmd.append('http://{0}'.format(item['item_name']))

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            cmd.extend(['--bind-address', address])
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(cmd, item)
Ejemplo n.º 13
0
    def realize(self, item):
        """Assemble the wget-lua argument list for a plays.tv item.

        ``item['item_name']`` is split on its first ``:``; both parts are
        stored back on ``item``.

        * ``user``: the value is recorded in a ``playstv-user-id`` WARC
          header and seeded as a ``usersys`` API URL.
        * ``video``: the value is a ``;``-separated id list; each id is
          printed, recorded in a ``playstv-video-id`` WARC header and seeded
          as a ``feedsys`` media API URL.

        The argument list is passed to the module-level ``realize``.

        Raises:
            AssertionError: ``item_name`` contains no ``:``.
            Exception: the item type is neither ``user`` nor ``video``.
        """
        cmd = [
            WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies',
            '--content-on-error',
            '--lua-script', 'playstv.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            '--domains', 'plays.tv',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'playstv-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('playstv-item: %(item_name)s'),
        ]

        name = item['item_name']
        assert ':' in name
        kind, value = name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        if kind == 'user':
            cmd.extend(['--warc-header', 'playstv-user-id: ' + value])
            cmd.append('https://plays.tv/playsapi/usersys/v1/user/' + value)
        elif kind == 'video':
            for video_id in value.split(';'):
                print(video_id)
                cmd.extend(['--warc-header', 'playstv-video-id: ' + video_id])
                cmd.append(
                    'https://plays.tv/playsapi/feedsys/v1/media/' + video_id)
        else:
            raise Exception('Unknown item')

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            cmd.extend(['--bind-address', address])
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(cmd, item)
    def realize(self, item):
        """Assemble the wget-at argument list for Super Mario Maker bookmarks.

        ``item['item_name']`` holds one or more ``<type>:<value>`` names
        separated by NUL characters; a newline-separated copy is stored as
        ``item['item_name_newline']``.  Each name is recorded in an
        ``x-wget-at-project-item-name`` WARC header, queued as an
        ``item-name://`` pseudo-URL, and expanded by type:

        * ``user``   -> ``.../profile/<value>``
        * ``course`` -> ``.../courses/<value>``

        The argument list is passed to the module-level ``realize``.

        Raises:
            ValueError: an item name has an unsupported type (or no ``:``).
        """
        # NOTE(review): '--domains voat.co' does not match the
        # nintendo.net seed URLs below -- looks like a leftover from
        # another project; confirm before changing.
        wget_args = [
            WGET_AT, '-U', USER_AGENT, '-nv', '--content-on-error',
            '--load-cookies', 'cookies.txt', '--lua-script',
            'super-mario-maker-bookmarks.lua', '-o',
            ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate', '--output-document',
            ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output',
            '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf',
            '--no-parent', '--page-requisites', '--timeout', '30', '--tries',
            'inf', '--domains', 'voat.co', '--span-hosts', '--waitretry', '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team', '--warc-header',
            'x-wget-at-project-version: ' + VERSION, '--warc-header',
            'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic'
        ]

        item_names = item['item_name'].split('\0')
        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        for item_name in item_names:
            wget_args.extend(
                ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
            wget_args.append('item-name://' + item_name)
            item_type, item_value = item_name.split(':', 1)
            if item_type == 'user':
                # Mirrors the 'course' branch: header first, then seed URL.
                wget_args.extend([
                    '--warc-header',
                    'super-mario-world-bookmarks-user: ' + item_value
                ])
                wget_args.append(
                    'https://supermariomakerbookmark.nintendo.net/profile/' +
                    item_value)
            elif item_type == 'course':
                wget_args.extend([
                    '--warc-header',
                    'super-mario-world-bookmarks-course: ' + item_value
                ])
                wget_args.append(
                    'https://supermariomakerbookmark.nintendo.net/courses/' +
                    item_value)
            else:
                raise ValueError('item_type not supported.')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 15
0
    def realize(self, item):
        """Assemble the wget-at arguments for one Bitbucket item.

        The item name has the form ``type:value``; only the ``hg`` type is
        handled. The dictionary returned by ``ZstdDict.get_dict()`` is
        written to ``<item_dir>/zstdict`` and its id plus the tracker id are
        recorded on ``item`` before the item name is parsed.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            ValueError: if the item name contains no ``:``.
            Exception: if the item type is not ``hg``.
        """
        wget_args = [
            WGET_AT,
            '-U', USER_AGENT,
            '-nv',
            '--no-cookies',
            '--content-on-error',
            '--lua-script', 'bitbucket.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'bitbucket.com',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'bitbucket-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('bitbucket-item: %(item_name)s'),
            '--warc-dedup-url-agnostic',
            '--warc-compression-use-zstd',
            '--warc-zstd-dict-no-include',
        ]

        # Store the compression dictionary next to the item's other files.
        zstd_info = ZstdDict.get_dict()
        dict_path = os.path.join(item['item_dir'], 'zstdict')
        with open(dict_path, 'wb') as dict_file:
            dict_file.write(zstd_info['dict'])
        item['dict_id'] = zstd_info['id']
        item['dict_project'] = TRACKER_ID
        wget_args += [
            '--warc-zstd-dict',
            ItemInterpolation('%(item_dir)s/zstdict'),
        ]

        item_type, item_value = item['item_name'].split(':', 1)
        item['item_type'] = item_type
        item['item_value'] = item_value

        if item_type != 'hg':
            raise Exception('Unknown item')

        repo_url = 'https://bitbucket.org/' + item_value
        wget_args += [
            repo_url + '/src/',
            repo_url,
            repo_url + '/src/default/',
            'https://bitbucket.org/!api/2.0/repositories/' + item_value,
        ]

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 16
0
    def realize(self, item):
        """Assemble the wget arguments for one gfycat ``disco`` item.

        The item name has the form ``type:value``. For ``disco`` items one
        API URL is queued per non-comment line of the local ``animals``
        file, each formed from the item value followed by the stripped line.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            AssertionError: if the item name has no ``:`` (when assertions
                are enabled).
            Exception: if the item type is not ``disco``.
        """
        wget_args = [
            WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies',
            '--content-on-error', '--lua-script', 'gfycat.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf', '--domains', 'gfycat.com',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'gfycat-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('gfycat-item: %(item_name)s'),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        item_type, item_value = full_name.split(':', 1)
        item['item_type'] = item_type
        item['item_value'] = item_value

        if item_type != 'disco':
            raise Exception('Unknown item')

        url_prefix = 'https://api.gfycat.com/v1/gfycats/' + item_value
        with open('animals', 'r') as animals:
            # Lines starting with '#' are treated as comments and skipped.
            wget_args.extend(
                url_prefix + entry.strip()
                for entry in animals
                if not entry.startswith('#'))

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 17
0
 def __init__(self):
     """Set up the "CookWARC" external process.

     The command runs ``warc2warc_greader.py`` from ``PIPELINE_DIR`` with
     the current Python interpreter. It is passed the item's
     ``.warc.gz`` path as its positional argument, the ``.cooked.warc.gz``
     path as ``--output`` and the ``.hrefs.bz2`` path as
     ``--json-hrefs-file``.
     """
     script_path = os.path.join(PIPELINE_DIR, "warc2warc_greader.py")
     hrefs_path = ItemInterpolation(
         "%(data_dir)s/%(warc_file_base)s.hrefs.bz2")
     cooked_path = ItemInterpolation(
         "%(data_dir)s/%(warc_file_base)s.cooked.warc.gz")
     source_path = ItemInterpolation(
         "%(data_dir)s/%(warc_file_base)s.warc.gz")

     command = [sys.executable, script_path]
     command += ["--gzip", "--decode_http", "--strip-404s"]
     command += ["--json-hrefs-file", hrefs_path]
     command += ["--output", cooked_path]
     command.append(source_path)

     ExternalProcess.__init__(self, "CookWARC", command)
Ejemplo n.º 18
0
    def realize(self, item):
        """Build the wget-at argument list for Google Poly items.

        ``item['item_name']`` holds one or more ``type:value`` names joined
        by NUL characters. Each name contributes a project item-name WARC
        header, an ``item-name://`` marker argument, a type-specific WARC
        header and the view or user URL. A newline-joined copy of the names
        is stored in ``item['item_name_newline']`` after all names were
        processed.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            ValueError: if a name has no ``:`` separator, or its type is
                neither ``poly`` nor ``user``.
        """
        wget_args = [
            WGET_AT,
            '-U', USER_AGENT,
            '-nv',
            '--content-on-error',
            '--lua-script', 'google-poly.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'poly.google.com',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
            '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8',
        ]

        for item_name in item['item_name'].split('\0'):
            wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])
            wget_args.append('item-name://'+item_name)
            item_type, item_value = item_name.split(':', 1)
            if item_type == 'poly':
                wget_args.extend(['--warc-header', 'google-poly-item: '+item_value])
                wget_args.append('https://poly.google.com/view/'+item_value)
            elif item_type == 'user':
                # Mirrors the 'poly' branch: one header, then the seed URL.
                wget_args.extend(['--warc-header', 'google-poly-user: '+item_value])
                wget_args.append('https://poly.google.com/user/'+item_value)
            else:
                raise ValueError('item_type not supported.')

        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 19
0
    def realize(self, item):
        """Assemble the wget arguments for one vid.me ``video`` item.

        The item name has the form ``type:value``. For ``video`` items a
        video-id WARC header and four API URLs (video, upnext, likes,
        comments) are queued.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            AssertionError: if the item name has no ``:`` (when assertions
                are enabled).
            Exception: if the item type is not ``video``.
        """
        wget_args = [
            WGET_LUA, "-U", USER_AGENT, "-nv", "--no-cookies",
            "--content-on-error", "--lua-script", "vidme.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output", "-e", "robots=off", "--rotate-dns",
            "--recursive", "--level=inf", "--no-parent", "--page-requisites",
            "--timeout", "30", "--tries", "inf", "--domains", "vid.me",
            "--span-hosts", "--waitretry", "30",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "vidme-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("vidme-item: %(item_name)s"),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        item_type, item_value = full_name.split(':', 1)
        item['item_type'] = item_type
        item['item_value'] = item_value

        if item_type != 'video':
            raise Exception('Unknown item')

        video_api = 'https://api.vid.me/video/{i}'.format(i=item_value)
        wget_args += [
            '--warc-header', 'vidme-video-id: {i}'.format(i=item_value),
            video_api,
            video_api + '/upnext',
            video_api + '/likes?offset=0&limit=10',
            video_api + '/comments?offsetAtParentLevel=true&order=score&offset=0&limit=20',
        ]

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 20
0
    def realize(self, item):
        """Build the wget-at argument list for Bintray items.

        ``item['item_name']`` holds one or more ``type:value`` names joined
        by NUL characters. Each name contributes a project item-name WARC
        header, an ``item-name://`` marker argument, a type-specific WARC
        header and its seed URL(s): the user page with and without a
        trailing slash for ``user`` items, or the value itself for ``file``
        items. A newline-joined copy of the names is stored in
        ``item['item_name_newline']``.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            AssertionError: if a name is ``user:account`` or ``user:assets``,
                or a ``file`` value does not start with ``http`` (when
                assertions are enabled).
            ValueError: if a name has no ``:`` separator, or its type is
                neither ``user`` nor ``file``.
        """
        # NOTE(review): '--domains voat.co' does not match the bintray.com
        # URLs queued below; it looks copied from another project - confirm.
        wget_args = [
            WGET_AT, '-U', USER_AGENT, '-nv', '--content-on-error',
            '--load-cookies', 'cookies.txt', '--lua-script', 'bintray.lua',
            '-o',
            ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate', '--output-document',
            ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output',
            '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf',
            '--no-parent', '--page-requisites', '--timeout', '30', '--tries',
            'inf', '--domains', 'voat.co', '--span-hosts', '--waitretry', '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team', '--warc-header',
            'x-wget-at-project-version: ' + VERSION, '--warc-header',
            'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic'
        ]

        item_names = item['item_name'].split('\0')
        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        for item_name in item_names:
            assert item_name not in {'user:account', 'user:assets'
                                     }, 'Doing this out of caution'
            wget_args.extend(
                ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
            wget_args.append('item-name://' + item_name)
            item_type, item_value = item_name.split(':', 1)
            if item_type == 'user':
                wget_args.extend(
                    ['--warc-header', 'bintray-user: ' + item_value])
                # Queue the user page both without and with a trailing slash.
                wget_args.append(f'https://bintray.com/{item_value}')
                wget_args.append(f'https://bintray.com/{item_value}/')
            elif item_type == 'file':
                wget_args.extend(
                    ['--warc-header', 'bintray-file: ' + item_value])
                assert item_value.startswith(
                    "http"), "If this fails, something strange has happened"
                wget_args.append(item_value)
            else:
                raise ValueError('item_type not supported.')

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 21
0
    def realize(self, item):
        """Assemble the wget-at arguments for one Mixer item.

        The item name has the form ``type:value``. ``clip`` items queue the
        clips API URL; ``rec`` and ``rec-meta`` items queue the recordings
        API URL. Each also adds a matching WARC header.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            ValueError: if the item name contains no ``:``.
            Exception: if the item type is not one of the above.
        """
        wget_args = [
            WGET_AT, '-U', USER_AGENT, '-nv', '--no-cookies',
            '--content-on-error', '--lua-script', 'mixer.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf', '--domains', 'mixer.com',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'mixer-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('mixer-item: %(item_name)s'),
            '--warc-dedup-url-agnostic',
        ]

        item_type, item_value = item['item_name'].split(':', 1)
        item['item_type'] = item_type
        item['item_value'] = item_value

        # Pick the header label and API endpoint for this item type.
        if item_type == 'clip':
            header_label = 'mixer-clip: '
            endpoint = 'https://mixer.com/api/v1/clips/'
        elif item_type in ('rec', 'rec-meta'):
            header_label = 'mixer-recording: '
            endpoint = 'https://mixer.com/api/v1/recordings/'
        else:
            raise Exception('Unknown item')

        wget_args += ['--warc-header', header_label + item_value]
        wget_args.append(endpoint + item_value)

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 22
0
    def realize(self, item):
        """Assemble the wget arguments for a ``tumblr-blog`` item.

        The item name has the form ``tumblr-blog:a:b:...``; the value is
        split on ``:`` and every part is treated as a blog name. For each
        blog a WARC header, the blog root URL and its ``sitemap.xml`` URL
        are queued.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            AssertionError: if the item name has no ``:`` (when assertions
                are enabled).
            Exception: if the item type is not ``tumblr-blog``.
        """
        wget_args = [
            WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies',
            '--lua-script', 'tumblr.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf', '--domains', 'tumblr.com',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'tumblr-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('tumblr-blog: %(item_name)s'),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        item_type, item_value = full_name.split(':', 1)
        item['item_type'] = item_type
        item['item_value'] = item_value

        if item_type != 'tumblr-blog':
            raise Exception('Unknown item')

        for blog in item_value.split(':'):
            blog_root = 'http://{}.tumblr.com/'.format(blog)
            wget_args += [
                '--warc-header', 'tumblr-blog: ' + blog,
                blog_root,
                blog_root + 'sitemap.xml',
            ]

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 23
0
    def realize(self, item):
        """Build the wget argument list for a Sketch ``sketches``/``tests`` item.

        The item name has the form ``type:value``. The list of sketch ids is
        fetched from the ``marked/sketch-items`` repository on
        raw.githubusercontent.com at ``<type>/<value>``; every non-empty
        line becomes one ``.../feed/<id>/image`` URL.

        The HTTP client is created only after the item type has been
        validated and is closed in a ``finally`` block, so it is released
        even when the fetch raises.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            ValueError: if the item name contains no ``:``.
            Exception: if the item type is neither ``sketches`` nor
                ``tests``. Errors raised by the HTTP fetch propagate.
        """
        wget_args = [
            WGET_LUA, '-U', USER_AGENT, '-nv', '--lua-script',
            'sketch-static.lua', '-o',
            ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate', '--output-document',
            ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output',
            '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf',
            '--no-parent', '--page-requisites', '--timeout', '30', '--tries',
            'inf', '--domains',
            'sketch.sonymobile.com,sketch-cloud-storage.s3.amazonaws.com',
            '--span-hosts', '--waitretry', '30', '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team', '--warc-header',
            'sketch-dld-script-version: ' + VERSION, '--warc-header',
            ItemInterpolation('sketches-created-on: %(item_value)s')
        ]

        item_name = item['item_name']
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        # Reject unknown types before opening any network resources.
        if item_type not in ('sketches', 'tests'):
            raise Exception('Unknown item')

        http_client = httpclient.HTTPClient()
        try:
            r = http_client.fetch(
                'https://raw.githubusercontent.com/marked/sketch-items/master/'
                + item_type + "/" + item_value,
                method='GET')
        finally:
            # Always release the client, including when fetch() raises.
            http_client.close()

        for s in r.body.decode('utf-8', 'ignore').splitlines():
            s = s.strip()
            if not s:
                continue
            wget_args.append(
                'https://storage.sketch.sonymobile.com/feed/{}/image'.
                format(s))

        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 24
0
    def realize(self, item):
        """Assemble the wget arguments for a ``tumblr-blog`` item (cookie variant).

        A fixed ``Cookie`` request header is sent with every request. The
        item name has the form ``tumblr-blog:a:b:...``; the value is split
        on ``:`` and every part is treated as a blog name. For each blog a
        WARC header, the blog root URL and its ``sitemap.xml`` URL are
        queued.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            AssertionError: if the item name has no ``:`` (when assertions
                are enabled).
            Exception: if the item type is not ``tumblr-blog``.
        """
        # NOTE(review): hard-coded session cookie values live in source here.
        cookie_header = 'Cookie: rxx=5xsz8gpps7w.1cxbbha8&v=1; _ga=GA1.2.67023728.1544887148; _gid=GA1.2.55111282.1544887148; __utma=189990958.67023728.1544887148.1544887148.1544887148.1; __utmb=189990958.0.10.1544887148; __utmc=189990958; __utmz=189990958.1544887148.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); language=%2Cen_US; logged_in=1; pfx=025a404c8fe0a59b8c35f218ec03d27834e8b87ef79b414c0872edf4ff0e146c%230%234099695469; pfg=253dab60b55590b025d9ae175a9442d5895d72b2f7aeb68188ca352cc075242c%23%7B%22eu_resident%22%3A1%2C%22gdpr_is_acceptable_age%22%3A1%2C%22gdpr_consent_core%22%3A1%2C%22gdpr_consent_first_party_ads%22%3A1%2C%22gdpr_consent_third_party_ads%22%3A1%2C%22gdpr_consent_search_history%22%3A1%2C%22exp%22%3A1576423244%2C%22vc%22%3A%22%22%7D%230120119809; tmgioct=5c151bcc4067d10993384260'

        wget_args = [
            WGET_LUA, '-U', USER_AGENT, '-nv',
            '--header', cookie_header,
            '--lua-script', 'tumblr.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf', '--domains', 'tumblr.com',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'tumblr-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('tumblr-blog: %(item_name)s'),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        item_type, item_value = full_name.split(':', 1)
        item['item_type'] = item_type
        item['item_value'] = item_value

        if item_type != 'tumblr-blog':
            raise Exception('Unknown item')

        for blog in item_value.split(':'):
            blog_root = 'http://{}.tumblr.com/'.format(blog)
            wget_args += [
                '--warc-header', 'tumblr-blog: ' + blog,
                blog_root,
                blog_root + 'sitemap.xml',
            ]

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 25
0
    def realize(self, item):
        """Assemble the wget arguments for one justin.tv item.

        Two URLs are queued: ``http://justin.tv/<item_name>`` and the value
        stored in ``item['video_url']``.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            KeyError: if ``item`` lacks ``item_name`` or ``video_url``
                (assuming ``item`` behaves like a dict).
        """
        wget_args = [
            WGET_LUA, "-U", USER_AGENT, "-nv",
            "--lua-script", "justintv.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--no-cookies",
            "--rotate-dns",
            # "--recursive", "--level=inf",
            "--page-requisites",
            "--timeout", "60",
            "--tries", "inf",
            "--span-hosts",
            "--waitretry", "3600",
            # "--domains", "canv.as,canvasugc.com",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "justintv-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("justintv-user: %(item_name)s"),
        ]

        # Seed URLs: the channel page first, then the item's video URL.
        wget_args += [
            'http://justin.tv/{0}'.format(item['item_name']),
            item['video_url'],
        ]

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 26
0
    def realize(self, item):
        """Template ``realize`` with project placeholders still to be filled in.

        The spots marked ``#Insert project`` (Lua script name, ``--domains``
        value, WARC header prefixes) and ``#Insert project item code`` are
        left blank. As written, the item name is split into ``type:value``
        and stored on ``item``; an HTTP client is opened and closed without
        being used, and no seed URLs are queued.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            ValueError: if the item name contains no ``:``.
        """
        wget_args = [
            WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies',
            #Insert project
            '--lua-script', 'project.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            #Insert project
            '--domains', '',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            #Insert project
            '--warc-header', '-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('-item: %(item_name)s'),
        ]

        item_type, item_value = item['item_name'].split(':', 1)
        item['item_type'] = item_type
        item['item_value'] = item_value

        http_client = httpclient.HTTPClient()

        #Insert project item code

        http_client.close()

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 27
0
    def realize(self, item):
        """Assemble the wget arguments for one canv.as item.

        The item name has the form ``type:data``. ``drawing`` items queue
        ``http://canv.as/p/<data>`` and ``profile`` items queue
        ``http://canv.as/user/<data>``.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            ValueError: if the item name contains no ``:``.
            Exception: if the item type is neither ``drawing`` nor
                ``profile``.
        """
        wget_args = [
            WGET_LUA, "-U", USER_AGENT, "-nv",
            "--lua-script", "canvas.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output", "-e", "robots=off", "--no-cookies",
            "--rotate-dns", "--recursive", "--level=inf", "--page-requisites",
            "--timeout", "60", "--tries", "inf", "--span-hosts",
            "--waitretry", "3600", "--domains", "canv.as,canvasugc.com",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "canvas-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("canvas-user: %(item_name)s"),
        ]

        item_type, item_data = item['item_name'].split(':', 1)
        item['item_type'] = item_type
        item['item_data'] = item_data

        # URL template per supported item type.
        url_templates = {
            'drawing': 'http://canv.as/p/%s',
            'profile': 'http://canv.as/user/%s',
        }
        if item_type not in url_templates:
            raise Exception('Unknown item_type')
        wget_args.append(url_templates[item_type] % item_data)

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 28
0
    def realize(self, item):
        """Assemble the wget arguments for a JamiiForums ``threads`` item.

        The item name has the form ``threads:<start>-<end>``. For every
        thread id in the inclusive range a WARC header and the thread URL
        ``https://www.jamiiforums.com/threads/x.<id>/`` are queued.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            AssertionError: if the item name has no ``:`` (when assertions
                are enabled).
            ValueError: if the value is not exactly two integers separated
                by ``-``.
            Exception: if the item type is not ``threads``.
        """
        wget_args = [
            WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies',
            '--lua-script', 'jamiiforums.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output', '-e', 'robots=off', '--rotate-dns',
            '--recursive', '--level=inf', '--no-parent', '--page-requisites',
            '--timeout', '30', '--tries', 'inf',
            '--domains', 'jamiiforums.com',
            '--span-hosts', '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'jamiiforums-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('jamiiforums-item: %(item_name)s'),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        item_type, item_value = full_name.split(':', 1)
        item['item_type'] = item_type
        item['item_value'] = item_value

        if item_type != 'threads':
            raise Exception('Unknown item')

        first_id, last_id = map(int, item_value.split('-'))
        for thread_id in range(first_id, last_id + 1):
            wget_args += [
                '--warc-header',
                'jamiiforums-thread-id: {}'.format(thread_id),
                'https://www.jamiiforums.com/threads/x.{}/'.format(thread_id),
            ]

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 29
0
    def realize(self, item):
        """Assemble the wget arguments for one swipnet.se item.

        The item name has the form ``type:value``; both parts must be
        non-empty. The single seed URL is
        ``http://<type>.swipnet.se/<value>/``.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            AssertionError: if the item name has no ``:`` or either part is
                empty (when assertions are enabled).
        """
        wget_args = [
            WGET_LUA, "-U", USER_AGENT, "-nv",
            "--lua-script", "swipnet.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output", "-e", "robots=off", "--no-cookies",
            "--rotate-dns", "--recursive", "--level=inf", "--no-parent",
            "--page-requisites", "--timeout", "30", "--tries", "inf",
            "--span-hosts", "--waitretry", "30", "--domains", "swipnet.se",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "swipnet-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("swipnet-user: %(item_name)s"),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        item_type, item_value = full_name.split(':', 1)
        assert item_type
        assert item_value

        item['item_type'] = item_type
        item['item_value'] = item_value

        # The item type is used as the subdomain, the value as the path.
        seed_url = 'http://{0}.swipnet.se/{1}/'.format(item_type, item_value)
        wget_args.append(seed_url)

        # wget_args.append('http://home.swipnet.se/{0}/'.format(item_name))

        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Ejemplo n.º 30
0
 def realize(self, item):
     """Build the wget argument list for one internetcentrum ``site`` item.

     The item name has the form ``site:<url>``; the value is queued as the
     only seed URL.

     Returns:
         The result of the module-level ``realize(wget_args, item)``.

     Raises:
         AssertionError: if the item name has no ``:`` or the item type is
             not ``site`` (when assertions are enabled).
         Exception: if the item type is not ``site`` and assertions are
             disabled.
     """
     wget_args = [
         WGET_LUA,
         "-U", USER_AGENT,
         "-nv",
         #"--lua-script", "cobook.lua",
         "-o", ItemInterpolation("%(item_dir)s/wget.log"),
         "--no-check-certificate",
         "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
         "--truncate-output",
         "-e", "robots=off",
         "--rotate-dns",
         "--recursive", "--level=inf",
         "--no-parent",
         "--page-requisites",
         "--timeout", "30",
         "--tries", "inf",
         #"--domains", "cobook.co",
         "--header", "Cookie: iccmtspmvrfy=ano",
         "--span-hosts",
         "--waitretry", "30",
         "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
         "--warc-header", "operator: Archive Team",
         "--warc-header", "internetcentrum-dld-script-version: " + VERSION,
         "--warc-header", ItemInterpolation("internetcentrum-user: %(item_name)s"),
     ]

     item_name = item['item_name']
     assert ':' in item_name
     item_type, item_value = item_name.split(':', 1)

     item['item_type'] = item_type
     item['item_value'] = item_value

     # A one-element tuple: ('site') is just the string 'site', which would
     # turn this into a substring test and let 'si', 't' or '' through.
     assert item_type in ('site',)

     if item_type == 'site':
         wget_args.append('{0}'.format(item_value))
     else:
         # Still reachable when assertions are stripped (python -O).
         raise Exception('Unknown item')

     if 'bind_address' in globals():
         wget_args.extend(['--bind-address', globals()['bind_address']])
         print('')
         print('*** Wget will bind address at {0} ***'.format(
             globals()['bind_address']))
         print('')

     return realize(wget_args, item)