def data(self, item):
    """Return the payload dict describing this request for ``item``.

    The dict always holds the realized ``downloader`` and the constant
    ``api_version`` value ``"2"``.  A realized ``version`` entry is added
    only when ``self.version`` is truthy.
    """
    payload = {}
    payload["downloader"] = realize(self.downloader, item)
    payload["api_version"] = "2"
    if not self.version:
        return payload
    payload["version"] = realize(self.version, item)
    return payload
def data(self, item):
    """Return the payload dict describing this request for ``item``.

    The dict always holds the realized ``downloader`` and the item's
    ``item_name``.  A realized ``version`` entry is added only when
    ``self.version`` is truthy.
    """
    payload = {}
    payload["downloader"] = realize(self.downloader, item)
    payload["item_name"] = item["item_name"]
    if not self.version:
        return payload
    payload["version"] = realize(self.version, item)
    return payload
def stdin_data(self, item):
    """Return the newline-terminated list of files as UTF-8 bytes.

    Every entry of the realized ``self.files`` is itself realized and then
    expressed relative to the realized ``self.target_source_path``.  Each
    relative path is followed by ``"\\n"``; the joined text is encoded as
    UTF-8.
    """
    lines = []
    for entry in realize(self.files, item):
        relative = os.path.relpath(
            realize(entry, item),
            realize(self.target_source_path, item),
        )
        lines.append("%s\n" % relative)
    return "".join(lines).encode('utf-8')
def start_warrior_server(warrior, bind_address="localhost", port_number=8001,
                         http_username=None, http_password=None):
    '''Starts the warrior web interface.

    Subscribes the ``SeesawConnection`` handlers to the warrior and runner
    events, starts the two periodic broadcast callbacks (callback time
    1000), builds the web application and makes it listen on
    ``bind_address``:``port_number``.

    ``http_username`` / ``http_password`` fall back to the values stored on
    ``warrior`` when the arguments are falsy.  Authentication is reported as
    enabled only while the realized password is non-blank.
    '''
    conn = SeesawConnection
    conn.warrior = warrior

    # Warrior-level events.
    warrior.on_projects_loaded += conn.handle_projects_loaded
    warrior.on_project_refresh += conn.handle_project_refresh
    warrior.on_project_installing += conn.handle_project_installing
    warrior.on_project_installed += conn.handle_project_installed
    warrior.on_project_installation_failed += \
        conn.handle_project_installation_failed
    warrior.on_project_selected += conn.handle_project_selected
    warrior.on_broadcast_message_received += conn.handle_broadcast_message
    warrior.on_status += conn.handle_warrior_status

    # Runner-level events.
    warrior.runner.on_pipeline_start_item += conn.handle_start_item
    warrior.runner.on_pipeline_finish_item += conn.handle_finish_item
    warrior.runner.on_status += conn.handle_runner_status

    http_username = http_username or warrior.http_username
    http_password = http_password or warrior.http_password

    ioloop.PeriodicCallback(conn.broadcast_bandwidth, 1000).start()
    ioloop.PeriodicCallback(conn.broadcast_timestamp, 1000).start()

    def auth_enabled():
        # A blank (or missing) password switches authentication off.
        return (realize(http_password) or "").strip() != ""

    def check_auth(r, username, password):
        if not (password == realize(http_password)):
            return False
        # A blank configured username accepts any supplied username.
        return (realize(http_username) or "").strip() in ["", username]

    router = SockJSRouter(conn)
    routes = router.apply_routes([
        (r"/(.*\.(html|css|js|swf|png|ico))$",
         web.StaticFileHandler, {"path": PUBLIC_PATH}),
        ("/", IndexHandler),
        ("/api/(.+)$", ApiHandler, {"warrior": warrior}),
    ])

    application = web.Application(
        routes,
        # flash_policy_port = 843,
        # flash_policy_file = os.path.join(PUBLIC_PATH, "flashpolicy.xml"),
        socket_io_address=bind_address,
        socket_io_port=port_number,
        # settings for AuthenticatedApplication
        auth_enabled=auth_enabled,
        check_auth=check_auth,
        auth_realm="ArchiveTeam Warrior",
        skip_auth=[],
    )
    application.listen(port_number, bind_address)
def start_warrior_server(warrior, bind_address="localhost", port_number=8001,
                         http_username=None, http_password=None):
    '''Starts the warrior web interface.

    Wires ``SeesawConnection`` to the warrior/runner events, schedules the
    bandwidth and timestamp broadcasts, then creates the web application
    and starts listening on ``bind_address``:``port_number``.

    Falsy ``http_username`` / ``http_password`` arguments are replaced by
    the corresponding attributes of ``warrior``.
    '''
    SeesawConnection.warrior = warrior

    runner = warrior.runner

    warrior.on_projects_loaded += SeesawConnection.handle_projects_loaded
    warrior.on_project_refresh += SeesawConnection.handle_project_refresh
    warrior.on_project_installing += SeesawConnection.handle_project_installing
    warrior.on_project_installed += SeesawConnection.handle_project_installed
    warrior.on_project_installation_failed += \
        SeesawConnection.handle_project_installation_failed
    warrior.on_project_selected += SeesawConnection.handle_project_selected
    warrior.on_broadcast_message_received += \
        SeesawConnection.handle_broadcast_message
    warrior.on_status += SeesawConnection.handle_warrior_status
    runner.on_pipeline_start_item += SeesawConnection.handle_start_item
    runner.on_pipeline_finish_item += SeesawConnection.handle_finish_item
    runner.on_status += SeesawConnection.handle_runner_status

    if not http_username:
        http_username = warrior.http_username
    if not http_password:
        http_password = warrior.http_password

    for broadcast in (SeesawConnection.broadcast_bandwidth,
                      SeesawConnection.broadcast_timestamp):
        ioloop.PeriodicCallback(broadcast, 1000).start()

    router = SockJSRouter(SeesawConnection)
    static_route = (
        r"/(.*\.(html|css|js|swf|png|ico))$",
        web.StaticFileHandler,
        {"path": PUBLIC_PATH},
    )
    index_route = ("/", IndexHandler)
    api_route = ("/api/(.+)$", ApiHandler, {"warrior": warrior})
    routes = router.apply_routes([static_route, index_route, api_route])

    settings = dict(
        # flash_policy_port = 843,
        # flash_policy_file = os.path.join(PUBLIC_PATH, "flashpolicy.xml"),
        socket_io_address=bind_address,
        socket_io_port=port_number,
        # settings for AuthenticatedApplication
        # Authentication is on only while the realized password is non-blank.
        auth_enabled=lambda: (realize(http_password) or "").strip() != "",
        # The password must match; a blank configured username matches any.
        check_auth=lambda r, username, password: (
            password == realize(http_password) and
            (realize(http_username) or "").strip() in ["", username]
        ),
        auth_realm="ArchiveTeam Warrior",
        skip_auth=[],
    )
    application = web.Application(routes, **settings)
    application.listen(port_number, bind_address)
def process(self, item):
    """Compute the statistics record for ``item`` and store it on the item.

    For every ``(group, files)`` pair in ``self.file_groups`` the sizes of
    the realized files are summed.  The record starts from
    ``self.defaults`` and gains ``"item"`` (the item name), ``"bytes"``
    (the per-group totals) and, when ``self.id_function`` is set, ``"id"``.
    The realized record is written to ``item["stats"]``.

    Raises whatever ``os.path.getsize`` raises for a missing file.
    """
    total_bytes = {}
    # dict.items() exists on both Python 2 and Python 3, whereas
    # iteritems() raises AttributeError on Python 3.
    for group, files in self.file_groups.items():
        total_bytes[group] = sum(
            os.path.getsize(f) for f in realize(files, item))

    stats = {}
    stats.update(self.defaults)
    stats["item"] = item["item_name"]
    stats["bytes"] = total_bytes
    if self.id_function:
        stats["id"] = self.id_function(item)

    item["stats"] = realize(stats, item)
def process(self, item):
    """Start the external process for ``item`` and feed it its stdin data.

    Runs inside ``self.task_cwd()``.  The process is created from the
    realized ``self.args`` / ``self.env`` with a piped stdin, its output
    and end events are routed to ``on_subprocess_stdout`` and
    ``on_subprocess_end``, and after ``run()`` the bytes returned by
    ``self.stdin_data(item)`` are written to stdin, which is then closed.
    """
    with self.task_cwd():
        popen_kwargs = dict(
            args=realize(self.args, item),
            env=realize(self.env, item),
            stdin=subprocess.PIPE,
            close_fds=True,
        )
        proc = AsyncPopen(**popen_kwargs)

        stdout_handler = functools.partial(
            self.on_subprocess_stdout, proc, item)
        end_handler = functools.partial(self.on_subprocess_end, item)
        proc.on_output += stdout_handler
        proc.on_end += end_handler

        proc.run()

        payload = self.stdin_data(item)
        proc.stdin.write(payload)
        proc.stdin.close()
def process_body(self, body, item):
    """Handle the tracker's JSON reply and start the matching upload task.

    ``body`` is parsed as JSON.  Without an ``"upload_target"`` key the
    item is scheduled for retry.  An ``rsync://`` target starts an
    ``RsyncUpload``; an ``http://`` / ``https://`` target starts a
    ``CurlUpload``, which requires exactly one realized file.  Any other
    target, or a wrong file count for Curl, fails the item.
    """
    data = json.loads(body)

    if "upload_target" not in data:
        item.log_output("Tracker did not provide an upload target.")
        self.schedule_retry(item)
        return

    files = realize(self.files, item)
    target = data["upload_target"]

    if re.match(r"^rsync://", target):
        item.log_output("Uploading with Rsync to %s" % target)
        inner_task = RsyncUpload(
            target, files,
            target_source_path=self.rsync_target_source_path,
            bwlimit=self.rsync_bwlimit,
            extra_args=self.rsync_extra_args,
            max_tries=1)
    elif re.match(r"^https?://", target):
        item.log_output("Uploading with Curl to %s" % target)
        if len(files) != 1:
            item.log_output("Curl expects to upload a single file.")
            self.fail_item(item)
            return
        inner_task = CurlUpload(
            target, files[0],
            self.curl_connect_timeout,
            self.curl_speed_limit,
            self.curl_speed_time,
            max_tries=1)
    else:
        item.log_output("Received invalid upload type.")
        self.fail_item(item)
        return

    # Forward the inner task's outcome to this task's own handlers.
    inner_task.on_complete_item += self._inner_task_complete_item
    inner_task.on_fail_item += self._inner_task_fail_item
    inner_task.enqueue(item)
def realize(self, item):
    """Build the realized wget-at command line for a mercurial item.

    The item name is used unchanged as the item value (stored in
    ``item['item_value']``) and as the base of the seed URL
    ``<item_value>?cmd=capabilities``.  When a module-level
    ``bind_address`` exists, ``--bind-address`` is appended and a notice
    is printed.
    """
    command = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--content-on-error',
        '--lua-script', 'mercurial.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file',
        ItemInterpolation('%(item_dir)s/%(warc_file_base)s-main'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'mercurial-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('mercurial-item: %(item_name)s'),
        '--warc-dedup-url-agnostic'
    ]

    repository = item['item_name']
    item['item_value'] = repository

    command += ['--warc-header', 'mercurial-repository: ' + str(repository)]
    command += ['--warc-header', 'warc-type: main']
    command.append(repository + '?cmd=capabilities')

    if 'bind_address' in globals():
        bound = globals()['bind_address']
        command += ['--bind-address', bound]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bound))
        print('')

    return realize(command, item)
def process(self, item):
    """Launch the subprocess for ``item`` and send it the stdin payload.

    Everything happens inside ``self.task_cwd()``: the ``AsyncPopen`` is
    built from the realized arguments and environment, the stdout and end
    callbacks are bound to this item, the process is started, and the
    result of ``self.stdin_data(item)`` is written to its stdin before
    stdin is closed.
    """
    with self.task_cwd():
        command = realize(self.args, item)
        environment = realize(self.env, item)
        child = AsyncPopen(args=command, env=environment,
                           stdin=subprocess.PIPE, close_fds=True)

        child.on_output += functools.partial(
            self.on_subprocess_stdout, child, item)
        child.on_end += functools.partial(self.on_subprocess_end, item)

        child.run()

        child_stdin = child.stdin
        child_stdin.write(self.stdin_data(item))
        child_stdin.close()
def realize(self, item):
    """Build the realized wget-at command line for a domain item.

    The item name is the domain.  It is passed to ``--domains``, recorded
    in a ``domain:`` WARC header and seeded as ``http://`` and
    ``https://`` root URLs.  A name containing exactly one dot is also
    seeded with a ``www.`` prefix.  When a module-level ``bind_address``
    exists, ``--bind-address`` is appended and a notice is printed.
    """
    command = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--lua-script', 'domains.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'domains-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('domains-item: %(item_name)s'),
        '--warc-dedup-url-agnostic',
    ]

    domain = item['item_name']
    command += ['--domains', domain]
    command += ['--warc-header', 'domain: ' + domain]

    seed_templates = ['http://{}/', 'https://{}/']
    if domain.count('.') == 1:
        seed_templates += ['http://www.{}/', 'https://www.{}/']
    for template in seed_templates:
        command.append(template.format(domain))

    if 'bind_address' in globals():
        bound = globals()['bind_address']
        command += ['--bind-address', bound]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bound))
        print('')

    return realize(command, item)
def realize(self, item):
    """Build the realized wget-lua command line for a furaffinity item.

    The item name has the form ``<type>:<value>`` where ``<type>`` is
    ``image`` or ``imagelogin``.  For both types one view URL is seeded for
    every two-character suffix drawn from digits and lowercase ASCII
    letters.  ``image`` items then get ``--no-cookies``; ``imagelogin``
    items get ``--load-cookies cookies.txt``.

    Side effects: stores ``item_type`` and ``item_value`` on ``item`` and,
    when a module-level ``bind_address`` exists, prints a notice.

    Raises:
        AssertionError: the name has no ``:`` or the type is unsupported.
        Exception: unsupported type when assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "furaffinity.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "furaffinity.net",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "furaffinity-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("furaffinity-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('image', 'imagelogin')

    # The original had a missing colon on the 'imagelogin' branch (a
    # SyntaxError) and duplicated the URL generation in both branches; only
    # the cookie handling actually differs between the two item types.
    if item_type == 'image':
        cookie_args = ["--no-cookies"]
    elif item_type == 'imagelogin':
        cookie_args = ["--load-cookies", "cookies.txt"]
    else:
        raise Exception('Unknown item')

    # ascii_lowercase is available on Python 2 and 3 and is not
    # locale-dependent, unlike the Python-2-only string.lowercase.
    suffixes = string.digits + string.ascii_lowercase
    wget_args.extend(
        'http://www.furaffinity.net/view/{0}{1}{2}/'.format(item_value, a, b)
        for a in suffixes
        for b in suffixes
    )
    # Cookie options follow the URLs, as in the original argument order.
    wget_args.extend(cookie_args)

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the realized wget-lua command line for a tumblr-static item.

    The item name has the form ``<type>:<value>``; only ``list`` is
    supported.  The URL list is downloaded with ``requests`` and each
    stripped line is considered.  A line containing ``%20`` is expanded
    into its ``%20``-separated pieces plus the line with every ``%20``
    removed.  A candidate is kept when it is non-empty, matches
    ``^https?://[^/]+/`` and does not contain ``www.tumblr.com``.  Each
    kept URL gets a ``tumblr-static-url`` WARC header and is seeded.

    Raises:
        AssertionError: the item name has no ``:``.
        Exception: the list request did not return 200, or the item type
            is not ``list``.
    """
    command = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--lua-script', 'tumblr-static.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'tumblr-static-dld-script-version: ' + VERSION,
        '--warc-header',
        ItemInterpolation('tumblr-static-item: %(item_name)s'),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind != 'list':
        raise Exception('Unknown item')

    response = requests.get(
        'http://grafana.fvz.io/.well-known/at/{}'.format(value))
    if response.status_code != 200:
        raise Exception('Could not get URLs list from github.')

    for raw_line in response.text.splitlines():
        line = raw_line.strip()
        # split() yields more than one piece exactly when '%20' occurs.
        candidates = line.split('%20')
        if len(candidates) > 1:
            candidates.append(line.replace('%20', ''))
        for candidate in candidates:
            usable = (
                len(candidate) != 0
                and re.search(r'^https?://[^/]+/', candidate)
                and 'www.tumblr.com' not in candidate
            )
            if usable:
                command += ['--warc-header',
                            'tumblr-static-url: {}'.format(candidate)]
                command.append(candidate)

    if 'bind_address' in globals():
        bound = globals()['bind_address']
        command += ['--bind-address', bound]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bound))
        print('')

    return realize(command, item)
def realize(self, item):
    """Build the realized command line that runs ``yahoo.py`` for an item.

    The item name has the form ``<type>:<value>``:

    * ``group`` - the value is appended as the group to fetch.
    * ``group_cookie`` - a cookie JSON document is fetched for the value
      and its ``cookie_Y`` / ``cookie_T`` entries are passed as ``-cy`` /
      ``-ct`` before the value itself.

    Side effects: stores ``item_type`` and ``item_value`` on ``item``.

    Raises:
        AssertionError: the item name has no ``:``.
        ValueError: the cookie request returned a status other than 200.
        Exception: the item type is not supported.
    """
    yga_args = [PYTHON, '../../../yahoo.py', '-a', '-t', '-w']

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    if item_type == 'group':
        yga_args.append(item_value)
    elif item_type == 'group_cookie':
        # The client is created only where it is needed and is always
        # closed, including on error paths; the original never closed it.
        http_client = httpclient.HTTPClient()
        try:
            cookie_json = http_client.fetch(
                'https://df58.host.cs.st-andrews.ac.uk/yahoogroups/cookieget/'
                + item_value + '/',
                method='GET')
            if cookie_json.code != 200:
                raise ValueError('Got bad status code {}.'.format(
                    cookie_json.code))
            cookies = json.loads(cookie_json.body.decode('utf-8', 'ignore'))
        finally:
            http_client.close()
        yga_args.extend(['-cy', "%s" % cookies["cookie_Y"]])
        yga_args.extend(['-ct', "%s" % cookies["cookie_T"]])
        yga_args.append(item_value)
    else:
        raise Exception('Unknown item')

    return realize(yga_args, item)
def realize(self, item):
    """Build the realized wget-lua command line for a rutracker item.

    The item name has the form ``<type>:<value>`` with type ``thread`` or
    ``forum``.  For every decimal digit appended to the value, four URLs
    are seeded: the forum page plus three API endpoints matching the item
    type.

    Side effects: stores ``item_type`` and ``item_value`` on ``item`` and,
    when a module-level ``bind_address`` exists, prints a notice.

    Raises:
        AssertionError: the name has no ``:`` or the type is unsupported.
        Exception: unsupported type when assertions are disabled.
    """
    command = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "rutracker.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "rutracker.org",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "rutracker-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("rutracker-user: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    assert kind in ('thread', 'forum')

    if kind == 'thread':
        templates = (
            'http://rutracker.org/forum/viewtopic.php?t={0}{1}',
            'http://api.rutracker.org/v1/get_peer_stats?by=topic_id&val={0}{1}',
            'http://api.rutracker.org/v1/get_tor_hash?by=topic_id&val={0}{1}',
            'http://api.rutracker.org/v1/get_tor_topic_data?by=topic_id&val={0}{1}',
        )
    elif kind == 'forum':
        templates = (
            'http://rutracker.org/forum/viewforum.php?f={0}{1}',
            'http://api.rutracker.org/v1/get_forum_name?by=forum_id&val={0}{1}',
            'http://api.rutracker.org/v1/get_forum_data?by=forum_id&val={0}{1}',
            'http://api.rutracker.org/v1/static/pvc/f/{0}{1}',
        )
    else:
        raise Exception('Unknown item')

    # Outer loop over digits keeps the four URLs of one id adjacent.
    for digit in string.digits:
        for template in templates:
            command.append(template.format(value, digit))

    if 'bind_address' in globals():
        bound = globals()['bind_address']
        command += ['--bind-address', bound]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bound))
        print('')

    return realize(command, item)
def realize(self, item):
    """Build the realized wget-at command line for a parler multi-item.

    The zstd dictionary returned by ``ZstdDict.get_dict()`` is written to
    ``<item_dir>/zstdict`` and passed via ``--warc-zstd-dict``; its id and
    ``TRACKER_ID`` are stored on ``item``.  ``item['item_name']`` holds one
    or more ``<type>:<value>`` names separated by NUL characters.  Each
    name is recorded in a WARC header, seeded as an ``item-name://`` URL
    and then expanded according to its type (``post``, ``profile`` or
    ``url``).  ``item['item_name_newline']`` receives the names joined by
    newlines instead of NULs.

    Raises:
        ValueError: a name has no ``:`` or an unsupported type.
    """
    command = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--lua-script', 'parler.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'parler.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'x-wget-at-project-version: ' + VERSION,
        '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
        '--warc-dedup-url-agnostic',
        '--warc-compression-use-zstd',
        '--warc-zstd-dict-no-include'
    ]

    zstd = ZstdDict.get_dict()
    dict_path = os.path.join(item['item_dir'], 'zstdict')
    with open(dict_path, 'wb') as handle:
        handle.write(zstd['dict'])
    item['dict_id'] = zstd['id']
    item['dict_project'] = TRACKER_ID
    command += [
        '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),
    ]

    # NOTE(review): 'profile' items are labelled with a 'parler-post'
    # header, exactly as in the original - confirm whether that is intended.
    page_templates = {
        'post': ('parler-post: {}', 'https://parler.com/post/{}'),
        'profile': ('parler-post: {}', 'https://parler.com/profile/{}'),
    }

    for name in item['item_name'].split('\0'):
        command += ['--warc-header', 'x-wget-at-project-item-name: ' + name]
        command.append('item-name://' + name)
        kind, value = name.split(':', 1)
        if kind in page_templates:
            header_template, url_template = page_templates[kind]
            command += ['--warc-header', header_template.format(value)]
            command.append(url_template.format(value))
        elif kind == 'url':
            command.append(value)
        else:
            raise ValueError('item_type not supported.')

    item['item_name_newline'] = item['item_name'].replace('\0', '\n')

    if 'bind_address' in globals():
        bound = globals()['bind_address']
        command += ['--bind-address', bound]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bound))
        print('')

    return realize(command, item)
def realize(self, item):
    """Build the realized wget-lua command line for a firefox-addons item.

    The item name has the form ``<type>:<value>``; only ``ffaddon`` is
    supported.  The add-on identifier is recorded in a WARC header and
    three URLs are seeded: the en-US add-on page and the v3 and v4 API
    entries.

    Side effects: stores ``item_type`` and ``item_value`` on ``item`` and,
    when a module-level ``bind_address`` exists, prints a notice.

    Raises:
        AssertionError: the item name has no ``:``.
        Exception: the item type is not ``ffaddon``.
    """
    command = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--lua-script', 'firefox-addons.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'mozilla.org',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'firefox-addons-dld-script-version: ' + VERSION,
        '--warc-header',
        ItemInterpolation('firefox-addons-item: %(item_name)s'),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, addon = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = addon

    if kind != 'ffaddon':
        raise Exception('Unknown item')

    command += ['--warc-header',
                'firefox-addon-identifier: {}'.format(addon)]
    seed_templates = (
        'https://addons.mozilla.org/en-US/firefox/addon/{}/',
        'https://services.addons.mozilla.org/api/v3/addons/addon/{}/',
        'https://services.addons.mozilla.org/api/v4/addons/addon/{}/',
    )
    for template in seed_templates:
        command.append(template.format(addon))
    # Disabled seed URLs, left commented out as in the original:
    #wget_args.append('https://addons.mozilla.org/en/firefox/addon/{}/'.format(item_value))
    #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=homepage-collection-featured'.format(item_value))
    #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=featured'.format(item_value))
    #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=hp-dl-promo'.format(item_value))
    #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=collection'.format(item_value))
    #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=hotness'.format(item_value))
    #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=rating'.format(item_value))
    #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/?src=recommended_fallback'.format(item_value))
    #wget_args.append('https://addons.mozilla.org/firefox/addon/{}/'.format(item_value))

    if 'bind_address' in globals():
        bound = globals()['bind_address']
        command += ['--bind-address', bound]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bound))
        print('')

    return realize(command, item)
def realize(self, item):
    """Build the realized wpull command line for a newsgrabber item.

    The item name has the form ``<type>:<value>``.  When the value
    contains ``-videos`` the youtube-dl options are added.  The URL list
    named by the value is downloaded with ``requests``; if the response
    status is 200, every stripped line is appended as a URL, otherwise no
    URLs are added.

    Side effects: stores ``item_type`` and ``item_value`` on ``item`` and,
    when a module-level ``bind_address`` exists, prints a notice.

    Raises:
        ValueError: the item name has no ``:`` (tuple unpacking fails).
    """
    kind, value = item['item_name'].split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    command = [
        WPULL_EXE,
        '-nv',
        '-U', 'ArchiveTeam; Googlebot/2.1',
        '--no-check-certificate',
        '--no-robots',
        '--dns-timeout', '20',
        '--connect-timeout', '20',
        '--read-timeout', '900',
        '--session-timeout', '1800',
        '--tries', '5',
        '--waitretry', '5',
        '--max-redirect', '20',
        '--output-file', ItemInterpolation("%(item_dir)s/wpull.log"),
        '--database', ItemInterpolation("%(item_dir)s/wpull.db"),
        '--delete-after',
        '--page-requisites',
        '--no-parent',
        '--concurrent', '5',
        '--warc-file', ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        '--level', '0',
        '--page-requisites-level', '5',
        '--span-hosts-allow', 'page-requisites',
        '--warc-header', 'pipeline-py-sha256: ' + PIPELINE_SHA256,
        '--warc-header',
        'warrior-install-sh-sha256: ' + WARRIOR_INSTALL_SHA256,
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'newsgrabber-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('ftp-item: %(item_name)s'),
        '--reject-regex',
        r'^https?://launcher\.spot\.im/spot/(www\.spot\.im/launcher/|launcher\.spot\.im/|modules/launcher/){3,}bundle\.js$'
    ]

    if '-videos' in value:
        command += ['--youtube-dl', '--youtube-dl-exe', YOUTUBE_DL_EXE]

    list_url = 'http://master.newsbuddy.net/' + value
    listing = requests.get(list_url)
    #wpull_args.append(list_url)
    if listing.status_code == 200:
        command.extend(line.strip() for line in listing.text.splitlines())

    if 'bind_address' in globals():
        bound = globals()['bind_address']
        command += ['--bind-address', bound]
        print('')
        print('*** Wpull will bind address at {0} ***'.format(bound))
        print('')

    return realize(command, item)
def realize(self, item):
    """Build the realized wget-lua command line for a gamefront item.

    The item name has the form ``<type>:<value>`` with type ``file`` or
    ``singlefile``.  ``file`` seeds one URL per decimal digit appended to
    the value.  ``singlefile`` seeds the single file URL and additionally
    performs a short ``requests`` session against the site before wget
    runs.

    Side effects: stores ``item_type`` and ``item_value`` on ``item``,
    prints progress for ``singlefile`` items and, when a module-level
    ``bind_address`` exists, prints a notice.

    Raises AssertionError for a name without ``:`` or an unsupported type
    (Exception when assertions are disabled).
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "gamefront.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "gamefront.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "gamefront-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("gamefront-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('file', 'singlefile')

    if item_type == 'file':
        # One seed URL per trailing digit 0-9.
        suffixes = string.digits
        for suffix in suffixes:
            wget_args.append('http://www.gamefront.com/files/{0}{1}'.format(item_value, suffix))
    elif item_type == 'singlefile':
        wget_args.append('http://www.gamefront.com/files/{0}'.format(item_value))
        # Fetch the file page and look for a plopMe('<digits>', '<token>')
        # call in it.
        session1 = requests.Session()
        mainpage = session1.get('http://www.gamefront.com/files/' + item_value).text
        if re.search(r"plopMe\('[0-9]+',\s+'[^']+'\)", mainpage):
            # The second plopMe argument is the token posted back below.
            plopme = re.search(r"plopMe\('[0-9]+',\s+'([^']+)'\)", mainpage).group(1)
            print('Received token ' + plopme + '.')
            print('Received ' + session1.post('http://www.gamefront.com/files/service/request', data = {'token':plopme}, headers={'referer': 'http://www.gamefront.com/files/' + item_value}).text + '.')
            # NOTE(review): the nesting of this request is reconstructed
            # from a whitespace-collapsed source - confirm that it belongs
            # inside the token branch.
            session1.get('http://www.gamefront.com/files/service/thankyou?id=' + item_value, headers={'referer': 'http://www.gamefront.com/files/' + item_value})
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Return the realized command line that runs ``deduplicate.py``.

    The script is started with the ``-u`` interpreter flag and receives
    the item's ``<data_dir>/<item_name>.warc.gz`` path as its argument.
    """
    warc_path = ItemInterpolation("%(data_dir)s/%(item_name)s.warc.gz")
    return realize([PYTHON35_EXE, "-u", "deduplicate.py", warc_path], item)
def enqueue(self, item):
    """Start ``item`` on this task and begin processing it.

    Marks the item as started, logs a start message, sets the try counter
    to 1 and initialises the per-item download state (realized URL list,
    index 0, no current URL) before calling ``self.process``.
    """
    self.start_item(item)

    start_message = "Starting %s for %s\n" % (self, item.description())
    item.log_output(start_message)

    item["tries"] = 1

    # Download state kept on the item under the WgetDownloadMany prefix.
    urls = realize(self.unrealized_urls, item)
    item['WgetDownloadMany.urls'] = urls
    item['WgetDownloadMany.urls_index'] = 0
    item['WgetDownloadMany.current_url'] = None

    self.process(item)
def realize(self, item):
    """Build the realized wget-at command line for a pastebin item.

    The zstd dictionary returned by ``ZstdDict.get_dict()`` is written to
    ``<item_dir>/zstdict`` and passed via ``--warc-zstd-dict``; its id and
    ``TRACKER_ID`` are stored on ``item``.  An item name of at most eight
    characters is used as the paste id unchanged.  A longer name has
    ``b36.`` removed, is parsed as a base-36 integer and converted with
    ``self.int_to_str``.  The paste id is stored in ``item['item_value']``,
    recorded in a WARC header and seeded as a pastebin.com URL.
    """
    command = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--content-on-error',
        '--lua-script', 'pastebin.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'pastebin.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'pastebin-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('pastebin-item: %(item_name)s'),
        '--warc-dedup-url-agnostic',
        '--warc-compression-use-zstd',
        '--warc-zstd-dict-no-include'
    ]

    zstd = ZstdDict.get_dict()
    dict_path = os.path.join(item['item_dir'], 'zstdict')
    with open(dict_path, 'wb') as handle:
        handle.write(zstd['dict'])
    item['dict_id'] = zstd['id']
    item['dict_project'] = TRACKER_ID
    command += [
        '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),
    ]

    raw_name = item['item_name']
    if len(raw_name) > 8:
        paste_id = self.int_to_str(int(raw_name.replace('b36.', ''), 36))
    else:
        paste_id = raw_name
    item['item_value'] = paste_id

    command += ['--warc-header', 'pastebin-paste: ' + str(paste_id)]
    command.append('https://pastebin.com/{}'.format(paste_id))

    if 'bind_address' in globals():
        bound = globals()['bind_address']
        command += ['--bind-address', bound]
        print('')
        print('*** Wget will bind address at {0} ***'.format(bound))
        print('')

    return realize(command, item)
def realize(self, item):
    """Build the realized wget-lua command line for a musicbrainz item.

    The item name has three ``:``-separated parts; the second and third
    name an archive.org item and a file inside it.  That file is
    downloaded with ``requests`` and each of its lines is seeded as a URL.

    Side effects: stores the archive.org item in ``item["item_item"]``
    and, when a module-level ``bind_address`` exists, prints a notice.

    Raises:
        AssertionError: the item name has no ``:``.
        ValueError: the name has fewer than three parts.
        Exception: the list request did not return status 200.
    """
    command = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "musicbrainz.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "musicbrainz-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("musicbrainz-user: %(item_name)s"),
    ]

    full_name = item["item_name"]
    assert ":" in full_name
    # The first component is not used.
    _, archive_item, list_file = full_name.split(":", 2)
    item["item_item"] = archive_item

    listing = requests.get(
        "http://archive.org/download/{0}/{1}".format(archive_item, list_file))
    if listing.status_code != 200:
        reported_url = "https://archive.org/download/{0}/{1}".format(
            archive_item, list_file)
        raise Exception(
            "You received status code %d with URL %s"
            % (listing.status_code, reported_url))

    command.extend("{0}".format(line) for line in listing.text.splitlines())

    if "bind_address" in globals():
        bound = globals()["bind_address"]
        command += ["--bind-address", bound]
        print("")
        print("*** Wget will bind address at {0} ***".format(bound))
        print("")

    return realize(command, item)
def realize(self, item):
    """Build the realized wget-lua command line for a twitpic-api item.

    The user agent and Accept-Language header are picked at random from
    ``USER_AGENTS`` / ``ACCEPT_LANGUAGE_HEADERS``.  The item name has the
    form ``<type>:<value>``; only ``imageapi`` is supported.  For every
    one-character suffix drawn from digits and lowercase ASCII letters
    two API URLs are seeded: the media record and the first comments page.

    Side effects: stores ``item_type`` and ``item_value`` on ``item`` and,
    when a module-level ``bind_address`` exists, prints a notice.

    Raises:
        AssertionError: the name has no ``:`` or the type is unsupported.
        Exception: unsupported type when assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", random.choice(USER_AGENTS),
        "-nv",
        "--lua-script", "twitpic-api.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--no-cookies",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--span-hosts",
        "--waitretry", "30",
        "--domains", "twitpic.com,cloudfront.net,twimg.com,amazonaws.com",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "twitpic-api-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("twitpic-api-user: %(item_name)s"),
        "--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "--header", "DNT: 1",
        "--header", random.choice(ACCEPT_LANGUAGE_HEADERS),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # The previous assertion listed ('image', 'user', 'tag', 'event'), which
    # excluded 'imageapi' - the only type handled below - so every item
    # failed.  The assertion now matches the handled type.
    assert item_type in ('imageapi',)

    if item_type == 'imageapi':
        # ascii_lowercase is available on Python 2 and 3 and is not
        # locale-dependent, unlike the Python-2-only string.lowercase.
        suffixes = string.digits + string.ascii_lowercase
        for s in suffixes:
            wget_args.append(
                'http://api.twitpic.com/2/media/show.json?id={0}{1}'.format(item_value, s))
            wget_args.append(
                'http://api.twitpic.com/2/comments/show.json?media_id={0}{1}&page=1'.format(item_value, s))
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the realized wget-lua command line for a portalgraphics item.

    The item name has the form ``<type>:<value>`` with type ``image_id``
    or ``user_id``.  ``image_id`` seeds the illustration page, the movie
    data endpoint (each also with ``lang=ja`` and ``lang=en``) and two
    spellings of the movie address endpoint.  ``user_id`` seeds the
    profile page.

    Side effects: stores ``item_type`` and ``item_value`` on ``item`` and,
    when a module-level ``bind_address`` exists, prints a notice.

    Raises:
        AssertionError: the name has no ``:`` or the type is unsupported.
        Exception: unsupported type when assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--no-cookies",
        "--lua-script", "portalgraphics.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "portalgraphics.net",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "portalgraphics-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("portalgraphics-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    # maxsplit must be 1: with 2, a value that itself contains ':' yields
    # three parts and the two-name unpacking raises ValueError.
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('image_id', 'user_id')

    if item_type == 'image_id':
        image_templates = (
            'http://www.portalgraphics.net/pg/illust/?image_id={0}',
            'http://www.portalgraphics.net/pg/illust/?image_id={0}&lang=ja',
            'http://www.portalgraphics.net/pg/illust/?image_id={0}&lang=en',
            'http://www.portalgraphics.net/pg/movie/pg_player/res_movie_data.php?mid={0}',
            'http://www.portalgraphics.net/pg/movie/pg_player/res_movie_data.php?mid={0}&lang=ja',
            'http://www.portalgraphics.net/pg/movie/pg_player/res_movie_data.php?mid={0}&lang=en',
            'http://www.portalgraphics.net/pg/movie/address.php?image%5Fid={0}',
            'http://www.portalgraphics.net/pg/movie/address.php?image_id={0}',
        )
        wget_args.extend(
            template.format(item_value) for template in image_templates)
    elif item_type == 'user_id':
        wget_args.append('http://portalgraphics.net/pg/profile/?user_id={0}'.format(item_value))
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the realized wget-lua command line for a sketch item.

    The item name has the form ``<type>:<value>``:

    * ``sketches`` - a list of sketch ids is fetched over HTTP; every
      non-blank stripped line gets a ``sketch-sketch-id`` WARC header and
      a shared-sketch API URL.
    * ``user`` - the value gets a ``sketch-user-id`` WARC header and an
      artist API URL.

    Side effects: stores ``item_type`` and ``item_value`` on ``item`` and,
    when a module-level ``bind_address`` exists, prints a notice.

    Raises:
        AssertionError: the item name has no ``:``.
        Exception: the item type is not supported.
    """
    wget_args = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--lua-script', 'sketch.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'sonymobile.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'sketch-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('sketch-item: %(item_name)s'),
        '--header', 'Accept-Encoding: gzip',
        '--compression', 'gzip'
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    http_client = httpclient.HTTPClient()
    # try/finally closes the client on the error paths too (failed fetch,
    # unknown item type), which the original skipped.
    try:
        if item_type == 'sketches':
            r = http_client.fetch('http://103.230.141.2/sketch/' + item_value,
                                  method='GET')
            for s in r.body.decode('utf-8', 'ignore').splitlines():
                s = s.strip()
                if len(s) == 0:
                    continue
                wget_args.extend(
                    ['--warc-header', 'sketch-sketch-id: {}'.format(s)])
                wget_args.append(
                    'https://sketch.sonymobile.com/api/1/sharedsketch/{}'.
                    format(s))
        elif item_type == 'user':
            # The original format string had no '{}' placeholder, so the
            # header was always written with an empty user id.
            wget_args.extend(
                ['--warc-header',
                 'sketch-user-id: {}'.format(item_value)])
            wget_args.append(
                'https://sketch.sonymobile.com/api/1/artist/{}'.format(
                    item_value))
        else:
            raise Exception('Unknown item')
    finally:
        http_client.close()

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua arguments for a Panoramio id range.

    The item name is ``photos:<start>-<stop>`` or ``users:<start>-<stop>``;
    one WARC header and one seed URL are added for every integer in the
    inclusive range. ``item_type`` / ``item_value`` are stored on ``item``.

    Raises ``AssertionError`` for a malformed name or unsupported type (and
    ``Exception('Unknown item')`` if assertions are disabled).
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--no-cookies",
        "--lua-script", "panoramio.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "panoramio.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "panoramio-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("panoramio-item: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, id_range = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = id_range

    assert kind in ('photos', 'users')

    # Both types share the same expansion; only the templates differ.
    if kind == 'photos':
        header_template = 'panoramio-photo: {i}'
        url_template = 'http://www.panoramio.com/photo/{i}'
    elif kind == 'users':
        header_template = 'panoramio-user: {i}'
        url_template = 'http://www.panoramio.com/user/{i}'
    else:
        raise Exception('Unknown item')

    first, last = id_range.split('-')
    for number in range(int(first), int(last) + 1):
        wget_args.extend(['--warc-header', header_template.format(i=number)])
        wget_args.append(url_template.format(i=number))

    if 'bind_address' in globals():
        bind_to = globals()['bind_address']
        wget_args.extend(['--bind-address', bind_to])
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_to))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Return the command line that runs ``dedupe.py`` on this item's WARC.

    Input and output paths are derived from ``item_dir`` and
    ``warc_file_base`` via ``%``-formatting against ``item``.
    """
    source_warc = '%(item_dir)s/%(warc_file_base)s.warc.gz' % item
    deduped_warc = '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz' % item

    # "-u" turns off output buffering in the child interpreter.
    command = [PYTHON2_EXE, '-u', 'dedupe.py', source_warc, deduped_warc]
    return realize(command, item)
def realize(self, item):
    """Assemble the wget-lua arguments for a 500px item.

    Item names are ``<type>:<value>``:

    * ``photos``: ``<value>`` is a ``;``-separated list of photo ids; each id
      yields a WARC header, the photo page and two API URLs.
    * ``all``: ``<value>`` is ``<start>-<end>``; each integer in the inclusive
      range yields a WARC header and the photo page.

    ``item_type`` / ``item_value`` are stored on ``item``. Any other type
    raises ``Exception('Unknown item')``.
    """
    wget_args = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--lua-script', '500px.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', '500px.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', '500px-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('500px-item: %(item_name)s'),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind == 'photos':
        for photo_id in value.split(';'):
            wget_args.extend(
                ['--warc-header', '500px-photo: {}'.format(photo_id)])
            wget_args.extend([
                'https://500px.com/photo/{}'.format(photo_id),
                'https://api.500px.com/v1/photos/{}/comments?sort=created_at&include_subscription=1&include_flagged=1&nested=1&page=1&rpp=30'.format(photo_id),
                'https://api.500px.com/v1/photos?image_size%5B%5D=1&image_size%5B%5D=2&image_size%5B%5D=32&image_size%5B%5D=31&image_size%5B%5D=33&image_size%5B%5D=34&image_size%5B%5D=35&image_size%5B%5D=36&image_size%5B%5D=2048&image_size%5B%5D=4&image_size%5B%5D=14&expanded_user_info=true&include_tags=true&include_geo=true&include_equipment_info=true&include_licensing=true&include_releases=true&liked_by=1&following_sample=100&ids={}'.format(photo_id),
            ])
            # Disabled seed (navigation endpoint), kept for reference:
            # 'https://api.500px.com/v1/photos/{}/navigation?from=user&formats=jpeg%2Clytro&image_size%5B%5D=1&image_size%5B%5D=2&image_size%5B%5D=32&image_size%5B%5D=31&image_size%5B%5D=33&image_size%5B%5D=34&image_size%5B%5D=35&image_size%5B%5D=36&image_size%5B%5D=2048&image_size%5B%5D=4&image_size%5B%5D=14'
    elif kind == 'all':
        first, last = value.split('-')
        for photo_id in range(int(first), int(last) + 1):
            wget_args.extend(
                ['--warc-header', '500px-photo: {}'.format(photo_id)])
            wget_args.append('https://500px.com/photo/{}'.format(photo_id))
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        bind_to = globals()['bind_address']
        wget_args.extend(['--bind-address', bind_to])
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_to))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the wget-lua command line for one SourceForge project.

    ``item['item_name']`` must be ``project:<name>``. The two halves are
    stored on ``item`` as ``item_type`` and ``item_value``, and a fixed set of
    project URLs is added as seeds.

    Raises:
        AssertionError: the name has no ``:`` or the type is not ``project``.
        Exception: unknown type when assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "sourceforge.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "sourceforge.net",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "sourceforge-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("sourceforge-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # Trailing comma makes this a real one-element tuple. Without it the test
    # was a substring check against the string 'project' (so 'proj' passed).
    assert item_type in ('project',)

    if item_type == 'project':
        # Order and content match the original seed list, including the
        # repeated first entry.
        seed_templates = (
            'http://sourceforge.net/projects/{0}/',
            'http://sourceforge.net/projects/{0}/?source=directory',
            'http://sourceforge.net/projects/{0}/?source=directory-featured',
            'http://sourceforge.net/projects/{0}/?source=frontpage&position=1',
            'http://sourceforge.net/projects/{0}/?source=frontpage',
            'http://sourceforge.net/projects/{0}/',
            'http://sourceforge.net/p/{0}/',
            'http://sourceforge.net/rest/p/{0}/',
            'http://sourceforge.net/rest/p/{0}?doap',
            'http://{0}.sourceforge.net/',
        )
        wget_args.extend(
            template.format(item_value) for template in seed_templates)
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua arguments for a codebender sketch range.

    The item name is ``<type>:<value>``. For ``sketches`` the value is
    ``<start>-<stop>`` and each integer in the inclusive range adds one WARC
    header plus four seed URLs. ``user`` passes the assertion but has no
    handler here, so it raises ``Exception('Unknown item')``.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        # "--no-cookies",
        "--load-cookies", "cookies.txt",
        "--lua-script", "codebender.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "codebender.cc",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "codebender-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("codebender-item: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    assert kind in ('sketches', 'user')

    if kind != 'sketches':
        raise Exception('Unknown item')

    first, last = value.split('-')
    for sketch_id in range(int(first), int(last) + 1):
        wget_args.extend(
            ['--warc-header', 'codebender-sketch: {i}'.format(i=sketch_id)])
        wget_args.extend([
            'https://codebender.cc/sketch:{i}'.format(i=sketch_id),
            'https://codebender.cc/sketch:{i}?noCookies=true'.format(i=sketch_id),
            'https://codebender.cc/utilities/download/{i}'.format(i=sketch_id),
            'https://codebender.cc/utilities/download/{i}?noCookies=true'.format(i=sketch_id),
        ])

    if 'bind_address' in globals():
        bind_to = globals()['bind_address']
        wget_args.extend(['--bind-address', bind_to])
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_to))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua arguments for a Yuku thread item.

    The item name has four ``:``-separated fields, e.g.
    ``yuku:10threads:deltasforest29697:17``. Fields two to four are stored on
    ``item`` as ``item_type``, ``item_value`` and ``item_thread``.

    * ``thread``: one topic URL.
    * ``10threads``: ten topic URLs, formed by appending each decimal digit
      to the thread field.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "yuku.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--no-cookies",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "yuku.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "yuku-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("yuku-user: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    _project, kind, board, thread = full_name.split(':', 3)
    item['item_type'] = kind
    item['item_value'] = board
    item['item_thread'] = thread

    assert kind in ('thread', '10threads')

    if kind == 'thread':
        wget_args.append('http://%s.yuku.com/topic/%s/' % (board, thread))
    elif kind == '10threads':
        wget_args.extend(
            'http://%s.yuku.com/topic/%s%s/' % (board, thread, digit)
            for digit in string.digits
        )
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        bind_to = globals()['bind_address']
        wget_args.extend(['--bind-address', bind_to])
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_to))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the wpull command line for one FTP item list.

    ``item['item_name']`` has three ``:``-separated fields; the second (an
    archive.org item) and third (a file in it) locate a listing that is
    downloaded with ``requests``. Every listing line starting with ``ftp://``
    becomes a seed URL. The second field is stored as ``item['item_item']``.

    Raises:
        AssertionError: the item name has no ``:``.
        Exception: the listing request does not return 200, the listing has
            no ``ITEM_TOTAL_SIZE`` marker, the advertised size exceeds
            ``MAX_SIZE``, or a URL contains ``#``.
    """
    # urllib.unquote only exists on Python 2; fall back to the Python 3 name.
    try:
        from urllib import unquote
    except ImportError:
        from urllib.parse import unquote

    wget_args = [
        WPULL_EXE,
        "-nv",
        "--python-script", "ftp.py",
        "-o", ItemInterpolation("%(item_dir)s/wpull.log"),
        "--no-check-certificate",
        "--database", ItemInterpolation("%(item_dir)s/wpull.db"),
        "--delete-after",
        "--no-robots",
        "--no-cookies",
        "--rotate-dns",
        "--timeout", "60",
        "--tries", "inf",
        "--wait", "0.5",
        "--random-wait",
        "--waitretry", "5",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "ftp-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("ftp-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    _item_sort, item_item, item_file = item_name.split(':', 2)

    item['item_item'] = item_item

    MAX_SIZE = 10737418240

    # Build the URL once so the error message reports the URL actually
    # requested (it previously printed an https:// variant).
    listing_url = 'http://archive.org/download/{0}/{1}'.format(
        item_item, item_file)
    item_list = requests.get(listing_url)

    if item_list.status_code != 200:
        raise Exception('You received status code %d with URL %s'
                        % (item_list.status_code, listing_url))

    size_match = re.search(r'ITEM_TOTAL_SIZE: ([0-9]+)', item_list.text)
    if size_match is None:
        # Previously an AttributeError on ``None.group``.
        raise Exception('No ITEM_TOTAL_SIZE found in %s' % listing_url)
    itemsize = int(size_match.group(1))

    if itemsize > MAX_SIZE:
        raise Exception('Item is %d bytes. This is larger then %d bytes.'
                        % (itemsize, MAX_SIZE))

    for url in item_list.text.splitlines():
        if not url.startswith('ftp://'):
            continue

        # NOTE(review): the original chained ``.replace('&', '&')`` here,
        # which is a no-op and has been dropped. It may once have been an
        # HTML-entity unescape -- confirm against the upstream script.
        url = url.replace(' ', '%20')
        url = unquote(url)

        if item_item == 'archiveteam_ftp_items_2015120102':
            url = url.replace(
                'ftp://ftp.research.microsoft.com/downloads/downloads/',
                'ftp://ftp.research.microsoft.com/downloads/')

        if '#' in url:
            raise Exception('%s containes a bad character.' % (url))

        wget_args.append("{0}".format(url))

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua arguments for a canv.as item.

    The item name is ``<type>:<value>`` and both halves are stored on
    ``item``.

    * ``user``: seeds ``http://canv.as/<value>/`` and turns on unlimited
      recursion.
    * ``homepage``: seeds ``http://canv.as/`` only.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "canvas.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--no-cookies",
        "--rotate-dns",
        # Recursion is added below, for user items only.
        "--no-parent",
        "--page-requisites",
        "--timeout", "60",
        "--tries", "inf",
        "--span-hosts",
        "--waitretry", "3600",
        "--domains", "canv.as,drawquest-export.s3-website-us-east-1.amazonaws.com",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "canvas-archive-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("canvas-user: %(item_name)s"),
        "--header", "Host: drawquest-export.s3-website-us-east-1.amazonaws.com",
    ]

    kind, value = item['item_name'].split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    assert kind in ('user', 'homepage')

    if kind == 'user':
        wget_args.append('http://canv.as/{0}/'.format(value))
        wget_args.extend(["--recursive", "--level=inf"])
    elif kind == 'homepage':
        wget_args.append('http://canv.as/')
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        bind_to = globals()['bind_address']
        wget_args.extend(['--bind-address', bind_to])
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_to))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-at arguments for a batch of So-net U-Page+ items.

    ``item['item_name']`` holds one or more item names separated by NUL
    characters. After dropping any ``http://`` substring, each name must be
    ``userdir:<hostname>/<dir>``; it contributes WARC headers, an
    ``item-name://`` marker and four URL spellings of the directory.

    ``item['item_name_newline']`` is set to the same names joined by
    newlines. Any other item type raises ``ValueError``.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--lua-script', 'so-net-u-page-plus.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'upp.so-net.ne.jp',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'x-wget-at-project-version: ' + VERSION,
        '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
        '--warc-dedup-url-agnostic',
    ]

    for raw_name in item['item_name'].split('\0'):
        name = raw_name.replace('http://', '')
        wget_args.extend(
            ['--warc-header', 'x-wget-at-project-item-name: ' + name])
        wget_args.append('item-name://' + name)

        kind, value = name.split(':', 1)
        if kind != "userdir":
            raise ValueError('item_type not supported.')

        wget_args.extend(
            ['--warc-header', 'so-net-u-page-plus-userdir: ' + value])
        path_parts = value.split("/")
        hostname = path_parts[0]
        user_dir_name = path_parts[1]

        # The server's handling of these forms is unknown, so the directory
        # is requested in several spellings.
        wget_args.extend([
            'http://{}.upp.so-net.ne.jp/{}/'.format(hostname, user_dir_name),
            'http://{}.upp.so-net.ne.jp/{}/index.htm'.format(hostname, user_dir_name),
            'http://{}.upp.so-net.ne.jp/{}/index.html'.format(hostname, user_dir_name),
            'http://{}.upp.so-net.ne.jp/{}'.format(hostname, user_dir_name),
        ])

    item['item_name_newline'] = item['item_name'].replace('\0', '\n')

    if 'bind_address' in globals():
        bind_to = globals()['bind_address']
        wget_args.extend(['--bind-address', bind_to])
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_to))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua arguments for a GameFront forum item.

    The item name is ``forums:<prefix>`` or ``members:<prefix>``. Ten seed
    URLs are produced by appending each decimal digit to the prefix.
    ``item_type`` / ``item_value`` are stored on ``item``.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "gamefront.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--no-cookies",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "gamefront.com,filefront.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "gamefront-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("gamefront-user: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, prefix = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = prefix

    assert kind in ('forums', 'members')

    if kind == 'forums':
        url_template = 'http://forums.filefront.com/showthread.php?t={0}{1}'
    elif kind == 'members':
        url_template = 'http://forums.filefront.com/member.php?u={0}{1}'
    else:
        raise Exception('Unknown item')

    wget_args.extend(
        url_template.format(prefix, digit) for digit in string.digits)

    if 'bind_address' in globals():
        bind_to = globals()['bind_address']
        wget_args.extend(['--bind-address', bind_to])
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_to))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the wget-lua command line for one olympe.in site.

    ``item['item_name']`` must be ``site:<name>``. The site's front page is
    requested with ``requests`` to find where it ends up; the original host
    and the final host are both used as seeds, written to the ``seedurls``
    file in the working directory, and passed to ``--domains``.

    Raises:
        AssertionError: the name has no ``:`` or the type is not ``site``.
        Exception: unknown type when assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "olympe.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "olympe-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("olympe-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # One-element tuple; ``('site')`` is just the string 'site', which made
    # this a substring test.
    assert item_type in ('site',)

    if item_type == 'site':
        urls = ['http://' + item_value + '.olympe.in/']
        response = requests.get(urls[0])
        # Scheme + host of wherever the request finally landed.
        urls.append(re.search(r'^(https?://[^/]+)', response.url).group(1))

        # Host names are needed for both the seed file and --domains, so they
        # are extracted once.
        hosts = [re.search(r'^https?://([^/]+)', url).group(1) for url in urls]
        with open('seedurls', 'w') as seed_file:
            seed_file.write('\n'.join(hosts))

        wget_args.extend(urls)
        wget_args.append(response.url)
        wget_args += ["--domains", ','.join(hosts)]
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the wget-lua command line for a range of map tiles.

    ``item['item_name']`` must be ``tiles:<layer>:<num>:<start>-<end>``. For
    every integer from ``start`` up to but not including ``end``, three tile
    URLs (base, hybrid, satellite) are added as seeds.

    Raises:
        AssertionError: the name has no ``:`` or the type is not ``tiles``.
        Exception: unknown type when assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "yahoomaps.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        # "--recursive", "--level=inf",
        "--no-parent",
        # "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "yahoo.com,here.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        # NOTE(review): "sourceforge-" looks copy-pasted from another
        # project; left unchanged because existing WARCs carry this name.
        "--warc-header", "sourceforge-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("yahoomaps-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # One-element tuple; ``('tiles')`` is a plain string and turned this into
    # a substring test.
    assert item_type in ('tiles',)

    if item_type == 'tiles':
        tile_layer, tile_num, tile_range = item_value.split(':')
        tile_start, tile_end = tile_range.split('-')
        print(tile_start)
        print(tile_end)

        tile_templates = (
            'http://1.base.maps.api.here.com/maptile/2.1/maptile/187ddf591c/normal.day/{0}/{1}/{2}/256/png8?lg=ENG&token=TrLJuXVK62IQk0vuXFzaig%3D%3D&requestid=yahoo.prod&app_id=eAdkWGYRoc4RfxVo0Z4B',
            'http://1.aerial.maps.api.here.com/maptile/2.1/maptile/187ddf591c/hybrid.day/{0}/{1}/{2}/256/jpg?lg=ENG&token=TrLJuXVK62IQk0vuXFzaig%3D%3D&requestid=yahoo.prod&app_id=eAdkWGYRoc4RfxVo0Z4B',
            'http://1.aerial.maps.api.here.com/maptile/2.1/maptile/187ddf591c/satellite.day/{0}/{1}/{2}/256/jpg?lg=ENG&token=TrLJuXVK62IQk0vuXFzaig%3D%3D&requestid=yahoo.prod&app_id=eAdkWGYRoc4RfxVo0Z4B',
        )
        # NOTE(review): the end of the range is exclusive here -- confirm
        # this matches how the tracker generates tile items.
        for tilenum in range(int(tile_start), int(tile_end)):
            for template in tile_templates:
                wget_args.append(
                    template.format(tile_layer, tile_num, tilenum))
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wpull arguments for the example-city template project.

    The whole item name is used as a host name and seeded as
    ``http://<item_name>``.
    """
    wget_args = [
        WPULL_EXE,
        "-nv",
        # "--user-agent", USER_AGENT,
        "--python-script", "examplecity.py",
        "-o", ItemInterpolation("%(item_dir)s/wpull.log"),
        "--no-check-certificate",
        "--database", ItemInterpolation("%(item_dir)s/wpull.db"),
        "--delete-after",
        "--no-robots",
        "--no-cookies",
        "--rotate-dns",
        # "--recursive", "--level=inf",
        "--recursive", "--level=2",
        "--no-parent",
        "--page-requisites",
        "--span-hosts-allow", "page-requisites,linked-pages",
        "--timeout", "30",
        "--tries", "2",
        "--wait", "0.5",
        "--random-wait",
        "--waitretry", "5",
        # "--domains", "example.com,example.net",
        # "--hostnames", "assets.cloudspeeder.invalid,cnd.wahoo.invalid",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "examplecity-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("examplecity-user: %(item_name)s"),
    ]

    seed_url = "http://{0}".format(item['item_name'])
    wget_args.append(seed_url)

    if 'bind_address' in globals():
        bind_to = globals()['bind_address']
        wget_args.extend(['--bind-address', bind_to])
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_to))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the wget-lua command line for one old-platform Coursera course.

    ``item['item_name']`` must be ``oldcourse:<slug>``. Before the argument
    list is returned, a separate wget-lua process is run through the shell to
    post a login request and save session cookies to ``cookies.txt`` (which
    the main command loads). The downloaded ``v3`` response file is then
    deleted.

    Raises:
        AssertionError: the name has no ``:`` or the type is not
            ``oldcourse``.
        Exception: unknown type when assertions are disabled.
        OSError: the ``v3`` file cannot be removed.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--load-cookies", "cookies.txt",
        "--lua-script", "coursera.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "coursera.org",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "coursera-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("coursera-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # One-element tuple; ``('oldcourse')`` is a plain string and made this a
    # substring test (e.g. 'course' passed).
    assert item_type in ('oldcourse',)

    if item_type == 'oldcourse':
        # Random digit strings sent as both cookie and header values.
        X_CSRFToken = ''.join(random.choice(string.digits) for i in range(20))
        X_CSRF2_Cookie = 'csrf2_token_' + ''.join(
            random.choice(string.digits) for i in range(8))
        X_CSRF2_Token = ''.join(random.choice(string.digits) for i in range(24))
        Cookie = "csrftoken=%s; %s=%s" % (
            X_CSRFToken, X_CSRF2_Cookie, X_CSRF2_Token)

        # Every interpolated value is generated above from digits only, so
        # the shell string contains no external input.
        os.system(
            WGET_LUA
            + " --save-cookies cookies.txt --keep-session-cookies --post-data '[email protected]&password=123456&webrequest=true' --header='Cookie: "
            + Cookie
            + "' --header='X-CSRFToken: " + X_CSRFToken
            + "' --header='X-CSRF2-Cookie: " + X_CSRF2_Cookie
            + "' --header='X-CSRF2-Token: " + X_CSRF2_Token
            + "' https://www.coursera.org/api/login/v3")
        # wget saved the login response under the URL's last path segment.
        os.remove('v3')

        wget_args.append('https://www.coursera.org/course/' + item_value)
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the wget-lua command line for a block of Toshiba content ids.

    ``item['item_name']`` must be ``download:<prefix>``. Each of three
    support endpoints is seeded with the 100 ids formed by appending two
    decimal digits to ``<prefix>``.

    Raises:
        AssertionError: the name has no ``:`` or the type is not
            ``download``.
        Exception: unknown type when assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "toshiba.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "toshiba.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "toshiba-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("toshiba-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # One-element tuple; ``('download')`` is a plain string and made this a
    # substring test.
    assert item_type in ('download',)

    if item_type == 'download':
        suffixes = string.digits
        endpoint_templates = (
            'http://support.toshiba.com/sscontent?contentId={0}{1}{2}',
            'http://support.toshiba.com/support/viewContentDetail?contentId={0}{1}{2}',
            'http://support.toshiba.com/support/staticContentDetail?contentId={0}{1}{2}',
        )
        # Same order as before: all ids for the first endpoint, then the
        # second, then the third.
        for template in endpoint_templates:
            wget_args.extend(
                template.format(item_value, a, b)
                for a in suffixes for b in suffixes)
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Return the plain-wget command that downloads an item's megawarc.

    The file is fetched from the ``archiveteam_<item_name>`` collection on
    archive.org and written to ``<data_dir>/<item_name>.warc.gz``.
    """
    output_path = item['data_dir'] + "/" + item['item_name'] + ".warc.gz"
    source_url = ItemInterpolation(
        "https://archive.org/download/archiveteam_%(item_name)s/%(item_name)s.megawarc.warc.gz"
    )

    command = [
        'wget',
        '-nv',
        '-U', 'ArchiveTeam; Googlebot/2.1',
        '--tries', '5',
        '--waitretry', '5',
        '-O', output_path,
        source_url,
    ]
    return realize(command, item)
def realize(self, item):
    """Assemble the wget-lua arguments for a plays.tv item.

    The item name is ``<type>:<value>`` and both halves are stored on
    ``item``.

    * ``user``: one WARC header and the user API URL.
    * ``video``: ``<value>`` is a ``;``-separated list of ids; each id is
      printed and adds one WARC header plus the media API URL.
    """
    wget_args = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--content-on-error',
        '--lua-script', 'playstv.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'plays.tv',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'playstv-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('playstv-item: %(item_name)s'),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind == 'user':
        wget_args += [
            '--warc-header', 'playstv-user-id: ' + value,
            'https://plays.tv/playsapi/usersys/v1/user/' + value,
        ]
    elif kind == 'video':
        for video_id in value.split(';'):
            print(video_id)
            wget_args += [
                '--warc-header', 'playstv-video-id: ' + video_id,
                'https://plays.tv/playsapi/feedsys/v1/media/' + video_id,
            ]
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        bind_to = globals()['bind_address']
        wget_args.extend(['--bind-address', bind_to])
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_to))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua arguments for a block of LayerVault ids.

    The item name is ``project:<prefix>`` or ``file:<prefix>``. The matching
    API endpoint is seeded with the 100 ids formed by appending two decimal
    digits to the prefix. ``item_type`` / ``item_value`` are stored on
    ``item``.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "layervault.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "layervault.com,layervau.lt",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "layervault-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("layervault-user: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, prefix = full_name.split(':', 1)
    item['item_type'] = kind
    item['item_value'] = prefix

    assert kind in ('project', 'file')

    digits = string.digits

    if kind == 'project':
        url_template = 'https://layervault.com/api/v2/projects/{0}{1}{2}'
    elif kind == 'file':
        url_template = 'https://layervault.com/api/v2/files/{0}{1}{2}'
    else:
        raise Exception('Unknown item')

    for tens in digits:
        for ones in digits:
            wget_args.append(url_template.format(prefix, tens, ones))

    if 'bind_address' in globals():
        bind_to = globals()['bind_address']
        wget_args.extend(['--bind-address', bind_to])
        print('')
        print('*** Wget will bind address at {0} ***'.format(bind_to))
        print('')

    return realize(wget_args, item)
def process_one(self, item):
    """Start an asynchronous download of the item's current URL.

    The URL is read from ``item['WgetDownloadMany.current_url']`` and
    appended to the realized ``self.args``. The child process is started
    inside ``self.task_cwd()`` with stdout/end callbacks attached; the bytes
    from ``self.stdin_data(item)`` are written to its stdin, which is then
    closed.
    """
    with self.task_cwd():
        target_url = item['WgetDownloadMany.current_url']
        item.log_output("Start downloading URL %s" % target_url)

        command = realize(self.args, item) + [target_url]
        environment = realize(self.env, item)
        process = seesaw.externalprocess.AsyncPopen(
            args=command,
            env=environment,
            stdin=subprocess.PIPE,
            close_fds=True,
        )

        process.on_output += functools.partial(
            self.on_subprocess_stdout, process, item)
        process.on_end += functools.partial(self.on_subprocess_end, item)

        process.run()

        process.stdin.write(self.stdin_data(item))
        process.stdin.close()
def realize(self, item):
    """Build the wget-at command line for Super Mario Maker Bookmark items.

    ``item['item_name']`` holds one or more NUL-separated item names of the
    form ``user:<id>`` or ``course:<id>``. Each name adds WARC headers, an
    ``item-name://`` marker and the matching profile or course URL.
    ``item['item_name_newline']`` is set to the names joined by newlines.

    Raises:
        ValueError: an item name has a type other than ``user`` or
            ``course``, or contains no ``:``.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--load-cookies', 'cookies.txt',
        '--lua-script', 'super-mario-maker-bookmarks.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        # NOTE(review): 'voat.co' does not match the nintendo.net seeds below
        # and looks like a leftover from another project -- confirm.
        '--domains', 'voat.co',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'x-wget-at-project-version: ' + VERSION,
        '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
        '--warc-dedup-url-agnostic'
    ]

    item_names = item['item_name'].split('\0')
    item['item_name_newline'] = item['item_name'].replace('\0', '\n')

    for item_name in item_names:
        wget_args.extend(
            ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
        wget_args.append('item-name://' + item_name)
        item_type, item_value = item_name.split(':', 1)
        if item_type == 'user':
            # This branch was corrupted in the file (part of the statement
            # was replaced by asterisks); restored to mirror the 'course'
            # branch.
            wget_args.extend([
                '--warc-header',
                'super-mario-world-bookmarks-user: ' + item_value
            ])
            wget_args.append(
                'https://supermariomakerbookmark.nintendo.net/profile/'
                + item_value)
        elif item_type == 'course':
            wget_args.extend([
                '--warc-header',
                'super-mario-world-bookmarks-course: ' + item_value
            ])
            wget_args.append(
                'https://supermariomakerbookmark.nintendo.net/courses/'
                + item_value)
        else:
            raise ValueError('item_type not supported.')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the wget-lua command line for one jux.com site.

    ``item['item_name']`` must be ``jux:<subdomain>``. Six URLs on
    ``<subdomain>.jux.com`` are added as seeds.

    Raises:
        AssertionError: the name has no ``:`` or the type is not ``jux``.
        Exception: unknown type when assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "jux.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "jux.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "jux-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("jux-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # One-element tuple; ``("jux")`` is a plain string and made this a
    # substring test (e.g. 'ju' passed).
    assert item_type in ("jux",)

    if item_type == 'jux':
        seed_templates = (
            'http://{0}.jux.com/',
            'http://{0}.jux.com/robots.txt',
            'http://{0}.jux.com/sitemap.xml',
            'http://{0}.jux.com/owner.json',
            'http://{0}.jux.com/quarks.json',
            'http://{0}.jux.com/quarks.json?per_page=1000000000',
        )
        wget_args.extend(
            template.format(item_value) for template in seed_templates)
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-at arguments for one ``hg:<owner>/<repo>`` item.

    Side effects on ``item``: the zstd dictionary returned by
    ``ZstdDict.get_dict()`` is written to ``<item_dir>/zstdict``; its id and
    the tracker id are stored under ``dict_id`` / ``dict_project``; the two
    halves of ``item_name`` are stored under ``item_type`` / ``item_value``.

    Returns:
        The result of the module-level ``realize`` helper applied to the
        argument list (presumably the list with ``ItemInterpolation``
        entries resolved -- confirm against the helper).

    Raises:
        ValueError: if ``item_name`` has no ``:`` (tuple unpacking fails).
        Exception: ``'Unknown item'`` for any type other than ``hg``.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--content-on-error',
        '--lua-script', 'bitbucket.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        # NOTE(review): the seeds below are on bitbucket.org, not
        # bitbucket.com -- confirm this value is intended.
        '--domains', 'bitbucket.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'bitbucket-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('bitbucket-item: %(item_name)s'),
        '--warc-dedup-url-agnostic',
        '--warc-compression-use-zstd',
        '--warc-zstd-dict-no-include'
    ]

    # Drop the compression dictionary into the item directory and record
    # where it came from on the item.
    zstd_info = ZstdDict.get_dict()
    with open(os.path.join(item['item_dir'], 'zstdict'), 'wb') as dict_file:
        dict_file.write(zstd_info['dict'])
    item['dict_id'] = zstd_info['id']
    item['dict_project'] = TRACKER_ID
    wget_args += [
        '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),
    ]

    kind, value = item['item_name'].split(':', 1)
    item['item_type'] = kind
    item['item_value'] = value

    if kind != 'hg':
        raise Exception('Unknown item')

    repo_url = 'https://bitbucket.org/' + value
    wget_args += [
        repo_url + '/src/',
        repo_url,
        repo_url + '/src/default/',
        'https://bitbucket.org/!api/2.0/repositories/' + value,
    ]

    # Optional module-level override of the local address wget binds to.
    if 'bind_address' in globals():
        address = globals()['bind_address']
        wget_args += ['--bind-address', address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the wget-lua command line for one ``genforum:<path>`` item.

    Splits ``item['item_name']`` at the first ``:`` and stores the two parts
    in ``item['item_type']`` and ``item['item_value']``. For a ``genforum``
    item it seeds both the genforum.genealogy.com and genforum.com URL for
    the path and disables cookies.

    Returns:
        The result of the module-level ``realize`` helper applied to the
        argument list (presumably the list with ``ItemInterpolation``
        entries resolved -- confirm against the helper).

    Raises:
        AssertionError: if the name has no ``:`` or the type is not
            ``genforum`` (only when assertions are enabled).
        Exception: ``'Unknown item'`` for an unsupported type when assertions
            are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "ancestry.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "mundia.com,muncn.com,genealogy.com,familyorigins.com,genforum.com,myfamily.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "ancestry-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("ancestry-user: %(item_name)s"),
    ]

    # Example item names kept from the original comments. Their types are
    # not handled by the branch below, which only accepts "genforum":
    #   genealogy:users:c:o:x:Helen-Cox-NJ
    #   familytreemaker:users:s:c:h:Aaron-J-Schwartz
    #   familyorigins:users:s:c:h:Beverly-G-Schweppe
    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # One-element tuple on purpose: a bare ("genforum") is just a string,
    # which would turn this into a substring test ('', 'gen', 'forum', ...).
    assert item_type in ("genforum",)

    if item_type == "genforum":
        wget_args.append(
            'http://genforum.genealogy.com/{0}/'.format(item_value))
        wget_args.append('http://genforum.com/{0}/'.format(item_value))
        wget_args.append("--no-cookies")
    else:
        # Still reachable when assertions are stripped (python -O).
        raise Exception('Unknown item')

    # 'bind_address' is an optional module-level global; only pass the flag
    # to wget when it has been defined.
    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def process(self, item):
    """Launch the external process for ``item`` and feed it its stdin data.

    The process is created inside ``self.task_cwd()`` with the arguments and
    environment obtained by realizing ``self.args`` and ``self.env`` against
    ``item``. Its output and end events are routed to
    ``self.on_subprocess_stdout`` and ``self.on_subprocess_end``.

    A failure while writing ``self.stdin_data(item)`` is not raised: it is
    logged on the item and flagged by setting
    ``item["ExternalProcess.stdin_write_error"]`` to ``True``. Stdin is
    closed in every case.
    """
    with self.task_cwd():
        proc = AsyncPopen2(
            args=realize(self.args, item),
            env=realize(self.env, item),
            stdin=subprocess.PIPE,
            close_fds=True
        )

        proc.on_output += functools.partial(
            self.on_subprocess_stdout, proc, item)
        proc.on_end += functools.partial(self.on_subprocess_end, item)

        proc.run()

        try:
            proc.stdin.write(self.stdin_data(item))
        except Exception as error:
            # FIXME: We need to properly propagate errors
            item.log_output("Error writing to process: %s" % str(error))
            item["ExternalProcess.stdin_write_error"] = True
        finally:
            # Close the pipe even if the error handler above raises, so the
            # child is not left with an open stdin.
            proc.stdin.close()
def realize(self, item):
    """Build the wget-lua command line for one ``account:<name>`` item.

    Splits ``item['item_name']`` at the first ``:`` and stores the two parts
    in ``item['item_type']`` and ``item['item_value']``. For an ``account``
    item it seeds the FriendFeed API feed URL and the profile page.

    Returns:
        The result of the module-level ``realize`` helper applied to the
        argument list (presumably the list with ``ItemInterpolation``
        entries resolved -- confirm against the helper).

    Raises:
        AssertionError: if the name has no ``:`` or the type is not
            ``account`` (only when assertions are enabled).
        Exception: ``'Unknown item'`` for an unsupported type when assertions
            are disabled.
    """
    # NOTE(review): the line breaks around the commented-out entries (here
    # and in the seed URLs further down) were reconstructed; confirm which
    # entries are meant to be disabled.
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "friendfeed.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        # "--recursive", "--level=inf",
        "--no-parent",
        # "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "friendfeed.com,friendfeed-media.com,friendfeed-api.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "friendfeed-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("friendfeed-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # One-element tuple on purpose: a bare ('account') is just a string,
    # which would turn this into a substring test ('', 'acc', 'count', ...).
    assert item_type in ('account',)

    if item_type == 'account':
        # Disabled seeds (profile pictures), kept for reference:
        # wget_args.append('http://friendfeed-api.com/v2/picture/{0}'.format(item_value))
        # wget_args.append('http://friendfeed-api.com/v2/picture/{0}?size=small'.format(item_value))
        # wget_args.append('http://friendfeed-api.com/v2/picture/{0}?size=medium'.format(item_value))
        # wget_args.append('http://friendfeed-api.com/v2/picture/{0}?size=large'.format(item_value))
        wget_args.append('http://friendfeed-api.com/v2/feed/{0}?pretty=1&num=100&start=0&hidden=1&raw=1'.format(item_value))
        wget_args.append('http://friendfeed.com/{0}'.format(item_value))
    else:
        # Still reachable when assertions are stripped (python -O).
        raise Exception('Unknown item')

    # 'bind_address' is an optional module-level global; only pass the flag
    # to wget when it has been defined.
    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua arguments for a ``user:`` or ``game:`` item.

    The item name is split at its first ``:``; both halves are written back
    to ``item['item_type']`` and ``item['item_value']``. A ``user`` item
    seeds the user's page. A ``game`` item seeds the game page plus the
    download URL built from the part of the value before the first ``-``.

    Returns:
        The result of the module-level ``realize`` helper applied to the
        argument list (presumably the list with ``ItemInterpolation``
        entries resolved -- confirm against the helper).

    Raises:
        AssertionError: if the name has no ``:`` or the type is neither
            ``user`` nor ``game`` (only when assertions are enabled).
        Exception: ``'Unknown item'`` for an unsupported type when assertions
            are disabled.
    """
    # NOTE(review): the line break after the commented-out recursion flags
    # was reconstructed; confirm that only "--recursive"/"--level=inf" are
    # meant to be disabled.
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "sandbox.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        # "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "yoyogames.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "gamemakersandbox-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("gamemakersandbox-user: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)

    item['item_type'] = kind
    item['item_value'] = value

    assert kind in ("user", 'game')

    # Collect the seed URLs for this item type, then add them in one go.
    if kind == 'user':
        seeds = ['http://sandbox.yoyogames.com/users/{0}'.format(value)]
    elif kind == 'game':
        numeric_id = value.split('-', 1)[0]
        seeds = [
            'http://sandbox.yoyogames.com/games/{0}'.format(value),
            'http://sandbox.yoyogames.com/games/{0}/download'.format(numeric_id),
        ]
    else:
        raise Exception('Unknown item')
    wget_args.extend(seeds)

    # Optional module-level override of the local address wget binds to.
    if 'bind_address' in globals():
        address = globals()['bind_address']
        wget_args += ['--bind-address', address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Assemble the wget-lua arguments for one ``disco:<prefix>`` item.

    The item name is split at its first ``:``; both halves are written back
    to ``item['item_type']`` and ``item['item_value']``. For a ``disco``
    item, one API URL is seeded per line of the local ``animals`` file that
    does not start with ``#``: the stripped line is appended to
    ``https://api.gfycat.com/v1/gfycats/<value>``.

    Returns:
        The result of the module-level ``realize`` helper applied to the
        argument list (presumably the list with ``ItemInterpolation``
        entries resolved -- confirm against the helper).

    Raises:
        AssertionError: if the name has no ``:`` (only when assertions are
            enabled).
        Exception: ``'Unknown item'`` for any type other than ``disco``.
    """
    wget_args = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--content-on-error',
        '--lua-script', 'gfycat.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'gfycat.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'gfycat-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('gfycat-item: %(item_name)s'),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    kind, value = full_name.split(':', 1)

    item['item_type'] = kind
    item['item_value'] = value

    if kind != 'disco':
        raise Exception('Unknown item')

    url_prefix = 'https://api.gfycat.com/v1/gfycats/' + value
    with open('animals', 'r') as word_file:
        # Lines starting with '#' are skipped; every other line (blank ones
        # included) produces a URL.
        wget_args.extend(
            url_prefix + word.strip()
            for word in word_file
            if not word.startswith('#')
        )

    # Optional module-level override of the local address wget binds to.
    if 'bind_address' in globals():
        address = globals()['bind_address']
        wget_args += ['--bind-address', address]
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    """Build the wget-at command line for a batch of bintray items.

    ``item['item_name']`` holds one or more ``<type>:<value>`` names joined
    by NUL characters. Every name contributes WARC headers, an
    ``item-name://`` marker argument and its seed URL(s): a ``user`` name
    seeds the bintray.com profile with and without a trailing slash, and a
    ``file`` name seeds the value itself, which must be an ``http`` URL. As
    a side effect the newline-joined form of the names is stored in
    ``item['item_name_newline']``.

    Returns:
        The result of the module-level ``realize`` helper applied to the
        argument list (presumably the list with ``ItemInterpolation``
        entries resolved -- confirm against the helper).

    Raises:
        AssertionError: for the names ``user:account`` / ``user:assets``, or
            a ``file`` value not starting with ``http`` (only when
            assertions are enabled).
        ValueError: if a name has no ``:`` separator (tuple unpacking fails)
            or its type is neither ``user`` nor ``file``.
    """
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--load-cookies', 'cookies.txt',
        '--lua-script', 'bintray.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        # NOTE(review): 'voat.co' does not match the bintray.com seed URLs
        # added below; it looks like a leftover from another project.
        # Left unchanged -- confirm before editing.
        '--domains', 'voat.co',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'x-wget-at-project-version: ' + VERSION,
        '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
        '--warc-dedup-url-agnostic'
    ]

    item_names = item['item_name'].split('\0')
    item['item_name_newline'] = item['item_name'].replace('\0', '\n')

    for item_name in item_names:
        assert item_name not in {'user:account', 'user:assets'}, \
            'Doing this out of caution'
        wget_args.extend(
            ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
        wget_args.append('item-name://' + item_name)
        item_type, item_value = item_name.split(':', 1)
        if item_type == 'user':
            # NOTE(review): this branch was garbled in the file and has been
            # restored to mirror the 'file' branch -- confirm.
            wget_args.extend(
                ['--warc-header', 'bintray-user: ' + item_value])
            wget_args.append(f'https://bintray.com/{item_value}')
            wget_args.append(f'https://bintray.com/{item_value}/')
        elif item_type == 'file':
            wget_args.extend(
                ['--warc-header', 'bintray-file: ' + item_value])
            assert item_value.startswith(
                "http"), "If this fails, something strange has happened"
            wget_args.append(item_value)
        else:
            raise ValueError('item_type not supported.')

    # 'bind_address' is an optional module-level global; only pass the flag
    # to wget when it has been defined.
    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)