def realize(self, item): wget_args = [ WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies', '--lua-script', '500px.lua', '-o', ItemInterpolation('%(item_dir)s/wget.log'), '--no-check-certificate', '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output', '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf', '--no-parent', '--page-requisites', '--timeout', '30', '--tries', 'inf', '--domains', '500px.com', '--span-hosts', '--waitretry', '30', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), '--warc-header', 'operator: Archive Team', '--warc-header', '500px-dld-script-version: ' + VERSION, '--warc-header', ItemInterpolation('500px-item: %(item_name)s'), ] item_name = item['item_name'] assert ':' in item_name item_type, item_value = item_name.split(':', 1) item['item_type'] = item_type item['item_value'] = item_value if item_type == 'photos': for id_ in item_value.split(';'): wget_args.extend( ['--warc-header', '500px-photo: {}'.format(id_)]) wget_args.append('https://500px.com/photo/{}'.format(id_)) wget_args.append( 'https://api.500px.com/v1/photos/{}/comments?sort=created_at&include_subscription=1&include_flagged=1&nested=1&page=1&rpp=30' .format(id_)) wget_args.append( 'https://api.500px.com/v1/photos?image_size%5B%5D=1&image_size%5B%5D=2&image_size%5B%5D=32&image_size%5B%5D=31&image_size%5B%5D=33&image_size%5B%5D=34&image_size%5B%5D=35&image_size%5B%5D=36&image_size%5B%5D=2048&image_size%5B%5D=4&image_size%5B%5D=14&expanded_user_info=true&include_tags=true&include_geo=true&include_equipment_info=true&include_licensing=true&include_releases=true&liked_by=1&following_sample=100&ids={}' .format(id_)) #wget_args.append('https://api.500px.com/v1/photos/{}/navigation?from=user&formats=jpeg%2Clytro&image_size%5B%5D=1&image_size%5B%5D=2&image_size%5B%5D=32&image_size%5B%5D=31&image_size%5B%5D=33&image_size%5B%5D=34&image_size%5B%5D=35&image_size%5B%5D=36&image_size%5B%5D=2048&image_size%5B%5D=4&image_size%5B%5D=14'.format(id_)) elif item_type == 'all': start, end = item_value.split('-') for id_ in range(int(start), int(end) + 1): wget_args.extend( ['--warc-header', '500px-photo: {}'.format(id_)]) wget_args.append('https://500px.com/photo/{}'.format(id_)) else: raise Exception('Unknown item') if 'bind_address' in globals(): wget_args.extend(['--bind-address', globals()['bind_address']]) print('') print('*** Wget will bind address at {0} ***'.format( globals()['bind_address'])) print('') return realize(wget_args, item)
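# Illustrative sketch (not part of the grab script): how a 'photos:' item value,
# a ';'-separated list of photo ids, expands into per-photo WARC headers and start
# URLs, mirroring part of the loop above. expand_photos_item is a hypothetical name.
def expand_photos_item(item_value):
    args = []
    for id_ in item_value.split(';'):
        args.extend(['--warc-header', '500px-photo: {}'.format(id_)])
        args.append('https://500px.com/photo/{}'.format(id_))
    return args

# expand_photos_item('12345;67890') -> two '--warc-header' pairs and two photo URLs.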
def realize(self, item): wget_args = [ WGET_LUA, '-U', USER_AGENT, #'-nv', '--no-cookies', '--lua-script', 'static-only.lua', '-o', ItemInterpolation('%(item_dir)s/wget.log'), '--no-check-certificate', '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output', '-e', 'robots=off', '--rotate-dns', # '--recursive', '--level=inf', # '--no-parent', # '--page-requisites', '--timeout', '30', # '--tries', 'inf', # '--domains', '.com', # '--span-hosts', '--waitretry', '30', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), '--warc-header', 'operator: Archive Team', '--warc-header', 'static-only-dld-script-version: ' + VERSION, '--warc-header', ItemInterpolation('static-only-item: %(item_name)s'), # --warc-header static-url-id: ... filled in below # '--header', 'Accept-Encoding: gzip', # '--compression', 'gzip' # changed flags # ] item_type = item["item_type"] item_value = item["item_value"] wget_urls = [] task_line = item_value #if len(task_line) == 0: # continue # elif item_type == 'static_job_json': # TODO # elif item_type == 'static_job_urls': # TODO if item_type == 'static_url': print("T> " + task_line) #debug wget_urls.append(task_line) else: raise Exception('Unknown item') item["todo_url_count"] = str(len(wget_urls)) print("URIs ToDo: {}".format(len(wget_urls))) if len(wget_urls) == 0: wget_args.append("-V") else: wget_args.extend(wget_urls) # print("\nD^ ".join(defer_assets)) #debug # print("\nD^ ", end="") #debug if 'bind_address' in globals(): wget_args.extend(['--bind-address', globals()['bind_address']]) print('') print('*** Wget will bind address at {0} ***'.format( globals()['bind_address'])) print('') return realize(wget_args, item)
WgetDownload( WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 7, 8], env={ "item_dir": ItemValue("item_dir"), "item_value": ItemValue("item_value"), "item_type": ItemValue("item_type"), "downloader": downloader } ), PrepareStatsForTracker( defaults={"downloader": downloader, "version": VERSION}, file_groups={ "data": [ ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz") ] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent(NumberConfigValue(min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz") ],
def realize(self, item):
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "ovi-store.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "ovi.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "ovi-store-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("ovi-store-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('app',)

    if item_type == 'app':
        if item_value == '':
            suffixes = '123456789'
        else:
            suffixes = string.digits
        for url in ['http://store.ovi.com/content/{0}{1}'.format(item_value, s)
                    for s in suffixes]:
            wget_args.append(url)
        for url in ['http://store.ovi.com/content/{0}{1}/Download'.format(item_value, s)
                    for s in suffixes]:
            wget_args.append(url)
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item):
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--no-cookies",
        "--lua-script", "soundcloud.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "soundcloud.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "soundcloud-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("soundcloud-item: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    def add_api(i, v):
        wget_args.extend([
            '--warc-header', 'soundcloud-track-api: {i}'.format(i=i, v=v)
        ])
        wget_args.append(
            'https://api.soundcloud.com/app/v2/tracks/{i}/comments?threaded=1&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=200&offset=0&linked_partitioning=1&app_version={v}'
            .format(i=i, v=v))
        wget_args.append(
            'https://api-v2.soundcloud.com/tracks/{i}/related?anon_user_id=33006123&variant=control&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=10&offset=0&linked_partitioning=1&app_version={v}'
            .format(i=i, v=v))
        wget_args.append(
            'https://api-v2.soundcloud.com/tracks/{i}/albums?representation=mini&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=10&offset=0&linked_partitioning=1&app_version={v}'
            .format(i=i, v=v))
        wget_args.append(
            'https://api-v2.soundcloud.com/tracks/{i}/playlists_without_albums?representation=mini&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=10&offset=0&linked_partitioning=1&app_version={v}'
            .format(i=i, v=v))
        wget_args.append(
            'https://api-v2.soundcloud.com/tracks/{i}/likers?client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=9&offset=0&linked_partitioning=1&app_version={v}'
            .format(i=i, v=v))
        wget_args.append(
            'https://api-v2.soundcloud.com/tracks/{i}/reposters?client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=9&offset=0&linked_partitioning=1&app_version={v}'
            .format(i=i, v=v))
        wget_args.append(
            'https://api-v2.soundcloud.com/audio-ad?sc_a_id=28936013-0c76-4245-b4dd-a0d7fc590135&device_locale=nl&track_id={i}&rubicon_user_id=J0JKJ4XR-1E-1OEW&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&app_version={v}'
            .format(i=i, v=v))
        wget_args.append(
            'https://api.soundcloud.com/app/v2/tracks/{i}/comments?filter_replies=1&limit=200&offset=0&linked_partitioning=1&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&app_version={v}'
            .format(i=i, v=v))
        #wget_args.append('https://api.soundcloud.com/i1/tracks/{i}/streams?client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z'.format(i=i, v=v))
        wget_args.append(
            'https://api-v2.soundcloud.com/stations/soundcloud:track-stations:{i}/tracks?client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=10&offset=0&linked_partitioning=1&app_version={v}'
            .format(i=i, v=v))
        wget_args.append(
            'https://api-v2.soundcloud.com/stations?urns=soundcloud%3Atrack-stations%3A{i}&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&app_version={v}'
            .format(i=i, v=v))
        #extra
        wget_args.append(
            'https://api.soundcloud.com/tracks/{i}/favoriters?client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&app_version={v}'
            .format(i=i, v=v))
        wget_args.append(
            'https://api-v2.soundcloud.com/tracks/{i}/playlists_without_albums?offset=10&limit=10&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&app_version={v}'
            .format(i=i, v=v))

    if item_type == 'api':
        start, stop = item_value.split('-')
        for i in range(int(start), int(stop) + 1):
            add_api(i, 1500299175)
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
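# Illustrative sketch (not part of the grab script): an 'api:START-STOP' item value
# is an inclusive numeric track-id range; each id is fed to add_api() above with the
# pinned app_version 1500299175. track_ids_for_item is a hypothetical name.
def track_ids_for_item(item_value):
    start, stop = item_value.split('-')
    return list(range(int(start), int(stop) + 1))

# track_ids_for_item('1000-1002') -> [1000, 1001, 1002]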
_, _, _, pipeline_id = monitoring.pipeline_id() wpull_args = WpullArgs( default_user_agent=DEFAULT_USER_AGENT, wpull_exe=WPULL_EXE, youtube_dl_exe=YOUTUBE_DL, finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"], warc_max_size=WARC_MAX_SIZE, monitor_disk=WPULL_MONITOR_DISK, monitor_memory=WPULL_MONITOR_MEMORY, ) check_wpull_args(wpull_args) wpull_env = { 'ITEM_IDENT': ItemInterpolation('%(ident)s'), 'LOG_KEY': ItemInterpolation('%(log_key)s'), 'REDIS_URL': REDIS_URL, 'PATH': os.environ['PATH'], } if OPENSSL_CONF: wpull_env['OPENSSL_CONF'] = OPENSSL_CONF if TMPDIR: wpull_env['TMPDIR'] = TMPDIR pipeline = Pipeline( CheckIP(), CheckLocalWebserver(), GetItemFromQueue(control, pipeline_id, downloader, ao_only=env.get('AO_ONLY'), large=env.get('LARGE'), version_check = (VERSION, pipeline_version)),
accept_on_exit_code=[0, 4, 8], env={ "item_dir": ItemValue("item_dir"), "item_value": ItemValue("item_value"), "item_type": ItemValue("item_type"), 'warc_file_base': ItemValue('warc_file_base') }), Deduplicate(), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ ItemInterpolation( "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz") ] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=20, default="20", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
PrepareDirectories(warc_prefix=TRACKER_ID), WgetDownload( WgetArgs(), max_tries=2, accept_on_exit_code=[0, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'warc_file_base': ItemValue('warc_file_base') } ), SetBadUrls(), PrepareStatsForTracker( defaults={'downloader': downloader, 'version': VERSION}, file_groups={ 'data': [ ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst') ] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent(NumberConfigValue(min=1, max=20, default='20', name='shared:rsync_threads', title='Rsync threads', description='The maximum number of concurrent uploads.'), UploadWithTracker( 'http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION, files=[ ItemInterpolation('%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'), ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
def realize(self, item):
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--load-cookies', 'cookies.txt',
        '--content-on-error',
        '--no-http-keep-alive',
        '--lua-script', 'telegram.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 't.me,telegram.org',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'x-wget-at-project-version: ' + VERSION,
        '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
        '--warc-dedup-url-agnostic',
        '--warc-compression-use-zstd',
        '--warc-zstd-dict-no-include',
        '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8',
        '--secure-protocol', 'TLSv1_2'
    ]

    dict_data = ZstdDict.get_dict()
    with open(os.path.join(item['item_dir'], 'zstdict'), 'wb') as f:
        f.write(dict_data['dict'])
    item['dict_id'] = dict_data['id']
    item['dict_project'] = TRACKER_ID
    wget_args.extend([
        '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),
    ])

    # Drop any 'user:' items before queueing; the loop below handles the rest.
    item['item_name'] = '\0'.join(
        s for s in item['item_name'].split('\0')
        if not s.startswith('user:'))

    for item_name in item['item_name'].split('\0'):
        wget_args.extend(
            ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
        wget_args.append('item-name://' + item_name)
        item_type, item_value = item_name.split(':', 1)
        if item_type == 'post':
            group, post_id = item_value.split(':', 1)
            wget_args.extend(
                ['--warc-header', 'telegram-post: {}/{}'.format(group, post_id)])
            wget_args.append('https://t.me/{}/{}?embed=1'.format(group, post_id))
        elif item_type == 'channel':
            wget_args.extend(['--warc-header', 'telegram-channel: ' + item_value])
            wget_args.append('https://t.me/s/' + item_value)
        #elif item_type == 'url':
        #    wget_args.extend(['--warc-header', 'telegram-resource: ' + item_value])
        #    wget_args.append(item_value)
        else:
            raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
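# Illustrative sketch (not part of the grab script): the code above only implies the
# shape of ZstdDict.get_dict() -- a mapping with a 'dict' key (raw dictionary bytes,
# written to %(item_dir)s/zstdict and passed to wget-at via --warc-zstd-dict) and an
# 'id' key (stored on the item and later used in the .warc.zst filename). A stub with
# that assumed shape, for local testing only:
def fake_get_dict():
    return {'id': '0', 'dict': b'placeholder-dictionary-bytes'}  # placeholder values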
def realize(self, item): with open('user-agents', 'r') as f: user_agent = random.choice(list(f)).strip() wget_args = [ WGET_AT, '-U', user_agent, '-nv', '--no-cookies', '--content-on-error', '--lua-script', 'reddit.lua', '-o', ItemInterpolation('%(item_dir)s/wget.log'), '--no-check-certificate', '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output', '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf', '--no-parent', '--page-requisites', '--timeout', '30', '--tries', 'inf', '--domains', 'reddit.com', '--header', 'Cookie: over18=1; _options=%7B%22pref_quarantine_optin%22%3A%20true%7D', '--span-hosts', '--waitretry', '30', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), '--warc-header', 'operator: Archive Team', '--warc-header', 'x-wget-at-project-version: ' + VERSION, '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID, '--warc-dedup-url-agnostic', '--warc-compression-use-zstd', '--warc-zstd-dict-no-include', '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8' ] dict_data = ZstdDict.get_dict() with open(os.path.join(item['item_dir'], 'zstdict'), 'wb') as f: f.write(dict_data['dict']) item['dict_id'] = dict_data['id'] item['dict_project'] = 'reddit' wget_args.extend([ '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'), ]) for item_name in item['item_name'].split('\0'): wget_args.extend( ['--warc-header', 'x-wget-at-project-item-name: ' + item_name]) wget_args.append('item-name://' + item_name) item_type, item_value = item_name.split(':', 1) if item_type in ('post', 'comment'): if item_type == 'post': wget_args.extend( ['--warc-header', 'reddit-post: ' + item_value]) wget_args.append( 'https://www.reddit.com/api/info.json?id=t3_' + item_value) elif item_type == 'comment': wget_args.extend( ['--warc-header', 'reddit-comment: ' + item_value]) wget_args.append( 'https://www.reddit.com/api/info.json?id=t1_' + item_value) else: raise Exception('Unknown item') item['item_name_newline'] = item['item_name'].replace('\0', '\n') if 'bind_address' in globals(): wget_args.extend(['--bind-address', globals()['bind_address']]) print('') print('*** Wget will bind address at {0} ***'.format( globals()['bind_address'])) print('') return realize(wget_args, item)
def realize(self, item): wget_args = [ WGET_AT, '-U', USER_AGENT, '-nv', '--content-on-error', '--lua-script', 'halo.lua', '-o', ItemInterpolation('%(item_dir)s/wget.log'), '--no-check-certificate', '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output', '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf', '--no-parent', '--page-requisites', '--timeout', '30', '--tries', 'inf', '--domains', 'bungie.net', '--span-hosts', '--waitretry', '30', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), '--warc-header', 'operator: Archive Team', '--warc-header', 'x-wget-at-project-version: ' + VERSION, '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID, '--warc-dedup-url-agnostic', ] def queue_range(start, end, url_prefix, header_prefix): for i in range(start, end + 1): wget_args.extend( ['--warc-header', header_prefix + ': ' + str(i)]) wget_args.append(url_prefix + str(i)) def process_sequential(s, url_prefix, header_prefix): if '-' in s: start, end = s.split('-') else: start, end = int(s), int(s) queue_range(int(start), int(end), url_prefix, header_prefix) item_names = item['item_name'].split('\0') for item_name in item_names[:]: wget_args.extend( ['--warc-header', 'x-wget-at-project-item-name: ' + item_name]) wget_args.append('item-name://' + item_name) item_type, item_value = item_name.split(':', 1) if item_type == 'reach-file': process_sequential( item_value, 'http://halo.bungie.net/Stats/Reach/FileDetails.aspx?fid=', 'halo-bungie-reach-file') elif item_type == 'reach-guid': wget_args.extend([ '--warc-header', 'halo-bungie-reach-game-guid: ' + item_value ]) wget_args.append( 'http://halo.bungie.net/Stats/Reach/GameStats.aspx?guid=' + item_value) elif item_type == 'reach-stats': process_sequential( item_value, 'http://halo.bungie.net/Stats/Reach/GameStats.aspx?gameid=', 'halo-bungie-reach-game-stats') elif item_type in ('player', 'reach-player'): item_names.remove(item_name) # wget_args.extend(['--warc-header', 'halo-bungie-player: '+item_value]) # wget_args.extend(['--warc-header', 'halo-bungie-reach-player: '+item_value]) # wget_args.extend(['--warc-header', 'halo-bungie-halo2-player: '+item_value]) # wget_args.extend(['--warc-header', 'halo-bungie-halo3-player: '+item_value]) # wget_args.append('http://halo.bungie.net/Stats/Reach/default.aspx?player='+item_value) # wget_args.append('http://halo.bungie.net/Stats/PlayerStatsHalo2.aspx?player='+item_value) # wget_args.append('http://halo.bungie.net/Stats/Halo3/default.aspx?player='+item_value) elif item_type == 'reach-pxd': wget_args.extend([ '--warc-header', 'halo-bungie-reach-player: ' + item_value ]) wget_args.append( 'http://halo.bungie.net/Stats/Reach/default.aspx?pxd=' + item_value) else: raise ValueError('item_type not supported.') item['item_name'] = '\0'.join(item_names) item['item_name_newline'] = '\n'.join(item_names) if 'bind_address' in globals(): wget_args.extend(['--bind-address', globals()['bind_address']]) print('') print('*** Wget will bind address at {0} ***'.format( globals()['bind_address'])) print('') return realize(wget_args, item)
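# Illustrative sketch (not part of the grab script): what process_sequential() above
# yields for a small range item. expand_range is a hypothetical standalone equivalent.
def expand_range(s, url_prefix):
    if '-' in s:
        start, end = s.split('-')
    else:
        start, end = s, s
    return [url_prefix + str(i) for i in range(int(start), int(end) + 1)]

# expand_range('100-102', 'http://halo.bungie.net/Stats/Reach/GameStats.aspx?gameid=')
# -> the GameStats URLs for game ids 100, 101 and 102.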
def realize(self, item):
    wget_args = [
        WGET_AT,
        '-U', USER_AGENT,
        '-nv',
        '--content-on-error',
        '--lua-script', 'storyfire.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', 'storyfire.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', 'x-wget-at-project-version: ' + VERSION,
        '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
        '--warc-dedup-url-agnostic',
    ]

    for item_name in item['item_name'].split('\0'):
        wget_args.extend(
            ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
        wget_args.append('item-name://' + item_name)
        item_type, item_value = item_name.split(':', 1)
        if item_type == 'video':
            wget_args.extend(
                ['--warc-header', 'storyfire-video: ' + item_value])
            wget_args.append('https://storyfire.com/video-details/' + item_value)
        elif item_type == 'user':
            wget_args.extend(
                ['--warc-header', 'storyfire-user: ' + item_value])
            wget_args.append('https://storyfire.com/user/' + item_value)
        else:
            raise ValueError('item_type not supported.')

    item['item_name_newline'] = item['item_name'].replace('\0', '\n')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
max_tries=2, accept_on_exit_code=[0, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'item_value': ItemValue('item_value'), 'item_type': ItemValue('item_type'), 'warc_file_base': ItemValue('warc_file_base'), }), Deduplicate(), PrepareStatsForTracker( defaults={ 'downloader': downloader, 'version': VERSION }, file_groups={ 'data': [ ItemInterpolation( '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz') ] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=20, default='20', name='shared:rsync_threads', title='Rsync threads', description='The maximum number of concurrent uploads.'), UploadWithTracker( 'http://%s/%s' % (TRACKER_HOST, TRACKER_ID), downloader=downloader,
def realize(self, item): with open('user-agents', 'r') as f: user_agent = random.choice(list(f)).strip() wget_args = [ WGET_AT, '-U', user_agent, '-nv', '--content-on-error', '--load-cookies', 'cookies.txt', '--lua-script', 'niconico.lua', '-o', ItemInterpolation('%(item_dir)s/wget.log'), '--no-check-certificate', '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output', '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf', '--no-parent', '--page-requisites', '--timeout', '30', '--tries', 'inf', '--domains', 'voat.co', '--span-hosts', '--waitretry', '30', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), '--warc-header', 'operator: Archive Team', '--warc-header', 'x-wget-at-project-version: ' + VERSION, '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID, '--warc-dedup-url-agnostic', '--header', 'Accept-Language: ja', '--header', 'Content-Type: text/plain', ] item_names = item['item_name'].split('\0') for item_name in item_names: wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name]) wget_args.append('item-name://'+item_name) item_type, item_value = item_name.split(':', 1) if item_type == 'vid': wget_args.extend(['--warc-header', 'niconico-vid: '+item_value]) wget_args.append('https://www.nicovideo.jp/watch/' + item_value) else: raise ValueError('item_type not supported.') #item_name = item['item_name'] #item_type, item_value = item_name.split(':') #item['item_type'] = item_type #item['item_value'] = item_value #if item_type == "metadatarange": #wget_args.extend(['--warc-header', 'niconico-metadatarange: ' + item_value]) #[prefix, start, end] = item_value.split("-") #for i in range(int(start), int(end)): #wget_args.append(f'https://www.nicovideo.jp/watch/{prefix}{i}') #else: #raise ValueError('item_type not supported.') if 'bind_address' in globals(): wget_args.extend(['--bind-address', globals()['bind_address']]) print('') print('*** Wget will bind address at {0} ***'.format( globals()['bind_address'])) print('') return realize(wget_args, item)
def realize(self, item): wget_args = [ WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies', #'--load-cookies', 'cookies.txt', '--lua-script', 'wikispaces.lua', '-o', ItemInterpolation('%(item_dir)s/wget.log'), '--no-check-certificate', '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output', '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf', '--no-parent', '--page-requisites', '--timeout', '30', '--tries', 'inf', '--domains', 'wikispaces.com', '--span-hosts', '--waitretry', '30', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), '--warc-header', 'operator: Archive Team', '--warc-header', 'wikispaces-dld-script-version: ' + VERSION, '--warc-header', ItemInterpolation('wikispaces-item: %(item_name)s'), ] item_name = item['item_name'] assert ':' in item_name item_type, item_value = item_name.split(':', 1) item['item_type'] = item_type item['item_value'] = item_value if item_type == 'wiki': wget_args.extend(['--warc-header', 'wikispace: ' + item_value]) wget_args.append( 'https://{}.wikispaces.com/space/content?utable=WikiTablePageList&ut_csv=1' .format(item_value)) wget_args.append( 'https://{}.wikispaces.com/space/content'.format(item_value)) wget_args.append('https://{}.wikispaces.com/'.format(item_value)) wget_args.append( 'https://{}.wikispaces.com/robots.txt'.format(item_value)) wget_args.append( 'https://{}.wikispaces.com/sitemap.xml'.format(item_value)) else: raise Exception('Unknown item') if 'bind_address' in globals(): wget_args.extend(['--bind-address', globals()['bind_address']]) print('') print('*** Wget will bind address at {0} ***'.format( globals()['bind_address'])) print('') return realize(wget_args, item)
def realize(self, item):
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "reddit.lua",
        "--load-cookies", "cookies",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "reddit.com,redditmedia.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "reddit-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("reddit-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('36comments',)

    if item_type == '36comments':
        suffixes = string.digits + string.ascii_lowercase
        for url in ['http://redd.it/{0}{1}'.format(item_value, a) for a in suffixes]:
            wget_args.append(url)
        # for suffix in suffixes:
        #     commenturl = 'https://www.reddit.com/comments/{0}{1}/'.format(item_value, suffix)
        #     html = requests.get(commenturl, headers={'User-Agent': 'ArchiveTeam'})
        #     print('Downloaded', html.status_code, getattr(html, 'reason'))
        #     sys.stdout.flush()
        #     if html.status_code == 200:
        #         if not html.text:
        #             raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
        #         else:
        #             for origurl in re.findall(r'href="(https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/{0}{1}\/[^"]+)"'.format(item_value, suffix), html.text):
        #                 if (re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/', origurl) or re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/', origurl)) and not re.search(r'https?:\/\/www\.reddit\.com\/r\/[^/]+\/comments\/[^/]+\/[^/]+\/.', origurl):
        #                     wget_args.append(origurl)
        #     elif html.status_code == 404:
        #         print('This url is 404.')
        #     else:
        #         raise Exception('Something went wrong during the download. ({0})'.format(html.status_code))
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
def realize(self, item): wget_args = [ WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies', '--lua-script', 'yourshot.lua', '-o', ItemInterpolation('%(item_dir)s/wget.log'), '--no-check-certificate', '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output', '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf', '--no-parent', '--page-requisites', '--timeout', '30', '--tries', 'inf', '--domains', 'sonymobile.com', '--span-hosts', '--waitretry', '30', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), '--warc-header', 'operator: Archive Team', '--warc-header', 'yourshot-dld-script-version: ' + VERSION, '--warc-header', ItemInterpolation('yourshot-item: %(item_name)s'), ] item_name = item['item_name'] assert ':' in item_name item_type, item_value = item_name.split(':', 1) item['item_type'] = item_type item['item_value'] = item_value http_client = httpclient.HTTPClient() if item_type == 'photos': start, end = item_value.split('-', 1) for i in range(int(start), int(end) + 1): wget_args.extend( ['--warc-header', 'yourshot-photo-id: {}'.format(i)]) wget_args.append( 'https://yourshot.nationalgeographic.com/photos/{}/'. format(i)) else: raise Exception('Unknown item') http_client.close() if 'bind_address' in globals(): wget_args.extend(['--bind-address', globals()['bind_address']]) print('') print('*** Wget will bind address at {0} ***'.format( globals()['bind_address'])) print('') return realize(wget_args, item)
# be too big. The deadline is optional.
project = Project(
    title="sourceforgersync",
    project_html="""
    <img class="project-logo" alt="Project logo" src="" height="50px" title=""/>
    <h2>sourceforge.net <span class="links"><a href="http://sourceforge.net/">Website</a> · <a href="http://tracker.archiveteam.org/sourceforge/">Leaderboard</a></span></h2>
    <p>Saving all projects from SourceForge by rsyncing all of the source code repositories.</p>
    """
)

pipeline = Pipeline(
    CheckIP(),
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION),
    ExternalProcess("Size Test", [RSYNC_TEST, "-t", getRsyncURL("foo"), "-m", MAX_RSYNC]),
    LimitConcurrent(1,
        ExternalProcess("rsync", ["rsync", "-av", getRsyncURL("foo"),
                                  cleanItem("%(data_dir)s/%(item_name)s")])),
    ExternalProcess("tar", ["tar", "-czf", cleanItem("%(data_dir)s/%(item_name)s.tar.gz"),
                            "-C", ItemInterpolation("%(data_dir)s/"),
                            "--owner=1999", "--group=2015", "--no-same-permissions",
                            cleanItem("%(item_name)s")]),
    LimitConcurrent(NumberConfigValue(min=1, max=4, default="1",
        name="shared:rsync_threads", title="Rsync threads",
        description="The maximum number of concurrent uploads."),
        UploadWithTracker(
            "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
            downloader=downloader,
            version=VERSION,
            files=[
                cleanItem("%(data_dir)s/%(item_name)s.tar.gz")
                #ItemInterpolation("foo.tar.gz")
            ],
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            rsync_extra_args=[
                "--recursive",
                "--partial",
project_html=""" <img class="project-logo" alt="" src="http://archiveteam.org/images/b/b2/Puush_logo.png" /> <h2>Puush <span class="links"><a href="http://puush.me/">Website</a> · <a href="http://%s/%s/">Leaderboard</a></span></h2> <p><b>Puush</b> adds expiry dates to their files.</p> """ % (TRACKER_HOST, TRACKER_ID) # , utc_deadline = datetime.datetime(2013,08,01, 00,00,1) ) pipeline = Pipeline( GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), ExtraItemParams(), PrepareDirectories(warc_prefix="puush"), SpecializedWgetDownloadMany([ WGET_LUA, "-U", USER_AGENT, "-nv", "-o", ItemInterpolation("%(item_dir)s/wget.log"), "--lua-script", "puush.lua", "--no-check-certificate", "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"), "--truncate-output", "-e", "robots=off", "--rotate-dns", "--timeout", "60", "--tries", "20", "--waitretry", "5", "--warc-file", ItemInterpolation("%(item_dir)s/%(current_warc_file_base)s"), "--warc-header", "operator: Archive Team", "--warc-header", "puush-dld-script-version: " + VERSION, ], URLsToDownload(), max_tries=20,
accept_on_exit_code=[0, 4, 7, 8], env={ "item_dir": ItemValue("item_dir"), "item_value": ItemValue("item_value"), "item_type": ItemValue("item_type"), "escaped_item_name": ItemValue("escaped_item_name"), "downloader": downloader }), ProcessScrapeFile(), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"), ItemInterpolation( "%(item_dir)s/twitpic2-scrape-%(escaped_item_name)s.txt.gz" ) ] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."),
def realize(self, item):
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--no-cookies",
        "--lua-script", "flickr.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "flickr.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "flickr-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("flickr-item: %(item_name)s"),
        "--no-warc-compression",
    ]

    item_name = item['item_name']
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    http_client = httpclient.HTTPClient()

    if item_type == 'user':
        wget_args.extend(['--warc-header', 'flickr-user: {}'.format(item_value)])
        wget_args.append('https://www.flickr.com/photos/{}/'.format(item_value))

    if item_type == 'disco':
        raise Exception('Skipping...')  # item type disabled; the code below never runs
        try:
            r = http_client.fetch(
                'https://www.flickr.com/photos/{}/'.format(item_value),
                method='GET')
        except httpclient.HTTPError as e:
            r = e.response
        if r.code == 404:
            print('Account was deleted.')
            wget_args.append('https://www.flickr.com/photos/{}/'.format(item_value))
        elif r.code != 200:
            raise Exception('Bad status code, {}.'.format(r.code))
        else:
            text = r.body.decode('utf-8', 'ignore')
            api_key = re.search(
                'root\.YUI_config\.flickr\.api\.site_key\s*=\s*"([^"]+)";',
                text).group(1)
            req_id = re.search(
                'root\.YUI_config\.flickr\.request\.id\s*=\s*"([^"]+)";',
                text).group(1)
            item.log_output('Found api_key {} and req_id {}.'.format(api_key, req_id))
            wget_args.append(
                'https://api.flickr.com/services/rest?per_page=50&page=1&extras=can_addmeta%2Ccan_comment%2Ccan_download%2Ccan_share%2Ccontact%2Ccount_comments%2Ccount_faves%2Ccount_views%2Cdate_taken%2Cdate_upload%2Cdescription%2Cicon_urls_deep%2Cisfavorite%2Cispro%2Clicense%2Cmedia%2Cneeds_interstitial%2Cowner_name%2Cowner_datecreate%2Cpath_alias%2Crealname%2Crotation%2Csafety_level%2Csecret_k%2Csecret_h%2Curl_c%2Curl_f%2Curl_h%2Curl_k%2Curl_l%2Curl_m%2Curl_n%2Curl_o%2Curl_q%2Curl_s%2Curl_sq%2Curl_t%2Curl_z%2Cvisibility%2Cvisibility_source%2Co_dims%2Cpubliceditability&get_user_info=1&jump_to=&user_id={}&view_as=use_pref&sort=use_pref&viewerNSID=&method=flickr.people.getPhotos&csrf=&api_key={}&format=json&hermes=1&hermesClient=1&reqId={}&nojsoncallback=1'
                .format(item_value, api_key, req_id))
    elif item_type == 'photos':
        raise Exception('Skipping...')  # item type disabled; the code below never runs
        r = http_client.fetch('http://195.201.219.254/' + item_value, method='GET')
        user = item_value.split('/')[0]
        for i in r.body.decode('utf-8', 'ignore').splitlines():
            i = i.strip()
            wget_args.extend(['--warc-header', 'flickr-photo: {}'.format(i)])
            wget_args.extend(['--warc-header', 'flickr-photo-user: {}'.format(user)])
            wget_args.extend(['--warc-header', 'flickr-photo-{}-user: {}'.format(i, user)])
            wget_args.append('https://www.flickr.com/photos/{}/{}/'.format(user, i))
            wget_args.append('https://www.flickr.com/photos/{}/{}/sizes/'.format(user, i))
            wget_args.append('https://www.flickr.com/video_download.gne?id={}'.format(i))
    elif item_type == 'photoscc':
        r = http_client.fetch('http://195.201.219.254/' + item_value, method='GET')
        for s in r.body.decode('utf-8', 'ignore').splitlines():
            s = s.strip()
            if s.startswith('www.flickr.com/photos/'):
                s = '/'.join(s.split('/')[2:4])
            elif s.startswith('flickr.com/'):
                s = s.split('/', 1)[1].rstrip('/')
            # NOTE: do not replace with anything that skips invalid urls,
            # we want to catch those with pipeline aborts
            user, i = s.split('/')
            wget_args.extend(['--warc-header', 'flickr-photo: {}'.format(i)])
            wget_args.extend(['--warc-header', 'flickr-photo-user: {}'.format(user)])
            wget_args.extend(['--warc-header', 'flickr-photo-{}-user: {}'.format(i, user)])
            wget_args.append('https://www.flickr.com/photos/{}/{}/'.format(user, i))
            wget_args.append('https://www.flickr.com/video_download.gne?id={}'.format(i))
    else:
        raise Exception('Unknown item')

    http_client.close()

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
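# Illustrative sketch (not part of the grab script): how a line from the photoscc
# listing is normalised to a (user, photo id) pair before URLs are queued, mirroring
# the branch above. normalise_line is a hypothetical name.
def normalise_line(s):
    s = s.strip()
    if s.startswith('www.flickr.com/photos/'):
        s = '/'.join(s.split('/')[2:4])
    elif s.startswith('flickr.com/'):
        s = s.split('/', 1)[1].rstrip('/')
    # Strict on purpose: a malformed line raises ValueError and aborts the item.
    user, photo_id = s.split('/')
    return user, photo_id

# normalise_line('www.flickr.com/photos/someuser/12345/') -> ('someuser', '12345')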
def realize(self, item): wget_args = [ WGET_LUA, "-U", random.choice(USER_AGENTS), "-nv", "--lua-script", "twitpic.lua", "-o", ItemInterpolation("%(item_dir)s/wget.log"), "--no-check-certificate", "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"), "--truncate-output", "-e", "robots=off", # "-w", "1", "--no-cookies", "--rotate-dns", # "--recursive", "--level=inf", "--no-parent", # "--page-requisites", "--timeout", "30", "--tries", "inf", "--span-hosts", "--waitretry", "30", "--domains", "twitpic.com,cloudfront.net,twimg.com,amazonaws.com", "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"), "--warc-header", "operator: Archive Team", "--warc-header", "twitpic2-dld-script-version: " + VERSION, "--warc-header", ItemInterpolation("twitpic2-user: %(item_name)s"), "--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "--header", "DNT: 1", "--header", random.choice(ACCEPT_LANGUAGE_HEADERS), ] item_name = item['item_name'] assert ':' in item_name item_type, item_value = item_name.split(':', 1) item['item_type'] = item_type item['item_value'] = item_value assert item_type in ('image', ) if item_type == 'image': start_id, end_id = item_value.split(':', 1) start_num = str_to_int(start_id) end_num = str_to_int(end_id) for num in range(start_num, end_num + 1): twitpic_name = int_to_str(num) url = 'http://twitpic.com/{0}'.format(twitpic_name) wget_args.append(url) else: raise Exception('Unknown item') if 'bind_address' in globals(): wget_args.extend(['--bind-address', globals()['bind_address']]) print('') print('*** Wget will bind address at {0} ***'.format( globals()['bind_address'])) print('') return realize(wget_args, item)
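# Illustrative sketch (not part of the grab script): str_to_int / int_to_str are
# defined elsewhere in the script; the range expansion above only needs them to be
# inverses of each other over twitpic short codes. A plausible base-36 version,
# assuming an alphabet of digits followed by lowercase letters (the same ordering
# as the suffix strings used in the other twitpic and reddit scripts):
import string

_ALPHABET = string.digits + string.ascii_lowercase  # assumed ordering

def str_to_int(s):
    n = 0
    for c in s:
        n = n * len(_ALPHABET) + _ALPHABET.index(c)
    return n

def int_to_str(n):
    if n == 0:
        return _ALPHABET[0]
    out = ''
    while n:
        n, r = divmod(n, len(_ALPHABET))
        out = _ALPHABET[r] + out
    return out

# int_to_str(str_to_int('example')) == 'example'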
max_tries=2, accept_on_exit_code=[0, 4, 8], env={ 'item_dir': ItemValue('item_dir'), 'item_value': ItemValue('item_value'), 'item_type': ItemValue('item_type'), 'warc_file_base': ItemValue('warc_file_base') }), PrepareStatsForTracker( defaults={ 'downloader': downloader, 'version': VERSION }, file_groups={ 'data': [ ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz') #ItemInterpolation('%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz') ] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=20, default='20', name='shared:rsync_threads', title='Rsync threads', description='The maximum number of concurrent uploads.'), UploadWithTracker(
phantomjs_exe=PHANTOMJS, finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"], warc_max_size=WARC_MAX_SIZE) pipeline = Pipeline( CheckIP(), GetItemFromQueue(control, pipeline_id, downloader, ao_only=env.get('AO_ONLY'), large=env.get('LARGE')), StartHeartbeat(control), SetFetchDepth(), PreparePaths(), WriteInfo(), DownloadUrlFile(control), WgetDownload(wpull_args, accept_on_exit_code=AcceptAny(), env={ 'ITEM_IDENT': ItemInterpolation('%(ident)s'), 'LOG_KEY': ItemInterpolation('%(log_key)s'), 'REDIS_URL': REDIS_URL, 'PATH': os.environ['PATH'] }), RelabelIfAborted(control), WriteInfo(), MoveFiles(), LimitConcurrent( 2, RsyncUpload(target=RSYNC_URL, target_source_path=ItemInterpolation("%(data_dir)s"), files=ItemValue("all_target_files"), extra_args=['--partial', '--partial-dir', '.rsync-tmp'])), StopHeartbeat(), MarkItemAsDone(control, EXPIRE_TIME)) def stop_control(): #control.flag_logging_thread_for_termination()
accept_on_exit_code=[0], # [0, 4, 8], #changed env={ 'item_dir': ItemValue('item_dir'), 'item_value': ItemValue('item_value'), 'item_type': ItemValue('item_type'), 'warc_file_base': ItemValue('warc_file_base'), 'todo_url_count': ItemValue('todo_url_count'), }), PrepareStatsForTracker( defaults={ 'downloader': downloader, 'version': VERSION }, file_groups={ 'data': [ ItemInterpolation( '%(item_dir)s/%(warc_file_base)s.warc.gz') #TODO ? ] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=20, default='20', name='shared:rsync_threads', title='Rsync threads', description='The maximum number of concurrent uploads.'), UploadWithTracker( 'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
def realize(self, item): wget_args = [ WGET_AT, '-U', USER_AGENT, '-nv', '--content-on-error', '--lua-script', 'so-net-u-page-plus.lua', '-o', ItemInterpolation('%(item_dir)s/wget.log'), '--no-check-certificate', '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output', '-e', 'robots=off', '--rotate-dns', '--recursive', '--level=inf', '--no-parent', '--page-requisites', '--timeout', '30', '--tries', 'inf', '--domains', 'upp.so-net.ne.jp', '--span-hosts', '--waitretry', '30', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), '--warc-header', 'operator: Archive Team', '--warc-header', 'x-wget-at-project-version: ' + VERSION, '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID, '--warc-dedup-url-agnostic', ] for item_name in item['item_name'].split('\0'): wget_args.extend( ['--warc-header', 'x-wget-at-project-item-name: ' + item_name]) wget_args.append('item-name://' + item_name) item_type, item_value = item_name.split(':', 1) if item_type == "userdir": wget_args.extend([ '--warc-header', 'so-net-u-page-plus-userdir: ' + item_value ]) hostname = item_value.split("/")[0] user_dir_name = item_value.split("/")[1] wget_args.append('http://{}.upp.so-net.ne.jp/{}/'.format( hostname, user_dir_name)) # Alternate forms, because I amn't sure how they parse this wget_args.append( 'http://{}.upp.so-net.ne.jp/{}/index.htm'.format( hostname, user_dir_name)) wget_args.append( 'http://{}.upp.so-net.ne.jp/{}/index.html'.format( hostname, user_dir_name)) wget_args.append('http://{}.upp.so-net.ne.jp/{}'.format( hostname, user_dir_name)) else: raise ValueError('item_type not supported.') item['item_name_newline'] = item['item_name'].replace('\0', '\n') if 'bind_address' in globals(): wget_args.extend(['--bind-address', globals()['bind_address']]) print('') print('*** Wget will bind address at {0} ***'.format( globals()['bind_address'])) print('') return realize(wget_args, item)
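# Illustrative sketch (not part of the grab script): how a 'userdir:HOST/DIR' item
# value expands into the four candidate start URLs queued above. userdir_urls is a
# hypothetical name.
def userdir_urls(item_value):
    hostname, user_dir_name = item_value.split('/')[0], item_value.split('/')[1]
    base = 'http://{}.upp.so-net.ne.jp/{}'.format(hostname, user_dir_name)
    return [base + '/', base + '/index.htm', base + '/index.html', base]

# userdir_urls('www001/example') -> four URLs under www001.upp.so-net.ne.jp/example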
l.split('"')[1] for l in lines if l.strip() and not "schedule=" in l) except KeyboardInterrupt: raise except: print traceback.format_exc() print "Continuing anyway..." usernames = [] with open("%(data_dir)s/%(warc_file_base)s.friends" % item, "wb") as f: f.write("\n".join(usernames) + "\n") wget_args = [ WGET_LUA, "-U", ItemInterpolation("%(user_agent)s"), "-nv", "-o", ItemInterpolation("%(item_dir)s/wget.log"), "--lua-script", "wretch.lua", "--no-check-certificate", "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"), "--truncate-output", "-e", "robots=off", "--no-cookies", "--rotate-dns", "--recursive", "--level=inf",
CheckIP(), GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION), PrepareDirectories(warc_prefix="livejournaldisco"), ExternalProcess('Scraper', CustomProcessArgs(), max_tries=2, accept_on_exit_code=[0], env={"item_dir": ItemValue("item_dir")}), PrepareStatsForTracker( defaults={ "downloader": downloader, "version": VERSION }, file_groups={ "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.txt")] }, id_function=stats_id_function, ), MoveFiles(), LimitConcurrent( NumberConfigValue( min=1, max=4, default="1", name="shared:rsync_threads", title="Rsync threads", description="The maximum number of concurrent uploads."), UploadWithTracker( "http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader=downloader, version=VERSION,
def realize(self, item): wget_args = [ WGET_LUA, "-U", random.choice(USER_AGENTS), "-nv", "--lua-script", "twitpic.lua", "-o", ItemInterpolation("%(item_dir)s/wget.log"), "--no-check-certificate", "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"), "--truncate-output", "-e", "robots=off", "--no-cookies", "--rotate-dns", # Do download recursive, we're checking the urls in twitpic.lua "--recursive", "--level=inf", "--no-parent", "--page-requisites", "--timeout", "30", "--tries", "inf", "--span-hosts", "--waitretry", "30", "--domains", "twitpic.com,cloudfront.net,twimg.com,amazonaws.com", "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"), "--warc-header", "operator: Archive Team", "--warc-header", "twitpic-dld-script-version: " + VERSION, "--warc-header", ItemInterpolation("twitpic-user: %(item_name)s"), "--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "--header", "DNT: 1", "--header", random.choice(ACCEPT_LANGUAGE_HEADERS), ] item_name = item['item_name'] assert ':' in item_name item_type, item_value = item_name.split(':', 1) item['item_type'] = item_type item['item_value'] = item_value assert item_type in ('image', 'user', 'tag', 'event') if item_type == 'image': suffixes = string.digits + string.lowercase for args in [('http://twitpic.com/{0}{1}'.format(item_value, s), \ 'http://twitpic.com/show/thumb/{0}{1}'.format(item_value, s), \ 'http://twitpic.com/show/large/{0}{1}'.format(item_value, s), \ 'http://twitpic.com/show/mini/{0}{1}'.format(item_value, s)) for s in suffixes]: wget_args.append(args[0]) wget_args.append(args[1]) wget_args.append(args[2]) wget_args.append(args[3]) elif item_type == 'user': wget_args.append('http://twitpic.com/photos/{0}'.format(item_value)) wget_args.append('http://twitpic.com/events/{0}'.format(item_value)) wget_args.append('http://twitpic.com/places/{0}'.format(item_value)) wget_args.append('http://twitpic.com/faces/{0}'.format(item_value)) wget_args.append('http://api.twitpic.com/2/users/show.json?username={0}'.format(item_value)) wget_args.append('http://api.twitpic.com/2/places/show.json?user={0}'.format(item_value)) wget_args.append('http://api.twitpic.com/2/events/show.json?user={0}'.format(item_value)) elif item_type == 'tag': wget_args.append('http://twitpic.com/tag/{0}'.format(item_value)) wget_args.append('http://api.twitpic.com/2/tags/show.json?tag={0}'.format(item_value)) wget_args.append('http://twitpic.com/tag/{0}.json'.format(item_value)) elif item_type == 'event': wget_args.append('http://api.twitpic.com/2/event/show.json?id={0}'.format(item_value)) else: raise Exception('Unknown item') if 'bind_address' in globals(): wget_args.extend(['--bind-address', globals()['bind_address']]) print('') print('*** Wget will bind address at {0} ***'.format( globals()['bind_address'])) print('') return realize(wget_args, item)
def realize(self, item): wget_args = [ WGET_LUA, '-U', USER_AGENT, '-nv', '--no-cookies', '--lua-script', 'gfycat-disc.lua', '-o', ItemInterpolation('%(item_dir)s/wget.log'), '--no-check-certificate', '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'), '--truncate-output', '-e', 'robots=off', '--rotate-dns', # '--recursive', '--level=inf', # '--no-parent', # '--page-requisites', '--timeout', '30', '--tries', 'inf', # '--domains', 'nationalgeographic.com', # '--span-hosts', '--waitretry', '30', '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'), '--warc-header', 'operator: Archive Team', #'--warc-header', 'yourshot-static-dld-script-version: ' + VERSION, #'--warc-header', ItemInterpolation('yourshot-static-item: %(item_name)s'), # --warc-header yourshot-photo-id: ... filled in below # '--header', 'Accept-Encoding: gzip', # '--compression', 'gzip' # changed flags # ] item_name = item['item_name'] assert ':' in item_name item_type, item_value = item_name.split(':', 1) item['item_type'] = item_type item['item_value'] = item_value if item_type.startswith('AdjAdj'): wget_urls = [] defer_assets = [] photo_ids = [] item_version = None for animal in animals: wget_urls.append("https://api.gfycat.com/v1/gfycats/" + item_value + animal.capitalize()) if item_version is None: item_version = len(wget_urls) item["version"] = item_version item["todo_url_count"] = str(len(wget_urls)) print("URIs ToDo: {}".format(len(wget_urls))) if len(wget_urls) == 0: wget_args.append("-V") else: wget_args.extend(wget_urls) # print("\nD^ ", end="") #debug # print("\nD^ ".join(defer_assets)) #debug else: raise Exception('Unknown item') if 'bind_address' in globals(): wget_args.extend(['--bind-address', globals()['bind_address']]) print('') print('*** Wget will bind address at {0} ***'.format( globals()['bind_address'])) print('') return realize(wget_args, item)
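# Illustrative sketch (not part of the grab script): 'AdjAdj' items carry the two
# adjectives of a gfycat name; the loop above appends every entry of the externally
# defined animals list, capitalised, to build candidate API URLs. With a stand-in
# two-entry list (the real list lives elsewhere in the script):
animals_sample = ['aardvark', 'zebra']  # hypothetical stand-in

def candidate_api_urls(item_value, animals):
    return ['https://api.gfycat.com/v1/gfycats/' + item_value + a.capitalize()
            for a in animals]

# candidate_api_urls('AdmirableAgreeable', animals_sample)
# -> ['...AdmirableAgreeableAardvark', '...AdmirableAgreeableZebra']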