Exemple #1
0
    def realize(self, item):
        """Assemble the wget-lua argument list for one 500px item.

        ``item['item_name']`` has the form ``<type>:<value>``; both halves
        are written back to the item as ``item_type`` and ``item_value``.

        * ``photos`` -- ``<value>`` is a ``;``-separated list of photo ids.
          For every id the photo page and two API URLs are queued.
        * ``all`` -- ``<value>`` is ``<start>-<end>`` (inclusive range of
          ids). Only the photo page is queued for every id.

        Any other type raises ``Exception('Unknown item')``. The finished
        argument list is handed to the module-level ``realize`` helper and
        its result is returned.
        """
        photo_page = 'https://500px.com/photo/{}'
        comments_api = 'https://api.500px.com/v1/photos/{}/comments?sort=created_at&include_subscription=1&include_flagged=1&nested=1&page=1&rpp=30'
        details_api = 'https://api.500px.com/v1/photos?image_size%5B%5D=1&image_size%5B%5D=2&image_size%5B%5D=32&image_size%5B%5D=31&image_size%5B%5D=33&image_size%5B%5D=34&image_size%5B%5D=35&image_size%5B%5D=36&image_size%5B%5D=2048&image_size%5B%5D=4&image_size%5B%5D=14&expanded_user_info=true&include_tags=true&include_geo=true&include_equipment_info=true&include_licensing=true&include_releases=true&liked_by=1&following_sample=100&ids={}'

        # Fixed options, written as flag/value pairs per line.
        wget_args = [
            WGET_LUA,
            '-U', USER_AGENT,
            '-nv',
            '--no-cookies',
            '--lua-script', '500px.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', '500px.com',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', '500px-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('500px-item: %(item_name)s'),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        kind, value = full_name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        if kind == 'photos':
            # The photos/<id>/navigation API endpoint is not queued here.
            for photo_id in value.split(';'):
                wget_args += ['--warc-header', '500px-photo: {}'.format(photo_id)]
                wget_args += [
                    template.format(photo_id)
                    for template in (photo_page, comments_api, details_api)
                ]
        elif kind == 'all':
            first, last = value.split('-')
            for photo_id in range(int(first), int(last) + 1):
                wget_args += ['--warc-header', '500px-photo: {}'.format(photo_id)]
                wget_args.append(photo_page.format(photo_id))
        else:
            raise Exception('Unknown item')

        # Optional source address, taken from a module global when defined.
        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Exemple #2
0
    def realize(self, item):
        """Assemble the wget-lua argument list for a static-only item.

        Reads ``item["item_type"]`` and ``item["item_value"]`` (both must
        already be set on the item). Only the ``static_url`` type is
        handled: its value is queued as the single URL to fetch. Any other
        type raises ``Exception('Unknown item')``.

        Side effects: stores the number of queued URLs (as a string) in
        ``item["todo_url_count"]`` and prints debug lines to stdout.
        Returns the result of the module-level ``realize`` helper.
        """
        # Compared with the sibling grab scripts these flags are switched
        # off here: -nv, --recursive/--level=inf, --no-parent,
        # --page-requisites, --tries inf, --domains, --span-hosts and the
        # gzip Accept-Encoding/--compression options.
        wget_args = [
            WGET_LUA,
            '-U', USER_AGENT,
            '--no-cookies',
            '--lua-script', 'static-only.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--timeout', '30',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'static-only-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('static-only-item: %(item_name)s'),
        ]

        kind = item["item_type"]
        task_line = item["item_value"]

        # static_job_json / static_job_urls item types are still TODO.
        if kind != 'static_url':
            raise Exception('Unknown item')

        print("T>  " + task_line)  # debug output
        pending_urls = [task_line]

        url_count = len(pending_urls)
        item["todo_url_count"] = str(url_count)
        print("URIs ToDo: {}".format(url_count))

        # With nothing to fetch, fall back to "-V" so wget still gets an
        # argument; otherwise queue the collected URLs.
        if pending_urls:
            wget_args += pending_urls
        else:
            wget_args.append("-V")

        # Optional source address, taken from a module global when defined.
        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Exemple #3
0
 WgetDownload(
     WgetArgs(),
     max_tries=2,
     accept_on_exit_code=[0, 4, 7, 8],
     env={
         "item_dir": ItemValue("item_dir"),
         "item_value": ItemValue("item_value"),
         "item_type": ItemValue("item_type"),
         "downloader": downloader
     }
 ),
 PrepareStatsForTracker(
     defaults={"downloader": downloader, "version": VERSION},
     file_groups={
         "data": [
             ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz")
         ]
     },
     id_function=stats_id_function,
 ),
 MoveFiles(),
 LimitConcurrent(NumberConfigValue(min=1, max=4, default="1",
     name="shared:rsync_threads", title="Rsync threads",
     description="The maximum number of concurrent uploads."),
     UploadWithTracker(
         "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
         downloader=downloader,
         version=VERSION,
         files=[
             ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz")
         ],
    def realize(self, item):
        """Assemble the wget-lua argument list for one Ovi Store item.

        ``item['item_name']`` has the form ``<type>:<value>``; both halves
        are written back to the item as ``item_type`` and ``item_value``.
        Only the ``app`` type is supported. For it, the content page and
        the ``/Download`` page are queued for ``<value>`` followed by each
        one-digit suffix (``1``-``9`` when ``<value>`` is empty, ``0``-``9``
        otherwise).

        Raises ``AssertionError`` when the name has no ``:`` or the type is
        not ``app``; ``Exception('Unknown item')`` is kept as a fallback
        for when assertions are disabled. Returns the result of the
        module-level ``realize`` helper.
        """
        wget_args = [
            WGET_LUA,
            "-U",
            USER_AGENT,
            "-nv",
            "--lua-script",
            "ovi-store.lua",
            "-o",
            ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document",
            ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e",
            "robots=off",
            "--rotate-dns",
            "--recursive",
            "--level=inf",
            "--no-parent",
            "--page-requisites",
            "--timeout",
            "30",
            "--tries",
            "inf",
            "--domains",
            "ovi.com",
            "--span-hosts",
            "--waitretry",
            "30",
            "--warc-file",
            ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header",
            "operator: Archive Team",
            "--warc-header",
            "ovi-store-dld-script-version: " + VERSION,
            "--warc-header",
            ItemInterpolation("ovi-store-user: %(item_name)s"),
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        # Must be a one-element tuple: ``('app')`` is just the string
        # 'app', which turns ``in`` into a substring test and lets '',
        # 'a', 'p', 'ap' and 'pp' slip through.
        assert item_type in ('app',)

        if item_type == 'app':
            # An empty value would make suffix '0' produce ".../content/0",
            # so the zero suffix is only used when there is a prefix.
            if item_value == '':
                suffixes = '123456789'
            else:
                suffixes = string.digits
            # All content pages first, then all download pages.
            wget_args.extend(
                'http://store.ovi.com/content/{0}{1}'.format(item_value, s)
                for s in suffixes)
            wget_args.extend(
                'http://store.ovi.com/content/{0}{1}/Download'.format(
                    item_value, s) for s in suffixes)
        else:
            raise Exception('Unknown item')

        # Optional source address, taken from a module global when defined.
        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Exemple #5
0
    def realize(self, item):
        """Assemble the wget-lua argument list for one SoundCloud item.

        ``item['item_name']`` has the form ``<type>:<value>``; both halves
        are written back to the item as ``item_type`` and ``item_value``.
        Only the ``api`` type is supported: ``<value>`` is
        ``<start>-<stop>`` (inclusive range of track ids) and a fixed set
        of API URLs is queued for every id.

        Any other type raises ``Exception('Unknown item')``. Returns the
        result of the module-level ``realize`` helper.
        """
        # Per-track API endpoints, in the order they are queued.
        # ``{i}`` is the track id and ``{v}`` the app_version value.
        # (The i1/tracks/{i}/streams endpoint is not queued.)
        api_templates = (
            'https://api.soundcloud.com/app/v2/tracks/{i}/comments?threaded=1&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=200&offset=0&linked_partitioning=1&app_version={v}',
            'https://api-v2.soundcloud.com/tracks/{i}/related?anon_user_id=33006123&variant=control&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=10&offset=0&linked_partitioning=1&app_version={v}',
            'https://api-v2.soundcloud.com/tracks/{i}/albums?representation=mini&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=10&offset=0&linked_partitioning=1&app_version={v}',
            'https://api-v2.soundcloud.com/tracks/{i}/playlists_without_albums?representation=mini&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=10&offset=0&linked_partitioning=1&app_version={v}',
            'https://api-v2.soundcloud.com/tracks/{i}/likers?client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=9&offset=0&linked_partitioning=1&app_version={v}',
            'https://api-v2.soundcloud.com/tracks/{i}/reposters?client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=9&offset=0&linked_partitioning=1&app_version={v}',
            'https://api-v2.soundcloud.com/audio-ad?sc_a_id=28936013-0c76-4245-b4dd-a0d7fc590135&device_locale=nl&track_id={i}&rubicon_user_id=J0JKJ4XR-1E-1OEW&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&app_version={v}',
            'https://api.soundcloud.com/app/v2/tracks/{i}/comments?filter_replies=1&limit=200&offset=0&linked_partitioning=1&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&app_version={v}',
            'https://api-v2.soundcloud.com/stations/soundcloud:track-stations:{i}/tracks?client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&limit=10&offset=0&linked_partitioning=1&app_version={v}',
            'https://api-v2.soundcloud.com/stations?urns=soundcloud%3Atrack-stations%3A{i}&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&app_version={v}',
            # extra
            'https://api.soundcloud.com/tracks/{i}/favoriters?client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&app_version={v}',
            'https://api-v2.soundcloud.com/tracks/{i}/playlists_without_albums?offset=10&limit=10&client_id=2t9loNQH90kzJcsFCODdigxfp325aq4z&app_version={v}',
        )
        app_version = 1500299175

        wget_args = [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "--no-cookies",
            "--lua-script", "soundcloud.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--recursive", "--level=inf",
            "--no-parent",
            "--page-requisites",
            "--timeout", "30",
            "--tries", "inf",
            "--domains", "soundcloud.com",
            "--span-hosts",
            "--waitretry", "30",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "soundcloud-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("soundcloud-item: %(item_name)s"),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        kind, value = full_name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        if kind != 'api':
            raise Exception('Unknown item')

        first, last = value.split('-')
        for track_id in range(int(first), int(last) + 1):
            # One WARC header naming the track, then every endpoint for it.
            wget_args += [
                '--warc-header',
                'soundcloud-track-api: {i}'.format(i=track_id),
            ]
            wget_args += [
                template.format(i=track_id, v=app_version)
                for template in api_templates
            ]

        # Optional source address, taken from a module global when defined.
        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Exemple #6
0
# monitoring.pipeline_id() returns a 4-tuple here; only its last element is
# kept (as pipeline_id), the first three are discarded.
_, _, _, pipeline_id = monitoring.pipeline_id()

# Bundle the wpull-related module settings into a single WpullArgs object.
# FINISHED_WARCS_DIR is read with os.environ[...], so a missing variable
# raises KeyError at import time.
wpull_args = WpullArgs(
    default_user_agent=DEFAULT_USER_AGENT,
    wpull_exe=WPULL_EXE,
    youtube_dl_exe=YOUTUBE_DL,
    finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"],
    warc_max_size=WARC_MAX_SIZE,
    monitor_disk=WPULL_MONITOR_DISK,
    monitor_memory=WPULL_MONITOR_MEMORY,
)

# NOTE(review): presumably validates the arguments above; the helper is
# defined outside this snippet -- confirm what it checks.
check_wpull_args(wpull_args)

# Environment mapping for wpull: two per-item interpolated values, the Redis
# URL, and the current PATH (KeyError if PATH is unset).
wpull_env = {
    'ITEM_IDENT': ItemInterpolation('%(ident)s'),
    'LOG_KEY': ItemInterpolation('%(log_key)s'),
    'REDIS_URL': REDIS_URL,
    'PATH': os.environ['PATH'],
}
# OPENSSL_CONF and TMPDIR are only forwarded when they hold a truthy value.
if OPENSSL_CONF:
    wpull_env['OPENSSL_CONF'] = OPENSSL_CONF
if TMPDIR:
    wpull_env['TMPDIR'] = TMPDIR

pipeline = Pipeline(
    CheckIP(),
    CheckLocalWebserver(),
    GetItemFromQueue(control, pipeline_id, downloader,
        ao_only=env.get('AO_ONLY'), large=env.get('LARGE'),
        version_check = (VERSION, pipeline_version)),
Exemple #7
0
              accept_on_exit_code=[0, 4, 8],
              env={
                  "item_dir": ItemValue("item_dir"),
                  "item_value": ItemValue("item_value"),
                  "item_type": ItemValue("item_type"),
                  'warc_file_base': ItemValue('warc_file_base')
              }),
 Deduplicate(),
 PrepareStatsForTracker(
     defaults={
         "downloader": downloader,
         "version": VERSION
     },
     file_groups={
         "data": [
             ItemInterpolation(
                 "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz")
         ]
     },
     id_function=stats_id_function,
 ),
 MoveFiles(),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=20,
         default="20",
         name="shared:rsync_threads",
         title="Rsync threads",
         description="The maximum number of concurrent uploads."),
     UploadWithTracker(
         "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
Exemple #8
0
 PrepareDirectories(warc_prefix=TRACKER_ID),
 WgetDownload(
     WgetArgs(),
     max_tries=2,
     accept_on_exit_code=[0, 4, 8],
     env={
         'item_dir': ItemValue('item_dir'),
         'warc_file_base': ItemValue('warc_file_base')
     }
 ),
 SetBadUrls(),
 PrepareStatsForTracker(
     defaults={'downloader': downloader, 'version': VERSION},
     file_groups={
         'data': [
             ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.zst')
         ]
     },
     id_function=stats_id_function,
 ),
 MoveFiles(),
 LimitConcurrent(NumberConfigValue(min=1, max=20, default='20',
     name='shared:rsync_threads', title='Rsync threads',
     description='The maximum number of concurrent uploads.'),
     UploadWithTracker(
         'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
         downloader=downloader,
         version=VERSION,
         files=[
             ItemInterpolation('%(data_dir)s/%(warc_file_base)s.%(dict_project)s.%(dict_id)s.warc.zst'),
             ItemInterpolation('%(data_dir)s/%(warc_file_base)s_data.txt')
Exemple #9
0
    def realize(self, item):
        """Assemble the wget-at argument list for a Telegram multi-item.

        ``item['item_name']`` holds one or more ``<type>:<value>`` names
        joined by NUL characters. Names starting with ``user:`` are dropped
        and the filtered list is written back to ``item['item_name']``.
        For every remaining name:

        * ``post`` -- ``<value>`` is ``<group>:<post_id>``; the embed view
          of the post is queued.
        * ``channel`` -- the ``t.me/s/<value>`` page is queued.

        Any other type raises ``Exception('Unknown item')``.

        Side effects: writes the zstd dictionary to ``<item_dir>/zstdict``
        and sets ``item['dict_id']`` and ``item['dict_project']``. Returns
        the result of the module-level ``realize`` helper.
        """
        wget_args = [
            WGET_AT,
            '-U', USER_AGENT,
            '-nv',
            '--load-cookies', 'cookies.txt',
            '--content-on-error',
            '--no-http-keep-alive',
            '--lua-script', 'telegram.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 't.me,telegram.org',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
            '--warc-compression-use-zstd',
            '--warc-zstd-dict-no-include',
            '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8',
            '--secure-protocol', 'TLSv1_2'
        ]

        # Store the compression dictionary next to the item so wget-at can
        # load it, and remember which dictionary was used.
        dict_data = ZstdDict.get_dict()
        with open(os.path.join(item['item_dir'], 'zstdict'), 'wb') as f:
            f.write(dict_data['dict'])
        item['dict_id'] = dict_data['id']
        item['dict_project'] = TRACKER_ID
        wget_args.extend([
            '--warc-zstd-dict', ItemInterpolation('%(item_dir)s/zstdict'),
        ])

        # NOTE(review): the end of this statement and the loop header below
        # were garbled in the source ("'user:'******'item_name']"); they are
        # reconstructed here -- confirm against the upstream script.
        item['item_name'] = '\0'.join(
            s for s in item['item_name'].split('\0')
            if not s.startswith('user:')
        )

        for item_name in item['item_name'].split('\0'):
            wget_args.extend(['--warc-header', 'x-wget-at-project-item-name: '+item_name])
            wget_args.append('item-name://'+item_name)
            item_type, item_value = item_name.split(':', 1)
            if item_type == 'post':
                group, post_id = item_value.split(':', 1)
                wget_args.extend(['--warc-header', 'telegram-post: {}/{}'.format(group, post_id)])
                wget_args.append('https://t.me/{}/{}?embed=1'.format(group, post_id))
            elif item_type == 'channel':
                wget_args.extend(['--warc-header', 'telegram-channel: '+item_value])
                wget_args.append('https://t.me/s/'+item_value)
            # A 'url' item type (queueing item_value directly) is disabled.
            else:
                raise Exception('Unknown item')

        # Optional source address, taken from a module global when defined.
        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Exemple #10
0
    def realize(self, item):
        """Assemble the wget-at argument list for a Reddit multi-item.

        A user agent is picked at random from the lines of the
        ``user-agents`` file. ``item['item_name']`` holds one or more
        ``<type>:<value>`` names joined by NUL characters. ``post`` names
        queue the ``info.json`` lookup for ``t3_<value>`` and ``comment``
        names the one for ``t1_<value>``; any other type raises
        ``Exception('Unknown item')``.

        Side effects: writes the zstd dictionary to ``<item_dir>/zstdict``
        and sets ``item['dict_id']``, ``item['dict_project']`` and
        ``item['item_name_newline']``. Returns the result of the
        module-level ``realize`` helper.
        """
        # item type -> (WARC header prefix, fullname prefix used by the API)
        type_table = {
            'post': ('reddit-post: ', 't3_'),
            'comment': ('reddit-comment: ', 't1_'),
        }
        info_url = 'https://www.reddit.com/api/info.json?id='

        with open('user-agents', 'r') as agents_file:
            agent_lines = list(agents_file)
            user_agent = random.choice(agent_lines).strip()

        wget_args = [
            WGET_AT,
            '-U', user_agent,
            '-nv',
            '--no-cookies',
            '--content-on-error',
            '--lua-script', 'reddit.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'reddit.com',
            '--header', 'Cookie: over18=1; _options=%7B%22pref_quarantine_optin%22%3A%20true%7D',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
            '--warc-compression-use-zstd',
            '--warc-zstd-dict-no-include',
            '--header', 'Accept-Language: en-US;q=0.9, en;q=0.8',
        ]

        # Store the compression dictionary next to the item and remember
        # which dictionary was used.
        zstd_info = ZstdDict.get_dict()
        dict_path = os.path.join(item['item_dir'], 'zstdict')
        with open(dict_path, 'wb') as dict_file:
            dict_file.write(zstd_info['dict'])
        item['dict_id'] = zstd_info['id']
        item['dict_project'] = 'reddit'
        wget_args += [
            '--warc-zstd-dict',
            ItemInterpolation('%(item_dir)s/zstdict'),
        ]

        for name in item['item_name'].split('\0'):
            wget_args += ['--warc-header', 'x-wget-at-project-item-name: ' + name]
            wget_args.append('item-name://' + name)
            kind, value = name.split(':', 1)
            if kind not in type_table:
                raise Exception('Unknown item')
            header_prefix, fullname_prefix = type_table[kind]
            wget_args += ['--warc-header', header_prefix + value]
            wget_args.append(info_url + fullname_prefix + value)

        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        # Optional source address, taken from a module global when defined.
        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
Exemple #11
0
    def realize(self, item):
        """Assemble the wget-at argument list for a halo.bungie.net multi-item.

        ``item['item_name']`` holds one or more ``<type>:<value>`` names
        joined by NUL characters. Handled types:

        * ``reach-file`` / ``reach-stats`` -- ``<value>`` is a single
          number or ``<start>-<end>`` (inclusive); one URL and one WARC
          header are queued per number.
        * ``reach-guid`` / ``reach-pxd`` -- ``<value>`` is appended to the
          matching URL as-is.
        * ``player`` / ``reach-player`` -- nothing type-specific is queued
          and the name is dropped from the item's name list.

        Any other type raises ``ValueError('item_type not supported.')``.

        Side effects: rewrites ``item['item_name']`` without the dropped
        names and sets ``item['item_name_newline']``. Returns the result of
        the module-level ``realize`` helper.
        """
        # item type -> (URL prefix, WARC header name) for numeric ranges
        ranged_types = {
            'reach-file': (
                'http://halo.bungie.net/Stats/Reach/FileDetails.aspx?fid=',
                'halo-bungie-reach-file'),
            'reach-stats': (
                'http://halo.bungie.net/Stats/Reach/GameStats.aspx?gameid=',
                'halo-bungie-reach-game-stats'),
        }
        # item type -> (URL prefix, WARC header prefix) for single values
        direct_types = {
            'reach-guid': (
                'http://halo.bungie.net/Stats/Reach/GameStats.aspx?guid=',
                'halo-bungie-reach-game-guid: '),
            'reach-pxd': (
                'http://halo.bungie.net/Stats/Reach/default.aspx?pxd=',
                'halo-bungie-reach-player: '),
        }
        dropped_types = ('player', 'reach-player')

        wget_args = [
            WGET_AT,
            '-U', USER_AGENT,
            '-nv',
            '--content-on-error',
            '--lua-script', 'halo.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'bungie.net',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
        ]

        def queue_sequential(spec, url_prefix, header_name):
            # ``spec`` is either "N" or "START-END"; both ends inclusive.
            if '-' in spec:
                first, last = spec.split('-')
            else:
                first = last = spec
            for number in range(int(first), int(last) + 1):
                wget_args.extend(
                    ['--warc-header', header_name + ': ' + str(number)])
                wget_args.append(url_prefix + str(number))

        kept_names = []
        for name in item['item_name'].split('\0'):
            # The generic item-name header/URL is queued for every name,
            # including the ones that are dropped below.
            wget_args.extend(
                ['--warc-header', 'x-wget-at-project-item-name: ' + name])
            wget_args.append('item-name://' + name)
            kind, value = name.split(':', 1)

            if kind in ranged_types:
                url_prefix, header_name = ranged_types[kind]
                queue_sequential(value, url_prefix, header_name)
            elif kind in direct_types:
                url_prefix, header_prefix = direct_types[kind]
                wget_args.extend(['--warc-header', header_prefix + value])
                wget_args.append(url_prefix + value)
            elif kind in dropped_types:
                # Player pages are not fetched; leave the name out of the
                # list that is written back to the item.
                continue
            else:
                raise ValueError('item_type not supported.')
            kept_names.append(name)

        item['item_name'] = '\0'.join(kept_names)
        item['item_name_newline'] = '\n'.join(kept_names)

        # Optional source address, taken from a module global when defined.
        module_globals = globals()
        if 'bind_address' in module_globals:
            address = module_globals['bind_address']
            wget_args += ['--bind-address', address]
            print('')
            print('*** Wget will bind address at {0} ***'.format(address))
            print('')

        return realize(wget_args, item)
    def realize(self, item):
        """Assemble the wget-at argument list for a StoryFire multi-item.

        ``item['item_name']`` holds one or more ``<type>:<value>`` names
        joined by NUL characters. ``video`` names queue the
        ``video-details/<value>`` page and ``user`` names the
        ``user/<value>`` page; any other type raises
        ``ValueError('item_type not supported.')``.

        Side effect: sets ``item['item_name_newline']`` to the item name
        with NULs replaced by newlines. Returns the result of the
        module-level ``realize`` helper.
        """
        wget_args = [
            WGET_AT,
            '-U',
            USER_AGENT,
            '-nv',
            '--content-on-error',
            '--lua-script',
            'storyfire.lua',
            '-o',
            ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document',
            ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e',
            'robots=off',
            '--rotate-dns',
            '--recursive',
            '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout',
            '30',
            '--tries',
            'inf',
            '--domains',
            'storyfire.com',
            '--span-hosts',
            '--waitretry',
            '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header',
            'operator: Archive Team',
            '--warc-header',
            'x-wget-at-project-version: ' + VERSION,
            '--warc-header',
            'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
        ]

        for item_name in item['item_name'].split('\0'):
            wget_args.extend(
                ['--warc-header', 'x-wget-at-project-item-name: ' + item_name])
            wget_args.append('item-name://' + item_name)
            item_type, item_value = item_name.split(':', 1)
            if item_type == 'video':
                wget_args.extend(
                    ['--warc-header', 'storyfire-video: ' + item_value])
                wget_args.append('https://storyfire.com/video-details/' +
                                 item_value)
            elif item_type == 'user':
                # NOTE(review): these two statements were garbled in the
                # source ("'storyfire-user: '******'https://..."); they are
                # reconstructed to mirror the 'video' branch -- confirm
                # against the upstream script.
                wget_args.extend(
                    ['--warc-header', 'storyfire-user: ' + item_value])
                wget_args.append('https://storyfire.com/user/' + item_value)
            else:
                raise ValueError('item_type not supported.')

        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        # Optional source address, taken from a module global when defined.
        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Exemple #13
0
              max_tries=2,
              accept_on_exit_code=[0, 4, 8],
              env={
                  'item_dir': ItemValue('item_dir'),
                  'item_value': ItemValue('item_value'),
                  'item_type': ItemValue('item_type'),
                  'warc_file_base': ItemValue('warc_file_base'),
              }), Deduplicate(),
 PrepareStatsForTracker(
     defaults={
         'downloader': downloader,
         'version': VERSION
     },
     file_groups={
         'data': [
             ItemInterpolation(
                 '%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz')
         ]
     },
     id_function=stats_id_function,
 ), MoveFiles(),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=20,
         default='20',
         name='shared:rsync_threads',
         title='Rsync threads',
         description='The maximum number of concurrent uploads.'),
     UploadWithTracker(
         'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
         downloader=downloader,
Exemple #14
0
    def realize(self, item):
        """Build the wget-at command line for a niconico tracker item.

        ``item['item_name']`` may carry several sub-items separated by NUL
        characters.  Each sub-item must have the form ``vid:<value>``; any
        other item type raises ``ValueError``.  The finished argument list is
        handed to the module-level ``realize`` helper and its result returned.
        """
        # Draw one user agent at random from the local 'user-agents' file.
        with open('user-agents', 'r') as agents_file:
            agent_lines = agents_file.readlines()
            user_agent = random.choice(agent_lines).strip()

        # NOTE(review): '--domains' is 'voat.co' while the queued URLs are on
        # www.nicovideo.jp -- confirm this is intended.
        wget_args = [
            WGET_AT,
            '-U',
            user_agent,
            '-nv',
            '--content-on-error',
            '--load-cookies',
            'cookies.txt',
            '--lua-script',
            'niconico.lua',
            '-o',
            ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document',
            ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e',
            'robots=off',
            '--rotate-dns',
            '--recursive',
            '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout',
            '30',
            '--tries',
            'inf',
            '--domains',
            'voat.co',
            '--span-hosts',
            '--waitretry',
            '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header',
            'operator: Archive Team',
            '--warc-header',
            'x-wget-at-project-version: ' + VERSION,
            '--warc-header',
            'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
            '--header',
            'Accept-Language: ja',
            '--header',
            'Content-Type: text/plain',
        ]

        # A 'metadatarange' item type existed in an earlier revision but was
        # disabled; only 'vid' items are handled now.
        for name in item['item_name'].split('\0'):
            wget_args += [
                '--warc-header',
                'x-wget-at-project-item-name: '+name,
                'item-name://'+name,
            ]
            kind, value = name.split(':', 1)
            if kind != 'vid':
                raise ValueError('item_type not supported.')
            wget_args += [
                '--warc-header',
                'niconico-vid: '+value,
                'https://www.nicovideo.jp/watch/' + value,
            ]

        # Optional source-address binding, taken from a module-level global.
        module_scope = globals()
        if 'bind_address' in module_scope:
            bind_to = module_scope['bind_address']
            wget_args.extend(['--bind-address', bind_to])
            print('')
            print('*** Wget will bind address at {0} ***'.format(bind_to))
            print('')

        return realize(wget_args, item)
    def realize(self, item):
        """Build the wget-lua command line for a wikispaces tracker item.

        The item name must look like ``wiki:<space>``.  The type and value are
        stored back on ``item`` as ``item_type`` / ``item_value``.  Any type
        other than ``wiki`` raises ``Exception('Unknown item')``.  Returns the
        result of the module-level ``realize`` helper.
        """
        # Cookies are disabled; a '--load-cookies cookies.txt' option was
        # commented out in the original argument list.
        wget_args = [
            WGET_LUA,
            '-U', USER_AGENT,
            '-nv',
            '--no-cookies',
            '--lua-script', 'wikispaces.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'wikispaces.com',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'wikispaces-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('wikispaces-item: %(item_name)s'),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        kind, space = full_name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = space

        if kind != 'wiki':
            raise Exception('Unknown item')

        wget_args.extend(['--warc-header', 'wikispace: ' + space])
        # Seed URLs for the wiki, queued in this order.
        seed_templates = (
            'https://{}.wikispaces.com/space/content?utable=WikiTablePageList&ut_csv=1',
            'https://{}.wikispaces.com/space/content',
            'https://{}.wikispaces.com/',
            'https://{}.wikispaces.com/robots.txt',
            'https://{}.wikispaces.com/sitemap.xml',
        )
        for template in seed_templates:
            wget_args.append(template.format(space))

        # Optional source-address binding, taken from a module-level global.
        module_scope = globals()
        if 'bind_address' in module_scope:
            bind_to = module_scope['bind_address']
            wget_args.extend(['--bind-address', bind_to])
            print('')
            print('*** Wget will bind address at {0} ***'.format(bind_to))
            print('')

        return realize(wget_args, item)
Exemple #16
0
    def realize(self, item):
        """Build the wget-lua command line for a reddit tracker item.

        The item name must look like ``36comments:<prefix>``.  The type and
        value are stored back on ``item`` as ``item_type`` / ``item_value``.
        For every character in ``0-9a-z`` one ``http://redd.it/<prefix><char>``
        short link is queued.

        Raises ``AssertionError`` when the name has no ``:`` or the type is
        not ``36comments`` (``Exception('Unknown item')`` if asserts are
        disabled).  Returns the result of the module-level ``realize`` helper.
        """
        wget_args = [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "--lua-script", "reddit.lua",
            "--load-cookies", "cookies",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--recursive", "--level=inf",
            "--no-parent",
            "--page-requisites",
            "--timeout", "30",
            "--tries", "inf",
            "--domains", "reddit.com,redditmedia.com",
            "--span-hosts",
            "--waitretry", "30",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "reddit-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("reddit-user: %(item_name)s"),
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        # Must be a one-element tuple: without the trailing comma the
        # right-hand side is just the string '36comments', and ``in`` becomes
        # a substring test that also accepts '', '36', 'comments', ...
        assert item_type in ('36comments',)

        if item_type == '36comments':
            # An earlier variant fetched each comments page with ``requests``
            # and queued the canonical thread URLs it found; that code was
            # disabled, so only the short links are queued.
            suffixes = string.digits + string.ascii_lowercase
            wget_args.extend(
                'http://redd.it/{0}{1}'.format(item_value, suffix)
                for suffix in suffixes)
        else:
            raise Exception('Unknown item')

        # Optional source-address binding, taken from a module-level global.
        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Exemple #17
0
    def realize(self, item):
        """Build the wget-lua command line for a yourshot tracker item.

        The item name must look like ``photos:<start>-<end>``.  The type and
        value are stored back on ``item`` as ``item_type`` / ``item_value``.
        One photo page URL is queued for every id in the inclusive range.

        Raises ``Exception('Unknown item')`` for any other item type and
        ``ValueError`` when the range cannot be parsed.  Returns the result of
        the module-level ``realize`` helper.
        """
        # NOTE(review): '--domains' is 'sonymobile.com' while the queued URLs
        # are on yourshot.nationalgeographic.com -- confirm this is intended.
        wget_args = [
            WGET_LUA,
            '-U', USER_AGENT,
            '-nv',
            '--no-cookies',
            '--lua-script', 'yourshot.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'sonymobile.com',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'yourshot-dld-script-version: ' + VERSION,
            '--warc-header', ItemInterpolation('yourshot-item: %(item_name)s'),
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        # The client is not used by the 'photos' branch, but it is still
        # created as before; try/finally makes sure it is closed even when the
        # item is rejected or the range fails to parse.
        http_client = httpclient.HTTPClient()
        try:
            if item_type == 'photos':
                start, end = item_value.split('-', 1)
                # The range is inclusive on both ends.
                for i in range(int(start), int(end) + 1):
                    wget_args.extend(
                        ['--warc-header', 'yourshot-photo-id: {}'.format(i)])
                    wget_args.append(
                        'https://yourshot.nationalgeographic.com/photos/{}/'.
                        format(i))
            else:
                raise Exception('Unknown item')
        finally:
            http_client.close()

        # Optional source-address binding, taken from a module-level global.
        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Exemple #18
0
# be too big. The deadline is optional.
# Project description object: a title plus an HTML snippet containing the
# site link and the tracker leaderboard link.
# NOTE(review): the logo <img> has empty src and title attributes -- confirm
# whether a logo URL was meant to be filled in.
project = Project(
	title="sourceforgersync",
	project_html="""
		<img class="project-logo" alt="Project logo" src="" height="50px" title=""/>
		<h2>sourceforge.net <span class="links"><a href="http://sourceforge.net/">Website</a> &middot; <a href="http://tracker.archiveteam.org/sourceforge/">Leaderboard</a></span></h2>
		<p>Saving all project from SourceForge. rsyncing all of the source code repositories.</p>
	"""
)

pipeline = Pipeline(
	CheckIP(),
	GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION),
	ExternalProcess("Size Test",[RSYNC_TEST,"-t",getRsyncURL("foo"),"-m",MAX_RSYNC]),
	LimitConcurrent(1,ExternalProcess("rsync", ["rsync", "-av", getRsyncURL("foo"), cleanItem("%(data_dir)s/%(item_name)s")])),
	ExternalProcess("tar", ["tar", "-czf", cleanItem("%(data_dir)s/%(item_name)s.tar.gz"), "-C", ItemInterpolation("%(data_dir)s/"), "--owner=1999", "--group=2015", "--no-same-permissions", cleanItem("%(item_name)s")]),
	LimitConcurrent(NumberConfigValue(min=1, max=4, default="1",
		name="shared:rsync_threads", title="Rsync threads",
		description="The maximum number of concurrent uploads."),
		UploadWithTracker(
			"http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
			downloader=downloader,
			version=VERSION,
			files=[
				cleanItem("%(data_dir)s/%(item_name)s.tar.gz")
				#ItemInterpolation("foo.tar.gz")
			],
			rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
			rsync_extra_args=[
				"--recursive",
				"--partial",
Exemple #19
0
    project_html="""
    <img class="project-logo" alt="" src="http://archiveteam.org/images/b/b2/Puush_logo.png" />
    <h2>Puush <span class="links"><a href="http://puush.me/">Website</a> &middot; <a href="http://%s/%s/">Leaderboard</a></span></h2>
    <p><b>Puush</b> adds expiry dates to their files.</p>
    """ % (TRACKER_HOST, TRACKER_ID)
    # , utc_deadline = datetime.datetime(2013,08,01, 00,00,1)
)

pipeline = Pipeline(
    GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader, VERSION),
    ExtraItemParams(),
    PrepareDirectories(warc_prefix="puush"),
    SpecializedWgetDownloadMany([ WGET_LUA,
          "-U", USER_AGENT,
          "-nv",
          "-o", ItemInterpolation("%(item_dir)s/wget.log"),
          "--lua-script", "puush.lua",
          "--no-check-certificate",
          "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
          "--truncate-output",
          "-e", "robots=off",
          "--rotate-dns",
          "--timeout", "60",
          "--tries", "20",
          "--waitretry", "5",
          "--warc-file", ItemInterpolation("%(item_dir)s/%(current_warc_file_base)s"),
          "--warc-header", "operator: Archive Team",
          "--warc-header", "puush-dld-script-version: " + VERSION,
        ],
        URLsToDownload(),
        max_tries=20,
Exemple #20
0
              accept_on_exit_code=[0, 4, 7, 8],
              env={
                  "item_dir": ItemValue("item_dir"),
                  "item_value": ItemValue("item_value"),
                  "item_type": ItemValue("item_type"),
                  "escaped_item_name": ItemValue("escaped_item_name"),
                  "downloader": downloader
              }), ProcessScrapeFile(),
 PrepareStatsForTracker(
     defaults={
         "downloader": downloader,
         "version": VERSION
     },
     file_groups={
         "data": [
             ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
             ItemInterpolation(
                 "%(item_dir)s/twitpic2-scrape-%(escaped_item_name)s.txt.gz"
             )
         ]
     },
     id_function=stats_id_function,
 ), MoveFiles(),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=4,
         default="1",
         name="shared:rsync_threads",
         title="Rsync threads",
         description="The maximum number of concurrent uploads."),
Exemple #21
0
    def realize(self, item):
        """Build the wget-lua command line for a flickr tracker item.

        The item name has the form ``<type>:<value>``; both parts are stored
        back on ``item`` as ``item_type`` / ``item_value``.  Supported types:

        * ``user``     -- queues the user's photo stream page.
        * ``photoscc`` -- downloads a list of ``user/photo`` entries from a
          helper server and queues each photo page and its video download URL.
        * ``disco`` / ``photos`` -- currently disabled; both raise
          ``Exception('Skipping...')`` immediately.

        Any other type raises ``Exception('Unknown item')``.  Returns the
        result of the module-level ``realize`` helper.
        """
        wget_args = [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "--no-cookies",
            "--lua-script", "flickr.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--recursive", "--level=inf",
            "--no-parent",
            "--page-requisites",
            "--timeout", "30",
            "--tries", "inf",
            "--domains", "flickr.com",
            "--span-hosts",
            "--waitretry", "30",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "flickr-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("flickr-item: %(item_name)s"),
            "--no-warc-compression",
        ]

        item_name = item['item_name']
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        http_client = httpclient.HTTPClient()
        # try/finally so the client is closed even when a branch raises.
        try:
            if item_type == 'user':
                wget_args.extend(
                    ['--warc-header', 'flickr-user: {}'.format(item_value)])
                wget_args.append(
                    'https://www.flickr.com/photos/{}/'.format(item_value))
            # This must be ``elif``: as a separate ``if`` chain, every 'user'
            # item fell through to the final ``else`` and was rejected as
            # 'Unknown item'.
            elif item_type == 'disco':
                raise Exception('Skipping...')
                # Everything below in this branch is intentionally disabled
                # by the raise above and kept for reference.
                try:
                    r = http_client.fetch(
                        'https://www.flickr.com/photos/{}/'.format(item_value),
                        method='GET')
                except httpclient.HTTPError as e:
                    r = e.response
                if r.code == 404:
                    print('Account was deleted.')
                    wget_args.append(
                        'https://www.flickr.com/photos/{}/'.format(item_value))
                elif r.code != 200:
                    raise Exception('Bad status code, {}.'.format(r.code))
                else:
                    text = r.body.decode('utf-8', 'ignore')
                    # Raw strings keep the regex escapes (\. and \s) from
                    # being treated as invalid string escape sequences.
                    api_key = re.search(
                        r'root\.YUI_config\.flickr\.api\.site_key\s*=\s*"([^"]+)";',
                        text).group(1)
                    req_id = re.search(
                        r'root\.YUI_config\.flickr\.request\.id\s*=\s*"([^"]+)";',
                        text).group(1)
                    item.log_output('Found api_key {} and req_id {}.'.format(
                        api_key, req_id))
                    wget_args.append(
                        'https://api.flickr.com/services/rest?per_page=50&page=1&extras=can_addmeta%2Ccan_comment%2Ccan_download%2Ccan_share%2Ccontact%2Ccount_comments%2Ccount_faves%2Ccount_views%2Cdate_taken%2Cdate_upload%2Cdescription%2Cicon_urls_deep%2Cisfavorite%2Cispro%2Clicense%2Cmedia%2Cneeds_interstitial%2Cowner_name%2Cowner_datecreate%2Cpath_alias%2Crealname%2Crotation%2Csafety_level%2Csecret_k%2Csecret_h%2Curl_c%2Curl_f%2Curl_h%2Curl_k%2Curl_l%2Curl_m%2Curl_n%2Curl_o%2Curl_q%2Curl_s%2Curl_sq%2Curl_t%2Curl_z%2Cvisibility%2Cvisibility_source%2Co_dims%2Cpubliceditability&get_user_info=1&jump_to=&user_id={}&view_as=use_pref&sort=use_pref&viewerNSID=&method=flickr.people.getPhotos&csrf=&api_key={}&format=json&hermes=1&hermesClient=1&reqId={}&nojsoncallback=1'
                        .format(item_value, api_key, req_id))
            elif item_type == 'photos':
                raise Exception('Skipping...')
                # Intentionally disabled by the raise above; kept for
                # reference.
                r = http_client.fetch('http://195.201.219.254/' + item_value,
                                      method='GET')
                user = item_value.split('/')[0]
                for i in r.body.decode('utf-8', 'ignore').splitlines():
                    i = i.strip()
                    wget_args.extend(
                        ['--warc-header', 'flickr-photo: {}'.format(i)])
                    wget_args.extend(
                        ['--warc-header',
                         'flickr-photo-user: {}'.format(user)])
                    wget_args.extend([
                        '--warc-header',
                        'flickr-photo-{}-user: {}'.format(i, user)
                    ])
                    wget_args.append(
                        'https://www.flickr.com/photos/{}/{}/'.format(
                            user, i))
                    wget_args.append(
                        'https://www.flickr.com/photos/{}/{}/sizes/'.format(
                            user, i))
                    wget_args.append(
                        'https://www.flickr.com/video_download.gne?id={}'
                        .format(i))
            elif item_type == 'photoscc':
                # The helper server returns one photo reference per line.
                r = http_client.fetch('http://195.201.219.254/' + item_value,
                                      method='GET')
                for s in r.body.decode('utf-8', 'ignore').splitlines():
                    s = s.strip()
                    # Reduce URL-shaped lines to a bare 'user/photo' pair.
                    if s.startswith('www.flickr.com/photos/'):
                        s = '/'.join(s.split('/')[2:4])
                    elif s.startswith('flickr.com/'):
                        s = s.split('/', 1)[1].rstrip('/')
                    user, i = s.split(
                        '/'
                    )  # NOTE: do not replace with anything that skips invalid urls, we want to catch those with pipeline aborts
                    wget_args.extend(
                        ['--warc-header', 'flickr-photo: {}'.format(i)])
                    wget_args.extend(
                        ['--warc-header',
                         'flickr-photo-user: {}'.format(user)])
                    wget_args.extend([
                        '--warc-header',
                        'flickr-photo-{}-user: {}'.format(i, user)
                    ])
                    wget_args.append(
                        'https://www.flickr.com/photos/{}/{}/'.format(
                            user, i))
                    wget_args.append(
                        'https://www.flickr.com/video_download.gne?id={}'
                        .format(i))
            else:
                raise Exception('Unknown item')
        finally:
            http_client.close()

        # Optional source-address binding, taken from a module-level global.
        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Exemple #22
0
    def realize(self, item):
        """Build the wget-lua command line for a twitpic2 tracker item.

        The item name must look like ``image:<start>:<end>``.  The type and
        value are stored back on ``item`` as ``item_type`` / ``item_value``.
        ``<start>`` and ``<end>`` are converted with ``str_to_int`` and one
        ``http://twitpic.com/<id>`` URL is queued for every number in the
        inclusive range, converted back with ``int_to_str``.

        Returns the result of the module-level ``realize`` helper.
        """
        # The '-w 1', '--recursive --level=inf' and '--page-requisites'
        # options were commented out in the original list and stay omitted.
        # The user agent and Accept-Language header are picked at random.
        wget_args = [
            WGET_LUA,
            "-U", random.choice(USER_AGENTS),
            "-nv",
            "--lua-script", "twitpic.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--no-cookies",
            "--rotate-dns",
            "--no-parent",
            "--timeout", "30",
            "--tries", "inf",
            "--span-hosts",
            "--waitretry", "30",
            "--domains", "twitpic.com,cloudfront.net,twimg.com,amazonaws.com",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "twitpic2-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("twitpic2-user: %(item_name)s"),
            "--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "--header", "DNT: 1",
            "--header", random.choice(ACCEPT_LANGUAGE_HEADERS),
        ]

        full_name = item['item_name']
        assert ':' in full_name
        kind, value = full_name.split(':', 1)
        item['item_type'] = kind
        item['item_value'] = value

        assert kind in ('image', )

        # Still reachable when asserts are disabled.
        if kind != 'image':
            raise Exception('Unknown item')

        first_id, last_id = value.split(':', 1)
        first_num = str_to_int(first_id)
        last_num = str_to_int(last_id)
        for num in range(first_num, last_num + 1):
            wget_args.append(
                'http://twitpic.com/{0}'.format(int_to_str(num)))

        # Optional source-address binding, taken from a module-level global.
        module_scope = globals()
        if 'bind_address' in module_scope:
            bind_to = module_scope['bind_address']
            wget_args.extend(['--bind-address', bind_to])
            print('')
            print('*** Wget will bind address at {0} ***'.format(bind_to))
            print('')

        return realize(wget_args, item)
Exemple #23
0
              max_tries=2,
              accept_on_exit_code=[0, 4, 8],
              env={
                  'item_dir': ItemValue('item_dir'),
                  'item_value': ItemValue('item_value'),
                  'item_type': ItemValue('item_type'),
                  'warc_file_base': ItemValue('warc_file_base')
              }),
 PrepareStatsForTracker(
     defaults={
         'downloader': downloader,
         'version': VERSION
     },
     file_groups={
         'data': [
             ItemInterpolation('%(item_dir)s/%(warc_file_base)s.warc.gz')
             #ItemInterpolation('%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz')
         ]
     },
     id_function=stats_id_function,
 ),
 MoveFiles(),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=20,
         default='20',
         name='shared:rsync_threads',
         title='Rsync threads',
         description='The maximum number of concurrent uploads.'),
     UploadWithTracker(
Exemple #24
0
                       phantomjs_exe=PHANTOMJS,
                       finished_warcs_dir=os.environ["FINISHED_WARCS_DIR"],
                       warc_max_size=WARC_MAX_SIZE)

# Pipeline definition; the tasks are passed to Pipeline() in this order:
# CheckIP, GetItemFromQueue, StartHeartbeat, SetFetchDepth, PreparePaths,
# WriteInfo, DownloadUrlFile, WgetDownload (run with wpull_args),
# RelabelIfAborted, WriteInfo again, MoveFiles, RsyncUpload (wrapped in
# LimitConcurrent with a limit of 2), StopHeartbeat, MarkItemAsDone.
pipeline = Pipeline(
    CheckIP(),
    GetItemFromQueue(control,
                     pipeline_id,
                     downloader,
                     ao_only=env.get('AO_ONLY'),
                     large=env.get('LARGE')), StartHeartbeat(control),
    SetFetchDepth(), PreparePaths(), WriteInfo(), DownloadUrlFile(control),
    # Any exit code from the download process is accepted (AcceptAny).
    WgetDownload(wpull_args,
                 accept_on_exit_code=AcceptAny(),
                 env={
                     'ITEM_IDENT': ItemInterpolation('%(ident)s'),
                     'LOG_KEY': ItemInterpolation('%(log_key)s'),
                     'REDIS_URL': REDIS_URL,
                     'PATH': os.environ['PATH']
                 }), RelabelIfAborted(control), WriteInfo(), MoveFiles(),
    LimitConcurrent(
        2,
        RsyncUpload(target=RSYNC_URL,
                    target_source_path=ItemInterpolation("%(data_dir)s"),
                    files=ItemValue("all_target_files"),
                    extra_args=['--partial', '--partial-dir', '.rsync-tmp'])),
    StopHeartbeat(), MarkItemAsDone(control, EXPIRE_TIME))


def stop_control():
    #control.flag_logging_thread_for_termination()
Exemple #25
0
     accept_on_exit_code=[0],  # [0, 4, 8],  #changed
     env={
         'item_dir': ItemValue('item_dir'),
         'item_value': ItemValue('item_value'),
         'item_type': ItemValue('item_type'),
         'warc_file_base': ItemValue('warc_file_base'),
         'todo_url_count': ItemValue('todo_url_count'),
     }),
 PrepareStatsForTracker(
     defaults={
         'downloader': downloader,
         'version': VERSION
     },
     file_groups={
         'data': [
             ItemInterpolation(
                 '%(item_dir)s/%(warc_file_base)s.warc.gz')  #TODO ?
         ]
     },
     id_function=stats_id_function,
 ),
 MoveFiles(),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=20,
         default='20',
         name='shared:rsync_threads',
         title='Rsync threads',
         description='The maximum number of concurrent uploads.'),
     UploadWithTracker(
         'http://%s/%s' % (TRACKER_HOST, TRACKER_ID),
    def realize(self, item):
        """Build the wget-at command line for so-net U-Page+ tracker items.

        ``item['item_name']`` may carry several sub-items separated by NUL
        characters.  Each must look like ``userdir:<hostname>/<directory>``;
        any other item type raises ``ValueError``.  A newline-separated copy
        of the item name is stored on ``item`` as ``item_name_newline``.
        Returns the result of the module-level ``realize`` helper.
        """
        wget_args = [
            WGET_AT,
            '-U', USER_AGENT,
            '-nv',
            '--content-on-error',
            '--lua-script', 'so-net-u-page-plus.lua',
            '-o', ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e', 'robots=off',
            '--rotate-dns',
            '--recursive', '--level=inf',
            '--no-parent',
            '--page-requisites',
            '--timeout', '30',
            '--tries', 'inf',
            '--domains', 'upp.so-net.ne.jp',
            '--span-hosts',
            '--waitretry', '30',
            '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header', 'operator: Archive Team',
            '--warc-header', 'x-wget-at-project-version: ' + VERSION,
            '--warc-header', 'x-wget-at-project-name: ' + TRACKER_ID,
            '--warc-dedup-url-agnostic',
        ]

        # The directory is queued in several spellings (trailing slash,
        # index.htm, index.html, bare) because it is unclear which form the
        # server expects.
        url_templates = (
            'http://{}.upp.so-net.ne.jp/{}/',
            'http://{}.upp.so-net.ne.jp/{}/index.htm',
            'http://{}.upp.so-net.ne.jp/{}/index.html',
            'http://{}.upp.so-net.ne.jp/{}',
        )

        for entry in item['item_name'].split('\0'):
            wget_args += [
                '--warc-header',
                'x-wget-at-project-item-name: ' + entry,
                'item-name://' + entry,
            ]
            kind, value = entry.split(':', 1)
            if kind != "userdir":
                raise ValueError('item_type not supported.')

            wget_args += [
                '--warc-header',
                'so-net-u-page-plus-userdir: ' + value,
            ]
            path_parts = value.split("/")
            hostname = path_parts[0]
            user_dir_name = path_parts[1]
            for template in url_templates:
                wget_args.append(template.format(hostname, user_dir_name))

        item['item_name_newline'] = item['item_name'].replace('\0', '\n')

        # Optional source-address binding, taken from a module-level global.
        module_scope = globals()
        if 'bind_address' in module_scope:
            bind_to = module_scope['bind_address']
            wget_args.extend(['--bind-address', bind_to])
            print('')
            print('*** Wget will bind address at {0} ***'.format(bind_to))
            print('')

        return realize(wget_args, item)
Exemple #27
0
                l.split('"')[1] for l in lines
                if l.strip() and not "schedule=" in l)
        except KeyboardInterrupt:
            raise
        except:
            print traceback.format_exc()
            print "Continuing anyway..."
            usernames = []
        with open("%(data_dir)s/%(warc_file_base)s.friends" % item, "wb") as f:
            f.write("\n".join(usernames) + "\n")


wget_args = [
    WGET_LUA,
    "-U",
    ItemInterpolation("%(user_agent)s"),
    "-nv",
    "-o",
    ItemInterpolation("%(item_dir)s/wget.log"),
    "--lua-script",
    "wretch.lua",
    "--no-check-certificate",
    "--output-document",
    ItemInterpolation("%(item_dir)s/wget.tmp"),
    "--truncate-output",
    "-e",
    "robots=off",
    "--no-cookies",
    "--rotate-dns",
    "--recursive",
    "--level=inf",
 CheckIP(),
 GetItemFromTracker("http://%s/%s" % (TRACKER_HOST, TRACKER_ID), downloader,
                    VERSION),
 PrepareDirectories(warc_prefix="livejournaldisco"),
 ExternalProcess('Scraper',
                 CustomProcessArgs(),
                 max_tries=2,
                 accept_on_exit_code=[0],
                 env={"item_dir": ItemValue("item_dir")}),
 PrepareStatsForTracker(
     defaults={
         "downloader": downloader,
         "version": VERSION
     },
     file_groups={
         "data": [ItemInterpolation("%(item_dir)s/%(warc_file_base)s.txt")]
     },
     id_function=stats_id_function,
 ), MoveFiles(),
 LimitConcurrent(
     NumberConfigValue(
         min=1,
         max=4,
         default="1",
         name="shared:rsync_threads",
         title="Rsync threads",
         description="The maximum number of concurrent uploads."),
     UploadWithTracker(
         "http://%s/%s" % (TRACKER_HOST, TRACKER_ID),
         downloader=downloader,
         version=VERSION,
Exemple #29
0
    def realize(self, item):
        """Build the wget-lua argument list for one twitpic item and realize it.

        ``item['item_name']`` must have the form ``<type>:<value>`` where
        ``<type>`` is one of ``image``, ``user``, ``tag`` or ``event``.  The
        two halves are written back to the item as ``item_type`` and
        ``item_value`` and the seed URLs for that type are appended to the
        argument list.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            AssertionError: if the item name has no ``:`` or its type is not
                one of the four known types.
            Exception: for an unknown type when assertions are disabled.
        """
        wget_args = [
            WGET_LUA,
            "-U", random.choice(USER_AGENTS),
            "-nv",
            "--lua-script", "twitpic.lua",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--no-cookies",
            "--rotate-dns",
            # Recursive download is enabled; the URLs are checked in
            # twitpic.lua.
            "--recursive", "--level=inf",
            "--no-parent",
            "--page-requisites",
            "--timeout", "30",
            "--tries", "inf",
            "--span-hosts",
            "--waitretry", "30",
            "--domains", "twitpic.com,cloudfront.net,twimg.com,amazonaws.com",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "twitpic-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("twitpic-user: %(item_name)s"),
            "--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "--header", "DNT: 1",
            "--header", random.choice(ACCEPT_LANGUAGE_HEADERS),
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        assert item_type in ('image', 'user', 'tag', 'event')

        if item_type == 'image':
            # An image item expands to item_value followed by each single
            # character in [0-9a-z].  ascii_lowercase is used instead of
            # string.lowercase, which is locale-dependent on Python 2 and
            # missing on Python 3.
            suffixes = string.digits + string.ascii_lowercase
            url_templates = (
                'http://twitpic.com/{0}{1}',
                'http://twitpic.com/show/thumb/{0}{1}',
                'http://twitpic.com/show/large/{0}{1}',
                'http://twitpic.com/show/mini/{0}{1}',
            )
            # Outer loop over suffixes keeps the four URLs of one image
            # id adjacent in the argument list.
            for suffix in suffixes:
                for template in url_templates:
                    wget_args.append(template.format(item_value, suffix))

        elif item_type == 'user':
            wget_args.append('http://twitpic.com/photos/{0}'.format(item_value))
            wget_args.append('http://twitpic.com/events/{0}'.format(item_value))
            wget_args.append('http://twitpic.com/places/{0}'.format(item_value))
            wget_args.append('http://twitpic.com/faces/{0}'.format(item_value))
            wget_args.append('http://api.twitpic.com/2/users/show.json?username={0}'.format(item_value))
            wget_args.append('http://api.twitpic.com/2/places/show.json?user={0}'.format(item_value))
            wget_args.append('http://api.twitpic.com/2/events/show.json?user={0}'.format(item_value))
        elif item_type == 'tag':
            wget_args.append('http://twitpic.com/tag/{0}'.format(item_value))
            wget_args.append('http://api.twitpic.com/2/tags/show.json?tag={0}'.format(item_value))
            wget_args.append('http://twitpic.com/tag/{0}.json'.format(item_value))
        elif item_type == 'event':
            wget_args.append('http://api.twitpic.com/2/event/show.json?id={0}'.format(item_value))
        else:
            # Only reachable when assertions are stripped (python -O).
            raise Exception('Unknown item')

        # An optional module-level ``bind_address`` global selects the local
        # address wget binds to.
        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)
Exemple #30
0
    def realize(self, item):
        """Build the wget-lua argument list for one gfycat item and realize it.

        ``item['item_name']`` must have the form ``<type>:<value>``; the two
        halves are written back to the item as ``item_type`` and
        ``item_value``.  Only types starting with ``AdjAdj`` are handled: one
        gfycat API URL is queued per entry of the module-level ``animals``
        sequence, built as ``<value>`` followed by the capitalized entry.

        Also sets ``item['version']`` to the number of queued URLs (int) and
        ``item['todo_url_count']`` to the same number as a string.

        Returns:
            The result of the module-level ``realize(wget_args, item)``.

        Raises:
            AssertionError: if the item name contains no ``:``.
            Exception: if the item type does not start with ``AdjAdj``.
        """
        # --recursive, --no-parent, --page-requisites, --domains and
        # --span-hosts are not passed; the only URLs handed to wget on the
        # command line are the ones appended further down.
        wget_args = [
            WGET_LUA,
            '-U',
            USER_AGENT,
            '-nv',
            '--no-cookies',
            '--lua-script',
            'gfycat-disc.lua',
            '-o',
            ItemInterpolation('%(item_dir)s/wget.log'),
            '--no-check-certificate',
            '--output-document',
            ItemInterpolation('%(item_dir)s/wget.tmp'),
            '--truncate-output',
            '-e',
            'robots=off',
            '--rotate-dns',
            '--timeout',
            '30',
            '--tries',
            'inf',
            '--waitretry',
            '30',
            '--warc-file',
            ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
            '--warc-header',
            'operator: Archive Team',
        ]

        item_name = item['item_name']
        assert ':' in item_name
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        if not item_type.startswith('AdjAdj'):
            raise Exception('Unknown item')

        api_prefix = "https://api.gfycat.com/v1/gfycats/" + item_value
        wget_urls = [api_prefix + animal.capitalize() for animal in animals]

        url_count = len(wget_urls)
        item["version"] = url_count
        item["todo_url_count"] = str(url_count)

        print("URIs ToDo: {}".format(url_count))
        if wget_urls:
            wget_args.extend(wget_urls)
        else:
            # Nothing to fetch: pass wget's version flag instead of URLs.
            wget_args.append("-V")

        # An optional module-level ``bind_address`` global selects the local
        # address wget binds to.
        if 'bind_address' in globals():
            wget_args.extend(['--bind-address', globals()['bind_address']])
            print('')
            print('*** Wget will bind address at {0} ***'.format(
                globals()['bind_address']))
            print('')

        return realize(wget_args, item)