Example #1
from _connection import download
from _storage import store_csv

def crawl_post(post, total, append=True):
    success, favs, code = download(source_uri % post['id'])

    if success and favs:
        # favorited_users arrives as one comma-separated string of usernames
        favs = favs['favorited_users'].split(',')
        favs = [{'id': post['id'], 'username': fav} for fav in favs]

        store_csv(favs, fav_file, append, ['id', 'username'])
    else:
        print('post', post['id'], 'failed with code', code)
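The source_uri and fav_file globals that crawl_post reads are not shown in this example; a minimal driver sketch, where the favorites endpoint, the output path, and the posts list are all assumptions:

from tqdm import tqdm

source_uri = 'https://e621.net/favorite/list_users.json?id=%d'  # assumed endpoint
fav_file = '../data/favs.csv'                                   # assumed path

posts = [{'id': 1}, {'id': 2}]  # placeholder posts; only 'id' is required
for i, post in enumerate(tqdm(posts, desc='Crawling')):
    # overwrite on the first call so the CSV header is written exactly once
    crawl_post(post, len(posts), append=(i > 0))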
Example #2
from _connection import download
from _storage import store_csv
from tqdm import tqdm

source_uri = 'https://e621.net/tag/index.json?limit=500&order=name&page=%d'
target_uri = '../data/tags.csv'
last_ids = []
page = 0

with tqdm(desc='Crawling') as bar:
    while True:
        page += 1
        bar.update(1)
        success, tags, code = download(source_uri % page)

        if success and tags:
            # listings shift while crawling; skip tags already seen on the previous page
            tags = [tag for tag in tags if tag['id'] not in last_ids]
            last_ids = [tag['id'] for tag in tags]

            store_csv(tags, target_uri, append=(page > 1),
                      fields=['id', 'name', 'count', 'type'])
        elif success:
            # no more tags, page is empty
            break
        else:
            print('\nFailed to access page %d, code %d' % (page, code))
            continue
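The _connection module is not included in these listings. A minimal sketch of a download helper that matches the (success, data, code) shape used throughout, assuming the requests library:

import requests

def download(uri):
    # returns (success, parsed JSON or None, HTTP status code)
    try:
        response = requests.get(uri, headers={'User-Agent': 'crawler'})
    except requests.RequestException:
        return False, None, 0  # network failure, no status code available
    if response.status_code != 200:
        return False, None, response.status_code
    return True, response.json(), response.status_code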
Example #3
from _connection import download
from _storage import store_csv
from tqdm import tqdm

source_uri = 'https://e621.net/artist/index.json?page=%d&limit=100'
info_target_uri = '../data/artists.info.csv'
urls_target_uri = '../data/artists.urls.csv'
last_ids = []
page = 0

with tqdm(desc='Crawling') as bar:
    while True:
        page += 1
        bar.update(1)
        success, artists, code = download(source_uri % page)

        if success and artists:
            info = [a for a in artists if a['id'] not in last_ids]
            urls = [{'id': a['id'], 'url': u} for a in info for u in a['urls']]
            last_ids = [artist['id'] for artist in info]

            store_csv(info,
                      info_target_uri,
                      append=(page > 1),
                      fields=[
                          'id', 'name', 'other_names', 'group_name',
                          'is_active', 'version', 'updater_id'
                      ])
            store_csv(urls,
                      urls_target_uri,
                      append=(page > 1),
                      fields=['id', 'url'])
        elif success:
            # no more artists, page is empty
            break
        else:
            print('\nFailed to access page %d, code %d' % (page, code))
            continue
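The _storage module is not shown either; a sketch of store_csv consistent with the call sites above (row dicts, target path, append flag, field list), built on csv.DictWriter:

import csv

def store_csv(rows, path, append=True, fields=None):
    # derive the columns from the first row when no field list is given
    fields = fields or (list(rows[0].keys()) if rows else [])
    with open(path, 'a' if append else 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fields,
                                extrasaction='ignore', restval='')
        if not append:
            writer.writeheader()  # fresh file: emit the header row
        writer.writerows(rows)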
Example #4
from _connection import download
from _storage import store_csv
from datetime import datetime
import sys
from tqdm import tqdm

source_uri = 'https://e621.net/user/index.json?page=%d'
target_uri = '../data/users.csv'

page = int(sys.argv[1]) - 1 if len(sys.argv) > 1 else 0
last = int(sys.argv[2]) if len(sys.argv) > 2 else float('inf')

with tqdm(desc='Crawling') as bar:
    while True:
        page += 1
        bar.update(1)
        success, users, code = download(source_uri % page)

        if success and users:
            for user in users:
                # flatten user['stats'] into user
                user.update(user['stats'])
                # reformat timestamp
                user['created_at'] = datetime.strptime(
                    user['created_at'], '%Y-%m-%d %H:%M').strftime('%s')

            users = [user for user in users if user['id'] < last]
            last = users[-1]['id'] if users else last

            store_csv(users,
                      target_uri,
                      append=(page > 1),
                      # the field list is cut off in the source; columns assumed
                      fields=['id', 'name', 'created_at'])
        elif success:
            # no more users, page is empty
            break
        else:
            print('\nFailed to access page %d, code %d' % (page, code))
            continue
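Note that strftime('%s') is a non-portable glibc extension; a portable equivalent for the Unix timestamp, assuming the crawled timestamps are UTC:

from datetime import datetime, timezone

created = datetime.strptime('2010-03-09 14:07', '%Y-%m-%d %H:%M')
unix_ts = str(int(created.replace(tzinfo=timezone.utc).timestamp()))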
Example #5
from _connection import download
from _storage import store_csv
from tqdm import tqdm

source_uri = 'https://e621.net/tag_alias/index.json?approved=true&page=%d'
target_uri = '../data/aliases.csv'
last_ids = []
page = 0

with tqdm(desc='Crawling') as bar:
    while True:
        page += 1
        bar.update(1)
        success, aliases, code = download(source_uri % page)

        if success and aliases:
            aliases = [a for a in aliases if a['id'] not in last_ids]
            last_ids = [alias['id'] for alias in aliases]

            store_csv(aliases,
                      target_uri,
                      append=(page > 1),
                      fields=['id', 'name', 'alias_id'])
        elif success:
            # no more aliases, page is empty
            break
        else:
            print('\nFailed to access page %d, code %d' % (page, code))
            continue
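Examples 2, 3, and 5 repeat the same pagination loop; a hypothetical consolidation (not part of the original), using download and store_csv as above:

from tqdm import tqdm

def crawl_paged(source_uri, target_uri, fields):
    last_ids, page = [], 0
    with tqdm(desc='Crawling') as bar:
        while True:
            page += 1
            bar.update(1)
            success, items, code = download(source_uri % page)
            if success and items:
                # listings shift while crawling; drop ids seen on the previous page
                items = [i for i in items if i['id'] not in last_ids]
                last_ids = [i['id'] for i in items]
                store_csv(items, target_uri, append=(page > 1), fields=fields)
            elif success:
                break  # empty page: no more items
            else:
                print('\nFailed to access page %d, code %d' % (page, code))

For instance, crawl_paged(source_uri, target_uri, ['id', 'name', 'alias_id']) would reproduce example 5.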
Example #6
from _connection import download
from _storage import store_csv
from tqdm import tqdm

source_uri = 'https://e621.net/post/index.json?limit=320&before_id=%s'
target_uri = {
    'kpi': '../data/posts.kpi.csv',
    'content': '../data/posts.content.csv',
    'info': '../data/posts.info.csv',
    'tags': '../data/posts.tags.csv',
    'artists': '../data/posts.artists.csv'
}
last_id = ''  # empty before_id makes the first request start at the newest posts

with tqdm(desc='Crawling') as bar:
    while True:
        bar.update(1)
        success, posts, code = download(source_uri % last_id)

        if success and posts:
            for post in posts:
                # do some preprocessing so the output is more useful:
                # keep only the epoch seconds from the created_at time object
                post['created_at'] = post['created_at']['s']
                # and strip newlines so each description stays on one CSV row
                post['description'] = post['description'].replace('\n', ' ')

            artists = [{
                'id': post['id'],
                'artist': artist
            } for post in posts for artist in post['artist']]
            tags = [{
                'id': post['id'],
                'tag': tag
            } for post in posts for tag in post['tags'].split()]