Ejemplo n.º 1
0
 def handle(self, *args, **options):
     ua = '[adjective][species] (explore621) {}'.format(options['username'])
     started = datetime.now()
     started = started.replace(tzinfo=pytz.UTC)
     ingested = 0
     total_new = 0
     total_updated = 0
     last_id = 0
     tags_added = {}
     sources_added = {}
     artists_added = {}
     self.stdout.write('Refreshing data from e621 starting at {}'.format(
         started.isoformat()))
     self.stdout.write(
         'Fetching {} pages worth of posts at {} per page'.format(
             options['pages'], options['per_page']))
     for i in range(1, options['pages'] + 1):
         time.sleep(1)
         self.stdout.write('--- Fetching page {}'.format(i))
         r = requests.get(
             'https://e621.net/post/index.json',
             {'limit': options['per_page'], 'page': i},
             headers={'user-agent': ua})
         if r.status_code != 200:
             self.stdout.write(
                 self.style.NOTICE('    got {} on page {}, skipping'.format(
                     r.status_code, i)))
             continue
         tags_added = {}
         sources_added = {}
         artists_added = {}
         updated = 0
         new = 0
         for record in r.json():
             if record['id'] > last_id:
                 last_id = record['id']
             created_at = pytz.utc.localize(
                 datetime.fromtimestamp(record['created_at']['s']))
             try:
                 post = Post.objects.get(source_id=record['id'])
                 post.source_id = record['id']
                 post.description = record['description']
                 post.created_at = created_at
                 post.creator_id = record['creator_id']
                 post.author = record['author']
                 post.change = record['change']
                 post.source = record['source']
                 post.score = record['score']
                 post.fav_count = record['fav_count']
                 post.md5 = record['md5']
                 post.file_size = record['file_size']
                 post.file_url = record['file_url']
                 post.width = record['width']
                 post.height = record['height']
                 post.file_ext = record['file_ext']
                 post.preview_url = record['preview_url']
                 post.preview_width = record['preview_width']
                 post.preview_height = record['preview_height']
                 post.sample_url = record['sample_url']
                 post.sample_width = record['sample_width']
                 post.sample_height = record['sample_height']
                 post.rating = record['rating']
                 post.status = record['status']
                 post.has_comments = record['has_comments']
                 post.has_notes = record['has_notes']
                 post.has_children = record['has_children']
                 post.children = record['children']
                 post.parent_id = record['parent_id']
                 updated += 1
             except Post.DoesNotExist:
                 post = Post(
                     source_id=record['id'],
                     description=record['description'],
                     created_at=created_at,
                     creator_id=record['creator_id'],
                     author=record['author'],
                     change=record['change'],
                     source=record['source'],
                     score=record['score'],
                     fav_count=record['fav_count'],
                     md5=record['md5'],
                     file_size=record['file_size'],
                     file_url=record['file_url'],
                     width=record['width'],
                     height=record['height'],
                     file_ext=record['file_ext'],
                     preview_url=record['preview_url'],
                     preview_width=record['preview_width'],
                     preview_height=record['preview_height'],
                     sample_url=record['sample_url'],
                     sample_width=record['sample_width'],
                     sample_height=record['sample_height'],
                     rating=record['rating'],
                     status=record['status'],
                     has_comments=record['has_comments'],
                     has_notes=record['has_notes'],
                     has_children=record['has_children'],
                     children=record['children'],
                     parent_id=record['parent_id'])
                 new += 1
             record_tags = record['tags'].split(' ')
             tags = []
             for record_tag in record_tags:
                 if record_tag in tags_added:
                     tag = tags_added[record_tag]
                 else:
                     try:
                         tag = Tag.objects.get(tag=record_tag)
                     except Tag.DoesNotExist:
                         tag = Tag(tag=record_tag)
                         tag.save()
                     tags_added[record_tag] = tag
                 tags.append(tag)
             artists = []
             for record_artist in record.get('artist', []):
                 if record_artist in artists_added:
                     artist = artists_added[record_artist]
                 else:
                     try:
                         artist = Artist.objects.get(name=record_artist)
                     except Artist.DoesNotExist:
                         artist = Artist(name=record_artist)
                         artist.save()
                     artists_added[record_artist] = artist
                 artists.append(artist)
             sources = []
             for record_source in record.get('sources', []):
                 if record_source in sources_added:
                     source = sources_added[record_source]
                 else:
                     try:
                         source = Source.objects.get(url=record_source)
                     except Source.DoesNotExist:
                         source = Source(url=record_source)
                         source.save()
                     sources_added[record_source] = source
                 sources.append(source)
             try:
                 post.save()
             except IntegrityError:
                 continue
             post.artists.set(artists)
             post.sources.set(sources)
             post.tags.set(tags)
             ingested += 1
         total_new += new
         total_updated += updated
         self.stdout.write(
             self.style.SUCCESS(
                 '    processed page {}; {} new, {} updated'.format(
                     i, new, updated)))
     self.stdout.write(
         self.style.SUCCESS('{} posts ingested ({} new - {} updated)'.format(
             ingested, total_new, total_updated)))
     tags_fixed = 0
     fixed_tags = []
     self.stdout.write('Fixing typeless tags')
     for tag in Tag.objects.filter(tag_type=-1):
         self.stdout.write('--- Fixing tag {}'.format(tag.tag))
         r = requests.get(
             'https://e621.net/tag/show.json',
             params={'name': tag.tag},
             headers={'user-agent': '[adjective][species]'})
         if 'type' not in r.json():
             self.stdout.write(
                 self.style.NOTICE('    not fixing {}'.format(tag.tag)))
             continue
         tag.tag_type = r.json()['type']
         tag.save()
         tags_fixed += 1
         fixed_tags.append(tag.tag)
         self.stdout.write(
             self.style.SUCCESS('    fixed {} ({})'.format(
                 tag.tag, tag.get_tag_type_display())))
         time.sleep(0.7)
     self.stdout.write(
         self.style.SUCCESS('{} tags fixed'.format(tags_fixed)))
     empty = Tag.objects.annotate(Count('post')).filter(post__count=0)
     tags_deleted = 0
     deleted_tags = []
     self.stdout.write('Deleting empty tags')
     for tag in empty:
         self.stdout.write(
             self.style.NOTICE('--- deleting {}'.format(tag.tag)))
         deleted_tags.append(tag.tag)
         tag.delete()
         tags_deleted += 1
     self.stdout.write(
         self.style.SUCCESS('{} empty tags deleted'.format(tags_deleted)))
     self.stdout.write('Deleting empty sources')
     sources_deleted = len([
         s.delete() for s in
         Source.objects.annotate(Count('post')).filter(post__count=0)])
     self.stdout.write(
         self.style.NOTICE('--- {} sources deleted'.format(
             sources_deleted)))
     self.stdout.write('Deleting empty artists')
     artists_deleted = len([
         a.delete() for a in
         Artist.objects.annotate(Count('post')).filter(post__count=0)])
     self.stdout.write(
         self.style.NOTICE('--- {} artists deleted'.format(
             artists_deleted)))
     log = IngestLog(
         started=started,
         records_ingested=ingested,
         new=total_new,
         updated=total_updated,
         last_id=last_id,
         fixed_tags=' '.join(fixed_tags),
         deleted_tags=' '.join(deleted_tags),
         sources_deleted=sources_deleted,
         artists_deleted=artists_deleted)
     log.save()
     self.stdout.write(
         self.style.SUCCESS('Finished refreshing in {}'.format(
             str(log.finished - log.started))))
Ejemplo n.º 2
0
 def handle(self, *args, **options):
     ua = '[adjective][species] (explore621) {}'.format(options['username'])
     started = datetime.now()
     self.stdout.write(
         'Performing a full refresh from e621 starting at {}'.format(
             started.isoformat()))
     max_id = Post.objects.aggregate(mid=Max('source_id'))['mid']
     min_id = Post.objects.aggregate(mid=Min('source_id'))['mid']
     total_added = 0
     while max_id > min_id:
         time.sleep(1)
         r = requests.get('https://e621.net/post/index.json', {
             'limit': 320,
             'before_id': max_id
         },
                          headers={'user-agent': ua})
         if r.status_code != 200:
             self.stdout.write(
                 self.style.NOTICE(
                     '    got {} on before_id {}, skipping'.format(
                         r.status_code, max_id)))
             continue
         skipped = 0
         added = 0
         for record in r.json():
             max_id = record['id']
             created_at = pytz.utc.localize(
                 datetime.fromtimestamp(record['created_at']['s']))
             if Post.objects.filter(source_id=record['id']).count() != 0:
                 skipped += 1
                 continue
             post = Post(source_id=record['id'],
                         description=record['description'],
                         created_at=created_at,
                         creator_id=record['creator_id'],
                         author=record['author'],
                         change=record['change'],
                         source=record['source'],
                         score=record['score'],
                         fav_count=record['fav_count'],
                         md5=record['md5'],
                         file_size=record['file_size'],
                         file_url=record['file_url'],
                         width=record['width'],
                         height=record['height'],
                         file_ext=record['file_ext'],
                         preview_url=record['preview_url'],
                         preview_width=record['preview_width'],
                         preview_height=record['preview_height'],
                         sample_url=record['sample_url'],
                         sample_width=record['sample_width'],
                         sample_height=record['sample_height'],
                         rating=record['rating'],
                         status=record['status'],
                         has_comments=record['has_comments'],
                         has_notes=record['has_notes'],
                         has_children=record['has_children'],
                         children=record['children'],
                         parent_id=record['parent_id'])
             record_tags = record['tags'].split(' ')
             tags = []
             for record_tag in record_tags:
                 try:
                     tag = Tag.objects.get(tag=record_tag)
                 except Tag.DoesNotExist:
                     tag = Tag(tag=record_tag)
                     tag.save()
                 tags.append(tag)
             artists = []
             for record_artist in record.get('artist', []):
                 try:
                     artist = Artist.objects.get(name=record_artist)
                 except Artist.DoesNotExist:
                     artist = Artist(name=record_artist)
                     artist.save()
                 artists.append(artist)
             sources = []
             for record_source in record.get('sources', []):
                 try:
                     source = Source.objects.get(url=record_source)
                 except Source.DoesNotExist:
                     source = Source(url=record_source)
                     source.save()
                 sources.append(source)
             try:
                 post.save()
             except IntegrityError:
                 continue
             post.artists.set(artists)
             post.sources.set(sources)
             post.tags.set(tags)
             added += 1
         self.stdout.write(
             self.style.SUCCESS(
                 'Ending at {}: {} added / {} skipped'.format(
                     max_id, added, skipped)))
         total_added += added
     self.stdout.write('Fixing typeless tags')
     for tag in Tag.objects.filter(tag_type=-1):
         self.stdout.write('--- Fixing tag {}'.format(tag.tag))
         r = requests.get('https://e621.net/tag/show.json',
                          params={'name': tag.tag},
                          headers={'user-agent': '[adjective][species]'})
         if 'type' not in r.json():
             self.stdout.write(
                 self.style.NOTICE('    not fixing {}'.format(tag.tag)))
             continue
         tag.tag_type = r.json()['type']
         tag.save()
         tags_fixed += 1
         fixed_tags.append(tag.tag)
         self.stdout.write(
             self.style.SUCCESS('    fixed {} ({})'.format(
                 tag.tag, tag.get_tag_type_display())))
         time.sleep(0.7)
     self.stdout.write(
         self.style.SUCCESS('{} tags fixed'.format(tags_fixed)))
     self.stdout.write(
         self.style.SUCCESS('Finished full refresh in {}'.format(
             str(datetime.now() - started))))