def get_text(self, total_time, **kwargs):
    """Get the text for printing.

    Total processing time must be provided externally.
    """
    try:
        # Wait/download totals are summed across all scraper threads, so
        # divide by the thread count before expressing them as a share of
        # the wall-clock total_time.
        threads = AppSettings.get('SCRAPER_THREADS_NUMBER')
        wait_percent = round(
            self.get('total_wait_time_with_lock').total_seconds()
            / total_time.total_seconds() * 100 / threads)
        downloading_percent = round(
            self.get('total_download_time').total_seconds()
            / total_time.total_seconds() * 100 / threads)
    except ZeroDivisionError:
        # total_time (or the thread count) was zero. Set to 0 for the
        # statistics. Narrowed from a bare except so real errors propagate.
        wait_percent = 0
        downloading_percent = 0
    return 'Time passed: %s seconds (%s%% waiting, %s%% downloading files) Processed threads: %s Added posts: %s Removed posts: %s Downloaded images: %s Downloaded thumbnails: %s Downloaded threads: %s' % (
        round(total_time.total_seconds(), 2),
        wait_percent,
        downloading_percent,
        self.get('processed_threads'),
        self.get('added_posts'),
        self.get('removed_posts'),
        self.get('downloaded_images'),
        self.get('downloaded_thumbnails'),
        self.get('downloaded_threads'),
    )
def update(self):
    """Call this to update the database.

    Downloads the board catalog, launches the scraper threads and blocks
    until every running thread has finished, then records wait-time stats.
    """
    # Get the catalog from the API.
    try:
        self.catalog = self.get_catalog_json()
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit can
        # still abort the scraper.
        raise ScrapError(
            'Unable to download or parse the catalog data. Board update stopped.'
        )

    # Launch the initial threads. Next ones will be launched automatically.
    for i in range(AppSettings.get('SCRAPER_THREADS_NUMBER')):
        self.launch_thread()

    # Wait for all threads to finish. Poll once a second; the running set
    # is mutated by the worker threads, hence the lock.
    while True:
        time.sleep(1)
        with self.running_threads_lock:
            if not self.running_threads:
                break

    self.stats.add('total_wait_time', self.queuer.get_total_wait_time())
    self.stats.add('total_wait_time_with_lock',
                   self.queuer.get_total_wait_time_with_lock())
def get_url(self, url):
    """Download data from an url.

    Records the elapsed time in the 'total_download_time' statistic.
    """
    started = datetime.datetime.now()
    response = requests.get(
        url, timeout=AppSettings.get('CONNECTION_TIMEOUT'))
    elapsed = datetime.datetime.now() - started
    self.stats.add('total_download_time', elapsed)
    return response
def api_wait(self):
    """Wait in order to satisfy the API rules.

    Serialises API requests through api_wait_lock and accumulates the
    total time spent waiting (including lock contention).
    """
    started = datetime.datetime.now()
    with self.api_wait_lock:
        self.wait(AppSettings.get('API_WAIT'), self.last_api_request)
        self.last_api_request = datetime.datetime.now()
        # NOTE(review): counter updated while still holding the lock --
        # assumed shared between threads; confirm against the class.
        self.total_wait_time_with_lock += datetime.datetime.now() - started
def file_wait(self):
    """Wait in order to satisfy the rules. Used before downloading images.

    Serialises file downloads through file_wait_lock and accumulates the
    total time spent waiting (including lock contention).
    """
    started = datetime.datetime.now()
    with self.file_wait_lock:
        self.wait(AppSettings.get('FILE_WAIT'), self.last_file_request)
        self.last_file_request = datetime.datetime.now()
        # NOTE(review): counter updated while still holding the lock --
        # assumed shared between threads; confirm against the class.
        self.total_wait_time_with_lock += datetime.datetime.now() - started
def get_stats(**kwargs):
    """Build a statistics context dict, optionally scoped to a board/thread.

    Keyword args:
        board: board name to filter on (optional).
        thread: thread number to filter on (optional).

    Returns a dict with chart data and post/thread counters.
    """
    board_name = kwargs.get('board', None)
    thread_number = kwargs.get('thread', None)

    context = {}

    # This time is used when selecting data for a chart and recent posts.
    # It is supposed to prevent drawing too much data on the chart and
    # ensures correct results when calculating posts per hour (saved
    # threads which do not get deleted would alter the results).
    timespan = AppSettings.get('RECENT_POSTS_AGE')

    queryset_posts = Post.objects
    queryset_threads = Thread.objects

    if board_name is not None:
        queryset_posts = queryset_posts.filter(thread__board=board_name)
        queryset_threads = queryset_threads.filter(board=board_name)

    if thread_number is not None:
        queryset_posts = queryset_posts.filter(thread__number=thread_number)
        queryset_threads = queryset_threads.filter(number=thread_number)

    # Increase accuracy in thread mode.
    if board_name and thread_number:
        times = queryset_threads.annotate(
            first=Min('post__time'), last=Max('post__time')).first()
        # Guard: a missing thread or one with no posts yields None
        # aggregates, which previously raised a TypeError here.
        if times is not None and times.first is not None \
                and times.last is not None:
            timespan = (times.last - times.first).total_seconds() / 3600

    # Base this on the time of the last matched post. It is possible to get
    # an empty chart in the older threads if this is based on the current
    # time. queryset.last() returns None when nothing matches, which
    # previously raised an AttributeError.
    last_post = queryset_posts.last()
    if last_post is not None:
        timespan_time = last_post.time - datetime.timedelta(hours=timespan)
    else:
        # No matching posts at all; anchor on "now" so the recent-post
        # filters below simply match nothing. NOTE(review): assumes a naive
        # datetime is acceptable for this comparison -- confirm with USE_TZ.
        timespan_time = datetime.datetime.now() - datetime.timedelta(
            hours=timespan)

    # Prepare data for the chart. It is necessary to convert it to a format
    # required by Google Charts.
    posts = queryset_posts.filter(time__gt=timespan_time).extra({
        'date': 'date("time")',
        'hour': "date_part(\'hour\', \"time\")"
    }).values('date', 'hour').order_by('date', 'hour').annotate(
        amount=Count('id')).filter()
    context['chart_data'] = get_posts_chart_data(posts)

    # Posts.
    context['total_posts'] = queryset_posts.count()
    context['total_image_posts'] = queryset_posts.exclude(image=None).count()
    context['recent_posts'] = queryset_posts.filter(
        time__gt=timespan_time).count()
    context['recent_posts_timespan'] = timespan

    # Threads.
    context['total_threads'] = queryset_threads.count()

    return context
def handle(self, *args, **options):
    """Update every active board, recording an Update row per board."""
    # Prevent multiple instances. Apparently fcntl.lockf is very useful and
    # does completely nothing.
    me = singleton.SingleInstance()

    boards = Board.objects.filter(active=True)

    # Show progress?
    progress = bool(options['progress'])

    # Get new data for each board.
    for board in boards:
        # Info.
        processing_start = datetime.datetime.utcnow().replace(tzinfo=utc)
        update = Update.objects.create(
            board=board,
            start=processing_start,
            used_threads=AppSettings.get('SCRAPER_THREADS_NUMBER'))

        # Pre-bind so the finally/print paths cannot hit a NameError when
        # the BoardScraper constructor itself raises.
        scraper = None
        processing_time = None

        try:
            # Actual update.
            scraper = BoardScraper(board, progress=progress)
            scraper.update()

            # Info.
            update.status = Update.COMPLETED
        except Exception as e:
            sys.stderr.write('%s\n' % (e))
        finally:
            # Info. Best-effort: never let stats bookkeeping prevent the
            # record from being saved.
            try:
                if update.status != Update.COMPLETED:
                    update.status = Update.FAILED
                processing_end = datetime.datetime.utcnow().replace(
                    tzinfo=utc)
                processing_time = processing_end - processing_start
                update.end = processing_end
                if scraper is not None:
                    update = scraper.stats.add_to_record(
                        update, processing_time)
            except Exception as e:
                sys.stderr.write('%s\n' % (e))
            finally:
                update.save()

        # Everything below is just info. Skip it if the scraper never got
        # constructed (there are no stats to print in that case).
        if scraper is not None and processing_time is not None:
            print('%s Board: %s %s' % (
                datetime.datetime.now(),
                board,
                scraper.stats.get_text(processing_time),
            ))
def get_stats(**kwargs):
    """Build a statistics context dict, optionally scoped to a board/thread.

    Keyword args:
        board: board name to filter on (optional).
        thread: thread number to filter on (optional).

    Returns a dict with chart data and post/thread counters.
    """
    board_name = kwargs.get('board', None)
    thread_number = kwargs.get('thread', None)

    context = {}

    # This time is used when selecting data for a chart and recent posts.
    # It is supposed to prevent drawing too much data on the chart and
    # ensures correct results when calculating posts per hour (saved
    # threads which do not get deleted would alter the results).
    timespan = AppSettings.get('RECENT_POSTS_AGE')

    queryset_posts = Post.objects
    queryset_threads = Thread.objects

    if board_name is not None:
        queryset_posts = queryset_posts.filter(thread__board=board_name)
        queryset_threads = queryset_threads.filter(board=board_name)

    if thread_number is not None:
        queryset_posts = queryset_posts.filter(thread__number=thread_number)
        queryset_threads = queryset_threads.filter(number=thread_number)

    # Increase accuracy in thread mode.
    if board_name and thread_number:
        times = queryset_threads.annotate(
            first=Min('post__time'), last=Max('post__time')).first()
        # Guard: a missing thread or one with no posts yields None
        # aggregates, which previously raised a TypeError here.
        if times is not None and times.first is not None \
                and times.last is not None:
            timespan = (times.last - times.first).total_seconds() / 3600

    # Base this on the time of the last matched post. It is possible to get
    # an empty chart in the older threads if this is based on the current
    # time. queryset.last() returns None when nothing matches, which
    # previously raised an AttributeError.
    last_post = queryset_posts.last()
    if last_post is not None:
        timespan_time = last_post.time - datetime.timedelta(hours=timespan)
    else:
        # No matching posts at all; anchor on "now" so the recent-post
        # filters below simply match nothing. NOTE(review): assumes a naive
        # datetime is acceptable for this comparison -- confirm with USE_TZ.
        timespan_time = datetime.datetime.now() - datetime.timedelta(
            hours=timespan)

    # Prepare data for the chart. It is necessary to convert it to a format
    # required by Google Charts.
    posts = queryset_posts.filter(time__gt=timespan_time).extra({
        'date': 'date("time")',
        'hour': "date_part(\'hour\', \"time\")"
    }).values('date', 'hour').order_by('date', 'hour').annotate(
        amount=Count('id')).filter()
    context['chart_data'] = get_posts_chart_data(posts)

    # Posts.
    context['total_posts'] = queryset_posts.count()
    context['total_image_posts'] = queryset_posts.exclude(image=None).count()
    context['recent_posts'] = queryset_posts.filter(
        time__gt=timespan_time).count()
    context['recent_posts_timespan'] = timespan

    # Threads.
    context['total_threads'] = queryset_threads.count()

    return context
def add_to_record(self, record, total_time, **kwargs): """Save the statistics in the database.""" used_threads = kwargs.get('used_threads', AppSettings.get('SCRAPER_THREADS_NUMBER')) wait_time = self.get('total_wait_time_with_lock').total_seconds() / used_threads download_time = self.get('total_download_time').total_seconds() / used_threads record.total_time = total_time.total_seconds() record.wait_time = wait_time record.download_time = download_time record.processed_threads = self.get('processed_threads') record.added_posts = self.get('added_posts') record.removed_posts = self.get('removed_posts') record.downloaded_images = self.get('downloaded_images') record.downloaded_thumbnails = self.get('downloaded_thumbnails') record.downloaded_threads = self.get('downloaded_threads') return record
def add_to_record(self, record, total_time, **kwargs): """Save the statistics in the database.""" used_threads = kwargs.get('used_threads', AppSettings.get('SCRAPER_THREADS_NUMBER')) wait_time = self.get( 'total_wait_time_with_lock').total_seconds() / used_threads download_time = self.get( 'total_download_time').total_seconds() / used_threads record.total_time = total_time.total_seconds() record.wait_time = wait_time record.download_time = download_time record.processed_threads = self.get('processed_threads') record.added_posts = self.get('added_posts') record.removed_posts = self.get('removed_posts') record.downloaded_images = self.get('downloaded_images') record.downloaded_thumbnails = self.get('downloaded_thumbnails') record.downloaded_threads = self.get('downloaded_threads') return record
def update(self):
    """Call this to update the database.

    Downloads the board catalog, launches the scraper threads and blocks
    until every running thread has finished, then records wait-time stats.
    """
    # Get the catalog from the API.
    try:
        self.catalog = self.get_catalog_json()
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit can
        # still abort the scraper.
        raise ScrapError(
            'Unable to download or parse the catalog data. Board update stopped.'
        )

    # Launch the initial threads. Next ones will be launched automatically.
    for i in range(AppSettings.get('SCRAPER_THREADS_NUMBER')):
        self.launch_thread()

    # Wait for all threads to finish. Poll once a second; the running set
    # is mutated by the worker threads, hence the lock.
    while True:
        time.sleep(1)
        with self.running_threads_lock:
            if not self.running_threads:
                break

    self.stats.add('total_wait_time', self.queuer.get_total_wait_time())
    self.stats.add('total_wait_time_with_lock',
                   self.queuer.get_total_wait_time_with_lock())
import os, time from django.db import models from django.db.models import Max, Min, Count, F from django.core.urlresolvers import reverse from django.core.files.storage import FileSystemStorage from archive_chan.settings import AppSettings # This overrides the global media url. fs = FileSystemStorage(base_url=AppSettings.get('MEDIA_URL')) class Board(models.Model): name = models.CharField(max_length=255, primary_key = True) active = models.BooleanField( default=True, help_text='Should this board be updated with new posts?' ) store_threads_for = models.IntegerField( default=48, help_text='[hours] After that much time passes from the last reply in a NOT SAVED thread it will be deleted. Set to 0 to preserve threads forever.' ) replies_threshold = models.IntegerField( default=20, help_text='Store threads after they reach that many replies.' ) class Meta: ordering = ['name'] def __str__(self):
import os
import time

from django.db import models
from django.db.models import Max, Min, Count, F
from django.core.urlresolvers import reverse
from django.core.files.storage import FileSystemStorage

from archive_chan.settings import AppSettings

# This overrides the global media url.
fs = FileSystemStorage(base_url=AppSettings.get('MEDIA_URL'))


class Board(models.Model):
    """A single imageboard tracked by the archive."""

    # Board short name, e.g. 'g'; doubles as the primary key.
    name = models.CharField(max_length=255, primary_key=True)
    # Should the scraper keep pulling new posts for this board?
    active = models.BooleanField(
        default=True,
        help_text='Should this board be updated with new posts?')
    store_threads_for = models.IntegerField(
        default=48,
        help_text=
        '[hours] After that much time passes from the last reply in a NOT SAVED thread it will be deleted. Set to 0 to preserve threads forever.'
    )
    replies_threshold = models.IntegerField(
        default=20,
        help_text='Store threads after they reach that many replies.')

    class Meta:
        ordering = ['name']

    def __str__(self):
        # The former format() wrapper was redundant: '%' already yields a
        # str and format() with a single argument returns it unchanged.
        return '/%s/' % self.name
def get_text(self, total_time, **kwargs):
    """Get the text for printing.

    Total processing time must be provided externally.
    """
    try:
        # Wait/download totals are summed across all scraper threads, so
        # divide by the thread count before expressing them as a share of
        # the wall-clock total_time.
        threads = AppSettings.get('SCRAPER_THREADS_NUMBER')
        wait_percent = round(
            self.get('total_wait_time_with_lock').total_seconds()
            / total_time.total_seconds() * 100 / threads)
        downloading_percent = round(
            self.get('total_download_time').total_seconds()
            / total_time.total_seconds() * 100 / threads)
    except ZeroDivisionError:
        # total_time (or the thread count) was zero. Set to 0 for the
        # statistics. Narrowed from a bare except so real errors propagate.
        wait_percent = 0
        downloading_percent = 0
    return 'Time passed: %s seconds (%s%% waiting, %s%% downloading files) Processed threads: %s Added posts: %s Removed posts: %s Downloaded images: %s Downloaded thumbnails: %s Downloaded threads: %s' % (
        round(total_time.total_seconds(), 2),
        wait_percent,
        downloading_percent,
        self.get('processed_threads'),
        self.get('added_posts'),
        self.get('removed_posts'),
        self.get('downloaded_images'),
        self.get('downloaded_thumbnails'),
        self.get('downloaded_threads'),
    )
from django.conf.urls import patterns, url from django.views.decorators.csrf import ensure_csrf_cookie from django.views.decorators.cache import cache_page import archive_chan.views.core as core import archive_chan.views.api as api from archive_chan.settings import AppSettings cache = AppSettings.get('VIEW_CACHE_AGE') cache_static = AppSettings.get('VIEW_CACHE_AGE_STATIC') urlpatterns = patterns('', # Global. url(r'^$', cache_page(cache)(core.IndexView.as_view()), name='index'), url(r'^stats/$', cache_page(cache_static)(core.StatsView.as_view()), name='stats'), url(r'^gallery/$', cache_page(cache_static)(core.GalleryView.as_view()), name='gallery'), url(r'^search/$', core.SearchView.as_view(), name='search'), # Board. url(r'^board/(?P<board>[a-z]+)/$', cache_page(cache)(core.BoardView.as_view()), name='board'), url(r'^board/(?P<board>[a-z]+)/stats/$', cache_page(cache_static)(core.StatsView.as_view()), name='board_stats'), url(r'^board/(?P<board>[a-z]+)/gallery/$', cache_page(cache_static)(core.GalleryView.as_view()), name='board_gallery'), url(r'^board/(?P<board>[a-z]+)/search/$', core.SearchView.as_view(), name='board_search'), # Stats. url(r'^board/(?P<board>[a-z]+)/thread/(?P<thread>[0-9]+)/$', ensure_csrf_cookie(core.ThreadView.as_view()), name='thread'), url(r'^board/(?P<board>[a-z]+)/thread/(?P<thread>[0-9]+)/stats/$', cache_page(cache_static)(core.StatsView.as_view()), name='thread_stats'), url(r'^board/(?P<board>[a-z]+)/thread/(?P<thread>[0-9]+)/gallery/$', cache_page(cache_static)(core.GalleryView.as_view()), name='thread_gallery'), url(r'^board/(?P<board>[a-z]+)/thread/(?P<thread>[0-9]+)/search/$', core.SearchView.as_view(), name='thread_search'),
from django.conf.urls import patterns, url from django.views.decorators.csrf import ensure_csrf_cookie from django.views.decorators.cache import cache_page import archive_chan.views.core as core import archive_chan.views.api as api from archive_chan.settings import AppSettings cache = AppSettings.get('VIEW_CACHE_AGE') cache_static = AppSettings.get('VIEW_CACHE_AGE_STATIC') urlpatterns = patterns( '', # Global. url(r'^$', cache_page(cache)(core.IndexView.as_view()), name='index'), url(r'^stats/$', cache_page(cache_static)(core.StatsView.as_view()), name='stats'), url(r'^gallery/$', cache_page(cache_static)(core.GalleryView.as_view()), name='gallery'), url(r'^search/$', core.SearchView.as_view(), name='search'), # Board. url(r'^board/(?P<board>[a-z]+)/$', cache_page(cache)(core.BoardView.as_view()), name='board'), url(r'^board/(?P<board>[a-z]+)/stats/$', cache_page(cache_static)(core.StatsView.as_view()), name='board_stats'),