Esempio n. 1
0
    def get_text(self, total_time, **kwargs):
        """Build the one-line, human-readable summary of the statistics.

        total_time -- duration object providing total_seconds() (for example
                      a datetime.timedelta). It is not tracked by this object,
                      so the caller has to supply it.
        **kwargs   -- accepted but not used by this method.

        Returns the formatted summary string. The waiting and downloading
        percentages are divided by the SCRAPER_THREADS_NUMBER setting and
        rounded; both fall back to 0 when they cannot be computed.
        """
        try:
            elapsed_seconds = total_time.total_seconds()
            threads_number = AppSettings.get('SCRAPER_THREADS_NUMBER')

            wait_percent = round(
                self.get('total_wait_time_with_lock').total_seconds() /
                elapsed_seconds * 100 / threads_number)
            downloading_percent = round(
                self.get('total_download_time').total_seconds() /
                elapsed_seconds * 100 / threads_number)

        except Exception:
            # Typically a division by zero (zero total time). Any ordinary
            # error here only affects the printed percentages, so report 0
            # instead of failing. KeyboardInterrupt and SystemExit are
            # deliberately not swallowed.
            wait_percent = 0
            downloading_percent = 0

        return 'Time passed: %s seconds (%s%% waiting, %s%% downloading files) Processed threads: %s Added posts: %s Removed posts: %s Downloaded images: %s Downloaded thumbnails: %s Downloaded threads: %s' % (
            round(total_time.total_seconds(), 2),
            wait_percent,
            downloading_percent,
            self.get('processed_threads'),
            self.get('added_posts'),
            self.get('removed_posts'),
            self.get('downloaded_images'),
            self.get('downloaded_thumbnails'),
            self.get('downloaded_threads'),
        )
Esempio n. 2
0
    def update(self):
        """Update the database with the current state of the board.

        Downloads the catalog, launches SCRAPER_THREADS_NUMBER initial
        threads, blocks until no running threads are left and finally stores
        the wait times reported by the queuer in the statistics.

        Raises ScrapError when the catalog cannot be downloaded or parsed.
        """
        # Get the catalog from the API.
        try:
            self.catalog = self.get_catalog_json()

        except Exception as e:
            # Chain the original error so the real cause stays visible in the
            # traceback. Only ordinary errors are converted; KeyboardInterrupt
            # and SystemExit propagate unchanged.
            raise ScrapError(
                'Unable to download or parse the catalog data. Board update stopped.'
            ) from e

        # Launch the initial threads. Next ones will be launched automatically.
        for _ in range(AppSettings.get('SCRAPER_THREADS_NUMBER')):
            self.launch_thread()

        # Wait for all threads to finish. The list of running threads is
        # polled once per second under its lock.
        while True:
            time.sleep(1)
            with self.running_threads_lock:
                if not len(self.running_threads):
                    break

        self.stats.add('total_wait_time', self.queuer.get_total_wait_time())
        self.stats.add('total_wait_time_with_lock',
                       self.queuer.get_total_wait_time_with_lock())
Esempio n. 3
0
 def get_url(self, url):
     """Fetch *url* with requests and record how long the request took.

     The CONNECTION_TIMEOUT setting is used as the request timeout. The
     elapsed time is added to the 'total_download_time' statistic only after
     requests.get returns; if it raises, nothing is recorded.

     Returns whatever requests.get returned.
     """
     started_at = datetime.datetime.now()
     timeout = AppSettings.get('CONNECTION_TIMEOUT')
     response = requests.get(url, timeout=timeout)
     elapsed = datetime.datetime.now() - started_at
     self.stats.add('total_download_time', elapsed)
     return response
Esempio n. 4
0
    def api_wait(self):
        """Block until the next API request is allowed by the API rules.

        While holding api_wait_lock, waits according to the API_WAIT setting
        and the time of the last API request, then stamps the current time as
        the new last API request. The time spent in this method, including
        the time needed to obtain the lock, is added to
        total_wait_time_with_lock.
        """
        started_at = datetime.datetime.now()

        with self.api_wait_lock:
            delay = AppSettings.get('API_WAIT')
            self.wait(delay, self.last_api_request)
            self.last_api_request = datetime.datetime.now()

        # NOTE(review): this counter is updated after the lock is released.
        elapsed = datetime.datetime.now() - started_at
        self.total_wait_time_with_lock += elapsed
Esempio n. 5
0
    def file_wait(self):
        """Block until the next file download is allowed by the rules.

        Used before downloading images. While holding file_wait_lock, waits
        according to the FILE_WAIT setting and the time of the last file
        request, then stamps the current time as the new last file request.
        The time spent in this method, including the time needed to obtain
        the lock, is added to total_wait_time_with_lock.
        """
        started_at = datetime.datetime.now()

        with self.file_wait_lock:
            delay = AppSettings.get('FILE_WAIT')
            self.wait(delay, self.last_file_request)
            self.last_file_request = datetime.datetime.now()

        # NOTE(review): this counter is updated after the lock is released.
        elapsed = datetime.datetime.now() - started_at
        self.total_wait_time_with_lock += elapsed
Esempio n. 6
0
def get_stats(**kwargs):
    """Collect post and thread statistics, optionally narrowed down.

    Keyword arguments:
    board  -- restrict the results to this board (omit or None for all).
    thread -- restrict the results to this thread number (omit or None for
              all).

    Returns a dict with the keys 'chart_data', 'total_posts',
    'total_image_posts', 'recent_posts', 'recent_posts_timespan' and
    'total_threads'.
    """
    board = kwargs.get('board')
    thread = kwargs.get('thread')

    # Number of hours of recent activity to look at. Limits the amount of data
    # drawn on the chart and keeps the posts-per-hour figure meaningful (saved
    # threads which never get deleted would otherwise skew it).
    hours = AppSettings.get('RECENT_POSTS_AGE')

    posts_qs = Post.objects
    threads_qs = Thread.objects

    if board is not None:
        posts_qs = posts_qs.filter(thread__board=board)
        threads_qs = threads_qs.filter(board=board)

    if thread is not None:
        posts_qs = posts_qs.filter(thread__number=thread)
        threads_qs = threads_qs.filter(number=thread)

    # In thread mode use the real lifetime of the thread (first to last post)
    # instead of the configured window.
    if board and thread:
        bounds = threads_qs.annotate(
            first=Min('post__time'),
            last=Max('post__time'),
        ).first()
        hours = (bounds.last - bounds.first).total_seconds() / 3600

    # The window ends at the last matched post rather than at the current
    # time; otherwise old threads could produce an empty chart.
    newest_post = posts_qs.last()
    cutoff = newest_post.time - datetime.timedelta(hours=hours)
    recent_qs = posts_qs.filter(time__gt=cutoff)

    # Posts per (date, hour) bucket, later converted to the format required by
    # Google Charts.
    bucket_columns = {
        'date': 'date("time")',
        'hour': "date_part(\'hour\', \"time\")",
    }
    hourly = (
        recent_qs
        .extra(bucket_columns)
        .values('date', 'hour')
        .order_by('date', 'hour')
        .annotate(amount=Count('id'))
        .filter()
    )

    return {
        'chart_data': get_posts_chart_data(hourly),
        # Posts.
        'total_posts': posts_qs.count(),
        'total_image_posts': posts_qs.exclude(image=None).count(),
        'recent_posts': recent_qs.count(),
        'recent_posts_timespan': hours,
        # Threads.
        'total_threads': threads_qs.count(),
    }
Esempio n. 7
0
    def file_wait(self):
        """Block until the next file download is allowed by the rules.

        Used before downloading images. While holding file_wait_lock, waits
        according to the FILE_WAIT setting and the time of the last file
        request, then stamps the current time as the new last file request.
        The time spent in this method, including the time needed to obtain
        the lock, is added to total_wait_time_with_lock.
        """
        started_at = datetime.datetime.now()

        with self.file_wait_lock:
            delay = AppSettings.get('FILE_WAIT')
            self.wait(delay, self.last_file_request)
            self.last_file_request = datetime.datetime.now()

        # NOTE(review): this counter is updated after the lock is released.
        elapsed = datetime.datetime.now() - started_at
        self.total_wait_time_with_lock += elapsed
Esempio n. 8
0
    def api_wait(self):
        """Block until the next API request is allowed by the API rules.

        While holding api_wait_lock, waits according to the API_WAIT setting
        and the time of the last API request, then stamps the current time as
        the new last API request. The time spent in this method, including
        the time needed to obtain the lock, is added to
        total_wait_time_with_lock.
        """
        started_at = datetime.datetime.now()

        with self.api_wait_lock:
            delay = AppSettings.get('API_WAIT')
            self.wait(delay, self.last_api_request)
            self.last_api_request = datetime.datetime.now()

        # NOTE(review): this counter is updated after the lock is released.
        elapsed = datetime.datetime.now() - started_at
        self.total_wait_time_with_lock += elapsed
Esempio n. 9
0
    def handle(self, *args, **options):
        """Update every active board and record the outcome of each run.

        For each active board an Update record is created, the board is
        scraped, and the record is completed with the final status, the end
        time and (when a scraper was created) the collected statistics. A
        summary line per board is printed to stdout; errors are written to
        stderr and do not stop the remaining boards.

        options -- must contain 'progress'; its truthiness is passed to the
                   scraper as the progress flag.
        """
        # Prevent multiple instances. Apparently fcntl.lockf is very useful and does completely nothing.
        # NOTE(review): the reference is kept although unused; presumably the
        # single-instance guard lives as long as this object does.
        me = singleton.SingleInstance()

        boards = Board.objects.filter(active=True)

        # Show progress?
        progress = bool(options['progress'])

        # Get new data for each board.
        for board in boards:
            # Info.
            processing_start = datetime.datetime.utcnow().replace(tzinfo=utc)
            update = Update.objects.create(
                board=board,
                start=processing_start,
                used_threads=AppSettings.get('SCRAPER_THREADS_NUMBER'))

            # Reset for every board: if creating the scraper fails, the name
            # must not be unbound (first board) or still point at the scraper
            # of the previous board.
            scraper = None

            try:
                # Actual update.
                scraper = BoardScraper(board, progress=progress)
                scraper.update()

                # Info.
                update.status = Update.COMPLETED

            except Exception as e:
                sys.stderr.write('%s\n' % (e))

            finally:
                # Computed outside the inner try so the summary below can
                # always rely on it.
                processing_end = datetime.datetime.utcnow().replace(
                    tzinfo=utc)
                processing_time = processing_end - processing_start

                # Info.
                try:
                    if update.status != Update.COMPLETED:
                        update.status = Update.FAILED

                    update.end = processing_end

                    # Statistics exist only when the scraper was created.
                    if scraper is not None:
                        update = scraper.stats.add_to_record(
                            update, processing_time)

                except Exception as e:
                    sys.stderr.write('%s\n' % (e))

                finally:
                    update.save()

                # Everything below is just info.
                if scraper is not None:
                    print('%s Board: %s %s' % (
                        datetime.datetime.now(),
                        board,
                        scraper.stats.get_text(processing_time),
                    ))
Esempio n. 10
0
def get_stats(**kwargs):
    """Collect post and thread statistics, optionally narrowed down.

    Keyword arguments:
    board  -- restrict the results to this board (omit or None for all).
    thread -- restrict the results to this thread number (omit or None for
              all).

    Returns a dict with the keys 'chart_data', 'total_posts',
    'total_image_posts', 'recent_posts', 'recent_posts_timespan' and
    'total_threads'.
    """
    board = kwargs.get('board')
    thread = kwargs.get('thread')

    # Number of hours of recent activity to look at. Limits the amount of data
    # drawn on the chart and keeps the posts-per-hour figure meaningful (saved
    # threads which never get deleted would otherwise skew it).
    hours = AppSettings.get('RECENT_POSTS_AGE')

    posts_qs = Post.objects
    threads_qs = Thread.objects

    if board is not None:
        posts_qs = posts_qs.filter(thread__board=board)
        threads_qs = threads_qs.filter(board=board)

    if thread is not None:
        posts_qs = posts_qs.filter(thread__number=thread)
        threads_qs = threads_qs.filter(number=thread)

    # In thread mode use the real lifetime of the thread (first to last post)
    # instead of the configured window.
    if board and thread:
        bounds = threads_qs.annotate(
            first=Min('post__time'),
            last=Max('post__time'),
        ).first()
        hours = (bounds.last - bounds.first).total_seconds() / 3600

    # The window ends at the last matched post rather than at the current
    # time; otherwise old threads could produce an empty chart.
    newest_post = posts_qs.last()
    cutoff = newest_post.time - datetime.timedelta(hours=hours)
    recent_qs = posts_qs.filter(time__gt=cutoff)

    # Posts per (date, hour) bucket, later converted to the format required by
    # Google Charts.
    bucket_columns = {
        'date': 'date("time")',
        'hour': "date_part(\'hour\', \"time\")",
    }
    hourly = (
        recent_qs
        .extra(bucket_columns)
        .values('date', 'hour')
        .order_by('date', 'hour')
        .annotate(amount=Count('id'))
        .filter()
    )

    return {
        'chart_data': get_posts_chart_data(hourly),
        # Posts.
        'total_posts': posts_qs.count(),
        'total_image_posts': posts_qs.exclude(image=None).count(),
        'recent_posts': recent_qs.count(),
        'recent_posts_timespan': hours,
        # Threads.
        'total_threads': threads_qs.count(),
    }
Esempio n. 11
0
    def add_to_record(self, record, total_time, **kwargs):
        """Copy the collected statistics onto *record* and return it.

        record     -- object whose statistic attributes are assigned; it is
                      not saved here, that is left to the caller.
        total_time -- duration object providing total_seconds().
        **kwargs   -- 'used_threads' overrides the SCRAPER_THREADS_NUMBER
                      setting as the divisor for the wait and download times.

        Returns the same record object.
        """
        thread_count = kwargs.get(
            'used_threads', AppSettings.get('SCRAPER_THREADS_NUMBER'))

        # Accumulated times are divided by the number of threads.
        per_thread_wait = (
            self.get('total_wait_time_with_lock').total_seconds() / thread_count)
        per_thread_download = (
            self.get('total_download_time').total_seconds() / thread_count)

        record.total_time = total_time.total_seconds()
        record.wait_time = per_thread_wait
        record.download_time = per_thread_download

        # The plain counters share their name with the record attribute.
        counters = (
            'processed_threads',
            'added_posts',
            'removed_posts',
            'downloaded_images',
            'downloaded_thumbnails',
            'downloaded_threads',
        )
        for counter in counters:
            setattr(record, counter, self.get(counter))

        return record
Esempio n. 12
0
    def add_to_record(self, record, total_time, **kwargs):
        """Copy the collected statistics onto *record* and return it.

        record     -- object whose statistic attributes are assigned; it is
                      not saved here, that is left to the caller.
        total_time -- duration object providing total_seconds().
        **kwargs   -- 'used_threads' overrides the SCRAPER_THREADS_NUMBER
                      setting as the divisor for the wait and download times.

        Returns the same record object.
        """
        thread_count = kwargs.get(
            'used_threads', AppSettings.get('SCRAPER_THREADS_NUMBER'))

        # Accumulated times are divided by the number of threads.
        per_thread_wait = (
            self.get('total_wait_time_with_lock').total_seconds() / thread_count)
        per_thread_download = (
            self.get('total_download_time').total_seconds() / thread_count)

        record.total_time = total_time.total_seconds()
        record.wait_time = per_thread_wait
        record.download_time = per_thread_download

        # The plain counters share their name with the record attribute.
        counters = (
            'processed_threads',
            'added_posts',
            'removed_posts',
            'downloaded_images',
            'downloaded_thumbnails',
            'downloaded_threads',
        )
        for counter in counters:
            setattr(record, counter, self.get(counter))

        return record
Esempio n. 13
0
    def update(self):
        """Update the database with the current state of the board.

        Downloads the catalog, launches SCRAPER_THREADS_NUMBER initial
        threads, blocks until no running threads are left and finally stores
        the wait times reported by the queuer in the statistics.

        Raises ScrapError when the catalog cannot be downloaded or parsed.
        """
        # Get the catalog from the API.
        try:
            self.catalog = self.get_catalog_json()

        except Exception as e:
            # Chain the original error so the real cause stays visible in the
            # traceback. Only ordinary errors are converted; KeyboardInterrupt
            # and SystemExit propagate unchanged.
            raise ScrapError('Unable to download or parse the catalog data. Board update stopped.') from e

        # Launch the initial threads. Next ones will be launched automatically.
        for _ in range(AppSettings.get('SCRAPER_THREADS_NUMBER')):
            self.launch_thread()

        # Wait for all threads to finish. The list of running threads is
        # polled once per second under its lock.
        while True:
            time.sleep(1)
            with self.running_threads_lock:
                if not len(self.running_threads):
                    break

        self.stats.add('total_wait_time', self.queuer.get_total_wait_time())
        self.stats.add('total_wait_time_with_lock', self.queuer.get_total_wait_time_with_lock())
Esempio n. 14
0
import os, time

from django.db import models
from django.db.models import Max, Min, Count, F
from django.core.urlresolvers import reverse
from django.core.files.storage import FileSystemStorage

from archive_chan.settings import AppSettings

# File storage whose base URL is taken from the app's MEDIA_URL setting.
# This overrides the global media url.
fs = FileSystemStorage(base_url=AppSettings.get('MEDIA_URL'))

class Board(models.Model):
    name = models.CharField(max_length=255, primary_key = True)
    active = models.BooleanField(
        default=True,
        help_text='Should this board be updated with new posts?'
    )
    store_threads_for = models.IntegerField(
        default=48,
        help_text='[hours] After that much time passes from the last reply in a NOT SAVED thread it will be deleted. Set to 0 to preserve threads forever.'
    )
    replies_threshold = models.IntegerField(
        default=20,
        help_text='Store threads after they reach that many replies.'
    )

    class Meta:
        ordering = ['name']

    def __str__(self):
Esempio n. 15
0
import os, time

from django.db import models
from django.db.models import Max, Min, Count, F
from django.core.urlresolvers import reverse
from django.core.files.storage import FileSystemStorage

from archive_chan.settings import AppSettings

# File storage whose base URL is taken from the app's MEDIA_URL setting.
# This overrides the global media url.
fs = FileSystemStorage(base_url=AppSettings.get('MEDIA_URL'))


class Board(models.Model):
    """A board known to the archive, identified by its name."""

    # Board name; doubles as the primary key.
    name = models.CharField(max_length=255, primary_key=True)
    # Whether the board is updated with new posts.
    active = models.BooleanField(
        default=True, help_text='Should this board be updated with new posts?')
    # Retention time in hours for threads that are not saved; 0 keeps them
    # forever (see help_text).
    store_threads_for = models.IntegerField(
        default=48,
        help_text=
        '[hours] After that much time passes from the last reply in a NOT SAVED thread it will be deleted. Set to 0 to preserve threads forever.'
    )
    # Number of replies after which a thread is stored (see help_text).
    replies_threshold = models.IntegerField(
        default=20,
        help_text='Store threads after they reach that many replies.')

    class Meta:
        ordering = ['name']

    def __str__(self):
        """Return the board name wrapped in slashes."""
        # The %-formatting already yields a str; no extra format() call needed.
        return "/%s/" % self.name
Esempio n. 16
0
 def get_url(self, url):
     """Fetch *url* with requests and record how long the request took.

     The CONNECTION_TIMEOUT setting is used as the request timeout. The
     elapsed time is added to the 'total_download_time' statistic only after
     requests.get returns; if it raises, nothing is recorded.

     Returns whatever requests.get returned.
     """
     started_at = datetime.datetime.now()
     timeout = AppSettings.get('CONNECTION_TIMEOUT')
     response = requests.get(url, timeout=timeout)
     elapsed = datetime.datetime.now() - started_at
     self.stats.add('total_download_time', elapsed)
     return response
Esempio n. 17
0
    def get_text(self, total_time, **kwargs):
        """Build the one-line, human-readable summary of the statistics.

        total_time -- duration object providing total_seconds() (for example
                      a datetime.timedelta). It is not tracked by this object,
                      so the caller has to supply it.
        **kwargs   -- accepted but not used by this method.

        Returns the formatted summary string. The waiting and downloading
        percentages are divided by the SCRAPER_THREADS_NUMBER setting and
        rounded; both fall back to 0 when they cannot be computed.
        """
        try:
            elapsed_seconds = total_time.total_seconds()
            threads_number = AppSettings.get('SCRAPER_THREADS_NUMBER')

            wait_percent = round(
                self.get('total_wait_time_with_lock').total_seconds() /
                elapsed_seconds * 100 / threads_number)
            downloading_percent = round(
                self.get('total_download_time').total_seconds() /
                elapsed_seconds * 100 / threads_number)

        except Exception:
            # Typically a division by zero (zero total time). Any ordinary
            # error here only affects the printed percentages, so report 0
            # instead of failing. KeyboardInterrupt and SystemExit are
            # deliberately not swallowed.
            wait_percent = 0
            downloading_percent = 0

        return 'Time passed: %s seconds (%s%% waiting, %s%% downloading files) Processed threads: %s Added posts: %s Removed posts: %s Downloaded images: %s Downloaded thumbnails: %s Downloaded threads: %s' % (
            round(total_time.total_seconds(), 2),
            wait_percent,
            downloading_percent,
            self.get('processed_threads'),
            self.get('added_posts'),
            self.get('removed_posts'),
            self.get('downloaded_images'),
            self.get('downloaded_thumbnails'),
            self.get('downloaded_threads'),
        )
Esempio n. 18
0
from django.conf.urls import patterns, url
from django.views.decorators.csrf import ensure_csrf_cookie
from django.views.decorators.cache import cache_page

import archive_chan.views.core as core
import archive_chan.views.api as api

from archive_chan.settings import AppSettings

# Cache ages read from the app settings; each is passed to cache_page() in the
# URL patterns below.
cache = AppSettings.get('VIEW_CACHE_AGE')
cache_static = AppSettings.get('VIEW_CACHE_AGE_STATIC')

urlpatterns = patterns('',
    # Global.
    url(r'^$', cache_page(cache)(core.IndexView.as_view()), name='index'),
    url(r'^stats/$', cache_page(cache_static)(core.StatsView.as_view()), name='stats'),
    url(r'^gallery/$', cache_page(cache_static)(core.GalleryView.as_view()), name='gallery'),
    url(r'^search/$', core.SearchView.as_view(), name='search'),

    # Board.
    url(r'^board/(?P<board>[a-z]+)/$', cache_page(cache)(core.BoardView.as_view()), name='board'),
    url(r'^board/(?P<board>[a-z]+)/stats/$', cache_page(cache_static)(core.StatsView.as_view()), name='board_stats'),
    url(r'^board/(?P<board>[a-z]+)/gallery/$', cache_page(cache_static)(core.GalleryView.as_view()), name='board_gallery'),
    url(r'^board/(?P<board>[a-z]+)/search/$', core.SearchView.as_view(), name='board_search'),

    # Thread.
    url(r'^board/(?P<board>[a-z]+)/thread/(?P<thread>[0-9]+)/$', ensure_csrf_cookie(core.ThreadView.as_view()), name='thread'),
    url(r'^board/(?P<board>[a-z]+)/thread/(?P<thread>[0-9]+)/stats/$', cache_page(cache_static)(core.StatsView.as_view()), name='thread_stats'),
    url(r'^board/(?P<board>[a-z]+)/thread/(?P<thread>[0-9]+)/gallery/$', cache_page(cache_static)(core.GalleryView.as_view()), name='thread_gallery'),
    url(r'^board/(?P<board>[a-z]+)/thread/(?P<thread>[0-9]+)/search/$', core.SearchView.as_view(), name='thread_search'),
Esempio n. 19
0
from django.conf.urls import patterns, url
from django.views.decorators.csrf import ensure_csrf_cookie
from django.views.decorators.cache import cache_page

import archive_chan.views.core as core
import archive_chan.views.api as api

from archive_chan.settings import AppSettings

# Cache ages read from the app settings; each is passed to cache_page() in the
# URL patterns below.
cache = AppSettings.get('VIEW_CACHE_AGE')
cache_static = AppSettings.get('VIEW_CACHE_AGE_STATIC')

urlpatterns = patterns(
    '',
    # Global.
    url(r'^$', cache_page(cache)(core.IndexView.as_view()), name='index'),
    url(r'^stats/$',
        cache_page(cache_static)(core.StatsView.as_view()),
        name='stats'),
    url(r'^gallery/$',
        cache_page(cache_static)(core.GalleryView.as_view()),
        name='gallery'),
    url(r'^search/$', core.SearchView.as_view(), name='search'),

    # Board.
    url(r'^board/(?P<board>[a-z]+)/$',
        cache_page(cache)(core.BoardView.as_view()),
        name='board'),
    url(r'^board/(?P<board>[a-z]+)/stats/$',
        cache_page(cache_static)(core.StatsView.as_view()),
        name='board_stats'),