Example #1
def parse_all(buffer_size=512, progress=True):
    parser = get_nginx_parser()
    buffer = []
    start = datetime.datetime.now()
    for log_file in parser.matching_files():
        n_lines = count_lines(log_file)
        progress_bar = ProgressBar(sys.stdout if progress else None, n_lines)  # noqa
        print('Reading log file %s: %s lines' % (log_file, n_lines))
        with open(log_file) as f:
            for count, line in enumerate(f, 1):
                try:
                    data = parser.parse_string(line)
                except AttributeError:
                    # TODO: log the line
                    print('Error while parsing log line: %s' % line)
                    continue
                log_object = RequestLog(**parser.format_data(data))
                log_object.complete(save=False)
                buffer.append(log_object)
                # flush the buffer with one bulk INSERT once it is full
                if len(buffer) >= buffer_size:
                    RequestLog.objects.bulk_create(buffer)
                    buffer.clear()
                progress_bar.update(count)
            # flush whatever remains for this file
            if buffer:
                RequestLog.objects.bulk_create(buffer)
                buffer.clear()
    end = datetime.datetime.now()
    print('Elapsed time: %s' % (end - start))
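
The manual buffer in Example #1 exists to bound memory while still batching INSERTs. If memory is not a concern, Django's bulk_create can do the batching itself via its batch_size argument; a minimal sketch, assuming the same RequestLog model and parser helpers (the function name is hypothetical):

def parse_one_file(parser, log_file):
    objects = []
    with open(log_file) as f:
        for line in f:
            try:
                data = parser.parse_string(line)
            except AttributeError:
                continue  # skip unparsable lines, as in Example #1
            objects.append(RequestLog(**parser.format_data(data)))
    # batch_size caps the number of rows sent per INSERT statement
    RequestLog.objects.bulk_create(objects, batch_size=512)

The trade-off is that every object for the file sits in memory at once, so for very large logs the bounded buffer above remains the safer design.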
Example #3
    def anonymize_answers(self, lorem_ipsum):
        # This method is very mathematical and has a lot of "one new variable per line" code, but we think it's okay.
        # pylint: disable=too-many-locals
        self.stdout.write("Replacing text answers with fake ones...")
        for text_answer in TextAnswer.objects.all():
            text_answer.answer = self.lorem(text_answer.answer, lorem_ipsum)
            if text_answer.original_answer:
                text_answer.original_answer = self.lorem(text_answer.original_answer, lorem_ipsum)
            text_answer.save()

        self.stdout.write("Shuffling rating answer counter counts...")

        contributions = Contribution.objects.all().prefetch_related("ratinganswercounter_set__question")
        try:
            self.stdout.ending = ""
            progress_bar = ProgressBar(self.stdout, contributions.count())
            for contribution_counter, contribution in enumerate(contributions):
                progress_bar.update(contribution_counter + 1)

                counters_per_question = defaultdict(list)
                for counter in contribution.ratinganswercounter_set.all():
                    counters_per_question[counter.question].append(counter)

                for question, counters in counters_per_question.items():
                    original_sum = sum(counter.count for counter in counters)

                    missing_values = set(CHOICES[question.type].values).difference(c.answer for c in counters)
                    missing_values.discard(NO_ANSWER)  # don't add a NO_ANSWER counter if it didn't exist before
                    for value in missing_values:
                        counters.append(RatingAnswerCounter(
                            question=question, contribution=contribution, answer=value, count=0))

                    generated_counts = [random.random() for _ in counters]  # nosec
                    generated_sum = sum(generated_counts)
                    generated_counts = [floor(count / generated_sum * original_sum) for count in generated_counts]

                    to_add = original_sum - sum(generated_counts)
                    index = random.randint(0, len(generated_counts) - 1)  # nosec
                    generated_counts[index] += to_add

                    for counter, generated_count in zip(counters, generated_counts):
                        assert generated_count >= 0
                        counter.count = generated_count

                        if counter.count:
                            counter.save()
                        elif counter.id:
                            counter.delete()

                    assert original_sum == sum(counter.count for counter in counters)
        finally:
            self.stdout.ending = "\n"
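
The shuffling above preserves each question's total exactly: random weights are scaled to the original sum and floored, and the rounding shortfall is handed to one random counter. The arithmetic in isolation, with hypothetical names:

import random
from math import floor

def redistribute(original_sum, n):
    # scale n random weights so the floored counts sum to at most original_sum
    weights = [random.random() for _ in range(n)]
    total = sum(weights)
    counts = [floor(w / total * original_sum) for w in weights]
    # each floor loses less than 1, so the remainder is non-negative and
    # less than n; add it to a random entry to restore the exact total
    counts[random.randrange(n)] += original_sum - sum(counts)
    return counts  # non-negative integers summing exactly to original_sum

This is why the asserts in Example #3 hold: every floored count is non-negative and the remainder added back is non-negative.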
Example #4
def get_ip_info(only_update=False):
    param = 'client_ip_address'
    if not only_update:
        unique_ips = set(RequestLog.objects.distinct(param).values_list(param, flat=True))  # noqa
        checked_ips = set(IPInfoCheck.objects.values_list('ip_address', flat=True))  # noqa
        not_checked_ips = unique_ips - checked_ips
        print('Checking IP addresses information (%s)' % len(not_checked_ips))
        check_progress_bar = ProgressBar(sys.stdout, len(not_checked_ips))
        for count, ip in enumerate(not_checked_ips, 1):
            try:
                IPInfoCheck.check_ip(ip)
            except RateExceededError:
                # the lookup service quota is exhausted: stop checking for now
                print(' Rate exceeded')
                break
            check_progress_bar.update(count)
    # attach the stored check results to request logs that still lack IP info
    no_ip_info = RequestLog.objects.filter(ip_info=None)
    no_ip_info_ip = set(no_ip_info.distinct(param).values_list(param, flat=True))  # noqa
    checks = IPInfoCheck.objects.filter(ip_address__in=no_ip_info_ip)
    print('Updating request logs\' IP info (%s)' % no_ip_info.count())
    print('%s related checks' % checks.count())
    logs_progress_bar = ProgressBar(sys.stdout, checks.count())
    for count, check in enumerate(checks, 1):
        no_ip_info.filter(client_ip_address=check.ip_address).update(ip_info=check.ip_info)
        logs_progress_bar.update(count)
Example #5
    def handle(self, *args, **options):
        self.stdout.write("Clearing cache...")
        cache.clear()
        total_count = Course.objects.count()

        self.stdout.write("Calculating results for all courses...")

        self.stdout.ending = None
        progress_bar = ProgressBar(self.stdout, total_count)

        for counter, course in enumerate(Course.objects.all()):
            progress_bar.update(counter + 1)
            calculate_results(course)

        self.stdout.write("Results cache has been refreshed.\n")
    def handle(self, *args, **options):
        self.stdout.write("Clearing results cache...")
        caches['results'].clear()
        total_count = Course.objects.count()

        self.stdout.write("Calculating results for all courses...")

        self.stdout.ending = None
        progress_bar = ProgressBar(self.stdout, total_count)

        for counter, course in enumerate(Course.objects.all()):
            progress_bar.update(counter + 1)
            collect_results(course)

        self.stdout.write("Prerendering result index page...\n")

        warm_up_template_cache(Course.objects.filter(state='published'))

        self.stdout.write("Results cache has been refreshed.\n")
Example #7
    def autocomplete(queryset, batch_size=512, rewrite=True, progress=True, **kwargs):  # noqa
        # perf improvement: avoid QuerySet.__getitem__ when doing qs[i]
        # FIXME: though we may need to buffer because the queryset can be huge
        total = queryset.count()
        progress_bar = ProgressBar(sys.stdout if progress else None, total)
        print('Completing information for %s request logs' % total)
        count = 0

        start = datetime.datetime.now()
        while count < total:
            buffer = queryset[count:count+batch_size]
            with transaction.atomic():  # is this a real improvement?
                for obj in buffer:
                    obj.complete(rewrite=rewrite, **kwargs)
                    count += 1
                    progress_bar.update(count)
            # end transaction
        end = datetime.datetime.now()
        print('Elapsed time: %s' % (end - start))
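
The FIXME in Example #7 is about repeatedly slicing a potentially huge queryset. One way to sidestep it, assuming complete() does not change which rows the queryset matches, is QuerySet.iterator(), which streams rows in chunks instead of materializing slices; a sketch (the function name is hypothetical):

def autocomplete_streaming(queryset, chunk_size=512, rewrite=True, **kwargs):
    # iterator() fetches chunk_size rows at a time, using server-side
    # cursors where the database supports them
    total = queryset.count()
    progress_bar = ProgressBar(sys.stdout, total)
    for count, obj in enumerate(queryset.iterator(chunk_size=chunk_size), 1):
        obj.complete(rewrite=rewrite, **kwargs)
        progress_bar.update(count)

This drops the per-batch transaction, though; if batch atomicity matters, the slicing loop in Example #7 keeps that property.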
Example #8
    def handle(self, *args, **options):
        self.stdout.write("Clearing results cache...")
        caches["results"].clear()

        self.stdout.write("Calculating results for all evaluations...")

        self.stdout.ending = None
        evaluations = Evaluation.objects.filter(
            state__in=STATES_WITH_RESULTS_CACHING)
        progress_bar = ProgressBar(self.stdout, evaluations.count())
        for counter, evaluation in enumerate(evaluations):
            progress_bar.update(counter + 1)
            cache_results(evaluation)

        self.stdout.write("Prerendering result index page...\n")

        warm_up_template_cache(
            Evaluation.objects.filter(
                state__in=STATES_WITH_RESULT_TEMPLATE_CACHING))

        self.stdout.write("Results cache has been refreshed.\n")
Example #11
    def handle(self, *args, **options):
        total = 0
        start = time()
        qss = Security.objects.filter(type='S')
        progress_bar = ProgressBar(sys.stdout, qss.count())
        count = 0
        for s in qss.iterator():
            count += 1
            progress_bar.update(count)
            # keep only the 252 most recent daily candles; anything older is
            # up for deletion
            qsc = Candle.objects.filter(
                ticker=s, type='D').order_by('-time')[252:]
            if not qsc.exists():
                continue
            # a sliced queryset cannot be delete()d directly, so collect the
            # ids and delete through a fresh queryset
            ids = list(qsc.values_list('id', flat=True))
            Candle.objects.filter(id__in=ids).delete()
            total += len(ids)

        self.stdout.write('\n')
        self.stdout.write(self.style.SUCCESS('Success'))
        elapsed = str(timedelta(seconds=time() - start))

        return json.dumps({'Candles removed': total, 'Elapsed': elapsed},
                          indent=4)
Example #12
    def handle(self, *args, **options):
        global prefetched  # module-level cache of Security objects by ticker

        start = time()

        schema = options['schema']
        if schema is None:
            raise CommandError('No schema')

        all_obj_names = list_objects(CRAWL_OCI_BUCKET)
        obj_names = [o for o in all_obj_names if o.startswith(schema)]
        if not obj_names:
            raise CommandError(f'No objects for {schema}')
        obj_names.sort(
            key=lambda x: datetime.strptime(x, f'{schema}_{TIME_FORMAT}.json'))
        model = MAPPING.get(schema)
        if model is None:
            raise CommandError(f'No model for {schema}')

        fields = model._meta.get_fields()
        model_name = model._meta.object_name

        total = {
            model_name: {
                'created': 0,
                'updated': 0,
            }
        }
        for on in obj_names:
            data = load_json(CRAWL_OCI_BUCKET, on)
            if not isinstance(data, list):
                raise CommandError(f'Incorrect data for {schema} in {on}')

            self.stdout.write(self.style.HTTP_INFO(f'Loading {schema}'))
            progress_bar = ProgressBar(sys.stdout, len(data))

            if model_name != 'Security' and prefetched is None:
                self.stdout.write('Prefetching tickers')
                prefetched = {s.ticker: s for s in Security.objects.all()}

            objs = deserialize(data, fields, model_name)
            count = 0
            while True:
                batch = list(islice(objs, ORACLE_BATCH_SIZE))
                if not batch:
                    break

                uniques = defaultdict(list)
                for __, unique in batch:
                    count += 1
                    for k, v in unique.items():
                        uniques[k + '__in'].append(v)

                # self.stdout.write('Prefetching existing batch')
                existing = list(model.objects.filter(**uniques))

                objs_to_create = []
                objs_to_update = []
                d = None  # holds the last payload dict; its keys drive the update below
                # self.stdout.write('Creating / Updating instances')
                for d, unique in batch:
                    instance = query_pop(existing, unique)
                    if instance is not None:
                        for attr, value in d.items():
                            setattr(instance, attr, value)
                        objs_to_update.append(instance)
                    else:
                        objs_to_create.append(model(**d))

                    progress_bar.update(count)

                batch_update_or_create(model, objs_to_create)
                batch_update_or_create(model, objs_to_update, d.keys())
                total[model_name]['created'] += len(objs_to_create)
                total[model_name]['updated'] += len(objs_to_update)

            self.stdout.write('\n')

            remove(CRAWL_OCI_BUCKET, on)

        self.stdout.write(self.style.SUCCESS(f'Successfully loaded {schema}'))

        total['Elapsed'] = str(timedelta(seconds=time() - start))

        return json.dumps(total, indent=4)
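
The islice loop in Example #12 is an instance of a general idiom for consuming a generator in fixed-size batches; stated on its own:

from itertools import islice

def batched(iterable, size):
    # yield lists of at most `size` items until the iterable is exhausted
    it = iter(iterable)
    while True:
        batch = list(islice(it, size))
        if not batch:
            return
        yield batch

Python 3.12 ships this as itertools.batched (yielding tuples), which could replace the hand-rolled loop.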