def parse_all(buffer_size=512, progress=True):
    parser = get_nginx_parser()
    buffer = []
    start = datetime.datetime.now()
    for log_file in parser.matching_files():
        n_lines = count_lines(log_file)
        progress_bar = ProgressBar(sys.stdout if progress else None, n_lines)  # noqa
        print('Reading log file %s: %s lines' % (log_file, n_lines))
        with open(log_file) as f:
            for count, line in enumerate(f, 1):
                try:
                    data = parser.parse_string(line)
                except AttributeError:
                    # TODO: log the line
                    print('Error while parsing log line: %s' % line)
                    continue
                log_object = RequestLog(**parser.format_data(data))
                log_object.complete(save=False)
                buffer.append(log_object)
                if len(buffer) >= buffer_size:
                    RequestLog.objects.bulk_create(buffer)
                    buffer.clear()
                progress_bar.update(count)
    if len(buffer) > 0:
        RequestLog.objects.bulk_create(buffer)
        buffer.clear()
    end = datetime.datetime.now()
    print('Elapsed time: %s' % (end - start))

def anonymize_answers(self, lorem_ipsum):
    # This method is very mathematical and has a lot of "one new variable per line" code,
    # but we think it's okay.
    # pylint: disable=too-many-locals
    self.stdout.write("Replacing text answers with fake ones...")

    for text_answer in TextAnswer.objects.all():
        text_answer.answer = self.lorem(text_answer.answer, lorem_ipsum)
        if text_answer.original_answer:
            text_answer.original_answer = self.lorem(text_answer.original_answer, lorem_ipsum)
        text_answer.save()

    self.stdout.write("Shuffling rating answer counter counts...")

    contributions = Contribution.objects.all().prefetch_related("ratinganswercounter_set__question")

    try:
        self.stdout.ending = ""
        progress_bar = ProgressBar(self.stdout, contributions.count())

        for contribution_counter, contribution in enumerate(contributions):
            progress_bar.update(contribution_counter + 1)

            counters_per_question = defaultdict(list)
            for counter in contribution.ratinganswercounter_set.all():
                counters_per_question[counter.question].append(counter)

            for question, counters in counters_per_question.items():
                original_sum = sum(counter.count for counter in counters)

                missing_values = set(CHOICES[question.type].values).difference(set(c.answer for c in counters))
                missing_values.discard(NO_ANSWER)  # don't add NO_ANSWER counter if it didn't exist before
                for value in missing_values:
                    counters.append(RatingAnswerCounter(question=question, contribution=contribution, answer=value, count=0))

                generated_counts = [random.random() for c in counters]  # nosec
                generated_sum = sum(generated_counts)
                generated_counts = [floor(count / generated_sum * original_sum) for count in generated_counts]

                to_add = original_sum - sum(generated_counts)
                index = random.randint(0, len(generated_counts) - 1)  # nosec
                generated_counts[index] += to_add

                for counter, generated_count in zip(counters, generated_counts):
                    assert generated_count >= 0
                    counter.count = generated_count

                    if counter.count:
                        counter.save()
                    elif counter.id:
                        counter.delete()

                assert original_sum == sum(counter.count for counter in counters)
    finally:
        self.stdout.ending = "\n"

def get_ip_info(only_update=False):
    param = 'client_ip_address'
    if not only_update:
        unique_ips = set(RequestLog.objects.distinct(param).values_list(param, flat=True))  # noqa
        checked_ips = set(IPInfoCheck.objects.values_list('ip_address', flat=True))  # noqa
        not_checked_ips = unique_ips - checked_ips
        print('Checking IP addresses information (%s)' % len(not_checked_ips))
        check_progress_bar = ProgressBar(sys.stdout, len(not_checked_ips))
        for count, ip in enumerate(not_checked_ips, 1):
            try:
                IPInfoCheck.check_ip(ip)
            except RateExceededError:
                print(' Rate exceeded')
                break
            check_progress_bar.update(count)
    no_ip_info = RequestLog.objects.filter(ip_info=None)
    no_ip_info_ip = set(no_ip_info.distinct(param).values_list(param, flat=True))  # noqa
    checks = IPInfoCheck.objects.filter(ip_address__in=no_ip_info_ip)
    print('Updating request logs\' IP info (%s)' % no_ip_info.count())
    print('%s related checks' % checks.count())
    logs_progress_bar = ProgressBar(sys.stdout, checks.count())
    for count, check in enumerate(checks, 1):
        no_ip_info.filter(client_ip_address=check.ip_address).update(ip_info=check.ip_info)
        logs_progress_bar.update(count)

def handle(self, *args, **options):
    self.stdout.write("Clearing cache...")
    cache.clear()
    total_count = Course.objects.count()

    self.stdout.write("Calculating results for all courses...")

    self.stdout.ending = None
    progress_bar = ProgressBar(self.stdout, total_count)
    for counter, course in enumerate(Course.objects.all()):
        progress_bar.update(counter + 1)
        calculate_results(course)

    self.stdout.write("Results cache has been refreshed.\n")

def handle(self, *args, **options):
    self.stdout.write("Clearing results cache...")
    caches['results'].clear()
    total_count = Course.objects.count()

    self.stdout.write("Calculating results for all courses...")

    self.stdout.ending = None
    progress_bar = ProgressBar(self.stdout, total_count)
    for counter, course in enumerate(Course.objects.all()):
        progress_bar.update(counter + 1)
        collect_results(course)

    self.stdout.write("Prerendering result index page...\n")
    warm_up_template_cache(Course.objects.filter(state='published'))

    self.stdout.write("Results cache has been refreshed.\n")

def autocomplete(queryset, batch_size=512, rewrite=True, progress=True, **kwargs):  # noqa
    # perf improvement: avoid QuerySet.__getitem__ when doing qs[i]
    # FIXME: though we may need to buffer because the queryset can be huge
    total = queryset.count()
    progress_bar = ProgressBar(sys.stdout if progress else None, total)
    print('Completing information for %s request logs' % total)
    count = 0
    start = datetime.datetime.now()
    while count < total:
        buffer = queryset[count:count + batch_size]
        with transaction.atomic():  # is this a real improvement?
            for obj in buffer:
                obj.complete(rewrite=rewrite, **kwargs)
                count += 1
                progress_bar.update(count)
        # end transaction
    end = datetime.datetime.now()
    print('Elapsed time: %s' % (end - start))

def handle(self, *args, **options):
    self.stdout.write("Clearing results cache...")
    caches["results"].clear()

    self.stdout.write("Calculating results for all evaluations...")

    self.stdout.ending = None
    evaluations = Evaluation.objects.filter(state__in=STATES_WITH_RESULTS_CACHING)
    progress_bar = ProgressBar(self.stdout, evaluations.count())
    for counter, evaluation in enumerate(evaluations):
        progress_bar.update(counter + 1)
        cache_results(evaluation)

    self.stdout.write("Prerendering result index page...\n")
    warm_up_template_cache(Evaluation.objects.filter(state__in=STATES_WITH_RESULT_TEMPLATE_CACHING))

    self.stdout.write("Results cache has been refreshed.\n")

def handle(self, *args, **options):
    total = 0
    start = time()
    qss = Security.objects.filter(type='S')
    progress_bar = ProgressBar(sys.stdout, qss.count())
    count = 0
    for s in qss.iterator():
        count += 1
        progress_bar.update(count)
        # everything older than the 252 most recent daily candles for this security
        qsc = Candle.objects.filter(ticker=s, type='D').order_by('-time')[252:]
        if not qsc.exists():
            continue
        ids = [vl[0] for vl in qsc.values_list('id')]
        qscd = Candle.objects.filter(id__in=ids)
        qscd.delete()
        total += len(ids)
    self.stdout.write('\n')
    self.stdout.write(self.style.SUCCESS('Success'))
    elapsed = str(timedelta(seconds=time() - start))
    return json.dumps({'Candles removed': total, 'Elapsed': elapsed}, indent=4)

def handle(self, *args, **options):
    global prefetched
    start = time()
    schema = options['schema']
    if schema is None:
        raise CommandError('No schema')
    all_obj_names = list_objects(CRAWL_OCI_BUCKET)
    obj_names = [o for o in all_obj_names if o.startswith(schema)]
    if not obj_names:
        raise CommandError(f'No objects for {schema}')
    obj_names.sort(key=lambda x: datetime.strptime(x, f'{schema}_{TIME_FORMAT}.json'))
    model = MAPPING.get(schema)
    if model is None:
        raise CommandError(f'No model for {schema}')
    fields = model._meta.get_fields()
    model_name = model._meta.object_name
    total = {
        model_name: {
            'created': 0,
            'updated': 0,
        }
    }
    for on in obj_names:
        data = load_json(CRAWL_OCI_BUCKET, on)
        if data is None or not isinstance(data, list):
            raise CommandError(f'Incorrect data for {schema} in {on}')
        self.stdout.write(self.style.HTTP_INFO(f'Loading {schema}'))
        progress_bar = ProgressBar(sys.stdout, len(data))
        if model_name != 'Security' and prefetched is None:
            self.stdout.write('Prefetching tickers')
            tickers_qs = Security.objects.all()
            prefetched = {}
            for s in tickers_qs:
                prefetched[s.ticker] = s
        objs = deserialize(data, fields, model_name)
        count = 0
        while True:
            batch = list(islice(objs, ORACLE_BATCH_SIZE))
            if not batch:
                break
            uniques = defaultdict(list)
            for __, unique in batch:
                count += 1
                for k, v in unique.items():
                    uniques[k + '__in'].append(v)
            # self.stdout.write('Prefetching existing batch')
            existing = list(model.objects.filter(**uniques))
            objs_to_create = []
            objs_to_update = []
            d = None
            # self.stdout.write('Creating / Updating instances')
            for d, unique in batch:
                instance = query_pop(existing, unique)
                if instance is not None:
                    for attr, value in d.items():
                        setattr(instance, attr, value)
                    objs_to_update.append(instance)
                else:
                    objs_to_create.append(model(**d))
                progress_bar.update(count)
            batch_update_or_create(model, objs_to_create)
            batch_update_or_create(model, objs_to_update, d.keys())
            total[model_name]['created'] += len(objs_to_create)
            total[model_name]['updated'] += len(objs_to_update)
        self.stdout.write('\n')
        remove(CRAWL_OCI_BUCKET, on)
    self.stdout.write(self.style.SUCCESS(f'Successfully loaded {schema}'))
    total['Elapsed'] = str(timedelta(seconds=time() - start))
    return json.dumps(total, indent=4)