def crawl(request, pk):
    """Queue a single file or directory for (re)indexing.

    Looks up the ``Files`` row by primary key, dispatches an
    ``index_filedirectory`` Celery task for its URI, stamps
    ``last_imported`` and renders a confirmation page.

    :param request: Django HTTP request.
    :param pk: primary key of the ``Files`` row to queue.
    :returns: rendered ``files/files_crawl.html`` with the id in context.
    :raises Files.DoesNotExist: if no row with this pk exists.
    """
    file = Files.objects.get(pk=pk)

    # Timestamp of this queueing. Use timezone-aware timezone.now() so the
    # stored value compares cleanly against timezone.now() in recrawl();
    # the previous naive datetime.datetime.now() mixed naive and aware
    # datetimes.
    last_imported = timezone.now()

    # add to queue
    # NOTE(review): queue name 'tasks' here differs from
    # 'open_semantic_etl_tasks' used by queue_index_file — confirm which
    # queue the worker actually consumes.
    index_filedirectory.apply_async(
        kwargs={'filename': file.uri}, queue='tasks', priority=5)

    # save new timestamp
    file.last_imported = last_imported
    file.save()

    return render(request, 'files/files_crawl.html', {'id': pk, })
# NOTE(review): this definition is DEAD CODE — an identically named
# queue_index_file is defined later in this module, and the later `def`
# rebinds the name at import time, shadowing this one. The two copies
# also disagree on the Celery queue name ('tasks' here vs
# 'open_semantic_etl_tasks' below) — confirm which queue the worker
# consumes before removing either copy.
def queue_index_file(request):
    """Queue an index task for the URI given in the 'uri' GET parameter
    and return the Celery task id as JSON under the key 'queue'."""
    uri = request.GET["uri"]  # missing 'uri' raises KeyError -> HTTP 500
    result = index_filedirectory.apply_async(
        kwargs={'filename': uri}, queue='tasks', priority=5)
    return HttpResponse(json.dumps({'queue': result.id}),
                        content_type="application/json")
def queue_index_file(request):
    """Enqueue an ETL indexing task for the URI in the 'uri' GET parameter.

    Responds with a JSON object carrying the Celery task id under the
    key 'queue'.
    """
    # NOTE(review): a GET request triggering a side effect (task
    # dispatch) is unconventional — confirm callers cannot use POST.
    from opensemanticetl.tasks import index_filedirectory

    target_uri = request.GET["uri"]
    async_result = index_filedirectory.apply_async(
        kwargs={'filename': target_uri},
        queue='open_semantic_etl_tasks',
        priority=5,
    )

    payload = json.dumps({'queue': async_result.id})
    return HttpResponse(payload, content_type="application/json")
def recrawl(request):
    """Re-queue every Files row whose recrawl interval (delta) has elapsed.

    Iterates all ``Files`` rows, decides via :func:`_requeue_due` whether
    each is due for re-indexing, dispatches an ``index_filedirectory``
    Celery task for the due ones, updates their ``last_imported``
    timestamp, and returns a plain-text summary plus a per-file log.

    :param request: Django HTTP request (unused beyond routing).
    :returns: ``HttpResponse`` with Content-Type text/plain.
    """
    verbose = True
    log = []

    count = 0
    count_queued = 0

    for file in Files.objects.all():
        count += 1

        if verbose:
            log.append(
                "Checking delta time of file or directory: {}".format(file))

        if _requeue_due(file, log, verbose):
            if verbose:
                log.append(
                    "Adding file or directory to queue: {}".format(file))

            # Timestamp of this queueing. timezone.now() (aware) instead of
            # the previous naive datetime.datetime.now(), which mixed naive
            # and aware datetimes with the timezone.now() comparisons below.
            last_imported = timezone.now()

            # add to queue
            from opensemanticetl.tasks import index_filedirectory
            index_filedirectory.apply_async(
                kwargs={'filename': file.uri}, queue='tasks', priority=5)

            # save new timestamp
            file.last_imported = last_imported
            file.save()

            count_queued += 1

    #
    # stats / log
    #
    response = "Files or directories to queue: {} of {}".format(
        count_queued, count)

    if len(log) > 0:
        response += "\n\n" + "\n".join(log)

    #
    # return response
    #
    status = HttpResponse(response)
    status["Content-Type"] = "text/plain"
    return status


def _requeue_due(file, log, verbose):
    """Return True if *file* is due to be queued for re-indexing now.

    A file is due when it has a non-zero ``delta`` (minutes) and either
    was never imported, its last import lies in the future (system clock
    was wrong), or ``delta`` minutes have elapsed since ``last_imported``.
    When verbose and not due, appends last/next import times to *log*.
    """
    # If delta 0, no automatic import
    if not file.delta:
        return False

    # If delta time, do not import this file within this time
    if file.delta and file.last_imported:
        # when next import allowed (because time delta passed)?
        next_import = file.last_imported + timedelta(minutes=file.delta)

        # don't check time delta if last import in future
        # (i.e. if system time was wrong)
        if file.last_imported < timezone.now():
            # if time for next import not reached, do not index
            if timezone.now() < next_import:
                if verbose:
                    log.append("Last addition to queue: {}".format(
                        file.last_imported))
                    log.append(
                        "Next addition to queue: {}".format(next_import))
                return False

    return True