Example No. 1
def index(request):

    form = MainProjectForm()

    if request.method == 'POST':
        form = MainProjectForm(request.POST)

        if form.is_valid():
            form.save()

            DOMAIN_NAME = get_domain_name(form.cleaned_data['HOMEPAGE'])
            QUEUE_FILE = form.cleaned_data['PROJECT_NAME'] + '/queue.txt'
            CRAWLED_FILE = form.cleaned_data['PROJECT_NAME'] + '/crawled.txt'
            NUMBER_OF_THREADS = 8
            queue = Queue()
            Spider(form.cleaned_data['PROJECT_NAME'],
                   form.cleaned_data['HOMEPAGE'], DOMAIN_NAME)

            # Create worker threads (will die when main exits)
            def create_workers():
                for _ in range(NUMBER_OF_THREADS):
                    t = threading.Thread(target=work)
                    t.daemon = True
                    t.start()

            # Do the next job in the queue
            def work():
                while True:
                    url = queue.get()
                    Spider.crawl_page(threading.current_thread().name, url)
                    queue.task_done()

            # Each queued link is a new job
            def create_jobs():
                for link in file_to_set(QUEUE_FILE):
                    queue.put(link)
                queue.join()
                crawl()

            # Check if there are items in the queue, if so crawl them
            def crawl():
                queued_links = len(file_to_set(QUEUE_FILE))
                if queued_links > 0:
                    print(f'{queued_links} links in the queue')
                    create_jobs()

            create_workers()
            crawl()

            return HttpResponse('We have started crawling the website.')
        else:
            return HttpResponse(
                'Something went wrong while submitting the URL. Please check it and try again.'
            )

    return render(request, 'index.html', {'form': form})
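
The MainProjectForm used by this view is not shown on this page. Because the view calls form.save(), it is presumably a ModelForm over a project model that exposes PROJECT_NAME and HOMEPAGE fields; the sketch below is only an assumption based on the keys the view reads, not the project's actual form.

# Hypothetical sketch of the form assumed by the view above; the real
# MainProjectForm and its model are not shown in the example.
from django import forms

from mainapp.models import MainProject  # assumed model name


class MainProjectForm(forms.ModelForm):
    class Meta:
        model = MainProject
        # Field names taken from the keys the view reads
        fields = ['PROJECT_NAME', 'HOMEPAGE']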
Example No. 2
# Do the next job in the queue (relies on the module-level `queue` and `Spider`)
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
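
The Spider class imported from mainapp.helpers.spider is not shown on this page. Because the examples construct it once for its side effects and then call Spider.crawl_page(...) on the class from every worker thread, it presumably keeps its crawl state at class level. The stub below is only a sketch of that assumed interface, not the project's actual implementation.

# Hypothetical stub of the Spider interface implied by the examples;
# the real class lives in mainapp.helpers.spider and is not shown here.
class Spider:
    # Class-level state shared by all worker threads
    project_name = ''
    base_url = ''
    domain_name = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        # Called once in the view, purely to initialise the class-level state
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name

    @staticmethod
    def crawl_page(thread_name, page_url):
        # Called by each worker thread with the next URL taken from the queue
        if page_url not in Spider.crawled:
            print(thread_name + ' now crawling ' + page_url)
            Spider.crawled.add(page_url)
            # ... gather new links here and write them back to queue.txt ...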
Example No. 3
import threading
from queue import Queue
from mainapp.helpers.spider import Spider
from mainapp.helpers.domain import *
from mainapp.helpers.general import *



# PROJECT_NAME and HOMEPAGE are not defined in this snippet; they presumably
# come from the wildcard imports above.
PROJECT_NAME_APP = PROJECT_NAME
HOMEPAGE_APP = HOMEPAGE
DOMAIN_NAME = get_domain_name(HOMEPAGE_APP)
QUEUE_FILE = PROJECT_NAME_APP + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME_APP + '/crawled.txt'
NUMBER_OF_THREADS = 8
queue = Queue()
Spider(PROJECT_NAME_APP, HOMEPAGE_APP, DOMAIN_NAME)


# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)