    def _secondPass(self, clients, loop):
        """Re-run the check for each flagged client and keep only the
        clients that fail the same way a second time."""

        error_clients = {}

        for client, value in clients.items():

            print(client)
            client_dict = {client: None}
            name, number, button, url = clients[client]['Connect']
            website = Website(name, number, button, url)
            loop.run_until_complete(website.addWebsite())
            check = website.runCheck()

            first_check = clients[client]['Check']
            second_check = self._compare(check, name)

            if first_check == second_check:
                client_dict[client] = value
                error_clients.update(client_dict)

        return error_clients
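For context, a minimal sketch of the clients mapping this method expects, reconstructed from how main() (Example #5 below) populates error_clients; the names and values here are placeholders, not real data:

example_clients = {
    'Some Client': {
        'Website': None,   # would hold the Website instance built in main()
        'Check': None,     # would hold the result of self._compare(check, name)
        'Connect': ['Some Client', '5551234567', '<button markup>', 'https://example.com'],
    },
}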
Example #2
def dispatch_website(id, url, keywords):
    """
    Dispatcher that starts the crawl of a website
    """
    try:
        Database.set_website_status(id=id, status='queued')

        # create and set website and page object for a job
        website = Website(id=id, url=url, keywords=keywords)
        website.preInit()
        page = Page(website.url, website.url)

        # set website watch variables in redis db
        rDB.set(website.id + ':pages_queued', 1)
        rDB.set(website.id + ':pages_crawled', 0)

        # Enqueue job in redis-queue
        job = qL.enqueue(crawl_page, website, page)

        log.debug('Website Added in Queue :: {0}'.format(url))
    except Exception as e:
        log.exception('Error occurred in dispatch website')
Example #3
def dispatch_website(id, url, keywords):
    """
    Dispatcher that starts the crawl of a website
    """
    try:
        Database.set_website_status(id=id, status='queued')

        # create and set website and page object for a job
        website = Website(id=id, url=url, keywords=keywords)
        website.preInit()
        page = Page(website.url, website.url)

        # set website completion variables in redis db
        rDB.set(website.id + ':pages_queued', 1)
        rDB.set(website.id + ':pages_crawled', 0)

        # Enqueue job in redis-queue
        job = qH.enqueue(crawl_page, website, page)

        log.debug('Website Added in Queue :: {0}'.format(url))
    except Exception as e:
        log.exception('Error occurred in dispatch website')
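The two dispatchers above are otherwise identical and differ only in the target queue (qL in Example #2, qH in Example #3, presumably separate RQ priority queues). A minimal usage sketch, with placeholder arguments (the id, url, and keywords values are illustrative only):

dispatch_website(
    id='site-42',                    # placeholder website id
    url='https://example.com',       # placeholder start URL
    keywords=['refund', 'contact'],  # placeholder keywords to match against pages
)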
Example #4
def crawl_page(website, page):
    """
    Crawl a single page, check whether the job of crawling the
    website has finished, and take the required follow-up steps
    """
    try:
        # set website status to 'started' if this is the first page of the website
        print('Pages Crawled:: {0}'.format(rDB.get(website.id + ':pages_crawled')))
        print('Pages Queued:: {0}'.format(rDB.get(website.id + ':pages_queued')))

        if rDB.get(website.id + ':pages_queued') == '1':
            Database.set_website_status(id=website.id, status='started')

        log.debug('Crawling :: {0}'.format(page.url))

        # get page content
        log.info('Getting Page Content :: {0}'.format(page.url))
        page.get_content()

        # get keywords matched
        keys = page.get_keywords_matched(website.aho)
        log.info('Matched Keywords :: {0}'.format(keys))

        # get external links
        # page.get_external_links()
        log.info('Found External Links :: {0}'.format(len(page.external_links)))

        # get internal links
        page.get_internal_links(website)
        log.info('Found Internal Links :: {0}'.format(len(page.internal_links)))

        # get status code of all links
        log.info('Getting Status of all Links')
        page.get_status_codes_of_links(website)

        log.info('Enqueueing New Jobs ')
        # enqueue the un-broken internal links
        for p in page.crawl_pages:
            log.info('Enqueued :: {0}'.format(p.url))
            rDB.incr(website.id + ':pages_queued')
            qH.enqueue(crawl_page, website, p)

        log.info('Adding Result to website')
        # add rotto links to result
        if page.rotto_links:
            log.info('Broken Links Found :: {0}'.format(page.rotto_links))
            rDB.rpush(website.id + ':result', Website.result_to_json(page))

        log.debug('Crawled :: {0}'.format(page.url))

        # increment website crawled page counter
        rDB.incr(website.id + ':pages_crawled')

        log.info('Pages Queued:: {0}'.format(rDB.get(website.id + ':pages_queued')))
        log.info('Pages Crawled:: {0}'.format(rDB.get(website.id + ':pages_crawled')))

        # check whether the website has been crawled completely
        if rDB.get(website.id + ':pages_queued') == rDB.get(website.id + ':pages_crawled'):

            log.info('Website {0} crawled Completely'.format(website.url))

            # save results to database
            log.info('Saving results to database')
            qH.enqueue(save_result_to_database, website)

            # send the email to user
            log.info('Sending email to user')
            send_mail_to_user(website)

    except Exception as e:
        log.exception('Error in crawling :: {0}'.format(page.url))
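The completion check above relies on two Redis counters staying in sync; a minimal sketch of that bookkeeping in isolation, assuming a plain redis-py client (the helper names below are illustrative, not part of the project):

import redis

r = redis.Redis()  # stand-in for the project's rDB handle

def mark_enqueued(website_id):
    # incremented once for every page put on the crawl queue
    r.incr(website_id + ':pages_queued')

def mark_crawled(website_id):
    # incremented once for every page actually processed
    r.incr(website_id + ':pages_crawled')

def website_done(website_id):
    # redis-py returns byte strings, so compare the counters as integers
    queued = int(r.get(website_id + ':pages_queued') or 0)
    crawled = int(r.get(website_id + ':pages_crawled') or 0)
    return queued > 0 and queued == crawled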
Example #5
    def main(self):

        row = self._row
        client_links = {}
        error_clients = {}
        unreachable_clients = {'404s': [], 'Missing_Info': [], 'Selenium': []}
        keys = []
        frames = []

        loop = asyncio.get_event_loop()

        try:
            for row in self._clients:

                name = str(row[self._columns[0]])
                number = str(row[self._columns[1]])
                button = str(row[self._columns[2]])
                url = str(row[self._columns[3]])

                # require complete row data, a numeric phone prefix, and an eventplicity button
                if (name and number and button and url and
                        number[:3].isdigit() and re.search('eventplicity', button) is not None):

                    client = Website(name, number, button, url)
                    connected = loop.run_until_complete(client.addWebsite())

                    # a return of 0 means the site could not be reached
                    if connected != 0:
                        check = client.runCheck()

                        if check == {name: {'Index': {'Phone_Number': 0, 'Eventplicity_Link': 0}}}:
                            if len(client._extensions) > 1:
                                compare = self._compare(check, name)
                                if compare is not None:
                                    error_clients[name] = {'Website': client, 'Check': compare, 'Connect': [name, number, button, url]}
                                for key, value in check.items():
                                    keys.append(key)
                                    frame = pd.DataFrame.from_dict(value, orient='index')
                                    frames.append(frame)
                            else:
                                unreachable_clients['Selenium'].append(name)
                        else:
                            compare = self._compare(check, name)
                            if compare is not None:
                                error_clients[name] = {'Website': client, 'Check': compare, 'Connect': [name, number, button, url]}
                            for key, value in check.items():
                                keys.append(key)
                                frame = pd.DataFrame.from_dict(value, orient='index')
                                frames.append(frame)

                    else:
                        unreachable_clients['404s'].append(name)

                    links = {name: client._links}
                    client_links.update(links)

                else:
                    unreachable_clients['Missing_Info'].append(name)

            # re-check flagged clients and keep only those that fail twice
            error_clients = self._secondPass(error_clients, loop)

        finally:
            loop.close()

        self._generateData(keys, frames, client_links, unreachable_clients)
        subject, message = self._composeEmail(error_clients, unreachable_clients)

        return subject, message
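Example #5's main() drives the addWebsite() coroutine from synchronous code through loop.run_until_complete(); a minimal, self-contained sketch of that pattern with a stand-in coroutine (addWebsite itself is defined elsewhere in this project):

import asyncio

async def add_website_stub():
    # stand-in for Website.addWebsite(); simulate a successful connection
    await asyncio.sleep(0)
    return 1  # main() treats a non-zero return as "connected"

loop = asyncio.new_event_loop()
try:
    connected = loop.run_until_complete(add_website_stub())
    print(connected)
finally:
    loop.close()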