Example #1
        # Start listening for commands.
        while self.number_of_non_trivial_indexes <= self.max_links_to_crawl:
            write_cmd = self.main_thread_cmd_queue.pop(
                timeout=Crawler.POP_TIMEOUT_IN_SECONDS)
            if isinstance(write_cmd, RunOnMainThread):
                write_cmd.run()
            else:
                logger.warning(
                    "Main thread received a command it couldn't parse: %s",
                    write_cmd)

        # Crawling complete. Notify the team!
        logger.info("Crawling complete. Logged: {n_urls}".format(
            n_urls=len(self.finished_indexers_list)))


def main():
    #[x.delete() for x in list(IndexedPage.objects.all()) + list(WordFromIndexedPage.objects.all())]
    crawler = Crawler(links_queue=TQueue(
        ["https://www.facebook.com/moforjohn"]),
                      max_active_indexers=10,
                      max_links_to_crawl=50)
    #crawler.start()
    #crawler.join()
    crawler.run()
    logger.info("Crawling complete!")


if __name__ == '__main__':
    profile_main()
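
The crawler's main loop above pops command objects off a thread-safe queue and runs anything that implements the RunOnMainThread interface. That class isn't shown in the example; the following is a minimal, hypothetical sketch of the same pattern (only the RunOnMainThread name comes from the example; the queue.Queue usage and the main_loop/should_stop helpers are assumptions) built on Python's standard queue module.

import logging
import queue

logger = logging.getLogger(__name__)


class RunOnMainThread:
    """Hypothetical command object: wraps a callable to be executed on the main thread."""

    def __init__(self, fn, *args, **kwargs):
        self.fn = fn
        self.args = args
        self.kwargs = kwargs

    def run(self):
        self.fn(*self.args, **self.kwargs)


def main_loop(cmd_queue, should_stop):
    # Pop commands until the caller's stop condition is met, mirroring the
    # while-loop in the crawler example above.
    while not should_stop():
        try:
            cmd = cmd_queue.get(timeout=1.0)
        except queue.Empty:
            continue
        if isinstance(cmd, RunOnMainThread):
            cmd.run()
        else:
            logger.warning("Unrecognized command: %s", cmd)

Worker threads would put RunOnMainThread(some_callable) onto cmd_queue, so all state mutation happens on the main thread, which is presumably why the crawler routes indexer callbacks through main_thread_cmd_queue.
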
Example #2
        all_links = [
            urljoin(self.final_url, link)
            for link in lxml.html.fromstring(self.raw_html).xpath('//a/@href')
        ]
        if hasattr(self.links_queue, "extend"):
            self.links_queue.extend(all_links)
        logger.info("finished indexing url={url}".format(url=self.url))
        self.done()

    def run(self):
        logger.info("starting to index page url={url}".format(url=self.url))
        self.final_url = final_url_after_redirects(self.url)
        if self.final_url is not None:
            if self.links_queue is not None:
                self.populate_indexedPage(self.final_url)
            else:
                logger.warn("links_queue is None! Aborting")
                self.done()
        else:
            logger.warn("final_url is None. No content picked up. Aborting")
            self.done()


def main():
    #t = Indexer(indexed_page="http://example.com", on_finished_indexing=None, links_queue=[])
    # t.start()
    # t.join()
    pass


if __name__ == '__main__':
    profile_main('main()')
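
The indexer's core step, visible at the top of this example, is pulling every href out of the fetched HTML with lxml and resolving it against the final (post-redirect) URL. A standalone sketch of just that step, using hypothetical sample values in place of self.final_url and self.raw_html, could look like this:

from urllib.parse import urljoin

import lxml.html

# Hypothetical stand-ins for self.final_url and self.raw_html.
final_url = "https://example.com/articles/"
raw_html = '<html><body><a href="/about">About</a> <a href="page2.html">Next</a></body></html>'

all_links = [
    urljoin(final_url, link)
    for link in lxml.html.fromstring(raw_html).xpath('//a/@href')
]
print(all_links)
# ['https://example.com/about', 'https://example.com/articles/page2.html']

urljoin resolves both absolute paths and relative ones against the base, so the links queue only ever receives fully qualified URLs.
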
Example #3
        all_links = [urljoin(self.final_url, link)
                     for link in lxml.html.fromstring(self.raw_html).xpath('//a/@href')]
        if hasattr(self.links_queue, "extend"):
            self.links_queue.extend(all_links)
        logger.info("finished indexing url={url}".format(url=self.url))
        self.done()

    def run(self):
        logger.info("starting to index page url={url}".format(url=self.url))
        self.final_url = final_url_after_redirects(self.url)
        if self.final_url is not None:
            if self.links_queue is not None:
                self.populate_indexedPage(self.final_url)
            else:
                logger.warning("links_queue is None! Aborting")
                self.done()
        else:
            logger.warning("final_url is None. No content picked up. Aborting")
            self.done()


def main():
    #t = Indexer(indexed_page="http://example.com", on_finished_indexing=None, links_queue=[])
    # t.start()
    # t.join()
    pass

if __name__ == '__main__':
    profile_main('main()')
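
Both indexer examples call final_url_after_redirects, whose implementation isn't shown. A plausible sketch, assuming the helper follows redirects with the requests library and returns None on any fetch failure (both are assumptions, not the example's actual code), is:

import requests


def final_url_after_redirects(url, timeout=10):
    # Hypothetical implementation: follow redirects and return the landing URL,
    # or None when the page can't be fetched.
    try:
        response = requests.get(url, timeout=timeout, allow_redirects=True)
        response.raise_for_status()
        return response.url
    except requests.RequestException:
        return None

Returning None on failure matches the example's run() method, which checks "if self.final_url is not None" before indexing.
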
Example #4
        # Start listening for commands.
        while self.number_of_non_trivial_indexes <= self.max_links_to_crawl:
            write_cmd = self.main_thread_cmd_queue.pop(timeout=Crawler.POP_TIMEOUT_IN_SECONDS)
            if isinstance(write_cmd, RunOnMainThread):
                write_cmd.run()
            else:
                logger.warn("Main thread received a command it couldn't parse: ", write_cmd)

        # Crawling complete. Notify the team!
        logger.info("Crawling complete. Logged: {n_urls}".format(
            n_urls=len(self.finished_indexers_list)))


def main():
    #[x.delete() for x in list(IndexedPage.objects.all()) + list(WordFromIndexedPage.objects.all())]
    crawler = Crawler(
        links_queue=TQueue(
            ["https://www.facebook.com/moforjohn"]),
        max_active_indexers=10,
        max_links_to_crawl=50)
    #crawler.start()
    #crawler.join()
    crawler.run()
    logger.info("Crawling complete!")

if __name__ == '__main__':
    profile_main()