Example #1
 def test_generate_html(self):
     index = {
         'http://www.davidcmoss.co.uk/':
         Page(title=['David C Moss'],
              links=set([
                  u'http://www.davidcmoss.co.uk/static/Curriculum Vitae.pdf'
              ]),
              ex_links=set(['https://www.heroku.com']),
              images=['/static/img/profile.jpeg']),
         'http://www.davidcmoss.co.uk/static/Curriculum%20Vitae.pdf':
         Page(title=[], links=set([]), ex_links=set([]), images=[])
     }
     html = generate_html(index)
     assert "https://www.heroku.com" in html
Example #2
 def test_crawl_site(self):
     index = crawl_site('http://www.davidcmoss.co.uk')
     expected_response = {
         u'http://www.davidcmoss.co.uk/':
         Page(title=u'David C Moss',
              links=set([
                  u'http://www.davidcmoss.co.uk/static/Curriculum Vitae.pdf'
              ]),
              ex_links=['https://www.heroku.com'],
              images=[
                  u'http://www.davidcmoss.co.uk/static/img/profile.jpeg'
              ]),
         u'http://www.davidcmoss.co.uk/static/Curriculum Vitae.pdf':
         Page(title=[], links=set([]), ex_links=[], images=[])
     }
     assert index == expected_response
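crawl_site is likewise only exercised here; the following is a rough, hedged sketch of the crawl the expected_response implies, assuming requests, BeautifulSoup, and the Page container from the previous sketch (none of which are confirmed by the source):

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def crawl_site_sketch(start_url):
    # Behaviour implied by the test: index every same-domain page, record
    # external links separately, and give non-HTML resources empty entries.
    domain = urlparse(start_url).netloc
    index, queue = {}, [start_url]
    while queue:
        url = queue.pop()
        if url in index:
            continue
        response = requests.get(url)
        if 'text/html' not in response.headers.get('Content-Type', ''):
            index[url] = Page(title=[], links=set(), ex_links=[], images=[])
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        links, ex_links = set(), []
        for a in soup.find_all('a', href=True):
            target = urljoin(url, a['href'])
            if urlparse(target).netloc == domain:
                links.add(target)
                queue.append(target)
            else:
                ex_links.append(target)
        images = [urljoin(url, img['src']) for img in soup.find_all('img', src=True)]
        title = soup.title.string if soup.title else []
        index[url] = Page(title=title, links=links, ex_links=ex_links, images=images)
    return index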
Example #3
    def build_crawl_list(self):
        """
        Build a list of all of the URLs based on the depth specified.
        """
        current_depth = 1
        page = requests.get(self.base_url).text
        if self.children > 0:
            self.urls = Page.get_urls(page)[:self.children]
        else:
            self.urls = Page.get_urls(page)
        # Holds previously scanned URLs, so the same URL is not scanned twice
        scanned_urls = []
        while current_depth <= self.depth:
            # Collect the links on each page, then search those pages for more
            print('Starting crawl depth', current_depth, 'with', len(self.urls), 'URLs to scan')
            new_urls = []
            for url in self.urls:
                # Scan the URL only if it has not been scanned yet and is not an image, XML feed, etc.
                if url not in scanned_urls:
                    if TasteDotCom.is_wanted_object(url):
                        print('Looking for child URLs in', url)
                        markup = requests.get(url).text
                        scanned_urls.append(url)
                        # Accumulate the results from every page rather than overwriting them
                        if self.children > 0:
                            new_urls += Page.get_urls(markup)[:self.children]
                        else:
                            new_urls += Page.get_urls(markup)
            print('Found', len(new_urls), 'new pages')
            # for url in new_urls:
            #     check_and_add(url)
            self.urls += new_urls
            current_depth += 1
        print('Finished crawling', self.base_url, 'found', len(self.urls), 'total URLs')

    # def run(self):
    #     """
    #     Start Crawling the page specified
    #     """
    #     #todo Make use of this method
    #     print "Starting crawl session for", self.base_url
    #     page = requests.get(self.base_url).text
    #     child_urls = Page.get_urls(page)
    #     for url in child_urls:
    #         self.check_and_add(url)

# def check_and_add(url):
#     pass
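check_and_add is left as a stub above; one hedged guess at the role the commented-out calls give it (an illustration only, not the author's code) is a small filtering method on the crawler class:

    # Hypothetical illustration only: the real body of check_and_add is not
    # shown in the source, so both the logic and the method form are guesses.
    def check_and_add(self, url):
        """Queue a URL for crawling if it has not been seen and is a wanted type."""
        if url not in self.urls and TasteDotCom.is_wanted_object(url):
            self.urls.append(url)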
Example #4
 def test_get_urls(self):
     # Use a known static page for testing
     html = open("../miscellany/sausage_and_punpkin_curry.html").read()
     urls = Page.get_urls(html)
     self.assertEqual(len(urls), 387)
     pattern = re.compile(r"http://www\.[/\w.+]+")
     for url in urls:
         # Check each URL matches a hyperlink pattern
         self.assertTrue(pattern.match(url))
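Page.get_urls itself is not shown; below is a minimal, self-contained stand-in that this test could be exercising, assuming BeautifulSoup (the real parser and filtering rules may differ):

from bs4 import BeautifulSoup

def get_urls_sketch(html):
    # Illustrative stand-in for Page.get_urls (the name and parser choice
    # are assumptions): collect absolute http(s) hrefs from the markup.
    soup = BeautifulSoup(html, 'html.parser')
    return [a['href'] for a in soup.find_all('a', href=True)
            if a['href'].startswith('http')]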
Example #5
import networkx as nx
import matplotlib.pyplot as plt

from crawler import Crawler, Page, Document, Corpus

if __name__ == '__main__':
    start_page = Page('http://info.cern.ch/hypertext/WWW/TheProject.html')
    crawler = Crawler(start_page)
    crawler.crawl()


    # Build a directed graph of the crawled web: nodes are hashed page
    # addresses, edges run from a page to every link it contains
    web_graph = nx.DiGraph()
    edges = []

    for page in crawler.web:
        for link in page.links:
            edges.append((hash(page.address), hash(link)))

    web_graph.add_edges_from(edges)
    nx.draw(web_graph)
    plt.show()
    # Score every crawled page with PageRank, then order pages by score
    page_ranks = nx.pagerank(web_graph)
    for page in crawler.web:
        page.page_rank = page_ranks[hash(page.address)]
    pages = sorted(crawler.web, key=lambda p: p.page_rank, reverse=True)

    # Collect (address, text) pairs in rank order for later processing
    corpus = []
    for page in pages:
        corpus.append((page.address, page.text))
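The script stops once the corpus is assembled; a small follow-up of my own (not part of the original) could report the highest-ranked pages:

    # Possible follow-up, not in the original script: print the ten
    # highest-ranked pages and how much text was collected for each.
    for address, text in corpus[:10]:
        print(address, '-', len(text), 'characters of text')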