Beispiel #1
0
class Page():
    """A fetched page together with the in-article links found in it.

    Constructing a ``Page`` reads the whole response, parses it with
    BeautifulSoup and immediately collects the page's links into
    ``self.frontier``.
    """

    def __init__(self, link, response):
        """Read and parse *response*, then collect the page's links.

        Args:
            link: Identifier of the page; returned unchanged by ``str(page)``.
            response: Object whose ``read()`` method returns the page markup.
        """
        self.link = link
        # NOTE(review): Frontier is defined elsewhere in the project; this
        # class only relies on its ``append`` method.
        self.frontier = Frontier(pop='random')
        self.response = response
        self.read_resp = self.response.read()
        self.soup = BeautifulSoup(self.read_resp)
        self.body = self.soup.body
        self.get_links()

    def __str__(self):
        """Return the link this page was created with."""
        return self.link

    def get_links(self):
        """Append every valid link of the article content to the frontier.

        Only anchors inside ``<div id="mw-content-text">`` are considered.
        A page without a body, or without that div, contributes no links.
        Each accepted href is also printed to standard output.
        """
        if self.body is None:
            return
        article = self.body.find("div", {"id": "mw-content-text"})
        if article is None:
            return
        # ``findAll`` is kept (rather than ``find_all``) because it is the
        # spelling the original code relied on.
        for anchor in article.findAll('a'):
            # Anchors without an href attribute are skipped instead of
            # relying on a swallowed KeyError.
            href = anchor.get('href')
            if href and self.is_valid_link(href):
                self.frontier.append(href)
                # Single-argument call form prints identically on
                # Python 2 and Python 3.
                print(href)

    def is_valid_link(self, link):
        """Return *link* if it is an acceptable path, otherwise ``None``.

        A link is accepted when it starts with ``/wiki/`` and contains no
        ``:`` character.

        Args:
            link: The href value to check; ``None`` or an empty string is
                rejected.

        Returns:
            The link itself (truthy) when accepted, ``None`` otherwise.
        """
        # NOTE(review): the ":" exclusion presumably filters namespaced
        # pages such as "File:..." -- confirm against the crawler's intent.
        if link and link.startswith('/wiki/') and ':' not in link:
            return link
        return None