Example no. 1

import re
import multiprocessing as m  # `m` is inferred from m.JoinableQueue / m.Process
import urlparse

from bs4 import BeautifulSoup
from django.db import IntegrityError

# Project-local names used below but not defined in this excerpt: Crawler,
# tools, WebPage, Indexing and NUMBER_OF_THREADS are assumed to be importable
# from the surrounding project.

class HtmlParser(object):

    crawled_file = ''
    urls_file = ''
    depth, width = 3, 5

    def __init__(self, urls):
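        # urllist holds urls still to visit, crawled holds urls already
        # fetched; both sets are dumped to text files via update_files().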
        self.urllist = set(urls)
        self.cr = Crawler(HtmlParser.depth, HtmlParser.width)
        self.title = ""
        self.text = ""
        self.word_pos = {}
        self.queue = m.JoinableQueue()
        self.crawled = set()
        HtmlParser.crawled_file = 'crawled_links.txt'
        HtmlParser.urls_file = 'urls.txt'
        tools.set_to_file(self.urllist, HtmlParser.urls_file)

    def geturllist(self, response, soup):
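        # Collect up to `width` new links: resolve each href against the page
        # url, strip the fragment, and skip anything already seen or queued.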
        base = response.geturl()
        i = 0
        for link in soup.find_all('a'):
            if i >= self.cr.width:
                break
            parsed_link = urlparse.urldefrag(urlparse.urljoin(
                base, link.get('href')))[0].encode('ascii', 'ignore')
            if (parsed_link in self.crawled) or (parsed_link in self.urllist):
                continue
            self.urllist.add(parsed_link)
            i += 1

    def gettext(self, soup):
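        # Remove non-content tags, then collapse all whitespace in the
        # remaining visible text.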
        for s in soup.find_all(['style', 'script', '[document]', 'head',
                                'title']):
            s.decompose()
        self.text = ' '.join(soup.get_text().split())

    def gettitle(self, soup):
        self.title = soup.title.string.encode('utf-8')

    def startcrawler(self):
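        # Worker loop: consume urls from the shared queue until a None
        # sentinel is received, indexing each fetched page along the way.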
        while True:
            url = self.queue.get()
            if url is None:
                break
            if url not in self.crawled:
                try:
                    self.urllist.remove(url)
                except KeyError:
                    print('No such url in urllist')
                self.crawled.add(url)
                response = self.cr.open(url)
                soup = BeautifulSoup(response, 'html.parser')
                self.gettitle(soup)
                self.gettext(soup)
                self.word_pos = self._words_positions(self.text)
                if self.cr.depth >= 0:
                    self.geturllist(response, soup)
                page = WebPage(url=url, title=self.title, text=self.text,
                               indexed=False)
                try:
                    page.save()
                except IntegrityError:
                    print "IntegrityError founded. Go ahead."
                for (word, list_pos) in self.word_pos.iteritems():
                    index = Indexing.objects.create(
                            word=word, frequency=len(list_pos), webpage=page)
                    index.save()
                page.indexed = True
                page.save()
                self.update_files()
                for link in self.urllist:
                    self.queue.put(link)
            # Exactly one task_done() per get(); calling it a second time for
            # the same item makes the JoinableQueue raise
            # "task_done() called too many times".
            self.queue.task_done()

    def multiproc(self):
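        # Seed the queue with the start urls, spawn NUMBER_OF_THREADS worker
        # processes, then shut them down with None sentinels once the queue
        # has been drained.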
        if len(self.urllist) > 0:
            for link in self.urllist:
                self.queue.put(link)

        workers = []
        for _ in range(NUMBER_OF_THREADS):
            worker = m.Process(target=self.startcrawler)
            workers.append(worker)
            worker.start()

        # Wait until every queued url has been marked done, then send one
        # None sentinel per worker so each process can leave its loop.
        self.queue.join()
        for _ in range(NUMBER_OF_THREADS):
            self.queue.put(None)

        for worker in workers:
            worker.join()

    def update_files(self):
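        # Dump the crawled and pending url sets to their text files.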
        tools.set_to_file(self.crawled, HtmlParser.crawled_file)
        tools.set_to_file(self.urllist, HtmlParser.urls_file)

    @staticmethod
    def _words_positions(text):
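        # Map every word in `text` to the list of positions where it appears.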
        words = re.findall(ur'\w+', unicode(text).lower(), flags=re.UNICODE)
        word_pos = {}
        for index, word in enumerate(words):
            if word in word_pos:
                word_pos[word].append(index)
            else:
                word_pos[word] = [index]
        return word_pos
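
# Minimal usage sketch (assumptions: the Django project providing the WebPage
# and Indexing models is configured, the Crawler and tools helpers are
# importable, and NUMBER_OF_THREADS is defined; the seed url is illustrative):
#
#     parser = HtmlParser(['http://example.com'])
#     parser.multiproc()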