def getWantToSeeMovies(self):
    """Scrape the user's "quero-ver" (want-to-see) list and check each title on Netflix.

    Every catalogue page under ``<baseURL>/usuario/<username>/quero-ver/`` is
    fetched, each listed movie page is fetched in turn, and each movie's
    original title is passed to ``NetflixWrapper.isTitleInNetflix``. All of
    this work is submitted as tasks to a ``ThreadPool`` of 8 workers.

    Returns:
        A two-element list ``[moviesVec, netflixVec]``:

        * ``moviesVec`` -- dicts with the keys ``'name'`` and ``'duration'``,
          one per movie page that could be parsed.
        * ``netflixVec`` -- the second element of each ``isTitleInNetflix``
          response whose first element was truthy.

        NOTE(review): results are appended from pool tasks, so their order
        presumably follows task completion rather than list order -- confirm
        against the ThreadPool implementation.
    """
    searchURL = self.baseURL + '/usuario/' + self.username + '/quero-ver/'
    moviesVec = []
    netflixVec = []
    nThreads = 8
    threadPool = ThreadPool(nThreads)
    threadPool.startWorking()

    def fetchSoup(url):
        # Parse inside the ``with`` block so the HTTP response is closed as
        # soon as it has been read, instead of lingering until garbage
        # collection while many requests are in flight.
        with urllib.request.urlopen(
                urllib.request.Request(url, headers=self.hdr)) as response:
            return BeautifulSoup(response, 'html.parser')

    # NOTE(review): the task functions keep their original parameter names
    # because the dicts handed to putInQueue use those names as keys --
    # presumably they are forwarded as keyword arguments; confirm.
    def parsePage(pageUrl):
        catalogueSoup = fetchSoup(pageUrl)
        print(pageUrl)
        # looping through each movie in the current page
        for movieItem in catalogueSoup.findAll(
                'li', {'class': 'span2 movie_list_item'}):
            # Search the already-parsed tag directly; re-parsing its string
            # form only repeated work.
            link = movieItem.find("a")
            href = link.get('href') if link is not None else None
            if href is None:
                # A list item without a usable link cannot be followed.
                continue
            moviehref = str(href)
            print(moviehref)
            movieURL = self.baseURL + moviehref
            threadPool.putInQueue(parseMovie, {"movieURL": movieURL})

    def parseMovie(movieURL):
        moviePageSoup = fetchSoup(movieURL)
        titleTag = moviePageSoup.find(
            'h2', {'class': 'movie-original-title'})
        durationTag = moviePageSoup.find('span', {'class': 'running_time'})
        if titleTag is None or durationTag is None:
            # find() returns None when the element is absent; skip the page
            # rather than raising AttributeError inside a pool task.
            return
        movie = {
            'name': str(titleTag.string),
            'duration': str(durationTag.string),
        }
        print(movie)
        moviesVec.append(movie)
        threadPool.putInQueue(checkNetflix, {'title': movie['name']})

    def checkNetflix(title):
        netflixWrapper = NetflixWrapper()
        resp = netflixWrapper.isTitleInNetflix(title)
        if resp[0]:
            netflixVec.append(resp[1])

    for i in range(1, self.getWantToSeePages() + 1):
        pageUrl = searchURL + '?pagina=' + str(i)
        threadPool.putInQueue(parsePage, {'pageUrl': pageUrl})

    # block until all tasks are done
    threadPool.end()
    return [moviesVec, netflixVec]
def getWantToSeeMovies(self):
    """Scrape the user's "quero-ver" (want-to-see) list and check each title on Netflix.

    Every catalogue page under ``<baseURL>/usuario/<username>/quero-ver/`` is
    fetched, each listed movie page is fetched in turn, and each movie's
    original title is passed to ``NetflixWrapper.isTitleInNetflix``. All of
    this work is submitted as tasks to a ``ThreadPool`` of 8 workers.

    Returns:
        A two-element list ``[moviesVec, netflixVec]``:

        * ``moviesVec`` -- dicts with the keys ``'name'`` and ``'duration'``,
          one per movie page that could be parsed.
        * ``netflixVec`` -- the second element of each ``isTitleInNetflix``
          response whose first element was truthy.

        NOTE(review): results are appended from pool tasks, so their order
        presumably follows task completion rather than list order -- confirm
        against the ThreadPool implementation.
    """
    searchURL = self.baseURL + '/usuario/' + self.username + '/quero-ver/'
    moviesVec = []
    netflixVec = []
    nThreads = 8
    threadPool = ThreadPool(nThreads)
    threadPool.startWorking()

    def fetchSoup(url):
        # Parse inside the ``with`` block so the HTTP response is closed as
        # soon as it has been read, instead of lingering until garbage
        # collection while many requests are in flight.
        with urllib.request.urlopen(
                urllib.request.Request(url, headers=self.hdr)) as response:
            return BeautifulSoup(response, 'html.parser')

    # NOTE(review): the task functions keep their original parameter names
    # because the dicts handed to putInQueue use those names as keys --
    # presumably they are forwarded as keyword arguments; confirm.
    def parsePage(pageUrl):
        catalogueSoup = fetchSoup(pageUrl)
        print(pageUrl)
        # looping through each movie in the current page
        for movieItem in catalogueSoup.findAll(
                'li', {'class': 'span2 movie_list_item'}):
            # Search the already-parsed tag directly; re-parsing its string
            # form only repeated work.
            link = movieItem.find("a")
            href = link.get('href') if link is not None else None
            if href is None:
                # A list item without a usable link cannot be followed.
                continue
            moviehref = str(href)
            print(moviehref)
            movieURL = self.baseURL + moviehref
            threadPool.putInQueue(parseMovie, {"movieURL": movieURL})

    def parseMovie(movieURL):
        moviePageSoup = fetchSoup(movieURL)
        titleTag = moviePageSoup.find(
            'h2', {'class': 'movie-original-title'})
        durationTag = moviePageSoup.find('span', {'class': 'running_time'})
        if titleTag is None or durationTag is None:
            # find() returns None when the element is absent; skip the page
            # rather than raising AttributeError inside a pool task.
            return
        movie = {
            'name': str(titleTag.string),
            'duration': str(durationTag.string),
        }
        print(movie)
        moviesVec.append(movie)
        threadPool.putInQueue(checkNetflix, {'title': movie['name']})

    def checkNetflix(title):
        netflixWrapper = NetflixWrapper()
        resp = netflixWrapper.isTitleInNetflix(title)
        if resp[0]:
            netflixVec.append(resp[1])

    for i in range(1, self.getWantToSeePages() + 1):
        pageUrl = searchURL + '?pagina=' + str(i)
        threadPool.putInQueue(parsePage, {'pageUrl': pageUrl})

    # block until all tasks are done
    threadPool.end()
    return [moviesVec, netflixVec]