def __init__(self, database_writer=None, database_reader=None, parser=None):
    # Default arguments are evaluated once at definition time, so putting
    # DatabaseWriter() in the signature would share a single instance across
    # every object; create fresh defaults in the body instead.
    self.database_writer = database_writer if database_writer is not None else DatabaseWriter()
    self.database_reader = database_reader if database_reader is not None else DatabaseReader()
    self.parser = parser if parser is not None else Parser()
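A hedged usage sketch of the injectable constructor above; the enclosing class name `Store` and the stub's `write` method are assumptions for illustration, not part of the original code:

class StubWriter:
    # Hypothetical in-memory stand-in for DatabaseWriter, letting tests
    # inspect what would have been written without touching a database.
    def __init__(self):
        self.records = []

    def write(self, record):
        self.records.append(record)

store = Store(database_writer=StubWriter())  # 'Store' is an assumed name for the enclosing class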
def setUp(self):
    self.parser = Parser()
    self.test_soup = BeautifulSoup(
        '<!DOCTYPE html>\n<html>\n\n<head>\n'
        ' <title>Cats and Dogs</title> \n'
        '<meta name="description" content="Page about cats and dogs"> \n'
        ' <meta name="keywords" content="cats,dogs">\n'
        '</head><body>'
        '<a href="www.dogs.com">Dogs</a>'
        '<a href="www.cats.com">Cats</a>'
        '</body></html>',
        'html.parser')
    self.bad_test_soup = BeautifulSoup('<!DOCTYPE html>\n<html>', 'html.parser')
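A possible companion test for these fixtures, using only BeautifulSoup's documented API; the test name is an assumption:

def test_fixture_links(self):
    # Hypothetical sanity check: the well-formed soup exposes both anchors,
    # while the truncated one parses to a document with no anchors at all.
    hrefs = [a['href'] for a in self.test_soup.find_all('a')]
    self.assertEqual(['www.dogs.com', 'www.cats.com'], hrefs)
    self.assertEqual([], self.bad_test_soup.find_all('a'))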
async def test_get_ratings_per_user(self):
    content = await self._load_movie_user_reviews()
    parser = Parser(content)
    ratings = parser.get_ratings_per_user()
    answer = [('127878', 10), ('78539', 10), ('42238', 1), ('70404', 9)]
    self.assertEqual(answer, ratings)
def analyze_url(self, page: Page):
    self.seen_urls.add(page)
    if not self.check_domains(str(page)):
        return
    html = self.get_html(page.url)
    if html is None:
        return
    if self.analyze_robot(page.url):
        return
    if self.visited_urls_count < self.max_count_urls:
        self.visited_urls_count += 1
        parser = Parser(page.url)
        info = parser.get_info(html, str(page))
        # Record the page as a result when it matches the search request.
        if self.request.intersection(info) and page not in self.result_urls:
            self.result_urls.add(page)
            self.update_parents()
            if self.download:
                self.write_html(page, html)
        # Queue every newly discovered link for a later visit; links ending
        # in '/' are replaced by their parent URL.
        found_links = set(parser.get_urls(html))
        for link in found_links.difference(self.seen_urls):
            if link:
                if str(link)[-1] == '/':
                    new_page = Page(link.parent)
                else:
                    new_page = Page(link)
                self.urls.put(new_page)
    else:
        return
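A sketch of how analyze_url might be driven, assuming `self.urls` is a `queue.Queue` of pending `Page` objects; the driver method name `crawl` is hypothetical:

def crawl(self):
    # Hypothetical driver loop: drain the pending-URL queue, analyzing each
    # unseen page until the queue is empty or the URL budget is exhausted.
    while not self.urls.empty() and self.visited_urls_count < self.max_count_urls:
        page = self.urls.get()
        if page not in self.seen_urls:
            self.analyze_url(page)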
def _parse_content(self, url, content):
    if url in self._visited:
        return
    self._visited.add(url)
    self.logger.info(f'url {url} visited')
    parser = Parser(content)
    # Queue every outgoing link that has not been visited yet.
    links = parser.get_links() - self._visited
    self.logger.info(f'{len(links)} links added')
    for link in links:
        self._push_url(link)
    # Dispatch on the URL path: per-user rating pages, movie main pages
    # (metadata plus ratings), and dedicated rating pages.
    if 'other/moviePoint' in url:
        ratings = self._parse_ratings_per_user(url, parser)
        if ratings:
            self._ratings.extend(ratings)
    elif 'moviedb/main' in url:
        metadata = self._parse_metadata(url, parser)
        if metadata:
            self._metadata.append(metadata)
        ratings = self._parse_ratings_per_movie(url, parser)
        if ratings:
            self._ratings.extend(ratings)
    elif 'moviedb/grade' in url:
        ratings = self._parse_ratings_per_movie(url, parser)
        if ratings:
            self._ratings.extend(ratings)
async def test_get_ratings_per_movie_1(self):
    content = await self._load_movie_main()
    parser = Parser(content)
    ratings = parser.get_ratings_per_movie()
    answer = [('ckh5SQ==', 0), ('Q0s1Yk0=', 9), ('OHVFYTQ=', 10),
              ('NGsxa0M=', 10), ('NHJ5aHM=', 10)]
    self.assertEqual(answer, ratings)
def __init__(self):
    try:
        self.crawler = Crawler()
        self.graph = Graph()
        self.parser = Parser()
    except Exception as e:
        print("ERROR " + str(e))
        sys.exit(-1)
async def test_get_ratings_per_movie_2(self):
    content = await self._load_movie_reviews()
    parser = Parser(content)
    ratings = parser.get_ratings_per_movie()
    answer = [('OEhrRm4=', 9), ('OEVQQnU=', 1), ('MkpxV2Y=', 1),
              ('QmtPMWI=', 9), ('VU9ROA==', 7), ('dEdabQ==', 10),
              ('NFB0NUY=', 6), ('M0tpUlA=', 10), ('QzF4Wnc=', 10),
              ('OUMwUnA=', 7)]
    self.assertEqual(answer, ratings)
async def test_get_metadata(self):
    content = await self._load_movie_main()
    parser = Parser(content)
    metadata = parser.get_metadata()
    self.assertIsNotNone(metadata)
    self.assertEqual('우리집 (2019)', metadata['title'])
    self.assertEqual('드라마/가족', metadata['genre'])
    self.assertEqual('한국', metadata['country'])
    self.assertEqual(92, metadata['running_time'])
async def test_get_links(self):
    content = await self._load_movie_main()
    parser = Parser(content)
    links = parser.get_links()
    self.logger.info('# of links: %d', len(links))
    self.assertTrue(any('movieId=128635' in link for link in links))
    self.assertTrue(any('personId=271829' in link for link in links))
    self.assertTrue(any('personId=518464' in link for link in links))
def fill_disallow_urls(self, url: URL):
    parser = Parser(url)
    host = parser.host
    if host in self.seen_hosts:
        return
    self.seen_hosts.add(host)
    # Fetch the host's robots.txt and keep only the section that applies
    # to every crawler ('user-agent: *').
    robots_txt_url = parser.host / 'robots.txt'
    robots_txt = requests.get(str(robots_txt_url)).text.lower()
    try:
        index = robots_txt.index('user-agent: *')
    except ValueError:
        return
    robots_txt = robots_txt[index:]
    robots_txt = robots_txt.split('\n')
    try:
        for line in robots_txt:
            if line.startswith('disallow'):
                # Turn each 'disallow: /path*' rule into a regex anchored
                # at the host; '*' becomes the '.+' wildcard.
                line = line.replace('*', '.+')
                parts = line.split(':')
                self.disallow_urls.add(
                    re.compile(fr"{host}/{parts[1][2:]}", re.IGNORECASE))
    except IndexError:
        pass
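A hedged sketch of how the compiled disallow patterns might be consulted before visiting a page; the helper name `is_disallowed` is hypothetical:

def is_disallowed(self, url: URL) -> bool:
    # Hypothetical helper: a URL is blocked when any robots.txt pattern
    # compiled above matches it from the beginning of the string.
    return any(pattern.match(str(url)) for pattern in self.disallow_urls)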
def __init__(self, url):
    self.url = url
    self.parser = Parser()
def __init__(self) -> None:
    self.downloader = Downloader()
    self.parser = Parser()
def __init__(self, mode):
    self.mode = mode
    self.store = self._set_orm()
    self.parser = Parser()
    self.links = self._set_links()
def setUp(self):
    self.parser = Parser()
    with open(FILE_PATH + '/test.html', 'r') as file:
        self.html = file.read()
    self.titles = self.parser.parser_titles(self.html)
def test_get_url_with_urls(self):
    with open('test.html', 'r') as test:
        text = test.read()
    parser = Parser(URL('https://t/'))
    self.assertEqual(len(parser.get_urls(text)), 4)