Example #1
0
 def __init__(self,
              database_writer=None,
              database_reader=None,
              parser=None):
     """Store the collaborators this object delegates to.

     Args:
         database_writer: writer used to persist data; a fresh
             DatabaseWriter is created when omitted.
         database_reader: reader used to load data; a fresh
             DatabaseReader is created when omitted.
         parser: parser for documents; a fresh Parser is created
             when omitted.
     """
     # The original signature used call-time defaults (DatabaseWriter(),
     # DatabaseReader(), Parser()), which are evaluated ONCE at function
     # definition and then silently shared by every instance created with
     # the defaults -- the classic mutable-default pitfall.  A None
     # sentinel gives each instance its own collaborators.
     self.database_writer = DatabaseWriter() if database_writer is None else database_writer
     self.database_reader = DatabaseReader() if database_reader is None else database_reader
     self.parser = Parser() if parser is None else parser
 def setUp(self):
     """Build a fresh Parser plus one well-formed and one truncated soup."""
     valid_html = '<!DOCTYPE html>\n<html>\n\n<head>\n <title>Cats and Dogs</title> \n<meta name="description" content="Page about cats and dogs"> \n <meta name="keywords" content="cats,dogs">\n</head><body><a href="www.dogs.com">Dogs</a><a href="www.cats.com">Cats</a></body></html>'
     truncated_html = '<!DOCTYPE html>\n<html>'
     self.parser = Parser()
     self.test_soup = BeautifulSoup(valid_html, 'html.parser')
     self.bad_test_soup = BeautifulSoup(truncated_html, 'html.parser')
    async def test_get_ratings_per_user(self):
        """(user_id, rating) pairs are extracted in page order."""
        document = await self._load_movie_user_reviews()
        expected = [('127878', 10), ('78539', 10), ('42238', 1), ('70404', 9)]
        self.assertEqual(expected, Parser(document).get_ratings_per_user())
Example #4
0
 def analyze_url(self, page: Page):
     """Visit one page: fetch its HTML, record it if it matches the
     request keywords, and enqueue newly discovered links.

     Silently returns when the page is outside the allowed domains,
     fails to download, is forbidden by robots.txt, or the visit
     budget is exhausted.
     """
     self.seen_urls.add(page)
     if not self.check_domains(str(page)):
         return
     html = self.get_html(page.url)
     if html is None:
         return
     if self.analyze_robot(page.url):
         return
     # Guard clause replaces the original `if ... else: return` wrapper
     # around the whole body (the else branch was dead weight).
     if self.visited_urls_count >= self.max_count_urls:
         return
     self.visited_urls_count += 1
     parser = Parser(page.url)
     info = parser.get_info(html, str(page))
     # Non-empty intersection == at least one requested keyword found.
     if self.request.intersection(info) and page not in self.result_urls:
         self.result_urls.add(page)
         self.update_parents()
         if self.download:
             self.write_html(page, html)
     found_links = set(parser.get_urls(html))
     for link in found_links.difference(self.seen_urls):
         if not link:
             continue
         # endswith is IndexError-safe (the original indexed [-1]);
         # a fresh name avoids clobbering the `page` parameter.
         if str(link).endswith('/'):
             next_page = Page(link.parent)
         else:
             next_page = Page(link)
         self.urls.put(next_page)
Example #5
0
    def _parse_content(self, url, content):
        """Record a visited page, enqueue its outgoing links, and harvest
        whatever ratings/metadata the page type carries."""
        if url in self._visited:
            return
        self._visited.add(url)
        self.logger.info(f'url {url} visited')

        parser = Parser(content)
        links = parser.get_links() - self._visited
        self.logger.info(f'{len(links)} links added')
        for link in links:
            self._push_url(link)

        if 'other/moviePoint' in url:
            # User-centric page: ratings keyed by user.
            ratings = self._parse_ratings_per_user(url, parser)
            if ratings:
                self._ratings.extend(ratings)
        elif 'moviedb/main' in url or 'moviedb/grade' in url:
            # Movie-centric pages share the per-movie ratings extraction;
            # only the main page additionally carries movie metadata.
            if 'moviedb/main' in url:
                metadata = self._parse_metadata(url, parser)
                if metadata:
                    self._metadata.append(metadata)
            ratings = self._parse_ratings_per_movie(url, parser)
            if ratings:
                self._ratings.extend(ratings)
    async def test_get_ratings_per_movie_1(self):
        """Per-movie ratings from the main page come back in page order."""
        document = await self._load_movie_main()
        expected = [('ckh5SQ==', 0), ('Q0s1Yk0=', 9), ('OHVFYTQ=', 10),
                    ('NGsxa0M=', 10), ('NHJ5aHM=', 10)]
        self.assertEqual(expected, Parser(document).get_ratings_per_movie())
 def __init__(self):
     """Wire up the crawler components; exit the process if any fails.

     Any construction failure is treated as fatal: the error is
     reported and the process exits with status -1.
     """
     try:
         self.crawler = Crawler()
         # NOTE(review): `pgrpah` looks like a typo of `graph`; kept as-is
         # because other code may already reference this attribute name.
         self.pgrpah = Graph()
         self.parser = Parser()
         # (dead `pass` statement removed)
     except Exception as e:
         print("ERROR " + str(e))
         sys.exit(-1)
    async def test_get_ratings_per_movie_2(self):
        """Per-movie ratings from the reviews page come back in page order."""
        document = await self._load_movie_reviews()
        expected = [('OEhrRm4=', 9), ('OEVQQnU=', 1), ('MkpxV2Y=', 1),
                    ('QmtPMWI=', 9), ('VU9ROA==', 7), ('dEdabQ==', 10),
                    ('NFB0NUY=', 6), ('M0tpUlA=', 10), ('QzF4Wnc=', 10),
                    ('OUMwUnA=', 7)]
        self.assertEqual(expected, Parser(document).get_ratings_per_movie())
    async def test_get_metadata(self):
        """Title, genre, country and running time are parsed from main."""
        document = await self._load_movie_main()
        metadata = Parser(document).get_metadata()

        self.assertTrue(metadata is not None)
        expected = [
            ('title', '우리집 (2019)'),
            ('genre', '드라마/가족'),
            ('country', '한국'),
            ('running_time', 92),
        ]
        for key, value in expected:
            self.assertEqual(value, metadata[key])
    async def test_get_links(self):
        """Known movie/person links are present in the extracted set."""
        document = await self._load_movie_main()
        links = Parser(document).get_links()

        self.logger.info('# of links: %d', len(links))

        for fragment in ('movieId=128635', 'personId=271829',
                         'personId=518464'):
            self.assertTrue(any((fragment in link for link in links)))
Example #11
0
 def fill_disallow_urls(self, url: URL):
     """Fetch robots.txt for url's host and cache its Disallow rules.

     Each host is processed once; compiled patterns for the wildcard
     ('user-agent: *') section are added to self.disallow_urls.
     """
     parser = Parser(url)
     host = parser.host
     if host in self.seen_hosts:
         return  # this host's robots.txt was already processed
     self.seen_hosts.add(host)
     robots_txt_url = parser.host / 'robots.txt'
     try:
         # Bounded fetch: the original call had no timeout and let any
         # network failure propagate; treat an unreachable robots.txt
         # as "no rules for this host".
         response = requests.get(str(robots_txt_url), timeout=10)
     except requests.RequestException:
         return
     robots_txt = response.text.lower()
     try:
         index = robots_txt.index('user-agent: *')
     except ValueError:
         return  # no wildcard-agent section -> nothing applies to us
     for line in robots_txt[index:].split('\n'):
         if not line.startswith('disallow'):
             continue
         # "disallow: /path*" -> regex "<host>/path.+".  partition+strip
         # replaces the original split(':')[1][2::], which misparsed
         # "disallow:/path" and aborted the whole loop on a colon-less
         # line via the surrounding IndexError handler.
         line = line.replace('*', '.+')
         _, sep, value = line.partition(':')
         if not sep:
             continue
         path = value.strip().lstrip('/')
         self.disallow_urls.add(
             re.compile(fr"{host}/{path}", re.IGNORECASE))
Example #12
0
 def __init__(self, url):
     """Remember the target URL and create the parser used to process it."""
     self.parser = Parser()
     self.url = url
Example #13
0
 def __init__(self) -> None:
     """Create the two collaborators: a Downloader and a Parser."""
     self.parser = Parser()
     self.downloader = Downloader()
Example #14
0
 def __init__(self, mode):
     """Initialize the component for the given mode.

     Sets up the storage backend, a Parser, and the link collection.
     """
     # Operating mode; presumably consumed by _set_orm()/_set_links() —
     # TODO confirm against those helpers.
     self.mode = mode
     self.store = self._set_orm()
     self.parser = Parser()
     # Runs last: link setup may rely on the store/parser created above,
     # so the assignment order is kept as-is.
     self.links = self._set_links()
Example #15
0
 def setUp(self):
     """Load the HTML fixture and parse its titles once per test."""
     self.parser = Parser()
     fixture_path = FILE_PATH + '/test.html'
     with open(fixture_path, 'r') as fixture:
         self.html = fixture.read()
     self.titles = self.parser.parser_titles(self.html)
Example #16
0
 def test_get_url_with_urls(self):
     """Exactly four URLs are extracted from the fixture page."""
     with open('test.html', 'r') as fixture:
         html = fixture.read()
     parser = Parser(URL('https://t/'))
     self.assertEqual(len(parser.get_urls(html)), 4)