Ejemplo n.º 1
0
    def test_extract_descriptiom(self):
        """Check extract_description on the saved KNU page and an inline head.

        The saved page is expected to produce no description (None), while
        the inline <head> snippet contains a <meta name="description"> tag
        whose content must be returned.
        """
        knu_html = load_html_page(
            'Taras Shevchenko National University of Kyiv.htm')
        tree = build_html_tree(knu_html)
        description = extract_description(tree)
        self.assertIsNone(description)

        head_txt = """
        <head>
        <title>Taras Shevchenko National University of Kyiv</title>
        <link rel="StyleSheet" href="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/main.css" type="text/css">
        <link rel="alternate" type="application/rss+xml" href="http://knu.ua/ua/test/rss">
        <meta http-equiv="content-type" content="text/html; charset=UTF-8">
        <meta charset="UTF-8">
        <meta name="description" content="This is the site of Taras Shevchenko National University of Kyiv">
        <!-- base href="http://knu.ua/" -->
        <link rel="shortcut icon" href="http://knu.ua/favicon.ico">
        <script src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/watch.js" async="" type="text/javascript"></script><script type="text/javascript" src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/jquery-1.js"></script>
        </head>
        """

        tree = build_html_tree(head_txt)
        description = extract_description(tree)
        self.assertEqual(
            description,
            "This is the site of Taras Shevchenko National University of Kyiv")
Ejemplo n.º 2
0
    def test_extract_text(self):
        html_txt = """
        				<div class="b-horizontal-block__also">
						<h4 class="b-horizontal-block__also-title">
							See also
						</h4>
						<ul class="b-horizontal-block__links">
							<li class="b-horizontal-block__link-holder">
								<a href="http://knu.ua/en/geninf/statut/" class="b-horizontal-block__link">
									Statute</a>
								<br>
								<a href="http://knu.ua/en/geninf/ukaz/" class="b-horizontal-block__link">
									Decrees of the President
								</a>
								<br>
								<a href="http://knu.ua/pdfs/official/Lizenzia.pdf" class="b-horizontal-block__link">
									License
								</a>
								<br>
								<a href="http://knu.ua/en/official/accreditation" class="b-horizontal-block__link">
									Certificates of Accreditation
								</a>

								<a href="http://knu.ua/pdfs/Svidotstvo2012.pdf" class="b-horizontal-block__link">
									Certificate to the sign for the goods and services
								</a>
							</li>

						</ul>
					</div>
        """
        lst = []
        tree = build_html_tree(html_txt)
        extract_text(tree, lst)
        rez_lst = [
            'See also', 'Statute', 'Decrees of the President', 'License',
            'Certificates of Accreditation',
            'Certificate to the sign for the goods and services'
        ]
        self.assertListEqual(lst, rez_lst)

        f = open(
            self.get_html_page_path(
                'Taras Shevchenko National University of Kyiv.htm'), 'r')
        html_txt = f.read()
        f.close()
        tree = build_html_tree(html_txt)
        extract_text(tree, lst)

        # f_output = open('D:\psa\PyCharmPrj\site_analyzer\html_pages\output_taras_shevchenk.txt', 'r')
        f_output = codecs.open(
            self.get_html_page_path('output_taras_shevchenk.txt'),
            encoding='utf-8')
        output_lst = [l.strip() for l in f_output]
        f_output.close()
Ejemplo n.º 3
0
    def test_extract_heading(self):
        """Check that extract_heading returns the expected headings list.

        The expected value is a list of (tag name, list of text) tuples for
        the saved KNU page, compared with the function output in order.
        """
        page_path = self.get_html_page_path(
            'Taras Shevchenko National University of Kyiv.htm')
        # Context manager guarantees the handle is closed even if read() raises.
        with open(page_path, 'r') as f:
            html_txt = f.read()
        tree = build_html_tree(html_txt)
        output = extract_heading(tree)

        headings = [('h3', ['General information']), ('h4', ['See also']),
                    ('h3', ['Official information']), ('h4', ['See also']),
                    ('h3', ['Faculties and Institutes']), ('h4', ['See also']),
                    ('h3', ['For University entrants']), ('h4', ['See also']),
                    ('h3', ['Science']), ('h4', ['See also']),
                    ('h3', ['For students']), ('h4', ['See also']),
                    ('h3', ['Information technologies']),
                    ('h3', ['Resources']), ('h3', ['Libraries']),
                    ('h3', [u'NEWS']), ('h4', ['14.08.2014']),
                    ('h4', ['12.08.2014']), ('h4', ['31.07.2014']),
                    ('h4', ['29.07.2014']), ('h4', ['28.07.2014']),
                    ('h3', ['Favorite Videos']),
                    ('h3', ['International Association Of University']),
                    ('h3', ['University of the Internet']),
                    ('h3', ['Internet radio']), ('h4', ['CAMPUS RADIO'])]

        self.assertListEqual(output, headings)
Ejemplo n.º 4
0
    def test_extract_links(self):
        """Smoke-test extract_links on the page loaded from http://knu.ua.

        The page is obtained through self.load_page (presumably fetched over
        the network - confirm), parsed, and passed to extract_links.
        """
        html_page = self.load_page('http://knu.ua')
        tree = build_html_tree(html_page)
        # NOTE(review): the result is not asserted on; this only verifies
        # that extract_links runs without raising. Add expectations once a
        # stable fixture is available.
        extract_links(tree)
Ejemplo n.º 5
0
 def test_code_to_text_ratio(self):
     """Check calc_code_to_text_ratio for the <body> of the saved KNU page.

     The ratio is computed from the body element and the length of the raw
     page text, then rounded up to two decimals and compared with 0.16.
     """
     page_path = self.get_html_page_path(
         'Taras Shevchenko National University of Kyiv.htm')
     # Context manager guarantees the handle is closed even if read() raises.
     with open(page_path, 'r') as f:
         html_txt = f.read()
     tree = build_html_tree(html_txt)
     body = tree.find('body')
     ratio = calc_code_to_text_ratio(body, len(html_txt))
     # Round up to two decimal places so the comparison is on a fixed value.
     self.assertEqual(math.ceil(ratio * 100) / 100, 0.16)
Ejemplo n.º 6
0
 def test_extract_words(self):
     """Check extract_cleaned_words_iter against the 'words-lst.txt' fixture.

     The words yielded for the <body> of the saved KNU page must equal, in
     order, the list produced by self.convert_lines_to_lst('words-lst.txt').
     """
     page_path = self.get_html_page_path(
         'Taras Shevchenko National University of Kyiv.htm')
     # Context manager guarantees the handle is closed even if read() raises.
     with open(page_path, 'r') as f:
         html_txt = f.read()
     tree = build_html_tree(html_txt)
     # Materialize the iterator so it can be compared as a list.
     words_lst = list(extract_cleaned_words_iter(tree.find('body')))
     output_lst = self.convert_lines_to_lst('words-lst.txt')
     self.assertListEqual(words_lst, output_lst)
Ejemplo n.º 7
0
    def test_extract_img_iter(self):
        """Check img_iter and img_filter_iter on the saved KNU page body.

        Every item yielded by img_iter is serialized with etree.tostring
        (method='html', pretty_print=True) and compared with the expected
        markup; img_filter_iter over the same items must yield 14 elements.
        """
        page_path = self.get_html_page_path(
            'Taras Shevchenko National University of Kyiv.htm')
        # Context manager guarantees the handle is closed even if read() raises.
        with open(page_path, 'r') as f:
            html_txt = f.read()
        tree = build_html_tree(html_txt)
        images = list(img_iter(tree.find('body')))

        img_lst = [
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/rss.gif" title="RSS" align="left">\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/10.jpg">\n  \n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/9.jpg">\n  \n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/8.jpg">\n  \n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/3.jpg">\n  \n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/4.jpg">\n  \n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/178e.jpg" alt="" class="b-gallery__video">\n\t\t\t\t\t\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/observ.jpg" alt="" height="63" width="142">\n\t\t\t\t\t\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/iau.jpg" alt="" height="63" width="142">\n\t\t\t\t\t\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/eua.jpg" height="63" width="142">\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/euroasian.jpg" alt="" border="0" height="63" width="142">\n\t\t\t\t\t\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/een.jpg" title="EEN" alt="EEN" border="0" '
            'height="120" hspace="2" vspace="2" width="160">\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/monBaner.jpg" alt="&#1052;&#1054;&#1053;" border="0" hspace="2">\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/fbBaner.gif" alt="&#1052;&#1054;&#1053;" border="0">\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/vnz.png" title="&#1042;&#1080;&#1097;'
            '&#1072; &#1086;&#1089;&#1074;&#1110;&#1090;&#1072;" alt="&#1042;&#1080;&#1097;&#1072; &#1086;&#1089;&#1074;'
            '&#1110;&#1090;&#1072;" border="0" height="60" hspace="2" vspace="2" width="180">\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/qs.png" title="World University Rankings" '
            'alt="World University Rankings" border="0" vspace="2">\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/compas_banner.png" title="Best '
            'Universities" alt="Best Universities" border="0" height="140" hspace="2" vspace="2" width="96">\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/radio.jpg" alt="" class="b-triplet__img_type_radio">\n\t\t\t\t\t\n',
            '<img src="Taras%20Shevchenko%20National%20University%20of%20Kyiv_files/gerb3.png" alt="" class="b-foot__emblem">\n\t\t\n',
            '<img src="//mc.yandex.ru/watch/20542042" style="position:absolute; left:-9999px;" alt="">\n'
        ]

        self.assertListEqual([
            etree.tostring(item, method='html', pretty_print=True)
            for item in images
        ], img_lst)
        # img_iter is called again so the filter consumes a fresh iterator.
        cnt = len(list(img_filter_iter(img_iter(tree.find('body')))))
        self.assertEqual(cnt, 14)
Ejemplo n.º 8
0
 def test_extract_title(self):
     """Verify extract_title yields the expected title for the saved KNU page."""
     page_source = load_html_page(
         'Taras Shevchenko National University of Kyiv.htm')
     parsed_page = build_html_tree(page_source)
     self.assertEqual(
         extract_title(parsed_page),
         'Taras Shevchenko National University of Kyiv')
Ejemplo n.º 9
0
    def test_create_words_frequency_lst(self):
        """Check the words cloud built from the saved KNU page body.

        Words from extract_cleaned_words_iter are fed to
        create_words_frequency_dic, the result is passed to
        create_words_cloud, and the output must equal the reference list of
        (string, integer) pairs exactly, including order.
        """
        page_path = self.get_html_page_path(
            'Taras Shevchenko National University of Kyiv.htm')
        # Context manager guarantees the handle is closed even if read() raises.
        with open(page_path, 'r') as f:
            html_txt = f.read()
        tree = build_html_tree(html_txt)
        words_lst = extract_cleaned_words_iter(tree.find('body'))
        wfl = create_words_frequency_dic(words_lst)
        words_cloud = create_words_cloud(wfl)
        etalon_words_cloud = [
            ('university', 26), ('information', 12), ('more', 10),
            ('students', 9), ('for', 9), ('also', 6), ('about', 6),
            ('faculties', 6), ('kyiv', 6), ('institutes', 6), ('see', 6),
            ('academic', 5), (u's', 5), ('is', 5), ('with', 5),
            ('scientific', 5), ('in', 4), ('science', 4), ('centre', 4),
            ('ukraine', 4), ('achievements', 4), ('research', 4),
            ('national', 4), ('educational', 3), ('computer', 3), ('an', 3),
            ('museum', 3), ('entrants', 3), ('by', 3), ('internet', 3),
            ('are', 3), ('library', 3), ('resources', 3), (u'degrees', 3),
            ('staff', 3), ('center', 2), ('student', 2), ('ukrainian', 2),
            (u'higher', 2), ('as', 2), ('high', 2), ('history', 2),
            ('newspaper', 2), ('publications', 2), ('official', 2),
            ('campus', 2), ('observatory', 2), ('search', 2), ('academy', 2),
            ('halls', 2), ('there', 2), ('electronic', 2), ('sports', 2),
            ('taras', 2), ('news', 2), ('awards', 2), (u'specialist', 2),
            ('numerous', 2), ('underpinned', 2), ('maksymovych', 2),
            ('classical', 2), ('status', 2), ('radio', 2), ('facilities', 2),
            ('association', 2), ('international', 2), ('has', 2),
            ('admission', 2), ('sciences', 2), ('shevchenko', 2), ('its', 2),
            ('arising', 1), ('map', 1), ('organization', 1), ('banks', 1),
            ('enterprise', 1), ('crimean', 1), ('departments', 1), ('well', 1),
            ('professor', 1), ('studies', 1), ('branches', 1), ('statute', 1),
            ('kiev', 1), ('coast', 1), ('president', 1), ('services', 1),
            ('faculty', 1), ('catalog', 1),
            (u'phd', 1), (u'qualifications', 1), ('welcome', 1),
            ('independent', 1), ('profile', 1), ('you', 1), ('department', 1),
            (u'postgraduate', 1), ('other', 1), ('responsibilities', 1),
            (u'graduate', 1), ('administration', 1), ('comfortable', 1),
            ('orders', 1), ('technology', 1), ('education', 1), ('spa', 1),
            ('have', 1), ('cluster', 1), ('general', 1), ('m', 1),
            ('classic', 1), ('conferences', 1), (u'reserved', 1), ('pre', 1),
            ('called', 1), ('ua', 1), ('residence', 1), ('healthy', 1),
            ('clubs', 1), (u'rights', 1), ('lecture', 1), ('interfaculty', 1),
            (u'junior', 1), ('technologies', 1), ('particular', 1),
            ('bidding', 1), (u'thousand', 1), ('hub', 1), ('parliament', 1),
            ('website', 1), ('today', 1), ('black', 1), (u'master', 1),
            ('recognised', 1), ('accommodation', 1), ('europe', 1),
            ('leading', 1), ('distinct', 1), ('area', 1), ('certificate', 1),
            ('publishing', 1), ('from', 1), ('training', 1), ('unit', 1),
            (u'post', 1), ('methodological', 1), ('computing', 1), ('hans', 1),
            ('number', 1), ('within', 1), (u'military', 1),
            (u'qualification', 1), ('dnipro', 1), ('range', 1), ('union', 1),
            ('foreign', 1), ('favorite', 1), ('specialized', 1),
            ('institute', 1), ('license', 1),
            (u'working', 1), ('lifestyle', 1), ('goods', 1), ('both', 1),
            ('cafeterias', 1), ('adresses', 1), ('informatics', 1),
            (u'bachelor', 1), ('continuing', 1), ('zoological', 1),
            ('rosling', 1), ('email', 1), ('new', 1), ('certificates', 1),
            ('safety', 1), ('state', 1), ('access', 1), ('academies', 1),
            ('broad', 1), ('accreditation', 1), ('network', 1), ('primary', 1),
            ('ru', 1), ('employment', 1), ('crimea', 1), ('decrees', 1),
            ('sea', 1), ('astrophysical', 1), ('challenges', 1), ('sign', 1),
            ('phones', 1), ('councils', 1), ('lectures', 1), ('cisco', 1),
            ('work', 1), ('nationals', 1), ('river', 1), ('facing', 1),
            ('reprographics', 1), ('prize', 1), (u'school', 1), ('formal', 1),
            ('provided', 1), ('report', 1), ('nation', 1), (u'overall', 1),
            ('libraries', 1), ('rules', 1), ('gave', 1), ('local', 1),
            ('health', 1), ('networking', 1), ('videos', 1), ('geological', 1),
            ('young', 1), ('periodicals', 1), ('promote', 1), (u'doctoral', 1),
            ('linguistics', 1), ('rector', 1), ('committee', 1),
            ('including', 1), ('schools', 1), ('unofficial', 1), ('trade', 1),
            ('contemporary', 1), ('astronomical', 1), ('radiation', 1),
            ('dance', 1), ('rectors', 1), (u'all', 1)
        ]

        self.assertListEqual(words_cloud, etalon_words_cloud)