def crawl(self):
    if self.sitemap:
        page = http.get(self.sitemap)
        xmldoc = minidom.parseString(page.data.decode('utf-8'))
        sitemap_urls = xmldoc.getElementsByTagName('loc')
        for url in sitemap_urls:
            self.page_queue.append(self.get_text_from_xml(url.childNodes))

    self.page_queue.append(self.base_url)

    for url in self.page_queue:
        if url in self.crawled_urls:
            continue

        page = Page(url=url, base_domain=self.base_url)

        if page.parsed_url.netloc != page.base_domain.netloc:
            continue

        page.analyze()

        for w in page.wordcount:
            self.wordcount[w] += page.wordcount[w]

        for b in page.bigrams:
            self.bigrams[b] += page.bigrams[b]

        for t in page.trigrams:
            self.trigrams[t] += page.trigrams[t]

        self.page_queue.extend(page.links)
        self.crawled_pages.append(page)
        self.crawled_urls.add(page.url)
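The sitemap handling leans on xml.dom.minidom: each URL in an XML sitemap sits in a <loc> element, and its text has to be pulled out of that element's child nodes, which appears to be what get_text_from_xml does above. A standalone sketch of that extraction, using a hard-coded sitemap string instead of an HTTP response (not part of the original code):

from xml.dom import minidom

sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about</loc></url>
</urlset>"""

xmldoc = minidom.parseString(sitemap_xml)

# join the text nodes inside each <loc> element
urls = [
    ''.join(node.data for node in loc.childNodes if node.nodeType == node.TEXT_NODE)
    for loc in xmldoc.getElementsByTagName('loc')
]
print(urls)  # ['https://example.com/', 'https://example.com/about']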
def crawl(self):
    if self.sitemap:
        page = http.get(self.sitemap)

        # XML sitemaps list URLs in <loc> elements; plain-text sitemaps
        # list one URL per line.
        if self.sitemap.endswith('xml'):
            xmldoc = minidom.parseString(page.data.decode('utf-8'))
            sitemap_urls = xmldoc.getElementsByTagName('loc')
            for url in sitemap_urls:
                self.page_queue.append(self.get_text_from_xml(url.childNodes))
        elif self.sitemap.endswith('txt'):
            sitemap_urls = page.data.decode('utf-8').split('\n')
            for url in sitemap_urls:
                self.page_queue.append(url)

    self.page_queue.append(self.base_url)

    for url in self.page_queue:
        if url in self.crawled_urls:
            continue

        page = Page(
            url=url,
            base_domain=self.base_url,
            analyze_headings=self.analyze_headings,
            analyze_extra_tags=self.analyze_extra_tags,
        )

        # skip pages outside the base domain
        if page.parsed_url.netloc != page.base_domain.netloc:
            continue

        page.analyze()

        # group URLs by content hash so duplicate pages can be detected
        self.content_hashes[page.content_hash].add(page.url)

        # merge this page's token counts into the site-wide totals
        for w in page.wordcount:
            self.wordcount[w] += page.wordcount[w]

        for b in page.bigrams:
            self.bigrams[b] += page.bigrams[b]

        for t in page.trigrams:
            self.trigrams[t] += page.trigrams[t]

        self.page_queue.extend(page.links)
        self.crawled_pages.append(page)
        self.crawled_urls.add(page.url)

        # when follow_links is disabled, only the first page is analyzed
        if not self.follow_links:
            break
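Because crawl() records every URL under its page's content_hash, duplicate content falls out almost for free: any hash that maps to more than one URL points at pages serving identical markup. A minimal sketch of such a report, assuming content_hashes maps hashes to sets of URLs as the .add() call above implies; the duplicate_content helper is hypothetical, not part of the library:

def duplicate_content(content_hashes):
    """Return groups of URLs whose pages hashed to identical markup."""
    return [sorted(urls) for urls in content_hashes.values() if len(urls) > 1]

# example: two URLs sharing a hash are reported together
hashes = {
    'a3f1...': {'https://example.com/', 'https://example.com/index.html'},
    '9b2c...': {'https://example.com/about'},
}
print(duplicate_content(hashes))
# [['https://example.com/', 'https://example.com/index.html']]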
def analyze(self, raw_html=None):
    """
    Analyze the page and populate the warnings list
    """
    if not raw_html:
        valid_prefixes = []

        # only allow http://, https:// and protocol-relative // URLs
        for s in ['http://', 'https://', '//']:
            valid_prefixes.append(self.url.startswith(s))

        if True not in valid_prefixes:
            self.warn(f'{self.url} does not appear to have a valid protocol.')
            return

        if self.url.startswith('//'):
            self.url = f'{self.base_domain.scheme}:{self.url}'

        if self.parsed_url.netloc != self.base_domain.netloc:
            self.warn(f'{self.url} is not part of {self.base_domain.netloc}.')
            return

        try:
            page = http.get(self.url)
        except HTTPError as e:
            self.warn(f'Returned {e}')
            return

        encoding = 'ascii'

        if 'content-type' in page.headers:
            encoding = page.headers['content-type'].split('charset=')[-1]

        if encoding.lower() not in ('text/html', 'text/plain', 'utf-8'):
            # there is no unicode function in Python 3
            # try:
            #     raw_html = unicode(page.read(), encoding)
            # except:
            self.warn(f'Can not read {encoding}')
            return
        else:
            raw_html = page.data.decode('utf-8')

    # hash the raw markup so the crawler can spot duplicate content
    self.content_hash = hashlib.sha1(raw_html.encode('utf-8')).hexdigest()

    # remove comments, they screw with BeautifulSoup
    clean_html = re.sub(r'<!--.*?-->', r'', raw_html, flags=re.DOTALL)

    # one lowercased soup for case-insensitive lookups, one unmodified
    # soup so anchor hrefs and text keep their original casing
    soup_lower = BeautifulSoup(clean_html.lower(), 'html.parser')  # .encode('utf-8')
    soup_unmodified = BeautifulSoup(clean_html, 'html.parser')  # .encode('utf-8')

    texts = soup_lower.findAll(text=True)
    visible_text = [w for w in filter(self.visible_tags, texts)]

    self.process_text(visible_text)

    self.populate(soup_lower)

    self.analyze_title()
    self.analyze_description()
    self.analyze_og(soup_lower)
    self.analyze_a_tags(soup_unmodified)
    self.analyze_img_tags(soup_lower)
    self.analyze_h1_tags(soup_lower)

    return True
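The preprocessing step is the part worth noting: HTML comments are stripped up front because they interfere with BeautifulSoup's text extraction, and the page is parsed twice, once lowercased for case-insensitive tag lookups and once unmodified so link targets and anchor text keep their original casing. A standalone sketch of that preprocessing outside the Page class (variable names here are illustrative, not from the library):

import re
from bs4 import BeautifulSoup

html = """<html><head><title>Example</title></head>
<body><!-- navigation TODO --><H1>Hello</H1>
<a href="/About">About Us</a></body></html>"""

# strip HTML comments before parsing, as analyze() does
clean_html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)

soup_lower = BeautifulSoup(clean_html.lower(), 'html.parser')   # for lookups
soup_unmodified = BeautifulSoup(clean_html, 'html.parser')      # keeps original case

print(soup_lower.find('h1').text)         # 'hello'
print(soup_unmodified.find('a')['href'])  # '/About'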