Python get_webpage_text Exemples, util.get_webpage_text Python Exemples

Exemple #1

0

Afficher le fichier

    def set_content(self, github_main_page_text):
        """Find a linked citation file in GitHub page text and load it.

        The page text is searched (case-insensitively) for an href whose
        path contains ``blob/.../citation``. When none is found, the first
        href ending in ``/inst`` is fetched from github.com and that page is
        searched the same way.

        On success ``self.content`` and ``self.content_url`` are set; when
        no citation link is found neither attribute is touched.

        :param github_main_page_text: HTML text of the page to search.
        """
        citation_href = 'href="(.*blob/.*/citation.*?)"'

        links = re.findall(citation_href, github_main_page_text,
                           re.IGNORECASE)
        if not links:
            # Fall back to the page behind the first ".../inst" link.
            inst_links = re.findall('href="(.*/inst)"',
                                    github_main_page_text, re.IGNORECASE)
            if inst_links:
                response = requests.get(
                    "http://github.com{}".format(inst_links[0]))
                links = re.findall(citation_href, response.text,
                                   re.IGNORECASE)

        if not links:
            return

        # Turn the first blob link into a raw.githubusercontent.com URL.
        raw_path = links[0]
        for fragment in ("/blob", "https://github.com", "http://github.com"):
            raw_path = raw_path.replace(fragment, "")
        raw_url = "https://raw.githubusercontent.com{}".format(raw_path)

        # Prefer whatever the symlink helper yields; presumably it resolves
        # a symlinked citation file -- verify against get_symlink_content.
        symlink_text = self.get_symlink_content(links)
        self.content = symlink_text if symlink_text else get_webpage_text(
            raw_url)
        self.content_url = raw_url

Exemple #2

0

Afficher le fichier

 def set_content(self, input):
     """Follow the link reported by ``check_for_rel_cite_as_header``, if any.

     Nothing happens unless ``self.content_url`` starts with ``http://`` or
     ``https://`` and the header check yields a link. In that case
     ``self.content_url`` is replaced by the link; a link containing
     ``doi.org`` sets ``self.content`` to the marker ``'found'``, any other
     link is fetched and its text is returned.

     NOTE(review): the non-DOI branch returns the fetched text rather than
     storing it on ``self.content`` -- confirm this is intended.

     :param input: unused by this method.
     :return: fetched page text for a non-DOI link, otherwise ``None``.
     """
     if not self.content_url.startswith(("http://", "https://")):
         return None

     cite_as_target = self.check_for_rel_cite_as_header(self.content_url)
     if not cite_as_target:
         return None

     self.content_url = cite_as_target
     if 'doi.org' not in cite_as_target:
         return get_webpage_text(cite_as_target)

     self.content = 'found'
     return None

Exemple #3

0

Afficher le fichier

    def set_content(self, bitbucket_main_page_text):
        """Find a readme link in Bitbucket page text and load that file.

        The page text is searched (case-insensitively) for an href that
        contains ``/readme`` followed by a query string. The first match is
        converted with ``get_raw_bitbucket_url`` and fetched; the result is
        stored in ``self.content`` and the URL in ``self.content_url``.
        Without a match neither attribute is touched.

        :param bitbucket_main_page_text: HTML text of the page to search.
        """
        # Raw string: the regex escapes for "/" and "?" are not valid Python
        # string escapes and trigger a SyntaxWarning in a plain literal on
        # recent interpreters. The pattern value itself is unchanged.
        matches = re.findall(r'href="(.*\/readme.*?\?.*)"',
                             bitbucket_main_page_text, re.IGNORECASE)
        if not matches:
            return

        filename = get_raw_bitbucket_url(matches[0])
        self.content = get_webpage_text(filename)
        self.content_url = filename

Exemple #4

0

Afficher le fichier

 def set_content(self, github_main_page_text):
     """Find a linked description file in GitHub page text and load it.

     The page text is searched (case-insensitively) for an href whose path
     contains ``blob/.../description``. The first match is rewritten into a
     ``raw.githubusercontent.com`` URL and fetched; the text is stored in
     ``self.content`` and the URL in ``self.content_url``. Without a match
     neither attribute is touched.

     :param github_main_page_text: HTML text of the page to search.
     """
     matches = re.findall('href="(.*blob/.*/description.*?)"',
                          github_main_page_text, re.IGNORECASE)
     if not matches:
         return

     filename_part = matches[0].replace("/blob", "")
     # An href may be absolute; drop the github.com host so the raw host is
     # not prepended to a full URL. For relative hrefs this is a no-op.
     for host in ("https://github.com", "http://github.com"):
         filename_part = filename_part.replace(host, "")
     filename = "https://raw.githubusercontent.com{}".format(filename_part)

     self.content = get_webpage_text(filename)
     self.content_url = filename

Exemple #5

0

Afficher le fichier

 def set_content(self, input):
     """Fetch the page at ``self.content_url`` and keep its body section.

     ``self.set_content_url(input)`` is called first; if ``self.content_url``
     is falsy afterwards nothing else happens. Otherwise the page is fetched
     and stored in ``self.content``, trimmed to the text that follows the
     ``content-body`` div when that marker is present.

     :param input: value forwarded to ``self.set_content_url``.
     """
     self.set_content_url(input)
     if not self.content_url:
         return

     body_marker = '<div id="content-body">'
     html = get_webpage_text(self.content_url)
     if body_marker in html:
         # Drop the header: it holds PyPI-specific material rather than
         # information about the library, which makes the library's GitHub
         # links hard to pick out (e.g. https://pypi.python.org/pypi/executor).
         html = html.split(body_marker)[1]
     self.content = html

Exemple #6

0

Afficher le fichier

    def set_content(self, bitbucket_main_page_text):
        """Find a description link in Bitbucket page text and load that file.

        The page text is searched (case-insensitively) for an href that
        contains ``/description``. The first match is converted with
        ``get_raw_bitbucket_url`` and fetched; the result is stored in
        ``self.content`` and the URL in ``self.content_url``. Without a
        match neither attribute is touched.

        :param bitbucket_main_page_text: HTML text of the page to search.
        """
        # Raw string: the regex escape for "/" is not a valid Python string
        # escape and triggers a SyntaxWarning in a plain literal on recent
        # interpreters. The pattern value itself is unchanged.
        matches = re.findall(r'href="(.*\/description.*?)"',
                             bitbucket_main_page_text, re.IGNORECASE)
        if not matches:
            return

        filename = get_raw_bitbucket_url(matches[0])
        self.content = get_webpage_text(filename)
        self.content_url = filename

Exemple #7

0

Afficher le fichier

    def set_content(self, input):
        """Derive a GitHub repository URL from ``input`` and fetch that page.

        Input that does not mention ``github.com`` is ignored. Input that
        starts with ``http`` is truncated to its first five ``/``-separated
        parts. Any other input is searched for a quoted github.com URL, from
        which ``/issues`` and ``/new`` are removed; the URL is discarded when
        it is empty, names a Sphinx theme, or ends in ``.zip``.

        On success ``self.content`` holds the fetched text and
        ``self.content_url`` the URL; otherwise neither is touched.

        :param input: a URL or a block of text that may contain one.
        """
        if "github.com" not in input:
            return

        if input.startswith("http"):
            url = "/".join(input.split("/", 5)[0:5])
        else:
            url = find_or_empty_string('"(https?://github.com/.+?)"', input)
            url = url.replace("/issues", "")
            url = url.replace("/new", "")
            # Both words must be tested explicitly: the previous spelling,
            # `'sphinx' and 'theme' in url`, reduced to `'theme' in url` and
            # so rejected every URL containing "theme".
            is_sphinx_theme = 'sphinx' in url and 'theme' in url
            if not url or is_sphinx_theme or url.endswith('.zip'):
                return

        self.content = get_webpage_text(url)
        self.content_url = url

Exemple #8

0

Afficher le fichier

    def set_content(self, input):
        """Derive a Bitbucket ``/src`` URL from ``input`` and fetch that page.

        Input that does not mention ``bitbucket.org`` is ignored. Input that
        starts with ``http`` is truncated to its first five ``/``-separated
        parts. Any other input is searched for a quoted bitbucket.org URL,
        which is truncated the same way; nothing happens if none is found.
        ``/src`` is appended to the resulting URL before it is fetched.

        On success ``self.content`` holds the fetched text and
        ``self.content_url`` the URL; otherwise neither is touched.

        :param input: a URL or a block of text that may contain one.
        """
        if "bitbucket.org" not in input:
            return

        if input.startswith("http"):
            url = "/".join(input.split("/", 5)[0:5])
        else:
            # Raw string: the regex escapes used here are not valid Python
            # string escapes and trigger a SyntaxWarning in a plain literal
            # on recent interpreters. The pattern value itself is unchanged.
            url = find_or_empty_string(
                r'"(https?:\/\/bitbucket.org\/\w+\/\w+/?)"', input)
            if not url:
                return
            url = "/".join(url.split("/")[0:5])

        url = url + '/src'
        self.content = get_webpage_text(url)
        self.content_url = url

Exemple #9

0

Afficher le fichier

    def extract_doi(self, text):
        """Extract a DOI from ``text``.

        When ``text`` is a ``https://zenodo.org/record/`` URL the page behind
        it is fetched and searched instead. Candidates are tried in order:

        1. the DOI embedded in a Zenodo badge URL,
        2. a ``10.5281/zenodo.<digits>`` DOI,
        3. the first generic ``10.<registrant>/<suffix>`` match that is not
           the codemeta schema identifier.

        :param text: a Zenodo record URL or arbitrary text to search.
        :return: the first candidate after passing it through
            ``self.strip_junk_from_end_of_doi``, or ``None`` if none matched.
        """
        if text.startswith('https://zenodo.org/record/'):
            text = get_webpage_text(text)

        badge_doi = find_or_empty_string("://zenodo.org/badge/doi/(.+?).svg",
                                         text)
        if badge_doi:
            return self.strip_junk_from_end_of_doi(badge_doi)

        # The dot after the "10" prefix is escaped so it matches only a
        # literal "." instead of any character.
        zenodo_doi = find_or_empty_string(r"10\.5281/zenodo\.\d+", text)
        if zenodo_doi:
            return self.strip_junk_from_end_of_doi(zenodo_doi)

        possible_dois = re.findall(r"10\.\d{4,9}/[-._;()/:A-Z0-9+]+", text,
                                   re.IGNORECASE | re.MULTILINE)
        for doi in possible_dois:
            # The codemeta schema identifier looks like a DOI but is not one
            # we want to report.
            if "10.5063/schema/codemeta-2.0" in doi.lower():
                continue
            return self.strip_junk_from_end_of_doi(doi)

        return None

Exemple #10

0

Afficher le fichier

    def set_content(self, input):
        """Fetch the ``DESCRIPTION`` file located under the parent URL.

        The URL is ``self.parent_content_url`` with ``/DESCRIPTION``
        appended. The fetched text is stored in ``self.content`` and the URL
        in ``self.content_url``.

        :param input: unused by this method.
        """
        description_url = self.parent_content_url + '/DESCRIPTION'
        self.content = get_webpage_text(description_url)
        self.content_url = description_url

Exemple #11

0

Afficher le fichier

 def set_content(self, input):
     """Fetch the page at ``self.content_url`` into ``self.content``.

     Nothing happens when ``self.content_url`` is falsy.

     :param input: unused by this method.
     """
     if not self.content_url:
         return
     self.content = get_webpage_text(self.content_url)